/*
 * Copyright (c) 2006 Mellanox Technologies. All rights reserved
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <rdma/ib_cm.h>
#include <net/dst.h>
#include <net/icmp.h>
#include <linux/icmpv6.h>
#include <linux/delay.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/moduleparam.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>

#include "ipoib.h"
int ipoib_max_conn_qp = 128;

module_param_named(max_nonsrq_conn_qp, ipoib_max_conn_qp, int, 0444);
MODULE_PARM_DESC(max_nonsrq_conn_qp,
		 "Max number of connected-mode QPs per interface "
		 "(applied only if shared receive queue is not available)");

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
static int data_debug_level;

module_param_named(cm_data_debug_level, data_debug_level, int, 0644);
MODULE_PARM_DESC(cm_data_debug_level,
		 "Enable data path debug tracing for connected mode if > 0");
#endif

#define IPOIB_CM_IETF_ID 0x1000000000000000ULL

#define IPOIB_CM_RX_UPDATE_TIME (256 * HZ)
#define IPOIB_CM_RX_TIMEOUT     (2 * 256 * HZ)
#define IPOIB_CM_RX_DELAY       (3 * 256 * HZ)
#define IPOIB_CM_RX_UPDATE_MASK (0x3)

#define IPOIB_CM_RX_RESERVE     (ALIGN(IPOIB_HARD_LEN, 16) - IPOIB_ENCAP_LEN)
static struct ib_qp_attr ipoib_cm_err_attr = {
	.qp_state = IB_QPS_ERR
};

#define IPOIB_CM_RX_DRAIN_WRID 0xffffffff

static struct ib_send_wr ipoib_cm_rx_drain_wr = {
	.opcode = IB_WR_SEND,
};
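
/*
 * Work request IDs posted from this file encode the ring index in the low
 * bits and are tagged with IPOIB_OP_CM (plus IPOIB_OP_RECV for receives)
 * so completions can be routed back to the right ring entry.  The drain WR
 * uses the out-of-range IPOIB_CM_RX_DRAIN_WRID value, so its completion is
 * recognized in ipoib_cm_handle_rx_wc() and never collides with a real
 * receive buffer index.
 */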

static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
			       const struct ib_cm_event *event);

static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags,
				  u64 mapping[IPOIB_CM_RX_SG])
{
	int i;

	ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE);

	for (i = 0; i < frags; ++i)
		ib_dma_unmap_page(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE);
}

static int ipoib_cm_post_receive_srq(struct net_device *dev, int id)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	int i, ret;

	priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;

	for (i = 0; i < priv->cm.num_frags; ++i)
		priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i];

	ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, NULL);
	if (unlikely(ret)) {
		ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret);
		ipoib_cm_dma_unmap_rx(priv, priv->cm.num_frags - 1,
				      priv->cm.srq_ring[id].mapping);
		dev_kfree_skb_any(priv->cm.srq_ring[id].skb);
		priv->cm.srq_ring[id].skb = NULL;
	}

	return ret;
}

static int ipoib_cm_post_receive_nonsrq(struct net_device *dev,
					struct ipoib_cm_rx *rx,
					struct ib_recv_wr *wr,
					struct ib_sge *sge, int id)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	int i, ret;

	wr->wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;

	for (i = 0; i < IPOIB_CM_RX_SG; ++i)
		sge[i].addr = rx->rx_ring[id].mapping[i];

	ret = ib_post_recv(rx->qp, wr, NULL);
	if (unlikely(ret)) {
		ipoib_warn(priv, "post recv failed for buf %d (%d)\n", id, ret);
		ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
				      rx->rx_ring[id].mapping);
		dev_kfree_skb_any(rx->rx_ring[id].skb);
		rx->rx_ring[id].skb = NULL;
	}

	return ret;
}

static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev,
					     struct ipoib_cm_rx_buf *rx_ring,
					     int id, int frags,
					     u64 mapping[IPOIB_CM_RX_SG],
					     gfp_t gfp)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct sk_buff *skb;
	int i;

	skb = dev_alloc_skb(ALIGN(IPOIB_CM_HEAD_SIZE + IPOIB_PSEUDO_LEN, 16));
	if (unlikely(!skb))
		return NULL;

	/*
	 * IPoIB adds an IPOIB_ENCAP_LEN byte header; this will align the
	 * IP header to a multiple of 16.
	 */
	skb_reserve(skb, IPOIB_CM_RX_RESERVE);

	mapping[0] = ib_dma_map_single(priv->ca, skb->data, IPOIB_CM_HEAD_SIZE,
				       DMA_FROM_DEVICE);
	if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0]))) {
		dev_kfree_skb_any(skb);
		return NULL;
	}

	for (i = 0; i < frags; i++) {
		struct page *page = alloc_page(gfp);

		if (!page)
			goto partial_error;
		skb_fill_page_desc(skb, i, page, 0, PAGE_SIZE);

		mapping[i + 1] = ib_dma_map_page(priv->ca, page,
						 0, PAGE_SIZE, DMA_FROM_DEVICE);
		if (unlikely(ib_dma_mapping_error(priv->ca, mapping[i + 1])))
			goto partial_error;
	}

	rx_ring[id].skb = skb;
	return skb;

partial_error:

	ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE);

	for (; i > 0; --i)
		ib_dma_unmap_page(priv->ca, mapping[i], PAGE_SIZE, DMA_FROM_DEVICE);

	dev_kfree_skb_any(skb);
	return NULL;
}

static void ipoib_cm_free_rx_ring(struct net_device *dev,
				  struct ipoib_cm_rx_buf *rx_ring)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	int i;

	for (i = 0; i < ipoib_recvq_size; ++i)
		if (rx_ring[i].skb) {
			ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
					      rx_ring[i].mapping);
			dev_kfree_skb_any(rx_ring[i].skb);
		}

	vfree(rx_ring);
}

static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv)
{
	struct ipoib_cm_rx *p;

	/* We only reserved 1 extra slot in CQ for drain WRs, so
	 * make sure we have at most 1 outstanding WR. */
	if (list_empty(&priv->cm.rx_flush_list) ||
	    !list_empty(&priv->cm.rx_drain_list))
		return;

	/*
	 * QPs on the flush list are in the error state.  This way, a "flush
	 * error" WC will be immediately generated for each WR we post.
	 */
	p = list_entry(priv->cm.rx_flush_list.next, typeof(*p), list);
	ipoib_cm_rx_drain_wr.wr_id = IPOIB_CM_RX_DRAIN_WRID;
	if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, NULL))
		ipoib_warn(priv, "failed to post drain wr\n");

	list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list);
}

static void ipoib_cm_rx_event_handler(struct ib_event *event, void *ctx)
{
	struct ipoib_cm_rx *p = ctx;
	struct ipoib_dev_priv *priv = ipoib_priv(p->dev);
	unsigned long flags;

	if (event->event != IB_EVENT_QP_LAST_WQE_REACHED)
		return;

	spin_lock_irqsave(&priv->lock, flags);
	list_move(&p->list, &priv->cm.rx_flush_list);
	p->state = IPOIB_CM_RX_FLUSH;
	ipoib_cm_start_rx_drain(priv);
	spin_unlock_irqrestore(&priv->lock, flags);
}

static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev,
					   struct ipoib_cm_rx *p)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct ib_qp_init_attr attr = {
		.event_handler = ipoib_cm_rx_event_handler,
		.send_cq = priv->recv_cq, /* For drain WR */
		.recv_cq = priv->recv_cq,
		.srq = priv->cm.srq,
		.cap.max_send_wr = 1, /* For drain WR */
		.cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */
		.sq_sig_type = IB_SIGNAL_ALL_WR,
		.qp_type = IB_QPT_RC,
		.qp_context = p,
	};

	if (!ipoib_cm_has_srq(dev)) {
		attr.cap.max_recv_wr  = ipoib_recvq_size;
		attr.cap.max_recv_sge = IPOIB_CM_RX_SG;
	}

	return ib_create_qp(priv->pd, &attr);
}

static int ipoib_cm_modify_rx_qp(struct net_device *dev,
				 struct ib_cm_id *cm_id, struct ib_qp *qp,
				 unsigned int psn)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct ib_qp_attr qp_attr;
	int qp_attr_mask, ret;

	qp_attr.qp_state = IB_QPS_INIT;
	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to init QP attr for INIT: %d\n", ret);
		return ret;
	}
	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to modify QP to INIT: %d\n", ret);
		return ret;
	}
	qp_attr.qp_state = IB_QPS_RTR;
	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret);
		return ret;
	}
	qp_attr.rq_psn = psn;
	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
		return ret;
	}

	/*
	 * Current Mellanox HCA firmware won't generate completions
	 * with error for drain WRs unless the QP has been moved to
	 * RTS first. This work-around leaves a window where a QP has
	 * moved to error asynchronously, but this will eventually get
	 * fixed in firmware, so let's not error out if modify QP
	 * fails.
	 */
	qp_attr.qp_state = IB_QPS_RTS;
	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
		return 0;
	}
	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
		return 0;
	}

	return 0;
}

static void ipoib_cm_init_rx_wr(struct net_device *dev,
				struct ib_recv_wr *wr,
				struct ib_sge *sge)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	int i;

	for (i = 0; i < priv->cm.num_frags; ++i)
		sge[i].lkey = priv->pd->local_dma_lkey;

	sge[0].length = IPOIB_CM_HEAD_SIZE;
	for (i = 1; i < priv->cm.num_frags; ++i)
		sge[i].length = PAGE_SIZE;

	wr->next = NULL;
	wr->sg_list = sge;
	wr->num_sge = priv->cm.num_frags;
}
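
/*
 * The scatter list built above maps one receive buffer: sge[0] covers the
 * IPOIB_CM_HEAD_SIZE linear part of the skb and each remaining sge covers
 * one full page, matching the mapping[] layout filled in by
 * ipoib_cm_alloc_rx_skb() and torn down by ipoib_cm_dma_unmap_rx().
 */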

static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_id,
				   struct ipoib_cm_rx *rx)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct {
		struct ib_recv_wr wr;
		struct ib_sge sge[IPOIB_CM_RX_SG];
	} *t;
	int ret;
	int i;

	rx->rx_ring = vzalloc(array_size(ipoib_recvq_size,
					 sizeof(*rx->rx_ring)));
	if (!rx->rx_ring)
		return -ENOMEM;

	t = kmalloc(sizeof(*t), GFP_KERNEL);
	if (!t) {
		ret = -ENOMEM;
		goto err_free_1;
	}

	ipoib_cm_init_rx_wr(dev, &t->wr, t->sge);

	spin_lock_irq(&priv->lock);

	if (priv->cm.nonsrq_conn_qp >= ipoib_max_conn_qp) {
		spin_unlock_irq(&priv->lock);
		ib_send_cm_rej(cm_id, IB_CM_REJ_NO_QP, NULL, 0, NULL, 0);
		ret = -EINVAL;
		goto err_free;
	} else
		++priv->cm.nonsrq_conn_qp;

	spin_unlock_irq(&priv->lock);

	for (i = 0; i < ipoib_recvq_size; ++i) {
		if (!ipoib_cm_alloc_rx_skb(dev, rx->rx_ring, i, IPOIB_CM_RX_SG - 1,
					   rx->rx_ring[i].mapping,
					   GFP_KERNEL)) {
			ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
			ret = -ENOMEM;
			goto err_count;
		}
		ret = ipoib_cm_post_receive_nonsrq(dev, rx, &t->wr, t->sge, i);
		if (ret) {
			ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq "
				   "failed for buf %d\n", i);
			ret = -EIO;
			goto err_count;
		}
	}

	rx->recv_count = ipoib_recvq_size;

	kfree(t);

	return 0;

err_count:
	spin_lock_irq(&priv->lock);
	--priv->cm.nonsrq_conn_qp;
	spin_unlock_irq(&priv->lock);

err_free:
	kfree(t);

err_free_1:
	ipoib_cm_free_rx_ring(dev, rx->rx_ring);

	return ret;
}

static int ipoib_cm_send_rep(struct net_device *dev, struct ib_cm_id *cm_id,
			     struct ib_qp *qp,
			     const struct ib_cm_req_event_param *req,
			     unsigned int psn)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct ipoib_cm_data data = {};
	struct ib_cm_rep_param rep = {};

	data.qpn = cpu_to_be32(priv->qp->qp_num);
	data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE);

	rep.private_data = &data;
	rep.private_data_len = sizeof(data);
	rep.flow_control = 0;
	rep.rnr_retry_count = req->rnr_retry_count;
	rep.srq = ipoib_cm_has_srq(dev);
	rep.qp_num = qp->qp_num;
	rep.starting_psn = psn;
	return ib_send_cm_rep(cm_id, &rep);
}

static int ipoib_cm_req_handler(struct ib_cm_id *cm_id,
				const struct ib_cm_event *event)
{
	struct net_device *dev = cm_id->context;
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct ipoib_cm_rx *p;
	unsigned int psn;
	int ret;

	ipoib_dbg(priv, "REQ arrived\n");
	p = kzalloc(sizeof(*p), GFP_KERNEL);
	if (!p)
		return -ENOMEM;
	p->dev = dev;
	p->id = cm_id;
	cm_id->context = p;
	p->state = IPOIB_CM_RX_LIVE;
	p->jiffies = jiffies;
	INIT_LIST_HEAD(&p->list);

	p->qp = ipoib_cm_create_rx_qp(dev, p);
	if (IS_ERR(p->qp)) {
		ret = PTR_ERR(p->qp);
		goto err_qp;
	}

	psn = prandom_u32() & 0xffffff;
	ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn);
	if (ret)
		goto err_modify;

	if (!ipoib_cm_has_srq(dev)) {
		ret = ipoib_cm_nonsrq_init_rx(dev, cm_id, p);
		if (ret)
			goto err_modify;
	}

	spin_lock_irq(&priv->lock);
	queue_delayed_work(priv->wq,
			   &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
	/* Add this entry to passive ids list head, but do not re-add it
	 * if IB_EVENT_QP_LAST_WQE_REACHED has moved it to flush list. */
	p->jiffies = jiffies;
	if (p->state == IPOIB_CM_RX_LIVE)
		list_move(&p->list, &priv->cm.passive_ids);
	spin_unlock_irq(&priv->lock);

	ret = ipoib_cm_send_rep(dev, cm_id, p->qp, &event->param.req_rcvd, psn);
	if (ret) {
		ipoib_warn(priv, "failed to send REP: %d\n", ret);
		if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE))
			ipoib_warn(priv, "unable to move qp to error state\n");
	}
	return 0;

err_modify:
	ib_destroy_qp(p->qp);
err_qp:
	kfree(p);
	return ret;
}

static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id,
			       const struct ib_cm_event *event)
{
	struct ipoib_cm_rx *p;
	struct ipoib_dev_priv *priv;

	switch (event->event) {
	case IB_CM_REQ_RECEIVED:
		return ipoib_cm_req_handler(cm_id, event);
	case IB_CM_DREQ_RECEIVED:
		ib_send_cm_drep(cm_id, NULL, 0);
		fallthrough;
	case IB_CM_REJ_RECEIVED:
		p = cm_id->context;
		priv = ipoib_priv(p->dev);
		if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE))
			ipoib_warn(priv, "unable to move qp to error state\n");
		fallthrough;
	default:
		return 0;
	}
}

/* Adjust length of skb with fragments to match received data */
static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space,
			  unsigned int length, struct sk_buff *toskb)
{
	int i, num_frags;
	unsigned int size;

	/* put header into skb */
	size = min(length, hdr_space);
	skb->tail += size;
	skb->len += size;
	length -= size;

	num_frags = skb_shinfo(skb)->nr_frags;
	for (i = 0; i < num_frags; i++) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

		if (length == 0) {
			/* don't need this page */
			skb_fill_page_desc(toskb, i, skb_frag_page(frag),
					   0, PAGE_SIZE);
			--skb_shinfo(skb)->nr_frags;
		} else {
			size = min_t(unsigned int, length, PAGE_SIZE);

			skb_frag_size_set(frag, size);
			skb->data_len += size;
			skb->truesize += size;
			skb->len += size;
			length -= size;
		}
	}
}
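
/*
 * Example of skb_put_frags() behavior (sizes are illustrative only): with
 * 4096-byte pages and a hypothetical hdr_space of 128, a 4500-byte
 * completion leaves 128 bytes in the linear area, 4096 bytes in frag 0 and
 * 276 bytes in frag 1; any remaining pre-allocated pages are handed over
 * to toskb so they can be reused for the replacement receive buffer.
 */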

void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct ipoib_cm_rx_buf *rx_ring;
	unsigned int wr_id = wc->wr_id & ~(IPOIB_OP_CM | IPOIB_OP_RECV);
	struct sk_buff *skb, *newskb;
	struct ipoib_cm_rx *p;
	unsigned long flags;
	u64 mapping[IPOIB_CM_RX_SG];
	int frags;
	int has_srq;
	struct sk_buff *small_skb;

	ipoib_dbg_data(priv, "cm recv completion: id %d, status: %d\n",
		       wr_id, wc->status);

	if (unlikely(wr_id >= ipoib_recvq_size)) {
		if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~(IPOIB_OP_CM | IPOIB_OP_RECV))) {
			spin_lock_irqsave(&priv->lock, flags);
			list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list);
			ipoib_cm_start_rx_drain(priv);
			queue_work(priv->wq, &priv->cm.rx_reap_task);
			spin_unlock_irqrestore(&priv->lock, flags);
		} else
			ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
				   wr_id, ipoib_recvq_size);
		return;
	}

	p = wc->qp->qp_context;

	has_srq = ipoib_cm_has_srq(dev);
	rx_ring = has_srq ? priv->cm.srq_ring : p->rx_ring;

	skb = rx_ring[wr_id].skb;

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		ipoib_dbg(priv,
			  "cm recv error (status=%d, wrid=%d vend_err %#x)\n",
			  wc->status, wr_id, wc->vendor_err);
		++dev->stats.rx_dropped;
		if (has_srq)
			goto repost;
		else {
			if (!--p->recv_count) {
				spin_lock_irqsave(&priv->lock, flags);
				list_move(&p->list, &priv->cm.rx_reap_list);
				spin_unlock_irqrestore(&priv->lock, flags);
				queue_work(priv->wq, &priv->cm.rx_reap_task);
			}
			return;
		}
	}

	if (unlikely(!(wr_id & IPOIB_CM_RX_UPDATE_MASK))) {
		if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) {
			spin_lock_irqsave(&priv->lock, flags);
			p->jiffies = jiffies;
			/* Move this entry to list head, but do not re-add it
			 * if it has been moved out of list. */
			if (p->state == IPOIB_CM_RX_LIVE)
				list_move(&p->list, &priv->cm.passive_ids);
			spin_unlock_irqrestore(&priv->lock, flags);
		}
	}

	if (wc->byte_len < IPOIB_CM_COPYBREAK) {
		int dlen = wc->byte_len;

		small_skb = dev_alloc_skb(dlen + IPOIB_CM_RX_RESERVE);
		if (small_skb) {
			skb_reserve(small_skb, IPOIB_CM_RX_RESERVE);
			ib_dma_sync_single_for_cpu(priv->ca, rx_ring[wr_id].mapping[0],
						   dlen, DMA_FROM_DEVICE);
			skb_copy_from_linear_data(skb, small_skb->data, dlen);
			ib_dma_sync_single_for_device(priv->ca, rx_ring[wr_id].mapping[0],
						      dlen, DMA_FROM_DEVICE);
			skb_put(small_skb, dlen);
			skb = small_skb;
			goto copied;
		}
	}

	frags = PAGE_ALIGN(wc->byte_len -
			   min_t(u32, wc->byte_len, IPOIB_CM_HEAD_SIZE)) /
		PAGE_SIZE;

	newskb = ipoib_cm_alloc_rx_skb(dev, rx_ring, wr_id, frags,
				       mapping, GFP_ATOMIC);
	if (unlikely(!newskb)) {
		/*
		 * If we can't allocate a new RX buffer, dump
		 * this packet and reuse the old buffer.
		 */
		ipoib_dbg(priv, "failed to allocate receive buffer %d\n", wr_id);
		++dev->stats.rx_dropped;
		goto repost;
	}

	ipoib_cm_dma_unmap_rx(priv, frags, rx_ring[wr_id].mapping);
	memcpy(rx_ring[wr_id].mapping, mapping, (frags + 1) * sizeof(*mapping));

	ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
		       wc->byte_len, wc->slid);

	skb_put_frags(skb, IPOIB_CM_HEAD_SIZE, wc->byte_len, newskb);

copied:
	skb->protocol = ((struct ipoib_header *) skb->data)->proto;
	skb_add_pseudo_hdr(skb);

	++dev->stats.rx_packets;
	dev->stats.rx_bytes += skb->len;

	skb->dev = dev;
	/* XXX get correct PACKET_ type here */
	skb->pkt_type = PACKET_HOST;
	netif_receive_skb(skb);

repost:
	if (has_srq) {
		if (unlikely(ipoib_cm_post_receive_srq(dev, wr_id)))
			ipoib_warn(priv, "ipoib_cm_post_receive_srq failed "
				   "for buf %d\n", wr_id);
	} else {
		if (unlikely(ipoib_cm_post_receive_nonsrq(dev, p,
							  &priv->cm.rx_wr,
							  priv->cm.rx_sge,
							  wr_id))) {
			--p->recv_count;
			ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq failed "
				   "for buf %d\n", wr_id);
		}
	}
}

static inline int post_send(struct ipoib_dev_priv *priv,
			    struct ipoib_cm_tx *tx,
			    unsigned int wr_id,
			    struct ipoib_tx_buf *tx_req)
{
	ipoib_build_sge(priv, tx_req);

	priv->tx_wr.wr.wr_id = wr_id | IPOIB_OP_CM;

	return ib_post_send(tx->qp, &priv->tx_wr.wr, NULL);
}

void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct ipoib_tx_buf *tx_req;
	int rc;
	unsigned int usable_sge = tx->max_send_sge - !!skb_headlen(skb);

	if (unlikely(skb->len > tx->mtu)) {
		ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
			   skb->len, tx->mtu);
		++dev->stats.tx_dropped;
		++dev->stats.tx_errors;
		ipoib_cm_skb_too_long(dev, skb, tx->mtu - IPOIB_ENCAP_LEN);
		return;
	}
	if (skb_shinfo(skb)->nr_frags > usable_sge) {
		if (skb_linearize(skb) < 0) {
			ipoib_warn(priv, "skb could not be linearized\n");
			++dev->stats.tx_dropped;
			++dev->stats.tx_errors;
			dev_kfree_skb_any(skb);
			return;
		}
		/* Does skb_linearize return ok without reducing nr_frags? */
		if (skb_shinfo(skb)->nr_frags > usable_sge) {
			ipoib_warn(priv, "too many frags after skb linearize\n");
			++dev->stats.tx_dropped;
			++dev->stats.tx_errors;
			dev_kfree_skb_any(skb);
			return;
		}
	}
	ipoib_dbg_data(priv, "sending packet: head 0x%x length %d connection 0x%x\n",
		       tx->tx_head, skb->len, tx->qp->qp_num);

	/*
	 * We put the skb into the tx_ring _before_ we call post_send()
	 * because it's entirely possible that the completion handler will
	 * run before we execute anything after the post_send().  That
	 * means we have to make sure everything is properly recorded and
	 * our state is consistent before we call post_send().
	 */
	tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)];
	tx_req->skb = skb;

	if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) {
		++dev->stats.tx_errors;
		dev_kfree_skb_any(skb);
		return;
	}

	if ((priv->global_tx_head - priv->global_tx_tail) ==
	    ipoib_sendq_size - 1) {
		ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n",
			  tx->qp->qp_num);
		netif_stop_queue(dev);
	}

	skb_orphan(skb);
	skb_dst_drop(skb);

	if (netif_queue_stopped(dev)) {
		rc = ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP |
				      IB_CQ_REPORT_MISSED_EVENTS);
		if (unlikely(rc < 0))
			ipoib_warn(priv, "IPoIB/CM:request notify on send CQ failed\n");
		else if (rc)
			napi_schedule(&priv->send_napi);
	}

	rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1), tx_req);
	if (unlikely(rc)) {
		ipoib_warn(priv, "IPoIB/CM:post_send failed, error %d\n", rc);
		++dev->stats.tx_errors;
		ipoib_dma_unmap_tx(priv, tx_req);
		dev_kfree_skb_any(skb);

		if (netif_queue_stopped(dev))
			netif_wake_queue(dev);
	} else {
		netif_trans_update(dev);
		++tx->tx_head;
		++priv->global_tx_head;
	}
}
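
/*
 * TX ring accounting: tx_head/global_tx_head are advanced above only after
 * a successful post_send(), and the matching tail counters advance in
 * ipoib_cm_handle_tx_wc() below.  The queue is stopped once the ring holds
 * ipoib_sendq_size - 1 outstanding sends and woken again from the
 * completion path when no more than half the ring is in flight; indexing
 * with "& (ipoib_sendq_size - 1)" relies on the ring size being a power
 * of two.
 */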

void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct ipoib_cm_tx *tx = wc->qp->qp_context;
	unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM;
	struct ipoib_tx_buf *tx_req;
	unsigned long flags;

	ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n",
		       wr_id, wc->status);

	if (unlikely(wr_id >= ipoib_sendq_size)) {
		ipoib_warn(priv, "cm send completion event with wrid %d (> %d)\n",
			   wr_id, ipoib_sendq_size);
		return;
	}

	tx_req = &tx->tx_ring[wr_id];

	ipoib_dma_unmap_tx(priv, tx_req);

	/* FIXME: is this right? Shouldn't we only increment on success? */
	++dev->stats.tx_packets;
	dev->stats.tx_bytes += tx_req->skb->len;

	dev_kfree_skb_any(tx_req->skb);

	netif_tx_lock(dev);

	++tx->tx_tail;
	++priv->global_tx_tail;

	if (unlikely(netif_queue_stopped(dev) &&
		     ((priv->global_tx_head - priv->global_tx_tail) <=
		      ipoib_sendq_size >> 1) &&
		     test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)))
		netif_wake_queue(dev);

	if (wc->status != IB_WC_SUCCESS &&
	    wc->status != IB_WC_WR_FLUSH_ERR) {
		struct ipoib_neigh *neigh;

		/* IB_WC[_RNR]_RETRY_EXC_ERR error is part of the life cycle,
		 * so don't make waves.
		 */
		if (wc->status == IB_WC_RNR_RETRY_EXC_ERR ||
		    wc->status == IB_WC_RETRY_EXC_ERR)
			ipoib_dbg(priv,
				  "%s: failed cm send event (status=%d, wrid=%d vend_err %#x)\n",
				  __func__, wc->status, wr_id, wc->vendor_err);
		else
			ipoib_warn(priv,
				   "%s: failed cm send event (status=%d, wrid=%d vend_err %#x)\n",
				   __func__, wc->status, wr_id, wc->vendor_err);

		spin_lock_irqsave(&priv->lock, flags);
		neigh = tx->neigh;

		if (neigh) {
			neigh->cm = NULL;
			ipoib_neigh_free(neigh);

			tx->neigh = NULL;
		}

		if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
			list_move(&tx->list, &priv->cm.reap_list);
			queue_work(priv->wq, &priv->cm.reap_task);
		}

		clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags);

		spin_unlock_irqrestore(&priv->lock, flags);
	}

	netif_tx_unlock(dev);
}

int ipoib_cm_dev_open(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	int ret;

	if (!IPOIB_CM_SUPPORTED(dev->dev_addr))
		return 0;

	priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, dev);
	if (IS_ERR(priv->cm.id)) {
		pr_warn("%s: failed to create CM ID\n", priv->ca->name);
		ret = PTR_ERR(priv->cm.id);
		goto err_cm;
	}

	ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num),
			   0);
	if (ret) {
		pr_warn("%s: failed to listen on ID 0x%llx\n", priv->ca->name,
			IPOIB_CM_IETF_ID | priv->qp->qp_num);
		goto err_listen;
	}

	return 0;

err_listen:
	ib_destroy_cm_id(priv->cm.id);
err_cm:
	priv->cm.id = NULL;
	return ret;
}

static void ipoib_cm_free_rx_reap_list(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct ipoib_cm_rx *rx, *n;
	LIST_HEAD(list);

	spin_lock_irq(&priv->lock);
	list_splice_init(&priv->cm.rx_reap_list, &list);
	spin_unlock_irq(&priv->lock);

	list_for_each_entry_safe(rx, n, &list, list) {
		ib_destroy_cm_id(rx->id);
		ib_destroy_qp(rx->qp);
		if (!ipoib_cm_has_srq(dev)) {
			ipoib_cm_free_rx_ring(priv->dev, rx->rx_ring);
			spin_lock_irq(&priv->lock);
			--priv->cm.nonsrq_conn_qp;
			spin_unlock_irq(&priv->lock);
		}
		kfree(rx);
	}
}

void ipoib_cm_dev_stop(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct ipoib_cm_rx *p;
	unsigned long begin;
	int ret;

	if (!IPOIB_CM_SUPPORTED(dev->dev_addr) || !priv->cm.id)
		return;

	ib_destroy_cm_id(priv->cm.id);
	priv->cm.id = NULL;

	spin_lock_irq(&priv->lock);
	while (!list_empty(&priv->cm.passive_ids)) {
		p = list_entry(priv->cm.passive_ids.next, typeof(*p), list);
		list_move(&p->list, &priv->cm.rx_error_list);
		p->state = IPOIB_CM_RX_ERROR;
		spin_unlock_irq(&priv->lock);
		ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
		if (ret)
			ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
		spin_lock_irq(&priv->lock);
	}

	/* Wait for all RX to be drained */
	begin = jiffies;

	while (!list_empty(&priv->cm.rx_error_list) ||
	       !list_empty(&priv->cm.rx_flush_list) ||
	       !list_empty(&priv->cm.rx_drain_list)) {
		if (time_after(jiffies, begin + 5 * HZ)) {
			ipoib_warn(priv, "RX drain timing out\n");

			/*
			 * assume the HW is wedged and just free up everything.
			 */
			list_splice_init(&priv->cm.rx_flush_list,
					 &priv->cm.rx_reap_list);
			list_splice_init(&priv->cm.rx_error_list,
					 &priv->cm.rx_reap_list);
			list_splice_init(&priv->cm.rx_drain_list,
					 &priv->cm.rx_reap_list);
			break;
		}
		spin_unlock_irq(&priv->lock);
		usleep_range(1000, 2000);
		ipoib_drain_cq(dev);
		spin_lock_irq(&priv->lock);
	}

	spin_unlock_irq(&priv->lock);

	ipoib_cm_free_rx_reap_list(dev);

	cancel_delayed_work(&priv->cm.stale_task);
}

static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id,
				const struct ib_cm_event *event)
{
	struct ipoib_cm_tx *p = cm_id->context;
	struct ipoib_dev_priv *priv = ipoib_priv(p->dev);
	struct ipoib_cm_data *data = event->private_data;
	struct sk_buff_head skqueue;
	struct ib_qp_attr qp_attr;
	int qp_attr_mask, ret;
	struct sk_buff *skb;

	p->mtu = be32_to_cpu(data->mtu);

	if (p->mtu <= IPOIB_ENCAP_LEN) {
		ipoib_warn(priv, "Rejecting connection: mtu %d <= %d\n",
			   p->mtu, IPOIB_ENCAP_LEN);
		return -EINVAL;
	}

	qp_attr.qp_state = IB_QPS_RTR;
	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret);
		return ret;
	}

	qp_attr.rq_psn = 0 /* FIXME */;
	ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
		return ret;
	}

	qp_attr.qp_state = IB_QPS_RTS;
	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
		return ret;
	}
	ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
		return ret;
	}

	skb_queue_head_init(&skqueue);

	netif_tx_lock_bh(p->dev);
	spin_lock_irq(&priv->lock);
	set_bit(IPOIB_FLAG_OPER_UP, &p->flags);
	if (p->neigh)
		while ((skb = __skb_dequeue(&p->neigh->queue)))
			__skb_queue_tail(&skqueue, skb);
	spin_unlock_irq(&priv->lock);
	netif_tx_unlock_bh(p->dev);

	while ((skb = __skb_dequeue(&skqueue))) {
		skb->dev = p->dev;
		ret = dev_queue_xmit(skb);
		if (ret)
			ipoib_warn(priv, "%s:dev_queue_xmit failed to re-queue packet, ret:%d\n",
				   __func__, ret);
	}

	ret = ib_send_cm_rtu(cm_id, NULL, 0);
	if (ret) {
		ipoib_warn(priv, "failed to send RTU: %d\n", ret);
		return ret;
	}
	return 0;
}

static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_cm_tx *tx)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct ib_qp_init_attr attr = {
		.send_cq = priv->send_cq,
		.recv_cq = priv->recv_cq,
		.srq = priv->cm.srq,
		.cap.max_send_wr = ipoib_sendq_size,
		.cap.max_send_sge = 1,
		.sq_sig_type = IB_SIGNAL_ALL_WR,
		.qp_type = IB_QPT_RC,
		.qp_context = tx,
		.create_flags = 0
	};
	struct ib_qp *tx_qp;

	if (dev->features & NETIF_F_SG)
		attr.cap.max_send_sge = min_t(u32, priv->ca->attrs.max_send_sge,
					      MAX_SKB_FRAGS + 1);

	tx_qp = ib_create_qp(priv->pd, &attr);
	tx->max_send_sge = attr.cap.max_send_sge;
	return tx_qp;
}

static int ipoib_cm_send_req(struct net_device *dev,
			     struct ib_cm_id *id, struct ib_qp *qp,
			     u32 qpn,
			     struct sa_path_rec *pathrec)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct ipoib_cm_data data = {};
	struct ib_cm_req_param req = {};

	data.qpn = cpu_to_be32(priv->qp->qp_num);
	data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE);

	req.primary_path = pathrec;
	req.alternate_path = NULL;
	req.service_id = cpu_to_be64(IPOIB_CM_IETF_ID | qpn);
	req.qp_num = qp->qp_num;
	req.qp_type = qp->qp_type;
	req.private_data = &data;
	req.private_data_len = sizeof(data);
	req.flow_control = 0;

	req.starting_psn = 0; /* FIXME */

	/*
	 * Pick some arbitrary defaults here; we could make these
	 * module parameters if anyone cared about setting them.
	 */
	req.responder_resources = 4;
	req.remote_cm_response_timeout = 20;
	req.local_cm_response_timeout = 20;
	req.retry_count = 0; /* RFC draft warns against retries */
	req.rnr_retry_count = 0; /* RFC draft warns against retries */
	req.max_cm_retries = 15;
	req.srq = ipoib_cm_has_srq(dev);
	return ib_send_cm_req(id, &req);
}

static int ipoib_cm_modify_tx_init(struct net_device *dev,
				   struct ib_cm_id *cm_id, struct ib_qp *qp)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct ib_qp_attr qp_attr;
	int qp_attr_mask, ret;

	ret = ib_find_pkey(priv->ca, priv->port, priv->pkey, &qp_attr.pkey_index);
	if (ret) {
		ipoib_warn(priv, "pkey 0x%x not found: %d\n", priv->pkey, ret);
		return ret;
	}

	qp_attr.qp_state = IB_QPS_INIT;
	qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE;
	qp_attr.port_num = priv->port;
	qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT;

	ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
	if (ret) {
		ipoib_warn(priv, "failed to modify tx QP to INIT: %d\n", ret);
		return ret;
	}
	return 0;
}

static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn,
			    struct sa_path_rec *pathrec)
{
	struct ipoib_dev_priv *priv = ipoib_priv(p->dev);
	unsigned int noio_flag;
	int ret;

	noio_flag = memalloc_noio_save();
	p->tx_ring = vzalloc(array_size(ipoib_sendq_size, sizeof(*p->tx_ring)));
	if (!p->tx_ring) {
		memalloc_noio_restore(noio_flag);
		ret = -ENOMEM;
		goto err_tx;
	}

	p->qp = ipoib_cm_create_tx_qp(p->dev, p);
	memalloc_noio_restore(noio_flag);
	if (IS_ERR(p->qp)) {
		ret = PTR_ERR(p->qp);
		ipoib_warn(priv, "failed to create tx qp: %d\n", ret);
		goto err_qp;
	}

	p->id = ib_create_cm_id(priv->ca, ipoib_cm_tx_handler, p);
	if (IS_ERR(p->id)) {
		ret = PTR_ERR(p->id);
		ipoib_warn(priv, "failed to create tx cm id: %d\n", ret);
		goto err_id;
	}

	ret = ipoib_cm_modify_tx_init(p->dev, p->id, p->qp);
	if (ret) {
		ipoib_warn(priv, "failed to modify tx qp to rtr: %d\n", ret);
		goto err_modify_send;
	}

	ret = ipoib_cm_send_req(p->dev, p->id, p->qp, qpn, pathrec);
	if (ret) {
		ipoib_warn(priv, "failed to send cm req: %d\n", ret);
		goto err_modify_send;
	}

	ipoib_dbg(priv, "Request connection 0x%x for gid %pI6 qpn 0x%x\n",
		  p->qp->qp_num, pathrec->dgid.raw, qpn);

	return 0;

err_modify_send:
	ib_destroy_cm_id(p->id);
err_id:
	p->id = NULL;
	ib_destroy_qp(p->qp);
err_qp:
	p->qp = NULL;
	vfree(p->tx_ring);
err_tx:
	return ret;
}
1202*4882a593Smuzhiyun
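/*
 * Tear down a connected-mode TX context: destroy the CM ID, wait up to
 * five seconds for outstanding sends to complete, then unmap and free any
 * still-pending send buffers before destroying the QP and the ring.
 */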
static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
{
	struct ipoib_dev_priv *priv = ipoib_priv(p->dev);
	struct ipoib_tx_buf *tx_req;
	unsigned long begin;

	ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n",
		  p->qp ? p->qp->qp_num : 0, p->tx_head, p->tx_tail);

	if (p->id)
		ib_destroy_cm_id(p->id);

	if (p->tx_ring) {
		/* Wait for all sends to complete */
		begin = jiffies;
		while ((int) p->tx_tail - (int) p->tx_head < 0) {
			if (time_after(jiffies, begin + 5 * HZ)) {
				ipoib_warn(priv, "timing out; %d sends not completed\n",
					   p->tx_head - p->tx_tail);
				goto timeout;
			}

			usleep_range(1000, 2000);
		}
	}

timeout:

	while ((int) p->tx_tail - (int) p->tx_head < 0) {
		tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
		ipoib_dma_unmap_tx(priv, tx_req);
		dev_kfree_skb_any(tx_req->skb);
		netif_tx_lock_bh(p->dev);
		++p->tx_tail;
		++priv->global_tx_tail;
		if (unlikely((priv->global_tx_head - priv->global_tx_tail) <=
			     ipoib_sendq_size >> 1) &&
		    netif_queue_stopped(p->dev) &&
		    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
			netif_wake_queue(p->dev);
		netif_tx_unlock_bh(p->dev);
	}

	if (p->qp)
		ib_destroy_qp(p->qp);

	vfree(p->tx_ring);
	kfree(p);
}

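/*
 * CM event handler for the active (TX) side of a connection.  DREQs are
 * acknowledged with a DREP, REPs complete connection establishment via
 * ipoib_cm_rep_handler(), and errors detach the neighbour and queue the
 * connection for reaping.
 */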
static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
			       const struct ib_cm_event *event)
{
	struct ipoib_cm_tx *tx = cm_id->context;
	struct ipoib_dev_priv *priv = ipoib_priv(tx->dev);
	struct net_device *dev = priv->dev;
	struct ipoib_neigh *neigh;
	unsigned long flags;
	int ret;

	switch (event->event) {
	case IB_CM_DREQ_RECEIVED:
		ipoib_dbg(priv, "DREQ received.\n");
		ib_send_cm_drep(cm_id, NULL, 0);
		break;
	case IB_CM_REP_RECEIVED:
		ipoib_dbg(priv, "REP received.\n");
		ret = ipoib_cm_rep_handler(cm_id, event);
		if (ret)
			ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED,
				       NULL, 0, NULL, 0);
		break;
	case IB_CM_REQ_ERROR:
	case IB_CM_REJ_RECEIVED:
	case IB_CM_TIMEWAIT_EXIT:
		ipoib_dbg(priv, "CM error %d.\n", event->event);
		netif_tx_lock_bh(dev);
		spin_lock_irqsave(&priv->lock, flags);
		neigh = tx->neigh;

		if (neigh) {
			neigh->cm = NULL;
			ipoib_neigh_free(neigh);

			tx->neigh = NULL;
		}

		if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
			list_move(&tx->list, &priv->cm.reap_list);
			queue_work(priv->wq, &priv->cm.reap_task);
		}

		spin_unlock_irqrestore(&priv->lock, flags);
		netif_tx_unlock_bh(dev);
		break;
	default:
		break;
	}

	return 0;
}

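/*
 * Allocate a TX context for a neighbour and queue it on cm.start_list;
 * the actual QP/CM setup happens later in the ipoib_cm_tx_start() work
 * item, presumably because callers may hold locks (hence GFP_ATOMIC).
 */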
struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path,
				       struct ipoib_neigh *neigh)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct ipoib_cm_tx *tx;

	tx = kzalloc(sizeof(*tx), GFP_ATOMIC);
	if (!tx)
		return NULL;

	neigh->cm = tx;
	tx->neigh = neigh;
	tx->dev = dev;
	list_add(&tx->list, &priv->cm.start_list);
	set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags);
	queue_work(priv->wq, &priv->cm.start_task);
	return tx;
}

void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx)
{
	struct ipoib_dev_priv *priv = ipoib_priv(tx->dev);
	unsigned long flags;

	if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
		spin_lock_irqsave(&priv->lock, flags);
		list_move(&tx->list, &priv->cm.reap_list);
		queue_work(priv->wq, &priv->cm.reap_task);
		ipoib_dbg(priv, "Reap connection for gid %pI6\n",
			  tx->neigh->daddr + 4);
		tx->neigh = NULL;
		spin_unlock_irqrestore(&priv->lock, flags);
	}
}

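/*
 * The 20-byte IPoIB hardware address starts with 4 bytes of flags and
 * QPN; the destination GID follows at this offset (hence the
 * "daddr + 4" used when printing GIDs above).
 */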
#define QPN_AND_OPTIONS_OFFSET 4

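/*
 * Work handler that drains cm.start_list: for each queued connection it
 * looks up the path while holding priv->lock and the tx lock, then drops
 * both locks around the sleeping QP/CM setup in ipoib_cm_tx_init().
 */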
static void ipoib_cm_tx_start(struct work_struct *work)
{
	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
						   cm.start_task);
	struct net_device *dev = priv->dev;
	struct ipoib_neigh *neigh;
	struct ipoib_cm_tx *p;
	unsigned long flags;
	struct ipoib_path *path;
	int ret;

	struct sa_path_rec pathrec;
	u32 qpn;

	netif_tx_lock_bh(dev);
	spin_lock_irqsave(&priv->lock, flags);

	while (!list_empty(&priv->cm.start_list)) {
		p = list_entry(priv->cm.start_list.next, typeof(*p), list);
		list_del_init(&p->list);
		neigh = p->neigh;

		qpn = IPOIB_QPN(neigh->daddr);
		/*
		 * As long as the lookup is done while holding these two
		 * locks, the path's existence implies that it is valid.
		 */
		path = __path_find(dev, neigh->daddr + QPN_AND_OPTIONS_OFFSET);
		if (!path) {
			pr_info("%s ignore invalid path %pI6\n",
				__func__,
				neigh->daddr + QPN_AND_OPTIONS_OFFSET);
			goto free_neigh;
		}
		memcpy(&pathrec, &path->pathrec, sizeof(pathrec));

		spin_unlock_irqrestore(&priv->lock, flags);
		netif_tx_unlock_bh(dev);

		ret = ipoib_cm_tx_init(p, qpn, &pathrec);

		netif_tx_lock_bh(dev);
		spin_lock_irqsave(&priv->lock, flags);

		if (ret) {
free_neigh:
			neigh = p->neigh;
			if (neigh) {
				neigh->cm = NULL;
				ipoib_neigh_free(neigh);
			}
			list_del(&p->list);
			kfree(p);
		}
	}

	spin_unlock_irqrestore(&priv->lock, flags);
	netif_tx_unlock_bh(dev);
}

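/*
 * Work handler that tears down every connection queued on cm.reap_list.
 * ipoib_cm_tx_destroy() may sleep, so both locks are dropped around each
 * call.
 */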
static void ipoib_cm_tx_reap(struct work_struct *work)
{
	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
						   cm.reap_task);
	struct net_device *dev = priv->dev;
	struct ipoib_cm_tx *p;
	unsigned long flags;

	netif_tx_lock_bh(dev);
	spin_lock_irqsave(&priv->lock, flags);

	while (!list_empty(&priv->cm.reap_list)) {
		p = list_entry(priv->cm.reap_list.next, typeof(*p), list);
		list_del_init(&p->list);
		spin_unlock_irqrestore(&priv->lock, flags);
		netif_tx_unlock_bh(dev);
		ipoib_cm_tx_destroy(p);
		netif_tx_lock_bh(dev);
		spin_lock_irqsave(&priv->lock, flags);
	}

	spin_unlock_irqrestore(&priv->lock, flags);
	netif_tx_unlock_bh(dev);
}

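/*
 * Work handler that drains cm.skb_queue and, for each queued skb, sends
 * an ICMP "fragmentation needed" (or ICMPv6 "packet too big") error
 * advertising the current multicast MTU before freeing the skb.
 */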
static void ipoib_cm_skb_reap(struct work_struct *work)
{
	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
						   cm.skb_task);
	struct net_device *dev = priv->dev;
	struct sk_buff *skb;
	unsigned long flags;
	unsigned int mtu = priv->mcast_mtu;

	netif_tx_lock_bh(dev);
	spin_lock_irqsave(&priv->lock, flags);

	while ((skb = skb_dequeue(&priv->cm.skb_queue))) {
		spin_unlock_irqrestore(&priv->lock, flags);
		netif_tx_unlock_bh(dev);

		if (skb->protocol == htons(ETH_P_IP)) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		}
#endif
		dev_kfree_skb_any(skb);

		netif_tx_lock_bh(dev);
		spin_lock_irqsave(&priv->lock, flags);
	}

	spin_unlock_irqrestore(&priv->lock, flags);
	netif_tx_unlock_bh(dev);
}

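/*
 * Called when an skb is too large for the connected-mode path MTU:
 * update the destination's PMTU and queue the skb so ipoib_cm_skb_reap()
 * can generate the ICMP error from workqueue context.
 */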
void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb,
			   unsigned int mtu)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	int e = skb_queue_empty(&priv->cm.skb_queue);

	skb_dst_update_pmtu(skb, mtu);

	skb_queue_tail(&priv->cm.skb_queue, skb);
	if (e)
		queue_work(priv->wq, &priv->cm.skb_task);
}

static void ipoib_cm_rx_reap(struct work_struct *work)
{
	ipoib_cm_free_rx_reap_list(container_of(work, struct ipoib_dev_priv,
						cm.rx_reap_task)->dev);
}

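/*
 * Delayed work that ages out passive (RX) connections: cm.passive_ids is
 * kept in LRU order, so connections idle for longer than
 * IPOIB_CM_RX_TIMEOUT are moved to the error list and their QPs put into
 * the error state.  The work rearms itself while entries remain.
 */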
static void ipoib_cm_stale_task(struct work_struct *work)
{
	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
						   cm.stale_task.work);
	struct ipoib_cm_rx *p;
	int ret;

	spin_lock_irq(&priv->lock);
	while (!list_empty(&priv->cm.passive_ids)) {
		/*
		 * List is sorted by LRU, start from tail,
		 * stop when we see a recently used entry.
		 */
		p = list_entry(priv->cm.passive_ids.prev, typeof(*p), list);
		if (time_before_eq(jiffies, p->jiffies + IPOIB_CM_RX_TIMEOUT))
			break;
		list_move(&p->list, &priv->cm.rx_error_list);
		p->state = IPOIB_CM_RX_ERROR;
		spin_unlock_irq(&priv->lock);
		ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
		if (ret)
			ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
		spin_lock_irq(&priv->lock);
	}

	if (!list_empty(&priv->cm.passive_ids))
		queue_delayed_work(priv->wq,
				   &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
	spin_unlock_irq(&priv->lock);
}

static ssize_t show_mode(struct device *d, struct device_attribute *attr,
			 char *buf)
{
	struct net_device *dev = to_net_dev(d);
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
		return sprintf(buf, "connected\n");
	else
		return sprintf(buf, "datagram\n");
}

static ssize_t set_mode(struct device *d, struct device_attribute *attr,
			const char *buf, size_t count)
{
	struct net_device *dev = to_net_dev(d);
	int ret;

	if (!rtnl_trylock())
		return restart_syscall();

	if (dev->reg_state != NETREG_REGISTERED) {
		rtnl_unlock();
		return -EPERM;
	}

	ret = ipoib_set_mode(dev, buf);

	/*
	 * ipoib_set_mode() normally returns with the rtnl lock still held;
	 * only when it returns -EBUSY has the lock already been dropped,
	 * so there is no need to call rtnl_unlock() in that case.
	 */
	if (ret != -EBUSY)
		rtnl_unlock();

	return (!ret || ret == -EBUSY) ? count : ret;
}

static DEVICE_ATTR(mode, S_IWUSR | S_IRUGO, show_mode, set_mode);

int ipoib_cm_add_mode_attr(struct net_device *dev)
{
	return device_create_file(&dev->dev, &dev_attr_mode);
}

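/*
 * Try to allocate a shared receive queue sized for ipoib_recvq_size
 * entries.  On failure (including -EOPNOTSUPP from devices without SRQ
 * support) priv->cm.srq is left NULL and connected mode falls back to
 * per-connection receive queues.
 */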
static void ipoib_cm_create_srq(struct net_device *dev, int max_sge)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	struct ib_srq_init_attr srq_init_attr = {
		.srq_type = IB_SRQT_BASIC,
		.attr = {
			.max_wr  = ipoib_recvq_size,
			.max_sge = max_sge
		}
	};

	priv->cm.srq = ib_create_srq(priv->pd, &srq_init_attr);
	if (IS_ERR(priv->cm.srq)) {
		if (PTR_ERR(priv->cm.srq) != -EOPNOTSUPP)
			pr_warn("%s: failed to allocate SRQ, error %ld\n",
				priv->ca->name, PTR_ERR(priv->cm.srq));
		priv->cm.srq = NULL;
		return;
	}

	priv->cm.srq_ring = vzalloc(array_size(ipoib_recvq_size,
					       sizeof(*priv->cm.srq_ring)));
	if (!priv->cm.srq_ring) {
		ib_destroy_srq(priv->cm.srq);
		priv->cm.srq = NULL;
		return;
	}
}

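/*
 * Per-device connected-mode initialisation: set up the CM lists and work
 * items, create the SRQ (if supported) and pre-post its receive buffers,
 * and set the connected-mode flag byte in the hardware address.
 */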
int ipoib_cm_dev_init(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);
	int max_srq_sge, i;

	INIT_LIST_HEAD(&priv->cm.passive_ids);
	INIT_LIST_HEAD(&priv->cm.reap_list);
	INIT_LIST_HEAD(&priv->cm.start_list);
	INIT_LIST_HEAD(&priv->cm.rx_error_list);
	INIT_LIST_HEAD(&priv->cm.rx_flush_list);
	INIT_LIST_HEAD(&priv->cm.rx_drain_list);
	INIT_LIST_HEAD(&priv->cm.rx_reap_list);
	INIT_WORK(&priv->cm.start_task, ipoib_cm_tx_start);
	INIT_WORK(&priv->cm.reap_task, ipoib_cm_tx_reap);
	INIT_WORK(&priv->cm.skb_task, ipoib_cm_skb_reap);
	INIT_WORK(&priv->cm.rx_reap_task, ipoib_cm_rx_reap);
	INIT_DELAYED_WORK(&priv->cm.stale_task, ipoib_cm_stale_task);

	skb_queue_head_init(&priv->cm.skb_queue);

	ipoib_dbg(priv, "max_srq_sge=%d\n", priv->ca->attrs.max_srq_sge);

	max_srq_sge = min_t(int, IPOIB_CM_RX_SG, priv->ca->attrs.max_srq_sge);
	ipoib_cm_create_srq(dev, max_srq_sge);
	if (ipoib_cm_has_srq(dev)) {
		priv->cm.max_cm_mtu = max_srq_sge * PAGE_SIZE - 0x10;
		priv->cm.num_frags = max_srq_sge;
		ipoib_dbg(priv, "max_cm_mtu = 0x%x, num_frags=%d\n",
			  priv->cm.max_cm_mtu, priv->cm.num_frags);
	} else {
		priv->cm.max_cm_mtu = IPOIB_CM_MTU;
		priv->cm.num_frags = IPOIB_CM_RX_SG;
	}

	ipoib_cm_init_rx_wr(dev, &priv->cm.rx_wr, priv->cm.rx_sge);

	if (ipoib_cm_has_srq(dev)) {
		for (i = 0; i < ipoib_recvq_size; ++i) {
			if (!ipoib_cm_alloc_rx_skb(dev, priv->cm.srq_ring, i,
						   priv->cm.num_frags - 1,
						   priv->cm.srq_ring[i].mapping,
						   GFP_KERNEL)) {
				ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
				ipoib_cm_dev_cleanup(dev);
				return -ENOMEM;
			}

			if (ipoib_cm_post_receive_srq(dev, i)) {
				ipoib_warn(priv, "ipoib_cm_post_receive_srq failed for buf %d\n", i);
				ipoib_cm_dev_cleanup(dev);
				return -EIO;
			}
		}
	}

	priv->dev->dev_addr[0] = IPOIB_FLAGS_RC;
	return 0;
}

void ipoib_cm_dev_cleanup(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = ipoib_priv(dev);

	if (!priv->cm.srq)
		return;

	ipoib_dbg(priv, "Cleanup ipoib connected mode.\n");

	ib_destroy_srq(priv->cm.srq);
	priv->cm.srq = NULL;
	if (!priv->cm.srq_ring)
		return;

	ipoib_cm_free_rx_ring(dev, priv->cm.srq_ring);
	priv->cm.srq_ring = NULL;
}