/*
 * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <linux/kernel.h>
#include <linux/random.h>
#include <linux/export.h>

#include "rds.h"

/*
 * All of connection management is simplified by serializing it through
 * work queues that execute in a connection managing thread.
 *
 * TCP wants to send acks through sendpage() in response to data_ready(),
 * but it needs a process context to do so.
 *
 * The receive paths need to allocate but can't drop packets (!) so we have
 * a thread around to block allocating if the receive fast path sees an
 * allocation failure.
 */

/* Grand Unified Theory of connection life cycle:
 * At any point in time, the connection can be in one of these states:
 * DOWN, CONNECTING, UP, DISCONNECTING, ERROR
 *
 * The following transitions are possible:
 * ANY		 -> ERROR
 * UP		 -> DISCONNECTING
 * ERROR	 -> DISCONNECTING
 * DISCONNECTING -> DOWN
 * DOWN		 -> CONNECTING
 * CONNECTING	 -> UP
 *
 * Transition to state DISCONNECTING/DOWN:
 *  - Inside the shutdown worker; synchronizes with xmit path
 *    through RDS_IN_XMIT, and with connection management callbacks
 *    via c_cm_lock.
 *
 *    For receive callbacks, we rely on the underlying transport
 *    (TCP, IB/RDMA) to provide the necessary synchronisation.
 */
struct workqueue_struct *rds_wq;
EXPORT_SYMBOL_GPL(rds_wq);
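
/*
 * A minimal illustrative sketch (the helper name is hypothetical, and
 * <linux/atomic.h> is assumed to be pulled in via rds.h): the transitions
 * in the table above are enforced with a compare-and-swap on the path's
 * atomic state word, in the style of rds_conn_path_transition().  The
 * swap succeeds only if the state is still the expected source state,
 * so two racing workers can never both move the state machine.
 */
static inline int __maybe_unused
rds_conn_transition_sketch(atomic_t *state, int curr, int new)
{
	/* returns nonzero only when *state was still 'curr' */
	return atomic_cmpxchg(state, curr, new) == curr;
}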

void rds_connect_path_complete(struct rds_conn_path *cp, int curr)
{
	if (!rds_conn_path_transition(cp, curr, RDS_CONN_UP)) {
		printk(KERN_WARNING "%s: Cannot transition to state UP, "
				"current state is %d\n",
				__func__,
				atomic_read(&cp->cp_state));
		rds_conn_path_drop(cp, false);
		return;
	}

	rdsdebug("conn %p for %pI6c to %pI6c complete\n",
		 cp->cp_conn, &cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr);

	cp->cp_reconnect_jiffies = 0;
	set_bit(0, &cp->cp_conn->c_map_queued);
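
	/*
	 * Don't queue work against a connection whose teardown has begun:
	 * rds_destroy_pending() is checked under rcu_read_lock() so the
	 * work below is only scheduled while the connection (and its
	 * transport module) is still live.
	 */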
	rcu_read_lock();
	if (!rds_destroy_pending(cp->cp_conn)) {
		queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
		queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
	}
	rcu_read_unlock();
	cp->cp_conn->c_proposed_version = RDS_PROTOCOL_VERSION;
}
EXPORT_SYMBOL_GPL(rds_connect_path_complete);

void rds_connect_complete(struct rds_connection *conn)
{
	rds_connect_path_complete(&conn->c_path[0], RDS_CONN_CONNECTING);
}
EXPORT_SYMBOL_GPL(rds_connect_complete);

/*
 * This random exponential backoff is relied on to eventually resolve racing
 * connects.
 *
 * If connect attempts race then both parties drop both connections and come
 * here to wait for a random amount of time before trying again.  Eventually
 * the backoff range will be so much greater than the time it takes to
 * establish a connection that one of the pair will establish the connection
 * before the other's random delay fires.
 *
 * Connection attempts that arrive while a connection is already established
 * are also considered to be racing connects.  This lets a connection from
 * a rebooted machine replace an existing stale connection before the transport
 * notices that the connection has failed.
 *
 * We should *always* start with a random backoff; otherwise a broken connection
 * will always take several iterations to be re-established.
 */
void rds_queue_reconnect(struct rds_conn_path *cp)
{
	unsigned long rand;
	struct rds_connection *conn = cp->cp_conn;

	rdsdebug("conn %p for %pI6c to %pI6c reconnect jiffies %lu\n",
		 conn, &conn->c_laddr, &conn->c_faddr,
		 cp->cp_reconnect_jiffies);

	/* let peer with smaller addr initiate reconnect, to avoid duels */
	if (conn->c_trans->t_type == RDS_TRANS_TCP &&
	    rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) >= 0)
		return;

	set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags);
	if (cp->cp_reconnect_jiffies == 0) {
		cp->cp_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies;
		rcu_read_lock();
		if (!rds_destroy_pending(cp->cp_conn))
			queue_delayed_work(rds_wq, &cp->cp_conn_w, 0);
		rcu_read_unlock();
		return;
	}

	get_random_bytes(&rand, sizeof(rand));
	rdsdebug("%lu delay %lu ceil conn %p for %pI6c -> %pI6c\n",
		 rand % cp->cp_reconnect_jiffies, cp->cp_reconnect_jiffies,
		 conn, &conn->c_laddr, &conn->c_faddr);
	rcu_read_lock();
	if (!rds_destroy_pending(cp->cp_conn))
		queue_delayed_work(rds_wq, &cp->cp_conn_w,
				   rand % cp->cp_reconnect_jiffies);
	rcu_read_unlock();

	cp->cp_reconnect_jiffies = min(cp->cp_reconnect_jiffies * 2,
				       rds_sysctl_reconnect_max_jiffies);
}
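
/*
 * An illustrative sketch (hypothetical helper, for exposition only):
 * the ceiling used for the random delay above starts at the sysctl
 * minimum and doubles per failed attempt, capped at the sysctl
 * maximum.  With min = 1s and max = 16s the successive ceilings run
 * 1s, 2s, 4s, 8s, 16s, 16s, ...
 */
static unsigned long __maybe_unused
rds_backoff_ceiling_sketch(unsigned long cur, unsigned long min_j,
			   unsigned long max_j)
{
	if (cur == 0)
		return min_j;		/* first retry fires immediately */
	return min(cur * 2, max_j);	/* then double, capped at max */
}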

void rds_connect_worker(struct work_struct *work)
{
	struct rds_conn_path *cp = container_of(work,
						struct rds_conn_path,
						cp_conn_w.work);
	struct rds_connection *conn = cp->cp_conn;
	int ret;

	if (cp->cp_index > 0 &&
	    rds_addr_cmp(&cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr) >= 0)
		return;
	clear_bit(RDS_RECONNECT_PENDING, &cp->cp_flags);
	ret = rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_CONNECTING);
	if (ret) {
		ret = conn->c_trans->conn_path_connect(cp);
		rdsdebug("conn %p for %pI6c to %pI6c dispatched, ret %d\n",
			 conn, &conn->c_laddr, &conn->c_faddr, ret);

		if (ret) {
			if (rds_conn_path_transition(cp,
						     RDS_CONN_CONNECTING,
						     RDS_CONN_DOWN))
				rds_queue_reconnect(cp);
			else
				rds_conn_path_error(cp, "connect failed\n");
		}
	}
}

void rds_send_worker(struct work_struct *work)
{
	struct rds_conn_path *cp = container_of(work,
						struct rds_conn_path,
						cp_send_w.work);
	int ret;

	if (rds_conn_path_state(cp) == RDS_CONN_UP) {
		clear_bit(RDS_LL_SEND_FULL, &cp->cp_flags);
		ret = rds_send_xmit(cp);
		cond_resched();
		rdsdebug("conn %p ret %d\n", cp->cp_conn, ret);
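		/*
		 * Retry policy: -EAGAIN asks for an immediate requeue,
		 * while -ENOMEM backs off for two jiffies under memory
		 * pressure.  Any other return leaves requeueing to the
		 * next event that schedules this worker.
		 */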
		switch (ret) {
		case -EAGAIN:
			rds_stats_inc(s_send_immediate_retry);
			queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
			break;
		case -ENOMEM:
			rds_stats_inc(s_send_delayed_retry);
			queue_delayed_work(rds_wq, &cp->cp_send_w, 2);
			break;
		default:
			break;
		}
	}
}

void rds_recv_worker(struct work_struct *work)
{
	struct rds_conn_path *cp = container_of(work,
						struct rds_conn_path,
						cp_recv_w.work);
	int ret;

	if (rds_conn_path_state(cp) == RDS_CONN_UP) {
		ret = cp->cp_conn->c_trans->recv_path(cp);
		rdsdebug("conn %p ret %d\n", cp->cp_conn, ret);
		switch (ret) {
		case -EAGAIN:
			rds_stats_inc(s_recv_immediate_retry);
			queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
			break;
		case -ENOMEM:
			rds_stats_inc(s_recv_delayed_retry);
			queue_delayed_work(rds_wq, &cp->cp_recv_w, 2);
			break;
		default:
			break;
		}
	}
}

void rds_shutdown_worker(struct work_struct *work)
{
	struct rds_conn_path *cp = container_of(work,
						struct rds_conn_path,
						cp_down_w);

	rds_conn_shutdown(cp);
}

void rds_threads_exit(void)
{
	destroy_workqueue(rds_wq);
}

int rds_threads_init(void)
{
	rds_wq = create_singlethread_workqueue("krdsd");
	if (!rds_wq)
		return -ENOMEM;

	return 0;
}

/* Compare two IPv6 addresses.  Return 0 if the two addresses are equal.
 * Return 1 if the first is greater.  Return -1 if the second is greater.
 */
int rds_addr_cmp(const struct in6_addr *addr1,
		 const struct in6_addr *addr2)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
	const __be64 *a1, *a2;
	u64 x, y;

	a1 = (__be64 *)addr1;
	a2 = (__be64 *)addr2;

	if (*a1 != *a2) {
		if (be64_to_cpu(*a1) < be64_to_cpu(*a2))
			return -1;
		else
			return 1;
	} else {
		x = be64_to_cpu(*++a1);
		y = be64_to_cpu(*++a2);
		if (x < y)
			return -1;
		else if (x > y)
			return 1;
		else
			return 0;
	}
#else
	u32 a, b;
	int i;

	for (i = 0; i < 4; i++) {
		if (addr1->s6_addr32[i] != addr2->s6_addr32[i]) {
			a = ntohl(addr1->s6_addr32[i]);
			b = ntohl(addr2->s6_addr32[i]);
			if (a < b)
				return -1;
			else if (a > b)
				return 1;
		}
	}
	return 0;
#endif
}
EXPORT_SYMBOL_GPL(rds_addr_cmp);
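
/*
 * Usage sketch (hypothetical helper name, for illustration only): the
 * total order defined by rds_addr_cmp() is what lets exactly one side
 * of an RDS/TCP connection drive reconnects -- see the "smaller addr
 * initiates" check in rds_queue_reconnect() above.
 */
static bool __maybe_unused
rds_local_initiates_sketch(const struct in6_addr *laddr,
			   const struct in6_addr *faddr)
{
	return rds_addr_cmp(laddr, faddr) < 0;
}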