1*4882a593Smuzhiyun // SPDX-License-Identifier: GPL-2.0-or-later
2*4882a593Smuzhiyun /* Handle vlserver selection and rotation.
3*4882a593Smuzhiyun *
4*4882a593Smuzhiyun * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
5*4882a593Smuzhiyun * Written by David Howells (dhowells@redhat.com)
6*4882a593Smuzhiyun */
7*4882a593Smuzhiyun
8*4882a593Smuzhiyun #include <linux/kernel.h>
9*4882a593Smuzhiyun #include <linux/sched.h>
10*4882a593Smuzhiyun #include <linux/sched/signal.h>
11*4882a593Smuzhiyun #include "internal.h"
12*4882a593Smuzhiyun #include "afs_vl.h"
13*4882a593Smuzhiyun
14*4882a593Smuzhiyun /*
15*4882a593Smuzhiyun * Begin an operation on a volume location server.
16*4882a593Smuzhiyun */
afs_begin_vlserver_operation(struct afs_vl_cursor * vc,struct afs_cell * cell,struct key * key)17*4882a593Smuzhiyun bool afs_begin_vlserver_operation(struct afs_vl_cursor *vc, struct afs_cell *cell,
18*4882a593Smuzhiyun struct key *key)
19*4882a593Smuzhiyun {
20*4882a593Smuzhiyun memset(vc, 0, sizeof(*vc));
21*4882a593Smuzhiyun vc->cell = cell;
22*4882a593Smuzhiyun vc->key = key;
23*4882a593Smuzhiyun vc->error = -EDESTADDRREQ;
24*4882a593Smuzhiyun vc->ac.error = SHRT_MAX;
25*4882a593Smuzhiyun
26*4882a593Smuzhiyun if (signal_pending(current)) {
27*4882a593Smuzhiyun vc->error = -EINTR;
28*4882a593Smuzhiyun vc->flags |= AFS_VL_CURSOR_STOP;
29*4882a593Smuzhiyun return false;
30*4882a593Smuzhiyun }
31*4882a593Smuzhiyun
32*4882a593Smuzhiyun return true;
33*4882a593Smuzhiyun }
34*4882a593Smuzhiyun
35*4882a593Smuzhiyun /*
36*4882a593Smuzhiyun * Begin iteration through a server list, starting with the last used server if
37*4882a593Smuzhiyun * possible, or the last recorded good server if not.
38*4882a593Smuzhiyun */
afs_start_vl_iteration(struct afs_vl_cursor * vc)39*4882a593Smuzhiyun static bool afs_start_vl_iteration(struct afs_vl_cursor *vc)
40*4882a593Smuzhiyun {
41*4882a593Smuzhiyun struct afs_cell *cell = vc->cell;
42*4882a593Smuzhiyun unsigned int dns_lookup_count;
43*4882a593Smuzhiyun
44*4882a593Smuzhiyun if (cell->dns_source == DNS_RECORD_UNAVAILABLE ||
45*4882a593Smuzhiyun cell->dns_expiry <= ktime_get_real_seconds()) {
46*4882a593Smuzhiyun dns_lookup_count = smp_load_acquire(&cell->dns_lookup_count);
47*4882a593Smuzhiyun set_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags);
48*4882a593Smuzhiyun afs_queue_cell(cell, afs_cell_trace_get_queue_dns);
49*4882a593Smuzhiyun
50*4882a593Smuzhiyun if (cell->dns_source == DNS_RECORD_UNAVAILABLE) {
51*4882a593Smuzhiyun if (wait_var_event_interruptible(
52*4882a593Smuzhiyun &cell->dns_lookup_count,
53*4882a593Smuzhiyun smp_load_acquire(&cell->dns_lookup_count)
54*4882a593Smuzhiyun != dns_lookup_count) < 0) {
55*4882a593Smuzhiyun vc->error = -ERESTARTSYS;
56*4882a593Smuzhiyun return false;
57*4882a593Smuzhiyun }
58*4882a593Smuzhiyun }
59*4882a593Smuzhiyun
60*4882a593Smuzhiyun /* Status load is ordered after lookup counter load */
61*4882a593Smuzhiyun if (cell->dns_source == DNS_RECORD_UNAVAILABLE) {
62*4882a593Smuzhiyun vc->error = -EDESTADDRREQ;
63*4882a593Smuzhiyun return false;
64*4882a593Smuzhiyun }
65*4882a593Smuzhiyun }
66*4882a593Smuzhiyun
67*4882a593Smuzhiyun read_lock(&cell->vl_servers_lock);
68*4882a593Smuzhiyun vc->server_list = afs_get_vlserverlist(
69*4882a593Smuzhiyun rcu_dereference_protected(cell->vl_servers,
70*4882a593Smuzhiyun lockdep_is_held(&cell->vl_servers_lock)));
71*4882a593Smuzhiyun read_unlock(&cell->vl_servers_lock);
72*4882a593Smuzhiyun if (!vc->server_list->nr_servers)
73*4882a593Smuzhiyun return false;
74*4882a593Smuzhiyun
75*4882a593Smuzhiyun vc->untried = (1UL << vc->server_list->nr_servers) - 1;
76*4882a593Smuzhiyun vc->index = -1;
77*4882a593Smuzhiyun return true;
78*4882a593Smuzhiyun }
79*4882a593Smuzhiyun
80*4882a593Smuzhiyun /*
81*4882a593Smuzhiyun * Select the vlserver to use. May be called multiple times to rotate
82*4882a593Smuzhiyun * through the vlservers.
83*4882a593Smuzhiyun */
afs_select_vlserver(struct afs_vl_cursor * vc)84*4882a593Smuzhiyun bool afs_select_vlserver(struct afs_vl_cursor *vc)
85*4882a593Smuzhiyun {
86*4882a593Smuzhiyun struct afs_addr_list *alist;
87*4882a593Smuzhiyun struct afs_vlserver *vlserver;
88*4882a593Smuzhiyun struct afs_error e;
89*4882a593Smuzhiyun u32 rtt;
90*4882a593Smuzhiyun int error = vc->ac.error, i;
91*4882a593Smuzhiyun
92*4882a593Smuzhiyun _enter("%lx[%d],%lx[%d],%d,%d",
93*4882a593Smuzhiyun vc->untried, vc->index,
94*4882a593Smuzhiyun vc->ac.tried, vc->ac.index,
95*4882a593Smuzhiyun error, vc->ac.abort_code);
96*4882a593Smuzhiyun
97*4882a593Smuzhiyun if (vc->flags & AFS_VL_CURSOR_STOP) {
98*4882a593Smuzhiyun _leave(" = f [stopped]");
99*4882a593Smuzhiyun return false;
100*4882a593Smuzhiyun }
101*4882a593Smuzhiyun
102*4882a593Smuzhiyun vc->nr_iterations++;
103*4882a593Smuzhiyun
104*4882a593Smuzhiyun /* Evaluate the result of the previous operation, if there was one. */
105*4882a593Smuzhiyun switch (error) {
106*4882a593Smuzhiyun case SHRT_MAX:
107*4882a593Smuzhiyun goto start;
108*4882a593Smuzhiyun
109*4882a593Smuzhiyun default:
110*4882a593Smuzhiyun case 0:
111*4882a593Smuzhiyun /* Success or local failure. Stop. */
112*4882a593Smuzhiyun vc->error = error;
113*4882a593Smuzhiyun vc->flags |= AFS_VL_CURSOR_STOP;
114*4882a593Smuzhiyun _leave(" = f [okay/local %d]", vc->ac.error);
115*4882a593Smuzhiyun return false;
116*4882a593Smuzhiyun
117*4882a593Smuzhiyun case -ECONNABORTED:
118*4882a593Smuzhiyun /* The far side rejected the operation on some grounds. This
119*4882a593Smuzhiyun * might involve the server being busy or the volume having been moved.
120*4882a593Smuzhiyun */
121*4882a593Smuzhiyun switch (vc->ac.abort_code) {
122*4882a593Smuzhiyun case AFSVL_IO:
123*4882a593Smuzhiyun case AFSVL_BADVOLOPER:
124*4882a593Smuzhiyun case AFSVL_NOMEM:
125*4882a593Smuzhiyun /* The server went weird. */
126*4882a593Smuzhiyun vc->error = -EREMOTEIO;
127*4882a593Smuzhiyun //write_lock(&vc->cell->vl_servers_lock);
128*4882a593Smuzhiyun //vc->server_list->weird_mask |= 1 << vc->index;
129*4882a593Smuzhiyun //write_unlock(&vc->cell->vl_servers_lock);
130*4882a593Smuzhiyun goto next_server;
131*4882a593Smuzhiyun
132*4882a593Smuzhiyun default:
133*4882a593Smuzhiyun vc->error = afs_abort_to_error(vc->ac.abort_code);
134*4882a593Smuzhiyun goto failed;
135*4882a593Smuzhiyun }
136*4882a593Smuzhiyun
137*4882a593Smuzhiyun case -ERFKILL:
138*4882a593Smuzhiyun case -EADDRNOTAVAIL:
139*4882a593Smuzhiyun case -ENETUNREACH:
140*4882a593Smuzhiyun case -EHOSTUNREACH:
141*4882a593Smuzhiyun case -EHOSTDOWN:
142*4882a593Smuzhiyun case -ECONNREFUSED:
143*4882a593Smuzhiyun case -ETIMEDOUT:
144*4882a593Smuzhiyun case -ETIME:
145*4882a593Smuzhiyun _debug("no conn %d", error);
146*4882a593Smuzhiyun vc->error = error;
147*4882a593Smuzhiyun goto iterate_address;
148*4882a593Smuzhiyun
149*4882a593Smuzhiyun case -ECONNRESET:
150*4882a593Smuzhiyun _debug("call reset");
151*4882a593Smuzhiyun vc->error = error;
152*4882a593Smuzhiyun vc->flags |= AFS_VL_CURSOR_RETRY;
153*4882a593Smuzhiyun goto next_server;
154*4882a593Smuzhiyun
155*4882a593Smuzhiyun case -EOPNOTSUPP:
156*4882a593Smuzhiyun _debug("notsupp");
157*4882a593Smuzhiyun goto next_server;
158*4882a593Smuzhiyun }
159*4882a593Smuzhiyun
160*4882a593Smuzhiyun restart_from_beginning:
161*4882a593Smuzhiyun _debug("restart");
162*4882a593Smuzhiyun afs_end_cursor(&vc->ac);
163*4882a593Smuzhiyun afs_put_vlserverlist(vc->cell->net, vc->server_list);
164*4882a593Smuzhiyun vc->server_list = NULL;
165*4882a593Smuzhiyun if (vc->flags & AFS_VL_CURSOR_RETRIED)
166*4882a593Smuzhiyun goto failed;
167*4882a593Smuzhiyun vc->flags |= AFS_VL_CURSOR_RETRIED;
168*4882a593Smuzhiyun start:
169*4882a593Smuzhiyun _debug("start");
170*4882a593Smuzhiyun
171*4882a593Smuzhiyun if (!afs_start_vl_iteration(vc))
172*4882a593Smuzhiyun goto failed;
173*4882a593Smuzhiyun
174*4882a593Smuzhiyun error = afs_send_vl_probes(vc->cell->net, vc->key, vc->server_list);
175*4882a593Smuzhiyun if (error < 0)
176*4882a593Smuzhiyun goto failed_set_error;
177*4882a593Smuzhiyun
178*4882a593Smuzhiyun pick_server:
179*4882a593Smuzhiyun _debug("pick [%lx]", vc->untried);
180*4882a593Smuzhiyun
181*4882a593Smuzhiyun error = afs_wait_for_vl_probes(vc->server_list, vc->untried);
182*4882a593Smuzhiyun if (error < 0)
183*4882a593Smuzhiyun goto failed_set_error;
184*4882a593Smuzhiyun
185*4882a593Smuzhiyun /* Pick the untried server with the lowest RTT. */
186*4882a593Smuzhiyun vc->index = vc->server_list->preferred;
187*4882a593Smuzhiyun if (test_bit(vc->index, &vc->untried))
188*4882a593Smuzhiyun goto selected_server;
189*4882a593Smuzhiyun
190*4882a593Smuzhiyun vc->index = -1;
191*4882a593Smuzhiyun rtt = U32_MAX;
192*4882a593Smuzhiyun for (i = 0; i < vc->server_list->nr_servers; i++) {
193*4882a593Smuzhiyun struct afs_vlserver *s = vc->server_list->servers[i].server;
194*4882a593Smuzhiyun
195*4882a593Smuzhiyun if (!test_bit(i, &vc->untried) ||
196*4882a593Smuzhiyun !test_bit(AFS_VLSERVER_FL_RESPONDING, &s->flags))
197*4882a593Smuzhiyun continue;
198*4882a593Smuzhiyun if (s->probe.rtt < rtt) {
199*4882a593Smuzhiyun vc->index = i;
200*4882a593Smuzhiyun rtt = s->probe.rtt;
201*4882a593Smuzhiyun }
202*4882a593Smuzhiyun }
203*4882a593Smuzhiyun
204*4882a593Smuzhiyun if (vc->index == -1)
205*4882a593Smuzhiyun goto no_more_servers;
206*4882a593Smuzhiyun
207*4882a593Smuzhiyun selected_server:
208*4882a593Smuzhiyun _debug("use %d", vc->index);
209*4882a593Smuzhiyun __clear_bit(vc->index, &vc->untried);
210*4882a593Smuzhiyun
211*4882a593Smuzhiyun /* We're starting on a different vlserver from the list. We need to
212*4882a593Smuzhiyun * check it, find its address list and probe its capabilities before we
213*4882a593Smuzhiyun * use it.
214*4882a593Smuzhiyun */
215*4882a593Smuzhiyun ASSERTCMP(vc->ac.alist, ==, NULL);
216*4882a593Smuzhiyun vlserver = vc->server_list->servers[vc->index].server;
217*4882a593Smuzhiyun vc->server = vlserver;
218*4882a593Smuzhiyun
219*4882a593Smuzhiyun _debug("USING VLSERVER: %s", vlserver->name);
220*4882a593Smuzhiyun
221*4882a593Smuzhiyun read_lock(&vlserver->lock);
222*4882a593Smuzhiyun alist = rcu_dereference_protected(vlserver->addresses,
223*4882a593Smuzhiyun lockdep_is_held(&vlserver->lock));
224*4882a593Smuzhiyun afs_get_addrlist(alist);
225*4882a593Smuzhiyun read_unlock(&vlserver->lock);
226*4882a593Smuzhiyun
227*4882a593Smuzhiyun memset(&vc->ac, 0, sizeof(vc->ac));
228*4882a593Smuzhiyun
229*4882a593Smuzhiyun if (!vc->ac.alist)
230*4882a593Smuzhiyun vc->ac.alist = alist;
231*4882a593Smuzhiyun else
232*4882a593Smuzhiyun afs_put_addrlist(alist);
233*4882a593Smuzhiyun
234*4882a593Smuzhiyun vc->ac.index = -1;
235*4882a593Smuzhiyun
236*4882a593Smuzhiyun iterate_address:
237*4882a593Smuzhiyun ASSERT(vc->ac.alist);
238*4882a593Smuzhiyun /* Iterate over the current server's address list to try and find an
239*4882a593Smuzhiyun * address on which it will respond to us.
240*4882a593Smuzhiyun */
241*4882a593Smuzhiyun if (!afs_iterate_addresses(&vc->ac))
242*4882a593Smuzhiyun goto next_server;
243*4882a593Smuzhiyun
244*4882a593Smuzhiyun _debug("VL address %d/%d", vc->ac.index, vc->ac.alist->nr_addrs);
245*4882a593Smuzhiyun
246*4882a593Smuzhiyun _leave(" = t %pISpc", &vc->ac.alist->addrs[vc->ac.index].transport);
247*4882a593Smuzhiyun return true;
248*4882a593Smuzhiyun
249*4882a593Smuzhiyun next_server:
250*4882a593Smuzhiyun _debug("next");
251*4882a593Smuzhiyun afs_end_cursor(&vc->ac);
252*4882a593Smuzhiyun goto pick_server;
253*4882a593Smuzhiyun
254*4882a593Smuzhiyun no_more_servers:
255*4882a593Smuzhiyun /* That's all the servers poked to no good effect. Try again if some
256*4882a593Smuzhiyun * of them were busy.
257*4882a593Smuzhiyun */
258*4882a593Smuzhiyun if (vc->flags & AFS_VL_CURSOR_RETRY)
259*4882a593Smuzhiyun goto restart_from_beginning;
260*4882a593Smuzhiyun
261*4882a593Smuzhiyun e.error = -EDESTADDRREQ;
262*4882a593Smuzhiyun e.responded = false;
263*4882a593Smuzhiyun for (i = 0; i < vc->server_list->nr_servers; i++) {
264*4882a593Smuzhiyun struct afs_vlserver *s = vc->server_list->servers[i].server;
265*4882a593Smuzhiyun
266*4882a593Smuzhiyun if (test_bit(AFS_VLSERVER_FL_RESPONDING, &s->flags))
267*4882a593Smuzhiyun e.responded = true;
268*4882a593Smuzhiyun afs_prioritise_error(&e, READ_ONCE(s->probe.error),
269*4882a593Smuzhiyun s->probe.abort_code);
270*4882a593Smuzhiyun }
271*4882a593Smuzhiyun
272*4882a593Smuzhiyun error = e.error;
273*4882a593Smuzhiyun
274*4882a593Smuzhiyun failed_set_error:
275*4882a593Smuzhiyun vc->error = error;
276*4882a593Smuzhiyun failed:
277*4882a593Smuzhiyun vc->flags |= AFS_VL_CURSOR_STOP;
278*4882a593Smuzhiyun afs_end_cursor(&vc->ac);
279*4882a593Smuzhiyun _leave(" = f [failed %d]", vc->error);
280*4882a593Smuzhiyun return false;
281*4882a593Smuzhiyun }
282*4882a593Smuzhiyun
283*4882a593Smuzhiyun /*
284*4882a593Smuzhiyun * Dump cursor state in the case of the error being EDESTADDRREQ.
285*4882a593Smuzhiyun */
afs_vl_dump_edestaddrreq(const struct afs_vl_cursor * vc)286*4882a593Smuzhiyun static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc)
287*4882a593Smuzhiyun {
288*4882a593Smuzhiyun static int count;
289*4882a593Smuzhiyun int i;
290*4882a593Smuzhiyun
291*4882a593Smuzhiyun if (!IS_ENABLED(CONFIG_AFS_DEBUG_CURSOR) || count > 3)
292*4882a593Smuzhiyun return;
293*4882a593Smuzhiyun count++;
294*4882a593Smuzhiyun
295*4882a593Smuzhiyun rcu_read_lock();
296*4882a593Smuzhiyun pr_notice("EDESTADDR occurred\n");
297*4882a593Smuzhiyun pr_notice("VC: ut=%lx ix=%u ni=%hu fl=%hx err=%hd\n",
298*4882a593Smuzhiyun vc->untried, vc->index, vc->nr_iterations, vc->flags, vc->error);
299*4882a593Smuzhiyun
300*4882a593Smuzhiyun if (vc->server_list) {
301*4882a593Smuzhiyun const struct afs_vlserver_list *sl = vc->server_list;
302*4882a593Smuzhiyun pr_notice("VC: SL nr=%u ix=%u\n",
303*4882a593Smuzhiyun sl->nr_servers, sl->index);
304*4882a593Smuzhiyun for (i = 0; i < sl->nr_servers; i++) {
305*4882a593Smuzhiyun const struct afs_vlserver *s = sl->servers[i].server;
306*4882a593Smuzhiyun pr_notice("VC: server %s+%hu fl=%lx E=%hd\n",
307*4882a593Smuzhiyun s->name, s->port, s->flags, s->probe.error);
308*4882a593Smuzhiyun if (s->addresses) {
309*4882a593Smuzhiyun const struct afs_addr_list *a =
310*4882a593Smuzhiyun rcu_dereference(s->addresses);
311*4882a593Smuzhiyun pr_notice("VC: - nr=%u/%u/%u pf=%u\n",
312*4882a593Smuzhiyun a->nr_ipv4, a->nr_addrs, a->max_addrs,
313*4882a593Smuzhiyun a->preferred);
314*4882a593Smuzhiyun pr_notice("VC: - R=%lx F=%lx\n",
315*4882a593Smuzhiyun a->responded, a->failed);
316*4882a593Smuzhiyun if (a == vc->ac.alist)
317*4882a593Smuzhiyun pr_notice("VC: - current\n");
318*4882a593Smuzhiyun }
319*4882a593Smuzhiyun }
320*4882a593Smuzhiyun }
321*4882a593Smuzhiyun
322*4882a593Smuzhiyun pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n",
323*4882a593Smuzhiyun vc->ac.tried, vc->ac.index, vc->ac.abort_code, vc->ac.error,
324*4882a593Smuzhiyun vc->ac.responded, vc->ac.nr_iterations);
325*4882a593Smuzhiyun rcu_read_unlock();
326*4882a593Smuzhiyun }
327*4882a593Smuzhiyun
328*4882a593Smuzhiyun /*
329*4882a593Smuzhiyun * Tidy up a volume location server cursor and unlock the vnode.
330*4882a593Smuzhiyun */
afs_end_vlserver_operation(struct afs_vl_cursor * vc)331*4882a593Smuzhiyun int afs_end_vlserver_operation(struct afs_vl_cursor *vc)
332*4882a593Smuzhiyun {
333*4882a593Smuzhiyun struct afs_net *net = vc->cell->net;
334*4882a593Smuzhiyun
335*4882a593Smuzhiyun if (vc->error == -EDESTADDRREQ ||
336*4882a593Smuzhiyun vc->error == -EADDRNOTAVAIL ||
337*4882a593Smuzhiyun vc->error == -ENETUNREACH ||
338*4882a593Smuzhiyun vc->error == -EHOSTUNREACH)
339*4882a593Smuzhiyun afs_vl_dump_edestaddrreq(vc);
340*4882a593Smuzhiyun
341*4882a593Smuzhiyun afs_end_cursor(&vc->ac);
342*4882a593Smuzhiyun afs_put_vlserverlist(net, vc->server_list);
343*4882a593Smuzhiyun
344*4882a593Smuzhiyun if (vc->error == -ECONNABORTED)
345*4882a593Smuzhiyun vc->error = afs_abort_to_error(vc->ac.abort_code);
346*4882a593Smuzhiyun
347*4882a593Smuzhiyun return vc->error;
348*4882a593Smuzhiyun }
349