// SPDX-License-Identifier: GPL-2.0-or-later
/* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * dlmrecovery.c
 *
 * recovery stuff
 *
 * Copyright (C) 2004 Oracle.  All rights reserved.
 */


#include <linux/module.h>
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
#include <linux/blkdev.h>
#include <linux/socket.h>
#include <linux/inet.h>
#include <linux/timer.h>
#include <linux/kthread.h>
#include <linux/delay.h>


#include "../cluster/heartbeat.h"
#include "../cluster/nodemanager.h"
#include "../cluster/tcp.h"

#include "dlmapi.h"
#include "dlmcommon.h"
#include "dlmdomain.h"

#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_RECOVERY)
#include "../cluster/masklog.h"

static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node);

static int dlm_recovery_thread(void *data);
static int dlm_do_recovery(struct dlm_ctxt *dlm);

static int dlm_pick_recovery_master(struct dlm_ctxt *dlm);
static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node);
static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node);
static int dlm_request_all_locks(struct dlm_ctxt *dlm,
				 u8 request_from, u8 dead_node);
static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm);

static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res);
static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
					const char *lockname, int namelen,
					int total_locks, u64 cookie,
					u8 flags, u8 master);
static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
				    struct dlm_migratable_lockres *mres,
				    u8 send_to,
				    struct dlm_lock_resource *res,
				    int total_locks);
static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
				     struct dlm_lock_resource *res,
				     struct dlm_migratable_lockres *mres);
static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm);
static int dlm_send_all_done_msg(struct dlm_ctxt *dlm,
				 u8 dead_node, u8 send_to);
static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node);
static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
					struct list_head *list, u8 dead_node);
static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
					      u8 dead_node, u8 new_master);
static void dlm_reco_ast(void *astdata);
static void dlm_reco_bast(void *astdata, int blocked_type);
static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st);
static void dlm_request_all_locks_worker(struct dlm_work_item *item,
					 void *data);
static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data);
static int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
				      struct dlm_lock_resource *res,
				      u8 *real_master);

static u64 dlm_get_next_mig_cookie(void);

static DEFINE_SPINLOCK(dlm_reco_state_lock);
static DEFINE_SPINLOCK(dlm_mig_cookie_lock);
static u64 dlm_mig_cookie = 1;

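/* Hand out the next migration cookie used to tag migratable lockres
 * messages; the counter wraps back to 1 so a cookie of 0 is never
 * issued. */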
static u64 dlm_get_next_mig_cookie(void)
{
	u64 c;
	spin_lock(&dlm_mig_cookie_lock);
	c = dlm_mig_cookie;
	if (dlm_mig_cookie == (~0ULL))
		dlm_mig_cookie = 1;
	else
		dlm_mig_cookie++;
	spin_unlock(&dlm_mig_cookie_lock);
	return c;
}

static inline void dlm_set_reco_dead_node(struct dlm_ctxt *dlm,
					  u8 dead_node)
{
	assert_spin_locked(&dlm->spinlock);
	if (dlm->reco.dead_node != dead_node)
		mlog(0, "%s: changing dead_node from %u to %u\n",
		     dlm->name, dlm->reco.dead_node, dead_node);
	dlm->reco.dead_node = dead_node;
}

static inline void dlm_set_reco_master(struct dlm_ctxt *dlm,
				       u8 master)
{
	assert_spin_locked(&dlm->spinlock);
	mlog(0, "%s: changing new_master from %u to %u\n",
	     dlm->name, dlm->reco.new_master, master);
	dlm->reco.new_master = master;
}

static inline void __dlm_reset_recovery(struct dlm_ctxt *dlm)
{
	assert_spin_locked(&dlm->spinlock);
	clear_bit(dlm->reco.dead_node, dlm->recovery_map);
	dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
	dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM);
}

static inline void dlm_reset_recovery(struct dlm_ctxt *dlm)
{
	spin_lock(&dlm->spinlock);
	__dlm_reset_recovery(dlm);
	spin_unlock(&dlm->spinlock);
}

/* Worker function used during recovery. */
void dlm_dispatch_work(struct work_struct *work)
{
	struct dlm_ctxt *dlm =
		container_of(work, struct dlm_ctxt, dispatched_work);
	LIST_HEAD(tmp_list);
	struct dlm_work_item *item, *next;
	dlm_workfunc_t *workfunc;
	int tot=0;

	spin_lock(&dlm->work_lock);
	list_splice_init(&dlm->work_list, &tmp_list);
	spin_unlock(&dlm->work_lock);

	list_for_each_entry(item, &tmp_list, list) {
		tot++;
	}
	mlog(0, "%s: work thread has %d work items\n", dlm->name, tot);

	list_for_each_entry_safe(item, next, &tmp_list, list) {
		workfunc = item->func;
		list_del_init(&item->list);

		/* already have ref on dlm to avoid having
		 * it disappear.  just double-check. */
		BUG_ON(item->dlm != dlm);

		/* this is allowed to sleep and
		 * call network stuff */
		workfunc(item, item->data);

		dlm_put(dlm);
		kfree(item);
	}
}

/*
 * RECOVERY THREAD
 */

void dlm_kick_recovery_thread(struct dlm_ctxt *dlm)
{
	/* wake the recovery thread
	 * this will wake the reco thread in one of three places
	 * 1) sleeping with no recovery happening
	 * 2) sleeping with recovery mastered elsewhere
	 * 3) recovery mastered here, waiting on reco data */

	wake_up(&dlm->dlm_reco_thread_wq);
}

/* Launch the recovery thread */
int dlm_launch_recovery_thread(struct dlm_ctxt *dlm)
{
	mlog(0, "starting dlm recovery thread...\n");

	dlm->dlm_reco_thread_task = kthread_run(dlm_recovery_thread, dlm,
			"dlm_reco-%s", dlm->name);
	if (IS_ERR(dlm->dlm_reco_thread_task)) {
		mlog_errno(PTR_ERR(dlm->dlm_reco_thread_task));
		dlm->dlm_reco_thread_task = NULL;
		return -EINVAL;
	}

	return 0;
}

void dlm_complete_recovery_thread(struct dlm_ctxt *dlm)
{
	if (dlm->dlm_reco_thread_task) {
		mlog(0, "waiting for dlm recovery thread to exit\n");
		kthread_stop(dlm->dlm_reco_thread_task);
		dlm->dlm_reco_thread_task = NULL;
	}
}



/*
 * this is lame, but here's how recovery works...
 * 1) all recovery threads cluster wide will work on recovering
 *    ONE node at a time
 * 2) negotiate who will take over all the locks for the dead node.
 *    that's right... ALL the locks.
 * 3) once a new master is chosen, everyone scans all locks
 *    and moves aside those mastered by the dead guy
 * 4) each of these locks should be locked until recovery is done
 * 5) the new master collects up all of the secondary lock queue info
 *    one lock at a time, forcing each node to communicate back
 *    before continuing
 * 6) each secondary lock queue responds with the full known lock info
 * 7) once the new master has run all its locks, it sends an ALLDONE!
 *    message to everyone
 * 8) upon receiving this message, the secondary queue node unlocks
 *    and responds to the ALLDONE
 * 9) once the new master gets responses from everyone, it unlocks
 *    everything and recovery for this dead node is done
 * 10) go back to 2) while there are still dead nodes
 *
 */
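/* In this file, master election for a recovery session is handled by
 * dlm_pick_recovery_master(), the master's collection loop by
 * dlm_remaster_locks(), and the final broadcast by
 * dlm_send_finalize_reco_message(). */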

static void dlm_print_reco_node_status(struct dlm_ctxt *dlm)
{
	struct dlm_reco_node_data *ndata;
	struct dlm_lock_resource *res;

	mlog(ML_NOTICE, "%s(%d): recovery info, state=%s, dead=%u, master=%u\n",
	     dlm->name, task_pid_nr(dlm->dlm_reco_thread_task),
	     dlm->reco.state & DLM_RECO_STATE_ACTIVE ? "ACTIVE" : "inactive",
	     dlm->reco.dead_node, dlm->reco.new_master);

	list_for_each_entry(ndata, &dlm->reco.node_data, list) {
		char *st = "unknown";
		switch (ndata->state) {
			case DLM_RECO_NODE_DATA_INIT:
				st = "init";
				break;
			case DLM_RECO_NODE_DATA_REQUESTING:
				st = "requesting";
				break;
			case DLM_RECO_NODE_DATA_DEAD:
				st = "dead";
				break;
			case DLM_RECO_NODE_DATA_RECEIVING:
				st = "receiving";
				break;
			case DLM_RECO_NODE_DATA_REQUESTED:
				st = "requested";
				break;
			case DLM_RECO_NODE_DATA_DONE:
				st = "done";
				break;
			case DLM_RECO_NODE_DATA_FINALIZE_SENT:
				st = "finalize-sent";
				break;
			default:
				st = "bad";
				break;
		}
		mlog(ML_NOTICE, "%s: reco state, node %u, state=%s\n",
		     dlm->name, ndata->node_num, st);
	}
	list_for_each_entry(res, &dlm->reco.resources, recovering) {
		mlog(ML_NOTICE, "%s: lockres %.*s on recovering list\n",
		     dlm->name, res->lockname.len, res->lockname.name);
	}
}

#define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000)

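/* Per-domain recovery thread: loop until asked to stop, attempting
 * dlm_do_recovery() whenever the domain is fully joined and sleeping
 * with a periodic timeout in between. */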
static int dlm_recovery_thread(void *data)
{
	int status;
	struct dlm_ctxt *dlm = data;
	unsigned long timeout = msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS);

	mlog(0, "dlm thread running for %s...\n", dlm->name);

	while (!kthread_should_stop()) {
		if (dlm_domain_fully_joined(dlm)) {
			status = dlm_do_recovery(dlm);
			if (status == -EAGAIN) {
				/* do not sleep, recheck immediately. */
				continue;
			}
			if (status < 0)
				mlog_errno(status);
		}

		wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq,
						 kthread_should_stop(),
						 timeout);
	}

	mlog(0, "quitting DLM recovery thread\n");
	return 0;
}

/* returns true when the recovery master has contacted us */
static int dlm_reco_master_ready(struct dlm_ctxt *dlm)
{
	int ready;
	spin_lock(&dlm->spinlock);
	ready = (dlm->reco.new_master != O2NM_INVALID_NODE_NUM);
	spin_unlock(&dlm->spinlock);
	return ready;
}

/* returns true if node is no longer in the domain
 * could be dead or just not joined */
int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node)
{
	int dead;
	spin_lock(&dlm->spinlock);
	dead = !test_bit(node, dlm->domain_map);
	spin_unlock(&dlm->spinlock);
	return dead;
}

/* returns true if the node has already been recovered,
 * i.e. it is no longer set in the recovery map */
static int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node)
{
	int recovered;
	spin_lock(&dlm->spinlock);
	recovered = !test_bit(node, dlm->recovery_map);
	spin_unlock(&dlm->spinlock);
	return recovered;
}


void dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
{
	if (dlm_is_node_dead(dlm, node))
		return;

	printk(KERN_NOTICE "o2dlm: Waiting on the death of node %u in "
	       "domain %s\n", node, dlm->name);

	if (timeout)
		wait_event_timeout(dlm->dlm_reco_thread_wq,
				   dlm_is_node_dead(dlm, node),
				   msecs_to_jiffies(timeout));
	else
		wait_event(dlm->dlm_reco_thread_wq,
			   dlm_is_node_dead(dlm, node));
}

void dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout)
{
	if (dlm_is_node_recovered(dlm, node))
		return;

	printk(KERN_NOTICE "o2dlm: Waiting on the recovery of node %u in "
	       "domain %s\n", node, dlm->name);

	if (timeout)
		wait_event_timeout(dlm->dlm_reco_thread_wq,
				   dlm_is_node_recovered(dlm, node),
				   msecs_to_jiffies(timeout));
	else
		wait_event(dlm->dlm_reco_thread_wq,
			   dlm_is_node_recovered(dlm, node));
}

/* callers of the top-level api calls (dlmlock/dlmunlock) should
 * block on the dlm->reco.event when recovery is in progress.
 * the dlm recovery thread will set this state when it begins
 * recovering a dead node (as the new master or not) and clear
 * the state and wake as soon as all affected lock resources have
 * been marked with the RECOVERY flag */
static int dlm_in_recovery(struct dlm_ctxt *dlm)
{
	int in_recovery;
	spin_lock(&dlm->spinlock);
	in_recovery = !!(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
	spin_unlock(&dlm->spinlock);
	return in_recovery;
}


void dlm_wait_for_recovery(struct dlm_ctxt *dlm)
{
	if (dlm_in_recovery(dlm)) {
		mlog(0, "%s: reco thread %d in recovery: "
		     "state=%d, master=%u, dead=%u\n",
		     dlm->name, task_pid_nr(dlm->dlm_reco_thread_task),
		     dlm->reco.state, dlm->reco.new_master,
		     dlm->reco.dead_node);
	}
	wait_event(dlm->reco.event, !dlm_in_recovery(dlm));
}

static void dlm_begin_recovery(struct dlm_ctxt *dlm)
{
	assert_spin_locked(&dlm->spinlock);
	BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
	printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n",
	       dlm->name, dlm->reco.dead_node);
	dlm->reco.state |= DLM_RECO_STATE_ACTIVE;
}

static void dlm_end_recovery(struct dlm_ctxt *dlm)
{
	spin_lock(&dlm->spinlock);
	BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE));
	dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE;
	spin_unlock(&dlm->spinlock);
	printk(KERN_NOTICE "o2dlm: End recovery on domain %s\n", dlm->name);
	wake_up(&dlm->reco.event);
}

static void dlm_print_recovery_master(struct dlm_ctxt *dlm)
{
	printk(KERN_NOTICE "o2dlm: Node %u (%s) is the Recovery Master for the "
	       "dead node %u in domain %s\n", dlm->reco.new_master,
	       (dlm->node_num == dlm->reco.new_master ? "me" : "he"),
	       dlm->reco.dead_node, dlm->name);
}

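/* Pick one dead node from the recovery map and drive its recovery:
 * either become the recovery master ourselves or let another node
 * master it.  Returns -EAGAIN after mastering a node so the thread
 * immediately rechecks for more dead nodes, 0 when there is nothing
 * (more) for this node to do right now. */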
static int dlm_do_recovery(struct dlm_ctxt *dlm)
{
	int status = 0;
	int ret;

	spin_lock(&dlm->spinlock);

	if (dlm->migrate_done) {
		mlog(0, "%s: no need do recovery after migrating all "
		     "lock resources\n", dlm->name);
		spin_unlock(&dlm->spinlock);
		return 0;
	}

	/* check to see if the new master has died */
	if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM &&
	    test_bit(dlm->reco.new_master, dlm->recovery_map)) {
		mlog(0, "new master %u died while recovering %u!\n",
		     dlm->reco.new_master, dlm->reco.dead_node);
		/* unset the new_master, leave dead_node */
		dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM);
	}

	/* select a target to recover */
	if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
		int bit;

		bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES, 0);
		if (bit >= O2NM_MAX_NODES || bit < 0)
			dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
		else
			dlm_set_reco_dead_node(dlm, bit);
	} else if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) {
		/* BUG? */
		mlog(ML_ERROR, "dead_node %u no longer in recovery map!\n",
		     dlm->reco.dead_node);
		dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
	}

	if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
		// mlog(0, "nothing to recover!  sleeping now!\n");
		spin_unlock(&dlm->spinlock);
		/* return to main thread loop and sleep. */
		return 0;
	}
	mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n",
	     dlm->name, task_pid_nr(dlm->dlm_reco_thread_task),
	     dlm->reco.dead_node);

	/* take write barrier */
	/* (stops the list reshuffling thread, proxy ast handling) */
	dlm_begin_recovery(dlm);

	spin_unlock(&dlm->spinlock);

	if (dlm->reco.new_master == dlm->node_num)
		goto master_here;

	if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) {
		/* choose a new master, returns 0 if this node
		 * is the master, -EEXIST if it's another node.
		 * this does not return until a new master is chosen
		 * or recovery completes entirely. */
		ret = dlm_pick_recovery_master(dlm);
		if (!ret) {
			/* already notified everyone.  go. */
			goto master_here;
		}
		mlog(0, "another node will master this recovery session.\n");
	}

	dlm_print_recovery_master(dlm);

	/* it is safe to start everything back up here
	 * because all of the dead node's lock resources
	 * have been marked as in-recovery */
	dlm_end_recovery(dlm);

	/* sleep out in main dlm_recovery_thread loop. */
	return 0;

master_here:
	dlm_print_recovery_master(dlm);

	status = dlm_remaster_locks(dlm, dlm->reco.dead_node);
	if (status < 0) {
		/* we should never hit this anymore */
		mlog(ML_ERROR, "%s: Error %d remastering locks for node %u, "
		     "retrying.\n", dlm->name, status, dlm->reco.dead_node);
		/* yield a bit to allow any final network messages
		 * to get handled on remaining nodes */
		msleep(100);
	} else {
		/* success!  see if any other nodes need recovery */
		mlog(0, "DONE mastering recovery of %s:%u here(this=%u)!\n",
		     dlm->name, dlm->reco.dead_node, dlm->node_num);
		spin_lock(&dlm->spinlock);
		__dlm_reset_recovery(dlm);
		dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
		spin_unlock(&dlm->spinlock);
	}
	dlm_end_recovery(dlm);

	/* continue and look for another dead node */
	return -EAGAIN;
}

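/* Runs on the recovery master: build the list of live nodes, ask each
 * one (via DLM_LOCK_REQUEST_MSG) to send every lock it holds for the
 * dead node, wait until all of them report DONE, then broadcast the
 * finalize message and kick the dlm thread to rescan the dirtied lock
 * resources. */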
static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
{
	int status = 0;
	struct dlm_reco_node_data *ndata;
	int all_nodes_done;
	int destroy = 0;
	int pass = 0;

	do {
		/* we have become recovery master.  there is no escaping
		 * this, so just keep trying until we get it. */
		status = dlm_init_recovery_area(dlm, dead_node);
		if (status < 0) {
			mlog(ML_ERROR, "%s: failed to alloc recovery area, "
			     "retrying\n", dlm->name);
			msleep(1000);
		}
	} while (status != 0);

	/* safe to access the node data list without a lock, since this
	 * process is the only one to change the list */
	list_for_each_entry(ndata, &dlm->reco.node_data, list) {
		BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT);
		ndata->state = DLM_RECO_NODE_DATA_REQUESTING;

		mlog(0, "%s: Requesting lock info from node %u\n", dlm->name,
		     ndata->node_num);

		if (ndata->node_num == dlm->node_num) {
			ndata->state = DLM_RECO_NODE_DATA_DONE;
			continue;
		}

		do {
			status = dlm_request_all_locks(dlm, ndata->node_num,
						       dead_node);
			if (status < 0) {
				mlog_errno(status);
				if (dlm_is_host_down(status)) {
					/* node died, ignore it for recovery */
					status = 0;
					ndata->state = DLM_RECO_NODE_DATA_DEAD;
					/* wait for the domain map to catch up
					 * with the network state. */
					wait_event_timeout(dlm->dlm_reco_thread_wq,
							   dlm_is_node_dead(dlm,
								ndata->node_num),
							   msecs_to_jiffies(1000));
					mlog(0, "waited 1 sec for %u, "
					     "dead? %s\n", ndata->node_num,
					     dlm_is_node_dead(dlm, ndata->node_num) ?
					     "yes" : "no");
				} else {
					/* -ENOMEM on the other node */
					mlog(0, "%s: node %u returned "
					     "%d during recovery, retrying "
					     "after a short wait\n",
					     dlm->name, ndata->node_num,
					     status);
					msleep(100);
				}
			}
		} while (status != 0);

		spin_lock(&dlm_reco_state_lock);
		switch (ndata->state) {
			case DLM_RECO_NODE_DATA_INIT:
			case DLM_RECO_NODE_DATA_FINALIZE_SENT:
			case DLM_RECO_NODE_DATA_REQUESTED:
				BUG();
				break;
			case DLM_RECO_NODE_DATA_DEAD:
				mlog(0, "node %u died after requesting "
				     "recovery info for node %u\n",
				     ndata->node_num, dead_node);
				/* fine.  don't need this node's info.
				 * continue without it. */
				break;
			case DLM_RECO_NODE_DATA_REQUESTING:
				ndata->state = DLM_RECO_NODE_DATA_REQUESTED;
				mlog(0, "now receiving recovery data from "
				     "node %u for dead node %u\n",
				     ndata->node_num, dead_node);
				break;
			case DLM_RECO_NODE_DATA_RECEIVING:
				mlog(0, "already receiving recovery data from "
				     "node %u for dead node %u\n",
				     ndata->node_num, dead_node);
				break;
			case DLM_RECO_NODE_DATA_DONE:
				mlog(0, "already DONE receiving recovery data "
				     "from node %u for dead node %u\n",
				     ndata->node_num, dead_node);
				break;
		}
		spin_unlock(&dlm_reco_state_lock);
	}

	mlog(0, "%s: Done requesting all lock info\n", dlm->name);

	/* nodes should be sending reco data now
	 * just need to wait */

	while (1) {
		/* check all the nodes now to see if we are
		 * done, or if anyone died */
		all_nodes_done = 1;
		spin_lock(&dlm_reco_state_lock);
		list_for_each_entry(ndata, &dlm->reco.node_data, list) {
			mlog(0, "checking recovery state of node %u\n",
			     ndata->node_num);
			switch (ndata->state) {
				case DLM_RECO_NODE_DATA_INIT:
				case DLM_RECO_NODE_DATA_REQUESTING:
					mlog(ML_ERROR, "bad ndata state for "
					     "node %u: state=%d\n",
					     ndata->node_num, ndata->state);
					BUG();
					break;
				case DLM_RECO_NODE_DATA_DEAD:
					mlog(0, "node %u died after "
					     "requesting recovery info for "
					     "node %u\n", ndata->node_num,
					     dead_node);
					break;
				case DLM_RECO_NODE_DATA_RECEIVING:
				case DLM_RECO_NODE_DATA_REQUESTED:
					mlog(0, "%s: node %u still in state %s\n",
					     dlm->name, ndata->node_num,
					     ndata->state==DLM_RECO_NODE_DATA_RECEIVING ?
					     "receiving" : "requested");
					all_nodes_done = 0;
					break;
				case DLM_RECO_NODE_DATA_DONE:
					mlog(0, "%s: node %u state is done\n",
					     dlm->name, ndata->node_num);
					break;
				case DLM_RECO_NODE_DATA_FINALIZE_SENT:
					mlog(0, "%s: node %u state is finalize\n",
					     dlm->name, ndata->node_num);
					break;
			}
		}
		spin_unlock(&dlm_reco_state_lock);

		mlog(0, "pass #%d, all_nodes_done?: %s\n", ++pass,
		     all_nodes_done?"yes":"no");
		if (all_nodes_done) {
			int ret;

			/* Set this flag on the recovery master so that a new
			 * recovery for another dead node cannot start before
			 * this one is finished.  Otherwise recovery could
			 * hang. */
			spin_lock(&dlm->spinlock);
			dlm->reco.state |= DLM_RECO_STATE_FINALIZE;
			spin_unlock(&dlm->spinlock);

			/* all nodes are now in DLM_RECO_NODE_DATA_DONE state
			 * just send a finalize message to everyone and
			 * clean up */
			mlog(0, "all nodes are done! send finalize\n");
			ret = dlm_send_finalize_reco_message(dlm);
			if (ret < 0)
				mlog_errno(ret);

			spin_lock(&dlm->spinlock);
			dlm_finish_local_lockres_recovery(dlm, dead_node,
							  dlm->node_num);
			spin_unlock(&dlm->spinlock);
			mlog(0, "should be done with recovery!\n");

			mlog(0, "finishing recovery of %s at %lu, "
			     "dead=%u, this=%u, new=%u\n", dlm->name,
			     jiffies, dlm->reco.dead_node,
			     dlm->node_num, dlm->reco.new_master);
			destroy = 1;
			status = 0;
			/* rescan everything marked dirty along the way */
			dlm_kick_thread(dlm, NULL);
			break;
		}
		/* wait to be signalled, with periodic timeout
		 * to check for node death */
		wait_event_interruptible_timeout(dlm->dlm_reco_thread_wq,
					 kthread_should_stop(),
					 msecs_to_jiffies(DLM_RECO_THREAD_TIMEOUT_MS));

	}

	if (destroy)
		dlm_destroy_recovery_area(dlm);

	return status;
}

static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
{
	int num=0;
	struct dlm_reco_node_data *ndata;

	spin_lock(&dlm->spinlock);
	memcpy(dlm->reco.node_map, dlm->domain_map, sizeof(dlm->domain_map));
	/* nodes can only be removed (by dying) after dropping
	 * this lock, and death will be trapped later, so this should do */
	spin_unlock(&dlm->spinlock);

	while (1) {
		num = find_next_bit (dlm->reco.node_map, O2NM_MAX_NODES, num);
		if (num >= O2NM_MAX_NODES) {
			break;
		}
		BUG_ON(num == dead_node);

		ndata = kzalloc(sizeof(*ndata), GFP_NOFS);
		if (!ndata) {
			dlm_destroy_recovery_area(dlm);
			return -ENOMEM;
		}
		ndata->node_num = num;
		ndata->state = DLM_RECO_NODE_DATA_INIT;
		spin_lock(&dlm_reco_state_lock);
		list_add_tail(&ndata->list, &dlm->reco.node_data);
		spin_unlock(&dlm_reco_state_lock);
		num++;
	}

	return 0;
}

static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm)
{
	struct dlm_reco_node_data *ndata, *next;
	LIST_HEAD(tmplist);

	spin_lock(&dlm_reco_state_lock);
	list_splice_init(&dlm->reco.node_data, &tmplist);
	spin_unlock(&dlm_reco_state_lock);

	list_for_each_entry_safe(ndata, next, &tmplist, list) {
		list_del_init(&ndata->list);
		kfree(ndata);
	}
}

static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
				 u8 dead_node)
{
	struct dlm_lock_request lr;
	int ret;
	int status;

	mlog(0, "\n");


	mlog(0, "dlm_request_all_locks: dead node is %u, sending request "
		  "to %u\n", dead_node, request_from);

	memset(&lr, 0, sizeof(lr));
	lr.node_idx = dlm->node_num;
	lr.dead_node = dead_node;

	// send message
	ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key,
				 &lr, sizeof(lr), request_from, &status);

	/* negative status is handled by caller */
	if (ret < 0)
		mlog(ML_ERROR, "%s: Error %d send LOCK_REQUEST to node %u "
		     "to recover dead node %u\n", dlm->name, ret,
		     request_from, dead_node);
	else
		ret = status;
	// return from here, then
	// sleep until all received or error
	return ret;

}

int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data,
				  void **ret_data)
{
	struct dlm_ctxt *dlm = data;
	struct dlm_lock_request *lr = (struct dlm_lock_request *)msg->buf;
	char *buf = NULL;
	struct dlm_work_item *item = NULL;

	if (!dlm_grab(dlm))
		return -EINVAL;

	if (lr->dead_node != dlm->reco.dead_node) {
		mlog(ML_ERROR, "%s: node %u sent dead_node=%u, but local "
		     "dead_node is %u\n", dlm->name, lr->node_idx,
		     lr->dead_node, dlm->reco.dead_node);
		dlm_print_reco_node_status(dlm);
		/* this is a hack */
		dlm_put(dlm);
		return -ENOMEM;
	}
	BUG_ON(lr->dead_node != dlm->reco.dead_node);

	item = kzalloc(sizeof(*item), GFP_NOFS);
	if (!item) {
		dlm_put(dlm);
		return -ENOMEM;
	}

	/* this will get freed by dlm_request_all_locks_worker */
	buf = (char *) __get_free_page(GFP_NOFS);
	if (!buf) {
		kfree(item);
		dlm_put(dlm);
		return -ENOMEM;
	}

	/* queue up work for dlm_request_all_locks_worker */
	dlm_grab(dlm);  /* get an extra ref for the work item */
	dlm_init_work_item(dlm, item, dlm_request_all_locks_worker, buf);
	item->u.ral.reco_master = lr->node_idx;
	item->u.ral.dead_node = lr->dead_node;
	spin_lock(&dlm->work_lock);
	list_add_tail(&item->list, &dlm->work_list);
	spin_unlock(&dlm->work_lock);
	queue_work(dlm->dlm_worker, &dlm->dispatched_work);

	dlm_put(dlm);
	return 0;
}

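/* Worker: runs on a node that received a LOCK_REQUEST from the recovery
 * master.  Streams every lock resource this node knows about for the
 * dead node (or with an unknown owner) to the master, then sends a
 * DATA_DONE message unless the master went down mid-stream. */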
static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
{
	struct dlm_migratable_lockres *mres;
	struct dlm_lock_resource *res;
	struct dlm_ctxt *dlm;
	LIST_HEAD(resources);
	int ret;
	u8 dead_node, reco_master;
	int skip_all_done = 0;

	dlm = item->dlm;
	dead_node = item->u.ral.dead_node;
	reco_master = item->u.ral.reco_master;
	mres = (struct dlm_migratable_lockres *)data;

	mlog(0, "%s: recovery worker started, dead=%u, master=%u\n",
	     dlm->name, dead_node, reco_master);

	if (dead_node != dlm->reco.dead_node ||
	    reco_master != dlm->reco.new_master) {
		/* worker could have been created before the recovery master
		 * died.  if so, do not continue, but do not error. */
		if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) {
			mlog(ML_NOTICE, "%s: will not send recovery state, "
			     "recovery master %u died, thread=(dead=%u,mas=%u)"
			     " current=(dead=%u,mas=%u)\n", dlm->name,
			     reco_master, dead_node, reco_master,
			     dlm->reco.dead_node, dlm->reco.new_master);
		} else {
			mlog(ML_NOTICE, "%s: reco state invalid: reco(dead=%u, "
			     "master=%u), request(dead=%u, master=%u)\n",
			     dlm->name, dlm->reco.dead_node,
			     dlm->reco.new_master, dead_node, reco_master);
		}
		goto leave;
	}

	/* lock resources should have already been moved to the
	 * dlm->reco.resources list.  now move items from that list
	 * to a temp list if the dead owner matches.  note that the
	 * whole cluster recovers only one node at a time, so we
	 * can safely move UNKNOWN lock resources for each recovery
	 * session. */
	dlm_move_reco_locks_to_list(dlm, &resources, dead_node);

	/* now we can begin blasting lockreses without the dlm lock */

	/* any errors returned will be due to the new_master dying,
	 * the dlm_reco_thread should detect this */
	list_for_each_entry(res, &resources, recovering) {
		ret = dlm_send_one_lockres(dlm, res, mres, reco_master,
					   DLM_MRES_RECOVERY);
		if (ret < 0) {
			mlog(ML_ERROR, "%s: node %u went down while sending "
			     "recovery state for dead node %u, ret=%d\n", dlm->name,
			     reco_master, dead_node, ret);
			skip_all_done = 1;
			break;
		}
	}

	/* move the resources back to the list */
	spin_lock(&dlm->spinlock);
	list_splice_init(&resources, &dlm->reco.resources);
	spin_unlock(&dlm->spinlock);

	if (!skip_all_done) {
		ret = dlm_send_all_done_msg(dlm, dead_node, reco_master);
		if (ret < 0) {
			mlog(ML_ERROR, "%s: node %u went down while sending "
			     "recovery all-done for dead node %u, ret=%d\n",
			     dlm->name, reco_master, dead_node, ret);
		}
	}
leave:
	free_page((unsigned long)data);
}


static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
{
	int ret, tmpret;
	struct dlm_reco_data_done done_msg;

	memset(&done_msg, 0, sizeof(done_msg));
	done_msg.node_idx = dlm->node_num;
	done_msg.dead_node = dead_node;
	mlog(0, "sending DATA DONE message to %u, "
	     "my node=%u, dead node=%u\n", send_to, done_msg.node_idx,
	     done_msg.dead_node);

	ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
				 sizeof(done_msg), send_to, &tmpret);
	if (ret < 0) {
		mlog(ML_ERROR, "%s: Error %d send RECO_DATA_DONE to node %u "
		     "to recover dead node %u\n", dlm->name, ret, send_to,
		     dead_node);
		if (!dlm_is_host_down(ret)) {
			BUG();
		}
	} else
		ret = tmpret;
	return ret;
}


int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data,
			       void **ret_data)
{
	struct dlm_ctxt *dlm = data;
	struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf;
	struct dlm_reco_node_data *ndata = NULL;
	int ret = -EINVAL;

	if (!dlm_grab(dlm))
		return -EINVAL;

	mlog(0, "got DATA DONE: dead_node=%u, reco.dead_node=%u, "
	     "node_idx=%u, this node=%u\n", done->dead_node,
	     dlm->reco.dead_node, done->node_idx, dlm->node_num);

	mlog_bug_on_msg((done->dead_node != dlm->reco.dead_node),
			"Got DATA DONE: dead_node=%u, reco.dead_node=%u, "
			"node_idx=%u, this node=%u\n", done->dead_node,
			dlm->reco.dead_node, done->node_idx, dlm->node_num);

	spin_lock(&dlm_reco_state_lock);
	list_for_each_entry(ndata, &dlm->reco.node_data, list) {
		if (ndata->node_num != done->node_idx)
			continue;

		switch (ndata->state) {
			/* should have moved beyond INIT but not to FINALIZE yet */
			case DLM_RECO_NODE_DATA_INIT:
			case DLM_RECO_NODE_DATA_DEAD:
			case DLM_RECO_NODE_DATA_FINALIZE_SENT:
				mlog(ML_ERROR, "bad ndata state for node %u:"
				     " state=%d\n", ndata->node_num,
				     ndata->state);
				BUG();
				break;
			/* these states are possible at this point, anywhere along
			 * the line of recovery */
			case DLM_RECO_NODE_DATA_DONE:
			case DLM_RECO_NODE_DATA_RECEIVING:
			case DLM_RECO_NODE_DATA_REQUESTED:
			case DLM_RECO_NODE_DATA_REQUESTING:
				mlog(0, "node %u is DONE sending "
					  "recovery data!\n",
					  ndata->node_num);

				ndata->state = DLM_RECO_NODE_DATA_DONE;
				ret = 0;
				break;
		}
	}
	spin_unlock(&dlm_reco_state_lock);

	/* wake the recovery thread, some node is done */
	if (!ret)
		dlm_kick_recovery_thread(dlm);

	if (ret < 0)
		mlog(ML_ERROR, "failed to find recovery node data for node "
		     "%u\n", done->node_idx);
	dlm_put(dlm);

	mlog(0, "leaving reco data done handler, ret=%d\n", ret);
	return ret;
}

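/* Move lock resources owned by the dead node (or with an unknown
 * owner) from dlm->reco.resources onto the caller's temporary list,
 * and drop any stale $RECOVERY lock still granted to the dead node. */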
dlm_move_reco_locks_to_list(struct dlm_ctxt * dlm,struct list_head * list,u8 dead_node)1043*4882a593Smuzhiyun static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
1044*4882a593Smuzhiyun 					struct list_head *list,
1045*4882a593Smuzhiyun 				       	u8 dead_node)
1046*4882a593Smuzhiyun {
1047*4882a593Smuzhiyun 	struct dlm_lock_resource *res, *next;
1048*4882a593Smuzhiyun 	struct dlm_lock *lock;
1049*4882a593Smuzhiyun 
1050*4882a593Smuzhiyun 	spin_lock(&dlm->spinlock);
1051*4882a593Smuzhiyun 	list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) {
1052*4882a593Smuzhiyun 		/* always prune any $RECOVERY entries for dead nodes,
1053*4882a593Smuzhiyun 		 * otherwise hangs can occur during later recovery */
1054*4882a593Smuzhiyun 		if (dlm_is_recovery_lock(res->lockname.name,
1055*4882a593Smuzhiyun 					 res->lockname.len)) {
1056*4882a593Smuzhiyun 			spin_lock(&res->spinlock);
1057*4882a593Smuzhiyun 			list_for_each_entry(lock, &res->granted, list) {
1058*4882a593Smuzhiyun 				if (lock->ml.node == dead_node) {
1059*4882a593Smuzhiyun 					mlog(0, "AHA! there was "
1060*4882a593Smuzhiyun 					     "a $RECOVERY lock for dead "
1061*4882a593Smuzhiyun 					     "node %u (%s)!\n",
1062*4882a593Smuzhiyun 					     dead_node, dlm->name);
1063*4882a593Smuzhiyun 					list_del_init(&lock->list);
1064*4882a593Smuzhiyun 					dlm_lock_put(lock);
1065*4882a593Smuzhiyun 					/* Can't schedule DLM_UNLOCK_FREE_LOCK
1066*4882a593Smuzhiyun 					 * - do manually */
1067*4882a593Smuzhiyun 					dlm_lock_put(lock);
1068*4882a593Smuzhiyun 					break;
1069*4882a593Smuzhiyun 				}
1070*4882a593Smuzhiyun 			}
1071*4882a593Smuzhiyun 			spin_unlock(&res->spinlock);
1072*4882a593Smuzhiyun 			continue;
1073*4882a593Smuzhiyun 		}
1074*4882a593Smuzhiyun 
1075*4882a593Smuzhiyun 		if (res->owner == dead_node) {
1076*4882a593Smuzhiyun 			mlog(0, "found lockres owned by dead node while "
1077*4882a593Smuzhiyun 				  "doing recovery for node %u. sending it.\n",
1078*4882a593Smuzhiyun 				  dead_node);
1079*4882a593Smuzhiyun 			list_move_tail(&res->recovering, list);
1080*4882a593Smuzhiyun 		} else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
1081*4882a593Smuzhiyun 			mlog(0, "found UNKNOWN owner while doing recovery "
1082*4882a593Smuzhiyun 				  "for node %u. sending it.\n", dead_node);
1083*4882a593Smuzhiyun 			list_move_tail(&res->recovering, list);
1084*4882a593Smuzhiyun 		}
1085*4882a593Smuzhiyun 	}
1086*4882a593Smuzhiyun 	spin_unlock(&dlm->spinlock);
1087*4882a593Smuzhiyun }
1088*4882a593Smuzhiyun 
1089*4882a593Smuzhiyun static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res)
1090*4882a593Smuzhiyun {
1091*4882a593Smuzhiyun 	int total_locks = 0;
1092*4882a593Smuzhiyun 	struct list_head *iter, *queue = &res->granted;
1093*4882a593Smuzhiyun 	int i;
1094*4882a593Smuzhiyun 
1095*4882a593Smuzhiyun 	for (i=0; i<3; i++) {
1096*4882a593Smuzhiyun 		list_for_each(iter, queue)
1097*4882a593Smuzhiyun 			total_locks++;
1098*4882a593Smuzhiyun 		queue++;
1099*4882a593Smuzhiyun 	}
1100*4882a593Smuzhiyun 	return total_locks;
1101*4882a593Smuzhiyun }
1102*4882a593Smuzhiyun 
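/*
 * A minimal sketch of the count above written against the queue indices
 * used later in this file (dlm_list_idx_to_ptr() and the DLM_*_LIST
 * constants).  The pointer-increment loop in dlm_num_locks_in_lockres()
 * works only because granted, converting and blocked are laid out
 * back-to-back in struct dlm_lock_resource; this variant does not depend
 * on that layout.  The helper name is hypothetical and illustrative only.
 */
static inline int dlm_num_locks_in_lockres_by_idx(struct dlm_lock_resource *res)
{
	struct list_head *iter, *queue;
	int i, total_locks = 0;

	for (i = DLM_GRANTED_LIST; i <= DLM_BLOCKED_LIST; i++) {
		queue = dlm_list_idx_to_ptr(res, i);
		list_for_each(iter, queue)
			total_locks++;
	}
	return total_locks;
}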
1103*4882a593Smuzhiyun 
1104*4882a593Smuzhiyun static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
1105*4882a593Smuzhiyun 				      struct dlm_migratable_lockres *mres,
1106*4882a593Smuzhiyun 				      u8 send_to,
1107*4882a593Smuzhiyun 				      struct dlm_lock_resource *res,
1108*4882a593Smuzhiyun 				      int total_locks)
1109*4882a593Smuzhiyun {
1110*4882a593Smuzhiyun 	u64 mig_cookie = be64_to_cpu(mres->mig_cookie);
1111*4882a593Smuzhiyun 	int mres_total_locks = be32_to_cpu(mres->total_locks);
1112*4882a593Smuzhiyun 	int ret = 0, status = 0;
1113*4882a593Smuzhiyun 	u8 orig_flags = mres->flags,
1114*4882a593Smuzhiyun 	   orig_master = mres->master;
1115*4882a593Smuzhiyun 
1116*4882a593Smuzhiyun 	BUG_ON(mres->num_locks > DLM_MAX_MIGRATABLE_LOCKS);
1117*4882a593Smuzhiyun 	if (!mres->num_locks)
1118*4882a593Smuzhiyun 		return 0;
1119*4882a593Smuzhiyun 
1120*4882a593Smuzhiyun 	/* add an all-done flag if we reached the last lock */
1121*4882a593Smuzhiyun 	orig_flags = mres->flags;
1122*4882a593Smuzhiyun 	BUG_ON(total_locks > mres_total_locks);
1123*4882a593Smuzhiyun 	if (total_locks == mres_total_locks)
1124*4882a593Smuzhiyun 		mres->flags |= DLM_MRES_ALL_DONE;
1125*4882a593Smuzhiyun 
1126*4882a593Smuzhiyun 	mlog(0, "%s:%.*s: sending mig lockres (%s) to %u\n",
1127*4882a593Smuzhiyun 	     dlm->name, res->lockname.len, res->lockname.name,
1128*4882a593Smuzhiyun 	     orig_flags & DLM_MRES_MIGRATION ? "migration" : "recovery",
1129*4882a593Smuzhiyun 	     send_to);
1130*4882a593Smuzhiyun 
1131*4882a593Smuzhiyun 	/* send it */
1132*4882a593Smuzhiyun 	ret = o2net_send_message(DLM_MIG_LOCKRES_MSG, dlm->key, mres,
1133*4882a593Smuzhiyun 				 struct_size(mres, ml, mres->num_locks),
1134*4882a593Smuzhiyun 				 send_to, &status);
1135*4882a593Smuzhiyun 	if (ret < 0) {
1136*4882a593Smuzhiyun 		/* XXX: negative status is not handled.
1137*4882a593Smuzhiyun 		 * this will end up killing this node. */
1138*4882a593Smuzhiyun 		mlog(ML_ERROR, "%s: res %.*s, Error %d send MIG_LOCKRES to "
1139*4882a593Smuzhiyun 		     "node %u (%s)\n", dlm->name, mres->lockname_len,
1140*4882a593Smuzhiyun 		     mres->lockname, ret, send_to,
1141*4882a593Smuzhiyun 		     (orig_flags & DLM_MRES_MIGRATION ?
1142*4882a593Smuzhiyun 		      "migration" : "recovery"));
1143*4882a593Smuzhiyun 	} else {
1144*4882a593Smuzhiyun 		/* might get an -ENOMEM back here */
1145*4882a593Smuzhiyun 		ret = status;
1146*4882a593Smuzhiyun 		if (ret < 0) {
1147*4882a593Smuzhiyun 			mlog_errno(ret);
1148*4882a593Smuzhiyun 
1149*4882a593Smuzhiyun 			if (ret == -EFAULT) {
1150*4882a593Smuzhiyun 				mlog(ML_ERROR, "node %u told me to kill "
1151*4882a593Smuzhiyun 				     "myself!\n", send_to);
1152*4882a593Smuzhiyun 				BUG();
1153*4882a593Smuzhiyun 			}
1154*4882a593Smuzhiyun 		}
1155*4882a593Smuzhiyun 	}
1156*4882a593Smuzhiyun 
1157*4882a593Smuzhiyun 	/* zero and reinit the message buffer */
1158*4882a593Smuzhiyun 	dlm_init_migratable_lockres(mres, res->lockname.name,
1159*4882a593Smuzhiyun 				    res->lockname.len, mres_total_locks,
1160*4882a593Smuzhiyun 				    mig_cookie, orig_flags, orig_master);
1161*4882a593Smuzhiyun 	return ret;
1162*4882a593Smuzhiyun }
1163*4882a593Smuzhiyun 
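/*
 * dlm_send_mig_lockres_msg() above transmits only the used portion of the
 * page-backed mres buffer: struct_size(mres, ml, mres->num_locks) is the
 * fixed header plus num_locks entries of the ml[] flexible array.  A
 * minimal sketch of that size computation (hypothetical helper name,
 * illustrative only):
 */
static inline size_t dlm_mig_lockres_msg_size(const struct dlm_migratable_lockres *mres)
{
	/* same arithmetic struct_size() performs, without overflow checking */
	return sizeof(*mres) + mres->num_locks * sizeof(mres->ml[0]);
}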
1164*4882a593Smuzhiyun static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
1165*4882a593Smuzhiyun 					const char *lockname, int namelen,
1166*4882a593Smuzhiyun 					int total_locks, u64 cookie,
1167*4882a593Smuzhiyun 					u8 flags, u8 master)
1168*4882a593Smuzhiyun {
1169*4882a593Smuzhiyun 	/* mres here is one full page */
1170*4882a593Smuzhiyun 	clear_page(mres);
1171*4882a593Smuzhiyun 	mres->lockname_len = namelen;
1172*4882a593Smuzhiyun 	memcpy(mres->lockname, lockname, namelen);
1173*4882a593Smuzhiyun 	mres->num_locks = 0;
1174*4882a593Smuzhiyun 	mres->total_locks = cpu_to_be32(total_locks);
1175*4882a593Smuzhiyun 	mres->mig_cookie = cpu_to_be64(cookie);
1176*4882a593Smuzhiyun 	mres->flags = flags;
1177*4882a593Smuzhiyun 	mres->master = master;
1178*4882a593Smuzhiyun }
1179*4882a593Smuzhiyun 
1180*4882a593Smuzhiyun static void dlm_prepare_lvb_for_migration(struct dlm_lock *lock,
1181*4882a593Smuzhiyun 					  struct dlm_migratable_lockres *mres,
1182*4882a593Smuzhiyun 					  int queue)
1183*4882a593Smuzhiyun {
1184*4882a593Smuzhiyun 	if (!lock->lksb)
1185*4882a593Smuzhiyun 	       return;
1186*4882a593Smuzhiyun 
1187*4882a593Smuzhiyun 	/* Ignore lvb in all locks in the blocked list */
1188*4882a593Smuzhiyun 	if (queue == DLM_BLOCKED_LIST)
1189*4882a593Smuzhiyun 		return;
1190*4882a593Smuzhiyun 
1191*4882a593Smuzhiyun 	/* Only consider lvbs in locks with granted EX or PR lock levels */
1192*4882a593Smuzhiyun 	if (lock->ml.type != LKM_EXMODE && lock->ml.type != LKM_PRMODE)
1193*4882a593Smuzhiyun 		return;
1194*4882a593Smuzhiyun 
1195*4882a593Smuzhiyun 	if (dlm_lvb_is_empty(mres->lvb)) {
1196*4882a593Smuzhiyun 		memcpy(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN);
1197*4882a593Smuzhiyun 		return;
1198*4882a593Smuzhiyun 	}
1199*4882a593Smuzhiyun 
1200*4882a593Smuzhiyun 	/* Ensure the lvb copied for migration matches in other valid locks */
1201*4882a593Smuzhiyun 	if (!memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))
1202*4882a593Smuzhiyun 		return;
1203*4882a593Smuzhiyun 
1204*4882a593Smuzhiyun 	mlog(ML_ERROR, "Mismatched lvb in lock cookie=%u:%llu, name=%.*s, "
1205*4882a593Smuzhiyun 	     "node=%u\n",
1206*4882a593Smuzhiyun 	     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
1207*4882a593Smuzhiyun 	     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
1208*4882a593Smuzhiyun 	     lock->lockres->lockname.len, lock->lockres->lockname.name,
1209*4882a593Smuzhiyun 	     lock->ml.node);
1210*4882a593Smuzhiyun 	dlm_print_one_lock_resource(lock->lockres);
1211*4882a593Smuzhiyun 	BUG();
1212*4882a593Smuzhiyun }
1213*4882a593Smuzhiyun 
1214*4882a593Smuzhiyun /* returns 1 if this lock fills the network structure,
1215*4882a593Smuzhiyun  * 0 otherwise */
1216*4882a593Smuzhiyun static int dlm_add_lock_to_array(struct dlm_lock *lock,
1217*4882a593Smuzhiyun 				 struct dlm_migratable_lockres *mres, int queue)
1218*4882a593Smuzhiyun {
1219*4882a593Smuzhiyun 	struct dlm_migratable_lock *ml;
1220*4882a593Smuzhiyun 	int lock_num = mres->num_locks;
1221*4882a593Smuzhiyun 
1222*4882a593Smuzhiyun 	ml = &(mres->ml[lock_num]);
1223*4882a593Smuzhiyun 	ml->cookie = lock->ml.cookie;
1224*4882a593Smuzhiyun 	ml->type = lock->ml.type;
1225*4882a593Smuzhiyun 	ml->convert_type = lock->ml.convert_type;
1226*4882a593Smuzhiyun 	ml->highest_blocked = lock->ml.highest_blocked;
1227*4882a593Smuzhiyun 	ml->list = queue;
1228*4882a593Smuzhiyun 	if (lock->lksb) {
1229*4882a593Smuzhiyun 		ml->flags = lock->lksb->flags;
1230*4882a593Smuzhiyun 		dlm_prepare_lvb_for_migration(lock, mres, queue);
1231*4882a593Smuzhiyun 	}
1232*4882a593Smuzhiyun 	ml->node = lock->ml.node;
1233*4882a593Smuzhiyun 	mres->num_locks++;
1234*4882a593Smuzhiyun 	/* we reached the max, send this network message */
1235*4882a593Smuzhiyun 	if (mres->num_locks == DLM_MAX_MIGRATABLE_LOCKS)
1236*4882a593Smuzhiyun 		return 1;
1237*4882a593Smuzhiyun 	return 0;
1238*4882a593Smuzhiyun }
1239*4882a593Smuzhiyun 
1240*4882a593Smuzhiyun static void dlm_add_dummy_lock(struct dlm_ctxt *dlm,
1241*4882a593Smuzhiyun 			       struct dlm_migratable_lockres *mres)
1242*4882a593Smuzhiyun {
1243*4882a593Smuzhiyun 	struct dlm_lock dummy;
1244*4882a593Smuzhiyun 	memset(&dummy, 0, sizeof(dummy));
1245*4882a593Smuzhiyun 	dummy.ml.cookie = 0;
1246*4882a593Smuzhiyun 	dummy.ml.type = LKM_IVMODE;
1247*4882a593Smuzhiyun 	dummy.ml.convert_type = LKM_IVMODE;
1248*4882a593Smuzhiyun 	dummy.ml.highest_blocked = LKM_IVMODE;
1249*4882a593Smuzhiyun 	dummy.lksb = NULL;
1250*4882a593Smuzhiyun 	dummy.ml.node = dlm->node_num;
1251*4882a593Smuzhiyun 	dlm_add_lock_to_array(&dummy, mres, DLM_BLOCKED_LIST);
1252*4882a593Smuzhiyun }
1253*4882a593Smuzhiyun 
1254*4882a593Smuzhiyun static inline int dlm_is_dummy_lock(struct dlm_ctxt *dlm,
1255*4882a593Smuzhiyun 				    struct dlm_migratable_lock *ml,
1256*4882a593Smuzhiyun 				    u8 *nodenum)
1257*4882a593Smuzhiyun {
1258*4882a593Smuzhiyun 	if (unlikely(ml->cookie == 0 &&
1259*4882a593Smuzhiyun 	    ml->type == LKM_IVMODE &&
1260*4882a593Smuzhiyun 	    ml->convert_type == LKM_IVMODE &&
1261*4882a593Smuzhiyun 	    ml->highest_blocked == LKM_IVMODE &&
1262*4882a593Smuzhiyun 	    ml->list == DLM_BLOCKED_LIST)) {
1263*4882a593Smuzhiyun 		*nodenum = ml->node;
1264*4882a593Smuzhiyun 		return 1;
1265*4882a593Smuzhiyun 	}
1266*4882a593Smuzhiyun 	return 0;
1267*4882a593Smuzhiyun }
1268*4882a593Smuzhiyun 
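/*
 * dlm_add_dummy_lock() and dlm_is_dummy_lock() form an encode/decode pair:
 * a zeroed cookie, LKM_IVMODE everywhere and placement on the blocked list
 * can never describe a real lock, so the receiver treats such an entry as
 * "set the refmap bit for this node" and nothing else (see
 * dlm_process_recovery_data() below).  A minimal round-trip sketch,
 * assuming mres was just re-initialised; the helper name is hypothetical.
 */
static void dlm_dummy_lock_roundtrip_sketch(struct dlm_ctxt *dlm,
					    struct dlm_migratable_lockres *mres)
{
	u8 nodenum = O2NM_MAX_NODES;

	/* encode: append one ml[] entry that stands in for this node */
	dlm_add_dummy_lock(dlm, mres);

	/* decode: the receiver recognizes the pattern and recovers the node */
	if (dlm_is_dummy_lock(dlm, &mres->ml[mres->num_locks - 1], &nodenum))
		mlog(0, "dummy lock stands in for node %u\n", nodenum);
}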
1269*4882a593Smuzhiyun int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
1270*4882a593Smuzhiyun 			 struct dlm_migratable_lockres *mres,
1271*4882a593Smuzhiyun 			 u8 send_to, u8 flags)
1272*4882a593Smuzhiyun {
1273*4882a593Smuzhiyun 	struct list_head *queue;
1274*4882a593Smuzhiyun 	int total_locks, i;
1275*4882a593Smuzhiyun 	u64 mig_cookie = 0;
1276*4882a593Smuzhiyun 	struct dlm_lock *lock;
1277*4882a593Smuzhiyun 	int ret = 0;
1278*4882a593Smuzhiyun 
1279*4882a593Smuzhiyun 	BUG_ON(!(flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION)));
1280*4882a593Smuzhiyun 
1281*4882a593Smuzhiyun 	mlog(0, "sending to %u\n", send_to);
1282*4882a593Smuzhiyun 
1283*4882a593Smuzhiyun 	total_locks = dlm_num_locks_in_lockres(res);
1284*4882a593Smuzhiyun 	if (total_locks > DLM_MAX_MIGRATABLE_LOCKS) {
1285*4882a593Smuzhiyun 		/* rare, but possible */
1286*4882a593Smuzhiyun 		mlog(0, "argh.  lockres has %d locks.  this will "
1287*4882a593Smuzhiyun 			  "require more than one network packet to "
1288*4882a593Smuzhiyun 			  "migrate\n", total_locks);
1289*4882a593Smuzhiyun 		mig_cookie = dlm_get_next_mig_cookie();
1290*4882a593Smuzhiyun 	}
1291*4882a593Smuzhiyun 
1292*4882a593Smuzhiyun 	dlm_init_migratable_lockres(mres, res->lockname.name,
1293*4882a593Smuzhiyun 				    res->lockname.len, total_locks,
1294*4882a593Smuzhiyun 				    mig_cookie, flags, res->owner);
1295*4882a593Smuzhiyun 
1296*4882a593Smuzhiyun 	total_locks = 0;
1297*4882a593Smuzhiyun 	for (i=DLM_GRANTED_LIST; i<=DLM_BLOCKED_LIST; i++) {
1298*4882a593Smuzhiyun 		queue = dlm_list_idx_to_ptr(res, i);
1299*4882a593Smuzhiyun 		list_for_each_entry(lock, queue, list) {
1300*4882a593Smuzhiyun 			/* add another lock. */
1301*4882a593Smuzhiyun 			total_locks++;
1302*4882a593Smuzhiyun 			if (!dlm_add_lock_to_array(lock, mres, i))
1303*4882a593Smuzhiyun 				continue;
1304*4882a593Smuzhiyun 
1305*4882a593Smuzhiyun 			/* this filled the lock message,
1306*4882a593Smuzhiyun 			 * we must send it immediately. */
1307*4882a593Smuzhiyun 			ret = dlm_send_mig_lockres_msg(dlm, mres, send_to,
1308*4882a593Smuzhiyun 						       res, total_locks);
1309*4882a593Smuzhiyun 			if (ret < 0)
1310*4882a593Smuzhiyun 				goto error;
1311*4882a593Smuzhiyun 		}
1312*4882a593Smuzhiyun 	}
1313*4882a593Smuzhiyun 	if (total_locks == 0) {
1314*4882a593Smuzhiyun 		/* send a dummy lock to indicate a mastery reference only */
1315*4882a593Smuzhiyun 		mlog(0, "%s:%.*s: sending dummy lock to %u, %s\n",
1316*4882a593Smuzhiyun 		     dlm->name, res->lockname.len, res->lockname.name,
1317*4882a593Smuzhiyun 		     send_to, flags & DLM_MRES_RECOVERY ? "recovery" :
1318*4882a593Smuzhiyun 		     "migration");
1319*4882a593Smuzhiyun 		dlm_add_dummy_lock(dlm, mres);
1320*4882a593Smuzhiyun 	}
1321*4882a593Smuzhiyun 	/* flush any remaining locks */
1322*4882a593Smuzhiyun 	ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks);
1323*4882a593Smuzhiyun 	if (ret < 0)
1324*4882a593Smuzhiyun 		goto error;
1325*4882a593Smuzhiyun 	return ret;
1326*4882a593Smuzhiyun 
1327*4882a593Smuzhiyun error:
1328*4882a593Smuzhiyun 	mlog(ML_ERROR, "%s: dlm_send_mig_lockres_msg returned %d\n",
1329*4882a593Smuzhiyun 	     dlm->name, ret);
1330*4882a593Smuzhiyun 	if (!dlm_is_host_down(ret))
1331*4882a593Smuzhiyun 		BUG();
1332*4882a593Smuzhiyun 	mlog(0, "%s: node %u went down while sending %s "
1333*4882a593Smuzhiyun 	     "lockres %.*s\n", dlm->name, send_to,
1334*4882a593Smuzhiyun 	     flags & DLM_MRES_RECOVERY ?  "recovery" : "migration",
1335*4882a593Smuzhiyun 	     res->lockname.len, res->lockname.name);
1336*4882a593Smuzhiyun 	return ret;
1337*4882a593Smuzhiyun }
1338*4882a593Smuzhiyun 
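/*
 * A minimal caller sketch for dlm_send_one_lockres(): mres must be one full
 * page (dlm_init_migratable_lockres() clear_page()s it before each use) and
 * flags must carry exactly one of DLM_MRES_RECOVERY or DLM_MRES_MIGRATION.
 * The buffer handling below is illustrative, not a copy of the real callers.
 */
static int dlm_send_one_lockres_sketch(struct dlm_ctxt *dlm,
				       struct dlm_lock_resource *res,
				       u8 send_to)
{
	struct dlm_migratable_lockres *mres;
	int ret;

	mres = (struct dlm_migratable_lockres *)__get_free_page(GFP_NOFS);
	if (!mres)
		return -ENOMEM;

	/* ship every lock on this lockres to the node doing recovery */
	ret = dlm_send_one_lockres(dlm, res, mres, send_to, DLM_MRES_RECOVERY);

	free_page((unsigned long)mres);
	return ret;
}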
1339*4882a593Smuzhiyun 
1340*4882a593Smuzhiyun 
1341*4882a593Smuzhiyun /*
1342*4882a593Smuzhiyun  * this message will contain no more than one page worth of
1343*4882a593Smuzhiyun  * recovery data, and it will work on only one lockres.
1344*4882a593Smuzhiyun  * there may be many locks in this page, and we may need to wait
1345*4882a593Smuzhiyun  * for additional packets to complete all the locks (rare, but
1346*4882a593Smuzhiyun  * possible).
1347*4882a593Smuzhiyun  */
1348*4882a593Smuzhiyun /*
1349*4882a593Smuzhiyun  * NOTE: the allocation error cases here are scary
1350*4882a593Smuzhiyun  * we really cannot afford to fail an alloc in recovery
1351*4882a593Smuzhiyun  * do we spin?  returning an error only delays the problem really
1352*4882a593Smuzhiyun  */
1353*4882a593Smuzhiyun 
1354*4882a593Smuzhiyun int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
1355*4882a593Smuzhiyun 			    void **ret_data)
1356*4882a593Smuzhiyun {
1357*4882a593Smuzhiyun 	struct dlm_ctxt *dlm = data;
1358*4882a593Smuzhiyun 	struct dlm_migratable_lockres *mres =
1359*4882a593Smuzhiyun 		(struct dlm_migratable_lockres *)msg->buf;
1360*4882a593Smuzhiyun 	int ret = 0;
1361*4882a593Smuzhiyun 	u8 real_master;
1362*4882a593Smuzhiyun 	u8 extra_refs = 0;
1363*4882a593Smuzhiyun 	char *buf = NULL;
1364*4882a593Smuzhiyun 	struct dlm_work_item *item = NULL;
1365*4882a593Smuzhiyun 	struct dlm_lock_resource *res = NULL;
1366*4882a593Smuzhiyun 	unsigned int hash;
1367*4882a593Smuzhiyun 
1368*4882a593Smuzhiyun 	if (!dlm_grab(dlm))
1369*4882a593Smuzhiyun 		return -EINVAL;
1370*4882a593Smuzhiyun 
1371*4882a593Smuzhiyun 	if (!dlm_joined(dlm)) {
1372*4882a593Smuzhiyun 		mlog(ML_ERROR, "Domain %s not joined! "
1373*4882a593Smuzhiyun 			  "lockres %.*s, master %u\n",
1374*4882a593Smuzhiyun 			  dlm->name, mres->lockname_len,
1375*4882a593Smuzhiyun 			  mres->lockname, mres->master);
1376*4882a593Smuzhiyun 		dlm_put(dlm);
1377*4882a593Smuzhiyun 		return -EINVAL;
1378*4882a593Smuzhiyun 	}
1379*4882a593Smuzhiyun 
1380*4882a593Smuzhiyun 	BUG_ON(!(mres->flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION)));
1381*4882a593Smuzhiyun 
1382*4882a593Smuzhiyun 	real_master = mres->master;
1383*4882a593Smuzhiyun 	if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) {
1384*4882a593Smuzhiyun 		/* cannot migrate a lockres with no master */
1385*4882a593Smuzhiyun 		BUG_ON(!(mres->flags & DLM_MRES_RECOVERY));
1386*4882a593Smuzhiyun 	}
1387*4882a593Smuzhiyun 
1388*4882a593Smuzhiyun 	mlog(0, "%s message received from node %u\n",
1389*4882a593Smuzhiyun 		  (mres->flags & DLM_MRES_RECOVERY) ?
1390*4882a593Smuzhiyun 		  "recovery" : "migration", mres->master);
1391*4882a593Smuzhiyun 	if (mres->flags & DLM_MRES_ALL_DONE)
1392*4882a593Smuzhiyun 		mlog(0, "all done flag.  all lockres data received!\n");
1393*4882a593Smuzhiyun 
1394*4882a593Smuzhiyun 	ret = -ENOMEM;
1395*4882a593Smuzhiyun 	buf = kmalloc(be16_to_cpu(msg->data_len), GFP_NOFS);
1396*4882a593Smuzhiyun 	item = kzalloc(sizeof(*item), GFP_NOFS);
1397*4882a593Smuzhiyun 	if (!buf || !item)
1398*4882a593Smuzhiyun 		goto leave;
1399*4882a593Smuzhiyun 
1400*4882a593Smuzhiyun 	/* lookup the lock to see if we have a secondary queue for this
1401*4882a593Smuzhiyun 	 * already...  just add the locks in and this will have its owner
1402*4882a593Smuzhiyun 	 * and RECOVERY flag changed when it completes. */
1403*4882a593Smuzhiyun 	hash = dlm_lockid_hash(mres->lockname, mres->lockname_len);
1404*4882a593Smuzhiyun 	spin_lock(&dlm->spinlock);
1405*4882a593Smuzhiyun 	res = __dlm_lookup_lockres_full(dlm, mres->lockname, mres->lockname_len,
1406*4882a593Smuzhiyun 			hash);
1407*4882a593Smuzhiyun 	if (res) {
1408*4882a593Smuzhiyun 		/* this will get a ref on res */
1409*4882a593Smuzhiyun 		/* mark it as recovering/migrating and hash it */
1410*4882a593Smuzhiyun 		spin_lock(&res->spinlock);
1411*4882a593Smuzhiyun 		if (res->state & DLM_LOCK_RES_DROPPING_REF) {
1412*4882a593Smuzhiyun 			mlog(0, "%s: node is attempting to migrate "
1413*4882a593Smuzhiyun 				"lockres %.*s, but marked as dropping "
1414*4882a593Smuzhiyun 				"ref!\n", dlm->name,
1415*4882a593Smuzhiyun 				mres->lockname_len, mres->lockname);
1416*4882a593Smuzhiyun 			ret = -EINVAL;
1417*4882a593Smuzhiyun 			spin_unlock(&res->spinlock);
1418*4882a593Smuzhiyun 			spin_unlock(&dlm->spinlock);
1419*4882a593Smuzhiyun 			dlm_lockres_put(res);
1420*4882a593Smuzhiyun 			goto leave;
1421*4882a593Smuzhiyun 		}
1422*4882a593Smuzhiyun 
1423*4882a593Smuzhiyun 		if (mres->flags & DLM_MRES_RECOVERY) {
1424*4882a593Smuzhiyun 			res->state |= DLM_LOCK_RES_RECOVERING;
1425*4882a593Smuzhiyun 		} else {
1426*4882a593Smuzhiyun 			if (res->state & DLM_LOCK_RES_MIGRATING) {
1427*4882a593Smuzhiyun 				/* this is at least the second
1428*4882a593Smuzhiyun 				 * lockres message */
1429*4882a593Smuzhiyun 				mlog(0, "lock %.*s is already migrating\n",
1430*4882a593Smuzhiyun 					  mres->lockname_len,
1431*4882a593Smuzhiyun 					  mres->lockname);
1432*4882a593Smuzhiyun 			} else if (res->state & DLM_LOCK_RES_RECOVERING) {
1433*4882a593Smuzhiyun 				/* caller should BUG */
1434*4882a593Smuzhiyun 				mlog(ML_ERROR, "node is attempting to migrate "
1435*4882a593Smuzhiyun 				     "lock %.*s, but marked as recovering!\n",
1436*4882a593Smuzhiyun 				     mres->lockname_len, mres->lockname);
1437*4882a593Smuzhiyun 				ret = -EFAULT;
1438*4882a593Smuzhiyun 				spin_unlock(&res->spinlock);
1439*4882a593Smuzhiyun 				spin_unlock(&dlm->spinlock);
1440*4882a593Smuzhiyun 				dlm_lockres_put(res);
1441*4882a593Smuzhiyun 				goto leave;
1442*4882a593Smuzhiyun 			}
1443*4882a593Smuzhiyun 			res->state |= DLM_LOCK_RES_MIGRATING;
1444*4882a593Smuzhiyun 		}
1445*4882a593Smuzhiyun 		spin_unlock(&res->spinlock);
1446*4882a593Smuzhiyun 		spin_unlock(&dlm->spinlock);
1447*4882a593Smuzhiyun 	} else {
1448*4882a593Smuzhiyun 		spin_unlock(&dlm->spinlock);
1449*4882a593Smuzhiyun 		/* need to allocate, just like if it was
1450*4882a593Smuzhiyun 		 * mastered here normally  */
1451*4882a593Smuzhiyun 		res = dlm_new_lockres(dlm, mres->lockname, mres->lockname_len);
1452*4882a593Smuzhiyun 		if (!res)
1453*4882a593Smuzhiyun 			goto leave;
1454*4882a593Smuzhiyun 
1455*4882a593Smuzhiyun 		/* to match the ref that we would have gotten if
1456*4882a593Smuzhiyun 		 * dlm_lookup_lockres had succeeded */
1457*4882a593Smuzhiyun 		dlm_lockres_get(res);
1458*4882a593Smuzhiyun 
1459*4882a593Smuzhiyun 		/* mark it as recovering/migrating and hash it */
1460*4882a593Smuzhiyun 		if (mres->flags & DLM_MRES_RECOVERY)
1461*4882a593Smuzhiyun 			res->state |= DLM_LOCK_RES_RECOVERING;
1462*4882a593Smuzhiyun 		else
1463*4882a593Smuzhiyun 			res->state |= DLM_LOCK_RES_MIGRATING;
1464*4882a593Smuzhiyun 
1465*4882a593Smuzhiyun 		spin_lock(&dlm->spinlock);
1466*4882a593Smuzhiyun 		__dlm_insert_lockres(dlm, res);
1467*4882a593Smuzhiyun 		spin_unlock(&dlm->spinlock);
1468*4882a593Smuzhiyun 
1469*4882a593Smuzhiyun 		/* Add an extra ref for this lock-less lockres lest the
1470*4882a593Smuzhiyun 		 * dlm_thread purges it before we get the chance to add
1471*4882a593Smuzhiyun 		 * locks to it */
1472*4882a593Smuzhiyun 		dlm_lockres_get(res);
1473*4882a593Smuzhiyun 
1474*4882a593Smuzhiyun 		/* There are three refs that need to be put.
1475*4882a593Smuzhiyun 		 * 1. Taken above.
1476*4882a593Smuzhiyun 		 * 2. kref_init in dlm_new_lockres()->dlm_init_lockres().
1477*4882a593Smuzhiyun 		 * 3. dlm_lookup_lockres()
1478*4882a593Smuzhiyun 		 * The first one is handled at the end of this function. The
1479*4882a593Smuzhiyun 		 * other two are handled in the worker thread after locks have
1480*4882a593Smuzhiyun 		 * been attached. Yes, we don't wait for purge time to match
1481*4882a593Smuzhiyun 		 * kref_init. The lockres will still have at least one ref
1482*4882a593Smuzhiyun 		 * added because it is in the hash (__dlm_insert_lockres()) */
1483*4882a593Smuzhiyun 		extra_refs++;
1484*4882a593Smuzhiyun 
1485*4882a593Smuzhiyun 		/* now that the new lockres is inserted,
1486*4882a593Smuzhiyun 		 * make it usable by other processes */
1487*4882a593Smuzhiyun 		spin_lock(&res->spinlock);
1488*4882a593Smuzhiyun 		res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
1489*4882a593Smuzhiyun 		spin_unlock(&res->spinlock);
1490*4882a593Smuzhiyun 		wake_up(&res->wq);
1491*4882a593Smuzhiyun 	}
1492*4882a593Smuzhiyun 
1493*4882a593Smuzhiyun 	/* at this point we have allocated everything we need,
1494*4882a593Smuzhiyun 	 * and we have a hashed lockres with an extra ref and
1495*4882a593Smuzhiyun 	 * the proper res->state flags. */
1496*4882a593Smuzhiyun 	ret = 0;
1497*4882a593Smuzhiyun 	spin_lock(&res->spinlock);
1498*4882a593Smuzhiyun 	/* drop this either when master requery finds a different master
1499*4882a593Smuzhiyun 	 * or when a lock is added by the recovery worker */
1500*4882a593Smuzhiyun 	dlm_lockres_grab_inflight_ref(dlm, res);
1501*4882a593Smuzhiyun 	if (mres->master == DLM_LOCK_RES_OWNER_UNKNOWN) {
1502*4882a593Smuzhiyun 		/* migration cannot have an unknown master */
1503*4882a593Smuzhiyun 		BUG_ON(!(mres->flags & DLM_MRES_RECOVERY));
1504*4882a593Smuzhiyun 		mlog(0, "recovery has passed me a lockres with an "
1505*4882a593Smuzhiyun 			  "unknown owner.. will need to requery: "
1506*4882a593Smuzhiyun 			  "%.*s\n", mres->lockname_len, mres->lockname);
1507*4882a593Smuzhiyun 	} else {
1508*4882a593Smuzhiyun 		/* take a reference now to pin the lockres, drop it
1509*4882a593Smuzhiyun 		 * when locks are added in the worker */
1510*4882a593Smuzhiyun 		dlm_change_lockres_owner(dlm, res, dlm->node_num);
1511*4882a593Smuzhiyun 	}
1512*4882a593Smuzhiyun 	spin_unlock(&res->spinlock);
1513*4882a593Smuzhiyun 
1514*4882a593Smuzhiyun 	/* queue up work for dlm_mig_lockres_worker */
1515*4882a593Smuzhiyun 	dlm_grab(dlm);  /* get an extra ref for the work item */
1516*4882a593Smuzhiyun 	memcpy(buf, msg->buf, be16_to_cpu(msg->data_len));  /* copy the whole message */
1517*4882a593Smuzhiyun 	dlm_init_work_item(dlm, item, dlm_mig_lockres_worker, buf);
1518*4882a593Smuzhiyun 	item->u.ml.lockres = res; /* already have a ref */
1519*4882a593Smuzhiyun 	item->u.ml.real_master = real_master;
1520*4882a593Smuzhiyun 	item->u.ml.extra_ref = extra_refs;
1521*4882a593Smuzhiyun 	spin_lock(&dlm->work_lock);
1522*4882a593Smuzhiyun 	list_add_tail(&item->list, &dlm->work_list);
1523*4882a593Smuzhiyun 	spin_unlock(&dlm->work_lock);
1524*4882a593Smuzhiyun 	queue_work(dlm->dlm_worker, &dlm->dispatched_work);
1525*4882a593Smuzhiyun 
1526*4882a593Smuzhiyun leave:
1527*4882a593Smuzhiyun 	/* One extra ref taken needs to be put here */
1528*4882a593Smuzhiyun 	if (extra_refs)
1529*4882a593Smuzhiyun 		dlm_lockres_put(res);
1530*4882a593Smuzhiyun 
1531*4882a593Smuzhiyun 	dlm_put(dlm);
1532*4882a593Smuzhiyun 	if (ret < 0) {
1533*4882a593Smuzhiyun 		kfree(buf);
1534*4882a593Smuzhiyun 		kfree(item);
1535*4882a593Smuzhiyun 		mlog_errno(ret);
1536*4882a593Smuzhiyun 	}
1537*4882a593Smuzhiyun 
1538*4882a593Smuzhiyun 	return ret;
1539*4882a593Smuzhiyun }
1540*4882a593Smuzhiyun 
1541*4882a593Smuzhiyun 
1542*4882a593Smuzhiyun static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data)
1543*4882a593Smuzhiyun {
1544*4882a593Smuzhiyun 	struct dlm_ctxt *dlm;
1545*4882a593Smuzhiyun 	struct dlm_migratable_lockres *mres;
1546*4882a593Smuzhiyun 	int ret = 0;
1547*4882a593Smuzhiyun 	struct dlm_lock_resource *res;
1548*4882a593Smuzhiyun 	u8 real_master;
1549*4882a593Smuzhiyun 	u8 extra_ref;
1550*4882a593Smuzhiyun 
1551*4882a593Smuzhiyun 	dlm = item->dlm;
1552*4882a593Smuzhiyun 	mres = (struct dlm_migratable_lockres *)data;
1553*4882a593Smuzhiyun 
1554*4882a593Smuzhiyun 	res = item->u.ml.lockres;
1555*4882a593Smuzhiyun 	real_master = item->u.ml.real_master;
1556*4882a593Smuzhiyun 	extra_ref = item->u.ml.extra_ref;
1557*4882a593Smuzhiyun 
1558*4882a593Smuzhiyun 	if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) {
1559*4882a593Smuzhiyun 		/* this case is super-rare. only occurs if
1560*4882a593Smuzhiyun 		 * node death happens during migration. */
1561*4882a593Smuzhiyun again:
1562*4882a593Smuzhiyun 		ret = dlm_lockres_master_requery(dlm, res, &real_master);
1563*4882a593Smuzhiyun 		if (ret < 0) {
1564*4882a593Smuzhiyun 			mlog(0, "dlm_lockres_master_requery ret=%d\n",
1565*4882a593Smuzhiyun 				  ret);
1566*4882a593Smuzhiyun 			goto again;
1567*4882a593Smuzhiyun 		}
1568*4882a593Smuzhiyun 		if (real_master == DLM_LOCK_RES_OWNER_UNKNOWN) {
1569*4882a593Smuzhiyun 			mlog(0, "lockres %.*s not claimed.  "
1570*4882a593Smuzhiyun 				   "this node will take it.\n",
1571*4882a593Smuzhiyun 				   res->lockname.len, res->lockname.name);
1572*4882a593Smuzhiyun 		} else {
1573*4882a593Smuzhiyun 			spin_lock(&res->spinlock);
1574*4882a593Smuzhiyun 			dlm_lockres_drop_inflight_ref(dlm, res);
1575*4882a593Smuzhiyun 			spin_unlock(&res->spinlock);
1576*4882a593Smuzhiyun 			mlog(0, "master needs to respond to sender "
1577*4882a593Smuzhiyun 				  "that node %u still owns %.*s\n",
1578*4882a593Smuzhiyun 				  real_master, res->lockname.len,
1579*4882a593Smuzhiyun 				  res->lockname.name);
1580*4882a593Smuzhiyun 			/* cannot touch this lockres */
1581*4882a593Smuzhiyun 			goto leave;
1582*4882a593Smuzhiyun 		}
1583*4882a593Smuzhiyun 	}
1584*4882a593Smuzhiyun 
1585*4882a593Smuzhiyun 	ret = dlm_process_recovery_data(dlm, res, mres);
1586*4882a593Smuzhiyun 	if (ret < 0)
1587*4882a593Smuzhiyun 		mlog(0, "dlm_process_recovery_data returned %d\n", ret);
1588*4882a593Smuzhiyun 	else
1589*4882a593Smuzhiyun 		mlog(0, "dlm_process_recovery_data succeeded\n");
1590*4882a593Smuzhiyun 
1591*4882a593Smuzhiyun 	if ((mres->flags & (DLM_MRES_MIGRATION|DLM_MRES_ALL_DONE)) ==
1592*4882a593Smuzhiyun 	                   (DLM_MRES_MIGRATION|DLM_MRES_ALL_DONE)) {
1593*4882a593Smuzhiyun 		ret = dlm_finish_migration(dlm, res, mres->master);
1594*4882a593Smuzhiyun 		if (ret < 0)
1595*4882a593Smuzhiyun 			mlog_errno(ret);
1596*4882a593Smuzhiyun 	}
1597*4882a593Smuzhiyun 
1598*4882a593Smuzhiyun leave:
1599*4882a593Smuzhiyun 	/* See comment in dlm_mig_lockres_handler() */
1600*4882a593Smuzhiyun 	if (res) {
1601*4882a593Smuzhiyun 		if (extra_ref)
1602*4882a593Smuzhiyun 			dlm_lockres_put(res);
1603*4882a593Smuzhiyun 		dlm_lockres_put(res);
1604*4882a593Smuzhiyun 	}
1605*4882a593Smuzhiyun 	kfree(data);
1606*4882a593Smuzhiyun }
1607*4882a593Smuzhiyun 
1608*4882a593Smuzhiyun 
1609*4882a593Smuzhiyun 
1610*4882a593Smuzhiyun static int dlm_lockres_master_requery(struct dlm_ctxt *dlm,
1611*4882a593Smuzhiyun 				      struct dlm_lock_resource *res,
1612*4882a593Smuzhiyun 				      u8 *real_master)
1613*4882a593Smuzhiyun {
1614*4882a593Smuzhiyun 	struct dlm_node_iter iter;
1615*4882a593Smuzhiyun 	int nodenum;
1616*4882a593Smuzhiyun 	int ret = 0;
1617*4882a593Smuzhiyun 
1618*4882a593Smuzhiyun 	*real_master = DLM_LOCK_RES_OWNER_UNKNOWN;
1619*4882a593Smuzhiyun 
1620*4882a593Smuzhiyun 	/* we only reach here if one of the two nodes in a
1621*4882a593Smuzhiyun 	 * migration died while the migration was in progress.
1622*4882a593Smuzhiyun 	 * at this point we need to requery the master.  we
1623*4882a593Smuzhiyun 	 * know that the new_master got as far as creating
1624*4882a593Smuzhiyun 	 * an mle on at least one node, but we do not know
1625*4882a593Smuzhiyun 	 * if any nodes had actually cleared the mle and set
1626*4882a593Smuzhiyun 	 * the master to the new_master.  the old master
1627*4882a593Smuzhiyun 	 * is supposed to set the owner to UNKNOWN in the
1628*4882a593Smuzhiyun 	 * event of a new_master death, so the only possible
1629*4882a593Smuzhiyun 	 * responses that we can get from nodes here are
1630*4882a593Smuzhiyun 	 * that the master is new_master, or that the master
1631*4882a593Smuzhiyun 	 * is UNKNOWN.
1632*4882a593Smuzhiyun 	 * if all nodes come back with UNKNOWN then we know
1633*4882a593Smuzhiyun 	 * the lock needs remastering here.
1634*4882a593Smuzhiyun 	 * if any node comes back with a valid master, check
1635*4882a593Smuzhiyun 	 * to see if that master is the one that we are
1636*4882a593Smuzhiyun 	 * recovering.  if so, then the new_master died and
1637*4882a593Smuzhiyun 	 * we need to remaster this lock.  if not, then the
1638*4882a593Smuzhiyun 	 * new_master survived and that node will respond to
1639*4882a593Smuzhiyun 	 * other nodes about the owner.
1640*4882a593Smuzhiyun 	 * if there is an owner, this node needs to dump this
1641*4882a593Smuzhiyun 	 * lockres and alert the sender that this lockres
1642*4882a593Smuzhiyun 	 * was rejected. */
1643*4882a593Smuzhiyun 	spin_lock(&dlm->spinlock);
1644*4882a593Smuzhiyun 	dlm_node_iter_init(dlm->domain_map, &iter);
1645*4882a593Smuzhiyun 	spin_unlock(&dlm->spinlock);
1646*4882a593Smuzhiyun 
1647*4882a593Smuzhiyun 	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
1648*4882a593Smuzhiyun 		/* do not send to self */
1649*4882a593Smuzhiyun 		if (nodenum == dlm->node_num)
1650*4882a593Smuzhiyun 			continue;
1651*4882a593Smuzhiyun 		ret = dlm_do_master_requery(dlm, res, nodenum, real_master);
1652*4882a593Smuzhiyun 		if (ret < 0) {
1653*4882a593Smuzhiyun 			mlog_errno(ret);
1654*4882a593Smuzhiyun 			if (!dlm_is_host_down(ret))
1655*4882a593Smuzhiyun 				BUG();
1656*4882a593Smuzhiyun 			/* host is down, so answer for that node would be
1657*4882a593Smuzhiyun 			 * DLM_LOCK_RES_OWNER_UNKNOWN.  continue. */
1658*4882a593Smuzhiyun 		}
1659*4882a593Smuzhiyun 		if (*real_master != DLM_LOCK_RES_OWNER_UNKNOWN) {
1660*4882a593Smuzhiyun 			mlog(0, "lock master is %u\n", *real_master);
1661*4882a593Smuzhiyun 			break;
1662*4882a593Smuzhiyun 		}
1663*4882a593Smuzhiyun 	}
1664*4882a593Smuzhiyun 	return ret;
1665*4882a593Smuzhiyun }
1666*4882a593Smuzhiyun 
1667*4882a593Smuzhiyun 
1668*4882a593Smuzhiyun int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
1669*4882a593Smuzhiyun 			  u8 nodenum, u8 *real_master)
1670*4882a593Smuzhiyun {
1671*4882a593Smuzhiyun 	int ret;
1672*4882a593Smuzhiyun 	struct dlm_master_requery req;
1673*4882a593Smuzhiyun 	int status = DLM_LOCK_RES_OWNER_UNKNOWN;
1674*4882a593Smuzhiyun 
1675*4882a593Smuzhiyun 	memset(&req, 0, sizeof(req));
1676*4882a593Smuzhiyun 	req.node_idx = dlm->node_num;
1677*4882a593Smuzhiyun 	req.namelen = res->lockname.len;
1678*4882a593Smuzhiyun 	memcpy(req.name, res->lockname.name, res->lockname.len);
1679*4882a593Smuzhiyun 
1680*4882a593Smuzhiyun resend:
1681*4882a593Smuzhiyun 	ret = o2net_send_message(DLM_MASTER_REQUERY_MSG, dlm->key,
1682*4882a593Smuzhiyun 				 &req, sizeof(req), nodenum, &status);
1683*4882a593Smuzhiyun 	if (ret < 0)
1684*4882a593Smuzhiyun 		mlog(ML_ERROR, "Error %d when sending message %u (key "
1685*4882a593Smuzhiyun 		     "0x%x) to node %u\n", ret, DLM_MASTER_REQUERY_MSG,
1686*4882a593Smuzhiyun 		     dlm->key, nodenum);
1687*4882a593Smuzhiyun 	else if (status == -ENOMEM) {
1688*4882a593Smuzhiyun 		mlog_errno(status);
1689*4882a593Smuzhiyun 		msleep(50);
1690*4882a593Smuzhiyun 		goto resend;
1691*4882a593Smuzhiyun 	} else {
1692*4882a593Smuzhiyun 		BUG_ON(status < 0);
1693*4882a593Smuzhiyun 		BUG_ON(status > DLM_LOCK_RES_OWNER_UNKNOWN);
1694*4882a593Smuzhiyun 		*real_master = (u8) (status & 0xff);
1695*4882a593Smuzhiyun 		mlog(0, "node %u responded to master requery with %u\n",
1696*4882a593Smuzhiyun 			  nodenum, *real_master);
1697*4882a593Smuzhiyun 		ret = 0;
1698*4882a593Smuzhiyun 	}
1699*4882a593Smuzhiyun 	return ret;
1700*4882a593Smuzhiyun }
1701*4882a593Smuzhiyun 
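/*
 * The requery answer travels in the o2net status word: a non-negative
 * status is the owner reported by the remote node (a node number or
 * DLM_LOCK_RES_OWNER_UNKNOWN), -ENOMEM asks the sender to retry, and any
 * other negative value means the remote handler failed.  A minimal sketch
 * of how a caller classifies that status (hypothetical helper name,
 * mirroring the branches in dlm_do_master_requery() above):
 */
static inline int dlm_requery_status_names_master(int status, u8 *real_master)
{
	if (status < 0)
		return 0;	/* error or retry; no owner learned yet */

	*real_master = (u8)(status & 0xff);
	return *real_master != DLM_LOCK_RES_OWNER_UNKNOWN;
}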
1702*4882a593Smuzhiyun 
1703*4882a593Smuzhiyun /* this function cannot error, so unless the sending
1704*4882a593Smuzhiyun  * or receiving of the message failed, the owner can
1705*4882a593Smuzhiyun  * be trusted */
1706*4882a593Smuzhiyun int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
1707*4882a593Smuzhiyun 			       void **ret_data)
1708*4882a593Smuzhiyun {
1709*4882a593Smuzhiyun 	struct dlm_ctxt *dlm = data;
1710*4882a593Smuzhiyun 	struct dlm_master_requery *req = (struct dlm_master_requery *)msg->buf;
1711*4882a593Smuzhiyun 	struct dlm_lock_resource *res = NULL;
1712*4882a593Smuzhiyun 	unsigned int hash;
1713*4882a593Smuzhiyun 	int master = DLM_LOCK_RES_OWNER_UNKNOWN;
1714*4882a593Smuzhiyun 	u32 flags = DLM_ASSERT_MASTER_REQUERY;
1715*4882a593Smuzhiyun 	int dispatched = 0;
1716*4882a593Smuzhiyun 
1717*4882a593Smuzhiyun 	if (!dlm_grab(dlm)) {
1718*4882a593Smuzhiyun 		/* since the domain has gone away on this
1719*4882a593Smuzhiyun 		 * node, the proper response is UNKNOWN */
1720*4882a593Smuzhiyun 		return master;
1721*4882a593Smuzhiyun 	}
1722*4882a593Smuzhiyun 
1723*4882a593Smuzhiyun 	hash = dlm_lockid_hash(req->name, req->namelen);
1724*4882a593Smuzhiyun 
1725*4882a593Smuzhiyun 	spin_lock(&dlm->spinlock);
1726*4882a593Smuzhiyun 	res = __dlm_lookup_lockres(dlm, req->name, req->namelen, hash);
1727*4882a593Smuzhiyun 	if (res) {
1728*4882a593Smuzhiyun 		spin_lock(&res->spinlock);
1729*4882a593Smuzhiyun 		master = res->owner;
1730*4882a593Smuzhiyun 		if (master == dlm->node_num) {
1731*4882a593Smuzhiyun 			int ret = dlm_dispatch_assert_master(dlm, res,
1732*4882a593Smuzhiyun 							     0, 0, flags);
1733*4882a593Smuzhiyun 			if (ret < 0) {
1734*4882a593Smuzhiyun 				mlog_errno(ret);
1735*4882a593Smuzhiyun 				spin_unlock(&res->spinlock);
1736*4882a593Smuzhiyun 				dlm_lockres_put(res);
1737*4882a593Smuzhiyun 				spin_unlock(&dlm->spinlock);
1738*4882a593Smuzhiyun 				dlm_put(dlm);
1739*4882a593Smuzhiyun 				/* sender will take care of this and retry */
1740*4882a593Smuzhiyun 				return ret;
1741*4882a593Smuzhiyun 			} else {
1742*4882a593Smuzhiyun 				dispatched = 1;
1743*4882a593Smuzhiyun 				__dlm_lockres_grab_inflight_worker(dlm, res);
1744*4882a593Smuzhiyun 				spin_unlock(&res->spinlock);
1745*4882a593Smuzhiyun 			}
1746*4882a593Smuzhiyun 		} else {
1747*4882a593Smuzhiyun 			/* put, in case we are not the master */
1748*4882a593Smuzhiyun 			spin_unlock(&res->spinlock);
1749*4882a593Smuzhiyun 			dlm_lockres_put(res);
1750*4882a593Smuzhiyun 		}
1751*4882a593Smuzhiyun 	}
1752*4882a593Smuzhiyun 	spin_unlock(&dlm->spinlock);
1753*4882a593Smuzhiyun 
1754*4882a593Smuzhiyun 	if (!dispatched)
1755*4882a593Smuzhiyun 		dlm_put(dlm);
1756*4882a593Smuzhiyun 	return master;
1757*4882a593Smuzhiyun }
1758*4882a593Smuzhiyun 
1759*4882a593Smuzhiyun static inline struct list_head *
1760*4882a593Smuzhiyun dlm_list_num_to_pointer(struct dlm_lock_resource *res, int list_num)
1761*4882a593Smuzhiyun {
1762*4882a593Smuzhiyun 	struct list_head *ret;
1763*4882a593Smuzhiyun 	BUG_ON(list_num < 0);
1764*4882a593Smuzhiyun 	BUG_ON(list_num > 2);
1765*4882a593Smuzhiyun 	ret = &(res->granted);
1766*4882a593Smuzhiyun 	ret += list_num;
1767*4882a593Smuzhiyun 	return ret;
1768*4882a593Smuzhiyun }
1769*4882a593Smuzhiyun /* TODO: do ast flush business
1770*4882a593Smuzhiyun  * TODO: do MIGRATING and RECOVERING spinning
1771*4882a593Smuzhiyun  */
1772*4882a593Smuzhiyun 
1773*4882a593Smuzhiyun /*
1774*4882a593Smuzhiyun * NOTE about in-flight requests during migration:
1775*4882a593Smuzhiyun *
1776*4882a593Smuzhiyun * Before attempting the migrate, the master has marked the lockres as
1777*4882a593Smuzhiyun * MIGRATING and then flushed all of its pending ASTS.  So any in-flight
1778*4882a593Smuzhiyun * requests either got queued before the MIGRATING flag got set, in which
1779*4882a593Smuzhiyun * case the lock data will reflect the change and a return message is on
1780*4882a593Smuzhiyun * the way, or the request failed to get in before MIGRATING got set.  In
1781*4882a593Smuzhiyun * this case, the caller will be told to spin and wait for the MIGRATING
1782*4882a593Smuzhiyun * flag to be dropped, then recheck the master.
1783*4882a593Smuzhiyun * This holds true for the convert, cancel and unlock cases, and since lvb
1784*4882a593Smuzhiyun * updates are tied to these same messages, it applies to lvb updates as
1785*4882a593Smuzhiyun * well.  For the lock case, there is no way a lock can be on the master
1786*4882a593Smuzhiyun * queue and not be on the secondary queue since the lock is always added
1787*4882a593Smuzhiyun * locally first.  This means that the new target node will never be sent
1788*4882a593Smuzhiyun * a lock that he doesn't already have on the list.
1789*4882a593Smuzhiyun * In total, this means that the local lock is correct and should not be
1790*4882a593Smuzhiyun * updated to match the one sent by the master.  Any messages sent back
1791*4882a593Smuzhiyun * from the master before the MIGRATING flag will bring the lock properly
1792*4882a593Smuzhiyun * up-to-date, and the change will be ordered properly for the waiter.
1793*4882a593Smuzhiyun * We will *not* attempt to modify the lock underneath the waiter.
1794*4882a593Smuzhiyun */
1795*4882a593Smuzhiyun 
1796*4882a593Smuzhiyun static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1797*4882a593Smuzhiyun 				     struct dlm_lock_resource *res,
1798*4882a593Smuzhiyun 				     struct dlm_migratable_lockres *mres)
1799*4882a593Smuzhiyun {
1800*4882a593Smuzhiyun 	struct dlm_migratable_lock *ml;
1801*4882a593Smuzhiyun 	struct list_head *queue, *iter;
1802*4882a593Smuzhiyun 	struct list_head *tmpq = NULL;
1803*4882a593Smuzhiyun 	struct dlm_lock *newlock = NULL;
1804*4882a593Smuzhiyun 	struct dlm_lockstatus *lksb = NULL;
1805*4882a593Smuzhiyun 	int ret = 0;
1806*4882a593Smuzhiyun 	int i, j, bad;
1807*4882a593Smuzhiyun 	struct dlm_lock *lock;
1808*4882a593Smuzhiyun 	u8 from = O2NM_MAX_NODES;
1809*4882a593Smuzhiyun 	__be64 c;
1810*4882a593Smuzhiyun 
1811*4882a593Smuzhiyun 	mlog(0, "running %d locks for this lockres\n", mres->num_locks);
1812*4882a593Smuzhiyun 	for (i=0; i<mres->num_locks; i++) {
1813*4882a593Smuzhiyun 		ml = &(mres->ml[i]);
1814*4882a593Smuzhiyun 
1815*4882a593Smuzhiyun 		if (dlm_is_dummy_lock(dlm, ml, &from)) {
1816*4882a593Smuzhiyun 			/* placeholder, just need to set the refmap bit */
1817*4882a593Smuzhiyun 			BUG_ON(mres->num_locks != 1);
1818*4882a593Smuzhiyun 			mlog(0, "%s:%.*s: dummy lock for %u\n",
1819*4882a593Smuzhiyun 			     dlm->name, mres->lockname_len, mres->lockname,
1820*4882a593Smuzhiyun 			     from);
1821*4882a593Smuzhiyun 			spin_lock(&res->spinlock);
1822*4882a593Smuzhiyun 			dlm_lockres_set_refmap_bit(dlm, res, from);
1823*4882a593Smuzhiyun 			spin_unlock(&res->spinlock);
1824*4882a593Smuzhiyun 			break;
1825*4882a593Smuzhiyun 		}
1826*4882a593Smuzhiyun 		BUG_ON(ml->highest_blocked != LKM_IVMODE);
1827*4882a593Smuzhiyun 		newlock = NULL;
1828*4882a593Smuzhiyun 		lksb = NULL;
1829*4882a593Smuzhiyun 
1830*4882a593Smuzhiyun 		queue = dlm_list_num_to_pointer(res, ml->list);
1831*4882a593Smuzhiyun 		tmpq = NULL;
1832*4882a593Smuzhiyun 
1833*4882a593Smuzhiyun 		/* if the lock is for the local node it needs to
1834*4882a593Smuzhiyun 		 * be moved to the proper location within the queue.
1835*4882a593Smuzhiyun 		 * do not allocate a new lock structure. */
1836*4882a593Smuzhiyun 		if (ml->node == dlm->node_num) {
1837*4882a593Smuzhiyun 			/* MIGRATION ONLY! */
1838*4882a593Smuzhiyun 			BUG_ON(!(mres->flags & DLM_MRES_MIGRATION));
1839*4882a593Smuzhiyun 
1840*4882a593Smuzhiyun 			lock = NULL;
1841*4882a593Smuzhiyun 			spin_lock(&res->spinlock);
1842*4882a593Smuzhiyun 			for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) {
1843*4882a593Smuzhiyun 				tmpq = dlm_list_idx_to_ptr(res, j);
1844*4882a593Smuzhiyun 				list_for_each(iter, tmpq) {
1845*4882a593Smuzhiyun 					lock = list_entry(iter,
1846*4882a593Smuzhiyun 						  struct dlm_lock, list);
1847*4882a593Smuzhiyun 					if (lock->ml.cookie == ml->cookie)
1848*4882a593Smuzhiyun 						break;
1849*4882a593Smuzhiyun 					lock = NULL;
1850*4882a593Smuzhiyun 				}
1851*4882a593Smuzhiyun 				if (lock)
1852*4882a593Smuzhiyun 					break;
1853*4882a593Smuzhiyun 			}
1854*4882a593Smuzhiyun 
1855*4882a593Smuzhiyun 			/* lock is always created locally first, and
1856*4882a593Smuzhiyun 			 * destroyed locally last.  it must be on the list */
1857*4882a593Smuzhiyun 			if (!lock) {
1858*4882a593Smuzhiyun 				c = ml->cookie;
1859*4882a593Smuzhiyun 				mlog(ML_ERROR, "Could not find local lock "
1860*4882a593Smuzhiyun 					       "with cookie %u:%llu, node %u, "
1861*4882a593Smuzhiyun 					       "list %u, flags 0x%x, type %d, "
1862*4882a593Smuzhiyun 					       "conv %d, highest blocked %d\n",
1863*4882a593Smuzhiyun 				     dlm_get_lock_cookie_node(be64_to_cpu(c)),
1864*4882a593Smuzhiyun 				     dlm_get_lock_cookie_seq(be64_to_cpu(c)),
1865*4882a593Smuzhiyun 				     ml->node, ml->list, ml->flags, ml->type,
1866*4882a593Smuzhiyun 				     ml->convert_type, ml->highest_blocked);
1867*4882a593Smuzhiyun 				__dlm_print_one_lock_resource(res);
1868*4882a593Smuzhiyun 				BUG();
1869*4882a593Smuzhiyun 			}
1870*4882a593Smuzhiyun 
1871*4882a593Smuzhiyun 			if (lock->ml.node != ml->node) {
1872*4882a593Smuzhiyun 				c = lock->ml.cookie;
1873*4882a593Smuzhiyun 				mlog(ML_ERROR, "Mismatched node# in lock "
1874*4882a593Smuzhiyun 				     "cookie %u:%llu, name %.*s, node %u\n",
1875*4882a593Smuzhiyun 				     dlm_get_lock_cookie_node(be64_to_cpu(c)),
1876*4882a593Smuzhiyun 				     dlm_get_lock_cookie_seq(be64_to_cpu(c)),
1877*4882a593Smuzhiyun 				     res->lockname.len, res->lockname.name,
1878*4882a593Smuzhiyun 				     lock->ml.node);
1879*4882a593Smuzhiyun 				c = ml->cookie;
1880*4882a593Smuzhiyun 				mlog(ML_ERROR, "Migrate lock cookie %u:%llu, "
1881*4882a593Smuzhiyun 				     "node %u, list %u, flags 0x%x, type %d, "
1882*4882a593Smuzhiyun 				     "conv %d, highest blocked %d\n",
1883*4882a593Smuzhiyun 				     dlm_get_lock_cookie_node(be64_to_cpu(c)),
1884*4882a593Smuzhiyun 				     dlm_get_lock_cookie_seq(be64_to_cpu(c)),
1885*4882a593Smuzhiyun 				     ml->node, ml->list, ml->flags, ml->type,
1886*4882a593Smuzhiyun 				     ml->convert_type, ml->highest_blocked);
1887*4882a593Smuzhiyun 				__dlm_print_one_lock_resource(res);
1888*4882a593Smuzhiyun 				BUG();
1889*4882a593Smuzhiyun 			}
1890*4882a593Smuzhiyun 
1891*4882a593Smuzhiyun 			if (tmpq != queue) {
1892*4882a593Smuzhiyun 				c = ml->cookie;
1893*4882a593Smuzhiyun 				mlog(0, "Lock cookie %u:%llu was on list %u "
1894*4882a593Smuzhiyun 				     "instead of list %u for %.*s\n",
1895*4882a593Smuzhiyun 				     dlm_get_lock_cookie_node(be64_to_cpu(c)),
1896*4882a593Smuzhiyun 				     dlm_get_lock_cookie_seq(be64_to_cpu(c)),
1897*4882a593Smuzhiyun 				     j, ml->list, res->lockname.len,
1898*4882a593Smuzhiyun 				     res->lockname.name);
1899*4882a593Smuzhiyun 				__dlm_print_one_lock_resource(res);
1900*4882a593Smuzhiyun 				spin_unlock(&res->spinlock);
1901*4882a593Smuzhiyun 				continue;
1902*4882a593Smuzhiyun 			}
1903*4882a593Smuzhiyun 
1904*4882a593Smuzhiyun 			/* see NOTE above about why we do not update
1905*4882a593Smuzhiyun 			 * to match the master here */
1906*4882a593Smuzhiyun 
1907*4882a593Smuzhiyun 			/* move the lock to its proper place */
1908*4882a593Smuzhiyun 			/* do not alter lock refcount.  switching lists. */
1909*4882a593Smuzhiyun 			list_move_tail(&lock->list, queue);
1910*4882a593Smuzhiyun 			spin_unlock(&res->spinlock);
1911*4882a593Smuzhiyun 
1912*4882a593Smuzhiyun 			mlog(0, "just reordered a local lock!\n");
1913*4882a593Smuzhiyun 			continue;
1914*4882a593Smuzhiyun 		}
1915*4882a593Smuzhiyun 
1916*4882a593Smuzhiyun 		/* lock is for another node. */
1917*4882a593Smuzhiyun 		newlock = dlm_new_lock(ml->type, ml->node,
1918*4882a593Smuzhiyun 				       be64_to_cpu(ml->cookie), NULL);
1919*4882a593Smuzhiyun 		if (!newlock) {
1920*4882a593Smuzhiyun 			ret = -ENOMEM;
1921*4882a593Smuzhiyun 			goto leave;
1922*4882a593Smuzhiyun 		}
1923*4882a593Smuzhiyun 		lksb = newlock->lksb;
1924*4882a593Smuzhiyun 		dlm_lock_attach_lockres(newlock, res);
1925*4882a593Smuzhiyun 
1926*4882a593Smuzhiyun 		if (ml->convert_type != LKM_IVMODE) {
1927*4882a593Smuzhiyun 			BUG_ON(queue != &res->converting);
1928*4882a593Smuzhiyun 			newlock->ml.convert_type = ml->convert_type;
1929*4882a593Smuzhiyun 		}
1930*4882a593Smuzhiyun 		lksb->flags |= (ml->flags &
1931*4882a593Smuzhiyun 				(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB));
1932*4882a593Smuzhiyun 
1933*4882a593Smuzhiyun 		if (ml->type == LKM_NLMODE)
1934*4882a593Smuzhiyun 			goto skip_lvb;
1935*4882a593Smuzhiyun 
1936*4882a593Smuzhiyun 		/*
1937*4882a593Smuzhiyun 		 * If the lock is in the blocked list it can't have a valid lvb,
1938*4882a593Smuzhiyun 		 * so skip it
1939*4882a593Smuzhiyun 		 */
1940*4882a593Smuzhiyun 		if (ml->list == DLM_BLOCKED_LIST)
1941*4882a593Smuzhiyun 			goto skip_lvb;
1942*4882a593Smuzhiyun 
1943*4882a593Smuzhiyun 		if (!dlm_lvb_is_empty(mres->lvb)) {
1944*4882a593Smuzhiyun 			if (lksb->flags & DLM_LKSB_PUT_LVB) {
1945*4882a593Smuzhiyun 				/* other node was trying to update
1946*4882a593Smuzhiyun 				 * lvb when node died.  recreate the
1947*4882a593Smuzhiyun 				 * lksb with the updated lvb. */
1948*4882a593Smuzhiyun 				memcpy(lksb->lvb, mres->lvb, DLM_LVB_LEN);
1949*4882a593Smuzhiyun 				/* the lock resource lvb update must happen
1950*4882a593Smuzhiyun 				 * NOW, before the spinlock is dropped.
1951*4882a593Smuzhiyun 				 * we no longer wait for the AST to update
1952*4882a593Smuzhiyun 				 * the lvb. */
1953*4882a593Smuzhiyun 				memcpy(res->lvb, mres->lvb, DLM_LVB_LEN);
1954*4882a593Smuzhiyun 			} else {
1955*4882a593Smuzhiyun 				/* otherwise, the node is sending its
1956*4882a593Smuzhiyun 				 * most recent valid lvb info */
1957*4882a593Smuzhiyun 				BUG_ON(ml->type != LKM_EXMODE &&
1958*4882a593Smuzhiyun 				       ml->type != LKM_PRMODE);
1959*4882a593Smuzhiyun 				if (!dlm_lvb_is_empty(res->lvb) &&
1960*4882a593Smuzhiyun 				    (ml->type == LKM_EXMODE ||
1961*4882a593Smuzhiyun 				     memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) {
1962*4882a593Smuzhiyun 					int i;
1963*4882a593Smuzhiyun 					mlog(ML_ERROR, "%s:%.*s: received bad "
1964*4882a593Smuzhiyun 					     "lvb! type=%d\n", dlm->name,
1965*4882a593Smuzhiyun 					     res->lockname.len,
1966*4882a593Smuzhiyun 					     res->lockname.name, ml->type);
1967*4882a593Smuzhiyun 					printk("lockres lvb=[");
1968*4882a593Smuzhiyun 					for (i=0; i<DLM_LVB_LEN; i++)
1969*4882a593Smuzhiyun 						printk("%02x", res->lvb[i]);
1970*4882a593Smuzhiyun 					printk("]\nmigrated lvb=[");
1971*4882a593Smuzhiyun 					for (i=0; i<DLM_LVB_LEN; i++)
1972*4882a593Smuzhiyun 						printk("%02x", mres->lvb[i]);
1973*4882a593Smuzhiyun 					printk("]\n");
1974*4882a593Smuzhiyun 					dlm_print_one_lock_resource(res);
1975*4882a593Smuzhiyun 					BUG();
1976*4882a593Smuzhiyun 				}
1977*4882a593Smuzhiyun 				memcpy(res->lvb, mres->lvb, DLM_LVB_LEN);
1978*4882a593Smuzhiyun 			}
1979*4882a593Smuzhiyun 		}
1980*4882a593Smuzhiyun skip_lvb:
1981*4882a593Smuzhiyun 
1982*4882a593Smuzhiyun 		/* NOTE:
1983*4882a593Smuzhiyun 		 * wrt lock queue ordering and recovery:
1984*4882a593Smuzhiyun 		 *    1. order of locks on granted queue is
1985*4882a593Smuzhiyun 		 *       meaningless.
1986*4882a593Smuzhiyun 		 *    2. order of locks on converting queue is
1987*4882a593Smuzhiyun 		 *       LOST with the node death.  sorry charlie.
1988*4882a593Smuzhiyun 		 *    3. order of locks on the blocked queue is
1989*4882a593Smuzhiyun 		 *       also LOST.
1990*4882a593Smuzhiyun 		 * order of locks does not affect integrity, it
1991*4882a593Smuzhiyun 		 * just means that a lock request may get pushed
1992*4882a593Smuzhiyun 		 * back in line as a result of the node death.
1993*4882a593Smuzhiyun 		 * also note that for a given node the lock order
1994*4882a593Smuzhiyun 		 * for its secondary queue locks is preserved
1995*4882a593Smuzhiyun 		 * relative to each other, but clearly *not*
1996*4882a593Smuzhiyun 		 * preserved relative to locks from other nodes.
1997*4882a593Smuzhiyun 		 */
1998*4882a593Smuzhiyun 		bad = 0;
1999*4882a593Smuzhiyun 		spin_lock(&res->spinlock);
2000*4882a593Smuzhiyun 		list_for_each_entry(lock, queue, list) {
2001*4882a593Smuzhiyun 			if (lock->ml.cookie == ml->cookie) {
2002*4882a593Smuzhiyun 				c = lock->ml.cookie;
2003*4882a593Smuzhiyun 				mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already "
2004*4882a593Smuzhiyun 				     "exists on this lockres!\n", dlm->name,
2005*4882a593Smuzhiyun 				     res->lockname.len, res->lockname.name,
2006*4882a593Smuzhiyun 				     dlm_get_lock_cookie_node(be64_to_cpu(c)),
2007*4882a593Smuzhiyun 				     dlm_get_lock_cookie_seq(be64_to_cpu(c)));
2008*4882a593Smuzhiyun 
2009*4882a593Smuzhiyun 				mlog(ML_NOTICE, "sent lock: type=%d, conv=%d, "
2010*4882a593Smuzhiyun 				     "node=%u, cookie=%u:%llu, queue=%d\n",
2011*4882a593Smuzhiyun 				     ml->type, ml->convert_type, ml->node,
2012*4882a593Smuzhiyun 				     dlm_get_lock_cookie_node(be64_to_cpu(ml->cookie)),
2013*4882a593Smuzhiyun 				     dlm_get_lock_cookie_seq(be64_to_cpu(ml->cookie)),
2014*4882a593Smuzhiyun 				     ml->list);
2015*4882a593Smuzhiyun 
2016*4882a593Smuzhiyun 				__dlm_print_one_lock_resource(res);
2017*4882a593Smuzhiyun 				bad = 1;
2018*4882a593Smuzhiyun 				break;
2019*4882a593Smuzhiyun 			}
2020*4882a593Smuzhiyun 		}
2021*4882a593Smuzhiyun 		if (!bad) {
2022*4882a593Smuzhiyun 			dlm_lock_get(newlock);
2023*4882a593Smuzhiyun 			if (mres->flags & DLM_MRES_RECOVERY &&
2024*4882a593Smuzhiyun 					ml->list == DLM_CONVERTING_LIST &&
2025*4882a593Smuzhiyun 					newlock->ml.type >
2026*4882a593Smuzhiyun 					newlock->ml.convert_type) {
2027*4882a593Smuzhiyun 				/* newlock is doing downconvert, add it to the
2028*4882a593Smuzhiyun 				 * head of converting list */
2029*4882a593Smuzhiyun 				list_add(&newlock->list, queue);
2030*4882a593Smuzhiyun 			} else
2031*4882a593Smuzhiyun 				list_add_tail(&newlock->list, queue);
2032*4882a593Smuzhiyun 			mlog(0, "%s:%.*s: added lock for node %u, "
2033*4882a593Smuzhiyun 			     "setting refmap bit\n", dlm->name,
2034*4882a593Smuzhiyun 			     res->lockname.len, res->lockname.name, ml->node);
2035*4882a593Smuzhiyun 			dlm_lockres_set_refmap_bit(dlm, res, ml->node);
2036*4882a593Smuzhiyun 		}
2037*4882a593Smuzhiyun 		spin_unlock(&res->spinlock);
2038*4882a593Smuzhiyun 	}
2039*4882a593Smuzhiyun 	mlog(0, "done running all the locks\n");
2040*4882a593Smuzhiyun 
2041*4882a593Smuzhiyun leave:
2042*4882a593Smuzhiyun 	/* balance the ref taken when the work was queued */
2043*4882a593Smuzhiyun 	spin_lock(&res->spinlock);
2044*4882a593Smuzhiyun 	dlm_lockres_drop_inflight_ref(dlm, res);
2045*4882a593Smuzhiyun 	spin_unlock(&res->spinlock);
2046*4882a593Smuzhiyun 
2047*4882a593Smuzhiyun 	if (ret < 0)
2048*4882a593Smuzhiyun 		mlog_errno(ret);
2049*4882a593Smuzhiyun 
2050*4882a593Smuzhiyun 	return ret;
2051*4882a593Smuzhiyun }
2052*4882a593Smuzhiyun 
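/*
 * In dlm_process_recovery_data() above, a recovered lock that sits on the
 * converting list with a granted mode (type) strictly above its requested
 * mode (convert_type) is a downconvert and is queued at the head of the
 * converting list; every other lock keeps arrival order at the tail.  A
 * minimal predicate capturing that rule (hypothetical helper name,
 * illustrative only; the real code also requires DLM_MRES_RECOVERY):
 */
static inline int dlm_mig_lock_is_downconvert(const struct dlm_migratable_lock *ml)
{
	return ml->list == DLM_CONVERTING_LIST &&
	       ml->type > ml->convert_type;
}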
2053*4882a593Smuzhiyun void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
2054*4882a593Smuzhiyun 				       struct dlm_lock_resource *res)
2055*4882a593Smuzhiyun {
2056*4882a593Smuzhiyun 	int i;
2057*4882a593Smuzhiyun 	struct list_head *queue;
2058*4882a593Smuzhiyun 	struct dlm_lock *lock, *next;
2059*4882a593Smuzhiyun 
2060*4882a593Smuzhiyun 	assert_spin_locked(&dlm->spinlock);
2061*4882a593Smuzhiyun 	assert_spin_locked(&res->spinlock);
2062*4882a593Smuzhiyun 	res->state |= DLM_LOCK_RES_RECOVERING;
2063*4882a593Smuzhiyun 	if (!list_empty(&res->recovering)) {
2064*4882a593Smuzhiyun 		mlog(0,
2065*4882a593Smuzhiyun 		     "Recovering res %s:%.*s, is already on recovery list!\n",
2066*4882a593Smuzhiyun 		     dlm->name, res->lockname.len, res->lockname.name);
2067*4882a593Smuzhiyun 		list_del_init(&res->recovering);
2068*4882a593Smuzhiyun 		dlm_lockres_put(res);
2069*4882a593Smuzhiyun 	}
2070*4882a593Smuzhiyun 	/* We need to hold a reference while on the recovery list */
2071*4882a593Smuzhiyun 	dlm_lockres_get(res);
2072*4882a593Smuzhiyun 	list_add_tail(&res->recovering, &dlm->reco.resources);
2073*4882a593Smuzhiyun 
2074*4882a593Smuzhiyun 	/* find any pending locks and put them back on proper list */
2075*4882a593Smuzhiyun 	for (i=DLM_BLOCKED_LIST; i>=DLM_GRANTED_LIST; i--) {
2076*4882a593Smuzhiyun 		queue = dlm_list_idx_to_ptr(res, i);
2077*4882a593Smuzhiyun 		list_for_each_entry_safe(lock, next, queue, list) {
2078*4882a593Smuzhiyun 			dlm_lock_get(lock);
2079*4882a593Smuzhiyun 			if (lock->convert_pending) {
2080*4882a593Smuzhiyun 				/* move converting lock back to granted */
2081*4882a593Smuzhiyun 				mlog(0, "node died with convert pending "
2082*4882a593Smuzhiyun 				     "on %.*s. move back to granted list.\n",
2083*4882a593Smuzhiyun 				     res->lockname.len, res->lockname.name);
2084*4882a593Smuzhiyun 				dlm_revert_pending_convert(res, lock);
2085*4882a593Smuzhiyun 				lock->convert_pending = 0;
2086*4882a593Smuzhiyun 			} else if (lock->lock_pending) {
2087*4882a593Smuzhiyun 				/* remove pending lock requests completely */
2088*4882a593Smuzhiyun 				BUG_ON(i != DLM_BLOCKED_LIST);
2089*4882a593Smuzhiyun 				mlog(0, "node died with lock pending "
2090*4882a593Smuzhiyun 				     "on %.*s. remove from blocked list and skip.\n",
2091*4882a593Smuzhiyun 				     res->lockname.len, res->lockname.name);
2092*4882a593Smuzhiyun 				/* lock will be floating until ref in
2093*4882a593Smuzhiyun 				 * dlmlock_remote is freed after the network
2094*4882a593Smuzhiyun 				 * call returns.  ok for it to not be on any
2095*4882a593Smuzhiyun 				 * list since no ast can be called
2096*4882a593Smuzhiyun 				 * (the master is dead). */
2097*4882a593Smuzhiyun 				dlm_revert_pending_lock(res, lock);
2098*4882a593Smuzhiyun 				lock->lock_pending = 0;
2099*4882a593Smuzhiyun 			} else if (lock->unlock_pending) {
2100*4882a593Smuzhiyun 				/* if an unlock was in progress, treat as
2101*4882a593Smuzhiyun 				 * if this had completed successfully
2102*4882a593Smuzhiyun 				 * before sending this lock state to the
2103*4882a593Smuzhiyun 				 * new master.  note that the dlm_unlock
2104*4882a593Smuzhiyun 				 * call is still responsible for calling
2105*4882a593Smuzhiyun 				 * the unlockast.  that will happen after
2106*4882a593Smuzhiyun 				 * the network call times out.  for now,
2107*4882a593Smuzhiyun 				 * just move lists to prepare the new
2108*4882a593Smuzhiyun 				 * recovery master.  */
2109*4882a593Smuzhiyun 				BUG_ON(i != DLM_GRANTED_LIST);
2110*4882a593Smuzhiyun 				mlog(0, "node died with unlock pending "
2111*4882a593Smuzhiyun 				     "on %.*s. committing the pending unlock.\n",
2112*4882a593Smuzhiyun 				     res->lockname.len, res->lockname.name);
2113*4882a593Smuzhiyun 				dlm_commit_pending_unlock(res, lock);
2114*4882a593Smuzhiyun 				lock->unlock_pending = 0;
2115*4882a593Smuzhiyun 			} else if (lock->cancel_pending) {
2116*4882a593Smuzhiyun 				/* if a cancel was in progress, treat as
2117*4882a593Smuzhiyun 				 * if this had completed successfully
2118*4882a593Smuzhiyun 				 * before sending this lock state to the
2119*4882a593Smuzhiyun 				 * new master */
2120*4882a593Smuzhiyun 				BUG_ON(i != DLM_CONVERTING_LIST);
2121*4882a593Smuzhiyun 				mlog(0, "node died with cancel pending "
2122*4882a593Smuzhiyun 				     "on %.*s. move back to granted list.\n",
2123*4882a593Smuzhiyun 				     res->lockname.len, res->lockname.name);
2124*4882a593Smuzhiyun 				dlm_commit_pending_cancel(res, lock);
2125*4882a593Smuzhiyun 				lock->cancel_pending = 0;
2126*4882a593Smuzhiyun 			}
2127*4882a593Smuzhiyun 			dlm_lock_put(lock);
2128*4882a593Smuzhiyun 		}
2129*4882a593Smuzhiyun 	}
2130*4882a593Smuzhiyun }
2131*4882a593Smuzhiyun 
2132*4882a593Smuzhiyun 
2133*4882a593Smuzhiyun 
2134*4882a593Smuzhiyun /* removes all recovered locks from the recovery list.
2135*4882a593Smuzhiyun  * sets the res->owner to the new master.
2136*4882a593Smuzhiyun  * unsets the RECOVERY flag and wakes waiters. */
2137*4882a593Smuzhiyun static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
2138*4882a593Smuzhiyun 					      u8 dead_node, u8 new_master)
2139*4882a593Smuzhiyun {
2140*4882a593Smuzhiyun 	int i;
2141*4882a593Smuzhiyun 	struct hlist_head *bucket;
2142*4882a593Smuzhiyun 	struct dlm_lock_resource *res, *next;
2143*4882a593Smuzhiyun 
2144*4882a593Smuzhiyun 	assert_spin_locked(&dlm->spinlock);
2145*4882a593Smuzhiyun 
2146*4882a593Smuzhiyun 	list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) {
2147*4882a593Smuzhiyun 		if (res->owner == dead_node) {
2148*4882a593Smuzhiyun 			mlog(0, "%s: res %.*s, Changing owner from %u to %u\n",
2149*4882a593Smuzhiyun 			     dlm->name, res->lockname.len, res->lockname.name,
2150*4882a593Smuzhiyun 			     res->owner, new_master);
2151*4882a593Smuzhiyun 			list_del_init(&res->recovering);
2152*4882a593Smuzhiyun 			spin_lock(&res->spinlock);
2153*4882a593Smuzhiyun 			/* new_master has our reference from
2154*4882a593Smuzhiyun 			 * the lock state sent during recovery */
2155*4882a593Smuzhiyun 			dlm_change_lockres_owner(dlm, res, new_master);
2156*4882a593Smuzhiyun 			res->state &= ~DLM_LOCK_RES_RECOVERING;
2157*4882a593Smuzhiyun 			if (__dlm_lockres_has_locks(res))
2158*4882a593Smuzhiyun 				__dlm_dirty_lockres(dlm, res);
2159*4882a593Smuzhiyun 			spin_unlock(&res->spinlock);
2160*4882a593Smuzhiyun 			wake_up(&res->wq);
2161*4882a593Smuzhiyun 			dlm_lockres_put(res);
2162*4882a593Smuzhiyun 		}
2163*4882a593Smuzhiyun 	}
2164*4882a593Smuzhiyun 
2165*4882a593Smuzhiyun 	/* this will become unnecessary eventually, but
2166*4882a593Smuzhiyun 	 * for now we need to run the whole hash, clear
2167*4882a593Smuzhiyun 	 * the RECOVERING state and set the owner
2168*4882a593Smuzhiyun 	 * if necessary */
2169*4882a593Smuzhiyun 	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
2170*4882a593Smuzhiyun 		bucket = dlm_lockres_hash(dlm, i);
2171*4882a593Smuzhiyun 		hlist_for_each_entry(res, bucket, hash_node) {
2172*4882a593Smuzhiyun 			if (res->state & DLM_LOCK_RES_RECOVERY_WAITING) {
2173*4882a593Smuzhiyun 				spin_lock(&res->spinlock);
2174*4882a593Smuzhiyun 				res->state &= ~DLM_LOCK_RES_RECOVERY_WAITING;
2175*4882a593Smuzhiyun 				spin_unlock(&res->spinlock);
2176*4882a593Smuzhiyun 				wake_up(&res->wq);
2177*4882a593Smuzhiyun 			}
2178*4882a593Smuzhiyun 
2179*4882a593Smuzhiyun 			if (!(res->state & DLM_LOCK_RES_RECOVERING))
2180*4882a593Smuzhiyun 				continue;
2181*4882a593Smuzhiyun 
2182*4882a593Smuzhiyun 			if (res->owner != dead_node &&
2183*4882a593Smuzhiyun 			    res->owner != dlm->node_num)
2184*4882a593Smuzhiyun 				continue;
2185*4882a593Smuzhiyun 
2186*4882a593Smuzhiyun 			if (!list_empty(&res->recovering)) {
2187*4882a593Smuzhiyun 				list_del_init(&res->recovering);
2188*4882a593Smuzhiyun 				dlm_lockres_put(res);
2189*4882a593Smuzhiyun 			}
2190*4882a593Smuzhiyun 
2191*4882a593Smuzhiyun 			/* new_master has our reference from
2192*4882a593Smuzhiyun 			 * the lock state sent during recovery */
2193*4882a593Smuzhiyun 			mlog(0, "%s: res %.*s, Changing owner from %u to %u\n",
2194*4882a593Smuzhiyun 			     dlm->name, res->lockname.len, res->lockname.name,
2195*4882a593Smuzhiyun 			     res->owner, new_master);
2196*4882a593Smuzhiyun 			spin_lock(&res->spinlock);
2197*4882a593Smuzhiyun 			dlm_change_lockres_owner(dlm, res, new_master);
2198*4882a593Smuzhiyun 			res->state &= ~DLM_LOCK_RES_RECOVERING;
2199*4882a593Smuzhiyun 			if (__dlm_lockres_has_locks(res))
2200*4882a593Smuzhiyun 				__dlm_dirty_lockres(dlm, res);
2201*4882a593Smuzhiyun 			spin_unlock(&res->spinlock);
2202*4882a593Smuzhiyun 			wake_up(&res->wq);
2203*4882a593Smuzhiyun 		}
2204*4882a593Smuzhiyun 	}
2205*4882a593Smuzhiyun }
2206*4882a593Smuzhiyun 
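/* Decide whether the LVB must be blanked after a node death.  On the
 * master, blank it only if the dead node held EX (it may have written a
 * newer value that was lost).  On a non-master copy, the cached value is
 * only trustworthy while this node itself holds EX or PR. */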
2207*4882a593Smuzhiyun static inline int dlm_lvb_needs_invalidation(struct dlm_lock *lock, int local)
2208*4882a593Smuzhiyun {
2209*4882a593Smuzhiyun 	if (local) {
2210*4882a593Smuzhiyun 		if (lock->ml.type != LKM_EXMODE &&
2211*4882a593Smuzhiyun 		    lock->ml.type != LKM_PRMODE)
2212*4882a593Smuzhiyun 			return 1;
2213*4882a593Smuzhiyun 	} else if (lock->ml.type == LKM_EXMODE)
2214*4882a593Smuzhiyun 		return 1;
2215*4882a593Smuzhiyun 	return 0;
2216*4882a593Smuzhiyun }
2217*4882a593Smuzhiyun 
2218*4882a593Smuzhiyun static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
2219*4882a593Smuzhiyun 			       struct dlm_lock_resource *res, u8 dead_node)
2220*4882a593Smuzhiyun {
2221*4882a593Smuzhiyun 	struct list_head *queue;
2222*4882a593Smuzhiyun 	struct dlm_lock *lock;
2223*4882a593Smuzhiyun 	int blank_lvb = 0, local = 0;
2224*4882a593Smuzhiyun 	int i;
2225*4882a593Smuzhiyun 	u8 search_node;
2226*4882a593Smuzhiyun 
2227*4882a593Smuzhiyun 	assert_spin_locked(&dlm->spinlock);
2228*4882a593Smuzhiyun 	assert_spin_locked(&res->spinlock);
2229*4882a593Smuzhiyun 
2230*4882a593Smuzhiyun 	if (res->owner == dlm->node_num)
2231*4882a593Smuzhiyun 		/* if this node owned the lockres, and if the dead node
2232*4882a593Smuzhiyun 		 * had an EX when he died, blank out the lvb */
2233*4882a593Smuzhiyun 		search_node = dead_node;
2234*4882a593Smuzhiyun 	else {
2235*4882a593Smuzhiyun 		/* if this is a secondary lockres, and we had no EX or PR
2236*4882a593Smuzhiyun 		 * locks granted, we can no longer trust the lvb */
2237*4882a593Smuzhiyun 		search_node = dlm->node_num;
2238*4882a593Smuzhiyun 		local = 1;  /* check local state for valid lvb */
2239*4882a593Smuzhiyun 	}
2240*4882a593Smuzhiyun 
2241*4882a593Smuzhiyun 	for (i=DLM_GRANTED_LIST; i<=DLM_CONVERTING_LIST; i++) {
2242*4882a593Smuzhiyun 		queue = dlm_list_idx_to_ptr(res, i);
2243*4882a593Smuzhiyun 		list_for_each_entry(lock, queue, list) {
2244*4882a593Smuzhiyun 			if (lock->ml.node == search_node) {
2245*4882a593Smuzhiyun 				if (dlm_lvb_needs_invalidation(lock, local)) {
2246*4882a593Smuzhiyun 					/* zero the lksb lvb and lockres lvb */
2247*4882a593Smuzhiyun 					blank_lvb = 1;
2248*4882a593Smuzhiyun 					memset(lock->lksb->lvb, 0, DLM_LVB_LEN);
2249*4882a593Smuzhiyun 				}
2250*4882a593Smuzhiyun 			}
2251*4882a593Smuzhiyun 		}
2252*4882a593Smuzhiyun 	}
2253*4882a593Smuzhiyun 
2254*4882a593Smuzhiyun 	if (blank_lvb) {
2255*4882a593Smuzhiyun 		mlog(0, "clearing %.*s lvb, dead node %u had EX\n",
2256*4882a593Smuzhiyun 		     res->lockname.len, res->lockname.name, dead_node);
2257*4882a593Smuzhiyun 		memset(res->lvb, 0, DLM_LVB_LEN);
2258*4882a593Smuzhiyun 	}
2259*4882a593Smuzhiyun }
2260*4882a593Smuzhiyun 
2261*4882a593Smuzhiyun static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
2262*4882a593Smuzhiyun 				struct dlm_lock_resource *res, u8 dead_node)
2263*4882a593Smuzhiyun {
2264*4882a593Smuzhiyun 	struct dlm_lock *lock, *next;
2265*4882a593Smuzhiyun 	unsigned int freed = 0;
2266*4882a593Smuzhiyun 
2267*4882a593Smuzhiyun 	/* this node is the lockres master:
2268*4882a593Smuzhiyun 	 * 1) remove any stale locks for the dead node
2269*4882a593Smuzhiyun 	 * 2) if the dead node had an EX when he died, blank out the lvb
2270*4882a593Smuzhiyun 	 */
2271*4882a593Smuzhiyun 	assert_spin_locked(&dlm->spinlock);
2272*4882a593Smuzhiyun 	assert_spin_locked(&res->spinlock);
2273*4882a593Smuzhiyun 
2274*4882a593Smuzhiyun 	/* We do two dlm_lock_put(). One for removing from list and the other is
2275*4882a593Smuzhiyun 	 * to force the DLM_UNLOCK_FREE_LOCK action so as to free the locks */
2276*4882a593Smuzhiyun 
2277*4882a593Smuzhiyun 	/* TODO: check pending_asts, pending_basts here */
2278*4882a593Smuzhiyun 	list_for_each_entry_safe(lock, next, &res->granted, list) {
2279*4882a593Smuzhiyun 		if (lock->ml.node == dead_node) {
2280*4882a593Smuzhiyun 			list_del_init(&lock->list);
2281*4882a593Smuzhiyun 			dlm_lock_put(lock);
2282*4882a593Smuzhiyun 			/* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */
2283*4882a593Smuzhiyun 			dlm_lock_put(lock);
2284*4882a593Smuzhiyun 			freed++;
2285*4882a593Smuzhiyun 		}
2286*4882a593Smuzhiyun 	}
2287*4882a593Smuzhiyun 	list_for_each_entry_safe(lock, next, &res->converting, list) {
2288*4882a593Smuzhiyun 		if (lock->ml.node == dead_node) {
2289*4882a593Smuzhiyun 			list_del_init(&lock->list);
2290*4882a593Smuzhiyun 			dlm_lock_put(lock);
2291*4882a593Smuzhiyun 			/* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */
2292*4882a593Smuzhiyun 			dlm_lock_put(lock);
2293*4882a593Smuzhiyun 			freed++;
2294*4882a593Smuzhiyun 		}
2295*4882a593Smuzhiyun 	}
2296*4882a593Smuzhiyun 	list_for_each_entry_safe(lock, next, &res->blocked, list) {
2297*4882a593Smuzhiyun 		if (lock->ml.node == dead_node) {
2298*4882a593Smuzhiyun 			list_del_init(&lock->list);
2299*4882a593Smuzhiyun 			dlm_lock_put(lock);
2300*4882a593Smuzhiyun 			/* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */
2301*4882a593Smuzhiyun 			dlm_lock_put(lock);
2302*4882a593Smuzhiyun 			freed++;
2303*4882a593Smuzhiyun 		}
2304*4882a593Smuzhiyun 	}
2305*4882a593Smuzhiyun 
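	/* Freeing the dead node's locks also drops its implicit reference on
	 * this resource: flag the lockres RECOVERY_WAITING until
	 * dlm_finish_local_lockres_recovery() clears it, and clear the dead
	 * node's refmap bit in either case. */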
2306*4882a593Smuzhiyun 	if (freed) {
2307*4882a593Smuzhiyun 		mlog(0, "%s:%.*s: freed %u locks for dead node %u, "
2308*4882a593Smuzhiyun 		     "dropping ref from lockres\n", dlm->name,
2309*4882a593Smuzhiyun 		     res->lockname.len, res->lockname.name, freed, dead_node);
2310*4882a593Smuzhiyun 		if (!test_bit(dead_node, res->refmap)) {
2311*4882a593Smuzhiyun 			mlog(ML_ERROR, "%s:%.*s: freed %u locks for dead node %u, "
2312*4882a593Smuzhiyun 			     "but ref was not set\n", dlm->name,
2313*4882a593Smuzhiyun 			     res->lockname.len, res->lockname.name, freed, dead_node);
2314*4882a593Smuzhiyun 			__dlm_print_one_lock_resource(res);
2315*4882a593Smuzhiyun 		}
2316*4882a593Smuzhiyun 		res->state |= DLM_LOCK_RES_RECOVERY_WAITING;
2317*4882a593Smuzhiyun 		dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
2318*4882a593Smuzhiyun 	} else if (test_bit(dead_node, res->refmap)) {
2319*4882a593Smuzhiyun 		mlog(0, "%s:%.*s: dead node %u had a ref, but had "
2320*4882a593Smuzhiyun 		     "no locks and had not purged before dying\n", dlm->name,
2321*4882a593Smuzhiyun 		     res->lockname.len, res->lockname.name, dead_node);
2322*4882a593Smuzhiyun 		dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
2323*4882a593Smuzhiyun 	}
2324*4882a593Smuzhiyun 
2325*4882a593Smuzhiyun 	/* do not kick thread yet */
2326*4882a593Smuzhiyun 	__dlm_dirty_lockres(dlm, res);
2327*4882a593Smuzhiyun }
2328*4882a593Smuzhiyun 
2329*4882a593Smuzhiyun static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
2330*4882a593Smuzhiyun {
2331*4882a593Smuzhiyun 	struct dlm_lock_resource *res;
2332*4882a593Smuzhiyun 	int i;
2333*4882a593Smuzhiyun 	struct hlist_head *bucket;
2334*4882a593Smuzhiyun 	struct hlist_node *tmp;
2335*4882a593Smuzhiyun 	struct dlm_lock *lock;
2336*4882a593Smuzhiyun 
2337*4882a593Smuzhiyun 
2338*4882a593Smuzhiyun 	/* purge any stale mles */
2339*4882a593Smuzhiyun 	dlm_clean_master_list(dlm, dead_node);
2340*4882a593Smuzhiyun 
2341*4882a593Smuzhiyun 	/*
2342*4882a593Smuzhiyun 	 * now clean up all lock resources.  there are two rules:
2343*4882a593Smuzhiyun 	 *
2344*4882a593Smuzhiyun 	 * 1) if the dead node was the master, move the lockres
2345*4882a593Smuzhiyun 	 *    to the recovering list.  set the RECOVERING flag.
2346*4882a593Smuzhiyun 	 *    this lockres needs to be cleaned up before it can
2347*4882a593Smuzhiyun 	 *    be used further.
2348*4882a593Smuzhiyun 	 *
2349*4882a593Smuzhiyun 	 * 2) if this node was the master, remove all locks from
2350*4882a593Smuzhiyun 	 *    each of the lockres queues that were owned by the
2351*4882a593Smuzhiyun 	 *    dead node.  once recovery finishes, the dlm thread
2352*4882a593Smuzhiyun 	 *    can be kicked again to see if any ASTs or BASTs
2353*4882a593Smuzhiyun 	 *    need to be fired as a result.
2354*4882a593Smuzhiyun 	 */
2355*4882a593Smuzhiyun 	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
2356*4882a593Smuzhiyun 		bucket = dlm_lockres_hash(dlm, i);
2357*4882a593Smuzhiyun 		hlist_for_each_entry_safe(res, tmp, bucket, hash_node) {
2358*4882a593Smuzhiyun  			/* always prune any $RECOVERY entries for dead nodes,
2359*4882a593Smuzhiyun  			 * otherwise hangs can occur during later recovery */
2360*4882a593Smuzhiyun 			if (dlm_is_recovery_lock(res->lockname.name,
2361*4882a593Smuzhiyun 						 res->lockname.len)) {
2362*4882a593Smuzhiyun 				spin_lock(&res->spinlock);
2363*4882a593Smuzhiyun 				list_for_each_entry(lock, &res->granted, list) {
2364*4882a593Smuzhiyun 					if (lock->ml.node == dead_node) {
2365*4882a593Smuzhiyun 						mlog(0, "AHA! there was "
2366*4882a593Smuzhiyun 						     "a $RECOVERY lock for dead "
2367*4882a593Smuzhiyun 						     "node %u (%s)!\n",
2368*4882a593Smuzhiyun 						     dead_node, dlm->name);
2369*4882a593Smuzhiyun 						list_del_init(&lock->list);
2370*4882a593Smuzhiyun 						dlm_lock_put(lock);
2371*4882a593Smuzhiyun 						/* Can't schedule
2372*4882a593Smuzhiyun 						 * DLM_UNLOCK_FREE_LOCK
2373*4882a593Smuzhiyun 						 * - do manually */
2374*4882a593Smuzhiyun 						dlm_lock_put(lock);
2375*4882a593Smuzhiyun 						break;
2376*4882a593Smuzhiyun 					}
2377*4882a593Smuzhiyun 				}
2378*4882a593Smuzhiyun 
2379*4882a593Smuzhiyun 				if ((res->owner == dead_node) &&
2380*4882a593Smuzhiyun 							(res->state & DLM_LOCK_RES_DROPPING_REF)) {
2381*4882a593Smuzhiyun 					dlm_lockres_get(res);
2382*4882a593Smuzhiyun 					__dlm_do_purge_lockres(dlm, res);
2383*4882a593Smuzhiyun 					spin_unlock(&res->spinlock);
2384*4882a593Smuzhiyun 					wake_up(&res->wq);
2385*4882a593Smuzhiyun 					dlm_lockres_put(res);
2386*4882a593Smuzhiyun 					continue;
2387*4882a593Smuzhiyun 				} else if (res->owner == dlm->node_num)
2388*4882a593Smuzhiyun 					dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
2389*4882a593Smuzhiyun 				spin_unlock(&res->spinlock);
2390*4882a593Smuzhiyun 				continue;
2391*4882a593Smuzhiyun 			}
2392*4882a593Smuzhiyun 			spin_lock(&res->spinlock);
2393*4882a593Smuzhiyun 			/* zero the lvb if necessary */
2394*4882a593Smuzhiyun 			dlm_revalidate_lvb(dlm, res, dead_node);
2395*4882a593Smuzhiyun 			if (res->owner == dead_node) {
2396*4882a593Smuzhiyun 				if (res->state & DLM_LOCK_RES_DROPPING_REF) {
2397*4882a593Smuzhiyun 					mlog(0, "%s:%.*s: owned by "
2398*4882a593Smuzhiyun 						"dead node %u, this node was "
2399*4882a593Smuzhiyun 						"dropping its ref when master died. "
2400*4882a593Smuzhiyun 						"continue, purging the lockres.\n",
2401*4882a593Smuzhiyun 						dlm->name, res->lockname.len,
2402*4882a593Smuzhiyun 						res->lockname.name, dead_node);
2403*4882a593Smuzhiyun 					dlm_lockres_get(res);
2404*4882a593Smuzhiyun 					__dlm_do_purge_lockres(dlm, res);
2405*4882a593Smuzhiyun 					spin_unlock(&res->spinlock);
2406*4882a593Smuzhiyun 					wake_up(&res->wq);
2407*4882a593Smuzhiyun 					dlm_lockres_put(res);
2408*4882a593Smuzhiyun 					continue;
2409*4882a593Smuzhiyun 				}
2410*4882a593Smuzhiyun 				dlm_move_lockres_to_recovery_list(dlm, res);
2411*4882a593Smuzhiyun 			} else if (res->owner == dlm->node_num) {
2412*4882a593Smuzhiyun 				dlm_free_dead_locks(dlm, res, dead_node);
2413*4882a593Smuzhiyun 				__dlm_lockres_calc_usage(dlm, res);
2414*4882a593Smuzhiyun 			} else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
2415*4882a593Smuzhiyun 				if (test_bit(dead_node, res->refmap)) {
2416*4882a593Smuzhiyun 					mlog(0, "%s:%.*s: dead node %u had a ref, but had "
2417*4882a593Smuzhiyun 						"no locks and had not purged before dying\n",
2418*4882a593Smuzhiyun 						dlm->name, res->lockname.len,
2419*4882a593Smuzhiyun 						res->lockname.name, dead_node);
2420*4882a593Smuzhiyun 					dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
2421*4882a593Smuzhiyun 				}
2422*4882a593Smuzhiyun 			}
2423*4882a593Smuzhiyun 			spin_unlock(&res->spinlock);
2424*4882a593Smuzhiyun 		}
2425*4882a593Smuzhiyun 	}
2426*4882a593Smuzhiyun 
2427*4882a593Smuzhiyun }
2428*4882a593Smuzhiyun 
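/* Mark a node dead while holding dlm->spinlock: tidy up recovery state if
 * the dead node was the pending recovery master, clear it from the live
 * and domain maps, run local cleanup once, and set its bit in the
 * recovery map so the recovery thread picks it up. */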
2429*4882a593Smuzhiyun static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
2430*4882a593Smuzhiyun {
2431*4882a593Smuzhiyun 	assert_spin_locked(&dlm->spinlock);
2432*4882a593Smuzhiyun 
2433*4882a593Smuzhiyun 	if (dlm->reco.new_master == idx) {
2434*4882a593Smuzhiyun 		mlog(0, "%s: recovery master %d just died\n",
2435*4882a593Smuzhiyun 		     dlm->name, idx);
2436*4882a593Smuzhiyun 		if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) {
2437*4882a593Smuzhiyun 			/* finalize1 was reached, so it is safe to clear
2438*4882a593Smuzhiyun 			 * the new_master and dead_node.  that recovery
2439*4882a593Smuzhiyun 			 * is complete. */
2440*4882a593Smuzhiyun 			mlog(0, "%s: dead master %d had reached "
2441*4882a593Smuzhiyun 			     "finalize1 state, clearing\n", dlm->name, idx);
2442*4882a593Smuzhiyun 			dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
2443*4882a593Smuzhiyun 			__dlm_reset_recovery(dlm);
2444*4882a593Smuzhiyun 		}
2445*4882a593Smuzhiyun 	}
2446*4882a593Smuzhiyun 
2447*4882a593Smuzhiyun 	/* Clean up join state on node death. */
2448*4882a593Smuzhiyun 	if (dlm->joining_node == idx) {
2449*4882a593Smuzhiyun 		mlog(0, "Clearing join state for node %u\n", idx);
2450*4882a593Smuzhiyun 		__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
2451*4882a593Smuzhiyun 	}
2452*4882a593Smuzhiyun 
2453*4882a593Smuzhiyun 	/* check to see if the node is already considered dead */
2454*4882a593Smuzhiyun 	if (!test_bit(idx, dlm->live_nodes_map)) {
2455*4882a593Smuzhiyun 		mlog(0, "for domain %s, node %d is already dead. "
2456*4882a593Smuzhiyun 		     "another node likely did recovery already.\n",
2457*4882a593Smuzhiyun 		     dlm->name, idx);
2458*4882a593Smuzhiyun 		return;
2459*4882a593Smuzhiyun 	}
2460*4882a593Smuzhiyun 
2461*4882a593Smuzhiyun 	/* check to see if we do not care about this node */
2462*4882a593Smuzhiyun 	if (!test_bit(idx, dlm->domain_map)) {
2463*4882a593Smuzhiyun 		/* This also catches the case that we get a node down
2464*4882a593Smuzhiyun 		 * but haven't joined the domain yet. */
2465*4882a593Smuzhiyun 		mlog(0, "node %u already removed from domain!\n", idx);
2466*4882a593Smuzhiyun 		return;
2467*4882a593Smuzhiyun 	}
2468*4882a593Smuzhiyun 
2469*4882a593Smuzhiyun 	clear_bit(idx, dlm->live_nodes_map);
2470*4882a593Smuzhiyun 
2471*4882a593Smuzhiyun 	/* make sure local cleanup occurs before the heartbeat events */
2472*4882a593Smuzhiyun 	if (!test_bit(idx, dlm->recovery_map))
2473*4882a593Smuzhiyun 		dlm_do_local_recovery_cleanup(dlm, idx);
2474*4882a593Smuzhiyun 
2475*4882a593Smuzhiyun 	/* notify anything attached to the heartbeat events */
2476*4882a593Smuzhiyun 	dlm_hb_event_notify_attached(dlm, idx, 0);
2477*4882a593Smuzhiyun 
2478*4882a593Smuzhiyun 	mlog(0, "node %u being removed from domain map!\n", idx);
2479*4882a593Smuzhiyun 	clear_bit(idx, dlm->domain_map);
2480*4882a593Smuzhiyun 	clear_bit(idx, dlm->exit_domain_map);
2481*4882a593Smuzhiyun 	/* wake up migration waiters if a node goes down.
2482*4882a593Smuzhiyun 	 * perhaps later we can genericize this for other waiters. */
2483*4882a593Smuzhiyun 	wake_up(&dlm->migration_wq);
2484*4882a593Smuzhiyun 
2485*4882a593Smuzhiyun 	set_bit(idx, dlm->recovery_map);
2486*4882a593Smuzhiyun }
2487*4882a593Smuzhiyun 
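/* o2hb node-down callback: fire domain eviction callbacks for a node that
 * is still in the domain map, then record the death under dlm->spinlock. */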
2488*4882a593Smuzhiyun void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data)
2489*4882a593Smuzhiyun {
2490*4882a593Smuzhiyun 	struct dlm_ctxt *dlm = data;
2491*4882a593Smuzhiyun 
2492*4882a593Smuzhiyun 	if (!dlm_grab(dlm))
2493*4882a593Smuzhiyun 		return;
2494*4882a593Smuzhiyun 
2495*4882a593Smuzhiyun 	/*
2496*4882a593Smuzhiyun 	 * This will notify any dlm users that a node in our domain
2497*4882a593Smuzhiyun 	 * went away without notifying us first.
2498*4882a593Smuzhiyun 	 */
2499*4882a593Smuzhiyun 	if (test_bit(idx, dlm->domain_map))
2500*4882a593Smuzhiyun 		dlm_fire_domain_eviction_callbacks(dlm, idx);
2501*4882a593Smuzhiyun 
2502*4882a593Smuzhiyun 	spin_lock(&dlm->spinlock);
2503*4882a593Smuzhiyun 	__dlm_hb_node_down(dlm, idx);
2504*4882a593Smuzhiyun 	spin_unlock(&dlm->spinlock);
2505*4882a593Smuzhiyun 
2506*4882a593Smuzhiyun 	dlm_put(dlm);
2507*4882a593Smuzhiyun }
2508*4882a593Smuzhiyun 
2509*4882a593Smuzhiyun void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data)
2510*4882a593Smuzhiyun {
2511*4882a593Smuzhiyun 	struct dlm_ctxt *dlm = data;
2512*4882a593Smuzhiyun 
2513*4882a593Smuzhiyun 	if (!dlm_grab(dlm))
2514*4882a593Smuzhiyun 		return;
2515*4882a593Smuzhiyun 
2516*4882a593Smuzhiyun 	spin_lock(&dlm->spinlock);
2517*4882a593Smuzhiyun 	set_bit(idx, dlm->live_nodes_map);
2518*4882a593Smuzhiyun 	/* do NOT notify mle attached to the heartbeat events.
2519*4882a593Smuzhiyun 	 * new nodes are not interesting in mastery until joined. */
2520*4882a593Smuzhiyun 	spin_unlock(&dlm->spinlock);
2521*4882a593Smuzhiyun 
2522*4882a593Smuzhiyun 	dlm_put(dlm);
2523*4882a593Smuzhiyun }
2524*4882a593Smuzhiyun 
2525*4882a593Smuzhiyun static void dlm_reco_ast(void *astdata)
2526*4882a593Smuzhiyun {
2527*4882a593Smuzhiyun 	struct dlm_ctxt *dlm = astdata;
2528*4882a593Smuzhiyun 	mlog(0, "ast for recovery lock fired!, this=%u, dlm=%s\n",
2529*4882a593Smuzhiyun 	     dlm->node_num, dlm->name);
2530*4882a593Smuzhiyun }
2531*4882a593Smuzhiyun static void dlm_reco_bast(void *astdata, int blocked_type)
2532*4882a593Smuzhiyun {
2533*4882a593Smuzhiyun 	struct dlm_ctxt *dlm = astdata;
2534*4882a593Smuzhiyun 	mlog(0, "bast for recovery lock fired!, this=%u, dlm=%s\n",
2535*4882a593Smuzhiyun 	     dlm->node_num, dlm->name);
2536*4882a593Smuzhiyun }
2537*4882a593Smuzhiyun static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st)
2538*4882a593Smuzhiyun {
2539*4882a593Smuzhiyun 	mlog(0, "unlockast for recovery lock fired!\n");
2540*4882a593Smuzhiyun }
2541*4882a593Smuzhiyun 
2542*4882a593Smuzhiyun /*
2543*4882a593Smuzhiyun  * dlm_pick_recovery_master will continually attempt to use
2544*4882a593Smuzhiyun  * dlmlock() on the special "$RECOVERY" lockres with the
2545*4882a593Smuzhiyun  * LKM_NOQUEUE flag to get an EX.  every thread that enters
2546*4882a593Smuzhiyun  * this function on each node racing to become the recovery
2547*4882a593Smuzhiyun  * master will not stop attempting this until either:
2548*4882a593Smuzhiyun  * a) this node gets the EX (and becomes the recovery master),
2549*4882a593Smuzhiyun  * or b) dlm->reco.new_master gets set to some nodenum
2550*4882a593Smuzhiyun  * != O2NM_INVALID_NODE_NUM (another node will do the reco).
2551*4882a593Smuzhiyun  * so each time a recovery master is needed, the entire cluster
2552*4882a593Smuzhiyun  * will sync at this point.  if the new master dies, that will
2553*4882a593Smuzhiyun  * be detected in dlm_do_recovery */
2554*4882a593Smuzhiyun static int dlm_pick_recovery_master(struct dlm_ctxt *dlm)
2555*4882a593Smuzhiyun {
2556*4882a593Smuzhiyun 	enum dlm_status ret;
2557*4882a593Smuzhiyun 	struct dlm_lockstatus lksb;
2558*4882a593Smuzhiyun 	int status = -EINVAL;
2559*4882a593Smuzhiyun 
2560*4882a593Smuzhiyun 	mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n",
2561*4882a593Smuzhiyun 	     dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num);
2562*4882a593Smuzhiyun again:
2563*4882a593Smuzhiyun 	memset(&lksb, 0, sizeof(lksb));
2564*4882a593Smuzhiyun 
2565*4882a593Smuzhiyun 	ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY,
2566*4882a593Smuzhiyun 		      DLM_RECOVERY_LOCK_NAME, DLM_RECOVERY_LOCK_NAME_LEN,
2567*4882a593Smuzhiyun 		      dlm_reco_ast, dlm, dlm_reco_bast);
2568*4882a593Smuzhiyun 
2569*4882a593Smuzhiyun 	mlog(0, "%s: dlmlock($RECOVERY) returned %d, lksb=%d\n",
2570*4882a593Smuzhiyun 	     dlm->name, ret, lksb.status);
2571*4882a593Smuzhiyun 
2572*4882a593Smuzhiyun 	if (ret == DLM_NORMAL) {
2573*4882a593Smuzhiyun 		mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n",
2574*4882a593Smuzhiyun 		     dlm->name, dlm->node_num);
2575*4882a593Smuzhiyun 
2576*4882a593Smuzhiyun 		/* got the EX lock.  check to see if another node
2577*4882a593Smuzhiyun 		 * just became the reco master */
2578*4882a593Smuzhiyun 		if (dlm_reco_master_ready(dlm)) {
2579*4882a593Smuzhiyun 			mlog(0, "%s: got reco EX lock, but %u will "
2580*4882a593Smuzhiyun 			     "do the recovery\n", dlm->name,
2581*4882a593Smuzhiyun 			     dlm->reco.new_master);
2582*4882a593Smuzhiyun 			status = -EEXIST;
2583*4882a593Smuzhiyun 		} else {
2584*4882a593Smuzhiyun 			status = 0;
2585*4882a593Smuzhiyun 
2586*4882a593Smuzhiyun 			/* see if recovery was already finished elsewhere */
2587*4882a593Smuzhiyun 			spin_lock(&dlm->spinlock);
2588*4882a593Smuzhiyun 			if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
2589*4882a593Smuzhiyun 				status = -EINVAL;
2590*4882a593Smuzhiyun 				mlog(0, "%s: got reco EX lock, but "
2591*4882a593Smuzhiyun 				     "node got recovered already\n", dlm->name);
2592*4882a593Smuzhiyun 				if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) {
2593*4882a593Smuzhiyun 					mlog(ML_ERROR, "%s: new master is %u "
2594*4882a593Smuzhiyun 					     "but no dead node!\n",
2595*4882a593Smuzhiyun 					     dlm->name, dlm->reco.new_master);
2596*4882a593Smuzhiyun 					BUG();
2597*4882a593Smuzhiyun 				}
2598*4882a593Smuzhiyun 			}
2599*4882a593Smuzhiyun 			spin_unlock(&dlm->spinlock);
2600*4882a593Smuzhiyun 		}
2601*4882a593Smuzhiyun 
2602*4882a593Smuzhiyun 		/* if this node has actually become the recovery master,
2603*4882a593Smuzhiyun 		 * set the master and send the messages to begin recovery */
2604*4882a593Smuzhiyun 		if (!status) {
2605*4882a593Smuzhiyun 			mlog(0, "%s: dead=%u, this=%u, sending "
2606*4882a593Smuzhiyun 			     "begin_reco now\n", dlm->name,
2607*4882a593Smuzhiyun 			     dlm->reco.dead_node, dlm->node_num);
2608*4882a593Smuzhiyun 			status = dlm_send_begin_reco_message(dlm,
2609*4882a593Smuzhiyun 				      dlm->reco.dead_node);
2610*4882a593Smuzhiyun 			/* this always succeeds */
2611*4882a593Smuzhiyun 			BUG_ON(status);
2612*4882a593Smuzhiyun 
2613*4882a593Smuzhiyun 			/* set the new_master to this node */
2614*4882a593Smuzhiyun 			spin_lock(&dlm->spinlock);
2615*4882a593Smuzhiyun 			dlm_set_reco_master(dlm, dlm->node_num);
2616*4882a593Smuzhiyun 			spin_unlock(&dlm->spinlock);
2617*4882a593Smuzhiyun 		}
2618*4882a593Smuzhiyun 
2619*4882a593Smuzhiyun 		/* recovery lock is a special case.  ast will not get fired,
2620*4882a593Smuzhiyun 		 * so just go ahead and unlock it. */
2621*4882a593Smuzhiyun 		ret = dlmunlock(dlm, &lksb, 0, dlm_reco_unlock_ast, dlm);
2622*4882a593Smuzhiyun 		if (ret == DLM_DENIED) {
2623*4882a593Smuzhiyun 			mlog(0, "got DLM_DENIED, trying LKM_CANCEL\n");
2624*4882a593Smuzhiyun 			ret = dlmunlock(dlm, &lksb, LKM_CANCEL, dlm_reco_unlock_ast, dlm);
2625*4882a593Smuzhiyun 		}
2626*4882a593Smuzhiyun 		if (ret != DLM_NORMAL) {
2627*4882a593Smuzhiyun 			/* this would really suck. this could only happen
2628*4882a593Smuzhiyun 			 * if there was a network error during the unlock
2629*4882a593Smuzhiyun 			 * because of node death.  this means the unlock
2630*4882a593Smuzhiyun 			 * is actually "done" and the lock structure is
2631*4882a593Smuzhiyun 			 * even freed.  we can continue, but only
2632*4882a593Smuzhiyun 			 * because this specific lock name is special. */
2633*4882a593Smuzhiyun 			mlog(ML_ERROR, "dlmunlock returned %d\n", ret);
2634*4882a593Smuzhiyun 		}
2635*4882a593Smuzhiyun 	} else if (ret == DLM_NOTQUEUED) {
2636*4882a593Smuzhiyun 		mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n",
2637*4882a593Smuzhiyun 		     dlm->name, dlm->node_num);
2638*4882a593Smuzhiyun 		/* another node is master. wait on
2639*4882a593Smuzhiyun 		 * reco.new_master != O2NM_INVALID_NODE_NUM
2640*4882a593Smuzhiyun 		 * for at most one second */
2641*4882a593Smuzhiyun 		wait_event_timeout(dlm->dlm_reco_thread_wq,
2642*4882a593Smuzhiyun 					 dlm_reco_master_ready(dlm),
2643*4882a593Smuzhiyun 					 msecs_to_jiffies(1000));
2644*4882a593Smuzhiyun 		if (!dlm_reco_master_ready(dlm)) {
2645*4882a593Smuzhiyun 			mlog(0, "%s: reco master taking awhile\n",
2646*4882a593Smuzhiyun 			     dlm->name);
2647*4882a593Smuzhiyun 			goto again;
2648*4882a593Smuzhiyun 		}
2649*4882a593Smuzhiyun 		/* another node has informed this one that it is reco master */
2650*4882a593Smuzhiyun 		mlog(0, "%s: reco master %u is ready to recover %u\n",
2651*4882a593Smuzhiyun 		     dlm->name, dlm->reco.new_master, dlm->reco.dead_node);
2652*4882a593Smuzhiyun 		status = -EEXIST;
2653*4882a593Smuzhiyun 	} else if (ret == DLM_RECOVERING) {
2654*4882a593Smuzhiyun 		mlog(0, "dlm=%s dlmlock says master node died (this=%u)\n",
2655*4882a593Smuzhiyun 		     dlm->name, dlm->node_num);
2656*4882a593Smuzhiyun 		goto again;
2657*4882a593Smuzhiyun 	} else {
2658*4882a593Smuzhiyun 		struct dlm_lock_resource *res;
2659*4882a593Smuzhiyun 
2660*4882a593Smuzhiyun 		/* dlmlock returned something other than NOTQUEUED or NORMAL */
2661*4882a593Smuzhiyun 		mlog(ML_ERROR, "%s: got %s from dlmlock($RECOVERY), "
2662*4882a593Smuzhiyun 		     "lksb.status=%s\n", dlm->name, dlm_errname(ret),
2663*4882a593Smuzhiyun 		     dlm_errname(lksb.status));
2664*4882a593Smuzhiyun 		res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME,
2665*4882a593Smuzhiyun 					 DLM_RECOVERY_LOCK_NAME_LEN);
2666*4882a593Smuzhiyun 		if (res) {
2667*4882a593Smuzhiyun 			dlm_print_one_lock_resource(res);
2668*4882a593Smuzhiyun 			dlm_lockres_put(res);
2669*4882a593Smuzhiyun 		} else {
2670*4882a593Smuzhiyun 			mlog(ML_ERROR, "recovery lock not found\n");
2671*4882a593Smuzhiyun 		}
2672*4882a593Smuzhiyun 		BUG();
2673*4882a593Smuzhiyun 	}
2674*4882a593Smuzhiyun 
2675*4882a593Smuzhiyun 	return status;
2676*4882a593Smuzhiyun }
2677*4882a593Smuzhiyun 
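/* Broadcast DLM_BEGIN_RECO_MSG to every node in the domain map except the
 * dead node and ourselves.  A peer that is still finalizing a previous
 * recovery answers -EAGAIN (or EAGAIN from older nodes) and is retried
 * after a short sleep; a node that has gone down is simply skipped. */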
2678*4882a593Smuzhiyun static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
2679*4882a593Smuzhiyun {
2680*4882a593Smuzhiyun 	struct dlm_begin_reco br;
2681*4882a593Smuzhiyun 	int ret = 0;
2682*4882a593Smuzhiyun 	struct dlm_node_iter iter;
2683*4882a593Smuzhiyun 	int nodenum;
2684*4882a593Smuzhiyun 	int status;
2685*4882a593Smuzhiyun 
2686*4882a593Smuzhiyun 	mlog(0, "%s: dead node is %u\n", dlm->name, dead_node);
2687*4882a593Smuzhiyun 
2688*4882a593Smuzhiyun 	spin_lock(&dlm->spinlock);
2689*4882a593Smuzhiyun 	dlm_node_iter_init(dlm->domain_map, &iter);
2690*4882a593Smuzhiyun 	spin_unlock(&dlm->spinlock);
2691*4882a593Smuzhiyun 
2692*4882a593Smuzhiyun 	clear_bit(dead_node, iter.node_map);
2693*4882a593Smuzhiyun 
2694*4882a593Smuzhiyun 	memset(&br, 0, sizeof(br));
2695*4882a593Smuzhiyun 	br.node_idx = dlm->node_num;
2696*4882a593Smuzhiyun 	br.dead_node = dead_node;
2697*4882a593Smuzhiyun 
2698*4882a593Smuzhiyun 	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
2699*4882a593Smuzhiyun 		ret = 0;
2700*4882a593Smuzhiyun 		if (nodenum == dead_node) {
2701*4882a593Smuzhiyun 			mlog(0, "not sending begin reco to dead node "
2702*4882a593Smuzhiyun 				  "%u\n", dead_node);
2703*4882a593Smuzhiyun 			continue;
2704*4882a593Smuzhiyun 		}
2705*4882a593Smuzhiyun 		if (nodenum == dlm->node_num) {
2706*4882a593Smuzhiyun 			mlog(0, "not sending begin reco to self\n");
2707*4882a593Smuzhiyun 			continue;
2708*4882a593Smuzhiyun 		}
2709*4882a593Smuzhiyun retry:
2710*4882a593Smuzhiyun 		ret = -EINVAL;
2711*4882a593Smuzhiyun 		mlog(0, "attempting to send begin reco msg to %d\n",
2712*4882a593Smuzhiyun 			  nodenum);
2713*4882a593Smuzhiyun 		ret = o2net_send_message(DLM_BEGIN_RECO_MSG, dlm->key,
2714*4882a593Smuzhiyun 					 &br, sizeof(br), nodenum, &status);
2715*4882a593Smuzhiyun 		/* negative status is handled ok by caller here */
2716*4882a593Smuzhiyun 		if (ret >= 0)
2717*4882a593Smuzhiyun 			ret = status;
2718*4882a593Smuzhiyun 		if (dlm_is_host_down(ret)) {
2719*4882a593Smuzhiyun 			/* node is down.  not involved in recovery
2720*4882a593Smuzhiyun 			 * so just keep going */
2721*4882a593Smuzhiyun 			mlog(ML_NOTICE, "%s: node %u was down when sending "
2722*4882a593Smuzhiyun 			     "begin reco msg (%d)\n", dlm->name, nodenum, ret);
2723*4882a593Smuzhiyun 			ret = 0;
2724*4882a593Smuzhiyun 		}
2725*4882a593Smuzhiyun 
2726*4882a593Smuzhiyun 		/*
2727*4882a593Smuzhiyun 		 * Prior to commit aad1b15310b9bcd59fa81ab8f2b1513b59553ea8,
2728*4882a593Smuzhiyun 		 * dlm_begin_reco_handler() returned EAGAIN and not -EAGAIN.
2729*4882a593Smuzhiyun 		 * We are handling both for compatibility reasons.
2730*4882a593Smuzhiyun 		 */
2731*4882a593Smuzhiyun 		if (ret == -EAGAIN || ret == EAGAIN) {
2732*4882a593Smuzhiyun 			mlog(0, "%s: trying to start recovery of node "
2733*4882a593Smuzhiyun 			     "%u, but node %u is waiting for last recovery "
2734*4882a593Smuzhiyun 			     "to complete, backoff for a bit\n", dlm->name,
2735*4882a593Smuzhiyun 			     dead_node, nodenum);
2736*4882a593Smuzhiyun 			msleep(100);
2737*4882a593Smuzhiyun 			goto retry;
2738*4882a593Smuzhiyun 		}
2739*4882a593Smuzhiyun 		if (ret < 0) {
2740*4882a593Smuzhiyun 			struct dlm_lock_resource *res;
2741*4882a593Smuzhiyun 
2742*4882a593Smuzhiyun 			/* this is now a serious problem, possibly ENOMEM
2743*4882a593Smuzhiyun 			 * in the network stack.  must retry */
2744*4882a593Smuzhiyun 			mlog_errno(ret);
2745*4882a593Smuzhiyun 			mlog(ML_ERROR, "begin reco of dlm %s to node %u "
2746*4882a593Smuzhiyun 			     "returned %d\n", dlm->name, nodenum, ret);
2747*4882a593Smuzhiyun 			res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME,
2748*4882a593Smuzhiyun 						 DLM_RECOVERY_LOCK_NAME_LEN);
2749*4882a593Smuzhiyun 			if (res) {
2750*4882a593Smuzhiyun 				dlm_print_one_lock_resource(res);
2751*4882a593Smuzhiyun 				dlm_lockres_put(res);
2752*4882a593Smuzhiyun 			} else {
2753*4882a593Smuzhiyun 				mlog(ML_ERROR, "recovery lock not found\n");
2754*4882a593Smuzhiyun 			}
2755*4882a593Smuzhiyun 			/* sleep for a bit in hopes that we can avoid
2756*4882a593Smuzhiyun 			 * another ENOMEM */
2757*4882a593Smuzhiyun 			msleep(100);
2758*4882a593Smuzhiyun 			goto retry;
2759*4882a593Smuzhiyun 		}
2760*4882a593Smuzhiyun 	}
2761*4882a593Smuzhiyun 
2762*4882a593Smuzhiyun 	return ret;
2763*4882a593Smuzhiyun }
2764*4882a593Smuzhiyun 
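/* Handle DLM_BEGIN_RECO_MSG: refuse with -EAGAIN while this node is still
 * in finalize of the previous recovery; otherwise record the new recovery
 * master and dead node, force local cleanup for a death we have not yet
 * observed, and kick the recovery thread. */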
2765*4882a593Smuzhiyun int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data,
2766*4882a593Smuzhiyun 			   void **ret_data)
2767*4882a593Smuzhiyun {
2768*4882a593Smuzhiyun 	struct dlm_ctxt *dlm = data;
2769*4882a593Smuzhiyun 	struct dlm_begin_reco *br = (struct dlm_begin_reco *)msg->buf;
2770*4882a593Smuzhiyun 
2771*4882a593Smuzhiyun 	/* ok to return 0, domain has gone away */
2772*4882a593Smuzhiyun 	if (!dlm_grab(dlm))
2773*4882a593Smuzhiyun 		return 0;
2774*4882a593Smuzhiyun 
2775*4882a593Smuzhiyun 	spin_lock(&dlm->spinlock);
2776*4882a593Smuzhiyun 	if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) {
2777*4882a593Smuzhiyun 		mlog(0, "%s: node %u wants to recover node %u (%u:%u) "
2778*4882a593Smuzhiyun 		     "but this node is in finalize state, waiting on finalize2\n",
2779*4882a593Smuzhiyun 		     dlm->name, br->node_idx, br->dead_node,
2780*4882a593Smuzhiyun 		     dlm->reco.dead_node, dlm->reco.new_master);
2781*4882a593Smuzhiyun 		spin_unlock(&dlm->spinlock);
2782*4882a593Smuzhiyun 		dlm_put(dlm);
2783*4882a593Smuzhiyun 		return -EAGAIN;
2784*4882a593Smuzhiyun 	}
2785*4882a593Smuzhiyun 	spin_unlock(&dlm->spinlock);
2786*4882a593Smuzhiyun 
2787*4882a593Smuzhiyun 	mlog(0, "%s: node %u wants to recover node %u (%u:%u)\n",
2788*4882a593Smuzhiyun 	     dlm->name, br->node_idx, br->dead_node,
2789*4882a593Smuzhiyun 	     dlm->reco.dead_node, dlm->reco.new_master);
2790*4882a593Smuzhiyun 
2791*4882a593Smuzhiyun 	dlm_fire_domain_eviction_callbacks(dlm, br->dead_node);
2792*4882a593Smuzhiyun 
2793*4882a593Smuzhiyun 	spin_lock(&dlm->spinlock);
2794*4882a593Smuzhiyun 	if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) {
2795*4882a593Smuzhiyun 		if (test_bit(dlm->reco.new_master, dlm->recovery_map)) {
2796*4882a593Smuzhiyun 			mlog(0, "%s: new_master %u died, changing "
2797*4882a593Smuzhiyun 			     "to %u\n", dlm->name, dlm->reco.new_master,
2798*4882a593Smuzhiyun 			     br->node_idx);
2799*4882a593Smuzhiyun 		} else {
2800*4882a593Smuzhiyun 			mlog(0, "%s: new_master %u NOT DEAD, changing "
2801*4882a593Smuzhiyun 			     "to %u\n", dlm->name, dlm->reco.new_master,
2802*4882a593Smuzhiyun 			     br->node_idx);
2803*4882a593Smuzhiyun 			/* may not have seen the new master as dead yet */
2804*4882a593Smuzhiyun 		}
2805*4882a593Smuzhiyun 	}
2806*4882a593Smuzhiyun 	if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) {
2807*4882a593Smuzhiyun 		mlog(ML_NOTICE, "%s: dead_node previously set to %u, "
2808*4882a593Smuzhiyun 		     "node %u changing it to %u\n", dlm->name,
2809*4882a593Smuzhiyun 		     dlm->reco.dead_node, br->node_idx, br->dead_node);
2810*4882a593Smuzhiyun 	}
2811*4882a593Smuzhiyun 	dlm_set_reco_master(dlm, br->node_idx);
2812*4882a593Smuzhiyun 	dlm_set_reco_dead_node(dlm, br->dead_node);
2813*4882a593Smuzhiyun 	if (!test_bit(br->dead_node, dlm->recovery_map)) {
2814*4882a593Smuzhiyun 		mlog(0, "recovery master %u sees %u as dead, but this "
2815*4882a593Smuzhiyun 		     "node has not yet.  marking %u as dead\n",
2816*4882a593Smuzhiyun 		     br->node_idx, br->dead_node, br->dead_node);
2817*4882a593Smuzhiyun 		if (!test_bit(br->dead_node, dlm->domain_map) ||
2818*4882a593Smuzhiyun 		    !test_bit(br->dead_node, dlm->live_nodes_map))
2819*4882a593Smuzhiyun 			mlog(0, "%u not in domain/live_nodes map "
2820*4882a593Smuzhiyun 			     "so setting it in reco map manually\n",
2821*4882a593Smuzhiyun 			     br->dead_node);
2822*4882a593Smuzhiyun 		/* force the recovery cleanup in __dlm_hb_node_down
2823*4882a593Smuzhiyun 		 * both of these will be cleared in a moment */
2824*4882a593Smuzhiyun 		set_bit(br->dead_node, dlm->domain_map);
2825*4882a593Smuzhiyun 		set_bit(br->dead_node, dlm->live_nodes_map);
2826*4882a593Smuzhiyun 		__dlm_hb_node_down(dlm, br->dead_node);
2827*4882a593Smuzhiyun 	}
2828*4882a593Smuzhiyun 	spin_unlock(&dlm->spinlock);
2829*4882a593Smuzhiyun 
2830*4882a593Smuzhiyun 	dlm_kick_recovery_thread(dlm);
2831*4882a593Smuzhiyun 
2832*4882a593Smuzhiyun 	mlog(0, "%s: recovery started by node %u, for %u (%u:%u)\n",
2833*4882a593Smuzhiyun 	     dlm->name, br->node_idx, br->dead_node,
2834*4882a593Smuzhiyun 	     dlm->reco.dead_node, dlm->reco.new_master);
2835*4882a593Smuzhiyun 
2836*4882a593Smuzhiyun 	dlm_put(dlm);
2837*4882a593Smuzhiyun 	return 0;
2838*4882a593Smuzhiyun }
2839*4882a593Smuzhiyun 
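/* Finalize runs in two stages: stage 1 has every node hand its recovered
 * lock resources over to the new master and enter the FINALIZE state;
 * stage 2 (flagged with DLM_FINALIZE_STAGE2) lets each node clear that
 * state and reset its recovery bookkeeping. */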
2840*4882a593Smuzhiyun #define DLM_FINALIZE_STAGE2  0x01
2841*4882a593Smuzhiyun static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm)
2842*4882a593Smuzhiyun {
2843*4882a593Smuzhiyun 	int ret = 0;
2844*4882a593Smuzhiyun 	struct dlm_finalize_reco fr;
2845*4882a593Smuzhiyun 	struct dlm_node_iter iter;
2846*4882a593Smuzhiyun 	int nodenum;
2847*4882a593Smuzhiyun 	int status;
2848*4882a593Smuzhiyun 	int stage = 1;
2849*4882a593Smuzhiyun 
2850*4882a593Smuzhiyun 	mlog(0, "finishing recovery for node %s:%u, "
2851*4882a593Smuzhiyun 	     "stage %d\n", dlm->name, dlm->reco.dead_node, stage);
2852*4882a593Smuzhiyun 
2853*4882a593Smuzhiyun 	spin_lock(&dlm->spinlock);
2854*4882a593Smuzhiyun 	dlm_node_iter_init(dlm->domain_map, &iter);
2855*4882a593Smuzhiyun 	spin_unlock(&dlm->spinlock);
2856*4882a593Smuzhiyun 
2857*4882a593Smuzhiyun stage2:
2858*4882a593Smuzhiyun 	memset(&fr, 0, sizeof(fr));
2859*4882a593Smuzhiyun 	fr.node_idx = dlm->node_num;
2860*4882a593Smuzhiyun 	fr.dead_node = dlm->reco.dead_node;
2861*4882a593Smuzhiyun 	if (stage == 2)
2862*4882a593Smuzhiyun 		fr.flags |= DLM_FINALIZE_STAGE2;
2863*4882a593Smuzhiyun 
2864*4882a593Smuzhiyun 	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
2865*4882a593Smuzhiyun 		if (nodenum == dlm->node_num)
2866*4882a593Smuzhiyun 			continue;
2867*4882a593Smuzhiyun 		ret = o2net_send_message(DLM_FINALIZE_RECO_MSG, dlm->key,
2868*4882a593Smuzhiyun 					 &fr, sizeof(fr), nodenum, &status);
2869*4882a593Smuzhiyun 		if (ret >= 0)
2870*4882a593Smuzhiyun 			ret = status;
2871*4882a593Smuzhiyun 		if (ret < 0) {
2872*4882a593Smuzhiyun 			mlog(ML_ERROR, "Error %d when sending message %u (key "
2873*4882a593Smuzhiyun 			     "0x%x) to node %u\n", ret, DLM_FINALIZE_RECO_MSG,
2874*4882a593Smuzhiyun 			     dlm->key, nodenum);
2875*4882a593Smuzhiyun 			if (dlm_is_host_down(ret)) {
2876*4882a593Smuzhiyun 				/* this has no effect on this recovery
2877*4882a593Smuzhiyun 				 * session, so set the status to zero to
2878*4882a593Smuzhiyun 				 * finish out the last recovery */
2879*4882a593Smuzhiyun 				mlog(ML_ERROR, "node %u went down after this "
2880*4882a593Smuzhiyun 				     "node finished recovery.\n", nodenum);
2881*4882a593Smuzhiyun 				ret = 0;
2882*4882a593Smuzhiyun 				continue;
2883*4882a593Smuzhiyun 			}
2884*4882a593Smuzhiyun 			break;
2885*4882a593Smuzhiyun 		}
2886*4882a593Smuzhiyun 	}
2887*4882a593Smuzhiyun 	if (stage == 1) {
2888*4882a593Smuzhiyun 		/* reset the node_iter back to the top and send finalize2 */
2889*4882a593Smuzhiyun 		iter.curnode = -1;
2890*4882a593Smuzhiyun 		stage = 2;
2891*4882a593Smuzhiyun 		goto stage2;
2892*4882a593Smuzhiyun 	}
2893*4882a593Smuzhiyun 
2894*4882a593Smuzhiyun 	return ret;
2895*4882a593Smuzhiyun }
2896*4882a593Smuzhiyun 
2897*4882a593Smuzhiyun int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data,
2898*4882a593Smuzhiyun 			      void **ret_data)
2899*4882a593Smuzhiyun {
2900*4882a593Smuzhiyun 	struct dlm_ctxt *dlm = data;
2901*4882a593Smuzhiyun 	struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf;
2902*4882a593Smuzhiyun 	int stage = 1;
2903*4882a593Smuzhiyun 
2904*4882a593Smuzhiyun 	/* ok to return 0, domain has gone away */
2905*4882a593Smuzhiyun 	if (!dlm_grab(dlm))
2906*4882a593Smuzhiyun 		return 0;
2907*4882a593Smuzhiyun 
2908*4882a593Smuzhiyun 	if (fr->flags & DLM_FINALIZE_STAGE2)
2909*4882a593Smuzhiyun 		stage = 2;
2910*4882a593Smuzhiyun 
2911*4882a593Smuzhiyun 	mlog(0, "%s: node %u finalizing recovery stage%d of "
2912*4882a593Smuzhiyun 	     "node %u (%u:%u)\n", dlm->name, fr->node_idx, stage,
2913*4882a593Smuzhiyun 	     fr->dead_node, dlm->reco.dead_node, dlm->reco.new_master);
2914*4882a593Smuzhiyun 
2915*4882a593Smuzhiyun 	spin_lock(&dlm->spinlock);
2916*4882a593Smuzhiyun 
2917*4882a593Smuzhiyun 	if (dlm->reco.new_master != fr->node_idx) {
2918*4882a593Smuzhiyun 		mlog(ML_ERROR, "node %u sent recovery finalize msg, but node "
2919*4882a593Smuzhiyun 		     "%u is supposed to be the new master, dead=%u\n",
2920*4882a593Smuzhiyun 		     fr->node_idx, dlm->reco.new_master, fr->dead_node);
2921*4882a593Smuzhiyun 		BUG();
2922*4882a593Smuzhiyun 	}
2923*4882a593Smuzhiyun 	if (dlm->reco.dead_node != fr->dead_node) {
2924*4882a593Smuzhiyun 		mlog(ML_ERROR, "node %u sent recovery finalize msg for dead "
2925*4882a593Smuzhiyun 		     "node %u, but node %u is supposed to be dead\n",
2926*4882a593Smuzhiyun 		     fr->node_idx, fr->dead_node, dlm->reco.dead_node);
2927*4882a593Smuzhiyun 		BUG();
2928*4882a593Smuzhiyun 	}
2929*4882a593Smuzhiyun 
2930*4882a593Smuzhiyun 	switch (stage) {
2931*4882a593Smuzhiyun 		case 1:
2932*4882a593Smuzhiyun 			dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx);
2933*4882a593Smuzhiyun 			if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) {
2934*4882a593Smuzhiyun 				mlog(ML_ERROR, "%s: received finalize1 from "
2935*4882a593Smuzhiyun 				     "new master %u for dead node %u, but "
2936*4882a593Smuzhiyun 				     "this node has already received it!\n",
2937*4882a593Smuzhiyun 				     dlm->name, fr->node_idx, fr->dead_node);
2938*4882a593Smuzhiyun 				dlm_print_reco_node_status(dlm);
2939*4882a593Smuzhiyun 				BUG();
2940*4882a593Smuzhiyun 			}
2941*4882a593Smuzhiyun 			dlm->reco.state |= DLM_RECO_STATE_FINALIZE;
2942*4882a593Smuzhiyun 			spin_unlock(&dlm->spinlock);
2943*4882a593Smuzhiyun 			break;
2944*4882a593Smuzhiyun 		case 2:
2945*4882a593Smuzhiyun 			if (!(dlm->reco.state & DLM_RECO_STATE_FINALIZE)) {
2946*4882a593Smuzhiyun 				mlog(ML_ERROR, "%s: received finalize2 from "
2947*4882a593Smuzhiyun 				     "new master %u for dead node %u, but "
2948*4882a593Smuzhiyun 				     "this node did not have finalize1!\n",
2949*4882a593Smuzhiyun 				     dlm->name, fr->node_idx, fr->dead_node);
2950*4882a593Smuzhiyun 				dlm_print_reco_node_status(dlm);
2951*4882a593Smuzhiyun 				BUG();
2952*4882a593Smuzhiyun 			}
2953*4882a593Smuzhiyun 			dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
2954*4882a593Smuzhiyun 			__dlm_reset_recovery(dlm);
2955*4882a593Smuzhiyun 			spin_unlock(&dlm->spinlock);
2956*4882a593Smuzhiyun 			dlm_kick_recovery_thread(dlm);
2957*4882a593Smuzhiyun 			break;
2958*4882a593Smuzhiyun 	}
2959*4882a593Smuzhiyun 
2960*4882a593Smuzhiyun 	mlog(0, "%s: recovery done, reco master was %u, dead now %u, master now %u\n",
2961*4882a593Smuzhiyun 	     dlm->name, fr->node_idx, dlm->reco.dead_node, dlm->reco.new_master);
2962*4882a593Smuzhiyun 
2963*4882a593Smuzhiyun 	dlm_put(dlm);
2964*4882a593Smuzhiyun 	return 0;
2965*4882a593Smuzhiyun }