// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2015, SUSE
 */


#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/dlm.h>
#include <linux/sched.h>
#include <linux/raid/md_p.h>
#include "md.h"
#include "md-bitmap.h"
#include "md-cluster.h"

#define LVB_SIZE 64
#define NEW_DEV_TIMEOUT 5000

struct dlm_lock_resource {
	dlm_lockspace_t *ls;
	struct dlm_lksb lksb;
	char *name; /* lock name. */
	uint32_t flags; /* flags to pass to dlm_lock() */
	wait_queue_head_t sync_locking; /* wait queue for synchronized locking */
	bool sync_locking_done;
	void (*bast)(void *arg, int mode); /* blocking AST function pointer */
	struct mddev *mddev; /* pointing back to mddev. */
	int mode;
};

struct resync_info {
	__le64 lo;
	__le64 hi;
};

/* md_cluster_info flags */
#define MD_CLUSTER_WAITING_FOR_NEWDISK 1
#define MD_CLUSTER_SUSPEND_READ_BALANCING 2
#define MD_CLUSTER_BEGIN_JOIN_CLUSTER 3

/* Lock the send communication. This is done through
 * bit manipulation as opposed to a mutex in order to
 * accommodate lock and hold. See next comment.
 */
#define MD_CLUSTER_SEND_LOCK 4
/* Cluster operations (such as adding a disk) must lock the
 * communication channel so as to perform extra operations
 * (update metadata) while no other operation is allowed on the
 * MD. The token needs to be locked and held until the operation
 * completes with a md_update_sb(), which would eventually release
 * the lock.
 */
#define MD_CLUSTER_SEND_LOCKED_ALREADY 5
/* We should receive messages after the node joined the cluster and
 * set up all the related infos such as bitmap and personality */
#define MD_CLUSTER_ALREADY_IN_CLUSTER 6
#define MD_CLUSTER_PENDING_RECV_EVENT 7
#define MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD 8

struct md_cluster_info {
	struct mddev *mddev; /* the md device which md_cluster_info belongs to */
	/* dlm lock space and resources for clustered raid. */
	dlm_lockspace_t *lockspace;
	int slot_number;
	struct completion completion;
	struct mutex recv_mutex;
	struct dlm_lock_resource *bitmap_lockres;
	struct dlm_lock_resource **other_bitmap_lockres;
	struct dlm_lock_resource *resync_lockres;
	struct list_head suspend_list;

	spinlock_t suspend_lock;
	/* record the region in which writes should be suspended */
	sector_t suspend_lo;
	sector_t suspend_hi;
	int suspend_from; /* the slot which broadcast suspend_lo/hi */

	struct md_thread *recovery_thread;
	unsigned long recovery_map;
	/* communication lock resources */
	struct dlm_lock_resource *ack_lockres;
	struct dlm_lock_resource *message_lockres;
	struct dlm_lock_resource *token_lockres;
	struct dlm_lock_resource *no_new_dev_lockres;
	struct md_thread *recv_thread;
	struct completion newdisk_completion;
	wait_queue_head_t wait;
	unsigned long state;
	/* record the region in RESYNCING message */
	sector_t sync_low;
	sector_t sync_hi;
};

enum msg_type {
	METADATA_UPDATED = 0,
	RESYNCING,
	NEWDISK,
	REMOVE,
	RE_ADD,
	BITMAP_NEEDS_SYNC,
	CHANGE_CAPACITY,
	BITMAP_RESIZE,
};

struct cluster_msg {
	__le32 type;
	__le32 slot;
	/* TODO: Unionize this for smaller footprint */
	__le64 low;
	__le64 high;
	char uuid[16];
	__le32 raid_slot;
};

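/* Completion AST for dlm_lock_sync(): mark the request as done and wake the waiter. */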
static void sync_ast(void *arg)
{
	struct dlm_lock_resource *res;

	res = arg;
	res->sync_locking_done = true;
	wake_up(&res->sync_locking);
}

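/* Issue a DLM lock/convert request and wait for the AST before returning its status. */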
static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
{
	int ret = 0;

	ret = dlm_lock(res->ls, mode, &res->lksb,
			res->flags, res->name, strlen(res->name),
			0, sync_ast, res, res->bast);
	if (ret)
		return ret;
	wait_event(res->sync_locking, res->sync_locking_done);
	res->sync_locking_done = false;
	if (res->lksb.sb_status == 0)
		res->mode = mode;
	return res->lksb.sb_status;
}

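/* Release a lock by converting it down to NL mode. */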
static int dlm_unlock_sync(struct dlm_lock_resource *res)
{
	return dlm_lock_sync(res, DLM_LOCK_NL);
}

/*
 * A variation of dlm_lock_sync, which allows the lock request to
 * be interrupted
 */
static int dlm_lock_sync_interruptible(struct dlm_lock_resource *res, int mode,
				       struct mddev *mddev)
{
	int ret = 0;

	ret = dlm_lock(res->ls, mode, &res->lksb,
			res->flags, res->name, strlen(res->name),
			0, sync_ast, res, res->bast);
	if (ret)
		return ret;

	wait_event(res->sync_locking, res->sync_locking_done
				      || kthread_should_stop()
				      || test_bit(MD_CLOSING, &mddev->flags));
	if (!res->sync_locking_done) {
		/*
		 * the convert queue contains the lock request when the request
		 * is interrupted, and sync_ast could still be run, so we need
		 * to cancel the request and reset completion
		 */
		ret = dlm_unlock(res->ls, res->lksb.sb_lkid, DLM_LKF_CANCEL,
			&res->lksb, res);
		res->sync_locking_done = false;
		if (unlikely(ret != 0))
			pr_info("failed to cancel previous lock request "
				 "%s return %d\n", res->name, ret);
		return -EPERM;
	} else
		res->sync_locking_done = false;
	if (res->lksb.sb_status == 0)
		res->mode = mode;
	return res->lksb.sb_status;
}

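/*
 * Allocate a lock resource and take an initial NL lock on it, so that later
 * requests on this resource can be plain lock conversions.
 */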
static struct dlm_lock_resource *lockres_init(struct mddev *mddev,
		char *name, void (*bastfn)(void *arg, int mode), int with_lvb)
{
	struct dlm_lock_resource *res = NULL;
	int ret, namelen;
	struct md_cluster_info *cinfo = mddev->cluster_info;

	res = kzalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL);
	if (!res)
		return NULL;
	init_waitqueue_head(&res->sync_locking);
	res->sync_locking_done = false;
	res->ls = cinfo->lockspace;
	res->mddev = mddev;
	res->mode = DLM_LOCK_IV;
	namelen = strlen(name);
	res->name = kzalloc(namelen + 1, GFP_KERNEL);
	if (!res->name) {
		pr_err("md-cluster: Unable to allocate resource name for resource %s\n", name);
		goto out_err;
	}
	strlcpy(res->name, name, namelen + 1);
	if (with_lvb) {
		res->lksb.sb_lvbptr = kzalloc(LVB_SIZE, GFP_KERNEL);
		if (!res->lksb.sb_lvbptr) {
			pr_err("md-cluster: Unable to allocate LVB for resource %s\n", name);
			goto out_err;
		}
		res->flags = DLM_LKF_VALBLK;
	}

	if (bastfn)
		res->bast = bastfn;

	res->flags |= DLM_LKF_EXPEDITE;

	ret = dlm_lock_sync(res, DLM_LOCK_NL);
	if (ret) {
		pr_err("md-cluster: Unable to lock NL on new lock resource %s\n", name);
		goto out_err;
	}
	res->flags &= ~DLM_LKF_EXPEDITE;
	res->flags |= DLM_LKF_CONVERT;

	return res;
out_err:
	kfree(res->lksb.sb_lvbptr);
	kfree(res->name);
	kfree(res);
	return NULL;
}

static void lockres_free(struct dlm_lock_resource *res)
{
	int ret = 0;

	if (!res)
		return;

	/*
	 * use FORCEUNLOCK flag, so we can unlock even if the lock is on the
	 * waiting or convert queue
	 */
	ret = dlm_unlock(res->ls, res->lksb.sb_lkid, DLM_LKF_FORCEUNLOCK,
		&res->lksb, res);
	if (unlikely(ret != 0))
		pr_err("failed to unlock %s return %d\n", res->name, ret);
	else
		wait_event(res->sync_locking, res->sync_locking_done);

	kfree(res->name);
	kfree(res->lksb.sb_lvbptr);
	kfree(res);
}

static void add_resync_info(struct dlm_lock_resource *lockres,
			    sector_t lo, sector_t hi)
{
	struct resync_info *ri;

	ri = (struct resync_info *)lockres->lksb.sb_lvbptr;
	ri->lo = cpu_to_le64(lo);
	ri->hi = cpu_to_le64(hi);
}

static int read_resync_info(struct mddev *mddev,
			    struct dlm_lock_resource *lockres)
{
	struct resync_info ri;
	struct md_cluster_info *cinfo = mddev->cluster_info;
	int ret = 0;

	dlm_lock_sync(lockres, DLM_LOCK_CR);
	memcpy(&ri, lockres->lksb.sb_lvbptr, sizeof(struct resync_info));
	if (le64_to_cpu(ri.hi) > 0) {
		cinfo->suspend_hi = le64_to_cpu(ri.hi);
		cinfo->suspend_lo = le64_to_cpu(ri.lo);
		ret = 1;
	}
	dlm_unlock_sync(lockres);
	return ret;
}

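/*
 * Recovery thread: for each failed node recorded in recovery_map, grab its
 * bitmap lock, merge its bitmap into ours and restart resync/reshape if needed.
 */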
static void recover_bitmaps(struct md_thread *thread)
{
	struct mddev *mddev = thread->mddev;
	struct md_cluster_info *cinfo = mddev->cluster_info;
	struct dlm_lock_resource *bm_lockres;
	char str[64];
	int slot, ret;
	sector_t lo, hi;

	while (cinfo->recovery_map) {
		slot = fls64((u64)cinfo->recovery_map) - 1;

		snprintf(str, 64, "bitmap%04d", slot);
		bm_lockres = lockres_init(mddev, str, NULL, 1);
		if (!bm_lockres) {
			pr_err("md-cluster: Cannot initialize bitmaps\n");
			goto clear_bit;
		}

		ret = dlm_lock_sync_interruptible(bm_lockres, DLM_LOCK_PW, mddev);
		if (ret) {
			pr_err("md-cluster: Could not DLM lock %s: %d\n",
					str, ret);
			goto clear_bit;
		}
		ret = md_bitmap_copy_from_slot(mddev, slot, &lo, &hi, true);
		if (ret) {
			pr_err("md-cluster: Could not copy data from bitmap %d\n", slot);
			goto clear_bit;
		}

		/* Clear suspend_area associated with the bitmap */
		spin_lock_irq(&cinfo->suspend_lock);
		cinfo->suspend_hi = 0;
		cinfo->suspend_lo = 0;
		cinfo->suspend_from = -1;
		spin_unlock_irq(&cinfo->suspend_lock);

		/* Kick off a reshape if needed */
		if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
		    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
		    mddev->reshape_position != MaxSector)
			md_wakeup_thread(mddev->sync_thread);

		if (hi > 0) {
			if (lo < mddev->recovery_cp)
				mddev->recovery_cp = lo;
			/* wake up thread to continue resync in case resync
			 * is not finished */
			if (mddev->recovery_cp != MaxSector) {
				/*
				 * clear the REMOTE flag since we will launch
				 * resync thread in current node.
				 */
				clear_bit(MD_RESYNCING_REMOTE,
						&mddev->recovery);
				set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
				md_wakeup_thread(mddev->thread);
			}
		}
clear_bit:
		lockres_free(bm_lockres);
		clear_bit(slot, &cinfo->recovery_map);
	}
}

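/* dlm lockspace recovery is starting: suspend read balancing until recover_done() runs. */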
static void recover_prep(void *arg)
{
	struct mddev *mddev = arg;
	struct md_cluster_info *cinfo = mddev->cluster_info;
	set_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
}

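/* Mark @slot for bitmap recovery and kick the recovery thread, creating it on first use. */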
static void __recover_slot(struct mddev *mddev, int slot)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;

	set_bit(slot, &cinfo->recovery_map);
	if (!cinfo->recovery_thread) {
		cinfo->recovery_thread = md_register_thread(recover_bitmaps,
				mddev, "recover");
		if (!cinfo->recovery_thread) {
			pr_warn("md-cluster: Could not create recovery thread\n");
			return;
		}
	}
	md_wakeup_thread(cinfo->recovery_thread);
}

static void recover_slot(void *arg, struct dlm_slot *slot)
{
	struct mddev *mddev = arg;
	struct md_cluster_info *cinfo = mddev->cluster_info;

	pr_info("md-cluster: %s Node %d/%d down. My slot: %d. Initiating recovery.\n",
			mddev->bitmap_info.cluster_name,
			slot->nodeid, slot->slot,
			cinfo->slot_number);
	/* deduct one since dlm slot numbers start from one while
	 * cluster-md slot numbers begin with 0 */
	__recover_slot(mddev, slot->slot - 1);
}

static void recover_done(void *arg, struct dlm_slot *slots,
		int num_slots, int our_slot,
		uint32_t generation)
{
	struct mddev *mddev = arg;
	struct md_cluster_info *cinfo = mddev->cluster_info;

	cinfo->slot_number = our_slot;
	/* the completion only needs to be completed when a node joins the
	 * cluster; it doesn't need to run during another node's failure */
	if (test_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state)) {
		complete(&cinfo->completion);
		clear_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);
	}
	clear_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
}


/* these ops are called when a node joins the cluster, and do lock recovery
 * if a node failure occurs */
static const struct dlm_lockspace_ops md_ls_ops = {
	.recover_prep = recover_prep,
	.recover_slot = recover_slot,
	.recover_done = recover_done,
};

/*
 * The BAST function for the ack lock resource
 * This function wakes up the receive thread in
 * order to receive and process the message.
 */
static void ack_bast(void *arg, int mode)
{
	struct dlm_lock_resource *res = arg;
	struct md_cluster_info *cinfo = res->mddev->cluster_info;

	if (mode == DLM_LOCK_EX) {
		if (test_bit(MD_CLUSTER_ALREADY_IN_CLUSTER, &cinfo->state))
			md_wakeup_thread(cinfo->recv_thread);
		else
			set_bit(MD_CLUSTER_PENDING_RECV_EVENT, &cinfo->state);
	}
}

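/* Clear the suspend region announced by a remote node, with the array quiesced. */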
static void remove_suspend_info(struct mddev *mddev, int slot)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	mddev->pers->quiesce(mddev, 1);
	spin_lock_irq(&cinfo->suspend_lock);
	cinfo->suspend_hi = 0;
	cinfo->suspend_lo = 0;
	spin_unlock_irq(&cinfo->suspend_lock);
	mddev->pers->quiesce(mddev, 0);
}

static void process_suspend_info(struct mddev *mddev,
		int slot, sector_t lo, sector_t hi)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	struct mdp_superblock_1 *sb = NULL;
	struct md_rdev *rdev;

	if (!hi) {
		/*
		 * clear the REMOTE flag since resync or recovery is finished
		 * in remote node.
		 */
		clear_bit(MD_RESYNCING_REMOTE, &mddev->recovery);
		remove_suspend_info(mddev, slot);
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
		return;
	}

	rdev_for_each(rdev, mddev)
		if (rdev->raid_disk > -1 && !test_bit(Faulty, &rdev->flags)) {
			sb = page_address(rdev->sb_page);
			break;
		}

	/*
	 * The bitmaps are not the same for different nodes.
	 * If RESYNCING is happening in one node, then
	 * the node which received the RESYNCING message
	 * will probably perform a resync with the region
	 * [lo, hi] again, so we could reduce the resync time
	 * a lot if we can ensure that the bitmaps among
	 * different nodes match up well.
	 *
	 * sync_low/hi is used to record the region which
	 * arrived in the previous RESYNCING message.
	 *
	 * Call md_bitmap_sync_with_cluster to clear NEEDED_MASK
	 * and set RESYNC_MASK since the resync thread is running
	 * in another node, so we don't need to do the resync
	 * again with the same section.
	 *
	 * Skip md_bitmap_sync_with_cluster in case reshape is
	 * happening, because the reshaping region is small and
	 * we don't want to trigger lots of WARN.
	 */
	if (sb && !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE))
		md_bitmap_sync_with_cluster(mddev, cinfo->sync_low,
					    cinfo->sync_hi, lo, hi);
	cinfo->sync_low = lo;
	cinfo->sync_hi = hi;

	mddev->pers->quiesce(mddev, 1);
	spin_lock_irq(&cinfo->suspend_lock);
	cinfo->suspend_from = slot;
	cinfo->suspend_lo = lo;
	cinfo->suspend_hi = hi;
	spin_unlock_irq(&cinfo->suspend_lock);
	mddev->pers->quiesce(mddev, 0);
}

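/*
 * NEWDISK message: ask userspace (via uevent) to add the new device on this
 * node and wait for the add to complete (or time out).
 */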
static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
{
	char disk_uuid[64];
	struct md_cluster_info *cinfo = mddev->cluster_info;
	char event_name[] = "EVENT=ADD_DEVICE";
	char raid_slot[16];
	char *envp[] = {event_name, disk_uuid, raid_slot, NULL};
	int len;

	len = snprintf(disk_uuid, 64, "DEVICE_UUID=");
	sprintf(disk_uuid + len, "%pU", cmsg->uuid);
	snprintf(raid_slot, 16, "RAID_DISK=%d", le32_to_cpu(cmsg->raid_slot));
	pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot);
	init_completion(&cinfo->newdisk_completion);
	set_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
	kobject_uevent_env(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE, envp);
	wait_for_completion_timeout(&cinfo->newdisk_completion,
			NEW_DEV_TIMEOUT);
	clear_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
}


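/* METADATA_UPDATED message: reload the superblock from the reported good device. */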
static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg)
{
	int got_lock = 0;
	struct md_cluster_info *cinfo = mddev->cluster_info;
	mddev->good_device_nr = le32_to_cpu(msg->raid_slot);

	dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
	wait_event(mddev->thread->wqueue,
		   (got_lock = mddev_trylock(mddev)) ||
		    test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state));
	md_reload_sb(mddev, mddev->good_device_nr);
	if (got_lock)
		mddev_unlock(mddev);
}

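/* REMOVE message: flag the given rdev for removal and let the main md thread do the work. */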
static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
{
	struct md_rdev *rdev;

	rcu_read_lock();
	rdev = md_find_rdev_nr_rcu(mddev, le32_to_cpu(msg->raid_slot));
	if (rdev) {
		set_bit(ClusterRemove, &rdev->flags);
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
	}
	else
		pr_warn("%s: %d Could not find disk(%d) to REMOVE\n",
			__func__, __LINE__, le32_to_cpu(msg->raid_slot));
	rcu_read_unlock();
}

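/* RE_ADD message: clear the Faulty flag on the given rdev so it can be re-added. */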
static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg)
{
	struct md_rdev *rdev;

	rcu_read_lock();
	rdev = md_find_rdev_nr_rcu(mddev, le32_to_cpu(msg->raid_slot));
	if (rdev && test_bit(Faulty, &rdev->flags))
		clear_bit(Faulty, &rdev->flags);
	else
		pr_warn("%s: %d Could not find disk(%d) which is faulty",
			__func__, __LINE__, le32_to_cpu(msg->raid_slot));
	rcu_read_unlock();
}

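/* Dispatch one received cluster message to the matching handler. */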
static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
{
	int ret = 0;

	if (WARN(mddev->cluster_info->slot_number - 1 == le32_to_cpu(msg->slot),
		"node %d received its own msg\n", le32_to_cpu(msg->slot)))
		return -1;
	switch (le32_to_cpu(msg->type)) {
	case METADATA_UPDATED:
		process_metadata_update(mddev, msg);
		break;
	case CHANGE_CAPACITY:
		set_capacity(mddev->gendisk, mddev->array_sectors);
		revalidate_disk_size(mddev->gendisk, true);
		break;
	case RESYNCING:
		set_bit(MD_RESYNCING_REMOTE, &mddev->recovery);
		process_suspend_info(mddev, le32_to_cpu(msg->slot),
				     le64_to_cpu(msg->low),
				     le64_to_cpu(msg->high));
		break;
	case NEWDISK:
		process_add_new_disk(mddev, msg);
		break;
	case REMOVE:
		process_remove_disk(mddev, msg);
		break;
	case RE_ADD:
		process_readd_disk(mddev, msg);
		break;
	case BITMAP_NEEDS_SYNC:
		__recover_slot(mddev, le32_to_cpu(msg->slot));
		break;
	case BITMAP_RESIZE:
		if (le64_to_cpu(msg->high) != mddev->pers->size(mddev, 0, 0))
			ret = md_bitmap_resize(mddev->bitmap,
					    le64_to_cpu(msg->high), 0, 0);
		break;
	default:
		ret = -1;
		pr_warn("%s:%d Received unknown message from %d\n",
			__func__, __LINE__, le32_to_cpu(msg->slot));
	}
	return ret;
}


/*
 * thread for receiving message
 */
static void recv_daemon(struct md_thread *thread)
{
	struct md_cluster_info *cinfo = thread->mddev->cluster_info;
	struct dlm_lock_resource *ack_lockres = cinfo->ack_lockres;
	struct dlm_lock_resource *message_lockres = cinfo->message_lockres;
	struct cluster_msg msg;
	int ret;

	mutex_lock(&cinfo->recv_mutex);
	/*get CR on Message*/
	if (dlm_lock_sync(message_lockres, DLM_LOCK_CR)) {
		pr_err("md/raid1:failed to get CR on MESSAGE\n");
		mutex_unlock(&cinfo->recv_mutex);
		return;
	}

	/* read lvb and wake up thread to process this message_lockres */
	memcpy(&msg, message_lockres->lksb.sb_lvbptr, sizeof(struct cluster_msg));
	ret = process_recvd_msg(thread->mddev, &msg);
	if (ret)
		goto out;

	/*release CR on ack_lockres*/
	ret = dlm_unlock_sync(ack_lockres);
	if (unlikely(ret != 0))
		pr_info("unlock ack failed return %d\n", ret);
	/*up-convert to PR on message_lockres*/
	ret = dlm_lock_sync(message_lockres, DLM_LOCK_PR);
	if (unlikely(ret != 0))
		pr_info("lock PR on msg failed return %d\n", ret);
	/*get CR on ack_lockres again*/
	ret = dlm_lock_sync(ack_lockres, DLM_LOCK_CR);
	if (unlikely(ret != 0))
		pr_info("lock CR on ack failed return %d\n", ret);
out:
	/*release CR on message_lockres*/
	ret = dlm_unlock_sync(message_lockres);
	if (unlikely(ret != 0))
		pr_info("unlock msg failed return %d\n", ret);
	mutex_unlock(&cinfo->recv_mutex);
}

/* lock_token()
 * Takes the lock on the TOKEN lock resource so no other
 * node can communicate while the operation is underway.
 */
static int lock_token(struct md_cluster_info *cinfo)
{
	int error;

	error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
	if (error) {
		pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n",
				__func__, __LINE__, error);
	} else {
		/* Lock the receive sequence */
		mutex_lock(&cinfo->recv_mutex);
	}
	return error;
}

/* lock_comm()
 * Sets the MD_CLUSTER_SEND_LOCK bit to lock the send channel.
 */
static int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked)
{
	int rv, set_bit = 0;
	struct mddev *mddev = cinfo->mddev;

	/*
	 * If the resync thread runs after the raid1d thread, then
	 * process_metadata_update could not continue if raid1d held
	 * reconfig_mutex (and raid1d is blocked since another node already
	 * got EX on Token and is waiting for EX on Ack), so let resync wake
	 * up the thread in case the flag is set.
	 */
	if (mddev_locked && !test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,
				      &cinfo->state)) {
		rv = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,
					      &cinfo->state);
		WARN_ON_ONCE(rv);
		md_wakeup_thread(mddev->thread);
		set_bit = 1;
	}

	wait_event(cinfo->wait,
		   !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state));
	rv = lock_token(cinfo);
	if (set_bit)
		clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
	return rv;
}

static void unlock_comm(struct md_cluster_info *cinfo)
{
	WARN_ON(cinfo->token_lockres->mode != DLM_LOCK_EX);
	mutex_unlock(&cinfo->recv_mutex);
	dlm_unlock_sync(cinfo->token_lockres);
	clear_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state);
	wake_up(&cinfo->wait);
}

/* __sendmsg()
 * This function performs the actual sending of the message. This function is
 * usually called after performing the encompassing operation
 * The function:
 * 1. Grabs the message lockresource in EX mode
 * 2. Copies the message to the message LVB
 * 3. Downconverts message lockresource to CW
 * 4. Upconverts ack lock resource from CR to EX. This forces the BAST on other nodes
 *    and the other nodes read the message. The thread will wait here until all other
 *    nodes have released ack lock resource.
 * 5. Downconvert ack lockresource to CR
 */
static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
{
	int error;
	int slot = cinfo->slot_number - 1;

	cmsg->slot = cpu_to_le32(slot);
	/*get EX on Message*/
	error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_EX);
	if (error) {
		pr_err("md-cluster: failed to get EX on MESSAGE (%d)\n", error);
		goto failed_message;
	}

	memcpy(cinfo->message_lockres->lksb.sb_lvbptr, (void *)cmsg,
			sizeof(struct cluster_msg));
	/*down-convert EX to CW on Message*/
	error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_CW);
	if (error) {
		pr_err("md-cluster: failed to convert EX to CW on MESSAGE(%d)\n",
				error);
		goto failed_ack;
	}

	/*up-convert CR to EX on Ack*/
	error = dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_EX);
	if (error) {
		pr_err("md-cluster: failed to convert CR to EX on ACK(%d)\n",
				error);
		goto failed_ack;
	}

	/*down-convert EX to CR on Ack*/
	error = dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR);
	if (error) {
		pr_err("md-cluster: failed to convert EX to CR on ACK(%d)\n",
				error);
		goto failed_ack;
	}

failed_ack:
	error = dlm_unlock_sync(cinfo->message_lockres);
	if (unlikely(error != 0)) {
		pr_err("md-cluster: failed convert to NL on MESSAGE(%d)\n",
			error);
		/* in case the message can't be released due to some reason */
		goto failed_ack;
	}
failed_message:
	return error;
}

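/* Lock the communication channel, send a single message, and unlock it again. */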
static int sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg,
		   bool mddev_locked)
{
	int ret;

	ret = lock_comm(cinfo, mddev_locked);
	if (!ret) {
		ret = __sendmsg(cinfo, cmsg);
		unlock_comm(cinfo);
	}
	return ret;
}

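/*
 * Called at join time: peek at every other node's bitmap lock resource to
 * learn about in-progress resyncs and bitmaps that still need recovery.
 */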
static int gather_all_resync_info(struct mddev *mddev, int total_slots)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	int i, ret = 0;
	struct dlm_lock_resource *bm_lockres;
	char str[64];
	sector_t lo, hi;


	for (i = 0; i < total_slots; i++) {
		memset(str, '\0', 64);
		snprintf(str, 64, "bitmap%04d", i);
		bm_lockres = lockres_init(mddev, str, NULL, 1);
		if (!bm_lockres)
			return -ENOMEM;
		if (i == (cinfo->slot_number - 1)) {
			lockres_free(bm_lockres);
			continue;
		}

		bm_lockres->flags |= DLM_LKF_NOQUEUE;
		ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
		if (ret == -EAGAIN) {
			if (read_resync_info(mddev, bm_lockres)) {
				pr_info("%s:%d Resync[%llu..%llu] in progress on %d\n",
						__func__, __LINE__,
					(unsigned long long) cinfo->suspend_lo,
					(unsigned long long) cinfo->suspend_hi,
					i);
				cinfo->suspend_from = i;
			}
			ret = 0;
			lockres_free(bm_lockres);
			continue;
		}
		if (ret) {
			lockres_free(bm_lockres);
			goto out;
		}

		/* Read the disk bitmap sb and check if it needs recovery */
		ret = md_bitmap_copy_from_slot(mddev, i, &lo, &hi, false);
		if (ret) {
			pr_warn("md-cluster: Could not gather bitmaps from slot %d", i);
			lockres_free(bm_lockres);
			continue;
		}
		if ((hi > 0) && (lo < mddev->recovery_cp)) {
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			mddev->recovery_cp = lo;
			md_check_recovery(mddev);
		}

		lockres_free(bm_lockres);
	}
out:
	return ret;
}

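/*
 * Join the cluster: create the DLM lockspace plus the communication,
 * no-new-dev, bitmap and resync lock resources for this node.
 */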
static int join(struct mddev *mddev, int nodes)
{
	struct md_cluster_info *cinfo;
	int ret, ops_rv;
	char str[64];

	cinfo = kzalloc(sizeof(struct md_cluster_info), GFP_KERNEL);
	if (!cinfo)
		return -ENOMEM;

	INIT_LIST_HEAD(&cinfo->suspend_list);
	spin_lock_init(&cinfo->suspend_lock);
	init_completion(&cinfo->completion);
	set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);
	init_waitqueue_head(&cinfo->wait);
	mutex_init(&cinfo->recv_mutex);

	mddev->cluster_info = cinfo;
	cinfo->mddev = mddev;

	memset(str, 0, 64);
	sprintf(str, "%pU", mddev->uuid);
	ret = dlm_new_lockspace(str, mddev->bitmap_info.cluster_name,
				DLM_LSFL_FS, LVB_SIZE,
				&md_ls_ops, mddev, &ops_rv, &cinfo->lockspace);
	if (ret)
		goto err;
	wait_for_completion(&cinfo->completion);
	if (nodes < cinfo->slot_number) {
		pr_err("md-cluster: Slot allotted(%d) is greater than available slots(%d).",
			cinfo->slot_number, nodes);
		ret = -ERANGE;
		goto err;
	}
	/* Initiate the communication resources */
	ret = -ENOMEM;
	cinfo->recv_thread = md_register_thread(recv_daemon, mddev, "cluster_recv");
	if (!cinfo->recv_thread) {
		pr_err("md-cluster: cannot allocate memory for recv_thread!\n");
		goto err;
	}
	cinfo->message_lockres = lockres_init(mddev, "message", NULL, 1);
	if (!cinfo->message_lockres)
		goto err;
	cinfo->token_lockres = lockres_init(mddev, "token", NULL, 0);
	if (!cinfo->token_lockres)
		goto err;
	cinfo->no_new_dev_lockres = lockres_init(mddev, "no-new-dev", NULL, 0);
	if (!cinfo->no_new_dev_lockres)
		goto err;

	ret = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
	if (ret) {
		ret = -EAGAIN;
		pr_err("md-cluster: can't join cluster to avoid lock issue\n");
		goto err;
	}
	cinfo->ack_lockres = lockres_init(mddev, "ack", ack_bast, 0);
	if (!cinfo->ack_lockres) {
		ret = -ENOMEM;
		goto err;
	}
	/* get sync CR lock on ACK. */
	if (dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR))
		pr_err("md-cluster: failed to get a sync CR lock on ACK!(%d)\n",
				ret);
	dlm_unlock_sync(cinfo->token_lockres);
	/* get sync CR lock on no-new-dev. */
	if (dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR))
		pr_err("md-cluster: failed to get a sync CR lock on no-new-dev!(%d)\n", ret);


	pr_info("md-cluster: Joined cluster %s slot %d\n", str, cinfo->slot_number);
	snprintf(str, 64, "bitmap%04d", cinfo->slot_number - 1);
	cinfo->bitmap_lockres = lockres_init(mddev, str, NULL, 1);
	if (!cinfo->bitmap_lockres) {
		ret = -ENOMEM;
		goto err;
	}
	if (dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW)) {
		pr_err("Failed to get bitmap lock\n");
		ret = -EINVAL;
		goto err;
	}

	cinfo->resync_lockres = lockres_init(mddev, "resync", NULL, 0);
	if (!cinfo->resync_lockres) {
		ret = -ENOMEM;
		goto err;
	}

	return 0;
err:
	set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
	md_unregister_thread(&cinfo->recovery_thread);
	md_unregister_thread(&cinfo->recv_thread);
	lockres_free(cinfo->message_lockres);
	lockres_free(cinfo->token_lockres);
	lockres_free(cinfo->ack_lockres);
	lockres_free(cinfo->no_new_dev_lockres);
	lockres_free(cinfo->resync_lockres);
	lockres_free(cinfo->bitmap_lockres);
	if (cinfo->lockspace)
		dlm_release_lockspace(cinfo->lockspace, 2);
	mddev->cluster_info = NULL;
	kfree(cinfo);
	return ret;
}

static void load_bitmaps(struct mddev *mddev, int total_slots)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;

	/* load all the nodes' bitmap info for resync */
	if (gather_all_resync_info(mddev, total_slots))
		pr_err("md-cluster: failed to gather all resync infos\n");
	set_bit(MD_CLUSTER_ALREADY_IN_CLUSTER, &cinfo->state);
	/* wake up recv thread in case something needs to be handled */
	if (test_and_clear_bit(MD_CLUSTER_PENDING_RECV_EVENT, &cinfo->state))
		md_wakeup_thread(cinfo->recv_thread);
}

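/* Broadcast BITMAP_NEEDS_SYNC so another node takes over syncing this node's dirty bitmap. */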
static void resync_bitmap(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	struct cluster_msg cmsg = {0};
	int err;

	cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC);
	err = sendmsg(cinfo, &cmsg, 1);
	if (err)
		pr_err("%s:%d: failed to send BITMAP_NEEDS_SYNC message (%d)\n",
			__func__, __LINE__, err);
}

static void unlock_all_bitmaps(struct mddev *mddev);
static int leave(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;

	if (!cinfo)
		return 0;

	/*
	 * A BITMAP_NEEDS_SYNC message should be sent when a node
	 * is leaving the cluster with a dirty bitmap, and we can
	 * only deliver it when the dlm connection is available.
	 *
	 * Also, we should send BITMAP_NEEDS_SYNC message in
	 * case reshaping is interrupted.
	 */
	if ((cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector) ||
	    (mddev->reshape_position != MaxSector &&
	     test_bit(MD_CLOSING, &mddev->flags)))
		resync_bitmap(mddev);

	set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
	md_unregister_thread(&cinfo->recovery_thread);
	md_unregister_thread(&cinfo->recv_thread);
	lockres_free(cinfo->message_lockres);
	lockres_free(cinfo->token_lockres);
	lockres_free(cinfo->ack_lockres);
	lockres_free(cinfo->no_new_dev_lockres);
	lockres_free(cinfo->resync_lockres);
	lockres_free(cinfo->bitmap_lockres);
	unlock_all_bitmaps(mddev);
	dlm_release_lockspace(cinfo->lockspace, 2);
	kfree(cinfo);
	return 0;
}

/* slot_number(): Returns the MD slot number to use
 * DLM starts the slot numbers from 1, whereas cluster-md
 * wants the number to be from zero, so we deduct one
 */
static int slot_number(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;

	return cinfo->slot_number - 1;
}

/*
 * Check if the communication is already locked, else lock the communication
 * channel.
 * If it is already locked, token is in EX mode, and hence lock_token()
 * should not be called.
 */
static int metadata_update_start(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	int ret;

	/*
	 * metadata_update_start is always called with the protection of
	 * reconfig_mutex, so set MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD here.
	 */
	ret = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD,
				    &cinfo->state);
	WARN_ON_ONCE(ret);
	md_wakeup_thread(mddev->thread);

	wait_event(cinfo->wait,
		   !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state) ||
		   test_and_clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state));

	/* If token is already locked, return 0 */
	if (cinfo->token_lockres->mode == DLM_LOCK_EX) {
		clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
		return 0;
	}

	ret = lock_token(cinfo);
	clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
	return ret;
}

static int metadata_update_finish(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	struct cluster_msg cmsg;
	struct md_rdev *rdev;
	int ret = 0;
	int raid_slot = -1;

	memset(&cmsg, 0, sizeof(cmsg));
	cmsg.type = cpu_to_le32(METADATA_UPDATED);
	/* Pick up a good active device number to send.
	 */
	rdev_for_each(rdev, mddev)
		if (rdev->raid_disk > -1 && !test_bit(Faulty, &rdev->flags)) {
			raid_slot = rdev->desc_nr;
			break;
		}
	if (raid_slot >= 0) {
		cmsg.raid_slot = cpu_to_le32(raid_slot);
		ret = __sendmsg(cinfo, &cmsg);
	} else
		pr_warn("md-cluster: No good device id found to send\n");
	clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state);
	unlock_comm(cinfo);
	return ret;
}

static void metadata_update_cancel(struct mddev *mddev)
{
	struct md_cluster_info *cinfo = mddev->cluster_info;
	clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state);
	unlock_comm(cinfo);
}

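/* Broadcast BITMAP_RESIZE so every node resizes its bitmap to @size. */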
update_bitmap_size(struct mddev * mddev,sector_t size)1108*4882a593Smuzhiyun static int update_bitmap_size(struct mddev *mddev, sector_t size)
1109*4882a593Smuzhiyun {
1110*4882a593Smuzhiyun struct md_cluster_info *cinfo = mddev->cluster_info;
1111*4882a593Smuzhiyun struct cluster_msg cmsg = {0};
1112*4882a593Smuzhiyun int ret;
1113*4882a593Smuzhiyun
1114*4882a593Smuzhiyun cmsg.type = cpu_to_le32(BITMAP_RESIZE);
1115*4882a593Smuzhiyun cmsg.high = cpu_to_le64(size);
1116*4882a593Smuzhiyun ret = sendmsg(cinfo, &cmsg, 0);
1117*4882a593Smuzhiyun if (ret)
1118*4882a593Smuzhiyun pr_err("%s:%d: failed to send BITMAP_RESIZE message (%d)\n",
1119*4882a593Smuzhiyun __func__, __LINE__, ret);
1120*4882a593Smuzhiyun return ret;
1121*4882a593Smuzhiyun }
1122*4882a593Smuzhiyun
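/*
 * resize_bitmaps - prepare all nodes' bitmaps for a reshape. Active nodes
 * resize their own bitmaps on receipt of BITMAP_RESIZE; for slots that are
 * not occupied, the page count is updated here directly. On any failure
 * the bitmap size is reverted to 'oldsize'.
 */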
1123*4882a593Smuzhiyun static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsize)
1124*4882a593Smuzhiyun {
1125*4882a593Smuzhiyun struct bitmap_counts *counts;
1126*4882a593Smuzhiyun char str[64];
1127*4882a593Smuzhiyun struct dlm_lock_resource *bm_lockres;
1128*4882a593Smuzhiyun struct bitmap *bitmap = mddev->bitmap;
1129*4882a593Smuzhiyun unsigned long my_pages = bitmap->counts.pages;
1130*4882a593Smuzhiyun int i, rv;
1131*4882a593Smuzhiyun
1132*4882a593Smuzhiyun /*
1133*4882a593Smuzhiyun * We need to ensure all the nodes can grow to a larger
1134*4882a593Smuzhiyun * bitmap size before making the reshape.
1135*4882a593Smuzhiyun */
1136*4882a593Smuzhiyun rv = update_bitmap_size(mddev, newsize);
1137*4882a593Smuzhiyun if (rv)
1138*4882a593Smuzhiyun return rv;
1139*4882a593Smuzhiyun
1140*4882a593Smuzhiyun for (i = 0; i < mddev->bitmap_info.nodes; i++) {
1141*4882a593Smuzhiyun if (i == md_cluster_ops->slot_number(mddev))
1142*4882a593Smuzhiyun continue;
1143*4882a593Smuzhiyun
1144*4882a593Smuzhiyun bitmap = get_bitmap_from_slot(mddev, i);
1145*4882a593Smuzhiyun if (IS_ERR(bitmap)) {
1146*4882a593Smuzhiyun pr_err("can't get bitmap from slot %d\n", i);
1147*4882a593Smuzhiyun bitmap = NULL;
1148*4882a593Smuzhiyun goto out;
1149*4882a593Smuzhiyun }
1150*4882a593Smuzhiyun counts = &bitmap->counts;
1151*4882a593Smuzhiyun
1152*4882a593Smuzhiyun /*
1153*4882a593Smuzhiyun * If we can take a node's bitmap lock then that slot is not
1154*4882a593Smuzhiyun * occupied, so update its page count directly.
1155*4882a593Smuzhiyun */
1156*4882a593Smuzhiyun snprintf(str, 64, "bitmap%04d", i);
1157*4882a593Smuzhiyun bm_lockres = lockres_init(mddev, str, NULL, 1);
1158*4882a593Smuzhiyun if (!bm_lockres) {
1159*4882a593Smuzhiyun pr_err("Cannot initialize %s lock\n", str);
1160*4882a593Smuzhiyun goto out;
1161*4882a593Smuzhiyun }
1162*4882a593Smuzhiyun bm_lockres->flags |= DLM_LKF_NOQUEUE;
1163*4882a593Smuzhiyun rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
1164*4882a593Smuzhiyun if (!rv)
1165*4882a593Smuzhiyun counts->pages = my_pages;
1166*4882a593Smuzhiyun lockres_free(bm_lockres);
1167*4882a593Smuzhiyun
1168*4882a593Smuzhiyun if (my_pages != counts->pages)
1169*4882a593Smuzhiyun /*
1170*4882a593Smuzhiyun * Let's revert the bitmap size if one node
1171*4882a593Smuzhiyun * can't resize its bitmap.
1172*4882a593Smuzhiyun */
1173*4882a593Smuzhiyun goto out;
1174*4882a593Smuzhiyun md_bitmap_free(bitmap);
1175*4882a593Smuzhiyun }
1176*4882a593Smuzhiyun
1177*4882a593Smuzhiyun return 0;
1178*4882a593Smuzhiyun out:
1179*4882a593Smuzhiyun md_bitmap_free(bitmap);
1180*4882a593Smuzhiyun update_bitmap_size(mddev, oldsize);
1181*4882a593Smuzhiyun return -1;
1182*4882a593Smuzhiyun }
1183*4882a593Smuzhiyun
1184*4882a593Smuzhiyun /*
1185*4882a593Smuzhiyun * return 0 if all the bitmaps have the same sync_size
1186*4882a593Smuzhiyun */
1187*4882a593Smuzhiyun static int cluster_check_sync_size(struct mddev *mddev)
1188*4882a593Smuzhiyun {
1189*4882a593Smuzhiyun int i, rv;
1190*4882a593Smuzhiyun bitmap_super_t *sb;
1191*4882a593Smuzhiyun unsigned long my_sync_size, sync_size = 0;
1192*4882a593Smuzhiyun int node_num = mddev->bitmap_info.nodes;
1193*4882a593Smuzhiyun int current_slot = md_cluster_ops->slot_number(mddev);
1194*4882a593Smuzhiyun struct bitmap *bitmap = mddev->bitmap;
1195*4882a593Smuzhiyun char str[64];
1196*4882a593Smuzhiyun struct dlm_lock_resource *bm_lockres;
1197*4882a593Smuzhiyun
1198*4882a593Smuzhiyun sb = kmap_atomic(bitmap->storage.sb_page);
1199*4882a593Smuzhiyun my_sync_size = sb->sync_size;
1200*4882a593Smuzhiyun kunmap_atomic(sb);
1201*4882a593Smuzhiyun
1202*4882a593Smuzhiyun for (i = 0; i < node_num; i++) {
1203*4882a593Smuzhiyun if (i == current_slot)
1204*4882a593Smuzhiyun continue;
1205*4882a593Smuzhiyun
1206*4882a593Smuzhiyun bitmap = get_bitmap_from_slot(mddev, i);
1207*4882a593Smuzhiyun if (IS_ERR(bitmap)) {
1208*4882a593Smuzhiyun pr_err("can't get bitmap from slot %d\n", i);
1209*4882a593Smuzhiyun return -1;
1210*4882a593Smuzhiyun }
1211*4882a593Smuzhiyun
1212*4882a593Smuzhiyun /*
1213*4882a593Smuzhiyun * If we can take a node's bitmap lock then that slot is not
1214*4882a593Smuzhiyun * occupied, so update its superblock directly.
1215*4882a593Smuzhiyun */
1216*4882a593Smuzhiyun snprintf(str, 64, "bitmap%04d", i);
1217*4882a593Smuzhiyun bm_lockres = lockres_init(mddev, str, NULL, 1);
1218*4882a593Smuzhiyun if (!bm_lockres) {
1219*4882a593Smuzhiyun pr_err("md-cluster: Cannot initialize %s\n", str);
1220*4882a593Smuzhiyun md_bitmap_free(bitmap);
1221*4882a593Smuzhiyun return -1;
1222*4882a593Smuzhiyun }
1223*4882a593Smuzhiyun bm_lockres->flags |= DLM_LKF_NOQUEUE;
1224*4882a593Smuzhiyun rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
1225*4882a593Smuzhiyun if (!rv)
1226*4882a593Smuzhiyun md_bitmap_update_sb(bitmap);
1227*4882a593Smuzhiyun lockres_free(bm_lockres);
1228*4882a593Smuzhiyun
1229*4882a593Smuzhiyun sb = kmap_atomic(bitmap->storage.sb_page);
1230*4882a593Smuzhiyun if (sync_size == 0)
1231*4882a593Smuzhiyun sync_size = sb->sync_size;
1232*4882a593Smuzhiyun else if (sync_size != sb->sync_size) {
1233*4882a593Smuzhiyun kunmap_atomic(sb);
1234*4882a593Smuzhiyun md_bitmap_free(bitmap);
1235*4882a593Smuzhiyun return -1;
1236*4882a593Smuzhiyun }
1237*4882a593Smuzhiyun kunmap_atomic(sb);
1238*4882a593Smuzhiyun md_bitmap_free(bitmap);
1239*4882a593Smuzhiyun }
1240*4882a593Smuzhiyun
1241*4882a593Smuzhiyun return (my_sync_size == sync_size) ? 0 : -1;
1242*4882a593Smuzhiyun }
1243*4882a593Smuzhiyun
1244*4882a593Smuzhiyun /*
1245*4882a593Smuzhiyun * Updating the size of a clustered raid is a little more complex; it is
1246*4882a593Smuzhiyun * performed in these steps:
1247*4882a593Smuzhiyun * 1. The initiator node holds the token lock and updates its superblock.
1248*4882a593Smuzhiyun * 2. It sends a METADATA_UPDATED msg to the other nodes.
1249*4882a593Smuzhiyun * 3. The initiator then checks each bitmap's sync_size; if all bitmaps
1250*4882a593Smuzhiyun * report the same sync_size, the capacity can be changed and the other
1251*4882a593Smuzhiyun * nodes are told to apply it. If any node fails to update sync_size
1252*4882a593Smuzhiyun * accordingly, we revert to the previous value.
1253*4882a593Smuzhiyun */
1254*4882a593Smuzhiyun static void update_size(struct mddev *mddev, sector_t old_dev_sectors)
1255*4882a593Smuzhiyun {
1256*4882a593Smuzhiyun struct md_cluster_info *cinfo = mddev->cluster_info;
1257*4882a593Smuzhiyun struct cluster_msg cmsg;
1258*4882a593Smuzhiyun struct md_rdev *rdev;
1259*4882a593Smuzhiyun int ret = 0;
1260*4882a593Smuzhiyun int raid_slot = -1;
1261*4882a593Smuzhiyun
1262*4882a593Smuzhiyun md_update_sb(mddev, 1);
1263*4882a593Smuzhiyun if (lock_comm(cinfo, 1)) {
1264*4882a593Smuzhiyun pr_err("%s: lock_comm failed\n", __func__);
1265*4882a593Smuzhiyun return;
1266*4882a593Smuzhiyun }
1267*4882a593Smuzhiyun
1268*4882a593Smuzhiyun memset(&cmsg, 0, sizeof(cmsg));
1269*4882a593Smuzhiyun cmsg.type = cpu_to_le32(METADATA_UPDATED);
1270*4882a593Smuzhiyun rdev_for_each(rdev, mddev)
1271*4882a593Smuzhiyun if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags)) {
1272*4882a593Smuzhiyun raid_slot = rdev->desc_nr;
1273*4882a593Smuzhiyun break;
1274*4882a593Smuzhiyun }
1275*4882a593Smuzhiyun if (raid_slot >= 0) {
1276*4882a593Smuzhiyun cmsg.raid_slot = cpu_to_le32(raid_slot);
1277*4882a593Smuzhiyun /*
1278*4882a593Smuzhiyun * We can only change the capacity after all the nodes can do it,
1279*4882a593Smuzhiyun * so we need to wait until the other nodes have received the msg
1280*4882a593Smuzhiyun * and handled the change.
1281*4882a593Smuzhiyun */
1282*4882a593Smuzhiyun ret = __sendmsg(cinfo, &cmsg);
1283*4882a593Smuzhiyun if (ret) {
1284*4882a593Smuzhiyun pr_err("%s:%d: failed to send METADATA_UPDATED msg\n",
1285*4882a593Smuzhiyun __func__, __LINE__);
1286*4882a593Smuzhiyun unlock_comm(cinfo);
1287*4882a593Smuzhiyun return;
1288*4882a593Smuzhiyun }
1289*4882a593Smuzhiyun } else {
1290*4882a593Smuzhiyun pr_err("md-cluster: No good device id found to send\n");
1291*4882a593Smuzhiyun unlock_comm(cinfo);
1292*4882a593Smuzhiyun return;
1293*4882a593Smuzhiyun }
1294*4882a593Smuzhiyun
1295*4882a593Smuzhiyun /*
1296*4882a593Smuzhiyun * Check the sync_size from the other nodes' bitmaps; if sync_size
1297*4882a593Smuzhiyun * has been updated on all of them as expected, send a CHANGE_CAPACITY
1298*4882a593Smuzhiyun * msg so every node applies the new capacity.
1299*4882a593Smuzhiyun */
1300*4882a593Smuzhiyun if (cluster_check_sync_size(mddev) == 0) {
1301*4882a593Smuzhiyun memset(&cmsg, 0, sizeof(cmsg));
1302*4882a593Smuzhiyun cmsg.type = cpu_to_le32(CHANGE_CAPACITY);
1303*4882a593Smuzhiyun ret = __sendmsg(cinfo, &cmsg);
1304*4882a593Smuzhiyun if (ret)
1305*4882a593Smuzhiyun pr_err("%s:%d: failed to send CHANGE_CAPACITY msg\n",
1306*4882a593Smuzhiyun __func__, __LINE__);
1307*4882a593Smuzhiyun set_capacity(mddev->gendisk, mddev->array_sectors);
1308*4882a593Smuzhiyun revalidate_disk_size(mddev->gendisk, true);
1309*4882a593Smuzhiyun } else {
1310*4882a593Smuzhiyun /* revert to previous sectors */
1311*4882a593Smuzhiyun ret = mddev->pers->resize(mddev, old_dev_sectors);
1312*4882a593Smuzhiyun if (!ret)
1313*4882a593Smuzhiyun revalidate_disk_size(mddev->gendisk, true);
1314*4882a593Smuzhiyun ret = __sendmsg(cinfo, &cmsg);
1315*4882a593Smuzhiyun if (ret)
1316*4882a593Smuzhiyun pr_err("%s:%d: failed to send METADATA_UPDATED msg\n",
1317*4882a593Smuzhiyun __func__, __LINE__);
1318*4882a593Smuzhiyun }
1319*4882a593Smuzhiyun unlock_comm(cinfo);
1320*4882a593Smuzhiyun }
1321*4882a593Smuzhiyun
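/*
 * resync_start - take the resync lock in EX mode so that only one node
 * acts as the resyncing node at any time.
 */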
1322*4882a593Smuzhiyun static int resync_start(struct mddev *mddev)
1323*4882a593Smuzhiyun {
1324*4882a593Smuzhiyun struct md_cluster_info *cinfo = mddev->cluster_info;
1325*4882a593Smuzhiyun return dlm_lock_sync_interruptible(cinfo->resync_lockres, DLM_LOCK_EX, mddev);
1326*4882a593Smuzhiyun }
1327*4882a593Smuzhiyun
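/*
 * resync_info_get - report the range currently suspended because another
 * node is resyncing it (as recorded from received RESYNCING messages).
 */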
1328*4882a593Smuzhiyun static void resync_info_get(struct mddev *mddev, sector_t *lo, sector_t *hi)
1329*4882a593Smuzhiyun {
1330*4882a593Smuzhiyun struct md_cluster_info *cinfo = mddev->cluster_info;
1331*4882a593Smuzhiyun
1332*4882a593Smuzhiyun spin_lock_irq(&cinfo->suspend_lock);
1333*4882a593Smuzhiyun *lo = cinfo->suspend_lo;
1334*4882a593Smuzhiyun *hi = cinfo->suspend_hi;
1335*4882a593Smuzhiyun spin_unlock_irq(&cinfo->suspend_lock);
1336*4882a593Smuzhiyun }
1337*4882a593Smuzhiyun
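/*
 * resync_info_update - publish the local resync window: store it in the
 * bitmap lock's LVB and broadcast a RESYNCING message so that other nodes
 * suspend I/O to that range. A zero range means resync has finished.
 */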
1338*4882a593Smuzhiyun static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
1339*4882a593Smuzhiyun {
1340*4882a593Smuzhiyun struct md_cluster_info *cinfo = mddev->cluster_info;
1341*4882a593Smuzhiyun struct resync_info ri;
1342*4882a593Smuzhiyun struct cluster_msg cmsg = {0};
1343*4882a593Smuzhiyun
1344*4882a593Smuzhiyun /* do not send a zero range again if we have already sent one */
1345*4882a593Smuzhiyun if (hi == 0) {
1346*4882a593Smuzhiyun memcpy(&ri, cinfo->bitmap_lockres->lksb.sb_lvbptr, sizeof(struct resync_info));
1347*4882a593Smuzhiyun if (le64_to_cpu(ri.hi) == 0)
1348*4882a593Smuzhiyun return 0;
1349*4882a593Smuzhiyun }
1350*4882a593Smuzhiyun
1351*4882a593Smuzhiyun add_resync_info(cinfo->bitmap_lockres, lo, hi);
1352*4882a593Smuzhiyun /* Re-acquire the lock to refresh LVB */
1353*4882a593Smuzhiyun dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW);
1354*4882a593Smuzhiyun cmsg.type = cpu_to_le32(RESYNCING);
1355*4882a593Smuzhiyun cmsg.low = cpu_to_le64(lo);
1356*4882a593Smuzhiyun cmsg.high = cpu_to_le64(hi);
1357*4882a593Smuzhiyun
1358*4882a593Smuzhiyun /*
1359*4882a593Smuzhiyun * mddev_lock is held if resync_info_update is called from
1360*4882a593Smuzhiyun * resync_finish (md_reap_sync_thread -> resync_finish)
1361*4882a593Smuzhiyun */
1362*4882a593Smuzhiyun if (lo == 0 && hi == 0)
1363*4882a593Smuzhiyun return sendmsg(cinfo, &cmsg, 1);
1364*4882a593Smuzhiyun else
1365*4882a593Smuzhiyun return sendmsg(cinfo, &cmsg, 0);
1366*4882a593Smuzhiyun }
1367*4882a593Smuzhiyun
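/*
 * resync_finish - the local resync is done: broadcast a zero RESYNCING
 * range (unless the array is being stopped) and release the resync lock.
 */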
1368*4882a593Smuzhiyun static int resync_finish(struct mddev *mddev)
1369*4882a593Smuzhiyun {
1370*4882a593Smuzhiyun struct md_cluster_info *cinfo = mddev->cluster_info;
1371*4882a593Smuzhiyun int ret = 0;
1372*4882a593Smuzhiyun
1373*4882a593Smuzhiyun clear_bit(MD_RESYNCING_REMOTE, &mddev->recovery);
1374*4882a593Smuzhiyun
1375*4882a593Smuzhiyun /*
1376*4882a593Smuzhiyun * If the resync thread was interrupted we cannot report resync as
1377*4882a593Smuzhiyun * finished; another node will launch a resync thread to continue.
1378*4882a593Smuzhiyun */
1379*4882a593Smuzhiyun if (!test_bit(MD_CLOSING, &mddev->flags))
1380*4882a593Smuzhiyun ret = resync_info_update(mddev, 0, 0);
1381*4882a593Smuzhiyun dlm_unlock_sync(cinfo->resync_lockres);
1382*4882a593Smuzhiyun return ret;
1383*4882a593Smuzhiyun }
1384*4882a593Smuzhiyun
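/*
 * area_resyncing - return 1 if [lo, hi) overlaps a range another node is
 * resyncing, or if read balancing is currently suspended for reads.
 */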
1385*4882a593Smuzhiyun static int area_resyncing(struct mddev *mddev, int direction,
1386*4882a593Smuzhiyun sector_t lo, sector_t hi)
1387*4882a593Smuzhiyun {
1388*4882a593Smuzhiyun struct md_cluster_info *cinfo = mddev->cluster_info;
1389*4882a593Smuzhiyun int ret = 0;
1390*4882a593Smuzhiyun
1391*4882a593Smuzhiyun if ((direction == READ) &&
1392*4882a593Smuzhiyun test_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state))
1393*4882a593Smuzhiyun return 1;
1394*4882a593Smuzhiyun
1395*4882a593Smuzhiyun spin_lock_irq(&cinfo->suspend_lock);
1396*4882a593Smuzhiyun if (hi > cinfo->suspend_lo && lo < cinfo->suspend_hi)
1397*4882a593Smuzhiyun ret = 1;
1398*4882a593Smuzhiyun spin_unlock_irq(&cinfo->suspend_lock);
1399*4882a593Smuzhiyun return ret;
1400*4882a593Smuzhiyun }
1401*4882a593Smuzhiyun
1402*4882a593Smuzhiyun /* add_new_disk() - initiates a disk add
1403*4882a593Smuzhiyun * However, if this fails before md_update_sb() is written,
1404*4882a593Smuzhiyun * add_new_disk_cancel() must be called to release the token lock.
1405*4882a593Smuzhiyun */
1406*4882a593Smuzhiyun static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev)
1407*4882a593Smuzhiyun {
1408*4882a593Smuzhiyun struct md_cluster_info *cinfo = mddev->cluster_info;
1409*4882a593Smuzhiyun struct cluster_msg cmsg;
1410*4882a593Smuzhiyun int ret = 0;
1411*4882a593Smuzhiyun struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
1412*4882a593Smuzhiyun char *uuid = sb->device_uuid;
1413*4882a593Smuzhiyun
1414*4882a593Smuzhiyun memset(&cmsg, 0, sizeof(cmsg));
1415*4882a593Smuzhiyun cmsg.type = cpu_to_le32(NEWDISK);
1416*4882a593Smuzhiyun memcpy(cmsg.uuid, uuid, 16);
1417*4882a593Smuzhiyun cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
1418*4882a593Smuzhiyun if (lock_comm(cinfo, 1))
1419*4882a593Smuzhiyun return -EAGAIN;
1420*4882a593Smuzhiyun ret = __sendmsg(cinfo, &cmsg);
1421*4882a593Smuzhiyun if (ret) {
1422*4882a593Smuzhiyun unlock_comm(cinfo);
1423*4882a593Smuzhiyun return ret;
1424*4882a593Smuzhiyun }
1425*4882a593Smuzhiyun cinfo->no_new_dev_lockres->flags |= DLM_LKF_NOQUEUE;
1426*4882a593Smuzhiyun ret = dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_EX);
1427*4882a593Smuzhiyun cinfo->no_new_dev_lockres->flags &= ~DLM_LKF_NOQUEUE;
1428*4882a593Smuzhiyun /* Some node does not "see" the device */
1429*4882a593Smuzhiyun if (ret == -EAGAIN)
1430*4882a593Smuzhiyun ret = -ENOENT;
1431*4882a593Smuzhiyun if (ret)
1432*4882a593Smuzhiyun unlock_comm(cinfo);
1433*4882a593Smuzhiyun else {
1434*4882a593Smuzhiyun dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
1435*4882a593Smuzhiyun /* Since MD_CHANGE_DEVS will be set in add_bound_rdev which
1436*4882a593Smuzhiyun * will run soon after add_new_disk, the below path will be
1437*4882a593Smuzhiyun * invoked:
1438*4882a593Smuzhiyun * md_wakeup_thread(mddev->thread)
1439*4882a593Smuzhiyun * -> conf->thread (raid1d)
1440*4882a593Smuzhiyun * -> md_check_recovery -> md_update_sb
1441*4882a593Smuzhiyun * -> metadata_update_start/finish
1442*4882a593Smuzhiyun * MD_CLUSTER_SEND_LOCKED_ALREADY will be cleared eventually.
1443*4882a593Smuzhiyun *
1444*4882a593Smuzhiyun * For other failure cases, metadata_update_cancel and
1445*4882a593Smuzhiyun * add_new_disk_cancel also clear this bit.
1446*4882a593Smuzhiyun */
1447*4882a593Smuzhiyun set_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state);
1448*4882a593Smuzhiyun wake_up(&cinfo->wait);
1449*4882a593Smuzhiyun }
1450*4882a593Smuzhiyun return ret;
1451*4882a593Smuzhiyun }
1452*4882a593Smuzhiyun
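/*
 * add_new_disk_cancel - the disk add failed before md_update_sb() ran:
 * clear MD_CLUSTER_SEND_LOCKED_ALREADY and release the communication lock
 * that add_new_disk() left held.
 */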
1453*4882a593Smuzhiyun static void add_new_disk_cancel(struct mddev *mddev)
1454*4882a593Smuzhiyun {
1455*4882a593Smuzhiyun struct md_cluster_info *cinfo = mddev->cluster_info;
1456*4882a593Smuzhiyun clear_bit(MD_CLUSTER_SEND_LOCKED_ALREADY, &cinfo->state);
1457*4882a593Smuzhiyun unlock_comm(cinfo);
1458*4882a593Smuzhiyun }
1459*4882a593Smuzhiyun
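/*
 * new_disk_ack - userspace confirms (or rejects) a NEWDISK request from
 * another node. On ack, drop the no-new-dev lock so the initiator can
 * proceed; either way, wake up the waiting receive handler.
 */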
1460*4882a593Smuzhiyun static int new_disk_ack(struct mddev *mddev, bool ack)
1461*4882a593Smuzhiyun {
1462*4882a593Smuzhiyun struct md_cluster_info *cinfo = mddev->cluster_info;
1463*4882a593Smuzhiyun
1464*4882a593Smuzhiyun if (!test_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state)) {
1465*4882a593Smuzhiyun pr_warn("md-cluster(%s): Spurious cluster confirmation\n", mdname(mddev));
1466*4882a593Smuzhiyun return -EINVAL;
1467*4882a593Smuzhiyun }
1468*4882a593Smuzhiyun
1469*4882a593Smuzhiyun if (ack)
1470*4882a593Smuzhiyun dlm_unlock_sync(cinfo->no_new_dev_lockres);
1471*4882a593Smuzhiyun complete(&cinfo->newdisk_completion);
1472*4882a593Smuzhiyun return 0;
1473*4882a593Smuzhiyun }
1474*4882a593Smuzhiyun
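/*
 * remove_disk - broadcast a REMOVE message so that every node hot-removes
 * the device in the given slot.
 */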
1475*4882a593Smuzhiyun static int remove_disk(struct mddev *mddev, struct md_rdev *rdev)
1476*4882a593Smuzhiyun {
1477*4882a593Smuzhiyun struct cluster_msg cmsg = {0};
1478*4882a593Smuzhiyun struct md_cluster_info *cinfo = mddev->cluster_info;
1479*4882a593Smuzhiyun cmsg.type = cpu_to_le32(REMOVE);
1480*4882a593Smuzhiyun cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
1481*4882a593Smuzhiyun return sendmsg(cinfo, &cmsg, 1);
1482*4882a593Smuzhiyun }
1483*4882a593Smuzhiyun
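/*
 * lock_all_bitmaps - try (without queueing) to take every other node's
 * bitmap lock. Returns 1 if all locks were acquired, -1 if some node
 * still holds its lock, and 0 on allocation failure.
 */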
1484*4882a593Smuzhiyun static int lock_all_bitmaps(struct mddev *mddev)
1485*4882a593Smuzhiyun {
1486*4882a593Smuzhiyun int slot, my_slot, ret, held = 1, i = 0;
1487*4882a593Smuzhiyun char str[64];
1488*4882a593Smuzhiyun struct md_cluster_info *cinfo = mddev->cluster_info;
1489*4882a593Smuzhiyun
1490*4882a593Smuzhiyun cinfo->other_bitmap_lockres =
1491*4882a593Smuzhiyun kcalloc(mddev->bitmap_info.nodes - 1,
1492*4882a593Smuzhiyun sizeof(struct dlm_lock_resource *), GFP_KERNEL);
1493*4882a593Smuzhiyun if (!cinfo->other_bitmap_lockres) {
1494*4882a593Smuzhiyun pr_err("md: can't alloc mem for other bitmap locks\n");
1495*4882a593Smuzhiyun return 0;
1496*4882a593Smuzhiyun }
1497*4882a593Smuzhiyun
1498*4882a593Smuzhiyun my_slot = slot_number(mddev);
1499*4882a593Smuzhiyun for (slot = 0; slot < mddev->bitmap_info.nodes; slot++) {
1500*4882a593Smuzhiyun if (slot == my_slot)
1501*4882a593Smuzhiyun continue;
1502*4882a593Smuzhiyun
1503*4882a593Smuzhiyun memset(str, '\0', 64);
1504*4882a593Smuzhiyun snprintf(str, 64, "bitmap%04d", slot);
1505*4882a593Smuzhiyun cinfo->other_bitmap_lockres[i] = lockres_init(mddev, str, NULL, 1);
1506*4882a593Smuzhiyun if (!cinfo->other_bitmap_lockres[i])
1507*4882a593Smuzhiyun return -ENOMEM;
1508*4882a593Smuzhiyun
1509*4882a593Smuzhiyun cinfo->other_bitmap_lockres[i]->flags |= DLM_LKF_NOQUEUE;
1510*4882a593Smuzhiyun ret = dlm_lock_sync(cinfo->other_bitmap_lockres[i], DLM_LOCK_PW);
1511*4882a593Smuzhiyun if (ret)
1512*4882a593Smuzhiyun held = -1;
1513*4882a593Smuzhiyun i++;
1514*4882a593Smuzhiyun }
1515*4882a593Smuzhiyun
1516*4882a593Smuzhiyun return held;
1517*4882a593Smuzhiyun }
1518*4882a593Smuzhiyun
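/*
 * unlock_all_bitmaps - free (and thereby release) the bitmap lock
 * resources taken by lock_all_bitmaps().
 */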
1519*4882a593Smuzhiyun static void unlock_all_bitmaps(struct mddev *mddev)
1520*4882a593Smuzhiyun {
1521*4882a593Smuzhiyun struct md_cluster_info *cinfo = mddev->cluster_info;
1522*4882a593Smuzhiyun int i;
1523*4882a593Smuzhiyun
1524*4882a593Smuzhiyun /* release the other nodes' bitmap locks if they exist */
1525*4882a593Smuzhiyun if (cinfo->other_bitmap_lockres) {
1526*4882a593Smuzhiyun for (i = 0; i < mddev->bitmap_info.nodes - 1; i++) {
1527*4882a593Smuzhiyun if (cinfo->other_bitmap_lockres[i]) {
1528*4882a593Smuzhiyun lockres_free(cinfo->other_bitmap_lockres[i]);
1529*4882a593Smuzhiyun }
1530*4882a593Smuzhiyun }
1531*4882a593Smuzhiyun kfree(cinfo->other_bitmap_lockres);
1532*4882a593Smuzhiyun cinfo->other_bitmap_lockres = NULL;
1533*4882a593Smuzhiyun }
1534*4882a593Smuzhiyun }
1535*4882a593Smuzhiyun
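/*
 * gather_bitmaps - used when re-adding a device: broadcast RE_ADD, then
 * merge every other node's bitmap into ours and pull recovery_cp back so
 * that the dirty regions get resynced.
 */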
1536*4882a593Smuzhiyun static int gather_bitmaps(struct md_rdev *rdev)
1537*4882a593Smuzhiyun {
1538*4882a593Smuzhiyun int sn, err;
1539*4882a593Smuzhiyun sector_t lo, hi;
1540*4882a593Smuzhiyun struct cluster_msg cmsg = {0};
1541*4882a593Smuzhiyun struct mddev *mddev = rdev->mddev;
1542*4882a593Smuzhiyun struct md_cluster_info *cinfo = mddev->cluster_info;
1543*4882a593Smuzhiyun
1544*4882a593Smuzhiyun cmsg.type = cpu_to_le32(RE_ADD);
1545*4882a593Smuzhiyun cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
1546*4882a593Smuzhiyun err = sendmsg(cinfo, &cmsg, 1);
1547*4882a593Smuzhiyun if (err)
1548*4882a593Smuzhiyun goto out;
1549*4882a593Smuzhiyun
1550*4882a593Smuzhiyun for (sn = 0; sn < mddev->bitmap_info.nodes; sn++) {
1551*4882a593Smuzhiyun if (sn == (cinfo->slot_number - 1))
1552*4882a593Smuzhiyun continue;
1553*4882a593Smuzhiyun err = md_bitmap_copy_from_slot(mddev, sn, &lo, &hi, false);
1554*4882a593Smuzhiyun if (err) {
1555*4882a593Smuzhiyun pr_warn("md-cluster: Could not gather bitmaps from slot %d\n", sn);
1556*4882a593Smuzhiyun goto out;
1557*4882a593Smuzhiyun }
1558*4882a593Smuzhiyun if ((hi > 0) && (lo < mddev->recovery_cp))
1559*4882a593Smuzhiyun mddev->recovery_cp = lo;
1560*4882a593Smuzhiyun }
1561*4882a593Smuzhiyun out:
1562*4882a593Smuzhiyun return err;
1563*4882a593Smuzhiyun }
1564*4882a593Smuzhiyun
1565*4882a593Smuzhiyun static struct md_cluster_operations cluster_ops = {
1566*4882a593Smuzhiyun .join = join,
1567*4882a593Smuzhiyun .leave = leave,
1568*4882a593Smuzhiyun .slot_number = slot_number,
1569*4882a593Smuzhiyun .resync_start = resync_start,
1570*4882a593Smuzhiyun .resync_finish = resync_finish,
1571*4882a593Smuzhiyun .resync_info_update = resync_info_update,
1572*4882a593Smuzhiyun .resync_info_get = resync_info_get,
1573*4882a593Smuzhiyun .metadata_update_start = metadata_update_start,
1574*4882a593Smuzhiyun .metadata_update_finish = metadata_update_finish,
1575*4882a593Smuzhiyun .metadata_update_cancel = metadata_update_cancel,
1576*4882a593Smuzhiyun .area_resyncing = area_resyncing,
1577*4882a593Smuzhiyun .add_new_disk = add_new_disk,
1578*4882a593Smuzhiyun .add_new_disk_cancel = add_new_disk_cancel,
1579*4882a593Smuzhiyun .new_disk_ack = new_disk_ack,
1580*4882a593Smuzhiyun .remove_disk = remove_disk,
1581*4882a593Smuzhiyun .load_bitmaps = load_bitmaps,
1582*4882a593Smuzhiyun .gather_bitmaps = gather_bitmaps,
1583*4882a593Smuzhiyun .resize_bitmaps = resize_bitmaps,
1584*4882a593Smuzhiyun .lock_all_bitmaps = lock_all_bitmaps,
1585*4882a593Smuzhiyun .unlock_all_bitmaps = unlock_all_bitmaps,
1586*4882a593Smuzhiyun .update_size = update_size,
1587*4882a593Smuzhiyun };
1588*4882a593Smuzhiyun
1589*4882a593Smuzhiyun static int __init cluster_init(void)
1590*4882a593Smuzhiyun {
1591*4882a593Smuzhiyun pr_warn("md-cluster: support raid1 and raid10 (limited support)\n");
1592*4882a593Smuzhiyun pr_info("Registering Cluster MD functions\n");
1593*4882a593Smuzhiyun register_md_cluster_operations(&cluster_ops, THIS_MODULE);
1594*4882a593Smuzhiyun return 0;
1595*4882a593Smuzhiyun }
1596*4882a593Smuzhiyun
1597*4882a593Smuzhiyun static void cluster_exit(void)
1598*4882a593Smuzhiyun {
1599*4882a593Smuzhiyun unregister_md_cluster_operations();
1600*4882a593Smuzhiyun }
1601*4882a593Smuzhiyun
1602*4882a593Smuzhiyun module_init(cluster_init);
1603*4882a593Smuzhiyun module_exit(cluster_exit);
1604*4882a593Smuzhiyun MODULE_AUTHOR("SUSE");
1605*4882a593Smuzhiyun MODULE_LICENSE("GPL");
1606*4882a593Smuzhiyun MODULE_DESCRIPTION("Clustering support for MD");