xref: /OK3568_Linux_fs/kernel/fs/btrfs/dev-replace.c (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun // SPDX-License-Identifier: GPL-2.0
2*4882a593Smuzhiyun /*
3*4882a593Smuzhiyun  * Copyright (C) STRATO AG 2012.  All rights reserved.
4*4882a593Smuzhiyun  */
5*4882a593Smuzhiyun 
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/math64.h>
#include <linux/string.h>
#include "misc.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "sysfs.h"
24*4882a593Smuzhiyun 
25*4882a593Smuzhiyun /*
26*4882a593Smuzhiyun  * Device replace overview
27*4882a593Smuzhiyun  *
28*4882a593Smuzhiyun  * [Objective]
29*4882a593Smuzhiyun  * To copy all extents (both new and on-disk) from source device to target
30*4882a593Smuzhiyun  * device, while still keeping the filesystem read-write.
31*4882a593Smuzhiyun  *
32*4882a593Smuzhiyun  * [Method]
33*4882a593Smuzhiyun  * There are two main methods involved:
34*4882a593Smuzhiyun  *
35*4882a593Smuzhiyun  * - Write duplication
36*4882a593Smuzhiyun  *
37*4882a593Smuzhiyun  *   All new writes will be written to both target and source devices, so even
38*4882a593Smuzhiyun  *   if replace gets canceled, source device still contains up-to-date data.
39*4882a593Smuzhiyun  *
40*4882a593Smuzhiyun  *   Location:		handle_ops_on_dev_replace() from __btrfs_map_block()
41*4882a593Smuzhiyun  *   Start:		btrfs_dev_replace_start()
42*4882a593Smuzhiyun  *   End:		btrfs_dev_replace_finishing()
43*4882a593Smuzhiyun  *   Content:		Latest data/metadata
44*4882a593Smuzhiyun  *
45*4882a593Smuzhiyun  * - Copy existing extents
46*4882a593Smuzhiyun  *
47*4882a593Smuzhiyun  *   This happens by re-using scrub facility, as scrub also iterates through
48*4882a593Smuzhiyun  *   existing extents from commit root.
49*4882a593Smuzhiyun  *
50*4882a593Smuzhiyun  *   Location:		scrub_write_block_to_dev_replace() from
51*4882a593Smuzhiyun  *   			scrub_block_complete()
52*4882a593Smuzhiyun  *   Content:		Data/meta from commit root.
53*4882a593Smuzhiyun  *
54*4882a593Smuzhiyun  * Due to the content difference, we need to avoid nocow write when dev-replace
55*4882a593Smuzhiyun  * is happening.  This is done by marking the block group read-only and waiting
56*4882a593Smuzhiyun  * for NOCOW writes.
57*4882a593Smuzhiyun  *
58*4882a593Smuzhiyun  * After replace is done, the finishing part is done by swapping the target and
59*4882a593Smuzhiyun  * source devices.
60*4882a593Smuzhiyun  *
61*4882a593Smuzhiyun  *   Location:		btrfs_dev_replace_update_device_in_mapping_tree() from
62*4882a593Smuzhiyun  *   			btrfs_dev_replace_finishing()
63*4882a593Smuzhiyun  */
64*4882a593Smuzhiyun 
/* Forward declarations of static functions referenced ahead of their definitions. */
static int btrfs_dev_replace_kthread(void *data);
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
				       int scrub_ret);
68*4882a593Smuzhiyun 
btrfs_init_dev_replace(struct btrfs_fs_info * fs_info)69*4882a593Smuzhiyun int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
70*4882a593Smuzhiyun {
71*4882a593Smuzhiyun 	struct btrfs_key key;
72*4882a593Smuzhiyun 	struct btrfs_root *dev_root = fs_info->dev_root;
73*4882a593Smuzhiyun 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
74*4882a593Smuzhiyun 	struct extent_buffer *eb;
75*4882a593Smuzhiyun 	int slot;
76*4882a593Smuzhiyun 	int ret = 0;
77*4882a593Smuzhiyun 	struct btrfs_path *path = NULL;
78*4882a593Smuzhiyun 	int item_size;
79*4882a593Smuzhiyun 	struct btrfs_dev_replace_item *ptr;
80*4882a593Smuzhiyun 	u64 src_devid;
81*4882a593Smuzhiyun 
82*4882a593Smuzhiyun 	path = btrfs_alloc_path();
83*4882a593Smuzhiyun 	if (!path) {
84*4882a593Smuzhiyun 		ret = -ENOMEM;
85*4882a593Smuzhiyun 		goto out;
86*4882a593Smuzhiyun 	}
87*4882a593Smuzhiyun 
88*4882a593Smuzhiyun 	key.objectid = 0;
89*4882a593Smuzhiyun 	key.type = BTRFS_DEV_REPLACE_KEY;
90*4882a593Smuzhiyun 	key.offset = 0;
91*4882a593Smuzhiyun 	ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
92*4882a593Smuzhiyun 	if (ret) {
93*4882a593Smuzhiyun no_valid_dev_replace_entry_found:
94*4882a593Smuzhiyun 		/*
95*4882a593Smuzhiyun 		 * We don't have a replace item or it's corrupted.  If there is
96*4882a593Smuzhiyun 		 * a replace target, fail the mount.
97*4882a593Smuzhiyun 		 */
98*4882a593Smuzhiyun 		if (btrfs_find_device(fs_info->fs_devices,
99*4882a593Smuzhiyun 				      BTRFS_DEV_REPLACE_DEVID, NULL, NULL, false)) {
100*4882a593Smuzhiyun 			btrfs_err(fs_info,
101*4882a593Smuzhiyun 			"found replace target device without a valid replace item");
102*4882a593Smuzhiyun 			ret = -EUCLEAN;
103*4882a593Smuzhiyun 			goto out;
104*4882a593Smuzhiyun 		}
105*4882a593Smuzhiyun 		ret = 0;
106*4882a593Smuzhiyun 		dev_replace->replace_state =
107*4882a593Smuzhiyun 			BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
108*4882a593Smuzhiyun 		dev_replace->cont_reading_from_srcdev_mode =
109*4882a593Smuzhiyun 		    BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
110*4882a593Smuzhiyun 		dev_replace->time_started = 0;
111*4882a593Smuzhiyun 		dev_replace->time_stopped = 0;
112*4882a593Smuzhiyun 		atomic64_set(&dev_replace->num_write_errors, 0);
113*4882a593Smuzhiyun 		atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
114*4882a593Smuzhiyun 		dev_replace->cursor_left = 0;
115*4882a593Smuzhiyun 		dev_replace->committed_cursor_left = 0;
116*4882a593Smuzhiyun 		dev_replace->cursor_left_last_write_of_item = 0;
117*4882a593Smuzhiyun 		dev_replace->cursor_right = 0;
118*4882a593Smuzhiyun 		dev_replace->srcdev = NULL;
119*4882a593Smuzhiyun 		dev_replace->tgtdev = NULL;
120*4882a593Smuzhiyun 		dev_replace->is_valid = 0;
121*4882a593Smuzhiyun 		dev_replace->item_needs_writeback = 0;
122*4882a593Smuzhiyun 		goto out;
123*4882a593Smuzhiyun 	}
124*4882a593Smuzhiyun 	slot = path->slots[0];
125*4882a593Smuzhiyun 	eb = path->nodes[0];
126*4882a593Smuzhiyun 	item_size = btrfs_item_size_nr(eb, slot);
127*4882a593Smuzhiyun 	ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
128*4882a593Smuzhiyun 
129*4882a593Smuzhiyun 	if (item_size != sizeof(struct btrfs_dev_replace_item)) {
130*4882a593Smuzhiyun 		btrfs_warn(fs_info,
131*4882a593Smuzhiyun 			"dev_replace entry found has unexpected size, ignore entry");
132*4882a593Smuzhiyun 		goto no_valid_dev_replace_entry_found;
133*4882a593Smuzhiyun 	}
134*4882a593Smuzhiyun 
135*4882a593Smuzhiyun 	src_devid = btrfs_dev_replace_src_devid(eb, ptr);
136*4882a593Smuzhiyun 	dev_replace->cont_reading_from_srcdev_mode =
137*4882a593Smuzhiyun 		btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr);
138*4882a593Smuzhiyun 	dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr);
139*4882a593Smuzhiyun 	dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
140*4882a593Smuzhiyun 	dev_replace->time_stopped =
141*4882a593Smuzhiyun 		btrfs_dev_replace_time_stopped(eb, ptr);
142*4882a593Smuzhiyun 	atomic64_set(&dev_replace->num_write_errors,
143*4882a593Smuzhiyun 		     btrfs_dev_replace_num_write_errors(eb, ptr));
144*4882a593Smuzhiyun 	atomic64_set(&dev_replace->num_uncorrectable_read_errors,
145*4882a593Smuzhiyun 		     btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr));
146*4882a593Smuzhiyun 	dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr);
147*4882a593Smuzhiyun 	dev_replace->committed_cursor_left = dev_replace->cursor_left;
148*4882a593Smuzhiyun 	dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left;
149*4882a593Smuzhiyun 	dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr);
150*4882a593Smuzhiyun 	dev_replace->is_valid = 1;
151*4882a593Smuzhiyun 
152*4882a593Smuzhiyun 	dev_replace->item_needs_writeback = 0;
153*4882a593Smuzhiyun 	switch (dev_replace->replace_state) {
154*4882a593Smuzhiyun 	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
155*4882a593Smuzhiyun 	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
156*4882a593Smuzhiyun 	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
157*4882a593Smuzhiyun 		/*
158*4882a593Smuzhiyun 		 * We don't have an active replace item but if there is a
159*4882a593Smuzhiyun 		 * replace target, fail the mount.
160*4882a593Smuzhiyun 		 */
161*4882a593Smuzhiyun 		if (btrfs_find_device(fs_info->fs_devices,
162*4882a593Smuzhiyun 				      BTRFS_DEV_REPLACE_DEVID, NULL, NULL, false)) {
163*4882a593Smuzhiyun 			btrfs_err(fs_info,
164*4882a593Smuzhiyun "replace without active item, run 'device scan --forget' on the target device");
165*4882a593Smuzhiyun 			ret = -EUCLEAN;
166*4882a593Smuzhiyun 		} else {
167*4882a593Smuzhiyun 			dev_replace->srcdev = NULL;
168*4882a593Smuzhiyun 			dev_replace->tgtdev = NULL;
169*4882a593Smuzhiyun 		}
170*4882a593Smuzhiyun 		break;
171*4882a593Smuzhiyun 	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
172*4882a593Smuzhiyun 	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
173*4882a593Smuzhiyun 		dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices,
174*4882a593Smuzhiyun 						src_devid, NULL, NULL, true);
175*4882a593Smuzhiyun 		dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices,
176*4882a593Smuzhiyun 							BTRFS_DEV_REPLACE_DEVID,
177*4882a593Smuzhiyun 							NULL, NULL, true);
178*4882a593Smuzhiyun 		/*
179*4882a593Smuzhiyun 		 * allow 'btrfs dev replace_cancel' if src/tgt device is
180*4882a593Smuzhiyun 		 * missing
181*4882a593Smuzhiyun 		 */
182*4882a593Smuzhiyun 		if (!dev_replace->srcdev &&
183*4882a593Smuzhiyun 		    !btrfs_test_opt(fs_info, DEGRADED)) {
184*4882a593Smuzhiyun 			ret = -EIO;
185*4882a593Smuzhiyun 			btrfs_warn(fs_info,
186*4882a593Smuzhiyun 			   "cannot mount because device replace operation is ongoing and");
187*4882a593Smuzhiyun 			btrfs_warn(fs_info,
188*4882a593Smuzhiyun 			   "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
189*4882a593Smuzhiyun 			   src_devid);
190*4882a593Smuzhiyun 		}
191*4882a593Smuzhiyun 		if (!dev_replace->tgtdev &&
192*4882a593Smuzhiyun 		    !btrfs_test_opt(fs_info, DEGRADED)) {
193*4882a593Smuzhiyun 			ret = -EIO;
194*4882a593Smuzhiyun 			btrfs_warn(fs_info,
195*4882a593Smuzhiyun 			   "cannot mount because device replace operation is ongoing and");
196*4882a593Smuzhiyun 			btrfs_warn(fs_info,
197*4882a593Smuzhiyun 			   "tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
198*4882a593Smuzhiyun 				BTRFS_DEV_REPLACE_DEVID);
199*4882a593Smuzhiyun 		}
200*4882a593Smuzhiyun 		if (dev_replace->tgtdev) {
201*4882a593Smuzhiyun 			if (dev_replace->srcdev) {
202*4882a593Smuzhiyun 				dev_replace->tgtdev->total_bytes =
203*4882a593Smuzhiyun 					dev_replace->srcdev->total_bytes;
204*4882a593Smuzhiyun 				dev_replace->tgtdev->disk_total_bytes =
205*4882a593Smuzhiyun 					dev_replace->srcdev->disk_total_bytes;
206*4882a593Smuzhiyun 				dev_replace->tgtdev->commit_total_bytes =
207*4882a593Smuzhiyun 					dev_replace->srcdev->commit_total_bytes;
208*4882a593Smuzhiyun 				dev_replace->tgtdev->bytes_used =
209*4882a593Smuzhiyun 					dev_replace->srcdev->bytes_used;
210*4882a593Smuzhiyun 				dev_replace->tgtdev->commit_bytes_used =
211*4882a593Smuzhiyun 					dev_replace->srcdev->commit_bytes_used;
212*4882a593Smuzhiyun 			}
213*4882a593Smuzhiyun 			set_bit(BTRFS_DEV_STATE_REPLACE_TGT,
214*4882a593Smuzhiyun 				&dev_replace->tgtdev->dev_state);
215*4882a593Smuzhiyun 
216*4882a593Smuzhiyun 			WARN_ON(fs_info->fs_devices->rw_devices == 0);
217*4882a593Smuzhiyun 			dev_replace->tgtdev->io_width = fs_info->sectorsize;
218*4882a593Smuzhiyun 			dev_replace->tgtdev->io_align = fs_info->sectorsize;
219*4882a593Smuzhiyun 			dev_replace->tgtdev->sector_size = fs_info->sectorsize;
220*4882a593Smuzhiyun 			dev_replace->tgtdev->fs_info = fs_info;
221*4882a593Smuzhiyun 			set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
222*4882a593Smuzhiyun 				&dev_replace->tgtdev->dev_state);
223*4882a593Smuzhiyun 		}
224*4882a593Smuzhiyun 		break;
225*4882a593Smuzhiyun 	}
226*4882a593Smuzhiyun 
227*4882a593Smuzhiyun out:
228*4882a593Smuzhiyun 	btrfs_free_path(path);
229*4882a593Smuzhiyun 	return ret;
230*4882a593Smuzhiyun }
231*4882a593Smuzhiyun 
232*4882a593Smuzhiyun /*
233*4882a593Smuzhiyun  * Initialize a new device for device replace target from a given source dev
234*4882a593Smuzhiyun  * and path.
235*4882a593Smuzhiyun  *
236*4882a593Smuzhiyun  * Return 0 and new device in @device_out, otherwise return < 0
237*4882a593Smuzhiyun  */
btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info * fs_info,const char * device_path,struct btrfs_device * srcdev,struct btrfs_device ** device_out)238*4882a593Smuzhiyun static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
239*4882a593Smuzhiyun 				  const char *device_path,
240*4882a593Smuzhiyun 				  struct btrfs_device *srcdev,
241*4882a593Smuzhiyun 				  struct btrfs_device **device_out)
242*4882a593Smuzhiyun {
243*4882a593Smuzhiyun 	struct btrfs_device *device;
244*4882a593Smuzhiyun 	struct block_device *bdev;
245*4882a593Smuzhiyun 	struct rcu_string *name;
246*4882a593Smuzhiyun 	u64 devid = BTRFS_DEV_REPLACE_DEVID;
247*4882a593Smuzhiyun 	int ret = 0;
248*4882a593Smuzhiyun 
249*4882a593Smuzhiyun 	*device_out = NULL;
250*4882a593Smuzhiyun 	if (srcdev->fs_devices->seeding) {
251*4882a593Smuzhiyun 		btrfs_err(fs_info, "the filesystem is a seed filesystem!");
252*4882a593Smuzhiyun 		return -EINVAL;
253*4882a593Smuzhiyun 	}
254*4882a593Smuzhiyun 
255*4882a593Smuzhiyun 	bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
256*4882a593Smuzhiyun 				  fs_info->bdev_holder);
257*4882a593Smuzhiyun 	if (IS_ERR(bdev)) {
258*4882a593Smuzhiyun 		btrfs_err(fs_info, "target device %s is invalid!", device_path);
259*4882a593Smuzhiyun 		return PTR_ERR(bdev);
260*4882a593Smuzhiyun 	}
261*4882a593Smuzhiyun 
262*4882a593Smuzhiyun 	sync_blockdev(bdev);
263*4882a593Smuzhiyun 
264*4882a593Smuzhiyun 	list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
265*4882a593Smuzhiyun 		if (device->bdev == bdev) {
266*4882a593Smuzhiyun 			btrfs_err(fs_info,
267*4882a593Smuzhiyun 				  "target device is in the filesystem!");
268*4882a593Smuzhiyun 			ret = -EEXIST;
269*4882a593Smuzhiyun 			goto error;
270*4882a593Smuzhiyun 		}
271*4882a593Smuzhiyun 	}
272*4882a593Smuzhiyun 
273*4882a593Smuzhiyun 
274*4882a593Smuzhiyun 	if (i_size_read(bdev->bd_inode) <
275*4882a593Smuzhiyun 	    btrfs_device_get_total_bytes(srcdev)) {
276*4882a593Smuzhiyun 		btrfs_err(fs_info,
277*4882a593Smuzhiyun 			  "target device is smaller than source device!");
278*4882a593Smuzhiyun 		ret = -EINVAL;
279*4882a593Smuzhiyun 		goto error;
280*4882a593Smuzhiyun 	}
281*4882a593Smuzhiyun 
282*4882a593Smuzhiyun 
283*4882a593Smuzhiyun 	device = btrfs_alloc_device(NULL, &devid, NULL);
284*4882a593Smuzhiyun 	if (IS_ERR(device)) {
285*4882a593Smuzhiyun 		ret = PTR_ERR(device);
286*4882a593Smuzhiyun 		goto error;
287*4882a593Smuzhiyun 	}
288*4882a593Smuzhiyun 
289*4882a593Smuzhiyun 	name = rcu_string_strdup(device_path, GFP_KERNEL);
290*4882a593Smuzhiyun 	if (!name) {
291*4882a593Smuzhiyun 		btrfs_free_device(device);
292*4882a593Smuzhiyun 		ret = -ENOMEM;
293*4882a593Smuzhiyun 		goto error;
294*4882a593Smuzhiyun 	}
295*4882a593Smuzhiyun 	rcu_assign_pointer(device->name, name);
296*4882a593Smuzhiyun 
297*4882a593Smuzhiyun 	set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
298*4882a593Smuzhiyun 	device->generation = 0;
299*4882a593Smuzhiyun 	device->io_width = fs_info->sectorsize;
300*4882a593Smuzhiyun 	device->io_align = fs_info->sectorsize;
301*4882a593Smuzhiyun 	device->sector_size = fs_info->sectorsize;
302*4882a593Smuzhiyun 	device->total_bytes = btrfs_device_get_total_bytes(srcdev);
303*4882a593Smuzhiyun 	device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
304*4882a593Smuzhiyun 	device->bytes_used = btrfs_device_get_bytes_used(srcdev);
305*4882a593Smuzhiyun 	device->commit_total_bytes = srcdev->commit_total_bytes;
306*4882a593Smuzhiyun 	device->commit_bytes_used = device->bytes_used;
307*4882a593Smuzhiyun 	device->fs_info = fs_info;
308*4882a593Smuzhiyun 	device->bdev = bdev;
309*4882a593Smuzhiyun 	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
310*4882a593Smuzhiyun 	set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
311*4882a593Smuzhiyun 	device->mode = FMODE_EXCL;
312*4882a593Smuzhiyun 	device->dev_stats_valid = 1;
313*4882a593Smuzhiyun 	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
314*4882a593Smuzhiyun 	device->fs_devices = fs_info->fs_devices;
315*4882a593Smuzhiyun 
316*4882a593Smuzhiyun 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
317*4882a593Smuzhiyun 	list_add(&device->dev_list, &fs_info->fs_devices->devices);
318*4882a593Smuzhiyun 	fs_info->fs_devices->num_devices++;
319*4882a593Smuzhiyun 	fs_info->fs_devices->open_devices++;
320*4882a593Smuzhiyun 	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
321*4882a593Smuzhiyun 
322*4882a593Smuzhiyun 	*device_out = device;
323*4882a593Smuzhiyun 	return 0;
324*4882a593Smuzhiyun 
325*4882a593Smuzhiyun error:
326*4882a593Smuzhiyun 	blkdev_put(bdev, FMODE_EXCL);
327*4882a593Smuzhiyun 	return ret;
328*4882a593Smuzhiyun }
329*4882a593Smuzhiyun 
330*4882a593Smuzhiyun /*
331*4882a593Smuzhiyun  * called from commit_transaction. Writes changed device replace state to
332*4882a593Smuzhiyun  * disk.
333*4882a593Smuzhiyun  */
btrfs_run_dev_replace(struct btrfs_trans_handle * trans)334*4882a593Smuzhiyun int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
335*4882a593Smuzhiyun {
336*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = trans->fs_info;
337*4882a593Smuzhiyun 	int ret;
338*4882a593Smuzhiyun 	struct btrfs_root *dev_root = fs_info->dev_root;
339*4882a593Smuzhiyun 	struct btrfs_path *path;
340*4882a593Smuzhiyun 	struct btrfs_key key;
341*4882a593Smuzhiyun 	struct extent_buffer *eb;
342*4882a593Smuzhiyun 	struct btrfs_dev_replace_item *ptr;
343*4882a593Smuzhiyun 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
344*4882a593Smuzhiyun 
345*4882a593Smuzhiyun 	down_read(&dev_replace->rwsem);
346*4882a593Smuzhiyun 	if (!dev_replace->is_valid ||
347*4882a593Smuzhiyun 	    !dev_replace->item_needs_writeback) {
348*4882a593Smuzhiyun 		up_read(&dev_replace->rwsem);
349*4882a593Smuzhiyun 		return 0;
350*4882a593Smuzhiyun 	}
351*4882a593Smuzhiyun 	up_read(&dev_replace->rwsem);
352*4882a593Smuzhiyun 
353*4882a593Smuzhiyun 	key.objectid = 0;
354*4882a593Smuzhiyun 	key.type = BTRFS_DEV_REPLACE_KEY;
355*4882a593Smuzhiyun 	key.offset = 0;
356*4882a593Smuzhiyun 
357*4882a593Smuzhiyun 	path = btrfs_alloc_path();
358*4882a593Smuzhiyun 	if (!path) {
359*4882a593Smuzhiyun 		ret = -ENOMEM;
360*4882a593Smuzhiyun 		goto out;
361*4882a593Smuzhiyun 	}
362*4882a593Smuzhiyun 	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
363*4882a593Smuzhiyun 	if (ret < 0) {
364*4882a593Smuzhiyun 		btrfs_warn(fs_info,
365*4882a593Smuzhiyun 			   "error %d while searching for dev_replace item!",
366*4882a593Smuzhiyun 			   ret);
367*4882a593Smuzhiyun 		goto out;
368*4882a593Smuzhiyun 	}
369*4882a593Smuzhiyun 
370*4882a593Smuzhiyun 	if (ret == 0 &&
371*4882a593Smuzhiyun 	    btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
372*4882a593Smuzhiyun 		/*
373*4882a593Smuzhiyun 		 * need to delete old one and insert a new one.
374*4882a593Smuzhiyun 		 * Since no attempt is made to recover any old state, if the
375*4882a593Smuzhiyun 		 * dev_replace state is 'running', the data on the target
376*4882a593Smuzhiyun 		 * drive is lost.
377*4882a593Smuzhiyun 		 * It would be possible to recover the state: just make sure
378*4882a593Smuzhiyun 		 * that the beginning of the item is never changed and always
379*4882a593Smuzhiyun 		 * contains all the essential information. Then read this
380*4882a593Smuzhiyun 		 * minimal set of information and use it as a base for the
381*4882a593Smuzhiyun 		 * new state.
382*4882a593Smuzhiyun 		 */
383*4882a593Smuzhiyun 		ret = btrfs_del_item(trans, dev_root, path);
384*4882a593Smuzhiyun 		if (ret != 0) {
385*4882a593Smuzhiyun 			btrfs_warn(fs_info,
386*4882a593Smuzhiyun 				   "delete too small dev_replace item failed %d!",
387*4882a593Smuzhiyun 				   ret);
388*4882a593Smuzhiyun 			goto out;
389*4882a593Smuzhiyun 		}
390*4882a593Smuzhiyun 		ret = 1;
391*4882a593Smuzhiyun 	}
392*4882a593Smuzhiyun 
393*4882a593Smuzhiyun 	if (ret == 1) {
394*4882a593Smuzhiyun 		/* need to insert a new item */
395*4882a593Smuzhiyun 		btrfs_release_path(path);
396*4882a593Smuzhiyun 		ret = btrfs_insert_empty_item(trans, dev_root, path,
397*4882a593Smuzhiyun 					      &key, sizeof(*ptr));
398*4882a593Smuzhiyun 		if (ret < 0) {
399*4882a593Smuzhiyun 			btrfs_warn(fs_info,
400*4882a593Smuzhiyun 				   "insert dev_replace item failed %d!", ret);
401*4882a593Smuzhiyun 			goto out;
402*4882a593Smuzhiyun 		}
403*4882a593Smuzhiyun 	}
404*4882a593Smuzhiyun 
405*4882a593Smuzhiyun 	eb = path->nodes[0];
406*4882a593Smuzhiyun 	ptr = btrfs_item_ptr(eb, path->slots[0],
407*4882a593Smuzhiyun 			     struct btrfs_dev_replace_item);
408*4882a593Smuzhiyun 
409*4882a593Smuzhiyun 	down_write(&dev_replace->rwsem);
410*4882a593Smuzhiyun 	if (dev_replace->srcdev)
411*4882a593Smuzhiyun 		btrfs_set_dev_replace_src_devid(eb, ptr,
412*4882a593Smuzhiyun 			dev_replace->srcdev->devid);
413*4882a593Smuzhiyun 	else
414*4882a593Smuzhiyun 		btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1);
415*4882a593Smuzhiyun 	btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr,
416*4882a593Smuzhiyun 		dev_replace->cont_reading_from_srcdev_mode);
417*4882a593Smuzhiyun 	btrfs_set_dev_replace_replace_state(eb, ptr,
418*4882a593Smuzhiyun 		dev_replace->replace_state);
419*4882a593Smuzhiyun 	btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started);
420*4882a593Smuzhiyun 	btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped);
421*4882a593Smuzhiyun 	btrfs_set_dev_replace_num_write_errors(eb, ptr,
422*4882a593Smuzhiyun 		atomic64_read(&dev_replace->num_write_errors));
423*4882a593Smuzhiyun 	btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr,
424*4882a593Smuzhiyun 		atomic64_read(&dev_replace->num_uncorrectable_read_errors));
425*4882a593Smuzhiyun 	dev_replace->cursor_left_last_write_of_item =
426*4882a593Smuzhiyun 		dev_replace->cursor_left;
427*4882a593Smuzhiyun 	btrfs_set_dev_replace_cursor_left(eb, ptr,
428*4882a593Smuzhiyun 		dev_replace->cursor_left_last_write_of_item);
429*4882a593Smuzhiyun 	btrfs_set_dev_replace_cursor_right(eb, ptr,
430*4882a593Smuzhiyun 		dev_replace->cursor_right);
431*4882a593Smuzhiyun 	dev_replace->item_needs_writeback = 0;
432*4882a593Smuzhiyun 	up_write(&dev_replace->rwsem);
433*4882a593Smuzhiyun 
434*4882a593Smuzhiyun 	btrfs_mark_buffer_dirty(eb);
435*4882a593Smuzhiyun 
436*4882a593Smuzhiyun out:
437*4882a593Smuzhiyun 	btrfs_free_path(path);
438*4882a593Smuzhiyun 
439*4882a593Smuzhiyun 	return ret;
440*4882a593Smuzhiyun }
441*4882a593Smuzhiyun 
btrfs_dev_name(struct btrfs_device * device)442*4882a593Smuzhiyun static char* btrfs_dev_name(struct btrfs_device *device)
443*4882a593Smuzhiyun {
444*4882a593Smuzhiyun 	if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
445*4882a593Smuzhiyun 		return "<missing disk>";
446*4882a593Smuzhiyun 	else
447*4882a593Smuzhiyun 		return rcu_str_deref(device->name);
448*4882a593Smuzhiyun }
449*4882a593Smuzhiyun 
/*
 * Start replacing a source device with the device at @tgtdev_name and run the
 * copy to completion.
 *
 * The source device is looked up from @srcdevid / @srcdev_name.  @read_src is
 * stored as the cont_reading_from_srcdev_mode of the replace.  After the
 * replace state has been committed, the copy is done by btrfs_scrub_dev() and
 * its result is handed to btrfs_dev_replace_finishing().
 *
 * Return the result of btrfs_dev_replace_finishing(), with -EINPROGRESS
 * translated to BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS.  Earlier
 * failures return a negative errno (-ETXTBSY if the source device is pinned
 * by a swapfile) or BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED if a
 * replace is already started or suspended.
 */
static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
		const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
		int read_src)
{
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_device *tgt_device = NULL;
	struct btrfs_device *src_device;
	struct btrfs_trans_handle *trans;
	int scrub_ret;
	int ret;

	src_device = btrfs_find_device_by_devspec(fs_info, srcdevid,
						  srcdev_name);
	if (IS_ERR(src_device))
		return PTR_ERR(src_device);

	if (btrfs_pinned_by_swapfile(fs_info, src_device)) {
		btrfs_warn_in_rcu(fs_info,
	  "cannot replace device %s (devid %llu) due to active swapfile",
			btrfs_dev_name(src_device), src_device->devid);
		return -ETXTBSY;
	}

	/*
	 * Commit the running transaction, if any, to make sure
	 * commit_total_bytes of all the devices are updated.  -ENOENT from
	 * the attach is not treated as an error.
	 */
	trans = btrfs_attach_transaction(root);
	if (IS_ERR(trans)) {
		if (PTR_ERR(trans) != -ENOENT)
			return PTR_ERR(trans);
	} else {
		ret = btrfs_commit_transaction(trans);
		if (ret)
			return ret;
	}

	ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name,
					    src_device, &tgt_device);
	if (ret)
		return ret;

	down_write(&dev_replace->rwsem);
	if (dev_replace->replace_state ==
			BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED ||
	    dev_replace->replace_state ==
			BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED) {
		/* A replace is already active; this is not expected here. */
		ASSERT(0);
		ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
		up_write(&dev_replace->rwsem);
		goto leave;
	}

	dev_replace->cont_reading_from_srcdev_mode = read_src;
	dev_replace->srcdev = src_device;
	dev_replace->tgtdev = tgt_device;

	btrfs_info_in_rcu(fs_info,
		      "dev_replace from %s (devid %llu) to %s started",
		      btrfs_dev_name(src_device),
		      src_device->devid,
		      rcu_str_deref(tgt_device->name));

	/*
	 * From now on, the writes to the srcdev are all duplicated to go to
	 * the tgtdev as well (refer to btrfs_map_block()).
	 */
	dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
	dev_replace->time_started = ktime_get_real_seconds();
	dev_replace->cursor_left = 0;
	dev_replace->committed_cursor_left = 0;
	dev_replace->cursor_left_last_write_of_item = 0;
	dev_replace->cursor_right = 0;
	dev_replace->is_valid = 1;
	dev_replace->item_needs_writeback = 1;
	atomic64_set(&dev_replace->num_write_errors, 0);
	atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
	up_write(&dev_replace->rwsem);

	/* A sysfs failure is only logged, the replace still proceeds. */
	ret = btrfs_sysfs_add_device(tgt_device);
	if (ret)
		btrfs_err(fs_info, "kobj add dev failed %d", ret);

	btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);

	/* Commit dev_replace state and reserve 1 item for it. */
	trans = btrfs_start_transaction(root, 1);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		/* Undo the state change made above before bailing out. */
		down_write(&dev_replace->rwsem);
		dev_replace->replace_state =
			BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
		dev_replace->srcdev = NULL;
		dev_replace->tgtdev = NULL;
		up_write(&dev_replace->rwsem);
		goto leave;
	}

	ret = btrfs_commit_transaction(trans);
	WARN_ON(ret);

	/* The disk copy procedure reuses the scrub code. */
	scrub_ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
				    btrfs_device_get_total_bytes(src_device),
				    &dev_replace->scrub_progress, 0, 1);

	ret = btrfs_dev_replace_finishing(fs_info, scrub_ret);
	if (ret == -EINPROGRESS)
		ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;

	return ret;

leave:
	btrfs_destroy_dev_replace_tgtdev(tgt_device);
	return ret;
}
568*4882a593Smuzhiyun 
btrfs_dev_replace_by_ioctl(struct btrfs_fs_info * fs_info,struct btrfs_ioctl_dev_replace_args * args)569*4882a593Smuzhiyun int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
570*4882a593Smuzhiyun 			    struct btrfs_ioctl_dev_replace_args *args)
571*4882a593Smuzhiyun {
572*4882a593Smuzhiyun 	int ret;
573*4882a593Smuzhiyun 
574*4882a593Smuzhiyun 	switch (args->start.cont_reading_from_srcdev_mode) {
575*4882a593Smuzhiyun 	case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
576*4882a593Smuzhiyun 	case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
577*4882a593Smuzhiyun 		break;
578*4882a593Smuzhiyun 	default:
579*4882a593Smuzhiyun 		return -EINVAL;
580*4882a593Smuzhiyun 	}
581*4882a593Smuzhiyun 
582*4882a593Smuzhiyun 	if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
583*4882a593Smuzhiyun 	    args->start.tgtdev_name[0] == '\0')
584*4882a593Smuzhiyun 		return -EINVAL;
585*4882a593Smuzhiyun 
586*4882a593Smuzhiyun 	ret = btrfs_dev_replace_start(fs_info, args->start.tgtdev_name,
587*4882a593Smuzhiyun 					args->start.srcdevid,
588*4882a593Smuzhiyun 					args->start.srcdev_name,
589*4882a593Smuzhiyun 					args->start.cont_reading_from_srcdev_mode);
590*4882a593Smuzhiyun 	args->result = ret;
591*4882a593Smuzhiyun 	/* don't warn if EINPROGRESS, someone else might be running scrub */
592*4882a593Smuzhiyun 	if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS ||
593*4882a593Smuzhiyun 	    ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR)
594*4882a593Smuzhiyun 		return 0;
595*4882a593Smuzhiyun 
596*4882a593Smuzhiyun 	return ret;
597*4882a593Smuzhiyun }
598*4882a593Smuzhiyun 
599*4882a593Smuzhiyun /*
600*4882a593Smuzhiyun  * blocked until all in-flight bios operations are finished.
601*4882a593Smuzhiyun  */
btrfs_rm_dev_replace_blocked(struct btrfs_fs_info * fs_info)602*4882a593Smuzhiyun static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
603*4882a593Smuzhiyun {
604*4882a593Smuzhiyun 	set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
605*4882a593Smuzhiyun 	wait_event(fs_info->dev_replace.replace_wait, !percpu_counter_sum(
606*4882a593Smuzhiyun 		   &fs_info->dev_replace.bio_counter));
607*4882a593Smuzhiyun }
608*4882a593Smuzhiyun 
609*4882a593Smuzhiyun /*
610*4882a593Smuzhiyun  * we have removed target device, it is safe to allow new bios request.
611*4882a593Smuzhiyun  */
btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info * fs_info)612*4882a593Smuzhiyun static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
613*4882a593Smuzhiyun {
614*4882a593Smuzhiyun 	clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
615*4882a593Smuzhiyun 	wake_up(&fs_info->dev_replace.replace_wait);
616*4882a593Smuzhiyun }
617*4882a593Smuzhiyun 
618*4882a593Smuzhiyun /*
619*4882a593Smuzhiyun  * When finishing the device replace, before swapping the source device with the
620*4882a593Smuzhiyun  * target device we must update the chunk allocation state in the target device,
621*4882a593Smuzhiyun  * as it is empty because replace works by directly copying the chunks and not
622*4882a593Smuzhiyun  * through the normal chunk allocation path.
623*4882a593Smuzhiyun  */
btrfs_set_target_alloc_state(struct btrfs_device * srcdev,struct btrfs_device * tgtdev)624*4882a593Smuzhiyun static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev,
625*4882a593Smuzhiyun 					struct btrfs_device *tgtdev)
626*4882a593Smuzhiyun {
627*4882a593Smuzhiyun 	struct extent_state *cached_state = NULL;
628*4882a593Smuzhiyun 	u64 start = 0;
629*4882a593Smuzhiyun 	u64 found_start;
630*4882a593Smuzhiyun 	u64 found_end;
631*4882a593Smuzhiyun 	int ret = 0;
632*4882a593Smuzhiyun 
633*4882a593Smuzhiyun 	lockdep_assert_held(&srcdev->fs_info->chunk_mutex);
634*4882a593Smuzhiyun 
635*4882a593Smuzhiyun 	while (!find_first_extent_bit(&srcdev->alloc_state, start,
636*4882a593Smuzhiyun 				      &found_start, &found_end,
637*4882a593Smuzhiyun 				      CHUNK_ALLOCATED, &cached_state)) {
638*4882a593Smuzhiyun 		ret = set_extent_bits(&tgtdev->alloc_state, found_start,
639*4882a593Smuzhiyun 				      found_end, CHUNK_ALLOCATED);
640*4882a593Smuzhiyun 		if (ret)
641*4882a593Smuzhiyun 			break;
642*4882a593Smuzhiyun 		start = found_end + 1;
643*4882a593Smuzhiyun 	}
644*4882a593Smuzhiyun 
645*4882a593Smuzhiyun 	free_extent_state(cached_state);
646*4882a593Smuzhiyun 	return ret;
647*4882a593Smuzhiyun }
648*4882a593Smuzhiyun 
btrfs_dev_replace_update_device_in_mapping_tree(struct btrfs_fs_info * fs_info,struct btrfs_device * srcdev,struct btrfs_device * tgtdev)649*4882a593Smuzhiyun static void btrfs_dev_replace_update_device_in_mapping_tree(
650*4882a593Smuzhiyun 						struct btrfs_fs_info *fs_info,
651*4882a593Smuzhiyun 						struct btrfs_device *srcdev,
652*4882a593Smuzhiyun 						struct btrfs_device *tgtdev)
653*4882a593Smuzhiyun {
654*4882a593Smuzhiyun 	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
655*4882a593Smuzhiyun 	struct extent_map *em;
656*4882a593Smuzhiyun 	struct map_lookup *map;
657*4882a593Smuzhiyun 	u64 start = 0;
658*4882a593Smuzhiyun 	int i;
659*4882a593Smuzhiyun 
660*4882a593Smuzhiyun 	write_lock(&em_tree->lock);
661*4882a593Smuzhiyun 	do {
662*4882a593Smuzhiyun 		em = lookup_extent_mapping(em_tree, start, (u64)-1);
663*4882a593Smuzhiyun 		if (!em)
664*4882a593Smuzhiyun 			break;
665*4882a593Smuzhiyun 		map = em->map_lookup;
666*4882a593Smuzhiyun 		for (i = 0; i < map->num_stripes; i++)
667*4882a593Smuzhiyun 			if (srcdev == map->stripes[i].dev)
668*4882a593Smuzhiyun 				map->stripes[i].dev = tgtdev;
669*4882a593Smuzhiyun 		start = em->start + em->len;
670*4882a593Smuzhiyun 		free_extent_map(em);
671*4882a593Smuzhiyun 	} while (start);
672*4882a593Smuzhiyun 	write_unlock(&em_tree->lock);
673*4882a593Smuzhiyun }
674*4882a593Smuzhiyun 
btrfs_dev_replace_finishing(struct btrfs_fs_info * fs_info,int scrub_ret)675*4882a593Smuzhiyun static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
676*4882a593Smuzhiyun 				       int scrub_ret)
677*4882a593Smuzhiyun {
678*4882a593Smuzhiyun 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
679*4882a593Smuzhiyun 	struct btrfs_device *tgt_device;
680*4882a593Smuzhiyun 	struct btrfs_device *src_device;
681*4882a593Smuzhiyun 	struct btrfs_root *root = fs_info->tree_root;
682*4882a593Smuzhiyun 	u8 uuid_tmp[BTRFS_UUID_SIZE];
683*4882a593Smuzhiyun 	struct btrfs_trans_handle *trans;
684*4882a593Smuzhiyun 	int ret = 0;
685*4882a593Smuzhiyun 
686*4882a593Smuzhiyun 	/* don't allow cancel or unmount to disturb the finishing procedure */
687*4882a593Smuzhiyun 	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
688*4882a593Smuzhiyun 
689*4882a593Smuzhiyun 	down_read(&dev_replace->rwsem);
690*4882a593Smuzhiyun 	/* was the operation canceled, or is it finished? */
691*4882a593Smuzhiyun 	if (dev_replace->replace_state !=
692*4882a593Smuzhiyun 	    BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
693*4882a593Smuzhiyun 		up_read(&dev_replace->rwsem);
694*4882a593Smuzhiyun 		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
695*4882a593Smuzhiyun 		return 0;
696*4882a593Smuzhiyun 	}
697*4882a593Smuzhiyun 
698*4882a593Smuzhiyun 	tgt_device = dev_replace->tgtdev;
699*4882a593Smuzhiyun 	src_device = dev_replace->srcdev;
700*4882a593Smuzhiyun 	up_read(&dev_replace->rwsem);
701*4882a593Smuzhiyun 
702*4882a593Smuzhiyun 	/*
703*4882a593Smuzhiyun 	 * flush all outstanding I/O and inode extent mappings before the
704*4882a593Smuzhiyun 	 * copy operation is declared as being finished
705*4882a593Smuzhiyun 	 */
706*4882a593Smuzhiyun 	ret = btrfs_start_delalloc_roots(fs_info, U64_MAX, false);
707*4882a593Smuzhiyun 	if (ret) {
708*4882a593Smuzhiyun 		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
709*4882a593Smuzhiyun 		return ret;
710*4882a593Smuzhiyun 	}
711*4882a593Smuzhiyun 	btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
712*4882a593Smuzhiyun 
713*4882a593Smuzhiyun 	if (!scrub_ret)
714*4882a593Smuzhiyun 		btrfs_reada_remove_dev(src_device);
715*4882a593Smuzhiyun 
716*4882a593Smuzhiyun 	/*
717*4882a593Smuzhiyun 	 * We have to use this loop approach because at this point src_device
718*4882a593Smuzhiyun 	 * has to be available for transaction commit to complete, yet new
719*4882a593Smuzhiyun 	 * chunks shouldn't be allocated on the device.
720*4882a593Smuzhiyun 	 */
721*4882a593Smuzhiyun 	while (1) {
722*4882a593Smuzhiyun 		trans = btrfs_start_transaction(root, 0);
723*4882a593Smuzhiyun 		if (IS_ERR(trans)) {
724*4882a593Smuzhiyun 			btrfs_reada_undo_remove_dev(src_device);
725*4882a593Smuzhiyun 			mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
726*4882a593Smuzhiyun 			return PTR_ERR(trans);
727*4882a593Smuzhiyun 		}
728*4882a593Smuzhiyun 		ret = btrfs_commit_transaction(trans);
729*4882a593Smuzhiyun 		WARN_ON(ret);
730*4882a593Smuzhiyun 
731*4882a593Smuzhiyun 		/* Prevent write_all_supers() during the finishing procedure */
732*4882a593Smuzhiyun 		mutex_lock(&fs_info->fs_devices->device_list_mutex);
733*4882a593Smuzhiyun 		/* Prevent new chunks being allocated on the source device */
734*4882a593Smuzhiyun 		mutex_lock(&fs_info->chunk_mutex);
735*4882a593Smuzhiyun 
736*4882a593Smuzhiyun 		if (!list_empty(&src_device->post_commit_list)) {
737*4882a593Smuzhiyun 			mutex_unlock(&fs_info->fs_devices->device_list_mutex);
738*4882a593Smuzhiyun 			mutex_unlock(&fs_info->chunk_mutex);
739*4882a593Smuzhiyun 		} else {
740*4882a593Smuzhiyun 			break;
741*4882a593Smuzhiyun 		}
742*4882a593Smuzhiyun 	}
743*4882a593Smuzhiyun 
744*4882a593Smuzhiyun 	down_write(&dev_replace->rwsem);
745*4882a593Smuzhiyun 	dev_replace->replace_state =
746*4882a593Smuzhiyun 		scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
747*4882a593Smuzhiyun 			  : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
748*4882a593Smuzhiyun 	dev_replace->tgtdev = NULL;
749*4882a593Smuzhiyun 	dev_replace->srcdev = NULL;
750*4882a593Smuzhiyun 	dev_replace->time_stopped = ktime_get_real_seconds();
751*4882a593Smuzhiyun 	dev_replace->item_needs_writeback = 1;
752*4882a593Smuzhiyun 
753*4882a593Smuzhiyun 	/*
754*4882a593Smuzhiyun 	 * Update allocation state in the new device and replace the old device
755*4882a593Smuzhiyun 	 * with the new one in the mapping tree.
756*4882a593Smuzhiyun 	 */
757*4882a593Smuzhiyun 	if (!scrub_ret) {
758*4882a593Smuzhiyun 		scrub_ret = btrfs_set_target_alloc_state(src_device, tgt_device);
759*4882a593Smuzhiyun 		if (scrub_ret)
760*4882a593Smuzhiyun 			goto error;
761*4882a593Smuzhiyun 		btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
762*4882a593Smuzhiyun 								src_device,
763*4882a593Smuzhiyun 								tgt_device);
764*4882a593Smuzhiyun 	} else {
765*4882a593Smuzhiyun 		if (scrub_ret != -ECANCELED)
766*4882a593Smuzhiyun 			btrfs_err_in_rcu(fs_info,
767*4882a593Smuzhiyun 				 "btrfs_scrub_dev(%s, %llu, %s) failed %d",
768*4882a593Smuzhiyun 				 btrfs_dev_name(src_device),
769*4882a593Smuzhiyun 				 src_device->devid,
770*4882a593Smuzhiyun 				 rcu_str_deref(tgt_device->name), scrub_ret);
771*4882a593Smuzhiyun error:
772*4882a593Smuzhiyun 		up_write(&dev_replace->rwsem);
773*4882a593Smuzhiyun 		mutex_unlock(&fs_info->chunk_mutex);
774*4882a593Smuzhiyun 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
775*4882a593Smuzhiyun 		btrfs_reada_undo_remove_dev(src_device);
776*4882a593Smuzhiyun 		btrfs_rm_dev_replace_blocked(fs_info);
777*4882a593Smuzhiyun 		if (tgt_device)
778*4882a593Smuzhiyun 			btrfs_destroy_dev_replace_tgtdev(tgt_device);
779*4882a593Smuzhiyun 		btrfs_rm_dev_replace_unblocked(fs_info);
780*4882a593Smuzhiyun 		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
781*4882a593Smuzhiyun 
782*4882a593Smuzhiyun 		return scrub_ret;
783*4882a593Smuzhiyun 	}
784*4882a593Smuzhiyun 
785*4882a593Smuzhiyun 	btrfs_info_in_rcu(fs_info,
786*4882a593Smuzhiyun 			  "dev_replace from %s (devid %llu) to %s finished",
787*4882a593Smuzhiyun 			  btrfs_dev_name(src_device),
788*4882a593Smuzhiyun 			  src_device->devid,
789*4882a593Smuzhiyun 			  rcu_str_deref(tgt_device->name));
790*4882a593Smuzhiyun 	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &tgt_device->dev_state);
791*4882a593Smuzhiyun 	tgt_device->devid = src_device->devid;
792*4882a593Smuzhiyun 	src_device->devid = BTRFS_DEV_REPLACE_DEVID;
793*4882a593Smuzhiyun 	memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
794*4882a593Smuzhiyun 	memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
795*4882a593Smuzhiyun 	memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
796*4882a593Smuzhiyun 	btrfs_device_set_total_bytes(tgt_device, src_device->total_bytes);
797*4882a593Smuzhiyun 	btrfs_device_set_disk_total_bytes(tgt_device,
798*4882a593Smuzhiyun 					  src_device->disk_total_bytes);
799*4882a593Smuzhiyun 	btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used);
800*4882a593Smuzhiyun 	tgt_device->commit_bytes_used = src_device->bytes_used;
801*4882a593Smuzhiyun 
802*4882a593Smuzhiyun 	btrfs_assign_next_active_device(src_device, tgt_device);
803*4882a593Smuzhiyun 
804*4882a593Smuzhiyun 	list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
805*4882a593Smuzhiyun 	fs_info->fs_devices->rw_devices++;
806*4882a593Smuzhiyun 
807*4882a593Smuzhiyun 	up_write(&dev_replace->rwsem);
808*4882a593Smuzhiyun 	btrfs_rm_dev_replace_blocked(fs_info);
809*4882a593Smuzhiyun 
810*4882a593Smuzhiyun 	btrfs_rm_dev_replace_remove_srcdev(src_device);
811*4882a593Smuzhiyun 
812*4882a593Smuzhiyun 	btrfs_rm_dev_replace_unblocked(fs_info);
813*4882a593Smuzhiyun 
814*4882a593Smuzhiyun 	/*
815*4882a593Smuzhiyun 	 * Increment dev_stats_ccnt so that btrfs_run_dev_stats() will
816*4882a593Smuzhiyun 	 * update on-disk dev stats value during commit transaction
817*4882a593Smuzhiyun 	 */
818*4882a593Smuzhiyun 	atomic_inc(&tgt_device->dev_stats_ccnt);
819*4882a593Smuzhiyun 
820*4882a593Smuzhiyun 	/*
821*4882a593Smuzhiyun 	 * this is again a consistent state where no dev_replace procedure
822*4882a593Smuzhiyun 	 * is running, the target device is part of the filesystem, the
823*4882a593Smuzhiyun 	 * source device is not part of the filesystem anymore and its 1st
824*4882a593Smuzhiyun 	 * superblock is scratched out so that it is no longer marked to
825*4882a593Smuzhiyun 	 * belong to this filesystem.
826*4882a593Smuzhiyun 	 */
827*4882a593Smuzhiyun 	mutex_unlock(&fs_info->chunk_mutex);
828*4882a593Smuzhiyun 	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
829*4882a593Smuzhiyun 
830*4882a593Smuzhiyun 	/* replace the sysfs entry */
831*4882a593Smuzhiyun 	btrfs_sysfs_remove_device(src_device);
832*4882a593Smuzhiyun 	btrfs_sysfs_update_devid(tgt_device);
833*4882a593Smuzhiyun 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state))
834*4882a593Smuzhiyun 		btrfs_scratch_superblocks(fs_info, src_device->bdev,
835*4882a593Smuzhiyun 					  src_device->name->str);
836*4882a593Smuzhiyun 
837*4882a593Smuzhiyun 	/* write back the superblocks */
838*4882a593Smuzhiyun 	trans = btrfs_start_transaction(root, 0);
839*4882a593Smuzhiyun 	if (!IS_ERR(trans))
840*4882a593Smuzhiyun 		btrfs_commit_transaction(trans);
841*4882a593Smuzhiyun 
842*4882a593Smuzhiyun 	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
843*4882a593Smuzhiyun 
844*4882a593Smuzhiyun 	btrfs_rm_dev_replace_free_srcdev(src_device);
845*4882a593Smuzhiyun 
846*4882a593Smuzhiyun 	return 0;
847*4882a593Smuzhiyun }
848*4882a593Smuzhiyun 
849*4882a593Smuzhiyun /*
850*4882a593Smuzhiyun  * Read progress of device replace status according to the state and last
851*4882a593Smuzhiyun  * stored position. The value format is the same as for
852*4882a593Smuzhiyun  * btrfs_dev_replace::progress_1000
853*4882a593Smuzhiyun  */
btrfs_dev_replace_progress(struct btrfs_fs_info * fs_info)854*4882a593Smuzhiyun static u64 btrfs_dev_replace_progress(struct btrfs_fs_info *fs_info)
855*4882a593Smuzhiyun {
856*4882a593Smuzhiyun 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
857*4882a593Smuzhiyun 	u64 ret = 0;
858*4882a593Smuzhiyun 
859*4882a593Smuzhiyun 	switch (dev_replace->replace_state) {
860*4882a593Smuzhiyun 	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
861*4882a593Smuzhiyun 	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
862*4882a593Smuzhiyun 		ret = 0;
863*4882a593Smuzhiyun 		break;
864*4882a593Smuzhiyun 	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
865*4882a593Smuzhiyun 		ret = 1000;
866*4882a593Smuzhiyun 		break;
867*4882a593Smuzhiyun 	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
868*4882a593Smuzhiyun 	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
869*4882a593Smuzhiyun 		ret = div64_u64(dev_replace->cursor_left,
870*4882a593Smuzhiyun 				div_u64(btrfs_device_get_total_bytes(
871*4882a593Smuzhiyun 						dev_replace->srcdev), 1000));
872*4882a593Smuzhiyun 		break;
873*4882a593Smuzhiyun 	}
874*4882a593Smuzhiyun 
875*4882a593Smuzhiyun 	return ret;
876*4882a593Smuzhiyun }
877*4882a593Smuzhiyun 
btrfs_dev_replace_status(struct btrfs_fs_info * fs_info,struct btrfs_ioctl_dev_replace_args * args)878*4882a593Smuzhiyun void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
879*4882a593Smuzhiyun 			      struct btrfs_ioctl_dev_replace_args *args)
880*4882a593Smuzhiyun {
881*4882a593Smuzhiyun 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
882*4882a593Smuzhiyun 
883*4882a593Smuzhiyun 	down_read(&dev_replace->rwsem);
884*4882a593Smuzhiyun 	/* even if !dev_replace_is_valid, the values are good enough for
885*4882a593Smuzhiyun 	 * the replace_status ioctl */
886*4882a593Smuzhiyun 	args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
887*4882a593Smuzhiyun 	args->status.replace_state = dev_replace->replace_state;
888*4882a593Smuzhiyun 	args->status.time_started = dev_replace->time_started;
889*4882a593Smuzhiyun 	args->status.time_stopped = dev_replace->time_stopped;
890*4882a593Smuzhiyun 	args->status.num_write_errors =
891*4882a593Smuzhiyun 		atomic64_read(&dev_replace->num_write_errors);
892*4882a593Smuzhiyun 	args->status.num_uncorrectable_read_errors =
893*4882a593Smuzhiyun 		atomic64_read(&dev_replace->num_uncorrectable_read_errors);
894*4882a593Smuzhiyun 	args->status.progress_1000 = btrfs_dev_replace_progress(fs_info);
895*4882a593Smuzhiyun 	up_read(&dev_replace->rwsem);
896*4882a593Smuzhiyun }
897*4882a593Smuzhiyun 
btrfs_dev_replace_cancel(struct btrfs_fs_info * fs_info)898*4882a593Smuzhiyun int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
899*4882a593Smuzhiyun {
900*4882a593Smuzhiyun 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
901*4882a593Smuzhiyun 	struct btrfs_device *tgt_device = NULL;
902*4882a593Smuzhiyun 	struct btrfs_device *src_device = NULL;
903*4882a593Smuzhiyun 	struct btrfs_trans_handle *trans;
904*4882a593Smuzhiyun 	struct btrfs_root *root = fs_info->tree_root;
905*4882a593Smuzhiyun 	int result;
906*4882a593Smuzhiyun 	int ret;
907*4882a593Smuzhiyun 
908*4882a593Smuzhiyun 	if (sb_rdonly(fs_info->sb))
909*4882a593Smuzhiyun 		return -EROFS;
910*4882a593Smuzhiyun 
911*4882a593Smuzhiyun 	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
912*4882a593Smuzhiyun 	down_write(&dev_replace->rwsem);
913*4882a593Smuzhiyun 	switch (dev_replace->replace_state) {
914*4882a593Smuzhiyun 	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
915*4882a593Smuzhiyun 	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
916*4882a593Smuzhiyun 	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
917*4882a593Smuzhiyun 		result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
918*4882a593Smuzhiyun 		up_write(&dev_replace->rwsem);
919*4882a593Smuzhiyun 		break;
920*4882a593Smuzhiyun 	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
921*4882a593Smuzhiyun 		tgt_device = dev_replace->tgtdev;
922*4882a593Smuzhiyun 		src_device = dev_replace->srcdev;
923*4882a593Smuzhiyun 		up_write(&dev_replace->rwsem);
924*4882a593Smuzhiyun 		ret = btrfs_scrub_cancel(fs_info);
925*4882a593Smuzhiyun 		if (ret < 0) {
926*4882a593Smuzhiyun 			result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
927*4882a593Smuzhiyun 		} else {
928*4882a593Smuzhiyun 			result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
929*4882a593Smuzhiyun 			/*
930*4882a593Smuzhiyun 			 * btrfs_dev_replace_finishing() will handle the
931*4882a593Smuzhiyun 			 * cleanup part
932*4882a593Smuzhiyun 			 */
933*4882a593Smuzhiyun 			btrfs_info_in_rcu(fs_info,
934*4882a593Smuzhiyun 				"dev_replace from %s (devid %llu) to %s canceled",
935*4882a593Smuzhiyun 				btrfs_dev_name(src_device), src_device->devid,
936*4882a593Smuzhiyun 				btrfs_dev_name(tgt_device));
937*4882a593Smuzhiyun 		}
938*4882a593Smuzhiyun 		break;
939*4882a593Smuzhiyun 	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
940*4882a593Smuzhiyun 		/*
941*4882a593Smuzhiyun 		 * Scrub doing the replace isn't running so we need to do the
942*4882a593Smuzhiyun 		 * cleanup step of btrfs_dev_replace_finishing() here
943*4882a593Smuzhiyun 		 */
944*4882a593Smuzhiyun 		result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
945*4882a593Smuzhiyun 		tgt_device = dev_replace->tgtdev;
946*4882a593Smuzhiyun 		src_device = dev_replace->srcdev;
947*4882a593Smuzhiyun 		dev_replace->tgtdev = NULL;
948*4882a593Smuzhiyun 		dev_replace->srcdev = NULL;
949*4882a593Smuzhiyun 		dev_replace->replace_state =
950*4882a593Smuzhiyun 				BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
951*4882a593Smuzhiyun 		dev_replace->time_stopped = ktime_get_real_seconds();
952*4882a593Smuzhiyun 		dev_replace->item_needs_writeback = 1;
953*4882a593Smuzhiyun 
954*4882a593Smuzhiyun 		up_write(&dev_replace->rwsem);
955*4882a593Smuzhiyun 
956*4882a593Smuzhiyun 		/* Scrub for replace must not be running in suspended state */
957*4882a593Smuzhiyun 		btrfs_scrub_cancel(fs_info);
958*4882a593Smuzhiyun 
959*4882a593Smuzhiyun 		trans = btrfs_start_transaction(root, 0);
960*4882a593Smuzhiyun 		if (IS_ERR(trans)) {
961*4882a593Smuzhiyun 			mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
962*4882a593Smuzhiyun 			return PTR_ERR(trans);
963*4882a593Smuzhiyun 		}
964*4882a593Smuzhiyun 		ret = btrfs_commit_transaction(trans);
965*4882a593Smuzhiyun 		WARN_ON(ret);
966*4882a593Smuzhiyun 
967*4882a593Smuzhiyun 		btrfs_info_in_rcu(fs_info,
968*4882a593Smuzhiyun 		"suspended dev_replace from %s (devid %llu) to %s canceled",
969*4882a593Smuzhiyun 			btrfs_dev_name(src_device), src_device->devid,
970*4882a593Smuzhiyun 			btrfs_dev_name(tgt_device));
971*4882a593Smuzhiyun 
972*4882a593Smuzhiyun 		if (tgt_device)
973*4882a593Smuzhiyun 			btrfs_destroy_dev_replace_tgtdev(tgt_device);
974*4882a593Smuzhiyun 		break;
975*4882a593Smuzhiyun 	default:
976*4882a593Smuzhiyun 		up_write(&dev_replace->rwsem);
977*4882a593Smuzhiyun 		result = -EINVAL;
978*4882a593Smuzhiyun 	}
979*4882a593Smuzhiyun 
980*4882a593Smuzhiyun 	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
981*4882a593Smuzhiyun 	return result;
982*4882a593Smuzhiyun }
983*4882a593Smuzhiyun 
btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info * fs_info)984*4882a593Smuzhiyun void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
985*4882a593Smuzhiyun {
986*4882a593Smuzhiyun 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
987*4882a593Smuzhiyun 
988*4882a593Smuzhiyun 	mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
989*4882a593Smuzhiyun 	down_write(&dev_replace->rwsem);
990*4882a593Smuzhiyun 
991*4882a593Smuzhiyun 	switch (dev_replace->replace_state) {
992*4882a593Smuzhiyun 	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
993*4882a593Smuzhiyun 	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
994*4882a593Smuzhiyun 	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
995*4882a593Smuzhiyun 	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
996*4882a593Smuzhiyun 		break;
997*4882a593Smuzhiyun 	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
998*4882a593Smuzhiyun 		dev_replace->replace_state =
999*4882a593Smuzhiyun 			BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
1000*4882a593Smuzhiyun 		dev_replace->time_stopped = ktime_get_real_seconds();
1001*4882a593Smuzhiyun 		dev_replace->item_needs_writeback = 1;
1002*4882a593Smuzhiyun 		btrfs_info(fs_info, "suspending dev_replace for unmount");
1003*4882a593Smuzhiyun 		break;
1004*4882a593Smuzhiyun 	}
1005*4882a593Smuzhiyun 
1006*4882a593Smuzhiyun 	up_write(&dev_replace->rwsem);
1007*4882a593Smuzhiyun 	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
1008*4882a593Smuzhiyun }
1009*4882a593Smuzhiyun 
1010*4882a593Smuzhiyun /* resume dev_replace procedure that was interrupted by unmount */
btrfs_resume_dev_replace_async(struct btrfs_fs_info * fs_info)1011*4882a593Smuzhiyun int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
1012*4882a593Smuzhiyun {
1013*4882a593Smuzhiyun 	struct task_struct *task;
1014*4882a593Smuzhiyun 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
1015*4882a593Smuzhiyun 
1016*4882a593Smuzhiyun 	down_write(&dev_replace->rwsem);
1017*4882a593Smuzhiyun 
1018*4882a593Smuzhiyun 	switch (dev_replace->replace_state) {
1019*4882a593Smuzhiyun 	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
1020*4882a593Smuzhiyun 	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
1021*4882a593Smuzhiyun 	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
1022*4882a593Smuzhiyun 		up_write(&dev_replace->rwsem);
1023*4882a593Smuzhiyun 		return 0;
1024*4882a593Smuzhiyun 	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
1025*4882a593Smuzhiyun 		break;
1026*4882a593Smuzhiyun 	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
1027*4882a593Smuzhiyun 		dev_replace->replace_state =
1028*4882a593Smuzhiyun 			BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
1029*4882a593Smuzhiyun 		break;
1030*4882a593Smuzhiyun 	}
1031*4882a593Smuzhiyun 	if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
1032*4882a593Smuzhiyun 		btrfs_info(fs_info,
1033*4882a593Smuzhiyun 			   "cannot continue dev_replace, tgtdev is missing");
1034*4882a593Smuzhiyun 		btrfs_info(fs_info,
1035*4882a593Smuzhiyun 			   "you may cancel the operation after 'mount -o degraded'");
1036*4882a593Smuzhiyun 		dev_replace->replace_state =
1037*4882a593Smuzhiyun 					BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
1038*4882a593Smuzhiyun 		up_write(&dev_replace->rwsem);
1039*4882a593Smuzhiyun 		return 0;
1040*4882a593Smuzhiyun 	}
1041*4882a593Smuzhiyun 	up_write(&dev_replace->rwsem);
1042*4882a593Smuzhiyun 
1043*4882a593Smuzhiyun 	/*
1044*4882a593Smuzhiyun 	 * This could collide with a paused balance, but the exclusive op logic
1045*4882a593Smuzhiyun 	 * should never allow both to start and pause. We don't want to allow
1046*4882a593Smuzhiyun 	 * dev-replace to start anyway.
1047*4882a593Smuzhiyun 	 */
1048*4882a593Smuzhiyun 	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) {
1049*4882a593Smuzhiyun 		down_write(&dev_replace->rwsem);
1050*4882a593Smuzhiyun 		dev_replace->replace_state =
1051*4882a593Smuzhiyun 					BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
1052*4882a593Smuzhiyun 		up_write(&dev_replace->rwsem);
1053*4882a593Smuzhiyun 		btrfs_info(fs_info,
1054*4882a593Smuzhiyun 		"cannot resume dev-replace, other exclusive operation running");
1055*4882a593Smuzhiyun 		return 0;
1056*4882a593Smuzhiyun 	}
1057*4882a593Smuzhiyun 
1058*4882a593Smuzhiyun 	task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
1059*4882a593Smuzhiyun 	return PTR_ERR_OR_ZERO(task);
1060*4882a593Smuzhiyun }
1061*4882a593Smuzhiyun 
btrfs_dev_replace_kthread(void * data)1062*4882a593Smuzhiyun static int btrfs_dev_replace_kthread(void *data)
1063*4882a593Smuzhiyun {
1064*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = data;
1065*4882a593Smuzhiyun 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
1066*4882a593Smuzhiyun 	u64 progress;
1067*4882a593Smuzhiyun 	int ret;
1068*4882a593Smuzhiyun 
1069*4882a593Smuzhiyun 	progress = btrfs_dev_replace_progress(fs_info);
1070*4882a593Smuzhiyun 	progress = div_u64(progress, 10);
1071*4882a593Smuzhiyun 	btrfs_info_in_rcu(fs_info,
1072*4882a593Smuzhiyun 		"continuing dev_replace from %s (devid %llu) to target %s @%u%%",
1073*4882a593Smuzhiyun 		btrfs_dev_name(dev_replace->srcdev),
1074*4882a593Smuzhiyun 		dev_replace->srcdev->devid,
1075*4882a593Smuzhiyun 		btrfs_dev_name(dev_replace->tgtdev),
1076*4882a593Smuzhiyun 		(unsigned int)progress);
1077*4882a593Smuzhiyun 
1078*4882a593Smuzhiyun 	ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
1079*4882a593Smuzhiyun 			      dev_replace->committed_cursor_left,
1080*4882a593Smuzhiyun 			      btrfs_device_get_total_bytes(dev_replace->srcdev),
1081*4882a593Smuzhiyun 			      &dev_replace->scrub_progress, 0, 1);
1082*4882a593Smuzhiyun 	ret = btrfs_dev_replace_finishing(fs_info, ret);
1083*4882a593Smuzhiyun 	WARN_ON(ret && ret != -ECANCELED);
1084*4882a593Smuzhiyun 
1085*4882a593Smuzhiyun 	btrfs_exclop_finish(fs_info);
1086*4882a593Smuzhiyun 	return 0;
1087*4882a593Smuzhiyun }
1088*4882a593Smuzhiyun 
btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace * dev_replace)1089*4882a593Smuzhiyun int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
1090*4882a593Smuzhiyun {
1091*4882a593Smuzhiyun 	if (!dev_replace->is_valid)
1092*4882a593Smuzhiyun 		return 0;
1093*4882a593Smuzhiyun 
1094*4882a593Smuzhiyun 	switch (dev_replace->replace_state) {
1095*4882a593Smuzhiyun 	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
1096*4882a593Smuzhiyun 	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
1097*4882a593Smuzhiyun 	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
1098*4882a593Smuzhiyun 		return 0;
1099*4882a593Smuzhiyun 	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
1100*4882a593Smuzhiyun 	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
1101*4882a593Smuzhiyun 		/*
1102*4882a593Smuzhiyun 		 * return true even if tgtdev is missing (this is
1103*4882a593Smuzhiyun 		 * something that can happen if the dev_replace
1104*4882a593Smuzhiyun 		 * procedure is suspended by an umount and then
1105*4882a593Smuzhiyun 		 * the tgtdev is missing (or "btrfs dev scan") was
1106*4882a593Smuzhiyun 		 * not called and the filesystem is remounted
1107*4882a593Smuzhiyun 		 * in degraded state. This does not stop the
1108*4882a593Smuzhiyun 		 * dev_replace procedure. It needs to be canceled
1109*4882a593Smuzhiyun 		 * manually if the cancellation is wanted.
1110*4882a593Smuzhiyun 		 */
1111*4882a593Smuzhiyun 		break;
1112*4882a593Smuzhiyun 	}
1113*4882a593Smuzhiyun 	return 1;
1114*4882a593Smuzhiyun }
1115*4882a593Smuzhiyun 
btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info * fs_info)1116*4882a593Smuzhiyun void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
1117*4882a593Smuzhiyun {
1118*4882a593Smuzhiyun 	percpu_counter_inc(&fs_info->dev_replace.bio_counter);
1119*4882a593Smuzhiyun }
1120*4882a593Smuzhiyun 
/*
 * Subtract @amount from the dev_replace bio counter and then wake up
 * waiters on replace_wait via cond_wake_up_nomb().
 */
void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
{
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;

	percpu_counter_sub(&dev_replace->bio_counter, amount);
	cond_wake_up_nomb(&dev_replace->replace_wait);
}
1126*4882a593Smuzhiyun 
btrfs_bio_counter_inc_blocked(struct btrfs_fs_info * fs_info)1127*4882a593Smuzhiyun void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
1128*4882a593Smuzhiyun {
1129*4882a593Smuzhiyun 	while (1) {
1130*4882a593Smuzhiyun 		percpu_counter_inc(&fs_info->dev_replace.bio_counter);
1131*4882a593Smuzhiyun 		if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
1132*4882a593Smuzhiyun 				     &fs_info->fs_state)))
1133*4882a593Smuzhiyun 			break;
1134*4882a593Smuzhiyun 
1135*4882a593Smuzhiyun 		btrfs_bio_counter_dec(fs_info);
1136*4882a593Smuzhiyun 		wait_event(fs_info->dev_replace.replace_wait,
1137*4882a593Smuzhiyun 			   !test_bit(BTRFS_FS_STATE_DEV_REPLACING,
1138*4882a593Smuzhiyun 				     &fs_info->fs_state));
1139*4882a593Smuzhiyun 	}
1140*4882a593Smuzhiyun }
1141