xref: /OK3568_Linux_fs/kernel/fs/btrfs/scrub.c (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun // SPDX-License-Identifier: GPL-2.0
2*4882a593Smuzhiyun /*
3*4882a593Smuzhiyun  * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
4*4882a593Smuzhiyun  */
5*4882a593Smuzhiyun 
6*4882a593Smuzhiyun #include <linux/blkdev.h>
7*4882a593Smuzhiyun #include <linux/ratelimit.h>
8*4882a593Smuzhiyun #include <linux/sched/mm.h>
9*4882a593Smuzhiyun #include <crypto/hash.h>
10*4882a593Smuzhiyun #include "ctree.h"
11*4882a593Smuzhiyun #include "discard.h"
12*4882a593Smuzhiyun #include "volumes.h"
13*4882a593Smuzhiyun #include "disk-io.h"
14*4882a593Smuzhiyun #include "ordered-data.h"
15*4882a593Smuzhiyun #include "transaction.h"
16*4882a593Smuzhiyun #include "backref.h"
17*4882a593Smuzhiyun #include "extent_io.h"
18*4882a593Smuzhiyun #include "dev-replace.h"
19*4882a593Smuzhiyun #include "check-integrity.h"
20*4882a593Smuzhiyun #include "rcu-string.h"
21*4882a593Smuzhiyun #include "raid56.h"
22*4882a593Smuzhiyun #include "block-group.h"
23*4882a593Smuzhiyun 
24*4882a593Smuzhiyun /*
25*4882a593Smuzhiyun  * This is only the first step towards a full-features scrub. It reads all
26*4882a593Smuzhiyun  * extent and super block and verifies the checksums. In case a bad checksum
27*4882a593Smuzhiyun  * is found or the extent cannot be read, good data will be written back if
28*4882a593Smuzhiyun  * any can be found.
29*4882a593Smuzhiyun  *
30*4882a593Smuzhiyun  * Future enhancements:
31*4882a593Smuzhiyun  *  - In case an unrepairable extent is encountered, track which files are
32*4882a593Smuzhiyun  *    affected and report them
33*4882a593Smuzhiyun  *  - track and record media errors, throw out bad devices
34*4882a593Smuzhiyun  *  - add a mode to also read unallocated space
35*4882a593Smuzhiyun  */
36*4882a593Smuzhiyun 
37*4882a593Smuzhiyun struct scrub_block;
38*4882a593Smuzhiyun struct scrub_ctx;
39*4882a593Smuzhiyun 
40*4882a593Smuzhiyun /*
41*4882a593Smuzhiyun  * the following three values only influence the performance.
42*4882a593Smuzhiyun  * The last one configures the number of parallel and outstanding I/O
43*4882a593Smuzhiyun  * operations. The first two values configure an upper limit for the number
44*4882a593Smuzhiyun  * of (dynamically allocated) pages that are added to a bio.
45*4882a593Smuzhiyun  */
46*4882a593Smuzhiyun #define SCRUB_PAGES_PER_RD_BIO	32	/* 128k per bio */
47*4882a593Smuzhiyun #define SCRUB_PAGES_PER_WR_BIO	32	/* 128k per bio */
48*4882a593Smuzhiyun #define SCRUB_BIOS_PER_SCTX	64	/* 8MB per device in flight */
49*4882a593Smuzhiyun 
50*4882a593Smuzhiyun /*
51*4882a593Smuzhiyun  * the following value times PAGE_SIZE needs to be large enough to match the
52*4882a593Smuzhiyun  * largest node/leaf/sector size that shall be supported.
53*4882a593Smuzhiyun  * Values larger than BTRFS_STRIPE_LEN are not supported.
54*4882a593Smuzhiyun  */
55*4882a593Smuzhiyun #define SCRUB_MAX_PAGES_PER_BLOCK	16	/* 64k per node/leaf/sector */
56*4882a593Smuzhiyun 
57*4882a593Smuzhiyun struct scrub_recover {
58*4882a593Smuzhiyun 	refcount_t		refs;
59*4882a593Smuzhiyun 	struct btrfs_bio	*bbio;
60*4882a593Smuzhiyun 	u64			map_length;
61*4882a593Smuzhiyun };
62*4882a593Smuzhiyun 
63*4882a593Smuzhiyun struct scrub_page {
64*4882a593Smuzhiyun 	struct scrub_block	*sblock;
65*4882a593Smuzhiyun 	struct page		*page;
66*4882a593Smuzhiyun 	struct btrfs_device	*dev;
67*4882a593Smuzhiyun 	struct list_head	list;
68*4882a593Smuzhiyun 	u64			flags;  /* extent flags */
69*4882a593Smuzhiyun 	u64			generation;
70*4882a593Smuzhiyun 	u64			logical;
71*4882a593Smuzhiyun 	u64			physical;
72*4882a593Smuzhiyun 	u64			physical_for_dev_replace;
73*4882a593Smuzhiyun 	atomic_t		refs;
74*4882a593Smuzhiyun 	struct {
75*4882a593Smuzhiyun 		unsigned int	mirror_num:8;
76*4882a593Smuzhiyun 		unsigned int	have_csum:1;
77*4882a593Smuzhiyun 		unsigned int	io_error:1;
78*4882a593Smuzhiyun 	};
79*4882a593Smuzhiyun 	u8			csum[BTRFS_CSUM_SIZE];
80*4882a593Smuzhiyun 
81*4882a593Smuzhiyun 	struct scrub_recover	*recover;
82*4882a593Smuzhiyun };
83*4882a593Smuzhiyun 
84*4882a593Smuzhiyun struct scrub_bio {
85*4882a593Smuzhiyun 	int			index;
86*4882a593Smuzhiyun 	struct scrub_ctx	*sctx;
87*4882a593Smuzhiyun 	struct btrfs_device	*dev;
88*4882a593Smuzhiyun 	struct bio		*bio;
89*4882a593Smuzhiyun 	blk_status_t		status;
90*4882a593Smuzhiyun 	u64			logical;
91*4882a593Smuzhiyun 	u64			physical;
92*4882a593Smuzhiyun #if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
93*4882a593Smuzhiyun 	struct scrub_page	*pagev[SCRUB_PAGES_PER_WR_BIO];
94*4882a593Smuzhiyun #else
95*4882a593Smuzhiyun 	struct scrub_page	*pagev[SCRUB_PAGES_PER_RD_BIO];
96*4882a593Smuzhiyun #endif
97*4882a593Smuzhiyun 	int			page_count;
98*4882a593Smuzhiyun 	int			next_free;
99*4882a593Smuzhiyun 	struct btrfs_work	work;
100*4882a593Smuzhiyun };
101*4882a593Smuzhiyun 
102*4882a593Smuzhiyun struct scrub_block {
103*4882a593Smuzhiyun 	struct scrub_page	*pagev[SCRUB_MAX_PAGES_PER_BLOCK];
104*4882a593Smuzhiyun 	int			page_count;
105*4882a593Smuzhiyun 	atomic_t		outstanding_pages;
106*4882a593Smuzhiyun 	refcount_t		refs; /* free mem on transition to zero */
107*4882a593Smuzhiyun 	struct scrub_ctx	*sctx;
108*4882a593Smuzhiyun 	struct scrub_parity	*sparity;
109*4882a593Smuzhiyun 	struct {
110*4882a593Smuzhiyun 		unsigned int	header_error:1;
111*4882a593Smuzhiyun 		unsigned int	checksum_error:1;
112*4882a593Smuzhiyun 		unsigned int	no_io_error_seen:1;
113*4882a593Smuzhiyun 		unsigned int	generation_error:1; /* also sets header_error */
114*4882a593Smuzhiyun 
115*4882a593Smuzhiyun 		/* The following is for the data used to check parity */
116*4882a593Smuzhiyun 		/* It is for the data with checksum */
117*4882a593Smuzhiyun 		unsigned int	data_corrected:1;
118*4882a593Smuzhiyun 	};
119*4882a593Smuzhiyun 	struct btrfs_work	work;
120*4882a593Smuzhiyun };
121*4882a593Smuzhiyun 
122*4882a593Smuzhiyun /* Used for the chunks with parity stripe such RAID5/6 */
123*4882a593Smuzhiyun struct scrub_parity {
124*4882a593Smuzhiyun 	struct scrub_ctx	*sctx;
125*4882a593Smuzhiyun 
126*4882a593Smuzhiyun 	struct btrfs_device	*scrub_dev;
127*4882a593Smuzhiyun 
128*4882a593Smuzhiyun 	u64			logic_start;
129*4882a593Smuzhiyun 
130*4882a593Smuzhiyun 	u64			logic_end;
131*4882a593Smuzhiyun 
132*4882a593Smuzhiyun 	int			nsectors;
133*4882a593Smuzhiyun 
134*4882a593Smuzhiyun 	u64			stripe_len;
135*4882a593Smuzhiyun 
136*4882a593Smuzhiyun 	refcount_t		refs;
137*4882a593Smuzhiyun 
138*4882a593Smuzhiyun 	struct list_head	spages;
139*4882a593Smuzhiyun 
140*4882a593Smuzhiyun 	/* Work of parity check and repair */
141*4882a593Smuzhiyun 	struct btrfs_work	work;
142*4882a593Smuzhiyun 
143*4882a593Smuzhiyun 	/* Mark the parity blocks which have data */
144*4882a593Smuzhiyun 	unsigned long		*dbitmap;
145*4882a593Smuzhiyun 
146*4882a593Smuzhiyun 	/*
147*4882a593Smuzhiyun 	 * Mark the parity blocks which have data, but errors happen when
148*4882a593Smuzhiyun 	 * read data or check data
149*4882a593Smuzhiyun 	 */
150*4882a593Smuzhiyun 	unsigned long		*ebitmap;
151*4882a593Smuzhiyun 
152*4882a593Smuzhiyun 	unsigned long		bitmap[];
153*4882a593Smuzhiyun };
154*4882a593Smuzhiyun 
155*4882a593Smuzhiyun struct scrub_ctx {
156*4882a593Smuzhiyun 	struct scrub_bio	*bios[SCRUB_BIOS_PER_SCTX];
157*4882a593Smuzhiyun 	struct btrfs_fs_info	*fs_info;
158*4882a593Smuzhiyun 	int			first_free;
159*4882a593Smuzhiyun 	int			curr;
160*4882a593Smuzhiyun 	atomic_t		bios_in_flight;
161*4882a593Smuzhiyun 	atomic_t		workers_pending;
162*4882a593Smuzhiyun 	spinlock_t		list_lock;
163*4882a593Smuzhiyun 	wait_queue_head_t	list_wait;
164*4882a593Smuzhiyun 	u16			csum_size;
165*4882a593Smuzhiyun 	struct list_head	csum_list;
166*4882a593Smuzhiyun 	atomic_t		cancel_req;
167*4882a593Smuzhiyun 	int			readonly;
168*4882a593Smuzhiyun 	int			pages_per_rd_bio;
169*4882a593Smuzhiyun 
170*4882a593Smuzhiyun 	int			is_dev_replace;
171*4882a593Smuzhiyun 
172*4882a593Smuzhiyun 	struct scrub_bio        *wr_curr_bio;
173*4882a593Smuzhiyun 	struct mutex            wr_lock;
174*4882a593Smuzhiyun 	int                     pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
175*4882a593Smuzhiyun 	struct btrfs_device     *wr_tgtdev;
176*4882a593Smuzhiyun 	bool                    flush_all_writes;
177*4882a593Smuzhiyun 
178*4882a593Smuzhiyun 	/*
179*4882a593Smuzhiyun 	 * statistics
180*4882a593Smuzhiyun 	 */
181*4882a593Smuzhiyun 	struct btrfs_scrub_progress stat;
182*4882a593Smuzhiyun 	spinlock_t		stat_lock;
183*4882a593Smuzhiyun 
184*4882a593Smuzhiyun 	/*
185*4882a593Smuzhiyun 	 * Use a ref counter to avoid use-after-free issues. Scrub workers
186*4882a593Smuzhiyun 	 * decrement bios_in_flight and workers_pending and then do a wakeup
187*4882a593Smuzhiyun 	 * on the list_wait wait queue. We must ensure the main scrub task
188*4882a593Smuzhiyun 	 * doesn't free the scrub context before or while the workers are
189*4882a593Smuzhiyun 	 * doing the wakeup() call.
190*4882a593Smuzhiyun 	 */
191*4882a593Smuzhiyun 	refcount_t              refs;
192*4882a593Smuzhiyun };
193*4882a593Smuzhiyun 
194*4882a593Smuzhiyun struct scrub_warning {
195*4882a593Smuzhiyun 	struct btrfs_path	*path;
196*4882a593Smuzhiyun 	u64			extent_item_size;
197*4882a593Smuzhiyun 	const char		*errstr;
198*4882a593Smuzhiyun 	u64			physical;
199*4882a593Smuzhiyun 	u64			logical;
200*4882a593Smuzhiyun 	struct btrfs_device	*dev;
201*4882a593Smuzhiyun };
202*4882a593Smuzhiyun 
203*4882a593Smuzhiyun struct full_stripe_lock {
204*4882a593Smuzhiyun 	struct rb_node node;
205*4882a593Smuzhiyun 	u64 logical;
206*4882a593Smuzhiyun 	u64 refs;
207*4882a593Smuzhiyun 	struct mutex mutex;
208*4882a593Smuzhiyun };
209*4882a593Smuzhiyun 
210*4882a593Smuzhiyun static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
211*4882a593Smuzhiyun static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
212*4882a593Smuzhiyun static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
213*4882a593Smuzhiyun static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
214*4882a593Smuzhiyun 				     struct scrub_block *sblocks_for_recheck);
215*4882a593Smuzhiyun static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
216*4882a593Smuzhiyun 				struct scrub_block *sblock,
217*4882a593Smuzhiyun 				int retry_failed_mirror);
218*4882a593Smuzhiyun static void scrub_recheck_block_checksum(struct scrub_block *sblock);
219*4882a593Smuzhiyun static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
220*4882a593Smuzhiyun 					     struct scrub_block *sblock_good);
221*4882a593Smuzhiyun static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
222*4882a593Smuzhiyun 					    struct scrub_block *sblock_good,
223*4882a593Smuzhiyun 					    int page_num, int force_write);
224*4882a593Smuzhiyun static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
225*4882a593Smuzhiyun static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
226*4882a593Smuzhiyun 					   int page_num);
227*4882a593Smuzhiyun static int scrub_checksum_data(struct scrub_block *sblock);
228*4882a593Smuzhiyun static int scrub_checksum_tree_block(struct scrub_block *sblock);
229*4882a593Smuzhiyun static int scrub_checksum_super(struct scrub_block *sblock);
230*4882a593Smuzhiyun static void scrub_block_get(struct scrub_block *sblock);
231*4882a593Smuzhiyun static void scrub_block_put(struct scrub_block *sblock);
232*4882a593Smuzhiyun static void scrub_page_get(struct scrub_page *spage);
233*4882a593Smuzhiyun static void scrub_page_put(struct scrub_page *spage);
234*4882a593Smuzhiyun static void scrub_parity_get(struct scrub_parity *sparity);
235*4882a593Smuzhiyun static void scrub_parity_put(struct scrub_parity *sparity);
236*4882a593Smuzhiyun static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
237*4882a593Smuzhiyun 				    struct scrub_page *spage);
238*4882a593Smuzhiyun static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
239*4882a593Smuzhiyun 		       u64 physical, struct btrfs_device *dev, u64 flags,
240*4882a593Smuzhiyun 		       u64 gen, int mirror_num, u8 *csum, int force,
241*4882a593Smuzhiyun 		       u64 physical_for_dev_replace);
242*4882a593Smuzhiyun static void scrub_bio_end_io(struct bio *bio);
243*4882a593Smuzhiyun static void scrub_bio_end_io_worker(struct btrfs_work *work);
244*4882a593Smuzhiyun static void scrub_block_complete(struct scrub_block *sblock);
245*4882a593Smuzhiyun static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
246*4882a593Smuzhiyun 			       u64 extent_logical, u64 extent_len,
247*4882a593Smuzhiyun 			       u64 *extent_physical,
248*4882a593Smuzhiyun 			       struct btrfs_device **extent_dev,
249*4882a593Smuzhiyun 			       int *extent_mirror_num);
250*4882a593Smuzhiyun static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
251*4882a593Smuzhiyun 				    struct scrub_page *spage);
252*4882a593Smuzhiyun static void scrub_wr_submit(struct scrub_ctx *sctx);
253*4882a593Smuzhiyun static void scrub_wr_bio_end_io(struct bio *bio);
254*4882a593Smuzhiyun static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
255*4882a593Smuzhiyun static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
256*4882a593Smuzhiyun static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
257*4882a593Smuzhiyun static void scrub_put_ctx(struct scrub_ctx *sctx);
258*4882a593Smuzhiyun 
scrub_is_page_on_raid56(struct scrub_page * page)259*4882a593Smuzhiyun static inline int scrub_is_page_on_raid56(struct scrub_page *page)
260*4882a593Smuzhiyun {
261*4882a593Smuzhiyun 	return page->recover &&
262*4882a593Smuzhiyun 	       (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
263*4882a593Smuzhiyun }
264*4882a593Smuzhiyun 
scrub_pending_bio_inc(struct scrub_ctx * sctx)265*4882a593Smuzhiyun static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
266*4882a593Smuzhiyun {
267*4882a593Smuzhiyun 	refcount_inc(&sctx->refs);
268*4882a593Smuzhiyun 	atomic_inc(&sctx->bios_in_flight);
269*4882a593Smuzhiyun }
270*4882a593Smuzhiyun 
scrub_pending_bio_dec(struct scrub_ctx * sctx)271*4882a593Smuzhiyun static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
272*4882a593Smuzhiyun {
273*4882a593Smuzhiyun 	atomic_dec(&sctx->bios_in_flight);
274*4882a593Smuzhiyun 	wake_up(&sctx->list_wait);
275*4882a593Smuzhiyun 	scrub_put_ctx(sctx);
276*4882a593Smuzhiyun }
277*4882a593Smuzhiyun 
__scrub_blocked_if_needed(struct btrfs_fs_info * fs_info)278*4882a593Smuzhiyun static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
279*4882a593Smuzhiyun {
280*4882a593Smuzhiyun 	while (atomic_read(&fs_info->scrub_pause_req)) {
281*4882a593Smuzhiyun 		mutex_unlock(&fs_info->scrub_lock);
282*4882a593Smuzhiyun 		wait_event(fs_info->scrub_pause_wait,
283*4882a593Smuzhiyun 		   atomic_read(&fs_info->scrub_pause_req) == 0);
284*4882a593Smuzhiyun 		mutex_lock(&fs_info->scrub_lock);
285*4882a593Smuzhiyun 	}
286*4882a593Smuzhiyun }
287*4882a593Smuzhiyun 
scrub_pause_on(struct btrfs_fs_info * fs_info)288*4882a593Smuzhiyun static void scrub_pause_on(struct btrfs_fs_info *fs_info)
289*4882a593Smuzhiyun {
290*4882a593Smuzhiyun 	atomic_inc(&fs_info->scrubs_paused);
291*4882a593Smuzhiyun 	wake_up(&fs_info->scrub_pause_wait);
292*4882a593Smuzhiyun }
293*4882a593Smuzhiyun 
scrub_pause_off(struct btrfs_fs_info * fs_info)294*4882a593Smuzhiyun static void scrub_pause_off(struct btrfs_fs_info *fs_info)
295*4882a593Smuzhiyun {
296*4882a593Smuzhiyun 	mutex_lock(&fs_info->scrub_lock);
297*4882a593Smuzhiyun 	__scrub_blocked_if_needed(fs_info);
298*4882a593Smuzhiyun 	atomic_dec(&fs_info->scrubs_paused);
299*4882a593Smuzhiyun 	mutex_unlock(&fs_info->scrub_lock);
300*4882a593Smuzhiyun 
301*4882a593Smuzhiyun 	wake_up(&fs_info->scrub_pause_wait);
302*4882a593Smuzhiyun }
303*4882a593Smuzhiyun 
scrub_blocked_if_needed(struct btrfs_fs_info * fs_info)304*4882a593Smuzhiyun static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
305*4882a593Smuzhiyun {
306*4882a593Smuzhiyun 	scrub_pause_on(fs_info);
307*4882a593Smuzhiyun 	scrub_pause_off(fs_info);
308*4882a593Smuzhiyun }
309*4882a593Smuzhiyun 
310*4882a593Smuzhiyun /*
311*4882a593Smuzhiyun  * Insert new full stripe lock into full stripe locks tree
312*4882a593Smuzhiyun  *
313*4882a593Smuzhiyun  * Return pointer to existing or newly inserted full_stripe_lock structure if
314*4882a593Smuzhiyun  * everything works well.
315*4882a593Smuzhiyun  * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
316*4882a593Smuzhiyun  *
317*4882a593Smuzhiyun  * NOTE: caller must hold full_stripe_locks_root->lock before calling this
318*4882a593Smuzhiyun  * function
319*4882a593Smuzhiyun  */
insert_full_stripe_lock(struct btrfs_full_stripe_locks_tree * locks_root,u64 fstripe_logical)320*4882a593Smuzhiyun static struct full_stripe_lock *insert_full_stripe_lock(
321*4882a593Smuzhiyun 		struct btrfs_full_stripe_locks_tree *locks_root,
322*4882a593Smuzhiyun 		u64 fstripe_logical)
323*4882a593Smuzhiyun {
324*4882a593Smuzhiyun 	struct rb_node **p;
325*4882a593Smuzhiyun 	struct rb_node *parent = NULL;
326*4882a593Smuzhiyun 	struct full_stripe_lock *entry;
327*4882a593Smuzhiyun 	struct full_stripe_lock *ret;
328*4882a593Smuzhiyun 
329*4882a593Smuzhiyun 	lockdep_assert_held(&locks_root->lock);
330*4882a593Smuzhiyun 
331*4882a593Smuzhiyun 	p = &locks_root->root.rb_node;
332*4882a593Smuzhiyun 	while (*p) {
333*4882a593Smuzhiyun 		parent = *p;
334*4882a593Smuzhiyun 		entry = rb_entry(parent, struct full_stripe_lock, node);
335*4882a593Smuzhiyun 		if (fstripe_logical < entry->logical) {
336*4882a593Smuzhiyun 			p = &(*p)->rb_left;
337*4882a593Smuzhiyun 		} else if (fstripe_logical > entry->logical) {
338*4882a593Smuzhiyun 			p = &(*p)->rb_right;
339*4882a593Smuzhiyun 		} else {
340*4882a593Smuzhiyun 			entry->refs++;
341*4882a593Smuzhiyun 			return entry;
342*4882a593Smuzhiyun 		}
343*4882a593Smuzhiyun 	}
344*4882a593Smuzhiyun 
345*4882a593Smuzhiyun 	/*
346*4882a593Smuzhiyun 	 * Insert new lock.
347*4882a593Smuzhiyun 	 */
348*4882a593Smuzhiyun 	ret = kmalloc(sizeof(*ret), GFP_KERNEL);
349*4882a593Smuzhiyun 	if (!ret)
350*4882a593Smuzhiyun 		return ERR_PTR(-ENOMEM);
351*4882a593Smuzhiyun 	ret->logical = fstripe_logical;
352*4882a593Smuzhiyun 	ret->refs = 1;
353*4882a593Smuzhiyun 	mutex_init(&ret->mutex);
354*4882a593Smuzhiyun 
355*4882a593Smuzhiyun 	rb_link_node(&ret->node, parent, p);
356*4882a593Smuzhiyun 	rb_insert_color(&ret->node, &locks_root->root);
357*4882a593Smuzhiyun 	return ret;
358*4882a593Smuzhiyun }
359*4882a593Smuzhiyun 
360*4882a593Smuzhiyun /*
361*4882a593Smuzhiyun  * Search for a full stripe lock of a block group
362*4882a593Smuzhiyun  *
363*4882a593Smuzhiyun  * Return pointer to existing full stripe lock if found
364*4882a593Smuzhiyun  * Return NULL if not found
365*4882a593Smuzhiyun  */
search_full_stripe_lock(struct btrfs_full_stripe_locks_tree * locks_root,u64 fstripe_logical)366*4882a593Smuzhiyun static struct full_stripe_lock *search_full_stripe_lock(
367*4882a593Smuzhiyun 		struct btrfs_full_stripe_locks_tree *locks_root,
368*4882a593Smuzhiyun 		u64 fstripe_logical)
369*4882a593Smuzhiyun {
370*4882a593Smuzhiyun 	struct rb_node *node;
371*4882a593Smuzhiyun 	struct full_stripe_lock *entry;
372*4882a593Smuzhiyun 
373*4882a593Smuzhiyun 	lockdep_assert_held(&locks_root->lock);
374*4882a593Smuzhiyun 
375*4882a593Smuzhiyun 	node = locks_root->root.rb_node;
376*4882a593Smuzhiyun 	while (node) {
377*4882a593Smuzhiyun 		entry = rb_entry(node, struct full_stripe_lock, node);
378*4882a593Smuzhiyun 		if (fstripe_logical < entry->logical)
379*4882a593Smuzhiyun 			node = node->rb_left;
380*4882a593Smuzhiyun 		else if (fstripe_logical > entry->logical)
381*4882a593Smuzhiyun 			node = node->rb_right;
382*4882a593Smuzhiyun 		else
383*4882a593Smuzhiyun 			return entry;
384*4882a593Smuzhiyun 	}
385*4882a593Smuzhiyun 	return NULL;
386*4882a593Smuzhiyun }
387*4882a593Smuzhiyun 
388*4882a593Smuzhiyun /*
389*4882a593Smuzhiyun  * Helper to get full stripe logical from a normal bytenr.
390*4882a593Smuzhiyun  *
391*4882a593Smuzhiyun  * Caller must ensure @cache is a RAID56 block group.
392*4882a593Smuzhiyun  */
get_full_stripe_logical(struct btrfs_block_group * cache,u64 bytenr)393*4882a593Smuzhiyun static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
394*4882a593Smuzhiyun {
395*4882a593Smuzhiyun 	u64 ret;
396*4882a593Smuzhiyun 
397*4882a593Smuzhiyun 	/*
398*4882a593Smuzhiyun 	 * Due to chunk item size limit, full stripe length should not be
399*4882a593Smuzhiyun 	 * larger than U32_MAX. Just a sanity check here.
400*4882a593Smuzhiyun 	 */
401*4882a593Smuzhiyun 	WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
402*4882a593Smuzhiyun 
403*4882a593Smuzhiyun 	/*
404*4882a593Smuzhiyun 	 * round_down() can only handle power of 2, while RAID56 full
405*4882a593Smuzhiyun 	 * stripe length can be 64KiB * n, so we need to manually round down.
406*4882a593Smuzhiyun 	 */
407*4882a593Smuzhiyun 	ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
408*4882a593Smuzhiyun 			cache->full_stripe_len + cache->start;
409*4882a593Smuzhiyun 	return ret;
410*4882a593Smuzhiyun }
411*4882a593Smuzhiyun 
412*4882a593Smuzhiyun /*
413*4882a593Smuzhiyun  * Lock a full stripe to avoid concurrency of recovery and read
414*4882a593Smuzhiyun  *
415*4882a593Smuzhiyun  * It's only used for profiles with parities (RAID5/6), for other profiles it
416*4882a593Smuzhiyun  * does nothing.
417*4882a593Smuzhiyun  *
418*4882a593Smuzhiyun  * Return 0 if we locked full stripe covering @bytenr, with a mutex held.
419*4882a593Smuzhiyun  * So caller must call unlock_full_stripe() at the same context.
420*4882a593Smuzhiyun  *
421*4882a593Smuzhiyun  * Return <0 if encounters error.
422*4882a593Smuzhiyun  */
lock_full_stripe(struct btrfs_fs_info * fs_info,u64 bytenr,bool * locked_ret)423*4882a593Smuzhiyun static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
424*4882a593Smuzhiyun 			    bool *locked_ret)
425*4882a593Smuzhiyun {
426*4882a593Smuzhiyun 	struct btrfs_block_group *bg_cache;
427*4882a593Smuzhiyun 	struct btrfs_full_stripe_locks_tree *locks_root;
428*4882a593Smuzhiyun 	struct full_stripe_lock *existing;
429*4882a593Smuzhiyun 	u64 fstripe_start;
430*4882a593Smuzhiyun 	int ret = 0;
431*4882a593Smuzhiyun 
432*4882a593Smuzhiyun 	*locked_ret = false;
433*4882a593Smuzhiyun 	bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
434*4882a593Smuzhiyun 	if (!bg_cache) {
435*4882a593Smuzhiyun 		ASSERT(0);
436*4882a593Smuzhiyun 		return -ENOENT;
437*4882a593Smuzhiyun 	}
438*4882a593Smuzhiyun 
439*4882a593Smuzhiyun 	/* Profiles not based on parity don't need full stripe lock */
440*4882a593Smuzhiyun 	if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
441*4882a593Smuzhiyun 		goto out;
442*4882a593Smuzhiyun 	locks_root = &bg_cache->full_stripe_locks_root;
443*4882a593Smuzhiyun 
444*4882a593Smuzhiyun 	fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
445*4882a593Smuzhiyun 
446*4882a593Smuzhiyun 	/* Now insert the full stripe lock */
447*4882a593Smuzhiyun 	mutex_lock(&locks_root->lock);
448*4882a593Smuzhiyun 	existing = insert_full_stripe_lock(locks_root, fstripe_start);
449*4882a593Smuzhiyun 	mutex_unlock(&locks_root->lock);
450*4882a593Smuzhiyun 	if (IS_ERR(existing)) {
451*4882a593Smuzhiyun 		ret = PTR_ERR(existing);
452*4882a593Smuzhiyun 		goto out;
453*4882a593Smuzhiyun 	}
454*4882a593Smuzhiyun 	mutex_lock(&existing->mutex);
455*4882a593Smuzhiyun 	*locked_ret = true;
456*4882a593Smuzhiyun out:
457*4882a593Smuzhiyun 	btrfs_put_block_group(bg_cache);
458*4882a593Smuzhiyun 	return ret;
459*4882a593Smuzhiyun }
460*4882a593Smuzhiyun 
461*4882a593Smuzhiyun /*
462*4882a593Smuzhiyun  * Unlock a full stripe.
463*4882a593Smuzhiyun  *
464*4882a593Smuzhiyun  * NOTE: Caller must ensure it's the same context calling corresponding
465*4882a593Smuzhiyun  * lock_full_stripe().
466*4882a593Smuzhiyun  *
467*4882a593Smuzhiyun  * Return 0 if we unlock full stripe without problem.
468*4882a593Smuzhiyun  * Return <0 for error
469*4882a593Smuzhiyun  */
unlock_full_stripe(struct btrfs_fs_info * fs_info,u64 bytenr,bool locked)470*4882a593Smuzhiyun static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
471*4882a593Smuzhiyun 			      bool locked)
472*4882a593Smuzhiyun {
473*4882a593Smuzhiyun 	struct btrfs_block_group *bg_cache;
474*4882a593Smuzhiyun 	struct btrfs_full_stripe_locks_tree *locks_root;
475*4882a593Smuzhiyun 	struct full_stripe_lock *fstripe_lock;
476*4882a593Smuzhiyun 	u64 fstripe_start;
477*4882a593Smuzhiyun 	bool freeit = false;
478*4882a593Smuzhiyun 	int ret = 0;
479*4882a593Smuzhiyun 
480*4882a593Smuzhiyun 	/* If we didn't acquire full stripe lock, no need to continue */
481*4882a593Smuzhiyun 	if (!locked)
482*4882a593Smuzhiyun 		return 0;
483*4882a593Smuzhiyun 
484*4882a593Smuzhiyun 	bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
485*4882a593Smuzhiyun 	if (!bg_cache) {
486*4882a593Smuzhiyun 		ASSERT(0);
487*4882a593Smuzhiyun 		return -ENOENT;
488*4882a593Smuzhiyun 	}
489*4882a593Smuzhiyun 	if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
490*4882a593Smuzhiyun 		goto out;
491*4882a593Smuzhiyun 
492*4882a593Smuzhiyun 	locks_root = &bg_cache->full_stripe_locks_root;
493*4882a593Smuzhiyun 	fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
494*4882a593Smuzhiyun 
495*4882a593Smuzhiyun 	mutex_lock(&locks_root->lock);
496*4882a593Smuzhiyun 	fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
497*4882a593Smuzhiyun 	/* Unpaired unlock_full_stripe() detected */
498*4882a593Smuzhiyun 	if (!fstripe_lock) {
499*4882a593Smuzhiyun 		WARN_ON(1);
500*4882a593Smuzhiyun 		ret = -ENOENT;
501*4882a593Smuzhiyun 		mutex_unlock(&locks_root->lock);
502*4882a593Smuzhiyun 		goto out;
503*4882a593Smuzhiyun 	}
504*4882a593Smuzhiyun 
505*4882a593Smuzhiyun 	if (fstripe_lock->refs == 0) {
506*4882a593Smuzhiyun 		WARN_ON(1);
507*4882a593Smuzhiyun 		btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
508*4882a593Smuzhiyun 			fstripe_lock->logical);
509*4882a593Smuzhiyun 	} else {
510*4882a593Smuzhiyun 		fstripe_lock->refs--;
511*4882a593Smuzhiyun 	}
512*4882a593Smuzhiyun 
513*4882a593Smuzhiyun 	if (fstripe_lock->refs == 0) {
514*4882a593Smuzhiyun 		rb_erase(&fstripe_lock->node, &locks_root->root);
515*4882a593Smuzhiyun 		freeit = true;
516*4882a593Smuzhiyun 	}
517*4882a593Smuzhiyun 	mutex_unlock(&locks_root->lock);
518*4882a593Smuzhiyun 
519*4882a593Smuzhiyun 	mutex_unlock(&fstripe_lock->mutex);
520*4882a593Smuzhiyun 	if (freeit)
521*4882a593Smuzhiyun 		kfree(fstripe_lock);
522*4882a593Smuzhiyun out:
523*4882a593Smuzhiyun 	btrfs_put_block_group(bg_cache);
524*4882a593Smuzhiyun 	return ret;
525*4882a593Smuzhiyun }
526*4882a593Smuzhiyun 
scrub_free_csums(struct scrub_ctx * sctx)527*4882a593Smuzhiyun static void scrub_free_csums(struct scrub_ctx *sctx)
528*4882a593Smuzhiyun {
529*4882a593Smuzhiyun 	while (!list_empty(&sctx->csum_list)) {
530*4882a593Smuzhiyun 		struct btrfs_ordered_sum *sum;
531*4882a593Smuzhiyun 		sum = list_first_entry(&sctx->csum_list,
532*4882a593Smuzhiyun 				       struct btrfs_ordered_sum, list);
533*4882a593Smuzhiyun 		list_del(&sum->list);
534*4882a593Smuzhiyun 		kfree(sum);
535*4882a593Smuzhiyun 	}
536*4882a593Smuzhiyun }
537*4882a593Smuzhiyun 
scrub_free_ctx(struct scrub_ctx * sctx)538*4882a593Smuzhiyun static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
539*4882a593Smuzhiyun {
540*4882a593Smuzhiyun 	int i;
541*4882a593Smuzhiyun 
542*4882a593Smuzhiyun 	if (!sctx)
543*4882a593Smuzhiyun 		return;
544*4882a593Smuzhiyun 
545*4882a593Smuzhiyun 	/* this can happen when scrub is cancelled */
546*4882a593Smuzhiyun 	if (sctx->curr != -1) {
547*4882a593Smuzhiyun 		struct scrub_bio *sbio = sctx->bios[sctx->curr];
548*4882a593Smuzhiyun 
549*4882a593Smuzhiyun 		for (i = 0; i < sbio->page_count; i++) {
550*4882a593Smuzhiyun 			WARN_ON(!sbio->pagev[i]->page);
551*4882a593Smuzhiyun 			scrub_block_put(sbio->pagev[i]->sblock);
552*4882a593Smuzhiyun 		}
553*4882a593Smuzhiyun 		bio_put(sbio->bio);
554*4882a593Smuzhiyun 	}
555*4882a593Smuzhiyun 
556*4882a593Smuzhiyun 	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
557*4882a593Smuzhiyun 		struct scrub_bio *sbio = sctx->bios[i];
558*4882a593Smuzhiyun 
559*4882a593Smuzhiyun 		if (!sbio)
560*4882a593Smuzhiyun 			break;
561*4882a593Smuzhiyun 		kfree(sbio);
562*4882a593Smuzhiyun 	}
563*4882a593Smuzhiyun 
564*4882a593Smuzhiyun 	kfree(sctx->wr_curr_bio);
565*4882a593Smuzhiyun 	scrub_free_csums(sctx);
566*4882a593Smuzhiyun 	kfree(sctx);
567*4882a593Smuzhiyun }
568*4882a593Smuzhiyun 
scrub_put_ctx(struct scrub_ctx * sctx)569*4882a593Smuzhiyun static void scrub_put_ctx(struct scrub_ctx *sctx)
570*4882a593Smuzhiyun {
571*4882a593Smuzhiyun 	if (refcount_dec_and_test(&sctx->refs))
572*4882a593Smuzhiyun 		scrub_free_ctx(sctx);
573*4882a593Smuzhiyun }
574*4882a593Smuzhiyun 
scrub_setup_ctx(struct btrfs_fs_info * fs_info,int is_dev_replace)575*4882a593Smuzhiyun static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
576*4882a593Smuzhiyun 		struct btrfs_fs_info *fs_info, int is_dev_replace)
577*4882a593Smuzhiyun {
578*4882a593Smuzhiyun 	struct scrub_ctx *sctx;
579*4882a593Smuzhiyun 	int		i;
580*4882a593Smuzhiyun 
581*4882a593Smuzhiyun 	sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
582*4882a593Smuzhiyun 	if (!sctx)
583*4882a593Smuzhiyun 		goto nomem;
584*4882a593Smuzhiyun 	refcount_set(&sctx->refs, 1);
585*4882a593Smuzhiyun 	sctx->is_dev_replace = is_dev_replace;
586*4882a593Smuzhiyun 	sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
587*4882a593Smuzhiyun 	sctx->curr = -1;
588*4882a593Smuzhiyun 	sctx->fs_info = fs_info;
589*4882a593Smuzhiyun 	INIT_LIST_HEAD(&sctx->csum_list);
590*4882a593Smuzhiyun 	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
591*4882a593Smuzhiyun 		struct scrub_bio *sbio;
592*4882a593Smuzhiyun 
593*4882a593Smuzhiyun 		sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
594*4882a593Smuzhiyun 		if (!sbio)
595*4882a593Smuzhiyun 			goto nomem;
596*4882a593Smuzhiyun 		sctx->bios[i] = sbio;
597*4882a593Smuzhiyun 
598*4882a593Smuzhiyun 		sbio->index = i;
599*4882a593Smuzhiyun 		sbio->sctx = sctx;
600*4882a593Smuzhiyun 		sbio->page_count = 0;
601*4882a593Smuzhiyun 		btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, NULL,
602*4882a593Smuzhiyun 				NULL);
603*4882a593Smuzhiyun 
604*4882a593Smuzhiyun 		if (i != SCRUB_BIOS_PER_SCTX - 1)
605*4882a593Smuzhiyun 			sctx->bios[i]->next_free = i + 1;
606*4882a593Smuzhiyun 		else
607*4882a593Smuzhiyun 			sctx->bios[i]->next_free = -1;
608*4882a593Smuzhiyun 	}
609*4882a593Smuzhiyun 	sctx->first_free = 0;
610*4882a593Smuzhiyun 	atomic_set(&sctx->bios_in_flight, 0);
611*4882a593Smuzhiyun 	atomic_set(&sctx->workers_pending, 0);
612*4882a593Smuzhiyun 	atomic_set(&sctx->cancel_req, 0);
613*4882a593Smuzhiyun 	sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
614*4882a593Smuzhiyun 
615*4882a593Smuzhiyun 	spin_lock_init(&sctx->list_lock);
616*4882a593Smuzhiyun 	spin_lock_init(&sctx->stat_lock);
617*4882a593Smuzhiyun 	init_waitqueue_head(&sctx->list_wait);
618*4882a593Smuzhiyun 
619*4882a593Smuzhiyun 	WARN_ON(sctx->wr_curr_bio != NULL);
620*4882a593Smuzhiyun 	mutex_init(&sctx->wr_lock);
621*4882a593Smuzhiyun 	sctx->wr_curr_bio = NULL;
622*4882a593Smuzhiyun 	if (is_dev_replace) {
623*4882a593Smuzhiyun 		WARN_ON(!fs_info->dev_replace.tgtdev);
624*4882a593Smuzhiyun 		sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
625*4882a593Smuzhiyun 		sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
626*4882a593Smuzhiyun 		sctx->flush_all_writes = false;
627*4882a593Smuzhiyun 	}
628*4882a593Smuzhiyun 
629*4882a593Smuzhiyun 	return sctx;
630*4882a593Smuzhiyun 
631*4882a593Smuzhiyun nomem:
632*4882a593Smuzhiyun 	scrub_free_ctx(sctx);
633*4882a593Smuzhiyun 	return ERR_PTR(-ENOMEM);
634*4882a593Smuzhiyun }
635*4882a593Smuzhiyun 
scrub_print_warning_inode(u64 inum,u64 offset,u64 root,void * warn_ctx)636*4882a593Smuzhiyun static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
637*4882a593Smuzhiyun 				     void *warn_ctx)
638*4882a593Smuzhiyun {
639*4882a593Smuzhiyun 	u64 isize;
640*4882a593Smuzhiyun 	u32 nlink;
641*4882a593Smuzhiyun 	int ret;
642*4882a593Smuzhiyun 	int i;
643*4882a593Smuzhiyun 	unsigned nofs_flag;
644*4882a593Smuzhiyun 	struct extent_buffer *eb;
645*4882a593Smuzhiyun 	struct btrfs_inode_item *inode_item;
646*4882a593Smuzhiyun 	struct scrub_warning *swarn = warn_ctx;
647*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
648*4882a593Smuzhiyun 	struct inode_fs_paths *ipath = NULL;
649*4882a593Smuzhiyun 	struct btrfs_root *local_root;
650*4882a593Smuzhiyun 	struct btrfs_key key;
651*4882a593Smuzhiyun 
652*4882a593Smuzhiyun 	local_root = btrfs_get_fs_root(fs_info, root, true);
653*4882a593Smuzhiyun 	if (IS_ERR(local_root)) {
654*4882a593Smuzhiyun 		ret = PTR_ERR(local_root);
655*4882a593Smuzhiyun 		goto err;
656*4882a593Smuzhiyun 	}
657*4882a593Smuzhiyun 
658*4882a593Smuzhiyun 	/*
659*4882a593Smuzhiyun 	 * this makes the path point to (inum INODE_ITEM ioff)
660*4882a593Smuzhiyun 	 */
661*4882a593Smuzhiyun 	key.objectid = inum;
662*4882a593Smuzhiyun 	key.type = BTRFS_INODE_ITEM_KEY;
663*4882a593Smuzhiyun 	key.offset = 0;
664*4882a593Smuzhiyun 
665*4882a593Smuzhiyun 	ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
666*4882a593Smuzhiyun 	if (ret) {
667*4882a593Smuzhiyun 		btrfs_put_root(local_root);
668*4882a593Smuzhiyun 		btrfs_release_path(swarn->path);
669*4882a593Smuzhiyun 		goto err;
670*4882a593Smuzhiyun 	}
671*4882a593Smuzhiyun 
672*4882a593Smuzhiyun 	eb = swarn->path->nodes[0];
673*4882a593Smuzhiyun 	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
674*4882a593Smuzhiyun 					struct btrfs_inode_item);
675*4882a593Smuzhiyun 	isize = btrfs_inode_size(eb, inode_item);
676*4882a593Smuzhiyun 	nlink = btrfs_inode_nlink(eb, inode_item);
677*4882a593Smuzhiyun 	btrfs_release_path(swarn->path);
678*4882a593Smuzhiyun 
679*4882a593Smuzhiyun 	/*
680*4882a593Smuzhiyun 	 * init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub
681*4882a593Smuzhiyun 	 * uses GFP_NOFS in this context, so we keep it consistent but it does
682*4882a593Smuzhiyun 	 * not seem to be strictly necessary.
683*4882a593Smuzhiyun 	 */
684*4882a593Smuzhiyun 	nofs_flag = memalloc_nofs_save();
685*4882a593Smuzhiyun 	ipath = init_ipath(4096, local_root, swarn->path);
686*4882a593Smuzhiyun 	memalloc_nofs_restore(nofs_flag);
687*4882a593Smuzhiyun 	if (IS_ERR(ipath)) {
688*4882a593Smuzhiyun 		btrfs_put_root(local_root);
689*4882a593Smuzhiyun 		ret = PTR_ERR(ipath);
690*4882a593Smuzhiyun 		ipath = NULL;
691*4882a593Smuzhiyun 		goto err;
692*4882a593Smuzhiyun 	}
693*4882a593Smuzhiyun 	ret = paths_from_inode(inum, ipath);
694*4882a593Smuzhiyun 
695*4882a593Smuzhiyun 	if (ret < 0)
696*4882a593Smuzhiyun 		goto err;
697*4882a593Smuzhiyun 
698*4882a593Smuzhiyun 	/*
699*4882a593Smuzhiyun 	 * we deliberately ignore the bit ipath might have been too small to
700*4882a593Smuzhiyun 	 * hold all of the paths here
701*4882a593Smuzhiyun 	 */
702*4882a593Smuzhiyun 	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
703*4882a593Smuzhiyun 		btrfs_warn_in_rcu(fs_info,
704*4882a593Smuzhiyun "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)",
705*4882a593Smuzhiyun 				  swarn->errstr, swarn->logical,
706*4882a593Smuzhiyun 				  rcu_str_deref(swarn->dev->name),
707*4882a593Smuzhiyun 				  swarn->physical,
708*4882a593Smuzhiyun 				  root, inum, offset,
709*4882a593Smuzhiyun 				  min(isize - offset, (u64)PAGE_SIZE), nlink,
710*4882a593Smuzhiyun 				  (char *)(unsigned long)ipath->fspath->val[i]);
711*4882a593Smuzhiyun 
712*4882a593Smuzhiyun 	btrfs_put_root(local_root);
713*4882a593Smuzhiyun 	free_ipath(ipath);
714*4882a593Smuzhiyun 	return 0;
715*4882a593Smuzhiyun 
716*4882a593Smuzhiyun err:
717*4882a593Smuzhiyun 	btrfs_warn_in_rcu(fs_info,
718*4882a593Smuzhiyun 			  "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
719*4882a593Smuzhiyun 			  swarn->errstr, swarn->logical,
720*4882a593Smuzhiyun 			  rcu_str_deref(swarn->dev->name),
721*4882a593Smuzhiyun 			  swarn->physical,
722*4882a593Smuzhiyun 			  root, inum, offset, ret);
723*4882a593Smuzhiyun 
724*4882a593Smuzhiyun 	free_ipath(ipath);
725*4882a593Smuzhiyun 	return 0;
726*4882a593Smuzhiyun }
727*4882a593Smuzhiyun 
scrub_print_warning(const char * errstr,struct scrub_block * sblock)728*4882a593Smuzhiyun static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
729*4882a593Smuzhiyun {
730*4882a593Smuzhiyun 	struct btrfs_device *dev;
731*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info;
732*4882a593Smuzhiyun 	struct btrfs_path *path;
733*4882a593Smuzhiyun 	struct btrfs_key found_key;
734*4882a593Smuzhiyun 	struct extent_buffer *eb;
735*4882a593Smuzhiyun 	struct btrfs_extent_item *ei;
736*4882a593Smuzhiyun 	struct scrub_warning swarn;
737*4882a593Smuzhiyun 	unsigned long ptr = 0;
738*4882a593Smuzhiyun 	u64 extent_item_pos;
739*4882a593Smuzhiyun 	u64 flags = 0;
740*4882a593Smuzhiyun 	u64 ref_root;
741*4882a593Smuzhiyun 	u32 item_size;
742*4882a593Smuzhiyun 	u8 ref_level = 0;
743*4882a593Smuzhiyun 	int ret;
744*4882a593Smuzhiyun 
745*4882a593Smuzhiyun 	WARN_ON(sblock->page_count < 1);
746*4882a593Smuzhiyun 	dev = sblock->pagev[0]->dev;
747*4882a593Smuzhiyun 	fs_info = sblock->sctx->fs_info;
748*4882a593Smuzhiyun 
749*4882a593Smuzhiyun 	path = btrfs_alloc_path();
750*4882a593Smuzhiyun 	if (!path)
751*4882a593Smuzhiyun 		return;
752*4882a593Smuzhiyun 
753*4882a593Smuzhiyun 	swarn.physical = sblock->pagev[0]->physical;
754*4882a593Smuzhiyun 	swarn.logical = sblock->pagev[0]->logical;
755*4882a593Smuzhiyun 	swarn.errstr = errstr;
756*4882a593Smuzhiyun 	swarn.dev = NULL;
757*4882a593Smuzhiyun 
758*4882a593Smuzhiyun 	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
759*4882a593Smuzhiyun 				  &flags);
760*4882a593Smuzhiyun 	if (ret < 0)
761*4882a593Smuzhiyun 		goto out;
762*4882a593Smuzhiyun 
763*4882a593Smuzhiyun 	extent_item_pos = swarn.logical - found_key.objectid;
764*4882a593Smuzhiyun 	swarn.extent_item_size = found_key.offset;
765*4882a593Smuzhiyun 
766*4882a593Smuzhiyun 	eb = path->nodes[0];
767*4882a593Smuzhiyun 	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
768*4882a593Smuzhiyun 	item_size = btrfs_item_size_nr(eb, path->slots[0]);
769*4882a593Smuzhiyun 
770*4882a593Smuzhiyun 	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
771*4882a593Smuzhiyun 		do {
772*4882a593Smuzhiyun 			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
773*4882a593Smuzhiyun 						      item_size, &ref_root,
774*4882a593Smuzhiyun 						      &ref_level);
775*4882a593Smuzhiyun 			btrfs_warn_in_rcu(fs_info,
776*4882a593Smuzhiyun "%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
777*4882a593Smuzhiyun 				errstr, swarn.logical,
778*4882a593Smuzhiyun 				rcu_str_deref(dev->name),
779*4882a593Smuzhiyun 				swarn.physical,
780*4882a593Smuzhiyun 				ref_level ? "node" : "leaf",
781*4882a593Smuzhiyun 				ret < 0 ? -1 : ref_level,
782*4882a593Smuzhiyun 				ret < 0 ? -1 : ref_root);
783*4882a593Smuzhiyun 		} while (ret != 1);
784*4882a593Smuzhiyun 		btrfs_release_path(path);
785*4882a593Smuzhiyun 	} else {
786*4882a593Smuzhiyun 		btrfs_release_path(path);
787*4882a593Smuzhiyun 		swarn.path = path;
788*4882a593Smuzhiyun 		swarn.dev = dev;
789*4882a593Smuzhiyun 		iterate_extent_inodes(fs_info, found_key.objectid,
790*4882a593Smuzhiyun 					extent_item_pos, 1,
791*4882a593Smuzhiyun 					scrub_print_warning_inode, &swarn, false);
792*4882a593Smuzhiyun 	}
793*4882a593Smuzhiyun 
794*4882a593Smuzhiyun out:
795*4882a593Smuzhiyun 	btrfs_free_path(path);
796*4882a593Smuzhiyun }
797*4882a593Smuzhiyun 
scrub_get_recover(struct scrub_recover * recover)798*4882a593Smuzhiyun static inline void scrub_get_recover(struct scrub_recover *recover)
799*4882a593Smuzhiyun {
800*4882a593Smuzhiyun 	refcount_inc(&recover->refs);
801*4882a593Smuzhiyun }
802*4882a593Smuzhiyun 
scrub_put_recover(struct btrfs_fs_info * fs_info,struct scrub_recover * recover)803*4882a593Smuzhiyun static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
804*4882a593Smuzhiyun 				     struct scrub_recover *recover)
805*4882a593Smuzhiyun {
806*4882a593Smuzhiyun 	if (refcount_dec_and_test(&recover->refs)) {
807*4882a593Smuzhiyun 		btrfs_bio_counter_dec(fs_info);
808*4882a593Smuzhiyun 		btrfs_put_bbio(recover->bbio);
809*4882a593Smuzhiyun 		kfree(recover);
810*4882a593Smuzhiyun 	}
811*4882a593Smuzhiyun }
812*4882a593Smuzhiyun 
813*4882a593Smuzhiyun /*
814*4882a593Smuzhiyun  * scrub_handle_errored_block gets called when either verification of the
815*4882a593Smuzhiyun  * pages failed or the bio failed to read, e.g. with EIO. In the latter
816*4882a593Smuzhiyun  * case, this function handles all pages in the bio, even though only one
817*4882a593Smuzhiyun  * may be bad.
818*4882a593Smuzhiyun  * The goal of this function is to repair the errored block by using the
819*4882a593Smuzhiyun  * contents of one of the mirrors.
820*4882a593Smuzhiyun  */
scrub_handle_errored_block(struct scrub_block * sblock_to_check)821*4882a593Smuzhiyun static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
822*4882a593Smuzhiyun {
823*4882a593Smuzhiyun 	struct scrub_ctx *sctx = sblock_to_check->sctx;
824*4882a593Smuzhiyun 	struct btrfs_device *dev;
825*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info;
826*4882a593Smuzhiyun 	u64 logical;
827*4882a593Smuzhiyun 	unsigned int failed_mirror_index;
828*4882a593Smuzhiyun 	unsigned int is_metadata;
829*4882a593Smuzhiyun 	unsigned int have_csum;
830*4882a593Smuzhiyun 	struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
831*4882a593Smuzhiyun 	struct scrub_block *sblock_bad;
832*4882a593Smuzhiyun 	int ret;
833*4882a593Smuzhiyun 	int mirror_index;
834*4882a593Smuzhiyun 	int page_num;
835*4882a593Smuzhiyun 	int success;
836*4882a593Smuzhiyun 	bool full_stripe_locked;
837*4882a593Smuzhiyun 	unsigned int nofs_flag;
838*4882a593Smuzhiyun 	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
839*4882a593Smuzhiyun 				      DEFAULT_RATELIMIT_BURST);
840*4882a593Smuzhiyun 
841*4882a593Smuzhiyun 	BUG_ON(sblock_to_check->page_count < 1);
842*4882a593Smuzhiyun 	fs_info = sctx->fs_info;
843*4882a593Smuzhiyun 	if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
844*4882a593Smuzhiyun 		/*
845*4882a593Smuzhiyun 		 * if we find an error in a super block, we just report it.
846*4882a593Smuzhiyun 		 * They will get written with the next transaction commit
847*4882a593Smuzhiyun 		 * anyway
848*4882a593Smuzhiyun 		 */
849*4882a593Smuzhiyun 		spin_lock(&sctx->stat_lock);
850*4882a593Smuzhiyun 		++sctx->stat.super_errors;
851*4882a593Smuzhiyun 		spin_unlock(&sctx->stat_lock);
852*4882a593Smuzhiyun 		return 0;
853*4882a593Smuzhiyun 	}
854*4882a593Smuzhiyun 	logical = sblock_to_check->pagev[0]->logical;
855*4882a593Smuzhiyun 	BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
856*4882a593Smuzhiyun 	failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
857*4882a593Smuzhiyun 	is_metadata = !(sblock_to_check->pagev[0]->flags &
858*4882a593Smuzhiyun 			BTRFS_EXTENT_FLAG_DATA);
859*4882a593Smuzhiyun 	have_csum = sblock_to_check->pagev[0]->have_csum;
860*4882a593Smuzhiyun 	dev = sblock_to_check->pagev[0]->dev;
861*4882a593Smuzhiyun 
862*4882a593Smuzhiyun 	/*
863*4882a593Smuzhiyun 	 * We must use GFP_NOFS because the scrub task might be waiting for a
864*4882a593Smuzhiyun 	 * worker task executing this function and in turn a transaction commit
865*4882a593Smuzhiyun 	 * might be waiting the scrub task to pause (which needs to wait for all
866*4882a593Smuzhiyun 	 * the worker tasks to complete before pausing).
867*4882a593Smuzhiyun 	 * We do allocations in the workers through insert_full_stripe_lock()
868*4882a593Smuzhiyun 	 * and scrub_add_page_to_wr_bio(), which happens down the call chain of
869*4882a593Smuzhiyun 	 * this function.
870*4882a593Smuzhiyun 	 */
871*4882a593Smuzhiyun 	nofs_flag = memalloc_nofs_save();
872*4882a593Smuzhiyun 	/*
873*4882a593Smuzhiyun 	 * For RAID5/6, race can happen for a different device scrub thread.
874*4882a593Smuzhiyun 	 * For data corruption, Parity and Data threads will both try
875*4882a593Smuzhiyun 	 * to recovery the data.
876*4882a593Smuzhiyun 	 * Race can lead to doubly added csum error, or even unrecoverable
877*4882a593Smuzhiyun 	 * error.
878*4882a593Smuzhiyun 	 */
879*4882a593Smuzhiyun 	ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
880*4882a593Smuzhiyun 	if (ret < 0) {
881*4882a593Smuzhiyun 		memalloc_nofs_restore(nofs_flag);
882*4882a593Smuzhiyun 		spin_lock(&sctx->stat_lock);
883*4882a593Smuzhiyun 		if (ret == -ENOMEM)
884*4882a593Smuzhiyun 			sctx->stat.malloc_errors++;
885*4882a593Smuzhiyun 		sctx->stat.read_errors++;
886*4882a593Smuzhiyun 		sctx->stat.uncorrectable_errors++;
887*4882a593Smuzhiyun 		spin_unlock(&sctx->stat_lock);
888*4882a593Smuzhiyun 		return ret;
889*4882a593Smuzhiyun 	}
890*4882a593Smuzhiyun 
891*4882a593Smuzhiyun 	/*
892*4882a593Smuzhiyun 	 * read all mirrors one after the other. This includes to
893*4882a593Smuzhiyun 	 * re-read the extent or metadata block that failed (that was
894*4882a593Smuzhiyun 	 * the cause that this fixup code is called) another time,
895*4882a593Smuzhiyun 	 * page by page this time in order to know which pages
896*4882a593Smuzhiyun 	 * caused I/O errors and which ones are good (for all mirrors).
897*4882a593Smuzhiyun 	 * It is the goal to handle the situation when more than one
898*4882a593Smuzhiyun 	 * mirror contains I/O errors, but the errors do not
899*4882a593Smuzhiyun 	 * overlap, i.e. the data can be repaired by selecting the
900*4882a593Smuzhiyun 	 * pages from those mirrors without I/O error on the
901*4882a593Smuzhiyun 	 * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
902*4882a593Smuzhiyun 	 * would be that mirror #1 has an I/O error on the first page,
903*4882a593Smuzhiyun 	 * the second page is good, and mirror #2 has an I/O error on
904*4882a593Smuzhiyun 	 * the second page, but the first page is good.
905*4882a593Smuzhiyun 	 * Then the first page of the first mirror can be repaired by
906*4882a593Smuzhiyun 	 * taking the first page of the second mirror, and the
907*4882a593Smuzhiyun 	 * second page of the second mirror can be repaired by
908*4882a593Smuzhiyun 	 * copying the contents of the 2nd page of the 1st mirror.
909*4882a593Smuzhiyun 	 * One more note: if the pages of one mirror contain I/O
910*4882a593Smuzhiyun 	 * errors, the checksum cannot be verified. In order to get
911*4882a593Smuzhiyun 	 * the best data for repairing, the first attempt is to find
912*4882a593Smuzhiyun 	 * a mirror without I/O errors and with a validated checksum.
913*4882a593Smuzhiyun 	 * Only if this is not possible, the pages are picked from
914*4882a593Smuzhiyun 	 * mirrors with I/O errors without considering the checksum.
915*4882a593Smuzhiyun 	 * If the latter is the case, at the end, the checksum of the
916*4882a593Smuzhiyun 	 * repaired area is verified in order to correctly maintain
917*4882a593Smuzhiyun 	 * the statistics.
918*4882a593Smuzhiyun 	 */
919*4882a593Smuzhiyun 
920*4882a593Smuzhiyun 	sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
921*4882a593Smuzhiyun 				      sizeof(*sblocks_for_recheck), GFP_KERNEL);
922*4882a593Smuzhiyun 	if (!sblocks_for_recheck) {
923*4882a593Smuzhiyun 		spin_lock(&sctx->stat_lock);
924*4882a593Smuzhiyun 		sctx->stat.malloc_errors++;
925*4882a593Smuzhiyun 		sctx->stat.read_errors++;
926*4882a593Smuzhiyun 		sctx->stat.uncorrectable_errors++;
927*4882a593Smuzhiyun 		spin_unlock(&sctx->stat_lock);
928*4882a593Smuzhiyun 		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
929*4882a593Smuzhiyun 		goto out;
930*4882a593Smuzhiyun 	}
931*4882a593Smuzhiyun 
932*4882a593Smuzhiyun 	/* setup the context, map the logical blocks and alloc the pages */
933*4882a593Smuzhiyun 	ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
934*4882a593Smuzhiyun 	if (ret) {
935*4882a593Smuzhiyun 		spin_lock(&sctx->stat_lock);
936*4882a593Smuzhiyun 		sctx->stat.read_errors++;
937*4882a593Smuzhiyun 		sctx->stat.uncorrectable_errors++;
938*4882a593Smuzhiyun 		spin_unlock(&sctx->stat_lock);
939*4882a593Smuzhiyun 		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
940*4882a593Smuzhiyun 		goto out;
941*4882a593Smuzhiyun 	}
942*4882a593Smuzhiyun 	BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
943*4882a593Smuzhiyun 	sblock_bad = sblocks_for_recheck + failed_mirror_index;
944*4882a593Smuzhiyun 
945*4882a593Smuzhiyun 	/* build and submit the bios for the failed mirror, check checksums */
946*4882a593Smuzhiyun 	scrub_recheck_block(fs_info, sblock_bad, 1);
947*4882a593Smuzhiyun 
948*4882a593Smuzhiyun 	if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
949*4882a593Smuzhiyun 	    sblock_bad->no_io_error_seen) {
950*4882a593Smuzhiyun 		/*
951*4882a593Smuzhiyun 		 * the error disappeared after reading page by page, or
952*4882a593Smuzhiyun 		 * the area was part of a huge bio and other parts of the
953*4882a593Smuzhiyun 		 * bio caused I/O errors, or the block layer merged several
954*4882a593Smuzhiyun 		 * read requests into one and the error is caused by a
955*4882a593Smuzhiyun 		 * different bio (usually one of the two latter cases is
956*4882a593Smuzhiyun 		 * the cause)
957*4882a593Smuzhiyun 		 */
958*4882a593Smuzhiyun 		spin_lock(&sctx->stat_lock);
959*4882a593Smuzhiyun 		sctx->stat.unverified_errors++;
960*4882a593Smuzhiyun 		sblock_to_check->data_corrected = 1;
961*4882a593Smuzhiyun 		spin_unlock(&sctx->stat_lock);
962*4882a593Smuzhiyun 
963*4882a593Smuzhiyun 		if (sctx->is_dev_replace)
964*4882a593Smuzhiyun 			scrub_write_block_to_dev_replace(sblock_bad);
965*4882a593Smuzhiyun 		goto out;
966*4882a593Smuzhiyun 	}
967*4882a593Smuzhiyun 
968*4882a593Smuzhiyun 	if (!sblock_bad->no_io_error_seen) {
969*4882a593Smuzhiyun 		spin_lock(&sctx->stat_lock);
970*4882a593Smuzhiyun 		sctx->stat.read_errors++;
971*4882a593Smuzhiyun 		spin_unlock(&sctx->stat_lock);
972*4882a593Smuzhiyun 		if (__ratelimit(&rs))
973*4882a593Smuzhiyun 			scrub_print_warning("i/o error", sblock_to_check);
974*4882a593Smuzhiyun 		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
975*4882a593Smuzhiyun 	} else if (sblock_bad->checksum_error) {
976*4882a593Smuzhiyun 		spin_lock(&sctx->stat_lock);
977*4882a593Smuzhiyun 		sctx->stat.csum_errors++;
978*4882a593Smuzhiyun 		spin_unlock(&sctx->stat_lock);
979*4882a593Smuzhiyun 		if (__ratelimit(&rs))
980*4882a593Smuzhiyun 			scrub_print_warning("checksum error", sblock_to_check);
981*4882a593Smuzhiyun 		btrfs_dev_stat_inc_and_print(dev,
982*4882a593Smuzhiyun 					     BTRFS_DEV_STAT_CORRUPTION_ERRS);
983*4882a593Smuzhiyun 	} else if (sblock_bad->header_error) {
984*4882a593Smuzhiyun 		spin_lock(&sctx->stat_lock);
985*4882a593Smuzhiyun 		sctx->stat.verify_errors++;
986*4882a593Smuzhiyun 		spin_unlock(&sctx->stat_lock);
987*4882a593Smuzhiyun 		if (__ratelimit(&rs))
988*4882a593Smuzhiyun 			scrub_print_warning("checksum/header error",
989*4882a593Smuzhiyun 					    sblock_to_check);
990*4882a593Smuzhiyun 		if (sblock_bad->generation_error)
991*4882a593Smuzhiyun 			btrfs_dev_stat_inc_and_print(dev,
992*4882a593Smuzhiyun 				BTRFS_DEV_STAT_GENERATION_ERRS);
993*4882a593Smuzhiyun 		else
994*4882a593Smuzhiyun 			btrfs_dev_stat_inc_and_print(dev,
995*4882a593Smuzhiyun 				BTRFS_DEV_STAT_CORRUPTION_ERRS);
996*4882a593Smuzhiyun 	}
997*4882a593Smuzhiyun 
998*4882a593Smuzhiyun 	if (sctx->readonly) {
999*4882a593Smuzhiyun 		ASSERT(!sctx->is_dev_replace);
1000*4882a593Smuzhiyun 		goto out;
1001*4882a593Smuzhiyun 	}
1002*4882a593Smuzhiyun 
1003*4882a593Smuzhiyun 	/*
1004*4882a593Smuzhiyun 	 * now build and submit the bios for the other mirrors, check
1005*4882a593Smuzhiyun 	 * checksums.
1006*4882a593Smuzhiyun 	 * First try to pick the mirror which is completely without I/O
1007*4882a593Smuzhiyun 	 * errors and also does not have a checksum error.
1008*4882a593Smuzhiyun 	 * If one is found, and if a checksum is present, the full block
1009*4882a593Smuzhiyun 	 * that is known to contain an error is rewritten. Afterwards
1010*4882a593Smuzhiyun 	 * the block is known to be corrected.
1011*4882a593Smuzhiyun 	 * If a mirror is found which is completely correct, and no
1012*4882a593Smuzhiyun 	 * checksum is present, only those pages are rewritten that had
1013*4882a593Smuzhiyun 	 * an I/O error in the block to be repaired, since it cannot be
1014*4882a593Smuzhiyun 	 * determined, which copy of the other pages is better (and it
1015*4882a593Smuzhiyun 	 * could happen otherwise that a correct page would be
1016*4882a593Smuzhiyun 	 * overwritten by a bad one).
1017*4882a593Smuzhiyun 	 */
1018*4882a593Smuzhiyun 	for (mirror_index = 0; ;mirror_index++) {
1019*4882a593Smuzhiyun 		struct scrub_block *sblock_other;
1020*4882a593Smuzhiyun 
1021*4882a593Smuzhiyun 		if (mirror_index == failed_mirror_index)
1022*4882a593Smuzhiyun 			continue;
1023*4882a593Smuzhiyun 
1024*4882a593Smuzhiyun 		/* raid56's mirror can be more than BTRFS_MAX_MIRRORS */
1025*4882a593Smuzhiyun 		if (!scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
1026*4882a593Smuzhiyun 			if (mirror_index >= BTRFS_MAX_MIRRORS)
1027*4882a593Smuzhiyun 				break;
1028*4882a593Smuzhiyun 			if (!sblocks_for_recheck[mirror_index].page_count)
1029*4882a593Smuzhiyun 				break;
1030*4882a593Smuzhiyun 
1031*4882a593Smuzhiyun 			sblock_other = sblocks_for_recheck + mirror_index;
1032*4882a593Smuzhiyun 		} else {
1033*4882a593Smuzhiyun 			struct scrub_recover *r = sblock_bad->pagev[0]->recover;
1034*4882a593Smuzhiyun 			int max_allowed = r->bbio->num_stripes -
1035*4882a593Smuzhiyun 						r->bbio->num_tgtdevs;
1036*4882a593Smuzhiyun 
1037*4882a593Smuzhiyun 			if (mirror_index >= max_allowed)
1038*4882a593Smuzhiyun 				break;
1039*4882a593Smuzhiyun 			if (!sblocks_for_recheck[1].page_count)
1040*4882a593Smuzhiyun 				break;
1041*4882a593Smuzhiyun 
1042*4882a593Smuzhiyun 			ASSERT(failed_mirror_index == 0);
1043*4882a593Smuzhiyun 			sblock_other = sblocks_for_recheck + 1;
1044*4882a593Smuzhiyun 			sblock_other->pagev[0]->mirror_num = 1 + mirror_index;
1045*4882a593Smuzhiyun 		}
1046*4882a593Smuzhiyun 
1047*4882a593Smuzhiyun 		/* build and submit the bios, check checksums */
1048*4882a593Smuzhiyun 		scrub_recheck_block(fs_info, sblock_other, 0);
1049*4882a593Smuzhiyun 
1050*4882a593Smuzhiyun 		if (!sblock_other->header_error &&
1051*4882a593Smuzhiyun 		    !sblock_other->checksum_error &&
1052*4882a593Smuzhiyun 		    sblock_other->no_io_error_seen) {
1053*4882a593Smuzhiyun 			if (sctx->is_dev_replace) {
1054*4882a593Smuzhiyun 				scrub_write_block_to_dev_replace(sblock_other);
1055*4882a593Smuzhiyun 				goto corrected_error;
1056*4882a593Smuzhiyun 			} else {
1057*4882a593Smuzhiyun 				ret = scrub_repair_block_from_good_copy(
1058*4882a593Smuzhiyun 						sblock_bad, sblock_other);
1059*4882a593Smuzhiyun 				if (!ret)
1060*4882a593Smuzhiyun 					goto corrected_error;
1061*4882a593Smuzhiyun 			}
1062*4882a593Smuzhiyun 		}
1063*4882a593Smuzhiyun 	}
1064*4882a593Smuzhiyun 
1065*4882a593Smuzhiyun 	if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1066*4882a593Smuzhiyun 		goto did_not_correct_error;
1067*4882a593Smuzhiyun 
1068*4882a593Smuzhiyun 	/*
1069*4882a593Smuzhiyun 	 * In case of I/O errors in the area that is supposed to be
1070*4882a593Smuzhiyun 	 * repaired, continue by picking good copies of those pages.
1071*4882a593Smuzhiyun 	 * Select the good pages from mirrors to rewrite bad pages from
1072*4882a593Smuzhiyun 	 * the area to fix. Afterwards verify the checksum of the block
1073*4882a593Smuzhiyun 	 * that is supposed to be repaired. This verification step is
1074*4882a593Smuzhiyun 	 * only done for the purpose of statistic counting and for the
1075*4882a593Smuzhiyun 	 * final scrub report, whether errors remain.
1076*4882a593Smuzhiyun 	 * A perfect algorithm could make use of the checksum and try
1077*4882a593Smuzhiyun 	 * all possible combinations of pages from the different mirrors
1078*4882a593Smuzhiyun 	 * until the checksum verification succeeds. For example, when
1079*4882a593Smuzhiyun 	 * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
1080*4882a593Smuzhiyun 	 * of mirror #2 is readable but the final checksum test fails,
1081*4882a593Smuzhiyun 	 * then the 2nd page of mirror #3 could be tried, whether now
1082*4882a593Smuzhiyun 	 * the final checksum succeeds. But this would be a rare
1083*4882a593Smuzhiyun 	 * exception and is therefore not implemented. At least it is
1084*4882a593Smuzhiyun 	 * avoided that the good copy is overwritten.
1085*4882a593Smuzhiyun 	 * A more useful improvement would be to pick the sectors
1086*4882a593Smuzhiyun 	 * without I/O error based on sector sizes (512 bytes on legacy
1087*4882a593Smuzhiyun 	 * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one
1088*4882a593Smuzhiyun 	 * mirror could be repaired by taking 512 byte of a different
1089*4882a593Smuzhiyun 	 * mirror, even if other 512 byte sectors in the same PAGE_SIZE
1090*4882a593Smuzhiyun 	 * area are unreadable.
1091*4882a593Smuzhiyun 	 */
1092*4882a593Smuzhiyun 	success = 1;
1093*4882a593Smuzhiyun 	for (page_num = 0; page_num < sblock_bad->page_count;
1094*4882a593Smuzhiyun 	     page_num++) {
1095*4882a593Smuzhiyun 		struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1096*4882a593Smuzhiyun 		struct scrub_block *sblock_other = NULL;
1097*4882a593Smuzhiyun 
1098*4882a593Smuzhiyun 		/* skip no-io-error page in scrub */
1099*4882a593Smuzhiyun 		if (!page_bad->io_error && !sctx->is_dev_replace)
1100*4882a593Smuzhiyun 			continue;
1101*4882a593Smuzhiyun 
1102*4882a593Smuzhiyun 		if (scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
1103*4882a593Smuzhiyun 			/*
1104*4882a593Smuzhiyun 			 * In case of dev replace, if raid56 rebuild process
1105*4882a593Smuzhiyun 			 * didn't work out correct data, then copy the content
1106*4882a593Smuzhiyun 			 * in sblock_bad to make sure target device is identical
1107*4882a593Smuzhiyun 			 * to source device, instead of writing garbage data in
1108*4882a593Smuzhiyun 			 * sblock_for_recheck array to target device.
1109*4882a593Smuzhiyun 			 */
1110*4882a593Smuzhiyun 			sblock_other = NULL;
1111*4882a593Smuzhiyun 		} else if (page_bad->io_error) {
1112*4882a593Smuzhiyun 			/* try to find no-io-error page in mirrors */
1113*4882a593Smuzhiyun 			for (mirror_index = 0;
1114*4882a593Smuzhiyun 			     mirror_index < BTRFS_MAX_MIRRORS &&
1115*4882a593Smuzhiyun 			     sblocks_for_recheck[mirror_index].page_count > 0;
1116*4882a593Smuzhiyun 			     mirror_index++) {
1117*4882a593Smuzhiyun 				if (!sblocks_for_recheck[mirror_index].
1118*4882a593Smuzhiyun 				    pagev[page_num]->io_error) {
1119*4882a593Smuzhiyun 					sblock_other = sblocks_for_recheck +
1120*4882a593Smuzhiyun 						       mirror_index;
1121*4882a593Smuzhiyun 					break;
1122*4882a593Smuzhiyun 				}
1123*4882a593Smuzhiyun 			}
1124*4882a593Smuzhiyun 			if (!sblock_other)
1125*4882a593Smuzhiyun 				success = 0;
1126*4882a593Smuzhiyun 		}
1127*4882a593Smuzhiyun 
1128*4882a593Smuzhiyun 		if (sctx->is_dev_replace) {
1129*4882a593Smuzhiyun 			/*
1130*4882a593Smuzhiyun 			 * did not find a mirror to fetch the page
1131*4882a593Smuzhiyun 			 * from. scrub_write_page_to_dev_replace()
1132*4882a593Smuzhiyun 			 * handles this case (page->io_error), by
1133*4882a593Smuzhiyun 			 * filling the block with zeros before
1134*4882a593Smuzhiyun 			 * submitting the write request
1135*4882a593Smuzhiyun 			 */
1136*4882a593Smuzhiyun 			if (!sblock_other)
1137*4882a593Smuzhiyun 				sblock_other = sblock_bad;
1138*4882a593Smuzhiyun 
1139*4882a593Smuzhiyun 			if (scrub_write_page_to_dev_replace(sblock_other,
1140*4882a593Smuzhiyun 							    page_num) != 0) {
1141*4882a593Smuzhiyun 				atomic64_inc(
1142*4882a593Smuzhiyun 					&fs_info->dev_replace.num_write_errors);
1143*4882a593Smuzhiyun 				success = 0;
1144*4882a593Smuzhiyun 			}
1145*4882a593Smuzhiyun 		} else if (sblock_other) {
1146*4882a593Smuzhiyun 			ret = scrub_repair_page_from_good_copy(sblock_bad,
1147*4882a593Smuzhiyun 							       sblock_other,
1148*4882a593Smuzhiyun 							       page_num, 0);
1149*4882a593Smuzhiyun 			if (0 == ret)
1150*4882a593Smuzhiyun 				page_bad->io_error = 0;
1151*4882a593Smuzhiyun 			else
1152*4882a593Smuzhiyun 				success = 0;
1153*4882a593Smuzhiyun 		}
1154*4882a593Smuzhiyun 	}
1155*4882a593Smuzhiyun 
1156*4882a593Smuzhiyun 	if (success && !sctx->is_dev_replace) {
1157*4882a593Smuzhiyun 		if (is_metadata || have_csum) {
1158*4882a593Smuzhiyun 			/*
1159*4882a593Smuzhiyun 			 * need to verify the checksum now that all
1160*4882a593Smuzhiyun 			 * sectors on disk are repaired (the write
1161*4882a593Smuzhiyun 			 * request for data to be repaired is on its way).
1162*4882a593Smuzhiyun 			 * Just be lazy and use scrub_recheck_block()
1163*4882a593Smuzhiyun 			 * which re-reads the data before the checksum
1164*4882a593Smuzhiyun 			 * is verified, but most likely the data comes out
1165*4882a593Smuzhiyun 			 * of the page cache.
1166*4882a593Smuzhiyun 			 */
1167*4882a593Smuzhiyun 			scrub_recheck_block(fs_info, sblock_bad, 1);
1168*4882a593Smuzhiyun 			if (!sblock_bad->header_error &&
1169*4882a593Smuzhiyun 			    !sblock_bad->checksum_error &&
1170*4882a593Smuzhiyun 			    sblock_bad->no_io_error_seen)
1171*4882a593Smuzhiyun 				goto corrected_error;
1172*4882a593Smuzhiyun 			else
1173*4882a593Smuzhiyun 				goto did_not_correct_error;
1174*4882a593Smuzhiyun 		} else {
1175*4882a593Smuzhiyun corrected_error:
1176*4882a593Smuzhiyun 			spin_lock(&sctx->stat_lock);
1177*4882a593Smuzhiyun 			sctx->stat.corrected_errors++;
1178*4882a593Smuzhiyun 			sblock_to_check->data_corrected = 1;
1179*4882a593Smuzhiyun 			spin_unlock(&sctx->stat_lock);
1180*4882a593Smuzhiyun 			btrfs_err_rl_in_rcu(fs_info,
1181*4882a593Smuzhiyun 				"fixed up error at logical %llu on dev %s",
1182*4882a593Smuzhiyun 				logical, rcu_str_deref(dev->name));
1183*4882a593Smuzhiyun 		}
1184*4882a593Smuzhiyun 	} else {
1185*4882a593Smuzhiyun did_not_correct_error:
1186*4882a593Smuzhiyun 		spin_lock(&sctx->stat_lock);
1187*4882a593Smuzhiyun 		sctx->stat.uncorrectable_errors++;
1188*4882a593Smuzhiyun 		spin_unlock(&sctx->stat_lock);
1189*4882a593Smuzhiyun 		btrfs_err_rl_in_rcu(fs_info,
1190*4882a593Smuzhiyun 			"unable to fixup (regular) error at logical %llu on dev %s",
1191*4882a593Smuzhiyun 			logical, rcu_str_deref(dev->name));
1192*4882a593Smuzhiyun 	}
1193*4882a593Smuzhiyun 
1194*4882a593Smuzhiyun out:
1195*4882a593Smuzhiyun 	if (sblocks_for_recheck) {
1196*4882a593Smuzhiyun 		for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1197*4882a593Smuzhiyun 		     mirror_index++) {
1198*4882a593Smuzhiyun 			struct scrub_block *sblock = sblocks_for_recheck +
1199*4882a593Smuzhiyun 						     mirror_index;
1200*4882a593Smuzhiyun 			struct scrub_recover *recover;
1201*4882a593Smuzhiyun 			int page_index;
1202*4882a593Smuzhiyun 
1203*4882a593Smuzhiyun 			for (page_index = 0; page_index < sblock->page_count;
1204*4882a593Smuzhiyun 			     page_index++) {
1205*4882a593Smuzhiyun 				sblock->pagev[page_index]->sblock = NULL;
1206*4882a593Smuzhiyun 				recover = sblock->pagev[page_index]->recover;
1207*4882a593Smuzhiyun 				if (recover) {
1208*4882a593Smuzhiyun 					scrub_put_recover(fs_info, recover);
1209*4882a593Smuzhiyun 					sblock->pagev[page_index]->recover =
1210*4882a593Smuzhiyun 									NULL;
1211*4882a593Smuzhiyun 				}
1212*4882a593Smuzhiyun 				scrub_page_put(sblock->pagev[page_index]);
1213*4882a593Smuzhiyun 			}
1214*4882a593Smuzhiyun 		}
1215*4882a593Smuzhiyun 		kfree(sblocks_for_recheck);
1216*4882a593Smuzhiyun 	}
1217*4882a593Smuzhiyun 
1218*4882a593Smuzhiyun 	ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
1219*4882a593Smuzhiyun 	memalloc_nofs_restore(nofs_flag);
1220*4882a593Smuzhiyun 	if (ret < 0)
1221*4882a593Smuzhiyun 		return ret;
1222*4882a593Smuzhiyun 	return 0;
1223*4882a593Smuzhiyun }
1224*4882a593Smuzhiyun 
scrub_nr_raid_mirrors(struct btrfs_bio * bbio)1225*4882a593Smuzhiyun static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
1226*4882a593Smuzhiyun {
1227*4882a593Smuzhiyun 	if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
1228*4882a593Smuzhiyun 		return 2;
1229*4882a593Smuzhiyun 	else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
1230*4882a593Smuzhiyun 		return 3;
1231*4882a593Smuzhiyun 	else
1232*4882a593Smuzhiyun 		return (int)bbio->num_stripes;
1233*4882a593Smuzhiyun }
1234*4882a593Smuzhiyun 
scrub_stripe_index_and_offset(u64 logical,u64 map_type,u64 * raid_map,u64 mapped_length,int nstripes,int mirror,int * stripe_index,u64 * stripe_offset)1235*4882a593Smuzhiyun static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1236*4882a593Smuzhiyun 						 u64 *raid_map,
1237*4882a593Smuzhiyun 						 u64 mapped_length,
1238*4882a593Smuzhiyun 						 int nstripes, int mirror,
1239*4882a593Smuzhiyun 						 int *stripe_index,
1240*4882a593Smuzhiyun 						 u64 *stripe_offset)
1241*4882a593Smuzhiyun {
1242*4882a593Smuzhiyun 	int i;
1243*4882a593Smuzhiyun 
1244*4882a593Smuzhiyun 	if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1245*4882a593Smuzhiyun 		/* RAID5/6 */
1246*4882a593Smuzhiyun 		for (i = 0; i < nstripes; i++) {
1247*4882a593Smuzhiyun 			if (raid_map[i] == RAID6_Q_STRIPE ||
1248*4882a593Smuzhiyun 			    raid_map[i] == RAID5_P_STRIPE)
1249*4882a593Smuzhiyun 				continue;
1250*4882a593Smuzhiyun 
1251*4882a593Smuzhiyun 			if (logical >= raid_map[i] &&
1252*4882a593Smuzhiyun 			    logical < raid_map[i] + mapped_length)
1253*4882a593Smuzhiyun 				break;
1254*4882a593Smuzhiyun 		}
1255*4882a593Smuzhiyun 
1256*4882a593Smuzhiyun 		*stripe_index = i;
1257*4882a593Smuzhiyun 		*stripe_offset = logical - raid_map[i];
1258*4882a593Smuzhiyun 	} else {
1259*4882a593Smuzhiyun 		/* The other RAID type */
1260*4882a593Smuzhiyun 		*stripe_index = mirror;
1261*4882a593Smuzhiyun 		*stripe_offset = 0;
1262*4882a593Smuzhiyun 	}
1263*4882a593Smuzhiyun }
1264*4882a593Smuzhiyun 
scrub_setup_recheck_block(struct scrub_block * original_sblock,struct scrub_block * sblocks_for_recheck)1265*4882a593Smuzhiyun static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
1266*4882a593Smuzhiyun 				     struct scrub_block *sblocks_for_recheck)
1267*4882a593Smuzhiyun {
1268*4882a593Smuzhiyun 	struct scrub_ctx *sctx = original_sblock->sctx;
1269*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = sctx->fs_info;
1270*4882a593Smuzhiyun 	u64 length = original_sblock->page_count * PAGE_SIZE;
1271*4882a593Smuzhiyun 	u64 logical = original_sblock->pagev[0]->logical;
1272*4882a593Smuzhiyun 	u64 generation = original_sblock->pagev[0]->generation;
1273*4882a593Smuzhiyun 	u64 flags = original_sblock->pagev[0]->flags;
1274*4882a593Smuzhiyun 	u64 have_csum = original_sblock->pagev[0]->have_csum;
1275*4882a593Smuzhiyun 	struct scrub_recover *recover;
1276*4882a593Smuzhiyun 	struct btrfs_bio *bbio;
1277*4882a593Smuzhiyun 	u64 sublen;
1278*4882a593Smuzhiyun 	u64 mapped_length;
1279*4882a593Smuzhiyun 	u64 stripe_offset;
1280*4882a593Smuzhiyun 	int stripe_index;
1281*4882a593Smuzhiyun 	int page_index = 0;
1282*4882a593Smuzhiyun 	int mirror_index;
1283*4882a593Smuzhiyun 	int nmirrors;
1284*4882a593Smuzhiyun 	int ret;
1285*4882a593Smuzhiyun 
1286*4882a593Smuzhiyun 	/*
1287*4882a593Smuzhiyun 	 * note: the two members refs and outstanding_pages
1288*4882a593Smuzhiyun 	 * are not used (and not set) in the blocks that are used for
1289*4882a593Smuzhiyun 	 * the recheck procedure
1290*4882a593Smuzhiyun 	 */
1291*4882a593Smuzhiyun 
1292*4882a593Smuzhiyun 	while (length > 0) {
1293*4882a593Smuzhiyun 		sublen = min_t(u64, length, PAGE_SIZE);
1294*4882a593Smuzhiyun 		mapped_length = sublen;
1295*4882a593Smuzhiyun 		bbio = NULL;
1296*4882a593Smuzhiyun 
1297*4882a593Smuzhiyun 		/*
1298*4882a593Smuzhiyun 		 * with a length of PAGE_SIZE, each returned stripe
1299*4882a593Smuzhiyun 		 * represents one mirror
1300*4882a593Smuzhiyun 		 */
1301*4882a593Smuzhiyun 		btrfs_bio_counter_inc_blocked(fs_info);
1302*4882a593Smuzhiyun 		ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
1303*4882a593Smuzhiyun 				logical, &mapped_length, &bbio);
1304*4882a593Smuzhiyun 		if (ret || !bbio || mapped_length < sublen) {
1305*4882a593Smuzhiyun 			btrfs_put_bbio(bbio);
1306*4882a593Smuzhiyun 			btrfs_bio_counter_dec(fs_info);
1307*4882a593Smuzhiyun 			return -EIO;
1308*4882a593Smuzhiyun 		}
1309*4882a593Smuzhiyun 
1310*4882a593Smuzhiyun 		recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1311*4882a593Smuzhiyun 		if (!recover) {
1312*4882a593Smuzhiyun 			btrfs_put_bbio(bbio);
1313*4882a593Smuzhiyun 			btrfs_bio_counter_dec(fs_info);
1314*4882a593Smuzhiyun 			return -ENOMEM;
1315*4882a593Smuzhiyun 		}
1316*4882a593Smuzhiyun 
1317*4882a593Smuzhiyun 		refcount_set(&recover->refs, 1);
1318*4882a593Smuzhiyun 		recover->bbio = bbio;
1319*4882a593Smuzhiyun 		recover->map_length = mapped_length;
1320*4882a593Smuzhiyun 
1321*4882a593Smuzhiyun 		BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK);
1322*4882a593Smuzhiyun 
1323*4882a593Smuzhiyun 		nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
1324*4882a593Smuzhiyun 
1325*4882a593Smuzhiyun 		for (mirror_index = 0; mirror_index < nmirrors;
1326*4882a593Smuzhiyun 		     mirror_index++) {
1327*4882a593Smuzhiyun 			struct scrub_block *sblock;
1328*4882a593Smuzhiyun 			struct scrub_page *page;
1329*4882a593Smuzhiyun 
1330*4882a593Smuzhiyun 			sblock = sblocks_for_recheck + mirror_index;
1331*4882a593Smuzhiyun 			sblock->sctx = sctx;
1332*4882a593Smuzhiyun 
1333*4882a593Smuzhiyun 			page = kzalloc(sizeof(*page), GFP_NOFS);
1334*4882a593Smuzhiyun 			if (!page) {
1335*4882a593Smuzhiyun leave_nomem:
1336*4882a593Smuzhiyun 				spin_lock(&sctx->stat_lock);
1337*4882a593Smuzhiyun 				sctx->stat.malloc_errors++;
1338*4882a593Smuzhiyun 				spin_unlock(&sctx->stat_lock);
1339*4882a593Smuzhiyun 				scrub_put_recover(fs_info, recover);
1340*4882a593Smuzhiyun 				return -ENOMEM;
1341*4882a593Smuzhiyun 			}
1342*4882a593Smuzhiyun 			scrub_page_get(page);
1343*4882a593Smuzhiyun 			sblock->pagev[page_index] = page;
1344*4882a593Smuzhiyun 			page->sblock = sblock;
1345*4882a593Smuzhiyun 			page->flags = flags;
1346*4882a593Smuzhiyun 			page->generation = generation;
1347*4882a593Smuzhiyun 			page->logical = logical;
1348*4882a593Smuzhiyun 			page->have_csum = have_csum;
1349*4882a593Smuzhiyun 			if (have_csum)
1350*4882a593Smuzhiyun 				memcpy(page->csum,
1351*4882a593Smuzhiyun 				       original_sblock->pagev[0]->csum,
1352*4882a593Smuzhiyun 				       sctx->csum_size);
1353*4882a593Smuzhiyun 
1354*4882a593Smuzhiyun 			scrub_stripe_index_and_offset(logical,
1355*4882a593Smuzhiyun 						      bbio->map_type,
1356*4882a593Smuzhiyun 						      bbio->raid_map,
1357*4882a593Smuzhiyun 						      mapped_length,
1358*4882a593Smuzhiyun 						      bbio->num_stripes -
1359*4882a593Smuzhiyun 						      bbio->num_tgtdevs,
1360*4882a593Smuzhiyun 						      mirror_index,
1361*4882a593Smuzhiyun 						      &stripe_index,
1362*4882a593Smuzhiyun 						      &stripe_offset);
1363*4882a593Smuzhiyun 			page->physical = bbio->stripes[stripe_index].physical +
1364*4882a593Smuzhiyun 					 stripe_offset;
1365*4882a593Smuzhiyun 			page->dev = bbio->stripes[stripe_index].dev;
1366*4882a593Smuzhiyun 
1367*4882a593Smuzhiyun 			BUG_ON(page_index >= original_sblock->page_count);
1368*4882a593Smuzhiyun 			page->physical_for_dev_replace =
1369*4882a593Smuzhiyun 				original_sblock->pagev[page_index]->
1370*4882a593Smuzhiyun 				physical_for_dev_replace;
1371*4882a593Smuzhiyun 			/* for missing devices, dev->bdev is NULL */
1372*4882a593Smuzhiyun 			page->mirror_num = mirror_index + 1;
1373*4882a593Smuzhiyun 			sblock->page_count++;
1374*4882a593Smuzhiyun 			page->page = alloc_page(GFP_NOFS);
1375*4882a593Smuzhiyun 			if (!page->page)
1376*4882a593Smuzhiyun 				goto leave_nomem;
1377*4882a593Smuzhiyun 
1378*4882a593Smuzhiyun 			scrub_get_recover(recover);
1379*4882a593Smuzhiyun 			page->recover = recover;
1380*4882a593Smuzhiyun 		}
1381*4882a593Smuzhiyun 		scrub_put_recover(fs_info, recover);
1382*4882a593Smuzhiyun 		length -= sublen;
1383*4882a593Smuzhiyun 		logical += sublen;
1384*4882a593Smuzhiyun 		page_index++;
1385*4882a593Smuzhiyun 	}
1386*4882a593Smuzhiyun 
1387*4882a593Smuzhiyun 	return 0;
1388*4882a593Smuzhiyun }
1389*4882a593Smuzhiyun 
scrub_bio_wait_endio(struct bio * bio)1390*4882a593Smuzhiyun static void scrub_bio_wait_endio(struct bio *bio)
1391*4882a593Smuzhiyun {
1392*4882a593Smuzhiyun 	complete(bio->bi_private);
1393*4882a593Smuzhiyun }
1394*4882a593Smuzhiyun 
scrub_submit_raid56_bio_wait(struct btrfs_fs_info * fs_info,struct bio * bio,struct scrub_page * page)1395*4882a593Smuzhiyun static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1396*4882a593Smuzhiyun 					struct bio *bio,
1397*4882a593Smuzhiyun 					struct scrub_page *page)
1398*4882a593Smuzhiyun {
1399*4882a593Smuzhiyun 	DECLARE_COMPLETION_ONSTACK(done);
1400*4882a593Smuzhiyun 	int ret;
1401*4882a593Smuzhiyun 	int mirror_num;
1402*4882a593Smuzhiyun 
1403*4882a593Smuzhiyun 	bio->bi_iter.bi_sector = page->logical >> 9;
1404*4882a593Smuzhiyun 	bio->bi_private = &done;
1405*4882a593Smuzhiyun 	bio->bi_end_io = scrub_bio_wait_endio;
1406*4882a593Smuzhiyun 
1407*4882a593Smuzhiyun 	mirror_num = page->sblock->pagev[0]->mirror_num;
1408*4882a593Smuzhiyun 	ret = raid56_parity_recover(fs_info, bio, page->recover->bbio,
1409*4882a593Smuzhiyun 				    page->recover->map_length,
1410*4882a593Smuzhiyun 				    mirror_num, 0);
1411*4882a593Smuzhiyun 	if (ret)
1412*4882a593Smuzhiyun 		return ret;
1413*4882a593Smuzhiyun 
1414*4882a593Smuzhiyun 	wait_for_completion_io(&done);
1415*4882a593Smuzhiyun 	return blk_status_to_errno(bio->bi_status);
1416*4882a593Smuzhiyun }
1417*4882a593Smuzhiyun 
scrub_recheck_block_on_raid56(struct btrfs_fs_info * fs_info,struct scrub_block * sblock)1418*4882a593Smuzhiyun static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
1419*4882a593Smuzhiyun 					  struct scrub_block *sblock)
1420*4882a593Smuzhiyun {
1421*4882a593Smuzhiyun 	struct scrub_page *first_page = sblock->pagev[0];
1422*4882a593Smuzhiyun 	struct bio *bio;
1423*4882a593Smuzhiyun 	int page_num;
1424*4882a593Smuzhiyun 
1425*4882a593Smuzhiyun 	/* All pages in sblock belong to the same stripe on the same device. */
1426*4882a593Smuzhiyun 	ASSERT(first_page->dev);
1427*4882a593Smuzhiyun 	if (!first_page->dev->bdev)
1428*4882a593Smuzhiyun 		goto out;
1429*4882a593Smuzhiyun 
1430*4882a593Smuzhiyun 	bio = btrfs_io_bio_alloc(BIO_MAX_PAGES);
1431*4882a593Smuzhiyun 	bio_set_dev(bio, first_page->dev->bdev);
1432*4882a593Smuzhiyun 
1433*4882a593Smuzhiyun 	for (page_num = 0; page_num < sblock->page_count; page_num++) {
1434*4882a593Smuzhiyun 		struct scrub_page *page = sblock->pagev[page_num];
1435*4882a593Smuzhiyun 
1436*4882a593Smuzhiyun 		WARN_ON(!page->page);
1437*4882a593Smuzhiyun 		bio_add_page(bio, page->page, PAGE_SIZE, 0);
1438*4882a593Smuzhiyun 	}
1439*4882a593Smuzhiyun 
1440*4882a593Smuzhiyun 	if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) {
1441*4882a593Smuzhiyun 		bio_put(bio);
1442*4882a593Smuzhiyun 		goto out;
1443*4882a593Smuzhiyun 	}
1444*4882a593Smuzhiyun 
1445*4882a593Smuzhiyun 	bio_put(bio);
1446*4882a593Smuzhiyun 
1447*4882a593Smuzhiyun 	scrub_recheck_block_checksum(sblock);
1448*4882a593Smuzhiyun 
1449*4882a593Smuzhiyun 	return;
1450*4882a593Smuzhiyun out:
1451*4882a593Smuzhiyun 	for (page_num = 0; page_num < sblock->page_count; page_num++)
1452*4882a593Smuzhiyun 		sblock->pagev[page_num]->io_error = 1;
1453*4882a593Smuzhiyun 
1454*4882a593Smuzhiyun 	sblock->no_io_error_seen = 0;
1455*4882a593Smuzhiyun }
1456*4882a593Smuzhiyun 
1457*4882a593Smuzhiyun /*
1458*4882a593Smuzhiyun  * this function will check the on disk data for checksum errors, header
1459*4882a593Smuzhiyun  * errors and read I/O errors. If any I/O errors happen, the exact pages
1460*4882a593Smuzhiyun  * which are errored are marked as being bad. The goal is to enable scrub
1461*4882a593Smuzhiyun  * to take those pages that are not errored from all the mirrors so that
1462*4882a593Smuzhiyun  * the pages that are errored in the just handled mirror can be repaired.
1463*4882a593Smuzhiyun  */
scrub_recheck_block(struct btrfs_fs_info * fs_info,struct scrub_block * sblock,int retry_failed_mirror)1464*4882a593Smuzhiyun static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1465*4882a593Smuzhiyun 				struct scrub_block *sblock,
1466*4882a593Smuzhiyun 				int retry_failed_mirror)
1467*4882a593Smuzhiyun {
1468*4882a593Smuzhiyun 	int page_num;
1469*4882a593Smuzhiyun 
1470*4882a593Smuzhiyun 	sblock->no_io_error_seen = 1;
1471*4882a593Smuzhiyun 
1472*4882a593Smuzhiyun 	/* short cut for raid56 */
1473*4882a593Smuzhiyun 	if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->pagev[0]))
1474*4882a593Smuzhiyun 		return scrub_recheck_block_on_raid56(fs_info, sblock);
1475*4882a593Smuzhiyun 
1476*4882a593Smuzhiyun 	for (page_num = 0; page_num < sblock->page_count; page_num++) {
1477*4882a593Smuzhiyun 		struct bio *bio;
1478*4882a593Smuzhiyun 		struct scrub_page *page = sblock->pagev[page_num];
1479*4882a593Smuzhiyun 
1480*4882a593Smuzhiyun 		if (page->dev->bdev == NULL) {
1481*4882a593Smuzhiyun 			page->io_error = 1;
1482*4882a593Smuzhiyun 			sblock->no_io_error_seen = 0;
1483*4882a593Smuzhiyun 			continue;
1484*4882a593Smuzhiyun 		}
1485*4882a593Smuzhiyun 
1486*4882a593Smuzhiyun 		WARN_ON(!page->page);
1487*4882a593Smuzhiyun 		bio = btrfs_io_bio_alloc(1);
1488*4882a593Smuzhiyun 		bio_set_dev(bio, page->dev->bdev);
1489*4882a593Smuzhiyun 
1490*4882a593Smuzhiyun 		bio_add_page(bio, page->page, PAGE_SIZE, 0);
1491*4882a593Smuzhiyun 		bio->bi_iter.bi_sector = page->physical >> 9;
1492*4882a593Smuzhiyun 		bio->bi_opf = REQ_OP_READ;
1493*4882a593Smuzhiyun 
1494*4882a593Smuzhiyun 		if (btrfsic_submit_bio_wait(bio)) {
1495*4882a593Smuzhiyun 			page->io_error = 1;
1496*4882a593Smuzhiyun 			sblock->no_io_error_seen = 0;
1497*4882a593Smuzhiyun 		}
1498*4882a593Smuzhiyun 
1499*4882a593Smuzhiyun 		bio_put(bio);
1500*4882a593Smuzhiyun 	}
1501*4882a593Smuzhiyun 
1502*4882a593Smuzhiyun 	if (sblock->no_io_error_seen)
1503*4882a593Smuzhiyun 		scrub_recheck_block_checksum(sblock);
1504*4882a593Smuzhiyun }
1505*4882a593Smuzhiyun 
scrub_check_fsid(u8 fsid[],struct scrub_page * spage)1506*4882a593Smuzhiyun static inline int scrub_check_fsid(u8 fsid[],
1507*4882a593Smuzhiyun 				   struct scrub_page *spage)
1508*4882a593Smuzhiyun {
1509*4882a593Smuzhiyun 	struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
1510*4882a593Smuzhiyun 	int ret;
1511*4882a593Smuzhiyun 
1512*4882a593Smuzhiyun 	ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1513*4882a593Smuzhiyun 	return !ret;
1514*4882a593Smuzhiyun }
1515*4882a593Smuzhiyun 
scrub_recheck_block_checksum(struct scrub_block * sblock)1516*4882a593Smuzhiyun static void scrub_recheck_block_checksum(struct scrub_block *sblock)
1517*4882a593Smuzhiyun {
1518*4882a593Smuzhiyun 	sblock->header_error = 0;
1519*4882a593Smuzhiyun 	sblock->checksum_error = 0;
1520*4882a593Smuzhiyun 	sblock->generation_error = 0;
1521*4882a593Smuzhiyun 
1522*4882a593Smuzhiyun 	if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA)
1523*4882a593Smuzhiyun 		scrub_checksum_data(sblock);
1524*4882a593Smuzhiyun 	else
1525*4882a593Smuzhiyun 		scrub_checksum_tree_block(sblock);
1526*4882a593Smuzhiyun }
1527*4882a593Smuzhiyun 
scrub_repair_block_from_good_copy(struct scrub_block * sblock_bad,struct scrub_block * sblock_good)1528*4882a593Smuzhiyun static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1529*4882a593Smuzhiyun 					     struct scrub_block *sblock_good)
1530*4882a593Smuzhiyun {
1531*4882a593Smuzhiyun 	int page_num;
1532*4882a593Smuzhiyun 	int ret = 0;
1533*4882a593Smuzhiyun 
1534*4882a593Smuzhiyun 	for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1535*4882a593Smuzhiyun 		int ret_sub;
1536*4882a593Smuzhiyun 
1537*4882a593Smuzhiyun 		ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1538*4882a593Smuzhiyun 							   sblock_good,
1539*4882a593Smuzhiyun 							   page_num, 1);
1540*4882a593Smuzhiyun 		if (ret_sub)
1541*4882a593Smuzhiyun 			ret = ret_sub;
1542*4882a593Smuzhiyun 	}
1543*4882a593Smuzhiyun 
1544*4882a593Smuzhiyun 	return ret;
1545*4882a593Smuzhiyun }
1546*4882a593Smuzhiyun 
scrub_repair_page_from_good_copy(struct scrub_block * sblock_bad,struct scrub_block * sblock_good,int page_num,int force_write)1547*4882a593Smuzhiyun static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1548*4882a593Smuzhiyun 					    struct scrub_block *sblock_good,
1549*4882a593Smuzhiyun 					    int page_num, int force_write)
1550*4882a593Smuzhiyun {
1551*4882a593Smuzhiyun 	struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1552*4882a593Smuzhiyun 	struct scrub_page *page_good = sblock_good->pagev[page_num];
1553*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
1554*4882a593Smuzhiyun 
1555*4882a593Smuzhiyun 	BUG_ON(page_bad->page == NULL);
1556*4882a593Smuzhiyun 	BUG_ON(page_good->page == NULL);
1557*4882a593Smuzhiyun 	if (force_write || sblock_bad->header_error ||
1558*4882a593Smuzhiyun 	    sblock_bad->checksum_error || page_bad->io_error) {
1559*4882a593Smuzhiyun 		struct bio *bio;
1560*4882a593Smuzhiyun 		int ret;
1561*4882a593Smuzhiyun 
1562*4882a593Smuzhiyun 		if (!page_bad->dev->bdev) {
1563*4882a593Smuzhiyun 			btrfs_warn_rl(fs_info,
1564*4882a593Smuzhiyun 				"scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
1565*4882a593Smuzhiyun 			return -EIO;
1566*4882a593Smuzhiyun 		}
1567*4882a593Smuzhiyun 
1568*4882a593Smuzhiyun 		bio = btrfs_io_bio_alloc(1);
1569*4882a593Smuzhiyun 		bio_set_dev(bio, page_bad->dev->bdev);
1570*4882a593Smuzhiyun 		bio->bi_iter.bi_sector = page_bad->physical >> 9;
1571*4882a593Smuzhiyun 		bio->bi_opf = REQ_OP_WRITE;
1572*4882a593Smuzhiyun 
1573*4882a593Smuzhiyun 		ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
1574*4882a593Smuzhiyun 		if (PAGE_SIZE != ret) {
1575*4882a593Smuzhiyun 			bio_put(bio);
1576*4882a593Smuzhiyun 			return -EIO;
1577*4882a593Smuzhiyun 		}
1578*4882a593Smuzhiyun 
1579*4882a593Smuzhiyun 		if (btrfsic_submit_bio_wait(bio)) {
1580*4882a593Smuzhiyun 			btrfs_dev_stat_inc_and_print(page_bad->dev,
1581*4882a593Smuzhiyun 				BTRFS_DEV_STAT_WRITE_ERRS);
1582*4882a593Smuzhiyun 			atomic64_inc(&fs_info->dev_replace.num_write_errors);
1583*4882a593Smuzhiyun 			bio_put(bio);
1584*4882a593Smuzhiyun 			return -EIO;
1585*4882a593Smuzhiyun 		}
1586*4882a593Smuzhiyun 		bio_put(bio);
1587*4882a593Smuzhiyun 	}
1588*4882a593Smuzhiyun 
1589*4882a593Smuzhiyun 	return 0;
1590*4882a593Smuzhiyun }
1591*4882a593Smuzhiyun 
scrub_write_block_to_dev_replace(struct scrub_block * sblock)1592*4882a593Smuzhiyun static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1593*4882a593Smuzhiyun {
1594*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
1595*4882a593Smuzhiyun 	int page_num;
1596*4882a593Smuzhiyun 
1597*4882a593Smuzhiyun 	/*
1598*4882a593Smuzhiyun 	 * This block is used for the check of the parity on the source device,
1599*4882a593Smuzhiyun 	 * so the data needn't be written into the destination device.
1600*4882a593Smuzhiyun 	 */
1601*4882a593Smuzhiyun 	if (sblock->sparity)
1602*4882a593Smuzhiyun 		return;
1603*4882a593Smuzhiyun 
1604*4882a593Smuzhiyun 	for (page_num = 0; page_num < sblock->page_count; page_num++) {
1605*4882a593Smuzhiyun 		int ret;
1606*4882a593Smuzhiyun 
1607*4882a593Smuzhiyun 		ret = scrub_write_page_to_dev_replace(sblock, page_num);
1608*4882a593Smuzhiyun 		if (ret)
1609*4882a593Smuzhiyun 			atomic64_inc(&fs_info->dev_replace.num_write_errors);
1610*4882a593Smuzhiyun 	}
1611*4882a593Smuzhiyun }
1612*4882a593Smuzhiyun 
scrub_write_page_to_dev_replace(struct scrub_block * sblock,int page_num)1613*4882a593Smuzhiyun static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1614*4882a593Smuzhiyun 					   int page_num)
1615*4882a593Smuzhiyun {
1616*4882a593Smuzhiyun 	struct scrub_page *spage = sblock->pagev[page_num];
1617*4882a593Smuzhiyun 
1618*4882a593Smuzhiyun 	BUG_ON(spage->page == NULL);
1619*4882a593Smuzhiyun 	if (spage->io_error)
1620*4882a593Smuzhiyun 		clear_page(page_address(spage->page));
1621*4882a593Smuzhiyun 
1622*4882a593Smuzhiyun 	return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1623*4882a593Smuzhiyun }
1624*4882a593Smuzhiyun 
scrub_add_page_to_wr_bio(struct scrub_ctx * sctx,struct scrub_page * spage)1625*4882a593Smuzhiyun static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1626*4882a593Smuzhiyun 				    struct scrub_page *spage)
1627*4882a593Smuzhiyun {
1628*4882a593Smuzhiyun 	struct scrub_bio *sbio;
1629*4882a593Smuzhiyun 	int ret;
1630*4882a593Smuzhiyun 
1631*4882a593Smuzhiyun 	mutex_lock(&sctx->wr_lock);
1632*4882a593Smuzhiyun again:
1633*4882a593Smuzhiyun 	if (!sctx->wr_curr_bio) {
1634*4882a593Smuzhiyun 		sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
1635*4882a593Smuzhiyun 					      GFP_KERNEL);
1636*4882a593Smuzhiyun 		if (!sctx->wr_curr_bio) {
1637*4882a593Smuzhiyun 			mutex_unlock(&sctx->wr_lock);
1638*4882a593Smuzhiyun 			return -ENOMEM;
1639*4882a593Smuzhiyun 		}
1640*4882a593Smuzhiyun 		sctx->wr_curr_bio->sctx = sctx;
1641*4882a593Smuzhiyun 		sctx->wr_curr_bio->page_count = 0;
1642*4882a593Smuzhiyun 	}
1643*4882a593Smuzhiyun 	sbio = sctx->wr_curr_bio;
1644*4882a593Smuzhiyun 	if (sbio->page_count == 0) {
1645*4882a593Smuzhiyun 		struct bio *bio;
1646*4882a593Smuzhiyun 
1647*4882a593Smuzhiyun 		sbio->physical = spage->physical_for_dev_replace;
1648*4882a593Smuzhiyun 		sbio->logical = spage->logical;
1649*4882a593Smuzhiyun 		sbio->dev = sctx->wr_tgtdev;
1650*4882a593Smuzhiyun 		bio = sbio->bio;
1651*4882a593Smuzhiyun 		if (!bio) {
1652*4882a593Smuzhiyun 			bio = btrfs_io_bio_alloc(sctx->pages_per_wr_bio);
1653*4882a593Smuzhiyun 			sbio->bio = bio;
1654*4882a593Smuzhiyun 		}
1655*4882a593Smuzhiyun 
1656*4882a593Smuzhiyun 		bio->bi_private = sbio;
1657*4882a593Smuzhiyun 		bio->bi_end_io = scrub_wr_bio_end_io;
1658*4882a593Smuzhiyun 		bio_set_dev(bio, sbio->dev->bdev);
1659*4882a593Smuzhiyun 		bio->bi_iter.bi_sector = sbio->physical >> 9;
1660*4882a593Smuzhiyun 		bio->bi_opf = REQ_OP_WRITE;
1661*4882a593Smuzhiyun 		sbio->status = 0;
1662*4882a593Smuzhiyun 	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1663*4882a593Smuzhiyun 		   spage->physical_for_dev_replace ||
1664*4882a593Smuzhiyun 		   sbio->logical + sbio->page_count * PAGE_SIZE !=
1665*4882a593Smuzhiyun 		   spage->logical) {
1666*4882a593Smuzhiyun 		scrub_wr_submit(sctx);
1667*4882a593Smuzhiyun 		goto again;
1668*4882a593Smuzhiyun 	}
1669*4882a593Smuzhiyun 
1670*4882a593Smuzhiyun 	ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1671*4882a593Smuzhiyun 	if (ret != PAGE_SIZE) {
1672*4882a593Smuzhiyun 		if (sbio->page_count < 1) {
1673*4882a593Smuzhiyun 			bio_put(sbio->bio);
1674*4882a593Smuzhiyun 			sbio->bio = NULL;
1675*4882a593Smuzhiyun 			mutex_unlock(&sctx->wr_lock);
1676*4882a593Smuzhiyun 			return -EIO;
1677*4882a593Smuzhiyun 		}
1678*4882a593Smuzhiyun 		scrub_wr_submit(sctx);
1679*4882a593Smuzhiyun 		goto again;
1680*4882a593Smuzhiyun 	}
1681*4882a593Smuzhiyun 
1682*4882a593Smuzhiyun 	sbio->pagev[sbio->page_count] = spage;
1683*4882a593Smuzhiyun 	scrub_page_get(spage);
1684*4882a593Smuzhiyun 	sbio->page_count++;
1685*4882a593Smuzhiyun 	if (sbio->page_count == sctx->pages_per_wr_bio)
1686*4882a593Smuzhiyun 		scrub_wr_submit(sctx);
1687*4882a593Smuzhiyun 	mutex_unlock(&sctx->wr_lock);
1688*4882a593Smuzhiyun 
1689*4882a593Smuzhiyun 	return 0;
1690*4882a593Smuzhiyun }
1691*4882a593Smuzhiyun 
scrub_wr_submit(struct scrub_ctx * sctx)1692*4882a593Smuzhiyun static void scrub_wr_submit(struct scrub_ctx *sctx)
1693*4882a593Smuzhiyun {
1694*4882a593Smuzhiyun 	struct scrub_bio *sbio;
1695*4882a593Smuzhiyun 
1696*4882a593Smuzhiyun 	if (!sctx->wr_curr_bio)
1697*4882a593Smuzhiyun 		return;
1698*4882a593Smuzhiyun 
1699*4882a593Smuzhiyun 	sbio = sctx->wr_curr_bio;
1700*4882a593Smuzhiyun 	sctx->wr_curr_bio = NULL;
1701*4882a593Smuzhiyun 	WARN_ON(!sbio->bio->bi_disk);
1702*4882a593Smuzhiyun 	scrub_pending_bio_inc(sctx);
1703*4882a593Smuzhiyun 	/* process all writes in a single worker thread. Then the block layer
1704*4882a593Smuzhiyun 	 * orders the requests before sending them to the driver which
1705*4882a593Smuzhiyun 	 * doubled the write performance on spinning disks when measured
1706*4882a593Smuzhiyun 	 * with Linux 3.5 */
1707*4882a593Smuzhiyun 	btrfsic_submit_bio(sbio->bio);
1708*4882a593Smuzhiyun }
1709*4882a593Smuzhiyun 
scrub_wr_bio_end_io(struct bio * bio)1710*4882a593Smuzhiyun static void scrub_wr_bio_end_io(struct bio *bio)
1711*4882a593Smuzhiyun {
1712*4882a593Smuzhiyun 	struct scrub_bio *sbio = bio->bi_private;
1713*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
1714*4882a593Smuzhiyun 
1715*4882a593Smuzhiyun 	sbio->status = bio->bi_status;
1716*4882a593Smuzhiyun 	sbio->bio = bio;
1717*4882a593Smuzhiyun 
1718*4882a593Smuzhiyun 	btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);
1719*4882a593Smuzhiyun 	btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1720*4882a593Smuzhiyun }
1721*4882a593Smuzhiyun 
scrub_wr_bio_end_io_worker(struct btrfs_work * work)1722*4882a593Smuzhiyun static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
1723*4882a593Smuzhiyun {
1724*4882a593Smuzhiyun 	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1725*4882a593Smuzhiyun 	struct scrub_ctx *sctx = sbio->sctx;
1726*4882a593Smuzhiyun 	int i;
1727*4882a593Smuzhiyun 
1728*4882a593Smuzhiyun 	WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
1729*4882a593Smuzhiyun 	if (sbio->status) {
1730*4882a593Smuzhiyun 		struct btrfs_dev_replace *dev_replace =
1731*4882a593Smuzhiyun 			&sbio->sctx->fs_info->dev_replace;
1732*4882a593Smuzhiyun 
1733*4882a593Smuzhiyun 		for (i = 0; i < sbio->page_count; i++) {
1734*4882a593Smuzhiyun 			struct scrub_page *spage = sbio->pagev[i];
1735*4882a593Smuzhiyun 
1736*4882a593Smuzhiyun 			spage->io_error = 1;
1737*4882a593Smuzhiyun 			atomic64_inc(&dev_replace->num_write_errors);
1738*4882a593Smuzhiyun 		}
1739*4882a593Smuzhiyun 	}
1740*4882a593Smuzhiyun 
1741*4882a593Smuzhiyun 	for (i = 0; i < sbio->page_count; i++)
1742*4882a593Smuzhiyun 		scrub_page_put(sbio->pagev[i]);
1743*4882a593Smuzhiyun 
1744*4882a593Smuzhiyun 	bio_put(sbio->bio);
1745*4882a593Smuzhiyun 	kfree(sbio);
1746*4882a593Smuzhiyun 	scrub_pending_bio_dec(sctx);
1747*4882a593Smuzhiyun }
1748*4882a593Smuzhiyun 
scrub_checksum(struct scrub_block * sblock)1749*4882a593Smuzhiyun static int scrub_checksum(struct scrub_block *sblock)
1750*4882a593Smuzhiyun {
1751*4882a593Smuzhiyun 	u64 flags;
1752*4882a593Smuzhiyun 	int ret;
1753*4882a593Smuzhiyun 
1754*4882a593Smuzhiyun 	/*
1755*4882a593Smuzhiyun 	 * No need to initialize these stats currently,
1756*4882a593Smuzhiyun 	 * because this function only use return value
1757*4882a593Smuzhiyun 	 * instead of these stats value.
1758*4882a593Smuzhiyun 	 *
1759*4882a593Smuzhiyun 	 * Todo:
1760*4882a593Smuzhiyun 	 * always use stats
1761*4882a593Smuzhiyun 	 */
1762*4882a593Smuzhiyun 	sblock->header_error = 0;
1763*4882a593Smuzhiyun 	sblock->generation_error = 0;
1764*4882a593Smuzhiyun 	sblock->checksum_error = 0;
1765*4882a593Smuzhiyun 
1766*4882a593Smuzhiyun 	WARN_ON(sblock->page_count < 1);
1767*4882a593Smuzhiyun 	flags = sblock->pagev[0]->flags;
1768*4882a593Smuzhiyun 	ret = 0;
1769*4882a593Smuzhiyun 	if (flags & BTRFS_EXTENT_FLAG_DATA)
1770*4882a593Smuzhiyun 		ret = scrub_checksum_data(sblock);
1771*4882a593Smuzhiyun 	else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1772*4882a593Smuzhiyun 		ret = scrub_checksum_tree_block(sblock);
1773*4882a593Smuzhiyun 	else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1774*4882a593Smuzhiyun 		(void)scrub_checksum_super(sblock);
1775*4882a593Smuzhiyun 	else
1776*4882a593Smuzhiyun 		WARN_ON(1);
1777*4882a593Smuzhiyun 	if (ret)
1778*4882a593Smuzhiyun 		scrub_handle_errored_block(sblock);
1779*4882a593Smuzhiyun 
1780*4882a593Smuzhiyun 	return ret;
1781*4882a593Smuzhiyun }
1782*4882a593Smuzhiyun 
scrub_checksum_data(struct scrub_block * sblock)1783*4882a593Smuzhiyun static int scrub_checksum_data(struct scrub_block *sblock)
1784*4882a593Smuzhiyun {
1785*4882a593Smuzhiyun 	struct scrub_ctx *sctx = sblock->sctx;
1786*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = sctx->fs_info;
1787*4882a593Smuzhiyun 	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1788*4882a593Smuzhiyun 	u8 csum[BTRFS_CSUM_SIZE];
1789*4882a593Smuzhiyun 	struct scrub_page *spage;
1790*4882a593Smuzhiyun 	char *kaddr;
1791*4882a593Smuzhiyun 
1792*4882a593Smuzhiyun 	BUG_ON(sblock->page_count < 1);
1793*4882a593Smuzhiyun 	spage = sblock->pagev[0];
1794*4882a593Smuzhiyun 	if (!spage->have_csum)
1795*4882a593Smuzhiyun 		return 0;
1796*4882a593Smuzhiyun 
1797*4882a593Smuzhiyun 	kaddr = page_address(spage->page);
1798*4882a593Smuzhiyun 
1799*4882a593Smuzhiyun 	shash->tfm = fs_info->csum_shash;
1800*4882a593Smuzhiyun 	crypto_shash_init(shash);
1801*4882a593Smuzhiyun 	crypto_shash_digest(shash, kaddr, PAGE_SIZE, csum);
1802*4882a593Smuzhiyun 
1803*4882a593Smuzhiyun 	if (memcmp(csum, spage->csum, sctx->csum_size))
1804*4882a593Smuzhiyun 		sblock->checksum_error = 1;
1805*4882a593Smuzhiyun 
1806*4882a593Smuzhiyun 	return sblock->checksum_error;
1807*4882a593Smuzhiyun }
1808*4882a593Smuzhiyun 
scrub_checksum_tree_block(struct scrub_block * sblock)1809*4882a593Smuzhiyun static int scrub_checksum_tree_block(struct scrub_block *sblock)
1810*4882a593Smuzhiyun {
1811*4882a593Smuzhiyun 	struct scrub_ctx *sctx = sblock->sctx;
1812*4882a593Smuzhiyun 	struct btrfs_header *h;
1813*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = sctx->fs_info;
1814*4882a593Smuzhiyun 	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1815*4882a593Smuzhiyun 	u8 calculated_csum[BTRFS_CSUM_SIZE];
1816*4882a593Smuzhiyun 	u8 on_disk_csum[BTRFS_CSUM_SIZE];
1817*4882a593Smuzhiyun 	const int num_pages = sctx->fs_info->nodesize >> PAGE_SHIFT;
1818*4882a593Smuzhiyun 	int i;
1819*4882a593Smuzhiyun 	struct scrub_page *spage;
1820*4882a593Smuzhiyun 	char *kaddr;
1821*4882a593Smuzhiyun 
1822*4882a593Smuzhiyun 	BUG_ON(sblock->page_count < 1);
1823*4882a593Smuzhiyun 	spage = sblock->pagev[0];
1824*4882a593Smuzhiyun 	kaddr = page_address(spage->page);
1825*4882a593Smuzhiyun 	h = (struct btrfs_header *)kaddr;
1826*4882a593Smuzhiyun 	memcpy(on_disk_csum, h->csum, sctx->csum_size);
1827*4882a593Smuzhiyun 
1828*4882a593Smuzhiyun 	/*
1829*4882a593Smuzhiyun 	 * we don't use the getter functions here, as we
1830*4882a593Smuzhiyun 	 * a) don't have an extent buffer and
1831*4882a593Smuzhiyun 	 * b) the page is already kmapped
1832*4882a593Smuzhiyun 	 */
1833*4882a593Smuzhiyun 	if (spage->logical != btrfs_stack_header_bytenr(h))
1834*4882a593Smuzhiyun 		sblock->header_error = 1;
1835*4882a593Smuzhiyun 
1836*4882a593Smuzhiyun 	if (spage->generation != btrfs_stack_header_generation(h)) {
1837*4882a593Smuzhiyun 		sblock->header_error = 1;
1838*4882a593Smuzhiyun 		sblock->generation_error = 1;
1839*4882a593Smuzhiyun 	}
1840*4882a593Smuzhiyun 
1841*4882a593Smuzhiyun 	if (!scrub_check_fsid(h->fsid, spage))
1842*4882a593Smuzhiyun 		sblock->header_error = 1;
1843*4882a593Smuzhiyun 
1844*4882a593Smuzhiyun 	if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1845*4882a593Smuzhiyun 		   BTRFS_UUID_SIZE))
1846*4882a593Smuzhiyun 		sblock->header_error = 1;
1847*4882a593Smuzhiyun 
1848*4882a593Smuzhiyun 	shash->tfm = fs_info->csum_shash;
1849*4882a593Smuzhiyun 	crypto_shash_init(shash);
1850*4882a593Smuzhiyun 	crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
1851*4882a593Smuzhiyun 			    PAGE_SIZE - BTRFS_CSUM_SIZE);
1852*4882a593Smuzhiyun 
1853*4882a593Smuzhiyun 	for (i = 1; i < num_pages; i++) {
1854*4882a593Smuzhiyun 		kaddr = page_address(sblock->pagev[i]->page);
1855*4882a593Smuzhiyun 		crypto_shash_update(shash, kaddr, PAGE_SIZE);
1856*4882a593Smuzhiyun 	}
1857*4882a593Smuzhiyun 
1858*4882a593Smuzhiyun 	crypto_shash_final(shash, calculated_csum);
1859*4882a593Smuzhiyun 	if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1860*4882a593Smuzhiyun 		sblock->checksum_error = 1;
1861*4882a593Smuzhiyun 
1862*4882a593Smuzhiyun 	return sblock->header_error || sblock->checksum_error;
1863*4882a593Smuzhiyun }
1864*4882a593Smuzhiyun 
scrub_checksum_super(struct scrub_block * sblock)1865*4882a593Smuzhiyun static int scrub_checksum_super(struct scrub_block *sblock)
1866*4882a593Smuzhiyun {
1867*4882a593Smuzhiyun 	struct btrfs_super_block *s;
1868*4882a593Smuzhiyun 	struct scrub_ctx *sctx = sblock->sctx;
1869*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = sctx->fs_info;
1870*4882a593Smuzhiyun 	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1871*4882a593Smuzhiyun 	u8 calculated_csum[BTRFS_CSUM_SIZE];
1872*4882a593Smuzhiyun 	struct scrub_page *spage;
1873*4882a593Smuzhiyun 	char *kaddr;
1874*4882a593Smuzhiyun 	int fail_gen = 0;
1875*4882a593Smuzhiyun 	int fail_cor = 0;
1876*4882a593Smuzhiyun 
1877*4882a593Smuzhiyun 	BUG_ON(sblock->page_count < 1);
1878*4882a593Smuzhiyun 	spage = sblock->pagev[0];
1879*4882a593Smuzhiyun 	kaddr = page_address(spage->page);
1880*4882a593Smuzhiyun 	s = (struct btrfs_super_block *)kaddr;
1881*4882a593Smuzhiyun 
1882*4882a593Smuzhiyun 	if (spage->logical != btrfs_super_bytenr(s))
1883*4882a593Smuzhiyun 		++fail_cor;
1884*4882a593Smuzhiyun 
1885*4882a593Smuzhiyun 	if (spage->generation != btrfs_super_generation(s))
1886*4882a593Smuzhiyun 		++fail_gen;
1887*4882a593Smuzhiyun 
1888*4882a593Smuzhiyun 	if (!scrub_check_fsid(s->fsid, spage))
1889*4882a593Smuzhiyun 		++fail_cor;
1890*4882a593Smuzhiyun 
1891*4882a593Smuzhiyun 	shash->tfm = fs_info->csum_shash;
1892*4882a593Smuzhiyun 	crypto_shash_init(shash);
1893*4882a593Smuzhiyun 	crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
1894*4882a593Smuzhiyun 			BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
1895*4882a593Smuzhiyun 
1896*4882a593Smuzhiyun 	if (memcmp(calculated_csum, s->csum, sctx->csum_size))
1897*4882a593Smuzhiyun 		++fail_cor;
1898*4882a593Smuzhiyun 
1899*4882a593Smuzhiyun 	if (fail_cor + fail_gen) {
1900*4882a593Smuzhiyun 		/*
1901*4882a593Smuzhiyun 		 * if we find an error in a super block, we just report it.
1902*4882a593Smuzhiyun 		 * They will get written with the next transaction commit
1903*4882a593Smuzhiyun 		 * anyway
1904*4882a593Smuzhiyun 		 */
1905*4882a593Smuzhiyun 		spin_lock(&sctx->stat_lock);
1906*4882a593Smuzhiyun 		++sctx->stat.super_errors;
1907*4882a593Smuzhiyun 		spin_unlock(&sctx->stat_lock);
1908*4882a593Smuzhiyun 		if (fail_cor)
1909*4882a593Smuzhiyun 			btrfs_dev_stat_inc_and_print(spage->dev,
1910*4882a593Smuzhiyun 				BTRFS_DEV_STAT_CORRUPTION_ERRS);
1911*4882a593Smuzhiyun 		else
1912*4882a593Smuzhiyun 			btrfs_dev_stat_inc_and_print(spage->dev,
1913*4882a593Smuzhiyun 				BTRFS_DEV_STAT_GENERATION_ERRS);
1914*4882a593Smuzhiyun 	}
1915*4882a593Smuzhiyun 
1916*4882a593Smuzhiyun 	return fail_cor + fail_gen;
1917*4882a593Smuzhiyun }
1918*4882a593Smuzhiyun 
scrub_block_get(struct scrub_block * sblock)1919*4882a593Smuzhiyun static void scrub_block_get(struct scrub_block *sblock)
1920*4882a593Smuzhiyun {
1921*4882a593Smuzhiyun 	refcount_inc(&sblock->refs);
1922*4882a593Smuzhiyun }
1923*4882a593Smuzhiyun 
scrub_block_put(struct scrub_block * sblock)1924*4882a593Smuzhiyun static void scrub_block_put(struct scrub_block *sblock)
1925*4882a593Smuzhiyun {
1926*4882a593Smuzhiyun 	if (refcount_dec_and_test(&sblock->refs)) {
1927*4882a593Smuzhiyun 		int i;
1928*4882a593Smuzhiyun 
1929*4882a593Smuzhiyun 		if (sblock->sparity)
1930*4882a593Smuzhiyun 			scrub_parity_put(sblock->sparity);
1931*4882a593Smuzhiyun 
1932*4882a593Smuzhiyun 		for (i = 0; i < sblock->page_count; i++)
1933*4882a593Smuzhiyun 			scrub_page_put(sblock->pagev[i]);
1934*4882a593Smuzhiyun 		kfree(sblock);
1935*4882a593Smuzhiyun 	}
1936*4882a593Smuzhiyun }
1937*4882a593Smuzhiyun 
scrub_page_get(struct scrub_page * spage)1938*4882a593Smuzhiyun static void scrub_page_get(struct scrub_page *spage)
1939*4882a593Smuzhiyun {
1940*4882a593Smuzhiyun 	atomic_inc(&spage->refs);
1941*4882a593Smuzhiyun }
1942*4882a593Smuzhiyun 
scrub_page_put(struct scrub_page * spage)1943*4882a593Smuzhiyun static void scrub_page_put(struct scrub_page *spage)
1944*4882a593Smuzhiyun {
1945*4882a593Smuzhiyun 	if (atomic_dec_and_test(&spage->refs)) {
1946*4882a593Smuzhiyun 		if (spage->page)
1947*4882a593Smuzhiyun 			__free_page(spage->page);
1948*4882a593Smuzhiyun 		kfree(spage);
1949*4882a593Smuzhiyun 	}
1950*4882a593Smuzhiyun }
1951*4882a593Smuzhiyun 
scrub_submit(struct scrub_ctx * sctx)1952*4882a593Smuzhiyun static void scrub_submit(struct scrub_ctx *sctx)
1953*4882a593Smuzhiyun {
1954*4882a593Smuzhiyun 	struct scrub_bio *sbio;
1955*4882a593Smuzhiyun 
1956*4882a593Smuzhiyun 	if (sctx->curr == -1)
1957*4882a593Smuzhiyun 		return;
1958*4882a593Smuzhiyun 
1959*4882a593Smuzhiyun 	sbio = sctx->bios[sctx->curr];
1960*4882a593Smuzhiyun 	sctx->curr = -1;
1961*4882a593Smuzhiyun 	scrub_pending_bio_inc(sctx);
1962*4882a593Smuzhiyun 	btrfsic_submit_bio(sbio->bio);
1963*4882a593Smuzhiyun }
1964*4882a593Smuzhiyun 
scrub_add_page_to_rd_bio(struct scrub_ctx * sctx,struct scrub_page * spage)1965*4882a593Smuzhiyun static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
1966*4882a593Smuzhiyun 				    struct scrub_page *spage)
1967*4882a593Smuzhiyun {
1968*4882a593Smuzhiyun 	struct scrub_block *sblock = spage->sblock;
1969*4882a593Smuzhiyun 	struct scrub_bio *sbio;
1970*4882a593Smuzhiyun 	int ret;
1971*4882a593Smuzhiyun 
1972*4882a593Smuzhiyun again:
1973*4882a593Smuzhiyun 	/*
1974*4882a593Smuzhiyun 	 * grab a fresh bio or wait for one to become available
1975*4882a593Smuzhiyun 	 */
1976*4882a593Smuzhiyun 	while (sctx->curr == -1) {
1977*4882a593Smuzhiyun 		spin_lock(&sctx->list_lock);
1978*4882a593Smuzhiyun 		sctx->curr = sctx->first_free;
1979*4882a593Smuzhiyun 		if (sctx->curr != -1) {
1980*4882a593Smuzhiyun 			sctx->first_free = sctx->bios[sctx->curr]->next_free;
1981*4882a593Smuzhiyun 			sctx->bios[sctx->curr]->next_free = -1;
1982*4882a593Smuzhiyun 			sctx->bios[sctx->curr]->page_count = 0;
1983*4882a593Smuzhiyun 			spin_unlock(&sctx->list_lock);
1984*4882a593Smuzhiyun 		} else {
1985*4882a593Smuzhiyun 			spin_unlock(&sctx->list_lock);
1986*4882a593Smuzhiyun 			wait_event(sctx->list_wait, sctx->first_free != -1);
1987*4882a593Smuzhiyun 		}
1988*4882a593Smuzhiyun 	}
1989*4882a593Smuzhiyun 	sbio = sctx->bios[sctx->curr];
1990*4882a593Smuzhiyun 	if (sbio->page_count == 0) {
1991*4882a593Smuzhiyun 		struct bio *bio;
1992*4882a593Smuzhiyun 
1993*4882a593Smuzhiyun 		sbio->physical = spage->physical;
1994*4882a593Smuzhiyun 		sbio->logical = spage->logical;
1995*4882a593Smuzhiyun 		sbio->dev = spage->dev;
1996*4882a593Smuzhiyun 		bio = sbio->bio;
1997*4882a593Smuzhiyun 		if (!bio) {
1998*4882a593Smuzhiyun 			bio = btrfs_io_bio_alloc(sctx->pages_per_rd_bio);
1999*4882a593Smuzhiyun 			sbio->bio = bio;
2000*4882a593Smuzhiyun 		}
2001*4882a593Smuzhiyun 
2002*4882a593Smuzhiyun 		bio->bi_private = sbio;
2003*4882a593Smuzhiyun 		bio->bi_end_io = scrub_bio_end_io;
2004*4882a593Smuzhiyun 		bio_set_dev(bio, sbio->dev->bdev);
2005*4882a593Smuzhiyun 		bio->bi_iter.bi_sector = sbio->physical >> 9;
2006*4882a593Smuzhiyun 		bio->bi_opf = REQ_OP_READ;
2007*4882a593Smuzhiyun 		sbio->status = 0;
2008*4882a593Smuzhiyun 	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
2009*4882a593Smuzhiyun 		   spage->physical ||
2010*4882a593Smuzhiyun 		   sbio->logical + sbio->page_count * PAGE_SIZE !=
2011*4882a593Smuzhiyun 		   spage->logical ||
2012*4882a593Smuzhiyun 		   sbio->dev != spage->dev) {
2013*4882a593Smuzhiyun 		scrub_submit(sctx);
2014*4882a593Smuzhiyun 		goto again;
2015*4882a593Smuzhiyun 	}
2016*4882a593Smuzhiyun 
2017*4882a593Smuzhiyun 	sbio->pagev[sbio->page_count] = spage;
2018*4882a593Smuzhiyun 	ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
2019*4882a593Smuzhiyun 	if (ret != PAGE_SIZE) {
2020*4882a593Smuzhiyun 		if (sbio->page_count < 1) {
2021*4882a593Smuzhiyun 			bio_put(sbio->bio);
2022*4882a593Smuzhiyun 			sbio->bio = NULL;
2023*4882a593Smuzhiyun 			return -EIO;
2024*4882a593Smuzhiyun 		}
2025*4882a593Smuzhiyun 		scrub_submit(sctx);
2026*4882a593Smuzhiyun 		goto again;
2027*4882a593Smuzhiyun 	}
2028*4882a593Smuzhiyun 
2029*4882a593Smuzhiyun 	scrub_block_get(sblock); /* one for the page added to the bio */
2030*4882a593Smuzhiyun 	atomic_inc(&sblock->outstanding_pages);
2031*4882a593Smuzhiyun 	sbio->page_count++;
2032*4882a593Smuzhiyun 	if (sbio->page_count == sctx->pages_per_rd_bio)
2033*4882a593Smuzhiyun 		scrub_submit(sctx);
2034*4882a593Smuzhiyun 
2035*4882a593Smuzhiyun 	return 0;
2036*4882a593Smuzhiyun }
2037*4882a593Smuzhiyun 
scrub_missing_raid56_end_io(struct bio * bio)2038*4882a593Smuzhiyun static void scrub_missing_raid56_end_io(struct bio *bio)
2039*4882a593Smuzhiyun {
2040*4882a593Smuzhiyun 	struct scrub_block *sblock = bio->bi_private;
2041*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
2042*4882a593Smuzhiyun 
2043*4882a593Smuzhiyun 	if (bio->bi_status)
2044*4882a593Smuzhiyun 		sblock->no_io_error_seen = 0;
2045*4882a593Smuzhiyun 
2046*4882a593Smuzhiyun 	bio_put(bio);
2047*4882a593Smuzhiyun 
2048*4882a593Smuzhiyun 	btrfs_queue_work(fs_info->scrub_workers, &sblock->work);
2049*4882a593Smuzhiyun }
2050*4882a593Smuzhiyun 
scrub_missing_raid56_worker(struct btrfs_work * work)2051*4882a593Smuzhiyun static void scrub_missing_raid56_worker(struct btrfs_work *work)
2052*4882a593Smuzhiyun {
2053*4882a593Smuzhiyun 	struct scrub_block *sblock = container_of(work, struct scrub_block, work);
2054*4882a593Smuzhiyun 	struct scrub_ctx *sctx = sblock->sctx;
2055*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = sctx->fs_info;
2056*4882a593Smuzhiyun 	u64 logical;
2057*4882a593Smuzhiyun 	struct btrfs_device *dev;
2058*4882a593Smuzhiyun 
2059*4882a593Smuzhiyun 	logical = sblock->pagev[0]->logical;
2060*4882a593Smuzhiyun 	dev = sblock->pagev[0]->dev;
2061*4882a593Smuzhiyun 
2062*4882a593Smuzhiyun 	if (sblock->no_io_error_seen)
2063*4882a593Smuzhiyun 		scrub_recheck_block_checksum(sblock);
2064*4882a593Smuzhiyun 
2065*4882a593Smuzhiyun 	if (!sblock->no_io_error_seen) {
2066*4882a593Smuzhiyun 		spin_lock(&sctx->stat_lock);
2067*4882a593Smuzhiyun 		sctx->stat.read_errors++;
2068*4882a593Smuzhiyun 		spin_unlock(&sctx->stat_lock);
2069*4882a593Smuzhiyun 		btrfs_err_rl_in_rcu(fs_info,
2070*4882a593Smuzhiyun 			"IO error rebuilding logical %llu for dev %s",
2071*4882a593Smuzhiyun 			logical, rcu_str_deref(dev->name));
2072*4882a593Smuzhiyun 	} else if (sblock->header_error || sblock->checksum_error) {
2073*4882a593Smuzhiyun 		spin_lock(&sctx->stat_lock);
2074*4882a593Smuzhiyun 		sctx->stat.uncorrectable_errors++;
2075*4882a593Smuzhiyun 		spin_unlock(&sctx->stat_lock);
2076*4882a593Smuzhiyun 		btrfs_err_rl_in_rcu(fs_info,
2077*4882a593Smuzhiyun 			"failed to rebuild valid logical %llu for dev %s",
2078*4882a593Smuzhiyun 			logical, rcu_str_deref(dev->name));
2079*4882a593Smuzhiyun 	} else {
2080*4882a593Smuzhiyun 		scrub_write_block_to_dev_replace(sblock);
2081*4882a593Smuzhiyun 	}
2082*4882a593Smuzhiyun 
2083*4882a593Smuzhiyun 	if (sctx->is_dev_replace && sctx->flush_all_writes) {
2084*4882a593Smuzhiyun 		mutex_lock(&sctx->wr_lock);
2085*4882a593Smuzhiyun 		scrub_wr_submit(sctx);
2086*4882a593Smuzhiyun 		mutex_unlock(&sctx->wr_lock);
2087*4882a593Smuzhiyun 	}
2088*4882a593Smuzhiyun 
2089*4882a593Smuzhiyun 	scrub_block_put(sblock);
2090*4882a593Smuzhiyun 	scrub_pending_bio_dec(sctx);
2091*4882a593Smuzhiyun }
2092*4882a593Smuzhiyun 
scrub_missing_raid56_pages(struct scrub_block * sblock)2093*4882a593Smuzhiyun static void scrub_missing_raid56_pages(struct scrub_block *sblock)
2094*4882a593Smuzhiyun {
2095*4882a593Smuzhiyun 	struct scrub_ctx *sctx = sblock->sctx;
2096*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = sctx->fs_info;
2097*4882a593Smuzhiyun 	u64 length = sblock->page_count * PAGE_SIZE;
2098*4882a593Smuzhiyun 	u64 logical = sblock->pagev[0]->logical;
2099*4882a593Smuzhiyun 	struct btrfs_bio *bbio = NULL;
2100*4882a593Smuzhiyun 	struct bio *bio;
2101*4882a593Smuzhiyun 	struct btrfs_raid_bio *rbio;
2102*4882a593Smuzhiyun 	int ret;
2103*4882a593Smuzhiyun 	int i;
2104*4882a593Smuzhiyun 
2105*4882a593Smuzhiyun 	btrfs_bio_counter_inc_blocked(fs_info);
2106*4882a593Smuzhiyun 	ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
2107*4882a593Smuzhiyun 			&length, &bbio);
2108*4882a593Smuzhiyun 	if (ret || !bbio || !bbio->raid_map)
2109*4882a593Smuzhiyun 		goto bbio_out;
2110*4882a593Smuzhiyun 
2111*4882a593Smuzhiyun 	if (WARN_ON(!sctx->is_dev_replace ||
2112*4882a593Smuzhiyun 		    !(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
2113*4882a593Smuzhiyun 		/*
2114*4882a593Smuzhiyun 		 * We shouldn't be scrubbing a missing device. Even for dev
2115*4882a593Smuzhiyun 		 * replace, we should only get here for RAID 5/6. We either
2116*4882a593Smuzhiyun 		 * managed to mount something with no mirrors remaining or
2117*4882a593Smuzhiyun 		 * there's a bug in scrub_remap_extent()/btrfs_map_block().
2118*4882a593Smuzhiyun 		 */
2119*4882a593Smuzhiyun 		goto bbio_out;
2120*4882a593Smuzhiyun 	}
2121*4882a593Smuzhiyun 
2122*4882a593Smuzhiyun 	bio = btrfs_io_bio_alloc(0);
2123*4882a593Smuzhiyun 	bio->bi_iter.bi_sector = logical >> 9;
2124*4882a593Smuzhiyun 	bio->bi_private = sblock;
2125*4882a593Smuzhiyun 	bio->bi_end_io = scrub_missing_raid56_end_io;
2126*4882a593Smuzhiyun 
2127*4882a593Smuzhiyun 	rbio = raid56_alloc_missing_rbio(fs_info, bio, bbio, length);
2128*4882a593Smuzhiyun 	if (!rbio)
2129*4882a593Smuzhiyun 		goto rbio_out;
2130*4882a593Smuzhiyun 
2131*4882a593Smuzhiyun 	for (i = 0; i < sblock->page_count; i++) {
2132*4882a593Smuzhiyun 		struct scrub_page *spage = sblock->pagev[i];
2133*4882a593Smuzhiyun 
2134*4882a593Smuzhiyun 		raid56_add_scrub_pages(rbio, spage->page, spage->logical);
2135*4882a593Smuzhiyun 	}
2136*4882a593Smuzhiyun 
2137*4882a593Smuzhiyun 	btrfs_init_work(&sblock->work, scrub_missing_raid56_worker, NULL, NULL);
2138*4882a593Smuzhiyun 	scrub_block_get(sblock);
2139*4882a593Smuzhiyun 	scrub_pending_bio_inc(sctx);
2140*4882a593Smuzhiyun 	raid56_submit_missing_rbio(rbio);
2141*4882a593Smuzhiyun 	return;
2142*4882a593Smuzhiyun 
2143*4882a593Smuzhiyun rbio_out:
2144*4882a593Smuzhiyun 	bio_put(bio);
2145*4882a593Smuzhiyun bbio_out:
2146*4882a593Smuzhiyun 	btrfs_bio_counter_dec(fs_info);
2147*4882a593Smuzhiyun 	btrfs_put_bbio(bbio);
2148*4882a593Smuzhiyun 	spin_lock(&sctx->stat_lock);
2149*4882a593Smuzhiyun 	sctx->stat.malloc_errors++;
2150*4882a593Smuzhiyun 	spin_unlock(&sctx->stat_lock);
2151*4882a593Smuzhiyun }
2152*4882a593Smuzhiyun 
scrub_pages(struct scrub_ctx * sctx,u64 logical,u64 len,u64 physical,struct btrfs_device * dev,u64 flags,u64 gen,int mirror_num,u8 * csum,int force,u64 physical_for_dev_replace)2153*4882a593Smuzhiyun static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
2154*4882a593Smuzhiyun 		       u64 physical, struct btrfs_device *dev, u64 flags,
2155*4882a593Smuzhiyun 		       u64 gen, int mirror_num, u8 *csum, int force,
2156*4882a593Smuzhiyun 		       u64 physical_for_dev_replace)
2157*4882a593Smuzhiyun {
2158*4882a593Smuzhiyun 	struct scrub_block *sblock;
2159*4882a593Smuzhiyun 	int index;
2160*4882a593Smuzhiyun 
2161*4882a593Smuzhiyun 	sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2162*4882a593Smuzhiyun 	if (!sblock) {
2163*4882a593Smuzhiyun 		spin_lock(&sctx->stat_lock);
2164*4882a593Smuzhiyun 		sctx->stat.malloc_errors++;
2165*4882a593Smuzhiyun 		spin_unlock(&sctx->stat_lock);
2166*4882a593Smuzhiyun 		return -ENOMEM;
2167*4882a593Smuzhiyun 	}
2168*4882a593Smuzhiyun 
2169*4882a593Smuzhiyun 	/* one ref inside this function, plus one for each page added to
2170*4882a593Smuzhiyun 	 * a bio later on */
2171*4882a593Smuzhiyun 	refcount_set(&sblock->refs, 1);
2172*4882a593Smuzhiyun 	sblock->sctx = sctx;
2173*4882a593Smuzhiyun 	sblock->no_io_error_seen = 1;
2174*4882a593Smuzhiyun 
2175*4882a593Smuzhiyun 	for (index = 0; len > 0; index++) {
2176*4882a593Smuzhiyun 		struct scrub_page *spage;
2177*4882a593Smuzhiyun 		u64 l = min_t(u64, len, PAGE_SIZE);
2178*4882a593Smuzhiyun 
2179*4882a593Smuzhiyun 		spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2180*4882a593Smuzhiyun 		if (!spage) {
2181*4882a593Smuzhiyun leave_nomem:
2182*4882a593Smuzhiyun 			spin_lock(&sctx->stat_lock);
2183*4882a593Smuzhiyun 			sctx->stat.malloc_errors++;
2184*4882a593Smuzhiyun 			spin_unlock(&sctx->stat_lock);
2185*4882a593Smuzhiyun 			scrub_block_put(sblock);
2186*4882a593Smuzhiyun 			return -ENOMEM;
2187*4882a593Smuzhiyun 		}
2188*4882a593Smuzhiyun 		BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2189*4882a593Smuzhiyun 		scrub_page_get(spage);
2190*4882a593Smuzhiyun 		sblock->pagev[index] = spage;
2191*4882a593Smuzhiyun 		spage->sblock = sblock;
2192*4882a593Smuzhiyun 		spage->dev = dev;
2193*4882a593Smuzhiyun 		spage->flags = flags;
2194*4882a593Smuzhiyun 		spage->generation = gen;
2195*4882a593Smuzhiyun 		spage->logical = logical;
2196*4882a593Smuzhiyun 		spage->physical = physical;
2197*4882a593Smuzhiyun 		spage->physical_for_dev_replace = physical_for_dev_replace;
2198*4882a593Smuzhiyun 		spage->mirror_num = mirror_num;
2199*4882a593Smuzhiyun 		if (csum) {
2200*4882a593Smuzhiyun 			spage->have_csum = 1;
2201*4882a593Smuzhiyun 			memcpy(spage->csum, csum, sctx->csum_size);
2202*4882a593Smuzhiyun 		} else {
2203*4882a593Smuzhiyun 			spage->have_csum = 0;
2204*4882a593Smuzhiyun 		}
2205*4882a593Smuzhiyun 		sblock->page_count++;
2206*4882a593Smuzhiyun 		spage->page = alloc_page(GFP_KERNEL);
2207*4882a593Smuzhiyun 		if (!spage->page)
2208*4882a593Smuzhiyun 			goto leave_nomem;
2209*4882a593Smuzhiyun 		len -= l;
2210*4882a593Smuzhiyun 		logical += l;
2211*4882a593Smuzhiyun 		physical += l;
2212*4882a593Smuzhiyun 		physical_for_dev_replace += l;
2213*4882a593Smuzhiyun 	}
2214*4882a593Smuzhiyun 
2215*4882a593Smuzhiyun 	WARN_ON(sblock->page_count == 0);
2216*4882a593Smuzhiyun 	if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2217*4882a593Smuzhiyun 		/*
2218*4882a593Smuzhiyun 		 * This case should only be hit for RAID 5/6 device replace. See
2219*4882a593Smuzhiyun 		 * the comment in scrub_missing_raid56_pages() for details.
2220*4882a593Smuzhiyun 		 */
2221*4882a593Smuzhiyun 		scrub_missing_raid56_pages(sblock);
2222*4882a593Smuzhiyun 	} else {
2223*4882a593Smuzhiyun 		for (index = 0; index < sblock->page_count; index++) {
2224*4882a593Smuzhiyun 			struct scrub_page *spage = sblock->pagev[index];
2225*4882a593Smuzhiyun 			int ret;
2226*4882a593Smuzhiyun 
2227*4882a593Smuzhiyun 			ret = scrub_add_page_to_rd_bio(sctx, spage);
2228*4882a593Smuzhiyun 			if (ret) {
2229*4882a593Smuzhiyun 				scrub_block_put(sblock);
2230*4882a593Smuzhiyun 				return ret;
2231*4882a593Smuzhiyun 			}
2232*4882a593Smuzhiyun 		}
2233*4882a593Smuzhiyun 
2234*4882a593Smuzhiyun 		if (force)
2235*4882a593Smuzhiyun 			scrub_submit(sctx);
2236*4882a593Smuzhiyun 	}
2237*4882a593Smuzhiyun 
2238*4882a593Smuzhiyun 	/* last one frees, either here or in bio completion for last page */
2239*4882a593Smuzhiyun 	scrub_block_put(sblock);
2240*4882a593Smuzhiyun 	return 0;
2241*4882a593Smuzhiyun }
2242*4882a593Smuzhiyun 
scrub_bio_end_io(struct bio * bio)2243*4882a593Smuzhiyun static void scrub_bio_end_io(struct bio *bio)
2244*4882a593Smuzhiyun {
2245*4882a593Smuzhiyun 	struct scrub_bio *sbio = bio->bi_private;
2246*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
2247*4882a593Smuzhiyun 
2248*4882a593Smuzhiyun 	sbio->status = bio->bi_status;
2249*4882a593Smuzhiyun 	sbio->bio = bio;
2250*4882a593Smuzhiyun 
2251*4882a593Smuzhiyun 	btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
2252*4882a593Smuzhiyun }
2253*4882a593Smuzhiyun 
scrub_bio_end_io_worker(struct btrfs_work * work)2254*4882a593Smuzhiyun static void scrub_bio_end_io_worker(struct btrfs_work *work)
2255*4882a593Smuzhiyun {
2256*4882a593Smuzhiyun 	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2257*4882a593Smuzhiyun 	struct scrub_ctx *sctx = sbio->sctx;
2258*4882a593Smuzhiyun 	int i;
2259*4882a593Smuzhiyun 
2260*4882a593Smuzhiyun 	BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
2261*4882a593Smuzhiyun 	if (sbio->status) {
2262*4882a593Smuzhiyun 		for (i = 0; i < sbio->page_count; i++) {
2263*4882a593Smuzhiyun 			struct scrub_page *spage = sbio->pagev[i];
2264*4882a593Smuzhiyun 
2265*4882a593Smuzhiyun 			spage->io_error = 1;
2266*4882a593Smuzhiyun 			spage->sblock->no_io_error_seen = 0;
2267*4882a593Smuzhiyun 		}
2268*4882a593Smuzhiyun 	}
2269*4882a593Smuzhiyun 
2270*4882a593Smuzhiyun 	/* now complete the scrub_block items that have all pages completed */
2271*4882a593Smuzhiyun 	for (i = 0; i < sbio->page_count; i++) {
2272*4882a593Smuzhiyun 		struct scrub_page *spage = sbio->pagev[i];
2273*4882a593Smuzhiyun 		struct scrub_block *sblock = spage->sblock;
2274*4882a593Smuzhiyun 
2275*4882a593Smuzhiyun 		if (atomic_dec_and_test(&sblock->outstanding_pages))
2276*4882a593Smuzhiyun 			scrub_block_complete(sblock);
2277*4882a593Smuzhiyun 		scrub_block_put(sblock);
2278*4882a593Smuzhiyun 	}
2279*4882a593Smuzhiyun 
2280*4882a593Smuzhiyun 	bio_put(sbio->bio);
2281*4882a593Smuzhiyun 	sbio->bio = NULL;
2282*4882a593Smuzhiyun 	spin_lock(&sctx->list_lock);
2283*4882a593Smuzhiyun 	sbio->next_free = sctx->first_free;
2284*4882a593Smuzhiyun 	sctx->first_free = sbio->index;
2285*4882a593Smuzhiyun 	spin_unlock(&sctx->list_lock);
2286*4882a593Smuzhiyun 
2287*4882a593Smuzhiyun 	if (sctx->is_dev_replace && sctx->flush_all_writes) {
2288*4882a593Smuzhiyun 		mutex_lock(&sctx->wr_lock);
2289*4882a593Smuzhiyun 		scrub_wr_submit(sctx);
2290*4882a593Smuzhiyun 		mutex_unlock(&sctx->wr_lock);
2291*4882a593Smuzhiyun 	}
2292*4882a593Smuzhiyun 
2293*4882a593Smuzhiyun 	scrub_pending_bio_dec(sctx);
2294*4882a593Smuzhiyun }
2295*4882a593Smuzhiyun 
__scrub_mark_bitmap(struct scrub_parity * sparity,unsigned long * bitmap,u64 start,u64 len)2296*4882a593Smuzhiyun static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2297*4882a593Smuzhiyun 				       unsigned long *bitmap,
2298*4882a593Smuzhiyun 				       u64 start, u64 len)
2299*4882a593Smuzhiyun {
2300*4882a593Smuzhiyun 	u64 offset;
2301*4882a593Smuzhiyun 	u64 nsectors64;
2302*4882a593Smuzhiyun 	u32 nsectors;
2303*4882a593Smuzhiyun 	int sectorsize = sparity->sctx->fs_info->sectorsize;
2304*4882a593Smuzhiyun 
2305*4882a593Smuzhiyun 	if (len >= sparity->stripe_len) {
2306*4882a593Smuzhiyun 		bitmap_set(bitmap, 0, sparity->nsectors);
2307*4882a593Smuzhiyun 		return;
2308*4882a593Smuzhiyun 	}
2309*4882a593Smuzhiyun 
2310*4882a593Smuzhiyun 	start -= sparity->logic_start;
2311*4882a593Smuzhiyun 	start = div64_u64_rem(start, sparity->stripe_len, &offset);
2312*4882a593Smuzhiyun 	offset = div_u64(offset, sectorsize);
2313*4882a593Smuzhiyun 	nsectors64 = div_u64(len, sectorsize);
2314*4882a593Smuzhiyun 
2315*4882a593Smuzhiyun 	ASSERT(nsectors64 < UINT_MAX);
2316*4882a593Smuzhiyun 	nsectors = (u32)nsectors64;
2317*4882a593Smuzhiyun 
2318*4882a593Smuzhiyun 	if (offset + nsectors <= sparity->nsectors) {
2319*4882a593Smuzhiyun 		bitmap_set(bitmap, offset, nsectors);
2320*4882a593Smuzhiyun 		return;
2321*4882a593Smuzhiyun 	}
2322*4882a593Smuzhiyun 
2323*4882a593Smuzhiyun 	bitmap_set(bitmap, offset, sparity->nsectors - offset);
2324*4882a593Smuzhiyun 	bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2325*4882a593Smuzhiyun }
2326*4882a593Smuzhiyun 
scrub_parity_mark_sectors_error(struct scrub_parity * sparity,u64 start,u64 len)2327*4882a593Smuzhiyun static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2328*4882a593Smuzhiyun 						   u64 start, u64 len)
2329*4882a593Smuzhiyun {
2330*4882a593Smuzhiyun 	__scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
2331*4882a593Smuzhiyun }
2332*4882a593Smuzhiyun 
scrub_parity_mark_sectors_data(struct scrub_parity * sparity,u64 start,u64 len)2333*4882a593Smuzhiyun static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2334*4882a593Smuzhiyun 						  u64 start, u64 len)
2335*4882a593Smuzhiyun {
2336*4882a593Smuzhiyun 	__scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
2337*4882a593Smuzhiyun }
2338*4882a593Smuzhiyun 
scrub_block_complete(struct scrub_block * sblock)2339*4882a593Smuzhiyun static void scrub_block_complete(struct scrub_block *sblock)
2340*4882a593Smuzhiyun {
2341*4882a593Smuzhiyun 	int corrupted = 0;
2342*4882a593Smuzhiyun 
2343*4882a593Smuzhiyun 	if (!sblock->no_io_error_seen) {
2344*4882a593Smuzhiyun 		corrupted = 1;
2345*4882a593Smuzhiyun 		scrub_handle_errored_block(sblock);
2346*4882a593Smuzhiyun 	} else {
2347*4882a593Smuzhiyun 		/*
2348*4882a593Smuzhiyun 		 * if has checksum error, write via repair mechanism in
2349*4882a593Smuzhiyun 		 * dev replace case, otherwise write here in dev replace
2350*4882a593Smuzhiyun 		 * case.
2351*4882a593Smuzhiyun 		 */
2352*4882a593Smuzhiyun 		corrupted = scrub_checksum(sblock);
2353*4882a593Smuzhiyun 		if (!corrupted && sblock->sctx->is_dev_replace)
2354*4882a593Smuzhiyun 			scrub_write_block_to_dev_replace(sblock);
2355*4882a593Smuzhiyun 	}
2356*4882a593Smuzhiyun 
2357*4882a593Smuzhiyun 	if (sblock->sparity && corrupted && !sblock->data_corrected) {
2358*4882a593Smuzhiyun 		u64 start = sblock->pagev[0]->logical;
2359*4882a593Smuzhiyun 		u64 end = sblock->pagev[sblock->page_count - 1]->logical +
2360*4882a593Smuzhiyun 			  PAGE_SIZE;
2361*4882a593Smuzhiyun 
2362*4882a593Smuzhiyun 		scrub_parity_mark_sectors_error(sblock->sparity,
2363*4882a593Smuzhiyun 						start, end - start);
2364*4882a593Smuzhiyun 	}
2365*4882a593Smuzhiyun }
2366*4882a593Smuzhiyun 
scrub_find_csum(struct scrub_ctx * sctx,u64 logical,u8 * csum)2367*4882a593Smuzhiyun static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
2368*4882a593Smuzhiyun {
2369*4882a593Smuzhiyun 	struct btrfs_ordered_sum *sum = NULL;
2370*4882a593Smuzhiyun 	unsigned long index;
2371*4882a593Smuzhiyun 	unsigned long num_sectors;
2372*4882a593Smuzhiyun 
2373*4882a593Smuzhiyun 	while (!list_empty(&sctx->csum_list)) {
2374*4882a593Smuzhiyun 		sum = list_first_entry(&sctx->csum_list,
2375*4882a593Smuzhiyun 				       struct btrfs_ordered_sum, list);
2376*4882a593Smuzhiyun 		if (sum->bytenr > logical)
2377*4882a593Smuzhiyun 			return 0;
2378*4882a593Smuzhiyun 		if (sum->bytenr + sum->len > logical)
2379*4882a593Smuzhiyun 			break;
2380*4882a593Smuzhiyun 
2381*4882a593Smuzhiyun 		++sctx->stat.csum_discards;
2382*4882a593Smuzhiyun 		list_del(&sum->list);
2383*4882a593Smuzhiyun 		kfree(sum);
2384*4882a593Smuzhiyun 		sum = NULL;
2385*4882a593Smuzhiyun 	}
2386*4882a593Smuzhiyun 	if (!sum)
2387*4882a593Smuzhiyun 		return 0;
2388*4882a593Smuzhiyun 
2389*4882a593Smuzhiyun 	index = div_u64(logical - sum->bytenr, sctx->fs_info->sectorsize);
2390*4882a593Smuzhiyun 	ASSERT(index < UINT_MAX);
2391*4882a593Smuzhiyun 
2392*4882a593Smuzhiyun 	num_sectors = sum->len / sctx->fs_info->sectorsize;
2393*4882a593Smuzhiyun 	memcpy(csum, sum->sums + index * sctx->csum_size, sctx->csum_size);
2394*4882a593Smuzhiyun 	if (index == num_sectors - 1) {
2395*4882a593Smuzhiyun 		list_del(&sum->list);
2396*4882a593Smuzhiyun 		kfree(sum);
2397*4882a593Smuzhiyun 	}
2398*4882a593Smuzhiyun 	return 1;
2399*4882a593Smuzhiyun }
2400*4882a593Smuzhiyun 
2401*4882a593Smuzhiyun /* scrub extent tries to collect up to 64 kB for each bio */
scrub_extent(struct scrub_ctx * sctx,struct map_lookup * map,u64 logical,u64 len,u64 physical,struct btrfs_device * dev,u64 flags,u64 gen,int mirror_num,u64 physical_for_dev_replace)2402*4882a593Smuzhiyun static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
2403*4882a593Smuzhiyun 			u64 logical, u64 len,
2404*4882a593Smuzhiyun 			u64 physical, struct btrfs_device *dev, u64 flags,
2405*4882a593Smuzhiyun 			u64 gen, int mirror_num, u64 physical_for_dev_replace)
2406*4882a593Smuzhiyun {
2407*4882a593Smuzhiyun 	int ret;
2408*4882a593Smuzhiyun 	u8 csum[BTRFS_CSUM_SIZE];
2409*4882a593Smuzhiyun 	u32 blocksize;
2410*4882a593Smuzhiyun 
2411*4882a593Smuzhiyun 	if (flags & BTRFS_EXTENT_FLAG_DATA) {
2412*4882a593Smuzhiyun 		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2413*4882a593Smuzhiyun 			blocksize = map->stripe_len;
2414*4882a593Smuzhiyun 		else
2415*4882a593Smuzhiyun 			blocksize = sctx->fs_info->sectorsize;
2416*4882a593Smuzhiyun 		spin_lock(&sctx->stat_lock);
2417*4882a593Smuzhiyun 		sctx->stat.data_extents_scrubbed++;
2418*4882a593Smuzhiyun 		sctx->stat.data_bytes_scrubbed += len;
2419*4882a593Smuzhiyun 		spin_unlock(&sctx->stat_lock);
2420*4882a593Smuzhiyun 	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2421*4882a593Smuzhiyun 		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2422*4882a593Smuzhiyun 			blocksize = map->stripe_len;
2423*4882a593Smuzhiyun 		else
2424*4882a593Smuzhiyun 			blocksize = sctx->fs_info->nodesize;
2425*4882a593Smuzhiyun 		spin_lock(&sctx->stat_lock);
2426*4882a593Smuzhiyun 		sctx->stat.tree_extents_scrubbed++;
2427*4882a593Smuzhiyun 		sctx->stat.tree_bytes_scrubbed += len;
2428*4882a593Smuzhiyun 		spin_unlock(&sctx->stat_lock);
2429*4882a593Smuzhiyun 	} else {
2430*4882a593Smuzhiyun 		blocksize = sctx->fs_info->sectorsize;
2431*4882a593Smuzhiyun 		WARN_ON(1);
2432*4882a593Smuzhiyun 	}
2433*4882a593Smuzhiyun 
2434*4882a593Smuzhiyun 	while (len) {
2435*4882a593Smuzhiyun 		u64 l = min_t(u64, len, blocksize);
2436*4882a593Smuzhiyun 		int have_csum = 0;
2437*4882a593Smuzhiyun 
2438*4882a593Smuzhiyun 		if (flags & BTRFS_EXTENT_FLAG_DATA) {
2439*4882a593Smuzhiyun 			/* push csums to sbio */
2440*4882a593Smuzhiyun 			have_csum = scrub_find_csum(sctx, logical, csum);
2441*4882a593Smuzhiyun 			if (have_csum == 0)
2442*4882a593Smuzhiyun 				++sctx->stat.no_csum;
2443*4882a593Smuzhiyun 		}
2444*4882a593Smuzhiyun 		ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
2445*4882a593Smuzhiyun 				  mirror_num, have_csum ? csum : NULL, 0,
2446*4882a593Smuzhiyun 				  physical_for_dev_replace);
2447*4882a593Smuzhiyun 		if (ret)
2448*4882a593Smuzhiyun 			return ret;
2449*4882a593Smuzhiyun 		len -= l;
2450*4882a593Smuzhiyun 		logical += l;
2451*4882a593Smuzhiyun 		physical += l;
2452*4882a593Smuzhiyun 		physical_for_dev_replace += l;
2453*4882a593Smuzhiyun 	}
2454*4882a593Smuzhiyun 	return 0;
2455*4882a593Smuzhiyun }
2456*4882a593Smuzhiyun 
scrub_pages_for_parity(struct scrub_parity * sparity,u64 logical,u64 len,u64 physical,struct btrfs_device * dev,u64 flags,u64 gen,int mirror_num,u8 * csum)2457*4882a593Smuzhiyun static int scrub_pages_for_parity(struct scrub_parity *sparity,
2458*4882a593Smuzhiyun 				  u64 logical, u64 len,
2459*4882a593Smuzhiyun 				  u64 physical, struct btrfs_device *dev,
2460*4882a593Smuzhiyun 				  u64 flags, u64 gen, int mirror_num, u8 *csum)
2461*4882a593Smuzhiyun {
2462*4882a593Smuzhiyun 	struct scrub_ctx *sctx = sparity->sctx;
2463*4882a593Smuzhiyun 	struct scrub_block *sblock;
2464*4882a593Smuzhiyun 	int index;
2465*4882a593Smuzhiyun 
2466*4882a593Smuzhiyun 	sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2467*4882a593Smuzhiyun 	if (!sblock) {
2468*4882a593Smuzhiyun 		spin_lock(&sctx->stat_lock);
2469*4882a593Smuzhiyun 		sctx->stat.malloc_errors++;
2470*4882a593Smuzhiyun 		spin_unlock(&sctx->stat_lock);
2471*4882a593Smuzhiyun 		return -ENOMEM;
2472*4882a593Smuzhiyun 	}
2473*4882a593Smuzhiyun 
2474*4882a593Smuzhiyun 	/* one ref inside this function, plus one for each page added to
2475*4882a593Smuzhiyun 	 * a bio later on */
2476*4882a593Smuzhiyun 	refcount_set(&sblock->refs, 1);
2477*4882a593Smuzhiyun 	sblock->sctx = sctx;
2478*4882a593Smuzhiyun 	sblock->no_io_error_seen = 1;
2479*4882a593Smuzhiyun 	sblock->sparity = sparity;
2480*4882a593Smuzhiyun 	scrub_parity_get(sparity);
2481*4882a593Smuzhiyun 
2482*4882a593Smuzhiyun 	for (index = 0; len > 0; index++) {
2483*4882a593Smuzhiyun 		struct scrub_page *spage;
2484*4882a593Smuzhiyun 		u64 l = min_t(u64, len, PAGE_SIZE);
2485*4882a593Smuzhiyun 
2486*4882a593Smuzhiyun 		spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2487*4882a593Smuzhiyun 		if (!spage) {
2488*4882a593Smuzhiyun leave_nomem:
2489*4882a593Smuzhiyun 			spin_lock(&sctx->stat_lock);
2490*4882a593Smuzhiyun 			sctx->stat.malloc_errors++;
2491*4882a593Smuzhiyun 			spin_unlock(&sctx->stat_lock);
2492*4882a593Smuzhiyun 			scrub_block_put(sblock);
2493*4882a593Smuzhiyun 			return -ENOMEM;
2494*4882a593Smuzhiyun 		}
2495*4882a593Smuzhiyun 		BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2496*4882a593Smuzhiyun 		/* For scrub block */
2497*4882a593Smuzhiyun 		scrub_page_get(spage);
2498*4882a593Smuzhiyun 		sblock->pagev[index] = spage;
2499*4882a593Smuzhiyun 		/* For scrub parity */
2500*4882a593Smuzhiyun 		scrub_page_get(spage);
2501*4882a593Smuzhiyun 		list_add_tail(&spage->list, &sparity->spages);
2502*4882a593Smuzhiyun 		spage->sblock = sblock;
2503*4882a593Smuzhiyun 		spage->dev = dev;
2504*4882a593Smuzhiyun 		spage->flags = flags;
2505*4882a593Smuzhiyun 		spage->generation = gen;
2506*4882a593Smuzhiyun 		spage->logical = logical;
2507*4882a593Smuzhiyun 		spage->physical = physical;
2508*4882a593Smuzhiyun 		spage->mirror_num = mirror_num;
2509*4882a593Smuzhiyun 		if (csum) {
2510*4882a593Smuzhiyun 			spage->have_csum = 1;
2511*4882a593Smuzhiyun 			memcpy(spage->csum, csum, sctx->csum_size);
2512*4882a593Smuzhiyun 		} else {
2513*4882a593Smuzhiyun 			spage->have_csum = 0;
2514*4882a593Smuzhiyun 		}
2515*4882a593Smuzhiyun 		sblock->page_count++;
2516*4882a593Smuzhiyun 		spage->page = alloc_page(GFP_KERNEL);
2517*4882a593Smuzhiyun 		if (!spage->page)
2518*4882a593Smuzhiyun 			goto leave_nomem;
2519*4882a593Smuzhiyun 		len -= l;
2520*4882a593Smuzhiyun 		logical += l;
2521*4882a593Smuzhiyun 		physical += l;
2522*4882a593Smuzhiyun 	}
2523*4882a593Smuzhiyun 
2524*4882a593Smuzhiyun 	WARN_ON(sblock->page_count == 0);
2525*4882a593Smuzhiyun 	for (index = 0; index < sblock->page_count; index++) {
2526*4882a593Smuzhiyun 		struct scrub_page *spage = sblock->pagev[index];
2527*4882a593Smuzhiyun 		int ret;
2528*4882a593Smuzhiyun 
2529*4882a593Smuzhiyun 		ret = scrub_add_page_to_rd_bio(sctx, spage);
2530*4882a593Smuzhiyun 		if (ret) {
2531*4882a593Smuzhiyun 			scrub_block_put(sblock);
2532*4882a593Smuzhiyun 			return ret;
2533*4882a593Smuzhiyun 		}
2534*4882a593Smuzhiyun 	}
2535*4882a593Smuzhiyun 
2536*4882a593Smuzhiyun 	/* last one frees, either here or in bio completion for last page */
2537*4882a593Smuzhiyun 	scrub_block_put(sblock);
2538*4882a593Smuzhiyun 	return 0;
2539*4882a593Smuzhiyun }
2540*4882a593Smuzhiyun 
scrub_extent_for_parity(struct scrub_parity * sparity,u64 logical,u64 len,u64 physical,struct btrfs_device * dev,u64 flags,u64 gen,int mirror_num)2541*4882a593Smuzhiyun static int scrub_extent_for_parity(struct scrub_parity *sparity,
2542*4882a593Smuzhiyun 				   u64 logical, u64 len,
2543*4882a593Smuzhiyun 				   u64 physical, struct btrfs_device *dev,
2544*4882a593Smuzhiyun 				   u64 flags, u64 gen, int mirror_num)
2545*4882a593Smuzhiyun {
2546*4882a593Smuzhiyun 	struct scrub_ctx *sctx = sparity->sctx;
2547*4882a593Smuzhiyun 	int ret;
2548*4882a593Smuzhiyun 	u8 csum[BTRFS_CSUM_SIZE];
2549*4882a593Smuzhiyun 	u32 blocksize;
2550*4882a593Smuzhiyun 
2551*4882a593Smuzhiyun 	if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2552*4882a593Smuzhiyun 		scrub_parity_mark_sectors_error(sparity, logical, len);
2553*4882a593Smuzhiyun 		return 0;
2554*4882a593Smuzhiyun 	}
2555*4882a593Smuzhiyun 
2556*4882a593Smuzhiyun 	if (flags & BTRFS_EXTENT_FLAG_DATA) {
2557*4882a593Smuzhiyun 		blocksize = sparity->stripe_len;
2558*4882a593Smuzhiyun 	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2559*4882a593Smuzhiyun 		blocksize = sparity->stripe_len;
2560*4882a593Smuzhiyun 	} else {
2561*4882a593Smuzhiyun 		blocksize = sctx->fs_info->sectorsize;
2562*4882a593Smuzhiyun 		WARN_ON(1);
2563*4882a593Smuzhiyun 	}
2564*4882a593Smuzhiyun 
2565*4882a593Smuzhiyun 	while (len) {
2566*4882a593Smuzhiyun 		u64 l = min_t(u64, len, blocksize);
2567*4882a593Smuzhiyun 		int have_csum = 0;
2568*4882a593Smuzhiyun 
2569*4882a593Smuzhiyun 		if (flags & BTRFS_EXTENT_FLAG_DATA) {
2570*4882a593Smuzhiyun 			/* push csums to sbio */
2571*4882a593Smuzhiyun 			have_csum = scrub_find_csum(sctx, logical, csum);
2572*4882a593Smuzhiyun 			if (have_csum == 0)
2573*4882a593Smuzhiyun 				goto skip;
2574*4882a593Smuzhiyun 		}
2575*4882a593Smuzhiyun 		ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
2576*4882a593Smuzhiyun 					     flags, gen, mirror_num,
2577*4882a593Smuzhiyun 					     have_csum ? csum : NULL);
2578*4882a593Smuzhiyun 		if (ret)
2579*4882a593Smuzhiyun 			return ret;
2580*4882a593Smuzhiyun skip:
2581*4882a593Smuzhiyun 		len -= l;
2582*4882a593Smuzhiyun 		logical += l;
2583*4882a593Smuzhiyun 		physical += l;
2584*4882a593Smuzhiyun 	}
2585*4882a593Smuzhiyun 	return 0;
2586*4882a593Smuzhiyun }
2587*4882a593Smuzhiyun 
2588*4882a593Smuzhiyun /*
2589*4882a593Smuzhiyun  * Given a physical address, this will calculate it's
2590*4882a593Smuzhiyun  * logical offset. if this is a parity stripe, it will return
2591*4882a593Smuzhiyun  * the most left data stripe's logical offset.
2592*4882a593Smuzhiyun  *
2593*4882a593Smuzhiyun  * return 0 if it is a data stripe, 1 means parity stripe.
2594*4882a593Smuzhiyun  */
get_raid56_logic_offset(u64 physical,int num,struct map_lookup * map,u64 * offset,u64 * stripe_start)2595*4882a593Smuzhiyun static int get_raid56_logic_offset(u64 physical, int num,
2596*4882a593Smuzhiyun 				   struct map_lookup *map, u64 *offset,
2597*4882a593Smuzhiyun 				   u64 *stripe_start)
2598*4882a593Smuzhiyun {
2599*4882a593Smuzhiyun 	int i;
2600*4882a593Smuzhiyun 	int j = 0;
2601*4882a593Smuzhiyun 	u64 stripe_nr;
2602*4882a593Smuzhiyun 	u64 last_offset;
2603*4882a593Smuzhiyun 	u32 stripe_index;
2604*4882a593Smuzhiyun 	u32 rot;
2605*4882a593Smuzhiyun 	const int data_stripes = nr_data_stripes(map);
2606*4882a593Smuzhiyun 
2607*4882a593Smuzhiyun 	last_offset = (physical - map->stripes[num].physical) * data_stripes;
2608*4882a593Smuzhiyun 	if (stripe_start)
2609*4882a593Smuzhiyun 		*stripe_start = last_offset;
2610*4882a593Smuzhiyun 
2611*4882a593Smuzhiyun 	*offset = last_offset;
2612*4882a593Smuzhiyun 	for (i = 0; i < data_stripes; i++) {
2613*4882a593Smuzhiyun 		*offset = last_offset + i * map->stripe_len;
2614*4882a593Smuzhiyun 
2615*4882a593Smuzhiyun 		stripe_nr = div64_u64(*offset, map->stripe_len);
2616*4882a593Smuzhiyun 		stripe_nr = div_u64(stripe_nr, data_stripes);
2617*4882a593Smuzhiyun 
2618*4882a593Smuzhiyun 		/* Work out the disk rotation on this stripe-set */
2619*4882a593Smuzhiyun 		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
2620*4882a593Smuzhiyun 		/* calculate which stripe this data locates */
2621*4882a593Smuzhiyun 		rot += i;
2622*4882a593Smuzhiyun 		stripe_index = rot % map->num_stripes;
2623*4882a593Smuzhiyun 		if (stripe_index == num)
2624*4882a593Smuzhiyun 			return 0;
2625*4882a593Smuzhiyun 		if (stripe_index < num)
2626*4882a593Smuzhiyun 			j++;
2627*4882a593Smuzhiyun 	}
2628*4882a593Smuzhiyun 	*offset = last_offset + j * map->stripe_len;
2629*4882a593Smuzhiyun 	return 1;
2630*4882a593Smuzhiyun }
2631*4882a593Smuzhiyun 
scrub_free_parity(struct scrub_parity * sparity)2632*4882a593Smuzhiyun static void scrub_free_parity(struct scrub_parity *sparity)
2633*4882a593Smuzhiyun {
2634*4882a593Smuzhiyun 	struct scrub_ctx *sctx = sparity->sctx;
2635*4882a593Smuzhiyun 	struct scrub_page *curr, *next;
2636*4882a593Smuzhiyun 	int nbits;
2637*4882a593Smuzhiyun 
2638*4882a593Smuzhiyun 	nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
2639*4882a593Smuzhiyun 	if (nbits) {
2640*4882a593Smuzhiyun 		spin_lock(&sctx->stat_lock);
2641*4882a593Smuzhiyun 		sctx->stat.read_errors += nbits;
2642*4882a593Smuzhiyun 		sctx->stat.uncorrectable_errors += nbits;
2643*4882a593Smuzhiyun 		spin_unlock(&sctx->stat_lock);
2644*4882a593Smuzhiyun 	}
2645*4882a593Smuzhiyun 
2646*4882a593Smuzhiyun 	list_for_each_entry_safe(curr, next, &sparity->spages, list) {
2647*4882a593Smuzhiyun 		list_del_init(&curr->list);
2648*4882a593Smuzhiyun 		scrub_page_put(curr);
2649*4882a593Smuzhiyun 	}
2650*4882a593Smuzhiyun 
2651*4882a593Smuzhiyun 	kfree(sparity);
2652*4882a593Smuzhiyun }
2653*4882a593Smuzhiyun 
scrub_parity_bio_endio_worker(struct btrfs_work * work)2654*4882a593Smuzhiyun static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
2655*4882a593Smuzhiyun {
2656*4882a593Smuzhiyun 	struct scrub_parity *sparity = container_of(work, struct scrub_parity,
2657*4882a593Smuzhiyun 						    work);
2658*4882a593Smuzhiyun 	struct scrub_ctx *sctx = sparity->sctx;
2659*4882a593Smuzhiyun 
2660*4882a593Smuzhiyun 	scrub_free_parity(sparity);
2661*4882a593Smuzhiyun 	scrub_pending_bio_dec(sctx);
2662*4882a593Smuzhiyun }
2663*4882a593Smuzhiyun 
scrub_parity_bio_endio(struct bio * bio)2664*4882a593Smuzhiyun static void scrub_parity_bio_endio(struct bio *bio)
2665*4882a593Smuzhiyun {
2666*4882a593Smuzhiyun 	struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
2667*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
2668*4882a593Smuzhiyun 
2669*4882a593Smuzhiyun 	if (bio->bi_status)
2670*4882a593Smuzhiyun 		bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2671*4882a593Smuzhiyun 			  sparity->nsectors);
2672*4882a593Smuzhiyun 
2673*4882a593Smuzhiyun 	bio_put(bio);
2674*4882a593Smuzhiyun 
2675*4882a593Smuzhiyun 	btrfs_init_work(&sparity->work, scrub_parity_bio_endio_worker, NULL,
2676*4882a593Smuzhiyun 			NULL);
2677*4882a593Smuzhiyun 	btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work);
2678*4882a593Smuzhiyun }
2679*4882a593Smuzhiyun 
scrub_parity_check_and_repair(struct scrub_parity * sparity)2680*4882a593Smuzhiyun static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2681*4882a593Smuzhiyun {
2682*4882a593Smuzhiyun 	struct scrub_ctx *sctx = sparity->sctx;
2683*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = sctx->fs_info;
2684*4882a593Smuzhiyun 	struct bio *bio;
2685*4882a593Smuzhiyun 	struct btrfs_raid_bio *rbio;
2686*4882a593Smuzhiyun 	struct btrfs_bio *bbio = NULL;
2687*4882a593Smuzhiyun 	u64 length;
2688*4882a593Smuzhiyun 	int ret;
2689*4882a593Smuzhiyun 
2690*4882a593Smuzhiyun 	if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
2691*4882a593Smuzhiyun 			   sparity->nsectors))
2692*4882a593Smuzhiyun 		goto out;
2693*4882a593Smuzhiyun 
2694*4882a593Smuzhiyun 	length = sparity->logic_end - sparity->logic_start;
2695*4882a593Smuzhiyun 
2696*4882a593Smuzhiyun 	btrfs_bio_counter_inc_blocked(fs_info);
2697*4882a593Smuzhiyun 	ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
2698*4882a593Smuzhiyun 			       &length, &bbio);
2699*4882a593Smuzhiyun 	if (ret || !bbio || !bbio->raid_map)
2700*4882a593Smuzhiyun 		goto bbio_out;
2701*4882a593Smuzhiyun 
2702*4882a593Smuzhiyun 	bio = btrfs_io_bio_alloc(0);
2703*4882a593Smuzhiyun 	bio->bi_iter.bi_sector = sparity->logic_start >> 9;
2704*4882a593Smuzhiyun 	bio->bi_private = sparity;
2705*4882a593Smuzhiyun 	bio->bi_end_io = scrub_parity_bio_endio;
2706*4882a593Smuzhiyun 
2707*4882a593Smuzhiyun 	rbio = raid56_parity_alloc_scrub_rbio(fs_info, bio, bbio,
2708*4882a593Smuzhiyun 					      length, sparity->scrub_dev,
2709*4882a593Smuzhiyun 					      sparity->dbitmap,
2710*4882a593Smuzhiyun 					      sparity->nsectors);
2711*4882a593Smuzhiyun 	if (!rbio)
2712*4882a593Smuzhiyun 		goto rbio_out;
2713*4882a593Smuzhiyun 
2714*4882a593Smuzhiyun 	scrub_pending_bio_inc(sctx);
2715*4882a593Smuzhiyun 	raid56_parity_submit_scrub_rbio(rbio);
2716*4882a593Smuzhiyun 	return;
2717*4882a593Smuzhiyun 
2718*4882a593Smuzhiyun rbio_out:
2719*4882a593Smuzhiyun 	bio_put(bio);
2720*4882a593Smuzhiyun bbio_out:
2721*4882a593Smuzhiyun 	btrfs_bio_counter_dec(fs_info);
2722*4882a593Smuzhiyun 	btrfs_put_bbio(bbio);
2723*4882a593Smuzhiyun 	bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2724*4882a593Smuzhiyun 		  sparity->nsectors);
2725*4882a593Smuzhiyun 	spin_lock(&sctx->stat_lock);
2726*4882a593Smuzhiyun 	sctx->stat.malloc_errors++;
2727*4882a593Smuzhiyun 	spin_unlock(&sctx->stat_lock);
2728*4882a593Smuzhiyun out:
2729*4882a593Smuzhiyun 	scrub_free_parity(sparity);
2730*4882a593Smuzhiyun }
2731*4882a593Smuzhiyun 
scrub_calc_parity_bitmap_len(int nsectors)2732*4882a593Smuzhiyun static inline int scrub_calc_parity_bitmap_len(int nsectors)
2733*4882a593Smuzhiyun {
2734*4882a593Smuzhiyun 	return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long);
2735*4882a593Smuzhiyun }
2736*4882a593Smuzhiyun 
scrub_parity_get(struct scrub_parity * sparity)2737*4882a593Smuzhiyun static void scrub_parity_get(struct scrub_parity *sparity)
2738*4882a593Smuzhiyun {
2739*4882a593Smuzhiyun 	refcount_inc(&sparity->refs);
2740*4882a593Smuzhiyun }
2741*4882a593Smuzhiyun 
scrub_parity_put(struct scrub_parity * sparity)2742*4882a593Smuzhiyun static void scrub_parity_put(struct scrub_parity *sparity)
2743*4882a593Smuzhiyun {
2744*4882a593Smuzhiyun 	if (!refcount_dec_and_test(&sparity->refs))
2745*4882a593Smuzhiyun 		return;
2746*4882a593Smuzhiyun 
2747*4882a593Smuzhiyun 	scrub_parity_check_and_repair(sparity);
2748*4882a593Smuzhiyun }
2749*4882a593Smuzhiyun 
scrub_raid56_parity(struct scrub_ctx * sctx,struct map_lookup * map,struct btrfs_device * sdev,struct btrfs_path * path,u64 logic_start,u64 logic_end)2750*4882a593Smuzhiyun static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
2751*4882a593Smuzhiyun 						  struct map_lookup *map,
2752*4882a593Smuzhiyun 						  struct btrfs_device *sdev,
2753*4882a593Smuzhiyun 						  struct btrfs_path *path,
2754*4882a593Smuzhiyun 						  u64 logic_start,
2755*4882a593Smuzhiyun 						  u64 logic_end)
2756*4882a593Smuzhiyun {
2757*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = sctx->fs_info;
2758*4882a593Smuzhiyun 	struct btrfs_root *root = fs_info->extent_root;
2759*4882a593Smuzhiyun 	struct btrfs_root *csum_root = fs_info->csum_root;
2760*4882a593Smuzhiyun 	struct btrfs_extent_item *extent;
2761*4882a593Smuzhiyun 	struct btrfs_bio *bbio = NULL;
2762*4882a593Smuzhiyun 	u64 flags;
2763*4882a593Smuzhiyun 	int ret;
2764*4882a593Smuzhiyun 	int slot;
2765*4882a593Smuzhiyun 	struct extent_buffer *l;
2766*4882a593Smuzhiyun 	struct btrfs_key key;
2767*4882a593Smuzhiyun 	u64 generation;
2768*4882a593Smuzhiyun 	u64 extent_logical;
2769*4882a593Smuzhiyun 	u64 extent_physical;
2770*4882a593Smuzhiyun 	u64 extent_len;
2771*4882a593Smuzhiyun 	u64 mapped_length;
2772*4882a593Smuzhiyun 	struct btrfs_device *extent_dev;
2773*4882a593Smuzhiyun 	struct scrub_parity *sparity;
2774*4882a593Smuzhiyun 	int nsectors;
2775*4882a593Smuzhiyun 	int bitmap_len;
2776*4882a593Smuzhiyun 	int extent_mirror_num;
2777*4882a593Smuzhiyun 	int stop_loop = 0;
2778*4882a593Smuzhiyun 
2779*4882a593Smuzhiyun 	nsectors = div_u64(map->stripe_len, fs_info->sectorsize);
2780*4882a593Smuzhiyun 	bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
2781*4882a593Smuzhiyun 	sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
2782*4882a593Smuzhiyun 			  GFP_NOFS);
2783*4882a593Smuzhiyun 	if (!sparity) {
2784*4882a593Smuzhiyun 		spin_lock(&sctx->stat_lock);
2785*4882a593Smuzhiyun 		sctx->stat.malloc_errors++;
2786*4882a593Smuzhiyun 		spin_unlock(&sctx->stat_lock);
2787*4882a593Smuzhiyun 		return -ENOMEM;
2788*4882a593Smuzhiyun 	}
2789*4882a593Smuzhiyun 
2790*4882a593Smuzhiyun 	sparity->stripe_len = map->stripe_len;
2791*4882a593Smuzhiyun 	sparity->nsectors = nsectors;
2792*4882a593Smuzhiyun 	sparity->sctx = sctx;
2793*4882a593Smuzhiyun 	sparity->scrub_dev = sdev;
2794*4882a593Smuzhiyun 	sparity->logic_start = logic_start;
2795*4882a593Smuzhiyun 	sparity->logic_end = logic_end;
2796*4882a593Smuzhiyun 	refcount_set(&sparity->refs, 1);
2797*4882a593Smuzhiyun 	INIT_LIST_HEAD(&sparity->spages);
2798*4882a593Smuzhiyun 	sparity->dbitmap = sparity->bitmap;
2799*4882a593Smuzhiyun 	sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
2800*4882a593Smuzhiyun 
2801*4882a593Smuzhiyun 	ret = 0;
2802*4882a593Smuzhiyun 	while (logic_start < logic_end) {
2803*4882a593Smuzhiyun 		if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2804*4882a593Smuzhiyun 			key.type = BTRFS_METADATA_ITEM_KEY;
2805*4882a593Smuzhiyun 		else
2806*4882a593Smuzhiyun 			key.type = BTRFS_EXTENT_ITEM_KEY;
2807*4882a593Smuzhiyun 		key.objectid = logic_start;
2808*4882a593Smuzhiyun 		key.offset = (u64)-1;
2809*4882a593Smuzhiyun 
2810*4882a593Smuzhiyun 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2811*4882a593Smuzhiyun 		if (ret < 0)
2812*4882a593Smuzhiyun 			goto out;
2813*4882a593Smuzhiyun 
2814*4882a593Smuzhiyun 		if (ret > 0) {
2815*4882a593Smuzhiyun 			ret = btrfs_previous_extent_item(root, path, 0);
2816*4882a593Smuzhiyun 			if (ret < 0)
2817*4882a593Smuzhiyun 				goto out;
2818*4882a593Smuzhiyun 			if (ret > 0) {
2819*4882a593Smuzhiyun 				btrfs_release_path(path);
2820*4882a593Smuzhiyun 				ret = btrfs_search_slot(NULL, root, &key,
2821*4882a593Smuzhiyun 							path, 0, 0);
2822*4882a593Smuzhiyun 				if (ret < 0)
2823*4882a593Smuzhiyun 					goto out;
2824*4882a593Smuzhiyun 			}
2825*4882a593Smuzhiyun 		}
2826*4882a593Smuzhiyun 
2827*4882a593Smuzhiyun 		stop_loop = 0;
2828*4882a593Smuzhiyun 		while (1) {
2829*4882a593Smuzhiyun 			u64 bytes;
2830*4882a593Smuzhiyun 
2831*4882a593Smuzhiyun 			l = path->nodes[0];
2832*4882a593Smuzhiyun 			slot = path->slots[0];
2833*4882a593Smuzhiyun 			if (slot >= btrfs_header_nritems(l)) {
2834*4882a593Smuzhiyun 				ret = btrfs_next_leaf(root, path);
2835*4882a593Smuzhiyun 				if (ret == 0)
2836*4882a593Smuzhiyun 					continue;
2837*4882a593Smuzhiyun 				if (ret < 0)
2838*4882a593Smuzhiyun 					goto out;
2839*4882a593Smuzhiyun 
2840*4882a593Smuzhiyun 				stop_loop = 1;
2841*4882a593Smuzhiyun 				break;
2842*4882a593Smuzhiyun 			}
2843*4882a593Smuzhiyun 			btrfs_item_key_to_cpu(l, &key, slot);
2844*4882a593Smuzhiyun 
2845*4882a593Smuzhiyun 			if (key.type != BTRFS_EXTENT_ITEM_KEY &&
2846*4882a593Smuzhiyun 			    key.type != BTRFS_METADATA_ITEM_KEY)
2847*4882a593Smuzhiyun 				goto next;
2848*4882a593Smuzhiyun 
2849*4882a593Smuzhiyun 			if (key.type == BTRFS_METADATA_ITEM_KEY)
2850*4882a593Smuzhiyun 				bytes = fs_info->nodesize;
2851*4882a593Smuzhiyun 			else
2852*4882a593Smuzhiyun 				bytes = key.offset;
2853*4882a593Smuzhiyun 
2854*4882a593Smuzhiyun 			if (key.objectid + bytes <= logic_start)
2855*4882a593Smuzhiyun 				goto next;
2856*4882a593Smuzhiyun 
2857*4882a593Smuzhiyun 			if (key.objectid >= logic_end) {
2858*4882a593Smuzhiyun 				stop_loop = 1;
2859*4882a593Smuzhiyun 				break;
2860*4882a593Smuzhiyun 			}
2861*4882a593Smuzhiyun 
2862*4882a593Smuzhiyun 			while (key.objectid >= logic_start + map->stripe_len)
2863*4882a593Smuzhiyun 				logic_start += map->stripe_len;
2864*4882a593Smuzhiyun 
2865*4882a593Smuzhiyun 			extent = btrfs_item_ptr(l, slot,
2866*4882a593Smuzhiyun 						struct btrfs_extent_item);
2867*4882a593Smuzhiyun 			flags = btrfs_extent_flags(l, extent);
2868*4882a593Smuzhiyun 			generation = btrfs_extent_generation(l, extent);
2869*4882a593Smuzhiyun 
2870*4882a593Smuzhiyun 			if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
2871*4882a593Smuzhiyun 			    (key.objectid < logic_start ||
2872*4882a593Smuzhiyun 			     key.objectid + bytes >
2873*4882a593Smuzhiyun 			     logic_start + map->stripe_len)) {
2874*4882a593Smuzhiyun 				btrfs_err(fs_info,
2875*4882a593Smuzhiyun 					  "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
2876*4882a593Smuzhiyun 					  key.objectid, logic_start);
2877*4882a593Smuzhiyun 				spin_lock(&sctx->stat_lock);
2878*4882a593Smuzhiyun 				sctx->stat.uncorrectable_errors++;
2879*4882a593Smuzhiyun 				spin_unlock(&sctx->stat_lock);
2880*4882a593Smuzhiyun 				goto next;
2881*4882a593Smuzhiyun 			}
2882*4882a593Smuzhiyun again:
2883*4882a593Smuzhiyun 			extent_logical = key.objectid;
2884*4882a593Smuzhiyun 			extent_len = bytes;
2885*4882a593Smuzhiyun 
2886*4882a593Smuzhiyun 			if (extent_logical < logic_start) {
2887*4882a593Smuzhiyun 				extent_len -= logic_start - extent_logical;
2888*4882a593Smuzhiyun 				extent_logical = logic_start;
2889*4882a593Smuzhiyun 			}
2890*4882a593Smuzhiyun 
2891*4882a593Smuzhiyun 			if (extent_logical + extent_len >
2892*4882a593Smuzhiyun 			    logic_start + map->stripe_len)
2893*4882a593Smuzhiyun 				extent_len = logic_start + map->stripe_len -
2894*4882a593Smuzhiyun 					     extent_logical;
2895*4882a593Smuzhiyun 
2896*4882a593Smuzhiyun 			scrub_parity_mark_sectors_data(sparity, extent_logical,
2897*4882a593Smuzhiyun 						       extent_len);
2898*4882a593Smuzhiyun 
2899*4882a593Smuzhiyun 			mapped_length = extent_len;
2900*4882a593Smuzhiyun 			bbio = NULL;
2901*4882a593Smuzhiyun 			ret = btrfs_map_block(fs_info, BTRFS_MAP_READ,
2902*4882a593Smuzhiyun 					extent_logical, &mapped_length, &bbio,
2903*4882a593Smuzhiyun 					0);
2904*4882a593Smuzhiyun 			if (!ret) {
2905*4882a593Smuzhiyun 				if (!bbio || mapped_length < extent_len)
2906*4882a593Smuzhiyun 					ret = -EIO;
2907*4882a593Smuzhiyun 			}
2908*4882a593Smuzhiyun 			if (ret) {
2909*4882a593Smuzhiyun 				btrfs_put_bbio(bbio);
2910*4882a593Smuzhiyun 				goto out;
2911*4882a593Smuzhiyun 			}
2912*4882a593Smuzhiyun 			extent_physical = bbio->stripes[0].physical;
2913*4882a593Smuzhiyun 			extent_mirror_num = bbio->mirror_num;
2914*4882a593Smuzhiyun 			extent_dev = bbio->stripes[0].dev;
2915*4882a593Smuzhiyun 			btrfs_put_bbio(bbio);
2916*4882a593Smuzhiyun 
2917*4882a593Smuzhiyun 			ret = btrfs_lookup_csums_range(csum_root,
2918*4882a593Smuzhiyun 						extent_logical,
2919*4882a593Smuzhiyun 						extent_logical + extent_len - 1,
2920*4882a593Smuzhiyun 						&sctx->csum_list, 1);
2921*4882a593Smuzhiyun 			if (ret)
2922*4882a593Smuzhiyun 				goto out;
2923*4882a593Smuzhiyun 
2924*4882a593Smuzhiyun 			ret = scrub_extent_for_parity(sparity, extent_logical,
2925*4882a593Smuzhiyun 						      extent_len,
2926*4882a593Smuzhiyun 						      extent_physical,
2927*4882a593Smuzhiyun 						      extent_dev, flags,
2928*4882a593Smuzhiyun 						      generation,
2929*4882a593Smuzhiyun 						      extent_mirror_num);
2930*4882a593Smuzhiyun 
2931*4882a593Smuzhiyun 			scrub_free_csums(sctx);
2932*4882a593Smuzhiyun 
2933*4882a593Smuzhiyun 			if (ret)
2934*4882a593Smuzhiyun 				goto out;
2935*4882a593Smuzhiyun 
2936*4882a593Smuzhiyun 			if (extent_logical + extent_len <
2937*4882a593Smuzhiyun 			    key.objectid + bytes) {
2938*4882a593Smuzhiyun 				logic_start += map->stripe_len;
2939*4882a593Smuzhiyun 
2940*4882a593Smuzhiyun 				if (logic_start >= logic_end) {
2941*4882a593Smuzhiyun 					stop_loop = 1;
2942*4882a593Smuzhiyun 					break;
2943*4882a593Smuzhiyun 				}
2944*4882a593Smuzhiyun 
2945*4882a593Smuzhiyun 				if (logic_start < key.objectid + bytes) {
2946*4882a593Smuzhiyun 					cond_resched();
2947*4882a593Smuzhiyun 					goto again;
2948*4882a593Smuzhiyun 				}
2949*4882a593Smuzhiyun 			}
2950*4882a593Smuzhiyun next:
2951*4882a593Smuzhiyun 			path->slots[0]++;
2952*4882a593Smuzhiyun 		}
2953*4882a593Smuzhiyun 
2954*4882a593Smuzhiyun 		btrfs_release_path(path);
2955*4882a593Smuzhiyun 
2956*4882a593Smuzhiyun 		if (stop_loop)
2957*4882a593Smuzhiyun 			break;
2958*4882a593Smuzhiyun 
2959*4882a593Smuzhiyun 		logic_start += map->stripe_len;
2960*4882a593Smuzhiyun 	}
2961*4882a593Smuzhiyun out:
2962*4882a593Smuzhiyun 	if (ret < 0)
2963*4882a593Smuzhiyun 		scrub_parity_mark_sectors_error(sparity, logic_start,
2964*4882a593Smuzhiyun 						logic_end - logic_start);
2965*4882a593Smuzhiyun 	scrub_parity_put(sparity);
2966*4882a593Smuzhiyun 	scrub_submit(sctx);
2967*4882a593Smuzhiyun 	mutex_lock(&sctx->wr_lock);
2968*4882a593Smuzhiyun 	scrub_wr_submit(sctx);
2969*4882a593Smuzhiyun 	mutex_unlock(&sctx->wr_lock);
2970*4882a593Smuzhiyun 
2971*4882a593Smuzhiyun 	btrfs_release_path(path);
2972*4882a593Smuzhiyun 	return ret < 0 ? ret : 0;
2973*4882a593Smuzhiyun }
2974*4882a593Smuzhiyun 
scrub_stripe(struct scrub_ctx * sctx,struct map_lookup * map,struct btrfs_device * scrub_dev,int num,u64 base,u64 length,struct btrfs_block_group * cache)2975*4882a593Smuzhiyun static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2976*4882a593Smuzhiyun 					   struct map_lookup *map,
2977*4882a593Smuzhiyun 					   struct btrfs_device *scrub_dev,
2978*4882a593Smuzhiyun 					   int num, u64 base, u64 length,
2979*4882a593Smuzhiyun 					   struct btrfs_block_group *cache)
2980*4882a593Smuzhiyun {
2981*4882a593Smuzhiyun 	struct btrfs_path *path, *ppath;
2982*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = sctx->fs_info;
2983*4882a593Smuzhiyun 	struct btrfs_root *root = fs_info->extent_root;
2984*4882a593Smuzhiyun 	struct btrfs_root *csum_root = fs_info->csum_root;
2985*4882a593Smuzhiyun 	struct btrfs_extent_item *extent;
2986*4882a593Smuzhiyun 	struct blk_plug plug;
2987*4882a593Smuzhiyun 	u64 flags;
2988*4882a593Smuzhiyun 	int ret;
2989*4882a593Smuzhiyun 	int slot;
2990*4882a593Smuzhiyun 	u64 nstripes;
2991*4882a593Smuzhiyun 	struct extent_buffer *l;
2992*4882a593Smuzhiyun 	u64 physical;
2993*4882a593Smuzhiyun 	u64 logical;
2994*4882a593Smuzhiyun 	u64 logic_end;
2995*4882a593Smuzhiyun 	u64 physical_end;
2996*4882a593Smuzhiyun 	u64 generation;
2997*4882a593Smuzhiyun 	int mirror_num;
2998*4882a593Smuzhiyun 	struct reada_control *reada1;
2999*4882a593Smuzhiyun 	struct reada_control *reada2;
3000*4882a593Smuzhiyun 	struct btrfs_key key;
3001*4882a593Smuzhiyun 	struct btrfs_key key_end;
3002*4882a593Smuzhiyun 	u64 increment = map->stripe_len;
3003*4882a593Smuzhiyun 	u64 offset;
3004*4882a593Smuzhiyun 	u64 extent_logical;
3005*4882a593Smuzhiyun 	u64 extent_physical;
3006*4882a593Smuzhiyun 	u64 extent_len;
3007*4882a593Smuzhiyun 	u64 stripe_logical;
3008*4882a593Smuzhiyun 	u64 stripe_end;
3009*4882a593Smuzhiyun 	struct btrfs_device *extent_dev;
3010*4882a593Smuzhiyun 	int extent_mirror_num;
3011*4882a593Smuzhiyun 	int stop_loop = 0;
3012*4882a593Smuzhiyun 
3013*4882a593Smuzhiyun 	physical = map->stripes[num].physical;
3014*4882a593Smuzhiyun 	offset = 0;
3015*4882a593Smuzhiyun 	nstripes = div64_u64(length, map->stripe_len);
3016*4882a593Smuzhiyun 	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3017*4882a593Smuzhiyun 		offset = map->stripe_len * num;
3018*4882a593Smuzhiyun 		increment = map->stripe_len * map->num_stripes;
3019*4882a593Smuzhiyun 		mirror_num = 1;
3020*4882a593Smuzhiyun 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3021*4882a593Smuzhiyun 		int factor = map->num_stripes / map->sub_stripes;
3022*4882a593Smuzhiyun 		offset = map->stripe_len * (num / map->sub_stripes);
3023*4882a593Smuzhiyun 		increment = map->stripe_len * factor;
3024*4882a593Smuzhiyun 		mirror_num = num % map->sub_stripes + 1;
3025*4882a593Smuzhiyun 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
3026*4882a593Smuzhiyun 		increment = map->stripe_len;
3027*4882a593Smuzhiyun 		mirror_num = num % map->num_stripes + 1;
3028*4882a593Smuzhiyun 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3029*4882a593Smuzhiyun 		increment = map->stripe_len;
3030*4882a593Smuzhiyun 		mirror_num = num % map->num_stripes + 1;
3031*4882a593Smuzhiyun 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3032*4882a593Smuzhiyun 		get_raid56_logic_offset(physical, num, map, &offset, NULL);
3033*4882a593Smuzhiyun 		increment = map->stripe_len * nr_data_stripes(map);
3034*4882a593Smuzhiyun 		mirror_num = 1;
3035*4882a593Smuzhiyun 	} else {
3036*4882a593Smuzhiyun 		increment = map->stripe_len;
3037*4882a593Smuzhiyun 		mirror_num = 1;
3038*4882a593Smuzhiyun 	}
3039*4882a593Smuzhiyun 
3040*4882a593Smuzhiyun 	path = btrfs_alloc_path();
3041*4882a593Smuzhiyun 	if (!path)
3042*4882a593Smuzhiyun 		return -ENOMEM;
3043*4882a593Smuzhiyun 
3044*4882a593Smuzhiyun 	ppath = btrfs_alloc_path();
3045*4882a593Smuzhiyun 	if (!ppath) {
3046*4882a593Smuzhiyun 		btrfs_free_path(path);
3047*4882a593Smuzhiyun 		return -ENOMEM;
3048*4882a593Smuzhiyun 	}
3049*4882a593Smuzhiyun 
3050*4882a593Smuzhiyun 	/*
3051*4882a593Smuzhiyun 	 * work on commit root. The related disk blocks are static as
3052*4882a593Smuzhiyun 	 * long as COW is applied. This means, it is save to rewrite
3053*4882a593Smuzhiyun 	 * them to repair disk errors without any race conditions
3054*4882a593Smuzhiyun 	 */
3055*4882a593Smuzhiyun 	path->search_commit_root = 1;
3056*4882a593Smuzhiyun 	path->skip_locking = 1;
3057*4882a593Smuzhiyun 
3058*4882a593Smuzhiyun 	ppath->search_commit_root = 1;
3059*4882a593Smuzhiyun 	ppath->skip_locking = 1;
3060*4882a593Smuzhiyun 	/*
3061*4882a593Smuzhiyun 	 * trigger the readahead for extent tree csum tree and wait for
3062*4882a593Smuzhiyun 	 * completion. During readahead, the scrub is officially paused
3063*4882a593Smuzhiyun 	 * to not hold off transaction commits
3064*4882a593Smuzhiyun 	 */
3065*4882a593Smuzhiyun 	logical = base + offset;
3066*4882a593Smuzhiyun 	physical_end = physical + nstripes * map->stripe_len;
3067*4882a593Smuzhiyun 	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3068*4882a593Smuzhiyun 		get_raid56_logic_offset(physical_end, num,
3069*4882a593Smuzhiyun 					map, &logic_end, NULL);
3070*4882a593Smuzhiyun 		logic_end += base;
3071*4882a593Smuzhiyun 	} else {
3072*4882a593Smuzhiyun 		logic_end = logical + increment * nstripes;
3073*4882a593Smuzhiyun 	}
3074*4882a593Smuzhiyun 	wait_event(sctx->list_wait,
3075*4882a593Smuzhiyun 		   atomic_read(&sctx->bios_in_flight) == 0);
3076*4882a593Smuzhiyun 	scrub_blocked_if_needed(fs_info);
3077*4882a593Smuzhiyun 
3078*4882a593Smuzhiyun 	/* FIXME it might be better to start readahead at commit root */
3079*4882a593Smuzhiyun 	key.objectid = logical;
3080*4882a593Smuzhiyun 	key.type = BTRFS_EXTENT_ITEM_KEY;
3081*4882a593Smuzhiyun 	key.offset = (u64)0;
3082*4882a593Smuzhiyun 	key_end.objectid = logic_end;
3083*4882a593Smuzhiyun 	key_end.type = BTRFS_METADATA_ITEM_KEY;
3084*4882a593Smuzhiyun 	key_end.offset = (u64)-1;
3085*4882a593Smuzhiyun 	reada1 = btrfs_reada_add(root, &key, &key_end);
3086*4882a593Smuzhiyun 
3087*4882a593Smuzhiyun 	key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3088*4882a593Smuzhiyun 	key.type = BTRFS_EXTENT_CSUM_KEY;
3089*4882a593Smuzhiyun 	key.offset = logical;
3090*4882a593Smuzhiyun 	key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3091*4882a593Smuzhiyun 	key_end.type = BTRFS_EXTENT_CSUM_KEY;
3092*4882a593Smuzhiyun 	key_end.offset = logic_end;
3093*4882a593Smuzhiyun 	reada2 = btrfs_reada_add(csum_root, &key, &key_end);
3094*4882a593Smuzhiyun 
3095*4882a593Smuzhiyun 	if (!IS_ERR(reada1))
3096*4882a593Smuzhiyun 		btrfs_reada_wait(reada1);
3097*4882a593Smuzhiyun 	if (!IS_ERR(reada2))
3098*4882a593Smuzhiyun 		btrfs_reada_wait(reada2);
3099*4882a593Smuzhiyun 
3100*4882a593Smuzhiyun 
3101*4882a593Smuzhiyun 	/*
3102*4882a593Smuzhiyun 	 * collect all data csums for the stripe to avoid seeking during
3103*4882a593Smuzhiyun 	 * the scrub. This might currently (crc32) end up to be about 1MB
3104*4882a593Smuzhiyun 	 */
3105*4882a593Smuzhiyun 	blk_start_plug(&plug);
3106*4882a593Smuzhiyun 
3107*4882a593Smuzhiyun 	/*
3108*4882a593Smuzhiyun 	 * now find all extents for each stripe and scrub them
3109*4882a593Smuzhiyun 	 */
3110*4882a593Smuzhiyun 	ret = 0;
3111*4882a593Smuzhiyun 	while (physical < physical_end) {
3112*4882a593Smuzhiyun 		/*
3113*4882a593Smuzhiyun 		 * canceled?
3114*4882a593Smuzhiyun 		 */
3115*4882a593Smuzhiyun 		if (atomic_read(&fs_info->scrub_cancel_req) ||
3116*4882a593Smuzhiyun 		    atomic_read(&sctx->cancel_req)) {
3117*4882a593Smuzhiyun 			ret = -ECANCELED;
3118*4882a593Smuzhiyun 			goto out;
3119*4882a593Smuzhiyun 		}
3120*4882a593Smuzhiyun 		/*
3121*4882a593Smuzhiyun 		 * check to see if we have to pause
3122*4882a593Smuzhiyun 		 */
3123*4882a593Smuzhiyun 		if (atomic_read(&fs_info->scrub_pause_req)) {
3124*4882a593Smuzhiyun 			/* push queued extents */
3125*4882a593Smuzhiyun 			sctx->flush_all_writes = true;
3126*4882a593Smuzhiyun 			scrub_submit(sctx);
3127*4882a593Smuzhiyun 			mutex_lock(&sctx->wr_lock);
3128*4882a593Smuzhiyun 			scrub_wr_submit(sctx);
3129*4882a593Smuzhiyun 			mutex_unlock(&sctx->wr_lock);
3130*4882a593Smuzhiyun 			wait_event(sctx->list_wait,
3131*4882a593Smuzhiyun 				   atomic_read(&sctx->bios_in_flight) == 0);
3132*4882a593Smuzhiyun 			sctx->flush_all_writes = false;
3133*4882a593Smuzhiyun 			scrub_blocked_if_needed(fs_info);
3134*4882a593Smuzhiyun 		}
3135*4882a593Smuzhiyun 
3136*4882a593Smuzhiyun 		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3137*4882a593Smuzhiyun 			ret = get_raid56_logic_offset(physical, num, map,
3138*4882a593Smuzhiyun 						      &logical,
3139*4882a593Smuzhiyun 						      &stripe_logical);
3140*4882a593Smuzhiyun 			logical += base;
3141*4882a593Smuzhiyun 			if (ret) {
3142*4882a593Smuzhiyun 				/* it is parity strip */
3143*4882a593Smuzhiyun 				stripe_logical += base;
3144*4882a593Smuzhiyun 				stripe_end = stripe_logical + increment;
3145*4882a593Smuzhiyun 				ret = scrub_raid56_parity(sctx, map, scrub_dev,
3146*4882a593Smuzhiyun 							  ppath, stripe_logical,
3147*4882a593Smuzhiyun 							  stripe_end);
3148*4882a593Smuzhiyun 				if (ret)
3149*4882a593Smuzhiyun 					goto out;
3150*4882a593Smuzhiyun 				goto skip;
3151*4882a593Smuzhiyun 			}
3152*4882a593Smuzhiyun 		}
3153*4882a593Smuzhiyun 
3154*4882a593Smuzhiyun 		if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3155*4882a593Smuzhiyun 			key.type = BTRFS_METADATA_ITEM_KEY;
3156*4882a593Smuzhiyun 		else
3157*4882a593Smuzhiyun 			key.type = BTRFS_EXTENT_ITEM_KEY;
3158*4882a593Smuzhiyun 		key.objectid = logical;
3159*4882a593Smuzhiyun 		key.offset = (u64)-1;
3160*4882a593Smuzhiyun 
3161*4882a593Smuzhiyun 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3162*4882a593Smuzhiyun 		if (ret < 0)
3163*4882a593Smuzhiyun 			goto out;
3164*4882a593Smuzhiyun 
3165*4882a593Smuzhiyun 		if (ret > 0) {
3166*4882a593Smuzhiyun 			ret = btrfs_previous_extent_item(root, path, 0);
3167*4882a593Smuzhiyun 			if (ret < 0)
3168*4882a593Smuzhiyun 				goto out;
3169*4882a593Smuzhiyun 			if (ret > 0) {
3170*4882a593Smuzhiyun 				/* there's no smaller item, so stick with the
3171*4882a593Smuzhiyun 				 * larger one */
3172*4882a593Smuzhiyun 				btrfs_release_path(path);
3173*4882a593Smuzhiyun 				ret = btrfs_search_slot(NULL, root, &key,
3174*4882a593Smuzhiyun 							path, 0, 0);
3175*4882a593Smuzhiyun 				if (ret < 0)
3176*4882a593Smuzhiyun 					goto out;
3177*4882a593Smuzhiyun 			}
3178*4882a593Smuzhiyun 		}
3179*4882a593Smuzhiyun 
3180*4882a593Smuzhiyun 		stop_loop = 0;
3181*4882a593Smuzhiyun 		while (1) {
3182*4882a593Smuzhiyun 			u64 bytes;
3183*4882a593Smuzhiyun 
3184*4882a593Smuzhiyun 			l = path->nodes[0];
3185*4882a593Smuzhiyun 			slot = path->slots[0];
3186*4882a593Smuzhiyun 			if (slot >= btrfs_header_nritems(l)) {
3187*4882a593Smuzhiyun 				ret = btrfs_next_leaf(root, path);
3188*4882a593Smuzhiyun 				if (ret == 0)
3189*4882a593Smuzhiyun 					continue;
3190*4882a593Smuzhiyun 				if (ret < 0)
3191*4882a593Smuzhiyun 					goto out;
3192*4882a593Smuzhiyun 
3193*4882a593Smuzhiyun 				stop_loop = 1;
3194*4882a593Smuzhiyun 				break;
3195*4882a593Smuzhiyun 			}
3196*4882a593Smuzhiyun 			btrfs_item_key_to_cpu(l, &key, slot);
3197*4882a593Smuzhiyun 
3198*4882a593Smuzhiyun 			if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3199*4882a593Smuzhiyun 			    key.type != BTRFS_METADATA_ITEM_KEY)
3200*4882a593Smuzhiyun 				goto next;
3201*4882a593Smuzhiyun 
3202*4882a593Smuzhiyun 			if (key.type == BTRFS_METADATA_ITEM_KEY)
3203*4882a593Smuzhiyun 				bytes = fs_info->nodesize;
3204*4882a593Smuzhiyun 			else
3205*4882a593Smuzhiyun 				bytes = key.offset;
3206*4882a593Smuzhiyun 
3207*4882a593Smuzhiyun 			if (key.objectid + bytes <= logical)
3208*4882a593Smuzhiyun 				goto next;
3209*4882a593Smuzhiyun 
3210*4882a593Smuzhiyun 			if (key.objectid >= logical + map->stripe_len) {
3211*4882a593Smuzhiyun 				/* out of this device extent */
3212*4882a593Smuzhiyun 				if (key.objectid >= logic_end)
3213*4882a593Smuzhiyun 					stop_loop = 1;
3214*4882a593Smuzhiyun 				break;
3215*4882a593Smuzhiyun 			}
3216*4882a593Smuzhiyun 
3217*4882a593Smuzhiyun 			/*
3218*4882a593Smuzhiyun 			 * If our block group was removed in the meanwhile, just
3219*4882a593Smuzhiyun 			 * stop scrubbing since there is no point in continuing.
3220*4882a593Smuzhiyun 			 * Continuing would prevent reusing its device extents
3221*4882a593Smuzhiyun 			 * for new block groups for a long time.
3222*4882a593Smuzhiyun 			 */
3223*4882a593Smuzhiyun 			spin_lock(&cache->lock);
3224*4882a593Smuzhiyun 			if (cache->removed) {
3225*4882a593Smuzhiyun 				spin_unlock(&cache->lock);
3226*4882a593Smuzhiyun 				ret = 0;
3227*4882a593Smuzhiyun 				goto out;
3228*4882a593Smuzhiyun 			}
3229*4882a593Smuzhiyun 			spin_unlock(&cache->lock);
3230*4882a593Smuzhiyun 
3231*4882a593Smuzhiyun 			extent = btrfs_item_ptr(l, slot,
3232*4882a593Smuzhiyun 						struct btrfs_extent_item);
3233*4882a593Smuzhiyun 			flags = btrfs_extent_flags(l, extent);
3234*4882a593Smuzhiyun 			generation = btrfs_extent_generation(l, extent);
3235*4882a593Smuzhiyun 
3236*4882a593Smuzhiyun 			if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3237*4882a593Smuzhiyun 			    (key.objectid < logical ||
3238*4882a593Smuzhiyun 			     key.objectid + bytes >
3239*4882a593Smuzhiyun 			     logical + map->stripe_len)) {
3240*4882a593Smuzhiyun 				btrfs_err(fs_info,
3241*4882a593Smuzhiyun 					   "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3242*4882a593Smuzhiyun 				       key.objectid, logical);
3243*4882a593Smuzhiyun 				spin_lock(&sctx->stat_lock);
3244*4882a593Smuzhiyun 				sctx->stat.uncorrectable_errors++;
3245*4882a593Smuzhiyun 				spin_unlock(&sctx->stat_lock);
3246*4882a593Smuzhiyun 				goto next;
3247*4882a593Smuzhiyun 			}
3248*4882a593Smuzhiyun 
3249*4882a593Smuzhiyun again:
3250*4882a593Smuzhiyun 			extent_logical = key.objectid;
3251*4882a593Smuzhiyun 			extent_len = bytes;
3252*4882a593Smuzhiyun 
3253*4882a593Smuzhiyun 			/*
3254*4882a593Smuzhiyun 			 * trim extent to this stripe
3255*4882a593Smuzhiyun 			 */
3256*4882a593Smuzhiyun 			if (extent_logical < logical) {
3257*4882a593Smuzhiyun 				extent_len -= logical - extent_logical;
3258*4882a593Smuzhiyun 				extent_logical = logical;
3259*4882a593Smuzhiyun 			}
3260*4882a593Smuzhiyun 			if (extent_logical + extent_len >
3261*4882a593Smuzhiyun 			    logical + map->stripe_len) {
3262*4882a593Smuzhiyun 				extent_len = logical + map->stripe_len -
3263*4882a593Smuzhiyun 					     extent_logical;
3264*4882a593Smuzhiyun 			}
3265*4882a593Smuzhiyun 
3266*4882a593Smuzhiyun 			extent_physical = extent_logical - logical + physical;
3267*4882a593Smuzhiyun 			extent_dev = scrub_dev;
3268*4882a593Smuzhiyun 			extent_mirror_num = mirror_num;
3269*4882a593Smuzhiyun 			if (sctx->is_dev_replace)
3270*4882a593Smuzhiyun 				scrub_remap_extent(fs_info, extent_logical,
3271*4882a593Smuzhiyun 						   extent_len, &extent_physical,
3272*4882a593Smuzhiyun 						   &extent_dev,
3273*4882a593Smuzhiyun 						   &extent_mirror_num);
3274*4882a593Smuzhiyun 
3275*4882a593Smuzhiyun 			if (flags & BTRFS_EXTENT_FLAG_DATA) {
3276*4882a593Smuzhiyun 				ret = btrfs_lookup_csums_range(csum_root,
3277*4882a593Smuzhiyun 						extent_logical,
3278*4882a593Smuzhiyun 						extent_logical + extent_len - 1,
3279*4882a593Smuzhiyun 						&sctx->csum_list, 1);
3280*4882a593Smuzhiyun 				if (ret)
3281*4882a593Smuzhiyun 					goto out;
3282*4882a593Smuzhiyun 			}
3283*4882a593Smuzhiyun 
3284*4882a593Smuzhiyun 			ret = scrub_extent(sctx, map, extent_logical, extent_len,
3285*4882a593Smuzhiyun 					   extent_physical, extent_dev, flags,
3286*4882a593Smuzhiyun 					   generation, extent_mirror_num,
3287*4882a593Smuzhiyun 					   extent_logical - logical + physical);
3288*4882a593Smuzhiyun 
3289*4882a593Smuzhiyun 			scrub_free_csums(sctx);
3290*4882a593Smuzhiyun 
3291*4882a593Smuzhiyun 			if (ret)
3292*4882a593Smuzhiyun 				goto out;
3293*4882a593Smuzhiyun 
3294*4882a593Smuzhiyun 			if (extent_logical + extent_len <
3295*4882a593Smuzhiyun 			    key.objectid + bytes) {
3296*4882a593Smuzhiyun 				if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3297*4882a593Smuzhiyun 					/*
3298*4882a593Smuzhiyun 					 * loop until we find next data stripe
3299*4882a593Smuzhiyun 					 * or we have finished all stripes.
3300*4882a593Smuzhiyun 					 */
3301*4882a593Smuzhiyun loop:
3302*4882a593Smuzhiyun 					physical += map->stripe_len;
3303*4882a593Smuzhiyun 					ret = get_raid56_logic_offset(physical,
3304*4882a593Smuzhiyun 							num, map, &logical,
3305*4882a593Smuzhiyun 							&stripe_logical);
3306*4882a593Smuzhiyun 					logical += base;
3307*4882a593Smuzhiyun 
3308*4882a593Smuzhiyun 					if (ret && physical < physical_end) {
3309*4882a593Smuzhiyun 						stripe_logical += base;
3310*4882a593Smuzhiyun 						stripe_end = stripe_logical +
3311*4882a593Smuzhiyun 								increment;
3312*4882a593Smuzhiyun 						ret = scrub_raid56_parity(sctx,
3313*4882a593Smuzhiyun 							map, scrub_dev, ppath,
3314*4882a593Smuzhiyun 							stripe_logical,
3315*4882a593Smuzhiyun 							stripe_end);
3316*4882a593Smuzhiyun 						if (ret)
3317*4882a593Smuzhiyun 							goto out;
3318*4882a593Smuzhiyun 						goto loop;
3319*4882a593Smuzhiyun 					}
3320*4882a593Smuzhiyun 				} else {
3321*4882a593Smuzhiyun 					physical += map->stripe_len;
3322*4882a593Smuzhiyun 					logical += increment;
3323*4882a593Smuzhiyun 				}
3324*4882a593Smuzhiyun 				if (logical < key.objectid + bytes) {
3325*4882a593Smuzhiyun 					cond_resched();
3326*4882a593Smuzhiyun 					goto again;
3327*4882a593Smuzhiyun 				}
3328*4882a593Smuzhiyun 
3329*4882a593Smuzhiyun 				if (physical >= physical_end) {
3330*4882a593Smuzhiyun 					stop_loop = 1;
3331*4882a593Smuzhiyun 					break;
3332*4882a593Smuzhiyun 				}
3333*4882a593Smuzhiyun 			}
3334*4882a593Smuzhiyun next:
3335*4882a593Smuzhiyun 			path->slots[0]++;
3336*4882a593Smuzhiyun 		}
3337*4882a593Smuzhiyun 		btrfs_release_path(path);
3338*4882a593Smuzhiyun skip:
3339*4882a593Smuzhiyun 		logical += increment;
3340*4882a593Smuzhiyun 		physical += map->stripe_len;
3341*4882a593Smuzhiyun 		spin_lock(&sctx->stat_lock);
3342*4882a593Smuzhiyun 		if (stop_loop)
3343*4882a593Smuzhiyun 			sctx->stat.last_physical = map->stripes[num].physical +
3344*4882a593Smuzhiyun 						   length;
3345*4882a593Smuzhiyun 		else
3346*4882a593Smuzhiyun 			sctx->stat.last_physical = physical;
3347*4882a593Smuzhiyun 		spin_unlock(&sctx->stat_lock);
3348*4882a593Smuzhiyun 		if (stop_loop)
3349*4882a593Smuzhiyun 			break;
3350*4882a593Smuzhiyun 	}
3351*4882a593Smuzhiyun out:
3352*4882a593Smuzhiyun 	/* push queued extents */
3353*4882a593Smuzhiyun 	scrub_submit(sctx);
3354*4882a593Smuzhiyun 	mutex_lock(&sctx->wr_lock);
3355*4882a593Smuzhiyun 	scrub_wr_submit(sctx);
3356*4882a593Smuzhiyun 	mutex_unlock(&sctx->wr_lock);
3357*4882a593Smuzhiyun 
3358*4882a593Smuzhiyun 	blk_finish_plug(&plug);
3359*4882a593Smuzhiyun 	btrfs_free_path(path);
3360*4882a593Smuzhiyun 	btrfs_free_path(ppath);
3361*4882a593Smuzhiyun 	return ret < 0 ? ret : 0;
3362*4882a593Smuzhiyun }
3363*4882a593Smuzhiyun 
scrub_chunk(struct scrub_ctx * sctx,struct btrfs_device * scrub_dev,u64 chunk_offset,u64 length,u64 dev_offset,struct btrfs_block_group * cache)3364*4882a593Smuzhiyun static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
3365*4882a593Smuzhiyun 					  struct btrfs_device *scrub_dev,
3366*4882a593Smuzhiyun 					  u64 chunk_offset, u64 length,
3367*4882a593Smuzhiyun 					  u64 dev_offset,
3368*4882a593Smuzhiyun 					  struct btrfs_block_group *cache)
3369*4882a593Smuzhiyun {
3370*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = sctx->fs_info;
3371*4882a593Smuzhiyun 	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
3372*4882a593Smuzhiyun 	struct map_lookup *map;
3373*4882a593Smuzhiyun 	struct extent_map *em;
3374*4882a593Smuzhiyun 	int i;
3375*4882a593Smuzhiyun 	int ret = 0;
3376*4882a593Smuzhiyun 
3377*4882a593Smuzhiyun 	read_lock(&map_tree->lock);
3378*4882a593Smuzhiyun 	em = lookup_extent_mapping(map_tree, chunk_offset, 1);
3379*4882a593Smuzhiyun 	read_unlock(&map_tree->lock);
3380*4882a593Smuzhiyun 
3381*4882a593Smuzhiyun 	if (!em) {
3382*4882a593Smuzhiyun 		/*
3383*4882a593Smuzhiyun 		 * Might have been an unused block group deleted by the cleaner
3384*4882a593Smuzhiyun 		 * kthread or relocation.
3385*4882a593Smuzhiyun 		 */
3386*4882a593Smuzhiyun 		spin_lock(&cache->lock);
3387*4882a593Smuzhiyun 		if (!cache->removed)
3388*4882a593Smuzhiyun 			ret = -EINVAL;
3389*4882a593Smuzhiyun 		spin_unlock(&cache->lock);
3390*4882a593Smuzhiyun 
3391*4882a593Smuzhiyun 		return ret;
3392*4882a593Smuzhiyun 	}
3393*4882a593Smuzhiyun 
3394*4882a593Smuzhiyun 	map = em->map_lookup;
3395*4882a593Smuzhiyun 	if (em->start != chunk_offset)
3396*4882a593Smuzhiyun 		goto out;
3397*4882a593Smuzhiyun 
3398*4882a593Smuzhiyun 	if (em->len < length)
3399*4882a593Smuzhiyun 		goto out;
3400*4882a593Smuzhiyun 
3401*4882a593Smuzhiyun 	for (i = 0; i < map->num_stripes; ++i) {
3402*4882a593Smuzhiyun 		if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
3403*4882a593Smuzhiyun 		    map->stripes[i].physical == dev_offset) {
3404*4882a593Smuzhiyun 			ret = scrub_stripe(sctx, map, scrub_dev, i,
3405*4882a593Smuzhiyun 					   chunk_offset, length, cache);
3406*4882a593Smuzhiyun 			if (ret)
3407*4882a593Smuzhiyun 				goto out;
3408*4882a593Smuzhiyun 		}
3409*4882a593Smuzhiyun 	}
3410*4882a593Smuzhiyun out:
3411*4882a593Smuzhiyun 	free_extent_map(em);
3412*4882a593Smuzhiyun 
3413*4882a593Smuzhiyun 	return ret;
3414*4882a593Smuzhiyun }
3415*4882a593Smuzhiyun 
3416*4882a593Smuzhiyun static noinline_for_stack
scrub_enumerate_chunks(struct scrub_ctx * sctx,struct btrfs_device * scrub_dev,u64 start,u64 end)3417*4882a593Smuzhiyun int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3418*4882a593Smuzhiyun 			   struct btrfs_device *scrub_dev, u64 start, u64 end)
3419*4882a593Smuzhiyun {
3420*4882a593Smuzhiyun 	struct btrfs_dev_extent *dev_extent = NULL;
3421*4882a593Smuzhiyun 	struct btrfs_path *path;
3422*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = sctx->fs_info;
3423*4882a593Smuzhiyun 	struct btrfs_root *root = fs_info->dev_root;
3424*4882a593Smuzhiyun 	u64 length;
3425*4882a593Smuzhiyun 	u64 chunk_offset;
3426*4882a593Smuzhiyun 	int ret = 0;
3427*4882a593Smuzhiyun 	int ro_set;
3428*4882a593Smuzhiyun 	int slot;
3429*4882a593Smuzhiyun 	struct extent_buffer *l;
3430*4882a593Smuzhiyun 	struct btrfs_key key;
3431*4882a593Smuzhiyun 	struct btrfs_key found_key;
3432*4882a593Smuzhiyun 	struct btrfs_block_group *cache;
3433*4882a593Smuzhiyun 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
3434*4882a593Smuzhiyun 
3435*4882a593Smuzhiyun 	path = btrfs_alloc_path();
3436*4882a593Smuzhiyun 	if (!path)
3437*4882a593Smuzhiyun 		return -ENOMEM;
3438*4882a593Smuzhiyun 
3439*4882a593Smuzhiyun 	path->reada = READA_FORWARD;
3440*4882a593Smuzhiyun 	path->search_commit_root = 1;
3441*4882a593Smuzhiyun 	path->skip_locking = 1;
3442*4882a593Smuzhiyun 
3443*4882a593Smuzhiyun 	key.objectid = scrub_dev->devid;
3444*4882a593Smuzhiyun 	key.offset = 0ull;
3445*4882a593Smuzhiyun 	key.type = BTRFS_DEV_EXTENT_KEY;
3446*4882a593Smuzhiyun 
3447*4882a593Smuzhiyun 	while (1) {
3448*4882a593Smuzhiyun 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3449*4882a593Smuzhiyun 		if (ret < 0)
3450*4882a593Smuzhiyun 			break;
3451*4882a593Smuzhiyun 		if (ret > 0) {
3452*4882a593Smuzhiyun 			if (path->slots[0] >=
3453*4882a593Smuzhiyun 			    btrfs_header_nritems(path->nodes[0])) {
3454*4882a593Smuzhiyun 				ret = btrfs_next_leaf(root, path);
3455*4882a593Smuzhiyun 				if (ret < 0)
3456*4882a593Smuzhiyun 					break;
3457*4882a593Smuzhiyun 				if (ret > 0) {
3458*4882a593Smuzhiyun 					ret = 0;
3459*4882a593Smuzhiyun 					break;
3460*4882a593Smuzhiyun 				}
3461*4882a593Smuzhiyun 			} else {
3462*4882a593Smuzhiyun 				ret = 0;
3463*4882a593Smuzhiyun 			}
3464*4882a593Smuzhiyun 		}
3465*4882a593Smuzhiyun 
3466*4882a593Smuzhiyun 		l = path->nodes[0];
3467*4882a593Smuzhiyun 		slot = path->slots[0];
3468*4882a593Smuzhiyun 
3469*4882a593Smuzhiyun 		btrfs_item_key_to_cpu(l, &found_key, slot);
3470*4882a593Smuzhiyun 
3471*4882a593Smuzhiyun 		if (found_key.objectid != scrub_dev->devid)
3472*4882a593Smuzhiyun 			break;
3473*4882a593Smuzhiyun 
3474*4882a593Smuzhiyun 		if (found_key.type != BTRFS_DEV_EXTENT_KEY)
3475*4882a593Smuzhiyun 			break;
3476*4882a593Smuzhiyun 
3477*4882a593Smuzhiyun 		if (found_key.offset >= end)
3478*4882a593Smuzhiyun 			break;
3479*4882a593Smuzhiyun 
3480*4882a593Smuzhiyun 		if (found_key.offset < key.offset)
3481*4882a593Smuzhiyun 			break;
3482*4882a593Smuzhiyun 
3483*4882a593Smuzhiyun 		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3484*4882a593Smuzhiyun 		length = btrfs_dev_extent_length(l, dev_extent);
3485*4882a593Smuzhiyun 
3486*4882a593Smuzhiyun 		if (found_key.offset + length <= start)
3487*4882a593Smuzhiyun 			goto skip;
3488*4882a593Smuzhiyun 
3489*4882a593Smuzhiyun 		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3490*4882a593Smuzhiyun 
3491*4882a593Smuzhiyun 		/*
3492*4882a593Smuzhiyun 		 * get a reference on the corresponding block group to prevent
3493*4882a593Smuzhiyun 		 * the chunk from going away while we scrub it
3494*4882a593Smuzhiyun 		 */
3495*4882a593Smuzhiyun 		cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3496*4882a593Smuzhiyun 
3497*4882a593Smuzhiyun 		/* some chunks are removed but not committed to disk yet,
3498*4882a593Smuzhiyun 		 * continue scrubbing */
3499*4882a593Smuzhiyun 		if (!cache)
3500*4882a593Smuzhiyun 			goto skip;
3501*4882a593Smuzhiyun 
3502*4882a593Smuzhiyun 		/*
3503*4882a593Smuzhiyun 		 * Make sure that while we are scrubbing the corresponding block
3504*4882a593Smuzhiyun 		 * group doesn't get its logical address and its device extents
3505*4882a593Smuzhiyun 		 * reused for another block group, which can possibly be of a
3506*4882a593Smuzhiyun 		 * different type and different profile. We do this to prevent
3507*4882a593Smuzhiyun 		 * false error detections and crashes due to bogus attempts to
3508*4882a593Smuzhiyun 		 * repair extents.
3509*4882a593Smuzhiyun 		 */
3510*4882a593Smuzhiyun 		spin_lock(&cache->lock);
3511*4882a593Smuzhiyun 		if (cache->removed) {
3512*4882a593Smuzhiyun 			spin_unlock(&cache->lock);
3513*4882a593Smuzhiyun 			btrfs_put_block_group(cache);
3514*4882a593Smuzhiyun 			goto skip;
3515*4882a593Smuzhiyun 		}
3516*4882a593Smuzhiyun 		btrfs_freeze_block_group(cache);
3517*4882a593Smuzhiyun 		spin_unlock(&cache->lock);
3518*4882a593Smuzhiyun 
3519*4882a593Smuzhiyun 		/*
3520*4882a593Smuzhiyun 		 * we need call btrfs_inc_block_group_ro() with scrubs_paused,
3521*4882a593Smuzhiyun 		 * to avoid deadlock caused by:
3522*4882a593Smuzhiyun 		 * btrfs_inc_block_group_ro()
3523*4882a593Smuzhiyun 		 * -> btrfs_wait_for_commit()
3524*4882a593Smuzhiyun 		 * -> btrfs_commit_transaction()
3525*4882a593Smuzhiyun 		 * -> btrfs_scrub_pause()
3526*4882a593Smuzhiyun 		 */
3527*4882a593Smuzhiyun 		scrub_pause_on(fs_info);
3528*4882a593Smuzhiyun 
3529*4882a593Smuzhiyun 		/*
3530*4882a593Smuzhiyun 		 * Don't do chunk preallocation for scrub.
3531*4882a593Smuzhiyun 		 *
3532*4882a593Smuzhiyun 		 * This is especially important for SYSTEM bgs, or we can hit
3533*4882a593Smuzhiyun 		 * -EFBIG from btrfs_finish_chunk_alloc() like:
3534*4882a593Smuzhiyun 		 * 1. The only SYSTEM bg is marked RO.
3535*4882a593Smuzhiyun 		 *    Since SYSTEM bg is small, that's pretty common.
3536*4882a593Smuzhiyun 		 * 2. New SYSTEM bg will be allocated
3537*4882a593Smuzhiyun 		 *    Due to regular version will allocate new chunk.
3538*4882a593Smuzhiyun 		 * 3. New SYSTEM bg is empty and will get cleaned up
3539*4882a593Smuzhiyun 		 *    Before cleanup really happens, it's marked RO again.
3540*4882a593Smuzhiyun 		 * 4. Empty SYSTEM bg get scrubbed
3541*4882a593Smuzhiyun 		 *    We go back to 2.
3542*4882a593Smuzhiyun 		 *
3543*4882a593Smuzhiyun 		 * This can easily boost the amount of SYSTEM chunks if cleaner
3544*4882a593Smuzhiyun 		 * thread can't be triggered fast enough, and use up all space
3545*4882a593Smuzhiyun 		 * of btrfs_super_block::sys_chunk_array
3546*4882a593Smuzhiyun 		 *
3547*4882a593Smuzhiyun 		 * While for dev replace, we need to try our best to mark block
3548*4882a593Smuzhiyun 		 * group RO, to prevent race between:
3549*4882a593Smuzhiyun 		 * - Write duplication
3550*4882a593Smuzhiyun 		 *   Contains latest data
3551*4882a593Smuzhiyun 		 * - Scrub copy
3552*4882a593Smuzhiyun 		 *   Contains data from commit tree
3553*4882a593Smuzhiyun 		 *
3554*4882a593Smuzhiyun 		 * If target block group is not marked RO, nocow writes can
3555*4882a593Smuzhiyun 		 * be overwritten by scrub copy, causing data corruption.
3556*4882a593Smuzhiyun 		 * So for dev-replace, it's not allowed to continue if a block
3557*4882a593Smuzhiyun 		 * group is not RO.
3558*4882a593Smuzhiyun 		 */
3559*4882a593Smuzhiyun 		ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
3560*4882a593Smuzhiyun 		if (ret == 0) {
3561*4882a593Smuzhiyun 			ro_set = 1;
3562*4882a593Smuzhiyun 		} else if (ret == -ENOSPC && !sctx->is_dev_replace) {
3563*4882a593Smuzhiyun 			/*
3564*4882a593Smuzhiyun 			 * btrfs_inc_block_group_ro return -ENOSPC when it
3565*4882a593Smuzhiyun 			 * failed in creating new chunk for metadata.
3566*4882a593Smuzhiyun 			 * It is not a problem for scrub, because
3567*4882a593Smuzhiyun 			 * metadata are always cowed, and our scrub paused
3568*4882a593Smuzhiyun 			 * commit_transactions.
3569*4882a593Smuzhiyun 			 */
3570*4882a593Smuzhiyun 			ro_set = 0;
3571*4882a593Smuzhiyun 		} else if (ret == -ETXTBSY) {
3572*4882a593Smuzhiyun 			btrfs_warn(fs_info,
3573*4882a593Smuzhiyun 		   "skipping scrub of block group %llu due to active swapfile",
3574*4882a593Smuzhiyun 				   cache->start);
3575*4882a593Smuzhiyun 			scrub_pause_off(fs_info);
3576*4882a593Smuzhiyun 			ret = 0;
3577*4882a593Smuzhiyun 			goto skip_unfreeze;
3578*4882a593Smuzhiyun 		} else {
3579*4882a593Smuzhiyun 			btrfs_warn(fs_info,
3580*4882a593Smuzhiyun 				   "failed setting block group ro: %d", ret);
3581*4882a593Smuzhiyun 			btrfs_unfreeze_block_group(cache);
3582*4882a593Smuzhiyun 			btrfs_put_block_group(cache);
3583*4882a593Smuzhiyun 			scrub_pause_off(fs_info);
3584*4882a593Smuzhiyun 			break;
3585*4882a593Smuzhiyun 		}
3586*4882a593Smuzhiyun 
3587*4882a593Smuzhiyun 		/*
3588*4882a593Smuzhiyun 		 * Now the target block is marked RO, wait for nocow writes to
3589*4882a593Smuzhiyun 		 * finish before dev-replace.
3590*4882a593Smuzhiyun 		 * COW is fine, as COW never overwrites extents in commit tree.
3591*4882a593Smuzhiyun 		 */
3592*4882a593Smuzhiyun 		if (sctx->is_dev_replace) {
3593*4882a593Smuzhiyun 			btrfs_wait_nocow_writers(cache);
3594*4882a593Smuzhiyun 			btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
3595*4882a593Smuzhiyun 					cache->length);
3596*4882a593Smuzhiyun 		}
3597*4882a593Smuzhiyun 
3598*4882a593Smuzhiyun 		scrub_pause_off(fs_info);
3599*4882a593Smuzhiyun 		down_write(&dev_replace->rwsem);
3600*4882a593Smuzhiyun 		dev_replace->cursor_right = found_key.offset + length;
3601*4882a593Smuzhiyun 		dev_replace->cursor_left = found_key.offset;
3602*4882a593Smuzhiyun 		dev_replace->item_needs_writeback = 1;
3603*4882a593Smuzhiyun 		up_write(&dev_replace->rwsem);
3604*4882a593Smuzhiyun 
3605*4882a593Smuzhiyun 		ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
3606*4882a593Smuzhiyun 				  found_key.offset, cache);
3607*4882a593Smuzhiyun 
3608*4882a593Smuzhiyun 		/*
3609*4882a593Smuzhiyun 		 * flush, submit all pending read and write bios, afterwards
3610*4882a593Smuzhiyun 		 * wait for them.
3611*4882a593Smuzhiyun 		 * Note that in the dev replace case, a read request causes
3612*4882a593Smuzhiyun 		 * write requests that are submitted in the read completion
3613*4882a593Smuzhiyun 		 * worker. Therefore in the current situation, it is required
3614*4882a593Smuzhiyun 		 * that all write requests are flushed, so that all read and
3615*4882a593Smuzhiyun 		 * write requests are really completed when bios_in_flight
3616*4882a593Smuzhiyun 		 * changes to 0.
3617*4882a593Smuzhiyun 		 */
3618*4882a593Smuzhiyun 		sctx->flush_all_writes = true;
3619*4882a593Smuzhiyun 		scrub_submit(sctx);
3620*4882a593Smuzhiyun 		mutex_lock(&sctx->wr_lock);
3621*4882a593Smuzhiyun 		scrub_wr_submit(sctx);
3622*4882a593Smuzhiyun 		mutex_unlock(&sctx->wr_lock);
3623*4882a593Smuzhiyun 
3624*4882a593Smuzhiyun 		wait_event(sctx->list_wait,
3625*4882a593Smuzhiyun 			   atomic_read(&sctx->bios_in_flight) == 0);
3626*4882a593Smuzhiyun 
3627*4882a593Smuzhiyun 		scrub_pause_on(fs_info);
3628*4882a593Smuzhiyun 
3629*4882a593Smuzhiyun 		/*
3630*4882a593Smuzhiyun 		 * must be called before we decrease @scrub_paused.
3631*4882a593Smuzhiyun 		 * make sure we don't block transaction commit while
3632*4882a593Smuzhiyun 		 * we are waiting pending workers finished.
3633*4882a593Smuzhiyun 		 */
3634*4882a593Smuzhiyun 		wait_event(sctx->list_wait,
3635*4882a593Smuzhiyun 			   atomic_read(&sctx->workers_pending) == 0);
3636*4882a593Smuzhiyun 		sctx->flush_all_writes = false;
3637*4882a593Smuzhiyun 
3638*4882a593Smuzhiyun 		scrub_pause_off(fs_info);
3639*4882a593Smuzhiyun 
3640*4882a593Smuzhiyun 		down_write(&dev_replace->rwsem);
3641*4882a593Smuzhiyun 		dev_replace->cursor_left = dev_replace->cursor_right;
3642*4882a593Smuzhiyun 		dev_replace->item_needs_writeback = 1;
3643*4882a593Smuzhiyun 		up_write(&dev_replace->rwsem);
3644*4882a593Smuzhiyun 
3645*4882a593Smuzhiyun 		if (ro_set)
3646*4882a593Smuzhiyun 			btrfs_dec_block_group_ro(cache);
3647*4882a593Smuzhiyun 
3648*4882a593Smuzhiyun 		/*
3649*4882a593Smuzhiyun 		 * We might have prevented the cleaner kthread from deleting
3650*4882a593Smuzhiyun 		 * this block group if it was already unused because we raced
3651*4882a593Smuzhiyun 		 * and set it to RO mode first. So add it back to the unused
3652*4882a593Smuzhiyun 		 * list, otherwise it might not ever be deleted unless a manual
3653*4882a593Smuzhiyun 		 * balance is triggered or it becomes used and unused again.
3654*4882a593Smuzhiyun 		 */
3655*4882a593Smuzhiyun 		spin_lock(&cache->lock);
3656*4882a593Smuzhiyun 		if (!cache->removed && !cache->ro && cache->reserved == 0 &&
3657*4882a593Smuzhiyun 		    cache->used == 0) {
3658*4882a593Smuzhiyun 			spin_unlock(&cache->lock);
3659*4882a593Smuzhiyun 			if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
3660*4882a593Smuzhiyun 				btrfs_discard_queue_work(&fs_info->discard_ctl,
3661*4882a593Smuzhiyun 							 cache);
3662*4882a593Smuzhiyun 			else
3663*4882a593Smuzhiyun 				btrfs_mark_bg_unused(cache);
3664*4882a593Smuzhiyun 		} else {
3665*4882a593Smuzhiyun 			spin_unlock(&cache->lock);
3666*4882a593Smuzhiyun 		}
3667*4882a593Smuzhiyun skip_unfreeze:
3668*4882a593Smuzhiyun 		btrfs_unfreeze_block_group(cache);
3669*4882a593Smuzhiyun 		btrfs_put_block_group(cache);
3670*4882a593Smuzhiyun 		if (ret)
3671*4882a593Smuzhiyun 			break;
3672*4882a593Smuzhiyun 		if (sctx->is_dev_replace &&
3673*4882a593Smuzhiyun 		    atomic64_read(&dev_replace->num_write_errors) > 0) {
3674*4882a593Smuzhiyun 			ret = -EIO;
3675*4882a593Smuzhiyun 			break;
3676*4882a593Smuzhiyun 		}
3677*4882a593Smuzhiyun 		if (sctx->stat.malloc_errors > 0) {
3678*4882a593Smuzhiyun 			ret = -ENOMEM;
3679*4882a593Smuzhiyun 			break;
3680*4882a593Smuzhiyun 		}
3681*4882a593Smuzhiyun skip:
3682*4882a593Smuzhiyun 		key.offset = found_key.offset + length;
3683*4882a593Smuzhiyun 		btrfs_release_path(path);
3684*4882a593Smuzhiyun 	}
3685*4882a593Smuzhiyun 
3686*4882a593Smuzhiyun 	btrfs_free_path(path);
3687*4882a593Smuzhiyun 
3688*4882a593Smuzhiyun 	return ret;
3689*4882a593Smuzhiyun }
3690*4882a593Smuzhiyun 
scrub_supers(struct scrub_ctx * sctx,struct btrfs_device * scrub_dev)3691*4882a593Smuzhiyun static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
3692*4882a593Smuzhiyun 					   struct btrfs_device *scrub_dev)
3693*4882a593Smuzhiyun {
3694*4882a593Smuzhiyun 	int	i;
3695*4882a593Smuzhiyun 	u64	bytenr;
3696*4882a593Smuzhiyun 	u64	gen;
3697*4882a593Smuzhiyun 	int	ret;
3698*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = sctx->fs_info;
3699*4882a593Smuzhiyun 
3700*4882a593Smuzhiyun 	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
3701*4882a593Smuzhiyun 		return -EROFS;
3702*4882a593Smuzhiyun 
3703*4882a593Smuzhiyun 	/* Seed devices of a new filesystem has their own generation. */
3704*4882a593Smuzhiyun 	if (scrub_dev->fs_devices != fs_info->fs_devices)
3705*4882a593Smuzhiyun 		gen = scrub_dev->generation;
3706*4882a593Smuzhiyun 	else
3707*4882a593Smuzhiyun 		gen = fs_info->last_trans_committed;
3708*4882a593Smuzhiyun 
3709*4882a593Smuzhiyun 	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
3710*4882a593Smuzhiyun 		bytenr = btrfs_sb_offset(i);
3711*4882a593Smuzhiyun 		if (bytenr + BTRFS_SUPER_INFO_SIZE >
3712*4882a593Smuzhiyun 		    scrub_dev->commit_total_bytes)
3713*4882a593Smuzhiyun 			break;
3714*4882a593Smuzhiyun 
3715*4882a593Smuzhiyun 		ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
3716*4882a593Smuzhiyun 				  scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
3717*4882a593Smuzhiyun 				  NULL, 1, bytenr);
3718*4882a593Smuzhiyun 		if (ret)
3719*4882a593Smuzhiyun 			return ret;
3720*4882a593Smuzhiyun 	}
3721*4882a593Smuzhiyun 	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3722*4882a593Smuzhiyun 
3723*4882a593Smuzhiyun 	return 0;
3724*4882a593Smuzhiyun }
3725*4882a593Smuzhiyun 
scrub_workers_put(struct btrfs_fs_info * fs_info)3726*4882a593Smuzhiyun static void scrub_workers_put(struct btrfs_fs_info *fs_info)
3727*4882a593Smuzhiyun {
3728*4882a593Smuzhiyun 	if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
3729*4882a593Smuzhiyun 					&fs_info->scrub_lock)) {
3730*4882a593Smuzhiyun 		struct btrfs_workqueue *scrub_workers = NULL;
3731*4882a593Smuzhiyun 		struct btrfs_workqueue *scrub_wr_comp = NULL;
3732*4882a593Smuzhiyun 		struct btrfs_workqueue *scrub_parity = NULL;
3733*4882a593Smuzhiyun 
3734*4882a593Smuzhiyun 		scrub_workers = fs_info->scrub_workers;
3735*4882a593Smuzhiyun 		scrub_wr_comp = fs_info->scrub_wr_completion_workers;
3736*4882a593Smuzhiyun 		scrub_parity = fs_info->scrub_parity_workers;
3737*4882a593Smuzhiyun 
3738*4882a593Smuzhiyun 		fs_info->scrub_workers = NULL;
3739*4882a593Smuzhiyun 		fs_info->scrub_wr_completion_workers = NULL;
3740*4882a593Smuzhiyun 		fs_info->scrub_parity_workers = NULL;
3741*4882a593Smuzhiyun 		mutex_unlock(&fs_info->scrub_lock);
3742*4882a593Smuzhiyun 
3743*4882a593Smuzhiyun 		btrfs_destroy_workqueue(scrub_workers);
3744*4882a593Smuzhiyun 		btrfs_destroy_workqueue(scrub_wr_comp);
3745*4882a593Smuzhiyun 		btrfs_destroy_workqueue(scrub_parity);
3746*4882a593Smuzhiyun 	}
3747*4882a593Smuzhiyun }
3748*4882a593Smuzhiyun 
3749*4882a593Smuzhiyun /*
3750*4882a593Smuzhiyun  * get a reference count on fs_info->scrub_workers. start worker if necessary
3751*4882a593Smuzhiyun  */
scrub_workers_get(struct btrfs_fs_info * fs_info,int is_dev_replace)3752*4882a593Smuzhiyun static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
3753*4882a593Smuzhiyun 						int is_dev_replace)
3754*4882a593Smuzhiyun {
3755*4882a593Smuzhiyun 	struct btrfs_workqueue *scrub_workers = NULL;
3756*4882a593Smuzhiyun 	struct btrfs_workqueue *scrub_wr_comp = NULL;
3757*4882a593Smuzhiyun 	struct btrfs_workqueue *scrub_parity = NULL;
3758*4882a593Smuzhiyun 	unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
3759*4882a593Smuzhiyun 	int max_active = fs_info->thread_pool_size;
3760*4882a593Smuzhiyun 	int ret = -ENOMEM;
3761*4882a593Smuzhiyun 
3762*4882a593Smuzhiyun 	if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
3763*4882a593Smuzhiyun 		return 0;
3764*4882a593Smuzhiyun 
3765*4882a593Smuzhiyun 	scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", flags,
3766*4882a593Smuzhiyun 					      is_dev_replace ? 1 : max_active, 4);
3767*4882a593Smuzhiyun 	if (!scrub_workers)
3768*4882a593Smuzhiyun 		goto fail_scrub_workers;
3769*4882a593Smuzhiyun 
3770*4882a593Smuzhiyun 	scrub_wr_comp = btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
3771*4882a593Smuzhiyun 					      max_active, 2);
3772*4882a593Smuzhiyun 	if (!scrub_wr_comp)
3773*4882a593Smuzhiyun 		goto fail_scrub_wr_completion_workers;
3774*4882a593Smuzhiyun 
3775*4882a593Smuzhiyun 	scrub_parity = btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
3776*4882a593Smuzhiyun 					     max_active, 2);
3777*4882a593Smuzhiyun 	if (!scrub_parity)
3778*4882a593Smuzhiyun 		goto fail_scrub_parity_workers;
3779*4882a593Smuzhiyun 
3780*4882a593Smuzhiyun 	mutex_lock(&fs_info->scrub_lock);
3781*4882a593Smuzhiyun 	if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
3782*4882a593Smuzhiyun 		ASSERT(fs_info->scrub_workers == NULL &&
3783*4882a593Smuzhiyun 		       fs_info->scrub_wr_completion_workers == NULL &&
3784*4882a593Smuzhiyun 		       fs_info->scrub_parity_workers == NULL);
3785*4882a593Smuzhiyun 		fs_info->scrub_workers = scrub_workers;
3786*4882a593Smuzhiyun 		fs_info->scrub_wr_completion_workers = scrub_wr_comp;
3787*4882a593Smuzhiyun 		fs_info->scrub_parity_workers = scrub_parity;
3788*4882a593Smuzhiyun 		refcount_set(&fs_info->scrub_workers_refcnt, 1);
3789*4882a593Smuzhiyun 		mutex_unlock(&fs_info->scrub_lock);
3790*4882a593Smuzhiyun 		return 0;
3791*4882a593Smuzhiyun 	}
3792*4882a593Smuzhiyun 	/* Other thread raced in and created the workers for us */
3793*4882a593Smuzhiyun 	refcount_inc(&fs_info->scrub_workers_refcnt);
3794*4882a593Smuzhiyun 	mutex_unlock(&fs_info->scrub_lock);
3795*4882a593Smuzhiyun 
3796*4882a593Smuzhiyun 	ret = 0;
3797*4882a593Smuzhiyun 	btrfs_destroy_workqueue(scrub_parity);
3798*4882a593Smuzhiyun fail_scrub_parity_workers:
3799*4882a593Smuzhiyun 	btrfs_destroy_workqueue(scrub_wr_comp);
3800*4882a593Smuzhiyun fail_scrub_wr_completion_workers:
3801*4882a593Smuzhiyun 	btrfs_destroy_workqueue(scrub_workers);
3802*4882a593Smuzhiyun fail_scrub_workers:
3803*4882a593Smuzhiyun 	return ret;
3804*4882a593Smuzhiyun }
3805*4882a593Smuzhiyun 
btrfs_scrub_dev(struct btrfs_fs_info * fs_info,u64 devid,u64 start,u64 end,struct btrfs_scrub_progress * progress,int readonly,int is_dev_replace)3806*4882a593Smuzhiyun int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
3807*4882a593Smuzhiyun 		    u64 end, struct btrfs_scrub_progress *progress,
3808*4882a593Smuzhiyun 		    int readonly, int is_dev_replace)
3809*4882a593Smuzhiyun {
3810*4882a593Smuzhiyun 	struct scrub_ctx *sctx;
3811*4882a593Smuzhiyun 	int ret;
3812*4882a593Smuzhiyun 	struct btrfs_device *dev;
3813*4882a593Smuzhiyun 	unsigned int nofs_flag;
3814*4882a593Smuzhiyun 	bool need_commit = false;
3815*4882a593Smuzhiyun 
3816*4882a593Smuzhiyun 	if (btrfs_fs_closing(fs_info))
3817*4882a593Smuzhiyun 		return -EAGAIN;
3818*4882a593Smuzhiyun 
3819*4882a593Smuzhiyun 	if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
3820*4882a593Smuzhiyun 		/*
3821*4882a593Smuzhiyun 		 * in this case scrub is unable to calculate the checksum
3822*4882a593Smuzhiyun 		 * the way scrub is implemented. Do not handle this
3823*4882a593Smuzhiyun 		 * situation at all because it won't ever happen.
3824*4882a593Smuzhiyun 		 */
3825*4882a593Smuzhiyun 		btrfs_err(fs_info,
3826*4882a593Smuzhiyun 			   "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
3827*4882a593Smuzhiyun 		       fs_info->nodesize,
3828*4882a593Smuzhiyun 		       BTRFS_STRIPE_LEN);
3829*4882a593Smuzhiyun 		return -EINVAL;
3830*4882a593Smuzhiyun 	}
3831*4882a593Smuzhiyun 
3832*4882a593Smuzhiyun 	if (fs_info->sectorsize != PAGE_SIZE) {
3833*4882a593Smuzhiyun 		/* not supported for data w/o checksums */
3834*4882a593Smuzhiyun 		btrfs_err_rl(fs_info,
3835*4882a593Smuzhiyun 			   "scrub: size assumption sectorsize != PAGE_SIZE (%d != %lu) fails",
3836*4882a593Smuzhiyun 		       fs_info->sectorsize, PAGE_SIZE);
3837*4882a593Smuzhiyun 		return -EINVAL;
3838*4882a593Smuzhiyun 	}
3839*4882a593Smuzhiyun 
3840*4882a593Smuzhiyun 	if (fs_info->nodesize >
3841*4882a593Smuzhiyun 	    PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
3842*4882a593Smuzhiyun 	    fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
3843*4882a593Smuzhiyun 		/*
3844*4882a593Smuzhiyun 		 * would exhaust the array bounds of pagev member in
3845*4882a593Smuzhiyun 		 * struct scrub_block
3846*4882a593Smuzhiyun 		 */
3847*4882a593Smuzhiyun 		btrfs_err(fs_info,
3848*4882a593Smuzhiyun 			  "scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
3849*4882a593Smuzhiyun 		       fs_info->nodesize,
3850*4882a593Smuzhiyun 		       SCRUB_MAX_PAGES_PER_BLOCK,
3851*4882a593Smuzhiyun 		       fs_info->sectorsize,
3852*4882a593Smuzhiyun 		       SCRUB_MAX_PAGES_PER_BLOCK);
3853*4882a593Smuzhiyun 		return -EINVAL;
3854*4882a593Smuzhiyun 	}
3855*4882a593Smuzhiyun 
3856*4882a593Smuzhiyun 	/* Allocate outside of device_list_mutex */
3857*4882a593Smuzhiyun 	sctx = scrub_setup_ctx(fs_info, is_dev_replace);
3858*4882a593Smuzhiyun 	if (IS_ERR(sctx))
3859*4882a593Smuzhiyun 		return PTR_ERR(sctx);
3860*4882a593Smuzhiyun 
3861*4882a593Smuzhiyun 	ret = scrub_workers_get(fs_info, is_dev_replace);
3862*4882a593Smuzhiyun 	if (ret)
3863*4882a593Smuzhiyun 		goto out_free_ctx;
3864*4882a593Smuzhiyun 
3865*4882a593Smuzhiyun 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
3866*4882a593Smuzhiyun 	dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
3867*4882a593Smuzhiyun 	if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
3868*4882a593Smuzhiyun 		     !is_dev_replace)) {
3869*4882a593Smuzhiyun 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3870*4882a593Smuzhiyun 		ret = -ENODEV;
3871*4882a593Smuzhiyun 		goto out;
3872*4882a593Smuzhiyun 	}
3873*4882a593Smuzhiyun 
3874*4882a593Smuzhiyun 	if (!is_dev_replace && !readonly &&
3875*4882a593Smuzhiyun 	    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
3876*4882a593Smuzhiyun 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3877*4882a593Smuzhiyun 		btrfs_err_in_rcu(fs_info,
3878*4882a593Smuzhiyun 			"scrub on devid %llu: filesystem on %s is not writable",
3879*4882a593Smuzhiyun 				 devid, rcu_str_deref(dev->name));
3880*4882a593Smuzhiyun 		ret = -EROFS;
3881*4882a593Smuzhiyun 		goto out;
3882*4882a593Smuzhiyun 	}
3883*4882a593Smuzhiyun 
3884*4882a593Smuzhiyun 	mutex_lock(&fs_info->scrub_lock);
3885*4882a593Smuzhiyun 	if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
3886*4882a593Smuzhiyun 	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
3887*4882a593Smuzhiyun 		mutex_unlock(&fs_info->scrub_lock);
3888*4882a593Smuzhiyun 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3889*4882a593Smuzhiyun 		ret = -EIO;
3890*4882a593Smuzhiyun 		goto out;
3891*4882a593Smuzhiyun 	}
3892*4882a593Smuzhiyun 
3893*4882a593Smuzhiyun 	down_read(&fs_info->dev_replace.rwsem);
3894*4882a593Smuzhiyun 	if (dev->scrub_ctx ||
3895*4882a593Smuzhiyun 	    (!is_dev_replace &&
3896*4882a593Smuzhiyun 	     btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
3897*4882a593Smuzhiyun 		up_read(&fs_info->dev_replace.rwsem);
3898*4882a593Smuzhiyun 		mutex_unlock(&fs_info->scrub_lock);
3899*4882a593Smuzhiyun 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3900*4882a593Smuzhiyun 		ret = -EINPROGRESS;
3901*4882a593Smuzhiyun 		goto out;
3902*4882a593Smuzhiyun 	}
3903*4882a593Smuzhiyun 	up_read(&fs_info->dev_replace.rwsem);
3904*4882a593Smuzhiyun 
3905*4882a593Smuzhiyun 	sctx->readonly = readonly;
3906*4882a593Smuzhiyun 	dev->scrub_ctx = sctx;
3907*4882a593Smuzhiyun 	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3908*4882a593Smuzhiyun 
3909*4882a593Smuzhiyun 	/*
3910*4882a593Smuzhiyun 	 * checking @scrub_pause_req here, we can avoid
3911*4882a593Smuzhiyun 	 * race between committing transaction and scrubbing.
3912*4882a593Smuzhiyun 	 */
3913*4882a593Smuzhiyun 	__scrub_blocked_if_needed(fs_info);
3914*4882a593Smuzhiyun 	atomic_inc(&fs_info->scrubs_running);
3915*4882a593Smuzhiyun 	mutex_unlock(&fs_info->scrub_lock);
3916*4882a593Smuzhiyun 
3917*4882a593Smuzhiyun 	/*
3918*4882a593Smuzhiyun 	 * In order to avoid deadlock with reclaim when there is a transaction
3919*4882a593Smuzhiyun 	 * trying to pause scrub, make sure we use GFP_NOFS for all the
3920*4882a593Smuzhiyun 	 * allocations done at btrfs_scrub_pages() and scrub_pages_for_parity()
3921*4882a593Smuzhiyun 	 * invoked by our callees. The pausing request is done when the
3922*4882a593Smuzhiyun 	 * transaction commit starts, and it blocks the transaction until scrub
3923*4882a593Smuzhiyun 	 * is paused (done at specific points at scrub_stripe() or right above
3924*4882a593Smuzhiyun 	 * before incrementing fs_info->scrubs_running).
3925*4882a593Smuzhiyun 	 */
3926*4882a593Smuzhiyun 	nofs_flag = memalloc_nofs_save();
3927*4882a593Smuzhiyun 	if (!is_dev_replace) {
3928*4882a593Smuzhiyun 		u64 old_super_errors;
3929*4882a593Smuzhiyun 
3930*4882a593Smuzhiyun 		spin_lock(&sctx->stat_lock);
3931*4882a593Smuzhiyun 		old_super_errors = sctx->stat.super_errors;
3932*4882a593Smuzhiyun 		spin_unlock(&sctx->stat_lock);
3933*4882a593Smuzhiyun 
3934*4882a593Smuzhiyun 		btrfs_info(fs_info, "scrub: started on devid %llu", devid);
3935*4882a593Smuzhiyun 		/*
3936*4882a593Smuzhiyun 		 * by holding device list mutex, we can
3937*4882a593Smuzhiyun 		 * kick off writing super in log tree sync.
3938*4882a593Smuzhiyun 		 */
3939*4882a593Smuzhiyun 		mutex_lock(&fs_info->fs_devices->device_list_mutex);
3940*4882a593Smuzhiyun 		ret = scrub_supers(sctx, dev);
3941*4882a593Smuzhiyun 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3942*4882a593Smuzhiyun 
3943*4882a593Smuzhiyun 		spin_lock(&sctx->stat_lock);
3944*4882a593Smuzhiyun 		/*
3945*4882a593Smuzhiyun 		 * Super block errors found, but we can not commit transaction
3946*4882a593Smuzhiyun 		 * at current context, since btrfs_commit_transaction() needs
3947*4882a593Smuzhiyun 		 * to pause the current running scrub (hold by ourselves).
3948*4882a593Smuzhiyun 		 */
3949*4882a593Smuzhiyun 		if (sctx->stat.super_errors > old_super_errors && !sctx->readonly)
3950*4882a593Smuzhiyun 			need_commit = true;
3951*4882a593Smuzhiyun 		spin_unlock(&sctx->stat_lock);
3952*4882a593Smuzhiyun 	}
3953*4882a593Smuzhiyun 
3954*4882a593Smuzhiyun 	if (!ret)
3955*4882a593Smuzhiyun 		ret = scrub_enumerate_chunks(sctx, dev, start, end);
3956*4882a593Smuzhiyun 	memalloc_nofs_restore(nofs_flag);
3957*4882a593Smuzhiyun 
3958*4882a593Smuzhiyun 	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3959*4882a593Smuzhiyun 	atomic_dec(&fs_info->scrubs_running);
3960*4882a593Smuzhiyun 	wake_up(&fs_info->scrub_pause_wait);
3961*4882a593Smuzhiyun 
3962*4882a593Smuzhiyun 	wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
3963*4882a593Smuzhiyun 
3964*4882a593Smuzhiyun 	if (progress)
3965*4882a593Smuzhiyun 		memcpy(progress, &sctx->stat, sizeof(*progress));
3966*4882a593Smuzhiyun 
3967*4882a593Smuzhiyun 	if (!is_dev_replace)
3968*4882a593Smuzhiyun 		btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
3969*4882a593Smuzhiyun 			ret ? "not finished" : "finished", devid, ret);
3970*4882a593Smuzhiyun 
3971*4882a593Smuzhiyun 	mutex_lock(&fs_info->scrub_lock);
3972*4882a593Smuzhiyun 	dev->scrub_ctx = NULL;
3973*4882a593Smuzhiyun 	mutex_unlock(&fs_info->scrub_lock);
3974*4882a593Smuzhiyun 
3975*4882a593Smuzhiyun 	scrub_workers_put(fs_info);
3976*4882a593Smuzhiyun 	scrub_put_ctx(sctx);
3977*4882a593Smuzhiyun 
3978*4882a593Smuzhiyun 	/*
3979*4882a593Smuzhiyun 	 * We found some super block errors before, now try to force a
3980*4882a593Smuzhiyun 	 * transaction commit, as scrub has finished.
3981*4882a593Smuzhiyun 	 */
3982*4882a593Smuzhiyun 	if (need_commit) {
3983*4882a593Smuzhiyun 		struct btrfs_trans_handle *trans;
3984*4882a593Smuzhiyun 
3985*4882a593Smuzhiyun 		trans = btrfs_start_transaction(fs_info->tree_root, 0);
3986*4882a593Smuzhiyun 		if (IS_ERR(trans)) {
3987*4882a593Smuzhiyun 			ret = PTR_ERR(trans);
3988*4882a593Smuzhiyun 			btrfs_err(fs_info,
3989*4882a593Smuzhiyun 	"scrub: failed to start transaction to fix super block errors: %d", ret);
3990*4882a593Smuzhiyun 			return ret;
3991*4882a593Smuzhiyun 		}
3992*4882a593Smuzhiyun 		ret = btrfs_commit_transaction(trans);
3993*4882a593Smuzhiyun 		if (ret < 0)
3994*4882a593Smuzhiyun 			btrfs_err(fs_info,
3995*4882a593Smuzhiyun 	"scrub: failed to commit transaction to fix super block errors: %d", ret);
3996*4882a593Smuzhiyun 	}
3997*4882a593Smuzhiyun 	return ret;
3998*4882a593Smuzhiyun out:
3999*4882a593Smuzhiyun 	scrub_workers_put(fs_info);
4000*4882a593Smuzhiyun out_free_ctx:
4001*4882a593Smuzhiyun 	scrub_free_ctx(sctx);
4002*4882a593Smuzhiyun 
4003*4882a593Smuzhiyun 	return ret;
4004*4882a593Smuzhiyun }
4005*4882a593Smuzhiyun 
btrfs_scrub_pause(struct btrfs_fs_info * fs_info)4006*4882a593Smuzhiyun void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
4007*4882a593Smuzhiyun {
4008*4882a593Smuzhiyun 	mutex_lock(&fs_info->scrub_lock);
4009*4882a593Smuzhiyun 	atomic_inc(&fs_info->scrub_pause_req);
4010*4882a593Smuzhiyun 	while (atomic_read(&fs_info->scrubs_paused) !=
4011*4882a593Smuzhiyun 	       atomic_read(&fs_info->scrubs_running)) {
4012*4882a593Smuzhiyun 		mutex_unlock(&fs_info->scrub_lock);
4013*4882a593Smuzhiyun 		wait_event(fs_info->scrub_pause_wait,
4014*4882a593Smuzhiyun 			   atomic_read(&fs_info->scrubs_paused) ==
4015*4882a593Smuzhiyun 			   atomic_read(&fs_info->scrubs_running));
4016*4882a593Smuzhiyun 		mutex_lock(&fs_info->scrub_lock);
4017*4882a593Smuzhiyun 	}
4018*4882a593Smuzhiyun 	mutex_unlock(&fs_info->scrub_lock);
4019*4882a593Smuzhiyun }
4020*4882a593Smuzhiyun 
btrfs_scrub_continue(struct btrfs_fs_info * fs_info)4021*4882a593Smuzhiyun void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
4022*4882a593Smuzhiyun {
4023*4882a593Smuzhiyun 	atomic_dec(&fs_info->scrub_pause_req);
4024*4882a593Smuzhiyun 	wake_up(&fs_info->scrub_pause_wait);
4025*4882a593Smuzhiyun }
4026*4882a593Smuzhiyun 
btrfs_scrub_cancel(struct btrfs_fs_info * fs_info)4027*4882a593Smuzhiyun int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
4028*4882a593Smuzhiyun {
4029*4882a593Smuzhiyun 	mutex_lock(&fs_info->scrub_lock);
4030*4882a593Smuzhiyun 	if (!atomic_read(&fs_info->scrubs_running)) {
4031*4882a593Smuzhiyun 		mutex_unlock(&fs_info->scrub_lock);
4032*4882a593Smuzhiyun 		return -ENOTCONN;
4033*4882a593Smuzhiyun 	}
4034*4882a593Smuzhiyun 
4035*4882a593Smuzhiyun 	atomic_inc(&fs_info->scrub_cancel_req);
4036*4882a593Smuzhiyun 	while (atomic_read(&fs_info->scrubs_running)) {
4037*4882a593Smuzhiyun 		mutex_unlock(&fs_info->scrub_lock);
4038*4882a593Smuzhiyun 		wait_event(fs_info->scrub_pause_wait,
4039*4882a593Smuzhiyun 			   atomic_read(&fs_info->scrubs_running) == 0);
4040*4882a593Smuzhiyun 		mutex_lock(&fs_info->scrub_lock);
4041*4882a593Smuzhiyun 	}
4042*4882a593Smuzhiyun 	atomic_dec(&fs_info->scrub_cancel_req);
4043*4882a593Smuzhiyun 	mutex_unlock(&fs_info->scrub_lock);
4044*4882a593Smuzhiyun 
4045*4882a593Smuzhiyun 	return 0;
4046*4882a593Smuzhiyun }
4047*4882a593Smuzhiyun 
btrfs_scrub_cancel_dev(struct btrfs_device * dev)4048*4882a593Smuzhiyun int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
4049*4882a593Smuzhiyun {
4050*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = dev->fs_info;
4051*4882a593Smuzhiyun 	struct scrub_ctx *sctx;
4052*4882a593Smuzhiyun 
4053*4882a593Smuzhiyun 	mutex_lock(&fs_info->scrub_lock);
4054*4882a593Smuzhiyun 	sctx = dev->scrub_ctx;
4055*4882a593Smuzhiyun 	if (!sctx) {
4056*4882a593Smuzhiyun 		mutex_unlock(&fs_info->scrub_lock);
4057*4882a593Smuzhiyun 		return -ENOTCONN;
4058*4882a593Smuzhiyun 	}
4059*4882a593Smuzhiyun 	atomic_inc(&sctx->cancel_req);
4060*4882a593Smuzhiyun 	while (dev->scrub_ctx) {
4061*4882a593Smuzhiyun 		mutex_unlock(&fs_info->scrub_lock);
4062*4882a593Smuzhiyun 		wait_event(fs_info->scrub_pause_wait,
4063*4882a593Smuzhiyun 			   dev->scrub_ctx == NULL);
4064*4882a593Smuzhiyun 		mutex_lock(&fs_info->scrub_lock);
4065*4882a593Smuzhiyun 	}
4066*4882a593Smuzhiyun 	mutex_unlock(&fs_info->scrub_lock);
4067*4882a593Smuzhiyun 
4068*4882a593Smuzhiyun 	return 0;
4069*4882a593Smuzhiyun }
4070*4882a593Smuzhiyun 
btrfs_scrub_progress(struct btrfs_fs_info * fs_info,u64 devid,struct btrfs_scrub_progress * progress)4071*4882a593Smuzhiyun int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
4072*4882a593Smuzhiyun 			 struct btrfs_scrub_progress *progress)
4073*4882a593Smuzhiyun {
4074*4882a593Smuzhiyun 	struct btrfs_device *dev;
4075*4882a593Smuzhiyun 	struct scrub_ctx *sctx = NULL;
4076*4882a593Smuzhiyun 
4077*4882a593Smuzhiyun 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
4078*4882a593Smuzhiyun 	dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
4079*4882a593Smuzhiyun 	if (dev)
4080*4882a593Smuzhiyun 		sctx = dev->scrub_ctx;
4081*4882a593Smuzhiyun 	if (sctx)
4082*4882a593Smuzhiyun 		memcpy(progress, &sctx->stat, sizeof(*progress));
4083*4882a593Smuzhiyun 	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4084*4882a593Smuzhiyun 
4085*4882a593Smuzhiyun 	return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
4086*4882a593Smuzhiyun }
4087*4882a593Smuzhiyun 
scrub_remap_extent(struct btrfs_fs_info * fs_info,u64 extent_logical,u64 extent_len,u64 * extent_physical,struct btrfs_device ** extent_dev,int * extent_mirror_num)4088*4882a593Smuzhiyun static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
4089*4882a593Smuzhiyun 			       u64 extent_logical, u64 extent_len,
4090*4882a593Smuzhiyun 			       u64 *extent_physical,
4091*4882a593Smuzhiyun 			       struct btrfs_device **extent_dev,
4092*4882a593Smuzhiyun 			       int *extent_mirror_num)
4093*4882a593Smuzhiyun {
4094*4882a593Smuzhiyun 	u64 mapped_length;
4095*4882a593Smuzhiyun 	struct btrfs_bio *bbio = NULL;
4096*4882a593Smuzhiyun 	int ret;
4097*4882a593Smuzhiyun 
4098*4882a593Smuzhiyun 	mapped_length = extent_len;
4099*4882a593Smuzhiyun 	ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
4100*4882a593Smuzhiyun 			      &mapped_length, &bbio, 0);
4101*4882a593Smuzhiyun 	if (ret || !bbio || mapped_length < extent_len ||
4102*4882a593Smuzhiyun 	    !bbio->stripes[0].dev->bdev) {
4103*4882a593Smuzhiyun 		btrfs_put_bbio(bbio);
4104*4882a593Smuzhiyun 		return;
4105*4882a593Smuzhiyun 	}
4106*4882a593Smuzhiyun 
4107*4882a593Smuzhiyun 	*extent_physical = bbio->stripes[0].physical;
4108*4882a593Smuzhiyun 	*extent_mirror_num = bbio->mirror_num;
4109*4882a593Smuzhiyun 	*extent_dev = bbio->stripes[0].dev;
4110*4882a593Smuzhiyun 	btrfs_put_bbio(bbio);
4111*4882a593Smuzhiyun }
4112