// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
 */

#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>
#include <crypto/hash.h>
#include "ctree.h"
#include "discard.h"
#include "volumes.h"
#include "disk-io.h"
#include "ordered-data.h"
#include "transaction.h"
#include "backref.h"
#include "extent_io.h"
#include "dev-replace.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "raid56.h"
#include "block-group.h"

/*
 * This is only the first step towards a full-featured scrub. It reads all
 * extents and super blocks and verifies the checksums. In case a bad checksum
 * is found or the extent cannot be read, good data will be written back if
 * any can be found.
 *
 * Future enhancements:
 *  - In case an unrepairable extent is encountered, track which files are
 *    affected and report them
 *  - track and record media errors, throw out bad devices
 *  - add a mode to also read unallocated space
 */

struct scrub_block;
struct scrub_ctx;

/*
 * The following three values only influence performance.
 * The last one configures the number of parallel and outstanding I/O
 * operations. The first two values configure an upper limit for the number
 * of (dynamically allocated) pages that are added to a bio.
 */
#define SCRUB_PAGES_PER_RD_BIO	32	/* 128KiB per bio */
#define SCRUB_PAGES_PER_WR_BIO	32	/* 128KiB per bio */
#define SCRUB_BIOS_PER_SCTX	64	/* 8MiB per device in flight */
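
/*
 * Back-of-the-envelope numbers behind the comments above, assuming 4KiB
 * pages: 32 pages * 4KiB = 128KiB per bio, and 64 bios * 128KiB = 8MiB of
 * outstanding I/O per device.
 */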

/*
 * The following value times PAGE_SIZE needs to be large enough to match the
 * largest node/leaf/sector size that shall be supported.
 * Values larger than BTRFS_STRIPE_LEN are not supported.
 */
#define SCRUB_MAX_PAGES_PER_BLOCK	16	/* 64KiB per node/leaf/sector */

struct scrub_recover {
	refcount_t refs;
	struct btrfs_bio *bbio;
	u64 map_length;
};

struct scrub_page {
	struct scrub_block *sblock;
	struct page *page;
	struct btrfs_device *dev;
	struct list_head list;
	u64 flags;	/* extent flags */
	u64 generation;
	u64 logical;
	u64 physical;
	u64 physical_for_dev_replace;
	atomic_t refs;
	struct {
		unsigned int mirror_num:8;
		unsigned int have_csum:1;
		unsigned int io_error:1;
	};
	u8 csum[BTRFS_CSUM_SIZE];

	struct scrub_recover *recover;
};

struct scrub_bio {
	int index;
	struct scrub_ctx *sctx;
	struct btrfs_device *dev;
	struct bio *bio;
	blk_status_t status;
	u64 logical;
	u64 physical;
#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
	struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO];
#else
	struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO];
#endif
	int page_count;
	int next_free;
	struct btrfs_work work;
};

struct scrub_block {
	struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
	int page_count;
	atomic_t outstanding_pages;
	refcount_t refs; /* free mem on transition to zero */
	struct scrub_ctx *sctx;
	struct scrub_parity *sparity;
	struct {
		unsigned int header_error:1;
		unsigned int checksum_error:1;
		unsigned int no_io_error_seen:1;
		unsigned int generation_error:1; /* also sets header_error */

		/*
		 * The following is for the data used to check parity;
		 * it is for data with a checksum.
		 */
		unsigned int data_corrected:1;
	};
	struct btrfs_work work;
};

/* Used for the chunks with parity stripes, such as RAID5/6 */
struct scrub_parity {
	struct scrub_ctx *sctx;

	struct btrfs_device *scrub_dev;

	u64 logic_start;

	u64 logic_end;

	int nsectors;

	u64 stripe_len;

	refcount_t refs;

	struct list_head spages;

	/* Work of parity check and repair */
	struct btrfs_work work;

	/* Mark the parity blocks which have data */
	unsigned long *dbitmap;

	/*
	 * Mark the parity blocks which have data, but where errors happened
	 * when reading or checking that data
	 */
	unsigned long *ebitmap;

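	/*
	 * Backing storage for the two bitmaps above; dbitmap and ebitmap
	 * are expected to point into this single trailing allocation.
	 */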
	unsigned long bitmap[];
};

struct scrub_ctx {
	struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX];
	struct btrfs_fs_info *fs_info;
	int first_free;
	int curr;
	atomic_t bios_in_flight;
	atomic_t workers_pending;
	spinlock_t list_lock;
	wait_queue_head_t list_wait;
	u16 csum_size;
	struct list_head csum_list;
	atomic_t cancel_req;
	int readonly;
	int pages_per_rd_bio;

	int is_dev_replace;

	struct scrub_bio *wr_curr_bio;
	struct mutex wr_lock;
	int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
	struct btrfs_device *wr_tgtdev;
	bool flush_all_writes;

	/*
	 * statistics
	 */
	struct btrfs_scrub_progress stat;
	spinlock_t stat_lock;

	/*
	 * Use a ref counter to avoid use-after-free issues. Scrub workers
	 * decrement bios_in_flight and workers_pending and then do a wakeup
	 * on the list_wait wait queue. We must ensure the main scrub task
	 * doesn't free the scrub context before or while the workers are
	 * doing the wakeup() call.
	 */
	refcount_t refs;
};

struct scrub_warning {
	struct btrfs_path *path;
	u64 extent_item_size;
	const char *errstr;
	u64 physical;
	u64 logical;
	struct btrfs_device *dev;
};

struct full_stripe_lock {
	struct rb_node node;
	u64 logical;
	u64 refs;
	struct mutex mutex;
};

static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
				     struct scrub_block *sblocks_for_recheck);
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
				struct scrub_block *sblock,
				int retry_failed_mirror);
static void scrub_recheck_block_checksum(struct scrub_block *sblock);
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
					     struct scrub_block *sblock_good);
static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
					    struct scrub_block *sblock_good,
					    int page_num, int force_write);
static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
					   int page_num);
static int scrub_checksum_data(struct scrub_block *sblock);
static int scrub_checksum_tree_block(struct scrub_block *sblock);
static int scrub_checksum_super(struct scrub_block *sblock);
static void scrub_block_get(struct scrub_block *sblock);
static void scrub_block_put(struct scrub_block *sblock);
static void scrub_page_get(struct scrub_page *spage);
static void scrub_page_put(struct scrub_page *spage);
static void scrub_parity_get(struct scrub_parity *sparity);
static void scrub_parity_put(struct scrub_parity *sparity);
static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
				    struct scrub_page *spage);
static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
		       u64 physical, struct btrfs_device *dev, u64 flags,
		       u64 gen, int mirror_num, u8 *csum, int force,
		       u64 physical_for_dev_replace);
static void scrub_bio_end_io(struct bio *bio);
static void scrub_bio_end_io_worker(struct btrfs_work *work);
static void scrub_block_complete(struct scrub_block *sblock);
static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
			       u64 extent_logical, u64 extent_len,
			       u64 *extent_physical,
			       struct btrfs_device **extent_dev,
			       int *extent_mirror_num);
static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
				    struct scrub_page *spage);
static void scrub_wr_submit(struct scrub_ctx *sctx);
static void scrub_wr_bio_end_io(struct bio *bio);
static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
static void scrub_put_ctx(struct scrub_ctx *sctx);

static inline int scrub_is_page_on_raid56(struct scrub_page *page)
{
	return page->recover &&
	       (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
}

static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
{
	refcount_inc(&sctx->refs);
	atomic_inc(&sctx->bios_in_flight);
}

static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
{
	atomic_dec(&sctx->bios_in_flight);
	wake_up(&sctx->list_wait);
	scrub_put_ctx(sctx);
}

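/*
 * Pause protocol (sketch): some other task raises fs_info->scrub_pause_req
 * and then waits for the running scrubs to check in via
 * fs_info->scrubs_paused.  scrub_pause_on() announces that this scrub has
 * reached a pause point, and scrub_pause_off() blocks in
 * __scrub_blocked_if_needed() (temporarily dropping scrub_lock) until the
 * pause request is withdrawn, then resumes.
 */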
static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
	while (atomic_read(&fs_info->scrub_pause_req)) {
		mutex_unlock(&fs_info->scrub_lock);
		wait_event(fs_info->scrub_pause_wait,
			   atomic_read(&fs_info->scrub_pause_req) == 0);
		mutex_lock(&fs_info->scrub_lock);
	}
}

static void scrub_pause_on(struct btrfs_fs_info *fs_info)
{
	atomic_inc(&fs_info->scrubs_paused);
	wake_up(&fs_info->scrub_pause_wait);
}

static void scrub_pause_off(struct btrfs_fs_info *fs_info)
{
	mutex_lock(&fs_info->scrub_lock);
	__scrub_blocked_if_needed(fs_info);
	atomic_dec(&fs_info->scrubs_paused);
	mutex_unlock(&fs_info->scrub_lock);

	wake_up(&fs_info->scrub_pause_wait);
}

static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
	scrub_pause_on(fs_info);
	scrub_pause_off(fs_info);
}

/*
 * Insert new full stripe lock into full stripe locks tree
 *
 * Return pointer to existing or newly inserted full_stripe_lock structure if
 * everything works well.
 * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
 *
 * NOTE: caller must hold full_stripe_locks_root->lock before calling this
 * function
 */
static struct full_stripe_lock *insert_full_stripe_lock(
		struct btrfs_full_stripe_locks_tree *locks_root,
		u64 fstripe_logical)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct full_stripe_lock *entry;
	struct full_stripe_lock *ret;

	lockdep_assert_held(&locks_root->lock);

	p = &locks_root->root.rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct full_stripe_lock, node);
		if (fstripe_logical < entry->logical) {
			p = &(*p)->rb_left;
		} else if (fstripe_logical > entry->logical) {
			p = &(*p)->rb_right;
		} else {
			entry->refs++;
			return entry;
		}
	}

	/*
	 * Insert new lock.
	 */
	ret = kmalloc(sizeof(*ret), GFP_KERNEL);
	if (!ret)
		return ERR_PTR(-ENOMEM);
	ret->logical = fstripe_logical;
	ret->refs = 1;
	mutex_init(&ret->mutex);

	rb_link_node(&ret->node, parent, p);
	rb_insert_color(&ret->node, &locks_root->root);
	return ret;
}

/*
 * Search for a full stripe lock of a block group
 *
 * Return pointer to existing full stripe lock if found
 * Return NULL if not found
 */
static struct full_stripe_lock *search_full_stripe_lock(
		struct btrfs_full_stripe_locks_tree *locks_root,
		u64 fstripe_logical)
{
	struct rb_node *node;
	struct full_stripe_lock *entry;

	lockdep_assert_held(&locks_root->lock);

	node = locks_root->root.rb_node;
	while (node) {
		entry = rb_entry(node, struct full_stripe_lock, node);
		if (fstripe_logical < entry->logical)
			node = node->rb_left;
		else if (fstripe_logical > entry->logical)
			node = node->rb_right;
		else
			return entry;
	}
	return NULL;
}

/*
 * Helper to get full stripe logical from a normal bytenr.
 *
 * Caller must ensure @cache is a RAID56 block group.
 */
static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
{
	u64 ret;

	/*
	 * Due to chunk item size limit, full stripe length should not be
	 * larger than U32_MAX. Just a sanity check here.
	 */
	WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);

	/*
	 * round_down() can only handle power of 2, while RAID56 full
	 * stripe length can be 64KiB * n, so we need to manually round down.
	 */
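	/*
	 * Worked example with hypothetical numbers: for cache->start == 1MiB,
	 * full_stripe_len == 192KiB and bytenr == 1MiB + 200KiB, the result
	 * is 1MiB + 192KiB, the start of the full stripe containing bytenr.
	 */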
	ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
			cache->full_stripe_len + cache->start;
	return ret;
}

/*
 * Lock a full stripe to avoid concurrency of recovery and read
 *
 * It's only used for profiles with parity (RAID5/6); for other profiles it
 * does nothing.
 *
 * Return 0 if we locked the full stripe covering @bytenr, with a mutex held.
 * The caller must call unlock_full_stripe() in the same context.
 *
 * Return <0 if we hit an error.
 */
static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
			    bool *locked_ret)
{
	struct btrfs_block_group *bg_cache;
	struct btrfs_full_stripe_locks_tree *locks_root;
	struct full_stripe_lock *existing;
	u64 fstripe_start;
	int ret = 0;

	*locked_ret = false;
	bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
	if (!bg_cache) {
		ASSERT(0);
		return -ENOENT;
	}

	/* Profiles not based on parity don't need full stripe lock */
	if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
		goto out;
	locks_root = &bg_cache->full_stripe_locks_root;

	fstripe_start = get_full_stripe_logical(bg_cache, bytenr);

	/* Now insert the full stripe lock */
	mutex_lock(&locks_root->lock);
	existing = insert_full_stripe_lock(locks_root, fstripe_start);
	mutex_unlock(&locks_root->lock);
	if (IS_ERR(existing)) {
		ret = PTR_ERR(existing);
		goto out;
	}
	mutex_lock(&existing->mutex);
	*locked_ret = true;
out:
	btrfs_put_block_group(bg_cache);
	return ret;
}
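
/*
 * Minimal usage sketch (mirroring how scrub_handle_errored_block() uses the
 * pair below); lock_full_stripe() and unlock_full_stripe() must be called
 * from the same context:
 *
 *	bool locked = false;
 *	int ret;
 *
 *	ret = lock_full_stripe(fs_info, logical, &locked);
 *	if (ret < 0)
 *		return ret;
 *	// ... recheck/repair blocks inside the full stripe ...
 *	unlock_full_stripe(fs_info, logical, locked);
 */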

/*
 * Unlock a full stripe.
 *
 * NOTE: Caller must ensure it's the same context that called the
 * corresponding lock_full_stripe().
 *
 * Return 0 if we unlocked the full stripe without problem.
 * Return <0 for error
 */
static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
			      bool locked)
{
	struct btrfs_block_group *bg_cache;
	struct btrfs_full_stripe_locks_tree *locks_root;
	struct full_stripe_lock *fstripe_lock;
	u64 fstripe_start;
	bool freeit = false;
	int ret = 0;

	/* If we didn't acquire the full stripe lock, no need to continue */
	if (!locked)
		return 0;

	bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
	if (!bg_cache) {
		ASSERT(0);
		return -ENOENT;
	}
	if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
		goto out;

	locks_root = &bg_cache->full_stripe_locks_root;
	fstripe_start = get_full_stripe_logical(bg_cache, bytenr);

	mutex_lock(&locks_root->lock);
	fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
	/* Unpaired unlock_full_stripe() detected */
	if (!fstripe_lock) {
		WARN_ON(1);
		ret = -ENOENT;
		mutex_unlock(&locks_root->lock);
		goto out;
	}

	if (fstripe_lock->refs == 0) {
		WARN_ON(1);
		btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
			   fstripe_lock->logical);
	} else {
		fstripe_lock->refs--;
	}

	if (fstripe_lock->refs == 0) {
		rb_erase(&fstripe_lock->node, &locks_root->root);
		freeit = true;
	}
	mutex_unlock(&locks_root->lock);

	mutex_unlock(&fstripe_lock->mutex);
	if (freeit)
		kfree(fstripe_lock);
out:
	btrfs_put_block_group(bg_cache);
	return ret;
}

static void scrub_free_csums(struct scrub_ctx *sctx)
{
	while (!list_empty(&sctx->csum_list)) {
		struct btrfs_ordered_sum *sum;
		sum = list_first_entry(&sctx->csum_list,
				       struct btrfs_ordered_sum, list);
		list_del(&sum->list);
		kfree(sum);
	}
}

static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
{
	int i;

	if (!sctx)
		return;

	/* this can happen when scrub is cancelled */
	if (sctx->curr != -1) {
		struct scrub_bio *sbio = sctx->bios[sctx->curr];

		for (i = 0; i < sbio->page_count; i++) {
			WARN_ON(!sbio->pagev[i]->page);
			scrub_block_put(sbio->pagev[i]->sblock);
		}
		bio_put(sbio->bio);
	}

	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
		struct scrub_bio *sbio = sctx->bios[i];

		if (!sbio)
			break;
		kfree(sbio);
	}

	kfree(sctx->wr_curr_bio);
	scrub_free_csums(sctx);
	kfree(sctx);
}

static void scrub_put_ctx(struct scrub_ctx *sctx)
{
	if (refcount_dec_and_test(&sctx->refs))
		scrub_free_ctx(sctx);
}

static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
		struct btrfs_fs_info *fs_info, int is_dev_replace)
{
	struct scrub_ctx *sctx;
	int i;

	sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
	if (!sctx)
		goto nomem;
	refcount_set(&sctx->refs, 1);
	sctx->is_dev_replace = is_dev_replace;
	sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
	sctx->curr = -1;
	sctx->fs_info = fs_info;
	INIT_LIST_HEAD(&sctx->csum_list);
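	/*
	 * The scrub_bios allocated below are chained into a simple free list:
	 * sbio->next_free holds the index of the next free bio (or -1 at the
	 * tail) and sctx->first_free points at the head.
	 */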
	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
		struct scrub_bio *sbio;

		sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
		if (!sbio)
			goto nomem;
		sctx->bios[i] = sbio;

		sbio->index = i;
		sbio->sctx = sctx;
		sbio->page_count = 0;
		btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, NULL,
				NULL);

		if (i != SCRUB_BIOS_PER_SCTX - 1)
			sctx->bios[i]->next_free = i + 1;
		else
			sctx->bios[i]->next_free = -1;
	}
	sctx->first_free = 0;
	atomic_set(&sctx->bios_in_flight, 0);
	atomic_set(&sctx->workers_pending, 0);
	atomic_set(&sctx->cancel_req, 0);
	sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);

	spin_lock_init(&sctx->list_lock);
	spin_lock_init(&sctx->stat_lock);
	init_waitqueue_head(&sctx->list_wait);

	WARN_ON(sctx->wr_curr_bio != NULL);
	mutex_init(&sctx->wr_lock);
	sctx->wr_curr_bio = NULL;
	if (is_dev_replace) {
		WARN_ON(!fs_info->dev_replace.tgtdev);
		sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
		sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
		sctx->flush_all_writes = false;
	}

	return sctx;

nomem:
	scrub_free_ctx(sctx);
	return ERR_PTR(-ENOMEM);
}

static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
				     void *warn_ctx)
{
	u64 isize;
	u32 nlink;
	int ret;
	int i;
	unsigned nofs_flag;
	struct extent_buffer *eb;
	struct btrfs_inode_item *inode_item;
	struct scrub_warning *swarn = warn_ctx;
	struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
	struct inode_fs_paths *ipath = NULL;
	struct btrfs_root *local_root;
	struct btrfs_key key;

	local_root = btrfs_get_fs_root(fs_info, root, true);
	if (IS_ERR(local_root)) {
		ret = PTR_ERR(local_root);
		goto err;
	}

	/*
	 * this makes the path point to (inum INODE_ITEM ioff)
	 */
	key.objectid = inum;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
	if (ret) {
		btrfs_put_root(local_root);
		btrfs_release_path(swarn->path);
		goto err;
	}

	eb = swarn->path->nodes[0];
	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
				    struct btrfs_inode_item);
	isize = btrfs_inode_size(eb, inode_item);
	nlink = btrfs_inode_nlink(eb, inode_item);
	btrfs_release_path(swarn->path);

	/*
	 * init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub
	 * uses GFP_NOFS in this context, so we keep it consistent but it does
	 * not seem to be strictly necessary.
	 */
	nofs_flag = memalloc_nofs_save();
	ipath = init_ipath(4096, local_root, swarn->path);
	memalloc_nofs_restore(nofs_flag);
	if (IS_ERR(ipath)) {
		btrfs_put_root(local_root);
		ret = PTR_ERR(ipath);
		ipath = NULL;
		goto err;
	}
	ret = paths_from_inode(inum, ipath);

	if (ret < 0)
		goto err;

	/*
	 * we deliberately ignore the fact that ipath might have been too
	 * small to hold all of the paths here
	 */
	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
		btrfs_warn_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)",
				  swarn->errstr, swarn->logical,
				  rcu_str_deref(swarn->dev->name),
				  swarn->physical,
				  root, inum, offset,
				  min(isize - offset, (u64)PAGE_SIZE), nlink,
				  (char *)(unsigned long)ipath->fspath->val[i]);

	btrfs_put_root(local_root);
	free_ipath(ipath);
	return 0;

err:
	btrfs_warn_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
			  swarn->errstr, swarn->logical,
			  rcu_str_deref(swarn->dev->name),
			  swarn->physical,
			  root, inum, offset, ret);

	free_ipath(ipath);
	return 0;
}

static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
{
	struct btrfs_device *dev;
	struct btrfs_fs_info *fs_info;
	struct btrfs_path *path;
	struct btrfs_key found_key;
	struct extent_buffer *eb;
	struct btrfs_extent_item *ei;
	struct scrub_warning swarn;
	unsigned long ptr = 0;
	u64 extent_item_pos;
	u64 flags = 0;
	u64 ref_root;
	u32 item_size;
	u8 ref_level = 0;
	int ret;

	WARN_ON(sblock->page_count < 1);
	dev = sblock->pagev[0]->dev;
	fs_info = sblock->sctx->fs_info;

	path = btrfs_alloc_path();
	if (!path)
		return;

	swarn.physical = sblock->pagev[0]->physical;
	swarn.logical = sblock->pagev[0]->logical;
	swarn.errstr = errstr;
	swarn.dev = NULL;

	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
				  &flags);
	if (ret < 0)
		goto out;

	extent_item_pos = swarn.logical - found_key.objectid;
	swarn.extent_item_size = found_key.offset;

	eb = path->nodes[0];
	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
	item_size = btrfs_item_size_nr(eb, path->slots[0]);

	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
		do {
			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
						      item_size, &ref_root,
						      &ref_level);
			btrfs_warn_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
					  errstr, swarn.logical,
					  rcu_str_deref(dev->name),
					  swarn.physical,
					  ref_level ? "node" : "leaf",
					  ret < 0 ? -1 : ref_level,
					  ret < 0 ? -1 : ref_root);
		} while (ret != 1);
		btrfs_release_path(path);
	} else {
		btrfs_release_path(path);
		swarn.path = path;
		swarn.dev = dev;
		iterate_extent_inodes(fs_info, found_key.objectid,
				      extent_item_pos, 1,
				      scrub_print_warning_inode, &swarn, false);
	}

out:
	btrfs_free_path(path);
}

static inline void scrub_get_recover(struct scrub_recover *recover)
{
	refcount_inc(&recover->refs);
}

static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
				     struct scrub_recover *recover)
{
	if (refcount_dec_and_test(&recover->refs)) {
		btrfs_bio_counter_dec(fs_info);
		btrfs_put_bbio(recover->bbio);
		kfree(recover);
	}
}

/*
 * scrub_handle_errored_block gets called when either verification of the
 * pages failed or the bio failed to read, e.g. with EIO. In the latter
 * case, this function handles all pages in the bio, even though only one
 * may be bad.
 * The goal of this function is to repair the errored block by using the
 * contents of one of the mirrors.
 */
static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
{
	struct scrub_ctx *sctx = sblock_to_check->sctx;
	struct btrfs_device *dev;
	struct btrfs_fs_info *fs_info;
	u64 logical;
	unsigned int failed_mirror_index;
	unsigned int is_metadata;
	unsigned int have_csum;
	struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
	struct scrub_block *sblock_bad;
	int ret;
	int mirror_index;
	int page_num;
	int success;
	bool full_stripe_locked;
	unsigned int nofs_flag;
	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);

	BUG_ON(sblock_to_check->page_count < 1);
	fs_info = sctx->fs_info;
	if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
		/*
		 * if we find an error in a super block, we just report it.
		 * It will get written with the next transaction commit
		 * anyway
		 */
		spin_lock(&sctx->stat_lock);
		++sctx->stat.super_errors;
		spin_unlock(&sctx->stat_lock);
		return 0;
	}
	logical = sblock_to_check->pagev[0]->logical;
	BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
	failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
	is_metadata = !(sblock_to_check->pagev[0]->flags &
			BTRFS_EXTENT_FLAG_DATA);
	have_csum = sblock_to_check->pagev[0]->have_csum;
	dev = sblock_to_check->pagev[0]->dev;

	/*
	 * We must use GFP_NOFS because the scrub task might be waiting for a
	 * worker task executing this function and in turn a transaction commit
	 * might be waiting for the scrub task to pause (which needs to wait
	 * for all the worker tasks to complete before pausing).
	 * We do allocations in the workers through insert_full_stripe_lock()
	 * and scrub_add_page_to_wr_bio(), which happens down the call chain of
	 * this function.
	 */
	nofs_flag = memalloc_nofs_save();
	/*
	 * For RAID5/6, a race can happen with the scrub thread of a different
	 * device: on data corruption, the parity and data threads will both
	 * try to recover the data.
	 * The race can lead to doubly counted csum errors, or even an
	 * unrecoverable error.
	 */
	ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
	if (ret < 0) {
		memalloc_nofs_restore(nofs_flag);
		spin_lock(&sctx->stat_lock);
		if (ret == -ENOMEM)
			sctx->stat.malloc_errors++;
		sctx->stat.read_errors++;
		sctx->stat.uncorrectable_errors++;
		spin_unlock(&sctx->stat_lock);
		return ret;
	}

	/*
	 * read all mirrors one after the other. This includes re-reading
	 * the extent or metadata block that failed (that was
	 * the cause that this fixup code is called) another time,
	 * page by page this time in order to know which pages
	 * caused I/O errors and which ones are good (for all mirrors).
	 * It is the goal to handle the situation when more than one
	 * mirror contains I/O errors, but the errors do not
	 * overlap, i.e. the data can be repaired by selecting the
	 * pages from those mirrors without I/O error on the
	 * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
	 * would be that mirror #1 has an I/O error on the first page,
	 * the second page is good, and mirror #2 has an I/O error on
	 * the second page, but the first page is good.
	 * Then the first page of the first mirror can be repaired by
	 * taking the first page of the second mirror, and the
	 * second page of the second mirror can be repaired by
	 * copying the contents of the 2nd page of the 1st mirror.
	 * One more note: if the pages of one mirror contain I/O
	 * errors, the checksum cannot be verified. In order to get
	 * the best data for repairing, the first attempt is to find
	 * a mirror without I/O errors and with a validated checksum.
	 * Only if this is not possible, the pages are picked from
	 * mirrors with I/O errors without considering the checksum.
	 * If the latter is the case, at the end, the checksum of the
	 * repaired area is verified in order to correctly maintain
	 * the statistics.
	 */

920*4882a593Smuzhiyun sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
921*4882a593Smuzhiyun sizeof(*sblocks_for_recheck), GFP_KERNEL);
922*4882a593Smuzhiyun if (!sblocks_for_recheck) {
923*4882a593Smuzhiyun spin_lock(&sctx->stat_lock);
924*4882a593Smuzhiyun sctx->stat.malloc_errors++;
925*4882a593Smuzhiyun sctx->stat.read_errors++;
926*4882a593Smuzhiyun sctx->stat.uncorrectable_errors++;
927*4882a593Smuzhiyun spin_unlock(&sctx->stat_lock);
928*4882a593Smuzhiyun btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
929*4882a593Smuzhiyun goto out;
930*4882a593Smuzhiyun }
931*4882a593Smuzhiyun
932*4882a593Smuzhiyun /* setup the context, map the logical blocks and alloc the pages */
933*4882a593Smuzhiyun ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
934*4882a593Smuzhiyun if (ret) {
935*4882a593Smuzhiyun spin_lock(&sctx->stat_lock);
936*4882a593Smuzhiyun sctx->stat.read_errors++;
937*4882a593Smuzhiyun sctx->stat.uncorrectable_errors++;
938*4882a593Smuzhiyun spin_unlock(&sctx->stat_lock);
939*4882a593Smuzhiyun btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
940*4882a593Smuzhiyun goto out;
941*4882a593Smuzhiyun }
942*4882a593Smuzhiyun BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
943*4882a593Smuzhiyun sblock_bad = sblocks_for_recheck + failed_mirror_index;
944*4882a593Smuzhiyun
945*4882a593Smuzhiyun /* build and submit the bios for the failed mirror, check checksums */
946*4882a593Smuzhiyun scrub_recheck_block(fs_info, sblock_bad, 1);
947*4882a593Smuzhiyun
948*4882a593Smuzhiyun if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
949*4882a593Smuzhiyun sblock_bad->no_io_error_seen) {
950*4882a593Smuzhiyun /*
951*4882a593Smuzhiyun * the error disappeared after reading page by page, or
952*4882a593Smuzhiyun * the area was part of a huge bio and other parts of the
953*4882a593Smuzhiyun * bio caused I/O errors, or the block layer merged several
954*4882a593Smuzhiyun * read requests into one and the error is caused by a
955*4882a593Smuzhiyun * different bio (usually one of the two latter cases is
956*4882a593Smuzhiyun * the cause)
957*4882a593Smuzhiyun */
958*4882a593Smuzhiyun spin_lock(&sctx->stat_lock);
959*4882a593Smuzhiyun sctx->stat.unverified_errors++;
960*4882a593Smuzhiyun sblock_to_check->data_corrected = 1;
961*4882a593Smuzhiyun spin_unlock(&sctx->stat_lock);
962*4882a593Smuzhiyun
963*4882a593Smuzhiyun if (sctx->is_dev_replace)
964*4882a593Smuzhiyun scrub_write_block_to_dev_replace(sblock_bad);
965*4882a593Smuzhiyun goto out;
966*4882a593Smuzhiyun }
967*4882a593Smuzhiyun
968*4882a593Smuzhiyun if (!sblock_bad->no_io_error_seen) {
969*4882a593Smuzhiyun spin_lock(&sctx->stat_lock);
970*4882a593Smuzhiyun sctx->stat.read_errors++;
971*4882a593Smuzhiyun spin_unlock(&sctx->stat_lock);
972*4882a593Smuzhiyun if (__ratelimit(&rs))
973*4882a593Smuzhiyun scrub_print_warning("i/o error", sblock_to_check);
974*4882a593Smuzhiyun btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
975*4882a593Smuzhiyun } else if (sblock_bad->checksum_error) {
976*4882a593Smuzhiyun spin_lock(&sctx->stat_lock);
977*4882a593Smuzhiyun sctx->stat.csum_errors++;
978*4882a593Smuzhiyun spin_unlock(&sctx->stat_lock);
979*4882a593Smuzhiyun if (__ratelimit(&rs))
980*4882a593Smuzhiyun scrub_print_warning("checksum error", sblock_to_check);
981*4882a593Smuzhiyun btrfs_dev_stat_inc_and_print(dev,
982*4882a593Smuzhiyun BTRFS_DEV_STAT_CORRUPTION_ERRS);
983*4882a593Smuzhiyun } else if (sblock_bad->header_error) {
984*4882a593Smuzhiyun spin_lock(&sctx->stat_lock);
985*4882a593Smuzhiyun sctx->stat.verify_errors++;
986*4882a593Smuzhiyun spin_unlock(&sctx->stat_lock);
987*4882a593Smuzhiyun if (__ratelimit(&rs))
988*4882a593Smuzhiyun scrub_print_warning("checksum/header error",
989*4882a593Smuzhiyun sblock_to_check);
990*4882a593Smuzhiyun if (sblock_bad->generation_error)
991*4882a593Smuzhiyun btrfs_dev_stat_inc_and_print(dev,
992*4882a593Smuzhiyun BTRFS_DEV_STAT_GENERATION_ERRS);
993*4882a593Smuzhiyun else
994*4882a593Smuzhiyun btrfs_dev_stat_inc_and_print(dev,
995*4882a593Smuzhiyun BTRFS_DEV_STAT_CORRUPTION_ERRS);
996*4882a593Smuzhiyun }
997*4882a593Smuzhiyun
998*4882a593Smuzhiyun if (sctx->readonly) {
999*4882a593Smuzhiyun ASSERT(!sctx->is_dev_replace);
1000*4882a593Smuzhiyun goto out;
1001*4882a593Smuzhiyun }
1002*4882a593Smuzhiyun
1003*4882a593Smuzhiyun /*
1004*4882a593Smuzhiyun * now build and submit the bios for the other mirrors, check
1005*4882a593Smuzhiyun * checksums.
1006*4882a593Smuzhiyun * First try to pick the mirror which is completely without I/O
1007*4882a593Smuzhiyun * errors and also does not have a checksum error.
1008*4882a593Smuzhiyun * If one is found, and if a checksum is present, the full block
1009*4882a593Smuzhiyun * that is known to contain an error is rewritten. Afterwards
1010*4882a593Smuzhiyun * the block is known to be corrected.
1011*4882a593Smuzhiyun * If a mirror is found which is completely correct, and no
1012*4882a593Smuzhiyun * checksum is present, only those pages are rewritten that had
1013*4882a593Smuzhiyun * an I/O error in the block to be repaired, since it cannot be
1014*4882a593Smuzhiyun * determined, which copy of the other pages is better (and it
1015*4882a593Smuzhiyun * could happen otherwise that a correct page would be
1016*4882a593Smuzhiyun * overwritten by a bad one).
1017*4882a593Smuzhiyun */
1018*4882a593Smuzhiyun for (mirror_index = 0; ;mirror_index++) {
1019*4882a593Smuzhiyun struct scrub_block *sblock_other;
1020*4882a593Smuzhiyun
1021*4882a593Smuzhiyun if (mirror_index == failed_mirror_index)
1022*4882a593Smuzhiyun continue;
1023*4882a593Smuzhiyun
1024*4882a593Smuzhiyun /* raid56's mirror can be more than BTRFS_MAX_MIRRORS */
1025*4882a593Smuzhiyun if (!scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
1026*4882a593Smuzhiyun if (mirror_index >= BTRFS_MAX_MIRRORS)
1027*4882a593Smuzhiyun break;
1028*4882a593Smuzhiyun if (!sblocks_for_recheck[mirror_index].page_count)
1029*4882a593Smuzhiyun break;
1030*4882a593Smuzhiyun
1031*4882a593Smuzhiyun sblock_other = sblocks_for_recheck + mirror_index;
1032*4882a593Smuzhiyun } else {
1033*4882a593Smuzhiyun struct scrub_recover *r = sblock_bad->pagev[0]->recover;
1034*4882a593Smuzhiyun int max_allowed = r->bbio->num_stripes -
1035*4882a593Smuzhiyun r->bbio->num_tgtdevs;
1036*4882a593Smuzhiyun
1037*4882a593Smuzhiyun if (mirror_index >= max_allowed)
1038*4882a593Smuzhiyun break;
1039*4882a593Smuzhiyun if (!sblocks_for_recheck[1].page_count)
1040*4882a593Smuzhiyun break;
1041*4882a593Smuzhiyun
1042*4882a593Smuzhiyun ASSERT(failed_mirror_index == 0);
1043*4882a593Smuzhiyun sblock_other = sblocks_for_recheck + 1;
1044*4882a593Smuzhiyun sblock_other->pagev[0]->mirror_num = 1 + mirror_index;
1045*4882a593Smuzhiyun }
1046*4882a593Smuzhiyun
1047*4882a593Smuzhiyun /* build and submit the bios, check checksums */
1048*4882a593Smuzhiyun scrub_recheck_block(fs_info, sblock_other, 0);
1049*4882a593Smuzhiyun
1050*4882a593Smuzhiyun if (!sblock_other->header_error &&
1051*4882a593Smuzhiyun !sblock_other->checksum_error &&
1052*4882a593Smuzhiyun sblock_other->no_io_error_seen) {
1053*4882a593Smuzhiyun if (sctx->is_dev_replace) {
1054*4882a593Smuzhiyun scrub_write_block_to_dev_replace(sblock_other);
1055*4882a593Smuzhiyun goto corrected_error;
1056*4882a593Smuzhiyun } else {
1057*4882a593Smuzhiyun ret = scrub_repair_block_from_good_copy(
1058*4882a593Smuzhiyun sblock_bad, sblock_other);
1059*4882a593Smuzhiyun if (!ret)
1060*4882a593Smuzhiyun goto corrected_error;
1061*4882a593Smuzhiyun }
1062*4882a593Smuzhiyun }
1063*4882a593Smuzhiyun }
1064*4882a593Smuzhiyun
1065*4882a593Smuzhiyun if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1066*4882a593Smuzhiyun goto did_not_correct_error;
1067*4882a593Smuzhiyun
1068*4882a593Smuzhiyun /*
1069*4882a593Smuzhiyun * In case of I/O errors in the area that is supposed to be
1070*4882a593Smuzhiyun * repaired, continue by picking good copies of those pages.
1071*4882a593Smuzhiyun * Select the good pages from mirrors to rewrite bad pages from
1072*4882a593Smuzhiyun * the area to fix. Afterwards verify the checksum of the block
1073*4882a593Smuzhiyun * that is supposed to be repaired. This verification step is
1074*4882a593Smuzhiyun * only done for the purpose of statistic counting and for the
1075*4882a593Smuzhiyun * final scrub report, whether errors remain.
1076*4882a593Smuzhiyun * A perfect algorithm could make use of the checksum and try
1077*4882a593Smuzhiyun * all possible combinations of pages from the different mirrors
1078*4882a593Smuzhiyun * until the checksum verification succeeds. For example, when
1079*4882a593Smuzhiyun * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
1080*4882a593Smuzhiyun * of mirror #2 is readable but the final checksum test fails,
1081*4882a593Smuzhiyun * then the 2nd page of mirror #3 could be tried, whether now
1082*4882a593Smuzhiyun * the final checksum succeeds. But this would be a rare
1083*4882a593Smuzhiyun * exception and is therefore not implemented. At least it is
1084*4882a593Smuzhiyun * avoided that the good copy is overwritten.
1085*4882a593Smuzhiyun * A more useful improvement would be to pick the sectors
1086*4882a593Smuzhiyun * without I/O error based on sector sizes (512 bytes on legacy
1087*4882a593Smuzhiyun * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one
1088*4882a593Smuzhiyun * mirror could be repaired by taking 512 byte of a different
1089*4882a593Smuzhiyun * mirror, even if other 512 byte sectors in the same PAGE_SIZE
1090*4882a593Smuzhiyun * area are unreadable.
1091*4882a593Smuzhiyun */
1092*4882a593Smuzhiyun success = 1;
1093*4882a593Smuzhiyun for (page_num = 0; page_num < sblock_bad->page_count;
1094*4882a593Smuzhiyun page_num++) {
1095*4882a593Smuzhiyun struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1096*4882a593Smuzhiyun struct scrub_block *sblock_other = NULL;
1097*4882a593Smuzhiyun
1098*4882a593Smuzhiyun /* skip no-io-error page in scrub */
1099*4882a593Smuzhiyun if (!page_bad->io_error && !sctx->is_dev_replace)
1100*4882a593Smuzhiyun continue;
1101*4882a593Smuzhiyun
1102*4882a593Smuzhiyun if (scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
1103*4882a593Smuzhiyun /*
1104*4882a593Smuzhiyun * In case of dev replace, if raid56 rebuild process
1105*4882a593Smuzhiyun * didn't work out correct data, then copy the content
1106*4882a593Smuzhiyun * in sblock_bad to make sure target device is identical
1107*4882a593Smuzhiyun * to source device, instead of writing garbage data in
1108*4882a593Smuzhiyun * sblock_for_recheck array to target device.
1109*4882a593Smuzhiyun */
1110*4882a593Smuzhiyun sblock_other = NULL;
1111*4882a593Smuzhiyun } else if (page_bad->io_error) {
1112*4882a593Smuzhiyun /* try to find no-io-error page in mirrors */
1113*4882a593Smuzhiyun for (mirror_index = 0;
1114*4882a593Smuzhiyun mirror_index < BTRFS_MAX_MIRRORS &&
1115*4882a593Smuzhiyun sblocks_for_recheck[mirror_index].page_count > 0;
1116*4882a593Smuzhiyun mirror_index++) {
1117*4882a593Smuzhiyun if (!sblocks_for_recheck[mirror_index].
1118*4882a593Smuzhiyun pagev[page_num]->io_error) {
1119*4882a593Smuzhiyun sblock_other = sblocks_for_recheck +
1120*4882a593Smuzhiyun mirror_index;
1121*4882a593Smuzhiyun break;
1122*4882a593Smuzhiyun }
1123*4882a593Smuzhiyun }
1124*4882a593Smuzhiyun if (!sblock_other)
1125*4882a593Smuzhiyun success = 0;
1126*4882a593Smuzhiyun }
1127*4882a593Smuzhiyun
1128*4882a593Smuzhiyun if (sctx->is_dev_replace) {
1129*4882a593Smuzhiyun /*
1130*4882a593Smuzhiyun * If no mirror was found to fetch the page from,
1131*4882a593Smuzhiyun * scrub_write_page_to_dev_replace() handles this
1132*4882a593Smuzhiyun * case (page->io_error) by filling the block
1133*4882a593Smuzhiyun * with zeros before submitting the write
1134*4882a593Smuzhiyun * request.
1135*4882a593Smuzhiyun */
1136*4882a593Smuzhiyun if (!sblock_other)
1137*4882a593Smuzhiyun sblock_other = sblock_bad;
1138*4882a593Smuzhiyun
1139*4882a593Smuzhiyun if (scrub_write_page_to_dev_replace(sblock_other,
1140*4882a593Smuzhiyun page_num) != 0) {
1141*4882a593Smuzhiyun atomic64_inc(
1142*4882a593Smuzhiyun &fs_info->dev_replace.num_write_errors);
1143*4882a593Smuzhiyun success = 0;
1144*4882a593Smuzhiyun }
1145*4882a593Smuzhiyun } else if (sblock_other) {
1146*4882a593Smuzhiyun ret = scrub_repair_page_from_good_copy(sblock_bad,
1147*4882a593Smuzhiyun sblock_other,
1148*4882a593Smuzhiyun page_num, 0);
1149*4882a593Smuzhiyun if (0 == ret)
1150*4882a593Smuzhiyun page_bad->io_error = 0;
1151*4882a593Smuzhiyun else
1152*4882a593Smuzhiyun success = 0;
1153*4882a593Smuzhiyun }
1154*4882a593Smuzhiyun }
1155*4882a593Smuzhiyun
1156*4882a593Smuzhiyun if (success && !sctx->is_dev_replace) {
1157*4882a593Smuzhiyun if (is_metadata || have_csum) {
1158*4882a593Smuzhiyun /*
1159*4882a593Smuzhiyun * need to verify the checksum now that all
1160*4882a593Smuzhiyun * sectors on disk are repaired (the write
1161*4882a593Smuzhiyun * request for data to be repaired is on its way).
1162*4882a593Smuzhiyun * Just be lazy and use scrub_recheck_block()
1163*4882a593Smuzhiyun * which re-reads the data before the checksum
1164*4882a593Smuzhiyun * is verified, but most likely the data comes out
1165*4882a593Smuzhiyun * of the page cache.
1166*4882a593Smuzhiyun */
1167*4882a593Smuzhiyun scrub_recheck_block(fs_info, sblock_bad, 1);
1168*4882a593Smuzhiyun if (!sblock_bad->header_error &&
1169*4882a593Smuzhiyun !sblock_bad->checksum_error &&
1170*4882a593Smuzhiyun sblock_bad->no_io_error_seen)
1171*4882a593Smuzhiyun goto corrected_error;
1172*4882a593Smuzhiyun else
1173*4882a593Smuzhiyun goto did_not_correct_error;
1174*4882a593Smuzhiyun } else {
1175*4882a593Smuzhiyun corrected_error:
1176*4882a593Smuzhiyun spin_lock(&sctx->stat_lock);
1177*4882a593Smuzhiyun sctx->stat.corrected_errors++;
1178*4882a593Smuzhiyun sblock_to_check->data_corrected = 1;
1179*4882a593Smuzhiyun spin_unlock(&sctx->stat_lock);
1180*4882a593Smuzhiyun btrfs_err_rl_in_rcu(fs_info,
1181*4882a593Smuzhiyun "fixed up error at logical %llu on dev %s",
1182*4882a593Smuzhiyun logical, rcu_str_deref(dev->name));
1183*4882a593Smuzhiyun }
1184*4882a593Smuzhiyun } else {
1185*4882a593Smuzhiyun did_not_correct_error:
1186*4882a593Smuzhiyun spin_lock(&sctx->stat_lock);
1187*4882a593Smuzhiyun sctx->stat.uncorrectable_errors++;
1188*4882a593Smuzhiyun spin_unlock(&sctx->stat_lock);
1189*4882a593Smuzhiyun btrfs_err_rl_in_rcu(fs_info,
1190*4882a593Smuzhiyun "unable to fixup (regular) error at logical %llu on dev %s",
1191*4882a593Smuzhiyun logical, rcu_str_deref(dev->name));
1192*4882a593Smuzhiyun }
1193*4882a593Smuzhiyun
1194*4882a593Smuzhiyun out:
1195*4882a593Smuzhiyun if (sblocks_for_recheck) {
1196*4882a593Smuzhiyun for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1197*4882a593Smuzhiyun mirror_index++) {
1198*4882a593Smuzhiyun struct scrub_block *sblock = sblocks_for_recheck +
1199*4882a593Smuzhiyun mirror_index;
1200*4882a593Smuzhiyun struct scrub_recover *recover;
1201*4882a593Smuzhiyun int page_index;
1202*4882a593Smuzhiyun
1203*4882a593Smuzhiyun for (page_index = 0; page_index < sblock->page_count;
1204*4882a593Smuzhiyun page_index++) {
1205*4882a593Smuzhiyun sblock->pagev[page_index]->sblock = NULL;
1206*4882a593Smuzhiyun recover = sblock->pagev[page_index]->recover;
1207*4882a593Smuzhiyun if (recover) {
1208*4882a593Smuzhiyun scrub_put_recover(fs_info, recover);
1209*4882a593Smuzhiyun sblock->pagev[page_index]->recover =
1210*4882a593Smuzhiyun NULL;
1211*4882a593Smuzhiyun }
1212*4882a593Smuzhiyun scrub_page_put(sblock->pagev[page_index]);
1213*4882a593Smuzhiyun }
1214*4882a593Smuzhiyun }
1215*4882a593Smuzhiyun kfree(sblocks_for_recheck);
1216*4882a593Smuzhiyun }
1217*4882a593Smuzhiyun
1218*4882a593Smuzhiyun ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
1219*4882a593Smuzhiyun memalloc_nofs_restore(nofs_flag);
1220*4882a593Smuzhiyun if (ret < 0)
1221*4882a593Smuzhiyun return ret;
1222*4882a593Smuzhiyun return 0;
1223*4882a593Smuzhiyun }
1224*4882a593Smuzhiyun
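/*
 * Number of ways a block can be retrieved for the recheck: RAID5 allows
 * the direct read plus one reconstruction (2), RAID6 allows two
 * reconstructions (3); for the other profiles every stripe is a mirror.
 */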
1225*4882a593Smuzhiyun static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
1226*4882a593Smuzhiyun {
1227*4882a593Smuzhiyun if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
1228*4882a593Smuzhiyun return 2;
1229*4882a593Smuzhiyun else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
1230*4882a593Smuzhiyun return 3;
1231*4882a593Smuzhiyun else
1232*4882a593Smuzhiyun return (int)bbio->num_stripes;
1233*4882a593Smuzhiyun }
1234*4882a593Smuzhiyun
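/*
 * Map a logical address to the stripe index and the offset inside that
 * stripe. For RAID5/6 the raid_map is searched for the data stripe that
 * covers the logical address; for all other profiles the mirror number
 * directly selects the stripe and the offset is 0.
 */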
1235*4882a593Smuzhiyun static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1236*4882a593Smuzhiyun u64 *raid_map,
1237*4882a593Smuzhiyun u64 mapped_length,
1238*4882a593Smuzhiyun int nstripes, int mirror,
1239*4882a593Smuzhiyun int *stripe_index,
1240*4882a593Smuzhiyun u64 *stripe_offset)
1241*4882a593Smuzhiyun {
1242*4882a593Smuzhiyun int i;
1243*4882a593Smuzhiyun
1244*4882a593Smuzhiyun if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1245*4882a593Smuzhiyun /* RAID5/6 */
1246*4882a593Smuzhiyun for (i = 0; i < nstripes; i++) {
1247*4882a593Smuzhiyun if (raid_map[i] == RAID6_Q_STRIPE ||
1248*4882a593Smuzhiyun raid_map[i] == RAID5_P_STRIPE)
1249*4882a593Smuzhiyun continue;
1250*4882a593Smuzhiyun
1251*4882a593Smuzhiyun if (logical >= raid_map[i] &&
1252*4882a593Smuzhiyun logical < raid_map[i] + mapped_length)
1253*4882a593Smuzhiyun break;
1254*4882a593Smuzhiyun }
1255*4882a593Smuzhiyun
1256*4882a593Smuzhiyun *stripe_index = i;
1257*4882a593Smuzhiyun *stripe_offset = logical - raid_map[i];
1258*4882a593Smuzhiyun } else {
1259*4882a593Smuzhiyun /* The other RAID type */
1260*4882a593Smuzhiyun *stripe_index = mirror;
1261*4882a593Smuzhiyun *stripe_offset = 0;
1262*4882a593Smuzhiyun }
1263*4882a593Smuzhiyun }
1264*4882a593Smuzhiyun
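/*
 * Build one scrub_block per mirror covering the same logical range as
 * original_sblock, so that every available copy of the data can be read
 * back and compared. Returns 0 on success or a negative errno.
 */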
1265*4882a593Smuzhiyun static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
1266*4882a593Smuzhiyun struct scrub_block *sblocks_for_recheck)
1267*4882a593Smuzhiyun {
1268*4882a593Smuzhiyun struct scrub_ctx *sctx = original_sblock->sctx;
1269*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = sctx->fs_info;
1270*4882a593Smuzhiyun u64 length = original_sblock->page_count * PAGE_SIZE;
1271*4882a593Smuzhiyun u64 logical = original_sblock->pagev[0]->logical;
1272*4882a593Smuzhiyun u64 generation = original_sblock->pagev[0]->generation;
1273*4882a593Smuzhiyun u64 flags = original_sblock->pagev[0]->flags;
1274*4882a593Smuzhiyun u64 have_csum = original_sblock->pagev[0]->have_csum;
1275*4882a593Smuzhiyun struct scrub_recover *recover;
1276*4882a593Smuzhiyun struct btrfs_bio *bbio;
1277*4882a593Smuzhiyun u64 sublen;
1278*4882a593Smuzhiyun u64 mapped_length;
1279*4882a593Smuzhiyun u64 stripe_offset;
1280*4882a593Smuzhiyun int stripe_index;
1281*4882a593Smuzhiyun int page_index = 0;
1282*4882a593Smuzhiyun int mirror_index;
1283*4882a593Smuzhiyun int nmirrors;
1284*4882a593Smuzhiyun int ret;
1285*4882a593Smuzhiyun
1286*4882a593Smuzhiyun /*
1287*4882a593Smuzhiyun * note: the two members refs and outstanding_pages
1288*4882a593Smuzhiyun * are not used (and not set) in the blocks that are used for
1289*4882a593Smuzhiyun * the recheck procedure
1290*4882a593Smuzhiyun */
1291*4882a593Smuzhiyun
1292*4882a593Smuzhiyun while (length > 0) {
1293*4882a593Smuzhiyun sublen = min_t(u64, length, PAGE_SIZE);
1294*4882a593Smuzhiyun mapped_length = sublen;
1295*4882a593Smuzhiyun bbio = NULL;
1296*4882a593Smuzhiyun
1297*4882a593Smuzhiyun /*
1298*4882a593Smuzhiyun * with a length of PAGE_SIZE, each returned stripe
1299*4882a593Smuzhiyun * represents one mirror
1300*4882a593Smuzhiyun */
1301*4882a593Smuzhiyun btrfs_bio_counter_inc_blocked(fs_info);
1302*4882a593Smuzhiyun ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
1303*4882a593Smuzhiyun logical, &mapped_length, &bbio);
1304*4882a593Smuzhiyun if (ret || !bbio || mapped_length < sublen) {
1305*4882a593Smuzhiyun btrfs_put_bbio(bbio);
1306*4882a593Smuzhiyun btrfs_bio_counter_dec(fs_info);
1307*4882a593Smuzhiyun return -EIO;
1308*4882a593Smuzhiyun }
1309*4882a593Smuzhiyun
1310*4882a593Smuzhiyun recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1311*4882a593Smuzhiyun if (!recover) {
1312*4882a593Smuzhiyun btrfs_put_bbio(bbio);
1313*4882a593Smuzhiyun btrfs_bio_counter_dec(fs_info);
1314*4882a593Smuzhiyun return -ENOMEM;
1315*4882a593Smuzhiyun }
1316*4882a593Smuzhiyun
1317*4882a593Smuzhiyun refcount_set(&recover->refs, 1);
1318*4882a593Smuzhiyun recover->bbio = bbio;
1319*4882a593Smuzhiyun recover->map_length = mapped_length;
1320*4882a593Smuzhiyun
1321*4882a593Smuzhiyun BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK);
1322*4882a593Smuzhiyun
1323*4882a593Smuzhiyun nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
1324*4882a593Smuzhiyun
1325*4882a593Smuzhiyun for (mirror_index = 0; mirror_index < nmirrors;
1326*4882a593Smuzhiyun mirror_index++) {
1327*4882a593Smuzhiyun struct scrub_block *sblock;
1328*4882a593Smuzhiyun struct scrub_page *page;
1329*4882a593Smuzhiyun
1330*4882a593Smuzhiyun sblock = sblocks_for_recheck + mirror_index;
1331*4882a593Smuzhiyun sblock->sctx = sctx;
1332*4882a593Smuzhiyun
1333*4882a593Smuzhiyun page = kzalloc(sizeof(*page), GFP_NOFS);
1334*4882a593Smuzhiyun if (!page) {
1335*4882a593Smuzhiyun leave_nomem:
1336*4882a593Smuzhiyun spin_lock(&sctx->stat_lock);
1337*4882a593Smuzhiyun sctx->stat.malloc_errors++;
1338*4882a593Smuzhiyun spin_unlock(&sctx->stat_lock);
1339*4882a593Smuzhiyun scrub_put_recover(fs_info, recover);
1340*4882a593Smuzhiyun return -ENOMEM;
1341*4882a593Smuzhiyun }
1342*4882a593Smuzhiyun scrub_page_get(page);
1343*4882a593Smuzhiyun sblock->pagev[page_index] = page;
1344*4882a593Smuzhiyun page->sblock = sblock;
1345*4882a593Smuzhiyun page->flags = flags;
1346*4882a593Smuzhiyun page->generation = generation;
1347*4882a593Smuzhiyun page->logical = logical;
1348*4882a593Smuzhiyun page->have_csum = have_csum;
1349*4882a593Smuzhiyun if (have_csum)
1350*4882a593Smuzhiyun memcpy(page->csum,
1351*4882a593Smuzhiyun original_sblock->pagev[0]->csum,
1352*4882a593Smuzhiyun sctx->csum_size);
1353*4882a593Smuzhiyun
1354*4882a593Smuzhiyun scrub_stripe_index_and_offset(logical,
1355*4882a593Smuzhiyun bbio->map_type,
1356*4882a593Smuzhiyun bbio->raid_map,
1357*4882a593Smuzhiyun mapped_length,
1358*4882a593Smuzhiyun bbio->num_stripes -
1359*4882a593Smuzhiyun bbio->num_tgtdevs,
1360*4882a593Smuzhiyun mirror_index,
1361*4882a593Smuzhiyun &stripe_index,
1362*4882a593Smuzhiyun &stripe_offset);
1363*4882a593Smuzhiyun page->physical = bbio->stripes[stripe_index].physical +
1364*4882a593Smuzhiyun stripe_offset;
1365*4882a593Smuzhiyun page->dev = bbio->stripes[stripe_index].dev;
1366*4882a593Smuzhiyun
1367*4882a593Smuzhiyun BUG_ON(page_index >= original_sblock->page_count);
1368*4882a593Smuzhiyun page->physical_for_dev_replace =
1369*4882a593Smuzhiyun original_sblock->pagev[page_index]->
1370*4882a593Smuzhiyun physical_for_dev_replace;
1371*4882a593Smuzhiyun /* for missing devices, dev->bdev is NULL */
1372*4882a593Smuzhiyun page->mirror_num = mirror_index + 1;
1373*4882a593Smuzhiyun sblock->page_count++;
1374*4882a593Smuzhiyun page->page = alloc_page(GFP_NOFS);
1375*4882a593Smuzhiyun if (!page->page)
1376*4882a593Smuzhiyun goto leave_nomem;
1377*4882a593Smuzhiyun
1378*4882a593Smuzhiyun scrub_get_recover(recover);
1379*4882a593Smuzhiyun page->recover = recover;
1380*4882a593Smuzhiyun }
1381*4882a593Smuzhiyun scrub_put_recover(fs_info, recover);
1382*4882a593Smuzhiyun length -= sublen;
1383*4882a593Smuzhiyun logical += sublen;
1384*4882a593Smuzhiyun page_index++;
1385*4882a593Smuzhiyun }
1386*4882a593Smuzhiyun
1387*4882a593Smuzhiyun return 0;
1388*4882a593Smuzhiyun }
1389*4882a593Smuzhiyun
1390*4882a593Smuzhiyun static void scrub_bio_wait_endio(struct bio *bio)
1391*4882a593Smuzhiyun {
1392*4882a593Smuzhiyun complete(bio->bi_private);
1393*4882a593Smuzhiyun }
1394*4882a593Smuzhiyun
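/*
 * Submit a read through the RAID5/6 recovery path and wait synchronously
 * for its completion, returning the resulting bio status as an errno.
 */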
1395*4882a593Smuzhiyun static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1396*4882a593Smuzhiyun struct bio *bio,
1397*4882a593Smuzhiyun struct scrub_page *page)
1398*4882a593Smuzhiyun {
1399*4882a593Smuzhiyun DECLARE_COMPLETION_ONSTACK(done);
1400*4882a593Smuzhiyun int ret;
1401*4882a593Smuzhiyun int mirror_num;
1402*4882a593Smuzhiyun
1403*4882a593Smuzhiyun bio->bi_iter.bi_sector = page->logical >> 9;
1404*4882a593Smuzhiyun bio->bi_private = &done;
1405*4882a593Smuzhiyun bio->bi_end_io = scrub_bio_wait_endio;
1406*4882a593Smuzhiyun
1407*4882a593Smuzhiyun mirror_num = page->sblock->pagev[0]->mirror_num;
1408*4882a593Smuzhiyun ret = raid56_parity_recover(fs_info, bio, page->recover->bbio,
1409*4882a593Smuzhiyun page->recover->map_length,
1410*4882a593Smuzhiyun mirror_num, 0);
1411*4882a593Smuzhiyun if (ret)
1412*4882a593Smuzhiyun return ret;
1413*4882a593Smuzhiyun
1414*4882a593Smuzhiyun wait_for_completion_io(&done);
1415*4882a593Smuzhiyun return blk_status_to_errno(bio->bi_status);
1416*4882a593Smuzhiyun }
1417*4882a593Smuzhiyun
1418*4882a593Smuzhiyun static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
1419*4882a593Smuzhiyun struct scrub_block *sblock)
1420*4882a593Smuzhiyun {
1421*4882a593Smuzhiyun struct scrub_page *first_page = sblock->pagev[0];
1422*4882a593Smuzhiyun struct bio *bio;
1423*4882a593Smuzhiyun int page_num;
1424*4882a593Smuzhiyun
1425*4882a593Smuzhiyun /* All pages in sblock belong to the same stripe on the same device. */
1426*4882a593Smuzhiyun ASSERT(first_page->dev);
1427*4882a593Smuzhiyun if (!first_page->dev->bdev)
1428*4882a593Smuzhiyun goto out;
1429*4882a593Smuzhiyun
1430*4882a593Smuzhiyun bio = btrfs_io_bio_alloc(BIO_MAX_PAGES);
1431*4882a593Smuzhiyun bio_set_dev(bio, first_page->dev->bdev);
1432*4882a593Smuzhiyun
1433*4882a593Smuzhiyun for (page_num = 0; page_num < sblock->page_count; page_num++) {
1434*4882a593Smuzhiyun struct scrub_page *page = sblock->pagev[page_num];
1435*4882a593Smuzhiyun
1436*4882a593Smuzhiyun WARN_ON(!page->page);
1437*4882a593Smuzhiyun bio_add_page(bio, page->page, PAGE_SIZE, 0);
1438*4882a593Smuzhiyun }
1439*4882a593Smuzhiyun
1440*4882a593Smuzhiyun if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) {
1441*4882a593Smuzhiyun bio_put(bio);
1442*4882a593Smuzhiyun goto out;
1443*4882a593Smuzhiyun }
1444*4882a593Smuzhiyun
1445*4882a593Smuzhiyun bio_put(bio);
1446*4882a593Smuzhiyun
1447*4882a593Smuzhiyun scrub_recheck_block_checksum(sblock);
1448*4882a593Smuzhiyun
1449*4882a593Smuzhiyun return;
1450*4882a593Smuzhiyun out:
1451*4882a593Smuzhiyun for (page_num = 0; page_num < sblock->page_count; page_num++)
1452*4882a593Smuzhiyun sblock->pagev[page_num]->io_error = 1;
1453*4882a593Smuzhiyun
1454*4882a593Smuzhiyun sblock->no_io_error_seen = 0;
1455*4882a593Smuzhiyun }
1456*4882a593Smuzhiyun
1457*4882a593Smuzhiyun /*
1458*4882a593Smuzhiyun * This function checks the on-disk data for checksum errors, header
1459*4882a593Smuzhiyun * errors and read I/O errors. If any I/O error happens, the exact pages
1460*4882a593Smuzhiyun * that had the error are marked as bad. The goal is to enable scrub
1461*4882a593Smuzhiyun * to take the pages that are not errored from all the mirrors so that
1462*4882a593Smuzhiyun * the pages that are errored in the just-handled mirror can be repaired.
1463*4882a593Smuzhiyun */
1464*4882a593Smuzhiyun static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1465*4882a593Smuzhiyun struct scrub_block *sblock,
1466*4882a593Smuzhiyun int retry_failed_mirror)
1467*4882a593Smuzhiyun {
1468*4882a593Smuzhiyun int page_num;
1469*4882a593Smuzhiyun
1470*4882a593Smuzhiyun sblock->no_io_error_seen = 1;
1471*4882a593Smuzhiyun
1472*4882a593Smuzhiyun /* short cut for raid56 */
1473*4882a593Smuzhiyun if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->pagev[0]))
1474*4882a593Smuzhiyun return scrub_recheck_block_on_raid56(fs_info, sblock);
1475*4882a593Smuzhiyun
1476*4882a593Smuzhiyun for (page_num = 0; page_num < sblock->page_count; page_num++) {
1477*4882a593Smuzhiyun struct bio *bio;
1478*4882a593Smuzhiyun struct scrub_page *page = sblock->pagev[page_num];
1479*4882a593Smuzhiyun
1480*4882a593Smuzhiyun if (page->dev->bdev == NULL) {
1481*4882a593Smuzhiyun page->io_error = 1;
1482*4882a593Smuzhiyun sblock->no_io_error_seen = 0;
1483*4882a593Smuzhiyun continue;
1484*4882a593Smuzhiyun }
1485*4882a593Smuzhiyun
1486*4882a593Smuzhiyun WARN_ON(!page->page);
1487*4882a593Smuzhiyun bio = btrfs_io_bio_alloc(1);
1488*4882a593Smuzhiyun bio_set_dev(bio, page->dev->bdev);
1489*4882a593Smuzhiyun
1490*4882a593Smuzhiyun bio_add_page(bio, page->page, PAGE_SIZE, 0);
1491*4882a593Smuzhiyun bio->bi_iter.bi_sector = page->physical >> 9;
1492*4882a593Smuzhiyun bio->bi_opf = REQ_OP_READ;
1493*4882a593Smuzhiyun
1494*4882a593Smuzhiyun if (btrfsic_submit_bio_wait(bio)) {
1495*4882a593Smuzhiyun page->io_error = 1;
1496*4882a593Smuzhiyun sblock->no_io_error_seen = 0;
1497*4882a593Smuzhiyun }
1498*4882a593Smuzhiyun
1499*4882a593Smuzhiyun bio_put(bio);
1500*4882a593Smuzhiyun }
1501*4882a593Smuzhiyun
1502*4882a593Smuzhiyun if (sblock->no_io_error_seen)
1503*4882a593Smuzhiyun scrub_recheck_block_checksum(sblock);
1504*4882a593Smuzhiyun }
1505*4882a593Smuzhiyun
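/* Returns 1 if the given fsid matches the fsid of the page's device. */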
1506*4882a593Smuzhiyun static inline int scrub_check_fsid(u8 fsid[],
1507*4882a593Smuzhiyun struct scrub_page *spage)
1508*4882a593Smuzhiyun {
1509*4882a593Smuzhiyun struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
1510*4882a593Smuzhiyun int ret;
1511*4882a593Smuzhiyun
1512*4882a593Smuzhiyun ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1513*4882a593Smuzhiyun return !ret;
1514*4882a593Smuzhiyun }
1515*4882a593Smuzhiyun
1516*4882a593Smuzhiyun static void scrub_recheck_block_checksum(struct scrub_block *sblock)
1517*4882a593Smuzhiyun {
1518*4882a593Smuzhiyun sblock->header_error = 0;
1519*4882a593Smuzhiyun sblock->checksum_error = 0;
1520*4882a593Smuzhiyun sblock->generation_error = 0;
1521*4882a593Smuzhiyun
1522*4882a593Smuzhiyun if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA)
1523*4882a593Smuzhiyun scrub_checksum_data(sblock);
1524*4882a593Smuzhiyun else
1525*4882a593Smuzhiyun scrub_checksum_tree_block(sblock);
1526*4882a593Smuzhiyun }
1527*4882a593Smuzhiyun
1528*4882a593Smuzhiyun static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1529*4882a593Smuzhiyun struct scrub_block *sblock_good)
1530*4882a593Smuzhiyun {
1531*4882a593Smuzhiyun int page_num;
1532*4882a593Smuzhiyun int ret = 0;
1533*4882a593Smuzhiyun
1534*4882a593Smuzhiyun for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1535*4882a593Smuzhiyun int ret_sub;
1536*4882a593Smuzhiyun
1537*4882a593Smuzhiyun ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1538*4882a593Smuzhiyun sblock_good,
1539*4882a593Smuzhiyun page_num, 1);
1540*4882a593Smuzhiyun if (ret_sub)
1541*4882a593Smuzhiyun ret = ret_sub;
1542*4882a593Smuzhiyun }
1543*4882a593Smuzhiyun
1544*4882a593Smuzhiyun return ret;
1545*4882a593Smuzhiyun }
1546*4882a593Smuzhiyun
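/*
 * Overwrite one page of the bad block on disk with the corresponding
 * page of the good block. The write is only issued if it is forced or
 * if a header, checksum or I/O error was detected for the bad block.
 */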
1547*4882a593Smuzhiyun static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1548*4882a593Smuzhiyun struct scrub_block *sblock_good,
1549*4882a593Smuzhiyun int page_num, int force_write)
1550*4882a593Smuzhiyun {
1551*4882a593Smuzhiyun struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1552*4882a593Smuzhiyun struct scrub_page *page_good = sblock_good->pagev[page_num];
1553*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
1554*4882a593Smuzhiyun
1555*4882a593Smuzhiyun BUG_ON(page_bad->page == NULL);
1556*4882a593Smuzhiyun BUG_ON(page_good->page == NULL);
1557*4882a593Smuzhiyun if (force_write || sblock_bad->header_error ||
1558*4882a593Smuzhiyun sblock_bad->checksum_error || page_bad->io_error) {
1559*4882a593Smuzhiyun struct bio *bio;
1560*4882a593Smuzhiyun int ret;
1561*4882a593Smuzhiyun
1562*4882a593Smuzhiyun if (!page_bad->dev->bdev) {
1563*4882a593Smuzhiyun btrfs_warn_rl(fs_info,
1564*4882a593Smuzhiyun "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
1565*4882a593Smuzhiyun return -EIO;
1566*4882a593Smuzhiyun }
1567*4882a593Smuzhiyun
1568*4882a593Smuzhiyun bio = btrfs_io_bio_alloc(1);
1569*4882a593Smuzhiyun bio_set_dev(bio, page_bad->dev->bdev);
1570*4882a593Smuzhiyun bio->bi_iter.bi_sector = page_bad->physical >> 9;
1571*4882a593Smuzhiyun bio->bi_opf = REQ_OP_WRITE;
1572*4882a593Smuzhiyun
1573*4882a593Smuzhiyun ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
1574*4882a593Smuzhiyun if (PAGE_SIZE != ret) {
1575*4882a593Smuzhiyun bio_put(bio);
1576*4882a593Smuzhiyun return -EIO;
1577*4882a593Smuzhiyun }
1578*4882a593Smuzhiyun
1579*4882a593Smuzhiyun if (btrfsic_submit_bio_wait(bio)) {
1580*4882a593Smuzhiyun btrfs_dev_stat_inc_and_print(page_bad->dev,
1581*4882a593Smuzhiyun BTRFS_DEV_STAT_WRITE_ERRS);
1582*4882a593Smuzhiyun atomic64_inc(&fs_info->dev_replace.num_write_errors);
1583*4882a593Smuzhiyun bio_put(bio);
1584*4882a593Smuzhiyun return -EIO;
1585*4882a593Smuzhiyun }
1586*4882a593Smuzhiyun bio_put(bio);
1587*4882a593Smuzhiyun }
1588*4882a593Smuzhiyun
1589*4882a593Smuzhiyun return 0;
1590*4882a593Smuzhiyun }
1591*4882a593Smuzhiyun
1592*4882a593Smuzhiyun static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1593*4882a593Smuzhiyun {
1594*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
1595*4882a593Smuzhiyun int page_num;
1596*4882a593Smuzhiyun
1597*4882a593Smuzhiyun /*
1598*4882a593Smuzhiyun * This block is used for the check of the parity on the source device,
1599*4882a593Smuzhiyun * so the data needn't be written into the destination device.
1600*4882a593Smuzhiyun */
1601*4882a593Smuzhiyun if (sblock->sparity)
1602*4882a593Smuzhiyun return;
1603*4882a593Smuzhiyun
1604*4882a593Smuzhiyun for (page_num = 0; page_num < sblock->page_count; page_num++) {
1605*4882a593Smuzhiyun int ret;
1606*4882a593Smuzhiyun
1607*4882a593Smuzhiyun ret = scrub_write_page_to_dev_replace(sblock, page_num);
1608*4882a593Smuzhiyun if (ret)
1609*4882a593Smuzhiyun atomic64_inc(&fs_info->dev_replace.num_write_errors);
1610*4882a593Smuzhiyun }
1611*4882a593Smuzhiyun }
1612*4882a593Smuzhiyun
1613*4882a593Smuzhiyun static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1614*4882a593Smuzhiyun int page_num)
1615*4882a593Smuzhiyun {
1616*4882a593Smuzhiyun struct scrub_page *spage = sblock->pagev[page_num];
1617*4882a593Smuzhiyun
1618*4882a593Smuzhiyun BUG_ON(spage->page == NULL);
1619*4882a593Smuzhiyun if (spage->io_error)
1620*4882a593Smuzhiyun clear_page(page_address(spage->page));
1621*4882a593Smuzhiyun
1622*4882a593Smuzhiyun return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1623*4882a593Smuzhiyun }
1624*4882a593Smuzhiyun
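/*
 * Queue a page for writing to the dev-replace target device. Pages are
 * batched into the current write bio; the bio is submitted when it is
 * full or when the next page is not physically/logically contiguous.
 */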
1625*4882a593Smuzhiyun static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1626*4882a593Smuzhiyun struct scrub_page *spage)
1627*4882a593Smuzhiyun {
1628*4882a593Smuzhiyun struct scrub_bio *sbio;
1629*4882a593Smuzhiyun int ret;
1630*4882a593Smuzhiyun
1631*4882a593Smuzhiyun mutex_lock(&sctx->wr_lock);
1632*4882a593Smuzhiyun again:
1633*4882a593Smuzhiyun if (!sctx->wr_curr_bio) {
1634*4882a593Smuzhiyun sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
1635*4882a593Smuzhiyun GFP_KERNEL);
1636*4882a593Smuzhiyun if (!sctx->wr_curr_bio) {
1637*4882a593Smuzhiyun mutex_unlock(&sctx->wr_lock);
1638*4882a593Smuzhiyun return -ENOMEM;
1639*4882a593Smuzhiyun }
1640*4882a593Smuzhiyun sctx->wr_curr_bio->sctx = sctx;
1641*4882a593Smuzhiyun sctx->wr_curr_bio->page_count = 0;
1642*4882a593Smuzhiyun }
1643*4882a593Smuzhiyun sbio = sctx->wr_curr_bio;
1644*4882a593Smuzhiyun if (sbio->page_count == 0) {
1645*4882a593Smuzhiyun struct bio *bio;
1646*4882a593Smuzhiyun
1647*4882a593Smuzhiyun sbio->physical = spage->physical_for_dev_replace;
1648*4882a593Smuzhiyun sbio->logical = spage->logical;
1649*4882a593Smuzhiyun sbio->dev = sctx->wr_tgtdev;
1650*4882a593Smuzhiyun bio = sbio->bio;
1651*4882a593Smuzhiyun if (!bio) {
1652*4882a593Smuzhiyun bio = btrfs_io_bio_alloc(sctx->pages_per_wr_bio);
1653*4882a593Smuzhiyun sbio->bio = bio;
1654*4882a593Smuzhiyun }
1655*4882a593Smuzhiyun
1656*4882a593Smuzhiyun bio->bi_private = sbio;
1657*4882a593Smuzhiyun bio->bi_end_io = scrub_wr_bio_end_io;
1658*4882a593Smuzhiyun bio_set_dev(bio, sbio->dev->bdev);
1659*4882a593Smuzhiyun bio->bi_iter.bi_sector = sbio->physical >> 9;
1660*4882a593Smuzhiyun bio->bi_opf = REQ_OP_WRITE;
1661*4882a593Smuzhiyun sbio->status = 0;
1662*4882a593Smuzhiyun } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1663*4882a593Smuzhiyun spage->physical_for_dev_replace ||
1664*4882a593Smuzhiyun sbio->logical + sbio->page_count * PAGE_SIZE !=
1665*4882a593Smuzhiyun spage->logical) {
1666*4882a593Smuzhiyun scrub_wr_submit(sctx);
1667*4882a593Smuzhiyun goto again;
1668*4882a593Smuzhiyun }
1669*4882a593Smuzhiyun
1670*4882a593Smuzhiyun ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1671*4882a593Smuzhiyun if (ret != PAGE_SIZE) {
1672*4882a593Smuzhiyun if (sbio->page_count < 1) {
1673*4882a593Smuzhiyun bio_put(sbio->bio);
1674*4882a593Smuzhiyun sbio->bio = NULL;
1675*4882a593Smuzhiyun mutex_unlock(&sctx->wr_lock);
1676*4882a593Smuzhiyun return -EIO;
1677*4882a593Smuzhiyun }
1678*4882a593Smuzhiyun scrub_wr_submit(sctx);
1679*4882a593Smuzhiyun goto again;
1680*4882a593Smuzhiyun }
1681*4882a593Smuzhiyun
1682*4882a593Smuzhiyun sbio->pagev[sbio->page_count] = spage;
1683*4882a593Smuzhiyun scrub_page_get(spage);
1684*4882a593Smuzhiyun sbio->page_count++;
1685*4882a593Smuzhiyun if (sbio->page_count == sctx->pages_per_wr_bio)
1686*4882a593Smuzhiyun scrub_wr_submit(sctx);
1687*4882a593Smuzhiyun mutex_unlock(&sctx->wr_lock);
1688*4882a593Smuzhiyun
1689*4882a593Smuzhiyun return 0;
1690*4882a593Smuzhiyun }
1691*4882a593Smuzhiyun
1692*4882a593Smuzhiyun static void scrub_wr_submit(struct scrub_ctx *sctx)
1693*4882a593Smuzhiyun {
1694*4882a593Smuzhiyun struct scrub_bio *sbio;
1695*4882a593Smuzhiyun
1696*4882a593Smuzhiyun if (!sctx->wr_curr_bio)
1697*4882a593Smuzhiyun return;
1698*4882a593Smuzhiyun
1699*4882a593Smuzhiyun sbio = sctx->wr_curr_bio;
1700*4882a593Smuzhiyun sctx->wr_curr_bio = NULL;
1701*4882a593Smuzhiyun WARN_ON(!sbio->bio->bi_disk);
1702*4882a593Smuzhiyun scrub_pending_bio_inc(sctx);
1703*4882a593Smuzhiyun /* Process all writes in a single worker thread. Then the block layer
1704*4882a593Smuzhiyun * orders the requests before sending them to the driver, which
1705*4882a593Smuzhiyun * doubled the write performance on spinning disks when measured
1706*4882a593Smuzhiyun * with Linux 3.5 */
1707*4882a593Smuzhiyun btrfsic_submit_bio(sbio->bio);
1708*4882a593Smuzhiyun }
1709*4882a593Smuzhiyun
1710*4882a593Smuzhiyun static void scrub_wr_bio_end_io(struct bio *bio)
1711*4882a593Smuzhiyun {
1712*4882a593Smuzhiyun struct scrub_bio *sbio = bio->bi_private;
1713*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
1714*4882a593Smuzhiyun
1715*4882a593Smuzhiyun sbio->status = bio->bi_status;
1716*4882a593Smuzhiyun sbio->bio = bio;
1717*4882a593Smuzhiyun
1718*4882a593Smuzhiyun btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);
1719*4882a593Smuzhiyun btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1720*4882a593Smuzhiyun }
1721*4882a593Smuzhiyun
1722*4882a593Smuzhiyun static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
1723*4882a593Smuzhiyun {
1724*4882a593Smuzhiyun struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1725*4882a593Smuzhiyun struct scrub_ctx *sctx = sbio->sctx;
1726*4882a593Smuzhiyun int i;
1727*4882a593Smuzhiyun
1728*4882a593Smuzhiyun WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
1729*4882a593Smuzhiyun if (sbio->status) {
1730*4882a593Smuzhiyun struct btrfs_dev_replace *dev_replace =
1731*4882a593Smuzhiyun &sbio->sctx->fs_info->dev_replace;
1732*4882a593Smuzhiyun
1733*4882a593Smuzhiyun for (i = 0; i < sbio->page_count; i++) {
1734*4882a593Smuzhiyun struct scrub_page *spage = sbio->pagev[i];
1735*4882a593Smuzhiyun
1736*4882a593Smuzhiyun spage->io_error = 1;
1737*4882a593Smuzhiyun atomic64_inc(&dev_replace->num_write_errors);
1738*4882a593Smuzhiyun }
1739*4882a593Smuzhiyun }
1740*4882a593Smuzhiyun
1741*4882a593Smuzhiyun for (i = 0; i < sbio->page_count; i++)
1742*4882a593Smuzhiyun scrub_page_put(sbio->pagev[i]);
1743*4882a593Smuzhiyun
1744*4882a593Smuzhiyun bio_put(sbio->bio);
1745*4882a593Smuzhiyun kfree(sbio);
1746*4882a593Smuzhiyun scrub_pending_bio_dec(sctx);
1747*4882a593Smuzhiyun }
1748*4882a593Smuzhiyun
1749*4882a593Smuzhiyun static int scrub_checksum(struct scrub_block *sblock)
1750*4882a593Smuzhiyun {
1751*4882a593Smuzhiyun u64 flags;
1752*4882a593Smuzhiyun int ret;
1753*4882a593Smuzhiyun
1754*4882a593Smuzhiyun /*
1755*4882a593Smuzhiyun * No need to initialize these stats currently,
1756*4882a593Smuzhiyun * because this function only uses the return value
1757*4882a593Smuzhiyun * instead of these stat values.
1758*4882a593Smuzhiyun *
1759*4882a593Smuzhiyun * Todo:
1760*4882a593Smuzhiyun * always use stats
1761*4882a593Smuzhiyun */
1762*4882a593Smuzhiyun sblock->header_error = 0;
1763*4882a593Smuzhiyun sblock->generation_error = 0;
1764*4882a593Smuzhiyun sblock->checksum_error = 0;
1765*4882a593Smuzhiyun
1766*4882a593Smuzhiyun WARN_ON(sblock->page_count < 1);
1767*4882a593Smuzhiyun flags = sblock->pagev[0]->flags;
1768*4882a593Smuzhiyun ret = 0;
1769*4882a593Smuzhiyun if (flags & BTRFS_EXTENT_FLAG_DATA)
1770*4882a593Smuzhiyun ret = scrub_checksum_data(sblock);
1771*4882a593Smuzhiyun else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1772*4882a593Smuzhiyun ret = scrub_checksum_tree_block(sblock);
1773*4882a593Smuzhiyun else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1774*4882a593Smuzhiyun (void)scrub_checksum_super(sblock);
1775*4882a593Smuzhiyun else
1776*4882a593Smuzhiyun WARN_ON(1);
1777*4882a593Smuzhiyun if (ret)
1778*4882a593Smuzhiyun scrub_handle_errored_block(sblock);
1779*4882a593Smuzhiyun
1780*4882a593Smuzhiyun return ret;
1781*4882a593Smuzhiyun }
1782*4882a593Smuzhiyun
1783*4882a593Smuzhiyun static int scrub_checksum_data(struct scrub_block *sblock)
1784*4882a593Smuzhiyun {
1785*4882a593Smuzhiyun struct scrub_ctx *sctx = sblock->sctx;
1786*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = sctx->fs_info;
1787*4882a593Smuzhiyun SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1788*4882a593Smuzhiyun u8 csum[BTRFS_CSUM_SIZE];
1789*4882a593Smuzhiyun struct scrub_page *spage;
1790*4882a593Smuzhiyun char *kaddr;
1791*4882a593Smuzhiyun
1792*4882a593Smuzhiyun BUG_ON(sblock->page_count < 1);
1793*4882a593Smuzhiyun spage = sblock->pagev[0];
1794*4882a593Smuzhiyun if (!spage->have_csum)
1795*4882a593Smuzhiyun return 0;
1796*4882a593Smuzhiyun
1797*4882a593Smuzhiyun kaddr = page_address(spage->page);
1798*4882a593Smuzhiyun
1799*4882a593Smuzhiyun shash->tfm = fs_info->csum_shash;
1800*4882a593Smuzhiyun crypto_shash_init(shash);
1801*4882a593Smuzhiyun crypto_shash_digest(shash, kaddr, PAGE_SIZE, csum);
1802*4882a593Smuzhiyun
1803*4882a593Smuzhiyun if (memcmp(csum, spage->csum, sctx->csum_size))
1804*4882a593Smuzhiyun sblock->checksum_error = 1;
1805*4882a593Smuzhiyun
1806*4882a593Smuzhiyun return sblock->checksum_error;
1807*4882a593Smuzhiyun }
1808*4882a593Smuzhiyun
1809*4882a593Smuzhiyun static int scrub_checksum_tree_block(struct scrub_block *sblock)
1810*4882a593Smuzhiyun {
1811*4882a593Smuzhiyun struct scrub_ctx *sctx = sblock->sctx;
1812*4882a593Smuzhiyun struct btrfs_header *h;
1813*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = sctx->fs_info;
1814*4882a593Smuzhiyun SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1815*4882a593Smuzhiyun u8 calculated_csum[BTRFS_CSUM_SIZE];
1816*4882a593Smuzhiyun u8 on_disk_csum[BTRFS_CSUM_SIZE];
1817*4882a593Smuzhiyun const int num_pages = sctx->fs_info->nodesize >> PAGE_SHIFT;
1818*4882a593Smuzhiyun int i;
1819*4882a593Smuzhiyun struct scrub_page *spage;
1820*4882a593Smuzhiyun char *kaddr;
1821*4882a593Smuzhiyun
1822*4882a593Smuzhiyun BUG_ON(sblock->page_count < 1);
1823*4882a593Smuzhiyun spage = sblock->pagev[0];
1824*4882a593Smuzhiyun kaddr = page_address(spage->page);
1825*4882a593Smuzhiyun h = (struct btrfs_header *)kaddr;
1826*4882a593Smuzhiyun memcpy(on_disk_csum, h->csum, sctx->csum_size);
1827*4882a593Smuzhiyun
1828*4882a593Smuzhiyun /*
1829*4882a593Smuzhiyun * we don't use the getter functions here, as we
1830*4882a593Smuzhiyun * a) don't have an extent buffer and
1831*4882a593Smuzhiyun * b) the page is already kmapped
1832*4882a593Smuzhiyun */
1833*4882a593Smuzhiyun if (spage->logical != btrfs_stack_header_bytenr(h))
1834*4882a593Smuzhiyun sblock->header_error = 1;
1835*4882a593Smuzhiyun
1836*4882a593Smuzhiyun if (spage->generation != btrfs_stack_header_generation(h)) {
1837*4882a593Smuzhiyun sblock->header_error = 1;
1838*4882a593Smuzhiyun sblock->generation_error = 1;
1839*4882a593Smuzhiyun }
1840*4882a593Smuzhiyun
1841*4882a593Smuzhiyun if (!scrub_check_fsid(h->fsid, spage))
1842*4882a593Smuzhiyun sblock->header_error = 1;
1843*4882a593Smuzhiyun
1844*4882a593Smuzhiyun if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1845*4882a593Smuzhiyun BTRFS_UUID_SIZE))
1846*4882a593Smuzhiyun sblock->header_error = 1;
1847*4882a593Smuzhiyun
1848*4882a593Smuzhiyun shash->tfm = fs_info->csum_shash;
1849*4882a593Smuzhiyun crypto_shash_init(shash);
1850*4882a593Smuzhiyun crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
1851*4882a593Smuzhiyun PAGE_SIZE - BTRFS_CSUM_SIZE);
1852*4882a593Smuzhiyun
1853*4882a593Smuzhiyun for (i = 1; i < num_pages; i++) {
1854*4882a593Smuzhiyun kaddr = page_address(sblock->pagev[i]->page);
1855*4882a593Smuzhiyun crypto_shash_update(shash, kaddr, PAGE_SIZE);
1856*4882a593Smuzhiyun }
1857*4882a593Smuzhiyun
1858*4882a593Smuzhiyun crypto_shash_final(shash, calculated_csum);
1859*4882a593Smuzhiyun if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1860*4882a593Smuzhiyun sblock->checksum_error = 1;
1861*4882a593Smuzhiyun
1862*4882a593Smuzhiyun return sblock->header_error || sblock->checksum_error;
1863*4882a593Smuzhiyun }
1864*4882a593Smuzhiyun
1865*4882a593Smuzhiyun static int scrub_checksum_super(struct scrub_block *sblock)
1866*4882a593Smuzhiyun {
1867*4882a593Smuzhiyun struct btrfs_super_block *s;
1868*4882a593Smuzhiyun struct scrub_ctx *sctx = sblock->sctx;
1869*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = sctx->fs_info;
1870*4882a593Smuzhiyun SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1871*4882a593Smuzhiyun u8 calculated_csum[BTRFS_CSUM_SIZE];
1872*4882a593Smuzhiyun struct scrub_page *spage;
1873*4882a593Smuzhiyun char *kaddr;
1874*4882a593Smuzhiyun int fail_gen = 0;
1875*4882a593Smuzhiyun int fail_cor = 0;
1876*4882a593Smuzhiyun
1877*4882a593Smuzhiyun BUG_ON(sblock->page_count < 1);
1878*4882a593Smuzhiyun spage = sblock->pagev[0];
1879*4882a593Smuzhiyun kaddr = page_address(spage->page);
1880*4882a593Smuzhiyun s = (struct btrfs_super_block *)kaddr;
1881*4882a593Smuzhiyun
1882*4882a593Smuzhiyun if (spage->logical != btrfs_super_bytenr(s))
1883*4882a593Smuzhiyun ++fail_cor;
1884*4882a593Smuzhiyun
1885*4882a593Smuzhiyun if (spage->generation != btrfs_super_generation(s))
1886*4882a593Smuzhiyun ++fail_gen;
1887*4882a593Smuzhiyun
1888*4882a593Smuzhiyun if (!scrub_check_fsid(s->fsid, spage))
1889*4882a593Smuzhiyun ++fail_cor;
1890*4882a593Smuzhiyun
1891*4882a593Smuzhiyun shash->tfm = fs_info->csum_shash;
1892*4882a593Smuzhiyun crypto_shash_init(shash);
1893*4882a593Smuzhiyun crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
1894*4882a593Smuzhiyun BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
1895*4882a593Smuzhiyun
1896*4882a593Smuzhiyun if (memcmp(calculated_csum, s->csum, sctx->csum_size))
1897*4882a593Smuzhiyun ++fail_cor;
1898*4882a593Smuzhiyun
1899*4882a593Smuzhiyun if (fail_cor + fail_gen) {
1900*4882a593Smuzhiyun /*
1901*4882a593Smuzhiyun * If we find an error in a super block, we just report it.
1902*4882a593Smuzhiyun * Super blocks will get rewritten with the next transaction
1903*4882a593Smuzhiyun * commit anyway.
1904*4882a593Smuzhiyun */
1905*4882a593Smuzhiyun spin_lock(&sctx->stat_lock);
1906*4882a593Smuzhiyun ++sctx->stat.super_errors;
1907*4882a593Smuzhiyun spin_unlock(&sctx->stat_lock);
1908*4882a593Smuzhiyun if (fail_cor)
1909*4882a593Smuzhiyun btrfs_dev_stat_inc_and_print(spage->dev,
1910*4882a593Smuzhiyun BTRFS_DEV_STAT_CORRUPTION_ERRS);
1911*4882a593Smuzhiyun else
1912*4882a593Smuzhiyun btrfs_dev_stat_inc_and_print(spage->dev,
1913*4882a593Smuzhiyun BTRFS_DEV_STAT_GENERATION_ERRS);
1914*4882a593Smuzhiyun }
1915*4882a593Smuzhiyun
1916*4882a593Smuzhiyun return fail_cor + fail_gen;
1917*4882a593Smuzhiyun }
1918*4882a593Smuzhiyun
1919*4882a593Smuzhiyun static void scrub_block_get(struct scrub_block *sblock)
1920*4882a593Smuzhiyun {
1921*4882a593Smuzhiyun refcount_inc(&sblock->refs);
1922*4882a593Smuzhiyun }
1923*4882a593Smuzhiyun
1924*4882a593Smuzhiyun static void scrub_block_put(struct scrub_block *sblock)
1925*4882a593Smuzhiyun {
1926*4882a593Smuzhiyun if (refcount_dec_and_test(&sblock->refs)) {
1927*4882a593Smuzhiyun int i;
1928*4882a593Smuzhiyun
1929*4882a593Smuzhiyun if (sblock->sparity)
1930*4882a593Smuzhiyun scrub_parity_put(sblock->sparity);
1931*4882a593Smuzhiyun
1932*4882a593Smuzhiyun for (i = 0; i < sblock->page_count; i++)
1933*4882a593Smuzhiyun scrub_page_put(sblock->pagev[i]);
1934*4882a593Smuzhiyun kfree(sblock);
1935*4882a593Smuzhiyun }
1936*4882a593Smuzhiyun }
1937*4882a593Smuzhiyun
1938*4882a593Smuzhiyun static void scrub_page_get(struct scrub_page *spage)
1939*4882a593Smuzhiyun {
1940*4882a593Smuzhiyun atomic_inc(&spage->refs);
1941*4882a593Smuzhiyun }
1942*4882a593Smuzhiyun
1943*4882a593Smuzhiyun static void scrub_page_put(struct scrub_page *spage)
1944*4882a593Smuzhiyun {
1945*4882a593Smuzhiyun if (atomic_dec_and_test(&spage->refs)) {
1946*4882a593Smuzhiyun if (spage->page)
1947*4882a593Smuzhiyun __free_page(spage->page);
1948*4882a593Smuzhiyun kfree(spage);
1949*4882a593Smuzhiyun }
1950*4882a593Smuzhiyun }
1951*4882a593Smuzhiyun
1952*4882a593Smuzhiyun static void scrub_submit(struct scrub_ctx *sctx)
1953*4882a593Smuzhiyun {
1954*4882a593Smuzhiyun struct scrub_bio *sbio;
1955*4882a593Smuzhiyun
1956*4882a593Smuzhiyun if (sctx->curr == -1)
1957*4882a593Smuzhiyun return;
1958*4882a593Smuzhiyun
1959*4882a593Smuzhiyun sbio = sctx->bios[sctx->curr];
1960*4882a593Smuzhiyun sctx->curr = -1;
1961*4882a593Smuzhiyun scrub_pending_bio_inc(sctx);
1962*4882a593Smuzhiyun btrfsic_submit_bio(sbio->bio);
1963*4882a593Smuzhiyun }
1964*4882a593Smuzhiyun
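/*
 * Queue a page for reading. Pages are batched into the current read bio;
 * the bio is submitted when it is full, or when the next page is not
 * physically/logically contiguous or targets a different device.
 */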
1965*4882a593Smuzhiyun static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
1966*4882a593Smuzhiyun struct scrub_page *spage)
1967*4882a593Smuzhiyun {
1968*4882a593Smuzhiyun struct scrub_block *sblock = spage->sblock;
1969*4882a593Smuzhiyun struct scrub_bio *sbio;
1970*4882a593Smuzhiyun int ret;
1971*4882a593Smuzhiyun
1972*4882a593Smuzhiyun again:
1973*4882a593Smuzhiyun /*
1974*4882a593Smuzhiyun * grab a fresh bio or wait for one to become available
1975*4882a593Smuzhiyun */
1976*4882a593Smuzhiyun while (sctx->curr == -1) {
1977*4882a593Smuzhiyun spin_lock(&sctx->list_lock);
1978*4882a593Smuzhiyun sctx->curr = sctx->first_free;
1979*4882a593Smuzhiyun if (sctx->curr != -1) {
1980*4882a593Smuzhiyun sctx->first_free = sctx->bios[sctx->curr]->next_free;
1981*4882a593Smuzhiyun sctx->bios[sctx->curr]->next_free = -1;
1982*4882a593Smuzhiyun sctx->bios[sctx->curr]->page_count = 0;
1983*4882a593Smuzhiyun spin_unlock(&sctx->list_lock);
1984*4882a593Smuzhiyun } else {
1985*4882a593Smuzhiyun spin_unlock(&sctx->list_lock);
1986*4882a593Smuzhiyun wait_event(sctx->list_wait, sctx->first_free != -1);
1987*4882a593Smuzhiyun }
1988*4882a593Smuzhiyun }
1989*4882a593Smuzhiyun sbio = sctx->bios[sctx->curr];
1990*4882a593Smuzhiyun if (sbio->page_count == 0) {
1991*4882a593Smuzhiyun struct bio *bio;
1992*4882a593Smuzhiyun
1993*4882a593Smuzhiyun sbio->physical = spage->physical;
1994*4882a593Smuzhiyun sbio->logical = spage->logical;
1995*4882a593Smuzhiyun sbio->dev = spage->dev;
1996*4882a593Smuzhiyun bio = sbio->bio;
1997*4882a593Smuzhiyun if (!bio) {
1998*4882a593Smuzhiyun bio = btrfs_io_bio_alloc(sctx->pages_per_rd_bio);
1999*4882a593Smuzhiyun sbio->bio = bio;
2000*4882a593Smuzhiyun }
2001*4882a593Smuzhiyun
2002*4882a593Smuzhiyun bio->bi_private = sbio;
2003*4882a593Smuzhiyun bio->bi_end_io = scrub_bio_end_io;
2004*4882a593Smuzhiyun bio_set_dev(bio, sbio->dev->bdev);
2005*4882a593Smuzhiyun bio->bi_iter.bi_sector = sbio->physical >> 9;
2006*4882a593Smuzhiyun bio->bi_opf = REQ_OP_READ;
2007*4882a593Smuzhiyun sbio->status = 0;
2008*4882a593Smuzhiyun } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
2009*4882a593Smuzhiyun spage->physical ||
2010*4882a593Smuzhiyun sbio->logical + sbio->page_count * PAGE_SIZE !=
2011*4882a593Smuzhiyun spage->logical ||
2012*4882a593Smuzhiyun sbio->dev != spage->dev) {
2013*4882a593Smuzhiyun scrub_submit(sctx);
2014*4882a593Smuzhiyun goto again;
2015*4882a593Smuzhiyun }
2016*4882a593Smuzhiyun
2017*4882a593Smuzhiyun sbio->pagev[sbio->page_count] = spage;
2018*4882a593Smuzhiyun ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
2019*4882a593Smuzhiyun if (ret != PAGE_SIZE) {
2020*4882a593Smuzhiyun if (sbio->page_count < 1) {
2021*4882a593Smuzhiyun bio_put(sbio->bio);
2022*4882a593Smuzhiyun sbio->bio = NULL;
2023*4882a593Smuzhiyun return -EIO;
2024*4882a593Smuzhiyun }
2025*4882a593Smuzhiyun scrub_submit(sctx);
2026*4882a593Smuzhiyun goto again;
2027*4882a593Smuzhiyun }
2028*4882a593Smuzhiyun
2029*4882a593Smuzhiyun scrub_block_get(sblock); /* one for the page added to the bio */
2030*4882a593Smuzhiyun atomic_inc(&sblock->outstanding_pages);
2031*4882a593Smuzhiyun sbio->page_count++;
2032*4882a593Smuzhiyun if (sbio->page_count == sctx->pages_per_rd_bio)
2033*4882a593Smuzhiyun scrub_submit(sctx);
2034*4882a593Smuzhiyun
2035*4882a593Smuzhiyun return 0;
2036*4882a593Smuzhiyun }
2037*4882a593Smuzhiyun
2038*4882a593Smuzhiyun static void scrub_missing_raid56_end_io(struct bio *bio)
2039*4882a593Smuzhiyun {
2040*4882a593Smuzhiyun struct scrub_block *sblock = bio->bi_private;
2041*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
2042*4882a593Smuzhiyun
2043*4882a593Smuzhiyun if (bio->bi_status)
2044*4882a593Smuzhiyun sblock->no_io_error_seen = 0;
2045*4882a593Smuzhiyun
2046*4882a593Smuzhiyun bio_put(bio);
2047*4882a593Smuzhiyun
2048*4882a593Smuzhiyun btrfs_queue_work(fs_info->scrub_workers, &sblock->work);
2049*4882a593Smuzhiyun }
2050*4882a593Smuzhiyun
2051*4882a593Smuzhiyun static void scrub_missing_raid56_worker(struct btrfs_work *work)
2052*4882a593Smuzhiyun {
2053*4882a593Smuzhiyun struct scrub_block *sblock = container_of(work, struct scrub_block, work);
2054*4882a593Smuzhiyun struct scrub_ctx *sctx = sblock->sctx;
2055*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = sctx->fs_info;
2056*4882a593Smuzhiyun u64 logical;
2057*4882a593Smuzhiyun struct btrfs_device *dev;
2058*4882a593Smuzhiyun
2059*4882a593Smuzhiyun logical = sblock->pagev[0]->logical;
2060*4882a593Smuzhiyun dev = sblock->pagev[0]->dev;
2061*4882a593Smuzhiyun
2062*4882a593Smuzhiyun if (sblock->no_io_error_seen)
2063*4882a593Smuzhiyun scrub_recheck_block_checksum(sblock);
2064*4882a593Smuzhiyun
2065*4882a593Smuzhiyun if (!sblock->no_io_error_seen) {
2066*4882a593Smuzhiyun spin_lock(&sctx->stat_lock);
2067*4882a593Smuzhiyun sctx->stat.read_errors++;
2068*4882a593Smuzhiyun spin_unlock(&sctx->stat_lock);
2069*4882a593Smuzhiyun btrfs_err_rl_in_rcu(fs_info,
2070*4882a593Smuzhiyun "IO error rebuilding logical %llu for dev %s",
2071*4882a593Smuzhiyun logical, rcu_str_deref(dev->name));
2072*4882a593Smuzhiyun } else if (sblock->header_error || sblock->checksum_error) {
2073*4882a593Smuzhiyun spin_lock(&sctx->stat_lock);
2074*4882a593Smuzhiyun sctx->stat.uncorrectable_errors++;
2075*4882a593Smuzhiyun spin_unlock(&sctx->stat_lock);
2076*4882a593Smuzhiyun btrfs_err_rl_in_rcu(fs_info,
2077*4882a593Smuzhiyun "failed to rebuild valid logical %llu for dev %s",
2078*4882a593Smuzhiyun logical, rcu_str_deref(dev->name));
2079*4882a593Smuzhiyun } else {
2080*4882a593Smuzhiyun scrub_write_block_to_dev_replace(sblock);
2081*4882a593Smuzhiyun }
2082*4882a593Smuzhiyun
2083*4882a593Smuzhiyun if (sctx->is_dev_replace && sctx->flush_all_writes) {
2084*4882a593Smuzhiyun mutex_lock(&sctx->wr_lock);
2085*4882a593Smuzhiyun scrub_wr_submit(sctx);
2086*4882a593Smuzhiyun mutex_unlock(&sctx->wr_lock);
2087*4882a593Smuzhiyun }
2088*4882a593Smuzhiyun
2089*4882a593Smuzhiyun scrub_block_put(sblock);
2090*4882a593Smuzhiyun scrub_pending_bio_dec(sctx);
2091*4882a593Smuzhiyun }
2092*4882a593Smuzhiyun
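/*
 * Rebuild the data of a block that resides on a missing device by using
 * the RAID5/6 recovery code. The rebuilt data is checksummed in the
 * worker and, in the dev-replace case, written to the target device.
 */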
2093*4882a593Smuzhiyun static void scrub_missing_raid56_pages(struct scrub_block *sblock)
2094*4882a593Smuzhiyun {
2095*4882a593Smuzhiyun struct scrub_ctx *sctx = sblock->sctx;
2096*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = sctx->fs_info;
2097*4882a593Smuzhiyun u64 length = sblock->page_count * PAGE_SIZE;
2098*4882a593Smuzhiyun u64 logical = sblock->pagev[0]->logical;
2099*4882a593Smuzhiyun struct btrfs_bio *bbio = NULL;
2100*4882a593Smuzhiyun struct bio *bio;
2101*4882a593Smuzhiyun struct btrfs_raid_bio *rbio;
2102*4882a593Smuzhiyun int ret;
2103*4882a593Smuzhiyun int i;
2104*4882a593Smuzhiyun
2105*4882a593Smuzhiyun btrfs_bio_counter_inc_blocked(fs_info);
2106*4882a593Smuzhiyun ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
2107*4882a593Smuzhiyun &length, &bbio);
2108*4882a593Smuzhiyun if (ret || !bbio || !bbio->raid_map)
2109*4882a593Smuzhiyun goto bbio_out;
2110*4882a593Smuzhiyun
2111*4882a593Smuzhiyun if (WARN_ON(!sctx->is_dev_replace ||
2112*4882a593Smuzhiyun !(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
2113*4882a593Smuzhiyun /*
2114*4882a593Smuzhiyun * We shouldn't be scrubbing a missing device. Even for dev
2115*4882a593Smuzhiyun * replace, we should only get here for RAID 5/6. We either
2116*4882a593Smuzhiyun * managed to mount something with no mirrors remaining or
2117*4882a593Smuzhiyun * there's a bug in scrub_remap_extent()/btrfs_map_block().
2118*4882a593Smuzhiyun */
2119*4882a593Smuzhiyun goto bbio_out;
2120*4882a593Smuzhiyun }
2121*4882a593Smuzhiyun
2122*4882a593Smuzhiyun bio = btrfs_io_bio_alloc(0);
2123*4882a593Smuzhiyun bio->bi_iter.bi_sector = logical >> 9;
2124*4882a593Smuzhiyun bio->bi_private = sblock;
2125*4882a593Smuzhiyun bio->bi_end_io = scrub_missing_raid56_end_io;
2126*4882a593Smuzhiyun
2127*4882a593Smuzhiyun rbio = raid56_alloc_missing_rbio(fs_info, bio, bbio, length);
2128*4882a593Smuzhiyun if (!rbio)
2129*4882a593Smuzhiyun goto rbio_out;
2130*4882a593Smuzhiyun
2131*4882a593Smuzhiyun for (i = 0; i < sblock->page_count; i++) {
2132*4882a593Smuzhiyun struct scrub_page *spage = sblock->pagev[i];
2133*4882a593Smuzhiyun
2134*4882a593Smuzhiyun raid56_add_scrub_pages(rbio, spage->page, spage->logical);
2135*4882a593Smuzhiyun }
2136*4882a593Smuzhiyun
2137*4882a593Smuzhiyun btrfs_init_work(&sblock->work, scrub_missing_raid56_worker, NULL, NULL);
2138*4882a593Smuzhiyun scrub_block_get(sblock);
2139*4882a593Smuzhiyun scrub_pending_bio_inc(sctx);
2140*4882a593Smuzhiyun raid56_submit_missing_rbio(rbio);
2141*4882a593Smuzhiyun return;
2142*4882a593Smuzhiyun
2143*4882a593Smuzhiyun rbio_out:
2144*4882a593Smuzhiyun bio_put(bio);
2145*4882a593Smuzhiyun bbio_out:
2146*4882a593Smuzhiyun btrfs_bio_counter_dec(fs_info);
2147*4882a593Smuzhiyun btrfs_put_bbio(bbio);
2148*4882a593Smuzhiyun spin_lock(&sctx->stat_lock);
2149*4882a593Smuzhiyun sctx->stat.malloc_errors++;
2150*4882a593Smuzhiyun spin_unlock(&sctx->stat_lock);
2151*4882a593Smuzhiyun }
2152*4882a593Smuzhiyun
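/*
 * Allocate a scrub_block for the range [logical, logical + len), split it
 * into PAGE_SIZE pages and queue the pages for reading. Blocks on a
 * missing device are handed to the RAID5/6 rebuild path instead.
 */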
2153*4882a593Smuzhiyun static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
2154*4882a593Smuzhiyun u64 physical, struct btrfs_device *dev, u64 flags,
2155*4882a593Smuzhiyun u64 gen, int mirror_num, u8 *csum, int force,
2156*4882a593Smuzhiyun u64 physical_for_dev_replace)
2157*4882a593Smuzhiyun {
2158*4882a593Smuzhiyun struct scrub_block *sblock;
2159*4882a593Smuzhiyun int index;
2160*4882a593Smuzhiyun
2161*4882a593Smuzhiyun sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2162*4882a593Smuzhiyun if (!sblock) {
2163*4882a593Smuzhiyun spin_lock(&sctx->stat_lock);
2164*4882a593Smuzhiyun sctx->stat.malloc_errors++;
2165*4882a593Smuzhiyun spin_unlock(&sctx->stat_lock);
2166*4882a593Smuzhiyun return -ENOMEM;
2167*4882a593Smuzhiyun }
2168*4882a593Smuzhiyun
2169*4882a593Smuzhiyun /* one ref inside this function, plus one for each page added to
2170*4882a593Smuzhiyun * a bio later on */
2171*4882a593Smuzhiyun refcount_set(&sblock->refs, 1);
2172*4882a593Smuzhiyun sblock->sctx = sctx;
2173*4882a593Smuzhiyun sblock->no_io_error_seen = 1;
2174*4882a593Smuzhiyun
2175*4882a593Smuzhiyun for (index = 0; len > 0; index++) {
2176*4882a593Smuzhiyun struct scrub_page *spage;
2177*4882a593Smuzhiyun u64 l = min_t(u64, len, PAGE_SIZE);
2178*4882a593Smuzhiyun
2179*4882a593Smuzhiyun spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2180*4882a593Smuzhiyun if (!spage) {
2181*4882a593Smuzhiyun leave_nomem:
2182*4882a593Smuzhiyun spin_lock(&sctx->stat_lock);
2183*4882a593Smuzhiyun sctx->stat.malloc_errors++;
2184*4882a593Smuzhiyun spin_unlock(&sctx->stat_lock);
2185*4882a593Smuzhiyun scrub_block_put(sblock);
2186*4882a593Smuzhiyun return -ENOMEM;
2187*4882a593Smuzhiyun }
2188*4882a593Smuzhiyun BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2189*4882a593Smuzhiyun scrub_page_get(spage);
2190*4882a593Smuzhiyun sblock->pagev[index] = spage;
2191*4882a593Smuzhiyun spage->sblock = sblock;
2192*4882a593Smuzhiyun spage->dev = dev;
2193*4882a593Smuzhiyun spage->flags = flags;
2194*4882a593Smuzhiyun spage->generation = gen;
2195*4882a593Smuzhiyun spage->logical = logical;
2196*4882a593Smuzhiyun spage->physical = physical;
2197*4882a593Smuzhiyun spage->physical_for_dev_replace = physical_for_dev_replace;
2198*4882a593Smuzhiyun spage->mirror_num = mirror_num;
2199*4882a593Smuzhiyun if (csum) {
2200*4882a593Smuzhiyun spage->have_csum = 1;
2201*4882a593Smuzhiyun memcpy(spage->csum, csum, sctx->csum_size);
2202*4882a593Smuzhiyun } else {
2203*4882a593Smuzhiyun spage->have_csum = 0;
2204*4882a593Smuzhiyun }
2205*4882a593Smuzhiyun sblock->page_count++;
2206*4882a593Smuzhiyun spage->page = alloc_page(GFP_KERNEL);
2207*4882a593Smuzhiyun if (!spage->page)
2208*4882a593Smuzhiyun goto leave_nomem;
2209*4882a593Smuzhiyun len -= l;
2210*4882a593Smuzhiyun logical += l;
2211*4882a593Smuzhiyun physical += l;
2212*4882a593Smuzhiyun physical_for_dev_replace += l;
2213*4882a593Smuzhiyun }
2214*4882a593Smuzhiyun
2215*4882a593Smuzhiyun WARN_ON(sblock->page_count == 0);
2216*4882a593Smuzhiyun if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2217*4882a593Smuzhiyun /*
2218*4882a593Smuzhiyun * This case should only be hit for RAID 5/6 device replace. See
2219*4882a593Smuzhiyun * the comment in scrub_missing_raid56_pages() for details.
2220*4882a593Smuzhiyun */
2221*4882a593Smuzhiyun scrub_missing_raid56_pages(sblock);
2222*4882a593Smuzhiyun } else {
2223*4882a593Smuzhiyun for (index = 0; index < sblock->page_count; index++) {
2224*4882a593Smuzhiyun struct scrub_page *spage = sblock->pagev[index];
2225*4882a593Smuzhiyun int ret;
2226*4882a593Smuzhiyun
2227*4882a593Smuzhiyun ret = scrub_add_page_to_rd_bio(sctx, spage);
2228*4882a593Smuzhiyun if (ret) {
2229*4882a593Smuzhiyun scrub_block_put(sblock);
2230*4882a593Smuzhiyun return ret;
2231*4882a593Smuzhiyun }
2232*4882a593Smuzhiyun }
2233*4882a593Smuzhiyun
2234*4882a593Smuzhiyun if (force)
2235*4882a593Smuzhiyun scrub_submit(sctx);
2236*4882a593Smuzhiyun }
2237*4882a593Smuzhiyun
2238*4882a593Smuzhiyun /* last one frees, either here or in bio completion for last page */
2239*4882a593Smuzhiyun scrub_block_put(sblock);
2240*4882a593Smuzhiyun return 0;
2241*4882a593Smuzhiyun }
2242*4882a593Smuzhiyun
2243*4882a593Smuzhiyun static void scrub_bio_end_io(struct bio *bio)
2244*4882a593Smuzhiyun {
2245*4882a593Smuzhiyun struct scrub_bio *sbio = bio->bi_private;
2246*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
2247*4882a593Smuzhiyun
2248*4882a593Smuzhiyun sbio->status = bio->bi_status;
2249*4882a593Smuzhiyun sbio->bio = bio;
2250*4882a593Smuzhiyun
2251*4882a593Smuzhiyun btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
2252*4882a593Smuzhiyun }
2253*4882a593Smuzhiyun
2254*4882a593Smuzhiyun static void scrub_bio_end_io_worker(struct btrfs_work *work)
2255*4882a593Smuzhiyun {
2256*4882a593Smuzhiyun struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2257*4882a593Smuzhiyun struct scrub_ctx *sctx = sbio->sctx;
2258*4882a593Smuzhiyun int i;
2259*4882a593Smuzhiyun
2260*4882a593Smuzhiyun BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
2261*4882a593Smuzhiyun if (sbio->status) {
2262*4882a593Smuzhiyun for (i = 0; i < sbio->page_count; i++) {
2263*4882a593Smuzhiyun struct scrub_page *spage = sbio->pagev[i];
2264*4882a593Smuzhiyun
2265*4882a593Smuzhiyun spage->io_error = 1;
2266*4882a593Smuzhiyun spage->sblock->no_io_error_seen = 0;
2267*4882a593Smuzhiyun }
2268*4882a593Smuzhiyun }
2269*4882a593Smuzhiyun
2270*4882a593Smuzhiyun /* now complete the scrub_block items that have all pages completed */
2271*4882a593Smuzhiyun for (i = 0; i < sbio->page_count; i++) {
2272*4882a593Smuzhiyun struct scrub_page *spage = sbio->pagev[i];
2273*4882a593Smuzhiyun struct scrub_block *sblock = spage->sblock;
2274*4882a593Smuzhiyun
2275*4882a593Smuzhiyun if (atomic_dec_and_test(&sblock->outstanding_pages))
2276*4882a593Smuzhiyun scrub_block_complete(sblock);
2277*4882a593Smuzhiyun scrub_block_put(sblock);
2278*4882a593Smuzhiyun }
2279*4882a593Smuzhiyun
2280*4882a593Smuzhiyun bio_put(sbio->bio);
2281*4882a593Smuzhiyun sbio->bio = NULL;
2282*4882a593Smuzhiyun spin_lock(&sctx->list_lock);
2283*4882a593Smuzhiyun sbio->next_free = sctx->first_free;
2284*4882a593Smuzhiyun sctx->first_free = sbio->index;
2285*4882a593Smuzhiyun spin_unlock(&sctx->list_lock);
2286*4882a593Smuzhiyun
2287*4882a593Smuzhiyun if (sctx->is_dev_replace && sctx->flush_all_writes) {
2288*4882a593Smuzhiyun mutex_lock(&sctx->wr_lock);
2289*4882a593Smuzhiyun scrub_wr_submit(sctx);
2290*4882a593Smuzhiyun mutex_unlock(&sctx->wr_lock);
2291*4882a593Smuzhiyun }
2292*4882a593Smuzhiyun
2293*4882a593Smuzhiyun scrub_pending_bio_dec(sctx);
2294*4882a593Smuzhiyun }
2295*4882a593Smuzhiyun
2296*4882a593Smuzhiyun static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2297*4882a593Smuzhiyun unsigned long *bitmap,
2298*4882a593Smuzhiyun u64 start, u64 len)
2299*4882a593Smuzhiyun {
2300*4882a593Smuzhiyun u64 offset;
2301*4882a593Smuzhiyun u64 nsectors64;
2302*4882a593Smuzhiyun u32 nsectors;
2303*4882a593Smuzhiyun int sectorsize = sparity->sctx->fs_info->sectorsize;
2304*4882a593Smuzhiyun
2305*4882a593Smuzhiyun if (len >= sparity->stripe_len) {
2306*4882a593Smuzhiyun bitmap_set(bitmap, 0, sparity->nsectors);
2307*4882a593Smuzhiyun return;
2308*4882a593Smuzhiyun }
2309*4882a593Smuzhiyun
2310*4882a593Smuzhiyun start -= sparity->logic_start;
2311*4882a593Smuzhiyun start = div64_u64_rem(start, sparity->stripe_len, &offset);
2312*4882a593Smuzhiyun offset = div_u64(offset, sectorsize);
2313*4882a593Smuzhiyun nsectors64 = div_u64(len, sectorsize);
2314*4882a593Smuzhiyun
2315*4882a593Smuzhiyun ASSERT(nsectors64 < UINT_MAX);
2316*4882a593Smuzhiyun nsectors = (u32)nsectors64;
2317*4882a593Smuzhiyun
2318*4882a593Smuzhiyun if (offset + nsectors <= sparity->nsectors) {
2319*4882a593Smuzhiyun bitmap_set(bitmap, offset, nsectors);
2320*4882a593Smuzhiyun return;
2321*4882a593Smuzhiyun }
2322*4882a593Smuzhiyun
2323*4882a593Smuzhiyun bitmap_set(bitmap, offset, sparity->nsectors - offset);
2324*4882a593Smuzhiyun bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2325*4882a593Smuzhiyun }
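/*
 * Illustrative walk-through of __scrub_mark_bitmap() above, using assumed
 * example values rather than anything taken from a real filesystem:
 * stripe_len = 64K, sectorsize = 4K, nsectors = 16, logic_start = 0.
 * Marking start = 56K, len = 16K yields offset = 56K within the stripe,
 * i.e. sector 14, and nsectors = 16K / 4K = 4.  Because 14 + 4 > 16 the
 * range wraps around: bits 14-15 and bits 0-1 are set.
 */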
2326*4882a593Smuzhiyun
2327*4882a593Smuzhiyun static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2328*4882a593Smuzhiyun u64 start, u64 len)
2329*4882a593Smuzhiyun {
2330*4882a593Smuzhiyun __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
2331*4882a593Smuzhiyun }
2332*4882a593Smuzhiyun
2333*4882a593Smuzhiyun static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2334*4882a593Smuzhiyun u64 start, u64 len)
2335*4882a593Smuzhiyun {
2336*4882a593Smuzhiyun __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
2337*4882a593Smuzhiyun }
2338*4882a593Smuzhiyun
2339*4882a593Smuzhiyun static void scrub_block_complete(struct scrub_block *sblock)
2340*4882a593Smuzhiyun {
2341*4882a593Smuzhiyun int corrupted = 0;
2342*4882a593Smuzhiyun
2343*4882a593Smuzhiyun if (!sblock->no_io_error_seen) {
2344*4882a593Smuzhiyun corrupted = 1;
2345*4882a593Smuzhiyun scrub_handle_errored_block(sblock);
2346*4882a593Smuzhiyun } else {
2347*4882a593Smuzhiyun /*
2348*4882a593Smuzhiyun * If the block has a checksum error, the repair path writes
2349*4882a593Smuzhiyun * the good copy to the dev-replace target; otherwise, in the
2350*4882a593Smuzhiyun * dev-replace case, write it out here.
2351*4882a593Smuzhiyun */
2352*4882a593Smuzhiyun corrupted = scrub_checksum(sblock);
2353*4882a593Smuzhiyun if (!corrupted && sblock->sctx->is_dev_replace)
2354*4882a593Smuzhiyun scrub_write_block_to_dev_replace(sblock);
2355*4882a593Smuzhiyun }
2356*4882a593Smuzhiyun
2357*4882a593Smuzhiyun if (sblock->sparity && corrupted && !sblock->data_corrected) {
2358*4882a593Smuzhiyun u64 start = sblock->pagev[0]->logical;
2359*4882a593Smuzhiyun u64 end = sblock->pagev[sblock->page_count - 1]->logical +
2360*4882a593Smuzhiyun PAGE_SIZE;
2361*4882a593Smuzhiyun
2362*4882a593Smuzhiyun scrub_parity_mark_sectors_error(sblock->sparity,
2363*4882a593Smuzhiyun start, end - start);
2364*4882a593Smuzhiyun }
2365*4882a593Smuzhiyun }
2366*4882a593Smuzhiyun
2367*4882a593Smuzhiyun static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
2368*4882a593Smuzhiyun {
2369*4882a593Smuzhiyun struct btrfs_ordered_sum *sum = NULL;
2370*4882a593Smuzhiyun unsigned long index;
2371*4882a593Smuzhiyun unsigned long num_sectors;
2372*4882a593Smuzhiyun
2373*4882a593Smuzhiyun while (!list_empty(&sctx->csum_list)) {
2374*4882a593Smuzhiyun sum = list_first_entry(&sctx->csum_list,
2375*4882a593Smuzhiyun struct btrfs_ordered_sum, list);
2376*4882a593Smuzhiyun if (sum->bytenr > logical)
2377*4882a593Smuzhiyun return 0;
2378*4882a593Smuzhiyun if (sum->bytenr + sum->len > logical)
2379*4882a593Smuzhiyun break;
2380*4882a593Smuzhiyun
2381*4882a593Smuzhiyun ++sctx->stat.csum_discards;
2382*4882a593Smuzhiyun list_del(&sum->list);
2383*4882a593Smuzhiyun kfree(sum);
2384*4882a593Smuzhiyun sum = NULL;
2385*4882a593Smuzhiyun }
2386*4882a593Smuzhiyun if (!sum)
2387*4882a593Smuzhiyun return 0;
2388*4882a593Smuzhiyun
2389*4882a593Smuzhiyun index = div_u64(logical - sum->bytenr, sctx->fs_info->sectorsize);
2390*4882a593Smuzhiyun ASSERT(index < UINT_MAX);
2391*4882a593Smuzhiyun
2392*4882a593Smuzhiyun num_sectors = sum->len / sctx->fs_info->sectorsize;
2393*4882a593Smuzhiyun memcpy(csum, sum->sums + index * sctx->csum_size, sctx->csum_size);
2394*4882a593Smuzhiyun if (index == num_sectors - 1) {
2395*4882a593Smuzhiyun list_del(&sum->list);
2396*4882a593Smuzhiyun kfree(sum);
2397*4882a593Smuzhiyun }
2398*4882a593Smuzhiyun return 1;
2399*4882a593Smuzhiyun }
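/*
 * Illustrative lookup in scrub_find_csum() above, with assumed values:
 * sum->bytenr = 1M, sum->len = 64K, sectorsize = 4K and logical = 1M + 8K
 * give index = 2 and num_sectors = 16.  The third checksum is copied into
 * *csum and, since index != num_sectors - 1, the ordered sum is kept on
 * csum_list for the remaining sectors.
 */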
2400*4882a593Smuzhiyun
2401*4882a593Smuzhiyun /* scrub extent tries to collect up to 64 kB for each bio */
2402*4882a593Smuzhiyun static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
2403*4882a593Smuzhiyun u64 logical, u64 len,
2404*4882a593Smuzhiyun u64 physical, struct btrfs_device *dev, u64 flags,
2405*4882a593Smuzhiyun u64 gen, int mirror_num, u64 physical_for_dev_replace)
2406*4882a593Smuzhiyun {
2407*4882a593Smuzhiyun int ret;
2408*4882a593Smuzhiyun u8 csum[BTRFS_CSUM_SIZE];
2409*4882a593Smuzhiyun u32 blocksize;
2410*4882a593Smuzhiyun
2411*4882a593Smuzhiyun if (flags & BTRFS_EXTENT_FLAG_DATA) {
2412*4882a593Smuzhiyun if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2413*4882a593Smuzhiyun blocksize = map->stripe_len;
2414*4882a593Smuzhiyun else
2415*4882a593Smuzhiyun blocksize = sctx->fs_info->sectorsize;
2416*4882a593Smuzhiyun spin_lock(&sctx->stat_lock);
2417*4882a593Smuzhiyun sctx->stat.data_extents_scrubbed++;
2418*4882a593Smuzhiyun sctx->stat.data_bytes_scrubbed += len;
2419*4882a593Smuzhiyun spin_unlock(&sctx->stat_lock);
2420*4882a593Smuzhiyun } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2421*4882a593Smuzhiyun if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2422*4882a593Smuzhiyun blocksize = map->stripe_len;
2423*4882a593Smuzhiyun else
2424*4882a593Smuzhiyun blocksize = sctx->fs_info->nodesize;
2425*4882a593Smuzhiyun spin_lock(&sctx->stat_lock);
2426*4882a593Smuzhiyun sctx->stat.tree_extents_scrubbed++;
2427*4882a593Smuzhiyun sctx->stat.tree_bytes_scrubbed += len;
2428*4882a593Smuzhiyun spin_unlock(&sctx->stat_lock);
2429*4882a593Smuzhiyun } else {
2430*4882a593Smuzhiyun blocksize = sctx->fs_info->sectorsize;
2431*4882a593Smuzhiyun WARN_ON(1);
2432*4882a593Smuzhiyun }
2433*4882a593Smuzhiyun
2434*4882a593Smuzhiyun while (len) {
2435*4882a593Smuzhiyun u64 l = min_t(u64, len, blocksize);
2436*4882a593Smuzhiyun int have_csum = 0;
2437*4882a593Smuzhiyun
2438*4882a593Smuzhiyun if (flags & BTRFS_EXTENT_FLAG_DATA) {
2439*4882a593Smuzhiyun /* push csums to sbio */
2440*4882a593Smuzhiyun have_csum = scrub_find_csum(sctx, logical, csum);
2441*4882a593Smuzhiyun if (have_csum == 0)
2442*4882a593Smuzhiyun ++sctx->stat.no_csum;
2443*4882a593Smuzhiyun }
2444*4882a593Smuzhiyun ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
2445*4882a593Smuzhiyun mirror_num, have_csum ? csum : NULL, 0,
2446*4882a593Smuzhiyun physical_for_dev_replace);
2447*4882a593Smuzhiyun if (ret)
2448*4882a593Smuzhiyun return ret;
2449*4882a593Smuzhiyun len -= l;
2450*4882a593Smuzhiyun logical += l;
2451*4882a593Smuzhiyun physical += l;
2452*4882a593Smuzhiyun physical_for_dev_replace += l;
2453*4882a593Smuzhiyun }
2454*4882a593Smuzhiyun return 0;
2455*4882a593Smuzhiyun }
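/*
 * Illustrative split performed by the loop in scrub_extent() above, with
 * assumed values: a 20K data extent on a non-RAID56 chunk with a 4K
 * sectorsize is issued as five scrub_pages() calls of 4K each, with
 * logical, physical and physical_for_dev_replace all advancing by 4K per
 * iteration and a csum lookup preceding each call.
 */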
2456*4882a593Smuzhiyun
2457*4882a593Smuzhiyun static int scrub_pages_for_parity(struct scrub_parity *sparity,
2458*4882a593Smuzhiyun u64 logical, u64 len,
2459*4882a593Smuzhiyun u64 physical, struct btrfs_device *dev,
2460*4882a593Smuzhiyun u64 flags, u64 gen, int mirror_num, u8 *csum)
2461*4882a593Smuzhiyun {
2462*4882a593Smuzhiyun struct scrub_ctx *sctx = sparity->sctx;
2463*4882a593Smuzhiyun struct scrub_block *sblock;
2464*4882a593Smuzhiyun int index;
2465*4882a593Smuzhiyun
2466*4882a593Smuzhiyun sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2467*4882a593Smuzhiyun if (!sblock) {
2468*4882a593Smuzhiyun spin_lock(&sctx->stat_lock);
2469*4882a593Smuzhiyun sctx->stat.malloc_errors++;
2470*4882a593Smuzhiyun spin_unlock(&sctx->stat_lock);
2471*4882a593Smuzhiyun return -ENOMEM;
2472*4882a593Smuzhiyun }
2473*4882a593Smuzhiyun
2474*4882a593Smuzhiyun /* one ref inside this function, plus one for each page added to
2475*4882a593Smuzhiyun * a bio later on */
2476*4882a593Smuzhiyun refcount_set(&sblock->refs, 1);
2477*4882a593Smuzhiyun sblock->sctx = sctx;
2478*4882a593Smuzhiyun sblock->no_io_error_seen = 1;
2479*4882a593Smuzhiyun sblock->sparity = sparity;
2480*4882a593Smuzhiyun scrub_parity_get(sparity);
2481*4882a593Smuzhiyun
2482*4882a593Smuzhiyun for (index = 0; len > 0; index++) {
2483*4882a593Smuzhiyun struct scrub_page *spage;
2484*4882a593Smuzhiyun u64 l = min_t(u64, len, PAGE_SIZE);
2485*4882a593Smuzhiyun
2486*4882a593Smuzhiyun spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2487*4882a593Smuzhiyun if (!spage) {
2488*4882a593Smuzhiyun leave_nomem:
2489*4882a593Smuzhiyun spin_lock(&sctx->stat_lock);
2490*4882a593Smuzhiyun sctx->stat.malloc_errors++;
2491*4882a593Smuzhiyun spin_unlock(&sctx->stat_lock);
2492*4882a593Smuzhiyun scrub_block_put(sblock);
2493*4882a593Smuzhiyun return -ENOMEM;
2494*4882a593Smuzhiyun }
2495*4882a593Smuzhiyun BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2496*4882a593Smuzhiyun /* For scrub block */
2497*4882a593Smuzhiyun scrub_page_get(spage);
2498*4882a593Smuzhiyun sblock->pagev[index] = spage;
2499*4882a593Smuzhiyun /* For scrub parity */
2500*4882a593Smuzhiyun scrub_page_get(spage);
2501*4882a593Smuzhiyun list_add_tail(&spage->list, &sparity->spages);
2502*4882a593Smuzhiyun spage->sblock = sblock;
2503*4882a593Smuzhiyun spage->dev = dev;
2504*4882a593Smuzhiyun spage->flags = flags;
2505*4882a593Smuzhiyun spage->generation = gen;
2506*4882a593Smuzhiyun spage->logical = logical;
2507*4882a593Smuzhiyun spage->physical = physical;
2508*4882a593Smuzhiyun spage->mirror_num = mirror_num;
2509*4882a593Smuzhiyun if (csum) {
2510*4882a593Smuzhiyun spage->have_csum = 1;
2511*4882a593Smuzhiyun memcpy(spage->csum, csum, sctx->csum_size);
2512*4882a593Smuzhiyun } else {
2513*4882a593Smuzhiyun spage->have_csum = 0;
2514*4882a593Smuzhiyun }
2515*4882a593Smuzhiyun sblock->page_count++;
2516*4882a593Smuzhiyun spage->page = alloc_page(GFP_KERNEL);
2517*4882a593Smuzhiyun if (!spage->page)
2518*4882a593Smuzhiyun goto leave_nomem;
2519*4882a593Smuzhiyun len -= l;
2520*4882a593Smuzhiyun logical += l;
2521*4882a593Smuzhiyun physical += l;
2522*4882a593Smuzhiyun }
2523*4882a593Smuzhiyun
2524*4882a593Smuzhiyun WARN_ON(sblock->page_count == 0);
2525*4882a593Smuzhiyun for (index = 0; index < sblock->page_count; index++) {
2526*4882a593Smuzhiyun struct scrub_page *spage = sblock->pagev[index];
2527*4882a593Smuzhiyun int ret;
2528*4882a593Smuzhiyun
2529*4882a593Smuzhiyun ret = scrub_add_page_to_rd_bio(sctx, spage);
2530*4882a593Smuzhiyun if (ret) {
2531*4882a593Smuzhiyun scrub_block_put(sblock);
2532*4882a593Smuzhiyun return ret;
2533*4882a593Smuzhiyun }
2534*4882a593Smuzhiyun }
2535*4882a593Smuzhiyun
2536*4882a593Smuzhiyun /* last one frees, either here or in bio completion for last page */
2537*4882a593Smuzhiyun scrub_block_put(sblock);
2538*4882a593Smuzhiyun return 0;
2539*4882a593Smuzhiyun }
2540*4882a593Smuzhiyun
2541*4882a593Smuzhiyun static int scrub_extent_for_parity(struct scrub_parity *sparity,
2542*4882a593Smuzhiyun u64 logical, u64 len,
2543*4882a593Smuzhiyun u64 physical, struct btrfs_device *dev,
2544*4882a593Smuzhiyun u64 flags, u64 gen, int mirror_num)
2545*4882a593Smuzhiyun {
2546*4882a593Smuzhiyun struct scrub_ctx *sctx = sparity->sctx;
2547*4882a593Smuzhiyun int ret;
2548*4882a593Smuzhiyun u8 csum[BTRFS_CSUM_SIZE];
2549*4882a593Smuzhiyun u32 blocksize;
2550*4882a593Smuzhiyun
2551*4882a593Smuzhiyun if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2552*4882a593Smuzhiyun scrub_parity_mark_sectors_error(sparity, logical, len);
2553*4882a593Smuzhiyun return 0;
2554*4882a593Smuzhiyun }
2555*4882a593Smuzhiyun
2556*4882a593Smuzhiyun if (flags & BTRFS_EXTENT_FLAG_DATA) {
2557*4882a593Smuzhiyun blocksize = sparity->stripe_len;
2558*4882a593Smuzhiyun } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2559*4882a593Smuzhiyun blocksize = sparity->stripe_len;
2560*4882a593Smuzhiyun } else {
2561*4882a593Smuzhiyun blocksize = sctx->fs_info->sectorsize;
2562*4882a593Smuzhiyun WARN_ON(1);
2563*4882a593Smuzhiyun }
2564*4882a593Smuzhiyun
2565*4882a593Smuzhiyun while (len) {
2566*4882a593Smuzhiyun u64 l = min_t(u64, len, blocksize);
2567*4882a593Smuzhiyun int have_csum = 0;
2568*4882a593Smuzhiyun
2569*4882a593Smuzhiyun if (flags & BTRFS_EXTENT_FLAG_DATA) {
2570*4882a593Smuzhiyun /* push csums to sbio */
2571*4882a593Smuzhiyun have_csum = scrub_find_csum(sctx, logical, csum);
2572*4882a593Smuzhiyun if (have_csum == 0)
2573*4882a593Smuzhiyun goto skip;
2574*4882a593Smuzhiyun }
2575*4882a593Smuzhiyun ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
2576*4882a593Smuzhiyun flags, gen, mirror_num,
2577*4882a593Smuzhiyun have_csum ? csum : NULL);
2578*4882a593Smuzhiyun if (ret)
2579*4882a593Smuzhiyun return ret;
2580*4882a593Smuzhiyun skip:
2581*4882a593Smuzhiyun len -= l;
2582*4882a593Smuzhiyun logical += l;
2583*4882a593Smuzhiyun physical += l;
2584*4882a593Smuzhiyun }
2585*4882a593Smuzhiyun return 0;
2586*4882a593Smuzhiyun }
2587*4882a593Smuzhiyun
2588*4882a593Smuzhiyun /*
2589*4882a593Smuzhiyun * Given a physical address, this calculates its logical offset.
2590*4882a593Smuzhiyun * If the address falls on a parity stripe, it returns the
2591*4882a593Smuzhiyun * leftmost data stripe's logical offset.
2592*4882a593Smuzhiyun *
2593*4882a593Smuzhiyun * Returns 0 if it is a data stripe, 1 if it is a parity stripe.
2594*4882a593Smuzhiyun */
2595*4882a593Smuzhiyun static int get_raid56_logic_offset(u64 physical, int num,
2596*4882a593Smuzhiyun struct map_lookup *map, u64 *offset,
2597*4882a593Smuzhiyun u64 *stripe_start)
2598*4882a593Smuzhiyun {
2599*4882a593Smuzhiyun int i;
2600*4882a593Smuzhiyun int j = 0;
2601*4882a593Smuzhiyun u64 stripe_nr;
2602*4882a593Smuzhiyun u64 last_offset;
2603*4882a593Smuzhiyun u32 stripe_index;
2604*4882a593Smuzhiyun u32 rot;
2605*4882a593Smuzhiyun const int data_stripes = nr_data_stripes(map);
2606*4882a593Smuzhiyun
2607*4882a593Smuzhiyun last_offset = (physical - map->stripes[num].physical) * data_stripes;
2608*4882a593Smuzhiyun if (stripe_start)
2609*4882a593Smuzhiyun *stripe_start = last_offset;
2610*4882a593Smuzhiyun
2611*4882a593Smuzhiyun *offset = last_offset;
2612*4882a593Smuzhiyun for (i = 0; i < data_stripes; i++) {
2613*4882a593Smuzhiyun *offset = last_offset + i * map->stripe_len;
2614*4882a593Smuzhiyun
2615*4882a593Smuzhiyun stripe_nr = div64_u64(*offset, map->stripe_len);
2616*4882a593Smuzhiyun stripe_nr = div_u64(stripe_nr, data_stripes);
2617*4882a593Smuzhiyun
2618*4882a593Smuzhiyun /* Work out the disk rotation on this stripe-set */
2619*4882a593Smuzhiyun stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
2620*4882a593Smuzhiyun /* calculate which stripe this data is located on */
2621*4882a593Smuzhiyun rot += i;
2622*4882a593Smuzhiyun stripe_index = rot % map->num_stripes;
2623*4882a593Smuzhiyun if (stripe_index == num)
2624*4882a593Smuzhiyun return 0;
2625*4882a593Smuzhiyun if (stripe_index < num)
2626*4882a593Smuzhiyun j++;
2627*4882a593Smuzhiyun }
2628*4882a593Smuzhiyun *offset = last_offset + j * map->stripe_len;
2629*4882a593Smuzhiyun return 1;
2630*4882a593Smuzhiyun }
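/*
 * Illustrative trace of get_raid56_logic_offset() above, with assumed
 * values: RAID5 over three devices (two data stripes plus parity),
 * stripe_len = 64K, num = 2 and a physical address at the very start of
 * that device's extent (last_offset = 0).  Rotation 0 places the data
 * stripes on devices 0 and 1, so neither loop iteration matches num: the
 * function returns 1 (parity stripe), sets *stripe_start to 0 (the
 * logical start of the full stripe) and *offset to last_offset +
 * 2 * stripe_len = 128K.
 */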
2631*4882a593Smuzhiyun
2632*4882a593Smuzhiyun static void scrub_free_parity(struct scrub_parity *sparity)
2633*4882a593Smuzhiyun {
2634*4882a593Smuzhiyun struct scrub_ctx *sctx = sparity->sctx;
2635*4882a593Smuzhiyun struct scrub_page *curr, *next;
2636*4882a593Smuzhiyun int nbits;
2637*4882a593Smuzhiyun
2638*4882a593Smuzhiyun nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
2639*4882a593Smuzhiyun if (nbits) {
2640*4882a593Smuzhiyun spin_lock(&sctx->stat_lock);
2641*4882a593Smuzhiyun sctx->stat.read_errors += nbits;
2642*4882a593Smuzhiyun sctx->stat.uncorrectable_errors += nbits;
2643*4882a593Smuzhiyun spin_unlock(&sctx->stat_lock);
2644*4882a593Smuzhiyun }
2645*4882a593Smuzhiyun
2646*4882a593Smuzhiyun list_for_each_entry_safe(curr, next, &sparity->spages, list) {
2647*4882a593Smuzhiyun list_del_init(&curr->list);
2648*4882a593Smuzhiyun scrub_page_put(curr);
2649*4882a593Smuzhiyun }
2650*4882a593Smuzhiyun
2651*4882a593Smuzhiyun kfree(sparity);
2652*4882a593Smuzhiyun }
2653*4882a593Smuzhiyun
2654*4882a593Smuzhiyun static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
2655*4882a593Smuzhiyun {
2656*4882a593Smuzhiyun struct scrub_parity *sparity = container_of(work, struct scrub_parity,
2657*4882a593Smuzhiyun work);
2658*4882a593Smuzhiyun struct scrub_ctx *sctx = sparity->sctx;
2659*4882a593Smuzhiyun
2660*4882a593Smuzhiyun scrub_free_parity(sparity);
2661*4882a593Smuzhiyun scrub_pending_bio_dec(sctx);
2662*4882a593Smuzhiyun }
2663*4882a593Smuzhiyun
2664*4882a593Smuzhiyun static void scrub_parity_bio_endio(struct bio *bio)
2665*4882a593Smuzhiyun {
2666*4882a593Smuzhiyun struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
2667*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
2668*4882a593Smuzhiyun
2669*4882a593Smuzhiyun if (bio->bi_status)
2670*4882a593Smuzhiyun bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2671*4882a593Smuzhiyun sparity->nsectors);
2672*4882a593Smuzhiyun
2673*4882a593Smuzhiyun bio_put(bio);
2674*4882a593Smuzhiyun
2675*4882a593Smuzhiyun btrfs_init_work(&sparity->work, scrub_parity_bio_endio_worker, NULL,
2676*4882a593Smuzhiyun NULL);
2677*4882a593Smuzhiyun btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work);
2678*4882a593Smuzhiyun }
2679*4882a593Smuzhiyun
2680*4882a593Smuzhiyun static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2681*4882a593Smuzhiyun {
2682*4882a593Smuzhiyun struct scrub_ctx *sctx = sparity->sctx;
2683*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = sctx->fs_info;
2684*4882a593Smuzhiyun struct bio *bio;
2685*4882a593Smuzhiyun struct btrfs_raid_bio *rbio;
2686*4882a593Smuzhiyun struct btrfs_bio *bbio = NULL;
2687*4882a593Smuzhiyun u64 length;
2688*4882a593Smuzhiyun int ret;
2689*4882a593Smuzhiyun
2690*4882a593Smuzhiyun if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
2691*4882a593Smuzhiyun sparity->nsectors))
2692*4882a593Smuzhiyun goto out;
2693*4882a593Smuzhiyun
2694*4882a593Smuzhiyun length = sparity->logic_end - sparity->logic_start;
2695*4882a593Smuzhiyun
2696*4882a593Smuzhiyun btrfs_bio_counter_inc_blocked(fs_info);
2697*4882a593Smuzhiyun ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
2698*4882a593Smuzhiyun &length, &bbio);
2699*4882a593Smuzhiyun if (ret || !bbio || !bbio->raid_map)
2700*4882a593Smuzhiyun goto bbio_out;
2701*4882a593Smuzhiyun
2702*4882a593Smuzhiyun bio = btrfs_io_bio_alloc(0);
2703*4882a593Smuzhiyun bio->bi_iter.bi_sector = sparity->logic_start >> 9;
2704*4882a593Smuzhiyun bio->bi_private = sparity;
2705*4882a593Smuzhiyun bio->bi_end_io = scrub_parity_bio_endio;
2706*4882a593Smuzhiyun
2707*4882a593Smuzhiyun rbio = raid56_parity_alloc_scrub_rbio(fs_info, bio, bbio,
2708*4882a593Smuzhiyun length, sparity->scrub_dev,
2709*4882a593Smuzhiyun sparity->dbitmap,
2710*4882a593Smuzhiyun sparity->nsectors);
2711*4882a593Smuzhiyun if (!rbio)
2712*4882a593Smuzhiyun goto rbio_out;
2713*4882a593Smuzhiyun
2714*4882a593Smuzhiyun scrub_pending_bio_inc(sctx);
2715*4882a593Smuzhiyun raid56_parity_submit_scrub_rbio(rbio);
2716*4882a593Smuzhiyun return;
2717*4882a593Smuzhiyun
2718*4882a593Smuzhiyun rbio_out:
2719*4882a593Smuzhiyun bio_put(bio);
2720*4882a593Smuzhiyun bbio_out:
2721*4882a593Smuzhiyun btrfs_bio_counter_dec(fs_info);
2722*4882a593Smuzhiyun btrfs_put_bbio(bbio);
2723*4882a593Smuzhiyun bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2724*4882a593Smuzhiyun sparity->nsectors);
2725*4882a593Smuzhiyun spin_lock(&sctx->stat_lock);
2726*4882a593Smuzhiyun sctx->stat.malloc_errors++;
2727*4882a593Smuzhiyun spin_unlock(&sctx->stat_lock);
2728*4882a593Smuzhiyun out:
2729*4882a593Smuzhiyun scrub_free_parity(sparity);
2730*4882a593Smuzhiyun }
2731*4882a593Smuzhiyun
2732*4882a593Smuzhiyun static inline int scrub_calc_parity_bitmap_len(int nsectors)
2733*4882a593Smuzhiyun {
2734*4882a593Smuzhiyun return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long);
2735*4882a593Smuzhiyun }
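/*
 * Example of the bitmap length calculation above, assuming 64-bit longs:
 * nsectors = 16 gives DIV_ROUND_UP(16, 64) = 1 long, i.e. 8 bytes per
 * bitmap; scrub_raid56_parity() then allocates 2 * bitmap_len extra bytes
 * to hold both dbitmap and ebitmap.
 */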
2736*4882a593Smuzhiyun
2737*4882a593Smuzhiyun static void scrub_parity_get(struct scrub_parity *sparity)
2738*4882a593Smuzhiyun {
2739*4882a593Smuzhiyun refcount_inc(&sparity->refs);
2740*4882a593Smuzhiyun }
2741*4882a593Smuzhiyun
2742*4882a593Smuzhiyun static void scrub_parity_put(struct scrub_parity *sparity)
2743*4882a593Smuzhiyun {
2744*4882a593Smuzhiyun if (!refcount_dec_and_test(&sparity->refs))
2745*4882a593Smuzhiyun return;
2746*4882a593Smuzhiyun
2747*4882a593Smuzhiyun scrub_parity_check_and_repair(sparity);
2748*4882a593Smuzhiyun }
2749*4882a593Smuzhiyun
2750*4882a593Smuzhiyun static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
2751*4882a593Smuzhiyun struct map_lookup *map,
2752*4882a593Smuzhiyun struct btrfs_device *sdev,
2753*4882a593Smuzhiyun struct btrfs_path *path,
2754*4882a593Smuzhiyun u64 logic_start,
2755*4882a593Smuzhiyun u64 logic_end)
2756*4882a593Smuzhiyun {
2757*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = sctx->fs_info;
2758*4882a593Smuzhiyun struct btrfs_root *root = fs_info->extent_root;
2759*4882a593Smuzhiyun struct btrfs_root *csum_root = fs_info->csum_root;
2760*4882a593Smuzhiyun struct btrfs_extent_item *extent;
2761*4882a593Smuzhiyun struct btrfs_bio *bbio = NULL;
2762*4882a593Smuzhiyun u64 flags;
2763*4882a593Smuzhiyun int ret;
2764*4882a593Smuzhiyun int slot;
2765*4882a593Smuzhiyun struct extent_buffer *l;
2766*4882a593Smuzhiyun struct btrfs_key key;
2767*4882a593Smuzhiyun u64 generation;
2768*4882a593Smuzhiyun u64 extent_logical;
2769*4882a593Smuzhiyun u64 extent_physical;
2770*4882a593Smuzhiyun u64 extent_len;
2771*4882a593Smuzhiyun u64 mapped_length;
2772*4882a593Smuzhiyun struct btrfs_device *extent_dev;
2773*4882a593Smuzhiyun struct scrub_parity *sparity;
2774*4882a593Smuzhiyun int nsectors;
2775*4882a593Smuzhiyun int bitmap_len;
2776*4882a593Smuzhiyun int extent_mirror_num;
2777*4882a593Smuzhiyun int stop_loop = 0;
2778*4882a593Smuzhiyun
2779*4882a593Smuzhiyun nsectors = div_u64(map->stripe_len, fs_info->sectorsize);
2780*4882a593Smuzhiyun bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
2781*4882a593Smuzhiyun sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
2782*4882a593Smuzhiyun GFP_NOFS);
2783*4882a593Smuzhiyun if (!sparity) {
2784*4882a593Smuzhiyun spin_lock(&sctx->stat_lock);
2785*4882a593Smuzhiyun sctx->stat.malloc_errors++;
2786*4882a593Smuzhiyun spin_unlock(&sctx->stat_lock);
2787*4882a593Smuzhiyun return -ENOMEM;
2788*4882a593Smuzhiyun }
2789*4882a593Smuzhiyun
2790*4882a593Smuzhiyun sparity->stripe_len = map->stripe_len;
2791*4882a593Smuzhiyun sparity->nsectors = nsectors;
2792*4882a593Smuzhiyun sparity->sctx = sctx;
2793*4882a593Smuzhiyun sparity->scrub_dev = sdev;
2794*4882a593Smuzhiyun sparity->logic_start = logic_start;
2795*4882a593Smuzhiyun sparity->logic_end = logic_end;
2796*4882a593Smuzhiyun refcount_set(&sparity->refs, 1);
2797*4882a593Smuzhiyun INIT_LIST_HEAD(&sparity->spages);
2798*4882a593Smuzhiyun sparity->dbitmap = sparity->bitmap;
2799*4882a593Smuzhiyun sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
2800*4882a593Smuzhiyun
2801*4882a593Smuzhiyun ret = 0;
2802*4882a593Smuzhiyun while (logic_start < logic_end) {
2803*4882a593Smuzhiyun if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2804*4882a593Smuzhiyun key.type = BTRFS_METADATA_ITEM_KEY;
2805*4882a593Smuzhiyun else
2806*4882a593Smuzhiyun key.type = BTRFS_EXTENT_ITEM_KEY;
2807*4882a593Smuzhiyun key.objectid = logic_start;
2808*4882a593Smuzhiyun key.offset = (u64)-1;
2809*4882a593Smuzhiyun
2810*4882a593Smuzhiyun ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2811*4882a593Smuzhiyun if (ret < 0)
2812*4882a593Smuzhiyun goto out;
2813*4882a593Smuzhiyun
2814*4882a593Smuzhiyun if (ret > 0) {
2815*4882a593Smuzhiyun ret = btrfs_previous_extent_item(root, path, 0);
2816*4882a593Smuzhiyun if (ret < 0)
2817*4882a593Smuzhiyun goto out;
2818*4882a593Smuzhiyun if (ret > 0) {
2819*4882a593Smuzhiyun btrfs_release_path(path);
2820*4882a593Smuzhiyun ret = btrfs_search_slot(NULL, root, &key,
2821*4882a593Smuzhiyun path, 0, 0);
2822*4882a593Smuzhiyun if (ret < 0)
2823*4882a593Smuzhiyun goto out;
2824*4882a593Smuzhiyun }
2825*4882a593Smuzhiyun }
2826*4882a593Smuzhiyun
2827*4882a593Smuzhiyun stop_loop = 0;
2828*4882a593Smuzhiyun while (1) {
2829*4882a593Smuzhiyun u64 bytes;
2830*4882a593Smuzhiyun
2831*4882a593Smuzhiyun l = path->nodes[0];
2832*4882a593Smuzhiyun slot = path->slots[0];
2833*4882a593Smuzhiyun if (slot >= btrfs_header_nritems(l)) {
2834*4882a593Smuzhiyun ret = btrfs_next_leaf(root, path);
2835*4882a593Smuzhiyun if (ret == 0)
2836*4882a593Smuzhiyun continue;
2837*4882a593Smuzhiyun if (ret < 0)
2838*4882a593Smuzhiyun goto out;
2839*4882a593Smuzhiyun
2840*4882a593Smuzhiyun stop_loop = 1;
2841*4882a593Smuzhiyun break;
2842*4882a593Smuzhiyun }
2843*4882a593Smuzhiyun btrfs_item_key_to_cpu(l, &key, slot);
2844*4882a593Smuzhiyun
2845*4882a593Smuzhiyun if (key.type != BTRFS_EXTENT_ITEM_KEY &&
2846*4882a593Smuzhiyun key.type != BTRFS_METADATA_ITEM_KEY)
2847*4882a593Smuzhiyun goto next;
2848*4882a593Smuzhiyun
2849*4882a593Smuzhiyun if (key.type == BTRFS_METADATA_ITEM_KEY)
2850*4882a593Smuzhiyun bytes = fs_info->nodesize;
2851*4882a593Smuzhiyun else
2852*4882a593Smuzhiyun bytes = key.offset;
2853*4882a593Smuzhiyun
2854*4882a593Smuzhiyun if (key.objectid + bytes <= logic_start)
2855*4882a593Smuzhiyun goto next;
2856*4882a593Smuzhiyun
2857*4882a593Smuzhiyun if (key.objectid >= logic_end) {
2858*4882a593Smuzhiyun stop_loop = 1;
2859*4882a593Smuzhiyun break;
2860*4882a593Smuzhiyun }
2861*4882a593Smuzhiyun
2862*4882a593Smuzhiyun while (key.objectid >= logic_start + map->stripe_len)
2863*4882a593Smuzhiyun logic_start += map->stripe_len;
2864*4882a593Smuzhiyun
2865*4882a593Smuzhiyun extent = btrfs_item_ptr(l, slot,
2866*4882a593Smuzhiyun struct btrfs_extent_item);
2867*4882a593Smuzhiyun flags = btrfs_extent_flags(l, extent);
2868*4882a593Smuzhiyun generation = btrfs_extent_generation(l, extent);
2869*4882a593Smuzhiyun
2870*4882a593Smuzhiyun if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
2871*4882a593Smuzhiyun (key.objectid < logic_start ||
2872*4882a593Smuzhiyun key.objectid + bytes >
2873*4882a593Smuzhiyun logic_start + map->stripe_len)) {
2874*4882a593Smuzhiyun btrfs_err(fs_info,
2875*4882a593Smuzhiyun "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
2876*4882a593Smuzhiyun key.objectid, logic_start);
2877*4882a593Smuzhiyun spin_lock(&sctx->stat_lock);
2878*4882a593Smuzhiyun sctx->stat.uncorrectable_errors++;
2879*4882a593Smuzhiyun spin_unlock(&sctx->stat_lock);
2880*4882a593Smuzhiyun goto next;
2881*4882a593Smuzhiyun }
2882*4882a593Smuzhiyun again:
2883*4882a593Smuzhiyun extent_logical = key.objectid;
2884*4882a593Smuzhiyun extent_len = bytes;
2885*4882a593Smuzhiyun
2886*4882a593Smuzhiyun if (extent_logical < logic_start) {
2887*4882a593Smuzhiyun extent_len -= logic_start - extent_logical;
2888*4882a593Smuzhiyun extent_logical = logic_start;
2889*4882a593Smuzhiyun }
2890*4882a593Smuzhiyun
2891*4882a593Smuzhiyun if (extent_logical + extent_len >
2892*4882a593Smuzhiyun logic_start + map->stripe_len)
2893*4882a593Smuzhiyun extent_len = logic_start + map->stripe_len -
2894*4882a593Smuzhiyun extent_logical;
2895*4882a593Smuzhiyun
2896*4882a593Smuzhiyun scrub_parity_mark_sectors_data(sparity, extent_logical,
2897*4882a593Smuzhiyun extent_len);
2898*4882a593Smuzhiyun
2899*4882a593Smuzhiyun mapped_length = extent_len;
2900*4882a593Smuzhiyun bbio = NULL;
2901*4882a593Smuzhiyun ret = btrfs_map_block(fs_info, BTRFS_MAP_READ,
2902*4882a593Smuzhiyun extent_logical, &mapped_length, &bbio,
2903*4882a593Smuzhiyun 0);
2904*4882a593Smuzhiyun if (!ret) {
2905*4882a593Smuzhiyun if (!bbio || mapped_length < extent_len)
2906*4882a593Smuzhiyun ret = -EIO;
2907*4882a593Smuzhiyun }
2908*4882a593Smuzhiyun if (ret) {
2909*4882a593Smuzhiyun btrfs_put_bbio(bbio);
2910*4882a593Smuzhiyun goto out;
2911*4882a593Smuzhiyun }
2912*4882a593Smuzhiyun extent_physical = bbio->stripes[0].physical;
2913*4882a593Smuzhiyun extent_mirror_num = bbio->mirror_num;
2914*4882a593Smuzhiyun extent_dev = bbio->stripes[0].dev;
2915*4882a593Smuzhiyun btrfs_put_bbio(bbio);
2916*4882a593Smuzhiyun
2917*4882a593Smuzhiyun ret = btrfs_lookup_csums_range(csum_root,
2918*4882a593Smuzhiyun extent_logical,
2919*4882a593Smuzhiyun extent_logical + extent_len - 1,
2920*4882a593Smuzhiyun &sctx->csum_list, 1);
2921*4882a593Smuzhiyun if (ret)
2922*4882a593Smuzhiyun goto out;
2923*4882a593Smuzhiyun
2924*4882a593Smuzhiyun ret = scrub_extent_for_parity(sparity, extent_logical,
2925*4882a593Smuzhiyun extent_len,
2926*4882a593Smuzhiyun extent_physical,
2927*4882a593Smuzhiyun extent_dev, flags,
2928*4882a593Smuzhiyun generation,
2929*4882a593Smuzhiyun extent_mirror_num);
2930*4882a593Smuzhiyun
2931*4882a593Smuzhiyun scrub_free_csums(sctx);
2932*4882a593Smuzhiyun
2933*4882a593Smuzhiyun if (ret)
2934*4882a593Smuzhiyun goto out;
2935*4882a593Smuzhiyun
2936*4882a593Smuzhiyun if (extent_logical + extent_len <
2937*4882a593Smuzhiyun key.objectid + bytes) {
2938*4882a593Smuzhiyun logic_start += map->stripe_len;
2939*4882a593Smuzhiyun
2940*4882a593Smuzhiyun if (logic_start >= logic_end) {
2941*4882a593Smuzhiyun stop_loop = 1;
2942*4882a593Smuzhiyun break;
2943*4882a593Smuzhiyun }
2944*4882a593Smuzhiyun
2945*4882a593Smuzhiyun if (logic_start < key.objectid + bytes) {
2946*4882a593Smuzhiyun cond_resched();
2947*4882a593Smuzhiyun goto again;
2948*4882a593Smuzhiyun }
2949*4882a593Smuzhiyun }
2950*4882a593Smuzhiyun next:
2951*4882a593Smuzhiyun path->slots[0]++;
2952*4882a593Smuzhiyun }
2953*4882a593Smuzhiyun
2954*4882a593Smuzhiyun btrfs_release_path(path);
2955*4882a593Smuzhiyun
2956*4882a593Smuzhiyun if (stop_loop)
2957*4882a593Smuzhiyun break;
2958*4882a593Smuzhiyun
2959*4882a593Smuzhiyun logic_start += map->stripe_len;
2960*4882a593Smuzhiyun }
2961*4882a593Smuzhiyun out:
2962*4882a593Smuzhiyun if (ret < 0)
2963*4882a593Smuzhiyun scrub_parity_mark_sectors_error(sparity, logic_start,
2964*4882a593Smuzhiyun logic_end - logic_start);
2965*4882a593Smuzhiyun scrub_parity_put(sparity);
2966*4882a593Smuzhiyun scrub_submit(sctx);
2967*4882a593Smuzhiyun mutex_lock(&sctx->wr_lock);
2968*4882a593Smuzhiyun scrub_wr_submit(sctx);
2969*4882a593Smuzhiyun mutex_unlock(&sctx->wr_lock);
2970*4882a593Smuzhiyun
2971*4882a593Smuzhiyun btrfs_release_path(path);
2972*4882a593Smuzhiyun return ret < 0 ? ret : 0;
2973*4882a593Smuzhiyun }
2974*4882a593Smuzhiyun
2975*4882a593Smuzhiyun static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2976*4882a593Smuzhiyun struct map_lookup *map,
2977*4882a593Smuzhiyun struct btrfs_device *scrub_dev,
2978*4882a593Smuzhiyun int num, u64 base, u64 length,
2979*4882a593Smuzhiyun struct btrfs_block_group *cache)
2980*4882a593Smuzhiyun {
2981*4882a593Smuzhiyun struct btrfs_path *path, *ppath;
2982*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = sctx->fs_info;
2983*4882a593Smuzhiyun struct btrfs_root *root = fs_info->extent_root;
2984*4882a593Smuzhiyun struct btrfs_root *csum_root = fs_info->csum_root;
2985*4882a593Smuzhiyun struct btrfs_extent_item *extent;
2986*4882a593Smuzhiyun struct blk_plug plug;
2987*4882a593Smuzhiyun u64 flags;
2988*4882a593Smuzhiyun int ret;
2989*4882a593Smuzhiyun int slot;
2990*4882a593Smuzhiyun u64 nstripes;
2991*4882a593Smuzhiyun struct extent_buffer *l;
2992*4882a593Smuzhiyun u64 physical;
2993*4882a593Smuzhiyun u64 logical;
2994*4882a593Smuzhiyun u64 logic_end;
2995*4882a593Smuzhiyun u64 physical_end;
2996*4882a593Smuzhiyun u64 generation;
2997*4882a593Smuzhiyun int mirror_num;
2998*4882a593Smuzhiyun struct reada_control *reada1;
2999*4882a593Smuzhiyun struct reada_control *reada2;
3000*4882a593Smuzhiyun struct btrfs_key key;
3001*4882a593Smuzhiyun struct btrfs_key key_end;
3002*4882a593Smuzhiyun u64 increment = map->stripe_len;
3003*4882a593Smuzhiyun u64 offset;
3004*4882a593Smuzhiyun u64 extent_logical;
3005*4882a593Smuzhiyun u64 extent_physical;
3006*4882a593Smuzhiyun u64 extent_len;
3007*4882a593Smuzhiyun u64 stripe_logical;
3008*4882a593Smuzhiyun u64 stripe_end;
3009*4882a593Smuzhiyun struct btrfs_device *extent_dev;
3010*4882a593Smuzhiyun int extent_mirror_num;
3011*4882a593Smuzhiyun int stop_loop = 0;
3012*4882a593Smuzhiyun
3013*4882a593Smuzhiyun physical = map->stripes[num].physical;
3014*4882a593Smuzhiyun offset = 0;
3015*4882a593Smuzhiyun nstripes = div64_u64(length, map->stripe_len);
3016*4882a593Smuzhiyun if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3017*4882a593Smuzhiyun offset = map->stripe_len * num;
3018*4882a593Smuzhiyun increment = map->stripe_len * map->num_stripes;
3019*4882a593Smuzhiyun mirror_num = 1;
3020*4882a593Smuzhiyun } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3021*4882a593Smuzhiyun int factor = map->num_stripes / map->sub_stripes;
3022*4882a593Smuzhiyun offset = map->stripe_len * (num / map->sub_stripes);
3023*4882a593Smuzhiyun increment = map->stripe_len * factor;
3024*4882a593Smuzhiyun mirror_num = num % map->sub_stripes + 1;
3025*4882a593Smuzhiyun } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
3026*4882a593Smuzhiyun increment = map->stripe_len;
3027*4882a593Smuzhiyun mirror_num = num % map->num_stripes + 1;
3028*4882a593Smuzhiyun } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3029*4882a593Smuzhiyun increment = map->stripe_len;
3030*4882a593Smuzhiyun mirror_num = num % map->num_stripes + 1;
3031*4882a593Smuzhiyun } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3032*4882a593Smuzhiyun get_raid56_logic_offset(physical, num, map, &offset, NULL);
3033*4882a593Smuzhiyun increment = map->stripe_len * nr_data_stripes(map);
3034*4882a593Smuzhiyun mirror_num = 1;
3035*4882a593Smuzhiyun } else {
3036*4882a593Smuzhiyun increment = map->stripe_len;
3037*4882a593Smuzhiyun mirror_num = 1;
3038*4882a593Smuzhiyun }
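/*
 * Illustrative geometry for the RAID10 branch above, with assumed
 * values: num_stripes = 4 and sub_stripes = 2 give factor = 2; for
 * num = 3 this yields offset = stripe_len * 1, increment =
 * stripe_len * 2 and mirror_num = 3 % 2 + 1 = 2.
 */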
3039*4882a593Smuzhiyun
3040*4882a593Smuzhiyun path = btrfs_alloc_path();
3041*4882a593Smuzhiyun if (!path)
3042*4882a593Smuzhiyun return -ENOMEM;
3043*4882a593Smuzhiyun
3044*4882a593Smuzhiyun ppath = btrfs_alloc_path();
3045*4882a593Smuzhiyun if (!ppath) {
3046*4882a593Smuzhiyun btrfs_free_path(path);
3047*4882a593Smuzhiyun return -ENOMEM;
3048*4882a593Smuzhiyun }
3049*4882a593Smuzhiyun
3050*4882a593Smuzhiyun /*
3051*4882a593Smuzhiyun * Work on the commit root. The related disk blocks are static as
3052*4882a593Smuzhiyun * long as COW is applied. This means it is safe to rewrite them
3053*4882a593Smuzhiyun * to repair disk errors without any race conditions.
3054*4882a593Smuzhiyun */
3055*4882a593Smuzhiyun path->search_commit_root = 1;
3056*4882a593Smuzhiyun path->skip_locking = 1;
3057*4882a593Smuzhiyun
3058*4882a593Smuzhiyun ppath->search_commit_root = 1;
3059*4882a593Smuzhiyun ppath->skip_locking = 1;
3060*4882a593Smuzhiyun /*
3061*4882a593Smuzhiyun * Trigger the readahead for the extent tree and csum tree and wait
3062*4882a593Smuzhiyun * for completion. During readahead, the scrub is officially paused
3063*4882a593Smuzhiyun * so that it does not hold off transaction commits.
3064*4882a593Smuzhiyun */
3065*4882a593Smuzhiyun logical = base + offset;
3066*4882a593Smuzhiyun physical_end = physical + nstripes * map->stripe_len;
3067*4882a593Smuzhiyun if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3068*4882a593Smuzhiyun get_raid56_logic_offset(physical_end, num,
3069*4882a593Smuzhiyun map, &logic_end, NULL);
3070*4882a593Smuzhiyun logic_end += base;
3071*4882a593Smuzhiyun } else {
3072*4882a593Smuzhiyun logic_end = logical + increment * nstripes;
3073*4882a593Smuzhiyun }
3074*4882a593Smuzhiyun wait_event(sctx->list_wait,
3075*4882a593Smuzhiyun atomic_read(&sctx->bios_in_flight) == 0);
3076*4882a593Smuzhiyun scrub_blocked_if_needed(fs_info);
3077*4882a593Smuzhiyun
3078*4882a593Smuzhiyun /* FIXME it might be better to start readahead at commit root */
3079*4882a593Smuzhiyun key.objectid = logical;
3080*4882a593Smuzhiyun key.type = BTRFS_EXTENT_ITEM_KEY;
3081*4882a593Smuzhiyun key.offset = (u64)0;
3082*4882a593Smuzhiyun key_end.objectid = logic_end;
3083*4882a593Smuzhiyun key_end.type = BTRFS_METADATA_ITEM_KEY;
3084*4882a593Smuzhiyun key_end.offset = (u64)-1;
3085*4882a593Smuzhiyun reada1 = btrfs_reada_add(root, &key, &key_end);
3086*4882a593Smuzhiyun
3087*4882a593Smuzhiyun key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3088*4882a593Smuzhiyun key.type = BTRFS_EXTENT_CSUM_KEY;
3089*4882a593Smuzhiyun key.offset = logical;
3090*4882a593Smuzhiyun key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3091*4882a593Smuzhiyun key_end.type = BTRFS_EXTENT_CSUM_KEY;
3092*4882a593Smuzhiyun key_end.offset = logic_end;
3093*4882a593Smuzhiyun reada2 = btrfs_reada_add(csum_root, &key, &key_end);
3094*4882a593Smuzhiyun
3095*4882a593Smuzhiyun if (!IS_ERR(reada1))
3096*4882a593Smuzhiyun btrfs_reada_wait(reada1);
3097*4882a593Smuzhiyun if (!IS_ERR(reada2))
3098*4882a593Smuzhiyun btrfs_reada_wait(reada2);
3099*4882a593Smuzhiyun
3100*4882a593Smuzhiyun
3101*4882a593Smuzhiyun /*
3102*4882a593Smuzhiyun * collect all data csums for the stripe to avoid seeking during
3103*4882a593Smuzhiyun * the scrub. This might currently (crc32) end up being about 1MB.
3104*4882a593Smuzhiyun */
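/*
 * Rough arithmetic behind the "about 1MB" estimate above, assuming 4K
 * sectors and 4-byte crc32 checksums: 1MB of checksum data covers
 * 1MB / 4 bytes = 262144 sectors, i.e. roughly 1GiB of data.
 */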
3105*4882a593Smuzhiyun blk_start_plug(&plug);
3106*4882a593Smuzhiyun
3107*4882a593Smuzhiyun /*
3108*4882a593Smuzhiyun * now find all extents for each stripe and scrub them
3109*4882a593Smuzhiyun */
3110*4882a593Smuzhiyun ret = 0;
3111*4882a593Smuzhiyun while (physical < physical_end) {
3112*4882a593Smuzhiyun /*
3113*4882a593Smuzhiyun * canceled?
3114*4882a593Smuzhiyun */
3115*4882a593Smuzhiyun if (atomic_read(&fs_info->scrub_cancel_req) ||
3116*4882a593Smuzhiyun atomic_read(&sctx->cancel_req)) {
3117*4882a593Smuzhiyun ret = -ECANCELED;
3118*4882a593Smuzhiyun goto out;
3119*4882a593Smuzhiyun }
3120*4882a593Smuzhiyun /*
3121*4882a593Smuzhiyun * check to see if we have to pause
3122*4882a593Smuzhiyun */
3123*4882a593Smuzhiyun if (atomic_read(&fs_info->scrub_pause_req)) {
3124*4882a593Smuzhiyun /* push queued extents */
3125*4882a593Smuzhiyun sctx->flush_all_writes = true;
3126*4882a593Smuzhiyun scrub_submit(sctx);
3127*4882a593Smuzhiyun mutex_lock(&sctx->wr_lock);
3128*4882a593Smuzhiyun scrub_wr_submit(sctx);
3129*4882a593Smuzhiyun mutex_unlock(&sctx->wr_lock);
3130*4882a593Smuzhiyun wait_event(sctx->list_wait,
3131*4882a593Smuzhiyun atomic_read(&sctx->bios_in_flight) == 0);
3132*4882a593Smuzhiyun sctx->flush_all_writes = false;
3133*4882a593Smuzhiyun scrub_blocked_if_needed(fs_info);
3134*4882a593Smuzhiyun }
3135*4882a593Smuzhiyun
3136*4882a593Smuzhiyun if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3137*4882a593Smuzhiyun ret = get_raid56_logic_offset(physical, num, map,
3138*4882a593Smuzhiyun &logical,
3139*4882a593Smuzhiyun &stripe_logical);
3140*4882a593Smuzhiyun logical += base;
3141*4882a593Smuzhiyun if (ret) {
3142*4882a593Smuzhiyun /* it is a parity stripe */
3143*4882a593Smuzhiyun stripe_logical += base;
3144*4882a593Smuzhiyun stripe_end = stripe_logical + increment;
3145*4882a593Smuzhiyun ret = scrub_raid56_parity(sctx, map, scrub_dev,
3146*4882a593Smuzhiyun ppath, stripe_logical,
3147*4882a593Smuzhiyun stripe_end);
3148*4882a593Smuzhiyun if (ret)
3149*4882a593Smuzhiyun goto out;
3150*4882a593Smuzhiyun goto skip;
3151*4882a593Smuzhiyun }
3152*4882a593Smuzhiyun }
3153*4882a593Smuzhiyun
3154*4882a593Smuzhiyun if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3155*4882a593Smuzhiyun key.type = BTRFS_METADATA_ITEM_KEY;
3156*4882a593Smuzhiyun else
3157*4882a593Smuzhiyun key.type = BTRFS_EXTENT_ITEM_KEY;
3158*4882a593Smuzhiyun key.objectid = logical;
3159*4882a593Smuzhiyun key.offset = (u64)-1;
3160*4882a593Smuzhiyun
3161*4882a593Smuzhiyun ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3162*4882a593Smuzhiyun if (ret < 0)
3163*4882a593Smuzhiyun goto out;
3164*4882a593Smuzhiyun
3165*4882a593Smuzhiyun if (ret > 0) {
3166*4882a593Smuzhiyun ret = btrfs_previous_extent_item(root, path, 0);
3167*4882a593Smuzhiyun if (ret < 0)
3168*4882a593Smuzhiyun goto out;
3169*4882a593Smuzhiyun if (ret > 0) {
3170*4882a593Smuzhiyun /* there's no smaller item, so stick with the
3171*4882a593Smuzhiyun * larger one */
3172*4882a593Smuzhiyun btrfs_release_path(path);
3173*4882a593Smuzhiyun ret = btrfs_search_slot(NULL, root, &key,
3174*4882a593Smuzhiyun path, 0, 0);
3175*4882a593Smuzhiyun if (ret < 0)
3176*4882a593Smuzhiyun goto out;
3177*4882a593Smuzhiyun }
3178*4882a593Smuzhiyun }
3179*4882a593Smuzhiyun
3180*4882a593Smuzhiyun stop_loop = 0;
3181*4882a593Smuzhiyun while (1) {
3182*4882a593Smuzhiyun u64 bytes;
3183*4882a593Smuzhiyun
3184*4882a593Smuzhiyun l = path->nodes[0];
3185*4882a593Smuzhiyun slot = path->slots[0];
3186*4882a593Smuzhiyun if (slot >= btrfs_header_nritems(l)) {
3187*4882a593Smuzhiyun ret = btrfs_next_leaf(root, path);
3188*4882a593Smuzhiyun if (ret == 0)
3189*4882a593Smuzhiyun continue;
3190*4882a593Smuzhiyun if (ret < 0)
3191*4882a593Smuzhiyun goto out;
3192*4882a593Smuzhiyun
3193*4882a593Smuzhiyun stop_loop = 1;
3194*4882a593Smuzhiyun break;
3195*4882a593Smuzhiyun }
3196*4882a593Smuzhiyun btrfs_item_key_to_cpu(l, &key, slot);
3197*4882a593Smuzhiyun
3198*4882a593Smuzhiyun if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3199*4882a593Smuzhiyun key.type != BTRFS_METADATA_ITEM_KEY)
3200*4882a593Smuzhiyun goto next;
3201*4882a593Smuzhiyun
3202*4882a593Smuzhiyun if (key.type == BTRFS_METADATA_ITEM_KEY)
3203*4882a593Smuzhiyun bytes = fs_info->nodesize;
3204*4882a593Smuzhiyun else
3205*4882a593Smuzhiyun bytes = key.offset;
3206*4882a593Smuzhiyun
3207*4882a593Smuzhiyun if (key.objectid + bytes <= logical)
3208*4882a593Smuzhiyun goto next;
3209*4882a593Smuzhiyun
3210*4882a593Smuzhiyun if (key.objectid >= logical + map->stripe_len) {
3211*4882a593Smuzhiyun /* out of this device extent */
3212*4882a593Smuzhiyun if (key.objectid >= logic_end)
3213*4882a593Smuzhiyun stop_loop = 1;
3214*4882a593Smuzhiyun break;
3215*4882a593Smuzhiyun }
3216*4882a593Smuzhiyun
3217*4882a593Smuzhiyun /*
3218*4882a593Smuzhiyun * If our block group was removed in the meanwhile, just
3219*4882a593Smuzhiyun * stop scrubbing since there is no point in continuing.
3220*4882a593Smuzhiyun * Continuing would prevent reusing its device extents
3221*4882a593Smuzhiyun * for new block groups for a long time.
3222*4882a593Smuzhiyun */
3223*4882a593Smuzhiyun spin_lock(&cache->lock);
3224*4882a593Smuzhiyun if (cache->removed) {
3225*4882a593Smuzhiyun spin_unlock(&cache->lock);
3226*4882a593Smuzhiyun ret = 0;
3227*4882a593Smuzhiyun goto out;
3228*4882a593Smuzhiyun }
3229*4882a593Smuzhiyun spin_unlock(&cache->lock);
3230*4882a593Smuzhiyun
3231*4882a593Smuzhiyun extent = btrfs_item_ptr(l, slot,
3232*4882a593Smuzhiyun struct btrfs_extent_item);
3233*4882a593Smuzhiyun flags = btrfs_extent_flags(l, extent);
3234*4882a593Smuzhiyun generation = btrfs_extent_generation(l, extent);
3235*4882a593Smuzhiyun
3236*4882a593Smuzhiyun if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3237*4882a593Smuzhiyun (key.objectid < logical ||
3238*4882a593Smuzhiyun key.objectid + bytes >
3239*4882a593Smuzhiyun logical + map->stripe_len)) {
3240*4882a593Smuzhiyun btrfs_err(fs_info,
3241*4882a593Smuzhiyun "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3242*4882a593Smuzhiyun key.objectid, logical);
3243*4882a593Smuzhiyun spin_lock(&sctx->stat_lock);
3244*4882a593Smuzhiyun sctx->stat.uncorrectable_errors++;
3245*4882a593Smuzhiyun spin_unlock(&sctx->stat_lock);
3246*4882a593Smuzhiyun goto next;
3247*4882a593Smuzhiyun }
3248*4882a593Smuzhiyun
3249*4882a593Smuzhiyun again:
3250*4882a593Smuzhiyun extent_logical = key.objectid;
3251*4882a593Smuzhiyun extent_len = bytes;
3252*4882a593Smuzhiyun
3253*4882a593Smuzhiyun /*
3254*4882a593Smuzhiyun * trim extent to this stripe
3255*4882a593Smuzhiyun */
3256*4882a593Smuzhiyun if (extent_logical < logical) {
3257*4882a593Smuzhiyun extent_len -= logical - extent_logical;
3258*4882a593Smuzhiyun extent_logical = logical;
3259*4882a593Smuzhiyun }
3260*4882a593Smuzhiyun if (extent_logical + extent_len >
3261*4882a593Smuzhiyun logical + map->stripe_len) {
3262*4882a593Smuzhiyun extent_len = logical + map->stripe_len -
3263*4882a593Smuzhiyun extent_logical;
3264*4882a593Smuzhiyun }
3265*4882a593Smuzhiyun
3266*4882a593Smuzhiyun extent_physical = extent_logical - logical + physical;
3267*4882a593Smuzhiyun extent_dev = scrub_dev;
3268*4882a593Smuzhiyun extent_mirror_num = mirror_num;
3269*4882a593Smuzhiyun if (sctx->is_dev_replace)
3270*4882a593Smuzhiyun scrub_remap_extent(fs_info, extent_logical,
3271*4882a593Smuzhiyun extent_len, &extent_physical,
3272*4882a593Smuzhiyun &extent_dev,
3273*4882a593Smuzhiyun &extent_mirror_num);
3274*4882a593Smuzhiyun
3275*4882a593Smuzhiyun if (flags & BTRFS_EXTENT_FLAG_DATA) {
3276*4882a593Smuzhiyun ret = btrfs_lookup_csums_range(csum_root,
3277*4882a593Smuzhiyun extent_logical,
3278*4882a593Smuzhiyun extent_logical + extent_len - 1,
3279*4882a593Smuzhiyun &sctx->csum_list, 1);
3280*4882a593Smuzhiyun if (ret)
3281*4882a593Smuzhiyun goto out;
3282*4882a593Smuzhiyun }
3283*4882a593Smuzhiyun
3284*4882a593Smuzhiyun ret = scrub_extent(sctx, map, extent_logical, extent_len,
3285*4882a593Smuzhiyun extent_physical, extent_dev, flags,
3286*4882a593Smuzhiyun generation, extent_mirror_num,
3287*4882a593Smuzhiyun extent_logical - logical + physical);
3288*4882a593Smuzhiyun
3289*4882a593Smuzhiyun scrub_free_csums(sctx);
3290*4882a593Smuzhiyun
3291*4882a593Smuzhiyun if (ret)
3292*4882a593Smuzhiyun goto out;
3293*4882a593Smuzhiyun
3294*4882a593Smuzhiyun if (extent_logical + extent_len <
3295*4882a593Smuzhiyun key.objectid + bytes) {
3296*4882a593Smuzhiyun if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3297*4882a593Smuzhiyun /*
3298*4882a593Smuzhiyun * loop until we find the next data stripe
3299*4882a593Smuzhiyun * or we have finished all stripes.
3300*4882a593Smuzhiyun */
3301*4882a593Smuzhiyun loop:
3302*4882a593Smuzhiyun physical += map->stripe_len;
3303*4882a593Smuzhiyun ret = get_raid56_logic_offset(physical,
3304*4882a593Smuzhiyun num, map, &logical,
3305*4882a593Smuzhiyun &stripe_logical);
3306*4882a593Smuzhiyun logical += base;
3307*4882a593Smuzhiyun
3308*4882a593Smuzhiyun if (ret && physical < physical_end) {
3309*4882a593Smuzhiyun stripe_logical += base;
3310*4882a593Smuzhiyun stripe_end = stripe_logical +
3311*4882a593Smuzhiyun increment;
3312*4882a593Smuzhiyun ret = scrub_raid56_parity(sctx,
3313*4882a593Smuzhiyun map, scrub_dev, ppath,
3314*4882a593Smuzhiyun stripe_logical,
3315*4882a593Smuzhiyun stripe_end);
3316*4882a593Smuzhiyun if (ret)
3317*4882a593Smuzhiyun goto out;
3318*4882a593Smuzhiyun goto loop;
3319*4882a593Smuzhiyun }
3320*4882a593Smuzhiyun } else {
3321*4882a593Smuzhiyun physical += map->stripe_len;
3322*4882a593Smuzhiyun logical += increment;
3323*4882a593Smuzhiyun }
3324*4882a593Smuzhiyun if (logical < key.objectid + bytes) {
3325*4882a593Smuzhiyun cond_resched();
3326*4882a593Smuzhiyun goto again;
3327*4882a593Smuzhiyun }
3328*4882a593Smuzhiyun
3329*4882a593Smuzhiyun if (physical >= physical_end) {
3330*4882a593Smuzhiyun stop_loop = 1;
3331*4882a593Smuzhiyun break;
3332*4882a593Smuzhiyun }
3333*4882a593Smuzhiyun }
3334*4882a593Smuzhiyun next:
3335*4882a593Smuzhiyun path->slots[0]++;
3336*4882a593Smuzhiyun }
3337*4882a593Smuzhiyun btrfs_release_path(path);
3338*4882a593Smuzhiyun skip:
3339*4882a593Smuzhiyun logical += increment;
3340*4882a593Smuzhiyun physical += map->stripe_len;
3341*4882a593Smuzhiyun spin_lock(&sctx->stat_lock);
3342*4882a593Smuzhiyun if (stop_loop)
3343*4882a593Smuzhiyun sctx->stat.last_physical = map->stripes[num].physical +
3344*4882a593Smuzhiyun length;
3345*4882a593Smuzhiyun else
3346*4882a593Smuzhiyun sctx->stat.last_physical = physical;
3347*4882a593Smuzhiyun spin_unlock(&sctx->stat_lock);
3348*4882a593Smuzhiyun if (stop_loop)
3349*4882a593Smuzhiyun break;
3350*4882a593Smuzhiyun }
3351*4882a593Smuzhiyun out:
3352*4882a593Smuzhiyun /* push queued extents */
3353*4882a593Smuzhiyun scrub_submit(sctx);
3354*4882a593Smuzhiyun mutex_lock(&sctx->wr_lock);
3355*4882a593Smuzhiyun scrub_wr_submit(sctx);
3356*4882a593Smuzhiyun mutex_unlock(&sctx->wr_lock);
3357*4882a593Smuzhiyun
3358*4882a593Smuzhiyun blk_finish_plug(&plug);
3359*4882a593Smuzhiyun btrfs_free_path(path);
3360*4882a593Smuzhiyun btrfs_free_path(ppath);
3361*4882a593Smuzhiyun return ret < 0 ? ret : 0;
3362*4882a593Smuzhiyun }
3363*4882a593Smuzhiyun
3364*4882a593Smuzhiyun static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
3365*4882a593Smuzhiyun struct btrfs_device *scrub_dev,
3366*4882a593Smuzhiyun u64 chunk_offset, u64 length,
3367*4882a593Smuzhiyun u64 dev_offset,
3368*4882a593Smuzhiyun struct btrfs_block_group *cache)
3369*4882a593Smuzhiyun {
3370*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = sctx->fs_info;
3371*4882a593Smuzhiyun struct extent_map_tree *map_tree = &fs_info->mapping_tree;
3372*4882a593Smuzhiyun struct map_lookup *map;
3373*4882a593Smuzhiyun struct extent_map *em;
3374*4882a593Smuzhiyun int i;
3375*4882a593Smuzhiyun int ret = 0;
3376*4882a593Smuzhiyun
3377*4882a593Smuzhiyun read_lock(&map_tree->lock);
3378*4882a593Smuzhiyun em = lookup_extent_mapping(map_tree, chunk_offset, 1);
3379*4882a593Smuzhiyun read_unlock(&map_tree->lock);
3380*4882a593Smuzhiyun
3381*4882a593Smuzhiyun if (!em) {
3382*4882a593Smuzhiyun /*
3383*4882a593Smuzhiyun * Might have been an unused block group deleted by the cleaner
3384*4882a593Smuzhiyun * kthread or relocation.
3385*4882a593Smuzhiyun */
3386*4882a593Smuzhiyun spin_lock(&cache->lock);
3387*4882a593Smuzhiyun if (!cache->removed)
3388*4882a593Smuzhiyun ret = -EINVAL;
3389*4882a593Smuzhiyun spin_unlock(&cache->lock);
3390*4882a593Smuzhiyun
3391*4882a593Smuzhiyun return ret;
3392*4882a593Smuzhiyun }
3393*4882a593Smuzhiyun
3394*4882a593Smuzhiyun map = em->map_lookup;
3395*4882a593Smuzhiyun if (em->start != chunk_offset)
3396*4882a593Smuzhiyun goto out;
3397*4882a593Smuzhiyun
3398*4882a593Smuzhiyun if (em->len < length)
3399*4882a593Smuzhiyun goto out;
3400*4882a593Smuzhiyun
3401*4882a593Smuzhiyun for (i = 0; i < map->num_stripes; ++i) {
3402*4882a593Smuzhiyun if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
3403*4882a593Smuzhiyun map->stripes[i].physical == dev_offset) {
3404*4882a593Smuzhiyun ret = scrub_stripe(sctx, map, scrub_dev, i,
3405*4882a593Smuzhiyun chunk_offset, length, cache);
3406*4882a593Smuzhiyun if (ret)
3407*4882a593Smuzhiyun goto out;
3408*4882a593Smuzhiyun }
3409*4882a593Smuzhiyun }
3410*4882a593Smuzhiyun out:
3411*4882a593Smuzhiyun free_extent_map(em);
3412*4882a593Smuzhiyun
3413*4882a593Smuzhiyun return ret;
3414*4882a593Smuzhiyun }
3415*4882a593Smuzhiyun
3416*4882a593Smuzhiyun static noinline_for_stack
3417*4882a593Smuzhiyun int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3418*4882a593Smuzhiyun struct btrfs_device *scrub_dev, u64 start, u64 end)
3419*4882a593Smuzhiyun {
3420*4882a593Smuzhiyun struct btrfs_dev_extent *dev_extent = NULL;
3421*4882a593Smuzhiyun struct btrfs_path *path;
3422*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = sctx->fs_info;
3423*4882a593Smuzhiyun struct btrfs_root *root = fs_info->dev_root;
3424*4882a593Smuzhiyun u64 length;
3425*4882a593Smuzhiyun u64 chunk_offset;
3426*4882a593Smuzhiyun int ret = 0;
3427*4882a593Smuzhiyun int ro_set;
3428*4882a593Smuzhiyun int slot;
3429*4882a593Smuzhiyun struct extent_buffer *l;
3430*4882a593Smuzhiyun struct btrfs_key key;
3431*4882a593Smuzhiyun struct btrfs_key found_key;
3432*4882a593Smuzhiyun struct btrfs_block_group *cache;
3433*4882a593Smuzhiyun struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
3434*4882a593Smuzhiyun
3435*4882a593Smuzhiyun path = btrfs_alloc_path();
3436*4882a593Smuzhiyun if (!path)
3437*4882a593Smuzhiyun return -ENOMEM;
3438*4882a593Smuzhiyun
3439*4882a593Smuzhiyun path->reada = READA_FORWARD;
3440*4882a593Smuzhiyun path->search_commit_root = 1;
3441*4882a593Smuzhiyun path->skip_locking = 1;
3442*4882a593Smuzhiyun
3443*4882a593Smuzhiyun key.objectid = scrub_dev->devid;
3444*4882a593Smuzhiyun key.offset = 0ull;
3445*4882a593Smuzhiyun key.type = BTRFS_DEV_EXTENT_KEY;
3446*4882a593Smuzhiyun
3447*4882a593Smuzhiyun while (1) {
3448*4882a593Smuzhiyun ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3449*4882a593Smuzhiyun if (ret < 0)
3450*4882a593Smuzhiyun break;
3451*4882a593Smuzhiyun if (ret > 0) {
3452*4882a593Smuzhiyun if (path->slots[0] >=
3453*4882a593Smuzhiyun btrfs_header_nritems(path->nodes[0])) {
3454*4882a593Smuzhiyun ret = btrfs_next_leaf(root, path);
3455*4882a593Smuzhiyun if (ret < 0)
3456*4882a593Smuzhiyun break;
3457*4882a593Smuzhiyun if (ret > 0) {
3458*4882a593Smuzhiyun ret = 0;
3459*4882a593Smuzhiyun break;
3460*4882a593Smuzhiyun }
3461*4882a593Smuzhiyun } else {
3462*4882a593Smuzhiyun ret = 0;
3463*4882a593Smuzhiyun }
3464*4882a593Smuzhiyun }
3465*4882a593Smuzhiyun
3466*4882a593Smuzhiyun l = path->nodes[0];
3467*4882a593Smuzhiyun slot = path->slots[0];
3468*4882a593Smuzhiyun
3469*4882a593Smuzhiyun btrfs_item_key_to_cpu(l, &found_key, slot);
3470*4882a593Smuzhiyun
3471*4882a593Smuzhiyun if (found_key.objectid != scrub_dev->devid)
3472*4882a593Smuzhiyun break;
3473*4882a593Smuzhiyun
3474*4882a593Smuzhiyun if (found_key.type != BTRFS_DEV_EXTENT_KEY)
3475*4882a593Smuzhiyun break;
3476*4882a593Smuzhiyun
3477*4882a593Smuzhiyun if (found_key.offset >= end)
3478*4882a593Smuzhiyun break;
3479*4882a593Smuzhiyun
3480*4882a593Smuzhiyun if (found_key.offset < key.offset)
3481*4882a593Smuzhiyun break;
3482*4882a593Smuzhiyun
3483*4882a593Smuzhiyun dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3484*4882a593Smuzhiyun length = btrfs_dev_extent_length(l, dev_extent);
3485*4882a593Smuzhiyun
3486*4882a593Smuzhiyun if (found_key.offset + length <= start)
3487*4882a593Smuzhiyun goto skip;
3488*4882a593Smuzhiyun
3489*4882a593Smuzhiyun chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3490*4882a593Smuzhiyun
3491*4882a593Smuzhiyun /*
3492*4882a593Smuzhiyun * get a reference on the corresponding block group to prevent
3493*4882a593Smuzhiyun * the chunk from going away while we scrub it
3494*4882a593Smuzhiyun */
3495*4882a593Smuzhiyun cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3496*4882a593Smuzhiyun
3497*4882a593Smuzhiyun /* Some chunks are removed but not committed to disk yet;
3498*4882a593Smuzhiyun * continue scrubbing. */
3499*4882a593Smuzhiyun if (!cache)
3500*4882a593Smuzhiyun goto skip;
3501*4882a593Smuzhiyun
3502*4882a593Smuzhiyun /*
3503*4882a593Smuzhiyun * Make sure that while we are scrubbing the corresponding block
3504*4882a593Smuzhiyun * group doesn't get its logical address and its device extents
3505*4882a593Smuzhiyun * reused for another block group, which can possibly be of a
3506*4882a593Smuzhiyun * different type and different profile. We do this to prevent
3507*4882a593Smuzhiyun * false error detections and crashes due to bogus attempts to
3508*4882a593Smuzhiyun * repair extents.
3509*4882a593Smuzhiyun */
3510*4882a593Smuzhiyun spin_lock(&cache->lock);
3511*4882a593Smuzhiyun if (cache->removed) {
3512*4882a593Smuzhiyun spin_unlock(&cache->lock);
3513*4882a593Smuzhiyun btrfs_put_block_group(cache);
3514*4882a593Smuzhiyun goto skip;
3515*4882a593Smuzhiyun }
3516*4882a593Smuzhiyun btrfs_freeze_block_group(cache);
3517*4882a593Smuzhiyun spin_unlock(&cache->lock);
3518*4882a593Smuzhiyun
3519*4882a593Smuzhiyun /*
3520*4882a593Smuzhiyun * We need to call btrfs_inc_block_group_ro() with scrubs_paused,
3521*4882a593Smuzhiyun * to avoid a deadlock caused by:
3522*4882a593Smuzhiyun * btrfs_inc_block_group_ro()
3523*4882a593Smuzhiyun * -> btrfs_wait_for_commit()
3524*4882a593Smuzhiyun * -> btrfs_commit_transaction()
3525*4882a593Smuzhiyun * -> btrfs_scrub_pause()
3526*4882a593Smuzhiyun */
3527*4882a593Smuzhiyun scrub_pause_on(fs_info);
3528*4882a593Smuzhiyun
3529*4882a593Smuzhiyun /*
3530*4882a593Smuzhiyun * Don't do chunk preallocation for scrub.
3531*4882a593Smuzhiyun *
3532*4882a593Smuzhiyun * This is especially important for SYSTEM bgs, or we can hit
3533*4882a593Smuzhiyun * -EFBIG from btrfs_finish_chunk_alloc() like:
3534*4882a593Smuzhiyun * 1. The only SYSTEM bg is marked RO.
3535*4882a593Smuzhiyun * Since SYSTEM bg is small, that's pretty common.
3536*4882a593Smuzhiyun * 2. New SYSTEM bg will be allocated
3537*4882a593Smuzhiyun * Because the regular version will allocate a new chunk.
3538*4882a593Smuzhiyun * 3. New SYSTEM bg is empty and will get cleaned up
3539*4882a593Smuzhiyun * Before cleanup really happens, it's marked RO again.
3540*4882a593Smuzhiyun * 4. Empty SYSTEM bg gets scrubbed
3541*4882a593Smuzhiyun * We go back to 2.
3542*4882a593Smuzhiyun *
3543*4882a593Smuzhiyun * This can easily inflate the number of SYSTEM chunks if the
3544*4882a593Smuzhiyun * cleaner thread can't be triggered fast enough, and use up all
3545*4882a593Smuzhiyun * the space of btrfs_super_block::sys_chunk_array.
3546*4882a593Smuzhiyun *
3547*4882a593Smuzhiyun * While for dev replace, we need to try our best to mark the block
3548*4882a593Smuzhiyun * group RO, to prevent a race between:
3549*4882a593Smuzhiyun * - Write duplication
3550*4882a593Smuzhiyun * Contains latest data
3551*4882a593Smuzhiyun * - Scrub copy
3552*4882a593Smuzhiyun * Contains data from commit tree
3553*4882a593Smuzhiyun *
3554*4882a593Smuzhiyun * If the target block group is not marked RO, nocow writes can
3555*4882a593Smuzhiyun * be overwritten by the scrub copy, causing data corruption.
3556*4882a593Smuzhiyun * So for dev-replace, it's not allowed to continue if a block
3557*4882a593Smuzhiyun * group is not RO.
3558*4882a593Smuzhiyun */
3559*4882a593Smuzhiyun ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
3560*4882a593Smuzhiyun if (ret == 0) {
3561*4882a593Smuzhiyun ro_set = 1;
3562*4882a593Smuzhiyun } else if (ret == -ENOSPC && !sctx->is_dev_replace) {
3563*4882a593Smuzhiyun /*
3564*4882a593Smuzhiyun * btrfs_inc_block_group_ro() returns -ENOSPC when it
3565*4882a593Smuzhiyun * fails to create a new chunk for metadata.
3566*4882a593Smuzhiyun * This is not a problem for scrub, because
3567*4882a593Smuzhiyun * metadata is always COWed, and our scrub pauses
3568*4882a593Smuzhiyun * transaction commits.
3569*4882a593Smuzhiyun */
3570*4882a593Smuzhiyun ro_set = 0;
3571*4882a593Smuzhiyun } else if (ret == -ETXTBSY) {
3572*4882a593Smuzhiyun btrfs_warn(fs_info,
3573*4882a593Smuzhiyun "skipping scrub of block group %llu due to active swapfile",
3574*4882a593Smuzhiyun cache->start);
3575*4882a593Smuzhiyun scrub_pause_off(fs_info);
3576*4882a593Smuzhiyun ret = 0;
3577*4882a593Smuzhiyun goto skip_unfreeze;
3578*4882a593Smuzhiyun } else {
3579*4882a593Smuzhiyun btrfs_warn(fs_info,
3580*4882a593Smuzhiyun "failed setting block group ro: %d", ret);
3581*4882a593Smuzhiyun btrfs_unfreeze_block_group(cache);
3582*4882a593Smuzhiyun btrfs_put_block_group(cache);
3583*4882a593Smuzhiyun scrub_pause_off(fs_info);
3584*4882a593Smuzhiyun break;
3585*4882a593Smuzhiyun }
3586*4882a593Smuzhiyun
3587*4882a593Smuzhiyun /*
3588*4882a593Smuzhiyun * Now the target block group is marked RO, wait for nocow writes to
3589*4882a593Smuzhiyun * finish before dev-replace.
3590*4882a593Smuzhiyun * COW is fine, as COW never overwrites extents in commit tree.
3591*4882a593Smuzhiyun */
3592*4882a593Smuzhiyun if (sctx->is_dev_replace) {
3593*4882a593Smuzhiyun btrfs_wait_nocow_writers(cache);
3594*4882a593Smuzhiyun btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
3595*4882a593Smuzhiyun cache->length);
3596*4882a593Smuzhiyun }
3597*4882a593Smuzhiyun
3598*4882a593Smuzhiyun scrub_pause_off(fs_info);
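/*
 * Record the device extent range we are about to scrub as the current
 * dev-replace cursor window and request that the replace item be persisted.
 */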
3599*4882a593Smuzhiyun down_write(&dev_replace->rwsem);
3600*4882a593Smuzhiyun dev_replace->cursor_right = found_key.offset + length;
3601*4882a593Smuzhiyun dev_replace->cursor_left = found_key.offset;
3602*4882a593Smuzhiyun dev_replace->item_needs_writeback = 1;
3603*4882a593Smuzhiyun up_write(&dev_replace->rwsem);
3604*4882a593Smuzhiyun
3605*4882a593Smuzhiyun ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
3606*4882a593Smuzhiyun found_key.offset, cache);
3607*4882a593Smuzhiyun
3608*4882a593Smuzhiyun /*
3609*4882a593Smuzhiyun * Flush and submit all pending read and write bios, afterwards
3610*4882a593Smuzhiyun * wait for them.
3611*4882a593Smuzhiyun * Note that in the dev replace case, a read request causes
3612*4882a593Smuzhiyun * write requests that are submitted in the read completion
3613*4882a593Smuzhiyun * worker. Therefore in the current situation, it is required
3614*4882a593Smuzhiyun * that all write requests are flushed, so that all read and
3615*4882a593Smuzhiyun * write requests are really completed when bios_in_flight
3616*4882a593Smuzhiyun * changes to 0.
3617*4882a593Smuzhiyun */
3618*4882a593Smuzhiyun sctx->flush_all_writes = true;
3619*4882a593Smuzhiyun scrub_submit(sctx);
3620*4882a593Smuzhiyun mutex_lock(&sctx->wr_lock);
3621*4882a593Smuzhiyun scrub_wr_submit(sctx);
3622*4882a593Smuzhiyun mutex_unlock(&sctx->wr_lock);
3623*4882a593Smuzhiyun
3624*4882a593Smuzhiyun wait_event(sctx->list_wait,
3625*4882a593Smuzhiyun atomic_read(&sctx->bios_in_flight) == 0);
3626*4882a593Smuzhiyun
3627*4882a593Smuzhiyun scrub_pause_on(fs_info);
3628*4882a593Smuzhiyun
3629*4882a593Smuzhiyun /*
3630*4882a593Smuzhiyun * Must be called before we decrease @scrub_paused.
3631*4882a593Smuzhiyun * Make sure we don't block transaction commit while
3632*4882a593Smuzhiyun * we are waiting for pending workers to finish.
3633*4882a593Smuzhiyun */
3634*4882a593Smuzhiyun wait_event(sctx->list_wait,
3635*4882a593Smuzhiyun atomic_read(&sctx->workers_pending) == 0);
3636*4882a593Smuzhiyun sctx->flush_all_writes = false;
3637*4882a593Smuzhiyun
3638*4882a593Smuzhiyun scrub_pause_off(fs_info);
3639*4882a593Smuzhiyun
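/*
 * This device extent is done: advance the dev-replace cursor past it and
 * request that the replace item be persisted again.
 */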
3640*4882a593Smuzhiyun down_write(&dev_replace->rwsem);
3641*4882a593Smuzhiyun dev_replace->cursor_left = dev_replace->cursor_right;
3642*4882a593Smuzhiyun dev_replace->item_needs_writeback = 1;
3643*4882a593Smuzhiyun up_write(&dev_replace->rwsem);
3644*4882a593Smuzhiyun
3645*4882a593Smuzhiyun if (ro_set)
3646*4882a593Smuzhiyun btrfs_dec_block_group_ro(cache);
3647*4882a593Smuzhiyun
3648*4882a593Smuzhiyun /*
3649*4882a593Smuzhiyun * We might have prevented the cleaner kthread from deleting
3650*4882a593Smuzhiyun * this block group if it was already unused because we raced
3651*4882a593Smuzhiyun * and set it to RO mode first. So add it back to the unused
3652*4882a593Smuzhiyun * list, otherwise it might not ever be deleted unless a manual
3653*4882a593Smuzhiyun * balance is triggered or it becomes used and unused again.
3654*4882a593Smuzhiyun */
3655*4882a593Smuzhiyun spin_lock(&cache->lock);
3656*4882a593Smuzhiyun if (!cache->removed && !cache->ro && cache->reserved == 0 &&
3657*4882a593Smuzhiyun cache->used == 0) {
3658*4882a593Smuzhiyun spin_unlock(&cache->lock);
3659*4882a593Smuzhiyun if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
3660*4882a593Smuzhiyun btrfs_discard_queue_work(&fs_info->discard_ctl,
3661*4882a593Smuzhiyun cache);
3662*4882a593Smuzhiyun else
3663*4882a593Smuzhiyun btrfs_mark_bg_unused(cache);
3664*4882a593Smuzhiyun } else {
3665*4882a593Smuzhiyun spin_unlock(&cache->lock);
3666*4882a593Smuzhiyun }
3667*4882a593Smuzhiyun skip_unfreeze:
3668*4882a593Smuzhiyun btrfs_unfreeze_block_group(cache);
3669*4882a593Smuzhiyun btrfs_put_block_group(cache);
3670*4882a593Smuzhiyun if (ret)
3671*4882a593Smuzhiyun break;
3672*4882a593Smuzhiyun if (sctx->is_dev_replace &&
3673*4882a593Smuzhiyun atomic64_read(&dev_replace->num_write_errors) > 0) {
3674*4882a593Smuzhiyun ret = -EIO;
3675*4882a593Smuzhiyun break;
3676*4882a593Smuzhiyun }
3677*4882a593Smuzhiyun if (sctx->stat.malloc_errors > 0) {
3678*4882a593Smuzhiyun ret = -ENOMEM;
3679*4882a593Smuzhiyun break;
3680*4882a593Smuzhiyun }
3681*4882a593Smuzhiyun skip:
3682*4882a593Smuzhiyun key.offset = found_key.offset + length;
3683*4882a593Smuzhiyun btrfs_release_path(path);
3684*4882a593Smuzhiyun }
3685*4882a593Smuzhiyun
3686*4882a593Smuzhiyun btrfs_free_path(path);
3687*4882a593Smuzhiyun
3688*4882a593Smuzhiyun return ret;
3689*4882a593Smuzhiyun }
3690*4882a593Smuzhiyun
3691*4882a593Smuzhiyun static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
3692*4882a593Smuzhiyun struct btrfs_device *scrub_dev)
3693*4882a593Smuzhiyun {
3694*4882a593Smuzhiyun int i;
3695*4882a593Smuzhiyun u64 bytenr;
3696*4882a593Smuzhiyun u64 gen;
3697*4882a593Smuzhiyun int ret;
3698*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = sctx->fs_info;
3699*4882a593Smuzhiyun
3700*4882a593Smuzhiyun if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
3701*4882a593Smuzhiyun return -EROFS;
3702*4882a593Smuzhiyun
3703*4882a593Smuzhiyun /* Seed devices of a new filesystem have their own generation. */
3704*4882a593Smuzhiyun if (scrub_dev->fs_devices != fs_info->fs_devices)
3705*4882a593Smuzhiyun gen = scrub_dev->generation;
3706*4882a593Smuzhiyun else
3707*4882a593Smuzhiyun gen = fs_info->last_trans_committed;
3708*4882a593Smuzhiyun
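/*
 * Read and verify every super block mirror that fits within the
 * committed size of the device.
 */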
3709*4882a593Smuzhiyun for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
3710*4882a593Smuzhiyun bytenr = btrfs_sb_offset(i);
3711*4882a593Smuzhiyun if (bytenr + BTRFS_SUPER_INFO_SIZE >
3712*4882a593Smuzhiyun scrub_dev->commit_total_bytes)
3713*4882a593Smuzhiyun break;
3714*4882a593Smuzhiyun
3715*4882a593Smuzhiyun ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
3716*4882a593Smuzhiyun scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
3717*4882a593Smuzhiyun NULL, 1, bytenr);
3718*4882a593Smuzhiyun if (ret)
3719*4882a593Smuzhiyun return ret;
3720*4882a593Smuzhiyun }
3721*4882a593Smuzhiyun wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3722*4882a593Smuzhiyun
3723*4882a593Smuzhiyun return 0;
3724*4882a593Smuzhiyun }
3725*4882a593Smuzhiyun
3726*4882a593Smuzhiyun static void scrub_workers_put(struct btrfs_fs_info *fs_info)
3727*4882a593Smuzhiyun {
3728*4882a593Smuzhiyun if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
3729*4882a593Smuzhiyun &fs_info->scrub_lock)) {
3730*4882a593Smuzhiyun struct btrfs_workqueue *scrub_workers = NULL;
3731*4882a593Smuzhiyun struct btrfs_workqueue *scrub_wr_comp = NULL;
3732*4882a593Smuzhiyun struct btrfs_workqueue *scrub_parity = NULL;
3733*4882a593Smuzhiyun
3734*4882a593Smuzhiyun scrub_workers = fs_info->scrub_workers;
3735*4882a593Smuzhiyun scrub_wr_comp = fs_info->scrub_wr_completion_workers;
3736*4882a593Smuzhiyun scrub_parity = fs_info->scrub_parity_workers;
3737*4882a593Smuzhiyun
3738*4882a593Smuzhiyun fs_info->scrub_workers = NULL;
3739*4882a593Smuzhiyun fs_info->scrub_wr_completion_workers = NULL;
3740*4882a593Smuzhiyun fs_info->scrub_parity_workers = NULL;
3741*4882a593Smuzhiyun mutex_unlock(&fs_info->scrub_lock);
3742*4882a593Smuzhiyun
3743*4882a593Smuzhiyun btrfs_destroy_workqueue(scrub_workers);
3744*4882a593Smuzhiyun btrfs_destroy_workqueue(scrub_wr_comp);
3745*4882a593Smuzhiyun btrfs_destroy_workqueue(scrub_parity);
3746*4882a593Smuzhiyun }
3747*4882a593Smuzhiyun }
3748*4882a593Smuzhiyun
3749*4882a593Smuzhiyun /*
3750*4882a593Smuzhiyun * Get a reference count on fs_info->scrub_workers; start the workers if necessary.
3751*4882a593Smuzhiyun */
3752*4882a593Smuzhiyun static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
3753*4882a593Smuzhiyun int is_dev_replace)
3754*4882a593Smuzhiyun {
3755*4882a593Smuzhiyun struct btrfs_workqueue *scrub_workers = NULL;
3756*4882a593Smuzhiyun struct btrfs_workqueue *scrub_wr_comp = NULL;
3757*4882a593Smuzhiyun struct btrfs_workqueue *scrub_parity = NULL;
3758*4882a593Smuzhiyun unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
3759*4882a593Smuzhiyun int max_active = fs_info->thread_pool_size;
3760*4882a593Smuzhiyun int ret = -ENOMEM;
3761*4882a593Smuzhiyun
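/* Lockless fast path: if the workers already exist, just take another reference. */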
3762*4882a593Smuzhiyun if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
3763*4882a593Smuzhiyun return 0;
3764*4882a593Smuzhiyun
3765*4882a593Smuzhiyun scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", flags,
3766*4882a593Smuzhiyun is_dev_replace ? 1 : max_active, 4);
3767*4882a593Smuzhiyun if (!scrub_workers)
3768*4882a593Smuzhiyun goto fail_scrub_workers;
3769*4882a593Smuzhiyun
3770*4882a593Smuzhiyun scrub_wr_comp = btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
3771*4882a593Smuzhiyun max_active, 2);
3772*4882a593Smuzhiyun if (!scrub_wr_comp)
3773*4882a593Smuzhiyun goto fail_scrub_wr_completion_workers;
3774*4882a593Smuzhiyun
3775*4882a593Smuzhiyun scrub_parity = btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
3776*4882a593Smuzhiyun max_active, 2);
3777*4882a593Smuzhiyun if (!scrub_parity)
3778*4882a593Smuzhiyun goto fail_scrub_parity_workers;
3779*4882a593Smuzhiyun
3780*4882a593Smuzhiyun mutex_lock(&fs_info->scrub_lock);
3781*4882a593Smuzhiyun if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
3782*4882a593Smuzhiyun ASSERT(fs_info->scrub_workers == NULL &&
3783*4882a593Smuzhiyun fs_info->scrub_wr_completion_workers == NULL &&
3784*4882a593Smuzhiyun fs_info->scrub_parity_workers == NULL);
3785*4882a593Smuzhiyun fs_info->scrub_workers = scrub_workers;
3786*4882a593Smuzhiyun fs_info->scrub_wr_completion_workers = scrub_wr_comp;
3787*4882a593Smuzhiyun fs_info->scrub_parity_workers = scrub_parity;
3788*4882a593Smuzhiyun refcount_set(&fs_info->scrub_workers_refcnt, 1);
3789*4882a593Smuzhiyun mutex_unlock(&fs_info->scrub_lock);
3790*4882a593Smuzhiyun return 0;
3791*4882a593Smuzhiyun }
3792*4882a593Smuzhiyun /* Other thread raced in and created the workers for us */
3793*4882a593Smuzhiyun refcount_inc(&fs_info->scrub_workers_refcnt);
3794*4882a593Smuzhiyun mutex_unlock(&fs_info->scrub_lock);
3795*4882a593Smuzhiyun
3796*4882a593Smuzhiyun ret = 0;
3797*4882a593Smuzhiyun btrfs_destroy_workqueue(scrub_parity);
3798*4882a593Smuzhiyun fail_scrub_parity_workers:
3799*4882a593Smuzhiyun btrfs_destroy_workqueue(scrub_wr_comp);
3800*4882a593Smuzhiyun fail_scrub_wr_completion_workers:
3801*4882a593Smuzhiyun btrfs_destroy_workqueue(scrub_workers);
3802*4882a593Smuzhiyun fail_scrub_workers:
3803*4882a593Smuzhiyun return ret;
3804*4882a593Smuzhiyun }
3805*4882a593Smuzhiyun
3806*4882a593Smuzhiyun int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
3807*4882a593Smuzhiyun u64 end, struct btrfs_scrub_progress *progress,
3808*4882a593Smuzhiyun int readonly, int is_dev_replace)
3809*4882a593Smuzhiyun {
3810*4882a593Smuzhiyun struct scrub_ctx *sctx;
3811*4882a593Smuzhiyun int ret;
3812*4882a593Smuzhiyun struct btrfs_device *dev;
3813*4882a593Smuzhiyun unsigned int nofs_flag;
3814*4882a593Smuzhiyun bool need_commit = false;
3815*4882a593Smuzhiyun
3816*4882a593Smuzhiyun if (btrfs_fs_closing(fs_info))
3817*4882a593Smuzhiyun return -EAGAIN;
3818*4882a593Smuzhiyun
3819*4882a593Smuzhiyun if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
3820*4882a593Smuzhiyun /*
3821*4882a593Smuzhiyun * In this case scrub is unable to calculate the checksum
3822*4882a593Smuzhiyun * the way it is implemented. Do not handle this
3823*4882a593Smuzhiyun * situation at all because it should never happen.
3824*4882a593Smuzhiyun */
3825*4882a593Smuzhiyun btrfs_err(fs_info,
3826*4882a593Smuzhiyun "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
3827*4882a593Smuzhiyun fs_info->nodesize,
3828*4882a593Smuzhiyun BTRFS_STRIPE_LEN);
3829*4882a593Smuzhiyun return -EINVAL;
3830*4882a593Smuzhiyun }
3831*4882a593Smuzhiyun
3832*4882a593Smuzhiyun if (fs_info->sectorsize != PAGE_SIZE) {
3833*4882a593Smuzhiyun /* not supported for data w/o checksums */
3834*4882a593Smuzhiyun btrfs_err_rl(fs_info,
3835*4882a593Smuzhiyun "scrub: size assumption sectorsize != PAGE_SIZE (%d != %lu) fails",
3836*4882a593Smuzhiyun fs_info->sectorsize, PAGE_SIZE);
3837*4882a593Smuzhiyun return -EINVAL;
3838*4882a593Smuzhiyun }
3839*4882a593Smuzhiyun
3840*4882a593Smuzhiyun if (fs_info->nodesize >
3841*4882a593Smuzhiyun PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
3842*4882a593Smuzhiyun fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
3843*4882a593Smuzhiyun /*
3844*4882a593Smuzhiyun * would exhaust the array bounds of pagev member in
3845*4882a593Smuzhiyun * struct scrub_block
3846*4882a593Smuzhiyun */
3847*4882a593Smuzhiyun btrfs_err(fs_info,
3848*4882a593Smuzhiyun "scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
3849*4882a593Smuzhiyun fs_info->nodesize,
3850*4882a593Smuzhiyun SCRUB_MAX_PAGES_PER_BLOCK,
3851*4882a593Smuzhiyun fs_info->sectorsize,
3852*4882a593Smuzhiyun SCRUB_MAX_PAGES_PER_BLOCK);
3853*4882a593Smuzhiyun return -EINVAL;
3854*4882a593Smuzhiyun }
3855*4882a593Smuzhiyun
3856*4882a593Smuzhiyun /* Allocate outside of device_list_mutex */
3857*4882a593Smuzhiyun sctx = scrub_setup_ctx(fs_info, is_dev_replace);
3858*4882a593Smuzhiyun if (IS_ERR(sctx))
3859*4882a593Smuzhiyun return PTR_ERR(sctx);
3860*4882a593Smuzhiyun
3861*4882a593Smuzhiyun ret = scrub_workers_get(fs_info, is_dev_replace);
3862*4882a593Smuzhiyun if (ret)
3863*4882a593Smuzhiyun goto out_free_ctx;
3864*4882a593Smuzhiyun
3865*4882a593Smuzhiyun mutex_lock(&fs_info->fs_devices->device_list_mutex);
3866*4882a593Smuzhiyun dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
3867*4882a593Smuzhiyun if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
3868*4882a593Smuzhiyun !is_dev_replace)) {
3869*4882a593Smuzhiyun mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3870*4882a593Smuzhiyun ret = -ENODEV;
3871*4882a593Smuzhiyun goto out;
3872*4882a593Smuzhiyun }
3873*4882a593Smuzhiyun
3874*4882a593Smuzhiyun if (!is_dev_replace && !readonly &&
3875*4882a593Smuzhiyun !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
3876*4882a593Smuzhiyun mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3877*4882a593Smuzhiyun btrfs_err_in_rcu(fs_info,
3878*4882a593Smuzhiyun "scrub on devid %llu: filesystem on %s is not writable",
3879*4882a593Smuzhiyun devid, rcu_str_deref(dev->name));
3880*4882a593Smuzhiyun ret = -EROFS;
3881*4882a593Smuzhiyun goto out;
3882*4882a593Smuzhiyun }
3883*4882a593Smuzhiyun
3884*4882a593Smuzhiyun mutex_lock(&fs_info->scrub_lock);
3885*4882a593Smuzhiyun if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
3886*4882a593Smuzhiyun test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
3887*4882a593Smuzhiyun mutex_unlock(&fs_info->scrub_lock);
3888*4882a593Smuzhiyun mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3889*4882a593Smuzhiyun ret = -EIO;
3890*4882a593Smuzhiyun goto out;
3891*4882a593Smuzhiyun }
3892*4882a593Smuzhiyun
3893*4882a593Smuzhiyun down_read(&fs_info->dev_replace.rwsem);
3894*4882a593Smuzhiyun if (dev->scrub_ctx ||
3895*4882a593Smuzhiyun (!is_dev_replace &&
3896*4882a593Smuzhiyun btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
3897*4882a593Smuzhiyun up_read(&fs_info->dev_replace.rwsem);
3898*4882a593Smuzhiyun mutex_unlock(&fs_info->scrub_lock);
3899*4882a593Smuzhiyun mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3900*4882a593Smuzhiyun ret = -EINPROGRESS;
3901*4882a593Smuzhiyun goto out;
3902*4882a593Smuzhiyun }
3903*4882a593Smuzhiyun up_read(&fs_info->dev_replace.rwsem);
3904*4882a593Smuzhiyun
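/*
 * Publish the scrub context on the device so that progress queries and
 * per-device cancellation can find it.
 */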
3905*4882a593Smuzhiyun sctx->readonly = readonly;
3906*4882a593Smuzhiyun dev->scrub_ctx = sctx;
3907*4882a593Smuzhiyun mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3908*4882a593Smuzhiyun
3909*4882a593Smuzhiyun /*
3910*4882a593Smuzhiyun * By checking @scrub_pause_req here, we can avoid a
3911*4882a593Smuzhiyun * race between committing a transaction and scrubbing.
3912*4882a593Smuzhiyun */
3913*4882a593Smuzhiyun __scrub_blocked_if_needed(fs_info);
3914*4882a593Smuzhiyun atomic_inc(&fs_info->scrubs_running);
3915*4882a593Smuzhiyun mutex_unlock(&fs_info->scrub_lock);
3916*4882a593Smuzhiyun
3917*4882a593Smuzhiyun /*
3918*4882a593Smuzhiyun * In order to avoid deadlock with reclaim when there is a transaction
3919*4882a593Smuzhiyun * trying to pause scrub, make sure we use GFP_NOFS for all the
3920*4882a593Smuzhiyun * allocations done at scrub_pages() and scrub_pages_for_parity()
3921*4882a593Smuzhiyun * invoked by our callees. The pausing request is done when the
3922*4882a593Smuzhiyun * transaction commit starts, and it blocks the transaction until scrub
3923*4882a593Smuzhiyun * is paused (done at specific points at scrub_stripe() or right above
3924*4882a593Smuzhiyun * before incrementing fs_info->scrubs_running).
3925*4882a593Smuzhiyun */
3926*4882a593Smuzhiyun nofs_flag = memalloc_nofs_save();
3927*4882a593Smuzhiyun if (!is_dev_replace) {
3928*4882a593Smuzhiyun u64 old_super_errors;
3929*4882a593Smuzhiyun
3930*4882a593Smuzhiyun spin_lock(&sctx->stat_lock);
3931*4882a593Smuzhiyun old_super_errors = sctx->stat.super_errors;
3932*4882a593Smuzhiyun spin_unlock(&sctx->stat_lock);
3933*4882a593Smuzhiyun
3934*4882a593Smuzhiyun btrfs_info(fs_info, "scrub: started on devid %llu", devid);
3935*4882a593Smuzhiyun /*
3936*4882a593Smuzhiyun * by holding device list mutex, we can
3937*4882a593Smuzhiyun * kick off writing super in log tree sync.
3938*4882a593Smuzhiyun */
3939*4882a593Smuzhiyun mutex_lock(&fs_info->fs_devices->device_list_mutex);
3940*4882a593Smuzhiyun ret = scrub_supers(sctx, dev);
3941*4882a593Smuzhiyun mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3942*4882a593Smuzhiyun
3943*4882a593Smuzhiyun spin_lock(&sctx->stat_lock);
3944*4882a593Smuzhiyun /*
3945*4882a593Smuzhiyun * Super block errors found, but we cannot commit a transaction
3946*4882a593Smuzhiyun * in the current context, since btrfs_commit_transaction() needs
3947*4882a593Smuzhiyun * to pause the currently running scrub (held by ourselves).
3948*4882a593Smuzhiyun */
3949*4882a593Smuzhiyun if (sctx->stat.super_errors > old_super_errors && !sctx->readonly)
3950*4882a593Smuzhiyun need_commit = true;
3951*4882a593Smuzhiyun spin_unlock(&sctx->stat_lock);
3952*4882a593Smuzhiyun }
3953*4882a593Smuzhiyun
3954*4882a593Smuzhiyun if (!ret)
3955*4882a593Smuzhiyun ret = scrub_enumerate_chunks(sctx, dev, start, end);
3956*4882a593Smuzhiyun memalloc_nofs_restore(nofs_flag);
3957*4882a593Smuzhiyun
3958*4882a593Smuzhiyun wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3959*4882a593Smuzhiyun atomic_dec(&fs_info->scrubs_running);
3960*4882a593Smuzhiyun wake_up(&fs_info->scrub_pause_wait);
3961*4882a593Smuzhiyun
3962*4882a593Smuzhiyun wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
3963*4882a593Smuzhiyun
3964*4882a593Smuzhiyun if (progress)
3965*4882a593Smuzhiyun memcpy(progress, &sctx->stat, sizeof(*progress));
3966*4882a593Smuzhiyun
3967*4882a593Smuzhiyun if (!is_dev_replace)
3968*4882a593Smuzhiyun btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
3969*4882a593Smuzhiyun ret ? "not finished" : "finished", devid, ret);
3970*4882a593Smuzhiyun
3971*4882a593Smuzhiyun mutex_lock(&fs_info->scrub_lock);
3972*4882a593Smuzhiyun dev->scrub_ctx = NULL;
3973*4882a593Smuzhiyun mutex_unlock(&fs_info->scrub_lock);
3974*4882a593Smuzhiyun
3975*4882a593Smuzhiyun scrub_workers_put(fs_info);
3976*4882a593Smuzhiyun scrub_put_ctx(sctx);
3977*4882a593Smuzhiyun
3978*4882a593Smuzhiyun /*
3979*4882a593Smuzhiyun * We found some super block errors before, now that scrub has
3980*4882a593Smuzhiyun * finished, try to force a transaction commit to repair them.
3981*4882a593Smuzhiyun */
3982*4882a593Smuzhiyun if (need_commit) {
3983*4882a593Smuzhiyun struct btrfs_trans_handle *trans;
3984*4882a593Smuzhiyun
3985*4882a593Smuzhiyun trans = btrfs_start_transaction(fs_info->tree_root, 0);
3986*4882a593Smuzhiyun if (IS_ERR(trans)) {
3987*4882a593Smuzhiyun ret = PTR_ERR(trans);
3988*4882a593Smuzhiyun btrfs_err(fs_info,
3989*4882a593Smuzhiyun "scrub: failed to start transaction to fix super block errors: %d", ret);
3990*4882a593Smuzhiyun return ret;
3991*4882a593Smuzhiyun }
3992*4882a593Smuzhiyun ret = btrfs_commit_transaction(trans);
3993*4882a593Smuzhiyun if (ret < 0)
3994*4882a593Smuzhiyun btrfs_err(fs_info,
3995*4882a593Smuzhiyun "scrub: failed to commit transaction to fix super block errors: %d", ret);
3996*4882a593Smuzhiyun }
3997*4882a593Smuzhiyun return ret;
3998*4882a593Smuzhiyun out:
3999*4882a593Smuzhiyun scrub_workers_put(fs_info);
4000*4882a593Smuzhiyun out_free_ctx:
4001*4882a593Smuzhiyun scrub_free_ctx(sctx);
4002*4882a593Smuzhiyun
4003*4882a593Smuzhiyun return ret;
4004*4882a593Smuzhiyun }
4005*4882a593Smuzhiyun
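/*
 * Ask all running scrubs to pause and wait until every one of them has
 * reached the paused state; they are resumed again via btrfs_scrub_continue().
 */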
4006*4882a593Smuzhiyun void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
4007*4882a593Smuzhiyun {
4008*4882a593Smuzhiyun mutex_lock(&fs_info->scrub_lock);
4009*4882a593Smuzhiyun atomic_inc(&fs_info->scrub_pause_req);
4010*4882a593Smuzhiyun while (atomic_read(&fs_info->scrubs_paused) !=
4011*4882a593Smuzhiyun atomic_read(&fs_info->scrubs_running)) {
4012*4882a593Smuzhiyun mutex_unlock(&fs_info->scrub_lock);
4013*4882a593Smuzhiyun wait_event(fs_info->scrub_pause_wait,
4014*4882a593Smuzhiyun atomic_read(&fs_info->scrubs_paused) ==
4015*4882a593Smuzhiyun atomic_read(&fs_info->scrubs_running));
4016*4882a593Smuzhiyun mutex_lock(&fs_info->scrub_lock);
4017*4882a593Smuzhiyun }
4018*4882a593Smuzhiyun mutex_unlock(&fs_info->scrub_lock);
4019*4882a593Smuzhiyun }
4020*4882a593Smuzhiyun
4021*4882a593Smuzhiyun void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
4022*4882a593Smuzhiyun {
4023*4882a593Smuzhiyun atomic_dec(&fs_info->scrub_pause_req);
4024*4882a593Smuzhiyun wake_up(&fs_info->scrub_pause_wait);
4025*4882a593Smuzhiyun }
4026*4882a593Smuzhiyun
4027*4882a593Smuzhiyun int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
4028*4882a593Smuzhiyun {
4029*4882a593Smuzhiyun mutex_lock(&fs_info->scrub_lock);
4030*4882a593Smuzhiyun if (!atomic_read(&fs_info->scrubs_running)) {
4031*4882a593Smuzhiyun mutex_unlock(&fs_info->scrub_lock);
4032*4882a593Smuzhiyun return -ENOTCONN;
4033*4882a593Smuzhiyun }
4034*4882a593Smuzhiyun
4035*4882a593Smuzhiyun atomic_inc(&fs_info->scrub_cancel_req);
4036*4882a593Smuzhiyun while (atomic_read(&fs_info->scrubs_running)) {
4037*4882a593Smuzhiyun mutex_unlock(&fs_info->scrub_lock);
4038*4882a593Smuzhiyun wait_event(fs_info->scrub_pause_wait,
4039*4882a593Smuzhiyun atomic_read(&fs_info->scrubs_running) == 0);
4040*4882a593Smuzhiyun mutex_lock(&fs_info->scrub_lock);
4041*4882a593Smuzhiyun }
4042*4882a593Smuzhiyun atomic_dec(&fs_info->scrub_cancel_req);
4043*4882a593Smuzhiyun mutex_unlock(&fs_info->scrub_lock);
4044*4882a593Smuzhiyun
4045*4882a593Smuzhiyun return 0;
4046*4882a593Smuzhiyun }
4047*4882a593Smuzhiyun
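/*
 * Cancel the scrub running on a single device and wait until its scrub
 * context has gone away.
 */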
4048*4882a593Smuzhiyun int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
4049*4882a593Smuzhiyun {
4050*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = dev->fs_info;
4051*4882a593Smuzhiyun struct scrub_ctx *sctx;
4052*4882a593Smuzhiyun
4053*4882a593Smuzhiyun mutex_lock(&fs_info->scrub_lock);
4054*4882a593Smuzhiyun sctx = dev->scrub_ctx;
4055*4882a593Smuzhiyun if (!sctx) {
4056*4882a593Smuzhiyun mutex_unlock(&fs_info->scrub_lock);
4057*4882a593Smuzhiyun return -ENOTCONN;
4058*4882a593Smuzhiyun }
4059*4882a593Smuzhiyun atomic_inc(&sctx->cancel_req);
4060*4882a593Smuzhiyun while (dev->scrub_ctx) {
4061*4882a593Smuzhiyun mutex_unlock(&fs_info->scrub_lock);
4062*4882a593Smuzhiyun wait_event(fs_info->scrub_pause_wait,
4063*4882a593Smuzhiyun dev->scrub_ctx == NULL);
4064*4882a593Smuzhiyun mutex_lock(&fs_info->scrub_lock);
4065*4882a593Smuzhiyun }
4066*4882a593Smuzhiyun mutex_unlock(&fs_info->scrub_lock);
4067*4882a593Smuzhiyun
4068*4882a593Smuzhiyun return 0;
4069*4882a593Smuzhiyun }
4070*4882a593Smuzhiyun
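/* Report the current scrub statistics of the device with the given devid. */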
4071*4882a593Smuzhiyun int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
4072*4882a593Smuzhiyun struct btrfs_scrub_progress *progress)
4073*4882a593Smuzhiyun {
4074*4882a593Smuzhiyun struct btrfs_device *dev;
4075*4882a593Smuzhiyun struct scrub_ctx *sctx = NULL;
4076*4882a593Smuzhiyun
4077*4882a593Smuzhiyun mutex_lock(&fs_info->fs_devices->device_list_mutex);
4078*4882a593Smuzhiyun dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
4079*4882a593Smuzhiyun if (dev)
4080*4882a593Smuzhiyun sctx = dev->scrub_ctx;
4081*4882a593Smuzhiyun if (sctx)
4082*4882a593Smuzhiyun memcpy(progress, &sctx->stat, sizeof(*progress));
4083*4882a593Smuzhiyun mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4084*4882a593Smuzhiyun
4085*4882a593Smuzhiyun return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
4086*4882a593Smuzhiyun }
4087*4882a593Smuzhiyun
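/*
 * Map a logical extent to the physical location (device, physical offset
 * and mirror number) of its first stripe.
 */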
4088*4882a593Smuzhiyun static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
4089*4882a593Smuzhiyun u64 extent_logical, u64 extent_len,
4090*4882a593Smuzhiyun u64 *extent_physical,
4091*4882a593Smuzhiyun struct btrfs_device **extent_dev,
4092*4882a593Smuzhiyun int *extent_mirror_num)
4093*4882a593Smuzhiyun {
4094*4882a593Smuzhiyun u64 mapped_length;
4095*4882a593Smuzhiyun struct btrfs_bio *bbio = NULL;
4096*4882a593Smuzhiyun int ret;
4097*4882a593Smuzhiyun
4098*4882a593Smuzhiyun mapped_length = extent_len;
4099*4882a593Smuzhiyun ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
4100*4882a593Smuzhiyun &mapped_length, &bbio, 0);
4101*4882a593Smuzhiyun if (ret || !bbio || mapped_length < extent_len ||
4102*4882a593Smuzhiyun !bbio->stripes[0].dev->bdev) {
4103*4882a593Smuzhiyun btrfs_put_bbio(bbio);
4104*4882a593Smuzhiyun return;
4105*4882a593Smuzhiyun }
4106*4882a593Smuzhiyun
4107*4882a593Smuzhiyun *extent_physical = bbio->stripes[0].physical;
4108*4882a593Smuzhiyun *extent_mirror_num = bbio->mirror_num;
4109*4882a593Smuzhiyun *extent_dev = bbio->stripes[0].dev;
4110*4882a593Smuzhiyun btrfs_put_bbio(bbio);
4111*4882a593Smuzhiyun }