// SPDX-License-Identifier: GPL-2.0+
/*
 * Copyright (C) 2019 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_sb.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_health.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"

/*
 * FS Summary Counters
 * ===================
 *
 * The basics of filesystem summary counter checking are that we iterate the
 * AGs counting the number of free blocks, free space btree blocks, per-AG
 * reservations, inodes, delayed allocation reservations, and free inodes.
 * Then we compare what we computed against the in-core counters.
 *
 * However, the reality is that summary counters are a tricky beast to check.
 * While we /could/ freeze the filesystem and scramble around the AGs counting
 * the free blocks, in practice we prefer not to do that for a scan because
 * freezing is costly.  To get around this, we added a per-cpu counter of the
 * delalloc reservations so that we can rotor around the AGs relatively
 * quickly, and we allow the counts to be slightly off because we're not
 * taking any locks while we do this.
 *
 * So the first thing we do is warm up the buffer cache in the setup routine
 * by walking all the AGs to make sure the incore per-AG structure has been
 * initialized.  The expected value calculation then iterates the incore
 * per-AG structures as quickly as it can.  We snapshot the percpu counters
 * before and after this operation and use the difference in counter values to
 * guess at our tolerance for mismatch between expected and actual counter
 * values.
 */

/*
 * Since the expected value computation is lockless but only browses incore
 * values, the percpu counters should be fairly close to each other.  However,
 * we'll allow ourselves to be off by at least this (arbitrary) amount.
 */
#define XCHK_FSCOUNT_MIN_VARIANCE	(512)

/*
 * Make sure the per-AG structure has been initialized from the on-disk header
 * contents and trust that the incore counters match the ondisk counters.
 * (The AGF and AGI scrubbers check them, and a normal xfs_scrub run checks
 * the summary counters after checking all AG headers.)  Do this from the
 * setup function so that the inner AG aggregation loop runs as quickly as
 * possible.
 *
 * This function runs during the setup phase /before/ we start checking any
 * metadata.
 */
STATIC int
xchk_fscount_warmup(
	struct xfs_scrub	*sc)
{
	struct xfs_mount	*mp = sc->mp;
	struct xfs_buf		*agi_bp = NULL;
	struct xfs_buf		*agf_bp = NULL;
	struct xfs_perag	*pag = NULL;
	xfs_agnumber_t		agno;
	int			error = 0;

	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
		pag = xfs_perag_get(mp, agno);

		if (pag->pagi_init && pag->pagf_init)
			goto next_loop_perag;

		/* Lock both AG headers. */
		error = xfs_ialloc_read_agi(mp, sc->tp, agno, &agi_bp);
		if (error)
			break;
		error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &agf_bp);
		if (error)
			break;

		/*
		 * These are supposed to be initialized by the header read
		 * function.
		 */
		error = -EFSCORRUPTED;
		if (!pag->pagi_init || !pag->pagf_init)
			break;

		xfs_buf_relse(agf_bp);
		agf_bp = NULL;
		xfs_buf_relse(agi_bp);
		agi_bp = NULL;
next_loop_perag:
		xfs_perag_put(pag);
		pag = NULL;
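		/* Clear the error staged for the initialization check above. */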
		error = 0;

		if (xchk_should_terminate(sc, &error))
			break;
	}

	if (agf_bp)
		xfs_buf_relse(agf_bp);
	if (agi_bp)
		xfs_buf_relse(agi_bp);
	if (pag)
		xfs_perag_put(pag);
	return error;
}

int
xchk_setup_fscounters(
	struct xfs_scrub	*sc,
	struct xfs_inode	*ip)
{
	struct xchk_fscounters	*fsc;
	int			error;

	sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), 0);
	if (!sc->buf)
		return -ENOMEM;
	fsc = sc->buf;

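	/* Precompute the possible inode count range for the sanity checks. */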
	xfs_icount_range(sc->mp, &fsc->icount_min, &fsc->icount_max);

	/* We must get the incore counters set up before we can proceed. */
	error = xchk_fscount_warmup(sc);
	if (error)
		return error;

	/*
	 * Pause background reclaim while we're scrubbing to reduce the
	 * likelihood of background perturbations to the counters throwing off
	 * our calculations.
	 */
	xchk_stop_reaping(sc);

	return xchk_trans_alloc(sc, 0);
}

/*
 * Calculate what the global in-core counters ought to be from the incore
 * per-AG structure.  Callers can compare this to the actual in-core counters
 * to estimate by how much both in-core and on-disk counters need to be
 * adjusted.
 */
STATIC int
xchk_fscount_aggregate_agcounts(
	struct xfs_scrub	*sc,
	struct xchk_fscounters	*fsc)
{
	struct xfs_mount	*mp = sc->mp;
	struct xfs_perag	*pag;
	uint64_t		delayed;
	xfs_agnumber_t		agno;
	int			tries = 8;
	int			error = 0;

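	/* The aggregation is lockless; we may retry if the counters move. */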
retry:
	fsc->icount = 0;
	fsc->ifree = 0;
	fsc->fdblocks = 0;

	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
		pag = xfs_perag_get(mp, agno);

		/* This somehow got unset since the warmup? */
		if (!pag->pagi_init || !pag->pagf_init) {
			xfs_perag_put(pag);
			return -EFSCORRUPTED;
		}

		/* Count all the inodes */
		fsc->icount += pag->pagi_count;
		fsc->ifree += pag->pagi_freecount;

		/* Add up the free/freelist/bnobt/cntbt blocks */
		fsc->fdblocks += pag->pagf_freeblks;
		fsc->fdblocks += pag->pagf_flcount;
		fsc->fdblocks += pag->pagf_btreeblks;

		/*
		 * Per-AG reservations are taken out of the incore counters,
		 * so they must be left out of the free blocks computation.
		 */
		fsc->fdblocks -= pag->pag_meta_resv.ar_reserved;
		fsc->fdblocks -= pag->pag_rmapbt_resv.ar_orig_reserved;

		xfs_perag_put(pag);

		if (xchk_should_terminate(sc, &error))
			break;
	}

	if (error)
		return error;

	/*
	 * The global incore space reservation is taken from the incore
	 * counters, so leave that out of the computation.
	 */
	fsc->fdblocks -= mp->m_resblks_avail;

	/*
	 * Delayed allocation reservations are taken out of the incore counters
	 * but not recorded on disk, so leave them and their indlen blocks out
	 * of the computation.
	 */
	delayed = percpu_counter_sum(&mp->m_delalloc_blks);
	fsc->fdblocks -= delayed;

	trace_xchk_fscounters_calc(mp, fsc->icount, fsc->ifree, fsc->fdblocks,
			delayed);

	/* Bail out if the values we compute are totally nonsense. */
	if (fsc->icount < fsc->icount_min || fsc->icount > fsc->icount_max ||
	    fsc->fdblocks > mp->m_sb.sb_dblocks ||
	    fsc->ifree > fsc->icount_max)
		return -EFSCORRUPTED;

	/*
	 * If ifree > icount then we probably had some perturbation in the
	 * counters while we were calculating things.  We'll try a few times
	 * to maintain ifree <= icount before giving up.
	 */
	if (fsc->ifree > fsc->icount) {
		if (tries--)
			goto retry;
		xchk_set_incomplete(sc);
		return 0;
	}

	return 0;
}

/*
 * Is the @counter reasonably close to the @expected value?
 *
 * We neither locked nor froze anything in the filesystem while aggregating the
 * per-AG data to compute the @expected value, which means that the counter
 * could have changed.  We know the @old_value of the summation of the counter
 * before the aggregation, and we re-sum the counter now.  If the expected
 * value falls between the two summations, we're ok.
 *
 * Otherwise, we /might/ have a problem.  If the change in the summations is
 * more than we want to tolerate, the filesystem is probably busy and we should
 * just send back INCOMPLETE and see if userspace will try again.
 */
static inline bool
xchk_fscount_within_range(
	struct xfs_scrub	*sc,
	const int64_t		old_value,
	struct percpu_counter	*counter,
	uint64_t		expected)
{
	int64_t			min_value, max_value;
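	/* Re-sum the counter now that the aggregation pass has finished. */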
	int64_t			curr_value = percpu_counter_sum(counter);

	trace_xchk_fscounters_within_range(sc->mp, expected, curr_value,
			old_value);

	/* Negative values are always wrong. */
	if (curr_value < 0)
		return false;

	/* Exact matches are always ok. */
	if (curr_value == expected)
		return true;

	min_value = min(old_value, curr_value);
	max_value = max(old_value, curr_value);

	/* Within the before-and-after range is ok. */
	if (expected >= min_value && expected <= max_value)
		return true;

	/*
	 * If the difference between the two summations is too large, the fs
	 * might just be busy and so we'll mark the scrub incomplete.  Return
	 * true here so that we don't mark the counter corrupt.
	 *
	 * XXX: In the future when userspace can grant scrub permission to
	 * quiesce the filesystem to solve the outsized variance problem, this
	 * check should be moved up and the return code changed to signal to
	 * userspace that we need quiesce permission.
	 */
	if (max_value - min_value >= XCHK_FSCOUNT_MIN_VARIANCE) {
		xchk_set_incomplete(sc);
		return true;
	}

	return false;
}

/* Check the superblock counters. */
int
xchk_fscounters(
	struct xfs_scrub	*sc)
{
	struct xfs_mount	*mp = sc->mp;
	struct xchk_fscounters	*fsc = sc->buf;
	int64_t			icount, ifree, fdblocks;
	int			error;

	/* Snapshot the percpu counters. */
	icount = percpu_counter_sum(&mp->m_icount);
	ifree = percpu_counter_sum(&mp->m_ifree);
	fdblocks = percpu_counter_sum(&mp->m_fdblocks);

	/* No negative values, please! */
	if (icount < 0 || ifree < 0 || fdblocks < 0)
		xchk_set_corrupt(sc);

	/* See if icount is obviously wrong. */
	if (icount < fsc->icount_min || icount > fsc->icount_max)
		xchk_set_corrupt(sc);

	/* See if fdblocks is obviously wrong. */
	if (fdblocks > mp->m_sb.sb_dblocks)
		xchk_set_corrupt(sc);

	/*
	 * If ifree exceeds icount by more than the minimum variance then
	 * something's probably wrong with the counters.
	 */
	if (ifree > icount && ifree - icount > XCHK_FSCOUNT_MIN_VARIANCE)
		xchk_set_corrupt(sc);

	/* Walk the incore AG headers to calculate the expected counters. */
	error = xchk_fscount_aggregate_agcounts(sc, fsc);
	if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error))
		return error;
	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
		return 0;

	/* Compare the in-core counters with whatever we counted. */
	if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, fsc->icount))
		xchk_set_corrupt(sc);

	if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree))
		xchk_set_corrupt(sc);

	if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks,
			fsc->fdblocks))
		xchk_set_corrupt(sc);

	return 0;
}