// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
 * Copyright (c) 2010 David Chinner.
 * Copyright (c) 2011 Christoph Hellwig.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_log.h"

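/*
 * Track a freed extent as busy: allocate a new busy record, link it into the
 * per-AG rbtree (sorted by start block, overlaps are not allowed), and add it
 * to the transaction's busy list (tp->t_busy) so that it can later be removed
 * by xfs_extent_busy_clear().
 */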
void
xfs_extent_busy_insert(
	struct xfs_trans	*tp,
	xfs_agnumber_t		agno,
	xfs_agblock_t		bno,
	xfs_extlen_t		len,
	unsigned int		flags)
{
	struct xfs_extent_busy	*new;
	struct xfs_extent_busy	*busyp;
	struct xfs_perag	*pag;
	struct rb_node		**rbp;
	struct rb_node		*parent = NULL;

	new = kmem_zalloc(sizeof(struct xfs_extent_busy), 0);
	new->agno = agno;
	new->bno = bno;
	new->length = len;
	INIT_LIST_HEAD(&new->list);
	new->flags = flags;

	/* trace before insert to be able to see failed inserts */
	trace_xfs_extent_busy(tp->t_mountp, agno, bno, len);

	pag = xfs_perag_get(tp->t_mountp, new->agno);
	spin_lock(&pag->pagb_lock);
	rbp = &pag->pagb_tree.rb_node;
	while (*rbp) {
		parent = *rbp;
		busyp = rb_entry(parent, struct xfs_extent_busy, rb_node);

		if (new->bno < busyp->bno) {
			rbp = &(*rbp)->rb_left;
			ASSERT(new->bno + new->length <= busyp->bno);
		} else if (new->bno > busyp->bno) {
			rbp = &(*rbp)->rb_right;
			ASSERT(bno >= busyp->bno + busyp->length);
		} else {
			ASSERT(0);
		}
	}

	rb_link_node(&new->rb_node, parent, rbp);
	rb_insert_color(&new->rb_node, &pag->pagb_tree);

	list_add(&new->list, &tp->t_busy);
	spin_unlock(&pag->pagb_lock);
	xfs_perag_put(pag);
}

/*
 * Search for a busy extent within the range of the extent we are about to
 * allocate.  xfs_extent_busy_search() takes the busy extent tree lock
 * internally, so it must not be called with that lock already held.  This
 * function returns 0 for no overlapping busy extent, -1 for an overlapping
 * but not exact busy extent, and 1 for an exact match.  This is done so that
 * a non-zero return indicates an overlap that will require a synchronous
 * transaction, but it can still be used to distinguish between a partial or
 * exact match.
 */
int
xfs_extent_busy_search(
	struct xfs_mount	*mp,
	xfs_agnumber_t		agno,
	xfs_agblock_t		bno,
	xfs_extlen_t		len)
{
	struct xfs_perag	*pag;
	struct rb_node		*rbp;
	struct xfs_extent_busy	*busyp;
	int			match = 0;

	pag = xfs_perag_get(mp, agno);
	spin_lock(&pag->pagb_lock);

	rbp = pag->pagb_tree.rb_node;

	/* find closest start bno overlap */
	while (rbp) {
		busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node);
		if (bno < busyp->bno) {
			/* may overlap, but exact start block is lower */
			if (bno + len > busyp->bno)
				match = -1;
			rbp = rbp->rb_left;
		} else if (bno > busyp->bno) {
			/* may overlap, but exact start block is higher */
			if (bno < busyp->bno + busyp->length)
				match = -1;
			rbp = rbp->rb_right;
		} else {
			/* bno matches busyp, length determines exact match */
			match = (busyp->length == len) ? 1 : -1;
			break;
		}
	}
	spin_unlock(&pag->pagb_lock);
	xfs_perag_put(pag);
	return match;
}

/*
 * The found free extent [fbno, fend] overlaps part or all of the given busy
 * extent.  If the overlap covers the beginning, the end, or all of the busy
 * extent, the overlapping portion can be made unbusy and used for the
 * allocation.  We can't split a busy extent because we can't modify a
 * transaction/CIL context busy list, but we can update an entry's block
 * number or length.
 *
 * Returns true if the extent can safely be reused, or false if the search
 * needs to be restarted.
 */
STATIC bool
xfs_extent_busy_update_extent(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	struct xfs_extent_busy	*busyp,
	xfs_agblock_t		fbno,
	xfs_extlen_t		flen,
	bool			userdata) __releases(&pag->pagb_lock)
					  __acquires(&pag->pagb_lock)
{
	xfs_agblock_t		fend = fbno + flen;
	xfs_agblock_t		bbno = busyp->bno;
	xfs_agblock_t		bend = bbno + busyp->length;

	/*
	 * This extent is currently being discarded.  Give the thread
	 * performing the discard a chance to mark the extent unbusy
	 * and retry.
	 */
	if (busyp->flags & XFS_EXTENT_BUSY_DISCARDED) {
		spin_unlock(&pag->pagb_lock);
		delay(1);
		spin_lock(&pag->pagb_lock);
		return false;
	}

	/*
	 * If there is a busy extent overlapping a user allocation, we have
	 * no choice but to force the log and retry the search.
	 *
	 * Fortunately this does not happen during normal operation, but
	 * only if the filesystem is very low on space and has to dip into
	 * the AGFL for normal allocations.
	 */
	if (userdata)
		goto out_force_log;

	if (bbno < fbno && bend > fend) {
		/*
		 * Case 1:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *        +---------+
		 *        fbno   fend
		 */

		/*
		 * We would have to split the busy extent to be able to track
		 * it correctly, which we cannot do because we would have to
		 * modify the list of busy extents attached to the transaction
		 * or CIL context, which is immutable.
		 *
		 * Force out the log to clear the busy extent and retry the
		 * search.
		 */
		goto out_force_log;
	} else if (bbno >= fbno && bend <= fend) {
		/*
		 * Case 2:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *    +-----------------+
		 *    fbno           fend
		 *
		 * Case 3:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *    +--------------------------+
		 *    fbno                    fend
		 *
		 * Case 4:
		 *             bbno           bend
		 *             +BBBBBBBBBBBBBBBBB+
		 *    +--------------------------+
		 *    fbno                    fend
		 *
		 * Case 5:
		 *             bbno           bend
		 *             +BBBBBBBBBBBBBBBBB+
		 *    +-----------------------------------+
		 *    fbno                             fend
		 *
		 */

		/*
		 * The busy extent is fully covered by the extent we are
		 * allocating, and can simply be removed from the rbtree.
		 * However we cannot remove it from the immutable list
		 * tracking busy extents in the transaction or CIL context,
		 * so set the length to zero to mark it invalid.
		 *
		 * We also need to restart the busy extent search from the
		 * tree root, because erasing the node can rearrange the
		 * tree topology.
		 */
		rb_erase(&busyp->rb_node, &pag->pagb_tree);
		busyp->length = 0;
		return false;
	} else if (fend < bend) {
		/*
		 * Case 6:
		 *              bbno           bend
		 *              +BBBBBBBBBBBBBBBBB+
		 *              +---------+
		 *              fbno   fend
		 *
		 * Case 7:
		 *             bbno           bend
		 *             +BBBBBBBBBBBBBBBBB+
		 *    +------------------+
		 *    fbno            fend
		 *
		 */
		busyp->bno = fend;
	} else if (bbno < fbno) {
		/*
		 * Case 8:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *        +-------------+
		 *        fbno       fend
		 *
		 * Case 9:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *        +----------------------+
		 *        fbno                fend
		 */
		busyp->length = fbno - busyp->bno;
	} else {
		ASSERT(0);
	}

	trace_xfs_extent_busy_reuse(mp, pag->pag_agno, fbno, flen);
	return true;

out_force_log:
	spin_unlock(&pag->pagb_lock);
	xfs_log_force(mp, XFS_LOG_SYNC);
	trace_xfs_extent_busy_force(mp, pag->pag_agno, fbno, flen);
	spin_lock(&pag->pagb_lock);
	return false;
}


/*
 * For a given extent [fbno, flen], make sure we can reuse it safely.
 */
void
xfs_extent_busy_reuse(
	struct xfs_mount	*mp,
	xfs_agnumber_t		agno,
	xfs_agblock_t		fbno,
	xfs_extlen_t		flen,
	bool			userdata)
{
	struct xfs_perag	*pag;
	struct rb_node		*rbp;

	ASSERT(flen > 0);

	pag = xfs_perag_get(mp, agno);
	spin_lock(&pag->pagb_lock);
restart:
	rbp = pag->pagb_tree.rb_node;
	while (rbp) {
		struct xfs_extent_busy *busyp =
			rb_entry(rbp, struct xfs_extent_busy, rb_node);
		xfs_agblock_t	bbno = busyp->bno;
		xfs_agblock_t	bend = bbno + busyp->length;

		if (fbno + flen <= bbno) {
			rbp = rbp->rb_left;
			continue;
		} else if (fbno >= bend) {
			rbp = rbp->rb_right;
			continue;
		}

		if (!xfs_extent_busy_update_extent(mp, pag, busyp, fbno, flen,
						   userdata))
			goto restart;
	}
	spin_unlock(&pag->pagb_lock);
	xfs_perag_put(pag);
}

/*
 * For a given extent [fbno, flen], search the busy extent list to find a
 * subset of the extent that is not busy.  If *len is smaller than
 * args->minlen no suitable extent could be found, and the higher level
 * code needs to force out the log and retry the allocation.
 *
 * Return the current busy generation for the AG if the extent is busy.  This
 * value can be used to wait for at least one of the currently busy extents
 * to be cleared.  Note that the busy list is not guaranteed to be empty after
 * the gen is woken.  The state of a specific extent must always be confirmed
 * with another call to xfs_extent_busy_trim() before it can be used.
 */
bool
xfs_extent_busy_trim(
	struct xfs_alloc_arg	*args,
	xfs_agblock_t		*bno,
	xfs_extlen_t		*len,
	unsigned		*busy_gen)
{
	xfs_agblock_t		fbno;
	xfs_extlen_t		flen;
	struct rb_node		*rbp;
	bool			ret = false;

	ASSERT(*len > 0);

	spin_lock(&args->pag->pagb_lock);
restart:
	fbno = *bno;
	flen = *len;
	rbp = args->pag->pagb_tree.rb_node;
	while (rbp && flen >= args->minlen) {
		struct xfs_extent_busy *busyp =
			rb_entry(rbp, struct xfs_extent_busy, rb_node);
		xfs_agblock_t	fend = fbno + flen;
		xfs_agblock_t	bbno = busyp->bno;
		xfs_agblock_t	bend = bbno + busyp->length;

		if (fend <= bbno) {
			rbp = rbp->rb_left;
			continue;
		} else if (fbno >= bend) {
			rbp = rbp->rb_right;
			continue;
		}

		/*
		 * If this is a metadata allocation, try to reuse the busy
		 * extent instead of trimming the allocation.
		 */
		if (!(args->datatype & XFS_ALLOC_USERDATA) &&
		    !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) {
			if (!xfs_extent_busy_update_extent(args->mp, args->pag,
							   busyp, fbno, flen,
							   false))
				goto restart;
			continue;
		}

		if (bbno <= fbno) {
			/* start overlap */

			/*
			 * Case 1:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *        +---------+
			 *        fbno   fend
			 *
			 * Case 2:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *    +-------------+
			 *    fbno       fend
			 *
			 * Case 3:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *        +-------------+
			 *        fbno       fend
			 *
			 * Case 4:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *    +-----------------+
			 *    fbno           fend
			 *
			 * No unbusy region in extent, return failure.
			 */
			if (fend <= bend)
				goto fail;

			/*
			 * Case 5:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *        +----------------------+
			 *        fbno                fend
			 *
			 * Case 6:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *    +--------------------------+
			 *    fbno                    fend
			 *
			 * Needs to be trimmed to:
			 *                       +-------+
			 *                       fbno fend
			 */
			fbno = bend;
		} else if (bend >= fend) {
			/* end overlap */

			/*
			 * Case 7:
			 *             bbno           bend
			 *             +BBBBBBBBBBBBBBBBB+
			 *             +------------------+
			 *             fbno            fend
			 *
			 * Case 8:
			 *             bbno           bend
			 *             +BBBBBBBBBBBBBBBBB+
			 *    +--------------------------+
			 *    fbno                    fend
			 *
			 * Needs to be trimmed to:
			 *    +-------+
			 *    fbno fend
			 */
			fend = bbno;
		} else {
			/* middle overlap */

			/*
			 * Case 9:
			 *             bbno           bend
			 *             +BBBBBBBBBBBBBBBBB+
			 *    +-----------------------------------+
			 *    fbno                             fend
			 *
			 * Can be trimmed to:
			 *    +-------+        OR         +-------+
			 *    fbno fend                   fbno fend
			 *
			 * Backward allocation leads to significant
			 * fragmentation of directories, which degrades
			 * directory performance, therefore we always want to
			 * choose the option that produces forward allocation
			 * patterns.
			 * Preferring the lower bno extent will make the next
			 * request use "fend" as the start of the next
			 * allocation; if the segment is no longer busy at
			 * that point, we'll get a contiguous allocation, but
			 * even if it is still busy, we will get a forward
			 * allocation.
			 * We try to avoid choosing the segment at "bend",
			 * because that can lead to the next allocation
			 * taking the segment at "fbno", which would be a
			 * backward allocation.  We only use the segment at
			 * "fbno" if it is much larger than the current
			 * requested size, because in that case there's a
			 * good chance subsequent allocations will be
			 * contiguous.
			 */
			if (bbno - fbno >= args->maxlen) {
				/* left candidate fits perfectly */
				fend = bbno;
			} else if (fend - bend >= args->maxlen * 4) {
				/* right candidate has enough free space */
				fbno = bend;
			} else if (bbno - fbno >= args->minlen) {
				/* left candidate fits minimum requirement */
				fend = bbno;
			} else {
				goto fail;
			}
		}

		flen = fend - fbno;
	}
out:

	if (fbno != *bno || flen != *len) {
		trace_xfs_extent_busy_trim(args->mp, args->agno, *bno, *len,
					   fbno, flen);
		*bno = fbno;
		*len = flen;
		*busy_gen = args->pag->pagb_gen;
		ret = true;
	}
	spin_unlock(&args->pag->pagb_lock);
	return ret;
fail:
	/*
	 * Return a zero extent length as failure indication.  All callers
	 * re-check if the trimmed extent satisfies the minlen requirement.
	 */
	flen = 0;
	goto out;
}

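/*
 * Remove a single busy extent record from the rbtree (if it is still live,
 * i.e. has a non-zero length), unlink it from its transaction/CIL busy list
 * and free it.  The caller must hold pag->pagb_lock.
 */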
STATIC void
xfs_extent_busy_clear_one(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	struct xfs_extent_busy	*busyp)
{
	if (busyp->length) {
		trace_xfs_extent_busy_clear(mp, busyp->agno, busyp->bno,
						busyp->length);
		rb_erase(&busyp->rb_node, &pag->pagb_tree);
	}

	list_del_init(&busyp->list);
	kmem_free(busyp);
}

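/*
 * Drop the perag lock and reference taken by xfs_extent_busy_clear(), first
 * bumping the busy generation and waking any waiters if extents were removed
 * from the tree.
 */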
static void
xfs_extent_busy_put_pag(
	struct xfs_perag	*pag,
	bool			wakeup)
		__releases(pag->pagb_lock)
{
	if (wakeup) {
		pag->pagb_gen++;
		wake_up_all(&pag->pagb_wait);
	}

	spin_unlock(&pag->pagb_lock);
	xfs_perag_put(pag);
}

/*
 * Remove all extents on the passed in list from the busy extents tree.
 * If do_discard is set skip extents that need to be discarded, and mark
 * these as undergoing a discard operation instead.
 */
void
xfs_extent_busy_clear(
	struct xfs_mount	*mp,
	struct list_head	*list,
	bool			do_discard)
{
	struct xfs_extent_busy	*busyp, *n;
	struct xfs_perag	*pag = NULL;
	xfs_agnumber_t		agno = NULLAGNUMBER;
	bool			wakeup = false;

	list_for_each_entry_safe(busyp, n, list, list) {
		if (busyp->agno != agno) {
			if (pag)
				xfs_extent_busy_put_pag(pag, wakeup);
			agno = busyp->agno;
			pag = xfs_perag_get(mp, agno);
			spin_lock(&pag->pagb_lock);
			wakeup = false;
		}

		if (do_discard && busyp->length &&
		    !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) {
			busyp->flags = XFS_EXTENT_BUSY_DISCARDED;
		} else {
			xfs_extent_busy_clear_one(mp, pag, busyp);
			wakeup = true;
		}
	}

	if (pag)
		xfs_extent_busy_put_pag(pag, wakeup);
}

/*
 * Flush out all busy extents for this AG.
 */
void
xfs_extent_busy_flush(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	unsigned		busy_gen)
{
	DEFINE_WAIT		(wait);
	int			error;

	error = xfs_log_force(mp, XFS_LOG_SYNC);
	if (error)
		return;

	do {
		prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
		if (busy_gen != READ_ONCE(pag->pagb_gen))
			break;
		schedule();
	} while (1);

	finish_wait(&pag->pagb_wait, &wait);
}

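/*
 * Usage sketch (illustrative only, not code from this file): allocator
 * callers typically pair xfs_extent_busy_trim() with xfs_extent_busy_flush()
 * roughly as follows, where "args" is the caller's struct xfs_alloc_arg:
 *
 *	unsigned	busy_gen;
 *	bool		busy;
 *
 *	busy = xfs_extent_busy_trim(args, &bno, &len, &busy_gen);
 *	if (len < args->minlen && busy) {
 *		xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
 *		... retry the allocation search ...
 *	}
 *
 * i.e. if trimming left nothing usable and busy extents were in the way, wait
 * for the busy generation to advance and retry.  The exact control flow in
 * the allocator differs; this only shows the trim -> flush -> retry pattern
 * described in the comments above.
 */

/*
 * Wait until every busy extent in every AG has been cleared, i.e. until all
 * of the per-AG busy extent rbtrees are empty.
 */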
void
xfs_extent_busy_wait_all(
	struct xfs_mount	*mp)
{
	DEFINE_WAIT		(wait);
	xfs_agnumber_t		agno;

	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
		struct xfs_perag *pag = xfs_perag_get(mp, agno);

		do {
			prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
			if (RB_EMPTY_ROOT(&pag->pagb_tree))
				break;
			schedule();
		} while (1);
		finish_wait(&pag->pagb_wait, &wait);

		xfs_perag_put(pag);
	}
}

/*
 * Callback for list_sort to sort busy extents by the AG they reside in.
 */
int
xfs_extent_busy_ag_cmp(
	void			*priv,
	struct list_head	*l1,
	struct list_head	*l2)
{
	struct xfs_extent_busy	*b1 =
		container_of(l1, struct xfs_extent_busy, list);
	struct xfs_extent_busy	*b2 =
		container_of(l2, struct xfs_extent_busy, list);
	s32 diff;

	diff = b1->agno - b2->agno;
	if (!diff)
		diff = b1->bno - b2->bno;
	return diff;
}
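
/*
 * Illustrative usage (the callers live outside this file): the comparator
 * above is intended for list_sort(), e.g.
 *
 *	list_sort(NULL, &busy_extent_list, xfs_extent_busy_ag_cmp);
 *
 * so that busy extents on the list end up grouped by AG and ordered by start
 * block within each AG.
 */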