xref: /OK3568_Linux_fs/kernel/fs/btrfs/relocation.c (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun // SPDX-License-Identifier: GPL-2.0
2*4882a593Smuzhiyun /*
3*4882a593Smuzhiyun  * Copyright (C) 2009 Oracle.  All rights reserved.
4*4882a593Smuzhiyun  */
5*4882a593Smuzhiyun 
6*4882a593Smuzhiyun #include <linux/sched.h>
7*4882a593Smuzhiyun #include <linux/pagemap.h>
8*4882a593Smuzhiyun #include <linux/writeback.h>
9*4882a593Smuzhiyun #include <linux/blkdev.h>
10*4882a593Smuzhiyun #include <linux/rbtree.h>
11*4882a593Smuzhiyun #include <linux/slab.h>
12*4882a593Smuzhiyun #include <linux/error-injection.h>
13*4882a593Smuzhiyun #include "ctree.h"
14*4882a593Smuzhiyun #include "disk-io.h"
15*4882a593Smuzhiyun #include "transaction.h"
16*4882a593Smuzhiyun #include "volumes.h"
17*4882a593Smuzhiyun #include "locking.h"
18*4882a593Smuzhiyun #include "btrfs_inode.h"
19*4882a593Smuzhiyun #include "async-thread.h"
20*4882a593Smuzhiyun #include "free-space-cache.h"
21*4882a593Smuzhiyun #include "inode-map.h"
22*4882a593Smuzhiyun #include "qgroup.h"
23*4882a593Smuzhiyun #include "print-tree.h"
24*4882a593Smuzhiyun #include "delalloc-space.h"
25*4882a593Smuzhiyun #include "block-group.h"
26*4882a593Smuzhiyun #include "backref.h"
27*4882a593Smuzhiyun #include "misc.h"
28*4882a593Smuzhiyun 
29*4882a593Smuzhiyun /*
30*4882a593Smuzhiyun  * Relocation overview
31*4882a593Smuzhiyun  *
32*4882a593Smuzhiyun  * [What does relocation do]
33*4882a593Smuzhiyun  *
34*4882a593Smuzhiyun  * The objective of relocation is to relocate all extents of the target block
35*4882a593Smuzhiyun  * group to other block groups.
36*4882a593Smuzhiyun  * This is used by resize (shrink only), profile conversion, space
37*4882a593Smuzhiyun  * compaction, and the balance routine to spread chunks over devices.
38*4882a593Smuzhiyun  *
39*4882a593Smuzhiyun  * 		Before		|		After
40*4882a593Smuzhiyun  * ------------------------------------------------------------------
41*4882a593Smuzhiyun  *  BG A: 10 data extents	| BG A: deleted
42*4882a593Smuzhiyun  *  BG B:  2 data extents	| BG B: 10 data extents (2 old + 8 relocated)
43*4882a593Smuzhiyun  *  BG C:  1 data extent	| BG C:  3 data extents (1 old + 2 relocated)
44*4882a593Smuzhiyun  *
45*4882a593Smuzhiyun  * [How does relocation work]
46*4882a593Smuzhiyun  *
47*4882a593Smuzhiyun  * 1.   Mark the target block group read-only
48*4882a593Smuzhiyun  *      New extents won't be allocated from the target block group.
49*4882a593Smuzhiyun  *
50*4882a593Smuzhiyun  * 2.1  Record each extent in the target block group
51*4882a593Smuzhiyun  *      To build a proper map of extents to be relocated.
52*4882a593Smuzhiyun  *
53*4882a593Smuzhiyun  * 2.2  Build data reloc tree and reloc trees
54*4882a593Smuzhiyun  *      Data reloc tree will contain an inode, recording all newly relocated
55*4882a593Smuzhiyun  *      data extents.
56*4882a593Smuzhiyun  *      There will be only one data reloc tree for one data block group.
57*4882a593Smuzhiyun  *
58*4882a593Smuzhiyun  *      Reloc tree will be a special snapshot of its source tree, containing
59*4882a593Smuzhiyun  *      relocated tree blocks.
60*4882a593Smuzhiyun  * Each tree referring to a tree block in the target block group will get its
61*4882a593Smuzhiyun  *      reloc tree built.
62*4882a593Smuzhiyun  *
63*4882a593Smuzhiyun  * 2.3  Swap source tree with its corresponding reloc tree
64*4882a593Smuzhiyun  *      Each involved tree only refers to new extents after swap.
65*4882a593Smuzhiyun  *
66*4882a593Smuzhiyun  * 3.   Cleanup reloc trees and data reloc tree.
67*4882a593Smuzhiyun  *      As old extents in the target block group are still referenced by reloc
68*4882a593Smuzhiyun  *      trees, we need to clean them up before really freeing the target block
69*4882a593Smuzhiyun  *      group.
70*4882a593Smuzhiyun  *
71*4882a593Smuzhiyun  * The main complexity is in steps 2.2 and 2.3.
72*4882a593Smuzhiyun  *
73*4882a593Smuzhiyun  * The entry point of relocation is relocate_block_group() function.
74*4882a593Smuzhiyun  */
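/*
 * Illustrative sketch only: the helper names below are hypothetical
 * placeholders, not functions defined in this file.  The high-level flow of
 * the steps above, as driven by the real entry point relocate_block_group(),
 * roughly looks like:
 *
 *	int relocate_bg_sketch(struct btrfs_block_group *bg)
 *	{
 *		mark_target_block_group_read_only(bg);		// step 1
 *		record_extents_in_block_group(bg);		// step 2.1
 *		build_reloc_trees_and_data_reloc_tree(bg);	// step 2.2
 *		swap_source_trees_with_reloc_trees(bg);		// step 2.3
 *		cleanup_reloc_trees(bg);			// step 3
 *		return 0;
 *	}
 */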
75*4882a593Smuzhiyun 
76*4882a593Smuzhiyun #define RELOCATION_RESERVED_NODES	256
77*4882a593Smuzhiyun /*
78*4882a593Smuzhiyun  * map address of tree root to tree
79*4882a593Smuzhiyun  */
80*4882a593Smuzhiyun struct mapping_node {
81*4882a593Smuzhiyun 	struct {
82*4882a593Smuzhiyun 		struct rb_node rb_node;
83*4882a593Smuzhiyun 		u64 bytenr;
84*4882a593Smuzhiyun 	}; /* Use rb_simple_node for search/insert */
85*4882a593Smuzhiyun 	void *data;
86*4882a593Smuzhiyun };
87*4882a593Smuzhiyun 
88*4882a593Smuzhiyun struct mapping_tree {
89*4882a593Smuzhiyun 	struct rb_root rb_root;
90*4882a593Smuzhiyun 	spinlock_t lock;
91*4882a593Smuzhiyun };
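/*
 * A note and minimal sketch (not upstream code): mapping_node above and
 * tree_block below both start with an anonymous struct whose layout matches
 * struct rb_simple_node from misc.h (an rb_node followed by a u64 bytenr),
 * which is what lets the generic rb_simple_search()/rb_simple_insert()
 * helpers key these rbtrees by bytenr.  A hypothetical lookup helper would
 * look like this (find_reloc_root() below does the same thing inline):
 *
 *	static struct mapping_node *mapping_tree_search(struct mapping_tree *tree,
 *							u64 bytenr)
 *	{
 *		struct rb_node *rb_node;
 *		struct mapping_node *node = NULL;
 *
 *		spin_lock(&tree->lock);
 *		rb_node = rb_simple_search(&tree->rb_root, bytenr);
 *		if (rb_node)
 *			node = rb_entry(rb_node, struct mapping_node, rb_node);
 *		spin_unlock(&tree->lock);
 *		return node;
 *	}
 */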
92*4882a593Smuzhiyun 
93*4882a593Smuzhiyun /*
94*4882a593Smuzhiyun  * represents a tree block to process
95*4882a593Smuzhiyun  */
96*4882a593Smuzhiyun struct tree_block {
97*4882a593Smuzhiyun 	struct {
98*4882a593Smuzhiyun 		struct rb_node rb_node;
99*4882a593Smuzhiyun 		u64 bytenr;
100*4882a593Smuzhiyun 	}; /* Use rb_simple_node for search/insert */
101*4882a593Smuzhiyun 	struct btrfs_key key;
102*4882a593Smuzhiyun 	unsigned int level:8;
103*4882a593Smuzhiyun 	unsigned int key_ready:1;
104*4882a593Smuzhiyun };
105*4882a593Smuzhiyun 
106*4882a593Smuzhiyun #define MAX_EXTENTS 128
107*4882a593Smuzhiyun 
108*4882a593Smuzhiyun struct file_extent_cluster {
109*4882a593Smuzhiyun 	u64 start;
110*4882a593Smuzhiyun 	u64 end;
111*4882a593Smuzhiyun 	u64 boundary[MAX_EXTENTS];
112*4882a593Smuzhiyun 	unsigned int nr;
113*4882a593Smuzhiyun };
114*4882a593Smuzhiyun 
115*4882a593Smuzhiyun struct reloc_control {
116*4882a593Smuzhiyun 	/* block group to relocate */
117*4882a593Smuzhiyun 	struct btrfs_block_group *block_group;
118*4882a593Smuzhiyun 	/* extent tree */
119*4882a593Smuzhiyun 	struct btrfs_root *extent_root;
120*4882a593Smuzhiyun 	/* inode for moving data */
121*4882a593Smuzhiyun 	struct inode *data_inode;
122*4882a593Smuzhiyun 
123*4882a593Smuzhiyun 	struct btrfs_block_rsv *block_rsv;
124*4882a593Smuzhiyun 
125*4882a593Smuzhiyun 	struct btrfs_backref_cache backref_cache;
126*4882a593Smuzhiyun 
127*4882a593Smuzhiyun 	struct file_extent_cluster cluster;
128*4882a593Smuzhiyun 	/* tree blocks have been processed */
129*4882a593Smuzhiyun 	struct extent_io_tree processed_blocks;
130*4882a593Smuzhiyun 	/* map start of tree root to corresponding reloc tree */
131*4882a593Smuzhiyun 	struct mapping_tree reloc_root_tree;
132*4882a593Smuzhiyun 	/* list of reloc trees */
133*4882a593Smuzhiyun 	struct list_head reloc_roots;
134*4882a593Smuzhiyun 	/* list of subvolume trees that get relocated */
135*4882a593Smuzhiyun 	struct list_head dirty_subvol_roots;
136*4882a593Smuzhiyun 	/* size of metadata reservation for merging reloc trees */
137*4882a593Smuzhiyun 	u64 merging_rsv_size;
138*4882a593Smuzhiyun 	/* size of relocated tree nodes */
139*4882a593Smuzhiyun 	u64 nodes_relocated;
140*4882a593Smuzhiyun 	/* reserved size for block group relocation */
141*4882a593Smuzhiyun 	u64 reserved_bytes;
142*4882a593Smuzhiyun 
143*4882a593Smuzhiyun 	u64 search_start;
144*4882a593Smuzhiyun 	u64 extents_found;
145*4882a593Smuzhiyun 
146*4882a593Smuzhiyun 	unsigned int stage:8;
147*4882a593Smuzhiyun 	unsigned int create_reloc_tree:1;
148*4882a593Smuzhiyun 	unsigned int merge_reloc_tree:1;
149*4882a593Smuzhiyun 	unsigned int found_file_extent:1;
150*4882a593Smuzhiyun };
151*4882a593Smuzhiyun 
152*4882a593Smuzhiyun /* stages of data relocation */
153*4882a593Smuzhiyun #define MOVE_DATA_EXTENTS	0
154*4882a593Smuzhiyun #define UPDATE_DATA_PTRS	1
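/*
 * Summary of the two stages, inferred from how rc->stage is used later in
 * this file: in the MOVE_DATA_EXTENTS stage the data of every extent in the
 * target block group is copied to a new location through the data reloc
 * inode; in the UPDATE_DATA_PTRS stage the trees are walked again and file
 * extent items are rewritten (see replace_file_extents() below) to point at
 * the relocated copies.
 */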
155*4882a593Smuzhiyun 
156*4882a593Smuzhiyun static void mark_block_processed(struct reloc_control *rc,
157*4882a593Smuzhiyun 				 struct btrfs_backref_node *node)
158*4882a593Smuzhiyun {
159*4882a593Smuzhiyun 	u32 blocksize;
160*4882a593Smuzhiyun 
161*4882a593Smuzhiyun 	if (node->level == 0 ||
162*4882a593Smuzhiyun 	    in_range(node->bytenr, rc->block_group->start,
163*4882a593Smuzhiyun 		     rc->block_group->length)) {
164*4882a593Smuzhiyun 		blocksize = rc->extent_root->fs_info->nodesize;
165*4882a593Smuzhiyun 		set_extent_bits(&rc->processed_blocks, node->bytenr,
166*4882a593Smuzhiyun 				node->bytenr + blocksize - 1, EXTENT_DIRTY);
167*4882a593Smuzhiyun 	}
168*4882a593Smuzhiyun 	node->processed = 1;
169*4882a593Smuzhiyun }
170*4882a593Smuzhiyun 
171*4882a593Smuzhiyun 
172*4882a593Smuzhiyun static void mapping_tree_init(struct mapping_tree *tree)
173*4882a593Smuzhiyun {
174*4882a593Smuzhiyun 	tree->rb_root = RB_ROOT;
175*4882a593Smuzhiyun 	spin_lock_init(&tree->lock);
176*4882a593Smuzhiyun }
177*4882a593Smuzhiyun 
178*4882a593Smuzhiyun /*
179*4882a593Smuzhiyun  * walk up backref nodes until we reach the node that represents the tree root
180*4882a593Smuzhiyun  */
181*4882a593Smuzhiyun static struct btrfs_backref_node *walk_up_backref(
182*4882a593Smuzhiyun 		struct btrfs_backref_node *node,
183*4882a593Smuzhiyun 		struct btrfs_backref_edge *edges[], int *index)
184*4882a593Smuzhiyun {
185*4882a593Smuzhiyun 	struct btrfs_backref_edge *edge;
186*4882a593Smuzhiyun 	int idx = *index;
187*4882a593Smuzhiyun 
188*4882a593Smuzhiyun 	while (!list_empty(&node->upper)) {
189*4882a593Smuzhiyun 		edge = list_entry(node->upper.next,
190*4882a593Smuzhiyun 				  struct btrfs_backref_edge, list[LOWER]);
191*4882a593Smuzhiyun 		edges[idx++] = edge;
192*4882a593Smuzhiyun 		node = edge->node[UPPER];
193*4882a593Smuzhiyun 	}
194*4882a593Smuzhiyun 	BUG_ON(node->detached);
195*4882a593Smuzhiyun 	*index = idx;
196*4882a593Smuzhiyun 	return node;
197*4882a593Smuzhiyun }
198*4882a593Smuzhiyun 
199*4882a593Smuzhiyun /*
200*4882a593Smuzhiyun  * walk down backref nodes to find the start of the next reference path
201*4882a593Smuzhiyun  */
202*4882a593Smuzhiyun static struct btrfs_backref_node *walk_down_backref(
203*4882a593Smuzhiyun 		struct btrfs_backref_edge *edges[], int *index)
204*4882a593Smuzhiyun {
205*4882a593Smuzhiyun 	struct btrfs_backref_edge *edge;
206*4882a593Smuzhiyun 	struct btrfs_backref_node *lower;
207*4882a593Smuzhiyun 	int idx = *index;
208*4882a593Smuzhiyun 
209*4882a593Smuzhiyun 	while (idx > 0) {
210*4882a593Smuzhiyun 		edge = edges[idx - 1];
211*4882a593Smuzhiyun 		lower = edge->node[LOWER];
212*4882a593Smuzhiyun 		if (list_is_last(&edge->list[LOWER], &lower->upper)) {
213*4882a593Smuzhiyun 			idx--;
214*4882a593Smuzhiyun 			continue;
215*4882a593Smuzhiyun 		}
216*4882a593Smuzhiyun 		edge = list_entry(edge->list[LOWER].next,
217*4882a593Smuzhiyun 				  struct btrfs_backref_edge, list[LOWER]);
218*4882a593Smuzhiyun 		edges[idx - 1] = edge;
219*4882a593Smuzhiyun 		*index = idx;
220*4882a593Smuzhiyun 		return edge->node[UPPER];
221*4882a593Smuzhiyun 	}
222*4882a593Smuzhiyun 	*index = 0;
223*4882a593Smuzhiyun 	return NULL;
224*4882a593Smuzhiyun }
225*4882a593Smuzhiyun 
226*4882a593Smuzhiyun static void update_backref_node(struct btrfs_backref_cache *cache,
227*4882a593Smuzhiyun 				struct btrfs_backref_node *node, u64 bytenr)
228*4882a593Smuzhiyun {
229*4882a593Smuzhiyun 	struct rb_node *rb_node;
230*4882a593Smuzhiyun 	rb_erase(&node->rb_node, &cache->rb_root);
231*4882a593Smuzhiyun 	node->bytenr = bytenr;
232*4882a593Smuzhiyun 	rb_node = rb_simple_insert(&cache->rb_root, node->bytenr, &node->rb_node);
233*4882a593Smuzhiyun 	if (rb_node)
234*4882a593Smuzhiyun 		btrfs_backref_panic(cache->fs_info, bytenr, -EEXIST);
235*4882a593Smuzhiyun }
236*4882a593Smuzhiyun 
237*4882a593Smuzhiyun /*
238*4882a593Smuzhiyun  * update backref cache after a transaction commit
239*4882a593Smuzhiyun  */
240*4882a593Smuzhiyun static int update_backref_cache(struct btrfs_trans_handle *trans,
241*4882a593Smuzhiyun 				struct btrfs_backref_cache *cache)
242*4882a593Smuzhiyun {
243*4882a593Smuzhiyun 	struct btrfs_backref_node *node;
244*4882a593Smuzhiyun 	int level = 0;
245*4882a593Smuzhiyun 
246*4882a593Smuzhiyun 	if (cache->last_trans == 0) {
247*4882a593Smuzhiyun 		cache->last_trans = trans->transid;
248*4882a593Smuzhiyun 		return 0;
249*4882a593Smuzhiyun 	}
250*4882a593Smuzhiyun 
251*4882a593Smuzhiyun 	if (cache->last_trans == trans->transid)
252*4882a593Smuzhiyun 		return 0;
253*4882a593Smuzhiyun 
254*4882a593Smuzhiyun 	/*
255*4882a593Smuzhiyun 	 * Detached nodes are used to avoid unnecessary backref
256*4882a593Smuzhiyun 	 * lookups. A transaction commit changes the extent tree,
257*4882a593Smuzhiyun 	 * so the detached nodes are no longer useful.
258*4882a593Smuzhiyun 	 */
259*4882a593Smuzhiyun 	while (!list_empty(&cache->detached)) {
260*4882a593Smuzhiyun 		node = list_entry(cache->detached.next,
261*4882a593Smuzhiyun 				  struct btrfs_backref_node, list);
262*4882a593Smuzhiyun 		btrfs_backref_cleanup_node(cache, node);
263*4882a593Smuzhiyun 	}
264*4882a593Smuzhiyun 
265*4882a593Smuzhiyun 	while (!list_empty(&cache->changed)) {
266*4882a593Smuzhiyun 		node = list_entry(cache->changed.next,
267*4882a593Smuzhiyun 				  struct btrfs_backref_node, list);
268*4882a593Smuzhiyun 		list_del_init(&node->list);
269*4882a593Smuzhiyun 		BUG_ON(node->pending);
270*4882a593Smuzhiyun 		update_backref_node(cache, node, node->new_bytenr);
271*4882a593Smuzhiyun 	}
272*4882a593Smuzhiyun 
273*4882a593Smuzhiyun 	/*
274*4882a593Smuzhiyun 	 * Some nodes can be left in the pending list if there were
275*4882a593Smuzhiyun 	 * errors while processing the pending nodes.
276*4882a593Smuzhiyun 	 */
277*4882a593Smuzhiyun 	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
278*4882a593Smuzhiyun 		list_for_each_entry(node, &cache->pending[level], list) {
279*4882a593Smuzhiyun 			BUG_ON(!node->pending);
280*4882a593Smuzhiyun 			if (node->bytenr == node->new_bytenr)
281*4882a593Smuzhiyun 				continue;
282*4882a593Smuzhiyun 			update_backref_node(cache, node, node->new_bytenr);
283*4882a593Smuzhiyun 		}
284*4882a593Smuzhiyun 	}
285*4882a593Smuzhiyun 
286*4882a593Smuzhiyun 	cache->last_trans = 0;
287*4882a593Smuzhiyun 	return 1;
288*4882a593Smuzhiyun }
289*4882a593Smuzhiyun 
290*4882a593Smuzhiyun static bool reloc_root_is_dead(struct btrfs_root *root)
291*4882a593Smuzhiyun {
292*4882a593Smuzhiyun 	/*
293*4882a593Smuzhiyun 	 * Pair with set_bit/clear_bit in clean_dirty_subvols and
294*4882a593Smuzhiyun 	 * btrfs_update_reloc_root. We need to see the updated bit before
295*4882a593Smuzhiyun 	 * trying to access reloc_root
296*4882a593Smuzhiyun 	 */
297*4882a593Smuzhiyun 	smp_rmb();
298*4882a593Smuzhiyun 	if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state))
299*4882a593Smuzhiyun 		return true;
300*4882a593Smuzhiyun 	return false;
301*4882a593Smuzhiyun }
302*4882a593Smuzhiyun 
303*4882a593Smuzhiyun /*
304*4882a593Smuzhiyun  * Check if this subvolume tree has a valid reloc tree.
305*4882a593Smuzhiyun  *
306*4882a593Smuzhiyun  * A reloc tree after swap is considered dead, thus not considered valid.
307*4882a593Smuzhiyun  * This is enough for most callers, as they don't distinguish dead reloc root
308*4882a593Smuzhiyun  * from no reloc root.  But btrfs_should_ignore_reloc_root() below is a
309*4882a593Smuzhiyun  * special case.
310*4882a593Smuzhiyun  */
311*4882a593Smuzhiyun static bool have_reloc_root(struct btrfs_root *root)
312*4882a593Smuzhiyun {
313*4882a593Smuzhiyun 	if (reloc_root_is_dead(root))
314*4882a593Smuzhiyun 		return false;
315*4882a593Smuzhiyun 	if (!root->reloc_root)
316*4882a593Smuzhiyun 		return false;
317*4882a593Smuzhiyun 	return true;
318*4882a593Smuzhiyun }
319*4882a593Smuzhiyun 
320*4882a593Smuzhiyun int btrfs_should_ignore_reloc_root(struct btrfs_root *root)
321*4882a593Smuzhiyun {
322*4882a593Smuzhiyun 	struct btrfs_root *reloc_root;
323*4882a593Smuzhiyun 
324*4882a593Smuzhiyun 	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
325*4882a593Smuzhiyun 		return 0;
326*4882a593Smuzhiyun 
327*4882a593Smuzhiyun 	/* This root has been merged with its reloc tree, we can ignore it */
328*4882a593Smuzhiyun 	if (reloc_root_is_dead(root))
329*4882a593Smuzhiyun 		return 1;
330*4882a593Smuzhiyun 
331*4882a593Smuzhiyun 	reloc_root = root->reloc_root;
332*4882a593Smuzhiyun 	if (!reloc_root)
333*4882a593Smuzhiyun 		return 0;
334*4882a593Smuzhiyun 
335*4882a593Smuzhiyun 	if (btrfs_header_generation(reloc_root->commit_root) ==
336*4882a593Smuzhiyun 	    root->fs_info->running_transaction->transid)
337*4882a593Smuzhiyun 		return 0;
338*4882a593Smuzhiyun 	/*
339*4882a593Smuzhiyun 	 * If there is a reloc tree and it was created in a previous
340*4882a593Smuzhiyun 	 * transaction, backref lookup can find the reloc tree, so the
341*4882a593Smuzhiyun 	 * backref node for the fs tree root is useless for
342*4882a593Smuzhiyun 	 * relocation.
343*4882a593Smuzhiyun 	 */
344*4882a593Smuzhiyun 	return 1;
345*4882a593Smuzhiyun }
346*4882a593Smuzhiyun 
347*4882a593Smuzhiyun /*
348*4882a593Smuzhiyun  * find reloc tree by address of tree root
349*4882a593Smuzhiyun  */
350*4882a593Smuzhiyun struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr)
351*4882a593Smuzhiyun {
352*4882a593Smuzhiyun 	struct reloc_control *rc = fs_info->reloc_ctl;
353*4882a593Smuzhiyun 	struct rb_node *rb_node;
354*4882a593Smuzhiyun 	struct mapping_node *node;
355*4882a593Smuzhiyun 	struct btrfs_root *root = NULL;
356*4882a593Smuzhiyun 
357*4882a593Smuzhiyun 	ASSERT(rc);
358*4882a593Smuzhiyun 	spin_lock(&rc->reloc_root_tree.lock);
359*4882a593Smuzhiyun 	rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr);
360*4882a593Smuzhiyun 	if (rb_node) {
361*4882a593Smuzhiyun 		node = rb_entry(rb_node, struct mapping_node, rb_node);
362*4882a593Smuzhiyun 		root = (struct btrfs_root *)node->data;
363*4882a593Smuzhiyun 	}
364*4882a593Smuzhiyun 	spin_unlock(&rc->reloc_root_tree.lock);
365*4882a593Smuzhiyun 	return btrfs_grab_root(root);
366*4882a593Smuzhiyun }
367*4882a593Smuzhiyun 
368*4882a593Smuzhiyun /*
369*4882a593Smuzhiyun  * For useless nodes, do two major cleanups:
370*4882a593Smuzhiyun  *
371*4882a593Smuzhiyun  * - Cleanup the children edges and nodes
372*4882a593Smuzhiyun  *   If child node is also orphan (no parent) during cleanup, then the child
373*4882a593Smuzhiyun  *   node will also be cleaned up.
374*4882a593Smuzhiyun  *
375*4882a593Smuzhiyun  * - Freeing up leaves (level 0), keeping non-leaf nodes detached
376*4882a593Smuzhiyun  *   For non-leaf nodes, the node is still cached as "detached"
377*4882a593Smuzhiyun  *
378*4882a593Smuzhiyun  * Return false if @node is not in the @useless_nodes list.
379*4882a593Smuzhiyun  * Return true if @node is in the @useless_nodes list.
380*4882a593Smuzhiyun  */
381*4882a593Smuzhiyun static bool handle_useless_nodes(struct reloc_control *rc,
382*4882a593Smuzhiyun 				 struct btrfs_backref_node *node)
383*4882a593Smuzhiyun {
384*4882a593Smuzhiyun 	struct btrfs_backref_cache *cache = &rc->backref_cache;
385*4882a593Smuzhiyun 	struct list_head *useless_node = &cache->useless_node;
386*4882a593Smuzhiyun 	bool ret = false;
387*4882a593Smuzhiyun 
388*4882a593Smuzhiyun 	while (!list_empty(useless_node)) {
389*4882a593Smuzhiyun 		struct btrfs_backref_node *cur;
390*4882a593Smuzhiyun 
391*4882a593Smuzhiyun 		cur = list_first_entry(useless_node, struct btrfs_backref_node,
392*4882a593Smuzhiyun 				 list);
393*4882a593Smuzhiyun 		list_del_init(&cur->list);
394*4882a593Smuzhiyun 
395*4882a593Smuzhiyun 		/* Only tree root nodes can be added to @useless_nodes */
396*4882a593Smuzhiyun 		ASSERT(list_empty(&cur->upper));
397*4882a593Smuzhiyun 
398*4882a593Smuzhiyun 		if (cur == node)
399*4882a593Smuzhiyun 			ret = true;
400*4882a593Smuzhiyun 
401*4882a593Smuzhiyun 		/* The node is the lowest node */
402*4882a593Smuzhiyun 		if (cur->lowest) {
403*4882a593Smuzhiyun 			list_del_init(&cur->lower);
404*4882a593Smuzhiyun 			cur->lowest = 0;
405*4882a593Smuzhiyun 		}
406*4882a593Smuzhiyun 
407*4882a593Smuzhiyun 		/* Cleanup the lower edges */
408*4882a593Smuzhiyun 		while (!list_empty(&cur->lower)) {
409*4882a593Smuzhiyun 			struct btrfs_backref_edge *edge;
410*4882a593Smuzhiyun 			struct btrfs_backref_node *lower;
411*4882a593Smuzhiyun 
412*4882a593Smuzhiyun 			edge = list_entry(cur->lower.next,
413*4882a593Smuzhiyun 					struct btrfs_backref_edge, list[UPPER]);
414*4882a593Smuzhiyun 			list_del(&edge->list[UPPER]);
415*4882a593Smuzhiyun 			list_del(&edge->list[LOWER]);
416*4882a593Smuzhiyun 			lower = edge->node[LOWER];
417*4882a593Smuzhiyun 			btrfs_backref_free_edge(cache, edge);
418*4882a593Smuzhiyun 
419*4882a593Smuzhiyun 			/* Child node is also orphan, queue for cleanup */
420*4882a593Smuzhiyun 			if (list_empty(&lower->upper))
421*4882a593Smuzhiyun 				list_add(&lower->list, useless_node);
422*4882a593Smuzhiyun 		}
423*4882a593Smuzhiyun 		/* Mark this block processed for relocation */
424*4882a593Smuzhiyun 		mark_block_processed(rc, cur);
425*4882a593Smuzhiyun 
426*4882a593Smuzhiyun 		/*
427*4882a593Smuzhiyun 		 * Backref nodes for tree leaves are deleted from the cache.
428*4882a593Smuzhiyun 		 * Backref nodes for upper level tree blocks are left in the
429*4882a593Smuzhiyun 		 * cache to avoid unnecessary backref lookup.
430*4882a593Smuzhiyun 		 */
431*4882a593Smuzhiyun 		if (cur->level > 0) {
432*4882a593Smuzhiyun 			list_add(&cur->list, &cache->detached);
433*4882a593Smuzhiyun 			cur->detached = 1;
434*4882a593Smuzhiyun 		} else {
435*4882a593Smuzhiyun 			rb_erase(&cur->rb_node, &cache->rb_root);
436*4882a593Smuzhiyun 			btrfs_backref_free_node(cache, cur);
437*4882a593Smuzhiyun 		}
438*4882a593Smuzhiyun 	}
439*4882a593Smuzhiyun 	return ret;
440*4882a593Smuzhiyun }
441*4882a593Smuzhiyun 
442*4882a593Smuzhiyun /*
443*4882a593Smuzhiyun  * Build a backref tree for a given tree block. The root of the backref tree
444*4882a593Smuzhiyun  * corresponds to the tree block, and the leaves of the backref tree correspond
445*4882a593Smuzhiyun  * to the roots of b-trees that reference the tree block.
446*4882a593Smuzhiyun  *
447*4882a593Smuzhiyun  * The basic idea of this function is to check backrefs of a given block to
448*4882a593Smuzhiyun  * find upper level blocks that reference the block, and then check backrefs
449*4882a593Smuzhiyun  * of these upper level blocks recursively. The recursion stops when the tree
450*4882a593Smuzhiyun  * root is reached or backrefs for the block are cached.
451*4882a593Smuzhiyun  *
452*4882a593Smuzhiyun  * NOTE: if we find that backrefs for a block are cached, we know backrefs for
453*4882a593Smuzhiyun  * all upper level blocks that directly/indirectly reference the block are also
454*4882a593Smuzhiyun  * cached.
455*4882a593Smuzhiyun  */
456*4882a593Smuzhiyun static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
457*4882a593Smuzhiyun 			struct reloc_control *rc, struct btrfs_key *node_key,
458*4882a593Smuzhiyun 			int level, u64 bytenr)
459*4882a593Smuzhiyun {
460*4882a593Smuzhiyun 	struct btrfs_backref_iter *iter;
461*4882a593Smuzhiyun 	struct btrfs_backref_cache *cache = &rc->backref_cache;
462*4882a593Smuzhiyun 	/* For searching parent of TREE_BLOCK_REF */
463*4882a593Smuzhiyun 	struct btrfs_path *path;
464*4882a593Smuzhiyun 	struct btrfs_backref_node *cur;
465*4882a593Smuzhiyun 	struct btrfs_backref_node *node = NULL;
466*4882a593Smuzhiyun 	struct btrfs_backref_edge *edge;
467*4882a593Smuzhiyun 	int ret;
468*4882a593Smuzhiyun 	int err = 0;
469*4882a593Smuzhiyun 
470*4882a593Smuzhiyun 	iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info, GFP_NOFS);
471*4882a593Smuzhiyun 	if (!iter)
472*4882a593Smuzhiyun 		return ERR_PTR(-ENOMEM);
473*4882a593Smuzhiyun 	path = btrfs_alloc_path();
474*4882a593Smuzhiyun 	if (!path) {
475*4882a593Smuzhiyun 		err = -ENOMEM;
476*4882a593Smuzhiyun 		goto out;
477*4882a593Smuzhiyun 	}
478*4882a593Smuzhiyun 
479*4882a593Smuzhiyun 	node = btrfs_backref_alloc_node(cache, bytenr, level);
480*4882a593Smuzhiyun 	if (!node) {
481*4882a593Smuzhiyun 		err = -ENOMEM;
482*4882a593Smuzhiyun 		goto out;
483*4882a593Smuzhiyun 	}
484*4882a593Smuzhiyun 
485*4882a593Smuzhiyun 	node->lowest = 1;
486*4882a593Smuzhiyun 	cur = node;
487*4882a593Smuzhiyun 
488*4882a593Smuzhiyun 	/* Breadth-first search to build backref cache */
489*4882a593Smuzhiyun 	do {
490*4882a593Smuzhiyun 		ret = btrfs_backref_add_tree_node(cache, path, iter, node_key,
491*4882a593Smuzhiyun 						  cur);
492*4882a593Smuzhiyun 		if (ret < 0) {
493*4882a593Smuzhiyun 			err = ret;
494*4882a593Smuzhiyun 			goto out;
495*4882a593Smuzhiyun 		}
496*4882a593Smuzhiyun 		edge = list_first_entry_or_null(&cache->pending_edge,
497*4882a593Smuzhiyun 				struct btrfs_backref_edge, list[UPPER]);
498*4882a593Smuzhiyun 		/*
499*4882a593Smuzhiyun 		 * The pending list isn't empty, take the first block to
500*4882a593Smuzhiyun 		 * process
501*4882a593Smuzhiyun 		 */
502*4882a593Smuzhiyun 		if (edge) {
503*4882a593Smuzhiyun 			list_del_init(&edge->list[UPPER]);
504*4882a593Smuzhiyun 			cur = edge->node[UPPER];
505*4882a593Smuzhiyun 		}
506*4882a593Smuzhiyun 	} while (edge);
507*4882a593Smuzhiyun 
508*4882a593Smuzhiyun 	/* Finish the upper linkage of newly added edges/nodes */
509*4882a593Smuzhiyun 	ret = btrfs_backref_finish_upper_links(cache, node);
510*4882a593Smuzhiyun 	if (ret < 0) {
511*4882a593Smuzhiyun 		err = ret;
512*4882a593Smuzhiyun 		goto out;
513*4882a593Smuzhiyun 	}
514*4882a593Smuzhiyun 
515*4882a593Smuzhiyun 	if (handle_useless_nodes(rc, node))
516*4882a593Smuzhiyun 		node = NULL;
517*4882a593Smuzhiyun out:
518*4882a593Smuzhiyun 	btrfs_backref_iter_free(iter);
519*4882a593Smuzhiyun 	btrfs_free_path(path);
520*4882a593Smuzhiyun 	if (err) {
521*4882a593Smuzhiyun 		btrfs_backref_error_cleanup(cache, node);
522*4882a593Smuzhiyun 		return ERR_PTR(err);
523*4882a593Smuzhiyun 	}
524*4882a593Smuzhiyun 	ASSERT(!node || !node->detached);
525*4882a593Smuzhiyun 	ASSERT(list_empty(&cache->useless_node) &&
526*4882a593Smuzhiyun 	       list_empty(&cache->pending_edge));
527*4882a593Smuzhiyun 	return node;
528*4882a593Smuzhiyun }
529*4882a593Smuzhiyun 
530*4882a593Smuzhiyun /*
531*4882a593Smuzhiyun  * helper to add a backref node for the newly created snapshot.
532*4882a593Smuzhiyun  * the backref node is created by cloning the backref node that
533*4882a593Smuzhiyun  * corresponds to the root of the source tree
534*4882a593Smuzhiyun  */
535*4882a593Smuzhiyun static int clone_backref_node(struct btrfs_trans_handle *trans,
536*4882a593Smuzhiyun 			      struct reloc_control *rc,
537*4882a593Smuzhiyun 			      struct btrfs_root *src,
538*4882a593Smuzhiyun 			      struct btrfs_root *dest)
539*4882a593Smuzhiyun {
540*4882a593Smuzhiyun 	struct btrfs_root *reloc_root = src->reloc_root;
541*4882a593Smuzhiyun 	struct btrfs_backref_cache *cache = &rc->backref_cache;
542*4882a593Smuzhiyun 	struct btrfs_backref_node *node = NULL;
543*4882a593Smuzhiyun 	struct btrfs_backref_node *new_node;
544*4882a593Smuzhiyun 	struct btrfs_backref_edge *edge;
545*4882a593Smuzhiyun 	struct btrfs_backref_edge *new_edge;
546*4882a593Smuzhiyun 	struct rb_node *rb_node;
547*4882a593Smuzhiyun 
548*4882a593Smuzhiyun 	if (cache->last_trans > 0)
549*4882a593Smuzhiyun 		update_backref_cache(trans, cache);
550*4882a593Smuzhiyun 
551*4882a593Smuzhiyun 	rb_node = rb_simple_search(&cache->rb_root, src->commit_root->start);
552*4882a593Smuzhiyun 	if (rb_node) {
553*4882a593Smuzhiyun 		node = rb_entry(rb_node, struct btrfs_backref_node, rb_node);
554*4882a593Smuzhiyun 		if (node->detached)
555*4882a593Smuzhiyun 			node = NULL;
556*4882a593Smuzhiyun 		else
557*4882a593Smuzhiyun 			BUG_ON(node->new_bytenr != reloc_root->node->start);
558*4882a593Smuzhiyun 	}
559*4882a593Smuzhiyun 
560*4882a593Smuzhiyun 	if (!node) {
561*4882a593Smuzhiyun 		rb_node = rb_simple_search(&cache->rb_root,
562*4882a593Smuzhiyun 					   reloc_root->commit_root->start);
563*4882a593Smuzhiyun 		if (rb_node) {
564*4882a593Smuzhiyun 			node = rb_entry(rb_node, struct btrfs_backref_node,
565*4882a593Smuzhiyun 					rb_node);
566*4882a593Smuzhiyun 			BUG_ON(node->detached);
567*4882a593Smuzhiyun 		}
568*4882a593Smuzhiyun 	}
569*4882a593Smuzhiyun 
570*4882a593Smuzhiyun 	if (!node)
571*4882a593Smuzhiyun 		return 0;
572*4882a593Smuzhiyun 
573*4882a593Smuzhiyun 	new_node = btrfs_backref_alloc_node(cache, dest->node->start,
574*4882a593Smuzhiyun 					    node->level);
575*4882a593Smuzhiyun 	if (!new_node)
576*4882a593Smuzhiyun 		return -ENOMEM;
577*4882a593Smuzhiyun 
578*4882a593Smuzhiyun 	new_node->lowest = node->lowest;
579*4882a593Smuzhiyun 	new_node->checked = 1;
580*4882a593Smuzhiyun 	new_node->root = btrfs_grab_root(dest);
581*4882a593Smuzhiyun 	ASSERT(new_node->root);
582*4882a593Smuzhiyun 
583*4882a593Smuzhiyun 	if (!node->lowest) {
584*4882a593Smuzhiyun 		list_for_each_entry(edge, &node->lower, list[UPPER]) {
585*4882a593Smuzhiyun 			new_edge = btrfs_backref_alloc_edge(cache);
586*4882a593Smuzhiyun 			if (!new_edge)
587*4882a593Smuzhiyun 				goto fail;
588*4882a593Smuzhiyun 
589*4882a593Smuzhiyun 			btrfs_backref_link_edge(new_edge, edge->node[LOWER],
590*4882a593Smuzhiyun 						new_node, LINK_UPPER);
591*4882a593Smuzhiyun 		}
592*4882a593Smuzhiyun 	} else {
593*4882a593Smuzhiyun 		list_add_tail(&new_node->lower, &cache->leaves);
594*4882a593Smuzhiyun 	}
595*4882a593Smuzhiyun 
596*4882a593Smuzhiyun 	rb_node = rb_simple_insert(&cache->rb_root, new_node->bytenr,
597*4882a593Smuzhiyun 				   &new_node->rb_node);
598*4882a593Smuzhiyun 	if (rb_node)
599*4882a593Smuzhiyun 		btrfs_backref_panic(trans->fs_info, new_node->bytenr, -EEXIST);
600*4882a593Smuzhiyun 
601*4882a593Smuzhiyun 	if (!new_node->lowest) {
602*4882a593Smuzhiyun 		list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
603*4882a593Smuzhiyun 			list_add_tail(&new_edge->list[LOWER],
604*4882a593Smuzhiyun 				      &new_edge->node[LOWER]->upper);
605*4882a593Smuzhiyun 		}
606*4882a593Smuzhiyun 	}
607*4882a593Smuzhiyun 	return 0;
608*4882a593Smuzhiyun fail:
609*4882a593Smuzhiyun 	while (!list_empty(&new_node->lower)) {
610*4882a593Smuzhiyun 		new_edge = list_entry(new_node->lower.next,
611*4882a593Smuzhiyun 				      struct btrfs_backref_edge, list[UPPER]);
612*4882a593Smuzhiyun 		list_del(&new_edge->list[UPPER]);
613*4882a593Smuzhiyun 		btrfs_backref_free_edge(cache, new_edge);
614*4882a593Smuzhiyun 	}
615*4882a593Smuzhiyun 	btrfs_backref_free_node(cache, new_node);
616*4882a593Smuzhiyun 	return -ENOMEM;
617*4882a593Smuzhiyun }
618*4882a593Smuzhiyun 
619*4882a593Smuzhiyun /*
620*4882a593Smuzhiyun  * helper to add 'address of tree root -> reloc tree' mapping
621*4882a593Smuzhiyun  */
622*4882a593Smuzhiyun static int __must_check __add_reloc_root(struct btrfs_root *root)
623*4882a593Smuzhiyun {
624*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = root->fs_info;
625*4882a593Smuzhiyun 	struct rb_node *rb_node;
626*4882a593Smuzhiyun 	struct mapping_node *node;
627*4882a593Smuzhiyun 	struct reloc_control *rc = fs_info->reloc_ctl;
628*4882a593Smuzhiyun 
629*4882a593Smuzhiyun 	node = kmalloc(sizeof(*node), GFP_NOFS);
630*4882a593Smuzhiyun 	if (!node)
631*4882a593Smuzhiyun 		return -ENOMEM;
632*4882a593Smuzhiyun 
633*4882a593Smuzhiyun 	node->bytenr = root->commit_root->start;
634*4882a593Smuzhiyun 	node->data = root;
635*4882a593Smuzhiyun 
636*4882a593Smuzhiyun 	spin_lock(&rc->reloc_root_tree.lock);
637*4882a593Smuzhiyun 	rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
638*4882a593Smuzhiyun 				   node->bytenr, &node->rb_node);
639*4882a593Smuzhiyun 	spin_unlock(&rc->reloc_root_tree.lock);
640*4882a593Smuzhiyun 	if (rb_node) {
641*4882a593Smuzhiyun 		btrfs_panic(fs_info, -EEXIST,
642*4882a593Smuzhiyun 			    "Duplicate root found for start=%llu while inserting into relocation tree",
643*4882a593Smuzhiyun 			    node->bytenr);
644*4882a593Smuzhiyun 	}
645*4882a593Smuzhiyun 
646*4882a593Smuzhiyun 	list_add_tail(&root->root_list, &rc->reloc_roots);
647*4882a593Smuzhiyun 	return 0;
648*4882a593Smuzhiyun }
649*4882a593Smuzhiyun 
650*4882a593Smuzhiyun /*
651*4882a593Smuzhiyun  * helper to delete the 'address of tree root -> reloc tree'
652*4882a593Smuzhiyun  * mapping
653*4882a593Smuzhiyun  */
654*4882a593Smuzhiyun static void __del_reloc_root(struct btrfs_root *root)
655*4882a593Smuzhiyun {
656*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = root->fs_info;
657*4882a593Smuzhiyun 	struct rb_node *rb_node;
658*4882a593Smuzhiyun 	struct mapping_node *node = NULL;
659*4882a593Smuzhiyun 	struct reloc_control *rc = fs_info->reloc_ctl;
660*4882a593Smuzhiyun 	bool put_ref = false;
661*4882a593Smuzhiyun 
662*4882a593Smuzhiyun 	if (rc && root->node) {
663*4882a593Smuzhiyun 		spin_lock(&rc->reloc_root_tree.lock);
664*4882a593Smuzhiyun 		rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
665*4882a593Smuzhiyun 					   root->commit_root->start);
666*4882a593Smuzhiyun 		if (rb_node) {
667*4882a593Smuzhiyun 			node = rb_entry(rb_node, struct mapping_node, rb_node);
668*4882a593Smuzhiyun 			rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
669*4882a593Smuzhiyun 			RB_CLEAR_NODE(&node->rb_node);
670*4882a593Smuzhiyun 		}
671*4882a593Smuzhiyun 		spin_unlock(&rc->reloc_root_tree.lock);
672*4882a593Smuzhiyun 		ASSERT(!node || (struct btrfs_root *)node->data == root);
673*4882a593Smuzhiyun 	}
674*4882a593Smuzhiyun 
675*4882a593Smuzhiyun 	/*
676*4882a593Smuzhiyun 	 * We only put the reloc root here if it's on the list.  There's a lot
677*4882a593Smuzhiyun 	 * of places where the pattern is to splice the rc->reloc_roots, process
678*4882a593Smuzhiyun 	 * the reloc roots, and then add the reloc root back onto
679*4882a593Smuzhiyun 	 * rc->reloc_roots.  If we call __del_reloc_root while it's off of the
680*4882a593Smuzhiyun 	 * list we don't want the reference being dropped, because the guy
681*4882a593Smuzhiyun 	 * messing with the list is in charge of the reference.
682*4882a593Smuzhiyun 	 */
683*4882a593Smuzhiyun 	spin_lock(&fs_info->trans_lock);
684*4882a593Smuzhiyun 	if (!list_empty(&root->root_list)) {
685*4882a593Smuzhiyun 		put_ref = true;
686*4882a593Smuzhiyun 		list_del_init(&root->root_list);
687*4882a593Smuzhiyun 	}
688*4882a593Smuzhiyun 	spin_unlock(&fs_info->trans_lock);
689*4882a593Smuzhiyun 	if (put_ref)
690*4882a593Smuzhiyun 		btrfs_put_root(root);
691*4882a593Smuzhiyun 	kfree(node);
692*4882a593Smuzhiyun }
693*4882a593Smuzhiyun 
694*4882a593Smuzhiyun /*
695*4882a593Smuzhiyun  * helper to update the 'address of tree root -> reloc tree'
696*4882a593Smuzhiyun  * mapping
697*4882a593Smuzhiyun  */
698*4882a593Smuzhiyun static int __update_reloc_root(struct btrfs_root *root)
699*4882a593Smuzhiyun {
700*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = root->fs_info;
701*4882a593Smuzhiyun 	struct rb_node *rb_node;
702*4882a593Smuzhiyun 	struct mapping_node *node = NULL;
703*4882a593Smuzhiyun 	struct reloc_control *rc = fs_info->reloc_ctl;
704*4882a593Smuzhiyun 
705*4882a593Smuzhiyun 	spin_lock(&rc->reloc_root_tree.lock);
706*4882a593Smuzhiyun 	rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
707*4882a593Smuzhiyun 				   root->commit_root->start);
708*4882a593Smuzhiyun 	if (rb_node) {
709*4882a593Smuzhiyun 		node = rb_entry(rb_node, struct mapping_node, rb_node);
710*4882a593Smuzhiyun 		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
711*4882a593Smuzhiyun 	}
712*4882a593Smuzhiyun 	spin_unlock(&rc->reloc_root_tree.lock);
713*4882a593Smuzhiyun 
714*4882a593Smuzhiyun 	if (!node)
715*4882a593Smuzhiyun 		return 0;
716*4882a593Smuzhiyun 	BUG_ON((struct btrfs_root *)node->data != root);
717*4882a593Smuzhiyun 
718*4882a593Smuzhiyun 	spin_lock(&rc->reloc_root_tree.lock);
719*4882a593Smuzhiyun 	node->bytenr = root->node->start;
720*4882a593Smuzhiyun 	rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
721*4882a593Smuzhiyun 				   node->bytenr, &node->rb_node);
722*4882a593Smuzhiyun 	spin_unlock(&rc->reloc_root_tree.lock);
723*4882a593Smuzhiyun 	if (rb_node)
724*4882a593Smuzhiyun 		btrfs_backref_panic(fs_info, node->bytenr, -EEXIST);
725*4882a593Smuzhiyun 	return 0;
726*4882a593Smuzhiyun }
727*4882a593Smuzhiyun 
728*4882a593Smuzhiyun static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
729*4882a593Smuzhiyun 					struct btrfs_root *root, u64 objectid)
730*4882a593Smuzhiyun {
731*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = root->fs_info;
732*4882a593Smuzhiyun 	struct btrfs_root *reloc_root;
733*4882a593Smuzhiyun 	struct extent_buffer *eb;
734*4882a593Smuzhiyun 	struct btrfs_root_item *root_item;
735*4882a593Smuzhiyun 	struct btrfs_key root_key;
736*4882a593Smuzhiyun 	int ret = 0;
737*4882a593Smuzhiyun 	bool must_abort = false;
738*4882a593Smuzhiyun 
739*4882a593Smuzhiyun 	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
740*4882a593Smuzhiyun 	if (!root_item)
741*4882a593Smuzhiyun 		return ERR_PTR(-ENOMEM);
742*4882a593Smuzhiyun 
743*4882a593Smuzhiyun 	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
744*4882a593Smuzhiyun 	root_key.type = BTRFS_ROOT_ITEM_KEY;
745*4882a593Smuzhiyun 	root_key.offset = objectid;
746*4882a593Smuzhiyun 
747*4882a593Smuzhiyun 	if (root->root_key.objectid == objectid) {
748*4882a593Smuzhiyun 		u64 commit_root_gen;
749*4882a593Smuzhiyun 
750*4882a593Smuzhiyun 		/* called by btrfs_init_reloc_root */
751*4882a593Smuzhiyun 		ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
752*4882a593Smuzhiyun 				      BTRFS_TREE_RELOC_OBJECTID);
753*4882a593Smuzhiyun 		if (ret)
754*4882a593Smuzhiyun 			goto fail;
755*4882a593Smuzhiyun 
756*4882a593Smuzhiyun 		/*
757*4882a593Smuzhiyun 		 * Set the last_snapshot field to the generation of the commit
758*4882a593Smuzhiyun 		 * root - like this ctree.c:btrfs_block_can_be_shared() behaves
759*4882a593Smuzhiyun 		 * correctly (returns true) when the relocation root is created
760*4882a593Smuzhiyun 		 * either inside the critical section of a transaction commit
761*4882a593Smuzhiyun 		 * (through transaction.c:qgroup_account_snapshot()) and when
762*4882a593Smuzhiyun 		 * it's created before the transaction commit is started.
763*4882a593Smuzhiyun 		 */
764*4882a593Smuzhiyun 		commit_root_gen = btrfs_header_generation(root->commit_root);
765*4882a593Smuzhiyun 		btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen);
766*4882a593Smuzhiyun 	} else {
767*4882a593Smuzhiyun 		/*
768*4882a593Smuzhiyun 		 * called by btrfs_reloc_post_snapshot_hook.
769*4882a593Smuzhiyun 		 * the source tree is a reloc tree, all tree blocks
770*4882a593Smuzhiyun 		 * modified after it was created have RELOC flag
771*4882a593Smuzhiyun 		 * set in their headers. so it's OK to not update
772*4882a593Smuzhiyun 		 * the 'last_snapshot'.
773*4882a593Smuzhiyun 		 */
774*4882a593Smuzhiyun 		ret = btrfs_copy_root(trans, root, root->node, &eb,
775*4882a593Smuzhiyun 				      BTRFS_TREE_RELOC_OBJECTID);
776*4882a593Smuzhiyun 		if (ret)
777*4882a593Smuzhiyun 			goto fail;
778*4882a593Smuzhiyun 	}
779*4882a593Smuzhiyun 
780*4882a593Smuzhiyun 	/*
781*4882a593Smuzhiyun 	 * We have changed references at this point, we must abort the
782*4882a593Smuzhiyun 	 * transaction if anything fails.
783*4882a593Smuzhiyun 	 */
784*4882a593Smuzhiyun 	must_abort = true;
785*4882a593Smuzhiyun 
786*4882a593Smuzhiyun 	memcpy(root_item, &root->root_item, sizeof(*root_item));
787*4882a593Smuzhiyun 	btrfs_set_root_bytenr(root_item, eb->start);
788*4882a593Smuzhiyun 	btrfs_set_root_level(root_item, btrfs_header_level(eb));
789*4882a593Smuzhiyun 	btrfs_set_root_generation(root_item, trans->transid);
790*4882a593Smuzhiyun 
791*4882a593Smuzhiyun 	if (root->root_key.objectid == objectid) {
792*4882a593Smuzhiyun 		btrfs_set_root_refs(root_item, 0);
793*4882a593Smuzhiyun 		memset(&root_item->drop_progress, 0,
794*4882a593Smuzhiyun 		       sizeof(struct btrfs_disk_key));
795*4882a593Smuzhiyun 		root_item->drop_level = 0;
796*4882a593Smuzhiyun 	}
797*4882a593Smuzhiyun 
798*4882a593Smuzhiyun 	btrfs_tree_unlock(eb);
799*4882a593Smuzhiyun 	free_extent_buffer(eb);
800*4882a593Smuzhiyun 
801*4882a593Smuzhiyun 	ret = btrfs_insert_root(trans, fs_info->tree_root,
802*4882a593Smuzhiyun 				&root_key, root_item);
803*4882a593Smuzhiyun 	if (ret)
804*4882a593Smuzhiyun 		goto fail;
805*4882a593Smuzhiyun 
806*4882a593Smuzhiyun 	kfree(root_item);
807*4882a593Smuzhiyun 
808*4882a593Smuzhiyun 	reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key);
809*4882a593Smuzhiyun 	if (IS_ERR(reloc_root)) {
810*4882a593Smuzhiyun 		ret = PTR_ERR(reloc_root);
811*4882a593Smuzhiyun 		goto abort;
812*4882a593Smuzhiyun 	}
813*4882a593Smuzhiyun 	set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
814*4882a593Smuzhiyun 	reloc_root->last_trans = trans->transid;
815*4882a593Smuzhiyun 	return reloc_root;
816*4882a593Smuzhiyun fail:
817*4882a593Smuzhiyun 	kfree(root_item);
818*4882a593Smuzhiyun abort:
819*4882a593Smuzhiyun 	if (must_abort)
820*4882a593Smuzhiyun 		btrfs_abort_transaction(trans, ret);
821*4882a593Smuzhiyun 	return ERR_PTR(ret);
822*4882a593Smuzhiyun }
823*4882a593Smuzhiyun 
824*4882a593Smuzhiyun /*
825*4882a593Smuzhiyun  * create a reloc tree for a given fs tree. a reloc tree is just a
826*4882a593Smuzhiyun  * snapshot of the fs tree with a special root objectid.
827*4882a593Smuzhiyun  *
828*4882a593Smuzhiyun  * The reloc_root comes out of here with two references, one for
829*4882a593Smuzhiyun  * root->reloc_root, and another for being on the rc->reloc_roots list.
830*4882a593Smuzhiyun  */
831*4882a593Smuzhiyun int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
832*4882a593Smuzhiyun 			  struct btrfs_root *root)
833*4882a593Smuzhiyun {
834*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = root->fs_info;
835*4882a593Smuzhiyun 	struct btrfs_root *reloc_root;
836*4882a593Smuzhiyun 	struct reloc_control *rc = fs_info->reloc_ctl;
837*4882a593Smuzhiyun 	struct btrfs_block_rsv *rsv;
838*4882a593Smuzhiyun 	int clear_rsv = 0;
839*4882a593Smuzhiyun 	int ret;
840*4882a593Smuzhiyun 
841*4882a593Smuzhiyun 	if (!rc)
842*4882a593Smuzhiyun 		return 0;
843*4882a593Smuzhiyun 
844*4882a593Smuzhiyun 	/*
845*4882a593Smuzhiyun 	 * The subvolume has reloc tree but the swap is finished, no need to
846*4882a593Smuzhiyun 	 * create/update the dead reloc tree
847*4882a593Smuzhiyun 	 */
848*4882a593Smuzhiyun 	if (reloc_root_is_dead(root))
849*4882a593Smuzhiyun 		return 0;
850*4882a593Smuzhiyun 
851*4882a593Smuzhiyun 	/*
852*4882a593Smuzhiyun 	 * This is subtle but important.  We do not do
853*4882a593Smuzhiyun 	 * record_root_in_transaction for reloc roots, instead we record their
854*4882a593Smuzhiyun 	 * corresponding fs root, and then here we update the last trans for the
855*4882a593Smuzhiyun 	 * reloc root.  This means that we have to do this for the entire life
856*4882a593Smuzhiyun 	 * of the reloc root, regardless of which stage of the relocation we are
857*4882a593Smuzhiyun 	 * in.
858*4882a593Smuzhiyun 	 */
859*4882a593Smuzhiyun 	if (root->reloc_root) {
860*4882a593Smuzhiyun 		reloc_root = root->reloc_root;
861*4882a593Smuzhiyun 		reloc_root->last_trans = trans->transid;
862*4882a593Smuzhiyun 		return 0;
863*4882a593Smuzhiyun 	}
864*4882a593Smuzhiyun 
865*4882a593Smuzhiyun 	/*
866*4882a593Smuzhiyun 	 * We are merging reloc roots, we do not need new reloc trees.  Also
867*4882a593Smuzhiyun 	 * reloc trees never need their own reloc tree.
868*4882a593Smuzhiyun 	 */
869*4882a593Smuzhiyun 	if (!rc->create_reloc_tree ||
870*4882a593Smuzhiyun 	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
871*4882a593Smuzhiyun 		return 0;
872*4882a593Smuzhiyun 
873*4882a593Smuzhiyun 	if (!trans->reloc_reserved) {
874*4882a593Smuzhiyun 		rsv = trans->block_rsv;
875*4882a593Smuzhiyun 		trans->block_rsv = rc->block_rsv;
876*4882a593Smuzhiyun 		clear_rsv = 1;
877*4882a593Smuzhiyun 	}
878*4882a593Smuzhiyun 	reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
879*4882a593Smuzhiyun 	if (clear_rsv)
880*4882a593Smuzhiyun 		trans->block_rsv = rsv;
881*4882a593Smuzhiyun 
882*4882a593Smuzhiyun 	ret = __add_reloc_root(reloc_root);
883*4882a593Smuzhiyun 	BUG_ON(ret < 0);
884*4882a593Smuzhiyun 	root->reloc_root = btrfs_grab_root(reloc_root);
885*4882a593Smuzhiyun 	return 0;
886*4882a593Smuzhiyun }
887*4882a593Smuzhiyun 
888*4882a593Smuzhiyun /*
889*4882a593Smuzhiyun  * update root item of reloc tree
890*4882a593Smuzhiyun  */
891*4882a593Smuzhiyun int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
892*4882a593Smuzhiyun 			    struct btrfs_root *root)
893*4882a593Smuzhiyun {
894*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = root->fs_info;
895*4882a593Smuzhiyun 	struct btrfs_root *reloc_root;
896*4882a593Smuzhiyun 	struct btrfs_root_item *root_item;
897*4882a593Smuzhiyun 	int ret;
898*4882a593Smuzhiyun 
899*4882a593Smuzhiyun 	if (!have_reloc_root(root))
900*4882a593Smuzhiyun 		return 0;
901*4882a593Smuzhiyun 
902*4882a593Smuzhiyun 	reloc_root = root->reloc_root;
903*4882a593Smuzhiyun 	root_item = &reloc_root->root_item;
904*4882a593Smuzhiyun 
905*4882a593Smuzhiyun 	/*
906*4882a593Smuzhiyun 	 * We are probably ok here, but __del_reloc_root() will drop its ref of
907*4882a593Smuzhiyun 	 * the root.  We have the ref for root->reloc_root, but just in case
908*4882a593Smuzhiyun 	 * hold it while we update the reloc root.
909*4882a593Smuzhiyun 	 */
910*4882a593Smuzhiyun 	btrfs_grab_root(reloc_root);
911*4882a593Smuzhiyun 
912*4882a593Smuzhiyun 	/* root->reloc_root will stay until current relocation finished */
913*4882a593Smuzhiyun 	if (fs_info->reloc_ctl->merge_reloc_tree &&
914*4882a593Smuzhiyun 	    btrfs_root_refs(root_item) == 0) {
915*4882a593Smuzhiyun 		set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
916*4882a593Smuzhiyun 		/*
917*4882a593Smuzhiyun 		 * Mark the tree as dead before we change reloc_root so
918*4882a593Smuzhiyun 		 * have_reloc_root will not touch it from now on.
919*4882a593Smuzhiyun 		 */
920*4882a593Smuzhiyun 		smp_wmb();
921*4882a593Smuzhiyun 		__del_reloc_root(reloc_root);
922*4882a593Smuzhiyun 	}
923*4882a593Smuzhiyun 
924*4882a593Smuzhiyun 	if (reloc_root->commit_root != reloc_root->node) {
925*4882a593Smuzhiyun 		__update_reloc_root(reloc_root);
926*4882a593Smuzhiyun 		btrfs_set_root_node(root_item, reloc_root->node);
927*4882a593Smuzhiyun 		free_extent_buffer(reloc_root->commit_root);
928*4882a593Smuzhiyun 		reloc_root->commit_root = btrfs_root_node(reloc_root);
929*4882a593Smuzhiyun 	}
930*4882a593Smuzhiyun 
931*4882a593Smuzhiyun 	ret = btrfs_update_root(trans, fs_info->tree_root,
932*4882a593Smuzhiyun 				&reloc_root->root_key, root_item);
933*4882a593Smuzhiyun 	btrfs_put_root(reloc_root);
934*4882a593Smuzhiyun 	return ret;
935*4882a593Smuzhiyun }
936*4882a593Smuzhiyun 
937*4882a593Smuzhiyun /*
938*4882a593Smuzhiyun  * helper to find first cached inode with inode number >= objectid
939*4882a593Smuzhiyun  * in a subvolume
940*4882a593Smuzhiyun  */
941*4882a593Smuzhiyun static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
942*4882a593Smuzhiyun {
943*4882a593Smuzhiyun 	struct rb_node *node;
944*4882a593Smuzhiyun 	struct rb_node *prev;
945*4882a593Smuzhiyun 	struct btrfs_inode *entry;
946*4882a593Smuzhiyun 	struct inode *inode;
947*4882a593Smuzhiyun 
948*4882a593Smuzhiyun 	spin_lock(&root->inode_lock);
949*4882a593Smuzhiyun again:
950*4882a593Smuzhiyun 	node = root->inode_tree.rb_node;
951*4882a593Smuzhiyun 	prev = NULL;
952*4882a593Smuzhiyun 	while (node) {
953*4882a593Smuzhiyun 		prev = node;
954*4882a593Smuzhiyun 		entry = rb_entry(node, struct btrfs_inode, rb_node);
955*4882a593Smuzhiyun 
956*4882a593Smuzhiyun 		if (objectid < btrfs_ino(entry))
957*4882a593Smuzhiyun 			node = node->rb_left;
958*4882a593Smuzhiyun 		else if (objectid > btrfs_ino(entry))
959*4882a593Smuzhiyun 			node = node->rb_right;
960*4882a593Smuzhiyun 		else
961*4882a593Smuzhiyun 			break;
962*4882a593Smuzhiyun 	}
963*4882a593Smuzhiyun 	if (!node) {
964*4882a593Smuzhiyun 		while (prev) {
965*4882a593Smuzhiyun 			entry = rb_entry(prev, struct btrfs_inode, rb_node);
966*4882a593Smuzhiyun 			if (objectid <= btrfs_ino(entry)) {
967*4882a593Smuzhiyun 				node = prev;
968*4882a593Smuzhiyun 				break;
969*4882a593Smuzhiyun 			}
970*4882a593Smuzhiyun 			prev = rb_next(prev);
971*4882a593Smuzhiyun 		}
972*4882a593Smuzhiyun 	}
973*4882a593Smuzhiyun 	while (node) {
974*4882a593Smuzhiyun 		entry = rb_entry(node, struct btrfs_inode, rb_node);
975*4882a593Smuzhiyun 		inode = igrab(&entry->vfs_inode);
976*4882a593Smuzhiyun 		if (inode) {
977*4882a593Smuzhiyun 			spin_unlock(&root->inode_lock);
978*4882a593Smuzhiyun 			return inode;
979*4882a593Smuzhiyun 		}
980*4882a593Smuzhiyun 
981*4882a593Smuzhiyun 		objectid = btrfs_ino(entry) + 1;
982*4882a593Smuzhiyun 		if (cond_resched_lock(&root->inode_lock))
983*4882a593Smuzhiyun 			goto again;
984*4882a593Smuzhiyun 
985*4882a593Smuzhiyun 		node = rb_next(node);
986*4882a593Smuzhiyun 	}
987*4882a593Smuzhiyun 	spin_unlock(&root->inode_lock);
988*4882a593Smuzhiyun 	return NULL;
989*4882a593Smuzhiyun }
990*4882a593Smuzhiyun 
991*4882a593Smuzhiyun /*
992*4882a593Smuzhiyun  * get new location of data
993*4882a593Smuzhiyun  */
994*4882a593Smuzhiyun static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
995*4882a593Smuzhiyun 			    u64 bytenr, u64 num_bytes)
996*4882a593Smuzhiyun {
997*4882a593Smuzhiyun 	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
998*4882a593Smuzhiyun 	struct btrfs_path *path;
999*4882a593Smuzhiyun 	struct btrfs_file_extent_item *fi;
1000*4882a593Smuzhiyun 	struct extent_buffer *leaf;
1001*4882a593Smuzhiyun 	int ret;
1002*4882a593Smuzhiyun 
1003*4882a593Smuzhiyun 	path = btrfs_alloc_path();
1004*4882a593Smuzhiyun 	if (!path)
1005*4882a593Smuzhiyun 		return -ENOMEM;
1006*4882a593Smuzhiyun 
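	/*
	 * Note (assumption based on how the data reloc inode is set up when it
	 * is created elsewhere in this file): index_cnt holds the start of the
	 * block group being relocated, so subtracting it converts the extent's
	 * disk bytenr into the file offset inside the data reloc inode, where
	 * the relocated copy's new disk location is recorded.
	 */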
1007*4882a593Smuzhiyun 	bytenr -= BTRFS_I(reloc_inode)->index_cnt;
1008*4882a593Smuzhiyun 	ret = btrfs_lookup_file_extent(NULL, root, path,
1009*4882a593Smuzhiyun 			btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0);
1010*4882a593Smuzhiyun 	if (ret < 0)
1011*4882a593Smuzhiyun 		goto out;
1012*4882a593Smuzhiyun 	if (ret > 0) {
1013*4882a593Smuzhiyun 		ret = -ENOENT;
1014*4882a593Smuzhiyun 		goto out;
1015*4882a593Smuzhiyun 	}
1016*4882a593Smuzhiyun 
1017*4882a593Smuzhiyun 	leaf = path->nodes[0];
1018*4882a593Smuzhiyun 	fi = btrfs_item_ptr(leaf, path->slots[0],
1019*4882a593Smuzhiyun 			    struct btrfs_file_extent_item);
1020*4882a593Smuzhiyun 
1021*4882a593Smuzhiyun 	BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
1022*4882a593Smuzhiyun 	       btrfs_file_extent_compression(leaf, fi) ||
1023*4882a593Smuzhiyun 	       btrfs_file_extent_encryption(leaf, fi) ||
1024*4882a593Smuzhiyun 	       btrfs_file_extent_other_encoding(leaf, fi));
1025*4882a593Smuzhiyun 
1026*4882a593Smuzhiyun 	if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
1027*4882a593Smuzhiyun 		ret = -EINVAL;
1028*4882a593Smuzhiyun 		goto out;
1029*4882a593Smuzhiyun 	}
1030*4882a593Smuzhiyun 
1031*4882a593Smuzhiyun 	*new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1032*4882a593Smuzhiyun 	ret = 0;
1033*4882a593Smuzhiyun out:
1034*4882a593Smuzhiyun 	btrfs_free_path(path);
1035*4882a593Smuzhiyun 	return ret;
1036*4882a593Smuzhiyun }
1037*4882a593Smuzhiyun 
1038*4882a593Smuzhiyun /*
1039*4882a593Smuzhiyun  * update file extent items in the tree leaf to point to
1040*4882a593Smuzhiyun  * the new locations.
1041*4882a593Smuzhiyun  */
1042*4882a593Smuzhiyun static noinline_for_stack
1043*4882a593Smuzhiyun int replace_file_extents(struct btrfs_trans_handle *trans,
1044*4882a593Smuzhiyun 			 struct reloc_control *rc,
1045*4882a593Smuzhiyun 			 struct btrfs_root *root,
1046*4882a593Smuzhiyun 			 struct extent_buffer *leaf)
1047*4882a593Smuzhiyun {
1048*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = root->fs_info;
1049*4882a593Smuzhiyun 	struct btrfs_key key;
1050*4882a593Smuzhiyun 	struct btrfs_file_extent_item *fi;
1051*4882a593Smuzhiyun 	struct inode *inode = NULL;
1052*4882a593Smuzhiyun 	u64 parent;
1053*4882a593Smuzhiyun 	u64 bytenr;
1054*4882a593Smuzhiyun 	u64 new_bytenr = 0;
1055*4882a593Smuzhiyun 	u64 num_bytes;
1056*4882a593Smuzhiyun 	u64 end;
1057*4882a593Smuzhiyun 	u32 nritems;
1058*4882a593Smuzhiyun 	u32 i;
1059*4882a593Smuzhiyun 	int ret = 0;
1060*4882a593Smuzhiyun 	int first = 1;
1061*4882a593Smuzhiyun 	int dirty = 0;
1062*4882a593Smuzhiyun 
1063*4882a593Smuzhiyun 	if (rc->stage != UPDATE_DATA_PTRS)
1064*4882a593Smuzhiyun 		return 0;
1065*4882a593Smuzhiyun 
1066*4882a593Smuzhiyun 	/* reloc trees always use full backref */
1067*4882a593Smuzhiyun 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1068*4882a593Smuzhiyun 		parent = leaf->start;
1069*4882a593Smuzhiyun 	else
1070*4882a593Smuzhiyun 		parent = 0;
1071*4882a593Smuzhiyun 
1072*4882a593Smuzhiyun 	nritems = btrfs_header_nritems(leaf);
1073*4882a593Smuzhiyun 	for (i = 0; i < nritems; i++) {
1074*4882a593Smuzhiyun 		struct btrfs_ref ref = { 0 };
1075*4882a593Smuzhiyun 
1076*4882a593Smuzhiyun 		cond_resched();
1077*4882a593Smuzhiyun 		btrfs_item_key_to_cpu(leaf, &key, i);
1078*4882a593Smuzhiyun 		if (key.type != BTRFS_EXTENT_DATA_KEY)
1079*4882a593Smuzhiyun 			continue;
1080*4882a593Smuzhiyun 		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
1081*4882a593Smuzhiyun 		if (btrfs_file_extent_type(leaf, fi) ==
1082*4882a593Smuzhiyun 		    BTRFS_FILE_EXTENT_INLINE)
1083*4882a593Smuzhiyun 			continue;
1084*4882a593Smuzhiyun 		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1085*4882a593Smuzhiyun 		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1086*4882a593Smuzhiyun 		if (bytenr == 0)
1087*4882a593Smuzhiyun 			continue;
1088*4882a593Smuzhiyun 		if (!in_range(bytenr, rc->block_group->start,
1089*4882a593Smuzhiyun 			      rc->block_group->length))
1090*4882a593Smuzhiyun 			continue;
1091*4882a593Smuzhiyun 
1092*4882a593Smuzhiyun 		/*
1093*4882a593Smuzhiyun 		 * if we are modifying a block in the fs tree, wait for readpage
1094*4882a593Smuzhiyun 		 * to complete and drop the extent cache
1095*4882a593Smuzhiyun 		 */
1096*4882a593Smuzhiyun 		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1097*4882a593Smuzhiyun 			if (first) {
1098*4882a593Smuzhiyun 				inode = find_next_inode(root, key.objectid);
1099*4882a593Smuzhiyun 				first = 0;
1100*4882a593Smuzhiyun 			} else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) {
1101*4882a593Smuzhiyun 				btrfs_add_delayed_iput(inode);
1102*4882a593Smuzhiyun 				inode = find_next_inode(root, key.objectid);
1103*4882a593Smuzhiyun 			}
1104*4882a593Smuzhiyun 			if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) {
1105*4882a593Smuzhiyun 				end = key.offset +
1106*4882a593Smuzhiyun 				      btrfs_file_extent_num_bytes(leaf, fi);
1107*4882a593Smuzhiyun 				WARN_ON(!IS_ALIGNED(key.offset,
1108*4882a593Smuzhiyun 						    fs_info->sectorsize));
1109*4882a593Smuzhiyun 				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
1110*4882a593Smuzhiyun 				end--;
1111*4882a593Smuzhiyun 				ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
1112*4882a593Smuzhiyun 						      key.offset, end);
1113*4882a593Smuzhiyun 				if (!ret)
1114*4882a593Smuzhiyun 					continue;
1115*4882a593Smuzhiyun 
1116*4882a593Smuzhiyun 				btrfs_drop_extent_cache(BTRFS_I(inode),
1117*4882a593Smuzhiyun 						key.offset,	end, 1);
1118*4882a593Smuzhiyun 				unlock_extent(&BTRFS_I(inode)->io_tree,
1119*4882a593Smuzhiyun 					      key.offset, end);
1120*4882a593Smuzhiyun 			}
1121*4882a593Smuzhiyun 		}
1122*4882a593Smuzhiyun 
1123*4882a593Smuzhiyun 		ret = get_new_location(rc->data_inode, &new_bytenr,
1124*4882a593Smuzhiyun 				       bytenr, num_bytes);
1125*4882a593Smuzhiyun 		if (ret) {
1126*4882a593Smuzhiyun 			/*
1127*4882a593Smuzhiyun 			 * Don't have to abort since we've not changed anything
1128*4882a593Smuzhiyun 			 * in the file extent yet.
1129*4882a593Smuzhiyun 			 */
1130*4882a593Smuzhiyun 			break;
1131*4882a593Smuzhiyun 		}
1132*4882a593Smuzhiyun 
1133*4882a593Smuzhiyun 		btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
1134*4882a593Smuzhiyun 		dirty = 1;
1135*4882a593Smuzhiyun 
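		/*
		 * (editor's note) key.offset is rewound by the extent item's
		 * offset so that the backref added below and the one dropped
		 * after it both use the file offset of the start of the
		 * on-disk extent, which is how data backrefs are keyed.
		 */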
1136*4882a593Smuzhiyun 		key.offset -= btrfs_file_extent_offset(leaf, fi);
1137*4882a593Smuzhiyun 		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
1138*4882a593Smuzhiyun 				       num_bytes, parent);
1139*4882a593Smuzhiyun 		ref.real_root = root->root_key.objectid;
1140*4882a593Smuzhiyun 		btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
1141*4882a593Smuzhiyun 				    key.objectid, key.offset);
1142*4882a593Smuzhiyun 		ret = btrfs_inc_extent_ref(trans, &ref);
1143*4882a593Smuzhiyun 		if (ret) {
1144*4882a593Smuzhiyun 			btrfs_abort_transaction(trans, ret);
1145*4882a593Smuzhiyun 			break;
1146*4882a593Smuzhiyun 		}
1147*4882a593Smuzhiyun 
1148*4882a593Smuzhiyun 		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
1149*4882a593Smuzhiyun 				       num_bytes, parent);
1150*4882a593Smuzhiyun 		ref.real_root = root->root_key.objectid;
1151*4882a593Smuzhiyun 		btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
1152*4882a593Smuzhiyun 				    key.objectid, key.offset);
1153*4882a593Smuzhiyun 		ret = btrfs_free_extent(trans, &ref);
1154*4882a593Smuzhiyun 		if (ret) {
1155*4882a593Smuzhiyun 			btrfs_abort_transaction(trans, ret);
1156*4882a593Smuzhiyun 			break;
1157*4882a593Smuzhiyun 		}
1158*4882a593Smuzhiyun 	}
1159*4882a593Smuzhiyun 	if (dirty)
1160*4882a593Smuzhiyun 		btrfs_mark_buffer_dirty(leaf);
1161*4882a593Smuzhiyun 	if (inode)
1162*4882a593Smuzhiyun 		btrfs_add_delayed_iput(inode);
1163*4882a593Smuzhiyun 	return ret;
1164*4882a593Smuzhiyun }
1165*4882a593Smuzhiyun 
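/*
 * (editor's note) compare the node key at @slot of @eb with the key at
 * path->slots[level] of path->nodes[level]; returns 0 when the two
 * on-disk keys are byte-for-byte identical.
 */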
1166*4882a593Smuzhiyun static noinline_for_stack
1167*4882a593Smuzhiyun int memcmp_node_keys(struct extent_buffer *eb, int slot,
1168*4882a593Smuzhiyun 		     struct btrfs_path *path, int level)
1169*4882a593Smuzhiyun {
1170*4882a593Smuzhiyun 	struct btrfs_disk_key key1;
1171*4882a593Smuzhiyun 	struct btrfs_disk_key key2;
1172*4882a593Smuzhiyun 	btrfs_node_key(eb, &key1, slot);
1173*4882a593Smuzhiyun 	btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
1174*4882a593Smuzhiyun 	return memcmp(&key1, &key2, sizeof(key1));
1175*4882a593Smuzhiyun }
1176*4882a593Smuzhiyun 
1177*4882a593Smuzhiyun /*
1178*4882a593Smuzhiyun  * try to replace tree blocks in fs tree with the new blocks
1179*4882a593Smuzhiyun  * in reloc tree. tree blocks that haven't been modified since the
1180*4882a593Smuzhiyun  * reloc tree was created can be replaced.
1181*4882a593Smuzhiyun  *
1182*4882a593Smuzhiyun  * if a block was replaced, the level of the block + 1 is returned.
1183*4882a593Smuzhiyun  * if no block was replaced, 0 is returned. if there are other
1184*4882a593Smuzhiyun  * errors, a negative error number is returned.
1185*4882a593Smuzhiyun  */
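/*
 * Illustrative caller pattern (editor's sketch, modelled on the use in
 * merge_reloc_root() below, not a new interface):
 *
 *	ret = replace_path(trans, rc, root, reloc_root, path,
 *			   &next_key, level, max_level);
 *	if (ret < 0)
 *		goto out;	(hard error)
 *	if (ret > 0)
 *		level = ret;	(a block at level ret - 1 was swapped)
 */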
1186*4882a593Smuzhiyun static noinline_for_stack
1187*4882a593Smuzhiyun int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
1188*4882a593Smuzhiyun 		 struct btrfs_root *dest, struct btrfs_root *src,
1189*4882a593Smuzhiyun 		 struct btrfs_path *path, struct btrfs_key *next_key,
1190*4882a593Smuzhiyun 		 int lowest_level, int max_level)
1191*4882a593Smuzhiyun {
1192*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = dest->fs_info;
1193*4882a593Smuzhiyun 	struct extent_buffer *eb;
1194*4882a593Smuzhiyun 	struct extent_buffer *parent;
1195*4882a593Smuzhiyun 	struct btrfs_ref ref = { 0 };
1196*4882a593Smuzhiyun 	struct btrfs_key key;
1197*4882a593Smuzhiyun 	u64 old_bytenr;
1198*4882a593Smuzhiyun 	u64 new_bytenr;
1199*4882a593Smuzhiyun 	u64 old_ptr_gen;
1200*4882a593Smuzhiyun 	u64 new_ptr_gen;
1201*4882a593Smuzhiyun 	u64 last_snapshot;
1202*4882a593Smuzhiyun 	u32 blocksize;
1203*4882a593Smuzhiyun 	int cow = 0;
1204*4882a593Smuzhiyun 	int level;
1205*4882a593Smuzhiyun 	int ret;
1206*4882a593Smuzhiyun 	int slot;
1207*4882a593Smuzhiyun 
1208*4882a593Smuzhiyun 	ASSERT(src->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1209*4882a593Smuzhiyun 	ASSERT(dest->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1210*4882a593Smuzhiyun 
1211*4882a593Smuzhiyun 	last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1212*4882a593Smuzhiyun again:
1213*4882a593Smuzhiyun 	slot = path->slots[lowest_level];
1214*4882a593Smuzhiyun 	btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1215*4882a593Smuzhiyun 
1216*4882a593Smuzhiyun 	eb = btrfs_lock_root_node(dest);
1217*4882a593Smuzhiyun 	btrfs_set_lock_blocking_write(eb);
1218*4882a593Smuzhiyun 	level = btrfs_header_level(eb);
1219*4882a593Smuzhiyun 
1220*4882a593Smuzhiyun 	if (level < lowest_level) {
1221*4882a593Smuzhiyun 		btrfs_tree_unlock(eb);
1222*4882a593Smuzhiyun 		free_extent_buffer(eb);
1223*4882a593Smuzhiyun 		return 0;
1224*4882a593Smuzhiyun 	}
1225*4882a593Smuzhiyun 
1226*4882a593Smuzhiyun 	if (cow) {
1227*4882a593Smuzhiyun 		ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb,
1228*4882a593Smuzhiyun 				      BTRFS_NESTING_COW);
1229*4882a593Smuzhiyun 		BUG_ON(ret);
1230*4882a593Smuzhiyun 	}
1231*4882a593Smuzhiyun 	btrfs_set_lock_blocking_write(eb);
1232*4882a593Smuzhiyun 
1233*4882a593Smuzhiyun 	if (next_key) {
1234*4882a593Smuzhiyun 		next_key->objectid = (u64)-1;
1235*4882a593Smuzhiyun 		next_key->type = (u8)-1;
1236*4882a593Smuzhiyun 		next_key->offset = (u64)-1;
1237*4882a593Smuzhiyun 	}
1238*4882a593Smuzhiyun 
1239*4882a593Smuzhiyun 	parent = eb;
1240*4882a593Smuzhiyun 	while (1) {
1241*4882a593Smuzhiyun 		struct btrfs_key first_key;
1242*4882a593Smuzhiyun 
1243*4882a593Smuzhiyun 		level = btrfs_header_level(parent);
1244*4882a593Smuzhiyun 		ASSERT(level >= lowest_level);
1245*4882a593Smuzhiyun 
1246*4882a593Smuzhiyun 		ret = btrfs_bin_search(parent, &key, &slot);
1247*4882a593Smuzhiyun 		if (ret < 0)
1248*4882a593Smuzhiyun 			break;
1249*4882a593Smuzhiyun 		if (ret && slot > 0)
1250*4882a593Smuzhiyun 			slot--;
1251*4882a593Smuzhiyun 
1252*4882a593Smuzhiyun 		if (next_key && slot + 1 < btrfs_header_nritems(parent))
1253*4882a593Smuzhiyun 			btrfs_node_key_to_cpu(parent, next_key, slot + 1);
1254*4882a593Smuzhiyun 
1255*4882a593Smuzhiyun 		old_bytenr = btrfs_node_blockptr(parent, slot);
1256*4882a593Smuzhiyun 		blocksize = fs_info->nodesize;
1257*4882a593Smuzhiyun 		old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
1258*4882a593Smuzhiyun 		btrfs_node_key_to_cpu(parent, &first_key, slot);
1259*4882a593Smuzhiyun 
1260*4882a593Smuzhiyun 		if (level <= max_level) {
1261*4882a593Smuzhiyun 			eb = path->nodes[level];
1262*4882a593Smuzhiyun 			new_bytenr = btrfs_node_blockptr(eb,
1263*4882a593Smuzhiyun 							path->slots[level]);
1264*4882a593Smuzhiyun 			new_ptr_gen = btrfs_node_ptr_generation(eb,
1265*4882a593Smuzhiyun 							path->slots[level]);
1266*4882a593Smuzhiyun 		} else {
1267*4882a593Smuzhiyun 			new_bytenr = 0;
1268*4882a593Smuzhiyun 			new_ptr_gen = 0;
1269*4882a593Smuzhiyun 		}
1270*4882a593Smuzhiyun 
1271*4882a593Smuzhiyun 		if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
1272*4882a593Smuzhiyun 			ret = level;
1273*4882a593Smuzhiyun 			break;
1274*4882a593Smuzhiyun 		}
1275*4882a593Smuzhiyun 
1276*4882a593Smuzhiyun 		if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1277*4882a593Smuzhiyun 		    memcmp_node_keys(parent, slot, path, level)) {
1278*4882a593Smuzhiyun 			if (level <= lowest_level) {
1279*4882a593Smuzhiyun 				ret = 0;
1280*4882a593Smuzhiyun 				break;
1281*4882a593Smuzhiyun 			}
1282*4882a593Smuzhiyun 
1283*4882a593Smuzhiyun 			eb = read_tree_block(fs_info, old_bytenr, old_ptr_gen,
1284*4882a593Smuzhiyun 					     level - 1, &first_key);
1285*4882a593Smuzhiyun 			if (IS_ERR(eb)) {
1286*4882a593Smuzhiyun 				ret = PTR_ERR(eb);
1287*4882a593Smuzhiyun 				break;
1288*4882a593Smuzhiyun 			} else if (!extent_buffer_uptodate(eb)) {
1289*4882a593Smuzhiyun 				ret = -EIO;
1290*4882a593Smuzhiyun 				free_extent_buffer(eb);
1291*4882a593Smuzhiyun 				break;
1292*4882a593Smuzhiyun 			}
1293*4882a593Smuzhiyun 			btrfs_tree_lock(eb);
1294*4882a593Smuzhiyun 			if (cow) {
1295*4882a593Smuzhiyun 				ret = btrfs_cow_block(trans, dest, eb, parent,
1296*4882a593Smuzhiyun 						      slot, &eb,
1297*4882a593Smuzhiyun 						      BTRFS_NESTING_COW);
1298*4882a593Smuzhiyun 				BUG_ON(ret);
1299*4882a593Smuzhiyun 			}
1300*4882a593Smuzhiyun 			btrfs_set_lock_blocking_write(eb);
1301*4882a593Smuzhiyun 
1302*4882a593Smuzhiyun 			btrfs_tree_unlock(parent);
1303*4882a593Smuzhiyun 			free_extent_buffer(parent);
1304*4882a593Smuzhiyun 
1305*4882a593Smuzhiyun 			parent = eb;
1306*4882a593Smuzhiyun 			continue;
1307*4882a593Smuzhiyun 		}
1308*4882a593Smuzhiyun 
1309*4882a593Smuzhiyun 		if (!cow) {
1310*4882a593Smuzhiyun 			btrfs_tree_unlock(parent);
1311*4882a593Smuzhiyun 			free_extent_buffer(parent);
1312*4882a593Smuzhiyun 			cow = 1;
1313*4882a593Smuzhiyun 			goto again;
1314*4882a593Smuzhiyun 		}
1315*4882a593Smuzhiyun 
1316*4882a593Smuzhiyun 		btrfs_node_key_to_cpu(path->nodes[level], &key,
1317*4882a593Smuzhiyun 				      path->slots[level]);
1318*4882a593Smuzhiyun 		btrfs_release_path(path);
1319*4882a593Smuzhiyun 
1320*4882a593Smuzhiyun 		path->lowest_level = level;
1321*4882a593Smuzhiyun 		ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
1322*4882a593Smuzhiyun 		path->lowest_level = 0;
1323*4882a593Smuzhiyun 		BUG_ON(ret);
1324*4882a593Smuzhiyun 
1325*4882a593Smuzhiyun 		/*
1326*4882a593Smuzhiyun 		 * Inform qgroup to trace both subtrees.
1327*4882a593Smuzhiyun 		 *
1328*4882a593Smuzhiyun 		 * We must trace both trees.
1329*4882a593Smuzhiyun 		 * 1) Tree reloc subtree
1330*4882a593Smuzhiyun 		 *    If not traced, we will leak data numbers
1331*4882a593Smuzhiyun 		 * 2) Fs subtree
1332*4882a593Smuzhiyun 		 *    If not traced, we will double count old data
1333*4882a593Smuzhiyun 		 *
1334*4882a593Smuzhiyun 		 * We don't scan the subtree right now, but only record
1335*4882a593Smuzhiyun 		 * the swapped tree blocks.
1336*4882a593Smuzhiyun 		 * The real subtree rescan is delayed until we have new
1337*4882a593Smuzhiyun 		 * CoW on the subtree root node before transaction commit.
1338*4882a593Smuzhiyun 		 */
1339*4882a593Smuzhiyun 		ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
1340*4882a593Smuzhiyun 				rc->block_group, parent, slot,
1341*4882a593Smuzhiyun 				path->nodes[level], path->slots[level],
1342*4882a593Smuzhiyun 				last_snapshot);
1343*4882a593Smuzhiyun 		if (ret < 0)
1344*4882a593Smuzhiyun 			break;
1345*4882a593Smuzhiyun 		/*
1346*4882a593Smuzhiyun 		 * swap blocks in fs tree and reloc tree.
1347*4882a593Smuzhiyun 		 */
1348*4882a593Smuzhiyun 		btrfs_set_node_blockptr(parent, slot, new_bytenr);
1349*4882a593Smuzhiyun 		btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
1350*4882a593Smuzhiyun 		btrfs_mark_buffer_dirty(parent);
1351*4882a593Smuzhiyun 
1352*4882a593Smuzhiyun 		btrfs_set_node_blockptr(path->nodes[level],
1353*4882a593Smuzhiyun 					path->slots[level], old_bytenr);
1354*4882a593Smuzhiyun 		btrfs_set_node_ptr_generation(path->nodes[level],
1355*4882a593Smuzhiyun 					      path->slots[level], old_ptr_gen);
1356*4882a593Smuzhiyun 		btrfs_mark_buffer_dirty(path->nodes[level]);
1357*4882a593Smuzhiyun 
1358*4882a593Smuzhiyun 		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr,
1359*4882a593Smuzhiyun 				       blocksize, path->nodes[level]->start);
1360*4882a593Smuzhiyun 		ref.skip_qgroup = true;
1361*4882a593Smuzhiyun 		btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
1362*4882a593Smuzhiyun 		ret = btrfs_inc_extent_ref(trans, &ref);
1363*4882a593Smuzhiyun 		BUG_ON(ret);
1364*4882a593Smuzhiyun 		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
1365*4882a593Smuzhiyun 				       blocksize, 0);
1366*4882a593Smuzhiyun 		ref.skip_qgroup = true;
1367*4882a593Smuzhiyun 		btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
1368*4882a593Smuzhiyun 		ret = btrfs_inc_extent_ref(trans, &ref);
1369*4882a593Smuzhiyun 		BUG_ON(ret);
1370*4882a593Smuzhiyun 
1371*4882a593Smuzhiyun 		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
1372*4882a593Smuzhiyun 				       blocksize, path->nodes[level]->start);
1373*4882a593Smuzhiyun 		btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
1374*4882a593Smuzhiyun 		ref.skip_qgroup = true;
1375*4882a593Smuzhiyun 		ret = btrfs_free_extent(trans, &ref);
1376*4882a593Smuzhiyun 		BUG_ON(ret);
1377*4882a593Smuzhiyun 
1378*4882a593Smuzhiyun 		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
1379*4882a593Smuzhiyun 				       blocksize, 0);
1380*4882a593Smuzhiyun 		btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
1381*4882a593Smuzhiyun 		ref.skip_qgroup = true;
1382*4882a593Smuzhiyun 		ret = btrfs_free_extent(trans, &ref);
1383*4882a593Smuzhiyun 		BUG_ON(ret);
1384*4882a593Smuzhiyun 
1385*4882a593Smuzhiyun 		btrfs_unlock_up_safe(path, 0);
1386*4882a593Smuzhiyun 
1387*4882a593Smuzhiyun 		ret = level;
1388*4882a593Smuzhiyun 		break;
1389*4882a593Smuzhiyun 	}
1390*4882a593Smuzhiyun 	btrfs_tree_unlock(parent);
1391*4882a593Smuzhiyun 	free_extent_buffer(parent);
1392*4882a593Smuzhiyun 	return ret;
1393*4882a593Smuzhiyun }
1394*4882a593Smuzhiyun 
1395*4882a593Smuzhiyun /*
1396*4882a593Smuzhiyun  * helper to find next relocated block in reloc tree
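 *
 * (editor's note) returns 0 and updates *level when a slot whose pointer
 * generation is newer than the reloc root's last_snapshot is found (a
 * block that still needs merging); returns 1 once the whole tree has
 * been walked.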
1397*4882a593Smuzhiyun  */
1398*4882a593Smuzhiyun static noinline_for_stack
1399*4882a593Smuzhiyun int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1400*4882a593Smuzhiyun 		       int *level)
1401*4882a593Smuzhiyun {
1402*4882a593Smuzhiyun 	struct extent_buffer *eb;
1403*4882a593Smuzhiyun 	int i;
1404*4882a593Smuzhiyun 	u64 last_snapshot;
1405*4882a593Smuzhiyun 	u32 nritems;
1406*4882a593Smuzhiyun 
1407*4882a593Smuzhiyun 	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1408*4882a593Smuzhiyun 
1409*4882a593Smuzhiyun 	for (i = 0; i < *level; i++) {
1410*4882a593Smuzhiyun 		free_extent_buffer(path->nodes[i]);
1411*4882a593Smuzhiyun 		path->nodes[i] = NULL;
1412*4882a593Smuzhiyun 	}
1413*4882a593Smuzhiyun 
1414*4882a593Smuzhiyun 	for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
1415*4882a593Smuzhiyun 		eb = path->nodes[i];
1416*4882a593Smuzhiyun 		nritems = btrfs_header_nritems(eb);
1417*4882a593Smuzhiyun 		while (path->slots[i] + 1 < nritems) {
1418*4882a593Smuzhiyun 			path->slots[i]++;
1419*4882a593Smuzhiyun 			if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
1420*4882a593Smuzhiyun 			    last_snapshot)
1421*4882a593Smuzhiyun 				continue;
1422*4882a593Smuzhiyun 
1423*4882a593Smuzhiyun 			*level = i;
1424*4882a593Smuzhiyun 			return 0;
1425*4882a593Smuzhiyun 		}
1426*4882a593Smuzhiyun 		free_extent_buffer(path->nodes[i]);
1427*4882a593Smuzhiyun 		path->nodes[i] = NULL;
1428*4882a593Smuzhiyun 	}
1429*4882a593Smuzhiyun 	return 1;
1430*4882a593Smuzhiyun }
1431*4882a593Smuzhiyun 
1432*4882a593Smuzhiyun /*
1433*4882a593Smuzhiyun  * walk down reloc tree to find relocated block of lowest level
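 *
 * (editor's note) returns 0 with *level set to the level the caller
 * should continue from, 1 when the node at the starting level has no
 * more slots newer than last_snapshot, or a negative errno if reading
 * a tree block fails.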
1434*4882a593Smuzhiyun  */
1435*4882a593Smuzhiyun static noinline_for_stack
1436*4882a593Smuzhiyun int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1437*4882a593Smuzhiyun 			 int *level)
1438*4882a593Smuzhiyun {
1439*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = root->fs_info;
1440*4882a593Smuzhiyun 	struct extent_buffer *eb = NULL;
1441*4882a593Smuzhiyun 	int i;
1442*4882a593Smuzhiyun 	u64 bytenr;
1443*4882a593Smuzhiyun 	u64 ptr_gen = 0;
1444*4882a593Smuzhiyun 	u64 last_snapshot;
1445*4882a593Smuzhiyun 	u32 nritems;
1446*4882a593Smuzhiyun 
1447*4882a593Smuzhiyun 	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1448*4882a593Smuzhiyun 
1449*4882a593Smuzhiyun 	for (i = *level; i > 0; i--) {
1450*4882a593Smuzhiyun 		struct btrfs_key first_key;
1451*4882a593Smuzhiyun 
1452*4882a593Smuzhiyun 		eb = path->nodes[i];
1453*4882a593Smuzhiyun 		nritems = btrfs_header_nritems(eb);
1454*4882a593Smuzhiyun 		while (path->slots[i] < nritems) {
1455*4882a593Smuzhiyun 			ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
1456*4882a593Smuzhiyun 			if (ptr_gen > last_snapshot)
1457*4882a593Smuzhiyun 				break;
1458*4882a593Smuzhiyun 			path->slots[i]++;
1459*4882a593Smuzhiyun 		}
1460*4882a593Smuzhiyun 		if (path->slots[i] >= nritems) {
1461*4882a593Smuzhiyun 			if (i == *level)
1462*4882a593Smuzhiyun 				break;
1463*4882a593Smuzhiyun 			*level = i + 1;
1464*4882a593Smuzhiyun 			return 0;
1465*4882a593Smuzhiyun 		}
1466*4882a593Smuzhiyun 		if (i == 1) {
1467*4882a593Smuzhiyun 			*level = i;
1468*4882a593Smuzhiyun 			return 0;
1469*4882a593Smuzhiyun 		}
1470*4882a593Smuzhiyun 
1471*4882a593Smuzhiyun 		bytenr = btrfs_node_blockptr(eb, path->slots[i]);
1472*4882a593Smuzhiyun 		btrfs_node_key_to_cpu(eb, &first_key, path->slots[i]);
1473*4882a593Smuzhiyun 		eb = read_tree_block(fs_info, bytenr, ptr_gen, i - 1,
1474*4882a593Smuzhiyun 				     &first_key);
1475*4882a593Smuzhiyun 		if (IS_ERR(eb)) {
1476*4882a593Smuzhiyun 			return PTR_ERR(eb);
1477*4882a593Smuzhiyun 		} else if (!extent_buffer_uptodate(eb)) {
1478*4882a593Smuzhiyun 			free_extent_buffer(eb);
1479*4882a593Smuzhiyun 			return -EIO;
1480*4882a593Smuzhiyun 		}
1481*4882a593Smuzhiyun 		BUG_ON(btrfs_header_level(eb) != i - 1);
1482*4882a593Smuzhiyun 		path->nodes[i - 1] = eb;
1483*4882a593Smuzhiyun 		path->slots[i - 1] = 0;
1484*4882a593Smuzhiyun 	}
1485*4882a593Smuzhiyun 	return 1;
1486*4882a593Smuzhiyun }
1487*4882a593Smuzhiyun 
1488*4882a593Smuzhiyun /*
1489*4882a593Smuzhiyun  * invalidate extent cache for file extents whose key is in the range
1490*4882a593Smuzhiyun  * [min_key, max_key)
1491*4882a593Smuzhiyun  */
1492*4882a593Smuzhiyun static int invalidate_extent_cache(struct btrfs_root *root,
1493*4882a593Smuzhiyun 				   struct btrfs_key *min_key,
1494*4882a593Smuzhiyun 				   struct btrfs_key *max_key)
1495*4882a593Smuzhiyun {
1496*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = root->fs_info;
1497*4882a593Smuzhiyun 	struct inode *inode = NULL;
1498*4882a593Smuzhiyun 	u64 objectid;
1499*4882a593Smuzhiyun 	u64 start, end;
1500*4882a593Smuzhiyun 	u64 ino;
1501*4882a593Smuzhiyun 
1502*4882a593Smuzhiyun 	objectid = min_key->objectid;
1503*4882a593Smuzhiyun 	while (1) {
1504*4882a593Smuzhiyun 		cond_resched();
1505*4882a593Smuzhiyun 		iput(inode);
1506*4882a593Smuzhiyun 
1507*4882a593Smuzhiyun 		if (objectid > max_key->objectid)
1508*4882a593Smuzhiyun 			break;
1509*4882a593Smuzhiyun 
1510*4882a593Smuzhiyun 		inode = find_next_inode(root, objectid);
1511*4882a593Smuzhiyun 		if (!inode)
1512*4882a593Smuzhiyun 			break;
1513*4882a593Smuzhiyun 		ino = btrfs_ino(BTRFS_I(inode));
1514*4882a593Smuzhiyun 
1515*4882a593Smuzhiyun 		if (ino > max_key->objectid) {
1516*4882a593Smuzhiyun 			iput(inode);
1517*4882a593Smuzhiyun 			break;
1518*4882a593Smuzhiyun 		}
1519*4882a593Smuzhiyun 
1520*4882a593Smuzhiyun 		objectid = ino + 1;
1521*4882a593Smuzhiyun 		if (!S_ISREG(inode->i_mode))
1522*4882a593Smuzhiyun 			continue;
1523*4882a593Smuzhiyun 
1524*4882a593Smuzhiyun 		if (unlikely(min_key->objectid == ino)) {
1525*4882a593Smuzhiyun 			if (min_key->type > BTRFS_EXTENT_DATA_KEY)
1526*4882a593Smuzhiyun 				continue;
1527*4882a593Smuzhiyun 			if (min_key->type < BTRFS_EXTENT_DATA_KEY)
1528*4882a593Smuzhiyun 				start = 0;
1529*4882a593Smuzhiyun 			else {
1530*4882a593Smuzhiyun 				start = min_key->offset;
1531*4882a593Smuzhiyun 				WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize));
1532*4882a593Smuzhiyun 			}
1533*4882a593Smuzhiyun 		} else {
1534*4882a593Smuzhiyun 			start = 0;
1535*4882a593Smuzhiyun 		}
1536*4882a593Smuzhiyun 
1537*4882a593Smuzhiyun 		if (unlikely(max_key->objectid == ino)) {
1538*4882a593Smuzhiyun 			if (max_key->type < BTRFS_EXTENT_DATA_KEY)
1539*4882a593Smuzhiyun 				continue;
1540*4882a593Smuzhiyun 			if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
1541*4882a593Smuzhiyun 				end = (u64)-1;
1542*4882a593Smuzhiyun 			} else {
1543*4882a593Smuzhiyun 				if (max_key->offset == 0)
1544*4882a593Smuzhiyun 					continue;
1545*4882a593Smuzhiyun 				end = max_key->offset;
1546*4882a593Smuzhiyun 				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
1547*4882a593Smuzhiyun 				end--;
1548*4882a593Smuzhiyun 			}
1549*4882a593Smuzhiyun 		} else {
1550*4882a593Smuzhiyun 			end = (u64)-1;
1551*4882a593Smuzhiyun 		}
1552*4882a593Smuzhiyun 
1553*4882a593Smuzhiyun 		/* the lock_extent waits for readpage to complete */
1554*4882a593Smuzhiyun 		lock_extent(&BTRFS_I(inode)->io_tree, start, end);
1555*4882a593Smuzhiyun 		btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 1);
1556*4882a593Smuzhiyun 		unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
1557*4882a593Smuzhiyun 	}
1558*4882a593Smuzhiyun 	return 0;
1559*4882a593Smuzhiyun }
1560*4882a593Smuzhiyun 
1561*4882a593Smuzhiyun static int find_next_key(struct btrfs_path *path, int level,
1562*4882a593Smuzhiyun 			 struct btrfs_key *key)
1563*4882a593Smuzhiyun 
1564*4882a593Smuzhiyun {
1565*4882a593Smuzhiyun 	while (level < BTRFS_MAX_LEVEL) {
1566*4882a593Smuzhiyun 		if (!path->nodes[level])
1567*4882a593Smuzhiyun 			break;
1568*4882a593Smuzhiyun 		if (path->slots[level] + 1 <
1569*4882a593Smuzhiyun 		    btrfs_header_nritems(path->nodes[level])) {
1570*4882a593Smuzhiyun 			btrfs_node_key_to_cpu(path->nodes[level], key,
1571*4882a593Smuzhiyun 					      path->slots[level] + 1);
1572*4882a593Smuzhiyun 			return 0;
1573*4882a593Smuzhiyun 		}
1574*4882a593Smuzhiyun 		level++;
1575*4882a593Smuzhiyun 	}
1576*4882a593Smuzhiyun 	return 1;
1577*4882a593Smuzhiyun }
1578*4882a593Smuzhiyun 
1579*4882a593Smuzhiyun /*
1580*4882a593Smuzhiyun  * Insert current subvolume into reloc_control::dirty_subvol_roots
1581*4882a593Smuzhiyun  */
1582*4882a593Smuzhiyun static void insert_dirty_subvol(struct btrfs_trans_handle *trans,
1583*4882a593Smuzhiyun 				struct reloc_control *rc,
1584*4882a593Smuzhiyun 				struct btrfs_root *root)
1585*4882a593Smuzhiyun {
1586*4882a593Smuzhiyun 	struct btrfs_root *reloc_root = root->reloc_root;
1587*4882a593Smuzhiyun 	struct btrfs_root_item *reloc_root_item;
1588*4882a593Smuzhiyun 
1589*4882a593Smuzhiyun 	/* @root must be a subvolume tree root with a valid reloc tree */
1590*4882a593Smuzhiyun 	ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1591*4882a593Smuzhiyun 	ASSERT(reloc_root);
1592*4882a593Smuzhiyun 
1593*4882a593Smuzhiyun 	reloc_root_item = &reloc_root->root_item;
1594*4882a593Smuzhiyun 	memset(&reloc_root_item->drop_progress, 0,
1595*4882a593Smuzhiyun 		sizeof(reloc_root_item->drop_progress));
1596*4882a593Smuzhiyun 	reloc_root_item->drop_level = 0;
1597*4882a593Smuzhiyun 	btrfs_set_root_refs(reloc_root_item, 0);
1598*4882a593Smuzhiyun 	btrfs_update_reloc_root(trans, root);
1599*4882a593Smuzhiyun 
1600*4882a593Smuzhiyun 	if (list_empty(&root->reloc_dirty_list)) {
1601*4882a593Smuzhiyun 		btrfs_grab_root(root);
1602*4882a593Smuzhiyun 		list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
1603*4882a593Smuzhiyun 	}
1604*4882a593Smuzhiyun }
1605*4882a593Smuzhiyun 
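/*
 * (editor's summary) Drop every root queued on rc->dirty_subvol_roots:
 * for a merged subvolume the attached reloc root is detached and
 * dropped, for an orphan reloc tree the tree itself is dropped.  The
 * first error is remembered and returned while the remaining entries
 * are still processed.
 */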
1606*4882a593Smuzhiyun static int clean_dirty_subvols(struct reloc_control *rc)
1607*4882a593Smuzhiyun {
1608*4882a593Smuzhiyun 	struct btrfs_root *root;
1609*4882a593Smuzhiyun 	struct btrfs_root *next;
1610*4882a593Smuzhiyun 	int ret = 0;
1611*4882a593Smuzhiyun 	int ret2;
1612*4882a593Smuzhiyun 
1613*4882a593Smuzhiyun 	list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
1614*4882a593Smuzhiyun 				 reloc_dirty_list) {
1615*4882a593Smuzhiyun 		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1616*4882a593Smuzhiyun 			/* Merged subvolume, cleanup its reloc root */
1617*4882a593Smuzhiyun 			struct btrfs_root *reloc_root = root->reloc_root;
1618*4882a593Smuzhiyun 
1619*4882a593Smuzhiyun 			list_del_init(&root->reloc_dirty_list);
1620*4882a593Smuzhiyun 			root->reloc_root = NULL;
1621*4882a593Smuzhiyun 			/*
1622*4882a593Smuzhiyun 			 * Need barrier to ensure clear_bit() only happens after
1623*4882a593Smuzhiyun 			 * root->reloc_root = NULL. Pairs with have_reloc_root.
1624*4882a593Smuzhiyun 			 */
1625*4882a593Smuzhiyun 			smp_wmb();
1626*4882a593Smuzhiyun 			clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
1627*4882a593Smuzhiyun 			if (reloc_root) {
1628*4882a593Smuzhiyun 				/*
1629*4882a593Smuzhiyun 				 * btrfs_drop_snapshot drops our ref we hold for
1630*4882a593Smuzhiyun 				 * ->reloc_root.  If it fails however we must
1631*4882a593Smuzhiyun 				 * drop the ref ourselves.
1632*4882a593Smuzhiyun 				 */
1633*4882a593Smuzhiyun 				ret2 = btrfs_drop_snapshot(reloc_root, 0, 1);
1634*4882a593Smuzhiyun 				if (ret2 < 0) {
1635*4882a593Smuzhiyun 					btrfs_put_root(reloc_root);
1636*4882a593Smuzhiyun 					if (!ret)
1637*4882a593Smuzhiyun 						ret = ret2;
1638*4882a593Smuzhiyun 				}
1639*4882a593Smuzhiyun 			}
1640*4882a593Smuzhiyun 			btrfs_put_root(root);
1641*4882a593Smuzhiyun 		} else {
1642*4882a593Smuzhiyun 			/* Orphan reloc tree, just clean it up */
1643*4882a593Smuzhiyun 			ret2 = btrfs_drop_snapshot(root, 0, 1);
1644*4882a593Smuzhiyun 			if (ret2 < 0) {
1645*4882a593Smuzhiyun 				btrfs_put_root(root);
1646*4882a593Smuzhiyun 				if (!ret)
1647*4882a593Smuzhiyun 					ret = ret2;
1648*4882a593Smuzhiyun 			}
1649*4882a593Smuzhiyun 		}
1650*4882a593Smuzhiyun 	}
1651*4882a593Smuzhiyun 	return ret;
1652*4882a593Smuzhiyun }
1653*4882a593Smuzhiyun 
1654*4882a593Smuzhiyun /*
1655*4882a593Smuzhiyun  * merge the relocated tree blocks in the reloc tree with the
1656*4882a593Smuzhiyun  * corresponding fs tree.
1657*4882a593Smuzhiyun  */
1658*4882a593Smuzhiyun static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1659*4882a593Smuzhiyun 					       struct btrfs_root *root)
1660*4882a593Smuzhiyun {
1661*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
1662*4882a593Smuzhiyun 	struct btrfs_key key;
1663*4882a593Smuzhiyun 	struct btrfs_key next_key;
1664*4882a593Smuzhiyun 	struct btrfs_trans_handle *trans = NULL;
1665*4882a593Smuzhiyun 	struct btrfs_root *reloc_root;
1666*4882a593Smuzhiyun 	struct btrfs_root_item *root_item;
1667*4882a593Smuzhiyun 	struct btrfs_path *path;
1668*4882a593Smuzhiyun 	struct extent_buffer *leaf;
1669*4882a593Smuzhiyun 	int reserve_level;
1670*4882a593Smuzhiyun 	int level;
1671*4882a593Smuzhiyun 	int max_level;
1672*4882a593Smuzhiyun 	int replaced = 0;
1673*4882a593Smuzhiyun 	int ret;
1674*4882a593Smuzhiyun 	int err = 0;
1675*4882a593Smuzhiyun 	u32 min_reserved;
1676*4882a593Smuzhiyun 
1677*4882a593Smuzhiyun 	path = btrfs_alloc_path();
1678*4882a593Smuzhiyun 	if (!path)
1679*4882a593Smuzhiyun 		return -ENOMEM;
1680*4882a593Smuzhiyun 	path->reada = READA_FORWARD;
1681*4882a593Smuzhiyun 
1682*4882a593Smuzhiyun 	reloc_root = root->reloc_root;
1683*4882a593Smuzhiyun 	root_item = &reloc_root->root_item;
1684*4882a593Smuzhiyun 
1685*4882a593Smuzhiyun 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
1686*4882a593Smuzhiyun 		level = btrfs_root_level(root_item);
1687*4882a593Smuzhiyun 		atomic_inc(&reloc_root->node->refs);
1688*4882a593Smuzhiyun 		path->nodes[level] = reloc_root->node;
1689*4882a593Smuzhiyun 		path->slots[level] = 0;
1690*4882a593Smuzhiyun 	} else {
1691*4882a593Smuzhiyun 		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
1692*4882a593Smuzhiyun 
1693*4882a593Smuzhiyun 		level = root_item->drop_level;
1694*4882a593Smuzhiyun 		BUG_ON(level == 0);
1695*4882a593Smuzhiyun 		path->lowest_level = level;
1696*4882a593Smuzhiyun 		ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
1697*4882a593Smuzhiyun 		path->lowest_level = 0;
1698*4882a593Smuzhiyun 		if (ret < 0) {
1699*4882a593Smuzhiyun 			btrfs_free_path(path);
1700*4882a593Smuzhiyun 			return ret;
1701*4882a593Smuzhiyun 		}
1702*4882a593Smuzhiyun 
1703*4882a593Smuzhiyun 		btrfs_node_key_to_cpu(path->nodes[level], &next_key,
1704*4882a593Smuzhiyun 				      path->slots[level]);
1705*4882a593Smuzhiyun 		WARN_ON(memcmp(&key, &next_key, sizeof(key)));
1706*4882a593Smuzhiyun 
1707*4882a593Smuzhiyun 		btrfs_unlock_up_safe(path, 0);
1708*4882a593Smuzhiyun 	}
1709*4882a593Smuzhiyun 
1710*4882a593Smuzhiyun 	/*
1711*4882a593Smuzhiyun 	 * In merge_reloc_root(), we modify the upper level pointer to swap the
1712*4882a593Smuzhiyun 	 * tree blocks between reloc tree and subvolume tree.  Thus for tree
1713*4882a593Smuzhiyun 	 * block COW, we COW at most from level 1 to root level for each tree.
1714*4882a593Smuzhiyun 	 *
1715*4882a593Smuzhiyun 	 * Thus the needed metadata size is at most root_level * nodesize,
1716*4882a593Smuzhiyun 	 * and * 2 since we have two trees to COW.
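	 *
	 * Worked example (editor's sketch, hypothetical numbers): with a
	 * 16KiB nodesize and a reloc root at level 3, min_reserved below is
	 * 16KiB * 3 * 2 = 96KiB.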
1717*4882a593Smuzhiyun 	 */
1718*4882a593Smuzhiyun 	reserve_level = max_t(int, 1, btrfs_root_level(root_item));
1719*4882a593Smuzhiyun 	min_reserved = fs_info->nodesize * reserve_level * 2;
1720*4882a593Smuzhiyun 	memset(&next_key, 0, sizeof(next_key));
1721*4882a593Smuzhiyun 
1722*4882a593Smuzhiyun 	while (1) {
1723*4882a593Smuzhiyun 		ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
1724*4882a593Smuzhiyun 					     BTRFS_RESERVE_FLUSH_LIMIT);
1725*4882a593Smuzhiyun 		if (ret) {
1726*4882a593Smuzhiyun 			err = ret;
1727*4882a593Smuzhiyun 			goto out;
1728*4882a593Smuzhiyun 		}
1729*4882a593Smuzhiyun 		trans = btrfs_start_transaction(root, 0);
1730*4882a593Smuzhiyun 		if (IS_ERR(trans)) {
1731*4882a593Smuzhiyun 			err = PTR_ERR(trans);
1732*4882a593Smuzhiyun 			trans = NULL;
1733*4882a593Smuzhiyun 			goto out;
1734*4882a593Smuzhiyun 		}
1735*4882a593Smuzhiyun 
1736*4882a593Smuzhiyun 		/*
1737*4882a593Smuzhiyun 		 * At this point we no longer have a reloc_control, so we can't
1738*4882a593Smuzhiyun 		 * depend on btrfs_init_reloc_root to update our last_trans.
1739*4882a593Smuzhiyun 		 *
1740*4882a593Smuzhiyun 		 * But that's ok, we started the trans handle on our
1741*4882a593Smuzhiyun 		 * corresponding fs_root, which means it's been added to the
1742*4882a593Smuzhiyun 		 * dirty list.  At commit time we'll still call
1743*4882a593Smuzhiyun 		 * btrfs_update_reloc_root() and update our root item
1744*4882a593Smuzhiyun 		 * appropriately.
1745*4882a593Smuzhiyun 		 */
1746*4882a593Smuzhiyun 		reloc_root->last_trans = trans->transid;
1747*4882a593Smuzhiyun 		trans->block_rsv = rc->block_rsv;
1748*4882a593Smuzhiyun 
1749*4882a593Smuzhiyun 		replaced = 0;
1750*4882a593Smuzhiyun 		max_level = level;
1751*4882a593Smuzhiyun 
1752*4882a593Smuzhiyun 		ret = walk_down_reloc_tree(reloc_root, path, &level);
1753*4882a593Smuzhiyun 		if (ret < 0) {
1754*4882a593Smuzhiyun 			err = ret;
1755*4882a593Smuzhiyun 			goto out;
1756*4882a593Smuzhiyun 		}
1757*4882a593Smuzhiyun 		if (ret > 0)
1758*4882a593Smuzhiyun 			break;
1759*4882a593Smuzhiyun 
1760*4882a593Smuzhiyun 		if (!find_next_key(path, level, &key) &&
1761*4882a593Smuzhiyun 		    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
1762*4882a593Smuzhiyun 			ret = 0;
1763*4882a593Smuzhiyun 		} else {
1764*4882a593Smuzhiyun 			ret = replace_path(trans, rc, root, reloc_root, path,
1765*4882a593Smuzhiyun 					   &next_key, level, max_level);
1766*4882a593Smuzhiyun 		}
1767*4882a593Smuzhiyun 		if (ret < 0) {
1768*4882a593Smuzhiyun 			err = ret;
1769*4882a593Smuzhiyun 			goto out;
1770*4882a593Smuzhiyun 		}
1771*4882a593Smuzhiyun 
1772*4882a593Smuzhiyun 		if (ret > 0) {
1773*4882a593Smuzhiyun 			level = ret;
1774*4882a593Smuzhiyun 			btrfs_node_key_to_cpu(path->nodes[level], &key,
1775*4882a593Smuzhiyun 					      path->slots[level]);
1776*4882a593Smuzhiyun 			replaced = 1;
1777*4882a593Smuzhiyun 		}
1778*4882a593Smuzhiyun 
1779*4882a593Smuzhiyun 		ret = walk_up_reloc_tree(reloc_root, path, &level);
1780*4882a593Smuzhiyun 		if (ret > 0)
1781*4882a593Smuzhiyun 			break;
1782*4882a593Smuzhiyun 
1783*4882a593Smuzhiyun 		BUG_ON(level == 0);
1784*4882a593Smuzhiyun 		/*
1785*4882a593Smuzhiyun 		 * save the merging progress in the drop_progress.
1786*4882a593Smuzhiyun 		 * this is OK since root refs == 1 in this case.
1787*4882a593Smuzhiyun 		 */
1788*4882a593Smuzhiyun 		btrfs_node_key(path->nodes[level], &root_item->drop_progress,
1789*4882a593Smuzhiyun 			       path->slots[level]);
1790*4882a593Smuzhiyun 		root_item->drop_level = level;
1791*4882a593Smuzhiyun 
1792*4882a593Smuzhiyun 		btrfs_end_transaction_throttle(trans);
1793*4882a593Smuzhiyun 		trans = NULL;
1794*4882a593Smuzhiyun 
1795*4882a593Smuzhiyun 		btrfs_btree_balance_dirty(fs_info);
1796*4882a593Smuzhiyun 
1797*4882a593Smuzhiyun 		if (replaced && rc->stage == UPDATE_DATA_PTRS)
1798*4882a593Smuzhiyun 			invalidate_extent_cache(root, &key, &next_key);
1799*4882a593Smuzhiyun 	}
1800*4882a593Smuzhiyun 
1801*4882a593Smuzhiyun 	/*
1802*4882a593Smuzhiyun 	 * handle the case where only one block in the fs tree needs to be
1803*4882a593Smuzhiyun 	 * relocated and the block is the tree root.
1804*4882a593Smuzhiyun 	 */
1805*4882a593Smuzhiyun 	leaf = btrfs_lock_root_node(root);
1806*4882a593Smuzhiyun 	ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf,
1807*4882a593Smuzhiyun 			      BTRFS_NESTING_COW);
1808*4882a593Smuzhiyun 	btrfs_tree_unlock(leaf);
1809*4882a593Smuzhiyun 	free_extent_buffer(leaf);
1810*4882a593Smuzhiyun 	if (ret < 0)
1811*4882a593Smuzhiyun 		err = ret;
1812*4882a593Smuzhiyun out:
1813*4882a593Smuzhiyun 	btrfs_free_path(path);
1814*4882a593Smuzhiyun 
1815*4882a593Smuzhiyun 	if (err == 0)
1816*4882a593Smuzhiyun 		insert_dirty_subvol(trans, rc, root);
1817*4882a593Smuzhiyun 
1818*4882a593Smuzhiyun 	if (trans)
1819*4882a593Smuzhiyun 		btrfs_end_transaction_throttle(trans);
1820*4882a593Smuzhiyun 
1821*4882a593Smuzhiyun 	btrfs_btree_balance_dirty(fs_info);
1822*4882a593Smuzhiyun 
1823*4882a593Smuzhiyun 	if (replaced && rc->stage == UPDATE_DATA_PTRS)
1824*4882a593Smuzhiyun 		invalidate_extent_cache(root, &key, &next_key);
1825*4882a593Smuzhiyun 
1826*4882a593Smuzhiyun 	return err;
1827*4882a593Smuzhiyun }
1828*4882a593Smuzhiyun 
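/*
 * (editor's summary) Reserve space for the upcoming merge, set each
 * reloc root's refs to 1 so a later btrfs_recover_relocation() knows
 * merging must resume, and (on success) commit the transaction so the
 * updated reloc root items reach disk before merge_reloc_roots() runs.
 */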
1829*4882a593Smuzhiyun static noinline_for_stack
1830*4882a593Smuzhiyun int prepare_to_merge(struct reloc_control *rc, int err)
1831*4882a593Smuzhiyun {
1832*4882a593Smuzhiyun 	struct btrfs_root *root = rc->extent_root;
1833*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = root->fs_info;
1834*4882a593Smuzhiyun 	struct btrfs_root *reloc_root;
1835*4882a593Smuzhiyun 	struct btrfs_trans_handle *trans;
1836*4882a593Smuzhiyun 	LIST_HEAD(reloc_roots);
1837*4882a593Smuzhiyun 	u64 num_bytes = 0;
1838*4882a593Smuzhiyun 	int ret;
1839*4882a593Smuzhiyun 
1840*4882a593Smuzhiyun 	mutex_lock(&fs_info->reloc_mutex);
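	/*
	 * (editor's note) account for COWing one full-height path
	 * (BTRFS_MAX_LEVEL - 1 nodes) in both trees, plus twice
	 * rc->nodes_relocated to cover the blocks already relocated.
	 */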
1841*4882a593Smuzhiyun 	rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
1842*4882a593Smuzhiyun 	rc->merging_rsv_size += rc->nodes_relocated * 2;
1843*4882a593Smuzhiyun 	mutex_unlock(&fs_info->reloc_mutex);
1844*4882a593Smuzhiyun 
1845*4882a593Smuzhiyun again:
1846*4882a593Smuzhiyun 	if (!err) {
1847*4882a593Smuzhiyun 		num_bytes = rc->merging_rsv_size;
1848*4882a593Smuzhiyun 		ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
1849*4882a593Smuzhiyun 					  BTRFS_RESERVE_FLUSH_ALL);
1850*4882a593Smuzhiyun 		if (ret)
1851*4882a593Smuzhiyun 			err = ret;
1852*4882a593Smuzhiyun 	}
1853*4882a593Smuzhiyun 
1854*4882a593Smuzhiyun 	trans = btrfs_join_transaction(rc->extent_root);
1855*4882a593Smuzhiyun 	if (IS_ERR(trans)) {
1856*4882a593Smuzhiyun 		if (!err)
1857*4882a593Smuzhiyun 			btrfs_block_rsv_release(fs_info, rc->block_rsv,
1858*4882a593Smuzhiyun 						num_bytes, NULL);
1859*4882a593Smuzhiyun 		return PTR_ERR(trans);
1860*4882a593Smuzhiyun 	}
1861*4882a593Smuzhiyun 
1862*4882a593Smuzhiyun 	if (!err) {
1863*4882a593Smuzhiyun 		if (num_bytes != rc->merging_rsv_size) {
1864*4882a593Smuzhiyun 			btrfs_end_transaction(trans);
1865*4882a593Smuzhiyun 			btrfs_block_rsv_release(fs_info, rc->block_rsv,
1866*4882a593Smuzhiyun 						num_bytes, NULL);
1867*4882a593Smuzhiyun 			goto again;
1868*4882a593Smuzhiyun 		}
1869*4882a593Smuzhiyun 	}
1870*4882a593Smuzhiyun 
1871*4882a593Smuzhiyun 	rc->merge_reloc_tree = 1;
1872*4882a593Smuzhiyun 
1873*4882a593Smuzhiyun 	while (!list_empty(&rc->reloc_roots)) {
1874*4882a593Smuzhiyun 		reloc_root = list_entry(rc->reloc_roots.next,
1875*4882a593Smuzhiyun 					struct btrfs_root, root_list);
1876*4882a593Smuzhiyun 		list_del_init(&reloc_root->root_list);
1877*4882a593Smuzhiyun 
1878*4882a593Smuzhiyun 		root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
1879*4882a593Smuzhiyun 				false);
1880*4882a593Smuzhiyun 		BUG_ON(IS_ERR(root));
1881*4882a593Smuzhiyun 		BUG_ON(root->reloc_root != reloc_root);
1882*4882a593Smuzhiyun 
1883*4882a593Smuzhiyun 		/*
1884*4882a593Smuzhiyun 		 * set reference count to 1, so btrfs_recover_relocation
1885*4882a593Smuzhiyun 		 * knows it should resume merging
1886*4882a593Smuzhiyun 		 */
1887*4882a593Smuzhiyun 		if (!err)
1888*4882a593Smuzhiyun 			btrfs_set_root_refs(&reloc_root->root_item, 1);
1889*4882a593Smuzhiyun 		btrfs_update_reloc_root(trans, root);
1890*4882a593Smuzhiyun 
1891*4882a593Smuzhiyun 		list_add(&reloc_root->root_list, &reloc_roots);
1892*4882a593Smuzhiyun 		btrfs_put_root(root);
1893*4882a593Smuzhiyun 	}
1894*4882a593Smuzhiyun 
1895*4882a593Smuzhiyun 	list_splice(&reloc_roots, &rc->reloc_roots);
1896*4882a593Smuzhiyun 
1897*4882a593Smuzhiyun 	if (!err)
1898*4882a593Smuzhiyun 		btrfs_commit_transaction(trans);
1899*4882a593Smuzhiyun 	else
1900*4882a593Smuzhiyun 		btrfs_end_transaction(trans);
1901*4882a593Smuzhiyun 	return err;
1902*4882a593Smuzhiyun }
1903*4882a593Smuzhiyun 
1904*4882a593Smuzhiyun static noinline_for_stack
1905*4882a593Smuzhiyun void free_reloc_roots(struct list_head *list)
1906*4882a593Smuzhiyun {
1907*4882a593Smuzhiyun 	struct btrfs_root *reloc_root, *tmp;
1908*4882a593Smuzhiyun 
1909*4882a593Smuzhiyun 	list_for_each_entry_safe(reloc_root, tmp, list, root_list)
1910*4882a593Smuzhiyun 		__del_reloc_root(reloc_root);
1911*4882a593Smuzhiyun }
1912*4882a593Smuzhiyun 
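/*
 * (editor's summary) Splice the reloc roots off rc->reloc_roots under
 * reloc_mutex and merge each one into its subvolume tree.  Reloc roots
 * whose refs were left at 0 (the error path in prepare_to_merge()) are
 * detached and queued on dirty_subvol_roots for cleanup instead.
 */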
1913*4882a593Smuzhiyun static noinline_for_stack
1914*4882a593Smuzhiyun void merge_reloc_roots(struct reloc_control *rc)
1915*4882a593Smuzhiyun {
1916*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
1917*4882a593Smuzhiyun 	struct btrfs_root *root;
1918*4882a593Smuzhiyun 	struct btrfs_root *reloc_root;
1919*4882a593Smuzhiyun 	LIST_HEAD(reloc_roots);
1920*4882a593Smuzhiyun 	int found = 0;
1921*4882a593Smuzhiyun 	int ret = 0;
1922*4882a593Smuzhiyun again:
1923*4882a593Smuzhiyun 	root = rc->extent_root;
1924*4882a593Smuzhiyun 
1925*4882a593Smuzhiyun 	/*
1926*4882a593Smuzhiyun 	 * this serializes us with btrfs_record_root_in_transaction;
1927*4882a593Smuzhiyun 	 * we have to make sure nobody is in the middle of
1928*4882a593Smuzhiyun 	 * adding their roots to the list while we are
1929*4882a593Smuzhiyun 	 * doing this splice
1930*4882a593Smuzhiyun 	 */
1931*4882a593Smuzhiyun 	mutex_lock(&fs_info->reloc_mutex);
1932*4882a593Smuzhiyun 	list_splice_init(&rc->reloc_roots, &reloc_roots);
1933*4882a593Smuzhiyun 	mutex_unlock(&fs_info->reloc_mutex);
1934*4882a593Smuzhiyun 
1935*4882a593Smuzhiyun 	while (!list_empty(&reloc_roots)) {
1936*4882a593Smuzhiyun 		found = 1;
1937*4882a593Smuzhiyun 		reloc_root = list_entry(reloc_roots.next,
1938*4882a593Smuzhiyun 					struct btrfs_root, root_list);
1939*4882a593Smuzhiyun 
1940*4882a593Smuzhiyun 		root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
1941*4882a593Smuzhiyun 					 false);
1942*4882a593Smuzhiyun 		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
1943*4882a593Smuzhiyun 			BUG_ON(IS_ERR(root));
1944*4882a593Smuzhiyun 			BUG_ON(root->reloc_root != reloc_root);
1945*4882a593Smuzhiyun 			ret = merge_reloc_root(rc, root);
1946*4882a593Smuzhiyun 			btrfs_put_root(root);
1947*4882a593Smuzhiyun 			if (ret) {
1948*4882a593Smuzhiyun 				if (list_empty(&reloc_root->root_list))
1949*4882a593Smuzhiyun 					list_add_tail(&reloc_root->root_list,
1950*4882a593Smuzhiyun 						      &reloc_roots);
1951*4882a593Smuzhiyun 				goto out;
1952*4882a593Smuzhiyun 			}
1953*4882a593Smuzhiyun 		} else {
1954*4882a593Smuzhiyun 			if (!IS_ERR(root)) {
1955*4882a593Smuzhiyun 				if (root->reloc_root == reloc_root) {
1956*4882a593Smuzhiyun 					root->reloc_root = NULL;
1957*4882a593Smuzhiyun 					btrfs_put_root(reloc_root);
1958*4882a593Smuzhiyun 				}
1959*4882a593Smuzhiyun 				clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE,
1960*4882a593Smuzhiyun 					  &root->state);
1961*4882a593Smuzhiyun 				btrfs_put_root(root);
1962*4882a593Smuzhiyun 			}
1963*4882a593Smuzhiyun 
1964*4882a593Smuzhiyun 			list_del_init(&reloc_root->root_list);
1965*4882a593Smuzhiyun 			/* Don't forget to queue this reloc root for cleanup */
1966*4882a593Smuzhiyun 			list_add_tail(&reloc_root->reloc_dirty_list,
1967*4882a593Smuzhiyun 				      &rc->dirty_subvol_roots);
1968*4882a593Smuzhiyun 		}
1969*4882a593Smuzhiyun 	}
1970*4882a593Smuzhiyun 
1971*4882a593Smuzhiyun 	if (found) {
1972*4882a593Smuzhiyun 		found = 0;
1973*4882a593Smuzhiyun 		goto again;
1974*4882a593Smuzhiyun 	}
1975*4882a593Smuzhiyun out:
1976*4882a593Smuzhiyun 	if (ret) {
1977*4882a593Smuzhiyun 		btrfs_handle_fs_error(fs_info, ret, NULL);
1978*4882a593Smuzhiyun 		free_reloc_roots(&reloc_roots);
1979*4882a593Smuzhiyun 
1980*4882a593Smuzhiyun 		/* new reloc root may be added */
1981*4882a593Smuzhiyun 		mutex_lock(&fs_info->reloc_mutex);
1982*4882a593Smuzhiyun 		list_splice_init(&rc->reloc_roots, &reloc_roots);
1983*4882a593Smuzhiyun 		mutex_unlock(&fs_info->reloc_mutex);
1984*4882a593Smuzhiyun 		free_reloc_roots(&reloc_roots);
1985*4882a593Smuzhiyun 	}
1986*4882a593Smuzhiyun 
1987*4882a593Smuzhiyun 	/*
1988*4882a593Smuzhiyun 	 * We used to have
1989*4882a593Smuzhiyun 	 *
1990*4882a593Smuzhiyun 	 * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
1991*4882a593Smuzhiyun 	 *
1992*4882a593Smuzhiyun 	 * here, but it's wrong.  If we fail to start the transaction in
1993*4882a593Smuzhiyun 	 * prepare_to_merge() we will have only 0 ref reloc roots, none of which
1994*4882a593Smuzhiyun 	 * have actually been removed from the reloc_root_tree rb tree.  This is
1995*4882a593Smuzhiyun 	 * fine because we're bailing here, and we hold a reference on the root
1996*4882a593Smuzhiyun 	 * for the list that holds it, so these roots will be cleaned up when we
1997*4882a593Smuzhiyun 	 * do the reloc_dirty_list afterwards.  Meanwhile the root->reloc_root
1998*4882a593Smuzhiyun 	 * will be cleaned up on unmount.
1999*4882a593Smuzhiyun 	 *
2000*4882a593Smuzhiyun 	 * The remaining nodes will be cleaned up by free_reloc_control.
2001*4882a593Smuzhiyun 	 */
2002*4882a593Smuzhiyun }
2003*4882a593Smuzhiyun 
2004*4882a593Smuzhiyun static void free_block_list(struct rb_root *blocks)
2005*4882a593Smuzhiyun {
2006*4882a593Smuzhiyun 	struct tree_block *block;
2007*4882a593Smuzhiyun 	struct rb_node *rb_node;
2008*4882a593Smuzhiyun 	while ((rb_node = rb_first(blocks))) {
2009*4882a593Smuzhiyun 		block = rb_entry(rb_node, struct tree_block, rb_node);
2010*4882a593Smuzhiyun 		rb_erase(rb_node, blocks);
2011*4882a593Smuzhiyun 		kfree(block);
2012*4882a593Smuzhiyun 	}
2013*4882a593Smuzhiyun }
2014*4882a593Smuzhiyun 
2015*4882a593Smuzhiyun static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
2016*4882a593Smuzhiyun 				      struct btrfs_root *reloc_root)
2017*4882a593Smuzhiyun {
2018*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = reloc_root->fs_info;
2019*4882a593Smuzhiyun 	struct btrfs_root *root;
2020*4882a593Smuzhiyun 	int ret;
2021*4882a593Smuzhiyun 
2022*4882a593Smuzhiyun 	if (reloc_root->last_trans == trans->transid)
2023*4882a593Smuzhiyun 		return 0;
2024*4882a593Smuzhiyun 
2025*4882a593Smuzhiyun 	root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false);
2026*4882a593Smuzhiyun 	BUG_ON(IS_ERR(root));
2027*4882a593Smuzhiyun 	BUG_ON(root->reloc_root != reloc_root);
2028*4882a593Smuzhiyun 	ret = btrfs_record_root_in_trans(trans, root);
2029*4882a593Smuzhiyun 	btrfs_put_root(root);
2030*4882a593Smuzhiyun 
2031*4882a593Smuzhiyun 	return ret;
2032*4882a593Smuzhiyun }
2033*4882a593Smuzhiyun 
2034*4882a593Smuzhiyun static noinline_for_stack
2035*4882a593Smuzhiyun struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
2036*4882a593Smuzhiyun 				     struct reloc_control *rc,
2037*4882a593Smuzhiyun 				     struct btrfs_backref_node *node,
2038*4882a593Smuzhiyun 				     struct btrfs_backref_edge *edges[])
2039*4882a593Smuzhiyun {
2040*4882a593Smuzhiyun 	struct btrfs_backref_node *next;
2041*4882a593Smuzhiyun 	struct btrfs_root *root;
2042*4882a593Smuzhiyun 	int index = 0;
2043*4882a593Smuzhiyun 
2044*4882a593Smuzhiyun 	next = node;
2045*4882a593Smuzhiyun 	while (1) {
2046*4882a593Smuzhiyun 		cond_resched();
2047*4882a593Smuzhiyun 		next = walk_up_backref(next, edges, &index);
2048*4882a593Smuzhiyun 		root = next->root;
2049*4882a593Smuzhiyun 		BUG_ON(!root);
2050*4882a593Smuzhiyun 		BUG_ON(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state));
2051*4882a593Smuzhiyun 
2052*4882a593Smuzhiyun 		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
2053*4882a593Smuzhiyun 			record_reloc_root_in_trans(trans, root);
2054*4882a593Smuzhiyun 			break;
2055*4882a593Smuzhiyun 		}
2056*4882a593Smuzhiyun 
2057*4882a593Smuzhiyun 		btrfs_record_root_in_trans(trans, root);
2058*4882a593Smuzhiyun 		root = root->reloc_root;
2059*4882a593Smuzhiyun 
2060*4882a593Smuzhiyun 		if (next->new_bytenr != root->node->start) {
2061*4882a593Smuzhiyun 			BUG_ON(next->new_bytenr);
2062*4882a593Smuzhiyun 			BUG_ON(!list_empty(&next->list));
2063*4882a593Smuzhiyun 			next->new_bytenr = root->node->start;
2064*4882a593Smuzhiyun 			btrfs_put_root(next->root);
2065*4882a593Smuzhiyun 			next->root = btrfs_grab_root(root);
2066*4882a593Smuzhiyun 			ASSERT(next->root);
2067*4882a593Smuzhiyun 			list_add_tail(&next->list,
2068*4882a593Smuzhiyun 				      &rc->backref_cache.changed);
2069*4882a593Smuzhiyun 			mark_block_processed(rc, next);
2070*4882a593Smuzhiyun 			break;
2071*4882a593Smuzhiyun 		}
2072*4882a593Smuzhiyun 
2073*4882a593Smuzhiyun 		WARN_ON(1);
2074*4882a593Smuzhiyun 		root = NULL;
2075*4882a593Smuzhiyun 		next = walk_down_backref(edges, &index);
2076*4882a593Smuzhiyun 		if (!next || next->level <= node->level)
2077*4882a593Smuzhiyun 			break;
2078*4882a593Smuzhiyun 	}
2079*4882a593Smuzhiyun 	if (!root)
2080*4882a593Smuzhiyun 		return NULL;
2081*4882a593Smuzhiyun 
2082*4882a593Smuzhiyun 	next = node;
2083*4882a593Smuzhiyun 	/* setup backref node path for btrfs_reloc_cow_block */
2084*4882a593Smuzhiyun 	while (1) {
2085*4882a593Smuzhiyun 		rc->backref_cache.path[next->level] = next;
2086*4882a593Smuzhiyun 		if (--index < 0)
2087*4882a593Smuzhiyun 			break;
2088*4882a593Smuzhiyun 		next = edges[index]->node[UPPER];
2089*4882a593Smuzhiyun 	}
2090*4882a593Smuzhiyun 	return root;
2091*4882a593Smuzhiyun }
2092*4882a593Smuzhiyun 
2093*4882a593Smuzhiyun /*
2094*4882a593Smuzhiyun  * Select a tree root for relocation.
2095*4882a593Smuzhiyun  *
2096*4882a593Smuzhiyun  * Return NULL if the block is not shareable. We should use do_relocation() in
2097*4882a593Smuzhiyun  * this case.
2098*4882a593Smuzhiyun  *
2099*4882a593Smuzhiyun  * Return a tree root pointer if the block is shareable.
2100*4882a593Smuzhiyun  * Return -ENOENT if the block is the root of a reloc tree.
2101*4882a593Smuzhiyun  */
2102*4882a593Smuzhiyun static noinline_for_stack
2103*4882a593Smuzhiyun struct btrfs_root *select_one_root(struct btrfs_backref_node *node)
2104*4882a593Smuzhiyun {
2105*4882a593Smuzhiyun 	struct btrfs_backref_node *next;
2106*4882a593Smuzhiyun 	struct btrfs_root *root;
2107*4882a593Smuzhiyun 	struct btrfs_root *fs_root = NULL;
2108*4882a593Smuzhiyun 	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2109*4882a593Smuzhiyun 	int index = 0;
2110*4882a593Smuzhiyun 
2111*4882a593Smuzhiyun 	next = node;
2112*4882a593Smuzhiyun 	while (1) {
2113*4882a593Smuzhiyun 		cond_resched();
2114*4882a593Smuzhiyun 		next = walk_up_backref(next, edges, &index);
2115*4882a593Smuzhiyun 		root = next->root;
2116*4882a593Smuzhiyun 		BUG_ON(!root);
2117*4882a593Smuzhiyun 
2118*4882a593Smuzhiyun 		/* No other choice for non-shareable tree */
2119*4882a593Smuzhiyun 		if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
2120*4882a593Smuzhiyun 			return root;
2121*4882a593Smuzhiyun 
2122*4882a593Smuzhiyun 		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
2123*4882a593Smuzhiyun 			fs_root = root;
2124*4882a593Smuzhiyun 
2125*4882a593Smuzhiyun 		if (next != node)
2126*4882a593Smuzhiyun 			return NULL;
2127*4882a593Smuzhiyun 
2128*4882a593Smuzhiyun 		next = walk_down_backref(edges, &index);
2129*4882a593Smuzhiyun 		if (!next || next->level <= node->level)
2130*4882a593Smuzhiyun 			break;
2131*4882a593Smuzhiyun 	}
2132*4882a593Smuzhiyun 
2133*4882a593Smuzhiyun 	if (!fs_root)
2134*4882a593Smuzhiyun 		return ERR_PTR(-ENOENT);
2135*4882a593Smuzhiyun 	return fs_root;
2136*4882a593Smuzhiyun }
2137*4882a593Smuzhiyun 
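/*
 * (editor's summary, approximate) Estimate the metadata that touching
 * @node will dirty: one nodesize for each not-yet-processed node met
 * while walking the backref edges upwards from @node.
 */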
2138*4882a593Smuzhiyun static noinline_for_stack
2139*4882a593Smuzhiyun u64 calcu_metadata_size(struct reloc_control *rc,
2140*4882a593Smuzhiyun 			struct btrfs_backref_node *node, int reserve)
2141*4882a593Smuzhiyun {
2142*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2143*4882a593Smuzhiyun 	struct btrfs_backref_node *next = node;
2144*4882a593Smuzhiyun 	struct btrfs_backref_edge *edge;
2145*4882a593Smuzhiyun 	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2146*4882a593Smuzhiyun 	u64 num_bytes = 0;
2147*4882a593Smuzhiyun 	int index = 0;
2148*4882a593Smuzhiyun 
2149*4882a593Smuzhiyun 	BUG_ON(reserve && node->processed);
2150*4882a593Smuzhiyun 
2151*4882a593Smuzhiyun 	while (next) {
2152*4882a593Smuzhiyun 		cond_resched();
2153*4882a593Smuzhiyun 		while (1) {
2154*4882a593Smuzhiyun 			if (next->processed && (reserve || next != node))
2155*4882a593Smuzhiyun 				break;
2156*4882a593Smuzhiyun 
2157*4882a593Smuzhiyun 			num_bytes += fs_info->nodesize;
2158*4882a593Smuzhiyun 
2159*4882a593Smuzhiyun 			if (list_empty(&next->upper))
2160*4882a593Smuzhiyun 				break;
2161*4882a593Smuzhiyun 
2162*4882a593Smuzhiyun 			edge = list_entry(next->upper.next,
2163*4882a593Smuzhiyun 					struct btrfs_backref_edge, list[LOWER]);
2164*4882a593Smuzhiyun 			edges[index++] = edge;
2165*4882a593Smuzhiyun 			next = edge->node[UPPER];
2166*4882a593Smuzhiyun 		}
2167*4882a593Smuzhiyun 		next = walk_down_backref(edges, &index);
2168*4882a593Smuzhiyun 	}
2169*4882a593Smuzhiyun 	return num_bytes;
2170*4882a593Smuzhiyun }
2171*4882a593Smuzhiyun 
2172*4882a593Smuzhiyun static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2173*4882a593Smuzhiyun 				  struct reloc_control *rc,
2174*4882a593Smuzhiyun 				  struct btrfs_backref_node *node)
2175*4882a593Smuzhiyun {
2176*4882a593Smuzhiyun 	struct btrfs_root *root = rc->extent_root;
2177*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = root->fs_info;
2178*4882a593Smuzhiyun 	u64 num_bytes;
2179*4882a593Smuzhiyun 	int ret;
2180*4882a593Smuzhiyun 	u64 tmp;
2181*4882a593Smuzhiyun 
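	/*
	 * (editor's note) doubled because both the subvolume tree and the
	 * reloc tree will be COWed for this node.
	 */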
2182*4882a593Smuzhiyun 	num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2183*4882a593Smuzhiyun 
2184*4882a593Smuzhiyun 	trans->block_rsv = rc->block_rsv;
2185*4882a593Smuzhiyun 	rc->reserved_bytes += num_bytes;
2186*4882a593Smuzhiyun 
2187*4882a593Smuzhiyun 	/*
2188*4882a593Smuzhiyun 	 * We are under a transaction here so we can only do limited flushing.
2189*4882a593Smuzhiyun 	 * If we get an enospc just kick back -EAGAIN so we know to drop the
2190*4882a593Smuzhiyun 	 * transaction and try to refill when we can flush all the things.
2191*4882a593Smuzhiyun 	 */
2192*4882a593Smuzhiyun 	ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes,
2193*4882a593Smuzhiyun 				BTRFS_RESERVE_FLUSH_LIMIT);
2194*4882a593Smuzhiyun 	if (ret) {
2195*4882a593Smuzhiyun 		tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;
2196*4882a593Smuzhiyun 		while (tmp <= rc->reserved_bytes)
2197*4882a593Smuzhiyun 			tmp <<= 1;
2198*4882a593Smuzhiyun 		/*
2199*4882a593Smuzhiyun 		 * only one thread can access block_rsv at this point,
2200*4882a593Smuzhiyun 		 * so we don't need to hold a lock to protect block_rsv.
2201*4882a593Smuzhiyun 		 * we expand the reservation size here to allow enough
2202*4882a593Smuzhiyun 		 * space for relocation, and we will return early in the
2203*4882a593Smuzhiyun 		 * enospc case.
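		 *
		 * (editor's note) tmp ends up as the smallest power-of-two
		 * multiple of (nodesize * RELOCATION_RESERVED_NODES) that
		 * exceeds the bytes reserved so far, so the rsv target grows
		 * roughly geometrically as relocation proceeds.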
2204*4882a593Smuzhiyun 		 */
2205*4882a593Smuzhiyun 		rc->block_rsv->size = tmp + fs_info->nodesize *
2206*4882a593Smuzhiyun 				      RELOCATION_RESERVED_NODES;
2207*4882a593Smuzhiyun 		return -EAGAIN;
2208*4882a593Smuzhiyun 	}
2209*4882a593Smuzhiyun 
2210*4882a593Smuzhiyun 	return 0;
2211*4882a593Smuzhiyun }
2212*4882a593Smuzhiyun 
2213*4882a593Smuzhiyun /*
2214*4882a593Smuzhiyun  * relocate a tree block, and then update pointers in upper level
2215*4882a593Smuzhiyun  * blocks that reference the block to point to the new location.
2216*4882a593Smuzhiyun  *
2217*4882a593Smuzhiyun  * if called by link_to_upper, the block has already been relocated.
2218*4882a593Smuzhiyun  * in that case this function just updates pointers.
2219*4882a593Smuzhiyun  */
2220*4882a593Smuzhiyun static int do_relocation(struct btrfs_trans_handle *trans,
2221*4882a593Smuzhiyun 			 struct reloc_control *rc,
2222*4882a593Smuzhiyun 			 struct btrfs_backref_node *node,
2223*4882a593Smuzhiyun 			 struct btrfs_key *key,
2224*4882a593Smuzhiyun 			 struct btrfs_path *path, int lowest)
2225*4882a593Smuzhiyun {
2226*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2227*4882a593Smuzhiyun 	struct btrfs_backref_node *upper;
2228*4882a593Smuzhiyun 	struct btrfs_backref_edge *edge;
2229*4882a593Smuzhiyun 	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2230*4882a593Smuzhiyun 	struct btrfs_root *root;
2231*4882a593Smuzhiyun 	struct extent_buffer *eb;
2232*4882a593Smuzhiyun 	u32 blocksize;
2233*4882a593Smuzhiyun 	u64 bytenr;
2234*4882a593Smuzhiyun 	u64 generation;
2235*4882a593Smuzhiyun 	int slot;
2236*4882a593Smuzhiyun 	int ret;
2237*4882a593Smuzhiyun 	int err = 0;
2238*4882a593Smuzhiyun 
2239*4882a593Smuzhiyun 	BUG_ON(lowest && node->eb);
2240*4882a593Smuzhiyun 
2241*4882a593Smuzhiyun 	path->lowest_level = node->level + 1;
2242*4882a593Smuzhiyun 	rc->backref_cache.path[node->level] = node;
2243*4882a593Smuzhiyun 	list_for_each_entry(edge, &node->upper, list[LOWER]) {
2244*4882a593Smuzhiyun 		struct btrfs_key first_key;
2245*4882a593Smuzhiyun 		struct btrfs_ref ref = { 0 };
2246*4882a593Smuzhiyun 
2247*4882a593Smuzhiyun 		cond_resched();
2248*4882a593Smuzhiyun 
2249*4882a593Smuzhiyun 		upper = edge->node[UPPER];
2250*4882a593Smuzhiyun 		root = select_reloc_root(trans, rc, upper, edges);
2251*4882a593Smuzhiyun 		BUG_ON(!root);
2252*4882a593Smuzhiyun 
2253*4882a593Smuzhiyun 		if (upper->eb && !upper->locked) {
2254*4882a593Smuzhiyun 			if (!lowest) {
2255*4882a593Smuzhiyun 				ret = btrfs_bin_search(upper->eb, key, &slot);
2256*4882a593Smuzhiyun 				if (ret < 0) {
2257*4882a593Smuzhiyun 					err = ret;
2258*4882a593Smuzhiyun 					goto next;
2259*4882a593Smuzhiyun 				}
2260*4882a593Smuzhiyun 				BUG_ON(ret);
2261*4882a593Smuzhiyun 				bytenr = btrfs_node_blockptr(upper->eb, slot);
2262*4882a593Smuzhiyun 				if (node->eb->start == bytenr)
2263*4882a593Smuzhiyun 					goto next;
2264*4882a593Smuzhiyun 			}
2265*4882a593Smuzhiyun 			btrfs_backref_drop_node_buffer(upper);
2266*4882a593Smuzhiyun 		}
2267*4882a593Smuzhiyun 
2268*4882a593Smuzhiyun 		if (!upper->eb) {
2269*4882a593Smuzhiyun 			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2270*4882a593Smuzhiyun 			if (ret) {
2271*4882a593Smuzhiyun 				if (ret < 0)
2272*4882a593Smuzhiyun 					err = ret;
2273*4882a593Smuzhiyun 				else
2274*4882a593Smuzhiyun 					err = -ENOENT;
2275*4882a593Smuzhiyun 
2276*4882a593Smuzhiyun 				btrfs_release_path(path);
2277*4882a593Smuzhiyun 				break;
2278*4882a593Smuzhiyun 			}
2279*4882a593Smuzhiyun 
2280*4882a593Smuzhiyun 			if (!upper->eb) {
2281*4882a593Smuzhiyun 				upper->eb = path->nodes[upper->level];
2282*4882a593Smuzhiyun 				path->nodes[upper->level] = NULL;
2283*4882a593Smuzhiyun 			} else {
2284*4882a593Smuzhiyun 				BUG_ON(upper->eb != path->nodes[upper->level]);
2285*4882a593Smuzhiyun 			}
2286*4882a593Smuzhiyun 
2287*4882a593Smuzhiyun 			upper->locked = 1;
2288*4882a593Smuzhiyun 			path->locks[upper->level] = 0;
2289*4882a593Smuzhiyun 
2290*4882a593Smuzhiyun 			slot = path->slots[upper->level];
2291*4882a593Smuzhiyun 			btrfs_release_path(path);
2292*4882a593Smuzhiyun 		} else {
2293*4882a593Smuzhiyun 			ret = btrfs_bin_search(upper->eb, key, &slot);
2294*4882a593Smuzhiyun 			if (ret < 0) {
2295*4882a593Smuzhiyun 				err = ret;
2296*4882a593Smuzhiyun 				goto next;
2297*4882a593Smuzhiyun 			}
2298*4882a593Smuzhiyun 			BUG_ON(ret);
2299*4882a593Smuzhiyun 		}
2300*4882a593Smuzhiyun 
2301*4882a593Smuzhiyun 		bytenr = btrfs_node_blockptr(upper->eb, slot);
2302*4882a593Smuzhiyun 		if (lowest) {
2303*4882a593Smuzhiyun 			if (bytenr != node->bytenr) {
2304*4882a593Smuzhiyun 				btrfs_err(root->fs_info,
2305*4882a593Smuzhiyun 		"lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu",
2306*4882a593Smuzhiyun 					  bytenr, node->bytenr, slot,
2307*4882a593Smuzhiyun 					  upper->eb->start);
2308*4882a593Smuzhiyun 				err = -EIO;
2309*4882a593Smuzhiyun 				goto next;
2310*4882a593Smuzhiyun 			}
2311*4882a593Smuzhiyun 		} else {
2312*4882a593Smuzhiyun 			if (node->eb->start == bytenr)
2313*4882a593Smuzhiyun 				goto next;
2314*4882a593Smuzhiyun 		}
2315*4882a593Smuzhiyun 
2316*4882a593Smuzhiyun 		blocksize = root->fs_info->nodesize;
2317*4882a593Smuzhiyun 		generation = btrfs_node_ptr_generation(upper->eb, slot);
2318*4882a593Smuzhiyun 		btrfs_node_key_to_cpu(upper->eb, &first_key, slot);
2319*4882a593Smuzhiyun 		eb = read_tree_block(fs_info, bytenr, generation,
2320*4882a593Smuzhiyun 				     upper->level - 1, &first_key);
2321*4882a593Smuzhiyun 		if (IS_ERR(eb)) {
2322*4882a593Smuzhiyun 			err = PTR_ERR(eb);
2323*4882a593Smuzhiyun 			goto next;
2324*4882a593Smuzhiyun 		} else if (!extent_buffer_uptodate(eb)) {
2325*4882a593Smuzhiyun 			free_extent_buffer(eb);
2326*4882a593Smuzhiyun 			err = -EIO;
2327*4882a593Smuzhiyun 			goto next;
2328*4882a593Smuzhiyun 		}
2329*4882a593Smuzhiyun 		btrfs_tree_lock(eb);
2330*4882a593Smuzhiyun 		btrfs_set_lock_blocking_write(eb);
2331*4882a593Smuzhiyun 
2332*4882a593Smuzhiyun 		if (!node->eb) {
2333*4882a593Smuzhiyun 			ret = btrfs_cow_block(trans, root, eb, upper->eb,
2334*4882a593Smuzhiyun 					      slot, &eb, BTRFS_NESTING_COW);
2335*4882a593Smuzhiyun 			btrfs_tree_unlock(eb);
2336*4882a593Smuzhiyun 			free_extent_buffer(eb);
2337*4882a593Smuzhiyun 			if (ret < 0) {
2338*4882a593Smuzhiyun 				err = ret;
2339*4882a593Smuzhiyun 				goto next;
2340*4882a593Smuzhiyun 			}
2341*4882a593Smuzhiyun 			BUG_ON(node->eb != eb);
2342*4882a593Smuzhiyun 		} else {
2343*4882a593Smuzhiyun 			btrfs_set_node_blockptr(upper->eb, slot,
2344*4882a593Smuzhiyun 						node->eb->start);
2345*4882a593Smuzhiyun 			btrfs_set_node_ptr_generation(upper->eb, slot,
2346*4882a593Smuzhiyun 						      trans->transid);
2347*4882a593Smuzhiyun 			btrfs_mark_buffer_dirty(upper->eb);
2348*4882a593Smuzhiyun 
2349*4882a593Smuzhiyun 			btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
2350*4882a593Smuzhiyun 					       node->eb->start, blocksize,
2351*4882a593Smuzhiyun 					       upper->eb->start);
2352*4882a593Smuzhiyun 			ref.real_root = root->root_key.objectid;
2353*4882a593Smuzhiyun 			btrfs_init_tree_ref(&ref, node->level,
2354*4882a593Smuzhiyun 					    btrfs_header_owner(upper->eb));
2355*4882a593Smuzhiyun 			ret = btrfs_inc_extent_ref(trans, &ref);
2356*4882a593Smuzhiyun 			BUG_ON(ret);
2357*4882a593Smuzhiyun 
2358*4882a593Smuzhiyun 			ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
2359*4882a593Smuzhiyun 			BUG_ON(ret);
2360*4882a593Smuzhiyun 		}
2361*4882a593Smuzhiyun next:
2362*4882a593Smuzhiyun 		if (!upper->pending)
2363*4882a593Smuzhiyun 			btrfs_backref_drop_node_buffer(upper);
2364*4882a593Smuzhiyun 		else
2365*4882a593Smuzhiyun 			btrfs_backref_unlock_node_buffer(upper);
2366*4882a593Smuzhiyun 		if (err)
2367*4882a593Smuzhiyun 			break;
2368*4882a593Smuzhiyun 	}
2369*4882a593Smuzhiyun 
2370*4882a593Smuzhiyun 	if (!err && node->pending) {
2371*4882a593Smuzhiyun 		btrfs_backref_drop_node_buffer(node);
2372*4882a593Smuzhiyun 		list_move_tail(&node->list, &rc->backref_cache.changed);
2373*4882a593Smuzhiyun 		node->pending = 0;
2374*4882a593Smuzhiyun 	}
2375*4882a593Smuzhiyun 
2376*4882a593Smuzhiyun 	path->lowest_level = 0;
2377*4882a593Smuzhiyun 	BUG_ON(err == -ENOSPC);
2378*4882a593Smuzhiyun 	return err;
2379*4882a593Smuzhiyun }
2380*4882a593Smuzhiyun 
2381*4882a593Smuzhiyun static int link_to_upper(struct btrfs_trans_handle *trans,
2382*4882a593Smuzhiyun 			 struct reloc_control *rc,
2383*4882a593Smuzhiyun 			 struct btrfs_backref_node *node,
2384*4882a593Smuzhiyun 			 struct btrfs_path *path)
2385*4882a593Smuzhiyun {
2386*4882a593Smuzhiyun 	struct btrfs_key key;
2387*4882a593Smuzhiyun 
2388*4882a593Smuzhiyun 	btrfs_node_key_to_cpu(node->eb, &key, 0);
2389*4882a593Smuzhiyun 	return do_relocation(trans, rc, node, &key, path, 0);
2390*4882a593Smuzhiyun }
2391*4882a593Smuzhiyun 
2392*4882a593Smuzhiyun static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2393*4882a593Smuzhiyun 				struct reloc_control *rc,
2394*4882a593Smuzhiyun 				struct btrfs_path *path, int err)
2395*4882a593Smuzhiyun {
2396*4882a593Smuzhiyun 	LIST_HEAD(list);
2397*4882a593Smuzhiyun 	struct btrfs_backref_cache *cache = &rc->backref_cache;
2398*4882a593Smuzhiyun 	struct btrfs_backref_node *node;
2399*4882a593Smuzhiyun 	int level;
2400*4882a593Smuzhiyun 	int ret;
2401*4882a593Smuzhiyun 
2402*4882a593Smuzhiyun 	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2403*4882a593Smuzhiyun 		while (!list_empty(&cache->pending[level])) {
2404*4882a593Smuzhiyun 			node = list_entry(cache->pending[level].next,
2405*4882a593Smuzhiyun 					  struct btrfs_backref_node, list);
2406*4882a593Smuzhiyun 			list_move_tail(&node->list, &list);
2407*4882a593Smuzhiyun 			BUG_ON(!node->pending);
2408*4882a593Smuzhiyun 
2409*4882a593Smuzhiyun 			if (!err) {
2410*4882a593Smuzhiyun 				ret = link_to_upper(trans, rc, node, path);
2411*4882a593Smuzhiyun 				if (ret < 0)
2412*4882a593Smuzhiyun 					err = ret;
2413*4882a593Smuzhiyun 			}
2414*4882a593Smuzhiyun 		}
2415*4882a593Smuzhiyun 		list_splice_init(&list, &cache->pending[level]);
2416*4882a593Smuzhiyun 	}
2417*4882a593Smuzhiyun 	return err;
2418*4882a593Smuzhiyun }
2419*4882a593Smuzhiyun 
2420*4882a593Smuzhiyun /*
2421*4882a593Smuzhiyun  * mark a block and all blocks that directly/indirectly reference the block
2422*4882a593Smuzhiyun  * as processed.
2423*4882a593Smuzhiyun  */
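/*
 * The walk below follows the list[LOWER] edges upwards from @node, marking
 * every node it visits, and uses walk_down_backref() to backtrack and take
 * the next sibling edge, so all uppers reachable from @node get marked too.
 */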
2424*4882a593Smuzhiyun static void update_processed_blocks(struct reloc_control *rc,
2425*4882a593Smuzhiyun 				    struct btrfs_backref_node *node)
2426*4882a593Smuzhiyun {
2427*4882a593Smuzhiyun 	struct btrfs_backref_node *next = node;
2428*4882a593Smuzhiyun 	struct btrfs_backref_edge *edge;
2429*4882a593Smuzhiyun 	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2430*4882a593Smuzhiyun 	int index = 0;
2431*4882a593Smuzhiyun 
2432*4882a593Smuzhiyun 	while (next) {
2433*4882a593Smuzhiyun 		cond_resched();
2434*4882a593Smuzhiyun 		while (1) {
2435*4882a593Smuzhiyun 			if (next->processed)
2436*4882a593Smuzhiyun 				break;
2437*4882a593Smuzhiyun 
2438*4882a593Smuzhiyun 			mark_block_processed(rc, next);
2439*4882a593Smuzhiyun 
2440*4882a593Smuzhiyun 			if (list_empty(&next->upper))
2441*4882a593Smuzhiyun 				break;
2442*4882a593Smuzhiyun 
2443*4882a593Smuzhiyun 			edge = list_entry(next->upper.next,
2444*4882a593Smuzhiyun 					struct btrfs_backref_edge, list[LOWER]);
2445*4882a593Smuzhiyun 			edges[index++] = edge;
2446*4882a593Smuzhiyun 			next = edge->node[UPPER];
2447*4882a593Smuzhiyun 		}
2448*4882a593Smuzhiyun 		next = walk_down_backref(edges, &index);
2449*4882a593Smuzhiyun 	}
2450*4882a593Smuzhiyun }
2451*4882a593Smuzhiyun 
2452*4882a593Smuzhiyun static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
2453*4882a593Smuzhiyun {
2454*4882a593Smuzhiyun 	u32 blocksize = rc->extent_root->fs_info->nodesize;
2455*4882a593Smuzhiyun 
2456*4882a593Smuzhiyun 	if (test_range_bit(&rc->processed_blocks, bytenr,
2457*4882a593Smuzhiyun 			   bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
2458*4882a593Smuzhiyun 		return 1;
2459*4882a593Smuzhiyun 	return 0;
2460*4882a593Smuzhiyun }
2461*4882a593Smuzhiyun 
2462*4882a593Smuzhiyun static int get_tree_block_key(struct btrfs_fs_info *fs_info,
2463*4882a593Smuzhiyun 			      struct tree_block *block)
2464*4882a593Smuzhiyun {
2465*4882a593Smuzhiyun 	struct extent_buffer *eb;
2466*4882a593Smuzhiyun 
2467*4882a593Smuzhiyun 	eb = read_tree_block(fs_info, block->bytenr, block->key.offset,
2468*4882a593Smuzhiyun 			     block->level, NULL);
2469*4882a593Smuzhiyun 	if (IS_ERR(eb)) {
2470*4882a593Smuzhiyun 		return PTR_ERR(eb);
2471*4882a593Smuzhiyun 	} else if (!extent_buffer_uptodate(eb)) {
2472*4882a593Smuzhiyun 		free_extent_buffer(eb);
2473*4882a593Smuzhiyun 		return -EIO;
2474*4882a593Smuzhiyun 	}
2475*4882a593Smuzhiyun 	if (block->level == 0)
2476*4882a593Smuzhiyun 		btrfs_item_key_to_cpu(eb, &block->key, 0);
2477*4882a593Smuzhiyun 	else
2478*4882a593Smuzhiyun 		btrfs_node_key_to_cpu(eb, &block->key, 0);
2479*4882a593Smuzhiyun 	free_extent_buffer(eb);
2480*4882a593Smuzhiyun 	block->key_ready = 1;
2481*4882a593Smuzhiyun 	return 0;
2482*4882a593Smuzhiyun }
2483*4882a593Smuzhiyun 
2484*4882a593Smuzhiyun /*
2485*4882a593Smuzhiyun  * helper function to relocate a tree block
2486*4882a593Smuzhiyun  */
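/*
 * In short: for a shareable root the corresponding reloc tree is set up
 * via btrfs_record_root_in_trans() and its root node is recorded as the
 * block's new location; for a non-shareable (COW-only) root the block is
 * relocated by simply COWing the path down to it with btrfs_search_slot();
 * when select_one_root() finds no single root, do_relocation() handles the
 * block as the lowest node.
 */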
2487*4882a593Smuzhiyun static int relocate_tree_block(struct btrfs_trans_handle *trans,
2488*4882a593Smuzhiyun 				struct reloc_control *rc,
2489*4882a593Smuzhiyun 				struct btrfs_backref_node *node,
2490*4882a593Smuzhiyun 				struct btrfs_key *key,
2491*4882a593Smuzhiyun 				struct btrfs_path *path)
2492*4882a593Smuzhiyun {
2493*4882a593Smuzhiyun 	struct btrfs_root *root;
2494*4882a593Smuzhiyun 	int ret = 0;
2495*4882a593Smuzhiyun 
2496*4882a593Smuzhiyun 	if (!node)
2497*4882a593Smuzhiyun 		return 0;
2498*4882a593Smuzhiyun 
2499*4882a593Smuzhiyun 	/*
2500*4882a593Smuzhiyun 	 * If we fail here we want to drop our backref_node because we are going
2501*4882a593Smuzhiyun 	 * to start over and regenerate the tree for it.
2502*4882a593Smuzhiyun 	 */
2503*4882a593Smuzhiyun 	ret = reserve_metadata_space(trans, rc, node);
2504*4882a593Smuzhiyun 	if (ret)
2505*4882a593Smuzhiyun 		goto out;
2506*4882a593Smuzhiyun 
2507*4882a593Smuzhiyun 	BUG_ON(node->processed);
2508*4882a593Smuzhiyun 	root = select_one_root(node);
2509*4882a593Smuzhiyun 	if (root == ERR_PTR(-ENOENT)) {
2510*4882a593Smuzhiyun 		update_processed_blocks(rc, node);
2511*4882a593Smuzhiyun 		goto out;
2512*4882a593Smuzhiyun 	}
2513*4882a593Smuzhiyun 
2514*4882a593Smuzhiyun 	if (root) {
2515*4882a593Smuzhiyun 		if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
2516*4882a593Smuzhiyun 			BUG_ON(node->new_bytenr);
2517*4882a593Smuzhiyun 			BUG_ON(!list_empty(&node->list));
2518*4882a593Smuzhiyun 			btrfs_record_root_in_trans(trans, root);
2519*4882a593Smuzhiyun 			root = root->reloc_root;
2520*4882a593Smuzhiyun 			node->new_bytenr = root->node->start;
2521*4882a593Smuzhiyun 			btrfs_put_root(node->root);
2522*4882a593Smuzhiyun 			node->root = btrfs_grab_root(root);
2523*4882a593Smuzhiyun 			ASSERT(node->root);
2524*4882a593Smuzhiyun 			list_add_tail(&node->list, &rc->backref_cache.changed);
2525*4882a593Smuzhiyun 		} else {
2526*4882a593Smuzhiyun 			path->lowest_level = node->level;
2527*4882a593Smuzhiyun 			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2528*4882a593Smuzhiyun 			btrfs_release_path(path);
2529*4882a593Smuzhiyun 			if (ret > 0)
2530*4882a593Smuzhiyun 				ret = 0;
2531*4882a593Smuzhiyun 		}
2532*4882a593Smuzhiyun 		if (!ret)
2533*4882a593Smuzhiyun 			update_processed_blocks(rc, node);
2534*4882a593Smuzhiyun 	} else {
2535*4882a593Smuzhiyun 		ret = do_relocation(trans, rc, node, key, path, 1);
2536*4882a593Smuzhiyun 	}
2537*4882a593Smuzhiyun out:
2538*4882a593Smuzhiyun 	if (ret || node->level == 0 || node->cowonly)
2539*4882a593Smuzhiyun 		btrfs_backref_cleanup_node(&rc->backref_cache, node);
2540*4882a593Smuzhiyun 	return ret;
2541*4882a593Smuzhiyun }
2542*4882a593Smuzhiyun 
2543*4882a593Smuzhiyun /*
2544*4882a593Smuzhiyun  * relocate a list of blocks
2545*4882a593Smuzhiyun  */
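/*
 * The rb-tree of blocks is walked three times below: once to start
 * readahead for blocks whose first key is still unknown, once to read
 * those keys, and once to build the backref tree for each block and
 * relocate it.  Upper level nodes left pending are flushed at the end by
 * finish_pending_nodes().
 */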
2546*4882a593Smuzhiyun static noinline_for_stack
2547*4882a593Smuzhiyun int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2548*4882a593Smuzhiyun 			 struct reloc_control *rc, struct rb_root *blocks)
2549*4882a593Smuzhiyun {
2550*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2551*4882a593Smuzhiyun 	struct btrfs_backref_node *node;
2552*4882a593Smuzhiyun 	struct btrfs_path *path;
2553*4882a593Smuzhiyun 	struct tree_block *block;
2554*4882a593Smuzhiyun 	struct tree_block *next;
2555*4882a593Smuzhiyun 	int ret;
2556*4882a593Smuzhiyun 	int err = 0;
2557*4882a593Smuzhiyun 
2558*4882a593Smuzhiyun 	path = btrfs_alloc_path();
2559*4882a593Smuzhiyun 	if (!path) {
2560*4882a593Smuzhiyun 		err = -ENOMEM;
2561*4882a593Smuzhiyun 		goto out_free_blocks;
2562*4882a593Smuzhiyun 	}
2563*4882a593Smuzhiyun 
2564*4882a593Smuzhiyun 	/* Kick in readahead for tree blocks with missing keys */
2565*4882a593Smuzhiyun 	rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
2566*4882a593Smuzhiyun 		if (!block->key_ready)
2567*4882a593Smuzhiyun 			readahead_tree_block(fs_info, block->bytenr);
2568*4882a593Smuzhiyun 	}
2569*4882a593Smuzhiyun 
2570*4882a593Smuzhiyun 	/* Get first keys */
2571*4882a593Smuzhiyun 	rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
2572*4882a593Smuzhiyun 		if (!block->key_ready) {
2573*4882a593Smuzhiyun 			err = get_tree_block_key(fs_info, block);
2574*4882a593Smuzhiyun 			if (err)
2575*4882a593Smuzhiyun 				goto out_free_path;
2576*4882a593Smuzhiyun 		}
2577*4882a593Smuzhiyun 	}
2578*4882a593Smuzhiyun 
2579*4882a593Smuzhiyun 	/* Do tree relocation */
2580*4882a593Smuzhiyun 	rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
2581*4882a593Smuzhiyun 		node = build_backref_tree(rc, &block->key,
2582*4882a593Smuzhiyun 					  block->level, block->bytenr);
2583*4882a593Smuzhiyun 		if (IS_ERR(node)) {
2584*4882a593Smuzhiyun 			err = PTR_ERR(node);
2585*4882a593Smuzhiyun 			goto out;
2586*4882a593Smuzhiyun 		}
2587*4882a593Smuzhiyun 
2588*4882a593Smuzhiyun 		ret = relocate_tree_block(trans, rc, node, &block->key,
2589*4882a593Smuzhiyun 					  path);
2590*4882a593Smuzhiyun 		if (ret < 0) {
2591*4882a593Smuzhiyun 			err = ret;
2592*4882a593Smuzhiyun 			break;
2593*4882a593Smuzhiyun 		}
2594*4882a593Smuzhiyun 	}
2595*4882a593Smuzhiyun out:
2596*4882a593Smuzhiyun 	err = finish_pending_nodes(trans, rc, path, err);
2597*4882a593Smuzhiyun 
2598*4882a593Smuzhiyun out_free_path:
2599*4882a593Smuzhiyun 	btrfs_free_path(path);
2600*4882a593Smuzhiyun out_free_blocks:
2601*4882a593Smuzhiyun 	free_block_list(blocks);
2602*4882a593Smuzhiyun 	return err;
2603*4882a593Smuzhiyun }
2604*4882a593Smuzhiyun 
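/*
 * Reserve data space for the whole cluster, then preallocate one file
 * range per cluster boundary in the data reloc inode (cluster byte numbers
 * are turned into file offsets by subtracting inode->index_cnt).  Whatever
 * was not preallocated has its data reservation released at the end.
 */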
2605*4882a593Smuzhiyun static noinline_for_stack int prealloc_file_extent_cluster(
2606*4882a593Smuzhiyun 				struct btrfs_inode *inode,
2607*4882a593Smuzhiyun 				struct file_extent_cluster *cluster)
2608*4882a593Smuzhiyun {
2609*4882a593Smuzhiyun 	u64 alloc_hint = 0;
2610*4882a593Smuzhiyun 	u64 start;
2611*4882a593Smuzhiyun 	u64 end;
2612*4882a593Smuzhiyun 	u64 offset = inode->index_cnt;
2613*4882a593Smuzhiyun 	u64 num_bytes;
2614*4882a593Smuzhiyun 	int nr;
2615*4882a593Smuzhiyun 	int ret = 0;
2616*4882a593Smuzhiyun 	u64 prealloc_start = cluster->start - offset;
2617*4882a593Smuzhiyun 	u64 prealloc_end = cluster->end - offset;
2618*4882a593Smuzhiyun 	u64 cur_offset = prealloc_start;
2619*4882a593Smuzhiyun 
2620*4882a593Smuzhiyun 	BUG_ON(cluster->start != cluster->boundary[0]);
2621*4882a593Smuzhiyun 	ret = btrfs_alloc_data_chunk_ondemand(inode,
2622*4882a593Smuzhiyun 					      prealloc_end + 1 - prealloc_start);
2623*4882a593Smuzhiyun 	if (ret)
2624*4882a593Smuzhiyun 		return ret;
2625*4882a593Smuzhiyun 
2626*4882a593Smuzhiyun 	inode_lock(&inode->vfs_inode);
2627*4882a593Smuzhiyun 	for (nr = 0; nr < cluster->nr; nr++) {
2628*4882a593Smuzhiyun 		start = cluster->boundary[nr] - offset;
2629*4882a593Smuzhiyun 		if (nr + 1 < cluster->nr)
2630*4882a593Smuzhiyun 			end = cluster->boundary[nr + 1] - 1 - offset;
2631*4882a593Smuzhiyun 		else
2632*4882a593Smuzhiyun 			end = cluster->end - offset;
2633*4882a593Smuzhiyun 
2634*4882a593Smuzhiyun 		lock_extent(&inode->io_tree, start, end);
2635*4882a593Smuzhiyun 		num_bytes = end + 1 - start;
2636*4882a593Smuzhiyun 		ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start,
2637*4882a593Smuzhiyun 						num_bytes, num_bytes,
2638*4882a593Smuzhiyun 						end + 1, &alloc_hint);
2639*4882a593Smuzhiyun 		cur_offset = end + 1;
2640*4882a593Smuzhiyun 		unlock_extent(&inode->io_tree, start, end);
2641*4882a593Smuzhiyun 		if (ret)
2642*4882a593Smuzhiyun 			break;
2643*4882a593Smuzhiyun 	}
2644*4882a593Smuzhiyun 	inode_unlock(&inode->vfs_inode);
2645*4882a593Smuzhiyun 
2646*4882a593Smuzhiyun 	if (cur_offset < prealloc_end)
2647*4882a593Smuzhiyun 		btrfs_free_reserved_data_space_noquota(inode->root->fs_info,
2648*4882a593Smuzhiyun 					       prealloc_end + 1 - cur_offset);
2649*4882a593Smuzhiyun 	return ret;
2650*4882a593Smuzhiyun }
2651*4882a593Smuzhiyun 
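/*
 * Insert a pinned extent map for [start, end] pointing at @block_start
 * (the old disk location), dropping whatever was cached for that range, so
 * the page reads in relocate_file_extent_cluster() are serviced from the
 * existing data.
 */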
2652*4882a593Smuzhiyun static noinline_for_stack
2653*4882a593Smuzhiyun int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
2654*4882a593Smuzhiyun 			 u64 block_start)
2655*4882a593Smuzhiyun {
2656*4882a593Smuzhiyun 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2657*4882a593Smuzhiyun 	struct extent_map *em;
2658*4882a593Smuzhiyun 	int ret = 0;
2659*4882a593Smuzhiyun 
2660*4882a593Smuzhiyun 	em = alloc_extent_map();
2661*4882a593Smuzhiyun 	if (!em)
2662*4882a593Smuzhiyun 		return -ENOMEM;
2663*4882a593Smuzhiyun 
2664*4882a593Smuzhiyun 	em->start = start;
2665*4882a593Smuzhiyun 	em->len = end + 1 - start;
2666*4882a593Smuzhiyun 	em->block_len = em->len;
2667*4882a593Smuzhiyun 	em->block_start = block_start;
2668*4882a593Smuzhiyun 	set_bit(EXTENT_FLAG_PINNED, &em->flags);
2669*4882a593Smuzhiyun 
2670*4882a593Smuzhiyun 	lock_extent(&BTRFS_I(inode)->io_tree, start, end);
2671*4882a593Smuzhiyun 	while (1) {
2672*4882a593Smuzhiyun 		write_lock(&em_tree->lock);
2673*4882a593Smuzhiyun 		ret = add_extent_mapping(em_tree, em, 0);
2674*4882a593Smuzhiyun 		write_unlock(&em_tree->lock);
2675*4882a593Smuzhiyun 		if (ret != -EEXIST) {
2676*4882a593Smuzhiyun 			free_extent_map(em);
2677*4882a593Smuzhiyun 			break;
2678*4882a593Smuzhiyun 		}
2679*4882a593Smuzhiyun 		btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
2680*4882a593Smuzhiyun 	}
2681*4882a593Smuzhiyun 	unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
2682*4882a593Smuzhiyun 	return ret;
2683*4882a593Smuzhiyun }
2684*4882a593Smuzhiyun 
2685*4882a593Smuzhiyun /*
2686*4882a593Smuzhiyun  * Allow error injection to test balance cancellation
2687*4882a593Smuzhiyun  */
2688*4882a593Smuzhiyun int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info)
2689*4882a593Smuzhiyun {
2690*4882a593Smuzhiyun 	return atomic_read(&fs_info->balance_cancel_req) ||
2691*4882a593Smuzhiyun 		fatal_signal_pending(current);
2692*4882a593Smuzhiyun }
2693*4882a593Smuzhiyun ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE);
2694*4882a593Smuzhiyun 
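/*
 * Copy a cluster of data extents through the page cache of the data reloc
 * inode: preallocate the target range, pin an extent map pointing at the
 * old location, then read each page, mark it delalloc (with
 * EXTENT_BOUNDARY set at every cluster boundary) and dirty it so that
 * writeback writes the data into the preallocated space.
 */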
2695*4882a593Smuzhiyun static int relocate_file_extent_cluster(struct inode *inode,
2696*4882a593Smuzhiyun 					struct file_extent_cluster *cluster)
2697*4882a593Smuzhiyun {
2698*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2699*4882a593Smuzhiyun 	u64 page_start;
2700*4882a593Smuzhiyun 	u64 page_end;
2701*4882a593Smuzhiyun 	u64 offset = BTRFS_I(inode)->index_cnt;
2702*4882a593Smuzhiyun 	unsigned long index;
2703*4882a593Smuzhiyun 	unsigned long last_index;
2704*4882a593Smuzhiyun 	struct page *page;
2705*4882a593Smuzhiyun 	struct file_ra_state *ra;
2706*4882a593Smuzhiyun 	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
2707*4882a593Smuzhiyun 	int nr = 0;
2708*4882a593Smuzhiyun 	int ret = 0;
2709*4882a593Smuzhiyun 
2710*4882a593Smuzhiyun 	if (!cluster->nr)
2711*4882a593Smuzhiyun 		return 0;
2712*4882a593Smuzhiyun 
2713*4882a593Smuzhiyun 	ra = kzalloc(sizeof(*ra), GFP_NOFS);
2714*4882a593Smuzhiyun 	if (!ra)
2715*4882a593Smuzhiyun 		return -ENOMEM;
2716*4882a593Smuzhiyun 
2717*4882a593Smuzhiyun 	ret = prealloc_file_extent_cluster(BTRFS_I(inode), cluster);
2718*4882a593Smuzhiyun 	if (ret)
2719*4882a593Smuzhiyun 		goto out;
2720*4882a593Smuzhiyun 
2721*4882a593Smuzhiyun 	file_ra_state_init(ra, inode->i_mapping);
2722*4882a593Smuzhiyun 
2723*4882a593Smuzhiyun 	ret = setup_extent_mapping(inode, cluster->start - offset,
2724*4882a593Smuzhiyun 				   cluster->end - offset, cluster->start);
2725*4882a593Smuzhiyun 	if (ret)
2726*4882a593Smuzhiyun 		goto out;
2727*4882a593Smuzhiyun 
2728*4882a593Smuzhiyun 	index = (cluster->start - offset) >> PAGE_SHIFT;
2729*4882a593Smuzhiyun 	last_index = (cluster->end - offset) >> PAGE_SHIFT;
2730*4882a593Smuzhiyun 	while (index <= last_index) {
2731*4882a593Smuzhiyun 		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
2732*4882a593Smuzhiyun 				PAGE_SIZE);
2733*4882a593Smuzhiyun 		if (ret)
2734*4882a593Smuzhiyun 			goto out;
2735*4882a593Smuzhiyun 
2736*4882a593Smuzhiyun 		page = find_lock_page(inode->i_mapping, index);
2737*4882a593Smuzhiyun 		if (!page) {
2738*4882a593Smuzhiyun 			page_cache_sync_readahead(inode->i_mapping,
2739*4882a593Smuzhiyun 						  ra, NULL, index,
2740*4882a593Smuzhiyun 						  last_index + 1 - index);
2741*4882a593Smuzhiyun 			page = find_or_create_page(inode->i_mapping, index,
2742*4882a593Smuzhiyun 						   mask);
2743*4882a593Smuzhiyun 			if (!page) {
2744*4882a593Smuzhiyun 				btrfs_delalloc_release_metadata(BTRFS_I(inode),
2745*4882a593Smuzhiyun 							PAGE_SIZE, true);
2746*4882a593Smuzhiyun 				btrfs_delalloc_release_extents(BTRFS_I(inode),
2747*4882a593Smuzhiyun 							PAGE_SIZE);
2748*4882a593Smuzhiyun 				ret = -ENOMEM;
2749*4882a593Smuzhiyun 				goto out;
2750*4882a593Smuzhiyun 			}
2751*4882a593Smuzhiyun 		}
2752*4882a593Smuzhiyun 
2753*4882a593Smuzhiyun 		if (PageReadahead(page)) {
2754*4882a593Smuzhiyun 			page_cache_async_readahead(inode->i_mapping,
2755*4882a593Smuzhiyun 						   ra, NULL, page, index,
2756*4882a593Smuzhiyun 						   last_index + 1 - index);
2757*4882a593Smuzhiyun 		}
2758*4882a593Smuzhiyun 
2759*4882a593Smuzhiyun 		if (!PageUptodate(page)) {
2760*4882a593Smuzhiyun 			btrfs_readpage(NULL, page);
2761*4882a593Smuzhiyun 			lock_page(page);
2762*4882a593Smuzhiyun 			if (!PageUptodate(page)) {
2763*4882a593Smuzhiyun 				unlock_page(page);
2764*4882a593Smuzhiyun 				put_page(page);
2765*4882a593Smuzhiyun 				btrfs_delalloc_release_metadata(BTRFS_I(inode),
2766*4882a593Smuzhiyun 							PAGE_SIZE, true);
2767*4882a593Smuzhiyun 				btrfs_delalloc_release_extents(BTRFS_I(inode),
2768*4882a593Smuzhiyun 							       PAGE_SIZE);
2769*4882a593Smuzhiyun 				ret = -EIO;
2770*4882a593Smuzhiyun 				goto out;
2771*4882a593Smuzhiyun 			}
2772*4882a593Smuzhiyun 		}
2773*4882a593Smuzhiyun 
2774*4882a593Smuzhiyun 		page_start = page_offset(page);
2775*4882a593Smuzhiyun 		page_end = page_start + PAGE_SIZE - 1;
2776*4882a593Smuzhiyun 
2777*4882a593Smuzhiyun 		lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
2778*4882a593Smuzhiyun 
2779*4882a593Smuzhiyun 		set_page_extent_mapped(page);
2780*4882a593Smuzhiyun 
2781*4882a593Smuzhiyun 		if (nr < cluster->nr &&
2782*4882a593Smuzhiyun 		    page_start + offset == cluster->boundary[nr]) {
2783*4882a593Smuzhiyun 			set_extent_bits(&BTRFS_I(inode)->io_tree,
2784*4882a593Smuzhiyun 					page_start, page_end,
2785*4882a593Smuzhiyun 					EXTENT_BOUNDARY);
2786*4882a593Smuzhiyun 			nr++;
2787*4882a593Smuzhiyun 		}
2788*4882a593Smuzhiyun 
2789*4882a593Smuzhiyun 		ret = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start,
2790*4882a593Smuzhiyun 						page_end, 0, NULL);
2791*4882a593Smuzhiyun 		if (ret) {
2792*4882a593Smuzhiyun 			unlock_page(page);
2793*4882a593Smuzhiyun 			put_page(page);
2794*4882a593Smuzhiyun 			btrfs_delalloc_release_metadata(BTRFS_I(inode),
2795*4882a593Smuzhiyun 							 PAGE_SIZE, true);
2796*4882a593Smuzhiyun 			btrfs_delalloc_release_extents(BTRFS_I(inode),
2797*4882a593Smuzhiyun 			                               PAGE_SIZE);
2798*4882a593Smuzhiyun 
2799*4882a593Smuzhiyun 			clear_extent_bits(&BTRFS_I(inode)->io_tree,
2800*4882a593Smuzhiyun 					  page_start, page_end,
2801*4882a593Smuzhiyun 					  EXTENT_LOCKED | EXTENT_BOUNDARY);
2802*4882a593Smuzhiyun 			goto out;
2803*4882a593Smuzhiyun 
2804*4882a593Smuzhiyun 		}
2805*4882a593Smuzhiyun 		set_page_dirty(page);
2806*4882a593Smuzhiyun 
2807*4882a593Smuzhiyun 		unlock_extent(&BTRFS_I(inode)->io_tree,
2808*4882a593Smuzhiyun 			      page_start, page_end);
2809*4882a593Smuzhiyun 		unlock_page(page);
2810*4882a593Smuzhiyun 		put_page(page);
2811*4882a593Smuzhiyun 
2812*4882a593Smuzhiyun 		index++;
2813*4882a593Smuzhiyun 		btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
2814*4882a593Smuzhiyun 		balance_dirty_pages_ratelimited(inode->i_mapping);
2815*4882a593Smuzhiyun 		btrfs_throttle(fs_info);
2816*4882a593Smuzhiyun 		if (btrfs_should_cancel_balance(fs_info)) {
2817*4882a593Smuzhiyun 			ret = -ECANCELED;
2818*4882a593Smuzhiyun 			goto out;
2819*4882a593Smuzhiyun 		}
2820*4882a593Smuzhiyun 	}
2821*4882a593Smuzhiyun 	WARN_ON(nr != cluster->nr);
2822*4882a593Smuzhiyun out:
2823*4882a593Smuzhiyun 	kfree(ra);
2824*4882a593Smuzhiyun 	return ret;
2825*4882a593Smuzhiyun }
2826*4882a593Smuzhiyun 
2827*4882a593Smuzhiyun static noinline_for_stack
2828*4882a593Smuzhiyun int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
2829*4882a593Smuzhiyun 			 struct file_extent_cluster *cluster)
2830*4882a593Smuzhiyun {
2831*4882a593Smuzhiyun 	int ret;
2832*4882a593Smuzhiyun 
2833*4882a593Smuzhiyun 	if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
2834*4882a593Smuzhiyun 		ret = relocate_file_extent_cluster(inode, cluster);
2835*4882a593Smuzhiyun 		if (ret)
2836*4882a593Smuzhiyun 			return ret;
2837*4882a593Smuzhiyun 		cluster->nr = 0;
2838*4882a593Smuzhiyun 	}
2839*4882a593Smuzhiyun 
2840*4882a593Smuzhiyun 	if (!cluster->nr)
2841*4882a593Smuzhiyun 		cluster->start = extent_key->objectid;
2842*4882a593Smuzhiyun 	else
2843*4882a593Smuzhiyun 		BUG_ON(cluster->nr >= MAX_EXTENTS);
2844*4882a593Smuzhiyun 	cluster->end = extent_key->objectid + extent_key->offset - 1;
2845*4882a593Smuzhiyun 	cluster->boundary[cluster->nr] = extent_key->objectid;
2846*4882a593Smuzhiyun 	cluster->nr++;
2847*4882a593Smuzhiyun 
2848*4882a593Smuzhiyun 	if (cluster->nr >= MAX_EXTENTS) {
2849*4882a593Smuzhiyun 		ret = relocate_file_extent_cluster(inode, cluster);
2850*4882a593Smuzhiyun 		if (ret)
2851*4882a593Smuzhiyun 			return ret;
2852*4882a593Smuzhiyun 		cluster->nr = 0;
2853*4882a593Smuzhiyun 	}
2854*4882a593Smuzhiyun 	return 0;
2855*4882a593Smuzhiyun }
2856*4882a593Smuzhiyun 
2857*4882a593Smuzhiyun /*
2858*4882a593Smuzhiyun  * helper to add a tree block to the list.
2859*4882a593Smuzhiyun  * the major work is getting the generation and level of the block
2860*4882a593Smuzhiyun  */
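/*
 * Note that block->key is used as scratch space here: key.objectid holds
 * the nodesize and key.offset the generation until get_tree_block_key()
 * reads the block and fills in the real first key (key_ready == 1).
 */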
2861*4882a593Smuzhiyun static int add_tree_block(struct reloc_control *rc,
2862*4882a593Smuzhiyun 			  struct btrfs_key *extent_key,
2863*4882a593Smuzhiyun 			  struct btrfs_path *path,
2864*4882a593Smuzhiyun 			  struct rb_root *blocks)
2865*4882a593Smuzhiyun {
2866*4882a593Smuzhiyun 	struct extent_buffer *eb;
2867*4882a593Smuzhiyun 	struct btrfs_extent_item *ei;
2868*4882a593Smuzhiyun 	struct btrfs_tree_block_info *bi;
2869*4882a593Smuzhiyun 	struct tree_block *block;
2870*4882a593Smuzhiyun 	struct rb_node *rb_node;
2871*4882a593Smuzhiyun 	u32 item_size;
2872*4882a593Smuzhiyun 	int level = -1;
2873*4882a593Smuzhiyun 	u64 generation;
2874*4882a593Smuzhiyun 
2875*4882a593Smuzhiyun 	eb = path->nodes[0];
2876*4882a593Smuzhiyun 	item_size = btrfs_item_size_nr(eb, path->slots[0]);
2877*4882a593Smuzhiyun 
2878*4882a593Smuzhiyun 	if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
2879*4882a593Smuzhiyun 	    item_size >= sizeof(*ei) + sizeof(*bi)) {
2880*4882a593Smuzhiyun 		ei = btrfs_item_ptr(eb, path->slots[0],
2881*4882a593Smuzhiyun 				struct btrfs_extent_item);
2882*4882a593Smuzhiyun 		if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
2883*4882a593Smuzhiyun 			bi = (struct btrfs_tree_block_info *)(ei + 1);
2884*4882a593Smuzhiyun 			level = btrfs_tree_block_level(eb, bi);
2885*4882a593Smuzhiyun 		} else {
2886*4882a593Smuzhiyun 			level = (int)extent_key->offset;
2887*4882a593Smuzhiyun 		}
2888*4882a593Smuzhiyun 		generation = btrfs_extent_generation(eb, ei);
2889*4882a593Smuzhiyun 	} else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) {
2890*4882a593Smuzhiyun 		btrfs_print_v0_err(eb->fs_info);
2891*4882a593Smuzhiyun 		btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL);
2892*4882a593Smuzhiyun 		return -EINVAL;
2893*4882a593Smuzhiyun 	} else {
2894*4882a593Smuzhiyun 		BUG();
2895*4882a593Smuzhiyun 	}
2896*4882a593Smuzhiyun 
2897*4882a593Smuzhiyun 	btrfs_release_path(path);
2898*4882a593Smuzhiyun 
2899*4882a593Smuzhiyun 	BUG_ON(level == -1);
2900*4882a593Smuzhiyun 
2901*4882a593Smuzhiyun 	block = kmalloc(sizeof(*block), GFP_NOFS);
2902*4882a593Smuzhiyun 	if (!block)
2903*4882a593Smuzhiyun 		return -ENOMEM;
2904*4882a593Smuzhiyun 
2905*4882a593Smuzhiyun 	block->bytenr = extent_key->objectid;
2906*4882a593Smuzhiyun 	block->key.objectid = rc->extent_root->fs_info->nodesize;
2907*4882a593Smuzhiyun 	block->key.offset = generation;
2908*4882a593Smuzhiyun 	block->level = level;
2909*4882a593Smuzhiyun 	block->key_ready = 0;
2910*4882a593Smuzhiyun 
2911*4882a593Smuzhiyun 	rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node);
2912*4882a593Smuzhiyun 	if (rb_node)
2913*4882a593Smuzhiyun 		btrfs_backref_panic(rc->extent_root->fs_info, block->bytenr,
2914*4882a593Smuzhiyun 				    -EEXIST);
2915*4882a593Smuzhiyun 
2916*4882a593Smuzhiyun 	return 0;
2917*4882a593Smuzhiyun }
2918*4882a593Smuzhiyun 
2919*4882a593Smuzhiyun /*
2920*4882a593Smuzhiyun  * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
2921*4882a593Smuzhiyun  */
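/*
 * The lookup below first tries the skinny BTRFS_METADATA_ITEM_KEY form and
 * falls back to a full BTRFS_EXTENT_ITEM_KEY search if the skinny item is
 * not found, so it copes with both extent tree item layouts.
 */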
2922*4882a593Smuzhiyun static int __add_tree_block(struct reloc_control *rc,
2923*4882a593Smuzhiyun 			    u64 bytenr, u32 blocksize,
2924*4882a593Smuzhiyun 			    struct rb_root *blocks)
2925*4882a593Smuzhiyun {
2926*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2927*4882a593Smuzhiyun 	struct btrfs_path *path;
2928*4882a593Smuzhiyun 	struct btrfs_key key;
2929*4882a593Smuzhiyun 	int ret;
2930*4882a593Smuzhiyun 	bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
2931*4882a593Smuzhiyun 
2932*4882a593Smuzhiyun 	if (tree_block_processed(bytenr, rc))
2933*4882a593Smuzhiyun 		return 0;
2934*4882a593Smuzhiyun 
2935*4882a593Smuzhiyun 	if (rb_simple_search(blocks, bytenr))
2936*4882a593Smuzhiyun 		return 0;
2937*4882a593Smuzhiyun 
2938*4882a593Smuzhiyun 	path = btrfs_alloc_path();
2939*4882a593Smuzhiyun 	if (!path)
2940*4882a593Smuzhiyun 		return -ENOMEM;
2941*4882a593Smuzhiyun again:
2942*4882a593Smuzhiyun 	key.objectid = bytenr;
2943*4882a593Smuzhiyun 	if (skinny) {
2944*4882a593Smuzhiyun 		key.type = BTRFS_METADATA_ITEM_KEY;
2945*4882a593Smuzhiyun 		key.offset = (u64)-1;
2946*4882a593Smuzhiyun 	} else {
2947*4882a593Smuzhiyun 		key.type = BTRFS_EXTENT_ITEM_KEY;
2948*4882a593Smuzhiyun 		key.offset = blocksize;
2949*4882a593Smuzhiyun 	}
2950*4882a593Smuzhiyun 
2951*4882a593Smuzhiyun 	path->search_commit_root = 1;
2952*4882a593Smuzhiyun 	path->skip_locking = 1;
2953*4882a593Smuzhiyun 	ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
2954*4882a593Smuzhiyun 	if (ret < 0)
2955*4882a593Smuzhiyun 		goto out;
2956*4882a593Smuzhiyun 
2957*4882a593Smuzhiyun 	if (ret > 0 && skinny) {
2958*4882a593Smuzhiyun 		if (path->slots[0]) {
2959*4882a593Smuzhiyun 			path->slots[0]--;
2960*4882a593Smuzhiyun 			btrfs_item_key_to_cpu(path->nodes[0], &key,
2961*4882a593Smuzhiyun 					      path->slots[0]);
2962*4882a593Smuzhiyun 			if (key.objectid == bytenr &&
2963*4882a593Smuzhiyun 			    (key.type == BTRFS_METADATA_ITEM_KEY ||
2964*4882a593Smuzhiyun 			     (key.type == BTRFS_EXTENT_ITEM_KEY &&
2965*4882a593Smuzhiyun 			      key.offset == blocksize)))
2966*4882a593Smuzhiyun 				ret = 0;
2967*4882a593Smuzhiyun 		}
2968*4882a593Smuzhiyun 
2969*4882a593Smuzhiyun 		if (ret) {
2970*4882a593Smuzhiyun 			skinny = false;
2971*4882a593Smuzhiyun 			btrfs_release_path(path);
2972*4882a593Smuzhiyun 			goto again;
2973*4882a593Smuzhiyun 		}
2974*4882a593Smuzhiyun 	}
2975*4882a593Smuzhiyun 	if (ret) {
2976*4882a593Smuzhiyun 		ASSERT(ret == 1);
2977*4882a593Smuzhiyun 		btrfs_print_leaf(path->nodes[0]);
2978*4882a593Smuzhiyun 		btrfs_err(fs_info,
2979*4882a593Smuzhiyun 	     "tree block extent item (%llu) is not found in extent tree",
2980*4882a593Smuzhiyun 		     bytenr);
2981*4882a593Smuzhiyun 		WARN_ON(1);
2982*4882a593Smuzhiyun 		ret = -EINVAL;
2983*4882a593Smuzhiyun 		goto out;
2984*4882a593Smuzhiyun 	}
2985*4882a593Smuzhiyun 
2986*4882a593Smuzhiyun 	ret = add_tree_block(rc, &key, path, blocks);
2987*4882a593Smuzhiyun out:
2988*4882a593Smuzhiyun 	btrfs_free_path(path);
2989*4882a593Smuzhiyun 	return ret;
2990*4882a593Smuzhiyun }
2991*4882a593Smuzhiyun 
2992*4882a593Smuzhiyun static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
2993*4882a593Smuzhiyun 				    struct btrfs_block_group *block_group,
2994*4882a593Smuzhiyun 				    struct inode *inode,
2995*4882a593Smuzhiyun 				    u64 ino)
2996*4882a593Smuzhiyun {
2997*4882a593Smuzhiyun 	struct btrfs_root *root = fs_info->tree_root;
2998*4882a593Smuzhiyun 	struct btrfs_trans_handle *trans;
2999*4882a593Smuzhiyun 	int ret = 0;
3000*4882a593Smuzhiyun 
3001*4882a593Smuzhiyun 	if (inode)
3002*4882a593Smuzhiyun 		goto truncate;
3003*4882a593Smuzhiyun 
3004*4882a593Smuzhiyun 	inode = btrfs_iget(fs_info->sb, ino, root);
3005*4882a593Smuzhiyun 	if (IS_ERR(inode))
3006*4882a593Smuzhiyun 		return -ENOENT;
3007*4882a593Smuzhiyun 
3008*4882a593Smuzhiyun truncate:
3009*4882a593Smuzhiyun 	ret = btrfs_check_trunc_cache_free_space(fs_info,
3010*4882a593Smuzhiyun 						 &fs_info->global_block_rsv);
3011*4882a593Smuzhiyun 	if (ret)
3012*4882a593Smuzhiyun 		goto out;
3013*4882a593Smuzhiyun 
3014*4882a593Smuzhiyun 	trans = btrfs_join_transaction(root);
3015*4882a593Smuzhiyun 	if (IS_ERR(trans)) {
3016*4882a593Smuzhiyun 		ret = PTR_ERR(trans);
3017*4882a593Smuzhiyun 		goto out;
3018*4882a593Smuzhiyun 	}
3019*4882a593Smuzhiyun 
3020*4882a593Smuzhiyun 	ret = btrfs_truncate_free_space_cache(trans, block_group, inode);
3021*4882a593Smuzhiyun 
3022*4882a593Smuzhiyun 	btrfs_end_transaction(trans);
3023*4882a593Smuzhiyun 	btrfs_btree_balance_dirty(fs_info);
3024*4882a593Smuzhiyun out:
3025*4882a593Smuzhiyun 	iput(inode);
3026*4882a593Smuzhiyun 	return ret;
3027*4882a593Smuzhiyun }
3028*4882a593Smuzhiyun 
3029*4882a593Smuzhiyun /*
3030*4882a593Smuzhiyun  * Locate the free space cache EXTENT_DATA in root tree leaf and delete the
3031*4882a593Smuzhiyun  * cache inode, to avoid free space cache data extent blocking data relocation.
3032*4882a593Smuzhiyun  */
3033*4882a593Smuzhiyun static int delete_v1_space_cache(struct extent_buffer *leaf,
3034*4882a593Smuzhiyun 				 struct btrfs_block_group *block_group,
3035*4882a593Smuzhiyun 				 u64 data_bytenr)
3036*4882a593Smuzhiyun {
3037*4882a593Smuzhiyun 	u64 space_cache_ino;
3038*4882a593Smuzhiyun 	struct btrfs_file_extent_item *ei;
3039*4882a593Smuzhiyun 	struct btrfs_key key;
3040*4882a593Smuzhiyun 	bool found = false;
3041*4882a593Smuzhiyun 	int i;
3042*4882a593Smuzhiyun 	int ret;
3043*4882a593Smuzhiyun 
3044*4882a593Smuzhiyun 	if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID)
3045*4882a593Smuzhiyun 		return 0;
3046*4882a593Smuzhiyun 
3047*4882a593Smuzhiyun 	for (i = 0; i < btrfs_header_nritems(leaf); i++) {
3048*4882a593Smuzhiyun 		u8 type;
3049*4882a593Smuzhiyun 
3050*4882a593Smuzhiyun 		btrfs_item_key_to_cpu(leaf, &key, i);
3051*4882a593Smuzhiyun 		if (key.type != BTRFS_EXTENT_DATA_KEY)
3052*4882a593Smuzhiyun 			continue;
3053*4882a593Smuzhiyun 		ei = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
3054*4882a593Smuzhiyun 		type = btrfs_file_extent_type(leaf, ei);
3055*4882a593Smuzhiyun 
3056*4882a593Smuzhiyun 		if ((type == BTRFS_FILE_EXTENT_REG ||
3057*4882a593Smuzhiyun 		     type == BTRFS_FILE_EXTENT_PREALLOC) &&
3058*4882a593Smuzhiyun 		    btrfs_file_extent_disk_bytenr(leaf, ei) == data_bytenr) {
3059*4882a593Smuzhiyun 			found = true;
3060*4882a593Smuzhiyun 			space_cache_ino = key.objectid;
3061*4882a593Smuzhiyun 			break;
3062*4882a593Smuzhiyun 		}
3063*4882a593Smuzhiyun 	}
3064*4882a593Smuzhiyun 	if (!found)
3065*4882a593Smuzhiyun 		return -ENOENT;
3066*4882a593Smuzhiyun 	ret = delete_block_group_cache(leaf->fs_info, block_group, NULL,
3067*4882a593Smuzhiyun 					space_cache_ino);
3068*4882a593Smuzhiyun 	return ret;
3069*4882a593Smuzhiyun }
3070*4882a593Smuzhiyun 
3071*4882a593Smuzhiyun /*
3072*4882a593Smuzhiyun  * helper to find all tree blocks that reference a given data extent
3073*4882a593Smuzhiyun  */
3074*4882a593Smuzhiyun static noinline_for_stack
3075*4882a593Smuzhiyun int add_data_references(struct reloc_control *rc,
3076*4882a593Smuzhiyun 			struct btrfs_key *extent_key,
3077*4882a593Smuzhiyun 			struct btrfs_path *path,
3078*4882a593Smuzhiyun 			struct rb_root *blocks)
3079*4882a593Smuzhiyun {
3080*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3081*4882a593Smuzhiyun 	struct ulist *leaves = NULL;
3082*4882a593Smuzhiyun 	struct ulist_iterator leaf_uiter;
3083*4882a593Smuzhiyun 	struct ulist_node *ref_node = NULL;
3084*4882a593Smuzhiyun 	const u32 blocksize = fs_info->nodesize;
3085*4882a593Smuzhiyun 	int ret = 0;
3086*4882a593Smuzhiyun 
3087*4882a593Smuzhiyun 	btrfs_release_path(path);
3088*4882a593Smuzhiyun 	ret = btrfs_find_all_leafs(NULL, fs_info, extent_key->objectid,
3089*4882a593Smuzhiyun 				   0, &leaves, NULL, true);
3090*4882a593Smuzhiyun 	if (ret < 0)
3091*4882a593Smuzhiyun 		return ret;
3092*4882a593Smuzhiyun 
3093*4882a593Smuzhiyun 	ULIST_ITER_INIT(&leaf_uiter);
3094*4882a593Smuzhiyun 	while ((ref_node = ulist_next(leaves, &leaf_uiter))) {
3095*4882a593Smuzhiyun 		struct extent_buffer *eb;
3096*4882a593Smuzhiyun 
3097*4882a593Smuzhiyun 		eb = read_tree_block(fs_info, ref_node->val, 0, 0, NULL);
3098*4882a593Smuzhiyun 		if (IS_ERR(eb)) {
3099*4882a593Smuzhiyun 			ret = PTR_ERR(eb);
3100*4882a593Smuzhiyun 			break;
3101*4882a593Smuzhiyun 		}
3102*4882a593Smuzhiyun 		ret = delete_v1_space_cache(eb, rc->block_group,
3103*4882a593Smuzhiyun 					    extent_key->objectid);
3104*4882a593Smuzhiyun 		free_extent_buffer(eb);
3105*4882a593Smuzhiyun 		if (ret < 0)
3106*4882a593Smuzhiyun 			break;
3107*4882a593Smuzhiyun 		ret = __add_tree_block(rc, ref_node->val, blocksize, blocks);
3108*4882a593Smuzhiyun 		if (ret < 0)
3109*4882a593Smuzhiyun 			break;
3110*4882a593Smuzhiyun 	}
3111*4882a593Smuzhiyun 	if (ret < 0)
3112*4882a593Smuzhiyun 		free_block_list(blocks);
3113*4882a593Smuzhiyun 	ulist_free(leaves);
3114*4882a593Smuzhiyun 	return ret;
3115*4882a593Smuzhiyun }
3116*4882a593Smuzhiyun 
3117*4882a593Smuzhiyun /*
3118*4882a593Smuzhiyun  * helper to find next unprocessed extent
3119*4882a593Smuzhiyun  */
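/*
 * Scan the extent tree (commit root, no locking) from rc->search_start to
 * the end of the block group and return the first EXTENT_ITEM or
 * METADATA_ITEM that is not yet marked in rc->processed_blocks: 0 with
 * @extent_key filled in, 1 when the block group is finished, or a negative
 * errno.
 */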
3120*4882a593Smuzhiyun static noinline_for_stack
3121*4882a593Smuzhiyun int find_next_extent(struct reloc_control *rc, struct btrfs_path *path,
3122*4882a593Smuzhiyun 		     struct btrfs_key *extent_key)
3123*4882a593Smuzhiyun {
3124*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3125*4882a593Smuzhiyun 	struct btrfs_key key;
3126*4882a593Smuzhiyun 	struct extent_buffer *leaf;
3127*4882a593Smuzhiyun 	u64 start, end, last;
3128*4882a593Smuzhiyun 	int ret;
3129*4882a593Smuzhiyun 
3130*4882a593Smuzhiyun 	last = rc->block_group->start + rc->block_group->length;
3131*4882a593Smuzhiyun 	while (1) {
3132*4882a593Smuzhiyun 		cond_resched();
3133*4882a593Smuzhiyun 		if (rc->search_start >= last) {
3134*4882a593Smuzhiyun 			ret = 1;
3135*4882a593Smuzhiyun 			break;
3136*4882a593Smuzhiyun 		}
3137*4882a593Smuzhiyun 
3138*4882a593Smuzhiyun 		key.objectid = rc->search_start;
3139*4882a593Smuzhiyun 		key.type = BTRFS_EXTENT_ITEM_KEY;
3140*4882a593Smuzhiyun 		key.offset = 0;
3141*4882a593Smuzhiyun 
3142*4882a593Smuzhiyun 		path->search_commit_root = 1;
3143*4882a593Smuzhiyun 		path->skip_locking = 1;
3144*4882a593Smuzhiyun 		ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
3145*4882a593Smuzhiyun 					0, 0);
3146*4882a593Smuzhiyun 		if (ret < 0)
3147*4882a593Smuzhiyun 			break;
3148*4882a593Smuzhiyun next:
3149*4882a593Smuzhiyun 		leaf = path->nodes[0];
3150*4882a593Smuzhiyun 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3151*4882a593Smuzhiyun 			ret = btrfs_next_leaf(rc->extent_root, path);
3152*4882a593Smuzhiyun 			if (ret != 0)
3153*4882a593Smuzhiyun 				break;
3154*4882a593Smuzhiyun 			leaf = path->nodes[0];
3155*4882a593Smuzhiyun 		}
3156*4882a593Smuzhiyun 
3157*4882a593Smuzhiyun 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3158*4882a593Smuzhiyun 		if (key.objectid >= last) {
3159*4882a593Smuzhiyun 			ret = 1;
3160*4882a593Smuzhiyun 			break;
3161*4882a593Smuzhiyun 		}
3162*4882a593Smuzhiyun 
3163*4882a593Smuzhiyun 		if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3164*4882a593Smuzhiyun 		    key.type != BTRFS_METADATA_ITEM_KEY) {
3165*4882a593Smuzhiyun 			path->slots[0]++;
3166*4882a593Smuzhiyun 			goto next;
3167*4882a593Smuzhiyun 		}
3168*4882a593Smuzhiyun 
3169*4882a593Smuzhiyun 		if (key.type == BTRFS_EXTENT_ITEM_KEY &&
3170*4882a593Smuzhiyun 		    key.objectid + key.offset <= rc->search_start) {
3171*4882a593Smuzhiyun 			path->slots[0]++;
3172*4882a593Smuzhiyun 			goto next;
3173*4882a593Smuzhiyun 		}
3174*4882a593Smuzhiyun 
3175*4882a593Smuzhiyun 		if (key.type == BTRFS_METADATA_ITEM_KEY &&
3176*4882a593Smuzhiyun 		    key.objectid + fs_info->nodesize <=
3177*4882a593Smuzhiyun 		    rc->search_start) {
3178*4882a593Smuzhiyun 			path->slots[0]++;
3179*4882a593Smuzhiyun 			goto next;
3180*4882a593Smuzhiyun 		}
3181*4882a593Smuzhiyun 
3182*4882a593Smuzhiyun 		ret = find_first_extent_bit(&rc->processed_blocks,
3183*4882a593Smuzhiyun 					    key.objectid, &start, &end,
3184*4882a593Smuzhiyun 					    EXTENT_DIRTY, NULL);
3185*4882a593Smuzhiyun 
3186*4882a593Smuzhiyun 		if (ret == 0 && start <= key.objectid) {
3187*4882a593Smuzhiyun 			btrfs_release_path(path);
3188*4882a593Smuzhiyun 			rc->search_start = end + 1;
3189*4882a593Smuzhiyun 		} else {
3190*4882a593Smuzhiyun 			if (key.type == BTRFS_EXTENT_ITEM_KEY)
3191*4882a593Smuzhiyun 				rc->search_start = key.objectid + key.offset;
3192*4882a593Smuzhiyun 			else
3193*4882a593Smuzhiyun 				rc->search_start = key.objectid +
3194*4882a593Smuzhiyun 					fs_info->nodesize;
3195*4882a593Smuzhiyun 			memcpy(extent_key, &key, sizeof(key));
3196*4882a593Smuzhiyun 			return 0;
3197*4882a593Smuzhiyun 		}
3198*4882a593Smuzhiyun 	}
3199*4882a593Smuzhiyun 	btrfs_release_path(path);
3200*4882a593Smuzhiyun 	return ret;
3201*4882a593Smuzhiyun }
3202*4882a593Smuzhiyun 
3203*4882a593Smuzhiyun static void set_reloc_control(struct reloc_control *rc)
3204*4882a593Smuzhiyun {
3205*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3206*4882a593Smuzhiyun 
3207*4882a593Smuzhiyun 	mutex_lock(&fs_info->reloc_mutex);
3208*4882a593Smuzhiyun 	fs_info->reloc_ctl = rc;
3209*4882a593Smuzhiyun 	mutex_unlock(&fs_info->reloc_mutex);
3210*4882a593Smuzhiyun }
3211*4882a593Smuzhiyun 
3212*4882a593Smuzhiyun static void unset_reloc_control(struct reloc_control *rc)
3213*4882a593Smuzhiyun {
3214*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3215*4882a593Smuzhiyun 
3216*4882a593Smuzhiyun 	mutex_lock(&fs_info->reloc_mutex);
3217*4882a593Smuzhiyun 	fs_info->reloc_ctl = NULL;
3218*4882a593Smuzhiyun 	mutex_unlock(&fs_info->reloc_mutex);
3219*4882a593Smuzhiyun }
3220*4882a593Smuzhiyun 
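/*
 * Return 1 for flag combinations that should never appear on disk (both or
 * neither of DATA/TREE_BLOCK set, or FULL_BACKREF on a data extent); the
 * caller BUG()s in that case.
 */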
3221*4882a593Smuzhiyun static int check_extent_flags(u64 flags)
3222*4882a593Smuzhiyun {
3223*4882a593Smuzhiyun 	if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3224*4882a593Smuzhiyun 	    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3225*4882a593Smuzhiyun 		return 1;
3226*4882a593Smuzhiyun 	if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
3227*4882a593Smuzhiyun 	    !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
3228*4882a593Smuzhiyun 		return 1;
3229*4882a593Smuzhiyun 	if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
3230*4882a593Smuzhiyun 	    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
3231*4882a593Smuzhiyun 		return 1;
3232*4882a593Smuzhiyun 	return 0;
3233*4882a593Smuzhiyun }
3234*4882a593Smuzhiyun 
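/*
 * Set up the temporary block reservation (RELOCATION_RESERVED_NODES tree
 * blocks), reset the per-run counters in @rc, install @rc as the active
 * reloc control and commit a transaction before the real work starts.
 */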
3235*4882a593Smuzhiyun static noinline_for_stack
3236*4882a593Smuzhiyun int prepare_to_relocate(struct reloc_control *rc)
3237*4882a593Smuzhiyun {
3238*4882a593Smuzhiyun 	struct btrfs_trans_handle *trans;
3239*4882a593Smuzhiyun 	int ret;
3240*4882a593Smuzhiyun 
3241*4882a593Smuzhiyun 	rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info,
3242*4882a593Smuzhiyun 					      BTRFS_BLOCK_RSV_TEMP);
3243*4882a593Smuzhiyun 	if (!rc->block_rsv)
3244*4882a593Smuzhiyun 		return -ENOMEM;
3245*4882a593Smuzhiyun 
3246*4882a593Smuzhiyun 	memset(&rc->cluster, 0, sizeof(rc->cluster));
3247*4882a593Smuzhiyun 	rc->search_start = rc->block_group->start;
3248*4882a593Smuzhiyun 	rc->extents_found = 0;
3249*4882a593Smuzhiyun 	rc->nodes_relocated = 0;
3250*4882a593Smuzhiyun 	rc->merging_rsv_size = 0;
3251*4882a593Smuzhiyun 	rc->reserved_bytes = 0;
3252*4882a593Smuzhiyun 	rc->block_rsv->size = rc->extent_root->fs_info->nodesize *
3253*4882a593Smuzhiyun 			      RELOCATION_RESERVED_NODES;
3254*4882a593Smuzhiyun 	ret = btrfs_block_rsv_refill(rc->extent_root,
3255*4882a593Smuzhiyun 				     rc->block_rsv, rc->block_rsv->size,
3256*4882a593Smuzhiyun 				     BTRFS_RESERVE_FLUSH_ALL);
3257*4882a593Smuzhiyun 	if (ret)
3258*4882a593Smuzhiyun 		return ret;
3259*4882a593Smuzhiyun 
3260*4882a593Smuzhiyun 	rc->create_reloc_tree = 1;
3261*4882a593Smuzhiyun 	set_reloc_control(rc);
3262*4882a593Smuzhiyun 
3263*4882a593Smuzhiyun 	trans = btrfs_join_transaction(rc->extent_root);
3264*4882a593Smuzhiyun 	if (IS_ERR(trans)) {
3265*4882a593Smuzhiyun 		unset_reloc_control(rc);
3266*4882a593Smuzhiyun 		/*
3267*4882a593Smuzhiyun 		 * extent tree is not a ref_cow tree and has no reloc_root to
3268*4882a593Smuzhiyun 		 * clean up.  And callers are responsible for freeing the above
3269*4882a593Smuzhiyun 		 * block rsv.
3270*4882a593Smuzhiyun 		 */
3271*4882a593Smuzhiyun 		return PTR_ERR(trans);
3272*4882a593Smuzhiyun 	}
3273*4882a593Smuzhiyun 	btrfs_commit_transaction(trans);
3274*4882a593Smuzhiyun 	return 0;
3275*4882a593Smuzhiyun }
3276*4882a593Smuzhiyun 
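/*
 * Main relocation loop: refill the block reservation, start a transaction,
 * find the next unprocessed extent, collect the tree blocks that reference
 * it (directly for metadata, through add_data_references() for data in the
 * UPDATE_DATA_PTRS stage) and relocate them; data extents are additionally
 * clustered so relocate_file_extent_cluster() can copy them in bigger
 * chunks.  Once everything is processed the reloc trees are merged back
 * via prepare_to_merge() and merge_reloc_roots().
 */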
3277*4882a593Smuzhiyun static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3278*4882a593Smuzhiyun {
3279*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3280*4882a593Smuzhiyun 	struct rb_root blocks = RB_ROOT;
3281*4882a593Smuzhiyun 	struct btrfs_key key;
3282*4882a593Smuzhiyun 	struct btrfs_trans_handle *trans = NULL;
3283*4882a593Smuzhiyun 	struct btrfs_path *path;
3284*4882a593Smuzhiyun 	struct btrfs_extent_item *ei;
3285*4882a593Smuzhiyun 	u64 flags;
3286*4882a593Smuzhiyun 	u32 item_size;
3287*4882a593Smuzhiyun 	int ret;
3288*4882a593Smuzhiyun 	int err = 0;
3289*4882a593Smuzhiyun 	int progress = 0;
3290*4882a593Smuzhiyun 
3291*4882a593Smuzhiyun 	path = btrfs_alloc_path();
3292*4882a593Smuzhiyun 	if (!path)
3293*4882a593Smuzhiyun 		return -ENOMEM;
3294*4882a593Smuzhiyun 	path->reada = READA_FORWARD;
3295*4882a593Smuzhiyun 
3296*4882a593Smuzhiyun 	ret = prepare_to_relocate(rc);
3297*4882a593Smuzhiyun 	if (ret) {
3298*4882a593Smuzhiyun 		err = ret;
3299*4882a593Smuzhiyun 		goto out_free;
3300*4882a593Smuzhiyun 	}
3301*4882a593Smuzhiyun 
3302*4882a593Smuzhiyun 	while (1) {
3303*4882a593Smuzhiyun 		rc->reserved_bytes = 0;
3304*4882a593Smuzhiyun 		ret = btrfs_block_rsv_refill(rc->extent_root,
3305*4882a593Smuzhiyun 					rc->block_rsv, rc->block_rsv->size,
3306*4882a593Smuzhiyun 					BTRFS_RESERVE_FLUSH_ALL);
3307*4882a593Smuzhiyun 		if (ret) {
3308*4882a593Smuzhiyun 			err = ret;
3309*4882a593Smuzhiyun 			break;
3310*4882a593Smuzhiyun 		}
3311*4882a593Smuzhiyun 		progress++;
3312*4882a593Smuzhiyun 		trans = btrfs_start_transaction(rc->extent_root, 0);
3313*4882a593Smuzhiyun 		if (IS_ERR(trans)) {
3314*4882a593Smuzhiyun 			err = PTR_ERR(trans);
3315*4882a593Smuzhiyun 			trans = NULL;
3316*4882a593Smuzhiyun 			break;
3317*4882a593Smuzhiyun 		}
3318*4882a593Smuzhiyun restart:
3319*4882a593Smuzhiyun 		if (update_backref_cache(trans, &rc->backref_cache)) {
3320*4882a593Smuzhiyun 			btrfs_end_transaction(trans);
3321*4882a593Smuzhiyun 			trans = NULL;
3322*4882a593Smuzhiyun 			continue;
3323*4882a593Smuzhiyun 		}
3324*4882a593Smuzhiyun 
3325*4882a593Smuzhiyun 		ret = find_next_extent(rc, path, &key);
3326*4882a593Smuzhiyun 		if (ret < 0)
3327*4882a593Smuzhiyun 			err = ret;
3328*4882a593Smuzhiyun 		if (ret != 0)
3329*4882a593Smuzhiyun 			break;
3330*4882a593Smuzhiyun 
3331*4882a593Smuzhiyun 		rc->extents_found++;
3332*4882a593Smuzhiyun 
3333*4882a593Smuzhiyun 		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3334*4882a593Smuzhiyun 				    struct btrfs_extent_item);
3335*4882a593Smuzhiyun 		item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
3336*4882a593Smuzhiyun 		if (item_size >= sizeof(*ei)) {
3337*4882a593Smuzhiyun 			flags = btrfs_extent_flags(path->nodes[0], ei);
3338*4882a593Smuzhiyun 			ret = check_extent_flags(flags);
3339*4882a593Smuzhiyun 			BUG_ON(ret);
3340*4882a593Smuzhiyun 		} else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) {
3341*4882a593Smuzhiyun 			err = -EINVAL;
3342*4882a593Smuzhiyun 			btrfs_print_v0_err(trans->fs_info);
3343*4882a593Smuzhiyun 			btrfs_abort_transaction(trans, err);
3344*4882a593Smuzhiyun 			break;
3345*4882a593Smuzhiyun 		} else {
3346*4882a593Smuzhiyun 			BUG();
3347*4882a593Smuzhiyun 		}
3348*4882a593Smuzhiyun 
3349*4882a593Smuzhiyun 		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
3350*4882a593Smuzhiyun 			ret = add_tree_block(rc, &key, path, &blocks);
3351*4882a593Smuzhiyun 		} else if (rc->stage == UPDATE_DATA_PTRS &&
3352*4882a593Smuzhiyun 			   (flags & BTRFS_EXTENT_FLAG_DATA)) {
3353*4882a593Smuzhiyun 			ret = add_data_references(rc, &key, path, &blocks);
3354*4882a593Smuzhiyun 		} else {
3355*4882a593Smuzhiyun 			btrfs_release_path(path);
3356*4882a593Smuzhiyun 			ret = 0;
3357*4882a593Smuzhiyun 		}
3358*4882a593Smuzhiyun 		if (ret < 0) {
3359*4882a593Smuzhiyun 			err = ret;
3360*4882a593Smuzhiyun 			break;
3361*4882a593Smuzhiyun 		}
3362*4882a593Smuzhiyun 
3363*4882a593Smuzhiyun 		if (!RB_EMPTY_ROOT(&blocks)) {
3364*4882a593Smuzhiyun 			ret = relocate_tree_blocks(trans, rc, &blocks);
3365*4882a593Smuzhiyun 			if (ret < 0) {
3366*4882a593Smuzhiyun 				if (ret != -EAGAIN) {
3367*4882a593Smuzhiyun 					err = ret;
3368*4882a593Smuzhiyun 					break;
3369*4882a593Smuzhiyun 				}
3370*4882a593Smuzhiyun 				rc->extents_found--;
3371*4882a593Smuzhiyun 				rc->search_start = key.objectid;
3372*4882a593Smuzhiyun 			}
3373*4882a593Smuzhiyun 		}
3374*4882a593Smuzhiyun 
3375*4882a593Smuzhiyun 		btrfs_end_transaction_throttle(trans);
3376*4882a593Smuzhiyun 		btrfs_btree_balance_dirty(fs_info);
3377*4882a593Smuzhiyun 		trans = NULL;
3378*4882a593Smuzhiyun 
3379*4882a593Smuzhiyun 		if (rc->stage == MOVE_DATA_EXTENTS &&
3380*4882a593Smuzhiyun 		    (flags & BTRFS_EXTENT_FLAG_DATA)) {
3381*4882a593Smuzhiyun 			rc->found_file_extent = 1;
3382*4882a593Smuzhiyun 			ret = relocate_data_extent(rc->data_inode,
3383*4882a593Smuzhiyun 						   &key, &rc->cluster);
3384*4882a593Smuzhiyun 			if (ret < 0) {
3385*4882a593Smuzhiyun 				err = ret;
3386*4882a593Smuzhiyun 				break;
3387*4882a593Smuzhiyun 			}
3388*4882a593Smuzhiyun 		}
3389*4882a593Smuzhiyun 		if (btrfs_should_cancel_balance(fs_info)) {
3390*4882a593Smuzhiyun 			err = -ECANCELED;
3391*4882a593Smuzhiyun 			break;
3392*4882a593Smuzhiyun 		}
3393*4882a593Smuzhiyun 	}
3394*4882a593Smuzhiyun 	if (trans && progress && err == -ENOSPC) {
3395*4882a593Smuzhiyun 		ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags);
3396*4882a593Smuzhiyun 		if (ret == 1) {
3397*4882a593Smuzhiyun 			err = 0;
3398*4882a593Smuzhiyun 			progress = 0;
3399*4882a593Smuzhiyun 			goto restart;
3400*4882a593Smuzhiyun 		}
3401*4882a593Smuzhiyun 	}
3402*4882a593Smuzhiyun 
3403*4882a593Smuzhiyun 	btrfs_release_path(path);
3404*4882a593Smuzhiyun 	clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY);
3405*4882a593Smuzhiyun 
3406*4882a593Smuzhiyun 	if (trans) {
3407*4882a593Smuzhiyun 		btrfs_end_transaction_throttle(trans);
3408*4882a593Smuzhiyun 		btrfs_btree_balance_dirty(fs_info);
3409*4882a593Smuzhiyun 	}
3410*4882a593Smuzhiyun 
3411*4882a593Smuzhiyun 	if (!err) {
3412*4882a593Smuzhiyun 		ret = relocate_file_extent_cluster(rc->data_inode,
3413*4882a593Smuzhiyun 						   &rc->cluster);
3414*4882a593Smuzhiyun 		if (ret < 0)
3415*4882a593Smuzhiyun 			err = ret;
3416*4882a593Smuzhiyun 	}
3417*4882a593Smuzhiyun 
3418*4882a593Smuzhiyun 	rc->create_reloc_tree = 0;
3419*4882a593Smuzhiyun 	set_reloc_control(rc);
3420*4882a593Smuzhiyun 
3421*4882a593Smuzhiyun 	btrfs_backref_release_cache(&rc->backref_cache);
3422*4882a593Smuzhiyun 	btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);
3423*4882a593Smuzhiyun 
3424*4882a593Smuzhiyun 	/*
3425*4882a593Smuzhiyun 	 * Even in the case when the relocation is cancelled, we should still go
3426*4882a593Smuzhiyun 	 * through prepare_to_merge() and merge_reloc_roots().
3427*4882a593Smuzhiyun 	 *
3428*4882a593Smuzhiyun 	 * For error (including cancelled balance), prepare_to_merge() will
3429*4882a593Smuzhiyun 	 * mark all reloc trees orphan, then queue them for cleanup in
3430*4882a593Smuzhiyun 	 * merge_reloc_roots()
3431*4882a593Smuzhiyun 	 */
3432*4882a593Smuzhiyun 	err = prepare_to_merge(rc, err);
3433*4882a593Smuzhiyun 
3434*4882a593Smuzhiyun 	merge_reloc_roots(rc);
3435*4882a593Smuzhiyun 
3436*4882a593Smuzhiyun 	rc->merge_reloc_tree = 0;
3437*4882a593Smuzhiyun 	unset_reloc_control(rc);
3438*4882a593Smuzhiyun 	btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);
3439*4882a593Smuzhiyun 
3440*4882a593Smuzhiyun 	/* get rid of pinned extents */
3441*4882a593Smuzhiyun 	trans = btrfs_join_transaction(rc->extent_root);
3442*4882a593Smuzhiyun 	if (IS_ERR(trans)) {
3443*4882a593Smuzhiyun 		err = PTR_ERR(trans);
3444*4882a593Smuzhiyun 		goto out_free;
3445*4882a593Smuzhiyun 	}
3446*4882a593Smuzhiyun 	btrfs_commit_transaction(trans);
3447*4882a593Smuzhiyun out_free:
3448*4882a593Smuzhiyun 	ret = clean_dirty_subvols(rc);
3449*4882a593Smuzhiyun 	if (ret < 0 && !err)
3450*4882a593Smuzhiyun 		err = ret;
3451*4882a593Smuzhiyun 	btrfs_free_block_rsv(fs_info, rc->block_rsv);
3452*4882a593Smuzhiyun 	btrfs_free_path(path);
3453*4882a593Smuzhiyun 	return err;
3454*4882a593Smuzhiyun }
3455*4882a593Smuzhiyun 
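/*
 * Insert an empty inode item for the data relocation inode: a regular file
 * with size 0, link count 0, and the NOCOMPRESS and PREALLOC flags set.
 */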
3456*4882a593Smuzhiyun static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
3457*4882a593Smuzhiyun 				 struct btrfs_root *root, u64 objectid)
3458*4882a593Smuzhiyun {
3459*4882a593Smuzhiyun 	struct btrfs_path *path;
3460*4882a593Smuzhiyun 	struct btrfs_inode_item *item;
3461*4882a593Smuzhiyun 	struct extent_buffer *leaf;
3462*4882a593Smuzhiyun 	int ret;
3463*4882a593Smuzhiyun 
3464*4882a593Smuzhiyun 	path = btrfs_alloc_path();
3465*4882a593Smuzhiyun 	if (!path)
3466*4882a593Smuzhiyun 		return -ENOMEM;
3467*4882a593Smuzhiyun 
3468*4882a593Smuzhiyun 	ret = btrfs_insert_empty_inode(trans, root, path, objectid);
3469*4882a593Smuzhiyun 	if (ret)
3470*4882a593Smuzhiyun 		goto out;
3471*4882a593Smuzhiyun 
3472*4882a593Smuzhiyun 	leaf = path->nodes[0];
3473*4882a593Smuzhiyun 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
3474*4882a593Smuzhiyun 	memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
3475*4882a593Smuzhiyun 	btrfs_set_inode_generation(leaf, item, 1);
3476*4882a593Smuzhiyun 	btrfs_set_inode_size(leaf, item, 0);
3477*4882a593Smuzhiyun 	btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
3478*4882a593Smuzhiyun 	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
3479*4882a593Smuzhiyun 					  BTRFS_INODE_PREALLOC);
3480*4882a593Smuzhiyun 	btrfs_mark_buffer_dirty(leaf);
3481*4882a593Smuzhiyun out:
3482*4882a593Smuzhiyun 	btrfs_free_path(path);
3483*4882a593Smuzhiyun 	return ret;
3484*4882a593Smuzhiyun }
3485*4882a593Smuzhiyun 
3486*4882a593Smuzhiyun /*
3487*4882a593Smuzhiyun  * Helper to create the inode used for data relocation.
3488*4882a593Smuzhiyun  * The inode lives in the data relocation tree and its link count is 0.
3489*4882a593Smuzhiyun  */
3490*4882a593Smuzhiyun static noinline_for_stack
3491*4882a593Smuzhiyun struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3492*4882a593Smuzhiyun 				 struct btrfs_block_group *group)
3493*4882a593Smuzhiyun {
3494*4882a593Smuzhiyun 	struct inode *inode = NULL;
3495*4882a593Smuzhiyun 	struct btrfs_trans_handle *trans;
3496*4882a593Smuzhiyun 	struct btrfs_root *root;
3497*4882a593Smuzhiyun 	u64 objectid;
3498*4882a593Smuzhiyun 	int err = 0;
3499*4882a593Smuzhiyun 
3500*4882a593Smuzhiyun 	root = btrfs_grab_root(fs_info->data_reloc_root);
3501*4882a593Smuzhiyun 	trans = btrfs_start_transaction(root, 6);
3502*4882a593Smuzhiyun 	if (IS_ERR(trans)) {
3503*4882a593Smuzhiyun 		btrfs_put_root(root);
3504*4882a593Smuzhiyun 		return ERR_CAST(trans);
3505*4882a593Smuzhiyun 	}
3506*4882a593Smuzhiyun 
3507*4882a593Smuzhiyun 	err = btrfs_find_free_objectid(root, &objectid);
3508*4882a593Smuzhiyun 	if (err)
3509*4882a593Smuzhiyun 		goto out;
3510*4882a593Smuzhiyun 
3511*4882a593Smuzhiyun 	err = __insert_orphan_inode(trans, root, objectid);
3512*4882a593Smuzhiyun 	BUG_ON(err);
3513*4882a593Smuzhiyun 
3514*4882a593Smuzhiyun 	inode = btrfs_iget(fs_info->sb, objectid, root);
3515*4882a593Smuzhiyun 	BUG_ON(IS_ERR(inode));
3516*4882a593Smuzhiyun 	BTRFS_I(inode)->index_cnt = group->start;
3517*4882a593Smuzhiyun 
3518*4882a593Smuzhiyun 	err = btrfs_orphan_add(trans, BTRFS_I(inode));
3519*4882a593Smuzhiyun out:
3520*4882a593Smuzhiyun 	btrfs_put_root(root);
3521*4882a593Smuzhiyun 	btrfs_end_transaction(trans);
3522*4882a593Smuzhiyun 	btrfs_btree_balance_dirty(fs_info);
3523*4882a593Smuzhiyun 	if (err) {
3524*4882a593Smuzhiyun 		if (inode)
3525*4882a593Smuzhiyun 			iput(inode);
3526*4882a593Smuzhiyun 		inode = ERR_PTR(err);
3527*4882a593Smuzhiyun 	}
3528*4882a593Smuzhiyun 	return inode;
3529*4882a593Smuzhiyun }
3530*4882a593Smuzhiyun 
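/* Allocate and initialize a reloc_control, including its backref cache. */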
3531*4882a593Smuzhiyun static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
3532*4882a593Smuzhiyun {
3533*4882a593Smuzhiyun 	struct reloc_control *rc;
3534*4882a593Smuzhiyun 
3535*4882a593Smuzhiyun 	rc = kzalloc(sizeof(*rc), GFP_NOFS);
3536*4882a593Smuzhiyun 	if (!rc)
3537*4882a593Smuzhiyun 		return NULL;
3538*4882a593Smuzhiyun 
3539*4882a593Smuzhiyun 	INIT_LIST_HEAD(&rc->reloc_roots);
3540*4882a593Smuzhiyun 	INIT_LIST_HEAD(&rc->dirty_subvol_roots);
3541*4882a593Smuzhiyun 	btrfs_backref_init_cache(fs_info, &rc->backref_cache, 1);
3542*4882a593Smuzhiyun 	mapping_tree_init(&rc->reloc_root_tree);
3543*4882a593Smuzhiyun 	extent_io_tree_init(fs_info, &rc->processed_blocks,
3544*4882a593Smuzhiyun 			    IO_TREE_RELOC_BLOCKS, NULL);
3545*4882a593Smuzhiyun 	return rc;
3546*4882a593Smuzhiyun }
3547*4882a593Smuzhiyun 
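/* Free a reloc_control and any mapping tree nodes still in its rbtree. */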
3548*4882a593Smuzhiyun static void free_reloc_control(struct reloc_control *rc)
3549*4882a593Smuzhiyun {
3550*4882a593Smuzhiyun 	struct mapping_node *node, *tmp;
3551*4882a593Smuzhiyun 
3552*4882a593Smuzhiyun 	free_reloc_roots(&rc->reloc_roots);
3553*4882a593Smuzhiyun 	rbtree_postorder_for_each_entry_safe(node, tmp,
3554*4882a593Smuzhiyun 			&rc->reloc_root_tree.rb_root, rb_node)
3555*4882a593Smuzhiyun 		kfree(node);
3556*4882a593Smuzhiyun 
3557*4882a593Smuzhiyun 	kfree(rc);
3558*4882a593Smuzhiyun }
3559*4882a593Smuzhiyun 
3560*4882a593Smuzhiyun /*
3561*4882a593Smuzhiyun  * Print the block group being relocated
3562*4882a593Smuzhiyun  */
3563*4882a593Smuzhiyun static void describe_relocation(struct btrfs_fs_info *fs_info,
3564*4882a593Smuzhiyun 				struct btrfs_block_group *block_group)
3565*4882a593Smuzhiyun {
3566*4882a593Smuzhiyun 	char buf[128] = {'\0'};
3567*4882a593Smuzhiyun 
3568*4882a593Smuzhiyun 	btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf));
3569*4882a593Smuzhiyun 
3570*4882a593Smuzhiyun 	btrfs_info(fs_info,
3571*4882a593Smuzhiyun 		   "relocating block group %llu flags %s",
3572*4882a593Smuzhiyun 		   block_group->start, buf);
3573*4882a593Smuzhiyun }
3574*4882a593Smuzhiyun 
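/* Human-readable name of a relocation stage, used in log messages. */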
3575*4882a593Smuzhiyun static const char *stage_to_string(int stage)
3576*4882a593Smuzhiyun {
3577*4882a593Smuzhiyun 	if (stage == MOVE_DATA_EXTENTS)
3578*4882a593Smuzhiyun 		return "move data extents";
3579*4882a593Smuzhiyun 	if (stage == UPDATE_DATA_PTRS)
3580*4882a593Smuzhiyun 		return "update data pointers";
3581*4882a593Smuzhiyun 	return "unknown";
3582*4882a593Smuzhiyun }
3583*4882a593Smuzhiyun 
3584*4882a593Smuzhiyun /*
3585*4882a593Smuzhiyun  * Relocate all extents in a block group.
3586*4882a593Smuzhiyun  */
3587*4882a593Smuzhiyun int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
3588*4882a593Smuzhiyun {
3589*4882a593Smuzhiyun 	struct btrfs_block_group *bg;
3590*4882a593Smuzhiyun 	struct btrfs_root *extent_root = fs_info->extent_root;
3591*4882a593Smuzhiyun 	struct reloc_control *rc;
3592*4882a593Smuzhiyun 	struct inode *inode;
3593*4882a593Smuzhiyun 	struct btrfs_path *path;
3594*4882a593Smuzhiyun 	int ret;
3595*4882a593Smuzhiyun 	int rw = 0;
3596*4882a593Smuzhiyun 	int err = 0;
3597*4882a593Smuzhiyun 
3598*4882a593Smuzhiyun 	bg = btrfs_lookup_block_group(fs_info, group_start);
3599*4882a593Smuzhiyun 	if (!bg)
3600*4882a593Smuzhiyun 		return -ENOENT;
3601*4882a593Smuzhiyun 
3602*4882a593Smuzhiyun 	if (btrfs_pinned_by_swapfile(fs_info, bg)) {
3603*4882a593Smuzhiyun 		btrfs_put_block_group(bg);
3604*4882a593Smuzhiyun 		return -ETXTBSY;
3605*4882a593Smuzhiyun 	}
3606*4882a593Smuzhiyun 
3607*4882a593Smuzhiyun 	rc = alloc_reloc_control(fs_info);
3608*4882a593Smuzhiyun 	if (!rc) {
3609*4882a593Smuzhiyun 		btrfs_put_block_group(bg);
3610*4882a593Smuzhiyun 		return -ENOMEM;
3611*4882a593Smuzhiyun 	}
3612*4882a593Smuzhiyun 
3613*4882a593Smuzhiyun 	rc->extent_root = extent_root;
3614*4882a593Smuzhiyun 	rc->block_group = bg;
3615*4882a593Smuzhiyun 
3616*4882a593Smuzhiyun 	ret = btrfs_inc_block_group_ro(rc->block_group, true);
3617*4882a593Smuzhiyun 	if (ret) {
3618*4882a593Smuzhiyun 		err = ret;
3619*4882a593Smuzhiyun 		goto out;
3620*4882a593Smuzhiyun 	}
3621*4882a593Smuzhiyun 	rw = 1;
3622*4882a593Smuzhiyun 
3623*4882a593Smuzhiyun 	path = btrfs_alloc_path();
3624*4882a593Smuzhiyun 	if (!path) {
3625*4882a593Smuzhiyun 		err = -ENOMEM;
3626*4882a593Smuzhiyun 		goto out;
3627*4882a593Smuzhiyun 	}
3628*4882a593Smuzhiyun 
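	/*
	 * Drop the free space cache of this block group before relocating:
	 * look up its cache inode, if any, and delete the cached contents.
	 */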
3629*4882a593Smuzhiyun 	inode = lookup_free_space_inode(rc->block_group, path);
3630*4882a593Smuzhiyun 	btrfs_free_path(path);
3631*4882a593Smuzhiyun 
3632*4882a593Smuzhiyun 	if (!IS_ERR(inode))
3633*4882a593Smuzhiyun 		ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
3634*4882a593Smuzhiyun 	else
3635*4882a593Smuzhiyun 		ret = PTR_ERR(inode);
3636*4882a593Smuzhiyun 
3637*4882a593Smuzhiyun 	if (ret && ret != -ENOENT) {
3638*4882a593Smuzhiyun 		err = ret;
3639*4882a593Smuzhiyun 		goto out;
3640*4882a593Smuzhiyun 	}
3641*4882a593Smuzhiyun 
3642*4882a593Smuzhiyun 	rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
3643*4882a593Smuzhiyun 	if (IS_ERR(rc->data_inode)) {
3644*4882a593Smuzhiyun 		err = PTR_ERR(rc->data_inode);
3645*4882a593Smuzhiyun 		rc->data_inode = NULL;
3646*4882a593Smuzhiyun 		goto out;
3647*4882a593Smuzhiyun 	}
3648*4882a593Smuzhiyun 
3649*4882a593Smuzhiyun 	describe_relocation(fs_info, rc->block_group);
3650*4882a593Smuzhiyun 
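	/*
	 * Let in-flight activity against the block group settle before we
	 * start: outstanding reservations, NOCOW writers and ordered extents.
	 */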
3651*4882a593Smuzhiyun 	btrfs_wait_block_group_reservations(rc->block_group);
3652*4882a593Smuzhiyun 	btrfs_wait_nocow_writers(rc->block_group);
3653*4882a593Smuzhiyun 	btrfs_wait_ordered_roots(fs_info, U64_MAX,
3654*4882a593Smuzhiyun 				 rc->block_group->start,
3655*4882a593Smuzhiyun 				 rc->block_group->length);
3656*4882a593Smuzhiyun 
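	/*
	 * Relocation runs in passes: MOVE_DATA_EXTENTS copies the data out,
	 * then UPDATE_DATA_PTRS rewrites the metadata that referenced it.
	 * Loop until a pass finds no more extents in the block group.
	 */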
3657*4882a593Smuzhiyun 	while (1) {
3658*4882a593Smuzhiyun 		int finishes_stage;
3659*4882a593Smuzhiyun 
3660*4882a593Smuzhiyun 		mutex_lock(&fs_info->cleaner_mutex);
3661*4882a593Smuzhiyun 		ret = relocate_block_group(rc);
3662*4882a593Smuzhiyun 		mutex_unlock(&fs_info->cleaner_mutex);
3663*4882a593Smuzhiyun 		if (ret < 0)
3664*4882a593Smuzhiyun 			err = ret;
3665*4882a593Smuzhiyun 
3666*4882a593Smuzhiyun 		finishes_stage = rc->stage;
3667*4882a593Smuzhiyun 		/*
3668*4882a593Smuzhiyun 		 * We may have gotten ENOSPC after we already dirtied some
3669*4882a593Smuzhiyun 		 * extents.  If writeout happens while we're relocating a
3670*4882a593Smuzhiyun 		 * different block group we could end up hitting the
3671*4882a593Smuzhiyun 		 * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in
3672*4882a593Smuzhiyun 		 * btrfs_reloc_cow_block.  Make sure we write everything out
3673*4882a593Smuzhiyun 		 * properly so we don't trip over this problem, and then break
3674*4882a593Smuzhiyun 		 * out of the loop if we hit an error.
3675*4882a593Smuzhiyun 		 */
3676*4882a593Smuzhiyun 		if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
3677*4882a593Smuzhiyun 			ret = btrfs_wait_ordered_range(rc->data_inode, 0,
3678*4882a593Smuzhiyun 						       (u64)-1);
3679*4882a593Smuzhiyun 			if (ret)
3680*4882a593Smuzhiyun 				err = ret;
3681*4882a593Smuzhiyun 			invalidate_mapping_pages(rc->data_inode->i_mapping,
3682*4882a593Smuzhiyun 						 0, -1);
3683*4882a593Smuzhiyun 			rc->stage = UPDATE_DATA_PTRS;
3684*4882a593Smuzhiyun 		}
3685*4882a593Smuzhiyun 
3686*4882a593Smuzhiyun 		if (err < 0)
3687*4882a593Smuzhiyun 			goto out;
3688*4882a593Smuzhiyun 
3689*4882a593Smuzhiyun 		if (rc->extents_found == 0)
3690*4882a593Smuzhiyun 			break;
3691*4882a593Smuzhiyun 
3692*4882a593Smuzhiyun 		btrfs_info(fs_info, "found %llu extents, stage: %s",
3693*4882a593Smuzhiyun 			   rc->extents_found, stage_to_string(finishes_stage));
3694*4882a593Smuzhiyun 	}
3695*4882a593Smuzhiyun 
3696*4882a593Smuzhiyun 	WARN_ON(rc->block_group->pinned > 0);
3697*4882a593Smuzhiyun 	WARN_ON(rc->block_group->reserved > 0);
3698*4882a593Smuzhiyun 	WARN_ON(rc->block_group->used > 0);
3699*4882a593Smuzhiyun out:
3700*4882a593Smuzhiyun 	if (err && rw)
3701*4882a593Smuzhiyun 		btrfs_dec_block_group_ro(rc->block_group);
3702*4882a593Smuzhiyun 	iput(rc->data_inode);
3703*4882a593Smuzhiyun 	btrfs_put_block_group(rc->block_group);
3704*4882a593Smuzhiyun 	free_reloc_control(rc);
3705*4882a593Smuzhiyun 	return err;
3706*4882a593Smuzhiyun }
3707*4882a593Smuzhiyun 
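/*
 * Mark a reloc root as garbage: reset its drop progress and set its root
 * item refs to 0 so that it gets deleted later as a dead root.
 */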
3708*4882a593Smuzhiyun static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
3709*4882a593Smuzhiyun {
3710*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = root->fs_info;
3711*4882a593Smuzhiyun 	struct btrfs_trans_handle *trans;
3712*4882a593Smuzhiyun 	int ret, err;
3713*4882a593Smuzhiyun 
3714*4882a593Smuzhiyun 	trans = btrfs_start_transaction(fs_info->tree_root, 0);
3715*4882a593Smuzhiyun 	if (IS_ERR(trans))
3716*4882a593Smuzhiyun 		return PTR_ERR(trans);
3717*4882a593Smuzhiyun 
3718*4882a593Smuzhiyun 	memset(&root->root_item.drop_progress, 0,
3719*4882a593Smuzhiyun 		sizeof(root->root_item.drop_progress));
3720*4882a593Smuzhiyun 	root->root_item.drop_level = 0;
3721*4882a593Smuzhiyun 	btrfs_set_root_refs(&root->root_item, 0);
3722*4882a593Smuzhiyun 	ret = btrfs_update_root(trans, fs_info->tree_root,
3723*4882a593Smuzhiyun 				&root->root_key, &root->root_item);
3724*4882a593Smuzhiyun 
3725*4882a593Smuzhiyun 	err = btrfs_end_transaction(trans);
3726*4882a593Smuzhiyun 	if (err)
3727*4882a593Smuzhiyun 		return err;
3728*4882a593Smuzhiyun 	return ret;
3729*4882a593Smuzhiyun }
3730*4882a593Smuzhiyun 
3731*4882a593Smuzhiyun /*
3732*4882a593Smuzhiyun  * Recover a relocation that was interrupted by a system crash.
3733*4882a593Smuzhiyun  *
3734*4882a593Smuzhiyun  * This function resumes merging reloc trees with their corresponding fs
3735*4882a593Smuzhiyun  * trees, which is important for preserving the sharing of tree blocks.
3736*4882a593Smuzhiyun  */
3737*4882a593Smuzhiyun int btrfs_recover_relocation(struct btrfs_root *root)
3738*4882a593Smuzhiyun {
3739*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = root->fs_info;
3740*4882a593Smuzhiyun 	LIST_HEAD(reloc_roots);
3741*4882a593Smuzhiyun 	struct btrfs_key key;
3742*4882a593Smuzhiyun 	struct btrfs_root *fs_root;
3743*4882a593Smuzhiyun 	struct btrfs_root *reloc_root;
3744*4882a593Smuzhiyun 	struct btrfs_path *path;
3745*4882a593Smuzhiyun 	struct extent_buffer *leaf;
3746*4882a593Smuzhiyun 	struct reloc_control *rc = NULL;
3747*4882a593Smuzhiyun 	struct btrfs_trans_handle *trans;
3748*4882a593Smuzhiyun 	int ret;
3749*4882a593Smuzhiyun 	int err = 0;
3750*4882a593Smuzhiyun 
3751*4882a593Smuzhiyun 	path = btrfs_alloc_path();
3752*4882a593Smuzhiyun 	if (!path)
3753*4882a593Smuzhiyun 		return -ENOMEM;
3754*4882a593Smuzhiyun 	path->reada = READA_BACK;
3755*4882a593Smuzhiyun 
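	/*
	 * Walk all BTRFS_TREE_RELOC_OBJECTID root items in the tree root,
	 * from the highest offset (the source root id) downwards, reading
	 * every reloc tree left behind by the interrupted relocation.
	 */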
3756*4882a593Smuzhiyun 	key.objectid = BTRFS_TREE_RELOC_OBJECTID;
3757*4882a593Smuzhiyun 	key.type = BTRFS_ROOT_ITEM_KEY;
3758*4882a593Smuzhiyun 	key.offset = (u64)-1;
3759*4882a593Smuzhiyun 
3760*4882a593Smuzhiyun 	while (1) {
3761*4882a593Smuzhiyun 		ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
3762*4882a593Smuzhiyun 					path, 0, 0);
3763*4882a593Smuzhiyun 		if (ret < 0) {
3764*4882a593Smuzhiyun 			err = ret;
3765*4882a593Smuzhiyun 			goto out;
3766*4882a593Smuzhiyun 		}
3767*4882a593Smuzhiyun 		if (ret > 0) {
3768*4882a593Smuzhiyun 			if (path->slots[0] == 0)
3769*4882a593Smuzhiyun 				break;
3770*4882a593Smuzhiyun 			path->slots[0]--;
3771*4882a593Smuzhiyun 		}
3772*4882a593Smuzhiyun 		leaf = path->nodes[0];
3773*4882a593Smuzhiyun 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3774*4882a593Smuzhiyun 		btrfs_release_path(path);
3775*4882a593Smuzhiyun 
3776*4882a593Smuzhiyun 		if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
3777*4882a593Smuzhiyun 		    key.type != BTRFS_ROOT_ITEM_KEY)
3778*4882a593Smuzhiyun 			break;
3779*4882a593Smuzhiyun 
3780*4882a593Smuzhiyun 		reloc_root = btrfs_read_tree_root(root, &key);
3781*4882a593Smuzhiyun 		if (IS_ERR(reloc_root)) {
3782*4882a593Smuzhiyun 			err = PTR_ERR(reloc_root);
3783*4882a593Smuzhiyun 			goto out;
3784*4882a593Smuzhiyun 		}
3785*4882a593Smuzhiyun 
3786*4882a593Smuzhiyun 		set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
3787*4882a593Smuzhiyun 		list_add(&reloc_root->root_list, &reloc_roots);
3788*4882a593Smuzhiyun 
3789*4882a593Smuzhiyun 		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
3790*4882a593Smuzhiyun 			fs_root = btrfs_get_fs_root(fs_info,
3791*4882a593Smuzhiyun 					reloc_root->root_key.offset, false);
3792*4882a593Smuzhiyun 			if (IS_ERR(fs_root)) {
3793*4882a593Smuzhiyun 				ret = PTR_ERR(fs_root);
3794*4882a593Smuzhiyun 				if (ret != -ENOENT) {
3795*4882a593Smuzhiyun 					err = ret;
3796*4882a593Smuzhiyun 					goto out;
3797*4882a593Smuzhiyun 				}
3798*4882a593Smuzhiyun 				ret = mark_garbage_root(reloc_root);
3799*4882a593Smuzhiyun 				if (ret < 0) {
3800*4882a593Smuzhiyun 					err = ret;
3801*4882a593Smuzhiyun 					goto out;
3802*4882a593Smuzhiyun 				}
3803*4882a593Smuzhiyun 			} else {
3804*4882a593Smuzhiyun 				btrfs_put_root(fs_root);
3805*4882a593Smuzhiyun 			}
3806*4882a593Smuzhiyun 		}
3807*4882a593Smuzhiyun 
3808*4882a593Smuzhiyun 		if (key.offset == 0)
3809*4882a593Smuzhiyun 			break;
3810*4882a593Smuzhiyun 
3811*4882a593Smuzhiyun 		key.offset--;
3812*4882a593Smuzhiyun 	}
3813*4882a593Smuzhiyun 	btrfs_release_path(path);
3814*4882a593Smuzhiyun 
3815*4882a593Smuzhiyun 	if (list_empty(&reloc_roots))
3816*4882a593Smuzhiyun 		goto out;
3817*4882a593Smuzhiyun 
3818*4882a593Smuzhiyun 	rc = alloc_reloc_control(fs_info);
3819*4882a593Smuzhiyun 	if (!rc) {
3820*4882a593Smuzhiyun 		err = -ENOMEM;
3821*4882a593Smuzhiyun 		goto out;
3822*4882a593Smuzhiyun 	}
3823*4882a593Smuzhiyun 
3824*4882a593Smuzhiyun 	rc->extent_root = fs_info->extent_root;
3825*4882a593Smuzhiyun 
3826*4882a593Smuzhiyun 	set_reloc_control(rc);
3827*4882a593Smuzhiyun 
3828*4882a593Smuzhiyun 	trans = btrfs_join_transaction(rc->extent_root);
3829*4882a593Smuzhiyun 	if (IS_ERR(trans)) {
3830*4882a593Smuzhiyun 		err = PTR_ERR(trans);
3831*4882a593Smuzhiyun 		goto out_unset;
3832*4882a593Smuzhiyun 	}
3833*4882a593Smuzhiyun 
3834*4882a593Smuzhiyun 	rc->merge_reloc_tree = 1;
3835*4882a593Smuzhiyun 
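	/*
	 * Reattach each surviving reloc root to its fs root; roots whose
	 * refs have dropped to 0 are only queued on rc->reloc_roots for
	 * cleanup by merge_reloc_roots().
	 */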
3836*4882a593Smuzhiyun 	while (!list_empty(&reloc_roots)) {
3837*4882a593Smuzhiyun 		reloc_root = list_entry(reloc_roots.next,
3838*4882a593Smuzhiyun 					struct btrfs_root, root_list);
3839*4882a593Smuzhiyun 		list_del(&reloc_root->root_list);
3840*4882a593Smuzhiyun 
3841*4882a593Smuzhiyun 		if (btrfs_root_refs(&reloc_root->root_item) == 0) {
3842*4882a593Smuzhiyun 			list_add_tail(&reloc_root->root_list,
3843*4882a593Smuzhiyun 				      &rc->reloc_roots);
3844*4882a593Smuzhiyun 			continue;
3845*4882a593Smuzhiyun 		}
3846*4882a593Smuzhiyun 
3847*4882a593Smuzhiyun 		fs_root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
3848*4882a593Smuzhiyun 					    false);
3849*4882a593Smuzhiyun 		if (IS_ERR(fs_root)) {
3850*4882a593Smuzhiyun 			err = PTR_ERR(fs_root);
3851*4882a593Smuzhiyun 			list_add_tail(&reloc_root->root_list, &reloc_roots);
3852*4882a593Smuzhiyun 			btrfs_end_transaction(trans);
3853*4882a593Smuzhiyun 			goto out_unset;
3854*4882a593Smuzhiyun 		}
3855*4882a593Smuzhiyun 
3856*4882a593Smuzhiyun 		err = __add_reloc_root(reloc_root);
3857*4882a593Smuzhiyun 		BUG_ON(err < 0); /* -ENOMEM or logic error */
3858*4882a593Smuzhiyun 		fs_root->reloc_root = btrfs_grab_root(reloc_root);
3859*4882a593Smuzhiyun 		btrfs_put_root(fs_root);
3860*4882a593Smuzhiyun 	}
3861*4882a593Smuzhiyun 
3862*4882a593Smuzhiyun 	err = btrfs_commit_transaction(trans);
3863*4882a593Smuzhiyun 	if (err)
3864*4882a593Smuzhiyun 		goto out_unset;
3865*4882a593Smuzhiyun 
3866*4882a593Smuzhiyun 	merge_reloc_roots(rc);
3867*4882a593Smuzhiyun 
3868*4882a593Smuzhiyun 	unset_reloc_control(rc);
3869*4882a593Smuzhiyun 
3870*4882a593Smuzhiyun 	trans = btrfs_join_transaction(rc->extent_root);
3871*4882a593Smuzhiyun 	if (IS_ERR(trans)) {
3872*4882a593Smuzhiyun 		err = PTR_ERR(trans);
3873*4882a593Smuzhiyun 		goto out_clean;
3874*4882a593Smuzhiyun 	}
3875*4882a593Smuzhiyun 	err = btrfs_commit_transaction(trans);
3876*4882a593Smuzhiyun out_clean:
3877*4882a593Smuzhiyun 	ret = clean_dirty_subvols(rc);
3878*4882a593Smuzhiyun 	if (ret < 0 && !err)
3879*4882a593Smuzhiyun 		err = ret;
3880*4882a593Smuzhiyun out_unset:
3881*4882a593Smuzhiyun 	unset_reloc_control(rc);
3882*4882a593Smuzhiyun 	free_reloc_control(rc);
3883*4882a593Smuzhiyun out:
3884*4882a593Smuzhiyun 	free_reloc_roots(&reloc_roots);
3885*4882a593Smuzhiyun 
3886*4882a593Smuzhiyun 	btrfs_free_path(path);
3887*4882a593Smuzhiyun 
3888*4882a593Smuzhiyun 	if (err == 0) {
3889*4882a593Smuzhiyun 		/* Clean up the orphan inode in the data relocation tree. */
3890*4882a593Smuzhiyun 		fs_root = btrfs_grab_root(fs_info->data_reloc_root);
3891*4882a593Smuzhiyun 		ASSERT(fs_root);
3892*4882a593Smuzhiyun 		err = btrfs_orphan_cleanup(fs_root);
3893*4882a593Smuzhiyun 		btrfs_put_root(fs_root);
3894*4882a593Smuzhiyun 	}
3895*4882a593Smuzhiyun 	return err;
3896*4882a593Smuzhiyun }
3897*4882a593Smuzhiyun 
3898*4882a593Smuzhiyun /*
3899*4882a593Smuzhiyun  * Helper to add ordered checksums for data relocation.
3900*4882a593Smuzhiyun  *
3901*4882a593Smuzhiyun  * Cloning the existing checksums properly handles nodatasum extents and
3902*4882a593Smuzhiyun  * saves the CPU time of recalculating them.
3903*4882a593Smuzhiyun  */
3904*4882a593Smuzhiyun int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len)
3905*4882a593Smuzhiyun {
3906*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
3907*4882a593Smuzhiyun 	struct btrfs_ordered_sum *sums;
3908*4882a593Smuzhiyun 	struct btrfs_ordered_extent *ordered;
3909*4882a593Smuzhiyun 	int ret;
3910*4882a593Smuzhiyun 	u64 disk_bytenr;
3911*4882a593Smuzhiyun 	u64 new_bytenr;
3912*4882a593Smuzhiyun 	LIST_HEAD(list);
3913*4882a593Smuzhiyun 
3914*4882a593Smuzhiyun 	ordered = btrfs_lookup_ordered_extent(inode, file_pos);
3915*4882a593Smuzhiyun 	BUG_ON(ordered->file_offset != file_pos || ordered->num_bytes != len);
3916*4882a593Smuzhiyun 
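	/*
	 * The data reloc inode maps file offsets 1:1 onto the original disk
	 * bytenrs: index_cnt holds the block group start, so file_pos +
	 * index_cnt is the old disk bytenr whose csums we clone.
	 */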
3917*4882a593Smuzhiyun 	disk_bytenr = file_pos + inode->index_cnt;
3918*4882a593Smuzhiyun 	ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr,
3919*4882a593Smuzhiyun 				       disk_bytenr + len - 1, &list, 0);
3920*4882a593Smuzhiyun 	if (ret)
3921*4882a593Smuzhiyun 		goto out;
3922*4882a593Smuzhiyun 
3923*4882a593Smuzhiyun 	while (!list_empty(&list)) {
3924*4882a593Smuzhiyun 		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
3925*4882a593Smuzhiyun 		list_del_init(&sums->list);
3926*4882a593Smuzhiyun 
3927*4882a593Smuzhiyun 		/*
3928*4882a593Smuzhiyun 		 * We need to offset the new_bytenr based on where the csum is.
3929*4882a593Smuzhiyun 		 * We need to do this because we will read in entire prealloc
3930*4882a593Smuzhiyun 		 * extents but we may have written to, say, the middle of the
3931*4882a593Smuzhiyun 		 * prealloc extent, so we need to make sure the csum goes with
3932*4882a593Smuzhiyun 		 * the right disk offset.
3933*4882a593Smuzhiyun 		 *
3934*4882a593Smuzhiyun 		 * We can do this because the data reloc inode refers strictly
3935*4882a593Smuzhiyun 		 * to the on disk bytes, so we don't have to worry about
3936*4882a593Smuzhiyun 		 * disk_len vs real len like with real inodes since it's all
3937*4882a593Smuzhiyun 		 * disk length.
3938*4882a593Smuzhiyun 		 */
3939*4882a593Smuzhiyun 		new_bytenr = ordered->disk_bytenr + sums->bytenr - disk_bytenr;
3940*4882a593Smuzhiyun 		sums->bytenr = new_bytenr;
3941*4882a593Smuzhiyun 
3942*4882a593Smuzhiyun 		btrfs_add_ordered_sum(ordered, sums);
3943*4882a593Smuzhiyun 	}
3944*4882a593Smuzhiyun out:
3945*4882a593Smuzhiyun 	btrfs_put_ordered_extent(ordered);
3946*4882a593Smuzhiyun 	return ret;
3947*4882a593Smuzhiyun }
3948*4882a593Smuzhiyun 
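/*
 * Called when a tree block is COWed.  If the block belongs to a reloc tree
 * being built, point the cached backref node at the new copy; during the
 * UPDATE_DATA_PTRS stage also rewrite the file extent pointers in the new
 * leaf.
 */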
3949*4882a593Smuzhiyun int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
3950*4882a593Smuzhiyun 			  struct btrfs_root *root, struct extent_buffer *buf,
3951*4882a593Smuzhiyun 			  struct extent_buffer *cow)
3952*4882a593Smuzhiyun {
3953*4882a593Smuzhiyun 	struct btrfs_fs_info *fs_info = root->fs_info;
3954*4882a593Smuzhiyun 	struct reloc_control *rc;
3955*4882a593Smuzhiyun 	struct btrfs_backref_node *node;
3956*4882a593Smuzhiyun 	int first_cow = 0;
3957*4882a593Smuzhiyun 	int level;
3958*4882a593Smuzhiyun 	int ret = 0;
3959*4882a593Smuzhiyun 
3960*4882a593Smuzhiyun 	rc = fs_info->reloc_ctl;
3961*4882a593Smuzhiyun 	if (!rc)
3962*4882a593Smuzhiyun 		return 0;
3963*4882a593Smuzhiyun 
3964*4882a593Smuzhiyun 	BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
3965*4882a593Smuzhiyun 	       root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
3966*4882a593Smuzhiyun 
3967*4882a593Smuzhiyun 	level = btrfs_header_level(buf);
3968*4882a593Smuzhiyun 	if (btrfs_header_generation(buf) <=
3969*4882a593Smuzhiyun 	    btrfs_root_last_snapshot(&root->root_item))
3970*4882a593Smuzhiyun 		first_cow = 1;
3971*4882a593Smuzhiyun 
3972*4882a593Smuzhiyun 	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
3973*4882a593Smuzhiyun 	    rc->create_reloc_tree) {
3974*4882a593Smuzhiyun 		WARN_ON(!first_cow && level == 0);
3975*4882a593Smuzhiyun 
3976*4882a593Smuzhiyun 		node = rc->backref_cache.path[level];
3977*4882a593Smuzhiyun 		BUG_ON(node->bytenr != buf->start &&
3978*4882a593Smuzhiyun 		       node->new_bytenr != buf->start);
3979*4882a593Smuzhiyun 
3980*4882a593Smuzhiyun 		btrfs_backref_drop_node_buffer(node);
3981*4882a593Smuzhiyun 		atomic_inc(&cow->refs);
3982*4882a593Smuzhiyun 		node->eb = cow;
3983*4882a593Smuzhiyun 		node->new_bytenr = cow->start;
3984*4882a593Smuzhiyun 
3985*4882a593Smuzhiyun 		if (!node->pending) {
3986*4882a593Smuzhiyun 			list_move_tail(&node->list,
3987*4882a593Smuzhiyun 				       &rc->backref_cache.pending[level]);
3988*4882a593Smuzhiyun 			node->pending = 1;
3989*4882a593Smuzhiyun 		}
3990*4882a593Smuzhiyun 
3991*4882a593Smuzhiyun 		if (first_cow)
3992*4882a593Smuzhiyun 			mark_block_processed(rc, node);
3993*4882a593Smuzhiyun 
3994*4882a593Smuzhiyun 		if (first_cow && level > 0)
3995*4882a593Smuzhiyun 			rc->nodes_relocated += buf->len;
3996*4882a593Smuzhiyun 	}
3997*4882a593Smuzhiyun 
3998*4882a593Smuzhiyun 	if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
3999*4882a593Smuzhiyun 		ret = replace_file_extents(trans, rc, root, cow);
4000*4882a593Smuzhiyun 	return ret;
4001*4882a593Smuzhiyun }
4002*4882a593Smuzhiyun 
4003*4882a593Smuzhiyun /*
4004*4882a593Smuzhiyun  * Called before creating a snapshot. It calculates the metadata reservation
4005*4882a593Smuzhiyun  * required for relocating tree blocks in the snapshot.
4006*4882a593Smuzhiyun  */
4007*4882a593Smuzhiyun void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
4008*4882a593Smuzhiyun 			      u64 *bytes_to_reserve)
4009*4882a593Smuzhiyun {
4010*4882a593Smuzhiyun 	struct btrfs_root *root = pending->root;
4011*4882a593Smuzhiyun 	struct reloc_control *rc = root->fs_info->reloc_ctl;
4012*4882a593Smuzhiyun 
4013*4882a593Smuzhiyun 	if (!rc || !have_reloc_root(root))
4014*4882a593Smuzhiyun 		return;
4015*4882a593Smuzhiyun 
4016*4882a593Smuzhiyun 	if (!rc->merge_reloc_tree)
4017*4882a593Smuzhiyun 		return;
4018*4882a593Smuzhiyun 
4019*4882a593Smuzhiyun 	root = root->reloc_root;
4020*4882a593Smuzhiyun 	BUG_ON(btrfs_root_refs(&root->root_item) == 0);
4021*4882a593Smuzhiyun 	/*
4022*4882a593Smuzhiyun 	 * Relocation is in the stage of merging trees. The space
4023*4882a593Smuzhiyun 	 * used by merging a reloc tree is twice the size of the
4024*4882a593Smuzhiyun 	 * relocated tree nodes in the worst case: half for COWing
4025*4882a593Smuzhiyun 	 * the reloc tree, half for COWing the fs tree. The space
4026*4882a593Smuzhiyun 	 * used by COWing the reloc tree will be freed after the
4027*4882a593Smuzhiyun 	 * tree is dropped. If we create a snapshot, COWing the fs
4028*4882a593Smuzhiyun 	 * tree may use more space than it frees, so we need to
4029*4882a593Smuzhiyun 	 * reserve extra space.
4030*4882a593Smuzhiyun 	 */
4031*4882a593Smuzhiyun 	*bytes_to_reserve += rc->nodes_relocated;
4032*4882a593Smuzhiyun }
4033*4882a593Smuzhiyun 
4034*4882a593Smuzhiyun /*
4035*4882a593Smuzhiyun  * Called after the snapshot is created. It migrates the block reservation
4036*4882a593Smuzhiyun  * and creates a reloc root for the newly created snapshot.
4037*4882a593Smuzhiyun  *
4038*4882a593Smuzhiyun  * This is similar to btrfs_init_reloc_root(); we come out of here with two
4039*4882a593Smuzhiyun  * references held on the reloc_root, one for root->reloc_root and one for
4040*4882a593Smuzhiyun  * rc->reloc_roots.
4041*4882a593Smuzhiyun  */
4042*4882a593Smuzhiyun int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
4043*4882a593Smuzhiyun 			       struct btrfs_pending_snapshot *pending)
4044*4882a593Smuzhiyun {
4045*4882a593Smuzhiyun 	struct btrfs_root *root = pending->root;
4046*4882a593Smuzhiyun 	struct btrfs_root *reloc_root;
4047*4882a593Smuzhiyun 	struct btrfs_root *new_root;
4048*4882a593Smuzhiyun 	struct reloc_control *rc = root->fs_info->reloc_ctl;
4049*4882a593Smuzhiyun 	int ret;
4050*4882a593Smuzhiyun 
4051*4882a593Smuzhiyun 	if (!rc || !have_reloc_root(root))
4052*4882a593Smuzhiyun 		return 0;
4053*4882a593Smuzhiyun 
4054*4882a593Smuzhiyun 	rc = root->fs_info->reloc_ctl;
4055*4882a593Smuzhiyun 	rc->merging_rsv_size += rc->nodes_relocated;
4056*4882a593Smuzhiyun 
4057*4882a593Smuzhiyun 	if (rc->merge_reloc_tree) {
4058*4882a593Smuzhiyun 		ret = btrfs_block_rsv_migrate(&pending->block_rsv,
4059*4882a593Smuzhiyun 					      rc->block_rsv,
4060*4882a593Smuzhiyun 					      rc->nodes_relocated, true);
4061*4882a593Smuzhiyun 		if (ret)
4062*4882a593Smuzhiyun 			return ret;
4063*4882a593Smuzhiyun 	}
4064*4882a593Smuzhiyun 
4065*4882a593Smuzhiyun 	new_root = pending->snap;
4066*4882a593Smuzhiyun 	reloc_root = create_reloc_root(trans, root->reloc_root,
4067*4882a593Smuzhiyun 				       new_root->root_key.objectid);
4068*4882a593Smuzhiyun 	if (IS_ERR(reloc_root))
4069*4882a593Smuzhiyun 		return PTR_ERR(reloc_root);
4070*4882a593Smuzhiyun 
4071*4882a593Smuzhiyun 	ret = __add_reloc_root(reloc_root);
4072*4882a593Smuzhiyun 	BUG_ON(ret < 0);
4073*4882a593Smuzhiyun 	new_root->reloc_root = btrfs_grab_root(reloc_root);
4074*4882a593Smuzhiyun 
4075*4882a593Smuzhiyun 	if (rc->create_reloc_tree)
4076*4882a593Smuzhiyun 		ret = clone_backref_node(trans, rc, root, reloc_root);
4077*4882a593Smuzhiyun 	return ret;
4078*4882a593Smuzhiyun }
4079