/*
 * Copyright (C) 2011 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-btree-internal.h"
#include "dm-space-map.h"
#include "dm-transaction-manager.h"

#include <linux/export.h>
#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "btree"

/*----------------------------------------------------------------
 * Array manipulation
 *--------------------------------------------------------------*/
static void memcpy_disk(void *dest, const void *src, size_t len)
	__dm_written_to_disk(src)
{
	memcpy(dest, src, len);
	__dm_unbless_for_disk(src);
}

static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
			 unsigned index, void *elt)
	__dm_written_to_disk(elt)
{
	if (index < nr_elts)
		memmove(base + (elt_size * (index + 1)),
			base + (elt_size * index),
			(nr_elts - index) * elt_size);

	memcpy_disk(base + (elt_size * index), elt, elt_size);
}

/*----------------------------------------------------------------*/

/* makes the assumption that no two keys are the same. */
static int bsearch(struct btree_node *n, uint64_t key, int want_hi)
{
	int lo = -1, hi = le32_to_cpu(n->header.nr_entries);

	while (hi - lo > 1) {
		int mid = lo + ((hi - lo) / 2);
		uint64_t mid_key = le64_to_cpu(n->keys[mid]);

		if (mid_key == key)
			return mid;

		if (mid_key < key)
			lo = mid;
		else
			hi = mid;
	}

	return want_hi ? hi : lo;
}

int lower_bound(struct btree_node *n, uint64_t key)
{
	return bsearch(n, key, 0);
}

static int upper_bound(struct btree_node *n, uint64_t key)
{
	return bsearch(n, key, 1);
}
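
/*
 * Added commentary (not in the original file): with leaf keys
 * { 10, 20, 30 } and a probe key of 25, the loop ends with lo = 1 and
 * hi = 2, so lower_bound() returns 1 (the last key <= 25) and
 * upper_bound() returns 2 (the first key >= 25).  An exact match
 * returns its own index for both bounds; a key below every entry makes
 * lower_bound() return -1, and one above every entry makes
 * upper_bound() return nr_entries.
 */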

void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
		  struct dm_btree_value_type *vt)
{
	unsigned i;
	uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);

	if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
		for (i = 0; i < nr_entries; i++)
			dm_tm_inc(tm, value64(n, i));
	else if (vt->inc)
		for (i = 0; i < nr_entries; i++)
			vt->inc(vt->context, value_ptr(n, i));
}

static int insert_at(size_t value_size, struct btree_node *node, unsigned index,
		     uint64_t key, void *value)
	__dm_written_to_disk(value)
{
	uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
	uint32_t max_entries = le32_to_cpu(node->header.max_entries);
	__le64 key_le = cpu_to_le64(key);

	if (index > nr_entries ||
	    index >= max_entries ||
	    nr_entries >= max_entries) {
		DMERR("too many entries in btree node for insert");
		__dm_unbless_for_disk(value);
		return -ENOMEM;
	}

	__dm_bless_for_disk(&key_le);

	array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
	array_insert(value_base(node), value_size, nr_entries, index, value);
	node->header.nr_entries = cpu_to_le32(nr_entries + 1);

	return 0;
}

/*----------------------------------------------------------------*/

/*
 * We want 3n entries (for some n).  This works more nicely for repeated
 * insert/remove loops than (2n + 1).
 */
static uint32_t calc_max_entries(size_t value_size, size_t block_size)
{
	uint32_t total, n;
	size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */

	block_size -= sizeof(struct node_header);
	total = block_size / elt_size;
	n = total / 3;		/* rounds down */

	return 3 * n;
}
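
/*
 * Worked example (a sketch, assuming sizeof(struct node_header) == 32
 * and a 4096-byte block): with 8-byte values, elt_size = 16, so
 * total = (4096 - 32) / 16 = 254, n = 254 / 3 = 84, and the node holds
 * up to 3 * 84 = 252 entries.
 */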

int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
{
	int r;
	struct dm_block *b;
	struct btree_node *n;
	size_t block_size;
	uint32_t max_entries;

	r = new_block(info, &b);
	if (r < 0)
		return r;

	block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
	max_entries = calc_max_entries(info->value_type.size, block_size);

	n = dm_block_data(b);
	memset(n, 0, block_size);
	n->header.flags = cpu_to_le32(LEAF_NODE);
	n->header.nr_entries = cpu_to_le32(0);
	n->header.max_entries = cpu_to_le32(max_entries);
	n->header.value_size = cpu_to_le32(info->value_type.size);

	*root = dm_block_location(b);
	unlock_block(info, b);

	return 0;
}
EXPORT_SYMBOL_GPL(dm_btree_empty);
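
/*
 * Usage sketch (illustrative, not from the original file): creating an
 * empty single-level tree inside an open transaction.  The field values
 * below are assumptions about a typical caller.
 *
 *	struct dm_btree_info info = {
 *		.tm = tm,			// open dm_transaction_manager
 *		.levels = 1,
 *		.value_type = {
 *			.size = sizeof(__le64),	// packed little-endian values
 *		},
 *	};
 *	dm_block_t root;
 *	int r = dm_btree_empty(&info, &root);
 *	// on success, record 'root' (e.g. in a superblock) before commit
 */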

/*----------------------------------------------------------------*/

/*
 * Deletion uses a recursive algorithm; since we have limited stack space
 * we explicitly manage our own stack on the heap.
 */
#define MAX_SPINE_DEPTH 64
struct frame {
	struct dm_block *b;
	struct btree_node *n;
	unsigned level;
	unsigned nr_children;
	unsigned current_child;
};

struct del_stack {
	struct dm_btree_info *info;
	struct dm_transaction_manager *tm;
	int top;
	struct frame spine[MAX_SPINE_DEPTH];
};

static int top_frame(struct del_stack *s, struct frame **f)
{
	if (s->top < 0) {
		DMERR("btree deletion stack empty");
		return -EINVAL;
	}

	*f = s->spine + s->top;

	return 0;
}

static int unprocessed_frames(struct del_stack *s)
{
	return s->top >= 0;
}

static void prefetch_children(struct del_stack *s, struct frame *f)
{
	unsigned i;
	struct dm_block_manager *bm = dm_tm_get_bm(s->tm);

	for (i = 0; i < f->nr_children; i++)
		dm_bm_prefetch(bm, value64(f->n, i));
}

static bool is_internal_level(struct dm_btree_info *info, struct frame *f)
{
	return f->level < (info->levels - 1);
}

static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
{
	int r;
	uint32_t ref_count;

	if (s->top >= MAX_SPINE_DEPTH - 1) {
		DMERR("btree deletion stack out of memory");
		return -ENOMEM;
	}

	r = dm_tm_ref(s->tm, b, &ref_count);
	if (r)
		return r;

	if (ref_count > 1)
		/*
		 * This is a shared node, so we can just decrement its
		 * reference counter and leave the children.
		 */
		dm_tm_dec(s->tm, b);

	else {
		uint32_t flags;
		struct frame *f = s->spine + ++s->top;

		r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
		if (r) {
			s->top--;
			return r;
		}

		f->n = dm_block_data(f->b);
		f->level = level;
		f->nr_children = le32_to_cpu(f->n->header.nr_entries);
		f->current_child = 0;

		flags = le32_to_cpu(f->n->header.flags);
		if (flags & INTERNAL_NODE || is_internal_level(s->info, f))
			prefetch_children(s, f);
	}

	return 0;
}

static void pop_frame(struct del_stack *s)
{
	struct frame *f = s->spine + s->top--;

	dm_tm_dec(s->tm, dm_block_location(f->b));
	dm_tm_unlock(s->tm, f->b);
}

static void unlock_all_frames(struct del_stack *s)
{
	struct frame *f;

	while (unprocessed_frames(s)) {
		f = s->spine + s->top--;
		dm_tm_unlock(s->tm, f->b);
	}
}

int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
{
	int r;
	struct del_stack *s;

	/*
	 * dm_btree_del() is called via an ioctl, and as such should be
	 * considered an FS op.  We can't recurse back into the FS, so we
	 * allocate GFP_NOFS.
	 */
	s = kmalloc(sizeof(*s), GFP_NOFS);
	if (!s)
		return -ENOMEM;
	s->info = info;
	s->tm = info->tm;
	s->top = -1;

	r = push_frame(s, root, 0);
	if (r)
		goto out;

	while (unprocessed_frames(s)) {
		uint32_t flags;
		struct frame *f;
		dm_block_t b;

		r = top_frame(s, &f);
		if (r)
			goto out;

		if (f->current_child >= f->nr_children) {
			pop_frame(s);
			continue;
		}

		flags = le32_to_cpu(f->n->header.flags);
		if (flags & INTERNAL_NODE) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level);
			if (r)
				goto out;

		} else if (is_internal_level(info, f)) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level + 1);
			if (r)
				goto out;

		} else {
			if (info->value_type.dec) {
				unsigned i;

				for (i = 0; i < f->nr_children; i++)
					info->value_type.dec(info->value_type.context,
							     value_ptr(f->n, i));
			}
			pop_frame(s);
		}
	}
out:
	if (r) {
		/* cleanup all frames of del_stack */
		unlock_all_frames(s);
	}
	kfree(s);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_del);
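
/*
 * Added commentary (not in the original file): dm_btree_del() walks the
 * tree depth first using the explicit del_stack above, so a tree deeper
 * than MAX_SPINE_DEPTH fails with -ENOMEM instead of overflowing the
 * kernel stack.  A subtree whose root block is shared (ref_count > 1)
 * only has its reference count dropped; its children are left alone.
 */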

/*----------------------------------------------------------------*/

static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
			    int (*search_fn)(struct btree_node *, uint64_t),
			    uint64_t *result_key, void *v, size_t value_size)
{
	int i, r;
	uint32_t flags, nr_entries;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		i = search_fn(ro_node(s), key);

		flags = le32_to_cpu(ro_node(s)->header.flags);
		nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (i < 0 || i >= nr_entries)
			return -ENODATA;

		if (flags & INTERNAL_NODE)
			block = value64(ro_node(s), i);

	} while (!(flags & LEAF_NODE));

	*result_key = le64_to_cpu(ro_node(s)->keys[i]);
	if (v)
		memcpy(v, value_ptr(ro_node(s), i), value_size);

	return 0;
}

int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value_le)
{
	unsigned level, last_level = info->levels - 1;
	int r = -ENODATA;
	uint64_t rkey;
	__le64 internal_value_le;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		size_t size;
		void *value_p;

		if (level == last_level) {
			value_p = value_le;
			size = info->value_type.size;

		} else {
			value_p = &internal_value_le;
			size = sizeof(uint64_t);
		}

		r = btree_lookup_raw(&spine, root, keys[level],
				     lower_bound, &rkey,
				     value_p, size);

		if (!r) {
			if (rkey != keys[level]) {
				exit_ro_spine(&spine);
				return -ENODATA;
			}
		} else {
			exit_ro_spine(&spine);
			return r;
		}

		root = le64_to_cpu(internal_value_le);
	}
	exit_ro_spine(&spine);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup);
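
/*
 * Usage sketch (illustrative; the key meanings are hypothetical):
 * looking up a value in a two-level tree, where keys[0] selects the
 * sub-tree and keys[1] selects the entry within it.
 *
 *	uint64_t keys[2] = { dev_id, block_nr };
 *	__le64 value_le;
 *	int r = dm_btree_lookup(&info, root, keys, &value_le);
 *	if (r == -ENODATA)
 *		;	// no mapping for this key pair
 */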

static int dm_btree_lookup_next_single(struct dm_btree_info *info, dm_block_t root,
				       uint64_t key, uint64_t *rkey, void *value_le)
{
	int r, i;
	uint32_t flags, nr_entries;
	struct dm_block *node;
	struct btree_node *n;

	r = bn_read_lock(info, root, &node);
	if (r)
		return r;

	n = dm_block_data(node);
	flags = le32_to_cpu(n->header.flags);
	nr_entries = le32_to_cpu(n->header.nr_entries);

	if (flags & INTERNAL_NODE) {
		i = lower_bound(n, key);
		if (i < 0) {
			/*
			 * avoid early -ENODATA return when all entries are
			 * higher than the search @key.
			 */
			i = 0;
		}
		if (i >= nr_entries) {
			r = -ENODATA;
			goto out;
		}

		r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
		if (r == -ENODATA && i < (nr_entries - 1)) {
			i++;
			r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
		}

	} else {
		i = upper_bound(n, key);
		if (i < 0 || i >= nr_entries) {
			r = -ENODATA;
			goto out;
		}

		*rkey = le64_to_cpu(n->keys[i]);
		memcpy(value_le, value_ptr(n, i), info->value_type.size);
	}
out:
	dm_tm_unlock(info->tm, node);
	return r;
}

int dm_btree_lookup_next(struct dm_btree_info *info, dm_block_t root,
			 uint64_t *keys, uint64_t *rkey, void *value_le)
{
	unsigned level;
	int r = -ENODATA;
	__le64 internal_value_le;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels - 1u; level++) {
		r = btree_lookup_raw(&spine, root, keys[level],
				     lower_bound, rkey,
				     &internal_value_le, sizeof(uint64_t));
		if (r)
			goto out;

		if (*rkey != keys[level]) {
			r = -ENODATA;
			goto out;
		}

		root = le64_to_cpu(internal_value_le);
	}

	r = dm_btree_lookup_next_single(info, root, keys[level], rkey, value_le);
out:
	exit_ro_spine(&spine);
	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup_next);
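
/*
 * Added commentary (not in the original file): at the leaf level
 * upper_bound() returns either an exact match or the first larger key,
 * so dm_btree_lookup_next() reports the first mapped key that is >= the
 * requested bottom-level key, while any higher levels must still match
 * exactly.
 */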

/*
 * Splits a node by creating a sibling node and shifting half the node's
 * contents across.  Assumes there is a parent node, and it has room for
 * another child.
 *
 * Before:
 *	  +--------+
 *	  | Parent |
 *	  +--------+
 *	     |
 *	     v
 *	+----------+
 *	| A ++++++ |
 *	+----------+
 *
 *
 * After:
 *		+--------+
 *		| Parent |
 *		+--------+
 *		  |	|
 *		  v	+------+
 *	    +---------+	       |
 *	    | A* +++  |	       v
 *	    +---------+	  +-------+
 *			  | B +++ |
 *			  +-------+
 *
 * Where A* is a shadow of A.
 */
static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index,
			       uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *parent;
	struct btree_node *ln, *rn, *pn;
	__le64 location;

	left = shadow_current(s);

	r = new_block(s->info, &right);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	rn = dm_block_data(right);

	nr_left = le32_to_cpu(ln->header.nr_entries) / 2;
	nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left;

	ln->header.nr_entries = cpu_to_le32(nr_left);

	rn->header.flags = ln->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = ln->header.max_entries;
	rn->header.value_size = ln->header.value_size;
	memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0]));

	size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
		sizeof(uint64_t) : s->info->value_type.size;
	memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left),
	       size * nr_right);

	/*
	 * Patch up the parent
	 */
	parent = shadow_parent(s);

	pn = dm_block_data(parent);
	location = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&location);
	memcpy_disk(value_ptr(pn, parent_index),
		    &location, sizeof(__le64));

	location = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&location);

	r = insert_at(sizeof(__le64), pn, parent_index + 1,
		      le64_to_cpu(rn->keys[0]), &location);
	if (r) {
		unlock_block(s->info, right);
		return r;
	}

	if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else {
		unlock_block(s->info, left);
		s->nodes[1] = right;
	}

	return 0;
}

/*
 * Splits a node by creating two new children beneath the given node.
 *
 * Before:
 *	  +----------+
 *	  | A ++++++ |
 *	  +----------+
 *
 *
 * After:
 *	+------------+
 *	| A (shadow) |
 *	+------------+
 *	    |	|
 *   +------+	+----+
 *   |		     |
 *   v		     v
 * +-------+	 +-------+
 * | B +++ |	 | C +++ |
 * +-------+	 +-------+
 */
static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *new_parent;
	struct btree_node *pn, *ln, *rn;
	__le64 val;

	new_parent = shadow_current(s);

	pn = dm_block_data(new_parent);
	size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
		sizeof(__le64) : s->info->value_type.size;

	/* create & init the left block */
	r = new_block(s->info, &left);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	nr_left = le32_to_cpu(pn->header.nr_entries) / 2;

	ln->header.flags = pn->header.flags;
	ln->header.nr_entries = cpu_to_le32(nr_left);
	ln->header.max_entries = pn->header.max_entries;
	ln->header.value_size = pn->header.value_size;
	memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
	memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);

	/* create & init the right block */
	r = new_block(s->info, &right);
	if (r < 0) {
		unlock_block(s->info, left);
		return r;
	}

	rn = dm_block_data(right);
	nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;

	rn->header.flags = pn->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = pn->header.max_entries;
	rn->header.value_size = pn->header.value_size;
	memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));
	memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
	       nr_right * size);

	/* new_parent should just point to l and r now */
	pn->header.flags = cpu_to_le32(INTERNAL_NODE);
	pn->header.nr_entries = cpu_to_le32(2);
	pn->header.max_entries = cpu_to_le32(
		calc_max_entries(sizeof(__le64),
				 dm_bm_block_size(
					 dm_tm_get_bm(s->info->tm))));
	pn->header.value_size = cpu_to_le32(sizeof(__le64));

	val = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&val);
	pn->keys[0] = ln->keys[0];
	memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));

	val = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&val);
	pn->keys[1] = rn->keys[0];
	memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));

	unlock_block(s->info, left);
	unlock_block(s->info, right);
	return 0;
}
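
/*
 * Added commentary (an inference, not from the original file): splitting
 * beneath the top of the spine, rather than giving that node a sibling,
 * keeps the root at the same block location, so the caller's root
 * reference stays valid and only the root's contents change.
 */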

static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
			    struct dm_btree_value_type *vt,
			    uint64_t key, unsigned *index)
{
	int r, i = *index, top = 1;
	struct btree_node *node;

	for (;;) {
		r = shadow_step(s, root, vt);
		if (r < 0)
			return r;

		node = dm_block_data(shadow_current(s));

		/*
		 * We have to patch up the parent node; ugly, but I don't
		 * see a way to do this automatically as part of the spine
		 * op.
		 */
		if (shadow_has_parent(s) && i >= 0) { /* FIXME: second clause unnecessary. */
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

			__dm_bless_for_disk(&location);
			memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
				    &location, sizeof(__le64));
		}

		node = dm_block_data(shadow_current(s));

		if (node->header.nr_entries == node->header.max_entries) {
			if (top)
				r = btree_split_beneath(s, key);
			else
				r = btree_split_sibling(s, i, key);

			if (r < 0)
				return r;
		}

		node = dm_block_data(shadow_current(s));

		i = lower_bound(node, key);

		if (le32_to_cpu(node->header.flags) & LEAF_NODE)
			break;

		if (i < 0) {
			/* change the bounds on the lowest key */
			node->keys[0] = cpu_to_le64(key);
			i = 0;
		}

		root = value64(node, i);
		top = 0;
	}

	if (i < 0 || le64_to_cpu(node->keys[i]) != key)
		i++;

	*index = i;
	return 0;
}

static bool need_insert(struct btree_node *node, uint64_t *keys,
			unsigned level, unsigned index)
{
	return ((index >= le32_to_cpu(node->header.nr_entries)) ||
		(le64_to_cpu(node->keys[index]) != keys[level]));
}

static int insert(struct dm_btree_info *info, dm_block_t root,
		  uint64_t *keys, void *value, dm_block_t *new_root,
		  int *inserted)
		  __dm_written_to_disk(value)
{
	int r;
	unsigned level, index = -1, last_level = info->levels - 1;
	dm_block_t block = root;
	struct shadow_spine spine;
	struct btree_node *n;
	struct dm_btree_value_type le64_type;

	init_le64_type(info->tm, &le64_type);
	init_shadow_spine(&spine, info);

	for (level = 0; level < (info->levels - 1); level++) {
		r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
		if (r < 0)
			goto bad;

		n = dm_block_data(shadow_current(&spine));

		if (need_insert(n, keys, level, index)) {
			dm_block_t new_tree;
			__le64 new_le;

			r = dm_btree_empty(info, &new_tree);
			if (r < 0)
				goto bad;

			new_le = cpu_to_le64(new_tree);
			__dm_bless_for_disk(&new_le);

			r = insert_at(sizeof(uint64_t), n, index,
				      keys[level], &new_le);
			if (r)
				goto bad;
		}

		if (level < last_level)
			block = value64(n, index);
	}

	r = btree_insert_raw(&spine, block, &info->value_type,
			     keys[level], &index);
	if (r < 0)
		goto bad;

	n = dm_block_data(shadow_current(&spine));

	if (need_insert(n, keys, level, index)) {
		if (inserted)
			*inserted = 1;

		r = insert_at(info->value_type.size, n, index,
			      keys[level], value);
		if (r)
			goto bad_unblessed;
	} else {
		if (inserted)
			*inserted = 0;

		if (info->value_type.dec &&
		    (!info->value_type.equal ||
		     !info->value_type.equal(
			     info->value_type.context,
			     value_ptr(n, index),
			     value))) {
			info->value_type.dec(info->value_type.context,
					     value_ptr(n, index));
		}
		memcpy_disk(value_ptr(n, index),
			    value, info->value_type.size);
	}

	*new_root = shadow_root(&spine);
	exit_shadow_spine(&spine);

	return 0;

bad:
	__dm_unbless_for_disk(value);
bad_unblessed:
	exit_shadow_spine(&spine);
	return r;
}

int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value, dm_block_t *new_root)
		    __dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, NULL);
}
EXPORT_SYMBOL_GPL(dm_btree_insert);

int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
			   uint64_t *keys, void *value, dm_block_t *new_root,
			   int *inserted)
			   __dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, inserted);
}
EXPORT_SYMBOL_GPL(dm_btree_insert_notify);
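
/*
 * Usage sketch (illustrative; 'data_block' is a hypothetical value):
 * inserting a packed little-endian value.  The tree is copy-on-write,
 * so the caller must carry on with the returned new_root.
 *
 *	__le64 value_le = cpu_to_le64(data_block);
 *	dm_block_t new_root;
 *	int r = dm_btree_insert(&info, root, &key, &value_le, &new_root);
 *	if (!r)
 *		root = new_root;
 */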

/*----------------------------------------------------------------*/

static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest,
		    uint64_t *result_key, dm_block_t *next_block)
{
	int i, r;
	uint32_t flags;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		flags = le32_to_cpu(ro_node(s)->header.flags);
		i = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (!i)
			return -ENODATA;
		else
			i--;

		if (find_highest)
			*result_key = le64_to_cpu(ro_node(s)->keys[i]);
		else
			*result_key = le64_to_cpu(ro_node(s)->keys[0]);

		if (next_block || flags & INTERNAL_NODE) {
			if (find_highest)
				block = value64(ro_node(s), i);
			else
				block = value64(ro_node(s), 0);
		}

	} while (flags & INTERNAL_NODE);

	if (next_block)
		*next_block = block;
	return 0;
}

static int dm_btree_find_key(struct dm_btree_info *info, dm_block_t root,
			     bool find_highest, uint64_t *result_keys)
{
	int r = 0, count = 0, level;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		r = find_key(&spine, root, find_highest, result_keys + level,
			     level == info->levels - 1 ? NULL : &root);
		if (r == -ENODATA) {
			r = 0;
			break;

		} else if (r)
			break;

		count++;
	}
	exit_ro_spine(&spine);

	return r ? r : count;
}

int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
			      uint64_t *result_keys)
{
	return dm_btree_find_key(info, root, true, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);

int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root,
			     uint64_t *result_keys)
{
	return dm_btree_find_key(info, root, false, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key);

/*----------------------------------------------------------------*/

/*
 * FIXME: We shouldn't use a recursive algorithm when we have limited stack
 * space.  Also this only works for single level trees.
 */
static int walk_node(struct dm_btree_info *info, dm_block_t block,
		     int (*fn)(void *context, uint64_t *keys, void *leaf),
		     void *context)
{
	int r;
	unsigned i, nr;
	struct dm_block *node;
	struct btree_node *n;
	uint64_t keys;

	r = bn_read_lock(info, block, &node);
	if (r)
		return r;

	n = dm_block_data(node);

	nr = le32_to_cpu(n->header.nr_entries);
	for (i = 0; i < nr; i++) {
		if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) {
			r = walk_node(info, value64(n, i), fn, context);
			if (r)
				goto out;
		} else {
			keys = le64_to_cpu(*key_ptr(n, i));
			r = fn(context, &keys, value_ptr(n, i));
			if (r)
				goto out;
		}
	}

out:
	dm_tm_unlock(info->tm, node);
	return r;
}

int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
		  int (*fn)(void *context, uint64_t *keys, void *leaf),
		  void *context)
{
	BUG_ON(info->levels > 1);
	return walk_node(info, root, fn, context);
}
EXPORT_SYMBOL_GPL(dm_btree_walk);
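
/*
 * Usage sketch (illustrative; the callback and its context are
 * hypothetical): visiting every leaf entry of a single-level tree.
 *
 *	static int count_entry(void *context, uint64_t *keys, void *leaf)
 *	{
 *		unsigned *count = context;
 *
 *		(*count)++;	// keys[0] and leaf describe the entry
 *		return 0;	// returning non-zero aborts the walk
 *	}
 *
 *	unsigned count = 0;
 *	int r = dm_btree_walk(&info, root, count_entry, &count);
 */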

/*----------------------------------------------------------------*/

static void prefetch_values(struct dm_btree_cursor *c)
{
	unsigned i, nr;
	__le64 value_le;
	struct cursor_node *n = c->nodes + c->depth - 1;
	struct btree_node *bn = dm_block_data(n->b);
	struct dm_block_manager *bm = dm_tm_get_bm(c->info->tm);

	BUG_ON(c->info->value_type.size != sizeof(value_le));

	nr = le32_to_cpu(bn->header.nr_entries);
	for (i = 0; i < nr; i++) {
		memcpy(&value_le, value_ptr(bn, i), sizeof(value_le));
		dm_bm_prefetch(bm, le64_to_cpu(value_le));
	}
}

static bool leaf_node(struct dm_btree_cursor *c)
{
	struct cursor_node *n = c->nodes + c->depth - 1;
	struct btree_node *bn = dm_block_data(n->b);

	return le32_to_cpu(bn->header.flags) & LEAF_NODE;
}

static int push_node(struct dm_btree_cursor *c, dm_block_t b)
{
	int r;
	struct cursor_node *n = c->nodes + c->depth;

	if (c->depth >= DM_BTREE_CURSOR_MAX_DEPTH - 1) {
		DMERR("couldn't push cursor node, stack depth too high");
		return -EINVAL;
	}

	r = bn_read_lock(c->info, b, &n->b);
	if (r)
		return r;

	n->index = 0;
	c->depth++;

	if (c->prefetch_leaves || !leaf_node(c))
		prefetch_values(c);

	return 0;
}

static void pop_node(struct dm_btree_cursor *c)
{
	c->depth--;
	unlock_block(c->info, c->nodes[c->depth].b);
}

static int inc_or_backtrack(struct dm_btree_cursor *c)
{
	struct cursor_node *n;
	struct btree_node *bn;

	for (;;) {
		if (!c->depth)
			return -ENODATA;

		n = c->nodes + c->depth - 1;
		bn = dm_block_data(n->b);

		n->index++;
		if (n->index < le32_to_cpu(bn->header.nr_entries))
			break;

		pop_node(c);
	}

	return 0;
}

static int find_leaf(struct dm_btree_cursor *c)
{
	int r = 0;
	struct cursor_node *n;
	struct btree_node *bn;
	__le64 value_le;

	for (;;) {
		n = c->nodes + c->depth - 1;
		bn = dm_block_data(n->b);

		if (le32_to_cpu(bn->header.flags) & LEAF_NODE)
			break;

		memcpy(&value_le, value_ptr(bn, n->index), sizeof(value_le));
		r = push_node(c, le64_to_cpu(value_le));
		if (r) {
			DMERR("push_node failed");
			break;
		}
	}

	if (!r && (le32_to_cpu(bn->header.nr_entries) == 0))
		return -ENODATA;

	return r;
}

int dm_btree_cursor_begin(struct dm_btree_info *info, dm_block_t root,
			  bool prefetch_leaves, struct dm_btree_cursor *c)
{
	int r;

	c->info = info;
	c->root = root;
	c->depth = 0;
	c->prefetch_leaves = prefetch_leaves;

	r = push_node(c, root);
	if (r)
		return r;

	return find_leaf(c);
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_begin);

void dm_btree_cursor_end(struct dm_btree_cursor *c)
{
	while (c->depth)
		pop_node(c);
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_end);

int dm_btree_cursor_next(struct dm_btree_cursor *c)
{
	int r = inc_or_backtrack(c);
	if (!r) {
		r = find_leaf(c);
		if (r)
			DMERR("find_leaf failed");
	}

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_next);

int dm_btree_cursor_skip(struct dm_btree_cursor *c, uint32_t count)
{
	int r = 0;

	while (count-- && !r)
		r = dm_btree_cursor_next(c);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_skip);

int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le)
{
	if (c->depth) {
		struct cursor_node *n = c->nodes + c->depth - 1;
		struct btree_node *bn = dm_block_data(n->b);

		if (le32_to_cpu(bn->header.flags) & INTERNAL_NODE)
			return -EINVAL;

		*key = le64_to_cpu(*key_ptr(bn, n->index));
		memcpy(value_le, value_ptr(bn, n->index), c->info->value_type.size);
		return 0;

	} else
		return -ENODATA;
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_get_value);
1168