/*
 * Copyright (C) 2011 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-btree-internal.h"
#include "dm-space-map.h"
#include "dm-transaction-manager.h"

#include <linux/export.h>
#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "btree"

/*----------------------------------------------------------------
 * Array manipulation
 *--------------------------------------------------------------*/
static void memcpy_disk(void *dest, const void *src, size_t len)
	__dm_written_to_disk(src)
{
	memcpy(dest, src, len);
	__dm_unbless_for_disk(src);
}

static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
			 unsigned index, void *elt)
	__dm_written_to_disk(elt)
{
	if (index < nr_elts)
		memmove(base + (elt_size * (index + 1)),
			base + (elt_size * index),
			(nr_elts - index) * elt_size);

	memcpy_disk(base + (elt_size * index), elt, elt_size);
}

/*----------------------------------------------------------------*/

/* makes the assumption that no two keys are the same. */
static int bsearch(struct btree_node *n, uint64_t key, int want_hi)
{
	int lo = -1, hi = le32_to_cpu(n->header.nr_entries);

	while (hi - lo > 1) {
		int mid = lo + ((hi - lo) / 2);
		uint64_t mid_key = le64_to_cpu(n->keys[mid]);

		if (mid_key == key)
			return mid;

		if (mid_key < key)
			lo = mid;
		else
			hi = mid;
	}

	return want_hi ? hi : lo;
}

int lower_bound(struct btree_node *n, uint64_t key)
{
	return bsearch(n, key, 0);
}

static int upper_bound(struct btree_node *n, uint64_t key)
{
	return bsearch(n, key, 1);
}
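
/*
 * Illustrative only: for a node holding keys { 10, 20, 30 } (a made-up
 * example), lower_bound() returns the index of the largest key <= the
 * search key (25 -> 1, 5 -> -1), while upper_bound() returns the index
 * of the smallest key >= the search key (25 -> 2, 35 -> nr_entries).
 * Callers must range check the result before using it.
 */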

void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
		  struct dm_btree_value_type *vt)
{
	unsigned i;
	uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);

	if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
		for (i = 0; i < nr_entries; i++)
			dm_tm_inc(tm, value64(n, i));
	else if (vt->inc)
		for (i = 0; i < nr_entries; i++)
			vt->inc(vt->context, value_ptr(n, i));
}

static int insert_at(size_t value_size, struct btree_node *node, unsigned index,
		     uint64_t key, void *value)
	__dm_written_to_disk(value)
{
	uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
	uint32_t max_entries = le32_to_cpu(node->header.max_entries);
	__le64 key_le = cpu_to_le64(key);

	if (index > nr_entries ||
	    index >= max_entries ||
	    nr_entries >= max_entries) {
		DMERR("too many entries in btree node for insert");
		__dm_unbless_for_disk(value);
		return -ENOMEM;
	}

	__dm_bless_for_disk(&key_le);

	array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
	array_insert(value_base(node), value_size, nr_entries, index, value);
	node->header.nr_entries = cpu_to_le32(nr_entries + 1);

	return 0;
}

/*----------------------------------------------------------------*/

/*
 * We want 3n entries (for some n).  This works more nicely for repeated
 * insert/remove loops than (2n + 1).
 */
static uint32_t calc_max_entries(size_t value_size, size_t block_size)
{
	uint32_t total, n;
	size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */

	block_size -= sizeof(struct node_header);
	total = block_size / elt_size;
	n = total / 3; /* rounds down */

	return 3 * n;
}
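
/*
 * A worked example (sizes are illustrative): assuming a 4096-byte block
 * and a 32-byte node_header, 8-byte values give an elt_size of 16, so
 * total = (4096 - 32) / 16 = 254, n = 254 / 3 = 84, and the node holds
 * 3 * 84 = 252 entries.
 */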

int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
{
	int r;
	struct dm_block *b;
	struct btree_node *n;
	size_t block_size;
	uint32_t max_entries;

	r = new_block(info, &b);
	if (r < 0)
		return r;

	block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
	max_entries = calc_max_entries(info->value_type.size, block_size);

	n = dm_block_data(b);
	memset(n, 0, block_size);
	n->header.flags = cpu_to_le32(LEAF_NODE);
	n->header.nr_entries = cpu_to_le32(0);
	n->header.max_entries = cpu_to_le32(max_entries);
	n->header.value_size = cpu_to_le32(info->value_type.size);

	*root = dm_block_location(b);
	unlock_block(info, b);

	return 0;
}
EXPORT_SYMBOL_GPL(dm_btree_empty);
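
/*
 * A minimal usage sketch (the setup values are illustrative): fill in a
 * dm_btree_info with a transaction manager, level count and value type,
 * then create an empty tree and keep hold of its root:
 *
 *	struct dm_btree_info info = {
 *		.tm = tm,
 *		.levels = 1,
 *		.value_type = { .context = NULL, .size = sizeof(__le64) },
 *	};
 *	dm_block_t root;
 *	int r = dm_btree_empty(&info, &root);
 */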

/*----------------------------------------------------------------*/

/*
 * Deletion uses a recursive algorithm; since we have limited stack
 * space we explicitly manage our own stack on the heap.
 */
#define MAX_SPINE_DEPTH 64
struct frame {
	struct dm_block *b;
	struct btree_node *n;
	unsigned level;
	unsigned nr_children;
	unsigned current_child;
};

struct del_stack {
	struct dm_btree_info *info;
	struct dm_transaction_manager *tm;
	int top;
	struct frame spine[MAX_SPINE_DEPTH];
};

static int top_frame(struct del_stack *s, struct frame **f)
{
	if (s->top < 0) {
		DMERR("btree deletion stack empty");
		return -EINVAL;
	}

	*f = s->spine + s->top;

	return 0;
}

static int unprocessed_frames(struct del_stack *s)
{
	return s->top >= 0;
}

static void prefetch_children(struct del_stack *s, struct frame *f)
{
	unsigned i;
	struct dm_block_manager *bm = dm_tm_get_bm(s->tm);

	for (i = 0; i < f->nr_children; i++)
		dm_bm_prefetch(bm, value64(f->n, i));
}

static bool is_internal_level(struct dm_btree_info *info, struct frame *f)
{
	return f->level < (info->levels - 1);
}

static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
{
	int r;
	uint32_t ref_count;

	if (s->top >= MAX_SPINE_DEPTH - 1) {
		DMERR("btree deletion stack out of memory");
		return -ENOMEM;
	}

	r = dm_tm_ref(s->tm, b, &ref_count);
	if (r)
		return r;

	if (ref_count > 1)
		/*
		 * This is a shared node, so we can just decrement its
		 * reference counter and leave the children.
		 */
		dm_tm_dec(s->tm, b);

	else {
		uint32_t flags;
		struct frame *f = s->spine + ++s->top;

		r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
		if (r) {
			s->top--;
			return r;
		}

		f->n = dm_block_data(f->b);
		f->level = level;
		f->nr_children = le32_to_cpu(f->n->header.nr_entries);
		f->current_child = 0;

		flags = le32_to_cpu(f->n->header.flags);
		if (flags & INTERNAL_NODE || is_internal_level(s->info, f))
			prefetch_children(s, f);
	}

	return 0;
}

static void pop_frame(struct del_stack *s)
{
	struct frame *f = s->spine + s->top--;

	dm_tm_dec(s->tm, dm_block_location(f->b));
	dm_tm_unlock(s->tm, f->b);
}

static void unlock_all_frames(struct del_stack *s)
{
	struct frame *f;

	while (unprocessed_frames(s)) {
		f = s->spine + s->top--;
		dm_tm_unlock(s->tm, f->b);
	}
}

int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
{
	int r;
	struct del_stack *s;

	/*
	 * dm_btree_del() is called via an ioctl, so it should be
	 * considered an FS op.  We can't recurse back into the FS, so
	 * we allocate GFP_NOFS.
	 */
	s = kmalloc(sizeof(*s), GFP_NOFS);
	if (!s)
		return -ENOMEM;
	s->info = info;
	s->tm = info->tm;
	s->top = -1;

	r = push_frame(s, root, 0);
	if (r)
		goto out;

	while (unprocessed_frames(s)) {
		uint32_t flags;
		struct frame *f;
		dm_block_t b;

		r = top_frame(s, &f);
		if (r)
			goto out;

		if (f->current_child >= f->nr_children) {
			pop_frame(s);
			continue;
		}

		flags = le32_to_cpu(f->n->header.flags);
		if (flags & INTERNAL_NODE) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level);
			if (r)
				goto out;

		} else if (is_internal_level(info, f)) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level + 1);
			if (r)
				goto out;

		} else {
			if (info->value_type.dec) {
				unsigned i;

				for (i = 0; i < f->nr_children; i++)
					info->value_type.dec(info->value_type.context,
							     value_ptr(f->n, i));
			}
			pop_frame(s);
		}
	}
out:
	if (r) {
		/* cleanup all frames of del_stack */
		unlock_all_frames(s);
	}
	kfree(s);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_del);

/*----------------------------------------------------------------*/

static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
			    int (*search_fn)(struct btree_node *, uint64_t),
			    uint64_t *result_key, void *v, size_t value_size)
{
	int i, r;
	uint32_t flags, nr_entries;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		i = search_fn(ro_node(s), key);

		flags = le32_to_cpu(ro_node(s)->header.flags);
		nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (i < 0 || i >= nr_entries)
			return -ENODATA;

		if (flags & INTERNAL_NODE)
			block = value64(ro_node(s), i);

	} while (!(flags & LEAF_NODE));

	*result_key = le64_to_cpu(ro_node(s)->keys[i]);
	if (v)
		memcpy(v, value_ptr(ro_node(s), i), value_size);

	return 0;
}

int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value_le)
{
	unsigned level, last_level = info->levels - 1;
	int r = -ENODATA;
	uint64_t rkey;
	__le64 internal_value_le;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		size_t size;
		void *value_p;

		if (level == last_level) {
			value_p = value_le;
			size = info->value_type.size;

		} else {
			value_p = &internal_value_le;
			size = sizeof(uint64_t);
		}

		r = btree_lookup_raw(&spine, root, keys[level],
				     lower_bound, &rkey,
				     value_p, size);

		if (!r) {
			if (rkey != keys[level]) {
				exit_ro_spine(&spine);
				return -ENODATA;
			}
		} else {
			exit_ro_spine(&spine);
			return r;
		}

		root = le64_to_cpu(internal_value_le);
	}
	exit_ro_spine(&spine);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup);
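
/*
 * A lookup sketch for a single-level tree of little-endian 64-bit
 * values (variable names are illustrative):
 *
 *	uint64_t key = 1234;
 *	__le64 value_le;
 *
 *	r = dm_btree_lookup(&info, root, &key, &value_le);
 *
 * r is 0 on success, with value_le filled in, and -ENODATA if the key
 * is absent.
 */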

static int dm_btree_lookup_next_single(struct dm_btree_info *info, dm_block_t root,
				       uint64_t key, uint64_t *rkey, void *value_le)
{
	int r, i;
	uint32_t flags, nr_entries;
	struct dm_block *node;
	struct btree_node *n;

	r = bn_read_lock(info, root, &node);
	if (r)
		return r;

	n = dm_block_data(node);
	flags = le32_to_cpu(n->header.flags);
	nr_entries = le32_to_cpu(n->header.nr_entries);

	if (flags & INTERNAL_NODE) {
		i = lower_bound(n, key);
		if (i < 0) {
			/*
			 * avoid early -ENODATA return when all entries are
			 * higher than the search @key.
			 */
			i = 0;
		}
		if (i >= nr_entries) {
			r = -ENODATA;
			goto out;
		}

		r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
		if (r == -ENODATA && i < (nr_entries - 1)) {
			i++;
			r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
		}

	} else {
		i = upper_bound(n, key);
		if (i < 0 || i >= nr_entries) {
			r = -ENODATA;
			goto out;
		}

		*rkey = le64_to_cpu(n->keys[i]);
		memcpy(value_le, value_ptr(n, i), info->value_type.size);
	}
out:
	dm_tm_unlock(info->tm, node);
	return r;
}

int dm_btree_lookup_next(struct dm_btree_info *info, dm_block_t root,
			 uint64_t *keys, uint64_t *rkey, void *value_le)
{
	unsigned level;
	int r = -ENODATA;
	__le64 internal_value_le;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels - 1u; level++) {
		r = btree_lookup_raw(&spine, root, keys[level],
				     lower_bound, rkey,
				     &internal_value_le, sizeof(uint64_t));
		if (r)
			goto out;

		if (*rkey != keys[level]) {
			r = -ENODATA;
			goto out;
		}

		root = le64_to_cpu(internal_value_le);
	}

	r = dm_btree_lookup_next_single(info, root, keys[level], rkey, value_le);
out:
	exit_ro_spine(&spine);
	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup_next);

/*
 * Splits a node by creating a sibling node and shifting half the node's
 * contents across.  Assumes there is a parent node, and it has room for
 * another child.
 *
 * Before:
 *	  +--------+
 *	  | Parent |
 *	  +--------+
 *	      |
 *	      v
 *	+----------+
 *	| A ++++++ |
 *	+----------+
 *
 *
 * After:
 *		+--------+
 *		| Parent |
 *		+--------+
 *		  |    |
 *		  v    +------+
 *	    +---------+       |
 *	    | A* +++  |       v
 *	    +---------+   +-------+
 *			  | B +++ |
 *			  +-------+
 *
 * Where A* is a shadow of A.
 */
static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index,
			       uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *parent;
	struct btree_node *ln, *rn, *pn;
	__le64 location;

	left = shadow_current(s);

	r = new_block(s->info, &right);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	rn = dm_block_data(right);

	nr_left = le32_to_cpu(ln->header.nr_entries) / 2;
	nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left;

	ln->header.nr_entries = cpu_to_le32(nr_left);

	rn->header.flags = ln->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = ln->header.max_entries;
	rn->header.value_size = ln->header.value_size;
	memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0]));

	size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
		sizeof(uint64_t) : s->info->value_type.size;
	memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left),
	       size * nr_right);

	/*
	 * Patch up the parent
	 */
	parent = shadow_parent(s);

	pn = dm_block_data(parent);
	location = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&location);
	memcpy_disk(value_ptr(pn, parent_index),
		    &location, sizeof(__le64));

	location = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&location);

	r = insert_at(sizeof(__le64), pn, parent_index + 1,
		      le64_to_cpu(rn->keys[0]), &location);
	if (r) {
		unlock_block(s->info, right);
		return r;
	}

	if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else {
		unlock_block(s->info, left);
		s->nodes[1] = right;
	}

	return 0;
}

/*
 * Splits a node by creating two new children beneath the given node.
 *
 * Before:
 *	+----------+
 *	| A ++++++ |
 *	+----------+
 *
 *
 * After:
 *	+------------+
 *	| A (shadow) |
 *	+------------+
 *	    |      |
 *   +------+      +----+
 *   |			|
 *   v			v
 * +-------+	    +-------+
 * | B +++ |	    | C +++ |
 * +-------+	    +-------+
 */
static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *new_parent;
	struct btree_node *pn, *ln, *rn;
	__le64 val;

	new_parent = shadow_current(s);

	pn = dm_block_data(new_parent);
	size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
		sizeof(__le64) : s->info->value_type.size;

	/* create & init the left block */
	r = new_block(s->info, &left);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	nr_left = le32_to_cpu(pn->header.nr_entries) / 2;

	ln->header.flags = pn->header.flags;
	ln->header.nr_entries = cpu_to_le32(nr_left);
	ln->header.max_entries = pn->header.max_entries;
	ln->header.value_size = pn->header.value_size;
	memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
	memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);

	/* create & init the right block */
	r = new_block(s->info, &right);
	if (r < 0) {
		unlock_block(s->info, left);
		return r;
	}

	rn = dm_block_data(right);
	nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;

	rn->header.flags = pn->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = pn->header.max_entries;
	rn->header.value_size = pn->header.value_size;
	memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));
	memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
	       nr_right * size);

	/* new_parent should just point to l and r now */
	pn->header.flags = cpu_to_le32(INTERNAL_NODE);
	pn->header.nr_entries = cpu_to_le32(2);
	pn->header.max_entries = cpu_to_le32(
		calc_max_entries(sizeof(__le64),
				 dm_bm_block_size(
					 dm_tm_get_bm(s->info->tm))));
	pn->header.value_size = cpu_to_le32(sizeof(__le64));

	val = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&val);
	pn->keys[0] = ln->keys[0];
	memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));

	val = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&val);
	pn->keys[1] = rn->keys[0];
	memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));

	unlock_block(s->info, left);
	unlock_block(s->info, right);
	return 0;
}

static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
			    struct dm_btree_value_type *vt,
			    uint64_t key, unsigned *index)
{
	int r, i = *index, top = 1;
	struct btree_node *node;

	for (;;) {
		r = shadow_step(s, root, vt);
		if (r < 0)
			return r;

		node = dm_block_data(shadow_current(s));

		/*
		 * We have to patch up the parent node; ugly, but I don't
		 * see a way to do this automatically as part of the spine
		 * op.
		 */
		if (shadow_has_parent(s) && i >= 0) { /* FIXME: second clause unnecessary */
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

			__dm_bless_for_disk(&location);
			memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
				    &location, sizeof(__le64));
		}

		node = dm_block_data(shadow_current(s));

		if (node->header.nr_entries == node->header.max_entries) {
			if (top)
				r = btree_split_beneath(s, key);
			else
				r = btree_split_sibling(s, i, key);

			if (r < 0)
				return r;
		}

		node = dm_block_data(shadow_current(s));

		i = lower_bound(node, key);

		if (le32_to_cpu(node->header.flags) & LEAF_NODE)
			break;

		if (i < 0) {
			/* change the bounds on the lowest key */
			node->keys[0] = cpu_to_le64(key);
			i = 0;
		}

		root = value64(node, i);
		top = 0;
	}

	if (i < 0 || le64_to_cpu(node->keys[i]) != key)
		i++;

	*index = i;
	return 0;
}

static bool need_insert(struct btree_node *node, uint64_t *keys,
			unsigned level, unsigned index)
{
	return ((index >= le32_to_cpu(node->header.nr_entries)) ||
		(le64_to_cpu(node->keys[index]) != keys[level]));
}

static int insert(struct dm_btree_info *info, dm_block_t root,
		  uint64_t *keys, void *value, dm_block_t *new_root,
		  int *inserted)
	__dm_written_to_disk(value)
{
	int r;
	unsigned level, index = -1, last_level = info->levels - 1;
	dm_block_t block = root;
	struct shadow_spine spine;
	struct btree_node *n;
	struct dm_btree_value_type le64_type;

	init_le64_type(info->tm, &le64_type);
	init_shadow_spine(&spine, info);

	for (level = 0; level < (info->levels - 1); level++) {
		r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
		if (r < 0)
			goto bad;

		n = dm_block_data(shadow_current(&spine));

		if (need_insert(n, keys, level, index)) {
			dm_block_t new_tree;
			__le64 new_le;

			r = dm_btree_empty(info, &new_tree);
			if (r < 0)
				goto bad;

			new_le = cpu_to_le64(new_tree);
			__dm_bless_for_disk(&new_le);

			r = insert_at(sizeof(uint64_t), n, index,
				      keys[level], &new_le);
			if (r)
				goto bad;
		}

		if (level < last_level)
			block = value64(n, index);
	}

	r = btree_insert_raw(&spine, block, &info->value_type,
			     keys[level], &index);
	if (r < 0)
		goto bad;

	n = dm_block_data(shadow_current(&spine));

	if (need_insert(n, keys, level, index)) {
		if (inserted)
			*inserted = 1;

		r = insert_at(info->value_type.size, n, index,
			      keys[level], value);
		if (r)
			goto bad_unblessed;
	} else {
		if (inserted)
			*inserted = 0;

		if (info->value_type.dec &&
		    (!info->value_type.equal ||
		     !info->value_type.equal(
			     info->value_type.context,
			     value_ptr(n, index),
			     value))) {
			info->value_type.dec(info->value_type.context,
					     value_ptr(n, index));
		}
		memcpy_disk(value_ptr(n, index),
			    value, info->value_type.size);
	}

	*new_root = shadow_root(&spine);
	exit_shadow_spine(&spine);

	return 0;

bad:
	__dm_unbless_for_disk(value);
bad_unblessed:
	exit_shadow_spine(&spine);
	return r;
}

int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value, dm_block_t *new_root)
	__dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, NULL);
}
EXPORT_SYMBOL_GPL(dm_btree_insert);

int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
			   uint64_t *keys, void *value, dm_block_t *new_root,
			   int *inserted)
	__dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, inserted);
}
EXPORT_SYMBOL_GPL(dm_btree_insert_notify);
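
/*
 * An insert sketch (single level, 8-byte values; names are
 * illustrative).  The tree is copy-on-write, so the caller must switch
 * to the new root on success:
 *
 *	uint64_t key = 1234;
 *	__le64 value_le = cpu_to_le64(5678);
 *	dm_block_t new_root;
 *
 *	r = dm_btree_insert(&info, root, &key, &value_le, &new_root);
 *	if (!r)
 *		root = new_root;
 */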

/*----------------------------------------------------------------*/

static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest,
		    uint64_t *result_key, dm_block_t *next_block)
{
	int i, r;
	uint32_t flags;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		flags = le32_to_cpu(ro_node(s)->header.flags);
		i = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (!i)
			return -ENODATA;
		else
			i--;

		if (find_highest)
			*result_key = le64_to_cpu(ro_node(s)->keys[i]);
		else
			*result_key = le64_to_cpu(ro_node(s)->keys[0]);

		if (next_block || flags & INTERNAL_NODE) {
			if (find_highest)
				block = value64(ro_node(s), i);
			else
				block = value64(ro_node(s), 0);
		}

	} while (flags & INTERNAL_NODE);

	if (next_block)
		*next_block = block;
	return 0;
}

static int dm_btree_find_key(struct dm_btree_info *info, dm_block_t root,
			     bool find_highest, uint64_t *result_keys)
{
	int r = 0, count = 0, level;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		r = find_key(&spine, root, find_highest, result_keys + level,
			     level == info->levels - 1 ? NULL : &root);
		if (r == -ENODATA) {
			r = 0;
			break;

		} else if (r)
			break;

		count++;
	}
	exit_ro_spine(&spine);

	return r ? r : count;
}

int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
			      uint64_t *result_keys)
{
	return dm_btree_find_key(info, root, true, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);

int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root,
			     uint64_t *result_keys)
{
	return dm_btree_find_key(info, root, false, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key);
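
/*
 * A sketch (illustrative) for a two-level tree:
 *
 *	uint64_t keys[2];
 *
 *	r = dm_btree_find_highest_key(&info, root, keys);
 *
 * On success r is the number of levels for which a key was found
 * (ideally 2 here, with keys[] filled in); a negative value is an
 * error.
 */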

/*----------------------------------------------------------------*/

/*
 * FIXME: We shouldn't use a recursive algorithm when we have limited stack
 * space.  Also this only works for single level trees.
 */
static int walk_node(struct dm_btree_info *info, dm_block_t block,
		     int (*fn)(void *context, uint64_t *keys, void *leaf),
		     void *context)
{
	int r;
	unsigned i, nr;
	struct dm_block *node;
	struct btree_node *n;
	uint64_t keys;

	r = bn_read_lock(info, block, &node);
	if (r)
		return r;

	n = dm_block_data(node);

	nr = le32_to_cpu(n->header.nr_entries);
	for (i = 0; i < nr; i++) {
		if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) {
			r = walk_node(info, value64(n, i), fn, context);
			if (r)
				goto out;
		} else {
			keys = le64_to_cpu(*key_ptr(n, i));
			r = fn(context, &keys, value_ptr(n, i));
			if (r)
				goto out;
		}
	}

out:
	dm_tm_unlock(info->tm, node);
	return r;
}

int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
		  int (*fn)(void *context, uint64_t *keys, void *leaf),
		  void *context)
{
	BUG_ON(info->levels > 1);
	return walk_node(info, root, fn, context);
}
EXPORT_SYMBOL_GPL(dm_btree_walk);
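
/*
 * A walk sketch (the callback is illustrative): count the leaf entries
 * of a single-level tree.
 *
 *	static int count_fn(void *context, uint64_t *keys, void *leaf)
 *	{
 *		(*(unsigned *) context)++;
 *		return 0;
 *	}
 *
 *	unsigned count = 0;
 *
 *	r = dm_btree_walk(&info, root, count_fn, &count);
 */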

/*----------------------------------------------------------------*/

static void prefetch_values(struct dm_btree_cursor *c)
{
	unsigned i, nr;
	__le64 value_le;
	struct cursor_node *n = c->nodes + c->depth - 1;
	struct btree_node *bn = dm_block_data(n->b);
	struct dm_block_manager *bm = dm_tm_get_bm(c->info->tm);

	BUG_ON(c->info->value_type.size != sizeof(value_le));

	nr = le32_to_cpu(bn->header.nr_entries);
	for (i = 0; i < nr; i++) {
		memcpy(&value_le, value_ptr(bn, i), sizeof(value_le));
		dm_bm_prefetch(bm, le64_to_cpu(value_le));
	}
}

static bool leaf_node(struct dm_btree_cursor *c)
{
	struct cursor_node *n = c->nodes + c->depth - 1;
	struct btree_node *bn = dm_block_data(n->b);

	return le32_to_cpu(bn->header.flags) & LEAF_NODE;
}

static int push_node(struct dm_btree_cursor *c, dm_block_t b)
{
	int r;
	struct cursor_node *n = c->nodes + c->depth;

	if (c->depth >= DM_BTREE_CURSOR_MAX_DEPTH - 1) {
		DMERR("couldn't push cursor node, stack depth too high");
		return -EINVAL;
	}

	r = bn_read_lock(c->info, b, &n->b);
	if (r)
		return r;

	n->index = 0;
	c->depth++;

	if (c->prefetch_leaves || !leaf_node(c))
		prefetch_values(c);

	return 0;
}

static void pop_node(struct dm_btree_cursor *c)
{
	c->depth--;
	unlock_block(c->info, c->nodes[c->depth].b);
}

static int inc_or_backtrack(struct dm_btree_cursor *c)
{
	struct cursor_node *n;
	struct btree_node *bn;

	for (;;) {
		if (!c->depth)
			return -ENODATA;

		n = c->nodes + c->depth - 1;
		bn = dm_block_data(n->b);

		n->index++;
		if (n->index < le32_to_cpu(bn->header.nr_entries))
			break;

		pop_node(c);
	}

	return 0;
}

static int find_leaf(struct dm_btree_cursor *c)
{
	int r = 0;
	struct cursor_node *n;
	struct btree_node *bn;
	__le64 value_le;

	for (;;) {
		n = c->nodes + c->depth - 1;
		bn = dm_block_data(n->b);

		if (le32_to_cpu(bn->header.flags) & LEAF_NODE)
			break;

		memcpy(&value_le, value_ptr(bn, n->index), sizeof(value_le));
		r = push_node(c, le64_to_cpu(value_le));
		if (r) {
			DMERR("push_node failed");
			break;
		}
	}

	if (!r && (le32_to_cpu(bn->header.nr_entries) == 0))
		return -ENODATA;

	return r;
}

int dm_btree_cursor_begin(struct dm_btree_info *info, dm_block_t root,
			  bool prefetch_leaves, struct dm_btree_cursor *c)
{
	int r;

	c->info = info;
	c->root = root;
	c->depth = 0;
	c->prefetch_leaves = prefetch_leaves;

	r = push_node(c, root);
	if (r)
		return r;

	return find_leaf(c);
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_begin);

void dm_btree_cursor_end(struct dm_btree_cursor *c)
{
	while (c->depth)
		pop_node(c);
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_end);

int dm_btree_cursor_next(struct dm_btree_cursor *c)
{
	int r = inc_or_backtrack(c);

	if (!r) {
		r = find_leaf(c);
		if (r)
			DMERR("find_leaf failed");
	}

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_next);

int dm_btree_cursor_skip(struct dm_btree_cursor *c, uint32_t count)
{
	int r = 0;

	while (count-- && !r)
		r = dm_btree_cursor_next(c);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_skip);

int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le)
{
	if (c->depth) {
		struct cursor_node *n = c->nodes + c->depth - 1;
		struct btree_node *bn = dm_block_data(n->b);

		if (le32_to_cpu(bn->header.flags) & INTERNAL_NODE)
			return -EINVAL;

		*key = le64_to_cpu(*key_ptr(bn, n->index));
		memcpy(value_le, value_ptr(bn, n->index), c->info->value_type.size);
		return 0;

	} else
		return -ENODATA;
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_get_value);
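
/*
 * A cursor sketch (names are illustrative; note that prefetch_values()
 * assumes 8-byte values): visit every entry in key order.
 *
 *	struct dm_btree_cursor c;
 *	uint64_t key;
 *	__le64 value_le;
 *
 *	r = dm_btree_cursor_begin(&info, root, false, &c);
 *	while (!r) {
 *		r = dm_btree_cursor_get_value(&c, &key, &value_le);
 *		if (r)
 *			break;
 *		// ... use key / value_le ...
 *		r = dm_btree_cursor_next(&c);
 *	}
 *	dm_btree_cursor_end(&c);
 *
 * Both dm_btree_cursor_begin() and dm_btree_cursor_next() return
 * -ENODATA once the tree is exhausted.
 */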