1*4882a593Smuzhiyun /* 2*4882a593Smuzhiyun * Copyright (C) 2011 Red Hat, Inc. 3*4882a593Smuzhiyun * 4*4882a593Smuzhiyun * This file is released under the GPL. 5*4882a593Smuzhiyun */ 6*4882a593Smuzhiyun #ifndef _LINUX_DM_BTREE_H 7*4882a593Smuzhiyun #define _LINUX_DM_BTREE_H 8*4882a593Smuzhiyun 9*4882a593Smuzhiyun #include "dm-block-manager.h" 10*4882a593Smuzhiyun 11*4882a593Smuzhiyun struct dm_transaction_manager; 12*4882a593Smuzhiyun 13*4882a593Smuzhiyun /*----------------------------------------------------------------*/ 14*4882a593Smuzhiyun 15*4882a593Smuzhiyun /* 16*4882a593Smuzhiyun * Annotations used to check on-disk metadata is handled as little-endian. 17*4882a593Smuzhiyun */ 18*4882a593Smuzhiyun #ifdef __CHECKER__ 19*4882a593Smuzhiyun # define __dm_written_to_disk(x) __releases(x) 20*4882a593Smuzhiyun # define __dm_reads_from_disk(x) __acquires(x) 21*4882a593Smuzhiyun # define __dm_bless_for_disk(x) __acquire(x) 22*4882a593Smuzhiyun # define __dm_unbless_for_disk(x) __release(x) 23*4882a593Smuzhiyun #else 24*4882a593Smuzhiyun # define __dm_written_to_disk(x) 25*4882a593Smuzhiyun # define __dm_reads_from_disk(x) 26*4882a593Smuzhiyun # define __dm_bless_for_disk(x) 27*4882a593Smuzhiyun # define __dm_unbless_for_disk(x) 28*4882a593Smuzhiyun #endif 29*4882a593Smuzhiyun 30*4882a593Smuzhiyun /*----------------------------------------------------------------*/ 31*4882a593Smuzhiyun 32*4882a593Smuzhiyun /* 33*4882a593Smuzhiyun * Manipulates hierarchical B+ trees with 64-bit keys and arbitrary-sized 34*4882a593Smuzhiyun * values. 35*4882a593Smuzhiyun */ 36*4882a593Smuzhiyun 37*4882a593Smuzhiyun /* 38*4882a593Smuzhiyun * Information about the values stored within the btree. 39*4882a593Smuzhiyun */ 40*4882a593Smuzhiyun struct dm_btree_value_type { 41*4882a593Smuzhiyun void *context; 42*4882a593Smuzhiyun 43*4882a593Smuzhiyun /* 44*4882a593Smuzhiyun * The size in bytes of each value. 45*4882a593Smuzhiyun */ 46*4882a593Smuzhiyun uint32_t size; 47*4882a593Smuzhiyun 48*4882a593Smuzhiyun /* 49*4882a593Smuzhiyun * Any of these methods can be safely set to NULL if you do not 50*4882a593Smuzhiyun * need the corresponding feature. 51*4882a593Smuzhiyun */ 52*4882a593Smuzhiyun 53*4882a593Smuzhiyun /* 54*4882a593Smuzhiyun * The btree is making a duplicate of the value, for instance 55*4882a593Smuzhiyun * because previously-shared btree nodes have now diverged. 56*4882a593Smuzhiyun * @value argument is the new copy that the copy function may modify. 57*4882a593Smuzhiyun * (Probably it just wants to increment a reference count 58*4882a593Smuzhiyun * somewhere.) This method is _not_ called for insertion of a new 59*4882a593Smuzhiyun * value: It is assumed the ref count is already 1. 60*4882a593Smuzhiyun */ 61*4882a593Smuzhiyun void (*inc)(void *context, const void *value); 62*4882a593Smuzhiyun 63*4882a593Smuzhiyun /* 64*4882a593Smuzhiyun * This value is being deleted. The btree takes care of freeing 65*4882a593Smuzhiyun * the memory pointed to by @value. Often the del function just 66*4882a593Smuzhiyun * needs to decrement a reference count somewhere. 67*4882a593Smuzhiyun */ 68*4882a593Smuzhiyun void (*dec)(void *context, const void *value); 69*4882a593Smuzhiyun 70*4882a593Smuzhiyun /* 71*4882a593Smuzhiyun * A test for equality between two values. When a value is 72*4882a593Smuzhiyun * overwritten with a new one, the old one has the dec method 73*4882a593Smuzhiyun * called _unless_ the new and old value are deemed equal. 74*4882a593Smuzhiyun */ 75*4882a593Smuzhiyun int (*equal)(void *context, const void *value1, const void *value2); 76*4882a593Smuzhiyun }; 77*4882a593Smuzhiyun 78*4882a593Smuzhiyun /* 79*4882a593Smuzhiyun * The shape and contents of a btree. 80*4882a593Smuzhiyun */ 81*4882a593Smuzhiyun struct dm_btree_info { 82*4882a593Smuzhiyun struct dm_transaction_manager *tm; 83*4882a593Smuzhiyun 84*4882a593Smuzhiyun /* 85*4882a593Smuzhiyun * Number of nested btrees. (Not the depth of a single tree.) 86*4882a593Smuzhiyun */ 87*4882a593Smuzhiyun unsigned levels; 88*4882a593Smuzhiyun struct dm_btree_value_type value_type; 89*4882a593Smuzhiyun }; 90*4882a593Smuzhiyun 91*4882a593Smuzhiyun /* 92*4882a593Smuzhiyun * Set up an empty tree. O(1). 93*4882a593Smuzhiyun */ 94*4882a593Smuzhiyun int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root); 95*4882a593Smuzhiyun 96*4882a593Smuzhiyun /* 97*4882a593Smuzhiyun * Delete a tree. O(n) - this is the slow one! It can also block, so 98*4882a593Smuzhiyun * please don't call it on an IO path. 99*4882a593Smuzhiyun */ 100*4882a593Smuzhiyun int dm_btree_del(struct dm_btree_info *info, dm_block_t root); 101*4882a593Smuzhiyun 102*4882a593Smuzhiyun /* 103*4882a593Smuzhiyun * All the lookup functions return -ENODATA if the key cannot be found. 104*4882a593Smuzhiyun */ 105*4882a593Smuzhiyun 106*4882a593Smuzhiyun /* 107*4882a593Smuzhiyun * Tries to find a key that matches exactly. O(ln(n)) 108*4882a593Smuzhiyun */ 109*4882a593Smuzhiyun int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root, 110*4882a593Smuzhiyun uint64_t *keys, void *value_le); 111*4882a593Smuzhiyun 112*4882a593Smuzhiyun /* 113*4882a593Smuzhiyun * Tries to find the first key where the bottom level key is >= to that 114*4882a593Smuzhiyun * given. Useful for skipping empty sections of the btree. 115*4882a593Smuzhiyun */ 116*4882a593Smuzhiyun int dm_btree_lookup_next(struct dm_btree_info *info, dm_block_t root, 117*4882a593Smuzhiyun uint64_t *keys, uint64_t *rkey, void *value_le); 118*4882a593Smuzhiyun 119*4882a593Smuzhiyun /* 120*4882a593Smuzhiyun * Insertion (or overwrite an existing value). O(ln(n)) 121*4882a593Smuzhiyun */ 122*4882a593Smuzhiyun int dm_btree_insert(struct dm_btree_info *info, dm_block_t root, 123*4882a593Smuzhiyun uint64_t *keys, void *value, dm_block_t *new_root) 124*4882a593Smuzhiyun __dm_written_to_disk(value); 125*4882a593Smuzhiyun 126*4882a593Smuzhiyun /* 127*4882a593Smuzhiyun * A variant of insert that indicates whether it actually inserted or just 128*4882a593Smuzhiyun * overwrote. Useful if you're keeping track of the number of entries in a 129*4882a593Smuzhiyun * tree. 130*4882a593Smuzhiyun */ 131*4882a593Smuzhiyun int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root, 132*4882a593Smuzhiyun uint64_t *keys, void *value, dm_block_t *new_root, 133*4882a593Smuzhiyun int *inserted) 134*4882a593Smuzhiyun __dm_written_to_disk(value); 135*4882a593Smuzhiyun 136*4882a593Smuzhiyun /* 137*4882a593Smuzhiyun * Remove a key if present. This doesn't remove empty sub trees. Normally 138*4882a593Smuzhiyun * subtrees represent a separate entity, like a snapshot map, so this is 139*4882a593Smuzhiyun * correct behaviour. O(ln(n)). 140*4882a593Smuzhiyun */ 141*4882a593Smuzhiyun int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, 142*4882a593Smuzhiyun uint64_t *keys, dm_block_t *new_root); 143*4882a593Smuzhiyun 144*4882a593Smuzhiyun /* 145*4882a593Smuzhiyun * Removes a _contiguous_ run of values starting from 'keys' and not 146*4882a593Smuzhiyun * reaching keys2 (where keys2 is keys with the final key replaced with 147*4882a593Smuzhiyun * 'end_key'). 'end_key' is the one-past-the-end value. 'keys' may be 148*4882a593Smuzhiyun * altered. 149*4882a593Smuzhiyun */ 150*4882a593Smuzhiyun int dm_btree_remove_leaves(struct dm_btree_info *info, dm_block_t root, 151*4882a593Smuzhiyun uint64_t *keys, uint64_t end_key, 152*4882a593Smuzhiyun dm_block_t *new_root, unsigned *nr_removed); 153*4882a593Smuzhiyun 154*4882a593Smuzhiyun /* 155*4882a593Smuzhiyun * Returns < 0 on failure. Otherwise the number of key entries that have 156*4882a593Smuzhiyun * been filled out. Remember trees can have zero entries, and as such have 157*4882a593Smuzhiyun * no lowest key. 158*4882a593Smuzhiyun */ 159*4882a593Smuzhiyun int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root, 160*4882a593Smuzhiyun uint64_t *result_keys); 161*4882a593Smuzhiyun 162*4882a593Smuzhiyun /* 163*4882a593Smuzhiyun * Returns < 0 on failure. Otherwise the number of key entries that have 164*4882a593Smuzhiyun * been filled out. Remember trees can have zero entries, and as such have 165*4882a593Smuzhiyun * no highest key. 166*4882a593Smuzhiyun */ 167*4882a593Smuzhiyun int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root, 168*4882a593Smuzhiyun uint64_t *result_keys); 169*4882a593Smuzhiyun 170*4882a593Smuzhiyun /* 171*4882a593Smuzhiyun * Iterate through the a btree, calling fn() on each entry. 172*4882a593Smuzhiyun * It only works for single level trees and is internally recursive, so 173*4882a593Smuzhiyun * monitor stack usage carefully. 174*4882a593Smuzhiyun */ 175*4882a593Smuzhiyun int dm_btree_walk(struct dm_btree_info *info, dm_block_t root, 176*4882a593Smuzhiyun int (*fn)(void *context, uint64_t *keys, void *leaf), 177*4882a593Smuzhiyun void *context); 178*4882a593Smuzhiyun 179*4882a593Smuzhiyun 180*4882a593Smuzhiyun /*----------------------------------------------------------------*/ 181*4882a593Smuzhiyun 182*4882a593Smuzhiyun /* 183*4882a593Smuzhiyun * Cursor API. This does not follow the rolling lock convention. Since we 184*4882a593Smuzhiyun * know the order that values are required we can issue prefetches to speed 185*4882a593Smuzhiyun * up iteration. Use on a single level btree only. 186*4882a593Smuzhiyun */ 187*4882a593Smuzhiyun #define DM_BTREE_CURSOR_MAX_DEPTH 16 188*4882a593Smuzhiyun 189*4882a593Smuzhiyun struct cursor_node { 190*4882a593Smuzhiyun struct dm_block *b; 191*4882a593Smuzhiyun unsigned index; 192*4882a593Smuzhiyun }; 193*4882a593Smuzhiyun 194*4882a593Smuzhiyun struct dm_btree_cursor { 195*4882a593Smuzhiyun struct dm_btree_info *info; 196*4882a593Smuzhiyun dm_block_t root; 197*4882a593Smuzhiyun 198*4882a593Smuzhiyun bool prefetch_leaves; 199*4882a593Smuzhiyun unsigned depth; 200*4882a593Smuzhiyun struct cursor_node nodes[DM_BTREE_CURSOR_MAX_DEPTH]; 201*4882a593Smuzhiyun }; 202*4882a593Smuzhiyun 203*4882a593Smuzhiyun /* 204*4882a593Smuzhiyun * Creates a fresh cursor. If prefetch_leaves is set then it is assumed 205*4882a593Smuzhiyun * the btree contains block indexes that will be prefetched. The cursor is 206*4882a593Smuzhiyun * quite large, so you probably don't want to put it on the stack. 207*4882a593Smuzhiyun */ 208*4882a593Smuzhiyun int dm_btree_cursor_begin(struct dm_btree_info *info, dm_block_t root, 209*4882a593Smuzhiyun bool prefetch_leaves, struct dm_btree_cursor *c); 210*4882a593Smuzhiyun void dm_btree_cursor_end(struct dm_btree_cursor *c); 211*4882a593Smuzhiyun int dm_btree_cursor_next(struct dm_btree_cursor *c); 212*4882a593Smuzhiyun int dm_btree_cursor_skip(struct dm_btree_cursor *c, uint32_t count); 213*4882a593Smuzhiyun int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le); 214*4882a593Smuzhiyun 215*4882a593Smuzhiyun #endif /* _LINUX_DM_BTREE_H */ 216