xref: /OK3568_Linux_fs/kernel/drivers/md/dm-thin-metadata.c (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun /*
2*4882a593Smuzhiyun  * Copyright (C) 2011-2012 Red Hat, Inc.
3*4882a593Smuzhiyun  *
4*4882a593Smuzhiyun  * This file is released under the GPL.
5*4882a593Smuzhiyun  */
6*4882a593Smuzhiyun 
7*4882a593Smuzhiyun #include "dm-thin-metadata.h"
8*4882a593Smuzhiyun #include "persistent-data/dm-btree.h"
9*4882a593Smuzhiyun #include "persistent-data/dm-space-map.h"
10*4882a593Smuzhiyun #include "persistent-data/dm-space-map-disk.h"
11*4882a593Smuzhiyun #include "persistent-data/dm-transaction-manager.h"
12*4882a593Smuzhiyun 
13*4882a593Smuzhiyun #include <linux/list.h>
14*4882a593Smuzhiyun #include <linux/device-mapper.h>
15*4882a593Smuzhiyun #include <linux/workqueue.h>
16*4882a593Smuzhiyun 
17*4882a593Smuzhiyun /*--------------------------------------------------------------------------
18*4882a593Smuzhiyun  * As far as the metadata goes, there is:
19*4882a593Smuzhiyun  *
20*4882a593Smuzhiyun  * - A superblock in block zero, taking up fewer than 512 bytes for
21*4882a593Smuzhiyun  *   atomic writes.
22*4882a593Smuzhiyun  *
23*4882a593Smuzhiyun  * - A space map managing the metadata blocks.
24*4882a593Smuzhiyun  *
25*4882a593Smuzhiyun  * - A space map managing the data blocks.
26*4882a593Smuzhiyun  *
27*4882a593Smuzhiyun  * - A btree mapping our internal thin dev ids onto struct disk_device_details.
28*4882a593Smuzhiyun  *
29*4882a593Smuzhiyun  * - A hierarchical btree, with 2 levels which effectively maps (thin
30*4882a593Smuzhiyun  *   dev id, virtual block) -> block_time.  Block time is a 64-bit
31*4882a593Smuzhiyun  *   field holding the time in the low 24 bits, and block in the top 40
32*4882a593Smuzhiyun  *   bits.
33*4882a593Smuzhiyun  *
34*4882a593Smuzhiyun  * BTrees consist solely of btree_nodes, that fill a block.  Some are
35*4882a593Smuzhiyun  * internal nodes, as such their values are a __le64 pointing to other
36*4882a593Smuzhiyun  * nodes.  Leaf nodes can store data of any reasonable size (ie. much
37*4882a593Smuzhiyun  * smaller than the block size).  The nodes consist of the header,
38*4882a593Smuzhiyun  * followed by an array of keys, followed by an array of values.  We have
39*4882a593Smuzhiyun  * to binary search on the keys so they're all held together to help the
40*4882a593Smuzhiyun  * cpu cache.
41*4882a593Smuzhiyun  *
42*4882a593Smuzhiyun  * Space maps have 2 btrees:
43*4882a593Smuzhiyun  *
44*4882a593Smuzhiyun  * - One maps a uint64_t onto a struct index_entry.  Which points to a
45*4882a593Smuzhiyun  *   bitmap block, and has some details about how many free entries there
46*4882a593Smuzhiyun  *   are etc.
47*4882a593Smuzhiyun  *
48*4882a593Smuzhiyun  * - The bitmap blocks have a header (for the checksum).  Then the rest
49*4882a593Smuzhiyun  *   of the block is pairs of bits.  With the meaning being:
50*4882a593Smuzhiyun  *
51*4882a593Smuzhiyun  *   0 - ref count is 0
52*4882a593Smuzhiyun  *   1 - ref count is 1
53*4882a593Smuzhiyun  *   2 - ref count is 2
54*4882a593Smuzhiyun  *   3 - ref count is higher than 2
55*4882a593Smuzhiyun  *
56*4882a593Smuzhiyun  * - If the count is higher than 2 then the ref count is entered in a
57*4882a593Smuzhiyun  *   second btree that directly maps the block_address to a uint32_t ref
58*4882a593Smuzhiyun  *   count.
59*4882a593Smuzhiyun  *
 * The space map metadata variant doesn't have a bitmaps btree.  Instead
 * it has one single block's worth of index_entries.  This avoids
 * recursive issues with the bitmap btree needing to allocate space in
 * order to insert.  With a small data block size such as 64k the
 * metadata can support data devices that are hundreds of terabytes.
65*4882a593Smuzhiyun  *
66*4882a593Smuzhiyun  * The space maps allocate space linearly from front to back.  Space that
67*4882a593Smuzhiyun  * is freed in a transaction is never recycled within that transaction.
68*4882a593Smuzhiyun  * To try and avoid fragmenting _free_ space the allocator always goes
69*4882a593Smuzhiyun  * back and fills in gaps.
70*4882a593Smuzhiyun  *
71*4882a593Smuzhiyun  * All metadata io is in THIN_METADATA_BLOCK_SIZE sized/aligned chunks
72*4882a593Smuzhiyun  * from the block manager.
73*4882a593Smuzhiyun  *--------------------------------------------------------------------------*/
74*4882a593Smuzhiyun 
75*4882a593Smuzhiyun #define DM_MSG_PREFIX   "thin metadata"
76*4882a593Smuzhiyun 
77*4882a593Smuzhiyun #define THIN_SUPERBLOCK_MAGIC 27022010
78*4882a593Smuzhiyun #define THIN_SUPERBLOCK_LOCATION 0
79*4882a593Smuzhiyun #define THIN_VERSION 2
80*4882a593Smuzhiyun #define SECTOR_TO_BLOCK_SHIFT 3
81*4882a593Smuzhiyun 
82*4882a593Smuzhiyun /*
83*4882a593Smuzhiyun  * For btree insert:
84*4882a593Smuzhiyun  *  3 for btree insert +
85*4882a593Smuzhiyun  *  2 for btree lookup used within space map
86*4882a593Smuzhiyun  * For btree remove:
87*4882a593Smuzhiyun  *  2 for shadow spine +
88*4882a593Smuzhiyun  *  4 for rebalance 3 child node
89*4882a593Smuzhiyun  */
90*4882a593Smuzhiyun #define THIN_MAX_CONCURRENT_LOCKS 6
91*4882a593Smuzhiyun 
92*4882a593Smuzhiyun /* This should be plenty */
93*4882a593Smuzhiyun #define SPACE_MAP_ROOT_SIZE 128
94*4882a593Smuzhiyun 
95*4882a593Smuzhiyun /*
96*4882a593Smuzhiyun  * Little endian on-disk superblock and device details.
97*4882a593Smuzhiyun  */
/*
 * On-disk superblock, stored in metadata block 0 (THIN_SUPERBLOCK_LOCATION).
 * All multi-byte fields are little endian; layout is fixed (__packed) and
 * must never be reordered.
 */
struct thin_disk_superblock {
	__le32 csum;	/* Checksum of superblock except for this field. */
	__le32 flags;
	__le64 blocknr;	/* This block number, dm_block_t. */

	__u8 uuid[16];
	__le64 magic;	/* THIN_SUPERBLOCK_MAGIC */
	__le32 version;	/* On-disk format version (THIN_VERSION when created here). */
	__le32 time;	/* Pool time; held in the low 24 bits of each block_time. */

	__le64 trans_id;

	/*
	 * Root held by userspace transactions.
	 */
	__le64 held_root;

	/* Packed space map roots, copied in via copy_sm_roots() at commit. */
	__u8 data_space_map_root[SPACE_MAP_ROOT_SIZE];
	__u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];

	/*
	 * 2-level btree mapping (dev_id, (dev block, time)) -> data block
	 */
	__le64 data_mapping_root;

	/*
	 * Device detail root mapping dev_id -> device_details
	 */
	__le64 device_details_root;

	__le32 data_block_size;		/* In 512-byte sectors. */

	__le32 metadata_block_size;	/* In 512-byte sectors. */
	__le64 metadata_nr_blocks;

	/* Feature flag words checked by __check_incompat_features(). */
	__le32 compat_flags;
	__le32 compat_ro_flags;
	__le32 incompat_flags;
} __packed;
137*4882a593Smuzhiyun 
/*
 * On-disk per-thin-device record; the value type of the device details
 * btree (dev_id -> disk_device_details, see details_info in
 * __setup_btree_details()).  Mirrored in core by struct dm_thin_device.
 */
struct disk_device_details {
	__le64 mapped_blocks;		/* Presumably the count of mapped data blocks - confirm against callers. */
	__le64 transaction_id;		/* When created. */
	__le32 creation_time;		/* Pool time at creation (24-bit time units - TODO confirm). */
	__le32 snapshotted_time;
} __packed;
144*4882a593Smuzhiyun 
/*
 * In-core state for one open pool's metadata device.
 */
struct dm_pool_metadata {
	struct hlist_node hash;		/* Presumably links into a table of open pools - confirm against callers. */

	struct block_device *bdev;	/* The metadata device itself. */
	struct dm_block_manager *bm;	/* Block manager for all metadata io. */
	struct dm_space_map *metadata_sm;	/* Allocator for metadata blocks. */
	struct dm_space_map *data_sm;	/* Allocator for data blocks. */
	struct dm_transaction_manager *tm;	/* Read/write transaction manager. */
	struct dm_transaction_manager *nb_tm;	/* Non-blocking clone of tm, for nb_info. */

	/*
	 * Two-level btree.
	 * First level holds thin_dev_t.
	 * Second level holds mappings.
	 */
	struct dm_btree_info info;

	/*
	 * Non-blocking version of the above.
	 */
	struct dm_btree_info nb_info;

	/*
	 * Just the top level for deleting whole devices.
	 */
	struct dm_btree_info tl_info;

	/*
	 * Just the bottom level for creating new devices.
	 */
	struct dm_btree_info bl_info;

	/*
	 * Describes the device details btree.
	 */
	struct dm_btree_info details_info;

	struct rw_semaphore root_lock;	/* Guards the fields below; see pmd_write_lock*(). */
	uint32_t time;			/* Current pool time, stored in the low 24 bits of block_time. */
	dm_block_t root;		/* Root of the mapping btree (superblock data_mapping_root). */
	dm_block_t details_root;	/* Root of the details btree (superblock device_details_root). */
	struct list_head thin_devices;	/* Open struct dm_thin_device instances. */
	uint64_t trans_id;		/* Mirrors superblock trans_id. */
	unsigned long flags;		/* Presumably mirrors superblock flags - confirm. */
	sector_t data_block_size;	/* In 512-byte sectors (see superblock field). */

	/*
	 * Pre-commit callback.
	 *
	 * This allows the thin provisioning target to run a callback before
	 * the metadata are committed.
	 */
	dm_pool_pre_commit_fn pre_commit_fn;
	void *pre_commit_context;

	/*
	 * We reserve a section of the metadata for commit overhead.
	 * All reported space does *not* include this.
	 */
	dm_block_t metadata_reserve;

	/*
	 * Set if a transaction has to be aborted but the attempt to roll back
	 * to the previous (good) transaction failed.  The only pool metadata
	 * operation possible in this state is the closing of the device.
	 */
	bool fail_io:1;

	/*
	 * Set once a thin-pool has been accessed through one of the interfaces
	 * that imply the pool is in-service (e.g. thin devices created/deleted,
	 * thin-pool message, metadata snapshots, etc).
	 */
	bool in_service:1;

	/*
	 * Reading the space map roots can fail, so we read it into these
	 * buffers before the superblock is locked and updated.
	 */
	__u8 data_space_map_root[SPACE_MAP_ROOT_SIZE];
	__u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
};
227*4882a593Smuzhiyun 
/*
 * In-core state for one open thin device; the cached counterpart of the
 * on-disk struct disk_device_details.
 */
struct dm_thin_device {
	struct list_head list;		/* Entry in pmd->thin_devices. */
	struct dm_pool_metadata *pmd;	/* Owning pool. */
	dm_thin_id id;			/* Key in the device details btree. */

	int open_count;			/* Number of concurrent opens of this device. */
	bool changed:1;			/* Details modified since the last sync - TODO confirm against callers. */
	bool aborted_with_changes:1;	/* A transaction abort discarded pending changes - TODO confirm. */
	uint64_t mapped_blocks;
	uint64_t transaction_id;
	uint32_t creation_time;
	uint32_t snapshotted_time;
};
241*4882a593Smuzhiyun 
242*4882a593Smuzhiyun /*----------------------------------------------------------------
243*4882a593Smuzhiyun  * superblock validator
244*4882a593Smuzhiyun  *--------------------------------------------------------------*/
245*4882a593Smuzhiyun 
246*4882a593Smuzhiyun #define SUPERBLOCK_CSUM_XOR 160774
247*4882a593Smuzhiyun 
/*
 * Write-side validator hook for the superblock: stamp the block's own
 * location into the header and recompute the checksum over everything
 * after the csum field itself.
 */
static void sb_prepare_for_write(struct dm_block_validator *v,
				 struct dm_block *b,
				 size_t block_size)
{
	struct thin_disk_superblock *sb = dm_block_data(b);
	size_t csummed_len = block_size - sizeof(__le32);

	sb->blocknr = cpu_to_le64(dm_block_location(b));
	sb->csum = cpu_to_le32(dm_bm_checksum(&sb->flags, csummed_len,
					      SUPERBLOCK_CSUM_XOR));
}
259*4882a593Smuzhiyun 
/*
 * Read-side validator hook for the superblock: verify the block is where
 * it claims to be, carries the thin-pool magic, and that the checksum
 * matches.  Returns 0 on success, -ENOTBLK on location mismatch,
 * -EILSEQ on bad magic or checksum.
 */
static int sb_check(struct dm_block_validator *v,
		    struct dm_block *b,
		    size_t block_size)
{
	struct thin_disk_superblock *disk_super = dm_block_data(b);
	__le32 csum_le;

	/* The superblock records its own block number; a mismatch means we
	 * read the wrong block. */
	if (dm_block_location(b) != le64_to_cpu(disk_super->blocknr)) {
		DMERR("sb_check failed: blocknr %llu: "
		      "wanted %llu", le64_to_cpu(disk_super->blocknr),
		      (unsigned long long)dm_block_location(b));
		return -ENOTBLK;
	}

	/* Not a thin-pool superblock at all. */
	if (le64_to_cpu(disk_super->magic) != THIN_SUPERBLOCK_MAGIC) {
		DMERR("sb_check failed: magic %llu: "
		      "wanted %llu", le64_to_cpu(disk_super->magic),
		      (unsigned long long)THIN_SUPERBLOCK_MAGIC);
		return -EILSEQ;
	}

	/* Checksum covers the block from 'flags' onwards, i.e. everything
	 * except the csum field itself. */
	csum_le = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
					     block_size - sizeof(__le32),
					     SUPERBLOCK_CSUM_XOR));
	if (csum_le != disk_super->csum) {
		DMERR("sb_check failed: csum %u: wanted %u",
		      le32_to_cpu(csum_le), le32_to_cpu(disk_super->csum));
		return -EILSEQ;
	}

	return 0;
}
292*4882a593Smuzhiyun 
/*
 * Validator passed to the block manager for superblock accesses: stamps
 * location + checksum on write, verifies location/magic/checksum on read.
 */
static struct dm_block_validator sb_validator = {
	.name = "superblock",
	.prepare_for_write = sb_prepare_for_write,
	.check = sb_check
};
298*4882a593Smuzhiyun 
299*4882a593Smuzhiyun /*----------------------------------------------------------------
300*4882a593Smuzhiyun  * Methods for the btree value types
301*4882a593Smuzhiyun  *--------------------------------------------------------------*/
302*4882a593Smuzhiyun 
/*
 * Pack a (block, time) pair into one 64-bit value: block in the top 40
 * bits, time in the low 24.  Assumes t fits in 24 bits - callers keep
 * pool time within that range (TODO confirm).
 */
static uint64_t pack_block_time(dm_block_t b, uint32_t t)
{
	uint64_t bt = b;

	bt <<= 24;
	bt |= t;
	return bt;
}
307*4882a593Smuzhiyun 
/*
 * Split a packed block_time back into its block (top 40 bits) and time
 * (low 24 bits) components.  Inverse of pack_block_time().
 */
static void unpack_block_time(uint64_t v, dm_block_t *b, uint32_t *t)
{
	*b = v >> 24;
	*t = (uint32_t)(v & 0xffffff);
}
313*4882a593Smuzhiyun 
data_block_inc(void * context,const void * value_le)314*4882a593Smuzhiyun static void data_block_inc(void *context, const void *value_le)
315*4882a593Smuzhiyun {
316*4882a593Smuzhiyun 	struct dm_space_map *sm = context;
317*4882a593Smuzhiyun 	__le64 v_le;
318*4882a593Smuzhiyun 	uint64_t b;
319*4882a593Smuzhiyun 	uint32_t t;
320*4882a593Smuzhiyun 
321*4882a593Smuzhiyun 	memcpy(&v_le, value_le, sizeof(v_le));
322*4882a593Smuzhiyun 	unpack_block_time(le64_to_cpu(v_le), &b, &t);
323*4882a593Smuzhiyun 	dm_sm_inc_block(sm, b);
324*4882a593Smuzhiyun }
325*4882a593Smuzhiyun 
data_block_dec(void * context,const void * value_le)326*4882a593Smuzhiyun static void data_block_dec(void *context, const void *value_le)
327*4882a593Smuzhiyun {
328*4882a593Smuzhiyun 	struct dm_space_map *sm = context;
329*4882a593Smuzhiyun 	__le64 v_le;
330*4882a593Smuzhiyun 	uint64_t b;
331*4882a593Smuzhiyun 	uint32_t t;
332*4882a593Smuzhiyun 
333*4882a593Smuzhiyun 	memcpy(&v_le, value_le, sizeof(v_le));
334*4882a593Smuzhiyun 	unpack_block_time(le64_to_cpu(v_le), &b, &t);
335*4882a593Smuzhiyun 	dm_sm_dec_block(sm, b);
336*4882a593Smuzhiyun }
337*4882a593Smuzhiyun 
data_block_equal(void * context,const void * value1_le,const void * value2_le)338*4882a593Smuzhiyun static int data_block_equal(void *context, const void *value1_le, const void *value2_le)
339*4882a593Smuzhiyun {
340*4882a593Smuzhiyun 	__le64 v1_le, v2_le;
341*4882a593Smuzhiyun 	uint64_t b1, b2;
342*4882a593Smuzhiyun 	uint32_t t;
343*4882a593Smuzhiyun 
344*4882a593Smuzhiyun 	memcpy(&v1_le, value1_le, sizeof(v1_le));
345*4882a593Smuzhiyun 	memcpy(&v2_le, value2_le, sizeof(v2_le));
346*4882a593Smuzhiyun 	unpack_block_time(le64_to_cpu(v1_le), &b1, &t);
347*4882a593Smuzhiyun 	unpack_block_time(le64_to_cpu(v2_le), &b2, &t);
348*4882a593Smuzhiyun 
349*4882a593Smuzhiyun 	return b1 == b2;
350*4882a593Smuzhiyun }
351*4882a593Smuzhiyun 
subtree_inc(void * context,const void * value)352*4882a593Smuzhiyun static void subtree_inc(void *context, const void *value)
353*4882a593Smuzhiyun {
354*4882a593Smuzhiyun 	struct dm_btree_info *info = context;
355*4882a593Smuzhiyun 	__le64 root_le;
356*4882a593Smuzhiyun 	uint64_t root;
357*4882a593Smuzhiyun 
358*4882a593Smuzhiyun 	memcpy(&root_le, value, sizeof(root_le));
359*4882a593Smuzhiyun 	root = le64_to_cpu(root_le);
360*4882a593Smuzhiyun 	dm_tm_inc(info->tm, root);
361*4882a593Smuzhiyun }
362*4882a593Smuzhiyun 
subtree_dec(void * context,const void * value)363*4882a593Smuzhiyun static void subtree_dec(void *context, const void *value)
364*4882a593Smuzhiyun {
365*4882a593Smuzhiyun 	struct dm_btree_info *info = context;
366*4882a593Smuzhiyun 	__le64 root_le;
367*4882a593Smuzhiyun 	uint64_t root;
368*4882a593Smuzhiyun 
369*4882a593Smuzhiyun 	memcpy(&root_le, value, sizeof(root_le));
370*4882a593Smuzhiyun 	root = le64_to_cpu(root_le);
371*4882a593Smuzhiyun 	if (dm_btree_del(info, root))
372*4882a593Smuzhiyun 		DMERR("btree delete failed");
373*4882a593Smuzhiyun }
374*4882a593Smuzhiyun 
subtree_equal(void * context,const void * value1_le,const void * value2_le)375*4882a593Smuzhiyun static int subtree_equal(void *context, const void *value1_le, const void *value2_le)
376*4882a593Smuzhiyun {
377*4882a593Smuzhiyun 	__le64 v1_le, v2_le;
378*4882a593Smuzhiyun 	memcpy(&v1_le, value1_le, sizeof(v1_le));
379*4882a593Smuzhiyun 	memcpy(&v2_le, value2_le, sizeof(v2_le));
380*4882a593Smuzhiyun 
381*4882a593Smuzhiyun 	return v1_le == v2_le;
382*4882a593Smuzhiyun }
383*4882a593Smuzhiyun 
384*4882a593Smuzhiyun /*----------------------------------------------------------------*/
385*4882a593Smuzhiyun 
386*4882a593Smuzhiyun /*
387*4882a593Smuzhiyun  * Variant that is used for in-core only changes or code that
388*4882a593Smuzhiyun  * shouldn't put the pool in service on its own (e.g. commit).
389*4882a593Smuzhiyun  */
/*
 * Take the metadata write lock without marking the pool in-service.
 * Use this for in-core only changes or code that shouldn't put the pool
 * in service on its own (e.g. commit); see pmd_write_lock() otherwise.
 */
static inline void pmd_write_lock_in_core(struct dm_pool_metadata *pmd)
	__acquires(pmd->root_lock)
{
	down_write(&pmd->root_lock);
}
395*4882a593Smuzhiyun 
/*
 * Take the metadata write lock and mark the pool as in-service: callers
 * coming through here are performing operations that imply the pool is
 * actively used (see dm_pool_metadata::in_service).
 */
static inline void pmd_write_lock(struct dm_pool_metadata *pmd)
{
	pmd_write_lock_in_core(pmd);
	/* Only store when the flag actually changes. */
	if (unlikely(!pmd->in_service))
		pmd->in_service = true;
}
402*4882a593Smuzhiyun 
/* Release the metadata write lock taken by either pmd_write_lock variant. */
static inline void pmd_write_unlock(struct dm_pool_metadata *pmd)
	__releases(pmd->root_lock)
{
	up_write(&pmd->root_lock);
}
408*4882a593Smuzhiyun 
409*4882a593Smuzhiyun /*----------------------------------------------------------------*/
410*4882a593Smuzhiyun 
/*
 * Write-lock the superblock via the _zero variant, which presumably skips
 * reading/validating the existing contents - used when the block is about
 * to be completely rewritten (see __write_initial_superblock()).
 */
static int superblock_lock_zero(struct dm_pool_metadata *pmd,
				struct dm_block **sblock)
{
	return dm_bm_write_lock_zero(pmd->bm, THIN_SUPERBLOCK_LOCATION,
				     &sb_validator, sblock);
}
417*4882a593Smuzhiyun 
/*
 * Write-lock the existing superblock, validating it via sb_validator on
 * the way in.
 */
static int superblock_lock(struct dm_pool_metadata *pmd,
			   struct dm_block **sblock)
{
	return dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
				&sb_validator, sblock);
}
424*4882a593Smuzhiyun 
__superblock_all_zeroes(struct dm_block_manager * bm,int * result)425*4882a593Smuzhiyun static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result)
426*4882a593Smuzhiyun {
427*4882a593Smuzhiyun 	int r;
428*4882a593Smuzhiyun 	unsigned i;
429*4882a593Smuzhiyun 	struct dm_block *b;
430*4882a593Smuzhiyun 	__le64 *data_le, zero = cpu_to_le64(0);
431*4882a593Smuzhiyun 	unsigned block_size = dm_bm_block_size(bm) / sizeof(__le64);
432*4882a593Smuzhiyun 
433*4882a593Smuzhiyun 	/*
434*4882a593Smuzhiyun 	 * We can't use a validator here - it may be all zeroes.
435*4882a593Smuzhiyun 	 */
436*4882a593Smuzhiyun 	r = dm_bm_read_lock(bm, THIN_SUPERBLOCK_LOCATION, NULL, &b);
437*4882a593Smuzhiyun 	if (r)
438*4882a593Smuzhiyun 		return r;
439*4882a593Smuzhiyun 
440*4882a593Smuzhiyun 	data_le = dm_block_data(b);
441*4882a593Smuzhiyun 	*result = 1;
442*4882a593Smuzhiyun 	for (i = 0; i < block_size; i++) {
443*4882a593Smuzhiyun 		if (data_le[i] != zero) {
444*4882a593Smuzhiyun 			*result = 0;
445*4882a593Smuzhiyun 			break;
446*4882a593Smuzhiyun 		}
447*4882a593Smuzhiyun 	}
448*4882a593Smuzhiyun 
449*4882a593Smuzhiyun 	dm_bm_unlock(b);
450*4882a593Smuzhiyun 
451*4882a593Smuzhiyun 	return 0;
452*4882a593Smuzhiyun }
453*4882a593Smuzhiyun 
/*
 * Wire up the five dm_btree_info descriptors used by the pool.  Must be
 * called after pmd->tm, pmd->nb_tm and pmd->data_sm exist.
 */
static void __setup_btree_details(struct dm_pool_metadata *pmd)
{
	/*
	 * Main 2-level mapping tree: leaf values are packed block_times, so
	 * inc/dec adjust the data block's refcount in the data space map.
	 */
	pmd->info.tm = pmd->tm;
	pmd->info.levels = 2;
	pmd->info.value_type.context = pmd->data_sm;
	pmd->info.value_type.size = sizeof(__le64);
	pmd->info.value_type.inc = data_block_inc;
	pmd->info.value_type.dec = data_block_dec;
	pmd->info.value_type.equal = data_block_equal;

	/* Non-blocking variant: identical except it uses the nb clone tm. */
	memcpy(&pmd->nb_info, &pmd->info, sizeof(pmd->nb_info));
	pmd->nb_info.tm = pmd->nb_tm;

	/*
	 * Top level only: values are roots of bottom-level subtrees, so
	 * inc/dec share or delete whole subtrees (used for device removal).
	 */
	pmd->tl_info.tm = pmd->tm;
	pmd->tl_info.levels = 1;
	pmd->tl_info.value_type.context = &pmd->bl_info;
	pmd->tl_info.value_type.size = sizeof(__le64);
	pmd->tl_info.value_type.inc = subtree_inc;
	pmd->tl_info.value_type.dec = subtree_dec;
	pmd->tl_info.value_type.equal = subtree_equal;

	/* Bottom level only: same value semantics as the 2-level tree. */
	pmd->bl_info.tm = pmd->tm;
	pmd->bl_info.levels = 1;
	pmd->bl_info.value_type.context = pmd->data_sm;
	pmd->bl_info.value_type.size = sizeof(__le64);
	pmd->bl_info.value_type.inc = data_block_inc;
	pmd->bl_info.value_type.dec = data_block_dec;
	pmd->bl_info.value_type.equal = data_block_equal;

	/*
	 * Device details tree: plain struct values, no reference counting
	 * required so all value_type hooks are NULL.
	 */
	pmd->details_info.tm = pmd->tm;
	pmd->details_info.levels = 1;
	pmd->details_info.value_type.context = NULL;
	pmd->details_info.value_type.size = sizeof(struct disk_device_details);
	pmd->details_info.value_type.inc = NULL;
	pmd->details_info.value_type.dec = NULL;
	pmd->details_info.value_type.equal = NULL;
}
491*4882a593Smuzhiyun 
/*
 * Snapshot both space map roots into the pmd's staging buffers.  Reading
 * a root can fail, so this is done before the superblock is locked and
 * updated (see the buffer declarations in struct dm_pool_metadata);
 * copy_sm_roots() later transfers them into the on-disk superblock.
 * Returns 0 on success or a negative error.
 */
static int save_sm_roots(struct dm_pool_metadata *pmd)
{
	int r;
	size_t len;

	r = dm_sm_root_size(pmd->metadata_sm, &len);
	if (r < 0)
		return r;

	r = dm_sm_copy_root(pmd->metadata_sm, &pmd->metadata_space_map_root, len);
	if (r < 0)
		return r;

	r = dm_sm_root_size(pmd->data_sm, &len);
	if (r < 0)
		return r;

	return dm_sm_copy_root(pmd->data_sm, &pmd->data_space_map_root, len);
}
511*4882a593Smuzhiyun 
/*
 * Copy the space map roots previously staged by save_sm_roots() into the
 * (already write-locked) on-disk superblock.  Cannot fail, which is why
 * the fallible reads happen earlier in save_sm_roots().
 */
static void copy_sm_roots(struct dm_pool_metadata *pmd,
			  struct thin_disk_superblock *disk)
{
	memcpy(&disk->metadata_space_map_root,
	       &pmd->metadata_space_map_root,
	       sizeof(pmd->metadata_space_map_root));

	memcpy(&disk->data_space_map_root,
	       &pmd->data_space_map_root,
	       sizeof(pmd->data_space_map_root));
}
523*4882a593Smuzhiyun 
/*
 * Write the very first superblock for a freshly formatted pool and commit
 * the opening transaction.  Caller must have created the space maps and
 * empty btrees first (__format_metadata()).  Returns 0 or a negative
 * error.
 */
static int __write_initial_superblock(struct dm_pool_metadata *pmd)
{
	int r;
	struct dm_block *sblock;
	struct thin_disk_superblock *disk_super;
	sector_t bdev_size = i_size_read(pmd->bdev->bd_inode) >> SECTOR_SHIFT;

	/* Clamp the recorded metadata size to the supported maximum. */
	if (bdev_size > THIN_METADATA_MAX_SECTORS)
		bdev_size = THIN_METADATA_MAX_SECTORS;

	/* Flush the data space map so its root is stable ... */
	r = dm_sm_commit(pmd->data_sm);
	if (r < 0)
		return r;

	/* ... and let the transaction manager write everything but block 0. */
	r = dm_tm_pre_commit(pmd->tm);
	if (r < 0)
		return r;

	/* Stage the space map roots before taking the superblock lock. */
	r = save_sm_roots(pmd);
	if (r < 0)
		return r;

	/* Zero-lock: there is no valid superblock to validate yet. */
	r = superblock_lock_zero(pmd, &sblock);
	if (r)
		return r;

	disk_super = dm_block_data(sblock);
	disk_super->flags = 0;
	memset(disk_super->uuid, 0, sizeof(disk_super->uuid));
	disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC);
	disk_super->version = cpu_to_le32(THIN_VERSION);
	disk_super->time = 0;
	disk_super->trans_id = 0;
	disk_super->held_root = 0;

	copy_sm_roots(pmd, disk_super);

	disk_super->data_mapping_root = cpu_to_le64(pmd->root);
	disk_super->device_details_root = cpu_to_le64(pmd->details_root);
	disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE);
	disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT);
	disk_super->data_block_size = cpu_to_le32(pmd->data_block_size);

	/*
	 * Commits the transaction; sblock is handed over to dm_tm_commit
	 * (presumably released there - no explicit unlock here).
	 */
	return dm_tm_commit(pmd->tm, sblock);
}
569*4882a593Smuzhiyun 
/*
 * Format an empty metadata device: create the transaction manager and
 * both space maps, build empty mapping and device-details btrees, then
 * write the initial superblock.  On failure the partially constructed
 * objects are torn down in reverse order via the goto chain.
 */
static int __format_metadata(struct dm_pool_metadata *pmd)
{
	int r;

	/* Creates both pmd->tm and the metadata space map together. */
	r = dm_tm_create_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION,
				 &pmd->tm, &pmd->metadata_sm);
	if (r < 0) {
		DMERR("tm_create_with_sm failed");
		return r;
	}

	/* Data space map starts empty; it is resized later by callers. */
	pmd->data_sm = dm_sm_disk_create(pmd->tm, 0);
	if (IS_ERR(pmd->data_sm)) {
		DMERR("sm_disk_create failed");
		r = PTR_ERR(pmd->data_sm);
		goto bad_cleanup_tm;
	}

	pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm);
	if (!pmd->nb_tm) {
		DMERR("could not create non-blocking clone tm");
		r = -ENOMEM;
		goto bad_cleanup_data_sm;
	}

	/* Requires tm, nb_tm and data_sm to be in place. */
	__setup_btree_details(pmd);

	r = dm_btree_empty(&pmd->info, &pmd->root);
	if (r < 0)
		goto bad_cleanup_nb_tm;

	r = dm_btree_empty(&pmd->details_info, &pmd->details_root);
	if (r < 0) {
		DMERR("couldn't create devices root");
		goto bad_cleanup_nb_tm;
	}

	/* Persist everything; the pool is usable once this succeeds. */
	r = __write_initial_superblock(pmd);
	if (r)
		goto bad_cleanup_nb_tm;

	return 0;

bad_cleanup_nb_tm:
	dm_tm_destroy(pmd->nb_tm);
bad_cleanup_data_sm:
	dm_sm_destroy(pmd->data_sm);
bad_cleanup_tm:
	dm_tm_destroy(pmd->tm);
	dm_sm_destroy(pmd->metadata_sm);

	return r;
}
623*4882a593Smuzhiyun 
/*
 * Refuse to use metadata carrying feature flags we don't understand:
 * unknown incompat flags always fail; unknown compat_ro flags fail only
 * when the device is writable.  Returns 0 if the metadata is usable,
 * -EINVAL otherwise.
 */
static int __check_incompat_features(struct thin_disk_superblock *disk_super,
				     struct dm_pool_metadata *pmd)
{
	uint32_t features;

	/* Any unsupported incompat bit makes the metadata unreadable. */
	features = le32_to_cpu(disk_super->incompat_flags) & ~THIN_FEATURE_INCOMPAT_SUPP;
	if (features) {
		DMERR("could not access metadata due to unsupported optional features (%lx).",
		      (unsigned long)features);
		return -EINVAL;
	}

	/*
	 * Check for read-only metadata to skip the following RDWR checks.
	 */
	if (get_disk_ro(pmd->bdev->bd_disk))
		return 0;

	/* Unsupported compat_ro bits only block read-write access. */
	features = le32_to_cpu(disk_super->compat_ro_flags) & ~THIN_FEATURE_COMPAT_RO_SUPP;
	if (features) {
		DMERR("could not access metadata RDWR due to unsupported optional features (%lx).",
		      (unsigned long)features);
		return -EINVAL;
	}

	return 0;
}
651*4882a593Smuzhiyun 
/*
 * Open existing metadata: read the superblock, validate it, and rebuild
 * the in-core transaction manager, space maps and non-blocking clone
 * from the on-disk roots.  On failure, everything constructed so far is
 * torn down in reverse order via the goto chain.
 */
static int __open_metadata(struct dm_pool_metadata *pmd)
{
	int r;
	struct dm_block *sblock;
	struct thin_disk_superblock *disk_super;

	r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
			    &sb_validator, &sblock);
	if (r < 0) {
		DMERR("couldn't read superblock");
		return r;
	}

	disk_super = dm_block_data(sblock);

	/* Verify the data block size hasn't changed */
	if (le32_to_cpu(disk_super->data_block_size) != pmd->data_block_size) {
		DMERR("changing the data block size (from %u to %llu) is not supported",
		      le32_to_cpu(disk_super->data_block_size),
		      (unsigned long long)pmd->data_block_size);
		r = -EINVAL;
		goto bad_unlock_sblock;
	}

	r = __check_incompat_features(disk_super, pmd);
	if (r < 0)
		goto bad_unlock_sblock;

	/* Recreates both the transaction manager and the metadata space map. */
	r = dm_tm_open_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION,
			       disk_super->metadata_space_map_root,
			       sizeof(disk_super->metadata_space_map_root),
			       &pmd->tm, &pmd->metadata_sm);
	if (r < 0) {
		DMERR("tm_open_with_sm failed");
		goto bad_unlock_sblock;
	}

	pmd->data_sm = dm_sm_disk_open(pmd->tm, disk_super->data_space_map_root,
				       sizeof(disk_super->data_space_map_root));
	if (IS_ERR(pmd->data_sm)) {
		DMERR("sm_disk_open failed");
		r = PTR_ERR(pmd->data_sm);
		goto bad_cleanup_tm;
	}

	pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm);
	if (!pmd->nb_tm) {
		DMERR("could not create non-blocking clone tm");
		r = -ENOMEM;
		goto bad_cleanup_data_sm;
	}

	__setup_btree_details(pmd);
	dm_bm_unlock(sblock);

	return 0;

bad_cleanup_data_sm:
	dm_sm_destroy(pmd->data_sm);
bad_cleanup_tm:
	dm_tm_destroy(pmd->tm);
	dm_sm_destroy(pmd->metadata_sm);
bad_unlock_sblock:
	dm_bm_unlock(sblock);

	return r;
}
719*4882a593Smuzhiyun 
/*
 * Decide between opening existing metadata and formatting a fresh
 * device, based on whether the superblock area is all zeroes.
 */
static int __open_or_format_metadata(struct dm_pool_metadata *pmd, bool format_device)
{
	int unformatted;
	int r = __superblock_all_zeroes(pmd->bm, &unformatted);

	if (r)
		return r;

	if (!unformatted)
		return __open_metadata(pmd);

	/* Brand new (all-zero) device: format only if the caller allows it. */
	return format_device ? __format_metadata(pmd) : -EPERM;
}
733*4882a593Smuzhiyun 
/*
 * Create the block manager and then open (or format) the rest of the
 * persistent-data stack on top of it.  On any failure the block
 * manager is destroyed and pmd->bm reset to NULL.
 */
static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool format_device)
{
	int r;

	pmd->bm = dm_block_manager_create(pmd->bdev,
					  THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
					  THIN_MAX_CONCURRENT_LOCKS);
	if (IS_ERR(pmd->bm)) {
		r = PTR_ERR(pmd->bm);
		DMERR("could not create block manager");
		pmd->bm = NULL;
		return r;
	}

	r = __open_or_format_metadata(pmd, format_device);
	if (!r)
		return 0;

	dm_block_manager_destroy(pmd->bm);
	pmd->bm = NULL;
	return r;
}
755*4882a593Smuzhiyun 
/*
 * Tear down the whole in-core persistent-data stack.  The order (space
 * maps, then transaction managers, then the block manager) is the
 * reverse of construction in __format_metadata()/__open_metadata().
 */
static void __destroy_persistent_data_objects(struct dm_pool_metadata *pmd)
{
	dm_sm_destroy(pmd->data_sm);
	dm_sm_destroy(pmd->metadata_sm);
	dm_tm_destroy(pmd->nb_tm);
	dm_tm_destroy(pmd->tm);
	dm_block_manager_destroy(pmd->bm);
}
764*4882a593Smuzhiyun 
/*
 * Start a new transaction by loading the current on-disk state (btree
 * roots, time, transaction id, flags, data block size) from the
 * superblock into the in-core pmd fields.
 */
static int __begin_transaction(struct dm_pool_metadata *pmd)
{
	int r;
	struct thin_disk_superblock *disk_super;
	struct dm_block *sblock;

	/*
	 * We re-read the superblock every time.  Shouldn't need to do this
	 * really.
	 */
	r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
			    &sb_validator, &sblock);
	if (r)
		return r;

	disk_super = dm_block_data(sblock);
	pmd->time = le32_to_cpu(disk_super->time);
	pmd->root = le64_to_cpu(disk_super->data_mapping_root);
	pmd->details_root = le64_to_cpu(disk_super->device_details_root);
	pmd->trans_id = le64_to_cpu(disk_super->trans_id);
	pmd->flags = le32_to_cpu(disk_super->flags);
	pmd->data_block_size = le32_to_cpu(disk_super->data_block_size);

	dm_bm_unlock(sblock);
	return 0;
}
791*4882a593Smuzhiyun 
/*
 * Flush the in-core details of every changed thin device into the
 * device-details btree.  Devices that are no longer open are also
 * unlinked from pmd->thin_devices and freed once written.
 */
static int __write_changed_details(struct dm_pool_metadata *pmd)
{
	int r;
	struct dm_thin_device *td, *tmp;
	struct disk_device_details details;
	uint64_t key;

	/* _safe iteration: entries may be deleted as we walk the list. */
	list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
		if (!td->changed)
			continue;

		key = td->id;

		/* Convert the in-core details to their on-disk (LE) layout. */
		details.mapped_blocks = cpu_to_le64(td->mapped_blocks);
		details.transaction_id = cpu_to_le64(td->transaction_id);
		details.creation_time = cpu_to_le32(td->creation_time);
		details.snapshotted_time = cpu_to_le32(td->snapshotted_time);
		__dm_bless_for_disk(&details);

		r = dm_btree_insert(&pmd->details_info, pmd->details_root,
				    &key, &details, &pmd->details_root);
		if (r)
			return r;

		if (td->open_count)
			td->changed = false;
		else {
			/* Nobody holds the device open any more: drop it. */
			list_del(&td->list);
			kfree(td);
		}
	}

	return 0;
}
826*4882a593Smuzhiyun 
/*
 * Commit all outstanding metadata changes: device details, space maps
 * and finally the superblock.  The steps must run in this order so the
 * superblock written by dm_tm_commit() only references fully flushed
 * trees.  Caller must hold pmd->root_lock (asserted below).
 */
static int __commit_transaction(struct dm_pool_metadata *pmd)
{
	int r;
	struct thin_disk_superblock *disk_super;
	struct dm_block *sblock;

	/*
	 * We need to know if the thin_disk_superblock exceeds a 512-byte sector.
	 */
	BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512);
	BUG_ON(!rwsem_is_locked(&pmd->root_lock));

	/* Nothing to commit if the pool never entered service. */
	if (unlikely(!pmd->in_service))
		return 0;

	if (pmd->pre_commit_fn) {
		r = pmd->pre_commit_fn(pmd->pre_commit_context);
		if (r < 0) {
			DMERR("pre-commit callback failed");
			return r;
		}
	}

	r = __write_changed_details(pmd);
	if (r < 0)
		return r;

	r = dm_sm_commit(pmd->data_sm);
	if (r < 0)
		return r;

	r = dm_tm_pre_commit(pmd->tm);
	if (r < 0)
		return r;

	r = save_sm_roots(pmd);
	if (r < 0)
		return r;

	r = superblock_lock(pmd, &sblock);
	if (r)
		return r;

	/* Publish the new roots and counters in the superblock. */
	disk_super = dm_block_data(sblock);
	disk_super->time = cpu_to_le32(pmd->time);
	disk_super->data_mapping_root = cpu_to_le64(pmd->root);
	disk_super->device_details_root = cpu_to_le64(pmd->details_root);
	disk_super->trans_id = cpu_to_le64(pmd->trans_id);
	disk_super->flags = cpu_to_le32(pmd->flags);

	copy_sm_roots(pmd, disk_super);

	/* dm_tm_commit() writes the superblock and releases sblock. */
	return dm_tm_commit(pmd->tm, sblock);
}
881*4882a593Smuzhiyun 
__set_metadata_reserve(struct dm_pool_metadata * pmd)882*4882a593Smuzhiyun static void __set_metadata_reserve(struct dm_pool_metadata *pmd)
883*4882a593Smuzhiyun {
884*4882a593Smuzhiyun 	int r;
885*4882a593Smuzhiyun 	dm_block_t total;
886*4882a593Smuzhiyun 	dm_block_t max_blocks = 4096; /* 16M */
887*4882a593Smuzhiyun 
888*4882a593Smuzhiyun 	r = dm_sm_get_nr_blocks(pmd->metadata_sm, &total);
889*4882a593Smuzhiyun 	if (r) {
890*4882a593Smuzhiyun 		DMERR("could not get size of metadata device");
891*4882a593Smuzhiyun 		pmd->metadata_reserve = max_blocks;
892*4882a593Smuzhiyun 	} else
893*4882a593Smuzhiyun 		pmd->metadata_reserve = min(max_blocks, div_u64(total, 10));
894*4882a593Smuzhiyun }
895*4882a593Smuzhiyun 
/*
 * Allocate and initialise a pool-metadata object for @bdev, opening
 * existing metadata or formatting the device if @format_device is set.
 * Returns the new pmd or an ERR_PTR on failure.
 */
struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
					       sector_t data_block_size,
					       bool format_device)
{
	struct dm_pool_metadata *pmd;
	int r;

	pmd = kmalloc(sizeof(*pmd), GFP_KERNEL);
	if (!pmd) {
		DMERR("could not allocate metadata struct");
		return ERR_PTR(-ENOMEM);
	}

	/* Set up the in-core state before touching the device. */
	init_rwsem(&pmd->root_lock);
	INIT_LIST_HEAD(&pmd->thin_devices);
	pmd->bdev = bdev;
	pmd->data_block_size = data_block_size;
	pmd->time = 0;
	pmd->fail_io = false;
	pmd->in_service = false;
	pmd->pre_commit_fn = NULL;
	pmd->pre_commit_context = NULL;

	r = __create_persistent_data_objects(pmd, format_device);
	if (r) {
		kfree(pmd);
		return ERR_PTR(r);
	}

	r = __begin_transaction(pmd);
	if (r < 0) {
		/* close() tears down everything built above and frees pmd. */
		if (dm_pool_metadata_close(pmd) < 0)
			DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
		return ERR_PTR(r);
	}

	__set_metadata_reserve(pmd);

	return pmd;
}
936*4882a593Smuzhiyun 
/*
 * Close the pool metadata: commit any outstanding transaction (unless
 * failed or read-only), tear down the persistent-data objects and free
 * pmd.  Returns -EBUSY without closing if any thin device is still open.
 */
int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
{
	int r;
	unsigned open_devices = 0;
	struct dm_thin_device *td, *tmp;

	down_read(&pmd->root_lock);
	list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
		if (td->open_count)
			open_devices++;
		else {
			/*
			 * NOTE(review): unused entries are unlinked while
			 * only the read lock is held — presumably safe here
			 * because no other users remain; confirm.
			 */
			list_del(&td->list);
			kfree(td);
		}
	}
	up_read(&pmd->root_lock);

	if (open_devices) {
		DMERR("attempt to close pmd when %u device(s) are still open",
		       open_devices);
		return -EBUSY;
	}

	pmd_write_lock_in_core(pmd);
	if (!pmd->fail_io && !dm_bm_is_read_only(pmd->bm)) {
		r = __commit_transaction(pmd);
		if (r < 0)
			DMWARN("%s: __commit_transaction() failed, error = %d",
			       __func__, r);
	}
	pmd_write_unlock(pmd);
	/* After fail_io is set the pd objects are assumed already unusable. */
	if (!pmd->fail_io)
		__destroy_persistent_data_objects(pmd);

	kfree(pmd);
	return 0;
}
974*4882a593Smuzhiyun 
/*
 * __open_device: Returns @td corresponding to device with id @dev,
 * creating it if @create is set and incrementing @td->open_count.
 * On failure, @td is undefined.
 */
static int __open_device(struct dm_pool_metadata *pmd,
			 dm_thin_id dev, int create,
			 struct dm_thin_device **td)
{
	int r, changed = 0;
	struct dm_thin_device *td2;
	uint64_t key = dev;
	struct disk_device_details details_le;

	/*
	 * If the device is already open, return it.
	 */
	list_for_each_entry(td2, &pmd->thin_devices, list)
		if (td2->id == dev) {
			/*
			 * May not create an already-open device.
			 */
			if (create)
				return -EEXIST;

			td2->open_count++;
			*td = td2;
			return 0;
		}

	/*
	 * Check the device exists.
	 */
	r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
			    &key, &details_le);
	if (r) {
		if (r != -ENODATA || !create)
			return r;

		/*
		 * Create new device.  Marking it changed ensures the fresh
		 * details get written out by __write_changed_details().
		 */
		changed = 1;
		details_le.mapped_blocks = 0;
		details_le.transaction_id = cpu_to_le64(pmd->trans_id);
		details_le.creation_time = cpu_to_le32(pmd->time);
		details_le.snapshotted_time = cpu_to_le32(pmd->time);
	}

	/* NOTE(review): GFP_NOIO — presumably reachable from the I/O path. */
	*td = kmalloc(sizeof(**td), GFP_NOIO);
	if (!*td)
		return -ENOMEM;

	/* Populate the in-core device from the on-disk (LE) details. */
	(*td)->pmd = pmd;
	(*td)->id = dev;
	(*td)->open_count = 1;
	(*td)->changed = changed;
	(*td)->aborted_with_changes = false;
	(*td)->mapped_blocks = le64_to_cpu(details_le.mapped_blocks);
	(*td)->transaction_id = le64_to_cpu(details_le.transaction_id);
	(*td)->creation_time = le32_to_cpu(details_le.creation_time);
	(*td)->snapshotted_time = le32_to_cpu(details_le.snapshotted_time);

	list_add(&(*td)->list, &pmd->thin_devices);

	return 0;
}
1042*4882a593Smuzhiyun 
__close_device(struct dm_thin_device * td)1043*4882a593Smuzhiyun static void __close_device(struct dm_thin_device *td)
1044*4882a593Smuzhiyun {
1045*4882a593Smuzhiyun 	--td->open_count;
1046*4882a593Smuzhiyun }
1047*4882a593Smuzhiyun 
/*
 * Create a brand new thin device @dev: an empty mapping btree inserted
 * into the top-level tree, with the device-details entry created as a
 * side effect of __open_device(..., create = 1, ...).  Returns -EEXIST
 * if a details entry for @dev already exists.
 */
static int __create_thin(struct dm_pool_metadata *pmd,
			 dm_thin_id dev)
{
	int r;
	dm_block_t dev_root;
	uint64_t key = dev;
	struct dm_thin_device *td;
	__le64 value;

	r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
			    &key, NULL);
	if (!r)
		return -EEXIST;

	/*
	 * Create an empty btree for the mappings.
	 */
	r = dm_btree_empty(&pmd->bl_info, &dev_root);
	if (r)
		return r;

	/*
	 * Insert it into the main mapping tree.
	 */
	value = cpu_to_le64(dev_root);
	__dm_bless_for_disk(&value);
	r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root);
	if (r) {
		/* Roll back: drop the orphaned mapping tree. */
		dm_btree_del(&pmd->bl_info, dev_root);
		return r;
	}

	r = __open_device(pmd, dev, 1, &td);
	if (r) {
		/* Roll back both the top-level entry and the mapping tree. */
		dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
		dm_btree_del(&pmd->bl_info, dev_root);
		return r;
	}
	__close_device(td);

	return r;
}
1090*4882a593Smuzhiyun 
/* Public entry point: create thin device @dev under the pool write lock. */
int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev)
{
	int r;

	pmd_write_lock(pmd);
	r = pmd->fail_io ? -EINVAL : __create_thin(pmd, dev);
	pmd_write_unlock(pmd);

	return r;
}
1102*4882a593Smuzhiyun 
/*
 * Record that @origin was snapshotted at @time and seed @snap's details
 * from it.  Opening the origin fails if it does not exist.
 */
static int __set_snapshot_details(struct dm_pool_metadata *pmd,
				  struct dm_thin_device *snap,
				  dm_thin_id origin, uint32_t time)
{
	struct dm_thin_device *td;
	int r;

	r = __open_device(pmd, origin, 0, &td);
	if (r)
		return r;

	/* The origin's details changed and must be written at commit. */
	td->snapshotted_time = time;
	td->changed = true;

	/* The snapshot starts out covering the origin's mapped blocks. */
	snap->snapshotted_time = time;
	snap->mapped_blocks = td->mapped_blocks;

	__close_device(td);
	return 0;
}
1123*4882a593Smuzhiyun 
/*
 * Create snapshot @dev of origin @origin by sharing the origin's
 * mapping btree (a single reference-count increment) and bumping
 * pmd->time, which is recorded as both devices' snapshotted_time.
 */
static int __create_snap(struct dm_pool_metadata *pmd,
			 dm_thin_id dev, dm_thin_id origin)
{
	int r;
	dm_block_t origin_root;
	uint64_t key = origin, dev_key = dev;
	struct dm_thin_device *td;
	__le64 value;

	/* check this device is unused */
	r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
			    &dev_key, NULL);
	if (!r)
		return -EEXIST;

	/* find the mapping tree for the origin */
	r = dm_btree_lookup(&pmd->tl_info, pmd->root, &key, &value);
	if (r)
		return r;
	origin_root = le64_to_cpu(value);

	/* clone the origin, an inc will do */
	dm_tm_inc(pmd->tm, origin_root);

	/* insert into the main mapping tree */
	value = cpu_to_le64(origin_root);
	__dm_bless_for_disk(&value);
	key = dev;
	r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root);
	if (r) {
		/* Undo the clone's extra reference. */
		dm_tm_dec(pmd->tm, origin_root);
		return r;
	}

	pmd->time++;

	r = __open_device(pmd, dev, 1, &td);
	if (r)
		goto bad;

	r = __set_snapshot_details(pmd, td, origin, pmd->time);
	__close_device(td);

	if (r)
		goto bad;

	return 0;

bad:
	/*
	 * Roll back; key == dev at this point.  NOTE(review): the details
	 * entry may not have been persisted yet, in which case this remove
	 * returns an ignored -ENODATA — presumably intentional; confirm.
	 */
	dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
	dm_btree_remove(&pmd->details_info, pmd->details_root,
			&key, &pmd->details_root);
	return r;
}
1178*4882a593Smuzhiyun 
/* Public entry point: snapshot @origin as @dev under the pool write lock. */
int dm_pool_create_snap(struct dm_pool_metadata *pmd,
				 dm_thin_id dev,
				 dm_thin_id origin)
{
	int r;

	pmd_write_lock(pmd);
	r = pmd->fail_io ? -EINVAL : __create_snap(pmd, dev, origin);
	pmd_write_unlock(pmd);

	return r;
}
1192*4882a593Smuzhiyun 
/*
 * Delete thin device @dev: remove it from the in-core list and from
 * both the device-details and top-level mapping btrees.  Returns
 * -EBUSY if anyone else still has the device open.
 */
static int __delete_device(struct dm_pool_metadata *pmd, dm_thin_id dev)
{
	int r;
	uint64_t key = dev;
	struct dm_thin_device *td;

	/* TODO: failure should mark the transaction invalid */
	r = __open_device(pmd, dev, 0, &td);
	if (r)
		return r;

	if (td->open_count > 1) {
		/* Someone besides our temporary reference holds it open. */
		__close_device(td);
		return -EBUSY;
	}

	list_del(&td->list);
	kfree(td);
	r = dm_btree_remove(&pmd->details_info, pmd->details_root,
			    &key, &pmd->details_root);
	if (r)
		return r;

	r = dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
	if (r)
		return r;

	return 0;
}
1222*4882a593Smuzhiyun 
/* Public entry point: delete thin device @dev under the pool write lock. */
int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd,
			       dm_thin_id dev)
{
	int r;

	pmd_write_lock(pmd);
	r = pmd->fail_io ? -EINVAL : __delete_device(pmd, dev);
	pmd_write_unlock(pmd);

	return r;
}
1235*4882a593Smuzhiyun 
/*
 * Compare-and-set the userspace-visible transaction id: only move to
 * @new_id if the current id still matches @current_id.
 */
int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd,
					uint64_t current_id,
					uint64_t new_id)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);

	if (!pmd->fail_io) {
		if (pmd->trans_id == current_id) {
			pmd->trans_id = new_id;
			r = 0;
		} else {
			DMERR("mismatched transaction id");
		}
	}

	pmd_write_unlock(pmd);

	return r;
}
1260*4882a593Smuzhiyun 
/* Read the current transaction id into *result under the read lock. */
int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd,
					uint64_t *result)
{
	int r;

	down_read(&pmd->root_lock);
	r = pmd->fail_io ? -EINVAL : 0;
	if (!r)
		*result = pmd->trans_id;
	up_read(&pmd->root_lock);

	return r;
}
1275*4882a593Smuzhiyun 
/*
 * Take a metadata snapshot: commit the current transaction, shadow the
 * superblock, pin the mapping and device-details trees with extra
 * references, and record the shadow's location in the live superblock's
 * held_root.  Only one metadata snapshot may exist at a time (-EBUSY).
 */
static int __reserve_metadata_snap(struct dm_pool_metadata *pmd)
{
	int r, inc;
	struct thin_disk_superblock *disk_super;
	struct dm_block *copy, *sblock;
	dm_block_t held_root;

	/*
	 * We commit to ensure the btree roots which we increment in a
	 * moment are up to date.
	 */
	r = __commit_transaction(pmd);
	if (r < 0) {
		DMWARN("%s: __commit_transaction() failed, error = %d",
		       __func__, r);
		return r;
	}

	/*
	 * Copy the superblock.
	 */
	dm_sm_inc_block(pmd->metadata_sm, THIN_SUPERBLOCK_LOCATION);
	r = dm_tm_shadow_block(pmd->tm, THIN_SUPERBLOCK_LOCATION,
			       &sb_validator, &copy, &inc);
	if (r)
		return r;

	/* The inc above forces the shadow to be a fresh block. */
	BUG_ON(!inc);

	held_root = dm_block_location(copy);
	disk_super = dm_block_data(copy);

	if (le64_to_cpu(disk_super->held_root)) {
		DMWARN("Pool metadata snapshot already exists: release this before taking another.");

		dm_tm_dec(pmd->tm, held_root);
		dm_tm_unlock(pmd->tm, copy);
		return -EBUSY;
	}

	/*
	 * Wipe the spacemap since we're not publishing this.
	 */
	memset(&disk_super->data_space_map_root, 0,
	       sizeof(disk_super->data_space_map_root));
	memset(&disk_super->metadata_space_map_root, 0,
	       sizeof(disk_super->metadata_space_map_root));

	/*
	 * Increment the data structures that need to be preserved.
	 */
	dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->data_mapping_root));
	dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->device_details_root));
	dm_tm_unlock(pmd->tm, copy);

	/*
	 * Write the held root into the superblock.
	 */
	r = superblock_lock(pmd, &sblock);
	if (r) {
		dm_tm_dec(pmd->tm, held_root);
		return r;
	}

	disk_super = dm_block_data(sblock);
	disk_super->held_root = cpu_to_le64(held_root);
	dm_bm_unlock(sblock);
	return 0;
}
1345*4882a593Smuzhiyun 
dm_pool_reserve_metadata_snap(struct dm_pool_metadata * pmd)1346*4882a593Smuzhiyun int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd)
1347*4882a593Smuzhiyun {
1348*4882a593Smuzhiyun 	int r = -EINVAL;
1349*4882a593Smuzhiyun 
1350*4882a593Smuzhiyun 	pmd_write_lock(pmd);
1351*4882a593Smuzhiyun 	if (!pmd->fail_io)
1352*4882a593Smuzhiyun 		r = __reserve_metadata_snap(pmd);
1353*4882a593Smuzhiyun 	pmd_write_unlock(pmd);
1354*4882a593Smuzhiyun 
1355*4882a593Smuzhiyun 	return r;
1356*4882a593Smuzhiyun }
1357*4882a593Smuzhiyun 
/*
 * Drop the metadata snapshot: clear held_root in the live superblock,
 * then delete the btrees the snapshot pinned and release the snapshot's
 * superblock copy.  Returns -EINVAL if no snapshot exists.
 */
static int __release_metadata_snap(struct dm_pool_metadata *pmd)
{
	int r;
	struct thin_disk_superblock *disk_super;
	struct dm_block *sblock, *copy;
	dm_block_t held_root;

	/* Clear the held root in the live superblock first. */
	r = superblock_lock(pmd, &sblock);
	if (r)
		return r;

	disk_super = dm_block_data(sblock);
	held_root = le64_to_cpu(disk_super->held_root);
	disk_super->held_root = cpu_to_le64(0);

	dm_bm_unlock(sblock);

	if (!held_root) {
		DMWARN("No pool metadata snapshot found: nothing to release.");
		return -EINVAL;
	}

	/* Read the snapshot's superblock copy to find the roots it pinned. */
	r = dm_tm_read_lock(pmd->tm, held_root, &sb_validator, &copy);
	if (r)
		return r;

	disk_super = dm_block_data(copy);
	/* Drop the snapshot's references on the mapping and details btrees. */
	dm_btree_del(&pmd->info, le64_to_cpu(disk_super->data_mapping_root));
	dm_btree_del(&pmd->details_info, le64_to_cpu(disk_super->device_details_root));
	/* Finally release the snapshot superblock block itself. */
	dm_sm_dec_block(pmd->metadata_sm, held_root);

	dm_tm_unlock(pmd->tm, copy);

	return 0;
}
1393*4882a593Smuzhiyun 
dm_pool_release_metadata_snap(struct dm_pool_metadata * pmd)1394*4882a593Smuzhiyun int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd)
1395*4882a593Smuzhiyun {
1396*4882a593Smuzhiyun 	int r = -EINVAL;
1397*4882a593Smuzhiyun 
1398*4882a593Smuzhiyun 	pmd_write_lock(pmd);
1399*4882a593Smuzhiyun 	if (!pmd->fail_io)
1400*4882a593Smuzhiyun 		r = __release_metadata_snap(pmd);
1401*4882a593Smuzhiyun 	pmd_write_unlock(pmd);
1402*4882a593Smuzhiyun 
1403*4882a593Smuzhiyun 	return r;
1404*4882a593Smuzhiyun }
1405*4882a593Smuzhiyun 
/*
 * Read the current metadata snapshot root (held_root) out of the
 * on-disk superblock.  *result is zero if no snapshot is held.
 */
static int __get_metadata_snap(struct dm_pool_metadata *pmd,
			       dm_block_t *result)
{
	struct thin_disk_superblock *disk_super;
	struct dm_block *sblock;
	int r;

	r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
			    &sb_validator, &sblock);
	if (r)
		return r;

	disk_super = dm_block_data(sblock);
	*result = le64_to_cpu(disk_super->held_root);
	dm_bm_unlock(sblock);

	return 0;
}
1425*4882a593Smuzhiyun 
/*
 * Fetch the held metadata snapshot root under the read lock, failing
 * with -EINVAL once the pool has failed io.
 */
int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd,
			      dm_block_t *result)
{
	int r;

	down_read(&pmd->root_lock);
	r = pmd->fail_io ? -EINVAL : __get_metadata_snap(pmd, result);
	up_read(&pmd->root_lock);

	return r;
}
1438*4882a593Smuzhiyun 
/*
 * Open thin device @dev, returning it in *td.  Uses the in-core lock
 * variant so opening a device does not itself start a transaction write.
 */
int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev,
			     struct dm_thin_device **td)
{
	int r;

	pmd_write_lock_in_core(pmd);
	r = pmd->fail_io ? -EINVAL : __open_device(pmd, dev, 0, td);
	pmd_write_unlock(pmd);

	return r;
}
1451*4882a593Smuzhiyun 
dm_pool_close_thin_device(struct dm_thin_device * td)1452*4882a593Smuzhiyun int dm_pool_close_thin_device(struct dm_thin_device *td)
1453*4882a593Smuzhiyun {
1454*4882a593Smuzhiyun 	pmd_write_lock_in_core(td->pmd);
1455*4882a593Smuzhiyun 	__close_device(td);
1456*4882a593Smuzhiyun 	pmd_write_unlock(td->pmd);
1457*4882a593Smuzhiyun 
1458*4882a593Smuzhiyun 	return 0;
1459*4882a593Smuzhiyun }
1460*4882a593Smuzhiyun 
/* Return the internal thin dev id of @td. */
dm_thin_id dm_thin_dev_id(struct dm_thin_device *td)
{
	return td->id;
}
1465*4882a593Smuzhiyun 
1466*4882a593Smuzhiyun /*
1467*4882a593Smuzhiyun  * Check whether @time (of block creation) is older than @td's last snapshot.
1468*4882a593Smuzhiyun  * If so then the associated block is shared with the last snapshot device.
1469*4882a593Smuzhiyun  * Any block on a device created *after* the device last got snapshotted is
1470*4882a593Smuzhiyun  * necessarily not shared.
1471*4882a593Smuzhiyun  */
static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time)
{
	/* A block created before the device's last snapshot is shared. */
	return td->snapshotted_time > time;
}
1476*4882a593Smuzhiyun 
/*
 * Split a packed on-disk block_time value into a lookup result,
 * deriving 'shared' from whether the mapping predates the last snapshot.
 */
static void unpack_lookup_result(struct dm_thin_device *td, __le64 value,
				 struct dm_thin_lookup_result *result)
{
	dm_block_t mapped_block;
	uint32_t mapped_time;

	unpack_block_time(le64_to_cpu(value), &mapped_block, &mapped_time);
	result->block = mapped_block;
	result->shared = __snapshotted_since(td, mapped_time);
}
1489*4882a593Smuzhiyun 
/*
 * Look up the mapping for virtual @block of @td.  When io must not be
 * issued, the nb_info btree variant is used instead of info.
 */
static int __find_block(struct dm_thin_device *td, dm_block_t block,
			int can_issue_io, struct dm_thin_lookup_result *result)
{
	struct dm_pool_metadata *pmd = td->pmd;
	struct dm_btree_info *info = can_issue_io ? &pmd->info : &pmd->nb_info;
	dm_block_t keys[2] = { td->id, block };
	__le64 value;
	int r;

	r = dm_btree_lookup(info, pmd->root, keys, &value);
	if (!r)
		unpack_lookup_result(td, value, result);

	return r;
}
1510*4882a593Smuzhiyun 
/*
 * Locked wrapper around __find_block().  Fails with -EINVAL once the
 * pool has failed io.
 */
int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
		       int can_issue_io, struct dm_thin_lookup_result *result)
{
	int r = -EINVAL;
	struct dm_pool_metadata *pmd = td->pmd;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = __find_block(td, block, can_issue_io, result);
	up_read(&pmd->root_lock);

	return r;
}
1528*4882a593Smuzhiyun 
/*
 * Find the first mapped virtual block at or after @block, returning its
 * virtual address in *vblock and its mapping in *result.
 */
static int __find_next_mapped_block(struct dm_thin_device *td, dm_block_t block,
				    dm_block_t *vblock,
				    struct dm_thin_lookup_result *result)
{
	struct dm_pool_metadata *pmd = td->pmd;
	dm_block_t keys[2] = { td->id, block };
	__le64 value;
	int r;

	r = dm_btree_lookup_next(&pmd->info, pmd->root, keys, vblock, &value);
	if (!r)
		unpack_lookup_result(td, value, result);

	return r;
}
1544*4882a593Smuzhiyun 
/*
 * Find the first run of contiguously mapped blocks within [begin, end)
 * whose data blocks are also physically contiguous and share the same
 * 'shared' status.  Returns -ENODATA if nothing is mapped in the range.
 */
static int __find_mapped_range(struct dm_thin_device *td,
			       dm_block_t begin, dm_block_t end,
			       dm_block_t *thin_begin, dm_block_t *thin_end,
			       dm_block_t *pool_begin, bool *maybe_shared)
{
	int r;
	dm_block_t pool_end;
	struct dm_thin_lookup_result lookup;

	if (end < begin)
		return -ENODATA;

	/* Skip forward to the first mapped block at or after 'begin'. */
	r = __find_next_mapped_block(td, begin, &begin, &lookup);
	if (r)
		return r;

	if (begin >= end)
		return -ENODATA;

	*thin_begin = begin;
	*pool_begin = lookup.block;
	*maybe_shared = lookup.shared;

	/*
	 * Extend the run while successive virtual blocks map to successive
	 * data blocks with an unchanged sharing status.
	 */
	begin++;
	pool_end = *pool_begin + 1;
	while (begin != end) {
		r = __find_block(td, begin, true, &lookup);
		if (r) {
			if (r == -ENODATA)
				break;	/* hole ends the run */
			else
				return r;
		}

		if ((lookup.block != pool_end) ||
		    (lookup.shared != *maybe_shared))
			break;

		pool_end++;
		begin++;
	}

	*thin_end = begin;
	return 0;
}
1590*4882a593Smuzhiyun 
/*
 * Locked wrapper around __find_mapped_range().  Fails with -EINVAL once
 * the pool has failed io.
 */
int dm_thin_find_mapped_range(struct dm_thin_device *td,
			      dm_block_t begin, dm_block_t end,
			      dm_block_t *thin_begin, dm_block_t *thin_end,
			      dm_block_t *pool_begin, bool *maybe_shared)
{
	struct dm_pool_metadata *pmd = td->pmd;
	int r = -EINVAL;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = __find_mapped_range(td, begin, end, thin_begin, thin_end,
					pool_begin, maybe_shared);
	up_read(&pmd->root_lock);

	return r;
}
1608*4882a593Smuzhiyun 
/*
 * Map virtual @block of @td to @data_block, stamping the mapping with
 * the pool's current time.  Bumps mapped_blocks only for brand new
 * mappings (not overwrites).
 */
static int __insert(struct dm_thin_device *td, dm_block_t block,
		    dm_block_t data_block)
{
	int r;
	int new_mapping;
	__le64 packed;
	struct dm_pool_metadata *pmd = td->pmd;
	dm_block_t keys[2] = { td->id, block };

	packed = cpu_to_le64(pack_block_time(data_block, pmd->time));
	__dm_bless_for_disk(&packed);

	r = dm_btree_insert_notify(&pmd->info, pmd->root, keys, &packed,
				   &pmd->root, &new_mapping);
	if (r)
		return r;

	td->changed = true;
	if (new_mapping)
		td->mapped_blocks++;

	return 0;
}
1631*4882a593Smuzhiyun 
/*
 * Insert a mapping under the pool write lock, failing with -EINVAL
 * once the pool has failed io.
 */
int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block,
			 dm_block_t data_block)
{
	struct dm_pool_metadata *pmd = td->pmd;
	int r;

	pmd_write_lock(pmd);
	r = pmd->fail_io ? -EINVAL : __insert(td, block, data_block);
	pmd_write_unlock(pmd);

	return r;
}
1644*4882a593Smuzhiyun 
/* Unmap a single virtual block of @td and account for it. */
static int __remove(struct dm_thin_device *td, dm_block_t block)
{
	struct dm_pool_metadata *pmd = td->pmd;
	dm_block_t keys[2] = { td->id, block };
	int r;

	r = dm_btree_remove(&pmd->info, pmd->root, keys, &pmd->root);
	if (r)
		return r;

	td->changed = true;
	td->mapped_blocks--;

	return 0;
}
1660*4882a593Smuzhiyun 
/*
 * Unmap all blocks of @td in [begin, end).  Rather than removing entries
 * one at a time, the device's bottom-level mapping tree is detached from
 * the top level, pruned with dm_btree_remove_leaves(), and reinserted.
 */
static int __remove_range(struct dm_thin_device *td, dm_block_t begin, dm_block_t end)
{
	int r;
	unsigned count, total_count = 0;
	struct dm_pool_metadata *pmd = td->pmd;
	dm_block_t keys[1] = { td->id };
	__le64 value;
	dm_block_t mapping_root;

	/*
	 * Find the mapping tree
	 */
	r = dm_btree_lookup(&pmd->tl_info, pmd->root, keys, &value);
	if (r)
		return r;

	/*
	 * Remove from the mapping tree, taking care to inc the
	 * ref count so it doesn't get deleted.
	 */
	mapping_root = le64_to_cpu(value);
	dm_tm_inc(pmd->tm, mapping_root);
	r = dm_btree_remove(&pmd->tl_info, pmd->root, keys, &pmd->root);
	if (r)
		return r;

	/*
	 * Remove leaves stops at the first unmapped entry, so we have to
	 * loop round finding mapped ranges.
	 */
	while (begin < end) {
		/* Skip holes: advance 'begin' to the next mapped block. */
		r = dm_btree_lookup_next(&pmd->bl_info, mapping_root, &begin, &begin, &value);
		if (r == -ENODATA)
			break;

		if (r)
			return r;

		if (begin >= end)
			break;

		/* Prune the leaves covering [begin, end); 'begin' is advanced. */
		r = dm_btree_remove_leaves(&pmd->bl_info, mapping_root, &begin, end, &mapping_root, &count);
		if (r)
			return r;

		total_count += count;
	}

	td->mapped_blocks -= total_count;
	td->changed = true;

	/*
	 * Reinsert the mapping tree.
	 */
	value = cpu_to_le64(mapping_root);
	__dm_bless_for_disk(&value);
	return dm_btree_insert(&pmd->tl_info, pmd->root, keys, &value, &pmd->root);
}
1719*4882a593Smuzhiyun 
/*
 * Remove one mapping under the pool write lock, failing with -EINVAL
 * once the pool has failed io.
 */
int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block)
{
	struct dm_pool_metadata *pmd = td->pmd;
	int r;

	pmd_write_lock(pmd);
	r = pmd->fail_io ? -EINVAL : __remove(td, block);
	pmd_write_unlock(pmd);

	return r;
}
1731*4882a593Smuzhiyun 
/*
 * Remove a range of mappings under the pool write lock, failing with
 * -EINVAL once the pool has failed io.
 */
int dm_thin_remove_range(struct dm_thin_device *td,
			 dm_block_t begin, dm_block_t end)
{
	struct dm_pool_metadata *pmd = td->pmd;
	int r;

	pmd_write_lock(pmd);
	r = pmd->fail_io ? -EINVAL : __remove_range(td, begin, end);
	pmd_write_unlock(pmd);

	return r;
}
1744*4882a593Smuzhiyun 
/*
 * Check whether data block @b is referenced by more than one mapping.
 *
 * Fix: guard on pmd->fail_io like every other accessor here — after a
 * failed dm_pool_abort_metadata() the persistent-data objects (including
 * data_sm) have been destroyed, so touching data_sm would be unsafe.
 * Returns -EINVAL in that case.
 */
int dm_pool_block_is_shared(struct dm_pool_metadata *pmd, dm_block_t b, bool *result)
{
	int r = -EINVAL;
	uint32_t ref_count;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io) {
		r = dm_sm_get_count(pmd->data_sm, b, &ref_count);
		if (!r)
			*result = (ref_count > 1);
	}
	up_read(&pmd->root_lock);

	return r;
}
1758*4882a593Smuzhiyun 
/*
 * Increment the reference count of every data block in [b, e).
 *
 * Fix: bail out with -EINVAL when pmd->fail_io is set, matching the
 * other metadata entry points — data_sm may have been torn down by a
 * failed abort and must not be dereferenced.
 */
int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io) {
		r = 0;
		for (; b != e; b++) {
			r = dm_sm_inc_block(pmd->data_sm, b);
			if (r)
				break;
		}
	}
	pmd_write_unlock(pmd);

	return r;
}
1773*4882a593Smuzhiyun 
/*
 * Decrement the reference count of every data block in [b, e).
 *
 * Fix: bail out with -EINVAL when pmd->fail_io is set, matching the
 * other metadata entry points — data_sm may have been torn down by a
 * failed abort and must not be dereferenced.
 */
int dm_pool_dec_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io) {
		r = 0;
		for (; b != e; b++) {
			r = dm_sm_dec_block(pmd->data_sm, b);
			if (r)
				break;
		}
	}
	pmd_write_unlock(pmd);

	return r;
}
1788*4882a593Smuzhiyun 
dm_thin_changed_this_transaction(struct dm_thin_device * td)1789*4882a593Smuzhiyun bool dm_thin_changed_this_transaction(struct dm_thin_device *td)
1790*4882a593Smuzhiyun {
1791*4882a593Smuzhiyun 	int r;
1792*4882a593Smuzhiyun 
1793*4882a593Smuzhiyun 	down_read(&td->pmd->root_lock);
1794*4882a593Smuzhiyun 	r = td->changed;
1795*4882a593Smuzhiyun 	up_read(&td->pmd->root_lock);
1796*4882a593Smuzhiyun 
1797*4882a593Smuzhiyun 	return r;
1798*4882a593Smuzhiyun }
1799*4882a593Smuzhiyun 
dm_pool_changed_this_transaction(struct dm_pool_metadata * pmd)1800*4882a593Smuzhiyun bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd)
1801*4882a593Smuzhiyun {
1802*4882a593Smuzhiyun 	bool r = false;
1803*4882a593Smuzhiyun 	struct dm_thin_device *td, *tmp;
1804*4882a593Smuzhiyun 
1805*4882a593Smuzhiyun 	down_read(&pmd->root_lock);
1806*4882a593Smuzhiyun 	list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
1807*4882a593Smuzhiyun 		if (td->changed) {
1808*4882a593Smuzhiyun 			r = td->changed;
1809*4882a593Smuzhiyun 			break;
1810*4882a593Smuzhiyun 		}
1811*4882a593Smuzhiyun 	}
1812*4882a593Smuzhiyun 	up_read(&pmd->root_lock);
1813*4882a593Smuzhiyun 
1814*4882a593Smuzhiyun 	return r;
1815*4882a593Smuzhiyun }
1816*4882a593Smuzhiyun 
dm_thin_aborted_changes(struct dm_thin_device * td)1817*4882a593Smuzhiyun bool dm_thin_aborted_changes(struct dm_thin_device *td)
1818*4882a593Smuzhiyun {
1819*4882a593Smuzhiyun 	bool r;
1820*4882a593Smuzhiyun 
1821*4882a593Smuzhiyun 	down_read(&td->pmd->root_lock);
1822*4882a593Smuzhiyun 	r = td->aborted_with_changes;
1823*4882a593Smuzhiyun 	up_read(&td->pmd->root_lock);
1824*4882a593Smuzhiyun 
1825*4882a593Smuzhiyun 	return r;
1826*4882a593Smuzhiyun }
1827*4882a593Smuzhiyun 
/*
 * Allocate a fresh data block, failing with -EINVAL once the pool has
 * failed io.
 */
int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result)
{
	int r;

	pmd_write_lock(pmd);
	r = pmd->fail_io ? -EINVAL : dm_sm_new_block(pmd->data_sm, result);
	pmd_write_unlock(pmd);

	return r;
}
1839*4882a593Smuzhiyun 
/*
 * Commit all outstanding metadata changes and then open the next
 * transaction.  Returns 0 on success, -EINVAL once the pool has
 * failed io, or the error from commit/begin.
 */
int dm_pool_commit_metadata(struct dm_pool_metadata *pmd)
{
	int r = -EINVAL;

	/*
	 * Care is taken to not have commit be what
	 * triggers putting the thin-pool in-service.
	 */
	pmd_write_lock_in_core(pmd);
	if (pmd->fail_io)
		goto out;

	r = __commit_transaction(pmd);
	if (r < 0)
		goto out;

	/*
	 * Open the next transaction.
	 */
	r = __begin_transaction(pmd);
out:
	pmd_write_unlock(pmd);
	return r;
}
1864*4882a593Smuzhiyun 
/*
 * Record, per thin device, whether it had uncommitted changes at the
 * moment of an abort.  Called with the pmd write lock held (see
 * dm_pool_abort_metadata()).
 */
static void __set_abort_with_changes_flags(struct dm_pool_metadata *pmd)
{
	struct dm_thin_device *td;

	list_for_each_entry(td, &pmd->thin_devices, list)
		td->aborted_with_changes = td->changed;
}
1872*4882a593Smuzhiyun 
/*
 * Throw away all uncommitted metadata changes by tearing down the
 * persistent-data objects and recreating them from the last committed
 * state.  If recreation fails the pool is flagged fail_io.
 */
int dm_pool_abort_metadata(struct dm_pool_metadata *pmd)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (pmd->fail_io)
		goto out;

	/* Remember which devices are losing changes before we tear down. */
	__set_abort_with_changes_flags(pmd);
	__destroy_persistent_data_objects(pmd);
	r = __create_persistent_data_objects(pmd, false);
	if (r)
		pmd->fail_io = true;

out:
	pmd_write_unlock(pmd);

	return r;
}
1892*4882a593Smuzhiyun 
/*
 * Report the number of free data blocks, failing with -EINVAL once the
 * pool has failed io.
 */
int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, dm_block_t *result)
{
	int r;

	down_read(&pmd->root_lock);
	r = pmd->fail_io ? -EINVAL : dm_sm_get_nr_free(pmd->data_sm, result);
	up_read(&pmd->root_lock);

	return r;
}
1904*4882a593Smuzhiyun 
/*
 * Report the number of free metadata blocks, minus the blocks held back
 * in pmd->metadata_reserve (clamped at zero so callers never see the
 * reserve as available).
 */
int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd,
					  dm_block_t *result)
{
	int r = -EINVAL;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = dm_sm_get_nr_free(pmd->metadata_sm, result);

	/* Hide the reserved blocks from the caller. */
	if (!r) {
		if (*result < pmd->metadata_reserve)
			*result = 0;
		else
			*result -= pmd->metadata_reserve;
	}
	up_read(&pmd->root_lock);

	return r;
}
1924*4882a593Smuzhiyun 
/*
 * Report the total size of the metadata device in blocks, failing with
 * -EINVAL once the pool has failed io.
 */
int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd,
				  dm_block_t *result)
{
	int r;

	down_read(&pmd->root_lock);
	r = pmd->fail_io ? -EINVAL : dm_sm_get_nr_blocks(pmd->metadata_sm, result);
	up_read(&pmd->root_lock);

	return r;
}
1937*4882a593Smuzhiyun 
/*
 * Report the total size of the data device in blocks, failing with
 * -EINVAL once the pool has failed io.
 */
int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result)
{
	int r;

	down_read(&pmd->root_lock);
	r = pmd->fail_io ? -EINVAL : dm_sm_get_nr_blocks(pmd->data_sm, result);
	up_read(&pmd->root_lock);

	return r;
}
1949*4882a593Smuzhiyun 
/*
 * Report the number of mapped blocks of @td, failing with -EINVAL once
 * the pool has failed io.
 */
int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result)
{
	struct dm_pool_metadata *pmd = td->pmd;
	int r = 0;

	down_read(&pmd->root_lock);
	if (pmd->fail_io)
		r = -EINVAL;
	else
		*result = td->mapped_blocks;
	up_read(&pmd->root_lock);

	return r;
}
1964*4882a593Smuzhiyun 
/*
 * Find the highest mapped virtual block of @td by walking to its
 * bottom-level mapping tree and asking for its highest key.
 */
static int __highest_block(struct dm_thin_device *td, dm_block_t *result)
{
	struct dm_pool_metadata *pmd = td->pmd;
	dm_block_t mapping_root;
	__le64 root_le;
	int r;

	r = dm_btree_lookup(&pmd->tl_info, pmd->root, &td->id, &root_le);
	if (r)
		return r;

	mapping_root = le64_to_cpu(root_le);

	return dm_btree_find_highest_key(&pmd->bl_info, mapping_root, result);
}
1980*4882a593Smuzhiyun 
/*
 * Locked wrapper around __highest_block(), failing with -EINVAL once
 * the pool has failed io.
 */
int dm_thin_get_highest_mapped_block(struct dm_thin_device *td,
				     dm_block_t *result)
{
	struct dm_pool_metadata *pmd = td->pmd;
	int r;

	down_read(&pmd->root_lock);
	r = pmd->fail_io ? -EINVAL : __highest_block(td, result);
	up_read(&pmd->root_lock);

	return r;
}
1994*4882a593Smuzhiyun 
/*
 * Grow a space map to new_count blocks.  Shrinking is not supported;
 * a no-op resize returns success.  Caller must hold the pmd write lock.
 */
static int __resize_space_map(struct dm_space_map *sm, dm_block_t new_count)
{
	dm_block_t current_count;
	int ret;

	ret = dm_sm_get_nr_blocks(sm, &current_count);
	if (ret)
		return ret;

	/* Space maps can only ever grow. */
	if (new_count < current_count) {
		DMERR("cannot reduce size of space map");
		return -EINVAL;
	}

	if (new_count == current_count)
		return 0;

	return dm_sm_extend(sm, new_count - current_count);
}
2014*4882a593Smuzhiyun 
/*
 * Resize (grow) the data device's space map under the pmd write lock.
 */
int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
{
	int ret;

	pmd_write_lock(pmd);
	ret = pmd->fail_io ? -EINVAL
			   : __resize_space_map(pmd->data_sm, new_count);
	pmd_write_unlock(pmd);

	return ret;
}
2026*4882a593Smuzhiyun 
/*
 * Resize (grow) the metadata device's space map, then recompute the
 * block reserve kept back for metadata operations.
 */
int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
{
	int ret = -EINVAL;

	pmd_write_lock(pmd);
	if (pmd->fail_io)
		goto out;

	ret = __resize_space_map(pmd->metadata_sm, new_count);
	if (!ret)
		/* The reserve scales with the metadata device size. */
		__set_metadata_reserve(pmd);
out:
	pmd_write_unlock(pmd);

	return ret;
}
2041*4882a593Smuzhiyun 
/*
 * Flip the underlying block manager into read-only mode, e.g. after a
 * metadata failure, so no further writes reach the metadata device.
 * Uses the in-core-only write lock since no on-disk state changes here.
 */
void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
{
	pmd_write_lock_in_core(pmd);
	dm_bm_set_read_only(pmd->bm);
	pmd_write_unlock(pmd);
}
2048*4882a593Smuzhiyun 
/*
 * Re-enable writes on the underlying block manager (inverse of
 * dm_pool_metadata_read_only()).
 */
void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd)
{
	pmd_write_lock_in_core(pmd);
	dm_bm_set_read_write(pmd->bm);
	pmd_write_unlock(pmd);
}
2055*4882a593Smuzhiyun 
/*
 * Arrange for fn(context) to be invoked when free space in the metadata
 * space map drops to the given threshold.  Returns -EINVAL if the
 * metadata is in fail_io mode, otherwise the space map's result.
 */
int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
					dm_block_t threshold,
					dm_sm_threshold_fn fn,
					void *context)
{
	int ret = -EINVAL;

	pmd_write_lock_in_core(pmd);
	if (!pmd->fail_io)
		ret = dm_sm_register_threshold_callback(pmd->metadata_sm,
							threshold, fn, context);
	pmd_write_unlock(pmd);

	return ret;
}
2072*4882a593Smuzhiyun 
/*
 * Register a hook that will be called immediately before each metadata
 * commit.  Both fields are updated together under the in-core write
 * lock so a concurrent commit sees a consistent fn/context pair.
 */
void dm_pool_register_pre_commit_callback(struct dm_pool_metadata *pmd,
					  dm_pool_pre_commit_fn fn,
					  void *context)
{
	pmd_write_lock_in_core(pmd);
	pmd->pre_commit_fn = fn;
	pmd->pre_commit_context = context;
	pmd_write_unlock(pmd);
}
2082*4882a593Smuzhiyun 
/*
 * Persistently mark the pool metadata as needing an offline check
 * (thin_check).  Sets the flag in core first, then writes the updated
 * flags into the on-disk superblock.  Returns 0 on success, -EINVAL in
 * fail_io mode, or the error from locking the superblock.
 */
int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
{
	int r = -EINVAL;
	struct dm_block *sblock;
	struct thin_disk_superblock *disk_super;

	pmd_write_lock(pmd);
	if (pmd->fail_io)
		goto out;

	/* Update the in-core flag before persisting it below. */
	pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG;

	r = superblock_lock(pmd, &sblock);
	if (r) {
		DMERR("couldn't lock superblock");
		goto out;
	}

	/* Write the full flags word back to the on-disk superblock. */
	disk_super = dm_block_data(sblock);
	disk_super->flags = cpu_to_le32(pmd->flags);

	dm_bm_unlock(sblock);
out:
	pmd_write_unlock(pmd);
	return r;
}
2109*4882a593Smuzhiyun 
dm_pool_metadata_needs_check(struct dm_pool_metadata * pmd)2110*4882a593Smuzhiyun bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd)
2111*4882a593Smuzhiyun {
2112*4882a593Smuzhiyun 	bool needs_check;
2113*4882a593Smuzhiyun 
2114*4882a593Smuzhiyun 	down_read(&pmd->root_lock);
2115*4882a593Smuzhiyun 	needs_check = pmd->flags & THIN_METADATA_NEEDS_CHECK_FLAG;
2116*4882a593Smuzhiyun 	up_read(&pmd->root_lock);
2117*4882a593Smuzhiyun 
2118*4882a593Smuzhiyun 	return needs_check;
2119*4882a593Smuzhiyun }
2120*4882a593Smuzhiyun 
/*
 * Kick off any metadata block prefetches queued on the transaction
 * manager.  Skipped silently when the metadata is in fail_io mode.
 */
void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd)
{
	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		dm_tm_issue_prefetches(pmd->tm);
	up_read(&pmd->root_lock);
}
2128