xref: /OK3568_Linux_fs/kernel/drivers/block/rbd.c (revision 4882a59341e53eb6f0b4789bf948001014eff981)
/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

                 Documentation/ABI/testing/sysfs-bus-rbd

 */

#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/cls_lock_client.h>
#include <linux/ceph/striper.h>
#include <linux/ceph/decode.h>
#include <linux/fs_parser.h>
#include <linux/bsearch.h>

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/workqueue.h>

#include "rbd_types.h"

#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * Increment the given counter and return its updated value.
 * If the counter is already 0 it will not be incremented.
 * If the counter is already at its maximum value, -EINVAL is
 * returned without updating it.
 */
static int atomic_inc_return_safe(atomic_t *v)
{
	unsigned int counter;

	counter = (unsigned int)atomic_fetch_add_unless(v, 1, 0);
	if (counter <= (unsigned int)INT_MAX)
		return (int)counter;

	atomic_dec(v);

	return -EINVAL;
}

/* Decrement the counter.  Return the resulting value, or -EINVAL */
static int atomic_dec_return_safe(atomic_t *v)
{
	int counter;

	counter = atomic_dec_return(v);
	if (counter >= 0)
		return counter;

	atomic_inc(v);

	return -EINVAL;
}

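/*
 * Usage sketch (illustrative, not a verbatim excerpt): together these
 * helpers turn an atomic_t into a "sticky at zero" refcount, the sort of
 * counter used for rbd_dev->parent_ref below.  A hypothetical get/put
 * pair would look like:
 *
 *	if (atomic_inc_return_safe(&rbd_dev->parent_ref) > 0)
 *		use_parent(rbd_dev);		// ref taken, hypothetical
 *	...
 *	if (atomic_dec_return_safe(&rbd_dev->parent_ref) == 0)
 *		teardown_parent(rbd_dev);	// hypothetical helper
 *
 * Once the counter has hit 0, atomic_inc_return_safe() refuses to revive
 * it, so a late reader cannot resurrect an object already being torn down.
 */
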
#define RBD_DRV_NAME "rbd"

#define RBD_MINORS_PER_MAJOR		256
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

#define RBD_MAX_PARENT_CHAIN_LEN	16

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
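/*
 * Worked example (assuming the usual NAME_MAX of 255): sizeof("snap_")
 * is 6 including the NUL, so the prefix costs 5 visible bytes and
 * RBD_MAX_SNAP_NAME_LEN comes out to 255 - 5 = 250.  The prefix plus the
 * snapshot name must still fit in a NAME_MAX-sized device name.
 */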
#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

#define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

#define RBD_NOTIFY_TIMEOUT	5	/* seconds */
#define RBD_RETRY_DELAY		msecs_to_jiffies(1000)

/* Feature bits */

#define RBD_FEATURE_LAYERING		(1ULL<<0)
#define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
#define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
#define RBD_FEATURE_OBJECT_MAP		(1ULL<<3)
#define RBD_FEATURE_FAST_DIFF		(1ULL<<4)
#define RBD_FEATURE_DEEP_FLATTEN	(1ULL<<5)
#define RBD_FEATURE_DATA_POOL		(1ULL<<7)
#define RBD_FEATURE_OPERATIONS		(1ULL<<8)

#define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
				 RBD_FEATURE_STRIPINGV2 |	\
				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
				 RBD_FEATURE_OBJECT_MAP |	\
				 RBD_FEATURE_FAST_DIFF |	\
				 RBD_FEATURE_DEEP_FLATTEN |	\
				 RBD_FEATURE_DATA_POOL |	\
				 RBD_FEATURE_OPERATIONS)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)
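/*
 * Illustrative feature-bit check (a sketch, not code from this driver):
 * an image's enabled features arrive from the cluster as a u64 bitmask,
 * so a prospective mapping can be vetted along these lines:
 *
 *	u64 unsupported = features & ~RBD_FEATURES_SUPPORTED;
 *
 *	if (unsupported)
 *		return -ENXIO;		// image needs features we lack
 *	if (features & RBD_FEATURE_LAYERING)
 *		probe_parent();		// hypothetical helper
 *
 * Note the gap at bit 6: that bit is assigned to journaling by userspace
 * librbd, which this kernel client does not implement.
 */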
/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 */
#define DEV_NAME_LEN		32

/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These six fields never change for a given rbd image */
	char *object_prefix;
	__u8 obj_order;
	u64 stripe_unit;
	u64 stripe_count;
	s64 data_pool_id;
	u64 features;		/* Might be changeable someday? */

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;	/* format 1 only */
	u64 *snap_sizes;	/* format 1 only */
};
/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the ids in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the ids associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64		pool_id;
	const char	*pool_name;
	const char	*pool_ns;	/* NULL if default, never "" */

	const char	*image_id;
	const char	*image_name;

	u64		snap_id;
	const char	*snap_name;

	struct kref	kref;
};

/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;
	struct list_head	node;
};

struct pending_result {
	int			result;		/* first nonzero result */
	int			num_pending;
};

struct rbd_img_request;

enum obj_request_type {
	OBJ_REQUEST_NODATA = 1,
	OBJ_REQUEST_BIO,	/* pointer into provided bio (list) */
	OBJ_REQUEST_BVECS,	/* pointer into provided bio_vec array */
	OBJ_REQUEST_OWN_BVECS,	/* private bio_vec array, doesn't own pages */
};

enum obj_operation_type {
	OBJ_OP_READ = 1,
	OBJ_OP_WRITE,
	OBJ_OP_DISCARD,
	OBJ_OP_ZEROOUT,
};

#define RBD_OBJ_FLAG_DELETION			(1U << 0)
#define RBD_OBJ_FLAG_COPYUP_ENABLED		(1U << 1)
#define RBD_OBJ_FLAG_COPYUP_ZEROS		(1U << 2)
#define RBD_OBJ_FLAG_MAY_EXIST			(1U << 3)
#define RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT	(1U << 4)

enum rbd_obj_read_state {
	RBD_OBJ_READ_START = 1,
	RBD_OBJ_READ_OBJECT,
	RBD_OBJ_READ_PARENT,
};

/*
 * Writes go through the following state machine to deal with
 * layering:
 *
 *            . . . . . RBD_OBJ_WRITE_GUARD. . . . . . . . . . . . . .
 *            .                 |                                    .
 *            .                 v                                    .
 *            .    RBD_OBJ_WRITE_READ_FROM_PARENT. . .               .
 *            .                 |                    .               .
 *            .                 v                    v (deep-copyup  .
 *    (image  .   RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC   .  not needed)  .
 * flattened) v                 |                    .               .
 *            .                 v                    .               .
 *            . . . .RBD_OBJ_WRITE_COPYUP_OPS. . . . .      (copyup  .
 *                              |                        not needed) v
 *                              v                                    .
 *                            done . . . . . . . . . . . . . . . . . .
 *                              ^
 *                              |
 *                     RBD_OBJ_WRITE_FLAT
 *
 * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
 * assert_exists guard is needed or not (in some cases it's not needed
 * even if there is a parent).
 */
enum rbd_obj_write_state {
	RBD_OBJ_WRITE_START = 1,
	RBD_OBJ_WRITE_PRE_OBJECT_MAP,
	RBD_OBJ_WRITE_OBJECT,
	__RBD_OBJ_WRITE_COPYUP,
	RBD_OBJ_WRITE_COPYUP,
	RBD_OBJ_WRITE_POST_OBJECT_MAP,
};

enum rbd_obj_copyup_state {
	RBD_OBJ_COPYUP_START = 1,
	RBD_OBJ_COPYUP_READ_PARENT,
	__RBD_OBJ_COPYUP_OBJECT_MAPS,
	RBD_OBJ_COPYUP_OBJECT_MAPS,
	__RBD_OBJ_COPYUP_WRITE_OBJECT,
	RBD_OBJ_COPYUP_WRITE_OBJECT,
};

struct rbd_obj_request {
	struct ceph_object_extent ex;
	unsigned int		flags;	/* RBD_OBJ_FLAG_* */
	union {
		enum rbd_obj_read_state	 read_state;	/* for reads */
		enum rbd_obj_write_state write_state;	/* for writes */
	};

	struct rbd_img_request	*img_request;
	struct ceph_file_extent	*img_extents;
	u32			num_img_extents;

	union {
		struct ceph_bio_iter	bio_pos;
		struct {
			struct ceph_bvec_iter	bvec_pos;
			u32			bvec_count;
			u32			bvec_idx;
		};
	};

	enum rbd_obj_copyup_state copyup_state;
	struct bio_vec		*copyup_bvecs;
	u32			copyup_bvec_count;

	struct list_head	osd_reqs;	/* w/ r_private_item */

	struct mutex		state_mutex;
	struct pending_result	pending;
	struct kref		kref;
};

enum img_req_flags {
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};

enum rbd_img_state {
	RBD_IMG_START = 1,
	RBD_IMG_EXCLUSIVE_LOCK,
	__RBD_IMG_OBJECT_REQUESTS,
	RBD_IMG_OBJECT_REQUESTS,
};

struct rbd_img_request {
	struct rbd_device	*rbd_dev;
	enum obj_operation_type	op_type;
	enum obj_request_type	data_type;
	unsigned long		flags;
	enum rbd_img_state	state;
	union {
		u64			snap_id;	/* for reads */
		struct ceph_snap_context *snapc;	/* for writes */
	};
	struct rbd_obj_request	*obj_request;	/* obj req initiator */

	struct list_head	lock_item;
	struct list_head	object_extents;	/* obj_req.ex structs */

	struct mutex		state_mutex;
	struct pending_result	pending;
	struct work_struct	work;
	int			work_result;
};

#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item)

enum rbd_watch_state {
	RBD_WATCH_STATE_UNREGISTERED,
	RBD_WATCH_STATE_REGISTERED,
	RBD_WATCH_STATE_ERROR,
};

enum rbd_lock_state {
	RBD_LOCK_STATE_UNLOCKED,
	RBD_LOCK_STATE_LOCKED,
	RBD_LOCK_STATE_RELEASING,
};

/* WatchNotify::ClientId */
struct rbd_client_id {
	u64 gid;
	u64 handle;
};

struct rbd_mapping {
	u64                     size;
};

/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	int			minor;
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;
	struct rbd_options	*opts;
	char			*config_info;	/* add{,_single_major} string */

	struct ceph_object_id	header_oid;
	struct ceph_object_locator header_oloc;

	struct ceph_file_layout	layout;		/* used for all rbd requests */

	struct mutex		watch_mutex;
	enum rbd_watch_state	watch_state;
	struct ceph_osd_linger_request *watch_handle;
	u64			watch_cookie;
	struct delayed_work	watch_dwork;

	struct rw_semaphore	lock_rwsem;
	enum rbd_lock_state	lock_state;
	char			lock_cookie[32];
	struct rbd_client_id	owner_cid;
	struct work_struct	acquired_lock_work;
	struct work_struct	released_lock_work;
	struct delayed_work	lock_dwork;
	struct work_struct	unlock_work;
	spinlock_t		lock_lists_lock;
	struct list_head	acquiring_list;
	struct list_head	running_list;
	struct completion	acquire_wait;
	int			acquire_err;
	struct completion	releasing_wait;

	spinlock_t		object_map_lock;
	u8			*object_map;
	u64			object_map_size;	/* in objects */
	u64			object_map_flags;

	struct workqueue_struct	*task_wq;

	struct rbd_spec		*parent_spec;
	u64			parent_overlap;
	atomic_t		parent_ref;
	struct rbd_device	*parent;

	/* Block layer tags. */
	struct blk_mq_tag_set	tag_set;

	/* protects updating the header */
	struct rw_semaphore     header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};

/*
 * Flag bits for rbd_dev->flags:
 * - REMOVING (which is coupled with rbd_dev->open_count) is protected
 *   by rbd_dev->lock
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* rbd_dev_device_setup() ran */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
	RBD_DEV_FLAG_READONLY,  /* -o ro or snapshot */
};

static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Slab caches for frequently-allocated structures */

static struct kmem_cache	*rbd_img_request_cache;
static struct kmem_cache	*rbd_obj_request_cache;

static int rbd_major;
static DEFINE_IDA(rbd_dev_id_ida);

static struct workqueue_struct *rbd_wq;

static struct ceph_snap_context rbd_empty_snapc = {
	.nref = REFCOUNT_INIT(1),
};

/*
 * single-major requires version >= 0.75 of the userspace rbd utility.
 */
static bool single_major = true;
module_param(single_major, bool, 0444);
MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");
static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count);
static ssize_t remove_store(struct bus_type *bus, const char *buf,
			    size_t count);
static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
				      size_t count);
static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
					 size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);

static int rbd_dev_id_to_minor(int dev_id)
{
	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
}

static int minor_to_rbd_dev_id(int minor)
{
	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
}
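
/*
 * Worked example: with RBD_SINGLE_MAJOR_PART_SHIFT of 4, each device owns
 * a block of 2^4 = 16 minors, so dev_id 3 maps to minor 48, and minors
 * 48..63 cover rbd3 plus up to 15 partitions (rbd3p1..rbd3p15).  The
 * reverse mapping simply shifts the partition bits back out:
 *
 *	minor_to_rbd_dev_id(rbd_dev_id_to_minor(3) + 7) == 3
 */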
static bool rbd_is_ro(struct rbd_device *rbd_dev)
{
	return test_bit(RBD_DEV_FLAG_READONLY, &rbd_dev->flags);
}

static bool rbd_is_snap(struct rbd_device *rbd_dev)
{
	return rbd_dev->spec->snap_id != CEPH_NOSNAP;
}

static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	lockdep_assert_held(&rbd_dev->lock_rwsem);

	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
}

static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	bool is_lock_owner;

	down_read(&rbd_dev->lock_rwsem);
	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
	up_read(&rbd_dev->lock_rwsem);
	return is_lock_owner;
}

static ssize_t supported_features_show(struct bus_type *bus, char *buf)
{
	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
}

static BUS_ATTR_WO(add);
static BUS_ATTR_WO(remove);
static BUS_ATTR_WO(add_single_major);
static BUS_ATTR_WO(remove_single_major);
static BUS_ATTR_RO(supported_features);

static struct attribute *rbd_bus_attrs[] = {
	&bus_attr_add.attr,
	&bus_attr_remove.attr,
	&bus_attr_add_single_major.attr,
	&bus_attr_remove_single_major.attr,
	&bus_attr_supported_features.attr,
	NULL,
};

static umode_t rbd_bus_is_visible(struct kobject *kobj,
				  struct attribute *attr, int index)
{
	if (!single_major &&
	    (attr == &bus_attr_add_single_major.attr ||
	     attr == &bus_attr_remove_single_major.attr))
		return 0;

	return attr->mode;
}

static const struct attribute_group rbd_bus_group = {
	.attrs = rbd_bus_attrs,
	.is_visible = rbd_bus_is_visible,
};
__ATTRIBUTE_GROUPS(rbd_bus);

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_groups	= rbd_bus_groups,
};

static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};

static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}

#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */

static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);

static int rbd_dev_refresh(struct rbd_device *rbd_dev);
static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
static int rbd_dev_header_info(struct rbd_device *rbd_dev);
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id);
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size);
static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev);

static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result);
static void rbd_img_handle_request(struct rbd_img_request *img_req, int result);

/*
 * Return true if nothing else is pending.
 */
static bool pending_result_dec(struct pending_result *pending, int *result)
{
	rbd_assert(pending->num_pending > 0);

	if (*result && !pending->result)
		pending->result = *result;
	if (--pending->num_pending)
		return false;

	*result = pending->result;
	return true;
}
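
/*
 * Illustrative fan-out/fan-in pattern (a sketch of how pending_result is
 * meant to be used, not a verbatim excerpt): the issuer counts each child
 * it launches, and every completion funnels through pending_result_dec():
 *
 *	pending->num_pending = nr_children;	// before launching children
 *	...
 *	// in each child's completion path:
 *	if (pending_result_dec(&parent->pending, &result))
 *		complete_parent(parent, result);	// hypothetical
 *
 * Only the last child to finish sees "true", and "result" then holds the
 * first nonzero (i.e. first error) result recorded across all children.
 */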
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	bool removing = false;

	spin_lock_irq(&rbd_dev->lock);
	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
		removing = true;
	else
		rbd_dev->open_count++;
	spin_unlock_irq(&rbd_dev->lock);
	if (removing)
		return -ENOENT;

	(void) get_device(&rbd_dev->dev);

	return 0;
}

static void rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;
	unsigned long open_count_before;

	spin_lock_irq(&rbd_dev->lock);
	open_count_before = rbd_dev->open_count--;
	spin_unlock_irq(&rbd_dev->lock);
	rbd_assert(open_count_before > 0);

	put_device(&rbd_dev->dev);
}

static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
{
	int ro;

	if (get_user(ro, (int __user *)arg))
		return -EFAULT;

	/*
	 * Both images mapped read-only and snapshots can't be marked
	 * read-write.
	 */
	if (!ro) {
		if (rbd_is_ro(rbd_dev))
			return -EROFS;

		rbd_assert(!rbd_is_snap(rbd_dev));
	}

	/* Let blkdev_roset() handle it */
	return -ENOTTY;
}

static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	int ret;

	switch (cmd) {
	case BLKROSET:
		ret = rbd_ioctl_set_ro(rbd_dev, arg);
		break;
	default:
		ret = -ENOTTY;
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	return rbd_ioctl(bdev, mode, cmd, arg);
}
#endif /* CONFIG_COMPAT */

static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
	.ioctl			= rbd_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl		= rbd_compat_ioctl,
#endif
};

/*
 * Initialize an rbd client instance.  Success or not, this function
 * consumes ceph_opts.  Caller holds client_mutex.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("%s:\n", __func__);
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	rbdc->client = ceph_create_client(ceph_opts, rbdc);
	if (IS_ERR(rbdc->client))
		goto out_rbdc;
	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_client;

	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	dout("%s: rbdc %p\n", __func__, rbdc);

	return rbdc;
out_client:
	ceph_destroy_client(rbdc->client);
out_rbdc:
	kfree(rbdc);
out_opt:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	dout("%s: error %d\n", __func__, ret);

	return ERR_PTR(ret);
}

static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
{
	kref_get(&rbdc->kref);

	return rbdc;
}

/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			__rbd_get_client(client_node);

			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}

/*
 * (Per device) rbd map options
 */
enum {
	Opt_queue_depth,
	Opt_alloc_size,
	Opt_lock_timeout,
	/* int args above */
	Opt_pool_ns,
	Opt_compression_hint,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	Opt_lock_on_read,
	Opt_exclusive,
	Opt_notrim,
};

enum {
	Opt_compression_hint_none,
	Opt_compression_hint_compressible,
	Opt_compression_hint_incompressible,
};

static const struct constant_table rbd_param_compression_hint[] = {
	{"none",		Opt_compression_hint_none},
	{"compressible",	Opt_compression_hint_compressible},
	{"incompressible",	Opt_compression_hint_incompressible},
	{}
};

static const struct fs_parameter_spec rbd_parameters[] = {
	fsparam_u32	("alloc_size",			Opt_alloc_size),
	fsparam_enum	("compression_hint",		Opt_compression_hint,
			 rbd_param_compression_hint),
	fsparam_flag	("exclusive",			Opt_exclusive),
	fsparam_flag	("lock_on_read",		Opt_lock_on_read),
	fsparam_u32	("lock_timeout",		Opt_lock_timeout),
	fsparam_flag	("notrim",			Opt_notrim),
	fsparam_string	("_pool_ns",			Opt_pool_ns),
	fsparam_u32	("queue_depth",			Opt_queue_depth),
	fsparam_flag	("read_only",			Opt_read_only),
	fsparam_flag	("read_write",			Opt_read_write),
	fsparam_flag	("ro",				Opt_read_only),
	fsparam_flag	("rw",				Opt_read_write),
	{}
};

struct rbd_options {
	int	queue_depth;
	int	alloc_size;
	unsigned long	lock_timeout;
	bool	read_only;
	bool	lock_on_read;
	bool	exclusive;
	bool	trim;

	u32 alloc_hint_flags;  /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */
};

#define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
#define RBD_ALLOC_SIZE_DEFAULT	(64 * 1024)
#define RBD_LOCK_TIMEOUT_DEFAULT 0  /* no timeout */
#define RBD_READ_ONLY_DEFAULT	false
#define RBD_LOCK_ON_READ_DEFAULT false
#define RBD_EXCLUSIVE_DEFAULT	false
#define RBD_TRIM_DEFAULT	true

struct rbd_parse_opts_ctx {
	struct rbd_spec		*spec;
	struct ceph_options	*copts;
	struct rbd_options	*opts;
};

static char* obj_op_name(enum obj_operation_type op_type)
{
	switch (op_type) {
	case OBJ_OP_READ:
		return "read";
	case OBJ_OP_WRITE:
		return "write";
	case OBJ_OP_DISCARD:
		return "discard";
	case OBJ_OP_ZEROOUT:
		return "zeroout";
	default:
		return "???";
	}
}

/*
 * Destroy ceph client
 *
 * Caller must hold rbd_client_list_lock.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("%s: rbdc %p\n", __func__, rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}

/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
	if (rbdc)
		kref_put(&rbdc->kref, rbd_client_release);
}

/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Either way, ceph_opts is consumed by this
 * function.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret;

	mutex_lock(&client_mutex);
	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		ceph_destroy_options(ceph_opts);

		/*
		 * Using an existing client.  Make sure ->pg_pools is up to
		 * date before we look up the pool id in do_rbd_add().
		 */
		ret = ceph_wait_for_latest_osdmap(rbdc->client,
					rbdc->client->options->mount_timeout);
		if (ret) {
			rbd_warn(NULL, "failed to get latest osdmap: %d", ret);
			rbd_put_client(rbdc);
			rbdc = ERR_PTR(ret);
		}
	} else {
		rbdc = rbd_client_create(ceph_opts);
	}
	mutex_unlock(&client_mutex);

	return rbdc;
}

static bool rbd_image_format_valid(u32 image_format)
{
	return image_format == 1 || image_format == 2;
}

static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}

/*
 * returns the size of an object in the image
 */
static u32 rbd_obj_bytes(struct rbd_image_header *header)
{
	return 1U << header->obj_order;
}
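
/*
 * Worked example: obj_order is the log2 of the object size, so the rbd
 * default of obj_order == 22 yields 1U << 22 == 4 MiB objects, and a
 * 1 GiB image is striped over 256 such objects.  The order > 31 check in
 * rbd_dev_ondisk_valid() above keeps this shift within the u32 result.
 */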
static void rbd_init_layout(struct rbd_device *rbd_dev)
{
	if (rbd_dev->header.stripe_unit == 0 ||
	    rbd_dev->header.stripe_count == 0) {
		rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
		rbd_dev->header.stripe_count = 1;
	}

	rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
	rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
	rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
	rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
			  rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
}

/*
 * Fill an rbd image header with information from the given format 1
 * on-disk header.
 */
static int rbd_header_from_disk(struct rbd_device *rbd_dev,
				 struct rbd_image_header_ondisk *ondisk)
{
	struct rbd_image_header *header = &rbd_dev->header;
	bool first_time = header->object_prefix == NULL;
	struct ceph_snap_context *snapc;
	char *object_prefix = NULL;
	char *snap_names = NULL;
	u64 *snap_sizes = NULL;
	u32 snap_count;
	int ret = -ENOMEM;
	u32 i;

	/* Allocate this now to avoid having to handle failure below */

	if (first_time) {
		object_prefix = kstrndup(ondisk->object_prefix,
					 sizeof(ondisk->object_prefix),
					 GFP_KERNEL);
		if (!object_prefix)
			return -ENOMEM;
	}

	/* Allocate the snapshot context and fill it in */

	snap_count = le32_to_cpu(ondisk->snap_count);
	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!snapc)
		goto out_err;
	snapc->seq = le64_to_cpu(ondisk->snap_seq);
	if (snap_count) {
		struct rbd_image_snap_ondisk *snaps;
		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

		/* We'll keep a copy of the snapshot names... */

		if (snap_names_len > (u64)SIZE_MAX)
			goto out_2big;
		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
		if (!snap_names)
			goto out_err;

		/* ...as well as the array of their sizes. */
		snap_sizes = kmalloc_array(snap_count,
					   sizeof(*header->snap_sizes),
					   GFP_KERNEL);
		if (!snap_sizes)
			goto out_err;

		/*
		 * Copy the names, and fill in each snapshot's id
		 * and size.
		 *
		 * Note that rbd_dev_v1_header_info() guarantees the
		 * ondisk buffer we're working with has
		 * snap_names_len bytes beyond the end of the
		 * snapshot id array, so this memcpy() is safe.
		 */
1112*4882a593Smuzhiyun 		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
1113*4882a593Smuzhiyun 		snaps = ondisk->snaps;
1114*4882a593Smuzhiyun 		for (i = 0; i < snap_count; i++) {
1115*4882a593Smuzhiyun 			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
1116*4882a593Smuzhiyun 			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
1117*4882a593Smuzhiyun 		}
1118*4882a593Smuzhiyun 	}
1119*4882a593Smuzhiyun 
1120*4882a593Smuzhiyun 	/* We won't fail any more, fill in the header */
1121*4882a593Smuzhiyun 
1122*4882a593Smuzhiyun 	if (first_time) {
1123*4882a593Smuzhiyun 		header->object_prefix = object_prefix;
1124*4882a593Smuzhiyun 		header->obj_order = ondisk->options.order;
1125*4882a593Smuzhiyun 		rbd_init_layout(rbd_dev);
1126*4882a593Smuzhiyun 	} else {
1127*4882a593Smuzhiyun 		ceph_put_snap_context(header->snapc);
1128*4882a593Smuzhiyun 		kfree(header->snap_names);
1129*4882a593Smuzhiyun 		kfree(header->snap_sizes);
1130*4882a593Smuzhiyun 	}
1131*4882a593Smuzhiyun 
1132*4882a593Smuzhiyun 	/* The remaining fields always get updated (when we refresh) */
1133*4882a593Smuzhiyun 
1134*4882a593Smuzhiyun 	header->image_size = le64_to_cpu(ondisk->image_size);
1135*4882a593Smuzhiyun 	header->snapc = snapc;
1136*4882a593Smuzhiyun 	header->snap_names = snap_names;
1137*4882a593Smuzhiyun 	header->snap_sizes = snap_sizes;
1138*4882a593Smuzhiyun 
1139*4882a593Smuzhiyun 	return 0;
1140*4882a593Smuzhiyun out_2big:
1141*4882a593Smuzhiyun 	ret = -EIO;
1142*4882a593Smuzhiyun out_err:
1143*4882a593Smuzhiyun 	kfree(snap_sizes);
1144*4882a593Smuzhiyun 	kfree(snap_names);
1145*4882a593Smuzhiyun 	ceph_put_snap_context(snapc);
1146*4882a593Smuzhiyun 	kfree(object_prefix);
1147*4882a593Smuzhiyun 
1148*4882a593Smuzhiyun 	return ret;
1149*4882a593Smuzhiyun }
1150*4882a593Smuzhiyun 
_rbd_dev_v1_snap_name(struct rbd_device * rbd_dev,u32 which)1151*4882a593Smuzhiyun static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
1152*4882a593Smuzhiyun {
1153*4882a593Smuzhiyun 	const char *snap_name;
1154*4882a593Smuzhiyun 
1155*4882a593Smuzhiyun 	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
1156*4882a593Smuzhiyun 
1157*4882a593Smuzhiyun 	/* Skip over names until we find the one we are looking for */
1158*4882a593Smuzhiyun 
1159*4882a593Smuzhiyun 	snap_name = rbd_dev->header.snap_names;
1160*4882a593Smuzhiyun 	while (which--)
1161*4882a593Smuzhiyun 		snap_name += strlen(snap_name) + 1;
1162*4882a593Smuzhiyun 
1163*4882a593Smuzhiyun 	return kstrdup(snap_name, GFP_KERNEL);
1164*4882a593Smuzhiyun }
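/*
 * Worked example (editor's illustration, not from the original source):
 * v1 snapshot names are stored as consecutive NUL-terminated strings,
 * in the same order as snapc->snaps.  For a hypothetical buffer
 *
 *	snap_names = "mon\0tue\0wed\0"
 *
 * the walk above returns "mon" for which == 0, skips
 * strlen("mon") + 1 bytes to return "tue" for which == 1, and "wed"
 * for which == 2.  The result is kstrdup()ed, so the caller owns it.
 */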
1165*4882a593Smuzhiyun 
1166*4882a593Smuzhiyun /*
1167*4882a593Smuzhiyun  * Snapshot id comparison function for use with qsort()/bsearch().
1168*4882a593Smuzhiyun  * Note that the result is for snapshots in *descending* order.
1169*4882a593Smuzhiyun  */
1170*4882a593Smuzhiyun static int snapid_compare_reverse(const void *s1, const void *s2)
1171*4882a593Smuzhiyun {
1172*4882a593Smuzhiyun 	u64 snap_id1 = *(u64 *)s1;
1173*4882a593Smuzhiyun 	u64 snap_id2 = *(u64 *)s2;
1174*4882a593Smuzhiyun 
1175*4882a593Smuzhiyun 	if (snap_id1 < snap_id2)
1176*4882a593Smuzhiyun 		return 1;
1177*4882a593Smuzhiyun 	return snap_id1 == snap_id2 ? 0 : -1;
1178*4882a593Smuzhiyun }
1179*4882a593Smuzhiyun 
1180*4882a593Smuzhiyun /*
1181*4882a593Smuzhiyun  * Search a snapshot context to see if the given snapshot id is
1182*4882a593Smuzhiyun  * present.
1183*4882a593Smuzhiyun  *
1184*4882a593Smuzhiyun  * Returns the position of the snapshot id in the array if it's found,
1185*4882a593Smuzhiyun  * or BAD_SNAP_INDEX otherwise.
1186*4882a593Smuzhiyun  *
1187*4882a593Smuzhiyun  * Note: The snapshot array is kept sorted (by the osd) in
1188*4882a593Smuzhiyun  * reverse order, highest snapshot id first.
1189*4882a593Smuzhiyun  */
1190*4882a593Smuzhiyun static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
1191*4882a593Smuzhiyun {
1192*4882a593Smuzhiyun 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
1193*4882a593Smuzhiyun 	u64 *found;
1194*4882a593Smuzhiyun 
1195*4882a593Smuzhiyun 	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
1196*4882a593Smuzhiyun 				sizeof (snap_id), snapid_compare_reverse);
1197*4882a593Smuzhiyun 
1198*4882a593Smuzhiyun 	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
1199*4882a593Smuzhiyun }
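/*
 * Worked example (illustrative): with snapc->snaps = { 12, 7, 3 }
 * (reverse-sorted, highest id first), bsearch() driven by
 * snapid_compare_reverse() locates snap_id 7 at index 1, so
 * rbd_dev_snap_index() returns 1; a lookup of snap_id 5 matches
 * nothing and returns BAD_SNAP_INDEX.
 */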
1200*4882a593Smuzhiyun 
1201*4882a593Smuzhiyun static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
1202*4882a593Smuzhiyun 					u64 snap_id)
1203*4882a593Smuzhiyun {
1204*4882a593Smuzhiyun 	u32 which;
1205*4882a593Smuzhiyun 	const char *snap_name;
1206*4882a593Smuzhiyun 
1207*4882a593Smuzhiyun 	which = rbd_dev_snap_index(rbd_dev, snap_id);
1208*4882a593Smuzhiyun 	if (which == BAD_SNAP_INDEX)
1209*4882a593Smuzhiyun 		return ERR_PTR(-ENOENT);
1210*4882a593Smuzhiyun 
1211*4882a593Smuzhiyun 	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
1212*4882a593Smuzhiyun 	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
1213*4882a593Smuzhiyun }
1214*4882a593Smuzhiyun 
1215*4882a593Smuzhiyun static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
1216*4882a593Smuzhiyun {
1217*4882a593Smuzhiyun 	if (snap_id == CEPH_NOSNAP)
1218*4882a593Smuzhiyun 		return RBD_SNAP_HEAD_NAME;
1219*4882a593Smuzhiyun 
1220*4882a593Smuzhiyun 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1221*4882a593Smuzhiyun 	if (rbd_dev->image_format == 1)
1222*4882a593Smuzhiyun 		return rbd_dev_v1_snap_name(rbd_dev, snap_id);
1223*4882a593Smuzhiyun 
1224*4882a593Smuzhiyun 	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
1225*4882a593Smuzhiyun }
1226*4882a593Smuzhiyun 
1227*4882a593Smuzhiyun static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
1228*4882a593Smuzhiyun 				u64 *snap_size)
1229*4882a593Smuzhiyun {
1230*4882a593Smuzhiyun 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1231*4882a593Smuzhiyun 	if (snap_id == CEPH_NOSNAP) {
1232*4882a593Smuzhiyun 		*snap_size = rbd_dev->header.image_size;
1233*4882a593Smuzhiyun 	} else if (rbd_dev->image_format == 1) {
1234*4882a593Smuzhiyun 		u32 which;
1235*4882a593Smuzhiyun 
1236*4882a593Smuzhiyun 		which = rbd_dev_snap_index(rbd_dev, snap_id);
1237*4882a593Smuzhiyun 		if (which == BAD_SNAP_INDEX)
1238*4882a593Smuzhiyun 			return -ENOENT;
1239*4882a593Smuzhiyun 
1240*4882a593Smuzhiyun 		*snap_size = rbd_dev->header.snap_sizes[which];
1241*4882a593Smuzhiyun 	} else {
1242*4882a593Smuzhiyun 		u64 size = 0;
1243*4882a593Smuzhiyun 		int ret;
1244*4882a593Smuzhiyun 
1245*4882a593Smuzhiyun 		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
1246*4882a593Smuzhiyun 		if (ret)
1247*4882a593Smuzhiyun 			return ret;
1248*4882a593Smuzhiyun 
1249*4882a593Smuzhiyun 		*snap_size = size;
1250*4882a593Smuzhiyun 	}
1251*4882a593Smuzhiyun 	return 0;
1252*4882a593Smuzhiyun }
1253*4882a593Smuzhiyun 
1254*4882a593Smuzhiyun static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
1255*4882a593Smuzhiyun {
1256*4882a593Smuzhiyun 	u64 snap_id = rbd_dev->spec->snap_id;
1257*4882a593Smuzhiyun 	u64 size = 0;
1258*4882a593Smuzhiyun 	int ret;
1259*4882a593Smuzhiyun 
1260*4882a593Smuzhiyun 	ret = rbd_snap_size(rbd_dev, snap_id, &size);
1261*4882a593Smuzhiyun 	if (ret)
1262*4882a593Smuzhiyun 		return ret;
1263*4882a593Smuzhiyun 
1264*4882a593Smuzhiyun 	rbd_dev->mapping.size = size;
1265*4882a593Smuzhiyun 	return 0;
1266*4882a593Smuzhiyun }
1267*4882a593Smuzhiyun 
1268*4882a593Smuzhiyun static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1269*4882a593Smuzhiyun {
1270*4882a593Smuzhiyun 	rbd_dev->mapping.size = 0;
1271*4882a593Smuzhiyun }
1272*4882a593Smuzhiyun 
1273*4882a593Smuzhiyun static void zero_bvec(struct bio_vec *bv)
1274*4882a593Smuzhiyun {
1275*4882a593Smuzhiyun 	void *buf;
1276*4882a593Smuzhiyun 	unsigned long flags;
1277*4882a593Smuzhiyun 
1278*4882a593Smuzhiyun 	buf = bvec_kmap_irq(bv, &flags);
1279*4882a593Smuzhiyun 	memset(buf, 0, bv->bv_len);
1280*4882a593Smuzhiyun 	flush_dcache_page(bv->bv_page);
1281*4882a593Smuzhiyun 	bvec_kunmap_irq(buf, &flags);
1282*4882a593Smuzhiyun }
1283*4882a593Smuzhiyun 
1284*4882a593Smuzhiyun static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
1285*4882a593Smuzhiyun {
1286*4882a593Smuzhiyun 	struct ceph_bio_iter it = *bio_pos;
1287*4882a593Smuzhiyun 
1288*4882a593Smuzhiyun 	ceph_bio_iter_advance(&it, off);
1289*4882a593Smuzhiyun 	ceph_bio_iter_advance_step(&it, bytes, ({
1290*4882a593Smuzhiyun 		zero_bvec(&bv);
1291*4882a593Smuzhiyun 	}));
1292*4882a593Smuzhiyun }
1293*4882a593Smuzhiyun 
1294*4882a593Smuzhiyun static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
1295*4882a593Smuzhiyun {
1296*4882a593Smuzhiyun 	struct ceph_bvec_iter it = *bvec_pos;
1297*4882a593Smuzhiyun 
1298*4882a593Smuzhiyun 	ceph_bvec_iter_advance(&it, off);
1299*4882a593Smuzhiyun 	ceph_bvec_iter_advance_step(&it, bytes, ({
1300*4882a593Smuzhiyun 		zero_bvec(&bv);
1301*4882a593Smuzhiyun 	}));
1302*4882a593Smuzhiyun }
1303*4882a593Smuzhiyun 
1304*4882a593Smuzhiyun /*
1305*4882a593Smuzhiyun  * Zero a range in @obj_req data buffer defined by a bio (list) or
1306*4882a593Smuzhiyun  * (private) bio_vec array.
1307*4882a593Smuzhiyun  *
1308*4882a593Smuzhiyun  * @off is relative to the start of the data buffer.
1309*4882a593Smuzhiyun  */
1310*4882a593Smuzhiyun static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
1311*4882a593Smuzhiyun 			       u32 bytes)
1312*4882a593Smuzhiyun {
1313*4882a593Smuzhiyun 	dout("%s %p data buf %u~%u\n", __func__, obj_req, off, bytes);
1314*4882a593Smuzhiyun 
1315*4882a593Smuzhiyun 	switch (obj_req->img_request->data_type) {
1316*4882a593Smuzhiyun 	case OBJ_REQUEST_BIO:
1317*4882a593Smuzhiyun 		zero_bios(&obj_req->bio_pos, off, bytes);
1318*4882a593Smuzhiyun 		break;
1319*4882a593Smuzhiyun 	case OBJ_REQUEST_BVECS:
1320*4882a593Smuzhiyun 	case OBJ_REQUEST_OWN_BVECS:
1321*4882a593Smuzhiyun 		zero_bvecs(&obj_req->bvec_pos, off, bytes);
1322*4882a593Smuzhiyun 		break;
1323*4882a593Smuzhiyun 	default:
1324*4882a593Smuzhiyun 		BUG();
1325*4882a593Smuzhiyun 	}
1326*4882a593Smuzhiyun }
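/*
 * Hedged usage sketch (not taken from the original source): a caller
 * that received a short read of bytes_read bytes could zero the
 * unfilled tail of the data buffer with
 *
 *	rbd_obj_zero_range(obj_req, bytes_read,
 *			   obj_req->ex.oe_len - bytes_read);
 *
 * assuming bytes_read < obj_req->ex.oe_len.
 */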
1327*4882a593Smuzhiyun 
1328*4882a593Smuzhiyun static void rbd_obj_request_destroy(struct kref *kref);
1329*4882a593Smuzhiyun static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1330*4882a593Smuzhiyun {
1331*4882a593Smuzhiyun 	rbd_assert(obj_request != NULL);
1332*4882a593Smuzhiyun 	dout("%s: obj %p (was %d)\n", __func__, obj_request,
1333*4882a593Smuzhiyun 		kref_read(&obj_request->kref));
1334*4882a593Smuzhiyun 	kref_put(&obj_request->kref, rbd_obj_request_destroy);
1335*4882a593Smuzhiyun }
1336*4882a593Smuzhiyun 
1337*4882a593Smuzhiyun static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1338*4882a593Smuzhiyun 					struct rbd_obj_request *obj_request)
1339*4882a593Smuzhiyun {
1340*4882a593Smuzhiyun 	rbd_assert(obj_request->img_request == NULL);
1341*4882a593Smuzhiyun 
1342*4882a593Smuzhiyun 	/* Image request now owns object's original reference */
1343*4882a593Smuzhiyun 	obj_request->img_request = img_request;
1344*4882a593Smuzhiyun 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
1345*4882a593Smuzhiyun }
1346*4882a593Smuzhiyun 
1347*4882a593Smuzhiyun static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1348*4882a593Smuzhiyun 					struct rbd_obj_request *obj_request)
1349*4882a593Smuzhiyun {
1350*4882a593Smuzhiyun 	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
1351*4882a593Smuzhiyun 	list_del(&obj_request->ex.oe_item);
1352*4882a593Smuzhiyun 	rbd_assert(obj_request->img_request == img_request);
1353*4882a593Smuzhiyun 	rbd_obj_request_put(obj_request);
1354*4882a593Smuzhiyun }
1355*4882a593Smuzhiyun 
1356*4882a593Smuzhiyun static void rbd_osd_submit(struct ceph_osd_request *osd_req)
1357*4882a593Smuzhiyun {
1358*4882a593Smuzhiyun 	struct rbd_obj_request *obj_req = osd_req->r_priv;
1359*4882a593Smuzhiyun 
1360*4882a593Smuzhiyun 	dout("%s osd_req %p for obj_req %p objno %llu %llu~%llu\n",
1361*4882a593Smuzhiyun 	     __func__, osd_req, obj_req, obj_req->ex.oe_objno,
1362*4882a593Smuzhiyun 	     obj_req->ex.oe_off, obj_req->ex.oe_len);
1363*4882a593Smuzhiyun 	ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
1364*4882a593Smuzhiyun }
1365*4882a593Smuzhiyun 
1366*4882a593Smuzhiyun /*
1367*4882a593Smuzhiyun  * The default/initial value for all image request flags is 0.  Each
1368*4882a593Smuzhiyun  * is conditionally set to 1 at image request initialization time
1369*4882a593Smuzhiyun  * and currently never changes thereafter.
1370*4882a593Smuzhiyun  */
1371*4882a593Smuzhiyun static void img_request_layered_set(struct rbd_img_request *img_request)
1372*4882a593Smuzhiyun {
1373*4882a593Smuzhiyun 	set_bit(IMG_REQ_LAYERED, &img_request->flags);
1374*4882a593Smuzhiyun }
1375*4882a593Smuzhiyun 
1376*4882a593Smuzhiyun static bool img_request_layered_test(struct rbd_img_request *img_request)
1377*4882a593Smuzhiyun {
1378*4882a593Smuzhiyun 	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1379*4882a593Smuzhiyun }
1380*4882a593Smuzhiyun 
1381*4882a593Smuzhiyun static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
1382*4882a593Smuzhiyun {
1383*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1384*4882a593Smuzhiyun 
1385*4882a593Smuzhiyun 	return !obj_req->ex.oe_off &&
1386*4882a593Smuzhiyun 	       obj_req->ex.oe_len == rbd_dev->layout.object_size;
1387*4882a593Smuzhiyun }
1388*4882a593Smuzhiyun 
1389*4882a593Smuzhiyun static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
1390*4882a593Smuzhiyun {
1391*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1392*4882a593Smuzhiyun 
1393*4882a593Smuzhiyun 	return obj_req->ex.oe_off + obj_req->ex.oe_len ==
1394*4882a593Smuzhiyun 					rbd_dev->layout.object_size;
1395*4882a593Smuzhiyun }
1396*4882a593Smuzhiyun 
1397*4882a593Smuzhiyun /*
1398*4882a593Smuzhiyun  * Must be called after rbd_obj_calc_img_extents().
1399*4882a593Smuzhiyun  */
1400*4882a593Smuzhiyun static bool rbd_obj_copyup_enabled(struct rbd_obj_request *obj_req)
1401*4882a593Smuzhiyun {
1402*4882a593Smuzhiyun 	if (!obj_req->num_img_extents ||
1403*4882a593Smuzhiyun 	    (rbd_obj_is_entire(obj_req) &&
1404*4882a593Smuzhiyun 	     !obj_req->img_request->snapc->num_snaps))
1405*4882a593Smuzhiyun 		return false;
1406*4882a593Smuzhiyun 
1407*4882a593Smuzhiyun 	return true;
1408*4882a593Smuzhiyun }
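/*
 * Commentary (editor's note): the test above says copyup is pointless
 * when the object maps to nothing in the parent (no image extents),
 * and unnecessary when the entire object is being overwritten and no
 * snapshots exist whose view of the parent data would need preserving.
 */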
1409*4882a593Smuzhiyun 
1410*4882a593Smuzhiyun static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req)
1411*4882a593Smuzhiyun {
1412*4882a593Smuzhiyun 	return ceph_file_extents_bytes(obj_req->img_extents,
1413*4882a593Smuzhiyun 				       obj_req->num_img_extents);
1414*4882a593Smuzhiyun }
1415*4882a593Smuzhiyun 
1416*4882a593Smuzhiyun static bool rbd_img_is_write(struct rbd_img_request *img_req)
1417*4882a593Smuzhiyun {
1418*4882a593Smuzhiyun 	switch (img_req->op_type) {
1419*4882a593Smuzhiyun 	case OBJ_OP_READ:
1420*4882a593Smuzhiyun 		return false;
1421*4882a593Smuzhiyun 	case OBJ_OP_WRITE:
1422*4882a593Smuzhiyun 	case OBJ_OP_DISCARD:
1423*4882a593Smuzhiyun 	case OBJ_OP_ZEROOUT:
1424*4882a593Smuzhiyun 		return true;
1425*4882a593Smuzhiyun 	default:
1426*4882a593Smuzhiyun 		BUG();
1427*4882a593Smuzhiyun 	}
1428*4882a593Smuzhiyun }
1429*4882a593Smuzhiyun 
1430*4882a593Smuzhiyun static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
1431*4882a593Smuzhiyun {
1432*4882a593Smuzhiyun 	struct rbd_obj_request *obj_req = osd_req->r_priv;
1433*4882a593Smuzhiyun 	int result;
1434*4882a593Smuzhiyun 
1435*4882a593Smuzhiyun 	dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
1436*4882a593Smuzhiyun 	     osd_req->r_result, obj_req);
1437*4882a593Smuzhiyun 
1438*4882a593Smuzhiyun 	/*
1439*4882a593Smuzhiyun 	 * Writes aren't allowed to return a data payload.  In some
1440*4882a593Smuzhiyun 	 * guarded write cases (e.g. stat + zero on an empty object)
1441*4882a593Smuzhiyun 	 * a stat response makes it through, but we don't care.
1442*4882a593Smuzhiyun 	 */
1443*4882a593Smuzhiyun 	if (osd_req->r_result > 0 && rbd_img_is_write(obj_req->img_request))
1444*4882a593Smuzhiyun 		result = 0;
1445*4882a593Smuzhiyun 	else
1446*4882a593Smuzhiyun 		result = osd_req->r_result;
1447*4882a593Smuzhiyun 
1448*4882a593Smuzhiyun 	rbd_obj_handle_request(obj_req, result);
1449*4882a593Smuzhiyun }
1450*4882a593Smuzhiyun 
1451*4882a593Smuzhiyun static void rbd_osd_format_read(struct ceph_osd_request *osd_req)
1452*4882a593Smuzhiyun {
1453*4882a593Smuzhiyun 	struct rbd_obj_request *obj_request = osd_req->r_priv;
1454*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
1455*4882a593Smuzhiyun 	struct ceph_options *opt = rbd_dev->rbd_client->client->options;
1456*4882a593Smuzhiyun 
1457*4882a593Smuzhiyun 	osd_req->r_flags = CEPH_OSD_FLAG_READ | opt->read_from_replica;
1458*4882a593Smuzhiyun 	osd_req->r_snapid = obj_request->img_request->snap_id;
1459*4882a593Smuzhiyun }
1460*4882a593Smuzhiyun 
1461*4882a593Smuzhiyun static void rbd_osd_format_write(struct ceph_osd_request *osd_req)
1462*4882a593Smuzhiyun {
1463*4882a593Smuzhiyun 	struct rbd_obj_request *obj_request = osd_req->r_priv;
1464*4882a593Smuzhiyun 
1465*4882a593Smuzhiyun 	osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
1466*4882a593Smuzhiyun 	ktime_get_real_ts64(&osd_req->r_mtime);
1467*4882a593Smuzhiyun 	osd_req->r_data_offset = obj_request->ex.oe_off;
1468*4882a593Smuzhiyun }
1469*4882a593Smuzhiyun 
1470*4882a593Smuzhiyun static struct ceph_osd_request *
1471*4882a593Smuzhiyun __rbd_obj_add_osd_request(struct rbd_obj_request *obj_req,
1472*4882a593Smuzhiyun 			  struct ceph_snap_context *snapc, int num_ops)
1473*4882a593Smuzhiyun {
1474*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1475*4882a593Smuzhiyun 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1476*4882a593Smuzhiyun 	struct ceph_osd_request *req;
1477*4882a593Smuzhiyun 	const char *name_format = rbd_dev->image_format == 1 ?
1478*4882a593Smuzhiyun 				      RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
1479*4882a593Smuzhiyun 	int ret;
1480*4882a593Smuzhiyun 
1481*4882a593Smuzhiyun 	req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
1482*4882a593Smuzhiyun 	if (!req)
1483*4882a593Smuzhiyun 		return ERR_PTR(-ENOMEM);
1484*4882a593Smuzhiyun 
1485*4882a593Smuzhiyun 	list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
1486*4882a593Smuzhiyun 	req->r_callback = rbd_osd_req_callback;
1487*4882a593Smuzhiyun 	req->r_priv = obj_req;
1488*4882a593Smuzhiyun 
1489*4882a593Smuzhiyun 	/*
1490*4882a593Smuzhiyun 	 * Data objects may be stored in a separate pool, but always in
1491*4882a593Smuzhiyun 	 * the same namespace in that pool as the header in its pool.
1492*4882a593Smuzhiyun 	 */
1493*4882a593Smuzhiyun 	ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
1494*4882a593Smuzhiyun 	req->r_base_oloc.pool = rbd_dev->layout.pool_id;
1495*4882a593Smuzhiyun 
1496*4882a593Smuzhiyun 	ret = ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
1497*4882a593Smuzhiyun 			       rbd_dev->header.object_prefix,
1498*4882a593Smuzhiyun 			       obj_req->ex.oe_objno);
1499*4882a593Smuzhiyun 	if (ret)
1500*4882a593Smuzhiyun 		return ERR_PTR(ret);
1501*4882a593Smuzhiyun 
1502*4882a593Smuzhiyun 	return req;
1503*4882a593Smuzhiyun }
1504*4882a593Smuzhiyun 
1505*4882a593Smuzhiyun static struct ceph_osd_request *
1506*4882a593Smuzhiyun rbd_obj_add_osd_request(struct rbd_obj_request *obj_req, int num_ops)
1507*4882a593Smuzhiyun {
1508*4882a593Smuzhiyun 	return __rbd_obj_add_osd_request(obj_req, obj_req->img_request->snapc,
1509*4882a593Smuzhiyun 					 num_ops);
1510*4882a593Smuzhiyun }
1511*4882a593Smuzhiyun 
1512*4882a593Smuzhiyun static struct rbd_obj_request *rbd_obj_request_create(void)
1513*4882a593Smuzhiyun {
1514*4882a593Smuzhiyun 	struct rbd_obj_request *obj_request;
1515*4882a593Smuzhiyun 
1516*4882a593Smuzhiyun 	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
1517*4882a593Smuzhiyun 	if (!obj_request)
1518*4882a593Smuzhiyun 		return NULL;
1519*4882a593Smuzhiyun 
1520*4882a593Smuzhiyun 	ceph_object_extent_init(&obj_request->ex);
1521*4882a593Smuzhiyun 	INIT_LIST_HEAD(&obj_request->osd_reqs);
1522*4882a593Smuzhiyun 	mutex_init(&obj_request->state_mutex);
1523*4882a593Smuzhiyun 	kref_init(&obj_request->kref);
1524*4882a593Smuzhiyun 
1525*4882a593Smuzhiyun 	dout("%s %p\n", __func__, obj_request);
1526*4882a593Smuzhiyun 	return obj_request;
1527*4882a593Smuzhiyun }
1528*4882a593Smuzhiyun 
1529*4882a593Smuzhiyun static void rbd_obj_request_destroy(struct kref *kref)
1530*4882a593Smuzhiyun {
1531*4882a593Smuzhiyun 	struct rbd_obj_request *obj_request;
1532*4882a593Smuzhiyun 	struct ceph_osd_request *osd_req;
1533*4882a593Smuzhiyun 	u32 i;
1534*4882a593Smuzhiyun 
1535*4882a593Smuzhiyun 	obj_request = container_of(kref, struct rbd_obj_request, kref);
1536*4882a593Smuzhiyun 
1537*4882a593Smuzhiyun 	dout("%s: obj %p\n", __func__, obj_request);
1538*4882a593Smuzhiyun 
1539*4882a593Smuzhiyun 	while (!list_empty(&obj_request->osd_reqs)) {
1540*4882a593Smuzhiyun 		osd_req = list_first_entry(&obj_request->osd_reqs,
1541*4882a593Smuzhiyun 				    struct ceph_osd_request, r_private_item);
1542*4882a593Smuzhiyun 		list_del_init(&osd_req->r_private_item);
1543*4882a593Smuzhiyun 		ceph_osdc_put_request(osd_req);
1544*4882a593Smuzhiyun 	}
1545*4882a593Smuzhiyun 
1546*4882a593Smuzhiyun 	switch (obj_request->img_request->data_type) {
1547*4882a593Smuzhiyun 	case OBJ_REQUEST_NODATA:
1548*4882a593Smuzhiyun 	case OBJ_REQUEST_BIO:
1549*4882a593Smuzhiyun 	case OBJ_REQUEST_BVECS:
1550*4882a593Smuzhiyun 		break;		/* Nothing to do */
1551*4882a593Smuzhiyun 	case OBJ_REQUEST_OWN_BVECS:
1552*4882a593Smuzhiyun 		kfree(obj_request->bvec_pos.bvecs);
1553*4882a593Smuzhiyun 		break;
1554*4882a593Smuzhiyun 	default:
1555*4882a593Smuzhiyun 		BUG();
1556*4882a593Smuzhiyun 	}
1557*4882a593Smuzhiyun 
1558*4882a593Smuzhiyun 	kfree(obj_request->img_extents);
1559*4882a593Smuzhiyun 	if (obj_request->copyup_bvecs) {
1560*4882a593Smuzhiyun 		for (i = 0; i < obj_request->copyup_bvec_count; i++) {
1561*4882a593Smuzhiyun 			if (obj_request->copyup_bvecs[i].bv_page)
1562*4882a593Smuzhiyun 				__free_page(obj_request->copyup_bvecs[i].bv_page);
1563*4882a593Smuzhiyun 		}
1564*4882a593Smuzhiyun 		kfree(obj_request->copyup_bvecs);
1565*4882a593Smuzhiyun 	}
1566*4882a593Smuzhiyun 
1567*4882a593Smuzhiyun 	kmem_cache_free(rbd_obj_request_cache, obj_request);
1568*4882a593Smuzhiyun }
1569*4882a593Smuzhiyun 
1570*4882a593Smuzhiyun /* It's OK to call this for a device with no parent */
1571*4882a593Smuzhiyun 
1572*4882a593Smuzhiyun static void rbd_spec_put(struct rbd_spec *spec);
1573*4882a593Smuzhiyun static void rbd_dev_unparent(struct rbd_device *rbd_dev)
1574*4882a593Smuzhiyun {
1575*4882a593Smuzhiyun 	rbd_dev_remove_parent(rbd_dev);
1576*4882a593Smuzhiyun 	rbd_spec_put(rbd_dev->parent_spec);
1577*4882a593Smuzhiyun 	rbd_dev->parent_spec = NULL;
1578*4882a593Smuzhiyun 	rbd_dev->parent_overlap = 0;
1579*4882a593Smuzhiyun }
1580*4882a593Smuzhiyun 
1581*4882a593Smuzhiyun /*
1582*4882a593Smuzhiyun  * Parent image reference counting is used to determine when an
1583*4882a593Smuzhiyun  * image's parent fields can be safely torn down--after there are no
1584*4882a593Smuzhiyun  * more in-flight requests to the parent image.  When the last
1585*4882a593Smuzhiyun  * reference is dropped, cleaning them up is safe.
1586*4882a593Smuzhiyun  */
1587*4882a593Smuzhiyun static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
1588*4882a593Smuzhiyun {
1589*4882a593Smuzhiyun 	int counter;
1590*4882a593Smuzhiyun 
1591*4882a593Smuzhiyun 	if (!rbd_dev->parent_spec)
1592*4882a593Smuzhiyun 		return;
1593*4882a593Smuzhiyun 
1594*4882a593Smuzhiyun 	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
1595*4882a593Smuzhiyun 	if (counter > 0)
1596*4882a593Smuzhiyun 		return;
1597*4882a593Smuzhiyun 
1598*4882a593Smuzhiyun 	/* Last reference; clean up parent data structures */
1599*4882a593Smuzhiyun 
1600*4882a593Smuzhiyun 	if (!counter)
1601*4882a593Smuzhiyun 		rbd_dev_unparent(rbd_dev);
1602*4882a593Smuzhiyun 	else
1603*4882a593Smuzhiyun 		rbd_warn(rbd_dev, "parent reference underflow");
1604*4882a593Smuzhiyun }
1605*4882a593Smuzhiyun 
1606*4882a593Smuzhiyun /*
1607*4882a593Smuzhiyun  * If an image has a non-zero parent overlap, get a reference to its
1608*4882a593Smuzhiyun  * parent.
1609*4882a593Smuzhiyun  *
1610*4882a593Smuzhiyun  * Returns true if the rbd device has a parent with a non-zero
1611*4882a593Smuzhiyun  * overlap and a reference for it was successfully taken, or
1612*4882a593Smuzhiyun  * false otherwise.
1613*4882a593Smuzhiyun  */
1614*4882a593Smuzhiyun static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
1615*4882a593Smuzhiyun {
1616*4882a593Smuzhiyun 	int counter = 0;
1617*4882a593Smuzhiyun 
1618*4882a593Smuzhiyun 	if (!rbd_dev->parent_spec)
1619*4882a593Smuzhiyun 		return false;
1620*4882a593Smuzhiyun 
1621*4882a593Smuzhiyun 	if (rbd_dev->parent_overlap)
1622*4882a593Smuzhiyun 		counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
1623*4882a593Smuzhiyun 
1624*4882a593Smuzhiyun 	if (counter < 0)
1625*4882a593Smuzhiyun 		rbd_warn(rbd_dev, "parent reference overflow");
1626*4882a593Smuzhiyun 
1627*4882a593Smuzhiyun 	return counter > 0;
1628*4882a593Smuzhiyun }
1629*4882a593Smuzhiyun 
1630*4882a593Smuzhiyun static void rbd_img_request_init(struct rbd_img_request *img_request,
1631*4882a593Smuzhiyun 				 struct rbd_device *rbd_dev,
1632*4882a593Smuzhiyun 				 enum obj_operation_type op_type)
1633*4882a593Smuzhiyun {
1634*4882a593Smuzhiyun 	memset(img_request, 0, sizeof(*img_request));
1635*4882a593Smuzhiyun 
1636*4882a593Smuzhiyun 	img_request->rbd_dev = rbd_dev;
1637*4882a593Smuzhiyun 	img_request->op_type = op_type;
1638*4882a593Smuzhiyun 
1639*4882a593Smuzhiyun 	INIT_LIST_HEAD(&img_request->lock_item);
1640*4882a593Smuzhiyun 	INIT_LIST_HEAD(&img_request->object_extents);
1641*4882a593Smuzhiyun 	mutex_init(&img_request->state_mutex);
1642*4882a593Smuzhiyun }
1643*4882a593Smuzhiyun 
1644*4882a593Smuzhiyun static void rbd_img_capture_header(struct rbd_img_request *img_req)
1645*4882a593Smuzhiyun {
1646*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = img_req->rbd_dev;
1647*4882a593Smuzhiyun 
1648*4882a593Smuzhiyun 	lockdep_assert_held(&rbd_dev->header_rwsem);
1649*4882a593Smuzhiyun 
1650*4882a593Smuzhiyun 	if (rbd_img_is_write(img_req))
1651*4882a593Smuzhiyun 		img_req->snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1652*4882a593Smuzhiyun 	else
1653*4882a593Smuzhiyun 		img_req->snap_id = rbd_dev->spec->snap_id;
1654*4882a593Smuzhiyun 
1655*4882a593Smuzhiyun 	if (rbd_dev_parent_get(rbd_dev))
1656*4882a593Smuzhiyun 		img_request_layered_set(img_req);
1657*4882a593Smuzhiyun }
1658*4882a593Smuzhiyun 
1659*4882a593Smuzhiyun static void rbd_img_request_destroy(struct rbd_img_request *img_request)
1660*4882a593Smuzhiyun {
1661*4882a593Smuzhiyun 	struct rbd_obj_request *obj_request;
1662*4882a593Smuzhiyun 	struct rbd_obj_request *next_obj_request;
1663*4882a593Smuzhiyun 
1664*4882a593Smuzhiyun 	dout("%s: img %p\n", __func__, img_request);
1665*4882a593Smuzhiyun 
1666*4882a593Smuzhiyun 	WARN_ON(!list_empty(&img_request->lock_item));
1667*4882a593Smuzhiyun 	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1668*4882a593Smuzhiyun 		rbd_img_obj_request_del(img_request, obj_request);
1669*4882a593Smuzhiyun 
1670*4882a593Smuzhiyun 	if (img_request_layered_test(img_request))
1671*4882a593Smuzhiyun 		rbd_dev_parent_put(img_request->rbd_dev);
1672*4882a593Smuzhiyun 
1673*4882a593Smuzhiyun 	if (rbd_img_is_write(img_request))
1674*4882a593Smuzhiyun 		ceph_put_snap_context(img_request->snapc);
1675*4882a593Smuzhiyun 
1676*4882a593Smuzhiyun 	if (test_bit(IMG_REQ_CHILD, &img_request->flags))
1677*4882a593Smuzhiyun 		kmem_cache_free(rbd_img_request_cache, img_request);
1678*4882a593Smuzhiyun }
1679*4882a593Smuzhiyun 
1680*4882a593Smuzhiyun #define BITS_PER_OBJ	2
1681*4882a593Smuzhiyun #define OBJS_PER_BYTE	(BITS_PER_BYTE / BITS_PER_OBJ)
1682*4882a593Smuzhiyun #define OBJ_MASK	((1 << BITS_PER_OBJ) - 1)
1683*4882a593Smuzhiyun 
1684*4882a593Smuzhiyun static void __rbd_object_map_index(struct rbd_device *rbd_dev, u64 objno,
1685*4882a593Smuzhiyun 				   u64 *index, u8 *shift)
1686*4882a593Smuzhiyun {
1687*4882a593Smuzhiyun 	u32 off;
1688*4882a593Smuzhiyun 
1689*4882a593Smuzhiyun 	rbd_assert(objno < rbd_dev->object_map_size);
1690*4882a593Smuzhiyun 	*index = div_u64_rem(objno, OBJS_PER_BYTE, &off);
1691*4882a593Smuzhiyun 	*shift = (OBJS_PER_BYTE - off - 1) * BITS_PER_OBJ;
1692*4882a593Smuzhiyun }
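/*
 * Worked example: with BITS_PER_OBJ = 2 there are OBJS_PER_BYTE = 4
 * object states per byte, most significant bit pair first.  For
 * objno 5:
 *
 *	index = 5 / 4 = 1, off = 5 % 4 = 1
 *	shift = (4 - 1 - 1) * BITS_PER_OBJ = 4
 *
 * so object 5's two state bits are bits 5..4 of object_map[1].
 */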
1693*4882a593Smuzhiyun 
1694*4882a593Smuzhiyun static u8 __rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
1695*4882a593Smuzhiyun {
1696*4882a593Smuzhiyun 	u64 index;
1697*4882a593Smuzhiyun 	u8 shift;
1698*4882a593Smuzhiyun 
1699*4882a593Smuzhiyun 	lockdep_assert_held(&rbd_dev->object_map_lock);
1700*4882a593Smuzhiyun 	__rbd_object_map_index(rbd_dev, objno, &index, &shift);
1701*4882a593Smuzhiyun 	return (rbd_dev->object_map[index] >> shift) & OBJ_MASK;
1702*4882a593Smuzhiyun }
1703*4882a593Smuzhiyun 
1704*4882a593Smuzhiyun static void __rbd_object_map_set(struct rbd_device *rbd_dev, u64 objno, u8 val)
1705*4882a593Smuzhiyun {
1706*4882a593Smuzhiyun 	u64 index;
1707*4882a593Smuzhiyun 	u8 shift;
1708*4882a593Smuzhiyun 	u8 *p;
1709*4882a593Smuzhiyun 
1710*4882a593Smuzhiyun 	lockdep_assert_held(&rbd_dev->object_map_lock);
1711*4882a593Smuzhiyun 	rbd_assert(!(val & ~OBJ_MASK));
1712*4882a593Smuzhiyun 
1713*4882a593Smuzhiyun 	__rbd_object_map_index(rbd_dev, objno, &index, &shift);
1714*4882a593Smuzhiyun 	p = &rbd_dev->object_map[index];
1715*4882a593Smuzhiyun 	*p = (*p & ~(OBJ_MASK << shift)) | (val << shift);
1716*4882a593Smuzhiyun }
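/*
 * Minimal sketch of the read-modify-write above: if object_map[index]
 * holds 0b00011011 (objects 0..3 in states 0, 1, 2, 3) and we set the
 * object at shift 2 to state 3, then
 *
 *	*p = (0b00011011 & ~(0b11 << 2)) | (3 << 2) = 0b00011111
 *
 * and a subsequent __rbd_object_map_get() for that objno returns 3.
 */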
1717*4882a593Smuzhiyun 
1718*4882a593Smuzhiyun static u8 rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
1719*4882a593Smuzhiyun {
1720*4882a593Smuzhiyun 	u8 state;
1721*4882a593Smuzhiyun 
1722*4882a593Smuzhiyun 	spin_lock(&rbd_dev->object_map_lock);
1723*4882a593Smuzhiyun 	state = __rbd_object_map_get(rbd_dev, objno);
1724*4882a593Smuzhiyun 	spin_unlock(&rbd_dev->object_map_lock);
1725*4882a593Smuzhiyun 	return state;
1726*4882a593Smuzhiyun }
1727*4882a593Smuzhiyun 
1728*4882a593Smuzhiyun static bool use_object_map(struct rbd_device *rbd_dev)
1729*4882a593Smuzhiyun {
1730*4882a593Smuzhiyun 	/*
1731*4882a593Smuzhiyun 	 * An image mapped read-only can't use the object map -- it isn't
1732*4882a593Smuzhiyun 	 * loaded because the header lock isn't acquired.  Someone else can
1733*4882a593Smuzhiyun 	 * write to the image and update the object map behind our back.
1734*4882a593Smuzhiyun 	 *
1735*4882a593Smuzhiyun 	 * A snapshot can't be written to, so using the object map is always
1736*4882a593Smuzhiyun 	 * safe.
1737*4882a593Smuzhiyun 	 */
1738*4882a593Smuzhiyun 	if (!rbd_is_snap(rbd_dev) && rbd_is_ro(rbd_dev))
1739*4882a593Smuzhiyun 		return false;
1740*4882a593Smuzhiyun 
1741*4882a593Smuzhiyun 	return ((rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) &&
1742*4882a593Smuzhiyun 		!(rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID));
1743*4882a593Smuzhiyun }
1744*4882a593Smuzhiyun 
1745*4882a593Smuzhiyun static bool rbd_object_map_may_exist(struct rbd_device *rbd_dev, u64 objno)
1746*4882a593Smuzhiyun {
1747*4882a593Smuzhiyun 	u8 state;
1748*4882a593Smuzhiyun 
1749*4882a593Smuzhiyun 	/* fall back to default logic if object map is disabled or invalid */
1750*4882a593Smuzhiyun 	if (!use_object_map(rbd_dev))
1751*4882a593Smuzhiyun 		return true;
1752*4882a593Smuzhiyun 
1753*4882a593Smuzhiyun 	state = rbd_object_map_get(rbd_dev, objno);
1754*4882a593Smuzhiyun 	return state != OBJECT_NONEXISTENT;
1755*4882a593Smuzhiyun }
1756*4882a593Smuzhiyun 
1757*4882a593Smuzhiyun static void rbd_object_map_name(struct rbd_device *rbd_dev, u64 snap_id,
1758*4882a593Smuzhiyun 				struct ceph_object_id *oid)
1759*4882a593Smuzhiyun {
1760*4882a593Smuzhiyun 	if (snap_id == CEPH_NOSNAP)
1761*4882a593Smuzhiyun 		ceph_oid_printf(oid, "%s%s", RBD_OBJECT_MAP_PREFIX,
1762*4882a593Smuzhiyun 				rbd_dev->spec->image_id);
1763*4882a593Smuzhiyun 	else
1764*4882a593Smuzhiyun 		ceph_oid_printf(oid, "%s%s.%016llx", RBD_OBJECT_MAP_PREFIX,
1765*4882a593Smuzhiyun 				rbd_dev->spec->image_id, snap_id);
1766*4882a593Smuzhiyun }
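/*
 * Example names (the image id here is hypothetical): assuming
 * RBD_OBJECT_MAP_PREFIX is "rbd_object_map.", an image with
 * spec->image_id "1028b4c5d6e7" uses "rbd_object_map.1028b4c5d6e7"
 * for the HEAD object map and
 * "rbd_object_map.1028b4c5d6e7.0000000000000004" for snap_id 4.
 */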
1767*4882a593Smuzhiyun 
1768*4882a593Smuzhiyun static int rbd_object_map_lock(struct rbd_device *rbd_dev)
1769*4882a593Smuzhiyun {
1770*4882a593Smuzhiyun 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1771*4882a593Smuzhiyun 	CEPH_DEFINE_OID_ONSTACK(oid);
1772*4882a593Smuzhiyun 	u8 lock_type;
1773*4882a593Smuzhiyun 	char *lock_tag;
1774*4882a593Smuzhiyun 	struct ceph_locker *lockers;
1775*4882a593Smuzhiyun 	u32 num_lockers;
1776*4882a593Smuzhiyun 	bool broke_lock = false;
1777*4882a593Smuzhiyun 	int ret;
1778*4882a593Smuzhiyun 
1779*4882a593Smuzhiyun 	rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
1780*4882a593Smuzhiyun 
1781*4882a593Smuzhiyun again:
1782*4882a593Smuzhiyun 	ret = ceph_cls_lock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
1783*4882a593Smuzhiyun 			    CEPH_CLS_LOCK_EXCLUSIVE, "", "", "", 0);
1784*4882a593Smuzhiyun 	if (ret != -EBUSY || broke_lock) {
1785*4882a593Smuzhiyun 		if (ret == -EEXIST)
1786*4882a593Smuzhiyun 			ret = 0; /* already locked by myself */
1787*4882a593Smuzhiyun 		if (ret)
1788*4882a593Smuzhiyun 			rbd_warn(rbd_dev, "failed to lock object map: %d", ret);
1789*4882a593Smuzhiyun 		return ret;
1790*4882a593Smuzhiyun 	}
1791*4882a593Smuzhiyun 
1792*4882a593Smuzhiyun 	ret = ceph_cls_lock_info(osdc, &oid, &rbd_dev->header_oloc,
1793*4882a593Smuzhiyun 				 RBD_LOCK_NAME, &lock_type, &lock_tag,
1794*4882a593Smuzhiyun 				 &lockers, &num_lockers);
1795*4882a593Smuzhiyun 	if (ret) {
1796*4882a593Smuzhiyun 		if (ret == -ENOENT)
1797*4882a593Smuzhiyun 			goto again;
1798*4882a593Smuzhiyun 
1799*4882a593Smuzhiyun 		rbd_warn(rbd_dev, "failed to get object map lockers: %d", ret);
1800*4882a593Smuzhiyun 		return ret;
1801*4882a593Smuzhiyun 	}
1802*4882a593Smuzhiyun 
1803*4882a593Smuzhiyun 	kfree(lock_tag);
1804*4882a593Smuzhiyun 	if (num_lockers == 0)
1805*4882a593Smuzhiyun 		goto again;
1806*4882a593Smuzhiyun 
1807*4882a593Smuzhiyun 	rbd_warn(rbd_dev, "breaking object map lock owned by %s%llu",
1808*4882a593Smuzhiyun 		 ENTITY_NAME(lockers[0].id.name));
1809*4882a593Smuzhiyun 
1810*4882a593Smuzhiyun 	ret = ceph_cls_break_lock(osdc, &oid, &rbd_dev->header_oloc,
1811*4882a593Smuzhiyun 				  RBD_LOCK_NAME, lockers[0].id.cookie,
1812*4882a593Smuzhiyun 				  &lockers[0].id.name);
1813*4882a593Smuzhiyun 	ceph_free_lockers(lockers, num_lockers);
1814*4882a593Smuzhiyun 	if (ret) {
1815*4882a593Smuzhiyun 		if (ret == -ENOENT)
1816*4882a593Smuzhiyun 			goto again;
1817*4882a593Smuzhiyun 
1818*4882a593Smuzhiyun 		rbd_warn(rbd_dev, "failed to break object map lock: %d", ret);
1819*4882a593Smuzhiyun 		return ret;
1820*4882a593Smuzhiyun 	}
1821*4882a593Smuzhiyun 
1822*4882a593Smuzhiyun 	broke_lock = true;
1823*4882a593Smuzhiyun 	goto again;
1824*4882a593Smuzhiyun }
1825*4882a593Smuzhiyun 
1826*4882a593Smuzhiyun static void rbd_object_map_unlock(struct rbd_device *rbd_dev)
1827*4882a593Smuzhiyun {
1828*4882a593Smuzhiyun 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1829*4882a593Smuzhiyun 	CEPH_DEFINE_OID_ONSTACK(oid);
1830*4882a593Smuzhiyun 	int ret;
1831*4882a593Smuzhiyun 
1832*4882a593Smuzhiyun 	rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
1833*4882a593Smuzhiyun 
1834*4882a593Smuzhiyun 	ret = ceph_cls_unlock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
1835*4882a593Smuzhiyun 			      "");
1836*4882a593Smuzhiyun 	if (ret && ret != -ENOENT)
1837*4882a593Smuzhiyun 		rbd_warn(rbd_dev, "failed to unlock object map: %d", ret);
1838*4882a593Smuzhiyun }
1839*4882a593Smuzhiyun 
1840*4882a593Smuzhiyun static int decode_object_map_header(void **p, void *end, u64 *object_map_size)
1841*4882a593Smuzhiyun {
1842*4882a593Smuzhiyun 	u8 struct_v;
1843*4882a593Smuzhiyun 	u32 struct_len;
1844*4882a593Smuzhiyun 	u32 header_len;
1845*4882a593Smuzhiyun 	void *header_end;
1846*4882a593Smuzhiyun 	int ret;
1847*4882a593Smuzhiyun 
1848*4882a593Smuzhiyun 	ceph_decode_32_safe(p, end, header_len, e_inval);
1849*4882a593Smuzhiyun 	header_end = *p + header_len;
1850*4882a593Smuzhiyun 
1851*4882a593Smuzhiyun 	ret = ceph_start_decoding(p, end, 1, "BitVector header", &struct_v,
1852*4882a593Smuzhiyun 				  &struct_len);
1853*4882a593Smuzhiyun 	if (ret)
1854*4882a593Smuzhiyun 		return ret;
1855*4882a593Smuzhiyun 
1856*4882a593Smuzhiyun 	ceph_decode_64_safe(p, end, *object_map_size, e_inval);
1857*4882a593Smuzhiyun 
1858*4882a593Smuzhiyun 	*p = header_end;
1859*4882a593Smuzhiyun 	return 0;
1860*4882a593Smuzhiyun 
1861*4882a593Smuzhiyun e_inval:
1862*4882a593Smuzhiyun 	return -EINVAL;
1863*4882a593Smuzhiyun }
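/*
 * Layout consumed above, as inferred from the decode calls (not an
 * authoritative wire specification):
 *
 *	le32 header_len;		   bytes up to the bit data
 *	struct_v/struct_len preamble;	   ceph_start_decoding()
 *	le64 object_map_size;		   number of objects
 *	...				   skipped by jumping to header_end
 */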
1864*4882a593Smuzhiyun 
1865*4882a593Smuzhiyun static int __rbd_object_map_load(struct rbd_device *rbd_dev)
1866*4882a593Smuzhiyun {
1867*4882a593Smuzhiyun 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1868*4882a593Smuzhiyun 	CEPH_DEFINE_OID_ONSTACK(oid);
1869*4882a593Smuzhiyun 	struct page **pages;
1870*4882a593Smuzhiyun 	void *p, *end;
1871*4882a593Smuzhiyun 	size_t reply_len;
1872*4882a593Smuzhiyun 	u64 num_objects;
1873*4882a593Smuzhiyun 	u64 object_map_bytes;
1874*4882a593Smuzhiyun 	u64 object_map_size;
1875*4882a593Smuzhiyun 	int num_pages;
1876*4882a593Smuzhiyun 	int ret;
1877*4882a593Smuzhiyun 
1878*4882a593Smuzhiyun 	rbd_assert(!rbd_dev->object_map && !rbd_dev->object_map_size);
1879*4882a593Smuzhiyun 
1880*4882a593Smuzhiyun 	num_objects = ceph_get_num_objects(&rbd_dev->layout,
1881*4882a593Smuzhiyun 					   rbd_dev->mapping.size);
1882*4882a593Smuzhiyun 	object_map_bytes = DIV_ROUND_UP_ULL(num_objects * BITS_PER_OBJ,
1883*4882a593Smuzhiyun 					    BITS_PER_BYTE);
1884*4882a593Smuzhiyun 	num_pages = calc_pages_for(0, object_map_bytes) + 1;
1885*4882a593Smuzhiyun 	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
1886*4882a593Smuzhiyun 	if (IS_ERR(pages))
1887*4882a593Smuzhiyun 		return PTR_ERR(pages);
1888*4882a593Smuzhiyun 
1889*4882a593Smuzhiyun 	reply_len = num_pages * PAGE_SIZE;
1890*4882a593Smuzhiyun 	rbd_object_map_name(rbd_dev, rbd_dev->spec->snap_id, &oid);
1891*4882a593Smuzhiyun 	ret = ceph_osdc_call(osdc, &oid, &rbd_dev->header_oloc,
1892*4882a593Smuzhiyun 			     "rbd", "object_map_load", CEPH_OSD_FLAG_READ,
1893*4882a593Smuzhiyun 			     NULL, 0, pages, &reply_len);
1894*4882a593Smuzhiyun 	if (ret)
1895*4882a593Smuzhiyun 		goto out;
1896*4882a593Smuzhiyun 
1897*4882a593Smuzhiyun 	p = page_address(pages[0]);
1898*4882a593Smuzhiyun 	end = p + min(reply_len, (size_t)PAGE_SIZE);
1899*4882a593Smuzhiyun 	ret = decode_object_map_header(&p, end, &object_map_size);
1900*4882a593Smuzhiyun 	if (ret)
1901*4882a593Smuzhiyun 		goto out;
1902*4882a593Smuzhiyun 
1903*4882a593Smuzhiyun 	if (object_map_size != num_objects) {
1904*4882a593Smuzhiyun 		rbd_warn(rbd_dev, "object map size mismatch: %llu vs %llu",
1905*4882a593Smuzhiyun 			 object_map_size, num_objects);
1906*4882a593Smuzhiyun 		ret = -EINVAL;
1907*4882a593Smuzhiyun 		goto out;
1908*4882a593Smuzhiyun 	}
1909*4882a593Smuzhiyun 
1910*4882a593Smuzhiyun 	if (offset_in_page(p) + object_map_bytes > reply_len) {
1911*4882a593Smuzhiyun 		ret = -EINVAL;
1912*4882a593Smuzhiyun 		goto out;
1913*4882a593Smuzhiyun 	}
1914*4882a593Smuzhiyun 
1915*4882a593Smuzhiyun 	rbd_dev->object_map = kvmalloc(object_map_bytes, GFP_KERNEL);
1916*4882a593Smuzhiyun 	if (!rbd_dev->object_map) {
1917*4882a593Smuzhiyun 		ret = -ENOMEM;
1918*4882a593Smuzhiyun 		goto out;
1919*4882a593Smuzhiyun 	}
1920*4882a593Smuzhiyun 
1921*4882a593Smuzhiyun 	rbd_dev->object_map_size = object_map_size;
1922*4882a593Smuzhiyun 	ceph_copy_from_page_vector(pages, rbd_dev->object_map,
1923*4882a593Smuzhiyun 				   offset_in_page(p), object_map_bytes);
1924*4882a593Smuzhiyun 
1925*4882a593Smuzhiyun out:
1926*4882a593Smuzhiyun 	ceph_release_page_vector(pages, num_pages);
1927*4882a593Smuzhiyun 	return ret;
1928*4882a593Smuzhiyun }
1929*4882a593Smuzhiyun 
1930*4882a593Smuzhiyun static void rbd_object_map_free(struct rbd_device *rbd_dev)
1931*4882a593Smuzhiyun {
1932*4882a593Smuzhiyun 	kvfree(rbd_dev->object_map);
1933*4882a593Smuzhiyun 	rbd_dev->object_map = NULL;
1934*4882a593Smuzhiyun 	rbd_dev->object_map_size = 0;
1935*4882a593Smuzhiyun }
1936*4882a593Smuzhiyun 
1937*4882a593Smuzhiyun static int rbd_object_map_load(struct rbd_device *rbd_dev)
1938*4882a593Smuzhiyun {
1939*4882a593Smuzhiyun 	int ret;
1940*4882a593Smuzhiyun 
1941*4882a593Smuzhiyun 	ret = __rbd_object_map_load(rbd_dev);
1942*4882a593Smuzhiyun 	if (ret)
1943*4882a593Smuzhiyun 		return ret;
1944*4882a593Smuzhiyun 
1945*4882a593Smuzhiyun 	ret = rbd_dev_v2_get_flags(rbd_dev);
1946*4882a593Smuzhiyun 	if (ret) {
1947*4882a593Smuzhiyun 		rbd_object_map_free(rbd_dev);
1948*4882a593Smuzhiyun 		return ret;
1949*4882a593Smuzhiyun 	}
1950*4882a593Smuzhiyun 
1951*4882a593Smuzhiyun 	if (rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID)
1952*4882a593Smuzhiyun 		rbd_warn(rbd_dev, "object map is invalid");
1953*4882a593Smuzhiyun 
1954*4882a593Smuzhiyun 	return 0;
1955*4882a593Smuzhiyun }
1956*4882a593Smuzhiyun 
1957*4882a593Smuzhiyun static int rbd_object_map_open(struct rbd_device *rbd_dev)
1958*4882a593Smuzhiyun {
1959*4882a593Smuzhiyun 	int ret;
1960*4882a593Smuzhiyun 
1961*4882a593Smuzhiyun 	ret = rbd_object_map_lock(rbd_dev);
1962*4882a593Smuzhiyun 	if (ret)
1963*4882a593Smuzhiyun 		return ret;
1964*4882a593Smuzhiyun 
1965*4882a593Smuzhiyun 	ret = rbd_object_map_load(rbd_dev);
1966*4882a593Smuzhiyun 	if (ret) {
1967*4882a593Smuzhiyun 		rbd_object_map_unlock(rbd_dev);
1968*4882a593Smuzhiyun 		return ret;
1969*4882a593Smuzhiyun 	}
1970*4882a593Smuzhiyun 
1971*4882a593Smuzhiyun 	return 0;
1972*4882a593Smuzhiyun }
1973*4882a593Smuzhiyun 
1974*4882a593Smuzhiyun static void rbd_object_map_close(struct rbd_device *rbd_dev)
1975*4882a593Smuzhiyun {
1976*4882a593Smuzhiyun 	rbd_object_map_free(rbd_dev);
1977*4882a593Smuzhiyun 	rbd_object_map_unlock(rbd_dev);
1978*4882a593Smuzhiyun }
1979*4882a593Smuzhiyun 
1980*4882a593Smuzhiyun /*
1981*4882a593Smuzhiyun  * This function needs snap_id (or more precisely just something to
1982*4882a593Smuzhiyun  * distinguish between HEAD and snapshot object maps), new_state and
1983*4882a593Smuzhiyun  * current_state that were passed to rbd_object_map_update().
1984*4882a593Smuzhiyun  *
1985*4882a593Smuzhiyun  * To avoid allocating and stashing a context we piggyback on the OSD
1986*4882a593Smuzhiyun  * request.  A HEAD update has two ops (assert_locked).  For new_state
1987*4882a593Smuzhiyun  * and current_state we decode our own object_map_update op, encoded in
1988*4882a593Smuzhiyun  * rbd_cls_object_map_update().
1989*4882a593Smuzhiyun  */
1990*4882a593Smuzhiyun static int rbd_object_map_update_finish(struct rbd_obj_request *obj_req,
1991*4882a593Smuzhiyun 					struct ceph_osd_request *osd_req)
1992*4882a593Smuzhiyun {
1993*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1994*4882a593Smuzhiyun 	struct ceph_osd_data *osd_data;
1995*4882a593Smuzhiyun 	u64 objno;
1996*4882a593Smuzhiyun 	u8 state, new_state, current_state;
1997*4882a593Smuzhiyun 	bool has_current_state;
1998*4882a593Smuzhiyun 	void *p;
1999*4882a593Smuzhiyun 
2000*4882a593Smuzhiyun 	if (osd_req->r_result)
2001*4882a593Smuzhiyun 		return osd_req->r_result;
2002*4882a593Smuzhiyun 
2003*4882a593Smuzhiyun 	/*
2004*4882a593Smuzhiyun 	 * Nothing to do for a snapshot object map.
2005*4882a593Smuzhiyun 	 */
2006*4882a593Smuzhiyun 	if (osd_req->r_num_ops == 1)
2007*4882a593Smuzhiyun 		return 0;
2008*4882a593Smuzhiyun 
2009*4882a593Smuzhiyun 	/*
2010*4882a593Smuzhiyun 	 * Update in-memory HEAD object map.
2011*4882a593Smuzhiyun 	 */
2012*4882a593Smuzhiyun 	rbd_assert(osd_req->r_num_ops == 2);
2013*4882a593Smuzhiyun 	osd_data = osd_req_op_data(osd_req, 1, cls, request_data);
2014*4882a593Smuzhiyun 	rbd_assert(osd_data->type == CEPH_OSD_DATA_TYPE_PAGES);
2015*4882a593Smuzhiyun 
2016*4882a593Smuzhiyun 	p = page_address(osd_data->pages[0]);
2017*4882a593Smuzhiyun 	objno = ceph_decode_64(&p);
2018*4882a593Smuzhiyun 	rbd_assert(objno == obj_req->ex.oe_objno);
2019*4882a593Smuzhiyun 	rbd_assert(ceph_decode_64(&p) == objno + 1);
2020*4882a593Smuzhiyun 	new_state = ceph_decode_8(&p);
2021*4882a593Smuzhiyun 	has_current_state = ceph_decode_8(&p);
2022*4882a593Smuzhiyun 	if (has_current_state)
2023*4882a593Smuzhiyun 		current_state = ceph_decode_8(&p);
2024*4882a593Smuzhiyun 
2025*4882a593Smuzhiyun 	spin_lock(&rbd_dev->object_map_lock);
2026*4882a593Smuzhiyun 	state = __rbd_object_map_get(rbd_dev, objno);
2027*4882a593Smuzhiyun 	if (!has_current_state || current_state == state ||
2028*4882a593Smuzhiyun 	    (current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN))
2029*4882a593Smuzhiyun 		__rbd_object_map_set(rbd_dev, objno, new_state);
2030*4882a593Smuzhiyun 	spin_unlock(&rbd_dev->object_map_lock);
2031*4882a593Smuzhiyun 
2032*4882a593Smuzhiyun 	return 0;
2033*4882a593Smuzhiyun }
2034*4882a593Smuzhiyun 
2035*4882a593Smuzhiyun static void rbd_object_map_callback(struct ceph_osd_request *osd_req)
2036*4882a593Smuzhiyun {
2037*4882a593Smuzhiyun 	struct rbd_obj_request *obj_req = osd_req->r_priv;
2038*4882a593Smuzhiyun 	int result;
2039*4882a593Smuzhiyun 
2040*4882a593Smuzhiyun 	dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
2041*4882a593Smuzhiyun 	     osd_req->r_result, obj_req);
2042*4882a593Smuzhiyun 
2043*4882a593Smuzhiyun 	result = rbd_object_map_update_finish(obj_req, osd_req);
2044*4882a593Smuzhiyun 	rbd_obj_handle_request(obj_req, result);
2045*4882a593Smuzhiyun }
2046*4882a593Smuzhiyun 
2047*4882a593Smuzhiyun static bool update_needed(struct rbd_device *rbd_dev, u64 objno, u8 new_state)
2048*4882a593Smuzhiyun {
2049*4882a593Smuzhiyun 	u8 state = rbd_object_map_get(rbd_dev, objno);
2050*4882a593Smuzhiyun 
2051*4882a593Smuzhiyun 	if (state == new_state ||
2052*4882a593Smuzhiyun 	    (new_state == OBJECT_PENDING && state == OBJECT_NONEXISTENT) ||
2053*4882a593Smuzhiyun 	    (new_state == OBJECT_NONEXISTENT && state != OBJECT_PENDING))
2054*4882a593Smuzhiyun 		return false;
2055*4882a593Smuzhiyun 
2056*4882a593Smuzhiyun 	return true;
2057*4882a593Smuzhiyun }
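/*
 * Illustrations of the short-circuits above: re-marking an object
 * with its current state is a no-op; marking a nonexistent object
 * OBJECT_PENDING is unneeded because there is nothing to clean up;
 * and only OBJECT_PENDING objects may move to OBJECT_NONEXISTENT,
 * e.g. once a whole-object discard has completed.
 */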
2058*4882a593Smuzhiyun 
2059*4882a593Smuzhiyun static int rbd_cls_object_map_update(struct ceph_osd_request *req,
2060*4882a593Smuzhiyun 				     int which, u64 objno, u8 new_state,
2061*4882a593Smuzhiyun 				     const u8 *current_state)
2062*4882a593Smuzhiyun {
2063*4882a593Smuzhiyun 	struct page **pages;
2064*4882a593Smuzhiyun 	void *p, *start;
2065*4882a593Smuzhiyun 	int ret;
2066*4882a593Smuzhiyun 
2067*4882a593Smuzhiyun 	ret = osd_req_op_cls_init(req, which, "rbd", "object_map_update");
2068*4882a593Smuzhiyun 	if (ret)
2069*4882a593Smuzhiyun 		return ret;
2070*4882a593Smuzhiyun 
2071*4882a593Smuzhiyun 	pages = ceph_alloc_page_vector(1, GFP_NOIO);
2072*4882a593Smuzhiyun 	if (IS_ERR(pages))
2073*4882a593Smuzhiyun 		return PTR_ERR(pages);
2074*4882a593Smuzhiyun 
2075*4882a593Smuzhiyun 	p = start = page_address(pages[0]);
2076*4882a593Smuzhiyun 	ceph_encode_64(&p, objno);
2077*4882a593Smuzhiyun 	ceph_encode_64(&p, objno + 1);
2078*4882a593Smuzhiyun 	ceph_encode_8(&p, new_state);
2079*4882a593Smuzhiyun 	if (current_state) {
2080*4882a593Smuzhiyun 		ceph_encode_8(&p, 1);
2081*4882a593Smuzhiyun 		ceph_encode_8(&p, *current_state);
2082*4882a593Smuzhiyun 	} else {
2083*4882a593Smuzhiyun 		ceph_encode_8(&p, 0);
2084*4882a593Smuzhiyun 	}
2085*4882a593Smuzhiyun 
2086*4882a593Smuzhiyun 	osd_req_op_cls_request_data_pages(req, which, pages, p - start, 0,
2087*4882a593Smuzhiyun 					  false, true);
2088*4882a593Smuzhiyun 	return 0;
2089*4882a593Smuzhiyun }
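/*
 * The single request_data page above thus carries (mirroring what
 * rbd_object_map_update_finish() decodes back out of the request):
 *
 *	le64 start_objno;	   objno
 *	le64 end_objno;		   objno + 1, i.e. a one-object range
 *	u8   new_state;
 *	u8   has_current_state;	   0 or 1
 *	u8   current_state;	   present only if has_current_state
 */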
2090*4882a593Smuzhiyun 
2091*4882a593Smuzhiyun /*
2092*4882a593Smuzhiyun  * Return:
2093*4882a593Smuzhiyun  *   0 - object map update sent
2094*4882a593Smuzhiyun  *   1 - object map update isn't needed
2095*4882a593Smuzhiyun  *  <0 - error
2096*4882a593Smuzhiyun  */
2097*4882a593Smuzhiyun static int rbd_object_map_update(struct rbd_obj_request *obj_req, u64 snap_id,
2098*4882a593Smuzhiyun 				 u8 new_state, const u8 *current_state)
2099*4882a593Smuzhiyun {
2100*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2101*4882a593Smuzhiyun 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2102*4882a593Smuzhiyun 	struct ceph_osd_request *req;
2103*4882a593Smuzhiyun 	int num_ops = 1;
2104*4882a593Smuzhiyun 	int which = 0;
2105*4882a593Smuzhiyun 	int ret;
2106*4882a593Smuzhiyun 
2107*4882a593Smuzhiyun 	if (snap_id == CEPH_NOSNAP) {
2108*4882a593Smuzhiyun 		if (!update_needed(rbd_dev, obj_req->ex.oe_objno, new_state))
2109*4882a593Smuzhiyun 			return 1;
2110*4882a593Smuzhiyun 
2111*4882a593Smuzhiyun 		num_ops++; /* assert_locked */
2112*4882a593Smuzhiyun 	}
2113*4882a593Smuzhiyun 
2114*4882a593Smuzhiyun 	req = ceph_osdc_alloc_request(osdc, NULL, num_ops, false, GFP_NOIO);
2115*4882a593Smuzhiyun 	if (!req)
2116*4882a593Smuzhiyun 		return -ENOMEM;
2117*4882a593Smuzhiyun 
2118*4882a593Smuzhiyun 	list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
2119*4882a593Smuzhiyun 	req->r_callback = rbd_object_map_callback;
2120*4882a593Smuzhiyun 	req->r_priv = obj_req;
2121*4882a593Smuzhiyun 
2122*4882a593Smuzhiyun 	rbd_object_map_name(rbd_dev, snap_id, &req->r_base_oid);
2123*4882a593Smuzhiyun 	ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
2124*4882a593Smuzhiyun 	req->r_flags = CEPH_OSD_FLAG_WRITE;
2125*4882a593Smuzhiyun 	ktime_get_real_ts64(&req->r_mtime);
2126*4882a593Smuzhiyun 
2127*4882a593Smuzhiyun 	if (snap_id == CEPH_NOSNAP) {
2128*4882a593Smuzhiyun 		/*
2129*4882a593Smuzhiyun 		 * Protect against possible race conditions during lock
2130*4882a593Smuzhiyun 		 * ownership transitions.
2131*4882a593Smuzhiyun 		 */
2132*4882a593Smuzhiyun 		ret = ceph_cls_assert_locked(req, which++, RBD_LOCK_NAME,
2133*4882a593Smuzhiyun 					     CEPH_CLS_LOCK_EXCLUSIVE, "", "");
2134*4882a593Smuzhiyun 		if (ret)
2135*4882a593Smuzhiyun 			return ret;
2136*4882a593Smuzhiyun 	}
2137*4882a593Smuzhiyun 
2138*4882a593Smuzhiyun 	ret = rbd_cls_object_map_update(req, which, obj_req->ex.oe_objno,
2139*4882a593Smuzhiyun 					new_state, current_state);
2140*4882a593Smuzhiyun 	if (ret)
2141*4882a593Smuzhiyun 		return ret;
2142*4882a593Smuzhiyun 
2143*4882a593Smuzhiyun 	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
2144*4882a593Smuzhiyun 	if (ret)
2145*4882a593Smuzhiyun 		return ret;
2146*4882a593Smuzhiyun 
2147*4882a593Smuzhiyun 	ceph_osdc_start_request(osdc, req, false);
2148*4882a593Smuzhiyun 	return 0;
2149*4882a593Smuzhiyun }
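/*
 * Hedged caller sketch (illustrative only, not the driver's actual
 * state machine), following the return convention documented above:
 *
 *	ret = rbd_object_map_update(obj_req, CEPH_NOSNAP,
 *				    OBJECT_EXISTS, NULL);
 *	if (ret < 0)
 *		return ret;	   error
 *	else if (ret > 0)
 *		...		   no update needed, proceed immediately
 *	else
 *		...		   wait for rbd_object_map_callback()
 */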
2150*4882a593Smuzhiyun 
2151*4882a593Smuzhiyun static void prune_extents(struct ceph_file_extent *img_extents,
2152*4882a593Smuzhiyun 			  u32 *num_img_extents, u64 overlap)
2153*4882a593Smuzhiyun {
2154*4882a593Smuzhiyun 	u32 cnt = *num_img_extents;
2155*4882a593Smuzhiyun 
2156*4882a593Smuzhiyun 	/* drop extents completely beyond the overlap */
2157*4882a593Smuzhiyun 	while (cnt && img_extents[cnt - 1].fe_off >= overlap)
2158*4882a593Smuzhiyun 		cnt--;
2159*4882a593Smuzhiyun 
2160*4882a593Smuzhiyun 	if (cnt) {
2161*4882a593Smuzhiyun 		struct ceph_file_extent *ex = &img_extents[cnt - 1];
2162*4882a593Smuzhiyun 
2163*4882a593Smuzhiyun 		/* trim final overlapping extent */
2164*4882a593Smuzhiyun 		if (ex->fe_off + ex->fe_len > overlap)
2165*4882a593Smuzhiyun 			ex->fe_len = overlap - ex->fe_off;
2166*4882a593Smuzhiyun 	}
2167*4882a593Smuzhiyun 
2168*4882a593Smuzhiyun 	*num_img_extents = cnt;
2169*4882a593Smuzhiyun }
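/*
 * Worked example (illustrative): with overlap = 7M and img_extents =
 * { {0, 4M}, {6M, 2M}, {8M, 1M} }, the {8M, 1M} extent starts at or
 * beyond the overlap and is dropped, {6M, 2M} is trimmed to {6M, 1M},
 * {0, 4M} is untouched, and *num_img_extents ends up as 2.
 */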
2170*4882a593Smuzhiyun 
2171*4882a593Smuzhiyun /*
2172*4882a593Smuzhiyun  * Determine the byte range(s) covered by either just the object extent
2173*4882a593Smuzhiyun  * or the entire object in the parent image.
2174*4882a593Smuzhiyun  */
2175*4882a593Smuzhiyun static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req,
2176*4882a593Smuzhiyun 				    bool entire)
2177*4882a593Smuzhiyun {
2178*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2179*4882a593Smuzhiyun 	int ret;
2180*4882a593Smuzhiyun 
2181*4882a593Smuzhiyun 	if (!rbd_dev->parent_overlap)
2182*4882a593Smuzhiyun 		return 0;
2183*4882a593Smuzhiyun 
2184*4882a593Smuzhiyun 	ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno,
2185*4882a593Smuzhiyun 				  entire ? 0 : obj_req->ex.oe_off,
2186*4882a593Smuzhiyun 				  entire ? rbd_dev->layout.object_size :
2187*4882a593Smuzhiyun 							obj_req->ex.oe_len,
2188*4882a593Smuzhiyun 				  &obj_req->img_extents,
2189*4882a593Smuzhiyun 				  &obj_req->num_img_extents);
2190*4882a593Smuzhiyun 	if (ret)
2191*4882a593Smuzhiyun 		return ret;
2192*4882a593Smuzhiyun 
2193*4882a593Smuzhiyun 	prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
2194*4882a593Smuzhiyun 		      rbd_dev->parent_overlap);
2195*4882a593Smuzhiyun 	return 0;
2196*4882a593Smuzhiyun }
2197*4882a593Smuzhiyun 
2198*4882a593Smuzhiyun static void rbd_osd_setup_data(struct ceph_osd_request *osd_req, int which)
2199*4882a593Smuzhiyun {
2200*4882a593Smuzhiyun 	struct rbd_obj_request *obj_req = osd_req->r_priv;
2201*4882a593Smuzhiyun 
2202*4882a593Smuzhiyun 	switch (obj_req->img_request->data_type) {
2203*4882a593Smuzhiyun 	case OBJ_REQUEST_BIO:
2204*4882a593Smuzhiyun 		osd_req_op_extent_osd_data_bio(osd_req, which,
2205*4882a593Smuzhiyun 					       &obj_req->bio_pos,
2206*4882a593Smuzhiyun 					       obj_req->ex.oe_len);
2207*4882a593Smuzhiyun 		break;
2208*4882a593Smuzhiyun 	case OBJ_REQUEST_BVECS:
2209*4882a593Smuzhiyun 	case OBJ_REQUEST_OWN_BVECS:
2210*4882a593Smuzhiyun 		rbd_assert(obj_req->bvec_pos.iter.bi_size ==
2211*4882a593Smuzhiyun 							obj_req->ex.oe_len);
2212*4882a593Smuzhiyun 		rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
2213*4882a593Smuzhiyun 		osd_req_op_extent_osd_data_bvec_pos(osd_req, which,
2214*4882a593Smuzhiyun 						    &obj_req->bvec_pos);
2215*4882a593Smuzhiyun 		break;
2216*4882a593Smuzhiyun 	default:
2217*4882a593Smuzhiyun 		BUG();
2218*4882a593Smuzhiyun 	}
2219*4882a593Smuzhiyun }
2220*4882a593Smuzhiyun 
2221*4882a593Smuzhiyun static int rbd_osd_setup_stat(struct ceph_osd_request *osd_req, int which)
2222*4882a593Smuzhiyun {
2223*4882a593Smuzhiyun 	struct page **pages;
2224*4882a593Smuzhiyun 
2225*4882a593Smuzhiyun 	/*
2226*4882a593Smuzhiyun 	 * The response data for a STAT call consists of:
2227*4882a593Smuzhiyun 	 *     le64 length;
2228*4882a593Smuzhiyun 	 *     struct {
2229*4882a593Smuzhiyun 	 *         le32 tv_sec;
2230*4882a593Smuzhiyun 	 *         le32 tv_nsec;
2231*4882a593Smuzhiyun 	 *     } mtime;
2232*4882a593Smuzhiyun 	 */
2233*4882a593Smuzhiyun 	pages = ceph_alloc_page_vector(1, GFP_NOIO);
2234*4882a593Smuzhiyun 	if (IS_ERR(pages))
2235*4882a593Smuzhiyun 		return PTR_ERR(pages);
2236*4882a593Smuzhiyun 
2237*4882a593Smuzhiyun 	osd_req_op_init(osd_req, which, CEPH_OSD_OP_STAT, 0);
2238*4882a593Smuzhiyun 	osd_req_op_raw_data_in_pages(osd_req, which, pages,
2239*4882a593Smuzhiyun 				     8 + sizeof(struct ceph_timespec),
2240*4882a593Smuzhiyun 				     0, false, true);
2241*4882a593Smuzhiyun 	return 0;
2242*4882a593Smuzhiyun }
2243*4882a593Smuzhiyun 
2244*4882a593Smuzhiyun static int rbd_osd_setup_copyup(struct ceph_osd_request *osd_req, int which,
2245*4882a593Smuzhiyun 				u32 bytes)
2246*4882a593Smuzhiyun {
2247*4882a593Smuzhiyun 	struct rbd_obj_request *obj_req = osd_req->r_priv;
2248*4882a593Smuzhiyun 	int ret;
2249*4882a593Smuzhiyun 
2250*4882a593Smuzhiyun 	ret = osd_req_op_cls_init(osd_req, which, "rbd", "copyup");
2251*4882a593Smuzhiyun 	if (ret)
2252*4882a593Smuzhiyun 		return ret;
2253*4882a593Smuzhiyun 
2254*4882a593Smuzhiyun 	osd_req_op_cls_request_data_bvecs(osd_req, which, obj_req->copyup_bvecs,
2255*4882a593Smuzhiyun 					  obj_req->copyup_bvec_count, bytes);
2256*4882a593Smuzhiyun 	return 0;
2257*4882a593Smuzhiyun }
2258*4882a593Smuzhiyun 
2259*4882a593Smuzhiyun static int rbd_obj_init_read(struct rbd_obj_request *obj_req)
2260*4882a593Smuzhiyun {
2261*4882a593Smuzhiyun 	obj_req->read_state = RBD_OBJ_READ_START;
2262*4882a593Smuzhiyun 	return 0;
2263*4882a593Smuzhiyun }
2264*4882a593Smuzhiyun 
2265*4882a593Smuzhiyun static void __rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
2266*4882a593Smuzhiyun 				      int which)
2267*4882a593Smuzhiyun {
2268*4882a593Smuzhiyun 	struct rbd_obj_request *obj_req = osd_req->r_priv;
2269*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2270*4882a593Smuzhiyun 	u16 opcode;
2271*4882a593Smuzhiyun 
2272*4882a593Smuzhiyun 	if (!use_object_map(rbd_dev) ||
2273*4882a593Smuzhiyun 	    !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST)) {
2274*4882a593Smuzhiyun 		osd_req_op_alloc_hint_init(osd_req, which++,
2275*4882a593Smuzhiyun 					   rbd_dev->layout.object_size,
2276*4882a593Smuzhiyun 					   rbd_dev->layout.object_size,
2277*4882a593Smuzhiyun 					   rbd_dev->opts->alloc_hint_flags);
2278*4882a593Smuzhiyun 	}
2279*4882a593Smuzhiyun 
2280*4882a593Smuzhiyun 	if (rbd_obj_is_entire(obj_req))
2281*4882a593Smuzhiyun 		opcode = CEPH_OSD_OP_WRITEFULL;
2282*4882a593Smuzhiyun 	else
2283*4882a593Smuzhiyun 		opcode = CEPH_OSD_OP_WRITE;
2284*4882a593Smuzhiyun 
2285*4882a593Smuzhiyun 	osd_req_op_extent_init(osd_req, which, opcode,
2286*4882a593Smuzhiyun 			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
2287*4882a593Smuzhiyun 	rbd_osd_setup_data(osd_req, which);
2288*4882a593Smuzhiyun }
2289*4882a593Smuzhiyun 
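/*
 * Example op vectors produced above (no object map in use):
 *
 *     [SETALLOCHINT, WRITEFULL]  - write covers the entire object
 *     [SETALLOCHINT, WRITE]      - partial-object write
 *
 * With an object map and RBD_OBJ_FLAG_MAY_EXIST set the allocation
 * hint is omitted, leaving a single WRITE/WRITEFULL op.
 */
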
2290*4882a593Smuzhiyun static int rbd_obj_init_write(struct rbd_obj_request *obj_req)
2291*4882a593Smuzhiyun {
2292*4882a593Smuzhiyun 	int ret;
2293*4882a593Smuzhiyun 
2294*4882a593Smuzhiyun 	/* reverse map the entire object onto the parent */
2295*4882a593Smuzhiyun 	ret = rbd_obj_calc_img_extents(obj_req, true);
2296*4882a593Smuzhiyun 	if (ret)
2297*4882a593Smuzhiyun 		return ret;
2298*4882a593Smuzhiyun 
2299*4882a593Smuzhiyun 	if (rbd_obj_copyup_enabled(obj_req))
2300*4882a593Smuzhiyun 		obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
2301*4882a593Smuzhiyun 
2302*4882a593Smuzhiyun 	obj_req->write_state = RBD_OBJ_WRITE_START;
2303*4882a593Smuzhiyun 	return 0;
2304*4882a593Smuzhiyun }
2305*4882a593Smuzhiyun 
2306*4882a593Smuzhiyun static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req)
2307*4882a593Smuzhiyun {
2308*4882a593Smuzhiyun 	return rbd_obj_is_tail(obj_req) ? CEPH_OSD_OP_TRUNCATE :
2309*4882a593Smuzhiyun 					  CEPH_OSD_OP_ZERO;
2310*4882a593Smuzhiyun }
2311*4882a593Smuzhiyun 
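/*
 * For example, with 4M objects a discard of 1M~3M (offset~length)
 * reaches the end of the object and becomes TRUNCATE at 1M, while a
 * discard of 1M~1M is an interior range and becomes ZERO.
 */
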
2312*4882a593Smuzhiyun static void __rbd_osd_setup_discard_ops(struct ceph_osd_request *osd_req,
2313*4882a593Smuzhiyun 					int which)
2314*4882a593Smuzhiyun {
2315*4882a593Smuzhiyun 	struct rbd_obj_request *obj_req = osd_req->r_priv;
2316*4882a593Smuzhiyun 
2317*4882a593Smuzhiyun 	if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) {
2318*4882a593Smuzhiyun 		rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
2319*4882a593Smuzhiyun 		osd_req_op_init(osd_req, which, CEPH_OSD_OP_DELETE, 0);
2320*4882a593Smuzhiyun 	} else {
2321*4882a593Smuzhiyun 		osd_req_op_extent_init(osd_req, which,
2322*4882a593Smuzhiyun 				       truncate_or_zero_opcode(obj_req),
2323*4882a593Smuzhiyun 				       obj_req->ex.oe_off, obj_req->ex.oe_len,
2324*4882a593Smuzhiyun 				       0, 0);
2325*4882a593Smuzhiyun 	}
2326*4882a593Smuzhiyun }
2327*4882a593Smuzhiyun 
2328*4882a593Smuzhiyun static int rbd_obj_init_discard(struct rbd_obj_request *obj_req)
2329*4882a593Smuzhiyun {
2330*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2331*4882a593Smuzhiyun 	u64 off, next_off;
2332*4882a593Smuzhiyun 	int ret;
2333*4882a593Smuzhiyun 
2334*4882a593Smuzhiyun 	/*
2335*4882a593Smuzhiyun 	 * Align the range to alloc_size boundary and punt on discards
2336*4882a593Smuzhiyun 	 * that are too small to free up any space.
2337*4882a593Smuzhiyun 	 *
2338*4882a593Smuzhiyun 	 * alloc_size == object_size && is_tail() is a special case for
2339*4882a593Smuzhiyun 	 * filestore with filestore_punch_hole = false, needed to allow
2340*4882a593Smuzhiyun 	 * truncate (in addition to delete).
2341*4882a593Smuzhiyun 	 */
2342*4882a593Smuzhiyun 	if (rbd_dev->opts->alloc_size != rbd_dev->layout.object_size ||
2343*4882a593Smuzhiyun 	    !rbd_obj_is_tail(obj_req)) {
2344*4882a593Smuzhiyun 		off = round_up(obj_req->ex.oe_off, rbd_dev->opts->alloc_size);
2345*4882a593Smuzhiyun 		next_off = round_down(obj_req->ex.oe_off + obj_req->ex.oe_len,
2346*4882a593Smuzhiyun 				      rbd_dev->opts->alloc_size);
2347*4882a593Smuzhiyun 		if (off >= next_off)
2348*4882a593Smuzhiyun 			return 1;
2349*4882a593Smuzhiyun 
2350*4882a593Smuzhiyun 		dout("%s %p %llu~%llu -> %llu~%llu\n", __func__,
2351*4882a593Smuzhiyun 		     obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len,
2352*4882a593Smuzhiyun 		     off, next_off - off);
2353*4882a593Smuzhiyun 		obj_req->ex.oe_off = off;
2354*4882a593Smuzhiyun 		obj_req->ex.oe_len = next_off - off;
2355*4882a593Smuzhiyun 	}
2356*4882a593Smuzhiyun 
2357*4882a593Smuzhiyun 	/* reverse map the entire object onto the parent */
2358*4882a593Smuzhiyun 	ret = rbd_obj_calc_img_extents(obj_req, true);
2359*4882a593Smuzhiyun 	if (ret)
2360*4882a593Smuzhiyun 		return ret;
2361*4882a593Smuzhiyun 
2362*4882a593Smuzhiyun 	obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
2363*4882a593Smuzhiyun 	if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents)
2364*4882a593Smuzhiyun 		obj_req->flags |= RBD_OBJ_FLAG_DELETION;
2365*4882a593Smuzhiyun 
2366*4882a593Smuzhiyun 	obj_req->write_state = RBD_OBJ_WRITE_START;
2367*4882a593Smuzhiyun 	return 0;
2368*4882a593Smuzhiyun }
2369*4882a593Smuzhiyun 
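/*
 * Worked example for the alignment above, with alloc_size = 64K: a
 * discard of 10K~200K has its offset rounded up to 64K and its end
 * (210K) rounded down to 192K, so it is shrunk to 64K~128K.  A discard
 * of 10K~50K rounds to an empty range (off 64K >= next_off 0) and
 * returns 1, so the object request is pruned in
 * __rbd_img_fill_request().
 */
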
2370*4882a593Smuzhiyun static void __rbd_osd_setup_zeroout_ops(struct ceph_osd_request *osd_req,
2371*4882a593Smuzhiyun 					int which)
2372*4882a593Smuzhiyun {
2373*4882a593Smuzhiyun 	struct rbd_obj_request *obj_req = osd_req->r_priv;
2374*4882a593Smuzhiyun 	u16 opcode;
2375*4882a593Smuzhiyun 
2376*4882a593Smuzhiyun 	if (rbd_obj_is_entire(obj_req)) {
2377*4882a593Smuzhiyun 		if (obj_req->num_img_extents) {
2378*4882a593Smuzhiyun 			if (!(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
2379*4882a593Smuzhiyun 				osd_req_op_init(osd_req, which++,
2380*4882a593Smuzhiyun 						CEPH_OSD_OP_CREATE, 0);
2381*4882a593Smuzhiyun 			opcode = CEPH_OSD_OP_TRUNCATE;
2382*4882a593Smuzhiyun 		} else {
2383*4882a593Smuzhiyun 			rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
2384*4882a593Smuzhiyun 			osd_req_op_init(osd_req, which++,
2385*4882a593Smuzhiyun 					CEPH_OSD_OP_DELETE, 0);
2386*4882a593Smuzhiyun 			opcode = 0;
2387*4882a593Smuzhiyun 		}
2388*4882a593Smuzhiyun 	} else {
2389*4882a593Smuzhiyun 		opcode = truncate_or_zero_opcode(obj_req);
2390*4882a593Smuzhiyun 	}
2391*4882a593Smuzhiyun 
2392*4882a593Smuzhiyun 	if (opcode)
2393*4882a593Smuzhiyun 		osd_req_op_extent_init(osd_req, which, opcode,
2394*4882a593Smuzhiyun 				       obj_req->ex.oe_off, obj_req->ex.oe_len,
2395*4882a593Smuzhiyun 				       0, 0);
2396*4882a593Smuzhiyun }
2397*4882a593Smuzhiyun 
2398*4882a593Smuzhiyun static int rbd_obj_init_zeroout(struct rbd_obj_request *obj_req)
2399*4882a593Smuzhiyun {
2400*4882a593Smuzhiyun 	int ret;
2401*4882a593Smuzhiyun 
2402*4882a593Smuzhiyun 	/* reverse map the entire object onto the parent */
2403*4882a593Smuzhiyun 	ret = rbd_obj_calc_img_extents(obj_req, true);
2404*4882a593Smuzhiyun 	if (ret)
2405*4882a593Smuzhiyun 		return ret;
2406*4882a593Smuzhiyun 
2407*4882a593Smuzhiyun 	if (rbd_obj_copyup_enabled(obj_req))
2408*4882a593Smuzhiyun 		obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
2409*4882a593Smuzhiyun 	if (!obj_req->num_img_extents) {
2410*4882a593Smuzhiyun 		obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
2411*4882a593Smuzhiyun 		if (rbd_obj_is_entire(obj_req))
2412*4882a593Smuzhiyun 			obj_req->flags |= RBD_OBJ_FLAG_DELETION;
2413*4882a593Smuzhiyun 	}
2414*4882a593Smuzhiyun 
2415*4882a593Smuzhiyun 	obj_req->write_state = RBD_OBJ_WRITE_START;
2416*4882a593Smuzhiyun 	return 0;
2417*4882a593Smuzhiyun }
2418*4882a593Smuzhiyun 
2419*4882a593Smuzhiyun static int count_write_ops(struct rbd_obj_request *obj_req)
2420*4882a593Smuzhiyun {
2421*4882a593Smuzhiyun 	struct rbd_img_request *img_req = obj_req->img_request;
2422*4882a593Smuzhiyun 
2423*4882a593Smuzhiyun 	switch (img_req->op_type) {
2424*4882a593Smuzhiyun 	case OBJ_OP_WRITE:
2425*4882a593Smuzhiyun 		if (!use_object_map(img_req->rbd_dev) ||
2426*4882a593Smuzhiyun 		    !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST))
2427*4882a593Smuzhiyun 			return 2; /* setallochint + write/writefull */
2428*4882a593Smuzhiyun 
2429*4882a593Smuzhiyun 		return 1; /* write/writefull */
2430*4882a593Smuzhiyun 	case OBJ_OP_DISCARD:
2431*4882a593Smuzhiyun 		return 1; /* delete/truncate/zero */
2432*4882a593Smuzhiyun 	case OBJ_OP_ZEROOUT:
2433*4882a593Smuzhiyun 		if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents &&
2434*4882a593Smuzhiyun 		    !(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
2435*4882a593Smuzhiyun 			return 2; /* create + truncate */
2436*4882a593Smuzhiyun 
2437*4882a593Smuzhiyun 		return 1; /* delete/truncate/zero */
2438*4882a593Smuzhiyun 	default:
2439*4882a593Smuzhiyun 		BUG();
2440*4882a593Smuzhiyun 	}
2441*4882a593Smuzhiyun }
2442*4882a593Smuzhiyun 
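/*
 * count_write_ops() must match exactly what
 * __rbd_osd_setup_{write,discard,zeroout}_ops() emit later, because
 * the OSD request is allocated with this many ops up front.  E.g. a
 * zeroout of an entire object that is backed by the parent, with
 * copyup disabled, reserves two ops for the CREATE + TRUNCATE pair.
 */
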
2443*4882a593Smuzhiyun static void rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
2444*4882a593Smuzhiyun 				    int which)
2445*4882a593Smuzhiyun {
2446*4882a593Smuzhiyun 	struct rbd_obj_request *obj_req = osd_req->r_priv;
2447*4882a593Smuzhiyun 
2448*4882a593Smuzhiyun 	switch (obj_req->img_request->op_type) {
2449*4882a593Smuzhiyun 	case OBJ_OP_WRITE:
2450*4882a593Smuzhiyun 		__rbd_osd_setup_write_ops(osd_req, which);
2451*4882a593Smuzhiyun 		break;
2452*4882a593Smuzhiyun 	case OBJ_OP_DISCARD:
2453*4882a593Smuzhiyun 		__rbd_osd_setup_discard_ops(osd_req, which);
2454*4882a593Smuzhiyun 		break;
2455*4882a593Smuzhiyun 	case OBJ_OP_ZEROOUT:
2456*4882a593Smuzhiyun 		__rbd_osd_setup_zeroout_ops(osd_req, which);
2457*4882a593Smuzhiyun 		break;
2458*4882a593Smuzhiyun 	default:
2459*4882a593Smuzhiyun 		BUG();
2460*4882a593Smuzhiyun 	}
2461*4882a593Smuzhiyun }
2462*4882a593Smuzhiyun 
2463*4882a593Smuzhiyun /*
2464*4882a593Smuzhiyun  * Prune the list of object requests (adjust offset and/or length, drop
2465*4882a593Smuzhiyun  * redundant requests).  Prepare object request state machines and image
2466*4882a593Smuzhiyun  * request state machine for execution.
2467*4882a593Smuzhiyun  */
2468*4882a593Smuzhiyun static int __rbd_img_fill_request(struct rbd_img_request *img_req)
2469*4882a593Smuzhiyun {
2470*4882a593Smuzhiyun 	struct rbd_obj_request *obj_req, *next_obj_req;
2471*4882a593Smuzhiyun 	int ret;
2472*4882a593Smuzhiyun 
2473*4882a593Smuzhiyun 	for_each_obj_request_safe(img_req, obj_req, next_obj_req) {
2474*4882a593Smuzhiyun 		switch (img_req->op_type) {
2475*4882a593Smuzhiyun 		case OBJ_OP_READ:
2476*4882a593Smuzhiyun 			ret = rbd_obj_init_read(obj_req);
2477*4882a593Smuzhiyun 			break;
2478*4882a593Smuzhiyun 		case OBJ_OP_WRITE:
2479*4882a593Smuzhiyun 			ret = rbd_obj_init_write(obj_req);
2480*4882a593Smuzhiyun 			break;
2481*4882a593Smuzhiyun 		case OBJ_OP_DISCARD:
2482*4882a593Smuzhiyun 			ret = rbd_obj_init_discard(obj_req);
2483*4882a593Smuzhiyun 			break;
2484*4882a593Smuzhiyun 		case OBJ_OP_ZEROOUT:
2485*4882a593Smuzhiyun 			ret = rbd_obj_init_zeroout(obj_req);
2486*4882a593Smuzhiyun 			break;
2487*4882a593Smuzhiyun 		default:
2488*4882a593Smuzhiyun 			BUG();
2489*4882a593Smuzhiyun 		}
2490*4882a593Smuzhiyun 		if (ret < 0)
2491*4882a593Smuzhiyun 			return ret;
2492*4882a593Smuzhiyun 		if (ret > 0) {
2493*4882a593Smuzhiyun 			rbd_img_obj_request_del(img_req, obj_req);
2494*4882a593Smuzhiyun 			continue;
2495*4882a593Smuzhiyun 		}
2496*4882a593Smuzhiyun 	}
2497*4882a593Smuzhiyun 
2498*4882a593Smuzhiyun 	img_req->state = RBD_IMG_START;
2499*4882a593Smuzhiyun 	return 0;
2500*4882a593Smuzhiyun }
2501*4882a593Smuzhiyun 
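/*
 * A positive return from an init function (currently only possible
 * for rbd_obj_init_discard()) means there is nothing to do for that
 * object -- e.g. a discard too small to free an alloc_size chunk --
 * so the object request is deleted instead of being submitted as a
 * no-op.
 */
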
2502*4882a593Smuzhiyun union rbd_img_fill_iter {
2503*4882a593Smuzhiyun 	struct ceph_bio_iter	bio_iter;
2504*4882a593Smuzhiyun 	struct ceph_bvec_iter	bvec_iter;
2505*4882a593Smuzhiyun };
2506*4882a593Smuzhiyun 
2507*4882a593Smuzhiyun struct rbd_img_fill_ctx {
2508*4882a593Smuzhiyun 	enum obj_request_type	pos_type;
2509*4882a593Smuzhiyun 	union rbd_img_fill_iter	*pos;
2510*4882a593Smuzhiyun 	union rbd_img_fill_iter	iter;
2511*4882a593Smuzhiyun 	ceph_object_extent_fn_t	set_pos_fn;
2512*4882a593Smuzhiyun 	ceph_object_extent_fn_t	count_fn;
2513*4882a593Smuzhiyun 	ceph_object_extent_fn_t	copy_fn;
2514*4882a593Smuzhiyun };
2515*4882a593Smuzhiyun 
2516*4882a593Smuzhiyun static struct ceph_object_extent *alloc_object_extent(void *arg)
2517*4882a593Smuzhiyun {
2518*4882a593Smuzhiyun 	struct rbd_img_request *img_req = arg;
2519*4882a593Smuzhiyun 	struct rbd_obj_request *obj_req;
2520*4882a593Smuzhiyun 
2521*4882a593Smuzhiyun 	obj_req = rbd_obj_request_create();
2522*4882a593Smuzhiyun 	if (!obj_req)
2523*4882a593Smuzhiyun 		return NULL;
2524*4882a593Smuzhiyun 
2525*4882a593Smuzhiyun 	rbd_img_obj_request_add(img_req, obj_req);
2526*4882a593Smuzhiyun 	return &obj_req->ex;
2527*4882a593Smuzhiyun }
2528*4882a593Smuzhiyun 
2529*4882a593Smuzhiyun /*
2530*4882a593Smuzhiyun  * While su != os && sc == 1 is technically not fancy (it's the same
2531*4882a593Smuzhiyun  * layout as su == os && sc == 1), we can't use the nocopy path for it
2532*4882a593Smuzhiyun  * because ->set_pos_fn() should be called only once per object.
2533*4882a593Smuzhiyun  * ceph_file_to_extents() invokes action_fn once per stripe unit, so
2534*4882a593Smuzhiyun  * treat su != os && sc == 1 as fancy.
2535*4882a593Smuzhiyun  */
2536*4882a593Smuzhiyun static bool rbd_layout_is_fancy(struct ceph_file_layout *l)
2537*4882a593Smuzhiyun {
2538*4882a593Smuzhiyun 	return l->stripe_unit != l->object_size;
2539*4882a593Smuzhiyun }
2540*4882a593Smuzhiyun 
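/*
 * Example: the default layout (su == os == 4M, sc == 1) is not fancy;
 * an image created with e.g. --stripe-unit 64K --stripe-count 8 on 4M
 * objects is fancy, so its data is copied into per-object bio_vec
 * arrays below.
 */
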
2541*4882a593Smuzhiyun static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req,
2542*4882a593Smuzhiyun 				       struct ceph_file_extent *img_extents,
2543*4882a593Smuzhiyun 				       u32 num_img_extents,
2544*4882a593Smuzhiyun 				       struct rbd_img_fill_ctx *fctx)
2545*4882a593Smuzhiyun {
2546*4882a593Smuzhiyun 	u32 i;
2547*4882a593Smuzhiyun 	int ret;
2548*4882a593Smuzhiyun 
2549*4882a593Smuzhiyun 	img_req->data_type = fctx->pos_type;
2550*4882a593Smuzhiyun 
2551*4882a593Smuzhiyun 	/*
2552*4882a593Smuzhiyun 	 * Create object requests and set each object request's starting
2553*4882a593Smuzhiyun 	 * position in the provided bio (list) or bio_vec array.
2554*4882a593Smuzhiyun 	 */
2555*4882a593Smuzhiyun 	fctx->iter = *fctx->pos;
2556*4882a593Smuzhiyun 	for (i = 0; i < num_img_extents; i++) {
2557*4882a593Smuzhiyun 		ret = ceph_file_to_extents(&img_req->rbd_dev->layout,
2558*4882a593Smuzhiyun 					   img_extents[i].fe_off,
2559*4882a593Smuzhiyun 					   img_extents[i].fe_len,
2560*4882a593Smuzhiyun 					   &img_req->object_extents,
2561*4882a593Smuzhiyun 					   alloc_object_extent, img_req,
2562*4882a593Smuzhiyun 					   fctx->set_pos_fn, &fctx->iter);
2563*4882a593Smuzhiyun 		if (ret)
2564*4882a593Smuzhiyun 			return ret;
2565*4882a593Smuzhiyun 	}
2566*4882a593Smuzhiyun 
2567*4882a593Smuzhiyun 	return __rbd_img_fill_request(img_req);
2568*4882a593Smuzhiyun }
2569*4882a593Smuzhiyun 
2570*4882a593Smuzhiyun /*
2571*4882a593Smuzhiyun  * Map a list of image extents to a list of object extents, create the
2572*4882a593Smuzhiyun  * corresponding object requests (normally each to a different object,
2573*4882a593Smuzhiyun  * but not always) and add them to @img_req.  For each object request,
2574*4882a593Smuzhiyun  * set up its data descriptor to point to the corresponding chunk(s) of
2575*4882a593Smuzhiyun  * @fctx->pos data buffer.
2576*4882a593Smuzhiyun  *
2577*4882a593Smuzhiyun  * Because ceph_file_to_extents() will merge adjacent object extents
2578*4882a593Smuzhiyun  * together, each object request's data descriptor may point to multiple
2579*4882a593Smuzhiyun  * different chunks of @fctx->pos data buffer.
2580*4882a593Smuzhiyun  *
2581*4882a593Smuzhiyun  * @fctx->pos data buffer is assumed to be large enough.
2582*4882a593Smuzhiyun  */
2583*4882a593Smuzhiyun static int rbd_img_fill_request(struct rbd_img_request *img_req,
2584*4882a593Smuzhiyun 				struct ceph_file_extent *img_extents,
2585*4882a593Smuzhiyun 				u32 num_img_extents,
2586*4882a593Smuzhiyun 				struct rbd_img_fill_ctx *fctx)
2587*4882a593Smuzhiyun {
2588*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = img_req->rbd_dev;
2589*4882a593Smuzhiyun 	struct rbd_obj_request *obj_req;
2590*4882a593Smuzhiyun 	u32 i;
2591*4882a593Smuzhiyun 	int ret;
2592*4882a593Smuzhiyun 
2593*4882a593Smuzhiyun 	if (fctx->pos_type == OBJ_REQUEST_NODATA ||
2594*4882a593Smuzhiyun 	    !rbd_layout_is_fancy(&rbd_dev->layout))
2595*4882a593Smuzhiyun 		return rbd_img_fill_request_nocopy(img_req, img_extents,
2596*4882a593Smuzhiyun 						   num_img_extents, fctx);
2597*4882a593Smuzhiyun 
2598*4882a593Smuzhiyun 	img_req->data_type = OBJ_REQUEST_OWN_BVECS;
2599*4882a593Smuzhiyun 
2600*4882a593Smuzhiyun 	/*
2601*4882a593Smuzhiyun 	 * Create object requests and determine ->bvec_count for each object
2602*4882a593Smuzhiyun 	 * request.  Note that ->bvec_count sum over all object requests may
2603*4882a593Smuzhiyun 	 * be greater than the number of bio_vecs in the provided bio (list)
2604*4882a593Smuzhiyun 	 * or bio_vec array because when mapped, those bio_vecs can straddle
2605*4882a593Smuzhiyun 	 * stripe unit boundaries.
2606*4882a593Smuzhiyun 	 */
2607*4882a593Smuzhiyun 	fctx->iter = *fctx->pos;
2608*4882a593Smuzhiyun 	for (i = 0; i < num_img_extents; i++) {
2609*4882a593Smuzhiyun 		ret = ceph_file_to_extents(&rbd_dev->layout,
2610*4882a593Smuzhiyun 					   img_extents[i].fe_off,
2611*4882a593Smuzhiyun 					   img_extents[i].fe_len,
2612*4882a593Smuzhiyun 					   &img_req->object_extents,
2613*4882a593Smuzhiyun 					   alloc_object_extent, img_req,
2614*4882a593Smuzhiyun 					   fctx->count_fn, &fctx->iter);
2615*4882a593Smuzhiyun 		if (ret)
2616*4882a593Smuzhiyun 			return ret;
2617*4882a593Smuzhiyun 	}
2618*4882a593Smuzhiyun 
2619*4882a593Smuzhiyun 	for_each_obj_request(img_req, obj_req) {
2620*4882a593Smuzhiyun 		obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count,
2621*4882a593Smuzhiyun 					      sizeof(*obj_req->bvec_pos.bvecs),
2622*4882a593Smuzhiyun 					      GFP_NOIO);
2623*4882a593Smuzhiyun 		if (!obj_req->bvec_pos.bvecs)
2624*4882a593Smuzhiyun 			return -ENOMEM;
2625*4882a593Smuzhiyun 	}
2626*4882a593Smuzhiyun 
2627*4882a593Smuzhiyun 	/*
2628*4882a593Smuzhiyun 	 * Fill in each object request's private bio_vec array, splitting and
2629*4882a593Smuzhiyun 	 * rearranging the provided bio_vecs in stripe unit chunks as needed.
2630*4882a593Smuzhiyun 	 */
2631*4882a593Smuzhiyun 	fctx->iter = *fctx->pos;
2632*4882a593Smuzhiyun 	for (i = 0; i < num_img_extents; i++) {
2633*4882a593Smuzhiyun 		ret = ceph_iterate_extents(&rbd_dev->layout,
2634*4882a593Smuzhiyun 					   img_extents[i].fe_off,
2635*4882a593Smuzhiyun 					   img_extents[i].fe_len,
2636*4882a593Smuzhiyun 					   &img_req->object_extents,
2637*4882a593Smuzhiyun 					   fctx->copy_fn, &fctx->iter);
2638*4882a593Smuzhiyun 		if (ret)
2639*4882a593Smuzhiyun 			return ret;
2640*4882a593Smuzhiyun 	}
2641*4882a593Smuzhiyun 
2642*4882a593Smuzhiyun 	return __rbd_img_fill_request(img_req);
2643*4882a593Smuzhiyun }
2644*4882a593Smuzhiyun 
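/*
 * The fancy path above is three passes over the same extents: the
 * count_fn pass sizes each object's private bio_vec array, the arrays
 * are then allocated, and the copy_fn pass populates them.  fctx->iter
 * is reset to *fctx->pos before each traversal so both passes walk
 * identical data.
 */
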
2645*4882a593Smuzhiyun static int rbd_img_fill_nodata(struct rbd_img_request *img_req,
2646*4882a593Smuzhiyun 			       u64 off, u64 len)
2647*4882a593Smuzhiyun {
2648*4882a593Smuzhiyun 	struct ceph_file_extent ex = { off, len };
2649*4882a593Smuzhiyun 	union rbd_img_fill_iter dummy = {};
2650*4882a593Smuzhiyun 	struct rbd_img_fill_ctx fctx = {
2651*4882a593Smuzhiyun 		.pos_type = OBJ_REQUEST_NODATA,
2652*4882a593Smuzhiyun 		.pos = &dummy,
2653*4882a593Smuzhiyun 	};
2654*4882a593Smuzhiyun 
2655*4882a593Smuzhiyun 	return rbd_img_fill_request(img_req, &ex, 1, &fctx);
2656*4882a593Smuzhiyun }
2657*4882a593Smuzhiyun 
2658*4882a593Smuzhiyun static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2659*4882a593Smuzhiyun {
2660*4882a593Smuzhiyun 	struct rbd_obj_request *obj_req =
2661*4882a593Smuzhiyun 	    container_of(ex, struct rbd_obj_request, ex);
2662*4882a593Smuzhiyun 	struct ceph_bio_iter *it = arg;
2663*4882a593Smuzhiyun 
2664*4882a593Smuzhiyun 	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2665*4882a593Smuzhiyun 	obj_req->bio_pos = *it;
2666*4882a593Smuzhiyun 	ceph_bio_iter_advance(it, bytes);
2667*4882a593Smuzhiyun }
2668*4882a593Smuzhiyun 
2669*4882a593Smuzhiyun static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2670*4882a593Smuzhiyun {
2671*4882a593Smuzhiyun 	struct rbd_obj_request *obj_req =
2672*4882a593Smuzhiyun 	    container_of(ex, struct rbd_obj_request, ex);
2673*4882a593Smuzhiyun 	struct ceph_bio_iter *it = arg;
2674*4882a593Smuzhiyun 
2675*4882a593Smuzhiyun 	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2676*4882a593Smuzhiyun 	ceph_bio_iter_advance_step(it, bytes, ({
2677*4882a593Smuzhiyun 		obj_req->bvec_count++;
2678*4882a593Smuzhiyun 	}));
2680*4882a593Smuzhiyun }
2681*4882a593Smuzhiyun 
2682*4882a593Smuzhiyun static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2683*4882a593Smuzhiyun {
2684*4882a593Smuzhiyun 	struct rbd_obj_request *obj_req =
2685*4882a593Smuzhiyun 	    container_of(ex, struct rbd_obj_request, ex);
2686*4882a593Smuzhiyun 	struct ceph_bio_iter *it = arg;
2687*4882a593Smuzhiyun 
2688*4882a593Smuzhiyun 	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2689*4882a593Smuzhiyun 	ceph_bio_iter_advance_step(it, bytes, ({
2690*4882a593Smuzhiyun 		obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2691*4882a593Smuzhiyun 		obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2692*4882a593Smuzhiyun 	}));
2693*4882a593Smuzhiyun }
2694*4882a593Smuzhiyun 
2695*4882a593Smuzhiyun static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2696*4882a593Smuzhiyun 				   struct ceph_file_extent *img_extents,
2697*4882a593Smuzhiyun 				   u32 num_img_extents,
2698*4882a593Smuzhiyun 				   struct ceph_bio_iter *bio_pos)
2699*4882a593Smuzhiyun {
2700*4882a593Smuzhiyun 	struct rbd_img_fill_ctx fctx = {
2701*4882a593Smuzhiyun 		.pos_type = OBJ_REQUEST_BIO,
2702*4882a593Smuzhiyun 		.pos = (union rbd_img_fill_iter *)bio_pos,
2703*4882a593Smuzhiyun 		.set_pos_fn = set_bio_pos,
2704*4882a593Smuzhiyun 		.count_fn = count_bio_bvecs,
2705*4882a593Smuzhiyun 		.copy_fn = copy_bio_bvecs,
2706*4882a593Smuzhiyun 	};
2707*4882a593Smuzhiyun 
2708*4882a593Smuzhiyun 	return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2709*4882a593Smuzhiyun 				    &fctx);
2710*4882a593Smuzhiyun }
2711*4882a593Smuzhiyun 
2712*4882a593Smuzhiyun static int rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2713*4882a593Smuzhiyun 				 u64 off, u64 len, struct bio *bio)
2714*4882a593Smuzhiyun {
2715*4882a593Smuzhiyun 	struct ceph_file_extent ex = { off, len };
2716*4882a593Smuzhiyun 	struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter };
2717*4882a593Smuzhiyun 
2718*4882a593Smuzhiyun 	return __rbd_img_fill_from_bio(img_req, &ex, 1, &it);
2719*4882a593Smuzhiyun }
2720*4882a593Smuzhiyun 
2721*4882a593Smuzhiyun static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2722*4882a593Smuzhiyun {
2723*4882a593Smuzhiyun 	struct rbd_obj_request *obj_req =
2724*4882a593Smuzhiyun 	    container_of(ex, struct rbd_obj_request, ex);
2725*4882a593Smuzhiyun 	struct ceph_bvec_iter *it = arg;
2726*4882a593Smuzhiyun 
2727*4882a593Smuzhiyun 	obj_req->bvec_pos = *it;
2728*4882a593Smuzhiyun 	ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes);
2729*4882a593Smuzhiyun 	ceph_bvec_iter_advance(it, bytes);
2730*4882a593Smuzhiyun }
2731*4882a593Smuzhiyun 
2732*4882a593Smuzhiyun static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2733*4882a593Smuzhiyun {
2734*4882a593Smuzhiyun 	struct rbd_obj_request *obj_req =
2735*4882a593Smuzhiyun 	    container_of(ex, struct rbd_obj_request, ex);
2736*4882a593Smuzhiyun 	struct ceph_bvec_iter *it = arg;
2737*4882a593Smuzhiyun 
2738*4882a593Smuzhiyun 	ceph_bvec_iter_advance_step(it, bytes, ({
2739*4882a593Smuzhiyun 		obj_req->bvec_count++;
2740*4882a593Smuzhiyun 	}));
2741*4882a593Smuzhiyun }
2742*4882a593Smuzhiyun 
2743*4882a593Smuzhiyun static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2744*4882a593Smuzhiyun {
2745*4882a593Smuzhiyun 	struct rbd_obj_request *obj_req =
2746*4882a593Smuzhiyun 	    container_of(ex, struct rbd_obj_request, ex);
2747*4882a593Smuzhiyun 	struct ceph_bvec_iter *it = arg;
2748*4882a593Smuzhiyun 
2749*4882a593Smuzhiyun 	ceph_bvec_iter_advance_step(it, bytes, ({
2750*4882a593Smuzhiyun 		obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2751*4882a593Smuzhiyun 		obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2752*4882a593Smuzhiyun 	}));
2753*4882a593Smuzhiyun }
2754*4882a593Smuzhiyun 
2755*4882a593Smuzhiyun static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2756*4882a593Smuzhiyun 				     struct ceph_file_extent *img_extents,
2757*4882a593Smuzhiyun 				     u32 num_img_extents,
2758*4882a593Smuzhiyun 				     struct ceph_bvec_iter *bvec_pos)
2759*4882a593Smuzhiyun {
2760*4882a593Smuzhiyun 	struct rbd_img_fill_ctx fctx = {
2761*4882a593Smuzhiyun 		.pos_type = OBJ_REQUEST_BVECS,
2762*4882a593Smuzhiyun 		.pos = (union rbd_img_fill_iter *)bvec_pos,
2763*4882a593Smuzhiyun 		.set_pos_fn = set_bvec_pos,
2764*4882a593Smuzhiyun 		.count_fn = count_bvecs,
2765*4882a593Smuzhiyun 		.copy_fn = copy_bvecs,
2766*4882a593Smuzhiyun 	};
2767*4882a593Smuzhiyun 
2768*4882a593Smuzhiyun 	return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2769*4882a593Smuzhiyun 				    &fctx);
2770*4882a593Smuzhiyun }
2771*4882a593Smuzhiyun 
2772*4882a593Smuzhiyun static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2773*4882a593Smuzhiyun 				   struct ceph_file_extent *img_extents,
2774*4882a593Smuzhiyun 				   u32 num_img_extents,
2775*4882a593Smuzhiyun 				   struct bio_vec *bvecs)
2776*4882a593Smuzhiyun {
2777*4882a593Smuzhiyun 	struct ceph_bvec_iter it = {
2778*4882a593Smuzhiyun 		.bvecs = bvecs,
2779*4882a593Smuzhiyun 		.iter = { .bi_size = ceph_file_extents_bytes(img_extents,
2780*4882a593Smuzhiyun 							     num_img_extents) },
2781*4882a593Smuzhiyun 	};
2782*4882a593Smuzhiyun 
2783*4882a593Smuzhiyun 	return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents,
2784*4882a593Smuzhiyun 					 &it);
2785*4882a593Smuzhiyun }
2786*4882a593Smuzhiyun 
2787*4882a593Smuzhiyun static void rbd_img_handle_request_work(struct work_struct *work)
2788*4882a593Smuzhiyun {
2789*4882a593Smuzhiyun 	struct rbd_img_request *img_req =
2790*4882a593Smuzhiyun 	    container_of(work, struct rbd_img_request, work);
2791*4882a593Smuzhiyun 
2792*4882a593Smuzhiyun 	rbd_img_handle_request(img_req, img_req->work_result);
2793*4882a593Smuzhiyun }
2794*4882a593Smuzhiyun 
2795*4882a593Smuzhiyun static void rbd_img_schedule(struct rbd_img_request *img_req, int result)
2796*4882a593Smuzhiyun {
2797*4882a593Smuzhiyun 	INIT_WORK(&img_req->work, rbd_img_handle_request_work);
2798*4882a593Smuzhiyun 	img_req->work_result = result;
2799*4882a593Smuzhiyun 	queue_work(rbd_wq, &img_req->work);
2800*4882a593Smuzhiyun }
2801*4882a593Smuzhiyun 
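/*
 * Bouncing through rbd_wq instead of calling rbd_img_handle_request()
 * directly bounds stack depth: a clone chain several parents deep
 * would otherwise recurse through each ancestor image's state machine.
 */
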
2802*4882a593Smuzhiyun static bool rbd_obj_may_exist(struct rbd_obj_request *obj_req)
2803*4882a593Smuzhiyun {
2804*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2805*4882a593Smuzhiyun 
2806*4882a593Smuzhiyun 	if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno)) {
2807*4882a593Smuzhiyun 		obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
2808*4882a593Smuzhiyun 		return true;
2809*4882a593Smuzhiyun 	}
2810*4882a593Smuzhiyun 
2811*4882a593Smuzhiyun 	dout("%s %p objno %llu assuming dne\n", __func__, obj_req,
2812*4882a593Smuzhiyun 	     obj_req->ex.oe_objno);
2813*4882a593Smuzhiyun 	return false;
2814*4882a593Smuzhiyun }
2815*4882a593Smuzhiyun 
2816*4882a593Smuzhiyun static int rbd_obj_read_object(struct rbd_obj_request *obj_req)
2817*4882a593Smuzhiyun {
2818*4882a593Smuzhiyun 	struct ceph_osd_request *osd_req;
2819*4882a593Smuzhiyun 	int ret;
2820*4882a593Smuzhiyun 
2821*4882a593Smuzhiyun 	osd_req = __rbd_obj_add_osd_request(obj_req, NULL, 1);
2822*4882a593Smuzhiyun 	if (IS_ERR(osd_req))
2823*4882a593Smuzhiyun 		return PTR_ERR(osd_req);
2824*4882a593Smuzhiyun 
2825*4882a593Smuzhiyun 	osd_req_op_extent_init(osd_req, 0, CEPH_OSD_OP_READ,
2826*4882a593Smuzhiyun 			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
2827*4882a593Smuzhiyun 	rbd_osd_setup_data(osd_req, 0);
2828*4882a593Smuzhiyun 	rbd_osd_format_read(osd_req);
2829*4882a593Smuzhiyun 
2830*4882a593Smuzhiyun 	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
2831*4882a593Smuzhiyun 	if (ret)
2832*4882a593Smuzhiyun 		return ret;
2833*4882a593Smuzhiyun 
2834*4882a593Smuzhiyun 	rbd_osd_submit(osd_req);
2835*4882a593Smuzhiyun 	return 0;
2836*4882a593Smuzhiyun }
2837*4882a593Smuzhiyun 
2838*4882a593Smuzhiyun static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
2839*4882a593Smuzhiyun {
2840*4882a593Smuzhiyun 	struct rbd_img_request *img_req = obj_req->img_request;
2841*4882a593Smuzhiyun 	struct rbd_device *parent = img_req->rbd_dev->parent;
2842*4882a593Smuzhiyun 	struct rbd_img_request *child_img_req;
2843*4882a593Smuzhiyun 	int ret;
2844*4882a593Smuzhiyun 
2845*4882a593Smuzhiyun 	child_img_req = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO);
2846*4882a593Smuzhiyun 	if (!child_img_req)
2847*4882a593Smuzhiyun 		return -ENOMEM;
2848*4882a593Smuzhiyun 
2849*4882a593Smuzhiyun 	rbd_img_request_init(child_img_req, parent, OBJ_OP_READ);
2850*4882a593Smuzhiyun 	__set_bit(IMG_REQ_CHILD, &child_img_req->flags);
2851*4882a593Smuzhiyun 	child_img_req->obj_request = obj_req;
2852*4882a593Smuzhiyun 
2853*4882a593Smuzhiyun 	down_read(&parent->header_rwsem);
2854*4882a593Smuzhiyun 	rbd_img_capture_header(child_img_req);
2855*4882a593Smuzhiyun 	up_read(&parent->header_rwsem);
2856*4882a593Smuzhiyun 
2857*4882a593Smuzhiyun 	dout("%s child_img_req %p for obj_req %p\n", __func__, child_img_req,
2858*4882a593Smuzhiyun 	     obj_req);
2859*4882a593Smuzhiyun 
2860*4882a593Smuzhiyun 	if (!rbd_img_is_write(img_req)) {
2861*4882a593Smuzhiyun 		switch (img_req->data_type) {
2862*4882a593Smuzhiyun 		case OBJ_REQUEST_BIO:
2863*4882a593Smuzhiyun 			ret = __rbd_img_fill_from_bio(child_img_req,
2864*4882a593Smuzhiyun 						      obj_req->img_extents,
2865*4882a593Smuzhiyun 						      obj_req->num_img_extents,
2866*4882a593Smuzhiyun 						      &obj_req->bio_pos);
2867*4882a593Smuzhiyun 			break;
2868*4882a593Smuzhiyun 		case OBJ_REQUEST_BVECS:
2869*4882a593Smuzhiyun 		case OBJ_REQUEST_OWN_BVECS:
2870*4882a593Smuzhiyun 			ret = __rbd_img_fill_from_bvecs(child_img_req,
2871*4882a593Smuzhiyun 						      obj_req->img_extents,
2872*4882a593Smuzhiyun 						      obj_req->num_img_extents,
2873*4882a593Smuzhiyun 						      &obj_req->bvec_pos);
2874*4882a593Smuzhiyun 			break;
2875*4882a593Smuzhiyun 		default:
2876*4882a593Smuzhiyun 			BUG();
2877*4882a593Smuzhiyun 		}
2878*4882a593Smuzhiyun 	} else {
2879*4882a593Smuzhiyun 		ret = rbd_img_fill_from_bvecs(child_img_req,
2880*4882a593Smuzhiyun 					      obj_req->img_extents,
2881*4882a593Smuzhiyun 					      obj_req->num_img_extents,
2882*4882a593Smuzhiyun 					      obj_req->copyup_bvecs);
2883*4882a593Smuzhiyun 	}
2884*4882a593Smuzhiyun 	if (ret) {
2885*4882a593Smuzhiyun 		rbd_img_request_destroy(child_img_req);
2886*4882a593Smuzhiyun 		return ret;
2887*4882a593Smuzhiyun 	}
2888*4882a593Smuzhiyun 
2889*4882a593Smuzhiyun 	/* avoid parent chain recursion */
2890*4882a593Smuzhiyun 	rbd_img_schedule(child_img_req, 0);
2891*4882a593Smuzhiyun 	return 0;
2892*4882a593Smuzhiyun }
2893*4882a593Smuzhiyun 
2894*4882a593Smuzhiyun static bool rbd_obj_advance_read(struct rbd_obj_request *obj_req, int *result)
2895*4882a593Smuzhiyun {
2896*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2897*4882a593Smuzhiyun 	int ret;
2898*4882a593Smuzhiyun 
2899*4882a593Smuzhiyun again:
2900*4882a593Smuzhiyun 	switch (obj_req->read_state) {
2901*4882a593Smuzhiyun 	case RBD_OBJ_READ_START:
2902*4882a593Smuzhiyun 		rbd_assert(!*result);
2903*4882a593Smuzhiyun 
2904*4882a593Smuzhiyun 		if (!rbd_obj_may_exist(obj_req)) {
2905*4882a593Smuzhiyun 			*result = -ENOENT;
2906*4882a593Smuzhiyun 			obj_req->read_state = RBD_OBJ_READ_OBJECT;
2907*4882a593Smuzhiyun 			goto again;
2908*4882a593Smuzhiyun 		}
2909*4882a593Smuzhiyun 
2910*4882a593Smuzhiyun 		ret = rbd_obj_read_object(obj_req);
2911*4882a593Smuzhiyun 		if (ret) {
2912*4882a593Smuzhiyun 			*result = ret;
2913*4882a593Smuzhiyun 			return true;
2914*4882a593Smuzhiyun 		}
2915*4882a593Smuzhiyun 		obj_req->read_state = RBD_OBJ_READ_OBJECT;
2916*4882a593Smuzhiyun 		return false;
2917*4882a593Smuzhiyun 	case RBD_OBJ_READ_OBJECT:
2918*4882a593Smuzhiyun 		if (*result == -ENOENT && rbd_dev->parent_overlap) {
2919*4882a593Smuzhiyun 			/* reverse map this object extent onto the parent */
2920*4882a593Smuzhiyun 			ret = rbd_obj_calc_img_extents(obj_req, false);
2921*4882a593Smuzhiyun 			if (ret) {
2922*4882a593Smuzhiyun 				*result = ret;
2923*4882a593Smuzhiyun 				return true;
2924*4882a593Smuzhiyun 			}
2925*4882a593Smuzhiyun 			if (obj_req->num_img_extents) {
2926*4882a593Smuzhiyun 				ret = rbd_obj_read_from_parent(obj_req);
2927*4882a593Smuzhiyun 				if (ret) {
2928*4882a593Smuzhiyun 					*result = ret;
2929*4882a593Smuzhiyun 					return true;
2930*4882a593Smuzhiyun 				}
2931*4882a593Smuzhiyun 				obj_req->read_state = RBD_OBJ_READ_PARENT;
2932*4882a593Smuzhiyun 				return false;
2933*4882a593Smuzhiyun 			}
2934*4882a593Smuzhiyun 		}
2935*4882a593Smuzhiyun 
2936*4882a593Smuzhiyun 		/*
2937*4882a593Smuzhiyun 		 * -ENOENT means a hole in the image -- zero-fill the entire
2938*4882a593Smuzhiyun 		 * length of the request.  A short read also implies zero-fill
2939*4882a593Smuzhiyun 		 * to the end of the request.
2940*4882a593Smuzhiyun 		 */
2941*4882a593Smuzhiyun 		if (*result == -ENOENT) {
2942*4882a593Smuzhiyun 			rbd_obj_zero_range(obj_req, 0, obj_req->ex.oe_len);
2943*4882a593Smuzhiyun 			*result = 0;
2944*4882a593Smuzhiyun 		} else if (*result >= 0) {
2945*4882a593Smuzhiyun 			if (*result < obj_req->ex.oe_len)
2946*4882a593Smuzhiyun 				rbd_obj_zero_range(obj_req, *result,
2947*4882a593Smuzhiyun 						obj_req->ex.oe_len - *result);
2948*4882a593Smuzhiyun 			else
2949*4882a593Smuzhiyun 				rbd_assert(*result == obj_req->ex.oe_len);
2950*4882a593Smuzhiyun 			*result = 0;
2951*4882a593Smuzhiyun 		}
2952*4882a593Smuzhiyun 		return true;
2953*4882a593Smuzhiyun 	case RBD_OBJ_READ_PARENT:
2954*4882a593Smuzhiyun 		/*
2955*4882a593Smuzhiyun 		 * The parent image is read only up to the overlap -- zero-fill
2956*4882a593Smuzhiyun 		 * from the overlap to the end of the request.
2957*4882a593Smuzhiyun 		 */
2958*4882a593Smuzhiyun 		if (!*result) {
2959*4882a593Smuzhiyun 			u32 obj_overlap = rbd_obj_img_extents_bytes(obj_req);
2960*4882a593Smuzhiyun 
2961*4882a593Smuzhiyun 			if (obj_overlap < obj_req->ex.oe_len)
2962*4882a593Smuzhiyun 				rbd_obj_zero_range(obj_req, obj_overlap,
2963*4882a593Smuzhiyun 					    obj_req->ex.oe_len - obj_overlap);
2964*4882a593Smuzhiyun 		}
2965*4882a593Smuzhiyun 		return true;
2966*4882a593Smuzhiyun 	default:
2967*4882a593Smuzhiyun 		BUG();
2968*4882a593Smuzhiyun 	}
2969*4882a593Smuzhiyun }
2970*4882a593Smuzhiyun 
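/*
 * Read state machine, roughly:
 *
 *     START --(object map says dne)--> OBJECT with *result = -ENOENT
 *     START --(read submitted)-------> OBJECT
 *     OBJECT --(-ENOENT, parent overlap)--> PARENT
 *     OBJECT/PARENT --> done, zero-filling holes and short reads
 *
 * A true return means the object request has completed (successfully
 * or not); false means an asynchronous step was submitted.
 */
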
2971*4882a593Smuzhiyun static bool rbd_obj_write_is_noop(struct rbd_obj_request *obj_req)
2972*4882a593Smuzhiyun {
2973*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2974*4882a593Smuzhiyun 
2975*4882a593Smuzhiyun 	if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno))
2976*4882a593Smuzhiyun 		obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
2977*4882a593Smuzhiyun 
2978*4882a593Smuzhiyun 	if (!(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST) &&
2979*4882a593Smuzhiyun 	    (obj_req->flags & RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT)) {
2980*4882a593Smuzhiyun 		dout("%s %p noop for nonexistent\n", __func__, obj_req);
2981*4882a593Smuzhiyun 		return true;
2982*4882a593Smuzhiyun 	}
2983*4882a593Smuzhiyun 
2984*4882a593Smuzhiyun 	return false;
2985*4882a593Smuzhiyun }
2986*4882a593Smuzhiyun 
2987*4882a593Smuzhiyun /*
2988*4882a593Smuzhiyun  * Return:
2989*4882a593Smuzhiyun  *   0 - object map update sent
2990*4882a593Smuzhiyun  *   1 - object map update isn't needed
2991*4882a593Smuzhiyun  *  <0 - error
2992*4882a593Smuzhiyun  */
2993*4882a593Smuzhiyun static int rbd_obj_write_pre_object_map(struct rbd_obj_request *obj_req)
2994*4882a593Smuzhiyun {
2995*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2996*4882a593Smuzhiyun 	u8 new_state;
2997*4882a593Smuzhiyun 
2998*4882a593Smuzhiyun 	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
2999*4882a593Smuzhiyun 		return 1;
3000*4882a593Smuzhiyun 
3001*4882a593Smuzhiyun 	if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
3002*4882a593Smuzhiyun 		new_state = OBJECT_PENDING;
3003*4882a593Smuzhiyun 	else
3004*4882a593Smuzhiyun 		new_state = OBJECT_EXISTS;
3005*4882a593Smuzhiyun 
3006*4882a593Smuzhiyun 	return rbd_object_map_update(obj_req, CEPH_NOSNAP, new_state, NULL);
3007*4882a593Smuzhiyun }
3008*4882a593Smuzhiyun 
3009*4882a593Smuzhiyun static int rbd_obj_write_object(struct rbd_obj_request *obj_req)
3010*4882a593Smuzhiyun {
3011*4882a593Smuzhiyun 	struct ceph_osd_request *osd_req;
3012*4882a593Smuzhiyun 	int num_ops = count_write_ops(obj_req);
3013*4882a593Smuzhiyun 	int which = 0;
3014*4882a593Smuzhiyun 	int ret;
3015*4882a593Smuzhiyun 
3016*4882a593Smuzhiyun 	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED)
3017*4882a593Smuzhiyun 		num_ops++; /* stat */
3018*4882a593Smuzhiyun 
3019*4882a593Smuzhiyun 	osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
3020*4882a593Smuzhiyun 	if (IS_ERR(osd_req))
3021*4882a593Smuzhiyun 		return PTR_ERR(osd_req);
3022*4882a593Smuzhiyun 
3023*4882a593Smuzhiyun 	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
3024*4882a593Smuzhiyun 		ret = rbd_osd_setup_stat(osd_req, which++);
3025*4882a593Smuzhiyun 		if (ret)
3026*4882a593Smuzhiyun 			return ret;
3027*4882a593Smuzhiyun 	}
3028*4882a593Smuzhiyun 
3029*4882a593Smuzhiyun 	rbd_osd_setup_write_ops(osd_req, which);
3030*4882a593Smuzhiyun 	rbd_osd_format_write(osd_req);
3031*4882a593Smuzhiyun 
3032*4882a593Smuzhiyun 	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
3033*4882a593Smuzhiyun 	if (ret)
3034*4882a593Smuzhiyun 		return ret;
3035*4882a593Smuzhiyun 
3036*4882a593Smuzhiyun 	rbd_osd_submit(osd_req);
3037*4882a593Smuzhiyun 	return 0;
3038*4882a593Smuzhiyun }
3039*4882a593Smuzhiyun 
3040*4882a593Smuzhiyun /*
3041*4882a593Smuzhiyun  * copyup_bvecs pages are never highmem pages
3042*4882a593Smuzhiyun  */
3043*4882a593Smuzhiyun static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
3044*4882a593Smuzhiyun {
3045*4882a593Smuzhiyun 	struct ceph_bvec_iter it = {
3046*4882a593Smuzhiyun 		.bvecs = bvecs,
3047*4882a593Smuzhiyun 		.iter = { .bi_size = bytes },
3048*4882a593Smuzhiyun 	};
3049*4882a593Smuzhiyun 
3050*4882a593Smuzhiyun 	ceph_bvec_iter_advance_step(&it, bytes, ({
3051*4882a593Smuzhiyun 		if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0,
3052*4882a593Smuzhiyun 			       bv.bv_len))
3053*4882a593Smuzhiyun 			return false;
3054*4882a593Smuzhiyun 	}));
3055*4882a593Smuzhiyun 	return true;
3056*4882a593Smuzhiyun }
3057*4882a593Smuzhiyun 
3058*4882a593Smuzhiyun #define MODS_ONLY	U32_MAX
3059*4882a593Smuzhiyun 
3060*4882a593Smuzhiyun static int rbd_obj_copyup_empty_snapc(struct rbd_obj_request *obj_req,
3061*4882a593Smuzhiyun 				      u32 bytes)
3062*4882a593Smuzhiyun {
3063*4882a593Smuzhiyun 	struct ceph_osd_request *osd_req;
3064*4882a593Smuzhiyun 	int ret;
3065*4882a593Smuzhiyun 
3066*4882a593Smuzhiyun 	dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
3067*4882a593Smuzhiyun 	rbd_assert(bytes > 0 && bytes != MODS_ONLY);
3068*4882a593Smuzhiyun 
3069*4882a593Smuzhiyun 	osd_req = __rbd_obj_add_osd_request(obj_req, &rbd_empty_snapc, 1);
3070*4882a593Smuzhiyun 	if (IS_ERR(osd_req))
3071*4882a593Smuzhiyun 		return PTR_ERR(osd_req);
3072*4882a593Smuzhiyun 
3073*4882a593Smuzhiyun 	ret = rbd_osd_setup_copyup(osd_req, 0, bytes);
3074*4882a593Smuzhiyun 	if (ret)
3075*4882a593Smuzhiyun 		return ret;
3076*4882a593Smuzhiyun 
3077*4882a593Smuzhiyun 	rbd_osd_format_write(osd_req);
3078*4882a593Smuzhiyun 
3079*4882a593Smuzhiyun 	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
3080*4882a593Smuzhiyun 	if (ret)
3081*4882a593Smuzhiyun 		return ret;
3082*4882a593Smuzhiyun 
3083*4882a593Smuzhiyun 	rbd_osd_submit(osd_req);
3084*4882a593Smuzhiyun 	return 0;
3085*4882a593Smuzhiyun }
3086*4882a593Smuzhiyun 
3087*4882a593Smuzhiyun static int rbd_obj_copyup_current_snapc(struct rbd_obj_request *obj_req,
3088*4882a593Smuzhiyun 					u32 bytes)
3089*4882a593Smuzhiyun {
3090*4882a593Smuzhiyun 	struct ceph_osd_request *osd_req;
3091*4882a593Smuzhiyun 	int num_ops = count_write_ops(obj_req);
3092*4882a593Smuzhiyun 	int which = 0;
3093*4882a593Smuzhiyun 	int ret;
3094*4882a593Smuzhiyun 
3095*4882a593Smuzhiyun 	dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
3096*4882a593Smuzhiyun 
3097*4882a593Smuzhiyun 	if (bytes != MODS_ONLY)
3098*4882a593Smuzhiyun 		num_ops++; /* copyup */
3099*4882a593Smuzhiyun 
3100*4882a593Smuzhiyun 	osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
3101*4882a593Smuzhiyun 	if (IS_ERR(osd_req))
3102*4882a593Smuzhiyun 		return PTR_ERR(osd_req);
3103*4882a593Smuzhiyun 
3104*4882a593Smuzhiyun 	if (bytes != MODS_ONLY) {
3105*4882a593Smuzhiyun 		ret = rbd_osd_setup_copyup(osd_req, which++, bytes);
3106*4882a593Smuzhiyun 		if (ret)
3107*4882a593Smuzhiyun 			return ret;
3108*4882a593Smuzhiyun 	}
3109*4882a593Smuzhiyun 
3110*4882a593Smuzhiyun 	rbd_osd_setup_write_ops(osd_req, which);
3111*4882a593Smuzhiyun 	rbd_osd_format_write(osd_req);
3112*4882a593Smuzhiyun 
3113*4882a593Smuzhiyun 	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
3114*4882a593Smuzhiyun 	if (ret)
3115*4882a593Smuzhiyun 		return ret;
3116*4882a593Smuzhiyun 
3117*4882a593Smuzhiyun 	rbd_osd_submit(osd_req);
3118*4882a593Smuzhiyun 	return 0;
3119*4882a593Smuzhiyun }
3120*4882a593Smuzhiyun 
3121*4882a593Smuzhiyun static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
3122*4882a593Smuzhiyun {
3123*4882a593Smuzhiyun 	u32 i;
3124*4882a593Smuzhiyun 
3125*4882a593Smuzhiyun 	rbd_assert(!obj_req->copyup_bvecs);
3126*4882a593Smuzhiyun 	obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
3127*4882a593Smuzhiyun 	obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
3128*4882a593Smuzhiyun 					sizeof(*obj_req->copyup_bvecs),
3129*4882a593Smuzhiyun 					GFP_NOIO);
3130*4882a593Smuzhiyun 	if (!obj_req->copyup_bvecs)
3131*4882a593Smuzhiyun 		return -ENOMEM;
3132*4882a593Smuzhiyun 
3133*4882a593Smuzhiyun 	for (i = 0; i < obj_req->copyup_bvec_count; i++) {
3134*4882a593Smuzhiyun 		unsigned int len = min(obj_overlap, (u64)PAGE_SIZE);
3135*4882a593Smuzhiyun 
3136*4882a593Smuzhiyun 		obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO);
3137*4882a593Smuzhiyun 		if (!obj_req->copyup_bvecs[i].bv_page)
3138*4882a593Smuzhiyun 			return -ENOMEM;
3139*4882a593Smuzhiyun 
3140*4882a593Smuzhiyun 		obj_req->copyup_bvecs[i].bv_offset = 0;
3141*4882a593Smuzhiyun 		obj_req->copyup_bvecs[i].bv_len = len;
3142*4882a593Smuzhiyun 		obj_overlap -= len;
3143*4882a593Smuzhiyun 	}
3144*4882a593Smuzhiyun 
3145*4882a593Smuzhiyun 	rbd_assert(!obj_overlap);
3146*4882a593Smuzhiyun 	return 0;
3147*4882a593Smuzhiyun }
3148*4882a593Smuzhiyun 
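/*
 * Example: an overlap of 10000 bytes with 4K pages gives
 * calc_pages_for(0, 10000) == 3 bvecs of 4096, 4096 and 1808 bytes.
 */
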
3149*4882a593Smuzhiyun /*
3150*4882a593Smuzhiyun  * The target object doesn't exist.  Read the data for the entire
3151*4882a593Smuzhiyun  * target object up to the overlap point (if any) from the parent,
3152*4882a593Smuzhiyun  * so we can use it for a copyup.
3153*4882a593Smuzhiyun  */
3154*4882a593Smuzhiyun static int rbd_obj_copyup_read_parent(struct rbd_obj_request *obj_req)
3155*4882a593Smuzhiyun {
3156*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3157*4882a593Smuzhiyun 	int ret;
3158*4882a593Smuzhiyun 
3159*4882a593Smuzhiyun 	rbd_assert(obj_req->num_img_extents);
3160*4882a593Smuzhiyun 	prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
3161*4882a593Smuzhiyun 		      rbd_dev->parent_overlap);
3162*4882a593Smuzhiyun 	if (!obj_req->num_img_extents) {
3163*4882a593Smuzhiyun 		/*
3164*4882a593Smuzhiyun 		 * The overlap has become 0 (most likely because the
3165*4882a593Smuzhiyun 		 * image has been flattened).  Re-submit the original write
3166*4882a593Smuzhiyun 		 * request -- pass MODS_ONLY since the copyup isn't needed
3167*4882a593Smuzhiyun 		 * anymore.
3168*4882a593Smuzhiyun 		 */
3169*4882a593Smuzhiyun 		return rbd_obj_copyup_current_snapc(obj_req, MODS_ONLY);
3170*4882a593Smuzhiyun 	}
3171*4882a593Smuzhiyun 
3172*4882a593Smuzhiyun 	ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req));
3173*4882a593Smuzhiyun 	if (ret)
3174*4882a593Smuzhiyun 		return ret;
3175*4882a593Smuzhiyun 
3176*4882a593Smuzhiyun 	return rbd_obj_read_from_parent(obj_req);
3177*4882a593Smuzhiyun }
3178*4882a593Smuzhiyun 
3179*4882a593Smuzhiyun static void rbd_obj_copyup_object_maps(struct rbd_obj_request *obj_req)
3180*4882a593Smuzhiyun {
3181*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3182*4882a593Smuzhiyun 	struct ceph_snap_context *snapc = obj_req->img_request->snapc;
3183*4882a593Smuzhiyun 	u8 new_state;
3184*4882a593Smuzhiyun 	u32 i;
3185*4882a593Smuzhiyun 	int ret;
3186*4882a593Smuzhiyun 
3187*4882a593Smuzhiyun 	rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);
3188*4882a593Smuzhiyun 
3189*4882a593Smuzhiyun 	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
3190*4882a593Smuzhiyun 		return;
3191*4882a593Smuzhiyun 
3192*4882a593Smuzhiyun 	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
3193*4882a593Smuzhiyun 		return;
3194*4882a593Smuzhiyun 
3195*4882a593Smuzhiyun 	for (i = 0; i < snapc->num_snaps; i++) {
3196*4882a593Smuzhiyun 		if ((rbd_dev->header.features & RBD_FEATURE_FAST_DIFF) &&
3197*4882a593Smuzhiyun 		    i + 1 < snapc->num_snaps)
3198*4882a593Smuzhiyun 			new_state = OBJECT_EXISTS_CLEAN;
3199*4882a593Smuzhiyun 		else
3200*4882a593Smuzhiyun 			new_state = OBJECT_EXISTS;
3201*4882a593Smuzhiyun 
3202*4882a593Smuzhiyun 		ret = rbd_object_map_update(obj_req, snapc->snaps[i],
3203*4882a593Smuzhiyun 					    new_state, NULL);
3204*4882a593Smuzhiyun 		if (ret < 0) {
3205*4882a593Smuzhiyun 			obj_req->pending.result = ret;
3206*4882a593Smuzhiyun 			return;
3207*4882a593Smuzhiyun 		}
3208*4882a593Smuzhiyun 
3209*4882a593Smuzhiyun 		rbd_assert(!ret);
3210*4882a593Smuzhiyun 		obj_req->pending.num_pending++;
3211*4882a593Smuzhiyun 	}
3212*4882a593Smuzhiyun }
3213*4882a593Smuzhiyun 
3214*4882a593Smuzhiyun static void rbd_obj_copyup_write_object(struct rbd_obj_request *obj_req)
3215*4882a593Smuzhiyun {
3216*4882a593Smuzhiyun 	u32 bytes = rbd_obj_img_extents_bytes(obj_req);
3217*4882a593Smuzhiyun 	int ret;
3218*4882a593Smuzhiyun 
3219*4882a593Smuzhiyun 	rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);
3220*4882a593Smuzhiyun 
3221*4882a593Smuzhiyun 	/*
3222*4882a593Smuzhiyun 	 * Only send non-zero copyup data to save some I/O and network
3223*4882a593Smuzhiyun 	 * bandwidth -- zero copyup data is equivalent to the object not
3224*4882a593Smuzhiyun 	 * existing.
3225*4882a593Smuzhiyun 	 */
3226*4882a593Smuzhiyun 	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
3227*4882a593Smuzhiyun 		bytes = 0;
3228*4882a593Smuzhiyun 
3229*4882a593Smuzhiyun 	if (obj_req->img_request->snapc->num_snaps && bytes > 0) {
3230*4882a593Smuzhiyun 		/*
3231*4882a593Smuzhiyun 		 * Send a copyup request with an empty snapshot context to
3232*4882a593Smuzhiyun 		 * deep-copyup the object through all existing snapshots.
3233*4882a593Smuzhiyun 		 * A second request with the current snapshot context will be
3234*4882a593Smuzhiyun 		 * sent for the actual modification.
3235*4882a593Smuzhiyun 		 */
3236*4882a593Smuzhiyun 		ret = rbd_obj_copyup_empty_snapc(obj_req, bytes);
3237*4882a593Smuzhiyun 		if (ret) {
3238*4882a593Smuzhiyun 			obj_req->pending.result = ret;
3239*4882a593Smuzhiyun 			return;
3240*4882a593Smuzhiyun 		}
3241*4882a593Smuzhiyun 
3242*4882a593Smuzhiyun 		obj_req->pending.num_pending++;
3243*4882a593Smuzhiyun 		bytes = MODS_ONLY;
3244*4882a593Smuzhiyun 	}
3245*4882a593Smuzhiyun 
3246*4882a593Smuzhiyun 	ret = rbd_obj_copyup_current_snapc(obj_req, bytes);
3247*4882a593Smuzhiyun 	if (ret) {
3248*4882a593Smuzhiyun 		obj_req->pending.result = ret;
3249*4882a593Smuzhiyun 		return;
3250*4882a593Smuzhiyun 	}
3251*4882a593Smuzhiyun 
3252*4882a593Smuzhiyun 	obj_req->pending.num_pending++;
3253*4882a593Smuzhiyun }
3254*4882a593Smuzhiyun 
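/*
 * Net effect: on an image with snapshots and non-zero parent data,
 * two requests are issued -- an empty-snapc copyup carrying the data
 * (deep-copyup into all existing snapshots) and a current-snapc
 * request carrying only the modification ops (MODS_ONLY).  Otherwise
 * a single current-snapc request carries both the copyup payload
 * (possibly empty) and the modification.
 */
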
3255*4882a593Smuzhiyun static bool rbd_obj_advance_copyup(struct rbd_obj_request *obj_req, int *result)
3256*4882a593Smuzhiyun {
3257*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3258*4882a593Smuzhiyun 	int ret;
3259*4882a593Smuzhiyun 
3260*4882a593Smuzhiyun again:
3261*4882a593Smuzhiyun 	switch (obj_req->copyup_state) {
3262*4882a593Smuzhiyun 	case RBD_OBJ_COPYUP_START:
3263*4882a593Smuzhiyun 		rbd_assert(!*result);
3264*4882a593Smuzhiyun 
3265*4882a593Smuzhiyun 		ret = rbd_obj_copyup_read_parent(obj_req);
3266*4882a593Smuzhiyun 		if (ret) {
3267*4882a593Smuzhiyun 			*result = ret;
3268*4882a593Smuzhiyun 			return true;
3269*4882a593Smuzhiyun 		}
3270*4882a593Smuzhiyun 		if (obj_req->num_img_extents)
3271*4882a593Smuzhiyun 			obj_req->copyup_state = RBD_OBJ_COPYUP_READ_PARENT;
3272*4882a593Smuzhiyun 		else
3273*4882a593Smuzhiyun 			obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
3274*4882a593Smuzhiyun 		return false;
3275*4882a593Smuzhiyun 	case RBD_OBJ_COPYUP_READ_PARENT:
3276*4882a593Smuzhiyun 		if (*result)
3277*4882a593Smuzhiyun 			return true;
3278*4882a593Smuzhiyun 
3279*4882a593Smuzhiyun 		if (is_zero_bvecs(obj_req->copyup_bvecs,
3280*4882a593Smuzhiyun 				  rbd_obj_img_extents_bytes(obj_req))) {
3281*4882a593Smuzhiyun 			dout("%s %p detected zeros\n", __func__, obj_req);
3282*4882a593Smuzhiyun 			obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ZEROS;
3283*4882a593Smuzhiyun 		}
3284*4882a593Smuzhiyun 
3285*4882a593Smuzhiyun 		rbd_obj_copyup_object_maps(obj_req);
3286*4882a593Smuzhiyun 		if (!obj_req->pending.num_pending) {
3287*4882a593Smuzhiyun 			*result = obj_req->pending.result;
3288*4882a593Smuzhiyun 			obj_req->copyup_state = RBD_OBJ_COPYUP_OBJECT_MAPS;
3289*4882a593Smuzhiyun 			goto again;
3290*4882a593Smuzhiyun 		}
3291*4882a593Smuzhiyun 		obj_req->copyup_state = __RBD_OBJ_COPYUP_OBJECT_MAPS;
3292*4882a593Smuzhiyun 		return false;
3293*4882a593Smuzhiyun 	case __RBD_OBJ_COPYUP_OBJECT_MAPS:
3294*4882a593Smuzhiyun 		if (!pending_result_dec(&obj_req->pending, result))
3295*4882a593Smuzhiyun 			return false;
3296*4882a593Smuzhiyun 		fallthrough;
3297*4882a593Smuzhiyun 	case RBD_OBJ_COPYUP_OBJECT_MAPS:
3298*4882a593Smuzhiyun 		if (*result) {
3299*4882a593Smuzhiyun 			rbd_warn(rbd_dev, "snap object map update failed: %d",
3300*4882a593Smuzhiyun 				 *result);
3301*4882a593Smuzhiyun 			return true;
3302*4882a593Smuzhiyun 		}
3303*4882a593Smuzhiyun 
3304*4882a593Smuzhiyun 		rbd_obj_copyup_write_object(obj_req);
3305*4882a593Smuzhiyun 		if (!obj_req->pending.num_pending) {
3306*4882a593Smuzhiyun 			*result = obj_req->pending.result;
3307*4882a593Smuzhiyun 			obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
3308*4882a593Smuzhiyun 			goto again;
3309*4882a593Smuzhiyun 		}
3310*4882a593Smuzhiyun 		obj_req->copyup_state = __RBD_OBJ_COPYUP_WRITE_OBJECT;
3311*4882a593Smuzhiyun 		return false;
3312*4882a593Smuzhiyun 	case __RBD_OBJ_COPYUP_WRITE_OBJECT:
3313*4882a593Smuzhiyun 		if (!pending_result_dec(&obj_req->pending, result))
3314*4882a593Smuzhiyun 			return false;
3315*4882a593Smuzhiyun 		fallthrough;
3316*4882a593Smuzhiyun 	case RBD_OBJ_COPYUP_WRITE_OBJECT:
3317*4882a593Smuzhiyun 		return true;
3318*4882a593Smuzhiyun 	default:
3319*4882a593Smuzhiyun 		BUG();
3320*4882a593Smuzhiyun 	}
3321*4882a593Smuzhiyun }
3322*4882a593Smuzhiyun 
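/*
 * Copyup state machine, roughly:
 *
 *     START -> READ_PARENT -> [__OBJECT_MAPS ->] OBJECT_MAPS
 *           -> [__WRITE_OBJECT ->] WRITE_OBJECT
 *
 * The __ states wait for all in-flight replies via
 * pending_result_dec(); their non-__ counterparts act on the
 * aggregated result.  START can go straight to WRITE_OBJECT when the
 * parent overlap has shrunk to zero (image was flattened).
 */
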
3323*4882a593Smuzhiyun /*
3324*4882a593Smuzhiyun  * Return:
3325*4882a593Smuzhiyun  *   0 - object map update sent
3326*4882a593Smuzhiyun  *   1 - object map update isn't needed
3327*4882a593Smuzhiyun  *  <0 - error
3328*4882a593Smuzhiyun  */
3329*4882a593Smuzhiyun static int rbd_obj_write_post_object_map(struct rbd_obj_request *obj_req)
3330*4882a593Smuzhiyun {
3331*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3332*4882a593Smuzhiyun 	u8 current_state = OBJECT_PENDING;
3333*4882a593Smuzhiyun 
3334*4882a593Smuzhiyun 	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
3335*4882a593Smuzhiyun 		return 1;
3336*4882a593Smuzhiyun 
3337*4882a593Smuzhiyun 	if (!(obj_req->flags & RBD_OBJ_FLAG_DELETION))
3338*4882a593Smuzhiyun 		return 1;
3339*4882a593Smuzhiyun 
3340*4882a593Smuzhiyun 	return rbd_object_map_update(obj_req, CEPH_NOSNAP, OBJECT_NONEXISTENT,
3341*4882a593Smuzhiyun 				     &current_state);
3342*4882a593Smuzhiyun }
3343*4882a593Smuzhiyun 
3344*4882a593Smuzhiyun static bool rbd_obj_advance_write(struct rbd_obj_request *obj_req, int *result)
3345*4882a593Smuzhiyun {
3346*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3347*4882a593Smuzhiyun 	int ret;
3348*4882a593Smuzhiyun 
3349*4882a593Smuzhiyun again:
3350*4882a593Smuzhiyun 	switch (obj_req->write_state) {
3351*4882a593Smuzhiyun 	case RBD_OBJ_WRITE_START:
3352*4882a593Smuzhiyun 		rbd_assert(!*result);
3353*4882a593Smuzhiyun 
3354*4882a593Smuzhiyun 		if (rbd_obj_write_is_noop(obj_req))
3355*4882a593Smuzhiyun 			return true;
3356*4882a593Smuzhiyun 
3357*4882a593Smuzhiyun 		ret = rbd_obj_write_pre_object_map(obj_req);
3358*4882a593Smuzhiyun 		if (ret < 0) {
3359*4882a593Smuzhiyun 			*result = ret;
3360*4882a593Smuzhiyun 			return true;
3361*4882a593Smuzhiyun 		}
3362*4882a593Smuzhiyun 		obj_req->write_state = RBD_OBJ_WRITE_PRE_OBJECT_MAP;
3363*4882a593Smuzhiyun 		if (ret > 0)
3364*4882a593Smuzhiyun 			goto again;
3365*4882a593Smuzhiyun 		return false;
3366*4882a593Smuzhiyun 	case RBD_OBJ_WRITE_PRE_OBJECT_MAP:
3367*4882a593Smuzhiyun 		if (*result) {
3368*4882a593Smuzhiyun 			rbd_warn(rbd_dev, "pre object map update failed: %d",
3369*4882a593Smuzhiyun 				 *result);
3370*4882a593Smuzhiyun 			return true;
3371*4882a593Smuzhiyun 		}
3372*4882a593Smuzhiyun 		ret = rbd_obj_write_object(obj_req);
3373*4882a593Smuzhiyun 		if (ret) {
3374*4882a593Smuzhiyun 			*result = ret;
3375*4882a593Smuzhiyun 			return true;
3376*4882a593Smuzhiyun 		}
3377*4882a593Smuzhiyun 		obj_req->write_state = RBD_OBJ_WRITE_OBJECT;
3378*4882a593Smuzhiyun 		return false;
3379*4882a593Smuzhiyun 	case RBD_OBJ_WRITE_OBJECT:
3380*4882a593Smuzhiyun 		if (*result == -ENOENT) {
3381*4882a593Smuzhiyun 			if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
3382*4882a593Smuzhiyun 				*result = 0;
3383*4882a593Smuzhiyun 				obj_req->copyup_state = RBD_OBJ_COPYUP_START;
3384*4882a593Smuzhiyun 				obj_req->write_state = __RBD_OBJ_WRITE_COPYUP;
3385*4882a593Smuzhiyun 				goto again;
3386*4882a593Smuzhiyun 			}
3387*4882a593Smuzhiyun 			/*
3388*4882a593Smuzhiyun 			 * On a non-existent object:
3389*4882a593Smuzhiyun 			 *   delete - -ENOENT, truncate/zero - 0
3390*4882a593Smuzhiyun 			 */
3391*4882a593Smuzhiyun 			if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
3392*4882a593Smuzhiyun 				*result = 0;
3393*4882a593Smuzhiyun 		}
3394*4882a593Smuzhiyun 		if (*result)
3395*4882a593Smuzhiyun 			return true;
3396*4882a593Smuzhiyun 
3397*4882a593Smuzhiyun 		obj_req->write_state = RBD_OBJ_WRITE_COPYUP;
3398*4882a593Smuzhiyun 		goto again;
3399*4882a593Smuzhiyun 	case __RBD_OBJ_WRITE_COPYUP:
3400*4882a593Smuzhiyun 		if (!rbd_obj_advance_copyup(obj_req, result))
3401*4882a593Smuzhiyun 			return false;
3402*4882a593Smuzhiyun 		fallthrough;
3403*4882a593Smuzhiyun 	case RBD_OBJ_WRITE_COPYUP:
3404*4882a593Smuzhiyun 		if (*result) {
3405*4882a593Smuzhiyun 			rbd_warn(rbd_dev, "copyup failed: %d", *result);
3406*4882a593Smuzhiyun 			return true;
3407*4882a593Smuzhiyun 		}
3408*4882a593Smuzhiyun 		ret = rbd_obj_write_post_object_map(obj_req);
3409*4882a593Smuzhiyun 		if (ret < 0) {
3410*4882a593Smuzhiyun 			*result = ret;
3411*4882a593Smuzhiyun 			return true;
3412*4882a593Smuzhiyun 		}
3413*4882a593Smuzhiyun 		obj_req->write_state = RBD_OBJ_WRITE_POST_OBJECT_MAP;
3414*4882a593Smuzhiyun 		if (ret > 0)
3415*4882a593Smuzhiyun 			goto again;
3416*4882a593Smuzhiyun 		return false;
3417*4882a593Smuzhiyun 	case RBD_OBJ_WRITE_POST_OBJECT_MAP:
3418*4882a593Smuzhiyun 		if (*result)
3419*4882a593Smuzhiyun 			rbd_warn(rbd_dev, "post object map update failed: %d",
3420*4882a593Smuzhiyun 				 *result);
3421*4882a593Smuzhiyun 		return true;
3422*4882a593Smuzhiyun 	default:
3423*4882a593Smuzhiyun 		BUG();
3424*4882a593Smuzhiyun 	}
3425*4882a593Smuzhiyun }
3426*4882a593Smuzhiyun 
3427*4882a593Smuzhiyun /*
3428*4882a593Smuzhiyun  * Return true if @obj_req is completed.
3429*4882a593Smuzhiyun  */
3430*4882a593Smuzhiyun static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req,
3431*4882a593Smuzhiyun 				     int *result)
3432*4882a593Smuzhiyun {
3433*4882a593Smuzhiyun 	struct rbd_img_request *img_req = obj_req->img_request;
3434*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3435*4882a593Smuzhiyun 	bool done;
3436*4882a593Smuzhiyun 
3437*4882a593Smuzhiyun 	mutex_lock(&obj_req->state_mutex);
3438*4882a593Smuzhiyun 	if (!rbd_img_is_write(img_req))
3439*4882a593Smuzhiyun 		done = rbd_obj_advance_read(obj_req, result);
3440*4882a593Smuzhiyun 	else
3441*4882a593Smuzhiyun 		done = rbd_obj_advance_write(obj_req, result);
3442*4882a593Smuzhiyun 	mutex_unlock(&obj_req->state_mutex);
3443*4882a593Smuzhiyun 
3444*4882a593Smuzhiyun 	if (done && *result) {
3445*4882a593Smuzhiyun 		rbd_assert(*result < 0);
3446*4882a593Smuzhiyun 		rbd_warn(rbd_dev, "%s at objno %llu %llu~%llu result %d",
3447*4882a593Smuzhiyun 			 obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
3448*4882a593Smuzhiyun 			 obj_req->ex.oe_off, obj_req->ex.oe_len, *result);
3449*4882a593Smuzhiyun 	}
3450*4882a593Smuzhiyun 	return done;
3451*4882a593Smuzhiyun }
3452*4882a593Smuzhiyun 
3453*4882a593Smuzhiyun /*
3454*4882a593Smuzhiyun  * This is open-coded in rbd_img_handle_request() to avoid parent chain
3455*4882a593Smuzhiyun  * recursion.
3456*4882a593Smuzhiyun  */
3457*4882a593Smuzhiyun static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result)
3458*4882a593Smuzhiyun {
3459*4882a593Smuzhiyun 	if (__rbd_obj_handle_request(obj_req, &result))
3460*4882a593Smuzhiyun 		rbd_img_handle_request(obj_req->img_request, result);
3461*4882a593Smuzhiyun }
3462*4882a593Smuzhiyun 
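/*
 * Determine whether @img_req needs the exclusive lock: the feature must
 * be enabled and the mapping writable, and either the request is a
 * write, lock_on_read was specified, or the object map (which is
 * guarded by the lock) is in use.
 */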
3463*4882a593Smuzhiyun static bool need_exclusive_lock(struct rbd_img_request *img_req)
3464*4882a593Smuzhiyun {
3465*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3466*4882a593Smuzhiyun 
3467*4882a593Smuzhiyun 	if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK))
3468*4882a593Smuzhiyun 		return false;
3469*4882a593Smuzhiyun 
3470*4882a593Smuzhiyun 	if (rbd_is_ro(rbd_dev))
3471*4882a593Smuzhiyun 		return false;
3472*4882a593Smuzhiyun 
3473*4882a593Smuzhiyun 	rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
3474*4882a593Smuzhiyun 	if (rbd_dev->opts->lock_on_read ||
3475*4882a593Smuzhiyun 	    (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
3476*4882a593Smuzhiyun 		return true;
3477*4882a593Smuzhiyun 
3478*4882a593Smuzhiyun 	return rbd_img_is_write(img_req);
3479*4882a593Smuzhiyun }
3480*4882a593Smuzhiyun 
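/*
 * Add @img_req to the acquiring or running list, depending on whether
 * the exclusive lock is currently held.  Returns true if it is.
 */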
3481*4882a593Smuzhiyun static bool rbd_lock_add_request(struct rbd_img_request *img_req)
3482*4882a593Smuzhiyun {
3483*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3484*4882a593Smuzhiyun 	bool locked;
3485*4882a593Smuzhiyun 
3486*4882a593Smuzhiyun 	lockdep_assert_held(&rbd_dev->lock_rwsem);
3487*4882a593Smuzhiyun 	locked = rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED;
3488*4882a593Smuzhiyun 	spin_lock(&rbd_dev->lock_lists_lock);
3489*4882a593Smuzhiyun 	rbd_assert(list_empty(&img_req->lock_item));
3490*4882a593Smuzhiyun 	if (!locked)
3491*4882a593Smuzhiyun 		list_add_tail(&img_req->lock_item, &rbd_dev->acquiring_list);
3492*4882a593Smuzhiyun 	else
3493*4882a593Smuzhiyun 		list_add_tail(&img_req->lock_item, &rbd_dev->running_list);
3494*4882a593Smuzhiyun 	spin_unlock(&rbd_dev->lock_lists_lock);
3495*4882a593Smuzhiyun 	return locked;
3496*4882a593Smuzhiyun }
3497*4882a593Smuzhiyun 
3498*4882a593Smuzhiyun static void rbd_lock_del_request(struct rbd_img_request *img_req)
3499*4882a593Smuzhiyun {
3500*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3501*4882a593Smuzhiyun 	bool need_wakeup;
3502*4882a593Smuzhiyun 
3503*4882a593Smuzhiyun 	lockdep_assert_held(&rbd_dev->lock_rwsem);
3504*4882a593Smuzhiyun 	spin_lock(&rbd_dev->lock_lists_lock);
3505*4882a593Smuzhiyun 	rbd_assert(!list_empty(&img_req->lock_item));
3506*4882a593Smuzhiyun 	list_del_init(&img_req->lock_item);
3507*4882a593Smuzhiyun 	need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING &&
3508*4882a593Smuzhiyun 		       list_empty(&rbd_dev->running_list));
3509*4882a593Smuzhiyun 	spin_unlock(&rbd_dev->lock_lists_lock);
3510*4882a593Smuzhiyun 	if (need_wakeup)
3511*4882a593Smuzhiyun 		complete(&rbd_dev->releasing_wait);
3512*4882a593Smuzhiyun }
3513*4882a593Smuzhiyun 
3514*4882a593Smuzhiyun static int rbd_img_exclusive_lock(struct rbd_img_request *img_req)
3515*4882a593Smuzhiyun {
3516*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3517*4882a593Smuzhiyun 
3518*4882a593Smuzhiyun 	if (!need_exclusive_lock(img_req))
3519*4882a593Smuzhiyun 		return 1;
3520*4882a593Smuzhiyun 
3521*4882a593Smuzhiyun 	if (rbd_lock_add_request(img_req))
3522*4882a593Smuzhiyun 		return 1;
3523*4882a593Smuzhiyun 
3524*4882a593Smuzhiyun 	if (rbd_dev->opts->exclusive) {
3525*4882a593Smuzhiyun 		WARN_ON(1); /* lock got released? */
3526*4882a593Smuzhiyun 		return -EROFS;
3527*4882a593Smuzhiyun 	}
3528*4882a593Smuzhiyun 
3529*4882a593Smuzhiyun 	/*
3530*4882a593Smuzhiyun 	 * Note the use of mod_delayed_work() in rbd_acquire_lock()
3531*4882a593Smuzhiyun 	 * and cancel_delayed_work() in wake_lock_waiters().
3532*4882a593Smuzhiyun 	 */
3533*4882a593Smuzhiyun 	dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
3534*4882a593Smuzhiyun 	queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
3535*4882a593Smuzhiyun 	return 0;
3536*4882a593Smuzhiyun }
3537*4882a593Smuzhiyun 
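/*
 * Start the state machine for each object request.  A request that
 * doesn't complete immediately bumps pending.num_pending; the first
 * immediate error short-circuits the rest.
 */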
3538*4882a593Smuzhiyun static void rbd_img_object_requests(struct rbd_img_request *img_req)
3539*4882a593Smuzhiyun {
3540*4882a593Smuzhiyun 	struct rbd_obj_request *obj_req;
3541*4882a593Smuzhiyun 
3542*4882a593Smuzhiyun 	rbd_assert(!img_req->pending.result && !img_req->pending.num_pending);
3543*4882a593Smuzhiyun 
3544*4882a593Smuzhiyun 	for_each_obj_request(img_req, obj_req) {
3545*4882a593Smuzhiyun 		int result = 0;
3546*4882a593Smuzhiyun 
3547*4882a593Smuzhiyun 		if (__rbd_obj_handle_request(obj_req, &result)) {
3548*4882a593Smuzhiyun 			if (result) {
3549*4882a593Smuzhiyun 				img_req->pending.result = result;
3550*4882a593Smuzhiyun 				return;
3551*4882a593Smuzhiyun 			}
3552*4882a593Smuzhiyun 		} else {
3553*4882a593Smuzhiyun 			img_req->pending.num_pending++;
3554*4882a593Smuzhiyun 		}
3555*4882a593Smuzhiyun 	}
3556*4882a593Smuzhiyun }
3557*4882a593Smuzhiyun 
3558*4882a593Smuzhiyun static bool rbd_img_advance(struct rbd_img_request *img_req, int *result)
3559*4882a593Smuzhiyun {
3560*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3561*4882a593Smuzhiyun 	int ret;
3562*4882a593Smuzhiyun 
3563*4882a593Smuzhiyun again:
3564*4882a593Smuzhiyun 	switch (img_req->state) {
3565*4882a593Smuzhiyun 	case RBD_IMG_START:
3566*4882a593Smuzhiyun 		rbd_assert(!*result);
3567*4882a593Smuzhiyun 
3568*4882a593Smuzhiyun 		ret = rbd_img_exclusive_lock(img_req);
3569*4882a593Smuzhiyun 		if (ret < 0) {
3570*4882a593Smuzhiyun 			*result = ret;
3571*4882a593Smuzhiyun 			return true;
3572*4882a593Smuzhiyun 		}
3573*4882a593Smuzhiyun 		img_req->state = RBD_IMG_EXCLUSIVE_LOCK;
3574*4882a593Smuzhiyun 		if (ret > 0)
3575*4882a593Smuzhiyun 			goto again;
3576*4882a593Smuzhiyun 		return false;
3577*4882a593Smuzhiyun 	case RBD_IMG_EXCLUSIVE_LOCK:
3578*4882a593Smuzhiyun 		if (*result)
3579*4882a593Smuzhiyun 			return true;
3580*4882a593Smuzhiyun 
3581*4882a593Smuzhiyun 		rbd_assert(!need_exclusive_lock(img_req) ||
3582*4882a593Smuzhiyun 			   __rbd_is_lock_owner(rbd_dev));
3583*4882a593Smuzhiyun 
3584*4882a593Smuzhiyun 		rbd_img_object_requests(img_req);
3585*4882a593Smuzhiyun 		if (!img_req->pending.num_pending) {
3586*4882a593Smuzhiyun 			*result = img_req->pending.result;
3587*4882a593Smuzhiyun 			img_req->state = RBD_IMG_OBJECT_REQUESTS;
3588*4882a593Smuzhiyun 			goto again;
3589*4882a593Smuzhiyun 		}
3590*4882a593Smuzhiyun 		img_req->state = __RBD_IMG_OBJECT_REQUESTS;
3591*4882a593Smuzhiyun 		return false;
3592*4882a593Smuzhiyun 	case __RBD_IMG_OBJECT_REQUESTS:
3593*4882a593Smuzhiyun 		if (!pending_result_dec(&img_req->pending, result))
3594*4882a593Smuzhiyun 			return false;
3595*4882a593Smuzhiyun 		fallthrough;
3596*4882a593Smuzhiyun 	case RBD_IMG_OBJECT_REQUESTS:
3597*4882a593Smuzhiyun 		return true;
3598*4882a593Smuzhiyun 	default:
3599*4882a593Smuzhiyun 		BUG();
3600*4882a593Smuzhiyun 	}
3601*4882a593Smuzhiyun }
3602*4882a593Smuzhiyun 
3603*4882a593Smuzhiyun /*
3604*4882a593Smuzhiyun  * Return true if @img_req is completed.
3605*4882a593Smuzhiyun  */
3606*4882a593Smuzhiyun static bool __rbd_img_handle_request(struct rbd_img_request *img_req,
3607*4882a593Smuzhiyun 				     int *result)
3608*4882a593Smuzhiyun {
3609*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = img_req->rbd_dev;
3610*4882a593Smuzhiyun 	bool done;
3611*4882a593Smuzhiyun 
3612*4882a593Smuzhiyun 	if (need_exclusive_lock(img_req)) {
3613*4882a593Smuzhiyun 		down_read(&rbd_dev->lock_rwsem);
3614*4882a593Smuzhiyun 		mutex_lock(&img_req->state_mutex);
3615*4882a593Smuzhiyun 		done = rbd_img_advance(img_req, result);
3616*4882a593Smuzhiyun 		if (done)
3617*4882a593Smuzhiyun 			rbd_lock_del_request(img_req);
3618*4882a593Smuzhiyun 		mutex_unlock(&img_req->state_mutex);
3619*4882a593Smuzhiyun 		up_read(&rbd_dev->lock_rwsem);
3620*4882a593Smuzhiyun 	} else {
3621*4882a593Smuzhiyun 		mutex_lock(&img_req->state_mutex);
3622*4882a593Smuzhiyun 		done = rbd_img_advance(img_req, result);
3623*4882a593Smuzhiyun 		mutex_unlock(&img_req->state_mutex);
3624*4882a593Smuzhiyun 	}
3625*4882a593Smuzhiyun 
3626*4882a593Smuzhiyun 	if (done && *result) {
3627*4882a593Smuzhiyun 		rbd_assert(*result < 0);
3628*4882a593Smuzhiyun 		rbd_warn(rbd_dev, "%s%s result %d",
3629*4882a593Smuzhiyun 		      test_bit(IMG_REQ_CHILD, &img_req->flags) ? "child " : "",
3630*4882a593Smuzhiyun 		      obj_op_name(img_req->op_type), *result);
3631*4882a593Smuzhiyun 	}
3632*4882a593Smuzhiyun 	return done;
3633*4882a593Smuzhiyun }
3634*4882a593Smuzhiyun 
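/*
 * Drive @img_req to completion.  A completed child image request feeds
 * its result back into the parent object request iteratively (rather
 * than recursively, see the comment above rbd_obj_handle_request());
 * a completed top-level request ends the corresponding block request.
 */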
3635*4882a593Smuzhiyun static void rbd_img_handle_request(struct rbd_img_request *img_req, int result)
3636*4882a593Smuzhiyun {
3637*4882a593Smuzhiyun again:
3638*4882a593Smuzhiyun 	if (!__rbd_img_handle_request(img_req, &result))
3639*4882a593Smuzhiyun 		return;
3640*4882a593Smuzhiyun 
3641*4882a593Smuzhiyun 	if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
3642*4882a593Smuzhiyun 		struct rbd_obj_request *obj_req = img_req->obj_request;
3643*4882a593Smuzhiyun 
3644*4882a593Smuzhiyun 		rbd_img_request_destroy(img_req);
3645*4882a593Smuzhiyun 		if (__rbd_obj_handle_request(obj_req, &result)) {
3646*4882a593Smuzhiyun 			img_req = obj_req->img_request;
3647*4882a593Smuzhiyun 			goto again;
3648*4882a593Smuzhiyun 		}
3649*4882a593Smuzhiyun 	} else {
3650*4882a593Smuzhiyun 		struct request *rq = blk_mq_rq_from_pdu(img_req);
3651*4882a593Smuzhiyun 
3652*4882a593Smuzhiyun 		rbd_img_request_destroy(img_req);
3653*4882a593Smuzhiyun 		blk_mq_end_request(rq, errno_to_blk_status(result));
3654*4882a593Smuzhiyun 	}
3655*4882a593Smuzhiyun }
3656*4882a593Smuzhiyun 
3657*4882a593Smuzhiyun static const struct rbd_client_id rbd_empty_cid;
3658*4882a593Smuzhiyun 
3659*4882a593Smuzhiyun static bool rbd_cid_equal(const struct rbd_client_id *lhs,
3660*4882a593Smuzhiyun 			  const struct rbd_client_id *rhs)
3661*4882a593Smuzhiyun {
3662*4882a593Smuzhiyun 	return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
3663*4882a593Smuzhiyun }
3664*4882a593Smuzhiyun 
3665*4882a593Smuzhiyun static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
3666*4882a593Smuzhiyun {
3667*4882a593Smuzhiyun 	struct rbd_client_id cid;
3668*4882a593Smuzhiyun 
3669*4882a593Smuzhiyun 	mutex_lock(&rbd_dev->watch_mutex);
3670*4882a593Smuzhiyun 	cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
3671*4882a593Smuzhiyun 	cid.handle = rbd_dev->watch_cookie;
3672*4882a593Smuzhiyun 	mutex_unlock(&rbd_dev->watch_mutex);
3673*4882a593Smuzhiyun 	return cid;
3674*4882a593Smuzhiyun }
3675*4882a593Smuzhiyun 
3676*4882a593Smuzhiyun /*
3677*4882a593Smuzhiyun  * lock_rwsem must be held for write
3678*4882a593Smuzhiyun  */
3679*4882a593Smuzhiyun static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
3680*4882a593Smuzhiyun 			      const struct rbd_client_id *cid)
3681*4882a593Smuzhiyun {
3682*4882a593Smuzhiyun 	dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
3683*4882a593Smuzhiyun 	     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
3684*4882a593Smuzhiyun 	     cid->gid, cid->handle);
3685*4882a593Smuzhiyun 	rbd_dev->owner_cid = *cid; /* struct */
3686*4882a593Smuzhiyun }
3687*4882a593Smuzhiyun 
3688*4882a593Smuzhiyun static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
3689*4882a593Smuzhiyun {
3690*4882a593Smuzhiyun 	mutex_lock(&rbd_dev->watch_mutex);
3691*4882a593Smuzhiyun 	sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
3692*4882a593Smuzhiyun 	mutex_unlock(&rbd_dev->watch_mutex);
3693*4882a593Smuzhiyun }
3694*4882a593Smuzhiyun 
3695*4882a593Smuzhiyun static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
3696*4882a593Smuzhiyun {
3697*4882a593Smuzhiyun 	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3698*4882a593Smuzhiyun 
3699*4882a593Smuzhiyun 	rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
3700*4882a593Smuzhiyun 	strcpy(rbd_dev->lock_cookie, cookie);
3701*4882a593Smuzhiyun 	rbd_set_owner_cid(rbd_dev, &cid);
3702*4882a593Smuzhiyun 	queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
3703*4882a593Smuzhiyun }
3704*4882a593Smuzhiyun 
3705*4882a593Smuzhiyun /*
3706*4882a593Smuzhiyun  * lock_rwsem must be held for write
3707*4882a593Smuzhiyun  */
3708*4882a593Smuzhiyun static int rbd_lock(struct rbd_device *rbd_dev)
3709*4882a593Smuzhiyun {
3710*4882a593Smuzhiyun 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3711*4882a593Smuzhiyun 	char cookie[32];
3712*4882a593Smuzhiyun 	int ret;
3713*4882a593Smuzhiyun 
3714*4882a593Smuzhiyun 	WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
3715*4882a593Smuzhiyun 		rbd_dev->lock_cookie[0] != '\0');
3716*4882a593Smuzhiyun 
3717*4882a593Smuzhiyun 	format_lock_cookie(rbd_dev, cookie);
3718*4882a593Smuzhiyun 	ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3719*4882a593Smuzhiyun 			    RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
3720*4882a593Smuzhiyun 			    RBD_LOCK_TAG, "", 0);
3721*4882a593Smuzhiyun 	if (ret)
3722*4882a593Smuzhiyun 		return ret;
3723*4882a593Smuzhiyun 
3724*4882a593Smuzhiyun 	__rbd_lock(rbd_dev, cookie);
3725*4882a593Smuzhiyun 	return 0;
3726*4882a593Smuzhiyun }
3727*4882a593Smuzhiyun 
3728*4882a593Smuzhiyun /*
3729*4882a593Smuzhiyun  * lock_rwsem must be held for write
3730*4882a593Smuzhiyun  */
3731*4882a593Smuzhiyun static void rbd_unlock(struct rbd_device *rbd_dev)
3732*4882a593Smuzhiyun {
3733*4882a593Smuzhiyun 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3734*4882a593Smuzhiyun 	int ret;
3735*4882a593Smuzhiyun 
3736*4882a593Smuzhiyun 	WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
3737*4882a593Smuzhiyun 		rbd_dev->lock_cookie[0] == '\0');
3738*4882a593Smuzhiyun 
3739*4882a593Smuzhiyun 	ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3740*4882a593Smuzhiyun 			      RBD_LOCK_NAME, rbd_dev->lock_cookie);
3741*4882a593Smuzhiyun 	if (ret && ret != -ENOENT)
3742*4882a593Smuzhiyun 		rbd_warn(rbd_dev, "failed to unlock header: %d", ret);
3743*4882a593Smuzhiyun 
3744*4882a593Smuzhiyun 	/* treat errors as if the image were unlocked */
3745*4882a593Smuzhiyun 	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
3746*4882a593Smuzhiyun 	rbd_dev->lock_cookie[0] = '\0';
3747*4882a593Smuzhiyun 	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3748*4882a593Smuzhiyun 	queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
3749*4882a593Smuzhiyun }
3750*4882a593Smuzhiyun 
3751*4882a593Smuzhiyun static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
3752*4882a593Smuzhiyun 				enum rbd_notify_op notify_op,
3753*4882a593Smuzhiyun 				struct page ***preply_pages,
3754*4882a593Smuzhiyun 				size_t *preply_len)
3755*4882a593Smuzhiyun {
3756*4882a593Smuzhiyun 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3757*4882a593Smuzhiyun 	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3758*4882a593Smuzhiyun 	char buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN];
3759*4882a593Smuzhiyun 	int buf_size = sizeof(buf);
3760*4882a593Smuzhiyun 	void *p = buf;
3761*4882a593Smuzhiyun 
3762*4882a593Smuzhiyun 	dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);
3763*4882a593Smuzhiyun 
3764*4882a593Smuzhiyun 	/* encode *LockPayload NotifyMessage (op + ClientId) */
3765*4882a593Smuzhiyun 	ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
3766*4882a593Smuzhiyun 	ceph_encode_32(&p, notify_op);
3767*4882a593Smuzhiyun 	ceph_encode_64(&p, cid.gid);
3768*4882a593Smuzhiyun 	ceph_encode_64(&p, cid.handle);
3769*4882a593Smuzhiyun 
3770*4882a593Smuzhiyun 	return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
3771*4882a593Smuzhiyun 				&rbd_dev->header_oloc, buf, buf_size,
3772*4882a593Smuzhiyun 				RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
3773*4882a593Smuzhiyun }
3774*4882a593Smuzhiyun 
3775*4882a593Smuzhiyun static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
3776*4882a593Smuzhiyun 			       enum rbd_notify_op notify_op)
3777*4882a593Smuzhiyun {
3778*4882a593Smuzhiyun 	__rbd_notify_op_lock(rbd_dev, notify_op, NULL, NULL);
3779*4882a593Smuzhiyun }
3780*4882a593Smuzhiyun 
3781*4882a593Smuzhiyun static void rbd_notify_acquired_lock(struct work_struct *work)
3782*4882a593Smuzhiyun {
3783*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3784*4882a593Smuzhiyun 						  acquired_lock_work);
3785*4882a593Smuzhiyun 
3786*4882a593Smuzhiyun 	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
3787*4882a593Smuzhiyun }
3788*4882a593Smuzhiyun 
3789*4882a593Smuzhiyun static void rbd_notify_released_lock(struct work_struct *work)
3790*4882a593Smuzhiyun {
3791*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3792*4882a593Smuzhiyun 						  released_lock_work);
3793*4882a593Smuzhiyun 
3794*4882a593Smuzhiyun 	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
3795*4882a593Smuzhiyun }
3796*4882a593Smuzhiyun 
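/*
 * Ask the current lock owner to release the lock and decode its
 * ResponseMessage.  Returns the result encoded by the owner (0 if it
 * agreed to release, -EROFS if it refused), -ETIMEDOUT if no owner
 * responded, or another error.
 */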
3797*4882a593Smuzhiyun static int rbd_request_lock(struct rbd_device *rbd_dev)
3798*4882a593Smuzhiyun {
3799*4882a593Smuzhiyun 	struct page **reply_pages;
3800*4882a593Smuzhiyun 	size_t reply_len;
3801*4882a593Smuzhiyun 	bool lock_owner_responded = false;
3802*4882a593Smuzhiyun 	int ret;
3803*4882a593Smuzhiyun 
3804*4882a593Smuzhiyun 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3805*4882a593Smuzhiyun 
3806*4882a593Smuzhiyun 	ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
3807*4882a593Smuzhiyun 				   &reply_pages, &reply_len);
3808*4882a593Smuzhiyun 	if (ret && ret != -ETIMEDOUT) {
3809*4882a593Smuzhiyun 		rbd_warn(rbd_dev, "failed to request lock: %d", ret);
3810*4882a593Smuzhiyun 		goto out;
3811*4882a593Smuzhiyun 	}
3812*4882a593Smuzhiyun 
3813*4882a593Smuzhiyun 	if (reply_len > 0 && reply_len <= PAGE_SIZE) {
3814*4882a593Smuzhiyun 		void *p = page_address(reply_pages[0]);
3815*4882a593Smuzhiyun 		void *const end = p + reply_len;
3816*4882a593Smuzhiyun 		u32 n;
3817*4882a593Smuzhiyun 
3818*4882a593Smuzhiyun 		ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
3819*4882a593Smuzhiyun 		while (n--) {
3820*4882a593Smuzhiyun 			u8 struct_v;
3821*4882a593Smuzhiyun 			u32 len;
3822*4882a593Smuzhiyun 
3823*4882a593Smuzhiyun 			ceph_decode_need(&p, end, 8 + 8, e_inval);
3824*4882a593Smuzhiyun 			p += 8 + 8; /* skip gid and cookie */
3825*4882a593Smuzhiyun 
3826*4882a593Smuzhiyun 			ceph_decode_32_safe(&p, end, len, e_inval);
3827*4882a593Smuzhiyun 			if (!len)
3828*4882a593Smuzhiyun 				continue;
3829*4882a593Smuzhiyun 
3830*4882a593Smuzhiyun 			if (lock_owner_responded) {
3831*4882a593Smuzhiyun 				rbd_warn(rbd_dev,
3832*4882a593Smuzhiyun 					 "duplicate lock owners detected");
3833*4882a593Smuzhiyun 				ret = -EIO;
3834*4882a593Smuzhiyun 				goto out;
3835*4882a593Smuzhiyun 			}
3836*4882a593Smuzhiyun 
3837*4882a593Smuzhiyun 			lock_owner_responded = true;
3838*4882a593Smuzhiyun 			ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
3839*4882a593Smuzhiyun 						  &struct_v, &len);
3840*4882a593Smuzhiyun 			if (ret) {
3841*4882a593Smuzhiyun 				rbd_warn(rbd_dev,
3842*4882a593Smuzhiyun 					 "failed to decode ResponseMessage: %d",
3843*4882a593Smuzhiyun 					 ret);
3844*4882a593Smuzhiyun 				goto e_inval;
3845*4882a593Smuzhiyun 			}
3846*4882a593Smuzhiyun 
3847*4882a593Smuzhiyun 			ret = ceph_decode_32(&p);
3848*4882a593Smuzhiyun 		}
3849*4882a593Smuzhiyun 	}
3850*4882a593Smuzhiyun 
3851*4882a593Smuzhiyun 	if (!lock_owner_responded) {
3852*4882a593Smuzhiyun 		rbd_warn(rbd_dev, "no lock owners detected");
3853*4882a593Smuzhiyun 		ret = -ETIMEDOUT;
3854*4882a593Smuzhiyun 	}
3855*4882a593Smuzhiyun 
3856*4882a593Smuzhiyun out:
3857*4882a593Smuzhiyun 	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3858*4882a593Smuzhiyun 	return ret;
3859*4882a593Smuzhiyun 
3860*4882a593Smuzhiyun e_inval:
3861*4882a593Smuzhiyun 	ret = -EINVAL;
3862*4882a593Smuzhiyun 	goto out;
3863*4882a593Smuzhiyun }
3864*4882a593Smuzhiyun 
3865*4882a593Smuzhiyun /*
3866*4882a593Smuzhiyun  * Called from either the image request state machine(s) or
3867*4882a593Smuzhiyun  * rbd_add_acquire_lock() (i.e. "rbd map").
3868*4882a593Smuzhiyun  */
3869*4882a593Smuzhiyun static void wake_lock_waiters(struct rbd_device *rbd_dev, int result)
3870*4882a593Smuzhiyun {
3871*4882a593Smuzhiyun 	struct rbd_img_request *img_req;
3872*4882a593Smuzhiyun 
3873*4882a593Smuzhiyun 	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
3874*4882a593Smuzhiyun 	lockdep_assert_held_write(&rbd_dev->lock_rwsem);
3875*4882a593Smuzhiyun 
3876*4882a593Smuzhiyun 	cancel_delayed_work(&rbd_dev->lock_dwork);
3877*4882a593Smuzhiyun 	if (!completion_done(&rbd_dev->acquire_wait)) {
3878*4882a593Smuzhiyun 		rbd_assert(list_empty(&rbd_dev->acquiring_list) &&
3879*4882a593Smuzhiyun 			   list_empty(&rbd_dev->running_list));
3880*4882a593Smuzhiyun 		rbd_dev->acquire_err = result;
3881*4882a593Smuzhiyun 		complete_all(&rbd_dev->acquire_wait);
3882*4882a593Smuzhiyun 		return;
3883*4882a593Smuzhiyun 	}
3884*4882a593Smuzhiyun 
3885*4882a593Smuzhiyun 	list_for_each_entry(img_req, &rbd_dev->acquiring_list, lock_item) {
3886*4882a593Smuzhiyun 		mutex_lock(&img_req->state_mutex);
3887*4882a593Smuzhiyun 		rbd_assert(img_req->state == RBD_IMG_EXCLUSIVE_LOCK);
3888*4882a593Smuzhiyun 		rbd_img_schedule(img_req, result);
3889*4882a593Smuzhiyun 		mutex_unlock(&img_req->state_mutex);
3890*4882a593Smuzhiyun 	}
3891*4882a593Smuzhiyun 
3892*4882a593Smuzhiyun 	list_splice_tail_init(&rbd_dev->acquiring_list, &rbd_dev->running_list);
3893*4882a593Smuzhiyun }
3894*4882a593Smuzhiyun 
3895*4882a593Smuzhiyun static int get_lock_owner_info(struct rbd_device *rbd_dev,
3896*4882a593Smuzhiyun 			       struct ceph_locker **lockers, u32 *num_lockers)
3897*4882a593Smuzhiyun {
3898*4882a593Smuzhiyun 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3899*4882a593Smuzhiyun 	u8 lock_type;
3900*4882a593Smuzhiyun 	char *lock_tag;
3901*4882a593Smuzhiyun 	int ret;
3902*4882a593Smuzhiyun 
3903*4882a593Smuzhiyun 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
3904*4882a593Smuzhiyun 
3905*4882a593Smuzhiyun 	ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
3906*4882a593Smuzhiyun 				 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3907*4882a593Smuzhiyun 				 &lock_type, &lock_tag, lockers, num_lockers);
3908*4882a593Smuzhiyun 	if (ret)
3909*4882a593Smuzhiyun 		return ret;
3910*4882a593Smuzhiyun 
3911*4882a593Smuzhiyun 	if (*num_lockers == 0) {
3912*4882a593Smuzhiyun 		dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
3913*4882a593Smuzhiyun 		goto out;
3914*4882a593Smuzhiyun 	}
3915*4882a593Smuzhiyun 
3916*4882a593Smuzhiyun 	if (strcmp(lock_tag, RBD_LOCK_TAG)) {
3917*4882a593Smuzhiyun 		rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
3918*4882a593Smuzhiyun 			 lock_tag);
3919*4882a593Smuzhiyun 		ret = -EBUSY;
3920*4882a593Smuzhiyun 		goto out;
3921*4882a593Smuzhiyun 	}
3922*4882a593Smuzhiyun 
3923*4882a593Smuzhiyun 	if (lock_type == CEPH_CLS_LOCK_SHARED) {
3924*4882a593Smuzhiyun 		rbd_warn(rbd_dev, "shared lock type detected");
3925*4882a593Smuzhiyun 		ret = -EBUSY;
3926*4882a593Smuzhiyun 		goto out;
3927*4882a593Smuzhiyun 	}
3928*4882a593Smuzhiyun 
3929*4882a593Smuzhiyun 	if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
3930*4882a593Smuzhiyun 		    strlen(RBD_LOCK_COOKIE_PREFIX))) {
3931*4882a593Smuzhiyun 		rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
3932*4882a593Smuzhiyun 			 (*lockers)[0].id.cookie);
3933*4882a593Smuzhiyun 		ret = -EBUSY;
3934*4882a593Smuzhiyun 		goto out;
3935*4882a593Smuzhiyun 	}
3936*4882a593Smuzhiyun 
3937*4882a593Smuzhiyun out:
3938*4882a593Smuzhiyun 	kfree(lock_tag);
3939*4882a593Smuzhiyun 	return ret;
3940*4882a593Smuzhiyun }
3941*4882a593Smuzhiyun 
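/*
 * Check whether @locker is still watching the header object, i.e.
 * whether it is still alive.  Returns 1 and records the owner cid if a
 * matching watcher is found, 0 if not, or a negative error.
 */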
3942*4882a593Smuzhiyun static int find_watcher(struct rbd_device *rbd_dev,
3943*4882a593Smuzhiyun 			const struct ceph_locker *locker)
3944*4882a593Smuzhiyun {
3945*4882a593Smuzhiyun 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3946*4882a593Smuzhiyun 	struct ceph_watch_item *watchers;
3947*4882a593Smuzhiyun 	u32 num_watchers;
3948*4882a593Smuzhiyun 	u64 cookie;
3949*4882a593Smuzhiyun 	int i;
3950*4882a593Smuzhiyun 	int ret;
3951*4882a593Smuzhiyun 
3952*4882a593Smuzhiyun 	ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
3953*4882a593Smuzhiyun 				      &rbd_dev->header_oloc, &watchers,
3954*4882a593Smuzhiyun 				      &num_watchers);
3955*4882a593Smuzhiyun 	if (ret)
3956*4882a593Smuzhiyun 		return ret;
3957*4882a593Smuzhiyun 
3958*4882a593Smuzhiyun 	sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
3959*4882a593Smuzhiyun 	for (i = 0; i < num_watchers; i++) {
3960*4882a593Smuzhiyun 		if (!memcmp(&watchers[i].addr, &locker->info.addr,
3961*4882a593Smuzhiyun 			    sizeof(locker->info.addr)) &&
3962*4882a593Smuzhiyun 		    watchers[i].cookie == cookie) {
3963*4882a593Smuzhiyun 			struct rbd_client_id cid = {
3964*4882a593Smuzhiyun 				.gid = le64_to_cpu(watchers[i].name.num),
3965*4882a593Smuzhiyun 				.handle = cookie,
3966*4882a593Smuzhiyun 			};
3967*4882a593Smuzhiyun 
3968*4882a593Smuzhiyun 			dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
3969*4882a593Smuzhiyun 			     rbd_dev, cid.gid, cid.handle);
3970*4882a593Smuzhiyun 			rbd_set_owner_cid(rbd_dev, &cid);
3971*4882a593Smuzhiyun 			ret = 1;
3972*4882a593Smuzhiyun 			goto out;
3973*4882a593Smuzhiyun 		}
3974*4882a593Smuzhiyun 	}
3975*4882a593Smuzhiyun 
3976*4882a593Smuzhiyun 	dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
3977*4882a593Smuzhiyun 	ret = 0;
3978*4882a593Smuzhiyun out:
3979*4882a593Smuzhiyun 	kfree(watchers);
3980*4882a593Smuzhiyun 	return ret;
3981*4882a593Smuzhiyun }
3982*4882a593Smuzhiyun 
3983*4882a593Smuzhiyun /*
3984*4882a593Smuzhiyun  * lock_rwsem must be held for write
3985*4882a593Smuzhiyun  */
3986*4882a593Smuzhiyun static int rbd_try_lock(struct rbd_device *rbd_dev)
3987*4882a593Smuzhiyun {
3988*4882a593Smuzhiyun 	struct ceph_client *client = rbd_dev->rbd_client->client;
3989*4882a593Smuzhiyun 	struct ceph_locker *lockers;
3990*4882a593Smuzhiyun 	u32 num_lockers;
3991*4882a593Smuzhiyun 	int ret;
3992*4882a593Smuzhiyun 
3993*4882a593Smuzhiyun 	for (;;) {
3994*4882a593Smuzhiyun 		ret = rbd_lock(rbd_dev);
3995*4882a593Smuzhiyun 		if (ret != -EBUSY)
3996*4882a593Smuzhiyun 			return ret;
3997*4882a593Smuzhiyun 
3998*4882a593Smuzhiyun 		/* determine if the current lock holder is still alive */
3999*4882a593Smuzhiyun 		ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
4000*4882a593Smuzhiyun 		if (ret)
4001*4882a593Smuzhiyun 			return ret;
4002*4882a593Smuzhiyun 
4003*4882a593Smuzhiyun 		if (num_lockers == 0)
4004*4882a593Smuzhiyun 			goto again;
4005*4882a593Smuzhiyun 
4006*4882a593Smuzhiyun 		ret = find_watcher(rbd_dev, lockers);
4007*4882a593Smuzhiyun 		if (ret)
4008*4882a593Smuzhiyun 			goto out; /* request lock or error */
4009*4882a593Smuzhiyun 
4010*4882a593Smuzhiyun 		rbd_warn(rbd_dev, "breaking header lock owned by %s%llu",
4011*4882a593Smuzhiyun 			 ENTITY_NAME(lockers[0].id.name));
4012*4882a593Smuzhiyun 
4013*4882a593Smuzhiyun 		ret = ceph_monc_blocklist_add(&client->monc,
4014*4882a593Smuzhiyun 					      &lockers[0].info.addr);
4015*4882a593Smuzhiyun 		if (ret) {
4016*4882a593Smuzhiyun 			rbd_warn(rbd_dev, "blocklist of %s%llu failed: %d",
4017*4882a593Smuzhiyun 				 ENTITY_NAME(lockers[0].id.name), ret);
4018*4882a593Smuzhiyun 			goto out;
4019*4882a593Smuzhiyun 		}
4020*4882a593Smuzhiyun 
4021*4882a593Smuzhiyun 		ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
4022*4882a593Smuzhiyun 					  &rbd_dev->header_oloc, RBD_LOCK_NAME,
4023*4882a593Smuzhiyun 					  lockers[0].id.cookie,
4024*4882a593Smuzhiyun 					  &lockers[0].id.name);
4025*4882a593Smuzhiyun 		if (ret && ret != -ENOENT)
4026*4882a593Smuzhiyun 			goto out;
4027*4882a593Smuzhiyun 
4028*4882a593Smuzhiyun again:
4029*4882a593Smuzhiyun 		ceph_free_lockers(lockers, num_lockers);
4030*4882a593Smuzhiyun 	}
4031*4882a593Smuzhiyun 
4032*4882a593Smuzhiyun out:
4033*4882a593Smuzhiyun 	ceph_free_lockers(lockers, num_lockers);
4034*4882a593Smuzhiyun 	return ret;
4035*4882a593Smuzhiyun }
4036*4882a593Smuzhiyun 
4037*4882a593Smuzhiyun static int rbd_post_acquire_action(struct rbd_device *rbd_dev)
4038*4882a593Smuzhiyun {
4039*4882a593Smuzhiyun 	int ret;
4040*4882a593Smuzhiyun 
4041*4882a593Smuzhiyun 	if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) {
4042*4882a593Smuzhiyun 		ret = rbd_object_map_open(rbd_dev);
4043*4882a593Smuzhiyun 		if (ret)
4044*4882a593Smuzhiyun 			return ret;
4045*4882a593Smuzhiyun 	}
4046*4882a593Smuzhiyun 
4047*4882a593Smuzhiyun 	return 0;
4048*4882a593Smuzhiyun }
4049*4882a593Smuzhiyun 
4050*4882a593Smuzhiyun /*
4051*4882a593Smuzhiyun  * Return:
4052*4882a593Smuzhiyun  *   0 - lock acquired
4053*4882a593Smuzhiyun  *   1 - caller should call rbd_request_lock()
4054*4882a593Smuzhiyun  *  <0 - error
4055*4882a593Smuzhiyun  */
4056*4882a593Smuzhiyun static int rbd_try_acquire_lock(struct rbd_device *rbd_dev)
4057*4882a593Smuzhiyun {
4058*4882a593Smuzhiyun 	int ret;
4059*4882a593Smuzhiyun 
4060*4882a593Smuzhiyun 	down_read(&rbd_dev->lock_rwsem);
4061*4882a593Smuzhiyun 	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
4062*4882a593Smuzhiyun 	     rbd_dev->lock_state);
4063*4882a593Smuzhiyun 	if (__rbd_is_lock_owner(rbd_dev)) {
4064*4882a593Smuzhiyun 		up_read(&rbd_dev->lock_rwsem);
4065*4882a593Smuzhiyun 		return 0;
4066*4882a593Smuzhiyun 	}
4067*4882a593Smuzhiyun 
4068*4882a593Smuzhiyun 	up_read(&rbd_dev->lock_rwsem);
4069*4882a593Smuzhiyun 	down_write(&rbd_dev->lock_rwsem);
4070*4882a593Smuzhiyun 	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
4071*4882a593Smuzhiyun 	     rbd_dev->lock_state);
4072*4882a593Smuzhiyun 	if (__rbd_is_lock_owner(rbd_dev)) {
4073*4882a593Smuzhiyun 		up_write(&rbd_dev->lock_rwsem);
4074*4882a593Smuzhiyun 		return 0;
4075*4882a593Smuzhiyun 	}
4076*4882a593Smuzhiyun 
4077*4882a593Smuzhiyun 	ret = rbd_try_lock(rbd_dev);
4078*4882a593Smuzhiyun 	if (ret < 0) {
4079*4882a593Smuzhiyun 		rbd_warn(rbd_dev, "failed to lock header: %d", ret);
4080*4882a593Smuzhiyun 		if (ret == -EBLOCKLISTED)
4081*4882a593Smuzhiyun 			goto out;
4082*4882a593Smuzhiyun 
4083*4882a593Smuzhiyun 		ret = 1; /* request lock anyway */
4084*4882a593Smuzhiyun 	}
4085*4882a593Smuzhiyun 	if (ret > 0) {
4086*4882a593Smuzhiyun 		up_write(&rbd_dev->lock_rwsem);
4087*4882a593Smuzhiyun 		return ret;
4088*4882a593Smuzhiyun 	}
4089*4882a593Smuzhiyun 
4090*4882a593Smuzhiyun 	rbd_assert(rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED);
4091*4882a593Smuzhiyun 	rbd_assert(list_empty(&rbd_dev->running_list));
4092*4882a593Smuzhiyun 
4093*4882a593Smuzhiyun 	ret = rbd_post_acquire_action(rbd_dev);
4094*4882a593Smuzhiyun 	if (ret) {
4095*4882a593Smuzhiyun 		rbd_warn(rbd_dev, "post-acquire action failed: %d", ret);
4096*4882a593Smuzhiyun 		/*
4097*4882a593Smuzhiyun 		 * Can't stay in RBD_LOCK_STATE_LOCKED because
4098*4882a593Smuzhiyun 		 * rbd_lock_add_request() would let the request through,
4099*4882a593Smuzhiyun 		 * assuming that e.g. object map is locked and loaded.
4100*4882a593Smuzhiyun 		 */
4101*4882a593Smuzhiyun 		rbd_unlock(rbd_dev);
4102*4882a593Smuzhiyun 	}
4103*4882a593Smuzhiyun 
4104*4882a593Smuzhiyun out:
4105*4882a593Smuzhiyun 	wake_lock_waiters(rbd_dev, ret);
4106*4882a593Smuzhiyun 	up_write(&rbd_dev->lock_rwsem);
4107*4882a593Smuzhiyun 	return ret;
4108*4882a593Smuzhiyun }
4109*4882a593Smuzhiyun 
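/*
 * Delayed work for acquiring the exclusive lock: try to take it, ask
 * the current owner to release it if that fails, and reschedule as
 * needed.
 */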
4110*4882a593Smuzhiyun static void rbd_acquire_lock(struct work_struct *work)
4111*4882a593Smuzhiyun {
4112*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
4113*4882a593Smuzhiyun 					    struct rbd_device, lock_dwork);
4114*4882a593Smuzhiyun 	int ret;
4115*4882a593Smuzhiyun 
4116*4882a593Smuzhiyun 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4117*4882a593Smuzhiyun again:
4118*4882a593Smuzhiyun 	ret = rbd_try_acquire_lock(rbd_dev);
4119*4882a593Smuzhiyun 	if (ret <= 0) {
4120*4882a593Smuzhiyun 		dout("%s rbd_dev %p ret %d - done\n", __func__, rbd_dev, ret);
4121*4882a593Smuzhiyun 		return;
4122*4882a593Smuzhiyun 	}
4123*4882a593Smuzhiyun 
4124*4882a593Smuzhiyun 	ret = rbd_request_lock(rbd_dev);
4125*4882a593Smuzhiyun 	if (ret == -ETIMEDOUT) {
4126*4882a593Smuzhiyun 		goto again; /* treat this as a dead client */
4127*4882a593Smuzhiyun 	} else if (ret == -EROFS) {
4128*4882a593Smuzhiyun 		rbd_warn(rbd_dev, "peer will not release lock");
4129*4882a593Smuzhiyun 		down_write(&rbd_dev->lock_rwsem);
4130*4882a593Smuzhiyun 		wake_lock_waiters(rbd_dev, ret);
4131*4882a593Smuzhiyun 		up_write(&rbd_dev->lock_rwsem);
4132*4882a593Smuzhiyun 	} else if (ret < 0) {
4133*4882a593Smuzhiyun 		rbd_warn(rbd_dev, "error requesting lock: %d", ret);
4134*4882a593Smuzhiyun 		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
4135*4882a593Smuzhiyun 				 RBD_RETRY_DELAY);
4136*4882a593Smuzhiyun 	} else {
4137*4882a593Smuzhiyun 		/*
4138*4882a593Smuzhiyun 		 * lock owner acked, but resend if we don't see them
4139*4882a593Smuzhiyun 		 * release the lock
4140*4882a593Smuzhiyun 		 */
4141*4882a593Smuzhiyun 		dout("%s rbd_dev %p requeuing lock_dwork\n", __func__,
4142*4882a593Smuzhiyun 		     rbd_dev);
4143*4882a593Smuzhiyun 		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
4144*4882a593Smuzhiyun 		    msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
4145*4882a593Smuzhiyun 	}
4146*4882a593Smuzhiyun }
4147*4882a593Smuzhiyun 
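/*
 * Quiesce in-flight IO before releasing the lock: transition to
 * RBD_LOCK_STATE_RELEASING and wait for the running list to drain.
 * Returns false if the lock isn't held or its state changed while
 * lock_rwsem was temporarily dropped.
 */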
4148*4882a593Smuzhiyun static bool rbd_quiesce_lock(struct rbd_device *rbd_dev)
4149*4882a593Smuzhiyun {
4150*4882a593Smuzhiyun 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4151*4882a593Smuzhiyun 	lockdep_assert_held_write(&rbd_dev->lock_rwsem);
4152*4882a593Smuzhiyun 
4153*4882a593Smuzhiyun 	if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
4154*4882a593Smuzhiyun 		return false;
4155*4882a593Smuzhiyun 
4156*4882a593Smuzhiyun 	/*
4157*4882a593Smuzhiyun 	 * Ensure that all in-flight IO is flushed.
4158*4882a593Smuzhiyun 	 */
4159*4882a593Smuzhiyun 	rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
4160*4882a593Smuzhiyun 	rbd_assert(!completion_done(&rbd_dev->releasing_wait));
4161*4882a593Smuzhiyun 	if (list_empty(&rbd_dev->running_list))
4162*4882a593Smuzhiyun 		return true;
4163*4882a593Smuzhiyun 
4164*4882a593Smuzhiyun 	up_write(&rbd_dev->lock_rwsem);
4165*4882a593Smuzhiyun 	wait_for_completion(&rbd_dev->releasing_wait);
4166*4882a593Smuzhiyun 
4167*4882a593Smuzhiyun 	down_write(&rbd_dev->lock_rwsem);
4168*4882a593Smuzhiyun 	if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
4169*4882a593Smuzhiyun 		return false;
4170*4882a593Smuzhiyun 
4171*4882a593Smuzhiyun 	rbd_assert(list_empty(&rbd_dev->running_list));
4172*4882a593Smuzhiyun 	return true;
4173*4882a593Smuzhiyun }
4174*4882a593Smuzhiyun 
4175*4882a593Smuzhiyun static void rbd_pre_release_action(struct rbd_device *rbd_dev)
4176*4882a593Smuzhiyun {
4177*4882a593Smuzhiyun 	if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)
4178*4882a593Smuzhiyun 		rbd_object_map_close(rbd_dev);
4179*4882a593Smuzhiyun }
4180*4882a593Smuzhiyun 
4181*4882a593Smuzhiyun static void __rbd_release_lock(struct rbd_device *rbd_dev)
4182*4882a593Smuzhiyun {
4183*4882a593Smuzhiyun 	rbd_assert(list_empty(&rbd_dev->running_list));
4184*4882a593Smuzhiyun 
4185*4882a593Smuzhiyun 	rbd_pre_release_action(rbd_dev);
4186*4882a593Smuzhiyun 	rbd_unlock(rbd_dev);
4187*4882a593Smuzhiyun }
4188*4882a593Smuzhiyun 
4189*4882a593Smuzhiyun /*
4190*4882a593Smuzhiyun  * lock_rwsem must be held for write
4191*4882a593Smuzhiyun  */
4192*4882a593Smuzhiyun static void rbd_release_lock(struct rbd_device *rbd_dev)
4193*4882a593Smuzhiyun {
4194*4882a593Smuzhiyun 	if (!rbd_quiesce_lock(rbd_dev))
4195*4882a593Smuzhiyun 		return;
4196*4882a593Smuzhiyun 
4197*4882a593Smuzhiyun 	__rbd_release_lock(rbd_dev);
4198*4882a593Smuzhiyun 
4199*4882a593Smuzhiyun 	/*
4200*4882a593Smuzhiyun 	 * Give others a chance to grab the lock - we would re-acquire
4201*4882a593Smuzhiyun 	 * almost immediately if we got new IO while draining the running
4202*4882a593Smuzhiyun 	 * list otherwise.  We need to ack our own notifications, so this
4203*4882a593Smuzhiyun 	 * lock_dwork will be requeued from rbd_handle_released_lock() by
4204*4882a593Smuzhiyun 	 * way of maybe_kick_acquire().
4205*4882a593Smuzhiyun 	 */
4206*4882a593Smuzhiyun 	cancel_delayed_work(&rbd_dev->lock_dwork);
4207*4882a593Smuzhiyun }
4208*4882a593Smuzhiyun 
4209*4882a593Smuzhiyun static void rbd_release_lock_work(struct work_struct *work)
4210*4882a593Smuzhiyun {
4211*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
4212*4882a593Smuzhiyun 						  unlock_work);
4213*4882a593Smuzhiyun 
4214*4882a593Smuzhiyun 	down_write(&rbd_dev->lock_rwsem);
4215*4882a593Smuzhiyun 	rbd_release_lock(rbd_dev);
4216*4882a593Smuzhiyun 	up_write(&rbd_dev->lock_rwsem);
4217*4882a593Smuzhiyun }
4218*4882a593Smuzhiyun 
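/*
 * If we aren't the lock owner but have image requests waiting for the
 * lock (or an acquire attempt pending), kick lock_dwork to (re)try
 * acquiring it.
 */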
4219*4882a593Smuzhiyun static void maybe_kick_acquire(struct rbd_device *rbd_dev)
4220*4882a593Smuzhiyun {
4221*4882a593Smuzhiyun 	bool have_requests;
4222*4882a593Smuzhiyun 
4223*4882a593Smuzhiyun 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4224*4882a593Smuzhiyun 	if (__rbd_is_lock_owner(rbd_dev))
4225*4882a593Smuzhiyun 		return;
4226*4882a593Smuzhiyun 
4227*4882a593Smuzhiyun 	spin_lock(&rbd_dev->lock_lists_lock);
4228*4882a593Smuzhiyun 	have_requests = !list_empty(&rbd_dev->acquiring_list);
4229*4882a593Smuzhiyun 	spin_unlock(&rbd_dev->lock_lists_lock);
4230*4882a593Smuzhiyun 	if (have_requests || delayed_work_pending(&rbd_dev->lock_dwork)) {
4231*4882a593Smuzhiyun 		dout("%s rbd_dev %p kicking lock_dwork\n", __func__, rbd_dev);
4232*4882a593Smuzhiyun 		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
4233*4882a593Smuzhiyun 	}
4234*4882a593Smuzhiyun }
4235*4882a593Smuzhiyun 
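/*
 * Handle a peer's ACQUIRED_LOCK notification: record the new owner cid
 * and kick our own acquire attempt if we have waiters.
 */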
4236*4882a593Smuzhiyun static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
4237*4882a593Smuzhiyun 				     void **p)
4238*4882a593Smuzhiyun {
4239*4882a593Smuzhiyun 	struct rbd_client_id cid = { 0 };
4240*4882a593Smuzhiyun 
4241*4882a593Smuzhiyun 	if (struct_v >= 2) {
4242*4882a593Smuzhiyun 		cid.gid = ceph_decode_64(p);
4243*4882a593Smuzhiyun 		cid.handle = ceph_decode_64(p);
4244*4882a593Smuzhiyun 	}
4245*4882a593Smuzhiyun 
4246*4882a593Smuzhiyun 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
4247*4882a593Smuzhiyun 	     cid.handle);
4248*4882a593Smuzhiyun 	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
4249*4882a593Smuzhiyun 		down_write(&rbd_dev->lock_rwsem);
4250*4882a593Smuzhiyun 		if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
4251*4882a593Smuzhiyun 			dout("%s rbd_dev %p cid %llu-%llu == owner_cid\n",
4252*4882a593Smuzhiyun 			     __func__, rbd_dev, cid.gid, cid.handle);
4253*4882a593Smuzhiyun 		} else {
4254*4882a593Smuzhiyun 			rbd_set_owner_cid(rbd_dev, &cid);
4255*4882a593Smuzhiyun 		}
4256*4882a593Smuzhiyun 		downgrade_write(&rbd_dev->lock_rwsem);
4257*4882a593Smuzhiyun 	} else {
4258*4882a593Smuzhiyun 		down_read(&rbd_dev->lock_rwsem);
4259*4882a593Smuzhiyun 	}
4260*4882a593Smuzhiyun 
4261*4882a593Smuzhiyun 	maybe_kick_acquire(rbd_dev);
4262*4882a593Smuzhiyun 	up_read(&rbd_dev->lock_rwsem);
4263*4882a593Smuzhiyun }
4264*4882a593Smuzhiyun 
4265*4882a593Smuzhiyun static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
4266*4882a593Smuzhiyun 				     void **p)
4267*4882a593Smuzhiyun {
4268*4882a593Smuzhiyun 	struct rbd_client_id cid = { 0 };
4269*4882a593Smuzhiyun 
4270*4882a593Smuzhiyun 	if (struct_v >= 2) {
4271*4882a593Smuzhiyun 		cid.gid = ceph_decode_64(p);
4272*4882a593Smuzhiyun 		cid.handle = ceph_decode_64(p);
4273*4882a593Smuzhiyun 	}
4274*4882a593Smuzhiyun 
4275*4882a593Smuzhiyun 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
4276*4882a593Smuzhiyun 	     cid.handle);
4277*4882a593Smuzhiyun 	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
4278*4882a593Smuzhiyun 		down_write(&rbd_dev->lock_rwsem);
4279*4882a593Smuzhiyun 		if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
4280*4882a593Smuzhiyun 			dout("%s rbd_dev %p cid %llu-%llu != owner_cid %llu-%llu\n",
4281*4882a593Smuzhiyun 			     __func__, rbd_dev, cid.gid, cid.handle,
4282*4882a593Smuzhiyun 			     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
4283*4882a593Smuzhiyun 		} else {
4284*4882a593Smuzhiyun 			rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
4285*4882a593Smuzhiyun 		}
4286*4882a593Smuzhiyun 		downgrade_write(&rbd_dev->lock_rwsem);
4287*4882a593Smuzhiyun 	} else {
4288*4882a593Smuzhiyun 		down_read(&rbd_dev->lock_rwsem);
4289*4882a593Smuzhiyun 	}
4290*4882a593Smuzhiyun 
4291*4882a593Smuzhiyun 	maybe_kick_acquire(rbd_dev);
4292*4882a593Smuzhiyun 	up_read(&rbd_dev->lock_rwsem);
4293*4882a593Smuzhiyun }
4294*4882a593Smuzhiyun 
4295*4882a593Smuzhiyun /*
4296*4882a593Smuzhiyun  * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
4297*4882a593Smuzhiyun  * ResponseMessage is needed.
4298*4882a593Smuzhiyun  */
4299*4882a593Smuzhiyun static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
4300*4882a593Smuzhiyun 				   void **p)
4301*4882a593Smuzhiyun {
4302*4882a593Smuzhiyun 	struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
4303*4882a593Smuzhiyun 	struct rbd_client_id cid = { 0 };
4304*4882a593Smuzhiyun 	int result = 1;
4305*4882a593Smuzhiyun 
4306*4882a593Smuzhiyun 	if (struct_v >= 2) {
4307*4882a593Smuzhiyun 		cid.gid = ceph_decode_64(p);
4308*4882a593Smuzhiyun 		cid.handle = ceph_decode_64(p);
4309*4882a593Smuzhiyun 	}
4310*4882a593Smuzhiyun 
4311*4882a593Smuzhiyun 	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
4312*4882a593Smuzhiyun 	     cid.handle);
4313*4882a593Smuzhiyun 	if (rbd_cid_equal(&cid, &my_cid))
4314*4882a593Smuzhiyun 		return result;
4315*4882a593Smuzhiyun 
4316*4882a593Smuzhiyun 	down_read(&rbd_dev->lock_rwsem);
4317*4882a593Smuzhiyun 	if (__rbd_is_lock_owner(rbd_dev)) {
4318*4882a593Smuzhiyun 		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
4319*4882a593Smuzhiyun 		    rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
4320*4882a593Smuzhiyun 			goto out_unlock;
4321*4882a593Smuzhiyun 
4322*4882a593Smuzhiyun 		/*
4323*4882a593Smuzhiyun 		 * encode ResponseMessage(0) so the peer can detect
4324*4882a593Smuzhiyun 		 * a missing owner
4325*4882a593Smuzhiyun 		 */
4326*4882a593Smuzhiyun 		result = 0;
4327*4882a593Smuzhiyun 
4328*4882a593Smuzhiyun 		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
4329*4882a593Smuzhiyun 			if (!rbd_dev->opts->exclusive) {
4330*4882a593Smuzhiyun 				dout("%s rbd_dev %p queueing unlock_work\n",
4331*4882a593Smuzhiyun 				     __func__, rbd_dev);
4332*4882a593Smuzhiyun 				queue_work(rbd_dev->task_wq,
4333*4882a593Smuzhiyun 					   &rbd_dev->unlock_work);
4334*4882a593Smuzhiyun 			} else {
4335*4882a593Smuzhiyun 				/* refuse to release the lock */
4336*4882a593Smuzhiyun 				result = -EROFS;
4337*4882a593Smuzhiyun 			}
4338*4882a593Smuzhiyun 		}
4339*4882a593Smuzhiyun 	}
4340*4882a593Smuzhiyun 
4341*4882a593Smuzhiyun out_unlock:
4342*4882a593Smuzhiyun 	up_read(&rbd_dev->lock_rwsem);
4343*4882a593Smuzhiyun 	return result;
4344*4882a593Smuzhiyun }
4345*4882a593Smuzhiyun 
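/*
 * Ack a notify, optionally encoding a ResponseMessage payload that
 * carries @result for the notifier to decode.
 */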
4346*4882a593Smuzhiyun static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
4347*4882a593Smuzhiyun 				     u64 notify_id, u64 cookie, s32 *result)
4348*4882a593Smuzhiyun {
4349*4882a593Smuzhiyun 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4350*4882a593Smuzhiyun 	char buf[4 + CEPH_ENCODING_START_BLK_LEN];
4351*4882a593Smuzhiyun 	int buf_size = sizeof(buf);
4352*4882a593Smuzhiyun 	int ret;
4353*4882a593Smuzhiyun 
4354*4882a593Smuzhiyun 	if (result) {
4355*4882a593Smuzhiyun 		void *p = buf;
4356*4882a593Smuzhiyun 
4357*4882a593Smuzhiyun 		/* encode ResponseMessage */
4358*4882a593Smuzhiyun 		ceph_start_encoding(&p, 1, 1,
4359*4882a593Smuzhiyun 				    buf_size - CEPH_ENCODING_START_BLK_LEN);
4360*4882a593Smuzhiyun 		ceph_encode_32(&p, *result);
4361*4882a593Smuzhiyun 	} else {
4362*4882a593Smuzhiyun 		buf_size = 0;
4363*4882a593Smuzhiyun 	}
4364*4882a593Smuzhiyun 
4365*4882a593Smuzhiyun 	ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
4366*4882a593Smuzhiyun 				   &rbd_dev->header_oloc, notify_id, cookie,
4367*4882a593Smuzhiyun 				   buf, buf_size);
4368*4882a593Smuzhiyun 	if (ret)
4369*4882a593Smuzhiyun 		rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
4370*4882a593Smuzhiyun }
4371*4882a593Smuzhiyun 
4372*4882a593Smuzhiyun static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
4373*4882a593Smuzhiyun 				   u64 cookie)
4374*4882a593Smuzhiyun {
4375*4882a593Smuzhiyun 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4376*4882a593Smuzhiyun 	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
4377*4882a593Smuzhiyun }
4378*4882a593Smuzhiyun 
4379*4882a593Smuzhiyun static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
4380*4882a593Smuzhiyun 					  u64 notify_id, u64 cookie, s32 result)
4381*4882a593Smuzhiyun {
4382*4882a593Smuzhiyun 	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
4383*4882a593Smuzhiyun 	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
4384*4882a593Smuzhiyun }
4385*4882a593Smuzhiyun 
4386*4882a593Smuzhiyun static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
4387*4882a593Smuzhiyun 			 u64 notifier_id, void *data, size_t data_len)
4388*4882a593Smuzhiyun {
4389*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = arg;
4390*4882a593Smuzhiyun 	void *p = data;
4391*4882a593Smuzhiyun 	void *const end = p + data_len;
4392*4882a593Smuzhiyun 	u8 struct_v = 0;
4393*4882a593Smuzhiyun 	u32 len;
4394*4882a593Smuzhiyun 	u32 notify_op;
4395*4882a593Smuzhiyun 	int ret;
4396*4882a593Smuzhiyun 
4397*4882a593Smuzhiyun 	dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
4398*4882a593Smuzhiyun 	     __func__, rbd_dev, cookie, notify_id, data_len);
4399*4882a593Smuzhiyun 	if (data_len) {
4400*4882a593Smuzhiyun 		ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
4401*4882a593Smuzhiyun 					  &struct_v, &len);
4402*4882a593Smuzhiyun 		if (ret) {
4403*4882a593Smuzhiyun 			rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
4404*4882a593Smuzhiyun 				 ret);
4405*4882a593Smuzhiyun 			return;
4406*4882a593Smuzhiyun 		}
4407*4882a593Smuzhiyun 
4408*4882a593Smuzhiyun 		notify_op = ceph_decode_32(&p);
4409*4882a593Smuzhiyun 	} else {
4410*4882a593Smuzhiyun 		/* legacy notification for header updates */
4411*4882a593Smuzhiyun 		notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
4412*4882a593Smuzhiyun 		len = 0;
4413*4882a593Smuzhiyun 	}
4414*4882a593Smuzhiyun 
4415*4882a593Smuzhiyun 	dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
4416*4882a593Smuzhiyun 	switch (notify_op) {
4417*4882a593Smuzhiyun 	case RBD_NOTIFY_OP_ACQUIRED_LOCK:
4418*4882a593Smuzhiyun 		rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
4419*4882a593Smuzhiyun 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4420*4882a593Smuzhiyun 		break;
4421*4882a593Smuzhiyun 	case RBD_NOTIFY_OP_RELEASED_LOCK:
4422*4882a593Smuzhiyun 		rbd_handle_released_lock(rbd_dev, struct_v, &p);
4423*4882a593Smuzhiyun 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4424*4882a593Smuzhiyun 		break;
4425*4882a593Smuzhiyun 	case RBD_NOTIFY_OP_REQUEST_LOCK:
4426*4882a593Smuzhiyun 		ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
4427*4882a593Smuzhiyun 		if (ret <= 0)
4428*4882a593Smuzhiyun 			rbd_acknowledge_notify_result(rbd_dev, notify_id,
4429*4882a593Smuzhiyun 						      cookie, ret);
4430*4882a593Smuzhiyun 		else
4431*4882a593Smuzhiyun 			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4432*4882a593Smuzhiyun 		break;
4433*4882a593Smuzhiyun 	case RBD_NOTIFY_OP_HEADER_UPDATE:
4434*4882a593Smuzhiyun 		ret = rbd_dev_refresh(rbd_dev);
4435*4882a593Smuzhiyun 		if (ret)
4436*4882a593Smuzhiyun 			rbd_warn(rbd_dev, "refresh failed: %d", ret);
4437*4882a593Smuzhiyun 
4438*4882a593Smuzhiyun 		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4439*4882a593Smuzhiyun 		break;
4440*4882a593Smuzhiyun 	default:
4441*4882a593Smuzhiyun 		if (rbd_is_lock_owner(rbd_dev))
4442*4882a593Smuzhiyun 			rbd_acknowledge_notify_result(rbd_dev, notify_id,
4443*4882a593Smuzhiyun 						      cookie, -EOPNOTSUPP);
4444*4882a593Smuzhiyun 		else
4445*4882a593Smuzhiyun 			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4446*4882a593Smuzhiyun 		break;
4447*4882a593Smuzhiyun 	}
4448*4882a593Smuzhiyun }
4449*4882a593Smuzhiyun 
4450*4882a593Smuzhiyun static void __rbd_unregister_watch(struct rbd_device *rbd_dev);
4451*4882a593Smuzhiyun 
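/*
 * Watch error handler: forget the lock owner and, if the watch was
 * registered, tear it down and schedule re-registration via
 * watch_dwork.
 */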
4452*4882a593Smuzhiyun static void rbd_watch_errcb(void *arg, u64 cookie, int err)
4453*4882a593Smuzhiyun {
4454*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = arg;
4455*4882a593Smuzhiyun 
4456*4882a593Smuzhiyun 	rbd_warn(rbd_dev, "encountered watch error: %d", err);
4457*4882a593Smuzhiyun 
4458*4882a593Smuzhiyun 	down_write(&rbd_dev->lock_rwsem);
4459*4882a593Smuzhiyun 	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
4460*4882a593Smuzhiyun 	up_write(&rbd_dev->lock_rwsem);
4461*4882a593Smuzhiyun 
4462*4882a593Smuzhiyun 	mutex_lock(&rbd_dev->watch_mutex);
4463*4882a593Smuzhiyun 	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
4464*4882a593Smuzhiyun 		__rbd_unregister_watch(rbd_dev);
4465*4882a593Smuzhiyun 		rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;
4466*4882a593Smuzhiyun 
4467*4882a593Smuzhiyun 		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
4468*4882a593Smuzhiyun 	}
4469*4882a593Smuzhiyun 	mutex_unlock(&rbd_dev->watch_mutex);
4470*4882a593Smuzhiyun }
4471*4882a593Smuzhiyun 
4472*4882a593Smuzhiyun /*
4473*4882a593Smuzhiyun  * watch_mutex must be locked
4474*4882a593Smuzhiyun  */
4475*4882a593Smuzhiyun static int __rbd_register_watch(struct rbd_device *rbd_dev)
4476*4882a593Smuzhiyun {
4477*4882a593Smuzhiyun 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4478*4882a593Smuzhiyun 	struct ceph_osd_linger_request *handle;
4479*4882a593Smuzhiyun 
4480*4882a593Smuzhiyun 	rbd_assert(!rbd_dev->watch_handle);
4481*4882a593Smuzhiyun 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4482*4882a593Smuzhiyun 
4483*4882a593Smuzhiyun 	handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
4484*4882a593Smuzhiyun 				 &rbd_dev->header_oloc, rbd_watch_cb,
4485*4882a593Smuzhiyun 				 rbd_watch_errcb, rbd_dev);
4486*4882a593Smuzhiyun 	if (IS_ERR(handle))
4487*4882a593Smuzhiyun 		return PTR_ERR(handle);
4488*4882a593Smuzhiyun 
4489*4882a593Smuzhiyun 	rbd_dev->watch_handle = handle;
4490*4882a593Smuzhiyun 	return 0;
4491*4882a593Smuzhiyun }
4492*4882a593Smuzhiyun 
4493*4882a593Smuzhiyun /*
4494*4882a593Smuzhiyun  * watch_mutex must be locked
4495*4882a593Smuzhiyun  */
4496*4882a593Smuzhiyun static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
4497*4882a593Smuzhiyun {
4498*4882a593Smuzhiyun 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4499*4882a593Smuzhiyun 	int ret;
4500*4882a593Smuzhiyun 
4501*4882a593Smuzhiyun 	rbd_assert(rbd_dev->watch_handle);
4502*4882a593Smuzhiyun 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4503*4882a593Smuzhiyun 
4504*4882a593Smuzhiyun 	ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
4505*4882a593Smuzhiyun 	if (ret)
4506*4882a593Smuzhiyun 		rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
4507*4882a593Smuzhiyun 
4508*4882a593Smuzhiyun 	rbd_dev->watch_handle = NULL;
4509*4882a593Smuzhiyun }
4510*4882a593Smuzhiyun 
4511*4882a593Smuzhiyun static int rbd_register_watch(struct rbd_device *rbd_dev)
4512*4882a593Smuzhiyun {
4513*4882a593Smuzhiyun 	int ret;
4514*4882a593Smuzhiyun 
4515*4882a593Smuzhiyun 	mutex_lock(&rbd_dev->watch_mutex);
4516*4882a593Smuzhiyun 	rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
4517*4882a593Smuzhiyun 	ret = __rbd_register_watch(rbd_dev);
4518*4882a593Smuzhiyun 	if (ret)
4519*4882a593Smuzhiyun 		goto out;
4520*4882a593Smuzhiyun 
4521*4882a593Smuzhiyun 	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
4522*4882a593Smuzhiyun 	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
4523*4882a593Smuzhiyun 
4524*4882a593Smuzhiyun out:
4525*4882a593Smuzhiyun 	mutex_unlock(&rbd_dev->watch_mutex);
4526*4882a593Smuzhiyun 	return ret;
4527*4882a593Smuzhiyun }
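
/*
 * Watch state machine implied by the helpers above and below (informal
 * summary added for clarity, not taken from the original comments):
 *
 *   UNREGISTERED --rbd_register_watch()----------------> REGISTERED
 *   REGISTERED   --rbd_watch_errcb()-------------------> ERROR
 *                  (watch torn down, watch_dwork queued)
 *   ERROR        --rbd_reregister_watch(), success-----> REGISTERED
 *   any state    --rbd_unregister_watch()--------------> UNREGISTERED
 */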
4528*4882a593Smuzhiyun 
4529*4882a593Smuzhiyun static void cancel_tasks_sync(struct rbd_device *rbd_dev)
4530*4882a593Smuzhiyun {
4531*4882a593Smuzhiyun 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4532*4882a593Smuzhiyun 
4533*4882a593Smuzhiyun 	cancel_work_sync(&rbd_dev->acquired_lock_work);
4534*4882a593Smuzhiyun 	cancel_work_sync(&rbd_dev->released_lock_work);
4535*4882a593Smuzhiyun 	cancel_delayed_work_sync(&rbd_dev->lock_dwork);
4536*4882a593Smuzhiyun 	cancel_work_sync(&rbd_dev->unlock_work);
4537*4882a593Smuzhiyun }
4538*4882a593Smuzhiyun 
4539*4882a593Smuzhiyun /*
4540*4882a593Smuzhiyun  * header_rwsem must not be held to avoid a deadlock with
4541*4882a593Smuzhiyun  * rbd_dev_refresh() when flushing notifies.
4542*4882a593Smuzhiyun  */
4543*4882a593Smuzhiyun static void rbd_unregister_watch(struct rbd_device *rbd_dev)
4544*4882a593Smuzhiyun {
4545*4882a593Smuzhiyun 	cancel_tasks_sync(rbd_dev);
4546*4882a593Smuzhiyun 
4547*4882a593Smuzhiyun 	mutex_lock(&rbd_dev->watch_mutex);
4548*4882a593Smuzhiyun 	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
4549*4882a593Smuzhiyun 		__rbd_unregister_watch(rbd_dev);
4550*4882a593Smuzhiyun 	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
4551*4882a593Smuzhiyun 	mutex_unlock(&rbd_dev->watch_mutex);
4552*4882a593Smuzhiyun 
4553*4882a593Smuzhiyun 	cancel_delayed_work_sync(&rbd_dev->watch_dwork);
4554*4882a593Smuzhiyun 	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
4555*4882a593Smuzhiyun }
4556*4882a593Smuzhiyun 
4557*4882a593Smuzhiyun /*
4558*4882a593Smuzhiyun  * lock_rwsem must be held for write
4559*4882a593Smuzhiyun  */
4560*4882a593Smuzhiyun static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
4561*4882a593Smuzhiyun {
4562*4882a593Smuzhiyun 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4563*4882a593Smuzhiyun 	char cookie[32];
4564*4882a593Smuzhiyun 	int ret;
4565*4882a593Smuzhiyun 
4566*4882a593Smuzhiyun 	if (!rbd_quiesce_lock(rbd_dev))
4567*4882a593Smuzhiyun 		return;
4568*4882a593Smuzhiyun 
4569*4882a593Smuzhiyun 	format_lock_cookie(rbd_dev, cookie);
4570*4882a593Smuzhiyun 	ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
4571*4882a593Smuzhiyun 				  &rbd_dev->header_oloc, RBD_LOCK_NAME,
4572*4882a593Smuzhiyun 				  CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
4573*4882a593Smuzhiyun 				  RBD_LOCK_TAG, cookie);
4574*4882a593Smuzhiyun 	if (ret) {
4575*4882a593Smuzhiyun 		if (ret != -EOPNOTSUPP)
4576*4882a593Smuzhiyun 			rbd_warn(rbd_dev, "failed to update lock cookie: %d",
4577*4882a593Smuzhiyun 				 ret);
4578*4882a593Smuzhiyun 
4579*4882a593Smuzhiyun 		/*
4580*4882a593Smuzhiyun 		 * Lock cookie cannot be updated on older OSDs, so do
4581*4882a593Smuzhiyun 		 * a manual release and queue an acquire.
4582*4882a593Smuzhiyun 		 */
4583*4882a593Smuzhiyun 		__rbd_release_lock(rbd_dev);
4584*4882a593Smuzhiyun 		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
4585*4882a593Smuzhiyun 	} else {
4586*4882a593Smuzhiyun 		__rbd_lock(rbd_dev, cookie);
4587*4882a593Smuzhiyun 		wake_lock_waiters(rbd_dev, 0);
4588*4882a593Smuzhiyun 	}
4589*4882a593Smuzhiyun }
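
/*
 * Usage note (an observation added here, not original documentation):
 * rbd_reacquire_lock() is called from rbd_reregister_watch() below with
 * lock_rwsem held for write, once a replacement watch is in place and a
 * new lock cookie can be derived from it.
 */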
4590*4882a593Smuzhiyun 
4591*4882a593Smuzhiyun static void rbd_reregister_watch(struct work_struct *work)
4592*4882a593Smuzhiyun {
4593*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
4594*4882a593Smuzhiyun 					    struct rbd_device, watch_dwork);
4595*4882a593Smuzhiyun 	int ret;
4596*4882a593Smuzhiyun 
4597*4882a593Smuzhiyun 	dout("%s rbd_dev %p\n", __func__, rbd_dev);
4598*4882a593Smuzhiyun 
4599*4882a593Smuzhiyun 	mutex_lock(&rbd_dev->watch_mutex);
4600*4882a593Smuzhiyun 	if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
4601*4882a593Smuzhiyun 		mutex_unlock(&rbd_dev->watch_mutex);
4602*4882a593Smuzhiyun 		return;
4603*4882a593Smuzhiyun 	}
4604*4882a593Smuzhiyun 
4605*4882a593Smuzhiyun 	ret = __rbd_register_watch(rbd_dev);
4606*4882a593Smuzhiyun 	if (ret) {
4607*4882a593Smuzhiyun 		rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
4608*4882a593Smuzhiyun 		if (ret != -EBLOCKLISTED && ret != -ENOENT) {
4609*4882a593Smuzhiyun 			queue_delayed_work(rbd_dev->task_wq,
4610*4882a593Smuzhiyun 					   &rbd_dev->watch_dwork,
4611*4882a593Smuzhiyun 					   RBD_RETRY_DELAY);
4612*4882a593Smuzhiyun 			mutex_unlock(&rbd_dev->watch_mutex);
4613*4882a593Smuzhiyun 			return;
4614*4882a593Smuzhiyun 		}
4615*4882a593Smuzhiyun 
4616*4882a593Smuzhiyun 		mutex_unlock(&rbd_dev->watch_mutex);
4617*4882a593Smuzhiyun 		down_write(&rbd_dev->lock_rwsem);
4618*4882a593Smuzhiyun 		wake_lock_waiters(rbd_dev, ret);
4619*4882a593Smuzhiyun 		up_write(&rbd_dev->lock_rwsem);
4620*4882a593Smuzhiyun 		return;
4621*4882a593Smuzhiyun 	}
4622*4882a593Smuzhiyun 
4623*4882a593Smuzhiyun 	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
4624*4882a593Smuzhiyun 	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
4625*4882a593Smuzhiyun 	mutex_unlock(&rbd_dev->watch_mutex);
4626*4882a593Smuzhiyun 
4627*4882a593Smuzhiyun 	down_write(&rbd_dev->lock_rwsem);
4628*4882a593Smuzhiyun 	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
4629*4882a593Smuzhiyun 		rbd_reacquire_lock(rbd_dev);
4630*4882a593Smuzhiyun 	up_write(&rbd_dev->lock_rwsem);
4631*4882a593Smuzhiyun 
4632*4882a593Smuzhiyun 	ret = rbd_dev_refresh(rbd_dev);
4633*4882a593Smuzhiyun 	if (ret)
4634*4882a593Smuzhiyun 		rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
4635*4882a593Smuzhiyun }
4636*4882a593Smuzhiyun 
4637*4882a593Smuzhiyun /*
4638*4882a593Smuzhiyun  * Synchronous osd object method call.  Returns the number of bytes
4639*4882a593Smuzhiyun  * returned in the outbound buffer, or a negative error code.
4640*4882a593Smuzhiyun  */
4641*4882a593Smuzhiyun static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
4642*4882a593Smuzhiyun 			     struct ceph_object_id *oid,
4643*4882a593Smuzhiyun 			     struct ceph_object_locator *oloc,
4644*4882a593Smuzhiyun 			     const char *method_name,
4645*4882a593Smuzhiyun 			     const void *outbound,
4646*4882a593Smuzhiyun 			     size_t outbound_size,
4647*4882a593Smuzhiyun 			     void *inbound,
4648*4882a593Smuzhiyun 			     size_t inbound_size)
4649*4882a593Smuzhiyun {
4650*4882a593Smuzhiyun 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4651*4882a593Smuzhiyun 	struct page *req_page = NULL;
4652*4882a593Smuzhiyun 	struct page *reply_page;
4653*4882a593Smuzhiyun 	int ret;
4654*4882a593Smuzhiyun 
4655*4882a593Smuzhiyun 	/*
4656*4882a593Smuzhiyun 	 * Method calls are ultimately read operations.  The result
4657*4882a593Smuzhiyun 	 * should placed into the inbound buffer provided.  They
4658*4882a593Smuzhiyun 	 * should be placed into the inbound buffer provided.  They
4659*4882a593Smuzhiyun 	 * method.  Currently if this is present it will be a
4660*4882a593Smuzhiyun 	 * snapshot id.
4661*4882a593Smuzhiyun 	 */
4662*4882a593Smuzhiyun 	if (outbound) {
4663*4882a593Smuzhiyun 		if (outbound_size > PAGE_SIZE)
4664*4882a593Smuzhiyun 			return -E2BIG;
4665*4882a593Smuzhiyun 
4666*4882a593Smuzhiyun 		req_page = alloc_page(GFP_KERNEL);
4667*4882a593Smuzhiyun 		if (!req_page)
4668*4882a593Smuzhiyun 			return -ENOMEM;
4669*4882a593Smuzhiyun 
4670*4882a593Smuzhiyun 		memcpy(page_address(req_page), outbound, outbound_size);
4671*4882a593Smuzhiyun 	}
4672*4882a593Smuzhiyun 
4673*4882a593Smuzhiyun 	reply_page = alloc_page(GFP_KERNEL);
4674*4882a593Smuzhiyun 	if (!reply_page) {
4675*4882a593Smuzhiyun 		if (req_page)
4676*4882a593Smuzhiyun 			__free_page(req_page);
4677*4882a593Smuzhiyun 		return -ENOMEM;
4678*4882a593Smuzhiyun 	}
4679*4882a593Smuzhiyun 
4680*4882a593Smuzhiyun 	ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
4681*4882a593Smuzhiyun 			     CEPH_OSD_FLAG_READ, req_page, outbound_size,
4682*4882a593Smuzhiyun 			     &reply_page, &inbound_size);
4683*4882a593Smuzhiyun 	if (!ret) {
4684*4882a593Smuzhiyun 		memcpy(inbound, page_address(reply_page), inbound_size);
4685*4882a593Smuzhiyun 		ret = inbound_size;
4686*4882a593Smuzhiyun 	}
4687*4882a593Smuzhiyun 
4688*4882a593Smuzhiyun 	if (req_page)
4689*4882a593Smuzhiyun 		__free_page(req_page);
4690*4882a593Smuzhiyun 	__free_page(reply_page);
4691*4882a593Smuzhiyun 	return ret;
4692*4882a593Smuzhiyun }
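
/*
 * Illustrative caller sketch (added example, compiled out with #if 0):
 * it mirrors the pattern used by the get_size/get_features helpers
 * later in this file.  The helper name is hypothetical.
 */
#if 0
static int example_get_image_size(struct rbd_device *rbd_dev, u64 *size)
{
	__le64 snapid = cpu_to_le64(CEPH_NOSNAP);
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };
	int ret;

	/* outbound buffer carries the parameters, inbound the reply */
	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_size",
				  &snapid, sizeof(snapid),
				  &size_buf, sizeof(size_buf));
	if (ret < 0)
		return ret;		/* OSD or transport error */
	if (ret < sizeof(size_buf))
		return -ERANGE;		/* short reply */

	*size = le64_to_cpu(size_buf.size);
	return 0;
}
#endif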
4693*4882a593Smuzhiyun 
4694*4882a593Smuzhiyun static void rbd_queue_workfn(struct work_struct *work)
4695*4882a593Smuzhiyun {
4696*4882a593Smuzhiyun 	struct rbd_img_request *img_request =
4697*4882a593Smuzhiyun 	    container_of(work, struct rbd_img_request, work);
4698*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = img_request->rbd_dev;
4699*4882a593Smuzhiyun 	enum obj_operation_type op_type = img_request->op_type;
4700*4882a593Smuzhiyun 	struct request *rq = blk_mq_rq_from_pdu(img_request);
4701*4882a593Smuzhiyun 	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
4702*4882a593Smuzhiyun 	u64 length = blk_rq_bytes(rq);
4703*4882a593Smuzhiyun 	u64 mapping_size;
4704*4882a593Smuzhiyun 	int result;
4705*4882a593Smuzhiyun 
4706*4882a593Smuzhiyun 	/* Ignore/skip any zero-length requests */
4707*4882a593Smuzhiyun 	if (!length) {
4708*4882a593Smuzhiyun 		dout("%s: zero-length request\n", __func__);
4709*4882a593Smuzhiyun 		result = 0;
4710*4882a593Smuzhiyun 		goto err_img_request;
4711*4882a593Smuzhiyun 	}
4712*4882a593Smuzhiyun 
4713*4882a593Smuzhiyun 	blk_mq_start_request(rq);
4714*4882a593Smuzhiyun 
4715*4882a593Smuzhiyun 	down_read(&rbd_dev->header_rwsem);
4716*4882a593Smuzhiyun 	mapping_size = rbd_dev->mapping.size;
4717*4882a593Smuzhiyun 	rbd_img_capture_header(img_request);
4718*4882a593Smuzhiyun 	up_read(&rbd_dev->header_rwsem);
4719*4882a593Smuzhiyun 
4720*4882a593Smuzhiyun 	if (offset + length > mapping_size) {
4721*4882a593Smuzhiyun 		rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
4722*4882a593Smuzhiyun 			 length, mapping_size);
4723*4882a593Smuzhiyun 		result = -EIO;
4724*4882a593Smuzhiyun 		goto err_img_request;
4725*4882a593Smuzhiyun 	}
4726*4882a593Smuzhiyun 
4727*4882a593Smuzhiyun 	dout("%s rbd_dev %p img_req %p %s %llu~%llu\n", __func__, rbd_dev,
4728*4882a593Smuzhiyun 	     img_request, obj_op_name(op_type), offset, length);
4729*4882a593Smuzhiyun 
4730*4882a593Smuzhiyun 	if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT)
4731*4882a593Smuzhiyun 		result = rbd_img_fill_nodata(img_request, offset, length);
4732*4882a593Smuzhiyun 	else
4733*4882a593Smuzhiyun 		result = rbd_img_fill_from_bio(img_request, offset, length,
4734*4882a593Smuzhiyun 					       rq->bio);
4735*4882a593Smuzhiyun 	if (result)
4736*4882a593Smuzhiyun 		goto err_img_request;
4737*4882a593Smuzhiyun 
4738*4882a593Smuzhiyun 	rbd_img_handle_request(img_request, 0);
4739*4882a593Smuzhiyun 	return;
4740*4882a593Smuzhiyun 
4741*4882a593Smuzhiyun err_img_request:
4742*4882a593Smuzhiyun 	rbd_img_request_destroy(img_request);
4743*4882a593Smuzhiyun 	if (result)
4744*4882a593Smuzhiyun 		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
4745*4882a593Smuzhiyun 			 obj_op_name(op_type), length, offset, result);
4746*4882a593Smuzhiyun 	blk_mq_end_request(rq, errno_to_blk_status(result));
4747*4882a593Smuzhiyun }
4748*4882a593Smuzhiyun 
4749*4882a593Smuzhiyun static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
4750*4882a593Smuzhiyun 		const struct blk_mq_queue_data *bd)
4751*4882a593Smuzhiyun {
4752*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = hctx->queue->queuedata;
4753*4882a593Smuzhiyun 	struct rbd_img_request *img_req = blk_mq_rq_to_pdu(bd->rq);
4754*4882a593Smuzhiyun 	enum obj_operation_type op_type;
4755*4882a593Smuzhiyun 
4756*4882a593Smuzhiyun 	switch (req_op(bd->rq)) {
4757*4882a593Smuzhiyun 	case REQ_OP_DISCARD:
4758*4882a593Smuzhiyun 		op_type = OBJ_OP_DISCARD;
4759*4882a593Smuzhiyun 		break;
4760*4882a593Smuzhiyun 	case REQ_OP_WRITE_ZEROES:
4761*4882a593Smuzhiyun 		op_type = OBJ_OP_ZEROOUT;
4762*4882a593Smuzhiyun 		break;
4763*4882a593Smuzhiyun 	case REQ_OP_WRITE:
4764*4882a593Smuzhiyun 		op_type = OBJ_OP_WRITE;
4765*4882a593Smuzhiyun 		break;
4766*4882a593Smuzhiyun 	case REQ_OP_READ:
4767*4882a593Smuzhiyun 		op_type = OBJ_OP_READ;
4768*4882a593Smuzhiyun 		break;
4769*4882a593Smuzhiyun 	default:
4770*4882a593Smuzhiyun 		rbd_warn(rbd_dev, "unknown req_op %d", req_op(bd->rq));
4771*4882a593Smuzhiyun 		return BLK_STS_IOERR;
4772*4882a593Smuzhiyun 	}
4773*4882a593Smuzhiyun 
4774*4882a593Smuzhiyun 	rbd_img_request_init(img_req, rbd_dev, op_type);
4775*4882a593Smuzhiyun 
4776*4882a593Smuzhiyun 	if (rbd_img_is_write(img_req)) {
4777*4882a593Smuzhiyun 		if (rbd_is_ro(rbd_dev)) {
4778*4882a593Smuzhiyun 			rbd_warn(rbd_dev, "%s on read-only mapping",
4779*4882a593Smuzhiyun 				 obj_op_name(img_req->op_type));
4780*4882a593Smuzhiyun 			return BLK_STS_IOERR;
4781*4882a593Smuzhiyun 		}
4782*4882a593Smuzhiyun 		rbd_assert(!rbd_is_snap(rbd_dev));
4783*4882a593Smuzhiyun 	}
4784*4882a593Smuzhiyun 
4785*4882a593Smuzhiyun 	INIT_WORK(&img_req->work, rbd_queue_workfn);
4786*4882a593Smuzhiyun 	queue_work(rbd_wq, &img_req->work);
4787*4882a593Smuzhiyun 	return BLK_STS_OK;
4788*4882a593Smuzhiyun }
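
/*
 * Note (added): the rbd_img_request is embedded in the blk-mq
 * per-request payload (tag_set.cmd_size in rbd_init_disk() below), so
 * blk_mq_rq_to_pdu() here and blk_mq_rq_from_pdu() in rbd_queue_workfn()
 * convert between request and image request without extra allocations.
 */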
4789*4882a593Smuzhiyun 
4790*4882a593Smuzhiyun static void rbd_free_disk(struct rbd_device *rbd_dev)
4791*4882a593Smuzhiyun {
4792*4882a593Smuzhiyun 	blk_cleanup_queue(rbd_dev->disk->queue);
4793*4882a593Smuzhiyun 	blk_mq_free_tag_set(&rbd_dev->tag_set);
4794*4882a593Smuzhiyun 	put_disk(rbd_dev->disk);
4795*4882a593Smuzhiyun 	rbd_dev->disk = NULL;
4796*4882a593Smuzhiyun }
4797*4882a593Smuzhiyun 
4798*4882a593Smuzhiyun static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
4799*4882a593Smuzhiyun 			     struct ceph_object_id *oid,
4800*4882a593Smuzhiyun 			     struct ceph_object_locator *oloc,
4801*4882a593Smuzhiyun 			     void *buf, int buf_len)
4802*4882a593Smuzhiyun 
4803*4882a593Smuzhiyun {
4804*4882a593Smuzhiyun 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4805*4882a593Smuzhiyun 	struct ceph_osd_request *req;
4806*4882a593Smuzhiyun 	struct page **pages;
4807*4882a593Smuzhiyun 	int num_pages = calc_pages_for(0, buf_len);
4808*4882a593Smuzhiyun 	int ret;
4809*4882a593Smuzhiyun 
4810*4882a593Smuzhiyun 	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
4811*4882a593Smuzhiyun 	if (!req)
4812*4882a593Smuzhiyun 		return -ENOMEM;
4813*4882a593Smuzhiyun 
4814*4882a593Smuzhiyun 	ceph_oid_copy(&req->r_base_oid, oid);
4815*4882a593Smuzhiyun 	ceph_oloc_copy(&req->r_base_oloc, oloc);
4816*4882a593Smuzhiyun 	req->r_flags = CEPH_OSD_FLAG_READ;
4817*4882a593Smuzhiyun 
4818*4882a593Smuzhiyun 	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
4819*4882a593Smuzhiyun 	if (IS_ERR(pages)) {
4820*4882a593Smuzhiyun 		ret = PTR_ERR(pages);
4821*4882a593Smuzhiyun 		goto out_req;
4822*4882a593Smuzhiyun 	}
4823*4882a593Smuzhiyun 
4824*4882a593Smuzhiyun 	osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
4825*4882a593Smuzhiyun 	osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
4826*4882a593Smuzhiyun 					 true);
4827*4882a593Smuzhiyun 
4828*4882a593Smuzhiyun 	ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
4829*4882a593Smuzhiyun 	if (ret)
4830*4882a593Smuzhiyun 		goto out_req;
4831*4882a593Smuzhiyun 
4832*4882a593Smuzhiyun 	ceph_osdc_start_request(osdc, req, false);
4833*4882a593Smuzhiyun 	ret = ceph_osdc_wait_request(osdc, req);
4834*4882a593Smuzhiyun 	if (ret >= 0)
4835*4882a593Smuzhiyun 		ceph_copy_from_page_vector(pages, buf, 0, ret);
4836*4882a593Smuzhiyun 
4837*4882a593Smuzhiyun out_req:
4838*4882a593Smuzhiyun 	ceph_osdc_put_request(req);
4839*4882a593Smuzhiyun 	return ret;
4840*4882a593Smuzhiyun }
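
/*
 * Note (added): on success ceph_osdc_wait_request() returns the number
 * of bytes actually read, which rbd_obj_read_sync() propagates; callers
 * such as rbd_dev_v1_header_info() below compare it against the size
 * they asked for to detect short reads.
 */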
4841*4882a593Smuzhiyun 
4842*4882a593Smuzhiyun /*
4843*4882a593Smuzhiyun  * Read the complete header for the given rbd device.  On successful
4844*4882a593Smuzhiyun  * return, the rbd_dev->header field will contain up-to-date
4845*4882a593Smuzhiyun  * information about the image.
4846*4882a593Smuzhiyun  */
4847*4882a593Smuzhiyun static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
4848*4882a593Smuzhiyun {
4849*4882a593Smuzhiyun 	struct rbd_image_header_ondisk *ondisk = NULL;
4850*4882a593Smuzhiyun 	u32 snap_count = 0;
4851*4882a593Smuzhiyun 	u64 names_size = 0;
4852*4882a593Smuzhiyun 	u32 want_count;
4853*4882a593Smuzhiyun 	int ret;
4854*4882a593Smuzhiyun 
4855*4882a593Smuzhiyun 	/*
4856*4882a593Smuzhiyun 	 * The complete header will include an array of its 64-bit
4857*4882a593Smuzhiyun 	 * snapshot ids, followed by the names of those snapshots as
4858*4882a593Smuzhiyun 	 * a contiguous block of NUL-terminated strings.  Note that
4859*4882a593Smuzhiyun 	 * the number of snapshots could change by the time we read
4860*4882a593Smuzhiyun 	 * it in, in which case we re-read it.
4861*4882a593Smuzhiyun 	 */
4862*4882a593Smuzhiyun 	do {
4863*4882a593Smuzhiyun 		size_t size;
4864*4882a593Smuzhiyun 
4865*4882a593Smuzhiyun 		kfree(ondisk);
4866*4882a593Smuzhiyun 
4867*4882a593Smuzhiyun 		size = sizeof (*ondisk);
4868*4882a593Smuzhiyun 		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
4869*4882a593Smuzhiyun 		size += names_size;
4870*4882a593Smuzhiyun 		ondisk = kmalloc(size, GFP_KERNEL);
4871*4882a593Smuzhiyun 		if (!ondisk)
4872*4882a593Smuzhiyun 			return -ENOMEM;
4873*4882a593Smuzhiyun 
4874*4882a593Smuzhiyun 		ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
4875*4882a593Smuzhiyun 					&rbd_dev->header_oloc, ondisk, size);
4876*4882a593Smuzhiyun 		if (ret < 0)
4877*4882a593Smuzhiyun 			goto out;
4878*4882a593Smuzhiyun 		if ((size_t)ret < size) {
4879*4882a593Smuzhiyun 			ret = -ENXIO;
4880*4882a593Smuzhiyun 			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
4881*4882a593Smuzhiyun 				size, ret);
4882*4882a593Smuzhiyun 			goto out;
4883*4882a593Smuzhiyun 		}
4884*4882a593Smuzhiyun 		if (!rbd_dev_ondisk_valid(ondisk)) {
4885*4882a593Smuzhiyun 			ret = -ENXIO;
4886*4882a593Smuzhiyun 			rbd_warn(rbd_dev, "invalid header");
4887*4882a593Smuzhiyun 			goto out;
4888*4882a593Smuzhiyun 		}
4889*4882a593Smuzhiyun 
4890*4882a593Smuzhiyun 		names_size = le64_to_cpu(ondisk->snap_names_len);
4891*4882a593Smuzhiyun 		want_count = snap_count;
4892*4882a593Smuzhiyun 		snap_count = le32_to_cpu(ondisk->snap_count);
4893*4882a593Smuzhiyun 	} while (snap_count != want_count);
4894*4882a593Smuzhiyun 
4895*4882a593Smuzhiyun 	ret = rbd_header_from_disk(rbd_dev, ondisk);
4896*4882a593Smuzhiyun out:
4897*4882a593Smuzhiyun 	kfree(ondisk);
4898*4882a593Smuzhiyun 
4899*4882a593Smuzhiyun 	return ret;
4900*4882a593Smuzhiyun }
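
/*
 * Rough v1 on-disk header layout assumed by the sizing loop above (see
 * rbd_types.h for the authoritative structures):
 *
 *	struct rbd_image_header_ondisk			fixed-size prefix
 *	struct rbd_image_snap_ondisk[snap_count]	one entry per snapshot
 *	char snap_names[snap_names_len]			NUL-terminated names
 */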
4901*4882a593Smuzhiyun 
4902*4882a593Smuzhiyun static void rbd_dev_update_size(struct rbd_device *rbd_dev)
4903*4882a593Smuzhiyun {
4904*4882a593Smuzhiyun 	sector_t size;
4905*4882a593Smuzhiyun 
4906*4882a593Smuzhiyun 	/*
4907*4882a593Smuzhiyun 	 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
4908*4882a593Smuzhiyun 	 * try to update its size.  If REMOVING is set, updating size
4909*4882a593Smuzhiyun 	 * is just useless work since the device can't be opened.
4910*4882a593Smuzhiyun 	 */
4911*4882a593Smuzhiyun 	if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
4912*4882a593Smuzhiyun 	    !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
4913*4882a593Smuzhiyun 		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
4914*4882a593Smuzhiyun 		dout("setting size to %llu sectors", (unsigned long long)size);
4915*4882a593Smuzhiyun 		set_capacity(rbd_dev->disk, size);
4916*4882a593Smuzhiyun 		revalidate_disk_size(rbd_dev->disk, true);
4917*4882a593Smuzhiyun 	}
4918*4882a593Smuzhiyun }
4919*4882a593Smuzhiyun 
4920*4882a593Smuzhiyun static int rbd_dev_refresh(struct rbd_device *rbd_dev)
4921*4882a593Smuzhiyun {
4922*4882a593Smuzhiyun 	u64 mapping_size;
4923*4882a593Smuzhiyun 	int ret;
4924*4882a593Smuzhiyun 
4925*4882a593Smuzhiyun 	down_write(&rbd_dev->header_rwsem);
4926*4882a593Smuzhiyun 	mapping_size = rbd_dev->mapping.size;
4927*4882a593Smuzhiyun 
4928*4882a593Smuzhiyun 	ret = rbd_dev_header_info(rbd_dev);
4929*4882a593Smuzhiyun 	if (ret)
4930*4882a593Smuzhiyun 		goto out;
4931*4882a593Smuzhiyun 
4932*4882a593Smuzhiyun 	/*
4933*4882a593Smuzhiyun 	 * If there is a parent, see if it has disappeared due to the
4934*4882a593Smuzhiyun 	 * mapped image getting flattened.
4935*4882a593Smuzhiyun 	 */
4936*4882a593Smuzhiyun 	if (rbd_dev->parent) {
4937*4882a593Smuzhiyun 		ret = rbd_dev_v2_parent_info(rbd_dev);
4938*4882a593Smuzhiyun 		if (ret)
4939*4882a593Smuzhiyun 			goto out;
4940*4882a593Smuzhiyun 	}
4941*4882a593Smuzhiyun 
4942*4882a593Smuzhiyun 	rbd_assert(!rbd_is_snap(rbd_dev));
4943*4882a593Smuzhiyun 	rbd_dev->mapping.size = rbd_dev->header.image_size;
4944*4882a593Smuzhiyun 
4945*4882a593Smuzhiyun out:
4946*4882a593Smuzhiyun 	up_write(&rbd_dev->header_rwsem);
4947*4882a593Smuzhiyun 	if (!ret && mapping_size != rbd_dev->mapping.size)
4948*4882a593Smuzhiyun 		rbd_dev_update_size(rbd_dev);
4949*4882a593Smuzhiyun 
4950*4882a593Smuzhiyun 	return ret;
4951*4882a593Smuzhiyun }
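
/*
 * Note (added): the size update runs after header_rwsem is dropped,
 * presumably so that set_capacity()/revalidate_disk_size() are not
 * called with the header lock held; by then mapping.size already
 * reflects the refreshed header.
 */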
4952*4882a593Smuzhiyun 
4953*4882a593Smuzhiyun static const struct blk_mq_ops rbd_mq_ops = {
4954*4882a593Smuzhiyun 	.queue_rq	= rbd_queue_rq,
4955*4882a593Smuzhiyun };
4956*4882a593Smuzhiyun 
4957*4882a593Smuzhiyun static int rbd_init_disk(struct rbd_device *rbd_dev)
4958*4882a593Smuzhiyun {
4959*4882a593Smuzhiyun 	struct gendisk *disk;
4960*4882a593Smuzhiyun 	struct request_queue *q;
4961*4882a593Smuzhiyun 	unsigned int objset_bytes =
4962*4882a593Smuzhiyun 	    rbd_dev->layout.object_size * rbd_dev->layout.stripe_count;
4963*4882a593Smuzhiyun 	int err;
4964*4882a593Smuzhiyun 
4965*4882a593Smuzhiyun 	/* create gendisk info */
4966*4882a593Smuzhiyun 	disk = alloc_disk(single_major ?
4967*4882a593Smuzhiyun 			  (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
4968*4882a593Smuzhiyun 			  RBD_MINORS_PER_MAJOR);
4969*4882a593Smuzhiyun 	if (!disk)
4970*4882a593Smuzhiyun 		return -ENOMEM;
4971*4882a593Smuzhiyun 
4972*4882a593Smuzhiyun 	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
4973*4882a593Smuzhiyun 		 rbd_dev->dev_id);
4974*4882a593Smuzhiyun 	disk->major = rbd_dev->major;
4975*4882a593Smuzhiyun 	disk->first_minor = rbd_dev->minor;
4976*4882a593Smuzhiyun 	if (single_major)
4977*4882a593Smuzhiyun 		disk->flags |= GENHD_FL_EXT_DEVT;
4978*4882a593Smuzhiyun 	disk->fops = &rbd_bd_ops;
4979*4882a593Smuzhiyun 	disk->private_data = rbd_dev;
4980*4882a593Smuzhiyun 
4981*4882a593Smuzhiyun 	memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
4982*4882a593Smuzhiyun 	rbd_dev->tag_set.ops = &rbd_mq_ops;
4983*4882a593Smuzhiyun 	rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
4984*4882a593Smuzhiyun 	rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
4985*4882a593Smuzhiyun 	rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
4986*4882a593Smuzhiyun 	rbd_dev->tag_set.nr_hw_queues = num_present_cpus();
4987*4882a593Smuzhiyun 	rbd_dev->tag_set.cmd_size = sizeof(struct rbd_img_request);
4988*4882a593Smuzhiyun 
4989*4882a593Smuzhiyun 	err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
4990*4882a593Smuzhiyun 	if (err)
4991*4882a593Smuzhiyun 		goto out_disk;
4992*4882a593Smuzhiyun 
4993*4882a593Smuzhiyun 	q = blk_mq_init_queue(&rbd_dev->tag_set);
4994*4882a593Smuzhiyun 	if (IS_ERR(q)) {
4995*4882a593Smuzhiyun 		err = PTR_ERR(q);
4996*4882a593Smuzhiyun 		goto out_tag_set;
4997*4882a593Smuzhiyun 	}
4998*4882a593Smuzhiyun 
4999*4882a593Smuzhiyun 	blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
5000*4882a593Smuzhiyun 	/* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
5001*4882a593Smuzhiyun 
5002*4882a593Smuzhiyun 	blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT);
5003*4882a593Smuzhiyun 	q->limits.max_sectors = queue_max_hw_sectors(q);
5004*4882a593Smuzhiyun 	blk_queue_max_segments(q, USHRT_MAX);
5005*4882a593Smuzhiyun 	blk_queue_max_segment_size(q, UINT_MAX);
5006*4882a593Smuzhiyun 	blk_queue_io_min(q, rbd_dev->opts->alloc_size);
5007*4882a593Smuzhiyun 	blk_queue_io_opt(q, rbd_dev->opts->alloc_size);
5008*4882a593Smuzhiyun 
5009*4882a593Smuzhiyun 	if (rbd_dev->opts->trim) {
5010*4882a593Smuzhiyun 		blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
5011*4882a593Smuzhiyun 		q->limits.discard_granularity = rbd_dev->opts->alloc_size;
5012*4882a593Smuzhiyun 		blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
5013*4882a593Smuzhiyun 		blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
5014*4882a593Smuzhiyun 	}
5015*4882a593Smuzhiyun 
5016*4882a593Smuzhiyun 	if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
5017*4882a593Smuzhiyun 		blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
5018*4882a593Smuzhiyun 
5019*4882a593Smuzhiyun 	/*
5020*4882a593Smuzhiyun 	 * disk_release() expects a queue ref from add_disk() and will
5021*4882a593Smuzhiyun 	 * put it.  Hold an extra ref until add_disk() is called.
5022*4882a593Smuzhiyun 	 */
5023*4882a593Smuzhiyun 	WARN_ON(!blk_get_queue(q));
5024*4882a593Smuzhiyun 	disk->queue = q;
5025*4882a593Smuzhiyun 	q->queuedata = rbd_dev;
5026*4882a593Smuzhiyun 
5027*4882a593Smuzhiyun 	rbd_dev->disk = disk;
5028*4882a593Smuzhiyun 
5029*4882a593Smuzhiyun 	return 0;
5030*4882a593Smuzhiyun out_tag_set:
5031*4882a593Smuzhiyun 	blk_mq_free_tag_set(&rbd_dev->tag_set);
5032*4882a593Smuzhiyun out_disk:
5033*4882a593Smuzhiyun 	put_disk(disk);
5034*4882a593Smuzhiyun 	return err;
5035*4882a593Smuzhiyun }
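
/*
 * Worked example (added, assuming the common defaults of 4 MiB objects
 * and stripe_count 1): objset_bytes = 4 MiB, so max_hw_sectors and the
 * discard/write-zeroes limits above come to 4 MiB >> SECTOR_SHIFT =
 * 8192 sectors, i.e. one complete object set per request.
 */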
5036*4882a593Smuzhiyun 
5037*4882a593Smuzhiyun /*
5038*4882a593Smuzhiyun   sysfs
5039*4882a593Smuzhiyun */
5040*4882a593Smuzhiyun 
5041*4882a593Smuzhiyun static struct rbd_device *dev_to_rbd_dev(struct device *dev)
5042*4882a593Smuzhiyun {
5043*4882a593Smuzhiyun 	return container_of(dev, struct rbd_device, dev);
5044*4882a593Smuzhiyun }
5045*4882a593Smuzhiyun 
5046*4882a593Smuzhiyun static ssize_t rbd_size_show(struct device *dev,
5047*4882a593Smuzhiyun 			     struct device_attribute *attr, char *buf)
5048*4882a593Smuzhiyun {
5049*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5050*4882a593Smuzhiyun 
5051*4882a593Smuzhiyun 	return sprintf(buf, "%llu\n",
5052*4882a593Smuzhiyun 		(unsigned long long)rbd_dev->mapping.size);
5053*4882a593Smuzhiyun }
5054*4882a593Smuzhiyun 
5055*4882a593Smuzhiyun static ssize_t rbd_features_show(struct device *dev,
5056*4882a593Smuzhiyun 			     struct device_attribute *attr, char *buf)
5057*4882a593Smuzhiyun {
5058*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5059*4882a593Smuzhiyun 
5060*4882a593Smuzhiyun 	return sprintf(buf, "0x%016llx\n", rbd_dev->header.features);
5061*4882a593Smuzhiyun }
5062*4882a593Smuzhiyun 
5063*4882a593Smuzhiyun static ssize_t rbd_major_show(struct device *dev,
5064*4882a593Smuzhiyun 			      struct device_attribute *attr, char *buf)
5065*4882a593Smuzhiyun {
5066*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5067*4882a593Smuzhiyun 
5068*4882a593Smuzhiyun 	if (rbd_dev->major)
5069*4882a593Smuzhiyun 		return sprintf(buf, "%d\n", rbd_dev->major);
5070*4882a593Smuzhiyun 
5071*4882a593Smuzhiyun 	return sprintf(buf, "(none)\n");
5072*4882a593Smuzhiyun }
5073*4882a593Smuzhiyun 
5074*4882a593Smuzhiyun static ssize_t rbd_minor_show(struct device *dev,
5075*4882a593Smuzhiyun 			      struct device_attribute *attr, char *buf)
5076*4882a593Smuzhiyun {
5077*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5078*4882a593Smuzhiyun 
5079*4882a593Smuzhiyun 	return sprintf(buf, "%d\n", rbd_dev->minor);
5080*4882a593Smuzhiyun }
5081*4882a593Smuzhiyun 
5082*4882a593Smuzhiyun static ssize_t rbd_client_addr_show(struct device *dev,
5083*4882a593Smuzhiyun 				    struct device_attribute *attr, char *buf)
5084*4882a593Smuzhiyun {
5085*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5086*4882a593Smuzhiyun 	struct ceph_entity_addr *client_addr =
5087*4882a593Smuzhiyun 	    ceph_client_addr(rbd_dev->rbd_client->client);
5088*4882a593Smuzhiyun 
5089*4882a593Smuzhiyun 	return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
5090*4882a593Smuzhiyun 		       le32_to_cpu(client_addr->nonce));
5091*4882a593Smuzhiyun }
5092*4882a593Smuzhiyun 
5093*4882a593Smuzhiyun static ssize_t rbd_client_id_show(struct device *dev,
5094*4882a593Smuzhiyun 				  struct device_attribute *attr, char *buf)
5095*4882a593Smuzhiyun {
5096*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5097*4882a593Smuzhiyun 
5098*4882a593Smuzhiyun 	return sprintf(buf, "client%lld\n",
5099*4882a593Smuzhiyun 		       ceph_client_gid(rbd_dev->rbd_client->client));
5100*4882a593Smuzhiyun }
5101*4882a593Smuzhiyun 
5102*4882a593Smuzhiyun static ssize_t rbd_cluster_fsid_show(struct device *dev,
5103*4882a593Smuzhiyun 				     struct device_attribute *attr, char *buf)
5104*4882a593Smuzhiyun {
5105*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5106*4882a593Smuzhiyun 
5107*4882a593Smuzhiyun 	return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
5108*4882a593Smuzhiyun }
5109*4882a593Smuzhiyun 
5110*4882a593Smuzhiyun static ssize_t rbd_config_info_show(struct device *dev,
5111*4882a593Smuzhiyun 				    struct device_attribute *attr, char *buf)
5112*4882a593Smuzhiyun {
5113*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5114*4882a593Smuzhiyun 
5115*4882a593Smuzhiyun 	if (!capable(CAP_SYS_ADMIN))
5116*4882a593Smuzhiyun 		return -EPERM;
5117*4882a593Smuzhiyun 
5118*4882a593Smuzhiyun 	return sprintf(buf, "%s\n", rbd_dev->config_info);
5119*4882a593Smuzhiyun }
5120*4882a593Smuzhiyun 
5121*4882a593Smuzhiyun static ssize_t rbd_pool_show(struct device *dev,
5122*4882a593Smuzhiyun 			     struct device_attribute *attr, char *buf)
5123*4882a593Smuzhiyun {
5124*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5125*4882a593Smuzhiyun 
5126*4882a593Smuzhiyun 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
5127*4882a593Smuzhiyun }
5128*4882a593Smuzhiyun 
5129*4882a593Smuzhiyun static ssize_t rbd_pool_id_show(struct device *dev,
5130*4882a593Smuzhiyun 			     struct device_attribute *attr, char *buf)
5131*4882a593Smuzhiyun {
5132*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5133*4882a593Smuzhiyun 
5134*4882a593Smuzhiyun 	return sprintf(buf, "%llu\n",
5135*4882a593Smuzhiyun 			(unsigned long long) rbd_dev->spec->pool_id);
5136*4882a593Smuzhiyun }
5137*4882a593Smuzhiyun 
5138*4882a593Smuzhiyun static ssize_t rbd_pool_ns_show(struct device *dev,
5139*4882a593Smuzhiyun 				struct device_attribute *attr, char *buf)
5140*4882a593Smuzhiyun {
5141*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5142*4882a593Smuzhiyun 
5143*4882a593Smuzhiyun 	return sprintf(buf, "%s\n", rbd_dev->spec->pool_ns ?: "");
5144*4882a593Smuzhiyun }
5145*4882a593Smuzhiyun 
5146*4882a593Smuzhiyun static ssize_t rbd_name_show(struct device *dev,
5147*4882a593Smuzhiyun 			     struct device_attribute *attr, char *buf)
5148*4882a593Smuzhiyun {
5149*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5150*4882a593Smuzhiyun 
5151*4882a593Smuzhiyun 	if (rbd_dev->spec->image_name)
5152*4882a593Smuzhiyun 		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
5153*4882a593Smuzhiyun 
5154*4882a593Smuzhiyun 	return sprintf(buf, "(unknown)\n");
5155*4882a593Smuzhiyun }
5156*4882a593Smuzhiyun 
5157*4882a593Smuzhiyun static ssize_t rbd_image_id_show(struct device *dev,
5158*4882a593Smuzhiyun 			     struct device_attribute *attr, char *buf)
5159*4882a593Smuzhiyun {
5160*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5161*4882a593Smuzhiyun 
5162*4882a593Smuzhiyun 	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
5163*4882a593Smuzhiyun }
5164*4882a593Smuzhiyun 
5165*4882a593Smuzhiyun /*
5166*4882a593Smuzhiyun  * Shows the name of the currently-mapped snapshot (or
5167*4882a593Smuzhiyun  * RBD_SNAP_HEAD_NAME for the base image).
5168*4882a593Smuzhiyun  */
5169*4882a593Smuzhiyun static ssize_t rbd_snap_show(struct device *dev,
5170*4882a593Smuzhiyun 			     struct device_attribute *attr,
5171*4882a593Smuzhiyun 			     char *buf)
5172*4882a593Smuzhiyun {
5173*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5174*4882a593Smuzhiyun 
5175*4882a593Smuzhiyun 	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
5176*4882a593Smuzhiyun }
5177*4882a593Smuzhiyun 
5178*4882a593Smuzhiyun static ssize_t rbd_snap_id_show(struct device *dev,
5179*4882a593Smuzhiyun 				struct device_attribute *attr, char *buf)
5180*4882a593Smuzhiyun {
5181*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5182*4882a593Smuzhiyun 
5183*4882a593Smuzhiyun 	return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
5184*4882a593Smuzhiyun }
5185*4882a593Smuzhiyun 
5186*4882a593Smuzhiyun /*
5187*4882a593Smuzhiyun  * For a v2 image, shows the chain of parent images, separated by empty
5188*4882a593Smuzhiyun  * lines.  For v1 images or if there is no parent, shows "(no parent
5189*4882a593Smuzhiyun  * image)".
5190*4882a593Smuzhiyun  */
5191*4882a593Smuzhiyun static ssize_t rbd_parent_show(struct device *dev,
5192*4882a593Smuzhiyun 			       struct device_attribute *attr,
5193*4882a593Smuzhiyun 			       char *buf)
5194*4882a593Smuzhiyun {
5195*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5196*4882a593Smuzhiyun 	ssize_t count = 0;
5197*4882a593Smuzhiyun 
5198*4882a593Smuzhiyun 	if (!rbd_dev->parent)
5199*4882a593Smuzhiyun 		return sprintf(buf, "(no parent image)\n");
5200*4882a593Smuzhiyun 
5201*4882a593Smuzhiyun 	for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
5202*4882a593Smuzhiyun 		struct rbd_spec *spec = rbd_dev->parent_spec;
5203*4882a593Smuzhiyun 
5204*4882a593Smuzhiyun 		count += sprintf(&buf[count], "%s"
5205*4882a593Smuzhiyun 			    "pool_id %llu\npool_name %s\n"
5206*4882a593Smuzhiyun 			    "pool_ns %s\n"
5207*4882a593Smuzhiyun 			    "image_id %s\nimage_name %s\n"
5208*4882a593Smuzhiyun 			    "snap_id %llu\nsnap_name %s\n"
5209*4882a593Smuzhiyun 			    "overlap %llu\n",
5210*4882a593Smuzhiyun 			    !count ? "" : "\n", /* first? */
5211*4882a593Smuzhiyun 			    spec->pool_id, spec->pool_name,
5212*4882a593Smuzhiyun 			    spec->pool_ns ?: "",
5213*4882a593Smuzhiyun 			    spec->image_id, spec->image_name ?: "(unknown)",
5214*4882a593Smuzhiyun 			    spec->snap_id, spec->snap_name,
5215*4882a593Smuzhiyun 			    rbd_dev->parent_overlap);
5216*4882a593Smuzhiyun 	}
5217*4882a593Smuzhiyun 
5218*4882a593Smuzhiyun 	return count;
5219*4882a593Smuzhiyun }
5220*4882a593Smuzhiyun 
5221*4882a593Smuzhiyun static ssize_t rbd_image_refresh(struct device *dev,
5222*4882a593Smuzhiyun 				 struct device_attribute *attr,
5223*4882a593Smuzhiyun 				 const char *buf,
5224*4882a593Smuzhiyun 				 size_t size)
5225*4882a593Smuzhiyun {
5226*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5227*4882a593Smuzhiyun 	int ret;
5228*4882a593Smuzhiyun 
5229*4882a593Smuzhiyun 	if (!capable(CAP_SYS_ADMIN))
5230*4882a593Smuzhiyun 		return -EPERM;
5231*4882a593Smuzhiyun 
5232*4882a593Smuzhiyun 	ret = rbd_dev_refresh(rbd_dev);
5233*4882a593Smuzhiyun 	if (ret)
5234*4882a593Smuzhiyun 		return ret;
5235*4882a593Smuzhiyun 
5236*4882a593Smuzhiyun 	return size;
5237*4882a593Smuzhiyun }
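
/*
 * Usage sketch (added): writing anything to the write-only "refresh"
 * attribute declared below, e.g.
 *
 *	echo 1 > /sys/bus/rbd/devices/<id>/refresh
 *
 * forces a header re-read via rbd_dev_refresh().  See
 * Documentation/ABI/testing/sysfs-bus-rbd for the full attribute list.
 */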
5238*4882a593Smuzhiyun 
5239*4882a593Smuzhiyun static DEVICE_ATTR(size, 0444, rbd_size_show, NULL);
5240*4882a593Smuzhiyun static DEVICE_ATTR(features, 0444, rbd_features_show, NULL);
5241*4882a593Smuzhiyun static DEVICE_ATTR(major, 0444, rbd_major_show, NULL);
5242*4882a593Smuzhiyun static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL);
5243*4882a593Smuzhiyun static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL);
5244*4882a593Smuzhiyun static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL);
5245*4882a593Smuzhiyun static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL);
5246*4882a593Smuzhiyun static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL);
5247*4882a593Smuzhiyun static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL);
5248*4882a593Smuzhiyun static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL);
5249*4882a593Smuzhiyun static DEVICE_ATTR(pool_ns, 0444, rbd_pool_ns_show, NULL);
5250*4882a593Smuzhiyun static DEVICE_ATTR(name, 0444, rbd_name_show, NULL);
5251*4882a593Smuzhiyun static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL);
5252*4882a593Smuzhiyun static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh);
5253*4882a593Smuzhiyun static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL);
5254*4882a593Smuzhiyun static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL);
5255*4882a593Smuzhiyun static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL);
5256*4882a593Smuzhiyun 
5257*4882a593Smuzhiyun static struct attribute *rbd_attrs[] = {
5258*4882a593Smuzhiyun 	&dev_attr_size.attr,
5259*4882a593Smuzhiyun 	&dev_attr_features.attr,
5260*4882a593Smuzhiyun 	&dev_attr_major.attr,
5261*4882a593Smuzhiyun 	&dev_attr_minor.attr,
5262*4882a593Smuzhiyun 	&dev_attr_client_addr.attr,
5263*4882a593Smuzhiyun 	&dev_attr_client_id.attr,
5264*4882a593Smuzhiyun 	&dev_attr_cluster_fsid.attr,
5265*4882a593Smuzhiyun 	&dev_attr_config_info.attr,
5266*4882a593Smuzhiyun 	&dev_attr_pool.attr,
5267*4882a593Smuzhiyun 	&dev_attr_pool_id.attr,
5268*4882a593Smuzhiyun 	&dev_attr_pool_ns.attr,
5269*4882a593Smuzhiyun 	&dev_attr_name.attr,
5270*4882a593Smuzhiyun 	&dev_attr_image_id.attr,
5271*4882a593Smuzhiyun 	&dev_attr_current_snap.attr,
5272*4882a593Smuzhiyun 	&dev_attr_snap_id.attr,
5273*4882a593Smuzhiyun 	&dev_attr_parent.attr,
5274*4882a593Smuzhiyun 	&dev_attr_refresh.attr,
5275*4882a593Smuzhiyun 	NULL
5276*4882a593Smuzhiyun };
5277*4882a593Smuzhiyun 
5278*4882a593Smuzhiyun static struct attribute_group rbd_attr_group = {
5279*4882a593Smuzhiyun 	.attrs = rbd_attrs,
5280*4882a593Smuzhiyun };
5281*4882a593Smuzhiyun 
5282*4882a593Smuzhiyun static const struct attribute_group *rbd_attr_groups[] = {
5283*4882a593Smuzhiyun 	&rbd_attr_group,
5284*4882a593Smuzhiyun 	NULL
5285*4882a593Smuzhiyun };
5286*4882a593Smuzhiyun 
5287*4882a593Smuzhiyun static void rbd_dev_release(struct device *dev);
5288*4882a593Smuzhiyun 
5289*4882a593Smuzhiyun static const struct device_type rbd_device_type = {
5290*4882a593Smuzhiyun 	.name		= "rbd",
5291*4882a593Smuzhiyun 	.groups		= rbd_attr_groups,
5292*4882a593Smuzhiyun 	.release	= rbd_dev_release,
5293*4882a593Smuzhiyun };
5294*4882a593Smuzhiyun 
5295*4882a593Smuzhiyun static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
5296*4882a593Smuzhiyun {
5297*4882a593Smuzhiyun 	kref_get(&spec->kref);
5298*4882a593Smuzhiyun 
5299*4882a593Smuzhiyun 	return spec;
5300*4882a593Smuzhiyun }
5301*4882a593Smuzhiyun 
5302*4882a593Smuzhiyun static void rbd_spec_free(struct kref *kref);
5303*4882a593Smuzhiyun static void rbd_spec_put(struct rbd_spec *spec)
5304*4882a593Smuzhiyun {
5305*4882a593Smuzhiyun 	if (spec)
5306*4882a593Smuzhiyun 		kref_put(&spec->kref, rbd_spec_free);
5307*4882a593Smuzhiyun }
5308*4882a593Smuzhiyun 
5309*4882a593Smuzhiyun static struct rbd_spec *rbd_spec_alloc(void)
5310*4882a593Smuzhiyun {
5311*4882a593Smuzhiyun 	struct rbd_spec *spec;
5312*4882a593Smuzhiyun 
5313*4882a593Smuzhiyun 	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
5314*4882a593Smuzhiyun 	if (!spec)
5315*4882a593Smuzhiyun 		return NULL;
5316*4882a593Smuzhiyun 
5317*4882a593Smuzhiyun 	spec->pool_id = CEPH_NOPOOL;
5318*4882a593Smuzhiyun 	spec->snap_id = CEPH_NOSNAP;
5319*4882a593Smuzhiyun 	kref_init(&spec->kref);
5320*4882a593Smuzhiyun 
5321*4882a593Smuzhiyun 	return spec;
5322*4882a593Smuzhiyun }
5323*4882a593Smuzhiyun 
5324*4882a593Smuzhiyun static void rbd_spec_free(struct kref *kref)
5325*4882a593Smuzhiyun {
5326*4882a593Smuzhiyun 	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
5327*4882a593Smuzhiyun 
5328*4882a593Smuzhiyun 	kfree(spec->pool_name);
5329*4882a593Smuzhiyun 	kfree(spec->pool_ns);
5330*4882a593Smuzhiyun 	kfree(spec->image_id);
5331*4882a593Smuzhiyun 	kfree(spec->image_name);
5332*4882a593Smuzhiyun 	kfree(spec->snap_name);
5333*4882a593Smuzhiyun 	kfree(spec);
5334*4882a593Smuzhiyun }
5335*4882a593Smuzhiyun 
5336*4882a593Smuzhiyun static void rbd_dev_free(struct rbd_device *rbd_dev)
5337*4882a593Smuzhiyun {
5338*4882a593Smuzhiyun 	WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
5339*4882a593Smuzhiyun 	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);
5340*4882a593Smuzhiyun 
5341*4882a593Smuzhiyun 	ceph_oid_destroy(&rbd_dev->header_oid);
5342*4882a593Smuzhiyun 	ceph_oloc_destroy(&rbd_dev->header_oloc);
5343*4882a593Smuzhiyun 	kfree(rbd_dev->config_info);
5344*4882a593Smuzhiyun 
5345*4882a593Smuzhiyun 	rbd_put_client(rbd_dev->rbd_client);
5346*4882a593Smuzhiyun 	rbd_spec_put(rbd_dev->spec);
5347*4882a593Smuzhiyun 	kfree(rbd_dev->opts);
5348*4882a593Smuzhiyun 	kfree(rbd_dev);
5349*4882a593Smuzhiyun }
5350*4882a593Smuzhiyun 
5351*4882a593Smuzhiyun static void rbd_dev_release(struct device *dev)
5352*4882a593Smuzhiyun {
5353*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5354*4882a593Smuzhiyun 	bool need_put = !!rbd_dev->opts;
5355*4882a593Smuzhiyun 
5356*4882a593Smuzhiyun 	if (need_put) {
5357*4882a593Smuzhiyun 		destroy_workqueue(rbd_dev->task_wq);
5358*4882a593Smuzhiyun 		ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
5359*4882a593Smuzhiyun 	}
5360*4882a593Smuzhiyun 
5361*4882a593Smuzhiyun 	rbd_dev_free(rbd_dev);
5362*4882a593Smuzhiyun 
5363*4882a593Smuzhiyun 	/*
5364*4882a593Smuzhiyun 	 * This is racy, but way better than putting module outside of
5365*4882a593Smuzhiyun 	 * the release callback.  The race window is pretty small, so
5366*4882a593Smuzhiyun 	 * doing something similar to dm (dm-builtin.c) is overkill.
5367*4882a593Smuzhiyun 	 */
5368*4882a593Smuzhiyun 	if (need_put)
5369*4882a593Smuzhiyun 		module_put(THIS_MODULE);
5370*4882a593Smuzhiyun }
5371*4882a593Smuzhiyun 
5372*4882a593Smuzhiyun static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
5373*4882a593Smuzhiyun 					   struct rbd_spec *spec)
5374*4882a593Smuzhiyun {
5375*4882a593Smuzhiyun 	struct rbd_device *rbd_dev;
5376*4882a593Smuzhiyun 
5377*4882a593Smuzhiyun 	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
5378*4882a593Smuzhiyun 	if (!rbd_dev)
5379*4882a593Smuzhiyun 		return NULL;
5380*4882a593Smuzhiyun 
5381*4882a593Smuzhiyun 	spin_lock_init(&rbd_dev->lock);
5382*4882a593Smuzhiyun 	INIT_LIST_HEAD(&rbd_dev->node);
5383*4882a593Smuzhiyun 	init_rwsem(&rbd_dev->header_rwsem);
5384*4882a593Smuzhiyun 
5385*4882a593Smuzhiyun 	rbd_dev->header.data_pool_id = CEPH_NOPOOL;
5386*4882a593Smuzhiyun 	ceph_oid_init(&rbd_dev->header_oid);
5387*4882a593Smuzhiyun 	rbd_dev->header_oloc.pool = spec->pool_id;
5388*4882a593Smuzhiyun 	if (spec->pool_ns) {
5389*4882a593Smuzhiyun 		WARN_ON(!*spec->pool_ns);
5390*4882a593Smuzhiyun 		rbd_dev->header_oloc.pool_ns =
5391*4882a593Smuzhiyun 		    ceph_find_or_create_string(spec->pool_ns,
5392*4882a593Smuzhiyun 					       strlen(spec->pool_ns));
5393*4882a593Smuzhiyun 	}
5394*4882a593Smuzhiyun 
5395*4882a593Smuzhiyun 	mutex_init(&rbd_dev->watch_mutex);
5396*4882a593Smuzhiyun 	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
5397*4882a593Smuzhiyun 	INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);
5398*4882a593Smuzhiyun 
5399*4882a593Smuzhiyun 	init_rwsem(&rbd_dev->lock_rwsem);
5400*4882a593Smuzhiyun 	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
5401*4882a593Smuzhiyun 	INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
5402*4882a593Smuzhiyun 	INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
5403*4882a593Smuzhiyun 	INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
5404*4882a593Smuzhiyun 	INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
5405*4882a593Smuzhiyun 	spin_lock_init(&rbd_dev->lock_lists_lock);
5406*4882a593Smuzhiyun 	INIT_LIST_HEAD(&rbd_dev->acquiring_list);
5407*4882a593Smuzhiyun 	INIT_LIST_HEAD(&rbd_dev->running_list);
5408*4882a593Smuzhiyun 	init_completion(&rbd_dev->acquire_wait);
5409*4882a593Smuzhiyun 	init_completion(&rbd_dev->releasing_wait);
5410*4882a593Smuzhiyun 
5411*4882a593Smuzhiyun 	spin_lock_init(&rbd_dev->object_map_lock);
5412*4882a593Smuzhiyun 
5413*4882a593Smuzhiyun 	rbd_dev->dev.bus = &rbd_bus_type;
5414*4882a593Smuzhiyun 	rbd_dev->dev.type = &rbd_device_type;
5415*4882a593Smuzhiyun 	rbd_dev->dev.parent = &rbd_root_dev;
5416*4882a593Smuzhiyun 	device_initialize(&rbd_dev->dev);
5417*4882a593Smuzhiyun 
5418*4882a593Smuzhiyun 	rbd_dev->rbd_client = rbdc;
5419*4882a593Smuzhiyun 	rbd_dev->spec = spec;
5420*4882a593Smuzhiyun 
5421*4882a593Smuzhiyun 	return rbd_dev;
5422*4882a593Smuzhiyun }
5423*4882a593Smuzhiyun 
5424*4882a593Smuzhiyun /*
5425*4882a593Smuzhiyun  * Create a mapping rbd_dev.
5426*4882a593Smuzhiyun  */
5427*4882a593Smuzhiyun static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
5428*4882a593Smuzhiyun 					 struct rbd_spec *spec,
5429*4882a593Smuzhiyun 					 struct rbd_options *opts)
5430*4882a593Smuzhiyun {
5431*4882a593Smuzhiyun 	struct rbd_device *rbd_dev;
5432*4882a593Smuzhiyun 
5433*4882a593Smuzhiyun 	rbd_dev = __rbd_dev_create(rbdc, spec);
5434*4882a593Smuzhiyun 	if (!rbd_dev)
5435*4882a593Smuzhiyun 		return NULL;
5436*4882a593Smuzhiyun 
5437*4882a593Smuzhiyun 	rbd_dev->opts = opts;
5438*4882a593Smuzhiyun 
5439*4882a593Smuzhiyun 	/* get an id and fill in device name */
5440*4882a593Smuzhiyun 	rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
5441*4882a593Smuzhiyun 					 minor_to_rbd_dev_id(1 << MINORBITS),
5442*4882a593Smuzhiyun 					 GFP_KERNEL);
5443*4882a593Smuzhiyun 	if (rbd_dev->dev_id < 0)
5444*4882a593Smuzhiyun 		goto fail_rbd_dev;
5445*4882a593Smuzhiyun 
5446*4882a593Smuzhiyun 	sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
5447*4882a593Smuzhiyun 	rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
5448*4882a593Smuzhiyun 						   rbd_dev->name);
5449*4882a593Smuzhiyun 	if (!rbd_dev->task_wq)
5450*4882a593Smuzhiyun 		goto fail_dev_id;
5451*4882a593Smuzhiyun 
5452*4882a593Smuzhiyun 	/* we have a ref from do_rbd_add() */
5453*4882a593Smuzhiyun 	__module_get(THIS_MODULE);
5454*4882a593Smuzhiyun 
5455*4882a593Smuzhiyun 	dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
5456*4882a593Smuzhiyun 	return rbd_dev;
5457*4882a593Smuzhiyun 
5458*4882a593Smuzhiyun fail_dev_id:
5459*4882a593Smuzhiyun 	ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
5460*4882a593Smuzhiyun fail_rbd_dev:
5461*4882a593Smuzhiyun 	rbd_dev_free(rbd_dev);
5462*4882a593Smuzhiyun 	return NULL;
5463*4882a593Smuzhiyun }
5464*4882a593Smuzhiyun 
5465*4882a593Smuzhiyun static void rbd_dev_destroy(struct rbd_device *rbd_dev)
5466*4882a593Smuzhiyun {
5467*4882a593Smuzhiyun 	if (rbd_dev)
5468*4882a593Smuzhiyun 		put_device(&rbd_dev->dev);
5469*4882a593Smuzhiyun }
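
/*
 * Note (added): put_device() drops the reference taken by
 * device_initialize() in __rbd_dev_create(); when it is the last one,
 * the driver core invokes rbd_dev_release() above, which frees the
 * device and, for mapping devices, returns the dev_id and module ref.
 */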
5470*4882a593Smuzhiyun 
5471*4882a593Smuzhiyun /*
5472*4882a593Smuzhiyun  * Get the size and object order for an image snapshot, or if
5473*4882a593Smuzhiyun  * snap_id is CEPH_NOSNAP, gets this information for the base
5474*4882a593Smuzhiyun  * image.
5475*4882a593Smuzhiyun  */
5476*4882a593Smuzhiyun static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
5477*4882a593Smuzhiyun 				u8 *order, u64 *snap_size)
5478*4882a593Smuzhiyun {
5479*4882a593Smuzhiyun 	__le64 snapid = cpu_to_le64(snap_id);
5480*4882a593Smuzhiyun 	int ret;
5481*4882a593Smuzhiyun 	struct {
5482*4882a593Smuzhiyun 		u8 order;
5483*4882a593Smuzhiyun 		__le64 size;
5484*4882a593Smuzhiyun 	} __attribute__ ((packed)) size_buf = { 0 };
5485*4882a593Smuzhiyun 
5486*4882a593Smuzhiyun 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5487*4882a593Smuzhiyun 				  &rbd_dev->header_oloc, "get_size",
5488*4882a593Smuzhiyun 				  &snapid, sizeof(snapid),
5489*4882a593Smuzhiyun 				  &size_buf, sizeof(size_buf));
5490*4882a593Smuzhiyun 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5491*4882a593Smuzhiyun 	if (ret < 0)
5492*4882a593Smuzhiyun 		return ret;
5493*4882a593Smuzhiyun 	if (ret < sizeof (size_buf))
5494*4882a593Smuzhiyun 		return -ERANGE;
5495*4882a593Smuzhiyun 
5496*4882a593Smuzhiyun 	if (order) {
5497*4882a593Smuzhiyun 		*order = size_buf.order;
5498*4882a593Smuzhiyun 		dout("  order %u", (unsigned int)*order);
5499*4882a593Smuzhiyun 	}
5500*4882a593Smuzhiyun 	*snap_size = le64_to_cpu(size_buf.size);
5501*4882a593Smuzhiyun 
5502*4882a593Smuzhiyun 	dout("  snap_id 0x%016llx snap_size = %llu\n",
5503*4882a593Smuzhiyun 		(unsigned long long)snap_id,
5504*4882a593Smuzhiyun 		(unsigned long long)*snap_size);
5505*4882a593Smuzhiyun 
5506*4882a593Smuzhiyun 	return 0;
5507*4882a593Smuzhiyun }
5508*4882a593Smuzhiyun 
5509*4882a593Smuzhiyun static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
5510*4882a593Smuzhiyun {
5511*4882a593Smuzhiyun 	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
5512*4882a593Smuzhiyun 					&rbd_dev->header.obj_order,
5513*4882a593Smuzhiyun 					&rbd_dev->header.image_size);
5514*4882a593Smuzhiyun }
5515*4882a593Smuzhiyun 
5516*4882a593Smuzhiyun static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
5517*4882a593Smuzhiyun {
5518*4882a593Smuzhiyun 	size_t size;
5519*4882a593Smuzhiyun 	void *reply_buf;
5520*4882a593Smuzhiyun 	int ret;
5521*4882a593Smuzhiyun 	void *p;
5522*4882a593Smuzhiyun 
5523*4882a593Smuzhiyun 	/* Response will be an encoded string, which includes a length */
5524*4882a593Smuzhiyun 	size = sizeof(__le32) + RBD_OBJ_PREFIX_LEN_MAX;
5525*4882a593Smuzhiyun 	reply_buf = kzalloc(size, GFP_KERNEL);
5526*4882a593Smuzhiyun 	if (!reply_buf)
5527*4882a593Smuzhiyun 		return -ENOMEM;
5528*4882a593Smuzhiyun 
5529*4882a593Smuzhiyun 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5530*4882a593Smuzhiyun 				  &rbd_dev->header_oloc, "get_object_prefix",
5531*4882a593Smuzhiyun 				  NULL, 0, reply_buf, size);
5532*4882a593Smuzhiyun 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5533*4882a593Smuzhiyun 	if (ret < 0)
5534*4882a593Smuzhiyun 		goto out;
5535*4882a593Smuzhiyun 
5536*4882a593Smuzhiyun 	p = reply_buf;
5537*4882a593Smuzhiyun 	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
5538*4882a593Smuzhiyun 						p + ret, NULL, GFP_NOIO);
5539*4882a593Smuzhiyun 	ret = 0;
5540*4882a593Smuzhiyun 
5541*4882a593Smuzhiyun 	if (IS_ERR(rbd_dev->header.object_prefix)) {
5542*4882a593Smuzhiyun 		ret = PTR_ERR(rbd_dev->header.object_prefix);
5543*4882a593Smuzhiyun 		rbd_dev->header.object_prefix = NULL;
5544*4882a593Smuzhiyun 	} else {
5545*4882a593Smuzhiyun 		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
5546*4882a593Smuzhiyun 	}
5547*4882a593Smuzhiyun out:
5548*4882a593Smuzhiyun 	kfree(reply_buf);
5549*4882a593Smuzhiyun 
5550*4882a593Smuzhiyun 	return ret;
5551*4882a593Smuzhiyun }
5552*4882a593Smuzhiyun 
5553*4882a593Smuzhiyun static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
5554*4882a593Smuzhiyun 				     bool read_only, u64 *snap_features)
5555*4882a593Smuzhiyun {
5556*4882a593Smuzhiyun 	struct {
5557*4882a593Smuzhiyun 		__le64 snap_id;
5558*4882a593Smuzhiyun 		u8 read_only;
5559*4882a593Smuzhiyun 	} features_in;
5560*4882a593Smuzhiyun 	struct {
5561*4882a593Smuzhiyun 		__le64 features;
5562*4882a593Smuzhiyun 		__le64 incompat;
5563*4882a593Smuzhiyun 	} __attribute__ ((packed)) features_buf = { 0 };
5564*4882a593Smuzhiyun 	u64 unsup;
5565*4882a593Smuzhiyun 	int ret;
5566*4882a593Smuzhiyun 
5567*4882a593Smuzhiyun 	features_in.snap_id = cpu_to_le64(snap_id);
5568*4882a593Smuzhiyun 	features_in.read_only = read_only;
5569*4882a593Smuzhiyun 
5570*4882a593Smuzhiyun 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5571*4882a593Smuzhiyun 				  &rbd_dev->header_oloc, "get_features",
5572*4882a593Smuzhiyun 				  &features_in, sizeof(features_in),
5573*4882a593Smuzhiyun 				  &features_buf, sizeof(features_buf));
5574*4882a593Smuzhiyun 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5575*4882a593Smuzhiyun 	if (ret < 0)
5576*4882a593Smuzhiyun 		return ret;
5577*4882a593Smuzhiyun 	if (ret < sizeof (features_buf))
5578*4882a593Smuzhiyun 		return -ERANGE;
5579*4882a593Smuzhiyun 
5580*4882a593Smuzhiyun 	unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
5581*4882a593Smuzhiyun 	if (unsup) {
5582*4882a593Smuzhiyun 		rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
5583*4882a593Smuzhiyun 			 unsup);
5584*4882a593Smuzhiyun 		return -ENXIO;
5585*4882a593Smuzhiyun 	}
5586*4882a593Smuzhiyun 
5587*4882a593Smuzhiyun 	*snap_features = le64_to_cpu(features_buf.features);
5588*4882a593Smuzhiyun 
5589*4882a593Smuzhiyun 	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
5590*4882a593Smuzhiyun 		(unsigned long long)snap_id,
5591*4882a593Smuzhiyun 		(unsigned long long)*snap_features,
5592*4882a593Smuzhiyun 		(unsigned long long)le64_to_cpu(features_buf.incompat));
5593*4882a593Smuzhiyun 
5594*4882a593Smuzhiyun 	return 0;
5595*4882a593Smuzhiyun }
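
/*
 * Note (editor addition): the reply carries two masks -- "features"
 * (everything enabled on the image) and "incompat" (the subset a
 * client must understand to access the image safely).  Hence only
 * unknown *incompat* bits are fatal above; unknown bits that are
 * merely features are ignored.
 */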
5596*4882a593Smuzhiyun 
5597*4882a593Smuzhiyun static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
5598*4882a593Smuzhiyun {
5599*4882a593Smuzhiyun 	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
5600*4882a593Smuzhiyun 					 rbd_is_ro(rbd_dev),
5601*4882a593Smuzhiyun 					 &rbd_dev->header.features);
5602*4882a593Smuzhiyun }
5603*4882a593Smuzhiyun 
5604*4882a593Smuzhiyun /*
5605*4882a593Smuzhiyun  * These are generic image flags, but since they are used only for
5606*4882a593Smuzhiyun  * object map, store them in rbd_dev->object_map_flags.
5607*4882a593Smuzhiyun  *
5608*4882a593Smuzhiyun  * For the same reason, this function is called only on object map
5609*4882a593Smuzhiyun  * (re)load and not on header refresh.
5610*4882a593Smuzhiyun  */
5611*4882a593Smuzhiyun static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev)
5612*4882a593Smuzhiyun {
5613*4882a593Smuzhiyun 	__le64 snapid = cpu_to_le64(rbd_dev->spec->snap_id);
5614*4882a593Smuzhiyun 	__le64 flags;
5615*4882a593Smuzhiyun 	int ret;
5616*4882a593Smuzhiyun 
5617*4882a593Smuzhiyun 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5618*4882a593Smuzhiyun 				  &rbd_dev->header_oloc, "get_flags",
5619*4882a593Smuzhiyun 				  &snapid, sizeof(snapid),
5620*4882a593Smuzhiyun 				  &flags, sizeof(flags));
5621*4882a593Smuzhiyun 	if (ret < 0)
5622*4882a593Smuzhiyun 		return ret;
5623*4882a593Smuzhiyun 	if (ret < sizeof(flags))
5624*4882a593Smuzhiyun 		return -EBADMSG;
5625*4882a593Smuzhiyun 
5626*4882a593Smuzhiyun 	rbd_dev->object_map_flags = le64_to_cpu(flags);
5627*4882a593Smuzhiyun 	return 0;
5628*4882a593Smuzhiyun }
5629*4882a593Smuzhiyun 
5630*4882a593Smuzhiyun struct parent_image_info {
5631*4882a593Smuzhiyun 	u64		pool_id;
5632*4882a593Smuzhiyun 	const char	*pool_ns;
5633*4882a593Smuzhiyun 	const char	*image_id;
5634*4882a593Smuzhiyun 	u64		snap_id;
5635*4882a593Smuzhiyun 
5636*4882a593Smuzhiyun 	bool		has_overlap;
5637*4882a593Smuzhiyun 	u64		overlap;
5638*4882a593Smuzhiyun };
5639*4882a593Smuzhiyun 
5640*4882a593Smuzhiyun /*
5641*4882a593Smuzhiyun  * The caller is responsible for @pii.
5642*4882a593Smuzhiyun  */
5643*4882a593Smuzhiyun static int decode_parent_image_spec(void **p, void *end,
5644*4882a593Smuzhiyun 				    struct parent_image_info *pii)
5645*4882a593Smuzhiyun {
5646*4882a593Smuzhiyun 	u8 struct_v;
5647*4882a593Smuzhiyun 	u32 struct_len;
5648*4882a593Smuzhiyun 	int ret;
5649*4882a593Smuzhiyun 
5650*4882a593Smuzhiyun 	ret = ceph_start_decoding(p, end, 1, "ParentImageSpec",
5651*4882a593Smuzhiyun 				  &struct_v, &struct_len);
5652*4882a593Smuzhiyun 	if (ret)
5653*4882a593Smuzhiyun 		return ret;
5654*4882a593Smuzhiyun 
5655*4882a593Smuzhiyun 	ceph_decode_64_safe(p, end, pii->pool_id, e_inval);
5656*4882a593Smuzhiyun 	pii->pool_ns = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
5657*4882a593Smuzhiyun 	if (IS_ERR(pii->pool_ns)) {
5658*4882a593Smuzhiyun 		ret = PTR_ERR(pii->pool_ns);
5659*4882a593Smuzhiyun 		pii->pool_ns = NULL;
5660*4882a593Smuzhiyun 		return ret;
5661*4882a593Smuzhiyun 	}
5662*4882a593Smuzhiyun 	pii->image_id = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
5663*4882a593Smuzhiyun 	if (IS_ERR(pii->image_id)) {
5664*4882a593Smuzhiyun 		ret = PTR_ERR(pii->image_id);
5665*4882a593Smuzhiyun 		pii->image_id = NULL;
5666*4882a593Smuzhiyun 		return ret;
5667*4882a593Smuzhiyun 	}
5668*4882a593Smuzhiyun 	ceph_decode_64_safe(p, end, pii->snap_id, e_inval);
5669*4882a593Smuzhiyun 	return 0;
5670*4882a593Smuzhiyun 
5671*4882a593Smuzhiyun e_inval:
5672*4882a593Smuzhiyun 	return -EINVAL;
5673*4882a593Smuzhiyun }
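
/*
 * Illustrative sketch (editor addition, not part of the driver):
 * decode_parent_image_spec() can fail after allocating pii->pool_ns,
 * so "the caller is responsible for @pii" means freeing the strings on
 * every path, success or not.  Hypothetical usage:
 */
static int __maybe_unused rbd_example_decode_pii(void *buf, size_t len)
{
	struct parent_image_info pii = { 0 };
	void *p = buf;
	int ret;

	ret = decode_parent_image_spec(&p, buf + len, &pii);
	/* pool_ns may be set even when image_id extraction failed */
	kfree(pii.pool_ns);
	kfree(pii.image_id);
	return ret;
}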
5674*4882a593Smuzhiyun 
5675*4882a593Smuzhiyun static int __get_parent_info(struct rbd_device *rbd_dev,
5676*4882a593Smuzhiyun 			     struct page *req_page,
5677*4882a593Smuzhiyun 			     struct page *reply_page,
5678*4882a593Smuzhiyun 			     struct parent_image_info *pii)
5679*4882a593Smuzhiyun {
5680*4882a593Smuzhiyun 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5681*4882a593Smuzhiyun 	size_t reply_len = PAGE_SIZE;
5682*4882a593Smuzhiyun 	void *p, *end;
5683*4882a593Smuzhiyun 	int ret;
5684*4882a593Smuzhiyun 
5685*4882a593Smuzhiyun 	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5686*4882a593Smuzhiyun 			     "rbd", "parent_get", CEPH_OSD_FLAG_READ,
5687*4882a593Smuzhiyun 			     req_page, sizeof(u64), &reply_page, &reply_len);
5688*4882a593Smuzhiyun 	if (ret)
5689*4882a593Smuzhiyun 		return ret == -EOPNOTSUPP ? 1 : ret;
5690*4882a593Smuzhiyun 
5691*4882a593Smuzhiyun 	p = page_address(reply_page);
5692*4882a593Smuzhiyun 	end = p + reply_len;
5693*4882a593Smuzhiyun 	ret = decode_parent_image_spec(&p, end, pii);
5694*4882a593Smuzhiyun 	if (ret)
5695*4882a593Smuzhiyun 		return ret;
5696*4882a593Smuzhiyun 
5697*4882a593Smuzhiyun 	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5698*4882a593Smuzhiyun 			     "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ,
5699*4882a593Smuzhiyun 			     req_page, sizeof(u64), &reply_page, &reply_len);
5700*4882a593Smuzhiyun 	if (ret)
5701*4882a593Smuzhiyun 		return ret;
5702*4882a593Smuzhiyun 
5703*4882a593Smuzhiyun 	p = page_address(reply_page);
5704*4882a593Smuzhiyun 	end = p + reply_len;
5705*4882a593Smuzhiyun 	ceph_decode_8_safe(&p, end, pii->has_overlap, e_inval);
5706*4882a593Smuzhiyun 	if (pii->has_overlap)
5707*4882a593Smuzhiyun 		ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
5708*4882a593Smuzhiyun 
5709*4882a593Smuzhiyun 	return 0;
5710*4882a593Smuzhiyun 
5711*4882a593Smuzhiyun e_inval:
5712*4882a593Smuzhiyun 	return -EINVAL;
5713*4882a593Smuzhiyun }
5714*4882a593Smuzhiyun 
5715*4882a593Smuzhiyun /*
5716*4882a593Smuzhiyun  * The caller is responsible for @pii.
5717*4882a593Smuzhiyun  */
5718*4882a593Smuzhiyun static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
5719*4882a593Smuzhiyun 				    struct page *req_page,
5720*4882a593Smuzhiyun 				    struct page *reply_page,
5721*4882a593Smuzhiyun 				    struct parent_image_info *pii)
5722*4882a593Smuzhiyun {
5723*4882a593Smuzhiyun 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5724*4882a593Smuzhiyun 	size_t reply_len = PAGE_SIZE;
5725*4882a593Smuzhiyun 	void *p, *end;
5726*4882a593Smuzhiyun 	int ret;
5727*4882a593Smuzhiyun 
5728*4882a593Smuzhiyun 	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5729*4882a593Smuzhiyun 			     "rbd", "get_parent", CEPH_OSD_FLAG_READ,
5730*4882a593Smuzhiyun 			     req_page, sizeof(u64), &reply_page, &reply_len);
5731*4882a593Smuzhiyun 	if (ret)
5732*4882a593Smuzhiyun 		return ret;
5733*4882a593Smuzhiyun 
5734*4882a593Smuzhiyun 	p = page_address(reply_page);
5735*4882a593Smuzhiyun 	end = p + reply_len;
5736*4882a593Smuzhiyun 	ceph_decode_64_safe(&p, end, pii->pool_id, e_inval);
5737*4882a593Smuzhiyun 	pii->image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
5738*4882a593Smuzhiyun 	if (IS_ERR(pii->image_id)) {
5739*4882a593Smuzhiyun 		ret = PTR_ERR(pii->image_id);
5740*4882a593Smuzhiyun 		pii->image_id = NULL;
5741*4882a593Smuzhiyun 		return ret;
5742*4882a593Smuzhiyun 	}
5743*4882a593Smuzhiyun 	ceph_decode_64_safe(&p, end, pii->snap_id, e_inval);
5744*4882a593Smuzhiyun 	pii->has_overlap = true;
5745*4882a593Smuzhiyun 	ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
5746*4882a593Smuzhiyun 
5747*4882a593Smuzhiyun 	return 0;
5748*4882a593Smuzhiyun 
5749*4882a593Smuzhiyun e_inval:
5750*4882a593Smuzhiyun 	return -EINVAL;
5751*4882a593Smuzhiyun }
5752*4882a593Smuzhiyun 
5753*4882a593Smuzhiyun static int get_parent_info(struct rbd_device *rbd_dev,
5754*4882a593Smuzhiyun 			   struct parent_image_info *pii)
5755*4882a593Smuzhiyun {
5756*4882a593Smuzhiyun 	struct page *req_page, *reply_page;
5757*4882a593Smuzhiyun 	void *p;
5758*4882a593Smuzhiyun 	int ret;
5759*4882a593Smuzhiyun 
5760*4882a593Smuzhiyun 	req_page = alloc_page(GFP_KERNEL);
5761*4882a593Smuzhiyun 	if (!req_page)
5762*4882a593Smuzhiyun 		return -ENOMEM;
5763*4882a593Smuzhiyun 
5764*4882a593Smuzhiyun 	reply_page = alloc_page(GFP_KERNEL);
5765*4882a593Smuzhiyun 	if (!reply_page) {
5766*4882a593Smuzhiyun 		__free_page(req_page);
5767*4882a593Smuzhiyun 		return -ENOMEM;
5768*4882a593Smuzhiyun 	}
5769*4882a593Smuzhiyun 
5770*4882a593Smuzhiyun 	p = page_address(req_page);
5771*4882a593Smuzhiyun 	ceph_encode_64(&p, rbd_dev->spec->snap_id);
5772*4882a593Smuzhiyun 	ret = __get_parent_info(rbd_dev, req_page, reply_page, pii);
5773*4882a593Smuzhiyun 	if (ret > 0)	/* 1 means OSD lacks "parent_get": fall back to legacy */
5774*4882a593Smuzhiyun 		ret = __get_parent_info_legacy(rbd_dev, req_page, reply_page,
5775*4882a593Smuzhiyun 					       pii);
5776*4882a593Smuzhiyun 
5777*4882a593Smuzhiyun 	__free_page(req_page);
5778*4882a593Smuzhiyun 	__free_page(reply_page);
5779*4882a593Smuzhiyun 	return ret;
5780*4882a593Smuzhiyun }
5781*4882a593Smuzhiyun 
5782*4882a593Smuzhiyun static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
5783*4882a593Smuzhiyun {
5784*4882a593Smuzhiyun 	struct rbd_spec *parent_spec;
5785*4882a593Smuzhiyun 	struct parent_image_info pii = { 0 };
5786*4882a593Smuzhiyun 	int ret;
5787*4882a593Smuzhiyun 
5788*4882a593Smuzhiyun 	parent_spec = rbd_spec_alloc();
5789*4882a593Smuzhiyun 	if (!parent_spec)
5790*4882a593Smuzhiyun 		return -ENOMEM;
5791*4882a593Smuzhiyun 
5792*4882a593Smuzhiyun 	ret = get_parent_info(rbd_dev, &pii);
5793*4882a593Smuzhiyun 	if (ret)
5794*4882a593Smuzhiyun 		goto out_err;
5795*4882a593Smuzhiyun 
5796*4882a593Smuzhiyun 	dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
5797*4882a593Smuzhiyun 	     __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id,
5798*4882a593Smuzhiyun 	     pii.has_overlap, pii.overlap);
5799*4882a593Smuzhiyun 
5800*4882a593Smuzhiyun 	if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) {
5801*4882a593Smuzhiyun 		/*
5802*4882a593Smuzhiyun 		 * Either the parent never existed, or we have a
5803*4882a593Smuzhiyun 		 * record of it but the image got flattened so it no
5804*4882a593Smuzhiyun 		 * longer has a parent.  When the parent of a
5805*4882a593Smuzhiyun 		 * layered image disappears we immediately set the
5806*4882a593Smuzhiyun 		 * overlap to 0.  The effect of this is that all new
5807*4882a593Smuzhiyun 		 * requests will be treated as if the image had no
5808*4882a593Smuzhiyun 		 * parent.
5809*4882a593Smuzhiyun 		 *
5810*4882a593Smuzhiyun 		 * If !pii.has_overlap, the parent image spec is not
5811*4882a593Smuzhiyun 		 * applicable.  It's there to avoid duplication in each
5812*4882a593Smuzhiyun 		 * snapshot record.
5813*4882a593Smuzhiyun 		 */
5814*4882a593Smuzhiyun 		if (rbd_dev->parent_overlap) {
5815*4882a593Smuzhiyun 			rbd_dev->parent_overlap = 0;
5816*4882a593Smuzhiyun 			rbd_dev_parent_put(rbd_dev);
5817*4882a593Smuzhiyun 			pr_info("%s: clone image has been flattened\n",
5818*4882a593Smuzhiyun 				rbd_dev->disk->disk_name);
5819*4882a593Smuzhiyun 		}
5820*4882a593Smuzhiyun 
5821*4882a593Smuzhiyun 		goto out;	/* No parent?  No problem. */
5822*4882a593Smuzhiyun 	}
5823*4882a593Smuzhiyun 
5824*4882a593Smuzhiyun 	/* The ceph file layout needs to fit pool id in 32 bits */
5825*4882a593Smuzhiyun 
5826*4882a593Smuzhiyun 	ret = -EIO;
5827*4882a593Smuzhiyun 	if (pii.pool_id > (u64)U32_MAX) {
5828*4882a593Smuzhiyun 		rbd_warn(NULL, "parent pool id too large (%llu > %u)",
5829*4882a593Smuzhiyun 			(unsigned long long)pii.pool_id, U32_MAX);
5830*4882a593Smuzhiyun 		goto out_err;
5831*4882a593Smuzhiyun 	}
5832*4882a593Smuzhiyun 
5833*4882a593Smuzhiyun 	/*
5834*4882a593Smuzhiyun 	 * The parent won't change (except when the clone is
5835*4882a593Smuzhiyun 	 * flattened, which is handled above).  So we only need to
5836*4882a593Smuzhiyun 	 * record the parent spec if we have not already done so.
5837*4882a593Smuzhiyun 	 */
5838*4882a593Smuzhiyun 	if (!rbd_dev->parent_spec) {
5839*4882a593Smuzhiyun 		parent_spec->pool_id = pii.pool_id;
5840*4882a593Smuzhiyun 		if (pii.pool_ns && *pii.pool_ns) {
5841*4882a593Smuzhiyun 			parent_spec->pool_ns = pii.pool_ns;
5842*4882a593Smuzhiyun 			pii.pool_ns = NULL;
5843*4882a593Smuzhiyun 		}
5844*4882a593Smuzhiyun 		parent_spec->image_id = pii.image_id;
5845*4882a593Smuzhiyun 		pii.image_id = NULL;
5846*4882a593Smuzhiyun 		parent_spec->snap_id = pii.snap_id;
5847*4882a593Smuzhiyun 
5848*4882a593Smuzhiyun 		rbd_dev->parent_spec = parent_spec;
5849*4882a593Smuzhiyun 		parent_spec = NULL;	/* rbd_dev now owns this */
5850*4882a593Smuzhiyun 	}
5851*4882a593Smuzhiyun 
5852*4882a593Smuzhiyun 	/*
5853*4882a593Smuzhiyun 	 * We always update the parent overlap.  If it's zero we issue
5854*4882a593Smuzhiyun 	 * a warning, as we will proceed as if there was no parent.
5855*4882a593Smuzhiyun 	 */
5856*4882a593Smuzhiyun 	if (!pii.overlap) {
5857*4882a593Smuzhiyun 		if (parent_spec) {
5858*4882a593Smuzhiyun 			/* refresh, careful to warn just once */
5859*4882a593Smuzhiyun 			if (rbd_dev->parent_overlap)
5860*4882a593Smuzhiyun 				rbd_warn(rbd_dev,
5861*4882a593Smuzhiyun 				    "clone now standalone (overlap became 0)");
5862*4882a593Smuzhiyun 		} else {
5863*4882a593Smuzhiyun 			/* initial probe */
5864*4882a593Smuzhiyun 			rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
5865*4882a593Smuzhiyun 		}
5866*4882a593Smuzhiyun 	}
5867*4882a593Smuzhiyun 	rbd_dev->parent_overlap = pii.overlap;
5868*4882a593Smuzhiyun 
5869*4882a593Smuzhiyun out:
5870*4882a593Smuzhiyun 	ret = 0;
5871*4882a593Smuzhiyun out_err:
5872*4882a593Smuzhiyun 	kfree(pii.pool_ns);
5873*4882a593Smuzhiyun 	kfree(pii.image_id);
5874*4882a593Smuzhiyun 	rbd_spec_put(parent_spec);
5875*4882a593Smuzhiyun 	return ret;
5876*4882a593Smuzhiyun }
5877*4882a593Smuzhiyun 
5878*4882a593Smuzhiyun static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
5879*4882a593Smuzhiyun {
5880*4882a593Smuzhiyun 	struct {
5881*4882a593Smuzhiyun 		__le64 stripe_unit;
5882*4882a593Smuzhiyun 		__le64 stripe_count;
5883*4882a593Smuzhiyun 	} __attribute__ ((packed)) striping_info_buf = { 0 };
5884*4882a593Smuzhiyun 	size_t size = sizeof (striping_info_buf);
5885*4882a593Smuzhiyun 	void *p;
5886*4882a593Smuzhiyun 	int ret;
5887*4882a593Smuzhiyun 
5888*4882a593Smuzhiyun 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5889*4882a593Smuzhiyun 				&rbd_dev->header_oloc, "get_stripe_unit_count",
5890*4882a593Smuzhiyun 				NULL, 0, &striping_info_buf, size);
5891*4882a593Smuzhiyun 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5892*4882a593Smuzhiyun 	if (ret < 0)
5893*4882a593Smuzhiyun 		return ret;
5894*4882a593Smuzhiyun 	if (ret < size)
5895*4882a593Smuzhiyun 		return -ERANGE;
5896*4882a593Smuzhiyun 
5897*4882a593Smuzhiyun 	p = &striping_info_buf;
5898*4882a593Smuzhiyun 	rbd_dev->header.stripe_unit = ceph_decode_64(&p);
5899*4882a593Smuzhiyun 	rbd_dev->header.stripe_count = ceph_decode_64(&p);
5900*4882a593Smuzhiyun 	return 0;
5901*4882a593Smuzhiyun }
5902*4882a593Smuzhiyun 
5903*4882a593Smuzhiyun static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
5904*4882a593Smuzhiyun {
5905*4882a593Smuzhiyun 	__le64 data_pool_id;
5906*4882a593Smuzhiyun 	int ret;
5907*4882a593Smuzhiyun 
5908*4882a593Smuzhiyun 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5909*4882a593Smuzhiyun 				  &rbd_dev->header_oloc, "get_data_pool",
5910*4882a593Smuzhiyun 				  NULL, 0, &data_pool_id, sizeof(data_pool_id));
5911*4882a593Smuzhiyun 	if (ret < 0)
5912*4882a593Smuzhiyun 		return ret;
5913*4882a593Smuzhiyun 	if (ret < sizeof(data_pool_id))
5914*4882a593Smuzhiyun 		return -EBADMSG;
5915*4882a593Smuzhiyun 
5916*4882a593Smuzhiyun 	rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
5917*4882a593Smuzhiyun 	WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
5918*4882a593Smuzhiyun 	return 0;
5919*4882a593Smuzhiyun }
5920*4882a593Smuzhiyun 
5921*4882a593Smuzhiyun static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
5922*4882a593Smuzhiyun {
5923*4882a593Smuzhiyun 	CEPH_DEFINE_OID_ONSTACK(oid);
5924*4882a593Smuzhiyun 	size_t image_id_size;
5925*4882a593Smuzhiyun 	char *image_id;
5926*4882a593Smuzhiyun 	void *p;
5927*4882a593Smuzhiyun 	void *end;
5928*4882a593Smuzhiyun 	size_t size;
5929*4882a593Smuzhiyun 	void *reply_buf = NULL;
5930*4882a593Smuzhiyun 	size_t len = 0;
5931*4882a593Smuzhiyun 	char *image_name = NULL;
5932*4882a593Smuzhiyun 	int ret;
5933*4882a593Smuzhiyun 
5934*4882a593Smuzhiyun 	rbd_assert(!rbd_dev->spec->image_name);
5935*4882a593Smuzhiyun 
5936*4882a593Smuzhiyun 	len = strlen(rbd_dev->spec->image_id);
5937*4882a593Smuzhiyun 	image_id_size = sizeof (__le32) + len;
5938*4882a593Smuzhiyun 	image_id = kmalloc(image_id_size, GFP_KERNEL);
5939*4882a593Smuzhiyun 	if (!image_id)
5940*4882a593Smuzhiyun 		return NULL;
5941*4882a593Smuzhiyun 
5942*4882a593Smuzhiyun 	p = image_id;
5943*4882a593Smuzhiyun 	end = image_id + image_id_size;
5944*4882a593Smuzhiyun 	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
5945*4882a593Smuzhiyun 
5946*4882a593Smuzhiyun 	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
5947*4882a593Smuzhiyun 	reply_buf = kmalloc(size, GFP_KERNEL);
5948*4882a593Smuzhiyun 	if (!reply_buf)
5949*4882a593Smuzhiyun 		goto out;
5950*4882a593Smuzhiyun 
5951*4882a593Smuzhiyun 	ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
5952*4882a593Smuzhiyun 	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5953*4882a593Smuzhiyun 				  "dir_get_name", image_id, image_id_size,
5954*4882a593Smuzhiyun 				  reply_buf, size);
5955*4882a593Smuzhiyun 	if (ret < 0)
5956*4882a593Smuzhiyun 		goto out;
5957*4882a593Smuzhiyun 	p = reply_buf;
5958*4882a593Smuzhiyun 	end = reply_buf + ret;
5959*4882a593Smuzhiyun 
5960*4882a593Smuzhiyun 	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
5961*4882a593Smuzhiyun 	if (IS_ERR(image_name))
5962*4882a593Smuzhiyun 		image_name = NULL;
5963*4882a593Smuzhiyun 	else
5964*4882a593Smuzhiyun 		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
5965*4882a593Smuzhiyun out:
5966*4882a593Smuzhiyun 	kfree(reply_buf);
5967*4882a593Smuzhiyun 	kfree(image_id);
5968*4882a593Smuzhiyun 
5969*4882a593Smuzhiyun 	return image_name;
5970*4882a593Smuzhiyun }
5971*4882a593Smuzhiyun 
5972*4882a593Smuzhiyun static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5973*4882a593Smuzhiyun {
5974*4882a593Smuzhiyun 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
5975*4882a593Smuzhiyun 	const char *snap_name;
5976*4882a593Smuzhiyun 	u32 which = 0;
5977*4882a593Smuzhiyun 
5978*4882a593Smuzhiyun 	/* Skip over names until we find the one we are looking for */
5979*4882a593Smuzhiyun 
5980*4882a593Smuzhiyun 	snap_name = rbd_dev->header.snap_names;
5981*4882a593Smuzhiyun 	while (which < snapc->num_snaps) {
5982*4882a593Smuzhiyun 		if (!strcmp(name, snap_name))
5983*4882a593Smuzhiyun 			return snapc->snaps[which];
5984*4882a593Smuzhiyun 		snap_name += strlen(snap_name) + 1;
5985*4882a593Smuzhiyun 		which++;
5986*4882a593Smuzhiyun 	}
5987*4882a593Smuzhiyun 	return CEPH_NOSNAP;
5988*4882a593Smuzhiyun }
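
/*
 * Illustrative sketch (editor addition, not part of the driver): v1
 * snapshot names are packed into one buffer as consecutive
 * NUL-terminated strings, parallel to snapc->snaps[], so
 * "snap1\0snap2\0" pairs with { id1, id2 }.  A hypothetical walk that
 * returns the name at a given index:
 */
static const char * __maybe_unused rbd_example_name_at(const char *names,
						       u32 which)
{
	while (which--)
		names += strlen(names) + 1;	/* step past "name\0" */
	return names;
}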
5989*4882a593Smuzhiyun 
5990*4882a593Smuzhiyun static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5991*4882a593Smuzhiyun {
5992*4882a593Smuzhiyun 	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
5993*4882a593Smuzhiyun 	u32 which;
5994*4882a593Smuzhiyun 	bool found = false;
5995*4882a593Smuzhiyun 	u64 snap_id;
5996*4882a593Smuzhiyun 
5997*4882a593Smuzhiyun 	for (which = 0; !found && which < snapc->num_snaps; which++) {
5998*4882a593Smuzhiyun 		const char *snap_name;
5999*4882a593Smuzhiyun 
6000*4882a593Smuzhiyun 		snap_id = snapc->snaps[which];
6001*4882a593Smuzhiyun 		snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
6002*4882a593Smuzhiyun 		if (IS_ERR(snap_name)) {
6003*4882a593Smuzhiyun 			/* ignore no-longer existing snapshots */
6004*4882a593Smuzhiyun 			if (PTR_ERR(snap_name) == -ENOENT)
6005*4882a593Smuzhiyun 				continue;
6006*4882a593Smuzhiyun 			else
6007*4882a593Smuzhiyun 				break;
6008*4882a593Smuzhiyun 		}
6009*4882a593Smuzhiyun 		found = !strcmp(name, snap_name);
6010*4882a593Smuzhiyun 		kfree(snap_name);
6011*4882a593Smuzhiyun 	}
6012*4882a593Smuzhiyun 	return found ? snap_id : CEPH_NOSNAP;
6013*4882a593Smuzhiyun }
6014*4882a593Smuzhiyun 
6015*4882a593Smuzhiyun /*
6016*4882a593Smuzhiyun  * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
6017*4882a593Smuzhiyun  * no snapshot by that name is found, or if an error occurs.
6018*4882a593Smuzhiyun  */
6019*4882a593Smuzhiyun static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
6020*4882a593Smuzhiyun {
6021*4882a593Smuzhiyun 	if (rbd_dev->image_format == 1)
6022*4882a593Smuzhiyun 		return rbd_v1_snap_id_by_name(rbd_dev, name);
6023*4882a593Smuzhiyun 
6024*4882a593Smuzhiyun 	return rbd_v2_snap_id_by_name(rbd_dev, name);
6025*4882a593Smuzhiyun }
6026*4882a593Smuzhiyun 
6027*4882a593Smuzhiyun /*
6028*4882a593Smuzhiyun  * An image being mapped will have everything but the snap id.
6029*4882a593Smuzhiyun  */
6030*4882a593Smuzhiyun static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
6031*4882a593Smuzhiyun {
6032*4882a593Smuzhiyun 	struct rbd_spec *spec = rbd_dev->spec;
6033*4882a593Smuzhiyun 
6034*4882a593Smuzhiyun 	rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
6035*4882a593Smuzhiyun 	rbd_assert(spec->image_id && spec->image_name);
6036*4882a593Smuzhiyun 	rbd_assert(spec->snap_name);
6037*4882a593Smuzhiyun 
6038*4882a593Smuzhiyun 	if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
6039*4882a593Smuzhiyun 		u64 snap_id;
6040*4882a593Smuzhiyun 
6041*4882a593Smuzhiyun 		snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
6042*4882a593Smuzhiyun 		if (snap_id == CEPH_NOSNAP)
6043*4882a593Smuzhiyun 			return -ENOENT;
6044*4882a593Smuzhiyun 
6045*4882a593Smuzhiyun 		spec->snap_id = snap_id;
6046*4882a593Smuzhiyun 	} else {
6047*4882a593Smuzhiyun 		spec->snap_id = CEPH_NOSNAP;
6048*4882a593Smuzhiyun 	}
6049*4882a593Smuzhiyun 
6050*4882a593Smuzhiyun 	return 0;
6051*4882a593Smuzhiyun }
6052*4882a593Smuzhiyun 
6053*4882a593Smuzhiyun /*
6054*4882a593Smuzhiyun  * A parent image will have all ids but none of the names.
6055*4882a593Smuzhiyun  *
6056*4882a593Smuzhiyun  * All names in an rbd spec are dynamically allocated.  It's OK if we
6057*4882a593Smuzhiyun  * can't figure out the name for an image id.
6058*4882a593Smuzhiyun  */
6059*4882a593Smuzhiyun static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
6060*4882a593Smuzhiyun {
6061*4882a593Smuzhiyun 	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
6062*4882a593Smuzhiyun 	struct rbd_spec *spec = rbd_dev->spec;
6063*4882a593Smuzhiyun 	const char *pool_name;
6064*4882a593Smuzhiyun 	const char *image_name;
6065*4882a593Smuzhiyun 	const char *snap_name;
6066*4882a593Smuzhiyun 	int ret;
6067*4882a593Smuzhiyun 
6068*4882a593Smuzhiyun 	rbd_assert(spec->pool_id != CEPH_NOPOOL);
6069*4882a593Smuzhiyun 	rbd_assert(spec->image_id);
6070*4882a593Smuzhiyun 	rbd_assert(spec->snap_id != CEPH_NOSNAP);
6071*4882a593Smuzhiyun 
6072*4882a593Smuzhiyun 	/* Get the pool name; we have to make our own copy of this */
6073*4882a593Smuzhiyun 
6074*4882a593Smuzhiyun 	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
6075*4882a593Smuzhiyun 	if (!pool_name) {
6076*4882a593Smuzhiyun 		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
6077*4882a593Smuzhiyun 		return -EIO;
6078*4882a593Smuzhiyun 	}
6079*4882a593Smuzhiyun 	pool_name = kstrdup(pool_name, GFP_KERNEL);
6080*4882a593Smuzhiyun 	if (!pool_name)
6081*4882a593Smuzhiyun 		return -ENOMEM;
6082*4882a593Smuzhiyun 
6083*4882a593Smuzhiyun 	/* Fetch the image name; tolerate failure here */
6084*4882a593Smuzhiyun 
6085*4882a593Smuzhiyun 	image_name = rbd_dev_image_name(rbd_dev);
6086*4882a593Smuzhiyun 	if (!image_name)
6087*4882a593Smuzhiyun 		rbd_warn(rbd_dev, "unable to get image name");
6088*4882a593Smuzhiyun 
6089*4882a593Smuzhiyun 	/* Fetch the snapshot name */
6090*4882a593Smuzhiyun 
6091*4882a593Smuzhiyun 	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
6092*4882a593Smuzhiyun 	if (IS_ERR(snap_name)) {
6093*4882a593Smuzhiyun 		ret = PTR_ERR(snap_name);
6094*4882a593Smuzhiyun 		goto out_err;
6095*4882a593Smuzhiyun 	}
6096*4882a593Smuzhiyun 
6097*4882a593Smuzhiyun 	spec->pool_name = pool_name;
6098*4882a593Smuzhiyun 	spec->image_name = image_name;
6099*4882a593Smuzhiyun 	spec->snap_name = snap_name;
6100*4882a593Smuzhiyun 
6101*4882a593Smuzhiyun 	return 0;
6102*4882a593Smuzhiyun 
6103*4882a593Smuzhiyun out_err:
6104*4882a593Smuzhiyun 	kfree(image_name);
6105*4882a593Smuzhiyun 	kfree(pool_name);
6106*4882a593Smuzhiyun 	return ret;
6107*4882a593Smuzhiyun }
6108*4882a593Smuzhiyun 
6109*4882a593Smuzhiyun static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
6110*4882a593Smuzhiyun {
6111*4882a593Smuzhiyun 	size_t size;
6112*4882a593Smuzhiyun 	int ret;
6113*4882a593Smuzhiyun 	void *reply_buf;
6114*4882a593Smuzhiyun 	void *p;
6115*4882a593Smuzhiyun 	void *end;
6116*4882a593Smuzhiyun 	u64 seq;
6117*4882a593Smuzhiyun 	u32 snap_count;
6118*4882a593Smuzhiyun 	struct ceph_snap_context *snapc;
6119*4882a593Smuzhiyun 	u32 i;
6120*4882a593Smuzhiyun 
6121*4882a593Smuzhiyun 	/*
6122*4882a593Smuzhiyun 	 * We'll need room for the seq value (maximum snapshot id),
6123*4882a593Smuzhiyun 	 * snapshot count, and array of that many snapshot ids.
6124*4882a593Smuzhiyun 	 * For now we have a fixed upper limit on the number we're
6125*4882a593Smuzhiyun 	 * prepared to receive.
6126*4882a593Smuzhiyun 	 */
6127*4882a593Smuzhiyun 	size = sizeof (__le64) + sizeof (__le32) +
6128*4882a593Smuzhiyun 			RBD_MAX_SNAP_COUNT * sizeof (__le64);
6129*4882a593Smuzhiyun 	reply_buf = kzalloc(size, GFP_KERNEL);
6130*4882a593Smuzhiyun 	if (!reply_buf)
6131*4882a593Smuzhiyun 		return -ENOMEM;
6132*4882a593Smuzhiyun 
6133*4882a593Smuzhiyun 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
6134*4882a593Smuzhiyun 				  &rbd_dev->header_oloc, "get_snapcontext",
6135*4882a593Smuzhiyun 				  NULL, 0, reply_buf, size);
6136*4882a593Smuzhiyun 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
6137*4882a593Smuzhiyun 	if (ret < 0)
6138*4882a593Smuzhiyun 		goto out;
6139*4882a593Smuzhiyun 
6140*4882a593Smuzhiyun 	p = reply_buf;
6141*4882a593Smuzhiyun 	end = reply_buf + ret;
6142*4882a593Smuzhiyun 	ret = -ERANGE;
6143*4882a593Smuzhiyun 	ceph_decode_64_safe(&p, end, seq, out);
6144*4882a593Smuzhiyun 	ceph_decode_32_safe(&p, end, snap_count, out);
6145*4882a593Smuzhiyun 
6146*4882a593Smuzhiyun 	/*
6147*4882a593Smuzhiyun 	 * Make sure the reported number of snapshot ids wouldn't go
6148*4882a593Smuzhiyun 	 * beyond the end of our buffer.  But before checking that,
6149*4882a593Smuzhiyun 	 * make sure the computed size of the snapshot context we
6150*4882a593Smuzhiyun 	 * allocate is representable in a size_t.
6151*4882a593Smuzhiyun 	 */
6152*4882a593Smuzhiyun 	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
6153*4882a593Smuzhiyun 				 / sizeof (u64)) {
6154*4882a593Smuzhiyun 		ret = -EINVAL;
6155*4882a593Smuzhiyun 		goto out;
6156*4882a593Smuzhiyun 	}
6157*4882a593Smuzhiyun 	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
6158*4882a593Smuzhiyun 		goto out;
6159*4882a593Smuzhiyun 	ret = 0;
6160*4882a593Smuzhiyun 
6161*4882a593Smuzhiyun 	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
6162*4882a593Smuzhiyun 	if (!snapc) {
6163*4882a593Smuzhiyun 		ret = -ENOMEM;
6164*4882a593Smuzhiyun 		goto out;
6165*4882a593Smuzhiyun 	}
6166*4882a593Smuzhiyun 	snapc->seq = seq;
6167*4882a593Smuzhiyun 	for (i = 0; i < snap_count; i++)
6168*4882a593Smuzhiyun 		snapc->snaps[i] = ceph_decode_64(&p);
6169*4882a593Smuzhiyun 
6170*4882a593Smuzhiyun 	ceph_put_snap_context(rbd_dev->header.snapc);
6171*4882a593Smuzhiyun 	rbd_dev->header.snapc = snapc;
6172*4882a593Smuzhiyun 
6173*4882a593Smuzhiyun 	dout("  snap context seq = %llu, snap_count = %u\n",
6174*4882a593Smuzhiyun 		(unsigned long long)seq, (unsigned int)snap_count);
6175*4882a593Smuzhiyun out:
6176*4882a593Smuzhiyun 	kfree(reply_buf);
6177*4882a593Smuzhiyun 
6178*4882a593Smuzhiyun 	return ret;
6179*4882a593Smuzhiyun }
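
/*
 * Illustrative sketch (editor addition, not part of the driver): the
 * "get_snapcontext" reply parsed above is laid out as
 *
 *	__le64 seq; __le32 snap_count; __le64 snaps[snap_count];
 *
 * A hypothetical pre-check that a caller-supplied count both fits in
 * the reply and yields a representable allocation:
 */
static bool __maybe_unused rbd_example_snapc_fits(size_t reply_len,
						  u32 snap_count)
{
	size_t max = (SIZE_MAX - sizeof(struct ceph_snap_context)) /
							sizeof(u64);

	return snap_count <= max &&
	       reply_len >= sizeof(__le64) + sizeof(__le32) +
			    (u64)snap_count * sizeof(__le64);
}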
6180*4882a593Smuzhiyun 
6181*4882a593Smuzhiyun static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
6182*4882a593Smuzhiyun 					u64 snap_id)
6183*4882a593Smuzhiyun {
6184*4882a593Smuzhiyun 	size_t size;
6185*4882a593Smuzhiyun 	void *reply_buf;
6186*4882a593Smuzhiyun 	__le64 snapid;
6187*4882a593Smuzhiyun 	int ret;
6188*4882a593Smuzhiyun 	void *p;
6189*4882a593Smuzhiyun 	void *end;
6190*4882a593Smuzhiyun 	char *snap_name;
6191*4882a593Smuzhiyun 
6192*4882a593Smuzhiyun 	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
6193*4882a593Smuzhiyun 	reply_buf = kmalloc(size, GFP_KERNEL);
6194*4882a593Smuzhiyun 	if (!reply_buf)
6195*4882a593Smuzhiyun 		return ERR_PTR(-ENOMEM);
6196*4882a593Smuzhiyun 
6197*4882a593Smuzhiyun 	snapid = cpu_to_le64(snap_id);
6198*4882a593Smuzhiyun 	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
6199*4882a593Smuzhiyun 				  &rbd_dev->header_oloc, "get_snapshot_name",
6200*4882a593Smuzhiyun 				  &snapid, sizeof(snapid), reply_buf, size);
6201*4882a593Smuzhiyun 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
6202*4882a593Smuzhiyun 	if (ret < 0) {
6203*4882a593Smuzhiyun 		snap_name = ERR_PTR(ret);
6204*4882a593Smuzhiyun 		goto out;
6205*4882a593Smuzhiyun 	}
6206*4882a593Smuzhiyun 
6207*4882a593Smuzhiyun 	p = reply_buf;
6208*4882a593Smuzhiyun 	end = reply_buf + ret;
6209*4882a593Smuzhiyun 	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
6210*4882a593Smuzhiyun 	if (IS_ERR(snap_name))
6211*4882a593Smuzhiyun 		goto out;
6212*4882a593Smuzhiyun 
6213*4882a593Smuzhiyun 	dout("  snap_id 0x%016llx snap_name = %s\n",
6214*4882a593Smuzhiyun 		(unsigned long long)snap_id, snap_name);
6215*4882a593Smuzhiyun out:
6216*4882a593Smuzhiyun 	kfree(reply_buf);
6217*4882a593Smuzhiyun 
6218*4882a593Smuzhiyun 	return snap_name;
6219*4882a593Smuzhiyun }
6220*4882a593Smuzhiyun 
6221*4882a593Smuzhiyun static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
6222*4882a593Smuzhiyun {
6223*4882a593Smuzhiyun 	bool first_time = rbd_dev->header.object_prefix == NULL;
6224*4882a593Smuzhiyun 	int ret;
6225*4882a593Smuzhiyun 
6226*4882a593Smuzhiyun 	ret = rbd_dev_v2_image_size(rbd_dev);
6227*4882a593Smuzhiyun 	if (ret)
6228*4882a593Smuzhiyun 		return ret;
6229*4882a593Smuzhiyun 
6230*4882a593Smuzhiyun 	if (first_time) {
6231*4882a593Smuzhiyun 		ret = rbd_dev_v2_header_onetime(rbd_dev);
6232*4882a593Smuzhiyun 		if (ret)
6233*4882a593Smuzhiyun 			return ret;
6234*4882a593Smuzhiyun 	}
6235*4882a593Smuzhiyun 
6236*4882a593Smuzhiyun 	ret = rbd_dev_v2_snap_context(rbd_dev);
6237*4882a593Smuzhiyun 	if (ret && first_time) {
6238*4882a593Smuzhiyun 		kfree(rbd_dev->header.object_prefix);
6239*4882a593Smuzhiyun 		rbd_dev->header.object_prefix = NULL;
6240*4882a593Smuzhiyun 	}
6241*4882a593Smuzhiyun 
6242*4882a593Smuzhiyun 	return ret;
6243*4882a593Smuzhiyun }
6244*4882a593Smuzhiyun 
6245*4882a593Smuzhiyun static int rbd_dev_header_info(struct rbd_device *rbd_dev)
6246*4882a593Smuzhiyun {
6247*4882a593Smuzhiyun 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
6248*4882a593Smuzhiyun 
6249*4882a593Smuzhiyun 	if (rbd_dev->image_format == 1)
6250*4882a593Smuzhiyun 		return rbd_dev_v1_header_info(rbd_dev);
6251*4882a593Smuzhiyun 
6252*4882a593Smuzhiyun 	return rbd_dev_v2_header_info(rbd_dev);
6253*4882a593Smuzhiyun }
6254*4882a593Smuzhiyun 
6255*4882a593Smuzhiyun /*
6256*4882a593Smuzhiyun  * Skips over white space at *buf, and updates *buf to point to the
6257*4882a593Smuzhiyun  * first found non-space character (if any). Returns the length of
6258*4882a593Smuzhiyun  * the token (string of non-white space characters) found.  Note
6259*4882a593Smuzhiyun  * that *buf must be terminated with '\0'.
6260*4882a593Smuzhiyun  */
6261*4882a593Smuzhiyun static inline size_t next_token(const char **buf)
6262*4882a593Smuzhiyun {
6263*4882a593Smuzhiyun 	/*
6264*4882a593Smuzhiyun 	 * These are the characters that produce nonzero for
6265*4882a593Smuzhiyun 	 * isspace() in the "C" and "POSIX" locales.
6266*4882a593Smuzhiyun 	 */
6267*4882a593Smuzhiyun 	const char *spaces = " \f\n\r\t\v";
6268*4882a593Smuzhiyun 
6269*4882a593Smuzhiyun 	*buf += strspn(*buf, spaces);	/* Find start of token */
6270*4882a593Smuzhiyun 
6271*4882a593Smuzhiyun 	return strcspn(*buf, spaces);   /* Return token length */
6272*4882a593Smuzhiyun }
6273*4882a593Smuzhiyun 
6274*4882a593Smuzhiyun /*
6275*4882a593Smuzhiyun  * Finds the next token in *buf, dynamically allocates a buffer big
6276*4882a593Smuzhiyun  * enough to hold a copy of it, and copies the token into the new
6277*4882a593Smuzhiyun  * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
6278*4882a593Smuzhiyun  * that a duplicate buffer is created even for a zero-length token.
6279*4882a593Smuzhiyun  *
6280*4882a593Smuzhiyun  * Returns a pointer to the newly-allocated duplicate, or a null
6281*4882a593Smuzhiyun  * pointer if memory for the duplicate was not available.  If
6282*4882a593Smuzhiyun  * the lenp argument is a non-null pointer, the length of the token
6283*4882a593Smuzhiyun  * (not including the '\0') is returned in *lenp.
6284*4882a593Smuzhiyun  *
6285*4882a593Smuzhiyun  * If successful, the *buf pointer will be updated to point beyond
6286*4882a593Smuzhiyun  * the end of the found token.
6287*4882a593Smuzhiyun  *
6288*4882a593Smuzhiyun  * Note: uses GFP_KERNEL for allocation.
6289*4882a593Smuzhiyun  */
6290*4882a593Smuzhiyun static inline char *dup_token(const char **buf, size_t *lenp)
6291*4882a593Smuzhiyun {
6292*4882a593Smuzhiyun 	char *dup;
6293*4882a593Smuzhiyun 	size_t len;
6294*4882a593Smuzhiyun 
6295*4882a593Smuzhiyun 	len = next_token(buf);
6296*4882a593Smuzhiyun 	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
6297*4882a593Smuzhiyun 	if (!dup)
6298*4882a593Smuzhiyun 		return NULL;
6299*4882a593Smuzhiyun 	*(dup + len) = '\0';
6300*4882a593Smuzhiyun 	*buf += len;
6301*4882a593Smuzhiyun 
6302*4882a593Smuzhiyun 	if (lenp)
6303*4882a593Smuzhiyun 		*lenp = len;
6304*4882a593Smuzhiyun 
6305*4882a593Smuzhiyun 	return dup;
6306*4882a593Smuzhiyun }
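
/*
 * Illustrative sketch (editor addition, not part of the driver): how
 * the two helpers above cooperate.  next_token() skips leading white
 * space and reports the token length without consuming it, while
 * dup_token() also advances past the token (hypothetical values):
 */
static void __maybe_unused rbd_example_tokenize(void)
{
	const char *buf = "  1.2.3.4:6789 rw";
	size_t len = next_token(&buf);	/* len == 12, buf at "1.2.3.4:..." */
	char *opts;

	buf += len;			/* consume the first token by hand */
	opts = dup_token(&buf, NULL);	/* opts == "rw", buf now past it */
	kfree(opts);			/* dup_token() copies are kfree()d */
}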
6307*4882a593Smuzhiyun 
6308*4882a593Smuzhiyun static int rbd_parse_param(struct fs_parameter *param,
6309*4882a593Smuzhiyun 			    struct rbd_parse_opts_ctx *pctx)
6310*4882a593Smuzhiyun {
6311*4882a593Smuzhiyun 	struct rbd_options *opt = pctx->opts;
6312*4882a593Smuzhiyun 	struct fs_parse_result result;
6313*4882a593Smuzhiyun 	struct p_log log = {.prefix = "rbd"};
6314*4882a593Smuzhiyun 	int token, ret;
6315*4882a593Smuzhiyun 
6316*4882a593Smuzhiyun 	ret = ceph_parse_param(param, pctx->copts, NULL);
6317*4882a593Smuzhiyun 	if (ret != -ENOPARAM)
6318*4882a593Smuzhiyun 		return ret;
6319*4882a593Smuzhiyun 
6320*4882a593Smuzhiyun 	token = __fs_parse(&log, rbd_parameters, param, &result);
6321*4882a593Smuzhiyun 	dout("%s fs_parse '%s' token %d\n", __func__, param->key, token);
6322*4882a593Smuzhiyun 	if (token < 0) {
6323*4882a593Smuzhiyun 		if (token == -ENOPARAM)
6324*4882a593Smuzhiyun 			return inval_plog(&log, "Unknown parameter '%s'",
6325*4882a593Smuzhiyun 					  param->key);
6326*4882a593Smuzhiyun 		return token;
6327*4882a593Smuzhiyun 	}
6328*4882a593Smuzhiyun 
6329*4882a593Smuzhiyun 	switch (token) {
6330*4882a593Smuzhiyun 	case Opt_queue_depth:
6331*4882a593Smuzhiyun 		if (result.uint_32 < 1)
6332*4882a593Smuzhiyun 			goto out_of_range;
6333*4882a593Smuzhiyun 		opt->queue_depth = result.uint_32;
6334*4882a593Smuzhiyun 		break;
6335*4882a593Smuzhiyun 	case Opt_alloc_size:
6336*4882a593Smuzhiyun 		if (result.uint_32 < SECTOR_SIZE)
6337*4882a593Smuzhiyun 			goto out_of_range;
6338*4882a593Smuzhiyun 		if (!is_power_of_2(result.uint_32))
6339*4882a593Smuzhiyun 			return inval_plog(&log, "alloc_size must be a power of 2");
6340*4882a593Smuzhiyun 		opt->alloc_size = result.uint_32;
6341*4882a593Smuzhiyun 		break;
6342*4882a593Smuzhiyun 	case Opt_lock_timeout:
6343*4882a593Smuzhiyun 		/* 0 is "wait forever" (i.e. infinite timeout) */
6344*4882a593Smuzhiyun 		if (result.uint_32 > INT_MAX / 1000)
6345*4882a593Smuzhiyun 			goto out_of_range;
6346*4882a593Smuzhiyun 		opt->lock_timeout = msecs_to_jiffies(result.uint_32 * 1000);
6347*4882a593Smuzhiyun 		break;
6348*4882a593Smuzhiyun 	case Opt_pool_ns:
6349*4882a593Smuzhiyun 		kfree(pctx->spec->pool_ns);
6350*4882a593Smuzhiyun 		pctx->spec->pool_ns = param->string;
6351*4882a593Smuzhiyun 		param->string = NULL;
6352*4882a593Smuzhiyun 		break;
6353*4882a593Smuzhiyun 	case Opt_compression_hint:
6354*4882a593Smuzhiyun 		switch (result.uint_32) {
6355*4882a593Smuzhiyun 		case Opt_compression_hint_none:
6356*4882a593Smuzhiyun 			opt->alloc_hint_flags &=
6357*4882a593Smuzhiyun 			    ~(CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE |
6358*4882a593Smuzhiyun 			      CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE);
6359*4882a593Smuzhiyun 			break;
6360*4882a593Smuzhiyun 		case Opt_compression_hint_compressible:
6361*4882a593Smuzhiyun 			opt->alloc_hint_flags |=
6362*4882a593Smuzhiyun 			    CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE;
6363*4882a593Smuzhiyun 			opt->alloc_hint_flags &=
6364*4882a593Smuzhiyun 			    ~CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE;
6365*4882a593Smuzhiyun 			break;
6366*4882a593Smuzhiyun 		case Opt_compression_hint_incompressible:
6367*4882a593Smuzhiyun 			opt->alloc_hint_flags |=
6368*4882a593Smuzhiyun 			    CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE;
6369*4882a593Smuzhiyun 			opt->alloc_hint_flags &=
6370*4882a593Smuzhiyun 			    ~CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE;
6371*4882a593Smuzhiyun 			break;
6372*4882a593Smuzhiyun 		default:
6373*4882a593Smuzhiyun 			BUG();
6374*4882a593Smuzhiyun 		}
6375*4882a593Smuzhiyun 		break;
6376*4882a593Smuzhiyun 	case Opt_read_only:
6377*4882a593Smuzhiyun 		opt->read_only = true;
6378*4882a593Smuzhiyun 		break;
6379*4882a593Smuzhiyun 	case Opt_read_write:
6380*4882a593Smuzhiyun 		opt->read_only = false;
6381*4882a593Smuzhiyun 		break;
6382*4882a593Smuzhiyun 	case Opt_lock_on_read:
6383*4882a593Smuzhiyun 		opt->lock_on_read = true;
6384*4882a593Smuzhiyun 		break;
6385*4882a593Smuzhiyun 	case Opt_exclusive:
6386*4882a593Smuzhiyun 		opt->exclusive = true;
6387*4882a593Smuzhiyun 		break;
6388*4882a593Smuzhiyun 	case Opt_notrim:
6389*4882a593Smuzhiyun 		opt->trim = false;
6390*4882a593Smuzhiyun 		break;
6391*4882a593Smuzhiyun 	default:
6392*4882a593Smuzhiyun 		BUG();
6393*4882a593Smuzhiyun 	}
6394*4882a593Smuzhiyun 
6395*4882a593Smuzhiyun 	return 0;
6396*4882a593Smuzhiyun 
6397*4882a593Smuzhiyun out_of_range:
6398*4882a593Smuzhiyun 	return inval_plog(&log, "%s out of range", param->key);
6399*4882a593Smuzhiyun }
6400*4882a593Smuzhiyun 
6401*4882a593Smuzhiyun /*
6402*4882a593Smuzhiyun  * This duplicates most of generic_parse_monolithic(), untying it from
6403*4882a593Smuzhiyun  * fs_context and skipping standard superblock and security options.
6404*4882a593Smuzhiyun  */
6405*4882a593Smuzhiyun static int rbd_parse_options(char *options, struct rbd_parse_opts_ctx *pctx)
6406*4882a593Smuzhiyun {
6407*4882a593Smuzhiyun 	char *key;
6408*4882a593Smuzhiyun 	int ret = 0;
6409*4882a593Smuzhiyun 
6410*4882a593Smuzhiyun 	dout("%s '%s'\n", __func__, options);
6411*4882a593Smuzhiyun 	while ((key = strsep(&options, ",")) != NULL) {
6412*4882a593Smuzhiyun 		if (*key) {
6413*4882a593Smuzhiyun 			struct fs_parameter param = {
6414*4882a593Smuzhiyun 				.key	= key,
6415*4882a593Smuzhiyun 				.type	= fs_value_is_flag,
6416*4882a593Smuzhiyun 			};
6417*4882a593Smuzhiyun 			char *value = strchr(key, '=');
6418*4882a593Smuzhiyun 			size_t v_len = 0;
6419*4882a593Smuzhiyun 
6420*4882a593Smuzhiyun 			if (value) {
6421*4882a593Smuzhiyun 				if (value == key)
6422*4882a593Smuzhiyun 					continue;
6423*4882a593Smuzhiyun 				*value++ = 0;
6424*4882a593Smuzhiyun 				v_len = strlen(value);
6425*4882a593Smuzhiyun 				param.string = kmemdup_nul(value, v_len,
6426*4882a593Smuzhiyun 							   GFP_KERNEL);
6427*4882a593Smuzhiyun 				if (!param.string)
6428*4882a593Smuzhiyun 					return -ENOMEM;
6429*4882a593Smuzhiyun 				param.type = fs_value_is_string;
6430*4882a593Smuzhiyun 			}
6431*4882a593Smuzhiyun 			param.size = v_len;
6432*4882a593Smuzhiyun 
6433*4882a593Smuzhiyun 			ret = rbd_parse_param(&param, pctx);
6434*4882a593Smuzhiyun 			kfree(param.string);
6435*4882a593Smuzhiyun 			if (ret)
6436*4882a593Smuzhiyun 				break;
6437*4882a593Smuzhiyun 		}
6438*4882a593Smuzhiyun 	}
6439*4882a593Smuzhiyun 
6440*4882a593Smuzhiyun 	return ret;
6441*4882a593Smuzhiyun }
6442*4882a593Smuzhiyun 
6443*4882a593Smuzhiyun /*
6444*4882a593Smuzhiyun  * Parse the options provided for an "rbd add" (i.e., rbd image
6445*4882a593Smuzhiyun  * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
6446*4882a593Smuzhiyun  * and the data written is passed here via a NUL-terminated buffer.
6447*4882a593Smuzhiyun  * Returns 0 if successful or an error code otherwise.
6448*4882a593Smuzhiyun  *
6449*4882a593Smuzhiyun  * The information extracted from these options is recorded in
6450*4882a593Smuzhiyun  * the other parameters which return dynamically-allocated
6451*4882a593Smuzhiyun  * structures:
6452*4882a593Smuzhiyun  *  ceph_opts
6453*4882a593Smuzhiyun  *      The address of a pointer that will refer to a ceph options
6454*4882a593Smuzhiyun  *      structure.  Caller must release the returned pointer using
6455*4882a593Smuzhiyun  *      ceph_destroy_options() when it is no longer needed.
6456*4882a593Smuzhiyun  *  rbd_opts
6457*4882a593Smuzhiyun  *	Address of an rbd options pointer.  Fully initialized by
6458*4882a593Smuzhiyun  *	this function; caller must release with kfree().
6459*4882a593Smuzhiyun  *  spec
6460*4882a593Smuzhiyun  *	Address of an rbd image specification pointer.  Fully
6461*4882a593Smuzhiyun  *	initialized by this function based on parsed options.
6462*4882a593Smuzhiyun  *	Caller must release with rbd_spec_put().
6463*4882a593Smuzhiyun  *
6464*4882a593Smuzhiyun  * The options passed take this form:
6465*4882a593Smuzhiyun  *  <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
6466*4882a593Smuzhiyun  * where:
6467*4882a593Smuzhiyun  *  <mon_addrs>
6468*4882a593Smuzhiyun  *      A comma-separated list of one or more monitor addresses.
6469*4882a593Smuzhiyun  *      A monitor address is an ip address, optionally followed
6470*4882a593Smuzhiyun  *      by a port number (separated by a colon).
6471*4882a593Smuzhiyun  *        I.e.:  ip1[:port1][,ip2[:port2]...]
6472*4882a593Smuzhiyun  *  <options>
6473*4882a593Smuzhiyun  *      A comma-separated list of ceph and/or rbd options.
6474*4882a593Smuzhiyun  *  <pool_name>
6475*4882a593Smuzhiyun  *      The name of the rados pool containing the rbd image.
6476*4882a593Smuzhiyun  *  <image_name>
6477*4882a593Smuzhiyun  *      The name of the image in that pool to map.
6478*4882a593Smuzhiyun  *  <snap_name>
6479*4882a593Smuzhiyun  *      An optional snapshot name.  If provided, the mapping will
6480*4882a593Smuzhiyun  *      present data from the image at the time that snapshot was
6481*4882a593Smuzhiyun  *      created.  The image head is used if no snapshot name is
6482*4882a593Smuzhiyun  *      provided.  Snapshot mappings are always read-only.
6483*4882a593Smuzhiyun  */
6484*4882a593Smuzhiyun static int rbd_add_parse_args(const char *buf,
6485*4882a593Smuzhiyun 				struct ceph_options **ceph_opts,
6486*4882a593Smuzhiyun 				struct rbd_options **opts,
6487*4882a593Smuzhiyun 				struct rbd_spec **rbd_spec)
6488*4882a593Smuzhiyun {
6489*4882a593Smuzhiyun 	size_t len;
6490*4882a593Smuzhiyun 	char *options;
6491*4882a593Smuzhiyun 	const char *mon_addrs;
6492*4882a593Smuzhiyun 	char *snap_name;
6493*4882a593Smuzhiyun 	size_t mon_addrs_size;
6494*4882a593Smuzhiyun 	struct rbd_parse_opts_ctx pctx = { 0 };
6495*4882a593Smuzhiyun 	int ret;
6496*4882a593Smuzhiyun 
6497*4882a593Smuzhiyun 	/* The first four tokens are required */
6498*4882a593Smuzhiyun 
6499*4882a593Smuzhiyun 	len = next_token(&buf);
6500*4882a593Smuzhiyun 	if (!len) {
6501*4882a593Smuzhiyun 		rbd_warn(NULL, "no monitor address(es) provided");
6502*4882a593Smuzhiyun 		return -EINVAL;
6503*4882a593Smuzhiyun 	}
6504*4882a593Smuzhiyun 	mon_addrs = buf;
6505*4882a593Smuzhiyun 	mon_addrs_size = len;
6506*4882a593Smuzhiyun 	buf += len;
6507*4882a593Smuzhiyun 
6508*4882a593Smuzhiyun 	ret = -EINVAL;
6509*4882a593Smuzhiyun 	options = dup_token(&buf, NULL);
6510*4882a593Smuzhiyun 	if (!options)
6511*4882a593Smuzhiyun 		return -ENOMEM;
6512*4882a593Smuzhiyun 	if (!*options) {
6513*4882a593Smuzhiyun 		rbd_warn(NULL, "no options provided");
6514*4882a593Smuzhiyun 		goto out_err;
6515*4882a593Smuzhiyun 	}
6516*4882a593Smuzhiyun 
6517*4882a593Smuzhiyun 	pctx.spec = rbd_spec_alloc();
6518*4882a593Smuzhiyun 	if (!pctx.spec)
6519*4882a593Smuzhiyun 		goto out_mem;
6520*4882a593Smuzhiyun 
6521*4882a593Smuzhiyun 	pctx.spec->pool_name = dup_token(&buf, NULL);
6522*4882a593Smuzhiyun 	if (!pctx.spec->pool_name)
6523*4882a593Smuzhiyun 		goto out_mem;
6524*4882a593Smuzhiyun 	if (!*pctx.spec->pool_name) {
6525*4882a593Smuzhiyun 		rbd_warn(NULL, "no pool name provided");
6526*4882a593Smuzhiyun 		goto out_err;
6527*4882a593Smuzhiyun 	}
6528*4882a593Smuzhiyun 
6529*4882a593Smuzhiyun 	pctx.spec->image_name = dup_token(&buf, NULL);
6530*4882a593Smuzhiyun 	if (!pctx.spec->image_name)
6531*4882a593Smuzhiyun 		goto out_mem;
6532*4882a593Smuzhiyun 	if (!*pctx.spec->image_name) {
6533*4882a593Smuzhiyun 		rbd_warn(NULL, "no image name provided");
6534*4882a593Smuzhiyun 		goto out_err;
6535*4882a593Smuzhiyun 	}
6536*4882a593Smuzhiyun 
6537*4882a593Smuzhiyun 	/*
6538*4882a593Smuzhiyun 	 * Snapshot name is optional; default is to use "-"
6539*4882a593Smuzhiyun 	 * (indicating the head/no snapshot).
6540*4882a593Smuzhiyun 	 */
6541*4882a593Smuzhiyun 	len = next_token(&buf);
6542*4882a593Smuzhiyun 	if (!len) {
6543*4882a593Smuzhiyun 		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
6544*4882a593Smuzhiyun 		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
6545*4882a593Smuzhiyun 	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
6546*4882a593Smuzhiyun 		ret = -ENAMETOOLONG;
6547*4882a593Smuzhiyun 		goto out_err;
6548*4882a593Smuzhiyun 	}
6549*4882a593Smuzhiyun 	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
6550*4882a593Smuzhiyun 	if (!snap_name)
6551*4882a593Smuzhiyun 		goto out_mem;
6552*4882a593Smuzhiyun 	*(snap_name + len) = '\0';
6553*4882a593Smuzhiyun 	pctx.spec->snap_name = snap_name;
6554*4882a593Smuzhiyun 
6555*4882a593Smuzhiyun 	pctx.copts = ceph_alloc_options();
6556*4882a593Smuzhiyun 	if (!pctx.copts)
6557*4882a593Smuzhiyun 		goto out_mem;
6558*4882a593Smuzhiyun 
6559*4882a593Smuzhiyun 	/* Initialize all rbd options to the defaults */
6560*4882a593Smuzhiyun 
6561*4882a593Smuzhiyun 	pctx.opts = kzalloc(sizeof(*pctx.opts), GFP_KERNEL);
6562*4882a593Smuzhiyun 	if (!pctx.opts)
6563*4882a593Smuzhiyun 		goto out_mem;
6564*4882a593Smuzhiyun 
6565*4882a593Smuzhiyun 	pctx.opts->read_only = RBD_READ_ONLY_DEFAULT;
6566*4882a593Smuzhiyun 	pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
6567*4882a593Smuzhiyun 	pctx.opts->alloc_size = RBD_ALLOC_SIZE_DEFAULT;
6568*4882a593Smuzhiyun 	pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT;
6569*4882a593Smuzhiyun 	pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
6570*4882a593Smuzhiyun 	pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
6571*4882a593Smuzhiyun 	pctx.opts->trim = RBD_TRIM_DEFAULT;
6572*4882a593Smuzhiyun 
6573*4882a593Smuzhiyun 	ret = ceph_parse_mon_ips(mon_addrs, mon_addrs_size, pctx.copts, NULL);
6574*4882a593Smuzhiyun 	if (ret)
6575*4882a593Smuzhiyun 		goto out_err;
6576*4882a593Smuzhiyun 
6577*4882a593Smuzhiyun 	ret = rbd_parse_options(options, &pctx);
6578*4882a593Smuzhiyun 	if (ret)
6579*4882a593Smuzhiyun 		goto out_err;
6580*4882a593Smuzhiyun 
6581*4882a593Smuzhiyun 	*ceph_opts = pctx.copts;
6582*4882a593Smuzhiyun 	*opts = pctx.opts;
6583*4882a593Smuzhiyun 	*rbd_spec = pctx.spec;
6584*4882a593Smuzhiyun 	kfree(options);
6585*4882a593Smuzhiyun 	return 0;
6586*4882a593Smuzhiyun 
6587*4882a593Smuzhiyun out_mem:
6588*4882a593Smuzhiyun 	ret = -ENOMEM;
6589*4882a593Smuzhiyun out_err:
6590*4882a593Smuzhiyun 	kfree(pctx.opts);
6591*4882a593Smuzhiyun 	ceph_destroy_options(pctx.copts);
6592*4882a593Smuzhiyun 	rbd_spec_put(pctx.spec);
6593*4882a593Smuzhiyun 	kfree(options);
6594*4882a593Smuzhiyun 	return ret;
6595*4882a593Smuzhiyun }
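
/*
 * Illustrative example (editor addition, hypothetical values): a
 * mapping request written to /sys/bus/rbd/add such as
 *
 *	1.2.3.4:6789 name=admin,queue_depth=128 rbd myimage -
 *
 * parses as mon_addrs "1.2.3.4:6789", options
 * "name=admin,queue_depth=128" (split between ceph_parse_param() and
 * rbd_parse_param() above), pool "rbd", image "myimage", and "-" for
 * the image head (no snapshot).
 */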
6596*4882a593Smuzhiyun 
6597*4882a593Smuzhiyun static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
6598*4882a593Smuzhiyun {
6599*4882a593Smuzhiyun 	down_write(&rbd_dev->lock_rwsem);
6600*4882a593Smuzhiyun 	if (__rbd_is_lock_owner(rbd_dev))
6601*4882a593Smuzhiyun 		__rbd_release_lock(rbd_dev);
6602*4882a593Smuzhiyun 	up_write(&rbd_dev->lock_rwsem);
6603*4882a593Smuzhiyun }
6604*4882a593Smuzhiyun 
6605*4882a593Smuzhiyun /*
6606*4882a593Smuzhiyun  * If the wait is interrupted, an error is returned even if the lock
6607*4882a593Smuzhiyun  * was successfully acquired.  rbd_dev_image_unlock() will release it
6608*4882a593Smuzhiyun  * if needed.
6609*4882a593Smuzhiyun  */
6610*4882a593Smuzhiyun static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
6611*4882a593Smuzhiyun {
6612*4882a593Smuzhiyun 	long ret;
6613*4882a593Smuzhiyun 
6614*4882a593Smuzhiyun 	if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
6615*4882a593Smuzhiyun 		if (!rbd_dev->opts->exclusive && !rbd_dev->opts->lock_on_read)
6616*4882a593Smuzhiyun 			return 0;
6617*4882a593Smuzhiyun 
6618*4882a593Smuzhiyun 		rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
6619*4882a593Smuzhiyun 		return -EINVAL;
6620*4882a593Smuzhiyun 	}
6621*4882a593Smuzhiyun 
6622*4882a593Smuzhiyun 	if (rbd_is_ro(rbd_dev))
6623*4882a593Smuzhiyun 		return 0;
6624*4882a593Smuzhiyun 
6625*4882a593Smuzhiyun 	rbd_assert(!rbd_is_lock_owner(rbd_dev));
6626*4882a593Smuzhiyun 	queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
6627*4882a593Smuzhiyun 	ret = wait_for_completion_killable_timeout(&rbd_dev->acquire_wait,
6628*4882a593Smuzhiyun 			    ceph_timeout_jiffies(rbd_dev->opts->lock_timeout));
6629*4882a593Smuzhiyun 	if (ret > 0) {
6630*4882a593Smuzhiyun 		ret = rbd_dev->acquire_err;
6631*4882a593Smuzhiyun 	} else {
6632*4882a593Smuzhiyun 		cancel_delayed_work_sync(&rbd_dev->lock_dwork);
6633*4882a593Smuzhiyun 		if (!ret)
6634*4882a593Smuzhiyun 			ret = -ETIMEDOUT;
6635*4882a593Smuzhiyun 	}
6636*4882a593Smuzhiyun 
6637*4882a593Smuzhiyun 	if (ret) {
6638*4882a593Smuzhiyun 		rbd_warn(rbd_dev, "failed to acquire exclusive lock: %ld", ret);
6639*4882a593Smuzhiyun 		return ret;
6640*4882a593Smuzhiyun 	}
6641*4882a593Smuzhiyun 
6642*4882a593Smuzhiyun 	/*
6643*4882a593Smuzhiyun 	 * The lock may have been released by now, unless automatic lock
6644*4882a593Smuzhiyun 	 * transitions are disabled.
6645*4882a593Smuzhiyun 	 */
6646*4882a593Smuzhiyun 	rbd_assert(!rbd_dev->opts->exclusive || rbd_is_lock_owner(rbd_dev));
6647*4882a593Smuzhiyun 	return 0;
6648*4882a593Smuzhiyun }
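
/*
 * Note (editor addition): wait_for_completion_killable_timeout()
 * returns a positive time-left value on completion, 0 on timeout and
 * -ERESTARTSYS if killed, which is why the ret > 0 branch above reads
 * acquire_err while the other branches cancel the worker and map 0 to
 * -ETIMEDOUT.
 */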
6649*4882a593Smuzhiyun 
6650*4882a593Smuzhiyun /*
6651*4882a593Smuzhiyun  * An rbd format 2 image has a unique identifier, distinct from the
6652*4882a593Smuzhiyun  * name given to it by the user.  Internally, that identifier is
6653*4882a593Smuzhiyun  * what's used to specify the names of objects related to the image.
6654*4882a593Smuzhiyun  *
6655*4882a593Smuzhiyun  * A special "rbd id" object is used to map an rbd image name to its
6656*4882a593Smuzhiyun  * id.  If that object doesn't exist, then there is no v2 rbd image
6657*4882a593Smuzhiyun  * with the supplied name.
6658*4882a593Smuzhiyun  *
6659*4882a593Smuzhiyun  * This function will record the given rbd_dev's image_id field if
6660*4882a593Smuzhiyun  * it can be determined, and in that case will return 0.  If any
6661*4882a593Smuzhiyun  * errors occur a negative errno will be returned and the rbd_dev's
6662*4882a593Smuzhiyun  * image_id field will be unchanged (and should be NULL).
6663*4882a593Smuzhiyun  */
6664*4882a593Smuzhiyun static int rbd_dev_image_id(struct rbd_device *rbd_dev)
6665*4882a593Smuzhiyun {
6666*4882a593Smuzhiyun 	int ret;
6667*4882a593Smuzhiyun 	size_t size;
6668*4882a593Smuzhiyun 	CEPH_DEFINE_OID_ONSTACK(oid);
6669*4882a593Smuzhiyun 	void *response;
6670*4882a593Smuzhiyun 	char *image_id;
6671*4882a593Smuzhiyun 
6672*4882a593Smuzhiyun 	/*
6673*4882a593Smuzhiyun 	 * When probing a parent image, the image id is already
6674*4882a593Smuzhiyun 	 * known (and the image name likely is not).  There's no
6675*4882a593Smuzhiyun 	 * need to fetch the image id again in this case.  We
6676*4882a593Smuzhiyun 	 * do still need to set the image format though.
6677*4882a593Smuzhiyun 	 */
6678*4882a593Smuzhiyun 	if (rbd_dev->spec->image_id) {
6679*4882a593Smuzhiyun 		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
6680*4882a593Smuzhiyun 
6681*4882a593Smuzhiyun 		return 0;
6682*4882a593Smuzhiyun 	}
6683*4882a593Smuzhiyun 
6684*4882a593Smuzhiyun 	/*
6685*4882a593Smuzhiyun 	 * First, see if the format 2 image id file exists, and if
6686*4882a593Smuzhiyun 	 * so, get the image's persistent id from it.
6687*4882a593Smuzhiyun 	 */
6688*4882a593Smuzhiyun 	ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
6689*4882a593Smuzhiyun 			       rbd_dev->spec->image_name);
6690*4882a593Smuzhiyun 	if (ret)
6691*4882a593Smuzhiyun 		return ret;
6692*4882a593Smuzhiyun 
6693*4882a593Smuzhiyun 	dout("rbd id object name is %s\n", oid.name);
6694*4882a593Smuzhiyun 
6695*4882a593Smuzhiyun 	/* Response will be an encoded string, which includes a length */
6696*4882a593Smuzhiyun 	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
6697*4882a593Smuzhiyun 	response = kzalloc(size, GFP_NOIO);
6698*4882a593Smuzhiyun 	if (!response) {
6699*4882a593Smuzhiyun 		ret = -ENOMEM;
6700*4882a593Smuzhiyun 		goto out;
6701*4882a593Smuzhiyun 	}
6702*4882a593Smuzhiyun 
6703*4882a593Smuzhiyun 	/* If it doesn't exist we'll assume it's a format 1 image */
6704*4882a593Smuzhiyun 
6705*4882a593Smuzhiyun 	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
6706*4882a593Smuzhiyun 				  "get_id", NULL, 0,
6707*4882a593Smuzhiyun 				  response, size);
6708*4882a593Smuzhiyun 	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
6709*4882a593Smuzhiyun 	if (ret == -ENOENT) {
6710*4882a593Smuzhiyun 		image_id = kstrdup("", GFP_KERNEL);
6711*4882a593Smuzhiyun 		ret = image_id ? 0 : -ENOMEM;
6712*4882a593Smuzhiyun 		if (!ret)
6713*4882a593Smuzhiyun 			rbd_dev->image_format = 1;
6714*4882a593Smuzhiyun 	} else if (ret >= 0) {
6715*4882a593Smuzhiyun 		void *p = response;
6716*4882a593Smuzhiyun 
6717*4882a593Smuzhiyun 		image_id = ceph_extract_encoded_string(&p, p + ret,
6718*4882a593Smuzhiyun 						NULL, GFP_NOIO);
6719*4882a593Smuzhiyun 		ret = PTR_ERR_OR_ZERO(image_id);
6720*4882a593Smuzhiyun 		if (!ret)
6721*4882a593Smuzhiyun 			rbd_dev->image_format = 2;
6722*4882a593Smuzhiyun 	}
6723*4882a593Smuzhiyun 
6724*4882a593Smuzhiyun 	if (!ret) {
6725*4882a593Smuzhiyun 		rbd_dev->spec->image_id = image_id;
6726*4882a593Smuzhiyun 		dout("image_id is %s\n", image_id);
6727*4882a593Smuzhiyun 	}
6728*4882a593Smuzhiyun out:
6729*4882a593Smuzhiyun 	kfree(response);
6730*4882a593Smuzhiyun 	ceph_oid_destroy(&oid);
6731*4882a593Smuzhiyun 	return ret;
6732*4882a593Smuzhiyun }
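
/*
 * Hedged sketch (not driver code): the "get_id" reply decoded above is a
 * ceph length-prefixed string -- a little-endian 32-bit length followed by
 * that many bytes, with no terminating NUL.  ceph_extract_encoded_string()
 * does the real work; a minimal equivalent over a raw buffer might look
 * like this (the function name is hypothetical):
 */
#if 0	/* illustrative only */
static char *example_extract_string(void **p, void *end)
{
	u32 len;
	char *s;

	if (*p + sizeof(__le32) > end)
		return ERR_PTR(-ERANGE);
	len = le32_to_cpu(*(__le32 *)*p);
	*p += sizeof(__le32);
	if (*p + len > end)
		return ERR_PTR(-ERANGE);

	s = kmemdup_nul(*p, len, GFP_NOIO);	/* copy and NUL-terminate */
	*p += len;
	return s ?: ERR_PTR(-ENOMEM);
}
#endif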
6733*4882a593Smuzhiyun 
6734*4882a593Smuzhiyun /*
6735*4882a593Smuzhiyun  * Undo whatever state changes were made by a v1 or v2 header info
6736*4882a593Smuzhiyun  * call.
6737*4882a593Smuzhiyun  */
6738*4882a593Smuzhiyun static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
6739*4882a593Smuzhiyun {
6740*4882a593Smuzhiyun 	struct rbd_image_header	*header;
6741*4882a593Smuzhiyun 
6742*4882a593Smuzhiyun 	rbd_dev_parent_put(rbd_dev);
6743*4882a593Smuzhiyun 	rbd_object_map_free(rbd_dev);
6744*4882a593Smuzhiyun 	rbd_dev_mapping_clear(rbd_dev);
6745*4882a593Smuzhiyun 
6746*4882a593Smuzhiyun 	/* Free dynamic fields from the header, then zero it out */
6747*4882a593Smuzhiyun 
6748*4882a593Smuzhiyun 	header = &rbd_dev->header;
6749*4882a593Smuzhiyun 	ceph_put_snap_context(header->snapc);
6750*4882a593Smuzhiyun 	kfree(header->snap_sizes);
6751*4882a593Smuzhiyun 	kfree(header->snap_names);
6752*4882a593Smuzhiyun 	kfree(header->object_prefix);
6753*4882a593Smuzhiyun 	memset(header, 0, sizeof (*header));
6754*4882a593Smuzhiyun }
6755*4882a593Smuzhiyun 
6756*4882a593Smuzhiyun static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
6757*4882a593Smuzhiyun {
6758*4882a593Smuzhiyun 	int ret;
6759*4882a593Smuzhiyun 
6760*4882a593Smuzhiyun 	ret = rbd_dev_v2_object_prefix(rbd_dev);
6761*4882a593Smuzhiyun 	if (ret)
6762*4882a593Smuzhiyun 		goto out_err;
6763*4882a593Smuzhiyun 
6764*4882a593Smuzhiyun 	/*
6765*4882a593Smuzhiyun 	 * Get and check the features for the image.  Currently the
6766*4882a593Smuzhiyun 	 * features are assumed to never change.
6767*4882a593Smuzhiyun 	 */
6768*4882a593Smuzhiyun 	ret = rbd_dev_v2_features(rbd_dev);
6769*4882a593Smuzhiyun 	if (ret)
6770*4882a593Smuzhiyun 		goto out_err;
6771*4882a593Smuzhiyun 
6772*4882a593Smuzhiyun 	/* If the image supports fancy striping, get its parameters */
6773*4882a593Smuzhiyun 
6774*4882a593Smuzhiyun 	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
6775*4882a593Smuzhiyun 		ret = rbd_dev_v2_striping_info(rbd_dev);
6776*4882a593Smuzhiyun 		if (ret < 0)
6777*4882a593Smuzhiyun 			goto out_err;
6778*4882a593Smuzhiyun 	}
6779*4882a593Smuzhiyun 
6780*4882a593Smuzhiyun 	if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
6781*4882a593Smuzhiyun 		ret = rbd_dev_v2_data_pool(rbd_dev);
6782*4882a593Smuzhiyun 		if (ret)
6783*4882a593Smuzhiyun 			goto out_err;
6784*4882a593Smuzhiyun 	}
6785*4882a593Smuzhiyun 
6786*4882a593Smuzhiyun 	rbd_init_layout(rbd_dev);
6787*4882a593Smuzhiyun 	return 0;
6788*4882a593Smuzhiyun 
6789*4882a593Smuzhiyun out_err:
6790*4882a593Smuzhiyun 	rbd_dev->header.features = 0;
6791*4882a593Smuzhiyun 	kfree(rbd_dev->header.object_prefix);
6792*4882a593Smuzhiyun 	rbd_dev->header.object_prefix = NULL;
6793*4882a593Smuzhiyun 	return ret;
6794*4882a593Smuzhiyun }
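
/*
 * Hedged note: header.features is a bit mask, so each optional sub-probe
 * above is gated on its feature bit with a plain bitwise test, e.g.
 * (features & RBD_FEATURE_STRIPINGV2).  The error path deliberately resets
 * only what this function may have set: the features word and the object
 * prefix.
 */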
6795*4882a593Smuzhiyun 
6796*4882a593Smuzhiyun /*
6797*4882a593Smuzhiyun  * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
6798*4882a593Smuzhiyun  * rbd_dev_image_probe() recursion depth, which means it's also the
6799*4882a593Smuzhiyun  * length of the already discovered part of the parent chain.
6800*4882a593Smuzhiyun  */
6801*4882a593Smuzhiyun static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
6802*4882a593Smuzhiyun {
6803*4882a593Smuzhiyun 	struct rbd_device *parent = NULL;
6804*4882a593Smuzhiyun 	int ret;
6805*4882a593Smuzhiyun 
6806*4882a593Smuzhiyun 	if (!rbd_dev->parent_spec)
6807*4882a593Smuzhiyun 		return 0;
6808*4882a593Smuzhiyun 
6809*4882a593Smuzhiyun 	if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
6810*4882a593Smuzhiyun 		pr_info("parent chain is too long (%d)\n", depth);
6811*4882a593Smuzhiyun 		ret = -EINVAL;
6812*4882a593Smuzhiyun 		goto out_err;
6813*4882a593Smuzhiyun 	}
6814*4882a593Smuzhiyun 
6815*4882a593Smuzhiyun 	parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
6816*4882a593Smuzhiyun 	if (!parent) {
6817*4882a593Smuzhiyun 		ret = -ENOMEM;
6818*4882a593Smuzhiyun 		goto out_err;
6819*4882a593Smuzhiyun 	}
6820*4882a593Smuzhiyun 
6821*4882a593Smuzhiyun 	/*
6822*4882a593Smuzhiyun 	 * Images related by parent/child relationships always share
6823*4882a593Smuzhiyun 	 * rbd_client and spec/parent_spec, so bump their refcounts.
6824*4882a593Smuzhiyun 	 */
6825*4882a593Smuzhiyun 	__rbd_get_client(rbd_dev->rbd_client);
6826*4882a593Smuzhiyun 	rbd_spec_get(rbd_dev->parent_spec);
6827*4882a593Smuzhiyun 
6828*4882a593Smuzhiyun 	__set_bit(RBD_DEV_FLAG_READONLY, &parent->flags);
6829*4882a593Smuzhiyun 
6830*4882a593Smuzhiyun 	ret = rbd_dev_image_probe(parent, depth);
6831*4882a593Smuzhiyun 	if (ret < 0)
6832*4882a593Smuzhiyun 		goto out_err;
6833*4882a593Smuzhiyun 
6834*4882a593Smuzhiyun 	rbd_dev->parent = parent;
6835*4882a593Smuzhiyun 	atomic_set(&rbd_dev->parent_ref, 1);
6836*4882a593Smuzhiyun 	return 0;
6837*4882a593Smuzhiyun 
6838*4882a593Smuzhiyun out_err:
6839*4882a593Smuzhiyun 	rbd_dev_unparent(rbd_dev);
6840*4882a593Smuzhiyun 	rbd_dev_destroy(parent);
6841*4882a593Smuzhiyun 	return ret;
6842*4882a593Smuzhiyun }
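
/*
 * Hedged miniature of the bounded recursion above (all names here are
 * hypothetical): each level consumes one unit of @depth, so a clone chain
 * longer than the cap is rejected instead of recursing without bound.
 */
#if 0	/* illustrative only */
static int probe_chain(struct img *img, int depth)
{
	if (!img->parent)			/* base image: chain ends */
		return 0;
	if (++depth > MAX_CHAIN_LEN)
		return -EINVAL;			/* refuse unbounded chains */
	return probe_chain(img->parent, depth);
}
#endif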
6843*4882a593Smuzhiyun 
6844*4882a593Smuzhiyun static void rbd_dev_device_release(struct rbd_device *rbd_dev)
6845*4882a593Smuzhiyun {
6846*4882a593Smuzhiyun 	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
6847*4882a593Smuzhiyun 	rbd_free_disk(rbd_dev);
6848*4882a593Smuzhiyun 	if (!single_major)
6849*4882a593Smuzhiyun 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
6850*4882a593Smuzhiyun }
6851*4882a593Smuzhiyun 
6852*4882a593Smuzhiyun /*
6853*4882a593Smuzhiyun  * rbd_dev->header_rwsem must be locked for write and will be unlocked
6854*4882a593Smuzhiyun  * upon return.
6855*4882a593Smuzhiyun  */
6856*4882a593Smuzhiyun static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
6857*4882a593Smuzhiyun {
6858*4882a593Smuzhiyun 	int ret;
6859*4882a593Smuzhiyun 
6860*4882a593Smuzhiyun 	/* Record our major and minor device numbers. */
6861*4882a593Smuzhiyun 
6862*4882a593Smuzhiyun 	if (!single_major) {
6863*4882a593Smuzhiyun 		ret = register_blkdev(0, rbd_dev->name);
6864*4882a593Smuzhiyun 		if (ret < 0)
6865*4882a593Smuzhiyun 			goto err_out_unlock;
6866*4882a593Smuzhiyun 
6867*4882a593Smuzhiyun 		rbd_dev->major = ret;
6868*4882a593Smuzhiyun 		rbd_dev->minor = 0;
6869*4882a593Smuzhiyun 	} else {
6870*4882a593Smuzhiyun 		rbd_dev->major = rbd_major;
6871*4882a593Smuzhiyun 		rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
6872*4882a593Smuzhiyun 	}
6873*4882a593Smuzhiyun 
6874*4882a593Smuzhiyun 	/* Set up the blkdev mapping. */
6875*4882a593Smuzhiyun 
6876*4882a593Smuzhiyun 	ret = rbd_init_disk(rbd_dev);
6877*4882a593Smuzhiyun 	if (ret)
6878*4882a593Smuzhiyun 		goto err_out_blkdev;
6879*4882a593Smuzhiyun 
6880*4882a593Smuzhiyun 	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
6881*4882a593Smuzhiyun 	set_disk_ro(rbd_dev->disk, rbd_is_ro(rbd_dev));
6882*4882a593Smuzhiyun 
6883*4882a593Smuzhiyun 	ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
6884*4882a593Smuzhiyun 	if (ret)
6885*4882a593Smuzhiyun 		goto err_out_disk;
6886*4882a593Smuzhiyun 
6887*4882a593Smuzhiyun 	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
6888*4882a593Smuzhiyun 	up_write(&rbd_dev->header_rwsem);
6889*4882a593Smuzhiyun 	return 0;
6890*4882a593Smuzhiyun 
6891*4882a593Smuzhiyun err_out_disk:
6892*4882a593Smuzhiyun 	rbd_free_disk(rbd_dev);
6893*4882a593Smuzhiyun err_out_blkdev:
6894*4882a593Smuzhiyun 	if (!single_major)
6895*4882a593Smuzhiyun 		unregister_blkdev(rbd_dev->major, rbd_dev->name);
6896*4882a593Smuzhiyun err_out_unlock:
6897*4882a593Smuzhiyun 	up_write(&rbd_dev->header_rwsem);
6898*4882a593Smuzhiyun 	return ret;
6899*4882a593Smuzhiyun }
6900*4882a593Smuzhiyun 
6901*4882a593Smuzhiyun static int rbd_dev_header_name(struct rbd_device *rbd_dev)
6902*4882a593Smuzhiyun {
6903*4882a593Smuzhiyun 	struct rbd_spec *spec = rbd_dev->spec;
6904*4882a593Smuzhiyun 	int ret;
6905*4882a593Smuzhiyun 
6906*4882a593Smuzhiyun 	/* Record the header object name for this rbd image. */
6907*4882a593Smuzhiyun 
6908*4882a593Smuzhiyun 	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
6909*4882a593Smuzhiyun 	if (rbd_dev->image_format == 1)
6910*4882a593Smuzhiyun 		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6911*4882a593Smuzhiyun 				       spec->image_name, RBD_SUFFIX);
6912*4882a593Smuzhiyun 	else
6913*4882a593Smuzhiyun 		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6914*4882a593Smuzhiyun 				       RBD_HEADER_PREFIX, spec->image_id);
6915*4882a593Smuzhiyun 
6916*4882a593Smuzhiyun 	return ret;
6917*4882a593Smuzhiyun }
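
/*
 * Hedged example of the resulting header object names, assuming the usual
 * rbd_types.h values (RBD_SUFFIX ".rbd", RBD_HEADER_PREFIX "rbd_header."):
 *
 *	format 1:  "<image_name>.rbd",       e.g. "myimage.rbd"
 *	format 2:  "rbd_header.<image_id>",  e.g. "rbd_header.101253bb6d72"
 */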
6918*4882a593Smuzhiyun 
6919*4882a593Smuzhiyun static void rbd_print_dne(struct rbd_device *rbd_dev, bool is_snap)
6920*4882a593Smuzhiyun {
6921*4882a593Smuzhiyun 	if (!is_snap) {
6922*4882a593Smuzhiyun 		pr_info("image %s/%s%s%s does not exist\n",
6923*4882a593Smuzhiyun 			rbd_dev->spec->pool_name,
6924*4882a593Smuzhiyun 			rbd_dev->spec->pool_ns ?: "",
6925*4882a593Smuzhiyun 			rbd_dev->spec->pool_ns ? "/" : "",
6926*4882a593Smuzhiyun 			rbd_dev->spec->image_name);
6927*4882a593Smuzhiyun 	} else {
6928*4882a593Smuzhiyun 		pr_info("snap %s/%s%s%s@%s does not exist\n",
6929*4882a593Smuzhiyun 			rbd_dev->spec->pool_name,
6930*4882a593Smuzhiyun 			rbd_dev->spec->pool_ns ?: "",
6931*4882a593Smuzhiyun 			rbd_dev->spec->pool_ns ? "/" : "",
6932*4882a593Smuzhiyun 			rbd_dev->spec->image_name,
6933*4882a593Smuzhiyun 			rbd_dev->spec->snap_name);
6934*4882a593Smuzhiyun 	}
6935*4882a593Smuzhiyun }
6936*4882a593Smuzhiyun 
6937*4882a593Smuzhiyun static void rbd_dev_image_release(struct rbd_device *rbd_dev)
6938*4882a593Smuzhiyun {
6939*4882a593Smuzhiyun 	if (!rbd_is_ro(rbd_dev))
6940*4882a593Smuzhiyun 		rbd_unregister_watch(rbd_dev);
6941*4882a593Smuzhiyun 
6942*4882a593Smuzhiyun 	rbd_dev_unprobe(rbd_dev);
6943*4882a593Smuzhiyun 	rbd_dev->image_format = 0;
6944*4882a593Smuzhiyun 	kfree(rbd_dev->spec->image_id);
6945*4882a593Smuzhiyun 	rbd_dev->spec->image_id = NULL;
6946*4882a593Smuzhiyun }
6947*4882a593Smuzhiyun 
6948*4882a593Smuzhiyun /*
6949*4882a593Smuzhiyun  * Probe for the existence of the header object for the given rbd
6950*4882a593Smuzhiyun  * device.  If this image is the one being mapped (i.e., not a
6951*4882a593Smuzhiyun  * parent), initiate a watch on its header object before using that
6952*4882a593Smuzhiyun  * object to get detailed information about the rbd image.
6953*4882a593Smuzhiyun  *
6954*4882a593Smuzhiyun  * On success, returns with header_rwsem held for write if called
6955*4882a593Smuzhiyun  * with @depth == 0.
6956*4882a593Smuzhiyun  */
6957*4882a593Smuzhiyun static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
6958*4882a593Smuzhiyun {
6959*4882a593Smuzhiyun 	bool need_watch = !rbd_is_ro(rbd_dev);
6960*4882a593Smuzhiyun 	int ret;
6961*4882a593Smuzhiyun 
6962*4882a593Smuzhiyun 	/*
6963*4882a593Smuzhiyun 	 * Get the id from the image id object.  Unless there's an
6964*4882a593Smuzhiyun 	 * error, rbd_dev->spec->image_id will be filled in with
6965*4882a593Smuzhiyun 	 * a dynamically-allocated string, and rbd_dev->image_format
6966*4882a593Smuzhiyun 	 * will be set to either 1 or 2.
6967*4882a593Smuzhiyun 	 */
6968*4882a593Smuzhiyun 	ret = rbd_dev_image_id(rbd_dev);
6969*4882a593Smuzhiyun 	if (ret)
6970*4882a593Smuzhiyun 		return ret;
6971*4882a593Smuzhiyun 
6972*4882a593Smuzhiyun 	ret = rbd_dev_header_name(rbd_dev);
6973*4882a593Smuzhiyun 	if (ret)
6974*4882a593Smuzhiyun 		goto err_out_format;
6975*4882a593Smuzhiyun 
6976*4882a593Smuzhiyun 	if (need_watch) {
6977*4882a593Smuzhiyun 		ret = rbd_register_watch(rbd_dev);
6978*4882a593Smuzhiyun 		if (ret) {
6979*4882a593Smuzhiyun 			if (ret == -ENOENT)
6980*4882a593Smuzhiyun 				rbd_print_dne(rbd_dev, false);
6981*4882a593Smuzhiyun 			goto err_out_format;
6982*4882a593Smuzhiyun 		}
6983*4882a593Smuzhiyun 	}
6984*4882a593Smuzhiyun 
6985*4882a593Smuzhiyun 	if (!depth)
6986*4882a593Smuzhiyun 		down_write(&rbd_dev->header_rwsem);
6987*4882a593Smuzhiyun 
6988*4882a593Smuzhiyun 	ret = rbd_dev_header_info(rbd_dev);
6989*4882a593Smuzhiyun 	if (ret) {
6990*4882a593Smuzhiyun 		if (ret == -ENOENT && !need_watch)
6991*4882a593Smuzhiyun 			rbd_print_dne(rbd_dev, false);
6992*4882a593Smuzhiyun 		goto err_out_probe;
6993*4882a593Smuzhiyun 	}
6994*4882a593Smuzhiyun 
6995*4882a593Smuzhiyun 	/*
6996*4882a593Smuzhiyun 	 * If this image is the one being mapped, we have pool name and
6997*4882a593Smuzhiyun 	 * id, image name and id, and snap name - need to fill snap id.
6998*4882a593Smuzhiyun 	 * Otherwise this is a parent image, identified by pool, image
6999*4882a593Smuzhiyun 	 * and snap ids - need to fill in names for those ids.
7000*4882a593Smuzhiyun 	 */
7001*4882a593Smuzhiyun 	if (!depth)
7002*4882a593Smuzhiyun 		ret = rbd_spec_fill_snap_id(rbd_dev);
7003*4882a593Smuzhiyun 	else
7004*4882a593Smuzhiyun 		ret = rbd_spec_fill_names(rbd_dev);
7005*4882a593Smuzhiyun 	if (ret) {
7006*4882a593Smuzhiyun 		if (ret == -ENOENT)
7007*4882a593Smuzhiyun 			rbd_print_dne(rbd_dev, true);
7008*4882a593Smuzhiyun 		goto err_out_probe;
7009*4882a593Smuzhiyun 	}
7010*4882a593Smuzhiyun 
7011*4882a593Smuzhiyun 	ret = rbd_dev_mapping_set(rbd_dev);
7012*4882a593Smuzhiyun 	if (ret)
7013*4882a593Smuzhiyun 		goto err_out_probe;
7014*4882a593Smuzhiyun 
7015*4882a593Smuzhiyun 	if (rbd_is_snap(rbd_dev) &&
7016*4882a593Smuzhiyun 	    (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) {
7017*4882a593Smuzhiyun 		ret = rbd_object_map_load(rbd_dev);
7018*4882a593Smuzhiyun 		if (ret)
7019*4882a593Smuzhiyun 			goto err_out_probe;
7020*4882a593Smuzhiyun 	}
7021*4882a593Smuzhiyun 
7022*4882a593Smuzhiyun 	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
7023*4882a593Smuzhiyun 		ret = rbd_dev_v2_parent_info(rbd_dev);
7024*4882a593Smuzhiyun 		if (ret)
7025*4882a593Smuzhiyun 			goto err_out_probe;
7026*4882a593Smuzhiyun 	}
7027*4882a593Smuzhiyun 
7028*4882a593Smuzhiyun 	ret = rbd_dev_probe_parent(rbd_dev, depth);
7029*4882a593Smuzhiyun 	if (ret)
7030*4882a593Smuzhiyun 		goto err_out_probe;
7031*4882a593Smuzhiyun 
7032*4882a593Smuzhiyun 	dout("discovered format %u image, header name is %s\n",
7033*4882a593Smuzhiyun 		rbd_dev->image_format, rbd_dev->header_oid.name);
7034*4882a593Smuzhiyun 	return 0;
7035*4882a593Smuzhiyun 
7036*4882a593Smuzhiyun err_out_probe:
7037*4882a593Smuzhiyun 	if (!depth)
7038*4882a593Smuzhiyun 		up_write(&rbd_dev->header_rwsem);
7039*4882a593Smuzhiyun 	if (need_watch)
7040*4882a593Smuzhiyun 		rbd_unregister_watch(rbd_dev);
7041*4882a593Smuzhiyun 	rbd_dev_unprobe(rbd_dev);
7042*4882a593Smuzhiyun err_out_format:
7043*4882a593Smuzhiyun 	rbd_dev->image_format = 0;
7044*4882a593Smuzhiyun 	kfree(rbd_dev->spec->image_id);
7045*4882a593Smuzhiyun 	rbd_dev->spec->image_id = NULL;
7046*4882a593Smuzhiyun 	return ret;
7047*4882a593Smuzhiyun }
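
/*
 * Hedged note on the lock handoff: a successful depth-0 probe returns with
 * header_rwsem held for write (taken above); the caller, do_rbd_add(),
 * then passes that lock on to rbd_dev_device_setup(), which drops it on
 * both its success and error paths.
 */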
7048*4882a593Smuzhiyun 
7049*4882a593Smuzhiyun static ssize_t do_rbd_add(struct bus_type *bus,
7050*4882a593Smuzhiyun 			  const char *buf,
7051*4882a593Smuzhiyun 			  size_t count)
7052*4882a593Smuzhiyun {
7053*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = NULL;
7054*4882a593Smuzhiyun 	struct ceph_options *ceph_opts = NULL;
7055*4882a593Smuzhiyun 	struct rbd_options *rbd_opts = NULL;
7056*4882a593Smuzhiyun 	struct rbd_spec *spec = NULL;
7057*4882a593Smuzhiyun 	struct rbd_client *rbdc;
7058*4882a593Smuzhiyun 	int rc;
7059*4882a593Smuzhiyun 
7060*4882a593Smuzhiyun 	if (!capable(CAP_SYS_ADMIN))
7061*4882a593Smuzhiyun 		return -EPERM;
7062*4882a593Smuzhiyun 
7063*4882a593Smuzhiyun 	if (!try_module_get(THIS_MODULE))
7064*4882a593Smuzhiyun 		return -ENODEV;
7065*4882a593Smuzhiyun 
7066*4882a593Smuzhiyun 	/* parse add command */
7067*4882a593Smuzhiyun 	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
7068*4882a593Smuzhiyun 	if (rc < 0)
7069*4882a593Smuzhiyun 		goto out;
7070*4882a593Smuzhiyun 
7071*4882a593Smuzhiyun 	rbdc = rbd_get_client(ceph_opts);
7072*4882a593Smuzhiyun 	if (IS_ERR(rbdc)) {
7073*4882a593Smuzhiyun 		rc = PTR_ERR(rbdc);
7074*4882a593Smuzhiyun 		goto err_out_args;
7075*4882a593Smuzhiyun 	}
7076*4882a593Smuzhiyun 
7077*4882a593Smuzhiyun 	/* pick the pool */
7078*4882a593Smuzhiyun 	rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name);
7079*4882a593Smuzhiyun 	if (rc < 0) {
7080*4882a593Smuzhiyun 		if (rc == -ENOENT)
7081*4882a593Smuzhiyun 			pr_info("pool %s does not exist\n", spec->pool_name);
7082*4882a593Smuzhiyun 		goto err_out_client;
7083*4882a593Smuzhiyun 	}
7084*4882a593Smuzhiyun 	spec->pool_id = (u64)rc;
7085*4882a593Smuzhiyun 
7086*4882a593Smuzhiyun 	rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
7087*4882a593Smuzhiyun 	if (!rbd_dev) {
7088*4882a593Smuzhiyun 		rc = -ENOMEM;
7089*4882a593Smuzhiyun 		goto err_out_client;
7090*4882a593Smuzhiyun 	}
7091*4882a593Smuzhiyun 	rbdc = NULL;		/* rbd_dev now owns this */
7092*4882a593Smuzhiyun 	spec = NULL;		/* rbd_dev now owns this */
7093*4882a593Smuzhiyun 	rbd_opts = NULL;	/* rbd_dev now owns this */
7094*4882a593Smuzhiyun 
7095*4882a593Smuzhiyun 	/* if we are mapping a snapshot it will be a read-only mapping */
7096*4882a593Smuzhiyun 	if (rbd_dev->opts->read_only ||
7097*4882a593Smuzhiyun 	    strcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME))
7098*4882a593Smuzhiyun 		__set_bit(RBD_DEV_FLAG_READONLY, &rbd_dev->flags);
7099*4882a593Smuzhiyun 
7100*4882a593Smuzhiyun 	rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
7101*4882a593Smuzhiyun 	if (!rbd_dev->config_info) {
7102*4882a593Smuzhiyun 		rc = -ENOMEM;
7103*4882a593Smuzhiyun 		goto err_out_rbd_dev;
7104*4882a593Smuzhiyun 	}
7105*4882a593Smuzhiyun 
7106*4882a593Smuzhiyun 	rc = rbd_dev_image_probe(rbd_dev, 0);
7107*4882a593Smuzhiyun 	if (rc < 0)
7108*4882a593Smuzhiyun 		goto err_out_rbd_dev;
7109*4882a593Smuzhiyun 
7110*4882a593Smuzhiyun 	if (rbd_dev->opts->alloc_size > rbd_dev->layout.object_size) {
7111*4882a593Smuzhiyun 		rbd_warn(rbd_dev, "alloc_size adjusted to %u",
7112*4882a593Smuzhiyun 			 rbd_dev->layout.object_size);
7113*4882a593Smuzhiyun 		rbd_dev->opts->alloc_size = rbd_dev->layout.object_size;
7114*4882a593Smuzhiyun 	}
7115*4882a593Smuzhiyun 
7116*4882a593Smuzhiyun 	rc = rbd_dev_device_setup(rbd_dev);
7117*4882a593Smuzhiyun 	if (rc)
7118*4882a593Smuzhiyun 		goto err_out_image_probe;
7119*4882a593Smuzhiyun 
7120*4882a593Smuzhiyun 	rc = rbd_add_acquire_lock(rbd_dev);
7121*4882a593Smuzhiyun 	if (rc)
7122*4882a593Smuzhiyun 		goto err_out_image_lock;
7123*4882a593Smuzhiyun 
7124*4882a593Smuzhiyun 	/* Everything's ready.  Announce the disk to the world. */
7125*4882a593Smuzhiyun 
7126*4882a593Smuzhiyun 	rc = device_add(&rbd_dev->dev);
7127*4882a593Smuzhiyun 	if (rc)
7128*4882a593Smuzhiyun 		goto err_out_image_lock;
7129*4882a593Smuzhiyun 
7130*4882a593Smuzhiyun 	device_add_disk(&rbd_dev->dev, rbd_dev->disk, NULL);
7131*4882a593Smuzhiyun 	/* see rbd_init_disk() */
7132*4882a593Smuzhiyun 	blk_put_queue(rbd_dev->disk->queue);
7133*4882a593Smuzhiyun 
7134*4882a593Smuzhiyun 	spin_lock(&rbd_dev_list_lock);
7135*4882a593Smuzhiyun 	list_add_tail(&rbd_dev->node, &rbd_dev_list);
7136*4882a593Smuzhiyun 	spin_unlock(&rbd_dev_list_lock);
7137*4882a593Smuzhiyun 
7138*4882a593Smuzhiyun 	pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
7139*4882a593Smuzhiyun 		(unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
7140*4882a593Smuzhiyun 		rbd_dev->header.features);
7141*4882a593Smuzhiyun 	rc = count;
7142*4882a593Smuzhiyun out:
7143*4882a593Smuzhiyun 	module_put(THIS_MODULE);
7144*4882a593Smuzhiyun 	return rc;
7145*4882a593Smuzhiyun 
7146*4882a593Smuzhiyun err_out_image_lock:
7147*4882a593Smuzhiyun 	rbd_dev_image_unlock(rbd_dev);
7148*4882a593Smuzhiyun 	rbd_dev_device_release(rbd_dev);
7149*4882a593Smuzhiyun err_out_image_probe:
7150*4882a593Smuzhiyun 	rbd_dev_image_release(rbd_dev);
7151*4882a593Smuzhiyun err_out_rbd_dev:
7152*4882a593Smuzhiyun 	rbd_dev_destroy(rbd_dev);
7153*4882a593Smuzhiyun err_out_client:
7154*4882a593Smuzhiyun 	rbd_put_client(rbdc);
7155*4882a593Smuzhiyun err_out_args:
7156*4882a593Smuzhiyun 	rbd_spec_put(spec);
7157*4882a593Smuzhiyun 	kfree(rbd_opts);
7158*4882a593Smuzhiyun 	goto out;
7159*4882a593Smuzhiyun }
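
/*
 * Hedged userspace sketch of driving do_rbd_add() through sysfs.  The add
 * string follows the form documented in sysfs-bus-rbd,
 * "<mon_addrs> <options> <pool> <image> [<snap>]"; the monitor address,
 * key and names below are placeholders:
 */
#if 0	/* userspace, illustrative only */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *spec =
		"192.168.0.1:6789 name=admin,secret=AQB... rbd myimage";
	int fd = open("/sys/bus/rbd/add", O_WRONLY);

	if (fd < 0)
		return 1;
	if (write(fd, spec, strlen(spec)) < 0)
		return 1;	/* errno mirrors do_rbd_add()'s rc */
	return close(fd);
}
#endif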
7160*4882a593Smuzhiyun 
7161*4882a593Smuzhiyun static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count)
7162*4882a593Smuzhiyun {
7163*4882a593Smuzhiyun 	if (single_major)
7164*4882a593Smuzhiyun 		return -EINVAL;
7165*4882a593Smuzhiyun 
7166*4882a593Smuzhiyun 	return do_rbd_add(bus, buf, count);
7167*4882a593Smuzhiyun }
7168*4882a593Smuzhiyun 
7169*4882a593Smuzhiyun static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
7170*4882a593Smuzhiyun 				      size_t count)
7171*4882a593Smuzhiyun {
7172*4882a593Smuzhiyun 	return do_rbd_add(bus, buf, count);
7173*4882a593Smuzhiyun }
7174*4882a593Smuzhiyun 
7175*4882a593Smuzhiyun static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
7176*4882a593Smuzhiyun {
7177*4882a593Smuzhiyun 	while (rbd_dev->parent) {
7178*4882a593Smuzhiyun 		struct rbd_device *first = rbd_dev;
7179*4882a593Smuzhiyun 		struct rbd_device *second = first->parent;
7180*4882a593Smuzhiyun 		struct rbd_device *third;
7181*4882a593Smuzhiyun 
7182*4882a593Smuzhiyun 		/*
7183*4882a593Smuzhiyun 		 * Follow to the parent with no grandparent and
7184*4882a593Smuzhiyun 		 * remove it.
7185*4882a593Smuzhiyun 		 */
7186*4882a593Smuzhiyun 		while (second && (third = second->parent)) {
7187*4882a593Smuzhiyun 			first = second;
7188*4882a593Smuzhiyun 			second = third;
7189*4882a593Smuzhiyun 		}
7190*4882a593Smuzhiyun 		rbd_assert(second);
7191*4882a593Smuzhiyun 		rbd_dev_image_release(second);
7192*4882a593Smuzhiyun 		rbd_dev_destroy(second);
7193*4882a593Smuzhiyun 		first->parent = NULL;
7194*4882a593Smuzhiyun 		first->parent_overlap = 0;
7195*4882a593Smuzhiyun 
7196*4882a593Smuzhiyun 		rbd_assert(first->parent_spec);
7197*4882a593Smuzhiyun 		rbd_spec_put(first->parent_spec);
7198*4882a593Smuzhiyun 		first->parent_spec = NULL;
7199*4882a593Smuzhiyun 	}
7200*4882a593Smuzhiyun }
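
/*
 * Hedged illustration: for a chain dev -> p1 -> p2 the loop above walks to
 * the deepest ancestor on each pass and releases p2 first, then p1, so a
 * parent image is never torn down while a child still points at it.
 */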
7201*4882a593Smuzhiyun 
7202*4882a593Smuzhiyun static ssize_t do_rbd_remove(struct bus_type *bus,
7203*4882a593Smuzhiyun 			     const char *buf,
7204*4882a593Smuzhiyun 			     size_t count)
7205*4882a593Smuzhiyun {
7206*4882a593Smuzhiyun 	struct rbd_device *rbd_dev = NULL;
7207*4882a593Smuzhiyun 	struct list_head *tmp;
7208*4882a593Smuzhiyun 	int dev_id;
7209*4882a593Smuzhiyun 	char opt_buf[6];
7210*4882a593Smuzhiyun 	bool force = false;
7211*4882a593Smuzhiyun 	int ret;
7212*4882a593Smuzhiyun 
7213*4882a593Smuzhiyun 	if (!capable(CAP_SYS_ADMIN))
7214*4882a593Smuzhiyun 		return -EPERM;
7215*4882a593Smuzhiyun 
7216*4882a593Smuzhiyun 	dev_id = -1;
7217*4882a593Smuzhiyun 	opt_buf[0] = '\0';
7218*4882a593Smuzhiyun 	sscanf(buf, "%d %5s", &dev_id, opt_buf);
7219*4882a593Smuzhiyun 	if (dev_id < 0) {
7220*4882a593Smuzhiyun 		pr_err("dev_id out of range\n");
7221*4882a593Smuzhiyun 		return -EINVAL;
7222*4882a593Smuzhiyun 	}
7223*4882a593Smuzhiyun 	if (opt_buf[0] != '\0') {
7224*4882a593Smuzhiyun 		if (!strcmp(opt_buf, "force")) {
7225*4882a593Smuzhiyun 			force = true;
7226*4882a593Smuzhiyun 		} else {
7227*4882a593Smuzhiyun 			pr_err("bad remove option at '%s'\n", opt_buf);
7228*4882a593Smuzhiyun 			return -EINVAL;
7229*4882a593Smuzhiyun 		}
7230*4882a593Smuzhiyun 	}
7231*4882a593Smuzhiyun 
7232*4882a593Smuzhiyun 	ret = -ENOENT;
7233*4882a593Smuzhiyun 	spin_lock(&rbd_dev_list_lock);
7234*4882a593Smuzhiyun 	list_for_each(tmp, &rbd_dev_list) {
7235*4882a593Smuzhiyun 		rbd_dev = list_entry(tmp, struct rbd_device, node);
7236*4882a593Smuzhiyun 		if (rbd_dev->dev_id == dev_id) {
7237*4882a593Smuzhiyun 			ret = 0;
7238*4882a593Smuzhiyun 			break;
7239*4882a593Smuzhiyun 		}
7240*4882a593Smuzhiyun 	}
7241*4882a593Smuzhiyun 	if (!ret) {
7242*4882a593Smuzhiyun 		spin_lock_irq(&rbd_dev->lock);
7243*4882a593Smuzhiyun 		if (rbd_dev->open_count && !force)
7244*4882a593Smuzhiyun 			ret = -EBUSY;
7245*4882a593Smuzhiyun 		else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING,
7246*4882a593Smuzhiyun 					  &rbd_dev->flags))
7247*4882a593Smuzhiyun 			ret = -EINPROGRESS;
7248*4882a593Smuzhiyun 		spin_unlock_irq(&rbd_dev->lock);
7249*4882a593Smuzhiyun 	}
7250*4882a593Smuzhiyun 	spin_unlock(&rbd_dev_list_lock);
7251*4882a593Smuzhiyun 	if (ret)
7252*4882a593Smuzhiyun 		return ret;
7253*4882a593Smuzhiyun 
7254*4882a593Smuzhiyun 	if (force) {
7255*4882a593Smuzhiyun 		/*
7256*4882a593Smuzhiyun 		 * Prevent new IO from being queued and wait for existing
7257*4882a593Smuzhiyun 		 * IO to complete/fail.
7258*4882a593Smuzhiyun 		 */
7259*4882a593Smuzhiyun 		blk_mq_freeze_queue(rbd_dev->disk->queue);
7260*4882a593Smuzhiyun 		blk_set_queue_dying(rbd_dev->disk->queue);
7261*4882a593Smuzhiyun 	}
7262*4882a593Smuzhiyun 
7263*4882a593Smuzhiyun 	del_gendisk(rbd_dev->disk);
7264*4882a593Smuzhiyun 	spin_lock(&rbd_dev_list_lock);
7265*4882a593Smuzhiyun 	list_del_init(&rbd_dev->node);
7266*4882a593Smuzhiyun 	spin_unlock(&rbd_dev_list_lock);
7267*4882a593Smuzhiyun 	device_del(&rbd_dev->dev);
7268*4882a593Smuzhiyun 
7269*4882a593Smuzhiyun 	rbd_dev_image_unlock(rbd_dev);
7270*4882a593Smuzhiyun 	rbd_dev_device_release(rbd_dev);
7271*4882a593Smuzhiyun 	rbd_dev_image_release(rbd_dev);
7272*4882a593Smuzhiyun 	rbd_dev_destroy(rbd_dev);
7273*4882a593Smuzhiyun 	return count;
7274*4882a593Smuzhiyun }
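
/*
 * Hedged usage sketch, mirroring the sscanf("%d %5s") parsing above:
 *
 *	echo "0"       > /sys/bus/rbd/remove	(-EBUSY while still open)
 *	echo "0 force" > /sys/bus/rbd/remove	(fails outstanding I/O first)
 *
 * With single_major=1 the remove_single_major attribute is used instead.
 */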
7275*4882a593Smuzhiyun 
7276*4882a593Smuzhiyun static ssize_t remove_store(struct bus_type *bus, const char *buf, size_t count)
7277*4882a593Smuzhiyun {
7278*4882a593Smuzhiyun 	if (single_major)
7279*4882a593Smuzhiyun 		return -EINVAL;
7280*4882a593Smuzhiyun 
7281*4882a593Smuzhiyun 	return do_rbd_remove(bus, buf, count);
7282*4882a593Smuzhiyun }
7283*4882a593Smuzhiyun 
7284*4882a593Smuzhiyun static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
7285*4882a593Smuzhiyun 					 size_t count)
7286*4882a593Smuzhiyun {
7287*4882a593Smuzhiyun 	return do_rbd_remove(bus, buf, count);
7288*4882a593Smuzhiyun }
7289*4882a593Smuzhiyun 
7290*4882a593Smuzhiyun /*
7291*4882a593Smuzhiyun  * create control files in sysfs
7292*4882a593Smuzhiyun  * /sys/bus/rbd/...
7293*4882a593Smuzhiyun  */
7294*4882a593Smuzhiyun static int __init rbd_sysfs_init(void)
7295*4882a593Smuzhiyun {
7296*4882a593Smuzhiyun 	int ret;
7297*4882a593Smuzhiyun 
7298*4882a593Smuzhiyun 	ret = device_register(&rbd_root_dev);
7299*4882a593Smuzhiyun 	if (ret < 0)
7300*4882a593Smuzhiyun 		return ret;
7301*4882a593Smuzhiyun 
7302*4882a593Smuzhiyun 	ret = bus_register(&rbd_bus_type);
7303*4882a593Smuzhiyun 	if (ret < 0)
7304*4882a593Smuzhiyun 		device_unregister(&rbd_root_dev);
7305*4882a593Smuzhiyun 
7306*4882a593Smuzhiyun 	return ret;
7307*4882a593Smuzhiyun }
7308*4882a593Smuzhiyun 
7309*4882a593Smuzhiyun static void __exit rbd_sysfs_cleanup(void)
7310*4882a593Smuzhiyun {
7311*4882a593Smuzhiyun 	bus_unregister(&rbd_bus_type);
7312*4882a593Smuzhiyun 	device_unregister(&rbd_root_dev);
7313*4882a593Smuzhiyun }
7314*4882a593Smuzhiyun 
7315*4882a593Smuzhiyun static int __init rbd_slab_init(void)
7316*4882a593Smuzhiyun {
7317*4882a593Smuzhiyun 	rbd_assert(!rbd_img_request_cache);
7318*4882a593Smuzhiyun 	rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
7319*4882a593Smuzhiyun 	if (!rbd_img_request_cache)
7320*4882a593Smuzhiyun 		return -ENOMEM;
7321*4882a593Smuzhiyun 
7322*4882a593Smuzhiyun 	rbd_assert(!rbd_obj_request_cache);
7323*4882a593Smuzhiyun 	rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
7324*4882a593Smuzhiyun 	if (!rbd_obj_request_cache)
7325*4882a593Smuzhiyun 		goto out_err;
7326*4882a593Smuzhiyun 
7327*4882a593Smuzhiyun 	return 0;
7328*4882a593Smuzhiyun 
7329*4882a593Smuzhiyun out_err:
7330*4882a593Smuzhiyun 	kmem_cache_destroy(rbd_img_request_cache);
7331*4882a593Smuzhiyun 	rbd_img_request_cache = NULL;
7332*4882a593Smuzhiyun 	return -ENOMEM;
7333*4882a593Smuzhiyun }
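
/*
 * Hedged aside: KMEM_CACHE(struct_name, flags) is the kernel shorthand for
 * kmem_cache_create() keyed on the struct's own name, size and alignment.
 * The two-cache setup above uses the usual partial-unwind pattern: if the
 * second create fails, only the first cache is destroyed before returning.
 */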
7334*4882a593Smuzhiyun 
7335*4882a593Smuzhiyun static void rbd_slab_exit(void)
7336*4882a593Smuzhiyun {
7337*4882a593Smuzhiyun 	rbd_assert(rbd_obj_request_cache);
7338*4882a593Smuzhiyun 	kmem_cache_destroy(rbd_obj_request_cache);
7339*4882a593Smuzhiyun 	rbd_obj_request_cache = NULL;
7340*4882a593Smuzhiyun 
7341*4882a593Smuzhiyun 	rbd_assert(rbd_img_request_cache);
7342*4882a593Smuzhiyun 	kmem_cache_destroy(rbd_img_request_cache);
7343*4882a593Smuzhiyun 	rbd_img_request_cache = NULL;
7344*4882a593Smuzhiyun }
7345*4882a593Smuzhiyun 
7346*4882a593Smuzhiyun static int __init rbd_init(void)
7347*4882a593Smuzhiyun {
7348*4882a593Smuzhiyun 	int rc;
7349*4882a593Smuzhiyun 
7350*4882a593Smuzhiyun 	if (!libceph_compatible(NULL)) {
7351*4882a593Smuzhiyun 		rbd_warn(NULL, "libceph incompatibility (quitting)");
7352*4882a593Smuzhiyun 		return -EINVAL;
7353*4882a593Smuzhiyun 	}
7354*4882a593Smuzhiyun 
7355*4882a593Smuzhiyun 	rc = rbd_slab_init();
7356*4882a593Smuzhiyun 	if (rc)
7357*4882a593Smuzhiyun 		return rc;
7358*4882a593Smuzhiyun 
7359*4882a593Smuzhiyun 	/*
7360*4882a593Smuzhiyun 	 * The number of active work items is limited by the number of
7361*4882a593Smuzhiyun 	 * rbd devices * queue depth, so leave @max_active at default.
7362*4882a593Smuzhiyun 	 */
7363*4882a593Smuzhiyun 	rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
7364*4882a593Smuzhiyun 	if (!rbd_wq) {
7365*4882a593Smuzhiyun 		rc = -ENOMEM;
7366*4882a593Smuzhiyun 		goto err_out_slab;
7367*4882a593Smuzhiyun 	}
7368*4882a593Smuzhiyun 
7369*4882a593Smuzhiyun 	if (single_major) {
7370*4882a593Smuzhiyun 		rbd_major = register_blkdev(0, RBD_DRV_NAME);
7371*4882a593Smuzhiyun 		if (rbd_major < 0) {
7372*4882a593Smuzhiyun 			rc = rbd_major;
7373*4882a593Smuzhiyun 			goto err_out_wq;
7374*4882a593Smuzhiyun 		}
7375*4882a593Smuzhiyun 	}
7376*4882a593Smuzhiyun 
7377*4882a593Smuzhiyun 	rc = rbd_sysfs_init();
7378*4882a593Smuzhiyun 	if (rc)
7379*4882a593Smuzhiyun 		goto err_out_blkdev;
7380*4882a593Smuzhiyun 
7381*4882a593Smuzhiyun 	if (single_major)
7382*4882a593Smuzhiyun 		pr_info("loaded (major %d)\n", rbd_major);
7383*4882a593Smuzhiyun 	else
7384*4882a593Smuzhiyun 		pr_info("loaded\n");
7385*4882a593Smuzhiyun 
7386*4882a593Smuzhiyun 	return 0;
7387*4882a593Smuzhiyun 
7388*4882a593Smuzhiyun err_out_blkdev:
7389*4882a593Smuzhiyun 	if (single_major)
7390*4882a593Smuzhiyun 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
7391*4882a593Smuzhiyun err_out_wq:
7392*4882a593Smuzhiyun 	destroy_workqueue(rbd_wq);
7393*4882a593Smuzhiyun err_out_slab:
7394*4882a593Smuzhiyun 	rbd_slab_exit();
7395*4882a593Smuzhiyun 	return rc;
7396*4882a593Smuzhiyun }
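
/*
 * Hedged miniature of the goto-unwind ladder used above (all names are
 * hypothetical): each label undoes exactly the steps that succeeded, in
 * reverse order of setup.
 */
#if 0	/* illustrative only */
	if (setup_a())
		return -ENOMEM;		/* nothing to undo yet */
	rc = setup_b();
	if (rc)
		goto undo_a;
	rc = setup_c();
	if (rc)
		goto undo_b;
	return 0;
undo_b:
	teardown_b();
undo_a:
	teardown_a();
	return rc;
#endif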
7397*4882a593Smuzhiyun 
7398*4882a593Smuzhiyun static void __exit rbd_exit(void)
7399*4882a593Smuzhiyun {
7400*4882a593Smuzhiyun 	ida_destroy(&rbd_dev_id_ida);
7401*4882a593Smuzhiyun 	rbd_sysfs_cleanup();
7402*4882a593Smuzhiyun 	if (single_major)
7403*4882a593Smuzhiyun 		unregister_blkdev(rbd_major, RBD_DRV_NAME);
7404*4882a593Smuzhiyun 	destroy_workqueue(rbd_wq);
7405*4882a593Smuzhiyun 	rbd_slab_exit();
7406*4882a593Smuzhiyun }
7407*4882a593Smuzhiyun 
7408*4882a593Smuzhiyun module_init(rbd_init);
7409*4882a593Smuzhiyun module_exit(rbd_exit);
7410*4882a593Smuzhiyun 
7411*4882a593Smuzhiyun MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
7412*4882a593Smuzhiyun MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
7413*4882a593Smuzhiyun MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
7414*4882a593Smuzhiyun /* following authorship retained from original osdblk.c */
7415*4882a593Smuzhiyun MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
7416*4882a593Smuzhiyun 
7417*4882a593Smuzhiyun MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
7418*4882a593Smuzhiyun MODULE_LICENSE("GPL");
7419