xref: /OK3568_Linux_fs/kernel/include/linux/ceph/osdmap.h (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun /* SPDX-License-Identifier: GPL-2.0 */
2*4882a593Smuzhiyun #ifndef _FS_CEPH_OSDMAP_H
3*4882a593Smuzhiyun #define _FS_CEPH_OSDMAP_H
4*4882a593Smuzhiyun 
5*4882a593Smuzhiyun #include <linux/rbtree.h>
6*4882a593Smuzhiyun #include <linux/ceph/types.h>
7*4882a593Smuzhiyun #include <linux/ceph/decode.h>
8*4882a593Smuzhiyun #include <linux/crush/crush.h>
9*4882a593Smuzhiyun 
/*
 * The osd map describes the current membership of the osd cluster and
 * specifies the mapping of objects to placement groups and placement
 * groups to (sets of) osds.  That is, it completely specifies the
 * (desired) distribution of all data objects in the system at some
 * point in time.
 *
 * Each map version is identified by an epoch, which increases monotonically.
 *
 * The map can be updated either via an incremental map (diff) describing
 * the change between two successive epochs, or as a fully encoded map.
 */
/* Placement group id: a (pool id, pg seed) pair. */
struct ceph_pg {
	uint64_t pool;	/* pool id */
	uint32_t seed;	/* pg seed */
};
26*4882a593Smuzhiyun 
/*
 * Shard value for "no shard".  The negative constant is parenthesized so
 * that it expands safely inside any surrounding expression.
 */
#define CEPH_SPG_NOSHARD	(-1)
28*4882a593Smuzhiyun 
29*4882a593Smuzhiyun struct ceph_spg {
30*4882a593Smuzhiyun 	struct ceph_pg pgid;
31*4882a593Smuzhiyun 	s8 shard;
32*4882a593Smuzhiyun };
33*4882a593Smuzhiyun 
/*
 * Comparison helpers for pg and spg ids; defined out of line.
 *
 * NOTE(review): presumably these return <0, 0 or >0 in the usual comparator
 * fashion -- confirm against the implementation.
 */
int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs);
int ceph_spg_compare(const struct ceph_spg *lhs, const struct ceph_spg *rhs);
36*4882a593Smuzhiyun 
/* Pool flag bits (see the flags member of struct ceph_pg_pool_info). */

/* hash pg seed and pool id together */
#define CEPH_POOL_FLAG_HASHPSPOOL	(1ULL << 0)
/* pool is full */
#define CEPH_POOL_FLAG_FULL		(1ULL << 1)
/* pool ran out of quota, will set FULL too */
#define CEPH_POOL_FLAG_FULL_QUOTA	(1ULL << 10)
/* pool is nearfull */
#define CEPH_POOL_FLAG_NEARFULL		(1ULL << 11)
43*4882a593Smuzhiyun 
44*4882a593Smuzhiyun struct ceph_pg_pool_info {
45*4882a593Smuzhiyun 	struct rb_node node;
46*4882a593Smuzhiyun 	s64 id;
47*4882a593Smuzhiyun 	u8 type; /* CEPH_POOL_TYPE_* */
48*4882a593Smuzhiyun 	u8 size;
49*4882a593Smuzhiyun 	u8 min_size;
50*4882a593Smuzhiyun 	u8 crush_ruleset;
51*4882a593Smuzhiyun 	u8 object_hash;
52*4882a593Smuzhiyun 	u32 last_force_request_resend;
53*4882a593Smuzhiyun 	u32 pg_num, pgp_num;
54*4882a593Smuzhiyun 	int pg_num_mask, pgp_num_mask;
55*4882a593Smuzhiyun 	s64 read_tier;
56*4882a593Smuzhiyun 	s64 write_tier; /* wins for read+write ops */
57*4882a593Smuzhiyun 	u64 flags; /* CEPH_POOL_FLAG_* */
58*4882a593Smuzhiyun 	char *name;
59*4882a593Smuzhiyun 
60*4882a593Smuzhiyun 	bool was_full;  /* for handle_one_map() */
61*4882a593Smuzhiyun };
62*4882a593Smuzhiyun 
ceph_can_shift_osds(struct ceph_pg_pool_info * pool)63*4882a593Smuzhiyun static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool)
64*4882a593Smuzhiyun {
65*4882a593Smuzhiyun 	switch (pool->type) {
66*4882a593Smuzhiyun 	case CEPH_POOL_TYPE_REP:
67*4882a593Smuzhiyun 		return true;
68*4882a593Smuzhiyun 	case CEPH_POOL_TYPE_EC:
69*4882a593Smuzhiyun 		return false;
70*4882a593Smuzhiyun 	default:
71*4882a593Smuzhiyun 		BUG();
72*4882a593Smuzhiyun 	}
73*4882a593Smuzhiyun }
74*4882a593Smuzhiyun 
75*4882a593Smuzhiyun struct ceph_object_locator {
76*4882a593Smuzhiyun 	s64 pool;
77*4882a593Smuzhiyun 	struct ceph_string *pool_ns;
78*4882a593Smuzhiyun };
79*4882a593Smuzhiyun 
ceph_oloc_init(struct ceph_object_locator * oloc)80*4882a593Smuzhiyun static inline void ceph_oloc_init(struct ceph_object_locator *oloc)
81*4882a593Smuzhiyun {
82*4882a593Smuzhiyun 	oloc->pool = -1;
83*4882a593Smuzhiyun 	oloc->pool_ns = NULL;
84*4882a593Smuzhiyun }
85*4882a593Smuzhiyun 
ceph_oloc_empty(const struct ceph_object_locator * oloc)86*4882a593Smuzhiyun static inline bool ceph_oloc_empty(const struct ceph_object_locator *oloc)
87*4882a593Smuzhiyun {
88*4882a593Smuzhiyun 	return oloc->pool == -1;
89*4882a593Smuzhiyun }
90*4882a593Smuzhiyun 
/*
 * Copy and teardown helpers for object locators; defined out of line.
 *
 * NOTE(review): ownership of pool_ns across copy/destroy is not visible
 * from this header -- check the implementation before relying on it.
 */
void ceph_oloc_copy(struct ceph_object_locator *dest,
		    const struct ceph_object_locator *src);
void ceph_oloc_destroy(struct ceph_object_locator *oloc);
94*4882a593Smuzhiyun 
/*
 * Size of the inline name buffer: 51 name characters plus one byte for
 * the NUL-terminator.
 *
 * A 51-char inline_name is long enough for all cephfs and all but one
 * rbd requests: <imgname> in "<imgname>.rbd"/"rbd_id.<imgname>" can be
 * arbitrarily long (~PAGE_SIZE).  It's done once during rbd map; all
 * other rbd requests fit into inline_name.
 *
 * Makes ceph_object_id 64 bytes on 64-bit.
 */
#define CEPH_OID_INLINE_LEN 52
104*4882a593Smuzhiyun 
105*4882a593Smuzhiyun /*
106*4882a593Smuzhiyun  * Both inline and external buffers have space for a NUL-terminator,
107*4882a593Smuzhiyun  * which is carried around.  It's not required though - RADOS object
108*4882a593Smuzhiyun  * names don't have to be NUL-terminated and may contain NULs.
109*4882a593Smuzhiyun  */
110*4882a593Smuzhiyun struct ceph_object_id {
111*4882a593Smuzhiyun 	char *name;
112*4882a593Smuzhiyun 	char inline_name[CEPH_OID_INLINE_LEN];
113*4882a593Smuzhiyun 	int name_len;
114*4882a593Smuzhiyun };
115*4882a593Smuzhiyun 
/*
 * Initializer for a struct ceph_object_id: points ->name at the object's
 * own inline buffer.  The members not named are zero-initialized, so
 * name_len starts at 0.
 */
#define __CEPH_OID_INITIALIZER(oid) { .name = (oid).inline_name }

/* Define an empty struct ceph_object_id variable called @oid. */
#define CEPH_DEFINE_OID_ONSTACK(oid)				\
	struct ceph_object_id oid = __CEPH_OID_INITIALIZER(oid)
120*4882a593Smuzhiyun 
ceph_oid_init(struct ceph_object_id * oid)121*4882a593Smuzhiyun static inline void ceph_oid_init(struct ceph_object_id *oid)
122*4882a593Smuzhiyun {
123*4882a593Smuzhiyun 	*oid = (struct ceph_object_id) __CEPH_OID_INITIALIZER(*oid);
124*4882a593Smuzhiyun }
125*4882a593Smuzhiyun 
ceph_oid_empty(const struct ceph_object_id * oid)126*4882a593Smuzhiyun static inline bool ceph_oid_empty(const struct ceph_object_id *oid)
127*4882a593Smuzhiyun {
128*4882a593Smuzhiyun 	return oid->name == oid->inline_name && !oid->name_len;
129*4882a593Smuzhiyun }
130*4882a593Smuzhiyun 
131*4882a593Smuzhiyun void ceph_oid_copy(struct ceph_object_id *dest,
132*4882a593Smuzhiyun 		   const struct ceph_object_id *src);
133*4882a593Smuzhiyun __printf(2, 3)
134*4882a593Smuzhiyun void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...);
135*4882a593Smuzhiyun __printf(3, 4)
136*4882a593Smuzhiyun int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
137*4882a593Smuzhiyun 		     const char *fmt, ...);
138*4882a593Smuzhiyun void ceph_oid_destroy(struct ceph_object_id *oid);
139*4882a593Smuzhiyun 
140*4882a593Smuzhiyun struct workspace_manager {
141*4882a593Smuzhiyun 	struct list_head idle_ws;
142*4882a593Smuzhiyun 	spinlock_t ws_lock;
143*4882a593Smuzhiyun 	/* Number of free workspaces */
144*4882a593Smuzhiyun 	int free_ws;
145*4882a593Smuzhiyun 	/* Total number of allocated workspaces */
146*4882a593Smuzhiyun 	atomic_t total_ws;
147*4882a593Smuzhiyun 	/* Waiters for a free workspace */
148*4882a593Smuzhiyun 	wait_queue_head_t ws_wait;
149*4882a593Smuzhiyun };
150*4882a593Smuzhiyun 
151*4882a593Smuzhiyun struct ceph_pg_mapping {
152*4882a593Smuzhiyun 	struct rb_node node;
153*4882a593Smuzhiyun 	struct ceph_pg pgid;
154*4882a593Smuzhiyun 
155*4882a593Smuzhiyun 	union {
156*4882a593Smuzhiyun 		struct {
157*4882a593Smuzhiyun 			int len;
158*4882a593Smuzhiyun 			int osds[];
159*4882a593Smuzhiyun 		} pg_temp, pg_upmap;
160*4882a593Smuzhiyun 		struct {
161*4882a593Smuzhiyun 			int osd;
162*4882a593Smuzhiyun 		} primary_temp;
163*4882a593Smuzhiyun 		struct {
164*4882a593Smuzhiyun 			int len;
165*4882a593Smuzhiyun 			int from_to[][2];
166*4882a593Smuzhiyun 		} pg_upmap_items;
167*4882a593Smuzhiyun 	};
168*4882a593Smuzhiyun };
169*4882a593Smuzhiyun 
170*4882a593Smuzhiyun struct ceph_osdmap {
171*4882a593Smuzhiyun 	struct ceph_fsid fsid;
172*4882a593Smuzhiyun 	u32 epoch;
173*4882a593Smuzhiyun 	struct ceph_timespec created, modified;
174*4882a593Smuzhiyun 
175*4882a593Smuzhiyun 	u32 flags;         /* CEPH_OSDMAP_* */
176*4882a593Smuzhiyun 
177*4882a593Smuzhiyun 	u32 max_osd;       /* size of osd_state, _offload, _addr arrays */
178*4882a593Smuzhiyun 	u32 *osd_state;    /* CEPH_OSD_* */
179*4882a593Smuzhiyun 	u32 *osd_weight;   /* 0 = failed, 0x10000 = 100% normal */
180*4882a593Smuzhiyun 	struct ceph_entity_addr *osd_addr;
181*4882a593Smuzhiyun 
182*4882a593Smuzhiyun 	struct rb_root pg_temp;
183*4882a593Smuzhiyun 	struct rb_root primary_temp;
184*4882a593Smuzhiyun 
185*4882a593Smuzhiyun 	/* remap (post-CRUSH, pre-up) */
186*4882a593Smuzhiyun 	struct rb_root pg_upmap;	/* PG := raw set */
187*4882a593Smuzhiyun 	struct rb_root pg_upmap_items;	/* from -> to within raw set */
188*4882a593Smuzhiyun 
189*4882a593Smuzhiyun 	u32 *osd_primary_affinity;
190*4882a593Smuzhiyun 
191*4882a593Smuzhiyun 	struct rb_root pg_pools;
192*4882a593Smuzhiyun 	u32 pool_max;
193*4882a593Smuzhiyun 
194*4882a593Smuzhiyun 	/* the CRUSH map specifies the mapping of placement groups to
195*4882a593Smuzhiyun 	 * the list of osds that store+replicate them. */
196*4882a593Smuzhiyun 	struct crush_map *crush;
197*4882a593Smuzhiyun 
198*4882a593Smuzhiyun 	struct workspace_manager crush_wsm;
199*4882a593Smuzhiyun };
200*4882a593Smuzhiyun 
ceph_osd_exists(struct ceph_osdmap * map,int osd)201*4882a593Smuzhiyun static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd)
202*4882a593Smuzhiyun {
203*4882a593Smuzhiyun 	return osd >= 0 && osd < map->max_osd &&
204*4882a593Smuzhiyun 	       (map->osd_state[osd] & CEPH_OSD_EXISTS);
205*4882a593Smuzhiyun }
206*4882a593Smuzhiyun 
ceph_osd_is_up(struct ceph_osdmap * map,int osd)207*4882a593Smuzhiyun static inline bool ceph_osd_is_up(struct ceph_osdmap *map, int osd)
208*4882a593Smuzhiyun {
209*4882a593Smuzhiyun 	return ceph_osd_exists(map, osd) &&
210*4882a593Smuzhiyun 	       (map->osd_state[osd] & CEPH_OSD_UP);
211*4882a593Smuzhiyun }
212*4882a593Smuzhiyun 
/*
 * Exact negation of ceph_osd_is_up(): true for an existing osd that is
 * not marked up, and also for any osd that does not exist in @map
 * (including out-of-range ids).
 */
static inline bool ceph_osd_is_down(struct ceph_osdmap *map, int osd)
{
	return !ceph_osd_is_up(map, osd);
}
217*4882a593Smuzhiyun 
218*4882a593Smuzhiyun char *ceph_osdmap_state_str(char *str, int len, u32 state);
219*4882a593Smuzhiyun extern u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd);
220*4882a593Smuzhiyun 
ceph_osd_addr(struct ceph_osdmap * map,int osd)221*4882a593Smuzhiyun static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
222*4882a593Smuzhiyun 						     int osd)
223*4882a593Smuzhiyun {
224*4882a593Smuzhiyun 	if (osd >= map->max_osd)
225*4882a593Smuzhiyun 		return NULL;
226*4882a593Smuzhiyun 	return &map->osd_addr[osd];
227*4882a593Smuzhiyun }
228*4882a593Smuzhiyun 
/*
 * Encoded size of a pg id in bytes, as consumed by ceph_decode_pgid():
 * version (1) + pool (8) + seed (4) + deprecated preferred value (4).
 */
#define CEPH_PGID_ENCODING_LEN		(1 + 8 + 4 + 4)
230*4882a593Smuzhiyun 
ceph_decode_pgid(void ** p,void * end,struct ceph_pg * pgid)231*4882a593Smuzhiyun static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid)
232*4882a593Smuzhiyun {
233*4882a593Smuzhiyun 	__u8 version;
234*4882a593Smuzhiyun 
235*4882a593Smuzhiyun 	if (!ceph_has_room(p, end, CEPH_PGID_ENCODING_LEN)) {
236*4882a593Smuzhiyun 		pr_warn("incomplete pg encoding\n");
237*4882a593Smuzhiyun 		return -EINVAL;
238*4882a593Smuzhiyun 	}
239*4882a593Smuzhiyun 	version = ceph_decode_8(p);
240*4882a593Smuzhiyun 	if (version > 1) {
241*4882a593Smuzhiyun 		pr_warn("do not understand pg encoding %d > 1\n",
242*4882a593Smuzhiyun 			(int)version);
243*4882a593Smuzhiyun 		return -EINVAL;
244*4882a593Smuzhiyun 	}
245*4882a593Smuzhiyun 
246*4882a593Smuzhiyun 	pgid->pool = ceph_decode_64(p);
247*4882a593Smuzhiyun 	pgid->seed = ceph_decode_32(p);
248*4882a593Smuzhiyun 	*p += 4;	/* skip deprecated preferred value */
249*4882a593Smuzhiyun 
250*4882a593Smuzhiyun 	return 0;
251*4882a593Smuzhiyun }
252*4882a593Smuzhiyun 
/*
 * osdmap lifecycle: allocate, decode a full map, apply an incremental
 * map on top of an existing one, and destroy.  All defined out of line.
 *
 * NOTE(review): failure reporting (NULL vs. ERR_PTR) and whether
 * osdmap_apply_incremental() consumes @map are not visible from this
 * header -- confirm against the implementation.
 */
struct ceph_osdmap *ceph_osdmap_alloc(void);
extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end);
struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
					     struct ceph_osdmap *map);
extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
258*4882a593Smuzhiyun 
259*4882a593Smuzhiyun struct ceph_osds {
260*4882a593Smuzhiyun 	int osds[CEPH_PG_MAX_SIZE];
261*4882a593Smuzhiyun 	int size;
262*4882a593Smuzhiyun 	int primary; /* id, NOT index */
263*4882a593Smuzhiyun };
264*4882a593Smuzhiyun 
ceph_osds_init(struct ceph_osds * set)265*4882a593Smuzhiyun static inline void ceph_osds_init(struct ceph_osds *set)
266*4882a593Smuzhiyun {
267*4882a593Smuzhiyun 	set->size = 0;
268*4882a593Smuzhiyun 	set->primary = -1;
269*4882a593Smuzhiyun }
270*4882a593Smuzhiyun 
271*4882a593Smuzhiyun void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src);
272*4882a593Smuzhiyun 
273*4882a593Smuzhiyun bool ceph_pg_is_split(const struct ceph_pg *pgid, u32 old_pg_num,
274*4882a593Smuzhiyun 		      u32 new_pg_num);
275*4882a593Smuzhiyun bool ceph_is_new_interval(const struct ceph_osds *old_acting,
276*4882a593Smuzhiyun 			  const struct ceph_osds *new_acting,
277*4882a593Smuzhiyun 			  const struct ceph_osds *old_up,
278*4882a593Smuzhiyun 			  const struct ceph_osds *new_up,
279*4882a593Smuzhiyun 			  int old_size,
280*4882a593Smuzhiyun 			  int new_size,
281*4882a593Smuzhiyun 			  int old_min_size,
282*4882a593Smuzhiyun 			  int new_min_size,
283*4882a593Smuzhiyun 			  u32 old_pg_num,
284*4882a593Smuzhiyun 			  u32 new_pg_num,
285*4882a593Smuzhiyun 			  bool old_sort_bitwise,
286*4882a593Smuzhiyun 			  bool new_sort_bitwise,
287*4882a593Smuzhiyun 			  bool old_recovery_deletes,
288*4882a593Smuzhiyun 			  bool new_recovery_deletes,
289*4882a593Smuzhiyun 			  const struct ceph_pg *pgid);
290*4882a593Smuzhiyun bool ceph_osds_changed(const struct ceph_osds *old_acting,
291*4882a593Smuzhiyun 		       const struct ceph_osds *new_acting,
292*4882a593Smuzhiyun 		       bool any_change);
293*4882a593Smuzhiyun 
/*
 * Mapping entry points; defined out of line.
 *
 * The *_object_locator_to_pg() pair writes a raw pg id for an
 * (object id, object locator) pair into @raw_pgid.  The __ variant takes
 * the pool info directly; the other takes the osdmap and returns an int.
 *
 * The ceph_pg_to_*() functions take a raw pg id and report the up/acting
 * sets, the primary shard, or the acting primary.
 *
 * NOTE(review): return conventions (errno values, meaning of the bool,
 * -1 for "no primary") are not visible from this header -- confirm
 * against the implementation.
 */
void __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi,
				 const struct ceph_object_id *oid,
				 const struct ceph_object_locator *oloc,
				 struct ceph_pg *raw_pgid);
int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
			      const struct ceph_object_id *oid,
			      const struct ceph_object_locator *oloc,
			      struct ceph_pg *raw_pgid);

void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
			       struct ceph_pg_pool_info *pi,
			       const struct ceph_pg *raw_pgid,
			       struct ceph_osds *up,
			       struct ceph_osds *acting);
bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap,
			      struct ceph_pg_pool_info *pi,
			      const struct ceph_pg *raw_pgid,
			      struct ceph_spg *spgid);
int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
			      const struct ceph_pg *raw_pgid);
314*4882a593Smuzhiyun 
/* One CRUSH location: a (type name, name) pair of strings. */
struct crush_loc {
	char *cl_type_name;
	char *cl_name;
};
319*4882a593Smuzhiyun 
320*4882a593Smuzhiyun struct crush_loc_node {
321*4882a593Smuzhiyun 	struct rb_node cl_node;
322*4882a593Smuzhiyun 	struct crush_loc cl_loc;  /* pointers into cl_data */
323*4882a593Smuzhiyun 	char cl_data[];
324*4882a593Smuzhiyun };
325*4882a593Smuzhiyun 
/*
 * CRUSH location helpers, operating on rbtrees of struct crush_loc_node;
 * defined out of line.
 *
 * NOTE(review): ceph_parse_crush_location() takes a non-const string, so
 * it may modify @crush_location while parsing -- confirm.  Return value
 * conventions are not visible from this header.
 */
int ceph_parse_crush_location(char *crush_location, struct rb_root *locs);
int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2);
void ceph_clear_crush_locs(struct rb_root *locs);

int ceph_get_crush_locality(struct ceph_osdmap *osdmap, int id,
			    struct rb_root *locs);
332*4882a593Smuzhiyun 
333*4882a593Smuzhiyun extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map,
334*4882a593Smuzhiyun 						    u64 id);
335*4882a593Smuzhiyun extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id);
336*4882a593Smuzhiyun extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name);
337*4882a593Smuzhiyun u64 ceph_pg_pool_flags(struct ceph_osdmap *map, u64 id);
338*4882a593Smuzhiyun 
339*4882a593Smuzhiyun #endif
340