/* SPDX-License-Identifier: GPL-2.0 */
2*4882a593Smuzhiyun #ifndef _FS_CEPH_OSDMAP_H
3*4882a593Smuzhiyun #define _FS_CEPH_OSDMAP_H
4*4882a593Smuzhiyun
5*4882a593Smuzhiyun #include <linux/rbtree.h>
6*4882a593Smuzhiyun #include <linux/ceph/types.h>
7*4882a593Smuzhiyun #include <linux/ceph/decode.h>
8*4882a593Smuzhiyun #include <linux/crush/crush.h>
9*4882a593Smuzhiyun
/*
 * The osd map describes the current membership of the osd cluster and
 * specifies the mapping of objects to placement groups and placement
 * groups to (sets of) osds.  That is, it completely specifies the
 * (desired) distribution of all data objects in the system at some
 * point in time.
 *
 * Each map version is identified by an epoch, which increases monotonically.
 *
 * The map can be updated either via an incremental map (diff) describing
 * the change between two successive epochs, or as a fully encoded map.
 */
/*
 * Placement group (pg) id: identified by a pool and a seed.
 */
struct ceph_pg {
	uint64_t pool;
	uint32_t seed;
};
26*4882a593Smuzhiyun
/*
 * Presumably the value of ceph_spg.shard when no shard applies -- confirm
 * against the users.  Parenthesized so the negative constant is safe in
 * any expression context.
 */
#define CEPH_SPG_NOSHARD	(-1)
28*4882a593Smuzhiyun
29*4882a593Smuzhiyun struct ceph_spg {
30*4882a593Smuzhiyun struct ceph_pg pgid;
31*4882a593Smuzhiyun s8 shard;
32*4882a593Smuzhiyun };
33*4882a593Smuzhiyun
/*
 * Comparison helpers for pg and spg ids; defined outside this header.
 * NOTE(review): presumably memcmp()-style (<0, 0, >0) -- confirm against
 * the implementation.
 */
int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs);
int ceph_spg_compare(const struct ceph_spg *lhs, const struct ceph_spg *rhs);
36*4882a593Smuzhiyun
/* Bits for ceph_pg_pool_info.flags */

/* hash pg seed and pool id together */
#define CEPH_POOL_FLAG_HASHPSPOOL	(1ULL << 0)
/* pool is full */
#define CEPH_POOL_FLAG_FULL		(1ULL << 1)
/* pool ran out of quota, will set FULL too */
#define CEPH_POOL_FLAG_FULL_QUOTA	(1ULL << 10)
/* pool is nearfull */
#define CEPH_POOL_FLAG_NEARFULL		(1ULL << 11)
43*4882a593Smuzhiyun
44*4882a593Smuzhiyun struct ceph_pg_pool_info {
45*4882a593Smuzhiyun struct rb_node node;
46*4882a593Smuzhiyun s64 id;
47*4882a593Smuzhiyun u8 type; /* CEPH_POOL_TYPE_* */
48*4882a593Smuzhiyun u8 size;
49*4882a593Smuzhiyun u8 min_size;
50*4882a593Smuzhiyun u8 crush_ruleset;
51*4882a593Smuzhiyun u8 object_hash;
52*4882a593Smuzhiyun u32 last_force_request_resend;
53*4882a593Smuzhiyun u32 pg_num, pgp_num;
54*4882a593Smuzhiyun int pg_num_mask, pgp_num_mask;
55*4882a593Smuzhiyun s64 read_tier;
56*4882a593Smuzhiyun s64 write_tier; /* wins for read+write ops */
57*4882a593Smuzhiyun u64 flags; /* CEPH_POOL_FLAG_* */
58*4882a593Smuzhiyun char *name;
59*4882a593Smuzhiyun
60*4882a593Smuzhiyun bool was_full; /* for handle_one_map() */
61*4882a593Smuzhiyun };
62*4882a593Smuzhiyun
ceph_can_shift_osds(struct ceph_pg_pool_info * pool)63*4882a593Smuzhiyun static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool)
64*4882a593Smuzhiyun {
65*4882a593Smuzhiyun switch (pool->type) {
66*4882a593Smuzhiyun case CEPH_POOL_TYPE_REP:
67*4882a593Smuzhiyun return true;
68*4882a593Smuzhiyun case CEPH_POOL_TYPE_EC:
69*4882a593Smuzhiyun return false;
70*4882a593Smuzhiyun default:
71*4882a593Smuzhiyun BUG();
72*4882a593Smuzhiyun }
73*4882a593Smuzhiyun }
74*4882a593Smuzhiyun
75*4882a593Smuzhiyun struct ceph_object_locator {
76*4882a593Smuzhiyun s64 pool;
77*4882a593Smuzhiyun struct ceph_string *pool_ns;
78*4882a593Smuzhiyun };
79*4882a593Smuzhiyun
ceph_oloc_init(struct ceph_object_locator * oloc)80*4882a593Smuzhiyun static inline void ceph_oloc_init(struct ceph_object_locator *oloc)
81*4882a593Smuzhiyun {
82*4882a593Smuzhiyun oloc->pool = -1;
83*4882a593Smuzhiyun oloc->pool_ns = NULL;
84*4882a593Smuzhiyun }
85*4882a593Smuzhiyun
ceph_oloc_empty(const struct ceph_object_locator * oloc)86*4882a593Smuzhiyun static inline bool ceph_oloc_empty(const struct ceph_object_locator *oloc)
87*4882a593Smuzhiyun {
88*4882a593Smuzhiyun return oloc->pool == -1;
89*4882a593Smuzhiyun }
90*4882a593Smuzhiyun
/* Object locator copy/teardown; defined outside this header. */
void ceph_oloc_copy(struct ceph_object_locator *dest,
		    const struct ceph_object_locator *src);
void ceph_oloc_destroy(struct ceph_object_locator *oloc);
94*4882a593Smuzhiyun
/*
 * 51-char inline_name is long enough for all cephfs and all but one
 * rbd requests: <imgname> in "<imgname>.rbd"/"rbd_id.<imgname>" can be
 * arbitrarily long (~PAGE_SIZE).  It's done once during rbd map; all
 * other rbd requests fit into inline_name.
 *
 * Makes ceph_object_id 64 bytes on 64-bit.
 */
#define CEPH_OID_INLINE_LEN 52

/*
 * Both inline and external buffers have space for a NUL-terminator,
 * which is carried around.  It's not required though - RADOS object
 * names don't have to be NUL-terminated and may contain NULs.
 *
 * @name points at @inline_name for an object id in its initial state
 * (see __CEPH_OID_INITIALIZER).
 */
struct ceph_object_id {
	char *name;
	char inline_name[CEPH_OID_INLINE_LEN];
	int name_len;
};

/*
 * Initializer that points ->name at the object's own inline buffer; all
 * other members are zero-initialized.
 */
#define __CEPH_OID_INITIALIZER(oid) { .name = (oid).inline_name }

/* Declare and initialize an object id named @oid in the current scope. */
#define CEPH_DEFINE_OID_ONSTACK(oid)				\
	struct ceph_object_id oid = __CEPH_OID_INITIALIZER(oid)
120*4882a593Smuzhiyun
ceph_oid_init(struct ceph_object_id * oid)121*4882a593Smuzhiyun static inline void ceph_oid_init(struct ceph_object_id *oid)
122*4882a593Smuzhiyun {
123*4882a593Smuzhiyun *oid = (struct ceph_object_id) __CEPH_OID_INITIALIZER(*oid);
124*4882a593Smuzhiyun }
125*4882a593Smuzhiyun
ceph_oid_empty(const struct ceph_object_id * oid)126*4882a593Smuzhiyun static inline bool ceph_oid_empty(const struct ceph_object_id *oid)
127*4882a593Smuzhiyun {
128*4882a593Smuzhiyun return oid->name == oid->inline_name && !oid->name_len;
129*4882a593Smuzhiyun }
130*4882a593Smuzhiyun
131*4882a593Smuzhiyun void ceph_oid_copy(struct ceph_object_id *dest,
132*4882a593Smuzhiyun const struct ceph_object_id *src);
133*4882a593Smuzhiyun __printf(2, 3)
134*4882a593Smuzhiyun void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...);
135*4882a593Smuzhiyun __printf(3, 4)
136*4882a593Smuzhiyun int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp,
137*4882a593Smuzhiyun const char *fmt, ...);
138*4882a593Smuzhiyun void ceph_oid_destroy(struct ceph_object_id *oid);
139*4882a593Smuzhiyun
140*4882a593Smuzhiyun struct workspace_manager {
141*4882a593Smuzhiyun struct list_head idle_ws;
142*4882a593Smuzhiyun spinlock_t ws_lock;
143*4882a593Smuzhiyun /* Number of free workspaces */
144*4882a593Smuzhiyun int free_ws;
145*4882a593Smuzhiyun /* Total number of allocated workspaces */
146*4882a593Smuzhiyun atomic_t total_ws;
147*4882a593Smuzhiyun /* Waiters for a free workspace */
148*4882a593Smuzhiyun wait_queue_head_t ws_wait;
149*4882a593Smuzhiyun };
150*4882a593Smuzhiyun
151*4882a593Smuzhiyun struct ceph_pg_mapping {
152*4882a593Smuzhiyun struct rb_node node;
153*4882a593Smuzhiyun struct ceph_pg pgid;
154*4882a593Smuzhiyun
155*4882a593Smuzhiyun union {
156*4882a593Smuzhiyun struct {
157*4882a593Smuzhiyun int len;
158*4882a593Smuzhiyun int osds[];
159*4882a593Smuzhiyun } pg_temp, pg_upmap;
160*4882a593Smuzhiyun struct {
161*4882a593Smuzhiyun int osd;
162*4882a593Smuzhiyun } primary_temp;
163*4882a593Smuzhiyun struct {
164*4882a593Smuzhiyun int len;
165*4882a593Smuzhiyun int from_to[][2];
166*4882a593Smuzhiyun } pg_upmap_items;
167*4882a593Smuzhiyun };
168*4882a593Smuzhiyun };
169*4882a593Smuzhiyun
170*4882a593Smuzhiyun struct ceph_osdmap {
171*4882a593Smuzhiyun struct ceph_fsid fsid;
172*4882a593Smuzhiyun u32 epoch;
173*4882a593Smuzhiyun struct ceph_timespec created, modified;
174*4882a593Smuzhiyun
175*4882a593Smuzhiyun u32 flags; /* CEPH_OSDMAP_* */
176*4882a593Smuzhiyun
177*4882a593Smuzhiyun u32 max_osd; /* size of osd_state, _offload, _addr arrays */
178*4882a593Smuzhiyun u32 *osd_state; /* CEPH_OSD_* */
179*4882a593Smuzhiyun u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */
180*4882a593Smuzhiyun struct ceph_entity_addr *osd_addr;
181*4882a593Smuzhiyun
182*4882a593Smuzhiyun struct rb_root pg_temp;
183*4882a593Smuzhiyun struct rb_root primary_temp;
184*4882a593Smuzhiyun
185*4882a593Smuzhiyun /* remap (post-CRUSH, pre-up) */
186*4882a593Smuzhiyun struct rb_root pg_upmap; /* PG := raw set */
187*4882a593Smuzhiyun struct rb_root pg_upmap_items; /* from -> to within raw set */
188*4882a593Smuzhiyun
189*4882a593Smuzhiyun u32 *osd_primary_affinity;
190*4882a593Smuzhiyun
191*4882a593Smuzhiyun struct rb_root pg_pools;
192*4882a593Smuzhiyun u32 pool_max;
193*4882a593Smuzhiyun
194*4882a593Smuzhiyun /* the CRUSH map specifies the mapping of placement groups to
195*4882a593Smuzhiyun * the list of osds that store+replicate them. */
196*4882a593Smuzhiyun struct crush_map *crush;
197*4882a593Smuzhiyun
198*4882a593Smuzhiyun struct workspace_manager crush_wsm;
199*4882a593Smuzhiyun };
200*4882a593Smuzhiyun
ceph_osd_exists(struct ceph_osdmap * map,int osd)201*4882a593Smuzhiyun static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd)
202*4882a593Smuzhiyun {
203*4882a593Smuzhiyun return osd >= 0 && osd < map->max_osd &&
204*4882a593Smuzhiyun (map->osd_state[osd] & CEPH_OSD_EXISTS);
205*4882a593Smuzhiyun }
206*4882a593Smuzhiyun
ceph_osd_is_up(struct ceph_osdmap * map,int osd)207*4882a593Smuzhiyun static inline bool ceph_osd_is_up(struct ceph_osdmap *map, int osd)
208*4882a593Smuzhiyun {
209*4882a593Smuzhiyun return ceph_osd_exists(map, osd) &&
210*4882a593Smuzhiyun (map->osd_state[osd] & CEPH_OSD_UP);
211*4882a593Smuzhiyun }
212*4882a593Smuzhiyun
/*
 * Exact negation of ceph_osd_is_up(): an osd that does not exist or is
 * out of range is reported as down.
 */
static inline bool ceph_osd_is_down(struct ceph_osdmap *map, int osd)
{
	return !ceph_osd_is_up(map, osd);
}
217*4882a593Smuzhiyun
218*4882a593Smuzhiyun char *ceph_osdmap_state_str(char *str, int len, u32 state);
219*4882a593Smuzhiyun extern u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd);
220*4882a593Smuzhiyun
ceph_osd_addr(struct ceph_osdmap * map,int osd)221*4882a593Smuzhiyun static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
222*4882a593Smuzhiyun int osd)
223*4882a593Smuzhiyun {
224*4882a593Smuzhiyun if (osd >= map->max_osd)
225*4882a593Smuzhiyun return NULL;
226*4882a593Smuzhiyun return &map->osd_addr[osd];
227*4882a593Smuzhiyun }
228*4882a593Smuzhiyun
/*
 * Encoded pg id size in bytes, as consumed by ceph_decode_pgid():
 * version (1) + pool (8) + seed (4) + deprecated preferred value (4).
 */
#define CEPH_PGID_ENCODING_LEN (1 + 8 + 4 + 4)
230*4882a593Smuzhiyun
ceph_decode_pgid(void ** p,void * end,struct ceph_pg * pgid)231*4882a593Smuzhiyun static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid)
232*4882a593Smuzhiyun {
233*4882a593Smuzhiyun __u8 version;
234*4882a593Smuzhiyun
235*4882a593Smuzhiyun if (!ceph_has_room(p, end, CEPH_PGID_ENCODING_LEN)) {
236*4882a593Smuzhiyun pr_warn("incomplete pg encoding\n");
237*4882a593Smuzhiyun return -EINVAL;
238*4882a593Smuzhiyun }
239*4882a593Smuzhiyun version = ceph_decode_8(p);
240*4882a593Smuzhiyun if (version > 1) {
241*4882a593Smuzhiyun pr_warn("do not understand pg encoding %d > 1\n",
242*4882a593Smuzhiyun (int)version);
243*4882a593Smuzhiyun return -EINVAL;
244*4882a593Smuzhiyun }
245*4882a593Smuzhiyun
246*4882a593Smuzhiyun pgid->pool = ceph_decode_64(p);
247*4882a593Smuzhiyun pgid->seed = ceph_decode_32(p);
248*4882a593Smuzhiyun *p += 4; /* skip deprecated preferred value */
249*4882a593Smuzhiyun
250*4882a593Smuzhiyun return 0;
251*4882a593Smuzhiyun }
252*4882a593Smuzhiyun
/*
 * osdmap lifecycle; defined outside this header.
 * NOTE(review): how the decode/apply functions report failure (NULL vs.
 * ERR_PTR) is not visible here -- confirm against the implementation.
 */
struct ceph_osdmap *ceph_osdmap_alloc(void);
extern struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end);
struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
					     struct ceph_osdmap *map);
extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
258*4882a593Smuzhiyun
259*4882a593Smuzhiyun struct ceph_osds {
260*4882a593Smuzhiyun int osds[CEPH_PG_MAX_SIZE];
261*4882a593Smuzhiyun int size;
262*4882a593Smuzhiyun int primary; /* id, NOT index */
263*4882a593Smuzhiyun };
264*4882a593Smuzhiyun
ceph_osds_init(struct ceph_osds * set)265*4882a593Smuzhiyun static inline void ceph_osds_init(struct ceph_osds *set)
266*4882a593Smuzhiyun {
267*4882a593Smuzhiyun set->size = 0;
268*4882a593Smuzhiyun set->primary = -1;
269*4882a593Smuzhiyun }
270*4882a593Smuzhiyun
271*4882a593Smuzhiyun void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src);
272*4882a593Smuzhiyun
273*4882a593Smuzhiyun bool ceph_pg_is_split(const struct ceph_pg *pgid, u32 old_pg_num,
274*4882a593Smuzhiyun u32 new_pg_num);
275*4882a593Smuzhiyun bool ceph_is_new_interval(const struct ceph_osds *old_acting,
276*4882a593Smuzhiyun const struct ceph_osds *new_acting,
277*4882a593Smuzhiyun const struct ceph_osds *old_up,
278*4882a593Smuzhiyun const struct ceph_osds *new_up,
279*4882a593Smuzhiyun int old_size,
280*4882a593Smuzhiyun int new_size,
281*4882a593Smuzhiyun int old_min_size,
282*4882a593Smuzhiyun int new_min_size,
283*4882a593Smuzhiyun u32 old_pg_num,
284*4882a593Smuzhiyun u32 new_pg_num,
285*4882a593Smuzhiyun bool old_sort_bitwise,
286*4882a593Smuzhiyun bool new_sort_bitwise,
287*4882a593Smuzhiyun bool old_recovery_deletes,
288*4882a593Smuzhiyun bool new_recovery_deletes,
289*4882a593Smuzhiyun const struct ceph_pg *pgid);
290*4882a593Smuzhiyun bool ceph_osds_changed(const struct ceph_osds *old_acting,
291*4882a593Smuzhiyun const struct ceph_osds *new_acting,
292*4882a593Smuzhiyun bool any_change);
293*4882a593Smuzhiyun
/*
 * Object -> pg and pg -> osd mapping entry points; defined outside this
 * header.
 * NOTE(review): the two *_object_locator_to_pg() variants differ in taking
 * a pool info vs. an osdmap; the int-returning one presumably fails when
 * the pool cannot be found -- confirm against the implementation.
 */
void __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi,
				 const struct ceph_object_id *oid,
				 const struct ceph_object_locator *oloc,
				 struct ceph_pg *raw_pgid);
int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
			      const struct ceph_object_id *oid,
			      const struct ceph_object_locator *oloc,
			      struct ceph_pg *raw_pgid);

void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
			       struct ceph_pg_pool_info *pi,
			       const struct ceph_pg *raw_pgid,
			       struct ceph_osds *up,
			       struct ceph_osds *acting);
bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap,
			      struct ceph_pg_pool_info *pi,
			      const struct ceph_pg *raw_pgid,
			      struct ceph_spg *spgid);
int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
			      const struct ceph_pg *raw_pgid);
314*4882a593Smuzhiyun
/* A CRUSH location as a (type name, name) pair of strings. */
struct crush_loc {
	char *cl_type_name;
	char *cl_name;
};
319*4882a593Smuzhiyun
320*4882a593Smuzhiyun struct crush_loc_node {
321*4882a593Smuzhiyun struct rb_node cl_node;
322*4882a593Smuzhiyun struct crush_loc cl_loc; /* pointers into cl_data */
323*4882a593Smuzhiyun char cl_data[];
324*4882a593Smuzhiyun };
325*4882a593Smuzhiyun
/*
 * CRUSH location helpers operating on rbtrees of crush_loc_node; defined
 * outside this header.
 * NOTE(review): ceph_parse_crush_location() takes a non-const string, so
 * it may modify its input -- confirm against the implementation.
 */
int ceph_parse_crush_location(char *crush_location, struct rb_root *locs);
int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2);
void ceph_clear_crush_locs(struct rb_root *locs);

int ceph_get_crush_locality(struct ceph_osdmap *osdmap, int id,
			    struct rb_root *locs);
332*4882a593Smuzhiyun
333*4882a593Smuzhiyun extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map,
334*4882a593Smuzhiyun u64 id);
335*4882a593Smuzhiyun extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id);
336*4882a593Smuzhiyun extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name);
337*4882a593Smuzhiyun u64 ceph_pg_pool_flags(struct ceph_osdmap *map, u64 id);
338*4882a593Smuzhiyun
339*4882a593Smuzhiyun #endif
340