2 #include <linux/ceph/ceph_debug.h>
4 #include <linux/module.h>
5 #include <linux/slab.h>
8 #include <linux/ceph/libceph.h>
9 #include <linux/ceph/osdmap.h>
10 #include <linux/ceph/decode.h>
11 #include <linux/crush/hash.h>
12 #include <linux/crush/mapper.h>
14 char *ceph_osdmap_state_str(char *str
, int len
, int state
)
19 if ((state
& CEPH_OSD_EXISTS
) && (state
& CEPH_OSD_UP
))
20 snprintf(str
, len
, "exists, up");
21 else if (state
& CEPH_OSD_EXISTS
)
22 snprintf(str
, len
, "exists");
23 else if (state
& CEPH_OSD_UP
)
24 snprintf(str
, len
, "up");
26 snprintf(str
, len
, "doesn't exist");
33 static int calc_bits_of(unsigned int t
)
44 * the foo_mask is the smallest value 2^n-1 that is >= foo.
46 static void calc_pg_masks(struct ceph_pg_pool_info
*pi
)
48 pi
->pg_num_mask
= (1 << calc_bits_of(pi
->pg_num
-1)) - 1;
49 pi
->pgp_num_mask
= (1 << calc_bits_of(pi
->pgp_num
-1)) - 1;
55 static int crush_decode_uniform_bucket(void **p
, void *end
,
56 struct crush_bucket_uniform
*b
)
58 dout("crush_decode_uniform_bucket %p to %p\n", *p
, end
);
59 ceph_decode_need(p
, end
, (1+b
->h
.size
) * sizeof(u32
), bad
);
60 b
->item_weight
= ceph_decode_32(p
);
66 static int crush_decode_list_bucket(void **p
, void *end
,
67 struct crush_bucket_list
*b
)
70 dout("crush_decode_list_bucket %p to %p\n", *p
, end
);
71 b
->item_weights
= kcalloc(b
->h
.size
, sizeof(u32
), GFP_NOFS
);
72 if (b
->item_weights
== NULL
)
74 b
->sum_weights
= kcalloc(b
->h
.size
, sizeof(u32
), GFP_NOFS
);
75 if (b
->sum_weights
== NULL
)
77 ceph_decode_need(p
, end
, 2 * b
->h
.size
* sizeof(u32
), bad
);
78 for (j
= 0; j
< b
->h
.size
; j
++) {
79 b
->item_weights
[j
] = ceph_decode_32(p
);
80 b
->sum_weights
[j
] = ceph_decode_32(p
);
87 static int crush_decode_tree_bucket(void **p
, void *end
,
88 struct crush_bucket_tree
*b
)
91 dout("crush_decode_tree_bucket %p to %p\n", *p
, end
);
92 ceph_decode_32_safe(p
, end
, b
->num_nodes
, bad
);
93 b
->node_weights
= kcalloc(b
->num_nodes
, sizeof(u32
), GFP_NOFS
);
94 if (b
->node_weights
== NULL
)
96 ceph_decode_need(p
, end
, b
->num_nodes
* sizeof(u32
), bad
);
97 for (j
= 0; j
< b
->num_nodes
; j
++)
98 b
->node_weights
[j
] = ceph_decode_32(p
);
104 static int crush_decode_straw_bucket(void **p
, void *end
,
105 struct crush_bucket_straw
*b
)
108 dout("crush_decode_straw_bucket %p to %p\n", *p
, end
);
109 b
->item_weights
= kcalloc(b
->h
.size
, sizeof(u32
), GFP_NOFS
);
110 if (b
->item_weights
== NULL
)
112 b
->straws
= kcalloc(b
->h
.size
, sizeof(u32
), GFP_NOFS
);
113 if (b
->straws
== NULL
)
115 ceph_decode_need(p
, end
, 2 * b
->h
.size
* sizeof(u32
), bad
);
116 for (j
= 0; j
< b
->h
.size
; j
++) {
117 b
->item_weights
[j
] = ceph_decode_32(p
);
118 b
->straws
[j
] = ceph_decode_32(p
);
125 static int skip_name_map(void **p
, void *end
)
128 ceph_decode_32_safe(p
, end
, len
,bad
);
132 ceph_decode_32_safe(p
, end
, strlen
, bad
);
140 static struct crush_map
*crush_decode(void *pbyval
, void *end
)
146 void *start
= pbyval
;
150 dout("crush_decode %p to %p len %d\n", *p
, end
, (int)(end
- *p
));
152 c
= kzalloc(sizeof(*c
), GFP_NOFS
);
154 return ERR_PTR(-ENOMEM
);
156 /* set tunables to default values */
157 c
->choose_local_tries
= 2;
158 c
->choose_local_fallback_tries
= 5;
159 c
->choose_total_tries
= 19;
160 c
->chooseleaf_descend_once
= 0;
162 ceph_decode_need(p
, end
, 4*sizeof(u32
), bad
);
163 magic
= ceph_decode_32(p
);
164 if (magic
!= CRUSH_MAGIC
) {
165 pr_err("crush_decode magic %x != current %x\n",
166 (unsigned int)magic
, (unsigned int)CRUSH_MAGIC
);
169 c
->max_buckets
= ceph_decode_32(p
);
170 c
->max_rules
= ceph_decode_32(p
);
171 c
->max_devices
= ceph_decode_32(p
);
173 c
->buckets
= kcalloc(c
->max_buckets
, sizeof(*c
->buckets
), GFP_NOFS
);
174 if (c
->buckets
== NULL
)
176 c
->rules
= kcalloc(c
->max_rules
, sizeof(*c
->rules
), GFP_NOFS
);
177 if (c
->rules
== NULL
)
181 for (i
= 0; i
< c
->max_buckets
; i
++) {
184 struct crush_bucket
*b
;
186 ceph_decode_32_safe(p
, end
, alg
, bad
);
188 c
->buckets
[i
] = NULL
;
191 dout("crush_decode bucket %d off %x %p to %p\n",
192 i
, (int)(*p
-start
), *p
, end
);
195 case CRUSH_BUCKET_UNIFORM
:
196 size
= sizeof(struct crush_bucket_uniform
);
198 case CRUSH_BUCKET_LIST
:
199 size
= sizeof(struct crush_bucket_list
);
201 case CRUSH_BUCKET_TREE
:
202 size
= sizeof(struct crush_bucket_tree
);
204 case CRUSH_BUCKET_STRAW
:
205 size
= sizeof(struct crush_bucket_straw
);
212 b
= c
->buckets
[i
] = kzalloc(size
, GFP_NOFS
);
216 ceph_decode_need(p
, end
, 4*sizeof(u32
), bad
);
217 b
->id
= ceph_decode_32(p
);
218 b
->type
= ceph_decode_16(p
);
219 b
->alg
= ceph_decode_8(p
);
220 b
->hash
= ceph_decode_8(p
);
221 b
->weight
= ceph_decode_32(p
);
222 b
->size
= ceph_decode_32(p
);
224 dout("crush_decode bucket size %d off %x %p to %p\n",
225 b
->size
, (int)(*p
-start
), *p
, end
);
227 b
->items
= kcalloc(b
->size
, sizeof(__s32
), GFP_NOFS
);
228 if (b
->items
== NULL
)
230 b
->perm
= kcalloc(b
->size
, sizeof(u32
), GFP_NOFS
);
235 ceph_decode_need(p
, end
, b
->size
*sizeof(u32
), bad
);
236 for (j
= 0; j
< b
->size
; j
++)
237 b
->items
[j
] = ceph_decode_32(p
);
240 case CRUSH_BUCKET_UNIFORM
:
241 err
= crush_decode_uniform_bucket(p
, end
,
242 (struct crush_bucket_uniform
*)b
);
246 case CRUSH_BUCKET_LIST
:
247 err
= crush_decode_list_bucket(p
, end
,
248 (struct crush_bucket_list
*)b
);
252 case CRUSH_BUCKET_TREE
:
253 err
= crush_decode_tree_bucket(p
, end
,
254 (struct crush_bucket_tree
*)b
);
258 case CRUSH_BUCKET_STRAW
:
259 err
= crush_decode_straw_bucket(p
, end
,
260 (struct crush_bucket_straw
*)b
);
268 dout("rule vec is %p\n", c
->rules
);
269 for (i
= 0; i
< c
->max_rules
; i
++) {
271 struct crush_rule
*r
;
273 ceph_decode_32_safe(p
, end
, yes
, bad
);
275 dout("crush_decode NO rule %d off %x %p to %p\n",
276 i
, (int)(*p
-start
), *p
, end
);
281 dout("crush_decode rule %d off %x %p to %p\n",
282 i
, (int)(*p
-start
), *p
, end
);
285 ceph_decode_32_safe(p
, end
, yes
, bad
);
286 #if BITS_PER_LONG == 32
288 if (yes
> (ULONG_MAX
- sizeof(*r
))
289 / sizeof(struct crush_rule_step
))
292 r
= c
->rules
[i
] = kmalloc(sizeof(*r
) +
293 yes
*sizeof(struct crush_rule_step
),
297 dout(" rule %d is at %p\n", i
, r
);
299 ceph_decode_copy_safe(p
, end
, &r
->mask
, 4, bad
); /* 4 u8's */
300 ceph_decode_need(p
, end
, r
->len
*3*sizeof(u32
), bad
);
301 for (j
= 0; j
< r
->len
; j
++) {
302 r
->steps
[j
].op
= ceph_decode_32(p
);
303 r
->steps
[j
].arg1
= ceph_decode_32(p
);
304 r
->steps
[j
].arg2
= ceph_decode_32(p
);
308 /* ignore trailing name maps. */
309 for (num_name_maps
= 0; num_name_maps
< 3; num_name_maps
++) {
310 err
= skip_name_map(p
, end
);
316 ceph_decode_need(p
, end
, 3*sizeof(u32
), done
);
317 c
->choose_local_tries
= ceph_decode_32(p
);
318 c
->choose_local_fallback_tries
= ceph_decode_32(p
);
319 c
->choose_total_tries
= ceph_decode_32(p
);
320 dout("crush decode tunable choose_local_tries = %d",
321 c
->choose_local_tries
);
322 dout("crush decode tunable choose_local_fallback_tries = %d",
323 c
->choose_local_fallback_tries
);
324 dout("crush decode tunable choose_total_tries = %d",
325 c
->choose_total_tries
);
327 ceph_decode_need(p
, end
, sizeof(u32
), done
);
328 c
->chooseleaf_descend_once
= ceph_decode_32(p
);
329 dout("crush decode tunable chooseleaf_descend_once = %d",
330 c
->chooseleaf_descend_once
);
333 dout("crush_decode success\n");
339 dout("crush_decode fail %d\n", err
);
345 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
348 static int pgid_cmp(struct ceph_pg l
, struct ceph_pg r
)
361 static int __insert_pg_mapping(struct ceph_pg_mapping
*new,
362 struct rb_root
*root
)
364 struct rb_node
**p
= &root
->rb_node
;
365 struct rb_node
*parent
= NULL
;
366 struct ceph_pg_mapping
*pg
= NULL
;
369 dout("__insert_pg_mapping %llx %p\n", *(u64
*)&new->pgid
, new);
372 pg
= rb_entry(parent
, struct ceph_pg_mapping
, node
);
373 c
= pgid_cmp(new->pgid
, pg
->pgid
);
382 rb_link_node(&new->node
, parent
, p
);
383 rb_insert_color(&new->node
, root
);
387 static struct ceph_pg_mapping
*__lookup_pg_mapping(struct rb_root
*root
,
390 struct rb_node
*n
= root
->rb_node
;
391 struct ceph_pg_mapping
*pg
;
395 pg
= rb_entry(n
, struct ceph_pg_mapping
, node
);
396 c
= pgid_cmp(pgid
, pg
->pgid
);
402 dout("__lookup_pg_mapping %lld.%x got %p\n",
403 pgid
.pool
, pgid
.seed
, pg
);
410 static int __remove_pg_mapping(struct rb_root
*root
, struct ceph_pg pgid
)
412 struct ceph_pg_mapping
*pg
= __lookup_pg_mapping(root
, pgid
);
415 dout("__remove_pg_mapping %lld.%x %p\n", pgid
.pool
, pgid
.seed
,
417 rb_erase(&pg
->node
, root
);
421 dout("__remove_pg_mapping %lld.%x dne\n", pgid
.pool
, pgid
.seed
);
426 * rbtree of pg pool info
428 static int __insert_pg_pool(struct rb_root
*root
, struct ceph_pg_pool_info
*new)
430 struct rb_node
**p
= &root
->rb_node
;
431 struct rb_node
*parent
= NULL
;
432 struct ceph_pg_pool_info
*pi
= NULL
;
436 pi
= rb_entry(parent
, struct ceph_pg_pool_info
, node
);
437 if (new->id
< pi
->id
)
439 else if (new->id
> pi
->id
)
445 rb_link_node(&new->node
, parent
, p
);
446 rb_insert_color(&new->node
, root
);
450 static struct ceph_pg_pool_info
*__lookup_pg_pool(struct rb_root
*root
, u64 id
)
452 struct ceph_pg_pool_info
*pi
;
453 struct rb_node
*n
= root
->rb_node
;
456 pi
= rb_entry(n
, struct ceph_pg_pool_info
, node
);
459 else if (id
> pi
->id
)
467 const char *ceph_pg_pool_name_by_id(struct ceph_osdmap
*map
, u64 id
)
469 struct ceph_pg_pool_info
*pi
;
471 if (id
== CEPH_NOPOOL
)
474 if (WARN_ON_ONCE(id
> (u64
) INT_MAX
))
477 pi
= __lookup_pg_pool(&map
->pg_pools
, (int) id
);
479 return pi
? pi
->name
: NULL
;
481 EXPORT_SYMBOL(ceph_pg_pool_name_by_id
);
483 int ceph_pg_poolid_by_name(struct ceph_osdmap
*map
, const char *name
)
487 for (rbp
= rb_first(&map
->pg_pools
); rbp
; rbp
= rb_next(rbp
)) {
488 struct ceph_pg_pool_info
*pi
=
489 rb_entry(rbp
, struct ceph_pg_pool_info
, node
);
490 if (pi
->name
&& strcmp(pi
->name
, name
) == 0)
495 EXPORT_SYMBOL(ceph_pg_poolid_by_name
);
497 static void __remove_pg_pool(struct rb_root
*root
, struct ceph_pg_pool_info
*pi
)
499 rb_erase(&pi
->node
, root
);
504 static int __decode_pool(void **p
, void *end
, struct ceph_pg_pool_info
*pi
)
510 ceph_decode_need(p
, end
, 2 + 4, bad
);
511 ev
= ceph_decode_8(p
); /* encoding version */
512 cv
= ceph_decode_8(p
); /* compat version */
514 pr_warning("got v %d < 5 cv %d of ceph_pg_pool\n", ev
, cv
);
518 pr_warning("got v %d cv %d > 7 of ceph_pg_pool\n", ev
, cv
);
521 len
= ceph_decode_32(p
);
522 ceph_decode_need(p
, end
, len
, bad
);
525 pi
->type
= ceph_decode_8(p
);
526 pi
->size
= ceph_decode_8(p
);
527 pi
->crush_ruleset
= ceph_decode_8(p
);
528 pi
->object_hash
= ceph_decode_8(p
);
530 pi
->pg_num
= ceph_decode_32(p
);
531 pi
->pgp_num
= ceph_decode_32(p
);
533 *p
+= 4 + 4; /* skip lpg* */
534 *p
+= 4; /* skip last_change */
535 *p
+= 8 + 4; /* skip snap_seq, snap_epoch */
538 num
= ceph_decode_32(p
);
540 *p
+= 8; /* snapid key */
541 *p
+= 1 + 1; /* versions */
542 len
= ceph_decode_32(p
);
546 /* skip removed snaps */
547 num
= ceph_decode_32(p
);
550 *p
+= 8; /* skip auid */
551 pi
->flags
= ceph_decode_64(p
);
553 /* ignore the rest */
563 static int __decode_pool_names(void **p
, void *end
, struct ceph_osdmap
*map
)
565 struct ceph_pg_pool_info
*pi
;
569 ceph_decode_32_safe(p
, end
, num
, bad
);
570 dout(" %d pool names\n", num
);
572 ceph_decode_64_safe(p
, end
, pool
, bad
);
573 ceph_decode_32_safe(p
, end
, len
, bad
);
574 dout(" pool %llu len %d\n", pool
, len
);
575 ceph_decode_need(p
, end
, len
, bad
);
576 pi
= __lookup_pg_pool(&map
->pg_pools
, pool
);
578 char *name
= kstrndup(*p
, len
, GFP_NOFS
);
584 dout(" name is %s\n", pi
->name
);
597 void ceph_osdmap_destroy(struct ceph_osdmap
*map
)
599 dout("osdmap_destroy %p\n", map
);
601 crush_destroy(map
->crush
);
602 while (!RB_EMPTY_ROOT(&map
->pg_temp
)) {
603 struct ceph_pg_mapping
*pg
=
604 rb_entry(rb_first(&map
->pg_temp
),
605 struct ceph_pg_mapping
, node
);
606 rb_erase(&pg
->node
, &map
->pg_temp
);
609 while (!RB_EMPTY_ROOT(&map
->pg_pools
)) {
610 struct ceph_pg_pool_info
*pi
=
611 rb_entry(rb_first(&map
->pg_pools
),
612 struct ceph_pg_pool_info
, node
);
613 __remove_pg_pool(&map
->pg_pools
, pi
);
615 kfree(map
->osd_state
);
616 kfree(map
->osd_weight
);
617 kfree(map
->osd_addr
);
622 * adjust max osd value. reallocate arrays.
624 static int osdmap_set_max_osd(struct ceph_osdmap
*map
, int max
)
627 struct ceph_entity_addr
*addr
;
630 state
= kcalloc(max
, sizeof(*state
), GFP_NOFS
);
631 addr
= kcalloc(max
, sizeof(*addr
), GFP_NOFS
);
632 weight
= kcalloc(max
, sizeof(*weight
), GFP_NOFS
);
633 if (state
== NULL
|| addr
== NULL
|| weight
== NULL
) {
641 if (map
->osd_state
) {
642 memcpy(state
, map
->osd_state
, map
->max_osd
*sizeof(*state
));
643 memcpy(addr
, map
->osd_addr
, map
->max_osd
*sizeof(*addr
));
644 memcpy(weight
, map
->osd_weight
, map
->max_osd
*sizeof(*weight
));
645 kfree(map
->osd_state
);
646 kfree(map
->osd_addr
);
647 kfree(map
->osd_weight
);
650 map
->osd_state
= state
;
651 map
->osd_weight
= weight
;
652 map
->osd_addr
= addr
;
660 struct ceph_osdmap
*osdmap_decode(void **p
, void *end
)
662 struct ceph_osdmap
*map
;
667 struct ceph_pg_pool_info
*pi
;
669 dout("osdmap_decode %p to %p len %d\n", *p
, end
, (int)(end
- *p
));
671 map
= kzalloc(sizeof(*map
), GFP_NOFS
);
673 return ERR_PTR(-ENOMEM
);
674 map
->pg_temp
= RB_ROOT
;
676 ceph_decode_16_safe(p
, end
, version
, bad
);
678 pr_warning("got unknown v %d > 6 of osdmap\n", version
);
682 pr_warning("got old v %d < 6 of osdmap\n", version
);
686 ceph_decode_need(p
, end
, 2*sizeof(u64
)+6*sizeof(u32
), bad
);
687 ceph_decode_copy(p
, &map
->fsid
, sizeof(map
->fsid
));
688 map
->epoch
= ceph_decode_32(p
);
689 ceph_decode_copy(p
, &map
->created
, sizeof(map
->created
));
690 ceph_decode_copy(p
, &map
->modified
, sizeof(map
->modified
));
692 ceph_decode_32_safe(p
, end
, max
, bad
);
694 ceph_decode_need(p
, end
, 8 + 2, bad
);
696 pi
= kzalloc(sizeof(*pi
), GFP_NOFS
);
699 pi
->id
= ceph_decode_64(p
);
700 err
= __decode_pool(p
, end
, pi
);
705 __insert_pg_pool(&map
->pg_pools
, pi
);
708 err
= __decode_pool_names(p
, end
, map
);
710 dout("fail to decode pool names");
714 ceph_decode_32_safe(p
, end
, map
->pool_max
, bad
);
716 ceph_decode_32_safe(p
, end
, map
->flags
, bad
);
718 max
= ceph_decode_32(p
);
720 /* (re)alloc osd arrays */
721 err
= osdmap_set_max_osd(map
, max
);
724 dout("osdmap_decode max_osd = %d\n", map
->max_osd
);
728 ceph_decode_need(p
, end
, 3*sizeof(u32
) +
729 map
->max_osd
*(1 + sizeof(*map
->osd_weight
) +
730 sizeof(*map
->osd_addr
)), bad
);
731 *p
+= 4; /* skip length field (should match max) */
732 ceph_decode_copy(p
, map
->osd_state
, map
->max_osd
);
734 *p
+= 4; /* skip length field (should match max) */
735 for (i
= 0; i
< map
->max_osd
; i
++)
736 map
->osd_weight
[i
] = ceph_decode_32(p
);
738 *p
+= 4; /* skip length field (should match max) */
739 ceph_decode_copy(p
, map
->osd_addr
, map
->max_osd
*sizeof(*map
->osd_addr
));
740 for (i
= 0; i
< map
->max_osd
; i
++)
741 ceph_decode_addr(&map
->osd_addr
[i
]);
744 ceph_decode_32_safe(p
, end
, len
, bad
);
745 for (i
= 0; i
< len
; i
++) {
748 struct ceph_pg_mapping
*pg
;
750 err
= ceph_decode_pgid(p
, end
, &pgid
);
753 ceph_decode_need(p
, end
, sizeof(u32
), bad
);
754 n
= ceph_decode_32(p
);
756 if (n
> (UINT_MAX
- sizeof(*pg
)) / sizeof(u32
))
758 ceph_decode_need(p
, end
, n
* sizeof(u32
), bad
);
760 pg
= kmalloc(sizeof(*pg
) + n
*sizeof(u32
), GFP_NOFS
);
765 for (j
= 0; j
< n
; j
++)
766 pg
->osds
[j
] = ceph_decode_32(p
);
768 err
= __insert_pg_mapping(pg
, &map
->pg_temp
);
771 dout(" added pg_temp %lld.%x len %d\n", pgid
.pool
, pgid
.seed
,
776 ceph_decode_32_safe(p
, end
, len
, bad
);
777 dout("osdmap_decode crush len %d from off 0x%x\n", len
,
779 ceph_decode_need(p
, end
, len
, bad
);
780 map
->crush
= crush_decode(*p
, end
);
782 if (IS_ERR(map
->crush
)) {
783 err
= PTR_ERR(map
->crush
);
788 /* ignore the rest of the map */
791 dout("osdmap_decode done %p %p\n", *p
, end
);
795 dout("osdmap_decode fail err %d\n", err
);
796 ceph_osdmap_destroy(map
);
801 * decode and apply an incremental map update.
803 struct ceph_osdmap
*osdmap_apply_incremental(void **p
, void *end
,
804 struct ceph_osdmap
*map
,
805 struct ceph_messenger
*msgr
)
807 struct crush_map
*newcrush
= NULL
;
808 struct ceph_fsid fsid
;
810 struct ceph_timespec modified
;
814 __s32 new_flags
, max
;
819 ceph_decode_16_safe(p
, end
, version
, bad
);
821 pr_warning("got unknown v %d != 6 of inc osdmap\n", version
);
825 ceph_decode_need(p
, end
, sizeof(fsid
)+sizeof(modified
)+2*sizeof(u32
),
827 ceph_decode_copy(p
, &fsid
, sizeof(fsid
));
828 epoch
= ceph_decode_32(p
);
829 BUG_ON(epoch
!= map
->epoch
+1);
830 ceph_decode_copy(p
, &modified
, sizeof(modified
));
831 new_pool_max
= ceph_decode_64(p
);
832 new_flags
= ceph_decode_32(p
);
835 ceph_decode_32_safe(p
, end
, len
, bad
);
837 dout("apply_incremental full map len %d, %p to %p\n",
839 return osdmap_decode(p
, min(*p
+len
, end
));
843 ceph_decode_32_safe(p
, end
, len
, bad
);
845 dout("apply_incremental new crush map len %d, %p to %p\n",
847 newcrush
= crush_decode(*p
, min(*p
+len
, end
));
848 if (IS_ERR(newcrush
))
849 return ERR_CAST(newcrush
);
855 map
->flags
= new_flags
;
856 if (new_pool_max
>= 0)
857 map
->pool_max
= new_pool_max
;
859 ceph_decode_need(p
, end
, 5*sizeof(u32
), bad
);
862 max
= ceph_decode_32(p
);
864 err
= osdmap_set_max_osd(map
, max
);
870 map
->modified
= modified
;
873 crush_destroy(map
->crush
);
874 map
->crush
= newcrush
;
879 ceph_decode_32_safe(p
, end
, len
, bad
);
881 struct ceph_pg_pool_info
*pi
;
883 ceph_decode_64_safe(p
, end
, pool
, bad
);
884 pi
= __lookup_pg_pool(&map
->pg_pools
, pool
);
886 pi
= kzalloc(sizeof(*pi
), GFP_NOFS
);
892 __insert_pg_pool(&map
->pg_pools
, pi
);
894 err
= __decode_pool(p
, end
, pi
);
899 err
= __decode_pool_names(p
, end
, map
);
905 ceph_decode_32_safe(p
, end
, len
, bad
);
907 struct ceph_pg_pool_info
*pi
;
909 ceph_decode_64_safe(p
, end
, pool
, bad
);
910 pi
= __lookup_pg_pool(&map
->pg_pools
, pool
);
912 __remove_pg_pool(&map
->pg_pools
, pi
);
917 ceph_decode_32_safe(p
, end
, len
, bad
);
920 struct ceph_entity_addr addr
;
921 ceph_decode_32_safe(p
, end
, osd
, bad
);
922 ceph_decode_copy_safe(p
, end
, &addr
, sizeof(addr
), bad
);
923 ceph_decode_addr(&addr
);
924 pr_info("osd%d up\n", osd
);
925 BUG_ON(osd
>= map
->max_osd
);
926 map
->osd_state
[osd
] |= CEPH_OSD_UP
;
927 map
->osd_addr
[osd
] = addr
;
931 ceph_decode_32_safe(p
, end
, len
, bad
);
935 ceph_decode_32_safe(p
, end
, osd
, bad
);
936 xorstate
= **(u8
**)p
;
937 (*p
)++; /* clean flag */
939 xorstate
= CEPH_OSD_UP
;
940 if (xorstate
& CEPH_OSD_UP
)
941 pr_info("osd%d down\n", osd
);
942 if (osd
< map
->max_osd
)
943 map
->osd_state
[osd
] ^= xorstate
;
947 ceph_decode_32_safe(p
, end
, len
, bad
);
950 ceph_decode_need(p
, end
, sizeof(u32
)*2, bad
);
951 osd
= ceph_decode_32(p
);
952 off
= ceph_decode_32(p
);
953 pr_info("osd%d weight 0x%x %s\n", osd
, off
,
954 off
== CEPH_OSD_IN
? "(in)" :
955 (off
== CEPH_OSD_OUT
? "(out)" : ""));
956 if (osd
< map
->max_osd
)
957 map
->osd_weight
[osd
] = off
;
961 ceph_decode_32_safe(p
, end
, len
, bad
);
963 struct ceph_pg_mapping
*pg
;
968 err
= ceph_decode_pgid(p
, end
, &pgid
);
971 ceph_decode_need(p
, end
, sizeof(u32
), bad
);
972 pglen
= ceph_decode_32(p
);
974 ceph_decode_need(p
, end
, pglen
*sizeof(u32
), bad
);
976 /* removing existing (if any) */
977 (void) __remove_pg_mapping(&map
->pg_temp
, pgid
);
981 if (pglen
> (UINT_MAX
- sizeof(*pg
)) / sizeof(u32
))
984 pg
= kmalloc(sizeof(*pg
) + sizeof(u32
)*pglen
, GFP_NOFS
);
989 for (j
= 0; j
< pglen
; j
++)
990 pg
->osds
[j
] = ceph_decode_32(p
);
991 err
= __insert_pg_mapping(pg
, &map
->pg_temp
);
996 dout(" added pg_temp %lld.%x len %d\n", pgid
.pool
,
1000 __remove_pg_mapping(&map
->pg_temp
, pgid
);
1004 /* ignore the rest */
1009 pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
1010 epoch
, (int)(*p
- start
), *p
, start
, end
);
1011 print_hex_dump(KERN_DEBUG
, "osdmap: ",
1012 DUMP_PREFIX_OFFSET
, 16, 1,
1013 start
, end
- start
, true);
1015 crush_destroy(newcrush
);
1016 return ERR_PTR(err
);
1023 * calculate file layout from given offset, length.
1024 * fill in correct oid, logical length, and object extent
1027 * for now, we write only a single su, until we can
1028 * pass a stride back to the caller.
1030 int ceph_calc_file_object_mapping(struct ceph_file_layout
*layout
,
1033 u64
*oxoff
, u64
*oxlen
)
1035 u32 osize
= le32_to_cpu(layout
->fl_object_size
);
1036 u32 su
= le32_to_cpu(layout
->fl_stripe_unit
);
1037 u32 sc
= le32_to_cpu(layout
->fl_stripe_count
);
1038 u32 bl
, stripeno
, stripepos
, objsetno
;
1042 dout("mapping %llu~%llu osize %u fl_su %u\n", off
, len
,
1044 if (su
== 0 || sc
== 0)
1046 su_per_object
= osize
/ su
;
1047 if (su_per_object
== 0)
1049 dout("osize %u / su %u = su_per_object %u\n", osize
, su
,
1052 if ((su
& ~PAGE_MASK
) != 0)
1055 /* bl = *off / su; */
1059 dout("off %llu / su %u = bl %u\n", off
, su
, bl
);
1062 stripepos
= bl
% sc
;
1063 objsetno
= stripeno
/ su_per_object
;
1065 *ono
= objsetno
* sc
+ stripepos
;
1066 dout("objset %u * sc %u = ono %u\n", objsetno
, sc
, (unsigned int)*ono
);
1068 /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */
1070 su_offset
= do_div(t
, su
);
1071 *oxoff
= su_offset
+ (stripeno
% su_per_object
) * su
;
1074 * Calculate the length of the extent being written to the selected
1075 * object. This is the minimum of the full length requested (len) or
1076 * the remainder of the current stripe being written to.
1078 *oxlen
= min_t(u64
, len
, su
- su_offset
);
1080 dout(" obj extent %llu~%llu\n", *oxoff
, *oxlen
);
1084 dout(" invalid layout\n");
1090 EXPORT_SYMBOL(ceph_calc_file_object_mapping
);
1093 * calculate an object layout (i.e. pgid) from an oid,
1094 * file_layout, and osdmap
1096 int ceph_calc_ceph_pg(struct ceph_pg
*pg
, const char *oid
,
1097 struct ceph_osdmap
*osdmap
, uint64_t pool
)
1099 struct ceph_pg_pool_info
*pool_info
;
1102 pool_info
= __lookup_pg_pool(&osdmap
->pg_pools
, pool
);
1106 pg
->seed
= ceph_str_hash(pool_info
->object_hash
, oid
, strlen(oid
));
1108 dout("%s '%s' pgid %lld.%x\n", __func__
, oid
, pg
->pool
, pg
->seed
);
1111 EXPORT_SYMBOL(ceph_calc_ceph_pg
);
1114 * Calculate raw osd vector for the given pgid. Return pointer to osd
1115 * array, or NULL on failure.
1117 static int *calc_pg_raw(struct ceph_osdmap
*osdmap
, struct ceph_pg pgid
,
1118 int *osds
, int *num
)
1120 struct ceph_pg_mapping
*pg
;
1121 struct ceph_pg_pool_info
*pool
;
1126 pool
= __lookup_pg_pool(&osdmap
->pg_pools
, pgid
.pool
);
1131 pgid
.seed
= ceph_stable_mod(pgid
.seed
, pool
->pg_num
,
1133 pg
= __lookup_pg_mapping(&osdmap
->pg_temp
, pgid
);
1140 ruleno
= crush_find_rule(osdmap
->crush
, pool
->crush_ruleset
,
1141 pool
->type
, pool
->size
);
1143 pr_err("no crush rule pool %lld ruleset %d type %d size %d\n",
1144 pgid
.pool
, pool
->crush_ruleset
, pool
->type
,
1149 if (pool
->flags
& CEPH_POOL_FLAG_HASHPSPOOL
) {
1150 /* hash pool id and seed so that pool PGs do not overlap */
1151 pps
= crush_hash32_2(CRUSH_HASH_RJENKINS1
,
1152 ceph_stable_mod(pgid
.seed
, pool
->pgp_num
,
1153 pool
->pgp_num_mask
),
1157 * legacy behavior: add ps and pool together. this is
1158 * not a great approach because the PGs from each pool
1159 * will overlap on top of each other: 0.5 == 1.4 ==
1162 pps
= ceph_stable_mod(pgid
.seed
, pool
->pgp_num
,
1163 pool
->pgp_num_mask
) +
1164 (unsigned)pgid
.pool
;
1166 r
= crush_do_rule(osdmap
->crush
, ruleno
, pps
, osds
,
1167 min_t(int, pool
->size
, *num
),
1168 osdmap
->osd_weight
);
1170 pr_err("error %d from crush rule: pool %lld ruleset %d type %d"
1171 " size %d\n", r
, pgid
.pool
, pool
->crush_ruleset
,
1172 pool
->type
, pool
->size
);
1180 * Return acting set for given pgid.
1182 int ceph_calc_pg_acting(struct ceph_osdmap
*osdmap
, struct ceph_pg pgid
,
1185 int rawosds
[CEPH_PG_MAX_SIZE
], *osds
;
1186 int i
, o
, num
= CEPH_PG_MAX_SIZE
;
1188 osds
= calc_pg_raw(osdmap
, pgid
, rawosds
, &num
);
1192 /* primary is first up osd */
1194 for (i
= 0; i
< num
; i
++)
1195 if (ceph_osd_is_up(osdmap
, osds
[i
]))
1196 acting
[o
++] = osds
[i
];
1201 * Return primary osd for given pgid, or -1 if none.
1203 int ceph_calc_pg_primary(struct ceph_osdmap
*osdmap
, struct ceph_pg pgid
)
1205 int rawosds
[CEPH_PG_MAX_SIZE
], *osds
;
1206 int i
, num
= CEPH_PG_MAX_SIZE
;
1208 osds
= calc_pg_raw(osdmap
, pgid
, rawosds
, &num
);
1212 /* primary is first up osd */
1213 for (i
= 0; i
< num
; i
++)
1214 if (ceph_osd_is_up(osdmap
, osds
[i
]))
1218 EXPORT_SYMBOL(ceph_calc_pg_primary
);