#include <linux/ceph/ceph_debug.h>

#include <linux/module.h>
#include <linux/slab.h>
#include <asm/div64.h>

#include <linux/ceph/libceph.h>
#include <linux/ceph/osdmap.h>
#include <linux/ceph/decode.h>
#include <linux/crush/hash.h>
#include <linux/crush/mapper.h>
char *ceph_osdmap_state_str(char *str, int len, int state)
{
	if (!len)
		return str;

	if ((state & CEPH_OSD_EXISTS) && (state & CEPH_OSD_UP))
		snprintf(str, len, "exists, up");
	else if (state & CEPH_OSD_EXISTS)
		snprintf(str, len, "exists");
	else if (state & CEPH_OSD_UP)
		snprintf(str, len, "up");
	else
		snprintf(str, len, "doesn't exist");

	return str;
}
/* count the number of significant bits in t, e.g. calc_bits_of(11) == 4 */
static int calc_bits_of(unsigned int t)
{
	int b = 0;

	while (t) {
		t = t >> 1;
		b++;
	}
	return b;
}
/*
 * the foo_mask is the smallest value 2^n-1 that is >= foo.
 */
static void calc_pg_masks(struct ceph_pg_pool_info *pi)
{
	pi->pg_num_mask = (1 << calc_bits_of(pi->pg_num-1)) - 1;
	pi->pgp_num_mask = (1 << calc_bits_of(pi->pgp_num-1)) - 1;
}
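
/*
 * Worked example (illustrative numbers): for a pool with pg_num = 12,
 * calc_bits_of(12 - 1) == 4, so pg_num_mask = (1 << 4) - 1 = 15 --
 * the smallest 2^n-1 that is >= 12.  ceph_stable_mod() later uses
 * these masks to fold a raw object hash onto the pg range.
 */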
/*
 * decode crush map
 */
static int crush_decode_uniform_bucket(void **p, void *end,
				       struct crush_bucket_uniform *b)
{
	dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
	ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
	b->item_weight = ceph_decode_32(p);
	return 0;
bad:
	return -EINVAL;
}
static int crush_decode_list_bucket(void **p, void *end,
				    struct crush_bucket_list *b)
{
	int j;

	dout("crush_decode_list_bucket %p to %p\n", *p, end);
	b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
	if (b->item_weights == NULL)
		return -ENOMEM;
	b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
	if (b->sum_weights == NULL)
		return -ENOMEM;
	ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
	for (j = 0; j < b->h.size; j++) {
		b->item_weights[j] = ceph_decode_32(p);
		b->sum_weights[j] = ceph_decode_32(p);
	}
	return 0;
bad:
	return -EINVAL;
}
static int crush_decode_tree_bucket(void **p, void *end,
				    struct crush_bucket_tree *b)
{
	int j;

	dout("crush_decode_tree_bucket %p to %p\n", *p, end);
	ceph_decode_32_safe(p, end, b->num_nodes, bad);
	b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
	if (b->node_weights == NULL)
		return -ENOMEM;
	ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
	for (j = 0; j < b->num_nodes; j++)
		b->node_weights[j] = ceph_decode_32(p);
	return 0;
bad:
	return -EINVAL;
}
static int crush_decode_straw_bucket(void **p, void *end,
				     struct crush_bucket_straw *b)
{
	int j;

	dout("crush_decode_straw_bucket %p to %p\n", *p, end);
	b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
	if (b->item_weights == NULL)
		return -ENOMEM;
	b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
	if (b->straws == NULL)
		return -ENOMEM;
	ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
	for (j = 0; j < b->h.size; j++) {
		b->item_weights[j] = ceph_decode_32(p);
		b->straws[j] = ceph_decode_32(p);
	}
	return 0;
bad:
	return -EINVAL;
}
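
/*
 * Note: all four bucket decoders above share one shape.  The common
 * struct crush_bucket header (id, type, alg, hash, weight, size) is
 * decoded by crush_decode() itself; each helper then reads only the
 * alg-specific arrays that follow it on the wire.
 */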
static int skip_name_map(void **p, void *end)
{
	int len;

	ceph_decode_32_safe(p, end, len, bad);
	while (len--) {
		int strlen;

		*p += sizeof(u32);	/* key */
		ceph_decode_32_safe(p, end, strlen, bad);
		*p += strlen;
	}
	return 0;
bad:
	return -EINVAL;
}
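
/*
 * Each skipped name map is a u32 entry count followed by (u32 key,
 * length-prefixed string) pairs; the kernel client never uses the
 * names, so skip_name_map() only advances the cursor.
 */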
static struct crush_map *crush_decode(void *pbyval, void *end)
{
	struct crush_map *c;
	int err = -EINVAL;
	int i, j;
	void **p = &pbyval;
	void *start = pbyval;
	u32 magic;
	u32 num_name_maps;

	dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));

	c = kzalloc(sizeof(*c), GFP_NOFS);
	if (c == NULL)
		return ERR_PTR(-ENOMEM);

	/* set tunables to default values */
	c->choose_local_tries = 2;
	c->choose_local_fallback_tries = 5;
	c->choose_total_tries = 19;
	c->chooseleaf_descend_once = 0;

	ceph_decode_need(p, end, 4*sizeof(u32), bad);
	magic = ceph_decode_32(p);
	if (magic != CRUSH_MAGIC) {
		pr_err("crush_decode magic %x != current %x\n",
		       (unsigned int)magic, (unsigned int)CRUSH_MAGIC);
		goto bad;
	}
	c->max_buckets = ceph_decode_32(p);
	c->max_rules = ceph_decode_32(p);
	c->max_devices = ceph_decode_32(p);

	c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
	if (c->buckets == NULL)
		goto badmem;
	c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
	if (c->rules == NULL)
		goto badmem;

	/* buckets */
	for (i = 0; i < c->max_buckets; i++) {
		int size = 0;
		u32 alg;
		struct crush_bucket *b;

		ceph_decode_32_safe(p, end, alg, bad);
		if (alg == 0) {
			c->buckets[i] = NULL;
			continue;
		}
		dout("crush_decode bucket %d off %x %p to %p\n",
		     i, (int)(*p-start), *p, end);

		switch (alg) {
		case CRUSH_BUCKET_UNIFORM:
			size = sizeof(struct crush_bucket_uniform);
			break;
		case CRUSH_BUCKET_LIST:
			size = sizeof(struct crush_bucket_list);
			break;
		case CRUSH_BUCKET_TREE:
			size = sizeof(struct crush_bucket_tree);
			break;
		case CRUSH_BUCKET_STRAW:
			size = sizeof(struct crush_bucket_straw);
			break;
		default:
			err = -EINVAL;
			goto bad;
		}
		BUG_ON(size == 0);
		b = c->buckets[i] = kzalloc(size, GFP_NOFS);
		if (b == NULL)
			goto badmem;

		ceph_decode_need(p, end, 4*sizeof(u32), bad);
		b->id = ceph_decode_32(p);
		b->type = ceph_decode_16(p);
		b->alg = ceph_decode_8(p);
		b->hash = ceph_decode_8(p);
		b->weight = ceph_decode_32(p);
		b->size = ceph_decode_32(p);

		dout("crush_decode bucket size %d off %x %p to %p\n",
		     b->size, (int)(*p-start), *p, end);

		b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
		if (b->items == NULL)
			goto badmem;
		b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
		if (b->perm == NULL)
			goto badmem;
		b->perm_n = 0;

		ceph_decode_need(p, end, b->size*sizeof(u32), bad);
		for (j = 0; j < b->size; j++)
			b->items[j] = ceph_decode_32(p);

		switch (b->alg) {
		case CRUSH_BUCKET_UNIFORM:
			err = crush_decode_uniform_bucket(p, end,
				  (struct crush_bucket_uniform *)b);
			if (err < 0)
				goto bad;
			break;
		case CRUSH_BUCKET_LIST:
			err = crush_decode_list_bucket(p, end,
			       (struct crush_bucket_list *)b);
			if (err < 0)
				goto bad;
			break;
		case CRUSH_BUCKET_TREE:
			err = crush_decode_tree_bucket(p, end,
				(struct crush_bucket_tree *)b);
			if (err < 0)
				goto bad;
			break;
		case CRUSH_BUCKET_STRAW:
			err = crush_decode_straw_bucket(p, end,
				 (struct crush_bucket_straw *)b);
			if (err < 0)
				goto bad;
			break;
		}
	}

	/* rules */
	dout("rule vec is %p\n", c->rules);
	for (i = 0; i < c->max_rules; i++) {
		u32 yes;
		struct crush_rule *r;

		ceph_decode_32_safe(p, end, yes, bad);
		if (!yes) {
			dout("crush_decode NO rule %d off %x %p to %p\n",
			     i, (int)(*p-start), *p, end);
			c->rules[i] = NULL;
			continue;
		}

		dout("crush_decode rule %d off %x %p to %p\n",
		     i, (int)(*p-start), *p, end);

		/* len */
		ceph_decode_32_safe(p, end, yes, bad);
#if BITS_PER_LONG == 32
		err = -EINVAL;
		if (yes > (ULONG_MAX - sizeof(*r))
			  / sizeof(struct crush_rule_step))
			goto bad;
#endif
		r = c->rules[i] = kmalloc(sizeof(*r) +
					  yes*sizeof(struct crush_rule_step),
					  GFP_NOFS);
		if (r == NULL)
			goto badmem;
		dout(" rule %d is at %p\n", i, r);
		r->len = yes;
		ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
		ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
		for (j = 0; j < r->len; j++) {
			r->steps[j].op = ceph_decode_32(p);
			r->steps[j].arg1 = ceph_decode_32(p);
			r->steps[j].arg2 = ceph_decode_32(p);
		}
	}

	/* ignore trailing name maps. */
	for (num_name_maps = 0; num_name_maps < 3; num_name_maps++) {
		err = skip_name_map(p, end);
		if (err < 0)
			goto done;
	}

	/* tunables */
	ceph_decode_need(p, end, 3*sizeof(u32), done);
	c->choose_local_tries = ceph_decode_32(p);
	c->choose_local_fallback_tries = ceph_decode_32(p);
	c->choose_total_tries = ceph_decode_32(p);
	dout("crush decode tunable choose_local_tries = %d\n",
	     c->choose_local_tries);
	dout("crush decode tunable choose_local_fallback_tries = %d\n",
	     c->choose_local_fallback_tries);
	dout("crush decode tunable choose_total_tries = %d\n",
	     c->choose_total_tries);

	ceph_decode_need(p, end, sizeof(u32), done);
	c->chooseleaf_descend_once = ceph_decode_32(p);
	dout("crush decode tunable chooseleaf_descend_once = %d\n",
	     c->chooseleaf_descend_once);

done:
	dout("crush_decode success\n");
	return c;

badmem:
	err = -ENOMEM;
bad:
	dout("crush_decode fail %d\n", err);
	crush_destroy(c);
	return ERR_PTR(err);
}
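
/*
 * Wire layout consumed above: magic, max_buckets, max_rules,
 * max_devices, the bucket array, the rule array, three name maps
 * (skipped), then the optional tunables.  Maps from servers that
 * predate the tunables simply end early, which is why the tunable
 * reads bail out to "done" rather than "bad".
 */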
/*
 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
 * to a set of osds)
 */
static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
{
	if (l.pool < r.pool)
		return -1;
	if (l.pool > r.pool)
		return 1;
	if (l.seed < r.seed)
		return -1;
	if (l.seed > r.seed)
		return 1;
	return 0;
}
static int __insert_pg_mapping(struct ceph_pg_mapping *new,
			       struct rb_root *root)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct ceph_pg_mapping *pg = NULL;
	int c;

	dout("__insert_pg_mapping %llx %p\n", *(u64 *)&new->pgid, new);
	while (*p) {
		parent = *p;
		pg = rb_entry(parent, struct ceph_pg_mapping, node);
		c = pgid_cmp(new->pgid, pg->pgid);
		if (c < 0)
			p = &(*p)->rb_left;
		else if (c > 0)
			p = &(*p)->rb_right;
		else
			return -EEXIST;
	}

	rb_link_node(&new->node, parent, p);
	rb_insert_color(&new->node, root);
	return 0;
}
static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
						   struct ceph_pg pgid)
{
	struct rb_node *n = root->rb_node;
	struct ceph_pg_mapping *pg;
	int c;

	while (n) {
		pg = rb_entry(n, struct ceph_pg_mapping, node);
		c = pgid_cmp(pgid, pg->pgid);
		if (c < 0) {
			n = n->rb_left;
		} else if (c > 0) {
			n = n->rb_right;
		} else {
			dout("__lookup_pg_mapping %lld.%x got %p\n",
			     pgid.pool, pgid.seed, pg);
			return pg;
		}
	}
	return NULL;
}
static int __remove_pg_mapping(struct rb_root *root, struct ceph_pg pgid)
{
	struct ceph_pg_mapping *pg = __lookup_pg_mapping(root, pgid);

	if (pg) {
		dout("__remove_pg_mapping %lld.%x %p\n", pgid.pool, pgid.seed,
		     pg);
		rb_erase(&pg->node, root);
		kfree(pg);
		return 0;
	}
	dout("__remove_pg_mapping %lld.%x dne\n", pgid.pool, pgid.seed);
	return -ENOENT;
}
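
/*
 * The pg_temp helpers above are the stock kernel rbtree idiom:
 * rb_link_node() + rb_insert_color() on insert, a manual descent on
 * lookup, with pgid_cmp() ordering nodes by (pool, seed).  The pool
 * tree below repeats the same pattern keyed by the u64 pool id.
 */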
/*
 * rbtree of pg pool info
 */
static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct ceph_pg_pool_info *pi = NULL;

	while (*p) {
		parent = *p;
		pi = rb_entry(parent, struct ceph_pg_pool_info, node);
		if (new->id < pi->id)
			p = &(*p)->rb_left;
		else if (new->id > pi->id)
			p = &(*p)->rb_right;
		else
			return -EEXIST;
	}

	rb_link_node(&new->node, parent, p);
	rb_insert_color(&new->node, root);
	return 0;
}
static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, u64 id)
{
	struct ceph_pg_pool_info *pi;
	struct rb_node *n = root->rb_node;

	while (n) {
		pi = rb_entry(n, struct ceph_pg_pool_info, node);
		if (id < pi->id)
			n = n->rb_left;
		else if (id > pi->id)
			n = n->rb_right;
		else
			return pi;
	}
	return NULL;
}
const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id)
{
	struct ceph_pg_pool_info *pi;

	if (id == CEPH_NOPOOL)
		return NULL;

	if (WARN_ON_ONCE(id > (u64) INT_MAX))
		return NULL;

	pi = __lookup_pg_pool(&map->pg_pools, (int) id);

	return pi ? pi->name : NULL;
}
EXPORT_SYMBOL(ceph_pg_pool_name_by_id);
int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)
{
	struct rb_node *rbp;

	for (rbp = rb_first(&map->pg_pools); rbp; rbp = rb_next(rbp)) {
		struct ceph_pg_pool_info *pi =
			rb_entry(rbp, struct ceph_pg_pool_info, node);
		if (pi->name && strcmp(pi->name, name) == 0)
			return pi->id;
	}
	return -ENOENT;
}
EXPORT_SYMBOL(ceph_pg_poolid_by_name);
static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
{
	rb_erase(&pi->node, root);
	kfree(pi->name);
	kfree(pi);
}
static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
{
	u8 ev, cv;
	unsigned len, num;
	void *pool_end;

	ceph_decode_need(p, end, 2 + 4, bad);
	ev = ceph_decode_8(p);  /* encoding version */
	cv = ceph_decode_8(p);  /* compat version */
	if (ev < 5) {
		pr_warning("got v %d < 5 cv %d of ceph_pg_pool\n", ev, cv);
		return -EINVAL;
	}
	if (cv > 7) {
		pr_warning("got v %d cv %d > 7 of ceph_pg_pool\n", ev, cv);
		return -EINVAL;
	}
	len = ceph_decode_32(p);
	ceph_decode_need(p, end, len, bad);
	pool_end = *p + len;

	pi->type = ceph_decode_8(p);
	pi->size = ceph_decode_8(p);
	pi->crush_ruleset = ceph_decode_8(p);
	pi->object_hash = ceph_decode_8(p);

	pi->pg_num = ceph_decode_32(p);
	pi->pgp_num = ceph_decode_32(p);

	*p += 4 + 4;  /* skip lpg* */
	*p += 4;      /* skip last_change */
	*p += 8 + 4;  /* skip snap_seq, snap_epoch */

	/* skip snaps */
	num = ceph_decode_32(p);
	while (num--) {
		*p += 8;      /* snapid key */
		*p += 1 + 1;  /* versions */
		len = ceph_decode_32(p);
		*p += len;
	}

	/* skip removed snaps */
	num = ceph_decode_32(p);
	*p += num * (8 + 8);

	*p += 8;  /* skip auid */
	pi->flags = ceph_decode_64(p);

	/* ignore the rest */

	*p = pool_end;
	calc_pg_masks(pi);
	return 0;

bad:
	return -EINVAL;
}
static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
{
	struct ceph_pg_pool_info *pi;
	u32 num, len;
	u64 pool;

	ceph_decode_32_safe(p, end, num, bad);
	dout(" %d pool names\n", num);
	while (num--) {
		ceph_decode_64_safe(p, end, pool, bad);
		ceph_decode_32_safe(p, end, len, bad);
		dout(" pool %llu len %d\n", pool, len);
		ceph_decode_need(p, end, len, bad);
		pi = __lookup_pg_pool(&map->pg_pools, pool);
		if (pi) {
			char *name = kstrndup(*p, len, GFP_NOFS);

			if (!name)
				return -ENOMEM;
			kfree(pi->name);
			pi->name = name;
			dout(" name is %s\n", pi->name);
		}
		*p += len;
	}
	return 0;

bad:
	return -EINVAL;
}
/*
 * osd map
 */
void ceph_osdmap_destroy(struct ceph_osdmap *map)
{
	dout("osdmap_destroy %p\n", map);
	if (map->crush)
		crush_destroy(map->crush);
	while (!RB_EMPTY_ROOT(&map->pg_temp)) {
		struct ceph_pg_mapping *pg =
			rb_entry(rb_first(&map->pg_temp),
				 struct ceph_pg_mapping, node);
		rb_erase(&pg->node, &map->pg_temp);
		kfree(pg);
	}
	while (!RB_EMPTY_ROOT(&map->pg_pools)) {
		struct ceph_pg_pool_info *pi =
			rb_entry(rb_first(&map->pg_pools),
				 struct ceph_pg_pool_info, node);
		__remove_pg_pool(&map->pg_pools, pi);
	}
	kfree(map->osd_state);
	kfree(map->osd_weight);
	kfree(map->osd_addr);
	kfree(map);
}
/*
 * adjust max osd value.  reallocate arrays.
 */
static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
{
	u8 *state;
	struct ceph_entity_addr *addr;
	u32 *weight;

	state = kcalloc(max, sizeof(*state), GFP_NOFS);
	addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
	weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
	if (state == NULL || addr == NULL || weight == NULL) {
		kfree(state);
		kfree(addr);
		kfree(weight);
		return -ENOMEM;
	}

	/* copy old? */
	if (map->osd_state) {
		memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
		memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
		memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
		kfree(map->osd_state);
		kfree(map->osd_addr);
		kfree(map->osd_weight);
	}

	map->osd_state = state;
	map->osd_weight = weight;
	map->osd_addr = addr;
	map->max_osd = max;
	return 0;
}
static int __decode_pgid(void **p, void *end, struct ceph_pg *pg)
{
	u8 v;

	ceph_decode_need(p, end, 1+8+4+4, bad);
	v = ceph_decode_8(p);
	if (v != 1)
		goto bad;
	pg->pool = ceph_decode_64(p);
	pg->seed = ceph_decode_32(p);
	*p += 4;	/* skip preferred */
	return 0;

bad:
	dout("error decoding pgid\n");
	return -EINVAL;
}
/*
 * decode a full map.
 */
struct ceph_osdmap *osdmap_decode(void **p, void *end)
{
	struct ceph_osdmap *map;
	u16 version;
	u32 len, max, i;
	int err = -EINVAL;
	void *start = *p;
	struct ceph_pg_pool_info *pi;

	dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));

	map = kzalloc(sizeof(*map), GFP_NOFS);
	if (map == NULL)
		return ERR_PTR(-ENOMEM);
	map->pg_temp = RB_ROOT;

	ceph_decode_16_safe(p, end, version, bad);
	if (version > 6) {
		pr_warning("got unknown v %d > 6 of osdmap\n", version);
		goto bad;
	}
	if (version < 6) {
		pr_warning("got old v %d < 6 of osdmap\n", version);
		goto bad;
	}

	ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
	ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
	map->epoch = ceph_decode_32(p);
	ceph_decode_copy(p, &map->created, sizeof(map->created));
	ceph_decode_copy(p, &map->modified, sizeof(map->modified));

	/* pools */
	ceph_decode_32_safe(p, end, max, bad);
	while (max--) {
		ceph_decode_need(p, end, 8 + 2, bad);
		err = -ENOMEM;
		pi = kzalloc(sizeof(*pi), GFP_NOFS);
		if (!pi)
			goto bad;
		pi->id = ceph_decode_64(p);
		err = __decode_pool(p, end, pi);
		if (err < 0) {
			kfree(pi);
			goto bad;
		}
		__insert_pg_pool(&map->pg_pools, pi);
	}

	err = __decode_pool_names(p, end, map);
	if (err < 0) {
		dout("fail to decode pool names\n");
		goto bad;
	}

	ceph_decode_32_safe(p, end, map->pool_max, bad);

	ceph_decode_32_safe(p, end, map->flags, bad);

	max = ceph_decode_32(p);

	/* (re)alloc osd arrays */
	err = osdmap_set_max_osd(map, max);
	if (err < 0)
		goto bad;
	dout("osdmap_decode max_osd = %d\n", map->max_osd);

	/* osds */
	err = -EINVAL;
	ceph_decode_need(p, end, 3*sizeof(u32) +
			 map->max_osd*(1 + sizeof(*map->osd_weight) +
				       sizeof(*map->osd_addr)), bad);
	*p += 4; /* skip length field (should match max) */
	ceph_decode_copy(p, map->osd_state, map->max_osd);

	*p += 4; /* skip length field (should match max) */
	for (i = 0; i < map->max_osd; i++)
		map->osd_weight[i] = ceph_decode_32(p);

	*p += 4; /* skip length field (should match max) */
	ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
	for (i = 0; i < map->max_osd; i++)
		ceph_decode_addr(&map->osd_addr[i]);

	/* pg_temp */
	ceph_decode_32_safe(p, end, len, bad);
	for (i = 0; i < len; i++) {
		int n, j;
		struct ceph_pg pgid;
		struct ceph_pg_mapping *pg;

		err = __decode_pgid(p, end, &pgid);
		if (err)
			goto bad;
		ceph_decode_need(p, end, sizeof(u32), bad);
		n = ceph_decode_32(p);
		err = -EINVAL;
		if (n > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
			goto bad;
		ceph_decode_need(p, end, n * sizeof(u32), bad);
		err = -ENOMEM;
		pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
		if (!pg)
			goto bad;
		pg->pgid = pgid;
		pg->len = n;
		for (j = 0; j < n; j++)
			pg->osds[j] = ceph_decode_32(p);

		err = __insert_pg_mapping(pg, &map->pg_temp);
		if (err)
			goto bad;
		dout(" added pg_temp %lld.%x len %d\n", pgid.pool, pgid.seed,
		     n);
	}

	/* crush */
	ceph_decode_32_safe(p, end, len, bad);
	dout("osdmap_decode crush len %d from off 0x%x\n", len,
	     (int)(*p - start));
	ceph_decode_need(p, end, len, bad);
	map->crush = crush_decode(*p, end);
	*p += len;
	if (IS_ERR(map->crush)) {
		err = PTR_ERR(map->crush);
		map->crush = NULL;
		goto bad;
	}

	/* ignore the rest of the map */
	*p = end;

	dout("osdmap_decode done %p %p\n", *p, end);
	return map;

bad:
	dout("osdmap_decode fail err %d\n", err);
	ceph_osdmap_destroy(map);
	return ERR_PTR(err);
}
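
/*
 * Usage sketch (hypothetical caller; in-tree, the osd client's map
 * handler drives this from an incoming map message):
 *
 *	void *cur = msg->front.iov_base;
 *	void *end = cur + msg->front.iov_len;
 *	struct ceph_osdmap *newmap = osdmap_decode(&cur, end);
 *
 *	if (IS_ERR(newmap))
 *		err = PTR_ERR(newmap);
 */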
/*
 * decode and apply an incremental map update.
 */
struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
					     struct ceph_osdmap *map,
					     struct ceph_messenger *msgr)
{
	struct crush_map *newcrush = NULL;
	struct ceph_fsid fsid;
	u32 epoch = 0;
	struct ceph_timespec modified;
	s32 len;
	u64 pool;
	__s64 new_pool_max;
	__s32 new_flags, max;
	void *start = *p;
	int err = -EINVAL;
	u16 version;

	ceph_decode_16_safe(p, end, version, bad);
	if (version != 6) {
		pr_warning("got unknown v %d != 6 of inc osdmap\n", version);
		goto bad;
	}

	ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
			 bad);
	ceph_decode_copy(p, &fsid, sizeof(fsid));
	epoch = ceph_decode_32(p);
	BUG_ON(epoch != map->epoch+1);
	ceph_decode_copy(p, &modified, sizeof(modified));
	new_pool_max = ceph_decode_64(p);
	new_flags = ceph_decode_32(p);

	/* full map? */
	ceph_decode_32_safe(p, end, len, bad);
	if (len > 0) {
		dout("apply_incremental full map len %d, %p to %p\n",
		     len, *p, end);
		return osdmap_decode(p, min(*p+len, end));
	}

	/* new crush? */
	ceph_decode_32_safe(p, end, len, bad);
	if (len > 0) {
		dout("apply_incremental new crush map len %d, %p to %p\n",
		     len, *p, end);
		newcrush = crush_decode(*p, min(*p+len, end));
		if (IS_ERR(newcrush))
			return ERR_CAST(newcrush);
		*p += len;
	}

	/* new flags? */
	if (new_flags >= 0)
		map->flags = new_flags;
	if (new_pool_max >= 0)
		map->pool_max = new_pool_max;

	ceph_decode_need(p, end, 5*sizeof(u32), bad);

	/* new max? */
	max = ceph_decode_32(p);
	if (max >= 0) {
		err = osdmap_set_max_osd(map, max);
		if (err < 0)
			goto bad;
	}

	map->epoch++;
	map->modified = modified;
	if (newcrush) {
		if (map->crush)
			crush_destroy(map->crush);
		map->crush = newcrush;
		newcrush = NULL;
	}

	/* new_pool */
	ceph_decode_32_safe(p, end, len, bad);
	while (len--) {
		struct ceph_pg_pool_info *pi;

		ceph_decode_64_safe(p, end, pool, bad);
		pi = __lookup_pg_pool(&map->pg_pools, pool);
		if (!pi) {
			pi = kzalloc(sizeof(*pi), GFP_NOFS);
			if (!pi) {
				err = -ENOMEM;
				goto bad;
			}
			pi->id = pool;
			__insert_pg_pool(&map->pg_pools, pi);
		}
		err = __decode_pool(p, end, pi);
		if (err < 0)
			goto bad;
	}

	/* new_pool_names */
	err = __decode_pool_names(p, end, map);
	if (err < 0)
		goto bad;

	/* old_pool */
	ceph_decode_32_safe(p, end, len, bad);
	while (len--) {
		struct ceph_pg_pool_info *pi;

		ceph_decode_64_safe(p, end, pool, bad);
		pi = __lookup_pg_pool(&map->pg_pools, pool);
		if (pi)
			__remove_pg_pool(&map->pg_pools, pi);
	}

	/* new_up */
	err = -EINVAL;
	ceph_decode_32_safe(p, end, len, bad);
	while (len--) {
		u32 osd;
		struct ceph_entity_addr addr;
		ceph_decode_32_safe(p, end, osd, bad);
		ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
		ceph_decode_addr(&addr);
		pr_info("osd%d up\n", osd);
		BUG_ON(osd >= map->max_osd);
		map->osd_state[osd] |= CEPH_OSD_UP;
		map->osd_addr[osd] = addr;
	}

	/* new_state */
	ceph_decode_32_safe(p, end, len, bad);
	while (len--) {
		u32 osd;
		u8 xorstate;
		ceph_decode_32_safe(p, end, osd, bad);
		xorstate = **(u8 **)p;
		(*p)++;  /* clean flag */
		if (xorstate == 0)
			xorstate = CEPH_OSD_UP;
		if (xorstate & CEPH_OSD_UP)
			pr_info("osd%d down\n", osd);
		if (osd < map->max_osd)
			map->osd_state[osd] ^= xorstate;
	}

	/* new_weight */
	ceph_decode_32_safe(p, end, len, bad);
	while (len--) {
		u32 osd, off;
		ceph_decode_need(p, end, sizeof(u32)*2, bad);
		osd = ceph_decode_32(p);
		off = ceph_decode_32(p);
		pr_info("osd%d weight 0x%x %s\n", osd, off,
		     off == CEPH_OSD_IN ? "(in)" :
		     (off == CEPH_OSD_OUT ? "(out)" : ""));
		if (osd < map->max_osd)
			map->osd_weight[osd] = off;
	}

	/* new_pg_temp */
	ceph_decode_32_safe(p, end, len, bad);
	while (len--) {
		struct ceph_pg_mapping *pg;
		int j;
		struct ceph_pg pgid;
		u32 pglen;

		err = __decode_pgid(p, end, &pgid);
		if (err)
			goto bad;
		ceph_decode_need(p, end, sizeof(u32), bad);
		pglen = ceph_decode_32(p);
		if (pglen) {
			ceph_decode_need(p, end, pglen*sizeof(u32), bad);

			/* removing existing (if any) */
			(void) __remove_pg_mapping(&map->pg_temp, pgid);

			/* insert */
			err = -EINVAL;
			if (pglen > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
				goto bad;
			err = -ENOMEM;
			pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
			if (!pg)
				goto bad;
			pg->pgid = pgid;
			pg->len = pglen;
			for (j = 0; j < pglen; j++)
				pg->osds[j] = ceph_decode_32(p);
			err = __insert_pg_mapping(pg, &map->pg_temp);
			if (err) {
				kfree(pg);
				goto bad;
			}
			dout(" added pg_temp %lld.%x len %d\n", pgid.pool,
			     pgid.seed, pglen);
		} else {
			/* remove */
			__remove_pg_mapping(&map->pg_temp, pgid);
		}
	}

	/* ignore the rest */
	*p = end;
	return map;

bad:
	pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
	       epoch, (int)(*p - start), *p, start, end);
	print_hex_dump(KERN_DEBUG, "osdmap: ",
		       DUMP_PREFIX_OFFSET, 16, 1,
		       start, end - start, true);
	if (newcrush)
		crush_destroy(newcrush);
	return ERR_PTR(err);
}
/*
 * calculate file layout from given offset, length.
 * fill in correct oid, logical length, and object extent
 * based on file layout.
 *
 * for now, we write only a single su, until we can
 * pass a stride back to the caller.
 */
int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
				  u64 off, u64 len,
				  u64 *ono,
				  u64 *oxoff, u64 *oxlen)
{
	u32 osize = le32_to_cpu(layout->fl_object_size);
	u32 su = le32_to_cpu(layout->fl_stripe_unit);
	u32 sc = le32_to_cpu(layout->fl_stripe_count);
	u32 bl, stripeno, stripepos, objsetno;
	u32 su_per_object;
	u64 t, su_offset;

	dout("mapping %llu~%llu osize %u fl_su %u\n", off, len,
	     osize, su);
	if (su == 0 || sc == 0)
		goto invalid;
	su_per_object = osize / su;
	if (su_per_object == 0)
		goto invalid;
	dout("osize %u / su %u = su_per_object %u\n", osize, su,
	     su_per_object);

	if ((su & ~PAGE_MASK) != 0)
		goto invalid;

	/* bl = *off / su; */
	t = off;
	do_div(t, su);
	bl = t;
	dout("off %llu / su %u = bl %u\n", off, su, bl);

	stripeno = bl / sc;
	stripepos = bl % sc;
	objsetno = stripeno / su_per_object;

	*ono = objsetno * sc + stripepos;
	dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned int)*ono);

	/* *oxoff = *off % layout->fl_stripe_unit;  # offset in su */
	t = off;
	su_offset = do_div(t, su);
	*oxoff = su_offset + (stripeno % su_per_object) * su;

	/*
	 * Calculate the length of the extent being written to the selected
	 * object. This is the minimum of the full length requested (len) or
	 * the remainder of the current stripe being written to.
	 */
	*oxlen = min_t(u64, len, su - su_offset);

	dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
	return 0;

invalid:
	dout(" invalid layout\n");
	*ono = 0;
	*oxoff = 0;
	*oxlen = 0;
	return -EINVAL;
}
EXPORT_SYMBOL(ceph_calc_file_object_mapping);
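
/*
 * Worked example (illustrative layout, not taken from real traffic):
 * su = 1 MB, sc = 3, osize = 4 MB gives su_per_object = 4.  For
 * off = 5 MB, len = 2 MB: bl = 5, stripeno = 1, stripepos = 2,
 * objsetno = 0, so *ono = 0 * 3 + 2 = 2.  su_offset = 0, so
 * *oxoff = (1 % 4) * 1 MB = 1 MB and *oxlen = min(2 MB, 1 MB) = 1 MB:
 * the extent is clipped to a single stripe unit, as promised above.
 */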
/*
 * calculate an object layout (i.e. pgid) from an oid,
 * file_layout, and osdmap
 */
int ceph_calc_object_layout(struct ceph_pg *pg,
			    const char *oid,
			    struct ceph_file_layout *fl,
			    struct ceph_osdmap *osdmap)
{
	unsigned int num, num_mask;
	struct ceph_pg_pool_info *pool;

	BUG_ON(!osdmap);
	pg->pool = le32_to_cpu(fl->fl_pg_pool);
	pool = __lookup_pg_pool(&osdmap->pg_pools, pg->pool);
	if (!pool)
		return -EIO;
	pg->seed = ceph_str_hash(pool->object_hash, oid, strlen(oid));
	num = pool->pg_num;
	num_mask = pool->pg_num_mask;

	dout("calc_object_layout '%s' pgid %lld.%x\n", oid, pg->pool, pg->seed);
	return 0;
}
EXPORT_SYMBOL(ceph_calc_object_layout);
/*
 * Calculate raw osd vector for the given pgid.  Return pointer to osd
 * array, or NULL on failure.
 */
static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
			int *osds, int *num)
{
	struct ceph_pg_mapping *pg;
	struct ceph_pg_pool_info *pool;
	int ruleno;
	int r;
	u32 pps;

	pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool);
	if (!pool)
		return NULL;

	/* pg_temp? */
	pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num,
				    pool->pg_num_mask);
	pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
	if (pg) {
		*num = pg->len;
		return pg->osds;
	}

	/* crush */
	ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset,
				 pool->type, pool->size);
	if (ruleno < 0) {
		pr_err("no crush rule pool %lld ruleset %d type %d size %d\n",
		       pgid.pool, pool->crush_ruleset, pool->type,
		       pool->size);
		return NULL;
	}

	if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
		/* hash pool id and seed so that pool PGs do not overlap */
		pps = crush_hash32_2(CRUSH_HASH_RJENKINS1,
				     ceph_stable_mod(pgid.seed, pool->pgp_num,
						     pool->pgp_num_mask),
				     pgid.pool);
	} else {
		/*
		 * legacy behavior: add ps and pool together.  this is
		 * not a great approach because the PGs from each pool
		 * will overlap on top of each other: 0.5 == 1.4 ==
		 * 2.3 == ...
		 */
		pps = ceph_stable_mod(pgid.seed, pool->pgp_num,
				      pool->pgp_num_mask) +
			(unsigned)pgid.pool;
	}
	r = crush_do_rule(osdmap->crush, ruleno, pps, osds,
			  min_t(int, pool->size, *num),
			  osdmap->osd_weight);
	if (r < 0) {
		pr_err("error %d from crush rule: pool %lld ruleset %d type %d"
		       " size %d\n", r, pgid.pool, pool->crush_ruleset,
		       pool->type, pool->size);
		return NULL;
	}
	*num = r;
	return osds;
}
/*
 * Return acting set for given pgid.
 */
int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
			int *acting)
{
	int rawosds[CEPH_PG_MAX_SIZE], *osds;
	int i, o, num = CEPH_PG_MAX_SIZE;

	osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
	if (!osds)
		return -1;

	/* primary is first up osd */
	o = 0;
	for (i = 0; i < num; i++)
		if (ceph_osd_is_up(osdmap, osds[i]))
			acting[o++] = osds[i];
	return o;
}
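
/*
 * Usage sketch (hypothetical caller):
 *
 *	int acting[CEPH_PG_MAX_SIZE];
 *	int num = ceph_calc_pg_acting(osdmap, pgid, acting);
 *
 * num < 1 means no usable osd; otherwise acting[0] is the first up
 * osd, i.e. the same osd that ceph_calc_pg_primary() below reports.
 */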
/*
 * Return primary osd for given pgid, or -1 if none.
 */
int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
{
	int rawosds[CEPH_PG_MAX_SIZE], *osds;
	int i, num = CEPH_PG_MAX_SIZE;

	osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
	if (!osds)
		return -1;

	/* primary is first up osd */
	for (i = 0; i < num; i++)
		if (ceph_osd_is_up(osdmap, osds[i]))
			return osds[i];
	return -1;
}
EXPORT_SYMBOL(ceph_calc_pg_primary);