/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright 2017 Nexenta Systems, Inc.
 */
33 #include <sys/zfs_context.h>
35 #include <sys/refcount.h>
36 #include <sys/zap_impl.h>
37 #include <sys/zap_leaf.h>
40 #include <sys/dmu_objset.h>
43 #include <sys/sunddi.h>
46 extern inline mzap_phys_t
*zap_m_phys(zap_t
*zap
);
48 static int mzap_upgrade(zap_t
**zapp
,
49 void *tag
, dmu_tx_t
*tx
, zap_flags_t flags
);
52 zap_getflags(zap_t
*zap
)
56 return (zap_f_phys(zap
)->zap_flags
);
60 zap_hashbits(zap_t
*zap
)
62 if (zap_getflags(zap
) & ZAP_FLAG_HASH64
)
71 if (zap_getflags(zap
) & ZAP_FLAG_HASH64
)
78 zap_hash(zap_name_t
*zn
)
80 zap_t
*zap
= zn
->zn_zap
;
83 if (zap_getflags(zap
) & ZAP_FLAG_PRE_HASHED_KEY
) {
84 ASSERT(zap_getflags(zap
) & ZAP_FLAG_UINT64_KEY
);
85 h
= *(uint64_t *)zn
->zn_key_orig
;
89 ASSERT(zfs_crc64_table
[128] == ZFS_CRC64_POLY
);
91 if (zap_getflags(zap
) & ZAP_FLAG_UINT64_KEY
) {
92 const uint64_t *wp
= zn
->zn_key_norm
;
94 ASSERT(zn
->zn_key_intlen
== 8);
95 for (int i
= 0; i
< zn
->zn_key_norm_numints
;
99 for (int j
= 0; j
< zn
->zn_key_intlen
; j
++) {
101 zfs_crc64_table
[(h
^ word
) & 0xFF];
106 const uint8_t *cp
= zn
->zn_key_norm
;
		/*
		 * We previously stored the terminating null on
		 * disk, but didn't hash it, so we need to
		 * continue to not hash it.  (The
		 * zn_key_*_numints includes the terminating
		 * null for non-binary keys.)
		 */
115 int len
= zn
->zn_key_norm_numints
- 1;
117 ASSERT(zn
->zn_key_intlen
== 1);
118 for (int i
= 0; i
< len
; cp
++, i
++) {
120 zfs_crc64_table
[(h
^ *cp
) & 0xFF];
	/*
	 * Don't use all 64 bits, since we need some in the cookie for
	 * the collision differentiator.  We MUST use the high bits,
	 * since those are the ones that we first pay attention to when
	 * choosing the bucket.
	 */
130 h
&= ~((1ULL << (64 - zap_hashbits(zap
))) - 1);
136 zap_normalize(zap_t
*zap
, const char *name
, char *namenorm
, int normflags
)
138 ASSERT(!(zap_getflags(zap
) & ZAP_FLAG_UINT64_KEY
));
140 size_t inlen
= strlen(name
) + 1;
141 size_t outlen
= ZAP_MAXNAMELEN
;
144 (void) u8_textprep_str((char *)name
, &inlen
, namenorm
, &outlen
,
145 normflags
| U8_TEXTPREP_IGNORE_NULL
| U8_TEXTPREP_IGNORE_INVALID
,
146 U8_UNICODE_LATEST
, &err
);
152 zap_match(zap_name_t
*zn
, const char *matchname
)
154 ASSERT(!(zap_getflags(zn
->zn_zap
) & ZAP_FLAG_UINT64_KEY
));
156 if (zn
->zn_matchtype
& MT_NORMALIZE
) {
157 char norm
[ZAP_MAXNAMELEN
];
159 if (zap_normalize(zn
->zn_zap
, matchname
, norm
,
160 zn
->zn_normflags
) != 0)
163 return (strcmp(zn
->zn_key_norm
, norm
) == 0);
165 return (strcmp(zn
->zn_key_orig
, matchname
) == 0);
170 zap_name_free(zap_name_t
*zn
)
172 kmem_free(zn
, sizeof (zap_name_t
));
176 zap_name_alloc(zap_t
*zap
, const char *key
, matchtype_t mt
)
178 zap_name_t
*zn
= kmem_alloc(sizeof (zap_name_t
), KM_SLEEP
);
181 zn
->zn_key_intlen
= sizeof (*key
);
182 zn
->zn_key_orig
= key
;
183 zn
->zn_key_orig_numints
= strlen(zn
->zn_key_orig
) + 1;
184 zn
->zn_matchtype
= mt
;
185 zn
->zn_normflags
= zap
->zap_normflags
;
188 * If we're dealing with a case sensitive lookup on a mixed or
189 * insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup
190 * will fold case to all caps overriding the lookup request.
192 if (mt
& MT_MATCH_CASE
)
193 zn
->zn_normflags
&= ~U8_TEXTPREP_TOUPPER
;
195 if (zap
->zap_normflags
) {
197 * We *must* use zap_normflags because this normalization is
198 * what the hash is computed from.
200 if (zap_normalize(zap
, key
, zn
->zn_normbuf
,
201 zap
->zap_normflags
) != 0) {
205 zn
->zn_key_norm
= zn
->zn_normbuf
;
206 zn
->zn_key_norm_numints
= strlen(zn
->zn_key_norm
) + 1;
212 zn
->zn_key_norm
= zn
->zn_key_orig
;
213 zn
->zn_key_norm_numints
= zn
->zn_key_orig_numints
;
216 zn
->zn_hash
= zap_hash(zn
);
218 if (zap
->zap_normflags
!= zn
->zn_normflags
) {
220 * We *must* use zn_normflags because this normalization is
221 * what the matching is based on. (Not the hash!)
223 if (zap_normalize(zap
, key
, zn
->zn_normbuf
,
224 zn
->zn_normflags
) != 0) {
228 zn
->zn_key_norm_numints
= strlen(zn
->zn_key_norm
) + 1;
235 zap_name_alloc_uint64(zap_t
*zap
, const uint64_t *key
, int numints
)
237 zap_name_t
*zn
= kmem_alloc(sizeof (zap_name_t
), KM_SLEEP
);
239 ASSERT(zap
->zap_normflags
== 0);
241 zn
->zn_key_intlen
= sizeof (*key
);
242 zn
->zn_key_orig
= zn
->zn_key_norm
= key
;
243 zn
->zn_key_orig_numints
= zn
->zn_key_norm_numints
= numints
;
244 zn
->zn_matchtype
= 0;
246 zn
->zn_hash
= zap_hash(zn
);
251 mzap_byteswap(mzap_phys_t
*buf
, size_t size
)
253 buf
->mz_block_type
= BSWAP_64(buf
->mz_block_type
);
254 buf
->mz_salt
= BSWAP_64(buf
->mz_salt
);
255 buf
->mz_normflags
= BSWAP_64(buf
->mz_normflags
);
256 int max
= (size
/ MZAP_ENT_LEN
) - 1;
257 for (int i
= 0; i
< max
; i
++) {
258 buf
->mz_chunk
[i
].mze_value
=
259 BSWAP_64(buf
->mz_chunk
[i
].mze_value
);
260 buf
->mz_chunk
[i
].mze_cd
=
261 BSWAP_32(buf
->mz_chunk
[i
].mze_cd
);
266 zap_byteswap(void *buf
, size_t size
)
268 uint64_t block_type
= *(uint64_t *)buf
;
270 if (block_type
== ZBT_MICRO
|| block_type
== BSWAP_64(ZBT_MICRO
)) {
271 /* ASSERT(magic == ZAP_LEAF_MAGIC); */
272 mzap_byteswap(buf
, size
);
274 fzap_byteswap(buf
, size
);
279 mze_compare(const void *arg1
, const void *arg2
)
281 const mzap_ent_t
*mze1
= arg1
;
282 const mzap_ent_t
*mze2
= arg2
;
284 if (mze1
->mze_hash
> mze2
->mze_hash
)
286 if (mze1
->mze_hash
< mze2
->mze_hash
)
288 if (mze1
->mze_cd
> mze2
->mze_cd
)
290 if (mze1
->mze_cd
< mze2
->mze_cd
)
296 mze_insert(zap_t
*zap
, int chunkid
, uint64_t hash
)
298 ASSERT(zap
->zap_ismicro
);
299 ASSERT(RW_WRITE_HELD(&zap
->zap_rwlock
));
301 mzap_ent_t
*mze
= kmem_alloc(sizeof (mzap_ent_t
), KM_SLEEP
);
302 mze
->mze_chunkid
= chunkid
;
303 mze
->mze_hash
= hash
;
304 mze
->mze_cd
= MZE_PHYS(zap
, mze
)->mze_cd
;
305 ASSERT(MZE_PHYS(zap
, mze
)->mze_name
[0] != 0);
306 avl_add(&zap
->zap_m
.zap_avl
, mze
);
310 mze_find(zap_name_t
*zn
)
312 mzap_ent_t mze_tofind
;
315 avl_tree_t
*avl
= &zn
->zn_zap
->zap_m
.zap_avl
;
317 ASSERT(zn
->zn_zap
->zap_ismicro
);
318 ASSERT(RW_LOCK_HELD(&zn
->zn_zap
->zap_rwlock
));
320 mze_tofind
.mze_hash
= zn
->zn_hash
;
321 mze_tofind
.mze_cd
= 0;
323 mze
= avl_find(avl
, &mze_tofind
, &idx
);
325 mze
= avl_nearest(avl
, idx
, AVL_AFTER
);
326 for (; mze
&& mze
->mze_hash
== zn
->zn_hash
; mze
= AVL_NEXT(avl
, mze
)) {
327 ASSERT3U(mze
->mze_cd
, ==, MZE_PHYS(zn
->zn_zap
, mze
)->mze_cd
);
328 if (zap_match(zn
, MZE_PHYS(zn
->zn_zap
, mze
)->mze_name
))
336 mze_find_unused_cd(zap_t
*zap
, uint64_t hash
)
338 mzap_ent_t mze_tofind
;
340 avl_tree_t
*avl
= &zap
->zap_m
.zap_avl
;
342 ASSERT(zap
->zap_ismicro
);
343 ASSERT(RW_LOCK_HELD(&zap
->zap_rwlock
));
345 mze_tofind
.mze_hash
= hash
;
346 mze_tofind
.mze_cd
= 0;
349 for (mzap_ent_t
*mze
= avl_find(avl
, &mze_tofind
, &idx
);
350 mze
&& mze
->mze_hash
== hash
; mze
= AVL_NEXT(avl
, mze
)) {
351 if (mze
->mze_cd
!= cd
)
360 mze_remove(zap_t
*zap
, mzap_ent_t
*mze
)
362 ASSERT(zap
->zap_ismicro
);
363 ASSERT(RW_WRITE_HELD(&zap
->zap_rwlock
));
365 avl_remove(&zap
->zap_m
.zap_avl
, mze
);
366 kmem_free(mze
, sizeof (mzap_ent_t
));
370 mze_destroy(zap_t
*zap
)
373 void *avlcookie
= NULL
;
375 while (mze
= avl_destroy_nodes(&zap
->zap_m
.zap_avl
, &avlcookie
))
376 kmem_free(mze
, sizeof (mzap_ent_t
));
377 avl_destroy(&zap
->zap_m
.zap_avl
);
381 mzap_open(objset_t
*os
, uint64_t obj
, dmu_buf_t
*db
)
384 uint64_t *zap_hdr
= (uint64_t *)db
->db_data
;
385 uint64_t zap_block_type
= zap_hdr
[0];
386 uint64_t zap_magic
= zap_hdr
[1];
388 ASSERT3U(MZAP_ENT_LEN
, ==, sizeof (mzap_ent_phys_t
));
390 zap_t
*zap
= kmem_zalloc(sizeof (zap_t
), KM_SLEEP
);
391 rw_init(&zap
->zap_rwlock
, 0, 0, 0);
392 rw_enter(&zap
->zap_rwlock
, RW_WRITER
);
393 zap
->zap_objset
= os
;
394 zap
->zap_object
= obj
;
397 if (zap_block_type
!= ZBT_MICRO
) {
398 mutex_init(&zap
->zap_f
.zap_num_entries_mtx
, 0, 0, 0);
399 zap
->zap_f
.zap_block_shift
= highbit64(db
->db_size
) - 1;
400 if (zap_block_type
!= ZBT_HEADER
|| zap_magic
!= ZAP_MAGIC
) {
401 winner
= NULL
; /* No actual winner here... */
405 zap
->zap_ismicro
= TRUE
;
409 * Make sure that zap_ismicro is set before we let others see
410 * it, because zap_lockdir() checks zap_ismicro without the lock
413 dmu_buf_init_user(&zap
->zap_dbu
, zap_evict_sync
, NULL
, &zap
->zap_dbuf
);
414 winner
= dmu_buf_set_user(db
, &zap
->zap_dbu
);
419 if (zap
->zap_ismicro
) {
420 zap
->zap_salt
= zap_m_phys(zap
)->mz_salt
;
421 zap
->zap_normflags
= zap_m_phys(zap
)->mz_normflags
;
422 zap
->zap_m
.zap_num_chunks
= db
->db_size
/ MZAP_ENT_LEN
- 1;
423 avl_create(&zap
->zap_m
.zap_avl
, mze_compare
,
424 sizeof (mzap_ent_t
), offsetof(mzap_ent_t
, mze_node
));
426 for (int i
= 0; i
< zap
->zap_m
.zap_num_chunks
; i
++) {
427 mzap_ent_phys_t
*mze
=
428 &zap_m_phys(zap
)->mz_chunk
[i
];
429 if (mze
->mze_name
[0]) {
432 zap
->zap_m
.zap_num_entries
++;
433 zn
= zap_name_alloc(zap
, mze
->mze_name
, 0);
434 mze_insert(zap
, i
, zn
->zn_hash
);
439 zap
->zap_salt
= zap_f_phys(zap
)->zap_salt
;
440 zap
->zap_normflags
= zap_f_phys(zap
)->zap_normflags
;
442 ASSERT3U(sizeof (struct zap_leaf_header
), ==,
443 2*ZAP_LEAF_CHUNKSIZE
);
446 * The embedded pointer table should not overlap the
449 ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap
, 0), >,
450 &zap_f_phys(zap
)->zap_salt
);
453 * The embedded pointer table should end at the end of
456 ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap
,
457 1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap
)) -
458 (uintptr_t)zap_f_phys(zap
), ==,
459 zap
->zap_dbuf
->db_size
);
461 rw_exit(&zap
->zap_rwlock
);
465 rw_exit(&zap
->zap_rwlock
);
466 rw_destroy(&zap
->zap_rwlock
);
467 if (!zap
->zap_ismicro
)
468 mutex_destroy(&zap
->zap_f
.zap_num_entries_mtx
);
469 kmem_free(zap
, sizeof (zap_t
));
/*
 * This routine "consumes" the caller's hold on the dbuf, which must
 * have the specified tag.
 */
478 zap_lockdir_impl(dmu_buf_t
*db
, void *tag
, dmu_tx_t
*tx
,
479 krw_t lti
, boolean_t fatreader
, boolean_t adding
, zap_t
**zapp
)
481 ASSERT0(db
->db_offset
);
482 objset_t
*os
= dmu_buf_get_objset(db
);
483 uint64_t obj
= db
->db_object
;
487 zap_t
*zap
= dmu_buf_get_user(db
);
489 zap
= mzap_open(os
, obj
, db
);
492 * mzap_open() didn't like what it saw on-disk.
493 * Check for corruption!
495 return (SET_ERROR(EIO
));
	/*
	 * We're checking zap_ismicro without the lock held, in order to
	 * tell what type of lock we want.  Once we have some sort of
	 * lock, see if it really is the right type.  In practice this
	 * can only be different if it was upgraded from micro to fat,
	 * and micro wanted WRITER but fat only needs READER.
	 */
506 krw_t lt
= (!zap
->zap_ismicro
&& fatreader
) ? RW_READER
: lti
;
507 rw_enter(&zap
->zap_rwlock
, lt
);
508 if (lt
!= ((!zap
->zap_ismicro
&& fatreader
) ? RW_READER
: lti
)) {
509 /* it was upgraded, now we only need reader */
510 ASSERT(lt
== RW_WRITER
);
512 (!zap
->zap_ismicro
&& fatreader
) ? RW_READER
: lti
);
513 rw_downgrade(&zap
->zap_rwlock
);
517 zap
->zap_objset
= os
;
520 dmu_buf_will_dirty(db
, tx
);
522 ASSERT3P(zap
->zap_dbuf
, ==, db
);
524 ASSERT(!zap
->zap_ismicro
||
525 zap
->zap_m
.zap_num_entries
<= zap
->zap_m
.zap_num_chunks
);
526 if (zap
->zap_ismicro
&& tx
&& adding
&&
527 zap
->zap_m
.zap_num_entries
== zap
->zap_m
.zap_num_chunks
) {
528 uint64_t newsz
= db
->db_size
+ SPA_MINBLOCKSIZE
;
529 if (newsz
> MZAP_MAX_BLKSZ
) {
530 dprintf("upgrading obj %llu: num_entries=%u\n",
531 obj
, zap
->zap_m
.zap_num_entries
);
533 int err
= mzap_upgrade(zapp
, tag
, tx
, 0);
535 rw_exit(&zap
->zap_rwlock
);
538 VERIFY0(dmu_object_set_blocksize(os
, obj
, newsz
, 0, tx
));
539 zap
->zap_m
.zap_num_chunks
=
540 db
->db_size
/ MZAP_ENT_LEN
- 1;
548 zap_lockdir_by_dnode(dnode_t
*dn
, dmu_tx_t
*tx
,
549 krw_t lti
, boolean_t fatreader
, boolean_t adding
, void *tag
, zap_t
**zapp
)
553 int err
= dmu_buf_hold_by_dnode(dn
, 0, tag
, &db
, DMU_READ_NO_PREFETCH
);
559 dmu_object_info_t doi
;
560 dmu_object_info_from_db(db
, &doi
);
561 ASSERT3U(DMU_OT_BYTESWAP(doi
.doi_type
), ==, DMU_BSWAP_ZAP
);
565 err
= zap_lockdir_impl(db
, tag
, tx
, lti
, fatreader
, adding
, zapp
);
567 dmu_buf_rele(db
, tag
);
573 zap_lockdir(objset_t
*os
, uint64_t obj
, dmu_tx_t
*tx
,
574 krw_t lti
, boolean_t fatreader
, boolean_t adding
, void *tag
, zap_t
**zapp
)
578 int err
= dmu_buf_hold(os
, obj
, 0, tag
, &db
, DMU_READ_NO_PREFETCH
);
583 dmu_object_info_t doi
;
584 dmu_object_info_from_db(db
, &doi
);
585 ASSERT3U(DMU_OT_BYTESWAP(doi
.doi_type
), ==, DMU_BSWAP_ZAP
);
588 err
= zap_lockdir_impl(db
, tag
, tx
, lti
, fatreader
, adding
, zapp
);
590 dmu_buf_rele(db
, tag
);
595 zap_unlockdir(zap_t
*zap
, void *tag
)
597 rw_exit(&zap
->zap_rwlock
);
598 dmu_buf_rele(zap
->zap_dbuf
, tag
);
602 mzap_upgrade(zap_t
**zapp
, void *tag
, dmu_tx_t
*tx
, zap_flags_t flags
)
607 ASSERT(RW_WRITE_HELD(&zap
->zap_rwlock
));
609 int sz
= zap
->zap_dbuf
->db_size
;
610 mzap_phys_t
*mzp
= zio_buf_alloc(sz
);
611 bcopy(zap
->zap_dbuf
->db_data
, mzp
, sz
);
612 int nchunks
= zap
->zap_m
.zap_num_chunks
;
615 err
= dmu_object_set_blocksize(zap
->zap_objset
, zap
->zap_object
,
616 1ULL << fzap_default_block_shift
, 0, tx
);
618 zio_buf_free(mzp
, sz
);
623 dprintf("upgrading obj=%llu with %u chunks\n",
624 zap
->zap_object
, nchunks
);
625 /* XXX destroy the avl later, so we can use the stored hash value */
628 fzap_upgrade(zap
, tx
, flags
);
630 for (int i
= 0; i
< nchunks
; i
++) {
631 mzap_ent_phys_t
*mze
= &mzp
->mz_chunk
[i
];
632 if (mze
->mze_name
[0] == 0)
634 dprintf("adding %s=%llu\n",
635 mze
->mze_name
, mze
->mze_value
);
636 zap_name_t
*zn
= zap_name_alloc(zap
, mze
->mze_name
, 0);
637 err
= fzap_add_cd(zn
, 8, 1, &mze
->mze_value
, mze
->mze_cd
,
639 zap
= zn
->zn_zap
; /* fzap_add_cd() may change zap */
644 zio_buf_free(mzp
, sz
);
650 * The "normflags" determine the behavior of the matchtype_t which is
651 * passed to zap_lookup_norm(). Names which have the same normalized
652 * version will be stored with the same hash value, and therefore we can
653 * perform normalization-insensitive lookups. We can be Unicode form-
654 * insensitive and/or case-insensitive. The following flags are valid for
661 * U8_TEXTPREP_TOUPPER
663 * The *_NF* (Normalization Form) flags are mutually exclusive; at most one
664 * of them may be supplied.
667 mzap_create_impl(objset_t
*os
, uint64_t obj
, int normflags
, zap_flags_t flags
,
672 VERIFY0(dmu_buf_hold(os
, obj
, 0, FTAG
, &db
, DMU_READ_NO_PREFETCH
));
674 dmu_buf_will_dirty(db
, tx
);
675 mzap_phys_t
*zp
= db
->db_data
;
676 zp
->mz_block_type
= ZBT_MICRO
;
677 zp
->mz_salt
= ((uintptr_t)db
^ (uintptr_t)tx
^ (obj
<< 1)) | 1ULL;
678 zp
->mz_normflags
= normflags
;
682 /* Only fat zap supports flags; upgrade immediately. */
683 VERIFY0(zap_lockdir_impl(db
, FTAG
, tx
, RW_WRITER
,
684 B_FALSE
, B_FALSE
, &zap
));
685 VERIFY0(mzap_upgrade(&zap
, FTAG
, tx
, flags
));
686 zap_unlockdir(zap
, FTAG
);
688 dmu_buf_rele(db
, FTAG
);
693 zap_create_claim(objset_t
*os
, uint64_t obj
, dmu_object_type_t ot
,
694 dmu_object_type_t bonustype
, int bonuslen
, dmu_tx_t
*tx
)
696 return (zap_create_claim_norm(os
, obj
,
697 0, ot
, bonustype
, bonuslen
, tx
));
701 zap_create_claim_norm(objset_t
*os
, uint64_t obj
, int normflags
,
702 dmu_object_type_t ot
,
703 dmu_object_type_t bonustype
, int bonuslen
, dmu_tx_t
*tx
)
705 ASSERT3U(DMU_OT_BYTESWAP(ot
), ==, DMU_BSWAP_ZAP
);
706 int err
= dmu_object_claim(os
, obj
, ot
, 0, bonustype
, bonuslen
, tx
);
709 mzap_create_impl(os
, obj
, normflags
, 0, tx
);
714 zap_create(objset_t
*os
, dmu_object_type_t ot
,
715 dmu_object_type_t bonustype
, int bonuslen
, dmu_tx_t
*tx
)
717 return (zap_create_norm(os
, 0, ot
, bonustype
, bonuslen
, tx
));
721 zap_create_norm(objset_t
*os
, int normflags
, dmu_object_type_t ot
,
722 dmu_object_type_t bonustype
, int bonuslen
, dmu_tx_t
*tx
)
724 ASSERT3U(DMU_OT_BYTESWAP(ot
), ==, DMU_BSWAP_ZAP
);
725 uint64_t obj
= dmu_object_alloc(os
, ot
, 0, bonustype
, bonuslen
, tx
);
727 mzap_create_impl(os
, obj
, normflags
, 0, tx
);
732 zap_create_flags(objset_t
*os
, int normflags
, zap_flags_t flags
,
733 dmu_object_type_t ot
, int leaf_blockshift
, int indirect_blockshift
,
734 dmu_object_type_t bonustype
, int bonuslen
, dmu_tx_t
*tx
)
736 ASSERT3U(DMU_OT_BYTESWAP(ot
), ==, DMU_BSWAP_ZAP
);
737 uint64_t obj
= dmu_object_alloc(os
, ot
, 0, bonustype
, bonuslen
, tx
);
739 ASSERT(leaf_blockshift
>= SPA_MINBLOCKSHIFT
&&
740 leaf_blockshift
<= SPA_OLD_MAXBLOCKSHIFT
&&
741 indirect_blockshift
>= SPA_MINBLOCKSHIFT
&&
742 indirect_blockshift
<= SPA_OLD_MAXBLOCKSHIFT
);
744 VERIFY(dmu_object_set_blocksize(os
, obj
,
745 1ULL << leaf_blockshift
, indirect_blockshift
, tx
) == 0);
747 mzap_create_impl(os
, obj
, normflags
, flags
, tx
);
752 zap_destroy(objset_t
*os
, uint64_t zapobj
, dmu_tx_t
*tx
)
755 * dmu_object_free will free the object number and free the
756 * data. Freeing the data will cause our pageout function to be
757 * called, which will destroy our data (zap_leaf_t's and zap_t).
760 return (dmu_object_free(os
, zapobj
, tx
));
764 zap_evict_sync(void *dbu
)
768 rw_destroy(&zap
->zap_rwlock
);
770 if (zap
->zap_ismicro
)
773 mutex_destroy(&zap
->zap_f
.zap_num_entries_mtx
);
775 kmem_free(zap
, sizeof (zap_t
));
779 zap_count(objset_t
*os
, uint64_t zapobj
, uint64_t *count
)
784 zap_lockdir(os
, zapobj
, NULL
, RW_READER
, TRUE
, FALSE
, FTAG
, &zap
);
787 if (!zap
->zap_ismicro
) {
788 err
= fzap_count(zap
, count
);
790 *count
= zap
->zap_m
.zap_num_entries
;
792 zap_unlockdir(zap
, FTAG
);
/*
 * zn may be NULL; if not specified, it will be computed if needed.
 * See also the comment above zap_entry_normalization_conflict().
 */
801 mzap_normalization_conflict(zap_t
*zap
, zap_name_t
*zn
, mzap_ent_t
*mze
)
803 int direction
= AVL_BEFORE
;
804 boolean_t allocdzn
= B_FALSE
;
806 if (zap
->zap_normflags
== 0)
810 for (mzap_ent_t
*other
= avl_walk(&zap
->zap_m
.zap_avl
, mze
, direction
);
811 other
&& other
->mze_hash
== mze
->mze_hash
;
812 other
= avl_walk(&zap
->zap_m
.zap_avl
, other
, direction
)) {
815 zn
= zap_name_alloc(zap
, MZE_PHYS(zap
, mze
)->mze_name
,
819 if (zap_match(zn
, MZE_PHYS(zap
, other
)->mze_name
)) {
826 if (direction
== AVL_BEFORE
) {
827 direction
= AVL_AFTER
;
/*
 * Routines for manipulating attributes.
 */
841 zap_lookup(objset_t
*os
, uint64_t zapobj
, const char *name
,
842 uint64_t integer_size
, uint64_t num_integers
, void *buf
)
844 return (zap_lookup_norm(os
, zapobj
, name
, integer_size
,
845 num_integers
, buf
, 0, NULL
, 0, NULL
));
849 zap_lookup_impl(zap_t
*zap
, const char *name
,
850 uint64_t integer_size
, uint64_t num_integers
, void *buf
,
851 matchtype_t mt
, char *realname
, int rn_len
,
856 zap_name_t
*zn
= zap_name_alloc(zap
, name
, mt
);
858 return (SET_ERROR(ENOTSUP
));
860 if (!zap
->zap_ismicro
) {
861 err
= fzap_lookup(zn
, integer_size
, num_integers
, buf
,
862 realname
, rn_len
, ncp
);
864 mzap_ent_t
*mze
= mze_find(zn
);
866 err
= SET_ERROR(ENOENT
);
868 if (num_integers
< 1) {
869 err
= SET_ERROR(EOVERFLOW
);
870 } else if (integer_size
!= 8) {
871 err
= SET_ERROR(EINVAL
);
874 MZE_PHYS(zap
, mze
)->mze_value
;
875 (void) strlcpy(realname
,
876 MZE_PHYS(zap
, mze
)->mze_name
, rn_len
);
878 *ncp
= mzap_normalization_conflict(zap
,
889 zap_lookup_norm(objset_t
*os
, uint64_t zapobj
, const char *name
,
890 uint64_t integer_size
, uint64_t num_integers
, void *buf
,
891 matchtype_t mt
, char *realname
, int rn_len
,
897 zap_lockdir(os
, zapobj
, NULL
, RW_READER
, TRUE
, FALSE
, FTAG
, &zap
);
900 err
= zap_lookup_impl(zap
, name
, integer_size
,
901 num_integers
, buf
, mt
, realname
, rn_len
, ncp
);
902 zap_unlockdir(zap
, FTAG
);
907 zap_lookup_by_dnode(dnode_t
*dn
, const char *name
,
908 uint64_t integer_size
, uint64_t num_integers
, void *buf
)
910 return (zap_lookup_norm_by_dnode(dn
, name
, integer_size
,
911 num_integers
, buf
, 0, NULL
, 0, NULL
));
915 zap_lookup_norm_by_dnode(dnode_t
*dn
, const char *name
,
916 uint64_t integer_size
, uint64_t num_integers
, void *buf
,
917 matchtype_t mt
, char *realname
, int rn_len
,
922 int err
= zap_lockdir_by_dnode(dn
, NULL
, RW_READER
, TRUE
, FALSE
,
926 err
= zap_lookup_impl(zap
, name
, integer_size
,
927 num_integers
, buf
, mt
, realname
, rn_len
, ncp
);
928 zap_unlockdir(zap
, FTAG
);
933 zap_prefetch_uint64(objset_t
*os
, uint64_t zapobj
, const uint64_t *key
,
939 zap_lockdir(os
, zapobj
, NULL
, RW_READER
, TRUE
, FALSE
, FTAG
, &zap
);
942 zap_name_t
*zn
= zap_name_alloc_uint64(zap
, key
, key_numints
);
944 zap_unlockdir(zap
, FTAG
);
945 return (SET_ERROR(ENOTSUP
));
950 zap_unlockdir(zap
, FTAG
);
955 zap_lookup_uint64(objset_t
*os
, uint64_t zapobj
, const uint64_t *key
,
956 int key_numints
, uint64_t integer_size
, uint64_t num_integers
, void *buf
)
961 zap_lockdir(os
, zapobj
, NULL
, RW_READER
, TRUE
, FALSE
, FTAG
, &zap
);
964 zap_name_t
*zn
= zap_name_alloc_uint64(zap
, key
, key_numints
);
966 zap_unlockdir(zap
, FTAG
);
967 return (SET_ERROR(ENOTSUP
));
970 err
= fzap_lookup(zn
, integer_size
, num_integers
, buf
,
973 zap_unlockdir(zap
, FTAG
);
978 zap_contains(objset_t
*os
, uint64_t zapobj
, const char *name
)
980 int err
= zap_lookup_norm(os
, zapobj
, name
, 0,
981 0, NULL
, 0, NULL
, 0, NULL
);
982 if (err
== EOVERFLOW
|| err
== EINVAL
)
983 err
= 0; /* found, but skipped reading the value */
988 zap_length(objset_t
*os
, uint64_t zapobj
, const char *name
,
989 uint64_t *integer_size
, uint64_t *num_integers
)
994 zap_lockdir(os
, zapobj
, NULL
, RW_READER
, TRUE
, FALSE
, FTAG
, &zap
);
997 zap_name_t
*zn
= zap_name_alloc(zap
, name
, 0);
999 zap_unlockdir(zap
, FTAG
);
1000 return (SET_ERROR(ENOTSUP
));
1002 if (!zap
->zap_ismicro
) {
1003 err
= fzap_length(zn
, integer_size
, num_integers
);
1005 mzap_ent_t
*mze
= mze_find(zn
);
1007 err
= SET_ERROR(ENOENT
);
1016 zap_unlockdir(zap
, FTAG
);
1021 zap_length_uint64(objset_t
*os
, uint64_t zapobj
, const uint64_t *key
,
1022 int key_numints
, uint64_t *integer_size
, uint64_t *num_integers
)
1027 zap_lockdir(os
, zapobj
, NULL
, RW_READER
, TRUE
, FALSE
, FTAG
, &zap
);
1030 zap_name_t
*zn
= zap_name_alloc_uint64(zap
, key
, key_numints
);
1032 zap_unlockdir(zap
, FTAG
);
1033 return (SET_ERROR(ENOTSUP
));
1035 err
= fzap_length(zn
, integer_size
, num_integers
);
1037 zap_unlockdir(zap
, FTAG
);
1042 mzap_addent(zap_name_t
*zn
, uint64_t value
)
1044 zap_t
*zap
= zn
->zn_zap
;
1045 int start
= zap
->zap_m
.zap_alloc_next
;
1047 ASSERT(RW_WRITE_HELD(&zap
->zap_rwlock
));
1050 for (int i
= 0; i
< zap
->zap_m
.zap_num_chunks
; i
++) {
1051 mzap_ent_phys_t
*mze
= &zap_m_phys(zap
)->mz_chunk
[i
];
1052 ASSERT(strcmp(zn
->zn_key_orig
, mze
->mze_name
) != 0);
1056 uint32_t cd
= mze_find_unused_cd(zap
, zn
->zn_hash
);
1057 /* given the limited size of the microzap, this can't happen */
1058 ASSERT(cd
< zap_maxcd(zap
));
1061 for (int i
= start
; i
< zap
->zap_m
.zap_num_chunks
; i
++) {
1062 mzap_ent_phys_t
*mze
= &zap_m_phys(zap
)->mz_chunk
[i
];
1063 if (mze
->mze_name
[0] == 0) {
1064 mze
->mze_value
= value
;
1066 (void) strcpy(mze
->mze_name
, zn
->zn_key_orig
);
1067 zap
->zap_m
.zap_num_entries
++;
1068 zap
->zap_m
.zap_alloc_next
= i
+1;
1069 if (zap
->zap_m
.zap_alloc_next
==
1070 zap
->zap_m
.zap_num_chunks
)
1071 zap
->zap_m
.zap_alloc_next
= 0;
1072 mze_insert(zap
, i
, zn
->zn_hash
);
1080 ASSERT(!"out of entries!");
1084 zap_add_impl(zap_t
*zap
, const char *key
,
1085 int integer_size
, uint64_t num_integers
,
1086 const void *val
, dmu_tx_t
*tx
, void *tag
)
1088 const uint64_t *intval
= val
;
1091 zap_name_t
*zn
= zap_name_alloc(zap
, key
, 0);
1093 zap_unlockdir(zap
, tag
);
1094 return (SET_ERROR(ENOTSUP
));
1096 if (!zap
->zap_ismicro
) {
1097 err
= fzap_add(zn
, integer_size
, num_integers
, val
, tag
, tx
);
1098 zap
= zn
->zn_zap
; /* fzap_add() may change zap */
1099 } else if (integer_size
!= 8 || num_integers
!= 1 ||
1100 strlen(key
) >= MZAP_NAME_LEN
) {
1101 err
= mzap_upgrade(&zn
->zn_zap
, tag
, tx
, 0);
1103 err
= fzap_add(zn
, integer_size
, num_integers
, val
,
1106 zap
= zn
->zn_zap
; /* fzap_add() may change zap */
1108 if (mze_find(zn
) != NULL
) {
1109 err
= SET_ERROR(EEXIST
);
1111 mzap_addent(zn
, *intval
);
1114 ASSERT(zap
== zn
->zn_zap
);
1116 if (zap
!= NULL
) /* may be NULL if fzap_add() failed */
1117 zap_unlockdir(zap
, tag
);
1122 zap_add(objset_t
*os
, uint64_t zapobj
, const char *key
,
1123 int integer_size
, uint64_t num_integers
,
1124 const void *val
, dmu_tx_t
*tx
)
1129 err
= zap_lockdir(os
, zapobj
, tx
, RW_WRITER
, TRUE
, TRUE
, FTAG
, &zap
);
1132 err
= zap_add_impl(zap
, key
, integer_size
, num_integers
, val
, tx
, FTAG
);
1133 /* zap_add_impl() calls zap_unlockdir() */
1138 zap_add_by_dnode(dnode_t
*dn
, const char *key
,
1139 int integer_size
, uint64_t num_integers
,
1140 const void *val
, dmu_tx_t
*tx
)
1145 err
= zap_lockdir_by_dnode(dn
, tx
, RW_WRITER
, TRUE
, TRUE
, FTAG
, &zap
);
1148 err
= zap_add_impl(zap
, key
, integer_size
, num_integers
, val
, tx
, FTAG
);
1149 /* zap_add_impl() calls zap_unlockdir() */
1154 zap_add_uint64(objset_t
*os
, uint64_t zapobj
, const uint64_t *key
,
1155 int key_numints
, int integer_size
, uint64_t num_integers
,
1156 const void *val
, dmu_tx_t
*tx
)
1161 zap_lockdir(os
, zapobj
, tx
, RW_WRITER
, TRUE
, TRUE
, FTAG
, &zap
);
1164 zap_name_t
*zn
= zap_name_alloc_uint64(zap
, key
, key_numints
);
1166 zap_unlockdir(zap
, FTAG
);
1167 return (SET_ERROR(ENOTSUP
));
1169 err
= fzap_add(zn
, integer_size
, num_integers
, val
, FTAG
, tx
);
1170 zap
= zn
->zn_zap
; /* fzap_add() may change zap */
1172 if (zap
!= NULL
) /* may be NULL if fzap_add() failed */
1173 zap_unlockdir(zap
, FTAG
);
1178 zap_update(objset_t
*os
, uint64_t zapobj
, const char *name
,
1179 int integer_size
, uint64_t num_integers
, const void *val
, dmu_tx_t
*tx
)
1183 const uint64_t *intval
= val
;
1187 * If there is an old value, it shouldn't change across the
1188 * lockdir (eg, due to bprewrite's xlation).
1190 if (integer_size
== 8 && num_integers
== 1)
1191 (void) zap_lookup(os
, zapobj
, name
, 8, 1, &oldval
);
1195 zap_lockdir(os
, zapobj
, tx
, RW_WRITER
, TRUE
, TRUE
, FTAG
, &zap
);
1198 zap_name_t
*zn
= zap_name_alloc(zap
, name
, 0);
1200 zap_unlockdir(zap
, FTAG
);
1201 return (SET_ERROR(ENOTSUP
));
1203 if (!zap
->zap_ismicro
) {
1204 err
= fzap_update(zn
, integer_size
, num_integers
, val
,
1206 zap
= zn
->zn_zap
; /* fzap_update() may change zap */
1207 } else if (integer_size
!= 8 || num_integers
!= 1 ||
1208 strlen(name
) >= MZAP_NAME_LEN
) {
1209 dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
1210 zapobj
, integer_size
, num_integers
, name
);
1211 err
= mzap_upgrade(&zn
->zn_zap
, FTAG
, tx
, 0);
1213 err
= fzap_update(zn
, integer_size
, num_integers
,
1216 zap
= zn
->zn_zap
; /* fzap_update() may change zap */
1218 mzap_ent_t
*mze
= mze_find(zn
);
1220 ASSERT3U(MZE_PHYS(zap
, mze
)->mze_value
, ==, oldval
);
1221 MZE_PHYS(zap
, mze
)->mze_value
= *intval
;
1223 mzap_addent(zn
, *intval
);
1226 ASSERT(zap
== zn
->zn_zap
);
1228 if (zap
!= NULL
) /* may be NULL if fzap_upgrade() failed */
1229 zap_unlockdir(zap
, FTAG
);
1234 zap_update_uint64(objset_t
*os
, uint64_t zapobj
, const uint64_t *key
,
1236 int integer_size
, uint64_t num_integers
, const void *val
, dmu_tx_t
*tx
)
1241 zap_lockdir(os
, zapobj
, tx
, RW_WRITER
, TRUE
, TRUE
, FTAG
, &zap
);
1244 zap_name_t
*zn
= zap_name_alloc_uint64(zap
, key
, key_numints
);
1246 zap_unlockdir(zap
, FTAG
);
1247 return (SET_ERROR(ENOTSUP
));
1249 err
= fzap_update(zn
, integer_size
, num_integers
, val
, FTAG
, tx
);
1250 zap
= zn
->zn_zap
; /* fzap_update() may change zap */
1252 if (zap
!= NULL
) /* may be NULL if fzap_upgrade() failed */
1253 zap_unlockdir(zap
, FTAG
);
1258 zap_remove(objset_t
*os
, uint64_t zapobj
, const char *name
, dmu_tx_t
*tx
)
1260 return (zap_remove_norm(os
, zapobj
, name
, 0, tx
));
1264 zap_remove_impl(zap_t
*zap
, const char *name
,
1265 matchtype_t mt
, dmu_tx_t
*tx
)
1269 zap_name_t
*zn
= zap_name_alloc(zap
, name
, mt
);
1271 return (SET_ERROR(ENOTSUP
));
1272 if (!zap
->zap_ismicro
) {
1273 err
= fzap_remove(zn
, tx
);
1275 mzap_ent_t
*mze
= mze_find(zn
);
1277 err
= SET_ERROR(ENOENT
);
1279 zap
->zap_m
.zap_num_entries
--;
1280 bzero(&zap_m_phys(zap
)->mz_chunk
[mze
->mze_chunkid
],
1281 sizeof (mzap_ent_phys_t
));
1282 mze_remove(zap
, mze
);
1290 zap_remove_norm(objset_t
*os
, uint64_t zapobj
, const char *name
,
1291 matchtype_t mt
, dmu_tx_t
*tx
)
1296 err
= zap_lockdir(os
, zapobj
, tx
, RW_WRITER
, TRUE
, FALSE
, FTAG
, &zap
);
1299 err
= zap_remove_impl(zap
, name
, mt
, tx
);
1300 zap_unlockdir(zap
, FTAG
);
1305 zap_remove_by_dnode(dnode_t
*dn
, const char *name
, dmu_tx_t
*tx
)
1310 err
= zap_lockdir_by_dnode(dn
, tx
, RW_WRITER
, TRUE
, FALSE
, FTAG
, &zap
);
1313 err
= zap_remove_impl(zap
, name
, 0, tx
);
1314 zap_unlockdir(zap
, FTAG
);
1319 zap_remove_uint64(objset_t
*os
, uint64_t zapobj
, const uint64_t *key
,
1320 int key_numints
, dmu_tx_t
*tx
)
1325 zap_lockdir(os
, zapobj
, tx
, RW_WRITER
, TRUE
, FALSE
, FTAG
, &zap
);
1328 zap_name_t
*zn
= zap_name_alloc_uint64(zap
, key
, key_numints
);
1330 zap_unlockdir(zap
, FTAG
);
1331 return (SET_ERROR(ENOTSUP
));
1333 err
= fzap_remove(zn
, tx
);
1335 zap_unlockdir(zap
, FTAG
);
/*
 * Routines for iterating over the attributes.
 */
1344 zap_cursor_init_serialized(zap_cursor_t
*zc
, objset_t
*os
, uint64_t zapobj
,
1345 uint64_t serialized
)
1350 zc
->zc_zapobj
= zapobj
;
1351 zc
->zc_serialized
= serialized
;
1357 zap_cursor_init(zap_cursor_t
*zc
, objset_t
*os
, uint64_t zapobj
)
1359 zap_cursor_init_serialized(zc
, os
, zapobj
, 0);
1363 zap_cursor_fini(zap_cursor_t
*zc
)
1366 rw_enter(&zc
->zc_zap
->zap_rwlock
, RW_READER
);
1367 zap_unlockdir(zc
->zc_zap
, NULL
);
1371 rw_enter(&zc
->zc_leaf
->l_rwlock
, RW_READER
);
1372 zap_put_leaf(zc
->zc_leaf
);
1375 zc
->zc_objset
= NULL
;
1379 zap_cursor_serialize(zap_cursor_t
*zc
)
1381 if (zc
->zc_hash
== -1ULL)
1383 if (zc
->zc_zap
== NULL
)
1384 return (zc
->zc_serialized
);
1385 ASSERT((zc
->zc_hash
& zap_maxcd(zc
->zc_zap
)) == 0);
1386 ASSERT(zc
->zc_cd
< zap_maxcd(zc
->zc_zap
));
1389 * We want to keep the high 32 bits of the cursor zero if we can, so
1390 * that 32-bit programs can access this. So usually use a small
1391 * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits
1394 * [ collision differentiator | zap_hashbits()-bit hash value ]
1396 return ((zc
->zc_hash
>> (64 - zap_hashbits(zc
->zc_zap
))) |
1397 ((uint64_t)zc
->zc_cd
<< zap_hashbits(zc
->zc_zap
)));
1401 zap_cursor_retrieve(zap_cursor_t
*zc
, zap_attribute_t
*za
)
1405 if (zc
->zc_hash
== -1ULL)
1406 return (SET_ERROR(ENOENT
));
1408 if (zc
->zc_zap
== NULL
) {
1410 err
= zap_lockdir(zc
->zc_objset
, zc
->zc_zapobj
, NULL
,
1411 RW_READER
, TRUE
, FALSE
, NULL
, &zc
->zc_zap
);
		/*
		 * To support zap_cursor_init_serialized, advance, retrieve,
		 * we must add to the existing zc_cd, which may already
		 * be 1 due to the zap_cursor_advance.
		 */
1420 ASSERT(zc
->zc_hash
== 0);
1421 hb
= zap_hashbits(zc
->zc_zap
);
1422 zc
->zc_hash
= zc
->zc_serialized
<< (64 - hb
);
1423 zc
->zc_cd
+= zc
->zc_serialized
>> hb
;
1424 if (zc
->zc_cd
>= zap_maxcd(zc
->zc_zap
)) /* corrupt serialized */
1427 rw_enter(&zc
->zc_zap
->zap_rwlock
, RW_READER
);
1429 if (!zc
->zc_zap
->zap_ismicro
) {
1430 err
= fzap_cursor_retrieve(zc
->zc_zap
, zc
, za
);
1433 mzap_ent_t mze_tofind
;
1435 mze_tofind
.mze_hash
= zc
->zc_hash
;
1436 mze_tofind
.mze_cd
= zc
->zc_cd
;
1439 avl_find(&zc
->zc_zap
->zap_m
.zap_avl
, &mze_tofind
, &idx
);
1441 mze
= avl_nearest(&zc
->zc_zap
->zap_m
.zap_avl
,
1445 mzap_ent_phys_t
*mzep
= MZE_PHYS(zc
->zc_zap
, mze
);
1446 ASSERT3U(mze
->mze_cd
, ==, mzep
->mze_cd
);
1447 za
->za_normalization_conflict
=
1448 mzap_normalization_conflict(zc
->zc_zap
, NULL
, mze
);
1449 za
->za_integer_length
= 8;
1450 za
->za_num_integers
= 1;
1451 za
->za_first_integer
= mzep
->mze_value
;
1452 (void) strcpy(za
->za_name
, mzep
->mze_name
);
1453 zc
->zc_hash
= mze
->mze_hash
;
1454 zc
->zc_cd
= mze
->mze_cd
;
1457 zc
->zc_hash
= -1ULL;
1458 err
= SET_ERROR(ENOENT
);
1461 rw_exit(&zc
->zc_zap
->zap_rwlock
);
1466 zap_cursor_advance(zap_cursor_t
*zc
)
1468 if (zc
->zc_hash
== -1ULL)
1474 zap_get_stats(objset_t
*os
, uint64_t zapobj
, zap_stats_t
*zs
)
1479 zap_lockdir(os
, zapobj
, NULL
, RW_READER
, TRUE
, FALSE
, FTAG
, &zap
);
1483 bzero(zs
, sizeof (zap_stats_t
));
1485 if (zap
->zap_ismicro
) {
1486 zs
->zs_blocksize
= zap
->zap_dbuf
->db_size
;
1487 zs
->zs_num_entries
= zap
->zap_m
.zap_num_entries
;
1488 zs
->zs_num_blocks
= 1;
1490 fzap_get_stats(zap
, zs
);
1492 zap_unlockdir(zap
, FTAG
);