sbin/*hammer: Add missing braces to conform to code style
[dragonfly.git] / sbin / hammer / cmd_dedup.c
blobda58e315e0bfed234ccf111034febdbf01cca428
1 /*
2 * Copyright (c) 2010 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Ilya Dryomov <idryomov@gmail.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
35 #include <libutil.h>
36 #include <crypto/sha2/sha2.h>
38 #include "hammer.h"
40 #define DEDUP_BUF (64 * 1024)
42 /* Sorted list of block CRCs - light version for dedup-simulate */
43 struct sim_dedup_entry_rb_tree;
44 RB_HEAD(sim_dedup_entry_rb_tree, sim_dedup_entry) sim_dedup_tree =
45 RB_INITIALIZER(&sim_dedup_tree);
46 RB_PROTOTYPE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry,
47 rb_sim_dedup_entry_compare, hammer_crc_t);
49 struct sim_dedup_entry {
50 hammer_crc_t crc;
51 uint64_t ref_blks; /* number of blocks referenced */
52 uint64_t ref_size; /* size of data referenced */
53 RB_ENTRY(sim_dedup_entry) rb_entry;
56 struct dedup_entry {
57 struct hammer_btree_leaf_elm leaf;
58 union {
59 struct {
60 uint64_t ref_blks;
61 uint64_t ref_size;
62 } de;
63 RB_HEAD(sha_dedup_entry_rb_tree, sha_dedup_entry) fict_root;
64 } u;
65 uint8_t flags;
66 RB_ENTRY(dedup_entry) rb_entry;
69 #define HAMMER_DEDUP_ENTRY_FICTITIOUS 0x0001
71 struct sha_dedup_entry {
72 struct hammer_btree_leaf_elm leaf;
73 uint64_t ref_blks;
74 uint64_t ref_size;
75 uint8_t sha_hash[SHA256_DIGEST_LENGTH];
76 RB_ENTRY(sha_dedup_entry) fict_entry;
79 /* Sorted list of HAMMER B-Tree keys */
80 struct dedup_entry_rb_tree;
81 struct sha_dedup_entry_rb_tree;
83 RB_HEAD(dedup_entry_rb_tree, dedup_entry) dedup_tree =
84 RB_INITIALIZER(&dedup_tree);
85 RB_PROTOTYPE2(dedup_entry_rb_tree, dedup_entry, rb_entry,
86 rb_dedup_entry_compare, hammer_crc_t);
88 RB_PROTOTYPE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry,
89 rb_sha_dedup_entry_compare);
92 * Pass2 list - contains entries that were not dedup'ed because ioctl failed
94 STAILQ_HEAD(, pass2_dedup_entry) pass2_dedup_queue =
95 STAILQ_HEAD_INITIALIZER(pass2_dedup_queue);
97 struct pass2_dedup_entry {
98 struct hammer_btree_leaf_elm leaf;
99 STAILQ_ENTRY(pass2_dedup_entry) sq_entry;
102 #define DEDUP_PASS2 0x0001 /* process_btree_elm() mode */
104 static int SigInfoFlag;
105 static int SigAlrmFlag;
106 static int64_t DedupDataReads;
107 static int64_t DedupCurrentRecords;
108 static int64_t DedupTotalRecords;
109 static uint32_t DedupCrcStart;
110 static uint32_t DedupCrcEnd;
111 static uint64_t MemoryUse;
113 /* PFS global ids - we deal with just one PFS at a run */
114 static int glob_fd;
115 static struct hammer_ioc_pseudofs_rw glob_pfs;
118 * Global accounting variables
120 * Last three don't have to be 64-bit, just to be safe..
122 static uint64_t dedup_alloc_size;
123 static uint64_t dedup_ref_size;
124 static uint64_t dedup_skipped_size;
125 static uint64_t dedup_crc_failures;
126 static uint64_t dedup_sha_failures;
127 static uint64_t dedup_underflows;
128 static uint64_t dedup_successes_count;
129 static uint64_t dedup_successes_bytes;
131 static int rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1,
132 struct sim_dedup_entry *sim_de2);
133 static int rb_dedup_entry_compare(struct dedup_entry *de1,
134 struct dedup_entry *de2);
135 static int rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1,
136 struct sha_dedup_entry *sha_de2);
137 typedef int (*scan_pfs_cb_t)(hammer_btree_leaf_elm_t scan_leaf, int flags);
138 static void scan_pfs(char *filesystem, scan_pfs_cb_t func, const char *id);
139 static int collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
140 static int count_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
141 static int process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags);
142 static int upgrade_chksum(hammer_btree_leaf_elm_t leaf, uint8_t *sha_hash);
143 static void dump_simulated_dedup(void);
144 static void dump_real_dedup(void);
145 static void dedup_usage(int code);
147 RB_GENERATE2(sim_dedup_entry_rb_tree, sim_dedup_entry, rb_entry,
148 rb_sim_dedup_entry_compare, hammer_crc_t, crc);
149 RB_GENERATE2(dedup_entry_rb_tree, dedup_entry, rb_entry,
150 rb_dedup_entry_compare, hammer_crc_t, leaf.data_crc);
151 RB_GENERATE(sha_dedup_entry_rb_tree, sha_dedup_entry, fict_entry,
152 rb_sha_dedup_entry_compare);
154 static
156 rb_sim_dedup_entry_compare(struct sim_dedup_entry *sim_de1,
157 struct sim_dedup_entry *sim_de2)
159 if (sim_de1->crc < sim_de2->crc)
160 return (-1);
161 if (sim_de1->crc > sim_de2->crc)
162 return (1);
164 return (0);
167 static
169 rb_dedup_entry_compare(struct dedup_entry *de1, struct dedup_entry *de2)
171 if (de1->leaf.data_crc < de2->leaf.data_crc)
172 return (-1);
173 if (de1->leaf.data_crc > de2->leaf.data_crc)
174 return (1);
176 return (0);
179 static
181 rb_sha_dedup_entry_compare(struct sha_dedup_entry *sha_de1,
182 struct sha_dedup_entry *sha_de2)
184 unsigned long *h1 = (unsigned long *)&sha_de1->sha_hash;
185 unsigned long *h2 = (unsigned long *)&sha_de2->sha_hash;
186 int i;
188 for (i = 0; i < SHA256_DIGEST_LENGTH / (int)sizeof(unsigned long); ++i) {
189 if (h1[i] < h2[i])
190 return (-1);
191 if (h1[i] > h2[i])
192 return (1);
195 return (0);
199 * dedup-simulate <filesystem>
201 void
202 hammer_cmd_dedup_simulate(char **av, int ac)
204 struct sim_dedup_entry *sim_de;
206 if (ac != 1) {
207 dedup_usage(1);
208 /* not reached */
211 glob_fd = getpfs(&glob_pfs, av[0]);
214 * Collection passes (memory limited)
216 printf("Dedup-simulate running\n");
217 do {
218 DedupCrcStart = DedupCrcEnd;
219 DedupCrcEnd = 0;
220 MemoryUse = 0;
222 if (VerboseOpt) {
223 printf("B-Tree pass crc-range %08x-max\n",
224 DedupCrcStart);
225 fflush(stdout);
227 scan_pfs(av[0], collect_btree_elm, "simu-pass");
229 if (VerboseOpt >= 2)
230 dump_simulated_dedup();
233 * Calculate simulated dedup ratio and get rid of the tree
235 while ((sim_de = RB_ROOT(&sim_dedup_tree)) != NULL) {
236 assert(sim_de->ref_blks != 0);
237 dedup_ref_size += sim_de->ref_size;
238 dedup_alloc_size += sim_de->ref_size / sim_de->ref_blks;
240 RB_REMOVE(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de);
241 free(sim_de);
243 if (DedupCrcEnd && VerboseOpt == 0)
244 printf(".");
245 } while (DedupCrcEnd);
247 printf("Dedup-simulate %s succeeded\n", av[0]);
248 relpfs(glob_fd, &glob_pfs);
250 printf("Simulated dedup ratio = %.2f\n",
251 (dedup_alloc_size != 0) ?
252 (double)dedup_ref_size / dedup_alloc_size : 0);
256 * dedup <filesystem>
258 void
259 hammer_cmd_dedup(char **av, int ac)
261 struct dedup_entry *de;
262 struct sha_dedup_entry *sha_de;
263 struct pass2_dedup_entry *pass2_de;
264 char *tmp;
265 char buf[8];
266 int needfree = 0;
268 if (TimeoutOpt > 0)
269 alarm(TimeoutOpt);
271 if (ac != 1) {
272 dedup_usage(1);
273 /* not reached */
276 STAILQ_INIT(&pass2_dedup_queue);
278 glob_fd = getpfs(&glob_pfs, av[0]);
281 * A cycle file is _required_ for resuming dedup after the timeout
282 * specified with -t has expired. If no -c option, then place a
283 * .dedup.cycle file either in the PFS snapshots directory or in
284 * the default snapshots directory.
286 if (!CyclePath) {
287 if (glob_pfs.ondisk->snapshots[0] != '/') {
288 asprintf(&tmp, "%s/%s/.dedup.cycle",
289 SNAPSHOTS_BASE, av[0]);
290 } else {
291 asprintf(&tmp, "%s/.dedup.cycle",
292 glob_pfs.ondisk->snapshots);
294 CyclePath = tmp;
295 needfree = 1;
299 * Pre-pass to cache the btree
301 scan_pfs(av[0], count_btree_elm, "pre-pass ");
302 DedupTotalRecords = DedupCurrentRecords;
305 * Collection passes (memory limited)
307 printf("Dedup running\n");
308 do {
309 DedupCrcStart = DedupCrcEnd;
310 DedupCrcEnd = 0;
311 MemoryUse = 0;
313 if (VerboseOpt) {
314 printf("B-Tree pass crc-range %08x-max\n",
315 DedupCrcStart);
316 fflush(stdout);
318 scan_pfs(av[0], process_btree_elm, "main-pass");
320 while ((pass2_de = STAILQ_FIRST(&pass2_dedup_queue)) != NULL) {
321 if (process_btree_elm(&pass2_de->leaf, DEDUP_PASS2))
322 dedup_skipped_size -= pass2_de->leaf.data_len;
324 STAILQ_REMOVE_HEAD(&pass2_dedup_queue, sq_entry);
325 free(pass2_de);
327 assert(STAILQ_EMPTY(&pass2_dedup_queue));
329 if (VerboseOpt >= 2)
330 dump_real_dedup();
333 * Calculate dedup ratio and get rid of the trees
335 while ((de = RB_ROOT(&dedup_tree)) != NULL) {
336 if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
337 while ((sha_de = RB_ROOT(&de->u.fict_root)) != NULL) {
338 assert(sha_de->ref_blks != 0);
339 dedup_ref_size += sha_de->ref_size;
340 dedup_alloc_size += sha_de->ref_size / sha_de->ref_blks;
342 RB_REMOVE(sha_dedup_entry_rb_tree,
343 &de->u.fict_root, sha_de);
344 free(sha_de);
346 assert(RB_EMPTY(&de->u.fict_root));
347 } else {
348 assert(de->u.de.ref_blks != 0);
349 dedup_ref_size += de->u.de.ref_size;
350 dedup_alloc_size += de->u.de.ref_size / de->u.de.ref_blks;
353 RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de);
354 free(de);
356 assert(RB_EMPTY(&dedup_tree));
357 if (DedupCrcEnd && VerboseOpt == 0)
358 printf(".");
359 } while (DedupCrcEnd);
361 printf("Dedup %s succeeded\n", av[0]);
362 relpfs(glob_fd, &glob_pfs);
364 humanize_unsigned(buf, sizeof(buf), dedup_ref_size, "B", 1024);
365 printf("Dedup ratio = %.2f (in this run)\n"
366 " %8s referenced\n",
367 ((dedup_alloc_size != 0) ?
368 (double)dedup_ref_size / dedup_alloc_size : 0),
371 humanize_unsigned(buf, sizeof(buf), dedup_alloc_size, "B", 1024);
372 printf(" %8s allocated\n", buf);
373 humanize_unsigned(buf, sizeof(buf), dedup_skipped_size, "B", 1024);
374 printf(" %8s skipped\n", buf);
375 printf(" %8jd CRC collisions\n"
376 " %8jd SHA collisions\n"
377 " %8jd big-block underflows\n"
378 " %8jd new dedup records\n"
379 " %8jd new dedup bytes\n",
380 (intmax_t)dedup_crc_failures,
381 (intmax_t)dedup_sha_failures,
382 (intmax_t)dedup_underflows,
383 (intmax_t)dedup_successes_count,
384 (intmax_t)dedup_successes_bytes
387 /* Once completed remove cycle file */
388 hammer_reset_cycle();
390 /* We don't want to mess up with other directives */
391 if (needfree) {
392 free(tmp);
393 CyclePath = NULL;
397 static
399 count_btree_elm(hammer_btree_leaf_elm_t scan_leaf __unused, int flags __unused)
401 return(1);
404 static
406 collect_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags __unused)
408 struct sim_dedup_entry *sim_de;
411 * If we are using too much memory we have to clean some out, which
412 * will cause the run to use multiple passes. Be careful of integer
413 * overflows!
415 if (MemoryUse > MemoryLimit) {
416 DedupCrcEnd = DedupCrcStart +
417 (uint32_t)(DedupCrcEnd - DedupCrcStart - 1) / 2;
418 if (VerboseOpt) {
419 printf("memory limit crc-range %08x-%08x\n",
420 DedupCrcStart, DedupCrcEnd);
421 fflush(stdout);
423 for (;;) {
424 sim_de = RB_MAX(sim_dedup_entry_rb_tree,
425 &sim_dedup_tree);
426 if (sim_de == NULL || sim_de->crc < DedupCrcEnd)
427 break;
428 RB_REMOVE(sim_dedup_entry_rb_tree,
429 &sim_dedup_tree, sim_de);
430 MemoryUse -= sizeof(*sim_de);
431 free(sim_de);
436 * Collect statistics based on the CRC only, do not try to read
437 * any data blocks or run SHA hashes.
439 sim_de = RB_LOOKUP(sim_dedup_entry_rb_tree, &sim_dedup_tree,
440 scan_leaf->data_crc);
442 if (sim_de == NULL) {
443 sim_de = calloc(1, sizeof(*sim_de));
444 sim_de->crc = scan_leaf->data_crc;
445 RB_INSERT(sim_dedup_entry_rb_tree, &sim_dedup_tree, sim_de);
446 MemoryUse += sizeof(*sim_de);
449 sim_de->ref_blks += 1;
450 sim_de->ref_size += scan_leaf->data_len;
451 return (1);
454 static __inline
456 validate_dedup_pair(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q)
458 if (HAMMER_ZONE(p->data_offset) != HAMMER_ZONE(q->data_offset))
459 return (1);
460 if (p->data_len != q->data_len)
461 return (1);
463 return (0);
466 #define DEDUP_TECH_FAILURE 1
467 #define DEDUP_CMP_FAILURE 2
468 #define DEDUP_INVALID_ZONE 3
469 #define DEDUP_UNDERFLOW 4
470 #define DEDUP_VERS_FAILURE 5
472 static __inline
474 deduplicate(hammer_btree_leaf_elm_t p, hammer_btree_leaf_elm_t q)
476 struct hammer_ioc_dedup dedup;
478 bzero(&dedup, sizeof(dedup));
481 * If data_offset fields are the same there is no need to run ioctl,
482 * candidate is already dedup'ed.
484 if (p->data_offset == q->data_offset)
485 return (0);
487 dedup.elm1 = p->base;
488 dedup.elm2 = q->base;
489 RunningIoctl = 1;
490 if (ioctl(glob_fd, HAMMERIOC_DEDUP, &dedup) < 0) {
491 if (errno == EOPNOTSUPP)
492 return (DEDUP_VERS_FAILURE); /* must be at least version 5 */
493 /* Technical failure - locking or w/e */
494 return (DEDUP_TECH_FAILURE);
496 if (dedup.head.flags & HAMMER_IOC_DEDUP_CMP_FAILURE)
497 return (DEDUP_CMP_FAILURE);
498 if (dedup.head.flags & HAMMER_IOC_DEDUP_INVALID_ZONE)
499 return (DEDUP_INVALID_ZONE);
500 if (dedup.head.flags & HAMMER_IOC_DEDUP_UNDERFLOW)
501 return (DEDUP_UNDERFLOW);
502 RunningIoctl = 0;
503 ++dedup_successes_count;
504 dedup_successes_bytes += p->data_len;
505 return (0);
508 static
510 process_btree_elm(hammer_btree_leaf_elm_t scan_leaf, int flags)
512 struct dedup_entry *de;
513 struct sha_dedup_entry *sha_de, temp;
514 struct pass2_dedup_entry *pass2_de;
515 int error;
518 * If we are using too much memory we have to clean some out, which
519 * will cause the run to use multiple passes. Be careful of integer
520 * overflows!
522 while (MemoryUse > MemoryLimit) {
523 DedupCrcEnd = DedupCrcStart +
524 (uint32_t)(DedupCrcEnd - DedupCrcStart - 1) / 2;
525 if (VerboseOpt) {
526 printf("memory limit crc-range %08x-%08x\n",
527 DedupCrcStart, DedupCrcEnd);
528 fflush(stdout);
531 for (;;) {
532 de = RB_MAX(dedup_entry_rb_tree, &dedup_tree);
533 if (de == NULL || de->leaf.data_crc < DedupCrcEnd)
534 break;
535 if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
536 while ((sha_de = RB_ROOT(&de->u.fict_root)) !=
537 NULL) {
538 RB_REMOVE(sha_dedup_entry_rb_tree,
539 &de->u.fict_root, sha_de);
540 MemoryUse -= sizeof(*sha_de);
541 free(sha_de);
544 RB_REMOVE(dedup_entry_rb_tree, &dedup_tree, de);
545 MemoryUse -= sizeof(*de);
546 free(de);
551 * Collect statistics based on the CRC. Colliding CRCs usually
552 * cause a SHA sub-tree to be created under the de.
554 * Trivial case if de not found.
556 de = RB_LOOKUP(dedup_entry_rb_tree, &dedup_tree, scan_leaf->data_crc);
557 if (de == NULL) {
558 de = calloc(1, sizeof(*de));
559 de->leaf = *scan_leaf;
560 RB_INSERT(dedup_entry_rb_tree, &dedup_tree, de);
561 MemoryUse += sizeof(*de);
562 goto upgrade_stats;
566 * Found entry in CRC tree
568 if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
570 * Optimize the case where a CRC failure results in multiple
571 * SHA entries. If we unconditionally issue a data-read a
572 * degenerate situation where a colliding CRC's second SHA
573 * entry contains the lion's share of the deduplication
574 * candidates will result in excessive data block reads.
576 * Deal with the degenerate case by looking for a matching
577 * data_offset/data_len in the SHA elements we already have
578 * before reading the data block and generating a new SHA.
580 RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) {
581 if (sha_de->leaf.data_offset ==
582 scan_leaf->data_offset &&
583 sha_de->leaf.data_len == scan_leaf->data_len) {
584 memcpy(temp.sha_hash, sha_de->sha_hash,
585 SHA256_DIGEST_LENGTH);
586 break;
591 * Entry in CRC tree is fictitious, so we already had problems
592 * with this CRC. Upgrade (compute SHA) the candidate and
593 * dive into SHA subtree. If upgrade fails insert the candidate
594 * into Pass2 list (it will be processed later).
596 if (sha_de == NULL) {
597 if (upgrade_chksum(scan_leaf, temp.sha_hash))
598 goto pass2_insert;
600 sha_de = RB_FIND(sha_dedup_entry_rb_tree,
601 &de->u.fict_root, &temp);
605 * Nothing in SHA subtree so far, so this is a new
606 * 'dataset'. Insert new entry into SHA subtree.
608 if (sha_de == NULL) {
609 sha_de = calloc(1, sizeof(*sha_de));
610 sha_de->leaf = *scan_leaf;
611 memcpy(sha_de->sha_hash, temp.sha_hash,
612 SHA256_DIGEST_LENGTH);
613 RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root,
614 sha_de);
615 MemoryUse += sizeof(*sha_de);
616 goto upgrade_stats_sha;
620 * Found entry in SHA subtree, it means we have a potential
621 * dedup pair. Validate it (zones have to match and data_len
622 * field have to be the same too. If validation fails, treat
623 * it as a SHA collision (jump to sha256_failure).
625 if (validate_dedup_pair(&sha_de->leaf, scan_leaf))
626 goto sha256_failure;
629 * We have a valid dedup pair (SHA match, validated).
631 * In case of technical failure (dedup pair was good, but
632 * ioctl failed anyways) insert the candidate into Pass2 list
633 * (we will try to dedup it after we are done with the rest of
634 * the tree).
636 * If ioctl fails because either of blocks is in the non-dedup
637 * zone (we can dedup only in LARGE_DATA and SMALL_DATA) don't
638 * bother with the candidate and terminate early.
640 * If ioctl fails because of big-block underflow replace the
641 * leaf node that found dedup entry represents with scan_leaf.
643 error = deduplicate(&sha_de->leaf, scan_leaf);
644 switch(error) {
645 case 0:
646 goto upgrade_stats_sha;
647 case DEDUP_TECH_FAILURE:
648 goto pass2_insert;
649 case DEDUP_CMP_FAILURE:
650 goto sha256_failure;
651 case DEDUP_INVALID_ZONE:
652 goto terminate_early;
653 case DEDUP_UNDERFLOW:
654 ++dedup_underflows;
655 sha_de->leaf = *scan_leaf;
656 memcpy(sha_de->sha_hash, temp.sha_hash,
657 SHA256_DIGEST_LENGTH);
658 goto upgrade_stats_sha;
659 case DEDUP_VERS_FAILURE:
660 errx(1, "HAMMER filesystem must be at least "
661 "version 5 to dedup");
662 /* not reached */
663 default:
664 fprintf(stderr, "Unknown error\n");
665 goto terminate_early;
669 * Ooh la la.. SHA-256 collision. Terminate early, there's
670 * nothing we can do here.
672 sha256_failure:
673 ++dedup_sha_failures;
674 goto terminate_early;
675 } else {
677 * Candidate CRC is good for now (we found an entry in CRC
678 * tree and it's not fictitious). This means we have a
679 * potential dedup pair.
681 if (validate_dedup_pair(&de->leaf, scan_leaf))
682 goto crc_failure;
685 * We have a valid dedup pair (CRC match, validated)
687 error = deduplicate(&de->leaf, scan_leaf);
688 switch(error) {
689 case 0:
690 goto upgrade_stats;
691 case DEDUP_TECH_FAILURE:
692 goto pass2_insert;
693 case DEDUP_CMP_FAILURE:
694 goto crc_failure;
695 case DEDUP_INVALID_ZONE:
696 goto terminate_early;
697 case DEDUP_UNDERFLOW:
698 ++dedup_underflows;
699 de->leaf = *scan_leaf;
700 goto upgrade_stats;
701 case DEDUP_VERS_FAILURE:
702 errx(1, "HAMMER filesystem must be at least "
703 "version 5 to dedup");
704 /* not reached */
705 default:
706 fprintf(stderr, "Unknown error\n");
707 goto terminate_early;
710 crc_failure:
712 * We got a CRC collision - either ioctl failed because of
713 * the comparison failure or validation of the potential
714 * dedup pair went bad. In all cases insert both blocks
715 * into SHA subtree (this requires checksum upgrade) and mark
716 * entry that corresponds to this CRC in the CRC tree
717 * fictitious, so that all futher operations with this CRC go
718 * through SHA subtree.
720 ++dedup_crc_failures;
723 * Insert block that was represented by now fictitious dedup
724 * entry (create a new SHA entry and preserve stats of the
725 * old CRC one). If checksum upgrade fails insert the
726 * candidate into Pass2 list and return - keep both trees
727 * unmodified.
729 sha_de = calloc(1, sizeof(*sha_de));
730 sha_de->leaf = de->leaf;
731 sha_de->ref_blks = de->u.de.ref_blks;
732 sha_de->ref_size = de->u.de.ref_size;
733 if (upgrade_chksum(&sha_de->leaf, sha_de->sha_hash)) {
734 free(sha_de);
735 goto pass2_insert;
737 MemoryUse += sizeof(*sha_de);
739 RB_INIT(&de->u.fict_root);
741 * Here we can insert without prior checking because the tree
742 * is empty at this point
744 RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
747 * Mark entry in CRC tree fictitious
749 de->flags |= HAMMER_DEDUP_ENTRY_FICTITIOUS;
752 * Upgrade checksum of the candidate and insert it into
753 * SHA subtree. If upgrade fails insert the candidate into
754 * Pass2 list.
756 if (upgrade_chksum(scan_leaf, temp.sha_hash))
757 goto pass2_insert;
758 sha_de = RB_FIND(sha_dedup_entry_rb_tree, &de->u.fict_root,
759 &temp);
760 if (sha_de != NULL) {
761 /* There is an entry with this SHA already, but the only
762 * RB-tree element at this point is that entry we just
763 * added. We know for sure these blocks are different
764 * (this is crc_failure branch) so treat it as SHA
765 * collision.
767 goto sha256_failure;
770 sha_de = calloc(1, sizeof(*sha_de));
771 sha_de->leaf = *scan_leaf;
772 memcpy(sha_de->sha_hash, temp.sha_hash, SHA256_DIGEST_LENGTH);
773 RB_INSERT(sha_dedup_entry_rb_tree, &de->u.fict_root, sha_de);
774 MemoryUse += sizeof(*sha_de);
775 goto upgrade_stats_sha;
778 upgrade_stats:
779 de->u.de.ref_blks += 1;
780 de->u.de.ref_size += scan_leaf->data_len;
781 return (1);
783 upgrade_stats_sha:
784 sha_de->ref_blks += 1;
785 sha_de->ref_size += scan_leaf->data_len;
786 return (1);
788 pass2_insert:
790 * If in pass2 mode don't insert anything, fall through to
791 * terminate_early
793 if ((flags & DEDUP_PASS2) == 0) {
794 pass2_de = calloc(1, sizeof(*pass2_de));
795 pass2_de->leaf = *scan_leaf;
796 STAILQ_INSERT_TAIL(&pass2_dedup_queue, pass2_de, sq_entry);
797 dedup_skipped_size += scan_leaf->data_len;
798 return (1);
801 terminate_early:
803 * Early termination path. Fixup stats.
805 dedup_alloc_size += scan_leaf->data_len;
806 dedup_ref_size += scan_leaf->data_len;
807 return (0);
810 static
812 upgrade_chksum(hammer_btree_leaf_elm_t leaf, uint8_t *sha_hash)
814 struct hammer_ioc_data data;
815 char *buf = malloc(DEDUP_BUF);
816 SHA256_CTX ctx;
817 int error;
819 bzero(&data, sizeof(data));
820 data.elm = leaf->base;
821 data.ubuf = buf;
822 data.size = DEDUP_BUF;
824 error = 0;
825 if (ioctl(glob_fd, HAMMERIOC_GET_DATA, &data) < 0) {
826 fprintf(stderr, "Get-data failed: %s\n", strerror(errno));
827 error = 1;
828 goto done;
830 DedupDataReads += leaf->data_len;
832 if (data.leaf.data_len != leaf->data_len) {
833 error = 1;
834 goto done;
837 if (data.leaf.base.btype == HAMMER_BTREE_TYPE_RECORD &&
838 data.leaf.base.rec_type == HAMMER_RECTYPE_DATA) {
839 SHA256_Init(&ctx);
840 SHA256_Update(&ctx, (void *)buf, data.leaf.data_len);
841 SHA256_Final(sha_hash, &ctx);
844 done:
845 free(buf);
846 return (error);
849 static
850 void
851 sigAlrm(int signo __unused)
853 SigAlrmFlag = 1;
856 static
857 void
858 sigInfo(int signo __unused)
860 SigInfoFlag = 1;
863 static
864 void
865 scan_pfs(char *filesystem, scan_pfs_cb_t func, const char *id)
867 struct hammer_ioc_mirror_rw mirror;
868 hammer_ioc_mrecord_any_t mrec;
869 struct hammer_btree_leaf_elm elm;
870 char *buf = malloc(DEDUP_BUF);
871 char buf1[8];
872 char buf2[8];
873 int offset, bytes;
875 SigInfoFlag = 0;
876 DedupDataReads = 0;
877 DedupCurrentRecords = 0;
878 signal(SIGINFO, sigInfo);
879 signal(SIGALRM, sigAlrm);
882 * Deduplication happens per element so hammer(8) is in full
883 * control of the ioctl()s to actually perform it. SIGALRM
884 * needs to be handled within hammer(8) but a checkpoint
885 * is needed for resuming. Use cycle file for that.
887 * Try to obtain the previous obj_id from the cycle file and
888 * if not available just start from the beginning.
890 bzero(&mirror, sizeof(mirror));
891 hammer_key_beg_init(&mirror.key_beg);
892 hammer_get_cycle(&mirror.key_beg, &mirror.tid_beg);
894 if (mirror.key_beg.obj_id != (int64_t)HAMMER_MIN_OBJID) {
895 if (VerboseOpt) {
896 fprintf(stderr, "%s: mirror-read: Resuming at object %016jx\n",
897 id, (uintmax_t)mirror.key_beg.obj_id);
901 hammer_key_end_init(&mirror.key_end);
903 mirror.tid_beg = glob_pfs.ondisk->sync_beg_tid;
904 mirror.tid_end = glob_pfs.ondisk->sync_end_tid;
905 mirror.head.flags |= HAMMER_IOC_MIRROR_NODATA; /* we want only keys */
906 mirror.ubuf = buf;
907 mirror.size = DEDUP_BUF;
908 mirror.pfs_id = glob_pfs.pfs_id;
909 mirror.shared_uuid = glob_pfs.ondisk->shared_uuid;
911 if (VerboseOpt && DedupCrcStart == 0) {
912 printf("%s %s: objspace %016jx:%04x %016jx:%04x\n",
913 id, filesystem,
914 (uintmax_t)mirror.key_beg.obj_id,
915 mirror.key_beg.localization,
916 (uintmax_t)mirror.key_end.obj_id,
917 mirror.key_end.localization);
918 printf("%s %s: pfs_id %d\n",
919 id, filesystem, glob_pfs.pfs_id);
921 fflush(stdout);
922 fflush(stderr);
924 do {
925 mirror.count = 0;
926 mirror.pfs_id = glob_pfs.pfs_id;
927 mirror.shared_uuid = glob_pfs.ondisk->shared_uuid;
928 if (ioctl(glob_fd, HAMMERIOC_MIRROR_READ, &mirror) < 0) {
929 err(1, "Mirror-read %s failed", filesystem);
930 /* not reached */
932 if (mirror.head.flags & HAMMER_IOC_HEAD_ERROR) {
933 errx(1, "Mirror-read %s fatal error %d",
934 filesystem, mirror.head.error);
935 /* not reached */
937 if (mirror.count) {
938 offset = 0;
939 while (offset < mirror.count) {
940 mrec = (void *)((char *)buf + offset);
941 bytes = HAMMER_HEAD_DOALIGN(mrec->head.rec_size);
942 if (offset + bytes > mirror.count) {
943 errx(1, "Misaligned record");
944 /* not reached */
946 assert((mrec->head.type &
947 HAMMER_MRECF_TYPE_MASK) ==
948 HAMMER_MREC_TYPE_REC);
949 offset += bytes;
950 elm = mrec->rec.leaf;
951 if (elm.base.btype != HAMMER_BTREE_TYPE_RECORD)
952 continue;
953 if (elm.base.rec_type != HAMMER_RECTYPE_DATA)
954 continue;
955 ++DedupCurrentRecords;
956 if (DedupCrcStart != DedupCrcEnd) {
957 if (elm.data_crc < DedupCrcStart)
958 continue;
959 if (DedupCrcEnd &&
960 elm.data_crc >= DedupCrcEnd) {
961 continue;
964 func(&elm, 0);
967 mirror.key_beg = mirror.key_cur;
968 if (DidInterrupt || SigAlrmFlag) {
969 if (VerboseOpt) {
970 fprintf(stderr, "%s\n",
971 (DidInterrupt ? "Interrupted" : "Timeout"));
973 hammer_set_cycle(&mirror.key_cur, mirror.tid_beg);
974 if (VerboseOpt) {
975 fprintf(stderr, "Cyclefile %s updated for "
976 "continuation\n", CyclePath);
978 exit(1);
980 if (SigInfoFlag) {
981 if (DedupTotalRecords) {
982 humanize_unsigned(buf1, sizeof(buf1),
983 DedupDataReads,
984 "B", 1024);
985 humanize_unsigned(buf2, sizeof(buf2),
986 dedup_successes_bytes,
987 "B", 1024);
988 fprintf(stderr, "%s count %7jd/%jd "
989 "(%02d.%02d%%) "
990 "ioread %s newddup %s\n",
992 (intmax_t)DedupCurrentRecords,
993 (intmax_t)DedupTotalRecords,
994 (int)(DedupCurrentRecords * 100 /
995 DedupTotalRecords),
996 (int)(DedupCurrentRecords * 10000 /
997 DedupTotalRecords % 100),
998 buf1, buf2);
999 } else {
1000 fprintf(stderr, "%s count %-7jd\n",
1002 (intmax_t)DedupCurrentRecords);
1004 SigInfoFlag = 0;
1006 } while (mirror.count != 0);
1008 signal(SIGINFO, SIG_IGN);
1009 signal(SIGALRM, SIG_IGN);
1011 free(buf);
1014 static
1015 void
1016 dump_simulated_dedup(void)
1018 struct sim_dedup_entry *sim_de;
1020 printf("=== Dumping simulated dedup entries:\n");
1021 RB_FOREACH(sim_de, sim_dedup_entry_rb_tree, &sim_dedup_tree) {
1022 printf("\tcrc=%08x cnt=%ju size=%ju\n",
1023 sim_de->crc,
1024 (intmax_t)sim_de->ref_blks,
1025 (intmax_t)sim_de->ref_size);
1027 printf("end of dump ===\n");
1030 static
1031 void
1032 dump_real_dedup(void)
1034 struct dedup_entry *de;
1035 struct sha_dedup_entry *sha_de;
1036 int i;
1038 printf("=== Dumping dedup entries:\n");
1039 RB_FOREACH(de, dedup_entry_rb_tree, &dedup_tree) {
1040 if (de->flags & HAMMER_DEDUP_ENTRY_FICTITIOUS) {
1041 printf("\tcrc=%08x fictitious\n", de->leaf.data_crc);
1043 RB_FOREACH(sha_de, sha_dedup_entry_rb_tree, &de->u.fict_root) {
1044 printf("\t\tcrc=%08x cnt=%ju size=%ju\n\t"
1045 "\t\tsha=",
1046 sha_de->leaf.data_crc,
1047 (intmax_t)sha_de->ref_blks,
1048 (intmax_t)sha_de->ref_size);
1049 for (i = 0; i < SHA256_DIGEST_LENGTH; ++i)
1050 printf("%02x", sha_de->sha_hash[i]);
1051 printf("\n");
1053 } else {
1054 printf("\tcrc=%08x cnt=%ju size=%ju\n",
1055 de->leaf.data_crc,
1056 (intmax_t)de->u.de.ref_blks,
1057 (intmax_t)de->u.de.ref_size);
1060 printf("end of dump ===\n");
1063 static
1064 void
1065 dedup_usage(int code)
1067 fprintf(stderr,
1068 "hammer dedup-simulate <filesystem>\n"
1069 "hammer dedup <filesystem>\n"
1071 exit(code);