Autogenerated HTML docs for v2.42.0-526-g3130c1
[git-htmldocs.git] / howto / recover-corrupted-object-harder.html
bloba589ca8bae039bc672c5b84d132c0a2c22b74df9
1 <?xml version="1.0" encoding="UTF-8"?>
2 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
3 "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
4 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
5 <head>
6 <meta http-equiv="Content-Type" content="application/xhtml+xml; charset=UTF-8" />
7 <meta name="generator" content="AsciiDoc 10.2.0" />
8 <title>How to recover an object from scratch</title>
9 <style type="text/css">
10 /* Shared CSS for AsciiDoc xhtml11 and html5 backends */
12 /* Default font. */
13 body {
14 font-family: Georgia,serif;
17 /* Title font. */
18 h1, h2, h3, h4, h5, h6,
19 div.title, caption.title,
20 thead, p.table.header,
21 #toctitle,
22 #author, #revnumber, #revdate, #revremark,
23 #footer {
24 font-family: Arial,Helvetica,sans-serif;
27 body {
28 margin: 1em 5% 1em 5%;
31 a {
32 color: blue;
33 text-decoration: underline;
35 a:visited {
36 color: fuchsia;
39 em {
40 font-style: italic;
41 color: navy;
44 strong {
45 font-weight: bold;
46 color: #083194;
49 h1, h2, h3, h4, h5, h6 {
50 color: #527bbd;
51 margin-top: 1.2em;
52 margin-bottom: 0.5em;
53 line-height: 1.3;
56 h1, h2, h3 {
57 border-bottom: 2px solid silver;
59 h2 {
60 padding-top: 0.5em;
62 h3 {
63 float: left;
65 h3 + * {
66 clear: left;
68 h5 {
69 font-size: 1.0em;
72 div.sectionbody {
73 margin-left: 0;
76 hr {
77 border: 1px solid silver;
80 p {
81 margin-top: 0.5em;
82 margin-bottom: 0.5em;
85 ul, ol, li > p {
86 margin-top: 0;
88 ul > li { color: #aaa; }
89 ul > li > * { color: black; }
91 .monospaced, code, pre {
92 font-family: "Courier New", Courier, monospace;
93 font-size: inherit;
94 color: navy;
95 padding: 0;
96 margin: 0;
98 pre {
99 white-space: pre-wrap;
102 #author {
103 color: #527bbd;
104 font-weight: bold;
105 font-size: 1.1em;
107 #email {
109 #revnumber, #revdate, #revremark {
112 #footer {
113 font-size: small;
114 border-top: 2px solid silver;
115 padding-top: 0.5em;
116 margin-top: 4.0em;
118 #footer-text {
119 float: left;
120 padding-bottom: 0.5em;
122 #footer-badges {
123 float: right;
124 padding-bottom: 0.5em;
127 #preamble {
128 margin-top: 1.5em;
129 margin-bottom: 1.5em;
131 div.imageblock, div.exampleblock, div.verseblock,
132 div.quoteblock, div.literalblock, div.listingblock, div.sidebarblock,
133 div.admonitionblock {
134 margin-top: 1.0em;
135 margin-bottom: 1.5em;
137 div.admonitionblock {
138 margin-top: 2.0em;
139 margin-bottom: 2.0em;
140 margin-right: 10%;
141 color: #606060;
144 div.content { /* Block element content. */
145 padding: 0;
148 /* Block element titles. */
149 div.title, caption.title {
150 color: #527bbd;
151 font-weight: bold;
152 text-align: left;
153 margin-top: 1.0em;
154 margin-bottom: 0.5em;
156 div.title + * {
157 margin-top: 0;
160 td div.title:first-child {
161 margin-top: 0.0em;
163 div.content div.title:first-child {
164 margin-top: 0.0em;
166 div.content + div.title {
167 margin-top: 0.0em;
170 div.sidebarblock > div.content {
171 background: #ffffee;
172 border: 1px solid #dddddd;
173 border-left: 4px solid #f0f0f0;
174 padding: 0.5em;
177 div.listingblock > div.content {
178 border: 1px solid #dddddd;
179 border-left: 5px solid #f0f0f0;
180 background: #f8f8f8;
181 padding: 0.5em;
184 div.quoteblock, div.verseblock {
185 padding-left: 1.0em;
186 margin-left: 1.0em;
187 margin-right: 10%;
188 border-left: 5px solid #f0f0f0;
189 color: #888;
192 div.quoteblock > div.attribution {
193 padding-top: 0.5em;
194 text-align: right;
197 div.verseblock > pre.content {
198 font-family: inherit;
199 font-size: inherit;
201 div.verseblock > div.attribution {
202 padding-top: 0.75em;
203 text-align: left;
205 /* DEPRECATED: Pre version 8.2.7 verse style literal block. */
206 div.verseblock + div.attribution {
207 text-align: left;
210 div.admonitionblock .icon {
211 vertical-align: top;
212 font-size: 1.1em;
213 font-weight: bold;
214 text-decoration: underline;
215 color: #527bbd;
216 padding-right: 0.5em;
218 div.admonitionblock td.content {
219 padding-left: 0.5em;
220 border-left: 3px solid #dddddd;
223 div.exampleblock > div.content {
224 border-left: 3px solid #dddddd;
225 padding-left: 0.5em;
228 div.imageblock div.content { padding-left: 0; }
229 span.image img { border-style: none; vertical-align: text-bottom; }
230 a.image:visited { color: white; }
232 dl {
233 margin-top: 0.8em;
234 margin-bottom: 0.8em;
236 dt {
237 margin-top: 0.5em;
238 margin-bottom: 0;
239 font-style: normal;
240 color: navy;
242 dd > *:first-child {
243 margin-top: 0.1em;
246 ul, ol {
247 list-style-position: outside;
249 ol.arabic {
250 list-style-type: decimal;
252 ol.loweralpha {
253 list-style-type: lower-alpha;
255 ol.upperalpha {
256 list-style-type: upper-alpha;
258 ol.lowerroman {
259 list-style-type: lower-roman;
261 ol.upperroman {
262 list-style-type: upper-roman;
265 div.compact ul, div.compact ol,
266 div.compact p, div.compact p,
267 div.compact div, div.compact div {
268 margin-top: 0.1em;
269 margin-bottom: 0.1em;
272 tfoot {
273 font-weight: bold;
275 td > div.verse {
276 white-space: pre;
279 div.hdlist {
280 margin-top: 0.8em;
281 margin-bottom: 0.8em;
283 div.hdlist tr {
284 padding-bottom: 15px;
286 dt.hdlist1.strong, td.hdlist1.strong {
287 font-weight: bold;
289 td.hdlist1 {
290 vertical-align: top;
291 font-style: normal;
292 padding-right: 0.8em;
293 color: navy;
295 td.hdlist2 {
296 vertical-align: top;
298 div.hdlist.compact tr {
299 margin: 0;
300 padding-bottom: 0;
303 .comment {
304 background: yellow;
307 .footnote, .footnoteref {
308 font-size: 0.8em;
311 span.footnote, span.footnoteref {
312 vertical-align: super;
315 #footnotes {
316 margin: 20px 0 20px 0;
317 padding: 7px 0 0 0;
320 #footnotes div.footnote {
321 margin: 0 0 5px 0;
324 #footnotes hr {
325 border: none;
326 border-top: 1px solid silver;
327 height: 1px;
328 text-align: left;
329 margin-left: 0;
330 width: 20%;
331 min-width: 100px;
334 div.colist td {
335 padding-right: 0.5em;
336 padding-bottom: 0.3em;
337 vertical-align: top;
339 div.colist td img {
340 margin-top: 0.3em;
343 @media print {
344 #footer-badges { display: none; }
347 #toc {
348 margin-bottom: 2.5em;
351 #toctitle {
352 color: #527bbd;
353 font-size: 1.1em;
354 font-weight: bold;
355 margin-top: 1.0em;
356 margin-bottom: 0.1em;
359 div.toclevel0, div.toclevel1, div.toclevel2, div.toclevel3, div.toclevel4 {
360 margin-top: 0;
361 margin-bottom: 0;
363 div.toclevel2 {
364 margin-left: 2em;
365 font-size: 0.9em;
367 div.toclevel3 {
368 margin-left: 4em;
369 font-size: 0.9em;
371 div.toclevel4 {
372 margin-left: 6em;
373 font-size: 0.9em;
376 span.aqua { color: aqua; }
377 span.black { color: black; }
378 span.blue { color: blue; }
379 span.fuchsia { color: fuchsia; }
380 span.gray { color: gray; }
381 span.green { color: green; }
382 span.lime { color: lime; }
383 span.maroon { color: maroon; }
384 span.navy { color: navy; }
385 span.olive { color: olive; }
386 span.purple { color: purple; }
387 span.red { color: red; }
388 span.silver { color: silver; }
389 span.teal { color: teal; }
390 span.white { color: white; }
391 span.yellow { color: yellow; }
393 span.aqua-background { background: aqua; }
394 span.black-background { background: black; }
395 span.blue-background { background: blue; }
396 span.fuchsia-background { background: fuchsia; }
397 span.gray-background { background: gray; }
398 span.green-background { background: green; }
399 span.lime-background { background: lime; }
400 span.maroon-background { background: maroon; }
401 span.navy-background { background: navy; }
402 span.olive-background { background: olive; }
403 span.purple-background { background: purple; }
404 span.red-background { background: red; }
405 span.silver-background { background: silver; }
406 span.teal-background { background: teal; }
407 span.white-background { background: white; }
408 span.yellow-background { background: yellow; }
410 span.big { font-size: 2em; }
411 span.small { font-size: 0.6em; }
413 span.underline { text-decoration: underline; }
414 span.overline { text-decoration: overline; }
415 span.line-through { text-decoration: line-through; }
417 div.unbreakable { page-break-inside: avoid; }
421 * xhtml11 specific
423 * */
425 div.tableblock {
426 margin-top: 1.0em;
427 margin-bottom: 1.5em;
429 div.tableblock > table {
430 border: 3px solid #527bbd;
432 thead, p.table.header {
433 font-weight: bold;
434 color: #527bbd;
436 p.table {
437 margin-top: 0;
439 /* Because the table frame attribute is overridden by CSS in most browsers. */
440 div.tableblock > table[frame="void"] {
441 border-style: none;
443 div.tableblock > table[frame="hsides"] {
444 border-left-style: none;
445 border-right-style: none;
447 div.tableblock > table[frame="vsides"] {
448 border-top-style: none;
449 border-bottom-style: none;
454 * html5 specific
456 * */
458 table.tableblock {
459 margin-top: 1.0em;
460 margin-bottom: 1.5em;
462 thead, p.tableblock.header {
463 font-weight: bold;
464 color: #527bbd;
466 p.tableblock {
467 margin-top: 0;
469 table.tableblock {
470 border-width: 3px;
471 border-spacing: 0px;
472 border-style: solid;
473 border-color: #527bbd;
474 border-collapse: collapse;
476 th.tableblock, td.tableblock {
477 border-width: 1px;
478 padding: 4px;
479 border-style: solid;
480 border-color: #527bbd;
483 table.tableblock.frame-topbot {
484 border-left-style: hidden;
485 border-right-style: hidden;
487 table.tableblock.frame-sides {
488 border-top-style: hidden;
489 border-bottom-style: hidden;
491 table.tableblock.frame-none {
492 border-style: hidden;
495 th.tableblock.halign-left, td.tableblock.halign-left {
496 text-align: left;
498 th.tableblock.halign-center, td.tableblock.halign-center {
499 text-align: center;
501 th.tableblock.halign-right, td.tableblock.halign-right {
502 text-align: right;
505 th.tableblock.valign-top, td.tableblock.valign-top {
506 vertical-align: top;
508 th.tableblock.valign-middle, td.tableblock.valign-middle {
509 vertical-align: middle;
511 th.tableblock.valign-bottom, td.tableblock.valign-bottom {
512 vertical-align: bottom;
517 * manpage specific
519 * */
521 body.manpage h1 {
522 padding-top: 0.5em;
523 padding-bottom: 0.5em;
524 border-top: 2px solid silver;
525 border-bottom: 2px solid silver;
527 body.manpage h2 {
528 border-style: none;
530 body.manpage div.sectionbody {
531 margin-left: 3em;
534 @media print {
535 body.manpage div#toc { display: none; }
539 </style>
540 <script type="text/javascript">
541 /*<![CDATA[*/
542 var asciidoc = { // Namespace.
544 /////////////////////////////////////////////////////////////////////
545 // Table Of Contents generator
546 /////////////////////////////////////////////////////////////////////
548 /* Author: Mihai Bazon, September 2002
549 * http://students.infoiasi.ro/~mishoo
551 * Table Of Content generator
552 * Version: 0.4
554 * Feel free to use this script under the terms of the GNU General Public
555 * License, as long as you do not remove or alter this notice.
558 /* modified by Troy D. Hanson, September 2006. License: GPL */
559 /* modified by Stuart Rackham, 2006, 2009. License: GPL */
561 // toclevels = 1..4.
562 toc: function (toclevels) {
564 function getText(el) {
565 var text = "";
566 for (var i = el.firstChild; i != null; i = i.nextSibling) {
567 if (i.nodeType == 3 /* Node.TEXT_NODE */) // IE doesn't speak constants.
568 text += i.data;
569 else if (i.firstChild != null)
570 text += getText(i);
572 return text;
575 function TocEntry(el, text, toclevel) {
576 this.element = el;
577 this.text = text;
578 this.toclevel = toclevel;
581 function tocEntries(el, toclevels) {
582 var result = new Array;
583 var re = new RegExp('[hH]([1-'+(toclevels+1)+'])');
584 // Function that scans the DOM tree for header elements (the DOM2
585 // nodeIterator API would be a better technique but not supported by all
586 // browsers).
587 var iterate = function (el) {
588 for (var i = el.firstChild; i != null; i = i.nextSibling) {
589 if (i.nodeType == 1 /* Node.ELEMENT_NODE */) {
590 var mo = re.exec(i.tagName);
591 if (mo && (i.getAttribute("class") || i.getAttribute("className")) != "float") {
592 result[result.length] = new TocEntry(i, getText(i), mo[1]-1);
594 iterate(i);
598 iterate(el);
599 return result;
602 var toc = document.getElementById("toc");
603 if (!toc) {
604 return;
607 // Delete existing TOC entries in case we're reloading the TOC.
608 var tocEntriesToRemove = [];
609 var i;
610 for (i = 0; i < toc.childNodes.length; i++) {
611 var entry = toc.childNodes[i];
612 if (entry.nodeName.toLowerCase() == 'div'
613 && entry.getAttribute("class")
614 && entry.getAttribute("class").match(/^toclevel/))
615 tocEntriesToRemove.push(entry);
617 for (i = 0; i < tocEntriesToRemove.length; i++) {
618 toc.removeChild(tocEntriesToRemove[i]);
621 // Rebuild TOC entries.
622 var entries = tocEntries(document.getElementById("content"), toclevels);
623 for (var i = 0; i < entries.length; ++i) {
624 var entry = entries[i];
625 if (entry.element.id == "")
626 entry.element.id = "_toc_" + i;
627 var a = document.createElement("a");
628 a.href = "#" + entry.element.id;
629 a.appendChild(document.createTextNode(entry.text));
630 var div = document.createElement("div");
631 div.appendChild(a);
632 div.className = "toclevel" + entry.toclevel;
633 toc.appendChild(div);
635 if (entries.length == 0)
636 toc.parentNode.removeChild(toc);
640 /////////////////////////////////////////////////////////////////////
641 // Footnotes generator
642 /////////////////////////////////////////////////////////////////////
644 /* Based on footnote generation code from:
645 * http://www.brandspankingnew.net/archive/2005/07/format_footnote.html
648 footnotes: function () { // (Re)builds the #footnotes block from the span.footnote elements found in #content.
649 // Delete existing footnote entries in case we're reloading the footnotes.
650 var i;
651 var noteholder = document.getElementById("footnotes");
652 if (!noteholder) {
653 return;
655 var entriesToRemove = [];
656 for (i = 0; i < noteholder.childNodes.length; i++) {
657 var entry = noteholder.childNodes[i];
658 if (entry.nodeName.toLowerCase() == 'div' && entry.getAttribute("class") == "footnote")
659 entriesToRemove.push(entry);
661 for (i = 0; i < entriesToRemove.length; i++) {
662 noteholder.removeChild(entriesToRemove[i]);
665 // Rebuild footnote entries.
666 var cont = document.getElementById("content");
667 var spans = cont.getElementsByTagName("span");
668 var refs = {}; // maps "#" + span id to its footnote number, used by the footnoteref pass below
669 var n = 0; // running footnote number
670 for (i=0; i<spans.length; i++) {
671 if (spans[i].className == "footnote") {
672 n++;
673 var note = spans[i].getAttribute("data-note"); // set on a previous run; absent the first time through
674 if (!note) {
675 // Use [\s\S] in place of . so multi-line matches work.
676 // Because JavaScript has no s (dotall) regex flag.
677 note = spans[i].innerHTML.match(/\s*\[([\s\S]*)]\s*/)[1];
678 spans[i].innerHTML =
679 "[<a id='_footnoteref_" + n + "' href='#_footnote_" + n +
680 "' title='View footnote' class='footnote'>" + n + "</a>]";
681 spans[i].setAttribute("data-note", note);
683 noteholder.innerHTML +=
684 "<div class='footnote' id='_footnote_" + n + "'>" +
685 "<a href='#_footnoteref_" + n + "' title='Return to text'>" +
686 n + "</a>. " + note + "</div>";
687 var id =spans[i].getAttribute("id");
688 if (id != null) refs["#"+id] = n;
691 if (n == 0)
692 noteholder.parentNode.removeChild(noteholder);
693 else {
694 // Process footnoterefs.
695 for (i=0; i<spans.length; i++) {
696 if (spans[i].className == "footnoteref") {
697 var href = spans[i].getElementsByTagName("a")[0].getAttribute("href");
698 href = href.match(/#.*/)[0]; // Because IE returns the full URL.
699 n = refs[href];
700 spans[i].innerHTML =
701 "[<a href='#_footnote_" + n +
702 "' title='View footnote' class='footnote'>" + n + "</a>]";
708 install: function(toclevels) {
709 var timerId;
711 function reinstall() {
712 asciidoc.footnotes();
713 if (toclevels) {
714 asciidoc.toc(toclevels);
718 function reinstallAndRemoveTimer() {
719 clearInterval(timerId);
720 reinstall();
723 timerId = setInterval(reinstall, 500);
724 if (document.addEventListener)
725 document.addEventListener("DOMContentLoaded", reinstallAndRemoveTimer, false);
726 else
727 window.onload = reinstallAndRemoveTimer;
731 asciidoc.install();
732 /*]]>*/
733 </script>
734 </head>
735 <body class="article">
736 <div id="header">
737 <h1>How to recover an object from scratch</h1>
738 <span id="revdate">2023-10-29</span>
739 </div>
740 <div id="content">
741 <div id="preamble">
742 <div class="sectionbody">
743 <div class="paragraph"><p>I was recently presented with a repository with a corrupted packfile,
744 and was asked if the data was recoverable. This post-mortem describes
745 the steps I took to investigate and fix the problem. I thought others
746 might find the process interesting, and it might help somebody in the
747 same situation.</p></div>
748 <div class="sidebarblock">
749 <div class="content">
750 <div class="paragraph"><p>Note: In this case, no good copy of the repository was available. For
751 the much easier case where you can get the corrupted object from
752 elsewhere, see <a href="recover-corrupted-blob-object.html">this howto</a>.</p></div>
753 </div></div>
754 <div class="paragraph"><p>I started with an fsck, which found a problem with exactly one object
755 (I&#8217;ve used $pack and $obj below to keep the output readable, and also
756 because I&#8217;ll refer to them later):</p></div>
757 <div class="listingblock">
758 <div class="content">
759 <pre><code> $ git fsck
760 error: $pack SHA1 checksum mismatch
761 error: index CRC mismatch for object $obj from $pack at offset 51653873
762 error: inflate: data stream error (incorrect data check)
763 error: cannot unpack $obj from $pack at offset 51653873</code></pre>
764 </div></div>
765 <div class="paragraph"><p>The pack checksum failing means a byte is munged somewhere, and it is
766 presumably in the object mentioned (since both the index checksum and
767 zlib were failing).</p></div>
768 <div class="paragraph"><p>Reading the zlib source code, I found that "incorrect data check" means
769 that the adler-32 checksum at the end of the zlib data did not match the
770 inflated data. So stepping the data through zlib would not help, as it
771 did not fail until the very end, when we realize the checksum does not match.
772 The problematic bytes could be anywhere in the object data.</p></div>
773 <div class="paragraph"><p>The first thing I did was pull the broken data out of the packfile. I
774 needed to know how big the object was, which I found out with:</p></div>
775 <div class="listingblock">
776 <div class="content">
777 <pre><code> $ git show-index &lt;$idx | cut -d' ' -f1 | sort -n | grep -A1 51653873
778 51653873
779 51664736</code></pre>
780 </div></div>
781 <div class="paragraph"><p>Show-index gives us the list of objects and their offsets. We throw away
782 everything but the offsets, and then sort them so that our interesting
783 offset (which we got from the fsck output above) is followed immediately
784 by the offset of the next object. Now we know that the object data is
785 10863 bytes long, and we can grab it with:</p></div>
786 <div class="listingblock">
787 <div class="content">
788 <pre><code> dd if=$pack of=object bs=1 skip=51653873 count=10863</code></pre>
789 </div></div>
790 <div class="paragraph"><p>I inspected a hexdump of the data, looking for any obvious bogosity
791 (e.g., a 4K run of zeroes would be a good sign of filesystem
792 corruption). But everything looked pretty reasonable.</p></div>
793 <div class="paragraph"><p>Note that the "object" file isn&#8217;t fit for feeding straight to zlib; it
794 has the git packed object header, which is variable-length. We want to
795 strip that off so we can start playing with the zlib data directly. You
796 can either work your way through it manually (the format is described in
797 <a href="../gitformat-pack.html">gitformat-pack(5)</a>),
798 or you can walk through it in a debugger. I did the latter, creating a
799 valid pack like:</p></div>
800 <div class="listingblock">
801 <div class="content">
802 <pre><code> # pack magic and version
803 printf 'PACK\0\0\0\2' &gt;tmp.pack
804 # pack has one object
805 printf '\0\0\0\1' &gt;&gt;tmp.pack
806 # now add our object data
807 cat object &gt;&gt;tmp.pack
808 # and then append the pack trailer
809 /path/to/git.git/t/helper/test-tool sha1 -b &lt;tmp.pack &gt;trailer
810 cat trailer &gt;&gt;tmp.pack</code></pre>
811 </div></div>
812 <div class="paragraph"><p>and then running "git index-pack tmp.pack" in the debugger (stop at
813 unpack_raw_entry). Doing this, I found that there were 3 bytes of header
814 (and the header itself had a sane type and size). So I stripped those
815 off with:</p></div>
816 <div class="listingblock">
817 <div class="content">
818 <pre><code> dd if=object of=zlib bs=1 skip=3</code></pre>
819 </div></div>
820 <div class="paragraph"><p>I ran the result through zlib&#8217;s inflate using a custom C program. And
821 while it did report the error, I did get the right number of output
822 bytes (i.e., it matched git&#8217;s size header that we decoded above). But
823 feeding the result back to "git hash-object" didn&#8217;t produce the same
824 sha1. So there were some wrong bytes, but I didn&#8217;t know which. The file
825 happened to be C source code, so I hoped I could notice something
826 obviously wrong with it, but I didn&#8217;t. I even got it to compile!</p></div>
827 <div class="paragraph"><p>I also tried comparing it to other versions of the same path in the
828 repository, hoping that there would be some part of the diff that didn&#8217;t
829 make sense. Unfortunately, this happened to be the only revision of this
830 particular file in the repository, so I had nothing to compare against.</p></div>
831 <div class="paragraph"><p>So I took a different approach. Working under the guess that the
832 corruption was limited to a single byte, I wrote a program to munge each
833 byte individually, and try inflating the result. Since the object was
834 only 10K compressed, that worked out to about 2.5M attempts, which took
835 a few minutes.</p></div>
836 <div class="paragraph"><p>The program I used is here:</p></div>
837 <div class="listingblock">
838 <div class="content">
839 <pre><code>#include &lt;stdio.h&gt;
840 #include &lt;unistd.h&gt;
841 #include &lt;string.h&gt;
842 #include &lt;signal.h&gt;
843 #include &lt;zlib.h&gt;
845 static int try_zlib(unsigned char *buf, int len) /* try to inflate buf[0..len) in one shot; the result says whether zlib accepted it */
847 /* make this absurdly large so we don't have to loop */
848 static unsigned char out[1024*1024];
849 z_stream z;
850 int ret;
852 memset(&amp;z, 0, sizeof(z));
853 inflateInit(&amp;z);
855 z.next_in = buf;
856 z.avail_in = len;
857 z.next_out = out;
858 z.avail_out = sizeof(out);
860 ret = inflate(&amp;z, 0); /* single call; the inflated bytes land in out and are never read here */
861 inflateEnd(&amp;z);
862 return ret &gt;= 0; /* zlib error codes are negative, so this is nonzero when inflate() reported no error */
865 /* eye candy */
866 static int counter = 0;
867 static void progress(int sig)
869 fprintf(stderr, "\r%d", counter);
870 alarm(1);
873 int main(void)
875 /* oversized so we can read the whole buffer in */
876 unsigned char buf[1024*1024];
877 int len;
878 unsigned i, j;
880 signal(SIGALRM, progress);
881 alarm(1);
883 len = read(0, buf, sizeof(buf));
884 for (i = 0; i &lt; len; i++) {
885 unsigned char c = buf[i];
886 for (j = 0; j &lt;= 0xff; j++) {
887 buf[i] = j;
889 counter++;
890 if (try_zlib(buf, len))
891 printf("i=%d, j=%x\n", i, j);
893 buf[i] = c;
896 alarm(0);
897 fprintf(stderr, "\n");
898 return 0;
899 }</code></pre>
900 </div></div>
901 <div class="paragraph"><p>I compiled and ran with:</p></div>
902 <div class="listingblock">
903 <div class="content">
904 <pre><code> gcc -Wall -Werror -O3 munge.c -o munge -lz
905 ./munge &lt;zlib</code></pre>
906 </div></div>
907 <div class="paragraph"><p>There were a few false positives early on (if you write "no data" in the
908 zlib header, zlib thinks it&#8217;s just fine :) ). But I got a hit about
909 halfway through:</p></div>
910 <div class="listingblock">
911 <div class="content">
912 <pre><code> i=5642, j=c7</code></pre>
913 </div></div>
914 <div class="paragraph"><p>I let it run to completion, and got a few more hits at the end (where it
915 was munging the adler-32 checksum to match our broken data). So there was a good
916 chance this middle hit was the source of the problem.</p></div>
917 <div class="paragraph"><p>I confirmed by tweaking the byte in a hex editor, zlib inflating the
918 result (no errors!), and then piping the output into "git hash-object",
919 which reported the sha1 of the broken object. Success!</p></div>
920 <div class="paragraph"><p>I fixed the packfile itself with:</p></div>
921 <div class="listingblock">
922 <div class="content">
923 <pre><code> chmod +w $pack
924 printf '\xc7' | dd of=$pack bs=1 seek=51659518 conv=notrunc
925 chmod -w $pack</code></pre>
926 </div></div>
927 <div class="paragraph"><p>The <code>\xc7</code> comes from the replacement byte our "munge" program found.
928 The offset 51659518 is derived by taking the original object offset
929 (51653873), adding the replacement offset found by "munge" (5642), and
930 then adding back in the 3 bytes of git header we stripped.</p></div>
931 <div class="paragraph"><p>After that, "git fsck" ran clean.</p></div>
932 <div class="paragraph"><p>As for the corruption itself, I was lucky that it was indeed a single
933 byte. In fact, it turned out to be a single bit. The byte 0xc7 was
934 corrupted to 0xc5. So presumably it was caused by faulty hardware, or a
935 cosmic ray.</p></div>
936 <div class="paragraph"><p>And the aborted attempt to look at the inflated output to see what was
937 wrong? I could have looked forever and never found it. Here&#8217;s the diff
938 between what the corrupted data inflates to, versus the real data:</p></div>
939 <div class="listingblock">
940 <div class="content">
941 <pre><code> - cp = strtok (arg, "+");
942 + cp = strtok (arg, ".");</code></pre>
943 </div></div>
944 <div class="paragraph"><p>It tweaked one byte and still ended up as valid, readable C that just
945 happened to do something totally different! One takeaway is that on a
946 less unlucky day, looking at the zlib output might have actually been
947 helpful, as most random changes would actually break the C code.</p></div>
948 <div class="paragraph"><p>But more importantly, git&#8217;s hashing and checksumming noticed a problem
949 that easily could have gone undetected in another system. The result
950 still compiled, but would have caused an interesting bug (that would
951 have been blamed on some random commit).</p></div>
952 </div>
953 </div>
954 <div class="sect1">
955 <h2 id="_the_adventure_continues_8230">The adventure continues&#8230;</h2>
956 <div class="sectionbody">
957 <div class="paragraph"><p>I ended up doing this again! Same entity, new hardware. The assumption
958 at this point is that the old disk corrupted the packfile, and then the
959 corruption was migrated to the new hardware (because it was done by
960 rsync or similar, and no fsck was done at the time of migration).</p></div>
961 <div class="paragraph"><p>This time, the affected blob was over 20 megabytes, which was far too
962 large to do a brute-force on. I followed the instructions above to
963 create the <code>zlib</code> file. I then used the <code>inflate</code> program below to pull
964 the corrupted data from that. Examining that output gave me a hint about
965 where in the file the corruption was. But now I was working with the
966 file itself, not the zlib contents. So knowing the sha1 of the object
967 and the approximate area of the corruption, I used the <code>sha1-munge</code>
968 program below to brute-force the correct byte.</p></div>
969 <div class="paragraph"><p>Here&#8217;s the inflate program (it&#8217;s essentially <code>gunzip</code> but without the
970 <code>.gz</code> header processing):</p></div>
971 <div class="listingblock">
972 <div class="content">
973 <pre><code>#include &lt;stdio.h&gt;
974 #include &lt;string.h&gt;
975 #include &lt;zlib.h&gt;
976 #include &lt;stdlib.h&gt;
978 int main(int argc, char **argv) /* inflate a raw zlib stream from stdin to stdout, writing output even when zlib reports an error */
981 * oversized so we can read the whole buffer in;
982 * this could actually be switched to streaming
983 * to avoid any memory limitations
985 static unsigned char buf[25 * 1024 * 1024];
986 static unsigned char out[25 * 1024 * 1024];
987 int len;
988 z_stream z;
989 int ret;
991 len = read(0, buf, sizeof(buf)); /* NOTE(review): one read() call with its result unchecked; unistd.h is not among this listing's includes, presumably read() gets declared via zlib.h -- confirm */
992 memset(&amp;z, 0, sizeof(z));
993 inflateInit(&amp;z);
995 z.next_in = buf;
996 z.avail_in = len;
997 z.next_out = out;
998 z.avail_out = sizeof(out);
1000 ret = inflate(&amp;z, 0); /* a failure is only reported below; whatever was inflated is still written out */
1001 if (ret != Z_OK &amp;&amp; ret != Z_STREAM_END)
1002 fprintf(stderr, "initial inflate failed (%d)\n", ret);
1004 fprintf(stderr, "outputting %lu bytes", z.total_out); /* NOTE(review): this message has no trailing newline */
1005 fwrite(out, 1, z.total_out, stdout);
1006 return 0;
1007 }</code></pre>
1008 </div></div>
1009 <div class="paragraph"><p>And here is the <code>sha1-munge</code> program:</p></div>
1010 <div class="listingblock">
1011 <div class="content">
1012 <pre><code>#include &lt;stdio.h&gt;
1013 #include &lt;unistd.h&gt;
1014 #include &lt;string.h&gt;
1015 #include &lt;signal.h&gt;
1016 #include &lt;openssl/sha.h&gt;
1017 #include &lt;stdlib.h&gt;
1019 /* eye candy */
1020 static int counter = 0;
1021 static void progress(int sig)
1023 fprintf(stderr, "\r%d", counter);
1024 alarm(1);
1027 static const signed char hexval_table[256] = { /* value of each byte as a hex digit (0-15); -1 marks a byte that is not a hex digit */
1028 -1, -1, -1, -1, -1, -1, -1, -1, /* 00-07 */
1029 -1, -1, -1, -1, -1, -1, -1, -1, /* 08-0f */
1030 -1, -1, -1, -1, -1, -1, -1, -1, /* 10-17 */
1031 -1, -1, -1, -1, -1, -1, -1, -1, /* 18-1f */
1032 -1, -1, -1, -1, -1, -1, -1, -1, /* 20-27 */
1033 -1, -1, -1, -1, -1, -1, -1, -1, /* 28-2f */
1034 0, 1, 2, 3, 4, 5, 6, 7, /* 30-37 */
1035 8, 9, -1, -1, -1, -1, -1, -1, /* 38-3f */
1036 -1, 10, 11, 12, 13, 14, 15, -1, /* 40-47 */
1037 -1, -1, -1, -1, -1, -1, -1, -1, /* 48-4f */
1038 -1, -1, -1, -1, -1, -1, -1, -1, /* 50-57 */
1039 -1, -1, -1, -1, -1, -1, -1, -1, /* 58-5f */
1040 -1, 10, 11, 12, 13, 14, 15, -1, /* 60-67 */
1041 -1, -1, -1, -1, -1, -1, -1, -1, /* 68-6f */
1042 -1, -1, -1, -1, -1, -1, -1, -1, /* 70-77 */
1043 -1, -1, -1, -1, -1, -1, -1, -1, /* 78-7f */
1044 -1, -1, -1, -1, -1, -1, -1, -1, /* 80-87 */
1045 -1, -1, -1, -1, -1, -1, -1, -1, /* 88-8f */
1046 -1, -1, -1, -1, -1, -1, -1, -1, /* 90-97 */
1047 -1, -1, -1, -1, -1, -1, -1, -1, /* 98-9f */
1048 -1, -1, -1, -1, -1, -1, -1, -1, /* a0-a7 */
1049 -1, -1, -1, -1, -1, -1, -1, -1, /* a8-af */
1050 -1, -1, -1, -1, -1, -1, -1, -1, /* b0-b7 */
1051 -1, -1, -1, -1, -1, -1, -1, -1, /* b8-bf */
1052 -1, -1, -1, -1, -1, -1, -1, -1, /* c0-c7 */
1053 -1, -1, -1, -1, -1, -1, -1, -1, /* c8-cf */
1054 -1, -1, -1, -1, -1, -1, -1, -1, /* d0-d7 */
1055 -1, -1, -1, -1, -1, -1, -1, -1, /* d8-df */
1056 -1, -1, -1, -1, -1, -1, -1, -1, /* e0-e7 */
1057 -1, -1, -1, -1, -1, -1, -1, -1, /* e8-ef */
1058 -1, -1, -1, -1, -1, -1, -1, -1, /* f0-f7 */
1059 -1, -1, -1, -1, -1, -1, -1, -1, /* f8-ff */
1062 static inline unsigned int hexval(unsigned char c)
1064 return hexval_table[c];
1067 static int get_sha1_hex(const char *hex, unsigned char *sha1)
1069 int i;
1070 for (i = 0; i &lt; 20; i++) {
1071 unsigned int val;
1073 * hex[1]=='\0' is caught when val is checked below,
1074 * but if hex[0] is NUL we have to avoid reading
1075 * past the end of the string:
1077 if (!hex[0])
1078 return -1;
1079 val = (hexval(hex[0]) &lt;&lt; 4) | hexval(hex[1]);
1080 if (val &amp; ~0xff)
1081 return -1;
1082 *sha1++ = val;
1083 hex += 2;
1085 return 0;
1088 int main(int argc, char **argv)
1090 /* oversized so we can read the whole buffer in */
1091 static unsigned char buf[25 * 1024 * 1024];
1092 char header[32];
1093 int header_len;
1094 unsigned char have[20], want[20];
1095 int start, len;
1096 SHA_CTX orig;
1097 unsigned i, j;
1099 if (!argv[1] || get_sha1_hex(argv[1], want)) {
1100 fprintf(stderr, "usage: sha1-munge &lt;sha1&gt; [start] &lt;file.in\n");
1101 return 1;
1104 if (argv[2])
1105 start = atoi(argv[2]);
1106 else
1107 start = 0;
1109 len = read(0, buf, sizeof(buf));
1110 header_len = sprintf(header, "blob %d", len) + 1;
1111 fprintf(stderr, "using header: %s\n", header);
1114 * We keep a running sha1 so that if you are munging
1115 * near the end of the file, we do not have to re-sha1
1116 * the unchanged earlier bytes
1118 SHA1_Init(&amp;orig);
1119 SHA1_Update(&amp;orig, header, header_len);
1120 if (start)
1121 SHA1_Update(&amp;orig, buf, start);
1123 signal(SIGALRM, progress);
1124 alarm(1);
1126 for (i = start; i &lt; len; i++) {
1127 unsigned char c;
1128 SHA_CTX x;
1130 #if 0
1132 * deletion -- this would not actually work in practice,
1133 * I think, because we've already committed to a
1134 * particular size in the header. Ditto for addition
1135 * below. In those cases, you'd have to do the whole
1136 * sha1 from scratch, or possibly keep three running
1137 * "orig" sha1 computations going.
1139 memcpy(&amp;x, &amp;orig, sizeof(x));
1140 SHA1_Update(&amp;x, buf + i + 1, len - i - 1);
1141 SHA1_Final(have, &amp;x);
1142 if (!memcmp(have, want, 20))
1143 printf("i=%d, deletion\n", i);
1144 #endif
1147 * replacement -- note that this tries each of the 256
1148 * possible bytes. If you suspect a single-bit flip,
1149 * it would be much shorter to just try the 8
1150 * bit-flipped variants.
1152 c = buf[i];
1153 for (j = 0; j &lt;= 0xff; j++) {
1154 buf[i] = j;
1156 memcpy(&amp;x, &amp;orig, sizeof(x));
1157 SHA1_Update(&amp;x, buf + i, len - i);
1158 SHA1_Final(have, &amp;x);
1159 if (!memcmp(have, want, 20))
1160 printf("i=%d, j=%02x\n", i, j);
1162 buf[i] = c;
1164 #if 0
1165 /* addition */
1166 for (j = 0; j &lt;= 0xff; j++) {
1167 unsigned char extra = j;
1168 memcpy(&amp;x, &amp;orig, sizeof(x));
1169 SHA1_Update(&amp;x, &amp;extra, 1);
1170 SHA1_Update(&amp;x, buf + i, len - i);
1171 SHA1_Final(have, &amp;x);
1172 if (!memcmp(have, want, 20))
1173 printf("i=%d, addition=%02x", i, j);
1175 #endif
1177 SHA1_Update(&amp;orig, buf + i, 1);
1178 counter++;
1181 alarm(0);
1182 fprintf(stderr, "\r%d\n", counter);
1183 return 0;
1184 }</code></pre>
1185 </div></div>
1186 </div>
1187 </div>
1188 </div>
1189 <div id="footnotes"><hr /></div>
1190 <div id="footer">
1191 <div id="footer-text">
1192 Last updated
1193 2023-10-30 08:42:32 JST
1194 </div>
1195 </div>
1196 </body>
1197 </html>