Object validation tests for "jgit fsck"
[egit.git] / org.spearce.jgit / src / org / spearce / jgit / util / RawParseUtils.java
bloba31734b17bf880b59b6dc7b156a2a13f446397c1
/*
 * Copyright (C) 2008, Shawn O. Pearce <spearce@spearce.org>
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the following
 * conditions are met:
 *
 * - Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above
 *   copyright notice, this list of conditions and the following
 *   disclaimer in the documentation and/or other materials provided
 *   with the distribution.
 *
 * - Neither the name of the Git Development Community nor the
 *   names of its contributors may be used to endorse or promote
 *   products derived from this software without specific prior
 *   written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
38 package org.spearce.jgit.util;
40 import static org.spearce.jgit.lib.ObjectChecker.author;
41 import static org.spearce.jgit.lib.ObjectChecker.committer;
42 import static org.spearce.jgit.lib.ObjectChecker.encoding;
44 import java.nio.ByteBuffer;
45 import java.nio.charset.Charset;
46 import java.util.Arrays;
48 import org.spearce.jgit.lib.Constants;
49 import org.spearce.jgit.lib.PersonIdent;
/** Handy utility functions to parse raw object contents. */
52 public final class RawParseUtils {
/**
 * Lookup table from ASCII code to decimal digit value; every entry that
 * is not one of '0'..'9' holds -1.
 */
private static final byte[] digits;

static {
	// The table only spans codes 0..'9'; any larger code is outside it.
	digits = new byte['9' + 1];
	Arrays.fill(digits, (byte) -1);
	for (char i = '0'; i <= '9'; i++)
		digits[i] = (byte) (i - '0');
}
62 /**
63 * Determine if b[ptr] matches src.
65 * @param b
66 * the buffer to scan.
67 * @param ptr
68 * first position within b, this should match src[0].
69 * @param src
70 * the buffer to test for equality with b.
71 * @return ptr += src.length if b[ptr..src.length] == src; else -1.
73 public static final int match(final byte[] b, int ptr, final byte[] src) {
74 if (ptr + src.length >= b.length)
75 return -1;
76 for (int i = 0; i < src.length; i++, ptr++)
77 if (b[ptr] != src[i])
78 return -1;
79 return ptr;
82 private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5',
83 '6', '7', '8', '9' };
85 /**
86 * Format a base 10 numeric into a temporary buffer.
87 * <p>
88 * Formatting is performed backwards. The method starts at offset
89 * <code>o-1</code> and ends at <code>o-1-digits</code>, where
90 * <code>digits</code> is the number of positions necessary to store the
91 * base 10 value.
92 * <p>
93 * The argument and return values from this method make it easy to chain
94 * writing, for example:
95 * </p>
97 * <pre>
98 * final byte[] tmp = new byte[64];
99 * int ptr = tmp.length;
100 * tmp[--ptr] = '\n';
101 * ptr = RawParseUtils.formatBase10(tmp, ptr, 32);
102 * tmp[--ptr] = ' ';
103 * ptr = RawParseUtils.formatBase10(tmp, ptr, 18);
104 * tmp[--ptr] = 0;
105 * final String str = new String(tmp, ptr, tmp.length - ptr);
106 * </pre>
108 * @param b
109 * buffer to write into.
110 * @param o
111 * one offset past the location where writing will begin; writing
112 * proceeds towards lower index values.
113 * @param value
114 * the value to store.
115 * @return the new offset value <code>o</code>. This is the position of
116 * the last byte written. Additional writing should start at one
117 * position earlier.
119 public static int formatBase10(final byte[] b, int o, int value) {
120 if (value == 0) {
121 b[--o] = '0';
122 return o;
124 final boolean isneg = value < 0;
125 while (value != 0) {
126 b[--o] = base10byte[value % 10];
127 value /= 10;
129 if (isneg)
130 b[--o] = '-';
131 return o;
135 * Parse a base 10 numeric from a sequence of ASCII digits.
136 * <p>
137 * Digit sequences can begin with an optional run of spaces before the
138 * sequence, and may start with a '+' or a '-' to indicate sign position.
139 * Any other characters will cause the method to stop and return the current
140 * result to the caller.
142 * @param b
143 * buffer to scan.
144 * @param ptr
145 * position within buffer to start parsing digits at.
146 * @param ptrResult
147 * optional location to return the new ptr value through. If null
148 * the ptr value will be discarded.
149 * @return the value at this location; 0 if the location is not a valid
150 * numeric.
152 public static final int parseBase10(final byte[] b, int ptr,
153 final MutableInteger ptrResult) {
154 int r = 0;
155 int sign = 0;
156 try {
157 final int sz = b.length;
158 while (ptr < sz && b[ptr] == ' ')
159 ptr++;
160 if (ptr >= sz)
161 return 0;
163 switch (b[ptr]) {
164 case '-':
165 sign = -1;
166 ptr++;
167 break;
168 case '+':
169 ptr++;
170 break;
173 while (ptr < sz) {
174 final byte v = digits[b[ptr]];
175 if (v < 0)
176 break;
177 r = (r * 10) + v;
178 ptr++;
180 } catch (ArrayIndexOutOfBoundsException e) {
181 // Not a valid digit.
183 if (ptrResult != null)
184 ptrResult.value = ptr;
185 return sign < 0 ? -r : r;
189 * Parse a Git style timezone string.
190 * <p>
191 * The sequence "-0315" will be parsed as the numeric value -195, as the
192 * lower two positions count minutes, not 100ths of an hour.
194 * @param b
195 * buffer to scan.
196 * @param ptr
197 * position within buffer to start parsing digits at.
198 * @return the timezone at this location, expressed in minutes.
200 public static final int parseTimeZoneOffset(final byte[] b, int ptr) {
201 final int v = parseBase10(b, ptr, null);
202 final int tzMins = v % 100;
203 final int tzHours = v / 100;
204 return tzHours * 60 + tzMins;
208 * Locate the first position after a given character.
210 * @param b
211 * buffer to scan.
212 * @param ptr
213 * position within buffer to start looking for LF at.
214 * @param chrA
215 * character to find.
216 * @return new position just after chr.
218 public static final int next(final byte[] b, int ptr, final char chrA) {
219 final int sz = b.length;
220 while (ptr < sz) {
221 if (b[ptr] == chrA)
222 return ptr + 1;
223 else
224 ptr++;
226 return ptr;
230 * Locate the first position after either the given character or LF.
231 * <p>
232 * This method stops on the first match it finds from either chrA or '\n'.
234 * @param b
235 * buffer to scan.
236 * @param ptr
237 * position within buffer to start looking for LF at.
238 * @param chrA
239 * character to find.
240 * @return new position just after the first chrA or chrB to be found.
242 public static final int nextLF(final byte[] b, int ptr, final char chrA) {
243 final int sz = b.length;
244 while (ptr < sz) {
245 final byte c = b[ptr];
246 if (c == chrA || c == '\n')
247 return ptr + 1;
248 else
249 ptr++;
251 return ptr;
255 * Locate the "author " header line data.
257 * @param b
258 * buffer to scan.
259 * @param ptr
260 * position in buffer to start the scan at. Most callers should
261 * pass 0 to ensure the scan starts from the beginning of the
262 * commit buffer and does not accidentally look at message body.
263 * @return position just after the space in "author ", so the first
264 * character of the author's name. If no author header can be
265 * located -1 is returned.
267 public static final int author(final byte[] b, int ptr) {
268 final int sz = b.length;
269 if (ptr == 0)
270 ptr += 46; // skip the "tree ..." line.
271 while (ptr < sz && b[ptr] == 'p')
272 ptr += 48; // skip this parent.
273 return match(b, ptr, author);
277 * Locate the "committer " header line data.
279 * @param b
280 * buffer to scan.
281 * @param ptr
282 * position in buffer to start the scan at. Most callers should
283 * pass 0 to ensure the scan starts from the beginning of the
284 * commit buffer and does not accidentally look at message body.
285 * @return position just after the space in "committer ", so the first
286 * character of the committer's name. If no committer header can be
287 * located -1 is returned.
289 public static final int committer(final byte[] b, int ptr) {
290 final int sz = b.length;
291 if (ptr == 0)
292 ptr += 46; // skip the "tree ..." line.
293 while (ptr < sz && b[ptr] == 'p')
294 ptr += 48; // skip this parent.
295 if (ptr < sz && b[ptr] == 'a')
296 ptr = next(b, ptr, '\n');
297 return match(b, ptr, committer);
301 * Locate the "encoding " header line.
303 * @param b
304 * buffer to scan.
305 * @param ptr
306 * position in buffer to start the scan at. Most callers should
307 * pass 0 to ensure the scan starts from the beginning of the
308 * buffer and does not accidentally look at the message body.
309 * @return position just after the space in "encoding ", so the first
310 * character of the encoding's name. If no encoding header can be
311 * located -1 is returned (and UTF-8 should be assumed).
313 public static final int encoding(final byte[] b, int ptr) {
314 final int sz = b.length;
315 while (ptr < sz) {
316 if (b[ptr] == '\n')
317 return -1;
318 if (b[ptr] == 'e')
319 break;
320 ptr = next(b, ptr, '\n');
322 return match(b, ptr, encoding);
326 * Parse the "encoding " header into a character set reference.
327 * <p>
328 * Locates the "encoding " header (if present) by first calling
329 * {@link #encoding(byte[], int)} and then returns the proper character set
330 * to apply to this buffer to evaluate its contents as character data.
331 * <p>
332 * If no encoding header is present, {@link Constants#CHARSET} is assumed.
334 * @param b
335 * buffer to scan.
336 * @return the Java character set representation. Never null.
338 public static Charset parseEncoding(final byte[] b) {
339 final int enc = encoding(b, 0);
340 if (enc < 0)
341 return Constants.CHARSET;
342 final int lf = next(b, enc, '\n');
343 return Charset.forName(decode(Constants.CHARSET, b, enc, lf - 1));
347 * Parse a name line (e.g. author, committer, tagger) into a PersonIdent.
348 * <p>
349 * When passing in a value for <code>nameB</code> callers should use the
350 * return value of {@link #author(byte[], int)} or
351 * {@link #committer(byte[], int)}, as these methods provide the proper
352 * position within the buffer.
354 * @param raw
355 * the buffer to parse character data from.
356 * @param nameB
357 * first position of the identity information. This should be the
358 * first position after the space which delimits the header field
359 * name (e.g. "author" or "committer") from the rest of the
360 * identity line.
361 * @return the parsed identity. Never null.
363 public static PersonIdent parsePersonIdent(final byte[] raw, final int nameB) {
364 final Charset cs = parseEncoding(raw);
365 final int emailB = nextLF(raw, nameB, '<');
366 final int emailE = nextLF(raw, emailB, '>');
368 final String name = decode(cs, raw, nameB, emailB - 2);
369 final String email = decode(cs, raw, emailB, emailE - 1);
371 final MutableInteger ptrout = new MutableInteger();
372 final int when = parseBase10(raw, emailE + 1, ptrout);
373 final int tz = parseTimeZoneOffset(raw, ptrout.value);
375 return new PersonIdent(name, email, when * 1000L, tz);
379 * Decode a region of the buffer under the specified character set.
381 * @param cs
382 * character set to use when decoding the buffer.
383 * @param buffer
384 * buffer to pull raw bytes from.
385 * @param start
386 * first position within the buffer to take data from.
387 * @param end
388 * one position past the last location within the buffer to take
389 * data from.
390 * @return a string representation of the range <code>[start,end)</code>,
391 * after decoding the region through the specified character set.
393 public static String decode(final Charset cs, final byte[] buffer,
394 final int start, final int end) {
395 final ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
396 return cs.decode(b).toString();
400 * Locate the position of the commit message body.
402 * @param b
403 * buffer to scan.
404 * @param ptr
405 * position in buffer to start the scan at. Most callers should
406 * pass 0 to ensure the scan starts from the beginning of the
407 * commit buffer.
408 * @return position of the user's message buffer.
410 public static final int commitMessage(final byte[] b, int ptr) {
411 final int sz = b.length;
412 if (ptr == 0)
413 ptr += 46; // skip the "tree ..." line.
414 while (ptr < sz && b[ptr] == 'p')
415 ptr += 48; // skip this parent.
417 // skip any remaining header lines, ignoring what their actual
418 // header line type is.
420 while (ptr < sz && b[ptr] != '\n')
421 ptr = next(b, ptr, '\n');
422 if (ptr < sz && b[ptr] == '\n')
423 return ptr + 1;
424 return -1;
428 * Locate the end of a paragraph.
429 * <p>
430 * A paragraph is ended by two consecutive LF bytes.
432 * @param b
433 * buffer to scan.
434 * @param start
435 * position in buffer to start the scan at. Most callers will
436 * want to pass the first position of the commit message (as
437 * found by {@link #commitMessage(byte[], int)}.
438 * @return position of the LF at the end of the paragraph;
439 * <code>b.length</code> if no paragraph end could be located.
441 public static final int endOfParagraph(final byte[] b, final int start) {
442 int ptr = start;
443 final int sz = b.length;
444 while (ptr < sz && b[ptr] != '\n')
445 ptr = next(b, ptr, '\n');
446 while (0 < ptr && start < ptr && b[ptr - 1] == '\n')
447 ptr--;
448 return ptr;
/** Not instantiable: this class only holds static helpers. */
private RawParseUtils() {
	// Don't create instances of a static only utility.
}