Fix RawParseUtils to match on the end of the buffer
[egit/charleso.git] / org.spearce.jgit / src / org / spearce / jgit / util / RawParseUtils.java
blob758e7aff85bac08084ee1d33e3ab192983e0c865
1 /*
2 * Copyright (C) 2008, Shawn O. Pearce <spearce@spearce.org>
4 * All rights reserved.
6 * Redistribution and use in source and binary forms, with or
7 * without modification, are permitted provided that the following
8 * conditions are met:
10 * - Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
13 * - Redistributions in binary form must reproduce the above
14 * copyright notice, this list of conditions and the following
15 * disclaimer in the documentation and/or other materials provided
16 * with the distribution.
18 * - Neither the name of the Git Development Community nor the
19 * names of its contributors may be used to endorse or promote
20 * products derived from this software without specific prior
21 * written permission.
23 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
24 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
25 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
26 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
28 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
29 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
30 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
31 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
32 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
33 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
35 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 package org.spearce.jgit.util;
40 import static org.spearce.jgit.lib.ObjectChecker.author;
41 import static org.spearce.jgit.lib.ObjectChecker.committer;
42 import static org.spearce.jgit.lib.ObjectChecker.encoding;
44 import java.nio.ByteBuffer;
45 import java.nio.charset.CharacterCodingException;
46 import java.nio.charset.Charset;
47 import java.nio.charset.CharsetDecoder;
48 import java.nio.charset.CodingErrorAction;
49 import java.util.Arrays;
51 import org.spearce.jgit.lib.Constants;
52 import org.spearce.jgit.lib.PersonIdent;
54 /** Handy utility functions to parse raw object contents. */
55 public final class RawParseUtils {
/**
 * Lookup table indexed by ASCII byte value. Entries for '0'..'9' hold the
 * digit's numeric value; every other entry holds -1. The table only spans
 * indexes 0 through '9', so larger (or negative) byte values are outside it.
 */
private static final byte[] digits;

static {
    final byte[] table = new byte['9' + 1];
    Arrays.fill(table, (byte) -1);
    for (int value = 0; value < 10; value++)
        table['0' + value] = (byte) value;
    digits = table;
}
65 /**
66 * Determine if b[ptr] matches src.
68 * @param b
69 * the buffer to scan.
70 * @param ptr
71 * first position within b, this should match src[0].
72 * @param src
73 * the buffer to test for equality with b.
74 * @return ptr + src.length if b[ptr..src.length] == src; else -1.
76 public static final int match(final byte[] b, int ptr, final byte[] src) {
77 if (ptr + src.length > b.length)
78 return -1;
79 for (int i = 0; i < src.length; i++, ptr++)
80 if (b[ptr] != src[i])
81 return -1;
82 return ptr;
85 private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5',
86 '6', '7', '8', '9' };
88 /**
89 * Format a base 10 numeric into a temporary buffer.
90 * <p>
91 * Formatting is performed backwards. The method starts at offset
92 * <code>o-1</code> and ends at <code>o-1-digits</code>, where
93 * <code>digits</code> is the number of positions necessary to store the
94 * base 10 value.
95 * <p>
96 * The argument and return values from this method make it easy to chain
97 * writing, for example:
98 * </p>
100 * <pre>
101 * final byte[] tmp = new byte[64];
102 * int ptr = tmp.length;
103 * tmp[--ptr] = '\n';
104 * ptr = RawParseUtils.formatBase10(tmp, ptr, 32);
105 * tmp[--ptr] = ' ';
106 * ptr = RawParseUtils.formatBase10(tmp, ptr, 18);
107 * tmp[--ptr] = 0;
108 * final String str = new String(tmp, ptr, tmp.length - ptr);
109 * </pre>
111 * @param b
112 * buffer to write into.
113 * @param o
114 * one offset past the location where writing will begin; writing
115 * proceeds towards lower index values.
116 * @param value
117 * the value to store.
118 * @return the new offset value <code>o</code>. This is the position of
119 * the last byte written. Additional writing should start at one
120 * position earlier.
122 public static int formatBase10(final byte[] b, int o, int value) {
123 if (value == 0) {
124 b[--o] = '0';
125 return o;
127 final boolean isneg = value < 0;
128 while (value != 0) {
129 b[--o] = base10byte[value % 10];
130 value /= 10;
132 if (isneg)
133 b[--o] = '-';
134 return o;
138 * Parse a base 10 numeric from a sequence of ASCII digits into an int.
139 * <p>
140 * Digit sequences can begin with an optional run of spaces before the
141 * sequence, and may start with a '+' or a '-' to indicate sign position.
142 * Any other characters will cause the method to stop and return the current
143 * result to the caller.
145 * @param b
146 * buffer to scan.
147 * @param ptr
148 * position within buffer to start parsing digits at.
149 * @param ptrResult
150 * optional location to return the new ptr value through. If null
151 * the ptr value will be discarded.
152 * @return the value at this location; 0 if the location is not a valid
153 * numeric.
155 public static final int parseBase10(final byte[] b, int ptr,
156 final MutableInteger ptrResult) {
157 int r = 0;
158 int sign = 0;
159 try {
160 final int sz = b.length;
161 while (ptr < sz && b[ptr] == ' ')
162 ptr++;
163 if (ptr >= sz)
164 return 0;
166 switch (b[ptr]) {
167 case '-':
168 sign = -1;
169 ptr++;
170 break;
171 case '+':
172 ptr++;
173 break;
176 while (ptr < sz) {
177 final byte v = digits[b[ptr]];
178 if (v < 0)
179 break;
180 r = (r * 10) + v;
181 ptr++;
183 } catch (ArrayIndexOutOfBoundsException e) {
184 // Not a valid digit.
186 if (ptrResult != null)
187 ptrResult.value = ptr;
188 return sign < 0 ? -r : r;
192 * Parse a base 10 numeric from a sequence of ASCII digits into a long.
193 * <p>
194 * Digit sequences can begin with an optional run of spaces before the
195 * sequence, and may start with a '+' or a '-' to indicate sign position.
196 * Any other characters will cause the method to stop and return the current
197 * result to the caller.
199 * @param b
200 * buffer to scan.
201 * @param ptr
202 * position within buffer to start parsing digits at.
203 * @param ptrResult
204 * optional location to return the new ptr value through. If null
205 * the ptr value will be discarded.
206 * @return the value at this location; 0 if the location is not a valid
207 * numeric.
209 public static final long parseLongBase10(final byte[] b, int ptr,
210 final MutableInteger ptrResult) {
211 long r = 0;
212 int sign = 0;
213 try {
214 final int sz = b.length;
215 while (ptr < sz && b[ptr] == ' ')
216 ptr++;
217 if (ptr >= sz)
218 return 0;
220 switch (b[ptr]) {
221 case '-':
222 sign = -1;
223 ptr++;
224 break;
225 case '+':
226 ptr++;
227 break;
230 while (ptr < sz) {
231 final byte v = digits[b[ptr]];
232 if (v < 0)
233 break;
234 r = (r * 10) + v;
235 ptr++;
237 } catch (ArrayIndexOutOfBoundsException e) {
238 // Not a valid digit.
240 if (ptrResult != null)
241 ptrResult.value = ptr;
242 return sign < 0 ? -r : r;
246 * Parse a Git style timezone string.
247 * <p>
248 * The sequence "-0315" will be parsed as the numeric value -195, as the
249 * lower two positions count minutes, not 100ths of an hour.
251 * @param b
252 * buffer to scan.
253 * @param ptr
254 * position within buffer to start parsing digits at.
255 * @return the timezone at this location, expressed in minutes.
257 public static final int parseTimeZoneOffset(final byte[] b, int ptr) {
258 final int v = parseBase10(b, ptr, null);
259 final int tzMins = v % 100;
260 final int tzHours = v / 100;
261 return tzHours * 60 + tzMins;
265 * Locate the first position after a given character.
267 * @param b
268 * buffer to scan.
269 * @param ptr
270 * position within buffer to start looking for chrA at.
271 * @param chrA
272 * character to find.
273 * @return new position just after chrA.
275 public static final int next(final byte[] b, int ptr, final char chrA) {
276 final int sz = b.length;
277 while (ptr < sz) {
278 if (b[ptr++] == chrA)
279 return ptr;
281 return ptr;
285 * Locate the first position after the next LF.
286 * <p>
287 * This method stops on the first '\n' it finds.
289 * @param b
290 * buffer to scan.
291 * @param ptr
292 * position within buffer to start looking for LF at.
293 * @return new position just after the first LF found.
295 public static final int nextLF(final byte[] b, int ptr) {
296 return next(b, ptr, '\n');
300 * Locate the first position after either the given character or LF.
301 * <p>
302 * This method stops on the first match it finds from either chrA or '\n'.
304 * @param b
305 * buffer to scan.
306 * @param ptr
307 * position within buffer to start looking for chrA or LF at.
308 * @param chrA
309 * character to find.
310 * @return new position just after the first chrA or LF to be found.
312 public static final int nextLF(final byte[] b, int ptr, final char chrA) {
313 final int sz = b.length;
314 while (ptr < sz) {
315 final byte c = b[ptr++];
316 if (c == chrA || c == '\n')
317 return ptr;
319 return ptr;
323 * Index the region between <code>[ptr, end)</code> to find line starts.
324 * <p>
325 * The returned list is 1 indexed. Index 0 contains
326 * {@link Integer#MIN_VALUE} to pad the list out.
327 * <p>
328 * Using a 1 indexed list means that line numbers can be directly accessed
329 * from the list, so <code>list.get(1)</code> (aka get line 1) returns
330 * <code>ptr</code>.
332 * @param buf
333 * buffer to scan.
334 * @param ptr
335 * position within the buffer corresponding to the first byte of
336 * line 1.
337 * @param end
338 * 1 past the end of the content within <code>buf</code>.
339 * @return a line map indexing the start position of each line.
341 public static final IntList lineMap(final byte[] buf, int ptr, int end) {
342 // Experimentally derived from multiple source repositories
343 // the average number of bytes/line is 36. Its a rough guess
344 // to initially size our map close to the target.
346 final IntList map = new IntList((end - ptr) / 36);
347 map.fillTo(1, Integer.MIN_VALUE);
348 for (; ptr < end; ptr = nextLF(buf, ptr))
349 map.add(ptr);
350 return map;
354 * Locate the "author " header line data.
356 * @param b
357 * buffer to scan.
358 * @param ptr
359 * position in buffer to start the scan at. Most callers should
360 * pass 0 to ensure the scan starts from the beginning of the
361 * commit buffer and does not accidentally look at message body.
362 * @return position just after the space in "author ", so the first
363 * character of the author's name. If no author header can be
364 * located -1 is returned.
366 public static final int author(final byte[] b, int ptr) {
367 final int sz = b.length;
368 if (ptr == 0)
369 ptr += 46; // skip the "tree ..." line.
370 while (ptr < sz && b[ptr] == 'p')
371 ptr += 48; // skip this parent.
372 return match(b, ptr, author);
376 * Locate the "committer " header line data.
378 * @param b
379 * buffer to scan.
380 * @param ptr
381 * position in buffer to start the scan at. Most callers should
382 * pass 0 to ensure the scan starts from the beginning of the
383 * commit buffer and does not accidentally look at message body.
384 * @return position just after the space in "committer ", so the first
385 * character of the committer's name. If no committer header can be
386 * located -1 is returned.
388 public static final int committer(final byte[] b, int ptr) {
389 final int sz = b.length;
390 if (ptr == 0)
391 ptr += 46; // skip the "tree ..." line.
392 while (ptr < sz && b[ptr] == 'p')
393 ptr += 48; // skip this parent.
394 if (ptr < sz && b[ptr] == 'a')
395 ptr = nextLF(b, ptr);
396 return match(b, ptr, committer);
400 * Locate the "encoding " header line.
402 * @param b
403 * buffer to scan.
404 * @param ptr
405 * position in buffer to start the scan at. Most callers should
406 * pass 0 to ensure the scan starts from the beginning of the
407 * buffer and does not accidentally look at the message body.
408 * @return position just after the space in "encoding ", so the first
409 * character of the encoding's name. If no encoding header can be
410 * located -1 is returned (and UTF-8 should be assumed).
412 public static final int encoding(final byte[] b, int ptr) {
413 final int sz = b.length;
414 while (ptr < sz) {
415 if (b[ptr] == '\n')
416 return -1;
417 if (b[ptr] == 'e')
418 break;
419 ptr = nextLF(b, ptr);
421 return match(b, ptr, encoding);
425 * Parse the "encoding " header into a character set reference.
426 * <p>
427 * Locates the "encoding " header (if present) by first calling
428 * {@link #encoding(byte[], int)} and then returns the proper character set
429 * to apply to this buffer to evaluate its contents as character data.
430 * <p>
431 * If no encoding header is present, {@link Constants#CHARSET} is assumed.
433 * @param b
434 * buffer to scan.
435 * @return the Java character set representation. Never null.
437 public static Charset parseEncoding(final byte[] b) {
438 final int enc = encoding(b, 0);
439 if (enc < 0)
440 return Constants.CHARSET;
441 final int lf = nextLF(b, enc);
442 return Charset.forName(decode(Constants.CHARSET, b, enc, lf - 1));
446 * Parse a name line (e.g. author, committer, tagger) into a PersonIdent.
447 * <p>
448 * When passing in a value for <code>nameB</code> callers should use the
449 * return value of {@link #author(byte[], int)} or
450 * {@link #committer(byte[], int)}, as these methods provide the proper
451 * position within the buffer.
453 * @param raw
454 * the buffer to parse character data from.
455 * @param nameB
456 * first position of the identity information. This should be the
457 * first position after the space which delimits the header field
458 * name (e.g. "author" or "committer") from the rest of the
459 * identity line.
460 * @return the parsed identity. Never null.
462 public static PersonIdent parsePersonIdent(final byte[] raw, final int nameB) {
463 final Charset cs = parseEncoding(raw);
464 final int emailB = nextLF(raw, nameB, '<');
465 final int emailE = nextLF(raw, emailB, '>');
467 final String name = decode(cs, raw, nameB, emailB - 2);
468 final String email = decode(cs, raw, emailB, emailE - 1);
470 final MutableInteger ptrout = new MutableInteger();
471 final long when = parseLongBase10(raw, emailE + 1, ptrout);
472 final int tz = parseTimeZoneOffset(raw, ptrout.value);
474 return new PersonIdent(name, email, when * 1000L, tz);
478 * Decode a buffer under UTF-8, if possible.
480 * If the byte stream cannot be decoded that way, the platform default is tried
481 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
483 * @param buffer
484 * buffer to pull raw bytes from.
485 * @return a string representation of the range <code>[start,end)</code>,
486 * after decoding the region through the specified character set.
488 public static String decode(final byte[] buffer) {
489 return decode(Constants.CHARSET, buffer, 0, buffer.length);
493 * Decode a buffer under the specified character set if possible.
495 * If the byte stream cannot be decoded that way, the platform default is tried
496 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
498 * @param cs
499 * character set to use when decoding the buffer.
500 * @param buffer
501 * buffer to pull raw bytes from.
502 * @return a string representation of the range <code>[start,end)</code>,
503 * after decoding the region through the specified character set.
505 public static String decode(final Charset cs, final byte[] buffer) {
506 return decode(cs, buffer, 0, buffer.length);
510 * Decode a region of the buffer under the specified character set if possible.
512 * If the byte stream cannot be decoded that way, the platform default is tried
513 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
515 * @param cs
516 * character set to use when decoding the buffer.
517 * @param buffer
518 * buffer to pull raw bytes from.
519 * @param start
520 * first position within the buffer to take data from.
521 * @param end
522 * one position past the last location within the buffer to take
523 * data from.
524 * @return a string representation of the range <code>[start,end)</code>,
525 * after decoding the region through the specified character set.
527 public static String decode(final Charset cs, final byte[] buffer,
528 final int start, final int end) {
529 try {
530 return decodeNoFallback(cs, buffer, start, end);
531 } catch (CharacterCodingException e) {
532 // Fall back to an ISO-8859-1 style encoding. At least all of
533 // the bytes will be present in the output.
535 return extractBinaryString(buffer, start, end);
540 * Decode a region of the buffer under the specified character set if
541 * possible.
543 * If the byte stream cannot be decoded that way, the platform default is
544 * tried and if that too fails, an exception is thrown.
546 * @param cs
547 * character set to use when decoding the buffer.
548 * @param buffer
549 * buffer to pull raw bytes from.
550 * @param start
551 * first position within the buffer to take data from.
552 * @param end
553 * one position past the last location within the buffer to take
554 * data from.
555 * @return a string representation of the range <code>[start,end)</code>,
556 * after decoding the region through the specified character set.
557 * @throws CharacterCodingException
558 * the input is not in any of the tested character sets.
560 public static String decodeNoFallback(final Charset cs,
561 final byte[] buffer, final int start, final int end)
562 throws CharacterCodingException {
563 final ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
564 b.mark();
566 // Try our built-in favorite. The assumption here is that
567 // decoding will fail if the data is not actually encoded
568 // using that encoder.
570 try {
571 return decode(b, Constants.CHARSET);
572 } catch (CharacterCodingException e) {
573 b.reset();
576 if (!cs.equals(Constants.CHARSET)) {
577 // Try the suggested encoding, it might be right since it was
578 // provided by the caller.
580 try {
581 return decode(b, cs);
582 } catch (CharacterCodingException e) {
583 b.reset();
587 // Try the default character set. A small group of people
588 // might actually use the same (or very similar) locale.
590 final Charset defcs = Charset.defaultCharset();
591 if (!defcs.equals(cs) && !defcs.equals(Constants.CHARSET)) {
592 try {
593 return decode(b, defcs);
594 } catch (CharacterCodingException e) {
595 b.reset();
599 throw new CharacterCodingException();
603 * Decode a region of the buffer under the ISO-8859-1 encoding.
605 * Each byte is treated as a single character in the 8859-1 character
606 * encoding, performing a raw binary->char conversion.
608 * @param buffer
609 * buffer to pull raw bytes from.
610 * @param start
611 * first position within the buffer to take data from.
612 * @param end
613 * one position past the last location within the buffer to take
614 * data from.
615 * @return a string representation of the range <code>[start,end)</code>.
617 public static String extractBinaryString(final byte[] buffer,
618 final int start, final int end) {
619 final StringBuilder r = new StringBuilder(end - start);
620 for (int i = start; i < end; i++)
621 r.append((char) (buffer[i] & 0xff));
622 return r.toString();
625 private static String decode(final ByteBuffer b, final Charset charset)
626 throws CharacterCodingException {
627 final CharsetDecoder d = charset.newDecoder();
628 d.onMalformedInput(CodingErrorAction.REPORT);
629 d.onUnmappableCharacter(CodingErrorAction.REPORT);
630 return d.decode(b).toString();
634 * Locate the position of the commit message body.
636 * @param b
637 * buffer to scan.
638 * @param ptr
639 * position in buffer to start the scan at. Most callers should
640 * pass 0 to ensure the scan starts from the beginning of the
641 * commit buffer.
642 * @return position of the user's message buffer.
644 public static final int commitMessage(final byte[] b, int ptr) {
645 final int sz = b.length;
646 if (ptr == 0)
647 ptr += 46; // skip the "tree ..." line.
648 while (ptr < sz && b[ptr] == 'p')
649 ptr += 48; // skip this parent.
651 // skip any remaining header lines, ignoring what their actual
652 // header line type is.
654 while (ptr < sz && b[ptr] != '\n')
655 ptr = nextLF(b, ptr);
656 if (ptr < sz && b[ptr] == '\n')
657 return ptr + 1;
658 return -1;
662 * Locate the end of a paragraph.
663 * <p>
664 * A paragraph is ended by two consecutive LF bytes.
666 * @param b
667 * buffer to scan.
668 * @param start
669 * position in buffer to start the scan at. Most callers will
670 * want to pass the first position of the commit message (as
671 * found by {@link #commitMessage(byte[], int)}.
672 * @return position of the LF at the end of the paragraph;
673 * <code>b.length</code> if no paragraph end could be located.
675 public static final int endOfParagraph(final byte[] b, final int start) {
676 int ptr = start;
677 final int sz = b.length;
678 while (ptr < sz && b[ptr] != '\n')
679 ptr = nextLF(b, ptr);
680 while (0 < ptr && start < ptr && b[ptr - 1] == '\n')
681 ptr--;
682 return ptr;
/** Not instantiable: this class only offers static utility methods. */
private RawParseUtils() {
    // Intentionally empty.
}