2 * Copyright (C) 2008, Shawn O. Pearce <spearce@spearce.org>
6 * Redistribution and use in source and binary forms, with or
7 * without modification, are permitted provided that the following
10 * - Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
13 * - Redistributions in binary form must reproduce the above
14 * copyright notice, this list of conditions and the following
15 * disclaimer in the documentation and/or other materials provided
16 * with the distribution.
18 * - Neither the name of the Git Development Community nor the
19 * names of its contributors may be used to endorse or promote
20 * products derived from this software without specific prior
23 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
24 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
25 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
26 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
28 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
29 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
30 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
31 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
32 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
33 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
35 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 package org
.spearce
.jgit
.util
;
40 import static org
.spearce
.jgit
.lib
.ObjectChecker
.author
;
41 import static org
.spearce
.jgit
.lib
.ObjectChecker
.committer
;
42 import static org
.spearce
.jgit
.lib
.ObjectChecker
.encoding
;
44 import java
.nio
.ByteBuffer
;
45 import java
.nio
.charset
.CharacterCodingException
;
46 import java
.nio
.charset
.Charset
;
47 import java
.nio
.charset
.CharsetDecoder
;
48 import java
.nio
.charset
.CodingErrorAction
;
49 import java
.util
.Arrays
;
51 import org
.spearce
.jgit
.lib
.Constants
;
52 import org
.spearce
.jgit
.lib
.PersonIdent
;
54 /** Handy utility functions to parse raw object contents. */
55 public final class RawParseUtils
{
56 private static final byte[] digits
;
59 digits
= new byte['9' + 1];
60 Arrays
.fill(digits
, (byte) -1);
61 for (char i
= '0'; i
<= '9'; i
++)
62 digits
[i
] = (byte) (i
- '0');
66 * Determine if b[ptr] matches src.
71 * first position within b, this should match src[0].
73 * the buffer to test for equality with b.
74 * @return ptr + src.length if b[ptr..ptr + src.length) == src; else -1.
76 public static final int match(final byte[] b
, int ptr
, final byte[] src
) {
77 if (ptr
+ src
.length
>= b
.length
)
79 for (int i
= 0; i
< src
.length
; i
++, ptr
++)
85 private static final byte[] base10byte
= { '0', '1', '2', '3', '4', '5',
89 * Format a base 10 numeric into a temporary buffer.
91 * Formatting is performed backwards. The method starts at offset
92 * <code>o-1</code> and ends at <code>o-1-digits</code>, where
93 * <code>digits</code> is the number of positions necessary to store the
96 * The argument and return values from this method make it easy to chain
97 * writing, for example:
101 * final byte[] tmp = new byte[64];
102 * int ptr = tmp.length;
104 * ptr = RawParseUtils.formatBase10(tmp, ptr, 32);
106 * ptr = RawParseUtils.formatBase10(tmp, ptr, 18);
108 * final String str = new String(tmp, ptr, tmp.length - ptr);
112 * buffer to write into.
114 * one offset past the location where writing will begin; writing
115 * proceeds towards lower index values.
117 * the value to store.
118 * @return the new offset value <code>o</code>. This is the position of
119 * the last byte written. Additional writing should start at one
122 public static int formatBase10(final byte[] b
, int o
, int value
) {
127 final boolean isneg
= value
< 0;
129 b
[--o
] = base10byte
[value
% 10];
138 * Parse a base 10 numeric from a sequence of ASCII digits.
140 * Digit sequences can begin with an optional run of spaces before the
141 * sequence, and may start with a '+' or a '-' to indicate sign position.
142 * Any other characters will cause the method to stop and return the current
143 * result to the caller.
148 * position within buffer to start parsing digits at.
150 * optional location to return the new ptr value through. If null
151 * the ptr value will be discarded.
152 * @return the value at this location; 0 if the location is not a valid
155 public static final int parseBase10(final byte[] b
, int ptr
,
156 final MutableInteger ptrResult
) {
160 final int sz
= b
.length
;
161 while (ptr
< sz
&& b
[ptr
] == ' ')
177 final byte v
= digits
[b
[ptr
]];
183 } catch (ArrayIndexOutOfBoundsException e
) {
184 // Not a valid digit.
186 if (ptrResult
!= null)
187 ptrResult
.value
= ptr
;
188 return sign
< 0 ?
-r
: r
;
192 * Parse a Git style timezone string.
194 * The sequence "-0315" will be parsed as the numeric value -195, as the
195 * lower two positions count minutes, not 100ths of an hour.
200 * position within buffer to start parsing digits at.
201 * @return the timezone at this location, expressed in minutes.
203 public static final int parseTimeZoneOffset(final byte[] b
, int ptr
) {
204 final int v
= parseBase10(b
, ptr
, null);
205 final int tzMins
= v
% 100;
206 final int tzHours
= v
/ 100;
207 return tzHours
* 60 + tzMins
;
211 * Locate the first position after a given character.
216 * position within buffer to start looking for chrA at.
219 * @return new position just after chr.
221 public static final int next(final byte[] b
, int ptr
, final char chrA
) {
222 final int sz
= b
.length
;
233 * Locate the first position after either the given character or LF.
235 * This method stops on the first match it finds from either chrA or '\n'.
240 * position within buffer to start looking for LF at.
243 * @return new position just after the first chrA or LF to be found.
245 public static final int nextLF(final byte[] b
, int ptr
, final char chrA
) {
246 final int sz
= b
.length
;
248 final byte c
= b
[ptr
];
249 if (c
== chrA
|| c
== '\n')
258 * Locate the "author " header line data.
263 * position in buffer to start the scan at. Most callers should
264 * pass 0 to ensure the scan starts from the beginning of the
265 * commit buffer and does not accidentally look at message body.
266 * @return position just after the space in "author ", so the first
267 * character of the author's name. If no author header can be
268 * located -1 is returned.
270 public static final int author(final byte[] b
, int ptr
) {
271 final int sz
= b
.length
;
273 ptr
+= 46; // skip the "tree ..." line.
274 while (ptr
< sz
&& b
[ptr
] == 'p')
275 ptr
+= 48; // skip this parent.
276 return match(b
, ptr
, author
);
280 * Locate the "committer " header line data.
285 * position in buffer to start the scan at. Most callers should
286 * pass 0 to ensure the scan starts from the beginning of the
287 * commit buffer and does not accidentally look at message body.
288 * @return position just after the space in "committer ", so the first
289 * character of the committer's name. If no committer header can be
290 * located -1 is returned.
292 public static final int committer(final byte[] b
, int ptr
) {
293 final int sz
= b
.length
;
295 ptr
+= 46; // skip the "tree ..." line.
296 while (ptr
< sz
&& b
[ptr
] == 'p')
297 ptr
+= 48; // skip this parent.
298 if (ptr
< sz
&& b
[ptr
] == 'a')
299 ptr
= next(b
, ptr
, '\n');
300 return match(b
, ptr
, committer
);
304 * Locate the "encoding " header line.
309 * position in buffer to start the scan at. Most callers should
310 * pass 0 to ensure the scan starts from the beginning of the
311 * buffer and does not accidentally look at the message body.
312 * @return position just after the space in "encoding ", so the first
313 * character of the encoding's name. If no encoding header can be
314 * located -1 is returned (and UTF-8 should be assumed).
316 public static final int encoding(final byte[] b
, int ptr
) {
317 final int sz
= b
.length
;
323 ptr
= next(b
, ptr
, '\n');
325 return match(b
, ptr
, encoding
);
329 * Parse the "encoding " header into a character set reference.
331 * Locates the "encoding " header (if present) by first calling
332 * {@link #encoding(byte[], int)} and then returns the proper character set
333 * to apply to this buffer to evaluate its contents as character data.
335 * If no encoding header is present, {@link Constants#CHARSET} is assumed.
339 * @return the Java character set representation. Never null.
341 public static Charset
parseEncoding(final byte[] b
) {
342 final int enc
= encoding(b
, 0);
344 return Constants
.CHARSET
;
345 final int lf
= next(b
, enc
, '\n');
346 return Charset
.forName(decode(Constants
.CHARSET
, b
, enc
, lf
- 1));
350 * Parse a name line (e.g. author, committer, tagger) into a PersonIdent.
352 * When passing in a value for <code>nameB</code> callers should use the
353 * return value of {@link #author(byte[], int)} or
354 * {@link #committer(byte[], int)}, as these methods provide the proper
355 * position within the buffer.
358 * the buffer to parse character data from.
360 * first position of the identity information. This should be the
361 * first position after the space which delimits the header field
362 * name (e.g. "author" or "committer") from the rest of the
364 * @return the parsed identity. Never null.
366 public static PersonIdent
parsePersonIdent(final byte[] raw
, final int nameB
) {
367 final Charset cs
= parseEncoding(raw
);
368 final int emailB
= nextLF(raw
, nameB
, '<');
369 final int emailE
= nextLF(raw
, emailB
, '>');
371 final String name
= decode(cs
, raw
, nameB
, emailB
- 2);
372 final String email
= decode(cs
, raw
, emailB
, emailE
- 1);
374 final MutableInteger ptrout
= new MutableInteger();
375 final int when
= parseBase10(raw
, emailE
+ 1, ptrout
);
376 final int tz
= parseTimeZoneOffset(raw
, ptrout
.value
);
378 return new PersonIdent(name
, email
, when
* 1000L, tz
);
382 * Decode a buffer under UTF-8, if possible.
384 * If the byte stream cannot be decoded that way, the platform default is tried
385 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
388 * buffer to pull raw bytes from.
389 * @return a string representation of the range <code>[start,end)</code>,
390 * after decoding the region through the specified character set.
392 public static String
decode(final byte[] buffer
) {
393 return decode(Constants
.CHARSET
, buffer
, 0, buffer
.length
);
397 * Decode a buffer under the specified character set if possible.
399 * If the byte stream cannot be decoded that way, the platform default is tried
400 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
403 * character set to use when decoding the buffer.
405 * buffer to pull raw bytes from.
406 * @return a string representation of the range <code>[start,end)</code>,
407 * after decoding the region through the specified character set.
409 public static String
decode(final Charset cs
, final byte[] buffer
) {
410 return decode(cs
, buffer
, 0, buffer
.length
);
414 * Decode a region of the buffer under the specified character set if possible.
416 * If the byte stream cannot be decoded that way, the platform default is tried
417 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
420 * character set to use when decoding the buffer.
422 * buffer to pull raw bytes from.
424 * first position within the buffer to take data from.
426 * one position past the last location within the buffer to take
428 * @return a string representation of the range <code>[start,end)</code>,
429 * after decoding the region through the specified character set.
431 public static String
decode(final Charset cs
, final byte[] buffer
,
432 final int start
, final int end
) {
433 final ByteBuffer b
= ByteBuffer
.wrap(buffer
, start
, end
- start
);
436 // Try our built-in favorite. The assumption here is that
437 // decoding will fail if the data is not actually encoded
438 // using that encoder.
441 return decode(b
, Constants
.CHARSET
);
442 } catch (CharacterCodingException e
) {
446 if (!cs
.equals(Constants
.CHARSET
)) {
447 // Try the suggested encoding, it might be right since it was
448 // provided by the caller.
451 return decode(b
, cs
);
452 } catch (CharacterCodingException e
) {
457 // Try the default character set. A small group of people
458 // might actually use the same (or very similar) locale.
460 final Charset defcs
= Charset
.defaultCharset();
461 if (!defcs
.equals(cs
) && !defcs
.equals(Constants
.CHARSET
)) {
463 return decode(b
, defcs
);
464 } catch (CharacterCodingException e
) {
469 // Fall back to an ISO-8859-1 style encoding. At least all of
470 // the bytes will be present in the output.
472 final StringBuilder r
= new StringBuilder(end
- start
);
473 for (int i
= start
; i
< end
; i
++)
474 r
.append((char) (buffer
[i
] & 0xff));
478 private static String
decode(final ByteBuffer b
, final Charset charset
)
479 throws CharacterCodingException
{
480 final CharsetDecoder d
= charset
.newDecoder();
481 d
.onMalformedInput(CodingErrorAction
.REPORT
);
482 d
.onUnmappableCharacter(CodingErrorAction
.REPORT
);
483 return d
.decode(b
).toString();
487 * Locate the position of the commit message body.
492 * position in buffer to start the scan at. Most callers should
493 * pass 0 to ensure the scan starts from the beginning of the
495 * @return position of the user's message buffer.
497 public static final int commitMessage(final byte[] b
, int ptr
) {
498 final int sz
= b
.length
;
500 ptr
+= 46; // skip the "tree ..." line.
501 while (ptr
< sz
&& b
[ptr
] == 'p')
502 ptr
+= 48; // skip this parent.
504 // skip any remaining header lines, ignoring what their actual
505 // header line type is.
507 while (ptr
< sz
&& b
[ptr
] != '\n')
508 ptr
= next(b
, ptr
, '\n');
509 if (ptr
< sz
&& b
[ptr
] == '\n')
515 * Locate the end of a paragraph.
517 * A paragraph is ended by two consecutive LF bytes.
522 * position in buffer to start the scan at. Most callers will
523 * want to pass the first position of the commit message (as
524 * found by {@link #commitMessage(byte[], int)}.
525 * @return position of the LF at the end of the paragraph;
526 * <code>b.length</code> if no paragraph end could be located.
528 public static final int endOfParagraph(final byte[] b
, final int start
) {
530 final int sz
= b
.length
;
531 while (ptr
< sz
&& b
[ptr
] != '\n')
532 ptr
= next(b
, ptr
, '\n');
533 while (0 < ptr
&& start
< ptr
&& b
[ptr
- 1] == '\n')
538 private RawParseUtils() {
539 // Don't create instances of a static only utility.