Switch jgit library to the EDL (3-clause BSD)
[jgit.git] / org.spearce.jgit / src / org / spearce / jgit / util / RawParseUtils.java
blobf268ffc20e3e08c2986bc942f0e1db0703d63ba8
1 /*
2 * Copyright (C) 2008, Shawn O. Pearce <spearce@spearce.org>
4 * All rights reserved.
6 * Redistribution and use in source and binary forms, with or
7 * without modification, are permitted provided that the following
8 * conditions are met:
10 * - Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
13 * - Redistributions in binary form must reproduce the above
14 * copyright notice, this list of conditions and the following
15 * disclaimer in the documentation and/or other materials provided
16 * with the distribution.
18 * - Neither the name of the Git Development Community nor the
19 * names of its contributors may be used to endorse or promote
20 * products derived from this software without specific prior
21 * written permission.
23 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
24 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
25 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
26 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
28 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
29 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
30 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
31 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
32 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
33 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
35 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 package org.spearce.jgit.util;
40 import java.nio.ByteBuffer;
41 import java.nio.charset.Charset;
42 import java.util.Arrays;
44 import org.spearce.jgit.lib.Constants;
45 import org.spearce.jgit.lib.PersonIdent;
47 /** Handy utility functions to parse raw object contents. */
48 public final class RawParseUtils {
/** Bytes of the header prefix <code>"author "</code>, as encoded by {@link Constants#encodeASCII(String)}. */
private static final byte[] author = Constants.encodeASCII("author ");

/** Bytes of the header prefix <code>"committer "</code>, as encoded by {@link Constants#encodeASCII(String)}. */
private static final byte[] committer = Constants.encodeASCII("committer ");

/** Bytes of the header prefix <code>"encoding "</code>, as encoded by {@link Constants#encodeASCII(String)}. */
private static final byte[] encoding = Constants.encodeASCII("encoding ");

/**
 * Lookup table indexed by ASCII code.
 * <p>
 * Entries <code>'0'</code> through <code>'9'</code> hold the numeric value
 * of that digit; every other entry holds -1. The table has exactly
 * <code>'9' + 1</code> entries, so any index above <code>'9'</code> (or a
 * negative index) lies outside of the array.
 */
private static final byte[] digits;

static {
    final byte[] table = new byte['9' + 1];
    Arrays.fill(table, (byte) -1);
    for (int value = 0; value <= 9; value++)
        table['0' + value] = (byte) value;
    digits = table;
}
64 private static final int match(final byte[] b, int ptr, final byte[] src) {
65 if (ptr + src.length >= b.length)
66 return -1;
67 for (int i = 0; i < src.length; i++, ptr++)
68 if (b[ptr] != src[i])
69 return -1;
70 return ptr;
73 private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5',
74 '6', '7', '8', '9' };
76 /**
77 * Format a base 10 numeric into a temporary buffer.
78 * <p>
79 * Formatting is performed backwards. The method starts at offset
80 * <code>o-1</code> and ends at <code>o-1-digits</code>, where
81 * <code>digits</code> is the number of positions necessary to store the
82 * base 10 value.
83 * <p>
84 * The argument and return values from this method make it easy to chain
85 * writing, for example:
86 * </p>
88 * <pre>
89 * final byte[] tmp = new byte[64];
90 * int ptr = tmp.length;
91 * tmp[--ptr] = '\n';
92 * ptr = RawParseUtils.formatBase10(tmp, ptr, 32);
93 * tmp[--ptr] = ' ';
94 * ptr = RawParseUtils.formatBase10(tmp, ptr, 18);
95 * tmp[--ptr] = 0;
96 * final String str = new String(tmp, ptr, tmp.length - ptr);
97 * </pre>
99 * @param b
100 * buffer to write into.
101 * @param o
102 * one offset past the location where writing will begin; writing
103 * proceeds towards lower index values.
104 * @param value
105 * the value to store.
106 * @return the new offset value <code>o</code>. This is the position of
107 * the last byte written. Additional writing should start at one
108 * position earlier.
110 public static int formatBase10(final byte[] b, int o, int value) {
111 if (value == 0) {
112 b[--o] = '0';
113 return o;
115 final boolean isneg = value < 0;
116 while (value != 0) {
117 b[--o] = base10byte[value % 10];
118 value /= 10;
120 if (isneg)
121 b[--o] = '-';
122 return o;
126 * Parse a base 10 numeric from a sequence of ASCII digits.
127 * <p>
128 * Digit sequences can begin with an optional run of spaces before the
129 * sequence, and may start with a '+' or a '-' to indicate sign position.
130 * Any other characters will cause the method to stop and return the current
131 * result to the caller.
133 * @param b
134 * buffer to scan.
135 * @param ptr
136 * position within buffer to start parsing digits at.
137 * @param ptrResult
138 * optional location to return the new ptr value through. If null
139 * the ptr value will be discarded.
140 * @return the value at this location; 0 if the location is not a valid
141 * numeric.
143 public static final int parseBase10(final byte[] b, int ptr,
144 final MutableInteger ptrResult) {
145 int r = 0;
146 int sign = 0;
147 try {
148 final int sz = b.length;
149 while (ptr < sz && b[ptr] == ' ')
150 ptr++;
151 if (ptr >= sz)
152 return 0;
154 switch (b[ptr]) {
155 case '-':
156 sign = -1;
157 ptr++;
158 break;
159 case '+':
160 ptr++;
161 break;
164 while (ptr < sz) {
165 final byte v = digits[b[ptr]];
166 if (v < 0)
167 break;
168 r = (r * 10) + v;
169 ptr++;
171 } catch (ArrayIndexOutOfBoundsException e) {
172 // Not a valid digit.
174 if (ptrResult != null)
175 ptrResult.value = ptr;
176 return sign < 0 ? -r : r;
180 * Parse a Git style timezone string.
181 * <p>
182 * The sequence "-0315" will be parsed as the numeric value -195, as the
183 * lower two positions count minutes, not 100ths of an hour.
185 * @param b
186 * buffer to scan.
187 * @param ptr
188 * position within buffer to start parsing digits at.
189 * @return the timezone at this location, expressed in minutes.
191 public static final int parseTimeZoneOffset(final byte[] b, int ptr) {
192 final int v = parseBase10(b, ptr, null);
193 final int tzMins = v % 100;
194 final int tzHours = v / 100;
195 return tzHours * 60 + tzMins;
199 * Locate the first position after a given character.
201 * @param b
202 * buffer to scan.
203 * @param ptr
204 * position within buffer to start looking for LF at.
205 * @param chrA
206 * character to find.
207 * @return new position just after chr.
209 public static final int next(final byte[] b, int ptr, final char chrA) {
210 final int sz = b.length;
211 while (ptr < sz) {
212 if (b[ptr] == chrA)
213 return ptr + 1;
214 else
215 ptr++;
217 return ptr;
221 * Locate the first position after either the given character or LF.
222 * <p>
223 * This method stops on the first match it finds from either chrA or '\n'.
225 * @param b
226 * buffer to scan.
227 * @param ptr
228 * position within buffer to start looking for LF at.
229 * @param chrA
230 * character to find.
231 * @return new position just after the first chrA or chrB to be found.
233 public static final int nextLF(final byte[] b, int ptr, final char chrA) {
234 final int sz = b.length;
235 while (ptr < sz) {
236 final byte c = b[ptr];
237 if (c == chrA || c == '\n')
238 return ptr + 1;
239 else
240 ptr++;
242 return ptr;
246 * Locate the "author " header line data.
248 * @param b
249 * buffer to scan.
250 * @param ptr
251 * position in buffer to start the scan at. Most callers should
252 * pass 0 to ensure the scan starts from the beginning of the
253 * commit buffer and does not accidentally look at message body.
254 * @return position just after the space in "author ", so the first
255 * character of the author's name. If no author header can be
256 * located -1 is returned.
258 public static final int author(final byte[] b, int ptr) {
259 final int sz = b.length;
260 if (ptr == 0)
261 ptr += 46; // skip the "tree ..." line.
262 while (ptr < sz && b[ptr] == 'p')
263 ptr += 48; // skip this parent.
264 return match(b, ptr, author);
268 * Locate the "committer " header line data.
270 * @param b
271 * buffer to scan.
272 * @param ptr
273 * position in buffer to start the scan at. Most callers should
274 * pass 0 to ensure the scan starts from the beginning of the
275 * commit buffer and does not accidentally look at message body.
276 * @return position just after the space in "committer ", so the first
277 * character of the committer's name. If no committer header can be
278 * located -1 is returned.
280 public static final int committer(final byte[] b, int ptr) {
281 final int sz = b.length;
282 if (ptr == 0)
283 ptr += 46; // skip the "tree ..." line.
284 while (ptr < sz && b[ptr] == 'p')
285 ptr += 48; // skip this parent.
286 if (ptr < sz && b[ptr] == 'a')
287 ptr = next(b, ptr, '\n');
288 return match(b, ptr, committer);
292 * Locate the "encoding " header line.
294 * @param b
295 * buffer to scan.
296 * @param ptr
297 * position in buffer to start the scan at. Most callers should
298 * pass 0 to ensure the scan starts from the beginning of the
299 * buffer and does not accidentally look at the message body.
300 * @return position just after the space in "encoding ", so the first
301 * character of the encoding's name. If no encoding header can be
302 * located -1 is returned (and UTF-8 should be assumed).
304 public static final int encoding(final byte[] b, int ptr) {
305 final int sz = b.length;
306 while (ptr < sz) {
307 if (b[ptr] == '\n')
308 return -1;
309 if (b[ptr] == 'e')
310 break;
311 ptr = next(b, ptr, '\n');
313 return match(b, ptr, encoding);
317 * Parse the "encoding " header into a character set reference.
318 * <p>
319 * Locates the "encoding " header (if present) by first calling
320 * {@link #encoding(byte[], int)} and then returns the proper character set
321 * to apply to this buffer to evaluate its contents as character data.
322 * <p>
323 * If no encoding header is present, {@link Constants#CHARSET} is assumed.
325 * @param b
326 * buffer to scan.
327 * @return the Java character set representation. Never null.
329 public static Charset parseEncoding(final byte[] b) {
330 final int enc = encoding(b, 0);
331 if (enc < 0)
332 return Constants.CHARSET;
333 final int lf = next(b, enc, '\n');
334 return Charset.forName(decode(Constants.CHARSET, b, enc, lf - 1));
338 * Parse a name line (e.g. author, committer, tagger) into a PersonIdent.
339 * <p>
340 * When passing in a value for <code>nameB</code> callers should use the
341 * return value of {@link #author(byte[], int)} or
342 * {@link #committer(byte[], int)}, as these methods provide the proper
343 * position within the buffer.
345 * @param raw
346 * the buffer to parse character data from.
347 * @param nameB
348 * first position of the identity information. This should be the
349 * first position after the space which delimits the header field
350 * name (e.g. "author" or "committer") from the rest of the
351 * identity line.
352 * @return the parsed identity. Never null.
354 public static PersonIdent parsePersonIdent(final byte[] raw, final int nameB) {
355 final Charset cs = parseEncoding(raw);
356 final int emailB = nextLF(raw, nameB, '<');
357 final int emailE = nextLF(raw, emailB, '>');
359 final String name = decode(cs, raw, nameB, emailB - 2);
360 final String email = decode(cs, raw, emailB, emailE - 1);
362 final MutableInteger ptrout = new MutableInteger();
363 final int when = parseBase10(raw, emailE + 1, ptrout);
364 final int tz = parseTimeZoneOffset(raw, ptrout.value);
366 return new PersonIdent(name, email, when * 1000L, tz);
370 * Decode a region of the buffer under the specified character set.
372 * @param cs
373 * character set to use when decoding the buffer.
374 * @param buffer
375 * buffer to pull raw bytes from.
376 * @param start
377 * first position within the buffer to take data from.
378 * @param end
379 * one position past the last location within the buffer to take
380 * data from.
381 * @return a string representation of the range <code>[start,end)</code>,
382 * after decoding the region through the specified character set.
384 public static String decode(final Charset cs, final byte[] buffer,
385 final int start, final int end) {
386 final ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
387 return cs.decode(b).toString();
391 * Locate the position of the commit message body.
393 * @param b
394 * buffer to scan.
395 * @param ptr
396 * position in buffer to start the scan at. Most callers should
397 * pass 0 to ensure the scan starts from the beginning of the
398 * commit buffer.
399 * @return position of the user's message buffer.
401 public static final int commitMessage(final byte[] b, int ptr) {
402 final int sz = b.length;
403 if (ptr == 0)
404 ptr += 46; // skip the "tree ..." line.
405 while (ptr < sz && b[ptr] == 'p')
406 ptr += 48; // skip this parent.
408 // skip any remaining header lines, ignoring what their actual
409 // header line type is.
411 while (ptr < sz && b[ptr] != '\n')
412 ptr = next(b, ptr, '\n');
413 if (ptr < sz && b[ptr] == '\n')
414 return ptr + 1;
415 return -1;
419 * Locate the end of a paragraph.
420 * <p>
421 * A paragraph is ended by two consecutive LF bytes.
423 * @param b
424 * buffer to scan.
425 * @param ptr
426 * position in buffer to start the scan at. Most callers will
427 * want to pass the first position of the commit message (as
428 * found by {@link #commitMessage(byte[], int)}.
429 * @return position of the LF at the end of the paragraph;
430 * <code>b.length</code> if no paragraph end could be located.
432 public static final int endOfParagraph(final byte[] b, int ptr) {
433 final int sz = b.length;
434 while (ptr < sz && b[ptr] != '\n')
435 ptr = next(b, ptr, '\n');
436 if (ptr < sz && b[ptr] == '\n')
437 return ptr - 1;
438 return sz;
/** Not instantiable: this class only offers static utility methods. */
private RawParseUtils() {
    // Intentionally empty.
}