Teach RawParseUtils how to format the base 10 numbers it reads
[egit/zawir.git] / org.spearce.jgit / src / org / spearce / jgit / util / RawParseUtils.java
blob0e317424fa36e5f73ae366156097bd00e73de595
1 /*
2 * Copyright (C) 2008 Shawn Pearce <spearce@spearce.org>
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License, version 2, as published by the Free Software Foundation.
8 * This library is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
13 * You should have received a copy of the GNU General Public
14 * License along with this library; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
17 package org.spearce.jgit.util;
19 import java.nio.ByteBuffer;
20 import java.nio.charset.Charset;
21 import java.util.Arrays;
23 import org.spearce.jgit.lib.Constants;
24 import org.spearce.jgit.lib.PersonIdent;
26 /** Handy utility functions to parse raw object contents. */
27 public final class RawParseUtils {
28 private static final byte[] author = Constants.encodeASCII("author ");
30 private static final byte[] committer = Constants.encodeASCII("committer ");
32 private static final byte[] encoding = Constants.encodeASCII("encoding ");
34 private static final byte[] digits;
36 static {
37 digits = new byte['9' + 1];
38 Arrays.fill(digits, (byte) -1);
39 for (char i = '0'; i <= '9'; i++)
40 digits[i] = (byte) (i - '0');
43 private static final int match(final byte[] b, int ptr, final byte[] src) {
44 if (ptr + src.length >= b.length)
45 return -1;
46 for (int i = 0; i < src.length; i++, ptr++)
47 if (b[ptr] != src[i])
48 return -1;
49 return ptr;
52 private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5',
53 '6', '7', '8', '9' };
55 /**
56 * Format a base 10 numeric into a temporary buffer.
57 * <p>
58 * Formatting is performed backwards. The method starts at offset
59 * <code>o-1</code> and ends at <code>o-1-digits</code>, where
60 * <code>digits</code> is the number of positions necessary to store the
61 * base 10 value.
62 * <p>
63 * The argument and return values from this method make it easy to chain
64 * writing, for example:
65 * </p>
67 * <pre>
68 * final byte[] tmp = new byte[64];
69 * int ptr = tmp.length;
70 * tmp[--ptr] = '\n';
71 * ptr = RawParseUtils.formatBase10(tmp, ptr, 32);
72 * tmp[--ptr] = ' ';
73 * ptr = RawParseUtils.formatBase10(tmp, ptr, 18);
74 * tmp[--ptr] = 0;
75 * final String str = new String(tmp, ptr, tmp.length - ptr);
76 * </pre>
78 * @param b
79 * buffer to write into.
80 * @param o
81 * one offset past the location where writing will begin; writing
82 * proceeds towards lower index values.
83 * @param value
84 * the value to store.
85 * @return the new offset value <code>o</code>. This is the position of
86 * the last byte written. Additional writing should start at one
87 * position earlier.
89 public static int formatBase10(final byte[] b, int o, int value) {
90 if (value == 0) {
91 b[--o] = '0';
92 return o;
94 final boolean isneg = value < 0;
95 while (value != 0) {
96 b[--o] = base10byte[value % 10];
97 value /= 10;
99 if (isneg)
100 b[--o] = '-';
101 return o;
105 * Parse a base 10 numeric from a sequence of ASCII digits.
106 * <p>
107 * Digit sequences can begin with an optional run of spaces before the
108 * sequence, and may start with a '+' or a '-' to indicate sign position.
109 * Any other characters will cause the method to stop and return the current
110 * result to the caller.
112 * @param b
113 * buffer to scan.
114 * @param ptr
115 * position within buffer to start parsing digits at.
116 * @param ptrResult
117 * optional location to return the new ptr value through. If null
118 * the ptr value will be discarded.
119 * @return the value at this location; 0 if the location is not a valid
120 * numeric.
122 public static final int parseBase10(final byte[] b, int ptr,
123 final MutableInteger ptrResult) {
124 int r = 0;
125 int sign = 0;
126 try {
127 final int sz = b.length;
128 while (ptr < sz && b[ptr] == ' ')
129 ptr++;
130 if (ptr >= sz)
131 return 0;
133 switch (b[ptr]) {
134 case '-':
135 sign = -1;
136 ptr++;
137 break;
138 case '+':
139 ptr++;
140 break;
143 while (ptr < sz) {
144 final byte v = digits[b[ptr]];
145 if (v < 0)
146 break;
147 r = (r * 10) + v;
148 ptr++;
150 } catch (ArrayIndexOutOfBoundsException e) {
151 // Not a valid digit.
153 if (ptrResult != null)
154 ptrResult.value = ptr;
155 return sign < 0 ? -r : r;
159 * Parse a Git style timezone string.
160 * <p>
161 * The sequence "-0315" will be parsed as the numeric value -195, as the
162 * lower two positions count minutes, not 100ths of an hour.
164 * @param b
165 * buffer to scan.
166 * @param ptr
167 * position within buffer to start parsing digits at.
168 * @return the timezone at this location, expressed in minutes.
170 public static final int parseTimeZoneOffset(final byte[] b, int ptr) {
171 final int v = parseBase10(b, ptr, null);
172 final int tzMins = v % 100;
173 final int tzHours = v / 100;
174 return tzHours * 60 + tzMins;
178 * Locate the first position after a given character.
180 * @param b
181 * buffer to scan.
182 * @param ptr
183 * position within buffer to start looking for LF at.
184 * @param chrA
185 * character to find.
186 * @return new position just after chr.
188 public static final int next(final byte[] b, int ptr, final char chrA) {
189 final int sz = b.length;
190 while (ptr < sz) {
191 if (b[ptr] == chrA)
192 return ptr + 1;
193 else
194 ptr++;
196 return ptr;
200 * Locate the first position after either the given character or LF.
201 * <p>
202 * This method stops on the first match it finds from either chrA or '\n'.
204 * @param b
205 * buffer to scan.
206 * @param ptr
207 * position within buffer to start looking for LF at.
208 * @param chrA
209 * character to find.
210 * @return new position just after the first chrA or chrB to be found.
212 public static final int nextLF(final byte[] b, int ptr, final char chrA) {
213 final int sz = b.length;
214 while (ptr < sz) {
215 final byte c = b[ptr];
216 if (c == chrA || c == '\n')
217 return ptr + 1;
218 else
219 ptr++;
221 return ptr;
225 * Locate the "author " header line data.
227 * @param b
228 * buffer to scan.
229 * @param ptr
230 * position in buffer to start the scan at. Most callers should
231 * pass 0 to ensure the scan starts from the beginning of the
232 * commit buffer and does not accidentally look at message body.
233 * @return position just after the space in "author ", so the first
234 * character of the author's name. If no author header can be
235 * located -1 is returned.
237 public static final int author(final byte[] b, int ptr) {
238 final int sz = b.length;
239 if (ptr == 0)
240 ptr += 46; // skip the "tree ..." line.
241 while (ptr < sz && b[ptr] == 'p')
242 ptr += 48; // skip this parent.
243 return match(b, ptr, author);
247 * Locate the "committer " header line data.
249 * @param b
250 * buffer to scan.
251 * @param ptr
252 * position in buffer to start the scan at. Most callers should
253 * pass 0 to ensure the scan starts from the beginning of the
254 * commit buffer and does not accidentally look at message body.
255 * @return position just after the space in "committer ", so the first
256 * character of the committer's name. If no committer header can be
257 * located -1 is returned.
259 public static final int committer(final byte[] b, int ptr) {
260 final int sz = b.length;
261 if (ptr == 0)
262 ptr += 46; // skip the "tree ..." line.
263 while (ptr < sz && b[ptr] == 'p')
264 ptr += 48; // skip this parent.
265 if (ptr < sz && b[ptr] == 'a')
266 ptr = next(b, ptr, '\n');
267 return match(b, ptr, committer);
271 * Locate the "encoding " header line.
273 * @param b
274 * buffer to scan.
275 * @param ptr
276 * position in buffer to start the scan at. Most callers should
277 * pass 0 to ensure the scan starts from the beginning of the
278 * buffer and does not accidentally look at the message body.
279 * @return position just after the space in "encoding ", so the first
280 * character of the encoding's name. If no encoding header can be
281 * located -1 is returned (and UTF-8 should be assumed).
283 public static final int encoding(final byte[] b, int ptr) {
284 final int sz = b.length;
285 while (ptr < sz) {
286 if (b[ptr] == '\n')
287 return -1;
288 if (b[ptr] == 'e')
289 break;
290 ptr = next(b, ptr, '\n');
292 return match(b, ptr, encoding);
296 * Parse the "encoding " header into a character set reference.
297 * <p>
298 * Locates the "encoding " header (if present) by first calling
299 * {@link #encoding(byte[], int)} and then returns the proper character set
300 * to apply to this buffer to evaluate its contents as character data.
301 * <p>
302 * If no encoding header is present, {@link Constants#CHARSET} is assumed.
304 * @param b
305 * buffer to scan.
306 * @return the Java character set representation. Never null.
308 public static Charset parseEncoding(final byte[] b) {
309 final int enc = encoding(b, 0);
310 if (enc < 0)
311 return Constants.CHARSET;
312 final int lf = next(b, enc, '\n');
313 return Charset.forName(decode(Constants.CHARSET, b, enc, lf - 1));
317 * Parse a name line (e.g. author, committer, tagger) into a PersonIdent.
318 * <p>
319 * When passing in a value for <code>nameB</code> callers should use the
320 * return value of {@link #author(byte[], int)} or
321 * {@link #committer(byte[], int)}, as these methods provide the proper
322 * position within the buffer.
324 * @param raw
325 * the buffer to parse character data from.
326 * @param nameB
327 * first position of the identity information. This should be the
328 * first position after the space which delimits the header field
329 * name (e.g. "author" or "committer") from the rest of the
330 * identity line.
331 * @return the parsed identity. Never null.
333 public static PersonIdent parsePersonIdent(final byte[] raw, final int nameB) {
334 final Charset cs = parseEncoding(raw);
335 final int emailB = nextLF(raw, nameB, '<');
336 final int emailE = nextLF(raw, emailB, '>');
338 final String name = decode(cs, raw, nameB, emailB - 2);
339 final String email = decode(cs, raw, emailB, emailE - 1);
341 final MutableInteger ptrout = new MutableInteger();
342 final int when = parseBase10(raw, emailE + 1, ptrout);
343 final int tz = parseTimeZoneOffset(raw, ptrout.value);
345 return new PersonIdent(name, email, when * 1000L, tz);
349 * Decode a region of the buffer under the specified character set.
351 * @param cs
352 * character set to use when decoding the buffer.
353 * @param buffer
354 * buffer to pull raw bytes from.
355 * @param start
356 * first position within the buffer to take data from.
357 * @param end
358 * one position past the last location within the buffer to take
359 * data from.
360 * @return a string representation of the range <code>[start,end)</code>,
361 * after decoding the region through the specified character set.
363 public static String decode(final Charset cs, final byte[] buffer,
364 final int start, final int end) {
365 final ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
366 return cs.decode(b).toString();
370 * Locate the position of the commit message body.
372 * @param b
373 * buffer to scan.
374 * @param ptr
375 * position in buffer to start the scan at. Most callers should
376 * pass 0 to ensure the scan starts from the beginning of the
377 * commit buffer.
378 * @return position of the user's message buffer.
380 public static final int commitMessage(final byte[] b, int ptr) {
381 final int sz = b.length;
382 if (ptr == 0)
383 ptr += 46; // skip the "tree ..." line.
384 while (ptr < sz && b[ptr] == 'p')
385 ptr += 48; // skip this parent.
387 // skip any remaining header lines, ignoring what their actual
388 // header line type is.
390 while (ptr < sz && b[ptr] != '\n')
391 ptr = next(b, ptr, '\n');
392 if (ptr < sz && b[ptr] == '\n')
393 return ptr + 1;
394 return -1;
398 * Locate the end of a paragraph.
399 * <p>
400 * A paragraph is ended by two consecutive LF bytes.
402 * @param b
403 * buffer to scan.
404 * @param ptr
405 * position in buffer to start the scan at. Most callers will
406 * want to pass the first position of the commit message (as
407 * found by {@link #commitMessage(byte[], int)}.
408 * @return position of the LF at the end of the paragraph;
409 * <code>b.length</code> if no paragraph end could be located.
411 public static final int endOfParagraph(final byte[] b, int ptr) {
412 final int sz = b.length;
413 while (ptr < sz && b[ptr] != '\n')
414 ptr = next(b, ptr, '\n');
415 if (ptr < sz && b[ptr] == '\n')
416 return ptr - 1;
417 return sz;
private RawParseUtils() {
	// Static-only utility class; instances are never created.
}