org.spearce.jgit/src/org/spearce/jgit/util/RawParseUtils.java

   1 /*
   2  * Copyright (C) 2008, Shawn O. Pearce <spearce@spearce.org>
   3  *
   4  * All rights reserved.
   5  *
   6  * Redistribution and use in source and binary forms, with or
   7  * without modification, are permitted provided that the following
   8  * conditions are met:
   9  *
  10  * - Redistributions of source code must retain the above copyright
  11  *   notice, this list of conditions and the following disclaimer.
  12  *
  13  * - Redistributions in binary form must reproduce the above
  14  *   copyright notice, this list of conditions and the following
  15  *   disclaimer in the documentation and/or other materials provided
  16  *   with the distribution.
  17  *
  18  * - Neither the name of the Git Development Community nor the
  19  *   names of its contributors may be used to endorse or promote
  20  *   products derived from this software without specific prior
  21  *   written permission.
  22  *
  23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
  24  * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
  25  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  26  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  27  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
  28  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  29  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  30  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  31  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  32  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  33  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  34  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  35  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  36  */
  37
  38 package org.spearce.jgit.util;
  39
  40 import java.nio.ByteBuffer;
  41 import java.nio.charset.Charset;
  42 import java.util.Arrays;
  43
  44 import org.spearce.jgit.lib.Constants;
  45 import org.spearce.jgit.lib.PersonIdent;
  46
  47 /** Handy utility functions to parse raw object contents. */
  48 public final class RawParseUtils {
  49         private static final byte[] author = Constants.encodeASCII("author ");
  50
  51         private static final byte[] committer = Constants.encodeASCII("committer ");
  52
  53         private static final byte[] encoding = Constants.encodeASCII("encoding ");
  54
  55         private static final byte[] digits;
  56
  57         static {
  58                 digits = new byte['9' + 1];
  59                 Arrays.fill(digits, (byte) -1);
  60                 for (char i = '0'; i <= '9'; i++)
  61                         digits[i] = (byte) (i - '0');
  62         }
  63
  64         private static final int match(final byte[] b, int ptr, final byte[] src) {
  65                 if (ptr + src.length >= b.length)
  66                         return -1;
  67                 for (int i = 0; i < src.length; i++, ptr++)
  68                         if (b[ptr] != src[i])
  69                                 return -1;
  70                 return ptr;
  71         }
  72
  73         private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5',
  74                         '6', '7', '8', '9' };
  75
  76         /**
  77          * Format a base 10 numeric into a temporary buffer.
  78          * <p>
  79          * Formatting is performed backwards. The method starts at offset
  80          * <code>o-1</code> and ends at <code>o-1-digits</code>, where
  81          * <code>digits</code> is the number of positions necessary to store the
  82          * base 10 value.
  83          * <p>
  84          * The argument and return values from this method make it easy to chain
  85          * writing, for example:
  86          * </p>
  87          *
  88          * <pre>
  89          * final byte[] tmp = new byte[64];
  90          * int ptr = tmp.length;
  91          * tmp[--ptr] = '\n';
  92          * ptr = RawParseUtils.formatBase10(tmp, ptr, 32);
  93          * tmp[--ptr] = ' ';
  94          * ptr = RawParseUtils.formatBase10(tmp, ptr, 18);
  95          * tmp[--ptr] = 0;
  96          * final String str = new String(tmp, ptr, tmp.length - ptr);
  97          * </pre>
  98          *
  99          * @param b
 100          *            buffer to write into.
 101          * @param o
 102          *            one offset past the location where writing will begin; writing
 103          *            proceeds towards lower index values.
 104          * @param value
 105          *            the value to store.
 106          * @return the new offset value <code>o</code>. This is the position of
 107          *         the last byte written. Additional writing should start at one
 108          *         position earlier.
 109          */
 110         public static int formatBase10(final byte[] b, int o, int value) {
 111                 if (value == 0) {
 112                         b[--o] = '0';
 113                         return o;
 114                 }
 115                 final boolean isneg = value < 0;
 116                 while (value != 0) {
 117                         b[--o] = base10byte[value % 10];
 118                         value /= 10;
 119                 }
 120                 if (isneg)
 121                         b[--o] = '-';
 122                 return o;
 123         }
 124
 125         /**
 126          * Parse a base 10 numeric from a sequence of ASCII digits.
 127          * <p>
 128          * Digit sequences can begin with an optional run of spaces before the
 129          * sequence, and may start with a '+' or a '-' to indicate sign position.
 130          * Any other characters will cause the method to stop and return the current
 131          * result to the caller.
 132          *
 133          * @param b
 134          *            buffer to scan.
 135          * @param ptr
 136          *            position within buffer to start parsing digits at.
 137          * @param ptrResult
 138          *            optional location to return the new ptr value through. If null
 139          *            the ptr value will be discarded.
 140          * @return the value at this location; 0 if the location is not a valid
 141          *         numeric.
 142          */
 143         public static final int parseBase10(final byte[] b, int ptr,
 144                         final MutableInteger ptrResult) {
 145                 int r = 0;
 146                 int sign = 0;
 147                 try {
 148                         final int sz = b.length;
 149                         while (ptr < sz && b[ptr] == ' ')
 150                                 ptr++;
 151                         if (ptr >= sz)
 152                                 return 0;
 153
 154                         switch (b[ptr]) {
 155                         case '-':
 156                                 sign = -1;
 157                                 ptr++;
 158                                 break;
 159                         case '+':
 160                                 ptr++;
 161                                 break;
 162                         }
 163
 164                         while (ptr < sz) {
 165                                 final byte v = digits[b[ptr]];
 166                                 if (v < 0)
 167                                         break;
 168                                 r = (r * 10) + v;
 169                                 ptr++;
 170                         }
 171                 } catch (ArrayIndexOutOfBoundsException e) {
 172                         // Not a valid digit.
 173                 }
 174                 if (ptrResult != null)
 175                         ptrResult.value = ptr;
 176                 return sign < 0 ? -r : r;
 177         }
 178
 179         /**
 180          * Parse a Git style timezone string.
 181          * <p>
 182          * The sequence "-0315" will be parsed as the numeric value -195, as the
 183          * lower two positions count minutes, not 100ths of an hour.
 184          *
 185          * @param b
 186          *            buffer to scan.
 187          * @param ptr
 188          *            position within buffer to start parsing digits at.
 189          * @return the timezone at this location, expressed in minutes.
 190          */
 191         public static final int parseTimeZoneOffset(final byte[] b, int ptr) {
 192                 final int v = parseBase10(b, ptr, null);
 193                 final int tzMins = v % 100;
 194                 final int tzHours = v / 100;
 195                 return tzHours * 60 + tzMins;
 196         }
 197
 198         /**
 199          * Locate the first position after a given character.
 200          *
 201          * @param b
 202          *            buffer to scan.
 203          * @param ptr
 204          *            position within buffer to start looking for LF at.
 205          * @param chrA
 206          *            character to find.
 207          * @return new position just after chr.
 208          */
 209         public static final int next(final byte[] b, int ptr, final char chrA) {
 210                 final int sz = b.length;
 211                 while (ptr < sz) {
 212                         if (b[ptr] == chrA)
 213                                 return ptr + 1;
 214                         else
 215                                 ptr++;
 216                 }
 217                 return ptr;
 218         }
 219
 220         /**
 221          * Locate the first position after either the given character or LF.
 222          * <p>
 223          * This method stops on the first match it finds from either chrA or '\n'.
 224          *
 225          * @param b
 226          *            buffer to scan.
 227          * @param ptr
 228          *            position within buffer to start looking for LF at.
 229          * @param chrA
 230          *            character to find.
 231          * @return new position just after the first chrA or chrB to be found.
 232          */
 233         public static final int nextLF(final byte[] b, int ptr, final char chrA) {
 234                 final int sz = b.length;
 235                 while (ptr < sz) {
 236                         final byte c = b[ptr];
 237                         if (c == chrA || c == '\n')
 238                                 return ptr + 1;
 239                         else
 240                                 ptr++;
 241                 }
 242                 return ptr;
 243         }
 244
 245         /**
 246          * Locate the "author " header line data.
 247          *
 248          * @param b
 249          *            buffer to scan.
 250          * @param ptr
 251          *            position in buffer to start the scan at. Most callers should
 252          *            pass 0 to ensure the scan starts from the beginning of the
 253          *            commit buffer and does not accidentally look at message body.
 254          * @return position just after the space in "author ", so the first
 255          *         character of the author's name. If no author header can be
 256          *         located -1 is returned.
 257          */
 258         public static final int author(final byte[] b, int ptr) {
 259                 final int sz = b.length;
 260                 if (ptr == 0)
 261                         ptr += 46; // skip the "tree ..." line.
 262                 while (ptr < sz && b[ptr] == 'p')
 263                         ptr += 48; // skip this parent.
 264                 return match(b, ptr, author);
 265         }
 266
 267         /**
 268          * Locate the "committer " header line data.
 269          *
 270          * @param b
 271          *            buffer to scan.
 272          * @param ptr
 273          *            position in buffer to start the scan at. Most callers should
 274          *            pass 0 to ensure the scan starts from the beginning of the
 275          *            commit buffer and does not accidentally look at message body.
 276          * @return position just after the space in "committer ", so the first
 277          *         character of the committer's name. If no committer header can be
 278          *         located -1 is returned.
 279          */
 280         public static final int committer(final byte[] b, int ptr) {
 281                 final int sz = b.length;
 282                 if (ptr == 0)
 283                         ptr += 46; // skip the "tree ..." line.
 284                 while (ptr < sz && b[ptr] == 'p')
 285                         ptr += 48; // skip this parent.
 286                 if (ptr < sz && b[ptr] == 'a')
 287                         ptr = next(b, ptr, '\n');
 288                 return match(b, ptr, committer);
 289         }
 290
 291         /**
 292          * Locate the "encoding " header line.
 293          *
 294          * @param b
 295          *            buffer to scan.
 296          * @param ptr
 297          *            position in buffer to start the scan at. Most callers should
 298          *            pass 0 to ensure the scan starts from the beginning of the
 299          *            buffer and does not accidentally look at the message body.
 300          * @return position just after the space in "encoding ", so the first
 301          *         character of the encoding's name. If no encoding header can be
 302          *         located -1 is returned (and UTF-8 should be assumed).
 303          */
 304         public static final int encoding(final byte[] b, int ptr) {
 305                 final int sz = b.length;
 306                 while (ptr < sz) {
 307                         if (b[ptr] == '\n')
 308                                 return -1;
 309                         if (b[ptr] == 'e')
 310                                 break;
 311                         ptr = next(b, ptr, '\n');
 312                 }
 313                 return match(b, ptr, encoding);
 314         }
 315
 316         /**
 317          * Parse the "encoding " header into a character set reference.
 318          * <p>
 319          * Locates the "encoding " header (if present) by first calling
 320          * {@link #encoding(byte[], int)} and then returns the proper character set
 321          * to apply to this buffer to evaluate its contents as character data.
 322          * <p>
 323          * If no encoding header is present, {@link Constants#CHARSET} is assumed.
 324          *
 325          * @param b
 326          *            buffer to scan.
 327          * @return the Java character set representation. Never null.
 328          */
 329         public static Charset parseEncoding(final byte[] b) {
 330                 final int enc = encoding(b, 0);
 331                 if (enc < 0)
 332                         return Constants.CHARSET;
 333                 final int lf = next(b, enc, '\n');
 334                 return Charset.forName(decode(Constants.CHARSET, b, enc, lf - 1));
 335         }
 336
 337         /**
 338          * Parse a name line (e.g. author, committer, tagger) into a PersonIdent.
 339          * <p>
 340          * When passing in a value for <code>nameB</code> callers should use the
 341          * return value of {@link #author(byte[], int)} or
 342          * {@link #committer(byte[], int)}, as these methods provide the proper
 343          * position within the buffer.
 344          *
 345          * @param raw
 346          *            the buffer to parse character data from.
 347          * @param nameB
 348          *            first position of the identity information. This should be the
 349          *            first position after the space which delimits the header field
 350          *            name (e.g. "author" or "committer") from the rest of the
 351          *            identity line.
 352          * @return the parsed identity. Never null.
 353          */
 354         public static PersonIdent parsePersonIdent(final byte[] raw, final int nameB) {
 355                 final Charset cs = parseEncoding(raw);
 356                 final int emailB = nextLF(raw, nameB, '<');
 357                 final int emailE = nextLF(raw, emailB, '>');
 358
 359                 final String name = decode(cs, raw, nameB, emailB - 2);
 360                 final String email = decode(cs, raw, emailB, emailE - 1);
 361
 362                 final MutableInteger ptrout = new MutableInteger();
 363                 final int when = parseBase10(raw, emailE + 1, ptrout);
 364                 final int tz = parseTimeZoneOffset(raw, ptrout.value);
 365
 366                 return new PersonIdent(name, email, when * 1000L, tz);
 367         }
 368
 369         /**
 370          * Decode a region of the buffer under the specified character set.
 371          *
 372          * @param cs
 373          *            character set to use when decoding the buffer.
 374          * @param buffer
 375          *            buffer to pull raw bytes from.
 376          * @param start
 377          *            first position within the buffer to take data from.
 378          * @param end
 379          *            one position past the last location within the buffer to take
 380          *            data from.
 381          * @return a string representation of the range <code>[start,end)</code>,
 382          *         after decoding the region through the specified character set.
 383          */
 384         public static String decode(final Charset cs, final byte[] buffer,
 385                         final int start, final int end) {
 386                 final ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
 387                 return cs.decode(b).toString();
 388         }
 389
 390         /**
 391          * Locate the position of the commit message body.
 392          *
 393          * @param b
 394          *            buffer to scan.
 395          * @param ptr
 396          *            position in buffer to start the scan at. Most callers should
 397          *            pass 0 to ensure the scan starts from the beginning of the
 398          *            commit buffer.
 399          * @return position of the user's message buffer.
 400          */
 401         public static final int commitMessage(final byte[] b, int ptr) {
 402                 final int sz = b.length;
 403                 if (ptr == 0)
 404                         ptr += 46; // skip the "tree ..." line.
 405                 while (ptr < sz && b[ptr] == 'p')
 406                         ptr += 48; // skip this parent.
 407
 408                 // skip any remaining header lines, ignoring what their actual
 409                 // header line type is.
 410                 //
 411                 while (ptr < sz && b[ptr] != '\n')
 412                         ptr = next(b, ptr, '\n');
 413                 if (ptr < sz && b[ptr] == '\n')
 414                         return ptr + 1;
 415                 return -1;
 416         }
 417
 418         /**
 419          * Locate the end of a paragraph.
 420          * <p>
 421          * A paragraph is ended by two consecutive LF bytes.
 422          *
 423          * @param b
 424          *            buffer to scan.
 425          * @param ptr
 426          *            position in buffer to start the scan at. Most callers will
 427          *            want to pass the first position of the commit message (as
 428          *            found by {@link #commitMessage(byte[], int)}.
 429          * @return position of the LF at the end of the paragraph;
 430          *         <code>b.length</code> if no paragraph end could be located.
 431          */
 432         public static final int endOfParagraph(final byte[] b, int ptr) {
 433                 final int sz = b.length;
 434                 while (ptr < sz && b[ptr] != '\n')
 435                         ptr = next(b, ptr, '\n');
 436                 if (ptr < sz && b[ptr] == '\n')
 437                         return ptr - 1;
 438                 return sz;
 439         }
 440
 441         private RawParseUtils() {
 442                 // Don't create instances of a static only utility.
 443         }
 444 }