org.spearce.jgit/src/org/spearce/jgit/util/RawParseUtils.java

   1 /*
   2  * Copyright (C) 2008, Shawn O. Pearce <spearce@spearce.org>
   3  *
   4  * All rights reserved.
   5  *
   6  * Redistribution and use in source and binary forms, with or
   7  * without modification, are permitted provided that the following
   8  * conditions are met:
   9  *
  10  * - Redistributions of source code must retain the above copyright
  11  *   notice, this list of conditions and the following disclaimer.
  12  *
  13  * - Redistributions in binary form must reproduce the above
  14  *   copyright notice, this list of conditions and the following
  15  *   disclaimer in the documentation and/or other materials provided
  16  *   with the distribution.
  17  *
  18  * - Neither the name of the Git Development Community nor the
  19  *   names of its contributors may be used to endorse or promote
  20  *   products derived from this software without specific prior
  21  *   written permission.
  22  *
  23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
  24  * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
  25  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  26  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  27  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
  28  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  29  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  30  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  31  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  32  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  33  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  34  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  35  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  36  */
  37
  38 package org.spearce.jgit.util;
  39
  40 import static org.spearce.jgit.lib.ObjectChecker.author;
  41 import static org.spearce.jgit.lib.ObjectChecker.committer;
  42 import static org.spearce.jgit.lib.ObjectChecker.encoding;
  43
  44 import java.nio.ByteBuffer;
  45 import java.nio.charset.Charset;
  46 import java.util.Arrays;
  47
  48 import org.spearce.jgit.lib.Constants;
  49 import org.spearce.jgit.lib.PersonIdent;
  50
  51 /** Handy utility functions to parse raw object contents. */
  52 public final class RawParseUtils {
  53         private static final byte[] digits;
  54
  55         static {
  56                 digits = new byte['9' + 1];
  57                 Arrays.fill(digits, (byte) -1);
  58                 for (char i = '0'; i <= '9'; i++)
  59                         digits[i] = (byte) (i - '0');
  60         }
  61
  62         /**
  63          * Determine if b[ptr] matches src.
  64          *
  65          * @param b
  66          *            the buffer to scan.
  67          * @param ptr
  68          *            first position within b, this should match src[0].
  69          * @param src
  70          *            the buffer to test for equality with b.
  71          * @return ptr += src.length if b[ptr..src.length] == src; else -1.
  72          */
  73         public static final int match(final byte[] b, int ptr, final byte[] src) {
  74                 if (ptr + src.length >= b.length)
  75                         return -1;
  76                 for (int i = 0; i < src.length; i++, ptr++)
  77                         if (b[ptr] != src[i])
  78                                 return -1;
  79                 return ptr;
  80         }
  81
  82         private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5',
  83                         '6', '7', '8', '9' };
  84
  85         /**
  86          * Format a base 10 numeric into a temporary buffer.
  87          * <p>
  88          * Formatting is performed backwards. The method starts at offset
  89          * <code>o-1</code> and ends at <code>o-1-digits</code>, where
  90          * <code>digits</code> is the number of positions necessary to store the
  91          * base 10 value.
  92          * <p>
  93          * The argument and return values from this method make it easy to chain
  94          * writing, for example:
  95          * </p>
  96          *
  97          * <pre>
  98          * final byte[] tmp = new byte[64];
  99          * int ptr = tmp.length;
 100          * tmp[--ptr] = '\n';
 101          * ptr = RawParseUtils.formatBase10(tmp, ptr, 32);
 102          * tmp[--ptr] = ' ';
 103          * ptr = RawParseUtils.formatBase10(tmp, ptr, 18);
 104          * tmp[--ptr] = 0;
 105          * final String str = new String(tmp, ptr, tmp.length - ptr);
 106          * </pre>
 107          *
 108          * @param b
 109          *            buffer to write into.
 110          * @param o
 111          *            one offset past the location where writing will begin; writing
 112          *            proceeds towards lower index values.
 113          * @param value
 114          *            the value to store.
 115          * @return the new offset value <code>o</code>. This is the position of
 116          *         the last byte written. Additional writing should start at one
 117          *         position earlier.
 118          */
 119         public static int formatBase10(final byte[] b, int o, int value) {
 120                 if (value == 0) {
 121                         b[--o] = '0';
 122                         return o;
 123                 }
 124                 final boolean isneg = value < 0;
 125                 while (value != 0) {
 126                         b[--o] = base10byte[value % 10];
 127                         value /= 10;
 128                 }
 129                 if (isneg)
 130                         b[--o] = '-';
 131                 return o;
 132         }
 133
 134         /**
 135          * Parse a base 10 numeric from a sequence of ASCII digits.
 136          * <p>
 137          * Digit sequences can begin with an optional run of spaces before the
 138          * sequence, and may start with a '+' or a '-' to indicate sign position.
 139          * Any other characters will cause the method to stop and return the current
 140          * result to the caller.
 141          *
 142          * @param b
 143          *            buffer to scan.
 144          * @param ptr
 145          *            position within buffer to start parsing digits at.
 146          * @param ptrResult
 147          *            optional location to return the new ptr value through. If null
 148          *            the ptr value will be discarded.
 149          * @return the value at this location; 0 if the location is not a valid
 150          *         numeric.
 151          */
 152         public static final int parseBase10(final byte[] b, int ptr,
 153                         final MutableInteger ptrResult) {
 154                 int r = 0;
 155                 int sign = 0;
 156                 try {
 157                         final int sz = b.length;
 158                         while (ptr < sz && b[ptr] == ' ')
 159                                 ptr++;
 160                         if (ptr >= sz)
 161                                 return 0;
 162
 163                         switch (b[ptr]) {
 164                         case '-':
 165                                 sign = -1;
 166                                 ptr++;
 167                                 break;
 168                         case '+':
 169                                 ptr++;
 170                                 break;
 171                         }
 172
 173                         while (ptr < sz) {
 174                                 final byte v = digits[b[ptr]];
 175                                 if (v < 0)
 176                                         break;
 177                                 r = (r * 10) + v;
 178                                 ptr++;
 179                         }
 180                 } catch (ArrayIndexOutOfBoundsException e) {
 181                         // Not a valid digit.
 182                 }
 183                 if (ptrResult != null)
 184                         ptrResult.value = ptr;
 185                 return sign < 0 ? -r : r;
 186         }
 187
 188         /**
 189          * Parse a Git style timezone string.
 190          * <p>
 191          * The sequence "-0315" will be parsed as the numeric value -195, as the
 192          * lower two positions count minutes, not 100ths of an hour.
 193          *
 194          * @param b
 195          *            buffer to scan.
 196          * @param ptr
 197          *            position within buffer to start parsing digits at.
 198          * @return the timezone at this location, expressed in minutes.
 199          */
 200         public static final int parseTimeZoneOffset(final byte[] b, int ptr) {
 201                 final int v = parseBase10(b, ptr, null);
 202                 final int tzMins = v % 100;
 203                 final int tzHours = v / 100;
 204                 return tzHours * 60 + tzMins;
 205         }
 206
 207         /**
 208          * Locate the first position after a given character.
 209          *
 210          * @param b
 211          *            buffer to scan.
 212          * @param ptr
 213          *            position within buffer to start looking for LF at.
 214          * @param chrA
 215          *            character to find.
 216          * @return new position just after chr.
 217          */
 218         public static final int next(final byte[] b, int ptr, final char chrA) {
 219                 final int sz = b.length;
 220                 while (ptr < sz) {
 221                         if (b[ptr] == chrA)
 222                                 return ptr + 1;
 223                         else
 224                                 ptr++;
 225                 }
 226                 return ptr;
 227         }
 228
 229         /**
 230          * Locate the first position after either the given character or LF.
 231          * <p>
 232          * This method stops on the first match it finds from either chrA or '\n'.
 233          *
 234          * @param b
 235          *            buffer to scan.
 236          * @param ptr
 237          *            position within buffer to start looking for LF at.
 238          * @param chrA
 239          *            character to find.
 240          * @return new position just after the first chrA or chrB to be found.
 241          */
 242         public static final int nextLF(final byte[] b, int ptr, final char chrA) {
 243                 final int sz = b.length;
 244                 while (ptr < sz) {
 245                         final byte c = b[ptr];
 246                         if (c == chrA || c == '\n')
 247                                 return ptr + 1;
 248                         else
 249                                 ptr++;
 250                 }
 251                 return ptr;
 252         }
 253
 254         /**
 255          * Locate the "author " header line data.
 256          *
 257          * @param b
 258          *            buffer to scan.
 259          * @param ptr
 260          *            position in buffer to start the scan at. Most callers should
 261          *            pass 0 to ensure the scan starts from the beginning of the
 262          *            commit buffer and does not accidentally look at message body.
 263          * @return position just after the space in "author ", so the first
 264          *         character of the author's name. If no author header can be
 265          *         located -1 is returned.
 266          */
 267         public static final int author(final byte[] b, int ptr) {
 268                 final int sz = b.length;
 269                 if (ptr == 0)
 270                         ptr += 46; // skip the "tree ..." line.
 271                 while (ptr < sz && b[ptr] == 'p')
 272                         ptr += 48; // skip this parent.
 273                 return match(b, ptr, author);
 274         }
 275
 276         /**
 277          * Locate the "committer " header line data.
 278          *
 279          * @param b
 280          *            buffer to scan.
 281          * @param ptr
 282          *            position in buffer to start the scan at. Most callers should
 283          *            pass 0 to ensure the scan starts from the beginning of the
 284          *            commit buffer and does not accidentally look at message body.
 285          * @return position just after the space in "committer ", so the first
 286          *         character of the committer's name. If no committer header can be
 287          *         located -1 is returned.
 288          */
 289         public static final int committer(final byte[] b, int ptr) {
 290                 final int sz = b.length;
 291                 if (ptr == 0)
 292                         ptr += 46; // skip the "tree ..." line.
 293                 while (ptr < sz && b[ptr] == 'p')
 294                         ptr += 48; // skip this parent.
 295                 if (ptr < sz && b[ptr] == 'a')
 296                         ptr = next(b, ptr, '\n');
 297                 return match(b, ptr, committer);
 298         }
 299
 300         /**
 301          * Locate the "encoding " header line.
 302          *
 303          * @param b
 304          *            buffer to scan.
 305          * @param ptr
 306          *            position in buffer to start the scan at. Most callers should
 307          *            pass 0 to ensure the scan starts from the beginning of the
 308          *            buffer and does not accidentally look at the message body.
 309          * @return position just after the space in "encoding ", so the first
 310          *         character of the encoding's name. If no encoding header can be
 311          *         located -1 is returned (and UTF-8 should be assumed).
 312          */
 313         public static final int encoding(final byte[] b, int ptr) {
 314                 final int sz = b.length;
 315                 while (ptr < sz) {
 316                         if (b[ptr] == '\n')
 317                                 return -1;
 318                         if (b[ptr] == 'e')
 319                                 break;
 320                         ptr = next(b, ptr, '\n');
 321                 }
 322                 return match(b, ptr, encoding);
 323         }
 324
 325         /**
 326          * Parse the "encoding " header into a character set reference.
 327          * <p>
 328          * Locates the "encoding " header (if present) by first calling
 329          * {@link #encoding(byte[], int)} and then returns the proper character set
 330          * to apply to this buffer to evaluate its contents as character data.
 331          * <p>
 332          * If no encoding header is present, {@link Constants#CHARSET} is assumed.
 333          *
 334          * @param b
 335          *            buffer to scan.
 336          * @return the Java character set representation. Never null.
 337          */
 338         public static Charset parseEncoding(final byte[] b) {
 339                 final int enc = encoding(b, 0);
 340                 if (enc < 0)
 341                         return Constants.CHARSET;
 342                 final int lf = next(b, enc, '\n');
 343                 return Charset.forName(decode(Constants.CHARSET, b, enc, lf - 1));
 344         }
 345
 346         /**
 347          * Parse a name line (e.g. author, committer, tagger) into a PersonIdent.
 348          * <p>
 349          * When passing in a value for <code>nameB</code> callers should use the
 350          * return value of {@link #author(byte[], int)} or
 351          * {@link #committer(byte[], int)}, as these methods provide the proper
 352          * position within the buffer.
 353          *
 354          * @param raw
 355          *            the buffer to parse character data from.
 356          * @param nameB
 357          *            first position of the identity information. This should be the
 358          *            first position after the space which delimits the header field
 359          *            name (e.g. "author" or "committer") from the rest of the
 360          *            identity line.
 361          * @return the parsed identity. Never null.
 362          */
 363         public static PersonIdent parsePersonIdent(final byte[] raw, final int nameB) {
 364                 final Charset cs = parseEncoding(raw);
 365                 final int emailB = nextLF(raw, nameB, '<');
 366                 final int emailE = nextLF(raw, emailB, '>');
 367
 368                 final String name = decode(cs, raw, nameB, emailB - 2);
 369                 final String email = decode(cs, raw, emailB, emailE - 1);
 370
 371                 final MutableInteger ptrout = new MutableInteger();
 372                 final int when = parseBase10(raw, emailE + 1, ptrout);
 373                 final int tz = parseTimeZoneOffset(raw, ptrout.value);
 374
 375                 return new PersonIdent(name, email, when * 1000L, tz);
 376         }
 377
 378         /**
 379          * Decode a region of the buffer under the specified character set.
 380          *
 381          * @param cs
 382          *            character set to use when decoding the buffer.
 383          * @param buffer
 384          *            buffer to pull raw bytes from.
 385          * @param start
 386          *            first position within the buffer to take data from.
 387          * @param end
 388          *            one position past the last location within the buffer to take
 389          *            data from.
 390          * @return a string representation of the range <code>[start,end)</code>,
 391          *         after decoding the region through the specified character set.
 392          */
 393         public static String decode(final Charset cs, final byte[] buffer,
 394                         final int start, final int end) {
 395                 final ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
 396                 return cs.decode(b).toString();
 397         }
 398
 399         /**
 400          * Locate the position of the commit message body.
 401          *
 402          * @param b
 403          *            buffer to scan.
 404          * @param ptr
 405          *            position in buffer to start the scan at. Most callers should
 406          *            pass 0 to ensure the scan starts from the beginning of the
 407          *            commit buffer.
 408          * @return position of the user's message buffer.
 409          */
 410         public static final int commitMessage(final byte[] b, int ptr) {
 411                 final int sz = b.length;
 412                 if (ptr == 0)
 413                         ptr += 46; // skip the "tree ..." line.
 414                 while (ptr < sz && b[ptr] == 'p')
 415                         ptr += 48; // skip this parent.
 416
 417                 // skip any remaining header lines, ignoring what their actual
 418                 // header line type is.
 419                 //
 420                 while (ptr < sz && b[ptr] != '\n')
 421                         ptr = next(b, ptr, '\n');
 422                 if (ptr < sz && b[ptr] == '\n')
 423                         return ptr + 1;
 424                 return -1;
 425         }
 426
 427         /**
 428          * Locate the end of a paragraph.
 429          * <p>
 430          * A paragraph is ended by two consecutive LF bytes.
 431          *
 432          * @param b
 433          *            buffer to scan.
 434          * @param start
 435          *            position in buffer to start the scan at. Most callers will
 436          *            want to pass the first position of the commit message (as
 437          *            found by {@link #commitMessage(byte[], int)}.
 438          * @return position of the LF at the end of the paragraph;
 439          *         <code>b.length</code> if no paragraph end could be located.
 440          */
 441         public static final int endOfParagraph(final byte[] b, final int start) {
 442                 int ptr = start;
 443                 final int sz = b.length;
 444                 while (ptr < sz && b[ptr] != '\n')
 445                         ptr = next(b, ptr, '\n');
 446                 while (0 < ptr && start < ptr && b[ptr - 1] == '\n')
 447                         ptr--;
 448                 return ptr;
 449         }
 450
 451         private RawParseUtils() {
 452                 // Don't create instances of a static only utility.
 453         }
 454 }