org.spearce.jgit/src/org/spearce/jgit/util/RawParseUtils.java

   1 /*
   2  * Copyright (C) 2008, Shawn O. Pearce <spearce@spearce.org>
   3  *
   4  * All rights reserved.
   5  *
   6  * Redistribution and use in source and binary forms, with or
   7  * without modification, are permitted provided that the following
   8  * conditions are met:
   9  *
  10  * - Redistributions of source code must retain the above copyright
  11  *   notice, this list of conditions and the following disclaimer.
  12  *
  13  * - Redistributions in binary form must reproduce the above
  14  *   copyright notice, this list of conditions and the following
  15  *   disclaimer in the documentation and/or other materials provided
  16  *   with the distribution.
  17  *
  18  * - Neither the name of the Git Development Community nor the
  19  *   names of its contributors may be used to endorse or promote
  20  *   products derived from this software without specific prior
  21  *   written permission.
  22  *
  23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
  24  * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
  25  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  26  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  27  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
  28  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  29  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  30  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  31  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  32  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  33  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  34  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  35  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  36  */
  37
  38 package org.spearce.jgit.util;
  39
  40 import static org.spearce.jgit.lib.ObjectChecker.author;
  41 import static org.spearce.jgit.lib.ObjectChecker.committer;
  42 import static org.spearce.jgit.lib.ObjectChecker.encoding;
  43
  44 import java.nio.ByteBuffer;
  45 import java.nio.charset.CharacterCodingException;
  46 import java.nio.charset.Charset;
  47 import java.nio.charset.CharsetDecoder;
  48 import java.nio.charset.CodingErrorAction;
  49 import java.util.Arrays;
  50
  51 import org.spearce.jgit.lib.Constants;
  52 import org.spearce.jgit.lib.PersonIdent;
  53
  54 /** Handy utility functions to parse raw object contents. */
  55 public final class RawParseUtils {
  56         private static final byte[] digits;
  57
  58         static {
  59                 digits = new byte['9' + 1];
  60                 Arrays.fill(digits, (byte) -1);
  61                 for (char i = '0'; i <= '9'; i++)
  62                         digits[i] = (byte) (i - '0');
  63         }
  64
  65         /**
  66          * Determine if b[ptr] matches src.
  67          *
  68          * @param b
  69          *            the buffer to scan.
  70          * @param ptr
  71          *            first position within b, this should match src[0].
  72          * @param src
  73          *            the buffer to test for equality with b.
  74          * @return ptr += src.length if b[ptr..src.length] == src; else -1.
  75          */
  76         public static final int match(final byte[] b, int ptr, final byte[] src) {
  77                 if (ptr + src.length >= b.length)
  78                         return -1;
  79                 for (int i = 0; i < src.length; i++, ptr++)
  80                         if (b[ptr] != src[i])
  81                                 return -1;
  82                 return ptr;
  83         }
  84
  85         private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5',
  86                         '6', '7', '8', '9' };
  87
  88         /**
  89          * Format a base 10 numeric into a temporary buffer.
  90          * <p>
  91          * Formatting is performed backwards. The method starts at offset
  92          * <code>o-1</code> and ends at <code>o-1-digits</code>, where
  93          * <code>digits</code> is the number of positions necessary to store the
  94          * base 10 value.
  95          * <p>
  96          * The argument and return values from this method make it easy to chain
  97          * writing, for example:
  98          * </p>
  99          *
 100          * <pre>
 101          * final byte[] tmp = new byte[64];
 102          * int ptr = tmp.length;
 103          * tmp[--ptr] = '\n';
 104          * ptr = RawParseUtils.formatBase10(tmp, ptr, 32);
 105          * tmp[--ptr] = ' ';
 106          * ptr = RawParseUtils.formatBase10(tmp, ptr, 18);
 107          * tmp[--ptr] = 0;
 108          * final String str = new String(tmp, ptr, tmp.length - ptr);
 109          * </pre>
 110          *
 111          * @param b
 112          *            buffer to write into.
 113          * @param o
 114          *            one offset past the location where writing will begin; writing
 115          *            proceeds towards lower index values.
 116          * @param value
 117          *            the value to store.
 118          * @return the new offset value <code>o</code>. This is the position of
 119          *         the last byte written. Additional writing should start at one
 120          *         position earlier.
 121          */
 122         public static int formatBase10(final byte[] b, int o, int value) {
 123                 if (value == 0) {
 124                         b[--o] = '0';
 125                         return o;
 126                 }
 127                 final boolean isneg = value < 0;
 128                 while (value != 0) {
 129                         b[--o] = base10byte[value % 10];
 130                         value /= 10;
 131                 }
 132                 if (isneg)
 133                         b[--o] = '-';
 134                 return o;
 135         }
 136
 137         /**
 138          * Parse a base 10 numeric from a sequence of ASCII digits.
 139          * <p>
 140          * Digit sequences can begin with an optional run of spaces before the
 141          * sequence, and may start with a '+' or a '-' to indicate sign position.
 142          * Any other characters will cause the method to stop and return the current
 143          * result to the caller.
 144          *
 145          * @param b
 146          *            buffer to scan.
 147          * @param ptr
 148          *            position within buffer to start parsing digits at.
 149          * @param ptrResult
 150          *            optional location to return the new ptr value through. If null
 151          *            the ptr value will be discarded.
 152          * @return the value at this location; 0 if the location is not a valid
 153          *         numeric.
 154          */
 155         public static final int parseBase10(final byte[] b, int ptr,
 156                         final MutableInteger ptrResult) {
 157                 int r = 0;
 158                 int sign = 0;
 159                 try {
 160                         final int sz = b.length;
 161                         while (ptr < sz && b[ptr] == ' ')
 162                                 ptr++;
 163                         if (ptr >= sz)
 164                                 return 0;
 165
 166                         switch (b[ptr]) {
 167                         case '-':
 168                                 sign = -1;
 169                                 ptr++;
 170                                 break;
 171                         case '+':
 172                                 ptr++;
 173                                 break;
 174                         }
 175
 176                         while (ptr < sz) {
 177                                 final byte v = digits[b[ptr]];
 178                                 if (v < 0)
 179                                         break;
 180                                 r = (r * 10) + v;
 181                                 ptr++;
 182                         }
 183                 } catch (ArrayIndexOutOfBoundsException e) {
 184                         // Not a valid digit.
 185                 }
 186                 if (ptrResult != null)
 187                         ptrResult.value = ptr;
 188                 return sign < 0 ? -r : r;
 189         }
 190
 191         /**
 192          * Parse a Git style timezone string.
 193          * <p>
 194          * The sequence "-0315" will be parsed as the numeric value -195, as the
 195          * lower two positions count minutes, not 100ths of an hour.
 196          *
 197          * @param b
 198          *            buffer to scan.
 199          * @param ptr
 200          *            position within buffer to start parsing digits at.
 201          * @return the timezone at this location, expressed in minutes.
 202          */
 203         public static final int parseTimeZoneOffset(final byte[] b, int ptr) {
 204                 final int v = parseBase10(b, ptr, null);
 205                 final int tzMins = v % 100;
 206                 final int tzHours = v / 100;
 207                 return tzHours * 60 + tzMins;
 208         }
 209
 210         /**
 211          * Locate the first position after a given character.
 212          *
 213          * @param b
 214          *            buffer to scan.
 215          * @param ptr
 216          *            position within buffer to start looking for chrA at.
 217          * @param chrA
 218          *            character to find.
 219          * @return new position just after chrA.
 220          */
 221         public static final int next(final byte[] b, int ptr, final char chrA) {
 222                 final int sz = b.length;
 223                 while (ptr < sz) {
 224                         if (b[ptr++] == chrA)
 225                                 return ptr;
 226                 }
 227                 return ptr;
 228         }
 229
 230         /**
 231          * Locate the first position after the next LF.
 232          * <p>
 233          * This method stops on the first '\n' it finds.
 234          *
 235          * @param b
 236          *            buffer to scan.
 237          * @param ptr
 238          *            position within buffer to start looking for LF at.
 239          * @return new position just after the first LF found.
 240          */
 241         public static final int nextLF(final byte[] b, int ptr) {
 242                 return next(b, ptr, '\n');
 243         }
 244
 245         /**
 246          * Locate the first position after either the given character or LF.
 247          * <p>
 248          * This method stops on the first match it finds from either chrA or '\n'.
 249          *
 250          * @param b
 251          *            buffer to scan.
 252          * @param ptr
 253          *            position within buffer to start looking for chrA or LF at.
 254          * @param chrA
 255          *            character to find.
 256          * @return new position just after the first chrA or LF to be found.
 257          */
 258         public static final int nextLF(final byte[] b, int ptr, final char chrA) {
 259                 final int sz = b.length;
 260                 while (ptr < sz) {
 261                         final byte c = b[ptr++];
 262                         if (c == chrA || c == '\n')
 263                                 return ptr;
 264                 }
 265                 return ptr;
 266         }
 267
 268         /**
 269          * Index the region between <code>[ptr, end)</code> to find line starts.
 270          * <p>
 271          * The returned list is 1 indexed. Index 0 contains
 272          * {@link Integer#MIN_VALUE} to pad the list out.
 273          * <p>
 274          * Using a 1 indexed list means that line numbers can be directly accessed
 275          * from the list, so <code>list.get(1)</code> (aka get line 1) returns
 276          * <code>ptr</code>.
 277          *
 278          * @param buf
 279          *            buffer to scan.
 280          * @param ptr
 281          *            position within the buffer corresponding to the first byte of
 282          *            line 1.
 283          * @param end
 284          *            1 past the end of the content within <code>buf</code>.
 285          * @return a line map indexing the start position of each line.
 286          */
 287         public static final IntList lineMap(final byte[] buf, int ptr, int end) {
 288                 // Experimentally derived from multiple source repositories
 289                 // the average number of bytes/line is 36. Its a rough guess
 290                 // to initially size our map close to the target.
 291                 //
 292                 final IntList map = new IntList((end - ptr) / 36);
 293                 map.fillTo(1, Integer.MIN_VALUE);
 294                 for (; ptr < end; ptr = nextLF(buf, ptr))
 295                         map.add(ptr);
 296                 return map;
 297         }
 298
 299         /**
 300          * Locate the "author " header line data.
 301          *
 302          * @param b
 303          *            buffer to scan.
 304          * @param ptr
 305          *            position in buffer to start the scan at. Most callers should
 306          *            pass 0 to ensure the scan starts from the beginning of the
 307          *            commit buffer and does not accidentally look at message body.
 308          * @return position just after the space in "author ", so the first
 309          *         character of the author's name. If no author header can be
 310          *         located -1 is returned.
 311          */
 312         public static final int author(final byte[] b, int ptr) {
 313                 final int sz = b.length;
 314                 if (ptr == 0)
 315                         ptr += 46; // skip the "tree ..." line.
 316                 while (ptr < sz && b[ptr] == 'p')
 317                         ptr += 48; // skip this parent.
 318                 return match(b, ptr, author);
 319         }
 320
 321         /**
 322          * Locate the "committer " header line data.
 323          *
 324          * @param b
 325          *            buffer to scan.
 326          * @param ptr
 327          *            position in buffer to start the scan at. Most callers should
 328          *            pass 0 to ensure the scan starts from the beginning of the
 329          *            commit buffer and does not accidentally look at message body.
 330          * @return position just after the space in "committer ", so the first
 331          *         character of the committer's name. If no committer header can be
 332          *         located -1 is returned.
 333          */
 334         public static final int committer(final byte[] b, int ptr) {
 335                 final int sz = b.length;
 336                 if (ptr == 0)
 337                         ptr += 46; // skip the "tree ..." line.
 338                 while (ptr < sz && b[ptr] == 'p')
 339                         ptr += 48; // skip this parent.
 340                 if (ptr < sz && b[ptr] == 'a')
 341                         ptr = nextLF(b, ptr);
 342                 return match(b, ptr, committer);
 343         }
 344
 345         /**
 346          * Locate the "encoding " header line.
 347          *
 348          * @param b
 349          *            buffer to scan.
 350          * @param ptr
 351          *            position in buffer to start the scan at. Most callers should
 352          *            pass 0 to ensure the scan starts from the beginning of the
 353          *            buffer and does not accidentally look at the message body.
 354          * @return position just after the space in "encoding ", so the first
 355          *         character of the encoding's name. If no encoding header can be
 356          *         located -1 is returned (and UTF-8 should be assumed).
 357          */
 358         public static final int encoding(final byte[] b, int ptr) {
 359                 final int sz = b.length;
 360                 while (ptr < sz) {
 361                         if (b[ptr] == '\n')
 362                                 return -1;
 363                         if (b[ptr] == 'e')
 364                                 break;
 365                         ptr = nextLF(b, ptr);
 366                 }
 367                 return match(b, ptr, encoding);
 368         }
 369
 370         /**
 371          * Parse the "encoding " header into a character set reference.
 372          * <p>
 373          * Locates the "encoding " header (if present) by first calling
 374          * {@link #encoding(byte[], int)} and then returns the proper character set
 375          * to apply to this buffer to evaluate its contents as character data.
 376          * <p>
 377          * If no encoding header is present, {@link Constants#CHARSET} is assumed.
 378          *
 379          * @param b
 380          *            buffer to scan.
 381          * @return the Java character set representation. Never null.
 382          */
 383         public static Charset parseEncoding(final byte[] b) {
 384                 final int enc = encoding(b, 0);
 385                 if (enc < 0)
 386                         return Constants.CHARSET;
 387                 final int lf = nextLF(b, enc);
 388                 return Charset.forName(decode(Constants.CHARSET, b, enc, lf - 1));
 389         }
 390
 391         /**
 392          * Parse a name line (e.g. author, committer, tagger) into a PersonIdent.
 393          * <p>
 394          * When passing in a value for <code>nameB</code> callers should use the
 395          * return value of {@link #author(byte[], int)} or
 396          * {@link #committer(byte[], int)}, as these methods provide the proper
 397          * position within the buffer.
 398          *
 399          * @param raw
 400          *            the buffer to parse character data from.
 401          * @param nameB
 402          *            first position of the identity information. This should be the
 403          *            first position after the space which delimits the header field
 404          *            name (e.g. "author" or "committer") from the rest of the
 405          *            identity line.
 406          * @return the parsed identity. Never null.
 407          */
 408         public static PersonIdent parsePersonIdent(final byte[] raw, final int nameB) {
 409                 final Charset cs = parseEncoding(raw);
 410                 final int emailB = nextLF(raw, nameB, '<');
 411                 final int emailE = nextLF(raw, emailB, '>');
 412
 413                 final String name = decode(cs, raw, nameB, emailB - 2);
 414                 final String email = decode(cs, raw, emailB, emailE - 1);
 415
 416                 final MutableInteger ptrout = new MutableInteger();
 417                 final int when = parseBase10(raw, emailE + 1, ptrout);
 418                 final int tz = parseTimeZoneOffset(raw, ptrout.value);
 419
 420                 return new PersonIdent(name, email, when * 1000L, tz);
 421         }
 422
 423         /**
 424          * Decode a buffer under UTF-8, if possible.
 425          *
 426          * If the byte stream cannot be decoded that way, the platform default is tried
 427          * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
 428          *
 429          * @param buffer
 430          *            buffer to pull raw bytes from.
 431          * @return a string representation of the range <code>[start,end)</code>,
 432          *         after decoding the region through the specified character set.
 433          */
 434         public static String decode(final byte[] buffer) {
 435                 return decode(Constants.CHARSET, buffer, 0, buffer.length);
 436         }
 437
 438         /**
 439          * Decode a buffer under the specified character set if possible.
 440          *
 441          * If the byte stream cannot be decoded that way, the platform default is tried
 442          * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
 443          *
 444          * @param cs
 445          *            character set to use when decoding the buffer.
 446          * @param buffer
 447          *            buffer to pull raw bytes from.
 448          * @return a string representation of the range <code>[start,end)</code>,
 449          *         after decoding the region through the specified character set.
 450          */
 451         public static String decode(final Charset cs, final byte[] buffer) {
 452                 return decode(cs, buffer, 0, buffer.length);
 453         }
 454
 455         /**
 456          * Decode a region of the buffer under the specified character set if possible.
 457          *
 458          * If the byte stream cannot be decoded that way, the platform default is tried
 459          * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
 460          *
 461          * @param cs
 462          *            character set to use when decoding the buffer.
 463          * @param buffer
 464          *            buffer to pull raw bytes from.
 465          * @param start
 466          *            first position within the buffer to take data from.
 467          * @param end
 468          *            one position past the last location within the buffer to take
 469          *            data from.
 470          * @return a string representation of the range <code>[start,end)</code>,
 471          *         after decoding the region through the specified character set.
 472          */
 473         public static String decode(final Charset cs, final byte[] buffer,
 474                         final int start, final int end) {
 475                 final ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
 476                 b.mark();
 477
 478                 // Try our built-in favorite. The assumption here is that
 479                 // decoding will fail if the data is not actually encoded
 480                 // using that encoder.
 481                 //
 482                 try {
 483                         return decode(b, Constants.CHARSET);
 484                 } catch (CharacterCodingException e) {
 485                         b.reset();
 486                 }
 487
 488                 if (!cs.equals(Constants.CHARSET)) {
 489                         // Try the suggested encoding, it might be right since it was
 490                         // provided by the caller.
 491                         //
 492                         try {
 493                                 return decode(b, cs);
 494                         } catch (CharacterCodingException e) {
 495                                 b.reset();
 496                         }
 497                 }
 498
 499                 // Try the default character set. A small group of people
 500                 // might actually use the same (or very similar) locale.
 501                 //
 502                 final Charset defcs = Charset.defaultCharset();
 503                 if (!defcs.equals(cs) && !defcs.equals(Constants.CHARSET)) {
 504                         try {
 505                                 return decode(b, defcs);
 506                         } catch (CharacterCodingException e) {
 507                                 b.reset();
 508                         }
 509                 }
 510
 511                 // Fall back to an ISO-8859-1 style encoding. At least all of
 512                 // the bytes will be present in the output.
 513                 //
 514                 final StringBuilder r = new StringBuilder(end - start);
 515                 for (int i = start; i < end; i++)
 516                         r.append((char) (buffer[i] & 0xff));
 517                 return r.toString();
 518         }
 519
 520         private static String decode(final ByteBuffer b, final Charset charset)
 521                         throws CharacterCodingException {
 522                 final CharsetDecoder d = charset.newDecoder();
 523                 d.onMalformedInput(CodingErrorAction.REPORT);
 524                 d.onUnmappableCharacter(CodingErrorAction.REPORT);
 525                 return d.decode(b).toString();
 526         }
 527
 528         /**
 529          * Locate the position of the commit message body.
 530          *
 531          * @param b
 532          *            buffer to scan.
 533          * @param ptr
 534          *            position in buffer to start the scan at. Most callers should
 535          *            pass 0 to ensure the scan starts from the beginning of the
 536          *            commit buffer.
 537          * @return position of the user's message buffer.
 538          */
 539         public static final int commitMessage(final byte[] b, int ptr) {
 540                 final int sz = b.length;
 541                 if (ptr == 0)
 542                         ptr += 46; // skip the "tree ..." line.
 543                 while (ptr < sz && b[ptr] == 'p')
 544                         ptr += 48; // skip this parent.
 545
 546                 // skip any remaining header lines, ignoring what their actual
 547                 // header line type is.
 548                 //
 549                 while (ptr < sz && b[ptr] != '\n')
 550                         ptr = nextLF(b, ptr);
 551                 if (ptr < sz && b[ptr] == '\n')
 552                         return ptr + 1;
 553                 return -1;
 554         }
 555
 556         /**
 557          * Locate the end of a paragraph.
 558          * <p>
 559          * A paragraph is ended by two consecutive LF bytes.
 560          *
 561          * @param b
 562          *            buffer to scan.
 563          * @param start
 564          *            position in buffer to start the scan at. Most callers will
 565          *            want to pass the first position of the commit message (as
 566          *            found by {@link #commitMessage(byte[], int)}.
 567          * @return position of the LF at the end of the paragraph;
 568          *         <code>b.length</code> if no paragraph end could be located.
 569          */
 570         public static final int endOfParagraph(final byte[] b, final int start) {
 571                 int ptr = start;
 572                 final int sz = b.length;
 573                 while (ptr < sz && b[ptr] != '\n')
 574                         ptr = nextLF(b, ptr);
 575                 while (0 < ptr && start < ptr && b[ptr - 1] == '\n')
 576                         ptr--;
 577                 return ptr;
 578         }
 579
 580         private RawParseUtils() {
 581                 // Don't create instances of a static only utility.
 582         }
 583 }