org.spearce.jgit/src/org/spearce/jgit/util/RawParseUtils.java

   1 /*
   2  * Copyright (C) 2008, Shawn O. Pearce <spearce@spearce.org>
   3  *
   4  * All rights reserved.
   5  *
   6  * Redistribution and use in source and binary forms, with or
   7  * without modification, are permitted provided that the following
   8  * conditions are met:
   9  *
  10  * - Redistributions of source code must retain the above copyright
  11  *   notice, this list of conditions and the following disclaimer.
  12  *
  13  * - Redistributions in binary form must reproduce the above
  14  *   copyright notice, this list of conditions and the following
  15  *   disclaimer in the documentation and/or other materials provided
  16  *   with the distribution.
  17  *
  18  * - Neither the name of the Git Development Community nor the
  19  *   names of its contributors may be used to endorse or promote
  20  *   products derived from this software without specific prior
  21  *   written permission.
  22  *
  23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
  24  * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
  25  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  26  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  27  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
  28  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  29  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  30  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  31  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  32  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  33  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  34  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  35  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  36  */
  37
  38 package org.spearce.jgit.util;
  39
  40 import static org.spearce.jgit.lib.ObjectChecker.author;
  41 import static org.spearce.jgit.lib.ObjectChecker.committer;
  42 import static org.spearce.jgit.lib.ObjectChecker.encoding;
  43 import static org.spearce.jgit.lib.ObjectChecker.tagger;
  44
  45 import java.nio.ByteBuffer;
  46 import java.nio.charset.CharacterCodingException;
  47 import java.nio.charset.Charset;
  48 import java.nio.charset.CharsetDecoder;
  49 import java.nio.charset.CodingErrorAction;
  50 import java.util.Arrays;
  51
  52 import org.spearce.jgit.lib.Constants;
  53 import org.spearce.jgit.lib.PersonIdent;
  54
  55 /** Handy utility functions to parse raw object contents. */
  56 public final class RawParseUtils {
  57         private static final byte[] digits10;
  58
  59         private static final byte[] digits16;
  60
  61         static {
  62                 digits10 = new byte['9' + 1];
  63                 Arrays.fill(digits10, (byte) -1);
  64                 for (char i = '0'; i <= '9'; i++)
  65                         digits10[i] = (byte) (i - '0');
  66
  67                 digits16 = new byte['f' + 1];
  68                 Arrays.fill(digits16, (byte) -1);
  69                 for (char i = '0'; i <= '9'; i++)
  70                         digits16[i] = (byte) (i - '0');
  71                 for (char i = 'a'; i <= 'f'; i++)
  72                         digits16[i] = (byte) ((i - 'a') + 10);
  73                 for (char i = 'A'; i <= 'F'; i++)
  74                         digits16[i] = (byte) ((i - 'A') + 10);
  75         }
  76
  77         /**
  78          * Determine if b[ptr] matches src.
  79          *
  80          * @param b
  81          *            the buffer to scan.
  82          * @param ptr
  83          *            first position within b, this should match src[0].
  84          * @param src
  85          *            the buffer to test for equality with b.
  86          * @return ptr + src.length if b[ptr..src.length] == src; else -1.
  87          */
  88         public static final int match(final byte[] b, int ptr, final byte[] src) {
  89                 if (ptr + src.length > b.length)
  90                         return -1;
  91                 for (int i = 0; i < src.length; i++, ptr++)
  92                         if (b[ptr] != src[i])
  93                                 return -1;
  94                 return ptr;
  95         }
  96
  97         private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5',
  98                         '6', '7', '8', '9' };
  99
 100         /**
 101          * Format a base 10 numeric into a temporary buffer.
 102          * <p>
 103          * Formatting is performed backwards. The method starts at offset
 104          * <code>o-1</code> and ends at <code>o-1-digits</code>, where
 105          * <code>digits</code> is the number of positions necessary to store the
 106          * base 10 value.
 107          * <p>
 108          * The argument and return values from this method make it easy to chain
 109          * writing, for example:
 110          * </p>
 111          *
 112          * <pre>
 113          * final byte[] tmp = new byte[64];
 114          * int ptr = tmp.length;
 115          * tmp[--ptr] = '\n';
 116          * ptr = RawParseUtils.formatBase10(tmp, ptr, 32);
 117          * tmp[--ptr] = ' ';
 118          * ptr = RawParseUtils.formatBase10(tmp, ptr, 18);
 119          * tmp[--ptr] = 0;
 120          * final String str = new String(tmp, ptr, tmp.length - ptr);
 121          * </pre>
 122          *
 123          * @param b
 124          *            buffer to write into.
 125          * @param o
 126          *            one offset past the location where writing will begin; writing
 127          *            proceeds towards lower index values.
 128          * @param value
 129          *            the value to store.
 130          * @return the new offset value <code>o</code>. This is the position of
 131          *         the last byte written. Additional writing should start at one
 132          *         position earlier.
 133          */
 134         public static int formatBase10(final byte[] b, int o, int value) {
 135                 if (value == 0) {
 136                         b[--o] = '0';
 137                         return o;
 138                 }
 139                 final boolean isneg = value < 0;
 140                 while (value != 0) {
 141                         b[--o] = base10byte[value % 10];
 142                         value /= 10;
 143                 }
 144                 if (isneg)
 145                         b[--o] = '-';
 146                 return o;
 147         }
 148
 149         /**
 150          * Parse a base 10 numeric from a sequence of ASCII digits into an int.
 151          * <p>
 152          * Digit sequences can begin with an optional run of spaces before the
 153          * sequence, and may start with a '+' or a '-' to indicate sign position.
 154          * Any other characters will cause the method to stop and return the current
 155          * result to the caller.
 156          *
 157          * @param b
 158          *            buffer to scan.
 159          * @param ptr
 160          *            position within buffer to start parsing digits at.
 161          * @param ptrResult
 162          *            optional location to return the new ptr value through. If null
 163          *            the ptr value will be discarded.
 164          * @return the value at this location; 0 if the location is not a valid
 165          *         numeric.
 166          */
 167         public static final int parseBase10(final byte[] b, int ptr,
 168                         final MutableInteger ptrResult) {
 169                 int r = 0;
 170                 int sign = 0;
 171                 try {
 172                         final int sz = b.length;
 173                         while (ptr < sz && b[ptr] == ' ')
 174                                 ptr++;
 175                         if (ptr >= sz)
 176                                 return 0;
 177
 178                         switch (b[ptr]) {
 179                         case '-':
 180                                 sign = -1;
 181                                 ptr++;
 182                                 break;
 183                         case '+':
 184                                 ptr++;
 185                                 break;
 186                         }
 187
 188                         while (ptr < sz) {
 189                                 final byte v = digits10[b[ptr]];
 190                                 if (v < 0)
 191                                         break;
 192                                 r = (r * 10) + v;
 193                                 ptr++;
 194                         }
 195                 } catch (ArrayIndexOutOfBoundsException e) {
 196                         // Not a valid digit.
 197                 }
 198                 if (ptrResult != null)
 199                         ptrResult.value = ptr;
 200                 return sign < 0 ? -r : r;
 201         }
 202
 203         /**
 204          * Parse a base 10 numeric from a sequence of ASCII digits into a long.
 205          * <p>
 206          * Digit sequences can begin with an optional run of spaces before the
 207          * sequence, and may start with a '+' or a '-' to indicate sign position.
 208          * Any other characters will cause the method to stop and return the current
 209          * result to the caller.
 210          *
 211          * @param b
 212          *            buffer to scan.
 213          * @param ptr
 214          *            position within buffer to start parsing digits at.
 215          * @param ptrResult
 216          *            optional location to return the new ptr value through. If null
 217          *            the ptr value will be discarded.
 218          * @return the value at this location; 0 if the location is not a valid
 219          *         numeric.
 220          */
 221         public static final long parseLongBase10(final byte[] b, int ptr,
 222                         final MutableInteger ptrResult) {
 223                 long r = 0;
 224                 int sign = 0;
 225                 try {
 226                         final int sz = b.length;
 227                         while (ptr < sz && b[ptr] == ' ')
 228                                 ptr++;
 229                         if (ptr >= sz)
 230                                 return 0;
 231
 232                         switch (b[ptr]) {
 233                         case '-':
 234                                 sign = -1;
 235                                 ptr++;
 236                                 break;
 237                         case '+':
 238                                 ptr++;
 239                                 break;
 240                         }
 241
 242                         while (ptr < sz) {
 243                                 final byte v = digits10[b[ptr]];
 244                                 if (v < 0)
 245                                         break;
 246                                 r = (r * 10) + v;
 247                                 ptr++;
 248                         }
 249                 } catch (ArrayIndexOutOfBoundsException e) {
 250                         // Not a valid digit.
 251                 }
 252                 if (ptrResult != null)
 253                         ptrResult.value = ptr;
 254                 return sign < 0 ? -r : r;
 255         }
 256
 257         /**
 258          * Parse 4 character base 16 (hex) formatted string to unsigned integer.
 259          * <p>
 260          * The number is read in network byte order, that is, most significant
 261          * nybble first.
 262          *
 263          * @param bs
 264          *            buffer to parse digits from; positions {@code [p, p+4)} will
 265          *            be parsed.
 266          * @param p
 267          *            first position within the buffer to parse.
 268          * @return the integer value.
 269          * @throws ArrayIndexOutOfBoundsException
 270          *             if the string is not hex formatted.
 271          */
 272         public static final int parseHexInt16(final byte[] bs, final int p) {
 273                 int r = digits16[bs[p]] << 4;
 274
 275                 r |= digits16[bs[p + 1]];
 276                 r <<= 4;
 277
 278                 r |= digits16[bs[p + 2]];
 279                 r <<= 4;
 280
 281                 r |= digits16[bs[p + 3]];
 282                 if (r < 0)
 283                         throw new ArrayIndexOutOfBoundsException();
 284                 return r;
 285         }
 286
 287         /**
 288          * Parse 8 character base 16 (hex) formatted string to unsigned integer.
 289          * <p>
 290          * The number is read in network byte order, that is, most significant
 291          * nybble first.
 292          *
 293          * @param bs
 294          *            buffer to parse digits from; positions {@code [p, p+8)} will
 295          *            be parsed.
 296          * @param p
 297          *            first position within the buffer to parse.
 298          * @return the integer value.
 299          * @throws ArrayIndexOutOfBoundsException
 300          *             if the string is not hex formatted.
 301          */
 302         public static final int parseHexInt32(final byte[] bs, final int p) {
 303                 int r = digits16[bs[p]] << 4;
 304
 305                 r |= digits16[bs[p + 1]];
 306                 r <<= 4;
 307
 308                 r |= digits16[bs[p + 2]];
 309                 r <<= 4;
 310
 311                 r |= digits16[bs[p + 3]];
 312                 r <<= 4;
 313
 314                 r |= digits16[bs[p + 4]];
 315                 r <<= 4;
 316
 317                 r |= digits16[bs[p + 5]];
 318                 r <<= 4;
 319
 320                 r |= digits16[bs[p + 6]];
 321
 322                 final int last = digits16[bs[p + 7]];
 323                 if (r < 0 || last < 0)
 324                         throw new ArrayIndexOutOfBoundsException();
 325                 return (r << 4) | last;
 326         }
 327
 328         /**
 329          * Parse a single hex digit to its numeric value (0-15).
 330          *
 331          * @param digit
 332          *            hex character to parse.
 333          * @return numeric value, in the range 0-15.
 334          * @throws ArrayIndexOutOfBoundsException
 335          *             if the input digit is not a valid hex digit.
 336          */
 337         public static final int parseHexInt4(final byte digit) {
 338                 final byte r = digits16[digit];
 339                 if (r < 0)
 340                         throw new ArrayIndexOutOfBoundsException();
 341                 return r;
 342         }
 343
 344         /**
 345          * Parse a Git style timezone string.
 346          * <p>
 347          * The sequence "-0315" will be parsed as the numeric value -195, as the
 348          * lower two positions count minutes, not 100ths of an hour.
 349          *
 350          * @param b
 351          *            buffer to scan.
 352          * @param ptr
 353          *            position within buffer to start parsing digits at.
 354          * @return the timezone at this location, expressed in minutes.
 355          */
 356         public static final int parseTimeZoneOffset(final byte[] b, int ptr) {
 357                 final int v = parseBase10(b, ptr, null);
 358                 final int tzMins = v % 100;
 359                 final int tzHours = v / 100;
 360                 return tzHours * 60 + tzMins;
 361         }
 362
 363         /**
 364          * Locate the first position after a given character.
 365          *
 366          * @param b
 367          *            buffer to scan.
 368          * @param ptr
 369          *            position within buffer to start looking for chrA at.
 370          * @param chrA
 371          *            character to find.
 372          * @return new position just after chrA.
 373          */
 374         public static final int next(final byte[] b, int ptr, final char chrA) {
 375                 final int sz = b.length;
 376                 while (ptr < sz) {
 377                         if (b[ptr++] == chrA)
 378                                 return ptr;
 379                 }
 380                 return ptr;
 381         }
 382
 383         /**
 384          * Locate the first position after the next LF.
 385          * <p>
 386          * This method stops on the first '\n' it finds.
 387          *
 388          * @param b
 389          *            buffer to scan.
 390          * @param ptr
 391          *            position within buffer to start looking for LF at.
 392          * @return new position just after the first LF found.
 393          */
 394         public static final int nextLF(final byte[] b, int ptr) {
 395                 return next(b, ptr, '\n');
 396         }
 397
 398         /**
 399          * Locate the first position after either the given character or LF.
 400          * <p>
 401          * This method stops on the first match it finds from either chrA or '\n'.
 402          *
 403          * @param b
 404          *            buffer to scan.
 405          * @param ptr
 406          *            position within buffer to start looking for chrA or LF at.
 407          * @param chrA
 408          *            character to find.
 409          * @return new position just after the first chrA or LF to be found.
 410          */
 411         public static final int nextLF(final byte[] b, int ptr, final char chrA) {
 412                 final int sz = b.length;
 413                 while (ptr < sz) {
 414                         final byte c = b[ptr++];
 415                         if (c == chrA || c == '\n')
 416                                 return ptr;
 417                 }
 418                 return ptr;
 419         }
 420
 421         /**
 422          * Index the region between <code>[ptr, end)</code> to find line starts.
 423          * <p>
 424          * The returned list is 1 indexed. Index 0 contains
 425          * {@link Integer#MIN_VALUE} to pad the list out.
 426          * <p>
 427          * Using a 1 indexed list means that line numbers can be directly accessed
 428          * from the list, so <code>list.get(1)</code> (aka get line 1) returns
 429          * <code>ptr</code>.
 430          * <p>
 431          * The last element (index <code>map.size()-1</code>) always contains
 432          * <code>end</code>.
 433          *
 434          * @param buf
 435          *            buffer to scan.
 436          * @param ptr
 437          *            position within the buffer corresponding to the first byte of
 438          *            line 1.
 439          * @param end
 440          *            1 past the end of the content within <code>buf</code>.
 441          * @return a line map indexing the start position of each line.
 442          */
 443         public static final IntList lineMap(final byte[] buf, int ptr, int end) {
 444                 // Experimentally derived from multiple source repositories
 445                 // the average number of bytes/line is 36. Its a rough guess
 446                 // to initially size our map close to the target.
 447                 //
 448                 final IntList map = new IntList((end - ptr) / 36);
 449                 map.fillTo(1, Integer.MIN_VALUE);
 450                 for (; ptr < end; ptr = nextLF(buf, ptr))
 451                         map.add(ptr);
 452                 map.add(end);
 453                 return map;
 454         }
 455
 456         /**
 457          * Locate the "author " header line data.
 458          *
 459          * @param b
 460          *            buffer to scan.
 461          * @param ptr
 462          *            position in buffer to start the scan at. Most callers should
 463          *            pass 0 to ensure the scan starts from the beginning of the
 464          *            commit buffer and does not accidentally look at message body.
 465          * @return position just after the space in "author ", so the first
 466          *         character of the author's name. If no author header can be
 467          *         located -1 is returned.
 468          */
 469         public static final int author(final byte[] b, int ptr) {
 470                 final int sz = b.length;
 471                 if (ptr == 0)
 472                         ptr += 46; // skip the "tree ..." line.
 473                 while (ptr < sz && b[ptr] == 'p')
 474                         ptr += 48; // skip this parent.
 475                 return match(b, ptr, author);
 476         }
 477
 478         /**
 479          * Locate the "committer " header line data.
 480          *
 481          * @param b
 482          *            buffer to scan.
 483          * @param ptr
 484          *            position in buffer to start the scan at. Most callers should
 485          *            pass 0 to ensure the scan starts from the beginning of the
 486          *            commit buffer and does not accidentally look at message body.
 487          * @return position just after the space in "committer ", so the first
 488          *         character of the committer's name. If no committer header can be
 489          *         located -1 is returned.
 490          */
 491         public static final int committer(final byte[] b, int ptr) {
 492                 final int sz = b.length;
 493                 if (ptr == 0)
 494                         ptr += 46; // skip the "tree ..." line.
 495                 while (ptr < sz && b[ptr] == 'p')
 496                         ptr += 48; // skip this parent.
 497                 if (ptr < sz && b[ptr] == 'a')
 498                         ptr = nextLF(b, ptr);
 499                 return match(b, ptr, committer);
 500         }
 501
 502         /**
 503          * Locate the "tagger " header line data.
 504          *
 505          * @param b
 506          *            buffer to scan.
 507          * @param ptr
 508          *            position in buffer to start the scan at. Most callers should
 509          *            pass 0 to ensure the scan starts from the beginning of the tag
 510          *            buffer and does not accidentally look at message body.
 511          * @return position just after the space in "tagger ", so the first
 512          *         character of the tagger's name. If no tagger header can be
 513          *         located -1 is returned.
 514          */
 515         public static final int tagger(final byte[] b, int ptr) {
 516                 final int sz = b.length;
 517                 if (ptr == 0)
 518                         ptr += 48; // skip the "object ..." line.
 519                 while (ptr < sz) {
 520                         if (b[ptr] == '\n')
 521                                 return -1;
 522                         final int m = match(b, ptr, tagger);
 523                         if (m >= 0)
 524                                 return m;
 525                         ptr = nextLF(b, ptr);
 526                 }
 527                 return -1;
 528         }
 529
 530         /**
 531          * Locate the "encoding " header line.
 532          *
 533          * @param b
 534          *            buffer to scan.
 535          * @param ptr
 536          *            position in buffer to start the scan at. Most callers should
 537          *            pass 0 to ensure the scan starts from the beginning of the
 538          *            buffer and does not accidentally look at the message body.
 539          * @return position just after the space in "encoding ", so the first
 540          *         character of the encoding's name. If no encoding header can be
 541          *         located -1 is returned (and UTF-8 should be assumed).
 542          */
 543         public static final int encoding(final byte[] b, int ptr) {
 544                 final int sz = b.length;
 545                 while (ptr < sz) {
 546                         if (b[ptr] == '\n')
 547                                 return -1;
 548                         if (b[ptr] == 'e')
 549                                 break;
 550                         ptr = nextLF(b, ptr);
 551                 }
 552                 return match(b, ptr, encoding);
 553         }
 554
 555         /**
 556          * Parse the "encoding " header into a character set reference.
 557          * <p>
 558          * Locates the "encoding " header (if present) by first calling
 559          * {@link #encoding(byte[], int)} and then returns the proper character set
 560          * to apply to this buffer to evaluate its contents as character data.
 561          * <p>
 562          * If no encoding header is present, {@link Constants#CHARSET} is assumed.
 563          *
 564          * @param b
 565          *            buffer to scan.
 566          * @return the Java character set representation. Never null.
 567          */
 568         public static Charset parseEncoding(final byte[] b) {
 569                 final int enc = encoding(b, 0);
 570                 if (enc < 0)
 571                         return Constants.CHARSET;
 572                 final int lf = nextLF(b, enc);
 573                 return Charset.forName(decode(Constants.CHARSET, b, enc, lf - 1));
 574         }
 575
 576         /**
 577          * Parse a name line (e.g. author, committer, tagger) into a PersonIdent.
 578          * <p>
 579          * When passing in a value for <code>nameB</code> callers should use the
 580          * return value of {@link #author(byte[], int)} or
 581          * {@link #committer(byte[], int)}, as these methods provide the proper
 582          * position within the buffer.
 583          *
 584          * @param raw
 585          *            the buffer to parse character data from.
 586          * @param nameB
 587          *            first position of the identity information. This should be the
 588          *            first position after the space which delimits the header field
 589          *            name (e.g. "author" or "committer") from the rest of the
 590          *            identity line.
 591          * @return the parsed identity. Never null.
 592          */
 593         public static PersonIdent parsePersonIdent(final byte[] raw, final int nameB) {
 594                 final Charset cs = parseEncoding(raw);
 595                 final int emailB = nextLF(raw, nameB, '<');
 596                 final int emailE = nextLF(raw, emailB, '>');
 597
 598                 final String name = decode(cs, raw, nameB, emailB - 2);
 599                 final String email = decode(cs, raw, emailB, emailE - 1);
 600
 601                 final MutableInteger ptrout = new MutableInteger();
 602                 final long when = parseLongBase10(raw, emailE + 1, ptrout);
 603                 final int tz = parseTimeZoneOffset(raw, ptrout.value);
 604
 605                 return new PersonIdent(name, email, when * 1000L, tz);
 606         }
 607
 608         /**
 609          * Decode a buffer under UTF-8, if possible.
 610          *
 611          * If the byte stream cannot be decoded that way, the platform default is tried
 612          * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
 613          *
 614          * @param buffer
 615          *            buffer to pull raw bytes from.
 616          * @return a string representation of the range <code>[start,end)</code>,
 617          *         after decoding the region through the specified character set.
 618          */
 619         public static String decode(final byte[] buffer) {
 620                 return decode(Constants.CHARSET, buffer, 0, buffer.length);
 621         }
 622
 623         /**
 624          * Decode a buffer under the specified character set if possible.
 625          *
 626          * If the byte stream cannot be decoded that way, the platform default is tried
 627          * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
 628          *
 629          * @param cs
 630          *            character set to use when decoding the buffer.
 631          * @param buffer
 632          *            buffer to pull raw bytes from.
 633          * @return a string representation of the range <code>[start,end)</code>,
 634          *         after decoding the region through the specified character set.
 635          */
 636         public static String decode(final Charset cs, final byte[] buffer) {
 637                 return decode(cs, buffer, 0, buffer.length);
 638         }
 639
 640         /**
 641          * Decode a region of the buffer under the specified character set if possible.
 642          *
 643          * If the byte stream cannot be decoded that way, the platform default is tried
 644          * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
 645          *
 646          * @param cs
 647          *            character set to use when decoding the buffer.
 648          * @param buffer
 649          *            buffer to pull raw bytes from.
 650          * @param start
 651          *            first position within the buffer to take data from.
 652          * @param end
 653          *            one position past the last location within the buffer to take
 654          *            data from.
 655          * @return a string representation of the range <code>[start,end)</code>,
 656          *         after decoding the region through the specified character set.
 657          */
 658         public static String decode(final Charset cs, final byte[] buffer,
 659                         final int start, final int end) {
 660                 try {
 661                         return decodeNoFallback(cs, buffer, start, end);
 662                 } catch (CharacterCodingException e) {
 663                         // Fall back to an ISO-8859-1 style encoding. At least all of
 664                         // the bytes will be present in the output.
 665                         //
 666                         return extractBinaryString(buffer, start, end);
 667                 }
 668         }
 669
 670         /**
 671          * Decode a region of the buffer under the specified character set if
 672          * possible.
 673          *
 674          * If the byte stream cannot be decoded that way, the platform default is
 675          * tried and if that too fails, an exception is thrown.
 676          *
 677          * @param cs
 678          *            character set to use when decoding the buffer.
 679          * @param buffer
 680          *            buffer to pull raw bytes from.
 681          * @param start
 682          *            first position within the buffer to take data from.
 683          * @param end
 684          *            one position past the last location within the buffer to take
 685          *            data from.
 686          * @return a string representation of the range <code>[start,end)</code>,
 687          *         after decoding the region through the specified character set.
 688          * @throws CharacterCodingException
 689          *             the input is not in any of the tested character sets.
 690          */
 691         public static String decodeNoFallback(final Charset cs,
 692                         final byte[] buffer, final int start, final int end)
 693                         throws CharacterCodingException {
 694                 final ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
 695                 b.mark();
 696
 697                 // Try our built-in favorite. The assumption here is that
 698                 // decoding will fail if the data is not actually encoded
 699                 // using that encoder.
 700                 //
 701                 try {
 702                         return decode(b, Constants.CHARSET);
 703                 } catch (CharacterCodingException e) {
 704                         b.reset();
 705                 }
 706
 707                 if (!cs.equals(Constants.CHARSET)) {
 708                         // Try the suggested encoding, it might be right since it was
 709                         // provided by the caller.
 710                         //
 711                         try {
 712                                 return decode(b, cs);
 713                         } catch (CharacterCodingException e) {
 714                                 b.reset();
 715                         }
 716                 }
 717
 718                 // Try the default character set. A small group of people
 719                 // might actually use the same (or very similar) locale.
 720                 //
 721                 final Charset defcs = Charset.defaultCharset();
 722                 if (!defcs.equals(cs) && !defcs.equals(Constants.CHARSET)) {
 723                         try {
 724                                 return decode(b, defcs);
 725                         } catch (CharacterCodingException e) {
 726                                 b.reset();
 727                         }
 728                 }
 729
 730                 throw new CharacterCodingException();
 731         }
 732
 733         /**
 734          * Decode a region of the buffer under the ISO-8859-1 encoding.
 735          *
 736          * Each byte is treated as a single character in the 8859-1 character
 737          * encoding, performing a raw binary->char conversion.
 738          *
 739          * @param buffer
 740          *            buffer to pull raw bytes from.
 741          * @param start
 742          *            first position within the buffer to take data from.
 743          * @param end
 744          *            one position past the last location within the buffer to take
 745          *            data from.
 746          * @return a string representation of the range <code>[start,end)</code>.
 747          */
 748         public static String extractBinaryString(final byte[] buffer,
 749                         final int start, final int end) {
 750                 final StringBuilder r = new StringBuilder(end - start);
 751                 for (int i = start; i < end; i++)
 752                         r.append((char) (buffer[i] & 0xff));
 753                 return r.toString();
 754         }
 755
 756         private static String decode(final ByteBuffer b, final Charset charset)
 757                         throws CharacterCodingException {
 758                 final CharsetDecoder d = charset.newDecoder();
 759                 d.onMalformedInput(CodingErrorAction.REPORT);
 760                 d.onUnmappableCharacter(CodingErrorAction.REPORT);
 761                 return d.decode(b).toString();
 762         }
 763
 764         /**
 765          * Locate the position of the commit message body.
 766          *
 767          * @param b
 768          *            buffer to scan.
 769          * @param ptr
 770          *            position in buffer to start the scan at. Most callers should
 771          *            pass 0 to ensure the scan starts from the beginning of the
 772          *            commit buffer.
 773          * @return position of the user's message buffer.
 774          */
 775         public static final int commitMessage(final byte[] b, int ptr) {
 776                 final int sz = b.length;
 777                 if (ptr == 0)
 778                         ptr += 46; // skip the "tree ..." line.
 779                 while (ptr < sz && b[ptr] == 'p')
 780                         ptr += 48; // skip this parent.
 781
 782                 // Skip any remaining header lines, ignoring what their actual
 783                 // header line type is. This is identical to the logic for a tag.
 784                 //
 785                 return tagMessage(b, ptr);
 786         }
 787
 788         /**
 789          * Locate the position of the tag message body.
 790          *
 791          * @param b
 792          *            buffer to scan.
 793          * @param ptr
 794          *            position in buffer to start the scan at. Most callers should
 795          *            pass 0 to ensure the scan starts from the beginning of the tag
 796          *            buffer.
 797          * @return position of the user's message buffer.
 798          */
 799         public static final int tagMessage(final byte[] b, int ptr) {
 800                 final int sz = b.length;
 801                 if (ptr == 0)
 802                         ptr += 48; // skip the "object ..." line.
 803                 while (ptr < sz && b[ptr] != '\n')
 804                         ptr = nextLF(b, ptr);
 805                 if (ptr < sz && b[ptr] == '\n')
 806                         return ptr + 1;
 807                 return -1;
 808         }
 809
 810         /**
 811          * Locate the end of a paragraph.
 812          * <p>
 813          * A paragraph is ended by two consecutive LF bytes.
 814          *
 815          * @param b
 816          *            buffer to scan.
 817          * @param start
 818          *            position in buffer to start the scan at. Most callers will
 819          *            want to pass the first position of the commit message (as
 820          *            found by {@link #commitMessage(byte[], int)}.
 821          * @return position of the LF at the end of the paragraph;
 822          *         <code>b.length</code> if no paragraph end could be located.
 823          */
 824         public static final int endOfParagraph(final byte[] b, final int start) {
 825                 int ptr = start;
 826                 final int sz = b.length;
 827                 while (ptr < sz && b[ptr] != '\n')
 828                         ptr = nextLF(b, ptr);
 829                 while (0 < ptr && start < ptr && b[ptr - 1] == '\n')
 830                         ptr--;
 831                 return ptr;
 832         }
 833
        private RawParseUtils() {
                // Static-only utility class: the constructor is private so that
                // no instances can be created.
        }
 837 }