org.spearce.jgit/src/org/spearce/jgit/util/RawParseUtils.java

   1 /*
   2  *  Copyright (C) 2008  Shawn Pearce <spearce@spearce.org>
   3  *
   4  *  This library is free software; you can redistribute it and/or
   5  *  modify it under the terms of the GNU General Public
   6  *  License, version 2, as published by the Free Software Foundation.
   7  *
   8  *  This library is distributed in the hope that it will be useful,
   9  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  10  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11  *  General Public License for more details.
  12  *
  13  *  You should have received a copy of the GNU General Public
  14  *  License along with this library; if not, write to the Free Software
  15  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301
  16  */
  17 package org.spearce.jgit.util;
  18
  19 import java.nio.ByteBuffer;
  20 import java.nio.charset.Charset;
  21 import java.util.Arrays;
  22
  23 import org.spearce.jgit.lib.Constants;
  24 import org.spearce.jgit.lib.PersonIdent;
  25
  26 /** Handy utility functions to parse raw object contents. */
  27 public final class RawParseUtils {
  28         private static final byte[] author = Constants.encodeASCII("author ");
  29
  30         private static final byte[] committer = Constants.encodeASCII("committer ");
  31
  32         private static final byte[] encoding = Constants.encodeASCII("encoding ");
  33
  34         private static final byte[] digits;
  35
  36         static {
  37                 digits = new byte['9' + 1];
  38                 Arrays.fill(digits, (byte) -1);
  39                 for (char i = '0'; i <= '9'; i++)
  40                         digits[i] = (byte) (i - '0');
  41         }
  42
  43         private static final int match(final byte[] b, int ptr, final byte[] src) {
  44                 if (ptr + src.length >= b.length)
  45                         return -1;
  46                 for (int i = 0; i < src.length; i++, ptr++)
  47                         if (b[ptr] != src[i])
  48                                 return -1;
  49                 return ptr;
  50         }
  51
  52         private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5',
  53                         '6', '7', '8', '9' };
  54
  55         /**
  56          * Format a base 10 numeric into a temporary buffer.
  57          * <p>
  58          * Formatting is performed backwards. The method starts at offset
  59          * <code>o-1</code> and ends at <code>o-1-digits</code>, where
  60          * <code>digits</code> is the number of positions necessary to store the
  61          * base 10 value.
  62          * <p>
  63          * The argument and return values from this method make it easy to chain
  64          * writing, for example:
  65          * </p>
  66          *
  67          * <pre>
  68          * final byte[] tmp = new byte[64];
  69          * int ptr = tmp.length;
  70          * tmp[--ptr] = '\n';
  71          * ptr = RawParseUtils.formatBase10(tmp, ptr, 32);
  72          * tmp[--ptr] = ' ';
  73          * ptr = RawParseUtils.formatBase10(tmp, ptr, 18);
  74          * tmp[--ptr] = 0;
  75          * final String str = new String(tmp, ptr, tmp.length - ptr);
  76          * </pre>
  77          *
  78          * @param b
  79          *            buffer to write into.
  80          * @param o
  81          *            one offset past the location where writing will begin; writing
  82          *            proceeds towards lower index values.
  83          * @param value
  84          *            the value to store.
  85          * @return the new offset value <code>o</code>. This is the position of
  86          *         the last byte written. Additional writing should start at one
  87          *         position earlier.
  88          */
  89         public static int formatBase10(final byte[] b, int o, int value) {
  90                 if (value == 0) {
  91                         b[--o] = '0';
  92                         return o;
  93                 }
  94                 final boolean isneg = value < 0;
  95                 while (value != 0) {
  96                         b[--o] = base10byte[value % 10];
  97                         value /= 10;
  98                 }
  99                 if (isneg)
 100                         b[--o] = '-';
 101                 return o;
 102         }
 103
 104         /**
 105          * Parse a base 10 numeric from a sequence of ASCII digits.
 106          * <p>
 107          * Digit sequences can begin with an optional run of spaces before the
 108          * sequence, and may start with a '+' or a '-' to indicate sign position.
 109          * Any other characters will cause the method to stop and return the current
 110          * result to the caller.
 111          *
 112          * @param b
 113          *            buffer to scan.
 114          * @param ptr
 115          *            position within buffer to start parsing digits at.
 116          * @param ptrResult
 117          *            optional location to return the new ptr value through. If null
 118          *            the ptr value will be discarded.
 119          * @return the value at this location; 0 if the location is not a valid
 120          *         numeric.
 121          */
 122         public static final int parseBase10(final byte[] b, int ptr,
 123                         final MutableInteger ptrResult) {
 124                 int r = 0;
 125                 int sign = 0;
 126                 try {
 127                         final int sz = b.length;
 128                         while (ptr < sz && b[ptr] == ' ')
 129                                 ptr++;
 130                         if (ptr >= sz)
 131                                 return 0;
 132
 133                         switch (b[ptr]) {
 134                         case '-':
 135                                 sign = -1;
 136                                 ptr++;
 137                                 break;
 138                         case '+':
 139                                 ptr++;
 140                                 break;
 141                         }
 142
 143                         while (ptr < sz) {
 144                                 final byte v = digits[b[ptr]];
 145                                 if (v < 0)
 146                                         break;
 147                                 r = (r * 10) + v;
 148                                 ptr++;
 149                         }
 150                 } catch (ArrayIndexOutOfBoundsException e) {
 151                         // Not a valid digit.
 152                 }
 153                 if (ptrResult != null)
 154                         ptrResult.value = ptr;
 155                 return sign < 0 ? -r : r;
 156         }
 157
 158         /**
 159          * Parse a Git style timezone string.
 160          * <p>
 161          * The sequence "-0315" will be parsed as the numeric value -195, as the
 162          * lower two positions count minutes, not 100ths of an hour.
 163          *
 164          * @param b
 165          *            buffer to scan.
 166          * @param ptr
 167          *            position within buffer to start parsing digits at.
 168          * @return the timezone at this location, expressed in minutes.
 169          */
 170         public static final int parseTimeZoneOffset(final byte[] b, int ptr) {
 171                 final int v = parseBase10(b, ptr, null);
 172                 final int tzMins = v % 100;
 173                 final int tzHours = v / 100;
 174                 return tzHours * 60 + tzMins;
 175         }
 176
 177         /**
 178          * Locate the first position after a given character.
 179          *
 180          * @param b
 181          *            buffer to scan.
 182          * @param ptr
 183          *            position within buffer to start looking for LF at.
 184          * @param chrA
 185          *            character to find.
 186          * @return new position just after chr.
 187          */
 188         public static final int next(final byte[] b, int ptr, final char chrA) {
 189                 final int sz = b.length;
 190                 while (ptr < sz) {
 191                         if (b[ptr] == chrA)
 192                                 return ptr + 1;
 193                         else
 194                                 ptr++;
 195                 }
 196                 return ptr;
 197         }
 198
 199         /**
 200          * Locate the first position after either the given character or LF.
 201          * <p>
 202          * This method stops on the first match it finds from either chrA or '\n'.
 203          *
 204          * @param b
 205          *            buffer to scan.
 206          * @param ptr
 207          *            position within buffer to start looking for LF at.
 208          * @param chrA
 209          *            character to find.
 210          * @return new position just after the first chrA or chrB to be found.
 211          */
 212         public static final int nextLF(final byte[] b, int ptr, final char chrA) {
 213                 final int sz = b.length;
 214                 while (ptr < sz) {
 215                         final byte c = b[ptr];
 216                         if (c == chrA || c == '\n')
 217                                 return ptr + 1;
 218                         else
 219                                 ptr++;
 220                 }
 221                 return ptr;
 222         }
 223
 224         /**
 225          * Locate the "author " header line data.
 226          *
 227          * @param b
 228          *            buffer to scan.
 229          * @param ptr
 230          *            position in buffer to start the scan at. Most callers should
 231          *            pass 0 to ensure the scan starts from the beginning of the
 232          *            commit buffer and does not accidentally look at message body.
 233          * @return position just after the space in "author ", so the first
 234          *         character of the author's name. If no author header can be
 235          *         located -1 is returned.
 236          */
 237         public static final int author(final byte[] b, int ptr) {
 238                 final int sz = b.length;
 239                 if (ptr == 0)
 240                         ptr += 46; // skip the "tree ..." line.
 241                 while (ptr < sz && b[ptr] == 'p')
 242                         ptr += 48; // skip this parent.
 243                 return match(b, ptr, author);
 244         }
 245
 246         /**
 247          * Locate the "committer " header line data.
 248          *
 249          * @param b
 250          *            buffer to scan.
 251          * @param ptr
 252          *            position in buffer to start the scan at. Most callers should
 253          *            pass 0 to ensure the scan starts from the beginning of the
 254          *            commit buffer and does not accidentally look at message body.
 255          * @return position just after the space in "committer ", so the first
 256          *         character of the committer's name. If no committer header can be
 257          *         located -1 is returned.
 258          */
 259         public static final int committer(final byte[] b, int ptr) {
 260                 final int sz = b.length;
 261                 if (ptr == 0)
 262                         ptr += 46; // skip the "tree ..." line.
 263                 while (ptr < sz && b[ptr] == 'p')
 264                         ptr += 48; // skip this parent.
 265                 if (ptr < sz && b[ptr] == 'a')
 266                         ptr = next(b, ptr, '\n');
 267                 return match(b, ptr, committer);
 268         }
 269
 270         /**
 271          * Locate the "encoding " header line.
 272          *
 273          * @param b
 274          *            buffer to scan.
 275          * @param ptr
 276          *            position in buffer to start the scan at. Most callers should
 277          *            pass 0 to ensure the scan starts from the beginning of the
 278          *            buffer and does not accidentally look at the message body.
 279          * @return position just after the space in "encoding ", so the first
 280          *         character of the encoding's name. If no encoding header can be
 281          *         located -1 is returned (and UTF-8 should be assumed).
 282          */
 283         public static final int encoding(final byte[] b, int ptr) {
 284                 final int sz = b.length;
 285                 while (ptr < sz) {
 286                         if (b[ptr] == '\n')
 287                                 return -1;
 288                         if (b[ptr] == 'e')
 289                                 break;
 290                         ptr = next(b, ptr, '\n');
 291                 }
 292                 return match(b, ptr, encoding);
 293         }
 294
 295         /**
 296          * Parse the "encoding " header into a character set reference.
 297          * <p>
 298          * Locates the "encoding " header (if present) by first calling
 299          * {@link #encoding(byte[], int)} and then returns the proper character set
 300          * to apply to this buffer to evaluate its contents as character data.
 301          * <p>
 302          * If no encoding header is present, {@link Constants#CHARSET} is assumed.
 303          *
 304          * @param b
 305          *            buffer to scan.
 306          * @return the Java character set representation. Never null.
 307          */
 308         public static Charset parseEncoding(final byte[] b) {
 309                 final int enc = encoding(b, 0);
 310                 if (enc < 0)
 311                         return Constants.CHARSET;
 312                 final int lf = next(b, enc, '\n');
 313                 return Charset.forName(decode(Constants.CHARSET, b, enc, lf - 1));
 314         }
 315
 316         /**
 317          * Parse a name line (e.g. author, committer, tagger) into a PersonIdent.
 318          * <p>
 319          * When passing in a value for <code>nameB</code> callers should use the
 320          * return value of {@link #author(byte[], int)} or
 321          * {@link #committer(byte[], int)}, as these methods provide the proper
 322          * position within the buffer.
 323          *
 324          * @param raw
 325          *            the buffer to parse character data from.
 326          * @param nameB
 327          *            first position of the identity information. This should be the
 328          *            first position after the space which delimits the header field
 329          *            name (e.g. "author" or "committer") from the rest of the
 330          *            identity line.
 331          * @return the parsed identity. Never null.
 332          */
 333         public static PersonIdent parsePersonIdent(final byte[] raw, final int nameB) {
 334                 final Charset cs = parseEncoding(raw);
 335                 final int emailB = nextLF(raw, nameB, '<');
 336                 final int emailE = nextLF(raw, emailB, '>');
 337
 338                 final String name = decode(cs, raw, nameB, emailB - 2);
 339                 final String email = decode(cs, raw, emailB, emailE - 1);
 340
 341                 final MutableInteger ptrout = new MutableInteger();
 342                 final int when = parseBase10(raw, emailE + 1, ptrout);
 343                 final int tz = parseTimeZoneOffset(raw, ptrout.value);
 344
 345                 return new PersonIdent(name, email, when * 1000L, tz);
 346         }
 347
 348         /**
 349          * Decode a region of the buffer under the specified character set.
 350          *
 351          * @param cs
 352          *            character set to use when decoding the buffer.
 353          * @param buffer
 354          *            buffer to pull raw bytes from.
 355          * @param start
 356          *            first position within the buffer to take data from.
 357          * @param end
 358          *            one position past the last location within the buffer to take
 359          *            data from.
 360          * @return a string representation of the range <code>[start,end)</code>,
 361          *         after decoding the region through the specified character set.
 362          */
 363         public static String decode(final Charset cs, final byte[] buffer,
 364                         final int start, final int end) {
 365                 final ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
 366                 return cs.decode(b).toString();
 367         }
 368
 369         /**
 370          * Locate the position of the commit message body.
 371          *
 372          * @param b
 373          *            buffer to scan.
 374          * @param ptr
 375          *            position in buffer to start the scan at. Most callers should
 376          *            pass 0 to ensure the scan starts from the beginning of the
 377          *            commit buffer.
 378          * @return position of the user's message buffer.
 379          */
 380         public static final int commitMessage(final byte[] b, int ptr) {
 381                 final int sz = b.length;
 382                 if (ptr == 0)
 383                         ptr += 46; // skip the "tree ..." line.
 384                 while (ptr < sz && b[ptr] == 'p')
 385                         ptr += 48; // skip this parent.
 386
 387                 // skip any remaining header lines, ignoring what their actual
 388                 // header line type is.
 389                 //
 390                 while (ptr < sz && b[ptr] != '\n')
 391                         ptr = next(b, ptr, '\n');
 392                 if (ptr < sz && b[ptr] == '\n')
 393                         return ptr + 1;
 394                 return -1;
 395         }
 396
 397         /**
 398          * Locate the end of a paragraph.
 399          * <p>
 400          * A paragraph is ended by two consecutive LF bytes.
 401          *
 402          * @param b
 403          *            buffer to scan.
 404          * @param ptr
 405          *            position in buffer to start the scan at. Most callers will
 406          *            want to pass the first position of the commit message (as
 407          *            found by {@link #commitMessage(byte[], int)}.
 408          * @return position of the LF at the end of the paragraph;
 409          *         <code>b.length</code> if no paragraph end could be located.
 410          */
 411         public static final int endOfParagraph(final byte[] b, int ptr) {
 412                 final int sz = b.length;
 413                 while (ptr < sz && b[ptr] != '\n')
 414                         ptr = next(b, ptr, '\n');
 415                 if (ptr < sz && b[ptr] == '\n')
 416                         return ptr - 1;
 417                 return sz;
 418         }
 419
 420         private RawParseUtils() {
 421                 // Don't create instances of a static only utility.
 422         }
 423 }