From ff18ea72691de0e0b2da2e3dc04069d1b076c840 Mon Sep 17 00:00:00 2001 From: "Shawn O. Pearce" Date: Mon, 11 Aug 2008 18:07:50 -0700 Subject: [PATCH] Add Constants.encode as a utility for quick encoding in UTF-8 We often need to convert a string into a UTF-8 encoding, so that we can use this string as a path filter in a TreeWalk or in some other suitable place where we assume a standard UTF-8 encoding is being used. As we have already done the lookup for the CHARSET we can reuse that same CHARSET reference during future encoding calls, while allowing the Charset implementation to cache and reuse the actual encoder instance. Whenever possible we try to avoid copying the result as most of the time the returned ByteBuffer's internal array matches the result array we need to return to our caller. Signed-off-by: Shawn O. Pearce Signed-off-by: Robin Rosenberg --- .../spearce/jgit/lib/ConstantsEncodingTest.java | 89 ++++++++++++++++++++++ .../src/org/spearce/jgit/lib/Constants.java | 25 ++++++ 2 files changed, 114 insertions(+) create mode 100644 org.spearce.jgit.test/tst/org/spearce/jgit/lib/ConstantsEncodingTest.java diff --git a/org.spearce.jgit.test/tst/org/spearce/jgit/lib/ConstantsEncodingTest.java b/org.spearce.jgit.test/tst/org/spearce/jgit/lib/ConstantsEncodingTest.java new file mode 100644 index 00000000..7b3e5a00 --- /dev/null +++ b/org.spearce.jgit.test/tst/org/spearce/jgit/lib/ConstantsEncodingTest.java @@ -0,0 +1,89 @@ +/* + * Copyright (C) 2008, Google Inc. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * - Neither the name of the Git Development Community nor the + * names of its contributors may be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND + * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +package org.spearce.jgit.lib; + +import java.io.UnsupportedEncodingException; +import java.util.Arrays; + +import junit.framework.TestCase; + +/** + * Tests for the string and number to byte array helpers + * {@code Constants.encodeASCII} and {@code Constants.encode}. + */ +public class ConstantsEncodingTest extends TestCase { + /** encodeASCII of "abc" yields the bytes 'a', 'b', 'c', which decode back to the source as UTF-8. */ + public void testEncodeASCII_SimpleASCII() + throws UnsupportedEncodingException { + final String src = "abc"; + final byte[] exp = { 'a', 'b', 'c' }; + final byte[] res = Constants.encodeASCII(src); + assertTrue(Arrays.equals(exp, res)); + assertEquals(src, new String(res, 0, res.length, "UTF-8")); + } + + /** encodeASCII must reject a non-ASCII string with an IllegalArgumentException naming the input. */ + public void testEncodeASCII_FailOnNonASCII() { + final String src = "Ūnĭcōde̽"; + try { + Constants.encodeASCII(src); + fail("Incorrectly accepted a Unicode character"); + } catch (IllegalArgumentException err) { + assertEquals("Not ASCII string: " + src, err.getMessage()); + } + } + + /** encodeASCII of the long value 13 yields the digit bytes '1', '3'. */ + public void testEncodeASCII_Number13() { + final long src = 13; + final byte[] exp = { '1', '3' }; + final byte[] res = Constants.encodeASCII(src); + assertTrue(Arrays.equals(exp, res)); + } + + /** encode of "abc" yields the bytes 'a', 'b', 'c', which decode back to the source as UTF-8. */ + public void testEncode_SimpleASCII() throws UnsupportedEncodingException { + final String src = "abc"; + final byte[] exp = { 'a', 'b', 'c' }; + final byte[] res = Constants.encode(src); + assertTrue(Arrays.equals(exp, res)); + assertEquals(src, new String(res, 0, res.length, "UTF-8")); + } + + /** encode of a non-ASCII string yields the expected multi-byte UTF-8 sequence and round-trips through a UTF-8 decode. */ + public void testEncode_Unicode() throws UnsupportedEncodingException { + final String src = "Ūnĭcōde̽"; + final byte[] exp = { (byte) 0xC5, (byte) 0xAA, 0x6E, (byte) 0xC4, + (byte) 0xAD, 0x63, (byte) 0xC5, (byte) 0x8D, 0x64, 0x65, + (byte) 0xCC, (byte) 0xBD }; + final byte[] res = Constants.encode(src); + assertTrue(Arrays.equals(exp, res)); + assertEquals(src, new String(res, 0, res.length, "UTF-8")); + } +} diff --git a/org.spearce.jgit/src/org/spearce/jgit/lib/Constants.java b/org.spearce.jgit/src/org/spearce/jgit/lib/Constants.java index 7c2cef9f..1f8603d8 100644 --- a/org.spearce.jgit/src/org/spearce/jgit/lib/Constants.java +++ b/org.spearce.jgit/src/org/spearce/jgit/lib/Constants.java @@ 
-1,6 +1,7 @@ /* * Copyright (C) 2008, Robin Rosenberg * Copyright (C) 2008, Shawn O. Pearce + * Copyright (C) 2008, Google Inc. * * All rights reserved. * @@ -38,6 +39,7 @@ package org.spearce.jgit.lib; +import java.nio.ByteBuffer; import java.nio.charset.Charset; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; @@ -387,6 +389,29 @@ public final class Constants { return r; } + /** + * Convert a string to a byte array in the standard character encoding. + * + * The array backing the encoded buffer is returned directly, without + * copying, when the buffer is array-backed at offset 0 and that array's + * length equals the buffer's limit; otherwise the encoded bytes are + * copied into a newly allocated array of exactly that length. + * + * @param str + * the string to convert. May contain any Unicode characters. + * @return a byte array representing the requested string, encoded using the + * default character encoding (UTF-8). + * @see #CHARACTER_ENCODING + */ + public static byte[] encode(final String str) { + final ByteBuffer bb = Constants.CHARSET.encode(str); + final int len = bb.limit(); + /* Fast path: hand back the backing array only if it holds exactly the encoded bytes. */ + if (bb.hasArray() && bb.arrayOffset() == 0) { + final byte[] arr = bb.array(); + if (arr.length == len) + return arr; + } + + /* Slow path: the backing array is unavailable, offset, or a different length, so copy into a right-sized array. */ + final byte[] arr = new byte[len]; + bb.get(arr); + return arr; + } + static { if (OBJECT_ID_LENGTH != newMessageDigest().getDigestLength()) throw new LinkageError("Incorrect OBJECT_ID_LENGTH."); -- 2.11.4.GIT