From b0a36898c27a7e0f5a65dd5d813508a8284d6777 Mon Sep 17 00:00:00 2001 From: "benjamin.peterson" Date: Thu, 29 Oct 2009 01:22:38 +0000 Subject: [PATCH] Merged revisions 75928 via svnmerge from svn+ssh://pythondev@svn.python.org/python/branches/py3k ........ r75928 | benjamin.peterson | 2009-10-28 16:59:39 -0500 (Wed, 28 Oct 2009) | 5 lines in wide builds, avoid storing high unicode characters from source code with surrogates This is accomplished by decoding with utf-32 instead of utf-16 on all builds. The patch is by Adam Olsen. ........ git-svn-id: http://svn.python.org/projects/python/branches/release31-maint@75930 6015fed2-1504-0410-9fe1-9d1591cc4771 --- Lib/test/test_pep263.py | 8 ++++++++ Misc/NEWS | 3 +++ Python/ast.c | 23 ++++++++++++++--------- 3 files changed, 25 insertions(+), 9 deletions(-) diff --git a/Lib/test/test_pep263.py b/Lib/test/test_pep263.py index 05ca47ff43..587b2fcc11 100644 --- a/Lib/test/test_pep263.py +++ b/Lib/test/test_pep263.py @@ -36,6 +36,14 @@ class PEP263Test(unittest.TestCase): exec(c, d) self.assertEquals(d['\xc6'], '\xc6') + def test_issue3297(self): + c = compile("a, b = '\U0001010F', '\\U0001010F'", "dummy", "exec") + d = {} + exec(c, d) + self.assertEqual(d['a'], d['b']) + self.assertEqual(len(d['a']), len(d['b'])) + self.assertEqual(ascii(d['a']), ascii(d['b'])) + def test_main(): support.run_unittest(PEP263Test) diff --git a/Misc/NEWS b/Misc/NEWS index 895ef49431..f63991f402 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -12,6 +12,9 @@ What's New in Python 3.1.2? Core and Builtins ----------------- +- Issue #3297: On wide unicode builds, do not split unicode characters into + surrogates. + - Issue #1722344: threading._shutdown() is now called in Py_Finalize(), which fixes the problem of some exceptions being thrown at shutdown when the interpreter is killed. Patch by Adam Olsen. diff --git a/Python/ast.c b/Python/ast.c index b0684c5730..2a806af51a 100644 --- a/Python/ast.c +++ b/Python/ast.c @@ -3217,10 +3217,11 @@ decode_unicode(struct compiling *c, const char *s, size_t len, int rawmode, cons u = NULL; } else { /* check for integer overflow */ - if (len > PY_SIZE_MAX / 4) + if (len > PY_SIZE_MAX / 6) return NULL; - /* "\XX" may become "\u005c\uHHLL" (12 bytes) */ - u = PyBytes_FromStringAndSize((char *)NULL, len * 4); + /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5 + "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */ + u = PyBytes_FromStringAndSize((char *)NULL, len * 6); if (u == NULL) return NULL; p = buf = PyBytes_AsString(u); @@ -3237,20 +3238,24 @@ decode_unicode(struct compiling *c, const char *s, size_t len, int rawmode, cons PyObject *w; char *r; Py_ssize_t rn, i; - w = decode_utf8(c, &s, end, "utf-16-be"); + w = decode_utf8(c, &s, end, "utf-32-be"); if (w == NULL) { Py_DECREF(u); return NULL; } r = PyBytes_AS_STRING(w); rn = Py_SIZE(w); - assert(rn % 2 == 0); - for (i = 0; i < rn; i += 2) { - sprintf(p, "\\u%02x%02x", + assert(rn % 4 == 0); + for (i = 0; i < rn; i += 4) { + sprintf(p, "\\U%02x%02x%02x%02x", r[i + 0] & 0xFF, - r[i + 1] & 0xFF); - p += 6; + r[i + 1] & 0xFF, + r[i + 2] & 0xFF, + r[i + 3] & 0xFF); + p += 10; } + /* Should be impossible to overflow */ + assert(p - buf <= Py_SIZE(u)); Py_DECREF(w); } else { *p++ = *s++; -- 2.11.4.GIT