From eb03381ed03cdaf6d0fc472054ee1491454a499e Mon Sep 17 00:00:00 2001 From: petere Date: Wed, 29 Oct 2008 08:04:54 +0000 Subject: [PATCH] Unicode escapes in strings and identifiers --- doc/src/sgml/syntax.sgml | 140 +++++++++++++++- src/backend/catalog/sql_features.txt | 4 +- src/backend/parser/scan.l | 186 ++++++++++++++++++++- src/backend/utils/adt/xml.c | 23 +-- src/backend/utils/mb/wchar.c | 35 ++++ src/bin/psql/psqlscan.l | 65 ++++++- src/include/mb/pg_wchar.h | 1 + src/interfaces/ecpg/preproc/pgc.l | 58 ++++++- src/interfaces/ecpg/preproc/preproc.y | 7 +- src/interfaces/ecpg/test/ecpg_schedule | 1 + src/interfaces/ecpg/test/ecpg_schedule_tcp | 1 + .../ecpg/test/expected/preproc-strings.c | 62 +++++++ .../ecpg/test/expected/preproc-strings.stderr | 36 ++++ .../ecpg/test/expected/preproc-strings.stdout | 1 + src/interfaces/ecpg/test/preproc/Makefile | 1 + src/interfaces/ecpg/test/preproc/strings.pgc | 27 +++ src/test/regress/expected/strings.out | 25 +++ src/test/regress/sql/strings.sql | 8 + 18 files changed, 630 insertions(+), 51 deletions(-) create mode 100644 src/interfaces/ecpg/test/expected/preproc-strings.c create mode 100644 src/interfaces/ecpg/test/expected/preproc-strings.stderr create mode 100644 src/interfaces/ecpg/test/expected/preproc-strings.stdout create mode 100644 src/interfaces/ecpg/test/preproc/strings.pgc diff --git a/doc/src/sgml/syntax.sgml b/doc/src/sgml/syntax.sgml index 367d0086f3..e09d715444 100644 --- a/doc/src/sgml/syntax.sgml +++ b/doc/src/sgml/syntax.sgml @@ -190,6 +190,57 @@ UPDATE "my_table" SET "a" = 5; + Unicode escapein + identifiers A variant of quoted + identifiers allows including escaped Unicode characters identified + by their code points. This variant starts + with U& (upper or lower case U followed by + ampersand) immediately before the opening double quote, without + any spaces in between, for example U&"foo". + (Note that this creates an ambiguity with the + operator &. 
Use spaces around the operator to + avoid this problem.) Inside the quotes, Unicode characters can be + specified in escaped form by writing a backslash followed by the + four-digit hexadecimal code point number or alternatively a + backslash followed by a plus sign followed by a six-digit + hexadecimal code point number. For example, the + identifier "data" could be written as + +U&"d\0061t\+000061" + + The following less trivial example writes the Russian + word slon (elephant) in Cyrillic letters: + +U&"\0441\043B\043E\043D" + + + + + If a different escape character than backslash is desired, it can + be specified using + the UESCAPEUESCAPE + clause after the string, for example: + +U&"d!0061t!+000061" UESCAPE '!' + + The escape character can be any single character other than a + hexadecimal digit, the plus sign, a single quote, a double quote, + or a whitespace character. Note that the escape character is + written in single quotes, not double quotes. + + + + To include the escape character in the identifier literally, write + it twice. + + + + The Unicode escape syntax works only when the server encoding is + UTF8. When other server encodings are used, only code points in + the ASCII range (up to \007F) can be specified. + + + Quoting an identifier also makes it case-sensitive, whereas unquoted names are always folded to lower case. For example, the identifiers FOO, foo, and @@ -245,7 +296,7 @@ UPDATE "my_table" SET "a" = 5; write two adjacent single quotes, e.g. 'Dianne''s horse'. Note that this is not the same as a double-quote - character ("). + character ("). @@ -269,14 +320,19 @@ SELECT 'foo' 'bar'; by SQL; PostgreSQL is following the standard.) + - - + + String Constants with C-Style Escapes + + escape string syntax - + backslash escapes + + PostgreSQL also accepts escape string constants, which are an extension to the SQL standard. 
An escape string constant is specified by writing the letter @@ -287,7 +343,8 @@ SELECT 'foo' 'bar'; Within an escape string, a backslash character (\) begins a C-like backslash escape sequence, in which the combination of backslash and following character(s) represent a special byte - value: + value, as shown in <xref linkend="sql-backslash-table">. + Backslash Escape Sequences @@ -341,14 +398,24 @@ SELECT 'foo' 'bar';
- It is your responsibility that the byte sequences you create are - valid characters in the server character set encoding. Any other + + Any other character following a backslash is taken literally. Thus, to include a backslash character, write two backslashes (\\). Also, a single quote can be included in an escape string by writing \', in addition to the normal way of ''. + + It is your responsibility that the byte sequences you create are + valid characters in the server character set encoding. When the + server encoding is UTF-8, then the alternative Unicode escape + syntax, explained in <xref linkend="sql-syntax-strings-uescape">, + should be used instead. (The alternative would be doing the + UTF-8 encoding by hand and writing out the bytes, which would be + very cumbersome.) + + If the configuration parameter @@ -379,6 +446,65 @@ SELECT 'foo' 'bar';
+ + String Constants with Unicode Escapes + + + Unicode escape + in string constants + + + + PostgreSQL also supports another type + of escape syntax for strings that allows specifying arbitrary + Unicode characters by code point. A Unicode escape string + constant starts with U& (upper or lower case + letter U followed by ampersand) immediately before the opening + quote, without any spaces in between, for + example U&'foo'. (Note that this creates an + ambiguity with the operator &. Use spaces + around the operator to avoid this problem.) Inside the quotes, + Unicode characters can be specified in escaped form by writing a + backslash followed by the four-digit hexadecimal code point + number or alternatively a backslash followed by a plus sign + followed by a six-digit hexadecimal code point number. For + example, the string 'data' could be written as + +U&'d\0061t\+000061' + + The following less trivial example writes the Russian + word slon (elephant) in Cyrillic letters: + +U&'\0441\043B\043E\043D' + + + + + If a different escape character than backslash is desired, it can + be specified using + the UESCAPEUESCAPE + clause after the string, for example: + + U&'d!0061t!+000061' UESCAPE '!' + + The escape character can be any single character other than a + hexadecimal digit, the plus sign, a single quote, a double quote, + or a whitespace character. + + + + The Unicode escape syntax works only when the server encoding is + UTF8. When other server encodings are used, only code points in + the ASCII range (up to \007F) can be + specified. + + + + To include the escape character in the string literally, write it + twice. 
+ + + Dollar-Quoted String Constants diff --git a/src/backend/catalog/sql_features.txt b/src/backend/catalog/sql_features.txt index b795a70f3c..707a071083 100644 --- a/src/backend/catalog/sql_features.txt +++ b/src/backend/catalog/sql_features.txt @@ -238,8 +238,8 @@ F381 Extended schema manipulation 02 ALTER TABLE statement: ADD CONSTRAINT claus F381 Extended schema manipulation 03 ALTER TABLE statement: DROP CONSTRAINT clause YES F382 Alter column data type YES F391 Long identifiers YES -F392 Unicode escapes in identifiers NO -F393 Unicode escapes in literals NO +F392 Unicode escapes in identifiers YES +F393 Unicode escapes in literals YES F394 Optional normal form specification NO F401 Extended joined table YES F401 Extended joined table 01 NATURAL JOIN YES diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l index 6bcc716c57..8066843203 100644 --- a/src/backend/parser/scan.l +++ b/src/backend/parser/scan.l @@ -76,6 +76,7 @@ static int literalalloc; /* current allocated buffer size */ static void addlit(char *ytext, int yleng); static void addlitchar(unsigned char ychar); static char *litbufdup(void); +static char *litbuf_udeescape(unsigned char escape); #define lexer_errposition() scanner_errposition(yylloc) @@ -125,6 +126,8 @@ static unsigned char unescape_single_char(unsigned char c); * standard quoted strings * extended quoted strings (support backslash escape sequences) * $foo$ quoted strings + * quoted identifier with Unicode escapes + * quoted string with Unicode escapes */ %x xb @@ -134,6 +137,8 @@ static unsigned char unescape_single_char(unsigned char c); %x xe %x xq %x xdolq +%x xui +%x xus /* * In order to make the world safe for Windows and Mac clients as well as @@ -244,6 +249,25 @@ xdstop {dquote} xddouble {dquote}{dquote} xdinside [^"]+ +/* Unicode escapes */ +uescape [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote} +/* error rule to avoid backup */ +uescapefail 
("-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU]) + +/* Quoted identifier with Unicode escapes */ +xuistart [uU]&{dquote} +xuistop1 {dquote}{whitespace}*{uescapefail}? +xuistop2 {dquote}{whitespace}*{uescape} + +/* Quoted string with Unicode escapes */ +xusstart [uU]&{quote} +xusstop1 {quote}{whitespace}*{uescapefail}? +xusstop2 {quote}{whitespace}*{uescape} + +/* error rule to avoid backup */ +xufailed [uU]& + + /* C-style comments * * The "extended comment" syntax closely resembles allowable operator syntax. @@ -444,6 +468,11 @@ other . BEGIN(xe); startlit(); } +{xusstart} { + SET_YYLLOC(); + BEGIN(xus); + startlit(); + } {quotestop} | {quotefail} { yyless(1); @@ -456,10 +485,22 @@ other . yylval.str = litbufdup(); return SCONST; } -{xqdouble} { +{xusstop1} { + /* throw back all but the quote */ + yyless(1); + BEGIN(INITIAL); + yylval.str = litbuf_udeescape('\\'); + return SCONST; + } +{xusstop2} { + BEGIN(INITIAL); + yylval.str = litbuf_udeescape(yytext[yyleng-2]); + return SCONST; + } +{xqdouble} { addlitchar('\''); } -{xqinside} { +{xqinside} { addlit(yytext, yyleng); } {xeinside} { @@ -496,14 +537,14 @@ other . if (IS_HIGHBIT_SET(c)) saw_high_bit = true; } -{quotecontinue} { +{quotecontinue} { /* ignore */ } . { /* This is only needed for \ just before EOF */ addlitchar(yytext[0]); } -<> { yyerror("unterminated quoted string"); } +<> { yyerror("unterminated quoted string"); } {dolqdelim} { SET_YYLLOC(); @@ -553,6 +594,11 @@ other . BEGIN(xd); startlit(); } +{xuistart} { + SET_YYLLOC(); + BEGIN(xui); + startlit(); + } {xdstop} { char *ident; @@ -565,13 +611,46 @@ other . 
yylval.str = ident; return IDENT; } -{xddouble} { +{xuistop1} { + char *ident; + + BEGIN(INITIAL); + if (literallen == 0) + yyerror("zero-length delimited identifier"); + ident = litbuf_udeescape('\\'); + if (literallen >= NAMEDATALEN) + truncate_identifier(ident, literallen, true); + yylval.str = ident; + /* throw back all but the quote */ + yyless(1); + return IDENT; + } +{xuistop2} { + char *ident; + + BEGIN(INITIAL); + if (literallen == 0) + yyerror("zero-length delimited identifier"); + ident = litbuf_udeescape(yytext[yyleng - 2]); + if (literallen >= NAMEDATALEN) + truncate_identifier(ident, literallen, true); + yylval.str = ident; + return IDENT; + } +{xddouble} { addlitchar('"'); } -{xdinside} { +{xdinside} { addlit(yytext, yyleng); } -<> { yyerror("unterminated quoted identifier"); } +<> { yyerror("unterminated quoted identifier"); } + +{xufailed} { + /* throw back all but the initial u/U */ + yyless(1); + /* and treat it as {other} */ + return yytext[0]; + } {typecast} { SET_YYLLOC(); @@ -908,6 +987,99 @@ litbufdup(void) return new; } +static int +hexval(unsigned char c) +{ + if (c >= '0' && c <= '9') + return c - '0'; + if (c >= 'a' && c <= 'f') + return c - 'a' + 0xA; + if (c >= 'A' && c <= 'F') + return c - 'A' + 0xA; + elog(ERROR, "invalid hexadecimal digit"); + return 0; /* not reached */ +} + +static void +check_unicode_value(pg_wchar c, char * loc) +{ + if (GetDatabaseEncoding() == PG_UTF8) + return; + + if (c > 0x7F) + { + yylloc += (char *) loc - literalbuf + 3; /* 3 for U&" */ + yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8"); + } +} + +static char * +litbuf_udeescape(unsigned char escape) +{ + char *new; + char *in, *out; + + if (isxdigit(escape) + || escape == '+' + || escape == '\'' + || escape == '"' + || scanner_isspace(escape)) + { + yylloc += literallen + yyleng + 1; + yyerror("invalid Unicode escape character"); + } + + /* + * This relies on the subtle assumption that 
a UTF-8 expansion + * cannot be longer than its escaped representation. + */ + new = palloc(literallen + 1); + + in = literalbuf; + out = new; + while (*in) + { + if (in[0] == escape) + { + if (in[1] == escape) + { + *out++ = escape; + in += 2; + } + else if (isxdigit(in[1]) && isxdigit(in[2]) && isxdigit(in[3]) && isxdigit(in[4])) + { + pg_wchar unicode = hexval(in[1]) * 16*16*16 + hexval(in[2]) * 16*16 + hexval(in[3]) * 16 + hexval(in[4]); + check_unicode_value(unicode, in); + unicode_to_utf8(unicode, (unsigned char *) out); + in += 5; + out += pg_mblen(out); + } + else if (in[1] == '+' + && isxdigit(in[2]) && isxdigit(in[3]) + && isxdigit(in[4]) && isxdigit(in[5]) + && isxdigit(in[6]) && isxdigit(in[7])) + { + pg_wchar unicode = hexval(in[2]) * 16*16*16*16*16 + hexval(in[3]) * 16*16*16*16 + hexval(in[4]) * 16*16*16 + + hexval(in[5]) * 16*16 + hexval(in[6]) * 16 + hexval(in[7]); + check_unicode_value(unicode, in); + unicode_to_utf8(unicode, (unsigned char *) out); + in += 8; + out += pg_mblen(out); + } + else + { + yylloc += in - literalbuf + 3; /* 3 for U&" */ + yyerror("invalid Unicode escape value"); + } + } + else + *out++ = *in++; + } + + *out = '\0'; + pg_verifymbstr(new, out - new, false); + return new; +} static unsigned char unescape_single_char(unsigned char c) diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c index 552594cc44..25c320a52d 100644 --- a/src/backend/utils/adt/xml.c +++ b/src/backend/utils/adt/xml.c @@ -1497,28 +1497,7 @@ unicode_to_sqlchar(pg_wchar c) { static unsigned char utf8string[5]; /* need trailing zero */ - if (c <= 0x7F) - { - utf8string[0] = c; - } - else if (c <= 0x7FF) - { - utf8string[0] = 0xC0 | ((c >> 6) & 0x1F); - utf8string[1] = 0x80 | (c & 0x3F); - } - else if (c <= 0xFFFF) - { - utf8string[0] = 0xE0 | ((c >> 12) & 0x0F); - utf8string[1] = 0x80 | ((c >> 6) & 0x3F); - utf8string[2] = 0x80 | (c & 0x3F); - } - else - { - utf8string[0] = 0xF0 | ((c >> 18) & 0x07); - utf8string[1] = 0x80 | ((c >> 12) & 
0x3F); - utf8string[2] = 0x80 | ((c >> 6) & 0x3F); - utf8string[3] = 0x80 | (c & 0x3F); - } + unicode_to_utf8(c, utf8string); return (char *) pg_do_encoding_conversion(utf8string, pg_mblen((char *) utf8string), diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c index 35b9686aa8..9a6dc00ccc 100644 --- a/src/backend/utils/mb/wchar.c +++ b/src/backend/utils/mb/wchar.c @@ -419,6 +419,41 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len) return cnt; } + +/* + * Map a Unicode code point to UTF-8. utf8string must have 4 bytes of + * space allocated. + */ +unsigned char * +unicode_to_utf8(pg_wchar c, unsigned char *utf8string) +{ + if (c <= 0x7F) + { + utf8string[0] = c; + } + else if (c <= 0x7FF) + { + utf8string[0] = 0xC0 | ((c >> 6) & 0x1F); + utf8string[1] = 0x80 | (c & 0x3F); + } + else if (c <= 0xFFFF) + { + utf8string[0] = 0xE0 | ((c >> 12) & 0x0F); + utf8string[1] = 0x80 | ((c >> 6) & 0x3F); + utf8string[2] = 0x80 | (c & 0x3F); + } + else + { + utf8string[0] = 0xF0 | ((c >> 18) & 0x07); + utf8string[1] = 0x80 | ((c >> 12) & 0x3F); + utf8string[2] = 0x80 | ((c >> 6) & 0x3F); + utf8string[3] = 0x80 | (c & 0x3F); + } + + return utf8string; +} + + /* * Return the byte length of a UTF8 character pointed to by s * diff --git a/src/bin/psql/psqlscan.l b/src/bin/psql/psqlscan.l index 965c41bc36..31c83af641 100644 --- a/src/bin/psql/psqlscan.l +++ b/src/bin/psql/psqlscan.l @@ -156,6 +156,8 @@ static void emit(const char *txt, int len); * standard quoted strings * extended quoted strings (support backslash escape sequences) * $foo$ quoted strings + * quoted identifier with Unicode escapes + * quoted string with Unicode escapes */ %x xb @@ -165,6 +167,8 @@ static void emit(const char *txt, int len); %x xe %x xq %x xdolq +%x xui +%x xus /* Additional exclusive states for psql only: lex backslash commands */ %x xslashcmd %x xslasharg @@ -281,6 +285,25 @@ xdstop {dquote} xddouble {dquote}{dquote} xdinside [^"]+ +/* Unicode escapes */ 
+uescape [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote} +/* error rule to avoid backup */ +uescapefail ("-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU]) + +/* Quoted identifier with Unicode escapes */ +xuistart [uU]&{dquote} +xuistop1 {dquote}{whitespace}*{uescapefail}? +xuistop2 {dquote}{whitespace}*{uescape} + +/* Quoted string with Unicode escapes */ +xusstart [uU]&{quote} +xusstop1 {quote}{whitespace}*{uescapefail}? +xusstop2 {quote}{whitespace}*{uescape} + +/* error rule to avoid backup */ +xufailed [uU]& + + /* C-style comments * * The "extended comment" syntax closely resembles allowable operator syntax. @@ -460,16 +483,29 @@ other . BEGIN(xe); ECHO; } +{xusstart} { + BEGIN(xus); + ECHO; + } {quotestop} | {quotefail} { yyless(1); BEGIN(INITIAL); ECHO; } -{xqdouble} { +{xusstop1} { + yyless(1); + BEGIN(INITIAL); + ECHO; + } +{xusstop2} { + BEGIN(INITIAL); + ECHO; + } +{xqdouble} { ECHO; } -{xqinside} { +{xqinside} { ECHO; } {xeinside} { @@ -484,7 +520,7 @@ other . {xehexesc} { ECHO; } -{quotecontinue} { +{quotecontinue} { ECHO; } . { @@ -535,14 +571,33 @@ other . 
BEGIN(xd); ECHO; } +{xuistart} { + BEGIN(xui); + ECHO; + } {xdstop} { BEGIN(INITIAL); ECHO; } -{xddouble} { +{xuistop1} { + yyless(1); + BEGIN(INITIAL); + ECHO; + } +{xuistop2} { + BEGIN(INITIAL); ECHO; } -{xdinside} { +{xddouble} { + ECHO; + } +{xdinside} { + ECHO; + } + +{xufailed} { + /* throw back all but the initial u/U */ + yyless(1); ECHO; } diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h index b0a0911954..e3d8696620 100644 --- a/src/include/mb/pg_wchar.h +++ b/src/include/mb/pg_wchar.h @@ -380,6 +380,7 @@ extern const char *GetDatabaseEncodingName(void); extern int pg_valid_client_encoding(const char *name); extern int pg_valid_server_encoding(const char *name); +extern unsigned char *unicode_to_utf8(pg_wchar c, unsigned char *utf8string); extern int pg_utf_mblen(const unsigned char *); extern unsigned char *pg_do_encoding_conversion(unsigned char *src, int len, int src_encoding, diff --git a/src/interfaces/ecpg/preproc/pgc.l b/src/interfaces/ecpg/preproc/pgc.l index 7e12caf908..5b69527972 100644 --- a/src/interfaces/ecpg/preproc/pgc.l +++ b/src/interfaces/ecpg/preproc/pgc.l @@ -103,6 +103,8 @@ static struct _if_value * extended quoted strings (support backslash escape sequences) * national character quoted strings * $foo$ quoted strings + * quoted identifier with Unicode escapes + * quoted string with Unicode escapes */ %x xb @@ -117,6 +119,8 @@ static struct _if_value %x xdolq %x xcond %x xskip +%x xui +%x xus /* Bit string */ @@ -172,6 +176,18 @@ xdstop {dquote} xddouble {dquote}{dquote} xdinside [^"]+ +/* Unicode escapes */ +/* (The ecpg scanner is not backup-free, so the fail rules in scan.l are not needed here, but could be added if desired.) */ +uescape [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote} + +/* Quoted identifier with Unicode escapes */ +xuistart [uU]&{dquote} +xuistop {dquote}({whitespace}*{uescape})? + +/* Quoted string with Unicode escapes */ +xusstart [uU]&{quote} +xusstop {quote}({whitespace}*{uescape})? 
+ /* special stuff for C strings */ xdcqq \\\\ xdcqdq \\\" @@ -433,6 +449,13 @@ cppline {space}*#(.*\\{space})*.*{newline} BEGIN(xe); startlit(); } +{xusstart} { + token_start = yytext; + state_before = YYSTATE; + BEGIN(xus); + startlit(); + addlit(yytext, yyleng); + } {quotestop} | {quotefail} { yyless(1); @@ -454,22 +477,28 @@ cppline {space}*#(.*\\{space})*.*{newline} yylval.str = mm_strdup(literalbuf); return NCONST; } -{xqdouble} { addlitchar('\''); } +{xusstop} { + addlit(yytext, yyleng); + BEGIN(state_before); + yylval.str = mm_strdup(literalbuf); + return UCONST; + } +{xqdouble} { addlitchar('\''); } {xqcquote} { addlitchar('\\'); addlitchar('\''); } -{xqinside} { addlit(yytext, yyleng); } +{xqinside} { addlit(yytext, yyleng); } {xeinside} { addlit(yytext, yyleng); } {xeescape} { addlit(yytext, yyleng); } {xeoctesc} { addlit(yytext, yyleng); } {xehexesc} { addlit(yytext, yyleng); } -{quotecontinue} { /* ignore */ } +{quotecontinue} { /* ignore */ } . { /* This is only needed for \ just before EOF */ addlitchar(yytext[0]); } -<> { mmerror(PARSE_ERROR, ET_FATAL, "unterminated quoted string"); } +<> { mmerror(PARSE_ERROR, ET_FATAL, "unterminated quoted string"); } {dolqfailed} { /* throw back all but the initial "$" */ yyless(1); @@ -515,6 +544,12 @@ cppline {space}*#(.*\\{space})*.*{newline} BEGIN(xd); startlit(); } +{xuistart} { + state_before = YYSTATE; + BEGIN(xui); + startlit(); + addlit(yytext, yyleng); + } {xdstop} { BEGIN(state_before); if (literallen == 0) @@ -528,9 +563,18 @@ cppline {space}*#(.*\\{space})*.*{newline} yylval.str = mm_strdup(literalbuf); return CSTRING; } -{xddouble} { addlitchar('"'); } -{xdinside} { addlit(yytext, yyleng); } -<> { mmerror(PARSE_ERROR, ET_FATAL, "unterminated quoted identifier"); } +{xuistop} { + BEGIN(state_before); + if (literallen == 2) /* "U&" */ + mmerror(PARSE_ERROR, ET_ERROR, "zero-length delimited identifier"); + /* The backend will truncate the identifier here. We do not as it does not change the result. 
*/ + addlit(yytext, yyleng); + yylval.str = mm_strdup(literalbuf); + return UIDENT; + } +{xddouble} { addlitchar('"'); } +{xdinside} { addlit(yytext, yyleng); } +<> { mmerror(PARSE_ERROR, ET_FATAL, "unterminated quoted identifier"); } {xdstart} { state_before = YYSTATE; BEGIN(xdc); diff --git a/src/interfaces/ecpg/preproc/preproc.y b/src/interfaces/ecpg/preproc/preproc.y index 70c7588c51..3baec64b28 100644 --- a/src/interfaces/ecpg/preproc/preproc.y +++ b/src/interfaces/ecpg/preproc/preproc.y @@ -509,7 +509,7 @@ add_typedef(char *name, char * dimension, char * length, enum ECPGttype type_enu /* Special token types, not actually keywords - see the "lex" file */ %token IDENT SCONST Op CSTRING CVARIABLE CPP_LINE IP BCONST -%token XCONST DOLCONST ECONST NCONST +%token XCONST DOLCONST ECONST NCONST UCONST UIDENT %token ICONST PARAM %token FCONST @@ -4966,6 +4966,10 @@ Sconst: SCONST $$[strlen($1)+3]='\0'; free($1); } + | UCONST + { + $$ = $1; + } | DOLCONST { $$ = $1; @@ -7013,6 +7017,7 @@ cvariable: CVARIABLE ; ident: IDENT { $$ = $1; } | CSTRING { $$ = make3_str(make_str("\""), $1, make_str("\"")); } + | UIDENT { $$ = $1; } ; quoted_ident_stringvar: name diff --git a/src/interfaces/ecpg/test/ecpg_schedule b/src/interfaces/ecpg/test/ecpg_schedule index c478ed126b..14fcd41a46 100644 --- a/src/interfaces/ecpg/test/ecpg_schedule +++ b/src/interfaces/ecpg/test/ecpg_schedule @@ -18,6 +18,7 @@ test: preproc/autoprep test: preproc/comment test: preproc/define test: preproc/init +test: preproc/strings test: preproc/type test: preproc/variable test: preproc/whenever diff --git a/src/interfaces/ecpg/test/ecpg_schedule_tcp b/src/interfaces/ecpg/test/ecpg_schedule_tcp index 5dbca9dd16..8143473244 100644 --- a/src/interfaces/ecpg/test/ecpg_schedule_tcp +++ b/src/interfaces/ecpg/test/ecpg_schedule_tcp @@ -18,6 +18,7 @@ test: preproc/autoprep test: preproc/comment test: preproc/define test: preproc/init +test: preproc/strings test: preproc/type test: preproc/variable test: 
preproc/whenever diff --git a/src/interfaces/ecpg/test/expected/preproc-strings.c b/src/interfaces/ecpg/test/expected/preproc-strings.c new file mode 100644 index 0000000000..9a99dad11e --- /dev/null +++ b/src/interfaces/ecpg/test/expected/preproc-strings.c @@ -0,0 +1,62 @@ +/* Processed by ecpg (regression mode) */ +/* These include files are added by the preprocessor */ +#include +#include +#include +/* End of automatic include section */ +#define ECPGdebug(X,Y) ECPGdebug((X)+100,(Y)) + +#line 1 "strings.pgc" +#include + + +#line 1 "regression.h" + + + + + + +#line 3 "strings.pgc" + + +/* exec sql begin declare section */ + + +#line 6 "strings.pgc" + char * s1 , * s2 , * s3 , * s4 , * s5 , * s6 ; +/* exec sql end declare section */ +#line 7 "strings.pgc" + + +int main(void) +{ + ECPGdebug(1, stderr); + + { ECPGconnect(__LINE__, 0, "regress1" , NULL, NULL , NULL, 0); } +#line 13 "strings.pgc" + + + { ECPGdo(__LINE__, 0, 1, NULL, 0, ECPGst_normal, "select 'abcdef' , N'abcdef' as foo , E'abc\\bdef' as \"foo\" , U&'d\\0061t\\0061' as U&\"foo\" , U&'d!+000061t!+000061' uescape '!' 
, $foo$abc$def$foo$ ", ECPGt_EOIT, + ECPGt_char,&(s1),(long)0,(long)1,(1)*sizeof(char), + ECPGt_NO_INDICATOR, NULL , 0L, 0L, 0L, + ECPGt_char,&(s2),(long)0,(long)1,(1)*sizeof(char), + ECPGt_NO_INDICATOR, NULL , 0L, 0L, 0L, + ECPGt_char,&(s3),(long)0,(long)1,(1)*sizeof(char), + ECPGt_NO_INDICATOR, NULL , 0L, 0L, 0L, + ECPGt_char,&(s4),(long)0,(long)1,(1)*sizeof(char), + ECPGt_NO_INDICATOR, NULL , 0L, 0L, 0L, + ECPGt_char,&(s5),(long)0,(long)1,(1)*sizeof(char), + ECPGt_NO_INDICATOR, NULL , 0L, 0L, 0L, + ECPGt_char,&(s6),(long)0,(long)1,(1)*sizeof(char), + ECPGt_NO_INDICATOR, NULL , 0L, 0L, 0L, ECPGt_EORT);} +#line 21 "strings.pgc" + + + printf("%s %s %s %s %s %s\n", s1, s2, s3, s4, s5, s6); + + { ECPGdisconnect(__LINE__, "CURRENT");} +#line 25 "strings.pgc" + + exit (0); +} diff --git a/src/interfaces/ecpg/test/expected/preproc-strings.stderr b/src/interfaces/ecpg/test/expected/preproc-strings.stderr new file mode 100644 index 0000000000..021e2801eb --- /dev/null +++ b/src/interfaces/ecpg/test/expected/preproc-strings.stderr @@ -0,0 +1,36 @@ +[NO_PID]: ECPGdebug: set to 1 +[NO_PID]: sqlca: code: 0, state: 00000 +[NO_PID]: ECPGconnect: opening database regress1 on port +[NO_PID]: sqlca: code: 0, state: 00000 +[NO_PID]: ecpg_execute on line 15: query: select 'abcdef' , N'abcdef' as foo , E'abc\bdef' as "foo" , U&'d\0061t\0061' as U&"foo" , U&'d!+000061t!+000061' uescape '!' 
, $foo$abc$def$foo$ ; with 0 parameter(s) on connection regress1 +[NO_PID]: sqlca: code: 0, state: 00000 +[NO_PID]: ecpg_execute on line 15: using PQexec +[NO_PID]: sqlca: code: 0, state: 00000 +[NO_PID]: ecpg_execute on line 15: correctly got 1 tuples with 6 fields +[NO_PID]: sqlca: code: 0, state: 00000 +[NO_PID]: ecpg_store_result on line 15: allocating memory for 1 tuples +[NO_PID]: sqlca: code: 0, state: 00000 +[NO_PID]: ecpg_get_data on line 15: RESULT: abcdef offset: -1; array: yes +[NO_PID]: sqlca: code: 0, state: 00000 +[NO_PID]: ecpg_store_result on line 15: allocating memory for 1 tuples +[NO_PID]: sqlca: code: 0, state: 00000 +[NO_PID]: ecpg_get_data on line 15: RESULT: abcdef offset: -1; array: yes +[NO_PID]: sqlca: code: 0, state: 00000 +[NO_PID]: ecpg_store_result on line 15: allocating memory for 1 tuples +[NO_PID]: sqlca: code: 0, state: 00000 +[NO_PID]: ecpg_get_data on line 15: RESULT: abcdef offset: -1; array: yes +[NO_PID]: sqlca: code: 0, state: 00000 +[NO_PID]: ecpg_store_result on line 15: allocating memory for 1 tuples +[NO_PID]: sqlca: code: 0, state: 00000 +[NO_PID]: ecpg_get_data on line 15: RESULT: data offset: -1; array: yes +[NO_PID]: sqlca: code: 0, state: 00000 +[NO_PID]: ecpg_store_result on line 15: allocating memory for 1 tuples +[NO_PID]: sqlca: code: 0, state: 00000 +[NO_PID]: ecpg_get_data on line 15: RESULT: data offset: -1; array: yes +[NO_PID]: sqlca: code: 0, state: 00000 +[NO_PID]: ecpg_store_result on line 15: allocating memory for 1 tuples +[NO_PID]: sqlca: code: 0, state: 00000 +[NO_PID]: ecpg_get_data on line 15: RESULT: abc$def offset: -1; array: yes +[NO_PID]: sqlca: code: 0, state: 00000 +[NO_PID]: ecpg_finish: connection regress1 closed +[NO_PID]: sqlca: code: 0, state: 00000 diff --git a/src/interfaces/ecpg/test/expected/preproc-strings.stdout b/src/interfaces/ecpg/test/expected/preproc-strings.stdout new file mode 100644 index 0000000000..730d72dd64 --- /dev/null +++ 
b/src/interfaces/ecpg/test/expected/preproc-strings.stdout @@ -0,0 +1 @@ +abcdef abcdef abcdef data data abc$def diff --git a/src/interfaces/ecpg/test/preproc/Makefile b/src/interfaces/ecpg/test/preproc/Makefile index 6928a1f3fe..94b6779a41 100644 --- a/src/interfaces/ecpg/test/preproc/Makefile +++ b/src/interfaces/ecpg/test/preproc/Makefile @@ -9,6 +9,7 @@ TESTS = array_of_struct array_of_struct.c \ comment comment.c \ define define.c \ init init.c \ + strings strings.c \ type type.c \ variable variable.c \ whenever whenever.c diff --git a/src/interfaces/ecpg/test/preproc/strings.pgc b/src/interfaces/ecpg/test/preproc/strings.pgc new file mode 100644 index 0000000000..1a8c0d707d --- /dev/null +++ b/src/interfaces/ecpg/test/preproc/strings.pgc @@ -0,0 +1,27 @@ +#include + +exec sql include ../regression; + +exec sql begin declare section; +char *s1, *s2, *s3, *s4, *s5, *s6; +exec sql end declare section; + +int main(void) +{ + ECPGdebug(1, stderr); + + exec sql connect to REGRESSDB1; + + exec sql select 'abcdef', + N'abcdef' AS foo, + E'abc\bdef' AS "foo", + U&'d\0061t\0061' AS U&"foo", + U&'d!+000061t!+000061' uescape '!', + $foo$abc$def$foo$ + into :s1, :s2, :s3, :s4, :s5, :s6; + + printf("%s %s %s %s %s %s\n", s1, s2, s3, s4, s5, s6); + + exec sql disconnect; + exit (0); +} diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out index 742ec9291f..6b9dc5df9f 100644 --- a/src/test/regress/expected/strings.out +++ b/src/test/regress/expected/strings.out @@ -21,6 +21,31 @@ SELECT 'first line' ERROR: syntax error at or near "' - third line'" LINE 3: ' - third line' ^ +-- Unicode escapes +SELECT U&'d\0061t\+000061' AS U&"d\0061t\+000061"; + data +------ + data +(1 row) + +SELECT U&'d!0061t\+000061' UESCAPE '!' 
AS U&"d*0061t\+000061" UESCAPE '*'; + dat\+000061 +------------- + dat\+000061 +(1 row) + +SELECT U&'wrong: \061'; +ERROR: invalid Unicode escape value at or near "\061'" +LINE 1: SELECT U&'wrong: \061'; + ^ +SELECT U&'wrong: \+0061'; +ERROR: invalid Unicode escape value at or near "\+0061'" +LINE 1: SELECT U&'wrong: \+0061'; + ^ +SELECT U&'wrong: +0061' UESCAPE '+'; +ERROR: invalid Unicode escape character at or near "+'" +LINE 1: SELECT U&'wrong: +0061' UESCAPE '+'; + ^ -- -- test conversions between various string types -- E021-10 implicit casting among the character data types diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql index c042f33b01..0da88c7b29 100644 --- a/src/test/regress/sql/strings.sql +++ b/src/test/regress/sql/strings.sql @@ -16,6 +16,14 @@ SELECT 'first line' ' - third line' AS "Illegal comment within continuation"; +-- Unicode escapes +SELECT U&'d\0061t\+000061' AS U&"d\0061t\+000061"; +SELECT U&'d!0061t\+000061' UESCAPE '!' AS U&"d*0061t\+000061" UESCAPE '*'; + +SELECT U&'wrong: \061'; +SELECT U&'wrong: \+0061'; +SELECT U&'wrong: +0061' UESCAPE '+'; + -- -- test conversions between various string types -- E021-10 implicit casting among the character data types -- 2.11.4.GIT