Enhance the command-line completion extension to return the names of
[sqlite.git] / ext / misc / normalize.c
blobfd656f13038152783463934e8858191e92d7e7b5
1 /*
2 ** 2018-01-08
3 **
4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
6 **
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
11 ******************************************************************************
13 ** This file contains code to implement the sqlite3_normalize() function.
15 ** char *sqlite3_normalize(const char *zSql);
17 ** This function takes an SQL string as input and returns a "normalized"
18 ** version of that string in memory obtained from sqlite3_malloc64(). The
19 ** caller is responsible for ensuring that the returned memory is freed.
21 ** If a memory allocation error occurs, this routine returns NULL.
23 ** The normalization consists of the following transformations:
25 ** (1) Convert every literal (string, blob literal, numeric constant,
26 ** or "NULL" constant) into a ?
28 ** (2) Remove all superfluous whitespace, including comments. Change
29 ** all required whitespace to a single space character.
31 ** (3) Lowercase all ASCII characters.
33 ** (4) If an IN or NOT IN operator is followed by a list of 1 or more
34 ** values, convert that list into "(?,?,?)".
36 ** The purpose of normalization is two-fold:
38 ** (1) Sanitize queries by removing potentially private or sensitive
39 ** information contained in literals.
41 ** (2) Identify structurally identical queries by comparing their
42 ** normalized forms.
44 ** Command-Line Utility
45 ** --------------------
47 ** This file also contains code for a command-line utility that converts
48 ** SQL queries in text files into their normalized forms. To build the
49 ** command-line program, compile this file with -DSQLITE_NORMALIZE_CLI
50 ** and link it against the SQLite library.
52 #include <sqlite3.h>
53 #include <string.h>
56 ** Implementation note:
58 ** Much of the tokenizer logic is copied out of the tokenize.c source file
59 ** of SQLite. That logic could be simplified for this particular application,
60 ** but that would impose a risk of introducing subtle errors. It is best to
61 ** keep the code as close to the original as possible.
63 ** The tokenize code is in sync with the SQLite core as of 2018-01-08.
64 ** Any future changes to the core tokenizer might require corresponding
65 ** adjustments to the tokenizer logic in this module.
69 /* Character classes for tokenizing
71 ** In the sqlite3GetToken() function, a switch() on aiClass[c] is implemented
72 ** using a lookup table, whereas a switch() directly on c uses a binary search.
73 ** The lookup table is much faster. To maximize speed, and to ensure that
74 ** a lookup table is used, all of the classes need to be small integers and
75 ** all of them need to be used within the switch.
77 #define CC_X 0 /* The letter 'x', or start of BLOB literal */
78 #define CC_KYWD 1 /* Alphabetics or '_'. Usable in a keyword */
79 #define CC_ID 2 /* unicode characters usable in IDs */
80 #define CC_DIGIT 3 /* Digits */
81 #define CC_DOLLAR 4 /* '$' */
82 #define CC_VARALPHA 5 /* '@', '#', ':'. Alphabetic SQL variables */
83 #define CC_VARNUM 6 /* '?'. Numeric SQL variables */
84 #define CC_SPACE 7 /* Space characters */
85 #define CC_QUOTE 8 /* '"', '\'', or '`'. String literals, quoted ids */
86 #define CC_QUOTE2 9 /* '['. [...] style quoted ids */
87 #define CC_PIPE 10 /* '|'. Bitwise OR or concatenate */
88 #define CC_MINUS 11 /* '-'. Minus or SQL-style comment */
89 #define CC_LT 12 /* '<'. Part of < or <= or <> */
90 #define CC_GT 13 /* '>'. Part of > or >= */
91 #define CC_EQ 14 /* '='. Part of = or == */
92 #define CC_BANG 15 /* '!'. Part of != */
93 #define CC_SLASH 16 /* '/'. / or c-style comment */
94 #define CC_LP 17 /* '(' */
95 #define CC_RP 18 /* ')' */
96 #define CC_SEMI 19 /* ';' */
97 #define CC_PLUS 20 /* '+' */
98 #define CC_STAR 21 /* '*' */
99 #define CC_PERCENT 22 /* '%' */
100 #define CC_COMMA 23 /* ',' */
101 #define CC_AND 24 /* '&' */
102 #define CC_TILDA 25 /* '~' */
103 #define CC_DOT 26 /* '.' */
104 #define CC_ILLEGAL 27 /* Illegal character */
106 static const unsigned char aiClass[] = {
107 /* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xa xb xc xd xe xf */
108 /* 0x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 7, 7, 27, 7, 7, 27, 27,
109 /* 1x */ 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
110 /* 2x */ 7, 15, 8, 5, 4, 22, 24, 8, 17, 18, 21, 20, 23, 11, 26, 16,
111 /* 3x */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 19, 12, 14, 13, 6,
112 /* 4x */ 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
113 /* 5x */ 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 9, 27, 27, 27, 1,
114 /* 6x */ 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
115 /* 7x */ 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 27, 10, 27, 25, 27,
116 /* 8x */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
117 /* 9x */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
118 /* Ax */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
119 /* Bx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
120 /* Cx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
121 /* Dx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
122 /* Ex */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
123 /* Fx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
126 /* An array to map all upper-case characters into their corresponding
127 ** lower-case character.
129 ** SQLite only considers US-ASCII (or EBCDIC) characters. We do not
130 ** handle case conversions for the UTF character set since the tables
131 ** involved are nearly as big or bigger than SQLite itself.
133 static const unsigned char sqlite3UpperToLower[] = {
134 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
135 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
136 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
137 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 97, 98, 99,100,101,102,103,
138 104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,
139 122, 91, 92, 93, 94, 95, 96, 97, 98, 99,100,101,102,103,104,105,106,107,
140 108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,
141 126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
142 144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,
143 162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,
144 180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,
145 198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,
146 216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,
147 234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,
148 252,253,254,255
152 ** The following 256 byte lookup table is used to support SQLites built-in
153 ** equivalents to the following standard library functions:
155 ** isspace() 0x01
156 ** isalpha() 0x02
157 ** isdigit() 0x04
158 ** isalnum() 0x06
159 ** isxdigit() 0x08
160 ** toupper() 0x20
161 ** SQLite identifier character 0x40
162 ** Quote character 0x80
164 ** Bit 0x20 is set if the mapped character requires translation to upper
165 ** case. i.e. if the character is a lower-case ASCII character.
166 ** If x is a lower-case ASCII character, then its upper-case equivalent
167 ** is (x - 0x20). Therefore toupper() can be implemented as:
169 ** (x & ~(map[x]&0x20))
171 ** The equivalent of tolower() is implemented using the sqlite3UpperToLower[]
172 ** array. tolower() is used more often than toupper() by SQLite.
174 ** Bit 0x40 is set if the character is non-alphanumeric and can be used in an
175 ** SQLite identifier. Identifiers are alphanumerics, "_", "$", and any
176 ** non-ASCII UTF character. Hence the test for whether or not a character is
177 ** part of an identifier is 0x46.
179 static const unsigned char sqlite3CtypeMap[256] = {
180 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 00..07 ........ */
181 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, /* 08..0f ........ */
182 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 10..17 ........ */
183 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 18..1f ........ */
184 0x01, 0x00, 0x80, 0x00, 0x40, 0x00, 0x00, 0x80, /* 20..27 !"#$%&' */
185 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 28..2f ()*+,-./ */
186 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, /* 30..37 01234567 */
187 0x0c, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 38..3f 89:;<=>? */
189 0x00, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x02, /* 40..47 @ABCDEFG */
190 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 48..4f HIJKLMNO */
191 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, /* 50..57 PQRSTUVW */
192 0x02, 0x02, 0x02, 0x80, 0x00, 0x00, 0x00, 0x40, /* 58..5f XYZ[\]^_ */
193 0x80, 0x2a, 0x2a, 0x2a, 0x2a, 0x2a, 0x2a, 0x22, /* 60..67 `abcdefg */
194 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, /* 68..6f hijklmno */
195 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, /* 70..77 pqrstuvw */
196 0x22, 0x22, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, /* 78..7f xyz{|}~. */
198 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 80..87 ........ */
199 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 88..8f ........ */
200 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 90..97 ........ */
201 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* 98..9f ........ */
202 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* a0..a7 ........ */
203 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* a8..af ........ */
204 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* b0..b7 ........ */
205 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* b8..bf ........ */
207 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* c0..c7 ........ */
208 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* c8..cf ........ */
209 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* d0..d7 ........ */
210 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* d8..df ........ */
211 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* e0..e7 ........ */
212 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* e8..ef ........ */
213 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, /* f0..f7 ........ */
214 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40 /* f8..ff ........ */
216 #define sqlite3Toupper(x) ((x)&~(sqlite3CtypeMap[(unsigned char)(x)]&0x20))
217 #define sqlite3Isspace(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x01)
218 #define sqlite3Isalnum(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x06)
219 #define sqlite3Isalpha(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x02)
220 #define sqlite3Isdigit(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x04)
221 #define sqlite3Isxdigit(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x08)
222 #define sqlite3Tolower(x) (sqlite3UpperToLower[(unsigned char)(x)])
223 #define sqlite3Isquote(x) (sqlite3CtypeMap[(unsigned char)(x)]&0x80)
227 ** If X is a character that can be used in an identifier then
228 ** IdChar(X) will be true. Otherwise it is false.
230 ** For ASCII, any character with the high-order bit set is
231 ** allowed in an identifier. For 7-bit characters,
232 ** sqlite3IsIdChar[X] must be 1.
234 ** For EBCDIC, the rules are more complex but have the same
235 ** end result.
237 ** Ticket #1066. the SQL standard does not allow '$' in the
238 ** middle of identifiers. But many SQL implementations do.
239 ** SQLite will allow '$' in identifiers for compatibility.
240 ** But the feature is undocumented.
242 #define IdChar(C) ((sqlite3CtypeMap[(unsigned char)C]&0x46)!=0)
245 ** Ignore testcase() macros
247 #define testcase(X)
250 ** Token values
252 #define TK_SPACE 0
253 #define TK_NAME 1
254 #define TK_LITERAL 2
255 #define TK_PUNCT 3
256 #define TK_ERROR 4
258 #define TK_MINUS TK_PUNCT
259 #define TK_LP TK_PUNCT
260 #define TK_RP TK_PUNCT
261 #define TK_SEMI TK_PUNCT
262 #define TK_PLUS TK_PUNCT
263 #define TK_STAR TK_PUNCT
264 #define TK_SLASH TK_PUNCT
265 #define TK_REM TK_PUNCT
266 #define TK_EQ TK_PUNCT
267 #define TK_LE TK_PUNCT
268 #define TK_NE TK_PUNCT
269 #define TK_LSHIFT TK_PUNCT
270 #define TK_LT TK_PUNCT
271 #define TK_GE TK_PUNCT
272 #define TK_RSHIFT TK_PUNCT
273 #define TK_GT TK_PUNCT
274 #define TK_GE TK_PUNCT
275 #define TK_BITOR TK_PUNCT
276 #define TK_CONCAT TK_PUNCT
277 #define TK_COMMA TK_PUNCT
278 #define TK_BITAND TK_PUNCT
279 #define TK_BITNOT TK_PUNCT
280 #define TK_STRING TK_LITERAL
281 #define TK_ID TK_NAME
282 #define TK_ILLEGAL TK_ERROR
283 #define TK_DOT TK_PUNCT
284 #define TK_INTEGER TK_LITERAL
285 #define TK_FLOAT TK_LITERAL
286 #define TK_VARIABLE TK_LITERAL
287 #define TK_BLOB TK_LITERAL
290 ** Return the length (in bytes) of the token that begins at z[0].
291 ** Store the token type in *tokenType before returning.
293 static int sqlite3GetToken(const unsigned char *z, int *tokenType){
294 int i, c;
295 switch( aiClass[*z] ){ /* Switch on the character-class of the first byte
296 ** of the token. See the comment on the CC_ defines
297 ** above. */
298 case CC_SPACE: {
299 for(i=1; sqlite3Isspace(z[i]); i++){}
300 *tokenType = TK_SPACE;
301 return i;
303 case CC_MINUS: {
304 if( z[1]=='-' ){
305 for(i=2; (c=z[i])!=0 && c!='\n'; i++){}
306 *tokenType = TK_SPACE;
307 return i;
309 *tokenType = TK_MINUS;
310 return 1;
312 case CC_LP: {
313 *tokenType = TK_LP;
314 return 1;
316 case CC_RP: {
317 *tokenType = TK_RP;
318 return 1;
320 case CC_SEMI: {
321 *tokenType = TK_SEMI;
322 return 1;
324 case CC_PLUS: {
325 *tokenType = TK_PLUS;
326 return 1;
328 case CC_STAR: {
329 *tokenType = TK_STAR;
330 return 1;
332 case CC_SLASH: {
333 if( z[1]!='*' || z[2]==0 ){
334 *tokenType = TK_SLASH;
335 return 1;
337 for(i=3, c=z[2]; (c!='*' || z[i]!='/') && (c=z[i])!=0; i++){}
338 if( c ) i++;
339 *tokenType = TK_SPACE;
340 return i;
342 case CC_PERCENT: {
343 *tokenType = TK_REM;
344 return 1;
346 case CC_EQ: {
347 *tokenType = TK_EQ;
348 return 1 + (z[1]=='=');
350 case CC_LT: {
351 if( (c=z[1])=='=' ){
352 *tokenType = TK_LE;
353 return 2;
354 }else if( c=='>' ){
355 *tokenType = TK_NE;
356 return 2;
357 }else if( c=='<' ){
358 *tokenType = TK_LSHIFT;
359 return 2;
360 }else{
361 *tokenType = TK_LT;
362 return 1;
365 case CC_GT: {
366 if( (c=z[1])=='=' ){
367 *tokenType = TK_GE;
368 return 2;
369 }else if( c=='>' ){
370 *tokenType = TK_RSHIFT;
371 return 2;
372 }else{
373 *tokenType = TK_GT;
374 return 1;
377 case CC_BANG: {
378 if( z[1]!='=' ){
379 *tokenType = TK_ILLEGAL;
380 return 1;
381 }else{
382 *tokenType = TK_NE;
383 return 2;
386 case CC_PIPE: {
387 if( z[1]!='|' ){
388 *tokenType = TK_BITOR;
389 return 1;
390 }else{
391 *tokenType = TK_CONCAT;
392 return 2;
395 case CC_COMMA: {
396 *tokenType = TK_COMMA;
397 return 1;
399 case CC_AND: {
400 *tokenType = TK_BITAND;
401 return 1;
403 case CC_TILDA: {
404 *tokenType = TK_BITNOT;
405 return 1;
407 case CC_QUOTE: {
408 int delim = z[0];
409 testcase( delim=='`' );
410 testcase( delim=='\'' );
411 testcase( delim=='"' );
412 for(i=1; (c=z[i])!=0; i++){
413 if( c==delim ){
414 if( z[i+1]==delim ){
415 i++;
416 }else{
417 break;
421 if( c=='\'' ){
422 *tokenType = TK_STRING;
423 return i+1;
424 }else if( c!=0 ){
425 *tokenType = TK_ID;
426 return i+1;
427 }else{
428 *tokenType = TK_ILLEGAL;
429 return i;
432 case CC_DOT: {
433 if( !sqlite3Isdigit(z[1]) ){
434 *tokenType = TK_DOT;
435 return 1;
437 /* If the next character is a digit, this is a floating point
438 ** number that begins with ".". Fall thru into the next case */
440 case CC_DIGIT: {
441 *tokenType = TK_INTEGER;
442 if( z[0]=='0' && (z[1]=='x' || z[1]=='X') && sqlite3Isxdigit(z[2]) ){
443 for(i=3; sqlite3Isxdigit(z[i]); i++){}
444 return i;
446 for(i=0; sqlite3Isdigit(z[i]); i++){}
447 if( z[i]=='.' ){
448 i++;
449 while( sqlite3Isdigit(z[i]) ){ i++; }
450 *tokenType = TK_FLOAT;
452 if( (z[i]=='e' || z[i]=='E') &&
453 ( sqlite3Isdigit(z[i+1])
454 || ((z[i+1]=='+' || z[i+1]=='-') && sqlite3Isdigit(z[i+2]))
457 i += 2;
458 while( sqlite3Isdigit(z[i]) ){ i++; }
459 *tokenType = TK_FLOAT;
461 while( IdChar(z[i]) ){
462 *tokenType = TK_ILLEGAL;
463 i++;
465 return i;
467 case CC_QUOTE2: {
468 for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){}
469 *tokenType = c==']' ? TK_ID : TK_ILLEGAL;
470 return i;
472 case CC_VARNUM: {
473 *tokenType = TK_VARIABLE;
474 for(i=1; sqlite3Isdigit(z[i]); i++){}
475 return i;
477 case CC_DOLLAR:
478 case CC_VARALPHA: {
479 int n = 0;
480 testcase( z[0]=='$' ); testcase( z[0]=='@' );
481 testcase( z[0]==':' ); testcase( z[0]=='#' );
482 *tokenType = TK_VARIABLE;
483 for(i=1; (c=z[i])!=0; i++){
484 if( IdChar(c) ){
485 n++;
486 }else if( c=='(' && n>0 ){
488 i++;
489 }while( (c=z[i])!=0 && !sqlite3Isspace(c) && c!=')' );
490 if( c==')' ){
491 i++;
492 }else{
493 *tokenType = TK_ILLEGAL;
495 break;
496 }else if( c==':' && z[i+1]==':' ){
497 i++;
498 }else{
499 break;
502 if( n==0 ) *tokenType = TK_ILLEGAL;
503 return i;
505 case CC_KYWD: {
506 for(i=1; aiClass[z[i]]<=CC_KYWD; i++){}
507 if( IdChar(z[i]) ){
508 /* This token started out using characters that can appear in keywords,
509 ** but z[i] is a character not allowed within keywords, so this must
510 ** be an identifier instead */
511 i++;
512 break;
514 *tokenType = TK_ID;
515 return i;
517 case CC_X: {
518 testcase( z[0]=='x' ); testcase( z[0]=='X' );
519 if( z[1]=='\'' ){
520 *tokenType = TK_BLOB;
521 for(i=2; sqlite3Isxdigit(z[i]); i++){}
522 if( z[i]!='\'' || i%2 ){
523 *tokenType = TK_ILLEGAL;
524 while( z[i] && z[i]!='\'' ){ i++; }
526 if( z[i] ) i++;
527 return i;
529 /* If it is not a BLOB literal, then it must be an ID, since no
530 ** SQL keywords start with the letter 'x'. Fall through */
532 case CC_ID: {
533 i = 1;
534 break;
536 default: {
537 *tokenType = TK_ILLEGAL;
538 return 1;
541 while( IdChar(z[i]) ){ i++; }
542 *tokenType = TK_ID;
543 return i;
546 char *sqlite3_normalize(const char *zSql){
547 char *z; /* The output string */
548 sqlite3_int64 nZ; /* Size of the output string in bytes */
549 sqlite3_int64 nSql; /* Size of the input string in bytes */
550 int i; /* Next character to read from zSql[] */
551 int j; /* Next slot to fill in on z[] */
552 int tokenType; /* Type of the next token */
553 int n; /* Size of the next token */
554 int k; /* Loop counter */
556 nSql = strlen(zSql);
557 nZ = nSql;
558 z = sqlite3_malloc64( nZ+2 );
559 if( z==0 ) return 0;
560 for(i=j=0; zSql[i]; i += n){
561 n = sqlite3GetToken((unsigned char*)zSql+i, &tokenType);
562 switch( tokenType ){
563 case TK_SPACE: {
564 break;
566 case TK_ERROR: {
567 sqlite3_free(z);
568 return 0;
570 case TK_LITERAL: {
571 z[j++] = '?';
572 break;
574 case TK_PUNCT:
575 case TK_NAME: {
576 if( n==4 && sqlite3_strnicmp(zSql+i,"NULL",4)==0 ){
577 if( (j>=3 && strncmp(z+j-2,"is",2)==0 && !IdChar(z[j-3]))
578 || (j>=4 && strncmp(z+j-3,"not",3)==0 && !IdChar(z[j-4]))
580 /* NULL is a keyword in this case, not a literal value */
581 }else{
582 /* Here the NULL is a literal value */
583 z[j++] = '?';
584 break;
587 if( j>0 && IdChar(z[j-1]) && IdChar(zSql[i]) ) z[j++] = ' ';
588 for(k=0; k<n; k++){
589 z[j++] = sqlite3Tolower(zSql[i+k]);
591 break;
595 while( j>0 && z[j-1]==' ' ){ j--; }
596 if( i>0 && z[j-1]!=';' ){ z[j++] = ';'; }
597 z[j] = 0;
599 /* Make a second pass converting "in(...)" where the "..." is not a
600 ** SELECT statement into "in(?,?,?)" */
601 for(i=0; i<j; i=n){
602 char *zIn = strstr(z+i, "in(");
603 int nParen;
604 if( zIn==0 ) break;
605 n = (int)(zIn-z)+3; /* Index of first char past "in(" */
606 if( n && IdChar(zIn[-1]) ) continue;
607 if( strncmp(zIn, "in(select",9)==0 && !IdChar(zIn[9]) ) continue;
608 if( strncmp(zIn, "in(with",7)==0 && !IdChar(zIn[7]) ) continue;
609 for(nParen=1, k=0; z[n+k]; k++){
610 if( z[n+k]=='(' ) nParen++;
611 if( z[n+k]==')' ){
612 nParen--;
613 if( nParen==0 ) break;
616 /* k is the number of bytes in the "..." within "in(...)" */
617 if( k<5 ){
618 z = sqlite3_realloc64(z, j+(5-k)+1);
619 if( z==0 ) return 0;
620 memmove(z+n+5, z+n+k, j-(n+k));
621 }else if( k>5 ){
622 memmove(z+n+5, z+n+k, j-(n+k));
624 j = j-k+5;
625 z[j] = 0;
626 memcpy(z+n, "?,?,?", 5);
628 return z;
632 ** For testing purposes, or to build a stand-alone SQL normalizer program,
633 ** compile this one source file with the -DSQLITE_NORMALIZE_CLI and link
634 ** it against any SQLite library. The resulting command-line program will
635 ** run sqlite3_normalize() over the text of all files named on the command-
636 ** line and show the result on standard output.
638 #ifdef SQLITE_NORMALIZE_CLI
639 #include <stdio.h>
640 #include <stdlib.h>
643 ** Break zIn up into separate SQL statements and run sqlite3_normalize()
644 ** on each one. Print the result of each run.
646 static void normalizeFile(char *zIn){
647 int i;
648 if( zIn==0 ) return;
649 for(i=0; zIn[i]; i++){
650 char cSaved;
651 if( zIn[i]!=';' ) continue;
652 cSaved = zIn[i+1];
653 zIn[i+1] = 0;
654 if( sqlite3_complete(zIn) ){
655 char *zOut = sqlite3_normalize(zIn);
656 if( zOut ){
657 printf("%s\n", zOut);
658 sqlite3_free(zOut);
659 }else{
660 fprintf(stderr, "ERROR: %s\n", zIn);
662 zIn[i+1] = cSaved;
663 zIn += i+1;
664 i = -1;
665 }else{
666 zIn[i+1] = cSaved;
672 ** The main routine for "sql_normalize". Read files named on the
673 ** command-line and run the text of each through sqlite3_normalize().
675 int main(int argc, char **argv){
676 int i;
677 FILE *in;
678 char *zBuf = 0;
679 sqlite3_int64 sz, got;
681 for(i=1; i<argc; i++){
682 in = fopen(argv[i], "rb");
683 if( in==0 ){
684 fprintf(stderr, "cannot open \"%s\"\n", argv[i]);
685 continue;
687 fseek(in, 0, SEEK_END);
688 sz = ftell(in);
689 rewind(in);
690 zBuf = sqlite3_realloc64(zBuf, sz+1);
691 if( zBuf==0 ){
692 fprintf(stderr, "failed to malloc for %lld bytes\n", sz);
693 exit(1);
695 got = fread(zBuf, 1, sz, in);
696 fclose(in);
697 if( got!=sz ){
698 fprintf(stderr, "only able to read %lld of %lld bytes from \"%s\"\n",
699 got, sz, argv[i]);
700 }else{
701 zBuf[got] = 0;
702 normalizeFile(zBuf);
705 sqlite3_free(zBuf);
707 #endif /* SQLITE_NORMALIZE_CLI */