From 0c618f4b7f0d8ee20b30a276efc8a2c17acb469c Mon Sep 17 00:00:00 2001
From: bellard <bellard>
Date: Sun, 13 Apr 2003 17:37:14 +0000
Subject: [PATCH] sanitized string and character constant parsing

---
 tcc.c | 385 +++++++++++++++++++++++++++++++++++++-----------------------------
 1 file changed, 215 insertions(+), 170 deletions(-)
diff --git a/tcc.c b/tcc.c
index 9834fb35..c557beaf 100644
--- a/tcc.c
+++ b/tcc.c
@@ -223,7 +223,7 @@ typedef struct BufferedFile {
     uint8_t *buf_ptr;
     uint8_t *buf_end;
     int fd;
-    int line_num;    /* current line number - here to simply code */
+    int line_num;    /* current line number - here to simplify code */
     int ifndef_macro;  /* #ifndef macro / #endif search */
     int ifndef_macro_saved; /* saved ifndef_macro */
     int *ifdef_stack_ptr; /* ifdef_stack value at the start of the file */
@@ -1738,6 +1738,18 @@ static int handle_stray1(uint8_t *p)
     return c;
 }
 
+/* handle just the EOB case, but not stray */
+#define PEEKC_EOB(c, p)\
+{\
+    p++;\
+    c = *p;\
+    if (c == '\\') {\
+        file->buf_ptr = p;\
+        c = handle_eob();\
+        p = file->buf_ptr;\
+    }\
+}
+
 /* handle the complicated stray case */
 #define PEEKC(c, p)\
 {\
@@ -1862,11 +1874,73 @@ static inline void skip_spaces(void)
         cinp();
 }
 
+/* parse a string without interpreting escapes */
+static uint8_t *parse_pp_string(uint8_t *p,
+                                int sep, CString *str)
+{
+    int c;
+    p++;
+    for(;;) {
+        c = *p;
+        if (c == sep) {
+            break;
+        } else if (c == '\\') {
+            file->buf_ptr = p;
+            c = handle_eob();
+            p = file->buf_ptr;
+            if (c == CH_EOF) {
+            unterminated_string:
+                /* XXX: indicate line number of start of string */
+                error("missing terminating %c character", sep);
+            } else if (c == '\\') {
+                /* escape : just skip \[\r]\n */
+                PEEKC_EOB(c, p);
+                if (c == '\n') {
+                    file->line_num++;
+                    p++;
+                } else if (c == '\r') {
+                    PEEKC_EOB(c, p);
+                    if (c != '\n')
+                        expect("'\n' after '\r'");
+                    file->line_num++;
+                    p++;
+                } else if (c == CH_EOF) {
+                    goto unterminated_string;
+                } else {
+                    if (str) {
+                        cstr_ccat(str, '\\');
+                        cstr_ccat(str, c);
+                    }
+                    p++;
+                }
+            }
+        } else if (c == '\n') {
+            file->line_num++;
+            goto add_char;
+        } else if (c == '\r') {
+            PEEKC_EOB(c, p);
+            if (c != '\n') {
+                cstr_ccat(str, '\r');
+            } else {
+                file->line_num++;
+                goto add_char;
+            }
+        } else {
+        add_char:
+            if (str)
+                cstr_ccat(str, c);
+            p++;
+        }
+    }
+    p++;
+    return p;
+}
+
 /* skip block of text until #else, #elif or #endif. skip also pairs of
    #if/#endif */
 void preprocess_skip(void)
 {
-    int a, start_of_line, sep, c;
+    int a, start_of_line, c;
     uint8_t *p;
 
     p = file->buf_ptr;
@@ -1903,41 +1977,7 @@ void preprocess_skip(void)
             /* skip strings */
         case '\"':
         case '\'':
-            sep = c;
-            p++;
-            for(;;) {
-                c = *p;
-                if (c == sep) {
-                    break;
-                } else if (c == '\\') {
-                    file->buf_ptr = p;
-                    c = handle_eob();
-                    p = file->buf_ptr;
-                    if (c == CH_EOF) {
-                        /* XXX: better error message */
-                        error("unterminated string");
-                    } else if (c == '\\') {
-                        /* ignore next char */
-                        p++;
-                        c = *p;
-                        if (c == '\\') {
-                            file->buf_ptr = p;
-                            c = handle_eob();
-                            p = file->buf_ptr;
-                        }
-                        if (c == '\n')
-                            file->line_num++;
-                        else if (c != CH_EOF)
-                            p++;
-                    }
-                } else if (c == '\n') {
-                    file->line_num++;
-                    p++;
-                } else {
-                    p++;
-                }
-            }
-            p++;
+            p = parse_pp_string(p, c, NULL);
             break;
             /* skip comments */
         case '/':
@@ -2724,106 +2764,105 @@ static void preprocess(int is_bof)
     parse_flags = saved_parse_flags;
 }
 
-/* read a number in base b */
-static int getn(int b)
-{
-    int n, t;
-    n = 0;
-    while (1) {
-        if (ch >= 'a' && ch <= 'f')
-            t = ch - 'a' + 10;
-        else if (ch >= 'A' && ch <= 'F')
-            t = ch - 'A' + 10;
-        else if (isnum(ch))
-            t = ch - '0';
-        else
-            break;
-        if (t < 0 || t >= b)
-            break;
-        n = n * b + t;
-        inp();
-    }
-    return n;
-}
-
-/* read a character for string or char constant and eval escape codes */
-static int getq(void)
+/* evaluate escape codes in a string. */
+static void parse_escape_string(CString *outstr, const uint8_t *buf, int is_long)
 {
-    int c;
+    int c, n;
+    const char *p;
 
- redo:
-    c = ch;
-    inp();
-    if (c == '\\') {
-        switch(ch) {
-        case '0': case '1': case '2': case '3':
-        case '4': case '5': case '6': case '7':
-            /* at most three octal digits */
-            c = ch - '0';
-            inp();
-            if (isoct(ch)) {
-                c = c * 8 + ch - '0';
-                inp();
-                if (isoct(ch)) {
-                    c = c * 8 + ch - '0';
-                    inp();
+    p = buf;
+    for(;;) {
+        c = *p;
+        if (c == '\0')
+            break;
+        if (c == '\\') {
+            p++;
+            /* escape */
+            c = *p;
+            switch(c) {
+            case '0': case '1': case '2': case '3':
+            case '4': case '5': case '6': case '7':
+                /* at most three octal digits */
+                n = c - '0';
+                p++;
+                c = *p;
+                if (isoct(c)) {
+                    n = n * 8 + c - '0';
+                    p++;
+                    c = *p;
+                    if (isoct(c)) {
+                        n = n * 8 + c - '0';
+                        p++;
+                    }
+                }
+                c = n;
+                goto add_char_nonext;
+            case 'x':
+                p++;
+                n = 0;
+                for(;;) {
+                    c = *p;
+                    if (c >= 'a' && c <= 'f')
+                        c = c - 'a' + 10;
+                    else if (c >= 'A' && c <= 'F')
+                        c = c - 'A' + 10;
+                    else if (isnum(c))
+                        c = c - '0';
+                    else
+                        break;
+                    n = n * 16 + c;
+                    p++;
                 }
+                c = n;
+                goto add_char_nonext;
+            case 'a':
+                c = '\a';
+                break;
+            case 'b':
+                c = '\b';
+                break;
+            case 'f':
+                c = '\f';
+                break;
+            case 'n':
+                c = '\n';
+                break;
+            case 'r':
+                c = '\r';
+                break;
+            case 't':
+                c = '\t';
+                break;
+            case 'v':
+                c = '\v';
+                break;
+            case 'e':
+                if (!gnu_ext)
+                    goto invalid_escape;
+                c = 27;
+                break;
+            case '\'':
+            case '\"':
+            case '\\': 
+            case '?':
+                break;
+            default:
+            invalid_escape:
+                error("invalid escaped char");
             }
-            return c;
-        case 'x':
-            inp();
-            return getn(16);
-        case 'a':
-            c = '\a';
-            break;
-        case 'b':
-            c = '\b';
-            break;
-        case 'f':
-            c = '\f';
-            break;
-        case 'n':
-            c = '\n';
-            break;
-        case 'r':
-            c = '\r';
-            break;
-        case 't':
-            c = '\t';
-            break;
-        case 'v':
-            c = '\v';
-            break;
-        case 'e':
-            if (!gnu_ext)
-                goto invalid_escape;
-            c = 27;
-            break;
-        case '\'':
-        case '\"':
-        case '\\': 
-        case '?':
-            c = ch;
-            break;
-        case '\n':
-            inp();
-            goto redo;
-        case '\r':
-            inp();
-            if (ch != '\n')
-                goto invalid_escape;
-            inp();
-            goto redo;
-        default:
-        invalid_escape:
-            error("invalid escaped char");
         }
-        inp();
-    } else if (c == '\r' && ch == '\n') {
-        inp();
-        c = '\n';
+        p++;
+    add_char_nonext:
+        if (!is_long)
+            cstr_ccat(outstr, c);
+        else
+            cstr_wccat(outstr, c);
     }
-    return c;
+    /* add a trailing '\0' */
+    if (!is_long)
+        cstr_ccat(outstr, '\0');
+    else
+        cstr_wccat(outstr, '\0');
 }
 
 /* we use 64 bit numbers */
@@ -3132,7 +3171,7 @@ void parse_number(const char *p)
 /* return next token without macro substitution */
 static inline void next_nomacro1(void)
 {
-    int b, t, c;
+    int t, c, is_long;
     TokenSym *ts;
     uint8_t *p, *p1;
     unsigned int h;
@@ -3304,11 +3343,8 @@ static inline void next_nomacro1(void)
             goto parse_ident_fast;
         } else {
             PEEKC(c, p);
-            if (c == '\'') {
-                tok = TOK_LCHAR;
-                goto char_const; 
-            } else if (c == '\"') {
-                tok = TOK_LSTR;
+            if (c == '\'' || c == '\"') {
+                is_long = 1;
                 goto str_const;
             } else {
                 cstr_reset(&tokcstr);
@@ -3357,42 +3393,51 @@ static inline void next_nomacro1(void)
         }
         break;
     case '\'':
-        tok = TOK_CCHAR;
-    char_const:
-        file->buf_ptr = p;
-        inp();
-        b = getq();
-        /* this cast is needed if >= 128 */
-        if (tok == TOK_CCHAR)
-            b = (char)b; 
-        tokc.i = b;
-        if (ch != '\'')
-            error("unterminated character constant");
-        p = file->buf_ptr;
-        p++;
-        break;
     case '\"':
-        tok = TOK_STR;
+        is_long = 0;
     str_const:
-        file->buf_ptr = p;
-        inp();
-        cstr_reset(&tokcstr);
-        while (ch != '\"') {
-            b = getq();
-            if (ch == CH_EOF)
-                error("unterminated string");
-            if (tok == TOK_STR)
-                cstr_ccat(&tokcstr, b);
-            else
-                cstr_wccat(&tokcstr, b);
+        {
+            CString str;
+            int sep;
+
+            sep = c;
+
+            /* parse the string */
+            cstr_new(&str);
+            p = parse_pp_string(p, sep, &str);
+            cstr_ccat(&str, '\0');
+            
+            /* eval the escape (should be done as TOK_PPNUM) */
+            cstr_reset(&tokcstr);
+            parse_escape_string(&tokcstr, str.data, is_long);
+            cstr_free(&str);
+
+            if (sep == '\'') {
+                int char_size;
+                /* XXX: make it portable */
+                if (!is_long)
+                    char_size = 1;
+                else
+                    char_size = sizeof(int);
+                if (tokcstr.size <= char_size)
+                    error("empty character constant");
+                if (tokcstr.size > 2 * char_size)
+                    warning("multi-character character constant");
+                if (!is_long) {
+                    tokc.i = *(int8_t *)tokcstr.data;
+                    tok = TOK_CCHAR;
+                } else {
+                    tokc.i = *(int *)tokcstr.data;
+                    tok = TOK_LCHAR;
+                }
+            } else {
+                tokc.cstr = &tokcstr;
+                if (!is_long)
+                    tok = TOK_STR;
+                else
+                    tok = TOK_LSTR;
+            }
         }
-        if (tok == TOK_STR)
-            cstr_ccat(&tokcstr, '\0');
-        else
-            cstr_wccat(&tokcstr, '\0');
-        tokc.cstr = &tokcstr;
-        p = file->buf_ptr;
-        p++;
         break;
 
     case '<':
-- 
2.11.4.GIT