From 8901fbeef15205d9dda3950c9eb8115ec677893d Mon Sep 17 00:00:00 2001
From: bellard <bellard>
Date: Sat, 23 Nov 2002 18:15:17 +0000
Subject: [PATCH] parsing optimizations

---
 tcc.c    | 535 +++++++++++++++++++++++++++++++++----------------------
 tccelf.c |   2 +-
 2 files changed, 319 insertions(+), 218 deletions(-)
diff --git a/tcc.c b/tcc.c
index de9a257..c9f54e4 100644
--- a/tcc.c
+++ b/tcc.c
@@ -1118,11 +1118,13 @@ void test_lvalue(void)
 TokenSym *tok_alloc(const char *str, int len)
 {
     TokenSym *ts, **pts, **ptable;
-    int h, i;
+    int i;
+    unsigned int h;
     
     h = 1;
     for(i=0;i<len;i++)
-        h = (h * 263 +  ((unsigned char *)str)[i]) & (TOK_HASH_SIZE - 1);
+        h = h * 263 +  ((unsigned char *)str)[i];
+    h &= (TOK_HASH_SIZE - 1);
 
     pts = &hash_ident[h];
     while (1) {
@@ -1522,7 +1524,12 @@ static int tcc_peekc_slow(BufferedFile *bf)
     /* only tries to read if really end of buffer */
     if (bf->buf_ptr >= bf->buf_end) {
         if (bf->fd != -1) {
-            len = read(bf->fd, bf->buffer, IO_BUF_SIZE);
+#if defined(PARSE_DEBUG)
+            len = 8;
+#else
+            len = IO_BUF_SIZE;
+#endif
+            len = read(bf->fd, bf->buffer, len);
             if (len < 0)
                 len = 0;
         } else {
@@ -1541,13 +1548,11 @@ static int tcc_peekc_slow(BufferedFile *bf)
     }
 }
 
-/* no need to put that inline */
-void handle_eob(void)
+/* return the current character, refilling the input buffer first if
+   its end has been reached (a stray '\\' is NOT handled here) */
+static int handle_eob(void)
 {
-    /* no need to do anything if not at EOB */
-    if (file->buf_ptr < file->buf_end)
-        return;
-    ch = tcc_peekc_slow(file);
+    return tcc_peekc_slow(file); /* only reads when buf_ptr >= buf_end */
 }
 
 /* read next char from current input file and handle end of input buffer */
@@ -1556,7 +1561,7 @@ static inline void inp(void)
     ch = *(++(file->buf_ptr));
     /* end of buffer/file handling */
     if (ch == CH_EOB)
-        handle_eob();
+        ch = handle_eob();
 }
 
 /* handle '\[\r]\n' */
@@ -1580,6 +1585,40 @@ static void handle_stray(void)
     }
 }
 
+/* skip the stray at p (handling \\n; error if incorrect char after it).
+   Return the current char; the new position is left in file->buf_ptr */
+static int handle_stray1(uint8_t *p)
+{
+    int c;
+
+    if (p >= file->buf_end) { /* '\\' may be the end-of-buffer marker */
+        file->buf_ptr = p;
+        c = handle_eob(); /* refill if needed, get the current char */
+        p = file->buf_ptr;
+        if (c == '\\')
+            goto parse_stray; /* still a backslash: treat it as a stray */
+    } else {
+    parse_stray:
+        file->buf_ptr = p; /* presumably handle_stray() reads ch/buf_ptr */
+        ch = *p;
+        handle_stray();
+        p = file->buf_ptr; /* pick up the position handle_stray() left */
+        c = *p;
+    }
+    return c;
+}
+
+/* advance p and load the next char into c; a '\\' goes to handle_stray1() */
+#define PEEKC(c, p)\
+do { /* do/while(0): expands to one statement, safe before an 'else' */\
+    (p)++;\
+    (c) = *(p);\
+    if ((c) == '\\') {\
+        (c) = handle_stray1(p); /* end of buffer or real stray */\
+        (p) = file->buf_ptr;    /* position may have moved */\
+    }\
+} while (0)
+
 /* input with '\[\r]\n' handling. Note that this function cannot
    handle other characters after '\', so you cannot call it inside
    strings or comments */
@@ -1606,8 +1645,8 @@ static void parse_comment(void)
     int c;
     
     /* C comments */
-    minp();
     p = file->buf_ptr;
+    p++;
     for(;;) {
         /* fast skip loop */
         for(;;) {
@@ -1628,49 +1667,49 @@ static void parse_comment(void)
             p++;
             for(;;) {
                 c = *p;
-                if (c == '/') {
+                if (c == '*') {
+                    p++;
+                } else if (c == '/') {
                     goto end_of_comment;
                 } else if (c == '\\') {
-                    if (p >= file->buf_end) {
-                        file->buf_ptr = p;
-                        handle_eob();
-                        p = file->buf_ptr;
-                        if (p >= file->buf_end)
-                            goto eof_found;
-                        continue;
+                    file->buf_ptr = p;
+                    c = handle_eob();
+                    if (c == '\\') {
+                        /* skip '\\n', but if '\' is followed by another
+                           char, behave as if a stray was parsed */
+                        ch = file->buf_ptr[0];
+                        while (ch == '\\') {
+                            inp();
+                            if (ch == '\n') {
+                                file->line_num++;
+                                inp();
+                            } else if (ch == '\r') {
+                                inp();
+                                if (ch == '\n') {
+                                    file->line_num++;
+                                    inp();
+                                }
+                            } else {
+                                p = file->buf_ptr;
+                                break;
+                            }
+                        }
                     }
-                    p++;
-                    c = *p;
-                    if (c == '\n') {
-                        file->line_num++;
-                        p++;
-                    } else if (c == '\r') {
-                        p++;
-                        c = *p;
-                        if (c != '\n')
-                            break;
-                        file->line_num++;
-                        p++;
-                    } else {
-                        break;
-                    }
-                } else if (c == '*') {
-                    p++;
+                    p = file->buf_ptr;
                 } else {
                     break;
                 }
             }
-        } else if (p >= file->buf_end) {
-            file->buf_ptr = p;
-            handle_eob();
-            p = file->buf_ptr;
-            if (p >= file->buf_end) {
-            eof_found:
-                error("unexpected end of file in comment");
-            }
         } else {
-            /* stray */
-            p++;
+            /* stray, eob or eof */
+            file->buf_ptr = p;
+            c = handle_eob();
+            p = file->buf_ptr;
+            if (c == CH_EOF) {
+                error("unexpected end of file in comment");
+            } else if (c == '\\') {
+                p++;
+            }
         }
     }
  end_of_comment:
@@ -1697,63 +1736,98 @@ static inline void skip_spaces(void)
    #if/#endif */
 void preprocess_skip(void)
 {
-    int a, start_of_line, sep;
-    
+    int a, start_of_line, sep, c;
+    uint8_t *p;
+
+    p = file->buf_ptr;
     start_of_line = 1;
     a = 0;
     for(;;) {
     redo_no_start:
-        switch(ch) {
+        c = *p;
+        switch(c) {
         case ' ':
         case '\t':
         case '\f':
         case '\v':
         case '\r':
-            inp();
+            p++;
             goto redo_no_start;
         case '\n':
             start_of_line = 1;
             file->line_num++;
-            inp();
+            p++;
             goto redo_no_start;
         case '\\':
-            handle_stray();
+            file->buf_ptr = p;
+            c = handle_eob();
+            if (c == CH_EOF) {
+                expect("#endif");
+            } else if (c == '\\') {
+                /* XXX: incorrect: should not give an error */
+                ch = file->buf_ptr[0];
+                handle_stray();
+            }
+            p = file->buf_ptr;
             goto redo_no_start;
             /* skip strings */
         case '\"':
         case '\'':
-            sep = ch;
-            inp();
-            while (ch != sep) {
-                /* XXX: better error message */
-                if (ch == TOK_EOF) {
-                    error("unterminated string");
-                } else if (ch == '\n') {
+            sep = c;
+            p++;
+            for(;;) {
+                c = *p;
+                if (c == sep) {
+                    break;
+                } else if (c == '\\') {
+                    file->buf_ptr = p;
+                    c = handle_eob();
+                    p = file->buf_ptr;
+                    if (c == CH_EOF) {
+                        /* XXX: better error message */
+                        error("unterminated string");
+                    } else if (c == '\\') {
+                        /* ignore next char (but count it if a newline) */
+                        p++;
+                        c = *p;
+                        if (c == '\\') {
+                            file->buf_ptr = p;
+                            c = handle_eob();
+                            p = file->buf_ptr;
+                        }
+                        if (c == '\n')
+                            file->line_num++; /* consumed just below */
+                        if (c != CH_EOF) /* else the loop would recount it */
+                            p++;
+                    }
+                } else if (c == '\n') {
                     file->line_num++;
-                } else if (ch == '\\') {
-                    /* ignore next char */
-                    inp();
-                    if (ch == '\n')
-                        file->line_num++;
+                    p++;
+                } else {
+                    p++;
                 }
-                inp();
             }
-            minp();
+            p++;
             break;
             /* skip comments */
         case '/':
+            file->buf_ptr = p;
+            ch = *p;
             minp();
             if (ch == '*') {
                 parse_comment();
             } else if (ch == '/') {
                 parse_line_comment();
             }
+            p = file->buf_ptr;
             break;
 
         case '#':
-            minp();
+            p++;
             if (start_of_line) {
+                file->buf_ptr = p;
                 next_nomacro();
+                p = file->buf_ptr;
                 if (a == 0 && 
                     (tok == TOK_ELSE || tok == TOK_ELIF || tok == TOK_ENDIF))
                     goto the_end;
@@ -1763,16 +1837,14 @@ void preprocess_skip(void)
                     a--;
             }
             break;
-        case CH_EOF:
-            expect("#endif");
-            break;
         default:
-            inp();
+            p++;
             break;
         }
         start_of_line = 0;
     }
  the_end: ;
+    file->buf_ptr = p;
 }
 
 /* ParseState handling */
@@ -2040,10 +2112,10 @@ void tok_print(int *str)
 #endif
 
 /* parse after #define */
-void parse_define(void)
+static void parse_define(void)
 {
     Sym *s, *first, **ps;
-    int v, t, varg, is_vaargs;
+    int v, t, varg, is_vaargs, c;
     TokenString str;
     
     v = tok;
@@ -2053,7 +2125,10 @@ void parse_define(void)
     first = NULL;
     t = MACRO_OBJ;
     /* '(' must be just after macro definition for MACRO_FUNC */
-    if (ch == '(') {
+    c = file->buf_ptr[0];
+    if (c == '\\')
+        c = handle_stray1(file->buf_ptr);
+    if (c == '(') {
         next_nomacro();
         next_nomacro();
         ps = &first;
@@ -2156,6 +2231,8 @@ static void preprocess(int is_bof)
             define_undef(s);
         break;
     case TOK_INCLUDE:
+        ch = file->buf_ptr[0];
+        /* XXX: incorrect if comments : use next_nomacro with a special mode */
         skip_spaces();
         if (ch == '<') {
             c = '>';
@@ -2781,32 +2858,55 @@ void parse_number(const char *p)
     }
 }
 
+/* PARSE2: 'case c1' giving tok2 if next char is c2 (consumed), else tok1; uses locals c, p */
+#define PARSE2(c1, tok1, c2, tok2)              \
+    case c1:                                    \
+        PEEKC(c, p);                            \
+        if (c == c2) {                          \
+            p++;                                \
+            tok = tok2;                         \
+        } else {                                \
+            tok = tok1;                         \
+        }                                       \
+        break;
+
 /* return next token without macro substitution */
 static inline void next_nomacro1(void)
 {
-    int b, t;
-    char *q;
+    int b, t, c;
     TokenSym *ts;
+    uint8_t *p, *p1;
 
+    p = file->buf_ptr;
  redo_no_start:
-    switch(ch) {
+    c = *p;
+    switch(c) {
     case ' ':
     case '\t':
     case '\f':
     case '\v':
     case '\r':
-        inp();
+        p++;
         goto redo_no_start;
         
     case '\\':
         /* first look if it is in fact an end of buffer */
-        handle_eob();
-        if (ch != '\\')
+        if (p >= file->buf_end) {
+            file->buf_ptr = p;
+            handle_eob();
+            p = file->buf_ptr;
+            if (p >= file->buf_end)
+                goto parse_eof;
+            else
+                goto redo_no_start;
+        } else {
+            file->buf_ptr = p;
+            ch = *p;
+            handle_stray();
+            p = file->buf_ptr;
             goto redo_no_start;
-        handle_stray();
-        goto redo_no_start;
-
-    case CH_EOF:
+        }
+    parse_eof:
         {
             TCCState *s1 = tcc_state;
 
@@ -2837,6 +2937,7 @@ static inline void next_nomacro1(void)
                 s1->include_stack_ptr--;
                 file = *s1->include_stack_ptr;
                 inp();
+                p = file->buf_ptr;
                 goto redo_no_start;
             }
         }
@@ -2848,19 +2949,22 @@ static inline void next_nomacro1(void)
             tok = TOK_LINEFEED;
         } else {
             tok_flags |= TOK_FLAG_BOL;
-            inp();
+            p++;
             goto redo_no_start;
         }
         break;
 
     case '#':
-        minp();
+        /* XXX: simplify */
+        PEEKC(c, p);
         if (tok_flags & TOK_FLAG_BOL) {
+            file->buf_ptr = p;
             preprocess(tok_flags & TOK_FLAG_BOF);
+            p = file->buf_ptr;
             goto redo_no_start;
         } else {
-            if (ch == '#') {
-                inp();
+            if (c == '#') {
+                p++;
                 tok = TOK_TWOSHARPS;
             } else {
                 tok = '#';
@@ -2883,34 +2987,57 @@ static inline void next_nomacro1(void)
     case 'U': case 'V': case 'W': case 'X':
     case 'Y': case 'Z': 
     case '_':
-        q = token_buf;
-        *q++ = ch;
-        cinp();
-    parse_ident:
-        while (isid(ch) || isnum(ch)) {
-            if (q >= token_buf + STRING_MAX_SIZE)
-                error("ident too long");
-            *q++ = ch;
-            cinp();
+    parse_ident_fast:
+        p1 = p;
+        p++;
+        for(;;) {
+            c = *p;
+            if (!isid(c) && !isnum(c))
+                break;
+            p++;
+        }
+        if (c != '\\') {
+            /* fast case : no stray found, so we have the full token */
+            ts = tok_alloc(p1, p - p1);
+        } else {
+            /* slower case */
+            cstr_reset(&tokcstr);
+
+            while (p1 < p) {
+                cstr_ccat(&tokcstr, *p1);
+                p1++;
+            }
+            p--;
+            PEEKC(c, p);
+        parse_ident_slow:
+            while (isid(c) || isnum(c)) {
+                cstr_ccat(&tokcstr, c);
+                PEEKC(c, p);
+            }
+            ts = tok_alloc(tokcstr.data, tokcstr.size);
         }
-        *q = '\0';
-        ts = tok_alloc(token_buf, q - token_buf);
         tok = ts->tok;
         break;
     case 'L':
-        minp();
-        if (ch == '\'') {
-            tok = TOK_LCHAR;
-            goto char_const;
+        c = p[1];
+        if (c != '\\' && c != '\'' && c != '\"') {
+            /* fast case */
+            goto parse_ident_fast;
+        } else {
+            PEEKC(c, p);
+            if (c == '\'') {
+                tok = TOK_LCHAR;
+                goto char_const; 
+            } else if (c == '\"') {
+                tok = TOK_LSTR;
+                goto str_const;
+            } else {
+                cstr_reset(&tokcstr);
+                cstr_ccat(&tokcstr, 'L');
+                goto parse_ident_slow;
+            }
         }
-        if (ch == '\"') {
-            tok = TOK_LSTR;
-            goto str_const;
-        }
-        q = token_buf;
-        *q++ = 'L';
-        goto parse_ident;
-
+        break;
     case '0': case '1': case '2': case '3':
     case '4': case '5': case '6': case '7':
     case '8': case '9':
@@ -2920,11 +3047,11 @@ static inline void next_nomacro1(void)
            prefixed by 'eEpP' */
     parse_num:
         for(;;) {
-            t = ch;
-            cstr_ccat(&tokcstr, ch);
-            cinp();
-            if (!(isnum(ch) || isid(ch) || ch == '.' ||
-                  ((ch == '+' || ch == '-') && 
+            t = c;
+            cstr_ccat(&tokcstr, c);
+            PEEKC(c, p);
+            if (!(isnum(c) || isid(c) || c == '.' ||
+                  ((c == '+' || c == '-') && 
                    (t == 'e' || t == 'E' || t == 'p' || t == 'P'))))
                 break;
         }
@@ -2935,17 +3062,16 @@ static inline void next_nomacro1(void)
         break;
     case '.':
         /* special dot handling because it can also start a number */
-        cinp();
-        if (isnum(ch)) {
+        PEEKC(c, p);
+        if (isnum(c)) {
             cstr_reset(&tokcstr);
             cstr_ccat(&tokcstr, '.');
             goto parse_num;
-        }
-        if (ch == '.') {
-            cinp();
-            if (ch != '.')
+        } else if (c == '.') {
+            PEEKC(c, p);
+            if (c != '.')
                 expect("'.'");
-            cinp();
+            PEEKC(c, p);
             tok = TOK_DOTS;
         } else {
             tok = '.';
@@ -2954,6 +3080,7 @@ static inline void next_nomacro1(void)
     case '\'':
         tok = TOK_CCHAR;
     char_const:
+        file->buf_ptr = p;
         inp();
         b = getq();
         /* this cast is needed if >= 128 */
@@ -2962,11 +3089,13 @@ static inline void next_nomacro1(void)
         tokc.i = b;
         if (ch != '\'')
             error("unterminated character constant");
-        inp();
+        p = file->buf_ptr;
+        p++;
         break;
     case '\"':
         tok = TOK_STR;
     str_const:
+        file->buf_ptr = p;
         inp();
         cstr_reset(&tokcstr);
         while (ch != '\"') {
@@ -2983,18 +3112,19 @@ static inline void next_nomacro1(void)
         else
             cstr_wccat(&tokcstr, '\0');
         tokc.cstr = &tokcstr;
-        inp();
+        p = file->buf_ptr;
+        p++;
         break;
 
     case '<':
-        cinp();
-        if (ch == '=') {
-            cinp();
+        PEEKC(c, p);
+        if (c == '=') {
+            p++;
             tok = TOK_LE;
-        } else if (ch == '<') {
-            cinp();
-            if (ch == '=') {
-                cinp();
+        } else if (c == '<') {
+            PEEKC(c, p);
+            if (c == '=') {
+                p++;
                 tok = TOK_A_SHL;
             } else {
                 tok = TOK_SHL;
@@ -3005,14 +3135,14 @@ static inline void next_nomacro1(void)
         break;
         
     case '>':
-        cinp();
-        if (ch == '=') {
-            cinp();
+        PEEKC(c, p);
+        if (c == '=') {
+            p++;
             tok = TOK_GE;
-        } else if (ch == '>') {
-            cinp();
-            if (ch == '=') {
-                cinp();
+        } else if (c == '>') {
+            PEEKC(c, p);
+            if (c == '=') {
+                p++;
                 tok = TOK_A_SAR;
             } else {
                 tok = TOK_SAR;
@@ -3022,113 +3152,82 @@ static inline void next_nomacro1(void)
         }
         break;
         
-    case '!':
-        tok = ch;
-        cinp();
-        if (ch == '=') {
-            cinp();
-            tok = TOK_NE;
-        }
-        break;
-
-    case '=':
-        tok = ch;
-        cinp();
-        if (ch == '=') {
-            cinp();
-            tok = TOK_EQ;
-        }
-        break;
-
     case '&':
-        tok = ch;
-        cinp();
-        if (ch == '&') {
-            cinp();
+        PEEKC(c, p);
+        if (c == '&') {
+            p++;
             tok = TOK_LAND;
-        } else if (ch == '=') {
-            cinp();
+        } else if (c == '=') {
+            p++;
             tok = TOK_A_AND;
+        } else {
+            tok = '&';
         }
         break;
         
     case '|':
-        tok = ch;
-        cinp();
-        if (ch == '|') {
-            cinp();
+        PEEKC(c, p);
+        if (c == '|') {
+            p++;
             tok = TOK_LOR;
-        } else if (ch == '=') {
-            cinp();
+        } else if (c == '=') {
+            p++;
             tok = TOK_A_OR;
+        } else {
+            tok = '|';
         }
         break;
 
     case '+':
-        tok = ch;
-        cinp();
-        if (ch == '+') {
-            cinp();
+        PEEKC(c, p);
+        if (c == '+') {
+            p++;
             tok = TOK_INC;
-        } else if (ch == '=') {
-            cinp();
+        } else if (c == '=') {
+            p++;
             tok = TOK_A_ADD;
+        } else {
+            tok = '+';
         }
         break;
         
     case '-':
-        tok = ch;
-        cinp();
-        if (ch == '-') {
-            cinp();
+        PEEKC(c, p);
+        if (c == '-') {
+            p++;
             tok = TOK_DEC;
-        } else if (ch == '=') {
-            cinp();
+        } else if (c == '=') {
+            p++;
             tok = TOK_A_SUB;
-        } else if (ch == '>') {
-            cinp();
+        } else if (c == '>') {
+            p++;
             tok = TOK_ARROW;
+        } else {
+            tok = '-';
         }
         break;
 
-    case '*':
-        tok = ch;
-        cinp();
-        if (ch == '=') {
-            cinp();
-            tok = TOK_A_MUL;
-        }
-        break;
-
-    case '%':
-        tok = ch;
-        cinp();
-        if (ch == '=') {
-            cinp();
-            tok = TOK_A_MOD;
-        }
-        break;
+    PARSE2('!', '!', '=', TOK_NE)
+    PARSE2('=', '=', '=', TOK_EQ)
+    PARSE2('*', '*', '=', TOK_A_MUL)
+    PARSE2('%', '%', '=', TOK_A_MOD)
+    PARSE2('^', '^', '=', TOK_A_XOR)
         
-    case '^':
-        tok = ch;
-        cinp();
-        if (ch == '=') {
-            cinp();
-            tok = TOK_A_XOR;
-        }
-        break;
-
         /* comments or operator */
     case '/':
-        minp();
-        if (ch == '*') {
+        PEEKC(c, p);
+        if (c == '*') {
+            file->buf_ptr = p;
             parse_comment();
+            p = file->buf_ptr;
             goto redo_no_start;
-        } else if (ch == '/') {
+        } else if (c == '/') {
+            file->buf_ptr = p;
             parse_line_comment();
+            p = file->buf_ptr;
             goto redo_no_start;
-        } else if (ch == '=') {
-            cinp();
+        } else if (c == '=') {
+            p++;
             tok = TOK_A_DIV;
         } else {
             tok = '/';
@@ -3147,13 +3246,14 @@ static inline void next_nomacro1(void)
     case ':':
     case '?':
     case '~':
-        tok = ch;
-        cinp();
+        tok = c;
+        p++;
         break;
     default:
-        error("unrecognized character \\x%02x", ch);
+        error("unrecognized character \\x%02x", c);
         break;
     }
+    file->buf_ptr = p;
     tok_flags = 0;
 #if defined(PARSE_DEBUG)
     printf("token = %s\n", get_tok_str(tok, &tokc));
@@ -3427,6 +3527,7 @@ static int macro_subst_tok(TokenString *tok_str,
                 t = *macro_ptr;
             } else {
                 /* XXX: incorrect with comments */
+                ch = file->buf_ptr[0];
                 while (is_space(ch) || ch == '\n')
                     cinp();
                 t = ch;
diff --git a/tccelf.c b/tccelf.c
index 430c396..33aeac1 100644
--- a/tccelf.c
+++ b/tccelf.c
@@ -1891,7 +1891,7 @@ static int tcc_load_ldscript(TCCState *s1)
     int t;
     
     ch = file->buf_ptr[0];
-    handle_eob();
+    ch = handle_eob();
     for(;;) {
         t = ld_next(s1, cmd, sizeof(cmd));
         if (t == LD_TOK_EOF)