From 936819a1b90f2618bb3f86730189cf2895948ba0 Mon Sep 17 00:00:00 2001 From: seyko Date: Tue, 5 Apr 2016 13:05:09 +0300 Subject: [PATCH] utf8 in identifiers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implemented the same way as in pcc (pcc.ludd.ltu.se/ftp/pub/pcc-docs/pcc-utf8-ver3.pdf) We treat all chars with high bit set as alphabetic. This allows code like #include <stdio.h> int Lefèvre=2; int main() { printf("Lefèvre=%d\n",Lefèvre); return 0; } --- tccpp.c | 14 +++++++++++++- tests/tests2/83_utf8_in_identifiers.c | 9 +++++++++ tests/tests2/83_utf8_in_identifiers.expect | 2 ++ 3 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 tests/tests2/83_utf8_in_identifiers.c create mode 100644 tests/tests2/83_utf8_in_identifiers.expect diff --git a/tccpp.c b/tccpp.c index 260c9f4..4745bec 100644 --- a/tccpp.c +++ b/tccpp.c @@ -2340,6 +2340,8 @@ static inline void next_nomacro1(void) p = file->buf_ptr; redo_no_start: c = *p; + if (c & 0x80) + goto parse_ident_fast; switch(c) { case ' ': case '\t': @@ -2444,6 +2446,12 @@ maybe_newline: || (parse_flags & PARSE_FLAG_ASM_FILE)) goto parse_simple; +#if (__TINYC__ || __GNUC__) + case 'a' ... 'z': + case 'A' ... 'K': + case 'M' ... 'Z': + case '_': +#else case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': @@ -2459,6 +2467,7 @@ maybe_newline: case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z': case '_': +#endif parse_ident_fast: p1 = p; h = TOK_HASH_INIT; @@ -3364,13 +3373,16 @@ ST_FUNC void preprocess_new(void) const char *p, *r; /* init isid table */ - for(i = CH_EOF; i<256; i++) + for(i = CH_EOF; i<128; i++) isidnum_table[i - CH_EOF] = is_space(i) ? IS_SPC : isid(i) ? IS_ID : isnum(i) ? 
IS_NUM : 0; + for(i = 128; i<256; i++) + isidnum_table[i - CH_EOF] = IS_ID; + memset(hash_ident, 0, TOK_HASH_SIZE * sizeof(TokenSym *)); tok_ident = TOK_IDENT; diff --git a/tests/tests2/83_utf8_in_identifiers.c b/tests/tests2/83_utf8_in_identifiers.c new file mode 100644 index 0000000..1f86095 --- /dev/null +++ b/tests/tests2/83_utf8_in_identifiers.c @@ -0,0 +1,9 @@ +#include <stdio.h> +double привет=0.1; +int Lefèvre=2; +int main(){ + printf("привет=%g\n",привет); + printf("Lefèvre=%d\n",Lefèvre); + return 0; +} +// pcc & tcc only diff --git a/tests/tests2/83_utf8_in_identifiers.expect b/tests/tests2/83_utf8_in_identifiers.expect new file mode 100644 index 0000000..1553f5f --- /dev/null +++ b/tests/tests2/83_utf8_in_identifiers.expect @@ -0,0 +1,2 @@ +привет=0.1 +Lefèvre=2