From 51bf3f2585e60b2b11dd57a7529eb3c37c4b880d Mon Sep 17 00:00:00 2001 From: Olly Betts Date: Tue, 24 Apr 2018 12:27:13 +1200 Subject: [PATCH] Sync changes from latest snowball compiler version * Avoid comparing with uninitialised array element in compiler. * Fix GCC7 misleading indentation and switch case fall-through warnings. * Fix // comments in snowball compiler code for C90 compatibility. * Support for {U+1234} notation to specify Unicode codepoints. --- xapian-core/languages/Makefile.mk | 2 +- xapian-core/languages/compiler/analyser.c | 177 +++++++++++++++++++--------- xapian-core/languages/compiler/driver.c | 183 +++++++++++++++++++++++------ xapian-core/languages/compiler/generator.c | 84 +++++++------ xapian-core/languages/compiler/header.h | 52 ++++++-- xapian-core/languages/compiler/tokeniser.c | 96 +++++++++++++-- 6 files changed, 452 insertions(+), 142 deletions(-) diff --git a/xapian-core/languages/Makefile.mk b/xapian-core/languages/Makefile.mk index d44c63c3e..6b3936b9f 100644 --- a/xapian-core/languages/Makefile.mk +++ b/xapian-core/languages/Makefile.mk @@ -85,7 +85,7 @@ $(snowball_built_sources): languages/snowball $(snowball_algorithms) languages/snowball: $(snowball_sources) $(snowball_headers) $(CC_FOR_BUILD) -o languages/snowball \ - -DDISABLE_JAVA -DDISABLE_JSX -DDISABLE_PYTHON \ + -DDISABLE_CSHARP -DDISABLE_GO -DDISABLE_JAVA -DDISABLE_JS -DDISABLE_PYTHON -DDISABLE_RUST \ `for f in $(snowball_sources) ; do test -f $$f && echo $$f || echo $(srcdir)/$$f ; done` .sbl.cc: diff --git a/xapian-core/languages/compiler/analyser.c b/xapian-core/languages/compiler/analyser.c index 35c8b4462..c7fd3f83e 100644 --- a/xapian-core/languages/compiler/analyser.c +++ b/xapian-core/languages/compiler/analyser.c @@ -9,7 +9,7 @@ typedef enum { e_unexpected_token = 1, e_string_omitted = 2, e_unexpected_token_in_among = 3, - // For codes above here, report "after " t->previous_token after the error. + /* For codes above here, report "after " t->previous_token after the error. */ e_unresolved_substring = 14, e_not_allowed_inside_reverse = 15, e_empty_grouping = 16, @@ -17,7 +17,7 @@ typedef enum { e_empty_among = 18, e_adjacent_bracketed_in_among = 19, e_substring_preceded_by_substring = 20, - // For codes below here, tokeniser->b is printed before the error. + /* For codes below here, tokeniser->b is printed before the error. */ e_redeclared = 30, e_undeclared = 31, e_declared_as_different_mode = 32, @@ -35,8 +35,6 @@ static struct node * read_C(struct analyser * a); static struct node * C_style(struct analyser * a, const char * s, int token); -static void fault(int n) { fprintf(stderr, "fault %d\n", n); exit(1); } - static void print_node_(struct node * p, int n, const char * s) { int i; @@ -76,22 +74,37 @@ static struct node * new_node(struct analyser * a, int type) { static const char * name_of_mode(int n) { switch (n) { - default: fault(0); - case m_backward: return "string backward"; - case m_forward: return "string forward"; - /* case m_integer: return "integer"; */ + case m_backward: return "string backward"; + case m_forward: return "string forward"; + /* case m_integer: return "integer"; */ } + fprintf(stderr, "Invalid mode %d in name_of_mode()\n", n); + exit(1); } static const char * name_of_type(int n) { switch (n) { - default: fault(1); - case 's': return "string"; - case 'i': return "integer"; - case 'r': return "routine"; - case 'R': return "routine or grouping"; - case 'g': return "grouping"; + case 's': return "string"; + case 'i': return "integer"; + case 'r': return "routine"; + case 'R': return "routine or grouping"; + case 'g': return "grouping"; + } + fprintf(stderr, "Invalid type %d in name_of_type()\n", n); + exit(1); +} + +static const char * name_of_name_type(int code) { + switch (code) { + case t_string: return "string"; + case t_boolean: return "boolean"; + case t_integer: return "integer"; + case t_routine: return "routine"; + case t_external: return "external"; + case t_grouping: return "grouping"; } + fprintf(stderr, "Invalid type code %d in name_of_name_type()\n", code); + exit(1); } static void count_error(struct analyser * a) { @@ -110,6 +123,7 @@ static void error2(struct analyser * a, error_code n, int x) { fprintf(stderr, "%s omitted", name_of_token(t->omission)); break; case e_unexpected_token_in_among: fprintf(stderr, "in among(...), "); + /* fall through */ case e_unexpected_token: fprintf(stderr, "unexpected %s", name_of_token(t->token)); if (t->token == c_number) fprintf(stderr, " %d", t->number); @@ -228,13 +242,24 @@ static void check_routine_mode(struct analyser * a, struct name * p, int mode) { static void check_name_type(struct analyser * a, struct name * p, int type) { switch (type) { - case 's': if (p->type == t_string) return; break; - case 'i': if (p->type == t_integer) return; break; - case 'b': if (p->type == t_boolean) return; break; - case 'R': if (p->type == t_grouping) return; - case 'r': if (p->type == t_routine || - p->type == t_external) return; break; - case 'g': if (p->type == t_grouping) return; break; + case 's': + if (p->type == t_string) return; + break; + case 'i': + if (p->type == t_integer) return; + break; + case 'b': + if (p->type == t_boolean) return; + break; + case 'R': + if (p->type == t_grouping) return; + /* FALLTHRU */ + case 'r': + if (p->type == t_routine || p->type == t_external) return; + break; + case 'g': + if (p->type == t_grouping) return; + break; } error2(a, e_not_of_type_x, type); } @@ -281,7 +306,8 @@ handle_as_name: p->local_to = 0; p->grouping = 0; p->definition = 0; - a->name_count[type] ++; + p->declaration_line_number = t->line_number; + a->name_count[type]++; p->next = a->names; a->names = p; if (token != c_name) { @@ -358,8 +384,18 @@ static struct node * read_AE(struct analyser * a, int B) { struct node * q; switch (read_token(t)) { case c_minus: /* monadic */ + q = read_AE(a, 100); + if (q->type == c_neg) { + /* Optimise away double negation, which avoids generators + * having to worry about generating "--" (decrement operator + * in many languages). + */ + p = q->right; + /* Don't free q, it's in the linked list a->nodes. */ + break; + } p = new_node(a, c_neg); - p->right = read_AE(a, 100); + p->right = q; break; case c_bra: p = read_AE(a, 0); @@ -372,6 +408,7 @@ static struct node * read_AE(struct analyser * a, int B) { case c_maxint: case c_minint: a->int_limits_used = true; + /* fall through */ case c_cursor: case c_limit: case c_len: @@ -617,6 +654,8 @@ static struct node * read_among(struct analyser * a) { q = read_C_list(a); break; default: error(a, e_unexpected_token_in_among); + previous_token = token; + continue; case c_ket: if (p->number == 0) error(a, e_empty_among); if (t->error_count == 0) make_among(a, p, substring); @@ -688,6 +727,7 @@ static struct node * read_C(struct analyser * a) { return C_style(a, "A", token); case c_delete: check_modifyable(a); + /* fall through */ case c_next: case c_tolimit: case c_atlimit: @@ -722,7 +762,12 @@ static struct node * read_C(struct analyser * a) { switch (q ? q->type : t_string) /* above line was: switch (q->type) - bug #1 fix 7/2/2003 */ { - default: error(a, e_not_of_type_string_or_integer); + default: + error(a, e_not_of_type_string_or_integer); + /* Handle $foo for unknown 'foo' as string since + * that's more common and so less likely to cause + * an error avalanche. */ + /* fall through */ case t_string: a->mode = m_forward; a->modifyable = true; @@ -824,10 +869,11 @@ static void read_define_grouping(struct analyser * a, struct name * q) { NEW(grouping, p); if (a->groupings == 0) a->groupings = p; else a->groupings_end->next = p; a->groupings_end = p; - q->grouping = p; + if (q) q->grouping = p; p->next = 0; p->name = q; - p->number = q->count; + p->number = q ? q->count : 0; + p->line_number = a->tokeniser->line_number; p->b = create_b(0); while (true) { switch (read_token(t)) { @@ -841,7 +887,7 @@ static void read_define_grouping(struct analyser * a, struct name * q) { } break; case c_literalstring: - p->b = alter_grouping(p->b, t->b, style, a->utf8); + p->b = alter_grouping(p->b, t->b, style, (a->encoding == ENC_UTF8)); break; default: error(a, e_unexpected_token); return; } @@ -885,8 +931,8 @@ static void read_define_routine(struct analyser * a, struct name * q) { if (q) q->definition = p->left; if (a->substring != 0) { - error2(a, e_unresolved_substring, a->substring->line_number); - a->substring = 0; + error2(a, e_unresolved_substring, a->substring->line_number); + a->substring = 0; } p->amongvar_needed = a->amongvar_needed; } @@ -894,8 +940,26 @@ static void read_define_routine(struct analyser * a, struct name * q) { static void read_define(struct analyser * a) { if (get_token(a, c_name)) { struct name * q = find_name(a); - if (q != 0 && q->type == t_grouping) read_define_grouping(a, q); - else read_define_routine(a, q); + int type; + if (q) { + type = q->type; + } else { + /* No declaration, so sniff next token - if it is 'as' then parse + * as a routine, otherwise as a grouping. + */ + if (read_token(a->tokeniser) == c_as) { + type = t_routine; + } else { + type = t_grouping; + } + a->tokeniser->token_held = true; + } + + if (type == t_grouping) { + read_define_grouping(a, q); + } else { + read_define_routine(a, q); + } } } @@ -923,6 +987,7 @@ static void read_program_(struct analyser * a, int terminator) { case c_backwardmode:read_backwardmode(a); break; case c_ket: if (terminator == c_ket) return; + /* fall through */ default: error(a, e_unexpected_token); break; case -1: @@ -939,9 +1004,11 @@ extern void read_program(struct analyser * a) { while (q) { switch (q->type) { case t_external: case t_routine: - if (q->used && q->definition == 0) error4(a, q); break; + if (q->used && q->definition == 0) error4(a, q); + break; case t_grouping: - if (q->used && q->grouping == 0) error4(a, q); break; + if (q->used && q->grouping == 0) error4(a, q); + break; } q = q->next; } @@ -949,33 +1016,37 @@ extern void read_program(struct analyser * a) { if (a->tokeniser->error_count == 0) { struct name * q = a->names; - int warned = false; while (q) { if (!q->referenced) { - if (!warned) { - fprintf(stderr, "Declared but not used:"); - warned = true; + fprintf(stderr, "%s:%d: warning: %s '", + a->tokeniser->file, + q->declaration_line_number, + name_of_name_type(q->type)); + report_b(stderr, q->b); + if (q->type == t_routine || + q->type == t_external || + q->type == t_grouping) { + fprintf(stderr, "' declared but not defined\n"); + } else { + fprintf(stderr, "' defined but not used\n"); } - fprintf(stderr, " "); report_b(stderr, q->b); - } - q = q->next; - } - if (warned) fprintf(stderr, "\n"); - - q = a->names; - warned = false; - while (q) { - if (! q->used && (q->type == t_routine || - q->type == t_grouping)) { - if (!warned) { - fprintf(stderr, "Declared and defined but not used:"); - warned = true; + } else if (!q->used && + (q->type == t_routine || q->type == t_grouping)) { + int line_num; + if (q->type == t_routine) { + line_num = q->definition->line_number; + } else { + line_num = q->grouping->line_number; } - fprintf(stderr, " "); report_b(stderr, q->b); + fprintf(stderr, "%s:%d: warning: %s '", + a->tokeniser->file, + line_num, + name_of_name_type(q->type)); + report_b(stderr, q->b); + fprintf(stderr, "' defined but not used\n"); } q = q->next; } - if (warned) fprintf(stderr, "\n"); } } diff --git a/xapian-core/languages/compiler/driver.c b/xapian-core/languages/compiler/driver.c index e70ae62df..123306d1d 100644 --- a/xapian-core/languages/compiler/driver.c +++ b/xapian-core/languages/compiler/driver.c @@ -7,24 +7,40 @@ #define DEFAULT_BASE_CLASS "org.tartarus.snowball.SnowballProgram" #define DEFAULT_AMONG_CLASS "org.tartarus.snowball.Among" #define DEFAULT_STRING_CLASS "java.lang.StringBuilder" +#define DEFAULT_GO_PACKAGE "snowball" +#define DEFAULT_GO_SNOWBALL_RUNTIME "github.com/snowballstem/snowball/go" + +#define DEFAULT_CS_NAMESPACE "Snowball" +#define DEFAULT_CS_BASE_CLASS "Stemmer" +#define DEFAULT_CS_AMONG_CLASS "Among" +#define DEFAULT_CS_STRING_CLASS "StringBuilder" static int eq(const char * s1, const char * s2) { return strcmp(s1, s2) == 0; } static void print_arglist(void) { - fprintf(stderr, "Usage: snowball [options]\n\n" + fprintf(stderr, "Usage: snowball ... [options]\n\n" "options are: [-o[utput] file]\n" " [-s[yntax]]\n" #ifndef DISABLE_JAVA " [-j[ava]]\n" #endif +#ifndef DISABLE_CSHARP + " [-cs[harp]]\n" +#endif " [-c++]\n" #ifndef DISABLE_PYTHON " [-py[thon]]\n" #endif -#ifndef DISABLE_JSX - " [-jsx]\n" +#ifndef DISABLE_JS + " [-js]\n" +#endif +#ifndef DISABLE_RUST + " [-rust]\n" +#endif +#ifndef DISABLE_GO + " [-go]\n" #endif " [-w[idechars]]\n" " [-u[tf8]]\n" @@ -34,11 +50,15 @@ static void print_arglist(void) { " [-i[nclude] directory]\n" " [-r[untime] path to runtime headers]\n" " [-p[arentclassname] fully qualified parent class name]\n" -#ifndef DISABLE_JAVA +#if !defined(DISABLE_JAVA) || !defined(DISABLE_CSHARP) " [-P[ackage] package name for stemmers]\n" " [-S[tringclass] StringBuffer-compatible class]\n" " [-a[mongclass] fully qualified name of the Among class]\n" #endif +#ifndef DISABLE_GO + " [-gop[ackage] Go package name for stemmers]\n" + " [-gor[untime] Go snowball runtime package]\n" +#endif ); exit(1); } @@ -61,9 +81,10 @@ static FILE * get_output(symbol * b) { return output; } -static void read_options(struct options * o, int argc, char * argv[]) { +static int read_options(struct options * o, int argc, char * argv[]) { char * s; - int i = 2; + int i = 1; + int new_argc = 1; /* set defaults: */ @@ -73,24 +94,30 @@ static void read_options(struct options * o, int argc, char * argv[]) { o->variables_prefix = 0; o->runtime_path = 0; o->parent_class_name = DEFAULT_BASE_CLASS; -#ifndef DISABLE_JAVA o->string_class = DEFAULT_STRING_CLASS; o->among_class = DEFAULT_AMONG_CLASS; o->package = DEFAULT_PACKAGE; -#endif + o->go_package = DEFAULT_GO_PACKAGE; + o->go_snowball_runtime = DEFAULT_GO_SNOWBALL_RUNTIME; o->name = ""; o->make_lang = LANG_C; - o->widechars = false; o->includes = 0; o->includes_end = 0; - o->utf8 = false; + o->encoding = ENC_SINGLEBYTE; /* read options: */ while (i < argc) { s = argv[i++]; - { if (eq(s, "-o") || eq(s, "-output")) { - check_lim(i, argc); + if (s[0] != '-') { + /* Non-option argument - shuffle down. */ + argv[new_argc++] = s; + continue; + } + + { + if (eq(s, "-o") || eq(s, "-output")) { + check_lim(i, argc); o->output_file = argv[i++]; continue; } @@ -99,17 +126,42 @@ static void read_options(struct options * o, int argc, char * argv[]) { o->name = argv[i++]; continue; } -#ifndef DISABLE_JSX - if (eq(s, "-jsx")) { - o->make_lang = LANG_JSX; - o->widechars = true; +#ifndef DISABLE_JS + if (eq(s, "-js")) { + o->make_lang = LANG_JAVASCRIPT; + o->encoding = ENC_WIDECHARS; + continue; + } +#endif +#ifndef DISABLE_RUST + if (eq(s, "-rust")) { + o->make_lang = LANG_RUST; + o->encoding = ENC_UTF8; + continue; + } +#endif +#ifndef DISABLE_GO + if (eq(s, "-go")) { + o->make_lang = LANG_GO; + o->encoding = ENC_UTF8; continue; } #endif #ifndef DISABLE_JAVA if (eq(s, "-j") || eq(s, "-java")) { o->make_lang = LANG_JAVA; - o->widechars = true; + o->encoding = ENC_WIDECHARS; + continue; + } +#endif +#ifndef DISABLE_CSHARP + if (eq(s, "-cs") || eq(s, "-csharp")) { + o->make_lang = LANG_CSHARP; + o->encoding = ENC_WIDECHARS; + o->parent_class_name = DEFAULT_CS_BASE_CLASS; + o->string_class = DEFAULT_CS_STRING_CLASS; + o->among_class = DEFAULT_CS_AMONG_CLASS; + o->package = DEFAULT_CS_NAMESPACE; continue; } #endif @@ -120,13 +172,12 @@ static void read_options(struct options * o, int argc, char * argv[]) { #ifndef DISABLE_PYTHON if (eq(s, "-py") || eq(s, "-python")) { o->make_lang = LANG_PYTHON; - o->widechars = true; + o->encoding = ENC_WIDECHARS; continue; } #endif if (eq(s, "-w") || eq(s, "-widechars")) { - o->widechars = true; - o->utf8 = false; + o->encoding = ENC_WIDECHARS; continue; } if (eq(s, "-s") || eq(s, "-syntax")) { @@ -164,8 +215,7 @@ static void read_options(struct options * o, int argc, char * argv[]) { continue; } if (eq(s, "-u") || eq(s, "-utf8")) { - o->utf8 = true; - o->widechars = false; + o->encoding = ENC_UTF8; continue; } if (eq(s, "-p") || eq(s, "-parentclassname")) { @@ -173,7 +223,7 @@ static void read_options(struct options * o, int argc, char * argv[]) { o->parent_class_name = argv[i++]; continue; } -#ifndef DISABLE_JAVA +#if !defined(DISABLE_JAVA) || !defined(DISABLE_CSHARP) if (eq(s, "-P") || eq(s, "-Package")) { check_lim(i, argc); o->package = argv[i++]; @@ -190,10 +240,27 @@ static void read_options(struct options * o, int argc, char * argv[]) { continue; } #endif +#ifndef DISABLE_GO + if (eq(s, "-gop") || eq(s, "-gopackage")) { + check_lim(i, argc); + o->go_package = argv[i++]; + continue; + } + if (eq(s, "-gor") || eq(s, "-goruntime")) { + check_lim(i, argc); + o->go_snowball_runtime = argv[i++]; + continue; + } +#endif fprintf(stderr, "'%s' misplaced\n", s); print_arglist(); } } + if (new_argc == 1) { + fprintf(stderr, "no source files specified\n"); + print_arglist(); + } + argv[new_argc] = NULL; if (o->make_lang != LANG_C && o->make_lang != LANG_CPLUSPLUS) { if (o->runtime_path) { @@ -204,17 +271,19 @@ static void read_options(struct options * o, int argc, char * argv[]) { } } if (!o->externals_prefix) o->externals_prefix = ""; + return new_argc; } extern int main(int argc, char * argv[]) { + int i; NEW(options, o); - if (argc == 1) print_arglist(); - read_options(o, argc, argv); + argc = read_options(o, argc, argv); { symbol * filename = add_s_to_b(0, argv[1]); char * file; symbol * u = get_input(filename, &file); + lose_b(filename); if (u == 0) { fprintf(stderr, "Can't open input %s\n", argv[1]); exit(1); @@ -222,9 +291,29 @@ extern int main(int argc, char * argv[]) { { struct tokeniser * t = create_tokeniser(u, file); struct analyser * a = create_analyser(t); - t->widechars = o->widechars; + struct input ** next_input_ptr = &(t->next); + a->encoding = t->encoding = o->encoding; t->includes = o->includes; - a->utf8 = t->utf8 = o->utf8; + /* If multiple source files are specified, set up the others to be + * read after the first in order, using the same mechanism as + * 'get' uses. */ + for (i = 2; i != argc; ++i) { + NEW(input, q); + filename = add_s_to_b(0, argv[i]); + u = get_input(filename, &file); + lose_b(filename); + if (u == 0) { + fprintf(stderr, "Can't open input %s\n", argv[i]); + exit(1); + } + q->p = u; + q->c = 0; + q->file = file; + q->line_number = 1; + *next_input_ptr = q; + next_input_ptr = &(q->next); + } + *next_input_ptr = NULL; read_program(a); if (t->error_count > 0) exit(1); if (o->syntax_tree) print_program(a); @@ -274,13 +363,43 @@ extern int main(int argc, char * argv[]) { fclose(o->output_src); } #endif -#ifndef DISABLE_JSX - if (o->make_lang == LANG_JSX) { +#ifndef DISABLE_JS + if (o->make_lang == LANG_JAVASCRIPT) { symbol * b = add_s_to_b(0, s); - b = add_s_to_b(b, ".jsx"); + b = add_s_to_b(b, ".js"); o->output_src = get_output(b); lose_b(b); - generate_program_jsx(g); + generate_program_js(g); + fclose(o->output_src); + } +#endif +#ifndef DISABLE_CSHARP + if (o->make_lang == LANG_CSHARP) { + symbol * b = add_s_to_b(0, s); + b = add_s_to_b(b, ".cs"); + o->output_src = get_output(b); + lose_b(b); + generate_program_csharp(g); + fclose(o->output_src); + } +#endif +#ifndef DISABLE_RUST + if (o->make_lang == LANG_RUST) { + symbol * b = add_s_to_b(0, s); + b = add_s_to_b(b, ".rs"); + o->output_src = get_output(b); + lose_b(b); + generate_program_rust(g); + fclose(o->output_src); + } +#endif +#ifndef DISABLE_GO + if (o->make_lang == LANG_GO) { + symbol * b = add_s_to_b(0, s); + b = add_s_to_b(b, ".go"); + o->output_src = get_output(b); + lose_b(b); + generate_program_go(g); fclose(o->output_src); } #endif @@ -289,7 +408,6 @@ extern int main(int argc, char * argv[]) { close_analyser(a); } lose_b(u); - lose_b(filename); } { struct include * p = o->includes; while (p) { @@ -301,4 +419,3 @@ extern int main(int argc, char * argv[]) { if (space_count) fprintf(stderr, "%d blocks unfreed\n", space_count); return 0; } - diff --git a/xapian-core/languages/compiler/generator.c b/xapian-core/languages/compiler/generator.c index 6f233b53e..1921cb007 100644 --- a/xapian-core/languages/compiler/generator.c +++ b/xapian-core/languages/compiler/generator.c @@ -83,11 +83,10 @@ static void write_hex(struct generator * g, int i) { static void wlitch(struct generator * g, int ch) { if (32 <= ch && ch < 127) { write_char(g, '\''); - switch (ch) { - case '\'': - case '\\': write_char(g, '\\'); - default: write_char(g, ch); + if (ch == '\'' || ch == '\\') { + write_char(g, '\\'); } + write_char(g, ch); write_char(g, '\''); } else { write_string(g, "0x"); write_hex(g, ch); @@ -175,13 +174,29 @@ static void write_block_end(struct generator * g) { /* block end */ static void w(struct generator * g, const char * s); -static void wk(struct generator * g, struct node * p) { /* keep c */ +/* keep c */ +static void wk(struct generator * g, struct node * p, int keep_limit) { ++g->keep_count; if (p->mode == m_forward) { - write_string(g, "int c"); write_int(g, g->keep_count); w(g, " = ~zc;"); + write_string(g, "int c"); + write_int(g, g->keep_count); + w(g, " = ~zc"); + if (keep_limit) { + write_string(g, ", mlimit"); + write_int(g, g->keep_count); + } + write_char(g, ';'); } else { - write_string(g, "int m"); write_int(g, g->keep_count); w(g, " = ~zl - ~zc; /*(void)m"); - write_int(g, g->keep_count); write_string(g, "*/;"); + write_string(g, "int m"); + write_int(g, g->keep_count); + w(g, " = ~zl - ~zc"); + if (keep_limit) { + write_string(g, ", mlimit"); + write_int(g, g->keep_count); + } + write_string(g, "; (void)m"); + write_int(g, g->keep_count); + write_char(g, ';'); } } @@ -267,6 +282,7 @@ static void write_data_address(struct generator * g, struct node * p) { static void writef(struct generator * g, const char * input, struct node * p) { int i = 0; int l = strlen(input); + while (i < l) { int ch = input[i++]; if (ch != '~') { @@ -276,7 +292,8 @@ static void writef(struct generator * g, const char * input, struct node * p) { switch (input[i++]) { default: write_char(g, input[i - 1]); continue; case 'C': write_comment(g, p); continue; - case 'k': wk(g, p); continue; + case 'k': wk(g, p, false); continue; + case 'K': wk(g, p, true); continue; case 'i': winc(g, p); continue; case 'l': write_check_limit(g, p); continue; case 'f': write_failure(g, p); continue; @@ -316,7 +333,7 @@ static void w(struct generator * g, const char * s) { } static void generate_AE(struct generator * g, struct node * p) { - char * s; + const char * s; switch (p->type) { case c_name: write_varref(g, p->name); break; @@ -344,7 +361,7 @@ static void generate_AE(struct generator * g, struct node * p) { case c_limit: w(g, p->mode == m_forward ? "~zl" : "~zlb"); break; case c_len: - if (g->options->utf8) { + if (g->options->encoding == ENC_UTF8) { w(g, "len_utf8(~zp)"); break; } @@ -353,7 +370,7 @@ static void generate_AE(struct generator * g, struct node * p) { w(g, "SIZE(~zp)"); break; case c_lenof: - if (g->options->utf8) { + if (g->options->encoding == ENC_UTF8) { g->V[0] = p->name; w(g, "len_utf8(~V0)"); break; @@ -367,11 +384,11 @@ static void generate_AE(struct generator * g, struct node * p) { } /* K_needed() tests to see if we really need to keep c. Not true when the - the command does not touch the cursor. This and repeat_score() could be + command does not touch the cursor. This and repeat_score() could be elaborated almost indefinitely. */ -static int K_needed(struct generator * g, struct node * p) { +extern int K_needed(struct generator * g, struct node * p) { while (p) { switch (p->type) { case c_dollar: @@ -460,7 +477,7 @@ static int repeat_score(struct generator * g, struct node * p) { /* tests if an expression requires cursor reinstatement in a repeat */ -static int repeat_restore(struct generator * g, struct node * p) { +extern int repeat_restore(struct generator * g, struct node * p) { return repeat_score(g, p) >= 2; } @@ -531,7 +548,7 @@ static void generate_or(struct generator * g, struct node * p) { static void generate_backwards(struct generator * g, struct node * p) { - writef(g,"~M~zlb = ~zc; ~zc = ~zl;~C~N", p); + writef(g, "~M~zlb = ~zc; ~zc = ~zl;~C~N", p); generate(g, p->left); w(g, "~M~zc = ~zlb;~N"); } @@ -654,7 +671,7 @@ static void generate_do(struct generator * g, struct node * p) { } static void generate_next(struct generator * g, struct node * p) { - if (g->options->utf8) { + if (g->options->encoding == ENC_UTF8) { if (p->mode == m_forward) w(g, "~{int ret = skip_utf8(~zp, ~zc, 0, ~zl, 1"); else @@ -673,7 +690,7 @@ static void generate_GO_grouping(struct generator * g, struct node * p, int is_g struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "_b"; g->S[1] = complement ? "in" : "out"; - g->S[2] = g->options->utf8 ? "_U" : ""; + g->S[2] = g->options->encoding == ENC_UTF8 ? "_U" : ""; g->V[0] = p->name; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; @@ -817,7 +834,7 @@ static void generate_hop(struct generator * g, struct node * p) { g->S[0] = p->mode == m_forward ? "+" : "-"; g->S[1] = p->mode == m_forward ? "0" : (g->options->make_lang == LANG_C ? "z->lb" : "lb"); - if (g->options->utf8) { + if (g->options->encoding == ENC_UTF8) { w(g, "~{int ret = skip_utf8(~zp, ~zc, ~S1, ~zl, ~S0 "); generate_AE(g, p->AE); writef(g, ");~C", p); writef(g, "~Mif (ret < 0) ~f~N", p); @@ -919,11 +936,8 @@ static void generate_slicefrom(struct generator * g, struct node * p) { static void generate_setlimit(struct generator * g, struct node * p) { int keep_c; - writef(g, "~{~k~C", p); + writef(g, "~{~K~C", p); keep_c = g->keep_count; - w(g, "~Mint mlimit"); - write_int(g, keep_c); - w(g, ";~N"); generate(g, p->left); w(g, "~Mmlimit"); @@ -941,6 +955,8 @@ static void generate_setlimit(struct generator * g, struct node * p) { static const char * vars[] = { "p", "c", "l", "lb", "bra", "ket", NULL }; +/* dollar sets snowball up to operate on a string variable as if it were the + * current string */ static void generate_dollar(struct generator * g, struct node * p) { int used = g->label_used; @@ -1027,7 +1043,7 @@ static void generate_grouping(struct generator * g, struct node * p, int complem struct grouping * q = p->name->grouping; g->S[0] = p->mode == m_forward ? "" : "_b"; g->S[1] = complement ? "out" : "in"; - g->S[2] = g->options->utf8 ? "_U" : ""; + g->S[2] = g->options->encoding == ENC_UTF8 ? "_U" : ""; g->V[0] = p->name; g->I[0] = q->smallest_ch; g->I[1] = q->largest_ch; @@ -1049,7 +1065,7 @@ static void generate_literalstring(struct generator * g, struct node * p) { * function call. In UTF-8 mode, only do this for the ASCII subset, * since multi-byte characters are more complex to text against. */ - if (g->options->utf8 && *b >= 128) { + if (g->options->encoding == ENC_UTF8 && *b >= 128) { printf("single byte %d\n", *b); exit(1); } @@ -1144,7 +1160,7 @@ static void generate_substring(struct generator * g, struct node * p) { if (n_cases > 2) break; } if (block == -1) { - if (ch == cases[0]) continue; + if (n_cases > 0 && ch == cases[0]) continue; if (n_cases < 2) { cases[n_cases++] = ch; } else if (ch != cases[1]) { @@ -1248,7 +1264,7 @@ static void generate_among(struct generator * g, struct node * p) { if (x->substring == 0) generate_substring(g, p); if (x->command_count == 0 && x->starter == 0) return; - if (x->starter) generate(g, x->starter); + if (x->starter != 0) generate(g, x->starter); writef(g, "~Mswitch (among_var) {~C~+" "~Mcase 0: ~f~N", p); @@ -1586,16 +1602,14 @@ static void generate_grouping_table(struct generator * g, struct grouping * q) { for (i = 0; i < SIZE(b); i++) set_bit(map, b[i] - q->smallest_ch); - { - g->V[0] = q->name; + g->V[0] = q->name; - w(g, "static const unsigned char ~V0[] = { "); - for (i = 0; i < size; i++) { - write_int(g, map[i]); - if (i < size - 1) w(g, ", "); - } - w(g, " };~N~N"); + w(g, "static const unsigned char ~V0[] = { "); + for (i = 0; i < size; i++) { + write_int(g, map[i]); + if (i < size - 1) w(g, ", "); } + w(g, " };~N~N"); lose_b(map); } diff --git a/xapian-core/languages/compiler/header.h b/xapian-core/languages/compiler/header.h index 9b30805fb..aa1090d04 100644 --- a/xapian-core/languages/compiler/header.h +++ b/xapian-core/languages/compiler/header.h @@ -1,3 +1,4 @@ +#include typedef unsigned char byte; typedef unsigned short symbol; @@ -47,6 +48,8 @@ extern int get_utf8(const symbol * p, int * slot); extern int put_utf8(int ch, symbol * p); extern void output_str(FILE * outfile, struct str * str); +typedef enum { ENC_SINGLEBYTE, ENC_UTF8, ENC_WIDECHARS } enc; + struct m_pair { struct m_pair * next; @@ -89,6 +92,12 @@ enum token_codes { NUM_TOKEN_CODES }; +enum uplus_modes { + UPLUS_NONE, + UPLUS_DEFINED, + UPLUS_UNICODE +}; + /* struct input must be a prefix of struct tokeniser. */ struct tokeniser { @@ -108,12 +117,18 @@ struct tokeniser { int token; int previous_token; byte token_held; - byte widechars; - byte utf8; + enc encoding; int omission; struct include * includes; + /* Mode in which U+ has been used: + * UPLUS_NONE - not used yet + * UPLUS_DEFINED - stringdef U+xxxx .... + * UPLUS_UNICODE - {U+xxxx} used with implicit meaning + */ + int uplusmode; + char token_disabled[NUM_TOKEN_CODES]; }; @@ -144,6 +159,7 @@ struct name { byte used_in_among; /* Function used in among? */ struct node * used; /* First use, or NULL if not used */ struct name * local_to; /* Local to one routine/external */ + int declaration_line_number;/* Line number of declaration */ }; @@ -185,6 +201,7 @@ struct grouping { int largest_ch; /* character with max code */ int smallest_ch; /* character with min code */ struct name * name; /* so g->name->grouping == g */ + int line_number; }; struct node { @@ -243,7 +260,7 @@ struct analyser { struct grouping * groupings; struct grouping * groupings_end; struct node * substring; /* pending 'substring' in current routine definition */ - byte utf8; + enc encoding; byte int_limits_used; /* are maxint or minint used? */ }; @@ -278,7 +295,7 @@ struct generator { * if < 0, the negated keep_count for the limit to restore in case of * failure. */ int failure_keep_count; -#if !defined(DISABLE_JAVA) && !defined(DISABLE_JSX) && !defined(DISABLE_PYTHON) +#if !defined(DISABLE_JAVA) && !defined(DISABLE_JS) && !defined(DISABLE_PYTHON) && !defined(DISABLE_CSHARP) struct str * failure_str; /* This is used by some generators instead of failure_keep_count */ #endif @@ -309,18 +326,19 @@ struct options { FILE * output_src; FILE * output_h; byte syntax_tree; - byte widechars; - enum { LANG_JAVA, LANG_C, LANG_CPLUSPLUS, LANG_PYTHON, LANG_JSX } make_lang; + enc encoding; + enum { LANG_JAVA, LANG_C, LANG_CPLUSPLUS, LANG_CSHARP, LANG_PYTHON, LANG_JAVASCRIPT, LANG_RUST, LANG_GO } make_lang; const char * externals_prefix; const char * variables_prefix; const char * runtime_path; const char * parent_class_name; const char * package; + const char * go_package; + const char * go_snowball_runtime; const char * string_class; const char * among_class; struct include * includes; struct include * includes_end; - byte utf8; }; /* Generator functions common to several backends. */ @@ -335,6 +353,9 @@ extern void write_int(struct generator * g, int i); extern void write_b(struct generator * g, symbol * b); extern void write_str(struct generator * g, struct str * str); +extern int K_needed(struct generator * g, struct node * p); +extern int repeat_restore(struct generator * g, struct node * p); + /* Generator for C code. */ extern void generate_program_c(struct generator * g); @@ -343,11 +364,24 @@ extern void generate_program_c(struct generator * g); extern void generate_program_java(struct generator * g); #endif +#ifndef DISABLE_CSHARP +/* Generator for C# code. */ +extern void generate_program_csharp(struct generator * g); +#endif + #ifndef DISABLE_PYTHON /* Generator for Python code. */ extern void generate_program_python(struct generator * g); #endif -#ifndef DISABLE_JSX -extern void generate_program_jsx(struct generator * g); +#ifndef DISABLE_JS +extern void generate_program_js(struct generator * g); +#endif + +#ifndef DISABLE_RUST +extern void generate_program_rust(struct generator * g); +#endif + +#ifndef DISABLE_GO +extern void generate_program_go(struct generator * g); #endif diff --git a/xapian-core/languages/compiler/tokeniser.c b/xapian-core/languages/compiler/tokeniser.c index 8c7ab4064..0fcbd6d45 100644 --- a/xapian-core/languages/compiler/tokeniser.c +++ b/xapian-core/languages/compiler/tokeniser.c @@ -16,6 +16,8 @@ struct system_word { #include "syswords.h" +static int hex_to_num(int ch); + static int smaller(int a, int b) { return a < b ? a : b; } extern symbol * get_input(symbol * p, char ** p_file) { @@ -102,10 +104,13 @@ static int eq_s(struct tokeniser * t, const char * s) { static int white_space(struct tokeniser * t, int ch) { switch (ch) { - case '\n': t->line_number++; + case '\n': + t->line_number++; + /* fall through */ case '\r': case '\t': - case ' ': return true; + case ' ': + return true; } return false; } @@ -129,6 +134,7 @@ static int read_literal_string(struct tokeniser * t, int c) { if (ch == '\n') { error1(t, "string not terminated"); return c; } c++; if (ch == t->m_start) { + /* Inside insert characters. */ int c0 = c; int newlines = false; /* no newlines as yet */ int black_found = false; /* no printing chars as yet */ @@ -150,7 +156,65 @@ static int read_literal_string(struct tokeniser * t, int c) { if (q == 0) { if (n == 1 && (firstch == '\'' || firstch == t->m_start)) t->b = add_to_b(t->b, 1, p + c0); - else + else if (n >= 3 && firstch == 'U' && p[c0 + 1] == '+') { + int codepoint = 0; + int x; + if (t->uplusmode == UPLUS_DEFINED) { + /* See if found with xxxx upper-cased. */ + symbol * uc = create_b(n); + int i; + for (i = 0; i != n; ++i) { + uc[i] = toupper(p[c0 + i]); + } + q = find_in_m(t, n, uc); + lose_b(uc); + if (q != 0) { + t->b = add_to_b(t->b, SIZE(q), q); + continue; + } + error1(t, "Some U+xxxx stringdefs seen but not this one"); + } else { + t->uplusmode = UPLUS_UNICODE; + } + for (x = c0 + 2; x != c - 1; ++x) { + int hex = hex_to_num(p[x]); + if (hex < 0) { + error1(t, "Bad hex digit following U+"); + break; + } + codepoint = (codepoint << 4) | hex; + } + if (t->encoding == ENC_UTF8) { + if (codepoint < 0 || codepoint > 0x01ffff) { + error1(t, "character values exceed 0x01ffff"); + } + /* Ensure there's enough space for a max length + * UTF-8 sequence. */ + if (CAPACITY(t->b) < SIZE(t->b) + 3) { + t->b = increase_capacity(t->b, 3); + } + SIZE(t->b) += put_utf8(codepoint, t->b + SIZE(t->b)); + } else { + if (t->encoding == ENC_SINGLEBYTE) { + /* Only ISO-8859-1 is handled this way - for + * other single-byte character sets you need + * stringdef all the U+xxxx codes you use + * like - e.g.: + * + * stringdef U+0171 hex 'FB' + */ + if (codepoint < 0 || codepoint > 0xff) { + error1(t, "character values exceed 256"); + } + } else { + if (codepoint < 0 || codepoint > 0xffff) { + error1(t, "character values exceed 64K"); + } + } + symbol sym = codepoint; + t->b = add_to_b(t->b, 1, &sym); + } + } else error(t, "string macro '", n, p + c0, "' undeclared"); } else t->b = add_to_b(t->b, SIZE(q), q); @@ -241,6 +305,7 @@ static int decimal_to_num(int ch) { static int hex_to_num(int ch) { if ('0' <= ch && ch <= '9') return ch - '0'; if ('a' <= ch && ch <= 'f') return ch - 'a' + 10; + if ('A' <= ch && ch <= 'F') return ch - 'A' + 10; return -1; } @@ -261,7 +326,7 @@ static void convert_numeric_string(struct tokeniser * t, symbol * p, int base) { return; } } else { - ch = hex_to_num(tolower(ch)); + ch = hex_to_num(ch); if (ch < 0) { error1(t, "hex string contains non-hex characters"); return; @@ -270,18 +335,18 @@ static void convert_numeric_string(struct tokeniser * t, symbol * p, int base) { number = base * number + ch; c++; } - if (t->widechars || t->utf8) { - if (number < 0 || number > 0xffff) { - error1(t, "character values exceed 64K"); + if (t->encoding == ENC_SINGLEBYTE) { + if (number < 0 || number > 0xff) { + error1(t, "character values exceed 256"); return; } } else { - if (number < 0 || number > 0xff) { - error1(t, "character values exceed 256"); + if (number < 0 || number > 0xffff) { + error1(t, "character values exceed 64K"); return; } } - if (t->utf8) + if (t->encoding == ENC_UTF8) d += put_utf8(number, p + d); else p[d++] = number; @@ -340,6 +405,14 @@ extern int read_token(struct tokeniser * t) { q->name = copy_b(t->b2); q->value = copy_b(t->b); t->m_pairs = q; + if (t->uplusmode != UPLUS_DEFINED && + (SIZE(t->b2) >= 3 && t->b2[0] == 'U' && t->b2[1] == '+')) { + if (t->uplusmode == UPLUS_UNICODE) { + error1(t, "U+xxxx already used with implicit meaning"); + } else { + t->uplusmode = UPLUS_DEFINED; + } + } } } continue; @@ -391,7 +464,7 @@ extern int read_token(struct tokeniser * t) { t->get_depth--; continue; } - /* drop through */ + /* fall through */ default: t->previous_token = t->token; t->token = code; @@ -439,6 +512,7 @@ extern struct tokeniser * create_tokeniser(symbol * p, char * file) { t->token_held = false; t->token = -2; t->previous_token = -2; + t->uplusmode = UPLUS_NONE; memset(t->token_disabled, 0, sizeof(t->token_disabled)); return t; } -- 2.11.4.GIT