From 6ceee0995607b09481725f44d53e506997b4a295 Mon Sep 17 00:00:00 2001 From: Olly Betts Date: Thu, 3 May 2018 16:38:15 +1200 Subject: [PATCH] Sync changes from latest snowball compiler version * Generate simpler code for special case setlimit tomark AE for C * Elide saving and restoring of c for more commands * Avoid infinite recursion in compiler for recursive routines * Warn about variables which are either never initialised or never read --- xapian-core/languages/compiler/analyser.c | 103 +++++++++++++++++++++++------ xapian-core/languages/compiler/driver.c | 62 ++++++++++++++++- xapian-core/languages/compiler/generator.c | 102 ++++++++++++++++++++++------ xapian-core/languages/compiler/header.h | 2 + 4 files changed, 230 insertions(+), 39 deletions(-) diff --git a/xapian-core/languages/compiler/analyser.c b/xapian-core/languages/compiler/analyser.c index c7fd3f83e..91e2383f4 100644 --- a/xapian-core/languages/compiler/analyser.c +++ b/xapian-core/languages/compiler/analyser.c @@ -303,6 +303,8 @@ handle_as_name: p->referenced = false; p->used_in_among = false; p->used = 0; + p->value_used = false; + p->initialised = false; p->local_to = 0; p->grouping = 0; p->definition = 0; @@ -404,6 +406,7 @@ static struct node * read_AE(struct analyser * a, int B) { case c_name: p = new_node(a, c_name); name_to_node(a, p, 'i'); + if (p->name) p->name->value_used = true; break; case c_maxint: case c_minint: @@ -719,8 +722,11 @@ static struct node * read_C(struct analyser * a) { case c_loop: case c_atleast: return C_style(a, "AC", token); - case c_setmark: - return C_style(a, "i", token); + case c_setmark: { + struct node * n = C_style(a, "i", token); + if (n->name) n->name->initialised = true; + return n; + } case c_tomark: case c_atmark: case c_hop: @@ -738,20 +744,31 @@ static struct node * read_C(struct analyser * a) { case c_debug: return C_style(a, "", token); case c_assignto: - case c_sliceto: + case c_sliceto: { + struct node *n; check_modifyable(a); - return C_style(a, "s", token); + n = C_style(a, "s", token); + if (n->name) n->name->initialised = true; + return n; + } case c_assign: case c_insert: case c_attach: - case c_slicefrom: + case c_slicefrom: { + struct node *n; check_modifyable(a); - return C_style(a, "S", token); + n = C_style(a, "S", token); + if (n->name) n->name->value_used = true; + return n; + } case c_setlimit: return C_style(a, "CfD", token); case c_set: - case c_unset: - return C_style(a, "b", token); + case c_unset: { + struct node * n = C_style(a, "b", token); + if (n->name) n->name->initialised = true; + return n; + } case c_dollar: get_token(a, c_name); { @@ -769,6 +786,11 @@ static struct node * read_C(struct analyser * a) { * an error avalanche. */ /* fall through */ case t_string: + /* Assume for now that $ on string both initialises and + * uses the string variable. FIXME: Can we do better? + */ + q->initialised = true; + q->value_used = true; a->mode = m_forward; a->modifyable = true; p = new_node(a, c_dollar); @@ -776,7 +798,27 @@ static struct node * read_C(struct analyser * a) { case t_integer: /* a->mode = m_integer; */ p = new_node(a, read_AE_test(a)); - p->AE = read_AE(a, 0); break; + p->AE = read_AE(a, 0); + if (q) { + /* +=, etc don't "initialise" as they only amend an + * existing value. Similarly, they don't count as + * using the value. + */ + switch (p->type) { + case c_mathassign: + q->initialised = true; + break; + case c_eq: + case c_ne: + case c_gr: + case c_ge: + case c_ls: + case c_le: + q->value_used = true; + break; + } + } + break; } if (q) mark_used_in(a, q, p); p->name = q; @@ -792,10 +834,14 @@ static struct node * read_C(struct analyser * a) { mark_used_in(a, q, p); switch (q->type) { case t_boolean: - p->type = c_booltest; break; + p->type = c_booltest; + q->value_used = true; + break; case t_integer: error(a, e_misplaced); /* integer name misplaced */ + break; case t_string: + q->value_used = true; break; case t_routine: case t_external: @@ -1030,20 +1076,39 @@ extern void read_program(struct analyser * a) { } else { fprintf(stderr, "' defined but not used\n"); } - } else if (!q->used && - (q->type == t_routine || q->type == t_grouping)) { - int line_num; - if (q->type == t_routine) { - line_num = q->definition->line_number; - } else { - line_num = q->grouping->line_number; + } else if (q->type == t_routine || q->type == t_grouping) { + if (!q->used) { + int line_num; + if (q->type == t_routine) { + line_num = q->definition->line_number; + } else { + line_num = q->grouping->line_number; + } + fprintf(stderr, "%s:%d: warning: %s '", + a->tokeniser->file, + line_num, + name_of_name_type(q->type)); + report_b(stderr, q->b); + fprintf(stderr, "' defined but not used\n"); } + } else if (q->type == t_external) { + /* Unused is OK. */ + } else if (!q->initialised) { + count_error(a); + fprintf(stderr, "%s:%d: warning: %s '", + a->tokeniser->file, + q->declaration_line_number, + name_of_name_type(q->type)); + report_b(stderr, q->b); + fprintf(stderr, "' is never initialised\n"); + } else if (!q->value_used) { + count_error(a); fprintf(stderr, "%s:%d: warning: %s '", a->tokeniser->file, - line_num, + q->declaration_line_number, name_of_name_type(q->type)); report_b(stderr, q->b); - fprintf(stderr, "' defined but not used\n"); + fprintf(stderr, "' is set but never used\n"); } q = q->next; } diff --git a/xapian-core/languages/compiler/driver.c b/xapian-core/languages/compiler/driver.c index 123306d1d..0afd79b62 100644 --- a/xapian-core/languages/compiler/driver.c +++ b/xapian-core/languages/compiler/driver.c @@ -99,7 +99,7 @@ static int read_options(struct options * o, int argc, char * argv[]) { o->package = DEFAULT_PACKAGE; o->go_package = DEFAULT_GO_PACKAGE; o->go_snowball_runtime = DEFAULT_GO_SNOWBALL_RUNTIME; - o->name = ""; + o->name = NULL; o->make_lang = LANG_C; o->includes = 0; o->includes_end = 0; @@ -271,6 +271,66 @@ static int read_options(struct options * o, int argc, char * argv[]) { } } if (!o->externals_prefix) o->externals_prefix = ""; + + if (!o->name && o->output_file) { + /* Default class name to basename of output_file - this is the standard + * convention for at least Java and C#. + */ + const char * slash = strrchr(o->output_file, '/'); + size_t len; + const char * leaf = (slash == NULL) ? o->output_file : slash + 1; + + slash = strrchr(leaf, '\\'); + if (slash != NULL) leaf = slash + 1; + + { + const char * dot = strchr(leaf, '.'); + len = (dot == NULL) ? strlen(leaf) : (size_t)(dot - leaf); + } + + { + char * new_name = malloc(len + 1); + switch (o->make_lang) { + case LANG_CSHARP: + /* Upper case initial letter. */ + memcpy(new_name, leaf, len); + new_name[0] = toupper(new_name[0]); + break; + case LANG_JAVASCRIPT: + case LANG_PYTHON: { + /* Upper case initial letter and change each + * underscore+letter or hyphen+letter to an upper case + * letter. + */ + int i, j = 0; + int uc_next = true; + for (i = 0; i != len; ++i) { + unsigned char ch = leaf[i]; + if (ch == '_' || ch == '-') { + uc_next = true; + } else { + if (uc_next) { + new_name[j] = toupper(ch); + uc_next = false; + } else { + new_name[j] = ch; + } + ++j; + } + } + len = j; + break; + } + default: + /* Just copy. */ + memcpy(new_name, leaf, len); + break; + } + new_name[len] = '\0'; + o->name = new_name; + } + } + return new_argc; } diff --git a/xapian-core/languages/compiler/generator.c b/xapian-core/languages/compiler/generator.c index 1921cb007..e30b932f6 100644 --- a/xapian-core/languages/compiler/generator.c +++ b/xapian-core/languages/compiler/generator.c @@ -388,9 +388,11 @@ static void generate_AE(struct generator * g, struct node * p) { elaborated almost indefinitely. */ -extern int K_needed(struct generator * g, struct node * p) { +static int K_needed_(struct generator * g, struct node * p, int call_depth) { while (p) { switch (p->type) { + case c_atlimit: + case c_do: case c_dollar: case c_leftslice: case c_rightslice: @@ -407,17 +409,27 @@ extern int K_needed(struct generator * g, struct node * p) { case c_le: case c_sliceto: case c_booltest: + case c_set: + case c_unset: case c_true: case c_false: case c_debug: break; case c_call: - if (K_needed(g, p->name->definition)) return true; + /* Recursive functions aren't typical in snowball programs, so + * make the pessimistic assumption that keep is needed if we + * hit a generous limit on recursion. It's not likely to make + * a difference to any real world program, but means we won't + * recurse until we run out of stack for pathological cases. + */ + if (call_depth >= 100) return true; + if (K_needed_(g, p->name->definition, call_depth + 1)) + return true; break; case c_bra: - if (K_needed(g, p->left)) return true; + if (K_needed_(g, p->left, call_depth)) return true; break; default: return true; @@ -427,7 +439,11 @@ extern int K_needed(struct generator * g, struct node * p) { return false; } -static int repeat_score(struct generator * g, struct node * p) { +extern int K_needed(struct generator * g, struct node * p) { + return K_needed_(g, p, 0); +} + +static int repeat_score(struct generator * g, struct node * p, int call_depth) { int score = 0; while (p) { switch (p->type) { @@ -450,11 +466,25 @@ static int repeat_score(struct generator * g, struct node * p) { break; case c_call: - score += repeat_score(g, p->name->definition); + /* Recursive functions aren't typical in snowball programs, so + * make the pessimistic assumption that repeat requires cursor + * reinstatement if we hit a generous limit on recursion. It's + * not likely to make a difference to any real world program, + * but means we won't recurse until we run out of stack for + * pathological cases. + */ + if (call_depth >= 100) { + return 2; + } + score += repeat_score(g, p->name->definition, call_depth + 1); + if (score >= 2) + return score; break; case c_bra: - score += repeat_score(g, p->left); + score += repeat_score(g, p->left, call_depth); + if (score >= 2) + return score; break; case c_name: @@ -463,12 +493,12 @@ static int repeat_score(struct generator * g, struct node * p) { case c_grouping: case c_non: case c_hop: - score = score + 1; + if (++score >= 2) + return score; break; default: - score = 2; - break; + return 2; } p = p->right; } @@ -478,7 +508,7 @@ static int repeat_score(struct generator * g, struct node * p) { /* tests if an expression requires cursor reinstatement in a repeat */ extern int repeat_restore(struct generator * g, struct node * p) { - return repeat_score(g, p) >= 2; + return repeat_score(g, p, 0) >= 2; } static void generate_bra(struct generator * g, struct node * p) { @@ -936,15 +966,49 @@ static void generate_slicefrom(struct generator * g, struct node * p) { static void generate_setlimit(struct generator * g, struct node * p) { int keep_c; - writef(g, "~{~K~C", p); - keep_c = g->keep_count; - generate(g, p->left); + if (p->left && p->left->type == c_tomark && !p->left->right) { + /* Special case for: + * + * setlimit tomark AE for C + * + * All uses of setlimit in the current stemmers we ship follow this + * pattern, and by special-casing we can avoid having to save and + * restore c. + */ + struct node * q = p->left; + + ++g->keep_count; + writef(g, "~N~{int mlimit", p); + write_int(g, g->keep_count); + writef(g, ";~C", p); + keep_c = g->keep_count; + + g->S[0] = q->mode == m_forward ? ">" : "<"; + + w(g, "~Mif (~zc ~S0 "); generate_AE(g, q->AE); writef(g, ") ~f~N", q); + w(g, "~Mmlimit"); + write_int(g, keep_c); + if (p->mode == m_forward) { + w(g, " = ~zl - ~zc; ~zl = "); + } else { + w(g, " = ~zlb; ~zlb = "); + } + generate_AE(g, q->AE); + w(g, ";~N"); + } else { + writef(g, "~{~K~C", p); + keep_c = g->keep_count; + generate(g, p->left); + + w(g, "~Mmlimit"); + write_int(g, keep_c); + if (p->mode == m_forward) + w(g, " = ~zl - ~zc; ~zl = ~zc;~N"); + else + w(g, " = ~zlb; ~zlb = ~zc;~N"); + w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N"); + } - w(g, "~Mmlimit"); - write_int(g, keep_c); - if (p->mode == m_forward) w(g, " = ~zl - ~zc; ~zl = ~zc;~N"); - else w(g, " = ~zlb; ~zlb = ~zc;~N"); - w(g, "~M"); wrestore(g, p, keep_c); w(g, "~N"); g->failure_keep_count = -keep_c; generate(g, p->aux); w(g, "~M"); @@ -1063,7 +1127,7 @@ static void generate_literalstring(struct generator * g, struct node * p) { /* It's quite common to compare with a single character literal string, * so just inline the simpler code for this case rather than making a * function call. In UTF-8 mode, only do this for the ASCII subset, - * since multi-byte characters are more complex to text against. + * since multi-byte characters are more complex to test against. */ if (g->options->encoding == ENC_UTF8 && *b >= 128) { printf("single byte %d\n", *b); diff --git a/xapian-core/languages/compiler/header.h b/xapian-core/languages/compiler/header.h index aa1090d04..2dcdee368 100644 --- a/xapian-core/languages/compiler/header.h +++ b/xapian-core/languages/compiler/header.h @@ -157,6 +157,8 @@ struct name { struct grouping * grouping; /* for grouping names */ byte referenced; byte used_in_among; /* Function used in among? */ + byte value_used; /* (For variables) is its value ever used? */ + byte initialised; /* (For variables) is it ever initialised? */ struct node * used; /* First use, or NULL if not used */ struct name * local_to; /* Local to one routine/external */ int declaration_line_number;/* Line number of declaration */ -- 2.11.4.GIT