From 51bf3f2585e60b2b11dd57a7529eb3c37c4b880d Mon Sep 17 00:00:00 2001
From: Olly Betts <olly@survex.com>
Date: Tue, 24 Apr 2018 12:27:13 +1200
Subject: [PATCH] Sync changes from latest snowball compiler version

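* Avoid comparing with uninitialised array element in compiler.
* Fix GCC7 misleading indentation and switch case fall-through warnings.
* Fix // comments in snowball compiler code for C90 compatibility.
* Support for {U+1234} notation to specify Unicode codepoints.
* Accept multiple source files on the command line, read in order.
* Report names which are declared but not defined, or defined but not
  used, as individual "file:line: warning:" messages.
* Replace the widechars/utf8 option flags with a single encoding setting.
* Parse "define" of an undeclared name as a routine if followed by "as",
  otherwise as a grouping.
* Optimise away double negation in arithmetic expressions.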
---
 xapian-core/languages/Makefile.mk          |   2 +-
 xapian-core/languages/compiler/analyser.c  | 177 +++++++++++++++++++---------
 xapian-core/languages/compiler/driver.c    | 183 +++++++++++++++++++++++------
 xapian-core/languages/compiler/generator.c |  84 +++++++------
 xapian-core/languages/compiler/header.h    |  52 ++++++--
 xapian-core/languages/compiler/tokeniser.c |  96 +++++++++++++--
 6 files changed, 452 insertions(+), 142 deletions(-)

diff --git a/xapian-core/languages/Makefile.mk b/xapian-core/languages/Makefile.mk
index d44c63c3e..6b3936b9f 100644
--- a/xapian-core/languages/Makefile.mk
+++ b/xapian-core/languages/Makefile.mk
@@ -85,7 +85,7 @@ $(snowball_built_sources): languages/snowball $(snowball_algorithms)
 
 languages/snowball: $(snowball_sources) $(snowball_headers)
 	$(CC_FOR_BUILD) -o languages/snowball \
-	    -DDISABLE_JAVA -DDISABLE_JSX -DDISABLE_PYTHON \
+	    -DDISABLE_CSHARP -DDISABLE_GO -DDISABLE_JAVA -DDISABLE_JS -DDISABLE_PYTHON -DDISABLE_RUST \
 	    `for f in $(snowball_sources) ; do test -f $$f && echo $$f || echo $(srcdir)/$$f ; done`
 
 .sbl.cc:
diff --git a/xapian-core/languages/compiler/analyser.c b/xapian-core/languages/compiler/analyser.c
index 35c8b4462..c7fd3f83e 100644
--- a/xapian-core/languages/compiler/analyser.c
+++ b/xapian-core/languages/compiler/analyser.c
@@ -9,7 +9,7 @@ typedef enum {
     e_unexpected_token = 1,
     e_string_omitted = 2,
     e_unexpected_token_in_among = 3,
-    // For codes above here, report "after " t->previous_token after the error.
+    /* For codes above here, report "after " t->previous_token after the error. */
     e_unresolved_substring = 14,
     e_not_allowed_inside_reverse = 15,
     e_empty_grouping = 16,
@@ -17,7 +17,7 @@ typedef enum {
     e_empty_among = 18,
     e_adjacent_bracketed_in_among = 19,
     e_substring_preceded_by_substring = 20,
-    // For codes below here, tokeniser->b is printed before the error.
+    /* For codes below here, tokeniser->b is printed before the error. */
     e_redeclared = 30,
     e_undeclared = 31,
     e_declared_as_different_mode = 32,
@@ -35,8 +35,6 @@ static struct node * read_C(struct analyser * a);
 static struct node * C_style(struct analyser * a, const char * s, int token);
 
 
-static void fault(int n) { fprintf(stderr, "fault %d\n", n); exit(1); }
-
 static void print_node_(struct node * p, int n, const char * s) {
 
     int i;
@@ -76,22 +74,37 @@ static struct node * new_node(struct analyser * a, int type) {
 
 static const char * name_of_mode(int n) {
     switch (n) {
-         default: fault(0);
-         case m_backward: return "string backward";
-         case m_forward:  return "string forward";
-    /*   case m_integer:  return "integer";  */
+        case m_backward: return "string backward";
+        case m_forward:  return "string forward";
+    /*  case m_integer:  return "integer";  */
     }
+    fprintf(stderr, "Invalid mode %d in name_of_mode()\n", n);
+    exit(1);
 }
 
 static const char * name_of_type(int n) {
     switch (n) {
-         default: fault(1);
-         case 's': return "string";
-         case 'i': return "integer";
-         case 'r': return "routine";
-         case 'R': return "routine or grouping";
-         case 'g': return "grouping";
+        case 's': return "string";
+        case 'i': return "integer";
+        case 'r': return "routine";
+        case 'R': return "routine or grouping";
+        case 'g': return "grouping";
+    }
+    fprintf(stderr, "Invalid type %d in name_of_type()\n", n);
+    exit(1);
+}
+
+static const char * name_of_name_type(int code) {
+    switch (code) {
+        case t_string: return "string";
+        case t_boolean: return "boolean";
+        case t_integer: return "integer";
+        case t_routine: return "routine";
+        case t_external: return "external";
+        case t_grouping: return "grouping";
     }
+    fprintf(stderr, "Invalid type code %d in name_of_name_type()\n", code);
+    exit(1);
 }
 
 static void count_error(struct analyser * a) {
@@ -110,6 +123,7 @@ static void error2(struct analyser * a, error_code n, int x) {
             fprintf(stderr, "%s omitted", name_of_token(t->omission)); break;
         case e_unexpected_token_in_among:
             fprintf(stderr, "in among(...), ");
+            /* fall through */
         case e_unexpected_token:
             fprintf(stderr, "unexpected %s", name_of_token(t->token));
             if (t->token == c_number) fprintf(stderr, " %d", t->number);
@@ -228,13 +242,24 @@ static void check_routine_mode(struct analyser * a, struct name * p, int mode) {
 
 static void check_name_type(struct analyser * a, struct name * p, int type) {
     switch (type) {
-        case 's': if (p->type == t_string) return; break;
-        case 'i': if (p->type == t_integer) return; break;
-        case 'b': if (p->type == t_boolean) return; break;
-        case 'R': if (p->type == t_grouping) return;
-        case 'r': if (p->type == t_routine ||
-                      p->type == t_external) return; break;
-        case 'g': if (p->type == t_grouping) return; break;
+        case 's':
+            if (p->type == t_string) return;
+            break;
+        case 'i':
+            if (p->type == t_integer) return;
+            break;
+        case 'b':
+            if (p->type == t_boolean) return;
+            break;
+        case 'R':
+            if (p->type == t_grouping) return;
+            /* fall through */
+        case 'r':
+            if (p->type == t_routine || p->type == t_external) return;
+            break;
+        case 'g':
+            if (p->type == t_grouping) return;
+            break;
     }
     error2(a, e_not_of_type_x, type);
 }
@@ -281,7 +306,8 @@ handle_as_name:
                     p->local_to = 0;
                     p->grouping = 0;
                     p->definition = 0;
-                    a->name_count[type] ++;
+                    p->declaration_line_number = t->line_number;
+                    a->name_count[type]++;
                     p->next = a->names;
                     a->names = p;
                     if (token != c_name) {
@@ -358,8 +384,18 @@ static struct node * read_AE(struct analyser * a, int B) {
     struct node * q;
     switch (read_token(t)) {
         case c_minus: /* monadic */
+            q = read_AE(a, 100);
+            if (q->type == c_neg) {
+                /* Optimise away double negation, which avoids generators
+                 * having to worry about generating "--" (decrement operator
+                 * in many languages).
+                 */
+                p = q->right;
+                /* Don't free q, it's in the linked list a->nodes. */
+                break;
+            }
             p = new_node(a, c_neg);
-            p->right = read_AE(a, 100);
+            p->right = q;
             break;
         case c_bra:
             p = read_AE(a, 0);
@@ -372,6 +408,7 @@ static struct node * read_AE(struct analyser * a, int B) {
         case c_maxint:
         case c_minint:
             a->int_limits_used = true;
+            /* fall through */
         case c_cursor:
         case c_limit:
         case c_len:
@@ -617,6 +654,8 @@ static struct node * read_among(struct analyser * a) {
                 q = read_C_list(a); break;
             default:
                 error(a, e_unexpected_token_in_among);
+                previous_token = token;
+                continue;
             case c_ket:
                 if (p->number == 0) error(a, e_empty_among);
                 if (t->error_count == 0) make_among(a, p, substring);
@@ -688,6 +727,7 @@ static struct node * read_C(struct analyser * a) {
             return C_style(a, "A", token);
         case c_delete:
             check_modifyable(a);
+            /* fall through */
         case c_next:
         case c_tolimit:
         case c_atlimit:
@@ -722,7 +762,12 @@ static struct node * read_C(struct analyser * a) {
                 switch (q ? q->type : t_string)
                     /* above line was: switch (q->type) - bug #1 fix 7/2/2003 */
                 {
-                    default: error(a, e_not_of_type_string_or_integer);
+                    default:
+                        error(a, e_not_of_type_string_or_integer);
+                        /* Handle $foo for unknown 'foo' as string since
+                         * that's more common and so less likely to cause
+                         * an error avalanche. */
+                        /* fall through */
                     case t_string:
                         a->mode = m_forward;
                         a->modifyable = true;
@@ -824,10 +869,11 @@ static void read_define_grouping(struct analyser * a, struct name * q) {
         NEW(grouping, p);
         if (a->groupings == 0) a->groupings = p; else a->groupings_end->next = p;
         a->groupings_end = p;
-        q->grouping = p;
+        if (q) q->grouping = p;
         p->next = 0;
         p->name = q;
-        p->number = q->count;
+        p->number = q ? q->count : 0;
+        p->line_number = a->tokeniser->line_number;
         p->b = create_b(0);
         while (true) {
             switch (read_token(t)) {
@@ -841,7 +887,7 @@ static void read_define_grouping(struct analyser * a, struct name * q) {
                     }
                     break;
                 case c_literalstring:
-                    p->b = alter_grouping(p->b, t->b, style, a->utf8);
+                    p->b = alter_grouping(p->b, t->b, style, (a->encoding == ENC_UTF8));
                     break;
                 default: error(a, e_unexpected_token); return;
             }
@@ -885,8 +931,8 @@ static void read_define_routine(struct analyser * a, struct name * q) {
     if (q) q->definition = p->left;
 
     if (a->substring != 0) {
-         error2(a, e_unresolved_substring, a->substring->line_number);
-         a->substring = 0;
+        error2(a, e_unresolved_substring, a->substring->line_number);
+        a->substring = 0;
     }
     p->amongvar_needed = a->amongvar_needed;
 }
@@ -894,8 +940,26 @@ static void read_define_routine(struct analyser * a, struct name * q) {
 static void read_define(struct analyser * a) {
     if (get_token(a, c_name)) {
         struct name * q = find_name(a);
-        if (q != 0 && q->type == t_grouping) read_define_grouping(a, q);
-            else read_define_routine(a, q);
+        int type;
+        if (q) {
+            type = q->type;
+        } else {
+            /* No declaration, so sniff next token - if it is 'as' then parse
+             * as a routine, otherwise as a grouping.
+             */
+            if (read_token(a->tokeniser) == c_as) {
+                type = t_routine;
+            } else {
+                type = t_grouping;
+            }
+            a->tokeniser->token_held = true;
+        }
+
+        if (type == t_grouping) {
+            read_define_grouping(a, q);
+        } else {
+            read_define_routine(a, q);
+        }
     }
 }
 
@@ -923,6 +987,7 @@ static void read_program_(struct analyser * a, int terminator) {
             case c_backwardmode:read_backwardmode(a); break;
             case c_ket:
                 if (terminator == c_ket) return;
+                /* fall through */
             default:
                 error(a, e_unexpected_token); break;
             case -1:
@@ -939,9 +1004,11 @@ extern void read_program(struct analyser * a) {
         while (q) {
             switch (q->type) {
                 case t_external: case t_routine:
-                    if (q->used && q->definition == 0) error4(a, q); break;
+                    if (q->used && q->definition == 0) error4(a, q);
+                    break;
                 case t_grouping:
-                    if (q->used && q->grouping == 0) error4(a, q); break;
+                    if (q->used && q->grouping == 0) error4(a, q);
+                    break;
             }
             q = q->next;
         }
@@ -949,33 +1016,37 @@ extern void read_program(struct analyser * a) {
 
     if (a->tokeniser->error_count == 0) {
         struct name * q = a->names;
-        int warned = false;
         while (q) {
             if (!q->referenced) {
-                if (!warned) {
-                    fprintf(stderr, "Declared but not used:");
-                    warned = true;
+                fprintf(stderr, "%s:%d: warning: %s '",
+                        a->tokeniser->file,
+                        q->declaration_line_number,
+                        name_of_name_type(q->type));
+                report_b(stderr, q->b);
+                if (q->type == t_routine ||
+                    q->type == t_external ||
+                    q->type == t_grouping) {
+                    fprintf(stderr, "' declared but not defined\n");
+                } else {
+                    fprintf(stderr, "' defined but not used\n");
                 }
-                fprintf(stderr, " "); report_b(stderr, q->b);
-            }
-            q = q->next;
-        }
-        if (warned) fprintf(stderr, "\n");
-
-        q = a->names;
-        warned = false;
-        while (q) {
-            if (! q->used && (q->type == t_routine ||
-                              q->type == t_grouping)) {
-                if (!warned) {
-                    fprintf(stderr, "Declared and defined but not used:");
-                    warned = true;
+            } else if (!q->used &&
+                       (q->type == t_routine || q->type == t_grouping)) {
+                int line_num;
+                if (q->type == t_routine) {
+                    line_num = q->definition->line_number;
+                } else {
+                    line_num = q->grouping->line_number;
                 }
-                fprintf(stderr, " "); report_b(stderr, q->b);
+                fprintf(stderr, "%s:%d: warning: %s '",
+                        a->tokeniser->file,
+                        line_num,
+                        name_of_name_type(q->type));
+                report_b(stderr, q->b);
+                fprintf(stderr, "' defined but not used\n");
             }
             q = q->next;
         }
-        if (warned) fprintf(stderr, "\n");
     }
 }
 
diff --git a/xapian-core/languages/compiler/driver.c b/xapian-core/languages/compiler/driver.c
index e70ae62df..123306d1d 100644
--- a/xapian-core/languages/compiler/driver.c
+++ b/xapian-core/languages/compiler/driver.c
@@ -7,24 +7,40 @@
 #define DEFAULT_BASE_CLASS "org.tartarus.snowball.SnowballProgram"
 #define DEFAULT_AMONG_CLASS "org.tartarus.snowball.Among"
 #define DEFAULT_STRING_CLASS "java.lang.StringBuilder"
+#define DEFAULT_GO_PACKAGE "snowball"
+#define DEFAULT_GO_SNOWBALL_RUNTIME "github.com/snowballstem/snowball/go"
+
+#define DEFAULT_CS_NAMESPACE "Snowball"
+#define DEFAULT_CS_BASE_CLASS "Stemmer"
+#define DEFAULT_CS_AMONG_CLASS "Among"
+#define DEFAULT_CS_STRING_CLASS "StringBuilder"
 
 static int eq(const char * s1, const char * s2) {
     return strcmp(s1, s2) == 0;
 }
 
 static void print_arglist(void) {
-    fprintf(stderr, "Usage: snowball <file> [options]\n\n"
+    fprintf(stderr, "Usage: snowball <file>... [options]\n\n"
                     "options are: [-o[utput] file]\n"
                     "             [-s[yntax]]\n"
 #ifndef DISABLE_JAVA
                     "             [-j[ava]]\n"
 #endif
+#ifndef DISABLE_CSHARP
+                    "             [-cs[harp]]\n"
+#endif
                     "             [-c++]\n"
 #ifndef DISABLE_PYTHON
                     "             [-py[thon]]\n"
 #endif
-#ifndef DISABLE_JSX
-                    "             [-jsx]\n"
+#ifndef DISABLE_JS
+                    "             [-js]\n"
+#endif
+#ifndef DISABLE_RUST
+                    "             [-rust]\n"
+#endif
+#ifndef DISABLE_GO
+                    "             [-go]\n"
 #endif
                     "             [-w[idechars]]\n"
                     "             [-u[tf8]]\n"
@@ -34,11 +50,15 @@ static void print_arglist(void) {
                     "             [-i[nclude] directory]\n"
                     "             [-r[untime] path to runtime headers]\n"
                     "             [-p[arentclassname] fully qualified parent class name]\n"
-#ifndef DISABLE_JAVA
+#if !defined(DISABLE_JAVA) || !defined(DISABLE_CSHARP)
                     "             [-P[ackage] package name for stemmers]\n"
                     "             [-S[tringclass] StringBuffer-compatible class]\n"
                     "             [-a[mongclass] fully qualified name of the Among class]\n"
 #endif
+#ifndef DISABLE_GO
+                    "             [-gop[ackage] Go package name for stemmers]\n"
+                    "             [-gor[untime] Go snowball runtime package]\n"
+#endif
            );
     exit(1);
 }
@@ -61,9 +81,10 @@ static FILE * get_output(symbol * b) {
     return output;
 }
 
-static void read_options(struct options * o, int argc, char * argv[]) {
+static int read_options(struct options * o, int argc, char * argv[]) {
     char * s;
-    int i = 2;
+    int i = 1;
+    int new_argc = 1;
 
     /* set defaults: */
 
@@ -73,24 +94,30 @@ static void read_options(struct options * o, int argc, char * argv[]) {
     o->variables_prefix = 0;
     o->runtime_path = 0;
     o->parent_class_name = DEFAULT_BASE_CLASS;
-#ifndef DISABLE_JAVA
     o->string_class = DEFAULT_STRING_CLASS;
     o->among_class = DEFAULT_AMONG_CLASS;
     o->package = DEFAULT_PACKAGE;
-#endif
+    o->go_package = DEFAULT_GO_PACKAGE;
+    o->go_snowball_runtime = DEFAULT_GO_SNOWBALL_RUNTIME;
     o->name = "";
     o->make_lang = LANG_C;
-    o->widechars = false;
     o->includes = 0;
     o->includes_end = 0;
-    o->utf8 = false;
+    o->encoding = ENC_SINGLEBYTE;
 
     /* read options: */
 
     while (i < argc) {
         s = argv[i++];
-        {   if (eq(s, "-o") || eq(s, "-output")) {
-                check_lim(i, argc);
+        if (s[0] != '-') {
+            /* Non-option argument (a source file) - shuffle down in argv. */
+            argv[new_argc++] = s;
+            continue;
+        }
+
+        {
+            if (eq(s, "-o") || eq(s, "-output")) {
+                check_lim(i, argc);
                 o->output_file = argv[i++];
                 continue;
             }
@@ -99,17 +126,42 @@ static void read_options(struct options * o, int argc, char * argv[]) {
                 o->name = argv[i++];
                 continue;
             }
-#ifndef DISABLE_JSX
-            if (eq(s, "-jsx")) {
-                o->make_lang = LANG_JSX;
-                o->widechars = true;
+#ifndef DISABLE_JS
+            if (eq(s, "-js")) {
+                o->make_lang = LANG_JAVASCRIPT;
+                o->encoding = ENC_WIDECHARS;
+                continue;
+            }
+#endif
+#ifndef DISABLE_RUST
+            if (eq(s, "-rust")) {
+                o->make_lang = LANG_RUST;
+                o->encoding = ENC_UTF8;
+                continue;
+            }
+#endif
+#ifndef DISABLE_GO
+            if (eq(s, "-go")) {
+                o->make_lang = LANG_GO;
+                o->encoding = ENC_UTF8;
                 continue;
             }
 #endif
 #ifndef DISABLE_JAVA
             if (eq(s, "-j") || eq(s, "-java")) {
                 o->make_lang = LANG_JAVA;
-                o->widechars = true;
+                o->encoding = ENC_WIDECHARS;
+                continue;
+            }
+#endif
+#ifndef DISABLE_CSHARP
+            if (eq(s, "-cs") || eq(s, "-csharp")) {
+                o->make_lang = LANG_CSHARP;
+		o->encoding = ENC_WIDECHARS;
+                o->parent_class_name = DEFAULT_CS_BASE_CLASS;
+                o->string_class = DEFAULT_CS_STRING_CLASS;
+                o->among_class = DEFAULT_CS_AMONG_CLASS;
+                o->package = DEFAULT_CS_NAMESPACE;
                 continue;
             }
 #endif
@@ -120,13 +172,12 @@ static void read_options(struct options * o, int argc, char * argv[]) {
 #ifndef DISABLE_PYTHON
             if (eq(s, "-py") || eq(s, "-python")) {
                 o->make_lang = LANG_PYTHON;
-                o->widechars = true;
+                o->encoding = ENC_WIDECHARS;
                 continue;
             }
 #endif
             if (eq(s, "-w") || eq(s, "-widechars")) {
-                o->widechars = true;
-                o->utf8 = false;
+                o->encoding = ENC_WIDECHARS;
                 continue;
             }
             if (eq(s, "-s") || eq(s, "-syntax")) {
@@ -164,8 +215,7 @@ static void read_options(struct options * o, int argc, char * argv[]) {
                 continue;
             }
             if (eq(s, "-u") || eq(s, "-utf8")) {
-                o->utf8 = true;
-                o->widechars = false;
+                o->encoding = ENC_UTF8;
                 continue;
             }
             if (eq(s, "-p") || eq(s, "-parentclassname")) {
@@ -173,7 +223,7 @@ static void read_options(struct options * o, int argc, char * argv[]) {
                 o->parent_class_name = argv[i++];
                 continue;
             }
-#ifndef DISABLE_JAVA
+#if !defined(DISABLE_JAVA) || !defined(DISABLE_CSHARP)
             if (eq(s, "-P") || eq(s, "-Package")) {
                 check_lim(i, argc);
                 o->package = argv[i++];
@@ -190,10 +240,27 @@ static void read_options(struct options * o, int argc, char * argv[]) {
                 continue;
             }
 #endif
+#ifndef DISABLE_GO
+            if (eq(s, "-gop") || eq(s, "-gopackage")) {
+                check_lim(i, argc);
+                o->go_package = argv[i++];
+                continue;
+            }
+            if (eq(s, "-gor") || eq(s, "-goruntime")) {
+                check_lim(i, argc);
+                o->go_snowball_runtime = argv[i++];
+                continue;
+            }
+#endif
             fprintf(stderr, "'%s' misplaced\n", s);
             print_arglist();
         }
     }
+    if (new_argc == 1) {
+        fprintf(stderr, "no source files specified\n");
+        print_arglist();
+    }
+    argv[new_argc] = NULL;
 
     if (o->make_lang != LANG_C && o->make_lang != LANG_CPLUSPLUS) {
 	if (o->runtime_path) {
@@ -204,17 +271,19 @@ static void read_options(struct options * o, int argc, char * argv[]) {
 	}
     }
     if (!o->externals_prefix) o->externals_prefix = "";
+    return new_argc;
 }
 
 extern int main(int argc, char * argv[]) {
 
+    int i;
     NEW(options, o);
-    if (argc == 1) print_arglist();
-    read_options(o, argc, argv);
+    argc = read_options(o, argc, argv);
     {
         symbol * filename = add_s_to_b(0, argv[1]);
         char * file;
         symbol * u = get_input(filename, &file);
+        lose_b(filename);
         if (u == 0) {
             fprintf(stderr, "Can't open input %s\n", argv[1]);
             exit(1);
@@ -222,9 +291,29 @@ extern int main(int argc, char * argv[]) {
         {
             struct tokeniser * t = create_tokeniser(u, file);
             struct analyser * a = create_analyser(t);
-            t->widechars = o->widechars;
+            struct input ** next_input_ptr = &(t->next);
+            a->encoding = t->encoding = o->encoding;
             t->includes = o->includes;
-            a->utf8 = t->utf8 = o->utf8;
+            /* If multiple source files are specified, set up the others to be
+             * read after the first in order, using the same mechanism as
+             * 'get' uses. */
+            for (i = 2; i != argc; ++i) {
+                NEW(input, q);
+                filename = add_s_to_b(0, argv[i]);
+                u = get_input(filename, &file);
+                lose_b(filename);
+                if (u == 0) {
+                    fprintf(stderr, "Can't open input %s\n", argv[i]);
+                    exit(1);
+                }
+                q->p = u;
+                q->c = 0;
+                q->file = file;
+                q->line_number = 1;
+                *next_input_ptr = q;
+                next_input_ptr = &(q->next);
+            }
+            *next_input_ptr = NULL;
             read_program(a);
             if (t->error_count > 0) exit(1);
             if (o->syntax_tree) print_program(a);
@@ -274,13 +363,43 @@ extern int main(int argc, char * argv[]) {
                     fclose(o->output_src);
                 }
 #endif
-#ifndef DISABLE_JSX
-                if (o->make_lang == LANG_JSX) {
+#ifndef DISABLE_JS
+                if (o->make_lang == LANG_JAVASCRIPT) {
                     symbol * b = add_s_to_b(0, s);
-                    b = add_s_to_b(b, ".jsx");
+                    b = add_s_to_b(b, ".js");
                     o->output_src = get_output(b);
                     lose_b(b);
-                    generate_program_jsx(g);
+                    generate_program_js(g);
+                    fclose(o->output_src);
+                }
+#endif
+#ifndef DISABLE_CSHARP
+                if (o->make_lang == LANG_CSHARP) {
+                    symbol * b = add_s_to_b(0, s);
+                    b = add_s_to_b(b, ".cs");
+                    o->output_src = get_output(b);
+                    lose_b(b);
+                    generate_program_csharp(g);
+                    fclose(o->output_src);
+                }
+#endif
+#ifndef DISABLE_RUST
+                if (o->make_lang == LANG_RUST) {
+                    symbol * b = add_s_to_b(0, s);
+                    b = add_s_to_b(b, ".rs");
+                    o->output_src = get_output(b);
+                    lose_b(b);
+                    generate_program_rust(g);
+                    fclose(o->output_src);
+                }
+#endif
+#ifndef DISABLE_GO
+                if (o->make_lang == LANG_GO) {
+                    symbol * b = add_s_to_b(0, s);
+                    b = add_s_to_b(b, ".go");
+                    o->output_src = get_output(b);
+                    lose_b(b);
+                    generate_program_go(g);
                     fclose(o->output_src);
                 }
 #endif
@@ -289,7 +408,6 @@ extern int main(int argc, char * argv[]) {
             close_analyser(a);
         }
         lose_b(u);
-        lose_b(filename);
     }
     {   struct include * p = o->includes;
         while (p) {
@@ -301,4 +419,3 @@ extern int main(int argc, char * argv[]) {
     if (space_count) fprintf(stderr, "%d blocks unfreed\n", space_count);
     return 0;
 }
-
diff --git a/xapian-core/languages/compiler/generator.c b/xapian-core/languages/compiler/generator.c
index 6f233b53e..1921cb007 100644
--- a/xapian-core/languages/compiler/generator.c
+++ b/xapian-core/languages/compiler/generator.c
@@ -83,11 +83,10 @@ static void write_hex(struct generator * g, int i) {
 static void wlitch(struct generator * g, int ch) {
     if (32 <= ch && ch < 127) {
         write_char(g, '\'');
-        switch (ch) {
-            case '\'':
-            case '\\': write_char(g, '\\');
-            default:   write_char(g, ch);
+        if (ch == '\'' || ch == '\\') {
+            write_char(g, '\\');
         }
+        write_char(g, ch);
         write_char(g, '\'');
     }  else {
         write_string(g, "0x"); write_hex(g, ch);
@@ -175,13 +174,29 @@ static void write_block_end(struct generator * g) {    /* block end */
 
 static void w(struct generator * g, const char * s);
 
-static void wk(struct generator * g, struct node * p) {     /* keep c */
+/* keep c */
+static void wk(struct generator * g, struct node * p, int keep_limit) {
     ++g->keep_count;
     if (p->mode == m_forward) {
-        write_string(g, "int c"); write_int(g, g->keep_count); w(g, " = ~zc;");
+        write_string(g, "int c");
+        write_int(g, g->keep_count);
+        w(g, " = ~zc");
+	if (keep_limit) {
+            write_string(g, ", mlimit");
+            write_int(g, g->keep_count);
+        }
+        write_char(g, ';');
     } else {
-        write_string(g, "int m"); write_int(g, g->keep_count); w(g, " = ~zl - ~zc; /*(void)m");
-        write_int(g, g->keep_count); write_string(g, "*/;");
+        write_string(g, "int m");
+        write_int(g, g->keep_count);
+        w(g, " = ~zl - ~zc");
+	if (keep_limit) {
+            write_string(g, ", mlimit");
+            write_int(g, g->keep_count);
+        }
+        write_string(g, "; (void)m");
+        write_int(g, g->keep_count);
+        write_char(g, ';');
     }
 }
 
@@ -267,6 +282,7 @@ static void write_data_address(struct generator * g, struct node * p) {
 static void writef(struct generator * g, const char * input, struct node * p) {
     int i = 0;
     int l = strlen(input);
+
     while (i < l) {
         int ch = input[i++];
         if (ch != '~') {
@@ -276,7 +292,8 @@ static void writef(struct generator * g, const char * input, struct node * p) {
         switch (input[i++]) {
             default: write_char(g, input[i - 1]); continue;
             case 'C': write_comment(g, p); continue;
-            case 'k': wk(g, p); continue;
+            case 'k': wk(g, p, false); continue;
+            case 'K': wk(g, p, true); continue;
             case 'i': winc(g, p); continue;
             case 'l': write_check_limit(g, p); continue;
             case 'f': write_failure(g, p); continue;
@@ -316,7 +333,7 @@ static void w(struct generator * g, const char * s) {
 }
 
 static void generate_AE(struct generator * g, struct node * p) {
-    char * s;
+    const char * s;
     switch (p->type) {
         case c_name:
             write_varref(g, p->name); break;
@@ -344,7 +361,7 @@ static void generate_AE(struct generator * g, struct node * p) {
         case c_limit:
             w(g, p->mode == m_forward ? "~zl" : "~zlb"); break;
         case c_len:
-            if (g->options->utf8) {
+            if (g->options->encoding == ENC_UTF8) {
                 w(g, "len_utf8(~zp)");
                 break;
             }
@@ -353,7 +370,7 @@ static void generate_AE(struct generator * g, struct node * p) {
             w(g, "SIZE(~zp)");
             break;
         case c_lenof:
-            if (g->options->utf8) {
+            if (g->options->encoding == ENC_UTF8) {
                 g->V[0] = p->name;
                 w(g, "len_utf8(~V0)");
                 break;
@@ -367,11 +384,11 @@ static void generate_AE(struct generator * g, struct node * p) {
 }
 
 /* K_needed() tests to see if we really need to keep c. Not true when the
-   the command does not touch the cursor. This and repeat_score() could be
+   command does not touch the cursor. This and repeat_score() could be
    elaborated almost indefinitely.
 */
 
-static int K_needed(struct generator * g, struct node * p) {
+extern int K_needed(struct generator * g, struct node * p) {
     while (p) {
         switch (p->type) {
             case c_dollar:
@@ -460,7 +477,7 @@ static int repeat_score(struct generator * g, struct node * p) {
 
 /* tests if an expression requires cursor reinstatement in a repeat */
 
-static int repeat_restore(struct generator * g, struct node * p) {
+extern int repeat_restore(struct generator * g, struct node * p) {
     return repeat_score(g, p) >= 2;
 }
 
@@ -531,7 +548,7 @@ static void generate_or(struct generator * g, struct node * p) {
 
 static void generate_backwards(struct generator * g, struct node * p) {
 
-    writef(g,"~M~zlb = ~zc; ~zc = ~zl;~C~N", p);
+    writef(g, "~M~zlb = ~zc; ~zc = ~zl;~C~N", p);
     generate(g, p->left);
     w(g, "~M~zc = ~zlb;~N");
 }
@@ -654,7 +671,7 @@ static void generate_do(struct generator * g, struct node * p) {
 }
 
 static void generate_next(struct generator * g, struct node * p) {
-    if (g->options->utf8) {
+    if (g->options->encoding == ENC_UTF8) {
         if (p->mode == m_forward)
             w(g, "~{int ret = skip_utf8(~zp, ~zc, 0, ~zl, 1");
         else
@@ -673,7 +690,7 @@ static void generate_GO_grouping(struct generator * g, struct node * p, int is_g
     struct grouping * q = p->name->grouping;
     g->S[0] = p->mode == m_forward ? "" : "_b";
     g->S[1] = complement ? "in" : "out";
-    g->S[2] = g->options->utf8 ? "_U" : "";
+    g->S[2] = g->options->encoding == ENC_UTF8 ? "_U" : "";
     g->V[0] = p->name;
     g->I[0] = q->smallest_ch;
     g->I[1] = q->largest_ch;
@@ -817,7 +834,7 @@ static void generate_hop(struct generator * g, struct node * p) {
     g->S[0] = p->mode == m_forward ? "+" : "-";
     g->S[1] = p->mode == m_forward ? "0" :
 	(g->options->make_lang == LANG_C ? "z->lb" : "lb");
-    if (g->options->utf8) {
+    if (g->options->encoding == ENC_UTF8) {
         w(g, "~{int ret = skip_utf8(~zp, ~zc, ~S1, ~zl, ~S0 ");
         generate_AE(g, p->AE); writef(g, ");~C", p);
         writef(g, "~Mif (ret < 0) ~f~N", p);
@@ -919,11 +936,8 @@ static void generate_slicefrom(struct generator * g, struct node * p) {
 
 static void generate_setlimit(struct generator * g, struct node * p) {
     int keep_c;
-    writef(g, "~{~k~C", p);
+    writef(g, "~{~K~C", p);
     keep_c = g->keep_count;
-    w(g, "~Mint mlimit");
-    write_int(g, keep_c);
-    w(g, ";~N");
     generate(g, p->left);
 
     w(g, "~Mmlimit");
@@ -941,6 +955,8 @@ static void generate_setlimit(struct generator * g, struct node * p) {
 
 static const char * vars[] = { "p", "c", "l", "lb", "bra", "ket", NULL };
 
+/* dollar sets snowball up to operate on a string variable as if it were the
+ * current string */
 static void generate_dollar(struct generator * g, struct node * p) {
 
     int used = g->label_used;
@@ -1027,7 +1043,7 @@ static void generate_grouping(struct generator * g, struct node * p, int complem
     struct grouping * q = p->name->grouping;
     g->S[0] = p->mode == m_forward ? "" : "_b";
     g->S[1] = complement ? "out" : "in";
-    g->S[2] = g->options->utf8 ? "_U" : "";
+    g->S[2] = g->options->encoding == ENC_UTF8 ? "_U" : "";
     g->V[0] = p->name;
     g->I[0] = q->smallest_ch;
     g->I[1] = q->largest_ch;
@@ -1049,7 +1065,7 @@ static void generate_literalstring(struct generator * g, struct node * p) {
          * function call.  In UTF-8 mode, only do this for the ASCII subset,
          * since multi-byte characters are more complex to text against.
          */
-        if (g->options->utf8 && *b >= 128) {
+        if (g->options->encoding == ENC_UTF8 && *b >= 128) {
             printf("single byte %d\n", *b);
             exit(1);
         }
@@ -1144,7 +1160,7 @@ static void generate_substring(struct generator * g, struct node * p) {
             if (n_cases > 2) break;
         }
         if (block == -1) {
-            if (ch == cases[0]) continue;
+            if (n_cases > 0 && ch == cases[0]) continue;
             if (n_cases < 2) {
                 cases[n_cases++] = ch;
             } else if (ch != cases[1]) {
@@ -1248,7 +1264,7 @@ static void generate_among(struct generator * g, struct node * p) {
     if (x->substring == 0) generate_substring(g, p);
     if (x->command_count == 0 && x->starter == 0) return;
 
-    if (x->starter) generate(g, x->starter);
+    if (x->starter != 0) generate(g, x->starter);
 
     writef(g, "~Mswitch (among_var) {~C~+"
               "~Mcase 0: ~f~N", p);
@@ -1586,16 +1602,14 @@ static void generate_grouping_table(struct generator * g, struct grouping * q) {
 
     for (i = 0; i < SIZE(b); i++) set_bit(map, b[i] - q->smallest_ch);
 
-    {
-        g->V[0] = q->name;
+    g->V[0] = q->name;
 
-        w(g, "static const unsigned char ~V0[] = { ");
-        for (i = 0; i < size; i++) {
-             write_int(g, map[i]);
-             if (i < size - 1) w(g, ", ");
-        }
-        w(g, " };~N~N");
+    w(g, "static const unsigned char ~V0[] = { ");
+    for (i = 0; i < size; i++) {
+        write_int(g, map[i]);
+        if (i < size - 1) w(g, ", ");
     }
+    w(g, " };~N~N");
     lose_b(map);
 }
 
diff --git a/xapian-core/languages/compiler/header.h b/xapian-core/languages/compiler/header.h
index 9b30805fb..aa1090d04 100644
--- a/xapian-core/languages/compiler/header.h
+++ b/xapian-core/languages/compiler/header.h
@@ -1,3 +1,4 @@
+#include <stdio.h>
 
 typedef unsigned char byte;
 typedef unsigned short symbol;
@@ -47,6 +48,8 @@ extern int get_utf8(const symbol * p, int * slot);
 extern int put_utf8(int ch, symbol * p);
 extern void output_str(FILE * outfile, struct str * str);
 
+typedef enum { ENC_SINGLEBYTE, ENC_UTF8, ENC_WIDECHARS } enc;
+
 struct m_pair {
 
     struct m_pair * next;
@@ -89,6 +92,12 @@ enum token_codes {
     NUM_TOKEN_CODES
 };
 
+enum uplus_modes {
+    UPLUS_NONE,
+    UPLUS_DEFINED,
+    UPLUS_UNICODE
+};
+
 /* struct input must be a prefix of struct tokeniser. */
 struct tokeniser {
 
@@ -108,12 +117,18 @@ struct tokeniser {
     int token;
     int previous_token;
     byte token_held;
-    byte widechars;
-    byte utf8;
+    enc encoding;
 
     int omission;
     struct include * includes;
 
+    /* Mode in which U+ has been used:
+     * UPLUS_NONE - not used yet
+     * UPLUS_DEFINED - stringdef U+xxxx ....
+     * UPLUS_UNICODE - {U+xxxx} used with implicit meaning
+     */
+    int uplusmode;
+
     char token_disabled[NUM_TOKEN_CODES];
 };
 
@@ -144,6 +159,7 @@ struct name {
     byte used_in_among;         /* Function used in among? */
     struct node * used;         /* First use, or NULL if not used */
     struct name * local_to;     /* Local to one routine/external */
+    int declaration_line_number;/* Line number of declaration */
 
 };
 
@@ -185,6 +201,7 @@ struct grouping {
     int largest_ch;           /* character with max code */
     int smallest_ch;          /* character with min code */
     struct name * name;       /* so g->name->grouping == g */
+    int line_number;
 };
 
 struct node {
@@ -243,7 +260,7 @@ struct analyser {
     struct grouping * groupings;
     struct grouping * groupings_end;
     struct node * substring;  /* pending 'substring' in current routine definition */
-    byte utf8;
+    enc encoding;
     byte int_limits_used;     /* are maxint or minint used? */
 };
 
@@ -278,7 +295,7 @@ struct generator {
      * if < 0, the negated keep_count for the limit to restore in case of
      * failure. */
     int failure_keep_count;
-#if !defined(DISABLE_JAVA) && !defined(DISABLE_JSX) && !defined(DISABLE_PYTHON)
+#if !defined(DISABLE_JAVA) || !defined(DISABLE_JS) || !defined(DISABLE_PYTHON) || !defined(DISABLE_CSHARP)
     struct str * failure_str;  /* This is used by some generators instead of failure_keep_count */
 #endif
 
@@ -309,18 +326,19 @@ struct options {
     FILE * output_src;
     FILE * output_h;
     byte syntax_tree;
-    byte widechars;
-    enum { LANG_JAVA, LANG_C, LANG_CPLUSPLUS, LANG_PYTHON, LANG_JSX } make_lang;
+    enc encoding;
+    enum { LANG_JAVA, LANG_C, LANG_CPLUSPLUS, LANG_CSHARP, LANG_PYTHON, LANG_JAVASCRIPT, LANG_RUST, LANG_GO } make_lang;
     const char * externals_prefix;
     const char * variables_prefix;
     const char * runtime_path;
     const char * parent_class_name;
     const char * package;
+    const char * go_package;
+    const char * go_snowball_runtime;
     const char * string_class;
     const char * among_class;
     struct include * includes;
     struct include * includes_end;
-    byte utf8;
 };
 
 /* Generator functions common to several backends. */
@@ -335,6 +353,9 @@ extern void write_int(struct generator * g, int i);
 extern void write_b(struct generator * g, symbol * b);
 extern void write_str(struct generator * g, struct str * str);
 
+extern int K_needed(struct generator * g, struct node * p);
+extern int repeat_restore(struct generator * g, struct node * p);
+
 /* Generator for C code. */
 extern void generate_program_c(struct generator * g);
 
@@ -343,11 +364,24 @@ extern void generate_program_c(struct generator * g);
 extern void generate_program_java(struct generator * g);
 #endif
 
+#ifndef DISABLE_CSHARP
+/* Generator for C# code. */
+extern void generate_program_csharp(struct generator * g);
+#endif
+
 #ifndef DISABLE_PYTHON
 /* Generator for Python code. */
 extern void generate_program_python(struct generator * g);
 #endif
 
-#ifndef DISABLE_JSX
-extern void generate_program_jsx(struct generator * g);
+#ifndef DISABLE_JS
+extern void generate_program_js(struct generator * g);
+#endif
+
+#ifndef DISABLE_RUST
+extern void generate_program_rust(struct generator * g);
+#endif
+
+#ifndef DISABLE_GO
+extern void generate_program_go(struct generator * g);
 #endif
diff --git a/xapian-core/languages/compiler/tokeniser.c b/xapian-core/languages/compiler/tokeniser.c
index 8c7ab4064..0fcbd6d45 100644
--- a/xapian-core/languages/compiler/tokeniser.c
+++ b/xapian-core/languages/compiler/tokeniser.c
@@ -16,6 +16,8 @@ struct system_word {
 
 #include "syswords.h"
 
+static int hex_to_num(int ch);
+
 static int smaller(int a, int b) { return a < b ? a : b; }
 
 extern symbol * get_input(symbol * p, char ** p_file) {
@@ -102,10 +104,13 @@ static int eq_s(struct tokeniser * t, const char * s) {
 
 static int white_space(struct tokeniser * t, int ch) {
     switch (ch) {
-        case '\n': t->line_number++;
+        case '\n':
+            t->line_number++;
+            /* fall through */
         case '\r':
         case '\t':
-        case ' ': return true;
+        case ' ':
+            return true;
     }
     return false;
 }
@@ -129,6 +134,7 @@ static int read_literal_string(struct tokeniser * t, int c) {
         if (ch == '\n') { error1(t, "string not terminated"); return c; }
         c++;
         if (ch == t->m_start) {
+            /* Inside insert characters. */
             int c0 = c;
             int newlines = false; /* no newlines as yet */
             int black_found = false; /* no printing chars as yet */
@@ -150,7 +156,65 @@ static int read_literal_string(struct tokeniser * t, int c) {
                 if (q == 0) {
                     if (n == 1 && (firstch == '\'' || firstch == t->m_start))
                         t->b = add_to_b(t->b, 1, p + c0);
-                    else
+                    else if (n >= 3 && firstch == 'U' && p[c0 + 1] == '+') {
+                        int codepoint = 0;
+                        int x;
+                        if (t->uplusmode == UPLUS_DEFINED) {
+                            /* See if found with xxxx upper-cased. */
+                            symbol * uc = create_b(n);
+                            int i;
+                            for (i = 0; i != n; ++i) {
+                                uc[i] = toupper(p[c0 + i]);
+                            }
+                            q = find_in_m(t, n, uc);
+                            lose_b(uc);
+                            if (q != 0) {
+                                t->b = add_to_b(t->b, SIZE(q), q);
+                                continue;
+                            }
+                            error1(t, "Some U+xxxx stringdefs seen but not this one");
+                        } else {
+                            t->uplusmode = UPLUS_UNICODE;
+                        }
+                        for (x = c0 + 2; x != c - 1; ++x) {
+                            int hex = hex_to_num(p[x]);
+                            if (hex < 0) {
+                                error1(t, "Bad hex digit following U+");
+                                break;
+                            }
+                            codepoint = (codepoint << 4) | hex;
+                        }
+                        if (t->encoding == ENC_UTF8) {
+                            if (codepoint < 0 || codepoint > 0x01ffff) {
+                                error1(t, "character values exceed 0x01ffff");
+                            }
+                            /* Ensure there's enough space for a max length
+                             * UTF-8 sequence. */
+                            if (CAPACITY(t->b) < SIZE(t->b) + 3) {
+                                t->b = increase_capacity(t->b, 3);
+                            }
+                            SIZE(t->b) += put_utf8(codepoint, t->b + SIZE(t->b));
+                        } else {
+                            symbol sym; /* C90: declare before statements */
+                            if (t->encoding == ENC_SINGLEBYTE) {
+                                /* Only ISO-8859-1 is handled this way - for
+                                 * other single-byte character sets you need to
+                                 * stringdef all the U+xxxx codes you use, e.g.:
+                                 *
+                                 * stringdef U+0171   hex 'FB'
+                                 */
+                                if (codepoint < 0 || codepoint > 0xff) {
+                                    error1(t, "character values exceed 256");
+                                }
+                            } else {
+                                if (codepoint < 0 || codepoint > 0xffff) {
+                                    error1(t, "character values exceed 64K");
+                                }
+                            }
+                            sym = codepoint;
+                            t->b = add_to_b(t->b, 1, &sym);
+                        }
+                    } else
                         error(t, "string macro '", n, p + c0, "' undeclared");
                 } else
                     t->b = add_to_b(t->b, SIZE(q), q);
@@ -241,6 +305,7 @@ static int decimal_to_num(int ch) {
 static int hex_to_num(int ch) {
     if ('0' <= ch && ch <= '9') return ch - '0';
     if ('a' <= ch && ch <= 'f') return ch - 'a' + 10;
+    if ('A' <= ch && ch <= 'F') return ch - 'A' + 10;
     return -1;
 }
 
@@ -261,7 +326,7 @@ static void convert_numeric_string(struct tokeniser * t, symbol * p, int base) {
                         return;
                     }
                 } else {
-                    ch = hex_to_num(tolower(ch));
+                    ch = hex_to_num(ch);
                     if (ch < 0) {
                         error1(t, "hex string contains non-hex characters");
                         return;
@@ -270,18 +335,18 @@ static void convert_numeric_string(struct tokeniser * t, symbol * p, int base) {
                 number = base * number + ch;
                 c++;
             }
-            if (t->widechars || t->utf8) {
-                if (number < 0 || number > 0xffff) {
-                    error1(t, "character values exceed 64K");
+            if (t->encoding == ENC_SINGLEBYTE) {
+                if (number < 0 || number > 0xff) {
+                    error1(t, "character values exceed 256");
                     return;
                 }
             } else {
-                if (number < 0 || number > 0xff) {
-                    error1(t, "character values exceed 256");
+                if (number < 0 || number > 0xffff) {
+                    error1(t, "character values exceed 64K");
                     return;
                 }
             }
-            if (t->utf8)
+            if (t->encoding == ENC_UTF8)
                 d += put_utf8(number, p + d);
             else
                 p[d++] = number;
@@ -340,6 +405,14 @@ extern int read_token(struct tokeniser * t) {
                        q->name = copy_b(t->b2);
                        q->value = copy_b(t->b);
                        t->m_pairs = q;
+                       if (t->uplusmode != UPLUS_DEFINED &&
+                           (SIZE(t->b2) >= 3 && t->b2[0] == 'U' && t->b2[1] == '+')) {
+                           if (t->uplusmode == UPLUS_UNICODE) {
+                               error1(t, "U+xxxx already used with implicit meaning");
+                           } else {
+                               t->uplusmode = UPLUS_DEFINED;
+                           }
+                       }
                    }
                }
                continue;
@@ -391,7 +464,7 @@ extern int read_token(struct tokeniser * t) {
                    t->get_depth--;
                    continue;
                }
-               /* drop through */
+               /* fall through */
             default:
                 t->previous_token = t->token;
                 t->token = code;
@@ -439,6 +512,7 @@ extern struct tokeniser * create_tokeniser(symbol * p, char * file) {
     t->token_held = false;
     t->token = -2;
     t->previous_token = -2;
+    t->uplusmode = UPLUS_NONE;
     memset(t->token_disabled, 0, sizeof(t->token_disabled));
     return t;
 }
-- 
2.11.4.GIT