From 9eba9e87fc3ed257bf82c1bc7e2f77dba1aaf9a5 Mon Sep 17 00:00:00 2001 From: Steve Bennett Date: Sun, 27 Oct 2019 21:17:42 +1000 Subject: [PATCH] regexp -indices should return character indices Not byte indices Reported-by: dbohdan Signed-off-by: Steve Bennett --- jim-regexp.c | 15 +++++++-------- tests/regexp2.test | 4 ++++ 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/jim-regexp.c b/jim-regexp.c index 81f3207..3134598 100644 --- a/jim-regexp.c +++ b/jim-regexp.c @@ -281,16 +281,15 @@ int Jim_RegexpCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv) } } else { - int len = pmatch[j].rm_eo - pmatch[j].rm_so; - if (opt_indices) { - Jim_ListAppendElement(interp, resultObj, Jim_NewIntObj(interp, - offset + pmatch[j].rm_so)); - Jim_ListAppendElement(interp, resultObj, Jim_NewIntObj(interp, - offset + pmatch[j].rm_so + len - 1)); + /* rm_so and rm_eo are byte offsets from source_str. Convert both to char offsets from source_str */ + int so = utf8_strlen(source_str, pmatch[j].rm_so); + int eo = utf8_strlen(source_str, pmatch[j].rm_eo); + Jim_ListAppendElement(interp, resultObj, Jim_NewIntObj(interp, offset + so)); + Jim_ListAppendElement(interp, resultObj, Jim_NewIntObj(interp, offset + eo - 1)); } else { - Jim_AppendString(interp, resultObj, source_str + pmatch[j].rm_so, len); + Jim_AppendString(interp, resultObj, source_str + pmatch[j].rm_so, pmatch[j].rm_eo - pmatch[j].rm_so); } } @@ -311,7 +310,7 @@ int Jim_RegexpCmd(Jim_Interp *interp, int argc, Jim_Obj *const *argv) try_next_match: if (opt_all && (pattern[0] != '^' || (regcomp_flags & REG_NEWLINE)) && *source_str) { if (pmatch[0].rm_eo) { - offset += pmatch[0].rm_eo; + offset += utf8_strlen(source_str, pmatch[0].rm_eo); source_str += pmatch[0].rm_eo; } else { diff --git a/tests/regexp2.test b/tests/regexp2.test index 76735e6..1aee8cd 100644 --- a/tests/regexp2.test +++ b/tests/regexp2.test @@ -633,6 +633,7 @@ test regexpComp-16.5 {regexp -start with utf8} utf8 { test regexpComp-16.6 {regexp -start with utf8} utf8 { 
regsub -start 1 . \u0442\u0435\u0441\u0442 x } \u0442x\u0441\u0442 + test regexpComp-17.1 {regexp -inline} { regexp -inline b ababa } {b} @@ -654,6 +655,9 @@ test regexpComp-17.6 {regexp -inline no matches} { test regexpComp-17.7 {regexp -inline, no matchvars allowed} { list [catch {regexp -inline b abc match} msg] $msg } {1 {regexp match variables not allowed when using -inline}} +test regexpComp-17.8 {regexp -indices utf8} utf8 { + regexp -all -inline -start 1 -indices . \u0442\u0435\u0441\u0442 +} {{1 1} {2 2} {3 3}} test regexpComp-18.1 {regexp -all} { regexp -all b bbbbb -- 2.11.4.GIT