From aa2d0b07edcfc884f0d778eaa89c201094c43458 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Tue, 5 Apr 2011 12:15:39 +0300 Subject: [PATCH] Ticket #1882: PCRE search: escape sequence support in replacements, UTF8 support (just a flag for libPCRE) Enables use of escape sequences inside regex replace strings, Enables UTF-8 caseless search in PCRE. Supported escape sequences: \DEC, \xHEX, \0OCT, \n, \t, \v, \b, \r, \f, \a. Any of them could be enclosed into \{}. Signed-off-by: Slava Zanko --- lib/search/regex.c | 233 +++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 210 insertions(+), 23 deletions(-) diff --git a/lib/search/regex.c b/lib/search/regex.c index 69179723f..97f5fa907 100644 --- a/lib/search/regex.c +++ b/lib/search/regex.c @@ -5,7 +5,8 @@ Copyright (C) 2009 The Free Software Foundation, Inc. Written by: - Slava Zanko , 2009. + Slava Zanko , 2009,2010,2011 + Vitaliy Filippov , 2011 This file is part of the Midnight Commander. @@ -53,6 +54,13 @@ typedef enum REPLACE_T_LOW_TRANSFORM = 8 } replace_transform_type_t; +typedef enum +{ + REPLACE_PREPARE_T_NOTHING_SPECIAL = -1, + REPLACE_PREPARE_T_REPLACE_FLAG = -2, + REPLACE_PREPARE_T_ESCAPE_SEQ = -3, +} replace_prepare_t; + /*** file scope variables ************************************************************************/ /*** file scope functions ************************************************************************/ @@ -374,6 +382,71 @@ mc_search_regex__get_token_by_num (const mc_search_t * lc_mc_search, gsize lc_in /* --------------------------------------------------------------------------------------------- */ +static gboolean +mc_search_regex__replace_handle_esc_seq (const GString * replace_str, const gsize current_pos, + gsize * skip_len, int *ret, char *next_char) +{ + char *curr_str = &(replace_str->str[current_pos]); + + *next_char = *(curr_str + 1); + + if (replace_str->len > current_pos + 2) + { + if (*next_char == '{') + { + for (*skip_len = 2; /* \{ */ + current_pos + *skip_len < replace_str->len + && (*(curr_str + *skip_len)) != '}'; (*skip_len)++); + if (current_pos + *skip_len < replace_str->len) /* } */ + (*skip_len)++; + *ret = REPLACE_PREPARE_T_ESCAPE_SEQ; + return FALSE; + } + + if (*next_char == 'x') + { + *skip_len = 2; /* \x */ + *next_char = *(curr_str + 2); + if (*next_char == '{') + { + for (*skip_len = 3; /* \x{ */ + current_pos + *skip_len < replace_str->len + && (*(curr_str + *skip_len)) != '}'; (*skip_len)++); + if (current_pos + *skip_len < replace_str->len) + (*skip_len)++; + *ret = REPLACE_PREPARE_T_ESCAPE_SEQ; + return FALSE; + } + else if (!g_ascii_isxdigit ((guchar) * next_char)) + { + *skip_len = 2; /* \x without number behind */ + *ret = REPLACE_PREPARE_T_NOTHING_SPECIAL; + return FALSE; + } + else + { + *next_char = *(curr_str + 3); + if (!g_ascii_isxdigit ((guchar) * next_char)) + *skip_len = 3; /* \xH */ + else + *skip_len = 4; /* \xHH */ + *ret = REPLACE_PREPARE_T_ESCAPE_SEQ; + return FALSE; + } + } + } + + if (strchr ("ntvbrfa", *next_char) != NULL) + { + *skip_len = 2; + *ret = REPLACE_PREPARE_T_ESCAPE_SEQ; + return FALSE; + } + return TRUE; +} + +/* --------------------------------------------------------------------------------------------- */ + static int mc_search_regex__process_replace_str (const GString * replace_str, const gsize current_pos, gsize * skip_len, replace_transform_type_t * replace_flags) @@ -383,16 +456,17 @@ mc_search_regex__process_replace_str (const GString * replace_str, const gsize c const char *curr_str = &(replace_str->str[current_pos]); if (current_pos > replace_str->len) - return -1; + return REPLACE_PREPARE_T_NOTHING_SPECIAL; *skip_len = 0; - if (*curr_str == '$' && *(curr_str + 1) == '{' && (*(curr_str + 2) & (char) 0xf0) == 0x30) + if ((*curr_str == '$') && (*(curr_str + 1) == '{') && ((*(curr_str + 2) & (char) 0xf0) == 0x30) + && (replace_str->len > current_pos + 2)) { if (strutils_is_char_escaped (replace_str->str, curr_str)) { *skip_len = 1; - return -1; + return REPLACE_PREPARE_T_NOTHING_SPECIAL; } for (*skip_len = 0; @@ -400,36 +474,43 @@ mc_search_regex__process_replace_str (const GString * replace_str, const gsize c && (*(curr_str + 2 + *skip_len) & (char) 0xf0) == 0x30; (*skip_len)++); if (*(curr_str + 2 + *skip_len) != '}') - return -1; + return REPLACE_PREPARE_T_NOTHING_SPECIAL; tmp_str = g_strndup (curr_str + 2, *skip_len); if (tmp_str == NULL) - return -1; + return REPLACE_PREPARE_T_NOTHING_SPECIAL; ret = atoi (tmp_str); g_free (tmp_str); *skip_len += 3; /* ${} */ - return ret; + return ret; /* capture buffer index >= 0 */ } - if (*curr_str == '\\') + if ((*curr_str == '\\') && (replace_str->len > current_pos + 1)) { + char next_char; + if (strutils_is_char_escaped (replace_str->str, curr_str)) { *skip_len = 1; - return -1; + return REPLACE_PREPARE_T_NOTHING_SPECIAL; } if (g_ascii_isdigit (*(curr_str + 1))) { - ret = g_ascii_digit_value (*(curr_str + 1)); + ret = g_ascii_digit_value (*(curr_str + 1)); /* capture buffer index >= 0 */ *skip_len = 2; /* \\ and one digit */ return ret; } - ret = -2; + + if (!mc_search_regex__replace_handle_esc_seq + (replace_str, current_pos, skip_len, &ret, &next_char)) + return ret; + + ret = REPLACE_PREPARE_T_REPLACE_FLAG; *skip_len += 2; - switch (*(curr_str + 1)) + switch (next_char) { case 'U': *replace_flags |= REPLACE_T_UPP_TRANSFORM; @@ -449,13 +530,15 @@ mc_search_regex__process_replace_str (const GString * replace_str, const gsize c *replace_flags = REPLACE_T_NO_TRANSFORM; break; default: - ret = -1; + ret = REPLACE_PREPARE_T_NOTHING_SPECIAL; break; } } return ret; } +/* --------------------------------------------------------------------------------------------- */ + static void mc_search_regex__process_append_str (GString * dest_str, const char *from, gsize len, replace_transform_type_t * replace_flags) @@ -517,7 +600,79 @@ mc_search_regex__process_append_str (GString * dest_str, const char *from, gsize } +/* --------------------------------------------------------------------------------------------- */ + +static void +mc_search_regex__process_escape_sequence (GString * dest_str, const char *from, gsize len, + replace_transform_type_t * replace_flags) +{ + gsize i = 0; + char c = 0; + + if (len == (gsize) (-1)) + len = strlen (from); + if (len == 0) + return; + if (from[i] == '{') + i++; + if (i >= len) + return; + if (from[i] == 'x') + { + i++; + if (i < len && from[i] == '{') + i++; + for (; i < len; i++) + { + if (from[i] >= '0' && from[i] <= '9') + c = c * 16 + from[i] - '0'; + else if (from[i] >= 'a' && from[i] <= 'f') + c = c * 16 + 10 + from[i] - 'a'; + else if (from[i] >= 'A' && from[i] <= 'F') + c = c * 16 + 10 + from[i] - 'A'; + else + break; + } + } + else if (from[i] >= '0' && from[i] <= '9') + for (; i < len && from[i] >= '0' && from[i] <= '7'; i++) + c = c * 8 + from[i] - '0'; + else + { + switch (from[i]) + { + case 'n': + c = '\n'; + break; + case 't': + c = '\t'; + break; + case 'v': + c = '\v'; + break; + case 'b': + c = '\b'; + break; + case 'r': + c = '\r'; + break; + case 'f': + c = '\f'; + break; + case 'a': + c = '\a'; + break; + default: + mc_search_regex__process_append_str (dest_str, from, len, replace_flags); + return; + } + } + g_string_append_len (dest_str, &c, 1); +} + +/* --------------------------------------------------------------------------------------------- */ /*** public functions ****************************************************************************/ +/* --------------------------------------------------------------------------------------------- */ void mc_search__cond_struct_new_init_regex (const char *charset, mc_search_t * lc_mc_search, @@ -525,10 +680,6 @@ mc_search__cond_struct_new_init_regex (const char *charset, mc_search_t * lc_mc_ { #ifdef SEARCH_TYPE_GLIB GError *error = NULL; -#else /* SEARCH_TYPE_GLIB */ - const char *error; - int erroffset; -#endif /* SEARCH_TYPE_GLIB */ if (!lc_mc_search->is_case_sensitive) { @@ -538,10 +689,9 @@ mc_search__cond_struct_new_init_regex (const char *charset, mc_search_t * lc_mc_ mc_search_cond->str = mc_search__cond_struct_new_regex_ci_str (charset, tmp); g_string_free (tmp, TRUE); } -#ifdef SEARCH_TYPE_GLIB mc_search_cond->regex_handle = - g_regex_new (mc_search_cond->str->str, G_REGEX_OPTIMIZE | G_REGEX_RAW | G_REGEX_DOTALL, 0, - &error); + g_regex_new (mc_search_cond->str->str, G_REGEX_OPTIMIZE | G_REGEX_RAW | G_REGEX_DOTALL, + 0, &error); if (error != NULL) { @@ -551,8 +701,30 @@ mc_search__cond_struct_new_init_regex (const char *charset, mc_search_t * lc_mc_ return; } #else /* SEARCH_TYPE_GLIB */ + const char *error; + int erroffset; + int pcre_options = PCRE_EXTRA | PCRE_MULTILINE; + + if (str_isutf8(charset)) + { + pcre_options |= PCRE_UTF8; + if (lc_mc_search->is_case_sensitive) + pcre_options |= PCRE_CASELESS; + } + else + { + if (!lc_mc_search->is_case_sensitive) + { + GString *tmp; + + tmp = mc_search_cond->str; + mc_search_cond->str = mc_search__cond_struct_new_regex_ci_str (charset, tmp); + g_string_free (tmp, TRUE); + } + } + mc_search_cond->regex_handle = - pcre_compile (mc_search_cond->str->str, PCRE_EXTRA, &error, &erroffset, NULL); + pcre_compile (mc_search_cond->str->str, pcre_options, &error, &erroffset, NULL); if (mc_search_cond->regex_handle == NULL) { lc_mc_search->error = MC_SEARCH_E_REGEX_COMPILE; @@ -708,7 +880,7 @@ mc_search_regex_prepare_replace_str (mc_search_t * lc_mc_search, GString * repla { lc_index = mc_search_regex__process_replace_str (replace_str, loop, &len, &replace_flags); - if (lc_index == -1) + if (lc_index == REPLACE_PREPARE_T_NOTHING_SPECIAL) { if (len != 0) { @@ -723,7 +895,7 @@ mc_search_regex_prepare_replace_str (mc_search_t * lc_mc_search, GString * repla continue; } - if (lc_index == -2) + if (lc_index == REPLACE_PREPARE_T_REPLACE_FLAG) { if (loop) mc_search_regex__process_append_str (ret, prev_str, @@ -734,6 +906,21 @@ mc_search_regex_prepare_replace_str (mc_search_t * lc_mc_search, GString * repla continue; } + /* escape sequence */ + if (lc_index == REPLACE_PREPARE_T_ESCAPE_SEQ) + { + mc_search_regex__process_append_str (ret, prev_str, + replace_str->str + loop - prev_str, + &replace_flags); + /* call process_escape_sequence without starting '\\' */ + mc_search_regex__process_escape_sequence (ret, replace_str->str + loop + 1, len - 1, + &replace_flags); + prev_str = replace_str->str + loop + len; + loop += len - 1; + continue; + } + + /* invalid capture buffer number */ if (lc_index > lc_mc_search->num_results) { g_string_free (ret, TRUE); -- 2.11.4.GIT