1 // class template regex -*- C++ -*-
3 // Copyright (C) 2013-2018 Free Software Foundation, Inc.
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
26 * @file bits/regex_scanner.tcc
27 * This is an internal header file, included by other library headers.
28 * Do not attempt to use it directly. @headername{regex}
31 // FIXME make comments doxygen format.
33 // N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep, and awk.
35 // 1) grep is basic except '\n' is treated as '|'
36 // 2) egrep is extended except '\n' is treated as '|'
37 // 3) awk is extended except special escaping rules, and there's no back-reference support.
42 // ECMAScript: ECMA-262 15.10
45 // http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html
47 // awk: http://pubs.opengroup.org/onlinepubs/000095399/utilities/awk.html
49 namespace std _GLIBCXX_VISIBILITY(default)
51 _GLIBCXX_BEGIN_NAMESPACE_VERSION
// Constructs a scanner over the characters between __begin and __end.
// __flags is forwarded to _ScannerBase and the ctype facet is obtained
// from __loc. The escape handler is fixed here: the ECMAScript handler
// when _M_is_ecma() is true, the POSIX handler otherwise.
55 template<typename _CharT>
57 _Scanner(typename _Scanner::_IterT __begin,
58 typename _Scanner::_IterT __end,
59 _FlagT __flags, std::locale __loc)
60 : _ScannerBase(__flags),
61 _M_current(__begin), _M_end(__end),
62 _M_ctype(std::use_facet<_CtypeT>(__loc)),
63 _M_eat_escape(_M_is_ecma()
64 ? &_Scanner::_M_eat_escape_ecma
65 : &_Scanner::_M_eat_escape_posix)
// Produces the next token based on the remaining input and _M_state.
// NOTE(review): the line naming this function is not visible; this
// description is derived from the statements below only.
68 template<typename _CharT>
// No input left: the token becomes end-of-file.
73 if (_M_current == _M_end)
75 _M_token = _S_token_eof;
// Otherwise the scanner state selects how the input is read; the three
// states tested are normal, in-bracket and in-brace.
79 if (_M_state == _S_state_normal)
81 else if (_M_state == _S_state_in_bracket)
83 else if (_M_state == _S_state_in_brace)
// NOTE(review): presumably the fall-through for any other state, which
// would be a logic error -- confirm against the enclosing branch.
87 __glibcxx_assert(false);
91 // Differences between styles:
92 // 1) "\(", "\)", "\{" in basic. It's not escaping.
93 // 2) "(?:", "(?=", "(?!" in ECMAScript.
// Scans one token outside of bracket and brace expressions, setting
// _M_token and, for tokens that carry text, _M_value.
94 template<typename _CharT>
// Consume one character; _M_current then refers to the one after it.
99 auto __c = *_M_current++;
// A character absent from _M_spec_char is an ordinary character.
101 if (std::strchr(_M_spec_char, _M_ctype.narrow(__c, ' ')) == nullptr)
103 _M_token = _S_token_ord_char;
104 _M_value.assign(1, __c);
// An escape needs a following character; running out of input here is
// reported as error_escape.
109 if (_M_current == _M_end)
111 regex_constants::error_escape,
112 "Unexpected end of regex when escaping.");
// When the next character is none of '(', ')' and '{', the escape is
// handed to the handler chosen in the constructor (style difference 1).
// NOTE(review): the first operand of this condition is not visible here.
115 || (*_M_current != '('
116 && *_M_current != ')'
117 && *_M_current != '{'))
119 (this->*_M_eat_escape)();
// ECMAScript only: a '?' right after the parenthesis opens a special group.
// NOTE(review): *_M_current is read without a visible _M_current != _M_end
// check on this line -- confirm a pattern ending in '(' cannot reach it
// with _M_current == _M_end.
126 if (_M_is_ecma() && *_M_current == '?')
// Input ended right after the '?'.
128 if (++_M_current == _M_end)
130 regex_constants::error_paren,
131 "Unexpected end of regex when in an open parenthesis.");
// ':' -- reported as a no-group subexpression begin.
133 if (*_M_current == ':')
136 _M_token = _S_token_subexpr_no_group_begin;
// '=' -- lookahead begin, tagged 'p' in _M_value.
138 else if (*_M_current == '=')
141 _M_token = _S_token_subexpr_lookahead_begin;
142 _M_value.assign(1, 'p');
// '!' -- lookahead begin, tagged 'n' in _M_value.
144 else if (*_M_current == '!')
147 _M_token = _S_token_subexpr_lookahead_begin;
148 _M_value.assign(1, 'n');
// NOTE(review): presumably the branch for any other character after the
// '?', which is rejected with error_paren.
152 regex_constants::error_paren,
153 "Invalid special open parenthesis.");
// With the nosubs flag set, a plain group is also reported as a no-group
// subexpression begin; otherwise as a regular subexpression begin.
155 else if (_M_flags & regex_constants::nosubs)
156 _M_token = _S_token_subexpr_no_group_begin;
158 _M_token = _S_token_subexpr_begin;
161 _M_token = _S_token_subexpr_end;
// Switch to bracket state and remember that nothing has been read inside
// the bracket yet; a leading '^' selects the negated-begin token.
164 _M_state = _S_state_in_bracket;
165 _M_at_bracket_start = true;
166 if (_M_current != _M_end && *_M_current == '^')
168 _M_token = _S_token_bracket_neg_begin;
172 _M_token = _S_token_bracket_begin;
// Switch to brace state for an interval expression.
176 _M_state = _S_state_in_brace;
177 _M_token = _S_token_interval_begin;
// Characters other than ']' and '}' are mapped through _M_token_tbl by
// their narrowed value; the table is walked until an entry whose first
// member is '\0'.
179 else if (__c != ']' && __c != '}')
181 auto __it = _M_token_tbl;
182 auto __narrowc = _M_ctype.narrow(__c, '\0');
183 for (; __it->first != '\0'; ++__it)
184 if (__it->first == __narrowc)
186 _M_token = __it->second;
// NOTE(review): presumably reached only when the table has no entry for
// the character -- confirm against the loop's exit path.
189 __glibcxx_assert(false);
193 _M_token = _S_token_ord_char;
194 _M_value.assign(1, __c);
198 // Differences between styles:
199 // 1) different semantics of "[]" and "[^]".
200 // 2) Escaping in bracket expr.
// Scans one token while inside a bracket expression.
201 template<typename _CharT>
// A bracket expression must be closed before the input ends.
206 if (_M_current == _M_end)
208 regex_constants::error_brack,
209 "Unexpected end of regex when in bracket expression.");
// Consume one character and classify it below.
211 auto __c = *_M_current++;
214 _M_token = _S_token_bracket_dash;
// A class opener needs at least one more character after it.
217 if (_M_current == _M_end)
218 __throw_regex_error(regex_constants::error_brack,
219 "Unexpected character class open bracket.");
// The next character selects the class kind: '.' collating symbol,
// ':' character class name, '=' equivalence class name. That delimiter
// is consumed and passed to _M_eat_class, which reads the name.
221 if (*_M_current == '.')
223 _M_token = _S_token_collsymbol;
224 _M_eat_class(*_M_current++);
226 else if (*_M_current == ':')
228 _M_token = _S_token_char_class_name;
229 _M_eat_class(*_M_current++);
231 else if (*_M_current == '=')
233 _M_token = _S_token_equiv_class_name;
234 _M_eat_class(*_M_current++);
238 _M_token = _S_token_ord_char;
239 _M_value.assign(1, __c);
242 // In POSIX, when encountering "[]" or "[^]", the ']' is interpreted
243 // literally. So "[]]" and "[^]]" are valid regexes. See the testcases
244 // `*/empty_range.cc`.
// Hence ']' ends the expression always in ECMAScript, and in the other
// styles only when it is not the first thing inside the bracket.
245 else if (__c == ']' && (_M_is_ecma() || !_M_at_bracket_start))
247 _M_token = _S_token_bracket_end;
248 _M_state = _S_state_normal;
250 // ECMAScript and awk permit escaping in bracket.
251 else if (__c == '\\' && (_M_is_ecma() || _M_is_awk()))
252 (this->*_M_eat_escape)();
255 _M_token = _S_token_ord_char;
256 _M_value.assign(1, __c);
// After this token the scanner is no longer at the start of the bracket.
258 _M_at_bracket_start = false;
261 // Differences between styles:
262 // 1) "\}" in basic style.
// Scans one token while inside a brace (interval) expression.
263 template<typename _CharT>
// The interval must be closed before the input ends.
268 if (_M_current == _M_end)
270 regex_constants::error_brace,
271 "Unexpected end of regex when in brace expression.");
273 auto __c = *_M_current++;
// A digit starts a repeat count; all directly following digits are
// appended to _M_value.
275 if (_M_ctype.is(_CtypeT::digit, __c))
277 _M_token = _S_token_dup_count;
278 _M_value.assign(1, __c);
279 while (_M_current != _M_end
280 && _M_ctype.is(_CtypeT::digit, *_M_current))
281 _M_value += *_M_current++;
284 _M_token = _S_token_comma;
// Basic style: a backslash followed by '}' ends the interval and the
// scanner returns to normal state; the error below belongs to this
// branch as well (NOTE(review): its exact condition is not visible).
286 else if (_M_is_basic())
288 if (__c == '\\' && _M_current != _M_end && *_M_current == '}')
290 _M_state = _S_state_normal;
291 _M_token = _S_token_interval_end;
295 __throw_regex_error(regex_constants::error_badbrace,
296 "Unexpected character in brace expression.");
// NOTE(review): presumably the closing path for the non-basic styles --
// its condition is not visible. It also restores normal state.
300 _M_state = _S_state_normal;
301 _M_token = _S_token_interval_end;
304 __throw_regex_error(regex_constants::error_badbrace,
305 "Unexpected character in brace expression.");
// Reads the character(s) that follow a backslash and sets _M_token and
// _M_value accordingly. NOTE(review): the line naming this function is
// not visible; presumably this is the ECMAScript escape handler.
308 template<typename _CharT>
// A backslash must be followed by at least one character.
313 if (_M_current == _M_end)
314 __throw_regex_error(regex_constants::error_escape,
315 "Unexpected end of regex when escaping.");
// Consume the escaped character and look it up with _M_find_escape.
317 auto __c = *_M_current++;
318 auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
// A lookup hit yields an ordinary character whose value is *__pos; for
// 'b' the hit is used only while inside a bracket expression.
320 if (__pos != nullptr && (__c != 'b' || _M_state == _S_state_in_bracket))
322 _M_token = _S_token_ord_char;
323 _M_value.assign(1, *__pos);
// Word-boundary tokens, tagged 'p' or 'n' in _M_value.
327 _M_token = _S_token_word_bound;
328 _M_value.assign(1, 'p');
332 _M_token = _S_token_word_bound;
333 _M_value.assign(1, 'n');
// Quoted class: the escaped character itself is kept as the value.
343 _M_token = _S_token_quoted_class;
344 _M_value.assign(1, __c);
// Control code: the character after it is taken as the value, so one
// more character of input is required.
348 if (_M_current == _M_end)
350 regex_constants::error_escape,
351 "Unexpected end of regex when reading control code.");
352 _M_token = _S_token_ord_char;
353 _M_value.assign(1, *_M_current++);
// 'x' takes exactly 2 hex digits and 'u' exactly 4; a missing or
// non-hex character is reported as error_escape.
355 else if (__c == 'x' || __c == 'u')
358 for (int __i = 0; __i < (__c == 'x' ? 2 : 4); __i++)
360 if (_M_current == _M_end
361 || !_M_ctype.is(_CtypeT::xdigit, *_M_current))
363 regex_constants::error_escape,
364 "Unexpected end of regex when ascii character.");
365 _M_value += *_M_current++;
367 _M_token = _S_token_hex_num;
369 // ECMAScript recognizes multi-digit back-references.
370 else if (_M_ctype.is(_CtypeT::digit, __c))
372 _M_value.assign(1, __c);
373 while (_M_current != _M_end
374 && _M_ctype.is(_CtypeT::digit, *_M_current))
375 _M_value += *_M_current++;
376 _M_token = _S_token_backref;
// NOTE(review): presumably the final fall-through -- the escaped
// character is taken as an ordinary character.
380 _M_token = _S_token_ord_char;
381 _M_value.assign(1, __c);
385 // Differences between styles:
386 // 1) Extended doesn't support backref, but basic does.
// Handles the character that follows a backslash for the POSIX styles,
// setting _M_token and _M_value.
387 template<typename _CharT>
390 _M_eat_escape_posix()
// A backslash must be followed by at least one character.
392 if (_M_current == _M_end)
393 __throw_regex_error(regex_constants::error_escape,
394 "Unexpected end of regex when escaping.");
// Look at the escaped character without consuming it on this line.
396 auto __c = *_M_current;
397 auto __pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0'));
// An escaped special character stands for itself. strchr also matches
// the string's terminating '\0', so a hit on the terminator is excluded.
399 if (__pos != nullptr && *__pos != '\0')
401 _M_token = _S_token_ord_char;
402 _M_value.assign(1, __c);
404 // We MUST judge awk before handling backrefs. There's no backref in awk.
405 else if (_M_is_awk())
// Basic style only: a digit other than '0' is a back-reference.
410 else if (_M_is_basic() && _M_ctype.is(_CtypeT::digit, __c) && __c != '0')
412 _M_token = _S_token_backref;
413 _M_value.assign(1, __c);
417 #ifdef __STRICT_ANSI__
418 // POSIX says it is undefined to escape ordinary characters
419 __throw_regex_error(regex_constants::error_escape,
420 "Unexpected escape character.");
// NOTE(review): presumably the non-strict alternative of the conditional
// above -- the escaped character is accepted as an ordinary character.
422 _M_token = _S_token_ord_char;
423 _M_value.assign(1, __c);
// Reads the character(s) that follow a backslash and sets _M_token and
// _M_value. NOTE(review): the line naming this function is not visible;
// presumably this is the handler used by the awk branch of
// _M_eat_escape_posix -- confirm.
429 template<typename _CharT>
// Consume the escaped character and look it up with _M_find_escape.
434 auto __c = *_M_current++;
435 auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
// A lookup hit yields an ordinary character whose value is *__pos.
437 if (__pos != nullptr)
439 _M_token = _S_token_ord_char;
440 _M_value.assign(1, *__pos);
442 // \ddd for oct representation
443 else if (_M_ctype.is(_CtypeT::digit, __c)
// The first digit starts the value; following digits other than '8' and
// '9' are appended while input remains, and the result is an octal
// number token.
447 _M_value.assign(1, __c);
450 && _M_current != _M_end
451 && _M_ctype.is(_CtypeT::digit, *_M_current)
452 && *_M_current != '8'
453 && *_M_current != '9';
455 _M_value += *_M_current++;
456 _M_token = _S_token_oct_num;
// NOTE(review): presumably the fall-through for any other escaped
// character, which is rejected.
460 __throw_regex_error(regex_constants::error_escape,
461 "Unexpected escape character.");
464 // Eats a character class or throws an exception.
465 // __ch could be ':', '.' or '=', _M_current is the char after ']' when
// this function returns normally. The class name read is left in _M_value.
467 template<typename _CharT>
470 _M_eat_class(char __ch)
// Collect every character up to the next __ch (or the end of input)
// into _M_value, which is cleared first.
472 for (_M_value.clear(); _M_current != _M_end && *_M_current != __ch;)
473 _M_value += *_M_current++;
// The name must be followed by __ch and then ']'; both are consumed by
// the post-increments. Running out of input or seeing anything else
// takes the error path below.
474 if (_M_current == _M_end
475 || *_M_current++ != __ch
476 || _M_current == _M_end // skip __ch
477 || *_M_current++ != ']') // skip ']'
// Reported as error_ctype or error_collate.
// NOTE(review): the condition choosing between the two is not visible.
480 __throw_regex_error(regex_constants::error_ctype,
481 "Unexpected end of character class.");
483 __throw_regex_error(regex_constants::error_collate,
484 "Unexpected end of character class.");
// Debug-only helper (compiled when _GLIBCXX_DEBUG is defined): writes a
// one-line description of a token to ostr. Tokens that carry text also
// print _M_value. NOTE(review): the switch subject is not visible;
// presumably it is _M_token.
488 #ifdef _GLIBCXX_DEBUG
489 template<typename _CharT>
492 _M_print(std::ostream& ostr)
496 case _S_token_anychar:
497 ostr << "any-character\n";
499 case _S_token_backref:
502 case _S_token_bracket_begin:
503 ostr << "bracket-begin\n";
505 case _S_token_bracket_neg_begin:
506 ostr << "bracket-neg-begin\n";
508 case _S_token_bracket_end:
509 ostr << "bracket-end\n";
511 case _S_token_char_class_name:
512 ostr << "char-class-name \"" << _M_value << "\"\n";
514 case _S_token_closure0:
515 ostr << "closure0\n";
517 case _S_token_closure1:
518 ostr << "closure1\n";
520 case _S_token_collsymbol:
521 ostr << "collsymbol \"" << _M_value << "\"\n";
526 case _S_token_dup_count:
527 ostr << "dup count: " << _M_value << "\n";
532 case _S_token_equiv_class_name:
533 ostr << "equiv-class-name \"" << _M_value << "\"\n";
535 case _S_token_interval_begin:
536 ostr << "interval begin\n";
538 case _S_token_interval_end:
539 ostr << "interval end\n";
541 case _S_token_line_begin:
542 ostr << "line begin\n";
544 case _S_token_line_end:
545 ostr << "line end\n";
553 case _S_token_ord_char:
554 ostr << "ordinary character: \"" << _M_value << "\"\n";
556 case _S_token_subexpr_begin:
557 ostr << "subexpr begin\n";
559 case _S_token_subexpr_no_group_begin:
560 ostr << "no grouping subexpr begin\n";
562 case _S_token_subexpr_lookahead_begin:
563 ostr << "lookahead subexpr begin\n";
565 case _S_token_subexpr_end:
566 ostr << "subexpr end\n";
568 case _S_token_unknown:
569 ostr << "-- unknown token --\n";
571 case _S_token_oct_num:
572 ostr << "oct number " << _M_value << "\n";
574 case _S_token_hex_num:
575 ostr << "hex number " << _M_value << "\n";
577 case _S_token_quoted_class:
578 ostr << "quoted class " << "\\" << _M_value << "\n";
// NOTE(review): presumably the default label for a token with no case
// above -- confirm.
581 _GLIBCXX_DEBUG_ASSERT(false);
587 } // namespace __detail
588 _GLIBCXX_END_NAMESPACE_VERSION