libstdc++-v3/include/bits/regex_scanner.tcc

   1 // class template regex -*- C++ -*-
   2
   3 // Copyright (C) 2013 Free Software Foundation, Inc.
   4 //
   5 // This file is part of the GNU ISO C++ Library.  This library is free
   6 // software; you can redistribute it and/or modify it under the
   7 // terms of the GNU General Public License as published by the
   8 // Free Software Foundation; either version 3, or (at your option)
   9 // any later version.
  10
  11 // This library is distributed in the hope that it will be useful,
  12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 // GNU General Public License for more details.
  15
  16 // Under Section 7 of GPL version 3, you are granted additional
  17 // permissions described in the GCC Runtime Library Exception, version
  18 // 3.1, as published by the Free Software Foundation.
  19
  20 // You should have received a copy of the GNU General Public License and
  21 // a copy of the GCC Runtime Library Exception along with this program;
  22 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  23 // <http://www.gnu.org/licenses/>.
  24
  25 /**
  26  *  @file bits/regex_scanner.tcc
  27  *  This is an internal header file, included by other library headers.
  28  *  Do not attempt to use it directly. @headername{regex}
  29  */
  30
  31 // FIXME make comments doxygen format.
  32
  33 // N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep
  34 // and awk
  35 // 1) grep is basic except '\n' is treated as '|'
  36 // 2) egrep is extended except '\n' is treated as '|'
  37 // 3) awk is extended except special escaping rules, and there's no
  38 //    back-reference.
  39 //
  40 // References:
  41 //
  42 // ECMAScript: ECMA-262 15.10
  43 //
  44 // basic, extended:
  45 // http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html
  46 //
  47 // awk: http://pubs.opengroup.org/onlinepubs/000095399/utilities/awk.html
  48
  49 namespace std _GLIBCXX_VISIBILITY(default)
  50 {
  51 namespace __detail
  52 {
  53 _GLIBCXX_BEGIN_NAMESPACE_VERSION
  54
  55   template<typename _FwdIter>
  56     _Scanner<_FwdIter>::
  57     _Scanner(_FwdIter __begin, _FwdIter __end,
  58              _FlagT __flags, std::locale __loc)
  59     : _M_state(_S_state_normal), _M_current(__begin), _M_end(__end),
  60       _M_flags(__flags),
  61       _M_ctype(std::use_facet<_CtypeT>(__loc)),
  62       _M_at_bracket_start(false),
  63       _M_token_map
  64         {
  65           {'^', _S_token_line_begin},
  66           {'$', _S_token_line_end},
  67           {'.', _S_token_anychar},
  68           {'*', _S_token_closure0},
  69           {'+', _S_token_closure1},
  70           {'?', _S_token_opt},
  71           {'|', _S_token_or},
  72           // grep and egrep
  73           {'\n', _S_token_or},
  74         },
  75       _M_ecma_escape_map
  76         {
  77           {'0', '\0'},
  78           {'b', '\b'},
  79           {'f', '\f'},
  80           {'n', '\n'},
  81           {'r', '\r'},
  82           {'t', '\t'},
  83           {'v', '\v'},
  84         },
  85       _M_awk_escape_map
  86         {
  87           {'"', '"'},
  88           {'/', '/'},
  89           {'\\', '\\'},
  90           {'a', '\a'},
  91           {'b', '\b'},
  92           {'f', '\f'},
  93           {'n', '\n'},
  94           {'r', '\r'},
  95           {'t', '\t'},
  96           {'v', '\v'},
  97         },
  98       _M_ecma_spec_char
  99         {
 100           '^',
 101           '$',
 102           '\\',
 103           '.',
 104           '*',
 105           '+',
 106           '?',
 107           '(',
 108           ')',
 109           '[',
 110           ']',
 111           '{',
 112           '}',
 113           '|',
 114         },
 115       _M_basic_spec_char
 116         {
 117           '.',
 118           '[',
 119           '\\',
 120           '*',
 121           '^',
 122           '$',
 123         },
 124       _M_extended_spec_char
 125         {
 126           '.',
 127           '[',
 128           '\\',
 129           '(',
 130           ')',
 131           '*',
 132           '+',
 133           '?',
 134           '{',
 135           '|',
 136           '^',
 137           '$',
 138         },
 139       _M_escape_map(_M_is_ecma()
 140                     ? _M_ecma_escape_map
 141                     : _M_awk_escape_map),
 142       _M_spec_char(_M_is_ecma()
 143                    ? _M_ecma_spec_char
 144                    : _M_is_basic()
 145                    ? _M_basic_spec_char
 146                    : _M_extended_spec_char),
 147       _M_eat_escape(_M_is_ecma()
 148                     ? &_Scanner::_M_eat_escape_ecma
 149                     : &_Scanner::_M_eat_escape_posix)
 150     { _M_advance(); }
 151
 152   template<typename _FwdIter>
 153     void
 154     _Scanner<_FwdIter>::
 155     _M_advance()
 156     {
 157       if (_M_current == _M_end)
 158         {
 159           _M_token = _S_token_eof;
 160           return;
 161         }
 162
 163       if (_M_state == _S_state_normal)
 164         _M_scan_normal();
 165       else if (_M_state == _S_state_in_bracket)
 166         _M_scan_in_bracket();
 167       else if (_M_state == _S_state_in_brace)
 168         _M_scan_in_brace();
 169       else
 170         _GLIBCXX_DEBUG_ASSERT(false);
 171     }
 172
 173   // Differences between styles:
 174   // 1) "\(", "\)", "\{" in basic. It's not escaping.
 175   // 2) "(?:", "(?=", "(?!" in ECMAScript.
 176   template<typename _FwdIter>
 177     void
 178     _Scanner<_FwdIter>::
 179     _M_scan_normal()
 180     {
 181       auto __c = *_M_current++;
 182
 183       if (__c == '\\')
 184         {
 185           if (_M_current == _M_end)
 186             __throw_regex_error(regex_constants::error_escape);
 187
 188           if (!_M_is_basic()
 189               || (*_M_current != '('
 190                   && *_M_current != ')'
 191                   && *_M_current != '{'))
 192             {
 193               (this->*_M_eat_escape)();
 194               return;
 195             }
 196           __c = *_M_current++;
 197         }
 198       if (__c == '(')
 199         {
 200           if (_M_is_ecma() && *_M_current == '?')
 201             {
 202               if (++_M_current == _M_end)
 203                 __throw_regex_error(regex_constants::error_paren);
 204
 205               if (*_M_current == ':')
 206                 {
 207                   ++_M_current;
 208                   _M_token = _S_token_subexpr_no_group_begin;
 209                 }
 210               else if (*_M_current == '=')
 211                 {
 212                   ++_M_current;
 213                   _M_token = _S_token_subexpr_lookahead_begin;
 214                   _M_value.assign(1, 'p');
 215                 }
 216               else if (*_M_current == '!')
 217                 {
 218                   ++_M_current;
 219                   _M_token = _S_token_subexpr_lookahead_begin;
 220                   _M_value.assign(1, 'n');
 221                 }
 222               else
 223                 __throw_regex_error(regex_constants::error_paren);
 224             }
 225           else
 226             _M_token = _S_token_subexpr_begin;
 227         }
 228       else if (__c == ')')
 229         _M_token = _S_token_subexpr_end;
 230       else if (__c == '[')
 231         {
 232           _M_state = _S_state_in_bracket;
 233           _M_at_bracket_start = true;
 234           if (_M_current != _M_end && *_M_current == '^')
 235             {
 236               _M_token = _S_token_bracket_neg_begin;
 237               ++_M_current;
 238             }
 239           else
 240             _M_token = _S_token_bracket_begin;
 241         }
 242       else if (__c == '{')
 243         {
 244           _M_state = _S_state_in_brace;
 245           _M_token = _S_token_interval_begin;
 246         }
 247       else if ((_M_spec_char.count(_M_ctype.narrow(__c, '\0'))
 248                 && __c != ']'
 249                 && __c != '}')
 250                || (_M_is_grep() && __c == '\n'))
 251         _M_token = _M_token_map.at(__c);
 252       else
 253         {
 254           _M_token = _S_token_ord_char;
 255           _M_value.assign(1, __c);
 256         }
 257     }
 258
 259   // Differences between styles:
 260   // 1) different semantics of "[]" and "[^]".
 261   // 2) Escaping in bracket expr.
 262   template<typename _FwdIter>
 263     void
 264     _Scanner<_FwdIter>::
 265     _M_scan_in_bracket()
 266     {
 267       if (_M_current == _M_end)
 268         __throw_regex_error(regex_constants::error_brack);
 269
 270       auto __c = *_M_current++;
 271
 272       if (__c == '[')
 273         {
 274           if (_M_current == _M_end)
 275             __throw_regex_error(regex_constants::error_brack);
 276
 277           if (*_M_current == '.')
 278             {
 279               _M_token = _S_token_collsymbol;
 280               _M_eat_class(*_M_current++);
 281             }
 282           else if (*_M_current == ':')
 283             {
 284               _M_token = _S_token_char_class_name;
 285               _M_eat_class(*_M_current++);
 286             }
 287           else if (*_M_current == '=')
 288             {
 289               _M_token = _S_token_equiv_class_name;
 290               _M_eat_class(*_M_current++);
 291             }
 292           else
 293             {
 294               _M_token = _S_token_ord_char;
 295               _M_value.assign(1, __c);
 296             }
 297         }
 298       // In POSIX, when encountering "[]" or "[^]", the ']' is interpreted
 299       // literally. So "[]]" or "[^]]" is valid regex. See the testcases
 300       // `*/empty_range.cc`.
 301       else if (__c == ']' && (_M_is_ecma() || !_M_at_bracket_start))
 302         {
 303           _M_token = _S_token_bracket_end;
 304           _M_state = _S_state_normal;
 305         }
 306       // ECMAScirpt and awk permmits escaping in bracket.
 307       else if (__c == '\\' && (_M_is_ecma() || _M_is_awk()))
 308         (this->*_M_eat_escape)();
 309       else
 310         {
 311           _M_token = _S_token_ord_char;
 312           _M_value.assign(1, __c);
 313         }
 314       _M_at_bracket_start = false;
 315     }
 316
 317   // Differences between styles:
 318   // 1) "\}" in basic style.
 319   template<typename _FwdIter>
 320     void
 321     _Scanner<_FwdIter>::
 322     _M_scan_in_brace()
 323     {
 324       if (_M_current == _M_end)
 325         __throw_regex_error(regex_constants::error_brace);
 326
 327       auto __c = *_M_current++;
 328
 329       if (_M_ctype.is(_CtypeT::digit, __c))
 330         {
 331           _M_token = _S_token_dup_count;
 332           _M_value.assign(1, __c);
 333           while (_M_current != _M_end
 334                  && _M_ctype.is(_CtypeT::digit, *_M_current))
 335             _M_value += *_M_current++;
 336         }
 337       else if (__c == ',')
 338         _M_token = _S_token_comma;
 339       // basic use \}.
 340       else if (_M_is_basic())
 341         {
 342           if (__c == '\\' && _M_current != _M_end && *_M_current == '}')
 343             {
 344               _M_state = _S_state_normal;
 345               _M_token = _S_token_interval_end;
 346               ++_M_current;
 347             }
 348           else
 349             __throw_regex_error(regex_constants::error_badbrace);
 350         }
 351       else if (__c == '}')
 352         {
 353           _M_state = _S_state_normal;
 354           _M_token = _S_token_interval_end;
 355         }
 356       else
 357         __throw_regex_error(regex_constants::error_badbrace);
 358     }
 359
 360   template<typename _FwdIter>
 361     void
 362     _Scanner<_FwdIter>::
 363     _M_eat_escape_ecma()
 364     {
 365       if (_M_current == _M_end)
 366         __throw_regex_error(regex_constants::error_escape);
 367
 368       auto __c = *_M_current++;
 369
 370       if (_M_escape_map.count(_M_ctype.narrow(__c, '\0'))
 371           && (__c != 'b' || _M_state == _S_state_in_bracket))
 372         {
 373           _M_token = _S_token_ord_char;
 374           _M_value.assign(1, _M_escape_map.at(__c));
 375         }
 376       else if (__c == 'b')
 377         {
 378           _M_token = _S_token_word_bound;
 379           _M_value.assign(1, 'p');
 380         }
 381       else if (__c == 'B')
 382         {
 383           _M_token = _S_token_word_bound;
 384           _M_value.assign(1, 'n');
 385         }
 386       // N3376 28.13
 387       else if (__c == 'd'
 388                || __c == 'D'
 389                || __c == 's'
 390                || __c == 'S'
 391                || __c == 'w'
 392                || __c == 'W')
 393         {
 394           _M_token = _S_token_quoted_class;
 395           _M_value.assign(1, __c);
 396         }
 397       else if (__c == 'c')
 398         {
 399           if (_M_current == _M_end)
 400             __throw_regex_error(regex_constants::error_escape);
 401           _M_token = _S_token_ord_char;
 402           _M_value.assign(1, *_M_current++);
 403         }
 404       else if (__c == 'x' || __c == 'u')
 405         {
 406           _M_value.erase();
 407           for (int i = 0; i < (__c == 'x' ? 2 : 4); i++)
 408             {
 409               if (_M_current == _M_end
 410                   || !_M_ctype.is(_CtypeT::xdigit, *_M_current))
 411                 __throw_regex_error(regex_constants::error_escape);
 412               _M_value += *_M_current++;
 413             }
 414           _M_token = _S_token_hex_num;
 415         }
 416       // ECMAScript recongnizes multi-digit back-references.
 417       else if (_M_ctype.is(_CtypeT::digit, __c))
 418         {
 419           _M_value.assign(1, __c);
 420           while (_M_current != _M_end
 421                  && _M_ctype.is(_CtypeT::digit, *_M_current))
 422             _M_value += *_M_current++;
 423           _M_token = _S_token_backref;
 424         }
 425       else
 426         {
 427           _M_token = _S_token_ord_char;
 428           _M_value.assign(1, __c);
 429         }
 430     }
 431
 432   // Differences between styles:
 433   // 1) Extended doesn't support backref, but basic does.
 434   template<typename _FwdIter>
 435     void
 436     _Scanner<_FwdIter>::
 437     _M_eat_escape_posix()
 438     {
 439       if (_M_current == _M_end)
 440         __throw_regex_error(regex_constants::error_escape);
 441
 442       auto __c = *_M_current;
 443
 444       if (_M_spec_char.count(_M_ctype.narrow(__c, '\0')))
 445         {
 446           _M_token = _S_token_ord_char;
 447           _M_value.assign(1, __c);
 448         }
 449       // We MUST judge awk before handling backrefs. There's no backref in awk.
 450       else if (_M_is_awk())
 451         {
 452           _M_eat_escape_awk();
 453           return;
 454         }
 455       else if (_M_is_basic() && _M_ctype.is(_CtypeT::digit, __c) && __c != '0')
 456         {
 457           _M_token = _S_token_backref;
 458           _M_value.assign(1, __c);
 459         }
 460       else
 461         {
 462 #ifdef __STRICT_ANSI__
 463           __throw_regex_error(regex_constants::error_escape);
 464 #else
 465           _M_token = _S_token_ord_char;
 466           _M_value.assign(1, __c);
 467 #endif
 468         }
 469       ++_M_current;
 470     }
 471
 472   template<typename _FwdIter>
 473     void
 474     _Scanner<_FwdIter>::
 475     _M_eat_escape_awk()
 476     {
 477       auto __c = *_M_current++;
 478
 479       if (_M_escape_map.count(_M_ctype.narrow(__c, '\0')))
 480         {
 481           _M_token = _S_token_ord_char;
 482           _M_value.assign(1, _M_escape_map.at(__c));
 483         }
 484       // \ddd for oct representation
 485       else if (_M_ctype.is(_CtypeT::digit, __c)
 486                && __c != '8'
 487                && __c != '9')
 488         {
 489           _M_value.assign(1,  __c);
 490           for (int __i = 0;
 491                __i < 2
 492                && _M_current != _M_end
 493                && _M_ctype.is(_CtypeT::digit, *_M_current)
 494                && *_M_current != '8'
 495                && *_M_current != '9';
 496                __i++)
 497             _M_value += *_M_current++;
 498           _M_token = _S_token_oct_num;
 499           return;
 500         }
 501       else
 502         __throw_regex_error(regex_constants::error_escape);
 503     }
 504
 505   // Eats a character class or throwns an exception.
 506   // __ch cound be ':', '.' or '=', _M_current is the char after ']' when
 507   // returning.
 508   template<typename _FwdIter>
 509     void
 510     _Scanner<_FwdIter>::
 511     _M_eat_class(char __ch)
 512     {
 513       for (_M_value.clear(); _M_current != _M_end && *_M_current != __ch;)
 514         _M_value += *_M_current++;
 515       if (_M_current == _M_end
 516           || *_M_current++ != __ch
 517           || _M_current == _M_end // skip __ch
 518           || *_M_current++ != ']') // skip ']'
 519         {
 520           if (__ch == ':')
 521             __throw_regex_error(regex_constants::error_ctype);
 522           else
 523             __throw_regex_error(regex_constants::error_collate);
 524         }
 525     }
 526
 527 #ifdef _GLIBCXX_DEBUG
 528   template<typename _FwdIter>
 529     std::ostream&
 530     _Scanner<_FwdIter>::
 531     _M_print(std::ostream& ostr)
 532     {
 533       switch (_M_token)
 534       {
 535       case _S_token_anychar:
 536         ostr << "any-character\n";
 537         break;
 538       case _S_token_backref:
 539         ostr << "backref\n";
 540         break;
 541       case _S_token_bracket_begin:
 542         ostr << "bracket-begin\n";
 543         break;
 544       case _S_token_bracket_neg_begin:
 545         ostr << "bracket-neg-begin\n";
 546         break;
 547       case _S_token_bracket_end:
 548         ostr << "bracket-end\n";
 549         break;
 550       case _S_token_char_class_name:
 551         ostr << "char-class-name \"" << _M_value << "\"\n";
 552         break;
 553       case _S_token_closure0:
 554         ostr << "closure0\n";
 555         break;
 556       case _S_token_closure1:
 557         ostr << "closure1\n";
 558         break;
 559       case _S_token_collsymbol:
 560         ostr << "collsymbol \"" << _M_value << "\"\n";
 561         break;
 562       case _S_token_comma:
 563         ostr << "comma\n";
 564         break;
 565       case _S_token_dup_count:
 566         ostr << "dup count: " << _M_value << "\n";
 567         break;
 568       case _S_token_eof:
 569         ostr << "EOF\n";
 570         break;
 571       case _S_token_equiv_class_name:
 572         ostr << "equiv-class-name \"" << _M_value << "\"\n";
 573         break;
 574       case _S_token_interval_begin:
 575         ostr << "interval begin\n";
 576         break;
 577       case _S_token_interval_end:
 578         ostr << "interval end\n";
 579         break;
 580       case _S_token_line_begin:
 581         ostr << "line begin\n";
 582         break;
 583       case _S_token_line_end:
 584         ostr << "line end\n";
 585         break;
 586       case _S_token_opt:
 587         ostr << "opt\n";
 588         break;
 589       case _S_token_or:
 590         ostr << "or\n";
 591         break;
 592       case _S_token_ord_char:
 593         ostr << "ordinary character: \"" << _M_value << "\"\n";
 594         break;
 595       case _S_token_subexpr_begin:
 596         ostr << "subexpr begin\n";
 597         break;
 598       case _S_token_subexpr_no_group_begin:
 599         ostr << "no grouping subexpr begin\n";
 600         break;
 601       case _S_token_subexpr_lookahead_begin:
 602         ostr << "lookahead subexpr begin\n";
 603         break;
 604       case _S_token_subexpr_end:
 605         ostr << "subexpr end\n";
 606         break;
 607       case _S_token_unknown:
 608         ostr << "-- unknown token --\n";
 609         break;
 610       case _S_token_oct_num:
 611         ostr << "oct number " << _M_value << "\n";
 612         break;
 613       case _S_token_hex_num:
 614         ostr << "hex number " << _M_value << "\n";
 615         break;
 616       case _S_token_quoted_class:
 617         ostr << "quoted class " << "\\" << _M_value << "\n";
 618         break;
 619       default:
 620         _GLIBCXX_DEBUG_ASSERT(false);
 621       }
 622       return ostr;
 623     }
 624 #endif
 625
 626 _GLIBCXX_END_NAMESPACE_VERSION
 627 } // namespace __detail
 628 } // namespace