libstdc++
|
// class template regex -*- C++ -*-

// Copyright (C) 2013-2014 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the
// terms of the GNU General Public License as published by the
// Free Software Foundation; either version 3, or (at your option)
// any later version.

// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.

// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.

// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
// <http://www.gnu.org/licenses/>.

/**
 * @file bits/regex_scanner.tcc
 * This is an internal header file, included by other library headers.
 * Do not attempt to use it directly. @headername{regex}
 */

// FIXME make comments doxygen format.

// N3376 specifies 6 regex styles: ECMAScript, basic, extended, grep, egrep
// and awk.
// 1) grep is basic, except that '\n' is treated as '|'.
// 2) egrep is extended, except that '\n' is treated as '|'.
// 3) awk is extended, except that it has special escaping rules and no
// back-references.
00039 // 00040 // References: 00041 // 00042 // ECMAScript: ECMA-262 15.10 00043 // 00044 // basic, extended: 00045 // http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html 00046 // 00047 // awk: http://pubs.opengroup.org/onlinepubs/000095399/utilities/awk.html 00048 00049 namespace std _GLIBCXX_VISIBILITY(default) 00050 { 00051 namespace __detail 00052 { 00053 _GLIBCXX_BEGIN_NAMESPACE_VERSION 00054 00055 template<typename _CharT> 00056 _Scanner<_CharT>:: 00057 _Scanner(typename _Scanner::_IterT __begin, 00058 typename _Scanner::_IterT __end, 00059 _FlagT __flags, std::locale __loc) 00060 : _ScannerBase(__flags), 00061 _M_current(__begin), _M_end(__end), 00062 _M_ctype(std::use_facet<_CtypeT>(__loc)), 00063 _M_eat_escape(_M_is_ecma() 00064 ? &_Scanner::_M_eat_escape_ecma 00065 : &_Scanner::_M_eat_escape_posix) 00066 { _M_advance(); } 00067 00068 template<typename _CharT> 00069 void 00070 _Scanner<_CharT>:: 00071 _M_advance() 00072 { 00073 if (_M_current == _M_end) 00074 { 00075 _M_token = _S_token_eof; 00076 return; 00077 } 00078 00079 if (_M_state == _S_state_normal) 00080 _M_scan_normal(); 00081 else if (_M_state == _S_state_in_bracket) 00082 _M_scan_in_bracket(); 00083 else if (_M_state == _S_state_in_brace) 00084 _M_scan_in_brace(); 00085 else 00086 _GLIBCXX_DEBUG_ASSERT(false); 00087 } 00088 00089 // Differences between styles: 00090 // 1) "\(", "\)", "\{" in basic. It's not escaping. 00091 // 2) "(?:", "(?=", "(?!" in ECMAScript. 
00092 template<typename _CharT> 00093 void 00094 _Scanner<_CharT>:: 00095 _M_scan_normal() 00096 { 00097 auto __c = *_M_current++; 00098 const char* __pos; 00099 00100 if (__c == '\\') 00101 { 00102 if (_M_current == _M_end) 00103 __throw_regex_error(regex_constants::error_escape); 00104 00105 if (!_M_is_basic() 00106 || (*_M_current != '(' 00107 && *_M_current != ')' 00108 && *_M_current != '{')) 00109 { 00110 (this->*_M_eat_escape)(); 00111 return; 00112 } 00113 __c = *_M_current++; 00114 } 00115 if (__c == '(') 00116 { 00117 if (_M_is_ecma() && *_M_current == '?') 00118 { 00119 if (++_M_current == _M_end) 00120 __throw_regex_error(regex_constants::error_paren); 00121 00122 if (*_M_current == ':') 00123 { 00124 ++_M_current; 00125 _M_token = _S_token_subexpr_no_group_begin; 00126 } 00127 else if (*_M_current == '=') 00128 { 00129 ++_M_current; 00130 _M_token = _S_token_subexpr_lookahead_begin; 00131 _M_value.assign(1, 'p'); 00132 } 00133 else if (*_M_current == '!') 00134 { 00135 ++_M_current; 00136 _M_token = _S_token_subexpr_lookahead_begin; 00137 _M_value.assign(1, 'n'); 00138 } 00139 else 00140 __throw_regex_error(regex_constants::error_paren); 00141 } 00142 else if (_M_flags & regex_constants::nosubs) 00143 _M_token = _S_token_subexpr_no_group_begin; 00144 else 00145 _M_token = _S_token_subexpr_begin; 00146 } 00147 else if (__c == ')') 00148 _M_token = _S_token_subexpr_end; 00149 else if (__c == '[') 00150 { 00151 _M_state = _S_state_in_bracket; 00152 _M_at_bracket_start = true; 00153 if (_M_current != _M_end && *_M_current == '^') 00154 { 00155 _M_token = _S_token_bracket_neg_begin; 00156 ++_M_current; 00157 } 00158 else 00159 _M_token = _S_token_bracket_begin; 00160 } 00161 else if (__c == '{') 00162 { 00163 _M_state = _S_state_in_brace; 00164 _M_token = _S_token_interval_begin; 00165 } 00166 else if (((__pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0'))) 00167 != nullptr 00168 && *__pos != '\0' 00169 && __c != ']' 00170 && __c != '}') 00171 || 
(_M_is_grep() && __c == '\n')) 00172 { 00173 auto __it = _M_token_tbl; 00174 auto __narrowc = _M_ctype.narrow(__c, '\0'); 00175 for (; __it->first != '\0'; ++__it) 00176 if (__it->first == __narrowc) 00177 { 00178 _M_token = __it->second; 00179 return; 00180 } 00181 _GLIBCXX_DEBUG_ASSERT(false); 00182 } 00183 else 00184 { 00185 _M_token = _S_token_ord_char; 00186 _M_value.assign(1, __c); 00187 } 00188 } 00189 00190 // Differences between styles: 00191 // 1) different semantics of "[]" and "[^]". 00192 // 2) Escaping in bracket expr. 00193 template<typename _CharT> 00194 void 00195 _Scanner<_CharT>:: 00196 _M_scan_in_bracket() 00197 { 00198 if (_M_current == _M_end) 00199 __throw_regex_error(regex_constants::error_brack); 00200 00201 auto __c = *_M_current++; 00202 00203 if (__c == '[') 00204 { 00205 if (_M_current == _M_end) 00206 __throw_regex_error(regex_constants::error_brack); 00207 00208 if (*_M_current == '.') 00209 { 00210 _M_token = _S_token_collsymbol; 00211 _M_eat_class(*_M_current++); 00212 } 00213 else if (*_M_current == ':') 00214 { 00215 _M_token = _S_token_char_class_name; 00216 _M_eat_class(*_M_current++); 00217 } 00218 else if (*_M_current == '=') 00219 { 00220 _M_token = _S_token_equiv_class_name; 00221 _M_eat_class(*_M_current++); 00222 } 00223 else 00224 { 00225 _M_token = _S_token_ord_char; 00226 _M_value.assign(1, __c); 00227 } 00228 } 00229 // In POSIX, when encountering "[]" or "[^]", the ']' is interpreted 00230 // literally. So "[]]" or "[^]]" is valid regex. See the testcases 00231 // `*/empty_range.cc`. 00232 else if (__c == ']' && (_M_is_ecma() || !_M_at_bracket_start)) 00233 { 00234 _M_token = _S_token_bracket_end; 00235 _M_state = _S_state_normal; 00236 } 00237 // ECMAScirpt and awk permmits escaping in bracket. 
00238 else if (__c == '\\' && (_M_is_ecma() || _M_is_awk())) 00239 (this->*_M_eat_escape)(); 00240 else 00241 { 00242 _M_token = _S_token_ord_char; 00243 _M_value.assign(1, __c); 00244 } 00245 _M_at_bracket_start = false; 00246 } 00247 00248 // Differences between styles: 00249 // 1) "\}" in basic style. 00250 template<typename _CharT> 00251 void 00252 _Scanner<_CharT>:: 00253 _M_scan_in_brace() 00254 { 00255 if (_M_current == _M_end) 00256 __throw_regex_error(regex_constants::error_brace); 00257 00258 auto __c = *_M_current++; 00259 00260 if (_M_ctype.is(_CtypeT::digit, __c)) 00261 { 00262 _M_token = _S_token_dup_count; 00263 _M_value.assign(1, __c); 00264 while (_M_current != _M_end 00265 && _M_ctype.is(_CtypeT::digit, *_M_current)) 00266 _M_value += *_M_current++; 00267 } 00268 else if (__c == ',') 00269 _M_token = _S_token_comma; 00270 // basic use \}. 00271 else if (_M_is_basic()) 00272 { 00273 if (__c == '\\' && _M_current != _M_end && *_M_current == '}') 00274 { 00275 _M_state = _S_state_normal; 00276 _M_token = _S_token_interval_end; 00277 ++_M_current; 00278 } 00279 else 00280 __throw_regex_error(regex_constants::error_badbrace); 00281 } 00282 else if (__c == '}') 00283 { 00284 _M_state = _S_state_normal; 00285 _M_token = _S_token_interval_end; 00286 } 00287 else 00288 __throw_regex_error(regex_constants::error_badbrace); 00289 } 00290 00291 template<typename _CharT> 00292 void 00293 _Scanner<_CharT>:: 00294 _M_eat_escape_ecma() 00295 { 00296 if (_M_current == _M_end) 00297 __throw_regex_error(regex_constants::error_escape); 00298 00299 auto __c = *_M_current++; 00300 auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0')); 00301 00302 if (__pos != nullptr && (__c != 'b' || _M_state == _S_state_in_bracket)) 00303 { 00304 _M_token = _S_token_ord_char; 00305 _M_value.assign(1, *__pos); 00306 } 00307 else if (__c == 'b') 00308 { 00309 _M_token = _S_token_word_bound; 00310 _M_value.assign(1, 'p'); 00311 } 00312 else if (__c == 'B') 00313 { 00314 _M_token = 
_S_token_word_bound; 00315 _M_value.assign(1, 'n'); 00316 } 00317 // N3376 28.13 00318 else if (__c == 'd' 00319 || __c == 'D' 00320 || __c == 's' 00321 || __c == 'S' 00322 || __c == 'w' 00323 || __c == 'W') 00324 { 00325 _M_token = _S_token_quoted_class; 00326 _M_value.assign(1, __c); 00327 } 00328 else if (__c == 'c') 00329 { 00330 if (_M_current == _M_end) 00331 __throw_regex_error(regex_constants::error_escape); 00332 _M_token = _S_token_ord_char; 00333 _M_value.assign(1, *_M_current++); 00334 } 00335 else if (__c == 'x' || __c == 'u') 00336 { 00337 _M_value.erase(); 00338 for (int i = 0; i < (__c == 'x' ? 2 : 4); i++) 00339 { 00340 if (_M_current == _M_end 00341 || !_M_ctype.is(_CtypeT::xdigit, *_M_current)) 00342 __throw_regex_error(regex_constants::error_escape); 00343 _M_value += *_M_current++; 00344 } 00345 _M_token = _S_token_hex_num; 00346 } 00347 // ECMAScript recongnizes multi-digit back-references. 00348 else if (_M_ctype.is(_CtypeT::digit, __c)) 00349 { 00350 _M_value.assign(1, __c); 00351 while (_M_current != _M_end 00352 && _M_ctype.is(_CtypeT::digit, *_M_current)) 00353 _M_value += *_M_current++; 00354 _M_token = _S_token_backref; 00355 } 00356 else 00357 { 00358 _M_token = _S_token_ord_char; 00359 _M_value.assign(1, __c); 00360 } 00361 } 00362 00363 // Differences between styles: 00364 // 1) Extended doesn't support backref, but basic does. 00365 template<typename _CharT> 00366 void 00367 _Scanner<_CharT>:: 00368 _M_eat_escape_posix() 00369 { 00370 if (_M_current == _M_end) 00371 __throw_regex_error(regex_constants::error_escape); 00372 00373 auto __c = *_M_current; 00374 auto __pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0')); 00375 00376 if (__pos != nullptr && *__pos != '\0') 00377 { 00378 _M_token = _S_token_ord_char; 00379 _M_value.assign(1, __c); 00380 } 00381 // We MUST judge awk before handling backrefs. There's no backref in awk. 
00382 else if (_M_is_awk()) 00383 { 00384 _M_eat_escape_awk(); 00385 return; 00386 } 00387 else if (_M_is_basic() && _M_ctype.is(_CtypeT::digit, __c) && __c != '0') 00388 { 00389 _M_token = _S_token_backref; 00390 _M_value.assign(1, __c); 00391 } 00392 else 00393 { 00394 #ifdef __STRICT_ANSI__ 00395 __throw_regex_error(regex_constants::error_escape); 00396 #else 00397 _M_token = _S_token_ord_char; 00398 _M_value.assign(1, __c); 00399 #endif 00400 } 00401 ++_M_current; 00402 } 00403 00404 template<typename _CharT> 00405 void 00406 _Scanner<_CharT>:: 00407 _M_eat_escape_awk() 00408 { 00409 auto __c = *_M_current++; 00410 auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0')); 00411 00412 if (__pos != nullptr) 00413 { 00414 _M_token = _S_token_ord_char; 00415 _M_value.assign(1, *__pos); 00416 } 00417 // \ddd for oct representation 00418 else if (_M_ctype.is(_CtypeT::digit, __c) 00419 && __c != '8' 00420 && __c != '9') 00421 { 00422 _M_value.assign(1, __c); 00423 for (int __i = 0; 00424 __i < 2 00425 && _M_current != _M_end 00426 && _M_ctype.is(_CtypeT::digit, *_M_current) 00427 && *_M_current != '8' 00428 && *_M_current != '9'; 00429 __i++) 00430 _M_value += *_M_current++; 00431 _M_token = _S_token_oct_num; 00432 return; 00433 } 00434 else 00435 __throw_regex_error(regex_constants::error_escape); 00436 } 00437 00438 // Eats a character class or throwns an exception. 00439 // __ch cound be ':', '.' or '=', _M_current is the char after ']' when 00440 // returning. 
00441 template<typename _CharT> 00442 void 00443 _Scanner<_CharT>:: 00444 _M_eat_class(char __ch) 00445 { 00446 for (_M_value.clear(); _M_current != _M_end && *_M_current != __ch;) 00447 _M_value += *_M_current++; 00448 if (_M_current == _M_end 00449 || *_M_current++ != __ch 00450 || _M_current == _M_end // skip __ch 00451 || *_M_current++ != ']') // skip ']' 00452 { 00453 if (__ch == ':') 00454 __throw_regex_error(regex_constants::error_ctype); 00455 else 00456 __throw_regex_error(regex_constants::error_collate); 00457 } 00458 } 00459 00460 #ifdef _GLIBCXX_DEBUG 00461 template<typename _CharT> 00462 std::ostream& 00463 _Scanner<_CharT>:: 00464 _M_print(std::ostream& ostr) 00465 { 00466 switch (_M_token) 00467 { 00468 case _S_token_anychar: 00469 ostr << "any-character\n"; 00470 break; 00471 case _S_token_backref: 00472 ostr << "backref\n"; 00473 break; 00474 case _S_token_bracket_begin: 00475 ostr << "bracket-begin\n"; 00476 break; 00477 case _S_token_bracket_neg_begin: 00478 ostr << "bracket-neg-begin\n"; 00479 break; 00480 case _S_token_bracket_end: 00481 ostr << "bracket-end\n"; 00482 break; 00483 case _S_token_char_class_name: 00484 ostr << "char-class-name \"" << _M_value << "\"\n"; 00485 break; 00486 case _S_token_closure0: 00487 ostr << "closure0\n"; 00488 break; 00489 case _S_token_closure1: 00490 ostr << "closure1\n"; 00491 break; 00492 case _S_token_collsymbol: 00493 ostr << "collsymbol \"" << _M_value << "\"\n"; 00494 break; 00495 case _S_token_comma: 00496 ostr << "comma\n"; 00497 break; 00498 case _S_token_dup_count: 00499 ostr << "dup count: " << _M_value << "\n"; 00500 break; 00501 case _S_token_eof: 00502 ostr << "EOF\n"; 00503 break; 00504 case _S_token_equiv_class_name: 00505 ostr << "equiv-class-name \"" << _M_value << "\"\n"; 00506 break; 00507 case _S_token_interval_begin: 00508 ostr << "interval begin\n"; 00509 break; 00510 case _S_token_interval_end: 00511 ostr << "interval end\n"; 00512 break; 00513 case _S_token_line_begin: 00514 ostr << 
"line begin\n"; 00515 break; 00516 case _S_token_line_end: 00517 ostr << "line end\n"; 00518 break; 00519 case _S_token_opt: 00520 ostr << "opt\n"; 00521 break; 00522 case _S_token_or: 00523 ostr << "or\n"; 00524 break; 00525 case _S_token_ord_char: 00526 ostr << "ordinary character: \"" << _M_value << "\"\n"; 00527 break; 00528 case _S_token_subexpr_begin: 00529 ostr << "subexpr begin\n"; 00530 break; 00531 case _S_token_subexpr_no_group_begin: 00532 ostr << "no grouping subexpr begin\n"; 00533 break; 00534 case _S_token_subexpr_lookahead_begin: 00535 ostr << "lookahead subexpr begin\n"; 00536 break; 00537 case _S_token_subexpr_end: 00538 ostr << "subexpr end\n"; 00539 break; 00540 case _S_token_unknown: 00541 ostr << "-- unknown token --\n"; 00542 break; 00543 case _S_token_oct_num: 00544 ostr << "oct number " << _M_value << "\n"; 00545 break; 00546 case _S_token_hex_num: 00547 ostr << "hex number " << _M_value << "\n"; 00548 break; 00549 case _S_token_quoted_class: 00550 ostr << "quoted class " << "\\" << _M_value << "\n"; 00551 break; 00552 default: 00553 _GLIBCXX_DEBUG_ASSERT(false); 00554 } 00555 return ostr; 00556 } 00557 #endif 00558 00559 _GLIBCXX_END_NAMESPACE_VERSION 00560 } // namespace __detail 00561 } // namespace