libstdc++
regex_compiler.h
Go to the documentation of this file.
00001 // class template regex -*- C++ -*-
00002 
00003 // Copyright (C) 2010-2018 Free Software Foundation, Inc.
00004 //
00005 // This file is part of the GNU ISO C++ Library.  This library is free
00006 // software; you can redistribute it and/or modify it under the
00007 // terms of the GNU General Public License as published by the
00008 // Free Software Foundation; either version 3, or (at your option)
00009 // any later version.
00010 
00011 // This library is distributed in the hope that it will be useful,
00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014 // GNU General Public License for more details.
00015 
00016 // Under Section 7 of GPL version 3, you are granted additional
00017 // permissions described in the GCC Runtime Library Exception, version
00018 // 3.1, as published by the Free Software Foundation.
00019 
00020 // You should have received a copy of the GNU General Public License and
00021 // a copy of the GCC Runtime Library Exception along with this program;
00022 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
00023 // <http://www.gnu.org/licenses/>.
00024 
00025 /**
00026  *  @file bits/regex_compiler.h
00027  *  This is an internal header file, included by other library headers.
00028  *  Do not attempt to use it directly. @headername{regex}
00029  */
00030 
00031 namespace std _GLIBCXX_VISIBILITY(default)
00032 {
00033 _GLIBCXX_BEGIN_NAMESPACE_VERSION
00034 _GLIBCXX_BEGIN_NAMESPACE_CXX11
00035 
00036   template<typename>
00037     class regex_traits;
00038 
00039 _GLIBCXX_END_NAMESPACE_CXX11
00040 
00041 namespace __detail
00042 {
00043   /**
00044    * @addtogroup regex-detail
00045    * @{
00046    */
00047 
00048   template<typename, bool, bool>
00049     struct _BracketMatcher;
00050 
00051   /**
00052    * @brief Builds an NFA from an input iterator range.
00053    *
00054    * The %_TraitsT type should fulfill requirements [28.3].
00055    */
00056   template<typename _TraitsT>
00057     class _Compiler
00058     {
00059     public:
00060       typedef typename _TraitsT::char_type        _CharT;
00061       typedef const _CharT*                       _IterT;
00062       typedef _NFA<_TraitsT>                      _RegexT;
00063       typedef regex_constants::syntax_option_type _FlagT;
00064 
00065       _Compiler(_IterT __b, _IterT __e,
00066                 const typename _TraitsT::locale_type& __traits, _FlagT __flags);
00067 
00068       shared_ptr<const _RegexT>
00069       _M_get_nfa()
00070       { return std::move(_M_nfa); }
00071 
00072     private:
00073       typedef _Scanner<_CharT>               _ScannerT;
00074       typedef typename _TraitsT::string_type _StringT;
00075       typedef typename _ScannerT::_TokenT    _TokenT;
00076       typedef _StateSeq<_TraitsT>            _StateSeqT;
00077       typedef std::stack<_StateSeqT>         _StackT;
00078       typedef std::ctype<_CharT>             _CtypeT;
00079 
00080       // accepts a specific token or returns false.
00081       bool
00082       _M_match_token(_TokenT __token);
00083 
00084       void
00085       _M_disjunction();
00086 
00087       void
00088       _M_alternative();
00089 
00090       bool
00091       _M_term();
00092 
00093       bool
00094       _M_assertion();
00095 
00096       bool
00097       _M_quantifier();
00098 
00099       bool
00100       _M_atom();
00101 
00102       bool
00103       _M_bracket_expression();
00104 
00105       template<bool __icase, bool __collate>
00106         void
00107         _M_insert_any_matcher_ecma();
00108 
00109       template<bool __icase, bool __collate>
00110         void
00111         _M_insert_any_matcher_posix();
00112 
00113       template<bool __icase, bool __collate>
00114         void
00115         _M_insert_char_matcher();
00116 
00117       template<bool __icase, bool __collate>
00118         void
00119         _M_insert_character_class_matcher();
00120 
00121       template<bool __icase, bool __collate>
00122         void
00123         _M_insert_bracket_matcher(bool __neg);
00124 
00125       // Returns true if successfully matched one term and should continue.
00126       // Returns false if the compiler should move on.
00127       template<bool __icase, bool __collate>
00128         bool
00129         _M_expression_term(pair<bool, _CharT>& __last_char,
00130                            _BracketMatcher<_TraitsT, __icase, __collate>&
00131                            __matcher);
00132 
00133       int
00134       _M_cur_int_value(int __radix);
00135 
00136       bool
00137       _M_try_char();
00138 
00139       _StateSeqT
00140       _M_pop()
00141       {
00142         auto ret = _M_stack.top();
00143         _M_stack.pop();
00144         return ret;
00145       }
00146 
00147       _FlagT              _M_flags;
00148       _ScannerT           _M_scanner;
00149       shared_ptr<_RegexT> _M_nfa;
00150       _StringT            _M_value;
00151       _StackT             _M_stack;
00152       const _TraitsT&     _M_traits;
00153       const _CtypeT&      _M_ctype;
00154     };
00155 
00156   template<typename _Tp>
00157     struct __has_contiguous_iter : std::false_type { };
00158 
00159   template<typename _Ch, typename _Tr, typename _Alloc>
00160     struct __has_contiguous_iter<std::basic_string<_Ch, _Tr, _Alloc>>
00161     : std::true_type
00162     { };
00163 
00164   template<typename _Tp, typename _Alloc>
00165     struct __has_contiguous_iter<std::vector<_Tp, _Alloc>>
00166     : std::true_type
00167     { };
00168 
00169   template<typename _Tp>
00170     struct __is_contiguous_normal_iter : std::false_type { };
00171 
00172   template<typename _CharT>
00173     struct __is_contiguous_normal_iter<_CharT*> : std::true_type { };
00174 
00175   template<typename _Tp, typename _Cont>
00176     struct
00177     __is_contiguous_normal_iter<__gnu_cxx::__normal_iterator<_Tp, _Cont>>
00178     : __has_contiguous_iter<_Cont>::type
00179     { };
00180 
00181   template<typename _Iter, typename _TraitsT>
00182     using __enable_if_contiguous_normal_iter
00183       = typename enable_if< __is_contiguous_normal_iter<_Iter>::value,
00184                            std::shared_ptr<const _NFA<_TraitsT>> >::type;
00185 
00186   template<typename _Iter, typename _TraitsT>
00187     using __disable_if_contiguous_normal_iter
00188       = typename enable_if< !__is_contiguous_normal_iter<_Iter>::value,
00189                            std::shared_ptr<const _NFA<_TraitsT>> >::type;
00190 
00191   template<typename _TraitsT, typename _FwdIter>
00192     inline __enable_if_contiguous_normal_iter<_FwdIter, _TraitsT>
00193     __compile_nfa(_FwdIter __first, _FwdIter __last,
00194                   const typename _TraitsT::locale_type& __loc,
00195                   regex_constants::syntax_option_type __flags)
00196     {
00197       size_t __len = __last - __first;
00198       const auto* __cfirst = __len ? std::__addressof(*__first) : nullptr;
00199       using _Cmplr = _Compiler<_TraitsT>;
00200       return _Cmplr(__cfirst, __cfirst + __len, __loc, __flags)._M_get_nfa();
00201     }
00202 
00203   template<typename _TraitsT, typename _FwdIter>
00204     inline __disable_if_contiguous_normal_iter<_FwdIter, _TraitsT>
00205     __compile_nfa(_FwdIter __first, _FwdIter __last,
00206                   const typename _TraitsT::locale_type& __loc,
00207                   regex_constants::syntax_option_type __flags)
00208     {
00209       const basic_string<typename _TraitsT::char_type> __str(__first, __last);
00210       return __compile_nfa<_TraitsT>(__str.data(), __str.data() + __str.size(),
00211                                      __loc, __flags);
00212     }
00213 
00214   // [28.13.14]
00215   template<typename _TraitsT, bool __icase, bool __collate>
00216     class _RegexTranslatorBase
00217     {
00218     public:
00219       typedef typename _TraitsT::char_type            _CharT;
00220       typedef typename _TraitsT::string_type          _StringT;
00221       typedef _StringT _StrTransT;
00222 
00223       explicit
00224       _RegexTranslatorBase(const _TraitsT& __traits)
00225       : _M_traits(__traits)
00226       { }
00227 
00228       _CharT
00229       _M_translate(_CharT __ch) const
00230       {
00231         if (__icase)
00232           return _M_traits.translate_nocase(__ch);
00233         else if (__collate)
00234           return _M_traits.translate(__ch);
00235         else
00236           return __ch;
00237       }
00238 
00239       _StrTransT
00240       _M_transform(_CharT __ch) const
00241       {
00242         _StrTransT __str(1, __ch);
00243         return _M_traits.transform(__str.begin(), __str.end());
00244       }
00245 
00246       // See LWG 523. It's not efficiently implementable when _TraitsT is not
00247       // std::regex_traits<>, and __collate is true. See specializations for
00248       // implementations of other cases.
00249       bool
00250       _M_match_range(const _StrTransT& __first, const _StrTransT& __last,
00251                      const _StrTransT& __s) const
00252       { return __first <= __s && __s <= __last; }
00253 
00254     protected:
00255       bool _M_in_range_icase(_CharT __first, _CharT __last, _CharT __ch) const
00256       {
00257         typedef std::ctype<_CharT> __ctype_type;
00258         const auto& __fctyp = use_facet<__ctype_type>(this->_M_traits.getloc());
00259         auto __lower = __fctyp.tolower(__ch);
00260         auto __upper = __fctyp.toupper(__ch);
00261         return (__first <= __lower && __lower <= __last)
00262           || (__first <= __upper && __upper <= __last);
00263       }
00264 
00265       const _TraitsT& _M_traits;
00266     };
00267 
00268   template<typename _TraitsT, bool __icase, bool __collate>
00269     class _RegexTranslator
00270     : public _RegexTranslatorBase<_TraitsT, __icase, __collate>
00271     {
00272     public:
00273       typedef _RegexTranslatorBase<_TraitsT, __icase, __collate> _Base;
00274       using _Base::_Base;
00275     };
00276 
00277   template<typename _TraitsT, bool __icase>
00278     class _RegexTranslator<_TraitsT, __icase, false>
00279     : public _RegexTranslatorBase<_TraitsT, __icase, false>
00280     {
00281     public:
00282       typedef _RegexTranslatorBase<_TraitsT, __icase, false> _Base;
00283       typedef typename _Base::_CharT _CharT;
00284       typedef _CharT _StrTransT;
00285 
00286       using _Base::_Base;
00287 
00288       _StrTransT
00289       _M_transform(_CharT __ch) const
00290       { return __ch; }
00291 
00292       bool
00293       _M_match_range(_CharT __first, _CharT __last, _CharT __ch) const
00294       {
00295         if (!__icase)
00296           return __first <= __ch && __ch <= __last;
00297         return this->_M_in_range_icase(__first, __last, __ch);
00298       }
00299     };
00300 
00301   template<typename _CharType>
00302     class _RegexTranslator<std::regex_traits<_CharType>, true, true>
00303     : public _RegexTranslatorBase<std::regex_traits<_CharType>, true, true>
00304     {
00305     public:
00306       typedef _RegexTranslatorBase<std::regex_traits<_CharType>, true, true>
00307         _Base;
00308       typedef typename _Base::_CharT _CharT;
00309       typedef typename _Base::_StrTransT _StrTransT;
00310 
00311       using _Base::_Base;
00312 
00313       bool
00314       _M_match_range(const _StrTransT& __first, const _StrTransT& __last,
00315                      const _StrTransT& __str) const
00316       {
00317         __glibcxx_assert(__first.size() == 1);
00318         __glibcxx_assert(__last.size() == 1);
00319         __glibcxx_assert(__str.size() == 1);
00320         return this->_M_in_range_icase(__first[0], __last[0], __str[0]);
00321       }
00322     };
00323 
00324   template<typename _TraitsT>
00325     class _RegexTranslator<_TraitsT, false, false>
00326     {
00327     public:
00328       typedef typename _TraitsT::char_type _CharT;
00329       typedef _CharT                       _StrTransT;
00330 
00331       explicit
00332       _RegexTranslator(const _TraitsT&)
00333       { }
00334 
00335       _CharT
00336       _M_translate(_CharT __ch) const
00337       { return __ch; }
00338 
00339       _StrTransT
00340       _M_transform(_CharT __ch) const
00341       { return __ch; }
00342 
00343       bool
00344       _M_match_range(_CharT __first, _CharT __last, _CharT __ch) const
00345       { return __first <= __ch && __ch <= __last; }
00346     };
00347 
00348   template<typename _TraitsT, bool __is_ecma, bool __icase, bool __collate>
00349     struct _AnyMatcher;
00350 
00351   template<typename _TraitsT, bool __icase, bool __collate>
00352     struct _AnyMatcher<_TraitsT, false, __icase, __collate>
00353     {
00354       typedef _RegexTranslator<_TraitsT, __icase, __collate> _TransT;
00355       typedef typename _TransT::_CharT                       _CharT;
00356 
00357       explicit
00358       _AnyMatcher(const _TraitsT& __traits)
00359       : _M_translator(__traits)
00360       { }
00361 
00362       bool
00363       operator()(_CharT __ch) const
00364       {
00365         static auto __nul = _M_translator._M_translate('\0');
00366         return _M_translator._M_translate(__ch) != __nul;
00367       }
00368 
00369       _TransT _M_translator;
00370     };
00371 
00372   template<typename _TraitsT, bool __icase, bool __collate>
00373     struct _AnyMatcher<_TraitsT, true, __icase, __collate>
00374     {
00375       typedef _RegexTranslator<_TraitsT, __icase, __collate> _TransT;
00376       typedef typename _TransT::_CharT                       _CharT;
00377 
00378       explicit
00379       _AnyMatcher(const _TraitsT& __traits)
00380       : _M_translator(__traits)
00381       { }
00382 
00383       bool
00384       operator()(_CharT __ch) const
00385       { return _M_apply(__ch, typename is_same<_CharT, char>::type()); }
00386 
00387       bool
00388       _M_apply(_CharT __ch, true_type) const
00389       {
00390         auto __c = _M_translator._M_translate(__ch);
00391         auto __n = _M_translator._M_translate('\n');
00392         auto __r = _M_translator._M_translate('\r');
00393         return __c != __n && __c != __r;
00394       }
00395 
00396       bool
00397       _M_apply(_CharT __ch, false_type) const
00398       {
00399         auto __c = _M_translator._M_translate(__ch);
00400         auto __n = _M_translator._M_translate('\n');
00401         auto __r = _M_translator._M_translate('\r');
00402         auto __u2028 = _M_translator._M_translate(u'\u2028');
00403         auto __u2029 = _M_translator._M_translate(u'\u2029');
00404         return __c != __n && __c != __r && __c != __u2028 && __c != __u2029;
00405       }
00406 
00407       _TransT _M_translator;
00408     };
00409 
00410   template<typename _TraitsT, bool __icase, bool __collate>
00411     struct _CharMatcher
00412     {
00413       typedef _RegexTranslator<_TraitsT, __icase, __collate> _TransT;
00414       typedef typename _TransT::_CharT                       _CharT;
00415 
00416       _CharMatcher(_CharT __ch, const _TraitsT& __traits)
00417       : _M_translator(__traits), _M_ch(_M_translator._M_translate(__ch))
00418       { }
00419 
00420       bool
00421       operator()(_CharT __ch) const
00422       { return _M_ch == _M_translator._M_translate(__ch); }
00423 
00424       _TransT _M_translator;
00425       _CharT  _M_ch;
00426     };
00427 
00428   /// Matches a character range (bracket expression)
00429   template<typename _TraitsT, bool __icase, bool __collate>
00430     struct _BracketMatcher
00431     {
00432     public:
00433       typedef _RegexTranslator<_TraitsT, __icase, __collate> _TransT;
00434       typedef typename _TransT::_CharT                       _CharT;
00435       typedef typename _TransT::_StrTransT                   _StrTransT;
00436       typedef typename _TraitsT::string_type                 _StringT;
00437       typedef typename _TraitsT::char_class_type             _CharClassT;
00438 
00439     public:
00440       _BracketMatcher(bool __is_non_matching,
00441                       const _TraitsT& __traits)
00442       : _M_class_set(0), _M_translator(__traits), _M_traits(__traits),
00443       _M_is_non_matching(__is_non_matching)
00444       { }
00445 
00446       bool
00447       operator()(_CharT __ch) const
00448       {
00449         _GLIBCXX_DEBUG_ASSERT(_M_is_ready);
00450         return _M_apply(__ch, _UseCache());
00451       }
00452 
00453       void
00454       _M_add_char(_CharT __c)
00455       {
00456         _M_char_set.push_back(_M_translator._M_translate(__c));
00457         _GLIBCXX_DEBUG_ONLY(_M_is_ready = false);
00458       }
00459 
00460       _StringT
00461       _M_add_collate_element(const _StringT& __s)
00462       {
00463         auto __st = _M_traits.lookup_collatename(__s.data(),
00464                                                  __s.data() + __s.size());
00465         if (__st.empty())
00466           __throw_regex_error(regex_constants::error_collate,
00467                               "Invalid collate element.");
00468         _M_char_set.push_back(_M_translator._M_translate(__st[0]));
00469         _GLIBCXX_DEBUG_ONLY(_M_is_ready = false);
00470         return __st;
00471       }
00472 
00473       void
00474       _M_add_equivalence_class(const _StringT& __s)
00475       {
00476         auto __st = _M_traits.lookup_collatename(__s.data(),
00477                                                  __s.data() + __s.size());
00478         if (__st.empty())
00479           __throw_regex_error(regex_constants::error_collate,
00480                               "Invalid equivalence class.");
00481         __st = _M_traits.transform_primary(__st.data(),
00482                                            __st.data() + __st.size());
00483         _M_equiv_set.push_back(__st);
00484         _GLIBCXX_DEBUG_ONLY(_M_is_ready = false);
00485       }
00486 
00487       // __neg should be true for \D, \S and \W only.
00488       void
00489       _M_add_character_class(const _StringT& __s, bool __neg)
00490       {
00491         auto __mask = _M_traits.lookup_classname(__s.data(),
00492                                                  __s.data() + __s.size(),
00493                                                  __icase);
00494         if (__mask == 0)
00495           __throw_regex_error(regex_constants::error_collate,
00496                               "Invalid character class.");
00497         if (!__neg)
00498           _M_class_set |= __mask;
00499         else
00500           _M_neg_class_set.push_back(__mask);
00501         _GLIBCXX_DEBUG_ONLY(_M_is_ready = false);
00502       }
00503 
00504       void
00505       _M_make_range(_CharT __l, _CharT __r)
00506       {
00507         if (__l > __r)
00508           __throw_regex_error(regex_constants::error_range,
00509                               "Invalid range in bracket expression.");
00510         _M_range_set.push_back(make_pair(_M_translator._M_transform(__l),
00511                                          _M_translator._M_transform(__r)));
00512         _GLIBCXX_DEBUG_ONLY(_M_is_ready = false);
00513       }
00514 
00515       void
00516       _M_ready()
00517       {
00518         std::sort(_M_char_set.begin(), _M_char_set.end());
00519         auto __end = std::unique(_M_char_set.begin(), _M_char_set.end());
00520         _M_char_set.erase(__end, _M_char_set.end());
00521         _M_make_cache(_UseCache());
00522         _GLIBCXX_DEBUG_ONLY(_M_is_ready = true);
00523       }
00524 
00525     private:
00526       // Currently we only use the cache for char
00527       typedef typename std::is_same<_CharT, char>::type _UseCache;
00528 
00529       static constexpr size_t
00530       _S_cache_size()
00531       {
00532         return 1ul << (sizeof(_CharT) * __CHAR_BIT__ * int(_UseCache::value));
00533       }
00534 
00535       struct _Dummy { };
00536       typedef typename std::conditional<_UseCache::value,
00537                                         std::bitset<_S_cache_size()>,
00538                                         _Dummy>::type _CacheT;
00539       typedef typename std::make_unsigned<_CharT>::type _UnsignedCharT;
00540 
00541       bool
00542       _M_apply(_CharT __ch, false_type) const;
00543 
00544       bool
00545       _M_apply(_CharT __ch, true_type) const
00546       { return _M_cache[static_cast<_UnsignedCharT>(__ch)]; }
00547 
00548       void
00549       _M_make_cache(true_type)
00550       {
00551         for (unsigned __i = 0; __i < _M_cache.size(); __i++)
00552           _M_cache[__i] = _M_apply(static_cast<_CharT>(__i), false_type());
00553       }
00554 
00555       void
00556       _M_make_cache(false_type)
00557       { }
00558 
00559     private:
00560       std::vector<_CharT>                       _M_char_set;
00561       std::vector<_StringT>                     _M_equiv_set;
00562       std::vector<pair<_StrTransT, _StrTransT>> _M_range_set;
00563       std::vector<_CharClassT>                  _M_neg_class_set;
00564       _CharClassT                               _M_class_set;
00565       _TransT                                   _M_translator;
00566       const _TraitsT&                           _M_traits;
00567       bool                                      _M_is_non_matching;
00568       _CacheT                                   _M_cache;
00569 #ifdef _GLIBCXX_DEBUG
00570       bool                                      _M_is_ready = false;
00571 #endif
00572     };
00573 
00574  //@} regex-detail
00575 } // namespace __detail
00576 _GLIBCXX_END_NAMESPACE_VERSION
00577 } // namespace std
00578 
00579 #include <bits/regex_compiler.tcc>