libstdc++
regex_automaton.h
Go to the documentation of this file.
00001 // class template regex -*- C++ -*-
00002 
00003 // Copyright (C) 2013-2014 Free Software Foundation, Inc.
00004 //
00005 // This file is part of the GNU ISO C++ Library.  This library is free
00006 // software; you can redistribute it and/or modify it under the
00007 // terms of the GNU General Public License as published by the
00008 // Free Software Foundation; either version 3, or (at your option)
00009 // any later version.
00010 
00011 // This library is distributed in the hope that it will be useful,
00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014 // GNU General Public License for more details.
00015 
00016 // Under Section 7 of GPL version 3, you are granted additional
00017 // permissions described in the GCC Runtime Library Exception, version
00018 // 3.1, as published by the Free Software Foundation.
00019 
00020 // You should have received a copy of the GNU General Public License and
00021 // a copy of the GCC Runtime Library Exception along with this program;
00022 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
00023 // <http://www.gnu.org/licenses/>.
00024 
00025 /**
00026  *  @file bits/regex_automaton.h
00027  *  This is an internal header file, included by other library headers.
00028  *  Do not attempt to use it directly. @headername{regex}
00029  */
00030 
00031 namespace std _GLIBCXX_VISIBILITY(default)
00032 {
00033 namespace __detail
00034 {
00035 _GLIBCXX_BEGIN_NAMESPACE_VERSION
00036 
00037   /**
00038    *  @defgroup regex-detail Base and Implementation Classes
00039    *  @ingroup regex
00040    *  @{
00041    */
00042 
00043   typedef long _StateIdT;
00044   static const _StateIdT _S_invalid_state_id  = -1;
00045 
00046   template<typename _CharT>
00047     using _Matcher = std::function<bool (_CharT)>;
00048 
00049   /// Operation codes that define the type of transitions within the base NFA
00050   /// that represents the regular expression.
00051   enum _Opcode : int
00052   {
00053       _S_opcode_unknown,
00054       _S_opcode_alternative,
00055       _S_opcode_backref,
00056       _S_opcode_line_begin_assertion,
00057       _S_opcode_line_end_assertion,
00058       _S_opcode_word_boundary,
00059       _S_opcode_subexpr_lookahead,
00060       _S_opcode_subexpr_begin,
00061       _S_opcode_subexpr_end,
00062       _S_opcode_dummy,
00063       _S_opcode_match,
00064       _S_opcode_accept,
00065   };
00066 
00067   struct _State_base
00068   {
00069     _Opcode      _M_opcode;           // type of outgoing transition
00070     _StateIdT    _M_next;             // outgoing transition
00071     union // Since they are mutually exclusive.
00072     {
00073       size_t _M_subexpr;        // for _S_opcode_subexpr_*
00074       size_t _M_backref_index;  // for _S_opcode_backref
00075       struct
00076       {
00077     // for _S_opcode_alternative.
00078     _StateIdT  _M_quant_index;
00079     // for _S_opcode_alternative or _S_opcode_subexpr_lookahead
00080     _StateIdT  _M_alt;
00081     // for _S_opcode_word_boundary or _S_opcode_subexpr_lookahead or
00082     // quantifiers (ungreedy if set true)
00083     bool       _M_neg;
00084       };
00085     };
00086 
00087     explicit _State_base(_Opcode __opcode)
00088     : _M_opcode(__opcode), _M_next(_S_invalid_state_id)
00089     { }
00090 
00091   protected:
00092     ~_State_base() = default;
00093 
00094   public:
00095 #ifdef _GLIBCXX_DEBUG
00096     std::ostream&
00097     _M_print(std::ostream& ostr) const;
00098 
00099     // Prints graphviz dot commands for state.
00100     std::ostream&
00101     _M_dot(std::ostream& __ostr, _StateIdT __id) const;
00102 #endif
00103   };
00104 
00105   template<typename _TraitsT>
00106     struct _State : _State_base
00107     {
00108       typedef _Matcher<typename _TraitsT::char_type> _MatcherT;
00109 
00110       _MatcherT      _M_matches;        // for _S_opcode_match
00111 
00112       explicit _State(_Opcode __opcode) : _State_base(__opcode) { }
00113     };
00114 
00115   struct _NFA_base
00116   {
00117     typedef size_t                              _SizeT;
00118     typedef regex_constants::syntax_option_type _FlagT;
00119 
00120     explicit
00121     _NFA_base(_FlagT __f)
00122     : _M_flags(__f), _M_start_state(0), _M_subexpr_count(0),
00123     _M_quant_count(0), _M_has_backref(false)
00124     { }
00125 
00126     _NFA_base(_NFA_base&&) = default;
00127 
00128   protected:
00129     ~_NFA_base() = default;
00130 
00131   public:
00132     _FlagT
00133     _M_options() const
00134     { return _M_flags; }
00135 
00136     _StateIdT
00137     _M_start() const
00138     { return _M_start_state; }
00139 
00140     _SizeT
00141     _M_sub_count() const
00142     { return _M_subexpr_count; }
00143 
00144     std::vector<size_t>       _M_paren_stack;
00145     _FlagT                    _M_flags;
00146     _StateIdT                 _M_start_state;
00147     _SizeT                    _M_subexpr_count;
00148     _SizeT                    _M_quant_count;
00149     bool                      _M_has_backref;
00150   };
00151 
00152   template<typename _TraitsT>
00153     struct _NFA
00154     : _NFA_base, std::vector<_State<_TraitsT>>
00155     {
00156       typedef _State<_TraitsT>              _StateT;
00157       typedef _Matcher<typename _TraitsT::char_type>    _MatcherT;
00158 
00159       using _NFA_base::_NFA_base;
00160 
00161       // for performance reasons _NFA objects should only be moved not copied
00162       _NFA(const _NFA&) = delete;
00163       _NFA(_NFA&&) = default;
00164 
00165       _StateIdT
00166       _M_insert_accept()
00167       {
00168     auto __ret = _M_insert_state(_StateT(_S_opcode_accept));
00169     return __ret;
00170       }
00171 
00172       _StateIdT
00173       _M_insert_alt(_StateIdT __next, _StateIdT __alt, bool __neg)
00174       {
00175     _StateT __tmp(_S_opcode_alternative);
00176     // It labels every quantifier to make greedy comparison easier in BFS
00177     // approach.
00178     __tmp._M_quant_index = this->_M_quant_count++;
00179     __tmp._M_next = __next;
00180     __tmp._M_alt = __alt;
00181     __tmp._M_neg = __neg;
00182     return _M_insert_state(std::move(__tmp));
00183       }
00184 
00185       _StateIdT
00186       _M_insert_matcher(_MatcherT __m)
00187       {
00188     _StateT __tmp(_S_opcode_match);
00189     __tmp._M_matches = std::move(__m);
00190     return _M_insert_state(std::move(__tmp));
00191       }
00192 
00193       _StateIdT
00194       _M_insert_subexpr_begin()
00195       {
00196     auto __id = this->_M_subexpr_count++;
00197     this->_M_paren_stack.push_back(__id);
00198     _StateT __tmp(_S_opcode_subexpr_begin);
00199     __tmp._M_subexpr = __id;
00200     return _M_insert_state(std::move(__tmp));
00201       }
00202 
00203       _StateIdT
00204       _M_insert_subexpr_end()
00205       {
00206     _StateT __tmp(_S_opcode_subexpr_end);
00207     __tmp._M_subexpr = this->_M_paren_stack.back();
00208     this->_M_paren_stack.pop_back();
00209     return _M_insert_state(std::move(__tmp));
00210       }
00211 
00212       _StateIdT
00213       _M_insert_backref(size_t __index);
00214 
00215       _StateIdT
00216       _M_insert_line_begin()
00217       { return _M_insert_state(_StateT(_S_opcode_line_begin_assertion)); }
00218 
00219       _StateIdT
00220       _M_insert_line_end()
00221       { return _M_insert_state(_StateT(_S_opcode_line_end_assertion)); }
00222 
00223       _StateIdT
00224       _M_insert_word_bound(bool __neg)
00225       {
00226     _StateT __tmp(_S_opcode_word_boundary);
00227     __tmp._M_neg = __neg;
00228     return _M_insert_state(std::move(__tmp));
00229       }
00230 
00231       _StateIdT
00232       _M_insert_lookahead(_StateIdT __alt, bool __neg)
00233       {
00234     _StateT __tmp(_S_opcode_subexpr_lookahead);
00235     __tmp._M_alt = __alt;
00236     __tmp._M_neg = __neg;
00237     return _M_insert_state(std::move(__tmp));
00238       }
00239 
00240       _StateIdT
00241       _M_insert_dummy()
00242       { return _M_insert_state(_StateT(_S_opcode_dummy)); }
00243 
00244       _StateIdT
00245       _M_insert_state(_StateT __s)
00246       {
00247     this->push_back(std::move(__s));
00248     return this->size()-1;
00249       }
00250 
00251       // Eliminate dummy node in this NFA to make it compact.
00252       void
00253       _M_eliminate_dummy();
00254 
00255 #ifdef _GLIBCXX_DEBUG
00256       std::ostream&
00257       _M_dot(std::ostream& __ostr) const;
00258 #endif
00259     };
00260 
00261   /// Describes a sequence of one or more %_State, its current start
00262   /// and end(s).  This structure contains fragments of an NFA during
00263   /// construction.
00264   template<typename _TraitsT>
00265     class _StateSeq
00266     {
00267     public:
00268       typedef _NFA<_TraitsT> _RegexT;
00269 
00270     public:
00271       _StateSeq(_RegexT& __nfa, _StateIdT __s)
00272       : _M_nfa(__nfa), _M_start(__s), _M_end(__s)
00273       { }
00274 
00275       _StateSeq(_RegexT& __nfa, _StateIdT __s, _StateIdT __end)
00276       : _M_nfa(__nfa), _M_start(__s), _M_end(__end)
00277       { }
00278 
00279       // Append a state on *this and change *this to the new sequence.
00280       void
00281       _M_append(_StateIdT __id)
00282       {
00283     _M_nfa[_M_end]._M_next = __id;
00284     _M_end = __id;
00285       }
00286 
00287       // Append a sequence on *this and change *this to the new sequence.
00288       void
00289       _M_append(const _StateSeq& __s)
00290       {
00291     _M_nfa[_M_end]._M_next = __s._M_start;
00292     _M_end = __s._M_end;
00293       }
00294 
00295       // Clones an entire sequence.
00296       _StateSeq
00297       _M_clone();
00298 
00299     public:
00300       _RegexT&  _M_nfa;
00301       _StateIdT _M_start;
00302       _StateIdT _M_end;
00303     };
00304 
00305  //@} regex-detail
00306 _GLIBCXX_END_NAMESPACE_VERSION
00307 } // namespace __detail
00308 } // namespace std
00309 
00310 #include <bits/regex_automaton.tcc>