libstdc++
regex_scanner.h
Go to the documentation of this file.
00001 // class template regex -*- C++ -*-
00002 
00003 // Copyright (C) 2013-2017 Free Software Foundation, Inc.
00004 //
00005 // This file is part of the GNU ISO C++ Library.  This library is free
00006 // software; you can redistribute it and/or modify it under the
00007 // terms of the GNU General Public License as published by the
00008 // Free Software Foundation; either version 3, or (at your option)
00009 // any later version.
00010 
00011 // This library is distributed in the hope that it will be useful,
00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014 // GNU General Public License for more details.
00015 
00016 // Under Section 7 of GPL version 3, you are granted additional
00017 // permissions described in the GCC Runtime Library Exception, version
00018 // 3.1, as published by the Free Software Foundation.
00019 
00020 // You should have received a copy of the GNU General Public License and
00021 // a copy of the GCC Runtime Library Exception along with this program;
00022 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
00023 // <http://www.gnu.org/licenses/>.
00024 
00025 /**
00026  *  @file bits/regex_scanner.h
00027  *  This is an internal header file, included by other library headers.
00028  *  Do not attempt to use it directly. @headername{regex}
00029  */
00030 
00031 namespace std _GLIBCXX_VISIBILITY(default)
00032 {
00033 namespace __detail
00034 {
00035 _GLIBCXX_BEGIN_NAMESPACE_VERSION
00036 
00037   /**
00038    * @addtogroup regex-detail
00039    * @{
00040    */
00041 
00042   struct _ScannerBase
00043   {
00044   public:
00045     /// Token types returned from the scanner.
00046     enum _TokenT : unsigned
00047     {
00048       _S_token_anychar,
00049       _S_token_ord_char,
00050       _S_token_oct_num,
00051       _S_token_hex_num,
00052       _S_token_backref,
00053       _S_token_subexpr_begin,
00054       _S_token_subexpr_no_group_begin,
00055       _S_token_subexpr_lookahead_begin, // neg if _M_value[0] == 'n'
00056       _S_token_subexpr_end,
00057       _S_token_bracket_begin,
00058       _S_token_bracket_neg_begin,
00059       _S_token_bracket_end,
00060       _S_token_interval_begin,
00061       _S_token_interval_end,
00062       _S_token_quoted_class,
00063       _S_token_char_class_name,
00064       _S_token_collsymbol,
00065       _S_token_equiv_class_name,
00066       _S_token_opt,
00067       _S_token_or,
00068       _S_token_closure0,
00069       _S_token_closure1,
00070       _S_token_line_begin,
00071       _S_token_line_end,
00072       _S_token_word_bound, // neg if _M_value[0] == 'n'
00073       _S_token_comma,
00074       _S_token_dup_count,
00075       _S_token_eof,
00076       _S_token_bracket_dash,
00077       _S_token_unknown = -1u
00078     };
00079 
00080   protected:
00081     typedef regex_constants::syntax_option_type _FlagT;
00082 
00083     enum _StateT
00084     {
00085       _S_state_normal,
00086       _S_state_in_brace,
00087       _S_state_in_bracket,
00088     };
00089 
00090   protected:
00091     _ScannerBase(_FlagT __flags)
00092     : _M_state(_S_state_normal),
00093     _M_flags(__flags),
00094     _M_escape_tbl(_M_is_ecma()
00095                   ? _M_ecma_escape_tbl
00096                   : _M_awk_escape_tbl),
00097     _M_spec_char(_M_is_ecma()
00098                  ? _M_ecma_spec_char
00099                  : _M_flags & regex_constants::basic
00100                  ? _M_basic_spec_char
00101                  : _M_flags & regex_constants::extended
00102                  ? _M_extended_spec_char
00103                  : _M_flags & regex_constants::grep
00104                  ?  ".[\\*^$\n"
00105                  : _M_flags & regex_constants::egrep
00106                  ? ".[\\()*+?{|^$\n"
00107                  : _M_flags & regex_constants::awk
00108                  ? _M_extended_spec_char
00109                  : nullptr),
00110     _M_at_bracket_start(false)
00111     { __glibcxx_assert(_M_spec_char); }
00112 
00113   protected:
00114     const char*
00115     _M_find_escape(char __c)
00116     {
00117       auto __it = _M_escape_tbl;
00118       for (; __it->first != '\0'; ++__it)
00119         if (__it->first == __c)
00120           return &__it->second;
00121       return nullptr;
00122     }
00123 
00124     bool
00125     _M_is_ecma() const
00126     { return _M_flags & regex_constants::ECMAScript; }
00127 
00128     bool
00129     _M_is_basic() const
00130     { return _M_flags & (regex_constants::basic | regex_constants::grep); }
00131 
00132     bool
00133     _M_is_extended() const
00134     {
00135       return _M_flags & (regex_constants::extended
00136                          | regex_constants::egrep
00137                          | regex_constants::awk);
00138     }
00139 
00140     bool
00141     _M_is_grep() const
00142     { return _M_flags & (regex_constants::grep | regex_constants::egrep); }
00143 
00144     bool
00145     _M_is_awk() const
00146     { return _M_flags & regex_constants::awk; }
00147 
00148   protected:
00149     // TODO: Make them static in the next abi change.
00150     const std::pair<char, _TokenT> _M_token_tbl[9] =
00151       {
00152         {'^', _S_token_line_begin},
00153         {'$', _S_token_line_end},
00154         {'.', _S_token_anychar},
00155         {'*', _S_token_closure0},
00156         {'+', _S_token_closure1},
00157         {'?', _S_token_opt},
00158         {'|', _S_token_or},
00159         {'\n', _S_token_or}, // grep and egrep
00160         {'\0', _S_token_or},
00161       };
00162     const std::pair<char, char> _M_ecma_escape_tbl[8] =
00163       {
00164         {'0', '\0'},
00165         {'b', '\b'},
00166         {'f', '\f'},
00167         {'n', '\n'},
00168         {'r', '\r'},
00169         {'t', '\t'},
00170         {'v', '\v'},
00171         {'\0', '\0'},
00172       };
00173     const std::pair<char, char> _M_awk_escape_tbl[11] =
00174       {
00175         {'"', '"'},
00176         {'/', '/'},
00177         {'\\', '\\'},
00178         {'a', '\a'},
00179         {'b', '\b'},
00180         {'f', '\f'},
00181         {'n', '\n'},
00182         {'r', '\r'},
00183         {'t', '\t'},
00184         {'v', '\v'},
00185         {'\0', '\0'},
00186       };
00187     const char* _M_ecma_spec_char = "^$\\.*+?()[]{}|";
00188     const char* _M_basic_spec_char = ".[\\*^$";
00189     const char* _M_extended_spec_char = ".[\\()*+?{|^$";
00190 
00191     _StateT                       _M_state;
00192     _FlagT                        _M_flags;
00193     _TokenT                       _M_token;
00194     const std::pair<char, char>*  _M_escape_tbl;
00195     const char*                   _M_spec_char;
00196     bool                          _M_at_bracket_start;
00197   };
00198 
00199   /**
00200    * @brief Scans an input range for regex tokens.
00201    *
00202    * The %_Scanner class interprets the regular expression pattern in
00203    * the input range passed to its constructor as a sequence of parse
00204    * tokens passed to the regular expression compiler.  The sequence
00205    * of tokens provided depends on the flag settings passed to the
00206    * constructor: different regular expression grammars will interpret
00207    * the same input pattern in syntactically different ways.
00208    */
00209   template<typename _CharT>
00210     class _Scanner
00211     : public _ScannerBase
00212     {
00213     public:
00214       typedef const _CharT*                                       _IterT;
00215       typedef std::basic_string<_CharT>                           _StringT;
00216       typedef regex_constants::syntax_option_type                 _FlagT;
00217       typedef const std::ctype<_CharT>                            _CtypeT;
00218 
00219       _Scanner(_IterT __begin, _IterT __end,
00220                _FlagT __flags, std::locale __loc);
00221 
00222       void
00223       _M_advance();
00224 
00225       _TokenT
00226       _M_get_token() const
00227       { return _M_token; }
00228 
00229       const _StringT&
00230       _M_get_value() const
00231       { return _M_value; }
00232 
00233 #ifdef _GLIBCXX_DEBUG
00234       std::ostream&
00235       _M_print(std::ostream&);
00236 #endif
00237 
00238     private:
00239       void
00240       _M_scan_normal();
00241 
00242       void
00243       _M_scan_in_bracket();
00244 
00245       void
00246       _M_scan_in_brace();
00247 
00248       void
00249       _M_eat_escape_ecma();
00250 
00251       void
00252       _M_eat_escape_posix();
00253 
00254       void
00255       _M_eat_escape_awk();
00256 
00257       void
00258       _M_eat_class(char);
00259 
00260       _IterT                        _M_current;
00261       _IterT                        _M_end;
00262       _CtypeT&                      _M_ctype;
00263       _StringT                      _M_value;
00264       void (_Scanner::* _M_eat_escape)();
00265     };
00266 
00267  //@} regex-detail
00268 _GLIBCXX_END_NAMESPACE_VERSION
00269 } // namespace __detail
00270 } // namespace std
00271 
00272 #include <bits/regex_scanner.tcc>