libstdc++
regex_scanner.h
Go to the documentation of this file.
1 // class template regex -*- C++ -*-
2 
3 // Copyright (C) 2013-2022 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10 
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 /**
26  * @file bits/regex_scanner.h
27  * This is an internal header file, included by other library headers.
28  * Do not attempt to use it directly. @headername{regex}
29  */
30 
31 namespace std _GLIBCXX_VISIBILITY(default)
32 {
33 _GLIBCXX_BEGIN_NAMESPACE_VERSION
34 
35 namespace __detail
36 {
37  /**
38  * @addtogroup regex-detail
39  * @{
40  */
41 
42  struct _ScannerBase
43  {
44  public:
45  /// Token types returned from the scanner.
46  enum _TokenT : unsigned
47  {
48  _S_token_anychar,
49  _S_token_ord_char,
50  _S_token_oct_num,
51  _S_token_hex_num,
52  _S_token_backref,
53  _S_token_subexpr_begin,
54  _S_token_subexpr_no_group_begin,
55  _S_token_subexpr_lookahead_begin, // neg if _M_value[0] == 'n'
56  _S_token_subexpr_end,
57  _S_token_bracket_begin,
58  _S_token_bracket_neg_begin,
59  _S_token_bracket_end,
60  _S_token_interval_begin,
61  _S_token_interval_end,
62  _S_token_quoted_class,
63  _S_token_char_class_name,
64  _S_token_collsymbol,
65  _S_token_equiv_class_name,
66  _S_token_opt,
67  _S_token_or,
68  _S_token_closure0,
69  _S_token_closure1,
70  _S_token_line_begin,
71  _S_token_line_end,
72  _S_token_word_bound, // neg if _M_value[0] == 'n'
73  _S_token_comma,
74  _S_token_dup_count,
75  _S_token_eof,
76  _S_token_bracket_dash,
77  _S_token_unknown = -1u
78  };
79 
80  protected:
82 
83  enum _StateT
84  {
85  _S_state_normal,
86  _S_state_in_brace,
87  _S_state_in_bracket,
88  };
89 
90  protected:
91  _ScannerBase(_FlagT __flags)
92  : _M_state(_S_state_normal),
93  _M_flags(__flags),
94  _M_escape_tbl(_M_is_ecma()
95  ? _M_ecma_escape_tbl
96  : _M_awk_escape_tbl),
97  _M_spec_char(_M_is_ecma()
98  ? _M_ecma_spec_char
99  : _M_flags & regex_constants::basic
100  ? _M_basic_spec_char
101  : _M_flags & regex_constants::extended
102  ? _M_extended_spec_char
103  : _M_flags & regex_constants::grep
104  ? ".[\\*^$\n"
105  : _M_flags & regex_constants::egrep
106  ? ".[\\()*+?{|^$\n"
107  : _M_flags & regex_constants::awk
108  ? _M_extended_spec_char
109  : nullptr),
110  _M_at_bracket_start(false)
111  { __glibcxx_assert(_M_spec_char); }
112 
113  protected:
114  const char*
115  _M_find_escape(char __c)
116  {
117  auto __it = _M_escape_tbl;
118  for (; __it->first != '\0'; ++__it)
119  if (__it->first == __c)
120  return &__it->second;
121  return nullptr;
122  }
123 
124  bool
125  _M_is_ecma() const
126  { return _M_flags & regex_constants::ECMAScript; }
127 
128  bool
129  _M_is_basic() const
130  { return _M_flags & (regex_constants::basic | regex_constants::grep); }
131 
132  bool
133  _M_is_extended() const
134  {
135  return _M_flags & (regex_constants::extended
138  }
139 
140  bool
141  _M_is_grep() const
142  { return _M_flags & (regex_constants::grep | regex_constants::egrep); }
143 
144  bool
145  _M_is_awk() const
146  { return _M_flags & regex_constants::awk; }
147 
148  protected:
149  // TODO: Make them static in the next abi change.
150  const std::pair<char, _TokenT> _M_token_tbl[9] =
151  {
152  {'^', _S_token_line_begin},
153  {'$', _S_token_line_end},
154  {'.', _S_token_anychar},
155  {'*', _S_token_closure0},
156  {'+', _S_token_closure1},
157  {'?', _S_token_opt},
158  {'|', _S_token_or},
159  {'\n', _S_token_or}, // grep and egrep
160  {'\0', _S_token_or},
161  };
162  const std::pair<char, char> _M_ecma_escape_tbl[8] =
163  {
164  {'0', '\0'},
165  {'b', '\b'},
166  {'f', '\f'},
167  {'n', '\n'},
168  {'r', '\r'},
169  {'t', '\t'},
170  {'v', '\v'},
171  {'\0', '\0'},
172  };
173  const std::pair<char, char> _M_awk_escape_tbl[11] =
174  {
175  {'"', '"'},
176  {'/', '/'},
177  {'\\', '\\'},
178  {'a', '\a'},
179  {'b', '\b'},
180  {'f', '\f'},
181  {'n', '\n'},
182  {'r', '\r'},
183  {'t', '\t'},
184  {'v', '\v'},
185  {'\0', '\0'},
186  };
187  const char* _M_ecma_spec_char = "^$\\.*+?()[]{}|";
188  const char* _M_basic_spec_char = ".[\\*^$";
189  const char* _M_extended_spec_char = ".[\\()*+?{|^$";
190 
191  _StateT _M_state;
192  _FlagT _M_flags;
193  _TokenT _M_token;
194  const std::pair<char, char>* _M_escape_tbl;
195  const char* _M_spec_char;
196  bool _M_at_bracket_start;
197  };
198 
199  /**
200  * @brief Scans an input range for regex tokens.
201  *
202  * The %_Scanner class interprets the regular expression pattern in
203  * the input range passed to its constructor as a sequence of parse
204  * tokens passed to the regular expression compiler. The sequence
205  * of tokens provided depends on the flag settings passed to the
206  * constructor: different regular expression grammars will interpret
207  * the same input pattern in syntactically different ways.
208  */
209  template<typename _CharT>
210  class _Scanner
211  : public _ScannerBase
212  {
213  public:
216  typedef const std::ctype<_CharT> _CtypeT;
217 
218  _Scanner(const _CharT* __begin, const _CharT* __end,
219  _FlagT __flags, std::locale __loc);
220 
221  void
222  _M_advance();
223 
224  _TokenT
225  _M_get_token() const noexcept
226  { return _M_token; }
227 
228  const _StringT&
229  _M_get_value() const noexcept
230  { return _M_value; }
231 
232 #ifdef _GLIBCXX_DEBUG
233  std::ostream&
234  _M_print(std::ostream&);
235 #endif
236 
237  private:
238  void
239  _M_scan_normal();
240 
241  void
242  _M_scan_in_bracket();
243 
244  void
245  _M_scan_in_brace();
246 
247  void
248  _M_eat_escape_ecma();
249 
250  void
251  _M_eat_escape_posix();
252 
253  void
254  _M_eat_escape_awk();
255 
256  void
257  _M_eat_class(char);
258 
259  const _CharT* _M_current;
260  const _CharT* _M_end;
261  _CtypeT& _M_ctype;
262  _StringT _M_value;
263  void (_Scanner::* _M_eat_escape)();
264  };
265 
266  ///@} regex-detail
267 } // namespace __detail
268 _GLIBCXX_END_NAMESPACE_VERSION
269 } // namespace std
270 
271 #include <bits/regex_scanner.tcc>
The top-level namespace for ISO C++ entities is std.
constexpr syntax_option_type ECMAScript
constexpr syntax_option_type egrep
syntax_option_type
This is a bitmask type indicating how to interpret the regex.
constexpr syntax_option_type awk
constexpr syntax_option_type extended
constexpr syntax_option_type basic
constexpr syntax_option_type grep
Container class for localization functionality.
Primary class template ctype facet.
Scans an input range for regex tokens.