libstdc++
regex_scanner.tcc
Go to the documentation of this file.
1 // class template regex -*- C++ -*-
2 
3 // Copyright (C) 2013-2022 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10 
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 /**
26  * @file bits/regex_scanner.tcc
27  * This is an internal header file, included by other library headers.
28  * Do not attempt to use it directly. @headername{regex}
29  */
30 
31 // FIXME make comments doxygen format.
32 
33 // N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep
34 // and awk
35 // 1) grep is basic except '\n' is treated as '|'
36 // 2) egrep is extended except '\n' is treated as '|'
37 // 3) awk is extended except special escaping rules, and there's no
38 // back-reference.
39 //
40 // References:
41 //
42 // ECMAScript: ECMA-262 15.10
43 //
44 // basic, extended:
45 // http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html
46 //
47 // awk: http://pubs.opengroup.org/onlinepubs/000095399/utilities/awk.html
48 
49 namespace std _GLIBCXX_VISIBILITY(default)
50 {
51 _GLIBCXX_BEGIN_NAMESPACE_VERSION
52 
53 namespace __detail
54 {
55  template<typename _CharT>
56  _Scanner<_CharT>::
57  _Scanner(const _CharT* __begin, const _CharT* __end,
58  _FlagT __flags, std::locale __loc)
59  : _ScannerBase(__flags),
60  _M_current(__begin), _M_end(__end),
61  _M_ctype(std::use_facet<_CtypeT>(__loc)),
62  _M_eat_escape(_M_is_ecma()
63  ? &_Scanner::_M_eat_escape_ecma
64  : &_Scanner::_M_eat_escape_posix)
65  { _M_advance(); }
66 
67  template<typename _CharT>
68  void
69  _Scanner<_CharT>::
70  _M_advance()
71  {
72  if (_M_current == _M_end)
73  {
74  _M_token = _S_token_eof;
75  return;
76  }
77 
78  if (_M_state == _S_state_normal)
79  _M_scan_normal();
80  else if (_M_state == _S_state_in_bracket)
81  _M_scan_in_bracket();
82  else if (_M_state == _S_state_in_brace)
83  _M_scan_in_brace();
84  else
85  {
86  __glibcxx_assert(!"unexpected state while processing regex");
87  }
88  }
89 
90  // Differences between styles:
91  // 1) "\‍(", "\‍)", "\{" in basic. It's not escaping.
92  // 2) "(?:", "(?=", "(?!" in ECMAScript.
93  template<typename _CharT>
94  void
95  _Scanner<_CharT>::
96  _M_scan_normal()
97  {
98  auto __c = *_M_current++;
99 
100  if (__builtin_strchr(_M_spec_char, _M_ctype.narrow(__c, ' ')) == nullptr)
101  {
102  _M_token = _S_token_ord_char;
103  _M_value.assign(1, __c);
104  return;
105  }
106  if (__c == '\\')
107  {
108  if (_M_current == _M_end)
109  __throw_regex_error(
111  "Invalid escape at end of regular expression");
112 
113  if (!_M_is_basic()
114  || (*_M_current != '('
115  && *_M_current != ')'
116  && *_M_current != '{'))
117  {
118  (this->*_M_eat_escape)();
119  return;
120  }
121  __c = *_M_current++;
122  }
123  if (__c == '(')
124  {
125  if (_M_is_ecma() && *_M_current == '?')
126  {
127  if (++_M_current == _M_end)
128  __throw_regex_error(regex_constants::error_paren);
129 
130  if (*_M_current == ':')
131  {
132  ++_M_current;
133  _M_token = _S_token_subexpr_no_group_begin;
134  }
135  else if (*_M_current == '=')
136  {
137  ++_M_current;
138  _M_token = _S_token_subexpr_lookahead_begin;
139  _M_value.assign(1, 'p');
140  }
141  else if (*_M_current == '!')
142  {
143  ++_M_current;
144  _M_token = _S_token_subexpr_lookahead_begin;
145  _M_value.assign(1, 'n');
146  }
147  else
148  __throw_regex_error(regex_constants::error_paren,
149  "Invalid '(?...)' zero-width assertion "
150  "in regular expression");
151  }
152  else if (_M_flags & regex_constants::nosubs)
153  _M_token = _S_token_subexpr_no_group_begin;
154  else
155  _M_token = _S_token_subexpr_begin;
156  }
157  else if (__c == ')')
158  _M_token = _S_token_subexpr_end;
159  else if (__c == '[')
160  {
161  _M_state = _S_state_in_bracket;
162  _M_at_bracket_start = true;
163  if (_M_current != _M_end && *_M_current == '^')
164  {
165  _M_token = _S_token_bracket_neg_begin;
166  ++_M_current;
167  }
168  else
169  _M_token = _S_token_bracket_begin;
170  }
171  else if (__c == '{')
172  {
173  _M_state = _S_state_in_brace;
174  _M_token = _S_token_interval_begin;
175  }
176  else if (__builtin_expect(__c == _CharT(0), false))
177  {
178  if (!_M_is_ecma())
179  __throw_regex_error(regex_constants::_S_null);
180  _M_token = _S_token_ord_char;
181  _M_value.assign(1, __c);
182  }
183  else if (__c != ']' && __c != '}')
184  {
185  auto __it = _M_token_tbl;
186  auto __narrowc = _M_ctype.narrow(__c, '\0');
187  for (; __it->first != '\0'; ++__it)
188  if (__it->first == __narrowc)
189  {
190  _M_token = __it->second;
191  return;
192  }
193  __glibcxx_assert(!"unexpected special character in regex");
194  }
195  else
196  {
197  _M_token = _S_token_ord_char;
198  _M_value.assign(1, __c);
199  }
200  }
201 
202  // Differences between styles:
203  // 1) different semantics of "[]" and "[^]".
204  // 2) Escaping in bracket expr.
205  template<typename _CharT>
206  void
207  _Scanner<_CharT>::
208  _M_scan_in_bracket()
209  {
210  if (_M_current == _M_end)
211  __throw_regex_error(regex_constants::error_brack);
212 
213  auto __c = *_M_current++;
214 
215  if (__c == '-')
216  _M_token = _S_token_bracket_dash;
217  else if (__c == '[')
218  {
219  if (_M_current == _M_end)
220  __throw_regex_error(regex_constants::error_brack,
221  "Incomplete '[[' character class in "
222  "regular expression");
223 
224  if (*_M_current == '.')
225  {
226  _M_token = _S_token_collsymbol;
227  _M_eat_class(*_M_current++);
228  }
229  else if (*_M_current == ':')
230  {
231  _M_token = _S_token_char_class_name;
232  _M_eat_class(*_M_current++);
233  }
234  else if (*_M_current == '=')
235  {
236  _M_token = _S_token_equiv_class_name;
237  _M_eat_class(*_M_current++);
238  }
239  else
240  {
241  _M_token = _S_token_ord_char;
242  _M_value.assign(1, __c);
243  }
244  }
245  // In POSIX, when encountering "[]" or "[^]", the ']' is interpreted
246  // literally. So "[]]" and "[^]]" are valid regexes. See the testcases
247  // `.../empty_range.cc`.
248  else if (__c == ']' && (_M_is_ecma() || !_M_at_bracket_start))
249  {
250  _M_token = _S_token_bracket_end;
251  _M_state = _S_state_normal;
252  }
253  // ECMAScript and awk permits escaping in bracket.
254  else if (__c == '\\' && (_M_is_ecma() || _M_is_awk()))
255  (this->*_M_eat_escape)();
256  else
257  {
258  _M_token = _S_token_ord_char;
259  _M_value.assign(1, __c);
260  }
261  _M_at_bracket_start = false;
262  }
263 
264  // Differences between styles:
265  // 1) "\}" in basic style.
266  template<typename _CharT>
267  void
268  _Scanner<_CharT>::
269  _M_scan_in_brace()
270  {
271  if (_M_current == _M_end)
272  __throw_regex_error(regex_constants::error_brace);
273 
274  auto __c = *_M_current++;
275 
276  if (_M_ctype.is(_CtypeT::digit, __c))
277  {
278  _M_token = _S_token_dup_count;
279  _M_value.assign(1, __c);
280  while (_M_current != _M_end
281  && _M_ctype.is(_CtypeT::digit, *_M_current))
282  _M_value += *_M_current++;
283  }
284  else if (__c == ',')
285  _M_token = _S_token_comma;
286  // basic use \}.
287  else if (_M_is_basic())
288  {
289  if (__c == '\\' && _M_current != _M_end && *_M_current == '}')
290  {
291  _M_state = _S_state_normal;
292  _M_token = _S_token_interval_end;
293  ++_M_current;
294  }
295  else
296  __throw_regex_error(regex_constants::error_badbrace);
297  }
298  else if (__c == '}')
299  {
300  _M_state = _S_state_normal;
301  _M_token = _S_token_interval_end;
302  }
303  else
304  __throw_regex_error(regex_constants::error_badbrace);
305  }
306 
307  template<typename _CharT>
308  void
309  _Scanner<_CharT>::
310  _M_eat_escape_ecma()
311  {
312  if (_M_current == _M_end)
313  __throw_regex_error(regex_constants::error_escape);
314 
315  auto __c = *_M_current++;
316  auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
317 
318  if (__pos != nullptr && (__c != 'b' || _M_state == _S_state_in_bracket))
319  {
320  _M_token = _S_token_ord_char;
321  _M_value.assign(1, *__pos);
322  }
323  else if (__c == 'b')
324  {
325  _M_token = _S_token_word_bound;
326  _M_value.assign(1, 'p');
327  }
328  else if (__c == 'B')
329  {
330  _M_token = _S_token_word_bound;
331  _M_value.assign(1, 'n');
332  }
333  // N3376 28.13
334  else if (__c == 'd'
335  || __c == 'D'
336  || __c == 's'
337  || __c == 'S'
338  || __c == 'w'
339  || __c == 'W')
340  {
341  _M_token = _S_token_quoted_class;
342  _M_value.assign(1, __c);
343  }
344  else if (__c == 'c')
345  {
346  if (_M_current == _M_end)
347  __throw_regex_error(regex_constants::error_escape,
348  "invalid '\\cX' control character in "
349  "regular expression");
350  _M_token = _S_token_ord_char;
351  _M_value.assign(1, *_M_current++);
352  }
353  else if (__c == 'x' || __c == 'u')
354  {
355  _M_value.clear();
356  const int __n = __c == 'x' ? 2 : 4;
357  for (int __i = 0; __i < __n; __i++)
358  {
359  if (_M_current == _M_end
360  || !_M_ctype.is(_CtypeT::xdigit, *_M_current))
361  __throw_regex_error(regex_constants::error_escape,
362  __n == 2
363  ? "Invalid '\\xNN' control character in "
364  "regular expression"
365  : "Invalid '\\uNNNN' control character in "
366  "regular expression");
367  _M_value += *_M_current++;
368  }
369  _M_token = _S_token_hex_num;
370  }
371  // ECMAScript recognizes multi-digit back-references.
372  else if (_M_ctype.is(_CtypeT::digit, __c))
373  {
374  _M_value.assign(1, __c);
375  while (_M_current != _M_end
376  && _M_ctype.is(_CtypeT::digit, *_M_current))
377  _M_value += *_M_current++;
378  _M_token = _S_token_backref;
379  }
380  else
381  {
382  _M_token = _S_token_ord_char;
383  _M_value.assign(1, __c);
384  }
385  }
386 
387  // Differences between styles:
388  // 1) Extended doesn't support backref, but basic does.
389  template<typename _CharT>
390  void
391  _Scanner<_CharT>::
392  _M_eat_escape_posix()
393  {
394  if (_M_current == _M_end)
395  __throw_regex_error(regex_constants::error_escape);
396 
397  auto __c = *_M_current;
398  auto __pos = __builtin_strchr(_M_spec_char, _M_ctype.narrow(__c, '\0'));
399 
400  if (__pos != nullptr && *__pos != '\0')
401  {
402  _M_token = _S_token_ord_char;
403  _M_value.assign(1, __c);
404  }
405  // We MUST judge awk before handling backrefs. There's no backref in awk.
406  else if (_M_is_awk())
407  {
408  _M_eat_escape_awk();
409  return;
410  }
411  else if (_M_is_basic() && _M_ctype.is(_CtypeT::digit, __c) && __c != '0')
412  {
413  _M_token = _S_token_backref;
414  _M_value.assign(1, __c);
415  }
416  else
417  {
418 #ifdef __STRICT_ANSI__
419  // POSIX says it is undefined to escape ordinary characters
420  __throw_regex_error(regex_constants::error_escape);
421 #else
422  _M_token = _S_token_ord_char;
423  _M_value.assign(1, __c);
424 #endif
425  }
426  ++_M_current;
427  }
428 
429  template<typename _CharT>
430  void
431  _Scanner<_CharT>::
432  _M_eat_escape_awk()
433  {
434  auto __c = *_M_current++;
435  auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
436 
437  if (__pos != nullptr)
438  {
439  _M_token = _S_token_ord_char;
440  _M_value.assign(1, *__pos);
441  }
442  // \ddd for oct representation
443  else if (_M_ctype.is(_CtypeT::digit, __c)
444  && __c != '8'
445  && __c != '9')
446  {
447  _M_value.assign(1, __c);
448  for (int __i = 0;
449  __i < 2
450  && _M_current != _M_end
451  && _M_ctype.is(_CtypeT::digit, *_M_current)
452  && *_M_current != '8'
453  && *_M_current != '9';
454  __i++)
455  _M_value += *_M_current++;
456  _M_token = _S_token_oct_num;
457  return;
458  }
459  else
460  __throw_regex_error(regex_constants::error_escape);
461  }
462 
463  // Eats a character class or throws an exception.
464  // __ch could be ':', '.' or '=', _M_current is the char after ']' when
465  // returning.
466  template<typename _CharT>
467  void
468  _Scanner<_CharT>::
469  _M_eat_class(char __ch)
470  {
471  for (_M_value.clear(); _M_current != _M_end && *_M_current != __ch;)
472  _M_value += *_M_current++;
473  if (_M_current == _M_end
474  || *_M_current++ != __ch
475  || _M_current == _M_end // skip __ch
476  || *_M_current++ != ']') // skip ']'
477  {
478  __throw_regex_error(__ch == ':' ? regex_constants::error_ctype
480  }
481  }
482 
483 #ifdef _GLIBCXX_DEBUG
484  template<typename _CharT>
485  std::ostream&
486  _Scanner<_CharT>::
487  _M_print(std::ostream& ostr)
488  {
489  switch (_M_token)
490  {
491  case _S_token_anychar:
492  ostr << "any-character\n";
493  break;
494  case _S_token_backref:
495  ostr << "backref\n";
496  break;
497  case _S_token_bracket_begin:
498  ostr << "bracket-begin\n";
499  break;
500  case _S_token_bracket_neg_begin:
501  ostr << "bracket-neg-begin\n";
502  break;
503  case _S_token_bracket_end:
504  ostr << "bracket-end\n";
505  break;
506  case _S_token_char_class_name:
507  ostr << "char-class-name \"" << _M_value << "\"\n";
508  break;
509  case _S_token_closure0:
510  ostr << "closure0\n";
511  break;
512  case _S_token_closure1:
513  ostr << "closure1\n";
514  break;
515  case _S_token_collsymbol:
516  ostr << "collsymbol \"" << _M_value << "\"\n";
517  break;
518  case _S_token_comma:
519  ostr << "comma\n";
520  break;
521  case _S_token_dup_count:
522  ostr << "dup count: " << _M_value << "\n";
523  break;
524  case _S_token_eof:
525  ostr << "EOF\n";
526  break;
527  case _S_token_equiv_class_name:
528  ostr << "equiv-class-name \"" << _M_value << "\"\n";
529  break;
530  case _S_token_interval_begin:
531  ostr << "interval begin\n";
532  break;
533  case _S_token_interval_end:
534  ostr << "interval end\n";
535  break;
536  case _S_token_line_begin:
537  ostr << "line begin\n";
538  break;
539  case _S_token_line_end:
540  ostr << "line end\n";
541  break;
542  case _S_token_opt:
543  ostr << "opt\n";
544  break;
545  case _S_token_or:
546  ostr << "or\n";
547  break;
548  case _S_token_ord_char:
549  ostr << "ordinary character: \"" << _M_value << "\"\n";
550  break;
551  case _S_token_subexpr_begin:
552  ostr << "subexpr begin\n";
553  break;
554  case _S_token_subexpr_no_group_begin:
555  ostr << "no grouping subexpr begin\n";
556  break;
557  case _S_token_subexpr_lookahead_begin:
558  ostr << "lookahead subexpr begin\n";
559  break;
560  case _S_token_subexpr_end:
561  ostr << "subexpr end\n";
562  break;
563  case _S_token_unknown:
564  ostr << "-- unknown token --\n";
565  break;
566  case _S_token_oct_num:
567  ostr << "oct number " << _M_value << "\n";
568  break;
569  case _S_token_hex_num:
570  ostr << "hex number " << _M_value << "\n";
571  break;
572  case _S_token_quoted_class:
573  ostr << "quoted class " << "\\" << _M_value << "\n";
574  break;
575  default:
576  _GLIBCXX_DEBUG_ASSERT(false);
577  }
578  return ostr;
579  }
580 #endif
581 
582 } // namespace __detail
583 _GLIBCXX_END_NAMESPACE_VERSION
584 } // namespace
const _Facet & use_facet(const locale &__loc)
Return a facet.
ISO C++ entities toplevel namespace is std.
constexpr error_type error_ctype(_S_error_ctype)
constexpr error_type error_brace(_S_error_brace)
constexpr error_type error_badbrace(_S_error_badbrace)
constexpr error_type error_escape(_S_error_escape)
constexpr error_type error_paren(_S_error_paren)
constexpr error_type error_brack(_S_error_brack)
constexpr syntax_option_type nosubs
constexpr error_type error_collate(_S_error_collate)
Container class for localization functionality.