regex_automaton.h 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400
  1. // class template regex -*- C++ -*-
  2. // Copyright (C) 2013-2022 Free Software Foundation, Inc.
  3. //
  4. // This file is part of the GNU ISO C++ Library. This library is free
  5. // software; you can redistribute it and/or modify it under the
  6. // terms of the GNU General Public License as published by the
  7. // Free Software Foundation; either version 3, or (at your option)
  8. // any later version.
  9. // This library is distributed in the hope that it will be useful,
  10. // but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. // GNU General Public License for more details.
  13. // Under Section 7 of GPL version 3, you are granted additional
  14. // permissions described in the GCC Runtime Library Exception, version
  15. // 3.1, as published by the Free Software Foundation.
  16. // You should have received a copy of the GNU General Public License and
  17. // a copy of the GCC Runtime Library Exception along with this program;
  18. // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
  19. // <http://www.gnu.org/licenses/>.
  20. /**
  21. * @file bits/regex_automaton.h
  22. * This is an internal header file, included by other library headers.
  23. * Do not attempt to use it directly. @headername{regex}
  24. */
  25. // This macro defines the maximal state number a NFA can have.
  26. #ifndef _GLIBCXX_REGEX_STATE_LIMIT
  27. #define _GLIBCXX_REGEX_STATE_LIMIT 100000
  28. #endif
  29. namespace std _GLIBCXX_VISIBILITY(default)
  30. {
  31. _GLIBCXX_BEGIN_NAMESPACE_VERSION
  32. namespace __detail
  33. {
  34. /**
  35. * @defgroup regex-detail Base and Implementation Classes
  36. * @ingroup regex
  37. * @{
  38. */
  39. typedef long _StateIdT;
  40. static const _StateIdT _S_invalid_state_id = -1;
  41. template<typename _CharT>
  42. using _Matcher = std::function<bool (_CharT)>;
  43. /// Operation codes that define the type of transitions within the base NFA
  44. /// that represents the regular expression.
  45. enum _Opcode : int
  46. {
  47. _S_opcode_unknown,
  48. _S_opcode_alternative,
  49. _S_opcode_repeat,
  50. _S_opcode_backref,
  51. _S_opcode_line_begin_assertion,
  52. _S_opcode_line_end_assertion,
  53. _S_opcode_word_boundary,
  54. _S_opcode_subexpr_lookahead,
  55. _S_opcode_subexpr_begin,
  56. _S_opcode_subexpr_end,
  57. _S_opcode_dummy,
  58. _S_opcode_match,
  59. _S_opcode_accept,
  60. };
  61. struct _State_base
  62. {
  63. protected:
  64. _Opcode _M_opcode; // type of outgoing transition
  65. public:
  66. _StateIdT _M_next; // outgoing transition
  67. union // Since they are mutually exclusive.
  68. {
  69. size_t _M_subexpr; // for _S_opcode_subexpr_*
  70. size_t _M_backref_index; // for _S_opcode_backref
  71. struct
  72. {
  73. // for _S_opcode_alternative, _S_opcode_repeat and
  74. // _S_opcode_subexpr_lookahead
  75. _StateIdT _M_alt;
  76. // for _S_opcode_word_boundary or _S_opcode_subexpr_lookahead or
  77. // quantifiers (ungreedy if set true)
  78. bool _M_neg;
  79. };
  80. // For _S_opcode_match
  81. __gnu_cxx::__aligned_membuf<_Matcher<char>> _M_matcher_storage;
  82. };
  83. protected:
  84. explicit _State_base(_Opcode __opcode) noexcept
  85. : _M_opcode(__opcode), _M_next(_S_invalid_state_id)
  86. { }
  87. public:
  88. bool
  89. _M_has_alt() const noexcept
  90. {
  91. return _M_opcode == _S_opcode_alternative
  92. || _M_opcode == _S_opcode_repeat
  93. || _M_opcode == _S_opcode_subexpr_lookahead;
  94. }
  95. #ifdef _GLIBCXX_DEBUG
  96. std::ostream&
  97. _M_print(std::ostream& ostr) const;
  98. // Prints graphviz dot commands for state.
  99. std::ostream&
  100. _M_dot(std::ostream& __ostr, _StateIdT __id) const;
  101. #endif
  102. };
  103. template<typename _Char_type>
  104. struct _State : _State_base
  105. {
  106. typedef _Matcher<_Char_type> _MatcherT;
  107. static_assert(sizeof(_MatcherT) == sizeof(_Matcher<char>),
  108. "std::function<bool(T)> has the same size as "
  109. "std::function<bool(char)>");
  110. static_assert(alignof(_MatcherT) == alignof(_Matcher<char>),
  111. "std::function<bool(T)> has the same alignment as "
  112. "std::function<bool(char)>");
  113. explicit
  114. _State(_Opcode __opcode) noexcept : _State_base(__opcode)
  115. {
  116. if (_M_opcode() == _S_opcode_match)
  117. new (this->_M_matcher_storage._M_addr()) _MatcherT();
  118. }
  119. _State(const _State& __rhs) : _State_base(__rhs)
  120. {
  121. if (__rhs._M_opcode() == _S_opcode_match)
  122. new (this->_M_matcher_storage._M_addr())
  123. _MatcherT(__rhs._M_get_matcher());
  124. }
  125. _State(_State&& __rhs) noexcept : _State_base(__rhs)
  126. {
  127. if (__rhs._M_opcode() == _S_opcode_match)
  128. new (this->_M_matcher_storage._M_addr())
  129. _MatcherT(std::move(__rhs._M_get_matcher()));
  130. }
  131. _State&
  132. operator=(const _State&) = delete;
  133. ~_State()
  134. {
  135. if (_M_opcode() == _S_opcode_match)
  136. _M_get_matcher().~_MatcherT();
  137. }
  138. // Since correct ctor and dtor rely on _M_opcode, it's better not to
  139. // change it over time.
  140. _Opcode
  141. _M_opcode() const noexcept
  142. { return _State_base::_M_opcode; }
  143. bool
  144. _M_matches(_Char_type __char) const
  145. { return _M_get_matcher()(__char); }
  146. _MatcherT&
  147. _M_get_matcher() noexcept
  148. { return *static_cast<_MatcherT*>(this->_M_matcher_storage._M_addr()); }
  149. const _MatcherT&
  150. _M_get_matcher() const noexcept
  151. {
  152. return *static_cast<const _MatcherT*>(
  153. this->_M_matcher_storage._M_addr());
  154. }
  155. };
  156. struct _NFA_base
  157. {
  158. typedef regex_constants::syntax_option_type _FlagT;
  159. explicit
  160. _NFA_base(_FlagT __f) noexcept
  161. : _M_flags(__f), _M_start_state(0), _M_subexpr_count(0),
  162. _M_has_backref(false)
  163. { }
  164. _NFA_base(_NFA_base&&) = default;
  165. protected:
  166. ~_NFA_base() = default;
  167. public:
  168. _FlagT
  169. _M_options() const noexcept
  170. { return _M_flags; }
  171. _StateIdT
  172. _M_start() const noexcept
  173. { return _M_start_state; }
  174. size_t
  175. _M_sub_count() const noexcept
  176. { return _M_subexpr_count; }
  177. _GLIBCXX_STD_C::vector<size_t> _M_paren_stack;
  178. _FlagT _M_flags;
  179. _StateIdT _M_start_state;
  180. size_t _M_subexpr_count;
  181. bool _M_has_backref;
  182. };
  183. template<typename _TraitsT>
  184. struct _NFA
  185. : _NFA_base, _GLIBCXX_STD_C::vector<_State<typename _TraitsT::char_type>>
  186. {
  187. typedef typename _TraitsT::char_type _Char_type;
  188. typedef _State<_Char_type> _StateT;
  189. typedef _Matcher<_Char_type> _MatcherT;
  190. _NFA(const typename _TraitsT::locale_type& __loc, _FlagT __flags)
  191. : _NFA_base(__flags)
  192. { _M_traits.imbue(__loc); }
  193. // for performance reasons _NFA objects should only be moved not copied
  194. _NFA(const _NFA&) = delete;
  195. _NFA(_NFA&&) = default;
  196. _StateIdT
  197. _M_insert_accept()
  198. {
  199. auto __ret = _M_insert_state(_StateT(_S_opcode_accept));
  200. return __ret;
  201. }
  202. _StateIdT
  203. _M_insert_alt(_StateIdT __next, _StateIdT __alt,
  204. bool __neg __attribute__((__unused__)))
  205. {
  206. _StateT __tmp(_S_opcode_alternative);
  207. // It labels every quantifier to make greedy comparison easier in BFS
  208. // approach.
  209. __tmp._M_next = __next;
  210. __tmp._M_alt = __alt;
  211. return _M_insert_state(std::move(__tmp));
  212. }
  213. _StateIdT
  214. _M_insert_repeat(_StateIdT __next, _StateIdT __alt, bool __neg)
  215. {
  216. _StateT __tmp(_S_opcode_repeat);
  217. // It labels every quantifier to make greedy comparison easier in BFS
  218. // approach.
  219. __tmp._M_next = __next;
  220. __tmp._M_alt = __alt;
  221. __tmp._M_neg = __neg;
  222. return _M_insert_state(std::move(__tmp));
  223. }
  224. _StateIdT
  225. _M_insert_matcher(_MatcherT __m)
  226. {
  227. _StateT __tmp(_S_opcode_match);
  228. __tmp._M_get_matcher() = std::move(__m);
  229. return _M_insert_state(std::move(__tmp));
  230. }
  231. _StateIdT
  232. _M_insert_subexpr_begin()
  233. {
  234. auto __id = this->_M_subexpr_count++;
  235. this->_M_paren_stack.push_back(__id);
  236. _StateT __tmp(_S_opcode_subexpr_begin);
  237. __tmp._M_subexpr = __id;
  238. return _M_insert_state(std::move(__tmp));
  239. }
  240. _StateIdT
  241. _M_insert_subexpr_end()
  242. {
  243. _StateT __tmp(_S_opcode_subexpr_end);
  244. __tmp._M_subexpr = this->_M_paren_stack.back();
  245. this->_M_paren_stack.pop_back();
  246. return _M_insert_state(std::move(__tmp));
  247. }
  248. _StateIdT
  249. _M_insert_backref(size_t __index);
  250. _StateIdT
  251. _M_insert_line_begin()
  252. { return _M_insert_state(_StateT(_S_opcode_line_begin_assertion)); }
  253. _StateIdT
  254. _M_insert_line_end()
  255. { return _M_insert_state(_StateT(_S_opcode_line_end_assertion)); }
  256. _StateIdT
  257. _M_insert_word_bound(bool __neg)
  258. {
  259. _StateT __tmp(_S_opcode_word_boundary);
  260. __tmp._M_neg = __neg;
  261. return _M_insert_state(std::move(__tmp));
  262. }
  263. _StateIdT
  264. _M_insert_lookahead(_StateIdT __alt, bool __neg)
  265. {
  266. _StateT __tmp(_S_opcode_subexpr_lookahead);
  267. __tmp._M_alt = __alt;
  268. __tmp._M_neg = __neg;
  269. return _M_insert_state(std::move(__tmp));
  270. }
  271. _StateIdT
  272. _M_insert_dummy()
  273. { return _M_insert_state(_StateT(_S_opcode_dummy)); }
  274. _StateIdT
  275. _M_insert_state(_StateT __s)
  276. {
  277. this->push_back(std::move(__s));
  278. if (this->size() > _GLIBCXX_REGEX_STATE_LIMIT)
  279. __throw_regex_error(
  280. regex_constants::error_space,
  281. "Number of NFA states exceeds limit. Please use shorter regex "
  282. "string, or use smaller brace expression, or make "
  283. "_GLIBCXX_REGEX_STATE_LIMIT larger.");
  284. return this->size() - 1;
  285. }
  286. // Eliminate dummy node in this NFA to make it compact.
  287. void
  288. _M_eliminate_dummy();
  289. #ifdef _GLIBCXX_DEBUG
  290. std::ostream&
  291. _M_dot(std::ostream& __ostr) const;
  292. #endif
  293. public:
  294. _TraitsT _M_traits;
  295. };
  296. /// Describes a sequence of one or more %_State, its current start
  297. /// and end(s). This structure contains fragments of an NFA during
  298. /// construction.
  299. template<typename _TraitsT>
  300. class _StateSeq
  301. {
  302. public:
  303. typedef _NFA<_TraitsT> _RegexT;
  304. public:
  305. _StateSeq(_RegexT& __nfa, _StateIdT __s)
  306. : _M_nfa(__nfa), _M_start(__s), _M_end(__s)
  307. { }
  308. _StateSeq(_RegexT& __nfa, _StateIdT __s, _StateIdT __end)
  309. : _M_nfa(__nfa), _M_start(__s), _M_end(__end)
  310. { }
  311. // Append a state on *this and change *this to the new sequence.
  312. void
  313. _M_append(_StateIdT __id)
  314. {
  315. _M_nfa[_M_end]._M_next = __id;
  316. _M_end = __id;
  317. }
  318. // Append a sequence on *this and change *this to the new sequence.
  319. void
  320. _M_append(const _StateSeq& __s)
  321. {
  322. _M_nfa[_M_end]._M_next = __s._M_start;
  323. _M_end = __s._M_end;
  324. }
  325. // Clones an entire sequence.
  326. _StateSeq
  327. _M_clone();
  328. public:
  329. _RegexT& _M_nfa;
  330. _StateIdT _M_start;
  331. _StateIdT _M_end;
  332. };
  333. ///@} regex-detail
  334. } // namespace __detail
  335. _GLIBCXX_END_NAMESPACE_VERSION
  336. } // namespace std
  337. #include <bits/regex_automaton.tcc>