Skip to content

<regex>: Revise parsing of escape sequences #5380

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
138 changes: 73 additions & 65 deletions stl/inc/regex
Original file line number Diff line number Diff line change
Expand Up @@ -1729,21 +1729,21 @@ private:
void _Expect(_Meta_type, regex_constants::error_type);

// parsing
int _Do_digits(int _Base, int _Count, regex_constants::error_type _Error_type);
bool _DecimalDigits2(regex_constants::error_type _Error_type, int _Count = INT_MAX);
int _Do_digits(int _Base, int _Initial, int _Count, regex_constants::error_type _Error_type);
bool _DecimalDigits3(regex_constants::error_type _Error_type, int _Initial = 0);
void _HexDigits(int);
bool _OctalDigits();
void _Do_ex_class(_Meta_type);
bool _CharacterClassEscape(bool);
_Prs_ret _ClassEscape2();
_Prs_ret _ClassEscape3();
_Prs_ret _ClassAtom(bool);
void _ClassRanges();
void _CharacterClass();
bool _IdentityEscape();
bool _IsIdentityEscape() const;
bool _IdentityEscape(bool);
bool _IsIdentityEscape(bool) const;
bool _Do_ffn(_Elem);
bool _Do_ffnx(_Elem);
bool _CharacterEscape();
bool _CharacterEscape(bool);
void _AtomEscape();
void _Do_capture_group();
void _Do_noncapture_group();
Expand Down Expand Up @@ -1783,7 +1783,7 @@ enum _Lang_flags { // describe language properties
_L_esc_uni = 0x00000800, // has Unicode escape sequences
_L_esc_hex = 0x00001000, // has hexadecimal escape sequences
_L_esc_oct = 0x00002000, // has octal escape sequences
_L_esc_bsl = 0x00004000, // has escape backslash in character classes
_L_esc_bsp = 0x00004000, // has backspace escape in character classes
_L_esc_ffnx = 0x00008000, // has extra file escapes (\a and \b)
_L_esc_ffn = 0x00010000, // has limited file escapes (\[fnrtv])
_L_esc_wsd = 0x00020000, // has w, s, and d character set escapes
Expand All @@ -1797,7 +1797,8 @@ enum _Lang_flags { // describe language properties
_L_anch_rstr = 0x02000000, // anchor restricted to beginning/end
_L_star_beg = 0x04000000, // star okay at beginning of RE/expr (BRE)
_L_empty_grp = 0x08000000, // empty group allowed (ERE prohibits "()")
_L_paren_bal = 0x10000000, // ')'/'}'/']' special only after '('/'{'/']'
_L_paren_bal = 0x10000000, // ')'/'}' special only after '('/'{'
_L_brk_bal = 0x20000000, // ']' special only after '[' (ERE, BRE); TRANSITION, ABI: same value as _L_brk_rstr
_L_brk_rstr = 0x20000000, // ']' not special when first character in set
_L_mtch_long = 0x40000000, // find longest match (ERE, BRE)
};
Expand Down Expand Up @@ -4039,9 +4040,9 @@ void _Parser<_FwdIt, _Elem, _RxTraits>::_Expect(_Meta_type _St, regex_constants:

template <class _FwdIt, class _Elem, class _RxTraits>
int _Parser<_FwdIt, _Elem, _RxTraits>::_Do_digits(
int _Base, int _Count, regex_constants::error_type _Error_type) { // translate digits to numeric value
int _Base, int _Initial, int _Count, regex_constants::error_type _Error_type) { // translate digits to numeric value
int _Chv;
_Val = 0;
_Val = _Initial;
while (_Count != 0 && (_Chv = _Traits.value(_Char, _Base)) != -1) { // append next digit
if (_Val > (INT_MAX - _Chv) / _Base) {
_Error(_Error_type);
Expand All @@ -4055,21 +4056,21 @@ int _Parser<_FwdIt, _Elem, _RxTraits>::_Do_digits(
}

template <class _FwdIt, class _Elem, class _RxTraits>
bool _Parser<_FwdIt, _Elem, _RxTraits>::_DecimalDigits2(
const regex_constants::error_type _Error_type, const int _Count /* = INT_MAX */) { // check for decimal value
return _Do_digits(10, _Count, _Error_type) != _Count;
bool _Parser<_FwdIt, _Elem, _RxTraits>::_DecimalDigits3(
const regex_constants::error_type _Error_type, const int _Initial /* = 0 */) { // check for decimal value
return _Do_digits(10, _Initial, INT_MAX, _Error_type) != INT_MAX;
}

template <class _FwdIt, class _Elem, class _RxTraits>
void _Parser<_FwdIt, _Elem, _RxTraits>::_HexDigits(int _Count) { // check for _Count hex digits
if (_Do_digits(16, _Count, regex_constants::error_escape) != 0) {
if (_Do_digits(16, 0, _Count, regex_constants::error_escape) != 0) {
_Error(regex_constants::error_escape);
}
}

template <class _FwdIt, class _Elem, class _RxTraits>
bool _Parser<_FwdIt, _Elem, _RxTraits>::_OctalDigits() { // check for up to 3 octal digits
return _Do_digits(8, 3, regex_constants::error_escape) != 3;
return _Do_digits(8, 0, 3, regex_constants::error_escape) != 3;
}

template <class _FwdIt, class _Elem, class _RxTraits>
Expand Down Expand Up @@ -4139,36 +4140,33 @@ bool _Parser<_FwdIt, _Elem, _RxTraits>::_CharacterClassEscape(bool _Addit) { //
}

template <class _FwdIt, class _Elem, class _RxTraits>
_Prs_ret _Parser<_FwdIt, _Elem, _RxTraits>::_ClassEscape2() { // check for class escape
if ((_L_flags & _L_esc_bsl) && _Char == _Esc_bsl) { // handle escape backslash if allowed
_Val = _Esc_bsl;
_Prs_ret _Parser<_FwdIt, _Elem, _RxTraits>::_ClassEscape3() { // check for class escape
if ((_L_flags & _L_esc_bsp) && _Char == _Esc_ctrl_b) { // handle backspace escape
_Next();
_Val = _Meta_bsp;
return _Prs_chr;
} else if ((_L_flags & _L_esc_wsd) && _CharacterClassEscape(false)) {
return _Prs_set;
} else if (_DecimalDigits2(regex_constants::error_escape)) { // check for invalid value
if (_Val != 0) {
} else if ((_L_flags & (_L_bzr_chr | _L_bckr))
&& (_Val = _Traits.value(_Char, 10)) != -1) { // handle \0 and reject other escaped decimal literals
_Next();
if (!(_L_flags & _L_bzr_chr) || _Val != 0 || _Traits.value(_Char, 10) != -1) {
_Error(regex_constants::error_escape);
}

return _Prs_chr;
} else if (_CharacterEscape(true)) {
return _Prs_chr;
} else if ((_L_flags & _L_esc_wsd) && _CharacterClassEscape(false)) {
return _Prs_set;
}
return _CharacterEscape() ? _Prs_chr : _Prs_none;

_Error(regex_constants::error_escape);
}

template <class _FwdIt, class _Elem, class _RxTraits>
_Prs_ret _Parser<_FwdIt, _Elem, _RxTraits>::_ClassAtom(const bool _Initial) { // check for class atom
if (_Mchar == _Meta_esc) { // check for valid escape sequence
if (_Mchar == _Meta_esc && (_L_flags & (_L_grp_esc | _L_ident_awk))) { // check for valid escape sequence
_Next();
if (_L_flags & _L_grp_esc) {
return _ClassEscape2();
} else if ((_L_flags & _L_esc_ffn && _Do_ffn(_Char))
|| (_L_flags & _L_esc_ffnx && _Do_ffnx(_Char))) { // advance to next character
_Next();
return _Prs_chr;
}
_Val = _Meta_esc;
return _Prs_chr;
return _ClassEscape3();
} else if (_Mchar == _Meta_lsq) { // check for valid delimited expression
_Next();
if (_Mchar == _Meta_colon || _Mchar == _Meta_equal || _Mchar == _Meta_dot) { // handle delimited expression
Expand Down Expand Up @@ -4325,7 +4323,9 @@ bool _Parser<_FwdIt, _Elem, _RxTraits>::_Wrapped_disjunction() { // add disjunct
}

template <class _FwdIt, class _Elem, class _RxTraits>
bool _Parser<_FwdIt, _Elem, _RxTraits>::_IsIdentityEscape() const { // check for valid identity escape
bool _Parser<_FwdIt, _Elem, _RxTraits>::_IsIdentityEscape(bool _In_character_class) const {
// check for valid identity escape

if (_L_flags & _L_ident_ECMA) {
// ECMAScript identity escape characters
switch (_Char) {
Expand All @@ -4343,35 +4343,38 @@ bool _Parser<_FwdIt, _Elem, _RxTraits>::_IsIdentityEscape() const { // check for
}

switch (_Char) {
case _Meta_esc:
// BRE, ERE, awk identity escape characters (anywhere in awk)
return true;
case _Meta_dot:
case _Meta_lsq:
case _Meta_esc:
case _Meta_star:
case _Meta_bar:
case _Meta_caret:
case _Meta_dlr:
// BRE, ERE, awk identity escape characters
return true;
// BRE, ERE, awk identity escape characters (outside character classes only)
return !_In_character_class;
case _Meta_lpar:
case _Meta_rpar:
case _Meta_bar:
case _Meta_plus:
case _Meta_query:
case _Meta_lbr:
case _Meta_rbr:
// additional ERE identity escape characters
return (_L_flags & _L_ident_ERE) != 0;
// additional ERE identity escape characters (outside character classes only)
return (_L_flags & _L_ident_ERE) != 0 && !_In_character_class;
case '"':
case '/':
// additional awk identity escape characters
// additional awk identity escape characters (anywhere)
return (_L_flags & _L_ident_awk) != 0;
default:
return false;
}
}

template <class _FwdIt, class _Elem, class _RxTraits>
bool _Parser<_FwdIt, _Elem, _RxTraits>::_IdentityEscape() { // check whether an escape is valid, and process it if so
if (_IsIdentityEscape()) {
bool _Parser<_FwdIt, _Elem, _RxTraits>::_IdentityEscape(bool _In_character_class) {
// check whether an escape is valid, and process it if so
if (_IsIdentityEscape(_In_character_class)) {
_Val = _Char;
_Next();
return true;
Expand Down Expand Up @@ -4413,7 +4416,7 @@ bool _Parser<_FwdIt, _Elem, _RxTraits>::_Do_ffnx(_Elem _Ch) { // check for the r
}

template <class _FwdIt, class _Elem, class _RxTraits>
bool _Parser<_FwdIt, _Elem, _RxTraits>::_CharacterEscape() { // check for valid character escape
bool _Parser<_FwdIt, _Elem, _RxTraits>::_CharacterEscape(bool _In_character_class) { // check for valid character escape
if (_Mchar == _Meta_eos) {
_Error(regex_constants::error_escape);
}
Expand All @@ -4439,7 +4442,7 @@ bool _Parser<_FwdIt, _Elem, _RxTraits>::_CharacterEscape() { // check for valid
_Error(regex_constants::error_escape);
}
} else {
return _IdentityEscape();
return _IdentityEscape(_In_character_class);
}

if (_STD _Max_limit<typename _RxTraits::_Uelem>() < static_cast<unsigned int>(_Val)) {
Expand All @@ -4452,23 +4455,28 @@ bool _Parser<_FwdIt, _Elem, _RxTraits>::_CharacterEscape() { // check for valid

template <class _FwdIt, class _Elem, class _RxTraits>
void _Parser<_FwdIt, _Elem, _RxTraits>::_AtomEscape() { // check for valid atom escape
constexpr int _Bre_max_backref_digits = 1;
if ((_L_flags & _L_bckr)
&& _DecimalDigits2(regex_constants::error_backref,
(_L_flags & _L_lim_bckr) ? _Bre_max_backref_digits : INT_MAX)) { // check for valid back reference
if (_Val == 0) { // handle \0
if (!(_L_flags & _L_bzr_chr)) {
if ((_L_flags & (_L_bzr_chr | _L_bckr)) && (_Val = _Traits.value(_Char, 10)) != -1) { // escaped decimal sequence
_Next();
if ((_L_flags & _L_bzr_chr) && _Val == 0) { // handle \0
if (_Traits.value(_Char, 10) != -1) {
_Error(regex_constants::error_escape);
}
_Nfa._Add_char2(_Elem{});
} else if (_L_flags & _L_bckr) { // check for valid backreference
if (!(_L_flags & _L_lim_bckr)) {
(void) _DecimalDigits3(regex_constants::error_backref, _Val);
}

if (_Val == 0) {
_Error(regex_constants::error_escape);
} else if (_Grp_idx < static_cast<size_t>(_Val) || _Finished_grps.size() <= static_cast<size_t>(_Val)
|| !_Finished_grps[static_cast<size_t>(_Val)]) {
_Error(regex_constants::error_backref);
} else {
_Nfa._Add_char2(static_cast<_Elem>(_Val));
_Nfa._Add_backreference(static_cast<size_t>(_Val));
}
} else if (_Grp_idx < static_cast<size_t>(_Val) || _Finished_grps.size() <= static_cast<size_t>(_Val)
|| !_Finished_grps[static_cast<size_t>(_Val)]) {
_Error(regex_constants::error_backref);
} else {
_Nfa._Add_backreference(static_cast<size_t>(_Val));
}
} else if (_CharacterEscape()) {
} else if (_CharacterEscape(false)) {
_Nfa._Add_char2(static_cast<_Elem>(_Val));
} else if (!(_L_flags & _L_esc_wsd) || !_CharacterClassEscape(true)) {
_Error(regex_constants::error_escape);
Expand All @@ -4486,7 +4494,7 @@ void _Parser<_FwdIt, _Elem, _RxTraits>::_Quantifier() { // check for quantifier
_Max = 1;
} else if (_Mchar == _Meta_lbr) { // check for valid bracketed value
_Next();
if (!_DecimalDigits2(regex_constants::error_badbrace)) {
if (!_DecimalDigits3(regex_constants::error_badbrace)) {
_Error(regex_constants::error_badbrace);
}

Expand All @@ -4496,7 +4504,7 @@ void _Parser<_FwdIt, _Elem, _RxTraits>::_Quantifier() { // check for quantifier
} else { // check for decimal constant following comma
_Next();
if (_Mchar != _Meta_rbr) {
if (!_DecimalDigits2(regex_constants::error_badbrace)) {
if (!_DecimalDigits3(regex_constants::error_badbrace)) {
_Error(regex_constants::error_badbrace);
}

Expand Down Expand Up @@ -4573,7 +4581,7 @@ bool _Parser<_FwdIt, _Elem, _RxTraits>::_Alternative() { // check for valid alte
_Error(regex_constants::error_badrepeat);
} else if (_Mchar == _Meta_rbr && !(_L_flags & _L_paren_bal)) {
_Error(regex_constants::error_brace);
} else if (_Mchar == _Meta_rsq && !(_L_flags & _L_paren_bal)) {
} else if (_Mchar == _Meta_rsq && !(_L_flags & _L_brk_bal)) {
_Error(regex_constants::error_brack);
} else { // add character
_Nfa._Add_char2(_Char);
Expand Down Expand Up @@ -4702,17 +4710,17 @@ _Parser<_FwdIt, _Elem, _RxTraits>::_Parser(
: _Pat(_Pfirst), _Begin(_Pfirst), _End(_Plast), _Nfa(_Tr, _Fx), _Traits(_Tr), _Flags(_Fx) {

constexpr unsigned int _ECMA_flags = _L_ext_rep | _L_alt_pipe | _L_nex_grp | _L_nex_rep | _L_nc_grp | _L_asrt_gen
| _L_asrt_wrd | _L_bckr | _L_ngr_rep | _L_esc_uni | _L_esc_hex | _L_esc_bsl
| _L_asrt_wrd | _L_bckr | _L_ngr_rep | _L_esc_uni | _L_esc_hex | _L_esc_bsp
| _L_esc_ffn | _L_esc_wsd | _L_esc_ctrl | _L_bzr_chr | _L_grp_esc | _L_ident_ECMA
| _L_empty_grp;

constexpr unsigned int _Basic_flags =
_L_bckr | _L_lim_bckr | _L_anch_rstr | _L_star_beg | _L_empty_grp | _L_brk_rstr | _L_mtch_long;
_L_bckr | _L_lim_bckr | _L_anch_rstr | _L_star_beg | _L_empty_grp | _L_brk_bal | _L_brk_rstr | _L_mtch_long;

constexpr unsigned int _Grep_flags = _Basic_flags | _L_alt_nl | _L_no_nl;

constexpr unsigned int _Extended_flags =
_L_ext_rep | _L_alt_pipe | _L_nex_grp | _L_nex_rep | _L_ident_ERE | _L_paren_bal | _L_brk_rstr | _L_mtch_long;
constexpr unsigned int _Extended_flags = _L_ext_rep | _L_alt_pipe | _L_nex_grp | _L_nex_rep | _L_ident_ERE
| _L_paren_bal | _L_brk_bal | _L_brk_rstr | _L_mtch_long;

constexpr unsigned int _Awk_flags = _Extended_flags | _L_esc_oct | _L_esc_ffn | _L_esc_ffnx | _L_ident_awk;

Expand Down
1 change: 1 addition & 0 deletions tests/std/test.lst
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,7 @@ tests\GH_004929_internal_tag_constructors
tests\GH_004930_char_traits_user_specialization
tests\GH_005090_stl_hardening
tests\GH_005204_regex_collating_ranges
tests\GH_005244_regex_escape_sequences
tests\GH_005315_destructor_tombstones
tests\LWG2381_num_get_floating_point
tests\LWG2597_complex_branch_cut
Expand Down
4 changes: 4 additions & 0 deletions tests/std/tests/GH_005244_regex_escape_sequences/env.lst
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

RUNALL_INCLUDE ..\usual_matrix.lst
Loading