LibRegex: Use token-state restoration in character class parsing

Previously, we restored parser state based on the character position in
the parser. This caused the lexer to re-tokenize from the middle of
multi-character tokens like escape sequences, and led to incorrect parse
failures for patterns like `[\[\]]`. We would backtrack to before the
first `\[` token, then re-lex the `[` as a separate token instead of as
part of the `\[` escape.

Now we save and restore the actual token object along with the lexer
index, so we keep the correct token state when backtracking.
This commit is contained in:
aplefull
2025-12-12 23:26:17 +01:00
committed by Ali Mohammad Pur
parent ff06a4a9e5
commit 3e391bdb2d
3 changed files with 39 additions and 18 deletions

View File

@@ -2162,7 +2162,8 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
bool ECMA262Parser::parse_class_set_expression(Vector<CompareTypeAndValuePair>& compares)
{
auto start_position = tell();
auto start_token = m_parser_state.current_token;
auto start_lexer_index = m_parser_state.lexer.tell();
// ClassSetExpression :: ClassUnion | ClassIntersection | ClassSubtraction
if (parse_class_subtraction(compares)) {
@@ -2172,7 +2173,9 @@ bool ECMA262Parser::parse_class_set_expression(Vector<CompareTypeAndValuePair>&
if (has_error())
return false;
back(tell() - start_position + 1);
m_parser_state.current_token = start_token;
m_parser_state.lexer.back(m_parser_state.lexer.tell() - start_lexer_index);
if (parse_class_intersection(compares)) {
consume(TokenType::RightBracket, Error::MismatchingBracket);
return true;
@@ -2180,7 +2183,9 @@ bool ECMA262Parser::parse_class_set_expression(Vector<CompareTypeAndValuePair>&
if (has_error())
return false;
back(tell() - start_position + 1);
m_parser_state.current_token = start_token;
m_parser_state.lexer.back(m_parser_state.lexer.tell() - start_lexer_index);
if (parse_class_union(compares)) {
consume(TokenType::RightBracket, Error::MismatchingBracket);
return true;
@@ -2191,8 +2196,7 @@ bool ECMA262Parser::parse_class_set_expression(Vector<CompareTypeAndValuePair>&
bool ECMA262Parser::parse_class_union(Vector<regex::CompareTypeAndValuePair>& compares)
{
auto start_position = tell();
ArmedScopeGuard restore_position { [&] { back(tell() - start_position + 1); } };
auto restore_position = save_parser_state();
auto first = true;
@@ -2227,8 +2231,7 @@ bool ECMA262Parser::parse_class_intersection(Vector<CompareTypeAndValuePair>& co
Vector<CompareTypeAndValuePair> lhs;
Vector<CompareTypeAndValuePair> rhs;
auto start_position = tell();
ArmedScopeGuard restore_position { [&] { back(tell() - start_position + 1); } };
auto restore_position = save_parser_state();
if (!parse_class_set_operand(lhs))
return false;
@@ -2262,8 +2265,7 @@ bool ECMA262Parser::parse_class_subtraction(Vector<CompareTypeAndValuePair>& com
Vector<CompareTypeAndValuePair> lhs;
Vector<CompareTypeAndValuePair> rhs;
auto start_position = tell();
ArmedScopeGuard restore_position { [&] { back(tell() - start_position + 1); } };
auto restore_position = save_parser_state();
if (!parse_class_set_operand(lhs))
return false;
@@ -2291,8 +2293,7 @@ bool ECMA262Parser::parse_class_subtraction(Vector<CompareTypeAndValuePair>& com
bool ECMA262Parser::parse_class_set_range(Vector<CompareTypeAndValuePair>& compares)
{
// ClassSetRange :: ClassSetCharacter "-" ClassSetCharacter
auto start_position = tell();
ArmedScopeGuard restore_position { [&] { back(tell() - start_position + 1); } };
auto restore_position = save_parser_state();
auto lhs = parse_class_set_character();
if (!lhs.has_value())
@@ -2332,6 +2333,8 @@ Optional<u32> ECMA262Parser::parse_class_set_character()
"&"sv, "-"sv, "!"sv, "#"sv, "%"sv, ","sv, ":"sv, ";"sv, "<"sv, "="sv, ">"sv, "@"sv, "`"sv, "~"sv
};
auto restore = save_parser_state();
if (done()) {
set_error(Error::InvalidPattern);
return {};
@@ -2342,13 +2345,11 @@ Optional<u32> ECMA262Parser::parse_class_set_character()
consume();
if (escape_value[0] == '\\' && escape_value.length() == 2) {
restore.disarm();
return escape_value[1];
}
}
auto start_position = tell();
ArmedScopeGuard restore { [&] { back(tell() - start_position + 1); } };
if (try_skip("\\"sv)) {
if (done()) {
set_error(Error::InvalidTrailingEscape);
@@ -2404,7 +2405,8 @@ Optional<u32> ECMA262Parser::parse_class_set_character()
bool ECMA262Parser::parse_class_set_operand(Vector<regex::CompareTypeAndValuePair>& compares)
{
auto start_position = tell();
auto start_token = m_parser_state.current_token;
auto start_lexer_index = m_parser_state.lexer.tell();
// ClassStringDisjunction :: "\q{" ClassStringDisjunctionContents "}"
// ClassStringDisjunctionContents :: ClassString | ClassString "|" ClassStringDisjunctionContents
@@ -2513,13 +2515,15 @@ bool ECMA262Parser::parse_class_set_operand(Vector<regex::CompareTypeAndValuePai
if (has_error())
return false;
back(tell() - start_position + 1);
m_parser_state.current_token = start_token;
m_parser_state.lexer.back(m_parser_state.lexer.tell() - start_lexer_index);
return false;
}
bool ECMA262Parser::parse_nested_class(Vector<regex::CompareTypeAndValuePair>& compares)
{
auto start_position = tell();
auto start_token = m_parser_state.current_token;
auto start_lexer_index = m_parser_state.lexer.tell();
// NestedClass :: "[" [lookahead ≠ ^ ] ClassContents [+UnicodeMode, +UnicodeSetsMode] "]"
// | "[" "^" ClassContents[+UnicodeMode, +UnicodeSetsMode] "]"
@@ -2613,7 +2617,8 @@ bool ECMA262Parser::parse_nested_class(Vector<regex::CompareTypeAndValuePair>& c
return false;
}
back(tell() - start_position + 1);
m_parser_state.current_token = start_token;
m_parser_state.lexer.back(m_parser_state.lexer.tell() - start_lexer_index);
return false;
}

View File

@@ -297,6 +297,17 @@ private:
size_t ensure_total_number_of_capturing_parenthesis();
auto save_parser_state()
{
auto saved_token = m_parser_state.current_token;
auto saved_lexer_index = m_parser_state.lexer.tell();
return ArmedScopeGuard { [this, saved_token, saved_lexer_index] {
m_parser_state.current_token = saved_token;
m_parser_state.lexer.back(m_parser_state.lexer.tell() - saved_lexer_index);
} };
}
void enter_capture_group_scope() { m_capture_groups_in_scope.empend(); }
void exit_capture_group_scope()

View File

@@ -875,6 +875,11 @@ TEST_CASE(ECMA262_unicode_sets_parser_error)
{ "[[^\\u0430-\\u044f][\\p{RGI_Emoji}]]"sv, regex::Error::NoError },
{ "[^[[\\p{RGI_Emoji}]--[A-Z]]]"sv, regex::Error::NegatedCharacterClassStrings },
{ "[^[^\\p{RGI_Emoji}]]"sv, regex::Error::NegatedCharacterClassStrings },
{ "[\\[]"sv, regex::Error::NoError },
{ "[\\[\\]]"sv, regex::Error::NoError },
{ "[\\S[\\[]]"sv, regex::Error::NoError },
{ "[\\S&&[\\[]]"sv, regex::Error::NoError },
{ "[\\S--[\\[]]"sv, regex::Error::NoError },
};
for (auto test : tests) {