mirror of
https://github.com/zebrajr/ladybird.git
synced 2026-01-15 12:15:15 +00:00
LibRegex: Use token-state restoration in character class parsing
Previously, we restored parser state based on the character position in the parser. This caused the lexer to re-tokenize from the middle of multi-character tokens such as escape sequences, and led to incorrect parse failures for patterns like `[\[\]]`: we would backtrack to before the first `\[` token, then re-lex the `[` as a separate token instead of as part of the `\[` escape. Now we save and restore the actual token object along with the lexer index, so the token state stays correct when backtracking.
This commit is contained in:
committed by
Ali Mohammad Pur
parent
ff06a4a9e5
commit
3e391bdb2d
@@ -2162,7 +2162,8 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
|
||||
|
||||
bool ECMA262Parser::parse_class_set_expression(Vector<CompareTypeAndValuePair>& compares)
|
||||
{
|
||||
auto start_position = tell();
|
||||
auto start_token = m_parser_state.current_token;
|
||||
auto start_lexer_index = m_parser_state.lexer.tell();
|
||||
|
||||
// ClassSetExpression :: ClassUnion | ClassIntersection | ClassSubtraction
|
||||
if (parse_class_subtraction(compares)) {
|
||||
@@ -2172,7 +2173,9 @@ bool ECMA262Parser::parse_class_set_expression(Vector<CompareTypeAndValuePair>&
|
||||
if (has_error())
|
||||
return false;
|
||||
|
||||
back(tell() - start_position + 1);
|
||||
m_parser_state.current_token = start_token;
|
||||
m_parser_state.lexer.back(m_parser_state.lexer.tell() - start_lexer_index);
|
||||
|
||||
if (parse_class_intersection(compares)) {
|
||||
consume(TokenType::RightBracket, Error::MismatchingBracket);
|
||||
return true;
|
||||
@@ -2180,7 +2183,9 @@ bool ECMA262Parser::parse_class_set_expression(Vector<CompareTypeAndValuePair>&
|
||||
if (has_error())
|
||||
return false;
|
||||
|
||||
back(tell() - start_position + 1);
|
||||
m_parser_state.current_token = start_token;
|
||||
m_parser_state.lexer.back(m_parser_state.lexer.tell() - start_lexer_index);
|
||||
|
||||
if (parse_class_union(compares)) {
|
||||
consume(TokenType::RightBracket, Error::MismatchingBracket);
|
||||
return true;
|
||||
@@ -2191,8 +2196,7 @@ bool ECMA262Parser::parse_class_set_expression(Vector<CompareTypeAndValuePair>&
|
||||
|
||||
bool ECMA262Parser::parse_class_union(Vector<regex::CompareTypeAndValuePair>& compares)
|
||||
{
|
||||
auto start_position = tell();
|
||||
ArmedScopeGuard restore_position { [&] { back(tell() - start_position + 1); } };
|
||||
auto restore_position = save_parser_state();
|
||||
|
||||
auto first = true;
|
||||
|
||||
@@ -2227,8 +2231,7 @@ bool ECMA262Parser::parse_class_intersection(Vector<CompareTypeAndValuePair>& co
|
||||
Vector<CompareTypeAndValuePair> lhs;
|
||||
Vector<CompareTypeAndValuePair> rhs;
|
||||
|
||||
auto start_position = tell();
|
||||
ArmedScopeGuard restore_position { [&] { back(tell() - start_position + 1); } };
|
||||
auto restore_position = save_parser_state();
|
||||
|
||||
if (!parse_class_set_operand(lhs))
|
||||
return false;
|
||||
@@ -2262,8 +2265,7 @@ bool ECMA262Parser::parse_class_subtraction(Vector<CompareTypeAndValuePair>& com
|
||||
Vector<CompareTypeAndValuePair> lhs;
|
||||
Vector<CompareTypeAndValuePair> rhs;
|
||||
|
||||
auto start_position = tell();
|
||||
ArmedScopeGuard restore_position { [&] { back(tell() - start_position + 1); } };
|
||||
auto restore_position = save_parser_state();
|
||||
|
||||
if (!parse_class_set_operand(lhs))
|
||||
return false;
|
||||
@@ -2291,8 +2293,7 @@ bool ECMA262Parser::parse_class_subtraction(Vector<CompareTypeAndValuePair>& com
|
||||
bool ECMA262Parser::parse_class_set_range(Vector<CompareTypeAndValuePair>& compares)
|
||||
{
|
||||
// ClassSetRange :: ClassSetCharacter "-" ClassSetCharacter
|
||||
auto start_position = tell();
|
||||
ArmedScopeGuard restore_position { [&] { back(tell() - start_position + 1); } };
|
||||
auto restore_position = save_parser_state();
|
||||
|
||||
auto lhs = parse_class_set_character();
|
||||
if (!lhs.has_value())
|
||||
@@ -2332,6 +2333,8 @@ Optional<u32> ECMA262Parser::parse_class_set_character()
|
||||
"&"sv, "-"sv, "!"sv, "#"sv, "%"sv, ","sv, ":"sv, ";"sv, "<"sv, "="sv, ">"sv, "@"sv, "`"sv, "~"sv
|
||||
};
|
||||
|
||||
auto restore = save_parser_state();
|
||||
|
||||
if (done()) {
|
||||
set_error(Error::InvalidPattern);
|
||||
return {};
|
||||
@@ -2342,13 +2345,11 @@ Optional<u32> ECMA262Parser::parse_class_set_character()
|
||||
consume();
|
||||
|
||||
if (escape_value[0] == '\\' && escape_value.length() == 2) {
|
||||
restore.disarm();
|
||||
return escape_value[1];
|
||||
}
|
||||
}
|
||||
|
||||
auto start_position = tell();
|
||||
ArmedScopeGuard restore { [&] { back(tell() - start_position + 1); } };
|
||||
|
||||
if (try_skip("\\"sv)) {
|
||||
if (done()) {
|
||||
set_error(Error::InvalidTrailingEscape);
|
||||
@@ -2404,7 +2405,8 @@ Optional<u32> ECMA262Parser::parse_class_set_character()
|
||||
|
||||
bool ECMA262Parser::parse_class_set_operand(Vector<regex::CompareTypeAndValuePair>& compares)
|
||||
{
|
||||
auto start_position = tell();
|
||||
auto start_token = m_parser_state.current_token;
|
||||
auto start_lexer_index = m_parser_state.lexer.tell();
|
||||
|
||||
// ClassStringDisjunction :: "\q{" ClassStringDisjunctionContents "}"
|
||||
// ClassStringDisjunctionContents :: ClassString | ClassString "|" ClassStringDisjunctionContents
|
||||
@@ -2513,13 +2515,15 @@ bool ECMA262Parser::parse_class_set_operand(Vector<regex::CompareTypeAndValuePai
|
||||
if (has_error())
|
||||
return false;
|
||||
|
||||
back(tell() - start_position + 1);
|
||||
m_parser_state.current_token = start_token;
|
||||
m_parser_state.lexer.back(m_parser_state.lexer.tell() - start_lexer_index);
|
||||
return false;
|
||||
}
|
||||
|
||||
bool ECMA262Parser::parse_nested_class(Vector<regex::CompareTypeAndValuePair>& compares)
|
||||
{
|
||||
auto start_position = tell();
|
||||
auto start_token = m_parser_state.current_token;
|
||||
auto start_lexer_index = m_parser_state.lexer.tell();
|
||||
|
||||
// NestedClass :: "[" [lookahead ≠ ^ ] ClassContents [+UnicodeMode, +UnicodeSetsMode] "]"
|
||||
// | "[" "^" ClassContents[+UnicodeMode, +UnicodeSetsMode] "]"
|
||||
@@ -2613,7 +2617,8 @@ bool ECMA262Parser::parse_nested_class(Vector<regex::CompareTypeAndValuePair>& c
|
||||
return false;
|
||||
}
|
||||
|
||||
back(tell() - start_position + 1);
|
||||
m_parser_state.current_token = start_token;
|
||||
m_parser_state.lexer.back(m_parser_state.lexer.tell() - start_lexer_index);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
@@ -297,6 +297,17 @@ private:
|
||||
|
||||
size_t ensure_total_number_of_capturing_parenthesis();
|
||||
|
||||
// Snapshots the parser's current token and the lexer's current index, and
// returns an ArmedScopeGuard whose callback puts both back.
// Callers that want to keep the consumed input call disarm() on the returned
// guard; otherwise the callback is presumably run when the guard goes out of
// scope (ArmedScopeGuard is defined outside this view -- confirm).
auto save_parser_state()
|
||||
{
|
||||
// Copy of the token object itself, not just a position.
auto saved_token = m_parser_state.current_token;
|
||||
auto saved_lexer_index = m_parser_state.lexer.tell();
|
||||
|
||||
// Captures are by value, so the snapshot does not depend on this frame.
return ArmedScopeGuard { [this, saved_token, saved_lexer_index] {
|
||||
m_parser_state.current_token = saved_token;
|
||||
// Rewind the lexer by however far it has advanced past the saved index.
m_parser_state.lexer.back(m_parser_state.lexer.tell() - saved_lexer_index);
|
||||
} };
|
||||
}
|
||||
|
||||
void enter_capture_group_scope() { m_capture_groups_in_scope.empend(); }
|
||||
|
||||
void exit_capture_group_scope()
|
||||
|
||||
@@ -875,6 +875,11 @@ TEST_CASE(ECMA262_unicode_sets_parser_error)
|
||||
{ "[[^\\u0430-\\u044f][\\p{RGI_Emoji}]]"sv, regex::Error::NoError },
|
||||
{ "[^[[\\p{RGI_Emoji}]--[A-Z]]]"sv, regex::Error::NegatedCharacterClassStrings },
|
||||
{ "[^[^\\p{RGI_Emoji}]]"sv, regex::Error::NegatedCharacterClassStrings },
|
||||
{ "[\\[]"sv, regex::Error::NoError },
|
||||
{ "[\\[\\]]"sv, regex::Error::NoError },
|
||||
{ "[\\S[\\[]]"sv, regex::Error::NoError },
|
||||
{ "[\\S&&[\\[]]"sv, regex::Error::NoError },
|
||||
{ "[\\S--[\\[]]"sv, regex::Error::NoError },
|
||||
};
|
||||
|
||||
for (auto test : tests) {
|
||||
|
||||
Reference in New Issue
Block a user