diff options
author | Emanuel Sprung <emanuel.sprung@gmail.com> | 2020-06-09 17:16:04 +0200 |
---|---|---|
committer | Andreas Kling <kling@serenityos.org> | 2020-11-27 21:32:41 +0100 |
commit | 6add8b9c054385b119be1c0f731e94d424392dd1 (patch) | |
tree | df3cbde70a0a73b86814dbee581f043560645a75 | |
parent | 3b7884ee8a7dca2ff7732e28a117143590717efe (diff) | |
download | serenity-6add8b9c054385b119be1c0f731e94d424392dd1.zip |
LibRegex: Remove backup file, remove BOM in RegexParser.cpp, run clang-format
-rw-r--r-- | Libraries/LibRegex/RegexByteCode.cpp | 2 | ||||
-rw-r--r-- | Libraries/LibRegex/RegexDebug.h | 2 | ||||
-rw-r--r-- | Libraries/LibRegex/RegexMatch.h | 6 | ||||
-rw-r--r-- | Libraries/LibRegex/RegexMatcher.cpp | 2 | ||||
-rw-r--r-- | Libraries/LibRegex/RegexMatcher.h | 1 | ||||
-rw-r--r-- | Libraries/LibRegex/RegexParser.cpp | 2 | ||||
-rw-r--r-- | Libraries/LibRegex/RegexParser.cpp_ | 803 | ||||
-rw-r--r-- | Libraries/LibRegex/RegexParser.h_ | 257 | ||||
-rw-r--r-- | Libraries/LibRegex/Tests/Regex.cpp | 2 |
9 files changed, 8 insertions, 1069 deletions
diff --git a/Libraries/LibRegex/RegexByteCode.cpp b/Libraries/LibRegex/RegexByteCode.cpp index 0b0191905e..a11bc4d3c8 100644 --- a/Libraries/LibRegex/RegexByteCode.cpp +++ b/Libraries/LibRegex/RegexByteCode.cpp @@ -565,7 +565,7 @@ const Vector<String> OpCode_Compare::variable_arguments_to_string(Optional<Match auto value = (CharRange)m_bytecode->at(offset++); result.empend(String::format("ch_range='%c'-'%c'", value.from, value.to)); if (!view.is_null()) - result.empend(String::format("compare against: '%s'", input.value().view.substring_view(state().string_position, state().string_position + 1 > view.length() ? 0 : 1).to_string().characters())); + result.empend(String::format("compare against: '%s'", input.value().view.substring_view(state().string_position, state().string_position + 1 > view.length() ? 0 : 1).to_string().characters())); } } return result; diff --git a/Libraries/LibRegex/RegexDebug.h b/Libraries/LibRegex/RegexDebug.h index 94c04eddd1..2ef0c4c7c3 100644 --- a/Libraries/LibRegex/RegexDebug.h +++ b/Libraries/LibRegex/RegexDebug.h @@ -26,8 +26,8 @@ #pragma once -#include "LibRegex/RegexMatcher.h" #include "AK/StringBuilder.h" +#include "LibRegex/RegexMatcher.h" //#define REGEX_DEBUG diff --git a/Libraries/LibRegex/RegexMatch.h b/Libraries/LibRegex/RegexMatch.h index 9ae8b5ae7e..d0fadd4ba7 100644 --- a/Libraries/LibRegex/RegexMatch.h +++ b/Libraries/LibRegex/RegexMatch.h @@ -193,7 +193,7 @@ public: const char* characters_without_null_termination() const { - if(is_u8_view()) + if (is_u8_view()) return u8view().characters_without_null_termination(); return to_string().characters(); // FIXME: it contains the null termination, does that actually matter? @@ -201,14 +201,14 @@ public: bool starts_with(const StringView& str) const { - if(is_u32_view()) + if (is_u32_view()) return false; return u8view().starts_with(str); } bool starts_with(const Utf32View& str) const { - if(is_u8_view()) + if (is_u8_view()) return false; StringBuilder builder; diff --git a/Libraries/LibRegex/RegexMatcher.cpp b/Libraries/LibRegex/RegexMatcher.cpp index d410c62543..bfe09b88c5 100644 --- a/Libraries/LibRegex/RegexMatcher.cpp +++ b/Libraries/LibRegex/RegexMatcher.cpp @@ -201,7 +201,7 @@ RegexResult Matcher<Parser>::match(const Vector<RegexStringView> views, Optional if (match_count) { auto capture_groups_count = min(output.capture_group_matches.size(), output.matches.size()); for (size_t i = 0; i < capture_groups_count; ++i) { - if(input.regex_options & AllFlags::SkipTrimEmptyMatches) { + if (input.regex_options & AllFlags::SkipTrimEmptyMatches) { output_copy.capture_group_matches.append(output.capture_group_matches.at(i)); } else { Vector<Match> capture_group_matches; diff --git a/Libraries/LibRegex/RegexMatcher.h b/Libraries/LibRegex/RegexMatcher.h index 5d28c2b8d0..7597361f45 100644 --- a/Libraries/LibRegex/RegexMatcher.h +++ b/Libraries/LibRegex/RegexMatcher.h @@ -216,7 +216,6 @@ RegexResult search(const Vector<RegexStringView> views, Regex<Parser>& pattern, return pattern.search(views, regex_options); } - template<class Parser> bool search(const RegexStringView view, Regex<Parser>& pattern, RegexResult& res, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) { diff --git a/Libraries/LibRegex/RegexParser.cpp b/Libraries/LibRegex/RegexParser.cpp index ff64b59530..a4b1235e83 100644 --- a/Libraries/LibRegex/RegexParser.cpp +++ b/Libraries/LibRegex/RegexParser.cpp @@ -1,4 +1,4 @@ -/* +/* * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com> * All rights reserved. * diff --git a/Libraries/LibRegex/RegexParser.cpp_ b/Libraries/LibRegex/RegexParser.cpp_ deleted file mode 100644 index 4292371cd2..0000000000 --- a/Libraries/LibRegex/RegexParser.cpp_ +++ /dev/null @@ -1,803 +0,0 @@ -/* - * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com> - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "RegexParser.h" -#include <AK/String.h> -#include <AK/StringBuilder.h> - -namespace AK { -namespace regex { - -const char* ByteCodeValue::name(OpCode type) -{ - switch (type) { -#define __ENUMERATE_OPCODE(x) \ - case OpCode::x: \ - return #x; - ENUMERATE_OPCODES -#undef __ENUMERATE_OPCODE - default: - ASSERT_NOT_REACHED(); - return "<Unknown>"; - } -} - -const char* ByteCodeValue::name() const -{ - return name(op_code); -} - -template<class T> -bool Parser<T>::set_error(Error error) -{ - if (m_parser_state.error == Error::NoError) { - m_parser_state.error = error; - m_parser_state.error_token = m_parser_state.current_token; - } - return false; // always return false, that eases the API usage (return set_error(...)) :^) -} - -template<class T> -bool Parser<T>::done() const -{ - return match(TokenType::Eof); -} - -template<class T> -bool Parser<T>::match(TokenType type) const -{ - return m_parser_state.current_token.type() == type; -} - -template<class T> -Token Parser<T>::consume() -{ - auto old_token = m_parser_state.current_token; - m_parser_state.current_token = m_parser_state.lexer.next(); - return old_token; -} - -template<class T> -Token Parser<T>::consume(TokenType type, Error error) -{ - if (m_parser_state.current_token.type() != type) { - set_error(error); -#ifdef __serenity__ - dbg() << "[PARSER] Error: Unexpected token " << m_parser_state.m_current_token.name() << ". Expected: " << Token::name(type); -#else - fprintf(stderr, "[PARSER] Error: Unexpected token %s. Expected %s\n", m_parser_state.current_token.name(), Token::name(type)); -#endif - } - return consume(); -} - -template<class T> -bool Parser<T>::consume(const String& str) -{ - size_t potentially_go_back { 1 }; - for (auto ch : str) { - if (match(TokenType::OrdinaryCharacter)) { - if (m_parser_state.current_token.value()[0] != ch) { - m_parser_state.lexer.back(potentially_go_back); - m_parser_state.current_token = m_parser_state.lexer.next(); - return false; - } - } else { - m_parser_state.lexer.back(potentially_go_back); - m_parser_state.current_token = m_parser_state.lexer.next(); - return false; - } - consume(TokenType::OrdinaryCharacter); - ++potentially_go_back; - } - return true; -} - -template<class T> -void Parser<T>::reset() -{ - m_parser_state.bytecode.clear(); - m_parser_state.lexer.reset(); - m_parser_state.current_token = m_parser_state.lexer.next(); - m_parser_state.error = Error::NoError; - m_parser_state.error_token = { TokenType::Eof, 0, StringView(nullptr) }; - m_parser_state.regex_options = {}; -} - -template<class T> -ParserResult Parser<T>::parse(Optional<OptionsType> regex_options) -{ - reset(); - if (regex_options.has_value()) - m_parser_state.regex_options = regex_options.value(); - if (parse_internal(m_parser_state.bytecode, m_parser_state.match_length_minimum)) - consume(TokenType::Eof); - else - set_error(Error::InvalidPattern); - -#ifdef REGEX_DEBUG - printf("[PARSER] Produced bytecode with %lu entries (opcodes + arguments)\n", m_parser_state.m_bytes.size()); -#endif - return { - move(m_parser_state.bytecode), - move(m_parser_state.capture_groups_count), - move(m_parser_state.named_capture_groups_count), - move(m_parser_state.match_length_minimum), - move(m_parser_state.error), - move(m_parser_state.error_token) - }; -} - -template<class T> -void Parser<T>::insert_bytecode_compare_values(Vector<ByteCodeValue>& stack, Vector<CompareTypeAndValuePair>&& pairs) -{ - Vector<ByteCodeValue> bytecode; - - bytecode.empend(OpCode::Compare); - bytecode.empend(pairs.size()); // number of arguments - - for (auto& value : pairs) { - ASSERT(value.type != CharacterCompareType::RangeExpressionDummy); - ASSERT(value.type != CharacterCompareType::Undefined); - ASSERT(value.type != CharacterCompareType::OrdinaryCharacters); - - bytecode.append(move(value.type)); - if (value.type != CharacterCompareType::Inverse && value.type != CharacterCompareType::AnySingleCharacter) - bytecode.append(move(value.value)); - } - - stack.append(move(bytecode)); -} - -template<class T> -void Parser<T>::insert_bytecode_group_capture_left(Vector<ByteCodeValue>& stack) -{ - stack.empend(OpCode::SaveLeftCaptureGroup); - stack.empend(m_parser_state.capture_groups_count); -} - -template<class T> -void Parser<T>::insert_bytecode_group_capture_left(Vector<ByteCodeValue>& stack, const StringView& name) -{ - stack.empend(OpCode::SaveLeftNamedCaptureGroup); - stack.empend(name.characters_without_null_termination()); - stack.empend(name.length()); -} - -template<class T> -void Parser<T>::insert_bytecode_group_capture_right(Vector<ByteCodeValue>& stack) -{ - stack.empend(OpCode::SaveRightCaptureGroup); - stack.empend(m_parser_state.capture_groups_count); -} - -template<class T> -void Parser<T>::insert_bytecode_group_capture_right(Vector<ByteCodeValue>& stack, const StringView& name) -{ - stack.empend(OpCode::SaveRightNamedCaptureGroup); - stack.empend(name.characters_without_null_termination()); - stack.empend(name.length()); -} - -template<class T> -void Parser<T>::insert_bytecode_alternation(Vector<ByteCodeValue>& stack, Vector<ByteCodeValue>&& left, Vector<ByteCodeValue>&& right) -{ - - // FORKSTAY _ALT - // REGEXP ALT1 - // JUMP _END - // LABEL _ALT - // REGEXP ALT2 - // LABEL _END - - stack.empend(OpCode::ForkJump); - stack.empend(left.size() + 2); // Jump to the _ALT label - - for (auto& op : left) - stack.append(move(op)); - - stack.empend(OpCode::Jump); - stack.empend(right.size()); // Jump to the _END label - - // LABEL _ALT = bytecode.size() + 2 - - for (auto& op : right) - stack.append(move(op)); - - // LABEL _END = alterantive_bytecode.size -} - -template<class T> -void Parser<T>::insert_bytecode_repetition_min_max(Vector<ByteCodeValue>& bytecode_to_repeat, size_t minimum, Optional<size_t> maximum) -{ - Vector<ByteCodeValue> new_bytecode; - insert_bytecode_repetition_n(new_bytecode, bytecode_to_repeat, minimum); - - if (maximum.has_value()) { - if (maximum.value() > minimum) { - auto diff = maximum.value() - minimum; - new_bytecode.empend(OpCode::ForkStay); - new_bytecode.empend(diff * (bytecode_to_repeat.size() + 2)); // Jump to the _END label - - for (size_t i = 0; i < diff; ++i) { - new_bytecode.append(bytecode_to_repeat); - new_bytecode.empend(OpCode::ForkStay); - new_bytecode.empend((diff - i - 1) * (bytecode_to_repeat.size() + 2)); // Jump to the _END label - } - } - } else { - // no maximum value set, repeat finding if possible - new_bytecode.empend(OpCode::ForkJump); - new_bytecode.empend(-bytecode_to_repeat.size() - 2); // Jump to the last iteration - } - - bytecode_to_repeat = move(new_bytecode); -} - -template<class T> -void Parser<T>::insert_bytecode_repetition_n(Vector<ByteCodeValue>& stack, Vector<ByteCodeValue>& bytecode_to_repeat, size_t n) -{ - for (size_t i = 0; i < n; ++i) - stack.append(bytecode_to_repeat); -} - -template<class T> -void Parser<T>::insert_bytecode_repetition_min_one(Vector<ByteCodeValue>& bytecode_to_repeat, bool greedy) -{ - // LABEL _START = -bytecode_to_repeat.size() - // REGEXP - // FORKJUMP _START (FORKSTAY -> Greedy) - - if (greedy) - bytecode_to_repeat.empend(OpCode::ForkStay); - else - bytecode_to_repeat.empend(OpCode::ForkJump); - - bytecode_to_repeat.empend(-bytecode_to_repeat.size() - 1); // Jump to the _START label -} - -template<class T> -void Parser<T>::insert_bytecode_repetition_any(Vector<ByteCodeValue>& bytecode_to_repeat, bool greedy) -{ - // LABEL _START - // FORKSTAY _END (FORKJUMP -> Greedy) - // REGEXP - // JUMP _START - // LABEL _END - - // LABEL _START = stack.size(); - Vector<ByteCodeValue> bytecode; - - if (greedy) - bytecode.empend(OpCode::ForkJump); - else - bytecode.empend(OpCode::ForkStay); - - bytecode.empend(bytecode_to_repeat.size() + 2); // Jump to the _END label - - for (auto& op : bytecode_to_repeat) - bytecode.append(move(op)); - - bytecode.empend(OpCode::Jump); - bytecode.empend(-bytecode.size() - 1); // Jump to the _START label - // LABEL _END = bytecode.size() - - bytecode_to_repeat = move(bytecode); -} - -template<class T> -void Parser<T>::insert_bytecode_repetition_zero_or_one(Vector<ByteCodeValue>& bytecode_to_repeat, bool greedy) -{ - // FORKSTAY _END (FORKJUMP -> Greedy) - // REGEXP - // LABEL _END - Vector<ByteCodeValue> bytecode; - - if (greedy) - bytecode.empend(OpCode::ForkJump); - else - bytecode.empend(OpCode::ForkStay); - - bytecode.empend(bytecode_to_repeat.size()); // Jump to the _END label - - for (auto& op : bytecode_to_repeat) - bytecode.append(move(op)); - // LABEL _END = bytecode.size() - - bytecode_to_repeat = move(bytecode); -} - -// ============================= -// PosixExtended Parser -// ============================= - -bool PosixExtendedParser::parse_internal(Vector<ByteCodeValue>& stack, size_t& match_length_minimum) -{ - return parse_root(stack, match_length_minimum); -} - -bool PosixExtendedParser::match_repetition_symbol() -{ - auto type = m_parser_state.current_token.type(); - return (type == TokenType::Asterisk - || type == TokenType::Plus - || type == TokenType::Questionmark - || type == TokenType::LeftCurly); -} - -bool PosixExtendedParser::match_ordinary_characters() -{ - // NOTE: This method must not be called during bracket and repetition parsing! - // FIXME: Add assertion for that? - auto type = m_parser_state.current_token.type(); - return (type == TokenType::OrdinaryCharacter - || type == TokenType::Comma - || type == TokenType::Slash - || type == TokenType::EqualSign - || type == TokenType::HyphenMinus - || type == TokenType::Colon); -} - -bool PosixExtendedParser::parse_repetition_symbol(Vector<ByteCodeValue>& bytecode_to_repeat, size_t& match_length_minimum) -{ - if (match(TokenType::LeftCurly)) { - consume(); - - StringBuilder number_builder; - bool ok; - - while (match(TokenType::OrdinaryCharacter)) { - number_builder.append(consume().value()); - } - - size_t minimum = number_builder.build().to_uint(ok); - if (!ok) - return set_error(Error::InvalidBraceContent); - - match_length_minimum *= minimum; - - if (match(TokenType::Comma)) { - consume(); - } else { - Vector<ByteCodeValue> bytecode; - insert_bytecode_repetition_n(bytecode, bytecode_to_repeat, minimum); - bytecode_to_repeat = move(bytecode); - return !has_error(); - } - - Optional<size_t> maximum {}; - number_builder.clear(); - while (match(TokenType::OrdinaryCharacter)) { - number_builder.append(consume().value()); - } - if (!number_builder.is_empty()) { - maximum = number_builder.build().to_uint(ok); - if (!ok || minimum > maximum.value()) - return set_error(Error::InvalidBraceContent); - } - - insert_bytecode_repetition_min_max(bytecode_to_repeat, minimum, maximum); - - consume(TokenType::RightCurly, Error::BraceMismatch); - return !has_error(); - - } else if (match(TokenType::Plus)) { - consume(); - - bool greedy = match(TokenType::Questionmark); - if (greedy) - consume(); - - // Note: dont touch match_length_minimum, it's already correct - insert_bytecode_repetition_min_one(bytecode_to_repeat, greedy); - return !has_error(); - - } else if (match(TokenType::Asterisk)) { - consume(); - match_length_minimum = 0; - - bool greedy = match(TokenType::Questionmark); - if (greedy) - consume(); - - insert_bytecode_repetition_any(bytecode_to_repeat, greedy); - - return !has_error(); - - } else if (match(TokenType::Questionmark)) { - consume(); - match_length_minimum = 0; - - bool greedy = match(TokenType::Questionmark); - if (greedy) - consume(); - - insert_bytecode_repetition_zero_or_one(bytecode_to_repeat, greedy); - return !has_error(); - } - - return false; -} - -bool PosixExtendedParser::parse_bracket_expression(Vector<ByteCodeValue>& stack, size_t& match_length_minimum) -{ - Vector<CompareTypeAndValuePair> values; - - for (;;) { - - if (match(TokenType::HyphenMinus)) { - consume(); - if (values.is_empty() || (values.size() == 1 && values.last().type == CharacterCompareType::Inverse)) { - // first in the bracket expression - values.append({ CharacterCompareType::OrdinaryCharacter, { '-' } }); - - } else if (match(TokenType::RightBracket)) { - // Last in the bracket expression - values.append({ CharacterCompareType::OrdinaryCharacter, { '-' } }); - - } else if (values.last().type == CharacterCompareType::OrdinaryCharacter) { - - values.append({ CharacterCompareType::RangeExpressionDummy, 0 }); - - if (match(TokenType::HyphenMinus)) { - consume(); - // Valid range, add ordinary character - values.append({ CharacterCompareType::OrdinaryCharacter, { '-' } }); - } - } else { - return set_error(Error::InvalidRange); - } - - } else if (match(TokenType::OrdinaryCharacter) || match(TokenType::Period) || match(TokenType::Asterisk) || match(TokenType::EscapeSequence) || match(TokenType::Plus)) { - values.append({ CharacterCompareType::OrdinaryCharacter, { *consume().value().characters_without_null_termination() } }); - - } else if (match(TokenType::Circumflex)) { - auto t = consume(); - - if (values.is_empty()) - values.append({ CharacterCompareType::Inverse, 0 }); - else - values.append({ CharacterCompareType::OrdinaryCharacter, { *t.value().characters_without_null_termination() } }); - - } else if (match(TokenType::LeftBracket)) { - consume(); - - if (match(TokenType::Period)) { - consume(); - - // FIXME: Parse collating element, this is needed when we have locale support - // This could have impact on length parameter, I guess. - ASSERT_NOT_REACHED(); - - consume(TokenType::Period, Error::InvalidCollationElement); - consume(TokenType::RightBracket, Error::BracketMismatch); - - } else if (match(TokenType::EqualSign)) { - consume(); - // FIXME: Parse collating element, this is needed when we have locale support - // This could have impact on length parameter, I guess. - ASSERT_NOT_REACHED(); - - consume(TokenType::EqualSign, Error::InvalidCollationElement); - consume(TokenType::RightBracket, Error::BracketMismatch); - - } else if (match(TokenType::Colon)) { - consume(); - - CharacterClass ch_class; - // parse character class - if (match(TokenType::OrdinaryCharacter)) { - if (consume("alnum")) - ch_class = CharacterClass::Alnum; - else if (consume("alpha")) - ch_class = CharacterClass::Alpha; - else if (consume("blank")) - ch_class = CharacterClass::Blank; - else if (consume("cntrl")) - ch_class = CharacterClass::Cntrl; - else if (consume("digit")) - ch_class = CharacterClass::Digit; - else if (consume("graph")) - ch_class = CharacterClass::Graph; - else if (consume("lower")) - ch_class = CharacterClass::Lower; - else if (consume("print")) - ch_class = CharacterClass::Print; - else if (consume("punct")) - ch_class = CharacterClass::Punct; - else if (consume("space")) - ch_class = CharacterClass::Space; - else if (consume("upper")) - ch_class = CharacterClass::Upper; - else if (consume("xdigit")) - ch_class = CharacterClass::Xdigit; - else - return set_error(Error::InvalidCharacterClass); - - values.append({ CharacterCompareType::CharacterClass, ch_class }); - - } else - return set_error(Error::InvalidCharacterClass); - - // FIXME: we do not support locale specific character classes until locales are implemented - - consume(TokenType::Colon, Error::InvalidCharacterClass); - consume(TokenType::RightBracket, Error::BracketMismatch); - } - } else if (match(TokenType::RightBracket)) { - - if (values.is_empty() || (values.size() == 1 && values.last().type == CharacterCompareType::Inverse)) { - // handle bracket as ordinary character - values.append({ CharacterCompareType::OrdinaryCharacter, { *consume().value().characters_without_null_termination() } }); - } else { - // closing bracket expression - break; - } - } else - // nothing matched, this is a failure, as at least the closing bracket must match... - return set_error(Error::BracketMismatch); - - // check if range expression has to be completed... - if (values.size() >= 3 && values.at(values.size() - 2).type == CharacterCompareType::RangeExpressionDummy) { - if (values.last().type != CharacterCompareType::OrdinaryCharacter) - return set_error(Error::InvalidRange); - - auto value2 = values.take_last(); - values.take_last(); // RangeExpressionDummy - auto value1 = values.take_last(); - - values.append({ CharacterCompareType::RangeExpression, ByteCodeValue { value1.value.ch, value2.value.ch } }); - } - } - - if (values.size()) - match_length_minimum = 1; - - if (values.first().type == CharacterCompareType::Inverse) - match_length_minimum = 0; - - insert_bytecode_compare_values(stack, move(values)); - - return !has_error(); -} - -bool PosixExtendedParser::parse_sub_expression(Vector<ByteCodeValue>& stack, size_t& match_length_minimum) -{ - Vector<ByteCodeValue> bytecode; - size_t length = 0; - bool should_parse_repetition_symbol { false }; - - for (;;) { - if (match_ordinary_characters()) { - Token start_token = m_parser_state.current_token; - Token last_token = m_parser_state.current_token; - for (;;) { - if (!match_ordinary_characters()) - break; - ++length; - last_token = consume(); - } - - if (length > 1) { - stack.empend(OpCode::Compare); - stack.empend(1ul); // number of arguments - stack.empend(CharacterCompareType::OrdinaryCharacters); - stack.empend(start_token.value().characters_without_null_termination()); - stack.empend(length - ((match_repetition_symbol() && length > 1) ? 1 : 0)); // last character is inserted into 'bytecode' for duplication symbol handling - } - - if ((match_repetition_symbol() && length > 1) || length == 1) // Create own compare opcode for last character before duplication symbol - insert_bytecode_compare_values(bytecode, { { CharacterCompareType::OrdinaryCharacter, { last_token.value().characters_without_null_termination()[0] } } }); - - should_parse_repetition_symbol = true; - break; - } - - if (match_repetition_symbol()) - return set_error(Error::InvalidRepetitionMarker); - - if (match(TokenType::Period)) { - length = 1; - consume(); - insert_bytecode_compare_values(bytecode, { { CharacterCompareType::AnySingleCharacter, { 0 } } }); - should_parse_repetition_symbol = true; - break; - } - - if (match(TokenType::EscapeSequence)) { - length = 1; - Token t = consume(); -#ifdef REGEX_DEBUG - printf("[PARSER] EscapeSequence with substring %s\n", String(t.value()).characters()); -#endif - - insert_bytecode_compare_values(bytecode, { { CharacterCompareType::OrdinaryCharacter, { (char)t.value().characters_without_null_termination()[1] } } }); - should_parse_repetition_symbol = true; - break; - } - - if (match(TokenType::LeftBracket)) { - consume(); - - Vector<ByteCodeValue> sub_ops; - if (!parse_bracket_expression(sub_ops, length) || !sub_ops.size()) - return set_error(Error::BracketMismatch); - - bytecode.append(move(sub_ops)); - - consume(TokenType::RightBracket); - should_parse_repetition_symbol = true; - break; - } - - if (match(TokenType::RightBracket)) { - return set_error(Error::BracketMismatch); - } - - if (match(TokenType::RightCurly)) { - return set_error(Error::BraceMismatch); - } - - if (match(TokenType::Circumflex)) { - consume(); - bytecode.empend(OpCode::CheckBegin); - break; - } - - if (match(TokenType::Dollar)) { - consume(); - bytecode.empend(OpCode::CheckEnd); - break; - } - - if (match(TokenType::LeftParen)) { - consume(); - Optional<StringView> capture_group_name; - bool no_subexpression_match_qualifier = false; - if (match(TokenType::Questionmark)) { - consume(); - - if (match(TokenType::Colon)) { - consume(); - no_subexpression_match_qualifier = true; - } else if (consume("<")) { // named capturing group - - Token start_token = m_parser_state.current_token; - Token last_token = m_parser_state.current_token; - size_t capture_group_name_length = 0; - for (;;) { - if (!match_ordinary_characters()) - return set_error(Error::InvalidNameForCaptureGroup); - if (match(TokenType::OrdinaryCharacter) && m_parser_state.current_token.value()[0] == '>') { - consume(); - break; - } - ++capture_group_name_length; - last_token = consume(); - } - capture_group_name = StringView(start_token.value().characters_without_null_termination(), capture_group_name_length); - - } else if (match(TokenType::EqualSign)) { // positive lookahead - consume(); - ASSERT_NOT_REACHED(); - } else if (consume("!")) { // negative lookahead - ASSERT_NOT_REACHED(); - } else if (consume("<")) { - if (match(TokenType::EqualSign)) { // positive lookbehind - consume(); - ASSERT_NOT_REACHED(); - } - if (consume("!")) // negative lookbehind - ASSERT_NOT_REACHED(); - } else { - return set_error(Error::InvalidRepetitionMarker); - } - } - - if (!(m_parser_state.regex_options & (u8)AllFlags::NoSubExpressions || no_subexpression_match_qualifier)) { - if (capture_group_name.has_value()) - insert_bytecode_group_capture_left(bytecode, capture_group_name.value()); - else - insert_bytecode_group_capture_left(bytecode); - } - - Vector<ByteCodeValue> capture_group_bytecode; - - bool res = !parse_root(capture_group_bytecode, length); - - if (capture_group_bytecode.is_empty() && match(TokenType::RightParen)) - return set_error(Error::ParenEmpty); - - if (!res) - return false; - - bytecode.append(move(capture_group_bytecode)); - - consume(TokenType::RightParen, Error::ParenMismatch); - - if (!(m_parser_state.regex_options & (u8)AllFlags::NoSubExpressions || no_subexpression_match_qualifier)) { - if (capture_group_name.has_value()) { - insert_bytecode_group_capture_right(bytecode, capture_group_name.value()); - ++m_parser_state.named_capture_groups_count; - } else { - insert_bytecode_group_capture_right(bytecode); - ++m_parser_state.capture_groups_count; - } - } - should_parse_repetition_symbol = true; - break; - } - - return false; - } - - if (match_repetition_symbol()) { - if (should_parse_repetition_symbol) - parse_repetition_symbol(bytecode, length); - else - return set_error(Error::InvalidRepetitionMarker); - } - - stack.append(move(bytecode)); - match_length_minimum += length; - - return true; -} - -bool PosixExtendedParser::parse_root(Vector<ByteCodeValue>& stack, size_t& match_length_minimum) -{ - Vector<ByteCodeValue> bytecode_left; - size_t match_length_minimum_left { 0 }; - - if (match_repetition_symbol()) - return set_error(Error::InvalidRepetitionMarker); - - for (;;) { - if (!parse_sub_expression(bytecode_left, match_length_minimum_left)) - break; - - if (match(TokenType::Pipe)) { - consume(); - - Vector<ByteCodeValue> bytecode_right; - size_t match_length_minimum_right { 0 }; - - if (!parse_root(bytecode_right, match_length_minimum_right) || bytecode_right.is_empty()) - return set_error(Error::InvalidPattern); - - Vector<ByteCodeValue> new_bytecode; - insert_bytecode_alternation(new_bytecode, move(bytecode_left), move(bytecode_right)); - bytecode_left = move(new_bytecode); - match_length_minimum_left = min(match_length_minimum_right, match_length_minimum_left); - } - } - - stack.append(move(bytecode_left)); - match_length_minimum = match_length_minimum_left; - return !has_error(); -} -} -} diff --git a/Libraries/LibRegex/RegexParser.h_ b/Libraries/LibRegex/RegexParser.h_ deleted file mode 100644 index 2f0dbed638..0000000000 --- a/Libraries/LibRegex/RegexParser.h_ +++ /dev/null @@ -1,257 +0,0 @@ -/* - * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com> - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#pragma once - -#include "RegexError.h" -#include "RegexLexer.h" -#include "RegexOptions.h" -#include <AK/Forward.h> -#include <AK/Types.h> -#include <AK/Vector.h> - -namespace AK { -namespace regex { - -#define ENUMERATE_OPCODES \ - __ENUMERATE_OPCODE(Compare) \ - __ENUMERATE_OPCODE(Jump) \ - __ENUMERATE_OPCODE(ForkJump) \ - __ENUMERATE_OPCODE(ForkStay) \ - __ENUMERATE_OPCODE(SaveLeftCaptureGroup) \ - __ENUMERATE_OPCODE(SaveRightCaptureGroup) \ - __ENUMERATE_OPCODE(SaveLeftNamedCaptureGroup) \ - __ENUMERATE_OPCODE(SaveRightNamedCaptureGroup) \ - __ENUMERATE_OPCODE(CheckBegin) \ - __ENUMERATE_OPCODE(CheckEnd) \ - __ENUMERATE_OPCODE(Exit) - -enum class OpCode : u8 { -#define __ENUMERATE_OPCODE(x) x, - ENUMERATE_OPCODES -#undef __ENUMERATE_OPCODE -}; - -enum class CharacterCompareType : u8 { - Undefined, - Inverse, - AnySingleCharacter, - OrdinaryCharacter, - OrdinaryCharacters, - CharacterClass, - RangeExpression, - RangeExpressionDummy, -}; - -enum class CharacterClass : u8 { - Alnum, - Cntrl, - Lower, - Space, - Alpha, - Digit, - Print, - Upper, - Blank, - Graph, - Punct, - Xdigit, -}; - -class ByteCodeValue { -public: - union CompareValue { - CompareValue(const CharacterClass value) - : character_class(value) - { - } - CompareValue(const char value1, const char value2) - : range_values { value1, value2 } - { - } - const CharacterClass character_class; - const struct { - const char from; - const char to; - } range_values; - }; - - union { - const OpCode op_code; - const char* string; - const char ch; - const int number; - const size_t positive_number; - const CompareValue compare_value; - const CharacterCompareType compare_type; - }; - - const char* name() const; - static const char* name(OpCode); - - ByteCodeValue(const OpCode value) - : op_code(value) - { - } - ByteCodeValue(const char* value) - : string(value) - { - } - ByteCodeValue(const char value) - : ch(value) - { - } - ByteCodeValue(const int value) - : number(value) - { - } - ByteCodeValue(const size_t value) - : positive_number(value) - { - } - ByteCodeValue(const CharacterClass value) - : compare_value(value) - { - } - ByteCodeValue(const char value1, const char value2) - : compare_value(value1, value2) - { - } - ByteCodeValue(const CharacterCompareType value) - : compare_type(value) - { - } - - ~ByteCodeValue() = default; -}; - -struct CompareTypeAndValuePair { - CharacterCompareType type; - ByteCodeValue value; -}; - -struct ParserResult { - Vector<ByteCodeValue> m_bytes; - size_t m_match_groups; - size_t m_min_match_length; - Error m_error; - Token m_error_token; -}; - -template<class T> -class Parser { -public: - explicit Parser(Lexer& lexer) - : m_parser_state(lexer) - { - } - - Parser(Lexer& lexer, T options) - : m_parser_state(lexer, options) - { - } - - virtual ~Parser() = default; - - virtual ParserResult parse(T options = {}, EngineOptions engine_options = {}); - bool has_error() const { return m_parser_state.m_error != Error::NoError; } - Error error() const { return m_parser_state.m_error; } - -protected: - virtual bool parse_internal(Vector<ByteCodeValue>&, size_t& min_length) = 0; - - bool match(TokenType type) const; - bool match(char ch) const; - Token consume(); - Token consume(TokenType type, Error error = Error::InvalidPattern); - bool consume(const String&); - void reset(); - bool done() const; - - bool set_error(Error error); - - void insert_bytecode_compare_values(Vector<ByteCodeValue>&, Vector<CompareTypeAndValuePair>&&); - void insert_bytecode_group_capture_left(Vector<ByteCodeValue>& stack); - void insert_bytecode_group_capture_right(Vector<ByteCodeValue>& stack); - void insert_bytecode_group_capture_left(Vector<ByteCodeValue>& stack, const StringView& name); - void insert_bytecode_group_capture_right(Vector<ByteCodeValue>& stack, const StringView& name); - void insert_bytecode_alternation(Vector<ByteCodeValue>& stack, Vector<ByteCodeValue>&&, Vector<ByteCodeValue>&&); - void insert_bytecode_repetition_min_max(Vector<ByteCodeValue>& bytecode_to_repeat, size_t minimum, Optional<size_t> maximum); - void insert_bytecode_repetition_n(Vector<ByteCodeValue>& stack, Vector<ByteCodeValue>& bytecode_to_repeat, size_t n); - void insert_bytecode_repetition_min_one(Vector<ByteCodeValue>& bytecode_to_repeat, bool greedy); - void insert_bytecode_repetition_any(Vector<ByteCodeValue>& bytecode_to_repeat, bool greedy); - void insert_bytecode_repetition_zero_or_one(Vector<ByteCodeValue>& bytecode_to_repeat, bool greedy); - - struct ParserState { - Lexer& lexer; - Token current_token; - Error error = Error::NoError; - Token error_token { TokenType::Eof, 0, StringView(nullptr) }; - Vector<ByteCodeValue> bytecode; - size_t capture_groups_count { 0 }; - size_t named_capture_groups_count { 0 }; - size_t match_length_minimum { 0 }; - OptionsType regex_options; - explicit ParserState(Lexer& lexer) - : lexer(lexer) - , current_token(lexer.next()) - { - } - explicit ParserState(Lexer& lexer, Optional<OptionsType> regex_options) - : lexer(lexer) - , current_token(lexer.next()) - , regex_options(regex_options.value_or({})) - { - } - }; - - ParserState m_parser_state; -}; - -class PosixExtendedParser final : public Parser<PosixOptions> { -public: - explicit PosixExtendedParser(Lexer& lexer) - : Parser(lexer) {}; - PosixExtendedParser(Lexer& lexer, Optional<OptionsType> regex_options) - : Parser(lexer, regex_options) {}; - ~PosixExtendedParser() = default; - -private: - bool match_repetition_symbol(); - bool match_ordinary_characters(); - - bool parse_internal(Vector<ByteCodeValue>&, size_t&) override; - - bool parse_root(Vector<ByteCodeValue>&, size_t&); - bool parse_sub_expression(Vector<ByteCodeValue>&, size_t&); - bool parse_bracket_expression(Vector<ByteCodeValue>&, size_t&); - bool parse_repetition_symbol(Vector<ByteCodeValue>&, size_t&); -}; -} -} - -using AK::regex::ParserResult; -using AK::regex::PosixExtendedParser; diff --git a/Libraries/LibRegex/Tests/Regex.cpp b/Libraries/LibRegex/Tests/Regex.cpp index 9c73b54914..8784cab0e4 100644 --- a/Libraries/LibRegex/Tests/Regex.cpp +++ b/Libraries/LibRegex/Tests/Regex.cpp @@ -26,9 +26,9 @@ #include <AK/TestSuite.h> // import first, to prevent warning of ASSERT* redefinition +#include <AK/StringBuilder.h> #include <LibRegex/Regex.h> #include <LibRegex/RegexDebug.h> -#include <AK/StringBuilder.h> #include <stdio.h> static ECMAScriptOptions match_test_api_options(const ECMAScriptOptions options) |