diff options
26 files changed, 7424 insertions, 6 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 7ee9852f8d..c2df09f98d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -84,6 +84,7 @@ add_subdirectory(Meta/Lagom) add_subdirectory(DevTools/IPCCompiler) add_subdirectory(Libraries/LibWeb/CodeGenerators) add_subdirectory(AK/Tests) +add_subdirectory(Libraries/LibRegex/Tests) set(write_if_different ${CMAKE_SOURCE_DIR}/Meta/write-only-on-difference.sh) diff --git a/Libraries/CMakeLists.txt b/Libraries/CMakeLists.txt index 795eeb0f95..1d205d3cc3 100644 --- a/Libraries/CMakeLists.txt +++ b/Libraries/CMakeLists.txt @@ -1,15 +1,17 @@ add_subdirectory(LibAudio) add_subdirectory(LibC) add_subdirectory(LibChess) +add_subdirectory(LibCompress) add_subdirectory(LibCore) +add_subdirectory(LibCpp) add_subdirectory(LibCrypt) add_subdirectory(LibCrypto) -add_subdirectory(LibCompress) add_subdirectory(LibDebug) add_subdirectory(LibDesktop) +add_subdirectory(LibDiff) +add_subdirectory(LibGUI) add_subdirectory(LibGemini) add_subdirectory(LibGfx) -add_subdirectory(LibGUI) add_subdirectory(LibHTTP) add_subdirectory(LibIPC) add_subdirectory(LibImageDecoderClient) @@ -21,12 +23,11 @@ add_subdirectory(LibMarkdown) add_subdirectory(LibPCIDB) add_subdirectory(LibProtocol) add_subdirectory(LibPthread) +add_subdirectory(LibRegex) +add_subdirectory(LibTLS) add_subdirectory(LibTar) add_subdirectory(LibTextCodec) add_subdirectory(LibThread) -add_subdirectory(LibTLS) add_subdirectory(LibVT) add_subdirectory(LibWeb) add_subdirectory(LibX86) -add_subdirectory(LibDiff) -add_subdirectory(LibCpp) diff --git a/Libraries/LibC/regex.h b/Libraries/LibC/regex.h new file mode 100644 index 0000000000..3fbf8fb4c3 --- /dev/null +++ b/Libraries/LibC/regex.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#pragma once + +#include <stddef.h> +#include <sys/types.h> + +__BEGIN_DECLS + +typedef ssize_t regoff_t; + +struct regex_t { + void* __data; +}; + +enum __Regex_Error { + __Regex_NoError, + __Regex_InvalidPattern, // Invalid regular expression. + __Regex_InvalidCollationElement, // Invalid collating element referenced. + __Regex_InvalidCharacterClass, // Invalid character class type referenced. + __Regex_InvalidTrailingEscape, // Trailing \ in pattern. + __Regex_InvalidNumber, // Number in \digit invalid or in error. + __Regex_MismatchingBracket, // [ ] imbalance. + __Regex_MismatchingParen, // ( ) imbalance. + __Regex_MismatchingBrace, // { } imbalance. + __Regex_InvalidBraceContent, // Content of {} invalid: not a number, number too large, more than two numbers, first larger than second. + __Regex_InvalidBracketContent, // Content of [] invalid. + __Regex_InvalidRange, // Invalid endpoint in range expression. + __Regex_InvalidRepetitionMarker, // ?, * or + not preceded by valid regular expression. + __Regex_ReachedMaxRecursion, // MaximumRecursion has been reached. + __Regex_EmptySubExpression, // Sub expression has empty content. + __Regex_InvalidCaptureGroup, // Content of capture group is invalid. + __Regex_InvalidNameForCaptureGroup, // Name of capture group is invalid. +}; + +enum ReError { + REG_NOERR = __Regex_NoError, + REG_BADPAT = __Regex_InvalidPattern, // Invalid regular expression. + REG_ECOLLATE = __Regex_InvalidCollationElement, // Invalid collating element referenced. + REG_ECTYPE = __Regex_InvalidCharacterClass, // Invalid character class type referenced. + REG_EESCAPE = __Regex_InvalidTrailingEscape, // Trailing \ in pattern. + REG_ESUBREG = __Regex_InvalidNumber, // Number in \digit invalid or in error. + REG_EBRACK = __Regex_MismatchingBracket, // [ ] imbalance. + REG_EPAREN = __Regex_MismatchingParen, // \( \) or ( ) imbalance. + REG_EBRACE = __Regex_MismatchingBrace, // \{ \} imbalance. + REG_BADBR = __Regex_InvalidBraceContent, // Content of \{ \} invalid: not a number, number too large, more than two numbers, first larger than second. + REG_ERANGE = __Regex_InvalidRange, // Invalid endpoint in range expression. + REG_BADRPT = __Regex_InvalidRepetitionMarker, // ?, * or + not preceded by valid regular expression. + REG_EMPTY_EXPR = __Regex_EmptySubExpression, // Empty expression + REG_ENOSYS, // The implementation does not support the function. + REG_ESPACE, // Out of memory. + REG_NOMATCH, // regexec() failed to match. +}; + +struct regmatch_t { + regoff_t rm_so; // byte offset from start of string to start of substring + regoff_t rm_eo; // byte offset from start of string of the first character after the end of substring + regoff_t rm_cnt; // number of matches +}; + +enum __RegexAllFlags { + __Regex_Global = 1, // All matches (don't return after first match) + __Regex_Insensitive = __Regex_Global << 1, // Case insensitive match (ignores case of [a-zA-Z]) + __Regex_Ungreedy = __Regex_Global << 2, // The match becomes lazy by default. Now a ? following a quantifier makes it greedy + __Regex_Unicode = __Regex_Global << 3, // Enable all unicode features and interpret all unicode escape sequences as such + __Regex_Extended = __Regex_Global << 4, // Ignore whitespaces. Spaces and text after a # in the pattern are ignored + __Regex_Extra = __Regex_Global << 5, // Disallow meaningless escapes. A \ followed by a letter with no special meaning is faulted + __Regex_MatchNotBeginOfLine = __Regex_Global << 6, // Pattern is not forced to ^ -> search in whole string! + __Regex_MatchNotEndOfLine = __Regex_Global << 7, // Don't Force the dollar sign, $, to always match end of the string, instead of end of the line. This option is ignored if the Multiline-flag is set + __Regex_SkipSubExprResults = __Regex_Global << 8, // Do not return sub expressions in the result + __Regex_StringCopyMatches = __Regex_Global << 9, // Do explicitly copy results into new allocated string instead of StringView to original string. + __Regex_SingleLine = __Regex_Global << 10, // Dot matches newline characters + __Regex_Sticky = __Regex_Global << 11, // Force the pattern to only match consecutive matches from where the previous match ended. + __Regex_Multiline = __Regex_Global << 12, // Handle newline characters. Match each line, one by one. + __Regex_SkipTrimEmptyMatches = __Regex_Global << 13, // Do not remove empty capture group results. + __Regex_Last = __Regex_SkipTrimEmptyMatches +}; + +// Values for the cflags parameter to the regcomp() function: +#define REG_EXTENDED __Regex_Extended // Use Extended Regular Expressions. +#define REG_ICASE __Regex_Insensitive // Ignore case in match. +#define REG_NOSUB __Regex_SkipSubExprResults // Report only success or fail in regexec(). +#define REG_GLOBAL __Regex_Global // Don't stop searching for more match +#define REG_NEWLINE (__Regex_Multiline | REG_GLOBAL) // Change the handling of newline. + +// Values for the eflags parameter to the regexec() function: +#define REG_NOTBOL __Regex_MatchNotBeginOfLine // The circumflex character (^), when taken as a special character, will not match the beginning of string. +#define REG_NOTEOL __Regex_MatchNotEndOfLine // The dollar sign ($), when taken as a special character, will not match the end of string. + +//static_assert (sizeof(FlagsUnderlyingType) * 8 >= regex::POSIXFlags::Last << 1), "flags type too small") +#define REG_SEARCH __Regex_Last << 1 + +int regcomp(regex_t*, const char*, int); +int regexec(const regex_t*, const char*, size_t, regmatch_t[], int); +size_t regerror(int, const regex_t*, char*, size_t); +void regfree(regex_t*); + +__END_DECLS diff --git a/Libraries/LibRegex/C/Regex.cpp b/Libraries/LibRegex/C/Regex.cpp new file mode 100644 index 0000000000..7bb36204fb --- /dev/null +++ b/Libraries/LibRegex/C/Regex.cpp @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <AK/String.h> +#include <AK/StringBuilder.h> +#include <LibRegex/Regex.h> +#include <ctype.h> +#include <stdio.h> +#include <string.h> + +#ifdef __serenity__ +# include <regex.h> +#else +# include <LibC/regex.h> +#endif + +struct internal_regex_t { + u8 cflags; + u8 eflags; + OwnPtr<Regex<PosixExtended>> re; + size_t re_pat_errpos; + ReError re_pat_err; + String re_pat; + size_t re_nsub; +}; + +static internal_regex_t* impl_from(regex_t* re) +{ + if (!re) + return nullptr; + + return reinterpret_cast<internal_regex_t*>(re->__data); +} + +static const internal_regex_t* impl_from(const regex_t* re) +{ + return impl_from(const_cast<regex_t*>(re)); +} + +extern "C" { + +int regcomp(regex_t* reg, const char* pattern, int cflags) +{ + if (!reg) + return REG_ESPACE; + + // Note that subsequent uses of regcomp() without regfree() _will_ leak memory + // This could've been prevented if libc provided a reginit() or similar, but it does not. + reg->__data = new internal_regex_t { 0, 0, nullptr, 0, ReError::REG_NOERR, {}, 0 }; + + auto preg = impl_from(reg); + + if (!(cflags & REG_EXTENDED)) + return REG_ENOSYS; + + preg->cflags = cflags; + + String pattern_str(pattern); + preg->re = make<Regex<PosixExtended>>(pattern_str, PosixOptions {} | (PosixFlags)cflags); + + auto parser_result = preg->re->parser_result; + if (parser_result.error != regex::Error::NoError) { + preg->re_pat_errpos = parser_result.error_token.position(); + preg->re_pat_err = (ReError)parser_result.error; + preg->re_pat = pattern; + + dbg() << "Have Error: " << (ReError)parser_result.error; + + return (ReError)parser_result.error; + } + + preg->re_nsub = parser_result.capture_groups_count; + + return REG_NOERR; +} + +int regexec(const regex_t* reg, const char* string, size_t nmatch, regmatch_t pmatch[], int eflags) +{ + auto preg = impl_from(reg); + + if (!preg->re || preg->re_pat_err) { + if (preg->re_pat_err) + return preg->re_pat_err; + return REG_BADPAT; + } + + RegexResult result; + if (eflags & REG_SEARCH) + result = preg->re->search({ string }, PosixOptions {} | (PosixFlags)eflags); + else + result = preg->re->match({ string }, PosixOptions {} | (PosixFlags)eflags); + + if (result.success) { + auto size = result.matches.size(); + if (size && nmatch && pmatch) { + pmatch[0].rm_cnt = size; + + size_t match_index { 0 }; + for (size_t i = 0; i < size; ++i) { + pmatch[match_index].rm_so = result.matches.at(i).global_offset; + pmatch[match_index].rm_eo = pmatch[match_index].rm_so + result.matches.at(i).view.length(); + if (match_index > 0) + pmatch[match_index].rm_cnt = result.capture_group_matches.size(); + + ++match_index; + if (match_index >= nmatch) + return REG_NOERR; + + if (i < result.capture_group_matches.size()) { + auto capture_groups_size = result.capture_group_matches.at(i).size(); + for (size_t j = 0; j < preg->re->parser_result.capture_groups_count; ++j) { + if (j >= capture_groups_size || !result.capture_group_matches.at(i).at(j).view.length()) { + pmatch[match_index].rm_so = -1; + pmatch[match_index].rm_eo = -1; + pmatch[match_index].rm_cnt = 0; + } else { + pmatch[match_index].rm_so = result.capture_group_matches.at(i).at(j).global_offset; + pmatch[match_index].rm_eo = pmatch[match_index].rm_so + result.capture_group_matches.at(i).at(j).view.length(); + pmatch[match_index].rm_cnt = 1; + } + + ++match_index; + if (match_index >= nmatch) + return REG_NOERR; + } + } + } + + if (match_index < nmatch) { + for (size_t i = match_index; i < nmatch; ++i) { + pmatch[i].rm_so = -1; + pmatch[i].rm_eo = -1; + pmatch[i].rm_cnt = 0; + } + } + } + return REG_NOERR; + } else { + if (nmatch && pmatch) { + pmatch[0].rm_so = -1; + pmatch[0].rm_eo = -1; + pmatch[0].rm_cnt = 0; + } + } + + return REG_NOMATCH; +} + +inline static String get_error(ReError errcode) +{ + String error; + switch ((ReError)errcode) { + case REG_NOERR: + error = "No error"; + break; + case REG_NOMATCH: + error = "regexec() failed to match."; + break; + case REG_BADPAT: + error = "Invalid regular expression."; + break; + case REG_ECOLLATE: + error = "Invalid collating element referenced."; + break; + case REG_ECTYPE: + error = "Invalid character class type referenced."; + break; + case REG_EESCAPE: + error = "Trailing \\ in pattern."; + break; + case REG_ESUBREG: + error = "Number in \\digit invalid or in error."; + break; + case REG_EBRACK: + error = "[ ] imbalance."; + break; + case REG_EPAREN: + error = "\\( \\) or ( ) imbalance."; + break; + case REG_EBRACE: + error = "\\{ \\} imbalance."; + break; + case REG_BADBR: + error = "Content of \\{ \\} invalid: not a number, number too large, more than two numbers, first larger than second."; + break; + case REG_ERANGE: + error = "Invalid endpoint in range expression."; + break; + case REG_ESPACE: + error = "Out of memory."; + break; + case REG_BADRPT: + error = "?, * or + not preceded by valid regular expression."; + break; + case REG_ENOSYS: + error = "The implementation does not support the function."; + break; + case REG_EMPTY_EXPR: + error = "Empty expression provided"; + break; + } + + return error; +} + +size_t regerror(int errcode, const regex_t* reg, char* errbuf, size_t errbuf_size) +{ + String error; + auto preg = impl_from(reg); + + if (!preg) + error = get_error((ReError)errcode); + else + error = preg->re->error_string(get_error(preg->re_pat_err)); + + if (!errbuf_size) + return error.length(); + + if (!error.copy_characters_to_buffer(errbuf, errbuf_size)) + return 0; + + return error.length(); +} + +void regfree(regex_t* reg) +{ + auto preg = impl_from(reg); + if (preg) { + delete preg; + reg->__data = nullptr; + } +} +} diff --git a/Libraries/LibRegex/CMakeLists.txt b/Libraries/LibRegex/CMakeLists.txt new file mode 100644 index 0000000000..44cc6928f9 --- /dev/null +++ b/Libraries/LibRegex/CMakeLists.txt @@ -0,0 +1,10 @@ +set(SOURCES + C/Regex.cpp + RegexByteCode.cpp + RegexLexer.cpp + RegexMatcher.cpp + RegexParser.cpp +) + +serenity_lib(LibRegex regex) +target_link_libraries(LibRegex LibC LibCore) diff --git a/Libraries/LibRegex/Forward.h b/Libraries/LibRegex/Forward.h new file mode 100644 index 0000000000..b35a0f3476 --- /dev/null +++ b/Libraries/LibRegex/Forward.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#pragma once + +#include <AK/Types.h> + +namespace regex { +enum class Error : u8; +class Lexer; +class PosixExtendedParser; + +class ByteCode; +class OpCode; +class OpCode_Exit; +class OpCode_Jump; +class OpCode_ForkJump; +class OpCode_ForkStay; +class OpCode_CheckBegin; +class OpCode_CheckEnd; +class OpCode_SaveLeftCaptureGroup; +class OpCode_SaveRightCaptureGroup; +class OpCode_SaveLeftNamedCaptureGroup; +class OpCode_SaveNamedLeftCaptureGroup; +class OpCode_SaveRightNamedCaptureGroup; +class OpCode_Compare; +} + +using regex::Error; +using regex::Lexer; +using regex::PosixExtendedParser; diff --git a/Libraries/LibRegex/Regex.h b/Libraries/LibRegex/Regex.h new file mode 100644 index 0000000000..d3fdcec7f4 --- /dev/null +++ b/Libraries/LibRegex/Regex.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#pragma once + +#include <LibRegex/Forward.h> +#include <LibRegex/RegexDebug.h> +#include <LibRegex/RegexMatcher.h> diff --git a/Libraries/LibRegex/RegexByteCode.cpp b/Libraries/LibRegex/RegexByteCode.cpp new file mode 100644 index 0000000000..27bd16002e --- /dev/null +++ b/Libraries/LibRegex/RegexByteCode.cpp @@ -0,0 +1,572 @@ +/* + * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "RegexByteCode.h" +#include "AK/StringBuilder.h" +#include "RegexDebug.h" + +#include <ctype.h> + +namespace regex { + +const char* OpCode::name(OpCodeId opcode_id) +{ + switch (opcode_id) { +#define __ENUMERATE_OPCODE(x) \ + case OpCodeId::x: \ + return #x; + ENUMERATE_OPCODES +#undef __ENUMERATE_OPCODE + default: + ASSERT_NOT_REACHED(); + return "<Unknown>"; + } +} + +const char* OpCode::name() const +{ + return name(opcode_id()); +} + +const char* execution_result_name(ExecutionResult result) +{ + switch (result) { +#define __ENUMERATE_EXECUTION_RESULT(x) \ + case ExecutionResult::x: \ + return #x; + ENUMERATE_EXECUTION_RESULTS +#undef __ENUMERATE_EXECUTION_RESULT + default: + ASSERT_NOT_REACHED(); + return "<Unknown>"; + } +} + +const char* character_compare_type_name(CharacterCompareType ch_compare_type) +{ + switch (ch_compare_type) { +#define __ENUMERATE_CHARACTER_COMPARE_TYPE(x) \ + case CharacterCompareType::x: \ + return #x; + ENUMERATE_CHARACTER_COMPARE_TYPES +#undef __ENUMERATE_CHARACTER_COMPARE_TYPE + default: + ASSERT_NOT_REACHED(); + return "<Unknown>"; + } +} + +static const char* character_class_name(CharClass ch_class) +{ + switch (ch_class) { +#define __ENUMERATE_CHARACTER_CLASS(x) \ + case CharClass::x: \ + return #x; + ENUMERATE_CHARACTER_CLASSES +#undef __ENUMERATE_CHARACTER_CLASS + default: + ASSERT_NOT_REACHED(); + return "<Unknown>"; + } +} + +HashMap<u32, OwnPtr<OpCode>> ByteCode::s_opcodes {}; + +ALWAYS_INLINE OpCode* ByteCode::get_opcode_by_id(OpCodeId id) const +{ + if (!s_opcodes.size()) { + for (u32 i = (u32)OpCodeId::First; i <= (u32)OpCodeId::Last; ++i) { + switch ((OpCodeId)i) { + case OpCodeId::Exit: + s_opcodes.set(i, make<OpCode_Exit>(*const_cast<ByteCode*>(this))); + break; + case OpCodeId::Jump: + s_opcodes.set(i, make<OpCode_Jump>(*const_cast<ByteCode*>(this))); + break; + case OpCodeId::Compare: + s_opcodes.set(i, make<OpCode_Compare>(*const_cast<ByteCode*>(this))); + break; + case OpCodeId::CheckEnd: + s_opcodes.set(i, make<OpCode_CheckEnd>(*const_cast<ByteCode*>(this))); + break; + case OpCodeId::ForkJump: + s_opcodes.set(i, make<OpCode_ForkJump>(*const_cast<ByteCode*>(this))); + break; + case OpCodeId::ForkStay: + s_opcodes.set(i, make<OpCode_ForkStay>(*const_cast<ByteCode*>(this))); + break; + case OpCodeId::CheckBegin: + s_opcodes.set(i, make<OpCode_CheckBegin>(*const_cast<ByteCode*>(this))); + break; + case OpCodeId::SaveLeftCaptureGroup: + s_opcodes.set(i, make<OpCode_SaveLeftCaptureGroup>(*const_cast<ByteCode*>(this))); + break; + case OpCodeId::SaveRightCaptureGroup: + s_opcodes.set(i, make<OpCode_SaveRightCaptureGroup>(*const_cast<ByteCode*>(this))); + break; + case OpCodeId::SaveLeftNamedCaptureGroup: + s_opcodes.set(i, make<OpCode_SaveLeftNamedCaptureGroup>(*const_cast<ByteCode*>(this))); + break; + case OpCodeId::SaveRightNamedCaptureGroup: + s_opcodes.set(i, make<OpCode_SaveRightNamedCaptureGroup>(*const_cast<ByteCode*>(this))); + break; + } + } + } + + if (id > OpCodeId::Last) + return nullptr; + + return const_cast<OpCode*>(s_opcodes.get((u32)id).value())->set_bytecode(*const_cast<ByteCode*>(this)); +} + +OpCode* ByteCode::get_opcode(MatchState& state) const +{ + OpCode* op_code; + + if (state.instruction_position >= size()) { + op_code = get_opcode_by_id(OpCodeId::Exit); + } else + op_code = get_opcode_by_id((OpCodeId)at(state.instruction_position)); + + if (op_code) + op_code->set_state(state); + + return op_code; +} + +ALWAYS_INLINE ExecutionResult OpCode_Exit::execute(const MatchInput& input, MatchState& state, MatchOutput&) const +{ + if (state.string_position > input.view.length() || state.instruction_position >= m_bytecode->size()) + return ExecutionResult::Succeeded; + + return ExecutionResult::Failed; +} + +ALWAYS_INLINE ExecutionResult OpCode_Jump::execute(const MatchInput&, MatchState& state, MatchOutput&) const +{ + + state.instruction_position += offset(); + return ExecutionResult::Continue; +} + +ALWAYS_INLINE ExecutionResult OpCode_ForkJump::execute(const MatchInput&, MatchState& state, MatchOutput&) const +{ + state.fork_at_position = state.instruction_position + size() + offset(); + return ExecutionResult::Fork_PrioHigh; +} + +ALWAYS_INLINE ExecutionResult OpCode_ForkStay::execute(const MatchInput&, MatchState& state, MatchOutput&) const +{ + state.fork_at_position = state.instruction_position + size() + offset(); + return ExecutionResult::Fork_PrioLow; +} + +ALWAYS_INLINE ExecutionResult OpCode_CheckBegin::execute(const MatchInput& input, MatchState& state, MatchOutput&) const +{ + if (0 == state.string_position && (input.regex_options & AllFlags::MatchNotBeginOfLine)) + return ExecutionResult::Failed; + + if ((0 == state.string_position && !(input.regex_options & AllFlags::MatchNotBeginOfLine)) + || (0 != state.string_position && (input.regex_options & AllFlags::MatchNotBeginOfLine)) + || (0 == state.string_position && (input.regex_options & AllFlags::Global))) + return ExecutionResult::Continue; + + return ExecutionResult::Failed; +} + +ALWAYS_INLINE ExecutionResult OpCode_CheckEnd::execute(const MatchInput& input, MatchState& state, MatchOutput&) const +{ + if (state.string_position == input.view.length() && (input.regex_options & AllFlags::MatchNotEndOfLine)) + return ExecutionResult::Failed; + + if ((state.string_position == input.view.length() && !(input.regex_options & AllFlags::MatchNotEndOfLine)) + || (state.string_position != input.view.length() && (input.regex_options & AllFlags::MatchNotEndOfLine || input.regex_options & AllFlags::MatchNotBeginOfLine))) + return ExecutionResult::Succeeded; + + return ExecutionResult::Failed; +} + +ALWAYS_INLINE ExecutionResult OpCode_SaveLeftCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const +{ + if (input.match_index >= output.capture_group_matches.size()) { + output.capture_group_matches.ensure_capacity(input.match_index); + auto capacity = output.capture_group_matches.capacity(); + for (size_t i = output.capture_group_matches.size(); i <= capacity; ++i) + output.capture_group_matches.empend(); + } + + if (id() >= output.capture_group_matches.at(input.match_index).size()) { + output.capture_group_matches.at(input.match_index).ensure_capacity(id()); + auto capacity = output.capture_group_matches.at(input.match_index).capacity(); + for (size_t i = output.capture_group_matches.at(input.match_index).size(); i <= capacity; ++i) + output.capture_group_matches.at(input.match_index).empend(); + } + + output.capture_group_matches.at(input.match_index).at(id()).left_column = state.string_position; + return ExecutionResult::Continue; +} + +ALWAYS_INLINE ExecutionResult OpCode_SaveRightCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const +{ + auto& match = output.capture_group_matches.at(input.match_index).at(id()); + auto start_position = match.left_column; + auto length = state.string_position - start_position; + + if (start_position < match.column) + return ExecutionResult::Continue; + + ASSERT(start_position + length <= input.view.length()); + + auto view = input.view.substring_view(start_position, length); + + if (input.regex_options & AllFlags::StringCopyMatches) { + match = { view.to_string(), input.line, start_position, input.global_offset + start_position }; // create a copy of the original string + } else { + match = { view, input.line, start_position, input.global_offset + start_position }; // take view to original string + } + + return ExecutionResult::Continue; +} + +ALWAYS_INLINE ExecutionResult OpCode_SaveLeftNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const +{ + if (input.match_index >= output.named_capture_group_matches.size()) { + output.named_capture_group_matches.ensure_capacity(input.match_index); + auto capacity = output.named_capture_group_matches.capacity(); + for (size_t i = output.named_capture_group_matches.size(); i <= capacity; ++i) + output.named_capture_group_matches.empend(); + } + output.named_capture_group_matches.at(input.match_index).ensure(name()).column = state.string_position; + return ExecutionResult::Continue; +} + +ALWAYS_INLINE ExecutionResult OpCode_SaveRightNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const +{ + StringView capture_group_name = name(); + + if (output.named_capture_group_matches.at(input.match_index).contains(capture_group_name)) { + auto start_position = output.named_capture_group_matches.at(input.match_index).ensure(capture_group_name).column; + auto length = state.string_position - start_position; + + auto& map = output.named_capture_group_matches.at(input.match_index); + +#ifdef REGEX_DEBUG + ASSERT(start_position + length < input.view.length()); + dbg() << "Save named capture group with name=" << capture_group_name << " and content: " << input.view.substring_view(start_position, length).to_string(); +#endif + + ASSERT(start_position + length <= input.view.length()); + auto view = input.view.substring_view(start_position, length); + if (input.regex_options & AllFlags::StringCopyMatches) { + map.set(capture_group_name, { view.to_string(), input.line, start_position, input.global_offset + start_position }); // create a copy of the original string + } else { + map.set(capture_group_name, { view, input.line, start_position, input.global_offset + start_position }); // take view to original string + } + } else { + fprintf(stderr, "Didn't find corresponding capture group match for name=%s, match_index=%lu\n", capture_group_name.to_string().characters(), input.match_index); + } + + return ExecutionResult::Continue; +} + +ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, MatchState& state, MatchOutput&) const +{ + bool inverse { false }; + + size_t string_position = state.string_position; + bool inverse_matched { false }; + + size_t offset { state.instruction_position + 3 }; + for (size_t i = 0; i < arguments_count(); ++i) { + if (state.string_position > string_position) + break; + + auto compare_type = (CharacterCompareType)m_bytecode->at(offset++); + + if (compare_type == CharacterCompareType::Inverse) + inverse = true; + + else if (compare_type == CharacterCompareType::Char) { + char ch = m_bytecode->at(offset++); + + // We want to compare a string that is longer or equal in length to the available string + if (input.view.length() - state.string_position < 1) + return ExecutionResult::Failed_ExecuteLowPrioForks; + + compare_char(input, state, ch, inverse, inverse_matched); + + } else if (compare_type == CharacterCompareType::AnyChar) { + // We want to compare a string that is definitely longer than the available string + if (input.view.length() - state.string_position < 1) + return ExecutionResult::Failed_ExecuteLowPrioForks; + + ASSERT(!inverse); + ++state.string_position; + + } else if (compare_type == CharacterCompareType::String) { + ASSERT(!inverse); + + char* str = reinterpret_cast<char*>(m_bytecode->at(offset++)); + auto& length = m_bytecode->at(offset++); + + // We want to compare a string that is definitely longer than the available string + if (input.view.length() - state.string_position < length) + return ExecutionResult::Failed_ExecuteLowPrioForks; + + if (!compare_string(input, state, str, length)) + return ExecutionResult::Failed_ExecuteLowPrioForks; + + } else if (compare_type == CharacterCompareType::CharClass) { + + if (input.view.length() - state.string_position < 1) + return ExecutionResult::Failed_ExecuteLowPrioForks; + + auto character_class = (CharClass)m_bytecode->at(offset++); + auto& ch = input.view[state.string_position]; + + compare_character_class(input, state, character_class, ch, inverse, inverse_matched); + + } else if (compare_type == CharacterCompareType::CharRange) { + auto value = (CharRange)m_bytecode->at(offset++); + + auto from = value.from; + auto to = value.to; + auto ch = input.view[state.string_position]; + + compare_character_range(input, state, from, to, ch, inverse, inverse_matched); + + } else { + fprintf(stderr, "Undefined comparison: %i\n", (int)compare_type); + ASSERT_NOT_REACHED(); + break; + } + } + + if (inverse && !inverse_matched) + ++state.string_position; + + if (string_position == state.string_position || state.string_position > input.view.length()) + return ExecutionResult::Failed_ExecuteLowPrioForks; + + return ExecutionResult::Continue; +} + +ALWAYS_INLINE void OpCode_Compare::compare_char(const MatchInput& input, MatchState& state, char& ch, bool inverse, bool& inverse_matched) const +{ + auto ch1 = ch; + auto ch2 = input.view[state.string_position]; + + if (input.regex_options & AllFlags::Insensitive) { + ch1 = tolower(ch1); + ch2 = tolower(ch2); + } + + if (ch1 == ch2) { + if (inverse) + inverse_matched = true; + else + ++state.string_position; + } +} + +ALWAYS_INLINE bool OpCode_Compare::compare_string(const MatchInput& input, MatchState& state, const char* str, size_t length) const +{ + auto str_view1 = StringView(str, length); + auto str_view2 = StringView(&input.view[state.string_position], length); + String str1, str2; + if (input.regex_options & AllFlags::Insensitive) { + str1 = str_view1.to_string().to_lowercase(); + str2 = str_view2.to_string().to_lowercase(); + str_view1 = str1.view(); + str_view2 = str2.view(); + } + + if (str_view1 == str_view2) { + state.string_position += length; + return true; + } else + return false; +} + +ALWAYS_INLINE void OpCode_Compare::compare_character_class(const MatchInput& input, MatchState& state, CharClass character_class, char ch, bool inverse, bool& inverse_matched) const +{ + switch (character_class) { + case CharClass::Alnum: + if (isalnum(ch)) { + if (inverse) + inverse_matched = true; + else + ++state.string_position; + } + break; + case CharClass::Alpha: + if (isalpha(ch)) + ++state.string_position; + break; + case CharClass::Blank: + if (ch == ' ' || ch == '\t') { + if (inverse) + inverse_matched = true; + else + ++state.string_position; + } + break; + case CharClass::Cntrl: + if (iscntrl(ch)) { + if (inverse) + inverse_matched = true; + else + ++state.string_position; + } + break; + case CharClass::Digit: + if (isdigit(ch)) { + if (inverse) + inverse_matched = true; + else + ++state.string_position; + } + break; + case CharClass::Graph: + if (isgraph(ch)) { + if (inverse) + inverse_matched = true; + else + ++state.string_position; + } + break; + case CharClass::Lower: + if (islower(ch) || ((input.regex_options & AllFlags::Insensitive) && isupper(ch))) { + if (inverse) + inverse_matched = true; + else + ++state.string_position; + } + break; + case CharClass::Print: + if (isprint(ch)) { + if (inverse) + inverse_matched = true; + else + ++state.string_position; + } + break; + case CharClass::Punct: + if (ispunct(ch)) { + if (inverse) + inverse_matched = true; + else + ++state.string_position; + } + break; + case CharClass::Space: + if (isspace(ch)) { + if (inverse) + inverse_matched = true; + else + ++state.string_position; + } + break; + case CharClass::Upper: + if (isupper(ch) || ((input.regex_options & AllFlags::Insensitive) && islower(ch))) { + if (inverse) + inverse_matched = true; + else + ++state.string_position; + } + break; + case CharClass::Xdigit: + if (isxdigit(ch)) { + if (inverse) + inverse_matched = true; + else + ++state.string_position; + } + break; + } +} + +ALWAYS_INLINE void OpCode_Compare::compare_character_range(const MatchInput& input, MatchState& state, char from, char to, char ch, bool inverse, bool& inverse_matched) const +{ + if (input.regex_options & AllFlags::Insensitive) { + from = tolower(from); + to = tolower(to); + ch = tolower(ch); + } + + if (ch >= from && ch <= to) { + if (inverse) + inverse_matched = true; + else + ++state.string_position; + } +} + +const String OpCode_Compare::arguments_string() const +{ + return String::format("argc=%lu, args=%lu ", arguments_count(), arguments_size()); +} + +const Vector<String> OpCode_Compare::variable_arguments_to_string(Optional<MatchInput> input) const +{ + Vector<String> result; + + size_t offset { state().instruction_position + 3 }; + StringView view; + if (input.has_value()) + view = input.value().view; + + for (size_t i = 0; i < arguments_count(); ++i) { + auto compare_type = (CharacterCompareType)m_bytecode->at(offset++); + result.empend(String::format("type=%lu [%s]", (size_t)compare_type, character_compare_type_name(compare_type))); + + if (compare_type == CharacterCompareType::Char) { + char ch = m_bytecode->at(offset++); + result.empend(String::format("value='%c'", ch)); + if (!view.is_null()) + result.empend(String::format("compare against: '%s'", String { view.substring_view(state().string_position, state().string_position + 1 > view.length() ? 0 : 1) }.characters())); + } else if (compare_type == CharacterCompareType::String) { + char* str = reinterpret_cast<char*>(m_bytecode->at(offset++)); + auto& length = m_bytecode->at(offset++); + result.empend(String::format("value=\"%s\"", String { str, length }.characters())); + if (!view.is_null()) + result.empend(String::format("compare against: \"%s\"", String { input.value().view.substring_view(state().string_position, state().string_position + length > view.length() ? 0 : length) }.characters())); + } else if (compare_type == CharacterCompareType::CharClass) { + auto character_class = (CharClass)m_bytecode->at(offset++); + result.empend(String::format("ch_class=%lu [%s]", (size_t)character_class, character_class_name(character_class))); + if (!view.is_null()) + result.empend(String::format("compare against: '%s'", String { input.value().view.substring_view(state().string_position, state().string_position + 1 > view.length() ? 0 : 1) }.characters())); + } else if (compare_type == CharacterCompareType::CharRange) { + auto value = (CharRange)m_bytecode->at(offset++); + result.empend(String::format("ch_range='%c'-'%c'", value.from, value.to)); + if (!view.is_null()) + result.empend(String::format("compare against: '%s'", String { input.value().view.substring_view(state().string_position, state().string_position + 1 > view.length() ? 0 : 1) }.characters())); + } + } + return result; +} +} diff --git a/Libraries/LibRegex/RegexByteCode.h b/Libraries/LibRegex/RegexByteCode.h new file mode 100644 index 0000000000..d3ffdcef67 --- /dev/null +++ b/Libraries/LibRegex/RegexByteCode.h @@ -0,0 +1,637 @@ +/* + * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#pragma once + +#include "RegexMatch.h" +#include "RegexOptions.h" + +#include <AK/Forward.h> +#include <AK/HashMap.h> +#include <AK/NonnullOwnPtr.h> +#include <AK/OwnPtr.h> +#include <AK/Traits.h> +#include <AK/Types.h> +#include <AK/Vector.h> + +namespace regex { + +using ByteCodeValueType = size_t; + +#define ENUMERATE_OPCODES \ + __ENUMERATE_OPCODE(Compare) \ + __ENUMERATE_OPCODE(Jump) \ + __ENUMERATE_OPCODE(ForkJump) \ + __ENUMERATE_OPCODE(ForkStay) \ + __ENUMERATE_OPCODE(SaveLeftCaptureGroup) \ + __ENUMERATE_OPCODE(SaveRightCaptureGroup) \ + __ENUMERATE_OPCODE(SaveLeftNamedCaptureGroup) \ + __ENUMERATE_OPCODE(SaveRightNamedCaptureGroup) \ + __ENUMERATE_OPCODE(CheckBegin) \ + __ENUMERATE_OPCODE(CheckEnd) \ + __ENUMERATE_OPCODE(Exit) + +enum class OpCodeId : ByteCodeValueType { +#define __ENUMERATE_OPCODE(x) x, + ENUMERATE_OPCODES +#undef __ENUMERATE_OPCODE + + First + = Compare, + Last + = Exit, +}; + +#define ENUMERATE_CHARACTER_COMPARE_TYPES \ + __ENUMERATE_CHARACTER_COMPARE_TYPE(Undefined) \ + __ENUMERATE_CHARACTER_COMPARE_TYPE(Inverse) \ + __ENUMERATE_CHARACTER_COMPARE_TYPE(AnyChar) \ + __ENUMERATE_CHARACTER_COMPARE_TYPE(Char) \ + __ENUMERATE_CHARACTER_COMPARE_TYPE(String) \ + __ENUMERATE_CHARACTER_COMPARE_TYPE(CharClass) \ + __ENUMERATE_CHARACTER_COMPARE_TYPE(CharRange) \ + __ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy) + +enum class CharacterCompareType : ByteCodeValueType { +#define __ENUMERATE_CHARACTER_COMPARE_TYPE(x) x, + ENUMERATE_CHARACTER_COMPARE_TYPES +#undef __ENUMERATE_CHARACTER_COMPARE_TYPE +}; + +#define ENUMERATE_CHARACTER_CLASSES \ + __ENUMERATE_CHARACTER_CLASS(Alnum) \ + __ENUMERATE_CHARACTER_CLASS(Cntrl) \ + __ENUMERATE_CHARACTER_CLASS(Lower) \ + __ENUMERATE_CHARACTER_CLASS(Space) \ + __ENUMERATE_CHARACTER_CLASS(Alpha) \ + __ENUMERATE_CHARACTER_CLASS(Digit) \ + __ENUMERATE_CHARACTER_CLASS(Print) \ + __ENUMERATE_CHARACTER_CLASS(Upper) \ + __ENUMERATE_CHARACTER_CLASS(Blank) \ + __ENUMERATE_CHARACTER_CLASS(Graph) \ + __ENUMERATE_CHARACTER_CLASS(Punct) \ + __ENUMERATE_CHARACTER_CLASS(Xdigit) + +enum class CharClass : ByteCodeValueType { +#define __ENUMERATE_CHARACTER_CLASS(x) x, + ENUMERATE_CHARACTER_CLASSES +#undef __ENUMERATE_CHARACTER_CLASS +}; + +struct CharRange { + const char from; + const char to; + + CharRange(size_t value) + : from(value >> 8) + , to(value & 0xFF) + { + } + + CharRange(char from, char to) + : from(from) + , to(to) + { + } + + operator ByteCodeValueType() const { return (from << 8) | to; } +}; + +struct CompareTypeAndValuePair { + CharacterCompareType type; + ByteCodeValueType value; +}; + +class OpCode; + +class ByteCode : public Vector<ByteCodeValueType> { +public: + ByteCode() = default; + virtual ~ByteCode() = default; + + void insert_bytecode_compare_values(Vector<CompareTypeAndValuePair>&& pairs) + { + ByteCode bytecode; + + bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::Compare)); + bytecode.empend(pairs.size()); // number of arguments + + ByteCode arguments; + for (auto& value : pairs) { + ASSERT(value.type != CharacterCompareType::RangeExpressionDummy); + ASSERT(value.type != CharacterCompareType::Undefined); + ASSERT(value.type != CharacterCompareType::String); + + arguments.append((ByteCodeValueType)value.type); + if (value.type != CharacterCompareType::Inverse && value.type != CharacterCompareType::AnyChar) + arguments.append(move(value.value)); + } + + bytecode.empend(arguments.size()); // size of arguments + bytecode.append(move(arguments)); + + append(move(bytecode)); + } + + void insert_bytecode_compare_string(StringView view, size_t length) + { + ByteCode bytecode; + + bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::Compare)); + bytecode.empend(1); // number of arguments + + ByteCode arguments; + + arguments.empend(static_cast<ByteCodeValueType>(CharacterCompareType::String)); + arguments.empend(reinterpret_cast<ByteCodeValueType>(view.characters_without_null_termination())); + arguments.empend(length); + + bytecode.empend(arguments.size()); // size of arguments + bytecode.append(move(arguments)); + + append(move(bytecode)); + } + + void insert_bytecode_group_capture_left(size_t capture_groups_count) + { + empend(static_cast<ByteCodeValueType>(OpCodeId::SaveLeftCaptureGroup)); + empend(capture_groups_count); + } + + void insert_bytecode_group_capture_left(const StringView& name) + { + empend(static_cast<ByteCodeValueType>(OpCodeId::SaveLeftNamedCaptureGroup)); + empend(reinterpret_cast<ByteCodeValueType>(name.characters_without_null_termination())); + empend(name.length()); + } + + void insert_bytecode_group_capture_right(size_t capture_groups_count) + { + empend(static_cast<ByteCodeValueType>(OpCodeId::SaveRightCaptureGroup)); + empend(capture_groups_count); + } + + void insert_bytecode_group_capture_right(const StringView& name) + { + empend(static_cast<ByteCodeValueType>(OpCodeId::SaveRightNamedCaptureGroup)); + empend(reinterpret_cast<ByteCodeValueType>(name.characters_without_null_termination())); + empend(name.length()); + } + + void insert_bytecode_alternation(ByteCode&& left, ByteCode&& right) + { + + // FORKSTAY _ALT + // REGEXP ALT1 + // JUMP _END + // LABEL _ALT + // REGEXP ALT2 + // LABEL _END + + ByteCode byte_code; + + empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump)); + empend(left.size() + 2); // Jump to the _ALT label + + for (auto& op : left) + append(move(op)); + + empend(static_cast<ByteCodeValueType>(OpCodeId::Jump)); + empend(right.size()); // Jump to the _END label + + // LABEL _ALT = bytecode.size() + 2 + + for (auto& op : right) + append(move(op)); + + // LABEL _END = alterantive_bytecode.size + } + + void insert_bytecode_repetition_min_max(ByteCode& bytecode_to_repeat, size_t minimum, Optional<size_t> maximum) + { + ByteCode new_bytecode; + new_bytecode.insert_bytecode_repetition_n(bytecode_to_repeat, minimum); + + if (maximum.has_value()) { + if (maximum.value() > minimum) { + auto diff = maximum.value() - minimum; + new_bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkStay)); + new_bytecode.empend(diff * (bytecode_to_repeat.size() + 2)); // Jump to the _END label + + for (size_t i = 0; i < diff; ++i) { + new_bytecode.append(bytecode_to_repeat); + new_bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkStay)); + new_bytecode.empend((diff - i - 1) * (bytecode_to_repeat.size() + 2)); // Jump to the _END label + } + } + } else { + // no maximum value set, repeat finding if possible + new_bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump)); + new_bytecode.empend(-bytecode_to_repeat.size() - 2); // Jump to the last iteration + } + + bytecode_to_repeat = move(new_bytecode); + } + + void insert_bytecode_repetition_n(ByteCode& bytecode_to_repeat, size_t n) + { + for (size_t i = 0; i < n; ++i) + append(bytecode_to_repeat); + } + + void insert_bytecode_repetition_min_one(ByteCode& bytecode_to_repeat, bool greedy) + { + // LABEL _START = -bytecode_to_repeat.size() + // REGEXP + // FORKJUMP _START (FORKSTAY -> Greedy) + + if (greedy) + bytecode_to_repeat.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkStay)); + else + bytecode_to_repeat.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump)); + + bytecode_to_repeat.empend(-(bytecode_to_repeat.size() + 1)); // Jump to the _START label + } + + void insert_bytecode_repetition_any(ByteCode& bytecode_to_repeat, bool greedy) + { + // LABEL _START + // FORKSTAY _END (FORKJUMP -> Greedy) + // REGEXP + // JUMP _START + // LABEL _END + + // LABEL _START = m_bytes.size(); + ByteCode bytecode; + + if (greedy) + bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump)); + else + bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkStay)); + + bytecode.empend(bytecode_to_repeat.size() + 2); // Jump to the _END label + + for (auto& op : bytecode_to_repeat) + bytecode.append(move(op)); + + bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::Jump)); + bytecode.empend(-bytecode.size() - 1); // Jump to the _START label + // LABEL _END = bytecode.size() + + bytecode_to_repeat = move(bytecode); + } + + void insert_bytecode_repetition_zero_or_one(ByteCode& bytecode_to_repeat, bool greedy) + { + // FORKSTAY _END (FORKJUMP -> Greedy) + // REGEXP + // LABEL _END + ByteCode bytecode; + + if (greedy) + bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump)); + else + bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkStay)); + + bytecode.empend(bytecode_to_repeat.size()); // Jump to the _END label + + for (auto& op : bytecode_to_repeat) + bytecode.append(move(op)); + // LABEL _END = bytecode.size() + + bytecode_to_repeat = move(bytecode); + } + + OpCode* get_opcode(MatchState& state) const; + +private: + ALWAYS_INLINE OpCode* get_opcode_by_id(OpCodeId id) const; + static HashMap<u32, OwnPtr<OpCode>> s_opcodes; +}; + +#define ENUMERATE_EXECUTION_RESULTS \ + __ENUMERATE_EXECUTION_RESULT(Continue) \ + __ENUMERATE_EXECUTION_RESULT(Fork_PrioHigh) \ + __ENUMERATE_EXECUTION_RESULT(Fork_PrioLow) \ + __ENUMERATE_EXECUTION_RESULT(Failed) \ + __ENUMERATE_EXECUTION_RESULT(Failed_ExecuteLowPrioForks) \ + __ENUMERATE_EXECUTION_RESULT(Succeeded) + +enum class ExecutionResult : u8 { +#define __ENUMERATE_EXECUTION_RESULT(x) x, + ENUMERATE_EXECUTION_RESULTS +#undef __ENUMERATE_EXECUTION_RESULT +}; + +const char* execution_result_name(ExecutionResult result); +const char* opcode_id_name(OpCodeId opcode_id); +const char* character_compare_type_name(CharacterCompareType result); +const char* execution_result_name(ExecutionResult result); + +class OpCode { +public: + OpCode(ByteCode& bytecode) + : m_bytecode(&bytecode) + { + } + + virtual ~OpCode() = default; + + virtual OpCodeId opcode_id() const = 0; + virtual size_t size() const = 0; + virtual ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const = 0; + + ALWAYS_INLINE ByteCodeValueType argument(size_t offset) const + { + ASSERT(state().instruction_position + offset <= m_bytecode->size()); + return m_bytecode->at(state().instruction_position + 1 + offset); + } + + ALWAYS_INLINE const char* name() const; + static const char* name(const OpCodeId); + + ALWAYS_INLINE OpCode* set_state(MatchState& state) + { + m_state = &state; + return this; + } + + ALWAYS_INLINE OpCode* set_bytecode(ByteCode& bytecode) + { + m_bytecode = &bytecode; + return this; + } + + ALWAYS_INLINE void reset_state() { m_state.clear(); } + + ALWAYS_INLINE const MatchState& state() const + { + ASSERT(m_state.has_value()); + return *m_state.value(); + } + + const String to_string() const + { + return String::format("[0x%02X] %s", opcode_id(), name(opcode_id())); + } + + virtual const String arguments_string() const = 0; + + ALWAYS_INLINE const ByteCode& bytecode() const { return *m_bytecode; } + +protected: + ByteCode* m_bytecode; + Optional<MatchState*> m_state; +}; + + +class OpCode_Exit final : public OpCode { +public: + OpCode_Exit(ByteCode& bytecode) + : OpCode(bytecode) + { + } + ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override; + ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Exit; } + ALWAYS_INLINE size_t size() const override { return 1; } + const String arguments_string() const override { return ""; } +}; + +class OpCode_Jump final : public OpCode { +public: + OpCode_Jump(ByteCode& bytecode) + : OpCode(bytecode) + { + } + ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override; + ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Jump; } + ALWAYS_INLINE size_t size() const override { return 2; } + ALWAYS_INLINE ssize_t offset() const { return argument(0); } + const String arguments_string() const override + { + return String::format("offset=%i [&%lu]", offset(), state().instruction_position + size() + offset()); + } +}; + +class OpCode_ForkJump final : public OpCode { +public: + OpCode_ForkJump(ByteCode& bytecode) + : OpCode(bytecode) + { + } + ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override; + ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ForkJump; } + ALWAYS_INLINE size_t size() const override { return 2; } + ALWAYS_INLINE ssize_t offset() const { return argument(0); } + const String arguments_string() const override + { + return String::format("offset=%i [&%lu], sp: %lu", offset(), state().instruction_position + size() + offset(), state().string_position); + } +}; + +class OpCode_ForkStay final : public OpCode { +public: + OpCode_ForkStay(ByteCode& bytecode) + : OpCode(bytecode) + { + } + ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override; + ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ForkStay; } + ALWAYS_INLINE size_t size() const override { return 2; } + ALWAYS_INLINE ssize_t offset() const { return argument(0); } + const String arguments_string() const override + { + return String::format("offset=%i [&%lu], sp: %lu", offset(), state().instruction_position + size() + offset(), state().string_position); + } +}; + +class OpCode_CheckBegin final : public OpCode { +public: + OpCode_CheckBegin(ByteCode& bytecode) + : OpCode(bytecode) + { + } + ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override; + ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::CheckBegin; } + ALWAYS_INLINE size_t size() const override { return 1; } + const String arguments_string() const override { return ""; } +}; + +class OpCode_CheckEnd final : public OpCode { +public: + OpCode_CheckEnd(ByteCode& bytecode) + : OpCode(bytecode) + { + } + ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override; + ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::CheckEnd; } + ALWAYS_INLINE size_t size() const override { return 1; } + const String arguments_string() const override { return ""; } +}; + +class OpCode_SaveLeftCaptureGroup final : public OpCode { +public: + OpCode_SaveLeftCaptureGroup(ByteCode& bytecode) + : OpCode(bytecode) + { + } + ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override; + ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveLeftCaptureGroup; } + ALWAYS_INLINE size_t size() const override { return 2; } + ALWAYS_INLINE size_t id() const { return argument(0); } + const String arguments_string() const override { return String::format("id=%lu", id()); } +}; + +class OpCode_SaveRightCaptureGroup final : public OpCode { +public: + OpCode_SaveRightCaptureGroup(ByteCode& bytecode) + : OpCode(bytecode) + { + } + ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override; + ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveRightCaptureGroup; } + ALWAYS_INLINE size_t size() const override { return 2; } + ALWAYS_INLINE size_t id() const { return argument(0); } + const String arguments_string() const override { return String::format("id=%lu", id()); } +}; + +class OpCode_SaveLeftNamedCaptureGroup final : public OpCode { +public: + OpCode_SaveLeftNamedCaptureGroup(ByteCode& bytecode) + : OpCode(bytecode) + { + } + ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override; + ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveLeftNamedCaptureGroup; } + ALWAYS_INLINE size_t size() const override { return 3; } + ALWAYS_INLINE StringView name() const { return { reinterpret_cast<char*>(argument(0)), length() }; } + ALWAYS_INLINE size_t length() const { return argument(1); } + const String arguments_string() const override + { + return String::format("name=%s, length=%lu", name().to_string().characters(), length()); + } +}; + +class OpCode_SaveRightNamedCaptureGroup final : public OpCode { +public: + OpCode_SaveRightNamedCaptureGroup(ByteCode& bytecode) + : OpCode(bytecode) + { + } + ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override; + ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveRightNamedCaptureGroup; } + ALWAYS_INLINE size_t size() const override { return 3; } + ALWAYS_INLINE StringView name() const { return { reinterpret_cast<char*>(argument(0)), length() }; } + ALWAYS_INLINE size_t length() const { return argument(1); } + const String arguments_string() const override + { + return String::format("name=%s, length=%lu", name().to_string().characters(), length()); + } +}; + +class OpCode_Compare final : public OpCode { +public: + OpCode_Compare(ByteCode& bytecode) + : OpCode(bytecode) + { + } + ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override; + ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Compare; } + ALWAYS_INLINE size_t size() const override { return arguments_size() + 3; } + ALWAYS_INLINE size_t arguments_count() const { return argument(0); } + ALWAYS_INLINE size_t arguments_size() const { return argument(1); } + const String arguments_string() const override; + const Vector<String> variable_arguments_to_string(Optional<MatchInput> input = {}) const; + +private: + ALWAYS_INLINE void compare_char(const MatchInput& input, MatchState& state, char& ch, bool inverse, bool& inverse_matched) const; + ALWAYS_INLINE bool compare_string(const MatchInput& input, MatchState& state, const char* str, size_t length) const; + ALWAYS_INLINE void compare_character_class(const MatchInput& input, MatchState& state, CharClass character_class, char ch, bool inverse, bool& inverse_matched) const; + ALWAYS_INLINE void compare_character_range(const MatchInput& input, MatchState& state, char from, char to, char ch, bool inverse, bool& inverse_matched) const; +}; + +template<typename T> +bool is(const OpCode&); + +template<typename T> +ALWAYS_INLINE bool is(const OpCode&) +{ + return false; +} + +template<typename T> +ALWAYS_INLINE bool is(const OpCode* opcode) +{ + return is<T>(*opcode); +} + +template<> +ALWAYS_INLINE bool is<OpCode_ForkStay>(const OpCode& opcode) +{ + return opcode.opcode_id() == OpCodeId::ForkStay; +} + +template<> +ALWAYS_INLINE bool is<OpCode_Exit>(const OpCode& opcode) +{ + return opcode.opcode_id() == OpCodeId::Exit; +} + +template<> +ALWAYS_INLINE bool is<OpCode_Compare>(const OpCode& opcode) +{ + return opcode.opcode_id() == OpCodeId::Compare; +} + +template<typename T> +ALWAYS_INLINE const T& to(const OpCode& opcode) +{ + ASSERT(is<T>(opcode)); + return static_cast<const T&>(opcode); +} + +template<typename T> +ALWAYS_INLINE T* to(OpCode* opcode) +{ + ASSERT(is<T>(opcode)); + return static_cast<T*>(opcode); +} + +template<typename T> +ALWAYS_INLINE const T* to(const OpCode* opcode) +{ + ASSERT(is<T>(opcode)); + return static_cast<const T*>(opcode); +} + +template<typename T> +ALWAYS_INLINE T& to(OpCode& opcode) +{ + ASSERT(is<T>(opcode)); + return static_cast<T&>(opcode); +} + +} diff --git a/Libraries/LibRegex/RegexDebug.h b/Libraries/LibRegex/RegexDebug.h new file mode 100644 index 0000000000..94c04eddd1 --- /dev/null +++ b/Libraries/LibRegex/RegexDebug.h @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#pragma once + +#include "LibRegex/RegexMatcher.h" +#include "AK/StringBuilder.h" + +//#define REGEX_DEBUG + +#ifdef REGEX_DEBUG + +namespace regex { + +class RegexDebug { +public: + RegexDebug(FILE* file = stdout) + : m_file(file) + { + } + + virtual ~RegexDebug() = default; + + template<typename T> + void print_raw_bytecode(Regex<T>& regex) const + { + auto& bytecode = regex.parser_result.bytecode; + size_t index { 0 }; + for (auto& value : bytecode) { + fprintf(m_file, "OpCode i=%3lu [0x%02X]\n", index, (u32)value); + ++index; + } + } + + template<typename T> + void print_bytecode(Regex<T>& regex) const + { + MatchState state; + auto& bytecode = regex.parser_result.bytecode; + + for (;;) { + auto* opcode = bytecode.get_opcode(state); + if (!opcode) { + dbg() << "Wrong opcode... failed!"; + return; + } + + print_opcode("PrintBytecode", *opcode, state); + fprintf(m_file, "%s", m_debug_stripline.characters()); + + if (is<OpCode_Exit>(*opcode)) + break; + + state.instruction_position += opcode->size(); + } + + fflush(m_file); + } + + void print_opcode(const String& system, OpCode& opcode, MatchState& state, size_t recursion = 0, bool newline = true) const + { + fprintf(m_file, "%-15s | %-5lu | %-9lu | %-35s | %-30s | %-20s%s", + system.characters(), + state.instruction_position, + recursion, + opcode.to_string().characters(), + opcode.arguments_string().characters(), + String::format("ip: %3lu, sp: %3lu", state.instruction_position, state.string_position).characters(), + newline ? "\n" : ""); + + if (newline && is<OpCode_Compare>(opcode)) { + for (auto& line : to<OpCode_Compare>(opcode).variable_arguments_to_string()) { + fprintf(m_file, "%-15s | %-5s | %-9s | %-35s | %-30s | %-20s%s", "", "", "", "", line.characters(), "", "\n"); + } + } + } + + void print_result(const OpCode& opcode, const ByteCode& bytecode, const MatchInput& input, MatchState& state, ExecutionResult result) const + { + StringBuilder builder; + builder.append(execution_result_name(result)); + if (result == ExecutionResult::Succeeded) { + builder.appendf(", ip: %lu/%lu, sp: %lu/%lu", state.instruction_position, bytecode.size() - 1, state.string_position, input.view.length() - 1); + } else if (result == ExecutionResult::Fork_PrioHigh) { + builder.appendf(", next ip: %lu", state.fork_at_position + opcode.size()); + } else if (result != ExecutionResult::Failed) { + builder.appendf(", next ip: %lu", state.instruction_position + opcode.size()); + } + + fprintf(m_file, " | %-20s\n", builder.to_string().characters()); + + if (is<OpCode_Compare>(opcode)) { + for (auto& line : to<OpCode_Compare>(opcode).variable_arguments_to_string(input)) { + fprintf(m_file, "%-15s | %-5s | %-9s | %-35s | %-30s | %-20s%s", "", "", "", "", line.characters(), "", "\n"); + } + } + + fprintf(m_file, "%s", m_debug_stripline.characters()); + } + + void print_header() + { + StringBuilder builder; + builder.appendf("%-15s | %-5s | %-9s | %-35s | %-30s | %-20s | %-20s\n", "System", "Index", "Recursion", "OpCode", "Arguments", "State", "Result"); + auto length = builder.length(); + for (size_t i = 0; i < length; ++i) { + builder.append('='); + } + + fprintf(m_file, "%s\n", builder.to_string().characters()); + fflush(m_file); + + builder.clear(); + for (size_t i = 0; i < length; ++i) { + builder.append('-'); + } + builder.append('\n'); + m_debug_stripline = builder.to_string(); + } + +private: + String m_debug_stripline; + FILE* m_file; +}; + +} + +using regex::RegexDebug; + +#endif diff --git a/Libraries/LibRegex/RegexError.h b/Libraries/LibRegex/RegexError.h new file mode 100644 index 0000000000..90d6a71eca --- /dev/null +++ b/Libraries/LibRegex/RegexError.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#pragma once + +#include <AK/String.h> +#include <AK/Types.h> +#ifdef __serenity__ +# include <regex.h> +#else +# include <LibC/regex.h> +#endif + +namespace regex { + +enum class Error : u8 { + NoError = __Regex_NoError, + InvalidPattern = __Regex_InvalidPattern, // Invalid regular expression. + InvalidCollationElement = __Regex_InvalidCollationElement, // Invalid collating element referenced. + InvalidCharacterClass = __Regex_InvalidCharacterClass, // Invalid character class type referenced. + InvalidTrailingEscape = __Regex_InvalidTrailingEscape, // Trailing \ in pattern. + InvalidNumber = __Regex_InvalidNumber, // Number in \digit invalid or in error. + MismatchingBracket = __Regex_MismatchingBracket, // [ ] imbalance. + MismatchingParen = __Regex_MismatchingParen, // ( ) imbalance. + MismatchingBrace = __Regex_MismatchingBrace, // { } imbalance. + InvalidBraceContent = __Regex_InvalidBraceContent, // Content of {} invalid: not a number, number too large, more than two numbers, first larger than second. + InvalidBracketContent = __Regex_InvalidBracketContent, // Content of [] invalid. + InvalidRange = __Regex_InvalidRange, // Invalid endpoint in range expression. + InvalidRepetitionMarker = __Regex_InvalidRepetitionMarker, // ?, * or + not preceded by valid regular expression. + ReachedMaxRecursion = __Regex_ReachedMaxRecursion, // MaximumRecursion has been reached. + EmptySubExpression = __Regex_EmptySubExpression, // Sub expression has empty content. + InvalidCaptureGroup = __Regex_InvalidCaptureGroup, // Content of capture group is invalid. + InvalidNameForCaptureGroup = __Regex_InvalidNameForCaptureGroup, // Name of capture group is invalid. +}; + +inline String get_error_string(Error error) +{ + switch (error) { + case Error::NoError: + return "No error"; + case Error::InvalidPattern: + return "Invalid regular expression."; + case Error::InvalidCollationElement: + return "Invalid collating element referenced."; + case Error::InvalidCharacterClass: + return "Invalid character class type referenced."; + case Error::InvalidTrailingEscape: + return "Trailing \\ in pattern."; + case Error::InvalidNumber: + return "Number in \\digit invalid or in error."; + case Error::MismatchingBracket: + return "[ ] imbalance."; + case Error::MismatchingParen: + return "( ) imbalance."; + case Error::MismatchingBrace: + return "{ } imbalance."; + case Error::InvalidBraceContent: + return "Content of {} invalid: not a number, number too large, more than two numbers, first larger than second."; + case Error::InvalidBracketContent: + return "Content of [] invalid."; + case Error::InvalidRange: + return "Invalid endpoint in range expression."; + case Error::InvalidRepetitionMarker: + return "?, * or + not preceded by valid regular expression."; + case Error::ReachedMaxRecursion: + return "Maximum recursion has been reached."; + case Error::EmptySubExpression: + return "Sub expression has empty content."; + case Error::InvalidCaptureGroup: + return "Content of capture group is invalid."; + case Error::InvalidNameForCaptureGroup: + return "Name of capture group is invalid."; + } + return "Undefined error."; +} +} + +using regex::Error; +using regex::get_error_string; diff --git a/Libraries/LibRegex/RegexLexer.cpp b/Libraries/LibRegex/RegexLexer.cpp new file mode 100644 index 0000000000..1cac7b1222 --- /dev/null +++ b/Libraries/LibRegex/RegexLexer.cpp @@ -0,0 +1,210 @@ +/* + * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "RegexLexer.h" +#include <AK/Assertions.h> +#include <stdio.h> + +namespace regex { + +const char* Token::name(const TokenType type) +{ + switch (type) { +#define __ENUMERATE_REGEX_TOKEN(x) \ + case TokenType::x: \ + return #x; + ENUMERATE_REGEX_TOKENS +#undef __ENUMERATE_REGEX_TOKEN + default: + ASSERT_NOT_REACHED(); + return "<Unknown>"; + } +} + +const char* Token::name() const +{ + return name(m_type); +} + +Lexer::Lexer(const StringView source) + : m_source(source) +{ +} + +ALWAYS_INLINE char Lexer::peek(size_t offset) const +{ + if ((m_position + offset) >= m_source.length()) + return EOF; + return m_source[m_position + offset]; +} + +void Lexer::back(size_t offset) +{ + m_position -= offset; + m_previous_position = m_position - 1; + m_current_char = m_source[m_position]; +} + +ALWAYS_INLINE void Lexer::consume() +{ + m_previous_position = m_position; + + if (m_position >= m_source.length()) { + m_position = m_source.length() + 1; + m_current_char = EOF; + return; + } + + m_current_char = m_source[m_position++]; +} + +void Lexer::reset() +{ + m_position = 0; + m_current_token = { TokenType::Eof, 0, StringView(nullptr) }; + m_current_char = 0; + m_previous_position = 0; +} + +Token Lexer::next() +{ + size_t token_start_position; + + auto begin_token = [&] { + token_start_position = m_position; + }; + + auto commit_token = [&](auto type) -> Token& { + ASSERT(token_start_position + m_previous_position - token_start_position + 1 <= m_source.length()); + auto substring = m_source.substring_view(token_start_position, m_previous_position - token_start_position + 1); + m_current_token = Token(type, token_start_position, substring); + return m_current_token; + }; + + auto emit_token = [&](auto type) -> Token& { + m_current_token = Token(type, m_position, m_source.substring_view(m_position, 1)); + consume(); + return m_current_token; + }; + + auto match_escape_sequence = [&]() -> size_t { + switch (peek(1)) { + case '^': + case '.': + case '[': + case ']': + case '$': + case '(': + case ')': + case '|': + case '*': + case '+': + case '?': + case '{': + case '\\': + return 2; + default: + fprintf(stderr, "[LEXER] Found invalid escape sequence: \\%c\n", peek(1)); + return 0; + } + }; + + while (m_position <= m_source.length()) { + auto ch = peek(); + if (ch == '(') + return emit_token(TokenType::LeftParen); + + if (ch == ')') + return emit_token(TokenType::RightParen); + + if (ch == '{') + return emit_token(TokenType::LeftCurly); + + if (ch == '}') + return emit_token(TokenType::RightCurly); + + if (ch == '[') + return emit_token(TokenType::LeftBracket); + + if (ch == ']') + return emit_token(TokenType::RightBracket); + + if (ch == '.') + return emit_token(TokenType::Period); + + if (ch == '*') + return emit_token(TokenType::Asterisk); + + if (ch == '+') + return emit_token(TokenType::Plus); + + if (ch == '$') + return emit_token(TokenType::Dollar); + + if (ch == '^') + return emit_token(TokenType::Circumflex); + + if (ch == '|') + return emit_token(TokenType::Pipe); + + if (ch == '?') + return emit_token(TokenType::Questionmark); + + if (ch == ',') + return emit_token(TokenType::Comma); + + if (ch == '/') + return emit_token(TokenType::Slash); + + if (ch == '=') + return emit_token(TokenType::EqualSign); + + if (ch == ':') + return emit_token(TokenType::Colon); + + if (ch == '-') + return emit_token(TokenType::HyphenMinus); + + if (ch == '\\') { + size_t escape = match_escape_sequence(); + if (escape > 0) { + begin_token(); + for (size_t i = 0; i < escape; ++i) + consume(); + return commit_token(TokenType::EscapeSequence); + } + } + + if (ch == EOF) + break; + + return emit_token(TokenType::Char); + } + + return Token(TokenType::Eof, m_position, nullptr); +} + +} diff --git a/Libraries/LibRegex/RegexLexer.h b/Libraries/LibRegex/RegexLexer.h new file mode 100644 index 0000000000..77a7017990 --- /dev/null +++ b/Libraries/LibRegex/RegexLexer.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#pragma once + +#include <AK/Forward.h> +#include <AK/StringView.h> + +namespace regex { + +#define ENUMERATE_REGEX_TOKENS \ + __ENUMERATE_REGEX_TOKEN(Eof) \ + __ENUMERATE_REGEX_TOKEN(Char) \ + __ENUMERATE_REGEX_TOKEN(Circumflex) \ + __ENUMERATE_REGEX_TOKEN(Period) \ + __ENUMERATE_REGEX_TOKEN(LeftParen) \ + __ENUMERATE_REGEX_TOKEN(RightParen) \ + __ENUMERATE_REGEX_TOKEN(LeftCurly) \ + __ENUMERATE_REGEX_TOKEN(RightCurly) \ + __ENUMERATE_REGEX_TOKEN(LeftBracket) \ + __ENUMERATE_REGEX_TOKEN(RightBracket) \ + __ENUMERATE_REGEX_TOKEN(Asterisk) \ + __ENUMERATE_REGEX_TOKEN(EscapeSequence) \ + __ENUMERATE_REGEX_TOKEN(Dollar) \ + __ENUMERATE_REGEX_TOKEN(Pipe) \ + __ENUMERATE_REGEX_TOKEN(Plus) \ + __ENUMERATE_REGEX_TOKEN(Comma) \ + __ENUMERATE_REGEX_TOKEN(Slash) \ + __ENUMERATE_REGEX_TOKEN(EqualSign) \ + __ENUMERATE_REGEX_TOKEN(HyphenMinus) \ + __ENUMERATE_REGEX_TOKEN(Colon) \ + __ENUMERATE_REGEX_TOKEN(Questionmark) + +enum class TokenType { +#define __ENUMERATE_REGEX_TOKEN(x) x, + ENUMERATE_REGEX_TOKENS +#undef __ENUMERATE_REGEX_TOKEN +}; + +class Token { +public: + Token() = default; + Token(const TokenType type, const size_t start_position, const StringView value) + : m_type(type) + , m_position(start_position) + , m_value(value) + { + } + + TokenType type() const { return m_type; } + const StringView& value() const { return m_value; } + size_t position() const { return m_position; } + + const char* name() const; + static const char* name(const TokenType); + +private: + TokenType m_type { TokenType::Eof }; + size_t m_position { 0 }; + StringView m_value { nullptr }; +}; + +class Lexer { +public: + Lexer() = default; + explicit Lexer(const StringView source); + Token next(); + void reset(); + void back(size_t offset); + void set_source(const StringView source) { m_source = source; } + +private: + ALWAYS_INLINE char peek(size_t offset = 0) const; + ALWAYS_INLINE void consume(); + + StringView m_source {}; + size_t m_position { 0 }; + size_t m_previous_position { 0 }; + Token m_current_token { TokenType::Eof, 0, StringView(nullptr) }; + int m_current_char { 0 }; +}; + +} + +using regex::Lexer; diff --git a/Libraries/LibRegex/RegexMatch.h b/Libraries/LibRegex/RegexMatch.h new file mode 100644 index 0000000000..8b41424079 --- /dev/null +++ b/Libraries/LibRegex/RegexMatch.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#pragma once + +#include "RegexOptions.h" + +#include "AK/FlyString.h" +#include "AK/HashMap.h" +#include "AK/String.h" +#include "AK/StringView.h" +#include "AK/Vector.h" + +namespace regex { + +class Match final { +private: + Optional<FlyString> string; + +public: + Match() = default; + ~Match() = default; + + Match(const StringView view_, const size_t line_, const size_t column_, const size_t global_offset_) + : view(view_) + , line(line_) + , column(column_) + , global_offset(global_offset_) + , left_column(column_) + { + } + + Match(const String string_, const size_t line_, const size_t column_, const size_t global_offset_) + : string(string_) + , view(string.value().view()) + , line(line_) + , column(column_) + , global_offset(global_offset_) + , left_column(column_) + { + } + + StringView view { nullptr }; + size_t line { 0 }; + size_t column { 0 }; + size_t global_offset { 0 }; + + // ugly, as not usable by user, but needed to prevent to create extra vectors that are + // able to store the column when the left paren has been found + size_t left_column { 0 }; +}; + +struct MatchInput { + StringView view { nullptr }; + AllOptions regex_options {}; + + size_t match_index { 0 }; + size_t line { 0 }; + size_t column { 0 }; + + size_t global_offset { 0 }; // For multiline matching, knowning the offset from start could be important +}; + +struct MatchState { + size_t string_position { 0 }; + size_t instruction_position { 0 }; + size_t fork_at_position { 0 }; +}; + +struct MatchOutput { + size_t operations; + Vector<Match> matches; + Vector<Vector<Match>> capture_group_matches; + Vector<HashMap<String, Match>> named_capture_group_matches; +}; + +} diff --git a/Libraries/LibRegex/RegexMatcher.cpp b/Libraries/LibRegex/RegexMatcher.cpp new file mode 100644 index 0000000000..9a636d9cb1 --- /dev/null +++ b/Libraries/LibRegex/RegexMatcher.cpp @@ -0,0 +1,313 @@ +/* + * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "RegexMatcher.h" +#include "RegexDebug.h" +#include "RegexParser.h" +#include <AK/String.h> +#include <AK/StringBuilder.h> + +namespace regex { + +#ifdef REGEX_DEBUG +static RegexDebug s_regex_dbg(stderr); +#endif + +template<class Parser> +Regex<Parser>::Regex(StringView pattern, typename ParserTraits<Parser>::OptionsType regex_options) +{ + pattern_value = pattern.to_string(); + regex::Lexer lexer(pattern); + + Parser parser(lexer, regex_options); + parser_result = parser.parse(); + + if (parser_result.error == regex::Error::NoError) { + matcher = make<Matcher<Parser>>(*this, regex_options); + } else { + fprintf(stderr, "%s\n", error_string().characters()); + } +} + +template<class Parser> +String Regex<Parser>::error_string(Optional<String> message) const +{ + StringBuilder eb; + eb.appendf("Error during parsing of regular expression:\n"); + eb.appendf(" %s\n ", pattern_value.characters()); + for (size_t i = 0; i < parser_result.error_token.position(); ++i) + eb.append(" "); + + eb.appendf("^---- %s\n", message.value_or(get_error_string(parser_result.error)).characters()); + return eb.build(); +} + +template<typename Parser> +RegexResult Matcher<Parser>::match(const StringView& view, Optional<typename ParserTraits<Parser>::OptionsType> regex_options) const +{ + size_t match_count { 0 }; + Vector<StringView> views { view }; + + MatchInput input; + MatchState state; + MatchOutput output; + + input.regex_options = m_regex_options | regex_options.value_or({}).value(); + output.operations = 0; + + if (input.regex_options & AllFlags::Multiline) + views = view.lines(false); // FIXME: how do we know, which line ending a line has (1char or 2char)? This is needed to get the correct match offsets from start of string... + + if (c_match_preallocation_count) { + output.matches.ensure_capacity(c_match_preallocation_count); + output.capture_group_matches.ensure_capacity(c_match_preallocation_count); + output.named_capture_group_matches.ensure_capacity(c_match_preallocation_count); + + auto& capture_groups_count = m_pattern.parser_result.capture_groups_count; + auto& named_capture_groups_count = m_pattern.parser_result.named_capture_groups_count; + + for (size_t j = 0; j < c_match_preallocation_count; ++j) { + output.matches.empend(); + output.capture_group_matches.unchecked_append({}); + output.capture_group_matches.at(j).ensure_capacity(capture_groups_count); + for (size_t k = 0; k < capture_groups_count; ++k) + output.capture_group_matches.at(j).unchecked_append({}); + + output.named_capture_group_matches.unchecked_append({}); + output.named_capture_group_matches.at(j).ensure_capacity(named_capture_groups_count); + } + } + + auto append_match = [](auto& input, auto& state, auto& output, auto& start_position) { + if (output.matches.size() == input.match_index) + output.matches.empend(); + + ASSERT(start_position + state.string_position - start_position <= input.view.length()); + if (input.regex_options & AllFlags::StringCopyMatches) { + output.matches.at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position).to_string(), input.line, start_position, input.global_offset + start_position }; + } else { // let the view point to the original string ... + output.matches.at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position), input.line, start_position, input.global_offset + start_position }; + } + }; + +#ifdef REGEX_DEBUG + s_regex_dbg.print_header(); +#endif + + bool continue_search = (input.regex_options & AllFlags::Global) || (input.regex_options & AllFlags::Multiline); + + for (auto& view : views) { + input.view = view; +#ifdef REGEX_DEBUG + dbg() << "[match] Starting match with view (" << view.length() << "): _" << view << "_"; +#endif + + auto view_length = view.length(); + for (size_t view_index = 0; view_index < view_length; ++view_index) { + auto& match_length_minimum = m_pattern.parser_result.match_length_minimum; + // FIXME: More performant would be to know the remaining minimum string + // length needed to match from the current position onwards within + // the vm. Add new OpCode for MinMatchLengthFromSp with the value of + // the remaining string length from the current path. The value though + // has to be filled in reverse. That implies a second run over bytecode + // after generation has finished. + if (match_length_minimum && match_length_minimum > view_length - view_index) + break; + + input.column = match_count; + input.match_index = match_count; + + state.string_position = view_index; + state.instruction_position = 0; + + auto success = execute(input, state, output, 0); + if (!success.has_value()) + return { false, 0, {}, {}, {}, output.operations }; + + if (success.value()) { + + if ((input.regex_options & AllFlags::MatchNotEndOfLine) && state.string_position == input.view.length()) { + if (!continue_search) + break; + continue; + } + if ((input.regex_options & AllFlags::MatchNotBeginOfLine) && view_index == 0) { + if (!continue_search) + break; + continue; + } + +#ifdef REGEX_DEBUG + dbg() << "state.string_position: " << state.string_position << " view_index: " << view_index; + dbg() << "[match] Found a match (length = " << state.string_position - view_index << "): " << input.view.substring_view(view_index, state.string_position - view_index); +#endif + ++match_count; + + if (continue_search) { + append_match(input, state, output, view_index); + + bool has_zero_length = state.string_position == view_index; + view_index = state.string_position - (has_zero_length ? 0 : 1); + continue; + + } else if (!continue_search && state.string_position < view_length) + return { false, 0, {}, {}, {}, output.operations }; + + append_match(input, state, output, view_index); + break; + } + + if (!continue_search) + break; + } + + ++input.line; + input.global_offset += view.length() + 1; // +1 includes the line break character + } + + MatchOutput output_copy; + if (match_count) { + auto capture_groups_count = min(output.capture_group_matches.size(), output.matches.size()); + for (size_t i = 0; i < capture_groups_count; ++i) { + output_copy.capture_group_matches.append(output.capture_group_matches.at(i)); + } + + auto named_capture_groups_count = min(output.named_capture_group_matches.size(), output.matches.size()); + for (size_t i = 0; i < named_capture_groups_count; ++i) { + if (output.matches.at(i).view.length()) + output_copy.named_capture_group_matches.append(output.named_capture_group_matches.at(i)); + } + + for (size_t i = 0; i < match_count; ++i) + output_copy.matches.append(output.matches.at(i)); + + } else { + output_copy.capture_group_matches.clear_with_capacity(); + output_copy.named_capture_group_matches.clear_with_capacity(); + } + + return { + match_count ? true : false, + match_count, + move(output_copy.matches), + move(output_copy.capture_group_matches), + move(output_copy.named_capture_group_matches), + output.operations, + }; +} + +template<class Parser> +Optional<bool> Matcher<Parser>::execute(const MatchInput& input, MatchState& state, MatchOutput& output, size_t recursion_level) const +{ + if (recursion_level > c_max_recursion) + return false; + + Vector<MatchState> fork_low_prio_states; + MatchState fork_high_prio_state; + Optional<bool> success; + + auto& bytecode = m_pattern.parser_result.bytecode; + + for (;;) { + ++output.operations; + auto* opcode = bytecode.get_opcode(state); + + if (!opcode) { + dbg() << "Wrong opcode... failed!"; + return {}; + } + +#ifdef REGEX_DEBUG + s_regex_dbg.print_opcode("VM", *opcode, state, recursion_level, false); +#endif + + auto result = opcode->execute(input, state, output); + +#ifdef REGEX_DEBUG + s_regex_dbg.print_result(*opcode, bytecode, input, state, result); +#endif + + state.instruction_position += opcode->size(); + + switch (result) { + case ExecutionResult::Fork_PrioLow: + fork_low_prio_states.prepend(state); + continue; + case ExecutionResult::Fork_PrioHigh: + fork_high_prio_state = state; + fork_high_prio_state.instruction_position = fork_high_prio_state.fork_at_position; + success = execute(input, fork_high_prio_state, output, ++recursion_level); + if (!success.has_value()) + return {}; + + if (success.value()) { + state = fork_high_prio_state; + return true; + } + + continue; + case ExecutionResult::Continue: + continue; + case ExecutionResult::Succeeded: + return true; + case ExecutionResult::Failed: + return false; + case ExecutionResult::Failed_ExecuteLowPrioForks: + return execute_low_prio_forks(input, state, output, fork_low_prio_states, recursion_level + 1); + } + } + + ASSERT_NOT_REACHED(); +} + +template<class Parser> +ALWAYS_INLINE Optional<bool> Matcher<Parser>::execute_low_prio_forks(const MatchInput& input, MatchState& original_state, MatchOutput& output, Vector<MatchState> states, size_t recursion_level) const +{ + for (auto& state : states) { + + state.instruction_position = state.fork_at_position; +#ifdef REGEX_DEBUG + fprintf(stderr, "Forkstay... ip = %lu, sp = %lu\n", state.instruction_position, state.string_position); +#endif + auto success = execute(input, state, output, recursion_level); + if (!success.has_value()) + return {}; + if (success.value()) { +#ifdef REGEX_DEBUG + fprintf(stderr, "Forkstay succeeded... ip = %lu, sp = %lu\n", state.instruction_position, state.string_position); +#endif + original_state = state; + return true; + } + } + + original_state.string_position = 0; + return false; +}; + +template class Matcher<PosixExtendedParser>; +template class Regex<PosixExtendedParser>; +} diff --git a/Libraries/LibRegex/RegexMatcher.h b/Libraries/LibRegex/RegexMatcher.h new file mode 100644 index 0000000000..b98873c81f --- /dev/null +++ b/Libraries/LibRegex/RegexMatcher.h @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#pragma once + +#include "RegexByteCode.h" +#include "RegexMatch.h" +#include "RegexOptions.h" +#include "RegexParser.h" + +#include <AK/Forward.h> +#include <AK/HashMap.h> +#include <AK/NonnullOwnPtrVector.h> +#include <AK/Types.h> +#include <AK/Vector.h> + +#include <stdio.h> + +namespace regex { + +static const constexpr size_t c_max_recursion = 5000; +static const constexpr size_t c_match_preallocation_count = 0; + +struct RegexResult final { + bool success { false }; + size_t count { 0 }; + Vector<Match> matches; + Vector<Vector<Match>> capture_group_matches; + Vector<HashMap<String, Match>> named_capture_group_matches; + size_t operations { 0 }; +}; + +template<class Parser> +class Regex; + +template<class Parser> +class Matcher final { + +public: + Matcher(const Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) + : m_pattern(pattern) + , m_regex_options(regex_options.value_or({})) + { + } + ~Matcher() = default; + + RegexResult match(const StringView&, Optional<typename ParserTraits<Parser>::OptionsType> = {}) const; + +private: + Optional<bool> execute(const MatchInput& input, MatchState& state, MatchOutput& output, size_t recursion_level) const; + ALWAYS_INLINE Optional<bool> execute_low_prio_forks(const MatchInput& input, MatchState& original_state, MatchOutput& output, Vector<MatchState> states, size_t recursion_level) const; + + const Regex<Parser>& m_pattern; + const typename ParserTraits<Parser>::OptionsType m_regex_options; +}; + +template<class Parser> +class Regex final { +public: + String pattern_value; + regex::Parser::Result parser_result; + OwnPtr<Matcher<Parser>> matcher { nullptr }; + + Regex(StringView pattern, typename ParserTraits<Parser>::OptionsType regex_options = {}); + ~Regex() = default; + + void print_bytecode(FILE* f = stdout) const; + String error_string(Optional<String> message = {}) const; + + RegexResult match(StringView view, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const + { + if (!matcher || parser_result.error != Error::NoError) + return {}; + return matcher->match(view, regex_options); + } + + RegexResult search(StringView view, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const + { + if (!matcher || parser_result.error != Error::NoError) + return {}; + + AllOptions options = (AllOptions)regex_options.value_or({}); + if ((options & AllFlags::MatchNotBeginOfLine) && (options & AllFlags::MatchNotEndOfLine)) { + options.reset_flag(AllFlags::MatchNotEndOfLine); + options.reset_flag(AllFlags::MatchNotBeginOfLine); + } + options |= AllFlags::Global; + + return matcher->match(view, options); + } + + bool match(StringView view, RegexResult& m, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const + { + if (!matcher || parser_result.error != Error::NoError) + return {}; + m = matcher->match(view, regex_options); + return m.success; + } + + bool search(StringView view, RegexResult& m, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const + { + m = search(view, regex_options); + return m.success; + } + + bool has_match(const StringView view, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const + { + if (!matcher || parser_result.error != Error::NoError) + return false; + RegexResult result = matcher->match(view, AllOptions { regex_options.value_or({}) } | AllFlags::SkipSubExprResults); + return result.success; + } +}; + +template<class Parser> +RegexResult match(const StringView view, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) +{ + if (!pattern.matcher || pattern.parser_result.error != Error::NoError) + return {}; + return pattern.matcher->match(view, regex_options); +} + +template<class Parser> +bool match(const StringView view, Regex<Parser>& pattern, RegexResult& res, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) +{ + if (!pattern.matcher || pattern.parser_result.error != Error::NoError) + return {}; + res = pattern.matcher->match(view, regex_options); + return res.success; +} + +template<class Parser> +RegexResult search(const StringView view, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) +{ + if (!pattern.matcher || pattern.parser_result.error != Error::NoError) + return {}; + return pattern.matcher->search(view, regex_options); +} + +template<class Parser> +bool search(const StringView view, Regex<Parser>& pattern, RegexResult& res, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) +{ + if (!pattern.matcher || pattern.parser_result.error != Error::NoError) + return {}; + res = pattern.matcher->search(view, regex_options); + return res.success; +} + +template<class Parser> +bool has_match(const StringView view, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) +{ + if (pattern.matcher == nullptr) + return {}; + RegexResult result = pattern.matcher->match(view, AllOptions { regex_options.value_or({}) } | AllFlags::SkipSubExprResults); + return result.success; +} + +} + +using regex::has_match; +using regex::match; +using regex::Regex; +using regex::RegexResult; diff --git a/Libraries/LibRegex/RegexOptions.h b/Libraries/LibRegex/RegexOptions.h new file mode 100644 index 0000000000..36cc0ce40a --- /dev/null +++ b/Libraries/LibRegex/RegexOptions.h @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#pragma once + +#include <AK/Types.h> +#include <stdio.h> +#ifdef __serenity__ +# include <regex.h> +#else +# include <LibC/regex.h> +#endif + +namespace regex { + +using FlagsUnderlyingType = u16; + +enum class AllFlags { + Global = __Regex_Global, // All matches (don't return after first match) + Insensitive = __Regex_Insensitive, // Case insensitive match (ignores case of [a-zA-Z]) + Ungreedy = __Regex_Ungreedy, // The match becomes lazy by default. Now a ? following a quantifier makes it greedy + Unicode = __Regex_Unicode, // Enable all unicode features and interpret all unicode escape sequences as such + Extended = __Regex_Extended, // Ignore whitespaces. Spaces and text after a # in the pattern are ignored + Extra = __Regex_Extra, // Disallow meaningless escapes. A \ followed by a letter with no special meaning is faulted + MatchNotBeginOfLine = __Regex_MatchNotBeginOfLine, // Pattern is not forced to ^ -> search in whole string! + MatchNotEndOfLine = __Regex_MatchNotEndOfLine, // Don't Force the dollar sign, $, to always match end of the string, instead of end of the line. This option is ignored if the Multiline-flag is set + SkipSubExprResults = __Regex_SkipSubExprResults, // Do not return sub expressions in the result + StringCopyMatches = __Regex_StringCopyMatches, // Do explicitly copy results into new allocated string instead of StringView to original string. + SingleLine = __Regex_SingleLine, // Dot matches newline characters + Sticky = __Regex_Sticky, // Force the pattern to only match consecutive matches from where the previous match ended. + Multiline = __Regex_Multiline, // Handle newline characters. Match each line, one by one. + SkipTrimEmptyMatches = __Regex_SkipTrimEmptyMatches, // Do not remove empty capture group results. + Last = SkipTrimEmptyMatches +}; + +enum class PosixFlags : FlagsUnderlyingType { + Global = (FlagsUnderlyingType)AllFlags::Global, + Insensitive = (FlagsUnderlyingType)AllFlags::Insensitive, + Ungreedy = (FlagsUnderlyingType)AllFlags::Ungreedy, + Unicode = (FlagsUnderlyingType)AllFlags::Unicode, + Extended = (FlagsUnderlyingType)AllFlags::Extended, + Extra = (FlagsUnderlyingType)AllFlags::Extra, + MatchNotBeginOfLine = (FlagsUnderlyingType)AllFlags::MatchNotBeginOfLine, + MatchNotEndOfLine = (FlagsUnderlyingType)AllFlags::MatchNotEndOfLine, + SkipSubExprResults = (FlagsUnderlyingType)AllFlags::SkipSubExprResults, + Multiline = (FlagsUnderlyingType)AllFlags::Multiline, + StringCopyMatches = (FlagsUnderlyingType)AllFlags::StringCopyMatches, +}; + +enum class ECMAScriptFlags : FlagsUnderlyingType { + Global = (FlagsUnderlyingType)AllFlags::Global, + Insensitive = (FlagsUnderlyingType)AllFlags::Insensitive, + Ungreedy = (FlagsUnderlyingType)AllFlags::Ungreedy, + Unicode = (FlagsUnderlyingType)AllFlags::Unicode, + Extended = (FlagsUnderlyingType)AllFlags::Extended, + Extra = (FlagsUnderlyingType)AllFlags::Extra, + SingleLine = (FlagsUnderlyingType)AllFlags::SingleLine, + Sticky = (FlagsUnderlyingType)AllFlags::Sticky, + Multiline = (FlagsUnderlyingType)AllFlags::Multiline, + StringCopyMatches = (FlagsUnderlyingType)AllFlags::StringCopyMatches, +}; + +template<class T> +class RegexOptions { +public: + using FlagsType = T; + + RegexOptions() = default; + + RegexOptions(T flags) + : m_flags(flags) + { + } + + template<class U> + RegexOptions(RegexOptions<U> other) + : m_flags((T) static_cast<FlagsUnderlyingType>(other.value())) + { + } + + operator bool() const { return !!*this; } + bool operator!() const { return (FlagsUnderlyingType)m_flags == 0; } + + RegexOptions<T> operator|(T flag) const { return RegexOptions<T> { (T)((FlagsUnderlyingType)m_flags | (FlagsUnderlyingType)flag) }; } + RegexOptions<T> operator&(T flag) const { return RegexOptions<T> { (T)((FlagsUnderlyingType)m_flags & (FlagsUnderlyingType)flag) }; } + + RegexOptions<T>& operator|=(T flag) + { + m_flags = (T)((FlagsUnderlyingType)m_flags | (FlagsUnderlyingType)flag); + return *this; + } + + RegexOptions<T>& operator&=(T flag) + { + m_flags = (T)((FlagsUnderlyingType)m_flags & (FlagsUnderlyingType)flag); + return *this; + } + + void reset_flags() { m_flags = (T)0; } + void reset_flag(T flag) { m_flags = (T)((FlagsUnderlyingType)m_flags & ~(FlagsUnderlyingType)flag); } + void set_flag(T flag) { *this |= flag; } + bool has_flag_set(T flag) const { return *this & flag; } + T value() const { return m_flags; } + +private: + T m_flags { 0 }; +}; + +template<class T> +inline RegexOptions<T> operator|(T lhs, T rhs) +{ + return RegexOptions<T> { lhs } |= rhs; +} + +template<class T> +inline RegexOptions<T> operator&(T lhs, T rhs) +{ + return RegexOptions<T> { lhs } &= rhs; +} + +template<class T> +inline T operator~(T flag) +{ + return (T) ~((FlagsUnderlyingType)flag); +} + +using AllOptions = RegexOptions<AllFlags>; +using ECMAScriptOptions = RegexOptions<ECMAScriptFlags>; +using PosixOptions = RegexOptions<PosixFlags>; + +} + +using regex::ECMAScriptFlags; +using regex::ECMAScriptOptions; +using regex::PosixFlags; +using regex::PosixOptions; diff --git a/Libraries/LibRegex/RegexParser.cpp b/Libraries/LibRegex/RegexParser.cpp new file mode 100644 index 0000000000..ff64b59530 --- /dev/null +++ b/Libraries/LibRegex/RegexParser.cpp @@ -0,0 +1,598 @@ +/* + * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "RegexParser.h" +#include "RegexDebug.h" +#include <AK/String.h> +#include <AK/StringBuilder.h> +#include <cstdio> + +namespace regex { + +ALWAYS_INLINE bool Parser::set_error(Error error) +{ + if (m_parser_state.error == Error::NoError) { + m_parser_state.error = error; + m_parser_state.error_token = m_parser_state.current_token; + } + return false; // always return false, that eases the API usage (return set_error(...)) :^) +} + +ALWAYS_INLINE bool Parser::done() const +{ + return match(TokenType::Eof); +} + +ALWAYS_INLINE bool Parser::match(TokenType type) const +{ + return m_parser_state.current_token.type() == type; +} + +ALWAYS_INLINE Token Parser::consume() +{ + auto old_token = m_parser_state.current_token; + m_parser_state.current_token = m_parser_state.lexer.next(); + return old_token; +} + +ALWAYS_INLINE Token Parser::consume(TokenType type, Error error) +{ + if (m_parser_state.current_token.type() != type) { + set_error(error); + dbg() << "[PARSER] Error: Unexpected token " << m_parser_state.current_token.name() << ". Expected: " << Token::name(type); + } + return consume(); +} + +ALWAYS_INLINE bool Parser::consume(const String& str) +{ + size_t potentially_go_back { 1 }; + for (auto ch : str) { + if (match(TokenType::Char)) { + if (m_parser_state.current_token.value()[0] != ch) { + m_parser_state.lexer.back(potentially_go_back); + m_parser_state.current_token = m_parser_state.lexer.next(); + return false; + } + } else { + m_parser_state.lexer.back(potentially_go_back); + m_parser_state.current_token = m_parser_state.lexer.next(); + return false; + } + consume(TokenType::Char, Error::NoError); + ++potentially_go_back; + } + return true; +} + +ALWAYS_INLINE void Parser::reset() +{ + m_parser_state.bytecode.clear(); + m_parser_state.lexer.reset(); + m_parser_state.current_token = m_parser_state.lexer.next(); + m_parser_state.error = Error::NoError; + m_parser_state.error_token = { TokenType::Eof, 0, StringView(nullptr) }; + m_parser_state.regex_options = {}; +} + +Parser::Result Parser::parse(Optional<AllOptions> regex_options) +{ + reset(); + if (regex_options.has_value()) + m_parser_state.regex_options = regex_options.value(); + if (parse_internal(m_parser_state.bytecode, m_parser_state.match_length_minimum)) + consume(TokenType::Eof, Error::InvalidPattern); + else + set_error(Error::InvalidPattern); + +#ifdef REGEX_DEBUG + fprintf(stderr, "[PARSER] Produced bytecode with %lu entries (opcodes + arguments)\n", m_parser_state.bytecode.size()); +#endif + return { + move(m_parser_state.bytecode), + move(m_parser_state.capture_groups_count), + move(m_parser_state.named_capture_groups_count), + move(m_parser_state.match_length_minimum), + move(m_parser_state.error), + move(m_parser_state.error_token) + }; +} + +// ============================= +// PosixExtended Parser +// ============================= + +bool PosixExtendedParser::parse_internal(ByteCode& stack, size_t& match_length_minimum) +{ + return parse_root(stack, match_length_minimum); +} + +ALWAYS_INLINE bool PosixExtendedParser::match_repetition_symbol() +{ + auto type = m_parser_state.current_token.type(); + return (type == TokenType::Asterisk + || type == TokenType::Plus + || type == TokenType::Questionmark + || type == TokenType::LeftCurly); +} + +ALWAYS_INLINE bool PosixExtendedParser::match_ordinary_characters() +{ + // NOTE: This method must not be called during bracket and repetition parsing! + // FIXME: Add assertion for that? + auto type = m_parser_state.current_token.type(); + return (type == TokenType::Char + || type == TokenType::Comma + || type == TokenType::Slash + || type == TokenType::EqualSign + || type == TokenType::HyphenMinus + || type == TokenType::Colon); +} + +ALWAYS_INLINE bool PosixExtendedParser::parse_repetition_symbol(ByteCode& bytecode_to_repeat, size_t& match_length_minimum) +{ + if (match(TokenType::LeftCurly)) { + consume(); + + StringBuilder number_builder; + + while (match(TokenType::Char)) { + number_builder.append(consume().value()); + } + + auto maybe_minimum = number_builder.build().to_uint(); + if (!maybe_minimum.has_value()) + return set_error(Error::InvalidBraceContent); + + auto minimum = maybe_minimum.value(); + match_length_minimum *= minimum; + + if (match(TokenType::Comma)) { + consume(); + } else { + ByteCode bytecode; + bytecode.insert_bytecode_repetition_n(bytecode_to_repeat, minimum); + bytecode_to_repeat = move(bytecode); + + consume(TokenType::RightCurly, Error::MismatchingBrace); + return !has_error(); + } + + Optional<size_t> maybe_maximum {}; + number_builder.clear(); + while (match(TokenType::Char)) { + number_builder.append(consume().value()); + } + if (!number_builder.is_empty()) { + auto value = number_builder.build().to_uint(); + if (!value.has_value() || minimum > value.value()) + return set_error(Error::InvalidBraceContent); + + maybe_maximum = value.value(); + } + + bytecode_to_repeat.insert_bytecode_repetition_min_max(bytecode_to_repeat, minimum, maybe_maximum); + + consume(TokenType::RightCurly, Error::MismatchingBrace); + return !has_error(); + + } else if (match(TokenType::Plus)) { + consume(); + + bool greedy = match(TokenType::Questionmark); + if (greedy) + consume(); + + // Note: dont touch match_length_minimum, it's already correct + bytecode_to_repeat.insert_bytecode_repetition_min_one(bytecode_to_repeat, greedy); + return !has_error(); + + } else if (match(TokenType::Asterisk)) { + consume(); + match_length_minimum = 0; + + bool greedy = match(TokenType::Questionmark); + if (greedy) + consume(); + + bytecode_to_repeat.insert_bytecode_repetition_any(bytecode_to_repeat, greedy); + + return !has_error(); + + } else if (match(TokenType::Questionmark)) { + consume(); + match_length_minimum = 0; + + bool greedy = match(TokenType::Questionmark); + if (greedy) + consume(); + + bytecode_to_repeat.insert_bytecode_repetition_zero_or_one(bytecode_to_repeat, greedy); + return !has_error(); + } + + return false; +} + +ALWAYS_INLINE bool PosixExtendedParser::parse_bracket_expression(ByteCode& stack, size_t& match_length_minimum) +{ + Vector<CompareTypeAndValuePair> values; + + for (;;) { + + if (match(TokenType::HyphenMinus)) { + consume(); + + if (values.is_empty() || (values.size() == 1 && values.last().type == CharacterCompareType::Inverse)) { + // first in the bracket expression + values.append({ CharacterCompareType::Char, (ByteCodeValueType)'-' }); + } else if (match(TokenType::RightBracket)) { + // Last in the bracket expression + values.append({ CharacterCompareType::Char, (ByteCodeValueType)'-' }); + } else if (values.last().type == CharacterCompareType::Char) { + values.append({ CharacterCompareType::RangeExpressionDummy, 0 }); + + if (match(TokenType::HyphenMinus)) { + consume(); + // Valid range, add ordinary character + values.append({ CharacterCompareType::Char, (ByteCodeValueType)'-' }); + } + } else { + return set_error(Error::InvalidRange); + } + + } else if (match(TokenType::Char) || match(TokenType::Period) || match(TokenType::Asterisk) || match(TokenType::EscapeSequence) || match(TokenType::Plus)) { + values.append({ CharacterCompareType::Char, (ByteCodeValueType)*consume().value().characters_without_null_termination() }); + + } else if (match(TokenType::Circumflex)) { + auto t = consume(); + + if (values.is_empty()) + values.append({ CharacterCompareType::Inverse, 0 }); + else + values.append({ CharacterCompareType::Char, (ByteCodeValueType)*t.value().characters_without_null_termination() }); + + } else if (match(TokenType::LeftBracket)) { + consume(); + + if (match(TokenType::Period)) { + consume(); + + // FIXME: Parse collating element, this is needed when we have locale support + // This could have impact on length parameter, I guess. + ASSERT_NOT_REACHED(); + + consume(TokenType::Period, Error::InvalidCollationElement); + consume(TokenType::RightBracket, Error::MismatchingBracket); + + } else if (match(TokenType::EqualSign)) { + consume(); + // FIXME: Parse collating element, this is needed when we have locale support + // This could have impact on length parameter, I guess. + ASSERT_NOT_REACHED(); + + consume(TokenType::EqualSign, Error::InvalidCollationElement); + consume(TokenType::RightBracket, Error::MismatchingBracket); + + } else if (match(TokenType::Colon)) { + consume(); + + CharClass ch_class; + // parse character class + if (match(TokenType::Char)) { + if (consume("alnum")) + ch_class = CharClass::Alnum; + else if (consume("alpha")) + ch_class = CharClass::Alpha; + else if (consume("blank")) + ch_class = CharClass::Blank; + else if (consume("cntrl")) + ch_class = CharClass::Cntrl; + else if (consume("digit")) + ch_class = CharClass::Digit; + else if (consume("graph")) + ch_class = CharClass::Graph; + else if (consume("lower")) + ch_class = CharClass::Lower; + else if (consume("print")) + ch_class = CharClass::Print; + else if (consume("punct")) + ch_class = CharClass::Punct; + else if (consume("space")) + ch_class = CharClass::Space; + else if (consume("upper")) + ch_class = CharClass::Upper; + else if (consume("xdigit")) + ch_class = CharClass::Xdigit; + else + return set_error(Error::InvalidCharacterClass); + + values.append({ CharacterCompareType::CharClass, (ByteCodeValueType)ch_class }); + + } else + return set_error(Error::InvalidCharacterClass); + + // FIXME: we do not support locale specific character classes until locales are implemented + + consume(TokenType::Colon, Error::InvalidCharacterClass); + consume(TokenType::RightBracket, Error::MismatchingBracket); + } else { + return set_error(Error::MismatchingBracket); + } + + } else if (match(TokenType::RightBracket)) { + + if (values.is_empty() || (values.size() == 1 && values.last().type == CharacterCompareType::Inverse)) { + // handle bracket as ordinary character + values.append({ CharacterCompareType::Char, (ByteCodeValueType)*consume().value().characters_without_null_termination() }); + } else { + // closing bracket expression + break; + } + } else + // nothing matched, this is a failure, as at least the closing bracket must match... + return set_error(Error::MismatchingBracket); + + // check if range expression has to be completed... + if (values.size() >= 3 && values.at(values.size() - 2).type == CharacterCompareType::RangeExpressionDummy) { + if (values.last().type != CharacterCompareType::Char) + return set_error(Error::InvalidRange); + + auto value2 = values.take_last(); + values.take_last(); // RangeExpressionDummy + auto value1 = values.take_last(); + + values.append({ CharacterCompareType::CharRange, static_cast<ByteCodeValueType>(CharRange { (char)value1.value, (char)value2.value }) }); + } + } + + if (values.size()) + match_length_minimum = 1; + + if (values.first().type == CharacterCompareType::Inverse) + match_length_minimum = 0; + + stack.insert_bytecode_compare_values(move(values)); + + return !has_error(); +} + +ALWAYS_INLINE bool PosixExtendedParser::parse_sub_expression(ByteCode& stack, size_t& match_length_minimum) +{ + ByteCode bytecode; + size_t length = 0; + bool should_parse_repetition_symbol { false }; + + for (;;) { + if (match_ordinary_characters()) { + Token start_token = m_parser_state.current_token; + Token last_token = m_parser_state.current_token; + for (;;) { + if (!match_ordinary_characters()) + break; + ++length; + last_token = consume(); + } + + if (length > 1) { + // last character is inserted into 'bytecode' for duplication symbol handling + auto new_length = length - ((match_repetition_symbol() && length > 1) ? 1 : 0); + stack.insert_bytecode_compare_string(start_token.value(), new_length); + } + + if ((match_repetition_symbol() && length > 1) || length == 1) // Create own compare opcode for last character before duplication symbol + bytecode.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)last_token.value().characters_without_null_termination()[0] } }); + + should_parse_repetition_symbol = true; + break; + } + + if (match_repetition_symbol()) + return set_error(Error::InvalidRepetitionMarker); + + if (match(TokenType::Period)) { + length = 1; + consume(); + bytecode.insert_bytecode_compare_values({ { CharacterCompareType::AnyChar, 0 } }); + should_parse_repetition_symbol = true; + break; + } + + if (match(TokenType::EscapeSequence)) { + length = 1; + Token t = consume(); +#ifdef REGEX_DEBUG + printf("[PARSER] EscapeSequence with substring %s\n", String(t.value()).characters()); +#endif + + bytecode.insert_bytecode_compare_values({ { CharacterCompareType::Char, (u32)t.value().characters_without_null_termination()[1] } }); + should_parse_repetition_symbol = true; + break; + } + + if (match(TokenType::LeftBracket)) { + consume(); + + ByteCode sub_ops; + if (!parse_bracket_expression(sub_ops, length) || !sub_ops.size()) + return set_error(Error::InvalidBracketContent); + + bytecode.append(move(sub_ops)); + + consume(TokenType::RightBracket, Error::MismatchingBracket); + should_parse_repetition_symbol = true; + break; + } + + if (match(TokenType::RightBracket)) { + return set_error(Error::MismatchingBracket); + } + + if (match(TokenType::RightCurly)) { + return set_error(Error::MismatchingBrace); + } + + if (match(TokenType::Circumflex)) { + consume(); + bytecode.empend((ByteCodeValueType)OpCodeId::CheckBegin); + break; + } + + if (match(TokenType::Dollar)) { + consume(); + bytecode.empend((ByteCodeValueType)OpCodeId::CheckEnd); + break; + } + + if (match(TokenType::RightParen)) + return false; + + if (match(TokenType::LeftParen)) { + consume(); + Optional<StringView> capture_group_name; + bool prevent_capture_group = false; + if (match(TokenType::Questionmark)) { + consume(); + + if (match(TokenType::Colon)) { + consume(); + prevent_capture_group = true; + } else if (consume("<")) { // named capturing group + + Token start_token = m_parser_state.current_token; + Token last_token = m_parser_state.current_token; + size_t capture_group_name_length = 0; + for (;;) { + if (!match_ordinary_characters()) + return set_error(Error::InvalidNameForCaptureGroup); + if (match(TokenType::Char) && m_parser_state.current_token.value()[0] == '>') { + consume(); + break; + } + ++capture_group_name_length; + last_token = consume(); + } + capture_group_name = StringView(start_token.value().characters_without_null_termination(), capture_group_name_length); + + } else if (match(TokenType::EqualSign)) { // positive lookahead + consume(); + ASSERT_NOT_REACHED(); + } else if (consume("!")) { // negative lookahead + ASSERT_NOT_REACHED(); + } else if (consume("<")) { + if (match(TokenType::EqualSign)) { // positive lookbehind + consume(); + ASSERT_NOT_REACHED(); + } + if (consume("!")) // negative lookbehind + ASSERT_NOT_REACHED(); + } else { + return set_error(Error::InvalidRepetitionMarker); + } + } + + if (!(m_parser_state.regex_options & AllFlags::SkipSubExprResults || prevent_capture_group)) { + if (capture_group_name.has_value()) + bytecode.insert_bytecode_group_capture_left(capture_group_name.value()); + else + bytecode.insert_bytecode_group_capture_left(m_parser_state.capture_groups_count); + } + + ByteCode capture_group_bytecode; + + if (!parse_root(capture_group_bytecode, length)) + return set_error(Error::InvalidPattern); + + bytecode.append(move(capture_group_bytecode)); + + consume(TokenType::RightParen, Error::MismatchingParen); + + if (!(m_parser_state.regex_options & AllFlags::SkipSubExprResults || prevent_capture_group)) { + if (capture_group_name.has_value()) { + bytecode.insert_bytecode_group_capture_right(capture_group_name.value()); + ++m_parser_state.named_capture_groups_count; + } else { + bytecode.insert_bytecode_group_capture_right(m_parser_state.capture_groups_count); + ++m_parser_state.capture_groups_count; + } + } + should_parse_repetition_symbol = true; + break; + } + + return false; + } + + if (match_repetition_symbol()) { + if (should_parse_repetition_symbol) + parse_repetition_symbol(bytecode, length); + else + return set_error(Error::InvalidRepetitionMarker); + } + + stack.append(move(bytecode)); + match_length_minimum += length; + + return true; +} + +bool PosixExtendedParser::parse_root(ByteCode& stack, size_t& match_length_minimum) +{ + ByteCode bytecode_left; + size_t match_length_minimum_left { 0 }; + + if (match_repetition_symbol()) + return set_error(Error::InvalidRepetitionMarker); + + for (;;) { + if (!parse_sub_expression(bytecode_left, match_length_minimum_left)) + break; + + if (match(TokenType::Pipe)) { + consume(); + + ByteCode bytecode_right; + size_t match_length_minimum_right { 0 }; + + if (!parse_root(bytecode_right, match_length_minimum_right) || bytecode_right.is_empty()) + return set_error(Error::InvalidPattern); + + ByteCode new_bytecode; + new_bytecode.insert_bytecode_alternation(move(bytecode_left), move(bytecode_right)); + bytecode_left = move(new_bytecode); + match_length_minimum_left = min(match_length_minimum_right, match_length_minimum_left); + } + } + + if (bytecode_left.is_empty()) + set_error(Error::EmptySubExpression); + + stack.append(move(bytecode_left)); + match_length_minimum = match_length_minimum_left; + return !has_error(); +} + +} diff --git a/Libraries/LibRegex/RegexParser.cpp_ b/Libraries/LibRegex/RegexParser.cpp_ new file mode 100644 index 0000000000..4292371cd2 --- /dev/null +++ b/Libraries/LibRegex/RegexParser.cpp_ @@ -0,0 +1,803 @@ +/* + * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "RegexParser.h" +#include <AK/String.h> +#include <AK/StringBuilder.h> + +namespace AK { +namespace regex { + +const char* ByteCodeValue::name(OpCode type) +{ + switch (type) { +#define __ENUMERATE_OPCODE(x) \ + case OpCode::x: \ + return #x; + ENUMERATE_OPCODES +#undef __ENUMERATE_OPCODE + default: + ASSERT_NOT_REACHED(); + return "<Unknown>"; + } +} + +const char* ByteCodeValue::name() const +{ + return name(op_code); +} + +template<class T> +bool Parser<T>::set_error(Error error) +{ + if (m_parser_state.error == Error::NoError) { + m_parser_state.error = error; + m_parser_state.error_token = m_parser_state.current_token; + } + return false; // always return false, that eases the API usage (return set_error(...)) :^) +} + +template<class T> +bool Parser<T>::done() const +{ + return match(TokenType::Eof); +} + +template<class T> +bool Parser<T>::match(TokenType type) const +{ + return m_parser_state.current_token.type() == type; +} + +template<class T> +Token Parser<T>::consume() +{ + auto old_token = m_parser_state.current_token; + m_parser_state.current_token = m_parser_state.lexer.next(); + return old_token; +} + +template<class T> +Token Parser<T>::consume(TokenType type, Error error) +{ + if (m_parser_state.current_token.type() != type) { + set_error(error); +#ifdef __serenity__ + dbg() << "[PARSER] Error: Unexpected token " << m_parser_state.m_current_token.name() << ". Expected: " << Token::name(type); +#else + fprintf(stderr, "[PARSER] Error: Unexpected token %s. Expected %s\n", m_parser_state.current_token.name(), Token::name(type)); +#endif + } + return consume(); +} + +template<class T> +bool Parser<T>::consume(const String& str) +{ + size_t potentially_go_back { 1 }; + for (auto ch : str) { + if (match(TokenType::OrdinaryCharacter)) { + if (m_parser_state.current_token.value()[0] != ch) { + m_parser_state.lexer.back(potentially_go_back); + m_parser_state.current_token = m_parser_state.lexer.next(); + return false; + } + } else { + m_parser_state.lexer.back(potentially_go_back); + m_parser_state.current_token = m_parser_state.lexer.next(); + return false; + } + consume(TokenType::OrdinaryCharacter); + ++potentially_go_back; + } + return true; +} + +template<class T> +void Parser<T>::reset() +{ + m_parser_state.bytecode.clear(); + m_parser_state.lexer.reset(); + m_parser_state.current_token = m_parser_state.lexer.next(); + m_parser_state.error = Error::NoError; + m_parser_state.error_token = { TokenType::Eof, 0, StringView(nullptr) }; + m_parser_state.regex_options = {}; +} + +template<class T> +ParserResult Parser<T>::parse(Optional<OptionsType> regex_options) +{ + reset(); + if (regex_options.has_value()) + m_parser_state.regex_options = regex_options.value(); + if (parse_internal(m_parser_state.bytecode, m_parser_state.match_length_minimum)) + consume(TokenType::Eof); + else + set_error(Error::InvalidPattern); + +#ifdef REGEX_DEBUG + printf("[PARSER] Produced bytecode with %lu entries (opcodes + arguments)\n", m_parser_state.m_bytes.size()); +#endif + return { + move(m_parser_state.bytecode), + move(m_parser_state.capture_groups_count), + move(m_parser_state.named_capture_groups_count), + move(m_parser_state.match_length_minimum), + move(m_parser_state.error), + move(m_parser_state.error_token) + }; +} + +template<class T> +void Parser<T>::insert_bytecode_compare_values(Vector<ByteCodeValue>& stack, Vector<CompareTypeAndValuePair>&& pairs) +{ + Vector<ByteCodeValue> bytecode; + + bytecode.empend(OpCode::Compare); + bytecode.empend(pairs.size()); // number of arguments + + for (auto& value : pairs) { + ASSERT(value.type != CharacterCompareType::RangeExpressionDummy); + ASSERT(value.type != CharacterCompareType::Undefined); + ASSERT(value.type != CharacterCompareType::OrdinaryCharacters); + + bytecode.append(move(value.type)); + if (value.type != CharacterCompareType::Inverse && value.type != CharacterCompareType::AnySingleCharacter) + bytecode.append(move(value.value)); + } + + stack.append(move(bytecode)); +} + +template<class T> +void Parser<T>::insert_bytecode_group_capture_left(Vector<ByteCodeValue>& stack) +{ + stack.empend(OpCode::SaveLeftCaptureGroup); + stack.empend(m_parser_state.capture_groups_count); +} + +template<class T> +void Parser<T>::insert_bytecode_group_capture_left(Vector<ByteCodeValue>& stack, const StringView& name) +{ + stack.empend(OpCode::SaveLeftNamedCaptureGroup); + stack.empend(name.characters_without_null_termination()); + stack.empend(name.length()); +} + +template<class T> +void Parser<T>::insert_bytecode_group_capture_right(Vector<ByteCodeValue>& stack) +{ + stack.empend(OpCode::SaveRightCaptureGroup); + stack.empend(m_parser_state.capture_groups_count); +} + +template<class T> +void Parser<T>::insert_bytecode_group_capture_right(Vector<ByteCodeValue>& stack, const StringView& name) +{ + stack.empend(OpCode::SaveRightNamedCaptureGroup); + stack.empend(name.characters_without_null_termination()); + stack.empend(name.length()); +} + +template<class T> +void Parser<T>::insert_bytecode_alternation(Vector<ByteCodeValue>& stack, Vector<ByteCodeValue>&& left, Vector<ByteCodeValue>&& right) +{ + + // FORKSTAY _ALT + // REGEXP ALT1 + // JUMP _END + // LABEL _ALT + // REGEXP ALT2 + // LABEL _END + + stack.empend(OpCode::ForkJump); + stack.empend(left.size() + 2); // Jump to the _ALT label + + for (auto& op : left) + stack.append(move(op)); + + stack.empend(OpCode::Jump); + stack.empend(right.size()); // Jump to the _END label + + // LABEL _ALT = bytecode.size() + 2 + + for (auto& op : right) + stack.append(move(op)); + + // LABEL _END = alterantive_bytecode.size +} + +template<class T> +void Parser<T>::insert_bytecode_repetition_min_max(Vector<ByteCodeValue>& bytecode_to_repeat, size_t minimum, Optional<size_t> maximum) +{ + Vector<ByteCodeValue> new_bytecode; + insert_bytecode_repetition_n(new_bytecode, bytecode_to_repeat, minimum); + + if (maximum.has_value()) { + if (maximum.value() > minimum) { + auto diff = maximum.value() - minimum; + new_bytecode.empend(OpCode::ForkStay); + new_bytecode.empend(diff * (bytecode_to_repeat.size() + 2)); // Jump to the _END label + + for (size_t i = 0; i < diff; ++i) { + new_bytecode.append(bytecode_to_repeat); + new_bytecode.empend(OpCode::ForkStay); + new_bytecode.empend((diff - i - 1) * (bytecode_to_repeat.size() + 2)); // Jump to the _END label + } + } + } else { + // no maximum value set, repeat finding if possible + new_bytecode.empend(OpCode::ForkJump); + new_bytecode.empend(-bytecode_to_repeat.size() - 2); // Jump to the last iteration + } + + bytecode_to_repeat = move(new_bytecode); +} + +template<class T> +void Parser<T>::insert_bytecode_repetition_n(Vector<ByteCodeValue>& stack, Vector<ByteCodeValue>& bytecode_to_repeat, size_t n) +{ + for (size_t i = 0; i < n; ++i) + stack.append(bytecode_to_repeat); +} + +template<class T> +void Parser<T>::insert_bytecode_repetition_min_one(Vector<ByteCodeValue>& bytecode_to_repeat, bool greedy) +{ + // LABEL _START = -bytecode_to_repeat.size() + // REGEXP + // FORKJUMP _START (FORKSTAY -> Greedy) + + if (greedy) + bytecode_to_repeat.empend(OpCode::ForkStay); + else + bytecode_to_repeat.empend(OpCode::ForkJump); + + bytecode_to_repeat.empend(-bytecode_to_repeat.size() - 1); // Jump to the _START label +} + +template<class T> +void Parser<T>::insert_bytecode_repetition_any(Vector<ByteCodeValue>& bytecode_to_repeat, bool greedy) +{ + // LABEL _START + // FORKSTAY _END (FORKJUMP -> Greedy) + // REGEXP + // JUMP _START + // LABEL _END + + // LABEL _START = stack.size(); + Vector<ByteCodeValue> bytecode; + + if (greedy) + bytecode.empend(OpCode::ForkJump); + else + bytecode.empend(OpCode::ForkStay); + + bytecode.empend(bytecode_to_repeat.size() + 2); // Jump to the _END label + + for (auto& op : bytecode_to_repeat) + bytecode.append(move(op)); + + bytecode.empend(OpCode::Jump); + bytecode.empend(-bytecode.size() - 1); // Jump to the _START label + // LABEL _END = bytecode.size() + + bytecode_to_repeat = move(bytecode); +} + +template<class T> +void Parser<T>::insert_bytecode_repetition_zero_or_one(Vector<ByteCodeValue>& bytecode_to_repeat, bool greedy) +{ + // FORKSTAY _END (FORKJUMP -> Greedy) + // REGEXP + // LABEL _END + Vector<ByteCodeValue> bytecode; + + if (greedy) + bytecode.empend(OpCode::ForkJump); + else + bytecode.empend(OpCode::ForkStay); + + bytecode.empend(bytecode_to_repeat.size()); // Jump to the _END label + + for (auto& op : bytecode_to_repeat) + bytecode.append(move(op)); + // LABEL _END = bytecode.size() + + bytecode_to_repeat = move(bytecode); +} + +// ============================= +// PosixExtended Parser +// ============================= + +bool PosixExtendedParser::parse_internal(Vector<ByteCodeValue>& stack, size_t& match_length_minimum) +{ + return parse_root(stack, match_length_minimum); +} + +bool PosixExtendedParser::match_repetition_symbol() +{ + auto type = m_parser_state.current_token.type(); + return (type == TokenType::Asterisk + || type == TokenType::Plus + || type == TokenType::Questionmark + || type == TokenType::LeftCurly); +} + +bool PosixExtendedParser::match_ordinary_characters() +{ + // NOTE: This method must not be called during bracket and repetition parsing! + // FIXME: Add assertion for that? + auto type = m_parser_state.current_token.type(); + return (type == TokenType::OrdinaryCharacter + || type == TokenType::Comma + || type == TokenType::Slash + || type == TokenType::EqualSign + || type == TokenType::HyphenMinus + || type == TokenType::Colon); +} + +bool PosixExtendedParser::parse_repetition_symbol(Vector<ByteCodeValue>& bytecode_to_repeat, size_t& match_length_minimum) +{ + if (match(TokenType::LeftCurly)) { + consume(); + + StringBuilder number_builder; + bool ok; + + while (match(TokenType::OrdinaryCharacter)) { + number_builder.append(consume().value()); + } + + size_t minimum = number_builder.build().to_uint(ok); + if (!ok) + return set_error(Error::InvalidBraceContent); + + match_length_minimum *= minimum; + + if (match(TokenType::Comma)) { + consume(); + } else { + Vector<ByteCodeValue> bytecode; + insert_bytecode_repetition_n(bytecode, bytecode_to_repeat, minimum); + bytecode_to_repeat = move(bytecode); + return !has_error(); + } + + Optional<size_t> maximum {}; + number_builder.clear(); + while (match(TokenType::OrdinaryCharacter)) { + number_builder.append(consume().value()); + } + if (!number_builder.is_empty()) { + maximum = number_builder.build().to_uint(ok); + if (!ok || minimum > maximum.value()) + return set_error(Error::InvalidBraceContent); + } + + insert_bytecode_repetition_min_max(bytecode_to_repeat, minimum, maximum); + + consume(TokenType::RightCurly, Error::BraceMismatch); + return !has_error(); + + } else if (match(TokenType::Plus)) { + consume(); + + bool greedy = match(TokenType::Questionmark); + if (greedy) + consume(); + + // Note: dont touch match_length_minimum, it's already correct + insert_bytecode_repetition_min_one(bytecode_to_repeat, greedy); + return !has_error(); + + } else if (match(TokenType::Asterisk)) { + consume(); + match_length_minimum = 0; + + bool greedy = match(TokenType::Questionmark); + if (greedy) + consume(); + + insert_bytecode_repetition_any(bytecode_to_repeat, greedy); + + return !has_error(); + + } else if (match(TokenType::Questionmark)) { + consume(); + match_length_minimum = 0; + + bool greedy = match(TokenType::Questionmark); + if (greedy) + consume(); + + insert_bytecode_repetition_zero_or_one(bytecode_to_repeat, greedy); + return !has_error(); + } + + return false; +} + +bool PosixExtendedParser::parse_bracket_expression(Vector<ByteCodeValue>& stack, size_t& match_length_minimum) +{ + Vector<CompareTypeAndValuePair> values; + + for (;;) { + + if (match(TokenType::HyphenMinus)) { + consume(); + if (values.is_empty() || (values.size() == 1 && values.last().type == CharacterCompareType::Inverse)) { + // first in the bracket expression + values.append({ CharacterCompareType::OrdinaryCharacter, { '-' } }); + + } else if (match(TokenType::RightBracket)) { + // Last in the bracket expression + values.append({ CharacterCompareType::OrdinaryCharacter, { '-' } }); + + } else if (values.last().type == CharacterCompareType::OrdinaryCharacter) { + + values.append({ CharacterCompareType::RangeExpressionDummy, 0 }); + + if (match(TokenType::HyphenMinus)) { + consume(); + // Valid range, add ordinary character + values.append({ CharacterCompareType::OrdinaryCharacter, { '-' } }); + } + } else { + return set_error(Error::InvalidRange); + } + + } else if (match(TokenType::OrdinaryCharacter) || match(TokenType::Period) || match(TokenType::Asterisk) || match(TokenType::EscapeSequence) || match(TokenType::Plus)) { + values.append({ CharacterCompareType::OrdinaryCharacter, { *consume().value().characters_without_null_termination() } }); + + } else if (match(TokenType::Circumflex)) { + auto t = consume(); + + if (values.is_empty()) + values.append({ CharacterCompareType::Inverse, 0 }); + else + values.append({ CharacterCompareType::OrdinaryCharacter, { *t.value().characters_without_null_termination() } }); + + } else if (match(TokenType::LeftBracket)) { + consume(); + + if (match(TokenType::Period)) { + consume(); + + // FIXME: Parse collating element, this is needed when we have locale support + // This could have impact on length parameter, I guess. + ASSERT_NOT_REACHED(); + + consume(TokenType::Period, Error::InvalidCollationElement); + consume(TokenType::RightBracket, Error::BracketMismatch); + + } else if (match(TokenType::EqualSign)) { + consume(); + // FIXME: Parse collating element, this is needed when we have locale support + // This could have impact on length parameter, I guess. + ASSERT_NOT_REACHED(); + + consume(TokenType::EqualSign, Error::InvalidCollationElement); + consume(TokenType::RightBracket, Error::BracketMismatch); + + } else if (match(TokenType::Colon)) { + consume(); + + CharacterClass ch_class; + // parse character class + if (match(TokenType::OrdinaryCharacter)) { + if (consume("alnum")) + ch_class = CharacterClass::Alnum; + else if (consume("alpha")) + ch_class = CharacterClass::Alpha; + else if (consume("blank")) + ch_class = CharacterClass::Blank; + else if (consume("cntrl")) + ch_class = CharacterClass::Cntrl; + else if (consume("digit")) + ch_class = CharacterClass::Digit; + else if (consume("graph")) + ch_class = CharacterClass::Graph; + else if (consume("lower")) + ch_class = CharacterClass::Lower; + else if (consume("print")) + ch_class = CharacterClass::Print; + else if (consume("punct")) + ch_class = CharacterClass::Punct; + else if (consume("space")) + ch_class = CharacterClass::Space; + else if (consume("upper")) + ch_class = CharacterClass::Upper; + else if (consume("xdigit")) + ch_class = CharacterClass::Xdigit; + else + return set_error(Error::InvalidCharacterClass); + + values.append({ CharacterCompareType::CharacterClass, ch_class }); + + } else + return set_error(Error::InvalidCharacterClass); + + // FIXME: we do not support locale specific character classes until locales are implemented + + consume(TokenType::Colon, Error::InvalidCharacterClass); + consume(TokenType::RightBracket, Error::BracketMismatch); + } + } else if (match(TokenType::RightBracket)) { + + if (values.is_empty() || (values.size() == 1 && values.last().type == CharacterCompareType::Inverse)) { + // handle bracket as ordinary character + values.append({ CharacterCompareType::OrdinaryCharacter, { *consume().value().characters_without_null_termination() } }); + } else { + // closing bracket expression + break; + } + } else + // nothing matched, this is a failure, as at least the closing bracket must match... + return set_error(Error::BracketMismatch); + + // check if range expression has to be completed... + if (values.size() >= 3 && values.at(values.size() - 2).type == CharacterCompareType::RangeExpressionDummy) { + if (values.last().type != CharacterCompareType::OrdinaryCharacter) + return set_error(Error::InvalidRange); + + auto value2 = values.take_last(); + values.take_last(); // RangeExpressionDummy + auto value1 = values.take_last(); + + values.append({ CharacterCompareType::RangeExpression, ByteCodeValue { value1.value.ch, value2.value.ch } }); + } + } + + if (values.size()) + match_length_minimum = 1; + + if (values.first().type == CharacterCompareType::Inverse) + match_length_minimum = 0; + + insert_bytecode_compare_values(stack, move(values)); + + return !has_error(); +} + +bool PosixExtendedParser::parse_sub_expression(Vector<ByteCodeValue>& stack, size_t& match_length_minimum) +{ + Vector<ByteCodeValue> bytecode; + size_t length = 0; + bool should_parse_repetition_symbol { false }; + + for (;;) { + if (match_ordinary_characters()) { + Token start_token = m_parser_state.current_token; + Token last_token = m_parser_state.current_token; + for (;;) { + if (!match_ordinary_characters()) + break; + ++length; + last_token = consume(); + } + + if (length > 1) { + stack.empend(OpCode::Compare); + stack.empend(1ul); // number of arguments + stack.empend(CharacterCompareType::OrdinaryCharacters); + stack.empend(start_token.value().characters_without_null_termination()); + stack.empend(length - ((match_repetition_symbol() && length > 1) ? 1 : 0)); // last character is inserted into 'bytecode' for duplication symbol handling + } + + if ((match_repetition_symbol() && length > 1) || length == 1) // Create own compare opcode for last character before duplication symbol + insert_bytecode_compare_values(bytecode, { { CharacterCompareType::OrdinaryCharacter, { last_token.value().characters_without_null_termination()[0] } } }); + + should_parse_repetition_symbol = true; + break; + } + + if (match_repetition_symbol()) + return set_error(Error::InvalidRepetitionMarker); + + if (match(TokenType::Period)) { + length = 1; + consume(); + insert_bytecode_compare_values(bytecode, { { CharacterCompareType::AnySingleCharacter, { 0 } } }); + should_parse_repetition_symbol = true; + break; + } + + if (match(TokenType::EscapeSequence)) { + length = 1; + Token t = consume(); +#ifdef REGEX_DEBUG + printf("[PARSER] EscapeSequence with substring %s\n", String(t.value()).characters()); +#endif + + insert_bytecode_compare_values(bytecode, { { CharacterCompareType::OrdinaryCharacter, { (char)t.value().characters_without_null_termination()[1] } } }); + should_parse_repetition_symbol = true; + break; + } + + if (match(TokenType::LeftBracket)) { + consume(); + + Vector<ByteCodeValue> sub_ops; + if (!parse_bracket_expression(sub_ops, length) || !sub_ops.size()) + return set_error(Error::BracketMismatch); + + bytecode.append(move(sub_ops)); + + consume(TokenType::RightBracket); + should_parse_repetition_symbol = true; + break; + } + + if (match(TokenType::RightBracket)) { + return set_error(Error::BracketMismatch); + } + + if (match(TokenType::RightCurly)) { + return set_error(Error::BraceMismatch); + } + + if (match(TokenType::Circumflex)) { + consume(); + bytecode.empend(OpCode::CheckBegin); + break; + } + + if (match(TokenType::Dollar)) { + consume(); + bytecode.empend(OpCode::CheckEnd); + break; + } + + if (match(TokenType::LeftParen)) { + consume(); + Optional<StringView> capture_group_name; + bool no_subexpression_match_qualifier = false; + if (match(TokenType::Questionmark)) { + consume(); + + if (match(TokenType::Colon)) { + consume(); + no_subexpression_match_qualifier = true; + } else if (consume("<")) { // named capturing group + + Token start_token = m_parser_state.current_token; + Token last_token = m_parser_state.current_token; + size_t capture_group_name_length = 0; + for (;;) { + if (!match_ordinary_characters()) + return set_error(Error::InvalidNameForCaptureGroup); + if (match(TokenType::OrdinaryCharacter) && m_parser_state.current_token.value()[0] == '>') { + consume(); + break; + } + ++capture_group_name_length; + last_token = consume(); + } + capture_group_name = StringView(start_token.value().characters_without_null_termination(), capture_group_name_length); + + } else if (match(TokenType::EqualSign)) { // positive lookahead + consume(); + ASSERT_NOT_REACHED(); + } else if (consume("!")) { // negative lookahead + ASSERT_NOT_REACHED(); + } else if (consume("<")) { + if (match(TokenType::EqualSign)) { // positive lookbehind + consume(); + ASSERT_NOT_REACHED(); + } + if (consume("!")) // negative lookbehind + ASSERT_NOT_REACHED(); + } else { + return set_error(Error::InvalidRepetitionMarker); + } + } + + if (!(m_parser_state.regex_options & (u8)AllFlags::NoSubExpressions || no_subexpression_match_qualifier)) { + if (capture_group_name.has_value()) + insert_bytecode_group_capture_left(bytecode, capture_group_name.value()); + else + insert_bytecode_group_capture_left(bytecode); + } + + Vector<ByteCodeValue> capture_group_bytecode; + + bool res = !parse_root(capture_group_bytecode, length); + + if (capture_group_bytecode.is_empty() && match(TokenType::RightParen)) + return set_error(Error::ParenEmpty); + + if (!res) + return false; + + bytecode.append(move(capture_group_bytecode)); + + consume(TokenType::RightParen, Error::ParenMismatch); + + if (!(m_parser_state.regex_options & (u8)AllFlags::NoSubExpressions || no_subexpression_match_qualifier)) { + if (capture_group_name.has_value()) { + insert_bytecode_group_capture_right(bytecode, capture_group_name.value()); + ++m_parser_state.named_capture_groups_count; + } else { + insert_bytecode_group_capture_right(bytecode); + ++m_parser_state.capture_groups_count; + } + } + should_parse_repetition_symbol = true; + break; + } + + return false; + } + + if (match_repetition_symbol()) { + if (should_parse_repetition_symbol) + parse_repetition_symbol(bytecode, length); + else + return set_error(Error::InvalidRepetitionMarker); + } + + stack.append(move(bytecode)); + match_length_minimum += length; + + return true; +} + +bool PosixExtendedParser::parse_root(Vector<ByteCodeValue>& stack, size_t& match_length_minimum) +{ + Vector<ByteCodeValue> bytecode_left; + size_t match_length_minimum_left { 0 }; + + if (match_repetition_symbol()) + return set_error(Error::InvalidRepetitionMarker); + + for (;;) { + if (!parse_sub_expression(bytecode_left, match_length_minimum_left)) + break; + + if (match(TokenType::Pipe)) { + consume(); + + Vector<ByteCodeValue> bytecode_right; + size_t match_length_minimum_right { 0 }; + + if (!parse_root(bytecode_right, match_length_minimum_right) || bytecode_right.is_empty()) + return set_error(Error::InvalidPattern); + + Vector<ByteCodeValue> new_bytecode; + insert_bytecode_alternation(new_bytecode, move(bytecode_left), move(bytecode_right)); + bytecode_left = move(new_bytecode); + match_length_minimum_left = min(match_length_minimum_right, match_length_minimum_left); + } + } + + stack.append(move(bytecode_left)); + match_length_minimum = match_length_minimum_left; + return !has_error(); +} +} +} diff --git a/Libraries/LibRegex/RegexParser.h b/Libraries/LibRegex/RegexParser.h new file mode 100644 index 0000000000..a94dfa3a28 --- /dev/null +++ b/Libraries/LibRegex/RegexParser.h @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#pragma once + +#include "RegexByteCode.h" +#include "RegexError.h" +#include "RegexLexer.h" +#include "RegexOptions.h" + +#include <AK/Forward.h> +#include <AK/StringBuilder.h> +#include <AK/Types.h> +#include <AK/Vector.h> + +namespace regex { + +class PosixExtendedParser; + +template<typename T> +struct GenericParserTraits { + using OptionsType = T; +}; + +template<typename T> +struct ParserTraits : public GenericParserTraits<T> { +}; + +template<> +struct ParserTraits<PosixExtendedParser> : public GenericParserTraits<PosixOptions> { +}; + +class Parser { +public: + struct Result { + ByteCode bytecode; + size_t capture_groups_count; + size_t named_capture_groups_count; + size_t match_length_minimum; + Error error; + Token error_token; + }; + + explicit Parser(Lexer& lexer) + : m_parser_state(lexer) + { + } + + Parser(Lexer& lexer, AllOptions regex_options) + : m_parser_state(lexer, regex_options) + { + } + + virtual ~Parser() = default; + + Result parse(Optional<AllOptions> regex_options = {}); + bool has_error() const { return m_parser_state.error != Error::NoError; } + Error error() const { return m_parser_state.error; } + +protected: + virtual bool parse_internal(ByteCode&, size_t& match_length_minimum) = 0; + + ALWAYS_INLINE bool match(TokenType type) const; + ALWAYS_INLINE bool match(char ch) const; + ALWAYS_INLINE Token consume(); + ALWAYS_INLINE Token consume(TokenType type, Error error); + ALWAYS_INLINE bool consume(const String&); + ALWAYS_INLINE void reset(); + ALWAYS_INLINE bool done() const; + ALWAYS_INLINE bool set_error(Error error); + + struct ParserState { + Lexer& lexer; + Token current_token; + Error error = Error::NoError; + Token error_token { TokenType::Eof, 0, StringView(nullptr) }; + ByteCode bytecode; + size_t capture_groups_count { 0 }; + size_t named_capture_groups_count { 0 }; + size_t match_length_minimum { 0 }; + AllOptions regex_options; + explicit ParserState(Lexer& lexer) + : lexer(lexer) + , current_token(lexer.next()) + { + } + explicit ParserState(Lexer& lexer, AllOptions regex_options) + : lexer(lexer) + , current_token(lexer.next()) + , regex_options(regex_options) + { + } + }; + + ParserState m_parser_state; +}; + +class PosixExtendedParser final : public Parser { +public: + explicit PosixExtendedParser(Lexer& lexer) + : Parser(lexer) {}; + PosixExtendedParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options) + : Parser(lexer, regex_options.value_or({})) {}; + ~PosixExtendedParser() = default; + +private: + ALWAYS_INLINE bool match_repetition_symbol(); + ALWAYS_INLINE bool match_ordinary_characters(); + + bool parse_internal(ByteCode&, size_t&) override; + + bool parse_root(ByteCode&, size_t&); + ALWAYS_INLINE bool parse_sub_expression(ByteCode&, size_t&); + ALWAYS_INLINE bool parse_bracket_expression(ByteCode&, size_t&); + ALWAYS_INLINE bool parse_repetition_symbol(ByteCode&, size_t&); +}; + +using PosixExtended = PosixExtendedParser; + +} + +using regex::PosixExtended; diff --git a/Libraries/LibRegex/RegexParser.h_ b/Libraries/LibRegex/RegexParser.h_ new file mode 100644 index 0000000000..2f0dbed638 --- /dev/null +++ b/Libraries/LibRegex/RegexParser.h_ @@ -0,0 +1,257 @@ +/* + * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#pragma once + +#include "RegexError.h" +#include "RegexLexer.h" +#include "RegexOptions.h" +#include <AK/Forward.h> +#include <AK/Types.h> +#include <AK/Vector.h> + +namespace AK { +namespace regex { + +#define ENUMERATE_OPCODES \ + __ENUMERATE_OPCODE(Compare) \ + __ENUMERATE_OPCODE(Jump) \ + __ENUMERATE_OPCODE(ForkJump) \ + __ENUMERATE_OPCODE(ForkStay) \ + __ENUMERATE_OPCODE(SaveLeftCaptureGroup) \ + __ENUMERATE_OPCODE(SaveRightCaptureGroup) \ + __ENUMERATE_OPCODE(SaveLeftNamedCaptureGroup) \ + __ENUMERATE_OPCODE(SaveRightNamedCaptureGroup) \ + __ENUMERATE_OPCODE(CheckBegin) \ + __ENUMERATE_OPCODE(CheckEnd) \ + __ENUMERATE_OPCODE(Exit) + +enum class OpCode : u8 { +#define __ENUMERATE_OPCODE(x) x, + ENUMERATE_OPCODES +#undef __ENUMERATE_OPCODE +}; + +enum class CharacterCompareType : u8 { + Undefined, + Inverse, + AnySingleCharacter, + OrdinaryCharacter, + OrdinaryCharacters, + CharacterClass, + RangeExpression, + RangeExpressionDummy, +}; + +enum class CharacterClass : u8 { + Alnum, + Cntrl, + Lower, + Space, + Alpha, + Digit, + Print, + Upper, + Blank, + Graph, + Punct, + Xdigit, +}; + +class ByteCodeValue { +public: + union CompareValue { + CompareValue(const CharacterClass value) + : character_class(value) + { + } + CompareValue(const char value1, const char value2) + : range_values { value1, value2 } + { + } + const CharacterClass character_class; + const struct { + const char from; + const char to; + } range_values; + }; + + union { + const OpCode op_code; + const char* string; + const char ch; + const int number; + const size_t positive_number; + const CompareValue compare_value; + const CharacterCompareType compare_type; + }; + + const char* name() const; + static const char* name(OpCode); + + ByteCodeValue(const OpCode value) + : op_code(value) + { + } + ByteCodeValue(const char* value) + : string(value) + { + } + ByteCodeValue(const char value) + : ch(value) + { + } + ByteCodeValue(const int value) + : number(value) + { + } + ByteCodeValue(const size_t value) + : positive_number(value) + { + } + ByteCodeValue(const CharacterClass value) + : compare_value(value) + { + } + ByteCodeValue(const char value1, const char value2) + : compare_value(value1, value2) + { + } + ByteCodeValue(const CharacterCompareType value) + : compare_type(value) + { + } + + ~ByteCodeValue() = default; +}; + +struct CompareTypeAndValuePair { + CharacterCompareType type; + ByteCodeValue value; +}; + +struct ParserResult { + Vector<ByteCodeValue> m_bytes; + size_t m_match_groups; + size_t m_min_match_length; + Error m_error; + Token m_error_token; +}; + +template<class T> +class Parser { +public: + explicit Parser(Lexer& lexer) + : m_parser_state(lexer) + { + } + + Parser(Lexer& lexer, T options) + : m_parser_state(lexer, options) + { + } + + virtual ~Parser() = default; + + virtual ParserResult parse(T options = {}, EngineOptions engine_options = {}); + bool has_error() const { return m_parser_state.m_error != Error::NoError; } + Error error() const { return m_parser_state.m_error; } + +protected: + virtual bool parse_internal(Vector<ByteCodeValue>&, size_t& min_length) = 0; + + bool match(TokenType type) const; + bool match(char ch) const; + Token consume(); + Token consume(TokenType type, Error error = Error::InvalidPattern); + bool consume(const String&); + void reset(); + bool done() const; + + bool set_error(Error error); + + void insert_bytecode_compare_values(Vector<ByteCodeValue>&, Vector<CompareTypeAndValuePair>&&); + void insert_bytecode_group_capture_left(Vector<ByteCodeValue>& stack); + void insert_bytecode_group_capture_right(Vector<ByteCodeValue>& stack); + void insert_bytecode_group_capture_left(Vector<ByteCodeValue>& stack, const StringView& name); + void insert_bytecode_group_capture_right(Vector<ByteCodeValue>& stack, const StringView& name); + void insert_bytecode_alternation(Vector<ByteCodeValue>& stack, Vector<ByteCodeValue>&&, Vector<ByteCodeValue>&&); + void insert_bytecode_repetition_min_max(Vector<ByteCodeValue>& bytecode_to_repeat, size_t minimum, Optional<size_t> maximum); + void insert_bytecode_repetition_n(Vector<ByteCodeValue>& stack, Vector<ByteCodeValue>& bytecode_to_repeat, size_t n); + void insert_bytecode_repetition_min_one(Vector<ByteCodeValue>& bytecode_to_repeat, bool greedy); + void insert_bytecode_repetition_any(Vector<ByteCodeValue>& bytecode_to_repeat, bool greedy); + void insert_bytecode_repetition_zero_or_one(Vector<ByteCodeValue>& bytecode_to_repeat, bool greedy); + + struct ParserState { + Lexer& lexer; + Token current_token; + Error error = Error::NoError; + Token error_token { TokenType::Eof, 0, StringView(nullptr) }; + Vector<ByteCodeValue> bytecode; + size_t capture_groups_count { 0 }; + size_t named_capture_groups_count { 0 }; + size_t match_length_minimum { 0 }; + OptionsType regex_options; + explicit ParserState(Lexer& lexer) + : lexer(lexer) + , current_token(lexer.next()) + { + } + explicit ParserState(Lexer& lexer, Optional<OptionsType> regex_options) + : lexer(lexer) + , current_token(lexer.next()) + , regex_options(regex_options.value_or({})) + { + } + }; + + ParserState m_parser_state; +}; + +class PosixExtendedParser final : public Parser<PosixOptions> { +public: + explicit PosixExtendedParser(Lexer& lexer) + : Parser(lexer) {}; + PosixExtendedParser(Lexer& lexer, Optional<OptionsType> regex_options) + : Parser(lexer, regex_options) {}; + ~PosixExtendedParser() = default; + +private: + bool match_repetition_symbol(); + bool match_ordinary_characters(); + + bool parse_internal(Vector<ByteCodeValue>&, size_t&) override; + + bool parse_root(Vector<ByteCodeValue>&, size_t&); + bool parse_sub_expression(Vector<ByteCodeValue>&, size_t&); + bool parse_bracket_expression(Vector<ByteCodeValue>&, size_t&); + bool parse_repetition_symbol(Vector<ByteCodeValue>&, size_t&); +}; +} +} + +using AK::regex::ParserResult; +using AK::regex::PosixExtendedParser; diff --git a/Libraries/LibRegex/Tests/Benchmark.cpp b/Libraries/LibRegex/Tests/Benchmark.cpp new file mode 100644 index 0000000000..fa90c2f80f --- /dev/null +++ b/Libraries/LibRegex/Tests/Benchmark.cpp @@ -0,0 +1,914 @@ +/* + * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <AK/TestSuite.h> // import first, to prevent warning of ASSERT* redefinition + +#include <LibRegex/Regex.h> +#include <stdio.h> + +#ifndef REGEX_DEBUG + +# define BENCHMARK_LOOP_ITERATIONS 100000 + +//# define REGEX_BENCHMARK_OUR +# ifndef __serenity__ +//# define REGEX_BENCHMARK_OTHER +# endif + +# if defined(REGEX_BENCHMARK_OTHER) +# include <regex> +# endif + +# if defined(REGEX_BENCHMARK_OUR) +BENCHMARK_CASE(catch_all_benchmark) +{ + Regex<PosixExtended> re("^.*$"); + RegexResult m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT(re.match("Hello World", m)); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OTHER) +BENCHMARK_CASE(catch_all_benchmark_reference_stdcpp_regex_match) +{ + std::regex re("^.*$"); + std::cmatch m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(std::regex_match("Hello World", m, re), true); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OUR) +BENCHMARK_CASE(simple_start_benchmark) +{ + Regex<PosixExtended> re("^hello friends"); + RegexResult m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(re.match("Hello!", m), false); + EXPECT_EQ(re.match("hello friends", m), true); + EXPECT_EQ(re.match("Well, hello friends", m), false); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OTHER) +BENCHMARK_CASE(simple_start_benchmark_reference_stdcpp_regex_match) +{ + std::regex re("^hello friends"); + std::cmatch m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(std::regex_match("Hello", m, re), false); + EXPECT_EQ(std::regex_match("hello friends", m, re), true); + EXPECT_EQ(std::regex_match("Well, hello friends", m, re), false); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OUR) +BENCHMARK_CASE(simple_end_benchmark) +{ + Regex<PosixExtended> re(".*hello\\.\\.\\. there$"); + RegexResult m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(re.match("Hallo", m), false); + EXPECT_EQ(re.match("I said fyhello... there", m), true); + EXPECT_EQ(re.match("ahello... therea", m), false); + EXPECT_EQ(re.match("hello.. there", m), false); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OTHER) +BENCHMARK_CASE(simple_end_benchmark_reference_stdcpp_regex_search) +{ + std::regex re(".*hello\\.\\.\\. there$"); + std::cmatch m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(std::regex_search("Hallo", m, re), false); + EXPECT_EQ(std::regex_search("I said fyhello... there", m, re), true); + EXPECT_EQ(std::regex_search("ahello... therea", m, re), false); + EXPECT_EQ(std::regex_search("hello.. there", m, re), false); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OUR) +BENCHMARK_CASE(simple_period_benchmark) +{ + Regex<PosixExtended> re("hello."); + RegexResult m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(re.match("Hello1", m), false); + EXPECT_EQ(re.match("hello1", m), true); + EXPECT_EQ(re.match("hello2", m), true); + EXPECT_EQ(re.match("hello?", m), true); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OTHER) +BENCHMARK_CASE(simple_period_benchmark_reference_stdcpp_regex_match) +{ + std::regex re("hello."); + std::cmatch m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(std::regex_match("Hello1", m, re), false); + EXPECT_EQ(std::regex_match("hello1", m, re), true); + EXPECT_EQ(std::regex_match("hello2", m, re), true); + EXPECT_EQ(std::regex_match("hello?", m, re), true); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OUR) +BENCHMARK_CASE(simple_period_end_benchmark) +{ + Regex<PosixExtended> re("hello.$"); + RegexResult m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(re.search("Hello1", m), false); + EXPECT_EQ(re.search("hello1hello1", m), true); + EXPECT_EQ(re.search("hello2hell", m), false); + EXPECT_EQ(re.search("hello?", m), true); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OTHER) +BENCHMARK_CASE(simple_period_end_benchmark_reference_stdcpp_regex_search) +{ + std::regex re("hello.$"); + std::cmatch m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(std::regex_search("Hello1", m, re), false); + EXPECT_EQ(std::regex_search("hello1hello1", m, re), true); + EXPECT_EQ(std::regex_search("hello2hell", m, re), false); + EXPECT_EQ(std::regex_search("hello?", m, re), true); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OUR) +BENCHMARK_CASE(simple_escaped_benchmark) +{ + Regex<PosixExtended> re("hello\\."); + RegexResult m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(re.match("hello", m), false); + EXPECT_EQ(re.match("hello.", m), true); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OTHER) +BENCHMARK_CASE(simple_escaped_benchmark_reference_stdcpp_regex_match) +{ + std::regex re("hello\\."); + std::cmatch m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(std::regex_match("hello", m, re), false); + EXPECT_EQ(std::regex_match("hello.", m, re), true); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OUR) +BENCHMARK_CASE(simple_period2_end_benchmark) +{ + Regex<PosixExtended> re(".*hi... there$"); + RegexResult m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(re.search("Hello there", m), false); + EXPECT_EQ(re.search("I said fyhi... there", m), true); + EXPECT_EQ(re.search("....hi... ", m), false); + EXPECT_EQ(re.search("I said fyhihii there", m), true); + EXPECT_EQ(re.search("I said fyhihi there", m), false); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OTHER) +BENCHMARK_CASE(simple_period2_end_benchmark_reference_stdcpp_regex_search) +{ + std::regex re(".*hi... there$"); + std::cmatch m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(std::regex_search("Hello there", m, re), false); + EXPECT_EQ(std::regex_search("I said fyhi... there", m, re), true); + EXPECT_EQ(std::regex_search("....hi... ", m, re), false); + EXPECT_EQ(std::regex_search("I said fyhihii there", m, re), true); + EXPECT_EQ(std::regex_search("I said fyhihi there", m, re), false); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OUR) +BENCHMARK_CASE(simple_plus_benchmark) +{ + Regex<PosixExtended> re("a+"); + RegexResult m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(re.search("b", m), false); + EXPECT_EQ(re.search("a", m), true); + EXPECT_EQ(re.search("aaaaaabbbbb", m), true); + EXPECT_EQ(re.search("aaaaaaaaaaa", m), true); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OTHER) +BENCHMARK_CASE(simple_plus_benchmark_reference_stdcpp_regex_search) +{ + std::regex re("a+"); + std::cmatch m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(std::regex_search("b", m, re), false); + EXPECT_EQ(std::regex_search("a", m, re), true); + EXPECT_EQ(std::regex_search("aaaaaabbbbb", m, re), true); + EXPECT_EQ(std::regex_search("aaaaaaaaaaa", m, re), true); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OUR) +BENCHMARK_CASE(simple_questionmark_benchmark) +{ + Regex<PosixExtended> re("da?d"); + RegexResult m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(re.search("a", m), false); + EXPECT_EQ(re.search("daa", m), false); + EXPECT_EQ(re.search("ddddd", m), true); + EXPECT_EQ(re.search("dd", m), true); + EXPECT_EQ(re.search("dad", m), true); + EXPECT_EQ(re.search("dada", m), true); + EXPECT_EQ(re.search("adadaa", m), true); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OTHER) +BENCHMARK_CASE(simple_questionmark_benchmark_reference_stdcpp_regex_search) +{ + std::regex re("da?d"); + std::cmatch m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(std::regex_search("a", m, re), false); + EXPECT_EQ(std::regex_search("daa", m, re), false); + EXPECT_EQ(std::regex_search("ddddd", m, re), true); + EXPECT_EQ(std::regex_search("dd", m, re), true); + EXPECT_EQ(std::regex_search("dad", m, re), true); + EXPECT_EQ(std::regex_search("dada", m, re), true); + EXPECT_EQ(std::regex_search("adadaa", m, re), true); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OUR) +BENCHMARK_CASE(character_class_benchmark) +{ + Regex<PosixExtended> re("[[:alpha:]]"); + RegexResult m; + String haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n"; + + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(re.match(haystack.characters(), m), false); + EXPECT_EQ(re.search(haystack.characters(), m), true); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OTHER) +BENCHMARK_CASE(character_class_benchmark_reference_stdcpp_regex_search) +{ + std::regex re("[[:alpha:]]"); + std::cmatch m; + String haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n"; + + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(std::regex_match(haystack.characters(), m, re), false); + EXPECT_EQ(std::regex_search(haystack.characters(), m, re), true); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OUR) +BENCHMARK_CASE(escaped_char_questionmark_benchmark) +{ + Regex<PosixExtended> re("This\\.?And\\.?That"); + RegexResult m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(re.match("ThisAndThat", m), true); + EXPECT_EQ(re.match("This.And.That", m), true); + EXPECT_EQ(re.match("This And That", m), false); + EXPECT_EQ(re.match("This..And..That", m), false); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OTHER) +BENCHMARK_CASE(escaped_char_questionmark_benchmark_reference_stdcpp_regex_match) +{ + std::regex re("This\\.?And\\.?That"); + std::cmatch m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(std::regex_match("ThisAndThat", m, re), true); + EXPECT_EQ(std::regex_match("This.And.That", m, re), true); + EXPECT_EQ(std::regex_match("This And That", m, re), false); + EXPECT_EQ(std::regex_match("This..And..That", m, re), false); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OUR) +BENCHMARK_CASE(char_qualifier_asterisk_benchmark) +{ + Regex<PosixExtended> re("regex*"); + RegexResult m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(re.search("#include <regex.h>", m), true); + EXPECT_EQ(re.search("#include <stdio.h>", m), false); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OTHER) +BENCHMARK_CASE(char_qualifier_asterisk_benchmark_reference_stdcpp_regex_match) +{ + std::regex re("regex*"); + std::cmatch m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(std::regex_search("#include <regex.h>", m, re), true); + EXPECT_EQ(std::regex_search("#include <stdio.h>", m, re), false); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OUR) +BENCHMARK_CASE(parens_qualifier_questionmark_benchmark) +{ + Regex<PosixExtended> re("test(hello)?test"); + RegexResult m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(re.match("testtest", m), true); + EXPECT_EQ(re.match("testhellotest", m), true); + EXPECT_EQ(re.match("testasfdtest", m), false); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OTHER) +BENCHMARK_CASE(parens_qualifier_questionmark_benchmark_reference_stdcpp_regex_match) +{ + std::regex re("test(hello)?test"); + std::cmatch m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(std::regex_match("testtest", m, re), true); + EXPECT_EQ(std::regex_match("testhellotest", m, re), true); + EXPECT_EQ(std::regex_match("testasfdtest", m, re), false); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OUR) +BENCHMARK_CASE(parens_qualifier_asterisk_benchmark) +{ + Regex<PosixExtended> re("test(hello)*test"); + RegexResult m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(re.match("testtest", m), true); + EXPECT_EQ(re.match("testhellohellotest", m), true); + EXPECT_EQ(re.search("testhellohellotest, testhellotest", m), true); + EXPECT_EQ(re.match("aaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbb", m), false); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OTHER) +BENCHMARK_CASE(parens_qualifier_asterisk_benchmark_reference_stdcpp_regex_match) +{ + std::regex re("test(hello)*test"); + std::cmatch m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(std::regex_match("testtest", m, re), true); + EXPECT_EQ(std::regex_match("testhellohellotest", m, re), true); + EXPECT_EQ(std::regex_search("testhellohellotest, testhellotest", m, re), true); + EXPECT_EQ(std::regex_match("aaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbb", m, re), false); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OUR) +BENCHMARK_CASE(parens_qualifier_asterisk_2_benchmark) +{ + Regex<PosixExtended> re("test(.*)test"); + RegexResult m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(re.match("testasdftest", m), true); + EXPECT_EQ(re.match("testasdfasdftest", m), true); + EXPECT_EQ(re.search("testaaaatest, testbbbtest, testtest", m), true); + EXPECT_EQ(re.match("aaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbb", m), false); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OTHER) +BENCHMARK_CASE(parens_qualifier_asterisk_2_benchmark_reference_stdcpp_regex_match) +{ + std::regex re("test(.*)test"); + std::cmatch m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(std::regex_match("testasdftest", m, re), true); + EXPECT_EQ(std::regex_match("testasdfasdftest", m, re), true); + EXPECT_EQ(std::regex_search("testaaaatest, testbbbtest, testtest", m, re), true); + EXPECT_EQ(std::regex_match("aaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbb", m, re), false); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OUR) +BENCHMARK_CASE(multi_parens_qualifier_questionmark_benchmark) +{ + Regex<PosixExtended> re("test(a)?(b)?(c)?test"); + RegexResult m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(re.match("testtest", m), true); + EXPECT_EQ(re.match("testabctest", m), true); + EXPECT_EQ(re.search("testabctest, testactest", m), true); + EXPECT_EQ(re.match("aaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbb", m), false); + EXPECT_EQ(re.match("test", m), false); + EXPECT_EQ(re.match("whaaaaat", m), false); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OTHER) +BENCHMARK_CASE(multi_parens_qualifier_questionmark_benchmark_reference_stdcpp_regex_match) +{ + std::regex re("test(a)?(b)?(c)?test"); + std::cmatch m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(std::regex_match("testtest", m, re), true); + EXPECT_EQ(std::regex_match("testabctest", m, re), true); + EXPECT_EQ(std::regex_search("testabctest, testactest", m, re), true); + EXPECT_EQ(std::regex_match("aaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbb", m, re), false); + EXPECT_EQ(std::regex_match("test", m, re), false); + EXPECT_EQ(std::regex_match("whaaaaat", m, re), false); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OUR) +BENCHMARK_CASE(simple_alternative_benchmark) +{ + Regex<PosixExtended> re("test|hello|friends"); + RegexResult m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(re.match("test", m), true); + EXPECT_EQ(re.match("hello", m), true); + EXPECT_EQ(re.match("friends", m), true); + EXPECT_EQ(re.match("whaaaaat", m), false); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OTHER) +BENCHMARK_CASE(simple_alternative_benchmark_reference_stdcpp_regex_match) +{ + std::regex re("test|hello|friends"); + std::cmatch m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(std::regex_match("test", m, re), true); + EXPECT_EQ(std::regex_match("hello", m, re), true); + EXPECT_EQ(std::regex_match("friends", m, re), true); + EXPECT_EQ(std::regex_match("whaaaaat", m, re), false); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OUR) +BENCHMARK_CASE(alternative_match_groups_benchmark) +{ + Regex<PosixExtended> re("test(a)?(b)?|hello ?(dear|my)? friends"); + RegexResult m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(re.match("test", m), true); + EXPECT_EQ(re.match("testa", m), true); + EXPECT_EQ(re.match("testb", m), true); + EXPECT_EQ(re.match("hello friends", m), true); + EXPECT_EQ(re.match("hello dear friends", m), true); + EXPECT_EQ(re.match("hello my friends", m), true); + EXPECT_EQ(re.match("testabc", m), false); + EXPECT_EQ(re.match("hello test friends", m), false); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OTHER) +BENCHMARK_CASE(alternative_match_groups_benchmark_reference_stdcpp_regex_match) +{ + std::regex re("test(a)?(b)?|hello ?(dear|my)? friends"); + std::cmatch m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(std::regex_match("test", m, re), true); + EXPECT_EQ(std::regex_match("testa", m, re), true); + EXPECT_EQ(std::regex_match("testb", m, re), true); + EXPECT_EQ(std::regex_match("hello friends", m, re), true); + EXPECT_EQ(std::regex_match("hello dear friends", m, re), true); + EXPECT_EQ(std::regex_match("hello my friends", m, re), true); + EXPECT_EQ(std::regex_match("testabc", m, re), false); + EXPECT_EQ(std::regex_match("hello test friends", m, re), false); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OUR) +BENCHMARK_CASE(parens_qualifier_exact_benchmark) +{ + Regex<PosixExtended> re("(hello){3}"); + RegexResult m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(re.match("hello", m), false); + EXPECT_EQ(re.match("hellohellohello", m), true); + EXPECT_EQ(re.search("hellohellohellohello", m), true); + EXPECT_EQ(re.search("test hellohellohello", m), true); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OTHER) +BENCHMARK_CASE(parens_qualifier_exact_benchmark_reference_stdcpp_regex_match) +{ + std::regex re("(hello){3}"); + std::cmatch m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(std::regex_match("hello", m, re), false); + EXPECT_EQ(std::regex_match("hellohellohello", m, re), true); + EXPECT_EQ(std::regex_search("hellohellohellohello", m, re), true); + EXPECT_EQ(std::regex_search("test hellohellohello", m, re), true); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OUR) +BENCHMARK_CASE(parens_qualifier_minimum_benchmark) +{ + Regex<PosixExtended> re("(hello){3,}"); + RegexResult m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(re.match("hello", m), false); + EXPECT_EQ(re.match("hellohellohello", m), true); + EXPECT_EQ(re.search("hellohellohellohello", m), true); + EXPECT_EQ(re.search("test hellohellohello", m), true); + EXPECT_EQ(re.search("test hellohellohellohello", m), true); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OTHER) +BENCHMARK_CASE(parens_qualifier_minimum_benchmark_reference_stdcpp_regex_match) +{ + std::regex re("(hello){3,}"); + std::cmatch m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(std::regex_match("hello", m, re), false); + EXPECT_EQ(std::regex_match("hellohellohello", m, re), true); + EXPECT_EQ(std::regex_search("hellohellohellohello", m, re), true); + EXPECT_EQ(std::regex_search("test hellohellohello", m, re), true); + EXPECT_EQ(std::regex_search("test hellohellohellohello", m, re), true); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OUR) +BENCHMARK_CASE(parens_qualifier_maximum_benchmark) +{ + Regex<PosixExtended> re("(hello){2,3}"); + RegexResult m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(re.match("hello", m), false); + EXPECT_EQ(re.match("hellohellohello", m), true); + EXPECT_EQ(re.search("hellohellohellohello", m), true); + EXPECT_EQ(re.search("test hellohellohello", m), true); + EXPECT_EQ(re.search("test hellohellohellohello", m), true); + EXPECT_EQ(re.match("test hellohellohellohello", m), false); + EXPECT_EQ(re.search("test hellohellohellohello", m), true); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OTHER) +BENCHMARK_CASE(parens_qualifier_maximum_benchmark_reference_stdcpp_regex_match) +{ + std::regex re("(hello){2,3}"); + std::cmatch m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(std::regex_match("hello", m, re), false); + EXPECT_EQ(std::regex_match("hellohellohello", m, re), true); + EXPECT_EQ(std::regex_search("hellohellohellohello", m, re), true); + EXPECT_EQ(std::regex_search("test hellohellohello", m, re), true); + EXPECT_EQ(std::regex_search("test hellohellohellohello", m, re), true); + EXPECT_EQ(std::regex_match("test hellohellohellohello", m, re), false); + EXPECT_EQ(std::regex_search("test hellohellohellohello", m, re), true); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OUR) +BENCHMARK_CASE(char_qualifier_min_max_benchmark) +{ + Regex<PosixExtended> re("c{3,30}"); + RegexResult m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(re.match("cc", m), false); + EXPECT_EQ(re.match("ccc", m), true); + EXPECT_EQ(re.match("cccccccccccccccccccccccccccccc", m), true); + EXPECT_EQ(re.match("ccccccccccccccccccccccccccccccc", m), false); + EXPECT_EQ(re.search("ccccccccccccccccccccccccccccccc", m), true); + EXPECT_EQ(re.match("cccccccccccccccccccccccccccccccc", m), false); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OTHER) +BENCHMARK_CASE(char_qualifier_min_max_benchmark_reference_stdcpp_regex_match) +{ + std::regex re("c{3,30}"); + std::cmatch m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(std::regex_match("cc", m, re), false); + EXPECT_EQ(std::regex_match("ccc", m, re), true); + EXPECT_EQ(std::regex_match("cccccccccccccccccccccccccccccc", m, re), true); + EXPECT_EQ(std::regex_match("ccccccccccccccccccccccccccccccc", m, re), false); + EXPECT_EQ(std::regex_search("ccccccccccccccccccccccccccccccc", m, re), true); + EXPECT_EQ(std::regex_match("cccccccccccccccccccccccccccccccc", m, re), false); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OUR) +BENCHMARK_CASE(simple_bracket_chars_benchmark) +{ + Regex<PosixExtended> re("[abc]"); + RegexResult m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(re.match("a", m), true); + EXPECT_EQ(re.match("b", m), true); + EXPECT_EQ(re.match("c", m), true); + EXPECT_EQ(re.match("d", m), false); + EXPECT_EQ(re.match("e", m), false); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OTHER) +BENCHMARK_CASE(simple_bracket_chars_benchmark_reference_stdcpp_regex_match) +{ + std::regex re("[abc]"); + std::cmatch m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(std::regex_match("a", m, re), true); + EXPECT_EQ(std::regex_match("b", m, re), true); + EXPECT_EQ(std::regex_match("c", m, re), true); + EXPECT_EQ(std::regex_match("d", m, re), false); + EXPECT_EQ(std::regex_match("e", m, re), false); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OUR) +BENCHMARK_CASE(simple_bracket_chars_inverse_benchmark) +{ + Regex<PosixExtended> re("[^abc]"); + RegexResult m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(re.match("a", m), false); + EXPECT_EQ(re.match("b", m), false); + EXPECT_EQ(re.match("c", m), false); + EXPECT_EQ(re.match("d", m), true); + EXPECT_EQ(re.match("e", m), true); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OTHER) +BENCHMARK_CASE(simple_bracket_chars_inverse_benchmark_reference_stdcpp_regex_match) +{ + std::regex re("[^abc]"); + std::cmatch m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(std::regex_match("a", m, re), false); + EXPECT_EQ(std::regex_match("b", m, re), false); + EXPECT_EQ(std::regex_match("c", m, re), false); + EXPECT_EQ(std::regex_match("d", m, re), true); + EXPECT_EQ(std::regex_match("e", m, re), true); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OUR) +BENCHMARK_CASE(simple_bracket_chars_range_benchmark) +{ + Regex<PosixExtended> re("[a-d]"); + RegexResult m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(re.match("a", m), true); + EXPECT_EQ(re.match("b", m), true); + EXPECT_EQ(re.match("c", m), true); + EXPECT_EQ(re.match("d", m), true); + EXPECT_EQ(re.match("e", m), false); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OTHER) +BENCHMARK_CASE(simple_bracket_chars_range_benchmark_reference_stdcpp_regex_match) +{ + std::regex re("[a-d]"); + std::cmatch m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(std::regex_match("a", m, re), true); + EXPECT_EQ(std::regex_match("b", m, re), true); + EXPECT_EQ(std::regex_match("c", m, re), true); + EXPECT_EQ(std::regex_match("d", m, re), true); + EXPECT_EQ(std::regex_match("e", m, re), false); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OUR) +BENCHMARK_CASE(simple_bracket_chars_range_inverse_benchmark) +{ + Regex<PosixExtended> re("[^a-df-z]"); + RegexResult m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(re.match("a", m), false); + EXPECT_EQ(re.match("b", m), false); + EXPECT_EQ(re.match("c", m), false); + EXPECT_EQ(re.match("d", m), false); + EXPECT_EQ(re.match("e", m), true); + EXPECT_EQ(re.match("k", m), false); + EXPECT_EQ(re.match("z", m), false); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OTHER) +BENCHMARK_CASE(simple_bracket_chars_range_inverse_benchmark_reference_stdcpp_regex_match) +{ + std::regex re("[^a-df-z]"); + std::cmatch m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(std::regex_match("a", m, re), false); + EXPECT_EQ(std::regex_match("b", m, re), false); + EXPECT_EQ(std::regex_match("c", m, re), false); + EXPECT_EQ(std::regex_match("d", m, re), false); + EXPECT_EQ(std::regex_match("e", m, re), true); + EXPECT_EQ(std::regex_match("k", m, re), false); + EXPECT_EQ(std::regex_match("z", m, re), false); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OUR) +BENCHMARK_CASE(bracket_character_class_uuid_benchmark) +{ + Regex<PosixExtended> re("^([[:xdigit:]]{8})-([[:xdigit:]]{4})-([[:xdigit:]]{4})-([[:xdigit:]]{4})-([[:xdigit:]]{12})$"); + RegexResult m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(re.match("fb9b62a2-1579-4e3a-afba-76239ccb6583", m), true); + EXPECT_EQ(re.match("fb9b62a2", m), false); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OTHER) +BENCHMARK_CASE(bracket_character_class_uuid_benchmark_reference_stdcpp_regex_match) +{ + std::regex re("^([[:xdigit:]]{8})-([[:xdigit:]]{4})-([[:xdigit:]]{4})-([[:xdigit:]]{4})-([[:xdigit:]]{12})$"); + std::cmatch m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(std::regex_match("fb9b62a2-1579-4e3a-afba-76239ccb6583", m, re), true); + EXPECT_EQ(std::regex_match("fb9b62a2", m, re), false); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OUR) +BENCHMARK_CASE(simple_bracket_character_class_inverse_benchmark) +{ + Regex<PosixExtended> re("[^[:digit:]]"); + RegexResult m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(re.match("1", m), false); + EXPECT_EQ(re.match("2", m), false); + EXPECT_EQ(re.match("3", m), false); + EXPECT_EQ(re.match("d", m), true); + EXPECT_EQ(re.match("e", m), true); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OTHER) +BENCHMARK_CASE(simple_bracket_character_class_inverse_benchmark_reference_stdcpp_regex_match) +{ + std::regex re("[^[:digit:]]"); + std::cmatch m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(std::regex_match("1", m, re), false); + EXPECT_EQ(std::regex_match("2", m, re), false); + EXPECT_EQ(std::regex_match("3", m, re), false); + EXPECT_EQ(std::regex_match("d", m, re), true); + EXPECT_EQ(std::regex_match("e", m, re), true); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OUR) +BENCHMARK_CASE(email_address_benchmark) +{ + Regex<PosixExtended> re("^[A-Z0-9a-z._%+-]{1,64}@(?:[A-Za-z0-9-]{1,63}\\.){1,125}[A-Za-z]{2,63}$"); + RegexResult m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(re.match("hello.world@domain.tld", m), true); + EXPECT_EQ(re.match("this.is.a.very_long_email_address@world.wide.web", m), true); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OTHER) +BENCHMARK_CASE(email_address_benchmark_reference_stdcpp_regex_match) +{ + std::regex re("^[A-Z0-9a-z._%+-]{1,64}@(?:[A-Za-z0-9-]{1,63}\\.){1,125}[A-Za-z]{2,63}$"); + std::cmatch m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(std::regex_match("hello.world@domain.tld", m, re), true); + EXPECT_EQ(std::regex_match("this.is.a.very_long_email_address@world.wide.web", m, re), true); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OUR) +BENCHMARK_CASE(simple_ignorecase_benchmark) +{ + Regex<PosixExtended> re("^hello friends", PosixFlags::Insensitive); + RegexResult m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(re.match("Hello Friends", m), true); + EXPECT_EQ(re.match("hello Friends", m), true); + + EXPECT_EQ(re.match("hello Friends!", m), false); + EXPECT_EQ(re.search("hello Friends", m), true); + + EXPECT_EQ(re.match("hell Friends", m), false); + EXPECT_EQ(re.search("hell Friends", m), false); + } +} +#endif + +#if defined(REGEX_BENCHMARK_OTHER) +BENCHMARK_CASE(simple_ignorecase_benchmark_reference_stdcpp_regex_match) +{ + std::regex re("^hello friends", std::regex_constants::icase); + std::cmatch m; + for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(std::regex_match("Hello Friends", m, re), true); + EXPECT_EQ(std::regex_match("hello Friends", m, re), true); + + EXPECT_EQ(std::regex_match("hello Friends!", m, re), false); + EXPECT_EQ(std::regex_search("hello Friends", m, re), true); + + EXPECT_EQ(std::regex_match("hell Friends", m, re), false); + EXPECT_EQ(std::regex_search("hell Friends", m, re), false); + } +} +#endif +#endif + +TEST_MAIN(Regex) diff --git a/Libraries/LibRegex/Tests/CMakeLists.txt b/Libraries/LibRegex/Tests/CMakeLists.txt new file mode 100644 index 0000000000..a337f05153 --- /dev/null +++ b/Libraries/LibRegex/Tests/CMakeLists.txt @@ -0,0 +1,21 @@ +file(GLOB TEST_SOURCES CONFIGURE_DEPENDS "*.cpp") +file(GLOB REGEX_SOURCES CONFIGURE_DEPENDS "../*.cpp") +file(GLOB C_REGEX_SOURCES CONFIGURE_DEPENDS "../C/*.cpp") + +foreach(source ${TEST_SOURCES}) + get_filename_component(name ${source} NAME_WE) + add_executable(${name} ${source} ${REGEX_SOURCES} ${C_REGEX_SOURCES}) + target_link_libraries(${name} LagomCore) + add_test( + NAME ${name} + COMMAND ${name} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + ) + + set_tests_properties( + ${name} + PROPERTIES + FAIL_REGULAR_EXPRESSION + "FAIL" + ) +endforeach() diff --git a/Libraries/LibRegex/Tests/Regex.cpp b/Libraries/LibRegex/Tests/Regex.cpp new file mode 100644 index 0000000000..8a5cb904b1 --- /dev/null +++ b/Libraries/LibRegex/Tests/Regex.cpp @@ -0,0 +1,451 @@ +/* + * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <AK/TestSuite.h> // import first, to prevent warning of ASSERT* redefinition + +#include <LibRegex/Regex.h> +#include <LibRegex/RegexDebug.h> +#include <AK/StringBuilder.h> +#include <stdio.h> + +static ECMAScriptOptions match_test_api_options(const ECMAScriptOptions options) +{ + return options; +} + +static PosixOptions match_test_api_options(const PosixOptions options) +{ + return options; +} + +TEST_CASE(regex_options_ecmascript) +{ + ECMAScriptOptions eo; + eo |= ECMAScriptFlags::Global; + + EXPECT(eo & ECMAScriptFlags::Global); + EXPECT(!(eo & ECMAScriptFlags::Insensitive)); + + eo = match_test_api_options(ECMAScriptFlags::Global | ECMAScriptFlags::Insensitive | ECMAScriptFlags::Sticky); + EXPECT(eo & ECMAScriptFlags::Global); + EXPECT(eo & ECMAScriptFlags::Insensitive); + EXPECT(eo & ECMAScriptFlags::Sticky); + EXPECT(!(eo & ECMAScriptFlags::Unicode)); + EXPECT(!(eo & ECMAScriptFlags::Multiline)); + EXPECT(!(eo & ECMAScriptFlags::SingleLine)); + + eo &= ECMAScriptFlags::Insensitive; + EXPECT(!(eo & ECMAScriptFlags::Global)); + EXPECT(eo & ECMAScriptFlags::Insensitive); + EXPECT(!(eo & ECMAScriptFlags::Multiline)); + + eo &= ECMAScriptFlags::Sticky; + EXPECT(!(eo & ECMAScriptFlags::Global)); + EXPECT(!(eo & ECMAScriptFlags::Insensitive)); + EXPECT(!(eo & ECMAScriptFlags::Multiline)); + EXPECT(!(eo & ECMAScriptFlags::Sticky)); + + eo = ~ECMAScriptFlags::Insensitive; + EXPECT(eo & ECMAScriptFlags::Global); + EXPECT(!(eo & ECMAScriptFlags::Insensitive)); + EXPECT(eo & ECMAScriptFlags::Multiline); + EXPECT(eo & ECMAScriptFlags::Sticky); +} + +TEST_CASE(regex_options_posix) +{ + PosixOptions eo; + eo |= PosixFlags::Global; + + EXPECT(eo & PosixFlags::Global); + EXPECT(!(eo & PosixFlags::Insensitive)); + + eo = match_test_api_options(PosixFlags::Global | PosixFlags::Insensitive | PosixFlags::MatchNotBeginOfLine); + EXPECT(eo & PosixFlags::Global); + EXPECT(eo & PosixFlags::Insensitive); + EXPECT(eo & PosixFlags::MatchNotBeginOfLine); + EXPECT(!(eo & PosixFlags::Unicode)); + EXPECT(!(eo & PosixFlags::Multiline)); + + eo &= PosixFlags::Insensitive; + EXPECT(!(eo & PosixFlags::Global)); + EXPECT(eo & PosixFlags::Insensitive); + EXPECT(!(eo & PosixFlags::Multiline)); + + eo &= PosixFlags::MatchNotBeginOfLine; + EXPECT(!(eo & PosixFlags::Global)); + EXPECT(!(eo & PosixFlags::Insensitive)); + EXPECT(!(eo & PosixFlags::Multiline)); + + eo = ~PosixFlags::Insensitive; + EXPECT(eo & PosixFlags::Global); + EXPECT(!(eo & PosixFlags::Insensitive)); + EXPECT(eo & PosixFlags::Multiline); +} + +TEST_CASE(regex_lexer) +{ + Lexer l("/[.*+?^${}()|[\\]\\\\]/g"); + EXPECT(l.next().type() == regex::TokenType::Slash); + EXPECT(l.next().type() == regex::TokenType::LeftBracket); + EXPECT(l.next().type() == regex::TokenType::Period); + EXPECT(l.next().type() == regex::TokenType::Asterisk); + EXPECT(l.next().type() == regex::TokenType::Plus); + EXPECT(l.next().type() == regex::TokenType::Questionmark); + EXPECT(l.next().type() == regex::TokenType::Circumflex); + EXPECT(l.next().type() == regex::TokenType::Dollar); + EXPECT(l.next().type() == regex::TokenType::LeftCurly); + EXPECT(l.next().type() == regex::TokenType::RightCurly); + EXPECT(l.next().type() == regex::TokenType::LeftParen); + EXPECT(l.next().type() == regex::TokenType::RightParen); + EXPECT(l.next().type() == regex::TokenType::Pipe); + EXPECT(l.next().type() == regex::TokenType::LeftBracket); + EXPECT(l.next().type() == regex::TokenType::EscapeSequence); + EXPECT(l.next().type() == regex::TokenType::EscapeSequence); + EXPECT(l.next().type() == regex::TokenType::RightBracket); + EXPECT(l.next().type() == regex::TokenType::Slash); + EXPECT(l.next().type() == regex::TokenType::Char); +} + +TEST_CASE(parser_error_parens) +{ + String pattern = "test()test"; + Lexer l(pattern); + PosixExtendedParser p(l); + p.parse(); + EXPECT(p.has_error()); + EXPECT(p.error() == Error::EmptySubExpression); +} + +TEST_CASE(parser_error_special_characters_used_at_wrong_place) +{ + String pattern; + Vector<char, 5> chars = { '*', '+', '?', '{' }; + StringBuilder b; + + Lexer l; + PosixExtended p(l); + + for (auto& ch : chars) { + // First in ere + b.clear(); + b.append(ch); + pattern = b.build(); + l.set_source(pattern); + p.parse(); + EXPECT(p.has_error()); + EXPECT(p.error() == Error::InvalidRepetitionMarker); + + // After vertical line + b.clear(); + b.append("a|"); + b.append(ch); + pattern = b.build(); + l.set_source(pattern); + p.parse(); + EXPECT(p.has_error()); + EXPECT(p.error() == Error::InvalidRepetitionMarker); + + // After circumflex + b.clear(); + b.append("^"); + b.append(ch); + pattern = b.build(); + l.set_source(pattern); + p.parse(); + EXPECT(p.has_error()); + EXPECT(p.error() == Error::InvalidRepetitionMarker); + + // After dollar + b.clear(); + b.append("$"); + b.append(ch); + pattern = b.build(); + l.set_source(pattern); + p.parse(); + EXPECT(p.has_error()); + EXPECT(p.error() == Error::InvalidRepetitionMarker); + + // After left parens + b.clear(); + b.append("("); + b.append(ch); + b.append(")"); + pattern = b.build(); + l.set_source(pattern); + p.parse(); + EXPECT(p.has_error()); + EXPECT(p.error() == Error::InvalidRepetitionMarker); + } +} + +TEST_CASE(parser_error_vertical_line_used_at_wrong_place) +{ + Lexer l; + PosixExtended p(l); + + // First in ere + l.set_source("|asdf"); + p.parse(); + EXPECT(p.has_error()); + EXPECT(p.error() == Error::EmptySubExpression); + + // Last in ere + l.set_source("asdf|"); + p.parse(); + EXPECT(p.has_error()); + EXPECT(p.error() == Error::EmptySubExpression); + + // After left parens + l.set_source("(|asdf)"); + p.parse(); + EXPECT(p.has_error()); + EXPECT(p.error() == Error::EmptySubExpression); + + // Proceed right parens + l.set_source("(asdf)|"); + p.parse(); + EXPECT(p.has_error()); + EXPECT(p.error() == Error::EmptySubExpression); +} + +TEST_CASE(catch_all_first) +{ + Regex<PosixExtended> re("^.*$"); + RegexResult m; + re.match("Hello World", m); + EXPECT(m.count == 1); + EXPECT(re.match("Hello World", m)); +} + +TEST_CASE(catch_all) +{ + Regex<PosixExtended> re("^.*$", PosixFlags::Global); + + EXPECT(re.has_match("Hello World")); + EXPECT(re.match("Hello World").success); + EXPECT(re.match("Hello World").count == 1); + + EXPECT(has_match("Hello World", re)); + auto res = match("Hello World", re); + EXPECT(res.success); + EXPECT(res.count == 1); + EXPECT(res.matches.size() == 1); + EXPECT(res.matches.first().view == "Hello World"); +} + +TEST_CASE(catch_all_again) +{ + Regex<PosixExtended> re("^.*$", PosixFlags::Extra); + EXPECT_EQ(has_match("Hello World", re), true); +} + +TEST_CASE(char_utf8) +{ + Regex<PosixExtended> re("😀"); + RegexResult result; + + EXPECT_EQ((result = match("Привет, мир! 😀 γειά σου κόσμος 😀 こんにちは世界", re, PosixFlags::Global)).success, true); + EXPECT_EQ(result.count, 2u); +} + +TEST_CASE(catch_all_newline) +{ + Regex<PosixExtended> re("^.*$", PosixFlags::Multiline | PosixFlags::StringCopyMatches); + RegexResult result; + auto lambda = [&result, &re]() { + String aaa = "Hello World\nTest\n1234\n"; + result = match(aaa, re); + EXPECT_EQ(result.success, true); + }; + lambda(); + EXPECT_EQ(result.count, 3u); + EXPECT_EQ(result.matches.at(0).view, "Hello World"); + EXPECT_EQ(result.matches.at(1).view, "Test"); + EXPECT_EQ(result.matches.at(2).view, "1234"); +} + +TEST_CASE(catch_all_newline_view) +{ + Regex<PosixExtended> re("^.*$", PosixFlags::Multiline); + RegexResult result; + + String aaa = "Hello World\nTest\n1234\n"; + result = match(aaa, re); + EXPECT_EQ(result.success, true); + EXPECT_EQ(result.count, 3u); + String str = "Hello World"; + EXPECT_EQ(result.matches.at(0).view, str.view()); + EXPECT_EQ(result.matches.at(1).view, "Test"); + EXPECT_EQ(result.matches.at(2).view, "1234"); +} + +TEST_CASE(catch_all_newline_2) +{ + Regex<PosixExtended> re("^.*$"); + RegexResult result; + result = match("Hello World\nTest\n1234\n", re, PosixFlags::Multiline | PosixFlags::StringCopyMatches); + EXPECT_EQ(result.success, true); + EXPECT_EQ(result.count, 3u); + EXPECT_EQ(result.matches.at(0).view, "Hello World"); + EXPECT_EQ(result.matches.at(1).view, "Test"); + EXPECT_EQ(result.matches.at(2).view, "1234"); + + result = match("Hello World\nTest\n1234\n", re); + EXPECT_EQ(result.success, true); + EXPECT_EQ(result.count, 1u); + EXPECT_EQ(result.matches.at(0).view, "Hello World\nTest\n1234\n"); +} + +TEST_CASE(match_all_character_class) +{ + Regex<PosixExtended> re("[[:alpha:]]"); + String str = "[Window]\nOpacity=255\nAudibleBeep=0\n"; + RegexResult result = match(str, re, PosixFlags::Global | PosixFlags::StringCopyMatches); + + EXPECT_EQ(result.success, true); + EXPECT_EQ(result.count, 24u); + EXPECT_EQ(result.matches.at(0).view, "W"); + EXPECT_EQ(result.matches.at(1).view, "i"); + EXPECT_EQ(result.matches.at(2).view, "n"); + EXPECT(&result.matches.at(0).view.characters_without_null_termination()[0] != &str.view().characters_without_null_termination()[1]); +} + +TEST_CASE(example_for_git_commit) +{ + Regex<PosixExtended> re("^.*$"); + auto result = re.match("Well, hello friends!\nHello World!"); + + EXPECT(result.success); + EXPECT(result.count == 1); + EXPECT(result.matches.at(0).view.starts_with("Well")); + EXPECT(result.matches.at(0).view.length() == 33); + + EXPECT(re.has_match("Well,....")); + + result = re.match("Well, hello friends!\nHello World!", PosixFlags::Multiline); + + EXPECT(result.success); + EXPECT(result.count == 2); + EXPECT(result.matches.at(0).view == "Well, hello friends!"); + EXPECT(result.matches.at(1).view == "Hello World!"); +} + +TEST_CASE(email_address) +{ + Regex<PosixExtended> re("^[A-Z0-9a-z._%+-]{1,64}@([A-Za-z0-9-]{1,63}\\.){1,125}[A-Za-z]{2,63}$"); + EXPECT(re.has_match("hello.world@domain.tld")); + EXPECT(re.has_match("this.is.a.very_long_email_address@world.wide.web")); +} + +TEST_CASE(ini_file_entries) +{ + Regex<PosixExtended> re("[[:alpha:]]*=([[:digit:]]*)|\\[(.*)\\]"); + RegexResult result; + +#ifdef REGEX_DEBUG + RegexDebug regex_dbg(stderr); + regex_dbg.print_raw_bytecode(re); + regex_dbg.print_header(); + regex_dbg.print_bytecode(re); +#endif + + String haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n"; + EXPECT_EQ(re.search(haystack.view(), result, PosixFlags::Multiline), true); + EXPECT_EQ(result.count, 3u); + +#ifdef REGEX_DEBUG + for (auto& v : result.matches) + fprintf(stderr, "%s\n", v.view.to_string().characters()); +#endif + + EXPECT_EQ(result.matches.at(0).view, "[Window]"); + EXPECT_EQ(result.capture_group_matches.at(0).at(1).view, "Window"); + EXPECT_EQ(result.matches.at(1).view, "Opacity=255"); + EXPECT_EQ(result.matches.at(1).line, 1u); + EXPECT_EQ(result.matches.at(1).column, 0u); + EXPECT_EQ(result.capture_group_matches.at(1).at(0).view, "255"); + EXPECT_EQ(result.capture_group_matches.at(1).at(0).line, 1u); + EXPECT_EQ(result.capture_group_matches.at(1).at(0).column, 8u); + EXPECT_EQ(result.matches.at(2).view, "AudibleBeep=0"); + EXPECT_EQ(result.capture_group_matches.at(2).at(0).view, "0"); + EXPECT_EQ(result.capture_group_matches.at(2).at(0).line, 2u); + EXPECT_EQ(result.capture_group_matches.at(2).at(0).column, 12u); +} + +TEST_CASE(named_capture_group) +{ + Regex<PosixExtended> re("[[:alpha:]]*=(?<Test>[[:digit:]]*)"); + RegexResult result; + +#ifdef REGEX_DEBUG + RegexDebug regex_dbg(stderr); + regex_dbg.print_raw_bytecode(re); + regex_dbg.print_header(); + regex_dbg.print_bytecode(re); +#endif + + String haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n"; + EXPECT_EQ(re.search(haystack, result, PosixFlags::Multiline), true); + EXPECT_EQ(result.count, 2u); + EXPECT_EQ(result.matches.at(0).view, "Opacity=255"); + EXPECT_EQ(result.named_capture_group_matches.at(0).ensure("Test").view, "255"); + EXPECT_EQ(result.matches.at(1).view, "AudibleBeep=0"); + EXPECT_EQ(result.named_capture_group_matches.at(1).ensure("Test").view, "0"); +} + +TEST_CASE(a_star) +{ + Regex<PosixExtended> re("a*"); + RegexResult result; + +#ifdef REGEX_DEBUG + RegexDebug regex_dbg(stderr); + regex_dbg.print_raw_bytecode(re); + regex_dbg.print_header(); + regex_dbg.print_bytecode(re); +#endif + + String haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n"; + EXPECT_EQ(re.search(haystack.view(), result, PosixFlags::Multiline), true); + EXPECT_EQ(result.count, 32u); + EXPECT_EQ(result.matches.at(0).view.length(), 0u); + EXPECT_EQ(result.matches.at(10).view.length(), 1u); + EXPECT_EQ(result.matches.at(10).view, "a"); + EXPECT_EQ(result.matches.at(31).view.length(), 0u); +} + +TEST_CASE(simple_period_end_benchmark) +{ + Regex<PosixExtended> re("hello.$"); + RegexResult m; + EXPECT_EQ(re.search("Hello1", m), false); + EXPECT_EQ(re.search("hello1hello1", m), true); + EXPECT_EQ(re.search("hello2hell", m), false); + EXPECT_EQ(re.search("hello?", m), true); +} + +TEST_MAIN(Regex) diff --git a/Libraries/LibRegex/Tests/RegexLibC.cpp b/Libraries/LibRegex/Tests/RegexLibC.cpp new file mode 100644 index 0000000000..40e0303159 --- /dev/null +++ b/Libraries/LibRegex/Tests/RegexLibC.cpp @@ -0,0 +1,1211 @@ +/* + * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <AK/TestSuite.h> + +#include <AK/StringBuilder.h> +#include <LibC/regex.h> +#include <stdio.h> + +#define BENCHMARK_LOOP_ITERATIONS 100000 +#define DISABLE_REGEX_BENCHMARK + +//#if not(defined(REGEX_DEBUG) || defined(REGEX_MATCH_STATUS) || defined(DISABLE_REGEX_BENCHMARK)) +# include <regex> +//#endif + +TEST_CASE(catch_all) +{ + String pattern = "^.*$"; + regex_t regex; + + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), REG_NOERR); + EXPECT_EQ(regexec(®ex, "Hello World", 0, NULL, 0), REG_NOERR); + + regfree(®ex); +} + +TEST_CASE(simple_start) +{ + String pattern = "^hello friends"; + regex_t regex; + + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), REG_NOERR); + EXPECT_EQ(regexec(®ex, "Hello!", 0, NULL, 0), REG_NOMATCH); + EXPECT_EQ(regexec(®ex, "hello friends", 0, NULL, 0), REG_NOERR); + EXPECT_EQ(regexec(®ex, "Well, hello friends", 0, NULL, 0), REG_NOMATCH); + + regfree(®ex); +} + +TEST_CASE(simple_end) +{ + String pattern = ".*hello\\.\\.\\. there$"; + regex_t regex; + + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), REG_NOERR); + EXPECT_EQ(regexec(®ex, "Hallo", 0, NULL, 0), REG_NOMATCH); + EXPECT_EQ(regexec(®ex, "I said fyhello... there", 0, NULL, 0), REG_NOERR); + EXPECT_EQ(regexec(®ex, "ahello... therea", 0, NULL, 0), REG_NOMATCH); + EXPECT_EQ(regexec(®ex, "hello.. there", 0, NULL, 0), REG_NOMATCH); + + regfree(®ex); +} + +TEST_CASE(simple_period) +{ + String pattern = "hello."; + regex_t regex; + + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), REG_NOERR); + EXPECT_EQ(regexec(®ex, "Hello1", 0, NULL, 0), REG_NOMATCH); + EXPECT_EQ(regexec(®ex, "hello1", 0, NULL, 0), REG_NOERR); + EXPECT_EQ(regexec(®ex, "hello2", 0, NULL, 0), REG_NOERR); + EXPECT_EQ(regexec(®ex, "hello?", 0, NULL, 0), REG_NOERR); + + regfree(®ex); +} + +TEST_CASE(simple_period_end) +{ + String pattern = "hello.$"; + regex_t regex; + + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED | REG_NOSUB), REG_NOERR); + EXPECT_EQ(regexec(®ex, "Hello1", 0, NULL, REG_NOSUB), REG_NOMATCH); + EXPECT_EQ(regexec(®ex, "hello1hello1", 0, NULL, REG_GLOBAL), REG_NOERR); + EXPECT_EQ(regexec(®ex, "hello2hell", 0, NULL, REG_GLOBAL), REG_NOMATCH); + EXPECT_EQ(regexec(®ex, "hello?", 0, NULL, REG_NOSUB), REG_NOERR); + + regfree(®ex); +} + +TEST_CASE(simple_escaped) +{ + String pattern = "hello\\."; + regex_t regex; + + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), REG_NOERR); + EXPECT_EQ(regexec(®ex, "hello", 0, NULL, 0), REG_NOMATCH); + EXPECT_EQ(regexec(®ex, "hello.", 0, NULL, 0), REG_NOERR); + + regfree(®ex); +} + +TEST_CASE(simple_period2_end) +{ + String pattern = ".*hi... there$"; + regex_t regex; + + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), REG_NOERR); + EXPECT_EQ(regexec(®ex, "Hello there", 0, NULL, REG_GLOBAL), REG_NOMATCH); + EXPECT_EQ(regexec(®ex, "I said fyhi... there", 0, NULL, REG_GLOBAL), REG_NOERR); + EXPECT_EQ(regexec(®ex, "....hi... ", 0, NULL, REG_GLOBAL), REG_NOMATCH); + EXPECT_EQ(regexec(®ex, "I said fyhihii there", 0, NULL, REG_GLOBAL), REG_NOERR); + EXPECT_EQ(regexec(®ex, "I said fyhihi there", 0, NULL, REG_GLOBAL), REG_NOMATCH); + + regfree(®ex); +} + +TEST_CASE(simple_plus) +{ + String pattern = "a+"; + regex_t regex; + + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED | REG_NOSUB), REG_NOERR); + EXPECT_EQ(regexec(®ex, "b", 0, NULL, REG_NOSUB), REG_NOMATCH); + EXPECT_EQ(regexec(®ex, "a", 0, NULL, REG_NOSUB), REG_NOERR); + EXPECT_EQ(regexec(®ex, "aaaaaabbbbb", 0, NULL, REG_GLOBAL), REG_NOERR); + EXPECT_EQ(regexec(®ex, "aaaaaaaaaaa", 0, NULL, REG_GLOBAL), REG_NOERR); + + regfree(®ex); +} + +TEST_CASE(simple_questionmark) +{ + String pattern = "da?d"; + regex_t regex; + + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), REG_NOERR); + EXPECT_EQ(regexec(®ex, "a", 0, NULL, REG_GLOBAL), REG_NOMATCH); + EXPECT_EQ(regexec(®ex, "daa", 0, NULL, REG_GLOBAL), REG_NOMATCH); + EXPECT_EQ(regexec(®ex, "ddddd", 0, NULL, REG_GLOBAL), REG_NOERR); + EXPECT_EQ(regexec(®ex, "dd", 0, NULL, REG_GLOBAL), REG_NOERR); + EXPECT_EQ(regexec(®ex, "dad", 0, NULL, REG_GLOBAL), REG_NOERR); + EXPECT_EQ(regexec(®ex, "dada", 0, NULL, REG_GLOBAL), REG_NOERR); + EXPECT_EQ(regexec(®ex, "adadaa", 0, NULL, REG_GLOBAL), REG_NOERR); + + regfree(®ex); +} + +TEST_CASE(simple_questionmark_matchall) +{ + String pattern = "da?d"; + regex_t regex; + static constexpr int num_matches { 5 }; + regmatch_t matches[num_matches]; + + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), REG_NOERR); + EXPECT_EQ(regexec(®ex, "a", num_matches, matches, REG_GLOBAL), REG_NOMATCH); + EXPECT_EQ(matches[0].rm_cnt, 0u); + EXPECT_EQ(regexec(®ex, "daa", num_matches, matches, REG_GLOBAL), REG_NOMATCH); + EXPECT_EQ(matches[0].rm_cnt, 0u); + + EXPECT_EQ(regexec(®ex, "ddddd", num_matches, matches, REG_GLOBAL), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 2u); + + EXPECT_EQ(matches[0].rm_so, 0u); + EXPECT_EQ(matches[0].rm_eo, 2u); + EXPECT_EQ(matches[1].rm_so, 2u); + EXPECT_EQ(matches[1].rm_eo, 4u); + + EXPECT_EQ(regexec(®ex, "dd", num_matches, matches, REG_GLOBAL), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 1u); + EXPECT_EQ(regexec(®ex, "dad", num_matches, matches, REG_GLOBAL), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 1u); + EXPECT_EQ(regexec(®ex, "dada", num_matches, matches, REG_GLOBAL), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 1u); + EXPECT_EQ(regexec(®ex, "adadaa", num_matches, matches, REG_GLOBAL), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 1u); + + regfree(®ex); +} + +TEST_CASE(character_class) +{ + String pattern = "[[:alpha:]]"; + regex_t regex; + static constexpr int num_matches { 5 }; + regmatch_t matches[num_matches]; + + String haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n"; + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), REG_NOERR); + EXPECT_EQ(regexec(®ex, haystack.characters(), num_matches, matches, 0), REG_NOMATCH); + EXPECT_EQ(matches[0].rm_cnt, 0u); + EXPECT_EQ(regexec(®ex, haystack.characters(), num_matches, matches, REG_GLOBAL), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 24u); + EXPECT_EQ(haystack.substring_view(matches[0].rm_so, matches[0].rm_eo - matches[0].rm_so), "W"); + EXPECT_EQ(haystack.substring_view(matches[1].rm_so, matches[1].rm_eo - matches[1].rm_so), "i"); + + regfree(®ex); +} + +TEST_CASE(character_class2) +{ + String pattern = "[[:alpha:]]*=([[:digit:]]*)|\\[(.*)\\]"; + regex_t regex; + static constexpr int num_matches { 9 }; + regmatch_t matches[num_matches]; + + String haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n"; + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED | REG_NEWLINE), REG_NOERR); + EXPECT_EQ(regexec(®ex, haystack.characters(), num_matches, matches, 0), REG_NOERR); + + EXPECT_EQ(matches[0].rm_cnt, 3u); +#if 0 + for (int i = 0; i < num_matches; ++i) { + fprintf(stderr, "Matches[%i].rm_so: %li, .rm_eo: %li .rm_cnt: %li: ", i, matches[i].rm_so, matches[i].rm_eo, matches[i].rm_cnt); + fprintf(stderr, "haystack length: %lu\n", haystack.length()); + if (matches[i].rm_so != -1) + fprintf(stderr, "%s\n", haystack.substring_view(matches[i].rm_so, matches[i].rm_eo - matches[i].rm_so).to_string().characters()); + } +#endif + + EXPECT_EQ(haystack.substring_view(matches[0].rm_so, matches[0].rm_eo - matches[0].rm_so), "[Window]"); + + EXPECT_EQ(matches[1].rm_so, -1); + EXPECT_EQ(matches[1].rm_eo, -1); + EXPECT_EQ(matches[1].rm_cnt, 0); + + EXPECT_EQ(haystack.substring_view(matches[2].rm_so, matches[2].rm_eo - matches[2].rm_so), "Window"); + EXPECT_EQ(haystack.substring_view(matches[3].rm_so, matches[3].rm_eo - matches[3].rm_so), "Opacity=255"); + + EXPECT_EQ(haystack.substring_view(matches[4].rm_so, matches[4].rm_eo - matches[4].rm_so), "255"); + + EXPECT_EQ(matches[5].rm_so, -1); + EXPECT_EQ(matches[5].rm_eo, -1); + EXPECT_EQ(matches[5].rm_cnt, 0); + + EXPECT_EQ(haystack.substring_view(matches[6].rm_so, matches[6].rm_eo - matches[6].rm_so), "AudibleBeep=0"); + EXPECT_EQ(haystack.substring_view(matches[7].rm_so, matches[7].rm_eo - matches[7].rm_so), "0"); + + EXPECT_EQ(matches[8].rm_so, -1); + EXPECT_EQ(matches[8].rm_eo, -1); + EXPECT_EQ(matches[8].rm_cnt, 0); + + regfree(®ex); +} + +TEST_CASE(escaped_char_questionmark) +{ + String pattern = "This\\.?And\\.?That"; + regex_t regex; + + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), REG_NOERR); + EXPECT_EQ(regexec(®ex, "ThisAndThat", 0, NULL, 0), REG_NOERR); + EXPECT_EQ(regexec(®ex, "This.And.That", 0, NULL, 0), REG_NOERR); + EXPECT_EQ(regexec(®ex, "This And That", 0, NULL, 0), REG_NOMATCH); + EXPECT_EQ(regexec(®ex, "This..And..That", 0, NULL, 0), REG_NOMATCH); + + regfree(®ex); +} + +TEST_CASE(char_qualifier_asterisk) +{ + String pattern = "regex*"; + regex_t regex; + static constexpr int num_matches { 5 }; + regmatch_t matches[num_matches]; + + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), REG_NOERR); + EXPECT_EQ(regexec(®ex, "#include <regex.h>", num_matches, matches, REG_GLOBAL), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 1u); + + regfree(®ex); +} + +TEST_CASE(char_utf8) +{ + String pattern = "😀"; + regex_t regex; + static constexpr int num_matches { 5 }; + regmatch_t matches[num_matches]; + + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), REG_NOERR); + EXPECT_EQ(regexec(®ex, "Привет, мир! 😀 γειά σου κόσμος 😀 こんにちは世界", num_matches, matches, REG_GLOBAL), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 2u); + + regfree(®ex); +} + +TEST_CASE(parens) +{ + String pattern = "test(hello)test"; + regex_t regex; + static constexpr int num_matches { 5 }; + regmatch_t matches[num_matches]; + + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), REG_NOERR); + + EXPECT_EQ(regexec(®ex, "testhellotest", num_matches, matches, 0), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 1u); + + EXPECT_EQ(matches[0].rm_so, 0u); + EXPECT_EQ(matches[0].rm_eo, 13u); + EXPECT_EQ(matches[1].rm_so, 4u); + EXPECT_EQ(matches[1].rm_eo, 9u); + + regfree(®ex); +} + +TEST_CASE(parser_error_parens) +{ + String pattern = "test()test"; + regex_t regex; + static constexpr int num_matches { 5 }; + regmatch_t matches[num_matches]; + + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), REG_EMPTY_EXPR); + EXPECT_EQ(regexec(®ex, "testhellotest", num_matches, matches, 0), REG_EMPTY_EXPR); + + regfree(®ex); +} + +TEST_CASE(parser_error_special_characters_used_at_wrong_place) +{ + String pattern; + regex_t regex; + static constexpr int num_matches { 5 }; + regmatch_t matches[num_matches]; + + Vector<char, 4> chars = { '*', '+', '?', '}' }; + StringBuilder b; + + for (auto& ch : chars) { + + auto error_code_to_check = ch == '}' ? REG_EBRACE : REG_BADRPT; + + // First in ere + b.clear(); + b.append(ch); + pattern = b.build(); + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), error_code_to_check); + EXPECT_EQ(regexec(®ex, "test", num_matches, matches, 0), error_code_to_check); + + // After vertical line + b.clear(); + b.append("a|"); + b.append(ch); + pattern = b.build(); + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), error_code_to_check); + EXPECT_EQ(regexec(®ex, "test", num_matches, matches, 0), error_code_to_check); + + // After circumflex + b.clear(); + b.append("^"); + b.append(ch); + pattern = b.build(); + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), error_code_to_check); + EXPECT_EQ(regexec(®ex, "test", num_matches, matches, 0), error_code_to_check); + + // After dollar + b.clear(); + b.append("$"); + b.append(ch); + pattern = b.build(); + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), error_code_to_check); + EXPECT_EQ(regexec(®ex, "test", num_matches, matches, 0), error_code_to_check); + + // After left parens + b.clear(); + b.append("("); + b.append(ch); + b.append(")"); + pattern = b.build(); + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), error_code_to_check); + EXPECT_EQ(regexec(®ex, "test", num_matches, matches, 0), error_code_to_check); + } + + regfree(®ex); +} + +TEST_CASE(parser_error_vertical_line_used_at_wrong_place) +{ + String pattern; + regex_t regex; + static constexpr int num_matches { 5 }; + regmatch_t matches[num_matches]; + + // First in ere + pattern = "|asdf"; + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), REG_EMPTY_EXPR); + EXPECT_EQ(regexec(®ex, "test", num_matches, matches, 0), REG_EMPTY_EXPR); + + // Last in ere + pattern = "asdf|"; + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), REG_EMPTY_EXPR); + EXPECT_EQ(regexec(®ex, "test", num_matches, matches, 0), REG_EMPTY_EXPR); + + // After left parens + pattern = "(|asdf)"; + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), REG_EMPTY_EXPR); + EXPECT_EQ(regexec(®ex, "test", num_matches, matches, 0), REG_EMPTY_EXPR); + + // Proceed right parens + pattern = "(asdf)|"; + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), REG_EMPTY_EXPR); + EXPECT_EQ(regexec(®ex, "test", num_matches, matches, 0), REG_EMPTY_EXPR); + + regfree(®ex); +} + +TEST_CASE(parens_qualifier_questionmark) +{ + String pattern = "test(hello)?test"; + regex_t regex; + static constexpr int num_matches { 5 }; + regmatch_t matches[num_matches]; + const char* match_str; + + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), REG_NOERR); + + match_str = "testtest"; + EXPECT_EQ(regexec(®ex, match_str, num_matches, matches, 0), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 1u); + EXPECT_EQ(matches[0].rm_so, 0u); + EXPECT_EQ(matches[0].rm_eo, 8u); + EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "testtest"); + + match_str = "testhellotest"; + EXPECT_EQ(regexec(®ex, match_str, num_matches, matches, 0), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 1u); + EXPECT_EQ(matches[0].rm_so, 0u); + EXPECT_EQ(matches[0].rm_eo, 13u); + EXPECT_EQ(matches[1].rm_so, 4u); + EXPECT_EQ(matches[1].rm_eo, 9u); + EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "testhellotest"); + EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "hello"); + + regfree(®ex); +} + +TEST_CASE(parens_qualifier_asterisk) +{ + String pattern = "test(hello)*test"; + regex_t regex; + static constexpr int num_matches { 6 }; + regmatch_t matches[num_matches]; + const char* match_str; + + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), REG_NOERR); + + match_str = "testtest"; + EXPECT_EQ(regexec(®ex, match_str, num_matches, matches, 0), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 1u); + EXPECT_EQ(matches[0].rm_so, 0u); + EXPECT_EQ(matches[0].rm_eo, 8u); + EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "testtest"); + + match_str = "testhellohellotest"; + EXPECT_EQ(regexec(®ex, match_str, num_matches, matches, 0), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 1u); + EXPECT_EQ(matches[0].rm_so, 0u); + EXPECT_EQ(matches[0].rm_eo, 18u); + EXPECT_EQ(matches[1].rm_so, 9u); + EXPECT_EQ(matches[1].rm_eo, 14u); + EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "testhellohellotest"); + EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "hello"); + + match_str = "testhellohellotest, testhellotest"; + EXPECT_EQ(regexec(®ex, match_str, num_matches, matches, REG_GLOBAL), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 2u); + EXPECT_EQ(matches[0].rm_so, 0u); + EXPECT_EQ(matches[0].rm_eo, 18u); + EXPECT_EQ(matches[1].rm_so, 9u); + EXPECT_EQ(matches[1].rm_eo, 14u); + EXPECT_EQ(matches[2].rm_so, 20u); + EXPECT_EQ(matches[2].rm_eo, 33u); + EXPECT_EQ(matches[3].rm_so, 24u); + EXPECT_EQ(matches[3].rm_eo, 29u); + EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "testhellohellotest"); + EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "hello"); + EXPECT_EQ(StringView(&match_str[matches[2].rm_so], matches[2].rm_eo - matches[2].rm_so), "testhellotest"); + EXPECT_EQ(StringView(&match_str[matches[3].rm_so], matches[3].rm_eo - matches[3].rm_so), "hello"); + + regfree(®ex); +} + +TEST_CASE(parens_qualifier_asterisk_2) +{ + String pattern = "test(.*)test"; + regex_t regex; + static constexpr int num_matches { 6 }; + regmatch_t matches[num_matches]; + const char* match_str; + + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), REG_NOERR); + + match_str = "testasdftest"; + EXPECT_EQ(regexec(®ex, match_str, num_matches, matches, 0), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 1u); + EXPECT_EQ(matches[0].rm_so, 0u); + EXPECT_EQ(matches[0].rm_eo, 12u); + EXPECT_EQ(matches[1].rm_so, 4u); + EXPECT_EQ(matches[1].rm_eo, 8u); + EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "testasdftest"); + EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "asdf"); + + match_str = "testasdfasdftest"; + EXPECT_EQ(regexec(®ex, match_str, num_matches, matches, 0), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 1u); + EXPECT_EQ(matches[0].rm_so, 0u); + EXPECT_EQ(matches[0].rm_eo, 16u); + EXPECT_EQ(matches[1].rm_so, 4u); + EXPECT_EQ(matches[1].rm_eo, 12u); + EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "testasdfasdftest"); + EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "asdfasdf"); + + match_str = "testaaaatest, testbbbtest, testtest"; + EXPECT_EQ(regexec(®ex, match_str, num_matches, matches, REG_GLOBAL), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 1u); + EXPECT_EQ(matches[0].rm_so, 0u); + EXPECT_EQ(matches[0].rm_eo, 35u); + EXPECT_EQ(matches[1].rm_so, 4u); + EXPECT_EQ(matches[1].rm_eo, 31u); + + EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "testaaaatest, testbbbtest, testtest"); + EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "aaaatest, testbbbtest, test"); + + regfree(®ex); +} + +TEST_CASE(mulit_parens_qualifier_too_less_result_values) +{ + String pattern = "test(a)?(b)?(c)?test"; + regex_t regex; + static constexpr int num_matches { 4 }; + regmatch_t matches[num_matches]; + const char* match_str; + + matches[3] = { -2, -2, 100 }; + + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), REG_NOERR); + + match_str = "testabtest"; + EXPECT_EQ(regexec(®ex, match_str, num_matches - 1, matches, 0), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 1u); + EXPECT_EQ(matches[0].rm_so, 0u); + EXPECT_EQ(matches[0].rm_eo, 10u); + EXPECT_EQ(matches[1].rm_so, 4u); + EXPECT_EQ(matches[1].rm_eo, 5u); + EXPECT_EQ(matches[2].rm_so, 5u); + EXPECT_EQ(matches[2].rm_eo, 6u); + EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "testabtest"); + EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "a"); + EXPECT_EQ(StringView(&match_str[matches[2].rm_so], matches[2].rm_eo - matches[2].rm_so), "b"); + EXPECT_EQ(matches[3].rm_so, -2); + EXPECT_EQ(matches[3].rm_eo, -2); + EXPECT_EQ(matches[3].rm_cnt, 100u); + + match_str = "testabctest"; + EXPECT_EQ(regexec(®ex, match_str, num_matches - 1, matches, 0), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 1u); + EXPECT_EQ(matches[0].rm_so, 0u); + EXPECT_EQ(matches[0].rm_eo, 11u); + EXPECT_EQ(matches[1].rm_so, 4u); + EXPECT_EQ(matches[1].rm_eo, 5u); + EXPECT_EQ(matches[2].rm_so, 5u); + EXPECT_EQ(matches[2].rm_eo, 6u); + EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "testabctest"); + EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "a"); + EXPECT_EQ(StringView(&match_str[matches[2].rm_so], matches[2].rm_eo - matches[2].rm_so), "b"); + EXPECT_EQ(matches[3].rm_so, -2); + EXPECT_EQ(matches[3].rm_eo, -2); + EXPECT_EQ(matches[3].rm_cnt, 100u); + + match_str = "testabctest, testabctest"; + + EXPECT_EQ(regexec(®ex, match_str, num_matches - 1, matches, REG_GLOBAL), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 2u); + EXPECT_EQ(matches[0].rm_so, 0u); + EXPECT_EQ(matches[0].rm_eo, 11u); + EXPECT_EQ(matches[1].rm_so, 4u); + EXPECT_EQ(matches[1].rm_eo, 5u); + EXPECT_EQ(matches[2].rm_so, 5u); + EXPECT_EQ(matches[2].rm_eo, 6u); + EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "testabctest"); + EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "a"); + EXPECT_EQ(StringView(&match_str[matches[2].rm_so], matches[2].rm_eo - matches[2].rm_so), "b"); + EXPECT_EQ(matches[3].rm_so, -2); + EXPECT_EQ(matches[3].rm_eo, -2); + EXPECT_EQ(matches[3].rm_cnt, 100u); + + regfree(®ex); +} + +TEST_CASE(multi_parens_qualifier_questionmark) +{ + String pattern = "test(a)?(b)?(c)?test"; + regex_t regex; + static constexpr int num_matches { 8 }; + regmatch_t matches[num_matches]; + const char* match_str; + + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), REG_NOERR); + + match_str = "testtest"; + EXPECT_EQ(regexec(®ex, match_str, num_matches, matches, 0), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 1u); + EXPECT_EQ(matches[0].rm_so, 0u); + EXPECT_EQ(matches[0].rm_eo, 8u); + EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "testtest"); + + match_str = "testabctest"; + EXPECT_EQ(regexec(®ex, match_str, num_matches, matches, 0), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 1u); + EXPECT_EQ(matches[0].rm_so, 0u); + EXPECT_EQ(matches[0].rm_eo, 11u); + EXPECT_EQ(matches[1].rm_so, 4u); + EXPECT_EQ(matches[1].rm_eo, 5u); + EXPECT_EQ(matches[2].rm_so, 5u); + EXPECT_EQ(matches[2].rm_eo, 6u); + EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "testabctest"); + EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "a"); + EXPECT_EQ(StringView(&match_str[matches[2].rm_so], matches[2].rm_eo - matches[2].rm_so), "b"); + + match_str = "testabctest, testactest"; + + EXPECT_EQ(regexec(®ex, match_str, num_matches, matches, REG_GLOBAL), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 2u); + EXPECT_EQ(matches[0].rm_so, 0u); + EXPECT_EQ(matches[0].rm_eo, 11u); + EXPECT_EQ(matches[1].rm_so, 4u); + EXPECT_EQ(matches[1].rm_eo, 5u); + EXPECT_EQ(matches[2].rm_so, 5u); + EXPECT_EQ(matches[2].rm_eo, 6u); + EXPECT_EQ(matches[3].rm_so, 6u); + EXPECT_EQ(matches[3].rm_eo, 7u); + + EXPECT_EQ(matches[4].rm_so, 13u); + EXPECT_EQ(matches[4].rm_eo, 23u); + EXPECT_EQ(matches[5].rm_so, 17u); + EXPECT_EQ(matches[5].rm_eo, 18u); + EXPECT_EQ(matches[6].rm_so, -1); + EXPECT_EQ(matches[6].rm_eo, -1); + EXPECT_EQ(matches[7].rm_so, 18u); + EXPECT_EQ(matches[7].rm_eo, 19u); + + EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "testabctest"); + EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "a"); + EXPECT_EQ(StringView(&match_str[matches[2].rm_so], matches[2].rm_eo - matches[2].rm_so), "b"); + EXPECT_EQ(StringView(&match_str[matches[3].rm_so], matches[3].rm_eo - matches[3].rm_so), "c"); + EXPECT_EQ(StringView(&match_str[matches[4].rm_so], matches[4].rm_eo - matches[4].rm_so), "testactest"); + EXPECT_EQ(StringView(&match_str[matches[5].rm_so], matches[5].rm_eo - matches[5].rm_so), "a"); + EXPECT_EQ(StringView(&match_str[matches[6].rm_so], matches[6].rm_eo - matches[6].rm_so), ""); + EXPECT_EQ(StringView(&match_str[matches[7].rm_so], matches[7].rm_eo - matches[7].rm_so), "c"); + + regfree(®ex); +} + +TEST_CASE(simple_alternative) +{ + String pattern = "test|hello|friends"; + regex_t regex; + static constexpr int num_matches { 1 }; + regmatch_t matches[num_matches]; + + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), REG_NOERR); + + EXPECT_EQ(regexec(®ex, "test", num_matches, matches, 0), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 1u); + EXPECT_EQ(matches[0].rm_so, 0u); + EXPECT_EQ(matches[0].rm_eo, 4u); + + EXPECT_EQ(regexec(®ex, "hello", num_matches, matches, 0), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 1u); + EXPECT_EQ(matches[0].rm_so, 0u); + EXPECT_EQ(matches[0].rm_eo, 5u); + + EXPECT_EQ(regexec(®ex, "friends", num_matches, matches, 0), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 1u); + EXPECT_EQ(matches[0].rm_so, 0u); + EXPECT_EQ(matches[0].rm_eo, 7u); + + regfree(®ex); +} + +TEST_CASE(alternative_match_groups) +{ + String pattern = "test(a)?(b)?|hello ?(dear|my)? friends"; + regex_t regex; + static constexpr int num_matches { 8 }; + regmatch_t matches[num_matches]; + const char* match_str; + + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), REG_NOERR); + + match_str = "test"; + EXPECT_EQ(regexec(®ex, match_str, num_matches, matches, 0), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 1u); + EXPECT_EQ(matches[0].rm_so, 0); + EXPECT_EQ(matches[0].rm_eo, 4); + EXPECT_EQ(matches[1].rm_so, -1); + EXPECT_EQ(matches[1].rm_eo, -1); + EXPECT_EQ(matches[2].rm_so, -1); + EXPECT_EQ(matches[2].rm_eo, -1); + EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "test"); + EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), ""); + EXPECT_EQ(StringView(&match_str[matches[2].rm_so], matches[2].rm_eo - matches[2].rm_so), ""); + + match_str = "testa"; + EXPECT_EQ(regexec(®ex, match_str, num_matches, matches, 0), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 1u); + EXPECT_EQ(matches[0].rm_so, 0u); + EXPECT_EQ(matches[0].rm_eo, 5u); + EXPECT_EQ(matches[1].rm_so, 4u); + EXPECT_EQ(matches[1].rm_eo, 5u); + EXPECT_EQ(matches[2].rm_so, -1); + EXPECT_EQ(matches[2].rm_eo, -1); + EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "testa"); + EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "a"); + EXPECT_EQ(StringView(&match_str[matches[2].rm_so], matches[2].rm_eo - matches[2].rm_so), ""); + + match_str = "testb"; + EXPECT_EQ(regexec(®ex, match_str, num_matches, matches, 0), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 1u); + EXPECT_EQ(matches[0].rm_so, 0u); + EXPECT_EQ(matches[0].rm_eo, 5u); + EXPECT_EQ(matches[1].rm_so, -1); + EXPECT_EQ(matches[1].rm_eo, -1); + EXPECT_EQ(matches[2].rm_so, 4u); + EXPECT_EQ(matches[2].rm_eo, 5u); + EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "testb"); + EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), ""); + EXPECT_EQ(StringView(&match_str[matches[2].rm_so], matches[2].rm_eo - matches[2].rm_so), "b"); + + match_str = "hello friends"; + EXPECT_EQ(regexec(®ex, match_str, num_matches, matches, 0), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 1u); + EXPECT_EQ(matches[0].rm_so, 0u); + EXPECT_EQ(matches[0].rm_eo, 13u); + EXPECT_EQ(matches[1].rm_so, -1); + EXPECT_EQ(matches[1].rm_eo, -1); + EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "hello friends"); + EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), ""); + + match_str = "hello dear friends"; + EXPECT_EQ(regexec(®ex, match_str, num_matches, matches, 0), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 1u); + EXPECT_EQ(matches[0].rm_so, 0u); + EXPECT_EQ(matches[0].rm_eo, 18u); + EXPECT_EQ(matches[1].rm_so, -1); + EXPECT_EQ(matches[1].rm_eo, -1); + EXPECT_EQ(matches[2].rm_so, -1); + EXPECT_EQ(matches[2].rm_eo, -1); + EXPECT_EQ(matches[3].rm_so, 6); + EXPECT_EQ(matches[3].rm_eo, 10); + EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "hello dear friends"); + EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), ""); + EXPECT_EQ(StringView(&match_str[matches[2].rm_so], matches[2].rm_eo - matches[2].rm_so), ""); + EXPECT_EQ(StringView(&match_str[matches[3].rm_so], matches[3].rm_eo - matches[3].rm_so), "dear"); + + match_str = "hello my friends"; + EXPECT_EQ(regexec(®ex, match_str, num_matches, matches, 0), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 1u); + EXPECT_EQ(matches[0].rm_so, 0u); + EXPECT_EQ(matches[0].rm_eo, 16u); + EXPECT_EQ(matches[1].rm_so, -1); + EXPECT_EQ(matches[1].rm_eo, -1); + EXPECT_EQ(matches[2].rm_so, -1); + EXPECT_EQ(matches[2].rm_eo, -1); + EXPECT_EQ(matches[3].rm_so, 6); + EXPECT_EQ(matches[3].rm_eo, 8); + EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "hello my friends"); + EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), ""); + EXPECT_EQ(StringView(&match_str[matches[2].rm_so], matches[2].rm_eo - matches[2].rm_so), ""); + EXPECT_EQ(StringView(&match_str[matches[3].rm_so], matches[3].rm_eo - matches[3].rm_so), "my"); + + match_str = "testabc"; + EXPECT_EQ(regexec(®ex, match_str, num_matches, matches, 0), REG_NOMATCH); + EXPECT_EQ(matches[0].rm_cnt, 0u); + EXPECT_EQ(matches[0].rm_so, -1); + EXPECT_EQ(matches[0].rm_eo, -1); + + match_str = "hello test friends"; + EXPECT_EQ(regexec(®ex, match_str, num_matches, matches, 0), REG_NOMATCH); + EXPECT_EQ(matches[0].rm_cnt, 0u); + EXPECT_EQ(matches[0].rm_so, -1); + EXPECT_EQ(matches[0].rm_eo, -1); + + regfree(®ex); +} + +TEST_CASE(parens_qualifier_exact) +{ + String pattern = "(hello){3}"; + regex_t regex; + static constexpr int num_matches { 5 }; + regmatch_t matches[num_matches]; + const char* match_str; + + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), REG_NOERR); + + match_str = "hello"; + EXPECT_EQ(regexec(®ex, match_str, num_matches, matches, 0), REG_NOMATCH); + EXPECT_EQ(matches[0].rm_cnt, 0u); + + match_str = "hellohellohello"; + EXPECT_EQ(regexec(®ex, match_str, num_matches, matches, 0), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 1u); + EXPECT_EQ(matches[0].rm_so, 0u); + EXPECT_EQ(matches[0].rm_eo, 15u); + EXPECT_EQ(matches[1].rm_so, 10u); + EXPECT_EQ(matches[1].rm_eo, 15u); + EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "hellohellohello"); + EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "hello"); + + match_str = "hellohellohellohello"; + EXPECT_EQ(regexec(®ex, match_str, num_matches, matches, REG_GLOBAL), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 1u); + EXPECT_EQ(matches[0].rm_so, 0u); + EXPECT_EQ(matches[0].rm_eo, 15u); + EXPECT_EQ(matches[1].rm_so, 10u); + EXPECT_EQ(matches[1].rm_eo, 15u); + EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "hellohellohello"); + EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "hello"); + + match_str = "test hellohellohello"; + EXPECT_EQ(regexec(®ex, match_str, num_matches, matches, REG_GLOBAL), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 1u); + EXPECT_EQ(matches[0].rm_so, 5u); + EXPECT_EQ(matches[0].rm_eo, 20u); + EXPECT_EQ(matches[1].rm_so, 15u); + EXPECT_EQ(matches[1].rm_eo, 20u); + EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "hellohellohello"); + EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "hello"); + + regfree(®ex); +} + +TEST_CASE(parens_qualifier_minimum) +{ + String pattern = "(hello){3,}"; + regex_t regex; + static constexpr int num_matches { 5 }; + regmatch_t matches[num_matches]; + const char* match_str; + + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), REG_NOERR); + + match_str = "hello"; + EXPECT_EQ(regexec(®ex, match_str, num_matches, matches, 0), REG_NOMATCH); + EXPECT_EQ(matches[0].rm_cnt, 0u); + + match_str = "hellohellohello"; + EXPECT_EQ(regexec(®ex, match_str, num_matches, matches, 0), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 1u); + EXPECT_EQ(matches[0].rm_so, 0u); + EXPECT_EQ(matches[0].rm_eo, 15u); + EXPECT_EQ(matches[1].rm_so, 10u); + EXPECT_EQ(matches[1].rm_eo, 15u); + EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "hellohellohello"); + EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "hello"); + + match_str = "hellohellohellohello"; + + EXPECT_EQ(regexec(®ex, match_str, num_matches, matches, REG_SEARCH), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 1u); + EXPECT_EQ(matches[0].rm_so, 0u); + EXPECT_EQ(matches[0].rm_eo, 20u); + EXPECT_EQ(matches[1].rm_so, 15u); + EXPECT_EQ(matches[1].rm_eo, 20u); + EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "hellohellohellohello"); + EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "hello"); + + match_str = "test hellohellohello"; + EXPECT_EQ(regexec(®ex, match_str, num_matches, matches, REG_GLOBAL), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 1u); + EXPECT_EQ(matches[0].rm_so, 5u); + EXPECT_EQ(matches[0].rm_eo, 20u); + EXPECT_EQ(matches[1].rm_so, 15u); + EXPECT_EQ(matches[1].rm_eo, 20u); + EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "hellohellohello"); + EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "hello"); + + match_str = "test hellohellohellohello"; + EXPECT_EQ(regexec(®ex, match_str, num_matches, matches, REG_GLOBAL), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 1u); + EXPECT_EQ(matches[0].rm_so, 5u); + EXPECT_EQ(matches[0].rm_eo, 25u); + EXPECT_EQ(matches[1].rm_so, 20u); + EXPECT_EQ(matches[1].rm_eo, 25u); + EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "hellohellohellohello"); + EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "hello"); + + regfree(®ex); +} + +TEST_CASE(parens_qualifier_maximum) +{ + String pattern = "(hello){2,3}"; + regex_t regex; + static constexpr int num_matches { 5 }; + regmatch_t matches[num_matches]; + const char* match_str; + + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), REG_NOERR); + + match_str = "hello"; + EXPECT_EQ(regexec(®ex, match_str, num_matches, matches, 0), REG_NOMATCH); + EXPECT_EQ(matches[0].rm_cnt, 0u); + + match_str = "hellohellohello"; + EXPECT_EQ(regexec(®ex, match_str, num_matches, matches, 0), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 1u); + EXPECT_EQ(matches[0].rm_so, 0u); + EXPECT_EQ(matches[0].rm_eo, 15u); + EXPECT_EQ(matches[1].rm_so, 10u); + EXPECT_EQ(matches[1].rm_eo, 15u); + EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "hellohellohello"); + EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "hello"); + + match_str = "hellohellohellohello"; + EXPECT_EQ(regexec(®ex, match_str, num_matches, matches, REG_GLOBAL), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 1u); + EXPECT_EQ(matches[0].rm_so, 0u); + EXPECT_EQ(matches[0].rm_eo, 15u); + EXPECT_EQ(matches[1].rm_so, 10u); + EXPECT_EQ(matches[1].rm_eo, 15u); + EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "hellohellohello"); + EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "hello"); + + match_str = "test hellohellohello"; + EXPECT_EQ(regexec(®ex, match_str, num_matches, matches, REG_GLOBAL), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 1u); + EXPECT_EQ(matches[0].rm_so, 5u); + EXPECT_EQ(matches[0].rm_eo, 20u); + EXPECT_EQ(matches[1].rm_so, 15u); + EXPECT_EQ(matches[1].rm_eo, 20u); + EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "hellohellohello"); + EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "hello"); + + match_str = "test hellohellohellohello"; + EXPECT_EQ(regexec(®ex, match_str, num_matches, matches, REG_GLOBAL), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 1u); + EXPECT_EQ(matches[0].rm_so, 5u); + EXPECT_EQ(matches[0].rm_eo, 20u); + EXPECT_EQ(matches[1].rm_so, 15u); + EXPECT_EQ(matches[1].rm_eo, 20u); + EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "hellohellohello"); + EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "hello"); + + regfree(®ex); +} + +TEST_CASE(char_qualifier_min_max) +{ + String pattern = "c{3,30}"; + regex_t regex; + static constexpr int num_matches { 5 }; + regmatch_t matches[num_matches]; + + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), REG_NOERR); + + EXPECT_EQ(regexec(®ex, "cc", num_matches, matches, 0), REG_NOMATCH); + EXPECT_EQ(regexec(®ex, "ccc", num_matches, matches, 0), REG_NOERR); + EXPECT_EQ(regexec(®ex, "cccccccccccccccccccccccccccccc", num_matches, matches, 0), REG_NOERR); + EXPECT_EQ(matches[0].rm_cnt, 1u); + EXPECT_EQ(regexec(®ex, "ccccccccccccccccccccccccccccccc", num_matches, matches, 0), REG_NOMATCH); + EXPECT_EQ(regexec(®ex, "ccccccccccccccccccccccccccccccc", num_matches, matches, REG_GLOBAL), REG_NOERR); + EXPECT_EQ(regexec(®ex, "cccccccccccccccccccccccccccccccc", num_matches, matches, 0), REG_NOMATCH); + + regfree(®ex); +} + +TEST_CASE(simple_bracket_chars) +{ + String pattern = "[abc]"; + regex_t regex; + + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), REG_NOERR); + EXPECT_EQ(regexec(®ex, "a", 0, NULL, 0), REG_NOERR); + EXPECT_EQ(regexec(®ex, "b", 0, NULL, 0), REG_NOERR); + EXPECT_EQ(regexec(®ex, "c", 0, NULL, 0), REG_NOERR); + EXPECT_EQ(regexec(®ex, "d", 0, NULL, 0), REG_NOMATCH); + EXPECT_EQ(regexec(®ex, "e", 0, NULL, 0), REG_NOMATCH); +} + +TEST_CASE(simple_bracket_chars_inverse) +{ + String pattern = "[^abc]"; + regex_t regex; + + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), REG_NOERR); + EXPECT_EQ(regexec(®ex, "a", 0, NULL, 0), REG_NOMATCH); + EXPECT_EQ(regexec(®ex, "b", 0, NULL, 0), REG_NOMATCH); + EXPECT_EQ(regexec(®ex, "c", 0, NULL, 0), REG_NOMATCH); + EXPECT_EQ(regexec(®ex, "d", 0, NULL, 0), REG_NOERR); + EXPECT_EQ(regexec(®ex, "e", 0, NULL, 0), REG_NOERR); +} + +TEST_CASE(simple_bracket_chars_range) +{ + String pattern = "[a-d]"; + regex_t regex; + + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), REG_NOERR); + EXPECT_EQ(regexec(®ex, "a", 0, NULL, 0), REG_NOERR); + EXPECT_EQ(regexec(®ex, "b", 0, NULL, 0), REG_NOERR); + EXPECT_EQ(regexec(®ex, "c", 0, NULL, 0), REG_NOERR); + EXPECT_EQ(regexec(®ex, "d", 0, NULL, 0), REG_NOERR); + EXPECT_EQ(regexec(®ex, "e", 0, NULL, 0), REG_NOMATCH); +} + +TEST_CASE(simple_bracket_chars_range_inverse) +{ + String pattern = "[^a-df-z]"; + regex_t regex; + + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), REG_NOERR); + EXPECT_EQ(regexec(®ex, "a", 0, NULL, 0), REG_NOMATCH); + EXPECT_EQ(regexec(®ex, "b", 0, NULL, 0), REG_NOMATCH); + EXPECT_EQ(regexec(®ex, "c", 0, NULL, 0), REG_NOMATCH); + EXPECT_EQ(regexec(®ex, "d", 0, NULL, 0), REG_NOMATCH); + EXPECT_EQ(regexec(®ex, "e", 0, NULL, 0), REG_NOERR); + EXPECT_EQ(regexec(®ex, "k", 0, NULL, 0), REG_NOMATCH); + EXPECT_EQ(regexec(®ex, "z", 0, NULL, 0), REG_NOMATCH); +} + +TEST_CASE(bracket_character_class_uuid) +{ + String pattern = "^([[:xdigit:]]{8})-([[:xdigit:]]{4})-([[:xdigit:]]{4})-([[:xdigit:]]{4})-([[:xdigit:]]{12})$"; + regex_t regex; + + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), REG_NOERR); + EXPECT_EQ(regexec(®ex, "fb9b62a2-1579-4e3a-afba-76239ccb6583", 0, NULL, 0), REG_NOERR); + EXPECT_EQ(regexec(®ex, "fb9b62a2", 0, NULL, 0), REG_NOMATCH); + + regfree(®ex); +} + +TEST_CASE(simple_bracket_character_class_inverse) +{ + String pattern = "[^[:digit:]]"; + regex_t regex; + + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), REG_NOERR); + EXPECT_EQ(regexec(®ex, "1", 0, NULL, 0), REG_NOMATCH); + EXPECT_EQ(regexec(®ex, "2", 0, NULL, 0), REG_NOMATCH); + EXPECT_EQ(regexec(®ex, "3", 0, NULL, 0), REG_NOMATCH); + EXPECT_EQ(regexec(®ex, "d", 0, NULL, 0), REG_NOERR); + EXPECT_EQ(regexec(®ex, "e", 0, NULL, 0), REG_NOERR); +} + +TEST_CASE(email_address) +{ + String pattern = "^[A-Z0-9a-z._%+-]{1,64}@(?:[A-Za-z0-9-]{1,63}\\.){1,125}[A-Za-z]{2,63}$"; + regex_t regex; + + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), REG_NOERR); + EXPECT_EQ(regexec(®ex, "emanuel.sprung@gmail.com", 0, NULL, 0), REG_NOERR); + EXPECT_EQ(regexec(®ex, "kling@serenityos.org", 0, NULL, 0), REG_NOERR); + + regfree(®ex); +} + +TEST_CASE(error_message) +{ + String pattern = "^[A-Z0-9[a-z._%+-]{1,64}@[A-Za-z0-9-]{1,63}\\.{1,125}[A-Za-z]{2,63}$"; + regex_t regex; + + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED), REG_EBRACK); + EXPECT_EQ(regexec(®ex, "asdf@asdf.com", 0, NULL, 0), REG_EBRACK); + char buf[1024]; + size_t buflen = 1024; + auto len = regerror(0, ®ex, buf, buflen); + String expected = "Error during parsing of regular expression:\n ^[A-Z0-9[a-z._%+-]{1,64}@[A-Za-z0-9-]{1,63}\\.{1,125}[A-Za-z]{2,63}$\n ^---- [ ] imbalance.\n"; + for (size_t i = 0; i < len; ++i) { + EXPECT_EQ(buf[i], expected[i]); + } + + regfree(®ex); +} + +TEST_CASE(simple_ignorecase) +{ + String pattern = "^hello friends"; + regex_t regex; + + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED | REG_NOSUB | REG_ICASE), REG_NOERR); + EXPECT_EQ(regexec(®ex, "Hello Friends", 0, NULL, 0), REG_NOERR); + EXPECT_EQ(regexec(®ex, "hello Friends", 0, NULL, 0), REG_NOERR); + + EXPECT_EQ(regexec(®ex, "hello Friends!", 0, NULL, 0), REG_NOMATCH); + EXPECT_EQ(regexec(®ex, "hello Friends!", 0, NULL, REG_GLOBAL), REG_NOERR); + + EXPECT_EQ(regexec(®ex, "hell Friends", 0, NULL, 0), REG_NOMATCH); + EXPECT_EQ(regexec(®ex, "hell Friends", 0, NULL, REG_GLOBAL), REG_NOMATCH); + + regfree(®ex); +} + +TEST_CASE(simple_notbol_noteol) +{ + String pattern = "^hello friends$"; + regex_t regex; + + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED | REG_NOSUB | REG_ICASE), REG_NOERR); + + EXPECT_EQ(regexec(®ex, "hello friends", 0, NULL, REG_NOTBOL), REG_NOMATCH); + EXPECT_EQ(regexec(®ex, "hello friends", 0, NULL, REG_NOTEOL), REG_NOMATCH); + EXPECT_EQ(regexec(®ex, "hello friends", 0, NULL, REG_NOTBOL | REG_NOTEOL), REG_NOMATCH); + + EXPECT_EQ(regexec(®ex, "a hello friends b", 0, NULL, REG_NOTBOL), REG_NOMATCH); + EXPECT_EQ(regexec(®ex, "a hello friends", 0, NULL, REG_NOTBOL), REG_NOMATCH); + EXPECT_EQ(regexec(®ex, "a hello friends", 0, NULL, REG_NOTBOL | REG_SEARCH), REG_NOERR); + EXPECT_EQ(regexec(®ex, "a hello friends b", 0, NULL, REG_NOTBOL | REG_SEARCH), REG_NOERR); + + EXPECT_EQ(regexec(®ex, "a hello friends b", 0, NULL, REG_NOTEOL), REG_NOMATCH); + EXPECT_EQ(regexec(®ex, "hello friends b", 0, NULL, REG_NOTEOL), REG_NOMATCH); + EXPECT_EQ(regexec(®ex, "hello friends b", 0, NULL, REG_NOTEOL | REG_SEARCH), REG_NOERR); + EXPECT_EQ(regexec(®ex, "a hello friends b", 0, NULL, REG_NOTEOL | REG_SEARCH), REG_NOMATCH); + + EXPECT_EQ(regexec(®ex, "a hello friends b", 0, NULL, REG_NOTBOL | REG_NOTEOL), REG_NOMATCH); + EXPECT_EQ(regexec(®ex, "a hello friends b", 0, NULL, REG_NOTBOL | REG_NOTEOL | REG_SEARCH), REG_NOMATCH); + + pattern = "hello friends"; + EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED | REG_NOSUB | REG_ICASE), REG_NOERR); + + EXPECT_EQ(regexec(®ex, "hello friends", 0, NULL, REG_NOTBOL), REG_NOMATCH); + EXPECT_EQ(regexec(®ex, "hello friends", 0, NULL, REG_NOTEOL), REG_NOMATCH); + + regfree(®ex); +} + +//#if not(defined(REGEX_DEBUG) || defined(REGEX_MATCH_STATUS) || defined(DISABLE_REGEX_BENCHMARK)) +//BENCHMARK_CASE(simple_notbol_noteol_benchmark) +//{ +// String pattern = "^hello friends$"; +// regex_t regex; + +// EXPECT_EQ(regcomp(®ex, pattern.characters(), REG_EXTENDED | REG_NOSUB | REG_ICASE), REG_NOERR); + +// for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + +// EXPECT_EQ(regexec(®ex, "hello friends", 0, NULL, REG_NOTBOL), REG_NOMATCH); +// EXPECT_EQ(regexec(®ex, "hello friends", 0, NULL, REG_NOTEOL), REG_NOMATCH); +// EXPECT_EQ(regexec(®ex, "hello friends", 0, NULL, REG_NOTBOL | REG_NOTEOL), REG_NOMATCH); + +// EXPECT_EQ(regexec(®ex, "a hello friends b", 0, NULL, REG_NOTBOL), REG_NOMATCH); +// EXPECT_EQ(regexec(®ex, "a hello friends", 0, NULL, REG_NOTBOL), REG_NOMATCH); +// EXPECT_EQ(regexec(®ex, "a hello friends", 0, NULL, REG_NOTBOL | REG_SEARCH), REG_NOERR); +// EXPECT_EQ(regexec(®ex, "a hello friends b", 0, NULL, REG_NOTBOL | REG_SEARCH), REG_NOMATCH); + +// EXPECT_EQ(regexec(®ex, "a hello friends b", 0, NULL, REG_NOTEOL), REG_NOMATCH); +// EXPECT_EQ(regexec(®ex, "hello friends b", 0, NULL, REG_NOTEOL), REG_NOMATCH); +// EXPECT_EQ(regexec(®ex, "hello friends b", 0, NULL, REG_NOTEOL | REG_SEARCH), REG_NOERR); +// EXPECT_EQ(regexec(®ex, "a hello friends b", 0, NULL, REG_NOTEOL | REG_SEARCH), REG_NOMATCH); + +// EXPECT_EQ(regexec(®ex, "a hello friends b", 0, NULL, REG_NOTBOL | REG_NOTEOL), REG_NOMATCH); +// EXPECT_EQ(regexec(®ex, "a hello friends b", 0, NULL, REG_NOTBOL | REG_NOTEOL | REG_SEARCH), REG_NOERR); +// } + +// regfree(®ex); +//} + +BENCHMARK_CASE(simple_notbol_noteol_benchmark_reference_stdcpp_regex_match) +{ + std::regex re1("^hello friends$", std::regex_constants::match_not_bol); + std::regex re2("^hello friends$", std::regex_constants::match_not_eol); + std::regex re3("^hello friends$", std::regex_constants::match_not_bol | std::regex_constants::match_not_eol); + std::regex re4("hello friends", std::regex_constants::match_not_bol); + std::regex re5("hello friends", std::regex_constants::match_not_eol); + std::cmatch m; + //for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) { + EXPECT_EQ(std::regex_match("hello friends", m, re1), false); + EXPECT_EQ(std::regex_match("hello friends", m, re2), false); + EXPECT_EQ(std::regex_match("hello friends", m, re3), false); + + EXPECT_EQ(std::regex_match("a hello friends b", m, re1), false); + EXPECT_EQ(std::regex_match("a hello friends", m, re1), false); + EXPECT_EQ(std::regex_search("a hello friends", m, re1), true); + EXPECT_EQ(std::regex_search("a hello friends b", m, re1), true); + + EXPECT_EQ(std::regex_match("a hello friends b", m, re2), false); + EXPECT_EQ(std::regex_match("hello friends b", m, re2), false); + EXPECT_EQ(std::regex_search("hello friends b", m, re2), true); + EXPECT_EQ(std::regex_search("a hello friends b", m, re2), false); + + EXPECT_EQ(std::regex_match("a hello friends b", m, re3), false); + EXPECT_EQ(std::regex_search("a hello friends b", m, re3), false); + + EXPECT_EQ(std::regex_match("hello friends", m, re4), false); + EXPECT_EQ(std::regex_match("hello friends", m, re5), false); + //} +} +//#endif + +TEST_MAIN(Regex) diff --git a/Meta/Lagom/CMakeLists.txt b/Meta/Lagom/CMakeLists.txt index 3160964c23..d2c733212a 100644 --- a/Meta/Lagom/CMakeLists.txt +++ b/Meta/Lagom/CMakeLists.txt @@ -39,6 +39,8 @@ elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") endif() file(GLOB AK_SOURCES CONFIGURE_DEPENDS "../../AK/*.cpp") +file(GLOB LIBREGEX_LIBC_SOURCES "../../Libraries/LibRegex/C/Regex.cpp") +file(GLOB LIBREGEX_SOURCES CONFIGURE_DEPENDS "../../Libraries/LibRegex/*.cpp") file(GLOB LIBCORE_SOURCES CONFIGURE_DEPENDS "../../Libraries/LibCore/*.cpp") file(GLOB LIBELF_SOURCES CONFIGURE_DEPENDS "../../Libraries/LibELF/*.cpp") file(GLOB LIBGEMINI_SOURCES CONFIGURE_DEPENDS "../../Libraries/LibGemini/*.cpp") @@ -58,8 +60,9 @@ file(GLOB LIBTLS_SOURCES CONFIGURE_DEPENDS "../../Libraries/LibTLS/*.cpp") file(GLOB SHELL_SOURCES CONFIGURE_DEPENDS "../../Shell/*.cpp") file(GLOB SHELL_TESTS CONFIGURE_DEPENDS "../../Shell/Tests/*.sh") +set(LAGOM_REGEX_SOURCES ${LIBREGEX_LIBC_SOURCES} ${LIBREGEX_SOURCES}) set(LAGOM_CORE_SOURCES ${AK_SOURCES} ${LIBCORE_SOURCES}) -set(LAGOM_MORE_SOURCES ${LIBELF_SOURCES} ${LIBIPC_SOURCES} ${LIBLINE_SOURCES} ${LIBJS_SOURCES} ${LIBJS_SUBDIR_SOURCES} ${LIBX86_SOURCES} ${LIBCRYPTO_SOURCES} ${LIBCOMPRESS_SOURCES} ${LIBCRYPTO_SUBDIR_SOURCES} ${LIBTLS_SOURCES} ${LIBMARKDOWN_SOURCES} ${LIBGEMINI_SOURCES} ${LIBGFX_SOURCES} ${LIBHTTP_SOURCES}) +set(LAGOM_MORE_SOURCES ${LIBELF_SOURCES} ${LIBIPC_SOURCES} ${LIBLINE_SOURCES} ${LIBJS_SOURCES} ${LIBJS_SUBDIR_SOURCES} ${LIBX86_SOURCES} ${LIBCRYPTO_SOURCES} ${LIBCOMPRESS_SOURCES} ${LIBCRYPTO_SUBDIR_SOURCES} ${LIBTLS_SOURCES} ${LIBMARKDOWN_SOURCES} ${LIBGEMINI_SOURCES} ${LIBGFX_SOURCES} ${LIBHTTP_SOURCES} ${LAGOM_REGEX_SOURCES}) include_directories (../../) include_directories (../../Libraries/) |