summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--CMakeLists.txt1
-rw-r--r--Libraries/CMakeLists.txt11
-rw-r--r--Libraries/LibC/regex.h122
-rw-r--r--Libraries/LibRegex/C/Regex.cpp256
-rw-r--r--Libraries/LibRegex/CMakeLists.txt10
-rw-r--r--Libraries/LibRegex/Forward.h54
-rw-r--r--Libraries/LibRegex/Regex.h31
-rw-r--r--Libraries/LibRegex/RegexByteCode.cpp572
-rw-r--r--Libraries/LibRegex/RegexByteCode.h637
-rw-r--r--Libraries/LibRegex/RegexDebug.h153
-rw-r--r--Libraries/LibRegex/RegexError.h102
-rw-r--r--Libraries/LibRegex/RegexLexer.cpp210
-rw-r--r--Libraries/LibRegex/RegexLexer.h108
-rw-r--r--Libraries/LibRegex/RegexMatch.h100
-rw-r--r--Libraries/LibRegex/RegexMatcher.cpp313
-rw-r--r--Libraries/LibRegex/RegexMatcher.h186
-rw-r--r--Libraries/LibRegex/RegexOptions.h159
-rw-r--r--Libraries/LibRegex/RegexParser.cpp598
-rw-r--r--Libraries/LibRegex/RegexParser.cpp_803
-rw-r--r--Libraries/LibRegex/RegexParser.h145
-rw-r--r--Libraries/LibRegex/RegexParser.h_257
-rw-r--r--Libraries/LibRegex/Tests/Benchmark.cpp914
-rw-r--r--Libraries/LibRegex/Tests/CMakeLists.txt21
-rw-r--r--Libraries/LibRegex/Tests/Regex.cpp451
-rw-r--r--Libraries/LibRegex/Tests/RegexLibC.cpp1211
-rw-r--r--Meta/Lagom/CMakeLists.txt5
26 files changed, 7424 insertions, 6 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7ee9852f8d..c2df09f98d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -84,6 +84,7 @@ add_subdirectory(Meta/Lagom)
add_subdirectory(DevTools/IPCCompiler)
add_subdirectory(Libraries/LibWeb/CodeGenerators)
add_subdirectory(AK/Tests)
+add_subdirectory(Libraries/LibRegex/Tests)
set(write_if_different ${CMAKE_SOURCE_DIR}/Meta/write-only-on-difference.sh)
diff --git a/Libraries/CMakeLists.txt b/Libraries/CMakeLists.txt
index 795eeb0f95..1d205d3cc3 100644
--- a/Libraries/CMakeLists.txt
+++ b/Libraries/CMakeLists.txt
@@ -1,15 +1,17 @@
add_subdirectory(LibAudio)
add_subdirectory(LibC)
add_subdirectory(LibChess)
+add_subdirectory(LibCompress)
add_subdirectory(LibCore)
+add_subdirectory(LibCpp)
add_subdirectory(LibCrypt)
add_subdirectory(LibCrypto)
-add_subdirectory(LibCompress)
add_subdirectory(LibDebug)
add_subdirectory(LibDesktop)
+add_subdirectory(LibDiff)
+add_subdirectory(LibGUI)
add_subdirectory(LibGemini)
add_subdirectory(LibGfx)
-add_subdirectory(LibGUI)
add_subdirectory(LibHTTP)
add_subdirectory(LibIPC)
add_subdirectory(LibImageDecoderClient)
@@ -21,12 +23,11 @@ add_subdirectory(LibMarkdown)
add_subdirectory(LibPCIDB)
add_subdirectory(LibProtocol)
add_subdirectory(LibPthread)
+add_subdirectory(LibRegex)
+add_subdirectory(LibTLS)
add_subdirectory(LibTar)
add_subdirectory(LibTextCodec)
add_subdirectory(LibThread)
-add_subdirectory(LibTLS)
add_subdirectory(LibVT)
add_subdirectory(LibWeb)
add_subdirectory(LibX86)
-add_subdirectory(LibDiff)
-add_subdirectory(LibCpp)
diff --git a/Libraries/LibC/regex.h b/Libraries/LibC/regex.h
new file mode 100644
index 0000000000..3fbf8fb4c3
--- /dev/null
+++ b/Libraries/LibC/regex.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include <stddef.h>
+#include <sys/types.h>
+
+__BEGIN_DECLS
+
+typedef ssize_t regoff_t;
+
+struct regex_t {
+ void* __data;
+};
+
+enum __Regex_Error {
+ __Regex_NoError,
+ __Regex_InvalidPattern, // Invalid regular expression.
+ __Regex_InvalidCollationElement, // Invalid collating element referenced.
+ __Regex_InvalidCharacterClass, // Invalid character class type referenced.
+ __Regex_InvalidTrailingEscape, // Trailing \ in pattern.
+ __Regex_InvalidNumber, // Number in \digit invalid or in error.
+ __Regex_MismatchingBracket, // [ ] imbalance.
+ __Regex_MismatchingParen, // ( ) imbalance.
+ __Regex_MismatchingBrace, // { } imbalance.
+ __Regex_InvalidBraceContent, // Content of {} invalid: not a number, number too large, more than two numbers, first larger than second.
+ __Regex_InvalidBracketContent, // Content of [] invalid.
+ __Regex_InvalidRange, // Invalid endpoint in range expression.
+ __Regex_InvalidRepetitionMarker, // ?, * or + not preceded by valid regular expression.
+ __Regex_ReachedMaxRecursion, // MaximumRecursion has been reached.
+ __Regex_EmptySubExpression, // Sub expression has empty content.
+ __Regex_InvalidCaptureGroup, // Content of capture group is invalid.
+ __Regex_InvalidNameForCaptureGroup, // Name of capture group is invalid.
+};
+
+enum ReError {
+ REG_NOERR = __Regex_NoError,
+ REG_BADPAT = __Regex_InvalidPattern, // Invalid regular expression.
+ REG_ECOLLATE = __Regex_InvalidCollationElement, // Invalid collating element referenced.
+ REG_ECTYPE = __Regex_InvalidCharacterClass, // Invalid character class type referenced.
+ REG_EESCAPE = __Regex_InvalidTrailingEscape, // Trailing \ in pattern.
+ REG_ESUBREG = __Regex_InvalidNumber, // Number in \digit invalid or in error.
+ REG_EBRACK = __Regex_MismatchingBracket, // [ ] imbalance.
+ REG_EPAREN = __Regex_MismatchingParen, // \( \) or ( ) imbalance.
+ REG_EBRACE = __Regex_MismatchingBrace, // \{ \} imbalance.
+ REG_BADBR = __Regex_InvalidBraceContent, // Content of \{ \} invalid: not a number, number too large, more than two numbers, first larger than second.
+ REG_ERANGE = __Regex_InvalidRange, // Invalid endpoint in range expression.
+ REG_BADRPT = __Regex_InvalidRepetitionMarker, // ?, * or + not preceded by valid regular expression.
+ REG_EMPTY_EXPR = __Regex_EmptySubExpression, // Empty expression
+ REG_ENOSYS, // The implementation does not support the function.
+ REG_ESPACE, // Out of memory.
+ REG_NOMATCH, // regexec() failed to match.
+};
+
+struct regmatch_t {
+ regoff_t rm_so; // byte offset from start of string to start of substring
+ regoff_t rm_eo; // byte offset from start of string of the first character after the end of substring
+ regoff_t rm_cnt; // number of matches
+};
+
+enum __RegexAllFlags {
+ __Regex_Global = 1, // All matches (don't return after first match)
+ __Regex_Insensitive = __Regex_Global << 1, // Case insensitive match (ignores case of [a-zA-Z])
+ __Regex_Ungreedy = __Regex_Global << 2, // The match becomes lazy by default. Now a ? following a quantifier makes it greedy
+ __Regex_Unicode = __Regex_Global << 3, // Enable all unicode features and interpret all unicode escape sequences as such
+ __Regex_Extended = __Regex_Global << 4, // Ignore whitespaces. Spaces and text after a # in the pattern are ignored
+ __Regex_Extra = __Regex_Global << 5, // Disallow meaningless escapes. A \ followed by a letter with no special meaning is faulted
+ __Regex_MatchNotBeginOfLine = __Regex_Global << 6, // Pattern is not forced to ^ -> search in whole string!
+ __Regex_MatchNotEndOfLine = __Regex_Global << 7, // Don't Force the dollar sign, $, to always match end of the string, instead of end of the line. This option is ignored if the Multiline-flag is set
+ __Regex_SkipSubExprResults = __Regex_Global << 8, // Do not return sub expressions in the result
+ __Regex_StringCopyMatches = __Regex_Global << 9, // Do explicitly copy results into new allocated string instead of StringView to original string.
+ __Regex_SingleLine = __Regex_Global << 10, // Dot matches newline characters
+ __Regex_Sticky = __Regex_Global << 11, // Force the pattern to only match consecutive matches from where the previous match ended.
+ __Regex_Multiline = __Regex_Global << 12, // Handle newline characters. Match each line, one by one.
+ __Regex_SkipTrimEmptyMatches = __Regex_Global << 13, // Do not remove empty capture group results.
+ __Regex_Last = __Regex_SkipTrimEmptyMatches
+};
+
+// Values for the cflags parameter to the regcomp() function:
+#define REG_EXTENDED __Regex_Extended // Use Extended Regular Expressions.
+#define REG_ICASE __Regex_Insensitive // Ignore case in match.
+#define REG_NOSUB __Regex_SkipSubExprResults // Report only success or fail in regexec().
+#define REG_GLOBAL __Regex_Global // Don't stop searching for more match
+#define REG_NEWLINE (__Regex_Multiline | REG_GLOBAL) // Change the handling of newline.
+
+// Values for the eflags parameter to the regexec() function:
+#define REG_NOTBOL __Regex_MatchNotBeginOfLine // The circumflex character (^), when taken as a special character, will not match the beginning of string.
+#define REG_NOTEOL __Regex_MatchNotEndOfLine // The dollar sign ($), when taken as a special character, will not match the end of string.
+
+//static_assert (sizeof(FlagsUnderlyingType) * 8 >= regex::POSIXFlags::Last << 1), "flags type too small")
+#define REG_SEARCH __Regex_Last << 1
+
+int regcomp(regex_t*, const char*, int);
+int regexec(const regex_t*, const char*, size_t, regmatch_t[], int);
+size_t regerror(int, const regex_t*, char*, size_t);
+void regfree(regex_t*);
+
+__END_DECLS
diff --git a/Libraries/LibRegex/C/Regex.cpp b/Libraries/LibRegex/C/Regex.cpp
new file mode 100644
index 0000000000..7bb36204fb
--- /dev/null
+++ b/Libraries/LibRegex/C/Regex.cpp
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <AK/String.h>
+#include <AK/StringBuilder.h>
+#include <LibRegex/Regex.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <string.h>
+
+#ifdef __serenity__
+# include <regex.h>
+#else
+# include <LibC/regex.h>
+#endif
+
+struct internal_regex_t {
+ u8 cflags;
+ u8 eflags;
+ OwnPtr<Regex<PosixExtended>> re;
+ size_t re_pat_errpos;
+ ReError re_pat_err;
+ String re_pat;
+ size_t re_nsub;
+};
+
+static internal_regex_t* impl_from(regex_t* re)
+{
+ if (!re)
+ return nullptr;
+
+ return reinterpret_cast<internal_regex_t*>(re->__data);
+}
+
+static const internal_regex_t* impl_from(const regex_t* re)
+{
+ return impl_from(const_cast<regex_t*>(re));
+}
+
+extern "C" {
+
+int regcomp(regex_t* reg, const char* pattern, int cflags)
+{
+ if (!reg)
+ return REG_ESPACE;
+
+ // Note that subsequent uses of regcomp() without regfree() _will_ leak memory
+ // This could've been prevented if libc provided a reginit() or similar, but it does not.
+ reg->__data = new internal_regex_t { 0, 0, nullptr, 0, ReError::REG_NOERR, {}, 0 };
+
+ auto preg = impl_from(reg);
+
+ if (!(cflags & REG_EXTENDED))
+ return REG_ENOSYS;
+
+ preg->cflags = cflags;
+
+ String pattern_str(pattern);
+ preg->re = make<Regex<PosixExtended>>(pattern_str, PosixOptions {} | (PosixFlags)cflags);
+
+ auto parser_result = preg->re->parser_result;
+ if (parser_result.error != regex::Error::NoError) {
+ preg->re_pat_errpos = parser_result.error_token.position();
+ preg->re_pat_err = (ReError)parser_result.error;
+ preg->re_pat = pattern;
+
+ dbg() << "Have Error: " << (ReError)parser_result.error;
+
+ return (ReError)parser_result.error;
+ }
+
+ preg->re_nsub = parser_result.capture_groups_count;
+
+ return REG_NOERR;
+}
+
+int regexec(const regex_t* reg, const char* string, size_t nmatch, regmatch_t pmatch[], int eflags)
+{
+ auto preg = impl_from(reg);
+
+ if (!preg->re || preg->re_pat_err) {
+ if (preg->re_pat_err)
+ return preg->re_pat_err;
+ return REG_BADPAT;
+ }
+
+ RegexResult result;
+ if (eflags & REG_SEARCH)
+ result = preg->re->search({ string }, PosixOptions {} | (PosixFlags)eflags);
+ else
+ result = preg->re->match({ string }, PosixOptions {} | (PosixFlags)eflags);
+
+ if (result.success) {
+ auto size = result.matches.size();
+ if (size && nmatch && pmatch) {
+ pmatch[0].rm_cnt = size;
+
+ size_t match_index { 0 };
+ for (size_t i = 0; i < size; ++i) {
+ pmatch[match_index].rm_so = result.matches.at(i).global_offset;
+ pmatch[match_index].rm_eo = pmatch[match_index].rm_so + result.matches.at(i).view.length();
+ if (match_index > 0)
+ pmatch[match_index].rm_cnt = result.capture_group_matches.size();
+
+ ++match_index;
+ if (match_index >= nmatch)
+ return REG_NOERR;
+
+ if (i < result.capture_group_matches.size()) {
+ auto capture_groups_size = result.capture_group_matches.at(i).size();
+ for (size_t j = 0; j < preg->re->parser_result.capture_groups_count; ++j) {
+ if (j >= capture_groups_size || !result.capture_group_matches.at(i).at(j).view.length()) {
+ pmatch[match_index].rm_so = -1;
+ pmatch[match_index].rm_eo = -1;
+ pmatch[match_index].rm_cnt = 0;
+ } else {
+ pmatch[match_index].rm_so = result.capture_group_matches.at(i).at(j).global_offset;
+ pmatch[match_index].rm_eo = pmatch[match_index].rm_so + result.capture_group_matches.at(i).at(j).view.length();
+ pmatch[match_index].rm_cnt = 1;
+ }
+
+ ++match_index;
+ if (match_index >= nmatch)
+ return REG_NOERR;
+ }
+ }
+ }
+
+ if (match_index < nmatch) {
+ for (size_t i = match_index; i < nmatch; ++i) {
+ pmatch[i].rm_so = -1;
+ pmatch[i].rm_eo = -1;
+ pmatch[i].rm_cnt = 0;
+ }
+ }
+ }
+ return REG_NOERR;
+ } else {
+ if (nmatch && pmatch) {
+ pmatch[0].rm_so = -1;
+ pmatch[0].rm_eo = -1;
+ pmatch[0].rm_cnt = 0;
+ }
+ }
+
+ return REG_NOMATCH;
+}
+
+inline static String get_error(ReError errcode)
+{
+ String error;
+ switch ((ReError)errcode) {
+ case REG_NOERR:
+ error = "No error";
+ break;
+ case REG_NOMATCH:
+ error = "regexec() failed to match.";
+ break;
+ case REG_BADPAT:
+ error = "Invalid regular expression.";
+ break;
+ case REG_ECOLLATE:
+ error = "Invalid collating element referenced.";
+ break;
+ case REG_ECTYPE:
+ error = "Invalid character class type referenced.";
+ break;
+ case REG_EESCAPE:
+ error = "Trailing \\ in pattern.";
+ break;
+ case REG_ESUBREG:
+ error = "Number in \\digit invalid or in error.";
+ break;
+ case REG_EBRACK:
+ error = "[ ] imbalance.";
+ break;
+ case REG_EPAREN:
+ error = "\\( \\) or ( ) imbalance.";
+ break;
+ case REG_EBRACE:
+ error = "\\{ \\} imbalance.";
+ break;
+ case REG_BADBR:
+ error = "Content of \\{ \\} invalid: not a number, number too large, more than two numbers, first larger than second.";
+ break;
+ case REG_ERANGE:
+ error = "Invalid endpoint in range expression.";
+ break;
+ case REG_ESPACE:
+ error = "Out of memory.";
+ break;
+ case REG_BADRPT:
+ error = "?, * or + not preceded by valid regular expression.";
+ break;
+ case REG_ENOSYS:
+ error = "The implementation does not support the function.";
+ break;
+ case REG_EMPTY_EXPR:
+ error = "Empty expression provided";
+ break;
+ }
+
+ return error;
+}
+
+size_t regerror(int errcode, const regex_t* reg, char* errbuf, size_t errbuf_size)
+{
+ String error;
+ auto preg = impl_from(reg);
+
+ if (!preg)
+ error = get_error((ReError)errcode);
+ else
+ error = preg->re->error_string(get_error(preg->re_pat_err));
+
+ if (!errbuf_size)
+ return error.length();
+
+ if (!error.copy_characters_to_buffer(errbuf, errbuf_size))
+ return 0;
+
+ return error.length();
+}
+
+void regfree(regex_t* reg)
+{
+ auto preg = impl_from(reg);
+ if (preg) {
+ delete preg;
+ reg->__data = nullptr;
+ }
+}
+}
diff --git a/Libraries/LibRegex/CMakeLists.txt b/Libraries/LibRegex/CMakeLists.txt
new file mode 100644
index 0000000000..44cc6928f9
--- /dev/null
+++ b/Libraries/LibRegex/CMakeLists.txt
@@ -0,0 +1,10 @@
+set(SOURCES
+ C/Regex.cpp
+ RegexByteCode.cpp
+ RegexLexer.cpp
+ RegexMatcher.cpp
+ RegexParser.cpp
+)
+
+serenity_lib(LibRegex regex)
+target_link_libraries(LibRegex LibC LibCore)
diff --git a/Libraries/LibRegex/Forward.h b/Libraries/LibRegex/Forward.h
new file mode 100644
index 0000000000..b35a0f3476
--- /dev/null
+++ b/Libraries/LibRegex/Forward.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include <AK/Types.h>
+
+namespace regex {
+enum class Error : u8;
+class Lexer;
+class PosixExtendedParser;
+
+class ByteCode;
+class OpCode;
+class OpCode_Exit;
+class OpCode_Jump;
+class OpCode_ForkJump;
+class OpCode_ForkStay;
+class OpCode_CheckBegin;
+class OpCode_CheckEnd;
+class OpCode_SaveLeftCaptureGroup;
+class OpCode_SaveRightCaptureGroup;
+class OpCode_SaveLeftNamedCaptureGroup;
+class OpCode_SaveNamedLeftCaptureGroup;
+class OpCode_SaveRightNamedCaptureGroup;
+class OpCode_Compare;
+}
+
+using regex::Error;
+using regex::Lexer;
+using regex::PosixExtendedParser;
diff --git a/Libraries/LibRegex/Regex.h b/Libraries/LibRegex/Regex.h
new file mode 100644
index 0000000000..d3fdcec7f4
--- /dev/null
+++ b/Libraries/LibRegex/Regex.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include <LibRegex/Forward.h>
+#include <LibRegex/RegexDebug.h>
+#include <LibRegex/RegexMatcher.h>
diff --git a/Libraries/LibRegex/RegexByteCode.cpp b/Libraries/LibRegex/RegexByteCode.cpp
new file mode 100644
index 0000000000..27bd16002e
--- /dev/null
+++ b/Libraries/LibRegex/RegexByteCode.cpp
@@ -0,0 +1,572 @@
+/*
+ * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "RegexByteCode.h"
+#include "AK/StringBuilder.h"
+#include "RegexDebug.h"
+
+#include <ctype.h>
+
+namespace regex {
+
+const char* OpCode::name(OpCodeId opcode_id)
+{
+ switch (opcode_id) {
+#define __ENUMERATE_OPCODE(x) \
+ case OpCodeId::x: \
+ return #x;
+ ENUMERATE_OPCODES
+#undef __ENUMERATE_OPCODE
+ default:
+ ASSERT_NOT_REACHED();
+ return "<Unknown>";
+ }
+}
+
+const char* OpCode::name() const
+{
+ return name(opcode_id());
+}
+
+const char* execution_result_name(ExecutionResult result)
+{
+ switch (result) {
+#define __ENUMERATE_EXECUTION_RESULT(x) \
+ case ExecutionResult::x: \
+ return #x;
+ ENUMERATE_EXECUTION_RESULTS
+#undef __ENUMERATE_EXECUTION_RESULT
+ default:
+ ASSERT_NOT_REACHED();
+ return "<Unknown>";
+ }
+}
+
+const char* character_compare_type_name(CharacterCompareType ch_compare_type)
+{
+ switch (ch_compare_type) {
+#define __ENUMERATE_CHARACTER_COMPARE_TYPE(x) \
+ case CharacterCompareType::x: \
+ return #x;
+ ENUMERATE_CHARACTER_COMPARE_TYPES
+#undef __ENUMERATE_CHARACTER_COMPARE_TYPE
+ default:
+ ASSERT_NOT_REACHED();
+ return "<Unknown>";
+ }
+}
+
+static const char* character_class_name(CharClass ch_class)
+{
+ switch (ch_class) {
+#define __ENUMERATE_CHARACTER_CLASS(x) \
+ case CharClass::x: \
+ return #x;
+ ENUMERATE_CHARACTER_CLASSES
+#undef __ENUMERATE_CHARACTER_CLASS
+ default:
+ ASSERT_NOT_REACHED();
+ return "<Unknown>";
+ }
+}
+
+HashMap<u32, OwnPtr<OpCode>> ByteCode::s_opcodes {};
+
+ALWAYS_INLINE OpCode* ByteCode::get_opcode_by_id(OpCodeId id) const
+{
+ if (!s_opcodes.size()) {
+ for (u32 i = (u32)OpCodeId::First; i <= (u32)OpCodeId::Last; ++i) {
+ switch ((OpCodeId)i) {
+ case OpCodeId::Exit:
+ s_opcodes.set(i, make<OpCode_Exit>(*const_cast<ByteCode*>(this)));
+ break;
+ case OpCodeId::Jump:
+ s_opcodes.set(i, make<OpCode_Jump>(*const_cast<ByteCode*>(this)));
+ break;
+ case OpCodeId::Compare:
+ s_opcodes.set(i, make<OpCode_Compare>(*const_cast<ByteCode*>(this)));
+ break;
+ case OpCodeId::CheckEnd:
+ s_opcodes.set(i, make<OpCode_CheckEnd>(*const_cast<ByteCode*>(this)));
+ break;
+ case OpCodeId::ForkJump:
+ s_opcodes.set(i, make<OpCode_ForkJump>(*const_cast<ByteCode*>(this)));
+ break;
+ case OpCodeId::ForkStay:
+ s_opcodes.set(i, make<OpCode_ForkStay>(*const_cast<ByteCode*>(this)));
+ break;
+ case OpCodeId::CheckBegin:
+ s_opcodes.set(i, make<OpCode_CheckBegin>(*const_cast<ByteCode*>(this)));
+ break;
+ case OpCodeId::SaveLeftCaptureGroup:
+ s_opcodes.set(i, make<OpCode_SaveLeftCaptureGroup>(*const_cast<ByteCode*>(this)));
+ break;
+ case OpCodeId::SaveRightCaptureGroup:
+ s_opcodes.set(i, make<OpCode_SaveRightCaptureGroup>(*const_cast<ByteCode*>(this)));
+ break;
+ case OpCodeId::SaveLeftNamedCaptureGroup:
+ s_opcodes.set(i, make<OpCode_SaveLeftNamedCaptureGroup>(*const_cast<ByteCode*>(this)));
+ break;
+ case OpCodeId::SaveRightNamedCaptureGroup:
+ s_opcodes.set(i, make<OpCode_SaveRightNamedCaptureGroup>(*const_cast<ByteCode*>(this)));
+ break;
+ }
+ }
+ }
+
+ if (id > OpCodeId::Last)
+ return nullptr;
+
+ return const_cast<OpCode*>(s_opcodes.get((u32)id).value())->set_bytecode(*const_cast<ByteCode*>(this));
+}
+
+OpCode* ByteCode::get_opcode(MatchState& state) const
+{
+ OpCode* op_code;
+
+ if (state.instruction_position >= size()) {
+ op_code = get_opcode_by_id(OpCodeId::Exit);
+ } else
+ op_code = get_opcode_by_id((OpCodeId)at(state.instruction_position));
+
+ if (op_code)
+ op_code->set_state(state);
+
+ return op_code;
+}
+
+ALWAYS_INLINE ExecutionResult OpCode_Exit::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
+{
+ if (state.string_position > input.view.length() || state.instruction_position >= m_bytecode->size())
+ return ExecutionResult::Succeeded;
+
+ return ExecutionResult::Failed;
+}
+
+ALWAYS_INLINE ExecutionResult OpCode_Jump::execute(const MatchInput&, MatchState& state, MatchOutput&) const
+{
+
+ state.instruction_position += offset();
+ return ExecutionResult::Continue;
+}
+
+ALWAYS_INLINE ExecutionResult OpCode_ForkJump::execute(const MatchInput&, MatchState& state, MatchOutput&) const
+{
+ state.fork_at_position = state.instruction_position + size() + offset();
+ return ExecutionResult::Fork_PrioHigh;
+}
+
+ALWAYS_INLINE ExecutionResult OpCode_ForkStay::execute(const MatchInput&, MatchState& state, MatchOutput&) const
+{
+ state.fork_at_position = state.instruction_position + size() + offset();
+ return ExecutionResult::Fork_PrioLow;
+}
+
+ALWAYS_INLINE ExecutionResult OpCode_CheckBegin::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
+{
+ if (0 == state.string_position && (input.regex_options & AllFlags::MatchNotBeginOfLine))
+ return ExecutionResult::Failed;
+
+ if ((0 == state.string_position && !(input.regex_options & AllFlags::MatchNotBeginOfLine))
+ || (0 != state.string_position && (input.regex_options & AllFlags::MatchNotBeginOfLine))
+ || (0 == state.string_position && (input.regex_options & AllFlags::Global)))
+ return ExecutionResult::Continue;
+
+ return ExecutionResult::Failed;
+}
+
+ALWAYS_INLINE ExecutionResult OpCode_CheckEnd::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
+{
+ if (state.string_position == input.view.length() && (input.regex_options & AllFlags::MatchNotEndOfLine))
+ return ExecutionResult::Failed;
+
+ if ((state.string_position == input.view.length() && !(input.regex_options & AllFlags::MatchNotEndOfLine))
+ || (state.string_position != input.view.length() && (input.regex_options & AllFlags::MatchNotEndOfLine || input.regex_options & AllFlags::MatchNotBeginOfLine)))
+ return ExecutionResult::Succeeded;
+
+ return ExecutionResult::Failed;
+}
+
+ALWAYS_INLINE ExecutionResult OpCode_SaveLeftCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
+{
+ if (input.match_index >= output.capture_group_matches.size()) {
+ output.capture_group_matches.ensure_capacity(input.match_index);
+ auto capacity = output.capture_group_matches.capacity();
+ for (size_t i = output.capture_group_matches.size(); i <= capacity; ++i)
+ output.capture_group_matches.empend();
+ }
+
+ if (id() >= output.capture_group_matches.at(input.match_index).size()) {
+ output.capture_group_matches.at(input.match_index).ensure_capacity(id());
+ auto capacity = output.capture_group_matches.at(input.match_index).capacity();
+ for (size_t i = output.capture_group_matches.at(input.match_index).size(); i <= capacity; ++i)
+ output.capture_group_matches.at(input.match_index).empend();
+ }
+
+ output.capture_group_matches.at(input.match_index).at(id()).left_column = state.string_position;
+ return ExecutionResult::Continue;
+}
+
+ALWAYS_INLINE ExecutionResult OpCode_SaveRightCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
+{
+ auto& match = output.capture_group_matches.at(input.match_index).at(id());
+ auto start_position = match.left_column;
+ auto length = state.string_position - start_position;
+
+ if (start_position < match.column)
+ return ExecutionResult::Continue;
+
+ ASSERT(start_position + length <= input.view.length());
+
+ auto view = input.view.substring_view(start_position, length);
+
+ if (input.regex_options & AllFlags::StringCopyMatches) {
+ match = { view.to_string(), input.line, start_position, input.global_offset + start_position }; // create a copy of the original string
+ } else {
+ match = { view, input.line, start_position, input.global_offset + start_position }; // take view to original string
+ }
+
+ return ExecutionResult::Continue;
+}
+
+ALWAYS_INLINE ExecutionResult OpCode_SaveLeftNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
+{
+ if (input.match_index >= output.named_capture_group_matches.size()) {
+ output.named_capture_group_matches.ensure_capacity(input.match_index);
+ auto capacity = output.named_capture_group_matches.capacity();
+ for (size_t i = output.named_capture_group_matches.size(); i <= capacity; ++i)
+ output.named_capture_group_matches.empend();
+ }
+ output.named_capture_group_matches.at(input.match_index).ensure(name()).column = state.string_position;
+ return ExecutionResult::Continue;
+}
+
+ALWAYS_INLINE ExecutionResult OpCode_SaveRightNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
+{
+ StringView capture_group_name = name();
+
+ if (output.named_capture_group_matches.at(input.match_index).contains(capture_group_name)) {
+ auto start_position = output.named_capture_group_matches.at(input.match_index).ensure(capture_group_name).column;
+ auto length = state.string_position - start_position;
+
+ auto& map = output.named_capture_group_matches.at(input.match_index);
+
+#ifdef REGEX_DEBUG
+ ASSERT(start_position + length < input.view.length());
+ dbg() << "Save named capture group with name=" << capture_group_name << " and content: " << input.view.substring_view(start_position, length).to_string();
+#endif
+
+ ASSERT(start_position + length <= input.view.length());
+ auto view = input.view.substring_view(start_position, length);
+ if (input.regex_options & AllFlags::StringCopyMatches) {
+ map.set(capture_group_name, { view.to_string(), input.line, start_position, input.global_offset + start_position }); // create a copy of the original string
+ } else {
+ map.set(capture_group_name, { view, input.line, start_position, input.global_offset + start_position }); // take view to original string
+ }
+ } else {
+ fprintf(stderr, "Didn't find corresponding capture group match for name=%s, match_index=%lu\n", capture_group_name.to_string().characters(), input.match_index);
+ }
+
+ return ExecutionResult::Continue;
+}
+
+ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
+{
+ bool inverse { false };
+
+ size_t string_position = state.string_position;
+ bool inverse_matched { false };
+
+ size_t offset { state.instruction_position + 3 };
+ for (size_t i = 0; i < arguments_count(); ++i) {
+ if (state.string_position > string_position)
+ break;
+
+ auto compare_type = (CharacterCompareType)m_bytecode->at(offset++);
+
+ if (compare_type == CharacterCompareType::Inverse)
+ inverse = true;
+
+ else if (compare_type == CharacterCompareType::Char) {
+ char ch = m_bytecode->at(offset++);
+
+ // We want to compare a string that is longer or equal in length to the available string
+ if (input.view.length() - state.string_position < 1)
+ return ExecutionResult::Failed_ExecuteLowPrioForks;
+
+ compare_char(input, state, ch, inverse, inverse_matched);
+
+ } else if (compare_type == CharacterCompareType::AnyChar) {
+ // We want to compare a string that is definitely longer than the available string
+ if (input.view.length() - state.string_position < 1)
+ return ExecutionResult::Failed_ExecuteLowPrioForks;
+
+ ASSERT(!inverse);
+ ++state.string_position;
+
+ } else if (compare_type == CharacterCompareType::String) {
+ ASSERT(!inverse);
+
+ char* str = reinterpret_cast<char*>(m_bytecode->at(offset++));
+ auto& length = m_bytecode->at(offset++);
+
+ // We want to compare a string that is definitely longer than the available string
+ if (input.view.length() - state.string_position < length)
+ return ExecutionResult::Failed_ExecuteLowPrioForks;
+
+ if (!compare_string(input, state, str, length))
+ return ExecutionResult::Failed_ExecuteLowPrioForks;
+
+ } else if (compare_type == CharacterCompareType::CharClass) {
+
+ if (input.view.length() - state.string_position < 1)
+ return ExecutionResult::Failed_ExecuteLowPrioForks;
+
+ auto character_class = (CharClass)m_bytecode->at(offset++);
+ auto& ch = input.view[state.string_position];
+
+ compare_character_class(input, state, character_class, ch, inverse, inverse_matched);
+
+ } else if (compare_type == CharacterCompareType::CharRange) {
+ auto value = (CharRange)m_bytecode->at(offset++);
+
+ auto from = value.from;
+ auto to = value.to;
+ auto ch = input.view[state.string_position];
+
+ compare_character_range(input, state, from, to, ch, inverse, inverse_matched);
+
+ } else {
+ fprintf(stderr, "Undefined comparison: %i\n", (int)compare_type);
+ ASSERT_NOT_REACHED();
+ break;
+ }
+ }
+
+ if (inverse && !inverse_matched)
+ ++state.string_position;
+
+ if (string_position == state.string_position || state.string_position > input.view.length())
+ return ExecutionResult::Failed_ExecuteLowPrioForks;
+
+ return ExecutionResult::Continue;
+}
+
+ALWAYS_INLINE void OpCode_Compare::compare_char(const MatchInput& input, MatchState& state, char& ch, bool inverse, bool& inverse_matched) const
+{
+ auto ch1 = ch;
+ auto ch2 = input.view[state.string_position];
+
+ if (input.regex_options & AllFlags::Insensitive) {
+ ch1 = tolower(ch1);
+ ch2 = tolower(ch2);
+ }
+
+ if (ch1 == ch2) {
+ if (inverse)
+ inverse_matched = true;
+ else
+ ++state.string_position;
+ }
+}
+
+ALWAYS_INLINE bool OpCode_Compare::compare_string(const MatchInput& input, MatchState& state, const char* str, size_t length) const
+{
+ auto str_view1 = StringView(str, length);
+ auto str_view2 = StringView(&input.view[state.string_position], length);
+ String str1, str2;
+ if (input.regex_options & AllFlags::Insensitive) {
+ str1 = str_view1.to_string().to_lowercase();
+ str2 = str_view2.to_string().to_lowercase();
+ str_view1 = str1.view();
+ str_view2 = str2.view();
+ }
+
+ if (str_view1 == str_view2) {
+ state.string_position += length;
+ return true;
+ } else
+ return false;
+}
+
+ALWAYS_INLINE void OpCode_Compare::compare_character_class(const MatchInput& input, MatchState& state, CharClass character_class, char ch, bool inverse, bool& inverse_matched) const
+{
+ switch (character_class) {
+ case CharClass::Alnum:
+ if (isalnum(ch)) {
+ if (inverse)
+ inverse_matched = true;
+ else
+ ++state.string_position;
+ }
+ break;
+ case CharClass::Alpha:
+ if (isalpha(ch))
+ ++state.string_position;
+ break;
+ case CharClass::Blank:
+ if (ch == ' ' || ch == '\t') {
+ if (inverse)
+ inverse_matched = true;
+ else
+ ++state.string_position;
+ }
+ break;
+ case CharClass::Cntrl:
+ if (iscntrl(ch)) {
+ if (inverse)
+ inverse_matched = true;
+ else
+ ++state.string_position;
+ }
+ break;
+ case CharClass::Digit:
+ if (isdigit(ch)) {
+ if (inverse)
+ inverse_matched = true;
+ else
+ ++state.string_position;
+ }
+ break;
+ case CharClass::Graph:
+ if (isgraph(ch)) {
+ if (inverse)
+ inverse_matched = true;
+ else
+ ++state.string_position;
+ }
+ break;
+ case CharClass::Lower:
+ if (islower(ch) || ((input.regex_options & AllFlags::Insensitive) && isupper(ch))) {
+ if (inverse)
+ inverse_matched = true;
+ else
+ ++state.string_position;
+ }
+ break;
+ case CharClass::Print:
+ if (isprint(ch)) {
+ if (inverse)
+ inverse_matched = true;
+ else
+ ++state.string_position;
+ }
+ break;
+ case CharClass::Punct:
+ if (ispunct(ch)) {
+ if (inverse)
+ inverse_matched = true;
+ else
+ ++state.string_position;
+ }
+ break;
+ case CharClass::Space:
+ if (isspace(ch)) {
+ if (inverse)
+ inverse_matched = true;
+ else
+ ++state.string_position;
+ }
+ break;
+ case CharClass::Upper:
+ if (isupper(ch) || ((input.regex_options & AllFlags::Insensitive) && islower(ch))) {
+ if (inverse)
+ inverse_matched = true;
+ else
+ ++state.string_position;
+ }
+ break;
+ case CharClass::Xdigit:
+ if (isxdigit(ch)) {
+ if (inverse)
+ inverse_matched = true;
+ else
+ ++state.string_position;
+ }
+ break;
+ }
+}
+
+ALWAYS_INLINE void OpCode_Compare::compare_character_range(const MatchInput& input, MatchState& state, char from, char to, char ch, bool inverse, bool& inverse_matched) const
+{
+ if (input.regex_options & AllFlags::Insensitive) {
+ from = tolower(from);
+ to = tolower(to);
+ ch = tolower(ch);
+ }
+
+ if (ch >= from && ch <= to) {
+ if (inverse)
+ inverse_matched = true;
+ else
+ ++state.string_position;
+ }
+}
+
+const String OpCode_Compare::arguments_string() const
+{
+ return String::format("argc=%lu, args=%lu ", arguments_count(), arguments_size());
+}
+
+const Vector<String> OpCode_Compare::variable_arguments_to_string(Optional<MatchInput> input) const
+{
+ Vector<String> result;
+
+ size_t offset { state().instruction_position + 3 };
+ StringView view;
+ if (input.has_value())
+ view = input.value().view;
+
+ for (size_t i = 0; i < arguments_count(); ++i) {
+ auto compare_type = (CharacterCompareType)m_bytecode->at(offset++);
+ result.empend(String::format("type=%lu [%s]", (size_t)compare_type, character_compare_type_name(compare_type)));
+
+ if (compare_type == CharacterCompareType::Char) {
+ char ch = m_bytecode->at(offset++);
+ result.empend(String::format("value='%c'", ch));
+ if (!view.is_null())
+ result.empend(String::format("compare against: '%s'", String { view.substring_view(state().string_position, state().string_position + 1 > view.length() ? 0 : 1) }.characters()));
+ } else if (compare_type == CharacterCompareType::String) {
+ char* str = reinterpret_cast<char*>(m_bytecode->at(offset++));
+ auto& length = m_bytecode->at(offset++);
+ result.empend(String::format("value=\"%s\"", String { str, length }.characters()));
+ if (!view.is_null())
+ result.empend(String::format("compare against: \"%s\"", String { input.value().view.substring_view(state().string_position, state().string_position + length > view.length() ? 0 : length) }.characters()));
+ } else if (compare_type == CharacterCompareType::CharClass) {
+ auto character_class = (CharClass)m_bytecode->at(offset++);
+ result.empend(String::format("ch_class=%lu [%s]", (size_t)character_class, character_class_name(character_class)));
+ if (!view.is_null())
+ result.empend(String::format("compare against: '%s'", String { input.value().view.substring_view(state().string_position, state().string_position + 1 > view.length() ? 0 : 1) }.characters()));
+ } else if (compare_type == CharacterCompareType::CharRange) {
+ auto value = (CharRange)m_bytecode->at(offset++);
+ result.empend(String::format("ch_range='%c'-'%c'", value.from, value.to));
+ if (!view.is_null())
+ result.empend(String::format("compare against: '%s'", String { input.value().view.substring_view(state().string_position, state().string_position + 1 > view.length() ? 0 : 1) }.characters()));
+ }
+ }
+ return result;
+}
+}
diff --git a/Libraries/LibRegex/RegexByteCode.h b/Libraries/LibRegex/RegexByteCode.h
new file mode 100644
index 0000000000..d3ffdcef67
--- /dev/null
+++ b/Libraries/LibRegex/RegexByteCode.h
@@ -0,0 +1,637 @@
+/*
+ * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include "RegexMatch.h"
+#include "RegexOptions.h"
+
+#include <AK/Forward.h>
+#include <AK/HashMap.h>
+#include <AK/NonnullOwnPtr.h>
+#include <AK/OwnPtr.h>
+#include <AK/Traits.h>
+#include <AK/Types.h>
+#include <AK/Vector.h>
+
+namespace regex {
+
+using ByteCodeValueType = size_t;
+
+#define ENUMERATE_OPCODES \
+ __ENUMERATE_OPCODE(Compare) \
+ __ENUMERATE_OPCODE(Jump) \
+ __ENUMERATE_OPCODE(ForkJump) \
+ __ENUMERATE_OPCODE(ForkStay) \
+ __ENUMERATE_OPCODE(SaveLeftCaptureGroup) \
+ __ENUMERATE_OPCODE(SaveRightCaptureGroup) \
+ __ENUMERATE_OPCODE(SaveLeftNamedCaptureGroup) \
+ __ENUMERATE_OPCODE(SaveRightNamedCaptureGroup) \
+ __ENUMERATE_OPCODE(CheckBegin) \
+ __ENUMERATE_OPCODE(CheckEnd) \
+ __ENUMERATE_OPCODE(Exit)
+
+enum class OpCodeId : ByteCodeValueType {
+#define __ENUMERATE_OPCODE(x) x,
+ ENUMERATE_OPCODES
+#undef __ENUMERATE_OPCODE
+
+ First
+ = Compare,
+ Last
+ = Exit,
+};
+
+#define ENUMERATE_CHARACTER_COMPARE_TYPES \
+ __ENUMERATE_CHARACTER_COMPARE_TYPE(Undefined) \
+ __ENUMERATE_CHARACTER_COMPARE_TYPE(Inverse) \
+ __ENUMERATE_CHARACTER_COMPARE_TYPE(AnyChar) \
+ __ENUMERATE_CHARACTER_COMPARE_TYPE(Char) \
+ __ENUMERATE_CHARACTER_COMPARE_TYPE(String) \
+ __ENUMERATE_CHARACTER_COMPARE_TYPE(CharClass) \
+ __ENUMERATE_CHARACTER_COMPARE_TYPE(CharRange) \
+ __ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy)
+
+enum class CharacterCompareType : ByteCodeValueType {
+#define __ENUMERATE_CHARACTER_COMPARE_TYPE(x) x,
+ ENUMERATE_CHARACTER_COMPARE_TYPES
+#undef __ENUMERATE_CHARACTER_COMPARE_TYPE
+};
+
+#define ENUMERATE_CHARACTER_CLASSES \
+ __ENUMERATE_CHARACTER_CLASS(Alnum) \
+ __ENUMERATE_CHARACTER_CLASS(Cntrl) \
+ __ENUMERATE_CHARACTER_CLASS(Lower) \
+ __ENUMERATE_CHARACTER_CLASS(Space) \
+ __ENUMERATE_CHARACTER_CLASS(Alpha) \
+ __ENUMERATE_CHARACTER_CLASS(Digit) \
+ __ENUMERATE_CHARACTER_CLASS(Print) \
+ __ENUMERATE_CHARACTER_CLASS(Upper) \
+ __ENUMERATE_CHARACTER_CLASS(Blank) \
+ __ENUMERATE_CHARACTER_CLASS(Graph) \
+ __ENUMERATE_CHARACTER_CLASS(Punct) \
+ __ENUMERATE_CHARACTER_CLASS(Xdigit)
+
+enum class CharClass : ByteCodeValueType {
+#define __ENUMERATE_CHARACTER_CLASS(x) x,
+ ENUMERATE_CHARACTER_CLASSES
+#undef __ENUMERATE_CHARACTER_CLASS
+};
+
+struct CharRange {
+ const char from;
+ const char to;
+
+ CharRange(size_t value)
+ : from(value >> 8)
+ , to(value & 0xFF)
+ {
+ }
+
+ CharRange(char from, char to)
+ : from(from)
+ , to(to)
+ {
+ }
+
+ operator ByteCodeValueType() const { return (from << 8) | to; }
+};
+
+struct CompareTypeAndValuePair {
+ CharacterCompareType type;
+ ByteCodeValueType value;
+};
+
+class OpCode;
+
+class ByteCode : public Vector<ByteCodeValueType> {
+public:
+ ByteCode() = default;
+ virtual ~ByteCode() = default;
+
+ void insert_bytecode_compare_values(Vector<CompareTypeAndValuePair>&& pairs)
+ {
+ ByteCode bytecode;
+
+ bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::Compare));
+ bytecode.empend(pairs.size()); // number of arguments
+
+ ByteCode arguments;
+ for (auto& value : pairs) {
+ ASSERT(value.type != CharacterCompareType::RangeExpressionDummy);
+ ASSERT(value.type != CharacterCompareType::Undefined);
+ ASSERT(value.type != CharacterCompareType::String);
+
+ arguments.append((ByteCodeValueType)value.type);
+ if (value.type != CharacterCompareType::Inverse && value.type != CharacterCompareType::AnyChar)
+ arguments.append(move(value.value));
+ }
+
+ bytecode.empend(arguments.size()); // size of arguments
+ bytecode.append(move(arguments));
+
+ append(move(bytecode));
+ }
+
+ void insert_bytecode_compare_string(StringView view, size_t length)
+ {
+ ByteCode bytecode;
+
+ bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::Compare));
+ bytecode.empend(1); // number of arguments
+
+ ByteCode arguments;
+
+ arguments.empend(static_cast<ByteCodeValueType>(CharacterCompareType::String));
+ arguments.empend(reinterpret_cast<ByteCodeValueType>(view.characters_without_null_termination()));
+ arguments.empend(length);
+
+ bytecode.empend(arguments.size()); // size of arguments
+ bytecode.append(move(arguments));
+
+ append(move(bytecode));
+ }
+
+ void insert_bytecode_group_capture_left(size_t capture_groups_count)
+ {
+ empend(static_cast<ByteCodeValueType>(OpCodeId::SaveLeftCaptureGroup));
+ empend(capture_groups_count);
+ }
+
+ void insert_bytecode_group_capture_left(const StringView& name)
+ {
+ empend(static_cast<ByteCodeValueType>(OpCodeId::SaveLeftNamedCaptureGroup));
+ empend(reinterpret_cast<ByteCodeValueType>(name.characters_without_null_termination()));
+ empend(name.length());
+ }
+
+ void insert_bytecode_group_capture_right(size_t capture_groups_count)
+ {
+ empend(static_cast<ByteCodeValueType>(OpCodeId::SaveRightCaptureGroup));
+ empend(capture_groups_count);
+ }
+
+ void insert_bytecode_group_capture_right(const StringView& name)
+ {
+ empend(static_cast<ByteCodeValueType>(OpCodeId::SaveRightNamedCaptureGroup));
+ empend(reinterpret_cast<ByteCodeValueType>(name.characters_without_null_termination()));
+ empend(name.length());
+ }
+
+ void insert_bytecode_alternation(ByteCode&& left, ByteCode&& right)
+ {
+
+ // FORKSTAY _ALT
+ // REGEXP ALT1
+ // JUMP _END
+ // LABEL _ALT
+ // REGEXP ALT2
+ // LABEL _END
+
+ ByteCode byte_code;
+
+ empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
+ empend(left.size() + 2); // Jump to the _ALT label
+
+ for (auto& op : left)
+ append(move(op));
+
+ empend(static_cast<ByteCodeValueType>(OpCodeId::Jump));
+ empend(right.size()); // Jump to the _END label
+
+ // LABEL _ALT = bytecode.size() + 2
+
+ for (auto& op : right)
+ append(move(op));
+
+ // LABEL _END = alterantive_bytecode.size
+ }
+
+ void insert_bytecode_repetition_min_max(ByteCode& bytecode_to_repeat, size_t minimum, Optional<size_t> maximum)
+ {
+ ByteCode new_bytecode;
+ new_bytecode.insert_bytecode_repetition_n(bytecode_to_repeat, minimum);
+
+ if (maximum.has_value()) {
+ if (maximum.value() > minimum) {
+ auto diff = maximum.value() - minimum;
+ new_bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkStay));
+ new_bytecode.empend(diff * (bytecode_to_repeat.size() + 2)); // Jump to the _END label
+
+ for (size_t i = 0; i < diff; ++i) {
+ new_bytecode.append(bytecode_to_repeat);
+ new_bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkStay));
+ new_bytecode.empend((diff - i - 1) * (bytecode_to_repeat.size() + 2)); // Jump to the _END label
+ }
+ }
+ } else {
+ // no maximum value set, repeat finding if possible
+ new_bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
+ new_bytecode.empend(-bytecode_to_repeat.size() - 2); // Jump to the last iteration
+ }
+
+ bytecode_to_repeat = move(new_bytecode);
+ }
+
+ void insert_bytecode_repetition_n(ByteCode& bytecode_to_repeat, size_t n)
+ {
+ for (size_t i = 0; i < n; ++i)
+ append(bytecode_to_repeat);
+ }
+
+ void insert_bytecode_repetition_min_one(ByteCode& bytecode_to_repeat, bool greedy)
+ {
+ // LABEL _START = -bytecode_to_repeat.size()
+ // REGEXP
+ // FORKJUMP _START (FORKSTAY -> Greedy)
+
+ if (greedy)
+ bytecode_to_repeat.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkStay));
+ else
+ bytecode_to_repeat.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
+
+ bytecode_to_repeat.empend(-(bytecode_to_repeat.size() + 1)); // Jump to the _START label
+ }
+
+ void insert_bytecode_repetition_any(ByteCode& bytecode_to_repeat, bool greedy)
+ {
+ // LABEL _START
+ // FORKSTAY _END (FORKJUMP -> Greedy)
+ // REGEXP
+ // JUMP _START
+ // LABEL _END
+
+ // LABEL _START = m_bytes.size();
+ ByteCode bytecode;
+
+ if (greedy)
+ bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
+ else
+ bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkStay));
+
+ bytecode.empend(bytecode_to_repeat.size() + 2); // Jump to the _END label
+
+ for (auto& op : bytecode_to_repeat)
+ bytecode.append(move(op));
+
+ bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::Jump));
+ bytecode.empend(-bytecode.size() - 1); // Jump to the _START label
+ // LABEL _END = bytecode.size()
+
+ bytecode_to_repeat = move(bytecode);
+ }
+
+ void insert_bytecode_repetition_zero_or_one(ByteCode& bytecode_to_repeat, bool greedy)
+ {
+ // FORKSTAY _END (FORKJUMP -> Greedy)
+ // REGEXP
+ // LABEL _END
+ ByteCode bytecode;
+
+ if (greedy)
+ bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
+ else
+ bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkStay));
+
+ bytecode.empend(bytecode_to_repeat.size()); // Jump to the _END label
+
+ for (auto& op : bytecode_to_repeat)
+ bytecode.append(move(op));
+ // LABEL _END = bytecode.size()
+
+ bytecode_to_repeat = move(bytecode);
+ }
+
+ OpCode* get_opcode(MatchState& state) const;
+
+private:
+ ALWAYS_INLINE OpCode* get_opcode_by_id(OpCodeId id) const;
+ static HashMap<u32, OwnPtr<OpCode>> s_opcodes;
+};
+
+#define ENUMERATE_EXECUTION_RESULTS \
+ __ENUMERATE_EXECUTION_RESULT(Continue) \
+ __ENUMERATE_EXECUTION_RESULT(Fork_PrioHigh) \
+ __ENUMERATE_EXECUTION_RESULT(Fork_PrioLow) \
+ __ENUMERATE_EXECUTION_RESULT(Failed) \
+ __ENUMERATE_EXECUTION_RESULT(Failed_ExecuteLowPrioForks) \
+ __ENUMERATE_EXECUTION_RESULT(Succeeded)
+
+enum class ExecutionResult : u8 {
+#define __ENUMERATE_EXECUTION_RESULT(x) x,
+ ENUMERATE_EXECUTION_RESULTS
+#undef __ENUMERATE_EXECUTION_RESULT
+};
+
+const char* execution_result_name(ExecutionResult result);
+const char* opcode_id_name(OpCodeId opcode_id);
+const char* character_compare_type_name(CharacterCompareType result);
+const char* execution_result_name(ExecutionResult result);
+
+class OpCode {
+public:
+ OpCode(ByteCode& bytecode)
+ : m_bytecode(&bytecode)
+ {
+ }
+
+ virtual ~OpCode() = default;
+
+ virtual OpCodeId opcode_id() const = 0;
+ virtual size_t size() const = 0;
+ virtual ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const = 0;
+
+ ALWAYS_INLINE ByteCodeValueType argument(size_t offset) const
+ {
+ ASSERT(state().instruction_position + offset <= m_bytecode->size());
+ return m_bytecode->at(state().instruction_position + 1 + offset);
+ }
+
+ ALWAYS_INLINE const char* name() const;
+ static const char* name(const OpCodeId);
+
+ ALWAYS_INLINE OpCode* set_state(MatchState& state)
+ {
+ m_state = &state;
+ return this;
+ }
+
+ ALWAYS_INLINE OpCode* set_bytecode(ByteCode& bytecode)
+ {
+ m_bytecode = &bytecode;
+ return this;
+ }
+
+ ALWAYS_INLINE void reset_state() { m_state.clear(); }
+
+ ALWAYS_INLINE const MatchState& state() const
+ {
+ ASSERT(m_state.has_value());
+ return *m_state.value();
+ }
+
+ const String to_string() const
+ {
+ return String::format("[0x%02X] %s", opcode_id(), name(opcode_id()));
+ }
+
+ virtual const String arguments_string() const = 0;
+
+ ALWAYS_INLINE const ByteCode& bytecode() const { return *m_bytecode; }
+
+protected:
+ ByteCode* m_bytecode;
+ Optional<MatchState*> m_state;
+};
+
+
+class OpCode_Exit final : public OpCode {
+public:
+ OpCode_Exit(ByteCode& bytecode)
+ : OpCode(bytecode)
+ {
+ }
+ ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
+ ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Exit; }
+ ALWAYS_INLINE size_t size() const override { return 1; }
+ const String arguments_string() const override { return ""; }
+};
+
+class OpCode_Jump final : public OpCode {
+public:
+ OpCode_Jump(ByteCode& bytecode)
+ : OpCode(bytecode)
+ {
+ }
+ ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
+ ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Jump; }
+ ALWAYS_INLINE size_t size() const override { return 2; }
+ ALWAYS_INLINE ssize_t offset() const { return argument(0); }
+ const String arguments_string() const override
+ {
+ return String::format("offset=%i [&%lu]", offset(), state().instruction_position + size() + offset());
+ }
+};
+
+class OpCode_ForkJump final : public OpCode {
+public:
+ OpCode_ForkJump(ByteCode& bytecode)
+ : OpCode(bytecode)
+ {
+ }
+ ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
+ ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ForkJump; }
+ ALWAYS_INLINE size_t size() const override { return 2; }
+ ALWAYS_INLINE ssize_t offset() const { return argument(0); }
+ const String arguments_string() const override
+ {
+ return String::format("offset=%i [&%lu], sp: %lu", offset(), state().instruction_position + size() + offset(), state().string_position);
+ }
+};
+
+class OpCode_ForkStay final : public OpCode {
+public:
+ OpCode_ForkStay(ByteCode& bytecode)
+ : OpCode(bytecode)
+ {
+ }
+ ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
+ ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ForkStay; }
+ ALWAYS_INLINE size_t size() const override { return 2; }
+ ALWAYS_INLINE ssize_t offset() const { return argument(0); }
+ const String arguments_string() const override
+ {
+ return String::format("offset=%i [&%lu], sp: %lu", offset(), state().instruction_position + size() + offset(), state().string_position);
+ }
+};
+
+class OpCode_CheckBegin final : public OpCode {
+public:
+ OpCode_CheckBegin(ByteCode& bytecode)
+ : OpCode(bytecode)
+ {
+ }
+ ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
+ ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::CheckBegin; }
+ ALWAYS_INLINE size_t size() const override { return 1; }
+ const String arguments_string() const override { return ""; }
+};
+
+class OpCode_CheckEnd final : public OpCode {
+public:
+ OpCode_CheckEnd(ByteCode& bytecode)
+ : OpCode(bytecode)
+ {
+ }
+ ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
+ ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::CheckEnd; }
+ ALWAYS_INLINE size_t size() const override { return 1; }
+ const String arguments_string() const override { return ""; }
+};
+
+class OpCode_SaveLeftCaptureGroup final : public OpCode {
+public:
+ OpCode_SaveLeftCaptureGroup(ByteCode& bytecode)
+ : OpCode(bytecode)
+ {
+ }
+ ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
+ ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveLeftCaptureGroup; }
+ ALWAYS_INLINE size_t size() const override { return 2; }
+ ALWAYS_INLINE size_t id() const { return argument(0); }
+ const String arguments_string() const override { return String::format("id=%lu", id()); }
+};
+
+class OpCode_SaveRightCaptureGroup final : public OpCode {
+public:
+ OpCode_SaveRightCaptureGroup(ByteCode& bytecode)
+ : OpCode(bytecode)
+ {
+ }
+ ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
+ ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveRightCaptureGroup; }
+ ALWAYS_INLINE size_t size() const override { return 2; }
+ ALWAYS_INLINE size_t id() const { return argument(0); }
+ const String arguments_string() const override { return String::format("id=%lu", id()); }
+};
+
+class OpCode_SaveLeftNamedCaptureGroup final : public OpCode {
+public:
+ OpCode_SaveLeftNamedCaptureGroup(ByteCode& bytecode)
+ : OpCode(bytecode)
+ {
+ }
+ ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
+ ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveLeftNamedCaptureGroup; }
+ ALWAYS_INLINE size_t size() const override { return 3; }
+ ALWAYS_INLINE StringView name() const { return { reinterpret_cast<char*>(argument(0)), length() }; }
+ ALWAYS_INLINE size_t length() const { return argument(1); }
+ const String arguments_string() const override
+ {
+ return String::format("name=%s, length=%lu", name().to_string().characters(), length());
+ }
+};
+
+class OpCode_SaveRightNamedCaptureGroup final : public OpCode {
+public:
+ OpCode_SaveRightNamedCaptureGroup(ByteCode& bytecode)
+ : OpCode(bytecode)
+ {
+ }
+ ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
+ ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveRightNamedCaptureGroup; }
+ ALWAYS_INLINE size_t size() const override { return 3; }
+ ALWAYS_INLINE StringView name() const { return { reinterpret_cast<char*>(argument(0)), length() }; }
+ ALWAYS_INLINE size_t length() const { return argument(1); }
+ const String arguments_string() const override
+ {
+ return String::format("name=%s, length=%lu", name().to_string().characters(), length());
+ }
+};
+
+class OpCode_Compare final : public OpCode {
+public:
+ OpCode_Compare(ByteCode& bytecode)
+ : OpCode(bytecode)
+ {
+ }
+ ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
+ ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Compare; }
+ ALWAYS_INLINE size_t size() const override { return arguments_size() + 3; }
+ ALWAYS_INLINE size_t arguments_count() const { return argument(0); }
+ ALWAYS_INLINE size_t arguments_size() const { return argument(1); }
+ const String arguments_string() const override;
+ const Vector<String> variable_arguments_to_string(Optional<MatchInput> input = {}) const;
+
+private:
+ ALWAYS_INLINE void compare_char(const MatchInput& input, MatchState& state, char& ch, bool inverse, bool& inverse_matched) const;
+ ALWAYS_INLINE bool compare_string(const MatchInput& input, MatchState& state, const char* str, size_t length) const;
+ ALWAYS_INLINE void compare_character_class(const MatchInput& input, MatchState& state, CharClass character_class, char ch, bool inverse, bool& inverse_matched) const;
+ ALWAYS_INLINE void compare_character_range(const MatchInput& input, MatchState& state, char from, char to, char ch, bool inverse, bool& inverse_matched) const;
+};
+
+template<typename T>
+bool is(const OpCode&);
+
+template<typename T>
+ALWAYS_INLINE bool is(const OpCode&)
+{
+ return false;
+}
+
+template<typename T>
+ALWAYS_INLINE bool is(const OpCode* opcode)
+{
+ return is<T>(*opcode);
+}
+
+template<>
+ALWAYS_INLINE bool is<OpCode_ForkStay>(const OpCode& opcode)
+{
+ return opcode.opcode_id() == OpCodeId::ForkStay;
+}
+
+template<>
+ALWAYS_INLINE bool is<OpCode_Exit>(const OpCode& opcode)
+{
+ return opcode.opcode_id() == OpCodeId::Exit;
+}
+
+template<>
+ALWAYS_INLINE bool is<OpCode_Compare>(const OpCode& opcode)
+{
+ return opcode.opcode_id() == OpCodeId::Compare;
+}
+
+template<typename T>
+ALWAYS_INLINE const T& to(const OpCode& opcode)
+{
+ ASSERT(is<T>(opcode));
+ return static_cast<const T&>(opcode);
+}
+
+template<typename T>
+ALWAYS_INLINE T* to(OpCode* opcode)
+{
+ ASSERT(is<T>(opcode));
+ return static_cast<T*>(opcode);
+}
+
+template<typename T>
+ALWAYS_INLINE const T* to(const OpCode* opcode)
+{
+ ASSERT(is<T>(opcode));
+ return static_cast<const T*>(opcode);
+}
+
+template<typename T>
+ALWAYS_INLINE T& to(OpCode& opcode)
+{
+ ASSERT(is<T>(opcode));
+ return static_cast<T&>(opcode);
+}
+
+}
diff --git a/Libraries/LibRegex/RegexDebug.h b/Libraries/LibRegex/RegexDebug.h
new file mode 100644
index 0000000000..94c04eddd1
--- /dev/null
+++ b/Libraries/LibRegex/RegexDebug.h
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include "LibRegex/RegexMatcher.h"
+#include "AK/StringBuilder.h"
+
+//#define REGEX_DEBUG
+
+#ifdef REGEX_DEBUG
+
+namespace regex {
+
+class RegexDebug {
+public:
+ RegexDebug(FILE* file = stdout)
+ : m_file(file)
+ {
+ }
+
+ virtual ~RegexDebug() = default;
+
+ template<typename T>
+ void print_raw_bytecode(Regex<T>& regex) const
+ {
+ auto& bytecode = regex.parser_result.bytecode;
+ size_t index { 0 };
+ for (auto& value : bytecode) {
+ fprintf(m_file, "OpCode i=%3lu [0x%02X]\n", index, (u32)value);
+ ++index;
+ }
+ }
+
+ template<typename T>
+ void print_bytecode(Regex<T>& regex) const
+ {
+ MatchState state;
+ auto& bytecode = regex.parser_result.bytecode;
+
+ for (;;) {
+ auto* opcode = bytecode.get_opcode(state);
+ if (!opcode) {
+ dbg() << "Wrong opcode... failed!";
+ return;
+ }
+
+ print_opcode("PrintBytecode", *opcode, state);
+ fprintf(m_file, "%s", m_debug_stripline.characters());
+
+ if (is<OpCode_Exit>(*opcode))
+ break;
+
+ state.instruction_position += opcode->size();
+ }
+
+ fflush(m_file);
+ }
+
+ void print_opcode(const String& system, OpCode& opcode, MatchState& state, size_t recursion = 0, bool newline = true) const
+ {
+ fprintf(m_file, "%-15s | %-5lu | %-9lu | %-35s | %-30s | %-20s%s",
+ system.characters(),
+ state.instruction_position,
+ recursion,
+ opcode.to_string().characters(),
+ opcode.arguments_string().characters(),
+ String::format("ip: %3lu, sp: %3lu", state.instruction_position, state.string_position).characters(),
+ newline ? "\n" : "");
+
+ if (newline && is<OpCode_Compare>(opcode)) {
+ for (auto& line : to<OpCode_Compare>(opcode).variable_arguments_to_string()) {
+ fprintf(m_file, "%-15s | %-5s | %-9s | %-35s | %-30s | %-20s%s", "", "", "", "", line.characters(), "", "\n");
+ }
+ }
+ }
+
+ void print_result(const OpCode& opcode, const ByteCode& bytecode, const MatchInput& input, MatchState& state, ExecutionResult result) const
+ {
+ StringBuilder builder;
+ builder.append(execution_result_name(result));
+ if (result == ExecutionResult::Succeeded) {
+ builder.appendf(", ip: %lu/%lu, sp: %lu/%lu", state.instruction_position, bytecode.size() - 1, state.string_position, input.view.length() - 1);
+ } else if (result == ExecutionResult::Fork_PrioHigh) {
+ builder.appendf(", next ip: %lu", state.fork_at_position + opcode.size());
+ } else if (result != ExecutionResult::Failed) {
+ builder.appendf(", next ip: %lu", state.instruction_position + opcode.size());
+ }
+
+ fprintf(m_file, " | %-20s\n", builder.to_string().characters());
+
+ if (is<OpCode_Compare>(opcode)) {
+ for (auto& line : to<OpCode_Compare>(opcode).variable_arguments_to_string(input)) {
+ fprintf(m_file, "%-15s | %-5s | %-9s | %-35s | %-30s | %-20s%s", "", "", "", "", line.characters(), "", "\n");
+ }
+ }
+
+ fprintf(m_file, "%s", m_debug_stripline.characters());
+ }
+
+ void print_header()
+ {
+ StringBuilder builder;
+ builder.appendf("%-15s | %-5s | %-9s | %-35s | %-30s | %-20s | %-20s\n", "System", "Index", "Recursion", "OpCode", "Arguments", "State", "Result");
+ auto length = builder.length();
+ for (size_t i = 0; i < length; ++i) {
+ builder.append('=');
+ }
+
+ fprintf(m_file, "%s\n", builder.to_string().characters());
+ fflush(m_file);
+
+ builder.clear();
+ for (size_t i = 0; i < length; ++i) {
+ builder.append('-');
+ }
+ builder.append('\n');
+ m_debug_stripline = builder.to_string();
+ }
+
+private:
+ String m_debug_stripline;
+ FILE* m_file;
+};
+
+}
+
+using regex::RegexDebug;
+
+#endif
diff --git a/Libraries/LibRegex/RegexError.h b/Libraries/LibRegex/RegexError.h
new file mode 100644
index 0000000000..90d6a71eca
--- /dev/null
+++ b/Libraries/LibRegex/RegexError.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include <AK/String.h>
+#include <AK/Types.h>
+#ifdef __serenity__
+# include <regex.h>
+#else
+# include <LibC/regex.h>
+#endif
+
+namespace regex {
+
+enum class Error : u8 {
+ NoError = __Regex_NoError,
+ InvalidPattern = __Regex_InvalidPattern, // Invalid regular expression.
+ InvalidCollationElement = __Regex_InvalidCollationElement, // Invalid collating element referenced.
+ InvalidCharacterClass = __Regex_InvalidCharacterClass, // Invalid character class type referenced.
+ InvalidTrailingEscape = __Regex_InvalidTrailingEscape, // Trailing \ in pattern.
+ InvalidNumber = __Regex_InvalidNumber, // Number in \digit invalid or in error.
+ MismatchingBracket = __Regex_MismatchingBracket, // [ ] imbalance.
+ MismatchingParen = __Regex_MismatchingParen, // ( ) imbalance.
+ MismatchingBrace = __Regex_MismatchingBrace, // { } imbalance.
+ InvalidBraceContent = __Regex_InvalidBraceContent, // Content of {} invalid: not a number, number too large, more than two numbers, first larger than second.
+ InvalidBracketContent = __Regex_InvalidBracketContent, // Content of [] invalid.
+ InvalidRange = __Regex_InvalidRange, // Invalid endpoint in range expression.
+ InvalidRepetitionMarker = __Regex_InvalidRepetitionMarker, // ?, * or + not preceded by valid regular expression.
+ ReachedMaxRecursion = __Regex_ReachedMaxRecursion, // MaximumRecursion has been reached.
+ EmptySubExpression = __Regex_EmptySubExpression, // Sub expression has empty content.
+ InvalidCaptureGroup = __Regex_InvalidCaptureGroup, // Content of capture group is invalid.
+ InvalidNameForCaptureGroup = __Regex_InvalidNameForCaptureGroup, // Name of capture group is invalid.
+};
+
+inline String get_error_string(Error error)
+{
+ switch (error) {
+ case Error::NoError:
+ return "No error";
+ case Error::InvalidPattern:
+ return "Invalid regular expression.";
+ case Error::InvalidCollationElement:
+ return "Invalid collating element referenced.";
+ case Error::InvalidCharacterClass:
+ return "Invalid character class type referenced.";
+ case Error::InvalidTrailingEscape:
+ return "Trailing \\ in pattern.";
+ case Error::InvalidNumber:
+ return "Number in \\digit invalid or in error.";
+ case Error::MismatchingBracket:
+ return "[ ] imbalance.";
+ case Error::MismatchingParen:
+ return "( ) imbalance.";
+ case Error::MismatchingBrace:
+ return "{ } imbalance.";
+ case Error::InvalidBraceContent:
+ return "Content of {} invalid: not a number, number too large, more than two numbers, first larger than second.";
+ case Error::InvalidBracketContent:
+ return "Content of [] invalid.";
+ case Error::InvalidRange:
+ return "Invalid endpoint in range expression.";
+ case Error::InvalidRepetitionMarker:
+ return "?, * or + not preceded by valid regular expression.";
+ case Error::ReachedMaxRecursion:
+ return "Maximum recursion has been reached.";
+ case Error::EmptySubExpression:
+ return "Sub expression has empty content.";
+ case Error::InvalidCaptureGroup:
+ return "Content of capture group is invalid.";
+ case Error::InvalidNameForCaptureGroup:
+ return "Name of capture group is invalid.";
+ }
+ return "Undefined error.";
+}
+}
+
+using regex::Error;
+using regex::get_error_string;
diff --git a/Libraries/LibRegex/RegexLexer.cpp b/Libraries/LibRegex/RegexLexer.cpp
new file mode 100644
index 0000000000..1cac7b1222
--- /dev/null
+++ b/Libraries/LibRegex/RegexLexer.cpp
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "RegexLexer.h"
+#include <AK/Assertions.h>
+#include <stdio.h>
+
+namespace regex {
+
+const char* Token::name(const TokenType type)
+{
+ switch (type) {
+#define __ENUMERATE_REGEX_TOKEN(x) \
+ case TokenType::x: \
+ return #x;
+ ENUMERATE_REGEX_TOKENS
+#undef __ENUMERATE_REGEX_TOKEN
+ default:
+ ASSERT_NOT_REACHED();
+ return "<Unknown>";
+ }
+}
+
+const char* Token::name() const
+{
+ return name(m_type);
+}
+
+Lexer::Lexer(const StringView source)
+ : m_source(source)
+{
+}
+
+ALWAYS_INLINE char Lexer::peek(size_t offset) const
+{
+ if ((m_position + offset) >= m_source.length())
+ return EOF;
+ return m_source[m_position + offset];
+}
+
+void Lexer::back(size_t offset)
+{
+ m_position -= offset;
+ m_previous_position = m_position - 1;
+ m_current_char = m_source[m_position];
+}
+
+ALWAYS_INLINE void Lexer::consume()
+{
+ m_previous_position = m_position;
+
+ if (m_position >= m_source.length()) {
+ m_position = m_source.length() + 1;
+ m_current_char = EOF;
+ return;
+ }
+
+ m_current_char = m_source[m_position++];
+}
+
+void Lexer::reset()
+{
+ m_position = 0;
+ m_current_token = { TokenType::Eof, 0, StringView(nullptr) };
+ m_current_char = 0;
+ m_previous_position = 0;
+}
+
+Token Lexer::next()
+{
+ size_t token_start_position;
+
+ auto begin_token = [&] {
+ token_start_position = m_position;
+ };
+
+ auto commit_token = [&](auto type) -> Token& {
+ ASSERT(token_start_position + m_previous_position - token_start_position + 1 <= m_source.length());
+ auto substring = m_source.substring_view(token_start_position, m_previous_position - token_start_position + 1);
+ m_current_token = Token(type, token_start_position, substring);
+ return m_current_token;
+ };
+
+ auto emit_token = [&](auto type) -> Token& {
+ m_current_token = Token(type, m_position, m_source.substring_view(m_position, 1));
+ consume();
+ return m_current_token;
+ };
+
+ auto match_escape_sequence = [&]() -> size_t {
+ switch (peek(1)) {
+ case '^':
+ case '.':
+ case '[':
+ case ']':
+ case '$':
+ case '(':
+ case ')':
+ case '|':
+ case '*':
+ case '+':
+ case '?':
+ case '{':
+ case '\\':
+ return 2;
+ default:
+ fprintf(stderr, "[LEXER] Found invalid escape sequence: \\%c\n", peek(1));
+ return 0;
+ }
+ };
+
+ while (m_position <= m_source.length()) {
+ auto ch = peek();
+ if (ch == '(')
+ return emit_token(TokenType::LeftParen);
+
+ if (ch == ')')
+ return emit_token(TokenType::RightParen);
+
+ if (ch == '{')
+ return emit_token(TokenType::LeftCurly);
+
+ if (ch == '}')
+ return emit_token(TokenType::RightCurly);
+
+ if (ch == '[')
+ return emit_token(TokenType::LeftBracket);
+
+ if (ch == ']')
+ return emit_token(TokenType::RightBracket);
+
+ if (ch == '.')
+ return emit_token(TokenType::Period);
+
+ if (ch == '*')
+ return emit_token(TokenType::Asterisk);
+
+ if (ch == '+')
+ return emit_token(TokenType::Plus);
+
+ if (ch == '$')
+ return emit_token(TokenType::Dollar);
+
+ if (ch == '^')
+ return emit_token(TokenType::Circumflex);
+
+ if (ch == '|')
+ return emit_token(TokenType::Pipe);
+
+ if (ch == '?')
+ return emit_token(TokenType::Questionmark);
+
+ if (ch == ',')
+ return emit_token(TokenType::Comma);
+
+ if (ch == '/')
+ return emit_token(TokenType::Slash);
+
+ if (ch == '=')
+ return emit_token(TokenType::EqualSign);
+
+ if (ch == ':')
+ return emit_token(TokenType::Colon);
+
+ if (ch == '-')
+ return emit_token(TokenType::HyphenMinus);
+
+ if (ch == '\\') {
+ size_t escape = match_escape_sequence();
+ if (escape > 0) {
+ begin_token();
+ for (size_t i = 0; i < escape; ++i)
+ consume();
+ return commit_token(TokenType::EscapeSequence);
+ }
+ }
+
+ if (ch == EOF)
+ break;
+
+ return emit_token(TokenType::Char);
+ }
+
+ return Token(TokenType::Eof, m_position, nullptr);
+}
+
+}
diff --git a/Libraries/LibRegex/RegexLexer.h b/Libraries/LibRegex/RegexLexer.h
new file mode 100644
index 0000000000..77a7017990
--- /dev/null
+++ b/Libraries/LibRegex/RegexLexer.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include <AK/Forward.h>
+#include <AK/StringView.h>
+
+namespace regex {
+
+#define ENUMERATE_REGEX_TOKENS \
+ __ENUMERATE_REGEX_TOKEN(Eof) \
+ __ENUMERATE_REGEX_TOKEN(Char) \
+ __ENUMERATE_REGEX_TOKEN(Circumflex) \
+ __ENUMERATE_REGEX_TOKEN(Period) \
+ __ENUMERATE_REGEX_TOKEN(LeftParen) \
+ __ENUMERATE_REGEX_TOKEN(RightParen) \
+ __ENUMERATE_REGEX_TOKEN(LeftCurly) \
+ __ENUMERATE_REGEX_TOKEN(RightCurly) \
+ __ENUMERATE_REGEX_TOKEN(LeftBracket) \
+ __ENUMERATE_REGEX_TOKEN(RightBracket) \
+ __ENUMERATE_REGEX_TOKEN(Asterisk) \
+ __ENUMERATE_REGEX_TOKEN(EscapeSequence) \
+ __ENUMERATE_REGEX_TOKEN(Dollar) \
+ __ENUMERATE_REGEX_TOKEN(Pipe) \
+ __ENUMERATE_REGEX_TOKEN(Plus) \
+ __ENUMERATE_REGEX_TOKEN(Comma) \
+ __ENUMERATE_REGEX_TOKEN(Slash) \
+ __ENUMERATE_REGEX_TOKEN(EqualSign) \
+ __ENUMERATE_REGEX_TOKEN(HyphenMinus) \
+ __ENUMERATE_REGEX_TOKEN(Colon) \
+ __ENUMERATE_REGEX_TOKEN(Questionmark)
+
+enum class TokenType {
+#define __ENUMERATE_REGEX_TOKEN(x) x,
+ ENUMERATE_REGEX_TOKENS
+#undef __ENUMERATE_REGEX_TOKEN
+};
+
+class Token {
+public:
+ Token() = default;
+ Token(const TokenType type, const size_t start_position, const StringView value)
+ : m_type(type)
+ , m_position(start_position)
+ , m_value(value)
+ {
+ }
+
+ TokenType type() const { return m_type; }
+ const StringView& value() const { return m_value; }
+ size_t position() const { return m_position; }
+
+ const char* name() const;
+ static const char* name(const TokenType);
+
+private:
+ TokenType m_type { TokenType::Eof };
+ size_t m_position { 0 };
+ StringView m_value { nullptr };
+};
+
+class Lexer {
+public:
+ Lexer() = default;
+ explicit Lexer(const StringView source);
+ Token next();
+ void reset();
+ void back(size_t offset);
+ void set_source(const StringView source) { m_source = source; }
+
+private:
+ ALWAYS_INLINE char peek(size_t offset = 0) const;
+ ALWAYS_INLINE void consume();
+
+ StringView m_source {};
+ size_t m_position { 0 };
+ size_t m_previous_position { 0 };
+ Token m_current_token { TokenType::Eof, 0, StringView(nullptr) };
+ int m_current_char { 0 };
+};
+
+}
+
+using regex::Lexer;
diff --git a/Libraries/LibRegex/RegexMatch.h b/Libraries/LibRegex/RegexMatch.h
new file mode 100644
index 0000000000..8b41424079
--- /dev/null
+++ b/Libraries/LibRegex/RegexMatch.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include "RegexOptions.h"
+
+#include "AK/FlyString.h"
+#include "AK/HashMap.h"
+#include "AK/String.h"
+#include "AK/StringView.h"
+#include "AK/Vector.h"
+
+namespace regex {
+
+class Match final {
+private:
+ Optional<FlyString> string;
+
+public:
+ Match() = default;
+ ~Match() = default;
+
+ Match(const StringView view_, const size_t line_, const size_t column_, const size_t global_offset_)
+ : view(view_)
+ , line(line_)
+ , column(column_)
+ , global_offset(global_offset_)
+ , left_column(column_)
+ {
+ }
+
+ Match(const String string_, const size_t line_, const size_t column_, const size_t global_offset_)
+ : string(string_)
+ , view(string.value().view())
+ , line(line_)
+ , column(column_)
+ , global_offset(global_offset_)
+ , left_column(column_)
+ {
+ }
+
+ StringView view { nullptr };
+ size_t line { 0 };
+ size_t column { 0 };
+ size_t global_offset { 0 };
+
+ // ugly, as not usable by user, but needed to prevent to create extra vectors that are
+ // able to store the column when the left paren has been found
+ size_t left_column { 0 };
+};
+
+struct MatchInput {
+ StringView view { nullptr };
+ AllOptions regex_options {};
+
+ size_t match_index { 0 };
+ size_t line { 0 };
+ size_t column { 0 };
+
+ size_t global_offset { 0 }; // For multiline matching, knowning the offset from start could be important
+};
+
+struct MatchState {
+ size_t string_position { 0 };
+ size_t instruction_position { 0 };
+ size_t fork_at_position { 0 };
+};
+
+struct MatchOutput {
+ size_t operations;
+ Vector<Match> matches;
+ Vector<Vector<Match>> capture_group_matches;
+ Vector<HashMap<String, Match>> named_capture_group_matches;
+};
+
+}
diff --git a/Libraries/LibRegex/RegexMatcher.cpp b/Libraries/LibRegex/RegexMatcher.cpp
new file mode 100644
index 0000000000..9a636d9cb1
--- /dev/null
+++ b/Libraries/LibRegex/RegexMatcher.cpp
@@ -0,0 +1,313 @@
+/*
+ * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "RegexMatcher.h"
+#include "RegexDebug.h"
+#include "RegexParser.h"
+#include <AK/String.h>
+#include <AK/StringBuilder.h>
+
+namespace regex {
+
+#ifdef REGEX_DEBUG
+static RegexDebug s_regex_dbg(stderr);
+#endif
+
+template<class Parser>
+Regex<Parser>::Regex(StringView pattern, typename ParserTraits<Parser>::OptionsType regex_options)
+{
+ pattern_value = pattern.to_string();
+ regex::Lexer lexer(pattern);
+
+ Parser parser(lexer, regex_options);
+ parser_result = parser.parse();
+
+ if (parser_result.error == regex::Error::NoError) {
+ matcher = make<Matcher<Parser>>(*this, regex_options);
+ } else {
+ fprintf(stderr, "%s\n", error_string().characters());
+ }
+}
+
+template<class Parser>
+String Regex<Parser>::error_string(Optional<String> message) const
+{
+ StringBuilder eb;
+ eb.appendf("Error during parsing of regular expression:\n");
+ eb.appendf(" %s\n ", pattern_value.characters());
+ for (size_t i = 0; i < parser_result.error_token.position(); ++i)
+ eb.append(" ");
+
+ eb.appendf("^---- %s\n", message.value_or(get_error_string(parser_result.error)).characters());
+ return eb.build();
+}
+
+template<typename Parser>
+RegexResult Matcher<Parser>::match(const StringView& view, Optional<typename ParserTraits<Parser>::OptionsType> regex_options) const
+{
+ size_t match_count { 0 };
+ Vector<StringView> views { view };
+
+ MatchInput input;
+ MatchState state;
+ MatchOutput output;
+
+ input.regex_options = m_regex_options | regex_options.value_or({}).value();
+ output.operations = 0;
+
+ if (input.regex_options & AllFlags::Multiline)
+ views = view.lines(false); // FIXME: how do we know, which line ending a line has (1char or 2char)? This is needed to get the correct match offsets from start of string...
+
+ if (c_match_preallocation_count) {
+ output.matches.ensure_capacity(c_match_preallocation_count);
+ output.capture_group_matches.ensure_capacity(c_match_preallocation_count);
+ output.named_capture_group_matches.ensure_capacity(c_match_preallocation_count);
+
+ auto& capture_groups_count = m_pattern.parser_result.capture_groups_count;
+ auto& named_capture_groups_count = m_pattern.parser_result.named_capture_groups_count;
+
+ for (size_t j = 0; j < c_match_preallocation_count; ++j) {
+ output.matches.empend();
+ output.capture_group_matches.unchecked_append({});
+ output.capture_group_matches.at(j).ensure_capacity(capture_groups_count);
+ for (size_t k = 0; k < capture_groups_count; ++k)
+ output.capture_group_matches.at(j).unchecked_append({});
+
+ output.named_capture_group_matches.unchecked_append({});
+ output.named_capture_group_matches.at(j).ensure_capacity(named_capture_groups_count);
+ }
+ }
+
+ auto append_match = [](auto& input, auto& state, auto& output, auto& start_position) {
+ if (output.matches.size() == input.match_index)
+ output.matches.empend();
+
+ ASSERT(start_position + state.string_position - start_position <= input.view.length());
+ if (input.regex_options & AllFlags::StringCopyMatches) {
+ output.matches.at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position).to_string(), input.line, start_position, input.global_offset + start_position };
+ } else { // let the view point to the original string ...
+ output.matches.at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position), input.line, start_position, input.global_offset + start_position };
+ }
+ };
+
+#ifdef REGEX_DEBUG
+ s_regex_dbg.print_header();
+#endif
+
+ bool continue_search = (input.regex_options & AllFlags::Global) || (input.regex_options & AllFlags::Multiline);
+
+ for (auto& view : views) {
+ input.view = view;
+#ifdef REGEX_DEBUG
+ dbg() << "[match] Starting match with view (" << view.length() << "): _" << view << "_";
+#endif
+
+ auto view_length = view.length();
+ for (size_t view_index = 0; view_index < view_length; ++view_index) {
+ auto& match_length_minimum = m_pattern.parser_result.match_length_minimum;
+ // FIXME: More performant would be to know the remaining minimum string
+ // length needed to match from the current position onwards within
+ // the vm. Add new OpCode for MinMatchLengthFromSp with the value of
+ // the remaining string length from the current path. The value though
+ // has to be filled in reverse. That implies a second run over bytecode
+ // after generation has finished.
+ if (match_length_minimum && match_length_minimum > view_length - view_index)
+ break;
+
+ input.column = match_count;
+ input.match_index = match_count;
+
+ state.string_position = view_index;
+ state.instruction_position = 0;
+
+ auto success = execute(input, state, output, 0);
+ if (!success.has_value())
+ return { false, 0, {}, {}, {}, output.operations };
+
+ if (success.value()) {
+
+ if ((input.regex_options & AllFlags::MatchNotEndOfLine) && state.string_position == input.view.length()) {
+ if (!continue_search)
+ break;
+ continue;
+ }
+ if ((input.regex_options & AllFlags::MatchNotBeginOfLine) && view_index == 0) {
+ if (!continue_search)
+ break;
+ continue;
+ }
+
+#ifdef REGEX_DEBUG
+ dbg() << "state.string_position: " << state.string_position << " view_index: " << view_index;
+ dbg() << "[match] Found a match (length = " << state.string_position - view_index << "): " << input.view.substring_view(view_index, state.string_position - view_index);
+#endif
+ ++match_count;
+
+ if (continue_search) {
+ append_match(input, state, output, view_index);
+
+ bool has_zero_length = state.string_position == view_index;
+ view_index = state.string_position - (has_zero_length ? 0 : 1);
+ continue;
+
+ } else if (!continue_search && state.string_position < view_length)
+ return { false, 0, {}, {}, {}, output.operations };
+
+ append_match(input, state, output, view_index);
+ break;
+ }
+
+ if (!continue_search)
+ break;
+ }
+
+ ++input.line;
+ input.global_offset += view.length() + 1; // +1 includes the line break character
+ }
+
+ MatchOutput output_copy;
+ if (match_count) {
+ auto capture_groups_count = min(output.capture_group_matches.size(), output.matches.size());
+ for (size_t i = 0; i < capture_groups_count; ++i) {
+ output_copy.capture_group_matches.append(output.capture_group_matches.at(i));
+ }
+
+ auto named_capture_groups_count = min(output.named_capture_group_matches.size(), output.matches.size());
+ for (size_t i = 0; i < named_capture_groups_count; ++i) {
+ if (output.matches.at(i).view.length())
+ output_copy.named_capture_group_matches.append(output.named_capture_group_matches.at(i));
+ }
+
+ for (size_t i = 0; i < match_count; ++i)
+ output_copy.matches.append(output.matches.at(i));
+
+ } else {
+ output_copy.capture_group_matches.clear_with_capacity();
+ output_copy.named_capture_group_matches.clear_with_capacity();
+ }
+
+ return {
+ match_count ? true : false,
+ match_count,
+ move(output_copy.matches),
+ move(output_copy.capture_group_matches),
+ move(output_copy.named_capture_group_matches),
+ output.operations,
+ };
+}
+
+template<class Parser>
+Optional<bool> Matcher<Parser>::execute(const MatchInput& input, MatchState& state, MatchOutput& output, size_t recursion_level) const
+{
+ if (recursion_level > c_max_recursion)
+ return false;
+
+ Vector<MatchState> fork_low_prio_states;
+ MatchState fork_high_prio_state;
+ Optional<bool> success;
+
+ auto& bytecode = m_pattern.parser_result.bytecode;
+
+ for (;;) {
+ ++output.operations;
+ auto* opcode = bytecode.get_opcode(state);
+
+ if (!opcode) {
+ dbg() << "Wrong opcode... failed!";
+ return {};
+ }
+
+#ifdef REGEX_DEBUG
+ s_regex_dbg.print_opcode("VM", *opcode, state, recursion_level, false);
+#endif
+
+ auto result = opcode->execute(input, state, output);
+
+#ifdef REGEX_DEBUG
+ s_regex_dbg.print_result(*opcode, bytecode, input, state, result);
+#endif
+
+ state.instruction_position += opcode->size();
+
+ switch (result) {
+ case ExecutionResult::Fork_PrioLow:
+ fork_low_prio_states.prepend(state);
+ continue;
+ case ExecutionResult::Fork_PrioHigh:
+ fork_high_prio_state = state;
+ fork_high_prio_state.instruction_position = fork_high_prio_state.fork_at_position;
+ success = execute(input, fork_high_prio_state, output, ++recursion_level);
+ if (!success.has_value())
+ return {};
+
+ if (success.value()) {
+ state = fork_high_prio_state;
+ return true;
+ }
+
+ continue;
+ case ExecutionResult::Continue:
+ continue;
+ case ExecutionResult::Succeeded:
+ return true;
+ case ExecutionResult::Failed:
+ return false;
+ case ExecutionResult::Failed_ExecuteLowPrioForks:
+ return execute_low_prio_forks(input, state, output, fork_low_prio_states, recursion_level + 1);
+ }
+ }
+
+ ASSERT_NOT_REACHED();
+}
+
+template<class Parser>
+ALWAYS_INLINE Optional<bool> Matcher<Parser>::execute_low_prio_forks(const MatchInput& input, MatchState& original_state, MatchOutput& output, Vector<MatchState> states, size_t recursion_level) const
+{
+ for (auto& state : states) {
+
+ state.instruction_position = state.fork_at_position;
+#ifdef REGEX_DEBUG
+ fprintf(stderr, "Forkstay... ip = %lu, sp = %lu\n", state.instruction_position, state.string_position);
+#endif
+ auto success = execute(input, state, output, recursion_level);
+ if (!success.has_value())
+ return {};
+ if (success.value()) {
+#ifdef REGEX_DEBUG
+ fprintf(stderr, "Forkstay succeeded... ip = %lu, sp = %lu\n", state.instruction_position, state.string_position);
+#endif
+ original_state = state;
+ return true;
+ }
+ }
+
+ original_state.string_position = 0;
+ return false;
+};
+
+template class Matcher<PosixExtendedParser>;
+template class Regex<PosixExtendedParser>;
+}
diff --git a/Libraries/LibRegex/RegexMatcher.h b/Libraries/LibRegex/RegexMatcher.h
new file mode 100644
index 0000000000..b98873c81f
--- /dev/null
+++ b/Libraries/LibRegex/RegexMatcher.h
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include "RegexByteCode.h"
+#include "RegexMatch.h"
+#include "RegexOptions.h"
+#include "RegexParser.h"
+
+#include <AK/Forward.h>
+#include <AK/HashMap.h>
+#include <AK/NonnullOwnPtrVector.h>
+#include <AK/Types.h>
+#include <AK/Vector.h>
+
+#include <stdio.h>
+
+namespace regex {
+
+static const constexpr size_t c_max_recursion = 5000;
+static const constexpr size_t c_match_preallocation_count = 0;
+
+struct RegexResult final {
+ bool success { false };
+ size_t count { 0 };
+ Vector<Match> matches;
+ Vector<Vector<Match>> capture_group_matches;
+ Vector<HashMap<String, Match>> named_capture_group_matches;
+ size_t operations { 0 };
+};
+
+template<class Parser>
+class Regex;
+
+template<class Parser>
+class Matcher final {
+
+public:
+ Matcher(const Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
+ : m_pattern(pattern)
+ , m_regex_options(regex_options.value_or({}))
+ {
+ }
+ ~Matcher() = default;
+
+ RegexResult match(const StringView&, Optional<typename ParserTraits<Parser>::OptionsType> = {}) const;
+
+private:
+ Optional<bool> execute(const MatchInput& input, MatchState& state, MatchOutput& output, size_t recursion_level) const;
+ ALWAYS_INLINE Optional<bool> execute_low_prio_forks(const MatchInput& input, MatchState& original_state, MatchOutput& output, Vector<MatchState> states, size_t recursion_level) const;
+
+ const Regex<Parser>& m_pattern;
+ const typename ParserTraits<Parser>::OptionsType m_regex_options;
+};
+
+template<class Parser>
+class Regex final {
+public:
+ String pattern_value;
+ regex::Parser::Result parser_result;
+ OwnPtr<Matcher<Parser>> matcher { nullptr };
+
+ Regex(StringView pattern, typename ParserTraits<Parser>::OptionsType regex_options = {});
+ ~Regex() = default;
+
+ void print_bytecode(FILE* f = stdout) const;
+ String error_string(Optional<String> message = {}) const;
+
+ RegexResult match(StringView view, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
+ {
+ if (!matcher || parser_result.error != Error::NoError)
+ return {};
+ return matcher->match(view, regex_options);
+ }
+
+ RegexResult search(StringView view, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
+ {
+ if (!matcher || parser_result.error != Error::NoError)
+ return {};
+
+ AllOptions options = (AllOptions)regex_options.value_or({});
+ if ((options & AllFlags::MatchNotBeginOfLine) && (options & AllFlags::MatchNotEndOfLine)) {
+ options.reset_flag(AllFlags::MatchNotEndOfLine);
+ options.reset_flag(AllFlags::MatchNotBeginOfLine);
+ }
+ options |= AllFlags::Global;
+
+ return matcher->match(view, options);
+ }
+
+ bool match(StringView view, RegexResult& m, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
+ {
+ if (!matcher || parser_result.error != Error::NoError)
+ return {};
+ m = matcher->match(view, regex_options);
+ return m.success;
+ }
+
+ bool search(StringView view, RegexResult& m, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
+ {
+ m = search(view, regex_options);
+ return m.success;
+ }
+
+ bool has_match(const StringView view, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
+ {
+ if (!matcher || parser_result.error != Error::NoError)
+ return false;
+ RegexResult result = matcher->match(view, AllOptions { regex_options.value_or({}) } | AllFlags::SkipSubExprResults);
+ return result.success;
+ }
+};
+
+template<class Parser>
+RegexResult match(const StringView view, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
+{
+ if (!pattern.matcher || pattern.parser_result.error != Error::NoError)
+ return {};
+ return pattern.matcher->match(view, regex_options);
+}
+
+template<class Parser>
+bool match(const StringView view, Regex<Parser>& pattern, RegexResult& res, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
+{
+ if (!pattern.matcher || pattern.parser_result.error != Error::NoError)
+ return {};
+ res = pattern.matcher->match(view, regex_options);
+ return res.success;
+}
+
+template<class Parser>
+RegexResult search(const StringView view, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
+{
+ if (!pattern.matcher || pattern.parser_result.error != Error::NoError)
+ return {};
+ return pattern.matcher->search(view, regex_options);
+}
+
+template<class Parser>
+bool search(const StringView view, Regex<Parser>& pattern, RegexResult& res, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
+{
+ if (!pattern.matcher || pattern.parser_result.error != Error::NoError)
+ return {};
+ res = pattern.matcher->search(view, regex_options);
+ return res.success;
+}
+
+template<class Parser>
+bool has_match(const StringView view, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
+{
+ if (pattern.matcher == nullptr)
+ return {};
+ RegexResult result = pattern.matcher->match(view, AllOptions { regex_options.value_or({}) } | AllFlags::SkipSubExprResults);
+ return result.success;
+}
+
+}
+
+using regex::has_match;
+using regex::match;
+using regex::Regex;
+using regex::RegexResult;
diff --git a/Libraries/LibRegex/RegexOptions.h b/Libraries/LibRegex/RegexOptions.h
new file mode 100644
index 0000000000..36cc0ce40a
--- /dev/null
+++ b/Libraries/LibRegex/RegexOptions.h
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include <AK/Types.h>
+#include <stdio.h>
+#ifdef __serenity__
+# include <regex.h>
+#else
+# include <LibC/regex.h>
+#endif
+
+namespace regex {
+
+using FlagsUnderlyingType = u16;
+
+enum class AllFlags {
+ Global = __Regex_Global, // All matches (don't return after first match)
+ Insensitive = __Regex_Insensitive, // Case insensitive match (ignores case of [a-zA-Z])
+ Ungreedy = __Regex_Ungreedy, // The match becomes lazy by default. Now a ? following a quantifier makes it greedy
+ Unicode = __Regex_Unicode, // Enable all unicode features and interpret all unicode escape sequences as such
+ Extended = __Regex_Extended, // Ignore whitespaces. Spaces and text after a # in the pattern are ignored
+ Extra = __Regex_Extra, // Disallow meaningless escapes. A \ followed by a letter with no special meaning is faulted
+ MatchNotBeginOfLine = __Regex_MatchNotBeginOfLine, // Pattern is not forced to ^ -> search in whole string!
+ MatchNotEndOfLine = __Regex_MatchNotEndOfLine, // Don't Force the dollar sign, $, to always match end of the string, instead of end of the line. This option is ignored if the Multiline-flag is set
+ SkipSubExprResults = __Regex_SkipSubExprResults, // Do not return sub expressions in the result
+ StringCopyMatches = __Regex_StringCopyMatches, // Do explicitly copy results into new allocated string instead of StringView to original string.
+ SingleLine = __Regex_SingleLine, // Dot matches newline characters
+ Sticky = __Regex_Sticky, // Force the pattern to only match consecutive matches from where the previous match ended.
+ Multiline = __Regex_Multiline, // Handle newline characters. Match each line, one by one.
+ SkipTrimEmptyMatches = __Regex_SkipTrimEmptyMatches, // Do not remove empty capture group results.
+ Last = SkipTrimEmptyMatches
+};
+
+enum class PosixFlags : FlagsUnderlyingType {
+ Global = (FlagsUnderlyingType)AllFlags::Global,
+ Insensitive = (FlagsUnderlyingType)AllFlags::Insensitive,
+ Ungreedy = (FlagsUnderlyingType)AllFlags::Ungreedy,
+ Unicode = (FlagsUnderlyingType)AllFlags::Unicode,
+ Extended = (FlagsUnderlyingType)AllFlags::Extended,
+ Extra = (FlagsUnderlyingType)AllFlags::Extra,
+ MatchNotBeginOfLine = (FlagsUnderlyingType)AllFlags::MatchNotBeginOfLine,
+ MatchNotEndOfLine = (FlagsUnderlyingType)AllFlags::MatchNotEndOfLine,
+ SkipSubExprResults = (FlagsUnderlyingType)AllFlags::SkipSubExprResults,
+ Multiline = (FlagsUnderlyingType)AllFlags::Multiline,
+ StringCopyMatches = (FlagsUnderlyingType)AllFlags::StringCopyMatches,
+};
+
+enum class ECMAScriptFlags : FlagsUnderlyingType {
+ Global = (FlagsUnderlyingType)AllFlags::Global,
+ Insensitive = (FlagsUnderlyingType)AllFlags::Insensitive,
+ Ungreedy = (FlagsUnderlyingType)AllFlags::Ungreedy,
+ Unicode = (FlagsUnderlyingType)AllFlags::Unicode,
+ Extended = (FlagsUnderlyingType)AllFlags::Extended,
+ Extra = (FlagsUnderlyingType)AllFlags::Extra,
+ SingleLine = (FlagsUnderlyingType)AllFlags::SingleLine,
+ Sticky = (FlagsUnderlyingType)AllFlags::Sticky,
+ Multiline = (FlagsUnderlyingType)AllFlags::Multiline,
+ StringCopyMatches = (FlagsUnderlyingType)AllFlags::StringCopyMatches,
+};
+
+template<class T>
+class RegexOptions {
+public:
+ using FlagsType = T;
+
+ RegexOptions() = default;
+
+ RegexOptions(T flags)
+ : m_flags(flags)
+ {
+ }
+
+ template<class U>
+ RegexOptions(RegexOptions<U> other)
+ : m_flags((T) static_cast<FlagsUnderlyingType>(other.value()))
+ {
+ }
+
+ operator bool() const { return !!*this; }
+ bool operator!() const { return (FlagsUnderlyingType)m_flags == 0; }
+
+ RegexOptions<T> operator|(T flag) const { return RegexOptions<T> { (T)((FlagsUnderlyingType)m_flags | (FlagsUnderlyingType)flag) }; }
+ RegexOptions<T> operator&(T flag) const { return RegexOptions<T> { (T)((FlagsUnderlyingType)m_flags & (FlagsUnderlyingType)flag) }; }
+
+ RegexOptions<T>& operator|=(T flag)
+ {
+ m_flags = (T)((FlagsUnderlyingType)m_flags | (FlagsUnderlyingType)flag);
+ return *this;
+ }
+
+ RegexOptions<T>& operator&=(T flag)
+ {
+ m_flags = (T)((FlagsUnderlyingType)m_flags & (FlagsUnderlyingType)flag);
+ return *this;
+ }
+
+ void reset_flags() { m_flags = (T)0; }
+ void reset_flag(T flag) { m_flags = (T)((FlagsUnderlyingType)m_flags & ~(FlagsUnderlyingType)flag); }
+ void set_flag(T flag) { *this |= flag; }
+ bool has_flag_set(T flag) const { return *this & flag; }
+ T value() const { return m_flags; }
+
+private:
+ T m_flags { 0 };
+};
+
+template<class T>
+inline RegexOptions<T> operator|(T lhs, T rhs)
+{
+ return RegexOptions<T> { lhs } |= rhs;
+}
+
+template<class T>
+inline RegexOptions<T> operator&(T lhs, T rhs)
+{
+ return RegexOptions<T> { lhs } &= rhs;
+}
+
+template<class T>
+inline T operator~(T flag)
+{
+ return (T) ~((FlagsUnderlyingType)flag);
+}
+
+using AllOptions = RegexOptions<AllFlags>;
+using ECMAScriptOptions = RegexOptions<ECMAScriptFlags>;
+using PosixOptions = RegexOptions<PosixFlags>;
+
+}
+
+using regex::ECMAScriptFlags;
+using regex::ECMAScriptOptions;
+using regex::PosixFlags;
+using regex::PosixOptions;
diff --git a/Libraries/LibRegex/RegexParser.cpp b/Libraries/LibRegex/RegexParser.cpp
new file mode 100644
index 0000000000..ff64b59530
--- /dev/null
+++ b/Libraries/LibRegex/RegexParser.cpp
@@ -0,0 +1,598 @@
+/*
+ * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "RegexParser.h"
+#include "RegexDebug.h"
+#include <AK/String.h>
+#include <AK/StringBuilder.h>
+#include <cstdio>
+
+namespace regex {
+
+ALWAYS_INLINE bool Parser::set_error(Error error)
+{
+ if (m_parser_state.error == Error::NoError) {
+ m_parser_state.error = error;
+ m_parser_state.error_token = m_parser_state.current_token;
+ }
+ return false; // always return false, that eases the API usage (return set_error(...)) :^)
+}
+
+ALWAYS_INLINE bool Parser::done() const
+{
+ return match(TokenType::Eof);
+}
+
+ALWAYS_INLINE bool Parser::match(TokenType type) const
+{
+ return m_parser_state.current_token.type() == type;
+}
+
+ALWAYS_INLINE Token Parser::consume()
+{
+ auto old_token = m_parser_state.current_token;
+ m_parser_state.current_token = m_parser_state.lexer.next();
+ return old_token;
+}
+
+ALWAYS_INLINE Token Parser::consume(TokenType type, Error error)
+{
+ if (m_parser_state.current_token.type() != type) {
+ set_error(error);
+ dbg() << "[PARSER] Error: Unexpected token " << m_parser_state.current_token.name() << ". Expected: " << Token::name(type);
+ }
+ return consume();
+}
+
+ALWAYS_INLINE bool Parser::consume(const String& str)
+{
+ size_t potentially_go_back { 1 };
+ for (auto ch : str) {
+ if (match(TokenType::Char)) {
+ if (m_parser_state.current_token.value()[0] != ch) {
+ m_parser_state.lexer.back(potentially_go_back);
+ m_parser_state.current_token = m_parser_state.lexer.next();
+ return false;
+ }
+ } else {
+ m_parser_state.lexer.back(potentially_go_back);
+ m_parser_state.current_token = m_parser_state.lexer.next();
+ return false;
+ }
+ consume(TokenType::Char, Error::NoError);
+ ++potentially_go_back;
+ }
+ return true;
+}
+
+ALWAYS_INLINE void Parser::reset()
+{
+ m_parser_state.bytecode.clear();
+ m_parser_state.lexer.reset();
+ m_parser_state.current_token = m_parser_state.lexer.next();
+ m_parser_state.error = Error::NoError;
+ m_parser_state.error_token = { TokenType::Eof, 0, StringView(nullptr) };
+ m_parser_state.regex_options = {};
+}
+
+Parser::Result Parser::parse(Optional<AllOptions> regex_options)
+{
+ reset();
+ if (regex_options.has_value())
+ m_parser_state.regex_options = regex_options.value();
+ if (parse_internal(m_parser_state.bytecode, m_parser_state.match_length_minimum))
+ consume(TokenType::Eof, Error::InvalidPattern);
+ else
+ set_error(Error::InvalidPattern);
+
+#ifdef REGEX_DEBUG
+ fprintf(stderr, "[PARSER] Produced bytecode with %lu entries (opcodes + arguments)\n", m_parser_state.bytecode.size());
+#endif
+ return {
+ move(m_parser_state.bytecode),
+ move(m_parser_state.capture_groups_count),
+ move(m_parser_state.named_capture_groups_count),
+ move(m_parser_state.match_length_minimum),
+ move(m_parser_state.error),
+ move(m_parser_state.error_token)
+ };
+}
+
+// =============================
+// PosixExtended Parser
+// =============================
+
+bool PosixExtendedParser::parse_internal(ByteCode& stack, size_t& match_length_minimum)
+{
+ return parse_root(stack, match_length_minimum);
+}
+
+ALWAYS_INLINE bool PosixExtendedParser::match_repetition_symbol()
+{
+ auto type = m_parser_state.current_token.type();
+ return (type == TokenType::Asterisk
+ || type == TokenType::Plus
+ || type == TokenType::Questionmark
+ || type == TokenType::LeftCurly);
+}
+
+ALWAYS_INLINE bool PosixExtendedParser::match_ordinary_characters()
+{
+ // NOTE: This method must not be called during bracket and repetition parsing!
+ // FIXME: Add assertion for that?
+ auto type = m_parser_state.current_token.type();
+ return (type == TokenType::Char
+ || type == TokenType::Comma
+ || type == TokenType::Slash
+ || type == TokenType::EqualSign
+ || type == TokenType::HyphenMinus
+ || type == TokenType::Colon);
+}
+
+ALWAYS_INLINE bool PosixExtendedParser::parse_repetition_symbol(ByteCode& bytecode_to_repeat, size_t& match_length_minimum)
+{
+ if (match(TokenType::LeftCurly)) {
+ consume();
+
+ StringBuilder number_builder;
+
+ while (match(TokenType::Char)) {
+ number_builder.append(consume().value());
+ }
+
+ auto maybe_minimum = number_builder.build().to_uint();
+ if (!maybe_minimum.has_value())
+ return set_error(Error::InvalidBraceContent);
+
+ auto minimum = maybe_minimum.value();
+ match_length_minimum *= minimum;
+
+ if (match(TokenType::Comma)) {
+ consume();
+ } else {
+ ByteCode bytecode;
+ bytecode.insert_bytecode_repetition_n(bytecode_to_repeat, minimum);
+ bytecode_to_repeat = move(bytecode);
+
+ consume(TokenType::RightCurly, Error::MismatchingBrace);
+ return !has_error();
+ }
+
+ Optional<size_t> maybe_maximum {};
+ number_builder.clear();
+ while (match(TokenType::Char)) {
+ number_builder.append(consume().value());
+ }
+ if (!number_builder.is_empty()) {
+ auto value = number_builder.build().to_uint();
+ if (!value.has_value() || minimum > value.value())
+ return set_error(Error::InvalidBraceContent);
+
+ maybe_maximum = value.value();
+ }
+
+ bytecode_to_repeat.insert_bytecode_repetition_min_max(bytecode_to_repeat, minimum, maybe_maximum);
+
+ consume(TokenType::RightCurly, Error::MismatchingBrace);
+ return !has_error();
+
+ } else if (match(TokenType::Plus)) {
+ consume();
+
+ bool greedy = match(TokenType::Questionmark);
+ if (greedy)
+ consume();
+
+ // Note: dont touch match_length_minimum, it's already correct
+ bytecode_to_repeat.insert_bytecode_repetition_min_one(bytecode_to_repeat, greedy);
+ return !has_error();
+
+ } else if (match(TokenType::Asterisk)) {
+ consume();
+ match_length_minimum = 0;
+
+ bool greedy = match(TokenType::Questionmark);
+ if (greedy)
+ consume();
+
+ bytecode_to_repeat.insert_bytecode_repetition_any(bytecode_to_repeat, greedy);
+
+ return !has_error();
+
+ } else if (match(TokenType::Questionmark)) {
+ consume();
+ match_length_minimum = 0;
+
+ bool greedy = match(TokenType::Questionmark);
+ if (greedy)
+ consume();
+
+ bytecode_to_repeat.insert_bytecode_repetition_zero_or_one(bytecode_to_repeat, greedy);
+ return !has_error();
+ }
+
+ return false;
+}
+
+ALWAYS_INLINE bool PosixExtendedParser::parse_bracket_expression(ByteCode& stack, size_t& match_length_minimum)
+{
+ Vector<CompareTypeAndValuePair> values;
+
+ for (;;) {
+
+ if (match(TokenType::HyphenMinus)) {
+ consume();
+
+ if (values.is_empty() || (values.size() == 1 && values.last().type == CharacterCompareType::Inverse)) {
+ // first in the bracket expression
+ values.append({ CharacterCompareType::Char, (ByteCodeValueType)'-' });
+ } else if (match(TokenType::RightBracket)) {
+ // Last in the bracket expression
+ values.append({ CharacterCompareType::Char, (ByteCodeValueType)'-' });
+ } else if (values.last().type == CharacterCompareType::Char) {
+ values.append({ CharacterCompareType::RangeExpressionDummy, 0 });
+
+ if (match(TokenType::HyphenMinus)) {
+ consume();
+ // Valid range, add ordinary character
+ values.append({ CharacterCompareType::Char, (ByteCodeValueType)'-' });
+ }
+ } else {
+ return set_error(Error::InvalidRange);
+ }
+
+ } else if (match(TokenType::Char) || match(TokenType::Period) || match(TokenType::Asterisk) || match(TokenType::EscapeSequence) || match(TokenType::Plus)) {
+ values.append({ CharacterCompareType::Char, (ByteCodeValueType)*consume().value().characters_without_null_termination() });
+
+ } else if (match(TokenType::Circumflex)) {
+ auto t = consume();
+
+ if (values.is_empty())
+ values.append({ CharacterCompareType::Inverse, 0 });
+ else
+ values.append({ CharacterCompareType::Char, (ByteCodeValueType)*t.value().characters_without_null_termination() });
+
+ } else if (match(TokenType::LeftBracket)) {
+ consume();
+
+ if (match(TokenType::Period)) {
+ consume();
+
+ // FIXME: Parse collating element, this is needed when we have locale support
+ // This could have impact on length parameter, I guess.
+ ASSERT_NOT_REACHED();
+
+ consume(TokenType::Period, Error::InvalidCollationElement);
+ consume(TokenType::RightBracket, Error::MismatchingBracket);
+
+ } else if (match(TokenType::EqualSign)) {
+ consume();
+ // FIXME: Parse collating element, this is needed when we have locale support
+ // This could have impact on length parameter, I guess.
+ ASSERT_NOT_REACHED();
+
+ consume(TokenType::EqualSign, Error::InvalidCollationElement);
+ consume(TokenType::RightBracket, Error::MismatchingBracket);
+
+ } else if (match(TokenType::Colon)) {
+ consume();
+
+ CharClass ch_class;
+ // parse character class
+ if (match(TokenType::Char)) {
+ if (consume("alnum"))
+ ch_class = CharClass::Alnum;
+ else if (consume("alpha"))
+ ch_class = CharClass::Alpha;
+ else if (consume("blank"))
+ ch_class = CharClass::Blank;
+ else if (consume("cntrl"))
+ ch_class = CharClass::Cntrl;
+ else if (consume("digit"))
+ ch_class = CharClass::Digit;
+ else if (consume("graph"))
+ ch_class = CharClass::Graph;
+ else if (consume("lower"))
+ ch_class = CharClass::Lower;
+ else if (consume("print"))
+ ch_class = CharClass::Print;
+ else if (consume("punct"))
+ ch_class = CharClass::Punct;
+ else if (consume("space"))
+ ch_class = CharClass::Space;
+ else if (consume("upper"))
+ ch_class = CharClass::Upper;
+ else if (consume("xdigit"))
+ ch_class = CharClass::Xdigit;
+ else
+ return set_error(Error::InvalidCharacterClass);
+
+ values.append({ CharacterCompareType::CharClass, (ByteCodeValueType)ch_class });
+
+ } else
+ return set_error(Error::InvalidCharacterClass);
+
+ // FIXME: we do not support locale specific character classes until locales are implemented
+
+ consume(TokenType::Colon, Error::InvalidCharacterClass);
+ consume(TokenType::RightBracket, Error::MismatchingBracket);
+ } else {
+ return set_error(Error::MismatchingBracket);
+ }
+
+ } else if (match(TokenType::RightBracket)) {
+
+ if (values.is_empty() || (values.size() == 1 && values.last().type == CharacterCompareType::Inverse)) {
+ // handle bracket as ordinary character
+ values.append({ CharacterCompareType::Char, (ByteCodeValueType)*consume().value().characters_without_null_termination() });
+ } else {
+ // closing bracket expression
+ break;
+ }
+ } else
+ // nothing matched, this is a failure, as at least the closing bracket must match...
+ return set_error(Error::MismatchingBracket);
+
+ // check if range expression has to be completed...
+ if (values.size() >= 3 && values.at(values.size() - 2).type == CharacterCompareType::RangeExpressionDummy) {
+ if (values.last().type != CharacterCompareType::Char)
+ return set_error(Error::InvalidRange);
+
+ auto value2 = values.take_last();
+ values.take_last(); // RangeExpressionDummy
+ auto value1 = values.take_last();
+
+ values.append({ CharacterCompareType::CharRange, static_cast<ByteCodeValueType>(CharRange { (char)value1.value, (char)value2.value }) });
+ }
+ }
+
+ if (values.size())
+ match_length_minimum = 1;
+
+ if (values.first().type == CharacterCompareType::Inverse)
+ match_length_minimum = 0;
+
+ stack.insert_bytecode_compare_values(move(values));
+
+ return !has_error();
+}
+
+ALWAYS_INLINE bool PosixExtendedParser::parse_sub_expression(ByteCode& stack, size_t& match_length_minimum)
+{
+ ByteCode bytecode;
+ size_t length = 0;
+ bool should_parse_repetition_symbol { false };
+
+ for (;;) {
+ if (match_ordinary_characters()) {
+ Token start_token = m_parser_state.current_token;
+ Token last_token = m_parser_state.current_token;
+ for (;;) {
+ if (!match_ordinary_characters())
+ break;
+ ++length;
+ last_token = consume();
+ }
+
+ if (length > 1) {
+ // last character is inserted into 'bytecode' for duplication symbol handling
+ auto new_length = length - ((match_repetition_symbol() && length > 1) ? 1 : 0);
+ stack.insert_bytecode_compare_string(start_token.value(), new_length);
+ }
+
+ if ((match_repetition_symbol() && length > 1) || length == 1) // Create own compare opcode for last character before duplication symbol
+ bytecode.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)last_token.value().characters_without_null_termination()[0] } });
+
+ should_parse_repetition_symbol = true;
+ break;
+ }
+
+ if (match_repetition_symbol())
+ return set_error(Error::InvalidRepetitionMarker);
+
+ if (match(TokenType::Period)) {
+ length = 1;
+ consume();
+ bytecode.insert_bytecode_compare_values({ { CharacterCompareType::AnyChar, 0 } });
+ should_parse_repetition_symbol = true;
+ break;
+ }
+
+ if (match(TokenType::EscapeSequence)) {
+ length = 1;
+ Token t = consume();
+#ifdef REGEX_DEBUG
+ printf("[PARSER] EscapeSequence with substring %s\n", String(t.value()).characters());
+#endif
+
+ bytecode.insert_bytecode_compare_values({ { CharacterCompareType::Char, (u32)t.value().characters_without_null_termination()[1] } });
+ should_parse_repetition_symbol = true;
+ break;
+ }
+
+ if (match(TokenType::LeftBracket)) {
+ consume();
+
+ ByteCode sub_ops;
+ if (!parse_bracket_expression(sub_ops, length) || !sub_ops.size())
+ return set_error(Error::InvalidBracketContent);
+
+ bytecode.append(move(sub_ops));
+
+ consume(TokenType::RightBracket, Error::MismatchingBracket);
+ should_parse_repetition_symbol = true;
+ break;
+ }
+
+ if (match(TokenType::RightBracket)) {
+ return set_error(Error::MismatchingBracket);
+ }
+
+ if (match(TokenType::RightCurly)) {
+ return set_error(Error::MismatchingBrace);
+ }
+
+ if (match(TokenType::Circumflex)) {
+ consume();
+ bytecode.empend((ByteCodeValueType)OpCodeId::CheckBegin);
+ break;
+ }
+
+ if (match(TokenType::Dollar)) {
+ consume();
+ bytecode.empend((ByteCodeValueType)OpCodeId::CheckEnd);
+ break;
+ }
+
+ if (match(TokenType::RightParen))
+ return false;
+
+ if (match(TokenType::LeftParen)) {
+ consume();
+ Optional<StringView> capture_group_name;
+ bool prevent_capture_group = false;
+ if (match(TokenType::Questionmark)) {
+ consume();
+
+ if (match(TokenType::Colon)) {
+ consume();
+ prevent_capture_group = true;
+ } else if (consume("<")) { // named capturing group
+
+ Token start_token = m_parser_state.current_token;
+ Token last_token = m_parser_state.current_token;
+ size_t capture_group_name_length = 0;
+ for (;;) {
+ if (!match_ordinary_characters())
+ return set_error(Error::InvalidNameForCaptureGroup);
+ if (match(TokenType::Char) && m_parser_state.current_token.value()[0] == '>') {
+ consume();
+ break;
+ }
+ ++capture_group_name_length;
+ last_token = consume();
+ }
+ capture_group_name = StringView(start_token.value().characters_without_null_termination(), capture_group_name_length);
+
+ } else if (match(TokenType::EqualSign)) { // positive lookahead
+ consume();
+ ASSERT_NOT_REACHED();
+ } else if (consume("!")) { // negative lookahead
+ ASSERT_NOT_REACHED();
+ } else if (consume("<")) {
+ if (match(TokenType::EqualSign)) { // positive lookbehind
+ consume();
+ ASSERT_NOT_REACHED();
+ }
+ if (consume("!")) // negative lookbehind
+ ASSERT_NOT_REACHED();
+ } else {
+ return set_error(Error::InvalidRepetitionMarker);
+ }
+ }
+
+ if (!(m_parser_state.regex_options & AllFlags::SkipSubExprResults || prevent_capture_group)) {
+ if (capture_group_name.has_value())
+ bytecode.insert_bytecode_group_capture_left(capture_group_name.value());
+ else
+ bytecode.insert_bytecode_group_capture_left(m_parser_state.capture_groups_count);
+ }
+
+ ByteCode capture_group_bytecode;
+
+ if (!parse_root(capture_group_bytecode, length))
+ return set_error(Error::InvalidPattern);
+
+ bytecode.append(move(capture_group_bytecode));
+
+ consume(TokenType::RightParen, Error::MismatchingParen);
+
+ if (!(m_parser_state.regex_options & AllFlags::SkipSubExprResults || prevent_capture_group)) {
+ if (capture_group_name.has_value()) {
+ bytecode.insert_bytecode_group_capture_right(capture_group_name.value());
+ ++m_parser_state.named_capture_groups_count;
+ } else {
+ bytecode.insert_bytecode_group_capture_right(m_parser_state.capture_groups_count);
+ ++m_parser_state.capture_groups_count;
+ }
+ }
+ should_parse_repetition_symbol = true;
+ break;
+ }
+
+ return false;
+ }
+
+ if (match_repetition_symbol()) {
+ if (should_parse_repetition_symbol)
+ parse_repetition_symbol(bytecode, length);
+ else
+ return set_error(Error::InvalidRepetitionMarker);
+ }
+
+ stack.append(move(bytecode));
+ match_length_minimum += length;
+
+ return true;
+}
+
+bool PosixExtendedParser::parse_root(ByteCode& stack, size_t& match_length_minimum)
+{
+ ByteCode bytecode_left;
+ size_t match_length_minimum_left { 0 };
+
+ if (match_repetition_symbol())
+ return set_error(Error::InvalidRepetitionMarker);
+
+ for (;;) {
+ if (!parse_sub_expression(bytecode_left, match_length_minimum_left))
+ break;
+
+ if (match(TokenType::Pipe)) {
+ consume();
+
+ ByteCode bytecode_right;
+ size_t match_length_minimum_right { 0 };
+
+ if (!parse_root(bytecode_right, match_length_minimum_right) || bytecode_right.is_empty())
+ return set_error(Error::InvalidPattern);
+
+ ByteCode new_bytecode;
+ new_bytecode.insert_bytecode_alternation(move(bytecode_left), move(bytecode_right));
+ bytecode_left = move(new_bytecode);
+ match_length_minimum_left = min(match_length_minimum_right, match_length_minimum_left);
+ }
+ }
+
+ if (bytecode_left.is_empty())
+ set_error(Error::EmptySubExpression);
+
+ stack.append(move(bytecode_left));
+ match_length_minimum = match_length_minimum_left;
+ return !has_error();
+}
+
+}
diff --git a/Libraries/LibRegex/RegexParser.cpp_ b/Libraries/LibRegex/RegexParser.cpp_
new file mode 100644
index 0000000000..4292371cd2
--- /dev/null
+++ b/Libraries/LibRegex/RegexParser.cpp_
@@ -0,0 +1,803 @@
+/*
+ * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "RegexParser.h"
+#include <AK/String.h>
+#include <AK/StringBuilder.h>
+
+namespace AK {
+namespace regex {
+
+const char* ByteCodeValue::name(OpCode type)
+{
+ switch (type) {
+#define __ENUMERATE_OPCODE(x) \
+ case OpCode::x: \
+ return #x;
+ ENUMERATE_OPCODES
+#undef __ENUMERATE_OPCODE
+ default:
+ ASSERT_NOT_REACHED();
+ return "<Unknown>";
+ }
+}
+
+const char* ByteCodeValue::name() const
+{
+ return name(op_code);
+}
+
+template<class T>
+bool Parser<T>::set_error(Error error)
+{
+ if (m_parser_state.error == Error::NoError) {
+ m_parser_state.error = error;
+ m_parser_state.error_token = m_parser_state.current_token;
+ }
+ return false; // always return false, that eases the API usage (return set_error(...)) :^)
+}
+
+template<class T>
+bool Parser<T>::done() const
+{
+ return match(TokenType::Eof);
+}
+
+template<class T>
+bool Parser<T>::match(TokenType type) const
+{
+ return m_parser_state.current_token.type() == type;
+}
+
+template<class T>
+Token Parser<T>::consume()
+{
+ auto old_token = m_parser_state.current_token;
+ m_parser_state.current_token = m_parser_state.lexer.next();
+ return old_token;
+}
+
+template<class T>
+Token Parser<T>::consume(TokenType type, Error error)
+{
+ if (m_parser_state.current_token.type() != type) {
+ set_error(error);
+#ifdef __serenity__
+ dbg() << "[PARSER] Error: Unexpected token " << m_parser_state.m_current_token.name() << ". Expected: " << Token::name(type);
+#else
+ fprintf(stderr, "[PARSER] Error: Unexpected token %s. Expected %s\n", m_parser_state.current_token.name(), Token::name(type));
+#endif
+ }
+ return consume();
+}
+
+template<class T>
+bool Parser<T>::consume(const String& str)
+{
+ size_t potentially_go_back { 1 };
+ for (auto ch : str) {
+ if (match(TokenType::OrdinaryCharacter)) {
+ if (m_parser_state.current_token.value()[0] != ch) {
+ m_parser_state.lexer.back(potentially_go_back);
+ m_parser_state.current_token = m_parser_state.lexer.next();
+ return false;
+ }
+ } else {
+ m_parser_state.lexer.back(potentially_go_back);
+ m_parser_state.current_token = m_parser_state.lexer.next();
+ return false;
+ }
+ consume(TokenType::OrdinaryCharacter);
+ ++potentially_go_back;
+ }
+ return true;
+}
+
+template<class T>
+void Parser<T>::reset()
+{
+ m_parser_state.bytecode.clear();
+ m_parser_state.lexer.reset();
+ m_parser_state.current_token = m_parser_state.lexer.next();
+ m_parser_state.error = Error::NoError;
+ m_parser_state.error_token = { TokenType::Eof, 0, StringView(nullptr) };
+ m_parser_state.regex_options = {};
+}
+
+template<class T>
+ParserResult Parser<T>::parse(Optional<OptionsType> regex_options)
+{
+ reset();
+ if (regex_options.has_value())
+ m_parser_state.regex_options = regex_options.value();
+ if (parse_internal(m_parser_state.bytecode, m_parser_state.match_length_minimum))
+ consume(TokenType::Eof);
+ else
+ set_error(Error::InvalidPattern);
+
+#ifdef REGEX_DEBUG
+ printf("[PARSER] Produced bytecode with %lu entries (opcodes + arguments)\n", m_parser_state.m_bytes.size());
+#endif
+ return {
+ move(m_parser_state.bytecode),
+ move(m_parser_state.capture_groups_count),
+ move(m_parser_state.named_capture_groups_count),
+ move(m_parser_state.match_length_minimum),
+ move(m_parser_state.error),
+ move(m_parser_state.error_token)
+ };
+}
+
+template<class T>
+void Parser<T>::insert_bytecode_compare_values(Vector<ByteCodeValue>& stack, Vector<CompareTypeAndValuePair>&& pairs)
+{
+ Vector<ByteCodeValue> bytecode;
+
+ bytecode.empend(OpCode::Compare);
+ bytecode.empend(pairs.size()); // number of arguments
+
+ for (auto& value : pairs) {
+ ASSERT(value.type != CharacterCompareType::RangeExpressionDummy);
+ ASSERT(value.type != CharacterCompareType::Undefined);
+ ASSERT(value.type != CharacterCompareType::OrdinaryCharacters);
+
+ bytecode.append(move(value.type));
+ if (value.type != CharacterCompareType::Inverse && value.type != CharacterCompareType::AnySingleCharacter)
+ bytecode.append(move(value.value));
+ }
+
+ stack.append(move(bytecode));
+}
+
+template<class T>
+void Parser<T>::insert_bytecode_group_capture_left(Vector<ByteCodeValue>& stack)
+{
+ stack.empend(OpCode::SaveLeftCaptureGroup);
+ stack.empend(m_parser_state.capture_groups_count);
+}
+
+template<class T>
+void Parser<T>::insert_bytecode_group_capture_left(Vector<ByteCodeValue>& stack, const StringView& name)
+{
+ stack.empend(OpCode::SaveLeftNamedCaptureGroup);
+ stack.empend(name.characters_without_null_termination());
+ stack.empend(name.length());
+}
+
+template<class T>
+void Parser<T>::insert_bytecode_group_capture_right(Vector<ByteCodeValue>& stack)
+{
+ stack.empend(OpCode::SaveRightCaptureGroup);
+ stack.empend(m_parser_state.capture_groups_count);
+}
+
+template<class T>
+void Parser<T>::insert_bytecode_group_capture_right(Vector<ByteCodeValue>& stack, const StringView& name)
+{
+ stack.empend(OpCode::SaveRightNamedCaptureGroup);
+ stack.empend(name.characters_without_null_termination());
+ stack.empend(name.length());
+}
+
+template<class T>
+void Parser<T>::insert_bytecode_alternation(Vector<ByteCodeValue>& stack, Vector<ByteCodeValue>&& left, Vector<ByteCodeValue>&& right)
+{
+
+ // FORKSTAY _ALT
+ // REGEXP ALT1
+ // JUMP _END
+ // LABEL _ALT
+ // REGEXP ALT2
+ // LABEL _END
+
+ stack.empend(OpCode::ForkJump);
+ stack.empend(left.size() + 2); // Jump to the _ALT label
+
+ for (auto& op : left)
+ stack.append(move(op));
+
+ stack.empend(OpCode::Jump);
+ stack.empend(right.size()); // Jump to the _END label
+
+ // LABEL _ALT = bytecode.size() + 2
+
+ for (auto& op : right)
+ stack.append(move(op));
+
+ // LABEL _END = alterantive_bytecode.size
+}
+
+template<class T>
+void Parser<T>::insert_bytecode_repetition_min_max(Vector<ByteCodeValue>& bytecode_to_repeat, size_t minimum, Optional<size_t> maximum)
+{
+ Vector<ByteCodeValue> new_bytecode;
+ insert_bytecode_repetition_n(new_bytecode, bytecode_to_repeat, minimum);
+
+ if (maximum.has_value()) {
+ if (maximum.value() > minimum) {
+ auto diff = maximum.value() - minimum;
+ new_bytecode.empend(OpCode::ForkStay);
+ new_bytecode.empend(diff * (bytecode_to_repeat.size() + 2)); // Jump to the _END label
+
+ for (size_t i = 0; i < diff; ++i) {
+ new_bytecode.append(bytecode_to_repeat);
+ new_bytecode.empend(OpCode::ForkStay);
+ new_bytecode.empend((diff - i - 1) * (bytecode_to_repeat.size() + 2)); // Jump to the _END label
+ }
+ }
+ } else {
+ // no maximum value set, repeat finding if possible
+ new_bytecode.empend(OpCode::ForkJump);
+ new_bytecode.empend(-bytecode_to_repeat.size() - 2); // Jump to the last iteration
+ }
+
+ bytecode_to_repeat = move(new_bytecode);
+}
+
+template<class T>
+void Parser<T>::insert_bytecode_repetition_n(Vector<ByteCodeValue>& stack, Vector<ByteCodeValue>& bytecode_to_repeat, size_t n)
+{
+ for (size_t i = 0; i < n; ++i)
+ stack.append(bytecode_to_repeat);
+}
+
+template<class T>
+void Parser<T>::insert_bytecode_repetition_min_one(Vector<ByteCodeValue>& bytecode_to_repeat, bool greedy)
+{
+ // LABEL _START = -bytecode_to_repeat.size()
+ // REGEXP
+ // FORKJUMP _START (FORKSTAY -> Greedy)
+
+ if (greedy)
+ bytecode_to_repeat.empend(OpCode::ForkStay);
+ else
+ bytecode_to_repeat.empend(OpCode::ForkJump);
+
+ bytecode_to_repeat.empend(-bytecode_to_repeat.size() - 1); // Jump to the _START label
+}
+
+template<class T>
+void Parser<T>::insert_bytecode_repetition_any(Vector<ByteCodeValue>& bytecode_to_repeat, bool greedy)
+{
+ // LABEL _START
+ // FORKSTAY _END (FORKJUMP -> Greedy)
+ // REGEXP
+ // JUMP _START
+ // LABEL _END
+
+ // LABEL _START = stack.size();
+ Vector<ByteCodeValue> bytecode;
+
+ if (greedy)
+ bytecode.empend(OpCode::ForkJump);
+ else
+ bytecode.empend(OpCode::ForkStay);
+
+ bytecode.empend(bytecode_to_repeat.size() + 2); // Jump to the _END label
+
+ for (auto& op : bytecode_to_repeat)
+ bytecode.append(move(op));
+
+ bytecode.empend(OpCode::Jump);
+ bytecode.empend(-bytecode.size() - 1); // Jump to the _START label
+ // LABEL _END = bytecode.size()
+
+ bytecode_to_repeat = move(bytecode);
+}
+
+template<class T>
+void Parser<T>::insert_bytecode_repetition_zero_or_one(Vector<ByteCodeValue>& bytecode_to_repeat, bool greedy)
+{
+ // FORKSTAY _END (FORKJUMP -> Greedy)
+ // REGEXP
+ // LABEL _END
+ Vector<ByteCodeValue> bytecode;
+
+ if (greedy)
+ bytecode.empend(OpCode::ForkJump);
+ else
+ bytecode.empend(OpCode::ForkStay);
+
+ bytecode.empend(bytecode_to_repeat.size()); // Jump to the _END label
+
+ for (auto& op : bytecode_to_repeat)
+ bytecode.append(move(op));
+ // LABEL _END = bytecode.size()
+
+ bytecode_to_repeat = move(bytecode);
+}
+
+// =============================
+// PosixExtended Parser
+// =============================
+
+bool PosixExtendedParser::parse_internal(Vector<ByteCodeValue>& stack, size_t& match_length_minimum)
+{
+ return parse_root(stack, match_length_minimum);
+}
+
+bool PosixExtendedParser::match_repetition_symbol()
+{
+ auto type = m_parser_state.current_token.type();
+ return (type == TokenType::Asterisk
+ || type == TokenType::Plus
+ || type == TokenType::Questionmark
+ || type == TokenType::LeftCurly);
+}
+
+bool PosixExtendedParser::match_ordinary_characters()
+{
+ // NOTE: This method must not be called during bracket and repetition parsing!
+ // FIXME: Add assertion for that?
+ auto type = m_parser_state.current_token.type();
+ return (type == TokenType::OrdinaryCharacter
+ || type == TokenType::Comma
+ || type == TokenType::Slash
+ || type == TokenType::EqualSign
+ || type == TokenType::HyphenMinus
+ || type == TokenType::Colon);
+}
+
+bool PosixExtendedParser::parse_repetition_symbol(Vector<ByteCodeValue>& bytecode_to_repeat, size_t& match_length_minimum)
+{
+ if (match(TokenType::LeftCurly)) {
+ consume();
+
+ StringBuilder number_builder;
+ bool ok;
+
+ while (match(TokenType::OrdinaryCharacter)) {
+ number_builder.append(consume().value());
+ }
+
+ size_t minimum = number_builder.build().to_uint(ok);
+ if (!ok)
+ return set_error(Error::InvalidBraceContent);
+
+ match_length_minimum *= minimum;
+
+ if (match(TokenType::Comma)) {
+ consume();
+ } else {
+ Vector<ByteCodeValue> bytecode;
+ insert_bytecode_repetition_n(bytecode, bytecode_to_repeat, minimum);
+ bytecode_to_repeat = move(bytecode);
+ return !has_error();
+ }
+
+ Optional<size_t> maximum {};
+ number_builder.clear();
+ while (match(TokenType::OrdinaryCharacter)) {
+ number_builder.append(consume().value());
+ }
+ if (!number_builder.is_empty()) {
+ maximum = number_builder.build().to_uint(ok);
+ if (!ok || minimum > maximum.value())
+ return set_error(Error::InvalidBraceContent);
+ }
+
+ insert_bytecode_repetition_min_max(bytecode_to_repeat, minimum, maximum);
+
+ consume(TokenType::RightCurly, Error::BraceMismatch);
+ return !has_error();
+
+ } else if (match(TokenType::Plus)) {
+ consume();
+
+ bool greedy = match(TokenType::Questionmark);
+ if (greedy)
+ consume();
+
+ // Note: dont touch match_length_minimum, it's already correct
+ insert_bytecode_repetition_min_one(bytecode_to_repeat, greedy);
+ return !has_error();
+
+ } else if (match(TokenType::Asterisk)) {
+ consume();
+ match_length_minimum = 0;
+
+ bool greedy = match(TokenType::Questionmark);
+ if (greedy)
+ consume();
+
+ insert_bytecode_repetition_any(bytecode_to_repeat, greedy);
+
+ return !has_error();
+
+ } else if (match(TokenType::Questionmark)) {
+ consume();
+ match_length_minimum = 0;
+
+ bool greedy = match(TokenType::Questionmark);
+ if (greedy)
+ consume();
+
+ insert_bytecode_repetition_zero_or_one(bytecode_to_repeat, greedy);
+ return !has_error();
+ }
+
+ return false;
+}
+
+bool PosixExtendedParser::parse_bracket_expression(Vector<ByteCodeValue>& stack, size_t& match_length_minimum)
+{
+ Vector<CompareTypeAndValuePair> values;
+
+ for (;;) {
+
+ if (match(TokenType::HyphenMinus)) {
+ consume();
+ if (values.is_empty() || (values.size() == 1 && values.last().type == CharacterCompareType::Inverse)) {
+ // first in the bracket expression
+ values.append({ CharacterCompareType::OrdinaryCharacter, { '-' } });
+
+ } else if (match(TokenType::RightBracket)) {
+ // Last in the bracket expression
+ values.append({ CharacterCompareType::OrdinaryCharacter, { '-' } });
+
+ } else if (values.last().type == CharacterCompareType::OrdinaryCharacter) {
+
+ values.append({ CharacterCompareType::RangeExpressionDummy, 0 });
+
+ if (match(TokenType::HyphenMinus)) {
+ consume();
+ // Valid range, add ordinary character
+ values.append({ CharacterCompareType::OrdinaryCharacter, { '-' } });
+ }
+ } else {
+ return set_error(Error::InvalidRange);
+ }
+
+ } else if (match(TokenType::OrdinaryCharacter) || match(TokenType::Period) || match(TokenType::Asterisk) || match(TokenType::EscapeSequence) || match(TokenType::Plus)) {
+ values.append({ CharacterCompareType::OrdinaryCharacter, { *consume().value().characters_without_null_termination() } });
+
+ } else if (match(TokenType::Circumflex)) {
+ auto t = consume();
+
+ if (values.is_empty())
+ values.append({ CharacterCompareType::Inverse, 0 });
+ else
+ values.append({ CharacterCompareType::OrdinaryCharacter, { *t.value().characters_without_null_termination() } });
+
+ } else if (match(TokenType::LeftBracket)) {
+ consume();
+
+ if (match(TokenType::Period)) {
+ consume();
+
+ // FIXME: Parse collating element, this is needed when we have locale support
+ // This could have impact on length parameter, I guess.
+ ASSERT_NOT_REACHED();
+
+ consume(TokenType::Period, Error::InvalidCollationElement);
+ consume(TokenType::RightBracket, Error::BracketMismatch);
+
+ } else if (match(TokenType::EqualSign)) {
+ consume();
+ // FIXME: Parse collating element, this is needed when we have locale support
+ // This could have impact on length parameter, I guess.
+ ASSERT_NOT_REACHED();
+
+ consume(TokenType::EqualSign, Error::InvalidCollationElement);
+ consume(TokenType::RightBracket, Error::BracketMismatch);
+
+ } else if (match(TokenType::Colon)) {
+ consume();
+
+ CharacterClass ch_class;
+ // parse character class
+ if (match(TokenType::OrdinaryCharacter)) {
+ if (consume("alnum"))
+ ch_class = CharacterClass::Alnum;
+ else if (consume("alpha"))
+ ch_class = CharacterClass::Alpha;
+ else if (consume("blank"))
+ ch_class = CharacterClass::Blank;
+ else if (consume("cntrl"))
+ ch_class = CharacterClass::Cntrl;
+ else if (consume("digit"))
+ ch_class = CharacterClass::Digit;
+ else if (consume("graph"))
+ ch_class = CharacterClass::Graph;
+ else if (consume("lower"))
+ ch_class = CharacterClass::Lower;
+ else if (consume("print"))
+ ch_class = CharacterClass::Print;
+ else if (consume("punct"))
+ ch_class = CharacterClass::Punct;
+ else if (consume("space"))
+ ch_class = CharacterClass::Space;
+ else if (consume("upper"))
+ ch_class = CharacterClass::Upper;
+ else if (consume("xdigit"))
+ ch_class = CharacterClass::Xdigit;
+ else
+ return set_error(Error::InvalidCharacterClass);
+
+ values.append({ CharacterCompareType::CharacterClass, ch_class });
+
+ } else
+ return set_error(Error::InvalidCharacterClass);
+
+ // FIXME: we do not support locale specific character classes until locales are implemented
+
+ consume(TokenType::Colon, Error::InvalidCharacterClass);
+ consume(TokenType::RightBracket, Error::BracketMismatch);
+ }
+ } else if (match(TokenType::RightBracket)) {
+
+ if (values.is_empty() || (values.size() == 1 && values.last().type == CharacterCompareType::Inverse)) {
+ // handle bracket as ordinary character
+ values.append({ CharacterCompareType::OrdinaryCharacter, { *consume().value().characters_without_null_termination() } });
+ } else {
+ // closing bracket expression
+ break;
+ }
+ } else
+ // nothing matched, this is a failure, as at least the closing bracket must match...
+ return set_error(Error::BracketMismatch);
+
+ // check if range expression has to be completed...
+ if (values.size() >= 3 && values.at(values.size() - 2).type == CharacterCompareType::RangeExpressionDummy) {
+ if (values.last().type != CharacterCompareType::OrdinaryCharacter)
+ return set_error(Error::InvalidRange);
+
+ auto value2 = values.take_last();
+ values.take_last(); // RangeExpressionDummy
+ auto value1 = values.take_last();
+
+ values.append({ CharacterCompareType::RangeExpression, ByteCodeValue { value1.value.ch, value2.value.ch } });
+ }
+ }
+
+ if (values.size())
+ match_length_minimum = 1;
+
+ if (values.first().type == CharacterCompareType::Inverse)
+ match_length_minimum = 0;
+
+ insert_bytecode_compare_values(stack, move(values));
+
+ return !has_error();
+}
+
+bool PosixExtendedParser::parse_sub_expression(Vector<ByteCodeValue>& stack, size_t& match_length_minimum)
+{
+ Vector<ByteCodeValue> bytecode;
+ size_t length = 0;
+ bool should_parse_repetition_symbol { false };
+
+ for (;;) {
+ if (match_ordinary_characters()) {
+ Token start_token = m_parser_state.current_token;
+ Token last_token = m_parser_state.current_token;
+ for (;;) {
+ if (!match_ordinary_characters())
+ break;
+ ++length;
+ last_token = consume();
+ }
+
+ if (length > 1) {
+ stack.empend(OpCode::Compare);
+ stack.empend(1ul); // number of arguments
+ stack.empend(CharacterCompareType::OrdinaryCharacters);
+ stack.empend(start_token.value().characters_without_null_termination());
+ stack.empend(length - ((match_repetition_symbol() && length > 1) ? 1 : 0)); // last character is inserted into 'bytecode' for duplication symbol handling
+ }
+
+ if ((match_repetition_symbol() && length > 1) || length == 1) // Create own compare opcode for last character before duplication symbol
+ insert_bytecode_compare_values(bytecode, { { CharacterCompareType::OrdinaryCharacter, { last_token.value().characters_without_null_termination()[0] } } });
+
+ should_parse_repetition_symbol = true;
+ break;
+ }
+
+ if (match_repetition_symbol())
+ return set_error(Error::InvalidRepetitionMarker);
+
+ if (match(TokenType::Period)) {
+ length = 1;
+ consume();
+ insert_bytecode_compare_values(bytecode, { { CharacterCompareType::AnySingleCharacter, { 0 } } });
+ should_parse_repetition_symbol = true;
+ break;
+ }
+
+ if (match(TokenType::EscapeSequence)) {
+ length = 1;
+ Token t = consume();
+#ifdef REGEX_DEBUG
+ printf("[PARSER] EscapeSequence with substring %s\n", String(t.value()).characters());
+#endif
+
+ insert_bytecode_compare_values(bytecode, { { CharacterCompareType::OrdinaryCharacter, { (char)t.value().characters_without_null_termination()[1] } } });
+ should_parse_repetition_symbol = true;
+ break;
+ }
+
+ if (match(TokenType::LeftBracket)) {
+ consume();
+
+ Vector<ByteCodeValue> sub_ops;
+ if (!parse_bracket_expression(sub_ops, length) || !sub_ops.size())
+ return set_error(Error::BracketMismatch);
+
+ bytecode.append(move(sub_ops));
+
+ consume(TokenType::RightBracket);
+ should_parse_repetition_symbol = true;
+ break;
+ }
+
+ if (match(TokenType::RightBracket)) {
+ return set_error(Error::BracketMismatch);
+ }
+
+ if (match(TokenType::RightCurly)) {
+ return set_error(Error::BraceMismatch);
+ }
+
+ if (match(TokenType::Circumflex)) {
+ consume();
+ bytecode.empend(OpCode::CheckBegin);
+ break;
+ }
+
+ if (match(TokenType::Dollar)) {
+ consume();
+ bytecode.empend(OpCode::CheckEnd);
+ break;
+ }
+
+ if (match(TokenType::LeftParen)) {
+ consume();
+ Optional<StringView> capture_group_name;
+ bool no_subexpression_match_qualifier = false;
+ if (match(TokenType::Questionmark)) {
+ consume();
+
+ if (match(TokenType::Colon)) {
+ consume();
+ no_subexpression_match_qualifier = true;
+ } else if (consume("<")) { // named capturing group
+
+ Token start_token = m_parser_state.current_token;
+ Token last_token = m_parser_state.current_token;
+ size_t capture_group_name_length = 0;
+ for (;;) {
+ if (!match_ordinary_characters())
+ return set_error(Error::InvalidNameForCaptureGroup);
+ if (match(TokenType::OrdinaryCharacter) && m_parser_state.current_token.value()[0] == '>') {
+ consume();
+ break;
+ }
+ ++capture_group_name_length;
+ last_token = consume();
+ }
+ capture_group_name = StringView(start_token.value().characters_without_null_termination(), capture_group_name_length);
+
+ } else if (match(TokenType::EqualSign)) { // positive lookahead
+ consume();
+ ASSERT_NOT_REACHED();
+ } else if (consume("!")) { // negative lookahead
+ ASSERT_NOT_REACHED();
+ } else if (consume("<")) {
+ if (match(TokenType::EqualSign)) { // positive lookbehind
+ consume();
+ ASSERT_NOT_REACHED();
+ }
+ if (consume("!")) // negative lookbehind
+ ASSERT_NOT_REACHED();
+ } else {
+ return set_error(Error::InvalidRepetitionMarker);
+ }
+ }
+
+ if (!(m_parser_state.regex_options & (u8)AllFlags::NoSubExpressions || no_subexpression_match_qualifier)) {
+ if (capture_group_name.has_value())
+ insert_bytecode_group_capture_left(bytecode, capture_group_name.value());
+ else
+ insert_bytecode_group_capture_left(bytecode);
+ }
+
+ Vector<ByteCodeValue> capture_group_bytecode;
+
+ bool res = !parse_root(capture_group_bytecode, length);
+
+ if (capture_group_bytecode.is_empty() && match(TokenType::RightParen))
+ return set_error(Error::ParenEmpty);
+
+ if (!res)
+ return false;
+
+ bytecode.append(move(capture_group_bytecode));
+
+ consume(TokenType::RightParen, Error::ParenMismatch);
+
+ if (!(m_parser_state.regex_options & (u8)AllFlags::NoSubExpressions || no_subexpression_match_qualifier)) {
+ if (capture_group_name.has_value()) {
+ insert_bytecode_group_capture_right(bytecode, capture_group_name.value());
+ ++m_parser_state.named_capture_groups_count;
+ } else {
+ insert_bytecode_group_capture_right(bytecode);
+ ++m_parser_state.capture_groups_count;
+ }
+ }
+ should_parse_repetition_symbol = true;
+ break;
+ }
+
+ return false;
+ }
+
+ if (match_repetition_symbol()) {
+ if (should_parse_repetition_symbol)
+ parse_repetition_symbol(bytecode, length);
+ else
+ return set_error(Error::InvalidRepetitionMarker);
+ }
+
+ stack.append(move(bytecode));
+ match_length_minimum += length;
+
+ return true;
+}
+
+bool PosixExtendedParser::parse_root(Vector<ByteCodeValue>& stack, size_t& match_length_minimum)
+{
+ Vector<ByteCodeValue> bytecode_left;
+ size_t match_length_minimum_left { 0 };
+
+ if (match_repetition_symbol())
+ return set_error(Error::InvalidRepetitionMarker);
+
+ for (;;) {
+ if (!parse_sub_expression(bytecode_left, match_length_minimum_left))
+ break;
+
+ if (match(TokenType::Pipe)) {
+ consume();
+
+ Vector<ByteCodeValue> bytecode_right;
+ size_t match_length_minimum_right { 0 };
+
+ if (!parse_root(bytecode_right, match_length_minimum_right) || bytecode_right.is_empty())
+ return set_error(Error::InvalidPattern);
+
+ Vector<ByteCodeValue> new_bytecode;
+ insert_bytecode_alternation(new_bytecode, move(bytecode_left), move(bytecode_right));
+ bytecode_left = move(new_bytecode);
+ match_length_minimum_left = min(match_length_minimum_right, match_length_minimum_left);
+ }
+ }
+
+ stack.append(move(bytecode_left));
+ match_length_minimum = match_length_minimum_left;
+ return !has_error();
+}
+}
+}
diff --git a/Libraries/LibRegex/RegexParser.h b/Libraries/LibRegex/RegexParser.h
new file mode 100644
index 0000000000..a94dfa3a28
--- /dev/null
+++ b/Libraries/LibRegex/RegexParser.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include "RegexByteCode.h"
+#include "RegexError.h"
+#include "RegexLexer.h"
+#include "RegexOptions.h"
+
+#include <AK/Forward.h>
+#include <AK/StringBuilder.h>
+#include <AK/Types.h>
+#include <AK/Vector.h>
+
+namespace regex {
+
+class PosixExtendedParser;
+
+template<typename T>
+struct GenericParserTraits {
+ using OptionsType = T;
+};
+
+template<typename T>
+struct ParserTraits : public GenericParserTraits<T> {
+};
+
+template<>
+struct ParserTraits<PosixExtendedParser> : public GenericParserTraits<PosixOptions> {
+};
+
+class Parser {
+public:
+ struct Result {
+ ByteCode bytecode;
+ size_t capture_groups_count;
+ size_t named_capture_groups_count;
+ size_t match_length_minimum;
+ Error error;
+ Token error_token;
+ };
+
+ explicit Parser(Lexer& lexer)
+ : m_parser_state(lexer)
+ {
+ }
+
+ Parser(Lexer& lexer, AllOptions regex_options)
+ : m_parser_state(lexer, regex_options)
+ {
+ }
+
+ virtual ~Parser() = default;
+
+ Result parse(Optional<AllOptions> regex_options = {});
+ bool has_error() const { return m_parser_state.error != Error::NoError; }
+ Error error() const { return m_parser_state.error; }
+
+protected:
+ virtual bool parse_internal(ByteCode&, size_t& match_length_minimum) = 0;
+
+ ALWAYS_INLINE bool match(TokenType type) const;
+ ALWAYS_INLINE bool match(char ch) const;
+ ALWAYS_INLINE Token consume();
+ ALWAYS_INLINE Token consume(TokenType type, Error error);
+ ALWAYS_INLINE bool consume(const String&);
+ ALWAYS_INLINE void reset();
+ ALWAYS_INLINE bool done() const;
+ ALWAYS_INLINE bool set_error(Error error);
+
+ struct ParserState {
+ Lexer& lexer;
+ Token current_token;
+ Error error = Error::NoError;
+ Token error_token { TokenType::Eof, 0, StringView(nullptr) };
+ ByteCode bytecode;
+ size_t capture_groups_count { 0 };
+ size_t named_capture_groups_count { 0 };
+ size_t match_length_minimum { 0 };
+ AllOptions regex_options;
+ explicit ParserState(Lexer& lexer)
+ : lexer(lexer)
+ , current_token(lexer.next())
+ {
+ }
+ explicit ParserState(Lexer& lexer, AllOptions regex_options)
+ : lexer(lexer)
+ , current_token(lexer.next())
+ , regex_options(regex_options)
+ {
+ }
+ };
+
+ ParserState m_parser_state;
+};
+
+class PosixExtendedParser final : public Parser {
+public:
+ explicit PosixExtendedParser(Lexer& lexer)
+ : Parser(lexer) {};
+ PosixExtendedParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options)
+ : Parser(lexer, regex_options.value_or({})) {};
+ ~PosixExtendedParser() = default;
+
+private:
+ ALWAYS_INLINE bool match_repetition_symbol();
+ ALWAYS_INLINE bool match_ordinary_characters();
+
+ bool parse_internal(ByteCode&, size_t&) override;
+
+ bool parse_root(ByteCode&, size_t&);
+ ALWAYS_INLINE bool parse_sub_expression(ByteCode&, size_t&);
+ ALWAYS_INLINE bool parse_bracket_expression(ByteCode&, size_t&);
+ ALWAYS_INLINE bool parse_repetition_symbol(ByteCode&, size_t&);
+};
+
+using PosixExtended = PosixExtendedParser;
+
+}
+
+using regex::PosixExtended;
diff --git a/Libraries/LibRegex/RegexParser.h_ b/Libraries/LibRegex/RegexParser.h_
new file mode 100644
index 0000000000..2f0dbed638
--- /dev/null
+++ b/Libraries/LibRegex/RegexParser.h_
@@ -0,0 +1,257 @@
+/*
+ * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include "RegexError.h"
+#include "RegexLexer.h"
+#include "RegexOptions.h"
+#include <AK/Forward.h>
+#include <AK/Types.h>
+#include <AK/Vector.h>
+
+namespace AK {
+namespace regex {
+
+#define ENUMERATE_OPCODES \
+ __ENUMERATE_OPCODE(Compare) \
+ __ENUMERATE_OPCODE(Jump) \
+ __ENUMERATE_OPCODE(ForkJump) \
+ __ENUMERATE_OPCODE(ForkStay) \
+ __ENUMERATE_OPCODE(SaveLeftCaptureGroup) \
+ __ENUMERATE_OPCODE(SaveRightCaptureGroup) \
+ __ENUMERATE_OPCODE(SaveLeftNamedCaptureGroup) \
+ __ENUMERATE_OPCODE(SaveRightNamedCaptureGroup) \
+ __ENUMERATE_OPCODE(CheckBegin) \
+ __ENUMERATE_OPCODE(CheckEnd) \
+ __ENUMERATE_OPCODE(Exit)
+
+enum class OpCode : u8 {
+#define __ENUMERATE_OPCODE(x) x,
+ ENUMERATE_OPCODES
+#undef __ENUMERATE_OPCODE
+};
+
+enum class CharacterCompareType : u8 {
+ Undefined,
+ Inverse,
+ AnySingleCharacter,
+ OrdinaryCharacter,
+ OrdinaryCharacters,
+ CharacterClass,
+ RangeExpression,
+ RangeExpressionDummy,
+};
+
+enum class CharacterClass : u8 {
+ Alnum,
+ Cntrl,
+ Lower,
+ Space,
+ Alpha,
+ Digit,
+ Print,
+ Upper,
+ Blank,
+ Graph,
+ Punct,
+ Xdigit,
+};
+
+class ByteCodeValue {
+public:
+ union CompareValue {
+ CompareValue(const CharacterClass value)
+ : character_class(value)
+ {
+ }
+ CompareValue(const char value1, const char value2)
+ : range_values { value1, value2 }
+ {
+ }
+ const CharacterClass character_class;
+ const struct {
+ const char from;
+ const char to;
+ } range_values;
+ };
+
+ union {
+ const OpCode op_code;
+ const char* string;
+ const char ch;
+ const int number;
+ const size_t positive_number;
+ const CompareValue compare_value;
+ const CharacterCompareType compare_type;
+ };
+
+ const char* name() const;
+ static const char* name(OpCode);
+
+ ByteCodeValue(const OpCode value)
+ : op_code(value)
+ {
+ }
+ ByteCodeValue(const char* value)
+ : string(value)
+ {
+ }
+ ByteCodeValue(const char value)
+ : ch(value)
+ {
+ }
+ ByteCodeValue(const int value)
+ : number(value)
+ {
+ }
+ ByteCodeValue(const size_t value)
+ : positive_number(value)
+ {
+ }
+ ByteCodeValue(const CharacterClass value)
+ : compare_value(value)
+ {
+ }
+ ByteCodeValue(const char value1, const char value2)
+ : compare_value(value1, value2)
+ {
+ }
+ ByteCodeValue(const CharacterCompareType value)
+ : compare_type(value)
+ {
+ }
+
+ ~ByteCodeValue() = default;
+};
+
+struct CompareTypeAndValuePair {
+ CharacterCompareType type;
+ ByteCodeValue value;
+};
+
+struct ParserResult {
+ Vector<ByteCodeValue> m_bytes;
+ size_t m_match_groups;
+ size_t m_min_match_length;
+ Error m_error;
+ Token m_error_token;
+};
+
+template<class T>
+class Parser {
+public:
+ explicit Parser(Lexer& lexer)
+ : m_parser_state(lexer)
+ {
+ }
+
+ Parser(Lexer& lexer, T options)
+ : m_parser_state(lexer, options)
+ {
+ }
+
+ virtual ~Parser() = default;
+
+ virtual ParserResult parse(T options = {}, EngineOptions engine_options = {});
+ bool has_error() const { return m_parser_state.m_error != Error::NoError; }
+ Error error() const { return m_parser_state.m_error; }
+
+protected:
+ virtual bool parse_internal(Vector<ByteCodeValue>&, size_t& min_length) = 0;
+
+ bool match(TokenType type) const;
+ bool match(char ch) const;
+ Token consume();
+ Token consume(TokenType type, Error error = Error::InvalidPattern);
+ bool consume(const String&);
+ void reset();
+ bool done() const;
+
+ bool set_error(Error error);
+
+ void insert_bytecode_compare_values(Vector<ByteCodeValue>&, Vector<CompareTypeAndValuePair>&&);
+ void insert_bytecode_group_capture_left(Vector<ByteCodeValue>& stack);
+ void insert_bytecode_group_capture_right(Vector<ByteCodeValue>& stack);
+ void insert_bytecode_group_capture_left(Vector<ByteCodeValue>& stack, const StringView& name);
+ void insert_bytecode_group_capture_right(Vector<ByteCodeValue>& stack, const StringView& name);
+ void insert_bytecode_alternation(Vector<ByteCodeValue>& stack, Vector<ByteCodeValue>&&, Vector<ByteCodeValue>&&);
+ void insert_bytecode_repetition_min_max(Vector<ByteCodeValue>& bytecode_to_repeat, size_t minimum, Optional<size_t> maximum);
+ void insert_bytecode_repetition_n(Vector<ByteCodeValue>& stack, Vector<ByteCodeValue>& bytecode_to_repeat, size_t n);
+ void insert_bytecode_repetition_min_one(Vector<ByteCodeValue>& bytecode_to_repeat, bool greedy);
+ void insert_bytecode_repetition_any(Vector<ByteCodeValue>& bytecode_to_repeat, bool greedy);
+ void insert_bytecode_repetition_zero_or_one(Vector<ByteCodeValue>& bytecode_to_repeat, bool greedy);
+
+ struct ParserState {
+ Lexer& lexer;
+ Token current_token;
+ Error error = Error::NoError;
+ Token error_token { TokenType::Eof, 0, StringView(nullptr) };
+ Vector<ByteCodeValue> bytecode;
+ size_t capture_groups_count { 0 };
+ size_t named_capture_groups_count { 0 };
+ size_t match_length_minimum { 0 };
+ OptionsType regex_options;
+ explicit ParserState(Lexer& lexer)
+ : lexer(lexer)
+ , current_token(lexer.next())
+ {
+ }
+ explicit ParserState(Lexer& lexer, Optional<OptionsType> regex_options)
+ : lexer(lexer)
+ , current_token(lexer.next())
+ , regex_options(regex_options.value_or({}))
+ {
+ }
+ };
+
+ ParserState m_parser_state;
+};
+
+class PosixExtendedParser final : public Parser<PosixOptions> {
+public:
+ explicit PosixExtendedParser(Lexer& lexer)
+ : Parser(lexer) {};
+ PosixExtendedParser(Lexer& lexer, Optional<OptionsType> regex_options)
+ : Parser(lexer, regex_options) {};
+ ~PosixExtendedParser() = default;
+
+private:
+ bool match_repetition_symbol();
+ bool match_ordinary_characters();
+
+ bool parse_internal(Vector<ByteCodeValue>&, size_t&) override;
+
+ bool parse_root(Vector<ByteCodeValue>&, size_t&);
+ bool parse_sub_expression(Vector<ByteCodeValue>&, size_t&);
+ bool parse_bracket_expression(Vector<ByteCodeValue>&, size_t&);
+ bool parse_repetition_symbol(Vector<ByteCodeValue>&, size_t&);
+};
+}
+}
+
+using AK::regex::ParserResult;
+using AK::regex::PosixExtendedParser;
diff --git a/Libraries/LibRegex/Tests/Benchmark.cpp b/Libraries/LibRegex/Tests/Benchmark.cpp
new file mode 100644
index 0000000000..fa90c2f80f
--- /dev/null
+++ b/Libraries/LibRegex/Tests/Benchmark.cpp
@@ -0,0 +1,914 @@
+/*
+ * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <AK/TestSuite.h> // import first, to prevent warning of ASSERT* redefinition
+
+#include <LibRegex/Regex.h>
+#include <stdio.h>
+
+#ifndef REGEX_DEBUG
+
+# define BENCHMARK_LOOP_ITERATIONS 100000
+
+//# define REGEX_BENCHMARK_OUR
+# ifndef __serenity__
+//# define REGEX_BENCHMARK_OTHER
+# endif
+
+# if defined(REGEX_BENCHMARK_OTHER)
+# include <regex>
+# endif
+
+# if defined(REGEX_BENCHMARK_OUR)
+BENCHMARK_CASE(catch_all_benchmark)
+{
+ Regex<PosixExtended> re("^.*$");
+ RegexResult m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT(re.match("Hello World", m));
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OTHER)
+BENCHMARK_CASE(catch_all_benchmark_reference_stdcpp_regex_match)
+{
+ std::regex re("^.*$");
+ std::cmatch m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(std::regex_match("Hello World", m, re), true);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OUR)
+BENCHMARK_CASE(simple_start_benchmark)
+{
+ Regex<PosixExtended> re("^hello friends");
+ RegexResult m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(re.match("Hello!", m), false);
+ EXPECT_EQ(re.match("hello friends", m), true);
+ EXPECT_EQ(re.match("Well, hello friends", m), false);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OTHER)
+BENCHMARK_CASE(simple_start_benchmark_reference_stdcpp_regex_match)
+{
+ std::regex re("^hello friends");
+ std::cmatch m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(std::regex_match("Hello", m, re), false);
+ EXPECT_EQ(std::regex_match("hello friends", m, re), true);
+ EXPECT_EQ(std::regex_match("Well, hello friends", m, re), false);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OUR)
+BENCHMARK_CASE(simple_end_benchmark)
+{
+ Regex<PosixExtended> re(".*hello\\.\\.\\. there$");
+ RegexResult m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(re.match("Hallo", m), false);
+ EXPECT_EQ(re.match("I said fyhello... there", m), true);
+ EXPECT_EQ(re.match("ahello... therea", m), false);
+ EXPECT_EQ(re.match("hello.. there", m), false);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OTHER)
+BENCHMARK_CASE(simple_end_benchmark_reference_stdcpp_regex_search)
+{
+ std::regex re(".*hello\\.\\.\\. there$");
+ std::cmatch m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(std::regex_search("Hallo", m, re), false);
+ EXPECT_EQ(std::regex_search("I said fyhello... there", m, re), true);
+ EXPECT_EQ(std::regex_search("ahello... therea", m, re), false);
+ EXPECT_EQ(std::regex_search("hello.. there", m, re), false);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OUR)
+BENCHMARK_CASE(simple_period_benchmark)
+{
+ Regex<PosixExtended> re("hello.");
+ RegexResult m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(re.match("Hello1", m), false);
+ EXPECT_EQ(re.match("hello1", m), true);
+ EXPECT_EQ(re.match("hello2", m), true);
+ EXPECT_EQ(re.match("hello?", m), true);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OTHER)
+BENCHMARK_CASE(simple_period_benchmark_reference_stdcpp_regex_match)
+{
+ std::regex re("hello.");
+ std::cmatch m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(std::regex_match("Hello1", m, re), false);
+ EXPECT_EQ(std::regex_match("hello1", m, re), true);
+ EXPECT_EQ(std::regex_match("hello2", m, re), true);
+ EXPECT_EQ(std::regex_match("hello?", m, re), true);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OUR)
+BENCHMARK_CASE(simple_period_end_benchmark)
+{
+ Regex<PosixExtended> re("hello.$");
+ RegexResult m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(re.search("Hello1", m), false);
+ EXPECT_EQ(re.search("hello1hello1", m), true);
+ EXPECT_EQ(re.search("hello2hell", m), false);
+ EXPECT_EQ(re.search("hello?", m), true);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OTHER)
+BENCHMARK_CASE(simple_period_end_benchmark_reference_stdcpp_regex_search)
+{
+ std::regex re("hello.$");
+ std::cmatch m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(std::regex_search("Hello1", m, re), false);
+ EXPECT_EQ(std::regex_search("hello1hello1", m, re), true);
+ EXPECT_EQ(std::regex_search("hello2hell", m, re), false);
+ EXPECT_EQ(std::regex_search("hello?", m, re), true);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OUR)
+BENCHMARK_CASE(simple_escaped_benchmark)
+{
+ Regex<PosixExtended> re("hello\\.");
+ RegexResult m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(re.match("hello", m), false);
+ EXPECT_EQ(re.match("hello.", m), true);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OTHER)
+BENCHMARK_CASE(simple_escaped_benchmark_reference_stdcpp_regex_match)
+{
+ std::regex re("hello\\.");
+ std::cmatch m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(std::regex_match("hello", m, re), false);
+ EXPECT_EQ(std::regex_match("hello.", m, re), true);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OUR)
+BENCHMARK_CASE(simple_period2_end_benchmark)
+{
+ Regex<PosixExtended> re(".*hi... there$");
+ RegexResult m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(re.search("Hello there", m), false);
+ EXPECT_EQ(re.search("I said fyhi... there", m), true);
+ EXPECT_EQ(re.search("....hi... ", m), false);
+ EXPECT_EQ(re.search("I said fyhihii there", m), true);
+ EXPECT_EQ(re.search("I said fyhihi there", m), false);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OTHER)
+BENCHMARK_CASE(simple_period2_end_benchmark_reference_stdcpp_regex_search)
+{
+ std::regex re(".*hi... there$");
+ std::cmatch m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(std::regex_search("Hello there", m, re), false);
+ EXPECT_EQ(std::regex_search("I said fyhi... there", m, re), true);
+ EXPECT_EQ(std::regex_search("....hi... ", m, re), false);
+ EXPECT_EQ(std::regex_search("I said fyhihii there", m, re), true);
+ EXPECT_EQ(std::regex_search("I said fyhihi there", m, re), false);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OUR)
+BENCHMARK_CASE(simple_plus_benchmark)
+{
+ Regex<PosixExtended> re("a+");
+ RegexResult m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(re.search("b", m), false);
+ EXPECT_EQ(re.search("a", m), true);
+ EXPECT_EQ(re.search("aaaaaabbbbb", m), true);
+ EXPECT_EQ(re.search("aaaaaaaaaaa", m), true);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OTHER)
+BENCHMARK_CASE(simple_plus_benchmark_reference_stdcpp_regex_search)
+{
+ std::regex re("a+");
+ std::cmatch m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(std::regex_search("b", m, re), false);
+ EXPECT_EQ(std::regex_search("a", m, re), true);
+ EXPECT_EQ(std::regex_search("aaaaaabbbbb", m, re), true);
+ EXPECT_EQ(std::regex_search("aaaaaaaaaaa", m, re), true);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OUR)
+BENCHMARK_CASE(simple_questionmark_benchmark)
+{
+ Regex<PosixExtended> re("da?d");
+ RegexResult m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(re.search("a", m), false);
+ EXPECT_EQ(re.search("daa", m), false);
+ EXPECT_EQ(re.search("ddddd", m), true);
+ EXPECT_EQ(re.search("dd", m), true);
+ EXPECT_EQ(re.search("dad", m), true);
+ EXPECT_EQ(re.search("dada", m), true);
+ EXPECT_EQ(re.search("adadaa", m), true);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OTHER)
+BENCHMARK_CASE(simple_questionmark_benchmark_reference_stdcpp_regex_search)
+{
+ std::regex re("da?d");
+ std::cmatch m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(std::regex_search("a", m, re), false);
+ EXPECT_EQ(std::regex_search("daa", m, re), false);
+ EXPECT_EQ(std::regex_search("ddddd", m, re), true);
+ EXPECT_EQ(std::regex_search("dd", m, re), true);
+ EXPECT_EQ(std::regex_search("dad", m, re), true);
+ EXPECT_EQ(std::regex_search("dada", m, re), true);
+ EXPECT_EQ(std::regex_search("adadaa", m, re), true);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OUR)
+BENCHMARK_CASE(character_class_benchmark)
+{
+ Regex<PosixExtended> re("[[:alpha:]]");
+ RegexResult m;
+ String haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n";
+
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(re.match(haystack.characters(), m), false);
+ EXPECT_EQ(re.search(haystack.characters(), m), true);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OTHER)
+BENCHMARK_CASE(character_class_benchmark_reference_stdcpp_regex_search)
+{
+ std::regex re("[[:alpha:]]");
+ std::cmatch m;
+ String haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n";
+
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(std::regex_match(haystack.characters(), m, re), false);
+ EXPECT_EQ(std::regex_search(haystack.characters(), m, re), true);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OUR)
+BENCHMARK_CASE(escaped_char_questionmark_benchmark)
+{
+ Regex<PosixExtended> re("This\\.?And\\.?That");
+ RegexResult m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(re.match("ThisAndThat", m), true);
+ EXPECT_EQ(re.match("This.And.That", m), true);
+ EXPECT_EQ(re.match("This And That", m), false);
+ EXPECT_EQ(re.match("This..And..That", m), false);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OTHER)
+BENCHMARK_CASE(escaped_char_questionmark_benchmark_reference_stdcpp_regex_match)
+{
+ std::regex re("This\\.?And\\.?That");
+ std::cmatch m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(std::regex_match("ThisAndThat", m, re), true);
+ EXPECT_EQ(std::regex_match("This.And.That", m, re), true);
+ EXPECT_EQ(std::regex_match("This And That", m, re), false);
+ EXPECT_EQ(std::regex_match("This..And..That", m, re), false);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OUR)
+BENCHMARK_CASE(char_qualifier_asterisk_benchmark)
+{
+ Regex<PosixExtended> re("regex*");
+ RegexResult m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(re.search("#include <regex.h>", m), true);
+ EXPECT_EQ(re.search("#include <stdio.h>", m), false);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OTHER)
+BENCHMARK_CASE(char_qualifier_asterisk_benchmark_reference_stdcpp_regex_match)
+{
+ std::regex re("regex*");
+ std::cmatch m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(std::regex_search("#include <regex.h>", m, re), true);
+ EXPECT_EQ(std::regex_search("#include <stdio.h>", m, re), false);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OUR)
+BENCHMARK_CASE(parens_qualifier_questionmark_benchmark)
+{
+ Regex<PosixExtended> re("test(hello)?test");
+ RegexResult m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(re.match("testtest", m), true);
+ EXPECT_EQ(re.match("testhellotest", m), true);
+ EXPECT_EQ(re.match("testasfdtest", m), false);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OTHER)
+BENCHMARK_CASE(parens_qualifier_questionmark_benchmark_reference_stdcpp_regex_match)
+{
+ std::regex re("test(hello)?test");
+ std::cmatch m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(std::regex_match("testtest", m, re), true);
+ EXPECT_EQ(std::regex_match("testhellotest", m, re), true);
+ EXPECT_EQ(std::regex_match("testasfdtest", m, re), false);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OUR)
+BENCHMARK_CASE(parens_qualifier_asterisk_benchmark)
+{
+ Regex<PosixExtended> re("test(hello)*test");
+ RegexResult m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(re.match("testtest", m), true);
+ EXPECT_EQ(re.match("testhellohellotest", m), true);
+ EXPECT_EQ(re.search("testhellohellotest, testhellotest", m), true);
+ EXPECT_EQ(re.match("aaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbb", m), false);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OTHER)
+BENCHMARK_CASE(parens_qualifier_asterisk_benchmark_reference_stdcpp_regex_match)
+{
+ std::regex re("test(hello)*test");
+ std::cmatch m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(std::regex_match("testtest", m, re), true);
+ EXPECT_EQ(std::regex_match("testhellohellotest", m, re), true);
+ EXPECT_EQ(std::regex_search("testhellohellotest, testhellotest", m, re), true);
+ EXPECT_EQ(std::regex_match("aaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbb", m, re), false);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OUR)
+BENCHMARK_CASE(parens_qualifier_asterisk_2_benchmark)
+{
+ Regex<PosixExtended> re("test(.*)test");
+ RegexResult m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(re.match("testasdftest", m), true);
+ EXPECT_EQ(re.match("testasdfasdftest", m), true);
+ EXPECT_EQ(re.search("testaaaatest, testbbbtest, testtest", m), true);
+ EXPECT_EQ(re.match("aaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbb", m), false);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OTHER)
+BENCHMARK_CASE(parens_qualifier_asterisk_2_benchmark_reference_stdcpp_regex_match)
+{
+ std::regex re("test(.*)test");
+ std::cmatch m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(std::regex_match("testasdftest", m, re), true);
+ EXPECT_EQ(std::regex_match("testasdfasdftest", m, re), true);
+ EXPECT_EQ(std::regex_search("testaaaatest, testbbbtest, testtest", m, re), true);
+ EXPECT_EQ(std::regex_match("aaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbb", m, re), false);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OUR)
+BENCHMARK_CASE(multi_parens_qualifier_questionmark_benchmark)
+{
+ Regex<PosixExtended> re("test(a)?(b)?(c)?test");
+ RegexResult m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(re.match("testtest", m), true);
+ EXPECT_EQ(re.match("testabctest", m), true);
+ EXPECT_EQ(re.search("testabctest, testactest", m), true);
+ EXPECT_EQ(re.match("aaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbb", m), false);
+ EXPECT_EQ(re.match("test", m), false);
+ EXPECT_EQ(re.match("whaaaaat", m), false);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OTHER)
+BENCHMARK_CASE(multi_parens_qualifier_questionmark_benchmark_reference_stdcpp_regex_match)
+{
+ std::regex re("test(a)?(b)?(c)?test");
+ std::cmatch m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(std::regex_match("testtest", m, re), true);
+ EXPECT_EQ(std::regex_match("testabctest", m, re), true);
+ EXPECT_EQ(std::regex_search("testabctest, testactest", m, re), true);
+ EXPECT_EQ(std::regex_match("aaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbb", m, re), false);
+ EXPECT_EQ(std::regex_match("test", m, re), false);
+ EXPECT_EQ(std::regex_match("whaaaaat", m, re), false);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OUR)
+BENCHMARK_CASE(simple_alternative_benchmark)
+{
+ Regex<PosixExtended> re("test|hello|friends");
+ RegexResult m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(re.match("test", m), true);
+ EXPECT_EQ(re.match("hello", m), true);
+ EXPECT_EQ(re.match("friends", m), true);
+ EXPECT_EQ(re.match("whaaaaat", m), false);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OTHER)
+BENCHMARK_CASE(simple_alternative_benchmark_reference_stdcpp_regex_match)
+{
+ std::regex re("test|hello|friends");
+ std::cmatch m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(std::regex_match("test", m, re), true);
+ EXPECT_EQ(std::regex_match("hello", m, re), true);
+ EXPECT_EQ(std::regex_match("friends", m, re), true);
+ EXPECT_EQ(std::regex_match("whaaaaat", m, re), false);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OUR)
+BENCHMARK_CASE(alternative_match_groups_benchmark)
+{
+ Regex<PosixExtended> re("test(a)?(b)?|hello ?(dear|my)? friends");
+ RegexResult m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(re.match("test", m), true);
+ EXPECT_EQ(re.match("testa", m), true);
+ EXPECT_EQ(re.match("testb", m), true);
+ EXPECT_EQ(re.match("hello friends", m), true);
+ EXPECT_EQ(re.match("hello dear friends", m), true);
+ EXPECT_EQ(re.match("hello my friends", m), true);
+ EXPECT_EQ(re.match("testabc", m), false);
+ EXPECT_EQ(re.match("hello test friends", m), false);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OTHER)
+BENCHMARK_CASE(alternative_match_groups_benchmark_reference_stdcpp_regex_match)
+{
+ std::regex re("test(a)?(b)?|hello ?(dear|my)? friends");
+ std::cmatch m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(std::regex_match("test", m, re), true);
+ EXPECT_EQ(std::regex_match("testa", m, re), true);
+ EXPECT_EQ(std::regex_match("testb", m, re), true);
+ EXPECT_EQ(std::regex_match("hello friends", m, re), true);
+ EXPECT_EQ(std::regex_match("hello dear friends", m, re), true);
+ EXPECT_EQ(std::regex_match("hello my friends", m, re), true);
+ EXPECT_EQ(std::regex_match("testabc", m, re), false);
+ EXPECT_EQ(std::regex_match("hello test friends", m, re), false);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OUR)
+BENCHMARK_CASE(parens_qualifier_exact_benchmark)
+{
+ Regex<PosixExtended> re("(hello){3}");
+ RegexResult m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(re.match("hello", m), false);
+ EXPECT_EQ(re.match("hellohellohello", m), true);
+ EXPECT_EQ(re.search("hellohellohellohello", m), true);
+ EXPECT_EQ(re.search("test hellohellohello", m), true);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OTHER)
+BENCHMARK_CASE(parens_qualifier_exact_benchmark_reference_stdcpp_regex_match)
+{
+ std::regex re("(hello){3}");
+ std::cmatch m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(std::regex_match("hello", m, re), false);
+ EXPECT_EQ(std::regex_match("hellohellohello", m, re), true);
+ EXPECT_EQ(std::regex_search("hellohellohellohello", m, re), true);
+ EXPECT_EQ(std::regex_search("test hellohellohello", m, re), true);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OUR)
+BENCHMARK_CASE(parens_qualifier_minimum_benchmark)
+{
+ Regex<PosixExtended> re("(hello){3,}");
+ RegexResult m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(re.match("hello", m), false);
+ EXPECT_EQ(re.match("hellohellohello", m), true);
+ EXPECT_EQ(re.search("hellohellohellohello", m), true);
+ EXPECT_EQ(re.search("test hellohellohello", m), true);
+ EXPECT_EQ(re.search("test hellohellohellohello", m), true);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OTHER)
+BENCHMARK_CASE(parens_qualifier_minimum_benchmark_reference_stdcpp_regex_match)
+{
+ std::regex re("(hello){3,}");
+ std::cmatch m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(std::regex_match("hello", m, re), false);
+ EXPECT_EQ(std::regex_match("hellohellohello", m, re), true);
+ EXPECT_EQ(std::regex_search("hellohellohellohello", m, re), true);
+ EXPECT_EQ(std::regex_search("test hellohellohello", m, re), true);
+ EXPECT_EQ(std::regex_search("test hellohellohellohello", m, re), true);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OUR)
+BENCHMARK_CASE(parens_qualifier_maximum_benchmark)
+{
+ Regex<PosixExtended> re("(hello){2,3}");
+ RegexResult m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(re.match("hello", m), false);
+ EXPECT_EQ(re.match("hellohellohello", m), true);
+ EXPECT_EQ(re.search("hellohellohellohello", m), true);
+ EXPECT_EQ(re.search("test hellohellohello", m), true);
+ EXPECT_EQ(re.search("test hellohellohellohello", m), true);
+ EXPECT_EQ(re.match("test hellohellohellohello", m), false);
+ EXPECT_EQ(re.search("test hellohellohellohello", m), true);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OTHER)
+BENCHMARK_CASE(parens_qualifier_maximum_benchmark_reference_stdcpp_regex_match)
+{
+ std::regex re("(hello){2,3}");
+ std::cmatch m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(std::regex_match("hello", m, re), false);
+ EXPECT_EQ(std::regex_match("hellohellohello", m, re), true);
+ EXPECT_EQ(std::regex_search("hellohellohellohello", m, re), true);
+ EXPECT_EQ(std::regex_search("test hellohellohello", m, re), true);
+ EXPECT_EQ(std::regex_search("test hellohellohellohello", m, re), true);
+ EXPECT_EQ(std::regex_match("test hellohellohellohello", m, re), false);
+ EXPECT_EQ(std::regex_search("test hellohellohellohello", m, re), true);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OUR)
+BENCHMARK_CASE(char_qualifier_min_max_benchmark)
+{
+ Regex<PosixExtended> re("c{3,30}");
+ RegexResult m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(re.match("cc", m), false);
+ EXPECT_EQ(re.match("ccc", m), true);
+ EXPECT_EQ(re.match("cccccccccccccccccccccccccccccc", m), true);
+ EXPECT_EQ(re.match("ccccccccccccccccccccccccccccccc", m), false);
+ EXPECT_EQ(re.search("ccccccccccccccccccccccccccccccc", m), true);
+ EXPECT_EQ(re.match("cccccccccccccccccccccccccccccccc", m), false);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OTHER)
+BENCHMARK_CASE(char_qualifier_min_max_benchmark_reference_stdcpp_regex_match)
+{
+ std::regex re("c{3,30}");
+ std::cmatch m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(std::regex_match("cc", m, re), false);
+ EXPECT_EQ(std::regex_match("ccc", m, re), true);
+ EXPECT_EQ(std::regex_match("cccccccccccccccccccccccccccccc", m, re), true);
+ EXPECT_EQ(std::regex_match("ccccccccccccccccccccccccccccccc", m, re), false);
+ EXPECT_EQ(std::regex_search("ccccccccccccccccccccccccccccccc", m, re), true);
+ EXPECT_EQ(std::regex_match("cccccccccccccccccccccccccccccccc", m, re), false);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OUR)
+BENCHMARK_CASE(simple_bracket_chars_benchmark)
+{
+ Regex<PosixExtended> re("[abc]");
+ RegexResult m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(re.match("a", m), true);
+ EXPECT_EQ(re.match("b", m), true);
+ EXPECT_EQ(re.match("c", m), true);
+ EXPECT_EQ(re.match("d", m), false);
+ EXPECT_EQ(re.match("e", m), false);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OTHER)
+BENCHMARK_CASE(simple_bracket_chars_benchmark_reference_stdcpp_regex_match)
+{
+ std::regex re("[abc]");
+ std::cmatch m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(std::regex_match("a", m, re), true);
+ EXPECT_EQ(std::regex_match("b", m, re), true);
+ EXPECT_EQ(std::regex_match("c", m, re), true);
+ EXPECT_EQ(std::regex_match("d", m, re), false);
+ EXPECT_EQ(std::regex_match("e", m, re), false);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OUR)
+BENCHMARK_CASE(simple_bracket_chars_inverse_benchmark)
+{
+ Regex<PosixExtended> re("[^abc]");
+ RegexResult m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(re.match("a", m), false);
+ EXPECT_EQ(re.match("b", m), false);
+ EXPECT_EQ(re.match("c", m), false);
+ EXPECT_EQ(re.match("d", m), true);
+ EXPECT_EQ(re.match("e", m), true);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OTHER)
+BENCHMARK_CASE(simple_bracket_chars_inverse_benchmark_reference_stdcpp_regex_match)
+{
+ std::regex re("[^abc]");
+ std::cmatch m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(std::regex_match("a", m, re), false);
+ EXPECT_EQ(std::regex_match("b", m, re), false);
+ EXPECT_EQ(std::regex_match("c", m, re), false);
+ EXPECT_EQ(std::regex_match("d", m, re), true);
+ EXPECT_EQ(std::regex_match("e", m, re), true);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OUR)
+BENCHMARK_CASE(simple_bracket_chars_range_benchmark)
+{
+ Regex<PosixExtended> re("[a-d]");
+ RegexResult m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(re.match("a", m), true);
+ EXPECT_EQ(re.match("b", m), true);
+ EXPECT_EQ(re.match("c", m), true);
+ EXPECT_EQ(re.match("d", m), true);
+ EXPECT_EQ(re.match("e", m), false);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OTHER)
+BENCHMARK_CASE(simple_bracket_chars_range_benchmark_reference_stdcpp_regex_match)
+{
+ std::regex re("[a-d]");
+ std::cmatch m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(std::regex_match("a", m, re), true);
+ EXPECT_EQ(std::regex_match("b", m, re), true);
+ EXPECT_EQ(std::regex_match("c", m, re), true);
+ EXPECT_EQ(std::regex_match("d", m, re), true);
+ EXPECT_EQ(std::regex_match("e", m, re), false);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OUR)
+BENCHMARK_CASE(simple_bracket_chars_range_inverse_benchmark)
+{
+ Regex<PosixExtended> re("[^a-df-z]");
+ RegexResult m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(re.match("a", m), false);
+ EXPECT_EQ(re.match("b", m), false);
+ EXPECT_EQ(re.match("c", m), false);
+ EXPECT_EQ(re.match("d", m), false);
+ EXPECT_EQ(re.match("e", m), true);
+ EXPECT_EQ(re.match("k", m), false);
+ EXPECT_EQ(re.match("z", m), false);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OTHER)
+BENCHMARK_CASE(simple_bracket_chars_range_inverse_benchmark_reference_stdcpp_regex_match)
+{
+ std::regex re("[^a-df-z]");
+ std::cmatch m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(std::regex_match("a", m, re), false);
+ EXPECT_EQ(std::regex_match("b", m, re), false);
+ EXPECT_EQ(std::regex_match("c", m, re), false);
+ EXPECT_EQ(std::regex_match("d", m, re), false);
+ EXPECT_EQ(std::regex_match("e", m, re), true);
+ EXPECT_EQ(std::regex_match("k", m, re), false);
+ EXPECT_EQ(std::regex_match("z", m, re), false);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OUR)
+BENCHMARK_CASE(bracket_character_class_uuid_benchmark)
+{
+ Regex<PosixExtended> re("^([[:xdigit:]]{8})-([[:xdigit:]]{4})-([[:xdigit:]]{4})-([[:xdigit:]]{4})-([[:xdigit:]]{12})$");
+ RegexResult m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(re.match("fb9b62a2-1579-4e3a-afba-76239ccb6583", m), true);
+ EXPECT_EQ(re.match("fb9b62a2", m), false);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OTHER)
+BENCHMARK_CASE(bracket_character_class_uuid_benchmark_reference_stdcpp_regex_match)
+{
+ std::regex re("^([[:xdigit:]]{8})-([[:xdigit:]]{4})-([[:xdigit:]]{4})-([[:xdigit:]]{4})-([[:xdigit:]]{12})$");
+ std::cmatch m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(std::regex_match("fb9b62a2-1579-4e3a-afba-76239ccb6583", m, re), true);
+ EXPECT_EQ(std::regex_match("fb9b62a2", m, re), false);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OUR)
+BENCHMARK_CASE(simple_bracket_character_class_inverse_benchmark)
+{
+ Regex<PosixExtended> re("[^[:digit:]]");
+ RegexResult m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(re.match("1", m), false);
+ EXPECT_EQ(re.match("2", m), false);
+ EXPECT_EQ(re.match("3", m), false);
+ EXPECT_EQ(re.match("d", m), true);
+ EXPECT_EQ(re.match("e", m), true);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OTHER)
+BENCHMARK_CASE(simple_bracket_character_class_inverse_benchmark_reference_stdcpp_regex_match)
+{
+ std::regex re("[^[:digit:]]");
+ std::cmatch m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(std::regex_match("1", m, re), false);
+ EXPECT_EQ(std::regex_match("2", m, re), false);
+ EXPECT_EQ(std::regex_match("3", m, re), false);
+ EXPECT_EQ(std::regex_match("d", m, re), true);
+ EXPECT_EQ(std::regex_match("e", m, re), true);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OUR)
+BENCHMARK_CASE(email_address_benchmark)
+{
+ Regex<PosixExtended> re("^[A-Z0-9a-z._%+-]{1,64}@(?:[A-Za-z0-9-]{1,63}\\.){1,125}[A-Za-z]{2,63}$");
+ RegexResult m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(re.match("hello.world@domain.tld", m), true);
+ EXPECT_EQ(re.match("this.is.a.very_long_email_address@world.wide.web", m), true);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OTHER)
+BENCHMARK_CASE(email_address_benchmark_reference_stdcpp_regex_match)
+{
+ std::regex re("^[A-Z0-9a-z._%+-]{1,64}@(?:[A-Za-z0-9-]{1,63}\\.){1,125}[A-Za-z]{2,63}$");
+ std::cmatch m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(std::regex_match("hello.world@domain.tld", m, re), true);
+ EXPECT_EQ(std::regex_match("this.is.a.very_long_email_address@world.wide.web", m, re), true);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OUR)
+BENCHMARK_CASE(simple_ignorecase_benchmark)
+{
+ Regex<PosixExtended> re("^hello friends", PosixFlags::Insensitive);
+ RegexResult m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(re.match("Hello Friends", m), true);
+ EXPECT_EQ(re.match("hello Friends", m), true);
+
+ EXPECT_EQ(re.match("hello Friends!", m), false);
+ EXPECT_EQ(re.search("hello Friends", m), true);
+
+ EXPECT_EQ(re.match("hell Friends", m), false);
+ EXPECT_EQ(re.search("hell Friends", m), false);
+ }
+}
+#endif
+
+#if defined(REGEX_BENCHMARK_OTHER)
+BENCHMARK_CASE(simple_ignorecase_benchmark_reference_stdcpp_regex_match)
+{
+ std::regex re("^hello friends", std::regex_constants::icase);
+ std::cmatch m;
+ for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(std::regex_match("Hello Friends", m, re), true);
+ EXPECT_EQ(std::regex_match("hello Friends", m, re), true);
+
+ EXPECT_EQ(std::regex_match("hello Friends!", m, re), false);
+ EXPECT_EQ(std::regex_search("hello Friends", m, re), true);
+
+ EXPECT_EQ(std::regex_match("hell Friends", m, re), false);
+ EXPECT_EQ(std::regex_search("hell Friends", m, re), false);
+ }
+}
+#endif
+#endif
+
+TEST_MAIN(Regex)
diff --git a/Libraries/LibRegex/Tests/CMakeLists.txt b/Libraries/LibRegex/Tests/CMakeLists.txt
new file mode 100644
index 0000000000..a337f05153
--- /dev/null
+++ b/Libraries/LibRegex/Tests/CMakeLists.txt
@@ -0,0 +1,21 @@
+file(GLOB TEST_SOURCES CONFIGURE_DEPENDS "*.cpp")
+file(GLOB REGEX_SOURCES CONFIGURE_DEPENDS "../*.cpp")
+file(GLOB C_REGEX_SOURCES CONFIGURE_DEPENDS "../C/*.cpp")
+
+foreach(source ${TEST_SOURCES})
+ get_filename_component(name ${source} NAME_WE)
+ add_executable(${name} ${source} ${REGEX_SOURCES} ${C_REGEX_SOURCES})
+ target_link_libraries(${name} LagomCore)
+ add_test(
+ NAME ${name}
+ COMMAND ${name}
+ WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+ )
+
+ set_tests_properties(
+ ${name}
+ PROPERTIES
+ FAIL_REGULAR_EXPRESSION
+ "FAIL"
+ )
+endforeach()
diff --git a/Libraries/LibRegex/Tests/Regex.cpp b/Libraries/LibRegex/Tests/Regex.cpp
new file mode 100644
index 0000000000..8a5cb904b1
--- /dev/null
+++ b/Libraries/LibRegex/Tests/Regex.cpp
@@ -0,0 +1,451 @@
+/*
+ * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <AK/TestSuite.h> // import first, to prevent warning of ASSERT* redefinition
+
+#include <LibRegex/Regex.h>
+#include <LibRegex/RegexDebug.h>
+#include <AK/StringBuilder.h>
+#include <stdio.h>
+
+static ECMAScriptOptions match_test_api_options(const ECMAScriptOptions options)
+{
+ return options;
+}
+
+static PosixOptions match_test_api_options(const PosixOptions options)
+{
+ return options;
+}
+
+TEST_CASE(regex_options_ecmascript)
+{
+ ECMAScriptOptions eo;
+ eo |= ECMAScriptFlags::Global;
+
+ EXPECT(eo & ECMAScriptFlags::Global);
+ EXPECT(!(eo & ECMAScriptFlags::Insensitive));
+
+ eo = match_test_api_options(ECMAScriptFlags::Global | ECMAScriptFlags::Insensitive | ECMAScriptFlags::Sticky);
+ EXPECT(eo & ECMAScriptFlags::Global);
+ EXPECT(eo & ECMAScriptFlags::Insensitive);
+ EXPECT(eo & ECMAScriptFlags::Sticky);
+ EXPECT(!(eo & ECMAScriptFlags::Unicode));
+ EXPECT(!(eo & ECMAScriptFlags::Multiline));
+ EXPECT(!(eo & ECMAScriptFlags::SingleLine));
+
+ eo &= ECMAScriptFlags::Insensitive;
+ EXPECT(!(eo & ECMAScriptFlags::Global));
+ EXPECT(eo & ECMAScriptFlags::Insensitive);
+ EXPECT(!(eo & ECMAScriptFlags::Multiline));
+
+ eo &= ECMAScriptFlags::Sticky;
+ EXPECT(!(eo & ECMAScriptFlags::Global));
+ EXPECT(!(eo & ECMAScriptFlags::Insensitive));
+ EXPECT(!(eo & ECMAScriptFlags::Multiline));
+ EXPECT(!(eo & ECMAScriptFlags::Sticky));
+
+ eo = ~ECMAScriptFlags::Insensitive;
+ EXPECT(eo & ECMAScriptFlags::Global);
+ EXPECT(!(eo & ECMAScriptFlags::Insensitive));
+ EXPECT(eo & ECMAScriptFlags::Multiline);
+ EXPECT(eo & ECMAScriptFlags::Sticky);
+}
+
+TEST_CASE(regex_options_posix)
+{
+ PosixOptions eo;
+ eo |= PosixFlags::Global;
+
+ EXPECT(eo & PosixFlags::Global);
+ EXPECT(!(eo & PosixFlags::Insensitive));
+
+ eo = match_test_api_options(PosixFlags::Global | PosixFlags::Insensitive | PosixFlags::MatchNotBeginOfLine);
+ EXPECT(eo & PosixFlags::Global);
+ EXPECT(eo & PosixFlags::Insensitive);
+ EXPECT(eo & PosixFlags::MatchNotBeginOfLine);
+ EXPECT(!(eo & PosixFlags::Unicode));
+ EXPECT(!(eo & PosixFlags::Multiline));
+
+ eo &= PosixFlags::Insensitive;
+ EXPECT(!(eo & PosixFlags::Global));
+ EXPECT(eo & PosixFlags::Insensitive);
+ EXPECT(!(eo & PosixFlags::Multiline));
+
+ eo &= PosixFlags::MatchNotBeginOfLine;
+ EXPECT(!(eo & PosixFlags::Global));
+ EXPECT(!(eo & PosixFlags::Insensitive));
+ EXPECT(!(eo & PosixFlags::Multiline));
+
+ eo = ~PosixFlags::Insensitive;
+ EXPECT(eo & PosixFlags::Global);
+ EXPECT(!(eo & PosixFlags::Insensitive));
+ EXPECT(eo & PosixFlags::Multiline);
+}
+
+TEST_CASE(regex_lexer)
+{
+ Lexer l("/[.*+?^${}()|[\\]\\\\]/g");
+ EXPECT(l.next().type() == regex::TokenType::Slash);
+ EXPECT(l.next().type() == regex::TokenType::LeftBracket);
+ EXPECT(l.next().type() == regex::TokenType::Period);
+ EXPECT(l.next().type() == regex::TokenType::Asterisk);
+ EXPECT(l.next().type() == regex::TokenType::Plus);
+ EXPECT(l.next().type() == regex::TokenType::Questionmark);
+ EXPECT(l.next().type() == regex::TokenType::Circumflex);
+ EXPECT(l.next().type() == regex::TokenType::Dollar);
+ EXPECT(l.next().type() == regex::TokenType::LeftCurly);
+ EXPECT(l.next().type() == regex::TokenType::RightCurly);
+ EXPECT(l.next().type() == regex::TokenType::LeftParen);
+ EXPECT(l.next().type() == regex::TokenType::RightParen);
+ EXPECT(l.next().type() == regex::TokenType::Pipe);
+ EXPECT(l.next().type() == regex::TokenType::LeftBracket);
+ EXPECT(l.next().type() == regex::TokenType::EscapeSequence);
+ EXPECT(l.next().type() == regex::TokenType::EscapeSequence);
+ EXPECT(l.next().type() == regex::TokenType::RightBracket);
+ EXPECT(l.next().type() == regex::TokenType::Slash);
+ EXPECT(l.next().type() == regex::TokenType::Char);
+}
+
+TEST_CASE(parser_error_parens)
+{
+ String pattern = "test()test";
+ Lexer l(pattern);
+ PosixExtendedParser p(l);
+ p.parse();
+ EXPECT(p.has_error());
+ EXPECT(p.error() == Error::EmptySubExpression);
+}
+
+TEST_CASE(parser_error_special_characters_used_at_wrong_place)
+{
+ String pattern;
+ Vector<char, 5> chars = { '*', '+', '?', '{' };
+ StringBuilder b;
+
+ Lexer l;
+ PosixExtended p(l);
+
+ for (auto& ch : chars) {
+ // First in ere
+ b.clear();
+ b.append(ch);
+ pattern = b.build();
+ l.set_source(pattern);
+ p.parse();
+ EXPECT(p.has_error());
+ EXPECT(p.error() == Error::InvalidRepetitionMarker);
+
+ // After vertical line
+ b.clear();
+ b.append("a|");
+ b.append(ch);
+ pattern = b.build();
+ l.set_source(pattern);
+ p.parse();
+ EXPECT(p.has_error());
+ EXPECT(p.error() == Error::InvalidRepetitionMarker);
+
+ // After circumflex
+ b.clear();
+ b.append("^");
+ b.append(ch);
+ pattern = b.build();
+ l.set_source(pattern);
+ p.parse();
+ EXPECT(p.has_error());
+ EXPECT(p.error() == Error::InvalidRepetitionMarker);
+
+ // After dollar
+ b.clear();
+ b.append("$");
+ b.append(ch);
+ pattern = b.build();
+ l.set_source(pattern);
+ p.parse();
+ EXPECT(p.has_error());
+ EXPECT(p.error() == Error::InvalidRepetitionMarker);
+
+ // After left parens
+ b.clear();
+ b.append("(");
+ b.append(ch);
+ b.append(")");
+ pattern = b.build();
+ l.set_source(pattern);
+ p.parse();
+ EXPECT(p.has_error());
+ EXPECT(p.error() == Error::InvalidRepetitionMarker);
+ }
+}
+
+TEST_CASE(parser_error_vertical_line_used_at_wrong_place)
+{
+ Lexer l;
+ PosixExtended p(l);
+
+ // First in ere
+ l.set_source("|asdf");
+ p.parse();
+ EXPECT(p.has_error());
+ EXPECT(p.error() == Error::EmptySubExpression);
+
+ // Last in ere
+ l.set_source("asdf|");
+ p.parse();
+ EXPECT(p.has_error());
+ EXPECT(p.error() == Error::EmptySubExpression);
+
+ // After left parens
+ l.set_source("(|asdf)");
+ p.parse();
+ EXPECT(p.has_error());
+ EXPECT(p.error() == Error::EmptySubExpression);
+
+ // Proceed right parens
+ l.set_source("(asdf)|");
+ p.parse();
+ EXPECT(p.has_error());
+ EXPECT(p.error() == Error::EmptySubExpression);
+}
+
+TEST_CASE(catch_all_first)
+{
+ Regex<PosixExtended> re("^.*$");
+ RegexResult m;
+ re.match("Hello World", m);
+ EXPECT(m.count == 1);
+ EXPECT(re.match("Hello World", m));
+}
+
+TEST_CASE(catch_all)
+{
+ Regex<PosixExtended> re("^.*$", PosixFlags::Global);
+
+ EXPECT(re.has_match("Hello World"));
+ EXPECT(re.match("Hello World").success);
+ EXPECT(re.match("Hello World").count == 1);
+
+ EXPECT(has_match("Hello World", re));
+ auto res = match("Hello World", re);
+ EXPECT(res.success);
+ EXPECT(res.count == 1);
+ EXPECT(res.matches.size() == 1);
+ EXPECT(res.matches.first().view == "Hello World");
+}
+
+TEST_CASE(catch_all_again)
+{
+ Regex<PosixExtended> re("^.*$", PosixFlags::Extra);
+ EXPECT_EQ(has_match("Hello World", re), true);
+}
+
+TEST_CASE(char_utf8)
+{
+ Regex<PosixExtended> re("😀");
+ RegexResult result;
+
+ EXPECT_EQ((result = match("Привет, мир! 😀 γειά σου κόσμος 😀 こんにちは世界", re, PosixFlags::Global)).success, true);
+ EXPECT_EQ(result.count, 2u);
+}
+
+TEST_CASE(catch_all_newline)
+{
+ Regex<PosixExtended> re("^.*$", PosixFlags::Multiline | PosixFlags::StringCopyMatches);
+ RegexResult result;
+ auto lambda = [&result, &re]() {
+ String aaa = "Hello World\nTest\n1234\n";
+ result = match(aaa, re);
+ EXPECT_EQ(result.success, true);
+ };
+ lambda();
+ EXPECT_EQ(result.count, 3u);
+ EXPECT_EQ(result.matches.at(0).view, "Hello World");
+ EXPECT_EQ(result.matches.at(1).view, "Test");
+ EXPECT_EQ(result.matches.at(2).view, "1234");
+}
+
+TEST_CASE(catch_all_newline_view)
+{
+ Regex<PosixExtended> re("^.*$", PosixFlags::Multiline);
+ RegexResult result;
+
+ String aaa = "Hello World\nTest\n1234\n";
+ result = match(aaa, re);
+ EXPECT_EQ(result.success, true);
+ EXPECT_EQ(result.count, 3u);
+ String str = "Hello World";
+ EXPECT_EQ(result.matches.at(0).view, str.view());
+ EXPECT_EQ(result.matches.at(1).view, "Test");
+ EXPECT_EQ(result.matches.at(2).view, "1234");
+}
+
+TEST_CASE(catch_all_newline_2)
+{
+ Regex<PosixExtended> re("^.*$");
+ RegexResult result;
+ result = match("Hello World\nTest\n1234\n", re, PosixFlags::Multiline | PosixFlags::StringCopyMatches);
+ EXPECT_EQ(result.success, true);
+ EXPECT_EQ(result.count, 3u);
+ EXPECT_EQ(result.matches.at(0).view, "Hello World");
+ EXPECT_EQ(result.matches.at(1).view, "Test");
+ EXPECT_EQ(result.matches.at(2).view, "1234");
+
+ result = match("Hello World\nTest\n1234\n", re);
+ EXPECT_EQ(result.success, true);
+ EXPECT_EQ(result.count, 1u);
+ EXPECT_EQ(result.matches.at(0).view, "Hello World\nTest\n1234\n");
+}
+
+TEST_CASE(match_all_character_class)
+{
+ Regex<PosixExtended> re("[[:alpha:]]");
+ String str = "[Window]\nOpacity=255\nAudibleBeep=0\n";
+ RegexResult result = match(str, re, PosixFlags::Global | PosixFlags::StringCopyMatches);
+
+ EXPECT_EQ(result.success, true);
+ EXPECT_EQ(result.count, 24u);
+ EXPECT_EQ(result.matches.at(0).view, "W");
+ EXPECT_EQ(result.matches.at(1).view, "i");
+ EXPECT_EQ(result.matches.at(2).view, "n");
+ EXPECT(&result.matches.at(0).view.characters_without_null_termination()[0] != &str.view().characters_without_null_termination()[1]);
+}
+
+TEST_CASE(example_for_git_commit)
+{
+ Regex<PosixExtended> re("^.*$");
+ auto result = re.match("Well, hello friends!\nHello World!");
+
+ EXPECT(result.success);
+ EXPECT(result.count == 1);
+ EXPECT(result.matches.at(0).view.starts_with("Well"));
+ EXPECT(result.matches.at(0).view.length() == 33);
+
+ EXPECT(re.has_match("Well,...."));
+
+ result = re.match("Well, hello friends!\nHello World!", PosixFlags::Multiline);
+
+ EXPECT(result.success);
+ EXPECT(result.count == 2);
+ EXPECT(result.matches.at(0).view == "Well, hello friends!");
+ EXPECT(result.matches.at(1).view == "Hello World!");
+}
+
+TEST_CASE(email_address)
+{
+ Regex<PosixExtended> re("^[A-Z0-9a-z._%+-]{1,64}@([A-Za-z0-9-]{1,63}\\.){1,125}[A-Za-z]{2,63}$");
+ EXPECT(re.has_match("hello.world@domain.tld"));
+ EXPECT(re.has_match("this.is.a.very_long_email_address@world.wide.web"));
+}
+
+TEST_CASE(ini_file_entries)
+{
+ Regex<PosixExtended> re("[[:alpha:]]*=([[:digit:]]*)|\\[(.*)\\]");
+ RegexResult result;
+
+#ifdef REGEX_DEBUG
+ RegexDebug regex_dbg(stderr);
+ regex_dbg.print_raw_bytecode(re);
+ regex_dbg.print_header();
+ regex_dbg.print_bytecode(re);
+#endif
+
+ String haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n";
+ EXPECT_EQ(re.search(haystack.view(), result, PosixFlags::Multiline), true);
+ EXPECT_EQ(result.count, 3u);
+
+#ifdef REGEX_DEBUG
+ for (auto& v : result.matches)
+ fprintf(stderr, "%s\n", v.view.to_string().characters());
+#endif
+
+ EXPECT_EQ(result.matches.at(0).view, "[Window]");
+ EXPECT_EQ(result.capture_group_matches.at(0).at(1).view, "Window");
+ EXPECT_EQ(result.matches.at(1).view, "Opacity=255");
+ EXPECT_EQ(result.matches.at(1).line, 1u);
+ EXPECT_EQ(result.matches.at(1).column, 0u);
+ EXPECT_EQ(result.capture_group_matches.at(1).at(0).view, "255");
+ EXPECT_EQ(result.capture_group_matches.at(1).at(0).line, 1u);
+ EXPECT_EQ(result.capture_group_matches.at(1).at(0).column, 8u);
+ EXPECT_EQ(result.matches.at(2).view, "AudibleBeep=0");
+ EXPECT_EQ(result.capture_group_matches.at(2).at(0).view, "0");
+ EXPECT_EQ(result.capture_group_matches.at(2).at(0).line, 2u);
+ EXPECT_EQ(result.capture_group_matches.at(2).at(0).column, 12u);
+}
+
+TEST_CASE(named_capture_group)
+{
+ Regex<PosixExtended> re("[[:alpha:]]*=(?<Test>[[:digit:]]*)");
+ RegexResult result;
+
+#ifdef REGEX_DEBUG
+ RegexDebug regex_dbg(stderr);
+ regex_dbg.print_raw_bytecode(re);
+ regex_dbg.print_header();
+ regex_dbg.print_bytecode(re);
+#endif
+
+ String haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n";
+ EXPECT_EQ(re.search(haystack, result, PosixFlags::Multiline), true);
+ EXPECT_EQ(result.count, 2u);
+ EXPECT_EQ(result.matches.at(0).view, "Opacity=255");
+ EXPECT_EQ(result.named_capture_group_matches.at(0).ensure("Test").view, "255");
+ EXPECT_EQ(result.matches.at(1).view, "AudibleBeep=0");
+ EXPECT_EQ(result.named_capture_group_matches.at(1).ensure("Test").view, "0");
+}
+
+TEST_CASE(a_star)
+{
+ Regex<PosixExtended> re("a*");
+ RegexResult result;
+
+#ifdef REGEX_DEBUG
+ RegexDebug regex_dbg(stderr);
+ regex_dbg.print_raw_bytecode(re);
+ regex_dbg.print_header();
+ regex_dbg.print_bytecode(re);
+#endif
+
+ String haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n";
+ EXPECT_EQ(re.search(haystack.view(), result, PosixFlags::Multiline), true);
+ EXPECT_EQ(result.count, 32u);
+ EXPECT_EQ(result.matches.at(0).view.length(), 0u);
+ EXPECT_EQ(result.matches.at(10).view.length(), 1u);
+ EXPECT_EQ(result.matches.at(10).view, "a");
+ EXPECT_EQ(result.matches.at(31).view.length(), 0u);
+}
+
+TEST_CASE(simple_period_end_benchmark)
+{
+ Regex<PosixExtended> re("hello.$");
+ RegexResult m;
+ EXPECT_EQ(re.search("Hello1", m), false);
+ EXPECT_EQ(re.search("hello1hello1", m), true);
+ EXPECT_EQ(re.search("hello2hell", m), false);
+ EXPECT_EQ(re.search("hello?", m), true);
+}
+
+TEST_MAIN(Regex)
diff --git a/Libraries/LibRegex/Tests/RegexLibC.cpp b/Libraries/LibRegex/Tests/RegexLibC.cpp
new file mode 100644
index 0000000000..40e0303159
--- /dev/null
+++ b/Libraries/LibRegex/Tests/RegexLibC.cpp
@@ -0,0 +1,1211 @@
+/*
+ * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <AK/TestSuite.h>
+
+#include <AK/StringBuilder.h>
+#include <LibC/regex.h>
+#include <stdio.h>
+
+#define BENCHMARK_LOOP_ITERATIONS 100000
+#define DISABLE_REGEX_BENCHMARK
+
+//#if not(defined(REGEX_DEBUG) || defined(REGEX_MATCH_STATUS) || defined(DISABLE_REGEX_BENCHMARK))
+# include <regex>
+//#endif
+
+TEST_CASE(catch_all)
+{
+ String pattern = "^.*$";
+ regex_t regex;
+
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "Hello World", 0, NULL, 0), REG_NOERR);
+
+ regfree(&regex);
+}
+
+TEST_CASE(simple_start)
+{
+ String pattern = "^hello friends";
+ regex_t regex;
+
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "Hello!", 0, NULL, 0), REG_NOMATCH);
+ EXPECT_EQ(regexec(&regex, "hello friends", 0, NULL, 0), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "Well, hello friends", 0, NULL, 0), REG_NOMATCH);
+
+ regfree(&regex);
+}
+
+TEST_CASE(simple_end)
+{
+ String pattern = ".*hello\\.\\.\\. there$";
+ regex_t regex;
+
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "Hallo", 0, NULL, 0), REG_NOMATCH);
+ EXPECT_EQ(regexec(&regex, "I said fyhello... there", 0, NULL, 0), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "ahello... therea", 0, NULL, 0), REG_NOMATCH);
+ EXPECT_EQ(regexec(&regex, "hello.. there", 0, NULL, 0), REG_NOMATCH);
+
+ regfree(&regex);
+}
+
+TEST_CASE(simple_period)
+{
+ String pattern = "hello.";
+ regex_t regex;
+
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "Hello1", 0, NULL, 0), REG_NOMATCH);
+ EXPECT_EQ(regexec(&regex, "hello1", 0, NULL, 0), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "hello2", 0, NULL, 0), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "hello?", 0, NULL, 0), REG_NOERR);
+
+ regfree(&regex);
+}
+
+TEST_CASE(simple_period_end)
+{
+ String pattern = "hello.$";
+ regex_t regex;
+
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED | REG_NOSUB), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "Hello1", 0, NULL, REG_NOSUB), REG_NOMATCH);
+ EXPECT_EQ(regexec(&regex, "hello1hello1", 0, NULL, REG_GLOBAL), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "hello2hell", 0, NULL, REG_GLOBAL), REG_NOMATCH);
+ EXPECT_EQ(regexec(&regex, "hello?", 0, NULL, REG_NOSUB), REG_NOERR);
+
+ regfree(&regex);
+}
+
+TEST_CASE(simple_escaped)
+{
+ String pattern = "hello\\.";
+ regex_t regex;
+
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "hello", 0, NULL, 0), REG_NOMATCH);
+ EXPECT_EQ(regexec(&regex, "hello.", 0, NULL, 0), REG_NOERR);
+
+ regfree(&regex);
+}
+
+TEST_CASE(simple_period2_end)
+{
+ String pattern = ".*hi... there$";
+ regex_t regex;
+
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "Hello there", 0, NULL, REG_GLOBAL), REG_NOMATCH);
+ EXPECT_EQ(regexec(&regex, "I said fyhi... there", 0, NULL, REG_GLOBAL), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "....hi... ", 0, NULL, REG_GLOBAL), REG_NOMATCH);
+ EXPECT_EQ(regexec(&regex, "I said fyhihii there", 0, NULL, REG_GLOBAL), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "I said fyhihi there", 0, NULL, REG_GLOBAL), REG_NOMATCH);
+
+ regfree(&regex);
+}
+
+TEST_CASE(simple_plus)
+{
+ String pattern = "a+";
+ regex_t regex;
+
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED | REG_NOSUB), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "b", 0, NULL, REG_NOSUB), REG_NOMATCH);
+ EXPECT_EQ(regexec(&regex, "a", 0, NULL, REG_NOSUB), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "aaaaaabbbbb", 0, NULL, REG_GLOBAL), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "aaaaaaaaaaa", 0, NULL, REG_GLOBAL), REG_NOERR);
+
+ regfree(&regex);
+}
+
+TEST_CASE(simple_questionmark)
+{
+ String pattern = "da?d";
+ regex_t regex;
+
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "a", 0, NULL, REG_GLOBAL), REG_NOMATCH);
+ EXPECT_EQ(regexec(&regex, "daa", 0, NULL, REG_GLOBAL), REG_NOMATCH);
+ EXPECT_EQ(regexec(&regex, "ddddd", 0, NULL, REG_GLOBAL), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "dd", 0, NULL, REG_GLOBAL), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "dad", 0, NULL, REG_GLOBAL), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "dada", 0, NULL, REG_GLOBAL), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "adadaa", 0, NULL, REG_GLOBAL), REG_NOERR);
+
+ regfree(&regex);
+}
+
+TEST_CASE(simple_questionmark_matchall)
+{
+ String pattern = "da?d";
+ regex_t regex;
+ static constexpr int num_matches { 5 };
+ regmatch_t matches[num_matches];
+
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "a", num_matches, matches, REG_GLOBAL), REG_NOMATCH);
+ EXPECT_EQ(matches[0].rm_cnt, 0u);
+ EXPECT_EQ(regexec(&regex, "daa", num_matches, matches, REG_GLOBAL), REG_NOMATCH);
+ EXPECT_EQ(matches[0].rm_cnt, 0u);
+
+ EXPECT_EQ(regexec(&regex, "ddddd", num_matches, matches, REG_GLOBAL), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 2u);
+
+ EXPECT_EQ(matches[0].rm_so, 0u);
+ EXPECT_EQ(matches[0].rm_eo, 2u);
+ EXPECT_EQ(matches[1].rm_so, 2u);
+ EXPECT_EQ(matches[1].rm_eo, 4u);
+
+ EXPECT_EQ(regexec(&regex, "dd", num_matches, matches, REG_GLOBAL), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 1u);
+ EXPECT_EQ(regexec(&regex, "dad", num_matches, matches, REG_GLOBAL), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 1u);
+ EXPECT_EQ(regexec(&regex, "dada", num_matches, matches, REG_GLOBAL), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 1u);
+ EXPECT_EQ(regexec(&regex, "adadaa", num_matches, matches, REG_GLOBAL), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 1u);
+
+ regfree(&regex);
+}
+
+TEST_CASE(character_class)
+{
+ String pattern = "[[:alpha:]]";
+ regex_t regex;
+ static constexpr int num_matches { 5 };
+ regmatch_t matches[num_matches];
+
+ String haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n";
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, haystack.characters(), num_matches, matches, 0), REG_NOMATCH);
+ EXPECT_EQ(matches[0].rm_cnt, 0u);
+ EXPECT_EQ(regexec(&regex, haystack.characters(), num_matches, matches, REG_GLOBAL), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 24u);
+ EXPECT_EQ(haystack.substring_view(matches[0].rm_so, matches[0].rm_eo - matches[0].rm_so), "W");
+ EXPECT_EQ(haystack.substring_view(matches[1].rm_so, matches[1].rm_eo - matches[1].rm_so), "i");
+
+ regfree(&regex);
+}
+
+TEST_CASE(character_class2)
+{
+ String pattern = "[[:alpha:]]*=([[:digit:]]*)|\\[(.*)\\]";
+ regex_t regex;
+ static constexpr int num_matches { 9 };
+ regmatch_t matches[num_matches];
+
+ String haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n";
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED | REG_NEWLINE), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, haystack.characters(), num_matches, matches, 0), REG_NOERR);
+
+ EXPECT_EQ(matches[0].rm_cnt, 3u);
+#if 0
+ for (int i = 0; i < num_matches; ++i) {
+ fprintf(stderr, "Matches[%i].rm_so: %li, .rm_eo: %li .rm_cnt: %li: ", i, matches[i].rm_so, matches[i].rm_eo, matches[i].rm_cnt);
+ fprintf(stderr, "haystack length: %lu\n", haystack.length());
+ if (matches[i].rm_so != -1)
+ fprintf(stderr, "%s\n", haystack.substring_view(matches[i].rm_so, matches[i].rm_eo - matches[i].rm_so).to_string().characters());
+ }
+#endif
+
+ EXPECT_EQ(haystack.substring_view(matches[0].rm_so, matches[0].rm_eo - matches[0].rm_so), "[Window]");
+
+ EXPECT_EQ(matches[1].rm_so, -1);
+ EXPECT_EQ(matches[1].rm_eo, -1);
+ EXPECT_EQ(matches[1].rm_cnt, 0);
+
+ EXPECT_EQ(haystack.substring_view(matches[2].rm_so, matches[2].rm_eo - matches[2].rm_so), "Window");
+ EXPECT_EQ(haystack.substring_view(matches[3].rm_so, matches[3].rm_eo - matches[3].rm_so), "Opacity=255");
+
+ EXPECT_EQ(haystack.substring_view(matches[4].rm_so, matches[4].rm_eo - matches[4].rm_so), "255");
+
+ EXPECT_EQ(matches[5].rm_so, -1);
+ EXPECT_EQ(matches[5].rm_eo, -1);
+ EXPECT_EQ(matches[5].rm_cnt, 0);
+
+ EXPECT_EQ(haystack.substring_view(matches[6].rm_so, matches[6].rm_eo - matches[6].rm_so), "AudibleBeep=0");
+ EXPECT_EQ(haystack.substring_view(matches[7].rm_so, matches[7].rm_eo - matches[7].rm_so), "0");
+
+ EXPECT_EQ(matches[8].rm_so, -1);
+ EXPECT_EQ(matches[8].rm_eo, -1);
+ EXPECT_EQ(matches[8].rm_cnt, 0);
+
+ regfree(&regex);
+}
+
+TEST_CASE(escaped_char_questionmark)
+{
+ String pattern = "This\\.?And\\.?That";
+ regex_t regex;
+
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "ThisAndThat", 0, NULL, 0), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "This.And.That", 0, NULL, 0), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "This And That", 0, NULL, 0), REG_NOMATCH);
+ EXPECT_EQ(regexec(&regex, "This..And..That", 0, NULL, 0), REG_NOMATCH);
+
+ regfree(&regex);
+}
+
+TEST_CASE(char_qualifier_asterisk)
+{
+ String pattern = "regex*";
+ regex_t regex;
+ static constexpr int num_matches { 5 };
+ regmatch_t matches[num_matches];
+
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "#include <regex.h>", num_matches, matches, REG_GLOBAL), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 1u);
+
+ regfree(&regex);
+}
+
+TEST_CASE(char_utf8)
+{
+ String pattern = "😀";
+ regex_t regex;
+ static constexpr int num_matches { 5 };
+ regmatch_t matches[num_matches];
+
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "Привет, мир! 😀 γειά σου κόσμος 😀 こんにちは世界", num_matches, matches, REG_GLOBAL), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 2u);
+
+ regfree(&regex);
+}
+
+TEST_CASE(parens)
+{
+ String pattern = "test(hello)test";
+ regex_t regex;
+ static constexpr int num_matches { 5 };
+ regmatch_t matches[num_matches];
+
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), REG_NOERR);
+
+ EXPECT_EQ(regexec(&regex, "testhellotest", num_matches, matches, 0), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 1u);
+
+ EXPECT_EQ(matches[0].rm_so, 0u);
+ EXPECT_EQ(matches[0].rm_eo, 13u);
+ EXPECT_EQ(matches[1].rm_so, 4u);
+ EXPECT_EQ(matches[1].rm_eo, 9u);
+
+ regfree(&regex);
+}
+
+TEST_CASE(parser_error_parens)
+{
+ String pattern = "test()test";
+ regex_t regex;
+ static constexpr int num_matches { 5 };
+ regmatch_t matches[num_matches];
+
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), REG_EMPTY_EXPR);
+ EXPECT_EQ(regexec(&regex, "testhellotest", num_matches, matches, 0), REG_EMPTY_EXPR);
+
+ regfree(&regex);
+}
+
+TEST_CASE(parser_error_special_characters_used_at_wrong_place)
+{
+ String pattern;
+ regex_t regex;
+ static constexpr int num_matches { 5 };
+ regmatch_t matches[num_matches];
+
+ Vector<char, 4> chars = { '*', '+', '?', '}' };
+ StringBuilder b;
+
+ for (auto& ch : chars) {
+
+ auto error_code_to_check = ch == '}' ? REG_EBRACE : REG_BADRPT;
+
+ // First in ere
+ b.clear();
+ b.append(ch);
+ pattern = b.build();
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), error_code_to_check);
+ EXPECT_EQ(regexec(&regex, "test", num_matches, matches, 0), error_code_to_check);
+
+ // After vertical line
+ b.clear();
+ b.append("a|");
+ b.append(ch);
+ pattern = b.build();
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), error_code_to_check);
+ EXPECT_EQ(regexec(&regex, "test", num_matches, matches, 0), error_code_to_check);
+
+ // After circumflex
+ b.clear();
+ b.append("^");
+ b.append(ch);
+ pattern = b.build();
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), error_code_to_check);
+ EXPECT_EQ(regexec(&regex, "test", num_matches, matches, 0), error_code_to_check);
+
+ // After dollar
+ b.clear();
+ b.append("$");
+ b.append(ch);
+ pattern = b.build();
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), error_code_to_check);
+ EXPECT_EQ(regexec(&regex, "test", num_matches, matches, 0), error_code_to_check);
+
+ // After left parens
+ b.clear();
+ b.append("(");
+ b.append(ch);
+ b.append(")");
+ pattern = b.build();
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), error_code_to_check);
+ EXPECT_EQ(regexec(&regex, "test", num_matches, matches, 0), error_code_to_check);
+ }
+
+ regfree(&regex);
+}
+
+TEST_CASE(parser_error_vertical_line_used_at_wrong_place)
+{
+ String pattern;
+ regex_t regex;
+ static constexpr int num_matches { 5 };
+ regmatch_t matches[num_matches];
+
+ // First in ere
+ pattern = "|asdf";
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), REG_EMPTY_EXPR);
+ EXPECT_EQ(regexec(&regex, "test", num_matches, matches, 0), REG_EMPTY_EXPR);
+
+ // Last in ere
+ pattern = "asdf|";
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), REG_EMPTY_EXPR);
+ EXPECT_EQ(regexec(&regex, "test", num_matches, matches, 0), REG_EMPTY_EXPR);
+
+ // After left parens
+ pattern = "(|asdf)";
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), REG_EMPTY_EXPR);
+ EXPECT_EQ(regexec(&regex, "test", num_matches, matches, 0), REG_EMPTY_EXPR);
+
+ // Proceed right parens
+ pattern = "(asdf)|";
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), REG_EMPTY_EXPR);
+ EXPECT_EQ(regexec(&regex, "test", num_matches, matches, 0), REG_EMPTY_EXPR);
+
+ regfree(&regex);
+}
+
+TEST_CASE(parens_qualifier_questionmark)
+{
+ String pattern = "test(hello)?test";
+ regex_t regex;
+ static constexpr int num_matches { 5 };
+ regmatch_t matches[num_matches];
+ const char* match_str;
+
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), REG_NOERR);
+
+ match_str = "testtest";
+ EXPECT_EQ(regexec(&regex, match_str, num_matches, matches, 0), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 1u);
+ EXPECT_EQ(matches[0].rm_so, 0u);
+ EXPECT_EQ(matches[0].rm_eo, 8u);
+ EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "testtest");
+
+ match_str = "testhellotest";
+ EXPECT_EQ(regexec(&regex, match_str, num_matches, matches, 0), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 1u);
+ EXPECT_EQ(matches[0].rm_so, 0u);
+ EXPECT_EQ(matches[0].rm_eo, 13u);
+ EXPECT_EQ(matches[1].rm_so, 4u);
+ EXPECT_EQ(matches[1].rm_eo, 9u);
+ EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "testhellotest");
+ EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "hello");
+
+ regfree(&regex);
+}
+
+TEST_CASE(parens_qualifier_asterisk)
+{
+ String pattern = "test(hello)*test";
+ regex_t regex;
+ static constexpr int num_matches { 6 };
+ regmatch_t matches[num_matches];
+ const char* match_str;
+
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), REG_NOERR);
+
+ match_str = "testtest";
+ EXPECT_EQ(regexec(&regex, match_str, num_matches, matches, 0), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 1u);
+ EXPECT_EQ(matches[0].rm_so, 0u);
+ EXPECT_EQ(matches[0].rm_eo, 8u);
+ EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "testtest");
+
+ match_str = "testhellohellotest";
+ EXPECT_EQ(regexec(&regex, match_str, num_matches, matches, 0), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 1u);
+ EXPECT_EQ(matches[0].rm_so, 0u);
+ EXPECT_EQ(matches[0].rm_eo, 18u);
+ EXPECT_EQ(matches[1].rm_so, 9u);
+ EXPECT_EQ(matches[1].rm_eo, 14u);
+ EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "testhellohellotest");
+ EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "hello");
+
+ match_str = "testhellohellotest, testhellotest";
+ EXPECT_EQ(regexec(&regex, match_str, num_matches, matches, REG_GLOBAL), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 2u);
+ EXPECT_EQ(matches[0].rm_so, 0u);
+ EXPECT_EQ(matches[0].rm_eo, 18u);
+ EXPECT_EQ(matches[1].rm_so, 9u);
+ EXPECT_EQ(matches[1].rm_eo, 14u);
+ EXPECT_EQ(matches[2].rm_so, 20u);
+ EXPECT_EQ(matches[2].rm_eo, 33u);
+ EXPECT_EQ(matches[3].rm_so, 24u);
+ EXPECT_EQ(matches[3].rm_eo, 29u);
+ EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "testhellohellotest");
+ EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "hello");
+ EXPECT_EQ(StringView(&match_str[matches[2].rm_so], matches[2].rm_eo - matches[2].rm_so), "testhellotest");
+ EXPECT_EQ(StringView(&match_str[matches[3].rm_so], matches[3].rm_eo - matches[3].rm_so), "hello");
+
+ regfree(&regex);
+}
+
+TEST_CASE(parens_qualifier_asterisk_2)
+{
+ String pattern = "test(.*)test";
+ regex_t regex;
+ static constexpr int num_matches { 6 };
+ regmatch_t matches[num_matches];
+ const char* match_str;
+
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), REG_NOERR);
+
+ match_str = "testasdftest";
+ EXPECT_EQ(regexec(&regex, match_str, num_matches, matches, 0), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 1u);
+ EXPECT_EQ(matches[0].rm_so, 0u);
+ EXPECT_EQ(matches[0].rm_eo, 12u);
+ EXPECT_EQ(matches[1].rm_so, 4u);
+ EXPECT_EQ(matches[1].rm_eo, 8u);
+ EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "testasdftest");
+ EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "asdf");
+
+ match_str = "testasdfasdftest";
+ EXPECT_EQ(regexec(&regex, match_str, num_matches, matches, 0), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 1u);
+ EXPECT_EQ(matches[0].rm_so, 0u);
+ EXPECT_EQ(matches[0].rm_eo, 16u);
+ EXPECT_EQ(matches[1].rm_so, 4u);
+ EXPECT_EQ(matches[1].rm_eo, 12u);
+ EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "testasdfasdftest");
+ EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "asdfasdf");
+
+ match_str = "testaaaatest, testbbbtest, testtest";
+ EXPECT_EQ(regexec(&regex, match_str, num_matches, matches, REG_GLOBAL), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 1u);
+ EXPECT_EQ(matches[0].rm_so, 0u);
+ EXPECT_EQ(matches[0].rm_eo, 35u);
+ EXPECT_EQ(matches[1].rm_so, 4u);
+ EXPECT_EQ(matches[1].rm_eo, 31u);
+
+ EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "testaaaatest, testbbbtest, testtest");
+ EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "aaaatest, testbbbtest, test");
+
+ regfree(&regex);
+}
+
+TEST_CASE(mulit_parens_qualifier_too_less_result_values)
+{
+ String pattern = "test(a)?(b)?(c)?test";
+ regex_t regex;
+ static constexpr int num_matches { 4 };
+ regmatch_t matches[num_matches];
+ const char* match_str;
+
+ matches[3] = { -2, -2, 100 };
+
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), REG_NOERR);
+
+ match_str = "testabtest";
+ EXPECT_EQ(regexec(&regex, match_str, num_matches - 1, matches, 0), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 1u);
+ EXPECT_EQ(matches[0].rm_so, 0u);
+ EXPECT_EQ(matches[0].rm_eo, 10u);
+ EXPECT_EQ(matches[1].rm_so, 4u);
+ EXPECT_EQ(matches[1].rm_eo, 5u);
+ EXPECT_EQ(matches[2].rm_so, 5u);
+ EXPECT_EQ(matches[2].rm_eo, 6u);
+ EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "testabtest");
+ EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "a");
+ EXPECT_EQ(StringView(&match_str[matches[2].rm_so], matches[2].rm_eo - matches[2].rm_so), "b");
+ EXPECT_EQ(matches[3].rm_so, -2);
+ EXPECT_EQ(matches[3].rm_eo, -2);
+ EXPECT_EQ(matches[3].rm_cnt, 100u);
+
+ match_str = "testabctest";
+ EXPECT_EQ(regexec(&regex, match_str, num_matches - 1, matches, 0), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 1u);
+ EXPECT_EQ(matches[0].rm_so, 0u);
+ EXPECT_EQ(matches[0].rm_eo, 11u);
+ EXPECT_EQ(matches[1].rm_so, 4u);
+ EXPECT_EQ(matches[1].rm_eo, 5u);
+ EXPECT_EQ(matches[2].rm_so, 5u);
+ EXPECT_EQ(matches[2].rm_eo, 6u);
+ EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "testabctest");
+ EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "a");
+ EXPECT_EQ(StringView(&match_str[matches[2].rm_so], matches[2].rm_eo - matches[2].rm_so), "b");
+ EXPECT_EQ(matches[3].rm_so, -2);
+ EXPECT_EQ(matches[3].rm_eo, -2);
+ EXPECT_EQ(matches[3].rm_cnt, 100u);
+
+ match_str = "testabctest, testabctest";
+
+ EXPECT_EQ(regexec(&regex, match_str, num_matches - 1, matches, REG_GLOBAL), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 2u);
+ EXPECT_EQ(matches[0].rm_so, 0u);
+ EXPECT_EQ(matches[0].rm_eo, 11u);
+ EXPECT_EQ(matches[1].rm_so, 4u);
+ EXPECT_EQ(matches[1].rm_eo, 5u);
+ EXPECT_EQ(matches[2].rm_so, 5u);
+ EXPECT_EQ(matches[2].rm_eo, 6u);
+ EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "testabctest");
+ EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "a");
+ EXPECT_EQ(StringView(&match_str[matches[2].rm_so], matches[2].rm_eo - matches[2].rm_so), "b");
+ EXPECT_EQ(matches[3].rm_so, -2);
+ EXPECT_EQ(matches[3].rm_eo, -2);
+ EXPECT_EQ(matches[3].rm_cnt, 100u);
+
+ regfree(&regex);
+}
+
+TEST_CASE(multi_parens_qualifier_questionmark)
+{
+ String pattern = "test(a)?(b)?(c)?test";
+ regex_t regex;
+ static constexpr int num_matches { 8 };
+ regmatch_t matches[num_matches];
+ const char* match_str;
+
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), REG_NOERR);
+
+ match_str = "testtest";
+ EXPECT_EQ(regexec(&regex, match_str, num_matches, matches, 0), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 1u);
+ EXPECT_EQ(matches[0].rm_so, 0u);
+ EXPECT_EQ(matches[0].rm_eo, 8u);
+ EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "testtest");
+
+ match_str = "testabctest";
+ EXPECT_EQ(regexec(&regex, match_str, num_matches, matches, 0), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 1u);
+ EXPECT_EQ(matches[0].rm_so, 0u);
+ EXPECT_EQ(matches[0].rm_eo, 11u);
+ EXPECT_EQ(matches[1].rm_so, 4u);
+ EXPECT_EQ(matches[1].rm_eo, 5u);
+ EXPECT_EQ(matches[2].rm_so, 5u);
+ EXPECT_EQ(matches[2].rm_eo, 6u);
+ EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "testabctest");
+ EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "a");
+ EXPECT_EQ(StringView(&match_str[matches[2].rm_so], matches[2].rm_eo - matches[2].rm_so), "b");
+
+ match_str = "testabctest, testactest";
+
+ EXPECT_EQ(regexec(&regex, match_str, num_matches, matches, REG_GLOBAL), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 2u);
+ EXPECT_EQ(matches[0].rm_so, 0u);
+ EXPECT_EQ(matches[0].rm_eo, 11u);
+ EXPECT_EQ(matches[1].rm_so, 4u);
+ EXPECT_EQ(matches[1].rm_eo, 5u);
+ EXPECT_EQ(matches[2].rm_so, 5u);
+ EXPECT_EQ(matches[2].rm_eo, 6u);
+ EXPECT_EQ(matches[3].rm_so, 6u);
+ EXPECT_EQ(matches[3].rm_eo, 7u);
+
+ EXPECT_EQ(matches[4].rm_so, 13u);
+ EXPECT_EQ(matches[4].rm_eo, 23u);
+ EXPECT_EQ(matches[5].rm_so, 17u);
+ EXPECT_EQ(matches[5].rm_eo, 18u);
+ EXPECT_EQ(matches[6].rm_so, -1);
+ EXPECT_EQ(matches[6].rm_eo, -1);
+ EXPECT_EQ(matches[7].rm_so, 18u);
+ EXPECT_EQ(matches[7].rm_eo, 19u);
+
+ EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "testabctest");
+ EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "a");
+ EXPECT_EQ(StringView(&match_str[matches[2].rm_so], matches[2].rm_eo - matches[2].rm_so), "b");
+ EXPECT_EQ(StringView(&match_str[matches[3].rm_so], matches[3].rm_eo - matches[3].rm_so), "c");
+ EXPECT_EQ(StringView(&match_str[matches[4].rm_so], matches[4].rm_eo - matches[4].rm_so), "testactest");
+ EXPECT_EQ(StringView(&match_str[matches[5].rm_so], matches[5].rm_eo - matches[5].rm_so), "a");
+ EXPECT_EQ(StringView(&match_str[matches[6].rm_so], matches[6].rm_eo - matches[6].rm_so), "");
+ EXPECT_EQ(StringView(&match_str[matches[7].rm_so], matches[7].rm_eo - matches[7].rm_so), "c");
+
+ regfree(&regex);
+}
+
+TEST_CASE(simple_alternative)
+{
+ String pattern = "test|hello|friends";
+ regex_t regex;
+ static constexpr int num_matches { 1 };
+ regmatch_t matches[num_matches];
+
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), REG_NOERR);
+
+ EXPECT_EQ(regexec(&regex, "test", num_matches, matches, 0), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 1u);
+ EXPECT_EQ(matches[0].rm_so, 0u);
+ EXPECT_EQ(matches[0].rm_eo, 4u);
+
+ EXPECT_EQ(regexec(&regex, "hello", num_matches, matches, 0), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 1u);
+ EXPECT_EQ(matches[0].rm_so, 0u);
+ EXPECT_EQ(matches[0].rm_eo, 5u);
+
+ EXPECT_EQ(regexec(&regex, "friends", num_matches, matches, 0), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 1u);
+ EXPECT_EQ(matches[0].rm_so, 0u);
+ EXPECT_EQ(matches[0].rm_eo, 7u);
+
+ regfree(&regex);
+}
+
+TEST_CASE(alternative_match_groups)
+{
+ String pattern = "test(a)?(b)?|hello ?(dear|my)? friends";
+ regex_t regex;
+ static constexpr int num_matches { 8 };
+ regmatch_t matches[num_matches];
+ const char* match_str;
+
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), REG_NOERR);
+
+ match_str = "test";
+ EXPECT_EQ(regexec(&regex, match_str, num_matches, matches, 0), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 1u);
+ EXPECT_EQ(matches[0].rm_so, 0);
+ EXPECT_EQ(matches[0].rm_eo, 4);
+ EXPECT_EQ(matches[1].rm_so, -1);
+ EXPECT_EQ(matches[1].rm_eo, -1);
+ EXPECT_EQ(matches[2].rm_so, -1);
+ EXPECT_EQ(matches[2].rm_eo, -1);
+ EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "test");
+ EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "");
+ EXPECT_EQ(StringView(&match_str[matches[2].rm_so], matches[2].rm_eo - matches[2].rm_so), "");
+
+ match_str = "testa";
+ EXPECT_EQ(regexec(&regex, match_str, num_matches, matches, 0), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 1u);
+ EXPECT_EQ(matches[0].rm_so, 0u);
+ EXPECT_EQ(matches[0].rm_eo, 5u);
+ EXPECT_EQ(matches[1].rm_so, 4u);
+ EXPECT_EQ(matches[1].rm_eo, 5u);
+ EXPECT_EQ(matches[2].rm_so, -1);
+ EXPECT_EQ(matches[2].rm_eo, -1);
+ EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "testa");
+ EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "a");
+ EXPECT_EQ(StringView(&match_str[matches[2].rm_so], matches[2].rm_eo - matches[2].rm_so), "");
+
+ match_str = "testb";
+ EXPECT_EQ(regexec(&regex, match_str, num_matches, matches, 0), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 1u);
+ EXPECT_EQ(matches[0].rm_so, 0u);
+ EXPECT_EQ(matches[0].rm_eo, 5u);
+ EXPECT_EQ(matches[1].rm_so, -1);
+ EXPECT_EQ(matches[1].rm_eo, -1);
+ EXPECT_EQ(matches[2].rm_so, 4u);
+ EXPECT_EQ(matches[2].rm_eo, 5u);
+ EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "testb");
+ EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "");
+ EXPECT_EQ(StringView(&match_str[matches[2].rm_so], matches[2].rm_eo - matches[2].rm_so), "b");
+
+ match_str = "hello friends";
+ EXPECT_EQ(regexec(&regex, match_str, num_matches, matches, 0), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 1u);
+ EXPECT_EQ(matches[0].rm_so, 0u);
+ EXPECT_EQ(matches[0].rm_eo, 13u);
+ EXPECT_EQ(matches[1].rm_so, -1);
+ EXPECT_EQ(matches[1].rm_eo, -1);
+ EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "hello friends");
+ EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "");
+
+ match_str = "hello dear friends";
+ EXPECT_EQ(regexec(&regex, match_str, num_matches, matches, 0), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 1u);
+ EXPECT_EQ(matches[0].rm_so, 0u);
+ EXPECT_EQ(matches[0].rm_eo, 18u);
+ EXPECT_EQ(matches[1].rm_so, -1);
+ EXPECT_EQ(matches[1].rm_eo, -1);
+ EXPECT_EQ(matches[2].rm_so, -1);
+ EXPECT_EQ(matches[2].rm_eo, -1);
+ EXPECT_EQ(matches[3].rm_so, 6);
+ EXPECT_EQ(matches[3].rm_eo, 10);
+ EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "hello dear friends");
+ EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "");
+ EXPECT_EQ(StringView(&match_str[matches[2].rm_so], matches[2].rm_eo - matches[2].rm_so), "");
+ EXPECT_EQ(StringView(&match_str[matches[3].rm_so], matches[3].rm_eo - matches[3].rm_so), "dear");
+
+ match_str = "hello my friends";
+ EXPECT_EQ(regexec(&regex, match_str, num_matches, matches, 0), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 1u);
+ EXPECT_EQ(matches[0].rm_so, 0u);
+ EXPECT_EQ(matches[0].rm_eo, 16u);
+ EXPECT_EQ(matches[1].rm_so, -1);
+ EXPECT_EQ(matches[1].rm_eo, -1);
+ EXPECT_EQ(matches[2].rm_so, -1);
+ EXPECT_EQ(matches[2].rm_eo, -1);
+ EXPECT_EQ(matches[3].rm_so, 6);
+ EXPECT_EQ(matches[3].rm_eo, 8);
+ EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "hello my friends");
+ EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "");
+ EXPECT_EQ(StringView(&match_str[matches[2].rm_so], matches[2].rm_eo - matches[2].rm_so), "");
+ EXPECT_EQ(StringView(&match_str[matches[3].rm_so], matches[3].rm_eo - matches[3].rm_so), "my");
+
+ match_str = "testabc";
+ EXPECT_EQ(regexec(&regex, match_str, num_matches, matches, 0), REG_NOMATCH);
+ EXPECT_EQ(matches[0].rm_cnt, 0u);
+ EXPECT_EQ(matches[0].rm_so, -1);
+ EXPECT_EQ(matches[0].rm_eo, -1);
+
+ match_str = "hello test friends";
+ EXPECT_EQ(regexec(&regex, match_str, num_matches, matches, 0), REG_NOMATCH);
+ EXPECT_EQ(matches[0].rm_cnt, 0u);
+ EXPECT_EQ(matches[0].rm_so, -1);
+ EXPECT_EQ(matches[0].rm_eo, -1);
+
+ regfree(&regex);
+}
+
+TEST_CASE(parens_qualifier_exact)
+{
+ String pattern = "(hello){3}";
+ regex_t regex;
+ static constexpr int num_matches { 5 };
+ regmatch_t matches[num_matches];
+ const char* match_str;
+
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), REG_NOERR);
+
+ match_str = "hello";
+ EXPECT_EQ(regexec(&regex, match_str, num_matches, matches, 0), REG_NOMATCH);
+ EXPECT_EQ(matches[0].rm_cnt, 0u);
+
+ match_str = "hellohellohello";
+ EXPECT_EQ(regexec(&regex, match_str, num_matches, matches, 0), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 1u);
+ EXPECT_EQ(matches[0].rm_so, 0u);
+ EXPECT_EQ(matches[0].rm_eo, 15u);
+ EXPECT_EQ(matches[1].rm_so, 10u);
+ EXPECT_EQ(matches[1].rm_eo, 15u);
+ EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "hellohellohello");
+ EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "hello");
+
+ match_str = "hellohellohellohello";
+ EXPECT_EQ(regexec(&regex, match_str, num_matches, matches, REG_GLOBAL), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 1u);
+ EXPECT_EQ(matches[0].rm_so, 0u);
+ EXPECT_EQ(matches[0].rm_eo, 15u);
+ EXPECT_EQ(matches[1].rm_so, 10u);
+ EXPECT_EQ(matches[1].rm_eo, 15u);
+ EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "hellohellohello");
+ EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "hello");
+
+ match_str = "test hellohellohello";
+ EXPECT_EQ(regexec(&regex, match_str, num_matches, matches, REG_GLOBAL), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 1u);
+ EXPECT_EQ(matches[0].rm_so, 5u);
+ EXPECT_EQ(matches[0].rm_eo, 20u);
+ EXPECT_EQ(matches[1].rm_so, 15u);
+ EXPECT_EQ(matches[1].rm_eo, 20u);
+ EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "hellohellohello");
+ EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "hello");
+
+ regfree(&regex);
+}
+
+TEST_CASE(parens_qualifier_minimum)
+{
+ String pattern = "(hello){3,}";
+ regex_t regex;
+ static constexpr int num_matches { 5 };
+ regmatch_t matches[num_matches];
+ const char* match_str;
+
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), REG_NOERR);
+
+ match_str = "hello";
+ EXPECT_EQ(regexec(&regex, match_str, num_matches, matches, 0), REG_NOMATCH);
+ EXPECT_EQ(matches[0].rm_cnt, 0u);
+
+ match_str = "hellohellohello";
+ EXPECT_EQ(regexec(&regex, match_str, num_matches, matches, 0), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 1u);
+ EXPECT_EQ(matches[0].rm_so, 0u);
+ EXPECT_EQ(matches[0].rm_eo, 15u);
+ EXPECT_EQ(matches[1].rm_so, 10u);
+ EXPECT_EQ(matches[1].rm_eo, 15u);
+ EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "hellohellohello");
+ EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "hello");
+
+ match_str = "hellohellohellohello";
+
+ EXPECT_EQ(regexec(&regex, match_str, num_matches, matches, REG_SEARCH), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 1u);
+ EXPECT_EQ(matches[0].rm_so, 0u);
+ EXPECT_EQ(matches[0].rm_eo, 20u);
+ EXPECT_EQ(matches[1].rm_so, 15u);
+ EXPECT_EQ(matches[1].rm_eo, 20u);
+ EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "hellohellohellohello");
+ EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "hello");
+
+ match_str = "test hellohellohello";
+ EXPECT_EQ(regexec(&regex, match_str, num_matches, matches, REG_GLOBAL), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 1u);
+ EXPECT_EQ(matches[0].rm_so, 5u);
+ EXPECT_EQ(matches[0].rm_eo, 20u);
+ EXPECT_EQ(matches[1].rm_so, 15u);
+ EXPECT_EQ(matches[1].rm_eo, 20u);
+ EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "hellohellohello");
+ EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "hello");
+
+ match_str = "test hellohellohellohello";
+ EXPECT_EQ(regexec(&regex, match_str, num_matches, matches, REG_GLOBAL), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 1u);
+ EXPECT_EQ(matches[0].rm_so, 5u);
+ EXPECT_EQ(matches[0].rm_eo, 25u);
+ EXPECT_EQ(matches[1].rm_so, 20u);
+ EXPECT_EQ(matches[1].rm_eo, 25u);
+ EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "hellohellohellohello");
+ EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "hello");
+
+ regfree(&regex);
+}
+
+TEST_CASE(parens_qualifier_maximum)
+{
+ String pattern = "(hello){2,3}";
+ regex_t regex;
+ static constexpr int num_matches { 5 };
+ regmatch_t matches[num_matches];
+ const char* match_str;
+
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), REG_NOERR);
+
+ match_str = "hello";
+ EXPECT_EQ(regexec(&regex, match_str, num_matches, matches, 0), REG_NOMATCH);
+ EXPECT_EQ(matches[0].rm_cnt, 0u);
+
+ match_str = "hellohellohello";
+ EXPECT_EQ(regexec(&regex, match_str, num_matches, matches, 0), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 1u);
+ EXPECT_EQ(matches[0].rm_so, 0u);
+ EXPECT_EQ(matches[0].rm_eo, 15u);
+ EXPECT_EQ(matches[1].rm_so, 10u);
+ EXPECT_EQ(matches[1].rm_eo, 15u);
+ EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "hellohellohello");
+ EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "hello");
+
+ match_str = "hellohellohellohello";
+ EXPECT_EQ(regexec(&regex, match_str, num_matches, matches, REG_GLOBAL), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 1u);
+ EXPECT_EQ(matches[0].rm_so, 0u);
+ EXPECT_EQ(matches[0].rm_eo, 15u);
+ EXPECT_EQ(matches[1].rm_so, 10u);
+ EXPECT_EQ(matches[1].rm_eo, 15u);
+ EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "hellohellohello");
+ EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "hello");
+
+ match_str = "test hellohellohello";
+ EXPECT_EQ(regexec(&regex, match_str, num_matches, matches, REG_GLOBAL), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 1u);
+ EXPECT_EQ(matches[0].rm_so, 5u);
+ EXPECT_EQ(matches[0].rm_eo, 20u);
+ EXPECT_EQ(matches[1].rm_so, 15u);
+ EXPECT_EQ(matches[1].rm_eo, 20u);
+ EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "hellohellohello");
+ EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "hello");
+
+ match_str = "test hellohellohellohello";
+ EXPECT_EQ(regexec(&regex, match_str, num_matches, matches, REG_GLOBAL), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 1u);
+ EXPECT_EQ(matches[0].rm_so, 5u);
+ EXPECT_EQ(matches[0].rm_eo, 20u);
+ EXPECT_EQ(matches[1].rm_so, 15u);
+ EXPECT_EQ(matches[1].rm_eo, 20u);
+ EXPECT_EQ(StringView(&match_str[matches[0].rm_so], matches[0].rm_eo - matches[0].rm_so), "hellohellohello");
+ EXPECT_EQ(StringView(&match_str[matches[1].rm_so], matches[1].rm_eo - matches[1].rm_so), "hello");
+
+ regfree(&regex);
+}
+
+TEST_CASE(char_qualifier_min_max)
+{
+ String pattern = "c{3,30}";
+ regex_t regex;
+ static constexpr int num_matches { 5 };
+ regmatch_t matches[num_matches];
+
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), REG_NOERR);
+
+ EXPECT_EQ(regexec(&regex, "cc", num_matches, matches, 0), REG_NOMATCH);
+ EXPECT_EQ(regexec(&regex, "ccc", num_matches, matches, 0), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "cccccccccccccccccccccccccccccc", num_matches, matches, 0), REG_NOERR);
+ EXPECT_EQ(matches[0].rm_cnt, 1u);
+ EXPECT_EQ(regexec(&regex, "ccccccccccccccccccccccccccccccc", num_matches, matches, 0), REG_NOMATCH);
+ EXPECT_EQ(regexec(&regex, "ccccccccccccccccccccccccccccccc", num_matches, matches, REG_GLOBAL), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "cccccccccccccccccccccccccccccccc", num_matches, matches, 0), REG_NOMATCH);
+
+ regfree(&regex);
+}
+
+TEST_CASE(simple_bracket_chars)
+{
+ String pattern = "[abc]";
+ regex_t regex;
+
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "a", 0, NULL, 0), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "b", 0, NULL, 0), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "c", 0, NULL, 0), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "d", 0, NULL, 0), REG_NOMATCH);
+ EXPECT_EQ(regexec(&regex, "e", 0, NULL, 0), REG_NOMATCH);
+}
+
+TEST_CASE(simple_bracket_chars_inverse)
+{
+ String pattern = "[^abc]";
+ regex_t regex;
+
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "a", 0, NULL, 0), REG_NOMATCH);
+ EXPECT_EQ(regexec(&regex, "b", 0, NULL, 0), REG_NOMATCH);
+ EXPECT_EQ(regexec(&regex, "c", 0, NULL, 0), REG_NOMATCH);
+ EXPECT_EQ(regexec(&regex, "d", 0, NULL, 0), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "e", 0, NULL, 0), REG_NOERR);
+}
+
+TEST_CASE(simple_bracket_chars_range)
+{
+ String pattern = "[a-d]";
+ regex_t regex;
+
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "a", 0, NULL, 0), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "b", 0, NULL, 0), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "c", 0, NULL, 0), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "d", 0, NULL, 0), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "e", 0, NULL, 0), REG_NOMATCH);
+}
+
+TEST_CASE(simple_bracket_chars_range_inverse)
+{
+ String pattern = "[^a-df-z]";
+ regex_t regex;
+
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "a", 0, NULL, 0), REG_NOMATCH);
+ EXPECT_EQ(regexec(&regex, "b", 0, NULL, 0), REG_NOMATCH);
+ EXPECT_EQ(regexec(&regex, "c", 0, NULL, 0), REG_NOMATCH);
+ EXPECT_EQ(regexec(&regex, "d", 0, NULL, 0), REG_NOMATCH);
+ EXPECT_EQ(regexec(&regex, "e", 0, NULL, 0), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "k", 0, NULL, 0), REG_NOMATCH);
+ EXPECT_EQ(regexec(&regex, "z", 0, NULL, 0), REG_NOMATCH);
+}
+
+TEST_CASE(bracket_character_class_uuid)
+{
+ String pattern = "^([[:xdigit:]]{8})-([[:xdigit:]]{4})-([[:xdigit:]]{4})-([[:xdigit:]]{4})-([[:xdigit:]]{12})$";
+ regex_t regex;
+
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "fb9b62a2-1579-4e3a-afba-76239ccb6583", 0, NULL, 0), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "fb9b62a2", 0, NULL, 0), REG_NOMATCH);
+
+ regfree(&regex);
+}
+
+TEST_CASE(simple_bracket_character_class_inverse)
+{
+ String pattern = "[^[:digit:]]";
+ regex_t regex;
+
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "1", 0, NULL, 0), REG_NOMATCH);
+ EXPECT_EQ(regexec(&regex, "2", 0, NULL, 0), REG_NOMATCH);
+ EXPECT_EQ(regexec(&regex, "3", 0, NULL, 0), REG_NOMATCH);
+ EXPECT_EQ(regexec(&regex, "d", 0, NULL, 0), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "e", 0, NULL, 0), REG_NOERR);
+}
+
+TEST_CASE(email_address)
+{
+ String pattern = "^[A-Z0-9a-z._%+-]{1,64}@(?:[A-Za-z0-9-]{1,63}\\.){1,125}[A-Za-z]{2,63}$";
+ regex_t regex;
+
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "emanuel.sprung@gmail.com", 0, NULL, 0), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "kling@serenityos.org", 0, NULL, 0), REG_NOERR);
+
+ regfree(&regex);
+}
+
+TEST_CASE(error_message)
+{
+ String pattern = "^[A-Z0-9[a-z._%+-]{1,64}@[A-Za-z0-9-]{1,63}\\.{1,125}[A-Za-z]{2,63}$";
+ regex_t regex;
+
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED), REG_EBRACK);
+ EXPECT_EQ(regexec(&regex, "asdf@asdf.com", 0, NULL, 0), REG_EBRACK);
+ char buf[1024];
+ size_t buflen = 1024;
+ auto len = regerror(0, &regex, buf, buflen);
+ String expected = "Error during parsing of regular expression:\n ^[A-Z0-9[a-z._%+-]{1,64}@[A-Za-z0-9-]{1,63}\\.{1,125}[A-Za-z]{2,63}$\n ^---- [ ] imbalance.\n";
+ for (size_t i = 0; i < len; ++i) {
+ EXPECT_EQ(buf[i], expected[i]);
+ }
+
+ regfree(&regex);
+}
+
+TEST_CASE(simple_ignorecase)
+{
+ String pattern = "^hello friends";
+ regex_t regex;
+
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED | REG_NOSUB | REG_ICASE), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "Hello Friends", 0, NULL, 0), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "hello Friends", 0, NULL, 0), REG_NOERR);
+
+ EXPECT_EQ(regexec(&regex, "hello Friends!", 0, NULL, 0), REG_NOMATCH);
+ EXPECT_EQ(regexec(&regex, "hello Friends!", 0, NULL, REG_GLOBAL), REG_NOERR);
+
+ EXPECT_EQ(regexec(&regex, "hell Friends", 0, NULL, 0), REG_NOMATCH);
+ EXPECT_EQ(regexec(&regex, "hell Friends", 0, NULL, REG_GLOBAL), REG_NOMATCH);
+
+ regfree(&regex);
+}
+
+TEST_CASE(simple_notbol_noteol)
+{
+ String pattern = "^hello friends$";
+ regex_t regex;
+
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED | REG_NOSUB | REG_ICASE), REG_NOERR);
+
+ EXPECT_EQ(regexec(&regex, "hello friends", 0, NULL, REG_NOTBOL), REG_NOMATCH);
+ EXPECT_EQ(regexec(&regex, "hello friends", 0, NULL, REG_NOTEOL), REG_NOMATCH);
+ EXPECT_EQ(regexec(&regex, "hello friends", 0, NULL, REG_NOTBOL | REG_NOTEOL), REG_NOMATCH);
+
+ EXPECT_EQ(regexec(&regex, "a hello friends b", 0, NULL, REG_NOTBOL), REG_NOMATCH);
+ EXPECT_EQ(regexec(&regex, "a hello friends", 0, NULL, REG_NOTBOL), REG_NOMATCH);
+ EXPECT_EQ(regexec(&regex, "a hello friends", 0, NULL, REG_NOTBOL | REG_SEARCH), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "a hello friends b", 0, NULL, REG_NOTBOL | REG_SEARCH), REG_NOERR);
+
+ EXPECT_EQ(regexec(&regex, "a hello friends b", 0, NULL, REG_NOTEOL), REG_NOMATCH);
+ EXPECT_EQ(regexec(&regex, "hello friends b", 0, NULL, REG_NOTEOL), REG_NOMATCH);
+ EXPECT_EQ(regexec(&regex, "hello friends b", 0, NULL, REG_NOTEOL | REG_SEARCH), REG_NOERR);
+ EXPECT_EQ(regexec(&regex, "a hello friends b", 0, NULL, REG_NOTEOL | REG_SEARCH), REG_NOMATCH);
+
+ EXPECT_EQ(regexec(&regex, "a hello friends b", 0, NULL, REG_NOTBOL | REG_NOTEOL), REG_NOMATCH);
+ EXPECT_EQ(regexec(&regex, "a hello friends b", 0, NULL, REG_NOTBOL | REG_NOTEOL | REG_SEARCH), REG_NOMATCH);
+
+ pattern = "hello friends";
+ EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED | REG_NOSUB | REG_ICASE), REG_NOERR);
+
+ EXPECT_EQ(regexec(&regex, "hello friends", 0, NULL, REG_NOTBOL), REG_NOMATCH);
+ EXPECT_EQ(regexec(&regex, "hello friends", 0, NULL, REG_NOTEOL), REG_NOMATCH);
+
+ regfree(&regex);
+}
+
+//#if not(defined(REGEX_DEBUG) || defined(REGEX_MATCH_STATUS) || defined(DISABLE_REGEX_BENCHMARK))
+//BENCHMARK_CASE(simple_notbol_noteol_benchmark)
+//{
+// String pattern = "^hello friends$";
+// regex_t regex;
+
+// EXPECT_EQ(regcomp(&regex, pattern.characters(), REG_EXTENDED | REG_NOSUB | REG_ICASE), REG_NOERR);
+
+// for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+
+// EXPECT_EQ(regexec(&regex, "hello friends", 0, NULL, REG_NOTBOL), REG_NOMATCH);
+// EXPECT_EQ(regexec(&regex, "hello friends", 0, NULL, REG_NOTEOL), REG_NOMATCH);
+// EXPECT_EQ(regexec(&regex, "hello friends", 0, NULL, REG_NOTBOL | REG_NOTEOL), REG_NOMATCH);
+
+// EXPECT_EQ(regexec(&regex, "a hello friends b", 0, NULL, REG_NOTBOL), REG_NOMATCH);
+// EXPECT_EQ(regexec(&regex, "a hello friends", 0, NULL, REG_NOTBOL), REG_NOMATCH);
+// EXPECT_EQ(regexec(&regex, "a hello friends", 0, NULL, REG_NOTBOL | REG_SEARCH), REG_NOERR);
+// EXPECT_EQ(regexec(&regex, "a hello friends b", 0, NULL, REG_NOTBOL | REG_SEARCH), REG_NOMATCH);
+
+// EXPECT_EQ(regexec(&regex, "a hello friends b", 0, NULL, REG_NOTEOL), REG_NOMATCH);
+// EXPECT_EQ(regexec(&regex, "hello friends b", 0, NULL, REG_NOTEOL), REG_NOMATCH);
+// EXPECT_EQ(regexec(&regex, "hello friends b", 0, NULL, REG_NOTEOL | REG_SEARCH), REG_NOERR);
+// EXPECT_EQ(regexec(&regex, "a hello friends b", 0, NULL, REG_NOTEOL | REG_SEARCH), REG_NOMATCH);
+
+// EXPECT_EQ(regexec(&regex, "a hello friends b", 0, NULL, REG_NOTBOL | REG_NOTEOL), REG_NOMATCH);
+// EXPECT_EQ(regexec(&regex, "a hello friends b", 0, NULL, REG_NOTBOL | REG_NOTEOL | REG_SEARCH), REG_NOERR);
+// }
+
+// regfree(&regex);
+//}
+
+BENCHMARK_CASE(simple_notbol_noteol_benchmark_reference_stdcpp_regex_match)
+{
+ std::regex re1("^hello friends$", std::regex_constants::match_not_bol);
+ std::regex re2("^hello friends$", std::regex_constants::match_not_eol);
+ std::regex re3("^hello friends$", std::regex_constants::match_not_bol | std::regex_constants::match_not_eol);
+ std::regex re4("hello friends", std::regex_constants::match_not_bol);
+ std::regex re5("hello friends", std::regex_constants::match_not_eol);
+ std::cmatch m;
+ //for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
+ EXPECT_EQ(std::regex_match("hello friends", m, re1), false);
+ EXPECT_EQ(std::regex_match("hello friends", m, re2), false);
+ EXPECT_EQ(std::regex_match("hello friends", m, re3), false);
+
+ EXPECT_EQ(std::regex_match("a hello friends b", m, re1), false);
+ EXPECT_EQ(std::regex_match("a hello friends", m, re1), false);
+ EXPECT_EQ(std::regex_search("a hello friends", m, re1), true);
+ EXPECT_EQ(std::regex_search("a hello friends b", m, re1), true);
+
+ EXPECT_EQ(std::regex_match("a hello friends b", m, re2), false);
+ EXPECT_EQ(std::regex_match("hello friends b", m, re2), false);
+ EXPECT_EQ(std::regex_search("hello friends b", m, re2), true);
+ EXPECT_EQ(std::regex_search("a hello friends b", m, re2), false);
+
+ EXPECT_EQ(std::regex_match("a hello friends b", m, re3), false);
+ EXPECT_EQ(std::regex_search("a hello friends b", m, re3), false);
+
+ EXPECT_EQ(std::regex_match("hello friends", m, re4), false);
+ EXPECT_EQ(std::regex_match("hello friends", m, re5), false);
+ //}
+}
+//#endif
+
+TEST_MAIN(Regex)
diff --git a/Meta/Lagom/CMakeLists.txt b/Meta/Lagom/CMakeLists.txt
index 3160964c23..d2c733212a 100644
--- a/Meta/Lagom/CMakeLists.txt
+++ b/Meta/Lagom/CMakeLists.txt
@@ -39,6 +39,8 @@ elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
endif()
file(GLOB AK_SOURCES CONFIGURE_DEPENDS "../../AK/*.cpp")
+file(GLOB LIBREGEX_LIBC_SOURCES "../../Libraries/LibRegex/C/Regex.cpp")
+file(GLOB LIBREGEX_SOURCES CONFIGURE_DEPENDS "../../Libraries/LibRegex/*.cpp")
file(GLOB LIBCORE_SOURCES CONFIGURE_DEPENDS "../../Libraries/LibCore/*.cpp")
file(GLOB LIBELF_SOURCES CONFIGURE_DEPENDS "../../Libraries/LibELF/*.cpp")
file(GLOB LIBGEMINI_SOURCES CONFIGURE_DEPENDS "../../Libraries/LibGemini/*.cpp")
@@ -58,8 +60,9 @@ file(GLOB LIBTLS_SOURCES CONFIGURE_DEPENDS "../../Libraries/LibTLS/*.cpp")
file(GLOB SHELL_SOURCES CONFIGURE_DEPENDS "../../Shell/*.cpp")
file(GLOB SHELL_TESTS CONFIGURE_DEPENDS "../../Shell/Tests/*.sh")
+set(LAGOM_REGEX_SOURCES ${LIBREGEX_LIBC_SOURCES} ${LIBREGEX_SOURCES})
set(LAGOM_CORE_SOURCES ${AK_SOURCES} ${LIBCORE_SOURCES})
-set(LAGOM_MORE_SOURCES ${LIBELF_SOURCES} ${LIBIPC_SOURCES} ${LIBLINE_SOURCES} ${LIBJS_SOURCES} ${LIBJS_SUBDIR_SOURCES} ${LIBX86_SOURCES} ${LIBCRYPTO_SOURCES} ${LIBCOMPRESS_SOURCES} ${LIBCRYPTO_SUBDIR_SOURCES} ${LIBTLS_SOURCES} ${LIBMARKDOWN_SOURCES} ${LIBGEMINI_SOURCES} ${LIBGFX_SOURCES} ${LIBHTTP_SOURCES})
+set(LAGOM_MORE_SOURCES ${LIBELF_SOURCES} ${LIBIPC_SOURCES} ${LIBLINE_SOURCES} ${LIBJS_SOURCES} ${LIBJS_SUBDIR_SOURCES} ${LIBX86_SOURCES} ${LIBCRYPTO_SOURCES} ${LIBCOMPRESS_SOURCES} ${LIBCRYPTO_SUBDIR_SOURCES} ${LIBTLS_SOURCES} ${LIBMARKDOWN_SOURCES} ${LIBGEMINI_SOURCES} ${LIBGFX_SOURCES} ${LIBHTTP_SOURCES} ${LAGOM_REGEX_SOURCES})
include_directories (../../)
include_directories (../../Libraries/)