From 79bbca4644cad7f2dee89c7ac6b8f9acc2c8b427 Mon Sep 17 00:00:00 2001 From: ailin-nemui Date: Thu, 16 Feb 2017 22:48:13 +0100 Subject: Refactor regex and implement UTF8 mode for GRegex - with non-unicode byte to Private Use Area A mapping - move all ifdefs to iregex.h file only --- src/core/Makefile.am | 8 +++ src/core/ignore.c | 40 ++------------ src/core/ignore.h | 11 +--- src/core/iregex-gregex.c | 137 +++++++++++++++++++++++++++++++++++++++++++++++ src/core/iregex-regexh.c | 101 ++++++++++++++++++++++++++++++++++ src/core/iregex.h | 52 ++++++++++++++++++ src/core/misc.c | 4 -- 7 files changed, 305 insertions(+), 48 deletions(-) create mode 100644 src/core/iregex-gregex.c create mode 100644 src/core/iregex-regexh.c create mode 100644 src/core/iregex.h (limited to 'src/core') diff --git a/src/core/Makefile.am b/src/core/Makefile.am index 10bd035a..91daba3f 100644 --- a/src/core/Makefile.am +++ b/src/core/Makefile.am @@ -7,6 +7,12 @@ AM_CPPFLAGS = \ -DSYSCONFDIR=\""$(sysconfdir)"\" \ -DMODULEDIR=\""$(libdir)/irssi/modules"\" +if USE_GREGEX +regex_impl=iregex-gregex.c +else +regex_impl=iregex-regexh.c +endif + libcore_a_SOURCES = \ args.c \ channels.c \ @@ -45,6 +51,7 @@ libcore_a_SOURCES = \ signals.c \ special-vars.c \ utf8.c \ + $(regex_impl) \ wcwidth.c \ tls.c \ write-buffer.c @@ -97,6 +104,7 @@ pkginc_core_HEADERS = \ signals.h \ special-vars.h \ utf8.h \ + iregex.h \ window-item-def.h \ tls.h \ write-buffer.h \ diff --git a/src/core/ignore.c b/src/core/ignore.c index d4a92e3c..63a507f5 100644 --- a/src/core/ignore.c +++ b/src/core/ignore.c @@ -24,6 +24,7 @@ #include "levels.h" #include "lib-config/iconfig.h" #include "settings.h" +#include "iregex.h" #include "masks.h" #include "servers.h" @@ -67,13 +68,8 @@ static int ignore_match_pattern(IGNORE_REC *rec, const char *text) return FALSE; if (rec->regexp) { -#ifdef USE_GREGEX return rec->preg != NULL && - g_regex_match(rec->preg, text, 0, NULL); -#else - return rec->regexp_compiled && - regexec(&rec->preg, 
text, 0, NULL, 0) == 0; -#endif + i_regex_match(rec->preg, text, 0, NULL, NULL); } return rec->fullword ? @@ -327,41 +323,19 @@ static void ignore_remove_config(IGNORE_REC *rec) static void ignore_init_rec(IGNORE_REC *rec) { -#ifdef USE_GREGEX if (rec->preg != NULL) - g_regex_unref(rec->preg); + i_regex_unref(rec->preg); if (rec->regexp && rec->pattern != NULL) { GError *re_error = NULL; - rec->preg = g_regex_new(rec->pattern, G_REGEX_OPTIMIZE | G_REGEX_RAW | G_REGEX_CASELESS, 0, &re_error); + rec->preg = i_regex_new(rec->pattern, G_REGEX_OPTIMIZE | G_REGEX_CASELESS, 0, &re_error); if (rec->preg == NULL) { g_warning("Failed to compile regexp '%s': %s", rec->pattern, re_error->message); g_error_free(re_error); } } -#else - char *errbuf; - int errcode, errbuf_len; - - if (rec->regexp_compiled) regfree(&rec->preg); - rec->regexp_compiled = FALSE; - - if (rec->regexp && rec->pattern != NULL) { - errcode = regcomp(&rec->preg, rec->pattern, - REG_EXTENDED|REG_ICASE|REG_NOSUB); - if (errcode != 0) { - errbuf_len = regerror(errcode, &rec->preg, 0, 0); - errbuf = g_malloc(errbuf_len); - regerror(errcode, &rec->preg, errbuf, errbuf_len); - g_warning("Failed to compile regexp '%s': %s", rec->pattern, errbuf); - g_free(errbuf); - } else { - rec->regexp_compiled = TRUE; - } - } -#endif } void ignore_add_rec(IGNORE_REC *rec) @@ -381,11 +355,7 @@ static void ignore_destroy(IGNORE_REC *rec, int send_signal) if (send_signal) signal_emit("ignore destroyed", 1, rec); -#ifdef USE_GREGEX - if (rec->preg != NULL) g_regex_unref(rec->preg); -#else - if (rec->regexp_compiled) regfree(&rec->preg); -#endif + if (rec->preg != NULL) i_regex_unref(rec->preg); if (rec->channels != NULL) g_strfreev(rec->channels); g_free_not_null(rec->mask); g_free_not_null(rec->servertag); diff --git a/src/core/ignore.h b/src/core/ignore.h index 80ae1d12..e18be3c4 100644 --- a/src/core/ignore.h +++ b/src/core/ignore.h @@ -1,9 +1,7 @@ #ifndef __IGNORE_H #define __IGNORE_H -#ifndef USE_GREGEX -# include <regex.h> -#endif 
+#include "iregex.h" typedef struct _IGNORE_REC IGNORE_REC; @@ -20,12 +18,7 @@ struct _IGNORE_REC { unsigned int regexp:1; unsigned int fullword:1; unsigned int replies:1; /* ignore replies to nick in channel */ -#ifdef USE_GREGEX - GRegex *preg; -#else - unsigned int regexp_compiled:1; /* should always be TRUE, unless regexp is invalid */ - regex_t preg; -#endif + Regex *preg; }; extern GSList *ignores; diff --git a/src/core/iregex-gregex.c b/src/core/iregex-gregex.c new file mode 100644 index 00000000..0de11e64 --- /dev/null +++ b/src/core/iregex-gregex.c @@ -0,0 +1,137 @@ +#include <string.h> + +#include "iregex.h" + +const gchar * +make_valid_utf8(const gchar *text, gboolean *free_ret) +{ + GString *str; + const gchar *ptr; + if (g_utf8_validate(text, -1, NULL)) { + if (free_ret) + *free_ret = FALSE; + return text; + } + + str = g_string_sized_new(strlen(text) + 12); + + ptr = text; + while (*ptr) { + gunichar c = g_utf8_get_char_validated(ptr, -1); + /* the unicode is invalid */ + if (c == (gunichar)-1 || c == (gunichar)-2) { + /* encode the byte into PUA-A */ + g_string_append_unichar(str, (gunichar) (0xfff00 | (*ptr & 0xff))); + ptr++; + } else { + g_string_append_unichar(str, c); + ptr = g_utf8_next_char(ptr); + } + } + + if (free_ret) + *free_ret = TRUE; + return g_string_free(str, FALSE); +} + +Regex * +i_regex_new (const gchar *pattern, + GRegexCompileFlags compile_options, + GRegexMatchFlags match_options, + GError **error) +{ + const gchar *valid_pattern; + gboolean free_valid_pattern; + Regex *ret = NULL; + + valid_pattern = make_valid_utf8(pattern, &free_valid_pattern); + ret = g_regex_new(valid_pattern, compile_options, match_options, error); + + if (free_valid_pattern) + g_free_not_null((gchar *)valid_pattern); + + return ret; +} + +void +i_regex_unref (Regex *regex) +{ + g_regex_unref(regex); +} + +/* if new_string is present, the caller must free new_string. + otherwise, g_match_info_get_string must not be used. 
*/ +gboolean +i_regex_match (const Regex *regex, + const gchar *string, + GRegexMatchFlags match_options, + MatchInfo **match_info, + const gchar **new_string) +{ + gboolean ret; + gboolean free_valid_string; + const gchar *valid_string = make_valid_utf8(string, &free_valid_string); + + ret = g_regex_match(regex, valid_string, match_options, match_info); + if (free_valid_string) { + if (new_string) + *new_string = valid_string; + else + g_free_not_null((gchar *)valid_string); + } + return ret; +} + +gsize +strlen_pua_oddly(const char *str) +{ + const gchar *ptr; + gsize ret = 0; + ptr = str; + + while (*ptr) { + const gchar *old; + gunichar c = g_utf8_get_char(ptr); + old = ptr; + ptr = g_utf8_next_char(ptr); + + /* it is our PUA encoded byte */ + if ((c & 0xfff00) == 0xfff00) + ret++; + else + ret += ptr - old; + } + + return ret; +} + +gboolean +i_match_info_fetch_pos (const MatchInfo *match_info, + gint match_num, + gint *start_pos, + gint *end_pos, + const gchar *new_string) +{ + gint tmp_start, tmp_end, new_start_pos; + gboolean ret; + + if (!new_string || (!start_pos && !end_pos)) + return g_match_info_fetch_pos(match_info, match_num, start_pos, end_pos); + + ret = g_match_info_fetch_pos(match_info, match_num, &tmp_start, &tmp_end); + if (start_pos || end_pos) { + gchar *to_start = g_strndup(new_string, tmp_start); + new_start_pos = strlen_pua_oddly(to_start); + g_free_not_null(to_start); + + if (start_pos) + *start_pos = new_start_pos; + + if (end_pos) { + gchar *to_end = g_strndup(new_string + tmp_start, tmp_end - tmp_start); + *end_pos = new_start_pos + strlen_pua_oddly(to_end); + g_free_not_null(to_end); + } + } + return ret; +} diff --git a/src/core/iregex-regexh.c b/src/core/iregex-regexh.c new file mode 100644 index 00000000..aabe44f6 --- /dev/null +++ b/src/core/iregex-regexh.c @@ -0,0 +1,101 @@ +#include "iregex.h" + +Regex * +i_regex_new (const gchar *pattern, + GRegexCompileFlags compile_options, + GRegexMatchFlags match_options, + GError **error) 
+{ + Regex *regex; + char *errbuf; + int cflags; + int errcode, errbuf_len; + + regex = g_new0(Regex, 1); + cflags = REG_EXTENDED; + if (compile_options & G_REGEX_CASELESS) + cflags |= REG_ICASE; + if (compile_options & G_REGEX_MULTILINE) + cflags |= REG_NEWLINE; + if (match_options & G_REGEX_MATCH_NOTBOL) + cflags |= REG_NOTBOL; + if (match_options & G_REGEX_MATCH_NOTEOL) + cflags |= REG_NOTEOL; + + errcode = regcomp(regex, pattern, cflags); + if (errcode != 0) { + errbuf_len = regerror(errcode, regex, 0, 0); + errbuf = g_malloc(errbuf_len); + regerror(errcode, regex, errbuf, errbuf_len); + g_set_error(error, G_REGEX_ERROR, errcode, "%s", errbuf); + g_free(errbuf); + g_free(regex); + return NULL; + } else { + return regex; + } +} + +void +i_regex_unref (Regex *regex) +{ + regfree(regex); + g_free(regex); +} + +gboolean +i_regex_match (const Regex *regex, + const gchar *string, + GRegexMatchFlags match_options, + MatchInfo **match_info, + const gchar **new_string) +{ + int groups; + int eflags; + + g_return_val_if_fail(regex != NULL, FALSE); + + if (match_info != NULL) { + groups = 1 + regex->re_nsub; + *match_info = g_new0(MatchInfo, groups); + } else { + groups = 0; + } + + eflags = 0; + if (match_options & G_REGEX_MATCH_NOTBOL) + eflags |= REG_NOTBOL; + if (match_options & G_REGEX_MATCH_NOTEOL) + eflags |= REG_NOTEOL; + + return regexec(regex, string, groups, groups ? 
*match_info : NULL, eflags) == 0; +} + +gboolean +i_match_info_fetch_pos (const MatchInfo *match_info, + gint match_num, + gint *start_pos, + gint *end_pos, + const gchar *new_string) +{ + if (start_pos != NULL) + *start_pos = match_info[match_num].rm_so; + if (end_pos != NULL) + *end_pos = match_info[match_num].rm_eo; + + return TRUE; +} + +gboolean +i_match_info_matches (const MatchInfo *match_info) +{ + g_return_val_if_fail(match_info != NULL, FALSE); + + return match_info[0].rm_so != -1; +} + +void +i_match_info_free (MatchInfo *match_info) +{ + g_free(match_info); +} diff --git a/src/core/iregex.h b/src/core/iregex.h new file mode 100644 index 00000000..adeea987 --- /dev/null +++ b/src/core/iregex.h @@ -0,0 +1,52 @@ +#ifndef __REGEX_H +#define __REGEX_H + +#include "common.h" + +#ifdef USE_GREGEX + +#include <glib.h> +typedef GRegex Regex; +typedef GMatchInfo MatchInfo; + +#define i_match_info_matches g_match_info_matches +#define i_match_info_free g_match_info_free + +#else + +#include <regex.h> +typedef regex_t Regex; +typedef regmatch_t MatchInfo; + +gboolean +i_match_info_matches (const MatchInfo *match_info); + +void +i_match_info_free (MatchInfo *match_info); + +#endif + +Regex * +i_regex_new (const gchar *pattern, + GRegexCompileFlags compile_options, + GRegexMatchFlags match_options, + GError **error); + +void +i_regex_unref (Regex *regex); + +gboolean +i_regex_match (const Regex *regex, + const gchar *string, + GRegexMatchFlags match_options, + MatchInfo **match_info, + const gchar **new_string); + +gboolean +i_match_info_fetch_pos (const MatchInfo *match_info, + gint match_num, + gint *start_pos, + gint *end_pos, + const gchar *new_string); + +#endif diff --git a/src/core/misc.c b/src/core/misc.c index 0f038cbb..4b1e72f6 100644 --- a/src/core/misc.c +++ b/src/core/misc.c @@ -22,10 +22,6 @@ #include "misc.h" #include "commands.h" -#ifndef USE_GREGEX -# include <regex.h> -#endif - typedef struct { int condition; GInputFunction function; -- cgit v1.2.3 
From 
00354c365187cecb9bc3ce3c3b3482e32d04729a Mon Sep 17 00:00:00 2001 From: ailin-nemui Date: Mon, 5 Jun 2017 10:10:38 +0200 Subject: Update iregex-gregex.c make helper functions static --- src/core/iregex-gregex.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src/core') diff --git a/src/core/iregex-gregex.c b/src/core/iregex-gregex.c index 0de11e64..1a77d5b3 100644 --- a/src/core/iregex-gregex.c +++ b/src/core/iregex-gregex.c @@ -2,7 +2,7 @@ #include "iregex.h" -const gchar * +static const gchar * make_valid_utf8(const gchar *text, gboolean *free_ret) { GString *str; @@ -82,7 +82,7 @@ i_regex_match (const Regex *regex, return ret; } -gsize +static gsize strlen_pua_oddly(const char *str) { const gchar *ptr; -- cgit v1.2.3 From 48899a123d68051fbc73acb8ad151e89fdcb6b31 Mon Sep 17 00:00:00 2001 From: ailin-nemui Date: Mon, 5 Jun 2017 10:23:16 +0200 Subject: Update iregex-gregex.c add 2 comments about new_string --- src/core/iregex-gregex.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'src/core') diff --git a/src/core/iregex-gregex.c b/src/core/iregex-gregex.c index 1a77d5b3..4a1623b9 100644 --- a/src/core/iregex-gregex.c +++ b/src/core/iregex-gregex.c @@ -60,7 +60,9 @@ i_regex_unref (Regex *regex) } /* if new_string is present, the caller must free new_string. - otherwise, g_match_info_get_string must not be used. */ + otherwise, g_match_info_get_string must not be used. + if string is not vali utf8, new_string will be assigned + a similar, but valid utf8, string */ gboolean i_regex_match (const Regex *regex, const gchar *string, @@ -105,6 +107,9 @@ strlen_pua_oddly(const char *str) return ret; } +/* new_string should be passed in here from the i_regex_match call. 
+ The start_pos and end_pos will then be calculated as if they were on + the original string */ gboolean i_match_info_fetch_pos (const MatchInfo *match_info, gint match_num, -- cgit v1.2.3 From 4edfccfce794d4c10b2a92c02fe982bb089c6629 Mon Sep 17 00:00:00 2001 From: ailin-nemui Date: Mon, 5 Jun 2017 11:41:50 +0200 Subject: get rid of new_text --- src/core/ignore.c | 2 +- src/core/iregex-gregex.c | 55 ++++++++++++++++++++++++++++++++++-------------- src/core/iregex-regexh.c | 6 ++---- src/core/iregex.h | 15 +++++-------- 4 files changed, 47 insertions(+), 31 deletions(-) (limited to 'src/core') diff --git a/src/core/ignore.c b/src/core/ignore.c index 63a507f5..cec91e6b 100644 --- a/src/core/ignore.c +++ b/src/core/ignore.c @@ -69,7 +69,7 @@ static int ignore_match_pattern(IGNORE_REC *rec, const char *text) if (rec->regexp) { return rec->preg != NULL && - i_regex_match(rec->preg, text, 0, NULL, NULL); + i_regex_match(rec->preg, text, 0, NULL); } return rec->fullword ? diff --git a/src/core/iregex-gregex.c b/src/core/iregex-gregex.c index 4a1623b9..36b4faa4 100644 --- a/src/core/iregex-gregex.c +++ b/src/core/iregex-gregex.c @@ -2,6 +2,11 @@ #include "iregex.h" +struct _MatchInfo { + const char *valid_string; + GMatchInfo *g_match_info; +}; + static const gchar * make_valid_utf8(const gchar *text, gboolean *free_ret) { @@ -59,28 +64,29 @@ i_regex_unref (Regex *regex) g_regex_unref(regex); } -/* if new_string is present, the caller must free new_string. - otherwise, g_match_info_get_string must not be used. 
- if string is not vali utf8, new_string will be assigned - a similar, but valid utf8, string */ gboolean i_regex_match (const Regex *regex, const gchar *string, GRegexMatchFlags match_options, - MatchInfo **match_info, - const gchar **new_string) + MatchInfo **match_info) { gboolean ret; gboolean free_valid_string; const gchar *valid_string = make_valid_utf8(string, &free_valid_string); - ret = g_regex_match(regex, valid_string, match_options, match_info); + if (match_info != NULL) + *match_info = g_new0(MatchInfo, 1); + + ret = g_regex_match(regex, valid_string, match_options, + match_info != NULL ? &(*match_info)->g_match_info : NULL); + if (free_valid_string) { - if (new_string) - *new_string = valid_string; + if (match_info != NULL) + (*match_info)->valid_string = valid_string; else g_free_not_null((gchar *)valid_string); } + return ret; } @@ -114,18 +120,20 @@ gboolean i_match_info_fetch_pos (const MatchInfo *match_info, gint match_num, gint *start_pos, - gint *end_pos, - const gchar *new_string) + gint *end_pos) { gint tmp_start, tmp_end, new_start_pos; gboolean ret; - if (!new_string || (!start_pos && !end_pos)) - return g_match_info_fetch_pos(match_info, match_num, start_pos, end_pos); + if (!match_info->valid_string || (!start_pos && !end_pos)) + return g_match_info_fetch_pos(match_info->g_match_info, + match_num, start_pos, end_pos); - ret = g_match_info_fetch_pos(match_info, match_num, &tmp_start, &tmp_end); + ret = g_match_info_fetch_pos(match_info->g_match_info, + match_num, &tmp_start, &tmp_end); if (start_pos || end_pos) { - gchar *to_start = g_strndup(new_string, tmp_start); + const gchar *str = match_info->valid_string; + gchar *to_start = g_strndup(str, tmp_start); new_start_pos = strlen_pua_oddly(to_start); g_free_not_null(to_start); @@ -133,10 +141,25 @@ i_match_info_fetch_pos (const MatchInfo *match_info, *start_pos = new_start_pos; if (end_pos) { - gchar *to_end = g_strndup(new_string + tmp_start, tmp_end - tmp_start); + gchar *to_end = 
g_strndup(str + tmp_start, tmp_end - tmp_start); *end_pos = new_start_pos + strlen_pua_oddly(to_end); g_free_not_null(to_end); } } return ret; } + +gboolean +i_match_info_matches (const MatchInfo *match_info) +{ + g_return_val_if_fail(match_info != NULL, FALSE); + + return g_match_info_matches(match_info->g_match_info); +} + +void +i_match_info_free (MatchInfo *match_info) +{ + g_match_info_free(match_info->g_match_info); + g_free(match_info); +} diff --git a/src/core/iregex-regexh.c b/src/core/iregex-regexh.c index aabe44f6..897eb7e2 100644 --- a/src/core/iregex-regexh.c +++ b/src/core/iregex-regexh.c @@ -47,8 +47,7 @@ gboolean i_regex_match (const Regex *regex, const gchar *string, GRegexMatchFlags match_options, - MatchInfo **match_info, - const gchar **new_string) + MatchInfo **match_info) { int groups; int eflags; @@ -75,8 +74,7 @@ gboolean i_match_info_fetch_pos (const MatchInfo *match_info, gint match_num, gint *start_pos, - gint *end_pos, - const gchar *new_string) + gint *end_pos) { if (start_pos != NULL) *start_pos = match_info[match_num].rm_so; diff --git a/src/core/iregex.h b/src/core/iregex.h index adeea987..e67378d7 100644 --- a/src/core/iregex.h +++ b/src/core/iregex.h @@ -7,10 +7,7 @@ #include <glib.h> typedef GRegex Regex; -typedef GMatchInfo MatchInfo; - -#define i_match_info_matches g_match_info_matches -#define i_match_info_free g_match_info_free +typedef struct _MatchInfo MatchInfo; #else @@ -18,14 +15,14 @@ typedef GMatchInfo MatchInfo; typedef regex_t Regex; typedef regmatch_t MatchInfo; +#endif + gboolean i_match_info_matches (const MatchInfo *match_info); void i_match_info_free (MatchInfo *match_info); -#endif - Regex * i_regex_new (const gchar *pattern, GRegexCompileFlags compile_options, @@ -39,14 +36,12 @@ gboolean i_regex_match (const Regex *regex, const gchar *string, GRegexMatchFlags match_options, - MatchInfo **match_info, - const gchar **new_string); + MatchInfo **match_info); gboolean i_match_info_fetch_pos (const MatchInfo *match_info, 
gint match_num, gint *start_pos, - gint *end_pos, - const gchar *new_string); + gint *end_pos); #endif -- cgit v1.2.3