diff options
Diffstat (limited to 'src/core/iregex-gregex.c')
-rw-r--r-- | src/core/iregex-gregex.c | 165 |
1 files changed, 165 insertions, 0 deletions
diff --git a/src/core/iregex-gregex.c b/src/core/iregex-gregex.c new file mode 100644 index 00000000..36b4faa4 --- /dev/null +++ b/src/core/iregex-gregex.c @@ -0,0 +1,165 @@ +#include <string.h> + +#include "iregex.h" + +struct _MatchInfo { + const char *valid_string; + GMatchInfo *g_match_info; +}; + +static const gchar * +make_valid_utf8(const gchar *text, gboolean *free_ret) +{ + GString *str; + const gchar *ptr; + if (g_utf8_validate(text, -1, NULL)) { + if (free_ret) + *free_ret = FALSE; + return text; + } + + str = g_string_sized_new(strlen(text) + 12); + + ptr = text; + while (*ptr) { + gunichar c = g_utf8_get_char_validated(ptr, -1); + /* the unicode is invalid */ + if (c == (gunichar)-1 || c == (gunichar)-2) { + /* encode the byte into PUA-A */ + g_string_append_unichar(str, (gunichar) (0xfff00 | (*ptr & 0xff))); + ptr++; + } else { + g_string_append_unichar(str, c); + ptr = g_utf8_next_char(ptr); + } + } + + if (free_ret) + *free_ret = TRUE; + return g_string_free(str, FALSE); +} + +Regex * +i_regex_new (const gchar *pattern, + GRegexCompileFlags compile_options, + GRegexMatchFlags match_options, + GError **error) +{ + const gchar *valid_pattern; + gboolean free_valid_pattern; + Regex *ret = NULL; + + valid_pattern = make_valid_utf8(pattern, &free_valid_pattern); + ret = g_regex_new(valid_pattern, compile_options, match_options, error); + + if (free_valid_pattern) + g_free_not_null((gchar *)valid_pattern); + + return ret; +} + +void +i_regex_unref (Regex *regex) +{ + g_regex_unref(regex); +} + +gboolean +i_regex_match (const Regex *regex, + const gchar *string, + GRegexMatchFlags match_options, + MatchInfo **match_info) +{ + gboolean ret; + gboolean free_valid_string; + const gchar *valid_string = make_valid_utf8(string, &free_valid_string); + + if (match_info != NULL) + *match_info = g_new0(MatchInfo, 1); + + ret = g_regex_match(regex, valid_string, match_options, + match_info != NULL ? &(*match_info)->g_match_info : NULL); + + if (free_valid_string) { + if (match_info != NULL) + (*match_info)->valid_string = valid_string; + else + g_free_not_null((gchar *)valid_string); + } + + return ret; +} + +static gsize +strlen_pua_oddly(const char *str) +{ + const gchar *ptr; + gsize ret = 0; + ptr = str; + + while (*ptr) { + const gchar *old; + gunichar c = g_utf8_get_char(ptr); + old = ptr; + ptr = g_utf8_next_char(ptr); + + /* it is our PUA encoded byte */ + if ((c & 0xfff00) == 0xfff00) + ret++; + else + ret += ptr - old; + } + + return ret; +} + +/* new_string should be passed in here from the i_regex_match call. + The start_pos and end_pos will then be calculated as if they were on + the original string */ +gboolean +i_match_info_fetch_pos (const MatchInfo *match_info, + gint match_num, + gint *start_pos, + gint *end_pos) +{ + gint tmp_start, tmp_end, new_start_pos; + gboolean ret; + + if (!match_info->valid_string || (!start_pos && !end_pos)) + return g_match_info_fetch_pos(match_info->g_match_info, + match_num, start_pos, end_pos); + + ret = g_match_info_fetch_pos(match_info->g_match_info, + match_num, &tmp_start, &tmp_end); + if (start_pos || end_pos) { + const gchar *str = match_info->valid_string; + gchar *to_start = g_strndup(str, tmp_start); + new_start_pos = strlen_pua_oddly(to_start); + g_free_not_null(to_start); + + if (start_pos) + *start_pos = new_start_pos; + + if (end_pos) { + gchar *to_end = g_strndup(str + tmp_start, tmp_end - tmp_start); + *end_pos = new_start_pos + strlen_pua_oddly(to_end); + g_free_not_null(to_end); + } + } + return ret; +} + +gboolean +i_match_info_matches (const MatchInfo *match_info) +{ + g_return_val_if_fail(match_info != NULL, FALSE); + + return g_match_info_matches(match_info->g_match_info); +} + +void +i_match_info_free (MatchInfo *match_info) +{ + g_match_info_free(match_info->g_match_info); + g_free(match_info); +} |