1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
|
#include <string.h>
#include "iregex.h"
/* Match result wrapper returned through i_regex_match's match_info
   out-parameter. */
struct _MatchInfo {
/* Sanitized copy of the subject string. i_regex_match stores it here only
   when the input was not valid UTF-8 and had to be rewritten; otherwise the
   field stays NULL (the struct is allocated zeroed). */
const char *valid_string;
/* The underlying GLib match result filled in by g_regex_match. */
GMatchInfo *g_match_info;
};
/* Return a valid UTF-8 rendition of text.
 *
 * If text already validates, it is returned unchanged and *free_ret is set
 * to FALSE. Otherwise a newly allocated string is built in which every
 * well-formed character is copied through and every offending byte is
 * replaced by the code point 0xfff00 | byte; *free_ret is then set to TRUE
 * and the caller owns the returned buffer.
 *
 * free_ret may be NULL, in which case the ownership flag is not reported.
 */
static const gchar *
make_valid_utf8(const gchar *text, gboolean *free_ret)
{
	GString *out;
	const gchar *cur;
	gboolean already_valid = g_utf8_validate(text, -1, NULL);

	if (free_ret != NULL)
		*free_ret = !already_valid;

	if (already_valid)
		return text;

	/* a little head-room: each escaped byte grows into a multi-byte sequence */
	out = g_string_sized_new(strlen(text) + 12);

	for (cur = text; *cur != '\0'; ) {
		gunichar ch = g_utf8_get_char_validated(cur, -1);

		if (ch != (gunichar)-1 && ch != (gunichar)-2) {
			/* well-formed character: copy it through as is */
			g_string_append_unichar(out, ch);
			cur = g_utf8_next_char(cur);
			continue;
		}

		/* invalid or truncated sequence: stash the raw byte in PUA-A */
		g_string_append_unichar(out, (gunichar) (0xfff00 | (*cur & 0xff)));
		cur++;
	}

	return g_string_free(out, FALSE);
}
/* Compile pattern into a Regex.
 *
 * The pattern is first passed through make_valid_utf8; the resulting string
 * is what gets handed to g_regex_new together with compile_options,
 * match_options and error. Any temporary copy made for sanitizing is released
 * before returning. The return value is whatever g_regex_new returned.
 */
Regex *
i_regex_new (const gchar *pattern,
             GRegexCompileFlags compile_options,
             GRegexMatchFlags match_options,
             GError **error)
{
	gboolean owns_copy;
	const gchar *safe_pattern = make_valid_utf8(pattern, &owns_copy);
	Regex *compiled = g_regex_new(safe_pattern, compile_options,
				      match_options, error);

	if (owns_copy)
		g_free_not_null((gchar *)safe_pattern);

	return compiled;
}
/* Drop one reference on regex by forwarding it to g_regex_unref. */
void
i_regex_unref (Regex *regex)
{
	g_regex_unref(regex);
}
/* Match string against regex.
 *
 * The subject is sanitized with make_valid_utf8 before it is given to
 * g_regex_match. If match_info is non-NULL a zeroed MatchInfo is allocated
 * into *match_info and receives the GLib match result.
 *
 * When sanitizing produced a fresh copy, that copy is stored in the MatchInfo
 * (valid_string) if one was requested, and released right away otherwise.
 *
 * Returns the result of g_regex_match.
 */
gboolean
i_regex_match (const Regex *regex,
               const gchar *string,
               GRegexMatchFlags match_options,
               MatchInfo **match_info)
{
	gboolean owns_copy;
	const gchar *subject = make_valid_utf8(string, &owns_copy);
	GMatchInfo **g_info_slot = NULL;
	gboolean matched;

	if (match_info != NULL) {
		*match_info = g_new0(MatchInfo, 1);
		g_info_slot = &(*match_info)->g_match_info;
	}

	matched = g_regex_match(regex, subject, match_options, g_info_slot);

	/* nothing to hand over or release when the input was used directly */
	if (!owns_copy)
		return matched;

	if (match_info == NULL)
		g_free_not_null((gchar *)subject);
	else
		(*match_info)->valid_string = subject;

	return matched;
}
/* Length of str measured in bytes of the pre-sanitizing string: every
 * character whose code point has all of the 0xfff00 bits set (the escape
 * used for a single raw byte) counts as 1, every other character counts as
 * the number of bytes it occupies in str.
 */
static gsize
strlen_pua_oddly(const char *str)
{
	const gchar *cur;
	gsize total = 0;

	for (cur = str; *cur != '\0'; ) {
		const gchar *next = g_utf8_next_char(cur);
		gunichar ch = g_utf8_get_char(cur);

		if ((ch & 0xfff00) == 0xfff00) {
			/* one of our escaped bytes: a single byte originally */
			total++;
		} else {
			total += next - cur;
		}

		cur = next;
	}

	return total;
}
/* Fetch the start/end byte offsets of capture group match_num.
 *
 * If the subject had to be sanitized by i_regex_match (valid_string is set),
 * the offsets reported by GLib refer to the sanitized copy; they are
 * translated here so that start_pos and end_pos are calculated as if they
 * were on the original string. Otherwise the call is forwarded to
 * g_match_info_fetch_pos unchanged.
 *
 * start_pos and end_pos may each be NULL.
 *
 * Returns FALSE, leaving start_pos/end_pos untouched, when the position
 * cannot be fetched. When the group exists but did not take part in the
 * match, GLib reports negative offsets; those are passed through as is and
 * TRUE is returned.
 */
gboolean
i_match_info_fetch_pos (const MatchInfo *match_info,
                        gint match_num,
                        gint *start_pos,
                        gint *end_pos)
{
	gint tmp_start, tmp_end, new_start_pos;
	const gchar *str;
	gchar *to_start;

	if (!match_info->valid_string || (!start_pos && !end_pos))
		return g_match_info_fetch_pos(match_info->g_match_info,
					      match_num, start_pos, end_pos);

	/* tmp_start/tmp_end are only written on success, so bail out before
	   touching them on failure */
	if (!g_match_info_fetch_pos(match_info->g_match_info,
				    match_num, &tmp_start, &tmp_end))
		return FALSE;

	/* unmatched group: there is nothing to translate, and a negative
	   length must never reach g_strndup */
	if (tmp_start < 0 || tmp_end < 0) {
		if (start_pos)
			*start_pos = tmp_start;
		if (end_pos)
			*end_pos = tmp_end;
		return TRUE;
	}

	str = match_info->valid_string;

	/* original-string length of everything before the match */
	to_start = g_strndup(str, tmp_start);
	new_start_pos = (gint) strlen_pua_oddly(to_start);
	g_free_not_null(to_start);

	if (start_pos)
		*start_pos = new_start_pos;

	if (end_pos) {
		/* only the matched span itself needs to be measured on top */
		gchar *to_end = g_strndup(str + tmp_start, tmp_end - tmp_start);
		*end_pos = new_start_pos + (gint) strlen_pua_oddly(to_end);
		g_free_not_null(to_end);
	}

	return TRUE;
}
gboolean
i_match_info_matches (const MatchInfo *match_info)
{
g_return_val_if_fail(match_info != NULL, FALSE);
return g_match_info_matches(match_info->g_match_info);
}
/* Release a MatchInfo obtained from i_regex_match, including the sanitized
 * subject copy it may own. Passing NULL is a no-op.
 */
void
i_match_info_free (MatchInfo *match_info)
{
	if (match_info == NULL)
		return;

	/* the GLib match result still refers to the subject string, so it has
	   to go first */
	g_match_info_free(match_info->g_match_info);

	/* valid_string is only set when i_regex_match handed over ownership of
	   a sanitized copy; g_free ignores NULL */
	g_free((gchar *)match_info->valid_string);

	g_free(match_info);
}
|