summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author Sébastien Helleu <flashcode@flashtux.org> 2022-12-21 19:23:29 +0100
committer Sébastien Helleu <flashcode@flashtux.org> 2022-12-21 20:49:09 +0100
commit 68b510517e7a14b2d2457f8437e9291b87e0d1d5 (patch)
tree a3fae5b8673ec860f49315bb1b0ec72e74cf54d1 /src
parent 95286c1eb362cedb767597ea23fb29d6455f6b94 (diff)
download weechat-68b510517e7a14b2d2457f8437e9291b87e0d1d5.zip
core: improve case convert and insensitive char comparisons (closes #258)
All lowercase letters are now properly converted to uppercase letters (and vice versa), via functions `towupper` and `towlower`. Functions `string_tolower`, `string_toupper` and `utf8_charcasecmp` have been optimized to be faster when there are ASCII chars (< 128); functions are about 25-40% faster with mixed chars (both ASCII and multi-bytes). Function `utf8_wide_char` has been removed, `utf8_char_int` can be used instead.
Diffstat (limited to 'src')
-rw-r--r-- src/core/wee-config.c 6
-rw-r--r-- src/core/wee-string.c 90
-rw-r--r-- src/core/wee-utf8.c 79
-rw-r--r-- src/core/wee-utf8.h 1
4 files changed, 83 insertions, 93 deletions
diff --git a/src/core/wee-config.c b/src/core/wee-config.c
index c5bb8cd3f..f84c49626 100644
--- a/src/core/wee-config.c
+++ b/src/core/wee-config.c
@@ -500,7 +500,7 @@ config_set_word_chars (const char *str_word_chars,
/* char1 */
item = string_strndup (ptr_item, pos - ptr_item);
item2 = string_convert_escaped_chars (item);
- (*word_chars)[i].char1 = utf8_wide_char (item2);
+ (*word_chars)[i].char1 = utf8_char_int (item2);
if (item)
free (item);
if (item2)
@@ -508,7 +508,7 @@ config_set_word_chars (const char *str_word_chars,
/* char2 */
item = strdup (pos + 1);
item2 = string_convert_escaped_chars (item);
- (*word_chars)[i].char2 = utf8_wide_char (item2);
+ (*word_chars)[i].char2 = utf8_char_int (item2);
if (item)
free (item);
if (item2)
@@ -521,7 +521,7 @@ config_set_word_chars (const char *str_word_chars,
if ((*word_chars)[i].wc_class == (wctype_t)0)
{
item = string_convert_escaped_chars (ptr_item);
- (*word_chars)[i].char1 = utf8_wide_char (item);
+ (*word_chars)[i].char1 = utf8_char_int (item);
(*word_chars)[i].char2 = (*word_chars)[i].char1;
if (item)
free (item);
diff --git a/src/core/wee-string.c b/src/core/wee-string.c
index 66bd82153..14bc245e0 100644
--- a/src/core/wee-string.c
+++ b/src/core/wee-string.c
@@ -31,8 +31,8 @@
#include <string.h>
#include <ctype.h>
#include <wctype.h>
-#include <regex.h>
#include <wchar.h>
+#include <regex.h>
#include <stdint.h>
#include <gcrypt.h>
@@ -308,67 +308,89 @@ string_repeat (const char *string, int count)
}
/*
- * Converts uppercase letters to lowercase.
- *
- * This function is locale independent: only letters 'A' to 'Z' without accents
- * are converted to lowercase. All other chars are kept as-is.
- *
- * Note: result must be freed after use.
+ * Converts string to lowercase (locale dependent).
*/
char *
string_tolower (const char *string)
{
- char *result, *ptr_result;
+ char **result, utf_char[5];
if (!string)
return NULL;
- result = strdup (string);
+ result = string_dyn_alloc (strlen (string) + 1);
if (!result)
return NULL;
- ptr_result = result;
- while (ptr_result && ptr_result[0])
+ while (string && string[0])
{
- if ((ptr_result[0] >= 'A') && (ptr_result[0] <= 'Z'))
- ptr_result[0] += ('a' - 'A');
- ptr_result = (char *)utf8_next_char (ptr_result);
+ if (!((unsigned char)(string[0]) & 0x80))
+ {
+ /*
+ * optimization for single-byte char: only letters A-Z must be
+ * converted to lowercase; this is faster than calling `towlower`
+ */
+ if ((string[0] >= 'A') && (string[0] <= 'Z'))
+ utf_char[0] = string[0] + ('a' - 'A');
+ else
+ utf_char[0] = string[0];
+ utf_char[1] = '\0';
+ string_dyn_concat (result, utf_char, -1);
+ string++;
+ }
+ else
+ {
+ /* char ≥ 2 bytes, use `towlower` */
+ utf8_int_string (towlower (utf8_char_int (string)), utf_char);
+ string_dyn_concat (result, utf_char, -1);
+ string = (char *)utf8_next_char (string);
+ }
}
-
- return result;
+ return string_dyn_free (result, 0);
}
/*
- * Converts lowercase letters to uppercase.
- *
- * This function is locale independent: only letters 'a' to 'z' without accents
- * are converted to uppercase. All other chars are kept as-is.
- *
- * Note: result must be freed after use.
+ * Converts string to uppercase (locale dependent).
*/
char *
string_toupper (const char *string)
{
- char *result, *ptr_result;
+ char **result, utf_char[5];
if (!string)
return NULL;
- result = strdup (string);
+ result = string_dyn_alloc (strlen (string) + 1);
if (!result)
return NULL;
- ptr_result = result;
- while (ptr_result && ptr_result[0])
+ while (string && string[0])
{
- if ((ptr_result[0] >= 'a') && (ptr_result[0] <= 'z'))
- ptr_result[0] -= ('a' - 'A');
- ptr_result = (char *)utf8_next_char (ptr_result);
+ if (!((unsigned char)(string[0]) & 0x80))
+ {
+ /*
+ * optimization for single-byte char: only letters a-z must be
+ * converted to uppercase; this is faster than calling `towupper`
+ */
+ if ((string[0] >= 'a') && (string[0] <= 'z'))
+ utf_char[0] = string[0] - ('a' - 'A');
+ else
+ utf_char[0] = string[0];
+ utf_char[1] = '\0';
+ string_dyn_concat (result, utf_char, -1);
+ string++;
+ }
+ else
+ {
+ /* char ≥ 2 bytes, use `towupper` */
+ utf8_int_string (towupper (utf8_char_int (string)), utf_char);
+ string_dyn_concat (result, utf_char, -1);
+ string = (char *)utf8_next_char (string);
+ }
}
-
- return result;
+ return string_dyn_free (result, 0);
}
/*
@@ -1174,11 +1196,11 @@ string_is_word_char (const char *string,
wint_t c;
int i, match;
- c = utf8_wide_char (string);
-
- if (c == WEOF)
+ if (!string || !string[0])
return 0;
+ c = utf8_char_int (string);
+
for (i = 0; i < word_chars_count; i++)
{
if (word_chars[i].wc_class != (wctype_t)0)
diff --git a/src/core/wee-utf8.c b/src/core/wee-utf8.c
index c8756a695..8f6a69cff 100644
--- a/src/core/wee-utf8.c
+++ b/src/core/wee-utf8.c
@@ -394,49 +394,6 @@ utf8_int_string (unsigned int unicode_value, char *string)
}
/*
- * Gets wide char from string (first char).
- *
- * Returns the char as "wint_t", WEOF is string was NULL/empty or in case of
- * error.
- */
-
-wint_t
-utf8_wide_char (const char *string)
-{
- int char_size;
- wint_t result;
-
- if (!string || !string[0])
- return WEOF;
-
- char_size = utf8_char_size (string);
- switch (char_size)
- {
- case 1:
- result = (wint_t)string[0];
- break;
- case 2:
- result = ((wint_t)((unsigned char)string[0])) << 8
- | ((wint_t)((unsigned char)string[1]));
- break;
- case 3:
- result = ((wint_t)((unsigned char)string[0])) << 16
- | ((wint_t)((unsigned char)string[1])) << 8
- | ((wint_t)((unsigned char)string[2]));
- break;
- case 4:
- result = ((wint_t)((unsigned char)string[0])) << 24
- | ((wint_t)((unsigned char)string[1])) << 16
- | ((wint_t)((unsigned char)string[2])) << 8
- | ((wint_t)((unsigned char)string[3]));
- break;
- default:
- result = WEOF;
- }
- return result;
-}
-
-/*
* Gets size of UTF-8 char (in bytes).
*
* Returns an integer between 0 and 4.
@@ -626,13 +583,25 @@ utf8_charcasecmp (const char *string1, const char *string2)
if (!string1 || !string2)
return (string1) ? 1 : ((string2) ? -1 : 0);
- wchar1 = utf8_wide_char (string1);
- if ((wchar1 >= 'A') && (wchar1 <= 'Z'))
- wchar1 += ('a' - 'A');
-
- wchar2 = utf8_wide_char (string2);
- if ((wchar2 >= 'A') && (wchar2 <= 'Z'))
- wchar2 += ('a' - 'A');
+ /*
+ * optimization for single-byte chars: only letters A-Z must be converted
+ * to lowercase; this is faster than calling `towlower`
+ */
+ if (!((unsigned char)(string1[0]) & 0x80)
+ && !((unsigned char)(string2[0]) & 0x80))
+ {
+ wchar1 = string1[0];
+ if ((wchar1 >= 'A') && (wchar1 <= 'Z'))
+ wchar1 += ('a' - 'A');
+ wchar2 = string2[0];
+ if ((wchar2 >= 'A') && (wchar2 <= 'Z'))
+ wchar2 += ('a' - 'A');
+ }
+ else
+ {
+ wchar1 = towlower (utf8_char_int (string1));
+ wchar2 = towlower (utf8_char_int (string2));
+ }
return (wchar1 < wchar2) ? -1 : ((wchar1 == wchar2) ? 0 : 1);
}
@@ -658,17 +627,17 @@ utf8_charcasecmp (const char *string1, const char *string2)
int
utf8_charcasecmp_range (const char *string1, const char *string2, int range)
{
- wint_t wchar1, wchar2;
+ wchar_t wchar1, wchar2;
if (!string1 || !string2)
return (string1) ? 1 : ((string2) ? -1 : 0);
- wchar1 = utf8_wide_char (string1);
- if ((wchar1 >= (wint_t)'A') && (wchar1 < (wint_t)('A' + range)))
+ wchar1 = utf8_char_int (string1);
+ if ((wchar1 >= (wchar_t)'A') && (wchar1 < (wchar_t)('A' + range)))
wchar1 += ('a' - 'A');
- wchar2 = utf8_wide_char (string2);
- if ((wchar2 >= (wint_t)'A') && (wchar2 < (wint_t)('A' + range)))
+ wchar2 = utf8_char_int (string2);
+ if ((wchar2 >= (wchar_t)'A') && (wchar2 < (wchar_t)('A' + range)))
wchar2 += ('a' - 'A');
return (wchar1 < wchar2) ? -1 : ((wchar1 == wchar2) ? 0 : 1);
diff --git a/src/core/wee-utf8.h b/src/core/wee-utf8.h
index 6c5771860..9507ea05d 100644
--- a/src/core/wee-utf8.h
+++ b/src/core/wee-utf8.h
@@ -37,7 +37,6 @@ extern const char *utf8_prev_char (const char *string_start,
extern const char *utf8_next_char (const char *string);
extern int utf8_char_int (const char *string);
extern int utf8_int_string (unsigned int unicode_value, char *string);
-extern wint_t utf8_wide_char (const char *string);
extern int utf8_char_size (const char *string);
extern int utf8_strlen (const char *string);
extern int utf8_strnlen (const char *string, int bytes);