core: improve case conversion and case-insensitive char comparisons (closes #258)

All lowercase letters are now properly converted to uppercase letters (and vice versa) via the functions `towupper` and `towlower`. The functions `string_tolower`, `string_toupper` and `utf8_charcasecmp` have been optimized for ASCII chars (< 128): they are about 25-40% faster with mixed chars (both ASCII and multi-byte). The function `utf8_wide_char` has been removed; `utf8_char_int` can be used instead.
author: Sébastien Helleu <flashcode@flashtux.org> 2022-12-21 19:23:29 +0100
committer: Sébastien Helleu <flashcode@flashtux.org> 2022-12-21 20:49:09 +0100
commit: 68b510517e7a14b2d2457f8437e9291b87e0d1d5 (patch)
tree: a3fae5b8673ec860f49315bb1b0ec72e74cf54d1 /src
parent: 95286c1eb362cedb767597ea23fb29d6455f6b94 (diff)
download: weechat-68b510517e7a14b2d2457f8437e9291b87e0d1d5.zip
4 files changed, 83 insertions, 93 deletions
diff --git a/src/core/wee-config.c b/src/core/wee-config.c
index c5bb8cd3f..f84c49626 100644
--- a/src/core/wee-config.c
+++ b/src/core/wee-config.c
@@ -500,7 +500,7 @@ config_set_word_chars (const char *str_word_chars,
                 /* char1 */
                 item = string_strndup (ptr_item, pos - ptr_item);
                 item2 = string_convert_escaped_chars (item);
-                (*word_chars)[i].char1 = utf8_wide_char (item2);
+                (*word_chars)[i].char1 = utf8_char_int (item2);
                 if (item)
                     free (item);
                 if (item2)
@@ -508,7 +508,7 @@ config_set_word_chars (const char *str_word_chars,
                 /* char2 */
                 item = strdup (pos + 1);
                 item2 = string_convert_escaped_chars (item);
-                (*word_chars)[i].char2 = utf8_wide_char (item2);
+                (*word_chars)[i].char2 = utf8_char_int (item2);
                 if (item)
                     free (item);
                 if (item2)
@@ -521,7 +521,7 @@ config_set_word_chars (const char *str_word_chars,
                 if ((*word_chars)[i].wc_class == (wctype_t)0)
                 {
                     item = string_convert_escaped_chars (ptr_item);
-                    (*word_chars)[i].char1 = utf8_wide_char (item);
+                    (*word_chars)[i].char1 = utf8_char_int (item);
                     (*word_chars)[i].char2 = (*word_chars)[i].char1;
                     if (item)
                         free (item);
diff --git a/src/core/wee-string.c b/src/core/wee-string.c
index 66bd82153..14bc245e0 100644
--- a/src/core/wee-string.c
+++ b/src/core/wee-string.c
@@ -31,8 +31,8 @@
 #include <string.h>
 #include <ctype.h>
 #include <wctype.h>
-#include <regex.h>
 #include <wchar.h>
+#include <regex.h>
 #include <stdint.h>
 #include <gcrypt.h>
 
@@ -308,67 +308,89 @@ string_repeat (const char *string, int count)
 }
 
 /*
- * Converts uppercase letters to lowercase.
- *
- * This function is locale independent: only letters 'A' to 'Z' without accents
- * are converted to lowercase. All other chars are kept as-is.
- *
- * Note: result must be freed after use.
+ * Converts string to lowercase (locale dependent).
  */
 
 char *
 string_tolower (const char *string)
 {
-    char *result, *ptr_result;
+    char **result, utf_char[5];
 
     if (!string)
         return NULL;
 
-    result = strdup (string);
+    result = string_dyn_alloc (strlen (string) + 1);
     if (!result)
         return NULL;
 
-    ptr_result = result;
-    while (ptr_result && ptr_result[0])
+    while (string && string[0])
     {
-        if ((ptr_result[0] >= 'A') && (ptr_result[0] <= 'Z'))
-            ptr_result[0] += ('a' - 'A');
-        ptr_result = (char *)utf8_next_char (ptr_result);
+        if (!((unsigned char)(string[0]) & 0x80))
+        {
+            /*
+             * optimization for single-byte char: only letters A-Z must be
+             * converted to lowercase; this is faster than calling `towlower`
+             */
+            if ((string[0] >= 'A') && (string[0] <= 'Z'))
+                utf_char[0] = string[0] + ('a' - 'A');
+            else
+                utf_char[0] = string[0];
+            utf_char[1] = '\0';
+            string_dyn_concat (result, utf_char, -1);
+            string++;
+        }
+        else
+        {
+            /* char ≥ 2 bytes, use `towlower` */
+            utf8_int_string (towlower (utf8_char_int (string)), utf_char);
+            string_dyn_concat (result, utf_char, -1);
+            string = (char *)utf8_next_char (string);
+        }
     }
-
-    return result;
+    return string_dyn_free (result, 0);
 }
 
 /*
- * Converts lowercase letters to uppercase.
- *
- * This function is locale independent: only letters 'a' to 'z' without accents
- * are converted to uppercase. All other chars are kept as-is.
- *
- * Note: result must be freed after use.
+ * Converts string to uppercase (locale dependent).
  */
 
 char *
 string_toupper (const char *string)
 {
-    char *result, *ptr_result;
+    char **result, utf_char[5];
 
     if (!string)
         return NULL;
 
-    result = strdup (string);
+    result = string_dyn_alloc (strlen (string) + 1);
     if (!result)
         return NULL;
 
-    ptr_result = result;
-    while (ptr_result && ptr_result[0])
+    while (string && string[0])
     {
-        if ((ptr_result[0] >= 'a') && (ptr_result[0] <= 'z'))
-            ptr_result[0] -= ('a' - 'A');
-        ptr_result = (char *)utf8_next_char (ptr_result);
+        if (!((unsigned char)(string[0]) & 0x80))
+        {
+            /*
+             * optimization for single-byte char: only letters a-z must be
+             * converted to uppercase; this is faster than calling `towupper`
+             */
+            if ((string[0] >= 'a') && (string[0] <= 'z'))
+                utf_char[0] = string[0] - ('a' - 'A');
+            else
+                utf_char[0] = string[0];
+            utf_char[1] = '\0';
+            string_dyn_concat (result, utf_char, -1);
+            string++;
+        }
+        else
+        {
+            /* char ≥ 2 bytes, use `towupper` */
+            utf8_int_string (towupper (utf8_char_int (string)), utf_char);
+            string_dyn_concat (result, utf_char, -1);
+            string = (char *)utf8_next_char (string);
+        }
     }
-
-    return result;
+    return string_dyn_free (result, 0);
 }
 
 /*
@@ -1174,11 +1196,11 @@ string_is_word_char (const char *string,
     wint_t c;
     int i, match;
 
-    c = utf8_wide_char (string);
-
-    if (c == WEOF)
+    if (!string || !string[0])
         return 0;
 
+    c = utf8_char_int (string);
+
     for (i = 0; i < word_chars_count; i++)
     {
         if (word_chars[i].wc_class != (wctype_t)0)
diff --git a/src/core/wee-utf8.c b/src/core/wee-utf8.c
index c8756a695..8f6a69cff 100644
--- a/src/core/wee-utf8.c
+++ b/src/core/wee-utf8.c
@@ -394,49 +394,6 @@ utf8_int_string (unsigned int unicode_value, char *string)
 }
 
 /*
- * Gets wide char from string (first char).
- *
- * Returns the char as "wint_t", WEOF is string was NULL/empty or in case of
- * error.
- */
-
-wint_t
-utf8_wide_char (const char *string)
-{
-    int char_size;
-    wint_t result;
-
-    if (!string || !string[0])
-        return WEOF;
-
-    char_size = utf8_char_size (string);
-    switch (char_size)
-    {
-        case 1:
-            result = (wint_t)string[0];
-            break;
-        case 2:
-            result = ((wint_t)((unsigned char)string[0])) << 8
-                |  ((wint_t)((unsigned char)string[1]));
-            break;
-        case 3:
-            result = ((wint_t)((unsigned char)string[0])) << 16
-                |  ((wint_t)((unsigned char)string[1])) << 8
-                |  ((wint_t)((unsigned char)string[2]));
-            break;
-        case 4:
-            result = ((wint_t)((unsigned char)string[0])) << 24
-                |  ((wint_t)((unsigned char)string[1])) << 16
-                |  ((wint_t)((unsigned char)string[2])) << 8
-                |  ((wint_t)((unsigned char)string[3]));
-            break;
-        default:
-            result = WEOF;
-    }
-    return result;
-}
-
-/*
  * Gets size of UTF-8 char (in bytes).
  *
  * Returns an integer between 0 and 4.
@@ -626,13 +583,25 @@ utf8_charcasecmp (const char *string1, const char *string2)
     if (!string1 || !string2)
         return (string1) ? 1 : ((string2) ? -1 : 0);
 
-    wchar1 = utf8_wide_char (string1);
-    if ((wchar1 >= 'A') && (wchar1 <= 'Z'))
-        wchar1 += ('a' - 'A');
-
-    wchar2 = utf8_wide_char (string2);
-    if ((wchar2 >= 'A') && (wchar2 <= 'Z'))
-        wchar2 += ('a' - 'A');
+    /*
+     * optimization for single-byte chars: only letters A-Z must be converted
+     * to lowercase; this is faster than calling `towlower`
+     */
+    if (!((unsigned char)(string1[0]) & 0x80)
+        && !((unsigned char)(string2[0]) & 0x80))
+    {
+        wchar1 = string1[0];
+        if ((wchar1 >= 'A') && (wchar1 <= 'Z'))
+            wchar1 += ('a' - 'A');
+        wchar2 = string2[0];
+        if ((wchar2 >= 'A') && (wchar2 <= 'Z'))
+            wchar2 += ('a' - 'A');
+    }
+    else
+    {
+        wchar1 = towlower (utf8_char_int (string1));
+        wchar2 = towlower (utf8_char_int (string2));
+    }
 
     return (wchar1 < wchar2) ? -1 : ((wchar1 == wchar2) ? 0 : 1);
 }
@@ -658,17 +627,17 @@ utf8_charcasecmp (const char *string1, const char *string2)
 int
 utf8_charcasecmp_range (const char *string1, const char *string2, int range)
 {
-    wint_t wchar1, wchar2;
+    wchar_t wchar1, wchar2;
 
     if (!string1 || !string2)
         return (string1) ? 1 : ((string2) ? -1 : 0);
 
-    wchar1 = utf8_wide_char (string1);
-    if ((wchar1 >= (wint_t)'A') && (wchar1 < (wint_t)('A' + range)))
+    wchar1 = utf8_char_int (string1);
+    if ((wchar1 >= (wchar_t)'A') && (wchar1 < (wchar_t)('A' + range)))
         wchar1 += ('a' - 'A');
 
-    wchar2 = utf8_wide_char (string2);
-    if ((wchar2 >= (wint_t)'A') && (wchar2 < (wint_t)('A' + range)))
+    wchar2 = utf8_char_int (string2);
+    if ((wchar2 >= (wchar_t)'A') && (wchar2 < (wchar_t)('A' + range)))
         wchar2 += ('a' - 'A');
 
     return (wchar1 < wchar2) ? -1 : ((wchar1 == wchar2) ? 0 : 1);
diff --git a/src/core/wee-utf8.h b/src/core/wee-utf8.h
index 6c5771860..9507ea05d 100644
--- a/src/core/wee-utf8.h
+++ b/src/core/wee-utf8.h
@@ -37,7 +37,6 @@ extern const char *utf8_prev_char (const char *string_start,
 extern const char *utf8_next_char (const char *string);
 extern int utf8_char_int (const char *string);
 extern int utf8_int_string (unsigned int unicode_value, char *string);
-extern wint_t utf8_wide_char (const char *string);
 extern int utf8_char_size (const char *string);
 extern int utf8_strlen (const char *string);
 extern int utf8_strnlen (const char *string, int bytes);
author	Sébastien Helleu <flashcode@flashtux.org>	2022-12-21 19:23:29 +0100
committer	Sébastien Helleu <flashcode@flashtux.org>	2022-12-21 20:49:09 +0100
commit	68b510517e7a14b2d2457f8437e9291b87e0d1d5 (patch)
tree	a3fae5b8673ec860f49315bb1b0ec72e74cf54d1 /src
parent	95286c1eb362cedb767597ea23fb29d6455f6b94 (diff)
download	weechat-68b510517e7a14b2d2457f8437e9291b87e0d1d5.zip