From 68b510517e7a14b2d2457f8437e9291b87e0d1d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Helleu?= Date: Wed, 21 Dec 2022 19:23:29 +0100 Subject: core: improve case convert and insensitive char comparisons (closes #258) All lowercase letters are now properly converted to uppercase letters (and vice versa), via functions `towupper` and `towlower`. Functions `string_tolower`, `string_toupper` and `utf8_charcasecmp` have been optimized to be faster when there are ASCII chars (< 128); functions are about 25-40% faster with mixed chars (both ASCII and multi-bytes). Function `utf8_wide_char` has been removed, `utf8_char_int` can be used instead. --- src/core/wee-config.c | 6 ++-- src/core/wee-string.c | 90 ++++++++++++++++++++++++++++++++------------------- src/core/wee-utf8.c | 79 ++++++++++++++------------------------------ src/core/wee-utf8.h | 1 - 4 files changed, 83 insertions(+), 93 deletions(-) (limited to 'src') diff --git a/src/core/wee-config.c b/src/core/wee-config.c index c5bb8cd3f..f84c49626 100644 --- a/src/core/wee-config.c +++ b/src/core/wee-config.c @@ -500,7 +500,7 @@ config_set_word_chars (const char *str_word_chars, /* char1 */ item = string_strndup (ptr_item, pos - ptr_item); item2 = string_convert_escaped_chars (item); - (*word_chars)[i].char1 = utf8_wide_char (item2); + (*word_chars)[i].char1 = utf8_char_int (item2); if (item) free (item); if (item2) @@ -508,7 +508,7 @@ config_set_word_chars (const char *str_word_chars, /* char2 */ item = strdup (pos + 1); item2 = string_convert_escaped_chars (item); - (*word_chars)[i].char2 = utf8_wide_char (item2); + (*word_chars)[i].char2 = utf8_char_int (item2); if (item) free (item); if (item2) @@ -521,7 +521,7 @@ config_set_word_chars (const char *str_word_chars, if ((*word_chars)[i].wc_class == (wctype_t)0) { item = string_convert_escaped_chars (ptr_item); - (*word_chars)[i].char1 = utf8_wide_char (item); + (*word_chars)[i].char1 = utf8_char_int (item); (*word_chars)[i].char2 = (*word_chars)[i].char1; if (item) free (item); diff --git a/src/core/wee-string.c b/src/core/wee-string.c index 66bd82153..14bc245e0 100644 --- a/src/core/wee-string.c +++ b/src/core/wee-string.c @@ -31,8 +31,8 @@ #include #include #include -#include #include +#include #include #include @@ -308,67 +308,89 @@ string_repeat (const char *string, int count) } /* - * Converts uppercase letters to lowercase. - * - * This function is locale independent: only letters 'A' to 'Z' without accents - * are converted to lowercase. All other chars are kept as-is. - * - * Note: result must be freed after use. + * Converts string to lowercase (locale dependent). */ char * string_tolower (const char *string) { - char *result, *ptr_result; + char **result, utf_char[5]; if (!string) return NULL; - result = strdup (string); + result = string_dyn_alloc (strlen (string) + 1); if (!result) return NULL; - ptr_result = result; - while (ptr_result && ptr_result[0]) + while (string && string[0]) { - if ((ptr_result[0] >= 'A') && (ptr_result[0] <= 'Z')) - ptr_result[0] += ('a' - 'A'); - ptr_result = (char *)utf8_next_char (ptr_result); + if (!((unsigned char)(string[0]) & 0x80)) + { + /* + * optimization for single-byte char: only letters A-Z must be + * converted to lowercase; this is faster than calling `towlower` + */ + if ((string[0] >= 'A') && (string[0] <= 'Z')) + utf_char[0] = string[0] + ('a' - 'A'); + else + utf_char[0] = string[0]; + utf_char[1] = '\0'; + string_dyn_concat (result, utf_char, -1); + string++; + } + else + { + /* char ≥ 2 bytes, use `towlower` */ + utf8_int_string (towlower (utf8_char_int (string)), utf_char); + string_dyn_concat (result, utf_char, -1); + string = (char *)utf8_next_char (string); + } } - - return result; + return string_dyn_free (result, 0); } /* - * Converts lowercase letters to uppercase. - * - * This function is locale independent: only letters 'a' to 'z' without accents - * are converted to uppercase. All other chars are kept as-is. - * - * Note: result must be freed after use. + * Converts string to uppercase (locale dependent). */ char * string_toupper (const char *string) { - char *result, *ptr_result; + char **result, utf_char[5]; if (!string) return NULL; - result = strdup (string); + result = string_dyn_alloc (strlen (string) + 1); if (!result) return NULL; - ptr_result = result; - while (ptr_result && ptr_result[0]) + while (string && string[0]) { - if ((ptr_result[0] >= 'a') && (ptr_result[0] <= 'z')) - ptr_result[0] -= ('a' - 'A'); - ptr_result = (char *)utf8_next_char (ptr_result); + if (!((unsigned char)(string[0]) & 0x80)) + { + /* + * optimization for single-byte char: only letters a-z must be + * converted to uppercase; this is faster than calling `towupper` + */ + if ((string[0] >= 'a') && (string[0] <= 'z')) + utf_char[0] = string[0] - ('a' - 'A'); + else + utf_char[0] = string[0]; + utf_char[1] = '\0'; + string_dyn_concat (result, utf_char, -1); + string++; + } + else + { + /* char ≥ 2 bytes, use `towupper` */ + utf8_int_string (towupper (utf8_char_int (string)), utf_char); + string_dyn_concat (result, utf_char, -1); + string = (char *)utf8_next_char (string); + } } - - return result; + return string_dyn_free (result, 0); } /* @@ -1174,11 +1196,11 @@ string_is_word_char (const char *string, wint_t c; int i, match; - c = utf8_wide_char (string); - - if (c == WEOF) + if (!string || !string[0]) return 0; + c = utf8_char_int (string); + for (i = 0; i < word_chars_count; i++) { if (word_chars[i].wc_class != (wctype_t)0) diff --git a/src/core/wee-utf8.c b/src/core/wee-utf8.c index c8756a695..8f6a69cff 100644 --- a/src/core/wee-utf8.c +++ b/src/core/wee-utf8.c @@ -393,49 +393,6 @@ utf8_int_string (unsigned int unicode_value, char *string) return num_bytes; } -/* - * Gets wide char from string (first char). - * - * Returns the char as "wint_t", WEOF is string was NULL/empty or in case of - * error. - */ - -wint_t -utf8_wide_char (const char *string) -{ - int char_size; - wint_t result; - - if (!string || !string[0]) - return WEOF; - - char_size = utf8_char_size (string); - switch (char_size) - { - case 1: - result = (wint_t)string[0]; - break; - case 2: - result = ((wint_t)((unsigned char)string[0])) << 8 - | ((wint_t)((unsigned char)string[1])); - break; - case 3: - result = ((wint_t)((unsigned char)string[0])) << 16 - | ((wint_t)((unsigned char)string[1])) << 8 - | ((wint_t)((unsigned char)string[2])); - break; - case 4: - result = ((wint_t)((unsigned char)string[0])) << 24 - | ((wint_t)((unsigned char)string[1])) << 16 - | ((wint_t)((unsigned char)string[2])) << 8 - | ((wint_t)((unsigned char)string[3])); - break; - default: - result = WEOF; - } - return result; -} - /* * Gets size of UTF-8 char (in bytes). * @@ -626,13 +583,25 @@ utf8_charcasecmp (const char *string1, const char *string2) if (!string1 || !string2) return (string1) ? 1 : ((string2) ? -1 : 0); - wchar1 = utf8_wide_char (string1); - if ((wchar1 >= 'A') && (wchar1 <= 'Z')) - wchar1 += ('a' - 'A'); - - wchar2 = utf8_wide_char (string2); - if ((wchar2 >= 'A') && (wchar2 <= 'Z')) - wchar2 += ('a' - 'A'); + /* + * optimization for single-byte chars: only letters A-Z must be converted + * to lowercase; this is faster than calling `towlower` + */ + if (!((unsigned char)(string1[0]) & 0x80) + && !((unsigned char)(string2[0]) & 0x80)) + { + wchar1 = string1[0]; + if ((wchar1 >= 'A') && (wchar1 <= 'Z')) + wchar1 += ('a' - 'A'); + wchar2 = string2[0]; + if ((wchar2 >= 'A') && (wchar2 <= 'Z')) + wchar2 += ('a' - 'A'); + } + else + { + wchar1 = towlower (utf8_char_int (string1)); + wchar2 = towlower (utf8_char_int (string2)); + } return (wchar1 < wchar2) ? -1 : ((wchar1 == wchar2) ? 0 : 1); } @@ -658,17 +627,17 @@ utf8_charcasecmp (const char *string1, const char *string2) int utf8_charcasecmp_range (const char *string1, const char *string2, int range) { - wint_t wchar1, wchar2; + wchar_t wchar1, wchar2; if (!string1 || !string2) return (string1) ? 1 : ((string2) ? -1 : 0); - wchar1 = utf8_wide_char (string1); - if ((wchar1 >= (wint_t)'A') && (wchar1 < (wint_t)('A' + range))) + wchar1 = utf8_char_int (string1); + if ((wchar1 >= (wchar_t)'A') && (wchar1 < (wchar_t)('A' + range))) wchar1 += ('a' - 'A'); - wchar2 = utf8_wide_char (string2); - if ((wchar2 >= (wint_t)'A') && (wchar2 < (wint_t)('A' + range))) + wchar2 = utf8_char_int (string2); + if ((wchar2 >= (wchar_t)'A') && (wchar2 < (wchar_t)('A' + range))) wchar2 += ('a' - 'A'); return (wchar1 < wchar2) ? -1 : ((wchar1 == wchar2) ? 0 : 1); diff --git a/src/core/wee-utf8.h b/src/core/wee-utf8.h index 6c5771860..9507ea05d 100644 --- a/src/core/wee-utf8.h +++ b/src/core/wee-utf8.h @@ -37,7 +37,6 @@ extern const char *utf8_prev_char (const char *string_start, extern const char *utf8_next_char (const char *string); extern int utf8_char_int (const char *string); extern int utf8_int_string (unsigned int unicode_value, char *string); -extern wint_t utf8_wide_char (const char *string); extern int utf8_char_size (const char *string); extern int utf8_strlen (const char *string); extern int utf8_strnlen (const char *string, int bytes); -- cgit v1.2.3