diff options
author | Sebastien Helleu <flashcode@flashtux.org> | 2005-10-21 13:59:55 +0000 |
---|---|---|
committer | Sebastien Helleu <flashcode@flashtux.org> | 2005-10-21 13:59:55 +0000 |
commit | 73359fcc05f325f0b1045c318d3e9d9a844f592b (patch) | |
tree | 126b1ca61af319020097d51c2cc63350824a36e4 /src/common/utf8.c | |
parent | 0bc32f561b7c8b4e5804c73c98fbbbef991cae3e (diff) | |
download | weechat-73359fcc05f325f0b1045c318d3e9d9a844f592b.zip |
Full UTF-8 support, auto-detection of UTF-8 usage (locale)
Diffstat (limited to 'src/common/utf8.c')
-rw-r--r-- | src/common/utf8.c | 330 |
1 files changed, 330 insertions, 0 deletions
diff --git a/src/common/utf8.c b/src/common/utf8.c new file mode 100644 index 000000000..3f9188558 --- /dev/null +++ b/src/common/utf8.c @@ -0,0 +1,330 @@ +/* + * Copyright (c) 2003-2005 by FlashCode <flashcode@flashtux.org> + * See README for License detail, AUTHORS for developers list. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* utf8.c: UTF-8 string functions for WeeChat */ + + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <string.h> + +#include "weechat.h" +#include "utf8.h" +#include "weeconfig.h" + + +int local_utf8 = 0; + + +/* + * utf8_init: initializes UTF-8 in WeeChat + */ + +void +utf8_init () +{ + local_utf8 = 0; + + if (cfg_look_charset_internal && cfg_look_charset_internal[0]) + { + if (strstr (cfg_look_charset_internal, "UTF-8") + || strstr (cfg_look_charset_internal, "utf-8")) + local_utf8 = 1; + } + else if ((local_charset) + && ((strstr (local_charset, "UTF-8") + || strstr (local_charset, "utf-8")))) + local_utf8 = 1; +} + +/* + * utf8_is_valid: return 1 if UTF-8 string is valid, 0 otherwise + */ + +int +utf8_is_valid (char *string) +{ + while (string[0]) + { + /* UTF-8, 2 bytes, should be: 110vvvvv 10vvvvvv */ + if (((unsigned char)(string[0]) & 0xE0) == 0xC0) + { + if (!string[1] || (((unsigned char)(string[1]) & 0xC0) != 0x80)) + return 0; + string += 2; + } + /* UTF-8, 3 bytes, should be: 1110vvvv 10vvvvvv 10vvvvvv */ + else if (((unsigned char)(string[0]) & 0xF0) == 0xE0) + { + if (!string[1] || !string[2] + || (((unsigned char)(string[1]) & 0xC0) != 0x80) + || (((unsigned char)(string[2]) & 0xC0) != 0x80)) + return 0; + string += 3; + } + /* UTF-8, 4 bytes, should be: 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv */ + else if (((unsigned char)(string[0]) & 0xF8) == 0xF0) + { + if (!string[1] || !string[2] || !string[3] + || (((unsigned char)(string[1]) & 0xC0) != 0x80) + || (((unsigned char)(string[2]) & 0xC0) != 0x80) + || (((unsigned char)(string[3]) & 0xC0) != 0x80)) + return 0; + string += 4; + } + /* UTF-8, 1 byte, should be: 0vvvvvvv */ + else if ((unsigned char)(string[0]) >= 0x80) + return 0; + else + string++; + } + return 1; +} + +/* + * utf8_prev_char: return previous UTF-8 char in a string + */ + +char * +utf8_prev_char (char *string_start, char *string) +{ + if (!string || (string <= string_start)) + return NULL; + + string--; + + if (!local_utf8) + return string; + + if (((unsigned char)(string[0]) & 0xC0) == 0x80) + { + /* UTF-8, at least 2 bytes */ + string--; + if (string < string_start) + return string + 1; + if (((unsigned char)(string[0]) & 0xC0) == 0x80) + { + /* UTF-8, at least 3 bytes */ + string--; + if (string < string_start) + return string + 1; + if (((unsigned char)(string[0]) & 0xC0) == 0x80) + { + /* UTF-8, 4 bytes */ + string--; + if (string < string_start) + return string + 1; + return string; + } + else + return string; + } + else + return string; + } + return string; +} + +/* + * utf8_next_char: return next UTF-8 char in a string + */ + +char * +utf8_next_char (char *string) +{ + if (!string) + return NULL; + + if (!local_utf8) + return string + 1; + + /* UTF-8, 2 bytes: 110vvvvv 10vvvvvv */ + if (((unsigned char)(string[0]) & 0xE0) == 0xC0) + { + if (!string[1]) + return string + 1; + return string + 2; + } + /* UTF-8, 3 bytes: 1110vvvv 10vvvvvv 10vvvvvv */ + else if (((unsigned char)(string[0]) & 0xF0) == 0xE0) + { + if (!string[1]) + return string + 1; + if (!string[2]) + return string + 2; + return string + 3; + } + /* UTF-8, 4 bytes: 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv */ + else if (((unsigned char)(string[0]) & 0xF8) == 0xF0) + { + if (!string[1]) + return string + 1; + if (!string[2]) + return string + 2; + if (!string[3]) + return string + 3; + return string + 4; + } + /* UTF-8, 1 byte: 0vvvvvvv */ + return string + 1; +} + +/* + * utf8_char_size: return UTF-8 char size + */ + +int +utf8_char_size (char *string) +{ + if (!string) + return 0; + + return utf8_next_char (string) - string; +} + +/* + * utf8_strlen: return length of an UTF-8 string (<= strlen(string)) + */ + +int +utf8_strlen (char *string) +{ + int length; + + if (!string) + return 0; + + if (!local_utf8) + return strlen (string); + + length = 0; + while (string[0]) + { + string = utf8_next_char (string); + length++; + } + return length; +} + +/* + * utf8_strlen: return length of an UTF-8 string, for N bytes + */ + +int +utf8_strnlen (char *string, int bytes) +{ + char *start; + int length; + + if (!string) + return 0; + + if (!local_utf8) + { + length = strlen (string); + if (bytes > length) + return length; + return bytes; + } + + start = string; + length = 0; + while (string[0] && (string - start < bytes)) + { + string = utf8_next_char (string); + length++; + } + return length; +} + +/* + * utf8_add_offset: moves forward N chars in an UTF-8 string + */ + +char * +utf8_add_offset (char *string, int offset) +{ + int count; + + if (!string) + return string; + + if (!local_utf8) + return string + offset; + + count = 0; + while (string[0] && (count < offset)) + { + string = utf8_next_char (string); + count++; + } + return string; +} + +/* + * utf8_real_pos: get real position in UTF-8 + * for example: ("aébc", 2) returns 3 + */ + +int +utf8_real_pos (char *string, int pos) +{ + int count, real_pos; + char *next_char; + + if (!string || !local_utf8) + return pos; + + count = 0; + real_pos = 0; + while (string[0] && (count < pos)) + { + next_char = utf8_next_char (string); + real_pos += (next_char - string); + string = next_char; + count++; + } + return real_pos; +} + +/* + * utf8_pos: get position in UTF-8 + * for example: ("aébc", 3) returns 2 + */ + +int +utf8_pos (char *string, int real_pos) +{ + int count; + char *limit; + + if (!string || !local_charset) + return real_pos; + + count = 0; + limit = string + real_pos; + while (string[0] && (string < limit)) + { + string = utf8_next_char (string); + count++; + } + return count; +} |