summary refs log tree commit diff
path: root/src/common/utf8.c
diff options
context:
space:
mode:
author Sebastien Helleu <flashcode@flashtux.org> 2005-10-21 13:59:55 +0000
committer Sebastien Helleu <flashcode@flashtux.org> 2005-10-21 13:59:55 +0000
commit 73359fcc05f325f0b1045c318d3e9d9a844f592b (patch)
tree 126b1ca61af319020097d51c2cc63350824a36e4 /src/common/utf8.c
parent 0bc32f561b7c8b4e5804c73c98fbbbef991cae3e (diff)
download weechat-73359fcc05f325f0b1045c318d3e9d9a844f592b.zip
Full UTF-8 support, auto-detection of UTF-8 usage (locale)
Diffstat (limited to 'src/common/utf8.c')
-rw-r--r-- src/common/utf8.c 330
1 files changed, 330 insertions, 0 deletions
diff --git a/src/common/utf8.c b/src/common/utf8.c
new file mode 100644
index 000000000..3f9188558
--- /dev/null
+++ b/src/common/utf8.c
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2003-2005 by FlashCode <flashcode@flashtux.org>
+ * See README for License detail, AUTHORS for developers list.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/* utf8.c: UTF-8 string functions for WeeChat */
+
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <string.h>
+
+#include "weechat.h"
+#include "utf8.h"
+#include "weeconfig.h"
+
+
+/* 1 when strings are handled as UTF-8, 0 otherwise (set by utf8_init) */
+int local_utf8 = 0;
+
+
+/*
+ * utf8_init: decide whether WeeChat works in UTF-8 and store the result
+ *            in local_utf8 (1 = UTF-8, 0 = anything else)
+ *
+ *            A non-empty cfg_look_charset_internal takes precedence;
+ *            local_charset is consulted only when it is unset or empty.
+ */
+
+void
+utf8_init ()
+{
+    const char *charset;
+
+    /* choose the single charset name that will be examined */
+    if (cfg_look_charset_internal && cfg_look_charset_internal[0])
+        charset = cfg_look_charset_internal;
+    else
+        charset = local_charset;
+
+    /* only the spellings "UTF-8" and "utf-8" are recognized */
+    if (charset
+        && (strstr (charset, "UTF-8") || strstr (charset, "utf-8")))
+        local_utf8 = 1;
+    else
+        local_utf8 = 0;
+}
+
+/*
+ * utf8_is_valid: return 1 if UTF-8 string is valid, 0 otherwise
+ *
+ *                A NULL or empty string is considered valid.
+ *                Besides checking the shape of each sequence (lead byte
+ *                followed by the right number of 10vvvvvv bytes), the
+ *                decoded code point must be in the range allowed for the
+ *                sequence length, so overlong encodings, UTF-16 surrogates
+ *                (U+D800 - U+DFFF) and values above U+10FFFF are rejected.
+ */
+
+int
+utf8_is_valid (char *string)
+{
+    const unsigned char *ptr;
+    unsigned long code_point;
+
+    if (!string)
+        return 1;
+
+    ptr = (const unsigned char *)string;
+    while (ptr[0])
+    {
+        if (ptr[0] < 0x80)
+        {
+            /* UTF-8, 1 byte: 0vvvvvvv */
+            ptr++;
+        }
+        else if ((ptr[0] & 0xE0) == 0xC0)
+        {
+            /*
+             * UTF-8, 2 bytes: 110vvvvv 10vvvvvv (U+0080 - U+07FF);
+             * the continuation test also fails on the final '\0', so a
+             * truncated sequence is never read past the end of string
+             */
+            if ((ptr[1] & 0xC0) != 0x80)
+                return 0;
+            code_point = ((unsigned long)(ptr[0] & 0x1F) << 6)
+                | (unsigned long)(ptr[1] & 0x3F);
+            if (code_point < 0x0080)
+                return 0;
+            ptr += 2;
+        }
+        else if ((ptr[0] & 0xF0) == 0xE0)
+        {
+            /*
+             * UTF-8, 3 bytes: 1110vvvv 10vvvvvv 10vvvvvv
+             * (U+0800 - U+FFFF, surrogates excluded)
+             */
+            if (((ptr[1] & 0xC0) != 0x80)
+                || ((ptr[2] & 0xC0) != 0x80))
+                return 0;
+            code_point = ((unsigned long)(ptr[0] & 0x0F) << 12)
+                | ((unsigned long)(ptr[1] & 0x3F) << 6)
+                | (unsigned long)(ptr[2] & 0x3F);
+            if ((code_point < 0x0800)
+                || ((code_point >= 0xD800) && (code_point <= 0xDFFF)))
+                return 0;
+            ptr += 3;
+        }
+        else if ((ptr[0] & 0xF8) == 0xF0)
+        {
+            /*
+             * UTF-8, 4 bytes: 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
+             * (U+10000 - U+10FFFF)
+             */
+            if (((ptr[1] & 0xC0) != 0x80)
+                || ((ptr[2] & 0xC0) != 0x80)
+                || ((ptr[3] & 0xC0) != 0x80))
+                return 0;
+            code_point = ((unsigned long)(ptr[0] & 0x07) << 18)
+                | ((unsigned long)(ptr[1] & 0x3F) << 12)
+                | ((unsigned long)(ptr[2] & 0x3F) << 6)
+                | (unsigned long)(ptr[3] & 0x3F);
+            if ((code_point < 0x10000) || (code_point > 0x10FFFF))
+                return 0;
+            ptr += 4;
+        }
+        else
+        {
+            /* stray continuation byte (10vvvvvv) or invalid lead byte */
+            return 0;
+        }
+    }
+    return 1;
+}
+
+/*
+ * utf8_prev_char: return pointer to the previous UTF-8 char in a string
+ *                 (NULL if string is NULL or is not after string_start)
+ */
+
+char *
+utf8_prev_char (char *string_start, char *string)
+{
+    char *pos;
+    int back;
+
+    if (!string || (string <= string_start))
+        return NULL;
+
+    pos = string - 1;
+
+    /* not UTF-8: every char is exactly one byte */
+    if (!local_utf8)
+        return pos;
+
+    /*
+     * walk backwards over continuation bytes (10vvvvvv): at most 3 of
+     * them since a char is at most 4 bytes, and never before string_start
+     */
+    for (back = 0; back < 3; back++)
+    {
+        if (((unsigned char)(pos[0]) & 0xC0) != 0x80)
+            return pos;
+        if (pos <= string_start)
+            return pos;
+        pos--;
+    }
+    return pos;
+}
+
+/*
+ * utf8_next_char: return pointer to the next UTF-8 char in a string
+ *                 (NULL if string is NULL)
+ *
+ *                 The lead byte gives the expected size of the char; the
+ *                 returned pointer never goes beyond a '\0' found inside
+ *                 that expected size.
+ */
+
+char *
+utf8_next_char (char *string)
+{
+    unsigned char lead;
+    int size, i;
+
+    if (!string)
+        return NULL;
+
+    /* not UTF-8: every char is exactly one byte */
+    if (!local_utf8)
+        return string + 1;
+
+    lead = (unsigned char)(string[0]);
+    if ((lead & 0xE0) == 0xC0)
+        size = 2;   /* 110vvvvv 10vvvvvv */
+    else if ((lead & 0xF0) == 0xE0)
+        size = 3;   /* 1110vvvv 10vvvvvv 10vvvvvv */
+    else if ((lead & 0xF8) == 0xF0)
+        size = 4;   /* 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv */
+    else
+        size = 1;   /* 0vvvvvvv, or any byte that is not a lead byte */
+
+    /* a truncated char ends at the terminating '\0' */
+    for (i = 1; i < size; i++)
+    {
+        if (!string[i])
+            break;
+    }
+    return string + i;
+}
+
+/*
+ * utf8_char_size: return size (in bytes) of the UTF-8 char at the start
+ *                 of string (0 if string is NULL)
+ */
+
+int
+utf8_char_size (char *string)
+{
+    /* the size is the distance to the start of the following char */
+    return (string) ? utf8_next_char (string) - string : 0;
+}
+
+/*
+ * utf8_strlen: return number of chars in an UTF-8 string
+ *              (result is <= strlen(string); 0 if string is NULL)
+ */
+
+int
+utf8_strlen (char *string)
+{
+    char *ptr;
+    int count;
+
+    if (!string)
+        return 0;
+
+    /* not UTF-8: one char per byte */
+    if (!local_utf8)
+        return strlen (string);
+
+    count = 0;
+    for (ptr = string; ptr[0]; ptr = utf8_next_char (ptr))
+        count++;
+    return count;
+}
+
+/*
+ * utf8_strnlen: return length (in chars) of an UTF-8 string, looking at
+ *               no more than the first N bytes
+ *
+ *               Returns 0 if string is NULL or bytes is <= 0.
+ *               A char that starts before the limit is counted even if
+ *               its last bytes lie beyond it.
+ */
+
+int
+utf8_strnlen (char *string, int bytes)
+{
+    char *start;
+    int length;
+
+    /* a negative limit means "no bytes", same as in UTF-8 mode */
+    if (!string || (bytes <= 0))
+        return 0;
+
+    if (!local_utf8)
+    {
+        /* one char per byte: the result is min(strlen, bytes) */
+        length = strlen (string);
+        return (bytes > length) ? length : bytes;
+    }
+
+    start = string;
+    length = 0;
+    while (string[0] && (string - start < bytes))
+    {
+        string = utf8_next_char (string);
+        length++;
+    }
+    return length;
+}
+
+/*
+ * utf8_add_offset: move forward N chars in an UTF-8 string
+ *
+ *                  Returns a pointer inside string, never beyond the
+ *                  terminating '\0' (reached when the string has fewer
+ *                  than N chars).
+ *                  Returns string unchanged if it is NULL or if offset
+ *                  is <= 0.
+ */
+
+char *
+utf8_add_offset (char *string, int offset)
+{
+    if (!string)
+        return string;
+
+    /*
+     * the same loop serves both modes: without UTF-8, utf8_next_char
+     * advances by exactly one byte, and stopping at '\0' keeps the result
+     * inside the string instead of blindly returning string + offset
+     */
+    while (string[0] && (offset > 0))
+    {
+        string = utf8_next_char (string);
+        offset--;
+    }
+    return string;
+}
+
+/*
+ * utf8_real_pos: convert a position in chars to a position in bytes
+ *                for example: ("aébc", 2) returns 3
+ *
+ *                pos is returned as-is if string is NULL or if UTF-8
+ *                is not in use.
+ */
+
+int
+utf8_real_pos (char *string, int pos)
+{
+    char *start;
+    int chars;
+
+    if (!string || !local_utf8)
+        return pos;
+
+    /* skip up to pos chars, stopping at end of string */
+    start = string;
+    for (chars = 0; string[0] && (chars < pos); chars++)
+        string = utf8_next_char (string);
+
+    /* bytes skipped = distance from the beginning of string */
+    return (int)(string - start);
+}
+
+/*
+ * utf8_pos: convert a position in bytes to a position in chars
+ *           for example: ("aébc", 3) returns 2
+ *
+ *           real_pos is returned as-is if string is NULL or if UTF-8
+ *           is not in use (same rule as utf8_real_pos).
+ */
+
+int
+utf8_pos (char *string, int real_pos)
+{
+    int count;
+    char *limit;
+
+    /*
+     * the mode is given by local_utf8, not by local_charset: UTF-8 may be
+     * enabled by configuration while local_charset is NULL
+     */
+    if (!string || !local_utf8)
+        return real_pos;
+
+    count = 0;
+    limit = string + real_pos;
+    while (string[0] && (string < limit))
+    {
+        string = utf8_next_char (string);
+        count++;
+    }
+    return count;
+}