From e09b83c60c77bef5fc7704dfe6f70df75462b62f Mon Sep 17 00:00:00 2001 From: Andreas Kling Date: Sun, 3 May 2020 22:41:34 +0200 Subject: LibTextCodec: Start fleshing out a simple text codec library We're starting with a very basic decoding API and only ISO-8859-1 and UTF-8 decoding (and UTF-8 decoding is really a no-op since String is expected to be UTF-8.) --- Applications/Browser/Makefile | 2 +- Applications/Help/Makefile | 2 +- Applications/IRCClient/Makefile | 2 +- Applications/TextEditor/Makefile | 2 +- DevTools/HackStudio/Makefile | 2 +- Libraries/LibTextCodec/Decoder.cpp | 73 ++++++++++++++++++++++++++++++++++ Libraries/LibTextCodec/Decoder.h | 50 +++++++++++++++++++++++ Libraries/LibTextCodec/Makefile | 15 +++++++ Libraries/LibWeb/Parser/HTMLParser.cpp | 19 ++------- Userland/Makefile | 2 +- 10 files changed, 148 insertions(+), 21 deletions(-) create mode 100644 Libraries/LibTextCodec/Decoder.cpp create mode 100644 Libraries/LibTextCodec/Decoder.h create mode 100644 Libraries/LibTextCodec/Makefile diff --git a/Applications/Browser/Makefile b/Applications/Browser/Makefile index 91cc60aff2..ac0182b6d5 100644 --- a/Applications/Browser/Makefile +++ b/Applications/Browser/Makefile @@ -7,7 +7,7 @@ OBJS = \ PROGRAM = Browser -LIB_DEPS = Web JS GUI Gfx IPC Protocol Core +LIB_DEPS = Web JS TextCodec GUI Gfx IPC Protocol Core main.cpp: ../../Libraries/LibWeb/CSS/PropertyID.h ../../Libraries/LibWeb/CSS/PropertyID.h: diff --git a/Applications/Help/Makefile b/Applications/Help/Makefile index c1869dbbb0..d1b1159f5f 100644 --- a/Applications/Help/Makefile +++ b/Applications/Help/Makefile @@ -7,6 +7,6 @@ OBJS = \ PROGRAM = Help -LIB_DEPS = GUI Web JS Gfx Markdown IPC Protocol Thread Pthread Core +LIB_DEPS = GUI Web TextCodec JS Gfx Markdown IPC Protocol Thread Pthread Core include ../../Makefile.common diff --git a/Applications/IRCClient/Makefile b/Applications/IRCClient/Makefile index fc4a987ea6..693d63ac6f 100644 --- a/Applications/IRCClient/Makefile +++ b/Applications/IRCClient/Makefile @@ -11,6 +11,6 @@ OBJS = \ PROGRAM = IRCClient -LIB_DEPS = Web JS GUI Gfx Protocol IPC Thread Pthread Core +LIB_DEPS = Web TextCodec JS GUI Gfx Protocol IPC Thread Pthread Core include ../../Makefile.common diff --git a/Applications/TextEditor/Makefile b/Applications/TextEditor/Makefile index 6d917e407d..545526a16a 100644 --- a/Applications/TextEditor/Makefile +++ b/Applications/TextEditor/Makefile @@ -4,6 +4,6 @@ OBJS = \ PROGRAM = TextEditor -LIB_DEPS = Web Markdown GUI Gfx VT Protocol IPC Thread Pthread Core JS +LIB_DEPS = Web TextCodec Markdown GUI Gfx VT Protocol IPC Thread Pthread Core JS include ../../Makefile.common diff --git a/DevTools/HackStudio/Makefile b/DevTools/HackStudio/Makefile index 468a60ef85..5ae18fde67 100644 --- a/DevTools/HackStudio/Makefile +++ b/DevTools/HackStudio/Makefile @@ -18,6 +18,6 @@ OBJS = \ PROGRAM = HackStudio -LIB_DEPS = GUI Web VT Protocol Markdown Gfx IPC Thread Pthread Core JS Debug +LIB_DEPS = GUI Web TextCodec VT Protocol Markdown Gfx IPC Thread Pthread Core JS Debug include ../../Makefile.common diff --git a/Libraries/LibTextCodec/Decoder.cpp b/Libraries/LibTextCodec/Decoder.cpp new file mode 100644 index 0000000000..e126fc63cb --- /dev/null +++ b/Libraries/LibTextCodec/Decoder.cpp @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2020, Andreas Kling + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include + +namespace TextCodec { + +Latin1Decoder& latin1_decoder() +{ + static Latin1Decoder* decoder; + if (!decoder) + decoder = new Latin1Decoder; + return *decoder; +} + +UTF8Decoder& utf8_decoder() +{ + static UTF8Decoder* decoder; + if (!decoder) + decoder = new UTF8Decoder; + return *decoder; +} + +Decoder* decoder_for(const String& encoding) +{ + if (encoding.equals_ignoring_case("iso-8859-1")) + return &latin1_decoder(); + if (encoding.equals_ignoring_case("utf-8")) + return &utf8_decoder(); + return nullptr; +} + +String UTF8Decoder::to_utf8(const StringView& input) +{ + return input; +} + +String Latin1Decoder::to_utf8(const StringView& input) +{ + StringBuilder builder(input.length()); + for (size_t i = 0; i < input.length(); ++i) { + u8 ch = input[i]; + builder.append(ch >= 0x80 ? '?' : ch); + } + return builder.to_string(); +} + +} diff --git a/Libraries/LibTextCodec/Decoder.h b/Libraries/LibTextCodec/Decoder.h new file mode 100644 index 0000000000..419617f49c --- /dev/null +++ b/Libraries/LibTextCodec/Decoder.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2020, Andreas Kling + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#pragma once + +#include + +namespace TextCodec { + +class Decoder { +public: + virtual String to_utf8(const StringView&) = 0; +}; + +class UTF8Decoder final : public Decoder { +public: + virtual String to_utf8(const StringView&) override; +}; + +class Latin1Decoder final : public Decoder { +public: + virtual String to_utf8(const StringView&) override; +}; + +Decoder* decoder_for(const String& encoding); + +} diff --git a/Libraries/LibTextCodec/Makefile b/Libraries/LibTextCodec/Makefile new file mode 100644 index 0000000000..7d80b9273a --- /dev/null +++ b/Libraries/LibTextCodec/Makefile @@ -0,0 +1,15 @@ +OBJS = \ + Decoder.o + +LIBRARY = libtextcodec.a + +install: + for dir in .; do \ + mkdir -p $(SERENITY_BASE_DIR)/Root/usr/include/LibTextCodec/$$dir; \ + cp $$dir/*.h $(SERENITY_BASE_DIR)/Root/usr/include/LibTextCodec/$$dir/; \ + done + cp $(LIBRARY) $(SERENITY_BASE_DIR)/Root/usr/lib/ + +include ../../Makefile.common + +include ../../Makefile.subdir diff --git a/Libraries/LibWeb/Parser/HTMLParser.cpp b/Libraries/LibWeb/Parser/HTMLParser.cpp index ca46b8de53..ab44078286 100644 --- a/Libraries/LibWeb/Parser/HTMLParser.cpp +++ b/Libraries/LibWeb/Parser/HTMLParser.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -385,21 +386,9 @@ static bool parse_html_document(const StringView& html, Document& document, Pare String to_utf8(const StringView& input, const String& encoding) { - String output; - if (encoding == "utf-8") { - output = input; - } else if (encoding == "iso-8859-1") { - StringBuilder builder(input.length()); - for (size_t i = 0; i < input.length(); ++i) { - u8 ch = input[i]; - builder.append(ch >= 0x80 ? '?' : ch); - } - output = builder.to_string(); - } else { - dbg() << "Unknown encoding " << encoding; - ASSERT_NOT_REACHED(); - } - return output; + auto* decoder = TextCodec::decoder_for(encoding); + ASSERT(decoder); + return decoder->to_utf8(input); } RefPtr parse_html_fragment(Document& document, const StringView& raw_html, const String& encoding) diff --git a/Userland/Makefile b/Userland/Makefile index 03cd975e5f..14d62fc8e5 100644 --- a/Userland/Makefile +++ b/Userland/Makefile @@ -4,7 +4,7 @@ APPS = ${SRCS:.cpp=} EXTRA_CLEAN = $(APPS) -LIB_DEPS = Crypto TLS Web GUI Gfx Audio Protocol IPC Thread Pthread PCIDB Markdown JS Core Line X86 Debug +LIB_DEPS = Crypto TLS Web TextCodec GUI Gfx Audio Protocol IPC Thread Pthread PCIDB Markdown JS Core Line X86 Debug include ../Makefile.common -- cgit v1.2.3