diff options
author | Hüseyin ASLITÜRK <asliturk@hotmail.com> | 2020-05-20 21:22:23 +0300 |
---|---|---|
committer | Andreas Kling <kling@serenityos.org> | 2020-05-21 01:19:42 +0200 |
commit | 241df7206ec2f6ab1e36316a2f90e24467d4907b (patch) | |
tree | 44cf6723e79907fa747a40dcc9ad73d5ca3838e7 /Libraries | |
parent | 738235574f32690f186e4734636434641093c2ff (diff) | |
download | serenity-241df7206ec2f6ab1e36316a2f90e24467d4907b.zip |
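LibWeb: HTML parser, handle HTML-escaped characters
Convert numeric HTML character references (decimal `&#NNN;` and hexadecimal `&#xHH;`) into the characters they encode. Only code points below U+0800 are converted for now.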
Diffstat (limited to 'Libraries')
-rw-r--r-- | Libraries/LibWeb/Parser/HTMLParser.cpp | 58 |
1 files changed, 56 insertions, 2 deletions
```diff
diff --git a/Libraries/LibWeb/Parser/HTMLParser.cpp b/Libraries/LibWeb/Parser/HTMLParser.cpp
index d3509a3d26..493d8b9a39 100644
--- a/Libraries/LibWeb/Parser/HTMLParser.cpp
+++ b/Libraries/LibWeb/Parser/HTMLParser.cpp
@@ -27,6 +27,7 @@
 #include <AK/Function.h>
 #include <AK/NonnullRefPtrVector.h>
 #include <AK/StringBuilder.h>
+#include <AK/StringUtils.h>
 #include <LibTextCodec/Decoder.h>
 #include <LibWeb/DOM/Comment.h>
 #include <LibWeb/DOM/DocumentFragment.h>
@@ -64,6 +65,23 @@ static bool is_void_element(const StringView& tag_name)
         || tag_name == "wbr";
 }
 
+static Vector<char> codepoint_to_bytes(const u32 codepoint)
+{
+    Vector<char, 0> bytes;
+
+    if (codepoint < 0x80) {
+        bytes.insert(0, (char)codepoint);
+    } else if (codepoint < 0x800) {
+        char b2 = (codepoint & 0x3F) + 0x80;
+        char b1 = ((codepoint >> 6) & 0x1F) + +0xC0;
+
+        bytes.insert(0, b1);
+        bytes.insert(1, b2);
+    }
+
+    return bytes;
+}
+
 static bool parse_html_document(const StringView& html, Document& document, ParentNode& root)
 {
     NonnullRefPtrVector<ParentNode> node_stack;
@@ -213,6 +231,7 @@ static bool parse_html_document(const StringView& html, Document& document, Pare
                 };
                 auto rest_of_html = html.substring_view(i, html.length() - i);
                 bool found = false;
+
                 for (auto& escape : escapes) {
                     if (rest_of_html.starts_with(escape.code)) {
                         text_buffer.append(escape.value);
@@ -221,8 +240,43 @@ static bool parse_html_document(const StringView& html, Document& document, Pare
                         break;
                     }
                 }
-                if (!found)
-                    dbg() << "Unhandled escape sequence";
+
+                if (!found) {
+                    char num_sign = html[i + 1];
+                    if (num_sign && num_sign == '#') {
+                        int j = 2; // spip '&#' and search for ';'
+                        while (html[i + j] != ';' && j < 7) {
+                            j++;
+                        }
+
+                        if (j < 7) { // We found ; char
+                            bool ok;
+                            u32 codepoint;
+                            String str_code_point = html.substring_view(i + 2, j - 2);
+                            if (str_code_point.starts_with('x')) {
+                                String str = str_code_point.substring(1, str_code_point.length() - 1);
+                                codepoint = AK::StringUtils::convert_to_uint_from_hex(str, ok);
+                            } else {
+                                codepoint = str_code_point.to_uint(ok);
+                            }
+
+                            if (ok) {
+                                Vector<char> bytes = codepoint_to_bytes(codepoint);
+                                if (bytes.size() > 0) {
+                                    for (size_t i = 0; i < bytes.size(); i++) {
+                                        text_buffer.append(bytes.at(i));
+                                    }
+                                    found = true;
+                                    i = i + j;
+                                }
+                            }
+                        }
+                    }
+                }
+
+                if (!found) {
+                    dbg() << "Unhandled escape sequence:" << html.substring_view(i, min((size_t)5, html.length()));
+                }
             }
             break;
         case State::BeforeTagName:
```