summaryrefslogtreecommitdiff
path: root/Libraries
diff options
context:
space:
mode:
authorHüseyin ASLITÜRK <asliturk@hotmail.com>2020-05-20 21:22:23 +0300
committerAndreas Kling <kling@serenityos.org>2020-05-21 01:19:42 +0200
commit241df7206ec2f6ab1e36316a2f90e24467d4907b (patch)
tree44cf6723e79907fa747a40dcc9ad73d5ca3838e7 /Libraries
parent738235574f32690f186e4734636434641093c2ff (diff)
downloadserenity-241df7206ec2f6ab1e36316a2f90e24467d4907b.zip
LibWeb: HTML Parser, handle html escaped characters
Convert HTML escaped (&#XXX;) characters to string.
Diffstat (limited to 'Libraries')
-rw-r--r--Libraries/LibWeb/Parser/HTMLParser.cpp58
1 files changed, 56 insertions, 2 deletions
diff --git a/Libraries/LibWeb/Parser/HTMLParser.cpp b/Libraries/LibWeb/Parser/HTMLParser.cpp
index d3509a3d26..493d8b9a39 100644
--- a/Libraries/LibWeb/Parser/HTMLParser.cpp
+++ b/Libraries/LibWeb/Parser/HTMLParser.cpp
@@ -27,6 +27,7 @@
#include <AK/Function.h>
#include <AK/NonnullRefPtrVector.h>
#include <AK/StringBuilder.h>
+#include <AK/StringUtils.h>
#include <LibTextCodec/Decoder.h>
#include <LibWeb/DOM/Comment.h>
#include <LibWeb/DOM/DocumentFragment.h>
@@ -64,6 +65,23 @@ static bool is_void_element(const StringView& tag_name)
|| tag_name == "wbr";
}
+// Encodes a single Unicode code point as a UTF-8 byte sequence.
+//
+// Returns the 1 to 4 encoded bytes in order. Returns an empty vector when
+// `codepoint` cannot be encoded: UTF-16 surrogates (U+D800..U+DFFF) and
+// values above U+10FFFF are not Unicode scalar values.
+static Vector<char> codepoint_to_bytes(const u32 codepoint)
+{
+    Vector<char> bytes;
+
+    if (codepoint < 0x80) {
+        // 0xxxxxxx
+        bytes.append(static_cast<char>(codepoint));
+    } else if (codepoint < 0x800) {
+        // 110xxxxx 10xxxxxx
+        bytes.append(static_cast<char>(0xC0 | (codepoint >> 6)));
+        bytes.append(static_cast<char>(0x80 | (codepoint & 0x3F)));
+    } else if (codepoint < 0x10000) {
+        // Surrogate halves are reserved for UTF-16 and have no UTF-8 form.
+        if (codepoint >= 0xD800 && codepoint <= 0xDFFF)
+            return bytes;
+
+        // 1110xxxx 10xxxxxx 10xxxxxx
+        bytes.append(static_cast<char>(0xE0 | (codepoint >> 12)));
+        bytes.append(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
+        bytes.append(static_cast<char>(0x80 | (codepoint & 0x3F)));
+    } else if (codepoint <= 0x10FFFF) {
+        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+        bytes.append(static_cast<char>(0xF0 | (codepoint >> 18)));
+        bytes.append(static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F)));
+        bytes.append(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
+        bytes.append(static_cast<char>(0x80 | (codepoint & 0x3F)));
+    }
+
+    return bytes;
+}
+
static bool parse_html_document(const StringView& html, Document& document, ParentNode& root)
{
NonnullRefPtrVector<ParentNode> node_stack;
@@ -213,6 +231,7 @@ static bool parse_html_document(const StringView& html, Document& document, Pare
};
auto rest_of_html = html.substring_view(i, html.length() - i);
bool found = false;
+
for (auto& escape : escapes) {
if (rest_of_html.starts_with(escape.code)) {
text_buffer.append(escape.value);
@@ -221,8 +240,43 @@ static bool parse_html_document(const StringView& html, Document& document, Pare
break;
}
}
- if (!found)
- dbg() << "Unhandled escape sequence";
+
+ if (!found) {
+ char num_sign = html[i + 1];
+ if (num_sign && num_sign == '#') {
+ int j = 2; // skip '&#' and search for ';'
+ while (html[i + j] != ';' && j < 7) {
+ j++;
+ }
+
+ if (j < 7) { // We found ; char
+ bool ok;
+ u32 codepoint;
+ String str_code_point = html.substring_view(i + 2, j - 2);
+ if (str_code_point.starts_with('x')) {
+ String str = str_code_point.substring(1, str_code_point.length() - 1);
+ codepoint = AK::StringUtils::convert_to_uint_from_hex(str, ok);
+ } else {
+ codepoint = str_code_point.to_uint(ok);
+ }
+
+ if (ok) {
+ Vector<char> bytes = codepoint_to_bytes(codepoint);
+ if (bytes.size() > 0) {
+ for (size_t i = 0; i < bytes.size(); i++) {
+ text_buffer.append(bytes.at(i));
+ }
+ found = true;
+ i = i + j;
+ }
+ }
+ }
+ }
+ }
+
+ if (!found) {
+ dbg() << "Unhandled escape sequence:" << html.substring_view(i, min((size_t)5, html.length()));
+ }
}
break;
case State::BeforeTagName: