AK: Add an exact and fast hex float parsing algorithm

Similar to decimal floating point parsing the current strtod hex float parsing gives a lot of incorrect results. We can use a similar technique as with decimal parsing however hex floats are much simpler as we don't need to scale with a power of 5. For hex floats we just provide the parse_first_hexfloat API as there is currently no need for a parse_hexfloat_completely API. Again the accepted input for parse_first_hexfloat is very lenient and any validation should be done before calling this method.
author: davidot <davidot@serenityos.org> 2022-10-13 02:23:07 +0200
committer: Linus Groh <mail@linusgroh.de> 2022-10-23 15:48:45 +0200
commit: 2334cd85a26f6244c7af198f118a6d7e9c437b44 (patch)
tree: df7d9a85e0ed31ebe5c8c98f5fff23731a66e571 /AK
parent: 53b7f5e6a11e663c83df8030c3171c5945cb75ec (diff)
download: serenity-2334cd85a26f6244c7af198f118a6d7e9c437b44.zip
2 files changed, 252 insertions, 0 deletions
diff --git a/AK/FloatingPointStringConversions.cpp b/AK/FloatingPointStringConversions.cpp
index b8e7a15554..96cedd7f83 100644
--- a/AK/FloatingPointStringConversions.cpp
+++ b/AK/FloatingPointStringConversions.cpp
@@ -2015,4 +2015,239 @@ template Optional<double> parse_floating_point_completely(char const* start, cha
 
 template Optional<float> parse_floating_point_completely(char const* start, char const* end);
 
+struct HexFloatParseResult {
+    bool is_negative = false;
+    bool valid = false;
+    char const* last_parsed = nullptr;
+    u64 mantissa = 0;
+    i64 exponent = 0;
+};
+
+static HexFloatParseResult parse_hexfloat(char const* start)
+{
+    HexFloatParseResult result {};
+    if (start == nullptr || *start == '\0')
+        return result;
+
+    char const* parse_head = start;
+    bool any_digits = false;
+    bool truncated_non_zero = false;
+
+    if (*parse_head == '-') {
+        result.is_negative = true;
+        ++parse_head;
+
+        if (*parse_head == '\0' || (!is_ascii_hex_digit(*parse_head) && *parse_head != floating_point_decimal_separator))
+            return result;
+    } else if (*parse_head == '+') {
+        ++parse_head;
+
+        if (*parse_head == '\0' || (!is_ascii_hex_digit(*parse_head) && *parse_head != floating_point_decimal_separator))
+            return result;
+    }
+    if (*parse_head == '0' && (*(parse_head + 1) != '\0') && (*(parse_head + 1) == 'x' || *(parse_head + 1) == 'X')) {
+        // Skip potential 0[xX], we have to do this here since the sign comes at the front
+        parse_head += 2;
+    }
+
+    auto add_mantissa_digit = [&] {
+        any_digits = true;
+
+        // We assume you already checked this is actually a digit
+        auto digit = parse_ascii_hex_digit(*parse_head);
+
+        // Because the power of sixteen is just scaling of power of two we don't
+        // need to keep all the remaining digits beyond the first 52 bits, just because
+        // it's easy we store the first 16 digits. However for rounding we do need to parse
+        // all the digits and keep track if we see any non zero one.
+        if (result.mantissa < (1ull << 60)) {
+            result.mantissa = (result.mantissa * 16) + digit;
+            return true;
+        }
+
+        if (digit != 0)
+            truncated_non_zero = true;
+
+        return false;
+    };
+
+    while (*parse_head != '\0' && is_ascii_hex_digit(*parse_head)) {
+        add_mantissa_digit();
+
+        ++parse_head;
+    }
+
+    if (*parse_head != '\0' && *parse_head == floating_point_decimal_separator) {
+        ++parse_head;
+        i64 digits_after_separator = 0;
+        while (*parse_head != '\0' && is_ascii_hex_digit(*parse_head)) {
+            // Track how many characters we actually read into the mantissa
+            digits_after_separator += add_mantissa_digit() ? 1 : 0;
+
+            ++parse_head;
+        }
+
+        // We parsed x digits after the dot so need to multiply with 2^(-x * 4)
+        // Since every digit is 4 bits
+        result.exponent = -digits_after_separator * 4;
+    }
+
+    if (!any_digits)
+        return result;
+
+    if (*parse_head != '\0' && (*parse_head == 'p' || *parse_head == 'P')) {
+        [&] {
+            auto const* head_before_p = parse_head;
+            ArmedScopeGuard reset_ptr { [&] { parse_head = head_before_p; } };
+            ++parse_head;
+
+            if (*parse_head == '\0')
+                return;
+
+            bool exponent_is_negative = false;
+            i64 explicit_exponent = 0;
+
+            if (*parse_head == '-' || *parse_head == '+') {
+                exponent_is_negative = *parse_head == '-';
+                ++parse_head;
+                if (*parse_head == '\0')
+                    return;
+            }
+
+            if (!is_ascii_digit(*parse_head))
+                return;
+
+            // We have at least one digit (with optional preceding sign) so we will not reset
+            reset_ptr.disarm();
+
+            while (*parse_head != '\0' && is_ascii_digit(*parse_head)) {
+                // If we hit exponent overflow the number is so huge we are in trouble anyway, see
+                // a comment in parse_numbers.
+                if (explicit_exponent < 0x10000000)
+                    explicit_exponent = 10 * explicit_exponent + (*parse_head - '0');
+                ++parse_head;
+            }
+
+            if (exponent_is_negative)
+                explicit_exponent = -explicit_exponent;
+
+            result.exponent += explicit_exponent;
+        }();
+    }
+
+    result.valid = true;
+
+    // Round up exactly halfway with truncated non zeros, but don't if it would cascade up
+    if (truncated_non_zero && (result.mantissa & 0xF) != 0xF) {
+        VERIFY(result.mantissa >= 0x1000'0000'0000'0000);
+        result.mantissa |= 1;
+    }
+
+    result.last_parsed = parse_head;
+
+    return result;
+}
+
+template<FloatingPoint T>
+static FloatingPointBuilder build_hex_float(HexFloatParseResult& parse_result)
+{
+    using FloatingPointRepr = FloatingPointInfo<T>;
+    VERIFY(parse_result.mantissa != 0);
+
+    if (parse_result.exponent >= FloatingPointRepr::infinity_exponent())
+        return FloatingPointBuilder::infinity<T>();
+
+    auto leading_zeros = count_leading_zeroes(parse_result.mantissa);
+    u64 normalized_mantissa = parse_result.mantissa << leading_zeros;
+
+    // No need to multiply with some power of 5 here the exponent is already a power of 2.
+
+    u8 upperbit = normalized_mantissa >> 63;
+    FloatingPointBuilder parts;
+    parts.mantissa = normalized_mantissa >> (upperbit + 64 - FloatingPointRepr::mantissa_bits() - 3);
+
+    parts.exponent = parse_result.exponent + upperbit - leading_zeros + FloatingPointRepr::exponent_bias() + 62;
+
+    if (parts.exponent <= 0) {
+        // subnormal
+        if (-parts.exponent + 1 >= 64) {
+            parts.mantissa = 0;
+            parts.exponent = 0;
+            return parts;
+        }
+
+        parts.mantissa >>= -parts.exponent + 1;
+        parts.mantissa += parts.mantissa & 1;
+        parts.mantissa >>= 1;
+
+        if (parts.mantissa < (1ull << FloatingPointRepr::mantissa_bits())) {
+            parts.exponent = 0;
+        } else {
+            parts.exponent = 1;
+        }
+
+        return parts;
+    }
+
+    // Here we don't have to only do this halfway check for some exponents
+    if ((parts.mantissa & 0b11) == 0b01) {
+        // effectively all discard bits from z.high are 0
+        if (normalized_mantissa == (parts.mantissa << (upperbit + 64 - FloatingPointRepr::mantissa_bits() - 3)))
+            parts.mantissa &= ~u64(1);
+    }
+
+    parts.mantissa += parts.mantissa & 1;
+    parts.mantissa >>= 1;
+
+    if (parts.mantissa >= (2ull << FloatingPointRepr::mantissa_bits())) {
+        parts.mantissa = 1ull << FloatingPointRepr::mantissa_bits();
+        ++parts.exponent;
+    }
+
+    parts.mantissa &= ~(1ull << FloatingPointRepr::mantissa_bits());
+
+    if (parts.exponent >= FloatingPointRepr::infinity_exponent()) {
+        parts.mantissa = 0;
+        parts.exponent = FloatingPointRepr::infinity_exponent();
+    }
+
+    return parts;
+}
+
+template<FloatingPoint T>
+FloatingPointParseResults<T> parse_first_hexfloat_until_zero_character(char const* start)
+{
+    using FloatingPointRepr = FloatingPointInfo<T>;
+    auto parse_result = parse_hexfloat(start);
+
+    if (!parse_result.valid)
+        return { nullptr, FloatingPointError::NoOrInvalidInput, __builtin_nan("") };
+
+    FloatingPointParseResults<T> full_result {};
+    full_result.end_ptr = parse_result.last_parsed;
+
+    // We special case this to be able to differentiate between 0 and values rounded down to 0
+
+    if (parse_result.mantissa == 0) {
+        full_result.value = 0.;
+        return full_result;
+    }
+
+    auto result = build_hex_float<T>(parse_result);
+    full_result.value = result.template to_value<T>(parse_result.is_negative);
+
+    if (result.exponent == FloatingPointRepr::infinity_exponent()) {
+        VERIFY(result.mantissa == 0);
+        full_result.error = FloatingPointError::OutOfRange;
+    } else if (result.mantissa == 0 && result.exponent == 0) {
+        full_result.error = FloatingPointError::RoundedDownToZero;
+    }
+
+    return full_result;
+}
+
+template FloatingPointParseResults<double> parse_first_hexfloat_until_zero_character(char const* start);
+
+template FloatingPointParseResults<float> parse_first_hexfloat_until_zero_character(char const* start);
+
 }
diff --git a/AK/FloatingPointStringConversions.h b/AK/FloatingPointStringConversions.h
index 471d042144..4947652446 100644
--- a/AK/FloatingPointStringConversions.h
+++ b/AK/FloatingPointStringConversions.h
@@ -62,7 +62,24 @@ FloatingPointParseResults<T> parse_first_floating_point_until_zero_character(cha
 template<FloatingPoint T = double>
 Optional<T> parse_floating_point_completely(char const* start, char const* end);
 
+/// This function finds the first floating point as a hex float within [start, end).
+/// The accepted format is intentionally as lenient as possible. If your format is
+/// stricter you must validate it first. The format accepts:
+/// - An optional sign, both + and - are supported
+/// - Optionally either 0x or OX
+/// - 0 or more hexadecimal digits, with leading zeros allowed [1]
+/// - A decimal point '.', which can have no digits after it
+/// - 0 or more hexadecimal digits, unless the first digits [1] doesn't have any digits,
+///   then this must have at least one
+/// - An exponent 'p' or 'P' followed by an optional sign '+' or '-' and at least one decimal digit
+/// NOTE: The exponent is _not_ hexadecimal and gives powers of 2 not 16.
+/// This function additionally detects out of range values which have been rounded to
+/// [-]infinity or 0 and gives the next character to read after the floating point.
+template<FloatingPoint T = double>
+FloatingPointParseResults<T> parse_first_hexfloat_until_zero_character(char const* start);
+
 }
 
 using AK::parse_first_floating_point;
+using AK::parse_first_hexfloat_until_zero_character;
 using AK::parse_floating_point_completely;
author	davidot <davidot@serenityos.org>	2022-10-13 02:23:07 +0200
committer	Linus Groh <mail@linusgroh.de>	2022-10-23 15:48:45 +0200
commit	2334cd85a26f6244c7af198f118a6d7e9c437b44 (patch)
tree	df7d9a85e0ed31ebe5c8c98f5fff23731a66e571 /AK
parent	53b7f5e6a11e663c83df8030c3171c5945cb75ec (diff)
download	serenity-2334cd85a26f6244c7af198f118a6d7e9c437b44.zip