LibC: Implement new strtod, accurate up to 8 eps

This strtod implementation is not perfectly accurate, as evidenced by the test (accuracy_strtod.cpp), but it is sufficiently close (up to 8 eps). The main sources of inaccuracy are: - Highly repeated division/multiplication by 'base' (Each operation brings a multiplicative error of 1+2^-53.) - Loss during the initial conversion from long long to double (most prominently, 69294956446009195 should first be rounded to 69294956446009200 and then converted to 69294956446009200.0 and then divided by ten, yielding 6929495644600920.0. Currently, it converts first to double, can't represent 69294956446009195.0, and instead represents 69294956446009190, which eventually yields 6929495644600919.0. Close, but technically wrong.) I believe that these issues can be fixed by rewriting the part at and after double value = digits.number(); and that the loss before that is acceptable. Specifically, losing the exact exponent on overflow is obviously fine. The only other loss occurs when the significant digits overflow a 'long long'. But these store 64(-7ish) bits, and the mantissa of a double only stores 52 bits. With a bit more thinking, one could probably get the error down to 1 or 2 eps. (But not better.) Fixes #1979.
author: Ben Wiederhake <BenWiederhake.GitHub@gmx.de> 2020-05-10 22:43:09 +0200
committer: Andreas Kling <kling@serenityos.org> 2020-05-11 10:52:24 +0200
commit: bc5d5bf75d06c422094a3bd0aafd14be8a84c61a (patch)
tree: 9801ecad6fc9ac2ca6230c992a56a5371ae6f45b /Libraries/LibC
parent: c19d5943f14fe5bf39b885e58ec6d1df2649a07b (diff)
download: serenity-bc5d5bf75d06c422094a3bd0aafd14be8a84c61a.zip
1 files changed, 382 insertions, 125 deletions
diff --git a/Libraries/LibC/stdlib.cpp b/Libraries/LibC/stdlib.cpp
index a823ffaf91..c5d84191cd 100644
--- a/Libraries/LibC/stdlib.cpp
+++ b/Libraries/LibC/stdlib.cpp
@@ -26,6 +26,7 @@
 
 #include <AK/Assertions.h>
 #include <AK/HashMap.h>
+#include <AK/Noncopyable.h>
 #include <AK/StdLibExtras.h>
 #include <AK/Types.h>
 #include <AK/Utf8View.h>
@@ -129,6 +130,130 @@ static inline T strtol_impl(const char* nptr, char** endptr, int base)
     return num;
 }
 
+static void strtons(const char* str, char** endptr)
+{
+    assert(endptr);
+    char* ptr = const_cast<char*>(str);
+    while (isspace(*ptr)) {
+        ptr += 1;
+    }
+    *endptr = ptr;
+}
+
+enum Sign {
+    Negative,
+    Positive,
+};
+
+static Sign strtosign(const char* str, char** endptr)
+{
+    assert(endptr);
+    if (*str == '+') {
+        *endptr = const_cast<char*>(str + 1);
+        return Sign::Positive;
+    } else if (*str == '-') {
+        *endptr = const_cast<char*>(str + 1);
+        return Sign::Negative;
+    } else {
+        *endptr = const_cast<char*>(str);
+        return Sign::Positive;
+    }
+}
+
+enum DigitConsumeDecision {
+    Consumed,
+    PosOverflow,
+    NegOverflow,
+    Invalid,
+};
+
+template<typename T, T min_value, T max_value>
+class NumParser {
+    AK_MAKE_NONMOVABLE(NumParser)
+
+public:
+    NumParser(Sign sign, int base)
+        : m_base(base)
+        , m_num(0)
+        , m_sign(sign)
+    {
+        m_cutoff = positive() ? (max_value / base) : (min_value / base);
+        m_max_digit_after_cutoff = positive() ? (max_value % base) : (min_value % base);
+    }
+
+    int parse_digit(char ch)
+    {
+        int digit;
+        if (isdigit(ch))
+            digit = ch - '0';
+        else if (islower(ch))
+            digit = ch - ('a' - 10);
+        else if (isupper(ch))
+            digit = ch - ('A' - 10);
+        else
+            return -1;
+
+        if (digit >= m_base)
+            return -1;
+
+        return digit;
+    }
+
+    DigitConsumeDecision consume(char ch)
+    {
+        int digit = parse_digit(ch);
+        if (digit == -1)
+            return DigitConsumeDecision::Invalid;
+
+        if (!can_append_digit(digit)) {
+            if (m_sign != Sign::Negative) {
+                return DigitConsumeDecision::PosOverflow;
+            } else {
+                return DigitConsumeDecision::NegOverflow;
+            }
+        }
+
+        m_num *= m_base;
+        m_num += positive() ? digit : -digit;
+
+        return DigitConsumeDecision::Consumed;
+    }
+
+    T number() const { return m_num; };
+
+private:
+    bool can_append_digit(int digit)
+    {
+        const bool is_below_cutoff = positive() ? (m_num < m_cutoff) : (m_num > m_cutoff);
+
+        if (is_below_cutoff) {
+            return true;
+        } else {
+            return m_num == m_cutoff && digit < m_max_digit_after_cutoff;
+        }
+    }
+
+    bool positive() const
+    {
+        return m_sign != Sign::Negative;
+    }
+
+    const T m_base;
+    T m_num;
+    T m_cutoff;
+    int m_max_digit_after_cutoff;
+    Sign m_sign;
+};
+
+typedef NumParser<int, INT_MIN, INT_MAX> IntParser;
+typedef NumParser<long long, LONG_LONG_MIN, LONG_LONG_MAX> LongLongParser;
+
+static bool is_either(char* str, int offset, char lower, char upper)
+{
+    char ch = *(str + offset);
+    return ch == lower || ch == upper;
+}
+
 __attribute__((warn_unused_result)) int __generate_unique_filename(char* pattern)
 {
     size_t length = strlen(pattern);
@@ -308,162 +433,294 @@ int putenv(char* new_var)
 
 double strtod(const char* str, char** endptr)
 {
-    size_t len = strlen(str);
-    size_t weight = 1;
-    int exp_val = 0;
-    double value = 0.0f;
-    double fraction = 0.0f;
-    bool has_sign = false;
-    bool is_negative = false;
-    bool is_fractional = false;
-    bool is_scientific = false;
+    // Parse spaces, sign, and base
+    char* parse_ptr = const_cast<char*>(str);
+    strtons(parse_ptr, &parse_ptr);
+    const Sign sign = strtosign(parse_ptr, &parse_ptr);
+
+    // Parse inf/nan, if applicable.
+    if (is_either(parse_ptr, 0, 'i', 'I')) {
+        if (is_either(parse_ptr, 1, 'n', 'N')) {
+            if (is_either(parse_ptr, 2, 'f', 'F')) {
+                parse_ptr += 3;
+                if (is_either(parse_ptr, 0, 'i', 'I')) {
+                    if (is_either(parse_ptr, 1, 'n', 'N')) {
+                        if (is_either(parse_ptr, 2, 'i', 'I')) {
+                            if (is_either(parse_ptr, 3, 't', 'T')) {
+                                if (is_either(parse_ptr, 4, 'y', 'Y')) {
+                                    parse_ptr += 5;
+                                }
+                            }
+                        }
+                    }
+                }
+                if (endptr)
+                    *endptr = parse_ptr;
+                // Don't set errno to ERANGE here:
+                // The caller may want to distinguish between "input is
+                // literal infinity" and "input is not literal infinity
+                // but did not fit into double".
+                if (sign != Sign::Negative) {
+                    return __builtin_huge_val();
+                } else {
+                    return -__builtin_huge_val();
+                }
+            }
+        }
+    }
+    if (is_either(parse_ptr, 0, 'n', 'N')) {
+        if (is_either(parse_ptr, 1, 'a', 'A')) {
+            if (is_either(parse_ptr, 2, 'n', 'N')) {
+                if (endptr)
+                    *endptr = parse_ptr + 3;
+                errno = ERANGE;
+                if (sign != Sign::Negative) {
+                    return __builtin_nan("");
+                } else {
+                    return -__builtin_nan("");
+                }
+            }
+        }
+    }
 
-    if (str[0] == '-') {
-        is_negative = true;
-        has_sign = true;
+    // Parse base
+    char exponent_lower;
+    char exponent_upper;
+    int base = 10;
+    if (*parse_ptr == '0') {
+        const char base_ch = *(parse_ptr + 1);
+        if (base_ch == 'x' || base_ch == 'X') {
+            base = 16;
+            parse_ptr += 2;
+        }
     }
-    if (str[0] == '+') {
-        has_sign = true;
+
+    if (base == 10) {
+        exponent_lower = 'e';
+        exponent_upper = 'E';
+    } else {
+        exponent_lower = 'p';
+        exponent_upper = 'P';
     }
-    size_t i;
-    for (i = has_sign; i < len; i++) {
 
-        // Looks like we're about to start working on the fractional part
-        if (str[i] == '.') {
-            is_fractional = true;
+    // Parse "digits", possibly keeping track of the exponent offset.
+    // We parse the most significant digits and the position in the
+    // base-`base` representation separately. This allows us to handle
+    // numbers like `0.0000000000000000000000000000000000001234` or
+    // `1234567890123456789012345678901234567890` with ease.
+    LongLongParser digits { sign, base };
+    bool digits_usable = false;
+    bool should_continue = true;
+    bool digits_overflow = false;
+    bool after_decimal = false;
+    int exponent = 0;
+    do {
+        if (!after_decimal && *parse_ptr == '.') {
+            after_decimal = true;
+            parse_ptr += 1;
             continue;
         }
 
-        if (str[i] == 'e' || str[i] == 'E') {
-            if (str[i + 1] == '-')
-                exp_val = -atoi(str + i + 2);
-            else if (str[i + 1] == '+')
-                exp_val = atoi(str + i + 2);
-            else
-                exp_val = atoi(str + i + 1);
+        bool is_a_digit;
+        if (digits_overflow) {
+            is_a_digit = digits.parse_digit(*parse_ptr) != -1;
+        } else {
+            DigitConsumeDecision decision = digits.consume(*parse_ptr);
+            switch (decision) {
+            case DigitConsumeDecision::Consumed:
+                is_a_digit = true;
+                // The very first actual digit must pass here:
+                digits_usable = true;
+                break;
+            case DigitConsumeDecision::PosOverflow:
+                // fallthrough
+            case DigitConsumeDecision::NegOverflow:
+                is_a_digit = true;
+                digits_overflow = true;
+                break;
+            case DigitConsumeDecision::Invalid:
+                is_a_digit = false;
+                break;
+            default:
+                ASSERT_NOT_REACHED();
+            }
+        }
 
-            is_scientific = true;
-            continue;
+        if (is_a_digit) {
+            exponent -= after_decimal ? 1 : 0;
+            exponent += digits_overflow ? 1 : 0;
         }
 
-        if (str[i] < '0' || str[i] > '9' || exp_val != 0)
-            continue;
+        should_continue = is_a_digit;
+        parse_ptr += should_continue;
+    } while (should_continue);
 
-        if (is_fractional) {
-            fraction *= 10;
-            fraction += str[i] - '0';
-            weight *= 10;
+    if (!digits_usable) {
+        // No actual number value available.
+        if (endptr)
+            *endptr = const_cast<char*>(str);
+        return 0.0;
+    }
+
+    // Parse exponent.
+    // We already know the next character is not a digit in the current base,
+    // nor a valid decimal point. Check whether it's an exponent sign.
+    if (*parse_ptr == exponent_lower || *parse_ptr == exponent_upper) {
+        // Need to keep the old parse_ptr around, in case of rollback.
+        char* old_parse_ptr = parse_ptr;
+        parse_ptr += 1;
+
+        // Can't use atol or strtol here: Must accept excessive exponents,
+        // even exponents >64 bits.
+        Sign exponent_sign = strtosign(parse_ptr, &parse_ptr);
+        IntParser exponent_parser { exponent_sign, base };
+        bool exponent_usable = false;
+        bool exponent_overflow = false;
+        should_continue = true;
+        do {
+            bool is_a_digit;
+            if (exponent_overflow) {
+                is_a_digit = exponent_parser.parse_digit(*parse_ptr) != -1;
+            } else {
+                DigitConsumeDecision decision = exponent_parser.consume(*parse_ptr);
+                switch (decision) {
+                case DigitConsumeDecision::Consumed:
+                    is_a_digit = true;
+                    // The very first actual digit must pass here:
+                    exponent_usable = true;
+                    break;
+                case DigitConsumeDecision::PosOverflow:
+                    // fallthrough
+                case DigitConsumeDecision::NegOverflow:
+                    is_a_digit = true;
+                    exponent_overflow = true;
+                    break;
+                case DigitConsumeDecision::Invalid:
+                    is_a_digit = false;
+                    break;
+                default:
+                    ASSERT_NOT_REACHED();
+                }
+            }
+
+            should_continue = is_a_digit;
+            parse_ptr += should_continue;
+        } while (should_continue);
+
+        if (!exponent_usable) {
+            parse_ptr = old_parse_ptr;
+        } else if (exponent_overflow) {
+            // Technically this is wrong. If someone gives us 5GB of digits,
+            // and then an exponent of -5_000_000_000, the resulting exponent
+            // should be around 0.
+            // However, I think it's safe to assume that we never have to deal
+            // with that many digits anyway.
+            if (sign != Sign::Negative) {
+                exponent = INT_MIN;
+            } else {
+                exponent = INT_MAX;
+            }
         } else {
-            value = value * 10;
-            value += str[i] - '0';
+            // Literal exponent is usable and fits in an int.
+            // However, `exponent + exponent_parser.number()` might overflow an int.
+            // This would result in the wrong sign of the exponent!
+            long long new_exponent = static_cast<long long>(exponent) + static_cast<long long>(exponent_parser.number());
+            if (new_exponent < INT_MIN) {
+                exponent = INT_MIN;
+            } else if (new_exponent > INT_MAX) {
+                exponent = INT_MAX;
+            } else {
+                exponent = static_cast<int>(new_exponent);
+            }
         }
     }
 
-    fraction /= weight;
-    value += fraction;
+    // Parsing finished. now we only have to compute the result.
+    if (endptr)
+        *endptr = const_cast<char*>(parse_ptr);
 
-    if (is_scientific) {
-        bool divide = exp_val < 0;
-        if (divide)
-            exp_val *= -1;
+    // If `digits` is zero, we don't even have to look at `exponent`.
+    if (digits.number() == 0) {
+        if (sign != Sign::Negative) {
+            return 0.0;
+        } else {
+            return -0.0;
+        }
+    }
 
-        for (int i = 0; i < exp_val; i++) {
-            if (divide)
-                value /= 10;
-            else
-                value *= 10;
+    // Deal with extreme exponents.
+    // The smallest normal is 2^-1022.
+    // The smallest denormal is 2^-1074.
+    // The largest number in `digits` is 2^63 - 1.
+    // Therefore, if "base^exponent" is smaller than 2^-(1074+63), the result is 0.0 anyway.
+    // This threshold is roughly 5.3566 * 10^-343.
+    // So if the resulting exponent is -344 or lower (closer to -inf),
+    // the result is 0.0 anyway.
+    // We only need to avoid false positives, so we can ignore base 16.
+    if (exponent <= -344) {
+        errno = ERANGE;
+        // Definitely can't be represented more precisely.
+        // I lied, sometimes the result is +0.0, and sometimes -0.0.
+        if (sign != Sign::Negative) {
+            return 0.0;
+        } else {
+            return -0.0;
         }
     }
-    //FIXME: Not entirely sure if this is correct, but seems to work.
-    if (endptr)
-        *endptr = const_cast<char*>(str + i);
-    return is_negative ? -value : value;
+    // The largest normal is 2^+1024-eps.
+    // The smallest number in `digits` is 1.
+    // Therefore, if "base^exponent" is 2^+1024, the result is INF anyway.
+    // This threshold is roughly 1.7977 * 10^-308.
+    // So if the resulting exponent is +309 or higher,
+    // the result is INF anyway.
+    // We only need to avoid false positives, so we can ignore base 16.
+    if (exponent >= 309) {
+        errno = ERANGE;
+        // Definitely can't be represented more precisely.
+        // I lied, sometimes the result is +INF, and sometimes -INF.
+        if (sign != Sign::Negative) {
+            return __builtin_huge_val();
+        } else {
+            return -__builtin_huge_val();
+        }
+    }
+
+    // TODO: If `exponent` is large, this could be made faster.
+    double value = digits.number();
+    if (exponent < 0) {
+        exponent = -exponent;
+        for (int i = 0; i < exponent; ++i) {
+            value /= base;
+        }
+        if (value == -0.0 || value == +0.0) {
+            errno = ERANGE;
+        }
+    } else if (exponent > 0) {
+        for (int i = 0; i < exponent; ++i) {
+            value *= base;
+        }
+        if (value == -__builtin_huge_val() || value == +__builtin_huge_val()) {
+            errno = ERANGE;
+        }
+    }
+
+    return value;
 }
 
 long double strtold(const char* str, char** endptr)
 {
-    (void)str;
-    (void)endptr;
-    dbgprintf("LibC: strtold: '%s'\n", str);
-    ASSERT_NOT_REACHED();
+    assert(sizeof(double) == sizeof(long double));
+    return strtod(str, endptr);
 }
 
 float strtof(const char* str, char** endptr)
 {
-    (void)str;
-    (void)endptr;
-    dbgprintf("LibC: strtof: '%s'\n", str);
-    ASSERT_NOT_REACHED();
+    return strtod(str, endptr);
 }
 
 double atof(const char* str)
 {
-    size_t len = strlen(str);
-    size_t weight = 1;
-    int exp_val = 0;
-    double value = 0.0f;
-    double fraction = 0.0f;
-    bool has_sign = false;
-    bool is_negative = false;
-    bool is_fractional = false;
-    bool is_scientific = false;
-
-    if (str[0] == '-') {
-        is_negative = true;
-        has_sign = true;
-    }
-    if (str[0] == '+') {
-        has_sign = true;
-    }
-
-    for (size_t i = has_sign; i < len; i++) {
-
-        // Looks like we're about to start working on the fractional part
-        if (str[i] == '.') {
-            is_fractional = true;
-            continue;
-        }
-
-        if (str[i] == 'e' || str[i] == 'E') {
-            if (str[i + 1] == '-' || str[i + 1] == '+')
-                exp_val = atoi(str + i + 2);
-            else
-                exp_val = atoi(str + i + 1);
-
-            is_scientific = true;
-            continue;
-        }
-
-        if (str[i] < '0' || str[i] > '9' || exp_val != 0)
-            continue;
-
-        if (is_fractional) {
-            fraction *= 10;
-            fraction += str[i] - '0';
-            weight *= 10;
-        } else {
-            value = value * 10;
-            value += str[i] - '0';
-        }
-    }
-
-    fraction /= weight;
-    value += fraction;
-
-    if (is_scientific) {
-        bool divide = exp_val < 0;
-        if (divide)
-            exp_val *= -1;
-
-        for (int i = 0; i < exp_val; i++) {
-            if (divide)
-                value /= 10;
-            else
-                value *= 10;
-        }
-    }
-
-    return is_negative ? -value : value;
+    return strtod(str, nullptr);
 }
 
 int atoi(const char* str)
author	Ben Wiederhake <BenWiederhake.GitHub@gmx.de>	2020-05-10 22:43:09 +0200
committer	Andreas Kling <kling@serenityos.org>	2020-05-11 10:52:24 +0200
commit	bc5d5bf75d06c422094a3bd0aafd14be8a84c61a (patch)
tree	9801ecad6fc9ac2ca6230c992a56a5371ae6f45b /Libraries/LibC
parent	c19d5943f14fe5bf39b885e58ec6d1df2649a07b (diff)
download	serenity-bc5d5bf75d06c422094a3bd0aafd14be8a84c61a.zip