LibC: Make strtod use the new exact number parser

Because strtod needs to set ERANGE and track the last parsed character, we have to check the resulting value. We also have to check for NaN and infinity in strtod itself, as the new double parser doesn't accept those as floating-point values.
author: davidot <davidot@serenityos.org> 2022-10-12 02:22:54 +0200
committer: Linus Groh <mail@linusgroh.de> 2022-10-23 15:48:45 +0200
commit: 1986b8b066881ec9eda30eb38c755d2aedaf6440 (patch)
tree: bd1e96f2da0a3818d8dc83460728b3bc61e4091c /Userland/Libraries/LibC
parent: 6fd8e96d537e8175f0ca06789a6a19913f561455 (diff)
download: serenity-1986b8b066881ec9eda30eb38c755d2aedaf6440.zip
1 files changed, 164 insertions, 278 deletions
diff --git a/Userland/Libraries/LibC/stdlib.cpp b/Userland/Libraries/LibC/stdlib.cpp
index 81759f4b1f..36ba788412 100644
--- a/Userland/Libraries/LibC/stdlib.cpp
+++ b/Userland/Libraries/LibC/stdlib.cpp
@@ -5,6 +5,8 @@
  */
 
 #include <AK/Assertions.h>
+#include <AK/CharacterTypes.h>
+#include <AK/FloatingPointStringConversions.h>
 #include <AK/HashMap.h>
 #include <AK/Noncopyable.h>
 #include <AK/Random.h>
@@ -182,6 +184,166 @@ inline int generate_unique_filename(char* pattern, size_t suffix_length, Callbac
     return EEXIST;
 }
 
+static bool is_infinity_string(char* parse_ptr, char** endptr)
+{
+    // Matches "inf", optionally extended to "infinity", at parse_ptr; each letter may be given in either case.
+    // On a match, *endptr (if non-null) receives the position just past the matched text and true is returned.
+    // On a mismatch, *endptr is left untouched and false is returned.
+    // The && chains short-circuit, so is_either() is never called for offsets after the first mismatch.
+    bool const has_inf = is_either(parse_ptr, 0, 'i', 'I')
+        && is_either(parse_ptr, 1, 'n', 'N')
+        && is_either(parse_ptr, 2, 'f', 'F');
+    if (!has_inf)
+        return false;
+
+    parse_ptr += 3;
+    // Only a complete "inity" suffix is consumed; a partial one is left unconsumed after "inf".
+    bool const has_inity_suffix = is_either(parse_ptr, 0, 'i', 'I')
+        && is_either(parse_ptr, 1, 'n', 'N')
+        && is_either(parse_ptr, 2, 'i', 'I')
+        && is_either(parse_ptr, 3, 't', 'T')
+        && is_either(parse_ptr, 4, 'y', 'Y');
+    if (has_inity_suffix)
+        parse_ptr += 5;
+
+    if (endptr)
+        *endptr = parse_ptr;
+
+    return true;
+}
+
+static bool is_nan_string(char* parse_ptr, char** endptr)
+{
+    // FIXME: Actually parse (or at least skip) the (n-char-sequenceopt) part
+    // Matches "nan" at parse_ptr, each letter in either case. On a match, *endptr (if non-null)
+    // is set to the character right after those three letters; otherwise it is left untouched.
+    bool const has_nan = is_either(parse_ptr, 0, 'n', 'N')
+        && is_either(parse_ptr, 1, 'a', 'A')
+        && is_either(parse_ptr, 2, 'n', 'N');
+    if (!has_nan)
+        return false;
+
+    if (endptr)
+        *endptr = parse_ptr + 3;
+    return true;
+}
+
+template<FloatingPoint T>
+static T c_str_to_floating_point(char const* str, char** endptr)
+{
+    // Backend shared by strtod() and strtof(): returns the parsed value, stores the end of the parsed text in *endptr
+    // (if non-null; str itself when nothing was converted) and sets errno to ERANGE for out-of-range results and NaN literals.
+    char* parse_ptr = const_cast<char*>(str);
+
+    // Skip an initial, possibly empty, sequence of white-space characters (as specified by isspace())
+    strtons(parse_ptr, &parse_ptr);
+
+    // What follows is a subject sequence: a floating-point constant, or a representation of infinity or NaN
+    if (*parse_ptr == '\0') {
+        if (endptr)
+            *endptr = const_cast<char*>(str);
+        return 0.;
+    }
+
+    bool is_hex = [&] {
+        // A hexfloat must start with either 0x, 0X, -0x or -0X and have something after it
+        char const* parse_head = parse_ptr;
+        if (*parse_head == '-')
+            ++parse_head;
+
+        if (*parse_head != '0')
+            return false;
+
+        ++parse_head;
+        // Accept both spellings of the prefix; testing only 'x' would send "0X..." input down the decimal path.
+        if (*parse_head != 'x' && *parse_head != 'X')
+            return false;
+
+        ++parse_head;
+
+        // We must have at least one digit but it can come after the "decimal" point.
+
+        if (is_ascii_hex_digit(*parse_head))
+            return true;
+
+        if (*parse_head != '.')
+            return false;
+
+        ++parse_head;
+
+        return is_ascii_hex_digit(*parse_head);
+    }();
+
+    AK::FloatingPointParseResults<T> double_parse_result;
+    if (is_hex) {
+        // A 0x or 0X, then a non-empty sequence of hexadecimal digits optionally containing a radix character;
+        // then an optional binary exponent part consisting of the character 'p' or the character 'P',
+        // optionally followed by a '+' or '-' character, and then followed by one or more decimal digits
+
+        double_parse_result = AK::parse_first_hexfloat_until_zero_character<T>(parse_ptr);
+    } else {
+        // A non-empty sequence of decimal digits optionally containing a radix character;
+        // then an optional exponent part consisting of the character 'e' or the character 'E',
+        // optionally followed by a '+' or '-' character, and then followed by one or more decimal digits
+        double_parse_result = AK::parse_first_floating_point_until_zero_character<T>(parse_ptr);
+    }
+
+    if (double_parse_result.error == AK::FloatingPointError::None) {
+        // The only way to get NaN (which we shouldn't) or infinities is rounding up to them so we
+        // have to set ERANGE in that case.
+        if (!__builtin_isfinite(double_parse_result.value))
+            errno = ERANGE;
+
+        if (endptr)
+            *endptr = const_cast<char*>(double_parse_result.end_ptr);
+        return double_parse_result.value;
+    }
+
+    if (double_parse_result.error == AK::FloatingPointError::RoundedDownToZero || double_parse_result.error == AK::FloatingPointError::OutOfRange) {
+        // The parser consumed a number but reported it as rounded down to zero or as out of range.
+        // strtod has to signal both cases with ERANGE while still returning the parser's value and end position.
+        errno = ERANGE;
+
+        if (endptr)
+            *endptr = const_cast<char*>(double_parse_result.end_ptr);
+        return double_parse_result.value;
+    }
+
+    // The only way we are here is if the input was not valid for parse_first_floating_point or not a valid hex float
+    // So the only cases left are:
+    // - One of INF or INFINITY, ignoring case
+    // - One of NAN or NAN(n-char-sequenceopt), ignoring case in the NAN part
+
+    const Sign sign = strtosign(parse_ptr, &parse_ptr);
+
+    if (is_infinity_string(parse_ptr, endptr)) {
+        // Don't set errno to ERANGE here:
+        // The caller may want to distinguish between "input is
+        // literal infinity" and "input is not literal infinity
+        // but did not fit into double".
+        if (sign != Sign::Negative)
+            return static_cast<T>(__builtin_huge_val());
+        else
+            return static_cast<T>(-__builtin_huge_val());
+    }
+
+    if (is_nan_string(parse_ptr, endptr)) {
+        errno = ERANGE;
+        // FIXME: Do we actually want to return "different" NaN bit values?
+        if (sign != Sign::Negative)
+            return static_cast<T>(__builtin_nan(""));
+        else
+            return static_cast<T>(-__builtin_nan(""));
+    }
+
+    // If no conversion could be performed, 0 shall be returned, and errno may be set to [EINVAL].
+    // FIXME: This is in the posix standard linked from strtod but not in implementations of strtod
+    //        and not in the man pages for linux strtod.
+    if (endptr)
+        *endptr = const_cast<char*>(str);
+    return 0;
+}
+
 extern "C" {
 
 void exit(int status)
@@ -398,283 +560,7 @@ void setprogname(char const* progname)
 // https://pubs.opengroup.org/onlinepubs/9699919799/functions/strtod.html
 double strtod(char const* str, char** endptr)
 {
-    // Parse spaces, sign, and base
-    char* parse_ptr = const_cast<char*>(str);
-    strtons(parse_ptr, &parse_ptr);
-    const Sign sign = strtosign(parse_ptr, &parse_ptr);
-
-    // Parse inf/nan, if applicable.
-    if (is_either(parse_ptr, 0, 'i', 'I')) {
-        if (is_either(parse_ptr, 1, 'n', 'N')) {
-            if (is_either(parse_ptr, 2, 'f', 'F')) {
-                parse_ptr += 3;
-                if (is_either(parse_ptr, 0, 'i', 'I')) {
-                    if (is_either(parse_ptr, 1, 'n', 'N')) {
-                        if (is_either(parse_ptr, 2, 'i', 'I')) {
-                            if (is_either(parse_ptr, 3, 't', 'T')) {
-                                if (is_either(parse_ptr, 4, 'y', 'Y')) {
-                                    parse_ptr += 5;
-                                }
-                            }
-                        }
-                    }
-                }
-                if (endptr)
-                    *endptr = parse_ptr;
-                // Don't set errno to ERANGE here:
-                // The caller may want to distinguish between "input is
-                // literal infinity" and "input is not literal infinity
-                // but did not fit into double".
-                if (sign != Sign::Negative) {
-                    return __builtin_huge_val();
-                } else {
-                    return -__builtin_huge_val();
-                }
-            }
-        }
-    }
-    if (is_either(parse_ptr, 0, 'n', 'N')) {
-        if (is_either(parse_ptr, 1, 'a', 'A')) {
-            if (is_either(parse_ptr, 2, 'n', 'N')) {
-                if (endptr)
-                    *endptr = parse_ptr + 3;
-                errno = ERANGE;
-                if (sign != Sign::Negative) {
-                    return __builtin_nan("");
-                } else {
-                    return -__builtin_nan("");
-                }
-            }
-        }
-    }
-
-    // Parse base
-    char exponent_lower;
-    char exponent_upper;
-    int base = 10;
-    if (*parse_ptr == '0') {
-        char const base_ch = *(parse_ptr + 1);
-        if (base_ch == 'x' || base_ch == 'X') {
-            base = 16;
-            parse_ptr += 2;
-        }
-    }
-
-    if (base == 10) {
-        exponent_lower = 'e';
-        exponent_upper = 'E';
-    } else {
-        exponent_lower = 'p';
-        exponent_upper = 'P';
-    }
-
-    // Parse "digits", possibly keeping track of the exponent offset.
-    // We parse the most significant digits and the position in the
-    // base-`base` representation separately. This allows us to handle
-    // numbers like `0.0000000000000000000000000000000000001234` or
-    // `1234567890123456789012345678901234567890` with ease.
-    LongLongParser digits { sign, base };
-    bool digits_usable = false;
-    bool should_continue = true;
-    bool digits_overflow = false;
-    bool after_decimal = false;
-    int exponent = 0;
-    do {
-        if (!after_decimal && *parse_ptr == '.') {
-            after_decimal = true;
-            parse_ptr += 1;
-            continue;
-        }
-
-        bool is_a_digit;
-        if (digits_overflow) {
-            is_a_digit = digits.parse_digit(*parse_ptr) != -1;
-        } else {
-            DigitConsumeDecision decision = digits.consume(*parse_ptr);
-            switch (decision) {
-            case DigitConsumeDecision::Consumed:
-                is_a_digit = true;
-                // The very first actual digit must pass here:
-                digits_usable = true;
-                break;
-            case DigitConsumeDecision::PosOverflow:
-            case DigitConsumeDecision::NegOverflow:
-                is_a_digit = true;
-                digits_overflow = true;
-                break;
-            case DigitConsumeDecision::Invalid:
-                is_a_digit = false;
-                break;
-            default:
-                VERIFY_NOT_REACHED();
-            }
-        }
-
-        if (is_a_digit) {
-            exponent -= after_decimal ? 1 : 0;
-            exponent += digits_overflow ? 1 : 0;
-        }
-
-        should_continue = is_a_digit;
-        parse_ptr += should_continue;
-    } while (should_continue);
-
-    if (!digits_usable) {
-        // No actual number value available.
-        if (endptr)
-            *endptr = const_cast<char*>(str);
-        return 0.0;
-    }
-
-    // Parse exponent.
-    // We already know the next character is not a digit in the current base,
-    // nor a valid decimal point. Check whether it's an exponent sign.
-    if (*parse_ptr == exponent_lower || *parse_ptr == exponent_upper) {
-        // Need to keep the old parse_ptr around, in case of rollback.
-        char* old_parse_ptr = parse_ptr;
-        parse_ptr += 1;
-
-        // Can't use atol or strtol here: Must accept excessive exponents,
-        // even exponents >64 bits.
-        Sign exponent_sign = strtosign(parse_ptr, &parse_ptr);
-        IntParser exponent_parser { exponent_sign, base };
-        bool exponent_usable = false;
-        bool exponent_overflow = false;
-        should_continue = true;
-        do {
-            bool is_a_digit;
-            if (exponent_overflow) {
-                is_a_digit = exponent_parser.parse_digit(*parse_ptr) != -1;
-            } else {
-                DigitConsumeDecision decision = exponent_parser.consume(*parse_ptr);
-                switch (decision) {
-                case DigitConsumeDecision::Consumed:
-                    is_a_digit = true;
-                    // The very first actual digit must pass here:
-                    exponent_usable = true;
-                    break;
-                case DigitConsumeDecision::PosOverflow:
-                case DigitConsumeDecision::NegOverflow:
-                    is_a_digit = true;
-                    exponent_overflow = true;
-                    break;
-                case DigitConsumeDecision::Invalid:
-                    is_a_digit = false;
-                    break;
-                default:
-                    VERIFY_NOT_REACHED();
-                }
-            }
-
-            should_continue = is_a_digit;
-            parse_ptr += should_continue;
-        } while (should_continue);
-
-        if (!exponent_usable) {
-            parse_ptr = old_parse_ptr;
-        } else if (exponent_overflow) {
-            // Technically this is wrong. If someone gives us 5GB of digits,
-            // and then an exponent of -5_000_000_000, the resulting exponent
-            // should be around 0.
-            // However, I think it's safe to assume that we never have to deal
-            // with that many digits anyway.
-            if (sign != Sign::Negative) {
-                exponent = INT_MIN;
-            } else {
-                exponent = INT_MAX;
-            }
-        } else {
-            // Literal exponent is usable and fits in an int.
-            // However, `exponent + exponent_parser.number()` might overflow an int.
-            // This would result in the wrong sign of the exponent!
-            long long new_exponent = static_cast<long long>(exponent) + static_cast<long long>(exponent_parser.number());
-            if (new_exponent < INT_MIN) {
-                exponent = INT_MIN;
-            } else if (new_exponent > INT_MAX) {
-                exponent = INT_MAX;
-            } else {
-                exponent = static_cast<int>(new_exponent);
-            }
-        }
-    }
-
-    // Parsing finished. now we only have to compute the result.
-    if (endptr)
-        *endptr = const_cast<char*>(parse_ptr);
-
-    // If `digits` is zero, we don't even have to look at `exponent`.
-    if (digits.number() == 0) {
-        if (sign != Sign::Negative) {
-            return 0.0;
-        } else {
-            return -0.0;
-        }
-    }
-
-    // Deal with extreme exponents.
-    // The smallest normal is 2^-1022.
-    // The smallest denormal is 2^-1074.
-    // The largest number in `digits` is 2^63 - 1.
-    // Therefore, if "base^exponent" is smaller than 2^-(1074+63), the result is 0.0 anyway.
-    // This threshold is roughly 5.3566 * 10^-343.
-    // So if the resulting exponent is -344 or lower (closer to -inf),
-    // the result is 0.0 anyway.
-    // We only need to avoid false positives, so we can ignore base 16.
-    if (exponent <= -344) {
-        errno = ERANGE;
-        // Definitely can't be represented more precisely.
-        // I lied, sometimes the result is +0.0, and sometimes -0.0.
-        if (sign != Sign::Negative) {
-            return 0.0;
-        } else {
-            return -0.0;
-        }
-    }
-    // The largest normal is 2^+1024-eps.
-    // The smallest number in `digits` is 1.
-    // Therefore, if "base^exponent" is 2^+1024, the result is INF anyway.
-    // This threshold is roughly 1.7977 * 10^-308.
-    // So if the resulting exponent is +309 or higher,
-    // the result is INF anyway.
-    // We only need to avoid false positives, so we can ignore base 16.
-    if (exponent >= 309) {
-        errno = ERANGE;
-        // Definitely can't be represented more precisely.
-        // I lied, sometimes the result is +INF, and sometimes -INF.
-        if (sign != Sign::Negative) {
-            return __builtin_huge_val();
-        } else {
-            return -__builtin_huge_val();
-        }
-    }
-
-    // TODO: If `exponent` is large, this could be made faster.
-    double value = digits.number();
-    double scale = 1;
-
-    if (exponent < 0) {
-        exponent = -exponent;
-        for (int i = 0; i < min(exponent, 300); ++i) {
-            scale *= base;
-        }
-        value /= scale;
-        for (int i = 300; i < exponent; i++) {
-            value /= base;
-        }
-        if (value == -0.0 || value == +0.0) {
-            errno = ERANGE;
-        }
-    } else if (exponent > 0) {
-        for (int i = 0; i < exponent; ++i) {
-            scale *= base;
-        }
-        value *= scale;
-        if (value == -__builtin_huge_val() || value == +__builtin_huge_val()) {
-            errno = ERANGE;
-        }
-    }
-
-    return value;
+    return c_str_to_floating_point<double>(str, endptr); // All parsing, endptr and errno handling lives in the shared template.
 }
 
 // https://pubs.opengroup.org/onlinepubs/9699919799/functions/strtold.html
@@ -687,7 +573,7 @@ long double strtold(char const* str, char** endptr)
 // https://pubs.opengroup.org/onlinepubs/9699919799/functions/strtof.html
 float strtof(char const* str, char** endptr)
 {
-    return strtod(str, endptr);
+    return c_str_to_floating_point<float>(str, endptr); // Parse as float directly instead of narrowing strtod()'s double result.
 }
 
 // https://pubs.opengroup.org/onlinepubs/9699919799/functions/atof.html
author	davidot <davidot@serenityos.org>	2022-10-12 02:22:54 +0200
committer	Linus Groh <mail@linusgroh.de>	2022-10-23 15:48:45 +0200
commit	1986b8b066881ec9eda30eb38c755d2aedaf6440 (patch)
tree	bd1e96f2da0a3818d8dc83460728b3bc61e4091c /Userland/Libraries/LibC
parent	6fd8e96d537e8175f0ca06789a6a19913f561455 (diff)
download	serenity-1986b8b066881ec9eda30eb38c755d2aedaf6440.zip