From a1aa990b0b034cd33f3771520b070288f5156d88 Mon Sep 17 00:00:00 2001 From: Elliott Sales de Andrade Date: Thu, 9 Jan 2020 01:54:33 -0500 Subject: [PATCH] Improve `fread` for very small or very large fp numbers. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On non-x86 architectures (armv7hl and ppc64le), test 1018 fails with a slightly differently parsed number. In base R, `R_strtod` handles small numbers by pre-dividing numerator and divisor before applying the exponent part (instead of dividing all together.) However, it does not use a lookup table. For `fread`, trim the exponent lookup table from ±350 to ±300, and if anything is in that removed range, do two multiplications instead. This results in approximately the same effect as in base R. Removing some of the range from the lookup table also fixes several warnings such as: ``` freadLookups.h:57:1: warning: floating constant truncated to zero [-Woverflow] 57 | 1.0E-324L, | ^~~~~~~~~ freadLookups.h:690:1: warning: floating constant exceeds range of 'long double' [-Woverflow] 690 | 1.0E309L, | ^~~~~~~~ ``` See #3492 and #4032. --- NEWS.md | 2 + inst/tests/tests.Rraw | 3 +- src/fread.c | 26 ++++++++--- src/fread.h | 2 +- src/freadLookups.h | 104 +----------------------------------------- 5 files changed, 26 insertions(+), 111 deletions(-) diff --git a/NEWS.md b/NEWS.md index 2d607e9bad..36d6a171f0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -87,6 +87,8 @@ unit = "s") 6. `all.equal(DT, y)` no longer errors when `y` is not a data.table, [#4042](https://github.com/Rdatatable/data.table/issues/4042). Thanks to @d-sci for reporting and the PR. +7. `fread` improves handling of very small (<1e-300) or very large (>1e+300) floating point numbers on non-x86 architectures (specifically ppc64le and armv7hl). Thanks to @QuLogic for reporting and fixing, [PR#4165](https://github.com/Rdatatable/data.table/pull/4165). + ## NOTES 1. `as.IDate`, `as.ITime`, `second`, `minute`, and `hour` now recognize UTC equivalents for speed: GMT, GMT-0, GMT+0, GMT0, Etc/GMT, and Etc/UTC, [#4116](https://github.com/Rdatatable/data.table/issues/4116). diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 03152f270f..ed62cbac23 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -2929,7 +2929,8 @@ test(1017.2, fread(f, integer64="character"), DT) unlink(f) # ERANGE errno handled, #4879 -test(1018, identical(fread("1.46761e-313\n"), data.table(V1=1.46761e-313))) +test(1018.1, identical(fread("1.46761e-313\n"), data.table(V1=1.46761e-313))) +test(1018.2, identical(fread("1.46761e+313\n"), data.table(V1=1.46761e+313))) test(1019, fread("A\n1.23456789123456789123456999\n"), data.table(A=1.234567891234568)) # crash assigning to row 0, #2754 diff --git a/src/fread.c b/src/fread.c index 69082c5d95..c94aeac069 100644 --- a/src/fread.c +++ b/src/fread.c @@ -648,9 +648,9 @@ static void StrtoI64(FieldParseContext *ctx) // TODO: review ERANGE checks and tests; that range outside [1.7e-308,1.7e+308] coerces to [0.0,Inf] /* f = "~/data.table/src/freadLookups.h" -cat("const long double pow10lookup[701] = {\n", file=f, append=FALSE) -for (i in (-350):(349)) cat("1.0E",i,"L,\n", sep="", file=f, append=TRUE) -cat("1.0E350L\n};\n", file=f, append=TRUE) +cat("const long double pow10lookup[601] = {\n", file=f, append=FALSE) +for (i in (-300):(299)) cat("1.0E",i,"L,\n", sep="", file=f, append=TRUE) +cat("1.0E300L\n};\n", file=f, append=TRUE) */ @@ -767,11 +767,23 @@ static void parse_double_regular(FieldParseContext *ctx) } e += Eneg? -E : E; } - e += 350; // lookup table is arranged from -350 (0) to +350 (700) - if (e<0 || e>700) goto fail; + if (e<-350 || e>350) goto fail; - double r = (double)((long double)acc * pow10lookup[e]); - *target = neg? -r : r; + long double r = (long double)acc; + if (e < -300 || e > 300) { + // Handle extra precision by pre-multiplying the result by pow(10, extra), + // and then remove extra from e. + // This avoids having to store very small or very large constants that may + // fail to be encoded by the compiler, even though the values can actually + // be stored correctly. + int_fast8_t extra = e < 0 ? e + 300 : e - 300; + r *= pow10lookup[extra + 300]; + e -= extra; + } + e += 300; // lookup table is arranged from -300 (0) to +300 (600) + + r *= pow10lookup[e]; + *target = (double)(neg? -r : r); *(ctx->ch) = ch; return; diff --git a/src/fread.h b/src/fread.h index ec230c6986..64150d66f4 100644 --- a/src/fread.h +++ b/src/fread.h @@ -30,7 +30,7 @@ typedef enum { extern int8_t typeSize[NUMTYPE]; extern const char typeName[NUMTYPE][10]; -extern const long double pow10lookup[701]; +extern const long double pow10lookup[601]; extern const uint8_t hexdigits[256]; diff --git a/src/freadLookups.h b/src/freadLookups.h index faeb4ade54..bb736a60ac 100644 --- a/src/freadLookups.h +++ b/src/freadLookups.h @@ -27,57 +27,7 @@ const uint8_t hexdigits[256] = { }; -const long double pow10lookup[701] = { -1.0E-350L, -1.0E-349L, -1.0E-348L, -1.0E-347L, -1.0E-346L, -1.0E-345L, -1.0E-344L, -1.0E-343L, -1.0E-342L, -1.0E-341L, -1.0E-340L, -1.0E-339L, -1.0E-338L, -1.0E-337L, -1.0E-336L, -1.0E-335L, -1.0E-334L, -1.0E-333L, -1.0E-332L, -1.0E-331L, -1.0E-330L, -1.0E-329L, -1.0E-328L, -1.0E-327L, -1.0E-326L, -1.0E-325L, -1.0E-324L, -1.0E-323L, -1.0E-322L, -1.0E-321L, -1.0E-320L, -1.0E-319L, -1.0E-318L, -1.0E-317L, -1.0E-316L, -1.0E-315L, -1.0E-314L, -1.0E-313L, -1.0E-312L, -1.0E-311L, -1.0E-310L, -1.0E-309L, -1.0E-308L, -1.0E-307L, -1.0E-306L, -1.0E-305L, -1.0E-304L, -1.0E-303L, -1.0E-302L, -1.0E-301L, +const long double pow10lookup[601] = { 1.0E-300L, 1.0E-299L, 1.0E-298L, @@ -678,57 +628,7 @@ const long double pow10lookup[701] = { 1.0E297L, 1.0E298L, 1.0E299L, -1.0E300L, -1.0E301L, -1.0E302L, -1.0E303L, -1.0E304L, -1.0E305L, -1.0E306L, -1.0E307L, -1.0E308L, -1.0E309L, -1.0E310L, -1.0E311L, -1.0E312L, -1.0E313L, -1.0E314L, -1.0E315L, -1.0E316L, -1.0E317L, -1.0E318L, -1.0E319L, -1.0E320L, -1.0E321L, -1.0E322L, -1.0E323L, -1.0E324L, -1.0E325L, -1.0E326L, -1.0E327L, -1.0E328L, -1.0E329L, -1.0E330L, -1.0E331L, -1.0E332L, -1.0E333L, -1.0E334L, -1.0E335L, -1.0E336L, -1.0E337L, -1.0E338L, -1.0E339L, -1.0E340L, -1.0E341L, -1.0E342L, -1.0E343L, -1.0E344L, -1.0E345L, -1.0E346L, -1.0E347L, -1.0E348L, -1.0E349L, -1.0E350L +1.0E300L }; #endif