@@ -2406,6 +2406,13 @@ math_fmod_impl(PyObject *module, double x, double y)
24062406/*
24072407Given an *n* length *vec* of values and a value *max*, compute:
24082408
2409+ sqrt(sum((x * scale) ** 2 for x in vec)) / scale
2410+
2411+ where scale is the first power of two
2412+ greater than max.
2413+
2414+ or compute:
2415+
24092416 max * sqrt(sum((x / max) ** 2 for x in vec))
24102417
24112418The value of the *max* variable must be non-negative and
@@ -2425,19 +2432,25 @@ The *csum* variable tracks the cumulative sum and *frac* tracks
24252432the cumulative fractional errors at each step. Since this
24262433variant assumes that |csum| >= |x| at each step, we establish
24272434the precondition by starting the accumulation from 1.0 which
2428- represents the largest possible value of (x/max)**2.
2435+ represents the largest possible value of (x*scale)**2 or (x /max)**2.
24292436
24302437After the loop is finished, the initial 1.0 is subtracted out
24312438for a net zero effect on the final sum. Since *csum* will be
24322439greater than 1.0, the subtraction of 1.0 will not cause
24332440fractional digits to be dropped from *csum*.
24342441
2442+ To get the full benefit from compensated summation, the
2443+ largest addend should be in the range: 0.5 <= x <= 1.0.
2444+ Accordingly, scaling or division by *max* should not be skipped
2445+ even if not otherwise needed to prevent overflow or loss of precision.
2446+
24352447*/
24362448
24372449static inline double
24382450vector_norm (Py_ssize_t n , double * vec , double max , int found_nan )
24392451{
2440- double x , csum = 1.0 , oldcsum , frac = 0.0 ;
2452+ double x , csum = 1.0 , oldcsum , frac = 0.0 , scale ;
2453+ int max_e ;
24412454 Py_ssize_t i ;
24422455
24432456 if (Py_IS_INFINITY (max )) {
@@ -2449,14 +2462,36 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
24492462 if (max == 0.0 || n <= 1 ) {
24502463 return max ;
24512464 }
2465+ frexp (max , & max_e );
2466+ if (max_e >= -1023 ) {
2467+ scale = ldexp (1.0 , - max_e );
2468+ assert (max * scale >= 0.5 );
2469+ assert (max * scale < 1.0 );
2470+ for (i = 0 ; i < n ; i ++ ) {
2471+ x = vec [i ];
2472+ assert (Py_IS_FINITE (x ) && fabs (x ) <= max );
2473+ x *= scale ;
2474+ x = x * x ;
2475+ assert (x <= 1.0 );
2476+ assert (csum >= x );
2477+ oldcsum = csum ;
2478+ csum += x ;
2479+ frac += (oldcsum - csum ) + x ;
2480+ }
2481+ return sqrt (csum - 1.0 + frac ) / scale ;
2482+ }
2483+ /* When max_e < -1023, ldexp(1.0, -max_e) overflows.
2484+ So instead of multiplying by a scale, we just divide by *max*.
2485+ */
24522486 for (i = 0 ; i < n ; i ++ ) {
24532487 x = vec [i ];
24542488 assert (Py_IS_FINITE (x ) && fabs (x ) <= max );
24552489 x /= max ;
24562490 x = x * x ;
2491+ assert (x <= 1.0 );
2492+ assert (csum >= x );
24572493 oldcsum = csum ;
24582494 csum += x ;
2459- assert (csum >= x );
24602495 frac += (oldcsum - csum ) + x ;
24612496 }
24622497 return max * sqrt (csum - 1.0 + frac );
0 commit comments