libgcc/config/libbid/bid64_sqrt.c

   1 /* Copyright (C) 2007  Free Software Foundation, Inc.
   2
   3 This file is part of GCC.
   4
   5 GCC is free software; you can redistribute it and/or modify it under
   6 the terms of the GNU General Public License as published by the Free
   7 Software Foundation; either version 2, or (at your option) any later
   8 version.
   9
  10 In addition to the permissions in the GNU General Public License, the
  11 Free Software Foundation gives you unlimited permission to link the
  12 compiled version of this file into combinations with other programs,
  13 and to distribute those combinations without any restriction coming
  14 from the use of this file.  (The General Public License restrictions
  15 do apply in other respects; for example, they cover modification of
  16 the file, and distribution when not linked into a combine
  17 executable.)
  18
  19 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  20 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  21 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  22 for more details.
  23
  24 You should have received a copy of the GNU General Public License
  25 along with GCC; see the file COPYING.  If not, write to the Free
  26 Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
  27 02110-1301, USA.  */
  28
  29 /*****************************************************************************
  30  *    BID64 square root
  31  *****************************************************************************
  32  *
  33  *  Algorithm description:
  34  *
  35  *  if(exponent_x is odd)
  36  *     scale coefficient_x by 10, adjust exponent
  37  *  - get lower estimate for number of digits in coefficient_x
  38  *  - scale coefficient x to between 31 and 33 decimal digits
  39  *  - in parallel, check for exact case and return if true
  40  *  - get high part of result coefficient using double precision sqrt
  41  *  - compute remainder and refine coefficient in one iteration (which
  42  *                                 modifies it by at most 1)
  43  *  - result exponent is easy to compute from the adjusted arg. exponent
  44  *
  45  ****************************************************************************/
  46
  47 #include "bid_internal.h"
  48 #include "bid_sqrt_macros.h"
  49 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
  50 #include <fenv.h>
  51
  52 #define FE_ALL_FLAGS FE_INVALID|FE_DIVBYZERO|FE_OVERFLOW|FE_UNDERFLOW|FE_INEXACT
  53 #endif
  54
  55 extern double sqrt (double);
  56
  57 #if DECIMAL_CALL_BY_REFERENCE
  58
  59 void
  60 bid64_sqrt (UINT64 * pres,
  61             UINT64 *
  62             px _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
  63             _EXC_INFO_PARAM) {
  64   UINT64 x;
  65 #else
  66
  67 UINT64
  68 bid64_sqrt (UINT64 x _RND_MODE_PARAM _EXC_FLAGS_PARAM
  69             _EXC_MASKS_PARAM _EXC_INFO_PARAM) {
  70 #endif
  71   UINT128 CA, CT;
  72   UINT64 sign_x, coefficient_x;
  73   UINT64 Q, Q2, A10, C4, R, R2, QE, res;
  74   SINT64 D;
  75   int_double t_scale;
  76   int_float tempx;
  77   double da, dq, da_h, da_l, dqe;
  78   int exponent_x, exponent_q, bin_expon_cx;
  79   int digits_x;
  80   int scale;
  81 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
  82   fexcept_t binaryflags = 0;
  83 #endif
  84
  85 #if DECIMAL_CALL_BY_REFERENCE
  86 #if !DECIMAL_GLOBAL_ROUNDING
  87   _IDEC_round rnd_mode = *prnd_mode;
  88 #endif
  89   x = *px;
  90 #endif
  91
  92   // unpack arguments, check for NaN or Infinity
  93   if (!unpack_BID64 (&sign_x, &exponent_x, &coefficient_x, x)) {
  94     // x is Inf. or NaN or 0
  95     if ((x & INFINITY_MASK64) == INFINITY_MASK64) {
  96       res = coefficient_x;
  97       if ((coefficient_x & SSNAN_MASK64) == SINFINITY_MASK64)   // -Infinity
  98       {
  99         res = NAN_MASK64;
 100 #ifdef SET_STATUS_FLAGS
 101         __set_status_flags (pfpsf, INVALID_EXCEPTION);
 102 #endif
 103       }
 104 #ifdef SET_STATUS_FLAGS
 105       if ((x & SNAN_MASK64) == SNAN_MASK64)     // sNaN
 106         __set_status_flags (pfpsf, INVALID_EXCEPTION);
 107 #endif
 108       BID_RETURN (res & QUIET_MASK64);
 109     }
 110     // x is 0
 111     exponent_x = (exponent_x + DECIMAL_EXPONENT_BIAS) >> 1;
 112     res = sign_x | (((UINT64) exponent_x) << 53);
 113     BID_RETURN (res);
 114   }
 115   // x<0?
 116   if (sign_x && coefficient_x) {
 117     res = NAN_MASK64;
 118 #ifdef SET_STATUS_FLAGS
 119     __set_status_flags (pfpsf, INVALID_EXCEPTION);
 120 #endif
 121     BID_RETURN (res);
 122   }
 123 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
 124   (void) fegetexceptflag (&binaryflags, FE_ALL_FLAGS);
 125 #endif
 126   //--- get number of bits in the coefficient of x ---
 127   tempx.d = (float) coefficient_x;
 128   bin_expon_cx = ((tempx.i >> 23) & 0xff) - 0x7f;
 129   digits_x = estimate_decimal_digits[bin_expon_cx];
 130   // add test for range
 131   if (coefficient_x >= power10_index_binexp[bin_expon_cx])
 132     digits_x++;
 133
 134   A10 = coefficient_x;
 135   if (exponent_x & 1) {
 136     A10 = (A10 << 2) + A10;
 137     A10 += A10;
 138   }
 139
 140   dqe = sqrt ((double) A10);
 141   QE = (UINT32) dqe;
 142   if (QE * QE == A10) {
 143     res =
 144       very_fast_get_BID64 (0, (exponent_x + DECIMAL_EXPONENT_BIAS) >> 1,
 145                            QE);
 146 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
 147     (void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS);
 148 #endif
 149     BID_RETURN (res);
 150   }
 151   // if exponent is odd, scale coefficient by 10
 152   scale = 31 - digits_x;
 153   exponent_q = exponent_x - scale;
 154   scale += (exponent_q & 1);    // exp. bias is even
 155
 156   CT = power10_table_128[scale];
 157   __mul_64x128_short (CA, coefficient_x, CT);
 158
 159   // 2^64
 160   t_scale.i = 0x43f0000000000000ull;
 161   // convert CA to DP
 162   da_h = CA.w[1];
 163   da_l = CA.w[0];
 164   da = da_h * t_scale.d + da_l;
 165
 166   dq = sqrt (da);
 167
 168   Q = (UINT64) dq;
 169
 170   // get sign(sqrt(CA)-Q)
 171   R = CA.w[0] - Q * Q;
 172   R = ((SINT64) R) >> 63;
 173   D = R + R + 1;
 174
 175   exponent_q = (exponent_q + DECIMAL_EXPONENT_BIAS) >> 1;
 176
 177 #ifdef SET_STATUS_FLAGS
 178   __set_status_flags (pfpsf, INEXACT_EXCEPTION);
 179 #endif
 180
 181 #ifndef IEEE_ROUND_NEAREST
 182 #ifndef IEEE_ROUND_NEAREST_TIES_AWAY
 183   if (!((rnd_mode) & 3)) {
 184 #endif
 185 #endif
 186
 187     // midpoint to check
 188     Q2 = Q + Q + D;
 189     C4 = CA.w[0] << 2;
 190
 191     // get sign(-sqrt(CA)+Midpoint)
 192     R2 = Q2 * Q2 - C4;
 193     R2 = ((SINT64) R2) >> 63;
 194
 195     // adjust Q if R!=R2
 196     Q += (D & (R ^ R2));
 197 #ifndef IEEE_ROUND_NEAREST
 198 #ifndef IEEE_ROUND_NEAREST_TIES_AWAY
 199   } else {
 200     C4 = CA.w[0];
 201     Q += D;
 202     if ((SINT64) (Q * Q - C4) > 0)
 203       Q--;
 204     if (rnd_mode == ROUNDING_UP)
 205       Q++;
 206   }
 207 #endif
 208 #endif
 209
 210   res = fast_get_BID64 (0, exponent_q, Q);
 211 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
 212   (void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS);
 213 #endif
 214   BID_RETURN (res);
 215 }
 216
 217
 218 TYPE0_FUNCTION_ARG1 (UINT64, bid64q_sqrt, x)
 219
 220      UINT256 M256, C4, C8;
 221      UINT128 CX, CX2, A10, S2, T128, CS, CSM, CS2, C256, CS1,
 222        mul_factor2_long = { {0x0ull, 0x0ull} }, QH, Tmp, TP128, Qh, Ql;
 223 UINT64 sign_x, Carry, B10, res, mul_factor, mul_factor2 = 0x0ull, CS0;
 224 SINT64 D;
 225 int_float fx, f64;
 226 int exponent_x, bin_expon_cx, done = 0;
 227 int digits, scale, exponent_q = 0, exact = 1, amount, extra_digits;
 228 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
 229 fexcept_t binaryflags = 0;
 230 #endif
 231
 232         // unpack arguments, check for NaN or Infinity
 233 if (!unpack_BID128_value (&sign_x, &exponent_x, &CX, x)) {
 234   res = CX.w[1];
 235   // NaN ?
 236   if ((x.w[1] & 0x7c00000000000000ull) == 0x7c00000000000000ull) {
 237 #ifdef SET_STATUS_FLAGS
 238     if ((x.w[1] & 0x7e00000000000000ull) == 0x7e00000000000000ull)      // sNaN
 239       __set_status_flags (pfpsf, INVALID_EXCEPTION);
 240 #endif
 241     Tmp.w[1] = (CX.w[1] & 0x00003fffffffffffull);
 242     Tmp.w[0] = CX.w[0];
 243     TP128 = reciprocals10_128[18];
 244     __mul_128x128_full (Qh, Ql, Tmp, TP128);
 245     amount = recip_scale[18];
 246     __shr_128 (Tmp, Qh, amount);
 247     res = (CX.w[1] & 0xfc00000000000000ull) | Tmp.w[0];
 248     BID_RETURN (res);
 249   }
 250   // x is Infinity?
 251   if ((x.w[1] & 0x7800000000000000ull) == 0x7800000000000000ull) {
 252     if (sign_x) {
 253       // -Inf, return NaN
 254       res = 0x7c00000000000000ull;
 255 #ifdef SET_STATUS_FLAGS
 256       __set_status_flags (pfpsf, INVALID_EXCEPTION);
 257 #endif
 258     }
 259     BID_RETURN (res);
 260   }
 261   // x is 0 otherwise
 262
 263   exponent_x =
 264     ((exponent_x - DECIMAL_EXPONENT_BIAS_128) >> 1) +
 265     DECIMAL_EXPONENT_BIAS;
 266   if (exponent_x < 0)
 267     exponent_x = 0;
 268   if (exponent_x > DECIMAL_MAX_EXPON_64)
 269     exponent_x = DECIMAL_MAX_EXPON_64;
 270   //res= sign_x | (((UINT64)exponent_x)<<53);
 271   res = get_BID64 (sign_x, exponent_x, 0, rnd_mode, pfpsf);
 272   BID_RETURN (res);
 273 }
 274 if (sign_x) {
 275   res = 0x7c00000000000000ull;
 276 #ifdef SET_STATUS_FLAGS
 277   __set_status_flags (pfpsf, INVALID_EXCEPTION);
 278 #endif
 279   BID_RETURN (res);
 280 }
 281 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
 282 (void) fegetexceptflag (&binaryflags, FE_ALL_FLAGS);
 283 #endif
 284
 285            // 2^64
 286 f64.i = 0x5f800000;
 287
 288            // fx ~ CX
 289 fx.d = (float) CX.w[1] * f64.d + (float) CX.w[0];
 290 bin_expon_cx = ((fx.i >> 23) & 0xff) - 0x7f;
 291 digits = estimate_decimal_digits[bin_expon_cx];
 292
 293 A10 = CX;
 294 if (exponent_x & 1) {
 295   A10.w[1] = (CX.w[1] << 3) | (CX.w[0] >> 61);
 296   A10.w[0] = CX.w[0] << 3;
 297   CX2.w[1] = (CX.w[1] << 1) | (CX.w[0] >> 63);
 298   CX2.w[0] = CX.w[0] << 1;
 299   __add_128_128 (A10, A10, CX2);
 300 }
 301
 302 C256.w[1] = A10.w[1];
 303 C256.w[0] = A10.w[0];
 304 CS.w[0] = short_sqrt128 (A10);
 305 CS.w[1] = 0;
 306 mul_factor = 0;
 307            // check for exact result
 308 if (CS.w[0] < 10000000000000000ull) {
 309   if (CS.w[0] * CS.w[0] == A10.w[0]) {
 310     __sqr64_fast (S2, CS.w[0]);
 311     if (S2.w[1] == A10.w[1])    // && S2.w[0]==A10.w[0])
 312     {
 313       res =
 314         get_BID64 (0,
 315                    ((exponent_x - DECIMAL_EXPONENT_BIAS_128) >> 1) +
 316                    DECIMAL_EXPONENT_BIAS, CS.w[0], rnd_mode, pfpsf);
 317 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
 318       (void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS);
 319 #endif
 320       BID_RETURN (res);
 321     }
 322   }
 323   if (CS.w[0] >= 1000000000000000ull) {
 324     done = 1;
 325     exponent_q = exponent_x;
 326     C256.w[1] = A10.w[1];
 327     C256.w[0] = A10.w[0];
 328   }
 329 #ifdef SET_STATUS_FLAGS
 330   __set_status_flags (pfpsf, INEXACT_EXCEPTION);
 331 #endif
 332   exact = 0;
 333 } else {
 334   B10 = 0x3333333333333334ull;
 335   __mul_64x64_to_128_full (CS2, CS.w[0], B10);
 336   CS0 = CS2.w[1] >> 1;
 337   if (CS.w[0] != ((CS0 << 3) + (CS0 << 1))) {
 338 #ifdef SET_STATUS_FLAGS
 339     __set_status_flags (pfpsf, INEXACT_EXCEPTION);
 340 #endif
 341     exact = 0;
 342   }
 343   done = 1;
 344   CS.w[0] = CS0;
 345   exponent_q = exponent_x + 2;
 346   mul_factor = 10;
 347   mul_factor2 = 100;
 348   if (CS.w[0] >= 10000000000000000ull) {
 349     __mul_64x64_to_128_full (CS2, CS.w[0], B10);
 350     CS0 = CS2.w[1] >> 1;
 351     if (CS.w[0] != ((CS0 << 3) + (CS0 << 1))) {
 352 #ifdef SET_STATUS_FLAGS
 353       __set_status_flags (pfpsf, INEXACT_EXCEPTION);
 354 #endif
 355       exact = 0;
 356     }
 357     exponent_q += 2;
 358     CS.w[0] = CS0;
 359     mul_factor = 100;
 360     mul_factor2 = 10000;
 361   }
 362   if (exact) {
 363     CS0 = CS.w[0] * mul_factor;
 364     __sqr64_fast (CS1, CS0)
 365       if ((CS1.w[0] != A10.w[0]) || (CS1.w[1] != A10.w[1])) {
 366 #ifdef SET_STATUS_FLAGS
 367       __set_status_flags (pfpsf, INEXACT_EXCEPTION);
 368 #endif
 369       exact = 0;
 370     }
 371   }
 372 }
 373
 374 if (!done) {
 375   // get number of digits in CX
 376   D = CX.w[1] - power10_index_binexp_128[bin_expon_cx].w[1];
 377   if (D > 0
 378       || (!D && CX.w[0] >= power10_index_binexp_128[bin_expon_cx].w[0]))
 379     digits++;
 380
 381   // if exponent is odd, scale coefficient by 10
 382   scale = 31 - digits;
 383   exponent_q = exponent_x - scale;
 384   scale += (exponent_q & 1);    // exp. bias is even
 385
 386   T128 = power10_table_128[scale];
 387   __mul_128x128_low (C256, CX, T128);
 388
 389
 390   CS.w[0] = short_sqrt128 (C256);
 391 }
 392    //printf("CS=%016I64x\n",CS.w[0]);
 393
 394 exponent_q =
 395   ((exponent_q - DECIMAL_EXPONENT_BIAS_128) >> 1) +
 396   DECIMAL_EXPONENT_BIAS;
 397 if ((exponent_q < 0) && (exponent_q + MAX_FORMAT_DIGITS >= 0)) {
 398   extra_digits = -exponent_q;
 399   exponent_q = 0;
 400
 401   // get coeff*(2^M[extra_digits])/10^extra_digits
 402   __mul_64x64_to_128 (QH, CS.w[0], reciprocals10_64[extra_digits]);
 403
 404   // now get P/10^extra_digits: shift Q_high right by M[extra_digits]-128
 405   amount = short_recip_scale[extra_digits];
 406
 407   CS0 = QH.w[1] >> amount;
 408
 409 #ifdef SET_STATUS_FLAGS
 410   if (exact) {
 411     if (CS.w[0] != CS0 * power10_table_128[extra_digits].w[0])
 412       exact = 0;
 413   }
 414   if (!exact)
 415     __set_status_flags (pfpsf, UNDERFLOW_EXCEPTION | INEXACT_EXCEPTION);
 416 #endif
 417
 418   CS.w[0] = CS0;
 419   if (!mul_factor)
 420     mul_factor = 1;
 421   mul_factor *= power10_table_128[extra_digits].w[0];
 422   __mul_64x64_to_128 (mul_factor2_long, mul_factor, mul_factor);
 423   if (mul_factor2_long.w[1])
 424     mul_factor2 = 0;
 425   else
 426     mul_factor2 = mul_factor2_long.w[1];
 427 }
 428            // 4*C256
 429 C4.w[1] = (C256.w[1] << 2) | (C256.w[0] >> 62);
 430 C4.w[0] = C256.w[0] << 2;
 431
 432 #ifndef IEEE_ROUND_NEAREST
 433 #ifndef IEEE_ROUND_NEAREST_TIES_AWAY
 434 if (!((rnd_mode) & 3)) {
 435 #endif
 436 #endif
 437   // compare to midpoints
 438   CSM.w[0] = (CS.w[0] + CS.w[0]) | 1;
 439   //printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x\n",C4.w[1],C4.w[0],CSM.w[1],CSM.w[0], CS.w[0]);
 440   if (mul_factor)
 441     CSM.w[0] *= mul_factor;
 442   // CSM^2
 443   __mul_64x64_to_128 (M256, CSM.w[0], CSM.w[0]);
 444   //__mul_128x128_to_256(M256, CSM, CSM);
 445
 446   if (C4.w[1] > M256.w[1] ||
 447       (C4.w[1] == M256.w[1] && C4.w[0] > M256.w[0])) {
 448     // round up
 449     CS.w[0]++;
 450   } else {
 451     C8.w[0] = CS.w[0] << 3;
 452     C8.w[1] = 0;
 453     if (mul_factor) {
 454       if (mul_factor2) {
 455         __mul_64x64_to_128 (C8, C8.w[0], mul_factor2);
 456       } else {
 457         __mul_64x128_low (C8, C8.w[0], mul_factor2_long);
 458       }
 459     }
 460     // M256 - 8*CSM
 461     __sub_borrow_out (M256.w[0], Carry, M256.w[0], C8.w[0]);
 462     M256.w[1] = M256.w[1] - C8.w[1] - Carry;
 463
 464     // if CSM' > C256, round up
 465     if (M256.w[1] > C4.w[1] ||
 466         (M256.w[1] == C4.w[1] && M256.w[0] > C4.w[0])) {
 467       // round down
 468       if (CS.w[0])
 469         CS.w[0]--;
 470     }
 471   }
 472 #ifndef IEEE_ROUND_NEAREST
 473 #ifndef IEEE_ROUND_NEAREST_TIES_AWAY
 474 } else {
 475   CS.w[0]++;
 476   CSM.w[0] = CS.w[0];
 477   C8.w[0] = CSM.w[0] << 1;
 478   if (mul_factor)
 479     CSM.w[0] *= mul_factor;
 480   __mul_64x64_to_128 (M256, CSM.w[0], CSM.w[0]);
 481   C8.w[1] = 0;
 482   if (mul_factor) {
 483     if (mul_factor2) {
 484       __mul_64x64_to_128 (C8, C8.w[0], mul_factor2);
 485     } else {
 486       __mul_64x128_low (C8, C8.w[0], mul_factor2_long);
 487     }
 488   }
 489   //printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x\n",C256.w[1],C256.w[0],M256.w[1],M256.w[0], CS.w[0]);
 490
 491   if (M256.w[1] > C256.w[1] ||
 492       (M256.w[1] == C256.w[1] && M256.w[0] > C256.w[0])) {
 493     __sub_borrow_out (M256.w[0], Carry, M256.w[0], C8.w[0]);
 494     M256.w[1] = M256.w[1] - Carry - C8.w[1];
 495     M256.w[0]++;
 496     if (!M256.w[0]) {
 497       M256.w[1]++;
 498
 499     }
 500
 501     if ((M256.w[1] > C256.w[1] ||
 502          (M256.w[1] == C256.w[1] && M256.w[0] > C256.w[0]))
 503         && (CS.w[0] > 1)) {
 504
 505       CS.w[0]--;
 506
 507       if (CS.w[0] > 1) {
 508         __sub_borrow_out (M256.w[0], Carry, M256.w[0], C8.w[0]);
 509         M256.w[1] = M256.w[1] - Carry - C8.w[1];
 510         M256.w[0]++;
 511         if (!M256.w[0]) {
 512           M256.w[1]++;
 513         }
 514
 515         if (M256.w[1] > C256.w[1] ||
 516             (M256.w[1] == C256.w[1] && M256.w[0] > C256.w[0]))
 517           CS.w[0]--;
 518       }
 519     }
 520   }
 521
 522   else {
 523                                 /*__add_carry_out(M256.w[0], Carry, M256.w[0], C8.w[0]);
 524                                 M256.w[1] = M256.w[1] + Carry + C8.w[1];
 525                                 M256.w[0]++;
 526                                 if(!M256.w[0])
 527                                 {
 528                                         M256.w[1]++;
 529                                 }
 530                                 CS.w[0]++;
 531                         if(M256.w[1]<C256.w[1] ||
 532                                 (M256.w[1]==C256.w[1] && M256.w[0]<=C256.w[0]))
 533                         {
 534                                 CS.w[0]++;
 535                         }*/
 536     CS.w[0]++;
 537   }
 538   //printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x %d\n",C4.w[1],C4.w[0],M256.w[1],M256.w[0], CS.w[0], exact);
 539   // RU?
 540   if (((rnd_mode) != ROUNDING_UP) || exact) {
 541     if (CS.w[0])
 542       CS.w[0]--;
 543   }
 544
 545 }
 546 #endif
 547 #endif
 548  //printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x %d\n",C4.w[1],C4.w[0],M256.w[1],M256.w[0], CS.w[0], exact);
 549
 550 res = get_BID64 (0, exponent_q, CS.w[0], rnd_mode, pfpsf);
 551 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
 552 (void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS);
 553 #endif
 554 BID_RETURN (res);
 555
 556
 557 }