libgcc/config/libbid/bid64_sqrt.c

   1 /* Copyright (C) 2007, 2009  Free Software Foundation, Inc.
   2
   3 This file is part of GCC.
   4
   5 GCC is free software; you can redistribute it and/or modify it under
   6 the terms of the GNU General Public License as published by the Free
   7 Software Foundation; either version 3, or (at your option) any later
   8 version.
   9
  10 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  11 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13 for more details.
  14
  15 Under Section 7 of GPL version 3, you are granted additional
  16 permissions described in the GCC Runtime Library Exception, version
  17 3.1, as published by the Free Software Foundation.
  18
  19 You should have received a copy of the GNU General Public License and
  20 a copy of the GCC Runtime Library Exception along with this program;
  21 see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  22 <http://www.gnu.org/licenses/>.  */
  23
  24 /*****************************************************************************
  25  *    BID64 square root
  26  *****************************************************************************
  27  *
  28  *  Algorithm description:
  29  *
  30  *  if(exponent_x is odd)
  31  *     scale coefficient_x by 10, adjust exponent
  32  *  - get lower estimate for number of digits in coefficient_x
  33  *  - scale coefficient x to between 31 and 33 decimal digits
  34  *  - in parallel, check for exact case and return if true
  35  *  - get high part of result coefficient using double precision sqrt
  36  *  - compute remainder and refine coefficient in one iteration (which
  37  *                                 modifies it by at most 1)
  38  *  - result exponent is easy to compute from the adjusted arg. exponent
  39  *
  40  ****************************************************************************/
  41
  42 #include "bid_internal.h"
  43 #include "bid_sqrt_macros.h"
  44 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
  45 #include <fenv.h>
  46
  47 #define FE_ALL_FLAGS FE_INVALID|FE_DIVBYZERO|FE_OVERFLOW|FE_UNDERFLOW|FE_INEXACT
  48 #endif
  49
  50 extern double sqrt (double);
  51
  /* bid64_sqrt -- square root of a BID64 value.
   *
   * Operand: x, passed by value, or read from *px when
   * DECIMAL_CALL_BY_REFERENCE is set (pres is then the result pointer;
   * presumably written by BID_RETURN, whose definition is not in this file).
   * The rounding-mode / exception-flag parameters are supplied by the
   * _RND_MODE_PARAM / _EXC_* macros.
   *
   * Result (handed back through BID_RETURN):
   *   - NaN or Infinity input: the value unpack_BID64 left in coefficient_x,
   *     ANDed with QUIET_MASK64; NAN_MASK64 is used instead when the
   *     -Infinity test matches.
   *   - zero input: sign_x combined with the exponent
   *     (exponent_x + DECIMAL_EXPONENT_BIAS) >> 1.
   *   - negative non-zero input: NAN_MASK64.
   *   - coefficient (times 10 for an odd exponent) is a perfect square:
   *     the exact root, built with very_fast_get_BID64.
   *   - otherwise: a root adjusted according to rnd_mode, built with
   *     fast_get_BID64.
   *
   * Status flags (only when SET_STATUS_FLAGS is defined): INVALID_EXCEPTION
   * for sNaN, -Infinity and negative operands; INEXACT_EXCEPTION on the
   * rounded path.  When UNCHANGED_BINARY_STATUS_FLAGS is defined, the binary
   * floating-point flags are saved before the float/double arithmetic and
   * restored before each later return.
   */
  52 #if DECIMAL_CALL_BY_REFERENCE
  53
  54 void
  55 bid64_sqrt (UINT64 * pres,
  56             UINT64 *
  57             px _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
  58             _EXC_INFO_PARAM) {
  59   UINT64 x;
  60 #else
  61
  62 UINT64
  63 bid64_sqrt (UINT64 x _RND_MODE_PARAM _EXC_FLAGS_PARAM
  64             _EXC_MASKS_PARAM _EXC_INFO_PARAM) {
  65 #endif
  66   UINT128 CA, CT;
  67   UINT64 sign_x, coefficient_x;
  68   UINT64 Q, Q2, A10, C4, R, R2, QE, res;
  69   SINT64 D;
  70   int_double t_scale;
  71   int_float tempx;
  72   double da, dq, da_h, da_l, dqe;
  73   int exponent_x, exponent_q, bin_expon_cx;
  74   int digits_x;
  75   int scale;
  76 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
  77   fexcept_t binaryflags = 0;
  78 #endif
  79
       // by-reference build: load the operand (and the rounding mode, unless
       // DECIMAL_GLOBAL_ROUNDING is set) from the pointer arguments
  80 #if DECIMAL_CALL_BY_REFERENCE
  81 #if !DECIMAL_GLOBAL_ROUNDING
  82   _IDEC_round rnd_mode = *prnd_mode;
  83 #endif
  84   x = *px;
  85 #endif
  86
  87   // unpack arguments, check for NaN or Infinity
  88   if (!unpack_BID64 (&sign_x, &exponent_x, &coefficient_x, x)) {
  89     // x is Inf. or NaN or 0
  90     if ((x & INFINITY_MASK64) == INFINITY_MASK64) {
  91       res = coefficient_x;
  92       if ((coefficient_x & SSNAN_MASK64) == SINFINITY_MASK64)   // -Infinity
  93       {
  94         res = NAN_MASK64;
  95 #ifdef SET_STATUS_FLAGS
  96         __set_status_flags (pfpsf, INVALID_EXCEPTION);
  97 #endif
  98       }
  99 #ifdef SET_STATUS_FLAGS
 100       if ((x & SNAN_MASK64) == SNAN_MASK64)     // sNaN
 101         __set_status_flags (pfpsf, INVALID_EXCEPTION);
 102 #endif
         // QUIET_MASK64 is applied to whatever is returned from this branch
 103       BID_RETURN (res & QUIET_MASK64);
 104     }
 105     // x is 0
       // result keeps sign_x; its exponent field (shifted to bit 53) is
       // (exponent_x + DECIMAL_EXPONENT_BIAS) >> 1, coefficient bits stay 0
 106     exponent_x = (exponent_x + DECIMAL_EXPONENT_BIAS) >> 1;
 107     res = sign_x | (((UINT64) exponent_x) << 53);
 108     BID_RETURN (res);
 109   }
 110   // x<0?
 111   if (sign_x && coefficient_x) {
 112     res = NAN_MASK64;
 113 #ifdef SET_STATUS_FLAGS
 114     __set_status_flags (pfpsf, INVALID_EXCEPTION);
 115 #endif
 116     BID_RETURN (res);
 117   }
       // save the binary FP flags; they are restored before every return below
 118 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
 119   (void) fegetexceptflag (&binaryflags, FE_ALL_FLAGS);
 120 #endif
 121   //--- get number of bits in the coefficient of x ---
       // convert to float and read bits 23..30 of its integer view, minus the
       // 0x7f bias (assumes int_float overlays .d and .i -- declared elsewhere)
 122   tempx.d = (float) coefficient_x;
 123   bin_expon_cx = ((tempx.i >> 23) & 0xff) - 0x7f;
 124   digits_x = estimate_decimal_digits[bin_expon_cx];
 125   // add test for range
 126   if (coefficient_x >= power10_index_binexp[bin_expon_cx])
 127     digits_x++;
 128
       // A10 = coefficient_x, or 10 * coefficient_x when exponent_x is odd:
       // (A10 << 2) + A10 is 5 * A10, and adding it to itself doubles that
 129   A10 = coefficient_x;
 130   if (exponent_x & 1) {
 131     A10 = (A10 << 2) + A10;
 132     A10 += A10;
 133   }
 134
       // exact-result shortcut: truncate the double-precision root to 32 bits
       // and test whether its square reproduces A10; if so, return it with
       // exponent (exponent_x + DECIMAL_EXPONENT_BIAS) >> 1 and no INEXACT
 135   dqe = sqrt ((double) A10);
 136   QE = (UINT32) dqe;
 137   if (QE * QE == A10) {
 138     res =
 139       very_fast_get_BID64 (0, (exponent_x + DECIMAL_EXPONENT_BIAS) >> 1,
 140                            QE);
 141 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
 142     (void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS);
 143 #endif
 144     BID_RETURN (res);
 145   }
 146   // if exponent is odd, scale coefficient by 10
       // scale starts at 31 - digits_x; one more power of ten is added when
       // exponent_q is odd.  exponent_q itself is not decremented: its low
       // bit is discarded by the >> 1 further down.
 147   scale = 31 - digits_x;
 148   exponent_q = exponent_x - scale;
 149   scale += (exponent_q & 1);    // exp. bias is even
 150
       // CA = coefficient_x * 10^scale, held in two 64-bit words
 151   CT = power10_table_128[scale];
 152   __mul_64x128_short (CA, coefficient_x, CT);
 153
 154   // 2^64
 155   t_scale.i = 0x43f0000000000000ull;
 156   // convert CA to DP
 157   da_h = CA.w[1];
 158   da_l = CA.w[0];
 159   da = da_h * t_scale.d + da_l;
 160
 161   dq = sqrt (da);
 162
       // Q: double-precision root estimate, truncated to an integer
 163   Q = (UINT64) dq;
 164
 165   // get sign(sqrt(CA)-Q)
       // only the low 64 bits take part (unsigned wrap-around); the
       // arithmetic shift turns R into 0 or all ones, so D is +1 or -1
 166   R = CA.w[0] - Q * Q;
 167   R = ((SINT64) R) >> 63;
 168   D = R + R + 1;
 169
 170   exponent_q = (exponent_q + DECIMAL_EXPONENT_BIAS) >> 1;
 171
       // every path that reaches this point reports the result as inexact
 172 #ifdef SET_STATUS_FLAGS
 173   __set_status_flags (pfpsf, INEXACT_EXCEPTION);
 174 #endif
 175
       // The branch below runs unconditionally when IEEE_ROUND_NEAREST or
       // IEEE_ROUND_NEAREST_TIES_AWAY is defined; otherwise it is selected
       // when the low two bits of rnd_mode are zero (presumably the
       // round-to-nearest modes -- the encoding is defined elsewhere).
 176 #ifndef IEEE_ROUND_NEAREST
 177 #ifndef IEEE_ROUND_NEAREST_TIES_AWAY
 178   if (!((rnd_mode) & 3)) {
 179 #endif
 180 #endif
 181
 182     // midpoint to check
         // Q2 = 2*Q + D is twice the midpoint between Q and Q + D, so it is
         // compared against 4*CA (low words only)
 183     Q2 = Q + Q + D;
 184     C4 = CA.w[0] << 2;
 185
 186     // get sign(-sqrt(CA)+Midpoint)
 187     R2 = Q2 * Q2 - C4;
 188     R2 = ((SINT64) R2) >> 63;
 189
 190     // adjust Q if R!=R2
         // R ^ R2 is all ones exactly when the two sign masks differ, in
         // which case D (+1 or -1) is added to Q; otherwise 0 is added
 191     Q += (D & (R ^ R2));
 192 #ifndef IEEE_ROUND_NEAREST
 193 #ifndef IEEE_ROUND_NEAREST_TIES_AWAY
 194   } else {
         // remaining rounding modes: step Q by D, step back by one if its
         // square now exceeds the low word of CA (signed comparison of the
         // wrapped difference), then add one for ROUNDING_UP
 195     C4 = CA.w[0];
 196     Q += D;
 197     if ((SINT64) (Q * Q - C4) > 0)
 198       Q--;
 199     if (rnd_mode == ROUNDING_UP)
 200       Q++;
 201   }
 202 #endif
 203 #endif
 204
 205   res = fast_get_BID64 (0, exponent_q, Q);
 206 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
 207   (void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS);
 208 #endif
 209   BID_RETURN (res);
 210 }
 211
 212
 213 TYPE0_FUNCTION_ARG1 (UINT64, bid64q_sqrt, x)
 214
 215      UINT256 M256, C4, C8;
 216      UINT128 CX, CX2, A10, S2, T128, CS, CSM, CS2, C256, CS1,
 217        mul_factor2_long = { {0x0ull, 0x0ull} }, QH, Tmp, TP128, Qh, Ql;
 218 UINT64 sign_x, Carry, B10, res, mul_factor, mul_factor2 = 0x0ull, CS0;
 219 SINT64 D;
 220 int_float fx, f64;
 221 int exponent_x, bin_expon_cx, done = 0;
 222 int digits, scale, exponent_q = 0, exact = 1, amount, extra_digits;
 223 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
 224 fexcept_t binaryflags = 0;
 225 #endif
 226
 227         // unpack arguments, check for NaN or Infinity
 228 if (!unpack_BID128_value (&sign_x, &exponent_x, &CX, x)) {
 229   res = CX.w[1];
 230   // NaN ?
 231   if ((x.w[1] & 0x7c00000000000000ull) == 0x7c00000000000000ull) {
 232 #ifdef SET_STATUS_FLAGS
 233     if ((x.w[1] & 0x7e00000000000000ull) == 0x7e00000000000000ull)      // sNaN
 234       __set_status_flags (pfpsf, INVALID_EXCEPTION);
 235 #endif
 236     Tmp.w[1] = (CX.w[1] & 0x00003fffffffffffull);
 237     Tmp.w[0] = CX.w[0];
 238     TP128 = reciprocals10_128[18];
 239     __mul_128x128_full (Qh, Ql, Tmp, TP128);
 240     amount = recip_scale[18];
 241     __shr_128 (Tmp, Qh, amount);
 242     res = (CX.w[1] & 0xfc00000000000000ull) | Tmp.w[0];
 243     BID_RETURN (res);
 244   }
 245   // x is Infinity?
 246   if ((x.w[1] & 0x7800000000000000ull) == 0x7800000000000000ull) {
 247     if (sign_x) {
 248       // -Inf, return NaN
 249       res = 0x7c00000000000000ull;
 250 #ifdef SET_STATUS_FLAGS
 251       __set_status_flags (pfpsf, INVALID_EXCEPTION);
 252 #endif
 253     }
 254     BID_RETURN (res);
 255   }
 256   // x is 0 otherwise
 257
 258   exponent_x =
 259     ((exponent_x - DECIMAL_EXPONENT_BIAS_128) >> 1) +
 260     DECIMAL_EXPONENT_BIAS;
 261   if (exponent_x < 0)
 262     exponent_x = 0;
 263   if (exponent_x > DECIMAL_MAX_EXPON_64)
 264     exponent_x = DECIMAL_MAX_EXPON_64;
 265   //res= sign_x | (((UINT64)exponent_x)<<53);
 266   res = get_BID64 (sign_x, exponent_x, 0, rnd_mode, pfpsf);
 267   BID_RETURN (res);
 268 }
 269 if (sign_x) {
 270   res = 0x7c00000000000000ull;
 271 #ifdef SET_STATUS_FLAGS
 272   __set_status_flags (pfpsf, INVALID_EXCEPTION);
 273 #endif
 274   BID_RETURN (res);
 275 }
 276 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
 277 (void) fegetexceptflag (&binaryflags, FE_ALL_FLAGS);
 278 #endif
 279
 280            // 2^64
 281 f64.i = 0x5f800000;
 282
 283            // fx ~ CX
 284 fx.d = (float) CX.w[1] * f64.d + (float) CX.w[0];
 285 bin_expon_cx = ((fx.i >> 23) & 0xff) - 0x7f;
 286 digits = estimate_decimal_digits[bin_expon_cx];
 287
 288 A10 = CX;
 289 if (exponent_x & 1) {
 290   A10.w[1] = (CX.w[1] << 3) | (CX.w[0] >> 61);
 291   A10.w[0] = CX.w[0] << 3;
 292   CX2.w[1] = (CX.w[1] << 1) | (CX.w[0] >> 63);
 293   CX2.w[0] = CX.w[0] << 1;
 294   __add_128_128 (A10, A10, CX2);
 295 }
 296
 297 C256.w[1] = A10.w[1];
 298 C256.w[0] = A10.w[0];
 299 CS.w[0] = short_sqrt128 (A10);
 300 CS.w[1] = 0;
 301 mul_factor = 0;
 302            // check for exact result
 303 if (CS.w[0] < 10000000000000000ull) {
 304   if (CS.w[0] * CS.w[0] == A10.w[0]) {
 305     __sqr64_fast (S2, CS.w[0]);
 306     if (S2.w[1] == A10.w[1])    // && S2.w[0]==A10.w[0])
 307     {
 308       res =
 309         get_BID64 (0,
 310                    ((exponent_x - DECIMAL_EXPONENT_BIAS_128) >> 1) +
 311                    DECIMAL_EXPONENT_BIAS, CS.w[0], rnd_mode, pfpsf);
 312 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
 313       (void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS);
 314 #endif
 315       BID_RETURN (res);
 316     }
 317   }
 318   if (CS.w[0] >= 1000000000000000ull) {
 319     done = 1;
 320     exponent_q = exponent_x;
 321     C256.w[1] = A10.w[1];
 322     C256.w[0] = A10.w[0];
 323   }
 324 #ifdef SET_STATUS_FLAGS
 325   __set_status_flags (pfpsf, INEXACT_EXCEPTION);
 326 #endif
 327   exact = 0;
 328 } else {
 329   B10 = 0x3333333333333334ull;
 330   __mul_64x64_to_128_full (CS2, CS.w[0], B10);
 331   CS0 = CS2.w[1] >> 1;
 332   if (CS.w[0] != ((CS0 << 3) + (CS0 << 1))) {
 333 #ifdef SET_STATUS_FLAGS
 334     __set_status_flags (pfpsf, INEXACT_EXCEPTION);
 335 #endif
 336     exact = 0;
 337   }
 338   done = 1;
 339   CS.w[0] = CS0;
 340   exponent_q = exponent_x + 2;
 341   mul_factor = 10;
 342   mul_factor2 = 100;
 343   if (CS.w[0] >= 10000000000000000ull) {
 344     __mul_64x64_to_128_full (CS2, CS.w[0], B10);
 345     CS0 = CS2.w[1] >> 1;
 346     if (CS.w[0] != ((CS0 << 3) + (CS0 << 1))) {
 347 #ifdef SET_STATUS_FLAGS
 348       __set_status_flags (pfpsf, INEXACT_EXCEPTION);
 349 #endif
 350       exact = 0;
 351     }
 352     exponent_q += 2;
 353     CS.w[0] = CS0;
 354     mul_factor = 100;
 355     mul_factor2 = 10000;
 356   }
 357   if (exact) {
 358     CS0 = CS.w[0] * mul_factor;
 359     __sqr64_fast (CS1, CS0)
 360       if ((CS1.w[0] != A10.w[0]) || (CS1.w[1] != A10.w[1])) {
 361 #ifdef SET_STATUS_FLAGS
 362       __set_status_flags (pfpsf, INEXACT_EXCEPTION);
 363 #endif
 364       exact = 0;
 365     }
 366   }
 367 }
 368
 369 if (!done) {
 370   // get number of digits in CX
 371   D = CX.w[1] - power10_index_binexp_128[bin_expon_cx].w[1];
 372   if (D > 0
 373       || (!D && CX.w[0] >= power10_index_binexp_128[bin_expon_cx].w[0]))
 374     digits++;
 375
 376   // if exponent is odd, scale coefficient by 10
 377   scale = 31 - digits;
 378   exponent_q = exponent_x - scale;
 379   scale += (exponent_q & 1);    // exp. bias is even
 380
 381   T128 = power10_table_128[scale];
 382   __mul_128x128_low (C256, CX, T128);
 383
 384
 385   CS.w[0] = short_sqrt128 (C256);
 386 }
 387    //printf("CS=%016I64x\n",CS.w[0]);
 388
 389 exponent_q =
 390   ((exponent_q - DECIMAL_EXPONENT_BIAS_128) >> 1) +
 391   DECIMAL_EXPONENT_BIAS;
 392 if ((exponent_q < 0) && (exponent_q + MAX_FORMAT_DIGITS >= 0)) {
 393   extra_digits = -exponent_q;
 394   exponent_q = 0;
 395
 396   // get coeff*(2^M[extra_digits])/10^extra_digits
 397   __mul_64x64_to_128 (QH, CS.w[0], reciprocals10_64[extra_digits]);
 398
 399   // now get P/10^extra_digits: shift Q_high right by M[extra_digits]-128
 400   amount = short_recip_scale[extra_digits];
 401
 402   CS0 = QH.w[1] >> amount;
 403
 404 #ifdef SET_STATUS_FLAGS
 405   if (exact) {
 406     if (CS.w[0] != CS0 * power10_table_128[extra_digits].w[0])
 407       exact = 0;
 408   }
 409   if (!exact)
 410     __set_status_flags (pfpsf, UNDERFLOW_EXCEPTION | INEXACT_EXCEPTION);
 411 #endif
 412
 413   CS.w[0] = CS0;
 414   if (!mul_factor)
 415     mul_factor = 1;
 416   mul_factor *= power10_table_128[extra_digits].w[0];
 417   __mul_64x64_to_128 (mul_factor2_long, mul_factor, mul_factor);
 418   if (mul_factor2_long.w[1])
 419     mul_factor2 = 0;
 420   else
 421     mul_factor2 = mul_factor2_long.w[1];
 422 }
 423            // 4*C256
 424 C4.w[1] = (C256.w[1] << 2) | (C256.w[0] >> 62);
 425 C4.w[0] = C256.w[0] << 2;
 426
 427 #ifndef IEEE_ROUND_NEAREST
 428 #ifndef IEEE_ROUND_NEAREST_TIES_AWAY
 429 if (!((rnd_mode) & 3)) {
 430 #endif
 431 #endif
 432   // compare to midpoints
 433   CSM.w[0] = (CS.w[0] + CS.w[0]) | 1;
 434   //printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x\n",C4.w[1],C4.w[0],CSM.w[1],CSM.w[0], CS.w[0]);
 435   if (mul_factor)
 436     CSM.w[0] *= mul_factor;
 437   // CSM^2
 438   __mul_64x64_to_128 (M256, CSM.w[0], CSM.w[0]);
 439   //__mul_128x128_to_256(M256, CSM, CSM);
 440
 441   if (C4.w[1] > M256.w[1] ||
 442       (C4.w[1] == M256.w[1] && C4.w[0] > M256.w[0])) {
 443     // round up
 444     CS.w[0]++;
 445   } else {
 446     C8.w[0] = CS.w[0] << 3;
 447     C8.w[1] = 0;
 448     if (mul_factor) {
 449       if (mul_factor2) {
 450         __mul_64x64_to_128 (C8, C8.w[0], mul_factor2);
 451       } else {
 452         __mul_64x128_low (C8, C8.w[0], mul_factor2_long);
 453       }
 454     }
 455     // M256 - 8*CSM
 456     __sub_borrow_out (M256.w[0], Carry, M256.w[0], C8.w[0]);
 457     M256.w[1] = M256.w[1] - C8.w[1] - Carry;
 458
 459     // if CSM' > C256, round up
 460     if (M256.w[1] > C4.w[1] ||
 461         (M256.w[1] == C4.w[1] && M256.w[0] > C4.w[0])) {
 462       // round down
 463       if (CS.w[0])
 464         CS.w[0]--;
 465     }
 466   }
 467 #ifndef IEEE_ROUND_NEAREST
 468 #ifndef IEEE_ROUND_NEAREST_TIES_AWAY
 469 } else {
 470   CS.w[0]++;
 471   CSM.w[0] = CS.w[0];
 472   C8.w[0] = CSM.w[0] << 1;
 473   if (mul_factor)
 474     CSM.w[0] *= mul_factor;
 475   __mul_64x64_to_128 (M256, CSM.w[0], CSM.w[0]);
 476   C8.w[1] = 0;
 477   if (mul_factor) {
 478     if (mul_factor2) {
 479       __mul_64x64_to_128 (C8, C8.w[0], mul_factor2);
 480     } else {
 481       __mul_64x128_low (C8, C8.w[0], mul_factor2_long);
 482     }
 483   }
 484   //printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x\n",C256.w[1],C256.w[0],M256.w[1],M256.w[0], CS.w[0]);
 485
 486   if (M256.w[1] > C256.w[1] ||
 487       (M256.w[1] == C256.w[1] && M256.w[0] > C256.w[0])) {
 488     __sub_borrow_out (M256.w[0], Carry, M256.w[0], C8.w[0]);
 489     M256.w[1] = M256.w[1] - Carry - C8.w[1];
 490     M256.w[0]++;
 491     if (!M256.w[0]) {
 492       M256.w[1]++;
 493
 494     }
 495
 496     if ((M256.w[1] > C256.w[1] ||
 497          (M256.w[1] == C256.w[1] && M256.w[0] > C256.w[0]))
 498         && (CS.w[0] > 1)) {
 499
 500       CS.w[0]--;
 501
 502       if (CS.w[0] > 1) {
 503         __sub_borrow_out (M256.w[0], Carry, M256.w[0], C8.w[0]);
 504         M256.w[1] = M256.w[1] - Carry - C8.w[1];
 505         M256.w[0]++;
 506         if (!M256.w[0]) {
 507           M256.w[1]++;
 508         }
 509
 510         if (M256.w[1] > C256.w[1] ||
 511             (M256.w[1] == C256.w[1] && M256.w[0] > C256.w[0]))
 512           CS.w[0]--;
 513       }
 514     }
 515   }
 516
 517   else {
 518                                 /*__add_carry_out(M256.w[0], Carry, M256.w[0], C8.w[0]);
 519                                 M256.w[1] = M256.w[1] + Carry + C8.w[1];
 520                                 M256.w[0]++;
 521                                 if(!M256.w[0])
 522                                 {
 523                                         M256.w[1]++;
 524                                 }
 525                                 CS.w[0]++;
 526                         if(M256.w[1]<C256.w[1] ||
 527                                 (M256.w[1]==C256.w[1] && M256.w[0]<=C256.w[0]))
 528                         {
 529                                 CS.w[0]++;
 530                         }*/
 531     CS.w[0]++;
 532   }
 533   //printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x %d\n",C4.w[1],C4.w[0],M256.w[1],M256.w[0], CS.w[0], exact);
 534   // RU?
 535   if (((rnd_mode) != ROUNDING_UP) || exact) {
 536     if (CS.w[0])
 537       CS.w[0]--;
 538   }
 539
 540 }
 541 #endif
 542 #endif
 543  //printf("C256=%016I64x %016I64x, CSM=%016I64x %016I64x %016I64x %d\n",C4.w[1],C4.w[0],M256.w[1],M256.w[0], CS.w[0], exact);
 544
 545 res = get_BID64 (0, exponent_q, CS.w[0], rnd_mode, pfpsf);
 546 #ifdef UNCHANGED_BINARY_STATUS_FLAGS
 547 (void) fesetexceptflag (&binaryflags, FE_ALL_FLAGS);
 548 #endif
 549 BID_RETURN (res);
 550
 551
 552 }