libgcc/config/libbid/bid128_add.c

   1 /* Copyright (C) 2007  Free Software Foundation, Inc.
   2
   3 This file is part of GCC.
   4
   5 GCC is free software; you can redistribute it and/or modify it under
   6 the terms of the GNU General Public License as published by the Free
   7 Software Foundation; either version 2, or (at your option) any later
   8 version.
   9
  10 In addition to the permissions in the GNU General Public License, the
  11 Free Software Foundation gives you unlimited permission to link the
  12 compiled version of this file into combinations with other programs,
  13 and to distribute those combinations without any restriction coming
  14 from the use of this file.  (The General Public License restrictions
  15 do apply in other respects; for example, they cover modification of
  16 the file, and distribution when not linked into a combine
  17 executable.)
  18
  19 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  20 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  21 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  22 for more details.
  23
  24 You should have received a copy of the GNU General Public License
  25 along with GCC; see the file COPYING.  If not, write to the Free
  26 Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
  27 02110-1301, USA.  */
  28
  29 /*****************************************************************************
  30  *  BID128 add
  31  ****************************************************************************/
  32
  33 #include "bid_internal.h"
  34
  35 #if DECIMAL_CALL_BY_REFERENCE
  36 void
  37 __bid128_add (UINT128 * pres, UINT128 * px,
  38             UINT128 *
  39             py _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
  40             _EXC_INFO_PARAM) {
  41   UINT128 x = *px, y = *py;
  42 #if !DECIMAL_GLOBAL_ROUNDING
  43   unsigned int rnd_mode = *prnd_mode;
  44 #endif
  45 #else
  46 UINT128
  47 __bid128_add (UINT128 x,
  48             UINT128 y _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
  49             _EXC_INFO_PARAM) {
  50 #endif
  51
  52   UINT128 res;
  53   UINT64 x_sign, y_sign, tmp_sign;
  54   UINT64 x_exp, y_exp, tmp_exp; // e1 = x_exp, e2 = y_exp
  55   UINT64 C1_hi, C2_hi, tmp_signif_hi;
  56   UINT64 C1_lo, C2_lo, tmp_signif_lo;
  57   // Note: C1.w[1], C1.w[0] represent C1_hi, C1_lo (all are UINT64)
  58   // Note: C2.w[1], C2.w[0] represent C2_hi, C2_lo (all are UINT64)
  59   UINT64 tmp64, tmp64A, tmp64B;
  60   BID_UI64DOUBLE tmp1, tmp2;
  61   int x_nr_bits, y_nr_bits;
  62   int q1, q2, delta, scale, x1, ind, shift, tmp_inexact = 0;
  63   UINT64 halfulp64;
  64   UINT128 halfulp128;
  65   UINT128 C1, C2;
  66   UINT128 __bid_ten2m1;
  67   UINT128 highf2star; // top 128 bits in f2*; low 128 bits in R256[1], R256[0]
  68   UINT256 P256, Q256, R256;
  69   int is_inexact = 0, is_midpoint_lt_even = 0, is_midpoint_gt_even = 0;
  70   int is_inexact_lt_midpoint = 0, is_inexact_gt_midpoint = 0;
  71   int second_pass = 0;
  72
  73   // check for NaN or Infinity
  74   if (((x.w[1] & MASK_SPECIAL) == MASK_SPECIAL)
  75       || ((y.w[1] & MASK_SPECIAL) == MASK_SPECIAL)) {
  76     // x is special or y is special
  77
  78     if ((x.w[1] & MASK_NAN) == MASK_NAN) { // x is NAN
  79       if ((x.w[1] & MASK_SNAN) == MASK_SNAN) { // x is SNAN
  80         // set invalid flag
  81         *pfpsf |= INVALID_EXCEPTION;
  82         // return quiet (x)
  83         res.w[1] = x.w[1] & 0xfdffffffffffffffull;
  84         res.w[0] = x.w[0];
  85       } else { // x is QNaN
  86         if ((y.w[1] & MASK_SNAN) == MASK_SNAN) { // y is SNAN
  87           // set invalid flag
  88           *pfpsf |= INVALID_EXCEPTION;
  89         }
  90         // return x
  91         res.w[1] = x.w[1];
  92         res.w[0] = x.w[0];
  93       }
  94       BID_RETURN (res);
  95     } else if ((y.w[1] & MASK_NAN) == MASK_NAN) { // y is NAN
  96       if ((y.w[1] & MASK_SNAN) == MASK_SNAN) { // y is SNAN
  97         // set invalid flag
  98         *pfpsf |= INVALID_EXCEPTION;
  99         // return quiet (y)
 100         res.w[1] = y.w[1] & 0xfdffffffffffffffull;
 101         res.w[0] = y.w[0];
 102       } else { // y is QNaN
 103         // return y
 104         res.w[1] = y.w[1];
 105         res.w[0] = y.w[0];
 106       }
 107       BID_RETURN (res);
 108     } else { // neither x not y is NaN; at least one is infinity
 109       if ((x.w[1] & MASK_ANY_INF) == MASK_INF) { // x is infinity
 110         if ((y.w[1] & MASK_ANY_INF) == MASK_INF) { // y is infinity
 111           // if same sign, return either of them
 112           if ((x.w[1] & MASK_SIGN) == (y.w[1] & MASK_SIGN)) {
 113             res.w[1] = x.w[1];
 114             res.w[0] = x.w[0];
 115           } else { // x and y are infinities of opposite signs
 116             // set invalid flag
 117             *pfpsf |= INVALID_EXCEPTION;
 118             // return QNaN Indefinite
 119             res.w[1] = 0x7c00000000000000ull;
 120             res.w[0] = 0x0000000000000000ull;
 121           }
 122         } else { // y is 0 or finite
 123           // return x
 124           res.w[1] = x.w[1];
 125           res.w[0] = x.w[0];
 126         }
 127       } else { // x is not NaN or infinity, so y must be infinity
 128         res.w[1] = y.w[1];
 129         res.w[0] = y.w[0];
 130       }
 131       BID_RETURN (res);
 132     }
 133   }
 134   // unpack the arguments
 135   // unpack x
 136   x_sign = x.w[1] & MASK_SIGN; // 0 for positive, MASK_SIGN for negative
 137   x_exp = x.w[1] & MASK_EXP; // biased and shifted left 49 bit positions
 138   C1_hi = x.w[1] & MASK_COEFF;
 139   C1_lo = x.w[0];
 140   // unpack y
 141   y_sign = y.w[1] & MASK_SIGN; // 0 for positive, MASK_SIGN for negative
 142   y_exp = y.w[1] & MASK_EXP; // biased and shifted left 49 bit positions
 143   C2_hi = y.w[1] & MASK_COEFF;
 144   C2_lo = y.w[0];
 145
 146   // test for non-canonical values:
 147   // - values whose encoding begins with x00, x01, or x10 and whose
 148   //   coefficient is larger than 10^34 -1, or
 149   // - values whose encoding begins with x1100, x1101, x1110 (if NaNs
 150   //   and infinitis were eliminated already this test is reduced to
 151   //   checking for x10x)
 152
 153   // test for non-canonical values of the argument x
 154   if ((((C1_hi > 0x0001ed09bead87c0ull)
 155        || ((C1_hi == 0x0001ed09bead87c0ull)
 156            && (C1_lo > 0x378d8e63ffffffffull)))
 157       && ((x.w[1] & 0x6000000000000000ull) != 0x6000000000000000ull))
 158       || ((x.w[1] & 0x6000000000000000ull) == 0x6000000000000000ull)) {
 159     // check for the case where the exponent is shifted right by 2 bits!
 160     if ((x.w[1] & 0x6000000000000000ull) == 0x6000000000000000ull) {
 161       x_exp = (x.w[1] << 2) & MASK_EXP; // same position as for G[0]G[1] != 11
 162     }
 163     x.w[1] = x.w[1] & 0x8000000000000000ull; // preserve the sign bit
 164     x.w[0] = 0;
 165     C1_hi = 0;
 166     C1_lo = 0;
 167   }
 168   // test for non-canonical values of the argument y
 169   if ((((C2_hi > 0x0001ed09bead87c0ull)
 170        || ((C2_hi == 0x0001ed09bead87c0ull)
 171            && (C2_lo > 0x378d8e63ffffffffull)))
 172       && ((y.w[1] & 0x6000000000000000ull) != 0x6000000000000000ull))
 173       || ((y.w[1] & 0x6000000000000000ull) == 0x6000000000000000ull)) {
 174     // check for the case where the exponent is shifted right by 2 bits!
 175     if ((y.w[1] & 0x6000000000000000ull) == 0x6000000000000000ull) {
 176       y_exp = (y.w[1] << 2) & MASK_EXP; // same position as for G[0]G[1] != 11
 177     }
 178     y.w[1] = y.w[1] & 0x8000000000000000ull; // preserve the sign bit
 179     y.w[0] = 0;
 180     C2_hi = 0;
 181     C2_lo = 0;
 182   }
 183
 184   if ((C1_hi == 0x0ull) && (C1_lo == 0x0ull)) {
 185     // x is 0 and y is not special
 186     // if y is 0 return 0 with the smaller exponent
 187     if ((C2_hi == 0x0ull) && (C2_lo == 0x0ull)) {
 188       if (x_exp < y_exp)
 189         res.w[1] = x_exp;
 190       else
 191         res.w[1] = y_exp;
 192       if (x_sign && y_sign)
 193         res.w[1] = res.w[1] | x_sign; // both negative
 194       else if (rnd_mode == ROUNDING_DOWN && x_sign != y_sign)
 195         res.w[1] = res.w[1] | 0x8000000000000000ull; // -0
 196       // else; // res = +0
 197       res.w[0] = 0;
 198     } else {
 199       // for 0 + y return y, with the preferred exponent
 200       if (y_exp <= x_exp) {
 201         res.w[1] = y.w[1];
 202         res.w[0] = y.w[0];
 203       } else { // if y_exp > x_exp
 204         // return (C2 * 10^scale) * 10^(y_exp - scale)
 205         // where scale = min (P34-q2, y_exp-x_exp)
 206         // determine q2 = nr. of decimal digits in y
 207         //  determine first the nr. of bits in y (y_nr_bits)
 208
 209         if (C2_hi == 0) { // y_bits is the nr. of bits in C2_lo
 210           if (C2_lo >= 0x0020000000000000ull) { // y >= 2^53
 211             // split the 64-bit value in two 32-bit halves to avoid
 212             // rounding errors
 213             if (C2_lo >= 0x0000000100000000ull) { // y >= 2^32
 214               tmp2.d = (double) (C2_lo >> 32); // exact conversion
 215               ///tmp2_i64 = *(UINT64 *) & tmp2_d;
 216               y_nr_bits =
 217                 32 +
 218                 ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
 219             } else { // y < 2^32
 220               tmp2.d = (double) (C2_lo); // exact conversion
 221               ///tmp2_i64 = *(UINT64 *) & tmp2_d;
 222               y_nr_bits =
 223                 ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
 224             }
 225           } else { // if y < 2^53
 226             tmp2.d = (double) C2_lo; // exact conversion
 227             ///tmp2_i64 = *(UINT64 *) & tmp2_d;
 228             y_nr_bits =
 229               ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
 230           }
 231         } else { // C2_hi != 0 => nr. bits = 64 + nr_bits (C2_hi)
 232           tmp2.d = (double) C2_hi; // exact conversion
 233           ///tmp2_i64 = *(UINT64 *) & tmp2_d;
 234           y_nr_bits =
 235             64 + ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
 236         }
 237         q2 = __bid_nr_digits[y_nr_bits].digits;
 238         if (q2 == 0) {
 239           q2 = __bid_nr_digits[y_nr_bits].digits1;
 240           if (C2_hi > __bid_nr_digits[y_nr_bits].threshold_hi
 241               || (C2_hi == __bid_nr_digits[y_nr_bits].threshold_hi
 242               && C2_lo >= __bid_nr_digits[y_nr_bits].threshold_lo))
 243             q2++;
 244         }
 245         // return (C2 * 10^scale) * 10^(y_exp - scale)
 246         // where scale = min (P34-q2, y_exp-x_exp)
 247         scale = P34 - q2;
 248         ind = (y_exp - x_exp) >> 49;
 249         if (ind < scale)
 250           scale = ind;
 251         if (scale == 0) {
 252           res.w[1] = y.w[1];
 253           res.w[0] = y.w[0];
 254         } else if (q2 <= 19) { // y fits in 64 bits
 255           if (scale <= 19) { // 10^scale fits in 64 bits
 256             // 64 x 64 C2_lo * __bid_ten2k64[scale]
 257             __mul_64x64_to_128MACH (res, C2_lo, __bid_ten2k64[scale]);
 258           } else { // 10^scale fits in 128 bits
 259             // 64 x 128 C2_lo * __bid_ten2k128[scale - 20]
 260             __mul_128x64_to_128 (res, C2_lo, __bid_ten2k128[scale - 20]);
 261           }
 262         } else { // y fits in 128 bits, but 10^scale must fit in 64 bits
 263           // 64 x 128 __bid_ten2k64[scale] * C2
 264           C2.w[1] = C2_hi;
 265           C2.w[0] = C2_lo;
 266           __mul_128x64_to_128 (res, __bid_ten2k64[scale], C2);
 267         }
 268         // subtract scale from the exponent
 269         y_exp = y_exp - ((UINT64) scale << 49);
 270         res.w[1] = res.w[1] | y_sign | y_exp;
 271       }
 272     }
 273     BID_RETURN (res);
 274   } else if ((C2_hi == 0x0ull) && (C2_lo == 0x0ull)) {
 275     // y is 0 and x is not special, and not zero
 276     // for x + 0 return x, with the preferred exponent
 277     if (x_exp <= y_exp) {
 278       res.w[1] = x.w[1];
 279       res.w[0] = x.w[0];
 280     } else { // if x_exp > y_exp
 281       // return (C1 * 10^scale) * 10^(x_exp - scale)
 282       // where scale = min (P34-q1, x_exp-y_exp)
 283       // determine q1 = nr. of decimal digits in x
 284       //  determine first the nr. of bits in x
 285       if (C1_hi == 0) { // x_bits is the nr. of bits in C1_lo
 286         if (C1_lo >= 0x0020000000000000ull) { // x >= 2^53
 287           // split the 64-bit value in two 32-bit halves to avoid
 288           // rounding errors
 289           if (C1_lo >= 0x0000000100000000ull) { // x >= 2^32
 290             tmp1.d = (double) (C1_lo >> 32); // exact conversion
 291             x_nr_bits =
 292               32 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) -
 293                     0x3ff);
 294           } else { // x < 2^32
 295             tmp1.d = (double) (C1_lo); // exact conversion
 296             x_nr_bits =
 297               ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
 298           }
 299         } else { // if x < 2^53
 300           tmp1.d = (double) C1_lo; // exact conversion
 301           x_nr_bits =
 302             ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
 303         }
 304       } else { // C1_hi != 0 => nr. bits = 64 + nr_bits (C1_hi)
 305         tmp1.d = (double) C1_hi; // exact conversion
 306         x_nr_bits =
 307           64 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
 308       }
 309       q1 = __bid_nr_digits[x_nr_bits].digits;
 310       if (q1 == 0) {
 311         q1 = __bid_nr_digits[x_nr_bits].digits1;
 312         if (C1_hi > __bid_nr_digits[x_nr_bits].threshold_hi
 313             || (C1_hi == __bid_nr_digits[x_nr_bits].threshold_hi
 314             && C1_lo >= __bid_nr_digits[x_nr_bits].threshold_lo))
 315           q1++;
 316       }
 317       // return (C1 * 10^scale) * 10^(x_exp - scale)
 318       // where scale = min (P34-q1, x_exp-y_exp)
 319       scale = P34 - q1;
 320       ind = (x_exp - y_exp) >> 49;
 321       if (ind < scale)
 322         scale = ind;
 323       if (scale == 0) {
 324         res.w[1] = x.w[1];
 325         res.w[0] = x.w[0];
 326       } else if (q1 <= 19) { // x fits in 64 bits
 327         if (scale <= 19) { // 10^scale fits in 64 bits
 328           // 64 x 64 C1_lo * __bid_ten2k64[scale]
 329           __mul_64x64_to_128MACH (res, C1_lo, __bid_ten2k64[scale]);
 330         } else { // 10^scale fits in 128 bits
 331           // 64 x 128 C1_lo * __bid_ten2k128[scale - 20]
 332           __mul_128x64_to_128 (res, C1_lo, __bid_ten2k128[scale - 20]);
 333         }
 334       } else { // x fits in 128 bits, but 10^scale must fit in 64 bits
 335         // 64 x 128 __bid_ten2k64[scale] * C1
 336         C1.w[1] = C1_hi;
 337         C1.w[0] = C1_lo;
 338         __mul_128x64_to_128 (res, __bid_ten2k64[scale], C1);
 339       }
 340       // subtract scale from the exponent
 341       x_exp = x_exp - ((UINT64) scale << 49);
 342       res.w[1] = res.w[1] | x_sign | x_exp;
 343     }
 344     BID_RETURN (res);
 345   } else { // x and y are not canonical, not special, and are not zero
 346     // note that the result may still be zero, and then it has to have the
 347     // preferred exponent
 348     if (x_exp < y_exp) { // if exp_x < exp_y then swap x and y
 349       tmp_sign = x_sign;
 350       tmp_exp = x_exp;
 351       tmp_signif_hi = C1_hi;
 352       tmp_signif_lo = C1_lo;
 353       x_sign = y_sign;
 354       x_exp = y_exp;
 355       C1_hi = C2_hi;
 356       C1_lo = C2_lo;
 357       y_sign = tmp_sign;
 358       y_exp = tmp_exp;
 359       C2_hi = tmp_signif_hi;
 360       C2_lo = tmp_signif_lo;
 361     }
 362     // q1 = nr. of decimal digits in x
 363     //  determine first the nr. of bits in x
 364     if (C1_hi == 0) { // x_bits is the nr. of bits in C1_lo
 365       if (C1_lo >= 0x0020000000000000ull) { // x >= 2^53
 366         //split the 64-bit value in two 32-bit halves to avoid rounding errors
 367         if (C1_lo >= 0x0000000100000000ull) { // x >= 2^32
 368           tmp1.d = (double) (C1_lo >> 32); // exact conversion
 369           x_nr_bits =
 370             32 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
 371         } else { // x < 2^32
 372           tmp1.d = (double) (C1_lo); // exact conversion
 373           x_nr_bits =
 374             ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
 375         }
 376       } else { // if x < 2^53
 377         tmp1.d = (double) C1_lo; // exact conversion
 378         x_nr_bits =
 379           ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
 380       }
 381     } else { // C1_hi != 0 => nr. bits = 64 + nr_bits (C1_hi)
 382       tmp1.d = (double) C1_hi; // exact conversion
 383       x_nr_bits =
 384         64 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
 385     }
 386
 387     q1 = __bid_nr_digits[x_nr_bits].digits;
 388     if (q1 == 0) {
 389       q1 = __bid_nr_digits[x_nr_bits].digits1;
 390       if (C1_hi > __bid_nr_digits[x_nr_bits].threshold_hi
 391           || (C1_hi == __bid_nr_digits[x_nr_bits].threshold_hi
 392           && C1_lo >= __bid_nr_digits[x_nr_bits].threshold_lo))
 393         q1++;
 394     }
 395     // q2 = nr. of decimal digits in y
 396     //  determine first the nr. of bits in y (y_nr_bits)
 397     if (C2_hi == 0) { // y_bits is the nr. of bits in C2_lo
 398       if (C2_lo >= 0x0020000000000000ull) { // y >= 2^53
 399         //split the 64-bit value in two 32-bit halves to avoid rounding errors
 400         if (C2_lo >= 0x0000000100000000ull) { // y >= 2^32
 401           tmp2.d = (double) (C2_lo >> 32); // exact conversion
 402           ///tmp2_i64 = *(UINT64 *) & tmp2_d;
 403           y_nr_bits =
 404             32 + ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
 405         } else { // y < 2^32
 406           tmp2.d = (double) (C2_lo); // exact conversion
 407           ///tmp2_i64 = *(UINT64 *) & tmp2_d;
 408           y_nr_bits =
 409             ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
 410         }
 411       } else { // if y < 2^53
 412         tmp2.d = (double) C2_lo; // exact conversion
 413         ///tmp2_i64 = *(UINT64 *) & tmp2_d;
 414         y_nr_bits =
 415           ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
 416       }
 417     } else { // C2_hi != 0 => nr. bits = 64 + nr_bits (C2_hi)
 418       tmp2.d = (double) C2_hi; // exact conversion
 419       ///tmp2_i64 = *(UINT64 *) & tmp2_d;
 420       y_nr_bits =
 421         64 + ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
 422     }
 423
 424     q2 = __bid_nr_digits[y_nr_bits].digits;
 425     if (q2 == 0) {
 426       q2 = __bid_nr_digits[y_nr_bits].digits1;
 427       if (C2_hi > __bid_nr_digits[y_nr_bits].threshold_hi
 428           || (C2_hi == __bid_nr_digits[y_nr_bits].threshold_hi
 429           && C2_lo >= __bid_nr_digits[y_nr_bits].threshold_lo))
 430         q2++;
 431     }
 432
 433     delta = q1 + (int) (x_exp >> 49) - q2 - (int) (y_exp >> 49);
 434
 435     if (delta >= P34) {
 436       // round the result directly because 0 < C2 < ulp (C1 * 10^(x_exp-e2))
 437       // n = C1 * 10^e1 or n = C1 +/- 10^(q1-P34)) * 10^e1
 438       // the result is inexact; the preferred exponent is the least possible
 439
 440       if (delta >= P34 + 1) {
 441         // for RN the result is the operand with the larger magnitude,
 442         // possibly scaled up by 10^(P34-q1)
 443         // an overflow cannot occur in this case (rounding to nearest)
 444         if (q1 < P34) { // scale C1 up by 10^(P34-q1)
 445           // Note: because delta >= P34+1 it is certain that
 446           //     x_exp - ((UINT64)scale << 49) will stay above e_min
 447           scale = P34 - q1;
 448           if (q1 <= 19) { // C1 fits in 64 bits
 449             // 1 <= q1 <= 19 => 15 <= scale <= 33
 450             if (scale <= 19) { // 10^scale fits in 64 bits
 451               __mul_64x64_to_128MACH (C1, __bid_ten2k64[scale], C1_lo);
 452             } else { // if 20 <= scale <= 33
 453               // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
 454               // (C1 * 10^(scale-19)) fits in 64 bits
 455               C1_lo = C1_lo * __bid_ten2k64[scale - 19];
 456               __mul_64x64_to_128MACH (C1, __bid_ten2k64[19], C1_lo);
 457             }
 458           } else { //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
 459             // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
 460             C1.w[1] = C1_hi;
 461             C1.w[0] = C1_lo;
 462             // C1 = __bid_ten2k64[P34 - q1] * C1
 463             __mul_128x64_to_128 (C1, __bid_ten2k64[P34 - q1], C1);
 464           }
 465           x_exp = x_exp - ((UINT64) scale << 49);
 466           C1_hi = C1.w[1];
 467           C1_lo = C1.w[0];
 468         }
 469         // some special cases arise: if delta = P34 + 1 and C1 = 10^(P34-1)
 470         // (after scaling) and x_sign != y_sign and C2 > 5*10^(q2-1) =>
 471         // subtract 1 ulp
 472         // Note: do this only for rounding to nearest; for other rounding
 473         // modes the correction will be applied next
 474         if ((rnd_mode == ROUNDING_TO_NEAREST
 475              || rnd_mode == ROUNDING_TIES_AWAY) && delta == (P34 + 1)
 476             && C1_hi == 0x0000314dc6448d93ull
 477             && C1_lo == 0x38c15b0a00000000ull && x_sign != y_sign
 478             && ((q2 <= 19 && C2_lo > __bid_midpoint64[q2 - 1]) || (q2 >= 20
 479                 && (C2_hi > __bid_midpoint128[q2 - 20].w[1]
 480                     || (C2_hi == __bid_midpoint128[q2 - 20].w[1]
 481                     && C2_lo > __bid_midpoint128[q2 - 20].w[0]))))) {
 482           // C1 = 10^34 - 1 and decrement x_exp by 1 (no underflow possible)
 483           C1_hi = 0x0001ed09bead87c0ull;
 484           C1_lo = 0x378d8e63ffffffffull;
 485           x_exp = x_exp - EXP_P1;
 486         }
 487         if (rnd_mode != ROUNDING_TO_NEAREST) {
 488           if ((rnd_mode == ROUNDING_DOWN && x_sign && y_sign)
 489               || (rnd_mode == ROUNDING_UP && !x_sign && !y_sign)) {
 490             // add 1 ulp and then check for overflow
 491             C1_lo = C1_lo + 1;
 492             if (C1_lo == 0) { // rounding overflow in the low 64 bits
 493               C1_hi = C1_hi + 1;
 494             }
 495             if (C1_hi == 0x0001ed09bead87c0ull
 496                 && C1_lo == 0x378d8e6400000000ull) {
 497               // C1 = 10^34 => rounding overflow
 498               C1_hi = 0x0000314dc6448d93ull;
 499               C1_lo = 0x38c15b0a00000000ull; // 10^33
 500               x_exp = x_exp + EXP_P1;
 501               if (x_exp == EXP_MAX_P1) { // overflow
 502                 C1_hi = 0x7800000000000000ull; // +inf
 503                 C1_lo = 0x0ull;
 504                 x_exp = 0; // x_sign is preserved
 505                 // set overflow flag (the inexact flag was set too)
 506                 *pfpsf |= OVERFLOW_EXCEPTION;
 507               }
 508             }
 509           } else if ((rnd_mode == ROUNDING_DOWN && !x_sign && y_sign)
 510                      || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
 511                      || (rnd_mode == ROUNDING_TO_ZERO && x_sign != y_sign)) {
 512             // subtract 1 ulp from C1
 513             // Note: because delta >= P34 + 1 the result cannot be zero
 514             C1_lo = C1_lo - 1;
 515             if (C1_lo == 0xffffffffffffffffull)
 516               C1_hi = C1_hi - 1;
 517             // if the coefficient is 10^33 - 1 then make it 10^34 - 1 and
 518             // decrease the exponent by 1 (because delta >= P34 + 1 the
 519             // exponent will not become less than e_min)
 520             // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
 521             // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
 522             if (C1_hi == 0x0000314dc6448d93ull
 523                 && C1_lo == 0x38c15b09ffffffffull) {
 524               // make C1 = 10^34  - 1
 525               C1_hi = 0x0001ed09bead87c0ull;
 526               C1_lo = 0x378d8e63ffffffffull;
 527               x_exp = x_exp - EXP_P1;
 528             }
 529           } else {
 530             ; // the result is already correct
 531           }
 532         }
 533         // set the inexact flag
 534         *pfpsf |= INEXACT_EXCEPTION;
 535         // assemble the result
 536         res.w[1] = x_sign | x_exp | C1_hi;
 537         res.w[0] = C1_lo;
 538       } else { // delta = P34
 539         // in most cases, the smaller operand may be < or = or > 1/2 ulp of the
 540         // larger operand
 541         // however, the case C1 = 10^(q1-1) and x_sign != y_sign is special due
 542         // to accuracy loss after subtraction, and will be treated separately
 543         if (x_sign == y_sign || (q1 <= 20
 544             && (C1_hi != 0 || C1_lo != __bid_ten2k64[q1 - 1])) || (q1 >= 21
 545             && (C1_hi != __bid_ten2k128[q1 - 21].w[1]
 546                 || C1_lo != __bid_ten2k128[q1 - 21].w[0]))) {
 547           // if x_sign == y_sign or C1 != 10^(q1-1)
 548           // compare C2 with 1/2 ulp = 5 * 10^(q2-1), the latter read from table
 549           // Note: cases q1<=19 and q1>=20 can be coalesced at some latency cost
 550           if (q2 <= 19) { // C2 and 5*10^(q2-1) both fit in 64 bits
 551             halfulp64 = __bid_midpoint64[q2 - 1]; // 5 * 10^(q2-1)
 552             if (C2_lo < halfulp64) { // n2 < 1/2 ulp (n1)
 553               // for RN the result is the operand with the larger magnitude,
 554               // possibly scaled up by 10^(P34-q1)
 555               // an overflow cannot occur in this case (rounding to nearest)
 556               if (q1 < P34) { // scale C1 up by 10^(P34-q1)
 557                 // Note: because delta = P34 it is certain that
 558                 //     x_exp - ((UINT64)scale << 49) will stay above e_min
 559                 scale = P34 - q1;
 560                 if (q1 <= 19) { // C1 fits in 64 bits
 561                   // 1 <= q1 <= 19 => 15 <= scale <= 33
 562                   if (scale <= 19) { // 10^scale fits in 64 bits
 563                     __mul_64x64_to_128MACH (C1, __bid_ten2k64[scale], C1_lo);
 564                   } else { // if 20 <= scale <= 33
 565                     // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
 566                     // (C1 * 10^(scale-19)) fits in 64 bits
 567                     C1_lo = C1_lo * __bid_ten2k64[scale - 19];
 568                     __mul_64x64_to_128MACH (C1, __bid_ten2k64[19], C1_lo);
 569                   }
 570                 } else { //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
 571                   // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
 572                   C1.w[1] = C1_hi;
 573                   C1.w[0] = C1_lo;
 574                   // C1 = __bid_ten2k64[P34 - q1] * C1
 575                   __mul_128x64_to_128 (C1, __bid_ten2k64[P34 - q1], C1);
 576                 }
 577                 x_exp = x_exp - ((UINT64) scale << 49);
 578                 C1_hi = C1.w[1];
 579                 C1_lo = C1.w[0];
 580               }
 581               if (rnd_mode != ROUNDING_TO_NEAREST) {
 582                 if ((rnd_mode == ROUNDING_DOWN && x_sign
 583                     && y_sign) || (rnd_mode == ROUNDING_UP
 584                     && !x_sign && !y_sign)) {
 585                   // add 1 ulp and then check for overflow
 586                   C1_lo = C1_lo + 1;
 587                   if (C1_lo == 0) { // rounding overflow in the low 64 bits
 588                     C1_hi = C1_hi + 1;
 589                   }
 590                   if (C1_hi == 0x0001ed09bead87c0ull
 591                       && C1_lo == 0x378d8e6400000000ull) {
 592                     // C1 = 10^34 => rounding overflow
 593                     C1_hi = 0x0000314dc6448d93ull;
 594                     C1_lo = 0x38c15b0a00000000ull; // 10^33
 595                     x_exp = x_exp + EXP_P1;
 596                     if (x_exp == EXP_MAX_P1) { // overflow
 597                       C1_hi = 0x7800000000000000ull; // +inf
 598                       C1_lo = 0x0ull;
 599                       x_exp = 0; // x_sign is preserved
 600                       // set overflow flag (the inexact flag was set too)
 601                       *pfpsf |= OVERFLOW_EXCEPTION;
 602                     }
 603                   }
 604                 } else if ((rnd_mode == ROUNDING_DOWN && !x_sign && y_sign) ||
 605                     (rnd_mode == ROUNDING_UP && x_sign && !y_sign) ||
 606                     (rnd_mode == ROUNDING_TO_ZERO && x_sign != y_sign)) {
 607                   // subtract 1 ulp from C1
 608                   // Note: because delta >= P34 + 1 the result cannot be zero
 609                   C1_lo = C1_lo - 1;
 610                   if (C1_lo == 0xffffffffffffffffull)
 611                     C1_hi = C1_hi - 1;
 612                   // if the coefficient is 10^33-1 then make it 10^34-1 and
 613                   // decrease the exponent by 1 (because delta >= P34 + 1 the
 614                   // exponent will not become less than e_min)
 615                   // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
 616                   // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
 617                   if (C1_hi == 0x0000314dc6448d93ull
 618                       && C1_lo == 0x38c15b09ffffffffull) {
 619                     // make C1 = 10^34  - 1
 620                     C1_hi = 0x0001ed09bead87c0ull;
 621                     C1_lo = 0x378d8e63ffffffffull;
 622                     x_exp = x_exp - EXP_P1;
 623                   }
 624                 } else {
 625                   ; // the result is already correct
 626                 }
 627               }
 628               // set the inexact flag
 629               *pfpsf |= INEXACT_EXCEPTION;
 630               // assemble the result
 631               res.w[1] = x_sign | x_exp | C1_hi;
 632               res.w[0] = C1_lo;
 633             } else if ((C2_lo == halfulp64)
 634                        && (q1 < P34 || ((C1_lo & 0x1) == 0))) {
 635               // n2 = 1/2 ulp (n1) and C1 is even
 636               // the result is the operand with the larger magnitude,
 637               // possibly scaled up by 10^(P34-q1)
 638               // an overflow cannot occur in this case (rounding to nearest)
 639               if (q1 < P34) { // scale C1 up by 10^(P34-q1)
 640                 // Note: because delta = P34 it is certain that
 641                 //     x_exp - ((UINT64)scale << 49) will stay above e_min
 642                 scale = P34 - q1;
 643                 if (q1 <= 19) { // C1 fits in 64 bits
 644                   // 1 <= q1 <= 19 => 15 <= scale <= 33
 645                   if (scale <= 19) { // 10^scale fits in 64 bits
 646                     __mul_64x64_to_128MACH (C1, __bid_ten2k64[scale], C1_lo);
 647                   } else { // if 20 <= scale <= 33
 648                     // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
 649                     // (C1 * 10^(scale-19)) fits in 64 bits
 650                     C1_lo = C1_lo * __bid_ten2k64[scale - 19];
 651                     __mul_64x64_to_128MACH (C1, __bid_ten2k64[19], C1_lo);
 652                   }
 653                 } else { //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
 654                   // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
 655                   C1.w[1] = C1_hi;
 656                   C1.w[0] = C1_lo;
 657                   // C1 = __bid_ten2k64[P34 - q1] * C1
 658                   __mul_128x64_to_128 (C1, __bid_ten2k64[P34 - q1], C1);
 659                 }
 660                 x_exp = x_exp - ((UINT64) scale << 49);
 661                 C1_hi = C1.w[1];
 662                 C1_lo = C1.w[0];
 663               }
 664               if ((rnd_mode == ROUNDING_TO_NEAREST
 665                   && x_sign == y_sign && (C1_lo & 0x01))
 666                   || (rnd_mode == ROUNDING_TIES_AWAY
 667                   && x_sign == y_sign) || (rnd_mode == ROUNDING_UP
 668                   && !x_sign && !y_sign)
 669                   || (rnd_mode == ROUNDING_DOWN && x_sign && y_sign)) {
 670                 // add 1 ulp and then check for overflow
 671                 C1_lo = C1_lo + 1;
 672                 if (C1_lo == 0) { // rounding overflow in the low 64 bits
 673                   C1_hi = C1_hi + 1;
 674                 }
 675                 if (C1_hi == 0x0001ed09bead87c0ull
 676                     && C1_lo == 0x378d8e6400000000ull) {
 677                   // C1 = 10^34 => rounding overflow
 678                   C1_hi = 0x0000314dc6448d93ull;
 679                   C1_lo = 0x38c15b0a00000000ull; // 10^33
 680                   x_exp = x_exp + EXP_P1;
 681                   if (x_exp == EXP_MAX_P1) { // overflow
 682                     C1_hi = 0x7800000000000000ull; // +inf
 683                     C1_lo = 0x0ull;
 684                     x_exp = 0; // x_sign is preserved
 685                     // set overflow flag (the inexact flag was set too)
 686                     *pfpsf |= OVERFLOW_EXCEPTION;
 687                   }
 688                 }
 689               } else if ((rnd_mode == ROUNDING_TO_NEAREST
 690                          && x_sign != y_sign && (C1_lo & 0x01))
 691                          || (rnd_mode == ROUNDING_DOWN && !x_sign
 692                          && y_sign) || (rnd_mode == ROUNDING_UP
 693                          && x_sign && !y_sign)
 694                          || (rnd_mode == ROUNDING_TO_ZERO
 695                          && x_sign != y_sign)) {
 696                 // subtract 1 ulp from C1
 697                 // Note: because delta >= P34 + 1 the result cannot be zero
 698                 C1_lo = C1_lo - 1;
 699                 if (C1_lo == 0xffffffffffffffffull)
 700                   C1_hi = C1_hi - 1;
 701                 // if the coefficient is 10^33 - 1 then make it 10^34 - 1
 702                 // and decrease the exponent by 1 (because delta >= P34 + 1
 703                 // the exponent will not become less than e_min)
 704                 // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
 705                 // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
 706                 if (C1_hi == 0x0000314dc6448d93ull
 707                     && C1_lo == 0x38c15b09ffffffffull) {
 708                   // make C1 = 10^34  - 1
 709                   C1_hi = 0x0001ed09bead87c0ull;
 710                   C1_lo = 0x378d8e63ffffffffull;
 711                   x_exp = x_exp - EXP_P1;
 712                 }
 713               } else {
 714                 ; // the result is already correct
 715               }
 716               // set the inexact flag
 717               *pfpsf |= INEXACT_EXCEPTION;
 718               // assemble the result
 719               res.w[1] = x_sign | x_exp | C1_hi;
 720               res.w[0] = C1_lo;
 721             } else { // if C2_lo > halfulp64 ||
 722               // (C2_lo == halfulp64 && q1 == P34 && ((C1_lo & 0x1) == 1)), i.e.
 723               // 1/2 ulp(n1) < n2 < 1 ulp(n1) or n2 = 1/2 ulp(n1) and C1 odd
 724               // res = x+1 ulp if n1*n2 > 0 and res = x-1 ulp if n1*n2 < 0
 725               if (q1 < P34) { // then 1 ulp = 10^(e1+q1-P34) < 10^e1
 726                 // Note: if (q1 == P34) then 1 ulp = 10^(e1+q1-P34) = 10^e1
 727                 // because q1 < P34 we must first replace C1 by
 728                 // C1 * 10^(P34-q1), and must decrease the exponent by
 729                 // (P34-q1) (it will still be at least e_min)
 730                 scale = P34 - q1;
 731                 if (q1 <= 19) { // C1 fits in 64 bits
 732                   // 1 <= q1 <= 19 => 15 <= scale <= 33
 733                   if (scale <= 19) { // 10^scale fits in 64 bits
 734                     __mul_64x64_to_128MACH (C1, __bid_ten2k64[scale], C1_lo);
 735                   } else { // if 20 <= scale <= 33
 736                     // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
 737                     // (C1 * 10^(scale-19)) fits in 64 bits
 738                     C1_lo = C1_lo * __bid_ten2k64[scale - 19];
 739                     __mul_64x64_to_128MACH (C1, __bid_ten2k64[19], C1_lo);
 740                   }
 741                 } else { //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
 742                   // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
 743                   C1.w[1] = C1_hi;
 744                   C1.w[0] = C1_lo;
 745                   // C1 = __bid_ten2k64[P34 - q1] * C1
 746                   __mul_128x64_to_128 (C1, __bid_ten2k64[P34 - q1], C1);
 747                 }
 748                 x_exp = x_exp - ((UINT64) scale << 49);
 749                 C1_hi = C1.w[1];
 750                 C1_lo = C1.w[0];
 751                 // check for rounding overflow
 752                 if (C1_hi == 0x0001ed09bead87c0ull
 753                     && C1_lo == 0x378d8e6400000000ull) {
 754                   // C1 = 10^34 => rounding overflow
 755                   C1_hi = 0x0000314dc6448d93ull;
 756                   C1_lo = 0x38c15b0a00000000ull; // 10^33
 757                   x_exp = x_exp + EXP_P1;
 758                 }
 759               }
 760               if ((rnd_mode == ROUNDING_TO_NEAREST
 761                   && x_sign != y_sign)
 762                   || (rnd_mode == ROUNDING_TIES_AWAY
 763                   && x_sign != y_sign && C2_lo != halfulp64)
 764                   || (rnd_mode == ROUNDING_DOWN && !x_sign
 765                   && y_sign) || (rnd_mode == ROUNDING_UP && x_sign
 766                   && !y_sign) || (rnd_mode == ROUNDING_TO_ZERO
 767                   && x_sign != y_sign)) {
 768                 // the result is x - 1
 769                 // for RN n1 * n2 < 0; underflow not possible
 770                 C1_lo = C1_lo - 1;
 771                 if (C1_lo == 0xffffffffffffffffull)
 772                   C1_hi--;
 773                 // check if we crossed into the lower decade
 774                 if (C1_hi == 0x0000314dc6448d93ull &&
 775                     C1_lo == 0x38c15b09ffffffffull) { // 10^33 - 1
 776                   C1_hi = 0x0001ed09bead87c0ull; // 10^34 - 1
 777                   C1_lo = 0x378d8e63ffffffffull;
 778                   x_exp = x_exp - EXP_P1; // no underflow, because n1 >> n2
 779                 }
 780               } else if ((rnd_mode == ROUNDING_TO_NEAREST
 781                          && x_sign == y_sign)
 782                          || (rnd_mode == ROUNDING_TIES_AWAY
 783                          && x_sign == y_sign)
 784                          || (rnd_mode == ROUNDING_DOWN && x_sign
 785                          && y_sign) || (rnd_mode == ROUNDING_UP
 786                          && !x_sign && !y_sign)) {
 787                 // the result is x + 1
 788                 // for RN x_sign = y_sign, i.e. n1*n2 > 0
 789                 C1_lo = C1_lo + 1;
 790                 if (C1_lo == 0) { // rounding overflow in the low 64 bits
 791                   C1_hi = C1_hi + 1;
 792                 }
 793                 if (C1_hi == 0x0001ed09bead87c0ull
 794                     && C1_lo == 0x378d8e6400000000ull) {
 795                   // C1 = 10^34 => rounding overflow
 796                   C1_hi = 0x0000314dc6448d93ull;
 797                   C1_lo = 0x38c15b0a00000000ull; // 10^33
 798                   x_exp = x_exp + EXP_P1;
 799                   if (x_exp == EXP_MAX_P1) { // overflow
 800                     C1_hi = 0x7800000000000000ull; // +inf
 801                     C1_lo = 0x0ull;
 802                     x_exp = 0; // x_sign is preserved
 803                     // set the overflow flag
 804                     *pfpsf |= OVERFLOW_EXCEPTION;
 805                   }
 806                 }
 807               } else {
 808                 ; // the result is x
 809               }
 810               // set the inexact flag
 811               *pfpsf |= INEXACT_EXCEPTION;
 812               // assemble the result
 813               res.w[1] = x_sign | x_exp | C1_hi;
 814               res.w[0] = C1_lo;
 815             }
 816           } else { // if q2 >= 20 then 5*10^(q2-1) and C2 (the latter in
 817             // most cases) fit only in more than 64 bits
 818             halfulp128 = __bid_midpoint128[q2 - 20]; // 5 * 10^(q2-1)
 819             if ((C2_hi < halfulp128.w[1])
 820                 || (C2_hi == halfulp128.w[1]
 821                     && C2_lo < halfulp128.w[0])) {
 822               // n2 < 1/2 ulp (n1)
 823               // the result is the operand with the larger magnitude,
 824               // possibly scaled up by 10^(P34-q1)
 825               // an overflow cannot occur in this case (rounding to nearest)
 826               if (q1 < P34) { // scale C1 up by 10^(P34-q1)
 827                 // Note: because delta = P34 it is certain that
 828                 //     x_exp - ((UINT64)scale << 49) will stay above e_min
 829                 scale = P34 - q1;
 830                 if (q1 <= 19) { // C1 fits in 64 bits
 831                   // 1 <= q1 <= 19 => 15 <= scale <= 33
 832                   if (scale <= 19) { // 10^scale fits in 64 bits
 833                     __mul_64x64_to_128MACH (C1, __bid_ten2k64[scale], C1_lo);
 834                   } else { // if 20 <= scale <= 33
 835                     // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
 836                     // (C1 * 10^(scale-19)) fits in 64 bits
 837                     C1_lo = C1_lo * __bid_ten2k64[scale - 19];
 838                     __mul_64x64_to_128MACH (C1, __bid_ten2k64[19], C1_lo);
 839                   }
 840                 } else { //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
 841                   // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
 842                   C1.w[1] = C1_hi;
 843                   C1.w[0] = C1_lo;
 844                   // C1 = __bid_ten2k64[P34 - q1] * C1
 845                   __mul_128x64_to_128 (C1, __bid_ten2k64[P34 - q1], C1);
 846                 }
 847                 C1_hi = C1.w[1];
 848                 C1_lo = C1.w[0];
 849                 x_exp = x_exp - ((UINT64) scale << 49);
 850               }
 851               if (rnd_mode != ROUNDING_TO_NEAREST) {
 852                 if ((rnd_mode == ROUNDING_DOWN && x_sign
 853                     && y_sign) || (rnd_mode == ROUNDING_UP
 854                     && !x_sign && !y_sign)) {
 855                   // add 1 ulp and then check for overflow
 856                   C1_lo = C1_lo + 1;
 857                   if (C1_lo == 0) { // rounding overflow in the low 64 bits
 858                     C1_hi = C1_hi + 1;
 859                   }
 860                   if (C1_hi == 0x0001ed09bead87c0ull
 861                       && C1_lo == 0x378d8e6400000000ull) {
 862                     // C1 = 10^34 => rounding overflow
 863                     C1_hi = 0x0000314dc6448d93ull;
 864                     C1_lo = 0x38c15b0a00000000ull; // 10^33
 865                     x_exp = x_exp + EXP_P1;
 866                     if (x_exp == EXP_MAX_P1) { // overflow
 867                       C1_hi = 0x7800000000000000ull; // +inf
 868                       C1_lo = 0x0ull;
 869                       x_exp = 0; // x_sign is preserved
 870                       // set overflow flag (the inexact flag was set too)
 871                       *pfpsf |= OVERFLOW_EXCEPTION;
 872                     }
 873                   }
 874                 } else if ((rnd_mode == ROUNDING_DOWN && !x_sign
 875                            && y_sign) || (rnd_mode == ROUNDING_UP
 876                            && x_sign && !y_sign)
 877                            || (rnd_mode == ROUNDING_TO_ZERO
 878                            && x_sign != y_sign)) {
 879                   // subtract 1 ulp from C1
 880                   // Note: because delta >= P34 + 1 the result cannot be zero
 881                   C1_lo = C1_lo - 1;
 882                   if (C1_lo == 0xffffffffffffffffull)
 883                     C1_hi = C1_hi - 1;
 884                   // if the coefficient is 10^33-1 then make it 10^34-1 and
 885                   // decrease the exponent by 1 (because delta >= P34 + 1 the
 886                   // exponent will not become less than e_min)
 887                   // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
 888                   // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
 889                   if (C1_hi == 0x0000314dc6448d93ull
 890                       && C1_lo == 0x38c15b09ffffffffull) {
 891                     // make C1 = 10^34  - 1
 892                     C1_hi = 0x0001ed09bead87c0ull;
 893                     C1_lo = 0x378d8e63ffffffffull;
 894                     x_exp = x_exp - EXP_P1;
 895                   }
 896                 } else {
 897                   ; // the result is already correct
 898                 }
 899               }
 900               // set the inexact flag
 901               *pfpsf |= INEXACT_EXCEPTION;
 902               // assemble the result
 903               res.w[1] = x_sign | x_exp | C1_hi;
 904               res.w[0] = C1_lo;
 905             } else if ((C2_hi == halfulp128.w[1]
 906                         && C2_lo == halfulp128.w[0])
 907                        && (q1 < P34 || ((C1_lo & 0x1) == 0))) {
 908               // midpoint & lsb in C1 is 0
 909               // n2 = 1/2 ulp (n1) and C1 is even
 910               // the result is the operand with the larger magnitude,
 911               // possibly scaled up by 10^(P34-q1)
 912               // an overflow cannot occur in this case (rounding to nearest)
 913               if (q1 < P34) { // scale C1 up by 10^(P34-q1)
 914                 // Note: because delta = P34 it is certain that
 915                 //     x_exp - ((UINT64)scale << 49) will stay above e_min
 916                 scale = P34 - q1;
 917                 if (q1 <= 19) { // C1 fits in 64 bits
 918                   // 1 <= q1 <= 19 => 15 <= scale <= 33
 919                   if (scale <= 19) { // 10^scale fits in 64 bits
 920                     __mul_64x64_to_128MACH (C1, __bid_ten2k64[scale], C1_lo);
 921                   } else { // if 20 <= scale <= 33
 922                     // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
 923                     // (C1 * 10^(scale-19)) fits in 64 bits
 924                     C1_lo = C1_lo * __bid_ten2k64[scale - 19];
 925                     __mul_64x64_to_128MACH (C1, __bid_ten2k64[19], C1_lo);
 926                   }
 927                 } else { //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
 928                   // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
 929                   C1.w[1] = C1_hi;
 930                   C1.w[0] = C1_lo;
 931                   // C1 = __bid_ten2k64[P34 - q1] * C1
 932                   __mul_128x64_to_128 (C1, __bid_ten2k64[P34 - q1], C1);
 933                 }
 934                 x_exp = x_exp - ((UINT64) scale << 49);
 935                 C1_hi = C1.w[1];
 936                 C1_lo = C1.w[0];
 937               }
 938               if (rnd_mode != ROUNDING_TO_NEAREST) {
 939                 if ((rnd_mode == ROUNDING_TIES_AWAY
 940                     && x_sign == y_sign)
 941                     || (rnd_mode == ROUNDING_UP && !y_sign)) {
 942                   // add 1 ulp and then check for overflow
 943                   C1_lo = C1_lo + 1;
 944                   if (C1_lo == 0) { // rounding overflow in the low 64 bits
 945                     C1_hi = C1_hi + 1;
 946                   }
 947                   if (C1_hi == 0x0001ed09bead87c0ull
 948                       && C1_lo == 0x378d8e6400000000ull) {
 949                     // C1 = 10^34 => rounding overflow
 950                     C1_hi = 0x0000314dc6448d93ull;
 951                     C1_lo = 0x38c15b0a00000000ull; // 10^33
 952                     x_exp = x_exp + EXP_P1;
 953                     if (x_exp == EXP_MAX_P1) { // overflow
 954                       C1_hi = 0x7800000000000000ull; // +inf
 955                       C1_lo = 0x0ull;
 956                       x_exp = 0; // x_sign is preserved
 957                       // set overflow flag (the inexact flag was set too)
 958                       *pfpsf |= OVERFLOW_EXCEPTION;
 959                     }
 960                   }
 961                 } else if ((rnd_mode == ROUNDING_DOWN && y_sign)
 962                            || (rnd_mode == ROUNDING_TO_ZERO
 963                            && x_sign != y_sign)) {
 964                   // subtract 1 ulp from C1
 965                   // Note: because delta >= P34 + 1 the result cannot be zero
 966                   C1_lo = C1_lo - 1;
 967                   if (C1_lo == 0xffffffffffffffffull)
 968                     C1_hi = C1_hi - 1;
 969                   // if the coefficient is 10^33 - 1 then make it 10^34 - 1
 970                   // and decrease the exponent by 1 (because delta >= P34 + 1
 971                   // the exponent will not become less than e_min)
 972                   // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
 973                   // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
 974                   if (C1_hi == 0x0000314dc6448d93ull
 975                       && C1_lo == 0x38c15b09ffffffffull) {
 976                     // make C1 = 10^34  - 1
 977                     C1_hi = 0x0001ed09bead87c0ull;
 978                     C1_lo = 0x378d8e63ffffffffull;
 979                     x_exp = x_exp - EXP_P1;
 980                   }
 981                 } else {
 982                   ; // the result is already correct
 983                 }
 984               }
 985               // set the inexact flag
 986               *pfpsf |= INEXACT_EXCEPTION;
 987               // assemble the result
 988               res.w[1] = x_sign | x_exp | C1_hi;
 989               res.w[0] = C1_lo;
 990             } else { // if C2 > halfulp128 ||
 991               // (C2 == halfulp128 && q1 == P34 && ((C1 & 0x1) == 1)), i.e.
 992               // 1/2 ulp(n1) < n2 < 1 ulp(n1) or n2 = 1/2 ulp(n1) and C1 odd
 993               // res = x+1 ulp if n1*n2 > 0 and res = x-1 ulp if n1*n2 < 0
 994               if (q1 < P34) { // then 1 ulp = 10^(e1+q1-P34) < 10^e1
 995                 // Note: if (q1 == P34) then 1 ulp = 10^(e1+q1-P34) = 10^e1
 996                 // because q1 < P34 we must first replace C1 by C1*10^(P34-q1),
 997                 // and must decrease the exponent by (P34-q1) (it will still be
 998                 // at least e_min)
 999                 scale = P34 - q1;
1000                 if (q1 <= 19) { // C1 fits in 64 bits
1001                   // 1 <= q1 <= 19 => 15 <= scale <= 33
1002                   if (scale <= 19) { // 10^scale fits in 64 bits
1003                     __mul_64x64_to_128MACH (C1, __bid_ten2k64[scale], C1_lo);
1004                   } else { // if 20 <= scale <= 33
1005                     // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
1006                     // (C1 * 10^(scale-19)) fits in 64 bits
1007                     C1_lo = C1_lo * __bid_ten2k64[scale - 19];
1008                     __mul_64x64_to_128MACH (C1, __bid_ten2k64[19], C1_lo);
1009                   }
1010                 } else { //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
1011                   // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
1012                   C1.w[1] = C1_hi;
1013                   C1.w[0] = C1_lo;
1014                   // C1 = __bid_ten2k64[P34 - q1] * C1
1015                   __mul_128x64_to_128 (C1, __bid_ten2k64[P34 - q1], C1);
1016                 }
1017                 C1_hi = C1.w[1];
1018                 C1_lo = C1.w[0];
1019                 x_exp = x_exp - ((UINT64) scale << 49);
1020               }
1021               if ((rnd_mode == ROUNDING_TO_NEAREST
1022                   && x_sign != y_sign) || (rnd_mode == ROUNDING_TIES_AWAY
1023                   && x_sign != y_sign &&
1024                   (C2_hi != halfulp128.w[1] || C2_lo != halfulp128.w[0])) ||
1025                   (rnd_mode == ROUNDING_DOWN && !x_sign
1026                   && y_sign) || (rnd_mode == ROUNDING_UP && x_sign
1027                   && !y_sign) || (rnd_mode == ROUNDING_TO_ZERO
1028                   && x_sign != y_sign)) {
1029                 // the result is x - 1
1030                 // for RN n1 * n2 < 0; underflow not possible
1031                 C1_lo = C1_lo - 1;
1032                 if (C1_lo == 0xffffffffffffffffull)
1033                   C1_hi--;
1034                 // check if we crossed into the lower decade
1035                 if (C1_hi == 0x0000314dc6448d93ull &&
1036                     C1_lo == 0x38c15b09ffffffffull) { // 10^33 - 1
1037                   C1_hi = 0x0001ed09bead87c0ull; // 10^34 - 1
1038                   C1_lo = 0x378d8e63ffffffffull;
1039                   x_exp = x_exp - EXP_P1; // no underflow, because n1 >> n2
1040                 }
1041               } else if ((rnd_mode == ROUNDING_TO_NEAREST && x_sign == y_sign)
1042                   || (rnd_mode == ROUNDING_TIES_AWAY && x_sign == y_sign) ||
1043                   (rnd_mode == ROUNDING_DOWN && x_sign && y_sign) ||
1044                   (rnd_mode == ROUNDING_UP && !x_sign && !y_sign)) {
1045                 // the result is x + 1
1046                 // for RN x_sign = y_sign, i.e. n1*n2 > 0
1047                 C1_lo = C1_lo + 1;
1048                 if (C1_lo == 0) { // rounding overflow in the low 64 bits
1049                   C1_hi = C1_hi + 1;
1050                 }
1051                 if (C1_hi == 0x0001ed09bead87c0ull
1052                     && C1_lo == 0x378d8e6400000000ull) {
1053                   // C1 = 10^34 => rounding overflow
1054                   C1_hi = 0x0000314dc6448d93ull;
1055                   C1_lo = 0x38c15b0a00000000ull; // 10^33
1056                   x_exp = x_exp + EXP_P1;
1057                   if (x_exp == EXP_MAX_P1) { // overflow
1058                     C1_hi = 0x7800000000000000ull; // +inf
1059                     C1_lo = 0x0ull;
1060                     x_exp = 0; // x_sign is preserved
1061                     // set the overflow flag
1062                     *pfpsf |= OVERFLOW_EXCEPTION;
1063                   }
1064                 }
1065               } else {
1066                 ; // the result is x
1067               }
1068               // set the inexact flag
1069               *pfpsf |= INEXACT_EXCEPTION;
1070               // assemble the result
1071               res.w[1] = x_sign | x_exp | C1_hi;
1072               res.w[0] = C1_lo;
1073             }
1074           }        // end q1 >= 20
1075           // end case where C1 != 10^(q1-1)
1076         } else { // C1 = 10^(q1-1) and x_sign != y_sign
1077           // instead of C' = (C1 * 10^(e1-e2) + C2)rnd,P34
1078           // calculate C' = C1 * 10^(e1-e2-x1) + (C2 * 10^(-x1))rnd,P34
1079           // where x1 = q2 - 1, 0 <= x1 <= P34 - 1
1080           // Because C1 = 10^(q1-1) and x_sign != y_sign, C' will have P34
1081           // digits and n = C' * 10^(e2+x1)
1082           // If the result has P34+1 digits, redo the steps above with x1+1
1083           // If the result has P34-1 digits or less, redo the steps above with
1084           // x1-1 but only if initially x1 >= 1
1085           // NOTE: these two steps can be improved, e.g we could guess if
1086           // P34+1 or P34-1 digits will be obtained by adding/subtracting
1087           // just the top 64 bits of the two operands
1088           // The result cannot be zero, and it cannot overflow
1089           x1 = q2 - 1; // 0 <= x1 <= P34-1
1090           // Calculate C1 * 10^(e1-e2-x1) where 1 <= e1-e2-x1 <= P34
1091           // scale = (int)(e1 >> 49) - (int)(e2 >> 49) - x1; 0 <= scale <= P34-1
1092           scale = P34 - q1 + 1; // scale=e1-e2-x1 = P34+1-q1; 1<=scale<=P34
1093           // either C1 or 10^(e1-e2-x1) may not fit is 64 bits,
1094           // but their product fits with certainty in 128 bits
1095           if (scale >= 20) { //10^(e1-e2-x1) doesn't fit in 64 bits, but C1 does
1096             __mul_128x64_to_128 (C1, C1_lo, __bid_ten2k128[scale - 20]);
1097           } else { // if (scale >= 1
1098             // if 1 <= scale <= 19 then 10^(e1-e2-x1) fits in 64 bits
1099             if (q1 <= 19) { // C1 fits in 64 bits
1100               __mul_64x64_to_128MACH (C1, C1_lo, __bid_ten2k64[scale]);
1101             } else { // q1 >= 20
1102               C1.w[1] = C1_hi;
1103               C1.w[0] = C1_lo;
1104               __mul_128x64_to_128 (C1, __bid_ten2k64[scale], C1);
1105             }
1106           }
1107           tmp64 = C1.w[0]; // C1.w[1], C1.w[0] contains C1 * 10^(e1-e2-x1)
1108
1109           // now round C2 to q2-x1 = 1 decimal digit
1110           // C2' = C2 + 1/2 * 10^x1 = C2 + 5 * 10^(x1-1)
1111           ind = x1 - 1; // -1 <= ind <= P34 - 2
1112           if (ind >= 0) { // if (x1 >= 1)
1113             C2.w[0] = C2_lo;
1114             C2.w[1] = C2_hi;
1115             if (ind <= 18) {
1116               C2.w[0] = C2.w[0] + __bid_midpoint64[ind];
1117               if (C2.w[0] < C2_lo)
1118                 C2.w[1]++;
1119             } else { // 19 <= ind <= 32
1120               C2.w[0] = C2.w[0] + __bid_midpoint128[ind - 19].w[0];
1121               C2.w[1] = C2.w[1] + __bid_midpoint128[ind - 19].w[1];
1122               if (C2.w[0] < C2_lo)
1123                 C2.w[1]++;
1124             }
1125             // the approximation of 10^(-x1) was rounded up to 118 bits
1126             __mul_128x128_to_256 (R256, C2, __bid_ten2mk128[ind]); // R256 = C2*, f2*
1127             // calculate C2* and f2*
1128             // C2* is actually floor(C2*) in this case
1129             // C2* and f2* need shifting and masking, as shown by
1130             // __bid_shiftright128[] and __bid_maskhigh128[]
1131             // the top Ex bits of 10^(-x1) are T* = __bid_ten2mk128trunc[ind], e.g.
1132             // if x1=1, T*=__bid_ten2mk128trunc[0]=0x19999999999999999999999999999999
1133             // if (0 < f2* < 10^(-x1)) then
1134             //   if floor(C1+C2*) is even then C2* = floor(C2*) - logical right
1135             //       shift; C2* has p decimal digits, correct by Prop. 1)
1136             //   else if floor(C1+C2*) is odd C2* = floor(C2*)-1 (logical right
1137             //       shift; C2* has p decimal digits, correct by Pr. 1)
1138             // else
1139             //   C2* = floor(C2*) (logical right shift; C has p decimal digits,
1140             //       correct by Property 1)
1141             // n = C2* * 10^(e2+x1)
1142
1143             if (ind <= 2) {
1144               highf2star.w[1] = 0x0;
1145               highf2star.w[0] = 0x0; // low f2* ok
1146             } else if (ind <= 21) {
1147               highf2star.w[1] = 0x0;
1148               highf2star.w[0] = R256.w[2] & __bid_maskhigh128[ind]; // low f2* ok
1149             } else {
1150               highf2star.w[1] = R256.w[3] & __bid_maskhigh128[ind];
1151               highf2star.w[0] = R256.w[2]; // low f2* is ok
1152             }
1153             // shift right C2* by Ex-128 = __bid_shiftright128[ind]
1154             if (ind >= 3) {
1155               shift = __bid_shiftright128[ind];
1156               if (shift < 64) { // 3 <= shift <= 63
1157                 R256.w[2] =
1158                   (R256.w[2] >> shift) | (R256.w[3] << (64 - shift));
1159                 R256.w[3] = (R256.w[3] >> shift);
1160               } else { // 66 <= shift <= 102
1161                 R256.w[2] = (R256.w[3] >> (shift - 64));
1162                 R256.w[3] = 0x0ULL;
1163               }
1164             }
1165             // redundant
1166             is_inexact_lt_midpoint = 0;
1167             is_inexact_gt_midpoint = 0;
1168             is_midpoint_lt_even = 0;
1169             is_midpoint_gt_even = 0;
1170             // determine inexactness of the rounding of C2*
1171             // (cannot be followed by a second rounding)
1172             // if (0 < f2* - 1/2 < 10^(-x1)) then
1173             //   the result is exact
1174             // else (if f2* - 1/2 > T* then)
1175             //   the result of is inexact
1176             if (ind <= 2) {
1177               if (R256.w[1] > 0x8000000000000000ull ||
1178                   (R256.w[1] == 0x8000000000000000ull && R256.w[0] > 0x0ull)) {
1179                 // f2* > 1/2 and the result may be exact
1180                 tmp64A = R256.w[1] - 0x8000000000000000ull; // f* - 1/2
1181                 if ((tmp64A > __bid_ten2mk128trunc[ind].w[1]
1182                      || (tmp64A == __bid_ten2mk128trunc[ind].w[1]
1183                      && R256.w[0] >= __bid_ten2mk128trunc[ind].w[0]))) {
1184                   // set the inexact flag
1185                   *pfpsf |= INEXACT_EXCEPTION;
1186                   // this rounding is applied to C2 only!
1187                   // x_sign != y_sign
1188                   is_inexact_gt_midpoint = 1;
1189                 }        // else the result is exact
1190                 // rounding down, unless a midpoint in [ODD, EVEN]
1191               } else { // the result is inexact; f2* <= 1/2
1192                 // set the inexact flag
1193                 *pfpsf |= INEXACT_EXCEPTION;
1194                 // this rounding is applied to C2 only!
1195                 // x_sign != y_sign
1196                 is_inexact_lt_midpoint = 1;
1197               }
1198             } else if (ind <= 21) { // if 3 <= ind <= 21
1199               if (highf2star.w[1] > 0x0 || (highf2star.w[1] == 0x0
1200                   && highf2star.w[0] > __bid_one_half128[ind])
1201                   || (highf2star.w[1] == 0x0
1202                   && highf2star.w[0] == __bid_one_half128[ind]
1203                   && (R256.w[1] || R256.w[0]))) {
1204                 // f2* > 1/2 and the result may be exact
1205                 // Calculate f2* - 1/2
1206                 tmp64A = highf2star.w[0] - __bid_one_half128[ind];
1207                 tmp64B = highf2star.w[1];
1208                 if (tmp64A > highf2star.w[0])
1209                   tmp64B--;
1210                 if (tmp64B || tmp64A
1211                     || R256.w[1] > __bid_ten2mk128trunc[ind].w[1]
1212                     || (R256.w[1] == __bid_ten2mk128trunc[ind].w[1]
1213                     && R256.w[0] > __bid_ten2mk128trunc[ind].w[0])) {
1214                   // set the inexact flag
1215                   *pfpsf |= INEXACT_EXCEPTION;
1216                   // this rounding is applied to C2 only!
1217                   // x_sign != y_sign
1218                   is_inexact_gt_midpoint = 1;
1219                 }        // else the result is exact
1220               } else { // the result is inexact; f2* <= 1/2
1221                 // set the inexact flag
1222                 *pfpsf |= INEXACT_EXCEPTION;
1223                 // this rounding is applied to C2 only!
1224                 // x_sign != y_sign
1225                 is_inexact_lt_midpoint = 1;
1226               }
1227             } else { // if 22 <= ind <= 33
1228               if (highf2star.w[1] > __bid_one_half128[ind]
1229                   || (highf2star.w[1] == __bid_one_half128[ind]
1230                   && (highf2star.w[0] || R256.w[1]
1231                       || R256.w[0]))) {
1232                 // f2* > 1/2 and the result may be exact
1233                 // Calculate f2* - 1/2
1234                 // tmp64A = highf2star.w[0];
1235                 tmp64B = highf2star.w[1] - __bid_one_half128[ind];
1236                 if (tmp64B || highf2star.w[0]
1237                     || R256.w[1] > __bid_ten2mk128trunc[ind].w[1]
1238                     || (R256.w[1] == __bid_ten2mk128trunc[ind].w[1]
1239                     && R256.w[0] > __bid_ten2mk128trunc[ind].w[0])) {
1240                   // set the inexact flag
1241                   *pfpsf |= INEXACT_EXCEPTION;
1242                   // this rounding is applied to C2 only!
1243                   // x_sign != y_sign
1244                   is_inexact_gt_midpoint = 1;
1245                 }        // else the result is exact
1246               } else { // the result is inexact; f2* <= 1/2
1247                 // set the inexact flag
1248                 *pfpsf |= INEXACT_EXCEPTION;
1249                 // this rounding is applied to C2 only!
1250                 // x_sign != y_sign
1251                 is_inexact_lt_midpoint = 1;
1252               }
1253             }
1254             // check for midpoints after determining inexactness
1255             if ((R256.w[1] || R256.w[0]) && (highf2star.w[1] == 0)
1256                 && (highf2star.w[0] == 0)
1257                 && (R256.w[1] < __bid_ten2mk128trunc[ind].w[1]
1258                     || (R256.w[1] == __bid_ten2mk128trunc[ind].w[1]
1259                     && R256.w[0] <= __bid_ten2mk128trunc[ind].w[0]))) {
1260               // the result is a midpoint
1261               if ((tmp64 + R256.w[2]) & 0x01) { // MP in [EVEN, ODD]
1262                 // if floor(C2*) is odd C = floor(C2*) - 1; the result may be 0
1263                 R256.w[2]--;
1264                 if (R256.w[2] == 0xffffffffffffffffull)
1265                   R256.w[3]--;
1266                 // this rounding is applied to C2 only!
1267                 // x_sign != y_sign
1268                 is_midpoint_lt_even = 1;
1269                 is_inexact_lt_midpoint = 0;
1270                 is_inexact_gt_midpoint = 0;
1271               } else {
1272                 // else MP in [ODD, EVEN]
1273                 // this rounding is applied to C2 only!
1274                 // x_sign != y_sign
1275                 is_midpoint_gt_even = 1;
1276                 is_inexact_lt_midpoint = 0;
1277                 is_inexact_gt_midpoint = 0;
1278               }
1279             }
1280           } else { // if (ind == -1) only when x1 = 0
1281             R256.w[2] = C2_lo;
1282             R256.w[3] = C2_hi;
1283             is_midpoint_lt_even = 0;
1284             is_midpoint_gt_even = 0;
1285             is_inexact_lt_midpoint = 0;
1286             is_inexact_gt_midpoint = 0;
1287           }
1288           // and now subtract C1 * 10^(e1-e2-x1) - (C2 * 10^(-x1))rnd,P34
1289           // because x_sign != y_sign this last operation is exact
1290           C1.w[0] = C1.w[0] - R256.w[2];
1291           C1.w[1] = C1.w[1] - R256.w[3];
1292           if (C1.w[0] > tmp64)
1293             C1.w[1]--; // borrow
1294           if (C1.w[1] >= 0x8000000000000000ull) { // negative coefficient!
1295             C1.w[0] = ~C1.w[0];
1296             C1.w[0]++;
1297             C1.w[1] = ~C1.w[1];
1298             if (C1.w[0] == 0x0)
1299               C1.w[1]++;
1300             tmp_sign = y_sign; // the result will have the sign of y
1301           } else {
1302             tmp_sign = x_sign;
1303           }
1304           // the difference has exactly P34 digits
1305           x_sign = tmp_sign;
1306           if (x1 >= 1)
1307             y_exp = y_exp + ((UINT64) x1 << 49);
1308           C1_hi = C1.w[1];
1309           C1_lo = C1.w[0];
1310           // general correction from RN to RA, RM, RP, RZ; result uses y_exp
1311           if (rnd_mode != ROUNDING_TO_NEAREST) {
1312             if ((!x_sign &&
1313                 ((rnd_mode == ROUNDING_UP && is_inexact_lt_midpoint) ||
1314                 ((rnd_mode == ROUNDING_TIES_AWAY || rnd_mode == ROUNDING_UP) &&
1315                 is_midpoint_gt_even))) ||
1316                 (x_sign &&
1317                 ((rnd_mode == ROUNDING_DOWN && is_inexact_lt_midpoint) ||
1318                 ((rnd_mode == ROUNDING_TIES_AWAY || rnd_mode == ROUNDING_DOWN)
1319                 && is_midpoint_gt_even)))) {
1320               // C1 = C1 + 1
1321               C1_lo = C1_lo + 1;
1322               if (C1_lo == 0) { // rounding overflow in the low 64 bits
1323                 C1_hi = C1_hi + 1;
1324               }
1325               if (C1_hi == 0x0001ed09bead87c0ull
1326                   && C1_lo == 0x378d8e6400000000ull) {
1327                 // C1 = 10^34 => rounding overflow
1328                 C1_hi = 0x0000314dc6448d93ull;
1329                 C1_lo = 0x38c15b0a00000000ull; // 10^33
1330                 y_exp = y_exp + EXP_P1;
1331               }
1332             } else if ((is_midpoint_lt_even || is_inexact_gt_midpoint) &&
1333                 ((x_sign &&
1334                 (rnd_mode == ROUNDING_UP || rnd_mode == ROUNDING_TO_ZERO)) ||
1335                 (!x_sign &&
1336                 (rnd_mode == ROUNDING_DOWN || rnd_mode == ROUNDING_TO_ZERO)))) {
1337               // C1 = C1 - 1
1338               C1_lo = C1_lo - 1;
1339               if (C1_lo == 0xffffffffffffffffull)
1340                 C1_hi--;
1341               // check if we crossed into the lower decade
1342               if (C1_hi == 0x0000314dc6448d93ull &&
1343                   C1_lo == 0x38c15b09ffffffffull) { // 10^33 - 1
1344                 C1_hi = 0x0001ed09bead87c0ull; // 10^34 - 1
1345                 C1_lo = 0x378d8e63ffffffffull;
1346                 y_exp = y_exp - EXP_P1;
1347                 // no underflow, because delta + q2 >= P34 + 1
1348               }
1349             } else {
1350               ; // exact, the result is already correct
1351             }
1352           }
1353           // assemble the result
1354           res.w[1] = x_sign | y_exp | C1_hi;
1355           res.w[0] = C1_lo;
1356         }
1357       }        // end delta = P34
1358     } else { // if (|delta| <= P34 - 1)
1359       if (delta >= 0) { // if (0 <= delta <= P34 - 1)
1360         if (delta <= P34 - 1 - q2) {
1361           // calculate C' directly; the result is exact
1362           // in this case 1<=q1<=P34-1, 1<=q2<=P34-1 and 0 <= e1-e2 <= P34-2
1363           // The coefficient of the result is C1 * 10^(e1-e2) + C2 and the
1364           // exponent is e2; either C1 or 10^(e1-e2) may not fit is 64 bits,
1365           // but their product fits with certainty in 128 bits (actually in 113)
1366           scale = delta - q1 + q2; // scale = (int)(e1 >> 49) - (int)(e2 >> 49)
1367
1368           if (scale >= 20) { // 10^(e1-e2) does not fit in 64 bits, but C1 does
1369             __mul_128x64_to_128 (C1, C1_lo, __bid_ten2k128[scale - 20]);
1370             C1_hi = C1.w[1];
1371             C1_lo = C1.w[0];
1372           } else if (scale >= 1) {
1373             // if 1 <= scale <= 19 then 10^(e1-e2) fits in 64 bits
1374             if (q1 <= 19) { // C1 fits in 64 bits
1375               __mul_64x64_to_128MACH (C1, C1_lo, __bid_ten2k64[scale]);
1376             } else { // q1 >= 20
1377               C1.w[1] = C1_hi;
1378               C1.w[0] = C1_lo;
1379               __mul_128x64_to_128 (C1, __bid_ten2k64[scale], C1);
1380             }
1381             C1_hi = C1.w[1];
1382             C1_lo = C1.w[0];
1383           } else { // if (scale == 0) C1 is unchanged
1384             C1.w[0] = C1_lo; // C1.w[1] = C1_hi;
1385           }
1386           // now add C2
1387           if (x_sign == y_sign) {
1388             // the result cannot overflow
1389             C1_lo = C1_lo + C2_lo;
1390             C1_hi = C1_hi + C2_hi;
1391             if (C1_lo < C1.w[0])
1392               C1_hi++;
1393           } else { // if x_sign != y_sign
1394             C1_lo = C1_lo - C2_lo;
1395             C1_hi = C1_hi - C2_hi;
1396             if (C1_lo > C1.w[0])
1397               C1_hi--;
1398             // the result can be zero, but it cannot overflow
1399             if (C1_lo == 0 && C1_hi == 0) {
1400               // assemble the result
1401               if (x_exp < y_exp)
1402                 res.w[1] = x_exp;
1403               else
1404                 res.w[1] = y_exp;
1405               res.w[0] = 0;
1406               if (rnd_mode == ROUNDING_DOWN) {
1407                 res.w[1] |= 0x8000000000000000ull;
1408               }
1409               BID_RETURN (res);
1410             }
1411             if (C1_hi >= 0x8000000000000000ull) { // negative coefficient!
1412               C1_lo = ~C1_lo;
1413               C1_lo++;
1414               C1_hi = ~C1_hi;
1415               if (C1_lo == 0x0)
1416                 C1_hi++;
1417               x_sign = y_sign; // the result will have the sign of y
1418             }
1419           }
1420           // assemble the result
1421           res.w[1] = x_sign | y_exp | C1_hi;
1422           res.w[0] = C1_lo;
1423         } else if (delta == P34 - q2) {
1424           // calculate C' directly; the result may be inexact if it requires
1425           // P34+1 decimal digits; in this case the 'cutoff' point for addition
1426           // is at the position of the lsb of C2, so 0 <= e1-e2 <= P34-1
1427           // The coefficient of the result is C1 * 10^(e1-e2) + C2 and the
1428           // exponent is e2; either C1 or 10^(e1-e2) may not fit is 64 bits,
1429           // but their product fits with certainty in 128 bits (actually in 113)
1430           scale = delta - q1 + q2; // scale = (int)(e1 >> 49) - (int)(e2 >> 49)
1431           if (scale >= 20) { // 10^(e1-e2) does not fit in 64 bits, but C1 does
1432             __mul_128x64_to_128 (C1, C1_lo, __bid_ten2k128[scale - 20]);
1433           } else if (scale >= 1) {
1434             // if 1 <= scale <= 19 then 10^(e1-e2) fits in 64 bits
1435             if (q1 <= 19) { // C1 fits in 64 bits
1436               __mul_64x64_to_128MACH (C1, C1_lo, __bid_ten2k64[scale]);
1437             } else { // q1 >= 20
1438               C1.w[1] = C1_hi;
1439               C1.w[0] = C1_lo;
1440               __mul_128x64_to_128 (C1, __bid_ten2k64[scale], C1);
1441             }
1442           } else { // if (scale == 0) C1 is unchanged
1443             C1.w[1] = C1_hi;
1444             C1.w[0] = C1_lo; // only the low part is necessary
1445           }
1446           C1_hi = C1.w[1];
1447           C1_lo = C1.w[0];
1448           // now add C2
1449           if (x_sign == y_sign) {
1450             // the result can overflow!
1451             C1_lo = C1_lo + C2_lo;
1452             C1_hi = C1_hi + C2_hi;
1453             if (C1_lo < C1.w[0])
1454               C1_hi++;
1455             // test for overflow, possible only when C1 >= 10^34
1456             if (C1_hi > 0x0001ed09bead87c0ull ||
1457                 (C1_hi == 0x0001ed09bead87c0ull &&
1458                 C1_lo >= 0x378d8e6400000000ull)) { // C1 >= 10^34
1459               // in this case q = P34 + 1 and x = q - P34 = 1, so multiply
1460               // C'' = C'+ 5 = C1 + 5 by k1 ~ 10^(-1) calculated for P34 + 1
1461               // decimal digits
1462               // Calculate C'' = C' + 1/2 * 10^x
1463               if (C1_lo >= 0xfffffffffffffffbull) { // low half add has carry
1464                 C1_lo = C1_lo + 5;
1465                 C1_hi = C1_hi + 1;
1466               } else {
1467                 C1_lo = C1_lo + 5;
1468               }
1469               // the approximation of 10^(-1) was rounded up to 118 bits
1470               // 10^(-1) =~ 33333333333333333333333333333400 * 2^-129
1471               // 10^(-1) =~ 19999999999999999999999999999a00 * 2^-128
1472               C1.w[1] = C1_hi;
1473               C1.w[0] = C1_lo; // C''
1474               __bid_ten2m1.w[1] = 0x1999999999999999ull;
1475               __bid_ten2m1.w[0] = 0x9999999999999a00ull;
1476               __mul_128x128_to_256 (P256, C1, __bid_ten2m1); // P256 = C*, f*
1477               // C* is actually floor(C*) in this case
1478               // the top Ex = 128 bits of 10^(-1) are
1479               // T* = 0x00199999999999999999999999999999
1480               // if (0 < f* < 10^(-x)) then
1481               //   if floor(C*) is even then C = floor(C*) - logical right
1482               //       shift; C has p decimal digits, correct by Prop. 1)
1483               //   else if floor(C*) is odd C = floor(C*) - 1 (logical right
1484               //       shift; C has p decimal digits, correct by Pr. 1)
1485               // else
1486               //   C = floor(C*) (logical right shift; C has p decimal digits,
1487               //       correct by Property 1)
1488               // n = C * 10^(e2+x)
1489               if ((P256.w[1] || P256.w[0])
1490                   && (P256.w[1] < 0x1999999999999999ull
1491                       || (P256.w[1] == 0x1999999999999999ull
1492                       && P256.w[0] <= 0x9999999999999999ull))) {
1493                 // the result is a midpoint
1494                 if (P256.w[2] & 0x01) {
1495                   is_midpoint_gt_even = 1;
1496                   // if floor(C*) is odd C = floor(C*) - 1; the result is not 0
1497                   P256.w[2]--;
1498                   if (P256.w[2] == 0xffffffffffffffffull)
1499                     P256.w[3]--;
1500                 } else {
1501                   is_midpoint_lt_even = 1;
1502                 }
1503               }
1504               // n = Cstar * 10^(e2+1)
1505               y_exp = y_exp + EXP_P1;
1506               // C* != 10^P because C* has P34 digits
1507               // check for overflow
1508               if (y_exp == EXP_MAX_P1
1509                   && (rnd_mode == ROUNDING_TO_NEAREST
1510                       || rnd_mode == ROUNDING_TIES_AWAY)) {
1511                 // overflow for RN
1512                 res.w[1] = x_sign | 0x7800000000000000ull; // +/-inf
1513                 res.w[0] = 0x0ull;
1514                 // set the inexact flag
1515                 *pfpsf |= INEXACT_EXCEPTION;
1516                 // set the overflow flag
1517                 *pfpsf |= OVERFLOW_EXCEPTION;
1518                 BID_RETURN (res);
1519               }
1520               // if (0 < f* - 1/2 < 10^(-x)) then
1521               //   the result of the addition is exact
1522               // else
1523               //   the result of the addition is inexact
1524               if (P256.w[1] > 0x8000000000000000ull ||
1525                   (P256.w[1] == 0x8000000000000000ull &&
1526                   P256.w[0] > 0x0ull)) { // the result may be exact
1527                 tmp64 = P256.w[1] - 0x8000000000000000ull; // f* - 1/2
1528                 if ((tmp64 > 0x1999999999999999ull
1529                      || (tmp64 == 0x1999999999999999ull
1530                      && P256.w[0] >= 0x9999999999999999ull))) {
1531                   // set the inexact flag
1532                   *pfpsf |= INEXACT_EXCEPTION;
1533                   is_inexact = 1;
1534                 }        // else the result is exact
1535               } else { // the result is inexact
1536                 // set the inexact flag
1537                 *pfpsf |= INEXACT_EXCEPTION;
1538                 is_inexact = 1;
1539               }
1540               C1_hi = P256.w[3];
1541               C1_lo = P256.w[2];
1542               if (!is_midpoint_gt_even && !is_midpoint_lt_even) {
1543                 is_inexact_lt_midpoint = is_inexact
1544                   && (P256.w[1] & 0x8000000000000000ull);
1545                 is_inexact_gt_midpoint = is_inexact
1546                   && !(P256.w[1] & 0x8000000000000000ull);
1547               }
1548               // general correction from RN to RA, RM, RP, RZ;
1549               // result uses y_exp
1550               if (rnd_mode != ROUNDING_TO_NEAREST) {
1551                 if ((!x_sign &&
1552                     ((rnd_mode == ROUNDING_UP && is_inexact_lt_midpoint) ||
1553                     ((rnd_mode == ROUNDING_TIES_AWAY || rnd_mode == ROUNDING_UP)
1554                     && is_midpoint_gt_even))) ||
1555                     (x_sign &&
1556                     ((rnd_mode == ROUNDING_DOWN && is_inexact_lt_midpoint) ||
1557                     ((rnd_mode == ROUNDING_TIES_AWAY ||
1558                     rnd_mode == ROUNDING_DOWN) && is_midpoint_gt_even)))) {
1559                   // C1 = C1 + 1
1560                   C1_lo = C1_lo + 1;
1561                   if (C1_lo == 0) { // rounding overflow in the low 64 bits
1562                     C1_hi = C1_hi + 1;
1563                   }
1564                   if (C1_hi == 0x0001ed09bead87c0ull
1565                       && C1_lo == 0x378d8e6400000000ull) {
1566                     // C1 = 10^34 => rounding overflow
1567                     C1_hi = 0x0000314dc6448d93ull;
1568                     C1_lo = 0x38c15b0a00000000ull; // 10^33
1569                     y_exp = y_exp + EXP_P1;
1570                   }
1571                 } else if ((is_midpoint_lt_even || is_inexact_gt_midpoint) &&
1572                     ((x_sign && (rnd_mode == ROUNDING_UP ||
1573                     rnd_mode == ROUNDING_TO_ZERO)) || (!x_sign &&
1574                     (rnd_mode == ROUNDING_DOWN ||
1575                     rnd_mode == ROUNDING_TO_ZERO)))) {
1576                   // C1 = C1 - 1
1577                   C1_lo = C1_lo - 1;
1578                   if (C1_lo == 0xffffffffffffffffull)
1579                     C1_hi--;
1580                   // check if we crossed into the lower decade
1581                   if (C1_hi == 0x0000314dc6448d93ull &&
1582                       C1_lo == 0x38c15b09ffffffffull) { // 10^33 - 1
1583                     C1_hi = 0x0001ed09bead87c0ull; // 10^34 - 1
1584                     C1_lo = 0x378d8e63ffffffffull;
1585                     y_exp = y_exp - EXP_P1;
1586                     // no underflow, because delta + q2 >= P34 + 1
1587                   }
1588                 } else {
1589                   ; // exact, the result is already correct
1590                 }
1591                 // in all cases check for overflow (RN and RA solved already)
1592                 if (y_exp == EXP_MAX_P1) { // overflow
1593                   if ((rnd_mode == ROUNDING_DOWN && x_sign) || // RM and res < 0
1594                       (rnd_mode == ROUNDING_UP && !x_sign)) { // RP and res > 0
1595                     C1_hi = 0x7800000000000000ull; // +inf
1596                     C1_lo = 0x0ull;
1597                   } else { // RM and res > 0, RP and res < 0, or RZ
1598                     C1_hi = 0x5fffed09bead87c0ull;
1599                     C1_lo = 0x378d8e63ffffffffull;
1600                   }
1601                   y_exp = 0; // x_sign is preserved
1602                   // set the inexact flag (in case the exact addition was exact)
1603                   *pfpsf |= INEXACT_EXCEPTION;
1604                   // set the overflow flag
1605                   *pfpsf |= OVERFLOW_EXCEPTION;
1606                 }
1607               }
1608             } // else if (C1 < 10^34) then C1 is the coeff.; the result is exact
1609           } else { // if x_sign != y_sign the result is exact
1610             C1_lo = C1_lo - C2_lo;
1611             C1_hi = C1_hi - C2_hi;
1612             if (C1_lo > C1.w[0])
1613               C1_hi--;
1614             // the result can be zero, but it cannot overflow
1615             if (C1_lo == 0 && C1_hi == 0) {
1616               // assemble the result
1617               if (x_exp < y_exp)
1618                 res.w[1] = x_exp;
1619               else
1620                 res.w[1] = y_exp;
1621               res.w[0] = 0;
1622               if (rnd_mode == ROUNDING_DOWN) {
1623                 res.w[1] |= 0x8000000000000000ull;
1624               }
1625               BID_RETURN (res);
1626             }
1627             if (C1_hi >= 0x8000000000000000ull) { // negative coefficient!
1628               C1_lo = ~C1_lo;
1629               C1_lo++;
1630               C1_hi = ~C1_hi;
1631               if (C1_lo == 0x0)
1632                 C1_hi++;
1633               x_sign = y_sign; // the result will have the sign of y
1634             }
1635           }
1636           // assemble the result
1637           res.w[1] = x_sign | y_exp | C1_hi;
1638           res.w[0] = C1_lo;
1639         } else { // if (delta >= P34 + 1 - q2)
1640           // instead of C' = (C1 * 10^(e1-e2) + C2)rnd,P34
1641           // calculate C' = C1 * 10^(e1-e2-x1) + (C2 * 10^(-x1))rnd,P34
1642           // where x1 = q1 + e1 - e2 - P34, 1 <= x1 <= P34 - 1
1643           // In most cases C' will have P34 digits, and n = C' * 10^(e2+x1)
1644           // If the result has P34+1 digits, redo the steps above with x1+1
1645           // If the result has P34-1 digits or less, redo the steps above with
1646           // x1-1 but only if initially x1 >= 1
1647           // NOTE: these two steps can be improved, e.g we could guess if
1648           // P34+1 or P34-1 digits will be obtained by adding/subtracting just
1649           // the top 64 bits of the two operands
1650           // The result cannot be zero, but it can overflow
1651           x1 = delta + q2 - P34; // 1 <= x1 <= P34-1
1652         roundC2:
1653           // Calculate C1 * 10^(e1-e2-x1) where 0 <= e1-e2-x1 <= P34 - 1
1654           // scale = (int)(e1 >> 49) - (int)(e2 >> 49) - x1; 0 <= scale <= P34-1
1655           scale = delta - q1 + q2 - x1; // scale = e1 - e2 - x1 = P34 - q1
1656           // either C1 or 10^(e1-e2-x1) may not fit is 64 bits,
1657           // but their product fits with certainty in 128 bits (actually in 113)
1658           if (scale >= 20) { //10^(e1-e2-x1) doesn't fit in 64 bits, but C1 does
1659             __mul_128x64_to_128 (C1, C1_lo, __bid_ten2k128[scale - 20]);
1660           } else if (scale >= 1) {
1661             // if 1 <= scale <= 19 then 10^(e1-e2-x1) fits in 64 bits
1662             if (q1 <= 19) { // C1 fits in 64 bits
1663               __mul_64x64_to_128MACH (C1, C1_lo, __bid_ten2k64[scale]);
1664             } else { // q1 >= 20
1665               C1.w[1] = C1_hi;
1666               C1.w[0] = C1_lo;
1667               __mul_128x64_to_128 (C1, __bid_ten2k64[scale], C1);
1668             }
1669           } else { // if (scale == 0) C1 is unchanged
1670             C1.w[1] = C1_hi;
1671             C1.w[0] = C1_lo;
1672           }
1673           tmp64 = C1.w[0]; // C1.w[1], C1.w[0] contains C1 * 10^(e1-e2-x1)
1674
1675           // now round C2 to q2-x1 decimal digits, where 1<=x1<=q2-1<=P34-1
1676           // (but if we got here a second time after x1 = x1 - 1, then
1677           // x1 >= 0; note that for x1 = 0 C2 is unchanged)
1678           // C2' = C2 + 1/2 * 10^x1 = C2 + 5 * 10^(x1-1)
1679           ind = x1 - 1; // 0 <= ind <= q2-2<=P34-2=32; but note that if x1 = 0
1680           // during a second pass, then ind = -1
1681           if (ind >= 0) { // if (x1 >= 1)
1682             C2.w[0] = C2_lo;
1683             C2.w[1] = C2_hi;
1684             if (ind <= 18) {
1685               C2.w[0] = C2.w[0] + __bid_midpoint64[ind];
1686               if (C2.w[0] < C2_lo)
1687                 C2.w[1]++;
1688             } else { // 19 <= ind <= 32
1689               C2.w[0] = C2.w[0] + __bid_midpoint128[ind - 19].w[0];
1690               C2.w[1] = C2.w[1] + __bid_midpoint128[ind - 19].w[1];
1691               if (C2.w[0] < C2_lo)
1692                 C2.w[1]++;
1693             }
1694             // the approximation of 10^(-x1) was rounded up to 118 bits
1695             __mul_128x128_to_256 (R256, C2, __bid_ten2mk128[ind]); // R256 = C2*, f2*
1696             // calculate C2* and f2*
1697             // C2* is actually floor(C2*) in this case
1698             // C2* and f2* need shifting and masking, as shown by
1699             // __bid_shiftright128[] and __bid_maskhigh128[]
1700             // the top Ex bits of 10^(-x1) are T* = __bid_ten2mk128trunc[ind], e.g.
1701             // if x1=1, T*=__bid_ten2mk128trunc[0]=0x19999999999999999999999999999999
1702             // if (0 < f2* < 10^(-x1)) then
1703             //   if floor(C1+C2*) is even then C2* = floor(C2*) - logical right
1704             //       shift; C2* has p decimal digits, correct by Prop. 1)
1705             //   else if floor(C1+C2*) is odd C2* = floor(C2*)-1 (logical right
1706             //       shift; C2* has p decimal digits, correct by Pr. 1)
1707             // else
1708             //   C2* = floor(C2*) (logical right shift; C has p decimal digits,
1709             //       correct by Property 1)
1710             // n = C2* * 10^(e2+x1)
1711
1712             if (ind <= 2) {
1713               highf2star.w[1] = 0x0;
1714               highf2star.w[0] = 0x0; // low f2* ok
1715             } else if (ind <= 21) {
1716               highf2star.w[1] = 0x0;
1717               highf2star.w[0] = R256.w[2] & __bid_maskhigh128[ind]; // low f2* ok
1718             } else {
1719               highf2star.w[1] = R256.w[3] & __bid_maskhigh128[ind];
1720               highf2star.w[0] = R256.w[2]; // low f2* is ok
1721             }
1722             // shift right C2* by Ex-128 = __bid_shiftright128[ind]
1723             if (ind >= 3) {
1724               shift = __bid_shiftright128[ind];
1725               if (shift < 64) { // 3 <= shift <= 63
1726                 R256.w[2] =
1727                   (R256.w[2] >> shift) | (R256.w[3] << (64 - shift));
1728                 R256.w[3] = (R256.w[3] >> shift);
1729               } else { // 66 <= shift <= 102
1730                 R256.w[2] = (R256.w[3] >> (shift - 64));
1731                 R256.w[3] = 0x0ULL;
1732               }
1733             }
1734             if (second_pass) {
1735               is_inexact_lt_midpoint = 0;
1736               is_inexact_gt_midpoint = 0;
1737               is_midpoint_lt_even = 0;
1738               is_midpoint_gt_even = 0;
1739             }
1740             // determine inexactness of the rounding of C2* (this may be
1741             // followed by a second rounding only if we get P34+1
1742             // decimal digits)
1743             // if (0 < f2* - 1/2 < 10^(-x1)) then
1744             //   the result is exact
1745             // else (if f2* - 1/2 > T* then)
1746             //   the result of is inexact
1747             if (ind <= 2) {
1748               if (R256.w[1] > 0x8000000000000000ull ||
1749                   (R256.w[1] == 0x8000000000000000ull && R256.w[0] > 0x0ull)) {
1750                   // f2* > 1/2 and the result may be exact
1751                 tmp64A = R256.w[1] - 0x8000000000000000ull; // f* - 1/2
1752                 if ((tmp64A > __bid_ten2mk128trunc[ind].w[1]
1753                      || (tmp64A == __bid_ten2mk128trunc[ind].w[1]
1754                      && R256.w[0] >= __bid_ten2mk128trunc[ind].w[0]))) {
1755                   // set the inexact flag
1756                   // *pfpsf |= INEXACT_EXCEPTION;
1757                   tmp_inexact = 1; // may be set again during a second pass
1758                   // this rounding is applied to C2 only!
1759                   if (x_sign == y_sign)
1760                     is_inexact_lt_midpoint = 1;
1761                   else        // if (x_sign != y_sign)
1762                     is_inexact_gt_midpoint = 1;
1763                 }        // else the result is exact
1764                 // rounding down, unless a midpoint in [ODD, EVEN]
1765               } else { // the result is inexact; f2* <= 1/2
1766                 // set the inexact flag
1767                 // *pfpsf |= INEXACT_EXCEPTION;
1768                 tmp_inexact = 1; // just in case we will round a second time
1769                 // rounding up, unless a midpoint in [EVEN, ODD]
1770                 // this rounding is applied to C2 only!
1771                 if (x_sign == y_sign)
1772                   is_inexact_gt_midpoint = 1;
1773                 else        // if (x_sign != y_sign)
1774                   is_inexact_lt_midpoint = 1;
1775               }
1776             } else if (ind <= 21) { // if 3 <= ind <= 21
1777               if (highf2star.w[1] > 0x0 || (highf2star.w[1] == 0x0
1778                   && highf2star.w[0] > __bid_one_half128[ind])
1779                   || ((highf2star.w[1] == 0x0
1780                   && highf2star.w[0] == __bid_one_half128[ind])
1781                   && (R256.w[1] || R256.w[0]))) {
1782                 // f2* > 1/2 and the result may be exact
1783                 // Calculate f2* - 1/2
1784                 tmp64A = highf2star.w[0] - __bid_one_half128[ind];
1785                 tmp64B = highf2star.w[1];
1786                 if (tmp64A > highf2star.w[0])
1787                   tmp64B--;
1788                 if (tmp64B || tmp64A
1789                     || R256.w[1] > __bid_ten2mk128trunc[ind].w[1]
1790                     || (R256.w[1] == __bid_ten2mk128trunc[ind].w[1]
1791                     && R256.w[0] > __bid_ten2mk128trunc[ind].w[0])) {
1792                   // set the inexact flag
1793                   // *pfpsf |= INEXACT_EXCEPTION;
1794                   tmp_inexact = 1; // may be set again during a second pass
1795                   // this rounding is applied to C2 only!
1796                   if (x_sign == y_sign)
1797                     is_inexact_lt_midpoint = 1;
1798                   else        // if (x_sign != y_sign)
1799                     is_inexact_gt_midpoint = 1;
1800                 }        // else the result is exact
1801               } else { // the result is inexact; f2* <= 1/2
1802                 // set the inexact flag
1803                 // *pfpsf |= INEXACT_EXCEPTION;
1804                 tmp_inexact = 1; // may be set again during a second pass
1805                 // rounding up, unless a midpoint in [EVEN, ODD]
1806                 // this rounding is applied to C2 only!
1807                 if (x_sign == y_sign)
1808                   is_inexact_gt_midpoint = 1;
1809                 else        // if (x_sign != y_sign)
1810                   is_inexact_lt_midpoint = 1;
1811               }
1812             } else { // if 22 <= ind <= 33
1813               if (highf2star.w[1] > __bid_one_half128[ind]
1814                   || (highf2star.w[1] == __bid_one_half128[ind]
1815                   && (highf2star.w[0] || R256.w[1]
1816                       || R256.w[0]))) {
1817                 // f2* > 1/2 and the result may be exact
1818                 // Calculate f2* - 1/2
1819                 // tmp64A = highf2star.w[0];
1820                 tmp64B = highf2star.w[1] - __bid_one_half128[ind];
1821                 if (tmp64B || highf2star.w[0]
1822                     || R256.w[1] > __bid_ten2mk128trunc[ind].w[1]
1823                     || (R256.w[1] == __bid_ten2mk128trunc[ind].w[1]
1824                     && R256.w[0] > __bid_ten2mk128trunc[ind].w[0])) {
1825                   // set the inexact flag
1826                   // *pfpsf |= INEXACT_EXCEPTION;
1827                   tmp_inexact = 1; // may be set again during a second pass
1828                   // this rounding is applied to C2 only!
1829                   if (x_sign == y_sign)
1830                     is_inexact_lt_midpoint = 1;
1831                   else        // if (x_sign != y_sign)
1832                     is_inexact_gt_midpoint = 1;
1833                 }        // else the result is exact
1834               } else { // the result is inexact; f2* <= 1/2
1835                 // set the inexact flag
1836                 // *pfpsf |= INEXACT_EXCEPTION;
1837                 tmp_inexact = 1; // may be set again during a second pass
1838                 // rounding up, unless a midpoint in [EVEN, ODD]
1839                 // this rounding is applied to C2 only!
1840                 if (x_sign == y_sign)
1841                   is_inexact_gt_midpoint = 1;
1842                 else        // if (x_sign != y_sign)
1843                   is_inexact_lt_midpoint = 1;
1844               }
1845             }
1846             // check for midpoints
1847             if ((R256.w[1] || R256.w[0]) && (highf2star.w[1] == 0)
1848                 && (highf2star.w[0] == 0)
1849                 && (R256.w[1] < __bid_ten2mk128trunc[ind].w[1]
1850                     || (R256.w[1] == __bid_ten2mk128trunc[ind].w[1]
1851                     && R256.w[0] <= __bid_ten2mk128trunc[ind].w[0]))) {
1852               // the result is a midpoint
1853               if ((tmp64 + R256.w[2]) & 0x01) { // MP in [EVEN, ODD]
1854                 // if floor(C2*) is odd C = floor(C2*) - 1; the result may be 0
1855                 R256.w[2]--;
1856                 if (R256.w[2] == 0xffffffffffffffffull)
1857                   R256.w[3]--;
1858                 // this rounding is applied to C2 only!
1859                 if (x_sign == y_sign)
1860                   is_midpoint_gt_even = 1;
1861                 else        // if (x_sign != y_sign)
1862                   is_midpoint_lt_even = 1;
1863                 is_inexact_lt_midpoint = 0;
1864                 is_inexact_gt_midpoint = 0;
1865               } else {
1866                 // else MP in [ODD, EVEN]
1867                 // this rounding is applied to C2 only!
1868                 if (x_sign == y_sign)
1869                   is_midpoint_lt_even = 1;
1870                 else        // if (x_sign != y_sign)
1871                   is_midpoint_gt_even = 1;
1872                 is_inexact_lt_midpoint = 0;
1873                 is_inexact_gt_midpoint = 0;
1874               }
1875             }
1876             // end if (ind >= 0)
1877           } else { // if (ind == -1); only during a 2nd pass, and when x1 = 0
1878             R256.w[2] = C2_lo;
1879             R256.w[3] = C2_hi;
1880             tmp_inexact = 0;
1881             // to correct a possible setting to 1 from 1st pass
1882             if (second_pass) {
1883               is_midpoint_lt_even = 0;
1884               is_midpoint_gt_even = 0;
1885               is_inexact_lt_midpoint = 0;
1886               is_inexact_gt_midpoint = 0;
1887             }
1888           }
1889           // and now add/subtract C1 * 10^(e1-e2-x1) +/- (C2 * 10^(-x1))rnd,P34
1890           if (x_sign == y_sign) { // addition; could overflow
1891             // no second pass is possible this way (only for x_sign != y_sign)
1892             C1.w[0] = C1.w[0] + R256.w[2];
1893             C1.w[1] = C1.w[1] + R256.w[3];
1894             if (C1.w[0] < tmp64)
1895               C1.w[1]++; // carry
1896             // if the sum has P34+1 digits, i.e. C1>=10^34 redo the calculation
1897             // with x1=x1+1
1898             if (C1.w[1] > 0x0001ed09bead87c0ull ||
1899                 (C1.w[1] == 0x0001ed09bead87c0ull &&
1900                 C1.w[0] >= 0x378d8e6400000000ull)) { // C1 >= 10^34
1901               // chop off one more digit from the sum, but make sure there is
1902               // no double-rounding error (see table - double rounding logic)
1903               // now round C1 from P34+1 to P34 decimal digits
1904               // C1' = C1 + 1/2 * 10 = C1 + 5
1905               if (C1.w[0] >= 0xfffffffffffffffbull) { // low half add has carry
1906                 C1.w[0] = C1.w[0] + 5;
1907                 C1.w[1] = C1.w[1] + 1;
1908               } else {
1909                 C1.w[0] = C1.w[0] + 5;
1910               }
1911               // the approximation of 10^(-1) was rounded up to 118 bits
1912               __mul_128x128_to_256 (Q256, C1, __bid_ten2mk128[0]); // Q256 = C1*, f1*
1913               // C1* is actually floor(C1*) in this case
1914               // the top 128 bits of 10^(-1) are
1915               // T* = __bid_ten2mk128trunc[0]=0x19999999999999999999999999999999
1916               // if (0 < f1* < 10^(-1)) then
1917               //   if floor(C1*) is even then C1* = floor(C1*) - logical right
1918               //       shift; C1* has p decimal digits, correct by Prop. 1)
1919               //   else if floor(C1*) is odd C1* = floor(C1*) - 1 (logical right
1920               //       shift; C1* has p decimal digits, correct by Pr. 1)
1921               // else
1922               //   C1* = floor(C1*) (logical right shift; C has p decimal digits
1923               //       correct by Property 1)
1924               // n = C1* * 10^(e2+x1+1)
1925               if ((Q256.w[1] || Q256.w[0])
1926                   && (Q256.w[1] < __bid_ten2mk128trunc[0].w[1]
1927                       || (Q256.w[1] == __bid_ten2mk128trunc[0].w[1]
1928                       && Q256.w[0] <= __bid_ten2mk128trunc[0].w[0]))) {
1929                 // the result is a midpoint
1930                 if (is_inexact_lt_midpoint) { // for the 1st rounding
1931                   is_inexact_gt_midpoint = 1;
1932                   is_inexact_lt_midpoint = 0;
1933                   is_midpoint_gt_even = 0;
1934                   is_midpoint_lt_even = 0;
1935                 } else if (is_inexact_gt_midpoint) { // for the 1st rounding
1936                   Q256.w[2]--;
1937                   if (Q256.w[2] == 0xffffffffffffffffull)
1938                     Q256.w[3]--;
1939                   is_inexact_gt_midpoint = 0;
1940                   is_inexact_lt_midpoint = 1;
1941                   is_midpoint_gt_even = 0;
1942                   is_midpoint_lt_even = 0;
1943                 } else if (is_midpoint_gt_even) { // for the 1st rounding
1944                   // Note: cannot have is_midpoint_lt_even
1945                   is_inexact_gt_midpoint = 0;
1946                   is_inexact_lt_midpoint = 1;
1947                   is_midpoint_gt_even = 0;
1948                   is_midpoint_lt_even = 0;
1949                 } else { // the first rounding must have been exact
1950                   if (Q256.w[2] & 0x01) { // MP in [EVEN, ODD]
1951                     // the truncated result is correct
1952                     Q256.w[2]--;
1953                     if (Q256.w[2] == 0xffffffffffffffffull)
1954                       Q256.w[3]--;
1955                     is_inexact_gt_midpoint = 0;
1956                     is_inexact_lt_midpoint = 0;
1957                     is_midpoint_gt_even = 1;
1958                     is_midpoint_lt_even = 0;
1959                   } else { // MP in [ODD, EVEN]
1960                     is_inexact_gt_midpoint = 0;
1961                     is_inexact_lt_midpoint = 0;
1962                     is_midpoint_gt_even = 0;
1963                     is_midpoint_lt_even = 1;
1964                   }
1965                 }
1966                 tmp_inexact = 1; // in all cases
1967               } else { // the result is not a midpoint
1968                 // determine inexactness of the rounding of C1 (the sum C1+C2*)
1969                 // if (0 < f1* - 1/2 < 10^(-1)) then
1970                 //   the result is exact
1971                 // else (if f1* - 1/2 > T* then)
1972                 //   the result of is inexact
1973                 // ind = 0
1974                 if (Q256.w[1] > 0x8000000000000000ull
1975                     || (Q256.w[1] == 0x8000000000000000ull
1976                     && Q256.w[0] > 0x0ull)) {
1977                   // f1* > 1/2 and the result may be exact
1978                   Q256.w[1] = Q256.w[1] - 0x8000000000000000ull; // f1* - 1/2
1979                   if ((Q256.w[1] > __bid_ten2mk128trunc[0].w[1]
1980                        || (Q256.w[1] == __bid_ten2mk128trunc[0].w[1]
1981                        && Q256.w[0] > __bid_ten2mk128trunc[0].w[0]))) {
1982                     is_inexact_gt_midpoint = 0;
1983                     is_inexact_lt_midpoint = 1;
1984                     is_midpoint_gt_even = 0;
1985                     is_midpoint_lt_even = 0;
1986                     // set the inexact flag
1987                     tmp_inexact = 1;
1988                     // *pfpsf |= INEXACT_EXCEPTION;
1989                   } else { // else the result is exact for the 2nd rounding
1990                     if (tmp_inexact) { // if the previous rounding was inexact
1991                       if (is_midpoint_lt_even) {
1992                         is_inexact_gt_midpoint = 1;
1993                         is_midpoint_lt_even = 0;
1994                       } else if (is_midpoint_gt_even) {
1995                         is_inexact_lt_midpoint = 1;
1996                         is_midpoint_gt_even = 0;
1997                       } else {
1998                         ; // no change
1999                       }
2000                     }
2001                   }
2002                   // rounding down, unless a midpoint in [ODD, EVEN]
2003                 } else { // the result is inexact; f1* <= 1/2
2004                   is_inexact_gt_midpoint = 1;
2005                   is_inexact_lt_midpoint = 0;
2006                   is_midpoint_gt_even = 0;
2007                   is_midpoint_lt_even = 0;
2008                   // set the inexact flag
2009                   tmp_inexact = 1;
2010                   // *pfpsf |= INEXACT_EXCEPTION;
2011                 }
2012               }        // end 'the result is not a midpoint'
2013               // n = C1 * 10^(e2+x1)
2014               C1.w[1] = Q256.w[3];
2015               C1.w[0] = Q256.w[2];
2016               y_exp = y_exp + ((UINT64) (x1 + 1) << 49);
2017             } else { // C1 < 10^34
2018               // C1.w[1] and C1.w[0] already set
2019               // n = C1 * 10^(e2+x1)
2020               y_exp = y_exp + ((UINT64) x1 << 49);
2021             }
2022             // check for overflow
2023             if (y_exp == EXP_MAX_P1
2024                 && (rnd_mode == ROUNDING_TO_NEAREST
2025                     || rnd_mode == ROUNDING_TIES_AWAY)) {
2026               res.w[1] = 0x7800000000000000ull | x_sign; // +/-inf
2027               res.w[0] = 0x0ull;
2028               // set the inexact flag
2029               *pfpsf |= INEXACT_EXCEPTION;
2030               // set the overflow flag
2031               *pfpsf |= OVERFLOW_EXCEPTION;
2032               BID_RETURN (res);
2033             }        // else no overflow
2034           } else { // if x_sign != y_sign the result of this subtract. is exact
2035             C1.w[0] = C1.w[0] - R256.w[2];
2036             C1.w[1] = C1.w[1] - R256.w[3];
2037             if (C1.w[0] > tmp64)
2038               C1.w[1]--; // borrow
2039             if (C1.w[1] >= 0x8000000000000000ull) { // negative coefficient!
2040               C1.w[0] = ~C1.w[0];
2041               C1.w[0]++;
2042               C1.w[1] = ~C1.w[1];
2043               if (C1.w[0] == 0x0)
2044                 C1.w[1]++;
2045               tmp_sign = y_sign;
2046                   // the result will have the sign of y if last rnd
2047             } else {
2048               tmp_sign = x_sign;
2049             }
2050             // if the difference has P34-1 digits or less, i.e. C1 < 10^33 then
2051             //   redo the calculation with x1=x1-1;
2052             // redo the calculation also if C1 = 10^33 and
2053             //   (is_inexact_gt_midpoint or is_midpoint_lt_even);
2054             //   (the last part should have really been
2055             //   (is_inexact_lt_midpoint or is_midpoint_gt_even) from
2056             //    the rounding of C2, but the position flags have been reversed)
2057             // 10^33 = 0x0000314dc6448d93 0x38c15b0a00000000
2058             if ((C1.w[1] < 0x0000314dc6448d93ull ||
2059                 (C1.w[1] == 0x0000314dc6448d93ull &&
2060                 C1.w[0] < 0x38c15b0a00000000ull)) ||
2061                 (C1.w[1] == 0x0000314dc6448d93ull &&
2062                 C1.w[0] == 0x38c15b0a00000000ull &&
2063                 (is_inexact_gt_midpoint || is_midpoint_lt_even))) { // C1=10^33
2064               x1 = x1 - 1; // x1 >= 0
2065               if (x1 >= 0) {
2066                 // clear position flags and tmp_inexact
2067                 is_midpoint_lt_even = 0;
2068                 is_midpoint_gt_even = 0;
2069                 is_inexact_lt_midpoint = 0;
2070                 is_inexact_gt_midpoint = 0;
2071                 tmp_inexact = 0;
2072                 second_pass = 1;
2073                 goto roundC2; // else result has less than P34 digits
2074               }
2075             }
2076             // if the coefficient of the result is 10^34 it means that this
2077             // must be the second pass, and we are done
2078             if (C1.w[1] == 0x0001ed09bead87c0ull &&
2079                 C1.w[0] == 0x378d8e6400000000ull) { // if  C1 = 10^34
2080               C1.w[1] = 0x0000314dc6448d93ull; // C1 = 10^33
2081               C1.w[0] = 0x38c15b0a00000000ull;
2082               y_exp = y_exp + ((UINT64) 1 << 49);
2083             }
2084             x_sign = tmp_sign;
2085             if (x1 >= 1)
2086               y_exp = y_exp + ((UINT64) x1 << 49);
2087             // x1 = -1 is possible at the end of a second pass when the
2088             // first pass started with x1 = 1
2089           }
2090           C1_hi = C1.w[1];
2091           C1_lo = C1.w[0];
2092           // general correction from RN to RA, RM, RP, RZ; result uses y_exp
2093           if (rnd_mode != ROUNDING_TO_NEAREST) {
2094             if ((!x_sign
2095                 && ((rnd_mode == ROUNDING_UP
2096                     && is_inexact_lt_midpoint)
2097                     || ((rnd_mode == ROUNDING_TIES_AWAY
2098                         || rnd_mode == ROUNDING_UP)
2099                     && is_midpoint_gt_even))) ||
2100                 (x_sign
2101                 && ((rnd_mode == ROUNDING_DOWN
2102                     && is_inexact_lt_midpoint)
2103                     || ((rnd_mode == ROUNDING_TIES_AWAY
2104                         || rnd_mode == ROUNDING_DOWN)
2105                     && is_midpoint_gt_even)))) {
2106               // C1 = C1 + 1
2107               C1_lo = C1_lo + 1;
2108               if (C1_lo == 0) { // rounding overflow in the low 64 bits
2109                 C1_hi = C1_hi + 1;
2110               }
2111               if (C1_hi == 0x0001ed09bead87c0ull
2112                   && C1_lo == 0x378d8e6400000000ull) {
2113                 // C1 = 10^34 => rounding overflow
2114                 C1_hi = 0x0000314dc6448d93ull;
2115                 C1_lo = 0x38c15b0a00000000ull; // 10^33
2116                 y_exp = y_exp + EXP_P1;
2117               }
2118             } else if ((is_midpoint_lt_even || is_inexact_gt_midpoint) &&
2119                 ((x_sign &&
2120                 (rnd_mode == ROUNDING_UP || rnd_mode == ROUNDING_TO_ZERO)) ||
2121                 (!x_sign &&
2122                 (rnd_mode == ROUNDING_DOWN || rnd_mode == ROUNDING_TO_ZERO)))) {
2123               // C1 = C1 - 1
2124               C1_lo = C1_lo - 1;
2125               if (C1_lo == 0xffffffffffffffffull)
2126                 C1_hi--;
2127               // check if we crossed into the lower decade
2128               if (C1_hi == 0x0000314dc6448d93ull &&
2129                 C1_lo == 0x38c15b09ffffffffull) { // 10^33 - 1
2130                 C1_hi = 0x0001ed09bead87c0ull; // 10^34 - 1
2131                 C1_lo = 0x378d8e63ffffffffull;
2132                 y_exp = y_exp - EXP_P1;
2133                 // no underflow, because delta + q2 >= P34 + 1
2134               }
2135             } else {
2136               ; // exact, the result is already correct
2137             }
2138             // in all cases check for overflow (RN and RA solved already)
2139             if (y_exp == EXP_MAX_P1) { // overflow
2140               if ((rnd_mode == ROUNDING_DOWN && x_sign) || // RM and res < 0
2141                   (rnd_mode == ROUNDING_UP && !x_sign)) { // RP and res > 0
2142                 C1_hi = 0x7800000000000000ull; // +inf
2143                 C1_lo = 0x0ull;
2144               } else { // RM and res > 0, RP and res < 0, or RZ
2145                 C1_hi = 0x5fffed09bead87c0ull;
2146                 C1_lo = 0x378d8e63ffffffffull;
2147               }
2148               y_exp = 0; // x_sign is preserved
2149               // set the inexact flag (in case the exact addition was exact)
2150               *pfpsf |= INEXACT_EXCEPTION;
2151               // set the overflow flag
2152               *pfpsf |= OVERFLOW_EXCEPTION;
2153             }
2154           }
2155           // assemble the result
2156           res.w[1] = x_sign | y_exp | C1_hi;
2157           res.w[0] = C1_lo;
2158           if (tmp_inexact)
2159             *pfpsf |= INEXACT_EXCEPTION;
2160         }
2161       } else { // if (-P34 + 1 <= delta <= -1) <=> 1 <= -delta <= P34 - 1
2162         // NOTE: the following, up to "} else { // if x_sign != y_sign
2163         // the result is exact" is identical to "else if (delta == P34 - q2) {"
2164         // from above; also, the code is not symmetric: a+b and b+a may take
2165         // different paths (need to unify eventually!)
2166         // calculate C' = C2 + C1 * 10^(e1-e2) directly; the result may be
2167         // inexact if it requires P34 + 1 decimal digits; in either case the
2168         // 'cutoff' point for addition is at the position of the lsb of C2
2169         // The coefficient of the result is C1 * 10^(e1-e2) + C2 and the
2170         // exponent is e2; either C1 or 10^(e1-e2) may not fit is 64 bits,
2171         // but their product fits with certainty in 128 bits (actually in 113)
2172         // Note that 0 <= e1 - e2 <= P34 - 2
2173         //   -P34 + 1 <= delta <= -1 <=> -P34 + 1 <= delta <= -1 <=>
2174         //   -P34 + 1 <= q1 + e1 - q2 - e2 <= -1 <=>
2175         //   q2 - q1 - P34 + 1 <= e1 - e2 <= q2 - q1 - 1 <=>
2176         //   1 - P34 - P34 + 1 <= e1-e2 <= P34 - 1 - 1 => 0 <= e1-e2 <= P34 - 2
2177         scale = delta - q1 + q2; // scale = (int)(e1 >> 49) - (int)(e2 >> 49)
2178         if (scale >= 20) { // 10^(e1-e2) does not fit in 64 bits, but C1 does
2179           __mul_128x64_to_128 (C1, C1_lo, __bid_ten2k128[scale - 20]);
2180         } else if (scale >= 1) {
2181           // if 1 <= scale <= 19 then 10^(e1-e2) fits in 64 bits
2182           if (q1 <= 19) { // C1 fits in 64 bits
2183             __mul_64x64_to_128MACH (C1, C1_lo, __bid_ten2k64[scale]);
2184           } else { // q1 >= 20
2185             C1.w[1] = C1_hi;
2186             C1.w[0] = C1_lo;
2187             __mul_128x64_to_128 (C1, __bid_ten2k64[scale], C1);
2188           }
2189         } else { // if (scale == 0) C1 is unchanged
2190           C1.w[1] = C1_hi;
2191           C1.w[0] = C1_lo; // only the low part is necessary
2192         }
2193         C1_hi = C1.w[1];
2194         C1_lo = C1.w[0];
2195         // now add C2
2196         if (x_sign == y_sign) {
2197           // the result can overflow!
2198           C1_lo = C1_lo + C2_lo;
2199           C1_hi = C1_hi + C2_hi;
2200           if (C1_lo < C1.w[0])
2201             C1_hi++;
2202           // test for overflow, possible only when C1 >= 10^34
2203           if (C1_hi > 0x0001ed09bead87c0ull ||
2204               (C1_hi == 0x0001ed09bead87c0ull &&
2205               C1_lo >= 0x378d8e6400000000ull)) { // C1 >= 10^34
2206             // in this case q = P34 + 1 and x = q - P34 = 1, so multiply
2207             // C'' = C'+ 5 = C1 + 5 by k1 ~ 10^(-1) calculated for P34 + 1
2208             // decimal digits
2209             // Calculate C'' = C' + 1/2 * 10^x
2210             if (C1_lo >= 0xfffffffffffffffbull) { // low half add has carry
2211               C1_lo = C1_lo + 5;
2212               C1_hi = C1_hi + 1;
2213             } else {
2214               C1_lo = C1_lo + 5;
2215             }
2216             // the approximation of 10^(-1) was rounded up to 118 bits
2217             // 10^(-1) =~ 33333333333333333333333333333400 * 2^-129
2218             // 10^(-1) =~ 19999999999999999999999999999a00 * 2^-128
2219             C1.w[1] = C1_hi;
2220             C1.w[0] = C1_lo; // C''
2221             __bid_ten2m1.w[1] = 0x1999999999999999ull;
2222             __bid_ten2m1.w[0] = 0x9999999999999a00ull;
2223             __mul_128x128_to_256 (P256, C1, __bid_ten2m1); // P256 = C*, f*
2224             // C* is actually floor(C*) in this case
2225             // the top Ex = 128 bits of 10^(-1) are
2226             // T* = 0x00199999999999999999999999999999
2227             // if (0 < f* < 10^(-x)) then
2228             //   if floor(C*) is even then C = floor(C*) - logical right
2229             //       shift; C has p decimal digits, correct by Prop. 1)
2230             //   else if floor(C*) is odd C = floor(C*) - 1 (logical right
2231             //       shift; C has p decimal digits, correct by Pr. 1)
2232             // else
2233             //   C = floor(C*) (logical right shift; C has p decimal digits,
2234             //       correct by Property 1)
2235             // n = C * 10^(e2+x)
2236             if ((P256.w[1] || P256.w[0])
2237                 && (P256.w[1] < 0x1999999999999999ull
2238                     || (P256.w[1] == 0x1999999999999999ull
2239                     && P256.w[0] <= 0x9999999999999999ull))) {
2240               // the result is a midpoint
2241               if (P256.w[2] & 0x01) {
2242                 is_midpoint_gt_even = 1;
2243                 // if floor(C*) is odd C = floor(C*) - 1; the result is not 0
2244                 P256.w[2]--;
2245                 if (P256.w[2] == 0xffffffffffffffffull)
2246                   P256.w[3]--;
2247               } else {
2248                 is_midpoint_lt_even = 1;
2249               }
2250             }
2251             // n = Cstar * 10^(e2+1)
2252             y_exp = y_exp + EXP_P1;
2253             // C* != 10^P34 because C* has P34 digits
2254             // check for overflow
2255             if (y_exp == EXP_MAX_P1
2256                 && (rnd_mode == ROUNDING_TO_NEAREST
2257                     || rnd_mode == ROUNDING_TIES_AWAY)) {
2258               // overflow for RN
2259               res.w[1] = x_sign | 0x7800000000000000ull; // +/-inf
2260               res.w[0] = 0x0ull;
2261               // set the inexact flag
2262               *pfpsf |= INEXACT_EXCEPTION;
2263               // set the overflow flag
2264               *pfpsf |= OVERFLOW_EXCEPTION;
2265               BID_RETURN (res);
2266             }
2267             // if (0 < f* - 1/2 < 10^(-x)) then
2268             //   the result of the addition is exact
2269             // else
2270             //   the result of the addition is inexact
2271             if (P256.w[1] > 0x8000000000000000ull ||
2272                 (P256.w[1] == 0x8000000000000000ull &&
2273                 P256.w[0] > 0x0ull)) { // the result may be exact
2274               tmp64 = P256.w[1] - 0x8000000000000000ull; // f* - 1/2
2275               if ((tmp64 > 0x1999999999999999ull
2276                    || (tmp64 == 0x1999999999999999ull
2277                    && P256.w[0] >= 0x9999999999999999ull))) {
2278                 // set the inexact flag
2279                 *pfpsf |= INEXACT_EXCEPTION;
2280                 is_inexact = 1;
2281               }        // else the result is exact
2282             } else { // the result is inexact
2283               // set the inexact flag
2284               *pfpsf |= INEXACT_EXCEPTION;
2285               is_inexact = 1;
2286             }
2287             C1_hi = P256.w[3];
2288             C1_lo = P256.w[2];
2289             if (!is_midpoint_gt_even && !is_midpoint_lt_even) {
2290               is_inexact_lt_midpoint = is_inexact
2291                 && (P256.w[1] & 0x8000000000000000ull);
2292               is_inexact_gt_midpoint = is_inexact
2293                 && !(P256.w[1] & 0x8000000000000000ull);
2294             }
2295             // general correction from RN to RA, RM, RP, RZ; result uses y_exp
2296             if (rnd_mode != ROUNDING_TO_NEAREST) {
2297               if ((!x_sign &&
2298                   ((rnd_mode == ROUNDING_UP && is_inexact_lt_midpoint) ||
2299                   ((rnd_mode == ROUNDING_TIES_AWAY || rnd_mode == ROUNDING_UP)
2300                   && is_midpoint_gt_even))) ||
2301                   (x_sign &&
2302                   ((rnd_mode == ROUNDING_DOWN && is_inexact_lt_midpoint) ||
2303                   ((rnd_mode == ROUNDING_TIES_AWAY || rnd_mode == ROUNDING_DOWN)
2304                   && is_midpoint_gt_even)))) {
2305                 // C1 = C1 + 1
2306                 C1_lo = C1_lo + 1;
2307                 if (C1_lo == 0) { // rounding overflow in the low 64 bits
2308                   C1_hi = C1_hi + 1;
2309                 }
2310                 if (C1_hi == 0x0001ed09bead87c0ull
2311                     && C1_lo == 0x378d8e6400000000ull) {
2312                   // C1 = 10^34 => rounding overflow
2313                   C1_hi = 0x0000314dc6448d93ull;
2314                   C1_lo = 0x38c15b0a00000000ull; // 10^33
2315                   y_exp = y_exp + EXP_P1;
2316                 }
2317               } else
2318                 if ((is_midpoint_lt_even || is_inexact_gt_midpoint) &&
2319                     ((x_sign &&
2320                     (rnd_mode == ROUNDING_UP ||
2321                     rnd_mode == ROUNDING_TO_ZERO)) ||
2322                     (!x_sign &&
2323                     (rnd_mode == ROUNDING_DOWN ||
2324                     rnd_mode == ROUNDING_TO_ZERO)))) {
2325                 // C1 = C1 - 1
2326                 C1_lo = C1_lo - 1;
2327                 if (C1_lo == 0xffffffffffffffffull)
2328                   C1_hi--;
2329                 // check if we crossed into the lower decade
2330                 if (C1_hi == 0x0000314dc6448d93ull &&
2331                     C1_lo == 0x38c15b09ffffffffull) { // 10^33 - 1
2332                   C1_hi = 0x0001ed09bead87c0ull; // 10^34 - 1
2333                   C1_lo = 0x378d8e63ffffffffull;
2334                   y_exp = y_exp - EXP_P1;
2335                   // no underflow, because delta + q2 >= P34 + 1
2336                 }
2337               } else {
2338                 ; // exact, the result is already correct
2339               }
2340               // in all cases check for overflow (RN and RA solved already)
2341               if (y_exp == EXP_MAX_P1) { // overflow
2342                 if ((rnd_mode == ROUNDING_DOWN && x_sign) || // RM and res < 0
2343                     (rnd_mode == ROUNDING_UP && !x_sign)) { // RP and res > 0
2344                   C1_hi = 0x7800000000000000ull; // +inf
2345                   C1_lo = 0x0ull;
2346                 } else { // RM and res > 0, RP and res < 0, or RZ
2347                   C1_hi = 0x5fffed09bead87c0ull;
2348                   C1_lo = 0x378d8e63ffffffffull;
2349                 }
2350                 y_exp = 0; // x_sign is preserved
2351                 // set the inexact flag (in case the exact addition was exact)
2352                 *pfpsf |= INEXACT_EXCEPTION;
2353                 // set the overflow flag
2354                 *pfpsf |= OVERFLOW_EXCEPTION;
2355               }
2356             }
2357           } // else if (C1 < 10^34) then C1 is the coeff.; the result is exact
2358           // assemble the result
2359           res.w[1] = x_sign | y_exp | C1_hi;
2360           res.w[0] = C1_lo;
2361         } else { // if x_sign != y_sign the result is exact
2362           C1_lo = C2_lo - C1_lo;
2363           C1_hi = C2_hi - C1_hi;
2364           if (C1_lo > C2_lo)
2365             C1_hi--;
2366           if (C1_hi >= 0x8000000000000000ull) { // negative coefficient!
2367             C1_lo = ~C1_lo;
2368             C1_lo++;
2369             C1_hi = ~C1_hi;
2370             if (C1_lo == 0x0)
2371               C1_hi++;
2372             x_sign = y_sign; // the result will have the sign of y
2373           }
2374           // the result can be zero, but it cannot overflow
2375           if (C1_lo == 0 && C1_hi == 0) {
2376             // assemble the result
2377             if (x_exp < y_exp)
2378               res.w[1] = x_exp;
2379             else
2380               res.w[1] = y_exp;
2381             res.w[0] = 0;
2382             if (rnd_mode == ROUNDING_DOWN) {
2383               res.w[1] |= 0x8000000000000000ull;
2384             }
2385             BID_RETURN (res);
2386           }
2387           // assemble the result
2388           res.w[1] = y_sign | y_exp | C1_hi;
2389           res.w[0] = C1_lo;
2390         }
2391       }
2392     }
2393     BID_RETURN (res)
2394   }
2395 }
2396
2397 /*****************************************************************************
2398  *  BID128 sub
2399  ****************************************************************************/
2400
2401 #if DECIMAL_CALL_BY_REFERENCE
2402 void
2403 __bid128_sub (UINT128 * pres, UINT128 * px,
2404             UINT128 *
2405             py _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
2406             _EXC_INFO_PARAM) {
2407   UINT128 x = *px, y = *py;
2408 #if !DECIMAL_GLOBAL_ROUNDING
2409   unsigned int rnd_mode = *prnd_mode;
2410 #endif
2411 #else
2412 UINT128
2413 __bid128_sub (UINT128 x,
2414             UINT128 y _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
2415             _EXC_INFO_PARAM) {
2416 #endif
2417
2418   UINT128 res;
2419   UINT64 y_sign;
2420
2421   if ((y.w[1] & MASK_NAN) != MASK_NAN) { // y is not NAN
2422     // change its sign
2423     y_sign = y.w[1] & MASK_SIGN; // 0 for positive, MASK_SIGN for negative
2424     if (y_sign)
2425       y.w[1] = y.w[1] & 0x7fffffffffffffffull;
2426     else
2427       y.w[1] = y.w[1] | 0x8000000000000000ull;
2428   }
2429 #if DECIMAL_CALL_BY_REFERENCE
2430   __bid128_add (&res, &x,
2431               &y _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
2432               _EXC_INFO_ARG);
2433 #else
2434   res =
2435     __bid128_add (x,
2436                 y _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
2437                 _EXC_INFO_ARG);
2438 #endif
2439   BID_RETURN (res);
2440 }