OSDN Git Service

libgcc/
[pf3gnuchains/gcc-fork.git] / libgcc / config / libbid / bid128_add.c
index 79bf9e5..9cb4813 100644 (file)
@@ -26,36 +26,455 @@ along with GCC; see the file COPYING.  If not, write to the Free
 Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
 02110-1301, USA.  */
 
+#include "bid_internal.h"
+
+
+#if DECIMAL_CALL_BY_REFERENCE
+void
+bid64dq_add (UINT64 * pres, UINT64 * px, UINT128 * py
+            _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
+            _EXC_INFO_PARAM) {
+  UINT64 x = *px;
+#if !DECIMAL_GLOBAL_ROUNDING
+  unsigned int rnd_mode = *prnd_mode;
+#endif
+#else
+UINT64
+bid64dq_add (UINT64 x, UINT128 y
+            _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
+            _EXC_INFO_PARAM) {
+#endif
+  UINT64 res = 0xbaddbaddbaddbaddull;
+  UINT128 x1;
+
+#if DECIMAL_CALL_BY_REFERENCE
+  bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
+  bid64qq_add (&res, &x1, py
+              _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
+              _EXC_INFO_ARG);
+#else
+  x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
+  res = bid64qq_add (x1, y
+                    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
+                    _EXC_INFO_ARG);
+#endif
+  BID_RETURN (res);
+}
+
+
+#if DECIMAL_CALL_BY_REFERENCE
+void
+bid64qd_add (UINT64 * pres, UINT128 * px, UINT64 * py
+            _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
+            _EXC_INFO_PARAM) {
+  UINT64 y = *py;
+#if !DECIMAL_GLOBAL_ROUNDING
+  unsigned int rnd_mode = *prnd_mode;
+#endif
+#else
+UINT64
+bid64qd_add (UINT128 x, UINT64 y
+            _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
+            _EXC_INFO_PARAM) {
+#endif
+  UINT64 res = 0xbaddbaddbaddbaddull;
+  UINT128 y1;
+
+#if DECIMAL_CALL_BY_REFERENCE
+  bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
+  bid64qq_add (&res, px, &y1
+              _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
+              _EXC_INFO_ARG);
+#else
+  y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
+  res = bid64qq_add (x, y1
+                    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
+                    _EXC_INFO_ARG);
+#endif
+  BID_RETURN (res);
+}
+
+
+#if DECIMAL_CALL_BY_REFERENCE
+void
+bid64qq_add (UINT64 * pres, UINT128 * px, UINT128 * py
+            _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
+            _EXC_INFO_PARAM) {
+  UINT128 x = *px, y = *py;
+#if !DECIMAL_GLOBAL_ROUNDING
+  unsigned int rnd_mode = *prnd_mode;
+#endif
+#else
+UINT64
+bid64qq_add (UINT128 x, UINT128 y
+            _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
+            _EXC_INFO_PARAM) {
+#endif
+
+  UINT128 one = { {0x0000000000000001ull, 0x3040000000000000ull}
+  };
+  UINT64 res = 0xbaddbaddbaddbaddull;
+
+  BID_SWAP128 (one);
+#if DECIMAL_CALL_BY_REFERENCE
+  bid64qqq_fma (&res, &one, &x, &y
+               _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
+               _EXC_INFO_ARG);
+#else
+  res = bid64qqq_fma (one, x, y
+                     _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
+                     _EXC_INFO_ARG);
+#endif
+  BID_RETURN (res);
+}
+
+
+#if DECIMAL_CALL_BY_REFERENCE
+void
+bid128dd_add (UINT128 * pres, UINT64 * px, UINT64 * py
+             _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
+             _EXC_INFO_PARAM) {
+  UINT64 x = *px, y = *py;
+#if !DECIMAL_GLOBAL_ROUNDING
+  unsigned int rnd_mode = *prnd_mode;
+#endif
+#else
+UINT128
+bid128dd_add (UINT64 x, UINT64 y
+             _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
+             _EXC_INFO_PARAM) {
+#endif
+  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
+  };
+  UINT128 x1, y1;
+
+#if DECIMAL_CALL_BY_REFERENCE
+  bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
+  bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
+  bid128_add (&res, &x1, &y1
+             _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
+             _EXC_INFO_ARG);
+#else
+  x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
+  y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
+  res = bid128_add (x1, y1
+                   _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
+                   _EXC_INFO_ARG);
+#endif
+  BID_RETURN (res);
+}
+
+
+#if DECIMAL_CALL_BY_REFERENCE
+void
+bid128dq_add (UINT128 * pres, UINT64 * px, UINT128 * py
+             _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
+             _EXC_INFO_PARAM) {
+  UINT64 x = *px;
+#if !DECIMAL_GLOBAL_ROUNDING
+  unsigned int rnd_mode = *prnd_mode;
+#endif
+#else
+UINT128
+bid128dq_add (UINT64 x, UINT128 y
+             _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
+             _EXC_INFO_PARAM) {
+#endif
+  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
+  };
+  UINT128 x1;
+
+#if DECIMAL_CALL_BY_REFERENCE
+  bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
+  bid128_add (&res, &x1, py
+             _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
+             _EXC_INFO_ARG);
+#else
+  x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
+  res = bid128_add (x1, y
+                   _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
+                   _EXC_INFO_ARG);
+#endif
+  BID_RETURN (res);
+}
+
+
+#if DECIMAL_CALL_BY_REFERENCE
+void
+bid128qd_add (UINT128 * pres, UINT128 * px, UINT64 * py
+             _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
+             _EXC_INFO_PARAM) {
+  UINT64 y = *py;
+#if !DECIMAL_GLOBAL_ROUNDING
+  unsigned int rnd_mode = *prnd_mode;
+#endif
+#else
+UINT128
+bid128qd_add (UINT128 x, UINT64 y
+             _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
+             _EXC_INFO_PARAM) {
+#endif
+  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
+  };
+  UINT128 y1;
+
+#if DECIMAL_CALL_BY_REFERENCE
+  bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
+  bid128_add (&res, px, &y1
+             _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
+             _EXC_INFO_ARG);
+#else
+  y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
+  res = bid128_add (x, y1
+                   _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
+                   _EXC_INFO_ARG);
+#endif
+  BID_RETURN (res);
+}
+
+
+// bid128_add stands for bid128qq_add
+
+
 /*****************************************************************************
- *  BID128 add
+ *  BID64/BID128 sub
  ****************************************************************************/
 
-#include "bid_internal.h"
+#if DECIMAL_CALL_BY_REFERENCE
+void
+bid64dq_sub (UINT64 * pres, UINT64 * px, UINT128 * py
+            _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
+            _EXC_INFO_PARAM) {
+  UINT64 x = *px;
+#if !DECIMAL_GLOBAL_ROUNDING
+  unsigned int rnd_mode = *prnd_mode;
+#endif
+#else
+UINT64
+bid64dq_sub (UINT64 x, UINT128 y
+            _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
+            _EXC_INFO_PARAM) {
+#endif
+  UINT64 res = 0xbaddbaddbaddbaddull;
+  UINT128 x1;
+
+#if DECIMAL_CALL_BY_REFERENCE
+  bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
+  bid64qq_sub (&res, &x1, py
+              _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
+              _EXC_INFO_ARG);
+#else
+  x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
+  res = bid64qq_sub (x1, y
+                    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
+                    _EXC_INFO_ARG);
+#endif
+  BID_RETURN (res);
+}
+
+
+#if DECIMAL_CALL_BY_REFERENCE
+void
+bid64qd_sub (UINT64 * pres, UINT128 * px, UINT64 * py
+            _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
+            _EXC_INFO_PARAM) {
+  UINT64 y = *py;
+#if !DECIMAL_GLOBAL_ROUNDING
+  unsigned int rnd_mode = *prnd_mode;
+#endif
+#else
+UINT64
+bid64qd_sub (UINT128 x, UINT64 y
+            _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
+            _EXC_INFO_PARAM) {
+#endif
+  UINT64 res = 0xbaddbaddbaddbaddull;
+  UINT128 y1;
+
+#if DECIMAL_CALL_BY_REFERENCE
+  bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
+  bid64qq_sub (&res, px, &y1
+              _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
+              _EXC_INFO_ARG);
+#else
+  y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
+  res = bid64qq_sub (x, y1
+                    _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
+                    _EXC_INFO_ARG);
+#endif
+  BID_RETURN (res);
+}
+
 
 #if DECIMAL_CALL_BY_REFERENCE
 void
-__bid128_add (UINT128 * pres, UINT128 * px,
-            UINT128 *
-            py _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
-            _EXC_INFO_PARAM) {
+bid64qq_sub (UINT64 * pres, UINT128 * px, UINT128 * py
+            _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
+            _EXC_INFO_PARAM) {
   UINT128 x = *px, y = *py;
 #if !DECIMAL_GLOBAL_ROUNDING
   unsigned int rnd_mode = *prnd_mode;
 #endif
 #else
+UINT64
+bid64qq_sub (UINT128 x, UINT128 y
+            _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
+            _EXC_INFO_PARAM) {
+#endif
+
+  UINT128 one = { {0x0000000000000001ull, 0x3040000000000000ull}
+  };
+  UINT64 res = 0xbaddbaddbaddbaddull;
+  UINT64 y_sign;
+
+  BID_SWAP128 (one);
+  if ((y.w[HIGH_128W] & MASK_NAN) != MASK_NAN) {       // y is not NAN
+    // change its sign
+    y_sign = y.w[HIGH_128W] & MASK_SIGN;       // 0 for positive, MASK_SIGN for negative
+    if (y_sign)
+      y.w[HIGH_128W] = y.w[HIGH_128W] & 0x7fffffffffffffffull;
+    else
+      y.w[HIGH_128W] = y.w[HIGH_128W] | 0x8000000000000000ull;
+  }
+#if DECIMAL_CALL_BY_REFERENCE
+  bid64qqq_fma (&res, &one, &x, &y
+               _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
+               _EXC_INFO_ARG);
+#else
+  res = bid64qqq_fma (one, x, y
+                     _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
+                     _EXC_INFO_ARG);
+#endif
+  BID_RETURN (res);
+}
+
+
+#if DECIMAL_CALL_BY_REFERENCE
+void
+bid128dd_sub (UINT128 * pres, UINT64 * px, UINT64 * py
+             _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
+             _EXC_INFO_PARAM) {
+  UINT64 x = *px, y = *py;
+#if !DECIMAL_GLOBAL_ROUNDING
+  unsigned int rnd_mode = *prnd_mode;
+#endif
+#else
 UINT128
-__bid128_add (UINT128 x,
-            UINT128 y _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
-            _EXC_INFO_PARAM) {
+bid128dd_sub (UINT64 x, UINT64 y
+             _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
+             _EXC_INFO_PARAM) {
 #endif
+  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
+  };
+  UINT128 x1, y1;
 
-  UINT128 res;
+#if DECIMAL_CALL_BY_REFERENCE
+  bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
+  bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
+  bid128_sub (&res, &x1, &y1
+             _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
+             _EXC_INFO_ARG);
+#else
+  x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
+  y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
+  res = bid128_sub (x1, y1
+                   _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
+                   _EXC_INFO_ARG);
+#endif
+  BID_RETURN (res);
+}
+
+
+#if DECIMAL_CALL_BY_REFERENCE
+void
+bid128dq_sub (UINT128 * pres, UINT64 * px, UINT128 * py
+             _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
+             _EXC_INFO_PARAM) {
+  UINT64 x = *px;
+#if !DECIMAL_GLOBAL_ROUNDING
+  unsigned int rnd_mode = *prnd_mode;
+#endif
+#else
+UINT128
+bid128dq_sub (UINT64 x, UINT128 y
+             _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
+             _EXC_INFO_PARAM) {
+#endif
+  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
+  };
+  UINT128 x1;
+
+#if DECIMAL_CALL_BY_REFERENCE
+  bid64_to_bid128 (&x1, &x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
+  bid128_sub (&res, &x1, py
+             _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
+             _EXC_INFO_ARG);
+#else
+  x1 = bid64_to_bid128 (x _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
+  res = bid128_sub (x1, y
+                   _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
+                   _EXC_INFO_ARG);
+#endif
+  BID_RETURN (res);
+}
+
+
+#if DECIMAL_CALL_BY_REFERENCE
+void
+bid128qd_sub (UINT128 * pres, UINT128 * px, UINT64 * py
+             _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
+             _EXC_INFO_PARAM) {
+  UINT64 y = *py;
+#if !DECIMAL_GLOBAL_ROUNDING
+  unsigned int rnd_mode = *prnd_mode;
+#endif
+#else
+UINT128
+bid128qd_sub (UINT128 x, UINT64 y
+             _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
+             _EXC_INFO_PARAM) {
+#endif
+  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
+  };
+  UINT128 y1;
+
+#if DECIMAL_CALL_BY_REFERENCE
+  bid64_to_bid128 (&y1, &y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
+  bid128_sub (&res, px, &y1
+             _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
+             _EXC_INFO_ARG);
+#else
+  y1 = bid64_to_bid128 (y _EXC_FLAGS_ARG _EXC_MASKS_ARG _EXC_INFO_ARG);
+  res = bid128_sub (x, y1
+                   _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
+                   _EXC_INFO_ARG);
+#endif
+  BID_RETURN (res);
+}
+
+#if DECIMAL_CALL_BY_REFERENCE
+void
+bid128_add (UINT128 * pres, UINT128 * px, UINT128 * py
+           _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
+           _EXC_INFO_PARAM) {
+  UINT128 x = *px, y = *py;
+#if !DECIMAL_GLOBAL_ROUNDING
+  unsigned int rnd_mode = *prnd_mode;
+#endif
+#else
+UINT128
+bid128_add (UINT128 x, UINT128 y
+           _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
+           _EXC_INFO_PARAM) {
+#endif
+  UINT128 res = { {0xbaddbaddbaddbaddull, 0xbaddbaddbaddbaddull}
+  };
   UINT64 x_sign, y_sign, tmp_sign;
-  UINT64 x_exp, y_exp, tmp_exp; // e1 = x_exp, e2 = y_exp
+  UINT64 x_exp, y_exp, tmp_exp;        // e1 = x_exp, e2 = y_exp
   UINT64 C1_hi, C2_hi, tmp_signif_hi;
   UINT64 C1_lo, C2_lo, tmp_signif_lo;
-  // Note: C1.w[1], C1.w[0] represent C1_hi, C1_lo (all are UINT64)
-  // Note: C2.w[1], C2.w[0] represent C2_hi, C2_lo (all are UINT64)
+  // Note: C1.w[1], C1.w[0] represent C1_hi, C1_lo (all UINT64)
+  // Note: C2.w[1], C2.w[0] represent C2_hi, C2_lo (all UINT64)
   UINT64 tmp64, tmp64A, tmp64B;
   BID_UI64DOUBLE tmp1, tmp2;
   int x_nr_bits, y_nr_bits;
@@ -63,86 +482,105 @@ __bid128_add (UINT128 x,
   UINT64 halfulp64;
   UINT128 halfulp128;
   UINT128 C1, C2;
-  UINT128 __bid_ten2m1;
-  UINT128 highf2star; // top 128 bits in f2*; low 128 bits in R256[1], R256[0]
+  UINT128 ten2m1;
+  UINT128 highf2star;          // top 128 bits in f2*; low 128 bits in R256[1], R256[0]
   UINT256 P256, Q256, R256;
   int is_inexact = 0, is_midpoint_lt_even = 0, is_midpoint_gt_even = 0;
   int is_inexact_lt_midpoint = 0, is_inexact_gt_midpoint = 0;
   int second_pass = 0;
 
+  BID_SWAP128 (x);
+  BID_SWAP128 (y);
+  x_sign = x.w[1] & MASK_SIGN; // 0 for positive, MASK_SIGN for negative
+  y_sign = y.w[1] & MASK_SIGN; // 0 for positive, MASK_SIGN for negative
+
   // check for NaN or Infinity
   if (((x.w[1] & MASK_SPECIAL) == MASK_SPECIAL)
       || ((y.w[1] & MASK_SPECIAL) == MASK_SPECIAL)) {
     // x is special or y is special
-
-    if ((x.w[1] & MASK_NAN) == MASK_NAN) { // x is NAN
-      if ((x.w[1] & MASK_SNAN) == MASK_SNAN) { // x is SNAN
-        // set invalid flag
-        *pfpsf |= INVALID_EXCEPTION;
-        // return quiet (x)
-        res.w[1] = x.w[1] & 0xfdffffffffffffffull;
-        res.w[0] = x.w[0];
-      } else { // x is QNaN
-        if ((y.w[1] & MASK_SNAN) == MASK_SNAN) { // y is SNAN
-          // set invalid flag
-          *pfpsf |= INVALID_EXCEPTION;
-        }
-        // return x
-        res.w[1] = x.w[1];
-        res.w[0] = x.w[0];
+    if ((x.w[1] & MASK_NAN) == MASK_NAN) {     // x is NAN
+      // check first for non-canonical NaN payload
+      if (((x.w[1] & 0x00003fffffffffffull) > 0x0000314dc6448d93ull) ||
+         (((x.w[1] & 0x00003fffffffffffull) == 0x0000314dc6448d93ull)
+          && (x.w[0] > 0x38c15b09ffffffffull))) {
+       x.w[1] = x.w[1] & 0xffffc00000000000ull;
+       x.w[0] = 0x0ull;
       }
+      if ((x.w[1] & MASK_SNAN) == MASK_SNAN) { // x is SNAN
+       // set invalid flag
+       *pfpsf |= INVALID_EXCEPTION;
+       // return quiet (x)
+       res.w[1] = x.w[1] & 0xfc003fffffffffffull;
+       // clear out also G[6]-G[16]
+       res.w[0] = x.w[0];
+      } else { // x is QNaN
+       // return x
+       res.w[1] = x.w[1] & 0xfc003fffffffffffull;
+       // clear out G[6]-G[16]
+       res.w[0] = x.w[0];
+       // if y = SNaN signal invalid exception
+       if ((y.w[1] & MASK_SNAN) == MASK_SNAN) {
+         // set invalid flag
+         *pfpsf |= INVALID_EXCEPTION;
+       }
+      }
+      BID_SWAP128 (res);
       BID_RETURN (res);
-    } else if ((y.w[1] & MASK_NAN) == MASK_NAN) { // y is NAN
-      if ((y.w[1] & MASK_SNAN) == MASK_SNAN) { // y is SNAN
-        // set invalid flag
-        *pfpsf |= INVALID_EXCEPTION;
-        // return quiet (y)
-        res.w[1] = y.w[1] & 0xfdffffffffffffffull;
-        res.w[0] = y.w[0];
-      } else { // y is QNaN
-        // return y
-        res.w[1] = y.w[1];
-        res.w[0] = y.w[0];
+    } else if ((y.w[1] & MASK_NAN) == MASK_NAN) {      // y is NAN
+      // check first for non-canonical NaN payload
+      if (((y.w[1] & 0x00003fffffffffffull) > 0x0000314dc6448d93ull) ||
+         (((y.w[1] & 0x00003fffffffffffull) == 0x0000314dc6448d93ull)
+          && (y.w[0] > 0x38c15b09ffffffffull))) {
+       y.w[1] = y.w[1] & 0xffffc00000000000ull;
+       y.w[0] = 0x0ull;
+      }
+      if ((y.w[1] & MASK_SNAN) == MASK_SNAN) { // y is SNAN
+       // set invalid flag
+       *pfpsf |= INVALID_EXCEPTION;
+       // return quiet (y)
+       res.w[1] = y.w[1] & 0xfc003fffffffffffull;
+       // clear out also G[6]-G[16]
+       res.w[0] = y.w[0];
+      } else { // y is QNaN
+       // return y
+       res.w[1] = y.w[1] & 0xfc003fffffffffffull;
+       // clear out G[6]-G[16]
+       res.w[0] = y.w[0];
       }
+      BID_SWAP128 (res);
       BID_RETURN (res);
-    } else { // neither x not y is NaN; at least one is infinity
-      if ((x.w[1] & MASK_ANY_INF) == MASK_INF) { // x is infinity
-        if ((y.w[1] & MASK_ANY_INF) == MASK_INF) { // y is infinity
-          // if same sign, return either of them
-          if ((x.w[1] & MASK_SIGN) == (y.w[1] & MASK_SIGN)) {
-            res.w[1] = x.w[1];
-            res.w[0] = x.w[0];
-          } else { // x and y are infinities of opposite signs
-            // set invalid flag
-            *pfpsf |= INVALID_EXCEPTION;
-            // return QNaN Indefinite
-            res.w[1] = 0x7c00000000000000ull;
-            res.w[0] = 0x0000000000000000ull;
-          }
-        } else { // y is 0 or finite
-          // return x
-          res.w[1] = x.w[1];
-          res.w[0] = x.w[0];
-        }
-      } else { // x is not NaN or infinity, so y must be infinity
-        res.w[1] = y.w[1];
-        res.w[0] = y.w[0];
+    } else {   // neither x not y is NaN; at least one is infinity
+      if ((x.w[1] & MASK_ANY_INF) == MASK_INF) {       // x is infinity
+       if ((y.w[1] & MASK_ANY_INF) == MASK_INF) {      // y is infinity
+         // if same sign, return either of them
+         if ((x.w[1] & MASK_SIGN) == (y.w[1] & MASK_SIGN)) {
+           res.w[1] = x_sign | MASK_INF;
+           res.w[0] = 0x0ull;
+         } else {      // x and y are infinities of opposite signs
+           // set invalid flag
+           *pfpsf |= INVALID_EXCEPTION;
+           // return QNaN Indefinite
+           res.w[1] = 0x7c00000000000000ull;
+           res.w[0] = 0x0000000000000000ull;
+         }
+       } else {        // y is 0 or finite
+         // return x
+         res.w[1] = x_sign | MASK_INF;
+         res.w[0] = 0x0ull;
+       }
+      } else { // x is not NaN or infinity, so y must be infinity
+       res.w[1] = y_sign | MASK_INF;
+       res.w[0] = 0x0ull;
       }
+      BID_SWAP128 (res);
       BID_RETURN (res);
     }
   }
   // unpack the arguments
+
   // unpack x 
-  x_sign = x.w[1] & MASK_SIGN; // 0 for positive, MASK_SIGN for negative
-  x_exp = x.w[1] & MASK_EXP; // biased and shifted left 49 bit positions
   C1_hi = x.w[1] & MASK_COEFF;
   C1_lo = x.w[0];
-  // unpack y 
-  y_sign = y.w[1] & MASK_SIGN; // 0 for positive, MASK_SIGN for negative
-  y_exp = y.w[1] & MASK_EXP; // biased and shifted left 49 bit positions
-  C2_hi = y.w[1] & MASK_COEFF;
-  C2_lo = y.w[0];
-
   // test for non-canonical values:
   // - values whose encoding begins with x00, x01, or x10 and whose 
   //   coefficient is larger than 10^34 -1, or
@@ -150,35 +588,45 @@ __bid128_add (UINT128 x,
   //   and infinitis were eliminated already this test is reduced to 
   //   checking for x10x) 
 
-  // test for non-canonical values of the argument x
-  if ((((C1_hi > 0x0001ed09bead87c0ull)
-       || ((C1_hi == 0x0001ed09bead87c0ull)
-           && (C1_lo > 0x378d8e63ffffffffull)))
-      && ((x.w[1] & 0x6000000000000000ull) != 0x6000000000000000ull))
-      || ((x.w[1] & 0x6000000000000000ull) == 0x6000000000000000ull)) {
-    // check for the case where the exponent is shifted right by 2 bits!
-    if ((x.w[1] & 0x6000000000000000ull) == 0x6000000000000000ull) {
-      x_exp = (x.w[1] << 2) & MASK_EXP; // same position as for G[0]G[1] != 11
+  // x is not infinity; check for non-canonical values - treated as zero
+  if ((x.w[1] & 0x6000000000000000ull) == 0x6000000000000000ull) {
+    // G0_G1=11; non-canonical
+    x_exp = (x.w[1] << 2) & MASK_EXP;  // biased and shifted left 49 bits
+    C1_hi = 0; // significand high
+    C1_lo = 0; // significand low
+  } else {     // G0_G1 != 11
+    x_exp = x.w[1] & MASK_EXP; // biased and shifted left 49 bits
+    if (C1_hi > 0x0001ed09bead87c0ull ||
+       (C1_hi == 0x0001ed09bead87c0ull
+        && C1_lo > 0x378d8e63ffffffffull)) {
+      // x is non-canonical if coefficient is larger than 10^34 -1
+      C1_hi = 0;
+      C1_lo = 0;
+    } else {   // canonical
+      ;
     }
-    x.w[1] = x.w[1] & 0x8000000000000000ull; // preserve the sign bit
-    x.w[0] = 0;
-    C1_hi = 0;
-    C1_lo = 0;
   }
-  // test for non-canonical values of the argument y
-  if ((((C2_hi > 0x0001ed09bead87c0ull)
-       || ((C2_hi == 0x0001ed09bead87c0ull)
-           && (C2_lo > 0x378d8e63ffffffffull)))
-      && ((y.w[1] & 0x6000000000000000ull) != 0x6000000000000000ull))
-      || ((y.w[1] & 0x6000000000000000ull) == 0x6000000000000000ull)) {
-    // check for the case where the exponent is shifted right by 2 bits!
-    if ((y.w[1] & 0x6000000000000000ull) == 0x6000000000000000ull) {
-      y_exp = (y.w[1] << 2) & MASK_EXP; // same position as for G[0]G[1] != 11
+
+  // unpack y  
+  C2_hi = y.w[1] & MASK_COEFF;
+  C2_lo = y.w[0];
+  // y is not infinity; check for non-canonical values - treated as zero 
+  if ((y.w[1] & 0x6000000000000000ull) == 0x6000000000000000ull) {
+    // G0_G1=11; non-canonical 
+    y_exp = (y.w[1] << 2) & MASK_EXP;  // biased and shifted left 49 bits
+    C2_hi = 0; // significand high
+    C2_lo = 0; // significand low 
+  } else {     // G0_G1 != 11 
+    y_exp = y.w[1] & MASK_EXP; // biased and shifted left 49 bits
+    if (C2_hi > 0x0001ed09bead87c0ull ||
+       (C2_hi == 0x0001ed09bead87c0ull
+        && C2_lo > 0x378d8e63ffffffffull)) {
+      // y is non-canonical if coefficient is larger than 10^34 -1 
+      C2_hi = 0;
+      C2_lo = 0;
+    } else {   // canonical
+      ;
     }
-    y.w[1] = y.w[1] & 0x8000000000000000ull; // preserve the sign bit
-    y.w[0] = 0;
-    C2_hi = 0;
-    C2_lo = 0;
   }
 
   if ((C1_hi == 0x0ull) && (C1_lo == 0x0ull)) {
@@ -186,90 +634,87 @@ __bid128_add (UINT128 x,
     // if y is 0 return 0 with the smaller exponent
     if ((C2_hi == 0x0ull) && (C2_lo == 0x0ull)) {
       if (x_exp < y_exp)
-        res.w[1] = x_exp;
+       res.w[1] = x_exp;
       else
-        res.w[1] = y_exp;
+       res.w[1] = y_exp;
       if (x_sign && y_sign)
-        res.w[1] = res.w[1] | x_sign; // both negative
+       res.w[1] = res.w[1] | x_sign;   // both negative
       else if (rnd_mode == ROUNDING_DOWN && x_sign != y_sign)
-        res.w[1] = res.w[1] | 0x8000000000000000ull; // -0
+       res.w[1] = res.w[1] | 0x8000000000000000ull;    // -0
       // else; // res = +0
       res.w[0] = 0;
     } else {
       // for 0 + y return y, with the preferred exponent
       if (y_exp <= x_exp) {
-        res.w[1] = y.w[1];
-        res.w[0] = y.w[0];
-      } else { // if y_exp > x_exp
-        // return (C2 * 10^scale) * 10^(y_exp - scale)
-        // where scale = min (P34-q2, y_exp-x_exp)
-        // determine q2 = nr. of decimal digits in y
-        //  determine first the nr. of bits in y (y_nr_bits)
-
-        if (C2_hi == 0) { // y_bits is the nr. of bits in C2_lo
-          if (C2_lo >= 0x0020000000000000ull) { // y >= 2^53
-            // split the 64-bit value in two 32-bit halves to avoid 
-            // rounding errors
-            if (C2_lo >= 0x0000000100000000ull) { // y >= 2^32
-              tmp2.d = (double) (C2_lo >> 32); // exact conversion
-              ///tmp2_i64 = *(UINT64 *) & tmp2_d;
-              y_nr_bits =
-                32 +
-                ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
-            } else { // y < 2^32
-              tmp2.d = (double) (C2_lo); // exact conversion
-              ///tmp2_i64 = *(UINT64 *) & tmp2_d;
-              y_nr_bits =
-                ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
-            }
-          } else { // if y < 2^53
-            tmp2.d = (double) C2_lo; // exact conversion
-            ///tmp2_i64 = *(UINT64 *) & tmp2_d;
-            y_nr_bits =
-              ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
-          }
-        } else { // C2_hi != 0 => nr. bits = 64 + nr_bits (C2_hi)
-          tmp2.d = (double) C2_hi; // exact conversion
-          ///tmp2_i64 = *(UINT64 *) & tmp2_d;
-          y_nr_bits =
-            64 + ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
-        }
-        q2 = __bid_nr_digits[y_nr_bits].digits;
-        if (q2 == 0) {
-          q2 = __bid_nr_digits[y_nr_bits].digits1;
-          if (C2_hi > __bid_nr_digits[y_nr_bits].threshold_hi
-              || (C2_hi == __bid_nr_digits[y_nr_bits].threshold_hi
-              && C2_lo >= __bid_nr_digits[y_nr_bits].threshold_lo))
-            q2++;
-        }
-        // return (C2 * 10^scale) * 10^(y_exp - scale)
-        // where scale = min (P34-q2, y_exp-x_exp)
-        scale = P34 - q2;
-        ind = (y_exp - x_exp) >> 49;
-        if (ind < scale)
-          scale = ind;
-        if (scale == 0) {
-          res.w[1] = y.w[1];
-          res.w[0] = y.w[0];
-        } else if (q2 <= 19) { // y fits in 64 bits 
-          if (scale <= 19) { // 10^scale fits in 64 bits
-            // 64 x 64 C2_lo * __bid_ten2k64[scale]
-            __mul_64x64_to_128MACH (res, C2_lo, __bid_ten2k64[scale]);
-          } else { // 10^scale fits in 128 bits
-            // 64 x 128 C2_lo * __bid_ten2k128[scale - 20]
-            __mul_128x64_to_128 (res, C2_lo, __bid_ten2k128[scale - 20]);
-          }
-        } else { // y fits in 128 bits, but 10^scale must fit in 64 bits 
-          // 64 x 128 __bid_ten2k64[scale] * C2
-          C2.w[1] = C2_hi;
-          C2.w[0] = C2_lo;
-          __mul_128x64_to_128 (res, __bid_ten2k64[scale], C2);
-        }
-        // subtract scale from the exponent
-        y_exp = y_exp - ((UINT64) scale << 49);
-        res.w[1] = res.w[1] | y_sign | y_exp;
+       res.w[1] = y.w[1];
+       res.w[0] = y.w[0];
+      } else { // if y_exp > x_exp
+       // return (C2 * 10^scale) * 10^(y_exp - scale)
+       // where scale = min (P34-q2, y_exp-x_exp)
+       // determine q2 = nr. of decimal digits in y
+       //  determine first the nr. of bits in y (y_nr_bits)
+
+       if (C2_hi == 0) {       // y_bits is the nr. of bits in C2_lo
+         if (C2_lo >= 0x0020000000000000ull) { // y >= 2^53
+           // split the 64-bit value in two 32-bit halves to avoid 
+           // rounding errors
+           if (C2_lo >= 0x0000000100000000ull) {       // y >= 2^32
+             tmp2.d = (double) (C2_lo >> 32);  // exact conversion
+             y_nr_bits =
+               32 +
+               ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
+           } else {    // y < 2^32
+             tmp2.d = (double) (C2_lo);        // exact conversion
+             y_nr_bits =
+               ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
+           }
+         } else {      // if y < 2^53
+           tmp2.d = (double) C2_lo;    // exact conversion
+           y_nr_bits =
+             ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
+         }
+       } else {        // C2_hi != 0 => nr. bits = 64 + nr_bits (C2_hi)
+         tmp2.d = (double) C2_hi;      // exact conversion
+         y_nr_bits =
+           64 + ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
+       }
+       q2 = nr_digits[y_nr_bits].digits;
+       if (q2 == 0) {
+         q2 = nr_digits[y_nr_bits].digits1;
+         if (C2_hi > nr_digits[y_nr_bits].threshold_hi ||
+             (C2_hi == nr_digits[y_nr_bits].threshold_hi &&
+              C2_lo >= nr_digits[y_nr_bits].threshold_lo))
+           q2++;
+       }
+       // return (C2 * 10^scale) * 10^(y_exp - scale)
+       // where scale = min (P34-q2, y_exp-x_exp)
+       scale = P34 - q2;
+       ind = (y_exp - x_exp) >> 49;
+       if (ind < scale)
+         scale = ind;
+       if (scale == 0) {
+         res.w[1] = y.w[1];
+         res.w[0] = y.w[0];
+       } else if (q2 <= 19) {  // y fits in 64 bits 
+         if (scale <= 19) {    // 10^scale fits in 64 bits
+           // 64 x 64 C2_lo * ten2k64[scale]
+           __mul_64x64_to_128MACH (res, C2_lo, ten2k64[scale]);
+         } else {      // 10^scale fits in 128 bits
+           // 64 x 128 C2_lo * ten2k128[scale - 20]
+           __mul_128x64_to_128 (res, C2_lo, ten2k128[scale - 20]);
+         }
+       } else {        // y fits in 128 bits, but 10^scale must fit in 64 bits 
+         // 64 x 128 ten2k64[scale] * C2
+         C2.w[1] = C2_hi;
+         C2.w[0] = C2_lo;
+         __mul_128x64_to_128 (res, ten2k64[scale], C2);
+       }
+       // subtract scale from the exponent
+       y_exp = y_exp - ((UINT64) scale << 49);
+       res.w[1] = res.w[1] | y_sign | y_exp;
       }
     }
+    BID_SWAP128 (res);
     BID_RETURN (res);
   } else if ((C2_hi == 0x0ull) && (C2_lo == 0x0ull)) {
     // y is 0 and x is not special, and not zero
@@ -277,75 +722,76 @@ __bid128_add (UINT128 x,
     if (x_exp <= y_exp) {
       res.w[1] = x.w[1];
       res.w[0] = x.w[0];
-    } else { // if x_exp > y_exp
+    } else {   // if x_exp > y_exp
       // return (C1 * 10^scale) * 10^(x_exp - scale)
       // where scale = min (P34-q1, x_exp-y_exp)
       // determine q1 = nr. of decimal digits in x
       //  determine first the nr. of bits in x
-      if (C1_hi == 0) { // x_bits is the nr. of bits in C1_lo
-        if (C1_lo >= 0x0020000000000000ull) { // x >= 2^53
-          // split the 64-bit value in two 32-bit halves to avoid 
-          // rounding errors
-          if (C1_lo >= 0x0000000100000000ull) { // x >= 2^32
-            tmp1.d = (double) (C1_lo >> 32); // exact conversion
-            x_nr_bits =
-              32 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) -
-                    0x3ff);
-          } else { // x < 2^32
-            tmp1.d = (double) (C1_lo); // exact conversion
-            x_nr_bits =
-              ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
-          }
-        } else { // if x < 2^53
-          tmp1.d = (double) C1_lo; // exact conversion
-          x_nr_bits =
-            ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
-        }
-      } else { // C1_hi != 0 => nr. bits = 64 + nr_bits (C1_hi)
-        tmp1.d = (double) C1_hi; // exact conversion
-        x_nr_bits =
-          64 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
+      if (C1_hi == 0) {        // x_bits is the nr. of bits in C1_lo
+       if (C1_lo >= 0x0020000000000000ull) {   // x >= 2^53
+         // split the 64-bit value in two 32-bit halves to avoid 
+         // rounding errors
+         if (C1_lo >= 0x0000000100000000ull) { // x >= 2^32
+           tmp1.d = (double) (C1_lo >> 32);    // exact conversion
+           x_nr_bits =
+             32 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) -
+                   0x3ff);
+         } else {      // x < 2^32
+           tmp1.d = (double) (C1_lo);  // exact conversion
+           x_nr_bits =
+             ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
+         }
+       } else {        // if x < 2^53
+         tmp1.d = (double) C1_lo;      // exact conversion
+         x_nr_bits =
+           ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
+       }
+      } else { // C1_hi != 0 => nr. bits = 64 + nr_bits (C1_hi)
+       tmp1.d = (double) C1_hi;        // exact conversion
+       x_nr_bits =
+         64 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
       }
-      q1 = __bid_nr_digits[x_nr_bits].digits;
+      q1 = nr_digits[x_nr_bits].digits;
       if (q1 == 0) {
-        q1 = __bid_nr_digits[x_nr_bits].digits1;
-        if (C1_hi > __bid_nr_digits[x_nr_bits].threshold_hi
-            || (C1_hi == __bid_nr_digits[x_nr_bits].threshold_hi
-            && C1_lo >= __bid_nr_digits[x_nr_bits].threshold_lo))
-          q1++;
+       q1 = nr_digits[x_nr_bits].digits1;
+       if (C1_hi > nr_digits[x_nr_bits].threshold_hi ||
+           (C1_hi == nr_digits[x_nr_bits].threshold_hi &&
+            C1_lo >= nr_digits[x_nr_bits].threshold_lo))
+         q1++;
       }
       // return (C1 * 10^scale) * 10^(x_exp - scale)
       // where scale = min (P34-q1, x_exp-y_exp)  
       scale = P34 - q1;
       ind = (x_exp - y_exp) >> 49;
       if (ind < scale)
-        scale = ind;
+       scale = ind;
       if (scale == 0) {
-        res.w[1] = x.w[1];
-        res.w[0] = x.w[0];
-      } else if (q1 <= 19) { // x fits in 64 bits  
-        if (scale <= 19) { // 10^scale fits in 64 bits
-          // 64 x 64 C1_lo * __bid_ten2k64[scale] 
-          __mul_64x64_to_128MACH (res, C1_lo, __bid_ten2k64[scale]);
-        } else { // 10^scale fits in 128 bits
-          // 64 x 128 C1_lo * __bid_ten2k128[scale - 20]
-          __mul_128x64_to_128 (res, C1_lo, __bid_ten2k128[scale - 20]);
-        }
-      } else { // x fits in 128 bits, but 10^scale must fit in 64 bits
-        // 64 x 128 __bid_ten2k64[scale] * C1
-        C1.w[1] = C1_hi;
-        C1.w[0] = C1_lo;
-        __mul_128x64_to_128 (res, __bid_ten2k64[scale], C1);
+       res.w[1] = x.w[1];
+       res.w[0] = x.w[0];
+      } else if (q1 <= 19) {   // x fits in 64 bits  
+       if (scale <= 19) {      // 10^scale fits in 64 bits
+         // 64 x 64 C1_lo * ten2k64[scale] 
+         __mul_64x64_to_128MACH (res, C1_lo, ten2k64[scale]);
+       } else {        // 10^scale fits in 128 bits
+         // 64 x 128 C1_lo * ten2k128[scale - 20]
+         __mul_128x64_to_128 (res, C1_lo, ten2k128[scale - 20]);
+       }
+      } else { // x fits in 128 bits, but 10^scale must fit in 64 bits
+       // 64 x 128 ten2k64[scale] * C1
+       C1.w[1] = C1_hi;
+       C1.w[0] = C1_lo;
+       __mul_128x64_to_128 (res, ten2k64[scale], C1);
       }
       // subtract scale from the exponent
       x_exp = x_exp - ((UINT64) scale << 49);
       res.w[1] = res.w[1] | x_sign | x_exp;
     }
+    BID_SWAP128 (res);
     BID_RETURN (res);
-  } else { // x and y are not canonical, not special, and are not zero
+  } else {     // x and y are not canonical, not special, and are not zero
     // note that the result may still be zero, and then it has to have the
     // preferred exponent
-    if (x_exp < y_exp) { // if exp_x < exp_y then swap x and y 
+    if (x_exp < y_exp) {       // if exp_x < exp_y then swap x and y 
       tmp_sign = x_sign;
       tmp_exp = x_exp;
       tmp_signif_hi = C1_hi;
@@ -361,73 +807,69 @@ __bid128_add (UINT128 x,
     }
     // q1 = nr. of decimal digits in x
     //  determine first the nr. of bits in x
-    if (C1_hi == 0) { // x_bits is the nr. of bits in C1_lo
-      if (C1_lo >= 0x0020000000000000ull) { // x >= 2^53
-        //split the 64-bit value in two 32-bit halves to avoid rounding errors
-        if (C1_lo >= 0x0000000100000000ull) { // x >= 2^32
-          tmp1.d = (double) (C1_lo >> 32); // exact conversion
-          x_nr_bits =
-            32 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
-        } else { // x < 2^32
-          tmp1.d = (double) (C1_lo); // exact conversion
-          x_nr_bits =
-            ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
-        }
-      } else { // if x < 2^53
-        tmp1.d = (double) C1_lo; // exact conversion
-        x_nr_bits =
-          ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
+    if (C1_hi == 0) {  // x_bits is the nr. of bits in C1_lo
+      if (C1_lo >= 0x0020000000000000ull) {    // x >= 2^53
+       //split the 64-bit value in two 32-bit halves to avoid rounding errors
+       if (C1_lo >= 0x0000000100000000ull) {   // x >= 2^32
+         tmp1.d = (double) (C1_lo >> 32);      // exact conversion
+         x_nr_bits =
+           32 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
+       } else {        // x < 2^32
+         tmp1.d = (double) (C1_lo);    // exact conversion
+         x_nr_bits =
+           ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
+       }
+      } else { // if x < 2^53
+       tmp1.d = (double) C1_lo;        // exact conversion
+       x_nr_bits =
+         ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
       }
-    } else { // C1_hi != 0 => nr. bits = 64 + nr_bits (C1_hi)
-      tmp1.d = (double) C1_hi; // exact conversion
+    } else {   // C1_hi != 0 => nr. bits = 64 + nr_bits (C1_hi)
+      tmp1.d = (double) C1_hi; // exact conversion
       x_nr_bits =
-        64 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
+       64 + ((((unsigned int) (tmp1.ui64 >> 52)) & 0x7ff) - 0x3ff);
     }
 
-    q1 = __bid_nr_digits[x_nr_bits].digits;
+    q1 = nr_digits[x_nr_bits].digits;
     if (q1 == 0) {
-      q1 = __bid_nr_digits[x_nr_bits].digits1;
-      if (C1_hi > __bid_nr_digits[x_nr_bits].threshold_hi
-          || (C1_hi == __bid_nr_digits[x_nr_bits].threshold_hi
-          && C1_lo >= __bid_nr_digits[x_nr_bits].threshold_lo))
-        q1++;
+      q1 = nr_digits[x_nr_bits].digits1;
+      if (C1_hi > nr_digits[x_nr_bits].threshold_hi ||
+         (C1_hi == nr_digits[x_nr_bits].threshold_hi &&
+          C1_lo >= nr_digits[x_nr_bits].threshold_lo))
+       q1++;
     }
     // q2 = nr. of decimal digits in y
     //  determine first the nr. of bits in y (y_nr_bits)
-    if (C2_hi == 0) { // y_bits is the nr. of bits in C2_lo
-      if (C2_lo >= 0x0020000000000000ull) { // y >= 2^53
-        //split the 64-bit value in two 32-bit halves to avoid rounding errors
-        if (C2_lo >= 0x0000000100000000ull) { // y >= 2^32
-          tmp2.d = (double) (C2_lo >> 32); // exact conversion
-          ///tmp2_i64 = *(UINT64 *) & tmp2_d;
-          y_nr_bits =
-            32 + ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
-        } else { // y < 2^32
-          tmp2.d = (double) (C2_lo); // exact conversion
-          ///tmp2_i64 = *(UINT64 *) & tmp2_d;
-          y_nr_bits =
-            ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
-        }
-      } else { // if y < 2^53
-        tmp2.d = (double) C2_lo; // exact conversion
-        ///tmp2_i64 = *(UINT64 *) & tmp2_d;
-        y_nr_bits =
-          ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
+    if (C2_hi == 0) {  // y_bits is the nr. of bits in C2_lo
+      if (C2_lo >= 0x0020000000000000ull) {    // y >= 2^53
+       //split the 64-bit value in two 32-bit halves to avoid rounding errors
+       if (C2_lo >= 0x0000000100000000ull) {   // y >= 2^32
+         tmp2.d = (double) (C2_lo >> 32);      // exact conversion
+         y_nr_bits =
+           32 + ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
+       } else {        // y < 2^32
+         tmp2.d = (double) (C2_lo);    // exact conversion
+         y_nr_bits =
+           ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
+       }
+      } else { // if y < 2^53
+       tmp2.d = (double) C2_lo;        // exact conversion
+       y_nr_bits =
+         ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
       }
-    } else { // C2_hi != 0 => nr. bits = 64 + nr_bits (C2_hi)
-      tmp2.d = (double) C2_hi; // exact conversion
-      ///tmp2_i64 = *(UINT64 *) & tmp2_d;
+    } else {   // C2_hi != 0 => nr. bits = 64 + nr_bits (C2_hi)
+      tmp2.d = (double) C2_hi; // exact conversion
       y_nr_bits =
-        64 + ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
+       64 + ((((unsigned int) (tmp2.ui64 >> 52)) & 0x7ff) - 0x3ff);
     }
 
-    q2 = __bid_nr_digits[y_nr_bits].digits;
+    q2 = nr_digits[y_nr_bits].digits;
     if (q2 == 0) {
-      q2 = __bid_nr_digits[y_nr_bits].digits1;
-      if (C2_hi > __bid_nr_digits[y_nr_bits].threshold_hi
-          || (C2_hi == __bid_nr_digits[y_nr_bits].threshold_hi
-          && C2_lo >= __bid_nr_digits[y_nr_bits].threshold_lo))
-        q2++;
+      q2 = nr_digits[y_nr_bits].digits1;
+      if (C2_hi > nr_digits[y_nr_bits].threshold_hi ||
+         (C2_hi == nr_digits[y_nr_bits].threshold_hi &&
+          C2_lo >= nr_digits[y_nr_bits].threshold_lo))
+       q2++;
     }
 
     delta = q1 + (int) (x_exp >> 49) - q2 - (int) (y_exp >> 49);
@@ -438,2003 +880,2067 @@ __bid128_add (UINT128 x,
       // the result is inexact; the preferred exponent is the least possible
 
       if (delta >= P34 + 1) {
-        // for RN the result is the operand with the larger magnitude,
-        // possibly scaled up by 10^(P34-q1)
-        // an overflow cannot occur in this case (rounding to nearest)
-        if (q1 < P34) { // scale C1 up by 10^(P34-q1)
-          // Note: because delta >= P34+1 it is certain that 
-          //     x_exp - ((UINT64)scale << 49) will stay above e_min
-          scale = P34 - q1;
-          if (q1 <= 19) { // C1 fits in 64 bits
-            // 1 <= q1 <= 19 => 15 <= scale <= 33
-            if (scale <= 19) { // 10^scale fits in 64 bits
-              __mul_64x64_to_128MACH (C1, __bid_ten2k64[scale], C1_lo);
-            } else { // if 20 <= scale <= 33
-              // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
-              // (C1 * 10^(scale-19)) fits in 64 bits
-              C1_lo = C1_lo * __bid_ten2k64[scale - 19];
-              __mul_64x64_to_128MACH (C1, __bid_ten2k64[19], C1_lo);
-            }
-          } else { //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
-            // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
-            C1.w[1] = C1_hi;
-            C1.w[0] = C1_lo;
-            // C1 = __bid_ten2k64[P34 - q1] * C1
-            __mul_128x64_to_128 (C1, __bid_ten2k64[P34 - q1], C1);
-          }
-          x_exp = x_exp - ((UINT64) scale << 49);
-          C1_hi = C1.w[1];
-          C1_lo = C1.w[0];
-        }
-        // some special cases arise: if delta = P34 + 1 and C1 = 10^(P34-1) 
-        // (after scaling) and x_sign != y_sign and C2 > 5*10^(q2-1) => 
-        // subtract 1 ulp
-        // Note: do this only for rounding to nearest; for other rounding 
-        // modes the correction will be applied next
-        if ((rnd_mode == ROUNDING_TO_NEAREST
-             || rnd_mode == ROUNDING_TIES_AWAY) && delta == (P34 + 1)
-            && C1_hi == 0x0000314dc6448d93ull
-            && C1_lo == 0x38c15b0a00000000ull && x_sign != y_sign
-            && ((q2 <= 19 && C2_lo > __bid_midpoint64[q2 - 1]) || (q2 >= 20
-                && (C2_hi > __bid_midpoint128[q2 - 20].w[1]
-                    || (C2_hi == __bid_midpoint128[q2 - 20].w[1]
-                    && C2_lo > __bid_midpoint128[q2 - 20].w[0]))))) {
-          // C1 = 10^34 - 1 and decrement x_exp by 1 (no underflow possible)
-          C1_hi = 0x0001ed09bead87c0ull;
-          C1_lo = 0x378d8e63ffffffffull;
-          x_exp = x_exp - EXP_P1;
-        }
-        if (rnd_mode != ROUNDING_TO_NEAREST) {
-          if ((rnd_mode == ROUNDING_DOWN && x_sign && y_sign)
-              || (rnd_mode == ROUNDING_UP && !x_sign && !y_sign)) {
-            // add 1 ulp and then check for overflow
-            C1_lo = C1_lo + 1;
-            if (C1_lo == 0) { // rounding overflow in the low 64 bits
-              C1_hi = C1_hi + 1;
-            }
-            if (C1_hi == 0x0001ed09bead87c0ull
-                && C1_lo == 0x378d8e6400000000ull) {
-              // C1 = 10^34 => rounding overflow
-              C1_hi = 0x0000314dc6448d93ull;
-              C1_lo = 0x38c15b0a00000000ull; // 10^33
-              x_exp = x_exp + EXP_P1;
-              if (x_exp == EXP_MAX_P1) { // overflow
-                C1_hi = 0x7800000000000000ull; // +inf
-                C1_lo = 0x0ull;
-                x_exp = 0; // x_sign is preserved
-                // set overflow flag (the inexact flag was set too)
-                *pfpsf |= OVERFLOW_EXCEPTION;
-              }
-            }
-          } else if ((rnd_mode == ROUNDING_DOWN && !x_sign && y_sign)
-                     || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
-                     || (rnd_mode == ROUNDING_TO_ZERO && x_sign != y_sign)) {
-            // subtract 1 ulp from C1
-            // Note: because delta >= P34 + 1 the result cannot be zero
-            C1_lo = C1_lo - 1;
-            if (C1_lo == 0xffffffffffffffffull)
-              C1_hi = C1_hi - 1;
-            // if the coefficient is 10^33 - 1 then make it 10^34 - 1 and 
-            // decrease the exponent by 1 (because delta >= P34 + 1 the
-            // exponent will not become less than e_min)
-            // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
-            // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
-            if (C1_hi == 0x0000314dc6448d93ull
-                && C1_lo == 0x38c15b09ffffffffull) {
-              // make C1 = 10^34  - 1
-              C1_hi = 0x0001ed09bead87c0ull;
-              C1_lo = 0x378d8e63ffffffffull;
-              x_exp = x_exp - EXP_P1;
-            }
-          } else {
-            ; // the result is already correct
-          }
-        }
-        // set the inexact flag
-        *pfpsf |= INEXACT_EXCEPTION;
-        // assemble the result
-        res.w[1] = x_sign | x_exp | C1_hi;
-        res.w[0] = C1_lo;
-      } else { // delta = P34 
-        // in most cases, the smaller operand may be < or = or > 1/2 ulp of the
-        // larger operand
-        // however, the case C1 = 10^(q1-1) and x_sign != y_sign is special due
-        // to accuracy loss after subtraction, and will be treated separately
-        if (x_sign == y_sign || (q1 <= 20
-            && (C1_hi != 0 || C1_lo != __bid_ten2k64[q1 - 1])) || (q1 >= 21
-            && (C1_hi != __bid_ten2k128[q1 - 21].w[1]
-                || C1_lo != __bid_ten2k128[q1 - 21].w[0]))) {
-          // if x_sign == y_sign or C1 != 10^(q1-1)
-          // compare C2 with 1/2 ulp = 5 * 10^(q2-1), the latter read from table
-          // Note: cases q1<=19 and q1>=20 can be coalesced at some latency cost
-          if (q2 <= 19) { // C2 and 5*10^(q2-1) both fit in 64 bits
-            halfulp64 = __bid_midpoint64[q2 - 1]; // 5 * 10^(q2-1)
-            if (C2_lo < halfulp64) { // n2 < 1/2 ulp (n1)
-              // for RN the result is the operand with the larger magnitude, 
-              // possibly scaled up by 10^(P34-q1)
-              // an overflow cannot occur in this case (rounding to nearest)
-              if (q1 < P34) { // scale C1 up by 10^(P34-q1)
-                // Note: because delta = P34 it is certain that
-                //     x_exp - ((UINT64)scale << 49) will stay above e_min
-                scale = P34 - q1;
-                if (q1 <= 19) { // C1 fits in 64 bits
-                  // 1 <= q1 <= 19 => 15 <= scale <= 33
-                  if (scale <= 19) { // 10^scale fits in 64 bits
-                    __mul_64x64_to_128MACH (C1, __bid_ten2k64[scale], C1_lo);
-                  } else { // if 20 <= scale <= 33
-                    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
-                    // (C1 * 10^(scale-19)) fits in 64 bits
-                    C1_lo = C1_lo * __bid_ten2k64[scale - 19];
-                    __mul_64x64_to_128MACH (C1, __bid_ten2k64[19], C1_lo);
-                  }
-                } else { //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
-                  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
-                  C1.w[1] = C1_hi;
-                  C1.w[0] = C1_lo;
-                  // C1 = __bid_ten2k64[P34 - q1] * C1
-                  __mul_128x64_to_128 (C1, __bid_ten2k64[P34 - q1], C1);
-                }
-                x_exp = x_exp - ((UINT64) scale << 49);
-                C1_hi = C1.w[1];
-                C1_lo = C1.w[0];
-              }
-              if (rnd_mode != ROUNDING_TO_NEAREST) {
-                if ((rnd_mode == ROUNDING_DOWN && x_sign
-                    && y_sign) || (rnd_mode == ROUNDING_UP
-                    && !x_sign && !y_sign)) {
-                  // add 1 ulp and then check for overflow
-                  C1_lo = C1_lo + 1;
-                  if (C1_lo == 0) { // rounding overflow in the low 64 bits
-                    C1_hi = C1_hi + 1;
-                  }
-                  if (C1_hi == 0x0001ed09bead87c0ull
-                      && C1_lo == 0x378d8e6400000000ull) {
-                    // C1 = 10^34 => rounding overflow
-                    C1_hi = 0x0000314dc6448d93ull;
-                    C1_lo = 0x38c15b0a00000000ull; // 10^33
-                    x_exp = x_exp + EXP_P1;
-                    if (x_exp == EXP_MAX_P1) { // overflow
-                      C1_hi = 0x7800000000000000ull; // +inf
-                      C1_lo = 0x0ull;
-                      x_exp = 0; // x_sign is preserved
-                      // set overflow flag (the inexact flag was set too)
-                      *pfpsf |= OVERFLOW_EXCEPTION;
-                    }
-                  }
-                } else if ((rnd_mode == ROUNDING_DOWN && !x_sign && y_sign) || 
-                    (rnd_mode == ROUNDING_UP && x_sign && !y_sign) || 
-                    (rnd_mode == ROUNDING_TO_ZERO && x_sign != y_sign)) {
-                  // subtract 1 ulp from C1
-                  // Note: because delta >= P34 + 1 the result cannot be zero
-                  C1_lo = C1_lo - 1;
-                  if (C1_lo == 0xffffffffffffffffull)
-                    C1_hi = C1_hi - 1;
-                  // if the coefficient is 10^33-1 then make it 10^34-1 and 
-                  // decrease the exponent by 1 (because delta >= P34 + 1 the
-                  // exponent will not become less than e_min)
-                  // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
-                  // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
-                  if (C1_hi == 0x0000314dc6448d93ull
-                      && C1_lo == 0x38c15b09ffffffffull) {
-                    // make C1 = 10^34  - 1
-                    C1_hi = 0x0001ed09bead87c0ull;
-                    C1_lo = 0x378d8e63ffffffffull;
-                    x_exp = x_exp - EXP_P1;
-                  }
-                } else {
-                  ; // the result is already correct
-                }
-              }
-              // set the inexact flag
-              *pfpsf |= INEXACT_EXCEPTION;
-              // assemble the result
-              res.w[1] = x_sign | x_exp | C1_hi;
-              res.w[0] = C1_lo;
-            } else if ((C2_lo == halfulp64)
-                       && (q1 < P34 || ((C1_lo & 0x1) == 0))) {
-              // n2 = 1/2 ulp (n1) and C1 is even
-              // the result is the operand with the larger magnitude,
-              // possibly scaled up by 10^(P34-q1)
-              // an overflow cannot occur in this case (rounding to nearest)
-              if (q1 < P34) { // scale C1 up by 10^(P34-q1)
-                // Note: because delta = P34 it is certain that
-                //     x_exp - ((UINT64)scale << 49) will stay above e_min
-                scale = P34 - q1;
-                if (q1 <= 19) { // C1 fits in 64 bits
-                  // 1 <= q1 <= 19 => 15 <= scale <= 33
-                  if (scale <= 19) { // 10^scale fits in 64 bits
-                    __mul_64x64_to_128MACH (C1, __bid_ten2k64[scale], C1_lo);
-                  } else { // if 20 <= scale <= 33 
-                    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
-                    // (C1 * 10^(scale-19)) fits in 64 bits  
-                    C1_lo = C1_lo * __bid_ten2k64[scale - 19];
-                    __mul_64x64_to_128MACH (C1, __bid_ten2k64[19], C1_lo);
-                  }
-                } else { //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
-                  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits 
-                  C1.w[1] = C1_hi;
-                  C1.w[0] = C1_lo;
-                  // C1 = __bid_ten2k64[P34 - q1] * C1 
-                  __mul_128x64_to_128 (C1, __bid_ten2k64[P34 - q1], C1);
-                }
-                x_exp = x_exp - ((UINT64) scale << 49);
-                C1_hi = C1.w[1];
-                C1_lo = C1.w[0];
-              }
-              if ((rnd_mode == ROUNDING_TO_NEAREST
-                  && x_sign == y_sign && (C1_lo & 0x01))
-                  || (rnd_mode == ROUNDING_TIES_AWAY
-                  && x_sign == y_sign) || (rnd_mode == ROUNDING_UP
-                  && !x_sign && !y_sign)
-                  || (rnd_mode == ROUNDING_DOWN && x_sign && y_sign)) {
-                // add 1 ulp and then check for overflow
-                C1_lo = C1_lo + 1;
-                if (C1_lo == 0) { // rounding overflow in the low 64 bits
-                  C1_hi = C1_hi + 1;
-                }
-                if (C1_hi == 0x0001ed09bead87c0ull
-                    && C1_lo == 0x378d8e6400000000ull) {
-                  // C1 = 10^34 => rounding overflow
-                  C1_hi = 0x0000314dc6448d93ull;
-                  C1_lo = 0x38c15b0a00000000ull; // 10^33
-                  x_exp = x_exp + EXP_P1;
-                  if (x_exp == EXP_MAX_P1) { // overflow
-                    C1_hi = 0x7800000000000000ull; // +inf
-                    C1_lo = 0x0ull;
-                    x_exp = 0; // x_sign is preserved
-                    // set overflow flag (the inexact flag was set too)
-                    *pfpsf |= OVERFLOW_EXCEPTION;
-                  }
-                }
-              } else if ((rnd_mode == ROUNDING_TO_NEAREST
-                         && x_sign != y_sign && (C1_lo & 0x01))
-                         || (rnd_mode == ROUNDING_DOWN && !x_sign
-                         && y_sign) || (rnd_mode == ROUNDING_UP
-                         && x_sign && !y_sign)
-                         || (rnd_mode == ROUNDING_TO_ZERO
-                         && x_sign != y_sign)) {
-                // subtract 1 ulp from C1
-                // Note: because delta >= P34 + 1 the result cannot be zero
-                C1_lo = C1_lo - 1;
-                if (C1_lo == 0xffffffffffffffffull)
-                  C1_hi = C1_hi - 1;
-                // if the coefficient is 10^33 - 1 then make it 10^34 - 1
-                // and decrease the exponent by 1 (because delta >= P34 + 1
-                // the exponent will not become less than e_min)
-                // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
-                // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
-                if (C1_hi == 0x0000314dc6448d93ull
-                    && C1_lo == 0x38c15b09ffffffffull) {
-                  // make C1 = 10^34  - 1
-                  C1_hi = 0x0001ed09bead87c0ull;
-                  C1_lo = 0x378d8e63ffffffffull;
-                  x_exp = x_exp - EXP_P1;
-                }
-              } else {
-                ; // the result is already correct
-              }
-              // set the inexact flag
-              *pfpsf |= INEXACT_EXCEPTION;
-              // assemble the result 
-              res.w[1] = x_sign | x_exp | C1_hi;
-              res.w[0] = C1_lo;
-            } else { // if C2_lo > halfulp64 || 
-              // (C2_lo == halfulp64 && q1 == P34 && ((C1_lo & 0x1) == 1)), i.e.
-              // 1/2 ulp(n1) < n2 < 1 ulp(n1) or n2 = 1/2 ulp(n1) and C1 odd
-              // res = x+1 ulp if n1*n2 > 0 and res = x-1 ulp if n1*n2 < 0
-              if (q1 < P34) { // then 1 ulp = 10^(e1+q1-P34) < 10^e1
-                // Note: if (q1 == P34) then 1 ulp = 10^(e1+q1-P34) = 10^e1
-                // because q1 < P34 we must first replace C1 by 
-                // C1 * 10^(P34-q1), and must decrease the exponent by 
-                // (P34-q1) (it will still be at least e_min)
-                scale = P34 - q1;
-                if (q1 <= 19) { // C1 fits in 64 bits
-                  // 1 <= q1 <= 19 => 15 <= scale <= 33
-                  if (scale <= 19) { // 10^scale fits in 64 bits
-                    __mul_64x64_to_128MACH (C1, __bid_ten2k64[scale], C1_lo);
-                  } else { // if 20 <= scale <= 33
-                    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
-                    // (C1 * 10^(scale-19)) fits in 64 bits
-                    C1_lo = C1_lo * __bid_ten2k64[scale - 19];
-                    __mul_64x64_to_128MACH (C1, __bid_ten2k64[19], C1_lo);
-                  }
-                } else { //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
-                  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
-                  C1.w[1] = C1_hi;
-                  C1.w[0] = C1_lo;
-                  // C1 = __bid_ten2k64[P34 - q1] * C1
-                  __mul_128x64_to_128 (C1, __bid_ten2k64[P34 - q1], C1);
-                }
-                x_exp = x_exp - ((UINT64) scale << 49);
-                C1_hi = C1.w[1];
-                C1_lo = C1.w[0];
-                // check for rounding overflow
-                if (C1_hi == 0x0001ed09bead87c0ull
-                    && C1_lo == 0x378d8e6400000000ull) {
-                  // C1 = 10^34 => rounding overflow 
-                  C1_hi = 0x0000314dc6448d93ull;
-                  C1_lo = 0x38c15b0a00000000ull; // 10^33
-                  x_exp = x_exp + EXP_P1;
-                }
-              }
-              if ((rnd_mode == ROUNDING_TO_NEAREST
-                  && x_sign != y_sign)
-                  || (rnd_mode == ROUNDING_TIES_AWAY
-                  && x_sign != y_sign && C2_lo != halfulp64)
-                  || (rnd_mode == ROUNDING_DOWN && !x_sign
-                  && y_sign) || (rnd_mode == ROUNDING_UP && x_sign
-                  && !y_sign) || (rnd_mode == ROUNDING_TO_ZERO
-                  && x_sign != y_sign)) {
-                // the result is x - 1
-                // for RN n1 * n2 < 0; underflow not possible
-                C1_lo = C1_lo - 1;
-                if (C1_lo == 0xffffffffffffffffull)
-                  C1_hi--;
-                // check if we crossed into the lower decade
-                if (C1_hi == 0x0000314dc6448d93ull && 
-                    C1_lo == 0x38c15b09ffffffffull) { // 10^33 - 1
-                  C1_hi = 0x0001ed09bead87c0ull; // 10^34 - 1
-                  C1_lo = 0x378d8e63ffffffffull;
-                  x_exp = x_exp - EXP_P1; // no underflow, because n1 >> n2
-                }
-              } else if ((rnd_mode == ROUNDING_TO_NEAREST
-                         && x_sign == y_sign)
-                         || (rnd_mode == ROUNDING_TIES_AWAY
-                         && x_sign == y_sign)
-                         || (rnd_mode == ROUNDING_DOWN && x_sign
-                         && y_sign) || (rnd_mode == ROUNDING_UP
-                         && !x_sign && !y_sign)) {
-                // the result is x + 1
-                // for RN x_sign = y_sign, i.e. n1*n2 > 0
-                C1_lo = C1_lo + 1;
-                if (C1_lo == 0) { // rounding overflow in the low 64 bits
-                  C1_hi = C1_hi + 1;
-                }
-                if (C1_hi == 0x0001ed09bead87c0ull
-                    && C1_lo == 0x378d8e6400000000ull) {
-                  // C1 = 10^34 => rounding overflow
-                  C1_hi = 0x0000314dc6448d93ull;
-                  C1_lo = 0x38c15b0a00000000ull; // 10^33
-                  x_exp = x_exp + EXP_P1;
-                  if (x_exp == EXP_MAX_P1) { // overflow
-                    C1_hi = 0x7800000000000000ull; // +inf
-                    C1_lo = 0x0ull;
-                    x_exp = 0; // x_sign is preserved
-                    // set the overflow flag
-                    *pfpsf |= OVERFLOW_EXCEPTION;
-                  }
-                }
-              } else {
-                ; // the result is x
-              }
-              // set the inexact flag
-              *pfpsf |= INEXACT_EXCEPTION;
-              // assemble the result
-              res.w[1] = x_sign | x_exp | C1_hi;
-              res.w[0] = C1_lo;
-            }
-          } else { // if q2 >= 20 then 5*10^(q2-1) and C2 (the latter in 
-            // most cases) fit only in more than 64 bits
-            halfulp128 = __bid_midpoint128[q2 - 20]; // 5 * 10^(q2-1)
-            if ((C2_hi < halfulp128.w[1])
-                || (C2_hi == halfulp128.w[1]
-                    && C2_lo < halfulp128.w[0])) {
-              // n2 < 1/2 ulp (n1)
-              // the result is the operand with the larger magnitude,
-              // possibly scaled up by 10^(P34-q1)
-              // an overflow cannot occur in this case (rounding to nearest)
-              if (q1 < P34) { // scale C1 up by 10^(P34-q1)
-                // Note: because delta = P34 it is certain that
-                //     x_exp - ((UINT64)scale << 49) will stay above e_min
-                scale = P34 - q1;
-                if (q1 <= 19) { // C1 fits in 64 bits
-                  // 1 <= q1 <= 19 => 15 <= scale <= 33
-                  if (scale <= 19) { // 10^scale fits in 64 bits
-                    __mul_64x64_to_128MACH (C1, __bid_ten2k64[scale], C1_lo);
-                  } else { // if 20 <= scale <= 33 
-                    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
-                    // (C1 * 10^(scale-19)) fits in 64 bits  
-                    C1_lo = C1_lo * __bid_ten2k64[scale - 19];
-                    __mul_64x64_to_128MACH (C1, __bid_ten2k64[19], C1_lo);
-                  }
-                } else { //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
-                  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits 
-                  C1.w[1] = C1_hi;
-                  C1.w[0] = C1_lo;
-                  // C1 = __bid_ten2k64[P34 - q1] * C1 
-                  __mul_128x64_to_128 (C1, __bid_ten2k64[P34 - q1], C1);
-                }
-                C1_hi = C1.w[1];
-                C1_lo = C1.w[0];
-                x_exp = x_exp - ((UINT64) scale << 49);
-              }
-              if (rnd_mode != ROUNDING_TO_NEAREST) {
-                if ((rnd_mode == ROUNDING_DOWN && x_sign
-                    && y_sign) || (rnd_mode == ROUNDING_UP
-                    && !x_sign && !y_sign)) {
-                  // add 1 ulp and then check for overflow
-                  C1_lo = C1_lo + 1;
-                  if (C1_lo == 0) { // rounding overflow in the low 64 bits
-                    C1_hi = C1_hi + 1;
-                  }
-                  if (C1_hi == 0x0001ed09bead87c0ull
-                      && C1_lo == 0x378d8e6400000000ull) {
-                    // C1 = 10^34 => rounding overflow
-                    C1_hi = 0x0000314dc6448d93ull;
-                    C1_lo = 0x38c15b0a00000000ull; // 10^33
-                    x_exp = x_exp + EXP_P1;
-                    if (x_exp == EXP_MAX_P1) { // overflow
-                      C1_hi = 0x7800000000000000ull; // +inf
-                      C1_lo = 0x0ull;
-                      x_exp = 0; // x_sign is preserved
-                      // set overflow flag (the inexact flag was set too)
-                      *pfpsf |= OVERFLOW_EXCEPTION;
-                    }
-                  }
-                } else if ((rnd_mode == ROUNDING_DOWN && !x_sign
-                           && y_sign) || (rnd_mode == ROUNDING_UP
-                           && x_sign && !y_sign)
-                           || (rnd_mode == ROUNDING_TO_ZERO
-                           && x_sign != y_sign)) {
-                  // subtract 1 ulp from C1
-                  // Note: because delta >= P34 + 1 the result cannot be zero
-                  C1_lo = C1_lo - 1;
-                  if (C1_lo == 0xffffffffffffffffull)
-                    C1_hi = C1_hi - 1;
-                  // if the coefficient is 10^33-1 then make it 10^34-1 and
-                  // decrease the exponent by 1 (because delta >= P34 + 1 the
-                  // exponent will not become less than e_min)
-                  // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
-                  // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
-                  if (C1_hi == 0x0000314dc6448d93ull
-                      && C1_lo == 0x38c15b09ffffffffull) {
-                    // make C1 = 10^34  - 1
-                    C1_hi = 0x0001ed09bead87c0ull;
-                    C1_lo = 0x378d8e63ffffffffull;
-                    x_exp = x_exp - EXP_P1;
-                  }
-                } else {
-                  ; // the result is already correct
-                }
-              }
-              // set the inexact flag 
-              *pfpsf |= INEXACT_EXCEPTION;
-              // assemble the result 
-              res.w[1] = x_sign | x_exp | C1_hi;
-              res.w[0] = C1_lo;
-            } else if ((C2_hi == halfulp128.w[1]
-                        && C2_lo == halfulp128.w[0])
-                       && (q1 < P34 || ((C1_lo & 0x1) == 0))) {
-              // midpoint & lsb in C1 is 0
-              // n2 = 1/2 ulp (n1) and C1 is even
-              // the result is the operand with the larger magnitude,
-              // possibly scaled up by 10^(P34-q1)
-              // an overflow cannot occur in this case (rounding to nearest)
-              if (q1 < P34) { // scale C1 up by 10^(P34-q1)
-                // Note: because delta = P34 it is certain that
-                //     x_exp - ((UINT64)scale << 49) will stay above e_min
-                scale = P34 - q1;
-                if (q1 <= 19) { // C1 fits in 64 bits
-                  // 1 <= q1 <= 19 => 15 <= scale <= 33
-                  if (scale <= 19) { // 10^scale fits in 64 bits
-                    __mul_64x64_to_128MACH (C1, __bid_ten2k64[scale], C1_lo);
-                  } else { // if 20 <= scale <= 33
-                    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
-                    // (C1 * 10^(scale-19)) fits in 64 bits
-                    C1_lo = C1_lo * __bid_ten2k64[scale - 19];
-                    __mul_64x64_to_128MACH (C1, __bid_ten2k64[19], C1_lo);
-                  }
-                } else { //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
-                  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
-                  C1.w[1] = C1_hi;
-                  C1.w[0] = C1_lo;
-                  // C1 = __bid_ten2k64[P34 - q1] * C1
-                  __mul_128x64_to_128 (C1, __bid_ten2k64[P34 - q1], C1);
-                }
-                x_exp = x_exp - ((UINT64) scale << 49);
-                C1_hi = C1.w[1];
-                C1_lo = C1.w[0];
-              }
-              if (rnd_mode != ROUNDING_TO_NEAREST) {
-                if ((rnd_mode == ROUNDING_TIES_AWAY
-                    && x_sign == y_sign)
-                    || (rnd_mode == ROUNDING_UP && !y_sign)) {
-                  // add 1 ulp and then check for overflow
-                  C1_lo = C1_lo + 1;
-                  if (C1_lo == 0) { // rounding overflow in the low 64 bits
-                    C1_hi = C1_hi + 1;
-                  }
-                  if (C1_hi == 0x0001ed09bead87c0ull
-                      && C1_lo == 0x378d8e6400000000ull) {
-                    // C1 = 10^34 => rounding overflow
-                    C1_hi = 0x0000314dc6448d93ull;
-                    C1_lo = 0x38c15b0a00000000ull; // 10^33
-                    x_exp = x_exp + EXP_P1;
-                    if (x_exp == EXP_MAX_P1) { // overflow
-                      C1_hi = 0x7800000000000000ull; // +inf
-                      C1_lo = 0x0ull;
-                      x_exp = 0; // x_sign is preserved
-                      // set overflow flag (the inexact flag was set too)
-                      *pfpsf |= OVERFLOW_EXCEPTION;
-                    }
-                  }
-                } else if ((rnd_mode == ROUNDING_DOWN && y_sign)
-                           || (rnd_mode == ROUNDING_TO_ZERO
-                           && x_sign != y_sign)) {
-                  // subtract 1 ulp from C1
-                  // Note: because delta >= P34 + 1 the result cannot be zero
-                  C1_lo = C1_lo - 1;
-                  if (C1_lo == 0xffffffffffffffffull)
-                    C1_hi = C1_hi - 1;
-                  // if the coefficient is 10^33 - 1 then make it 10^34 - 1
-                  // and decrease the exponent by 1 (because delta >= P34 + 1
-                  // the exponent will not become less than e_min)
-                  // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
-                  // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
-                  if (C1_hi == 0x0000314dc6448d93ull
-                      && C1_lo == 0x38c15b09ffffffffull) {
-                    // make C1 = 10^34  - 1
-                    C1_hi = 0x0001ed09bead87c0ull;
-                    C1_lo = 0x378d8e63ffffffffull;
-                    x_exp = x_exp - EXP_P1;
-                  }
-                } else {
-                  ; // the result is already correct
-                }
-              }
-              // set the inexact flag
-              *pfpsf |= INEXACT_EXCEPTION;
-              // assemble the result
-              res.w[1] = x_sign | x_exp | C1_hi;
-              res.w[0] = C1_lo;
-            } else { // if C2 > halfulp128 ||
-              // (C2 == halfulp128 && q1 == P34 && ((C1 & 0x1) == 1)), i.e.
-              // 1/2 ulp(n1) < n2 < 1 ulp(n1) or n2 = 1/2 ulp(n1) and C1 odd
-              // res = x+1 ulp if n1*n2 > 0 and res = x-1 ulp if n1*n2 < 0
-              if (q1 < P34) { // then 1 ulp = 10^(e1+q1-P34) < 10^e1
-                // Note: if (q1 == P34) then 1 ulp = 10^(e1+q1-P34) = 10^e1
-                // because q1 < P34 we must first replace C1 by C1*10^(P34-q1),
-                // and must decrease the exponent by (P34-q1) (it will still be
-                // at least e_min)
-                scale = P34 - q1;
-                if (q1 <= 19) { // C1 fits in 64 bits
-                  // 1 <= q1 <= 19 => 15 <= scale <= 33
-                  if (scale <= 19) { // 10^scale fits in 64 bits
-                    __mul_64x64_to_128MACH (C1, __bid_ten2k64[scale], C1_lo);
-                  } else { // if 20 <= scale <= 33
-                    // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
-                    // (C1 * 10^(scale-19)) fits in 64 bits
-                    C1_lo = C1_lo * __bid_ten2k64[scale - 19];
-                    __mul_64x64_to_128MACH (C1, __bid_ten2k64[19], C1_lo);
-                  }
-                } else { //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
-                  // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
-                  C1.w[1] = C1_hi;
-                  C1.w[0] = C1_lo;
-                  // C1 = __bid_ten2k64[P34 - q1] * C1
-                  __mul_128x64_to_128 (C1, __bid_ten2k64[P34 - q1], C1);
-                }
-                C1_hi = C1.w[1];
-                C1_lo = C1.w[0];
-                x_exp = x_exp - ((UINT64) scale << 49);
-              }
-              if ((rnd_mode == ROUNDING_TO_NEAREST
-                  && x_sign != y_sign) || (rnd_mode == ROUNDING_TIES_AWAY
-                  && x_sign != y_sign && 
-                  (C2_hi != halfulp128.w[1] || C2_lo != halfulp128.w[0])) ||
-                  (rnd_mode == ROUNDING_DOWN && !x_sign
-                  && y_sign) || (rnd_mode == ROUNDING_UP && x_sign
-                  && !y_sign) || (rnd_mode == ROUNDING_TO_ZERO
-                  && x_sign != y_sign)) {
-                // the result is x - 1
-                // for RN n1 * n2 < 0; underflow not possible
-                C1_lo = C1_lo - 1;
-                if (C1_lo == 0xffffffffffffffffull)
-                  C1_hi--;
-                // check if we crossed into the lower decade
-                if (C1_hi == 0x0000314dc6448d93ull && 
-                    C1_lo == 0x38c15b09ffffffffull) { // 10^33 - 1
-                  C1_hi = 0x0001ed09bead87c0ull; // 10^34 - 1
-                  C1_lo = 0x378d8e63ffffffffull;
-                  x_exp = x_exp - EXP_P1; // no underflow, because n1 >> n2
-                }
-              } else if ((rnd_mode == ROUNDING_TO_NEAREST && x_sign == y_sign) 
-                  || (rnd_mode == ROUNDING_TIES_AWAY && x_sign == y_sign) || 
-                  (rnd_mode == ROUNDING_DOWN && x_sign && y_sign) || 
-                  (rnd_mode == ROUNDING_UP && !x_sign && !y_sign)) {
-                // the result is x + 1
-                // for RN x_sign = y_sign, i.e. n1*n2 > 0
-                C1_lo = C1_lo + 1;
-                if (C1_lo == 0) { // rounding overflow in the low 64 bits
-                  C1_hi = C1_hi + 1;
-                }
-                if (C1_hi == 0x0001ed09bead87c0ull
-                    && C1_lo == 0x378d8e6400000000ull) {
-                  // C1 = 10^34 => rounding overflow
-                  C1_hi = 0x0000314dc6448d93ull;
-                  C1_lo = 0x38c15b0a00000000ull; // 10^33
-                  x_exp = x_exp + EXP_P1;
-                  if (x_exp == EXP_MAX_P1) { // overflow
-                    C1_hi = 0x7800000000000000ull; // +inf
-                    C1_lo = 0x0ull;
-                    x_exp = 0; // x_sign is preserved
-                    // set the overflow flag
-                    *pfpsf |= OVERFLOW_EXCEPTION;
-                  }
-                }
-              } else {
-                ; // the result is x
-              }
-              // set the inexact flag
-              *pfpsf |= INEXACT_EXCEPTION;
-              // assemble the result
-              res.w[1] = x_sign | x_exp | C1_hi;
-              res.w[0] = C1_lo;
-            }
-          }        // end q1 >= 20
-          // end case where C1 != 10^(q1-1)
-        } else { // C1 = 10^(q1-1) and x_sign != y_sign
-          // instead of C' = (C1 * 10^(e1-e2) + C2)rnd,P34
-          // calculate C' = C1 * 10^(e1-e2-x1) + (C2 * 10^(-x1))rnd,P34 
-          // where x1 = q2 - 1, 0 <= x1 <= P34 - 1
-          // Because C1 = 10^(q1-1) and x_sign != y_sign, C' will have P34 
-          // digits and n = C' * 10^(e2+x1)
-          // If the result has P34+1 digits, redo the steps above with x1+1
-          // If the result has P34-1 digits or less, redo the steps above with 
-          // x1-1 but only if initially x1 >= 1
-          // NOTE: these two steps can be improved, e.g we could guess if
-          // P34+1 or P34-1 digits will be obtained by adding/subtracting 
-          // just the top 64 bits of the two operands
-          // The result cannot be zero, and it cannot overflow
-          x1 = q2 - 1; // 0 <= x1 <= P34-1
-          // Calculate C1 * 10^(e1-e2-x1) where 1 <= e1-e2-x1 <= P34
-          // scale = (int)(e1 >> 49) - (int)(e2 >> 49) - x1; 0 <= scale <= P34-1
-          scale = P34 - q1 + 1; // scale=e1-e2-x1 = P34+1-q1; 1<=scale<=P34
-          // either C1 or 10^(e1-e2-x1) may not fit is 64 bits,
-          // but their product fits with certainty in 128 bits
-          if (scale >= 20) { //10^(e1-e2-x1) doesn't fit in 64 bits, but C1 does
-            __mul_128x64_to_128 (C1, C1_lo, __bid_ten2k128[scale - 20]);
-          } else { // if (scale >= 1
-            // if 1 <= scale <= 19 then 10^(e1-e2-x1) fits in 64 bits
-            if (q1 <= 19) { // C1 fits in 64 bits
-              __mul_64x64_to_128MACH (C1, C1_lo, __bid_ten2k64[scale]);
-            } else { // q1 >= 20
-              C1.w[1] = C1_hi;
-              C1.w[0] = C1_lo;
-              __mul_128x64_to_128 (C1, __bid_ten2k64[scale], C1);
-            }
-          }
-          tmp64 = C1.w[0]; // C1.w[1], C1.w[0] contains C1 * 10^(e1-e2-x1)
-
-          // now round C2 to q2-x1 = 1 decimal digit
-          // C2' = C2 + 1/2 * 10^x1 = C2 + 5 * 10^(x1-1)
-          ind = x1 - 1; // -1 <= ind <= P34 - 2
-          if (ind >= 0) { // if (x1 >= 1)
-            C2.w[0] = C2_lo;
-            C2.w[1] = C2_hi;
-            if (ind <= 18) {
-              C2.w[0] = C2.w[0] + __bid_midpoint64[ind];
-              if (C2.w[0] < C2_lo)
-                C2.w[1]++;
-            } else { // 19 <= ind <= 32
-              C2.w[0] = C2.w[0] + __bid_midpoint128[ind - 19].w[0];
-              C2.w[1] = C2.w[1] + __bid_midpoint128[ind - 19].w[1];
-              if (C2.w[0] < C2_lo)
-                C2.w[1]++;
-            }
-            // the approximation of 10^(-x1) was rounded up to 118 bits
-            __mul_128x128_to_256 (R256, C2, __bid_ten2mk128[ind]); // R256 = C2*, f2*
-            // calculate C2* and f2*
-            // C2* is actually floor(C2*) in this case
-            // C2* and f2* need shifting and masking, as shown by
-            // __bid_shiftright128[] and __bid_maskhigh128[]
-            // the top Ex bits of 10^(-x1) are T* = __bid_ten2mk128trunc[ind], e.g.
-            // if x1=1, T*=__bid_ten2mk128trunc[0]=0x19999999999999999999999999999999
-            // if (0 < f2* < 10^(-x1)) then
-            //   if floor(C1+C2*) is even then C2* = floor(C2*) - logical right
-            //       shift; C2* has p decimal digits, correct by Prop. 1)
-            //   else if floor(C1+C2*) is odd C2* = floor(C2*)-1 (logical right
-            //       shift; C2* has p decimal digits, correct by Pr. 1)
-            // else
-            //   C2* = floor(C2*) (logical right shift; C has p decimal digits,
-            //       correct by Property 1)
-            // n = C2* * 10^(e2+x1)
-
-            if (ind <= 2) {
-              highf2star.w[1] = 0x0;
-              highf2star.w[0] = 0x0; // low f2* ok
-            } else if (ind <= 21) {
-              highf2star.w[1] = 0x0;
-              highf2star.w[0] = R256.w[2] & __bid_maskhigh128[ind]; // low f2* ok
-            } else {
-              highf2star.w[1] = R256.w[3] & __bid_maskhigh128[ind];
-              highf2star.w[0] = R256.w[2]; // low f2* is ok
-            }
-            // shift right C2* by Ex-128 = __bid_shiftright128[ind]
-            if (ind >= 3) {
-              shift = __bid_shiftright128[ind];
-              if (shift < 64) { // 3 <= shift <= 63
-                R256.w[2] =
-                  (R256.w[2] >> shift) | (R256.w[3] << (64 - shift));
-                R256.w[3] = (R256.w[3] >> shift);
-              } else { // 66 <= shift <= 102
-                R256.w[2] = (R256.w[3] >> (shift - 64));
-                R256.w[3] = 0x0ULL;
-              }
-            }
-            // redundant
-            is_inexact_lt_midpoint = 0;
-            is_inexact_gt_midpoint = 0;
-            is_midpoint_lt_even = 0;
-            is_midpoint_gt_even = 0;
-            // determine inexactness of the rounding of C2*
-            // (cannot be followed by a second rounding)
-            // if (0 < f2* - 1/2 < 10^(-x1)) then
-            //   the result is exact
-            // else (if f2* - 1/2 > T* then)
-            //   the result of is inexact
-            if (ind <= 2) {
-              if (R256.w[1] > 0x8000000000000000ull || 
-                  (R256.w[1] == 0x8000000000000000ull && R256.w[0] > 0x0ull)) {
-                // f2* > 1/2 and the result may be exact
-                tmp64A = R256.w[1] - 0x8000000000000000ull; // f* - 1/2
-                if ((tmp64A > __bid_ten2mk128trunc[ind].w[1]
-                     || (tmp64A == __bid_ten2mk128trunc[ind].w[1]
-                     && R256.w[0] >= __bid_ten2mk128trunc[ind].w[0]))) {
-                  // set the inexact flag
-                  *pfpsf |= INEXACT_EXCEPTION;
-                  // this rounding is applied to C2 only!
-                  // x_sign != y_sign
-                  is_inexact_gt_midpoint = 1;
-                }        // else the result is exact
-                // rounding down, unless a midpoint in [ODD, EVEN]
-              } else { // the result is inexact; f2* <= 1/2
-                // set the inexact flag
-                *pfpsf |= INEXACT_EXCEPTION;
-                // this rounding is applied to C2 only!
-                // x_sign != y_sign
-                is_inexact_lt_midpoint = 1;
-              }
-            } else if (ind <= 21) { // if 3 <= ind <= 21
-              if (highf2star.w[1] > 0x0 || (highf2star.w[1] == 0x0
-                  && highf2star.w[0] > __bid_one_half128[ind])
-                  || (highf2star.w[1] == 0x0
-                  && highf2star.w[0] == __bid_one_half128[ind]
-                  && (R256.w[1] || R256.w[0]))) {
-                // f2* > 1/2 and the result may be exact
-                // Calculate f2* - 1/2
-                tmp64A = highf2star.w[0] - __bid_one_half128[ind];
-                tmp64B = highf2star.w[1];
-                if (tmp64A > highf2star.w[0])
-                  tmp64B--;
-                if (tmp64B || tmp64A
-                    || R256.w[1] > __bid_ten2mk128trunc[ind].w[1]
-                    || (R256.w[1] == __bid_ten2mk128trunc[ind].w[1]
-                    && R256.w[0] > __bid_ten2mk128trunc[ind].w[0])) {
-                  // set the inexact flag
-                  *pfpsf |= INEXACT_EXCEPTION;
-                  // this rounding is applied to C2 only!
-                  // x_sign != y_sign
-                  is_inexact_gt_midpoint = 1;
-                }        // else the result is exact
-              } else { // the result is inexact; f2* <= 1/2
-                // set the inexact flag
-                *pfpsf |= INEXACT_EXCEPTION;
-                // this rounding is applied to C2 only!
-                // x_sign != y_sign
-                is_inexact_lt_midpoint = 1;
-              }
-            } else { // if 22 <= ind <= 33
-              if (highf2star.w[1] > __bid_one_half128[ind]
-                  || (highf2star.w[1] == __bid_one_half128[ind]
-                  && (highf2star.w[0] || R256.w[1]
-                      || R256.w[0]))) {
-                // f2* > 1/2 and the result may be exact
-                // Calculate f2* - 1/2
-                // tmp64A = highf2star.w[0];
-                tmp64B = highf2star.w[1] - __bid_one_half128[ind];
-                if (tmp64B || highf2star.w[0]
-                    || R256.w[1] > __bid_ten2mk128trunc[ind].w[1]
-                    || (R256.w[1] == __bid_ten2mk128trunc[ind].w[1]
-                    && R256.w[0] > __bid_ten2mk128trunc[ind].w[0])) {
-                  // set the inexact flag
-                  *pfpsf |= INEXACT_EXCEPTION;
-                  // this rounding is applied to C2 only!
-                  // x_sign != y_sign
-                  is_inexact_gt_midpoint = 1;
-                }        // else the result is exact
-              } else { // the result is inexact; f2* <= 1/2
-                // set the inexact flag
-                *pfpsf |= INEXACT_EXCEPTION;
-                // this rounding is applied to C2 only!
-                // x_sign != y_sign
-                is_inexact_lt_midpoint = 1;
-              }
-            }
-            // check for midpoints after determining inexactness
-            if ((R256.w[1] || R256.w[0]) && (highf2star.w[1] == 0)
-                && (highf2star.w[0] == 0)
-                && (R256.w[1] < __bid_ten2mk128trunc[ind].w[1]
-                    || (R256.w[1] == __bid_ten2mk128trunc[ind].w[1]
-                    && R256.w[0] <= __bid_ten2mk128trunc[ind].w[0]))) {
-              // the result is a midpoint
-              if ((tmp64 + R256.w[2]) & 0x01) { // MP in [EVEN, ODD]
-                // if floor(C2*) is odd C = floor(C2*) - 1; the result may be 0
-                R256.w[2]--;
-                if (R256.w[2] == 0xffffffffffffffffull)
-                  R256.w[3]--;
-                // this rounding is applied to C2 only!
-                // x_sign != y_sign
-                is_midpoint_lt_even = 1;
-                is_inexact_lt_midpoint = 0;
-                is_inexact_gt_midpoint = 0;
-              } else {
-                // else MP in [ODD, EVEN]
-                // this rounding is applied to C2 only!
-                // x_sign != y_sign
-                is_midpoint_gt_even = 1;
-                is_inexact_lt_midpoint = 0;
-                is_inexact_gt_midpoint = 0;
-              }
-            }
-          } else { // if (ind == -1) only when x1 = 0
-            R256.w[2] = C2_lo;
-            R256.w[3] = C2_hi;
-            is_midpoint_lt_even = 0;
-            is_midpoint_gt_even = 0;
-            is_inexact_lt_midpoint = 0;
-            is_inexact_gt_midpoint = 0;
-          }
-          // and now subtract C1 * 10^(e1-e2-x1) - (C2 * 10^(-x1))rnd,P34
-          // because x_sign != y_sign this last operation is exact
-          C1.w[0] = C1.w[0] - R256.w[2];
-          C1.w[1] = C1.w[1] - R256.w[3];
-          if (C1.w[0] > tmp64)
-            C1.w[1]--; // borrow
-          if (C1.w[1] >= 0x8000000000000000ull) { // negative coefficient!
-            C1.w[0] = ~C1.w[0];
-            C1.w[0]++;
-            C1.w[1] = ~C1.w[1];
-            if (C1.w[0] == 0x0)
-              C1.w[1]++;
-            tmp_sign = y_sign; // the result will have the sign of y
-          } else {
-            tmp_sign = x_sign;
-          }
-          // the difference has exactly P34 digits
-          x_sign = tmp_sign;
-          if (x1 >= 1)
-            y_exp = y_exp + ((UINT64) x1 << 49);
-          C1_hi = C1.w[1];
-          C1_lo = C1.w[0];
-          // general correction from RN to RA, RM, RP, RZ; result uses y_exp
-          if (rnd_mode != ROUNDING_TO_NEAREST) {
-            if ((!x_sign && 
-                ((rnd_mode == ROUNDING_UP && is_inexact_lt_midpoint) || 
-                ((rnd_mode == ROUNDING_TIES_AWAY || rnd_mode == ROUNDING_UP) &&
-                is_midpoint_gt_even))) || 
-                (x_sign && 
-                ((rnd_mode == ROUNDING_DOWN && is_inexact_lt_midpoint) || 
-                ((rnd_mode == ROUNDING_TIES_AWAY || rnd_mode == ROUNDING_DOWN) 
-                && is_midpoint_gt_even)))) {
-              // C1 = C1 + 1
-              C1_lo = C1_lo + 1;
-              if (C1_lo == 0) { // rounding overflow in the low 64 bits
-                C1_hi = C1_hi + 1;
-              }
-              if (C1_hi == 0x0001ed09bead87c0ull
-                  && C1_lo == 0x378d8e6400000000ull) {
-                // C1 = 10^34 => rounding overflow
-                C1_hi = 0x0000314dc6448d93ull;
-                C1_lo = 0x38c15b0a00000000ull; // 10^33
-                y_exp = y_exp + EXP_P1;
-              }
-            } else if ((is_midpoint_lt_even || is_inexact_gt_midpoint) && 
-                ((x_sign && 
-                (rnd_mode == ROUNDING_UP || rnd_mode == ROUNDING_TO_ZERO)) || 
-                (!x_sign && 
-                (rnd_mode == ROUNDING_DOWN || rnd_mode == ROUNDING_TO_ZERO)))) {
-              // C1 = C1 - 1
-              C1_lo = C1_lo - 1;
-              if (C1_lo == 0xffffffffffffffffull)
-                C1_hi--;
-              // check if we crossed into the lower decade
-              if (C1_hi == 0x0000314dc6448d93ull && 
-                  C1_lo == 0x38c15b09ffffffffull) { // 10^33 - 1
-                C1_hi = 0x0001ed09bead87c0ull; // 10^34 - 1
-                C1_lo = 0x378d8e63ffffffffull;
-                y_exp = y_exp - EXP_P1;
-                // no underflow, because delta + q2 >= P34 + 1
-              }
-            } else {
-              ; // exact, the result is already correct
-            }
-          }
-          // assemble the result
-          res.w[1] = x_sign | y_exp | C1_hi;
-          res.w[0] = C1_lo;
-        }
-      }        // end delta = P34
-    } else { // if (|delta| <= P34 - 1)
-      if (delta >= 0) { // if (0 <= delta <= P34 - 1)
-        if (delta <= P34 - 1 - q2) {
-          // calculate C' directly; the result is exact
-          // in this case 1<=q1<=P34-1, 1<=q2<=P34-1 and 0 <= e1-e2 <= P34-2
-          // The coefficient of the result is C1 * 10^(e1-e2) + C2 and the
-          // exponent is e2; either C1 or 10^(e1-e2) may not fit is 64 bits,
-          // but their product fits with certainty in 128 bits (actually in 113)
-          scale = delta - q1 + q2; // scale = (int)(e1 >> 49) - (int)(e2 >> 49) 
-
-          if (scale >= 20) { // 10^(e1-e2) does not fit in 64 bits, but C1 does
-            __mul_128x64_to_128 (C1, C1_lo, __bid_ten2k128[scale - 20]);
-            C1_hi = C1.w[1];
-            C1_lo = C1.w[0];
-          } else if (scale >= 1) {
-            // if 1 <= scale <= 19 then 10^(e1-e2) fits in 64 bits 
-            if (q1 <= 19) { // C1 fits in 64 bits
-              __mul_64x64_to_128MACH (C1, C1_lo, __bid_ten2k64[scale]);
-            } else { // q1 >= 20
-              C1.w[1] = C1_hi;
-              C1.w[0] = C1_lo;
-              __mul_128x64_to_128 (C1, __bid_ten2k64[scale], C1);
-            }
-            C1_hi = C1.w[1];
-            C1_lo = C1.w[0];
-          } else { // if (scale == 0) C1 is unchanged
-            C1.w[0] = C1_lo; // C1.w[1] = C1_hi; 
-          }
-          // now add C2
-          if (x_sign == y_sign) {
-            // the result cannot overflow
-            C1_lo = C1_lo + C2_lo;
-            C1_hi = C1_hi + C2_hi;
-            if (C1_lo < C1.w[0])
-              C1_hi++;
-          } else { // if x_sign != y_sign
-            C1_lo = C1_lo - C2_lo;
-            C1_hi = C1_hi - C2_hi;
-            if (C1_lo > C1.w[0])
-              C1_hi--;
-            // the result can be zero, but it cannot overflow
-            if (C1_lo == 0 && C1_hi == 0) {
-              // assemble the result
-              if (x_exp < y_exp)
-                res.w[1] = x_exp;
-              else
-                res.w[1] = y_exp;
-              res.w[0] = 0;
-              if (rnd_mode == ROUNDING_DOWN) {
-                res.w[1] |= 0x8000000000000000ull;
-              }
-              BID_RETURN (res);
-            }
-            if (C1_hi >= 0x8000000000000000ull) { // negative coefficient!
-              C1_lo = ~C1_lo;
-              C1_lo++;
-              C1_hi = ~C1_hi;
-              if (C1_lo == 0x0)
-                C1_hi++;
-              x_sign = y_sign; // the result will have the sign of y
-            }
-          }
-          // assemble the result
-          res.w[1] = x_sign | y_exp | C1_hi;
-          res.w[0] = C1_lo;
-        } else if (delta == P34 - q2) {
-          // calculate C' directly; the result may be inexact if it requires 
-          // P34+1 decimal digits; in this case the 'cutoff' point for addition
-          // is at the position of the lsb of C2, so 0 <= e1-e2 <= P34-1
-          // The coefficient of the result is C1 * 10^(e1-e2) + C2 and the
-          // exponent is e2; either C1 or 10^(e1-e2) may not fit is 64 bits,
-          // but their product fits with certainty in 128 bits (actually in 113)
-          scale = delta - q1 + q2; // scale = (int)(e1 >> 49) - (int)(e2 >> 49)
-          if (scale >= 20) { // 10^(e1-e2) does not fit in 64 bits, but C1 does
-            __mul_128x64_to_128 (C1, C1_lo, __bid_ten2k128[scale - 20]);
-          } else if (scale >= 1) {
-            // if 1 <= scale <= 19 then 10^(e1-e2) fits in 64 bits
-            if (q1 <= 19) { // C1 fits in 64 bits
-              __mul_64x64_to_128MACH (C1, C1_lo, __bid_ten2k64[scale]);
-            } else { // q1 >= 20
-              C1.w[1] = C1_hi;
-              C1.w[0] = C1_lo;
-              __mul_128x64_to_128 (C1, __bid_ten2k64[scale], C1);
-            }
-          } else { // if (scale == 0) C1 is unchanged
-            C1.w[1] = C1_hi;
-            C1.w[0] = C1_lo; // only the low part is necessary
-          }
-          C1_hi = C1.w[1];
-          C1_lo = C1.w[0];
-          // now add C2
-          if (x_sign == y_sign) {
-            // the result can overflow!
-            C1_lo = C1_lo + C2_lo;
-            C1_hi = C1_hi + C2_hi;
-            if (C1_lo < C1.w[0])
-              C1_hi++;
-            // test for overflow, possible only when C1 >= 10^34
-            if (C1_hi > 0x0001ed09bead87c0ull || 
-                (C1_hi == 0x0001ed09bead87c0ull && 
-                C1_lo >= 0x378d8e6400000000ull)) { // C1 >= 10^34
-              // in this case q = P34 + 1 and x = q - P34 = 1, so multiply 
-              // C'' = C'+ 5 = C1 + 5 by k1 ~ 10^(-1) calculated for P34 + 1 
-              // decimal digits
-              // Calculate C'' = C' + 1/2 * 10^x
-              if (C1_lo >= 0xfffffffffffffffbull) { // low half add has carry
-                C1_lo = C1_lo + 5;
-                C1_hi = C1_hi + 1;
-              } else {
-                C1_lo = C1_lo + 5;
-              }
-              // the approximation of 10^(-1) was rounded up to 118 bits
-              // 10^(-1) =~ 33333333333333333333333333333400 * 2^-129
-              // 10^(-1) =~ 19999999999999999999999999999a00 * 2^-128
-              C1.w[1] = C1_hi;
-              C1.w[0] = C1_lo; // C''
-              __bid_ten2m1.w[1] = 0x1999999999999999ull;
-              __bid_ten2m1.w[0] = 0x9999999999999a00ull;
-              __mul_128x128_to_256 (P256, C1, __bid_ten2m1); // P256 = C*, f*
-              // C* is actually floor(C*) in this case
-              // the top Ex = 128 bits of 10^(-1) are 
-              // T* = 0x00199999999999999999999999999999
-              // if (0 < f* < 10^(-x)) then
-              //   if floor(C*) is even then C = floor(C*) - logical right 
-              //       shift; C has p decimal digits, correct by Prop. 1)
-              //   else if floor(C*) is odd C = floor(C*) - 1 (logical right
-              //       shift; C has p decimal digits, correct by Pr. 1)
-              // else
-              //   C = floor(C*) (logical right shift; C has p decimal digits,
-              //       correct by Property 1)
-              // n = C * 10^(e2+x)
-              if ((P256.w[1] || P256.w[0])
-                  && (P256.w[1] < 0x1999999999999999ull
-                      || (P256.w[1] == 0x1999999999999999ull
-                      && P256.w[0] <= 0x9999999999999999ull))) {
-                // the result is a midpoint
-                if (P256.w[2] & 0x01) {
-                  is_midpoint_gt_even = 1;
-                  // if floor(C*) is odd C = floor(C*) - 1; the result is not 0
-                  P256.w[2]--;
-                  if (P256.w[2] == 0xffffffffffffffffull)
-                    P256.w[3]--;
-                } else {
-                  is_midpoint_lt_even = 1;
-                }
-              }
-              // n = Cstar * 10^(e2+1)
-              y_exp = y_exp + EXP_P1;
-              // C* != 10^P because C* has P34 digits
-              // check for overflow
-              if (y_exp == EXP_MAX_P1
-                  && (rnd_mode == ROUNDING_TO_NEAREST
-                      || rnd_mode == ROUNDING_TIES_AWAY)) {
-                // overflow for RN
-                res.w[1] = x_sign | 0x7800000000000000ull; // +/-inf
-                res.w[0] = 0x0ull;
-                // set the inexact flag
-                *pfpsf |= INEXACT_EXCEPTION;
-                // set the overflow flag
-                *pfpsf |= OVERFLOW_EXCEPTION;
-                BID_RETURN (res);
-              }
-              // if (0 < f* - 1/2 < 10^(-x)) then 
-              //   the result of the addition is exact 
-              // else 
-              //   the result of the addition is inexact
-              if (P256.w[1] > 0x8000000000000000ull || 
-                  (P256.w[1] == 0x8000000000000000ull && 
-                  P256.w[0] > 0x0ull)) { // the result may be exact
-                tmp64 = P256.w[1] - 0x8000000000000000ull; // f* - 1/2
-                if ((tmp64 > 0x1999999999999999ull
-                     || (tmp64 == 0x1999999999999999ull
-                     && P256.w[0] >= 0x9999999999999999ull))) {
-                  // set the inexact flag
-                  *pfpsf |= INEXACT_EXCEPTION;
-                  is_inexact = 1;
-                }        // else the result is exact
-              } else { // the result is inexact
-                // set the inexact flag
-                *pfpsf |= INEXACT_EXCEPTION;
-                is_inexact = 1;
-              }
-              C1_hi = P256.w[3];
-              C1_lo = P256.w[2];
-              if (!is_midpoint_gt_even && !is_midpoint_lt_even) {
-                is_inexact_lt_midpoint = is_inexact
-                  && (P256.w[1] & 0x8000000000000000ull);
-                is_inexact_gt_midpoint = is_inexact
-                  && !(P256.w[1] & 0x8000000000000000ull);
-              }
-              // general correction from RN to RA, RM, RP, RZ; 
-              // result uses y_exp
-              if (rnd_mode != ROUNDING_TO_NEAREST) {
-                if ((!x_sign && 
-                    ((rnd_mode == ROUNDING_UP && is_inexact_lt_midpoint) || 
-                    ((rnd_mode == ROUNDING_TIES_AWAY || rnd_mode == ROUNDING_UP)
-                    && is_midpoint_gt_even))) || 
-                    (x_sign && 
-                    ((rnd_mode == ROUNDING_DOWN && is_inexact_lt_midpoint) || 
-                    ((rnd_mode == ROUNDING_TIES_AWAY || 
-                    rnd_mode == ROUNDING_DOWN) && is_midpoint_gt_even)))) {
-                  // C1 = C1 + 1
-                  C1_lo = C1_lo + 1;
-                  if (C1_lo == 0) { // rounding overflow in the low 64 bits
-                    C1_hi = C1_hi + 1;
-                  }
-                  if (C1_hi == 0x0001ed09bead87c0ull
-                      && C1_lo == 0x378d8e6400000000ull) {
-                    // C1 = 10^34 => rounding overflow
-                    C1_hi = 0x0000314dc6448d93ull;
-                    C1_lo = 0x38c15b0a00000000ull; // 10^33
-                    y_exp = y_exp + EXP_P1;
-                  }
-                } else if ((is_midpoint_lt_even || is_inexact_gt_midpoint) && 
-                    ((x_sign && (rnd_mode == ROUNDING_UP || 
-                    rnd_mode == ROUNDING_TO_ZERO)) || (!x_sign && 
-                    (rnd_mode == ROUNDING_DOWN || 
-                    rnd_mode == ROUNDING_TO_ZERO)))) {
-                  // C1 = C1 - 1
-                  C1_lo = C1_lo - 1;
-                  if (C1_lo == 0xffffffffffffffffull)
-                    C1_hi--;
-                  // check if we crossed into the lower decade
-                  if (C1_hi == 0x0000314dc6448d93ull && 
-                      C1_lo == 0x38c15b09ffffffffull) { // 10^33 - 1
-                    C1_hi = 0x0001ed09bead87c0ull; // 10^34 - 1
-                    C1_lo = 0x378d8e63ffffffffull;
-                    y_exp = y_exp - EXP_P1;
-                    // no underflow, because delta + q2 >= P34 + 1
-                  }
-                } else {
-                  ; // exact, the result is already correct
-                }
-                // in all cases check for overflow (RN and RA solved already)
-                if (y_exp == EXP_MAX_P1) { // overflow
-                  if ((rnd_mode == ROUNDING_DOWN && x_sign) || // RM and res < 0
-                      (rnd_mode == ROUNDING_UP && !x_sign)) { // RP and res > 0
-                    C1_hi = 0x7800000000000000ull; // +inf
-                    C1_lo = 0x0ull;
-                  } else { // RM and res > 0, RP and res < 0, or RZ
-                    C1_hi = 0x5fffed09bead87c0ull;
-                    C1_lo = 0x378d8e63ffffffffull;
-                  }
-                  y_exp = 0; // x_sign is preserved
-                  // set the inexact flag (in case the exact addition was exact)
-                  *pfpsf |= INEXACT_EXCEPTION;
-                  // set the overflow flag
-                  *pfpsf |= OVERFLOW_EXCEPTION;
-                }
-              }
-            } // else if (C1 < 10^34) then C1 is the coeff.; the result is exact
-          } else { // if x_sign != y_sign the result is exact
-            C1_lo = C1_lo - C2_lo;
-            C1_hi = C1_hi - C2_hi;
-            if (C1_lo > C1.w[0])
-              C1_hi--;
-            // the result can be zero, but it cannot overflow
-            if (C1_lo == 0 && C1_hi == 0) {
-              // assemble the result
-              if (x_exp < y_exp)
-                res.w[1] = x_exp;
-              else
-                res.w[1] = y_exp;
-              res.w[0] = 0;
-              if (rnd_mode == ROUNDING_DOWN) {
-                res.w[1] |= 0x8000000000000000ull;
-              }
-              BID_RETURN (res);
-            }
-            if (C1_hi >= 0x8000000000000000ull) { // negative coefficient!
-              C1_lo = ~C1_lo;
-              C1_lo++;
-              C1_hi = ~C1_hi;
-              if (C1_lo == 0x0)
-                C1_hi++;
-              x_sign = y_sign; // the result will have the sign of y
-            }
-          }
-          // assemble the result
-          res.w[1] = x_sign | y_exp | C1_hi;
-          res.w[0] = C1_lo;
-        } else { // if (delta >= P34 + 1 - q2)
-          // instead of C' = (C1 * 10^(e1-e2) + C2)rnd,P34
-          // calculate C' = C1 * 10^(e1-e2-x1) + (C2 * 10^(-x1))rnd,P34 
-          // where x1 = q1 + e1 - e2 - P34, 1 <= x1 <= P34 - 1
-          // In most cases C' will have P34 digits, and n = C' * 10^(e2+x1)
-          // If the result has P34+1 digits, redo the steps above with x1+1
-          // If the result has P34-1 digits or less, redo the steps above with 
-          // x1-1 but only if initially x1 >= 1
-          // NOTE: these two steps can be improved, e.g we could guess if
-          // P34+1 or P34-1 digits will be obtained by adding/subtracting just
-          // the top 64 bits of the two operands
-          // The result cannot be zero, but it can overflow
-          x1 = delta + q2 - P34; // 1 <= x1 <= P34-1
-        roundC2:
-          // Calculate C1 * 10^(e1-e2-x1) where 0 <= e1-e2-x1 <= P34 - 1
-          // scale = (int)(e1 >> 49) - (int)(e2 >> 49) - x1; 0 <= scale <= P34-1
-          scale = delta - q1 + q2 - x1; // scale = e1 - e2 - x1 = P34 - q1
-          // either C1 or 10^(e1-e2-x1) may not fit is 64 bits,
-          // but their product fits with certainty in 128 bits (actually in 113)
-          if (scale >= 20) { //10^(e1-e2-x1) doesn't fit in 64 bits, but C1 does
-            __mul_128x64_to_128 (C1, C1_lo, __bid_ten2k128[scale - 20]);
-          } else if (scale >= 1) {
-            // if 1 <= scale <= 19 then 10^(e1-e2-x1) fits in 64 bits
-            if (q1 <= 19) { // C1 fits in 64 bits
-              __mul_64x64_to_128MACH (C1, C1_lo, __bid_ten2k64[scale]);
-            } else { // q1 >= 20
-              C1.w[1] = C1_hi;
-              C1.w[0] = C1_lo;
-              __mul_128x64_to_128 (C1, __bid_ten2k64[scale], C1);
-            }
-          } else { // if (scale == 0) C1 is unchanged
-            C1.w[1] = C1_hi;
-            C1.w[0] = C1_lo;
-          }
-          tmp64 = C1.w[0]; // C1.w[1], C1.w[0] contains C1 * 10^(e1-e2-x1)
-
-          // now round C2 to q2-x1 decimal digits, where 1<=x1<=q2-1<=P34-1
-          // (but if we got here a second time after x1 = x1 - 1, then 
-          // x1 >= 0; note that for x1 = 0 C2 is unchanged)
-          // C2' = C2 + 1/2 * 10^x1 = C2 + 5 * 10^(x1-1)
-          ind = x1 - 1; // 0 <= ind <= q2-2<=P34-2=32; but note that if x1 = 0
-          // during a second pass, then ind = -1
-          if (ind >= 0) { // if (x1 >= 1)
-            C2.w[0] = C2_lo;
-            C2.w[1] = C2_hi;
-            if (ind <= 18) {
-              C2.w[0] = C2.w[0] + __bid_midpoint64[ind];
-              if (C2.w[0] < C2_lo)
-                C2.w[1]++;
-            } else { // 19 <= ind <= 32
-              C2.w[0] = C2.w[0] + __bid_midpoint128[ind - 19].w[0];
-              C2.w[1] = C2.w[1] + __bid_midpoint128[ind - 19].w[1];
-              if (C2.w[0] < C2_lo)
-                C2.w[1]++;
-            }
-            // the approximation of 10^(-x1) was rounded up to 118 bits
-            __mul_128x128_to_256 (R256, C2, __bid_ten2mk128[ind]); // R256 = C2*, f2*
-            // calculate C2* and f2*
-            // C2* is actually floor(C2*) in this case
-            // C2* and f2* need shifting and masking, as shown by
-            // __bid_shiftright128[] and __bid_maskhigh128[]
-            // the top Ex bits of 10^(-x1) are T* = __bid_ten2mk128trunc[ind], e.g.
-            // if x1=1, T*=__bid_ten2mk128trunc[0]=0x19999999999999999999999999999999
-            // if (0 < f2* < 10^(-x1)) then
-            //   if floor(C1+C2*) is even then C2* = floor(C2*) - logical right
-            //       shift; C2* has p decimal digits, correct by Prop. 1)
-            //   else if floor(C1+C2*) is odd C2* = floor(C2*)-1 (logical right
-            //       shift; C2* has p decimal digits, correct by Pr. 1)
-            // else
-            //   C2* = floor(C2*) (logical right shift; C has p decimal digits,
-            //       correct by Property 1)
-            // n = C2* * 10^(e2+x1)
-
-            if (ind <= 2) {
-              highf2star.w[1] = 0x0;
-              highf2star.w[0] = 0x0; // low f2* ok
-            } else if (ind <= 21) {
-              highf2star.w[1] = 0x0;
-              highf2star.w[0] = R256.w[2] & __bid_maskhigh128[ind]; // low f2* ok
-            } else {
-              highf2star.w[1] = R256.w[3] & __bid_maskhigh128[ind];
-              highf2star.w[0] = R256.w[2]; // low f2* is ok
-            }
-            // shift right C2* by Ex-128 = __bid_shiftright128[ind]
-            if (ind >= 3) {
-              shift = __bid_shiftright128[ind];
-              if (shift < 64) { // 3 <= shift <= 63
-                R256.w[2] =
-                  (R256.w[2] >> shift) | (R256.w[3] << (64 - shift));
-                R256.w[3] = (R256.w[3] >> shift);
-              } else { // 66 <= shift <= 102
-                R256.w[2] = (R256.w[3] >> (shift - 64));
-                R256.w[3] = 0x0ULL;
-              }
-            }
-            if (second_pass) {
-              is_inexact_lt_midpoint = 0;
-              is_inexact_gt_midpoint = 0;
-              is_midpoint_lt_even = 0;
-              is_midpoint_gt_even = 0;
-            }
-            // determine inexactness of the rounding of C2* (this may be 
-            // followed by a second rounding only if we get P34+1 
-            // decimal digits)
-            // if (0 < f2* - 1/2 < 10^(-x1)) then
-            //   the result is exact
-            // else (if f2* - 1/2 > T* then)
-            //   the result of is inexact
-            if (ind <= 2) {
-              if (R256.w[1] > 0x8000000000000000ull || 
-                  (R256.w[1] == 0x8000000000000000ull && R256.w[0] > 0x0ull)) {
-                  // f2* > 1/2 and the result may be exact
-                tmp64A = R256.w[1] - 0x8000000000000000ull; // f* - 1/2
-                if ((tmp64A > __bid_ten2mk128trunc[ind].w[1]
-                     || (tmp64A == __bid_ten2mk128trunc[ind].w[1]
-                     && R256.w[0] >= __bid_ten2mk128trunc[ind].w[0]))) {
-                  // set the inexact flag
-                  // *pfpsf |= INEXACT_EXCEPTION;
-                  tmp_inexact = 1; // may be set again during a second pass
-                  // this rounding is applied to C2 only!
-                  if (x_sign == y_sign)
-                    is_inexact_lt_midpoint = 1;
-                  else        // if (x_sign != y_sign)
-                    is_inexact_gt_midpoint = 1;
-                }        // else the result is exact
-                // rounding down, unless a midpoint in [ODD, EVEN]
-              } else { // the result is inexact; f2* <= 1/2
-                // set the inexact flag
-                // *pfpsf |= INEXACT_EXCEPTION;
-                tmp_inexact = 1; // just in case we will round a second time
-                // rounding up, unless a midpoint in [EVEN, ODD]
-                // this rounding is applied to C2 only!
-                if (x_sign == y_sign)
-                  is_inexact_gt_midpoint = 1;
-                else        // if (x_sign != y_sign)
-                  is_inexact_lt_midpoint = 1;
-              }
-            } else if (ind <= 21) { // if 3 <= ind <= 21
-              if (highf2star.w[1] > 0x0 || (highf2star.w[1] == 0x0
-                  && highf2star.w[0] > __bid_one_half128[ind])
-                  || ((highf2star.w[1] == 0x0
-                  && highf2star.w[0] == __bid_one_half128[ind])
-                  && (R256.w[1] || R256.w[0]))) {
-                // f2* > 1/2 and the result may be exact
-                // Calculate f2* - 1/2
-                tmp64A = highf2star.w[0] - __bid_one_half128[ind];
-                tmp64B = highf2star.w[1];
-                if (tmp64A > highf2star.w[0])
-                  tmp64B--;
-                if (tmp64B || tmp64A
-                    || R256.w[1] > __bid_ten2mk128trunc[ind].w[1]
-                    || (R256.w[1] == __bid_ten2mk128trunc[ind].w[1]
-                    && R256.w[0] > __bid_ten2mk128trunc[ind].w[0])) {
-                  // set the inexact flag
-                  // *pfpsf |= INEXACT_EXCEPTION;
-                  tmp_inexact = 1; // may be set again during a second pass
-                  // this rounding is applied to C2 only!
-                  if (x_sign == y_sign)
-                    is_inexact_lt_midpoint = 1;
-                  else        // if (x_sign != y_sign)
-                    is_inexact_gt_midpoint = 1;
-                }        // else the result is exact
-              } else { // the result is inexact; f2* <= 1/2
-                // set the inexact flag
-                // *pfpsf |= INEXACT_EXCEPTION;
-                tmp_inexact = 1; // may be set again during a second pass
-                // rounding up, unless a midpoint in [EVEN, ODD]
-                // this rounding is applied to C2 only!
-                if (x_sign == y_sign)
-                  is_inexact_gt_midpoint = 1;
-                else        // if (x_sign != y_sign)
-                  is_inexact_lt_midpoint = 1;
-              }
-            } else { // if 22 <= ind <= 33
-              if (highf2star.w[1] > __bid_one_half128[ind]
-                  || (highf2star.w[1] == __bid_one_half128[ind]
-                  && (highf2star.w[0] || R256.w[1]
-                      || R256.w[0]))) {
-                // f2* > 1/2 and the result may be exact
-                // Calculate f2* - 1/2
-                // tmp64A = highf2star.w[0];
-                tmp64B = highf2star.w[1] - __bid_one_half128[ind];
-                if (tmp64B || highf2star.w[0]
-                    || R256.w[1] > __bid_ten2mk128trunc[ind].w[1]
-                    || (R256.w[1] == __bid_ten2mk128trunc[ind].w[1]
-                    && R256.w[0] > __bid_ten2mk128trunc[ind].w[0])) {
-                  // set the inexact flag
-                  // *pfpsf |= INEXACT_EXCEPTION;
-                  tmp_inexact = 1; // may be set again during a second pass
-                  // this rounding is applied to C2 only!
-                  if (x_sign == y_sign)
-                    is_inexact_lt_midpoint = 1;
-                  else        // if (x_sign != y_sign)
-                    is_inexact_gt_midpoint = 1;
-                }        // else the result is exact
-              } else { // the result is inexact; f2* <= 1/2
-                // set the inexact flag
-                // *pfpsf |= INEXACT_EXCEPTION;
-                tmp_inexact = 1; // may be set again during a second pass
-                // rounding up, unless a midpoint in [EVEN, ODD]
-                // this rounding is applied to C2 only!
-                if (x_sign == y_sign)
-                  is_inexact_gt_midpoint = 1;
-                else        // if (x_sign != y_sign)
-                  is_inexact_lt_midpoint = 1;
-              }
-            }
-            // check for midpoints
-            if ((R256.w[1] || R256.w[0]) && (highf2star.w[1] == 0)
-                && (highf2star.w[0] == 0)
-                && (R256.w[1] < __bid_ten2mk128trunc[ind].w[1]
-                    || (R256.w[1] == __bid_ten2mk128trunc[ind].w[1]
-                    && R256.w[0] <= __bid_ten2mk128trunc[ind].w[0]))) {
-              // the result is a midpoint
-              if ((tmp64 + R256.w[2]) & 0x01) { // MP in [EVEN, ODD]
-                // if floor(C2*) is odd C = floor(C2*) - 1; the result may be 0
-                R256.w[2]--;
-                if (R256.w[2] == 0xffffffffffffffffull)
-                  R256.w[3]--;
-                // this rounding is applied to C2 only!
-                if (x_sign == y_sign)
-                  is_midpoint_gt_even = 1;
-                else        // if (x_sign != y_sign)
-                  is_midpoint_lt_even = 1;
-                is_inexact_lt_midpoint = 0;
-                is_inexact_gt_midpoint = 0;
-              } else {
-                // else MP in [ODD, EVEN]
-                // this rounding is applied to C2 only!
-                if (x_sign == y_sign)
-                  is_midpoint_lt_even = 1;
-                else        // if (x_sign != y_sign)
-                  is_midpoint_gt_even = 1;
-                is_inexact_lt_midpoint = 0;
-                is_inexact_gt_midpoint = 0;
-              }
-            }
-            // end if (ind >= 0)
-          } else { // if (ind == -1); only during a 2nd pass, and when x1 = 0
-            R256.w[2] = C2_lo;
-            R256.w[3] = C2_hi;
-            tmp_inexact = 0;
-            // to correct a possible setting to 1 from 1st pass
-            if (second_pass) {
-              is_midpoint_lt_even = 0;
-              is_midpoint_gt_even = 0;
-              is_inexact_lt_midpoint = 0;
-              is_inexact_gt_midpoint = 0;
-            }
-          }
-          // and now add/subtract C1 * 10^(e1-e2-x1) +/- (C2 * 10^(-x1))rnd,P34
-          if (x_sign == y_sign) { // addition; could overflow
-            // no second pass is possible this way (only for x_sign != y_sign)
-            C1.w[0] = C1.w[0] + R256.w[2];
-            C1.w[1] = C1.w[1] + R256.w[3];
-            if (C1.w[0] < tmp64)
-              C1.w[1]++; // carry
-            // if the sum has P34+1 digits, i.e. C1>=10^34 redo the calculation
-            // with x1=x1+1 
-            if (C1.w[1] > 0x0001ed09bead87c0ull || 
-                (C1.w[1] == 0x0001ed09bead87c0ull && 
-                C1.w[0] >= 0x378d8e6400000000ull)) { // C1 >= 10^34
-              // chop off one more digit from the sum, but make sure there is
-              // no double-rounding error (see table - double rounding logic)
-              // now round C1 from P34+1 to P34 decimal digits
-              // C1' = C1 + 1/2 * 10 = C1 + 5
-              if (C1.w[0] >= 0xfffffffffffffffbull) { // low half add has carry
-                C1.w[0] = C1.w[0] + 5;
-                C1.w[1] = C1.w[1] + 1;
-              } else {
-                C1.w[0] = C1.w[0] + 5;
-              }
-              // the approximation of 10^(-1) was rounded up to 118 bits
-              __mul_128x128_to_256 (Q256, C1, __bid_ten2mk128[0]); // Q256 = C1*, f1*
-              // C1* is actually floor(C1*) in this case
-              // the top 128 bits of 10^(-1) are
-              // T* = __bid_ten2mk128trunc[0]=0x19999999999999999999999999999999
-              // if (0 < f1* < 10^(-1)) then
-              //   if floor(C1*) is even then C1* = floor(C1*) - logical right
-              //       shift; C1* has p decimal digits, correct by Prop. 1)
-              //   else if floor(C1*) is odd C1* = floor(C1*) - 1 (logical right
-              //       shift; C1* has p decimal digits, correct by Pr. 1)
-              // else
-              //   C1* = floor(C1*) (logical right shift; C has p decimal digits
-              //       correct by Property 1)
-              // n = C1* * 10^(e2+x1+1)
-              if ((Q256.w[1] || Q256.w[0])
-                  && (Q256.w[1] < __bid_ten2mk128trunc[0].w[1]
-                      || (Q256.w[1] == __bid_ten2mk128trunc[0].w[1]
-                      && Q256.w[0] <= __bid_ten2mk128trunc[0].w[0]))) {
-                // the result is a midpoint
-                if (is_inexact_lt_midpoint) { // for the 1st rounding
-                  is_inexact_gt_midpoint = 1;
-                  is_inexact_lt_midpoint = 0;
-                  is_midpoint_gt_even = 0;
-                  is_midpoint_lt_even = 0;
-                } else if (is_inexact_gt_midpoint) { // for the 1st rounding
-                  Q256.w[2]--;
-                  if (Q256.w[2] == 0xffffffffffffffffull)
-                    Q256.w[3]--;
-                  is_inexact_gt_midpoint = 0;
-                  is_inexact_lt_midpoint = 1;
-                  is_midpoint_gt_even = 0;
-                  is_midpoint_lt_even = 0;
-                } else if (is_midpoint_gt_even) { // for the 1st rounding
-                  // Note: cannot have is_midpoint_lt_even
-                  is_inexact_gt_midpoint = 0;
-                  is_inexact_lt_midpoint = 1;
-                  is_midpoint_gt_even = 0;
-                  is_midpoint_lt_even = 0;
-                } else { // the first rounding must have been exact
-                  if (Q256.w[2] & 0x01) { // MP in [EVEN, ODD]
-                    // the truncated result is correct
-                    Q256.w[2]--;
-                    if (Q256.w[2] == 0xffffffffffffffffull)
-                      Q256.w[3]--;
-                    is_inexact_gt_midpoint = 0;
-                    is_inexact_lt_midpoint = 0;
-                    is_midpoint_gt_even = 1;
-                    is_midpoint_lt_even = 0;
-                  } else { // MP in [ODD, EVEN]
-                    is_inexact_gt_midpoint = 0;
-                    is_inexact_lt_midpoint = 0;
-                    is_midpoint_gt_even = 0;
-                    is_midpoint_lt_even = 1;
-                  }
-                }
-                tmp_inexact = 1; // in all cases
-              } else { // the result is not a midpoint 
-                // determine inexactness of the rounding of C1 (the sum C1+C2*)
-                // if (0 < f1* - 1/2 < 10^(-1)) then
-                //   the result is exact
-                // else (if f1* - 1/2 > T* then)
-                //   the result of is inexact
-                // ind = 0
-                if (Q256.w[1] > 0x8000000000000000ull
-                    || (Q256.w[1] == 0x8000000000000000ull
-                    && Q256.w[0] > 0x0ull)) {
-                  // f1* > 1/2 and the result may be exact
-                  Q256.w[1] = Q256.w[1] - 0x8000000000000000ull; // f1* - 1/2
-                  if ((Q256.w[1] > __bid_ten2mk128trunc[0].w[1]
-                       || (Q256.w[1] == __bid_ten2mk128trunc[0].w[1]
-                       && Q256.w[0] > __bid_ten2mk128trunc[0].w[0]))) {
-                    is_inexact_gt_midpoint = 0;
-                    is_inexact_lt_midpoint = 1;
-                    is_midpoint_gt_even = 0;
-                    is_midpoint_lt_even = 0;
-                    // set the inexact flag
-                    tmp_inexact = 1;
-                    // *pfpsf |= INEXACT_EXCEPTION;
-                  } else { // else the result is exact for the 2nd rounding
-                    if (tmp_inexact) { // if the previous rounding was inexact
-                      if (is_midpoint_lt_even) {
-                        is_inexact_gt_midpoint = 1;
-                        is_midpoint_lt_even = 0;
-                      } else if (is_midpoint_gt_even) {
-                        is_inexact_lt_midpoint = 1;
-                        is_midpoint_gt_even = 0;
-                      } else {
-                        ; // no change
-                      }
-                    }
-                  }
-                  // rounding down, unless a midpoint in [ODD, EVEN]
-                } else { // the result is inexact; f1* <= 1/2
-                  is_inexact_gt_midpoint = 1;
-                  is_inexact_lt_midpoint = 0;
-                  is_midpoint_gt_even = 0;
-                  is_midpoint_lt_even = 0;
-                  // set the inexact flag
-                  tmp_inexact = 1;
-                  // *pfpsf |= INEXACT_EXCEPTION;
-                }
-              }        // end 'the result is not a midpoint'
-              // n = C1 * 10^(e2+x1)
-              C1.w[1] = Q256.w[3];
-              C1.w[0] = Q256.w[2];
-              y_exp = y_exp + ((UINT64) (x1 + 1) << 49);
-            } else { // C1 < 10^34
-              // C1.w[1] and C1.w[0] already set
-              // n = C1 * 10^(e2+x1)
-              y_exp = y_exp + ((UINT64) x1 << 49);
-            }
-            // check for overflow
-            if (y_exp == EXP_MAX_P1
-                && (rnd_mode == ROUNDING_TO_NEAREST
-                    || rnd_mode == ROUNDING_TIES_AWAY)) {
-              res.w[1] = 0x7800000000000000ull | x_sign; // +/-inf
-              res.w[0] = 0x0ull;
-              // set the inexact flag
-              *pfpsf |= INEXACT_EXCEPTION;
-              // set the overflow flag
-              *pfpsf |= OVERFLOW_EXCEPTION;
-              BID_RETURN (res);
-            }        // else no overflow
-          } else { // if x_sign != y_sign the result of this subtract. is exact
-            C1.w[0] = C1.w[0] - R256.w[2];
-            C1.w[1] = C1.w[1] - R256.w[3];
-            if (C1.w[0] > tmp64)
-              C1.w[1]--; // borrow
-            if (C1.w[1] >= 0x8000000000000000ull) { // negative coefficient!
-              C1.w[0] = ~C1.w[0];
-              C1.w[0]++;
-              C1.w[1] = ~C1.w[1];
-              if (C1.w[0] == 0x0)
-                C1.w[1]++;
-              tmp_sign = y_sign; 
-                  // the result will have the sign of y if last rnd
-            } else {
-              tmp_sign = x_sign;
-            }
-            // if the difference has P34-1 digits or less, i.e. C1 < 10^33 then
-            //   redo the calculation with x1=x1-1;
-            // redo the calculation also if C1 = 10^33 and 
-            //   (is_inexact_gt_midpoint or is_midpoint_lt_even);
-            //   (the last part should have really been 
-            //   (is_inexact_lt_midpoint or is_midpoint_gt_even) from
-            //    the rounding of C2, but the position flags have been reversed)
-            // 10^33 = 0x0000314dc6448d93 0x38c15b0a00000000
-            if ((C1.w[1] < 0x0000314dc6448d93ull || 
-                (C1.w[1] == 0x0000314dc6448d93ull && 
-                C1.w[0] < 0x38c15b0a00000000ull)) || 
-                (C1.w[1] == 0x0000314dc6448d93ull && 
-                C1.w[0] == 0x38c15b0a00000000ull && 
-                (is_inexact_gt_midpoint || is_midpoint_lt_even))) { // C1=10^33
-              x1 = x1 - 1; // x1 >= 0
-              if (x1 >= 0) {
-                // clear position flags and tmp_inexact
-                is_midpoint_lt_even = 0;
-                is_midpoint_gt_even = 0;
-                is_inexact_lt_midpoint = 0;
-                is_inexact_gt_midpoint = 0;
-                tmp_inexact = 0;
-                second_pass = 1;
-                goto roundC2; // else result has less than P34 digits
-              }
-            }
-            // if the coefficient of the result is 10^34 it means that this
-            // must be the second pass, and we are done 
-            if (C1.w[1] == 0x0001ed09bead87c0ull && 
-                C1.w[0] == 0x378d8e6400000000ull) { // if  C1 = 10^34
-              C1.w[1] = 0x0000314dc6448d93ull; // C1 = 10^33
-              C1.w[0] = 0x38c15b0a00000000ull;
-              y_exp = y_exp + ((UINT64) 1 << 49);
-            }
-            x_sign = tmp_sign;
-            if (x1 >= 1)
-              y_exp = y_exp + ((UINT64) x1 << 49);
-            // x1 = -1 is possible at the end of a second pass when the 
-            // first pass started with x1 = 1 
-          }
-          C1_hi = C1.w[1];
-          C1_lo = C1.w[0];
-          // general correction from RN to RA, RM, RP, RZ; result uses y_exp
-          if (rnd_mode != ROUNDING_TO_NEAREST) {
-            if ((!x_sign
-                && ((rnd_mode == ROUNDING_UP
-                    && is_inexact_lt_midpoint)
-                    || ((rnd_mode == ROUNDING_TIES_AWAY
-                        || rnd_mode == ROUNDING_UP)
-                    && is_midpoint_gt_even))) || 
-                (x_sign
-                && ((rnd_mode == ROUNDING_DOWN
-                    && is_inexact_lt_midpoint)
-                    || ((rnd_mode == ROUNDING_TIES_AWAY
-                        || rnd_mode == ROUNDING_DOWN)
-                    && is_midpoint_gt_even)))) {
-              // C1 = C1 + 1
-              C1_lo = C1_lo + 1;
-              if (C1_lo == 0) { // rounding overflow in the low 64 bits
-                C1_hi = C1_hi + 1;
-              }
-              if (C1_hi == 0x0001ed09bead87c0ull
-                  && C1_lo == 0x378d8e6400000000ull) {
-                // C1 = 10^34 => rounding overflow
-                C1_hi = 0x0000314dc6448d93ull;
-                C1_lo = 0x38c15b0a00000000ull; // 10^33
-                y_exp = y_exp + EXP_P1;
-              }
-            } else if ((is_midpoint_lt_even || is_inexact_gt_midpoint) &&
-                ((x_sign &&
-                (rnd_mode == ROUNDING_UP || rnd_mode == ROUNDING_TO_ZERO)) ||
-                (!x_sign &&
-                (rnd_mode == ROUNDING_DOWN || rnd_mode == ROUNDING_TO_ZERO)))) {
-              // C1 = C1 - 1
-              C1_lo = C1_lo - 1;
-              if (C1_lo == 0xffffffffffffffffull)
-                C1_hi--;
-              // check if we crossed into the lower decade
-              if (C1_hi == 0x0000314dc6448d93ull && 
-                C1_lo == 0x38c15b09ffffffffull) { // 10^33 - 1
-                C1_hi = 0x0001ed09bead87c0ull; // 10^34 - 1
-                C1_lo = 0x378d8e63ffffffffull;
-                y_exp = y_exp - EXP_P1;
-                // no underflow, because delta + q2 >= P34 + 1
-              }
-            } else {
-              ; // exact, the result is already correct
-            }
-            // in all cases check for overflow (RN and RA solved already)
-            if (y_exp == EXP_MAX_P1) { // overflow
-              if ((rnd_mode == ROUNDING_DOWN && x_sign) || // RM and res < 0
-                  (rnd_mode == ROUNDING_UP && !x_sign)) { // RP and res > 0
-                C1_hi = 0x7800000000000000ull; // +inf
-                C1_lo = 0x0ull;
-              } else { // RM and res > 0, RP and res < 0, or RZ
-                C1_hi = 0x5fffed09bead87c0ull;
-                C1_lo = 0x378d8e63ffffffffull;
-              }
-              y_exp = 0; // x_sign is preserved
-              // set the inexact flag (in case the exact addition was exact)
-              *pfpsf |= INEXACT_EXCEPTION;
-              // set the overflow flag
-              *pfpsf |= OVERFLOW_EXCEPTION;
-            }
-          }
-          // assemble the result
-          res.w[1] = x_sign | y_exp | C1_hi;
-          res.w[0] = C1_lo;
-          if (tmp_inexact)
-            *pfpsf |= INEXACT_EXCEPTION;
-        }
-      } else { // if (-P34 + 1 <= delta <= -1) <=> 1 <= -delta <= P34 - 1
-        // NOTE: the following, up to "} else { // if x_sign != y_sign 
-        // the result is exact" is identical to "else if (delta == P34 - q2) {"
-        // from above; also, the code is not symmetric: a+b and b+a may take
-        // different paths (need to unify eventually!) 
-        // calculate C' = C2 + C1 * 10^(e1-e2) directly; the result may be 
-        // inexact if it requires P34 + 1 decimal digits; in either case the 
-        // 'cutoff' point for addition is at the position of the lsb of C2
-        // The coefficient of the result is C1 * 10^(e1-e2) + C2 and the
-        // exponent is e2; either C1 or 10^(e1-e2) may not fit is 64 bits,
-        // but their product fits with certainty in 128 bits (actually in 113)
-        // Note that 0 <= e1 - e2 <= P34 - 2
-        //   -P34 + 1 <= delta <= -1 <=> -P34 + 1 <= delta <= -1 <=>
-        //   -P34 + 1 <= q1 + e1 - q2 - e2 <= -1 <=>
-        //   q2 - q1 - P34 + 1 <= e1 - e2 <= q2 - q1 - 1 <=>
-        //   1 - P34 - P34 + 1 <= e1-e2 <= P34 - 1 - 1 => 0 <= e1-e2 <= P34 - 2
-        scale = delta - q1 + q2; // scale = (int)(e1 >> 49) - (int)(e2 >> 49)
-        if (scale >= 20) { // 10^(e1-e2) does not fit in 64 bits, but C1 does
-          __mul_128x64_to_128 (C1, C1_lo, __bid_ten2k128[scale - 20]);
-        } else if (scale >= 1) {
-          // if 1 <= scale <= 19 then 10^(e1-e2) fits in 64 bits
-          if (q1 <= 19) { // C1 fits in 64 bits
-            __mul_64x64_to_128MACH (C1, C1_lo, __bid_ten2k64[scale]);
-          } else { // q1 >= 20
-            C1.w[1] = C1_hi;
-            C1.w[0] = C1_lo;
-            __mul_128x64_to_128 (C1, __bid_ten2k64[scale], C1);
-          }
-        } else { // if (scale == 0) C1 is unchanged
-          C1.w[1] = C1_hi;
-          C1.w[0] = C1_lo; // only the low part is necessary
-        }
-        C1_hi = C1.w[1];
-        C1_lo = C1.w[0];
-        // now add C2
-        if (x_sign == y_sign) {
-          // the result can overflow!
-          C1_lo = C1_lo + C2_lo;
-          C1_hi = C1_hi + C2_hi;
-          if (C1_lo < C1.w[0])
-            C1_hi++;
-          // test for overflow, possible only when C1 >= 10^34
-          if (C1_hi > 0x0001ed09bead87c0ull || 
-              (C1_hi == 0x0001ed09bead87c0ull && 
-              C1_lo >= 0x378d8e6400000000ull)) { // C1 >= 10^34
-            // in this case q = P34 + 1 and x = q - P34 = 1, so multiply 
-            // C'' = C'+ 5 = C1 + 5 by k1 ~ 10^(-1) calculated for P34 + 1 
-            // decimal digits
-            // Calculate C'' = C' + 1/2 * 10^x
-            if (C1_lo >= 0xfffffffffffffffbull) { // low half add has carry
-              C1_lo = C1_lo + 5;
-              C1_hi = C1_hi + 1;
-            } else {
-              C1_lo = C1_lo + 5;
-            }
-            // the approximation of 10^(-1) was rounded up to 118 bits
-            // 10^(-1) =~ 33333333333333333333333333333400 * 2^-129
-            // 10^(-1) =~ 19999999999999999999999999999a00 * 2^-128
-            C1.w[1] = C1_hi;
-            C1.w[0] = C1_lo; // C''
-            __bid_ten2m1.w[1] = 0x1999999999999999ull;
-            __bid_ten2m1.w[0] = 0x9999999999999a00ull;
-            __mul_128x128_to_256 (P256, C1, __bid_ten2m1); // P256 = C*, f*
-            // C* is actually floor(C*) in this case
-            // the top Ex = 128 bits of 10^(-1) are 
-            // T* = 0x00199999999999999999999999999999
-            // if (0 < f* < 10^(-x)) then
-            //   if floor(C*) is even then C = floor(C*) - logical right 
-            //       shift; C has p decimal digits, correct by Prop. 1)
-            //   else if floor(C*) is odd C = floor(C*) - 1 (logical right
-            //       shift; C has p decimal digits, correct by Pr. 1)
-            // else
-            //   C = floor(C*) (logical right shift; C has p decimal digits,
-            //       correct by Property 1)
-            // n = C * 10^(e2+x)
-            if ((P256.w[1] || P256.w[0])
-                && (P256.w[1] < 0x1999999999999999ull
-                    || (P256.w[1] == 0x1999999999999999ull
-                    && P256.w[0] <= 0x9999999999999999ull))) {
-              // the result is a midpoint
-              if (P256.w[2] & 0x01) {
-                is_midpoint_gt_even = 1;
-                // if floor(C*) is odd C = floor(C*) - 1; the result is not 0
-                P256.w[2]--;
-                if (P256.w[2] == 0xffffffffffffffffull)
-                  P256.w[3]--;
-              } else {
-                is_midpoint_lt_even = 1;
-              }
-            }
-            // n = Cstar * 10^(e2+1)
-            y_exp = y_exp + EXP_P1;
-            // C* != 10^P34 because C* has P34 digits
-            // check for overflow
-            if (y_exp == EXP_MAX_P1
-                && (rnd_mode == ROUNDING_TO_NEAREST
-                    || rnd_mode == ROUNDING_TIES_AWAY)) {
-              // overflow for RN
-              res.w[1] = x_sign | 0x7800000000000000ull; // +/-inf
-              res.w[0] = 0x0ull;
-              // set the inexact flag
-              *pfpsf |= INEXACT_EXCEPTION;
-              // set the overflow flag
-              *pfpsf |= OVERFLOW_EXCEPTION;
-              BID_RETURN (res);
-            }
-            // if (0 < f* - 1/2 < 10^(-x)) then 
-            //   the result of the addition is exact 
-            // else 
-            //   the result of the addition is inexact
-            if (P256.w[1] > 0x8000000000000000ull || 
-                (P256.w[1] == 0x8000000000000000ull && 
-                P256.w[0] > 0x0ull)) { // the result may be exact
-              tmp64 = P256.w[1] - 0x8000000000000000ull; // f* - 1/2
-              if ((tmp64 > 0x1999999999999999ull
-                   || (tmp64 == 0x1999999999999999ull
-                   && P256.w[0] >= 0x9999999999999999ull))) {
-                // set the inexact flag
-                *pfpsf |= INEXACT_EXCEPTION;
-                is_inexact = 1;
-              }        // else the result is exact
-            } else { // the result is inexact
-              // set the inexact flag
-              *pfpsf |= INEXACT_EXCEPTION;
-              is_inexact = 1;
-            }
-            C1_hi = P256.w[3];
-            C1_lo = P256.w[2];
-            if (!is_midpoint_gt_even && !is_midpoint_lt_even) {
-              is_inexact_lt_midpoint = is_inexact
-                && (P256.w[1] & 0x8000000000000000ull);
-              is_inexact_gt_midpoint = is_inexact
-                && !(P256.w[1] & 0x8000000000000000ull);
-            }
-            // general correction from RN to RA, RM, RP, RZ; result uses y_exp
-            if (rnd_mode != ROUNDING_TO_NEAREST) {
-              if ((!x_sign && 
-                  ((rnd_mode == ROUNDING_UP && is_inexact_lt_midpoint) || 
-                  ((rnd_mode == ROUNDING_TIES_AWAY || rnd_mode == ROUNDING_UP) 
-                  && is_midpoint_gt_even))) || 
-                  (x_sign && 
-                  ((rnd_mode == ROUNDING_DOWN && is_inexact_lt_midpoint) || 
-                  ((rnd_mode == ROUNDING_TIES_AWAY || rnd_mode == ROUNDING_DOWN)
-                  && is_midpoint_gt_even)))) {
-                // C1 = C1 + 1
-                C1_lo = C1_lo + 1;
-                if (C1_lo == 0) { // rounding overflow in the low 64 bits
-                  C1_hi = C1_hi + 1;
-                }
-                if (C1_hi == 0x0001ed09bead87c0ull
-                    && C1_lo == 0x378d8e6400000000ull) {
-                  // C1 = 10^34 => rounding overflow
-                  C1_hi = 0x0000314dc6448d93ull;
-                  C1_lo = 0x38c15b0a00000000ull; // 10^33
-                  y_exp = y_exp + EXP_P1;
-                }
-              } else
-                if ((is_midpoint_lt_even || is_inexact_gt_midpoint) && 
-                    ((x_sign && 
-                    (rnd_mode == ROUNDING_UP || 
-                    rnd_mode == ROUNDING_TO_ZERO)) || 
-                    (!x_sign && 
-                    (rnd_mode == ROUNDING_DOWN || 
-                    rnd_mode == ROUNDING_TO_ZERO)))) {
-                // C1 = C1 - 1
-                C1_lo = C1_lo - 1;
-                if (C1_lo == 0xffffffffffffffffull)
-                  C1_hi--;
-                // check if we crossed into the lower decade
-                if (C1_hi == 0x0000314dc6448d93ull && 
-                    C1_lo == 0x38c15b09ffffffffull) { // 10^33 - 1
-                  C1_hi = 0x0001ed09bead87c0ull; // 10^34 - 1
-                  C1_lo = 0x378d8e63ffffffffull;
-                  y_exp = y_exp - EXP_P1;
-                  // no underflow, because delta + q2 >= P34 + 1
-                }
-              } else {
-                ; // exact, the result is already correct
-              }
-              // in all cases check for overflow (RN and RA solved already)
-              if (y_exp == EXP_MAX_P1) { // overflow
-                if ((rnd_mode == ROUNDING_DOWN && x_sign) || // RM and res < 0
-                    (rnd_mode == ROUNDING_UP && !x_sign)) { // RP and res > 0
-                  C1_hi = 0x7800000000000000ull; // +inf
-                  C1_lo = 0x0ull;
-                } else { // RM and res > 0, RP and res < 0, or RZ
-                  C1_hi = 0x5fffed09bead87c0ull;
-                  C1_lo = 0x378d8e63ffffffffull;
-                }
-                y_exp = 0; // x_sign is preserved
-                // set the inexact flag (in case the exact addition was exact)
-                *pfpsf |= INEXACT_EXCEPTION;
-                // set the overflow flag
-                *pfpsf |= OVERFLOW_EXCEPTION;
-              }
-            }
-          } // else if (C1 < 10^34) then C1 is the coeff.; the result is exact
-          // assemble the result
-          res.w[1] = x_sign | y_exp | C1_hi;
-          res.w[0] = C1_lo;
-        } else { // if x_sign != y_sign the result is exact
-          C1_lo = C2_lo - C1_lo;
-          C1_hi = C2_hi - C1_hi;
-          if (C1_lo > C2_lo)
-            C1_hi--;
-          if (C1_hi >= 0x8000000000000000ull) { // negative coefficient!
-            C1_lo = ~C1_lo;
-            C1_lo++;
-            C1_hi = ~C1_hi;
-            if (C1_lo == 0x0)
-              C1_hi++;
-            x_sign = y_sign; // the result will have the sign of y
-          }
-          // the result can be zero, but it cannot overflow
-          if (C1_lo == 0 && C1_hi == 0) {
-            // assemble the result
-            if (x_exp < y_exp)
-              res.w[1] = x_exp;
-            else
-              res.w[1] = y_exp;
-            res.w[0] = 0;
-            if (rnd_mode == ROUNDING_DOWN) {
-              res.w[1] |= 0x8000000000000000ull;
-            }
-            BID_RETURN (res);
-          }
-          // assemble the result
-          res.w[1] = y_sign | y_exp | C1_hi;
-          res.w[0] = C1_lo;
-        }
+       // for RN the result is the operand with the larger magnitude,
+       // possibly scaled up by 10^(P34-q1)
+       // an overflow cannot occur in this case (rounding to nearest)
+       if (q1 < P34) { // scale C1 up by 10^(P34-q1)
+         // Note: because delta >= P34+1 it is certain that 
+         //     x_exp - ((UINT64)scale << 49) will stay above e_min
+         scale = P34 - q1;
+         if (q1 <= 19) {       // C1 fits in 64 bits
+           // 1 <= q1 <= 19 => 15 <= scale <= 33
+           if (scale <= 19) {  // 10^scale fits in 64 bits
+             __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
+           } else {    // if 20 <= scale <= 33
+             // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
+             // (C1 * 10^(scale-19)) fits in 64 bits
+             C1_lo = C1_lo * ten2k64[scale - 19];
+             __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
+           }
+         } else {      //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
+           // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
+           C1.w[1] = C1_hi;
+           C1.w[0] = C1_lo;
+           // C1 = ten2k64[P34 - q1] * C1
+           __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
+         }
+         x_exp = x_exp - ((UINT64) scale << 49);
+         C1_hi = C1.w[1];
+         C1_lo = C1.w[0];
+       }
+       // some special cases arise: if delta = P34 + 1 and C1 = 10^(P34-1) 
+       // (after scaling) and x_sign != y_sign and C2 > 5*10^(q2-1) => 
+       // subtract 1 ulp
+       // Note: do this only for rounding to nearest; for other rounding 
+       // modes the correction will be applied next
+       if ((rnd_mode == ROUNDING_TO_NEAREST
+            || rnd_mode == ROUNDING_TIES_AWAY) && delta == (P34 + 1)
+           && C1_hi == 0x0000314dc6448d93ull
+           && C1_lo == 0x38c15b0a00000000ull && x_sign != y_sign
+           && ((q2 <= 19 && C2_lo > midpoint64[q2 - 1]) || (q2 >= 20
+                                                            && (C2_hi >
+                                                                midpoint128
+                                                                [q2 -
+                                                                 20].
+                                                                w[1]
+                                                                ||
+                                                                (C2_hi
+                                                                 ==
+                                                                 midpoint128
+                                                                 [q2 -
+                                                                  20].
+                                                                 w[1]
+                                                                 &&
+                                                                 C2_lo
+                                                                 >
+                                                                 midpoint128
+                                                                 [q2 -
+                                                                  20].
+                                                                 w
+                                                                 [0])))))
+       {
+         // C1 = 10^34 - 1 and decrement x_exp by 1 (no underflow possible)
+         C1_hi = 0x0001ed09bead87c0ull;
+         C1_lo = 0x378d8e63ffffffffull;
+         x_exp = x_exp - EXP_P1;
+       }
+       if (rnd_mode != ROUNDING_TO_NEAREST) {
+         if ((rnd_mode == ROUNDING_DOWN && x_sign && y_sign) ||
+             (rnd_mode == ROUNDING_UP && !x_sign && !y_sign)) {
+           // add 1 ulp and then check for overflow
+           C1_lo = C1_lo + 1;
+           if (C1_lo == 0) {   // rounding overflow in the low 64 bits
+             C1_hi = C1_hi + 1;
+           }
+           if (C1_hi == 0x0001ed09bead87c0ull
+               && C1_lo == 0x378d8e6400000000ull) {
+             // C1 = 10^34 => rounding overflow
+             C1_hi = 0x0000314dc6448d93ull;
+             C1_lo = 0x38c15b0a00000000ull;    // 10^33
+             x_exp = x_exp + EXP_P1;
+             if (x_exp == EXP_MAX_P1) {        // overflow
+               C1_hi = 0x7800000000000000ull;  // +inf
+               C1_lo = 0x0ull;
+               x_exp = 0;      // x_sign is preserved
+               // set overflow flag (the inexact flag was set too)
+               *pfpsf |= OVERFLOW_EXCEPTION;
+             }
+           }
+         } else if ((rnd_mode == ROUNDING_DOWN && !x_sign && y_sign) ||
+                    (rnd_mode == ROUNDING_UP && x_sign && !y_sign) ||
+                    (rnd_mode == ROUNDING_TO_ZERO
+                     && x_sign != y_sign)) {
+           // subtract 1 ulp from C1
+           // Note: because delta >= P34 + 1 the result cannot be zero
+           C1_lo = C1_lo - 1;
+           if (C1_lo == 0xffffffffffffffffull)
+             C1_hi = C1_hi - 1;
+           // if the coefficient is 10^33 - 1 then make it 10^34 - 1 and 
+           // decrease the exponent by 1 (because delta >= P34 + 1 the
+           // exponent will not become less than e_min)
+           // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
+           // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
+           if (C1_hi == 0x0000314dc6448d93ull
+               && C1_lo == 0x38c15b09ffffffffull) {
+             // make C1 = 10^34  - 1
+             C1_hi = 0x0001ed09bead87c0ull;
+             C1_lo = 0x378d8e63ffffffffull;
+             x_exp = x_exp - EXP_P1;
+           }
+         } else {
+           ;   // the result is already correct
+         }
+       }
+       // set the inexact flag
+       *pfpsf |= INEXACT_EXCEPTION;
+       // assemble the result
+       res.w[1] = x_sign | x_exp | C1_hi;
+       res.w[0] = C1_lo;
+      } else { // delta = P34 
+       // in most cases, the smaller operand may be < or = or > 1/2 ulp of the
+       // larger operand
+       // however, the case C1 = 10^(q1-1) and x_sign != y_sign is special due
+       // to accuracy loss after subtraction, and will be treated separately
+       if (x_sign == y_sign || (q1 <= 20
+                                && (C1_hi != 0
+                                    || C1_lo != ten2k64[q1 - 1]))
+           || (q1 >= 21 && (C1_hi != ten2k128[q1 - 21].w[1]
+                            || C1_lo != ten2k128[q1 - 21].w[0]))) {
+         // if x_sign == y_sign or C1 != 10^(q1-1)
+         // compare C2 with 1/2 ulp = 5 * 10^(q2-1), the latter read from table
+         // Note: cases q1<=19 and q1>=20 can be coalesced at some latency cost
+         if (q2 <= 19) {       // C2 and 5*10^(q2-1) both fit in 64 bits
+           halfulp64 = midpoint64[q2 - 1];     // 5 * 10^(q2-1)
+           if (C2_lo < halfulp64) {    // n2 < 1/2 ulp (n1)
+             // for RN the result is the operand with the larger magnitude, 
+             // possibly scaled up by 10^(P34-q1)
+             // an overflow cannot occur in this case (rounding to nearest)
+             if (q1 < P34) {   // scale C1 up by 10^(P34-q1)
+               // Note: because delta = P34 it is certain that
+               //     x_exp - ((UINT64)scale << 49) will stay above e_min
+               scale = P34 - q1;
+               if (q1 <= 19) { // C1 fits in 64 bits
+                 // 1 <= q1 <= 19 => 15 <= scale <= 33
+                 if (scale <= 19) {    // 10^scale fits in 64 bits
+                   __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
+                 } else {      // if 20 <= scale <= 33
+                   // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
+                   // (C1 * 10^(scale-19)) fits in 64 bits
+                   C1_lo = C1_lo * ten2k64[scale - 19];
+                   __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
+                 }
+               } else {        //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
+                 // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
+                 C1.w[1] = C1_hi;
+                 C1.w[0] = C1_lo;
+                 // C1 = ten2k64[P34 - q1] * C1
+                 __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
+               }
+               x_exp = x_exp - ((UINT64) scale << 49);
+               C1_hi = C1.w[1];
+               C1_lo = C1.w[0];
+             }
+             if (rnd_mode != ROUNDING_TO_NEAREST) {
+               if ((rnd_mode == ROUNDING_DOWN && x_sign && y_sign) ||
+                   (rnd_mode == ROUNDING_UP && !x_sign && !y_sign)) {
+                 // add 1 ulp and then check for overflow
+                 C1_lo = C1_lo + 1;
+                 if (C1_lo == 0) {     // rounding overflow in the low 64 bits
+                   C1_hi = C1_hi + 1;
+                 }
+                 if (C1_hi == 0x0001ed09bead87c0ull
+                     && C1_lo == 0x378d8e6400000000ull) {
+                   // C1 = 10^34 => rounding overflow
+                   C1_hi = 0x0000314dc6448d93ull;
+                   C1_lo = 0x38c15b0a00000000ull;      // 10^33
+                   x_exp = x_exp + EXP_P1;
+                   if (x_exp == EXP_MAX_P1) {  // overflow
+                     C1_hi = 0x7800000000000000ull;    // +inf
+                     C1_lo = 0x0ull;
+                     x_exp = 0;        // x_sign is preserved
+                     // set overflow flag (the inexact flag was set too)
+                     *pfpsf |= OVERFLOW_EXCEPTION;
+                   }
+                 }
+               } else
+                 if ((rnd_mode == ROUNDING_DOWN && !x_sign && y_sign)
+                     || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
+                     || (rnd_mode == ROUNDING_TO_ZERO
+                         && x_sign != y_sign)) {
+                 // subtract 1 ulp from C1
+                 // Note: because delta >= P34 + 1 the result cannot be zero
+                 C1_lo = C1_lo - 1;
+                 if (C1_lo == 0xffffffffffffffffull)
+                   C1_hi = C1_hi - 1;
+                 // if the coefficient is 10^33-1 then make it 10^34-1 and 
+                 // decrease the exponent by 1 (because delta >= P34 + 1 the
+                 // exponent will not become less than e_min)
+                 // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
+                 // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
+                 if (C1_hi == 0x0000314dc6448d93ull
+                     && C1_lo == 0x38c15b09ffffffffull) {
+                   // make C1 = 10^34  - 1
+                   C1_hi = 0x0001ed09bead87c0ull;
+                   C1_lo = 0x378d8e63ffffffffull;
+                   x_exp = x_exp - EXP_P1;
+                 }
+               } else {
+                 ;     // the result is already correct
+               }
+             }
+             // set the inexact flag
+             *pfpsf |= INEXACT_EXCEPTION;
+             // assemble the result
+             res.w[1] = x_sign | x_exp | C1_hi;
+             res.w[0] = C1_lo;
+           } else if ((C2_lo == halfulp64)
+                      && (q1 < P34 || ((C1_lo & 0x1) == 0))) {
+             // n2 = 1/2 ulp (n1) and C1 is even
+             // the result is the operand with the larger magnitude,
+             // possibly scaled up by 10^(P34-q1)
+             // an overflow cannot occur in this case (rounding to nearest)
+             if (q1 < P34) {   // scale C1 up by 10^(P34-q1)
+               // Note: because delta = P34 it is certain that
+               //     x_exp - ((UINT64)scale << 49) will stay above e_min
+               scale = P34 - q1;
+               if (q1 <= 19) { // C1 fits in 64 bits
+                 // 1 <= q1 <= 19 => 15 <= scale <= 33
+                 if (scale <= 19) {    // 10^scale fits in 64 bits
+                   __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
+                 } else {      // if 20 <= scale <= 33 
+                   // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
+                   // (C1 * 10^(scale-19)) fits in 64 bits  
+                   C1_lo = C1_lo * ten2k64[scale - 19];
+                   __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
+                 }
+               } else {        //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
+                 // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits 
+                 C1.w[1] = C1_hi;
+                 C1.w[0] = C1_lo;
+                 // C1 = ten2k64[P34 - q1] * C1 
+                 __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
+               }
+               x_exp = x_exp - ((UINT64) scale << 49);
+               C1_hi = C1.w[1];
+               C1_lo = C1.w[0];
+             }
+             if ((rnd_mode == ROUNDING_TO_NEAREST && x_sign == y_sign
+                  && (C1_lo & 0x01)) || (rnd_mode == ROUNDING_TIES_AWAY
+                                         && x_sign == y_sign)
+                 || (rnd_mode == ROUNDING_UP && !x_sign && !y_sign)
+                 || (rnd_mode == ROUNDING_DOWN && x_sign && y_sign)) {
+               // add 1 ulp and then check for overflow
+               C1_lo = C1_lo + 1;
+               if (C1_lo == 0) {       // rounding overflow in the low 64 bits
+                 C1_hi = C1_hi + 1;
+               }
+               if (C1_hi == 0x0001ed09bead87c0ull
+                   && C1_lo == 0x378d8e6400000000ull) {
+                 // C1 = 10^34 => rounding overflow
+                 C1_hi = 0x0000314dc6448d93ull;
+                 C1_lo = 0x38c15b0a00000000ull;        // 10^33
+                 x_exp = x_exp + EXP_P1;
+                 if (x_exp == EXP_MAX_P1) {    // overflow
+                   C1_hi = 0x7800000000000000ull;      // +inf
+                   C1_lo = 0x0ull;
+                   x_exp = 0;  // x_sign is preserved
+                   // set overflow flag (the inexact flag was set too)
+                   *pfpsf |= OVERFLOW_EXCEPTION;
+                 }
+               }
+             } else
+               if ((rnd_mode == ROUNDING_TO_NEAREST && x_sign != y_sign
+                    && (C1_lo & 0x01)) || (rnd_mode == ROUNDING_DOWN
+                                           && !x_sign && y_sign)
+                   || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
+                   || (rnd_mode == ROUNDING_TO_ZERO
+                       && x_sign != y_sign)) {
+               // subtract 1 ulp from C1
+               // Note: because delta >= P34 + 1 the result cannot be zero
+               C1_lo = C1_lo - 1;
+               if (C1_lo == 0xffffffffffffffffull)
+                 C1_hi = C1_hi - 1;
+               // if the coefficient is 10^33 - 1 then make it 10^34 - 1
+               // and decrease the exponent by 1 (because delta >= P34 + 1
+               // the exponent will not become less than e_min)
+               // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
+               // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
+               if (C1_hi == 0x0000314dc6448d93ull
+                   && C1_lo == 0x38c15b09ffffffffull) {
+                 // make C1 = 10^34  - 1
+                 C1_hi = 0x0001ed09bead87c0ull;
+                 C1_lo = 0x378d8e63ffffffffull;
+                 x_exp = x_exp - EXP_P1;
+               }
+             } else {
+               ;       // the result is already correct
+             }
+             // set the inexact flag
+             *pfpsf |= INEXACT_EXCEPTION;
+             // assemble the result 
+             res.w[1] = x_sign | x_exp | C1_hi;
+             res.w[0] = C1_lo;
+           } else {    // if C2_lo > halfulp64 || 
+             // (C2_lo == halfulp64 && q1 == P34 && ((C1_lo & 0x1) == 1)), i.e.
+             // 1/2 ulp(n1) < n2 < 1 ulp(n1) or n2 = 1/2 ulp(n1) and C1 odd
+             // res = x+1 ulp if n1*n2 > 0 and res = x-1 ulp if n1*n2 < 0
+             if (q1 < P34) {   // then 1 ulp = 10^(e1+q1-P34) < 10^e1
+               // Note: if (q1 == P34) then 1 ulp = 10^(e1+q1-P34) = 10^e1
+               // because q1 < P34 we must first replace C1 by 
+               // C1 * 10^(P34-q1), and must decrease the exponent by 
+               // (P34-q1) (it will still be at least e_min)
+               scale = P34 - q1;
+               if (q1 <= 19) { // C1 fits in 64 bits
+                 // 1 <= q1 <= 19 => 15 <= scale <= 33
+                 if (scale <= 19) {    // 10^scale fits in 64 bits
+                   __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
+                 } else {      // if 20 <= scale <= 33
+                   // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
+                   // (C1 * 10^(scale-19)) fits in 64 bits
+                   C1_lo = C1_lo * ten2k64[scale - 19];
+                   __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
+                 }
+               } else {        //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
+                 // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
+                 C1.w[1] = C1_hi;
+                 C1.w[0] = C1_lo;
+                 // C1 = ten2k64[P34 - q1] * C1
+                 __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
+               }
+               x_exp = x_exp - ((UINT64) scale << 49);
+               C1_hi = C1.w[1];
+               C1_lo = C1.w[0];
+               // check for rounding overflow
+               if (C1_hi == 0x0001ed09bead87c0ull
+                   && C1_lo == 0x378d8e6400000000ull) {
+                 // C1 = 10^34 => rounding overflow 
+                 C1_hi = 0x0000314dc6448d93ull;
+                 C1_lo = 0x38c15b0a00000000ull;        // 10^33
+                 x_exp = x_exp + EXP_P1;
+               }
+             }
+             if ((rnd_mode == ROUNDING_TO_NEAREST && x_sign != y_sign)
+                 || (rnd_mode == ROUNDING_TIES_AWAY && x_sign != y_sign
+                     && C2_lo != halfulp64)
+                 || (rnd_mode == ROUNDING_DOWN && !x_sign && y_sign)
+                 || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
+                 || (rnd_mode == ROUNDING_TO_ZERO
+                     && x_sign != y_sign)) {
+               // the result is x - 1
+               // for RN n1 * n2 < 0; underflow not possible
+               C1_lo = C1_lo - 1;
+               if (C1_lo == 0xffffffffffffffffull)
+                 C1_hi--;
+               // check if we crossed into the lower decade
+               if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) { // 10^33 - 1
+                 C1_hi = 0x0001ed09bead87c0ull;        // 10^34 - 1
+                 C1_lo = 0x378d8e63ffffffffull;
+                 x_exp = x_exp - EXP_P1;       // no underflow, because n1 >> n2
+               }
+             } else
+               if ((rnd_mode == ROUNDING_TO_NEAREST
+                    && x_sign == y_sign)
+                   || (rnd_mode == ROUNDING_TIES_AWAY
+                       && x_sign == y_sign)
+                   || (rnd_mode == ROUNDING_DOWN && x_sign && y_sign)
+                   || (rnd_mode == ROUNDING_UP && !x_sign
+                       && !y_sign)) {
+               // the result is x + 1
+               // for RN x_sign = y_sign, i.e. n1*n2 > 0
+               C1_lo = C1_lo + 1;
+               if (C1_lo == 0) {       // rounding overflow in the low 64 bits
+                 C1_hi = C1_hi + 1;
+               }
+               if (C1_hi == 0x0001ed09bead87c0ull
+                   && C1_lo == 0x378d8e6400000000ull) {
+                 // C1 = 10^34 => rounding overflow
+                 C1_hi = 0x0000314dc6448d93ull;
+                 C1_lo = 0x38c15b0a00000000ull;        // 10^33
+                 x_exp = x_exp + EXP_P1;
+                 if (x_exp == EXP_MAX_P1) {    // overflow
+                   C1_hi = 0x7800000000000000ull;      // +inf
+                   C1_lo = 0x0ull;
+                   x_exp = 0;  // x_sign is preserved
+                   // set the overflow flag
+                   *pfpsf |= OVERFLOW_EXCEPTION;
+                 }
+               }
+             } else {
+               ;       // the result is x
+             }
+             // set the inexact flag
+             *pfpsf |= INEXACT_EXCEPTION;
+             // assemble the result
+             res.w[1] = x_sign | x_exp | C1_hi;
+             res.w[0] = C1_lo;
+           }
+         } else {      // if q2 >= 20 then 5*10^(q2-1) and C2 (the latter in 
+           // most cases) fit only in more than 64 bits
+           halfulp128 = midpoint128[q2 - 20];  // 5 * 10^(q2-1)
+           if ((C2_hi < halfulp128.w[1])
+               || (C2_hi == halfulp128.w[1]
+                   && C2_lo < halfulp128.w[0])) {
+             // n2 < 1/2 ulp (n1)
+             // the result is the operand with the larger magnitude,
+             // possibly scaled up by 10^(P34-q1)
+             // an overflow cannot occur in this case (rounding to nearest)
+             if (q1 < P34) {   // scale C1 up by 10^(P34-q1)
+               // Note: because delta = P34 it is certain that
+               //     x_exp - ((UINT64)scale << 49) will stay above e_min
+               scale = P34 - q1;
+               if (q1 <= 19) { // C1 fits in 64 bits
+                 // 1 <= q1 <= 19 => 15 <= scale <= 33
+                 if (scale <= 19) {    // 10^scale fits in 64 bits
+                   __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
+                 } else {      // if 20 <= scale <= 33 
+                   // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
+                   // (C1 * 10^(scale-19)) fits in 64 bits  
+                   C1_lo = C1_lo * ten2k64[scale - 19];
+                   __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
+                 }
+               } else {        //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
+                 // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits 
+                 C1.w[1] = C1_hi;
+                 C1.w[0] = C1_lo;
+                 // C1 = ten2k64[P34 - q1] * C1 
+                 __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
+               }
+               C1_hi = C1.w[1];
+               C1_lo = C1.w[0];
+               x_exp = x_exp - ((UINT64) scale << 49);
+             }
+             if (rnd_mode != ROUNDING_TO_NEAREST) {
+               if ((rnd_mode == ROUNDING_DOWN && x_sign && y_sign) ||
+                   (rnd_mode == ROUNDING_UP && !x_sign && !y_sign)) {
+                 // add 1 ulp and then check for overflow
+                 C1_lo = C1_lo + 1;
+                 if (C1_lo == 0) {     // rounding overflow in the low 64 bits
+                   C1_hi = C1_hi + 1;
+                 }
+                 if (C1_hi == 0x0001ed09bead87c0ull
+                     && C1_lo == 0x378d8e6400000000ull) {
+                   // C1 = 10^34 => rounding overflow
+                   C1_hi = 0x0000314dc6448d93ull;
+                   C1_lo = 0x38c15b0a00000000ull;      // 10^33
+                   x_exp = x_exp + EXP_P1;
+                   if (x_exp == EXP_MAX_P1) {  // overflow
+                     C1_hi = 0x7800000000000000ull;    // +inf
+                     C1_lo = 0x0ull;
+                     x_exp = 0;        // x_sign is preserved
+                     // set overflow flag (the inexact flag was set too)
+                     *pfpsf |= OVERFLOW_EXCEPTION;
+                   }
+                 }
+               } else
+                 if ((rnd_mode == ROUNDING_DOWN && !x_sign && y_sign)
+                     || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
+                     || (rnd_mode == ROUNDING_TO_ZERO
+                         && x_sign != y_sign)) {
+                 // subtract 1 ulp from C1
+                 // Note: because delta >= P34 + 1 the result cannot be zero
+                 C1_lo = C1_lo - 1;
+                 if (C1_lo == 0xffffffffffffffffull)
+                   C1_hi = C1_hi - 1;
+                 // if the coefficient is 10^33-1 then make it 10^34-1 and
+                 // decrease the exponent by 1 (because delta >= P34 + 1 the
+                 // exponent will not become less than e_min)
+                 // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
+                 // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
+                 if (C1_hi == 0x0000314dc6448d93ull
+                     && C1_lo == 0x38c15b09ffffffffull) {
+                   // make C1 = 10^34  - 1
+                   C1_hi = 0x0001ed09bead87c0ull;
+                   C1_lo = 0x378d8e63ffffffffull;
+                   x_exp = x_exp - EXP_P1;
+                 }
+               } else {
+                 ;     // the result is already correct
+               }
+             }
+             // set the inexact flag 
+             *pfpsf |= INEXACT_EXCEPTION;
+             // assemble the result 
+             res.w[1] = x_sign | x_exp | C1_hi;
+             res.w[0] = C1_lo;
+           } else if ((C2_hi == halfulp128.w[1]
+                       && C2_lo == halfulp128.w[0])
+                      && (q1 < P34 || ((C1_lo & 0x1) == 0))) {
+             // midpoint & lsb in C1 is 0
+             // n2 = 1/2 ulp (n1) and C1 is even
+             // the result is the operand with the larger magnitude,
+             // possibly scaled up by 10^(P34-q1)
+             // an overflow cannot occur in this case (rounding to nearest)
+             if (q1 < P34) {   // scale C1 up by 10^(P34-q1)
+               // Note: because delta = P34 it is certain that
+               //     x_exp - ((UINT64)scale << 49) will stay above e_min
+               scale = P34 - q1;
+               if (q1 <= 19) { // C1 fits in 64 bits
+                 // 1 <= q1 <= 19 => 15 <= scale <= 33
+                 if (scale <= 19) {    // 10^scale fits in 64 bits
+                   __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
+                 } else {      // if 20 <= scale <= 33
+                   // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
+                   // (C1 * 10^(scale-19)) fits in 64 bits
+                   C1_lo = C1_lo * ten2k64[scale - 19];
+                   __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
+                 }
+               } else {        //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
+                 // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
+                 C1.w[1] = C1_hi;
+                 C1.w[0] = C1_lo;
+                 // C1 = ten2k64[P34 - q1] * C1
+                 __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
+               }
+               x_exp = x_exp - ((UINT64) scale << 49);
+               C1_hi = C1.w[1];
+               C1_lo = C1.w[0];
+             }
+             if (rnd_mode != ROUNDING_TO_NEAREST) {
+               if ((rnd_mode == ROUNDING_TIES_AWAY && x_sign == y_sign)
+                   || (rnd_mode == ROUNDING_UP && !y_sign)) {
+                 // add 1 ulp and then check for overflow
+                 C1_lo = C1_lo + 1;
+                 if (C1_lo == 0) {     // rounding overflow in the low 64 bits
+                   C1_hi = C1_hi + 1;
+                 }
+                 if (C1_hi == 0x0001ed09bead87c0ull
+                     && C1_lo == 0x378d8e6400000000ull) {
+                   // C1 = 10^34 => rounding overflow
+                   C1_hi = 0x0000314dc6448d93ull;
+                   C1_lo = 0x38c15b0a00000000ull;      // 10^33
+                   x_exp = x_exp + EXP_P1;
+                   if (x_exp == EXP_MAX_P1) {  // overflow
+                     C1_hi = 0x7800000000000000ull;    // +inf
+                     C1_lo = 0x0ull;
+                     x_exp = 0;        // x_sign is preserved
+                     // set overflow flag (the inexact flag was set too)
+                     *pfpsf |= OVERFLOW_EXCEPTION;
+                   }
+                 }
+               } else if ((rnd_mode == ROUNDING_DOWN && y_sign)
+                          || (rnd_mode == ROUNDING_TO_ZERO
+                              && x_sign != y_sign)) {
+                 // subtract 1 ulp from C1
+                 // Note: because delta >= P34 + 1 the result cannot be zero
+                 C1_lo = C1_lo - 1;
+                 if (C1_lo == 0xffffffffffffffffull)
+                   C1_hi = C1_hi - 1;
+                 // if the coefficient is 10^33 - 1 then make it 10^34 - 1
+                 // and decrease the exponent by 1 (because delta >= P34 + 1
+                 // the exponent will not become less than e_min)
+                 // 10^33 - 1 = 0x0000314dc6448d9338c15b09ffffffff
+                 // 10^34 - 1 = 0x0001ed09bead87c0378d8e63ffffffff
+                 if (C1_hi == 0x0000314dc6448d93ull
+                     && C1_lo == 0x38c15b09ffffffffull) {
+                   // make C1 = 10^34  - 1
+                   C1_hi = 0x0001ed09bead87c0ull;
+                   C1_lo = 0x378d8e63ffffffffull;
+                   x_exp = x_exp - EXP_P1;
+                 }
+               } else {
+                 ;     // the result is already correct
+               }
+             }
+             // set the inexact flag
+             *pfpsf |= INEXACT_EXCEPTION;
+             // assemble the result
+             res.w[1] = x_sign | x_exp | C1_hi;
+             res.w[0] = C1_lo;
+           } else {    // if C2 > halfulp128 ||
+             // (C2 == halfulp128 && q1 == P34 && ((C1 & 0x1) == 1)), i.e.
+             // 1/2 ulp(n1) < n2 < 1 ulp(n1) or n2 = 1/2 ulp(n1) and C1 odd
+             // res = x+1 ulp if n1*n2 > 0 and res = x-1 ulp if n1*n2 < 0
+             if (q1 < P34) {   // then 1 ulp = 10^(e1+q1-P34) < 10^e1
+               // Note: if (q1 == P34) then 1 ulp = 10^(e1+q1-P34) = 10^e1
+               // because q1 < P34 we must first replace C1 by C1*10^(P34-q1),
+               // and must decrease the exponent by (P34-q1) (it will still be
+               // at least e_min)
+               scale = P34 - q1;
+               if (q1 <= 19) { // C1 fits in 64 bits
+                 // 1 <= q1 <= 19 => 15 <= scale <= 33
+                 if (scale <= 19) {    // 10^scale fits in 64 bits
+                   __mul_64x64_to_128MACH (C1, ten2k64[scale], C1_lo);
+                 } else {      // if 20 <= scale <= 33
+                   // C1 * 10^scale = (C1 * 10^(scale-19)) * 10^19 where
+                   // (C1 * 10^(scale-19)) fits in 64 bits
+                   C1_lo = C1_lo * ten2k64[scale - 19];
+                   __mul_64x64_to_128MACH (C1, ten2k64[19], C1_lo);
+                 }
+               } else {        //if 20 <= q1 <= 33=P34-1 then C1 fits only in 128 bits
+                 // => 1 <= P34 - q1 <= 14 so 10^(P34-q1) fits in 64 bits
+                 C1.w[1] = C1_hi;
+                 C1.w[0] = C1_lo;
+                 // C1 = ten2k64[P34 - q1] * C1
+                 __mul_128x64_to_128 (C1, ten2k64[P34 - q1], C1);
+               }
+               C1_hi = C1.w[1];
+               C1_lo = C1.w[0];
+               x_exp = x_exp - ((UINT64) scale << 49);
+             }
+             if ((rnd_mode == ROUNDING_TO_NEAREST && x_sign != y_sign)
+                 || (rnd_mode == ROUNDING_TIES_AWAY && x_sign != y_sign
+                     && (C2_hi != halfulp128.w[1]
+                         || C2_lo != halfulp128.w[0]))
+                 || (rnd_mode == ROUNDING_DOWN && !x_sign && y_sign)
+                 || (rnd_mode == ROUNDING_UP && x_sign && !y_sign)
+                 || (rnd_mode == ROUNDING_TO_ZERO
+                     && x_sign != y_sign)) {
+               // the result is x - 1
+               // for RN n1 * n2 < 0; underflow not possible
+               C1_lo = C1_lo - 1;
+               if (C1_lo == 0xffffffffffffffffull)
+                 C1_hi--;
+               // check if we crossed into the lower decade
+               if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) { // 10^33 - 1
+                 C1_hi = 0x0001ed09bead87c0ull;        // 10^34 - 1
+                 C1_lo = 0x378d8e63ffffffffull;
+                 x_exp = x_exp - EXP_P1;       // no underflow, because n1 >> n2
+               }
+             } else
+               if ((rnd_mode == ROUNDING_TO_NEAREST
+                    && x_sign == y_sign)
+                   || (rnd_mode == ROUNDING_TIES_AWAY
+                       && x_sign == y_sign)
+                   || (rnd_mode == ROUNDING_DOWN && x_sign && y_sign)
+                   || (rnd_mode == ROUNDING_UP && !x_sign
+                       && !y_sign)) {
+               // the result is x + 1
+               // for RN x_sign = y_sign, i.e. n1*n2 > 0
+               C1_lo = C1_lo + 1;
+               if (C1_lo == 0) {       // rounding overflow in the low 64 bits
+                 C1_hi = C1_hi + 1;
+               }
+               if (C1_hi == 0x0001ed09bead87c0ull
+                   && C1_lo == 0x378d8e6400000000ull) {
+                 // C1 = 10^34 => rounding overflow
+                 C1_hi = 0x0000314dc6448d93ull;
+                 C1_lo = 0x38c15b0a00000000ull;        // 10^33
+                 x_exp = x_exp + EXP_P1;
+                 if (x_exp == EXP_MAX_P1) {    // overflow
+                   C1_hi = 0x7800000000000000ull;      // +inf
+                   C1_lo = 0x0ull;
+                   x_exp = 0;  // x_sign is preserved
+                   // set the overflow flag
+                   *pfpsf |= OVERFLOW_EXCEPTION;
+                 }
+               }
+             } else {
+               ;       // the result is x
+             }
+             // set the inexact flag
+             *pfpsf |= INEXACT_EXCEPTION;
+             // assemble the result
+             res.w[1] = x_sign | x_exp | C1_hi;
+             res.w[0] = C1_lo;
+           }
+         }     // end q1 >= 20
+         // end case where C1 != 10^(q1-1)
+       } else {        // C1 = 10^(q1-1) and x_sign != y_sign
+         // instead of C' = (C1 * 10^(e1-e2) + C2)rnd,P34
+         // calculate C' = C1 * 10^(e1-e2-x1) + (C2 * 10^(-x1))rnd,P34 
+         // where x1 = q2 - 1, 0 <= x1 <= P34 - 1
+         // Because C1 = 10^(q1-1) and x_sign != y_sign, C' will have P34 
+         // digits and n = C' * 10^(e2+x1)
+         // If the result has P34+1 digits, redo the steps above with x1+1
+         // If the result has P34-1 digits or less, redo the steps above with 
+         // x1-1 but only if initially x1 >= 1
+         // NOTE: these two steps can be improved, e.g we could guess if
+         // P34+1 or P34-1 digits will be obtained by adding/subtracting 
+         // just the top 64 bits of the two operands
+         // The result cannot be zero, and it cannot overflow
+         x1 = q2 - 1;  // 0 <= x1 <= P34-1
+         // Calculate C1 * 10^(e1-e2-x1) where 1 <= e1-e2-x1 <= P34
+         // scale = (int)(e1 >> 49) - (int)(e2 >> 49) - x1; 0 <= scale <= P34-1
+         scale = P34 - q1 + 1; // scale=e1-e2-x1 = P34+1-q1; 1<=scale<=P34
+         // either C1 or 10^(e1-e2-x1) may not fit is 64 bits,
+         // but their product fits with certainty in 128 bits
+         if (scale >= 20) {    //10^(e1-e2-x1) doesn't fit in 64 bits, but C1 does
+           __mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
+         } else {      // if (scale >= 1
+           // if 1 <= scale <= 19 then 10^(e1-e2-x1) fits in 64 bits
+           if (q1 <= 19) {     // C1 fits in 64 bits
+             __mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
+           } else {    // q1 >= 20
+             C1.w[1] = C1_hi;
+             C1.w[0] = C1_lo;
+             __mul_128x64_to_128 (C1, ten2k64[scale], C1);
+           }
+         }
+         tmp64 = C1.w[0];      // C1.w[1], C1.w[0] contains C1 * 10^(e1-e2-x1)
+
+         // now round C2 to q2-x1 = 1 decimal digit
+         // C2' = C2 + 1/2 * 10^x1 = C2 + 5 * 10^(x1-1)
+         ind = x1 - 1; // -1 <= ind <= P34 - 2
+         if (ind >= 0) {       // if (x1 >= 1)
+           C2.w[0] = C2_lo;
+           C2.w[1] = C2_hi;
+           if (ind <= 18) {
+             C2.w[0] = C2.w[0] + midpoint64[ind];
+             if (C2.w[0] < C2_lo)
+               C2.w[1]++;
+           } else {    // 19 <= ind <= 32
+             C2.w[0] = C2.w[0] + midpoint128[ind - 19].w[0];
+             C2.w[1] = C2.w[1] + midpoint128[ind - 19].w[1];
+             if (C2.w[0] < C2_lo)
+               C2.w[1]++;
+           }
+           // the approximation of 10^(-x1) was rounded up to 118 bits
+           __mul_128x128_to_256 (R256, C2, ten2mk128[ind]);    // R256 = C2*, f2*
+           // calculate C2* and f2*
+           // C2* is actually floor(C2*) in this case
+           // C2* and f2* need shifting and masking, as shown by
+           // shiftright128[] and maskhigh128[]
+           // the top Ex bits of 10^(-x1) are T* = ten2mk128trunc[ind], e.g.
+           // if x1=1, T*=ten2mk128trunc[0]=0x19999999999999999999999999999999
+           // if (0 < f2* < 10^(-x1)) then
+           //   if floor(C1+C2*) is even then C2* = floor(C2*) - logical right
+           //       shift; C2* has p decimal digits, correct by Prop. 1)
+           //   else if floor(C1+C2*) is odd C2* = floor(C2*)-1 (logical right
+           //       shift; C2* has p decimal digits, correct by Pr. 1)
+           // else
+           //   C2* = floor(C2*) (logical right shift; C has p decimal digits,
+           //       correct by Property 1)
+           // n = C2* * 10^(e2+x1)
+
+           if (ind <= 2) {
+             highf2star.w[1] = 0x0;
+             highf2star.w[0] = 0x0;    // low f2* ok
+           } else if (ind <= 21) {
+             highf2star.w[1] = 0x0;
+             highf2star.w[0] = R256.w[2] & maskhigh128[ind];   // low f2* ok
+           } else {
+             highf2star.w[1] = R256.w[3] & maskhigh128[ind];
+             highf2star.w[0] = R256.w[2];      // low f2* is ok
+           }
+           // shift right C2* by Ex-128 = shiftright128[ind]
+           if (ind >= 3) {
+             shift = shiftright128[ind];
+             if (shift < 64) { // 3 <= shift <= 63
+               R256.w[2] =
+                 (R256.w[2] >> shift) | (R256.w[3] << (64 - shift));
+               R256.w[3] = (R256.w[3] >> shift);
+             } else {  // 66 <= shift <= 102
+               R256.w[2] = (R256.w[3] >> (shift - 64));
+               R256.w[3] = 0x0ULL;
+             }
+           }
+           // redundant
+           is_inexact_lt_midpoint = 0;
+           is_inexact_gt_midpoint = 0;
+           is_midpoint_lt_even = 0;
+           is_midpoint_gt_even = 0;
+           // determine inexactness of the rounding of C2*
+           // (cannot be followed by a second rounding)
+           // if (0 < f2* - 1/2 < 10^(-x1)) then
+           //   the result is exact
+           // else (if f2* - 1/2 > T* then)
+           //   the result of is inexact
+           if (ind <= 2) {
+             if (R256.w[1] > 0x8000000000000000ull ||
+                 (R256.w[1] == 0x8000000000000000ull
+                  && R256.w[0] > 0x0ull)) {
+               // f2* > 1/2 and the result may be exact
+               tmp64A = R256.w[1] - 0x8000000000000000ull;     // f* - 1/2
+               if ((tmp64A > ten2mk128trunc[ind].w[1]
+                    || (tmp64A == ten2mk128trunc[ind].w[1]
+                        && R256.w[0] >= ten2mk128trunc[ind].w[0]))) {
+                 // set the inexact flag
+                 *pfpsf |= INEXACT_EXCEPTION;
+                 // this rounding is applied to C2 only!
+                 // x_sign != y_sign
+                 is_inexact_gt_midpoint = 1;
+               }       // else the result is exact
+               // rounding down, unless a midpoint in [ODD, EVEN]
+             } else {  // the result is inexact; f2* <= 1/2
+               // set the inexact flag
+               *pfpsf |= INEXACT_EXCEPTION;
+               // this rounding is applied to C2 only!
+               // x_sign != y_sign
+               is_inexact_lt_midpoint = 1;
+             }
+           } else if (ind <= 21) {     // if 3 <= ind <= 21
+             if (highf2star.w[1] > 0x0 || (highf2star.w[1] == 0x0
+                                           && highf2star.w[0] >
+                                           onehalf128[ind])
+                 || (highf2star.w[1] == 0x0
+                     && highf2star.w[0] == onehalf128[ind]
+                     && (R256.w[1] || R256.w[0]))) {
+               // f2* > 1/2 and the result may be exact
+               // Calculate f2* - 1/2
+               tmp64A = highf2star.w[0] - onehalf128[ind];
+               tmp64B = highf2star.w[1];
+               if (tmp64A > highf2star.w[0])
+                 tmp64B--;
+               if (tmp64B || tmp64A
+                   || R256.w[1] > ten2mk128trunc[ind].w[1]
+                   || (R256.w[1] == ten2mk128trunc[ind].w[1]
+                       && R256.w[0] > ten2mk128trunc[ind].w[0])) {
+                 // set the inexact flag
+                 *pfpsf |= INEXACT_EXCEPTION;
+                 // this rounding is applied to C2 only!
+                 // x_sign != y_sign
+                 is_inexact_gt_midpoint = 1;
+               }       // else the result is exact
+             } else {  // the result is inexact; f2* <= 1/2
+               // set the inexact flag
+               *pfpsf |= INEXACT_EXCEPTION;
+               // this rounding is applied to C2 only!
+               // x_sign != y_sign
+               is_inexact_lt_midpoint = 1;
+             }
+           } else {    // if 22 <= ind <= 33
+             if (highf2star.w[1] > onehalf128[ind]
+                 || (highf2star.w[1] == onehalf128[ind]
+                     && (highf2star.w[0] || R256.w[1]
+                         || R256.w[0]))) {
+               // f2* > 1/2 and the result may be exact
+               // Calculate f2* - 1/2
+               // tmp64A = highf2star.w[0];
+               tmp64B = highf2star.w[1] - onehalf128[ind];
+               if (tmp64B || highf2star.w[0]
+                   || R256.w[1] > ten2mk128trunc[ind].w[1]
+                   || (R256.w[1] == ten2mk128trunc[ind].w[1]
+                       && R256.w[0] > ten2mk128trunc[ind].w[0])) {
+                 // set the inexact flag
+                 *pfpsf |= INEXACT_EXCEPTION;
+                 // this rounding is applied to C2 only!
+                 // x_sign != y_sign
+                 is_inexact_gt_midpoint = 1;
+               }       // else the result is exact
+             } else {  // the result is inexact; f2* <= 1/2
+               // set the inexact flag
+               *pfpsf |= INEXACT_EXCEPTION;
+               // this rounding is applied to C2 only!
+               // x_sign != y_sign
+               is_inexact_lt_midpoint = 1;
+             }
+           }
+           // check for midpoints after determining inexactness
+           if ((R256.w[1] || R256.w[0]) && (highf2star.w[1] == 0)
+               && (highf2star.w[0] == 0)
+               && (R256.w[1] < ten2mk128trunc[ind].w[1]
+                   || (R256.w[1] == ten2mk128trunc[ind].w[1]
+                       && R256.w[0] <= ten2mk128trunc[ind].w[0]))) {
+             // the result is a midpoint
+             if ((tmp64 + R256.w[2]) & 0x01) { // MP in [EVEN, ODD]
+               // if floor(C2*) is odd C = floor(C2*) - 1; the result may be 0
+               R256.w[2]--;
+               if (R256.w[2] == 0xffffffffffffffffull)
+                 R256.w[3]--;
+               // this rounding is applied to C2 only!
+               // x_sign != y_sign
+               is_midpoint_lt_even = 1;
+               is_inexact_lt_midpoint = 0;
+               is_inexact_gt_midpoint = 0;
+             } else {
+               // else MP in [ODD, EVEN]
+               // this rounding is applied to C2 only!
+               // x_sign != y_sign
+               is_midpoint_gt_even = 1;
+               is_inexact_lt_midpoint = 0;
+               is_inexact_gt_midpoint = 0;
+             }
+           }
+         } else {      // if (ind == -1) only when x1 = 0
+           R256.w[2] = C2_lo;
+           R256.w[3] = C2_hi;
+           is_midpoint_lt_even = 0;
+           is_midpoint_gt_even = 0;
+           is_inexact_lt_midpoint = 0;
+           is_inexact_gt_midpoint = 0;
+         }
+         // and now subtract C1 * 10^(e1-e2-x1) - (C2 * 10^(-x1))rnd,P34
+         // because x_sign != y_sign this last operation is exact
+         C1.w[0] = C1.w[0] - R256.w[2];
+         C1.w[1] = C1.w[1] - R256.w[3];
+         if (C1.w[0] > tmp64)
+           C1.w[1]--;  // borrow
+         if (C1.w[1] >= 0x8000000000000000ull) {       // negative coefficient!
+           C1.w[0] = ~C1.w[0];
+           C1.w[0]++;
+           C1.w[1] = ~C1.w[1];
+           if (C1.w[0] == 0x0)
+             C1.w[1]++;
+           tmp_sign = y_sign;  // the result will have the sign of y
+         } else {
+           tmp_sign = x_sign;
+         }
+         // the difference has exactly P34 digits
+         x_sign = tmp_sign;
+         if (x1 >= 1)
+           y_exp = y_exp + ((UINT64) x1 << 49);
+         C1_hi = C1.w[1];
+         C1_lo = C1.w[0];
+         // general correction from RN to RA, RM, RP, RZ; result uses y_exp
+         if (rnd_mode != ROUNDING_TO_NEAREST) {
+           if ((!x_sign
+                && ((rnd_mode == ROUNDING_UP && is_inexact_lt_midpoint)
+                    ||
+                    ((rnd_mode == ROUNDING_TIES_AWAY
+                      || rnd_mode == ROUNDING_UP)
+                     && is_midpoint_gt_even))) || (x_sign
+                                                   &&
+                                                   ((rnd_mode ==
+                                                     ROUNDING_DOWN
+                                                     &&
+                                                     is_inexact_lt_midpoint)
+                                                    ||
+                                                    ((rnd_mode ==
+                                                      ROUNDING_TIES_AWAY
+                                                      || rnd_mode ==
+                                                      ROUNDING_DOWN)
+                                                     &&
+                                                     is_midpoint_gt_even))))
+           {
+             // C1 = C1 + 1
+             C1_lo = C1_lo + 1;
+             if (C1_lo == 0) { // rounding overflow in the low 64 bits
+               C1_hi = C1_hi + 1;
+             }
+             if (C1_hi == 0x0001ed09bead87c0ull
+                 && C1_lo == 0x378d8e6400000000ull) {
+               // C1 = 10^34 => rounding overflow
+               C1_hi = 0x0000314dc6448d93ull;
+               C1_lo = 0x38c15b0a00000000ull;  // 10^33
+               y_exp = y_exp + EXP_P1;
+             }
+           } else if ((is_midpoint_lt_even || is_inexact_gt_midpoint)
+                      &&
+                      ((x_sign
+                        && (rnd_mode == ROUNDING_UP
+                            || rnd_mode == ROUNDING_TO_ZERO))
+                       || (!x_sign
+                           && (rnd_mode == ROUNDING_DOWN
+                               || rnd_mode == ROUNDING_TO_ZERO)))) {
+             // C1 = C1 - 1
+             C1_lo = C1_lo - 1;
+             if (C1_lo == 0xffffffffffffffffull)
+               C1_hi--;
+             // check if we crossed into the lower decade
+             if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) {   // 10^33 - 1
+               C1_hi = 0x0001ed09bead87c0ull;  // 10^34 - 1
+               C1_lo = 0x378d8e63ffffffffull;
+               y_exp = y_exp - EXP_P1;
+               // no underflow, because delta + q2 >= P34 + 1
+             }
+           } else {
+             ; // exact, the result is already correct
+           }
+         }
+         // assemble the result
+         res.w[1] = x_sign | y_exp | C1_hi;
+         res.w[0] = C1_lo;
+       }
+      }        // end delta = P34
+    } else {   // if (|delta| <= P34 - 1)
+      if (delta >= 0) {        // if (0 <= delta <= P34 - 1)
+       if (delta <= P34 - 1 - q2) {
+         // calculate C' directly; the result is exact
+         // in this case 1<=q1<=P34-1, 1<=q2<=P34-1 and 0 <= e1-e2 <= P34-2
+         // The coefficient of the result is C1 * 10^(e1-e2) + C2 and the
+         // exponent is e2; either C1 or 10^(e1-e2) may not fit is 64 bits,
+         // but their product fits with certainty in 128 bits (actually in 113)
+         scale = delta - q1 + q2;      // scale = (int)(e1 >> 49) - (int)(e2 >> 49) 
+
+         if (scale >= 20) {    // 10^(e1-e2) does not fit in 64 bits, but C1 does
+           __mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
+           C1_hi = C1.w[1];
+           C1_lo = C1.w[0];
+         } else if (scale >= 1) {
+           // if 1 <= scale <= 19 then 10^(e1-e2) fits in 64 bits 
+           if (q1 <= 19) {     // C1 fits in 64 bits
+             __mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
+           } else {    // q1 >= 20
+             C1.w[1] = C1_hi;
+             C1.w[0] = C1_lo;
+             __mul_128x64_to_128 (C1, ten2k64[scale], C1);
+           }
+           C1_hi = C1.w[1];
+           C1_lo = C1.w[0];
+         } else {      // if (scale == 0) C1 is unchanged
+           C1.w[0] = C1_lo;    // C1.w[1] = C1_hi; 
+         }
+         // now add C2
+         if (x_sign == y_sign) {
+           // the result cannot overflow
+           C1_lo = C1_lo + C2_lo;
+           C1_hi = C1_hi + C2_hi;
+           if (C1_lo < C1.w[0])
+             C1_hi++;
+         } else {      // if x_sign != y_sign
+           C1_lo = C1_lo - C2_lo;
+           C1_hi = C1_hi - C2_hi;
+           if (C1_lo > C1.w[0])
+             C1_hi--;
+           // the result can be zero, but it cannot overflow
+           if (C1_lo == 0 && C1_hi == 0) {
+             // assemble the result
+             if (x_exp < y_exp)
+               res.w[1] = x_exp;
+             else
+               res.w[1] = y_exp;
+             res.w[0] = 0;
+             if (rnd_mode == ROUNDING_DOWN) {
+               res.w[1] |= 0x8000000000000000ull;
+             }
+             BID_SWAP128 (res);
+             BID_RETURN (res);
+           }
+           if (C1_hi >= 0x8000000000000000ull) {       // negative coefficient!
+             C1_lo = ~C1_lo;
+             C1_lo++;
+             C1_hi = ~C1_hi;
+             if (C1_lo == 0x0)
+               C1_hi++;
+             x_sign = y_sign;  // the result will have the sign of y
+           }
+         }
+         // assemble the result
+         res.w[1] = x_sign | y_exp | C1_hi;
+         res.w[0] = C1_lo;
+       } else if (delta == P34 - q2) {
+         // calculate C' directly; the result may be inexact if it requires 
+         // P34+1 decimal digits; in this case the 'cutoff' point for addition
+         // is at the position of the lsb of C2, so 0 <= e1-e2 <= P34-1
+         // The coefficient of the result is C1 * 10^(e1-e2) + C2 and the
+         // exponent is e2; either C1 or 10^(e1-e2) may not fit is 64 bits,
+         // but their product fits with certainty in 128 bits (actually in 113)
+         scale = delta - q1 + q2;      // scale = (int)(e1 >> 49) - (int)(e2 >> 49)
+         if (scale >= 20) {    // 10^(e1-e2) does not fit in 64 bits, but C1 does
+           __mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
+         } else if (scale >= 1) {
+           // if 1 <= scale <= 19 then 10^(e1-e2) fits in 64 bits
+           if (q1 <= 19) {     // C1 fits in 64 bits
+             __mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
+           } else {    // q1 >= 20
+             C1.w[1] = C1_hi;
+             C1.w[0] = C1_lo;
+             __mul_128x64_to_128 (C1, ten2k64[scale], C1);
+           }
+         } else {      // if (scale == 0) C1 is unchanged
+           C1.w[1] = C1_hi;
+           C1.w[0] = C1_lo;    // only the low part is necessary
+         }
+         C1_hi = C1.w[1];
+         C1_lo = C1.w[0];
+         // now add C2
+         if (x_sign == y_sign) {
+           // the result can overflow!
+           C1_lo = C1_lo + C2_lo;
+           C1_hi = C1_hi + C2_hi;
+           if (C1_lo < C1.w[0])
+             C1_hi++;
+           // test for overflow, possible only when C1 >= 10^34
+           if (C1_hi > 0x0001ed09bead87c0ull || (C1_hi == 0x0001ed09bead87c0ull && C1_lo >= 0x378d8e6400000000ull)) {  // C1 >= 10^34
+             // in this case q = P34 + 1 and x = q - P34 = 1, so multiply 
+             // C'' = C'+ 5 = C1 + 5 by k1 ~ 10^(-1) calculated for P34 + 1 
+             // decimal digits
+             // Calculate C'' = C' + 1/2 * 10^x
+             if (C1_lo >= 0xfffffffffffffffbull) {     // low half add has carry
+               C1_lo = C1_lo + 5;
+               C1_hi = C1_hi + 1;
+             } else {
+               C1_lo = C1_lo + 5;
+             }
+             // the approximation of 10^(-1) was rounded up to 118 bits
+             // 10^(-1) =~ 33333333333333333333333333333400 * 2^-129
+             // 10^(-1) =~ 19999999999999999999999999999a00 * 2^-128
+             C1.w[1] = C1_hi;
+             C1.w[0] = C1_lo;  // C''
+             ten2m1.w[1] = 0x1999999999999999ull;
+             ten2m1.w[0] = 0x9999999999999a00ull;
+             __mul_128x128_to_256 (P256, C1, ten2m1);  // P256 = C*, f*
+             // C* is actually floor(C*) in this case
+             // the top Ex = 128 bits of 10^(-1) are 
+             // T* = 0x00199999999999999999999999999999
+             // if (0 < f* < 10^(-x)) then
+             //   if floor(C*) is even then C = floor(C*) - logical right 
+             //       shift; C has p decimal digits, correct by Prop. 1)
+             //   else if floor(C*) is odd C = floor(C*) - 1 (logical right
+             //       shift; C has p decimal digits, correct by Pr. 1)
+             // else
+             //   C = floor(C*) (logical right shift; C has p decimal digits,
+             //       correct by Property 1)
+             // n = C * 10^(e2+x)
+             if ((P256.w[1] || P256.w[0])
+                 && (P256.w[1] < 0x1999999999999999ull
+                     || (P256.w[1] == 0x1999999999999999ull
+                         && P256.w[0] <= 0x9999999999999999ull))) {
+               // the result is a midpoint
+               if (P256.w[2] & 0x01) {
+                 is_midpoint_gt_even = 1;
+                 // if floor(C*) is odd C = floor(C*) - 1; the result is not 0
+                 P256.w[2]--;
+                 if (P256.w[2] == 0xffffffffffffffffull)
+                   P256.w[3]--;
+               } else {
+                 is_midpoint_lt_even = 1;
+               }
+             }
+             // n = Cstar * 10^(e2+1)
+             y_exp = y_exp + EXP_P1;
+             // C* != 10^P because C* has P34 digits
+             // check for overflow
+             if (y_exp == EXP_MAX_P1
+                 && (rnd_mode == ROUNDING_TO_NEAREST
+                     || rnd_mode == ROUNDING_TIES_AWAY)) {
+               // overflow for RN
+               res.w[1] = x_sign | 0x7800000000000000ull;      // +/-inf
+               res.w[0] = 0x0ull;
+               // set the inexact flag
+               *pfpsf |= INEXACT_EXCEPTION;
+               // set the overflow flag
+               *pfpsf |= OVERFLOW_EXCEPTION;
+               BID_SWAP128 (res);
+               BID_RETURN (res);
+             }
+             // if (0 < f* - 1/2 < 10^(-x)) then 
+             //   the result of the addition is exact 
+             // else 
+             //   the result of the addition is inexact
+             if (P256.w[1] > 0x8000000000000000ull || (P256.w[1] == 0x8000000000000000ull && P256.w[0] > 0x0ull)) {    // the result may be exact
+               tmp64 = P256.w[1] - 0x8000000000000000ull;      // f* - 1/2
+               if ((tmp64 > 0x1999999999999999ull
+                    || (tmp64 == 0x1999999999999999ull
+                        && P256.w[0] >= 0x9999999999999999ull))) {
+                 // set the inexact flag
+                 *pfpsf |= INEXACT_EXCEPTION;
+                 is_inexact = 1;
+               }       // else the result is exact
+             } else {  // the result is inexact
+               // set the inexact flag
+               *pfpsf |= INEXACT_EXCEPTION;
+               is_inexact = 1;
+             }
+             C1_hi = P256.w[3];
+             C1_lo = P256.w[2];
+             if (!is_midpoint_gt_even && !is_midpoint_lt_even) {
+               is_inexact_lt_midpoint = is_inexact
+                 && (P256.w[1] & 0x8000000000000000ull);
+               is_inexact_gt_midpoint = is_inexact
+                 && !(P256.w[1] & 0x8000000000000000ull);
+             }
+             // general correction from RN to RA, RM, RP, RZ; 
+             // result uses y_exp
+             if (rnd_mode != ROUNDING_TO_NEAREST) {
+               if ((!x_sign
+                    &&
+                    ((rnd_mode == ROUNDING_UP
+                      && is_inexact_lt_midpoint)
+                     ||
+                     ((rnd_mode == ROUNDING_TIES_AWAY
+                       || rnd_mode == ROUNDING_UP)
+                      && is_midpoint_gt_even))) || (x_sign
+                                                    &&
+                                                    ((rnd_mode ==
+                                                      ROUNDING_DOWN
+                                                      &&
+                                                      is_inexact_lt_midpoint)
+                                                     ||
+                                                     ((rnd_mode ==
+                                                       ROUNDING_TIES_AWAY
+                                                       || rnd_mode ==
+                                                       ROUNDING_DOWN)
+                                                      &&
+                                                      is_midpoint_gt_even))))
+               {
+                 // C1 = C1 + 1
+                 C1_lo = C1_lo + 1;
+                 if (C1_lo == 0) {     // rounding overflow in the low 64 bits
+                   C1_hi = C1_hi + 1;
+                 }
+                 if (C1_hi == 0x0001ed09bead87c0ull
+                     && C1_lo == 0x378d8e6400000000ull) {
+                   // C1 = 10^34 => rounding overflow
+                   C1_hi = 0x0000314dc6448d93ull;
+                   C1_lo = 0x38c15b0a00000000ull;      // 10^33
+                   y_exp = y_exp + EXP_P1;
+                 }
+               } else
+                 if ((is_midpoint_lt_even || is_inexact_gt_midpoint)
+                     &&
+                     ((x_sign
+                       && (rnd_mode == ROUNDING_UP
+                           || rnd_mode == ROUNDING_TO_ZERO))
+                      || (!x_sign
+                          && (rnd_mode == ROUNDING_DOWN
+                              || rnd_mode == ROUNDING_TO_ZERO)))) {
+                 // C1 = C1 - 1
+                 C1_lo = C1_lo - 1;
+                 if (C1_lo == 0xffffffffffffffffull)
+                   C1_hi--;
+                 // check if we crossed into the lower decade
+                 if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) {       // 10^33 - 1
+                   C1_hi = 0x0001ed09bead87c0ull;      // 10^34 - 1
+                   C1_lo = 0x378d8e63ffffffffull;
+                   y_exp = y_exp - EXP_P1;
+                   // no underflow, because delta + q2 >= P34 + 1
+                 }
+               } else {
+                 ;     // exact, the result is already correct
+               }
+               // in all cases check for overflow (RN and RA solved already)
+               if (y_exp == EXP_MAX_P1) {      // overflow
+                 if ((rnd_mode == ROUNDING_DOWN && x_sign) ||  // RM and res < 0
+                     (rnd_mode == ROUNDING_UP && !x_sign)) {   // RP and res > 0
+                   C1_hi = 0x7800000000000000ull;      // +inf
+                   C1_lo = 0x0ull;
+                 } else {      // RM and res > 0, RP and res < 0, or RZ
+                   C1_hi = 0x5fffed09bead87c0ull;
+                   C1_lo = 0x378d8e63ffffffffull;
+                 }
+                 y_exp = 0;    // x_sign is preserved
+                 // set the inexact flag (in case the exact addition was exact)
+                 *pfpsf |= INEXACT_EXCEPTION;
+                 // set the overflow flag
+                 *pfpsf |= OVERFLOW_EXCEPTION;
+               }
+             }
+           }   // else if (C1 < 10^34) then C1 is the coeff.; the result is exact
+         } else {      // if x_sign != y_sign the result is exact
+           C1_lo = C1_lo - C2_lo;
+           C1_hi = C1_hi - C2_hi;
+           if (C1_lo > C1.w[0])
+             C1_hi--;
+           // the result can be zero, but it cannot overflow
+           if (C1_lo == 0 && C1_hi == 0) {
+             // assemble the result
+             if (x_exp < y_exp)
+               res.w[1] = x_exp;
+             else
+               res.w[1] = y_exp;
+             res.w[0] = 0;
+             if (rnd_mode == ROUNDING_DOWN) {
+               res.w[1] |= 0x8000000000000000ull;
+             }
+             BID_SWAP128 (res);
+             BID_RETURN (res);
+           }
+           if (C1_hi >= 0x8000000000000000ull) {       // negative coefficient!
+             C1_lo = ~C1_lo;
+             C1_lo++;
+             C1_hi = ~C1_hi;
+             if (C1_lo == 0x0)
+               C1_hi++;
+             x_sign = y_sign;  // the result will have the sign of y
+           }
+         }
+         // assemble the result
+         res.w[1] = x_sign | y_exp | C1_hi;
+         res.w[0] = C1_lo;
+       } else {        // if (delta >= P34 + 1 - q2)
+         // instead of C' = (C1 * 10^(e1-e2) + C2)rnd,P34
+         // calculate C' = C1 * 10^(e1-e2-x1) + (C2 * 10^(-x1))rnd,P34 
+         // where x1 = q1 + e1 - e2 - P34, 1 <= x1 <= P34 - 1
+         // In most cases C' will have P34 digits, and n = C' * 10^(e2+x1)
+         // If the result has P34+1 digits, redo the steps above with x1+1
+         // If the result has P34-1 digits or less, redo the steps above with 
+         // x1-1 but only if initially x1 >= 1
+         // NOTE: these two steps can be improved, e.g we could guess if
+         // P34+1 or P34-1 digits will be obtained by adding/subtracting just
+         // the top 64 bits of the two operands
+         // The result cannot be zero, but it can overflow
+         x1 = delta + q2 - P34;        // 1 <= x1 <= P34-1
+       roundC2:
+         // Calculate C1 * 10^(e1-e2-x1) where 0 <= e1-e2-x1 <= P34 - 1
+         // scale = (int)(e1 >> 49) - (int)(e2 >> 49) - x1; 0 <= scale <= P34-1
+         scale = delta - q1 + q2 - x1; // scale = e1 - e2 - x1 = P34 - q1
+         // either C1 or 10^(e1-e2-x1) may not fit is 64 bits,
+         // but their product fits with certainty in 128 bits (actually in 113)
+         if (scale >= 20) {    //10^(e1-e2-x1) doesn't fit in 64 bits, but C1 does
+           __mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
+         } else if (scale >= 1) {
+           // if 1 <= scale <= 19 then 10^(e1-e2-x1) fits in 64 bits
+           if (q1 <= 19) {     // C1 fits in 64 bits
+             __mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
+           } else {    // q1 >= 20
+             C1.w[1] = C1_hi;
+             C1.w[0] = C1_lo;
+             __mul_128x64_to_128 (C1, ten2k64[scale], C1);
+           }
+         } else {      // if (scale == 0) C1 is unchanged
+           C1.w[1] = C1_hi;
+           C1.w[0] = C1_lo;
+         }
+         tmp64 = C1.w[0];      // C1.w[1], C1.w[0] contains C1 * 10^(e1-e2-x1)
+
+         // now round C2 to q2-x1 decimal digits, where 1<=x1<=q2-1<=P34-1
+         // (but if we got here a second time after x1 = x1 - 1, then 
+         // x1 >= 0; note that for x1 = 0 C2 is unchanged)
+         // C2' = C2 + 1/2 * 10^x1 = C2 + 5 * 10^(x1-1)
+         ind = x1 - 1; // 0 <= ind <= q2-2<=P34-2=32; but note that if x1 = 0
+         // during a second pass, then ind = -1
+         if (ind >= 0) {       // if (x1 >= 1)
+           C2.w[0] = C2_lo;
+           C2.w[1] = C2_hi;
+           if (ind <= 18) {
+             C2.w[0] = C2.w[0] + midpoint64[ind];
+             if (C2.w[0] < C2_lo)
+               C2.w[1]++;
+           } else {    // 19 <= ind <= 32
+             C2.w[0] = C2.w[0] + midpoint128[ind - 19].w[0];
+             C2.w[1] = C2.w[1] + midpoint128[ind - 19].w[1];
+             if (C2.w[0] < C2_lo)
+               C2.w[1]++;
+           }
+           // the approximation of 10^(-x1) was rounded up to 118 bits
+           __mul_128x128_to_256 (R256, C2, ten2mk128[ind]);    // R256 = C2*, f2*
+           // calculate C2* and f2*
+           // C2* is actually floor(C2*) in this case
+           // C2* and f2* need shifting and masking, as shown by
+           // shiftright128[] and maskhigh128[]
+           // the top Ex bits of 10^(-x1) are T* = ten2mk128trunc[ind], e.g.
+           // if x1=1, T*=ten2mk128trunc[0]=0x19999999999999999999999999999999
+           // if (0 < f2* < 10^(-x1)) then
+           //   if floor(C1+C2*) is even then C2* = floor(C2*) - logical right
+           //       shift; C2* has p decimal digits, correct by Prop. 1)
+           //   else if floor(C1+C2*) is odd C2* = floor(C2*)-1 (logical right
+           //       shift; C2* has p decimal digits, correct by Pr. 1)
+           // else
+           //   C2* = floor(C2*) (logical right shift; C has p decimal digits,
+           //       correct by Property 1)
+           // n = C2* * 10^(e2+x1)
+
+           if (ind <= 2) {
+             highf2star.w[1] = 0x0;
+             highf2star.w[0] = 0x0;    // low f2* ok
+           } else if (ind <= 21) {
+             highf2star.w[1] = 0x0;
+             highf2star.w[0] = R256.w[2] & maskhigh128[ind];   // low f2* ok
+           } else {
+             highf2star.w[1] = R256.w[3] & maskhigh128[ind];
+             highf2star.w[0] = R256.w[2];      // low f2* is ok
+           }
+           // shift right C2* by Ex-128 = shiftright128[ind]
+           if (ind >= 3) {
+             shift = shiftright128[ind];
+             if (shift < 64) { // 3 <= shift <= 63
+               R256.w[2] =
+                 (R256.w[2] >> shift) | (R256.w[3] << (64 - shift));
+               R256.w[3] = (R256.w[3] >> shift);
+             } else {  // 66 <= shift <= 102
+               R256.w[2] = (R256.w[3] >> (shift - 64));
+               R256.w[3] = 0x0ULL;
+             }
+           }
+           if (second_pass) {
+             is_inexact_lt_midpoint = 0;
+             is_inexact_gt_midpoint = 0;
+             is_midpoint_lt_even = 0;
+             is_midpoint_gt_even = 0;
+           }
+           // determine inexactness of the rounding of C2* (this may be 
+           // followed by a second rounding only if we get P34+1 
+           // decimal digits)
+           // if (0 < f2* - 1/2 < 10^(-x1)) then
+           //   the result is exact
+           // else (if f2* - 1/2 > T* then)
+           //   the result of is inexact
+           if (ind <= 2) {
+             if (R256.w[1] > 0x8000000000000000ull ||
+                 (R256.w[1] == 0x8000000000000000ull
+                  && R256.w[0] > 0x0ull)) {
+               // f2* > 1/2 and the result may be exact
+               tmp64A = R256.w[1] - 0x8000000000000000ull;     // f* - 1/2
+               if ((tmp64A > ten2mk128trunc[ind].w[1]
+                    || (tmp64A == ten2mk128trunc[ind].w[1]
+                        && R256.w[0] >= ten2mk128trunc[ind].w[0]))) {
+                 // set the inexact flag
+                 // *pfpsf |= INEXACT_EXCEPTION;
+                 tmp_inexact = 1;      // may be set again during a second pass
+                 // this rounding is applied to C2 only!
+                 if (x_sign == y_sign)
+                   is_inexact_lt_midpoint = 1;
+                 else  // if (x_sign != y_sign)
+                   is_inexact_gt_midpoint = 1;
+               }       // else the result is exact
+               // rounding down, unless a midpoint in [ODD, EVEN]
+             } else {  // the result is inexact; f2* <= 1/2
+               // set the inexact flag
+               // *pfpsf |= INEXACT_EXCEPTION;
+               tmp_inexact = 1;        // just in case we will round a second time
+               // rounding up, unless a midpoint in [EVEN, ODD]
+               // this rounding is applied to C2 only!
+               if (x_sign == y_sign)
+                 is_inexact_gt_midpoint = 1;
+               else    // if (x_sign != y_sign)
+                 is_inexact_lt_midpoint = 1;
+             }
+           } else if (ind <= 21) {     // if 3 <= ind <= 21
+             if (highf2star.w[1] > 0x0 || (highf2star.w[1] == 0x0
+                                           && highf2star.w[0] >
+                                           onehalf128[ind])
+                 || (highf2star.w[1] == 0x0
+                     && highf2star.w[0] == onehalf128[ind]
+                     && (R256.w[1] || R256.w[0]))) {
+               // f2* > 1/2 and the result may be exact
+               // Calculate f2* - 1/2
+               tmp64A = highf2star.w[0] - onehalf128[ind];
+               tmp64B = highf2star.w[1];
+               if (tmp64A > highf2star.w[0])
+                 tmp64B--;
+               if (tmp64B || tmp64A
+                   || R256.w[1] > ten2mk128trunc[ind].w[1]
+                   || (R256.w[1] == ten2mk128trunc[ind].w[1]
+                       && R256.w[0] > ten2mk128trunc[ind].w[0])) {
+                 // set the inexact flag
+                 // *pfpsf |= INEXACT_EXCEPTION;
+                 tmp_inexact = 1;      // may be set again during a second pass
+                 // this rounding is applied to C2 only!
+                 if (x_sign == y_sign)
+                   is_inexact_lt_midpoint = 1;
+                 else  // if (x_sign != y_sign)
+                   is_inexact_gt_midpoint = 1;
+               }       // else the result is exact
+             } else {  // the result is inexact; f2* <= 1/2
+               // set the inexact flag
+               // *pfpsf |= INEXACT_EXCEPTION;
+               tmp_inexact = 1;        // may be set again during a second pass
+               // rounding up, unless a midpoint in [EVEN, ODD]
+               // this rounding is applied to C2 only!
+               if (x_sign == y_sign)
+                 is_inexact_gt_midpoint = 1;
+               else    // if (x_sign != y_sign)
+                 is_inexact_lt_midpoint = 1;
+             }
+           } else {    // if 22 <= ind <= 33
+             if (highf2star.w[1] > onehalf128[ind]
+                 || (highf2star.w[1] == onehalf128[ind]
+                     && (highf2star.w[0] || R256.w[1]
+                         || R256.w[0]))) {
+               // f2* > 1/2 and the result may be exact
+               // Calculate f2* - 1/2
+               // tmp64A = highf2star.w[0];
+               tmp64B = highf2star.w[1] - onehalf128[ind];
+               if (tmp64B || highf2star.w[0]
+                   || R256.w[1] > ten2mk128trunc[ind].w[1]
+                   || (R256.w[1] == ten2mk128trunc[ind].w[1]
+                       && R256.w[0] > ten2mk128trunc[ind].w[0])) {
+                 // set the inexact flag
+                 // *pfpsf |= INEXACT_EXCEPTION;
+                 tmp_inexact = 1;      // may be set again during a second pass
+                 // this rounding is applied to C2 only!
+                 if (x_sign == y_sign)
+                   is_inexact_lt_midpoint = 1;
+                 else  // if (x_sign != y_sign)
+                   is_inexact_gt_midpoint = 1;
+               }       // else the result is exact
+             } else {  // the result is inexact; f2* <= 1/2
+               // set the inexact flag
+               // *pfpsf |= INEXACT_EXCEPTION;
+               tmp_inexact = 1;        // may be set again during a second pass
+               // rounding up, unless a midpoint in [EVEN, ODD]
+               // this rounding is applied to C2 only!
+               if (x_sign == y_sign)
+                 is_inexact_gt_midpoint = 1;
+               else    // if (x_sign != y_sign)
+                 is_inexact_lt_midpoint = 1;
+             }
+           }
+           // check for midpoints
+           if ((R256.w[1] || R256.w[0]) && (highf2star.w[1] == 0)
+               && (highf2star.w[0] == 0)
+               && (R256.w[1] < ten2mk128trunc[ind].w[1]
+                   || (R256.w[1] == ten2mk128trunc[ind].w[1]
+                       && R256.w[0] <= ten2mk128trunc[ind].w[0]))) {
+             // the result is a midpoint
+             if ((tmp64 + R256.w[2]) & 0x01) { // MP in [EVEN, ODD]
+               // if floor(C2*) is odd C = floor(C2*) - 1; the result may be 0
+               R256.w[2]--;
+               if (R256.w[2] == 0xffffffffffffffffull)
+                 R256.w[3]--;
+               // this rounding is applied to C2 only!
+               if (x_sign == y_sign)
+                 is_midpoint_gt_even = 1;
+               else    // if (x_sign != y_sign)
+                 is_midpoint_lt_even = 1;
+               is_inexact_lt_midpoint = 0;
+               is_inexact_gt_midpoint = 0;
+             } else {
+               // else MP in [ODD, EVEN]
+               // this rounding is applied to C2 only!
+               if (x_sign == y_sign)
+                 is_midpoint_lt_even = 1;
+               else    // if (x_sign != y_sign)
+                 is_midpoint_gt_even = 1;
+               is_inexact_lt_midpoint = 0;
+               is_inexact_gt_midpoint = 0;
+             }
+           }
+           // end if (ind >= 0)
+         } else {      // if (ind == -1); only during a 2nd pass, and when x1 = 0
+           R256.w[2] = C2_lo;
+           R256.w[3] = C2_hi;
+           tmp_inexact = 0;
+           // to correct a possible setting to 1 from 1st pass
+           if (second_pass) {
+             is_midpoint_lt_even = 0;
+             is_midpoint_gt_even = 0;
+             is_inexact_lt_midpoint = 0;
+             is_inexact_gt_midpoint = 0;
+           }
+         }
+         // and now add/subtract C1 * 10^(e1-e2-x1) +/- (C2 * 10^(-x1))rnd,P34
+         if (x_sign == y_sign) {       // addition; could overflow
+           // no second pass is possible this way (only for x_sign != y_sign)
+           C1.w[0] = C1.w[0] + R256.w[2];
+           C1.w[1] = C1.w[1] + R256.w[3];
+           if (C1.w[0] < tmp64)
+             C1.w[1]++;        // carry
+           // if the sum has P34+1 digits, i.e. C1>=10^34 redo the calculation
+           // with x1=x1+1 
+           if (C1.w[1] > 0x0001ed09bead87c0ull || (C1.w[1] == 0x0001ed09bead87c0ull && C1.w[0] >= 0x378d8e6400000000ull)) {    // C1 >= 10^34
+             // chop off one more digit from the sum, but make sure there is
+             // no double-rounding error (see table - double rounding logic)
+             // now round C1 from P34+1 to P34 decimal digits
+             // C1' = C1 + 1/2 * 10 = C1 + 5
+             if (C1.w[0] >= 0xfffffffffffffffbull) {   // low half add has carry
+               C1.w[0] = C1.w[0] + 5;
+               C1.w[1] = C1.w[1] + 1;
+             } else {
+               C1.w[0] = C1.w[0] + 5;
+             }
+             // the approximation of 10^(-1) was rounded up to 118 bits
+             __mul_128x128_to_256 (Q256, C1, ten2mk128[0]);    // Q256 = C1*, f1*
+             // C1* is actually floor(C1*) in this case
+             // the top 128 bits of 10^(-1) are
+             // T* = ten2mk128trunc[0]=0x19999999999999999999999999999999
+             // if (0 < f1* < 10^(-1)) then
+             //   if floor(C1*) is even then C1* = floor(C1*) - logical right
+             //       shift; C1* has p decimal digits, correct by Prop. 1)
+             //   else if floor(C1*) is odd C1* = floor(C1*) - 1 (logical right
+             //       shift; C1* has p decimal digits, correct by Pr. 1)
+             // else
+             //   C1* = floor(C1*) (logical right shift; C has p decimal digits
+             //       correct by Property 1)
+             // n = C1* * 10^(e2+x1+1)
+             if ((Q256.w[1] || Q256.w[0])
+                 && (Q256.w[1] < ten2mk128trunc[0].w[1]
+                     || (Q256.w[1] == ten2mk128trunc[0].w[1]
+                         && Q256.w[0] <= ten2mk128trunc[0].w[0]))) {
+               // the result is a midpoint
+               if (is_inexact_lt_midpoint) {   // for the 1st rounding
+                 is_inexact_gt_midpoint = 1;
+                 is_inexact_lt_midpoint = 0;
+                 is_midpoint_gt_even = 0;
+                 is_midpoint_lt_even = 0;
+               } else if (is_inexact_gt_midpoint) {    // for the 1st rounding
+                 Q256.w[2]--;
+                 if (Q256.w[2] == 0xffffffffffffffffull)
+                   Q256.w[3]--;
+                 is_inexact_gt_midpoint = 0;
+                 is_inexact_lt_midpoint = 1;
+                 is_midpoint_gt_even = 0;
+                 is_midpoint_lt_even = 0;
+               } else if (is_midpoint_gt_even) {       // for the 1st rounding
+                 // Note: cannot have is_midpoint_lt_even
+                 is_inexact_gt_midpoint = 0;
+                 is_inexact_lt_midpoint = 1;
+                 is_midpoint_gt_even = 0;
+                 is_midpoint_lt_even = 0;
+               } else {        // the first rounding must have been exact
+                 if (Q256.w[2] & 0x01) {       // MP in [EVEN, ODD]
+                   // the truncated result is correct
+                   Q256.w[2]--;
+                   if (Q256.w[2] == 0xffffffffffffffffull)
+                     Q256.w[3]--;
+                   is_inexact_gt_midpoint = 0;
+                   is_inexact_lt_midpoint = 0;
+                   is_midpoint_gt_even = 1;
+                   is_midpoint_lt_even = 0;
+                 } else {      // MP in [ODD, EVEN]
+                   is_inexact_gt_midpoint = 0;
+                   is_inexact_lt_midpoint = 0;
+                   is_midpoint_gt_even = 0;
+                   is_midpoint_lt_even = 1;
+                 }
+               }
+               tmp_inexact = 1;        // in all cases
+             } else {  // the result is not a midpoint 
+               // determine inexactness of the rounding of C1 (the sum C1+C2*)
+               // if (0 < f1* - 1/2 < 10^(-1)) then
+               //   the result is exact
+               // else (if f1* - 1/2 > T* then)
+               //   the result of is inexact
+               // ind = 0
+               if (Q256.w[1] > 0x8000000000000000ull
+                   || (Q256.w[1] == 0x8000000000000000ull
+                       && Q256.w[0] > 0x0ull)) {
+                 // f1* > 1/2 and the result may be exact
+                 Q256.w[1] = Q256.w[1] - 0x8000000000000000ull;        // f1* - 1/2
+                 if ((Q256.w[1] > ten2mk128trunc[0].w[1]
+                      || (Q256.w[1] == ten2mk128trunc[0].w[1]
+                          && Q256.w[0] > ten2mk128trunc[0].w[0]))) {
+                   is_inexact_gt_midpoint = 0;
+                   is_inexact_lt_midpoint = 1;
+                   is_midpoint_gt_even = 0;
+                   is_midpoint_lt_even = 0;
+                   // set the inexact flag
+                   tmp_inexact = 1;
+                   // *pfpsf |= INEXACT_EXCEPTION;
+                 } else {      // else the result is exact for the 2nd rounding
+                   if (tmp_inexact) {  // if the previous rounding was inexact
+                     if (is_midpoint_lt_even) {
+                       is_inexact_gt_midpoint = 1;
+                       is_midpoint_lt_even = 0;
+                     } else if (is_midpoint_gt_even) {
+                       is_inexact_lt_midpoint = 1;
+                       is_midpoint_gt_even = 0;
+                     } else {
+                       ;       // no change
+                     }
+                   }
+                 }
+                 // rounding down, unless a midpoint in [ODD, EVEN]
+               } else {        // the result is inexact; f1* <= 1/2
+                 is_inexact_gt_midpoint = 1;
+                 is_inexact_lt_midpoint = 0;
+                 is_midpoint_gt_even = 0;
+                 is_midpoint_lt_even = 0;
+                 // set the inexact flag
+                 tmp_inexact = 1;
+                 // *pfpsf |= INEXACT_EXCEPTION;
+               }
+             } // end 'the result is not a midpoint'
+             // n = C1 * 10^(e2+x1)
+             C1.w[1] = Q256.w[3];
+             C1.w[0] = Q256.w[2];
+             y_exp = y_exp + ((UINT64) (x1 + 1) << 49);
+           } else {    // C1 < 10^34
+             // C1.w[1] and C1.w[0] already set
+             // n = C1 * 10^(e2+x1)
+             y_exp = y_exp + ((UINT64) x1 << 49);
+           }
+           // check for overflow
+           if (y_exp == EXP_MAX_P1
+               && (rnd_mode == ROUNDING_TO_NEAREST
+                   || rnd_mode == ROUNDING_TIES_AWAY)) {
+             res.w[1] = 0x7800000000000000ull | x_sign;        // +/-inf
+             res.w[0] = 0x0ull;
+             // set the inexact flag
+             *pfpsf |= INEXACT_EXCEPTION;
+             // set the overflow flag
+             *pfpsf |= OVERFLOW_EXCEPTION;
+             BID_SWAP128 (res);
+             BID_RETURN (res);
+           }   // else no overflow
+         } else {      // if x_sign != y_sign the result of this subtract. is exact
+           C1.w[0] = C1.w[0] - R256.w[2];
+           C1.w[1] = C1.w[1] - R256.w[3];
+           if (C1.w[0] > tmp64)
+             C1.w[1]--;        // borrow
+           if (C1.w[1] >= 0x8000000000000000ull) {     // negative coefficient!
+             C1.w[0] = ~C1.w[0];
+             C1.w[0]++;
+             C1.w[1] = ~C1.w[1];
+             if (C1.w[0] == 0x0)
+               C1.w[1]++;
+             tmp_sign = y_sign;
+             // the result will have the sign of y if last rnd
+           } else {
+             tmp_sign = x_sign;
+           }
+           // if the difference has P34-1 digits or less, i.e. C1 < 10^33 then
+           //   redo the calculation with x1=x1-1;
+           // redo the calculation also if C1 = 10^33 and 
+           //   (is_inexact_gt_midpoint or is_midpoint_lt_even);
+           //   (the last part should have really been 
+           //   (is_inexact_lt_midpoint or is_midpoint_gt_even) from
+           //    the rounding of C2, but the position flags have been reversed)
+           // 10^33 = 0x0000314dc6448d93 0x38c15b0a00000000
+           if ((C1.w[1] < 0x0000314dc6448d93ull || (C1.w[1] == 0x0000314dc6448d93ull && C1.w[0] < 0x38c15b0a00000000ull)) || (C1.w[1] == 0x0000314dc6448d93ull && C1.w[0] == 0x38c15b0a00000000ull && (is_inexact_gt_midpoint || is_midpoint_lt_even))) {      // C1=10^33
+             x1 = x1 - 1;      // x1 >= 0
+             if (x1 >= 0) {
+               // clear position flags and tmp_inexact
+               is_midpoint_lt_even = 0;
+               is_midpoint_gt_even = 0;
+               is_inexact_lt_midpoint = 0;
+               is_inexact_gt_midpoint = 0;
+               tmp_inexact = 0;
+               second_pass = 1;
+               goto roundC2;   // else result has less than P34 digits
+             }
+           }
+           // if the coefficient of the result is 10^34 it means that this
+           // must be the second pass, and we are done 
+           if (C1.w[1] == 0x0001ed09bead87c0ull && C1.w[0] == 0x378d8e6400000000ull) { // if  C1 = 10^34
+             C1.w[1] = 0x0000314dc6448d93ull;  // C1 = 10^33
+             C1.w[0] = 0x38c15b0a00000000ull;
+             y_exp = y_exp + ((UINT64) 1 << 49);
+           }
+           x_sign = tmp_sign;
+           if (x1 >= 1)
+             y_exp = y_exp + ((UINT64) x1 << 49);
+           // x1 = -1 is possible at the end of a second pass when the 
+           // first pass started with x1 = 1 
+         }
+         C1_hi = C1.w[1];
+         C1_lo = C1.w[0];
+         // general correction from RN to RA, RM, RP, RZ; result uses y_exp
+         if (rnd_mode != ROUNDING_TO_NEAREST) {
+           if ((!x_sign
+                && ((rnd_mode == ROUNDING_UP && is_inexact_lt_midpoint)
+                    ||
+                    ((rnd_mode == ROUNDING_TIES_AWAY
+                      || rnd_mode == ROUNDING_UP)
+                     && is_midpoint_gt_even))) || (x_sign
+                                                   &&
+                                                   ((rnd_mode ==
+                                                     ROUNDING_DOWN
+                                                     &&
+                                                     is_inexact_lt_midpoint)
+                                                    ||
+                                                    ((rnd_mode ==
+                                                      ROUNDING_TIES_AWAY
+                                                      || rnd_mode ==
+                                                      ROUNDING_DOWN)
+                                                     &&
+                                                     is_midpoint_gt_even))))
+           {
+             // C1 = C1 + 1
+             C1_lo = C1_lo + 1;
+             if (C1_lo == 0) { // rounding overflow in the low 64 bits
+               C1_hi = C1_hi + 1;
+             }
+             if (C1_hi == 0x0001ed09bead87c0ull
+                 && C1_lo == 0x378d8e6400000000ull) {
+               // C1 = 10^34 => rounding overflow
+               C1_hi = 0x0000314dc6448d93ull;
+               C1_lo = 0x38c15b0a00000000ull;  // 10^33
+               y_exp = y_exp + EXP_P1;
+             }
+           } else if ((is_midpoint_lt_even || is_inexact_gt_midpoint)
+                      &&
+                      ((x_sign
+                        && (rnd_mode == ROUNDING_UP
+                            || rnd_mode == ROUNDING_TO_ZERO))
+                       || (!x_sign
+                           && (rnd_mode == ROUNDING_DOWN
+                               || rnd_mode == ROUNDING_TO_ZERO)))) {
+             // C1 = C1 - 1
+             C1_lo = C1_lo - 1;
+             if (C1_lo == 0xffffffffffffffffull)
+               C1_hi--;
+             // check if we crossed into the lower decade
+             if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) {   // 10^33 - 1
+               C1_hi = 0x0001ed09bead87c0ull;  // 10^34 - 1
+               C1_lo = 0x378d8e63ffffffffull;
+               y_exp = y_exp - EXP_P1;
+               // no underflow, because delta + q2 >= P34 + 1
+             }
+           } else {
+             ; // exact, the result is already correct
+           }
+           // in all cases check for overflow (RN and RA solved already)
+           if (y_exp == EXP_MAX_P1) {  // overflow
+             if ((rnd_mode == ROUNDING_DOWN && x_sign) ||      // RM and res < 0
+                 (rnd_mode == ROUNDING_UP && !x_sign)) {       // RP and res > 0
+               C1_hi = 0x7800000000000000ull;  // +inf
+               C1_lo = 0x0ull;
+             } else {  // RM and res > 0, RP and res < 0, or RZ
+               C1_hi = 0x5fffed09bead87c0ull;
+               C1_lo = 0x378d8e63ffffffffull;
+             }
+             y_exp = 0;        // x_sign is preserved
+             // set the inexact flag (in case the exact addition was exact)
+             *pfpsf |= INEXACT_EXCEPTION;
+             // set the overflow flag
+             *pfpsf |= OVERFLOW_EXCEPTION;
+           }
+         }
+         // assemble the result
+         res.w[1] = x_sign | y_exp | C1_hi;
+         res.w[0] = C1_lo;
+         if (tmp_inexact)
+           *pfpsf |= INEXACT_EXCEPTION;
+       }
+      } else { // if (-P34 + 1 <= delta <= -1) <=> 1 <= -delta <= P34 - 1
+       // NOTE: the following, up to "} else { // if x_sign != y_sign 
+       // the result is exact" is identical to "else if (delta == P34 - q2) {"
+       // from above; also, the code is not symmetric: a+b and b+a may take
+       // different paths (need to unify eventually!) 
+       // calculate C' = C2 + C1 * 10^(e1-e2) directly; the result may be 
+       // inexact if it requires P34 + 1 decimal digits; in either case the 
+       // 'cutoff' point for addition is at the position of the lsb of C2
+       // The coefficient of the result is C1 * 10^(e1-e2) + C2 and the
+       // exponent is e2; either C1 or 10^(e1-e2) may not fit is 64 bits,
+       // but their product fits with certainty in 128 bits (actually in 113)
+       // Note that 0 <= e1 - e2 <= P34 - 2
+       //   -P34 + 1 <= delta <= -1 <=> -P34 + 1 <= delta <= -1 <=>
+       //   -P34 + 1 <= q1 + e1 - q2 - e2 <= -1 <=>
+       //   q2 - q1 - P34 + 1 <= e1 - e2 <= q2 - q1 - 1 <=>
+       //   1 - P34 - P34 + 1 <= e1-e2 <= P34 - 1 - 1 => 0 <= e1-e2 <= P34 - 2
+       scale = delta - q1 + q2;        // scale = (int)(e1 >> 49) - (int)(e2 >> 49)
+       if (scale >= 20) {      // 10^(e1-e2) does not fit in 64 bits, but C1 does
+         __mul_128x64_to_128 (C1, C1_lo, ten2k128[scale - 20]);
+       } else if (scale >= 1) {
+         // if 1 <= scale <= 19 then 10^(e1-e2) fits in 64 bits
+         if (q1 <= 19) {       // C1 fits in 64 bits
+           __mul_64x64_to_128MACH (C1, C1_lo, ten2k64[scale]);
+         } else {      // q1 >= 20
+           C1.w[1] = C1_hi;
+           C1.w[0] = C1_lo;
+           __mul_128x64_to_128 (C1, ten2k64[scale], C1);
+         }
+       } else {        // if (scale == 0) C1 is unchanged
+         C1.w[1] = C1_hi;
+         C1.w[0] = C1_lo;      // only the low part is necessary
+       }
+       C1_hi = C1.w[1];
+       C1_lo = C1.w[0];
+       // now add C2
+       if (x_sign == y_sign) {
+         // the result can overflow!
+         C1_lo = C1_lo + C2_lo;
+         C1_hi = C1_hi + C2_hi;
+         if (C1_lo < C1.w[0])
+           C1_hi++;
+         // test for overflow, possible only when C1 >= 10^34
+         if (C1_hi > 0x0001ed09bead87c0ull || (C1_hi == 0x0001ed09bead87c0ull && C1_lo >= 0x378d8e6400000000ull)) {    // C1 >= 10^34
+           // in this case q = P34 + 1 and x = q - P34 = 1, so multiply 
+           // C'' = C'+ 5 = C1 + 5 by k1 ~ 10^(-1) calculated for P34 + 1 
+           // decimal digits
+           // Calculate C'' = C' + 1/2 * 10^x
+           if (C1_lo >= 0xfffffffffffffffbull) {       // low half add has carry
+             C1_lo = C1_lo + 5;
+             C1_hi = C1_hi + 1;
+           } else {
+             C1_lo = C1_lo + 5;
+           }
+           // the approximation of 10^(-1) was rounded up to 118 bits
+           // 10^(-1) =~ 33333333333333333333333333333400 * 2^-129
+           // 10^(-1) =~ 19999999999999999999999999999a00 * 2^-128
+           C1.w[1] = C1_hi;
+           C1.w[0] = C1_lo;    // C''
+           ten2m1.w[1] = 0x1999999999999999ull;
+           ten2m1.w[0] = 0x9999999999999a00ull;
+           __mul_128x128_to_256 (P256, C1, ten2m1);    // P256 = C*, f*
+           // C* is actually floor(C*) in this case
+           // the top Ex = 128 bits of 10^(-1) are 
+           // T* = 0x00199999999999999999999999999999
+           // if (0 < f* < 10^(-x)) then
+           //   if floor(C*) is even then C = floor(C*) - logical right 
+           //       shift; C has p decimal digits, correct by Prop. 1)
+           //   else if floor(C*) is odd C = floor(C*) - 1 (logical right
+           //       shift; C has p decimal digits, correct by Pr. 1)
+           // else
+           //   C = floor(C*) (logical right shift; C has p decimal digits,
+           //       correct by Property 1)
+           // n = C * 10^(e2+x)
+           if ((P256.w[1] || P256.w[0])
+               && (P256.w[1] < 0x1999999999999999ull
+                   || (P256.w[1] == 0x1999999999999999ull
+                       && P256.w[0] <= 0x9999999999999999ull))) {
+             // the result is a midpoint
+             if (P256.w[2] & 0x01) {
+               is_midpoint_gt_even = 1;
+               // if floor(C*) is odd C = floor(C*) - 1; the result is not 0
+               P256.w[2]--;
+               if (P256.w[2] == 0xffffffffffffffffull)
+                 P256.w[3]--;
+             } else {
+               is_midpoint_lt_even = 1;
+             }
+           }
+           // n = Cstar * 10^(e2+1)
+           y_exp = y_exp + EXP_P1;
+           // C* != 10^P34 because C* has P34 digits
+           // check for overflow
+           if (y_exp == EXP_MAX_P1
+               && (rnd_mode == ROUNDING_TO_NEAREST
+                   || rnd_mode == ROUNDING_TIES_AWAY)) {
+             // overflow for RN
+             res.w[1] = x_sign | 0x7800000000000000ull;        // +/-inf
+             res.w[0] = 0x0ull;
+             // set the inexact flag
+             *pfpsf |= INEXACT_EXCEPTION;
+             // set the overflow flag
+             *pfpsf |= OVERFLOW_EXCEPTION;
+             BID_SWAP128 (res);
+             BID_RETURN (res);
+           }
+           // if (0 < f* - 1/2 < 10^(-x)) then 
+           //   the result of the addition is exact 
+           // else 
+           //   the result of the addition is inexact
+           if (P256.w[1] > 0x8000000000000000ull || (P256.w[1] == 0x8000000000000000ull && P256.w[0] > 0x0ull)) {      // the result may be exact
+             tmp64 = P256.w[1] - 0x8000000000000000ull;        // f* - 1/2
+             if ((tmp64 > 0x1999999999999999ull
+                  || (tmp64 == 0x1999999999999999ull
+                      && P256.w[0] >= 0x9999999999999999ull))) {
+               // set the inexact flag
+               *pfpsf |= INEXACT_EXCEPTION;
+               is_inexact = 1;
+             } // else the result is exact
+           } else {    // the result is inexact
+             // set the inexact flag
+             *pfpsf |= INEXACT_EXCEPTION;
+             is_inexact = 1;
+           }
+           C1_hi = P256.w[3];
+           C1_lo = P256.w[2];
+           if (!is_midpoint_gt_even && !is_midpoint_lt_even) {
+             is_inexact_lt_midpoint = is_inexact
+               && (P256.w[1] & 0x8000000000000000ull);
+             is_inexact_gt_midpoint = is_inexact
+               && !(P256.w[1] & 0x8000000000000000ull);
+           }
+           // general correction from RN to RA, RM, RP, RZ; result uses y_exp
+           if (rnd_mode != ROUNDING_TO_NEAREST) {
+             if ((!x_sign
+                  && ((rnd_mode == ROUNDING_UP
+                       && is_inexact_lt_midpoint)
+                      || ((rnd_mode == ROUNDING_TIES_AWAY
+                           || rnd_mode == ROUNDING_UP)
+                          && is_midpoint_gt_even))) || (x_sign
+                                                        &&
+                                                        ((rnd_mode ==
+                                                          ROUNDING_DOWN
+                                                          &&
+                                                          is_inexact_lt_midpoint)
+                                                         ||
+                                                         ((rnd_mode ==
+                                                           ROUNDING_TIES_AWAY
+                                                           || rnd_mode
+                                                           ==
+                                                           ROUNDING_DOWN)
+                                                          &&
+                                                          is_midpoint_gt_even))))
+             {
+               // C1 = C1 + 1
+               C1_lo = C1_lo + 1;
+               if (C1_lo == 0) {       // rounding overflow in the low 64 bits
+                 C1_hi = C1_hi + 1;
+               }
+               if (C1_hi == 0x0001ed09bead87c0ull
+                   && C1_lo == 0x378d8e6400000000ull) {
+                 // C1 = 10^34 => rounding overflow
+                 C1_hi = 0x0000314dc6448d93ull;
+                 C1_lo = 0x38c15b0a00000000ull;        // 10^33
+                 y_exp = y_exp + EXP_P1;
+               }
+             } else
+               if ((is_midpoint_lt_even || is_inexact_gt_midpoint) &&
+                   ((x_sign && (rnd_mode == ROUNDING_UP ||
+                                rnd_mode == ROUNDING_TO_ZERO)) ||
+                    (!x_sign && (rnd_mode == ROUNDING_DOWN ||
+                                 rnd_mode == ROUNDING_TO_ZERO)))) {
+               // C1 = C1 - 1
+               C1_lo = C1_lo - 1;
+               if (C1_lo == 0xffffffffffffffffull)
+                 C1_hi--;
+               // check if we crossed into the lower decade
+               if (C1_hi == 0x0000314dc6448d93ull && C1_lo == 0x38c15b09ffffffffull) { // 10^33 - 1
+                 C1_hi = 0x0001ed09bead87c0ull;        // 10^34 - 1
+                 C1_lo = 0x378d8e63ffffffffull;
+                 y_exp = y_exp - EXP_P1;
+                 // no underflow, because delta + q2 >= P34 + 1
+               }
+             } else {
+               ;       // exact, the result is already correct
+             }
+             // in all cases check for overflow (RN and RA solved already)
+             if (y_exp == EXP_MAX_P1) {        // overflow
+               if ((rnd_mode == ROUNDING_DOWN && x_sign) ||    // RM and res < 0
+                   (rnd_mode == ROUNDING_UP && !x_sign)) {     // RP and res > 0
+                 C1_hi = 0x7800000000000000ull;        // +inf
+                 C1_lo = 0x0ull;
+               } else {        // RM and res > 0, RP and res < 0, or RZ
+                 C1_hi = 0x5fffed09bead87c0ull;
+                 C1_lo = 0x378d8e63ffffffffull;
+               }
+               y_exp = 0;      // x_sign is preserved
+               // set the inexact flag (in case the exact addition was exact)
+               *pfpsf |= INEXACT_EXCEPTION;
+               // set the overflow flag
+               *pfpsf |= OVERFLOW_EXCEPTION;
+             }
+           }
+         }     // else if (C1 < 10^34) then C1 is the coeff.; the result is exact
+         // assemble the result
+         res.w[1] = x_sign | y_exp | C1_hi;
+         res.w[0] = C1_lo;
+       } else {        // if x_sign != y_sign the result is exact
+         C1_lo = C2_lo - C1_lo;
+         C1_hi = C2_hi - C1_hi;
+         if (C1_lo > C2_lo)
+           C1_hi--;
+         if (C1_hi >= 0x8000000000000000ull) { // negative coefficient!
+           C1_lo = ~C1_lo;
+           C1_lo++;
+           C1_hi = ~C1_hi;
+           if (C1_lo == 0x0)
+             C1_hi++;
+           x_sign = y_sign;    // the result will have the sign of y
+         }
+         // the result can be zero, but it cannot overflow
+         if (C1_lo == 0 && C1_hi == 0) {
+           // assemble the result
+           if (x_exp < y_exp)
+             res.w[1] = x_exp;
+           else
+             res.w[1] = y_exp;
+           res.w[0] = 0;
+           if (rnd_mode == ROUNDING_DOWN) {
+             res.w[1] |= 0x8000000000000000ull;
+           }
+           BID_SWAP128 (res);
+           BID_RETURN (res);
+         }
+         // assemble the result
+         res.w[1] = y_sign | y_exp | C1_hi;
+         res.w[0] = C1_lo;
+       }
       }
     }
+    BID_SWAP128 (res);
     BID_RETURN (res)
   }
 }
 
+
+
+// bid128_sub stands for bid128qq_sub
+
 /*****************************************************************************
  *  BID128 sub
  ****************************************************************************/
 
 #if DECIMAL_CALL_BY_REFERENCE
 void
-__bid128_sub (UINT128 * pres, UINT128 * px,
-            UINT128 *
-            py _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
-            _EXC_INFO_PARAM) {
+bid128_sub (UINT128 * pres, UINT128 * px, UINT128 * py
+           _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
+           _EXC_INFO_PARAM) {
   UINT128 x = *px, y = *py;
 #if !DECIMAL_GLOBAL_ROUNDING
   unsigned int rnd_mode = *prnd_mode;
 #endif
 #else
 UINT128
-__bid128_sub (UINT128 x,
-            UINT128 y _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
-            _EXC_INFO_PARAM) {
+bid128_sub (UINT128 x, UINT128 y
+           _RND_MODE_PARAM _EXC_FLAGS_PARAM _EXC_MASKS_PARAM
+           _EXC_INFO_PARAM) {
 #endif
 
   UINT128 res;
   UINT64 y_sign;
 
-  if ((y.w[1] & MASK_NAN) != MASK_NAN) { // y is not NAN
+  if ((y.w[HIGH_128W] & MASK_NAN) != MASK_NAN) {       // y is not NAN
     // change its sign
-    y_sign = y.w[1] & MASK_SIGN; // 0 for positive, MASK_SIGN for negative
+    y_sign = y.w[HIGH_128W] & MASK_SIGN;       // 0 for positive, MASK_SIGN for negative
     if (y_sign)
-      y.w[1] = y.w[1] & 0x7fffffffffffffffull;
+      y.w[HIGH_128W] = y.w[HIGH_128W] & 0x7fffffffffffffffull;
     else
-      y.w[1] = y.w[1] | 0x8000000000000000ull;
+      y.w[HIGH_128W] = y.w[HIGH_128W] | 0x8000000000000000ull;
   }
 #if DECIMAL_CALL_BY_REFERENCE
-  __bid128_add (&res, &x,
-              &y _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
-              _EXC_INFO_ARG);
+  bid128_add (&res, &x, &y
+             _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
+             _EXC_INFO_ARG);
 #else
-  res =
-    __bid128_add (x,
-                y _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
-                _EXC_INFO_ARG);
+  res = bid128_add (x, y
+                   _RND_MODE_ARG _EXC_FLAGS_ARG _EXC_MASKS_ARG
+                   _EXC_INFO_ARG);
 #endif
   BID_RETURN (res);
 }