You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING. If not, write to
-the Free Software Foundation, 59 Temple Place - Suite 330,
-Boston, MA 02111-1307, USA. */
+the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+Boston, MA 02110-1301, USA. */
#include "config.h"
#include "system.h"
#include "langhooks.h"
#include "cfglayout.h"
#include "tree-gimple.h"
+#include "intl.h"
/* This is used for communication between ASM_OUTPUT_LABEL and
ASM_OUTPUT_LABELREF. */
static void ia64_select_rtx_section (enum machine_mode, rtx,
unsigned HOST_WIDE_INT);
+static void ia64_output_dwarf_dtprel (FILE *, int, rtx)
+ ATTRIBUTE_UNUSED;
static void ia64_rwreloc_select_section (tree, int, unsigned HOST_WIDE_INT)
ATTRIBUTE_UNUSED;
static void ia64_rwreloc_unique_section (tree, int)
static bool ia64_scalar_mode_supported_p (enum machine_mode mode);
static bool ia64_vector_mode_supported_p (enum machine_mode mode);
static bool ia64_cannot_force_const_mem (rtx);
+static const char *ia64_mangle_fundamental_type (tree);
+static const char *ia64_invalid_conversion (tree, tree);
+static const char *ia64_invalid_unary_op (int, tree);
+static const char *ia64_invalid_binary_op (int, tree, tree);
\f
/* Table of valid machine attributes. */
static const struct attribute_spec ia64_attribute_table[] =
#undef TARGET_SECTION_TYPE_FLAGS
#define TARGET_SECTION_TYPE_FLAGS ia64_section_type_flags
+#ifdef HAVE_AS_TLS
+#undef TARGET_ASM_OUTPUT_DWARF_DTPREL
+#define TARGET_ASM_OUTPUT_DWARF_DTPREL ia64_output_dwarf_dtprel
+#endif
+
/* ??? ABI doesn't allow us to define this. */
#if 0
#undef TARGET_PROMOTE_FUNCTION_ARGS
#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM ia64_cannot_force_const_mem
+#undef TARGET_MANGLE_FUNDAMENTAL_TYPE
+#define TARGET_MANGLE_FUNDAMENTAL_TYPE ia64_mangle_fundamental_type
+
+#undef TARGET_INVALID_CONVERSION
+#define TARGET_INVALID_CONVERSION ia64_invalid_conversion
+#undef TARGET_INVALID_UNARY_OP
+#define TARGET_INVALID_UNARY_OP ia64_invalid_unary_op
+#undef TARGET_INVALID_BINARY_OP
+#define TARGET_INVALID_BINARY_OP ia64_invalid_binary_op
+
struct gcc_target targetm = TARGET_INITIALIZER;
\f
typedef enum
&& !TREE_STATIC (decl))
{
error ("%Jan address area attribute cannot be specified for "
- "local variables", decl, decl);
+ "local variables", decl);
*no_add_attrs = true;
}
area = ia64_get_addr_area (decl);
if (area != ADDR_AREA_NORMAL && addr_area != area)
{
- error ("%Jaddress area of '%s' conflicts with previous "
- "declaration", decl, decl);
+ error ("address area of %q+D conflicts with previous "
+ "declaration", decl);
*no_add_attrs = true;
}
break;
case FUNCTION_DECL:
error ("%Jaddress area attribute cannot be specified for functions",
- decl, decl);
+ decl);
*no_add_attrs = true;
break;
return GET_CODE (src) == CONST_DOUBLE && CONST_DOUBLE_OK_FOR_G (src);
}
+/* Return 1 if the operands are ok for a floating point load pair. */
+
+int
+ia64_load_pair_ok (rtx dst, rtx src)
+{
+ if (GET_CODE (dst) != REG || !FP_REGNO_P (REGNO (dst)))
+ return 0;
+ if (GET_CODE (src) != MEM || MEM_VOLATILE_P (src))
+ return 0;
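+ /* The address must be a plain register, a post-increment, or a
+ post-modify that steps by exactly the size of the load. */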
+ switch (GET_CODE (XEXP (src, 0)))
+ {
+ case REG:
+ case POST_INC:
+ break;
+ case POST_DEC:
+ return 0;
+ case POST_MODIFY:
+ {
+ rtx adjust = XEXP (XEXP (XEXP (src, 0), 1), 1);
+
+ if (GET_CODE (adjust) != CONST_INT
+ || INTVAL (adjust) != GET_MODE_SIZE (GET_MODE (src)))
+ return 0;
+ }
+ break;
+ default:
+ abort ();
+ }
+ return 1;
+}
+
int
addp4_optimize_ok (rtx op1, rtx op2)
{
case SYMBOL_REF:
return tls_symbolic_operand_type (x) == 0;
+ case CONST_VECTOR:
+ {
+ enum machine_mode mode = GET_MODE (x);
+
+ if (mode == V2SFmode)
+ return ia64_extra_constraint (x, 'Y');
+
+ return (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+ && GET_MODE_SIZE (mode) <= 8);
+ }
+
default:
return false;
}
This solution attempts to prevent this situation from occurring. When
we see something like the above, we spill the inner register to memory. */
-rtx
-spill_xfmode_operand (rtx in, int force)
+static rtx
+spill_xfmode_rfmode_operand (rtx in, int force, enum machine_mode mode)
{
if (GET_CODE (in) == SUBREG
&& GET_MODE (SUBREG_REG (in)) == TImode
{
rtx memt = assign_stack_temp (TImode, 16, 0);
emit_move_insn (memt, SUBREG_REG (in));
- return adjust_address (memt, XFmode, 0);
+ return adjust_address (memt, mode, 0);
}
else if (force && GET_CODE (in) == REG)
{
- rtx memx = assign_stack_temp (XFmode, 16, 0);
+ rtx memx = assign_stack_temp (mode, 16, 0);
emit_move_insn (memx, in);
return memx;
}
return in;
}
+/* Expand the movxf or movrf pattern (MODE says which) with the given
+ OPERANDS, returning true if the pattern should then invoke
+ DONE. */
+
+bool
+ia64_expand_movxf_movrf (enum machine_mode mode, rtx operands[])
+{
+ rtx op0 = operands[0];
+
+ if (GET_CODE (op0) == SUBREG)
+ op0 = SUBREG_REG (op0);
+
+ /* We must support XFmode loads into general registers for stdarg/vararg,
+ unprototyped calls, and a rare case where a long double is passed as
+ an argument after a float HFA fills the FP registers. We split them into
+ DImode loads for convenience. We also need to support XFmode stores
+ for the last case. This case does not happen for stdarg/vararg routines,
+ because we do a block store to memory of unnamed arguments. */
+
+ if (GET_CODE (op0) == REG && GR_REGNO_P (REGNO (op0)))
+ {
+ rtx out[2];
+
+ /* We're hoping to transform everything that deals with XFmode
+ quantities and GR registers early in the compiler. */
+ gcc_assert (!no_new_pseudos);
+
+ /* Struct to register can just use TImode instead. */
+ if ((GET_CODE (operands[1]) == SUBREG
+ && GET_MODE (SUBREG_REG (operands[1])) == TImode)
+ || (GET_CODE (operands[1]) == REG
+ && GR_REGNO_P (REGNO (operands[1]))))
+ {
+ rtx op1 = operands[1];
+
+ if (GET_CODE (op1) == SUBREG)
+ op1 = SUBREG_REG (op1);
+ else
+ op1 = gen_rtx_REG (TImode, REGNO (op1));
+
+ emit_move_insn (gen_rtx_REG (TImode, REGNO (op0)), op1);
+ return true;
+ }
+
+ if (GET_CODE (operands[1]) == CONST_DOUBLE)
+ {
+ /* Don't word-swap when reading in the constant. */
+ emit_move_insn (gen_rtx_REG (DImode, REGNO (op0)),
+ operand_subword (operands[1], WORDS_BIG_ENDIAN,
+ 0, mode));
+ emit_move_insn (gen_rtx_REG (DImode, REGNO (op0) + 1),
+ operand_subword (operands[1], !WORDS_BIG_ENDIAN,
+ 0, mode));
+ return true;
+ }
+
+ /* If the quantity is in a register not known to be GR, spill it. */
+ if (register_operand (operands[1], mode))
+ operands[1] = spill_xfmode_rfmode_operand (operands[1], 1, mode);
+
+ gcc_assert (GET_CODE (operands[1]) == MEM);
+
+ /* Don't word-swap when reading in the value. */
+ out[0] = gen_rtx_REG (DImode, REGNO (op0));
+ out[1] = gen_rtx_REG (DImode, REGNO (op0) + 1);
+
+ emit_move_insn (out[0], adjust_address (operands[1], DImode, 0));
+ emit_move_insn (out[1], adjust_address (operands[1], DImode, 8));
+ return true;
+ }
+
+ if (GET_CODE (operands[1]) == REG && GR_REGNO_P (REGNO (operands[1])))
+ {
+ /* We're hoping to transform everything that deals with XFmode
+ quantities and GR registers early in the compiler. */
+ gcc_assert (!no_new_pseudos);
+
+ /* Op0 can't be a GR_REG here, as that case is handled above.
+ If op0 is a register, then we spill op1, so that we now have a
+ MEM operand. This requires creating an XFmode subreg of a TImode reg
+ to force the spill. */
+ if (register_operand (operands[0], mode))
+ {
+ rtx op1 = gen_rtx_REG (TImode, REGNO (operands[1]));
+ op1 = gen_rtx_SUBREG (mode, op1, 0);
+ operands[1] = spill_xfmode_rfmode_operand (op1, 0, mode);
+ }
+
+ else
+ {
+ rtx in[2];
+
+ gcc_assert (GET_CODE (operands[0]) == MEM);
+
+ /* Don't word-swap when writing out the value. */
+ in[0] = gen_rtx_REG (DImode, REGNO (operands[1]));
+ in[1] = gen_rtx_REG (DImode, REGNO (operands[1]) + 1);
+
+ emit_move_insn (adjust_address (operands[0], DImode, 0), in[0]);
+ emit_move_insn (adjust_address (operands[0], DImode, 8), in[1]);
+ return true;
+ }
+ }
+
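+ /* General case: outside of reload we are free to create new pseudos
+ and stack temporaries to fix up awkward operands. */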
+ if (!reload_in_progress && !reload_completed)
+ {
+ operands[1] = spill_xfmode_rfmode_operand (operands[1], 0, mode);
+
+ if (GET_MODE (op0) == TImode && GET_CODE (op0) == REG)
+ {
+ rtx memt, memx, in = operands[1];
+ if (CONSTANT_P (in))
+ in = validize_mem (force_const_mem (mode, in));
+ if (GET_CODE (in) == MEM)
+ memt = adjust_address (in, TImode, 0);
+ else
+ {
+ memt = assign_stack_temp (TImode, 16, 0);
+ memx = adjust_address (memt, mode, 0);
+ emit_move_insn (memx, in);
+ }
+ emit_move_insn (op0, memt);
+ return true;
+ }
+
+ if (!ia64_move_ok (operands[0], operands[1]))
+ operands[1] = force_reg (mode, operands[1]);
+ }
+
+ return false;
+}
+
/* Emit comparison instruction if necessary, returning the expression
that holds the compare result in the proper mode. */
return gen_rtx_fmt_ee (code, mode, cmp, const0_rtx);
}
-/* Generate an integral vector comparison. */
+/* Generate an integral vector comparison. Return true if the condition has
+ been reversed, and so the sense of the comparison should be inverted. */
static bool
ia64_expand_vecint_compare (enum rtx_code code, enum machine_mode mode,
bool negate = false;
rtx x;
+ /* Canonicalize the comparison to EQ, GT, GTU. */
switch (code)
{
case EQ:
case GT:
+ case GTU:
break;
case NE:
- code = EQ;
- negate = true;
- break;
-
case LE:
- code = GT;
+ case LEU:
+ code = reverse_condition (code);
negate = true;
break;
case GE:
+ case GEU:
+ code = reverse_condition (code);
negate = true;
/* FALLTHRU */
case LT:
- x = op0;
- op0 = op1;
- op1 = x;
- code = GT;
- break;
-
- case GTU:
- case GEU:
case LTU:
- case LEU:
- {
- rtx w0h, w0l, w1h, w1l, ch, cl;
- enum machine_mode wmode;
- rtx (*unpack_l) (rtx, rtx, rtx);
- rtx (*unpack_h) (rtx, rtx, rtx);
- rtx (*pack) (rtx, rtx, rtx);
+ code = swap_condition (code);
+ x = op0, op0 = op1, op1 = x;
+ break;
- /* We don't have native unsigned comparisons, but we can generate
- them better than generic code can. */
+ default:
+ gcc_unreachable ();
+ }
- gcc_assert (mode != V2SImode);
- switch (mode)
+ /* Unsigned parallel compare is not supported by the hardware. Play some
+ tricks to turn this into a signed comparison against 0. */
+ if (code == GTU)
+ {
+ switch (mode)
+ {
+ case V2SImode:
{
- case V8QImode:
- wmode = V4HImode;
- pack = gen_pack2_sss;
- unpack_l = gen_unpack1_l;
- unpack_h = gen_unpack1_h;
- break;
-
- case V4HImode:
- wmode = V2SImode;
- pack = gen_pack4_sss;
- unpack_l = gen_unpack2_l;
- unpack_h = gen_unpack2_h;
- break;
-
- default:
- gcc_unreachable ();
+ rtx t1, t2, mask;
+
+ /* Perform a parallel modulo subtraction. */
+ t1 = gen_reg_rtx (V2SImode);
+ emit_insn (gen_subv2si3 (t1, op0, op1));
+
+ /* Extract the original sign bit of op0. */
+ mask = GEN_INT (-0x80000000);
+ mask = gen_rtx_CONST_VECTOR (V2SImode, gen_rtvec (2, mask, mask));
+ mask = force_reg (V2SImode, mask);
+ t2 = gen_reg_rtx (V2SImode);
+ emit_insn (gen_andv2si3 (t2, op0, mask));
+
+ /* XOR it back into the result of the subtraction. This results
+ in the sign bit set iff we saw unsigned underflow. */
+ x = gen_reg_rtx (V2SImode);
+ emit_insn (gen_xorv2si3 (x, t1, t2));
+
+ code = GT;
+ op0 = x;
+ op1 = CONST0_RTX (mode);
}
+ break;
- /* Unpack into wider vectors, zero extending the elements. */
-
- w0l = gen_reg_rtx (wmode);
- w0h = gen_reg_rtx (wmode);
- w1l = gen_reg_rtx (wmode);
- w1h = gen_reg_rtx (wmode);
- emit_insn (unpack_l (gen_lowpart (mode, w0l), op0, CONST0_RTX (mode)));
- emit_insn (unpack_h (gen_lowpart (mode, w0h), op0, CONST0_RTX (mode)));
- emit_insn (unpack_l (gen_lowpart (mode, w1l), op1, CONST0_RTX (mode)));
- emit_insn (unpack_h (gen_lowpart (mode, w1h), op1, CONST0_RTX (mode)));
-
- /* Compare in the wider mode. */
-
- cl = gen_reg_rtx (wmode);
- ch = gen_reg_rtx (wmode);
- code = signed_condition (code);
- ia64_expand_vecint_compare (code, wmode, cl, w0l, w1l);
- negate = ia64_expand_vecint_compare (code, wmode, ch, w0h, w1h);
-
- /* Repack into a single narrower vector. */
-
- emit_insn (pack (dest, cl, ch));
- }
- return negate;
+ case V8QImode:
+ case V4HImode:
+ /* Perform a parallel unsigned saturating subtraction. */
+ x = gen_reg_rtx (mode);
+ emit_insn (gen_rtx_SET (VOIDmode, x,
+ gen_rtx_US_MINUS (mode, op0, op1)));
+
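+ /* op0 GTU op1 iff the saturating difference is nonzero, so test for
+ equality with zero and invert the sense of the result. */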
+ code = EQ;
+ op0 = x;
+ op1 = CONST0_RTX (mode);
+ negate = !negate;
+ break;
- default:
- gcc_unreachable ();
+ default:
+ gcc_unreachable ();
+ }
}
x = gen_rtx_fmt_ee (code, mode, op0, op1);
return negate;
}
-static void
-ia64_expand_vcondu_v2si (enum rtx_code code, rtx operands[])
-{
- rtx dl, dh, bl, bh, op1l, op1h, op2l, op2h, op4l, op4h, op5l, op5h, x;
-
- /* In this case, we extract the two SImode quantities and generate
- normal comparisons for each of them. */
-
- op1l = gen_lowpart (SImode, operands[1]);
- op2l = gen_lowpart (SImode, operands[2]);
- op4l = gen_lowpart (SImode, operands[4]);
- op5l = gen_lowpart (SImode, operands[5]);
-
- op1h = gen_reg_rtx (SImode);
- op2h = gen_reg_rtx (SImode);
- op4h = gen_reg_rtx (SImode);
- op5h = gen_reg_rtx (SImode);
-
- emit_insn (gen_lshrdi3 (gen_lowpart (DImode, op1h),
- gen_lowpart (DImode, operands[1]), GEN_INT (32)));
- emit_insn (gen_lshrdi3 (gen_lowpart (DImode, op2h),
- gen_lowpart (DImode, operands[2]), GEN_INT (32)));
- emit_insn (gen_lshrdi3 (gen_lowpart (DImode, op4h),
- gen_lowpart (DImode, operands[4]), GEN_INT (32)));
- emit_insn (gen_lshrdi3 (gen_lowpart (DImode, op5h),
- gen_lowpart (DImode, operands[5]), GEN_INT (32)));
-
- bl = gen_reg_rtx (BImode);
- x = gen_rtx_fmt_ee (code, BImode, op4l, op5l);
- emit_insn (gen_rtx_SET (VOIDmode, bl, x));
-
- bh = gen_reg_rtx (BImode);
- x = gen_rtx_fmt_ee (code, BImode, op4h, op5h);
- emit_insn (gen_rtx_SET (VOIDmode, bh, x));
-
- /* With the results of the comparisons, emit conditional moves. */
-
- dl = gen_reg_rtx (SImode);
- x = gen_rtx_IF_THEN_ELSE (SImode, bl, op1l, op2l);
- emit_insn (gen_rtx_SET (VOIDmode, dl, x));
-
- dh = gen_reg_rtx (SImode);
- x = gen_rtx_IF_THEN_ELSE (SImode, bh, op1h, op2h);
- emit_insn (gen_rtx_SET (VOIDmode, dh, x));
-
- /* Merge the two partial results back into a vector. */
-
- x = gen_rtx_VEC_CONCAT (V2SImode, dl, dh);
- emit_insn (gen_rtx_SET (VOIDmode, operands[0], x));
-}
-
/* Emit an integral vector conditional move. */
void
bool negate;
rtx cmp, x, ot, of;
- /* Since we don't have unsigned V2SImode comparisons, it's more efficient
- to special-case them entirely. */
- if (mode == V2SImode
- && (code == GTU || code == GEU || code == LEU || code == LTU))
- {
- ia64_expand_vcondu_v2si (code, operands);
- return;
- }
-
cmp = gen_reg_rtx (mode);
negate = ia64_expand_vecint_compare (code, mode, cmp,
operands[4], operands[5]);
ia64_expand_vecint_minmax (enum rtx_code code, enum machine_mode mode,
rtx operands[])
{
- rtx xops[5];
+ rtx xops[6];
/* These four combinations are supported directly. */
if (mode == V8QImode && (code == UMIN || code == UMAX))
if (mode == V4HImode && (code == SMIN || code == SMAX))
return false;
+ /* This combination can be implemented with only saturating subtraction. */
+ if (mode == V4HImode && code == UMAX)
+ {
+ rtx x, tmp = gen_reg_rtx (mode);
+
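+ /* umax (a, b) == sat_sub (a, b) + b. */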
+ x = gen_rtx_US_MINUS (mode, operands[1], operands[2]);
+ emit_insn (gen_rtx_SET (VOIDmode, tmp, x));
+
+ emit_insn (gen_addv4hi3 (operands[0], tmp, operands[2]));
+ return true;
+ }
+
/* Everything else implemented via vector comparisons. */
xops[0] = operands[0];
xops[4] = xops[1] = operands[1];
return true;
}
+/* Emit an integral vector widening sum operation. */
+
+void
+ia64_expand_widen_sum (rtx operands[3], bool unsignedp)
+{
+ rtx l, h, x, s;
+ enum machine_mode wmode, mode;
+ rtx (*unpack_l) (rtx, rtx, rtx);
+ rtx (*unpack_h) (rtx, rtx, rtx);
+ rtx (*plus) (rtx, rtx, rtx);
+
+ wmode = GET_MODE (operands[0]);
+ mode = GET_MODE (operands[1]);
+
+ switch (mode)
+ {
+ case V8QImode:
+ unpack_l = gen_unpack1_l;
+ unpack_h = gen_unpack1_h;
+ plus = gen_addv4hi3;
+ break;
+ case V4HImode:
+ unpack_l = gen_unpack2_l;
+ unpack_h = gen_unpack2_h;
+ plus = gen_addv2si3;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
+ /* Fill in x with the sign extension of each element in op1. */
+ if (unsignedp)
+ x = CONST0_RTX (mode);
+ else
+ {
+ bool neg;
+
+ x = gen_reg_rtx (mode);
+
+ neg = ia64_expand_vecint_compare (LT, mode, x, operands[1],
+ CONST0_RTX (mode));
+ gcc_assert (!neg);
+ }
+
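+ /* Widen the low and high halves of op1 and accumulate both into the
+ running sum from operands[2]. */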
+ l = gen_reg_rtx (wmode);
+ h = gen_reg_rtx (wmode);
+ s = gen_reg_rtx (wmode);
+
+ emit_insn (unpack_l (gen_lowpart (mode, l), operands[1], x));
+ emit_insn (unpack_h (gen_lowpart (mode, h), operands[1], x));
+ emit_insn (plus (s, l, operands[2]));
+ emit_insn (plus (operands[0], h, s));
+}
+
+/* Emit a signed or unsigned V8QI dot product operation. */
+
+void
+ia64_expand_dot_prod_v8qi (rtx operands[4], bool unsignedp)
+{
+ rtx l1, l2, h1, h2, x1, x2, p1, p2, p3, p4, s1, s2, s3;
+
+ /* Fill in x1 and x2 with the sign extension of each element. */
+ if (unsignedp)
+ x1 = x2 = CONST0_RTX (V8QImode);
+ else
+ {
+ bool neg;
+
+ x1 = gen_reg_rtx (V8QImode);
+ x2 = gen_reg_rtx (V8QImode);
+
+ neg = ia64_expand_vecint_compare (LT, V8QImode, x1, operands[1],
+ CONST0_RTX (V8QImode));
+ gcc_assert (!neg);
+ neg = ia64_expand_vecint_compare (LT, V8QImode, x2, operands[2],
+ CONST0_RTX (V8QImode));
+ gcc_assert (!neg);
+ }
+
+ l1 = gen_reg_rtx (V4HImode);
+ l2 = gen_reg_rtx (V4HImode);
+ h1 = gen_reg_rtx (V4HImode);
+ h2 = gen_reg_rtx (V4HImode);
+
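+ /* Interleave the input bytes with their extension bytes in x1/x2,
+ widening each 8-bit element to 16 bits (low halves in l1/l2, high
+ halves in h1/h2). */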
+ emit_insn (gen_unpack1_l (gen_lowpart (V8QImode, l1), operands[1], x1));
+ emit_insn (gen_unpack1_l (gen_lowpart (V8QImode, l2), operands[2], x2));
+ emit_insn (gen_unpack1_h (gen_lowpart (V8QImode, h1), operands[1], x1));
+ emit_insn (gen_unpack1_h (gen_lowpart (V8QImode, h2), operands[2], x2));
+
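+ /* Four parallel 16x16->32 multiplies produce the partial products. */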
+ p1 = gen_reg_rtx (V2SImode);
+ p2 = gen_reg_rtx (V2SImode);
+ p3 = gen_reg_rtx (V2SImode);
+ p4 = gen_reg_rtx (V2SImode);
+ emit_insn (gen_pmpy2_r (p1, l1, l2));
+ emit_insn (gen_pmpy2_l (p2, l1, l2));
+ emit_insn (gen_pmpy2_r (p3, h1, h2));
+ emit_insn (gen_pmpy2_l (p4, h1, h2));
+
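+ /* Sum the partial products and fold in the accumulator operand. */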
+ s1 = gen_reg_rtx (V2SImode);
+ s2 = gen_reg_rtx (V2SImode);
+ s3 = gen_reg_rtx (V2SImode);
+ emit_insn (gen_addv2si3 (s1, p1, p2));
+ emit_insn (gen_addv2si3 (s2, p3, p4));
+ emit_insn (gen_addv2si3 (s3, s1, operands[3]));
+ emit_insn (gen_addv2si3 (operands[0], s2, s3));
+}
+
/* Emit the appropriate sequence for a call. */
void
unsigned int regno = REGNO (reg);
if (regno < 32)
{
- unsigned int i, n = HARD_REGNO_NREGS (regno, GET_MODE (reg));
+ unsigned int i, n = hard_regno_nregs[regno][GET_MODE (reg)];
for (i = 0; i < n; ++i)
current_frame_info.gr_used_mask |= 1 << (regno + i);
}
break;
i = regno - OUT_REG (0) + 1;
+#ifndef PROFILE_HOOK
/* When -p profiling, we need one output register for the mcount argument.
Likewise for -a profiling for the bb_init_func argument. For -ax
profiling, we need two output registers for the two bb_init_trace_func
arguments. */
if (current_function_profile)
i = MAX (i, 1);
+#endif
current_frame_info.n_output_regs = i;
/* ??? No rotating register support yet. */
the middle-end will give it XFmode anyway, and XFmode values
don't normally fit in integer registers. So we need to smuggle
the value inside a parallel. */
- else if (mode == XFmode || mode == XCmode)
+ else if (mode == XFmode || mode == XCmode || mode == RFmode)
need_parallel = true;
if (need_parallel)
}
}
-/* This is called from dwarf2out.c via ASM_OUTPUT_DWARF_DTPREL.
+/* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
We need to emit DTP-relative relocations. */
-void
+static void
ia64_output_dwarf_dtprel (FILE *file, int size, rtx x)
{
- gcc_assert (size == 8);
- fputs ("\tdata8.ua\t@dtprel(", file);
+ gcc_assert (size == 4 || size == 8);
+ if (size == 4)
+ fputs ("\tdata4.ua\t@dtprel(", file);
+ else
+ fputs ("\tdata8.ua\t@dtprel(", file);
output_addr_const (file, x);
fputs (")", file);
}
for Intel assembler.
U Print an 8-bit sign extended number (K) as a 64-bit unsigned number
for Intel assembler.
+ X A pair of floating point registers.
r Print register name, or constant 0 as r0. HP compatibility for
Linux kernel.
v Print vector constant value as an 8-byte integer value. */
}
break;
+ case 'X':
+ {
+ unsigned int regno = REGNO (x);
+ fprintf (file, "%s, %s", reg_names [regno], reg_names [regno + 1]);
+ }
+ return;
+
case 'r':
/* If this operand is the constant zero, write it as register zero.
Any register, zero, or CONST_INT value is OK here. */
so that we get secondary memory reloads. Between FR_REGS,
we have to make this at least as expensive as MEMORY_MOVE_COST
to avoid spectacularly poor register class preferencing. */
- if (mode == XFmode)
+ if (mode == XFmode || mode == RFmode)
{
if (to != GR_REGS || from != GR_REGS)
return MEMORY_MOVE_COST (mode, to, 0);
case GR_REGS:
case FR_REGS:
+ case FP_REGS:
case GR_AND_FR_REGS:
case GR_AND_BR_REGS:
case ALL_REGS:
switch (class)
{
case FR_REGS:
+ case FP_REGS:
/* Don't allow volatile mem reloads into floating point registers.
This is defined to force reload to choose the r/m case instead
of the f/f case when reloading (set (reg fX) (mem/v)). */
break;
case FR_REGS:
+ case FP_REGS:
/* Need to go through general registers to get to other class regs. */
if (regno >= 0 && ! (FR_REGNO_P (regno) || GENERAL_REGNO_P (regno)))
return GR_REGS;
return;
case IF_THEN_ELSE:
- if (SET_DEST (x) == pc_rtx)
- /* X is a conditional branch. */
- return;
- else
- {
- /* X is a conditional move. */
- rtx cond = XEXP (src, 0);
- cond = XEXP (cond, 0);
-
- /* We always split conditional moves into COND_EXEC patterns, so the
- only pattern that can reach here is doloop_end_internal. We don't
- need to do anything special for this pattern. */
- gcc_assert (GET_CODE (cond) == REG && REGNO (cond) == AR_LC_REGNUM);
- return;
- }
+ /* There are three cases here:
+ (1) The destination is (pc), in which case this is a branch,
+ nothing here applies.
+ (2) The destination is ar.lc, in which case this is a
+ doloop_end_internal.
+ (3) The destination is an fp register, in which case this is
+ an fselect instruction.
+ In all cases, nothing we do in this function applies. */
+ return;
default:
if (COMPARISON_P (src)
- && GET_MODE_CLASS (GET_MODE (XEXP (src, 0))) == MODE_FLOAT)
+ && SCALAR_FLOAT_MODE_P (GET_MODE (XEXP (src, 0))))
/* Set pflags->is_fp to 1 so that we know we're dealing
with a floating point comparison when processing the
destination of the SET. */
{
for (link = INSN_DEPEND (insn); link != 0; link = XEXP (link, 1))
{
+ enum attr_itanium_class c;
+
if (REG_NOTE_KIND (link) != REG_DEP_TRUE)
continue;
next = XEXP (link, 0);
- if ((ia64_safe_itanium_class (next) == ITANIUM_CLASS_ST
- || ia64_safe_itanium_class (next) == ITANIUM_CLASS_STF)
+ c = ia64_safe_itanium_class (next);
+ if ((c == ITANIUM_CLASS_ST
+ || c == ITANIUM_CLASS_STF)
&& ia64_st_address_bypass_p (insn, next))
break;
- else if ((ia64_safe_itanium_class (next) == ITANIUM_CLASS_LD
- || ia64_safe_itanium_class (next)
- == ITANIUM_CLASS_FLD)
+ else if ((c == ITANIUM_CLASS_LD
+ || c == ITANIUM_CLASS_FLD
+ || c == ITANIUM_CLASS_FLDP)
&& ia64_ld_address_bypass_p (insn, next))
break;
}
\f
-/* If the following function returns TRUE, we will use the the DFA
+/* If the following function returns TRUE, we will use the DFA
insn scheduler. */
static int
&& NOTE_LINE_NUMBER (NEXT_INSN (head)) == NOTE_INSN_BASIC_BLOCK)
head = NEXT_INSN (head);
- for (r = PR_REG (0); r < PR_REG (64); r += 2)
- if (REGNO_REG_SET_P (bb->global_live_at_start, r))
+ /* Skip p0, which may be thought to be live due to (reg:DI p0)
+ grabbing the entire block of predicate registers. */
+ for (r = PR_REG (2); r < PR_REG (64); r += 2)
+ if (REGNO_REG_SET_P (bb->il.rtl->global_live_at_start, r))
{
rtx p = gen_rtx_REG (BImode, r);
rtx n = emit_insn_after (gen_pred_rel_mutex (p), head);
non-optimizing bootstrap. */
update_life_info (NULL, UPDATE_LIFE_GLOBAL_RM_NOTES, PROP_DEATH_NOTES);
- if (ia64_flag_schedule_insns2)
+ if (optimize && ia64_flag_schedule_insns2)
{
timevar_push (TV_SCHED2);
ia64_final_schedule = 1;
/* The __fpreg type. */
fpreg_type = make_node (REAL_TYPE);
- /* ??? The back end should know to load/save __fpreg variables using
- the ldf.fill and stf.spill instructions. */
- TYPE_PRECISION (fpreg_type) = 80;
+ TYPE_PRECISION (fpreg_type) = 82;
layout_type (fpreg_type);
(*lang_hooks.types.register_builtin_type) (fpreg_type, "__fpreg");
case SFmode:
case DFmode:
case XFmode:
+ case RFmode:
return true;
case TFmode:
}
}
+/* Implement the FUNCTION_PROFILER macro. */
+
+void
+ia64_output_function_profiler (FILE *file, int labelno)
+{
+ bool indirect_call;
+
+ /* If the function needs a static chain and the static chain
+ register is r15, we use an indirect call so as to bypass
+ the PLT stub in case the executable is dynamically linked,
+ because the stub clobbers r15 as per 5.3.6 of the psABI.
+ We don't need to do that in non-canonical PIC mode. */
+
+ if (cfun->static_chain_decl && !TARGET_NO_PIC && !TARGET_AUTO_PIC)
+ {
+ gcc_assert (STATIC_CHAIN_REGNUM == 15);
+ indirect_call = true;
+ }
+ else
+ indirect_call = false;
+
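+ /* Emit the prologue: save ar.pfs in r40 (out0) and allocate four
+ output registers for the _mcount arguments. */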
+ if (TARGET_GNU_AS)
+ fputs ("\t.prologue 4, r40\n", file);
+ else
+ fputs ("\t.prologue\n\t.save ar.pfs, r40\n", file);
+ fputs ("\talloc out0 = ar.pfs, 8, 0, 4, 0\n", file);
+
+ if (NO_PROFILE_COUNTERS)
+ fputs ("\tmov out3 = r0\n", file);
+ else
+ {
+ char buf[20];
+ ASM_GENERATE_INTERNAL_LABEL (buf, "LP", labelno);
+
+ if (TARGET_AUTO_PIC)
+ fputs ("\tmovl out3 = @gprel(", file);
+ else
+ fputs ("\taddl out3 = @ltoff(", file);
+ assemble_name (file, buf);
+ if (TARGET_AUTO_PIC)
+ fputs (")\n", file);
+ else
+ fputs ("), r1\n", file);
+ }
+
+ if (indirect_call)
+ fputs ("\taddl r14 = @ltoff(@fptr(_mcount)), r1\n", file);
+ fputs ("\t;;\n", file);
+
+ fputs ("\t.save rp, r42\n", file);
+ fputs ("\tmov out2 = b0\n", file);
+ if (indirect_call)
+ fputs ("\tld8 r14 = [r14]\n\t;;\n", file);
+ fputs ("\t.body\n", file);
+ fputs ("\tmov out1 = r1\n", file);
+ if (indirect_call)
+ {
+ fputs ("\tld8 r16 = [r14], 8\n\t;;\n", file);
+ fputs ("\tmov b6 = r16\n", file);
+ fputs ("\tld8 r1 = [r14]\n", file);
+ fputs ("\tbr.call.sptk.many b0 = b6\n\t;;\n", file);
+ }
+ else
+ fputs ("\tbr.call.sptk.many b0 = _mcount\n\t;;\n", file);
+}
+
+static GTY(()) rtx mcount_func_rtx;
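+
+/* Return the SYMBOL_REF for _mcount, creating it on first use. */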
+static rtx
+gen_mcount_func_rtx (void)
+{
+ if (!mcount_func_rtx)
+ mcount_func_rtx = init_one_libfunc ("_mcount");
+ return mcount_func_rtx;
+}
+
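+/* Emit the profiling call to _mcount, passing the caller's return
+ address (b0), the current IP, and the per-call counter label (or
+ zero when NO_PROFILE_COUNTERS). */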
+void
+ia64_profile_hook (int labelno)
+{
+ rtx label, ip;
+
+ if (NO_PROFILE_COUNTERS)
+ label = const0_rtx;
+ else
+ {
+ char buf[30];
+ const char *label_name;
+ ASM_GENERATE_INTERNAL_LABEL (buf, "LP", labelno);
+ label_name = (*targetm.strip_name_encoding) (ggc_strdup (buf));
+ label = gen_rtx_SYMBOL_REF (Pmode, label_name);
+ SYMBOL_REF_FLAGS (label) = SYMBOL_FLAG_LOCAL;
+ }
+ ip = gen_reg_rtx (Pmode);
+ emit_insn (gen_ip_value (ip));
+ emit_library_call (gen_mcount_func_rtx (), LCT_NORMAL,
+ VOIDmode, 3,
+ gen_rtx_REG (Pmode, BR_REG (0)), Pmode,
+ ip, Pmode,
+ label, Pmode);
+}
+
+/* Return the mangling of TYPE if it is an extended fundamental type. */
+
+static const char *
+ia64_mangle_fundamental_type (tree type)
+{
+ /* On HP-UX, "long double" is mangled as "e" so __float128 is
+ mangled as "e". */
+ if (!TARGET_HPUX && TYPE_MODE (type) == TFmode)
+ return "g";
+ /* On HP-UX, "e" is not available as a mangling of __float80 so use
+ an extended mangling. Elsewhere, "e" is available since long
+ double is 80 bits. */
+ if (TYPE_MODE (type) == XFmode)
+ return TARGET_HPUX ? "u9__float80" : "e";
+ if (TYPE_MODE (type) == RFmode)
+ return "u7__fpreg";
+ return NULL;
+}
+
+/* Return the diagnostic message string if conversion from FROMTYPE to
+ TOTYPE is not allowed, NULL otherwise. */
+static const char *
+ia64_invalid_conversion (tree fromtype, tree totype)
+{
+ /* Reject nontrivial conversion to or from __fpreg. */
+ if (TYPE_MODE (fromtype) == RFmode
+ && TYPE_MODE (totype) != RFmode
+ && TYPE_MODE (totype) != VOIDmode)
+ return N_("invalid conversion from %<__fpreg%>");
+ if (TYPE_MODE (totype) == RFmode
+ && TYPE_MODE (fromtype) != RFmode)
+ return N_("invalid conversion to %<__fpreg%>");
+ return NULL;
+}
+
+/* Return the diagnostic message string if the unary operation OP is
+ not permitted on TYPE, NULL otherwise. */
+static const char *
+ia64_invalid_unary_op (int op, tree type)
+{
+ /* Reject operations on __fpreg other than unary + or &. */
+ if (TYPE_MODE (type) == RFmode
+ && op != CONVERT_EXPR
+ && op != ADDR_EXPR)
+ return N_("invalid operation on %<__fpreg%>");
+ return NULL;
+}
+
+/* Return the diagnostic message string if the binary operation OP is
+ not permitted on TYPE1 and TYPE2, NULL otherwise. */
+static const char *
+ia64_invalid_binary_op (int op ATTRIBUTE_UNUSED, tree type1, tree type2)
+{
+ /* Reject operations on __fpreg. */
+ if (TYPE_MODE (type1) == RFmode || TYPE_MODE (type2) == RFmode)
+ return N_("invalid operation on %<__fpreg%>");
+ return NULL;
+}
+
#include "gt-ia64.h"