* config/ia64/ia64.c (ia64_load_pair_ok): Return 0 if the second member

[pf3gnuchains/gcc-fork.git] / gcc / config / ia64 / ia64.c
diff --git a/gcc/config/ia64/ia64.c b/gcc/config/ia64/ia64.c

index a007743..77be757 100644 (file)
--- a/gcc/config/ia64/ia64.c
+++ b/gcc/config/ia64/ia64.c
@@ -1,6 +1,6 @@
  /* Definitions of target machine for GNU compiler.
     Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008,
-   2009, 2010
+   2009, 2010, 2011
     Free Software Foundation, Inc.
     Contributed by James E. Wilson <wilson@cygnus.com> and
                   David Mosberger <davidm@hpl.hp.com>.
@@ -43,7 +43,6 @@ along with GCC; see the file COPYING3.  If not see
  #include "basic-block.h"
  #include "libfuncs.h"
  #include "diagnostic-core.h"
-#include "toplev.h"
  #include "sched-int.h"
  #include "timevar.h"
  #include "target.h"
@@ -61,6 +60,8 @@ along with GCC; see the file COPYING3.  If not see
  #include "tm-constrs.h"
  #include "sel-sched.h"
  #include "reload.h"
+#include "dwarf2out.h"
+#include "opts.h"
  
  /* This is used for communication between ASM_OUTPUT_LABEL and
     ASM_OUTPUT_LABELREF.  */
@@ -102,17 +103,6 @@ static const char * const ia64_local_reg_names[80] =
  static const char * const ia64_output_reg_names[8] =
  { "out0", "out1", "out2", "out3", "out4", "out5", "out6", "out7" };
  
-/* Which cpu are we scheduling for.  */
-enum processor_type ia64_tune = PROCESSOR_ITANIUM2;
-
-/* Determines whether we run our final scheduling pass or not.  We always
-   avoid the normal second scheduling pass.  */
-static int ia64_flag_schedule_insns2;
-
-/* Determines whether we run variable tracking in machine dependent
-   reorganization.  */
-static int ia64_flag_var_tracking;
-
  /* Variables which are this size or smaller are put in the sdata/sbss
     sections.  */
  
@@ -202,20 +192,19 @@ static rtx gen_fr_spill_x (rtx, rtx, rtx);
  static rtx gen_fr_restore_x (rtx, rtx, rtx);
  
  static void ia64_option_override (void);
-static void ia64_option_default_params (void);
  static bool ia64_can_eliminate (const int, const int);
  static enum machine_mode hfa_element_mode (const_tree, bool);
-static void ia64_setup_incoming_varargs (CUMULATIVE_ARGS *, enum machine_mode,
+static void ia64_setup_incoming_varargs (cumulative_args_t, enum machine_mode,
                                          tree, int *, int);
-static int ia64_arg_partial_bytes (CUMULATIVE_ARGS *, enum machine_mode,
+static int ia64_arg_partial_bytes (cumulative_args_t, enum machine_mode,
                                    tree, bool);
-static rtx ia64_function_arg_1 (const CUMULATIVE_ARGS *, enum machine_mode,
+static rtx ia64_function_arg_1 (cumulative_args_t, enum machine_mode,
                                 const_tree, bool, bool);
-static rtx ia64_function_arg (CUMULATIVE_ARGS *, enum machine_mode,
+static rtx ia64_function_arg (cumulative_args_t, enum machine_mode,
                               const_tree, bool);
-static rtx ia64_function_incoming_arg (CUMULATIVE_ARGS *,
+static rtx ia64_function_incoming_arg (cumulative_args_t,
                                        enum machine_mode, const_tree, bool);
-static void ia64_function_arg_advance (CUMULATIVE_ARGS *, enum machine_mode,
+static void ia64_function_arg_advance (cumulative_args_t, enum machine_mode,
                                        const_tree, bool);
  static unsigned int ia64_function_arg_boundary (enum machine_mode,
                                                 const_tree);
@@ -228,10 +217,9 @@ static int ia64_register_move_cost (enum machine_mode, reg_class_t,
                                      reg_class_t);
  static int ia64_memory_move_cost (enum machine_mode mode, reg_class_t,
                                   bool);
-static bool ia64_rtx_costs (rtx, int, int, int *, bool);
+static bool ia64_rtx_costs (rtx, int, int, int, int *, bool);
  static int ia64_unspec_may_trap_p (const_rtx, unsigned);
  static void fix_range (const char *);
-static bool ia64_handle_option (size_t, const char *, int);
  static struct machine_function * ia64_init_machine_status (void);
  static void emit_insn_group_barriers (FILE *);
  static void emit_all_insn_group_barriers (FILE *);
@@ -246,6 +234,10 @@ static void ia64_output_function_prologue (FILE *, HOST_WIDE_INT);
  static void ia64_output_function_epilogue (FILE *, HOST_WIDE_INT);
  static void ia64_output_function_end_prologue (FILE *);
  
+static void ia64_print_operand (FILE *, rtx, int);
+static void ia64_print_operand_address (FILE *, rtx);
+static bool ia64_print_operand_punct_valid_p (unsigned char code);
+
  static int ia64_issue_rate (void);
  static int ia64_adjust_cost_2 (rtx, int, rtx, int, dw_t);
  static void ia64_sched_init (FILE *, int, int);
@@ -262,7 +254,6 @@ static void ia64_asm_emit_except_personality (rtx);
  static void ia64_asm_init_sections (void);
  
  static enum unwind_info_type ia64_debug_unwind_info (void);
-static enum unwind_info_type ia64_except_unwind_info (void);
  
  static struct bundle_state *get_free_bundle_state (void);
  static void free_bundle_state (struct bundle_state *);
@@ -317,52 +308,59 @@ static rtx ia64_struct_value_rtx (tree, int);
  static tree ia64_gimplify_va_arg (tree, tree, gimple_seq *, gimple_seq *);
  static bool ia64_scalar_mode_supported_p (enum machine_mode mode);
  static bool ia64_vector_mode_supported_p (enum machine_mode mode);
-static bool ia64_cannot_force_const_mem (rtx);
+static bool ia64_legitimate_constant_p (enum machine_mode, rtx);
+static bool ia64_legitimate_address_p (enum machine_mode, rtx, bool);
+static bool ia64_cannot_force_const_mem (enum machine_mode, rtx);
  static const char *ia64_mangle_type (const_tree);
  static const char *ia64_invalid_conversion (const_tree, const_tree);
  static const char *ia64_invalid_unary_op (int, const_tree);
  static const char *ia64_invalid_binary_op (int, const_tree, const_tree);
  static enum machine_mode ia64_c_mode_for_suffix (char);
-static enum machine_mode ia64_promote_function_mode (const_tree,
-                                                    enum machine_mode,
-                                                    int *,
-                                                    const_tree,
-                                                    int);
  static void ia64_trampoline_init (rtx, tree, rtx);
  static void ia64_override_options_after_change (void);
  
-static void ia64_dwarf_handle_frame_unspec (const char *, rtx, int);
  static tree ia64_builtin_decl (unsigned, bool);
  
  static reg_class_t ia64_preferred_reload_class (rtx, reg_class_t);
  static enum machine_mode ia64_get_reg_raw_mode (int regno);
  static section * ia64_hpux_function_section (tree, enum node_frequency,
                                              bool, bool);
+
+static bool ia64_vectorize_vec_perm_const_ok (enum machine_mode vmode,
+                                             const unsigned char *sel);
+
+#define MAX_VECT_LEN   8
+
+/* Argument block handed to ia64_expand_vec_perm_const_1.  The field
+   notes below are inferred from how ia64_unpack_assemble fills the
+   structure in; confirm against the expander itself.  */
+struct expand_vec_perm_d
+{
+  /* Result vector and the two input vectors.  */
+  rtx target, op0, op1;
+  /* Per-element selector.  ia64_unpack_assemble stores values in
+     [0, 2 * nelt) here; presumably indices >= nelt pick from OP1.  */
+  unsigned char perm[MAX_VECT_LEN];
+  /* Vector mode used for the operands, and its element count
+     (ia64_unpack_assemble sets the latter from GET_MODE_NUNITS).  */
+  enum machine_mode vmode;
+  unsigned char nelt;
+  /* NOTE(review): presumably true when only one input vector is
+     involved; ia64_unpack_assemble passes two and sets it false.  */
+  bool one_operand_p;
+  /* NOTE(review): presumably asks for a validity check only;
+     ia64_unpack_assemble wants the expansion done and sets it false.  */
+  bool testing_p; 
+};
+
+static bool ia64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d);
+
  \f
  /* Table of valid machine attributes.  */
  static const struct attribute_spec ia64_attribute_table[] =
  {
-  /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler } */
-  { "syscall_linkage", 0, 0, false, true,  true,  NULL },
-  { "model",          1, 1, true, false, false, ia64_handle_model_attribute },
+  /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
+       affects_type_identity } */
+  { "syscall_linkage", 0, 0, false, true,  true,  NULL, false },
+  { "model",          1, 1, true, false, false, ia64_handle_model_attribute,
+    false },
  #if TARGET_ABI_OPEN_VMS
-  { "common_object",   1, 1, true, false, false, ia64_vms_common_object_attribute},
+  { "common_object",   1, 1, true, false, false,
+    ia64_vms_common_object_attribute, false },
  #endif
    { "version_id",      1, 1, true, false, false,
-    ia64_handle_version_id_attribute },
-  { NULL,             0, 0, false, false, false, NULL }
+    ia64_handle_version_id_attribute, false },
+  { NULL,             0, 0, false, false, false, NULL, false }
  };
  
-/* Implement overriding of the optimization options.  */
-static const struct default_options ia64_option_optimization_table[] =
-  {
-    { OPT_LEVELS_1_PLUS, OPT_fomit_frame_pointer, NULL, 1 },
-#ifdef SUBTARGET_OPTIMIZATION_OPTIONS
-    SUBTARGET_OPTIMIZATION_OPTIONS,
-#endif
-    { OPT_LEVELS_NONE, 0, NULL, 0 }
-  };
-
  /* Initialize the GCC target structure.  */
  #undef TARGET_ATTRIBUTE_TABLE
  #define TARGET_ATTRIBUTE_TABLE ia64_attribute_table
@@ -395,10 +393,6 @@ static const struct default_options ia64_option_optimization_table[] =
  
  #undef TARGET_OPTION_OVERRIDE
  #define TARGET_OPTION_OVERRIDE ia64_option_override
-#undef TARGET_OPTION_OPTIMIZATION_TABLE
-#define TARGET_OPTION_OPTIMIZATION_TABLE ia64_option_optimization_table
-#undef TARGET_OPTION_DEFAULT_PARAMS
-#define TARGET_OPTION_DEFAULT_PARAMS ia64_option_default_params
  
  #undef TARGET_ASM_FUNCTION_PROLOGUE
  #define TARGET_ASM_FUNCTION_PROLOGUE ia64_output_function_prologue
@@ -407,6 +401,13 @@ static const struct default_options ia64_option_optimization_table[] =
  #undef TARGET_ASM_FUNCTION_EPILOGUE
  #define TARGET_ASM_FUNCTION_EPILOGUE ia64_output_function_epilogue
  
+#undef TARGET_PRINT_OPERAND
+#define TARGET_PRINT_OPERAND ia64_print_operand
+#undef TARGET_PRINT_OPERAND_ADDRESS
+#define TARGET_PRINT_OPERAND_ADDRESS ia64_print_operand_address
+#undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
+#define TARGET_PRINT_OPERAND_PUNCT_VALID_P ia64_print_operand_punct_valid_p
+
  #undef TARGET_IN_SMALL_DATA_P
  #define TARGET_IN_SMALL_DATA_P  ia64_in_small_data_p
  
@@ -540,9 +541,6 @@ static const struct default_options ia64_option_optimization_table[] =
  #define TARGET_ASM_OUTPUT_DWARF_DTPREL ia64_output_dwarf_dtprel
  #endif
  
-#undef TARGET_PROMOTE_FUNCTION_MODE
-#define TARGET_PROMOTE_FUNCTION_MODE ia64_promote_function_mode
-
  /* ??? Investigate.  */
  #if 0
  #undef TARGET_PROMOTE_PROTOTYPES
@@ -574,8 +572,6 @@ static const struct default_options ia64_option_optimization_table[] =
  #undef TARGET_GIMPLIFY_VA_ARG_EXPR
  #define TARGET_GIMPLIFY_VA_ARG_EXPR ia64_gimplify_va_arg
  
-#undef TARGET_DWARF_HANDLE_FRAME_UNSPEC
-#define TARGET_DWARF_HANDLE_FRAME_UNSPEC  ia64_dwarf_handle_frame_unspec
  #undef TARGET_ASM_UNWIND_EMIT
  #define TARGET_ASM_UNWIND_EMIT ia64_asm_unwind_emit
  #undef TARGET_ASM_EMIT_EXCEPT_PERSONALITY
@@ -585,8 +581,6 @@ static const struct default_options ia64_option_optimization_table[] =
  
  #undef TARGET_DEBUG_UNWIND_INFO
  #define TARGET_DEBUG_UNWIND_INFO  ia64_debug_unwind_info
-#undef TARGET_EXCEPT_UNWIND_INFO
-#define TARGET_EXCEPT_UNWIND_INFO  ia64_except_unwind_info
  
  #undef TARGET_SCALAR_MODE_SUPPORTED_P
  #define TARGET_SCALAR_MODE_SUPPORTED_P ia64_scalar_mode_supported_p
@@ -598,10 +592,10 @@ static const struct default_options ia64_option_optimization_table[] =
  #undef TARGET_RELAXED_ORDERING
  #define TARGET_RELAXED_ORDERING true
  
-#undef TARGET_DEFAULT_TARGET_FLAGS
-#define TARGET_DEFAULT_TARGET_FLAGS (TARGET_DEFAULT | TARGET_CPU_DEFAULT)
-#undef TARGET_HANDLE_OPTION
-#define TARGET_HANDLE_OPTION ia64_handle_option
+#undef TARGET_LEGITIMATE_CONSTANT_P
+#define TARGET_LEGITIMATE_CONSTANT_P ia64_legitimate_constant_p
+#undef TARGET_LEGITIMATE_ADDRESS_P
+#define TARGET_LEGITIMATE_ADDRESS_P ia64_legitimate_address_p
  
  #undef TARGET_CANNOT_FORCE_CONST_MEM
  #define TARGET_CANNOT_FORCE_CONST_MEM ia64_cannot_force_const_mem
@@ -634,6 +628,17 @@ static const struct default_options ia64_option_optimization_table[] =
  #undef TARGET_PREFERRED_RELOAD_CLASS
  #define TARGET_PREFERRED_RELOAD_CLASS ia64_preferred_reload_class
  
+#undef TARGET_DELAY_SCHED2
+#define TARGET_DELAY_SCHED2 true
+
+/* Variable tracking should be run after all optimizations which
+   change order of insns.  It also needs a valid CFG.  */
+#undef TARGET_DELAY_VARTRACK
+#define TARGET_DELAY_VARTRACK true
+
+#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
+#define TARGET_VECTORIZE_VEC_PERM_CONST_OK ia64_vectorize_vec_perm_const_ok
+
  struct gcc_target targetm = TARGET_INITIALIZER;
  \f
  typedef enum
@@ -880,7 +885,11 @@ ia64_move_ok (rtx dst, rtx src)
  int
  ia64_load_pair_ok (rtx dst, rtx src)
  {
-  if (GET_CODE (dst) != REG || !FP_REGNO_P (REGNO (dst)))
+  /* ??? There is a thinko in the implementation of the "x" constraint and the
+     FP_REGS class.  The constraint will also reject (reg f30:TI) so we must
+     also return false for it.  */
+  if (GET_CODE (dst) != REG
+      || !(FP_REGNO_P (REGNO (dst)) && FP_REGNO_P (REGNO (dst) + 1)))
      return 0;
    if (GET_CODE (src) != MEM || MEM_VOLATILE_P (src))
      return 0;
@@ -948,11 +957,73 @@ tls_symbolic_operand_type (rtx addr)
    return tls_kind;
  }
  
+/* Returns true if REG (assumed to be a `reg' RTX) is valid for use
+   as a base register.  With STRICT set, REGNO_OK_FOR_BASE_P alone
+   decides.  Otherwise REG is accepted when it is a general register
+   or is not a hard register at all.  */
+
+static inline bool
+ia64_reg_ok_for_base_p (const_rtx reg, bool strict)
+{
+  if (strict
+      && REGNO_OK_FOR_BASE_P (REGNO (reg)))
+    return true;
+  else if (!strict
+          && (GENERAL_REGNO_P (REGNO (reg))
+              || !HARD_REGISTER_P (reg)))
+    return true;
+  else
+    return false;
+}
+
+/* Return true if REG is a register, or a SUBREG whose operand 0 is a
+   register, that ia64_reg_ok_for_base_p accepts under STRICT.  */
+
+static bool
+ia64_legitimate_address_reg (const_rtx reg, bool strict)
+{
+  if ((REG_P (reg) && ia64_reg_ok_for_base_p (reg, strict))
+      || (GET_CODE (reg) == SUBREG && REG_P (XEXP (reg, 0))
+         && ia64_reg_ok_for_base_p (XEXP (reg, 0), strict)))
+    return true;
+
+  return false;
+}
+
+/* Return true if DISP has the form (plus A B) where A is rtx_equal_p
+   to REG and B is either something ia64_legitimate_address_reg accepts
+   under STRICT or a CONST_INT in the range [-256, 255].  */
+
+static bool
+ia64_legitimate_address_disp (const_rtx reg, const_rtx disp, bool strict)
+{
+  if (GET_CODE (disp) == PLUS
+      && rtx_equal_p (reg, XEXP (disp, 0))
+      && (ia64_legitimate_address_reg (XEXP (disp, 1), strict)
+         || (CONST_INT_P (XEXP (disp, 1))
+             && IN_RANGE (INTVAL (XEXP (disp, 1)), -256, 255))))
+    return true;
+
+  return false;
+}
+
+/* Implement TARGET_LEGITIMATE_ADDRESS_P.  X is accepted when it is
+     - something ia64_legitimate_address_reg accepts under STRICT;
+     - POST_INC or POST_DEC of such an operand, provided the operand
+       is not arg_pointer_rtx; or
+     - POST_MODIFY of such an operand, again not arg_pointer_rtx, whose
+       second operand passes ia64_legitimate_address_disp.
+   MODE is not consulted.  */
+
+static bool
+ia64_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
+                          rtx x, bool strict)
+{
+  if (ia64_legitimate_address_reg (x, strict))
+    return true;
+  else if ((GET_CODE (x) == POST_INC || GET_CODE (x) == POST_DEC)
+          && ia64_legitimate_address_reg (XEXP (x, 0), strict)
+          && XEXP (x, 0) != arg_pointer_rtx) 
+    return true;
+  else if (GET_CODE (x) == POST_MODIFY
+          && ia64_legitimate_address_reg (XEXP (x, 0), strict)
+          && XEXP (x, 0) != arg_pointer_rtx
+          && ia64_legitimate_address_disp (XEXP (x, 0), XEXP (x, 1), strict))
+    return true;
+  else
+    return false;
+}
+
  /* Return true if X is a constant that is valid for some immediate
     field in an instruction.  */
  
-bool
-ia64_legitimate_constant_p (rtx x)
+static bool
+ia64_legitimate_constant_p (enum machine_mode mode, rtx x)
  {
    switch (GET_CODE (x))
      {
@@ -961,8 +1032,7 @@ ia64_legitimate_constant_p (rtx x)
        return true;
  
      case CONST_DOUBLE:
-      if (GET_MODE (x) == VOIDmode || GET_MODE (x) == SFmode
-         || GET_MODE (x) == DFmode)
+      if (GET_MODE (x) == VOIDmode || mode == SFmode || mode == DFmode)
         return true;
        return satisfies_constraint_G (x);
  
@@ -984,25 +1054,21 @@ ia64_legitimate_constant_p (rtx x)
               op = XEXP (XEXP (op, 0), 0);
             }
  
-          if (any_offset_symbol_operand (op, GET_MODE (op))
-              || function_operand (op, GET_MODE (op)))
+          if (any_offset_symbol_operand (op, mode)
+              || function_operand (op, mode))
              return true;
-         if (aligned_offset_symbol_operand (op, GET_MODE (op)))
+         if (aligned_offset_symbol_operand (op, mode))
             return (addend & 0x3fff) == 0;
           return false;
         }
        return false;
  
      case CONST_VECTOR:
-      {
-       enum machine_mode mode = GET_MODE (x);
+      if (mode == V2SFmode)
+       return satisfies_constraint_Y (x);
  
-       if (mode == V2SFmode)
-         return satisfies_constraint_Y (x);
-
-       return (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
-               && GET_MODE_SIZE (mode) <= 8);
-      }
+      return (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+             && GET_MODE_SIZE (mode) <= 8);
  
      default:
        return false;
@@ -1012,9 +1078,9 @@ ia64_legitimate_constant_p (rtx x)
  /* Don't allow TLS addresses to get spilled to memory.  */
  
  static bool
-ia64_cannot_force_const_mem (rtx x)
+ia64_cannot_force_const_mem (enum machine_mode mode, rtx x)
  {
-  if (GET_MODE (x) == RFmode)
+  if (mode == RFmode)
      return true;
    return tls_symbolic_operand_type (x) != 0;
  }
@@ -1074,7 +1140,7 @@ ia64_expand_load_address (rtx dest, rtx src)
        tmp = gen_rtx_PLUS (Pmode, tmp, pic_offset_table_rtx);
        emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
  
-      tmp = gen_rtx_LO_SUM (Pmode, dest, src);
+      tmp = gen_rtx_LO_SUM (Pmode, gen_const_mem (Pmode, dest), src);
        emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
  
        if (addend)
@@ -1972,58 +2038,88 @@ ia64_expand_vecint_minmax (enum rtx_code code, enum machine_mode mode,
    return true;
  }
  
-/* Emit an integral vector widening sum operations.  */
+/* The vectors LO and HI each contain N halves of a double-wide vector.
+   Reassemble either the first N/2 or the second N/2 elements.  */
  
  void
-ia64_expand_widen_sum (rtx operands[3], bool unsignedp)
+ia64_unpack_assemble (rtx out, rtx lo, rtx hi, bool highp)
  {
-  rtx l, h, x, s;
-  enum machine_mode wmode, mode;
-  rtx (*unpack_l) (rtx, rtx, rtx);
-  rtx (*unpack_h) (rtx, rtx, rtx);
-  rtx (*plus) (rtx, rtx, rtx);
+  enum machine_mode vmode = GET_MODE (lo);
+  unsigned int i, high, nelt = GET_MODE_NUNITS (vmode);
+  struct expand_vec_perm_d d;
+  bool ok;
  
-  wmode = GET_MODE (operands[0]);
-  mode = GET_MODE (operands[1]);
+  d.target = gen_lowpart (vmode, out);
+  d.op0 = (TARGET_BIG_ENDIAN ? hi : lo);
+  d.op1 = (TARGET_BIG_ENDIAN ? lo : hi);
+  d.vmode = vmode;
+  d.nelt = nelt;
+  d.one_operand_p = false;
+  d.testing_p = false;
  
-  switch (mode)
+  high = (highp ? nelt / 2 : 0);
+  for (i = 0; i < nelt / 2; ++i)
      {
-    case V8QImode:
-      unpack_l = gen_unpack1_l;
-      unpack_h = gen_unpack1_h;
-      plus = gen_addv4hi3;
-      break;
-    case V4HImode:
-      unpack_l = gen_unpack2_l;
-      unpack_h = gen_unpack2_h;
-      plus = gen_addv2si3;
-      break;
-    default:
-      gcc_unreachable ();
+      d.perm[i * 2] = i + high;
+      d.perm[i * 2 + 1] = i + high + nelt;
      }
  
-  /* Fill in x with the sign extension of each element in op1.  */
+  ok = ia64_expand_vec_perm_const_1 (&d);
+  gcc_assert (ok);
+}
+
+/* Return a vector of the sign-extension of VEC.  */
+
+static rtx
+ia64_unpack_sign (rtx vec, bool unsignedp)
+{
+  enum machine_mode mode = GET_MODE (vec);
+  rtx zero = CONST0_RTX (mode);
+
    if (unsignedp)
-    x = CONST0_RTX (mode);
+    return zero;
    else
      {
+      rtx sign = gen_reg_rtx (mode);
        bool neg;
  
-      x = gen_reg_rtx (mode);
-
-      neg = ia64_expand_vecint_compare (LT, mode, x, operands[1],
-                                       CONST0_RTX (mode));
+      neg = ia64_expand_vecint_compare (LT, mode, sign, vec, zero);
        gcc_assert (!neg);
+
+      return sign;
      }
+}
+
+/* Emit an integral vector unpack operation.  OPERANDS[0] is the
+   destination and OPERANDS[1] the source vector; OPERANDS[2] is not
+   referenced.  The source is combined with the vector returned by
+   ia64_unpack_sign (all zeros when UNSIGNEDP), and HIGHP is forwarded
+   to ia64_unpack_assemble to choose which half is produced.  */
+
+void
+ia64_expand_unpack (rtx operands[3], bool unsignedp, bool highp)
+{
+  rtx sign = ia64_unpack_sign (operands[1], unsignedp);
+  ia64_unpack_assemble (operands[0], operands[1], sign, highp);
+}
+
+/* Emit an integral vector widening sum operation.  */
  
+void
+ia64_expand_widen_sum (rtx operands[3], bool unsignedp)
+{
+  enum machine_mode wmode;
+  rtx l, h, t, sign;
+
+  sign = ia64_unpack_sign (operands[1], unsignedp);
+
+  wmode = GET_MODE (operands[0]);
    l = gen_reg_rtx (wmode);
    h = gen_reg_rtx (wmode);
-  s = gen_reg_rtx (wmode);
  
-  emit_insn (unpack_l (gen_lowpart (mode, l), operands[1], x));
-  emit_insn (unpack_h (gen_lowpart (mode, h), operands[1], x));
-  emit_insn (plus (s, l, operands[2]));
-  emit_insn (plus (operands[0], h, s));
+  ia64_unpack_assemble (l, operands[1], sign, false);
+  ia64_unpack_assemble (h, operands[1], sign, true);
+
+  t = expand_binop (wmode, add_optab, l, operands[2], NULL, 0, OPTAB_DIRECT);
+  t = expand_binop (wmode, add_optab, h, t, operands[0], 0, OPTAB_DIRECT);
+  if (t != operands[0])
+    emit_move_insn (operands[0], t);
  }
  
  /* Emit a signed or unsigned V8QI dot product operation.  */
@@ -2031,44 +2127,31 @@ ia64_expand_widen_sum (rtx operands[3], bool unsignedp)
  void
  ia64_expand_dot_prod_v8qi (rtx operands[4], bool unsignedp)
  {
-  rtx l1, l2, h1, h2, x1, x2, p1, p2, p3, p4, s1, s2, s3;
+  rtx op1, op2, sn1, sn2, l1, l2, h1, h2;
+  rtx p1, p2, p3, p4, s1, s2, s3;
  
-  /* Fill in x1 and x2 with the sign extension of each element.  */
-  if (unsignedp)
-    x1 = x2 = CONST0_RTX (V8QImode);
-  else
-    {
-      bool neg;
-
-      x1 = gen_reg_rtx (V8QImode);
-      x2 = gen_reg_rtx (V8QImode);
-
-      neg = ia64_expand_vecint_compare (LT, V8QImode, x1, operands[1],
-                                       CONST0_RTX (V8QImode));
-      gcc_assert (!neg);
-      neg = ia64_expand_vecint_compare (LT, V8QImode, x2, operands[2],
-                                       CONST0_RTX (V8QImode));
-      gcc_assert (!neg);
-    }
+  op1 = operands[1];
+  op2 = operands[2];
+  sn1 = ia64_unpack_sign (op1, unsignedp);
+  sn2 = ia64_unpack_sign (op2, unsignedp);
  
    l1 = gen_reg_rtx (V4HImode);
    l2 = gen_reg_rtx (V4HImode);
    h1 = gen_reg_rtx (V4HImode);
    h2 = gen_reg_rtx (V4HImode);
-
-  emit_insn (gen_unpack1_l (gen_lowpart (V8QImode, l1), operands[1], x1));
-  emit_insn (gen_unpack1_l (gen_lowpart (V8QImode, l2), operands[2], x2));
-  emit_insn (gen_unpack1_h (gen_lowpart (V8QImode, h1), operands[1], x1));
-  emit_insn (gen_unpack1_h (gen_lowpart (V8QImode, h2), operands[2], x2));
+  ia64_unpack_assemble (l1, op1, sn1, false);
+  ia64_unpack_assemble (l2, op2, sn2, false);
+  ia64_unpack_assemble (h1, op1, sn1, true);
+  ia64_unpack_assemble (h2, op2, sn2, true);
  
    p1 = gen_reg_rtx (V2SImode);
    p2 = gen_reg_rtx (V2SImode);
    p3 = gen_reg_rtx (V2SImode);
    p4 = gen_reg_rtx (V2SImode);
-  emit_insn (gen_pmpy2_r (p1, l1, l2));
-  emit_insn (gen_pmpy2_l (p2, l1, l2));
-  emit_insn (gen_pmpy2_r (p3, h1, h2));
-  emit_insn (gen_pmpy2_l (p4, h1, h2));
+  emit_insn (gen_pmpy2_even (p1, l1, l2));
+  emit_insn (gen_pmpy2_even (p2, h1, h2));
+  emit_insn (gen_pmpy2_odd (p3, l1, l2));
+  emit_insn (gen_pmpy2_odd (p4, h1, h2));
  
    s1 = gen_reg_rtx (V2SImode);
    s2 = gen_reg_rtx (V2SImode);
@@ -2276,7 +2359,7 @@ ia64_split_call (rtx retval, rtx addr, rtx retaddr, rtx scratch_r,
  
  void
  ia64_expand_atomic_op (enum rtx_code code, rtx mem, rtx val,
-                      rtx old_dst, rtx new_dst)
+                      rtx old_dst, rtx new_dst, enum memmodel model)
  {
    enum machine_mode mode = GET_MODE (mem);
    rtx old_reg, new_reg, cmp_reg, ar_ccv, label;
@@ -2293,12 +2376,31 @@ ia64_expand_atomic_op (enum rtx_code code, rtx mem, rtx val,
        if (!old_dst)
          old_dst = gen_reg_rtx (mode);
  
-      emit_insn (gen_memory_barrier ());
+      switch (model)
+       {
+       case MEMMODEL_ACQ_REL:
+       case MEMMODEL_SEQ_CST:
+         emit_insn (gen_memory_barrier ());
+         /* FALLTHRU */
+       case MEMMODEL_RELAXED:
+       case MEMMODEL_ACQUIRE:
+       case MEMMODEL_CONSUME:
+         if (mode == SImode)
+           icode = CODE_FOR_fetchadd_acq_si;
+         else
+           icode = CODE_FOR_fetchadd_acq_di;
+         break;
+       case MEMMODEL_RELEASE:
+         if (mode == SImode)
+           icode = CODE_FOR_fetchadd_rel_si;
+         else
+           icode = CODE_FOR_fetchadd_rel_di;
+         break;
+
+       default:
+         gcc_unreachable ();
+       }
  
-      if (mode == SImode)
-       icode = CODE_FOR_fetchadd_acq_si;
-      else
-       icode = CODE_FOR_fetchadd_acq_di;
        emit_insn (GEN_FCN (icode) (old_dst, mem, val));
  
        if (new_dst)
@@ -2312,8 +2414,12 @@ ia64_expand_atomic_op (enum rtx_code code, rtx mem, rtx val,
      }
  
    /* Because of the volatile mem read, we get an ld.acq, which is the
-     front half of the full barrier.  The end half is the cmpxchg.rel.  */
-  gcc_assert (MEM_VOLATILE_P (mem));
+     front half of the full barrier.  The end half is the cmpxchg.rel.
+     For relaxed and release memory models, we don't need this.  But we
+     also don't bother trying to prevent it either.  */
+  gcc_assert (model == MEMMODEL_RELAXED
+             || model == MEMMODEL_RELEASE
+             || MEM_VOLATILE_P (mem));
  
    old_reg = gen_reg_rtx (DImode);
    cmp_reg = gen_reg_rtx (DImode);
@@ -2352,12 +2458,36 @@ ia64_expand_atomic_op (enum rtx_code code, rtx mem, rtx val,
    if (new_dst)
      emit_move_insn (new_dst, new_reg);
  
-  switch (mode)
+  switch (model)
      {
-    case QImode:  icode = CODE_FOR_cmpxchg_rel_qi;  break;
-    case HImode:  icode = CODE_FOR_cmpxchg_rel_hi;  break;
-    case SImode:  icode = CODE_FOR_cmpxchg_rel_si;  break;
-    case DImode:  icode = CODE_FOR_cmpxchg_rel_di;  break;
+    case MEMMODEL_RELAXED:
+    case MEMMODEL_ACQUIRE:
+    case MEMMODEL_CONSUME:
+      switch (mode)
+       {
+       case QImode: icode = CODE_FOR_cmpxchg_acq_qi;  break;
+       case HImode: icode = CODE_FOR_cmpxchg_acq_hi;  break;
+       case SImode: icode = CODE_FOR_cmpxchg_acq_si;  break;
+       case DImode: icode = CODE_FOR_cmpxchg_acq_di;  break;
+       default:
+         gcc_unreachable ();
+       }
+      break;
+
+    case MEMMODEL_RELEASE:
+    case MEMMODEL_ACQ_REL:
+    case MEMMODEL_SEQ_CST:
+      switch (mode)
+       {
+       case QImode: icode = CODE_FOR_cmpxchg_rel_qi;  break;
+       case HImode: icode = CODE_FOR_cmpxchg_rel_hi;  break;
+       case SImode: icode = CODE_FOR_cmpxchg_rel_si;  break;
+       case DImode: icode = CODE_FOR_cmpxchg_rel_di;  break;
+       default:
+         gcc_unreachable ();
+       }
+      break;
+
      default:
        gcc_unreachable ();
      }
@@ -2372,13 +2502,6 @@ ia64_expand_atomic_op (enum rtx_code code, rtx mem, rtx val,
  static void
  ia64_file_start (void)
  {
-  /* Variable tracking should be run after all optimizations which change order
-     of insns.  It also needs a valid CFG.  This can't be done in
-     ia64_option_override, because flag_var_tracking is finalized after
-     that.  */
-  ia64_flag_var_tracking = flag_var_tracking;
-  flag_var_tracking = 0;
-
    default_file_start ();
    emit_safe_across_calls ();
  }
@@ -3165,7 +3288,7 @@ ia64_expand_prologue (void)
    ia64_compute_frame_size (get_frame_size ());
    last_scratch_gr_reg = 15;
  
-  if (flag_stack_usage)
+  if (flag_stack_usage_info)
      current_function_static_stack_size = current_frame_info.total_size;
  
    if (dump_file) 
@@ -3263,7 +3386,14 @@ ia64_expand_prologue (void)
                                    GEN_INT (current_frame_info.n_local_regs),
                                    GEN_INT (current_frame_info.n_output_regs),
                                    GEN_INT (current_frame_info.n_rotate_regs)));
-      RTX_FRAME_RELATED_P (insn) = (current_frame_info.r[reg_save_ar_pfs] != 0);
+      if (current_frame_info.r[reg_save_ar_pfs])
+       {
+         RTX_FRAME_RELATED_P (insn) = 1;
+         add_reg_note (insn, REG_CFA_REGISTER,
+                       gen_rtx_SET (VOIDmode,
+                                    ar_pfs_save_reg,
+                                    gen_rtx_REG (DImode, AR_PFS_REGNUM)));
+       }
      }
  
    /* Set up frame pointer, stack pointer, and spill iterators.  */
@@ -3451,7 +3581,8 @@ ia64_expand_prologue (void)
            reg_emitted (reg_save_b0);
           insn = emit_move_insn (alt_reg, reg);
           RTX_FRAME_RELATED_P (insn) = 1;
-         add_reg_note (insn, REG_CFA_REGISTER, NULL_RTX);
+         add_reg_note (insn, REG_CFA_REGISTER,
+                       gen_rtx_SET (VOIDmode, alt_reg, pc_rtx));
  
           /* Even if we're not going to generate an epilogue, we still
              need to save the register so that EH works.  */
@@ -3524,6 +3655,7 @@ ia64_start_function (FILE *file, const char *fnname,
  {
  #if VMS_DEBUGGING_INFO
    if (vms_debug_main
+      && debug_info_level > DINFO_LEVEL_NONE
        && strncmp (vms_debug_main, fnname, strlen (vms_debug_main)) == 0)
      {
        targetm.asm_out.globalize_label (asm_out_file, VMS_DEBUG_MAIN_POINTER);
@@ -3798,10 +3930,19 @@ ia64_expand_epilogue (int sibcall_p)
        if (current_frame_info.n_input_regs != 0)
         {
           rtx n_inputs = GEN_INT (current_frame_info.n_input_regs);
+
           insn = emit_insn (gen_alloc (gen_rtx_REG (DImode, fp),
                                 const0_rtx, const0_rtx,
                                 n_inputs, const0_rtx));
           RTX_FRAME_RELATED_P (insn) = 1;
+
+         /* ??? We need to mark the alloc as frame-related so that it gets
+            passed into ia64_asm_unwind_emit for ia64-specific unwinding.
+            But there's nothing dwarf2 related to be done wrt the register
+            windows.  If we do nothing, dwarf2out will abort on the UNSPEC;
+            the empty parallel means dwarf2out will not see anything.  */
+         add_reg_note (insn, REG_FRAME_RELATED_EXPR,
+                       gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (0)));
         }
      }
  }
@@ -3957,7 +4098,7 @@ ia64_output_function_prologue (FILE *file, HOST_WIDE_INT size ATTRIBUTE_UNUSED)
              current_frame_info.n_output_regs,
              current_frame_info.n_rotate_regs);
  
-  if (ia64_except_unwind_info () != UI_TARGET)
+  if (ia64_except_unwind_info (&global_options) != UI_TARGET)
      return;
  
    /* Emit the .prologue directive.  */
@@ -4015,7 +4156,7 @@ ia64_output_function_prologue (FILE *file, HOST_WIDE_INT size ATTRIBUTE_UNUSED)
  static void
  ia64_output_function_end_prologue (FILE *file)
  {
-  if (ia64_except_unwind_info () != UI_TARGET)
+  if (ia64_except_unwind_info (&global_options) != UI_TARGET)
      return;
  
    fputs ("\t.body\n", file);
@@ -4167,14 +4308,14 @@ ia64_trampoline_init (rtx m_tramp, tree fndecl, rtx static_chain)
     We generate the actual spill instructions during prologue generation.  */
  
  static void
-ia64_setup_incoming_varargs (CUMULATIVE_ARGS *cum, enum machine_mode mode,
+ia64_setup_incoming_varargs (cumulative_args_t cum, enum machine_mode mode,
                              tree type, int * pretend_size,
                              int second_time ATTRIBUTE_UNUSED)
  {
-  CUMULATIVE_ARGS next_cum = *cum;
+  CUMULATIVE_ARGS next_cum = *get_cumulative_args (cum);
  
    /* Skip the current argument.  */
-  ia64_function_arg_advance (&next_cum, mode, type, 1);
+  ia64_function_arg_advance (pack_cumulative_args (&next_cum), mode, type, 1);
  
    if (next_cum.words < MAX_ARGUMENT_SLOTS)
      {
@@ -4322,9 +4463,11 @@ ia64_function_arg_offset (const CUMULATIVE_ARGS *cum,
     registers.  */
  
  static rtx
-ia64_function_arg_1 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
+ia64_function_arg_1 (cumulative_args_t cum_v, enum machine_mode mode,
                      const_tree type, bool named, bool incoming)
  {
+  const CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
+
    int basereg = (incoming ? GR_ARG_FIRST : AR_ARG_FIRST);
    int words = ia64_function_arg_words (type, mode);
    int offset = ia64_function_arg_offset (cum, type, words);
@@ -4515,7 +4658,7 @@ ia64_function_arg_1 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
  /* Implement TARGET_FUNCTION_ARG target hook.  */
  
  static rtx
-ia64_function_arg (CUMULATIVE_ARGS *cum, enum machine_mode mode,
+ia64_function_arg (cumulative_args_t cum, enum machine_mode mode,
                    const_tree type, bool named)
  {
    return ia64_function_arg_1 (cum, mode, type, named, false);
@@ -4524,7 +4667,7 @@ ia64_function_arg (CUMULATIVE_ARGS *cum, enum machine_mode mode,
  /* Implement TARGET_FUNCION_INCOMING_ARG target hook.  */
  
  static rtx
-ia64_function_incoming_arg (CUMULATIVE_ARGS *cum,
+ia64_function_incoming_arg (cumulative_args_t cum,
                             enum machine_mode mode,
                             const_tree type, bool named)
  {
@@ -4536,9 +4679,11 @@ ia64_function_incoming_arg (CUMULATIVE_ARGS *cum,
     in memory.  */
  
  static int
-ia64_arg_partial_bytes (CUMULATIVE_ARGS *cum, enum machine_mode mode,
+ia64_arg_partial_bytes (cumulative_args_t cum_v, enum machine_mode mode,
                         tree type, bool named ATTRIBUTE_UNUSED)
  {
+  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
+
    int words = ia64_function_arg_words (type, mode);
    int offset = ia64_function_arg_offset (cum, type, words);
  
@@ -4577,9 +4722,10 @@ ia64_arg_type (enum machine_mode mode)
     ia64_function_arg.  */
  
  static void
-ia64_function_arg_advance (CUMULATIVE_ARGS *cum, enum machine_mode mode,
+ia64_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
                            const_tree type, bool named)
  {
+  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
    int words = ia64_function_arg_words (type, mode);
    int offset = ia64_function_arg_offset (cum, type, words);
    enum machine_mode hfa_mode = VOIDmode;
@@ -4732,12 +4878,9 @@ ia64_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
    if ((TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == INTEGER_TYPE)
        ? int_size_in_bytes (type) > 8 : TYPE_ALIGN (type) > 8 * BITS_PER_UNIT)
      {
-      tree t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (valist), valist,
-                      size_int (2 * UNITS_PER_WORD - 1));
-      t = fold_convert (sizetype, t);
+      tree t = fold_build_pointer_plus_hwi (valist, 2 * UNITS_PER_WORD - 1);
        t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
-                 size_int (-2 * UNITS_PER_WORD));
-      t = fold_convert (TREE_TYPE (valist), t);
+                 build_int_cst (TREE_TYPE (t), -2 * UNITS_PER_WORD));
        gimplify_assign (unshare_expr (valist), t, pre_p);
      }
  
@@ -4866,9 +5009,9 @@ ia64_function_value (const_tree valtype,
           return gen_rtx_PARALLEL (mode, gen_rtvec_v (i, loc));
         }
  
-      mode = ia64_promote_function_mode (valtype, mode, &unsignedp,
-                                        func ? TREE_TYPE (func) : NULL_TREE,
-                                        true);
+      mode = promote_function_mode (valtype, mode, &unsignedp,
+                                    func ? TREE_TYPE (func) : NULL_TREE,
+                                    true);
  
        return gen_rtx_REG (mode, GR_RET_FIRST);
      }
@@ -4916,7 +5059,7 @@ ia64_output_dwarf_dtprel (FILE *file, int size, rtx x)
  /* ??? Do we need this?  It gets used only for 'a' operands.  We could perhaps
     also call this from ia64_print_operand for memory addresses.  */
  
-void
+static void
  ia64_print_operand_address (FILE * stream ATTRIBUTE_UNUSED,
                             rtx address ATTRIBUTE_UNUSED)
  {
@@ -4947,7 +5090,7 @@ ia64_print_operand_address (FILE * stream ATTRIBUTE_UNUSED,
         Linux kernel.
     v    Print vector constant value as an 8-byte integer value.  */
  
-void
+static void
  ia64_print_operand (FILE * file, rtx x, int code)
  {
    const char *str;
@@ -5234,6 +5377,14 @@ ia64_print_operand (FILE * file, rtx x, int code)
  
    return;
  }
+
+/* Worker for TARGET_PRINT_OPERAND_PUNCT_VALID_P: accepts only '+' and ','.  */
+
+static bool
+ia64_print_operand_punct_valid_p (unsigned char code)
+{
+  return (code == '+' || code == ',');
+}
  \f
  /* Compute a (partial) cost for rtx X.  Return true if the complete
     cost has been computed, and false if subexpressions should be
@@ -5241,8 +5392,8 @@ ia64_print_operand (FILE * file, rtx x, int code)
  /* ??? This is incomplete.  */
  
  static bool
-ia64_rtx_costs (rtx x, int code, int outer_code, int *total,
-               bool speed ATTRIBUTE_UNUSED)
+ia64_rtx_costs (rtx x, int code, int outer_code, int opno ATTRIBUTE_UNUSED,
+               int *total, bool speed ATTRIBUTE_UNUSED)
  {
    switch (code)
      {
@@ -5328,12 +5479,9 @@ ia64_rtx_costs (rtx x, int code, int outer_code, int *total,
     one in class TO, using MODE.  */
  
  static int
-ia64_register_move_cost (enum machine_mode mode, reg_class_t from_i,
-                        reg_class_t to_i)
+ia64_register_move_cost (enum machine_mode mode, reg_class_t from,
+                        reg_class_t to)
  {
-  enum reg_class from = (enum reg_class) from_i;
-  enum reg_class to = (enum reg_class) to_i;
-
    /* ADDL_REGS is the same as GR_REGS for movement purposes.  */
    if (to == ADDL_REGS)
      to = GR_REGS;
@@ -5344,7 +5492,7 @@ ia64_register_move_cost (enum machine_mode mode, reg_class_t from_i,
       lower number class as the destination.  */
    if (from < to)
      {
-      enum reg_class tmp = to;
+      reg_class_t tmp = to;
        to = from, from = tmp;
      }
  
@@ -5632,58 +5780,29 @@ fix_range (const char *const_str)
      }
  }
  
-/* Implement TARGET_HANDLE_OPTION.  */
+/* Implement TARGET_OPTION_OVERRIDE.  */
  
-static bool
-ia64_handle_option (size_t code, const char *arg, int value)
+static void
+ia64_option_override (void)
  {
-  switch (code)
-    {
-    case OPT_mfixed_range_:
-      fix_range (arg);
-      return true;
-
-    case OPT_mtls_size_:
-      if (value != 14 && value != 22 && value != 64)
-       error ("bad value %<%s%> for -mtls-size= switch", arg);
-      return true;
+  unsigned int i;
+  cl_deferred_option *opt;
+  VEC(cl_deferred_option,heap) *vec
+    = (VEC(cl_deferred_option,heap) *) ia64_deferred_options;
  
-    case OPT_mtune_:
-      {
-       static struct pta
-         {
-           const char *name;           /* processor name or nickname.  */
-           enum processor_type processor;
-         }
-       const processor_alias_table[] =
-         {
-           {"itanium2", PROCESSOR_ITANIUM2},
-           {"mckinley", PROCESSOR_ITANIUM2},
-         };
-       int const pta_size = ARRAY_SIZE (processor_alias_table);
-       int i;
-
-       for (i = 0; i < pta_size; i++)
-         if (!strcmp (arg, processor_alias_table[i].name))
-           {
-             ia64_tune = processor_alias_table[i].processor;
-             break;
-           }
-       if (i == pta_size)
-         error ("bad value %<%s%> for -mtune= switch", arg);
-       return true;
-      }
+  FOR_EACH_VEC_ELT (cl_deferred_option, vec, i, opt)
+    {
+      switch (opt->opt_index)
+       {
+       case OPT_mfixed_range_:
+         fix_range (opt->arg);
+         break;
  
-    default:
-      return true;
+       default:
+         gcc_unreachable ();
+       }
      }
-}
-
-/* Implement TARGET_OPTION_OVERRIDE.  */
  
-static void
-ia64_option_override (void)
-{
    if (TARGET_AUTO_PIC)
      target_flags |= MASK_CONST_GP;
  
@@ -5716,9 +5835,6 @@ ia64_option_override (void)
  static void
  ia64_override_options_after_change (void)
  {
-  ia64_flag_schedule_insns2 = flag_schedule_insns_after_reload;
-  flag_schedule_insns_after_reload = 0;
-
    if (optimize >= 3
        && !global_options_set.x_flag_selective_scheduling
        && !global_options_set.x_flag_selective_scheduling2)
@@ -6258,6 +6374,17 @@ rtx_needs_barrier (rtx x, struct reg_flags flags, int pred)
               break;
  
             case CLOBBER:
+             if (REG_P (XEXP (pat, 0))
+                 && extract_asm_operands (x) != NULL_RTX
+                 && REGNO (XEXP (pat, 0)) != AR_UNAT_REGNUM)
+               {
+                 new_flags.is_write = 1;
+                 need_barrier |= rtx_needs_barrier (XEXP (pat, 0),
+                                                    new_flags, pred);
+                 new_flags = flags;
+               }
+             break;
+
             case RETURN:
               break;
  
@@ -6363,6 +6490,7 @@ rtx_needs_barrier (rtx x, struct reg_flags flags, int pred)
         case UNSPEC_PIC_CALL:
          case UNSPEC_MF:
          case UNSPEC_FETCHADD_ACQ:
+        case UNSPEC_FETCHADD_REL:
         case UNSPEC_BSP_VALUE:
         case UNSPEC_FLUSHRS:
         case UNSPEC_BUNDLE_SELECTOR:
@@ -6406,6 +6534,7 @@ rtx_needs_barrier (rtx x, struct reg_flags flags, int pred)
           break;
  
          case UNSPEC_CMPXCHG_ACQ:
+        case UNSPEC_CMPXCHG_REL:
           need_barrier = rtx_needs_barrier (XVECEXP (x, 0, 1), flags, pred);
           need_barrier |= rtx_needs_barrier (XVECEXP (x, 0, 2), flags, pred);
           break;
@@ -8637,7 +8766,7 @@ ia64_add_bundle_selector_before (int template0, rtx insn)
    ia64_emit_insn_before (b, insn);
  #if NR_BUNDLES == 10
    if ((template0 == 4 || template0 == 5)
-      && ia64_except_unwind_info () == UI_TARGET)
+      && ia64_except_unwind_info (&global_options) == UI_TARGET)
      {
        int i;
        rtx note = NULL_RTX;
@@ -9379,12 +9508,18 @@ ia64_reorg (void)
    if (optimize == 0)
      split_all_insns ();
  
-  if (optimize && ia64_flag_schedule_insns2
+  if (optimize && flag_schedule_insns_after_reload
        && dbg_cnt (ia64_sched2))
      {
+      basic_block bb;
        timevar_push (TV_SCHED2);
        ia64_final_schedule = 1;
  
+      /* We can't let modulo-sched prevent us from scheduling any bbs,
+        since we need the final schedule to produce bundle information.  */
+      FOR_EACH_BB (bb)
+       bb->flags &= ~BB_DISABLE_SCHEDULE;
+
        initiate_bundle_states ();
        ia64_nop = make_insn_raw (gen_nop ());
        PREV_INSN (ia64_nop) = NEXT_INSN (ia64_nop) = NULL_RTX;
@@ -9478,7 +9613,7 @@ ia64_reorg (void)
    /* A call must not be the last instruction in a function, so that the
       return address is still within the function, so that unwinding works
       properly.  Note that IA-64 differs from dwarf2 on this point.  */
-  if (ia64_except_unwind_info () == UI_TARGET)
+  if (ia64_except_unwind_info (&global_options) == UI_TARGET)
      {
        rtx insn;
        int saw_stop = 0;
@@ -9509,7 +9644,7 @@ ia64_reorg (void)
  
    emit_predicate_relation_info ();
  
-  if (ia64_flag_var_tracking)
+  if (flag_var_tracking)
      {
        timevar_push (TV_VAR_TRACKING);
        variable_tracking_main ();
@@ -9638,70 +9773,11 @@ static bool need_copy_state;
  # define MAX_ARTIFICIAL_LABEL_BYTES 30
  #endif
  
-/* Emit a debugging label after a call-frame-related insn.  We'd
-   rather output the label right away, but we'd have to output it
-   after, not before, the instruction, and the instruction has not
-   been output yet.  So we emit the label after the insn, delete it to
-   avoid introducing basic blocks, and mark it as preserved, such that
-   it is still output, given that it is referenced in debug info.  */
-
-static const char *
-ia64_emit_deleted_label_after_insn (rtx insn)
-{
-  char label[MAX_ARTIFICIAL_LABEL_BYTES];
-  rtx lb = gen_label_rtx ();
-  rtx label_insn = emit_label_after (lb, insn);
-
-  LABEL_PRESERVE_P (lb) = 1;
-
-  delete_insn (label_insn);
-
-  ASM_GENERATE_INTERNAL_LABEL (label, "L", CODE_LABEL_NUMBER (label_insn));
-
-  return xstrdup (label);
-}
-
-/* Define the CFA after INSN with the steady-state definition.  */
-
-static void
-ia64_dwarf2out_def_steady_cfa (rtx insn, bool frame)
-{
-  rtx fp = frame_pointer_needed
-    ? hard_frame_pointer_rtx
-    : stack_pointer_rtx;
-  const char *label = ia64_emit_deleted_label_after_insn (insn);
-
-  if (!frame)
-    return;
-
-  dwarf2out_def_cfa
-    (label, REGNO (fp),
-     ia64_initial_elimination_offset
-     (REGNO (arg_pointer_rtx), REGNO (fp))
-     + ARG_POINTER_CFA_OFFSET (current_function_decl));
-}
-
-/* All we need to do here is avoid a crash in the generic dwarf2
-   processing.  The real CFA definition is set up above.  */
-
-static void
-ia64_dwarf_handle_frame_unspec (const char * ARG_UNUSED (label),
-                               rtx ARG_UNUSED (pattern),
-                               int index)
-{
-  gcc_assert (index == UNSPECV_ALLOC);
-}
-
-/* The generic dwarf2 frame debug info generator does not define a
-   separate region for the very end of the epilogue, so refrain from
-   doing so in the IA64-specific code as well.  */
-
-#define IA64_CHANGE_CFA_IN_EPILOGUE 0
-
  /* The function emits unwind directives for the start of an epilogue.  */
  
  static void
-process_epilogue (FILE *asm_out_file, rtx insn, bool unwind, bool frame)
+process_epilogue (FILE *asm_out_file, rtx insn ATTRIBUTE_UNUSED,
+                 bool unwind, bool frame ATTRIBUTE_UNUSED)
  {
    /* If this isn't the last block of the function, then we need to label the
       current state, and copy it back in at the start of the next block.  */
@@ -9716,9 +9792,6 @@ process_epilogue (FILE *asm_out_file, rtx insn, bool unwind, bool frame)
  
    if (unwind)
      fprintf (asm_out_file, "\t.restore sp\n");
-  if (IA64_CHANGE_CFA_IN_EPILOGUE && frame)
-    dwarf2out_def_cfa (ia64_emit_deleted_label_after_insn (insn),
-                      STACK_POINTER_REGNUM, INCOMING_FRAME_SP_OFFSET);
  }
  
  /* This function processes a SET pattern for REG_CFA_ADJUST_CFA.  */
@@ -9746,7 +9819,6 @@ process_cfa_adjust_cfa (FILE *asm_out_file, rtx pat, rtx insn,
                 fprintf (asm_out_file,
                          "\t.fframe "HOST_WIDE_INT_PRINT_DEC"\n",
                          -INTVAL (op1));
-             ia64_dwarf2out_def_steady_cfa (insn, frame);
             }
           else
             process_epilogue (asm_out_file, insn, unwind, frame);
@@ -9765,7 +9837,6 @@ process_cfa_adjust_cfa (FILE *asm_out_file, rtx pat, rtx insn,
        if (unwind)
         fprintf (asm_out_file, "\t.vframe r%d\n",
                  ia64_dbx_register_number (REGNO (dest)));
-      ia64_dwarf2out_def_steady_cfa (insn, frame);
      }
    else
      gcc_unreachable ();
@@ -9778,20 +9849,22 @@ process_cfa_register (FILE *asm_out_file, rtx pat, bool unwind)
  {
    rtx dest = SET_DEST (pat);
    rtx src = SET_SRC (pat);
-
    int dest_regno = REGNO (dest);
-  int src_regno = REGNO (src);
+  int src_regno;
  
-  switch (src_regno)
+  if (src == pc_rtx)
      {
-    case BR_REG (0):
        /* Saving return address pointer.  */
-      gcc_assert (dest_regno == current_frame_info.r[reg_save_b0]);
        if (unwind)
         fprintf (asm_out_file, "\t.save rp, r%d\n",
                  ia64_dbx_register_number (dest_regno));
-      break;
+      return;
+    }
  
+  src_regno = REGNO (src);
+
+  switch (src_regno)
+    {
      case PR_REG (0):
        gcc_assert (dest_regno == current_frame_info.r[reg_save_pr]);
        if (unwind)
@@ -9944,7 +10017,7 @@ process_cfa_offset (FILE *asm_out_file, rtx pat, bool unwind)
  static void
  ia64_asm_unwind_emit (FILE *asm_out_file, rtx insn)
  {
-  bool unwind = ia64_except_unwind_info () == UI_TARGET;
+  bool unwind = ia64_except_unwind_info (&global_options) == UI_TARGET;
    bool frame = dwarf2out_do_frame ();
    rtx note, pat;
    bool handled_one;
@@ -9965,8 +10038,6 @@ ia64_asm_unwind_emit (FILE *asm_out_file, rtx insn)
               fprintf (asm_out_file, "\t.copy_state %d\n",
                        cfun->machine->state_num);
             }
-         if (IA64_CHANGE_CFA_IN_EPILOGUE)
-           ia64_dwarf2out_def_steady_cfa (insn, frame);
           need_copy_state = false;
         }
      }
@@ -10077,25 +10148,6 @@ ia64_debug_unwind_info (void)
  {
    return UI_TARGET;
  }
-
-/* Implement TARGET_EXCEPT_UNWIND_INFO.  */
-
-static enum unwind_info_type
-ia64_except_unwind_info (void)
-{
-  /* Honor the --enable-sjlj-exceptions configure switch.  */
-#ifdef CONFIG_UNWIND_EXCEPTIONS
-  if (CONFIG_UNWIND_EXCEPTIONS)
-    return UI_SJLJ;
-#endif
-
-  /* For simplicity elsewhere in this file, indicate that all unwind
-     info is disabled if we're not emitting unwind tables.  */
-  if (!flag_exceptions && !flag_unwind_tables)
-    return UI_NONE;
-
-  return UI_TARGET;
-}
  \f
  enum ia64_builtins
  {
@@ -10140,7 +10192,7 @@ ia64_init_builtins (void)
        (*lang_hooks.types.register_builtin_type) (float128_type, "__float128");
  
        /* TFmode support builtins.  */
-      ftype = build_function_type (float128_type, void_list_node);
+      ftype = build_function_type_list (float128_type, NULL_TREE);
        decl = add_builtin_function ("__builtin_infq", ftype,
                                    IA64_BUILTIN_INFQ, BUILT_IN_MD,
                                    NULL, NULL_TREE);
@@ -10176,39 +10228,34 @@ ia64_init_builtins (void)
                                                "__float128");
  
    /* Fwrite on VMS is non-standard.  */
-  if (TARGET_ABI_OPEN_VMS)
-    {
-      implicit_built_in_decls[(int) BUILT_IN_FWRITE] = NULL_TREE;
-      implicit_built_in_decls[(int) BUILT_IN_FWRITE_UNLOCKED] = NULL_TREE;
-    }
+#if TARGET_ABI_OPEN_VMS
+  vms_patch_builtins ();
+#endif
  
  #define def_builtin(name, type, code)                                  \
    add_builtin_function ((name), (type), (code), BUILT_IN_MD,   \
                        NULL, NULL_TREE)
  
    decl = def_builtin ("__builtin_ia64_bsp",
-              build_function_type (ptr_type_node, void_list_node),
-              IA64_BUILTIN_BSP);
+                     build_function_type_list (ptr_type_node, NULL_TREE),
+                     IA64_BUILTIN_BSP);
    ia64_builtins[IA64_BUILTIN_BSP] = decl;
  
    decl = def_builtin ("__builtin_ia64_flushrs",
-              build_function_type (void_type_node, void_list_node),
-              IA64_BUILTIN_FLUSHRS);
+                     build_function_type_list (void_type_node, NULL_TREE),
+                     IA64_BUILTIN_FLUSHRS);
    ia64_builtins[IA64_BUILTIN_FLUSHRS] = decl;
  
  #undef def_builtin
  
    if (TARGET_HPUX)
      {
-      if (built_in_decls [BUILT_IN_FINITE])
-       set_user_assembler_name (built_in_decls [BUILT_IN_FINITE],
-         "_Isfinite");
-      if (built_in_decls [BUILT_IN_FINITEF])
-       set_user_assembler_name (built_in_decls [BUILT_IN_FINITEF],
-         "_Isfinitef");
-      if (built_in_decls [BUILT_IN_FINITEL])
-       set_user_assembler_name (built_in_decls [BUILT_IN_FINITEL],
-         "_Isfinitef128");
+      if ((decl = builtin_decl_explicit (BUILT_IN_FINITE)) != NULL_TREE)
+       set_user_assembler_name (decl, "_Isfinite");
+      if ((decl = builtin_decl_explicit (BUILT_IN_FINITEF)) != NULL_TREE)
+       set_user_assembler_name (decl, "_Isfinitef");
+      if ((decl = builtin_decl_explicit (BUILT_IN_FINITEL)) != NULL_TREE)
+       set_user_assembler_name (decl, "_Isfinitef128");
      }
  }
  
@@ -10308,10 +10355,6 @@ ia64_asm_output_external (FILE *file, tree decl, const char *name)
        int need_visibility = ((*targetm.binds_local_p) (decl)
                              && maybe_assemble_visibility (decl));
  
-#ifdef DO_CRTL_NAMES
-      DO_CRTL_NAMES;
-#endif
-
        /* GNU as does not need anything here, but the HP linker does
          need something for external functions.  */
        if ((TARGET_HPUX_LD || !TARGET_GNU_AS)
@@ -10823,7 +10866,7 @@ ia64_profile_hook (int labelno)
        char buf[30];
        const char *label_name;
        ASM_GENERATE_INTERNAL_LABEL (buf, "LP", labelno);
-      label_name = (*targetm.strip_name_encoding) (ggc_strdup (buf));
+      label_name = ggc_strdup ((*targetm.strip_name_encoding) (buf));
        label = gen_rtx_SYMBOL_REF (Pmode, label_name);
        SYMBOL_REF_FLAGS (label) = SYMBOL_FLAG_LOCAL;
      }
@@ -10901,20 +10944,6 @@ ia64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1, const_tree ty
    return NULL;
  }
  
-/* Implement TARGET_OPTION_DEFAULT_PARAMS.  */
-static void
-ia64_option_default_params (void)
-{
-  /* Let the scheduler form additional regions.  */
-  set_default_param_value (PARAM_MAX_SCHED_EXTEND_REGIONS_ITERS, 2);
-
-  /* Set the default values for cache-related parameters.  */
-  set_default_param_value (PARAM_SIMULTANEOUS_PREFETCHES, 6);
-  set_default_param_value (PARAM_L1_CACHE_LINE_SIZE, 32);
-
-  set_default_param_value (PARAM_SCHED_MEM_TRUE_DEP_COST, 4);
-}
-
  /* HP-UX version_id attribute.
     For object foo, if the version_id is set to 1234 put out an alias
     of '.alias foo "foo{1234}"  We can't use "foo{1234}" in anything
@@ -10951,43 +10980,6 @@ ia64_c_mode_for_suffix (char suffix)
    return VOIDmode;
  }
  
-static enum machine_mode
-ia64_promote_function_mode (const_tree type,
-                           enum machine_mode mode,
-                           int *punsignedp,
-                           const_tree funtype,
-                           int for_return)
-{
-  /* Special processing required for OpenVMS ...  */
-
-  if (!TARGET_ABI_OPEN_VMS)
-    return default_promote_function_mode(type, mode, punsignedp, funtype,
-                                        for_return);
-
-  /* HP OpenVMS Calling Standard dated June, 2004, that describes
-     HP OpenVMS I64 Version 8.2EFT,
-     chapter 4 "OpenVMS I64 Conventions"
-     section 4.7 "Procedure Linkage"
-     subsection 4.7.5.2, "Normal Register Parameters"
-
-     "Unsigned integral (except unsigned 32-bit), set, and VAX floating-point
-     values passed in registers are zero-filled; signed integral values as
-     well as unsigned 32-bit integral values are sign-extended to 64 bits.
-     For all other types passed in the general registers, unused bits are
-     undefined."  */
-
-  if (!AGGREGATE_TYPE_P (type)
-      && GET_MODE_CLASS (mode) == MODE_INT
-      && GET_MODE_SIZE (mode) < UNITS_PER_WORD)
-    {
-      if (mode == SImode)
-       *punsignedp = 0;
-      return DImode;
-    }
-  else
-    return promote_mode (type, mode, punsignedp);
-}
-   
  static GTY(()) rtx ia64_dconst_0_5_rtx;
  
  rtx
@@ -11034,5 +11026,563 @@ ia64_hpux_function_section (tree decl ATTRIBUTE_UNUSED,
  {
    return NULL;
  }
+\f
+/* Emit (set TARGET (vec_select OP0 (parallel PERM[0..NELT-1]))).  Return true
+   if recog_memoized accepts the insn; else remove it and return false.  */
+
+static bool
+expand_vselect (rtx target, rtx op0, const unsigned char *perm, unsigned nelt)
+{
+  rtx rperm[MAX_VECT_LEN], x;
+  unsigned i;
+
+  for (i = 0; i < nelt; ++i)
+    rperm[i] = GEN_INT (perm[i]);
+
+  x = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (nelt, rperm));
+  x = gen_rtx_VEC_SELECT (GET_MODE (target), op0, x);
+  x = gen_rtx_SET (VOIDmode, target, x);
+
+  x = emit_insn (x);  /* Emitted first; taken back out below if unmatched.  */
+  if (recog_memoized (x) < 0)
+    {
+      remove_insn (x);
+      return false;
+    }
+  return true;
+}
+
+/* Like expand_vselect, but the source is (vec_concat OP0 OP1).  */
+
+static bool
+expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
+                       const unsigned char *perm, unsigned nelt)
+{
+  enum machine_mode v2mode;
+  rtx x;
+
+  v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));  /* Mode of the concat.  */
+  x = gen_rtx_VEC_CONCAT (v2mode, op0, op1);
+  return expand_vselect (target, x, perm, nelt);
+}
+
+/* Try to expand a no-op permutation (PERM[i] == i): a plain move of OP0.  */
+
+static bool
+expand_vec_perm_identity (struct expand_vec_perm_d *d)
+{
+  unsigned i, nelt = d->nelt;
+
+  for (i = 0; i < nelt; ++i)
+    if (d->perm[i] != i)
+      return false;
+
+  if (!d->testing_p)  /* When only testing, report success without emitting.  */
+    emit_move_insn (d->target, d->op0);
+
+  return true;
+}
+
+/* Try to expand D, a run of consecutive elements, via one shrp insn.  */
+
+static bool
+expand_vec_perm_shrp (struct expand_vec_perm_d *d)
+{
+  unsigned i, nelt = d->nelt, shift, mask;
+  rtx tmp, hi, lo;
+
+  /* ??? Don't force V2SFmode into the integer registers.  */
+  if (d->vmode == V2SFmode)
+    return false;
+
+  mask = (d->one_operand_p ? nelt - 1 : 2 * nelt - 1);
+
+  shift = d->perm[0];  /* Index of the first selected element.  */
+  if (BYTES_BIG_ENDIAN && shift > nelt)
+    return false;
+
+  for (i = 1; i < nelt; ++i)
+    if (d->perm[i] != ((shift + i) & mask))
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  hi = shift < nelt ? d->op1 : d->op0;
+  lo = shift < nelt ? d->op0 : d->op1;
+
+  shift %= nelt;  /* Now an element count below NELT.  */
+
+  shift *= GET_MODE_UNIT_SIZE (d->vmode) * BITS_PER_UNIT;
+
+  /* We've eliminated the shift 0 case via expand_vec_perm_identity.  */
+  gcc_assert (IN_RANGE (shift, 1, 63));
+
+  /* Recall that big-endian elements are numbered starting at the top of
+     the register.  Ideally we'd have a shift-left-pair.  But since we
+     don't, convert to a shift the other direction.  */
+  if (BYTES_BIG_ENDIAN)
+    shift = 64 - shift;
+
+  tmp = gen_reg_rtx (DImode);
+  hi = gen_lowpart (DImode, hi);
+  lo = gen_lowpart (DImode, lo);
+  emit_insn (gen_shrp (tmp, hi, lo, GEN_INT (shift)));
+
+  emit_move_insn (d->target, gen_lowpart (d->vmode, tmp));
+  return true;
+}
+
+/* Try to instantiate D in a single instruction.  Return true on success.  */
+
+static bool
+expand_vec_perm_1 (struct expand_vec_perm_d *d)
+{     
+  unsigned i, nelt = d->nelt;
+  unsigned char perm2[MAX_VECT_LEN];
+
+  /* Try single-operand selections.  */
+  if (d->one_operand_p)
+    {
+      if (expand_vec_perm_identity (d))
+       return true;
+      if (expand_vselect (d->target, d->op0, d->perm, nelt))
+       return true;
+    }
+
+  /* Try two operand selections.  */
+  if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt))
+    return true;
+
+  /* Retry with OP0/OP1 swapped and the selector indices rebased to match.  */
+  if (!d->one_operand_p)
+    {
+      for (i = 0; i < nelt; ++i)
+       {
+         unsigned e = d->perm[i];
+         if (e >= nelt)
+           e -= nelt;
+         else
+           e += nelt;
+         perm2[i] = e;
+       }
+
+      if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt))
+       return true;
+    }
+
+  if (expand_vec_perm_shrp (d))
+    return true;
+
+  /* ??? Look for deposit-like permutations where most of the result
+     comes from one vector unchanged and the rest comes from a
+     sequential hunk of the other vector.  */
+
+  return false;
+}
+
+/* Pattern match one-operand broadcasts: every PERM element is the same.  */
+
+static bool
+expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
+{
+  unsigned i, elt, nelt = d->nelt;
+  unsigned char perm2[2];
+  rtx temp;
+  bool ok;
+
+  if (!d->one_operand_p)
+    return false;
+
+  elt = d->perm[0];  /* The element every lane must replicate.  */
+  for (i = 1; i < nelt; ++i)
+    if (d->perm[i] != elt)
+      return false;
+
+  switch (d->vmode)  /* NOTE(review): emits even if d->testing_p is set.  */
+    {
+    case V2SImode:
+    case V2SFmode:
+      /* Implementable by interleaving OP0 with itself.  */
+      perm2[0] = elt;
+      perm2[1] = elt + 2;
+      ok = expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, 2);
+      gcc_assert (ok);
+      break;
+
+    case V8QImode:
+      /* Extract the byte into a DImode temp, then broadcast it.  */
+      if (BYTES_BIG_ENDIAN)
+       elt = 7 - elt;
+      elt *= BITS_PER_UNIT;
+      temp = gen_reg_rtx (DImode);
+      emit_insn (gen_extzv (temp, gen_lowpart (DImode, d->op0),
+                           GEN_INT (8), GEN_INT (elt)));
+      emit_insn (gen_mux1_brcst_qi (d->target, gen_lowpart (QImode, temp)));
+      break;
+
+    case V4HImode:
+      /* Should have been matched directly by vec_select.  */
+    default:
+      gcc_unreachable ();
+    }
+
+  return true;
+}
+
+/* A subroutine of ia64_expand_vec_perm_const_1.  Try to simplify a
+   two vector permutation into a single vector permutation by first
+   merging the used elements with one interleave, mix or shrp step.  */
+
+static bool
+expand_vec_perm_interleave_2 (struct expand_vec_perm_d *d)
+{
+  struct expand_vec_perm_d dremap, dfinal;
+  unsigned char remap[2 * MAX_VECT_LEN];
+  unsigned contents, i, nelt, nelt2;
+  unsigned h0, h1, h2, h3;
+  rtx seq;
+  bool ok;
+
+  if (d->one_operand_p)
+    return false;
+
+  nelt = d->nelt;
+  nelt2 = nelt / 2;
+
+  /* Record as a bitmask which input elements D uses.  */
+  contents = 0;
+  for (i = 0; i < nelt; ++i)
+    contents |= 1u << d->perm[i];
+
+  memset (remap, 0xff, sizeof (remap));  /* 0xff == not mapped.  */
+  dremap = *d;
+
+  h0 = (1u << nelt2) - 1;  /* Masks for the four NELT2-element quarters.  */
+  h1 = h0 << nelt2;
+  h2 = h0 << nelt;
+  h3 = h0 << (nelt + nelt2);
+  
+  if ((contents & (h0 | h2)) == contents)      /* first halves of both */
+    {
+      for (i = 0; i < nelt; ++i)
+       {
+         unsigned which = i / 2 + (i & 1 ? nelt : 0);
+         remap[which] = i;
+         dremap.perm[i] = which;
+       }
+    }
+  else if ((contents & (h1 | h3)) == contents) /* second halves of both */
+    {
+      for (i = 0; i < nelt; ++i)
+       {
+         unsigned which = i / 2 + nelt2 + (i & 1 ? nelt : 0);
+         remap[which] = i;
+         dremap.perm[i] = which;
+       }
+    }
+  else if ((contents & 0x5555) == contents)    /* mix even elements */
+    {
+      for (i = 0; i < nelt; ++i)
+       {
+         unsigned which = (i & ~1) + (i & 1 ? nelt : 0);
+         remap[which] = i;
+         dremap.perm[i] = which;
+       }
+    }
+  else if ((contents & 0xaaaa) == contents)    /* mix odd elements */
+    {
+      for (i = 0; i < nelt; ++i)
+       {
+         unsigned which = (i | 1) + (i & 1 ? nelt : 0);
+         remap[which] = i;
+         dremap.perm[i] = which;
+       }
+    }
+  else if (floor_log2 (contents) - ctz_hwi (contents) < (int)nelt) /* shrp */
+    {
+      unsigned shift = ctz_hwi (contents);
+      for (i = 0; i < nelt; ++i)
+       {
+         unsigned which = (i + shift) & (2 * nelt - 1);
+         remap[which] = i;
+         dremap.perm[i] = which;
+       }
+    }
+  else
+    return false;
+
+  /* Use the remapping array set up above to move the elements from their
+     swizzled locations into their final destinations.  */
+  dfinal = *d;
+  for (i = 0; i < nelt; ++i)
+    {
+      unsigned e = remap[d->perm[i]];
+      gcc_assert (e < nelt);
+      dfinal.perm[i] = e;
+    }
+  dfinal.op0 = gen_reg_rtx (dfinal.vmode);
+  dfinal.op1 = dfinal.op0;
+  dfinal.one_operand_p = true;
+  dremap.target = dfinal.op0;  /* The merge step feeds the final step.  */
+
+  /* Test if the final remap can be done with a single insn.  For V4HImode
+     this *will* succeed.  For V8QImode or V2SImode it may not.  */
+  start_sequence ();
+  ok = expand_vec_perm_1 (&dfinal);
+  seq = get_insns ();
+  end_sequence ();
+  if (!ok)
+    return false;
+  if (d->testing_p)
+    return true;
+
+  ok = expand_vec_perm_1 (&dremap);
+  gcc_assert (ok);
+
+  emit_insn (seq);  /* Final step goes after the merge emitted just above.  */
+  return true;
+}
+
+/* A subroutine of ia64_expand_vec_perm_const_1.  Emit a full two-operand
+   V4HImode constant permutation via two mux2 and an AND/ANDN/IOR merge.  */
+
+static bool
+expand_vec_perm_v4hi_5 (struct expand_vec_perm_d *d)
+{
+  unsigned char perm2[4];
+  rtx rmask[4];
+  unsigned i;
+  rtx t0, t1, mask, x;
+  bool ok;
+
+  if (d->vmode != V4HImode || d->one_operand_p)
+    return false;
+  if (d->testing_p)
+    return true;
+
+  for (i = 0; i < 4; ++i)
+    {
+      perm2[i] = d->perm[i] & 3;  /* Index within its own source vector.  */
+      rmask[i] = (d->perm[i] & 4 ? const0_rtx : constm1_rtx);
+    }
+  mask = gen_rtx_CONST_VECTOR (V4HImode, gen_rtvec_v (4, rmask));
+  mask = force_reg (V4HImode, mask);
+
+  t0 = gen_reg_rtx (V4HImode);
+  t1 = gen_reg_rtx (V4HImode);
+
+  ok = expand_vselect (t0, d->op0, perm2, 4);
+  gcc_assert (ok);
+  ok = expand_vselect (t1, d->op1, perm2, 4);
+  gcc_assert (ok);
+
+  x = gen_rtx_AND (V4HImode, mask, t0);  /* Keep the lanes taken from op0.  */
+  emit_insn (gen_rtx_SET (VOIDmode, t0, x));
+
+  x = gen_rtx_NOT (V4HImode, mask);
+  x = gen_rtx_AND (V4HImode, x, t1);  /* Keep the lanes taken from op1.  */
+  emit_insn (gen_rtx_SET (VOIDmode, t1, x));
+
+  x = gen_rtx_IOR (V4HImode, t0, t1);  /* Merge the two masked results.  */
+  emit_insn (gen_rtx_SET (VOIDmode, d->target, x));
+
+  return true;
+}
+
+/* The guts of ia64_expand_vec_perm_const, also used by the ok hook.
+   With all of the interface bits taken care of, try each expansion
+   strategy in turn on D and return true as soon as one succeeds.  */
+
+static bool
+ia64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
+{
+  if (expand_vec_perm_1 (d))
+    return true;
+  if (expand_vec_perm_broadcast (d))
+    return true;
+  if (expand_vec_perm_interleave_2 (d))
+    return true;
+  if (expand_vec_perm_v4hi_5 (d))
+    return true;
+  return false;
+}
+
+bool
+ia64_expand_vec_perm_const (rtx operands[4])
+{
+  struct expand_vec_perm_d d;
+  unsigned char perm[MAX_VECT_LEN];
+  int i, nelt, which;
+  rtx sel;
+
+  d.target = operands[0];  /* OPERANDS is {target, op0, op1, selector}.  */
+  d.op0 = operands[1];
+  d.op1 = operands[2];
+  sel = operands[3];
+
+  d.vmode = GET_MODE (d.target);
+  gcc_assert (VECTOR_MODE_P (d.vmode));
+  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
+  d.testing_p = false;  /* Expanding for real, not just probing.  */
+
+  gcc_assert (GET_CODE (sel) == CONST_VECTOR);
+  gcc_assert (XVECLEN (sel, 0) == nelt);
+  gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
+
+  for (i = which = 0; i < nelt; ++i)
+    {
+      rtx e = XVECEXP (sel, 0, i);
+      int ei = INTVAL (e) & (2 * nelt - 1);
+
+      which |= (ei < nelt ? 1 : 2);  /* Bit 0: op0 used; bit 1: op1 used.  */
+      d.perm[i] = ei;
+      perm[i] = ei;  /* Pristine copy, restored for the retry below.  */
+    }
+
+  switch (which)
+    {
+    default:
+      gcc_unreachable();
+
+    case 3:  /* Both operands are referenced.  */
+      if (!rtx_equal_p (d.op0, d.op1))
+       {
+         d.one_operand_p = false;
+         break;
+       }
+
+      /* The elements of PERM do not suggest that only the first operand
+        is used, but both operands are identical.  Allow easier matching
+        of the permutation by folding the permutation into the single
+        input vector.  */
+      for (i = 0; i < nelt; ++i)
+       if (d.perm[i] >= nelt)
+         d.perm[i] -= nelt;
+      /* FALLTHRU */
+
+    case 1:  /* Only op0 is referenced.  */
+      d.op1 = d.op0;
+      d.one_operand_p = true;
+      break;
+
+    case 2:  /* Only op1 is referenced: rebase the indices onto it.  */
+      for (i = 0; i < nelt; ++i)
+        d.perm[i] -= nelt;
+      d.op0 = d.op1;
+      d.one_operand_p = true;
+      break;
+    }
+
+  if (ia64_expand_vec_perm_const_1 (&d))
+    return true;
+
+  /* If the mask says both arguments are needed, but they are the same,
+     the above tried to expand with one_operand_p true.  If that didn't
+     work, retry with one_operand_p false, as that's what we used in _ok.  */
+  if (which == 3 && d.one_operand_p)
+    {
+      memcpy (d.perm, perm, sizeof (perm));
+      d.one_operand_p = false;
+      return ia64_expand_vec_perm_const_1 (&d);
+    }
+
+  return false;
+}
+
+/* Implement targetm.vectorize.vec_perm_const_ok.  Return true if the
+   constant permutation SEL, which holds GET_MODE_NUNITS (VMODE) selector
+   bytes, can be expanded for vectors of mode VMODE.  The expansion is
+   attempted with testing_p set, on placeholder registers, inside a
+   sequence that is opened and closed here.  */
+
+static bool
+ia64_vectorize_vec_perm_const_ok (enum machine_mode vmode,
+                                 const unsigned char *sel)
+{
+  struct expand_vec_perm_d d;
+  unsigned int k, count, srcs;
+  bool supported;
+
+  d.vmode = vmode;
+  d.nelt = count = GET_MODE_NUNITS (d.vmode);
+  d.testing_p = true;
+
+  /* Copy the selector into D, noting in SRCS which inputs it names:
+     bit 0 for the first vector, bit 1 for the second.  */
+  srcs = 0;
+  for (k = 0; k < count; ++k)
+    {
+      unsigned char idx = sel[k];
+      gcc_assert (idx < 2 * count);
+      d.perm[k] = idx;
+      srcs |= (idx < count ? 1 : 2);
+    }
+
+  /* A selector that names only the second vector is rebased so that it
+     indexes a single input.  */
+  if (srcs == 2)
+    for (k = 0; k < count; ++k)
+      d.perm[k] -= count;
+
+  /* Two distinct operands are needed only when both inputs are named.  */
+  d.one_operand_p = (srcs != 3);
+
+  /* Build placeholder registers just past the virtual registers to stand
+     in for the target and the inputs; a single-operand permutation uses
+     the same register for both inputs.  */
+  d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
+  d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
+  d.op1 = (d.one_operand_p
+          ? d.op0
+          : gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3));
+
+  /* Go through the motions of expanding, bracketed by a sequence whose
+     contents are not used.  */
+  start_sequence ();
+  supported = ia64_expand_vec_perm_const_1 (&d);
+  end_sequence ();
+
+  return supported;
+}
+
+/* Expand a V2SFmode operation as a two-operand constant permutation.
+   OPERANDS[0] is both the target and the first permutation input.  The
+   second input is a fresh pseudo loaded by an fpack of OPERANDS[1] and
+   SFmode zero.  OPERANDS[2] is a constant that must be 0 or 1 and
+   determines the selector.  The expansion is asserted to succeed.
+   NOTE(review): presumably the expander for vec_set on V2SF, with
+   OPERANDS[2] the element index -- confirm against the .md pattern.  */
+
+void
+ia64_expand_vec_setv2sf (rtx operands[3])
+{
+  struct expand_vec_perm_d d;
+  unsigned int elt;
+  bool expanded;
+
+  d.vmode = V2SFmode;
+  d.nelt = 2;
+  d.testing_p = false;
+  d.one_operand_p = false;
+  d.target = operands[0];
+  d.op0 = operands[0];
+  d.op1 = gen_reg_rtx (V2SFmode);
+
+  /* Selector indices 0 and 1 name elements of op0; 2 and 3 name
+     elements of op1.  */
+  elt = INTVAL (operands[2]);
+  gcc_assert (elt <= 1);
+  d.perm[0] = 1 - elt;
+  d.perm[1] = 2 + elt;
+
+  emit_insn (gen_fpack (d.op1, operands[1], CONST0_RTX (SFmode)));
+
+  expanded = ia64_expand_vec_perm_const_1 (&d);
+  gcc_assert (expanded);
+}
+
+/* Expand a two-operand permutation of OP0 and OP1 into TARGET in which
+   result element I is taken from index 2 * I + ODD of the two inputs
+   taken together.  With ODD equal to 0 this picks the even-numbered
+   elements, with ODD equal to 1 the odd-numbered ones.  The expansion
+   is asserted to succeed.  */
+
+void
+ia64_expand_vec_perm_even_odd (rtx target, rtx op0, rtx op1, int odd)
+{
+  struct expand_vec_perm_d d;
+  unsigned int k, count;
+  bool expanded;
+
+  d.vmode = GET_MODE (target);
+  count = GET_MODE_NUNITS (d.vmode);
+  d.nelt = count;
+  d.target = target;
+  d.op0 = op0;
+  d.op1 = op1;
+  d.testing_p = false;
+  d.one_operand_p = false;
+
+  for (k = 0; k < count; ++k)
+    d.perm[k] = 2 * k + odd;
+
+  expanded = ia64_expand_vec_perm_const_1 (&d);
+  gcc_assert (expanded);
+}
  
  #include "gt-ia64.h"