X-Git-Url: http://git.sourceforge.jp/view?p=pf3gnuchains%2Fgcc-fork.git;a=blobdiff_plain;f=gcc%2Fconfig%2Fia64%2Fia64.c;h=66ef5a84e07edd929a97596334acdf738441a5b3;hp=e8a853bb2567f5bec3ca591105fab51beb90dcc9;hb=9b57ed9f90cd5e7baf86a51bf37121361c6ac4b7;hpb=8deb3959b001122f1d9f0f8320adc8bc77844046

diff --git a/gcc/config/ia64/ia64.c b/gcc/config/ia64/ia64.c
index e8a853bb256..66ef5a84e07 100644
--- a/gcc/config/ia64/ia64.c
+++ b/gcc/config/ia64/ia64.c
@@ -1,6 +1,6 @@
 /* Definitions of target machine for GNU compiler.
-   Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
-   Free Software Foundation, Inc.
+   Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008,
+   2009  Free Software Foundation, Inc.
    Contributed by James E. Wilson <wilson@cygnus.com> and
 		  David Mosberger <davidm@hpl.hp.com>.
 
@@ -57,6 +57,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "params.h"
 #include "dbgcnt.h"
 #include "tm-constrs.h"
+#include "sel-sched.h"
 
 /* This is used for communication between ASM_OUTPUT_LABEL and
    ASM_OUTPUT_LABELREF.  */
@@ -172,12 +173,19 @@ static int ia64_first_cycle_multipass_dfa_lookahead_guard (rtx);
 static bool ia64_first_cycle_multipass_dfa_lookahead_guard_spec (const_rtx);
 static int ia64_dfa_new_cycle (FILE *, int, rtx, int, int, int *);
 static void ia64_h_i_d_extended (void);
+static void * ia64_alloc_sched_context (void);
+static void ia64_init_sched_context (void *, bool);
+static void ia64_set_sched_context (void *);
+static void ia64_clear_sched_context (void *);
+static void ia64_free_sched_context (void *);
 static int ia64_mode_to_int (enum machine_mode);
 static void ia64_set_sched_flags (spec_info_t);
+static ds_t ia64_get_insn_spec_ds (rtx);
+static ds_t ia64_get_insn_checked_ds (rtx);
+static bool ia64_skip_rtx_p (const_rtx);
 static int ia64_speculate_insn (rtx, ds_t, rtx *);
-static rtx ia64_gen_spec_insn (rtx, ds_t, int, bool, bool);
-static bool ia64_needs_block_p (const_rtx);
-static rtx ia64_gen_check (rtx, rtx, bool);
+static bool ia64_needs_block_p (int);
+static rtx ia64_gen_spec_check (rtx, rtx, ds_t);
 static int ia64_spec_check_p (rtx);
 static int ia64_spec_check_src_p (rtx);
 static rtx gen_tls_get_addr (void);
@@ -202,7 +210,7 @@ static int ia64_arg_partial_bytes (CUMULATIVE_ARGS *, enum machine_mode,
 				   tree, bool);
 static bool ia64_function_ok_for_sibcall (tree, tree);
 static bool ia64_return_in_memory (const_tree, const_tree);
-static bool ia64_rtx_costs (rtx, int, int, int *);
+static bool ia64_rtx_costs (rtx, int, int, int *, bool);
 static int ia64_unspec_may_trap_p (const_rtx, unsigned);
 static void fix_range (const char *);
 static bool ia64_handle_option (size_t, const char *, int);
@@ -222,7 +230,7 @@ static void ia64_output_function_epilogue (FILE *, HOST_WIDE_INT);
 static void ia64_output_function_end_prologue (FILE *);
 
 static int ia64_issue_rate (void);
-static int ia64_adjust_cost (rtx, rtx, rtx, int);
+static int ia64_adjust_cost_2 (rtx, int, rtx, int, dw_t);
 static void ia64_sched_init (FILE *, int, int);
 static void ia64_sched_init_global (FILE *, int, int);
 static void ia64_sched_finish_global (FILE *, int);
@@ -248,6 +256,7 @@ static int get_max_pos (state_t);
 static int get_template (state_t, int);
 
 static rtx get_next_important_insn (rtx, rtx);
+static bool important_for_bundling_p (rtx);
 static void bundling (FILE *, int, rtx, rtx);
 
 static void ia64_output_mi_thunk (FILE *, tree, HOST_WIDE_INT,
@@ -270,6 +279,8 @@ static void ia64_sysv4_init_libfuncs (void)
      ATTRIBUTE_UNUSED;
 static void ia64_vms_init_libfuncs (void)
      ATTRIBUTE_UNUSED;
+static void ia64_soft_fp_init_libfuncs (void)
+     ATTRIBUTE_UNUSED;
 
 static tree ia64_handle_model_attribute (tree *, tree, tree, int, bool *);
 static tree ia64_handle_version_id_attribute (tree *, tree, tree, int, bool *);
@@ -333,8 +344,8 @@ static const struct attribute_spec ia64_attribute_table[] =
 #undef TARGET_IN_SMALL_DATA_P
 #define TARGET_IN_SMALL_DATA_P  ia64_in_small_data_p
 
-#undef TARGET_SCHED_ADJUST_COST
-#define TARGET_SCHED_ADJUST_COST ia64_adjust_cost
+#undef TARGET_SCHED_ADJUST_COST_2
+#define TARGET_SCHED_ADJUST_COST_2 ia64_adjust_cost_2
 #undef TARGET_SCHED_ISSUE_RATE
 #define TARGET_SCHED_ISSUE_RATE ia64_issue_rate
 #undef TARGET_SCHED_VARIABLE_ISSUE
@@ -373,22 +384,46 @@ static const struct attribute_spec ia64_attribute_table[] =
 #undef TARGET_SCHED_H_I_D_EXTENDED
 #define TARGET_SCHED_H_I_D_EXTENDED ia64_h_i_d_extended
 
+#undef TARGET_SCHED_ALLOC_SCHED_CONTEXT
+#define TARGET_SCHED_ALLOC_SCHED_CONTEXT ia64_alloc_sched_context
+
+#undef TARGET_SCHED_INIT_SCHED_CONTEXT
+#define TARGET_SCHED_INIT_SCHED_CONTEXT ia64_init_sched_context
+
+#undef TARGET_SCHED_SET_SCHED_CONTEXT
+#define TARGET_SCHED_SET_SCHED_CONTEXT ia64_set_sched_context
+
+#undef TARGET_SCHED_CLEAR_SCHED_CONTEXT
+#define TARGET_SCHED_CLEAR_SCHED_CONTEXT ia64_clear_sched_context
+
+#undef TARGET_SCHED_FREE_SCHED_CONTEXT
+#define TARGET_SCHED_FREE_SCHED_CONTEXT ia64_free_sched_context
+
 #undef TARGET_SCHED_SET_SCHED_FLAGS
 #define TARGET_SCHED_SET_SCHED_FLAGS ia64_set_sched_flags
 
+#undef TARGET_SCHED_GET_INSN_SPEC_DS
+#define TARGET_SCHED_GET_INSN_SPEC_DS ia64_get_insn_spec_ds
+
+#undef TARGET_SCHED_GET_INSN_CHECKED_DS
+#define TARGET_SCHED_GET_INSN_CHECKED_DS ia64_get_insn_checked_ds
+
 #undef TARGET_SCHED_SPECULATE_INSN
 #define TARGET_SCHED_SPECULATE_INSN ia64_speculate_insn
 
 #undef TARGET_SCHED_NEEDS_BLOCK_P
 #define TARGET_SCHED_NEEDS_BLOCK_P ia64_needs_block_p
 
-#undef TARGET_SCHED_GEN_CHECK
-#define TARGET_SCHED_GEN_CHECK ia64_gen_check
+#undef TARGET_SCHED_GEN_SPEC_CHECK
+#define TARGET_SCHED_GEN_SPEC_CHECK ia64_gen_spec_check
 
 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD_SPEC
 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD_SPEC\
   ia64_first_cycle_multipass_dfa_lookahead_guard_spec
 
+#undef TARGET_SCHED_SKIP_RTX_P
+#define TARGET_SCHED_SKIP_RTX_P ia64_skip_rtx_p
+
 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
 #define TARGET_FUNCTION_OK_FOR_SIBCALL ia64_function_ok_for_sibcall
 #undef TARGET_ARG_PARTIAL_BYTES
@@ -408,7 +443,7 @@ static const struct attribute_spec ia64_attribute_table[] =
 #undef TARGET_RTX_COSTS
 #define TARGET_RTX_COSTS ia64_rtx_costs
 #undef TARGET_ADDRESS_COST
-#define TARGET_ADDRESS_COST hook_int_rtx_0
+#define TARGET_ADDRESS_COST hook_int_rtx_bool_0
 
 #undef TARGET_UNSPEC_MAY_TRAP_P
 #define TARGET_UNSPEC_MAY_TRAP_P ia64_unspec_may_trap_p
@@ -493,12 +528,6 @@ static const struct attribute_spec ia64_attribute_table[] =
 #undef TARGET_C_MODE_FOR_SUFFIX
 #define TARGET_C_MODE_FOR_SUFFIX ia64_c_mode_for_suffix
 
-#undef TARGET_OPTION_COLD_ATTRIBUTE_SETS_OPTIMIZATION
-#define TARGET_OPTION_COLD_ATTRIBUTE_SETS_OPTIMIZATION true
-
-#undef TARGET_OPTION_HOT_ATTRIBUTE_SETS_OPTIMIZATION
-#define TARGET_OPTION_HOT_ATTRIBUTE_SETS_OPTIMIZATION true
-
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 typedef enum
@@ -1486,7 +1515,7 @@ ia64_expand_compare (enum rtx_code code, enum machine_mode mode)
   /* HPUX TFmode compare requires a library call to _U_Qfcmp, which takes a
      magic number as its third argument, that indicates what to do.
      The return value is an integer to be compared against zero.  */
-  else if (GET_MODE (op0) == TFmode)
+  else if (TARGET_HPUX && GET_MODE (op0) == TFmode)
     {
       enum qfcmp_magic {
 	QCMP_INV = 1,	/* Raise FP_INVALID on SNaN as a side effect.  */
@@ -2114,11 +2143,13 @@ ia64_expand_atomic_op (enum rtx_code code, rtx mem, rtx val,
   new_reg = cmp_reg;
   if (code == NOT)
     {
-      new_reg = expand_simple_unop (DImode, NOT, new_reg, NULL_RTX, true);
-      code = AND;
+      new_reg = expand_simple_binop (DImode, AND, new_reg, val, NULL_RTX,
+				     true, OPTAB_DIRECT);
+      new_reg = expand_simple_unop (DImode, code, new_reg, NULL_RTX, true);
     }
-  new_reg = expand_simple_binop (DImode, code, new_reg, val, NULL_RTX,
-				 true, OPTAB_DIRECT);
+  else
+    new_reg = expand_simple_binop (DImode, code, new_reg, val, NULL_RTX,
+				   true, OPTAB_DIRECT);
 
   if (mode != DImode)
     new_reg = gen_lowpart (mode, new_reg);
@@ -4334,8 +4365,9 @@ ia64_function_ok_for_sibcall (tree decl, tree exp ATTRIBUTE_UNUSED)
     return false;
 
   /* We must always return with our current GP.  This means we can
-     only sibcall to functions defined in the current module.  */
-  return decl && (*targetm.binds_local_p) (decl);
+     only sibcall to functions defined in the current module unless
+     TARGET_CONST_GP is set to true.  */
+  return (decl && (*targetm.binds_local_p) (decl)) || TARGET_CONST_GP;
 }
 
 
@@ -4816,7 +4848,8 @@ ia64_print_operand (FILE * file, rtx x, int code)
 /* ??? This is incomplete.  */
 
 static bool
-ia64_rtx_costs (rtx x, int code, int outer_code, int *total)
+ia64_rtx_costs (rtx x, int code, int outer_code, int *total,
+		bool speed ATTRIBUTE_UNUSED)
 {
   switch (code)
     {
@@ -5179,6 +5212,8 @@ fix_range (const char *const_str)
 static bool
 ia64_handle_option (size_t code, const char *arg, int value)
 {
+  static bool warned_itanium1_deprecated;
+
   switch (code)
     {
     case OPT_mfixed_range_:
@@ -5212,6 +5247,16 @@ ia64_handle_option (size_t code, const char *arg, int value)
 	  if (!strcmp (arg, processor_alias_table[i].name))
 	    {
 	      ia64_tune = processor_alias_table[i].processor;
+	      if (ia64_tune == PROCESSOR_ITANIUM
+		  && ! warned_itanium1_deprecated)
+		{
+		  inform (0,
+			  "value %<%s%> for -mtune= switch is deprecated",
+			  arg);
+		  inform (0, "GCC 4.4 is the last release with "
+			  "Itanium1 tuning support");
+		  warned_itanium1_deprecated = true;
+		}
 	      break;
 	    }
 	if (i == pta_size)
@@ -5238,9 +5283,36 @@ ia64_override_options (void)
       TARGET_INLINE_SQRT = INL_MAX_THR;
     }
 
+  ia64_flag_schedule_insns2 = flag_schedule_insns_after_reload;
+  flag_schedule_insns_after_reload = 0;
+
+  if (optimize >= 3
+      && ! sel_sched_switch_set)
+    {
+      flag_selective_scheduling2 = 1;
+      flag_sel_sched_pipelining = 1;
+    }
+  if (mflag_sched_control_spec == 2)
+    {
+      /* Control speculation is on by default for the selective scheduler,
+         but not for the Haifa scheduler.  */
+      mflag_sched_control_spec = flag_selective_scheduling2 ? 1 : 0;
+    }
+  if (flag_sel_sched_pipelining && flag_auto_inc_dec)
+    {
+      /* FIXME: remove this once breaking of auto-inc/dec insns is
+         implemented as a transformation.  */
+      flag_auto_inc_dec = 0;
+    }
+
   ia64_section_threshold = g_switch_set ? g_switch_value : IA64_DEFAULT_GVALUE;
 
   init_machine_status = ia64_init_machine_status;
+
+  if (align_functions <= 0)
+    align_functions = 64;
+  if (align_loops <= 0)
+    align_loops = 32;
 }
 
 /* Initialize the record of emitted frame related registers.  */
@@ -5895,6 +5967,7 @@ rtx_needs_barrier (rtx x, struct reg_flags flags, int pred)
 	case UNSPEC_FR_SQRT_RECIP_APPROX_RES:
 	case UNSPEC_LDA:
 	case UNSPEC_LDS:
+	case UNSPEC_LDS_A:
 	case UNSPEC_LDSA:
 	case UNSPEC_CHKACLR:
         case UNSPEC_CHKS:
@@ -6100,6 +6173,7 @@ group_barrier_needed (rtx insn)
 	 asm.  */
       if (! need_barrier)
 	need_barrier = rws_access_regno (REG_VOLATILE, flags, 0);
+
       break;
 
     default:
@@ -6283,10 +6357,6 @@ static rtx dfa_stop_insn;
 
 static rtx last_scheduled_insn;
 
-/* The following variable value is size of the DFA state.  */
-
-static size_t dfa_state_size;
-
 /* The following variable value is pointer to a DFA state used as
    temporary variable.  */
 
@@ -6302,14 +6372,6 @@ static state_t prev_cycle_state = NULL;
 
 static char *stops_p = NULL;
 
-/* The following array element values are ZERO for non-speculative
-   instructions and hold corresponding speculation check number for
-   speculative instructions.  */
-static int *spec_check_no = NULL;
-
-/* Size of spec_check_no array.  */
-static int max_uid = 0;
-
 /* The following variable is used to set up the mentioned above array.  */
 
 static int stop_before_p = 0;
@@ -6333,6 +6395,12 @@ static int *add_cycles;
 /* The following variable value is number of data speculations in progress.  */
 static int pending_data_specs = 0;
 
+/* Number of memory references on current and three future processor cycles.  */
+static char mem_ops_in_group[4];
+
+/* Number of current processor cycle (from scheduler's point of view).  */
+static int current_cycle;
+
 static rtx ia64_single_set (rtx);
 static void ia64_emit_insn_before (rtx, rtx);
 
@@ -6382,20 +6450,44 @@ ia64_single_set (rtx insn)
   return ret;
 }
 
-/* Adjust the cost of a scheduling dependency.  Return the new cost of
-   a dependency LINK or INSN on DEP_INSN.  COST is the current cost.  */
-
+/* Adjust the cost of a scheduling dependency.
+   Return the new cost of a dependency of type DEP_TYPE of INSN on DEP_INSN.
+   COST is the current cost, DW is dependency weakness.  */
 static int
-ia64_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
+ia64_adjust_cost_2 (rtx insn, int dep_type1, rtx dep_insn, int cost, dw_t dw)
 {
+  enum reg_note dep_type = (enum reg_note) dep_type1;
   enum attr_itanium_class dep_class;
   enum attr_itanium_class insn_class;
 
-  if (REG_NOTE_KIND (link) != REG_DEP_OUTPUT)
-    return cost;
-
   insn_class = ia64_safe_itanium_class (insn);
   dep_class = ia64_safe_itanium_class (dep_insn);
+
+  /* Treat true memory dependencies separately.  Ignore apparent true
+     dependence between store and call (call has a MEM inside a SYMBOL_REF).  */
+  if (dep_type == REG_DEP_TRUE
+      && (dep_class == ITANIUM_CLASS_ST || dep_class == ITANIUM_CLASS_STF)
+      && (insn_class == ITANIUM_CLASS_BR || insn_class == ITANIUM_CLASS_SCALL))
+    return 0;
+
+  if (dw == MIN_DEP_WEAK)
+    /* Store and load are likely to alias, use higher cost to avoid stall.  */
+    return PARAM_VALUE (PARAM_SCHED_MEM_TRUE_DEP_COST);
+  else if (dw > MIN_DEP_WEAK)
+    {
+      /* Store and load are less likely to alias.  */
+      if (mflag_sched_fp_mem_deps_zero_cost && dep_class == ITANIUM_CLASS_STF)
+	/* Assume there will be no cache conflict for floating-point data.
+	   For integer data, L1 conflict penalty is huge (17 cycles), so we
+	   never assume it will not cause a conflict.  */
+	return 0;
+      else
+	return cost;
+    }
+
+  if (dep_type != REG_DEP_OUTPUT)
+    return cost;
+
   if (dep_class == ITANIUM_CLASS_ST || dep_class == ITANIUM_CLASS_STF
       || insn_class == ITANIUM_CLASS_ST || insn_class == ITANIUM_CLASS_STF)
     return 0;
@@ -6480,7 +6572,7 @@ ia64_sched_init (FILE *dump ATTRIBUTE_UNUSED,
 #ifdef ENABLE_CHECKING
   rtx insn;
 
-  if (reload_completed)
+  if (!sel_sched_p () && reload_completed)
     for (insn = NEXT_INSN (current_sched_info->prev_head);
 	 insn != current_sched_info->next_tail;
 	 insn = NEXT_INSN (insn))
@@ -6488,6 +6580,9 @@ ia64_sched_init (FILE *dump ATTRIBUTE_UNUSED,
 #endif
   last_scheduled_insn = NULL_RTX;
   init_insn_group_barriers ();
+
+  current_cycle = 0;
+  memset (mem_ops_in_group, 0, sizeof (mem_ops_in_group));
 }
 
 /* We're beginning a scheduling pass.  Check assertion.  */
@@ -6497,7 +6592,7 @@ ia64_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
                         int sched_verbose ATTRIBUTE_UNUSED,
                         int max_ready ATTRIBUTE_UNUSED)
 {  
-  gcc_assert (!pending_data_specs);
+  gcc_assert (pending_data_specs == 0);
 }
 
 /* Scheduling pass is now finished.  Free/reset static variable.  */
@@ -6505,9 +6600,41 @@ static void
 ia64_sched_finish_global (FILE *dump ATTRIBUTE_UNUSED,
 			  int sched_verbose ATTRIBUTE_UNUSED)
 {
-  free (spec_check_no);
-  spec_check_no = 0;
-  max_uid = 0;
+  gcc_assert (pending_data_specs == 0);
+}
+
+/* Return TRUE if INSN has itanium class LD or FLD and its check_load attribute
+   is NO, i.e. it is a load but not a speculation check; FALSE otherwise.  */
+static bool
+is_load_p (rtx insn)
+{
+  enum attr_itanium_class insn_class = ia64_safe_itanium_class (insn);
+
+  return
+   ((insn_class == ITANIUM_CLASS_LD || insn_class == ITANIUM_CLASS_FLD)
+    && get_attr_check_load (insn) == CHECK_LOAD_NO);
+}
+
+/* If INSN is a load or a store, count it in the MEM_OPS_IN_GROUP global array.
+   A load is counted in the slot of CURRENT_CYCLE, a store three cycles later
+   (stores postpone their cache reference: Intel Itanium 2 Reference Manual
+   for Software Development and Optimization, 6.7.3.1).  */
+static void
+record_memory_reference (rtx insn)
+{
+  enum attr_itanium_class insn_class = ia64_safe_itanium_class (insn);
+
+  switch (insn_class) {
+    case ITANIUM_CLASS_FLD:
+    case ITANIUM_CLASS_LD:
+      mem_ops_in_group[current_cycle % 4]++;
+      break;
+    case ITANIUM_CLASS_STF:
+    case ITANIUM_CLASS_ST:
+      mem_ops_in_group[(current_cycle + 3) % 4]++;
+      break;
+    default:;
+  }
 }
 
 /* We are about to being issuing insns for this clock cycle.
@@ -6515,7 +6642,7 @@ ia64_sched_finish_global (FILE *dump ATTRIBUTE_UNUSED,
 
 static int
 ia64_dfa_sched_reorder (FILE *dump, int sched_verbose, rtx *ready,
-			int *pn_ready, int clock_var ATTRIBUTE_UNUSED,
+			int *pn_ready, int clock_var,
 			int reorder_type)
 {
   int n_asms;
@@ -6595,6 +6722,27 @@ ia64_dfa_sched_reorder (FILE *dump, int sched_verbose, rtx *ready,
       ready += deleted;
     }
 
+  current_cycle = clock_var;
+  if (reload_completed && mem_ops_in_group[clock_var % 4] >= ia64_max_memory_insns)
+    {
+      int moved = 0;
+
+      insnp = e_ready;
+      /* Move down loads/stores, preserving relative order.  */
+      while (insnp-- > ready + moved)
+	while (insnp >= ready + moved)
+	  {
+	    rtx insn = *insnp;
+	    if (! is_load_p (insn))
+	      break;
+	    memmove (ready + 1, ready, (insnp - ready) * sizeof (rtx));
+	    *ready = insn;
+	    moved++;
+	  }
+      n_ready -= moved;
+      ready += moved;
+    }
+
   return 1;
 }
 
@@ -6632,9 +6780,9 @@ ia64_variable_issue (FILE *dump ATTRIBUTE_UNUSED,
 		     rtx insn ATTRIBUTE_UNUSED,
 		     int can_issue_more ATTRIBUTE_UNUSED)
 {
-  if (current_sched_info->flags & DO_SPECULATION)
+  if (sched_deps_info->generate_spec_deps && !sel_sched_p ())
     /* Modulo scheduling does not extend h_i_d when emitting
-       new instructions.  Deal with it.  */
+       new instructions.  Don't use h_i_d, if we don't have to.  */
     {
       if (DONE_SPEC (insn) & BEGIN_DATA)
 	pending_data_specs++;
@@ -6653,6 +6801,8 @@ ia64_variable_issue (FILE *dump ATTRIBUTE_UNUSED,
 	init_insn_group_barriers ();
       stops_p [INSN_UID (insn)] = stop_before_p;
       stop_before_p = 0;
+
+      record_memory_reference (insn);
     }
   return 1;
 }
@@ -6663,10 +6813,13 @@ ia64_variable_issue (FILE *dump ATTRIBUTE_UNUSED,
 static int
 ia64_first_cycle_multipass_dfa_lookahead_guard (rtx insn)
 {
-  gcc_assert (insn  && INSN_P (insn));
+  gcc_assert (insn && INSN_P (insn));
   return ((!reload_completed
 	   || !safe_group_barrier_needed (insn))
-	  && ia64_first_cycle_multipass_dfa_lookahead_guard_spec (insn));
+	  && ia64_first_cycle_multipass_dfa_lookahead_guard_spec (insn)
+	  && (!mflag_sched_mem_insns_hard_limit
+	      || !is_load_p (insn)
+	      || mem_ops_in_group[current_cycle % 4] < ia64_max_memory_insns));
 }
 
 /* We are choosing insn from the ready queue.  Return nonzero if INSN
@@ -6688,6 +6841,26 @@ ia64_first_cycle_multipass_dfa_lookahead_guard_spec (const_rtx insn)
 
 static rtx dfa_pre_cycle_insn;
 
+/* Return 1 if LAST, or any insn reached by walking back from LAST until a basic
+   block note or an insn flagged in STOPS_P, is recognized; return 0 otherwise.  */
+static int
+scheduled_good_insn (rtx last)
+{
+  if (last && recog_memoized (last) >= 0)
+    return 1;
+
+  for ( ;
+       last != NULL && !NOTE_INSN_BASIC_BLOCK_P (last)
+       && !stops_p[INSN_UID (last)];
+       last = PREV_INSN (last))
+    /* We could hit a NOTE_INSN_DELETED here which is actually outside
+       the ebb we're scheduling, hence the INSN_P test.  */
+    if (INSN_P (last) && recog_memoized (last) >= 0)
+      return 1;
+
+  return 0;
+}
+
 /* We are about to being issuing INSN.  Return nonzero if we cannot
    issue it on given cycle CLOCK and return zero if we should not sort
    the ready queue on the next clock start.  */
@@ -6699,17 +6872,32 @@ ia64_dfa_new_cycle (FILE *dump, int verbose, rtx insn, int last_clock,
   int setup_clocks_p = FALSE;
 
   gcc_assert (insn && INSN_P (insn));
-  if ((reload_completed && safe_group_barrier_needed (insn))
+  /* When a group barrier is needed for insn, last_scheduled_insn
+     should be set.  */
+  gcc_assert (!(reload_completed && safe_group_barrier_needed (insn))
+              || last_scheduled_insn);
+
+  if ((reload_completed
+       && (safe_group_barrier_needed (insn)
+	   || (mflag_sched_stop_bits_after_every_cycle
+	       && last_clock != clock
+	       && last_scheduled_insn
+	       && scheduled_good_insn (last_scheduled_insn))))
       || (last_scheduled_insn
 	  && (GET_CODE (last_scheduled_insn) == CALL_INSN
 	      || GET_CODE (PATTERN (last_scheduled_insn)) == ASM_INPUT
 	      || asm_noperands (PATTERN (last_scheduled_insn)) >= 0)))
     {
       init_insn_group_barriers ();
+
       if (verbose && dump)
 	fprintf (dump, "//    Stop should be before %d%s\n", INSN_UID (insn),
 		 last_clock == clock ? " + cycle advance" : "");
+
       stop_before_p = 1;
+      current_cycle = clock;
+      mem_ops_in_group[current_cycle % 4] = 0;
+
       if (last_clock == clock)
 	{
 	  state_transition (curr_state, dfa_stop_insn);
@@ -6722,19 +6910,24 @@ ia64_dfa_new_cycle (FILE *dump, int verbose, rtx insn, int last_clock,
 	}
       else if (reload_completed)
 	setup_clocks_p = TRUE;
-      if (GET_CODE (PATTERN (last_scheduled_insn)) == ASM_INPUT
-	  || asm_noperands (PATTERN (last_scheduled_insn)) >= 0)
-	state_reset (curr_state);
-      else
+
+      if (last_scheduled_insn)
 	{
-	  memcpy (curr_state, prev_cycle_state, dfa_state_size);
-	  state_transition (curr_state, dfa_stop_insn);
-	  state_transition (curr_state, dfa_pre_cycle_insn);
-	  state_transition (curr_state, NULL);
+	  if (GET_CODE (PATTERN (last_scheduled_insn)) == ASM_INPUT
+	      || asm_noperands (PATTERN (last_scheduled_insn)) >= 0)
+	    state_reset (curr_state);
+	  else
+	    {
+	      memcpy (curr_state, prev_cycle_state, dfa_state_size);
+	      state_transition (curr_state, dfa_stop_insn);
+	      state_transition (curr_state, dfa_pre_cycle_insn);
+	      state_transition (curr_state, NULL);
+	    }
 	}
     }
   else if (reload_completed)
     setup_clocks_p = TRUE;
+
   if (setup_clocks_p && ia64_tune == PROCESSOR_ITANIUM
       && GET_CODE (PATTERN (insn)) != ASM_INPUT
       && asm_noperands (PATTERN (insn)) < 0)
@@ -6765,6 +6958,7 @@ ia64_dfa_new_cycle (FILE *dump, int verbose, rtx insn, int last_clock,
 	    add_cycles [INSN_UID (insn)] = 3 - d;
 	}
     }
+
   return 0;
 }
 
@@ -6773,18 +6967,9 @@ ia64_dfa_new_cycle (FILE *dump, int verbose, rtx insn, int last_clock,
 static void
 ia64_h_i_d_extended (void)
 {
-  if (current_sched_info->flags & DO_SPECULATION)
-    {
-      int new_max_uid = get_max_uid () + 1;
-
-      spec_check_no = (int *) xrecalloc (spec_check_no, new_max_uid,
-				 max_uid, sizeof (*spec_check_no));
-      max_uid = new_max_uid;
-    }
-
   if (stops_p != NULL) 
     {
-      int new_clocks_length = get_max_uid () + 1;
+      int new_clocks_length = get_max_uid () * 3 / 2;
       
       stops_p = (char *) xrecalloc (stops_p, new_clocks_length, clocks_length, 1);
       
@@ -6799,6 +6984,203 @@ ia64_h_i_d_extended (void)
       clocks_length = new_clocks_length;
     }
 }
+
+
+/* Backend data that guides scheduling; each field mirrors the file-scope
+   variable of the same name.  It is saved when the current scheduling point
+   is switched and restored if the scheduler returns to that point.  */
+struct _ia64_sched_context
+{
+  state_t prev_cycle_state;
+  rtx last_scheduled_insn;
+  struct reg_write_state rws_sum[NUM_REGS];
+  struct reg_write_state rws_insn[NUM_REGS];
+  int first_instruction;
+  int pending_data_specs;
+  int current_cycle;
+  char mem_ops_in_group[4];
+};
+typedef struct _ia64_sched_context *ia64_sched_context_t;
+
+/* Allocate storage for a scheduling context; its fields are left unset.  */
+static void *
+ia64_alloc_sched_context (void)
+{
+  return xmalloc (sizeof (struct _ia64_sched_context));
+}
+
+/* Give _SC a freshly allocated DFA state, then fill it with clean data if
+   CLEAN_P and with a snapshot of the global scheduling state otherwise.  */
+static void
+ia64_init_sched_context (void *_sc, bool clean_p)
+{
+  ia64_sched_context_t sc = (ia64_sched_context_t) _sc;
+
+  sc->prev_cycle_state = xmalloc (dfa_state_size);
+  if (clean_p)
+    {
+      state_reset (sc->prev_cycle_state);
+      sc->last_scheduled_insn = NULL_RTX;
+      memset (sc->rws_sum, 0, sizeof (sc->rws_sum));
+      memset (sc->rws_insn, 0, sizeof (sc->rws_insn));
+      sc->first_instruction = 1;
+      sc->pending_data_specs = 0;
+      sc->current_cycle = 0;
+      memset (sc->mem_ops_in_group, 0, sizeof (sc->mem_ops_in_group));
+    }
+  else
+    {
+      memcpy (sc->prev_cycle_state, prev_cycle_state, dfa_state_size);
+      sc->last_scheduled_insn = last_scheduled_insn;
+      memcpy (sc->rws_sum, rws_sum, sizeof (sc->rws_sum));
+      memcpy (sc->rws_insn, rws_insn, sizeof (sc->rws_insn));
+      sc->first_instruction = first_instruction;
+      sc->pending_data_specs = pending_data_specs;
+      sc->current_cycle = current_cycle;
+      memcpy (sc->mem_ops_in_group, mem_ops_in_group, sizeof (sc->mem_ops_in_group));
+    }
+}
+
+/* Load the file-scope scheduling state from the context pointed to by _SC.  */
+static void
+ia64_set_sched_context (void *_sc)
+{
+  ia64_sched_context_t sc = (ia64_sched_context_t) _sc;
+
+  gcc_assert (sc != NULL);
+
+  memcpy (prev_cycle_state, sc->prev_cycle_state, dfa_state_size);
+  last_scheduled_insn = sc->last_scheduled_insn;
+  memcpy (rws_sum, sc->rws_sum, sizeof (rws_sum));
+  memcpy (rws_insn, sc->rws_insn, sizeof (rws_insn));
+  first_instruction = sc->first_instruction;
+  pending_data_specs = sc->pending_data_specs;
+  current_cycle = sc->current_cycle;
+  memcpy (mem_ops_in_group, sc->mem_ops_in_group, sizeof (mem_ops_in_group));
+}
+
+/* Free the DFA state held by _SC; the context itself stays allocated.  */
+static void
+ia64_clear_sched_context (void *_sc)
+{
+  ia64_sched_context_t sc = (ia64_sched_context_t) _sc;
+  
+  free (sc->prev_cycle_state);
+  sc->prev_cycle_state = NULL;
+}
+
+/* Free the _SC context itself; _SC must not be NULL (asserted).  */
+static void
+ia64_free_sched_context (void *_sc)
+{
+  gcc_assert (_sc != NULL);
+
+  free (_sc);
+}
+
+typedef rtx (* gen_func_t) (rtx, rtx);
+
+/* Return the generator of a load for mode index MODE_NO whose pattern provides
+   speculation types TS.  The read-only tables below are indexed by MODE_NO.  */
+static gen_func_t
+get_spec_load_gen_function (ds_t ts, int mode_no)
+{
+  static const gen_func_t gen_ld_[] = {
+    gen_movbi,
+    gen_movqi_internal,
+    gen_movhi_internal,
+    gen_movsi_internal,
+    gen_movdi_internal,
+    gen_movsf_internal,
+    gen_movdf_internal,
+    gen_movxf_internal,
+    gen_movti_internal,
+    gen_zero_extendqidi2,
+    gen_zero_extendhidi2,
+    gen_zero_extendsidi2,
+  };
+
+  static const gen_func_t gen_ld_a[] = {
+    gen_movbi_advanced,
+    gen_movqi_advanced,
+    gen_movhi_advanced,
+    gen_movsi_advanced,
+    gen_movdi_advanced,
+    gen_movsf_advanced,
+    gen_movdf_advanced,
+    gen_movxf_advanced,
+    gen_movti_advanced,
+    gen_zero_extendqidi2_advanced,
+    gen_zero_extendhidi2_advanced,
+    gen_zero_extendsidi2_advanced,
+  };
+  static const gen_func_t gen_ld_s[] = {
+    gen_movbi_speculative,
+    gen_movqi_speculative,
+    gen_movhi_speculative,
+    gen_movsi_speculative,
+    gen_movdi_speculative,
+    gen_movsf_speculative,
+    gen_movdf_speculative,
+    gen_movxf_speculative,
+    gen_movti_speculative,
+    gen_zero_extendqidi2_speculative,
+    gen_zero_extendhidi2_speculative,
+    gen_zero_extendsidi2_speculative,
+  };
+  static const gen_func_t gen_ld_sa[] = {
+    gen_movbi_speculative_advanced,
+    gen_movqi_speculative_advanced,
+    gen_movhi_speculative_advanced,
+    gen_movsi_speculative_advanced,
+    gen_movdi_speculative_advanced,
+    gen_movsf_speculative_advanced,
+    gen_movdf_speculative_advanced,
+    gen_movxf_speculative_advanced,
+    gen_movti_speculative_advanced,
+    gen_zero_extendqidi2_speculative_advanced,
+    gen_zero_extendhidi2_speculative_advanced,
+    gen_zero_extendsidi2_speculative_advanced,
+  };
+  static const gen_func_t gen_ld_s_a[] = {
+    gen_movbi_speculative_a,
+    gen_movqi_speculative_a,
+    gen_movhi_speculative_a,
+    gen_movsi_speculative_a,
+    gen_movdi_speculative_a,
+    gen_movsf_speculative_a,
+    gen_movdf_speculative_a,
+    gen_movxf_speculative_a,
+    gen_movti_speculative_a,
+    gen_zero_extendqidi2_speculative_a,
+    gen_zero_extendhidi2_speculative_a,
+    gen_zero_extendsidi2_speculative_a,
+  };
+
+  const gen_func_t *gen_ld;
+
+  if (ts & BEGIN_DATA)
+    {
+      if (ts & BEGIN_CONTROL)
+	gen_ld = gen_ld_sa;
+      else
+	gen_ld = gen_ld_a;
+    }
+  else if (ts & BEGIN_CONTROL)
+    {
+      if ((spec_info->flags & SEL_SCHED_SPEC_DONT_CHECK_CONTROL)
+	  || ia64_needs_block_p (ts))
+	gen_ld = gen_ld_s;
+      else
+	gen_ld = gen_ld_s_a;
+    }
+  else if (ts == 0)
+    gen_ld = gen_ld_;
+  else
+    gcc_unreachable ();
+
+  return gen_ld[mode_no];
+}
 
 /* Constants that help mapping 'enum machine_mode' to int.  */
 enum SPEC_MODES
@@ -6810,6 +7192,12 @@ enum SPEC_MODES
     SPEC_MODE_LAST = 8
   };
 
+enum
+  {
+    /* Presumably the index offset from a mode to its ZERO_EXTEND pattern.  */
+    SPEC_GEN_EXTEND_OFFSET = SPEC_MODE_LAST - SPEC_MODE_FOR_EXTEND_FIRST + 1
+  };
+
 /* Return index of the MODE.  */
 static int
 ia64_mode_to_int (enum machine_mode mode)
@@ -6840,28 +7228,34 @@ ia64_set_sched_flags (spec_info_t spec_info)
   unsigned int *flags = &(current_sched_info->flags);
 
   if (*flags & SCHED_RGN
-      || *flags & SCHED_EBB)  
+      || *flags & SCHED_EBB
+      || *flags & SEL_SCHED)
     {
       int mask = 0;
 
       if ((mflag_sched_br_data_spec && !reload_completed && optimize > 0)
-	  || (mflag_sched_ar_data_spec && reload_completed))
+          || (mflag_sched_ar_data_spec && reload_completed))
 	{
 	  mask |= BEGIN_DATA;
-	  
-	  if ((mflag_sched_br_in_data_spec && !reload_completed)
-	      || (mflag_sched_ar_in_data_spec && reload_completed))
+
+	  if (!sel_sched_p ()
+	      && ((mflag_sched_br_in_data_spec && !reload_completed)
+		  || (mflag_sched_ar_in_data_spec && reload_completed)))
 	    mask |= BE_IN_DATA;
 	}
       
-      if (mflag_sched_control_spec)
+      if (mflag_sched_control_spec
+          && (!sel_sched_p ()
+	      || reload_completed))
 	{
 	  mask |= BEGIN_CONTROL;
 	  
-	  if (mflag_sched_in_control_spec)
+	  if (!sel_sched_p () && mflag_sched_in_control_spec)
 	    mask |= BE_IN_CONTROL;
 	}
 
+      spec_info->mask = mask;
+
       if (mask)
 	{
 	  *flags |= USE_DEPS_LIST | DO_SPECULATION;
@@ -6869,23 +7263,22 @@ ia64_set_sched_flags (spec_info_t spec_info)
 	  if (mask & BE_IN_SPEC)
 	    *flags |= NEW_BBS;
 	  
-	  spec_info->mask = mask;
 	  spec_info->flags = 0;
       
 	  if ((mask & DATA_SPEC) && mflag_sched_prefer_non_data_spec_insns)
 	    spec_info->flags |= PREFER_NON_DATA_SPEC;
 
-	  if ((mask & CONTROL_SPEC)
-	      && mflag_sched_prefer_non_control_spec_insns)
-	    spec_info->flags |= PREFER_NON_CONTROL_SPEC;
-
-	  if (mflag_sched_spec_verbose)
+	  if (mask & CONTROL_SPEC)
 	    {
-	      if (sched_verbose >= 1)
-		spec_info->dump = sched_dump;
-	      else
-		spec_info->dump = stderr;
+	      if (mflag_sched_prefer_non_control_spec_insns)
+		spec_info->flags |= PREFER_NON_CONTROL_SPEC;
+
+	      if (sel_sched_p () && mflag_sel_sched_dont_check_control_spec)
+		spec_info->flags |= SEL_SCHED_SPEC_DONT_CHECK_CONTROL;
 	    }
+
+	  if (sched_verbose >= 1)
+	    spec_info->dump = sched_dump;
 	  else
 	    spec_info->dump = 0;
 	  
@@ -6893,306 +7286,288 @@ ia64_set_sched_flags (spec_info_t spec_info)
 	    spec_info->flags |= COUNT_SPEC_IN_CRITICAL_PATH;
 	}
     }
+  else
+    spec_info->mask = 0;
 }
 
-/* Implement targetm.sched.speculate_insn hook.
-   Check if the INSN can be TS speculative.
-   If 'no' - return -1.
-   If 'yes' - generate speculative pattern in the NEW_PAT and return 1.
-   If current pattern of the INSN already provides TS speculation, return 0.  */
+/* If INSN is an appropriate load or speculation check, return its mode number
+   (plus SPEC_GEN_EXTEND_OFFSET for an extending load).  Return -1 otherwise.  */
 static int
-ia64_speculate_insn (rtx insn, ds_t ts, rtx *new_pat)
-{  
-  rtx pat, reg, mem, mem_reg;
-  int mode_no, gen_p = 1;
+get_mode_no_for_insn (rtx insn)
+{
+  rtx reg, mem, mode_rtx;
+  int mode_no;
   bool extend_p;
-  
-  gcc_assert (!(ts & ~BEGIN_SPEC) && ts);
-           
-  pat = PATTERN (insn);
 
-  if (GET_CODE (pat) == COND_EXEC)
-    pat = COND_EXEC_CODE (pat);
+  extract_insn_cached (insn);
 
-  /* This should be a SET ...  */
-  if (GET_CODE (pat) != SET)
-    return -1;
+  /* We use WHICH_ALTERNATIVE only after reload.  This will
+     guarantee that reload won't touch a speculative insn.  */
 
-  reg = SET_DEST (pat);
-  /* ... to the general/fp register ...  */
-  if (!REG_P (reg) || !(GR_REGNO_P (REGNO (reg)) || FP_REGNO_P (REGNO (reg))))
+  if (recog_data.n_operands != 2)
     return -1;
 
-  /* ... from the mem ...  */
-  mem = SET_SRC (pat);
+  reg = recog_data.operand[0];
+  mem = recog_data.operand[1];
 
-  /* ... that can, possibly, be a zero_extend ...  */
-  if (GET_CODE (mem) == ZERO_EXTEND)
+  /* We should use MEM's mode since REG's mode in presence of
+     ZERO_EXTEND will always be DImode.  */
+  if (get_attr_speculable1 (insn) == SPECULABLE1_YES)
+    /* Process non-speculative ld.  */
+    {
+      if (!reload_completed)
+	{
+	  /* Do not speculate into regs like ar.lc.  */
+	  if (!REG_P (reg) || AR_REGNO_P (REGNO (reg)))
+	    return -1;
+
+	  if (!MEM_P (mem))
+	    return -1;
+
+	  {
+	    rtx mem_reg = XEXP (mem, 0);
+
+	    if (!REG_P (mem_reg))
+	      return -1;
+	  }
+
+	  mode_rtx = mem;
+	}
+      else if (get_attr_speculable2 (insn) == SPECULABLE2_YES)
+	{
+	  gcc_assert (REG_P (reg) && MEM_P (mem));
+	  mode_rtx = mem;
+	}
+      else
+	return -1;
+    }
+  else if (get_attr_data_speculative (insn) == DATA_SPECULATIVE_YES
+	   || get_attr_control_speculative (insn) == CONTROL_SPECULATIVE_YES
+	   || get_attr_check_load (insn) == CHECK_LOAD_YES)
+    /* Process speculative ld or ld.c.  */
     {
-      mem = XEXP (mem, 0);
-      extend_p = true;      
+      gcc_assert (REG_P (reg) && MEM_P (mem));
+      mode_rtx = mem;
     }
   else
-    extend_p = false;
-
-  /* ... or a speculative load.  */
-  if (GET_CODE (mem) == UNSPEC)
     {
-      int code;
-      
-      code = XINT (mem, 1);
-      if (code != UNSPEC_LDA && code != UNSPEC_LDS && code != UNSPEC_LDSA)
-	return -1;
-
-      if ((code == UNSPEC_LDA && !(ts & BEGIN_CONTROL))
-	  || (code == UNSPEC_LDS && !(ts & BEGIN_DATA))
-	  || code == UNSPEC_LDSA)
-	gen_p = 0;
+      enum attr_itanium_class attr_class = get_attr_itanium_class (insn);
 
-      mem = XVECEXP (mem, 0, 0);
-      gcc_assert (MEM_P (mem));
+      if (attr_class == ITANIUM_CLASS_CHK_A
+	  || attr_class == ITANIUM_CLASS_CHK_S_I
+	  || attr_class == ITANIUM_CLASS_CHK_S_F)
+	/* Process chk.  */
+	mode_rtx = reg;
+      else
+	return -1;
     }
 
-  /* Source should be a mem ...  */
-  if (!MEM_P (mem))
-    return -1;
+  mode_no = ia64_mode_to_int (GET_MODE (mode_rtx));
 
-  /* ... addressed by a register.  */
-  mem_reg = XEXP (mem, 0);
-  if (!REG_P (mem_reg))
-    return -1;
-     
-  /* We should use MEM's mode since REG's mode in presence of ZERO_EXTEND
-     will always be DImode.  */
-  mode_no = ia64_mode_to_int (GET_MODE (mem));
-  
-  if (mode_no == SPEC_MODE_INVALID
-      || (extend_p
-	  && !(SPEC_MODE_FOR_EXTEND_FIRST <= mode_no
-	       && mode_no <= SPEC_MODE_FOR_EXTEND_LAST)))
+  if (mode_no == SPEC_MODE_INVALID)
     return -1;
 
-  extract_insn_cached (insn);
-  gcc_assert (reg == recog_data.operand[0] && mem == recog_data.operand[1]);
+  extend_p = (GET_MODE (reg) != GET_MODE (mode_rtx));
 
-  *new_pat = ia64_gen_spec_insn (insn, ts, mode_no, gen_p != 0, extend_p);
+  if (extend_p)
+    {
+      if (!(SPEC_MODE_FOR_EXTEND_FIRST <= mode_no
+	    && mode_no <= SPEC_MODE_FOR_EXTEND_LAST))
+	return -1;
+
+      mode_no += SPEC_GEN_EXTEND_OFFSET;
+    }
 
-  return gen_p;
+  return mode_no;
 }
 
-enum
-  {
-    /* Offset to reach ZERO_EXTEND patterns.  */
-    SPEC_GEN_EXTEND_OFFSET = SPEC_MODE_LAST - SPEC_MODE_FOR_EXTEND_FIRST + 1,
-    /* Number of patterns for each speculation mode.  */
-    SPEC_N = (SPEC_MODE_LAST
-              + SPEC_MODE_FOR_EXTEND_LAST - SPEC_MODE_FOR_EXTEND_FIRST + 2)
-  };
+/* If X is an unspec part of a speculative load, return its code.
+   Return -1 otherwise.  */
+static int
+get_spec_unspec_code (const_rtx x)
+{
+  if (GET_CODE (x) != UNSPEC)
+    return -1;
 
-enum SPEC_GEN_LD_MAP
   {
-    /* Offset to ld.a patterns.  */
-    SPEC_GEN_A = 0 * SPEC_N,
-    /* Offset to ld.s patterns.  */
-    SPEC_GEN_S = 1 * SPEC_N,
-    /* Offset to ld.sa patterns.  */
-    SPEC_GEN_SA = 2 * SPEC_N,
-    /* Offset to ld.sa patterns.  For this patterns corresponding ld.c will
-       mutate to chk.s.  */
-    SPEC_GEN_SA_FOR_S = 3 * SPEC_N
-  };
+    int code;
 
-/* These offsets are used to get (4 * SPEC_N).  */
-enum SPEC_GEN_CHECK_OFFSET
-  {
-    SPEC_GEN_CHKA_FOR_A_OFFSET = 4 * SPEC_N - SPEC_GEN_A,
-    SPEC_GEN_CHKA_FOR_SA_OFFSET = 4 * SPEC_N - SPEC_GEN_SA
-  };
+    code = XINT (x, 1);
 
-/* If GEN_P is true, calculate the index of needed speculation check and return
-   speculative pattern for INSN with speculative mode TS, machine mode
-   MODE_NO and with ZERO_EXTEND (if EXTEND_P is true).
-   If GEN_P is false, just calculate the index of needed speculation check.  */
-static rtx
-ia64_gen_spec_insn (rtx insn, ds_t ts, int mode_no, bool gen_p, bool extend_p)
+    switch (code)
+      {
+      case UNSPEC_LDA:
+      case UNSPEC_LDS:
+      case UNSPEC_LDS_A:
+      case UNSPEC_LDSA:
+	return code;
+
+      default:
+	return -1;
+      }
+  }
+}
+
+/* Implement skip_rtx_p hook.  */
+static bool
+ia64_skip_rtx_p (const_rtx x)
 {
-  rtx pat, new_pat;
-  int load_no;
-  int shift = 0;
+  return get_spec_unspec_code (x) != -1;
+}
 
-  static rtx (* const gen_load[]) (rtx, rtx) = {
-    gen_movbi_advanced,
-    gen_movqi_advanced,
-    gen_movhi_advanced,
-    gen_movsi_advanced,
-    gen_movdi_advanced,
-    gen_movsf_advanced,
-    gen_movdf_advanced,
-    gen_movxf_advanced,
-    gen_movti_advanced,
-    gen_zero_extendqidi2_advanced,
-    gen_zero_extendhidi2_advanced,
-    gen_zero_extendsidi2_advanced,
+/* If INSN is a speculative load, return its UNSPEC code.
+   Return -1 otherwise.  */
+static int
+get_insn_spec_code (const_rtx insn)
+{
+  rtx pat, reg, mem;
 
-    gen_movbi_speculative,
-    gen_movqi_speculative,
-    gen_movhi_speculative,
-    gen_movsi_speculative,
-    gen_movdi_speculative,
-    gen_movsf_speculative,
-    gen_movdf_speculative,
-    gen_movxf_speculative,
-    gen_movti_speculative,
-    gen_zero_extendqidi2_speculative,
-    gen_zero_extendhidi2_speculative,
-    gen_zero_extendsidi2_speculative,
+  pat = PATTERN (insn);
 
-    gen_movbi_speculative_advanced,
-    gen_movqi_speculative_advanced,
-    gen_movhi_speculative_advanced,
-    gen_movsi_speculative_advanced,
-    gen_movdi_speculative_advanced,
-    gen_movsf_speculative_advanced,
-    gen_movdf_speculative_advanced,
-    gen_movxf_speculative_advanced,
-    gen_movti_speculative_advanced,
-    gen_zero_extendqidi2_speculative_advanced,
-    gen_zero_extendhidi2_speculative_advanced,
-    gen_zero_extendsidi2_speculative_advanced,
+  if (GET_CODE (pat) == COND_EXEC)
+    pat = COND_EXEC_CODE (pat);
 
-    gen_movbi_speculative_advanced,
-    gen_movqi_speculative_advanced,
-    gen_movhi_speculative_advanced,
-    gen_movsi_speculative_advanced,
-    gen_movdi_speculative_advanced,
-    gen_movsf_speculative_advanced,
-    gen_movdf_speculative_advanced,
-    gen_movxf_speculative_advanced,
-    gen_movti_speculative_advanced,
-    gen_zero_extendqidi2_speculative_advanced,
-    gen_zero_extendhidi2_speculative_advanced,
-    gen_zero_extendsidi2_speculative_advanced
-  };
+  if (GET_CODE (pat) != SET)
+    return -1;
 
-  load_no = extend_p ? mode_no + SPEC_GEN_EXTEND_OFFSET : mode_no;
+  reg = SET_DEST (pat);
+  if (!REG_P (reg))
+    return -1;
 
-  if (ts & BEGIN_DATA)
+  mem = SET_SRC (pat);
+  if (GET_CODE (mem) == ZERO_EXTEND)
+    mem = XEXP (mem, 0);
+
+  return get_spec_unspec_code (mem);
+}
+
+/* If INSN is a speculative load, return a ds with the speculation types.
+   Otherwise [if INSN is a normal instruction] return 0.  */
+static ds_t
+ia64_get_insn_spec_ds (rtx insn)
+{
+  int code = get_insn_spec_code (insn);
+
+  switch (code)
     {
-      /* We don't need recovery because even if this is ld.sa
-	 ALAT entry will be allocated only if NAT bit is set to zero. 
-	 So it is enough to use ld.c here.  */	  
+    case UNSPEC_LDA:
+      return BEGIN_DATA;
 
-      if (ts & BEGIN_CONTROL)
-	{	      
-	  load_no += SPEC_GEN_SA;
+    case UNSPEC_LDS:
+    case UNSPEC_LDS_A:
+      return BEGIN_CONTROL;
 
-	  if (!mflag_sched_ldc)
-	    shift = SPEC_GEN_CHKA_FOR_SA_OFFSET;
-	}
-      else
-	{
-	  load_no += SPEC_GEN_A;
+    case UNSPEC_LDSA:
+      return BEGIN_DATA | BEGIN_CONTROL;
 
-	  if (!mflag_sched_ldc)		
-	    shift = SPEC_GEN_CHKA_FOR_A_OFFSET;
-	}
+    default:
+      return 0;
     }
-  else if (ts & BEGIN_CONTROL)
+}
+
+/* If INSN is a speculative load return a ds with the speculation types that
+   will be checked.
+   Otherwise [if INSN is a normal instruction] return 0.  */
+static ds_t
+ia64_get_insn_checked_ds (rtx insn)
+{
+  int code = get_insn_spec_code (insn);
+
+  switch (code)
     {
-      /* ld.sa can be used instead of ld.s to avoid basic block splitting.  */
-      if (!mflag_control_ldc)
-	load_no += SPEC_GEN_S;
-      else
-	{
-	  gcc_assert (mflag_sched_ldc);
-	  load_no += SPEC_GEN_SA_FOR_S;
-	}
+    case UNSPEC_LDA:
+      return BEGIN_DATA | BEGIN_CONTROL;
+
+    case UNSPEC_LDS:
+      return BEGIN_CONTROL;
+
+    case UNSPEC_LDS_A:
+    case UNSPEC_LDSA:
+      return BEGIN_DATA | BEGIN_CONTROL;
+
+    default:
+      return 0;
     }
-  else
-    gcc_unreachable ();
+}
 
-  /* Set the desired check index.  We add '1', because zero element in this
-     array means, that instruction with such uid is non-speculative.  */
-  spec_check_no[INSN_UID (insn)] = load_no + shift + 1;
+/* Return the speculative load pattern for INSN with speculation types TS and
+   mode number MODE_NO (as returned by get_mode_no_for_insn).  The operands
+   are copied from recog_data, so INSN is expected to be extracted already.
+   If INSN is a COND_EXEC, the new pattern gets a copy of its condition.  */
+static rtx
+ia64_gen_spec_load (rtx insn, ds_t ts, int mode_no)
+{
+  rtx pat, new_pat;
+  gen_func_t gen_load;
 
-  if (!gen_p)
-    return 0;
+  gen_load = get_spec_load_gen_function (ts, mode_no);
 
-  new_pat = gen_load[load_no] (copy_rtx (recog_data.operand[0]),
-			       copy_rtx (recog_data.operand[1]));
+  new_pat = gen_load (copy_rtx (recog_data.operand[0]),
+		      copy_rtx (recog_data.operand[1]));
 
   pat = PATTERN (insn);
   if (GET_CODE (pat) == COND_EXEC)
-    new_pat = gen_rtx_COND_EXEC (VOIDmode, copy_rtx 
-				 (COND_EXEC_TEST (pat)), new_pat);
+    new_pat = gen_rtx_COND_EXEC (VOIDmode, copy_rtx (COND_EXEC_TEST (pat)),
+				 new_pat);
 
   return new_pat;
 }
 
-/* Offset to branchy checks.  */
-enum { SPEC_GEN_CHECK_MUTATION_OFFSET = 5 * SPEC_N };
-
-/* Return nonzero, if INSN needs branchy recovery check.  */
 static bool
-ia64_needs_block_p (const_rtx insn)
+insn_can_be_in_speculative_p (rtx insn ATTRIBUTE_UNUSED,
+			      ds_t ds ATTRIBUTE_UNUSED)
 {
-  int check_no;
+  return false;
+}
 
-  check_no = spec_check_no[INSN_UID(insn)] - 1;
-  gcc_assert (0 <= check_no && check_no < SPEC_GEN_CHECK_MUTATION_OFFSET);
+/* Implement targetm.sched.speculate_insn hook.
+   Check if the INSN can be TS speculative.
+   If 'no' - return -1.
+   If 'yes' - generate speculative pattern in the NEW_PAT and return 1.
+   If current pattern of the INSN already provides TS speculation,
+   return 0.  */
+static int
+ia64_speculate_insn (rtx insn, ds_t ts, rtx *new_pat)
+{  
+  int mode_no;
+  int res;
+  
+  gcc_assert (!(ts & ~SPECULATIVE));
 
-  return ((SPEC_GEN_S <= check_no && check_no < SPEC_GEN_S + SPEC_N)
-	  || (4 * SPEC_N <= check_no && check_no < 4 * SPEC_N + SPEC_N));
-}
+  if (ia64_spec_check_p (insn))
+    return -1;
 
-/* Generate (or regenerate, if (MUTATE_P)) recovery check for INSN.
-   If (LABEL != 0 || MUTATE_P), generate branchy recovery check.
-   Otherwise, generate a simple check.  */
-static rtx
-ia64_gen_check (rtx insn, rtx label, bool mutate_p)
-{
-  rtx op1, pat, check_pat;
+  if ((ts & BE_IN_SPEC)
+      && !insn_can_be_in_speculative_p (insn, ts))
+    return -1;
 
-  static rtx (* const gen_check[]) (rtx, rtx) = {
-    gen_movbi_clr,
-    gen_movqi_clr,
-    gen_movhi_clr,
-    gen_movsi_clr,
-    gen_movdi_clr,
-    gen_movsf_clr,
-    gen_movdf_clr,
-    gen_movxf_clr,
-    gen_movti_clr,
-    gen_zero_extendqidi2_clr,
-    gen_zero_extendhidi2_clr,
-    gen_zero_extendsidi2_clr,
+  mode_no = get_mode_no_for_insn (insn);
 
-    gen_speculation_check_bi,
-    gen_speculation_check_qi,
-    gen_speculation_check_hi,
-    gen_speculation_check_si,
-    gen_speculation_check_di,
-    gen_speculation_check_sf,
-    gen_speculation_check_df,
-    gen_speculation_check_xf,
-    gen_speculation_check_ti,
-    gen_speculation_check_di,
-    gen_speculation_check_di,
-    gen_speculation_check_di,
+  if (mode_no != SPEC_MODE_INVALID)
+    {
+      if (ia64_get_insn_spec_ds (insn) == ds_get_speculation_types (ts))
+	res = 0;
+      else
+	{
+	  res = 1;
+	  *new_pat = ia64_gen_spec_load (insn, ts, mode_no);
+	}
+    }
+  else
+    res = -1;
 
-    gen_movbi_clr,
-    gen_movqi_clr,
-    gen_movhi_clr,
-    gen_movsi_clr,
-    gen_movdi_clr,
-    gen_movsf_clr,
-    gen_movdf_clr,
-    gen_movxf_clr,
-    gen_movti_clr,
-    gen_zero_extendqidi2_clr,
-    gen_zero_extendhidi2_clr,
-    gen_zero_extendsidi2_clr,
+  return res;
+}
 
+/* Return a function that will generate a check for speculation TS with mode
+   MODE_NO.
+   If simple check is needed, pass true for SIMPLE_CHECK_P.
+   If clearing check is needed, pass true for CLEARING_CHECK_P.  */
+static gen_func_t
+get_spec_check_gen_function (ds_t ts, int mode_no,
+			     bool simple_check_p, bool clearing_check_p)
+{
+  static gen_func_t gen_ld_c_clr[] = {
     gen_movbi_clr,
     gen_movqi_clr,
     gen_movhi_clr,
@@ -7205,36 +7580,22 @@ ia64_gen_check (rtx insn, rtx label, bool mutate_p)
     gen_zero_extendqidi2_clr,
     gen_zero_extendhidi2_clr,
     gen_zero_extendsidi2_clr,
-
-    gen_advanced_load_check_clr_bi,
-    gen_advanced_load_check_clr_qi,
-    gen_advanced_load_check_clr_hi,
-    gen_advanced_load_check_clr_si,
-    gen_advanced_load_check_clr_di,
-    gen_advanced_load_check_clr_sf,
-    gen_advanced_load_check_clr_df,
-    gen_advanced_load_check_clr_xf,
-    gen_advanced_load_check_clr_ti,
-    gen_advanced_load_check_clr_di,
-    gen_advanced_load_check_clr_di,
-    gen_advanced_load_check_clr_di,
-
-    /* Following checks are generated during mutation.  */
-    gen_advanced_load_check_clr_bi,
-    gen_advanced_load_check_clr_qi,
-    gen_advanced_load_check_clr_hi,
-    gen_advanced_load_check_clr_si,
-    gen_advanced_load_check_clr_di,
-    gen_advanced_load_check_clr_sf,
-    gen_advanced_load_check_clr_df,
-    gen_advanced_load_check_clr_xf,
-    gen_advanced_load_check_clr_ti,
-    gen_advanced_load_check_clr_di,
-    gen_advanced_load_check_clr_di,
-    gen_advanced_load_check_clr_di,
-
-    0,0,0,0,0,0,0,0,0,0,0,0,
-
+  };
+  static gen_func_t gen_ld_c_nc[] = {
+    gen_movbi_nc,
+    gen_movqi_nc,
+    gen_movhi_nc,
+    gen_movsi_nc,
+    gen_movdi_nc,
+    gen_movsf_nc,
+    gen_movdf_nc,
+    gen_movxf_nc,
+    gen_movti_nc,
+    gen_zero_extendqidi2_nc,
+    gen_zero_extendhidi2_nc,
+    gen_zero_extendsidi2_nc,
+  };
+  static gen_func_t gen_chk_a_clr[] = {
     gen_advanced_load_check_clr_bi,
     gen_advanced_load_check_clr_qi,
     gen_advanced_load_check_clr_hi,
@@ -7247,7 +7608,22 @@ ia64_gen_check (rtx insn, rtx label, bool mutate_p)
     gen_advanced_load_check_clr_di,
     gen_advanced_load_check_clr_di,
     gen_advanced_load_check_clr_di,
-
+  };
+  static gen_func_t gen_chk_a_nc[] = {
+    gen_advanced_load_check_nc_bi,
+    gen_advanced_load_check_nc_qi,
+    gen_advanced_load_check_nc_hi,
+    gen_advanced_load_check_nc_si,
+    gen_advanced_load_check_nc_di,
+    gen_advanced_load_check_nc_sf,
+    gen_advanced_load_check_nc_df,
+    gen_advanced_load_check_nc_xf,
+    gen_advanced_load_check_nc_ti,
+    gen_advanced_load_check_nc_di,
+    gen_advanced_load_check_nc_di,
+    gen_advanced_load_check_nc_di,
+  };
+  static gen_func_t gen_chk_s[] = {
     gen_speculation_check_bi,
     gen_speculation_check_qi,
     gen_speculation_check_hi,
@@ -7259,49 +7635,96 @@ ia64_gen_check (rtx insn, rtx label, bool mutate_p)
     gen_speculation_check_ti,
     gen_speculation_check_di,
     gen_speculation_check_di,
-    gen_speculation_check_di
+    gen_speculation_check_di,
   };
 
-  extract_insn_cached (insn);
+  gen_func_t *gen_check;
 
-  if (label)
-    {
-      gcc_assert (mutate_p || ia64_needs_block_p (insn));
-      op1 = label;
-    }
-  else
+  if (ts & BEGIN_DATA)
     {
-      gcc_assert (!mutate_p && !ia64_needs_block_p (insn));
-      op1 = copy_rtx (recog_data.operand[1]);
+      /* We don't need recovery because even if this is ld.sa
+	 ALAT entry will be allocated only if NAT bit is set to zero.
+	 So it is enough to use ld.c here.  */
+
+      if (simple_check_p)
+	{
+	  gcc_assert (mflag_sched_spec_ldc);
+
+	  if (clearing_check_p)
+	    gen_check = gen_ld_c_clr;
+	  else
+	    gen_check = gen_ld_c_nc;
+	}
+      else
+	{
+	  if (clearing_check_p)
+	    gen_check = gen_chk_a_clr;
+	  else
+	    gen_check = gen_chk_a_nc;
+	}
     }
-      
-  if (mutate_p)
-    /* INSN is ld.c.
-       Find the speculation check number by searching for original
-       speculative load in the RESOLVED_DEPS list of INSN.
-       As long as patterns are unique for each instruction, this can be
-       accomplished by matching ORIG_PAT fields.  */
+  else if (ts & BEGIN_CONTROL)
     {
-      sd_iterator_def sd_it;
-      dep_t dep;
-      int check_no = 0;
-      rtx orig_pat = ORIG_PAT (insn);
-
-      FOR_EACH_DEP (insn, SD_LIST_RES_BACK, sd_it, dep)
+      if (simple_check_p)
+	/* We might want to use ld.sa -> ld.c instead of
+	   ld.s -> chk.s.  */
 	{
-	  rtx x = DEP_PRO (dep);
+	  gcc_assert (!ia64_needs_block_p (ts));
 
-	  if (ORIG_PAT (x) == orig_pat)
-	    check_no = spec_check_no[INSN_UID (x)];
+	  if (clearing_check_p)
+	    gen_check = gen_ld_c_clr;
+	  else
+	    gen_check = gen_ld_c_nc;
+	}
+      else
+	{
+	  gen_check = gen_chk_s;
 	}
-      gcc_assert (check_no);
+    }
+  else
+    gcc_unreachable ();
+
+  gcc_assert (mode_no >= 0);
+  return gen_check[mode_no];
+}
+
+/* Return true if speculation of types TS needs a branchy recovery check.  */
+static bool
+ia64_needs_block_p (ds_t ts)
+{
+  if (ts & BEGIN_DATA)
+    return !mflag_sched_spec_ldc;
+
+  gcc_assert ((ts & BEGIN_CONTROL) != 0);
+
+  return !(mflag_sched_spec_control_ldc && mflag_sched_spec_ldc);
+}
+
+/* Generate a recovery check for INSN with speculation types DS.
+   If LABEL is nonzero, generate a branchy recovery check.
+   Otherwise, generate a simple check.  */
+static rtx
+ia64_gen_spec_check (rtx insn, rtx label, ds_t ds)
+{
+  rtx op1, pat, check_pat;
+  gen_func_t gen_check;
+  int mode_no;
 
-      spec_check_no[INSN_UID (insn)] = (check_no
-					+ SPEC_GEN_CHECK_MUTATION_OFFSET);
+  mode_no = get_mode_no_for_insn (insn);
+  gcc_assert (mode_no >= 0);
+
+  if (label)
+    op1 = label;
+  else
+    {
+      gcc_assert (!ia64_needs_block_p (ds));
+      op1 = copy_rtx (recog_data.operand[1]);
     }
+      
+  gen_check = get_spec_check_gen_function (ds, mode_no, label == NULL_RTX,
+					   true);
 
-  check_pat = (gen_check[spec_check_no[INSN_UID (insn)] - 1]
-	       (copy_rtx (recog_data.operand[0]), op1));
+  check_pat = gen_check (copy_rtx (recog_data.operand[0]), op1);
     
   pat = PATTERN (insn);
   if (GET_CODE (pat) == COND_EXEC)
@@ -7342,9 +7765,11 @@ ia64_spec_check_src_p (rtx src)
 	      
 	      code = XINT (t, 1);
 	     
-	      if (code == UNSPEC_CHKACLR
-		  || code == UNSPEC_CHKS
-		  || code == UNSPEC_LDCCLR)
+	      if (code == UNSPEC_LDCCLR
+		  || code == UNSPEC_LDCNC
+		  || code == UNSPEC_CHKACLR
+		  || code == UNSPEC_CHKANC
+		  || code == UNSPEC_CHKS)
 		{
 		  gcc_assert (code != 0);
 		  return code;
@@ -7375,6 +7800,7 @@ struct bundle_state
   int accumulated_insns_num; /* number of all previous insns including
 				nops.  L is considered as 2 insns */
   int branch_deviation; /* deviation of previous branches from 3rd slots  */
+  int middle_bundle_stops; /* number of stop bits in the middle of bundles */
   struct bundle_state *next;  /* next state with the same insn_num  */
   struct bundle_state *originator; /* originator (previous insn state)  */
   /* All bundle states are in the following chain.  */
@@ -7520,9 +7946,15 @@ insert_bundle_state (struct bundle_state *bundle_state)
 		   || (((struct bundle_state *)
 			*entry_ptr)->accumulated_insns_num
 		       == bundle_state->accumulated_insns_num
-		       && ((struct bundle_state *)
-			   *entry_ptr)->branch_deviation
-		       > bundle_state->branch_deviation))))
+		       && (((struct bundle_state *)
+			    *entry_ptr)->branch_deviation
+			   > bundle_state->branch_deviation
+			   || (((struct bundle_state *)
+				*entry_ptr)->branch_deviation
+			       == bundle_state->branch_deviation
+			       && ((struct bundle_state *)
+				   *entry_ptr)->middle_bundle_stops
+			       > bundle_state->middle_bundle_stops))))))
 
     {
       struct bundle_state temp;
@@ -7616,6 +8048,7 @@ issue_nops_and_insn (struct bundle_state *originator, int before_nops_num,
   curr_state->accumulated_insns_num
     = originator->accumulated_insns_num + before_nops_num;
   curr_state->branch_deviation = originator->branch_deviation;
+  curr_state->middle_bundle_stops = originator->middle_bundle_stops;
   gcc_assert (insn);
   if (INSN_CODE (insn) == CODE_FOR_insn_group_barrier)
     {
@@ -7625,6 +8058,8 @@ issue_nops_and_insn (struct bundle_state *originator, int before_nops_num,
       if (!try_issue_insn (curr_state, insn))
 	return;
       memcpy (temp_dfa_state, curr_state->dfa_state, dfa_state_size);
+      if (curr_state->accumulated_insns_num % 3 != 0)
+	curr_state->middle_bundle_stops++;
       if (state_transition (temp_dfa_state, dfa_pre_cycle_insn) >= 0
 	  && curr_state->accumulated_insns_num % 3 != 0)
 	{
@@ -7800,6 +8235,16 @@ get_template (state_t state, int pos)
     }
 }
 
+/* True when INSN is important for bundling.  */
+static bool
+important_for_bundling_p (rtx insn)
+{
+  return (INSN_P (insn)
+	  && ia64_safe_itanium_class (insn) != ITANIUM_CLASS_IGNORE
+	  && GET_CODE (PATTERN (insn)) != USE
+	  && GET_CODE (PATTERN (insn)) != CLOBBER);
+}
+
 /* The following function returns an insn important for insn bundling
    followed by INSN and before TAIL.  */
 
@@ -7807,10 +8252,7 @@ static rtx
 get_next_important_insn (rtx insn, rtx tail)
 {
   for (; insn && insn != tail; insn = NEXT_INSN (insn))
-    if (INSN_P (insn)
-	&& ia64_safe_itanium_class (insn) != ITANIUM_CLASS_IGNORE
-	&& GET_CODE (PATTERN (insn)) != USE
-	&& GET_CODE (PATTERN (insn)) != CLOBBER)
+    if (important_for_bundling_p (insn))
       return insn;
   return NULL_RTX;
 }
@@ -7935,6 +8377,7 @@ bundling (FILE *dump, int verbose, rtx prev_head_insn, rtx tail)
   curr_state->cost = 0;
   curr_state->accumulated_insns_num = 0;
   curr_state->branch_deviation = 0;
+  curr_state->middle_bundle_stops = 0;
   curr_state->next = NULL;
   curr_state->originator = NULL;
   state_reset (curr_state->dfa_state);
@@ -7957,7 +8400,8 @@ bundling (FILE *dump, int verbose, rtx prev_head_insn, rtx tail)
 	  if (INSN_P (next_insn)
 	      && ia64_safe_itanium_class (next_insn) != ITANIUM_CLASS_IGNORE
 	      && GET_CODE (PATTERN (next_insn)) != USE
-	      && GET_CODE (PATTERN (next_insn)) != CLOBBER)
+	      && GET_CODE (PATTERN (next_insn)) != CLOBBER
+	      && INSN_CODE (next_insn) != CODE_FOR_insn_group_barrier)
 	    {
 	      PUT_MODE (next_insn, TImode);
 	      break;
@@ -8031,13 +8475,14 @@ bundling (FILE *dump, int verbose, rtx prev_head_insn, rtx tail)
 
 	    fprintf
 	      (dump,
-	       "//    Bundle state %d (orig %d, cost %d, nops %d/%d, insns %d, branch %d, state %d) for %d\n",
+	       "//    Bundle state %d (orig %d, cost %d, nops %d/%d, insns %d, branch %d, mid.stops %d state %d) for %d\n",
 	       curr_state->unique_num,
 	       (curr_state->originator == NULL
 		? -1 : curr_state->originator->unique_num),
 	       curr_state->cost,
 	       curr_state->before_nops_num, curr_state->after_nops_num,
 	       curr_state->accumulated_insns_num, curr_state->branch_deviation,
+	       curr_state->middle_bundle_stops,
 	       (ia64_tune == PROCESSOR_ITANIUM
 		? ((struct DFA_chip *) curr_state->dfa_state)->oneb_automaton_state
 		: ((struct DFA_chip *) curr_state->dfa_state)->twob_automaton_state),
@@ -8064,10 +8509,15 @@ bundling (FILE *dump, int verbose, rtx prev_head_insn, rtx tail)
 		    < best_state->accumulated_insns_num
 		    || (curr_state->accumulated_insns_num
 			== best_state->accumulated_insns_num
-			&& curr_state->branch_deviation
-			< best_state->branch_deviation)))))
+			&& (curr_state->branch_deviation
+			    < best_state->branch_deviation
+			    || (curr_state->branch_deviation
+				== best_state->branch_deviation
+				&& curr_state->middle_bundle_stops
+				< best_state->middle_bundle_stops)))))))
       best_state = curr_state;
   /* Second (backward) pass: adding nops and templates.  */
+  gcc_assert (best_state);
   insn_num = best_state->before_nops_num;
   template0 = template1 = -1;
   for (curr_state = best_state;
@@ -8090,13 +8540,14 @@ bundling (FILE *dump, int verbose, rtx prev_head_insn, rtx tail)
 
 	  fprintf
 	    (dump,
-	     "//    Best %d (orig %d, cost %d, nops %d/%d, insns %d, branch %d, state %d) for %d\n",
+	     "//    Best %d (orig %d, cost %d, nops %d/%d, insns %d, branch %d, mid.stops %d, state %d) for %d\n",
 	     curr_state->unique_num,
 	     (curr_state->originator == NULL
 	      ? -1 : curr_state->originator->unique_num),
 	     curr_state->cost,
 	     curr_state->before_nops_num, curr_state->after_nops_num,
 	     curr_state->accumulated_insns_num, curr_state->branch_deviation,
+	     curr_state->middle_bundle_stops,
 	     (ia64_tune == PROCESSOR_ITANIUM
 	      ? ((struct DFA_chip *) curr_state->dfa_state)->oneb_automaton_state
 	      : ((struct DFA_chip *) curr_state->dfa_state)->twob_automaton_state),
@@ -8295,6 +8746,57 @@ bundling (FILE *dump, int verbose, rtx prev_head_insn, rtx tail)
 				     insn);
 	  }
       }
+
+#ifdef ENABLE_CHECKING
+  {
+    /* Assert right calculation of middle_bundle_stops.  */
+    int num = best_state->middle_bundle_stops;
+    bool start_bundle = true, end_bundle = false;
+
+    for (insn = NEXT_INSN (prev_head_insn);
+	 insn && insn != tail;
+	 insn = NEXT_INSN (insn))
+      {
+	if (!INSN_P (insn))
+	  continue;
+	if (recog_memoized (insn) == CODE_FOR_bundle_selector)
+	  start_bundle = true;
+	else
+	  {
+	    rtx next_insn;
+
+	    for (next_insn = NEXT_INSN (insn);
+		 next_insn && next_insn != tail;
+		 next_insn = NEXT_INSN (next_insn))
+	      if (INSN_P (next_insn)
+		  && (ia64_safe_itanium_class (next_insn)
+		      != ITANIUM_CLASS_IGNORE
+		      || recog_memoized (next_insn)
+		      == CODE_FOR_bundle_selector)
+		  && GET_CODE (PATTERN (next_insn)) != USE
+		  && GET_CODE (PATTERN (next_insn)) != CLOBBER)
+		break;
+
+	    end_bundle = next_insn == NULL_RTX
+	     || next_insn == tail
+	     || (INSN_P (next_insn)
+		 && recog_memoized (next_insn)
+		 == CODE_FOR_bundle_selector);
+	    if (recog_memoized (insn) == CODE_FOR_insn_group_barrier
+		&& !start_bundle && !end_bundle
+		&& next_insn
+		&& GET_CODE (PATTERN (next_insn)) != ASM_INPUT
+		&& asm_noperands (PATTERN (next_insn)) < 0)
+	      num--;
+
+	    start_bundle = false;
+	  }
+      }
+
+    gcc_assert (num == 0);
+  }
+#endif
+
   free (index_to_bundle_states);
   finish_bundle_state_table ();
   bundling_p = 0;
@@ -8332,6 +8834,7 @@ final_emit_insn_group_barriers (FILE *dump ATTRIBUTE_UNUSED)
 {
   rtx insn;
   int need_barrier_p = 0;
+  int seen_good_insn = 0;
   rtx prev_insn = NULL_RTX;
 
   init_insn_group_barriers ();
@@ -8353,6 +8856,7 @@ final_emit_insn_group_barriers (FILE *dump ATTRIBUTE_UNUSED)
 	    emit_insn_after (gen_insn_group_barrier (GEN_INT (3)), last);
 
 	  init_insn_group_barriers ();
+	  seen_good_insn = 0;
 	  need_barrier_p = 0;
 	  prev_insn = NULL_RTX;
 	}
@@ -8361,10 +8865,14 @@ final_emit_insn_group_barriers (FILE *dump ATTRIBUTE_UNUSED)
 	  if (recog_memoized (insn) == CODE_FOR_insn_group_barrier)
 	    {
 	      init_insn_group_barriers ();
+	      seen_good_insn = 0;
 	      need_barrier_p = 0;
 	      prev_insn = NULL_RTX;
 	    }
-	  else if (need_barrier_p || group_barrier_needed (insn))
+	  else if (need_barrier_p || group_barrier_needed (insn)
+		   || (mflag_sched_stop_bits_after_every_cycle
+		       && GET_MODE (insn) == TImode
+		       && seen_good_insn))
 	    {
 	      if (TARGET_EARLY_STOP_BITS)
 		{
@@ -8388,19 +8896,32 @@ final_emit_insn_group_barriers (FILE *dump ATTRIBUTE_UNUSED)
 		       last != insn;
 		       last = NEXT_INSN (last))
 		    if (INSN_P (last))
-		      group_barrier_needed (last);
+		      {
+			group_barrier_needed (last);
+			if (recog_memoized (last) >= 0
+			    && important_for_bundling_p (last))
+			  seen_good_insn = 1;
+		      }
 		}
 	      else
 		{
 		  emit_insn_before (gen_insn_group_barrier (GEN_INT (3)),
 				    insn);
 		  init_insn_group_barriers ();
+		  seen_good_insn = 0;
 		}
 	      group_barrier_needed (insn);
+	      if (recog_memoized (insn) >= 0
+		  && important_for_bundling_p (insn))
+		seen_good_insn = 1;
 	      prev_insn = NULL_RTX;
 	    }
-	  else if (recog_memoized (insn) >= 0)
-	    prev_insn = insn;
+	  else if (recog_memoized (insn) >= 0
+		   && important_for_bundling_p (insn))
+	    {
+	      prev_insn = insn;
+	      seen_good_insn = 1;
+	    }
 	  need_barrier_p = (GET_CODE (insn) == CALL_INSN
 			    || GET_CODE (PATTERN (insn)) == ASM_INPUT
 			    || asm_noperands (PATTERN (insn)) >= 0);
@@ -8509,7 +9030,8 @@ ia64_ld_address_bypass_p (rtx producer, rtx consumer)
     {
       int c = XINT (mem, 1);
 
-      gcc_assert (c == UNSPEC_LDA || c == UNSPEC_LDS || c == UNSPEC_LDSA);
+      gcc_assert (c == UNSPEC_LDA || c == UNSPEC_LDS || c == UNSPEC_LDS_A
+		  || c == UNSPEC_LDSA);
       mem = XVECEXP (mem, 0, 0);
     }
 
@@ -8606,7 +9128,8 @@ ia64_reorg (void)
   if (optimize == 0)
     split_all_insns ();
 
-  if (optimize && ia64_flag_schedule_insns2 && dbg_cnt (ia64_sched2))
+  if (optimize && ia64_flag_schedule_insns2
+      && dbg_cnt (ia64_sched2))
     {
       timevar_push (TV_SCHED2);
       ia64_final_schedule = 1;
@@ -8680,7 +9203,16 @@ ia64_reorg (void)
 	  _1mfb_ = get_cpu_unit_code ("1b_1mfb.");
 	  _1mlx_ = get_cpu_unit_code ("1b_1mlx.");
 	}
-      schedule_ebbs ();
+
+      if (flag_selective_scheduling2
+	  && !maybe_skip_selective_scheduling ())
+        run_selective_scheduling ();
+      else
+	schedule_ebbs ();
+
+      /* Redo alignment computation, as it might have gone wrong.  */
+      compute_alignments ();
+
       /* We cannot reuse this one because it has been corrupted by the
 	 evil glat.  */
       finish_bundle_states ();
@@ -9236,7 +9768,11 @@ process_for_unwind_directive (FILE *asm_out_file, rtx insn)
 enum ia64_builtins
 {
   IA64_BUILTIN_BSP,
-  IA64_BUILTIN_FLUSHRS
+  IA64_BUILTIN_COPYSIGNQ,  /* Code for __builtin_copysignq.  */
+  IA64_BUILTIN_FABSQ,  /* Code for __builtin_fabsq.  */
+  IA64_BUILTIN_FLUSHRS,
+  IA64_BUILTIN_INFQ,  /* Code for __builtin_infq.  */
+  IA64_BUILTIN_HUGE_VALQ  /* Code for __builtin_huge_valq.  */
 };
 
 void
@@ -9260,10 +9796,39 @@ ia64_init_builtins (void)
   /* The __float128 type.  */
   if (!TARGET_HPUX)
     {
+      tree ftype, decl;
       tree float128_type = make_node (REAL_TYPE);
+
       TYPE_PRECISION (float128_type) = 128;
       layout_type (float128_type);
       (*lang_hooks.types.register_builtin_type) (float128_type, "__float128");
+
+      /* TFmode support builtins.  */
+      ftype = build_function_type (float128_type, void_list_node);
+      add_builtin_function ("__builtin_infq", ftype,
+			    IA64_BUILTIN_INFQ, BUILT_IN_MD,
+			    NULL, NULL_TREE);
+
+      add_builtin_function ("__builtin_huge_valq", ftype,
+			    IA64_BUILTIN_HUGE_VALQ, BUILT_IN_MD,
+			    NULL, NULL_TREE);
+
+      ftype = build_function_type_list (float128_type,
+					float128_type,
+					NULL_TREE);
+      decl = add_builtin_function ("__builtin_fabsq", ftype,
+				   IA64_BUILTIN_FABSQ, BUILT_IN_MD,
+				   "__fabstf2", NULL_TREE);
+      TREE_READONLY (decl) = 1;
+
+      ftype = build_function_type_list (float128_type,
+					float128_type,
+					float128_type,
+					NULL_TREE);
+      decl = add_builtin_function ("__builtin_copysignq", ftype,
+				   IA64_BUILTIN_COPYSIGNQ, BUILT_IN_MD,
+				   "__copysigntf3", NULL_TREE);
+      TREE_READONLY (decl) = 1;
     }
   else
     /* Under HPUX, this is a synonym for "long double".  */
@@ -9321,8 +9886,30 @@ ia64_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
       emit_insn (gen_flushrs ());
       return const0_rtx;
 
+    case IA64_BUILTIN_INFQ:
+    case IA64_BUILTIN_HUGE_VALQ:
+      {
+	REAL_VALUE_TYPE inf;
+	rtx tmp;
+
+	real_inf (&inf);
+	tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
+
+	tmp = validize_mem (force_const_mem (mode, tmp));
+
+	if (target == 0)
+	  target = gen_reg_rtx (mode);
+
+	emit_move_insn (target, tmp);
+	return target;
+      }
+
+    case IA64_BUILTIN_FABSQ:
+    case IA64_BUILTIN_COPYSIGNQ:
+      return expand_call (exp, target, ignore);
+
     default:
-      break;
+      gcc_unreachable ();
     }
 
   return NULL_RTX;
@@ -9485,6 +10072,13 @@ ia64_sysv4_init_libfuncs (void)
   /* We leave out _U_Qfmin, _U_Qfmax and _U_Qfabs since soft-fp in
      glibc doesn't have them.  */
 }
+
+/* Use soft-fp.  The body is empty: no libfuncs are set up here.  */
+
+static void
+ia64_soft_fp_init_libfuncs (void)
+{
+}
 
 /* For HPUX, it is illegal to have relocations in shared segments.  */
 
@@ -9735,7 +10329,7 @@ ia64_scalar_mode_supported_p (enum machine_mode mode)
       return true;
 
     case TFmode:
-      return TARGET_HPUX;
+      return true;
 
     default:
       return false;
@@ -9930,13 +10524,6 @@ void
 ia64_optimization_options (int level ATTRIBUTE_UNUSED,
                            int size ATTRIBUTE_UNUSED)
 {
-  /* Disable the second machine independent scheduling pass and use one for the
-     IA-64.  This needs to be here instead of in OVERRIDE_OPTIONS because this
-     is done whenever the optimization is changed via #pragma GCC optimize or
-     attribute((optimize(...))).  */
-  ia64_flag_schedule_insns2 = flag_schedule_insns_after_reload;
-  flag_schedule_insns_after_reload = 0;
-
   /* Let the scheduler form additional regions.  */
   set_param_value ("max-sched-extend-regions-iters", 2);
 
@@ -9944,6 +10531,7 @@ ia64_optimization_options (int level ATTRIBUTE_UNUSED,
   set_param_value ("simultaneous-prefetches", 6);
   set_param_value ("l1-cache-line-size", 32);
 
+  set_param_value("sched-mem-true-dep-cost", 4);
 }
 
 /* HP-UX version_id attribute.