Emit vzerouppers after reload.

[pf3gnuchains/gcc-fork.git] / gcc / config / i386 / i386.c
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c

index f2bd705..a5beb83 100644 (file)
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -108,163 +108,119 @@ check_avx256_stores (rtx dest, const_rtx set, void *data)
  static void
  move_or_delete_vzeroupper_2 (basic_block bb, bool upper_128bits_set)
  {
-  rtx curr_insn, next_insn, prev_insn, insn;
+  rtx insn;
+  rtx vzeroupper_insn = NULL_RTX;
+  rtx pat;
+  int avx256;
  
    if (dump_file)
      fprintf (dump_file, " BB [%i] entry: upper 128bits: %d\n",
              bb->index, upper_128bits_set);
  
-  for (curr_insn = BB_HEAD (bb);
-       curr_insn && curr_insn != NEXT_INSN (BB_END (bb));
-       curr_insn = next_insn)
+  insn = BB_HEAD (bb);
+  while (insn != BB_END (bb))
      {
-      int avx256;
+      insn = NEXT_INSN (insn);
  
-      next_insn = NEXT_INSN (curr_insn);
-
-      if (!NONDEBUG_INSN_P (curr_insn))
+      if (!NONDEBUG_INSN_P (insn))
         continue;
  
-      /* Search for vzeroupper.  */
-      insn = PATTERN (curr_insn);
-      if (GET_CODE (insn) == UNSPEC_VOLATILE
-         && XINT (insn, 1) == UNSPECV_VZEROUPPER)
+      /* Move vzeroupper before jump/call.  */
+      if (JUMP_P (insn) || CALL_P (insn))
+       {
+         if (!vzeroupper_insn)
+           continue;
+
+         if (PREV_INSN (insn) != vzeroupper_insn)
+           {
+             if (dump_file)
+               {
+                 fprintf (dump_file, "Move vzeroupper after:\n");
+                 print_rtl_single (dump_file, PREV_INSN (insn));
+                 fprintf (dump_file, "before:\n");
+                 print_rtl_single (dump_file, insn);
+               }
+             reorder_insns_nobb (vzeroupper_insn, vzeroupper_insn,
+                                 PREV_INSN (insn));
+           }
+         vzeroupper_insn = NULL_RTX;
+         continue;
+       }
+
+      pat = PATTERN (insn);
+
+      /* Check insn for vzeroupper intrinsic.  */
+      if (GET_CODE (pat) == UNSPEC_VOLATILE
+         && XINT (pat, 1) == UNSPECV_VZEROUPPER)
         {
-         /* Found vzeroupper.  */
           if (dump_file)
             {
+             /* Found vzeroupper intrinsic.  */
               fprintf (dump_file, "Found vzeroupper:\n");
-             print_rtl_single (dump_file, curr_insn);
+             print_rtl_single (dump_file, insn);
             }
         }
        else
         {
-         /* Check vzeroall intrinsic.  */
-         if (GET_CODE (insn) == PARALLEL
-             && GET_CODE (XVECEXP (insn, 0, 0)) == UNSPEC_VOLATILE
-             && XINT (XVECEXP (insn, 0, 0), 1) == UNSPECV_VZEROALL)
-           upper_128bits_set = false;
-         else if (!upper_128bits_set)
+         /* Check insn for vzeroall intrinsic.  */
+         if (GET_CODE (pat) == PARALLEL
+             && GET_CODE (XVECEXP (pat, 0, 0)) == UNSPEC_VOLATILE
+             && XINT (XVECEXP (pat, 0, 0), 1) == UNSPECV_VZEROALL)
             {
-             /* Check if upper 128bits of AVX registers are used.  */
-             note_stores (insn, check_avx256_stores,
-                          &upper_128bits_set);
+             upper_128bits_set = false;
+
+             /* Delete pending vzeroupper insertion.  */
+             if (vzeroupper_insn)
+               {
+                 delete_insn (vzeroupper_insn);
+                 vzeroupper_insn = NULL_RTX;
+               }
             }
+         else if (!upper_128bits_set)
+           note_stores (pat, check_avx256_stores, &upper_128bits_set);
           continue;
         }
  
-      avx256 = INTVAL (XVECEXP (insn, 0, 0));
+      /* Process vzeroupper intrinsic.  */
+      avx256 = INTVAL (XVECEXP (pat, 0, 0));
  
        if (!upper_128bits_set)
         {
           /* Since the upper 128bits are cleared, callee must not pass
              256bit AVX register.  We only need to check if callee
              returns 256bit AVX register.  */
-         upper_128bits_set = avx256 == callee_return_avx256;
+         upper_128bits_set = (avx256 == callee_return_avx256);
  
-         /* Remove unnecessary vzeroupper since upper 128bits are
-            cleared.  */
+         /* Remove unnecessary vzeroupper since
+            upper 128bits are cleared.  */
           if (dump_file)
             {
               fprintf (dump_file, "Delete redundant vzeroupper:\n");
-             print_rtl_single (dump_file, curr_insn);
+             print_rtl_single (dump_file, insn);
             }
-         delete_insn (curr_insn);
-         continue;
+         delete_insn (insn);
         }
        else if (avx256 == callee_return_pass_avx256
                || avx256 == callee_pass_avx256)
         {
           /* Callee passes 256bit AVX register.  Check if callee
              returns 256bit AVX register.  */
-         upper_128bits_set = avx256 == callee_return_pass_avx256;
+         upper_128bits_set = (avx256 == callee_return_pass_avx256);
  
-         /* Must remove vzeroupper since callee passes 256bit AVX
-            register.  */
+         /* Must remove vzeroupper since
+            callee passes in 256bit AVX register.  */
           if (dump_file)
             {
               fprintf (dump_file, "Delete callee pass vzeroupper:\n");
-             print_rtl_single (dump_file, curr_insn);
-           }
-         delete_insn (curr_insn);
-         continue;
-       }
-
-      /* Find the jump after vzeroupper.  */
-      prev_insn = curr_insn;
-      if (avx256 == vzeroupper_intrinsic)
-       {
-         /* For vzeroupper intrinsic, check if there is another
-            vzeroupper.  */
-         insn = NEXT_INSN (curr_insn);
-         while (insn)
-           {
-             if (NONJUMP_INSN_P (insn)
-                 && GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
-                 && XINT (PATTERN (insn), 1) == UNSPECV_VZEROUPPER)
-               {
-                 if (dump_file)
-                   {
-                     fprintf (dump_file,
-                              "Delete redundant vzeroupper intrinsic:\n");
-                     print_rtl_single (dump_file, curr_insn);
-                   }
-                 delete_insn (curr_insn);
-                 insn = NULL;
-                 continue;
-               }
-
-             if (JUMP_P (insn) || CALL_P (insn))
-               break;
-             prev_insn = insn;
-             insn = NEXT_INSN (insn);
-             if (insn == NEXT_INSN (BB_END (bb)))
-               break;
+             print_rtl_single (dump_file, insn);
             }
-
-         /* Continue if redundant vzeroupper intrinsic is deleted.  */
-         if (!insn)
-           continue;
+         delete_insn (insn);
         }
        else
         {
-         /* Find the next jump/call.  */
-         insn = NEXT_INSN (curr_insn);
-         while (insn)
-           {
-             if (JUMP_P (insn) || CALL_P (insn))
-               break;
-             prev_insn = insn;
-             insn = NEXT_INSN (insn);
-             if (insn == NEXT_INSN (BB_END (bb)))
-               break;
-           }
-
-         if (!insn)
-           gcc_unreachable();
+         upper_128bits_set = false;
+         vzeroupper_insn = insn;
         }
-
-      /* Keep vzeroupper.  */
-      upper_128bits_set = false;
-
-      /* Also allow label as the next instruction.  */
-      if (insn == NEXT_INSN (BB_END (bb)) && !LABEL_P (insn))
-       gcc_unreachable();
-
-      /* Move vzeroupper before jump/call if neeeded.  */
-      if (curr_insn != prev_insn)
-       {
-         reorder_insns_nobb (curr_insn, curr_insn, prev_insn);
-         if (dump_file)
-           {
-             fprintf (dump_file, "Move vzeroupper after:\n");
-             print_rtl_single (dump_file, prev_insn);
-             fprintf (dump_file, "before:\n");
-             print_rtl_single (dump_file, insn);
-           }
-       }
-
-      next_insn = NEXT_INSN (insn);
      }
  
    BLOCK_INFO (bb)->upper_128bits_set = upper_128bits_set;
@@ -1642,6 +1598,8 @@ const struct processor_costs *ix86_cost = &pentium_cost;
  #define m_PENT4  (1<<PROCESSOR_PENTIUM4)
  #define m_NOCONA  (1<<PROCESSOR_NOCONA)
  #define m_CORE2  (1<<PROCESSOR_CORE2)
+#define m_COREI7_32  (1<<PROCESSOR_COREI7_32)
+#define m_COREI7_64  (1<<PROCESSOR_COREI7_64)
  #define m_ATOM  (1<<PROCESSOR_ATOM)
  
  #define m_GEODE  (1<<PROCESSOR_GEODE)
@@ -1654,8 +1612,8 @@ const struct processor_costs *ix86_cost = &pentium_cost;
  #define m_BDVER1  (1<<PROCESSOR_BDVER1)
  #define m_AMD_MULTIPLE  (m_K8 | m_ATHLON | m_AMDFAM10 | m_BDVER1)
  
-#define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
-#define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
+#define m_GENERIC32 (1<<PROCESSOR_GENERIC32 | m_COREI7_32)
+#define m_GENERIC64 (1<<PROCESSOR_GENERIC64 | m_COREI7_64)
  
  /* Generic instruction choice should be common subset of supported CPUs
     (PPro/PENT4/NOCONA/CORE2/Athlon/K8).  */
@@ -2151,6 +2109,7 @@ struct ix86_frame
    HOST_WIDE_INT frame_pointer_offset;
    HOST_WIDE_INT hard_frame_pointer_offset;
    HOST_WIDE_INT stack_pointer_offset;
+  HOST_WIDE_INT hfp_save_offset;
    HOST_WIDE_INT reg_save_offset;
    HOST_WIDE_INT sse_reg_save_offset;
  
@@ -2460,6 +2419,10 @@ static const struct ptt processor_target_table[PROCESSOR_max] =
    {&k8_cost, 16, 7, 16, 7, 16},
    {&nocona_cost, 0, 0, 0, 0, 0},
    {&core2_cost, 16, 10, 16, 10, 16},
+  /* Core i7 32-bit.  */
+  {&generic32_cost, 16, 10, 16, 10, 16},
+  /* Core i7 64-bit.  */
+  {&generic64_cost, 16, 10, 16, 10, 16},
    {&generic32_cost, 16, 7, 16, 7, 16},
    {&generic64_cost, 16, 10, 16, 10, 16},
    {&amdfam10_cost, 32, 24, 32, 7, 32},
@@ -2482,6 +2445,7 @@ static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
    "prescott",
    "nocona",
    "core2",
+  "corei7",
    "atom",
    "geode",
    "k6",
@@ -3182,6 +3146,9 @@ ix86_option_override_internal (bool main_args_p)
        {"core2", PROCESSOR_CORE2, CPU_CORE2,
         PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
         | PTA_SSSE3 | PTA_CX16},
+      {"corei7", PROCESSOR_COREI7_64, CPU_GENERIC64,
+       PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
+       | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16},
        {"atom", PROCESSOR_ATOM, CPU_ATOM,
         PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
         | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE},
@@ -3521,23 +3488,45 @@ ix86_option_override_internal (bool main_args_p)
        {
         ix86_schedule = processor_alias_table[i].schedule;
         ix86_tune = processor_alias_table[i].processor;
-       if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
+       if (TARGET_64BIT)
           {
-           if (ix86_tune_defaulted)
+           if (!(processor_alias_table[i].flags & PTA_64BIT))
               {
-               ix86_tune_string = "x86-64";
-               for (i = 0; i < pta_size; i++)
-                 if (! strcmp (ix86_tune_string,
-                               processor_alias_table[i].name))
-                   break;
-               ix86_schedule = processor_alias_table[i].schedule;
-               ix86_tune = processor_alias_table[i].processor;
+               if (ix86_tune_defaulted)
+                 {
+                   ix86_tune_string = "x86-64";
+                   for (i = 0; i < pta_size; i++)
+                     if (! strcmp (ix86_tune_string,
+                                   processor_alias_table[i].name))
+                       break;
+                   ix86_schedule = processor_alias_table[i].schedule;
+                   ix86_tune = processor_alias_table[i].processor;
+                 }
+               else
+                 error ("CPU you selected does not support x86-64 "
+                        "instruction set");
+             }
+         }
+       else
+         {
+           /* Adjust tuning when compiling for 32-bit ABI.  */
+           switch (ix86_tune)
+             {
+             case PROCESSOR_GENERIC64:
+               ix86_tune = PROCESSOR_GENERIC32;
+               ix86_schedule = CPU_PENTIUMPRO;
+               break;
+
+             case PROCESSOR_COREI7_64:
+               ix86_tune = PROCESSOR_COREI7_32;
+               ix86_schedule = CPU_PENTIUMPRO;
+               break;
+
+             default:
+               break;
               }
-           else
-             error ("CPU you selected does not support x86-64 "
-                    "instruction set");
           }
-        /* Intel CPUs have always interpreted SSE prefetch instructions as
+       /* Intel CPUs have always interpreted SSE prefetch instructions as
            NOPs; so, we can enable SSE prefetch instructions even when
            -mtune (rather than -march) points us to a processor that has them.
            However, the VIA C3 gives a SIGILL, so we only do that for i686 and
@@ -3573,7 +3562,7 @@ ix86_option_override_internal (bool main_args_p)
        if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
         flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
        if (flag_asynchronous_unwind_tables == 2)
-       flag_asynchronous_unwind_tables = 1;
+       flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
        if (flag_pcc_struct_return == 2)
         flag_pcc_struct_return = 0;
      }
@@ -3777,10 +3766,19 @@ ix86_option_override_internal (bool main_args_p)
    ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
    if (ix86_preferred_stack_boundary_string)
      {
+      int min = (TARGET_64BIT ? 4 : 2);
+      int max = (TARGET_SEH ? 4 : 12);
+
        i = atoi (ix86_preferred_stack_boundary_string);
-      if (i < (TARGET_64BIT ? 4 : 2) || i > 12)
-       error ("%spreferred-stack-boundary=%d%s is not between %d and 12",
-              prefix, i, suffix, TARGET_64BIT ? 4 : 2);
+      if (i < min || i > max)
+       {
+         if (min == max)
+           error ("%spreferred-stack-boundary%s is not supported "
+                  "for this target", prefix, suffix);
+         else
+           error ("%spreferred-stack-boundary=%d%s is not between %d and %d",
+                  prefix, i, suffix, min, max);
+       }
        else
         ix86_preferred_stack_boundary = (1 << i) * BITS_PER_UNIT;
      }
@@ -3987,7 +3985,13 @@ ix86_option_override_internal (bool main_args_p)
          sorry ("-mfentry isn't supported for 32-bit in combination with -fpic");
        flag_fentry = 0;
      }
-  if (flag_fentry < 0)
+  else if (TARGET_SEH)
+    {
+      if (flag_fentry == 0)
+       sorry ("-mno-fentry isn't compatible with SEH");
+      flag_fentry = 1;
+    }
+  else if (flag_fentry < 0)
     {
  #if defined(PROFILE_BEFORE_PROLOGUE)
       flag_fentry = 1;
@@ -4926,8 +4930,11 @@ ix86_function_ok_for_sibcall (tree decl, tree exp)
  
    /* If we are generating position-independent code, we cannot sibcall
       optimize any indirect call, or a direct call to a global function,
-     as the PLT requires %ebx be live.  */
-  if (!TARGET_64BIT && flag_pic && (!decl || !targetm.binds_local_p (decl)))
+     as the PLT requires %ebx be live. (Darwin does not have a PLT.)  */
+  if (!TARGET_MACHO
+      && !TARGET_64BIT 
+      && flag_pic 
+      && (!decl || !targetm.binds_local_p (decl)))
      return false;
  
    /* If we need to align the outgoing stack, then sibcalling would
@@ -5536,6 +5543,10 @@ ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
          fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
      }
  
+#ifdef SUBTARGET_ASM_UNWIND_INIT
+  SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
+#endif
+
    ASM_OUTPUT_LABEL (asm_out_file, fname);
  
    /* Output magic byte marker, if hot-patch attribute is set.  */
@@ -6952,10 +6963,12 @@ ix86_pass_by_reference (CUMULATIVE_ARGS *cum ATTRIBUTE_UNUSED,
    return 0;
  }
  
-/* Return true when TYPE should be 128bit aligned for 32bit argument passing
-   ABI.  */
+/* Return true when TYPE should be 128bit aligned for 32bit argument
+   passing ABI.  XXX: This function is obsolete and is only used for
+   checking psABI compatibility with previous versions of GCC.  */
+
  static bool
-contains_aligned_value_p (const_tree type)
+ix86_compat_aligned_value_p (const_tree type)
  {
    enum machine_mode mode = TYPE_MODE (type);
    if (((TARGET_SSE && SSE_REG_MODE_P (mode))
@@ -6982,7 +6995,7 @@ contains_aligned_value_p (const_tree type)
             for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
               {
                 if (TREE_CODE (field) == FIELD_DECL
-                   && contains_aligned_value_p (TREE_TYPE (field)))
+                   && ix86_compat_aligned_value_p (TREE_TYPE (field)))
                   return true;
               }
             break;
@@ -6990,7 +7003,7 @@ contains_aligned_value_p (const_tree type)
  
         case ARRAY_TYPE:
           /* Just for use if some languages passes arrays by value.  */
-         if (contains_aligned_value_p (TREE_TYPE (type)))
+         if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
             return true;
           break;
  
@@ -7001,9 +7014,13 @@ contains_aligned_value_p (const_tree type)
    return false;
  }
  
+/* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
+   XXX: This function is obsolete and is only used for checking psABI
+   compatibility with previous versions of GCC.  */
+
  static int
-ix86_old_function_arg_boundary (enum machine_mode mode, const_tree type,
-                               int align)
+ix86_compat_function_arg_boundary (enum machine_mode mode,
+                                  const_tree type, int align)
  {
    /* In 32bit, only _Decimal128 and __float128 are aligned to their
       natural boundaries.  */
@@ -7023,7 +7040,7 @@ ix86_old_function_arg_boundary (enum machine_mode mode, const_tree type,
         }
        else
         {
-         if (!contains_aligned_value_p (type))
+         if (!ix86_compat_aligned_value_p (type))
             align = PARM_BOUNDARY;
         }
      }
@@ -7032,6 +7049,59 @@ ix86_old_function_arg_boundary (enum machine_mode mode, const_tree type,
    return align;
  }
  
+/* Return true when TYPE should be 128bit aligned for 32bit argument
+   passing ABI.  */
+
+static bool
+ix86_contains_aligned_value_p (const_tree type)
+{
+  enum machine_mode mode = TYPE_MODE (type);
+
+  if (mode == XFmode || mode == XCmode)
+    return false;
+
+  if (TYPE_ALIGN (type) < 128)
+    return false;
+
+  if (AGGREGATE_TYPE_P (type))
+    {
+      /* Walk the aggregates recursively.  */
+      switch (TREE_CODE (type))
+       {
+       case RECORD_TYPE:
+       case UNION_TYPE:
+       case QUAL_UNION_TYPE:
+         {
+           tree field;
+
+           /* Walk all the structure fields.  */
+           for (field = TYPE_FIELDS (type);
+                field;
+                field = DECL_CHAIN (field))
+             {
+               if (TREE_CODE (field) == FIELD_DECL
+                   && ix86_contains_aligned_value_p (TREE_TYPE (field)))
+                 return true;
+             }
+           break;
+         }
+
+       case ARRAY_TYPE:
+         /* Just for use if some languages pass arrays by value.  */
+         if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
+           return true;
+         break;
+
+       default:
+         gcc_unreachable ();
+       }
+    }
+  else
+    return TYPE_ALIGN (type) >= 128;
+
+  return false;
+}
+
  /* Gives the alignment boundary, in bits, of an argument with the
     specified mode and type.  */
  
@@ -7055,13 +7125,25 @@ ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
        static bool warned;
        int saved_align = align;
  
-      if (!TARGET_64BIT && align < 128)
-       align = PARM_BOUNDARY;
+      if (!TARGET_64BIT)
+       {
+         /* i386 ABI defines XFmode arguments to be 4 byte aligned.  */
+         if (!type)
+           {
+             if (mode == XFmode || mode == XCmode)
+               align = PARM_BOUNDARY;
+           }
+         else if (!ix86_contains_aligned_value_p (type))
+           align = PARM_BOUNDARY;
+
+         if (align < 128)
+           align = PARM_BOUNDARY;
+       }
  
        if (warn_psabi
           && !warned
-         && align != ix86_old_function_arg_boundary (mode, type,
-                                                     saved_align))
+         && align != ix86_compat_function_arg_boundary (mode, type,
+                                                        saved_align))
         {
           warned = true;
           inform (input_location,
@@ -8863,17 +8945,25 @@ ix86_compute_frame_layout (struct ix86_frame *frame)
    gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
    gcc_assert (preferred_alignment <= stack_alignment_needed);
  
+  /* For SEH we have to limit the amount of code movement into the prologue.
+     At present we do this via a BLOCKAGE, at which point there's very little
+     scheduling that can be done, which means that there's very little point
+     in doing anything except PUSHs.  */
+  if (TARGET_SEH)
+    cfun->machine->use_fast_prologue_epilogue = false;
+
    /* During reload iteration the amount of registers saved can change.
       Recompute the value as needed.  Do not recompute when amount of registers
       didn't change as reload does multiple calls to the function and does not
       expect the decision to change within single iteration.  */
-  if (!optimize_function_for_size_p (cfun)
-      && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
+  else if (!optimize_function_for_size_p (cfun)
+           && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
      {
        int count = frame->nregs;
        struct cgraph_node *node = cgraph_node (current_function_decl);
  
        cfun->machine->use_fast_prologue_epilogue_nregs = count;
+
        /* The fast prologue uses move instead of push to save registers.  This
           is significantly longer, but also executes faster as modern hardware
           can execute the moves in parallel, but can't do that for push/pop.
@@ -8915,7 +9005,9 @@ ix86_compute_frame_layout (struct ix86_frame *frame)
    /* Skip saved base pointer.  */
    if (frame_pointer_needed)
      offset += UNITS_PER_WORD;
+  frame->hfp_save_offset = offset;
  
+  /* The traditional frame pointer location is at the top of the frame.  */
    frame->hard_frame_pointer_offset = offset;
  
    /* Register save area */
@@ -8998,6 +9090,27 @@ ix86_compute_frame_layout (struct ix86_frame *frame)
    else
      frame->red_zone_size = 0;
    frame->stack_pointer_offset -= frame->red_zone_size;
+
+  /* The SEH frame pointer location is near the bottom of the frame.
+     This is enforced by the fact that the difference between the
+     stack pointer and the frame pointer is limited to 240 bytes in
+     the unwind data structure.  */
+  if (TARGET_SEH)
+    {
+      HOST_WIDE_INT diff;
+
+      /* If we can leave the frame pointer where it is, do so.  */
+      diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
+      if (diff > 240 || (diff & 15) != 0)
+       {
+         /* Ideally we'd determine what portion of the local stack frame
+            (within the constraint of the lowest 240) is most heavily used.
+            But without that complication, simply bias the frame pointer
+            by 128 bytes so as to maximize the amount of the local stack
+            frame that is addressable with 8-bit offsets.  */
+         frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
+       }
+    }
  }
  
  /* This is semi-inlined memory_address_length, but simplified
@@ -9930,7 +10043,8 @@ ix86_expand_prologue (void)
        /* Check if profiling is active and we shall use profiling before
           prologue variant. If so sorry.  */
        if (crtl->profile && flag_fentry != 0)
-        sorry ("ms_hook_prologue attribute isn't compatible with -mfentry for 32-bit");
+        sorry ("ms_hook_prologue attribute isn't compatible "
+              "with -mfentry for 32-bit");
  
        /* In ix86_asm_output_function_label we emitted:
          8b ff     movl.s %edi,%edi
@@ -10059,14 +10173,16 @@ ix86_expand_prologue (void)
        insn = emit_insn (gen_push (hard_frame_pointer_rtx));
        RTX_FRAME_RELATED_P (insn) = 1;
  
-      insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
-      RTX_FRAME_RELATED_P (insn) = 1;
+      if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
+       {
+         insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
+         RTX_FRAME_RELATED_P (insn) = 1;
  
-      if (m->fs.cfa_reg == stack_pointer_rtx)
-        m->fs.cfa_reg = hard_frame_pointer_rtx;
-      gcc_assert (m->fs.sp_offset == frame.hard_frame_pointer_offset);
-      m->fs.fp_offset = m->fs.sp_offset;
-      m->fs.fp_valid = true;
+         if (m->fs.cfa_reg == stack_pointer_rtx)
+           m->fs.cfa_reg = hard_frame_pointer_rtx;
+         m->fs.fp_offset = m->fs.sp_offset;
+         m->fs.fp_valid = true;
+       }
      }
  
    int_registers_saved = (frame.nregs == 0);
@@ -10219,12 +10335,15 @@ ix86_expand_prologue (void)
        insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
                                            stack_pointer_rtx, eax));
  
-      if (m->fs.cfa_reg == stack_pointer_rtx)
+      /* Note that SEH directives need to continue tracking the stack
+        pointer even after the frame pointer has been set up.  */
+      if (m->fs.cfa_reg == stack_pointer_rtx || TARGET_SEH)
         {
-         m->fs.cfa_offset += allocate;
+         if (m->fs.cfa_reg == stack_pointer_rtx)
+           m->fs.cfa_offset += allocate;
  
           RTX_FRAME_RELATED_P (insn) = 1;
-         add_reg_note (insn, REG_CFA_ADJUST_CFA,
+         add_reg_note (insn, REG_FRAME_RELATED_EXPR,
                         gen_rtx_SET (VOIDmode, stack_pointer_rtx,
                                      plus_constant (stack_pointer_rtx,
                                                     -allocate)));
@@ -10246,6 +10365,22 @@ ix86_expand_prologue (void)
      }
    gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
  
+  /* If we haven't already set up the frame pointer, do so now.  */
+  if (frame_pointer_needed && !m->fs.fp_valid)
+    {
+      insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
+                           GEN_INT (frame.stack_pointer_offset
+                                    - frame.hard_frame_pointer_offset));
+      insn = emit_insn (insn);
+      RTX_FRAME_RELATED_P (insn) = 1;
+      add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
+
+      if (m->fs.cfa_reg == stack_pointer_rtx)
+       m->fs.cfa_reg = hard_frame_pointer_rtx;
+      m->fs.fp_offset = frame.hard_frame_pointer_offset;
+      m->fs.fp_valid = true;
+    }
+
    if (!int_registers_saved)
      ix86_emit_save_regs_using_mov (frame.reg_save_offset);
    if (frame.nsseregs)
@@ -10315,6 +10450,11 @@ ix86_expand_prologue (void)
    /* Emit cld instruction if stringops are used in the function.  */
    if (TARGET_CLD && ix86_current_function_needs_cld)
      emit_insn (gen_cld ());
+
+  /* SEH requires that the prologue end within 256 bytes of the start of
+     the function.  Prevent instruction schedules that would extend that.  */
+  if (TARGET_SEH)
+    emit_insn (gen_blockage ());
  }
  
  /* Emit code to restore REG using a POP insn.  */
@@ -10539,13 +10679,16 @@ ix86_expand_epilogue (int style)
    if (crtl->calls_eh_return && style != 2)
      frame.reg_save_offset -= 2 * UNITS_PER_WORD;
  
+  /* EH_RETURN requires the use of moves to function properly.  */
+  if (crtl->calls_eh_return)
+    restore_regs_via_mov = true;
+  /* SEH requires the use of pops to identify the epilogue.  */
+  else if (TARGET_SEH)
+    restore_regs_via_mov = false;
    /* If we're only restoring one register and sp is not valid then
       using a move instruction to restore the register since it's
       less work than reloading sp and popping the register.  */
-  if (!m->fs.sp_valid && frame.nregs <= 1)
-    restore_regs_via_mov = true;
-  /* EH_RETURN requires the use of moves to function properly.  */
-  else if (crtl->calls_eh_return)
+  else if (!m->fs.sp_valid && frame.nregs <= 1)
      restore_regs_via_mov = true;
    else if (TARGET_EPILOGUE_USING_MOVE
            && cfun->machine->use_fast_prologue_epilogue
@@ -10657,6 +10800,22 @@ ix86_expand_epilogue (int style)
      }
    else
      {
+      /* SEH requires that the function end with (1) a stack adjustment
+        if necessary, (2) a sequence of pops, and (3) a return or
+        jump instruction.  Prevent insns from the function body from
+        being scheduled into this sequence.  */
+      if (TARGET_SEH)
+       {
+         /* Prevent a catch region from being adjacent to the standard
+            epilogue sequence.  Unfortunately crtl->uses_eh_lsda nor
+            several other flags that would be interesting to test are
+            not yet set up.  */
+         if (flag_non_call_exceptions)
+           emit_insn (gen_nops (const1_rtx));
+         else
+           emit_insn (gen_blockage ());
+       }
+
        /* First step is to deallocate the stack frame so that we can
          pop the registers.  */
        if (!m->fs.sp_valid)
@@ -10684,7 +10843,7 @@ ix86_expand_epilogue (int style)
      {
        /* If the stack pointer is valid and pointing at the frame
          pointer store address, then we only need a pop.  */
-      if (m->fs.sp_valid && m->fs.sp_offset == frame.hard_frame_pointer_offset)
+      if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
         ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
        /* Leave results in shorter dependency chains on CPUs that are
          able to grok it fast.  */
@@ -11407,6 +11566,12 @@ legitimate_constant_p (rtx x)
        if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
           && SYMBOL_REF_DLLIMPORT_P (x))
         return false;
+
+#if TARGET_MACHO
+      /* mdynamic-no-pic */
+      if (MACHO_DYNAMIC_NO_PIC_P)
+       return machopic_symbol_defined_p (x);
+#endif
        break;
  
      case CONST_DOUBLE:
@@ -11777,9 +11942,15 @@ ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
                 /* Non-constant pic memory reference.  */
                 return false;
             }
-         else if (! legitimate_pic_address_disp_p (disp))
+         else if ((!TARGET_MACHO || flag_pic)
+                   && ! legitimate_pic_address_disp_p (disp))
             /* Displacement is an invalid pic construct.  */
             return false;
+#if TARGET_MACHO
+         else if (MACHO_DYNAMIC_NO_PIC_P && !legitimate_constant_p (disp))
+           /* displacement must be referenced via non_lazy_pointer */
+           return false;
+#endif
  
            /* This code used to verify that a symbolic pic displacement
              includes the pic_offset_table_rtx register.
@@ -12388,6 +12559,11 @@ ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
    if (flag_pic && SYMBOLIC_CONST (x))
      return legitimize_pic_address (x, 0);
  
+#if TARGET_MACHO
+  if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
+    return machopic_indirect_data_reference (x, 0);
+#endif
+
    /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
    if (GET_CODE (x) == ASHIFT
        && CONST_INT_P (XEXP (x, 1))
@@ -13885,7 +14061,7 @@ ix86_print_operand (FILE *file, rtx x, int code)
         }
        if (CONST_INT_P (x))
         fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
-      else if (flag_pic)
+      else if (flag_pic || MACHOPIC_INDIRECT)
         output_pic_addr_const (file, x, code);
        else
         output_addr_const (file, x);
@@ -14866,25 +15042,43 @@ ix86_expand_move (enum machine_mode mode, rtx operands[])
         }
      }
  
-  if (flag_pic && mode == Pmode && symbolic_operand (op1, Pmode))
+  if ((flag_pic || MACHOPIC_INDIRECT) 
+       && mode == Pmode && symbolic_operand (op1, Pmode))
      {
        if (TARGET_MACHO && !TARGET_64BIT)
         {
  #if TARGET_MACHO
-         if (MACHOPIC_PURE)
+         /* dynamic-no-pic */
+         if (MACHOPIC_INDIRECT)
             {
               rtx temp = ((reload_in_progress
                            || ((op0 && REG_P (op0))
                                && mode == Pmode))
                           ? op0 : gen_reg_rtx (Pmode));
               op1 = machopic_indirect_data_reference (op1, temp);
-             op1 = machopic_legitimize_pic_address (op1, mode,
-                                                    temp == op1 ? 0 : temp);
+             if (MACHOPIC_PURE)
+               op1 = machopic_legitimize_pic_address (op1, mode,
+                                                      temp == op1 ? 0 : temp);
             }
-         else if (MACHOPIC_INDIRECT)
-           op1 = machopic_indirect_data_reference (op1, 0);
-         if (op0 == op1)
+         if (op0 != op1 && GET_CODE (op0) != MEM)
+           {
+             rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
+             emit_insn (insn);
+             return;
+           }
+         if (GET_CODE (op0) == MEM)
+           op1 = force_reg (Pmode, op1);
+         else
+           {
+             rtx temp = op0;
+             if (GET_CODE (temp) != REG)
+               temp = gen_reg_rtx (Pmode);
+             temp = legitimize_pic_address (op1, temp);
+             if (temp == op0)
             return;
+             op1 = temp;
+           }
+      /* dynamic-no-pic */
  #endif
         }
        else
@@ -15423,6 +15617,13 @@ ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
        gcc_assert (code == PLUS);
        emit_insn (op);
      }
+  else if (reload_completed
+          && code == PLUS
+          && !rtx_equal_p (dst, src1))
+    {
+      /* This is going to be an LEA; avoid splitting it later.  */
+      emit_insn (op);
+    }
    else
      {
        clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
@@ -21320,10 +21521,12 @@ ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
                                + 2, vec));
      }
  
-  /* Emit vzeroupper if needed.  */
+  /* Add UNSPEC_CALL_NEEDS_VZEROUPPER decoration.  */
    if (TARGET_VZEROUPPER && cfun->machine->use_avx256_p)
      {
+      rtx unspec;
        int avx256;
+
        cfun->machine->use_vzeroupper_p = 1;
        if (cfun->machine->callee_pass_avx256_p)
         {
@@ -21336,7 +21539,11 @@ ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
         avx256 = callee_return_avx256;
        else
         avx256 = call_no_avx256;
-      emit_insn (gen_avx_vzeroupper (GEN_INT (avx256))); 
+
+      unspec = gen_rtx_UNSPEC (VOIDmode,
+                              gen_rtvec (1, GEN_INT (avx256)),
+                              UNSPEC_CALL_NEEDS_VZEROUPPER);
+      call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, call, unspec));
      }
  
    call = emit_call_insn (call);
@@ -21346,6 +21553,91 @@ ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
    return call;
  }
  
+void
+ix86_split_call_vzeroupper (rtx insn, rtx vzeroupper)
+{ /* Emit a vzeroupper (operand VZEROUPPER), then the call from INSN.  */
+  rtx call = XVECEXP (PATTERN (insn), 0, 0);  /* Element 0 of the pattern.  */
+  emit_insn (gen_avx_vzeroupper (vzeroupper));  /* vzeroupper goes first ...  */
+  emit_call_insn (call);  /* ... then the extracted call.  */
+}
+
+void
+ix86_split_call_pop_vzeroupper (rtx insn, rtx vzeroupper)
+{ /* Emit a vzeroupper, then a PARALLEL of INSN's call and pop elements.  */
+  rtx call = XVECEXP (PATTERN (insn), 0, 0);  /* Element 0 of the pattern.  */
+  rtx pop = XVECEXP (PATTERN (insn), 0, 1);  /* Element 1 of the pattern.  */
+  emit_insn (gen_avx_vzeroupper (vzeroupper));
+  emit_call_insn (gen_rtx_PARALLEL (VOIDmode,
+                                   gen_rtvec (2, call, pop)));
+}
+
+/* Return the asm template for call INSN, using operand ADDR_OP (0 or 1).  */
+
+const char *
+ix86_output_call_insn (rtx insn, rtx call_op, int addr_op)
+{
+  bool direct_p = constant_call_address_operand (call_op, Pmode);
+  bool seh_nop_p = false;
+
+  gcc_assert (addr_op == 0 || addr_op == 1);
+
+  if (SIBLING_CALL_P (insn))
+    {
+      if (direct_p)
+       return addr_op ? "jmp\t%P1" : "jmp\t%P0";
+      /* SEH epilogue detection requires the indirect branch case
+        to include REX.W.  */
+      else if (TARGET_SEH)
+       return addr_op ? "rex.W jmp %A1" : "rex.W jmp %A0";
+      else
+       return addr_op ? "jmp\t%A1" : "jmp\t%A0";
+    }
+
+  /* SEH unwinding can require an extra nop to be emitted in several
+     circumstances.  Determine if we have one of those.  */
+  if (TARGET_SEH)
+    {
+      rtx i;
+
+      for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
+       {
+         /* If we get to another real insn, we don't need the nop.  */
+         if (INSN_P (i))
+           break;
+
+         /* If we get to the epilogue note, prevent a catch region from
+            being adjacent to the standard epilogue sequence.  If non-
+            call-exceptions, we'll have done this during epilogue emission. */
+         if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
+             && !flag_non_call_exceptions
+             && !can_throw_internal (insn))
+           {
+             seh_nop_p = true;
+             break;
+           }
+       }
+
+      /* If we didn't find a real insn following the call, prevent the
+        unwinder from looking into the next function.  */
+      if (i == NULL)
+       seh_nop_p = true;
+    }
+
+  if (direct_p)
+    {
+      if (seh_nop_p)
+       return addr_op ? "call\t%P1\n\tnop" : "call\t%P0\n\tnop";
+      else
+       return addr_op ? "call\t%P1" : "call\t%P0";
+    }
+  else
+    {
+      if (seh_nop_p)
+       return addr_op ? "call\t%A1\n\tnop" : "call\t%A0\n\tnop";
+      else
+       return addr_op ? "call\t%A1" : "call\t%A0";
+    }
+}
  \f
  /* Clear stack slot assignments remembered from previous functions.
     This is called from INIT_EXPANDERS once before RTL is emitted for each
@@ -21961,12 +22253,265 @@ ia32_multipass_dfa_lookahead (void)
      case PROCESSOR_K6:
        return 1;
  
+    case PROCESSOR_CORE2:
+    case PROCESSOR_COREI7_32:
+    case PROCESSOR_COREI7_64:
+      /* Generally, we want haifa-sched:max_issue() to look ahead as far
+        as many instructions can be executed on a cycle, i.e.,
+        issue_rate.  I wonder why tuning for many CPUs does not do this.  */
+      return ix86_issue_rate ();
+
      default:
        return 0;
      }
  }
  
  \f
+
+/* Model the decoder of Core 2/i7.
+   The hooks below for multipass scheduling (see haifa-sched.c:max_issue)
+   track the instruction fetch block boundaries and make sure that long
+   (9+ bytes) instructions are assigned to D0.  */
+
+/* Maximum length of an insn that can be handled by
+   a secondary decoder unit.  '8' for Core 2/i7.  */
+static int core2i7_secondary_decoder_max_insn_size;
+
+/* Ifetch block size, i.e., number of bytes decoder reads per cycle.
+   '16' for Core 2/i7.  */
+static int core2i7_ifetch_block_size;
+
+/* Maximum number of instructions decoder can handle per cycle.
+   '6' for Core 2/i7.  */
+static int core2i7_ifetch_block_max_insns;
+
+typedef struct ix86_first_cycle_multipass_data_ *
+  ix86_first_cycle_multipass_data_t;
+typedef const struct ix86_first_cycle_multipass_data_ *
+  const_ix86_first_cycle_multipass_data_t;
+
+/* A variable to store target state across calls to max_issue within
+   one cycle.  */
+static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
+  *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
+
+/* Initialize _DATA: zero the ifetch block counters, no change bitmap yet.  */
+static void
+core2i7_first_cycle_multipass_init (void *_data)
+{
+  ix86_first_cycle_multipass_data_t data
+    = (ix86_first_cycle_multipass_data_t) _data;
+
+  data->ifetch_block_len = 0;
+  data->ifetch_block_n_insns = 0;
+  data->ready_try_change = NULL;
+  data->ready_try_change_size = 0;
+}
+
+/* Cycle advanced: check the insn-count limit, reset ifetch block counts.  */
+static void
+core2i7_dfa_post_advance_cycle (void)
+{
+  ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
+
+  gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
+
+  data->ifetch_block_len = 0;
+  data->ifetch_block_n_insns = 0;
+}
+
+static int min_insn_size (rtx);
+
+/* Mask out (set to 1) READY_TRY entries the decoder model in DATA cannot
+   issue this cycle, recording each one in DATA's bitmap when it exists.  */
+static void
+core2i7_first_cycle_multipass_filter_ready_try
+(const_ix86_first_cycle_multipass_data_t data,
+ char *ready_try, int n_ready, bool first_cycle_insn_p)
+{
+  while (n_ready--)
+    {
+      rtx insn;
+      int insn_size;
+
+      if (ready_try[n_ready])
+       continue;
+
+      insn = get_ready_element (n_ready);
+      insn_size = min_insn_size (insn);
+
+      if (/* If this insn is too long for a secondary decoder ...  */
+         (!first_cycle_insn_p
+          && insn_size > core2i7_secondary_decoder_max_insn_size)
+         /* ... or it would not fit into the ifetch block ...  */
+         || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
+         /* ... or the decoder is full already ...  */
+         || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
+       /* ... mask the insn out.  */
+       {
+         ready_try[n_ready] = 1;
+
+         if (data->ready_try_change)
+           SET_BIT (data->ready_try_change, n_ready);
+       }
+    }
+}
+
+/* Begin a lookahead round: seed _DATA from saved state, filter READY_TRY.  */
+static void
+core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
+                                    bool first_cycle_insn_p)
+{
+  ix86_first_cycle_multipass_data_t data
+    = (ix86_first_cycle_multipass_data_t) _data;
+  const_ix86_first_cycle_multipass_data_t prev_data
+    = ix86_first_cycle_multipass_data;
+
+  /* Restore the state from the end of the previous round.  */
+  data->ifetch_block_len = prev_data->ifetch_block_len;
+  data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
+
+  /* Filter instructions that cannot be issued on current cycle due to
+     decoder restrictions.  */
+  core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
+                                                 first_cycle_insn_p);
+}
+
+/* INSN is being issued in the current solution.  Derive _DATA from
+   _PREV_DATA plus INSN's size, then re-filter READY_TRY.  */
+static void
+core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
+                                    rtx insn, const void *_prev_data)
+{
+  ix86_first_cycle_multipass_data_t data
+    = (ix86_first_cycle_multipass_data_t) _data;
+  const_ix86_first_cycle_multipass_data_t prev_data
+    = (const_ix86_first_cycle_multipass_data_t) _prev_data;
+
+  int insn_size = min_insn_size (insn);
+
+  data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
+  data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
+  gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
+             && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
+
+  /* Allocate or resize the bitmap for storing INSN's effect on ready_try.  */
+  if (!data->ready_try_change)
+    {
+      data->ready_try_change = sbitmap_alloc (n_ready);
+      data->ready_try_change_size = n_ready;
+    }
+  else if (data->ready_try_change_size < n_ready)
+    {
+      data->ready_try_change = sbitmap_resize (data->ready_try_change,
+                                              n_ready, 0);
+      data->ready_try_change_size = n_ready;
+    }
+  sbitmap_zero (data->ready_try_change);
+
+  /* Filter out insns from ready_try that the core will not be able to issue
+     on current cycle due to decoder.  */
+  core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
+                                                 false);
+}
+
+/* Revert READY_TRY: clear every entry whose bit is set in _DATA's bitmap.  */
+static void
+core2i7_first_cycle_multipass_backtrack (const void *_data,
+                                        char *ready_try,
+                                        int n_ready ATTRIBUTE_UNUSED)
+{
+  const_ix86_first_cycle_multipass_data_t data
+    = (const_ix86_first_cycle_multipass_data_t) _data;
+  unsigned int i = 0;
+  sbitmap_iterator sbi;
+
+  gcc_assert (sbitmap_last_set_bit (data->ready_try_change) < n_ready);
+  EXECUTE_IF_SET_IN_SBITMAP (data->ready_try_change, 0, i, sbi)
+    {
+      ready_try[i] = 0;
+    }
+}
+
+/* Save _DATA's ifetch counters for the next round; NULL _DATA is a no-op.  */
+static void
+core2i7_first_cycle_multipass_end (const void *_data)
+{
+  const_ix86_first_cycle_multipass_data_t data
+    = (const_ix86_first_cycle_multipass_data_t) _data;
+  ix86_first_cycle_multipass_data_t next_data
+    = ix86_first_cycle_multipass_data;
+
+  if (data != NULL)
+    {
+      next_data->ifetch_block_len = data->ifetch_block_len;
+      next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
+    }
+}
+
+/* Free the change bitmap held in _DATA, if any, and reset its size.  */
+static void
+core2i7_first_cycle_multipass_fini (void *_data)
+{
+  ix86_first_cycle_multipass_data_t data
+    = (ix86_first_cycle_multipass_data_t) _data;
+
+  if (data->ready_try_change)
+    {
+      sbitmap_free (data->ready_try_change);
+      data->ready_try_change = NULL;
+      data->ready_try_change_size = 0;
+    }
+}
+
+/* Install or clear the Core 2/i7 multipass hooks according to ix86_tune.  */
+static void
+ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
+                       int verbose ATTRIBUTE_UNUSED,
+                       int max_uid ATTRIBUTE_UNUSED)
+{
+  /* Install scheduling hooks for current CPU.  Some of these hooks are used
+     in time-critical parts of the scheduler, so we only set them up when
+     they are actually used.  */
+  switch (ix86_tune)
+    {
+    case PROCESSOR_CORE2:
+    case PROCESSOR_COREI7_32:
+    case PROCESSOR_COREI7_64:
+      targetm.sched.dfa_post_advance_cycle
+       = core2i7_dfa_post_advance_cycle;
+      targetm.sched.first_cycle_multipass_init
+       = core2i7_first_cycle_multipass_init;
+      targetm.sched.first_cycle_multipass_begin
+       = core2i7_first_cycle_multipass_begin;
+      targetm.sched.first_cycle_multipass_issue
+       = core2i7_first_cycle_multipass_issue;
+      targetm.sched.first_cycle_multipass_backtrack
+       = core2i7_first_cycle_multipass_backtrack;
+      targetm.sched.first_cycle_multipass_end
+       = core2i7_first_cycle_multipass_end;
+      targetm.sched.first_cycle_multipass_fini
+       = core2i7_first_cycle_multipass_fini;
+
+      /* Set decoder parameters.  */
+      core2i7_secondary_decoder_max_insn_size = 8;
+      core2i7_ifetch_block_size = 16;
+      core2i7_ifetch_block_max_insns = 6;
+      break;
+
+    default:
+      targetm.sched.dfa_post_advance_cycle = NULL;
+      targetm.sched.first_cycle_multipass_init = NULL;
+      targetm.sched.first_cycle_multipass_begin = NULL;
+      targetm.sched.first_cycle_multipass_issue = NULL;
+      targetm.sched.first_cycle_multipass_backtrack = NULL;
+      targetm.sched.first_cycle_multipass_end = NULL;
+      targetm.sched.first_cycle_multipass_fini = NULL;
+      break;
+    }
+}
+
+\f
  /* Compute the alignment given to a constant that is being placed in memory.
     EXP is the constant and ALIGN is the alignment that the object would
     ordinarily have.
@@ -28147,36 +28692,81 @@ machopic_output_stub (FILE *file, const char *symb, const char *stub)
  
    sprintf (lazy_ptr_name, "L%d$lz", label);
  
-  if (MACHOPIC_PURE)
+  if (MACHOPIC_ATT_STUB)
+    switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
+  else if (MACHOPIC_PURE)
+    {
+      if (TARGET_DEEP_BRANCH_PREDICTION)
+       switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
+      else
      switch_to_section (darwin_sections[machopic_picsymbol_stub_section]);
+    }
    else
      switch_to_section (darwin_sections[machopic_symbol_stub_section]);
  
    fprintf (file, "%s:\n", stub);
    fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
  
-  if (MACHOPIC_PURE)
+  if (MACHOPIC_ATT_STUB)
      {
-      fprintf (file, "\tcall\tLPC$%d\nLPC$%d:\tpopl\t%%eax\n", label, label);
-      fprintf (file, "\tmovl\t%s-LPC$%d(%%eax),%%edx\n", lazy_ptr_name, label);
-      fprintf (file, "\tjmp\t*%%edx\n");
+      fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
+    }
+  else if (MACHOPIC_PURE)
+    {
+      /* PIC stub.  */
+      if (TARGET_DEEP_BRANCH_PREDICTION)
+       {
+         /* 25-byte PIC stub using "CALL get_pc_thunk".  */
+         rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
+         output_set_got (tmp, NULL_RTX);       /* "CALL ___<cpu>.get_pc_thunk.cx".  */
+         fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n", label, lazy_ptr_name, label);
+       }
+      else
+       {
+         /* 26-byte PIC stub using inline picbase: "CALL L42 ! L42: pop %ecx".  */
+         fprintf (file, "\tcall LPC$%d\nLPC$%d:\tpopl %%ecx\n", label, label);
+         fprintf (file, "\tmovl %s-LPC$%d(%%ecx),%%ecx\n", lazy_ptr_name, label);
+       }
+      fprintf (file, "\tjmp\t*%%ecx\n");
      }
    else
      fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
  
+  /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
+     it needs no stub-binding-helper.  */
+  if (MACHOPIC_ATT_STUB)
+    return;
+
    fprintf (file, "%s:\n", binder_name);
  
    if (MACHOPIC_PURE)
      {
-      fprintf (file, "\tlea\t%s-LPC$%d(%%eax),%%eax\n", lazy_ptr_name, label);
-      fputs ("\tpushl\t%eax\n", file);
+      fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
+      fprintf (file, "\tpushl\t%%ecx\n");
      }
    else
      fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
  
    fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
  
+  /* N.B. Keep the correspondence of these
+     'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
+     old-pic/new-pic/non-pic stubs; altering this will break
+     compatibility with existing dylibs.  */
+  if (MACHOPIC_PURE)
+    {
+      /* PIC stubs.  */
+      if (TARGET_DEEP_BRANCH_PREDICTION)
+       /* 25-byte PIC stub using "CALL get_pc_thunk".  */
+       switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
+      else
+       /* 26-byte PIC stub using inline picbase: "CALL L42 ! L42: pop %ecx".  */
    switch_to_section (darwin_sections[machopic_lazy_symbol_ptr_section]);
+    }
+  else
+    /* 16-byte -mdynamic-no-pic stub.  */
+    switch_to_section(darwin_sections[machopic_lazy_symbol_ptr3_section]);
+
    fprintf (file, "%s:\n", lazy_ptr_name);
    fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
    fprintf (file, ASM_LONG "%s\n", binder_name);
@@ -33673,6 +34263,8 @@ ix86_autovectorize_vector_sizes (void)
  #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
  #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra 
  
+#undef TARGET_SCHED_INIT_GLOBAL
+#define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
  #undef TARGET_SCHED_ADJUST_COST
  #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
  #undef TARGET_SCHED_ISSUE_RATE