PR target/43667

[pf3gnuchains/gcc-fork.git] / gcc / config / i386 / i386.c
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c

index ad1eee4..c945a54 100644 (file)
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -1,6 +1,6 @@
  /* Subroutines used for code generation on IA-32.
-   Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
-   2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
+   Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
+   2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
     Free Software Foundation, Inc.
  
  This file is part of GCC.
@@ -53,8 +53,9 @@ along with GCC; see the file COPYING3.  If not see
  #include "tm-constrs.h"
  #include "params.h"
  #include "cselib.h"
+#include "debug.h"
+#include "dwarf2out.h"
  
-static int x86_builtin_vectorization_cost (bool);
  static rtx legitimize_dllimport_symbol (rtx, bool);
  
  #ifndef CHECK_STACK_LIMIT
@@ -1457,7 +1458,7 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
    m_AMD_MULTIPLE,
  
    /* X86_TUNE_INTER_UNIT_MOVES */
-  ~(m_AMD_MULTIPLE | m_ATOM | m_GENERIC),
+  ~(m_AMD_MULTIPLE | m_GENERIC),
  
    /* X86_TUNE_INTER_UNIT_CONVERSIONS */
    ~(m_AMDFAM10),
@@ -1808,7 +1809,7 @@ static rtx (*ix86_gen_leave) (void);
  static rtx (*ix86_gen_pop1) (rtx);
  static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
  static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
-static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx);
+static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
  static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
  static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
  static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
@@ -1885,6 +1886,7 @@ static void ix86_compute_frame_layout (struct ix86_frame *);
  static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
                                                  rtx, rtx, int);
  static void ix86_add_new_builtins (int);
+static rtx ix86_expand_vec_perm_builtin (tree);
  
  enum ix86_function_specific_strings
  {
@@ -1910,6 +1912,10 @@ static unsigned int ix86_minimum_incoming_stack_boundary (bool);
  static enum calling_abi ix86_function_abi (const_tree);
  
  \f
+#ifndef SUBTARGET32_DEFAULT_CPU
+#define SUBTARGET32_DEFAULT_CPU "i386"
+#endif
+
  /* The svr4 ABI for the i386 says that records and unions are returned
     in memory.  */
  #ifndef DEFAULT_PCC_STRUCT_RETURN
@@ -2400,7 +2406,7 @@ ix86_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, int value)
      }
  }
  \f
-/* Return a string the documents the current -m options.  The caller is
+/* Return a string that documents the current -m options.  The caller is
     responsible for freeing the string.  */
  
  static char *
@@ -2419,6 +2425,7 @@ ix86_target_string (int isa, int flags, const char *arch, const char *tune,
    {
      { "-m64",          OPTION_MASK_ISA_64BIT },
      { "-mfma4",                OPTION_MASK_ISA_FMA4 },
+    { "-mfma",         OPTION_MASK_ISA_FMA },
      { "-mxop",         OPTION_MASK_ISA_XOP },
      { "-mlwp",         OPTION_MASK_ISA_LWP },
      { "-msse4a",       OPTION_MASK_ISA_SSE4A },
@@ -2621,6 +2628,7 @@ override_options (bool main_args_p)
  {
    int i;
    unsigned int ix86_arch_mask, ix86_tune_mask;
+  const bool ix86_tune_specified = (ix86_tune_string != NULL); 
    const char *prefix;
    const char *suffix;
    const char *sw;
@@ -2821,8 +2829,12 @@ override_options (bool main_args_p)
                    || !strcmp (ix86_tune_string, "generic64")))
         ;
        else if (!strncmp (ix86_tune_string, "generic", 7))
-       error ("bad value (%s) for %stune=%s %s",
+        error ("bad value (%s) for %stune=%s %s",
                ix86_tune_string, prefix, suffix, sw);
+      else if (!strcmp (ix86_tune_string, "x86-64"))
+        warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated.  Use "
+                 "%stune=k8%s or %stune=generic%s instead as appropriate.",
+                 prefix, suffix, prefix, suffix, prefix, suffix);
      }
    else
      {
@@ -2846,6 +2858,7 @@ override_options (bool main_args_p)
             ix86_tune_string = "generic32";
         }
      }
+
    if (ix86_stringop_string)
      {
        if (!strcmp (ix86_stringop_string, "rep_byte"))
@@ -2868,23 +2881,12 @@ override_options (bool main_args_p)
         error ("bad value (%s) for %sstringop-strategy=%s %s",
                ix86_stringop_string, prefix, suffix, sw);
      }
-  if (!strcmp (ix86_tune_string, "x86-64"))
-    warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated.  Use "
-            "%stune=k8%s or %stune=generic%s instead as appropriate.",
-            prefix, suffix, prefix, suffix, prefix, suffix);
  
    if (!ix86_arch_string)
-    ix86_arch_string = TARGET_64BIT ? "x86-64" : "i386";
+    ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
    else
      ix86_arch_specified = 1;
  
-  if (!strcmp (ix86_arch_string, "generic"))
-    error ("generic CPU can be used only for %stune=%s %s",
-          prefix, suffix, sw);
-  if (!strncmp (ix86_arch_string, "generic", 7))
-    error ("bad value (%s) for %sarch=%s %s",
-          ix86_arch_string, prefix, suffix, sw);
-
    /* Validate -mabi= value.  */
    if (ix86_abi_string)
      {
@@ -3032,7 +3034,10 @@ override_options (bool main_args_p)
         break;
        }
  
-  if (i == pta_size)
+  if (!strcmp (ix86_arch_string, "generic"))
+    error ("generic CPU can be used only for %stune=%s %s",
+          prefix, suffix, sw);
+  else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
      error ("bad value (%s) for %sarch=%s %s",
            ix86_arch_string, prefix, suffix, sw);
  
@@ -3071,7 +3076,8 @@ override_options (bool main_args_p)
           x86_prefetch_sse = true;
         break;
        }
-  if (i == pta_size)
+
+  if (ix86_tune_specified && i == pta_size)
      error ("bad value (%s) for %stune=%s %s",
            ix86_tune_string, prefix, suffix, sw);
  
@@ -3191,8 +3197,6 @@ override_options (bool main_args_p)
         ix86_tls_dialect = TLS_DIALECT_GNU;
        else if (strcmp (ix86_tls_dialect_string, "gnu2") == 0)
         ix86_tls_dialect = TLS_DIALECT_GNU2;
-      else if (strcmp (ix86_tls_dialect_string, "sun") == 0)
-       ix86_tls_dialect = TLS_DIALECT_SUN;
        else
         error ("bad value (%s) for %stls-dialect=%s %s",
                ix86_tls_dialect_string, prefix, suffix, sw);
@@ -5346,7 +5350,7 @@ classify_argument (enum machine_mode mode, const_tree type,
      }
  
    /* for V1xx modes, just use the base mode */
-  if (VECTOR_MODE_P (mode) && mode != V1DImode
+  if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
        && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
      mode = GET_MODE_INNER (mode);
  
@@ -5470,6 +5474,7 @@ classify_argument (enum machine_mode mode, const_tree type,
        classes[0] = X86_64_SSE_CLASS;
        classes[1] = X86_64_SSEUP_CLASS;
        return 2;
+    case V1TImode:
      case V1DImode:
      case V2SFmode:
      case V2SImode:
@@ -5814,6 +5819,7 @@ function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
      case V4HImode:
      case V2SImode:
      case V2SFmode:
+    case V1TImode:
      case V1DImode:
        if (!type || !AGGREGATE_TYPE_P (type))
         {
@@ -6001,6 +6007,7 @@ function_arg_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
      case V4HImode:
      case V2SImode:
      case V2SFmode:
+    case V1TImode:
      case V1DImode:
        if (!type || !AGGREGATE_TYPE_P (type))
         {
@@ -7573,8 +7580,8 @@ get_pc_thunk_name (char name[32], unsigned int regno)
  /* This function generates code for -fpic that loads %ebx with
     the return address of the caller and then returns.  */
  
-void
-ix86_file_end (void)
+static void
+ix86_code_end (void)
  {
    rtx xops[2];
    int regno;
@@ -7582,12 +7589,21 @@ ix86_file_end (void)
    for (regno = 0; regno < 8; ++regno)
      {
        char name[32];
+      tree decl;
  
        if (! ((pic_labels_used >> regno) & 1))
         continue;
  
        get_pc_thunk_name (name, regno);
  
+      decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
+                        get_identifier (name),
+                        build_function_type (void_type_node, void_list_node));
+      DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
+                                      NULL_TREE, void_type_node);
+      TREE_PUBLIC (decl) = 1;
+      TREE_STATIC (decl) = 1;
+
  #if TARGET_MACHO
        if (TARGET_MACHO)
         {
@@ -7598,18 +7614,12 @@ ix86_file_end (void)
           assemble_name (asm_out_file, name);
           fputs ("\n", asm_out_file);
           ASM_OUTPUT_LABEL (asm_out_file, name);
+         DECL_WEAK (decl) = 1;
         }
        else
  #endif
        if (USE_HIDDEN_LINKONCE)
         {
-         tree decl;
-
-         decl = build_decl (BUILTINS_LOCATION,
-                            FUNCTION_DECL, get_identifier (name),
-                            error_mark_node);
-         TREE_PUBLIC (decl) = 1;
-         TREE_STATIC (decl) = 1;
           DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
  
           (*targetm.asm_out.unique_section) (decl, 0);
@@ -7627,14 +7637,23 @@ ix86_file_end (void)
           ASM_OUTPUT_LABEL (asm_out_file, name);
         }
  
+      DECL_INITIAL (decl) = make_node (BLOCK);
+      current_function_decl = decl;
+      init_function_start (decl);
+      first_function_block_is_cold = false;
+      /* Make sure unwind info is emitted for the thunk if needed.  */
+      final_start_function (emit_barrier (), asm_out_file, 1);
+
        xops[0] = gen_rtx_REG (Pmode, regno);
        xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
        output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
        output_asm_insn ("ret", xops);
+      final_end_function ();
+      init_insn_lengths ();
+      free_after_compilation (cfun);
+      set_cfun (NULL);
+      current_function_decl = NULL;
      }
-
-  if (NEED_INDICATE_EXEC_STACK)
-    file_end_indicate_exec_stack ();
  }
  
  /* Emit code for the SET_GOT patterns.  */
@@ -7671,7 +7690,24 @@ output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
        if (!flag_pic)
         output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
        else
-       output_asm_insn ("call\t%a2", xops);
+       {
+         output_asm_insn ("call\t%a2", xops);
+#ifdef DWARF2_UNWIND_INFO
+         /* The call to next label acts as a push.  */
+         if (dwarf2out_do_frame ())
+           {
+             rtx insn;
+             start_sequence ();
+             insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
+                                            gen_rtx_PLUS (Pmode,
+                                                          stack_pointer_rtx,
+                                                          GEN_INT (-4))));
+             RTX_FRAME_RELATED_P (insn) = 1;
+             dwarf2out_frame_debug (insn, true);
+             end_sequence ();
+           }
+#endif
+       }
  
  #if TARGET_MACHO
        /* Output the Mach-O "canonical" label name ("Lxx$pb") here too.  This
@@ -7684,7 +7720,27 @@ output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
                                  CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
  
        if (flag_pic)
-       output_asm_insn ("pop%z0\t%0", xops);
+       {
+         output_asm_insn ("pop%z0\t%0", xops);
+#ifdef DWARF2_UNWIND_INFO
+         /* The pop is a pop and clobbers dest, but doesn't restore it
+            for unwind info purposes.  */
+         if (dwarf2out_do_frame ())
+           {
+             rtx insn;
+             start_sequence ();
+             insn = emit_insn (gen_rtx_SET (VOIDmode, dest, const0_rtx));
+             dwarf2out_frame_debug (insn, true);
+             insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
+                                            gen_rtx_PLUS (Pmode,
+                                                          stack_pointer_rtx,
+                                                          GEN_INT (4))));
+             RTX_FRAME_RELATED_P (insn) = 1;
+             dwarf2out_frame_debug (insn, true);
+             end_sequence ();
+           }
+#endif
+       }
      }
    else
      {
@@ -7692,6 +7748,18 @@ output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
        get_pc_thunk_name (name, REGNO (dest));
        pic_labels_used |= 1 << REGNO (dest);
  
+#ifdef DWARF2_UNWIND_INFO
+      /* Ensure all queued register saves are flushed before the
+        call.  */
+      if (dwarf2out_do_frame ())
+       {
+         rtx insn;
+         start_sequence ();
+         insn = emit_barrier ();
+         end_sequence ();
+         dwarf2out_frame_debug (insn, false);
+       }
+#endif
        xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
        xops[2] = gen_rtx_MEM (QImode, xops[2]);
        output_asm_insn ("call\t%X2", xops);
@@ -8328,7 +8396,11 @@ ix86_get_drap_rtx (void)
        end_sequence ();
        
        insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
-      RTX_FRAME_RELATED_P (insn) = 1;
+      if (!optimize)
+       {
+         add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
+         RTX_FRAME_RELATED_P (insn) = 1;
+       }
        return drap_vreg;
      }
    else
@@ -8556,13 +8628,10 @@ ix86_expand_prologue (void)
                                ix86_cfa_state->reg == stack_pointer_rtx);
    else
      {
-      /* Only valid for Win32.  */
        rtx eax = gen_rtx_REG (Pmode, AX_REG);
        bool eax_live;
        rtx t;
  
-      gcc_assert (!TARGET_64BIT || cfun->machine->call_abi == MS_ABI);
-
        if (cfun->machine->call_abi == MS_ABI)
         eax_live = false;
        else
@@ -8844,7 +8913,7 @@ ix86_emit_restore_sse_regs_using_mov (rtx pointer, HOST_WIDE_INT offset,
  {
    int regno;
    rtx base_address = gen_rtx_MEM (TImode, pointer);
-  rtx mem, insn;
+  rtx mem;
  
    for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
      if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
@@ -8865,7 +8934,7 @@ ix86_emit_restore_sse_regs_using_mov (rtx pointer, HOST_WIDE_INT offset,
           }
         mem = adjust_address (base_address, TImode, offset);
         set_mem_align (mem, 128);
-       insn = emit_move_insn (reg, mem);
+       emit_move_insn (reg, mem);
         offset += 16;
  
         ix86_add_cfa_restore_note (NULL_RTX, reg, red_offset);
@@ -10791,29 +10860,29 @@ output_pic_addr_const (FILE *file, rtx x, int code)
           break;
         case UNSPEC_GOTTPOFF:
           /* FIXME: This might be @TPOFF in Sun ld too.  */
-         fputs ("@GOTTPOFF", file);
+         fputs ("@gottpoff", file);
           break;
         case UNSPEC_TPOFF:
-         fputs ("@TPOFF", file);
+         fputs ("@tpoff", file);
           break;
         case UNSPEC_NTPOFF:
           if (TARGET_64BIT)
-           fputs ("@TPOFF", file);
+           fputs ("@tpoff", file);
           else
-           fputs ("@NTPOFF", file);
+           fputs ("@ntpoff", file);
           break;
         case UNSPEC_DTPOFF:
-         fputs ("@DTPOFF", file);
+         fputs ("@dtpoff", file);
           break;
         case UNSPEC_GOTNTPOFF:
           if (TARGET_64BIT)
             fputs (ASSEMBLER_DIALECT == ASM_ATT ?
-                  "@GOTTPOFF(%rip)": "@GOTTPOFF[rip]", file);
+                  "@gottpoff(%rip)": "@gottpoff[rip]", file);
           else
-           fputs ("@GOTNTPOFF", file);
+           fputs ("@gotntpoff", file);
           break;
         case UNSPEC_INDNTPOFF:
-         fputs ("@INDNTPOFF", file);
+         fputs ("@indntpoff", file);
           break;
  #if TARGET_MACHO
         case UNSPEC_MACHOPIC_OFFSET:
@@ -10840,7 +10909,7 @@ i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
  {
    fputs (ASM_LONG, file);
    output_addr_const (file, x);
-  fputs ("@DTPOFF", file);
+  fputs ("@dtpoff", file);
    switch (size)
      {
      case 4:
@@ -10881,6 +10950,9 @@ static rtx
  ix86_delegitimize_address (rtx x)
  {
    rtx orig_x = delegitimize_mem_from_attrs (x);
+  /* addend is NULL or some rtx if x is something+GOTOFF where
+     something doesn't include the PIC register.  */
+  rtx addend = NULL_RTX;
    /* reg_addend is NULL or a multiple of some register.  */
    rtx reg_addend = NULL_RTX;
    /* const_addend is NULL or a const_int.  */
@@ -10919,14 +10991,13 @@ ix86_delegitimize_address (rtx x)
        else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
         reg_addend = XEXP (reg_addend, 0);
        else
-       return orig_x;
-      if (!REG_P (reg_addend)
-         && GET_CODE (reg_addend) != MULT
-         && GET_CODE (reg_addend) != ASHIFT)
-       return orig_x;
+       {
+         reg_addend = NULL_RTX;
+         addend = XEXP (x, 0);
+       }
      }
    else
-    return orig_x;
+    addend = XEXP (x, 0);
  
    x = XEXP (XEXP (x, 1), 0);
    if (GET_CODE (x) == PLUS
@@ -10937,7 +11008,7 @@ ix86_delegitimize_address (rtx x)
      }
  
    if (GET_CODE (x) == UNSPEC
-      && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x))
+      && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
           || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
      result = XVECEXP (x, 0, 0);
  
@@ -10952,6 +11023,22 @@ ix86_delegitimize_address (rtx x)
      result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
    if (reg_addend)
      result = gen_rtx_PLUS (Pmode, reg_addend, result);
+  if (addend)
+    {
+      /* If the rest of original X doesn't involve the PIC register, add
+        addend and subtract pic_offset_table_rtx.  This can happen e.g.
+        for code like:
+        leal (%ebx, %ecx, 4), %ecx
+        ...
+        movl foo@GOTOFF(%ecx), %edx
+        in which case we return (%ecx - %ebx) + foo.  */
+      if (pic_offset_table_rtx)
+        result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
+                                                    pic_offset_table_rtx),
+                              result);
+      else
+       return orig_x;
+    }
    return result;
  }
  
@@ -11303,7 +11390,6 @@ get_some_local_dynamic_name (void)
     L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
     C -- print opcode suffix for set/cmov insn.
     c -- like C, but print reversed condition
-   E,e -- likewise, but for compare-and-branch fused insn.
     F,f -- likewise, but for floating-point.
     O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
          otherwise nothing
@@ -11708,14 +11794,6 @@ print_operand (FILE *file, rtx x, int code)
           put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 1, file);
           return;
  
-       case 'E':
-         put_condition_code (GET_CODE (x), CCmode, 0, 0, file);
-         return;
-
-       case 'e':
-         put_condition_code (GET_CODE (x), CCmode, 1, 0, file);
-         return;
-
         case 'H':
           /* It doesn't actually matter what mode we use here, as we're
              only going to use this for printing.  */
@@ -11842,13 +11920,14 @@ print_operand (FILE *file, rtx x, int code)
             case 2: size = "WORD"; break;
             case 4: size = "DWORD"; break;
             case 8: size = "QWORD"; break;
-           case 12: size = "XWORD"; break;
+           case 12: size = "TBYTE"; break;
             case 16:
               if (GET_MODE (x) == XFmode)
-               size = "XWORD";
+               size = "TBYTE";
                else
                 size = "XMMWORD";
                break;
+           case 32: size = "YMMWORD"; break;
             default:
               gcc_unreachable ();
             }
@@ -12098,34 +12177,34 @@ output_addr_const_extra (FILE *file, rtx x)
      case UNSPEC_GOTTPOFF:
        output_addr_const (file, op);
        /* FIXME: This might be @TPOFF in Sun ld.  */
-      fputs ("@GOTTPOFF", file);
+      fputs ("@gottpoff", file);
        break;
      case UNSPEC_TPOFF:
        output_addr_const (file, op);
-      fputs ("@TPOFF", file);
+      fputs ("@tpoff", file);
        break;
      case UNSPEC_NTPOFF:
        output_addr_const (file, op);
        if (TARGET_64BIT)
-       fputs ("@TPOFF", file);
+       fputs ("@tpoff", file);
        else
-       fputs ("@NTPOFF", file);
+       fputs ("@ntpoff", file);
        break;
      case UNSPEC_DTPOFF:
        output_addr_const (file, op);
-      fputs ("@DTPOFF", file);
+      fputs ("@dtpoff", file);
        break;
      case UNSPEC_GOTNTPOFF:
        output_addr_const (file, op);
        if (TARGET_64BIT)
         fputs (ASSEMBLER_DIALECT == ASM_ATT ?
-              "@GOTTPOFF(%rip)" : "@GOTTPOFF[rip]", file);
+              "@gottpoff(%rip)" : "@gottpoff[rip]", file);
        else
-       fputs ("@GOTNTPOFF", file);
+       fputs ("@gotntpoff", file);
        break;
      case UNSPEC_INDNTPOFF:
        output_addr_const (file, op);
-      fputs ("@INDNTPOFF", file);
+      fputs ("@indntpoff", file);
        break;
  #if TARGET_MACHO
      case UNSPEC_MACHOPIC_OFFSET:
@@ -13848,6 +13927,19 @@ ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
    return TRUE;
  }
  
+/* Return TRUE if the operands to a vec_interleave_{high,low}v2df
+   are ok, keeping in mind the possible movddup alternative.  */
+
+bool
+ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
+{
+  if (MEM_P (operands[0]))
+    return rtx_equal_p (operands[0], operands[1 + high]);
+  if (MEM_P (operands[1]) && MEM_P (operands[2]))
+    return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
+  return true;
+}
+
  /* Post-reload splitter for converting an SF or DFmode value in an
     SSE register into an unsigned SImode.  */
  
@@ -13943,7 +14035,7 @@ ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
    exponents = validize_mem (force_const_mem (V4SImode, x));
  
    /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
-  emit_insn (gen_sse2_punpckldq (int_xmm, int_xmm, exponents));
+  emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
  
    /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
       yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
@@ -13969,7 +14061,7 @@ ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
    else
      {
        x = copy_to_mode_reg (V2DFmode, fp_xmm);
-      emit_insn (gen_sse2_unpckhpd (fp_xmm, fp_xmm, fp_xmm));
+      emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
        emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
      }
  
@@ -15364,7 +15456,7 @@ ix86_expand_int_movcc (rtx operands[])
    enum rtx_code code = GET_CODE (operands[1]), compare_code;
    rtx compare_seq, compare_op;
    enum machine_mode mode = GET_MODE (operands[0]);
-  bool sign_bit_compare_p = false;;
+  bool sign_bit_compare_p = false;
  
    start_sequence ();
    ix86_compare_op0 = XEXP (operands[1], 0);
@@ -15404,15 +15496,19 @@ ix86_expand_int_movcc (rtx operands[])
  
            if (!sign_bit_compare_p)
             {
+             rtx flags;
               bool fpcmp = false;
  
               compare_code = GET_CODE (compare_op);
  
-             if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
-                 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
+             flags = XEXP (compare_op, 0);
+
+             if (GET_MODE (flags) == CCFPmode
+                 || GET_MODE (flags) == CCFPUmode)
                 {
                   fpcmp = true;
-                 compare_code = ix86_fp_compare_code_to_integer (compare_code);
+                 compare_code
+                   = ix86_fp_compare_code_to_integer (compare_code);
                 }
  
               /* To simplify rest of code, restrict to the GEU case.  */
@@ -15431,7 +15527,8 @@ ix86_expand_int_movcc (rtx operands[])
                               reverse_condition_maybe_unordered
                                 (GET_CODE (compare_op)));
                   else
-                   PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
+                   PUT_CODE (compare_op,
+                             reverse_condition (GET_CODE (compare_op)));
                 }
               diff = ct - cf;
  
@@ -15440,10 +15537,10 @@ ix86_expand_int_movcc (rtx operands[])
                 tmp = gen_reg_rtx (mode);
  
               if (mode == DImode)
-               emit_insn (gen_x86_movdicc_0_m1 (tmp, compare_op));
+               emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
               else
-               emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
-                                                compare_op));
+               emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
+                                                flags, compare_op));
             }
           else
             {
@@ -16155,116 +16252,110 @@ ix86_expand_int_vcond (rtx operands[])
    /* XOP supports all of the comparisons on all vector int types.  */
    if (!TARGET_XOP)
      {
-  /* Canonicalize the comparison to EQ, GT, GTU.  */
-  switch (code)
-    {
-    case EQ:
-    case GT:
-    case GTU:
-      break;
-
-    case NE:
-    case LE:
-    case LEU:
-      code = reverse_condition (code);
-      negate = true;
-      break;
-
-    case GE:
-    case GEU:
-      code = reverse_condition (code);
-      negate = true;
-      /* FALLTHRU */
-
-    case LT:
-    case LTU:
-      code = swap_condition (code);
-      x = cop0, cop0 = cop1, cop1 = x;
-      break;
-
-    default:
-      gcc_unreachable ();
-    }
-
-  /* Only SSE4.1/SSE4.2 supports V2DImode.  */
-  if (mode == V2DImode)
-    {
+      /* Canonicalize the comparison to EQ, GT, GTU.  */
        switch (code)
         {
         case EQ:
-         /* SSE4.1 supports EQ.  */
-         if (!TARGET_SSE4_1)
-           return false;
-         break;
-
         case GT:
         case GTU:
-         /* SSE4.2 supports GT/GTU.  */
-         if (!TARGET_SSE4_2)
-           return false;
+         break;
+
+       case NE:
+       case LE:
+       case LEU:
+         code = reverse_condition (code);
+         negate = true;
+         break;
+
+       case GE:
+       case GEU:
+         code = reverse_condition (code);
+         negate = true;
+         /* FALLTHRU */
+
+       case LT:
+       case LTU:
+         code = swap_condition (code);
+         x = cop0, cop0 = cop1, cop1 = x;
           break;
  
         default:
           gcc_unreachable ();
         }
-    }
  
-  /* Unsigned parallel compare is not supported by the hardware.  Play some
-     tricks to turn this into a signed comparison against 0.  */
-  if (code == GTU)
-    {
-      cop0 = force_reg (mode, cop0);
+      /* Only SSE4.1/SSE4.2 supports V2DImode.  */
+      if (mode == V2DImode)
+       {
+         switch (code)
+           {
+           case EQ:
+             /* SSE4.1 supports EQ.  */
+             if (!TARGET_SSE4_1)
+               return false;
+             break;
  
-      switch (mode)
+           case GT:
+           case GTU:
+             /* SSE4.2 supports GT/GTU.  */
+             if (!TARGET_SSE4_2)
+               return false;
+             break;
+
+           default:
+             gcc_unreachable ();
+           }
+       }
+
+      /* Unsigned parallel compare is not supported by the hardware.
+        Play some tricks to turn this into a signed comparison
+        against 0.  */
+      if (code == GTU)
         {
-       case V4SImode:
-       case V2DImode:
-         {
-           rtx t1, t2, mask;
-
-           /* Perform a parallel modulo subtraction.  */
-           t1 = gen_reg_rtx (mode);
-           emit_insn ((mode == V4SImode
-                       ? gen_subv4si3
-                       : gen_subv2di3) (t1, cop0, cop1));
-
-           /* Extract the original sign bit of op0.  */
-           mask = ix86_build_signbit_mask (GET_MODE_INNER (mode),
-                                           true, false);
-           t2 = gen_reg_rtx (mode);
-           emit_insn ((mode == V4SImode
-                       ? gen_andv4si3
-                       : gen_andv2di3) (t2, cop0, mask));
-
-           /* XOR it back into the result of the subtraction.  This results
-              in the sign bit set iff we saw unsigned underflow.  */
-           x = gen_reg_rtx (mode);
-           emit_insn ((mode == V4SImode
-                       ? gen_xorv4si3
-                       : gen_xorv2di3) (x, t1, t2));
-
-           code = GT;
-         }
-         break;
+         cop0 = force_reg (mode, cop0);
+
+         switch (mode)
+           {
+           case V4SImode:
+           case V2DImode:
+               {
+                 rtx t1, t2, mask;
+                 rtx (*gen_sub3) (rtx, rtx, rtx);
+
+                 /* Subtract (-(INT MAX) - 1) from both operands to make
+                    them signed.  */
+                 mask = ix86_build_signbit_mask (GET_MODE_INNER (mode),
+                                                 true, false);
+                 gen_sub3 = (mode == V4SImode
+                             ? gen_subv4si3 : gen_subv2di3);
+                 t1 = gen_reg_rtx (mode);
+                 emit_insn (gen_sub3 (t1, cop0, mask));
+
+                 t2 = gen_reg_rtx (mode);
+                 emit_insn (gen_sub3 (t2, cop1, mask));
+
+                 cop0 = t1;
+                 cop1 = t2;
+                 code = GT;
+               }
+             break;
  
-       case V16QImode:
-       case V8HImode:
-         /* Perform a parallel unsigned saturating subtraction.  */
-         x = gen_reg_rtx (mode);
-         emit_insn (gen_rtx_SET (VOIDmode, x,
-                                 gen_rtx_US_MINUS (mode, cop0, cop1)));
+           case V16QImode:
+           case V8HImode:
+             /* Perform a parallel unsigned saturating subtraction.  */
+             x = gen_reg_rtx (mode);
+             emit_insn (gen_rtx_SET (VOIDmode, x,
+                                     gen_rtx_US_MINUS (mode, cop0, cop1)));
  
-         code = EQ;
-         negate = !negate;
-         break;
+             cop0 = x;
+             cop1 = CONST0_RTX (mode);
+             code = EQ;
+             negate = !negate;
+             break;
  
-       default:
-         gcc_unreachable ();
+           default:
+             gcc_unreachable ();
+           }
         }
-
-      cop0 = x;
-      cop1 = CONST0_RTX (mode);
-    }
      }
  
    x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
@@ -16360,9 +16451,9 @@ ix86_expand_sse4_unpack (rtx operands[2], bool unsigned_p, bool high_p)
      {
        /* Shift higher 8 bytes to lower 8 bytes.  */
        src = gen_reg_rtx (imode);
-      emit_insn (gen_sse2_lshrti3 (gen_lowpart (TImode, src),
-                                  gen_lowpart (TImode, operands[1]),
-                                  GEN_INT (64)));
+      emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, src),
+                                    gen_lowpart (V1TImode, operands[1]),
+                                    GEN_INT (64)));
      }
    else
      src = operands[1];
@@ -16377,11 +16468,12 @@ int
  ix86_expand_int_addcc (rtx operands[])
  {
    enum rtx_code code = GET_CODE (operands[1]);
-  rtx (*insn)(rtx, rtx, rtx, rtx);
+  rtx flags;
+  rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
    rtx compare_op;
    rtx val = const0_rtx;
    bool fpcmp = false;
-  enum machine_mode mode = GET_MODE (operands[0]);
+  enum machine_mode mode;
  
    ix86_compare_op0 = XEXP (operands[1], 0);
    ix86_compare_op1 = XEXP (operands[1], 1);
@@ -16393,8 +16485,10 @@ ix86_expand_int_addcc (rtx operands[])
       return 0;
    code = GET_CODE (compare_op);
  
-  if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
-      || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
+  flags = XEXP (compare_op, 0);
+
+  if (GET_MODE (flags) == CCFPmode
+      || GET_MODE (flags) == CCFPUmode)
      {
        fpcmp = true;
        code = ix86_fp_compare_code_to_integer (code);
@@ -16410,12 +16504,13 @@ ix86_expand_int_addcc (rtx operands[])
        else
         PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
      }
-  PUT_MODE (compare_op, mode);
+
+  mode = GET_MODE (operands[0]);
  
    /* Construct either adc or sbb insn.  */
    if ((code == LTU) == (operands[3] == constm1_rtx))
      {
-      switch (GET_MODE (operands[0]))
+      switch (mode)
         {
           case QImode:
             insn = gen_subqi3_carry;
@@ -16435,7 +16530,7 @@ ix86_expand_int_addcc (rtx operands[])
      }
    else
      {
-      switch (GET_MODE (operands[0]))
+      switch (mode)
         {
           case QImode:
             insn = gen_addqi3_carry;
@@ -16453,7 +16548,7 @@ ix86_expand_int_addcc (rtx operands[])
             gcc_unreachable ();
         }
      }
-  emit_insn (insn (operands[0], operands[2], val, compare_op));
+  emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
  
    return 1; /* DONE */
  }
@@ -18986,7 +19081,6 @@ ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
                                gen_rtx_IF_THEN_ELSE (Pmode, tmp,
                                                      reg2,
                                                      out)));
-
      }
    else
      {
@@ -19013,8 +19107,9 @@ ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
    /* Avoid branch in fixing the byte.  */
    tmpreg = gen_lowpart (QImode, tmpreg);
    emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
-  cmp = gen_rtx_LTU (Pmode, gen_rtx_REG (CCmode, FLAGS_REG), const0_rtx);
-  emit_insn ((*ix86_gen_sub3_carry) (out, out, GEN_INT (3), cmp));
+  tmp = gen_rtx_REG (CCmode, FLAGS_REG);
+  cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
+  emit_insn ((*ix86_gen_sub3_carry) (out, out, GEN_INT (3), tmp, cmp));
  
    emit_label (end_0_label);
  }
@@ -20930,7 +21025,6 @@ enum ix86_builtins
    IX86_BUILTIN_EXTRACTF128SI256,
    IX86_BUILTIN_VZEROALL,
    IX86_BUILTIN_VZEROUPPER,
-  IX86_BUILTIN_VZEROUPPER_REX64,
    IX86_BUILTIN_VPERMILVARPD,
    IX86_BUILTIN_VPERMILVARPS,
    IX86_BUILTIN_VPERMILVARPD256,
@@ -20939,6 +21033,10 @@ enum ix86_builtins
    IX86_BUILTIN_VPERMILPS,
    IX86_BUILTIN_VPERMILPD256,
    IX86_BUILTIN_VPERMILPS256,
+  IX86_BUILTIN_VPERMIL2PD,
+  IX86_BUILTIN_VPERMIL2PS,
+  IX86_BUILTIN_VPERMIL2PD256,
+  IX86_BUILTIN_VPERMIL2PS256,
    IX86_BUILTIN_VPERM2F128PD256,
    IX86_BUILTIN_VPERM2F128PS256,
    IX86_BUILTIN_VPERM2F128SI256,
@@ -21026,6 +21124,19 @@ enum ix86_builtins
  
    IX86_BUILTIN_CVTUDQ2PS,
  
+  IX86_BUILTIN_VEC_PERM_V2DF,
+  IX86_BUILTIN_VEC_PERM_V4SF,
+  IX86_BUILTIN_VEC_PERM_V2DI,
+  IX86_BUILTIN_VEC_PERM_V4SI,
+  IX86_BUILTIN_VEC_PERM_V8HI,
+  IX86_BUILTIN_VEC_PERM_V16QI,
+  IX86_BUILTIN_VEC_PERM_V2DI_U,
+  IX86_BUILTIN_VEC_PERM_V4SI_U,
+  IX86_BUILTIN_VEC_PERM_V8HI_U,
+  IX86_BUILTIN_VEC_PERM_V16QI_U,
+  IX86_BUILTIN_VEC_PERM_V4DF,
+  IX86_BUILTIN_VEC_PERM_V8SF,
+
    /* FMA4 and XOP instructions.  */
    IX86_BUILTIN_VFMADDSS,
    IX86_BUILTIN_VFMADDSD,
@@ -21204,19 +21315,15 @@ enum ix86_builtins
    IX86_BUILTIN_VPCOMTRUEQ,
  
    /* LWP instructions.  */
-  IX86_BUILTIN_LLWPCB16,
-  IX86_BUILTIN_LLWPCB32,
-  IX86_BUILTIN_LLWPCB64,
-  IX86_BUILTIN_SLWPCB16,
-  IX86_BUILTIN_SLWPCB32,
-  IX86_BUILTIN_SLWPCB64,
-  IX86_BUILTIN_LWPVAL16,
+  IX86_BUILTIN_LLWPCB,
+  IX86_BUILTIN_SLWPCB,
    IX86_BUILTIN_LWPVAL32,
    IX86_BUILTIN_LWPVAL64,
-  IX86_BUILTIN_LWPINS16,
    IX86_BUILTIN_LWPINS32,
    IX86_BUILTIN_LWPINS64,
  
+  IX86_BUILTIN_CLZS,
+
    IX86_BUILTIN_MAX
  };
  
@@ -21454,14 +21561,13 @@ static const struct builtin_description bdesc_special_args[] =
  
    /* AVX */
    { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
-  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, 0, IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
-  { OPTION_MASK_ISA_AVX | OPTION_MASK_ISA_64BIT, CODE_FOR_avx_vzeroupper_rex64, 0, IX86_BUILTIN_VZEROUPPER_REX64, UNKNOWN, (int) VOID_FTYPE_VOID },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },
  
-  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastss, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
-  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastsd256, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
-  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastss256, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
-  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_pd256, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
-  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_ps256, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },
  
    { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
    { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
@@ -21484,20 +21590,12 @@ static const struct builtin_description bdesc_special_args[] =
    { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DF_V4DF },
    { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SF_V8SF },
  
-  { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcbhi1,   "__builtin_ia32_llwpcb16",   IX86_BUILTIN_LLWPCB16,    UNKNOWN,     (int) VOID_FTYPE_VOID },
-  { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcbsi1,   "__builtin_ia32_llwpcb32",   IX86_BUILTIN_LLWPCB32,    UNKNOWN,     (int) VOID_FTYPE_VOID },
-  { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcbdi1,   "__builtin_ia32_llwpcb64",   IX86_BUILTIN_LLWPCB64,    UNKNOWN,     (int) VOID_FTYPE_VOID },
-
-  { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcbhi1,   "__builtin_ia32_slwpcb16",   IX86_BUILTIN_SLWPCB16,    UNKNOWN,     (int) VOID_FTYPE_VOID },
-  { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcbsi1,   "__builtin_ia32_slwpcb32",   IX86_BUILTIN_SLWPCB32,    UNKNOWN,     (int) VOID_FTYPE_VOID },
-  { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcbdi1,   "__builtin_ia32_slwpcb64",   IX86_BUILTIN_SLWPCB64,    UNKNOWN,     (int) VOID_FTYPE_VOID },
-
-  { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalhi3,   "__builtin_ia32_lwpval16", IX86_BUILTIN_LWPVAL16,  UNKNOWN,     (int) VOID_FTYPE_USHORT_UINT_USHORT },
-  { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3,   "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL64,  UNKNOWN,     (int) VOID_FTYPE_UINT_UINT_UINT },
-  { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3,   "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64,  UNKNOWN,     (int) VOID_FTYPE_UINT64_UINT_UINT },
-  { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinshi3,   "__builtin_ia32_lwpins16", IX86_BUILTIN_LWPINS16,  UNKNOWN,     (int) UCHAR_FTYPE_USHORT_UINT_USHORT },
-  { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3,   "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS64,  UNKNOWN,     (int) UCHAR_FTYPE_UINT_UINT_UINT },
-  { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3,   "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64,  UNKNOWN,     (int) UCHAR_FTYPE_UINT64_UINT_UINT },
+  { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
+  { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
+  { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
+  { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
+  { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
+  { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
  
  };
  
@@ -21669,8 +21767,8 @@ static const struct builtin_description bdesc_args[] =
    { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss,  "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
    { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp,  "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
    { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp,  "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
-  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_unpckhps, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
-  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_unpcklps, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
  
    { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
    { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
@@ -21700,6 +21798,19 @@ static const struct builtin_description bdesc_args[] =
    /* SSE2 */
    { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
  
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v2df", IX86_BUILTIN_VEC_PERM_V2DF, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DI },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v4sf", IX86_BUILTIN_VEC_PERM_V4SF, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v2di", IX86_BUILTIN_VEC_PERM_V2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_V2DI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v4si", IX86_BUILTIN_VEC_PERM_V4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v8hi", IX86_BUILTIN_VEC_PERM_V8HI, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_V8HI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v16qi", IX86_BUILTIN_VEC_PERM_V16QI, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v2di_u", IX86_BUILTIN_VEC_PERM_V2DI_U, UNKNOWN, (int) V2UDI_FTYPE_V2UDI_V2UDI_V2UDI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v4si_u", IX86_BUILTIN_VEC_PERM_V4SI_U, UNKNOWN, (int) V4USI_FTYPE_V4USI_V4USI_V4USI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v8hi_u", IX86_BUILTIN_VEC_PERM_V8HI_U, UNKNOWN, (int) V8UHI_FTYPE_V8UHI_V8UHI_V8UHI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v16qi_u", IX86_BUILTIN_VEC_PERM_V16QI_U, UNKNOWN, (int) V16UQI_FTYPE_V16UQI_V16UQI_V16UQI },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v4df", IX86_BUILTIN_VEC_PERM_V4DF, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DI },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v8sf", IX86_BUILTIN_VEC_PERM_V8SF, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SI },
+
    { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF  },
    { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
    { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
@@ -21767,8 +21878,8 @@ static const struct builtin_description bdesc_args[] =
    { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3,  "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
  
    { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd,  "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_unpckhpd_exp, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_unpcklpd_exp, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
  
    { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
  
@@ -21813,14 +21924,14 @@ static const struct builtin_description bdesc_args[] =
    { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
    { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
  
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_punpckhbw, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_punpckhwd, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI  },
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_punpckhdq, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN,  (int) V4SI_FTYPE_V4SI_V4SI },
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_punpckhqdq, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_punpcklbw, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_punpcklwd, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_punpckldq, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_punpcklqdq, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI  },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN,  (int) V4SI_FTYPE_V4SI_V4SI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
  
    { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
    { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
@@ -21839,7 +21950,7 @@ static const struct builtin_description bdesc_args[] =
    { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
    { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },
  
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
    { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
    { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
    { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
@@ -21847,7 +21958,7 @@ static const struct builtin_description bdesc_args[] =
    { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
    { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },
  
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
    { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
    { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
    { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
@@ -22110,9 +22221,15 @@ static const struct builtin_description bdesc_args[] =
  
    { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF  },
    { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },
+
+  { OPTION_MASK_ISA_ABM, CODE_FOR_clzhi2_abm,   "__builtin_clzs",   IX86_BUILTIN_CLZS,    UNKNOWN,     (int) UINT16_FTYPE_UINT16 },
  };
  
  /* FMA4 and XOP.  */
+#define MULTI_ARG_4_DF2_DI_I   V2DF_FTYPE_V2DF_V2DF_V2DI_INT
+#define MULTI_ARG_4_DF2_DI_I1  V4DF_FTYPE_V4DF_V4DF_V4DI_INT
+#define MULTI_ARG_4_SF2_SI_I   V4SF_FTYPE_V4SF_V4SF_V4SI_INT
+#define MULTI_ARG_4_SF2_SI_I1  V8SF_FTYPE_V8SF_V8SF_V8SI_INT
  #define MULTI_ARG_3_SF         V4SF_FTYPE_V4SF_V4SF_V4SF
  #define MULTI_ARG_3_DF         V2DF_FTYPE_V2DF_V2DF_V2DF
  #define MULTI_ARG_3_SF2                V8SF_FTYPE_V8SF_V8SF_V8SF
@@ -22355,6 +22472,11 @@ static const struct builtin_description bdesc_multi_arg[] =
    { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3,      "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE,    (int)MULTI_ARG_2_SI_TF },
    { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3,      "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE,    (int)MULTI_ARG_2_DI_TF },
  
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3,     "__builtin_ia32_vpermil2pd",  IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3,     "__builtin_ia32_vpermil2ps",  IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3,     "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3,     "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
+
  };
  
  /* Set up all the MMX/SSE builtins, even builtins for instructions that are not
@@ -22470,12 +22592,6 @@ ix86_init_mmx_sse_builtins (void)
    def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
                      V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
  
-  /* AVX */
-  def_builtin (OPTION_MASK_ISA_AVX, "__builtin_ia32_vzeroupper",
-              VOID_FTYPE_VOID,
-              (TARGET_64BIT ? IX86_BUILTIN_VZEROUPPER_REX64
-               : IX86_BUILTIN_VZEROUPPER));
-
    /* MMX access to the vec_init patterns.  */
    def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
                      V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
@@ -22741,6 +22857,14 @@ ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
  
    switch (m_type)
      {
+    case MULTI_ARG_4_DF2_DI_I:
+    case MULTI_ARG_4_DF2_DI_I1:
+    case MULTI_ARG_4_SF2_SI_I:
+    case MULTI_ARG_4_SF2_SI_I1:
+      nargs = 4;
+      last_arg_constant = true;
+      break;
+
      case MULTI_ARG_3_SF:
      case MULTI_ARG_3_DF:
      case MULTI_ARG_3_SF2:
@@ -22884,6 +23008,10 @@ ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
        pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
        break;
  
+    case 4:
+      pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
+      break;
+
      default:
        gcc_unreachable ();
      }
@@ -23316,6 +23444,7 @@ ix86_expand_args_builtin (const struct builtin_description *d,
      case FLOAT_FTYPE_FLOAT:
      case INT_FTYPE_INT:
      case UINT64_FTYPE_INT:
+    case UINT16_FTYPE_UINT16:
      case INT64_FTYPE_INT64:
      case INT64_FTYPE_V4SF:
      case INT64_FTYPE_V2DF:
@@ -23448,7 +23577,7 @@ ix86_expand_args_builtin (const struct builtin_description *d,
        break;
      case V2DI_FTYPE_V2DI_INT_CONVERT:
        nargs = 2;
-      rmode = V2DImode;
+      rmode = V1TImode;
        nargs_constant = 1;
        break;
      case V8HI_FTYPE_V8HI_INT:
@@ -23501,6 +23630,13 @@ ix86_expand_args_builtin (const struct builtin_description *d,
        nargs = 3;
        nargs_constant = 2;
        break;
+    case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
+    case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
+    case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
+    case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
+      nargs = 4;
+      nargs_constant = 1;
+      break;
      case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
        nargs = 4;
        nargs_constant = 2;
@@ -23570,6 +23706,10 @@ ix86_expand_args_builtin (const struct builtin_description *d,
  
               case CODE_FOR_sse4_1_blendpd:
               case CODE_FOR_avx_vpermilv2df:
+             case CODE_FOR_xop_vpermil2v2df3:
+             case CODE_FOR_xop_vpermil2v4sf3:
+             case CODE_FOR_xop_vpermil2v4df3:
+             case CODE_FOR_xop_vpermil2v8sf3:
                 error ("the last argument must be a 2-bit immediate");
                 return const0_rtx;
  
@@ -23676,7 +23816,7 @@ ix86_expand_special_args_builtin (const struct builtin_description *d,
      {
        rtx op;
        enum machine_mode mode;
-    } args[2];
+    } args[3];
    enum insn_code icode = d->icode;
    bool last_arg_constant = false;
    const struct insn_data *insn_p = &insn_data[icode];
@@ -23703,6 +23843,7 @@ ix86_expand_special_args_builtin (const struct builtin_description *d,
      case V4DF_FTYPE_PCV2DF:
      case V4DF_FTYPE_PCDOUBLE:
      case V2DF_FTYPE_PCDOUBLE:
+    case VOID_FTYPE_PVOID:
        nargs = 1;
        klass = load;
        memory = 0;
@@ -23746,15 +23887,14 @@ ix86_expand_special_args_builtin (const struct builtin_description *d,
        /* Reserve memory operand for target.  */
        memory = ARRAY_SIZE (args);
        break;
-    case VOID_FTYPE_USHORT_UINT_USHORT:
      case VOID_FTYPE_UINT_UINT_UINT:
      case VOID_FTYPE_UINT64_UINT_UINT:
-    case UCHAR_FTYPE_USHORT_UINT_USHORT:
      case UCHAR_FTYPE_UINT_UINT_UINT:
      case UCHAR_FTYPE_UINT64_UINT_UINT:
        nargs = 3;
-      klass = store;
-      memory = 0;
+      klass = load;
+      memory = ARRAY_SIZE (args);
+      last_arg_constant = true;
        break;
      default:
        gcc_unreachable ();
@@ -23792,12 +23932,16 @@ ix86_expand_special_args_builtin (const struct builtin_description *d,
        if (last_arg_constant && (i + 1) == nargs)
         {
           if (!match)
-           switch (icode)
-             {
-            default:
+           {
+             if (icode == CODE_FOR_lwp_lwpvalsi3
+                 || icode == CODE_FOR_lwp_lwpinssi3
+                 || icode == CODE_FOR_lwp_lwpvaldi3
+                 || icode == CODE_FOR_lwp_lwpinsdi3)
+               error ("the last argument must be a 32-bit immediate");
+             else
                 error ("the last argument must be an 8-bit immediate");
-               return const0_rtx;
-             }
+             return const0_rtx;
+           }
         }
        else
         {
@@ -23835,6 +23979,9 @@ ix86_expand_special_args_builtin (const struct builtin_description *d,
      case 2:
        pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
        break;
+    case 3:
+      pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
+      break;
      default:
        gcc_unreachable ();
      }
@@ -24115,6 +24262,20 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
      case IX86_BUILTIN_VEC_SET_V16QI:
        return ix86_expand_vec_set_builtin (exp);
  
+    case IX86_BUILTIN_VEC_PERM_V2DF:
+    case IX86_BUILTIN_VEC_PERM_V4SF:
+    case IX86_BUILTIN_VEC_PERM_V2DI:
+    case IX86_BUILTIN_VEC_PERM_V4SI:
+    case IX86_BUILTIN_VEC_PERM_V8HI:
+    case IX86_BUILTIN_VEC_PERM_V16QI:
+    case IX86_BUILTIN_VEC_PERM_V2DI_U:
+    case IX86_BUILTIN_VEC_PERM_V4SI_U:
+    case IX86_BUILTIN_VEC_PERM_V8HI_U:
+    case IX86_BUILTIN_VEC_PERM_V16QI_U:
+    case IX86_BUILTIN_VEC_PERM_V4DF:
+    case IX86_BUILTIN_VEC_PERM_V8SF:
+      return ix86_expand_vec_perm_builtin (exp);
+
      case IX86_BUILTIN_INFQ:
      case IX86_BUILTIN_HUGE_VALQ:
        {
@@ -24133,6 +24294,23 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
         return target;
        }
  
+    case IX86_BUILTIN_LLWPCB:
+      arg0 = CALL_EXPR_ARG (exp, 0);
+      op0 = expand_normal (arg0);
+      icode = CODE_FOR_lwp_llwpcb;
+      if (! (*insn_data[icode].operand[0].predicate) (op0, Pmode))
+       op0 = copy_to_mode_reg (Pmode, op0);
+      emit_insn (gen_lwp_llwpcb (op0));
+      return 0;
+
+    case IX86_BUILTIN_SLWPCB:
+      icode = CODE_FOR_lwp_slwpcb;
+      if (!target
+         || ! (*insn_data[icode].operand[0].predicate) (target, Pmode))
+       target = gen_reg_rtx (Pmode);
+      emit_insn (gen_lwp_slwpcb (target));
+      return target;
+
      default:
        break;
      }
@@ -24188,14 +24366,16 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
     if it is not available.  */
  
  static tree
-ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
+ix86_builtin_vectorized_function (tree fndecl, tree type_out,
                                   tree type_in)
  {
    enum machine_mode in_mode, out_mode;
    int in_n, out_n;
+  enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
  
    if (TREE_CODE (type_out) != VECTOR_TYPE
-      || TREE_CODE (type_in) != VECTOR_TYPE)
+      || TREE_CODE (type_in) != VECTOR_TYPE
+      || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
      return NULL_TREE;
  
    out_mode = TYPE_MODE (TREE_TYPE (type_out));
@@ -24602,6 +24782,58 @@ avx_vpermilp_parallel (rtx par, enum machine_mode mode)
    /* Make sure success has a non-zero value by adding one.  */
    return mask + 1;
  }
+
+/* Helper for avx_vperm2f128_v4df_operand et al.  This is also used by
+   the expansion functions to turn the parallel back into a mask.
+   The return value is 0 for no match and the imm8+1 for a match.  */
+
+int
+avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
+{
+  unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
+  unsigned mask = 0;
+  unsigned char ipar[8];
+
+  if (XVECLEN (par, 0) != (int) nelt)
+    return 0;
+
+  /* Validate that all of the elements are constants, and not totally
+     out of range.  Copy the data into an integral array to make the
+     subsequent checks easier.  */
+  for (i = 0; i < nelt; ++i)
+    {
+      rtx er = XVECEXP (par, 0, i);
+      unsigned HOST_WIDE_INT ei;
+
+      if (!CONST_INT_P (er))
+       return 0;
+      ei = INTVAL (er);
+      if (ei >= 2 * nelt)
+       return 0;
+      ipar[i] = ei;
+    }
+
+  /* Validate that the halves of the permute are halves.  */
+  for (i = 0; i < nelt2 - 1; ++i)
+    if (ipar[i] + 1 != ipar[i + 1])
+      return 0;
+  for (i = nelt2; i < nelt - 1; ++i)
+    if (ipar[i] + 1 != ipar[i + 1])
+      return 0;
+
+  /* Reconstruct the mask.  */
+  for (i = 0; i < 2; ++i)
+    {
+      unsigned e = ipar[i * nelt2];
+      if (e % nelt2)
+       return 0;
+      e /= nelt2;
+      mask |= e << (i * 4);
+    }
+
+  /* Make sure success has a non-zero value by adding one.  */
+  return mask + 1;
+}
  \f
  
  /* Store OPERAND to the memory after reload is completed.  This means
@@ -25617,6 +25849,16 @@ ix86_rtx_costs (rtx x, int code, int outer_code_i, int *total, bool speed)
         *total = 0;
        return false;
  
+    case VEC_SELECT:
+    case VEC_CONCAT:
+    case VEC_MERGE:
+    case VEC_DUPLICATE:
+      /* ??? Assume all of these vector manipulation patterns are
+        recognizable.  In which case they all pretty much have the
+        same cost.  */
+     *total = COSTS_N_INSNS (1);
+     return true;
+
      default:
        return false;
      }
@@ -25686,13 +25928,6 @@ machopic_output_stub (FILE *file, const char *symb, const char *stub)
    fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
    fprintf (file, ASM_LONG "%s\n", binder_name);
  }
-
-void
-darwin_x86_file_end (void)
-{
-  darwin_file_end ();
-  ix86_file_end ();
-}
  #endif /* TARGET_MACHO */
  
  /* Order the registers for register allocator.  */
@@ -25939,7 +26174,7 @@ x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
     *(*this + vcall_offset) should be added to THIS.  */
  
  static void
-x86_output_mi_thunk (FILE *file ATTRIBUTE_UNUSED,
+x86_output_mi_thunk (FILE *file,
                      tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
                      HOST_WIDE_INT vcall_offset, tree function)
  {
@@ -25947,6 +26182,9 @@ x86_output_mi_thunk (FILE *file ATTRIBUTE_UNUSED,
    rtx this_param = x86_this_parameter (function);
    rtx this_reg, tmp;
  
+  /* Make sure unwind info is emitted for the thunk if needed.  */
+  final_start_function (emit_barrier (), file, 1);
+
    /* If VCALL_OFFSET, we'll need THIS in a register.  Might as well
       pull it in now and let DELTA benefit.  */
    if (REG_P (this_param))
@@ -25976,8 +26214,13 @@ x86_output_mi_thunk (FILE *file ATTRIBUTE_UNUSED,
               xops[0] = tmp;
               xops[1] = this_param;
             }
-         output_asm_insn ("add{q}\t{%0, %1|%1, %0}", xops);
+         if (x86_maybe_negate_const_int (&xops[0], DImode))
+           output_asm_insn ("sub{q}\t{%0, %1|%1, %0}", xops);
+         else
+           output_asm_insn ("add{q}\t{%0, %1|%1, %0}", xops);
         }
+      else if (x86_maybe_negate_const_int (&xops[0], SImode))
+       output_asm_insn ("sub{l}\t{%0, %1|%1, %0}", xops);
        else
         output_asm_insn ("add{l}\t{%0, %1|%1, %0}", xops);
      }
@@ -26067,6 +26310,7 @@ x86_output_mi_thunk (FILE *file ATTRIBUTE_UNUSED,
           output_asm_insn ("jmp\t{*}%1", xops);
         }
      }
+  final_end_function ();
  }
  
  static void
@@ -26108,7 +26352,7 @@ x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
    if (TARGET_64BIT)
      {
  #ifndef NO_PROFILE_COUNTERS
-      fprintf (file, "\tleaq\t" LPREFIX "P%d@(%%rip),%%r11\n", labelno);
+      fprintf (file, "\tleaq\t" LPREFIX "P%d(%%rip),%%r11\n", labelno);
  #endif
  
        if (DEFAULT_ABI == SYSV_ABI && flag_pic)
@@ -26402,26 +26646,72 @@ x86_extended_reg_mentioned_p (rtx insn)
                        extended_reg_mentioned_1, NULL);
  }
  
-/* Generate an unsigned DImode/SImode to FP conversion.  This is the same code
-   optabs would emit if we didn't have TFmode patterns.  */
-
-void
-x86_emit_floatuns (rtx operands[2])
+/* If profitable, negate (without causing overflow) integer constant
+   of mode MODE at location LOC.  Return true in this case.  */
+bool
+x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
  {
-  rtx neglab, donelab, i0, i1, f0, in, out;
-  enum machine_mode mode, inmode;
-
-  inmode = GET_MODE (operands[1]);
-  gcc_assert (inmode == SImode || inmode == DImode);
+  HOST_WIDE_INT val;
  
-  out = operands[0];
-  in = force_reg (inmode, operands[1]);
-  mode = GET_MODE (out);
-  neglab = gen_label_rtx ();
-  donelab = gen_label_rtx ();
-  f0 = gen_reg_rtx (mode);
+  if (!CONST_INT_P (*loc))
+    return false;
  
-  emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
+  switch (mode)
+    {
+    case DImode:
+      /* DImode x86_64 constants must fit in 32 bits.  */
+      gcc_assert (x86_64_immediate_operand (*loc, mode));
+
+      mode = SImode;
+      break;
+
+    case SImode:
+    case HImode:
+    case QImode:
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  /* Avoid overflows.  */
+  if (mode_signbit_p (mode, *loc))
+    return false;
+
+  val = INTVAL (*loc);
+
+  /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'.
+     Exceptions: -128 encodes smaller than 128, so swap sign and op.  */
+  if ((val < 0 && val != -128)
+      || val == 128)
+    {
+      *loc = GEN_INT (-val);
+      return true;
+    }
+
+  return false;
+}
+
+/* Generate an unsigned DImode/SImode to FP conversion.  This is the same code
+   optabs would emit if we didn't have TFmode patterns.  */
+
+void
+x86_emit_floatuns (rtx operands[2])
+{
+  rtx neglab, donelab, i0, i1, f0, in, out;
+  enum machine_mode mode, inmode;
+
+  inmode = GET_MODE (operands[1]);
+  gcc_assert (inmode == SImode || inmode == DImode);
+
+  out = operands[0];
+  in = force_reg (inmode, operands[1]);
+  mode = GET_MODE (out);
+  neglab = gen_label_rtx ();
+  donelab = gen_label_rtx ();
+  f0 = gen_reg_rtx (mode);
+
+  emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
  
    expand_float (out, in, 0);
  
@@ -26443,6 +26733,35 @@ x86_emit_floatuns (rtx operands[2])
    emit_label (donelab);
  }
  \f
+/* AVX does not support 32-byte integer vector operations,
+   thus the longest vector we are faced with is V16QImode.  */
+#define MAX_VECT_LEN   16
+
+struct expand_vec_perm_d
+{
+  rtx target, op0, op1;
+  unsigned char perm[MAX_VECT_LEN];
+  enum machine_mode vmode;
+  unsigned char nelt;
+  bool testing_p;
+};
+
+static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
+static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
+
+/* Get a vector mode of the same size as the original but with elements
+   twice as wide.  This is only guaranteed to apply to integral vectors.  */
+
+static inline enum machine_mode
+get_mode_wider_vector (enum machine_mode o)
+{
+  /* ??? Rely on the ordering that genmodes.c gives to vectors.  */
+  enum machine_mode n = GET_MODE_WIDER_MODE (o);
+  gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
+  gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
+  return n;
+}
+
  /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
     with all elements equal to VAR.  Return true if successful.  */
  
@@ -26450,8 +26769,7 @@ static bool
  ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
                                    rtx target, rtx val)
  {
-  enum machine_mode hmode, smode, wsmode, wvmode;
-  rtx x;
+  bool ok;
  
    switch (mode)
      {
@@ -26461,13 +26779,36 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
         return false;
        /* FALLTHRU */
  
+    case V4DFmode:
+    case V4DImode:
+    case V8SFmode:
+    case V8SImode:
      case V2DFmode:
      case V2DImode:
      case V4SFmode:
      case V4SImode:
-      val = force_reg (GET_MODE_INNER (mode), val);
-      x = gen_rtx_VEC_DUPLICATE (mode, val);
-      emit_insn (gen_rtx_SET (VOIDmode, target, x));
+      {
+       rtx insn, dup;
+
+       /* First attempt to recognize VAL as-is.  */
+       dup = gen_rtx_VEC_DUPLICATE (mode, val);
+       insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
+       if (recog_memoized (insn) < 0)
+         {
+           rtx seq;
+           /* If that fails, force VAL into a register.  */
+
+           start_sequence ();
+           XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
+           seq = get_insns ();
+           end_sequence ();
+           if (seq)
+             emit_insn_before (seq, insn);
+
+           ok = recog_memoized (insn) >= 0;
+           gcc_assert (ok);
+         }
+      }
        return true;
  
      case V4HImode:
@@ -26475,130 +26816,87 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
         return false;
        if (TARGET_SSE || TARGET_3DNOW_A)
         {
+         rtx x;
+
           val = gen_lowpart (SImode, val);
           x = gen_rtx_TRUNCATE (HImode, val);
           x = gen_rtx_VEC_DUPLICATE (mode, x);
           emit_insn (gen_rtx_SET (VOIDmode, target, x));
           return true;
         }
-      else
-       {
-         smode = HImode;
-         wsmode = SImode;
-         wvmode = V2SImode;
-         goto widen;
-       }
+      goto widen;
  
      case V8QImode:
        if (!mmx_ok)
         return false;
-      smode = QImode;
-      wsmode = HImode;
-      wvmode = V4HImode;
        goto widen;
+
      case V8HImode:
        if (TARGET_SSE2)
         {
+         struct expand_vec_perm_d dperm;
           rtx tmp1, tmp2;
-         /* Extend HImode to SImode using a paradoxical SUBREG.  */
+
+       permute:
+         memset (&dperm, 0, sizeof (dperm));
+         dperm.target = target;
+         dperm.vmode = mode;
+         dperm.nelt = GET_MODE_NUNITS (mode);
+         dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
+
+         /* Extend to SImode using a paradoxical SUBREG.  */
           tmp1 = gen_reg_rtx (SImode);
           emit_move_insn (tmp1, gen_lowpart (SImode, val));
-         /* Insert the SImode value as low element of V4SImode vector. */
-         tmp2 = gen_reg_rtx (V4SImode);
-         tmp1 = gen_rtx_VEC_MERGE (V4SImode,
-                                   gen_rtx_VEC_DUPLICATE (V4SImode, tmp1),
-                                   CONST0_RTX (V4SImode),
-                                   const1_rtx);
-         emit_insn (gen_rtx_SET (VOIDmode, tmp2, tmp1));
-         /* Cast the V4SImode vector back to a V8HImode vector.  */
-         tmp1 = gen_reg_rtx (V8HImode);
-         emit_move_insn (tmp1, gen_lowpart (V8HImode, tmp2));
-         /* Duplicate the low short through the whole low SImode word.  */
-         emit_insn (gen_sse2_punpcklwd (tmp1, tmp1, tmp1));
-         /* Cast the V8HImode vector back to a V4SImode vector.  */
-         tmp2 = gen_reg_rtx (V4SImode);
-         emit_move_insn (tmp2, gen_lowpart (V4SImode, tmp1));
-         /* Replicate the low element of the V4SImode vector.  */
-         emit_insn (gen_sse2_pshufd (tmp2, tmp2, const0_rtx));
-         /* Cast the V2SImode back to V8HImode, and store in target.  */
-         emit_move_insn (target, gen_lowpart (V8HImode, tmp2));
-         return true;
+
+         /* Insert the SImode value as low element of a V4SImode vector. */
+         tmp2 = gen_lowpart (V4SImode, dperm.op0);
+         emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
+
+         ok = (expand_vec_perm_1 (&dperm)
+               || expand_vec_perm_broadcast_1 (&dperm));
+         gcc_assert (ok);
+         return ok;
         }
-      smode = HImode;
-      wsmode = SImode;
-      wvmode = V4SImode;
        goto widen;
+
      case V16QImode:
        if (TARGET_SSE2)
-       {
-         rtx tmp1, tmp2;
-         /* Extend QImode to SImode using a paradoxical SUBREG.  */
-         tmp1 = gen_reg_rtx (SImode);
-         emit_move_insn (tmp1, gen_lowpart (SImode, val));
-         /* Insert the SImode value as low element of V4SImode vector. */
-         tmp2 = gen_reg_rtx (V4SImode);
-         tmp1 = gen_rtx_VEC_MERGE (V4SImode,
-                                   gen_rtx_VEC_DUPLICATE (V4SImode, tmp1),
-                                   CONST0_RTX (V4SImode),
-                                   const1_rtx);
-         emit_insn (gen_rtx_SET (VOIDmode, tmp2, tmp1));
-         /* Cast the V4SImode vector back to a V16QImode vector.  */
-         tmp1 = gen_reg_rtx (V16QImode);
-         emit_move_insn (tmp1, gen_lowpart (V16QImode, tmp2));
-         /* Duplicate the low byte through the whole low SImode word.  */
-         emit_insn (gen_sse2_punpcklbw (tmp1, tmp1, tmp1));
-         emit_insn (gen_sse2_punpcklbw (tmp1, tmp1, tmp1));
-         /* Cast the V16QImode vector back to a V4SImode vector.  */
-         tmp2 = gen_reg_rtx (V4SImode);
-         emit_move_insn (tmp2, gen_lowpart (V4SImode, tmp1));
-         /* Replicate the low element of the V4SImode vector.  */
-         emit_insn (gen_sse2_pshufd (tmp2, tmp2, const0_rtx));
-         /* Cast the V2SImode back to V16QImode, and store in target.  */
-         emit_move_insn (target, gen_lowpart (V16QImode, tmp2));
-         return true;
-       }
-      smode = QImode;
-      wsmode = HImode;
-      wvmode = V8HImode;
+       goto permute;
        goto widen;
+
      widen:
        /* Replicate the value once into the next wider mode and recurse.  */
-      val = convert_modes (wsmode, smode, val, true);
-      x = expand_simple_binop (wsmode, ASHIFT, val,
-                              GEN_INT (GET_MODE_BITSIZE (smode)),
-                              NULL_RTX, 1, OPTAB_LIB_WIDEN);
-      val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
-
-      x = gen_reg_rtx (wvmode);
-      if (!ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val))
-       gcc_unreachable ();
-      emit_move_insn (target, gen_lowpart (mode, x));
-      return true;
+      {
+       enum machine_mode smode, wsmode, wvmode;
+       rtx x;
+
+       smode = GET_MODE_INNER (mode);
+       wvmode = get_mode_wider_vector (mode);
+       wsmode = GET_MODE_INNER (wvmode);
+
+       val = convert_modes (wsmode, smode, val, true);
+       x = expand_simple_binop (wsmode, ASHIFT, val,
+                                GEN_INT (GET_MODE_BITSIZE (smode)),
+                                NULL_RTX, 1, OPTAB_LIB_WIDEN);
+       val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
+
+       x = gen_lowpart (wvmode, target);
+       ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
+       gcc_assert (ok);
+       return ok;
+      }
  
-    case V4DFmode:
-      hmode = V2DFmode;
-      goto half;
-    case V4DImode:
-      hmode = V2DImode;
-      goto half;
-    case V8SFmode:
-      hmode = V4SFmode;
-      goto half;
-    case V8SImode:
-      hmode = V4SImode;
-      goto half;
      case V16HImode:
-      hmode = V8HImode;
-      goto half;
      case V32QImode:
-      hmode = V16QImode;
-      goto half;
-half:
        {
-       rtx tmp = gen_reg_rtx (hmode);
-       ix86_expand_vector_init_duplicate (mmx_ok, hmode, tmp, val);
-       emit_insn (gen_rtx_SET (VOIDmode, target,
-                               gen_rtx_VEC_CONCAT (mode, tmp, tmp)));
+       enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
+       rtx x = gen_reg_rtx (hvmode);
+
+       ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
+       gcc_assert (ok);
+
+       x = gen_rtx_VEC_CONCAT (mode, x, x);
+       emit_insn (gen_rtx_SET (VOIDmode, target, x));
        }
        return true;
  
@@ -27379,7 +27677,7 @@ ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
           /* tmp = target = A B C D */
           tmp = copy_to_reg (target);
           /* target = A A B B */
-         emit_insn (gen_sse_unpcklps (target, target, target));
+         emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
           /* target = X A B B */
           ix86_expand_vector_set (false, target, val, 0);
           /* target = A X C D  */
@@ -27589,7 +27887,7 @@ ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
  
         case 2:
           tmp = gen_reg_rtx (mode);
-         emit_insn (gen_sse_unpckhps (tmp, vec, vec));
+         emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
           break;
  
         default:
@@ -27623,7 +27921,7 @@ ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
  
             case 2:
               tmp = gen_reg_rtx (mode);
-             emit_insn (gen_sse2_punpckhdq (tmp, vec, vec));
+             emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
               break;
  
             default:
@@ -28665,199 +28963,6 @@ ix86_expand_round (rtx operand0, rtx operand1)
    emit_move_insn (operand0, res);
  }
  \f
-/* Validate whether a FMA4 instruction is valid or not.
-   OPERANDS is the array of operands.
-   NUM is the number of operands.
-   USES_OC0 is true if the instruction uses OC0 and provides 4 variants.
-   NUM_MEMORY is the maximum number of memory operands to accept.
-   NUM_MEMORY less than zero is a special case to allow an operand
-   of an instruction to be memory operation.
-   when COMMUTATIVE is set, operand 1 and 2 can be swapped.  */
-
-bool
-ix86_fma4_valid_op_p (rtx operands[], rtx insn ATTRIBUTE_UNUSED, int num,
-                     bool uses_oc0, int num_memory, bool commutative)
-{
-  int mem_mask;
-  int mem_count;
-  int i;
-
-  /* Count the number of memory arguments */
-  mem_mask = 0;
-  mem_count = 0;
-  for (i = 0; i < num; i++)
-    {
-      enum machine_mode mode = GET_MODE (operands[i]);
-      if (register_operand (operands[i], mode))
-       ;
-
-      else if (memory_operand (operands[i], mode))
-       {
-         mem_mask |= (1 << i);
-         mem_count++;
-       }
-
-      else
-       {
-         rtx pattern = PATTERN (insn);
-
-         /* allow 0 for pcmov */
-         if (GET_CODE (pattern) != SET
-             || GET_CODE (SET_SRC (pattern)) != IF_THEN_ELSE
-             || i < 2
-             || operands[i] != CONST0_RTX (mode))
-           return false;
-       }
-    }
-
-  /* Special case pmacsdq{l,h} where we allow the 3rd argument to be
-     a memory operation.  */
-  if (num_memory < 0)
-    {
-      num_memory = -num_memory;
-      if ((mem_mask & (1 << (num-1))) != 0)
-       {
-         mem_mask &= ~(1 << (num-1));
-         mem_count--;
-       }
-    }
-
-  /* If there were no memory operations, allow the insn */
-  if (mem_mask == 0)
-    return true;
-
-  /* Do not allow the destination register to be a memory operand.  */
-  else if (mem_mask & (1 << 0))
-    return false;
-
-  /* If there are too many memory operations, disallow the instruction.  While
-     the hardware only allows 1 memory reference, before register allocation
-     for some insns, we allow two memory operations sometimes in order to allow
-     code like the following to be optimized:
-
-       float fmadd (float *a, float *b, float *c) { return (*a * *b) + *c; }
-
-    or similar cases that are vectorized into using the vfmaddss
-    instruction.  */
-  else if (mem_count > num_memory)
-    return false;
-
-  /* Don't allow more than one memory operation if not optimizing.  */
-  else if (mem_count > 1 && !optimize)
-    return false;
-
-  else if (num == 4 && mem_count == 1)
-    {
-      /* formats (destination is the first argument), example vfmaddss:
-        xmm1, xmm1, xmm2, xmm3/mem
-        xmm1, xmm1, xmm2/mem, xmm3
-        xmm1, xmm2, xmm3/mem, xmm1
-        xmm1, xmm2/mem, xmm3, xmm1 */
-      if (uses_oc0)
-       return ((mem_mask == (1 << 1))
-               || (mem_mask == (1 << 2))
-               || (mem_mask == (1 << 3)));
-
-      /* format, example vpmacsdd:
-        xmm1, xmm2, xmm3/mem, xmm1 */
-      if (commutative)
-       return (mem_mask == (1 << 2) || mem_mask == (1 << 1));
-      else
-       return (mem_mask == (1 << 2));
-    }
-
-  else if (num == 4 && num_memory == 2)
-    {
-      /* If there are two memory operations, we can load one of the memory ops
-        into the destination register.  This is for optimizing the
-        multiply/add ops, which the combiner has optimized both the multiply
-        and the add insns to have a memory operation.  We have to be careful
-        that the destination doesn't overlap with the inputs.  */
-      rtx op0 = operands[0];
-
-      if (reg_mentioned_p (op0, operands[1])
-         || reg_mentioned_p (op0, operands[2])
-         || reg_mentioned_p (op0, operands[3]))
-       return false;
-
-      /* formats (destination is the first argument), example vfmaddss:
-        xmm1, xmm1, xmm2, xmm3/mem
-        xmm1, xmm1, xmm2/mem, xmm3
-        xmm1, xmm2, xmm3/mem, xmm1
-        xmm1, xmm2/mem, xmm3, xmm1
-
-         For the oc0 case, we will load either operands[1] or operands[3] into
-         operands[0], so any combination of 2 memory operands is ok.  */
-      if (uses_oc0)
-       return true;
-
-      /* format, example vpmacsdd:
-        xmm1, xmm2, xmm3/mem, xmm1
-
-         For the integer multiply/add instructions be more restrictive and
-         require operands[2] and operands[3] to be the memory operands.  */
-      if (commutative)
-       return (mem_mask == ((1 << 1) | (1 << 3)) || ((1 << 2) | (1 << 3)));
-      else
-       return (mem_mask == ((1 << 2) | (1 << 3)));
-    }
-
-  else if (num == 3 && num_memory == 1)
-    {
-      /* formats, example vprotb:
-        xmm1, xmm2, xmm3/mem
-        xmm1, xmm2/mem, xmm3 */
-      if (uses_oc0)
-       return ((mem_mask == (1 << 1)) || (mem_mask == (1 << 2)));
-
-      /* format, example vpcomeq:
-        xmm1, xmm2, xmm3/mem */
-      else
-       return (mem_mask == (1 << 2));
-    }
-
-  else
-    gcc_unreachable ();
-
-  return false;
-}
-
-
-/* Fixup an FMA4 instruction that has 2 memory input references into a form the
-   hardware will allow by using the destination register to load one of the
-   memory operations.  Presently this is used by the multiply/add routines to
-   allow 2 memory references.  */
-
-void
-ix86_expand_fma4_multiple_memory (rtx operands[],
-                                 int num,
-                                 enum machine_mode mode)
-{
-  rtx op0 = operands[0];
-  if (num != 4
-      || memory_operand (op0, mode)
-      || reg_mentioned_p (op0, operands[1])
-      || reg_mentioned_p (op0, operands[2])
-      || reg_mentioned_p (op0, operands[3]))
-    gcc_unreachable ();
-
-  /* For 2 memory operands, pick either operands[1] or operands[3] to move into
-     the destination register.  */
-  if (memory_operand (operands[1], mode))
-    {
-      emit_move_insn (op0, operands[1]);
-      operands[1] = op0;
-    }
-  else if (memory_operand (operands[3], mode))
-    {
-      emit_move_insn (op0, operands[3]);
-      operands[3] = op0;
-    }
-  else
-    gcc_unreachable ();
-
-  return;
-}
  
  /* Table of valid machine attributes.  */
  static const struct attribute_spec ix86_attribute_table[] =
@@ -28900,7 +29005,7 @@ static const struct attribute_spec ix86_attribute_table[] =
  
  /* Implement targetm.vectorize.builtin_vectorization_cost.  */
  static int
-x86_builtin_vectorization_cost (bool runtime_test)
+ix86_builtin_vectorization_cost (bool runtime_test)
  {
    /* If the branch of the runtime test is taken - i.e. - the vectorized
       version is skipped - this incurs a misprediction cost (because the
@@ -28922,61 +29027,1239 @@ x86_builtin_vectorization_cost (bool runtime_test)
      return 0;
  }
  
-/* This function returns the calling abi specific va_list type node.
-   It returns  the FNDECL specific va_list type.  */
+/* Implement targetm.vectorize.builtin_vec_perm.  */
  
-tree
-ix86_fn_abi_va_list (tree fndecl)
+static tree
+ix86_vectorize_builtin_vec_perm (tree vec_type, tree *mask_type)
  {
-  if (!TARGET_64BIT)
-    return va_list_type_node;
-  gcc_assert (fndecl != NULL_TREE);
+  tree itype = TREE_TYPE (vec_type);
+  bool u = TYPE_UNSIGNED (itype);
+  enum machine_mode vmode = TYPE_MODE (vec_type);
+  enum ix86_builtins fcode = fcode; /* Silence bogus warning.  */
+  bool ok = TARGET_SSE2;
  
-  if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
-    return ms_va_list_type_node;
-  else
-    return sysv_va_list_type_node;
+  switch (vmode)
+    {
+    case V4DFmode:
+      ok = TARGET_AVX;
+      fcode = IX86_BUILTIN_VEC_PERM_V4DF;
+      goto get_di;
+    case V2DFmode:
+      fcode = IX86_BUILTIN_VEC_PERM_V2DF;
+    get_di:
+      itype = ix86_get_builtin_type (IX86_BT_DI);
+      break;
+
+    case V8SFmode:
+      ok = TARGET_AVX;
+      fcode = IX86_BUILTIN_VEC_PERM_V8SF;
+      goto get_si;
+    case V4SFmode:
+      ok = TARGET_SSE;
+      fcode = IX86_BUILTIN_VEC_PERM_V4SF;
+    get_si:
+      itype = ix86_get_builtin_type (IX86_BT_SI);
+      break;
+
+    case V2DImode:
+      fcode = u ? IX86_BUILTIN_VEC_PERM_V2DI_U : IX86_BUILTIN_VEC_PERM_V2DI;
+      break;
+    case V4SImode:
+      fcode = u ? IX86_BUILTIN_VEC_PERM_V4SI_U : IX86_BUILTIN_VEC_PERM_V4SI;
+      break;
+    case V8HImode:
+      fcode = u ? IX86_BUILTIN_VEC_PERM_V8HI_U : IX86_BUILTIN_VEC_PERM_V8HI;
+      break;
+    case V16QImode:
+      fcode = u ? IX86_BUILTIN_VEC_PERM_V16QI_U : IX86_BUILTIN_VEC_PERM_V16QI;
+      break;
+    default:
+      ok = false;
+      break;
+    }
+
+  if (!ok)
+    return NULL_TREE;
+
+  *mask_type = itype;
+  return ix86_builtins[(int) fcode];
  }
  
-/* Returns the canonical va_list type specified by TYPE. If there
-   is no valid TYPE provided, it return NULL_TREE.  */
+/* Return a vector mode with twice as many elements as VMODE.  */
+/* ??? Consider moving this to a table generated by genmodes.c.  */
  
-tree
-ix86_canonical_va_list_type (tree type)
+static enum machine_mode
+doublesize_vector_mode (enum machine_mode vmode)
+{
+  switch (vmode)
+    {
+    case V2SFmode:     return V4SFmode;
+    case V1DImode:     return V2DImode;
+    case V2SImode:     return V4SImode;
+    case V4HImode:     return V8HImode;
+    case V8QImode:     return V16QImode;
+
+    case V2DFmode:     return V4DFmode;
+    case V4SFmode:     return V8SFmode;
+    case V2DImode:     return V4DImode;
+    case V4SImode:     return V8SImode;
+    case V8HImode:     return V16HImode;
+    case V16QImode:    return V32QImode;
+
+    case V4DFmode:     return V8DFmode;
+    case V8SFmode:     return V16SFmode;
+    case V4DImode:     return V8DImode;
+    case V8SImode:     return V16SImode;
+    case V16HImode:    return V32HImode;
+    case V32QImode:    return V64QImode;
+
+    default:
+      gcc_unreachable ();
+    }
+}
+
+/* Construct (set target (vec_select op0 (parallel perm))) and
+   return true if that's a valid instruction in the active ISA.  */
+
+static bool
+expand_vselect (rtx target, rtx op0, const unsigned char *perm, unsigned nelt)
  {
-  tree wtype, htype;
+  rtx rperm[MAX_VECT_LEN], x;
+  unsigned i;
  
-  /* Resolve references and pointers to va_list type.  */
-  if (INDIRECT_REF_P (type))
-    type = TREE_TYPE (type);
-  else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
-    type = TREE_TYPE (type);
+  for (i = 0; i < nelt; ++i)
+    rperm[i] = GEN_INT (perm[i]);
  
-  if (TARGET_64BIT)
+  x = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (nelt, rperm));
+  x = gen_rtx_VEC_SELECT (GET_MODE (target), op0, x);
+  x = gen_rtx_SET (VOIDmode, target, x);
+
+  x = emit_insn (x);
+  if (recog_memoized (x) < 0)
      {
-      wtype = va_list_type_node;
-         gcc_assert (wtype != NULL_TREE);
-      htype = type;
-      if (TREE_CODE (wtype) == ARRAY_TYPE)
-       {
-         /* If va_list is an array type, the argument may have decayed
-            to a pointer type, e.g. by being passed to another function.
-            In that case, unwrap both types so that we can compare the
-            underlying records.  */
-         if (TREE_CODE (htype) == ARRAY_TYPE
-             || POINTER_TYPE_P (htype))
-           {
-             wtype = TREE_TYPE (wtype);
-             htype = TREE_TYPE (htype);
-           }
-       }
-      if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
-       return va_list_type_node;
-      wtype = sysv_va_list_type_node;
-         gcc_assert (wtype != NULL_TREE);
-      htype = type;
-      if (TREE_CODE (wtype) == ARRAY_TYPE)
-       {
+      remove_insn (x);
+      return false;
+    }
+  return true;
+}
+
+/* Similar, but generate a vec_concat from op0 and op1 as well.  */
+
+static bool
+expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
+                       const unsigned char *perm, unsigned nelt)
+{
+  enum machine_mode v2mode;
+  rtx x;
+
+  v2mode = doublesize_vector_mode (GET_MODE (op0));
+  x = gen_rtx_VEC_CONCAT (v2mode, op0, op1);
+  return expand_vselect (target, x, perm, nelt);
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
+   in terms of blendp[sd] / pblendw / pblendvb.  */
+
+static bool
+expand_vec_perm_blend (struct expand_vec_perm_d *d)
+{
+  enum machine_mode vmode = d->vmode;
+  unsigned i, mask, nelt = d->nelt;
+  rtx target, op0, op1, x;
+
+  if (!TARGET_SSE4_1 || d->op0 == d->op1)
+    return false;
+  if (!(GET_MODE_SIZE (vmode) == 16 || vmode == V4DFmode || vmode == V8SFmode))
+    return false;
+
+  /* This is a blend, not a permute.  Elements must stay in their
+     respective lanes.  */
+  for (i = 0; i < nelt; ++i)
+    {
+      unsigned e = d->perm[i];
+      if (!(e == i || e == i + nelt))
+       return false;
+    }
+
+  if (d->testing_p)
+    return true;
+
+  /* ??? Without SSE4.1, we could implement this with and/andn/or.  This
+     decision should be extracted elsewhere, so that we only try that
+     sequence once all budget==3 options have been tried.  */
+
+  /* For bytes, see if bytes move in pairs so we can use pblendw with
+     an immediate argument, rather than pblendvb with a vector argument.  */
+  if (vmode == V16QImode)
+    {
+      bool pblendw_ok = true;
+      for (i = 0; i < 16 && pblendw_ok; i += 2)
+       pblendw_ok = (d->perm[i] + 1 == d->perm[i + 1]);
+
+      if (!pblendw_ok)
+       {
+         rtx rperm[16], vperm;
+
+         for (i = 0; i < nelt; ++i)
+           rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
+
+         vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm));
+         vperm = force_reg (V16QImode, vperm);
+
+         emit_insn (gen_sse4_1_pblendvb (d->target, d->op0, d->op1, vperm));
+         return true;
+       }
+    }
+
+  target = d->target;
+  op0 = d->op0;
+  op1 = d->op1;
+  mask = 0;
+
+  switch (vmode)
+    {
+    case V4DFmode:
+    case V8SFmode:
+    case V2DFmode:
+    case V4SFmode:
+    case V8HImode:
+      for (i = 0; i < nelt; ++i)
+       mask |= (d->perm[i] >= nelt) << i;
+      break;
+
+    case V2DImode:
+      for (i = 0; i < 2; ++i)
+       mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
+      goto do_subreg;
+
+    case V4SImode:
+      for (i = 0; i < 4; ++i)
+       mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
+      goto do_subreg;
+
+    case V16QImode:
+      for (i = 0; i < 8; ++i)
+       mask |= (d->perm[i * 2] >= 16) << i;
+
+    do_subreg:
+      vmode = V8HImode;
+      target = gen_lowpart (vmode, target);
+      op0 = gen_lowpart (vmode, op0);
+      op1 = gen_lowpart (vmode, op1);
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  /* This matches five different patterns with the different modes.  */
+  x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
+  x = gen_rtx_SET (VOIDmode, target, x);
+  emit_insn (x);
+
+  return true;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
+   in terms of the variable form of vpermilps.
+
+   Note that we will have already failed the immediate input vpermilps,
+   which requires that the high and low part shuffle be identical; the
+   variable form doesn't require that.  */
+
+static bool
+expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
+{
+  rtx rperm[8], vperm;
+  unsigned i;
+
+  if (!TARGET_AVX || d->vmode != V8SFmode || d->op0 != d->op1)
+    return false;
+
+  /* We can only permute within the 128-bit lane.  */
+  for (i = 0; i < 8; ++i)
+    {
+      unsigned e = d->perm[i];
+      if (i < 4 ? e >= 4 : e < 4)
+       return false;
+    }
+
+  if (d->testing_p)
+    return true;
+
+  for (i = 0; i < 8; ++i)
+    {
+      unsigned e = d->perm[i];
+
+      /* Within each 128-bit lane, the elements of op0 are numbered
+        from 0 and the elements of op1 are numbered from 4.  */
+      if (e >= 8 + 4)
+       e -= 8;
+      else if (e >= 4)
+       e -= 4;
+
+      rperm[i] = GEN_INT (e);
+    }
+
+  vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
+  vperm = force_reg (V8SImode, vperm);
+  emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
+
+  return true;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
+   in terms of pshufb or vpperm.  */
+
+static bool
+expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
+{
+  unsigned i, nelt, eltsz;
+  rtx rperm[16], vperm, target, op0, op1;
+
+  if (!(d->op0 == d->op1 ? TARGET_SSSE3 : TARGET_XOP))
+    return false;
+  if (GET_MODE_SIZE (d->vmode) != 16)
+    return false;
+
+  if (d->testing_p)
+    return true;
+
+  nelt = d->nelt;
+  eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
+
+  for (i = 0; i < nelt; ++i)
+    {
+      unsigned j, e = d->perm[i];
+      for (j = 0; j < eltsz; ++j)
+       rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
+    }
+
+  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm));
+  vperm = force_reg (V16QImode, vperm);
+
+  target = gen_lowpart (V16QImode, d->target);
+  op0 = gen_lowpart (V16QImode, d->op0);
+  if (d->op0 == d->op1)
+    emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
+  else
+    {
+      op1 = gen_lowpart (V16QImode, d->op1);
+      emit_insn (gen_xop_pperm (target, op0, op1, vperm));
+    }
+
+  return true;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to instantiate D
+   in a single instruction.  */
+
+static bool
+expand_vec_perm_1 (struct expand_vec_perm_d *d)
+{
+  unsigned i, nelt = d->nelt;
+  unsigned char perm2[MAX_VECT_LEN];
+
+  /* Check plain VEC_SELECT first, because AVX has instructions that could
+     match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
+     input where SEL+CONCAT may not.  */
+  if (d->op0 == d->op1)
+    {
+      int mask = nelt - 1;
+
+      for (i = 0; i < nelt; i++)
+       perm2[i] = d->perm[i] & mask;
+
+      if (expand_vselect (d->target, d->op0, perm2, nelt))
+       return true;
+
+      /* There are plenty of patterns in sse.md that are written for
+        SEL+CONCAT and are not replicated for a single op.  Perhaps
+        that should be changed, to avoid the nastiness here.  */
+
+      /* Recognize interleave style patterns, which means incrementing
+        every other permutation operand.  */
+      for (i = 0; i < nelt; i += 2)
+       {
+         perm2[i] = d->perm[i] & mask;
+         perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
+       }
+      if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt))
+       return true;
+
+      /* Recognize shufps, which means adding {0, 0, nelt, nelt}.  */
+      if (nelt >= 4)
+       {
+         for (i = 0; i < nelt; i += 4)
+           {
+             perm2[i + 0] = d->perm[i + 0] & mask;
+             perm2[i + 1] = d->perm[i + 1] & mask;
+             perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
+             perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
+           }
+
+         if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt))
+           return true;
+       }
+    }
+
+  /* Finally, try the fully general two operand permute.  */
+  if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt))
+    return true;
+
+  /* Recognize interleave style patterns with reversed operands.  */
+  if (d->op0 != d->op1)
+    {
+      for (i = 0; i < nelt; ++i)
+       {
+         unsigned e = d->perm[i];
+         if (e >= nelt)
+           e -= nelt;
+         else
+           e += nelt;
+         perm2[i] = e;
+       }
+
+      if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt))
+       return true;
+    }
+
+  /* Try the SSE4.1 blend variable merge instructions.  */
+  if (expand_vec_perm_blend (d))
+    return true;
+
+  /* Try one of the AVX vpermil variable permutations.  */
+  if (expand_vec_perm_vpermil (d))
+    return true;
+
+  /* Try the SSSE3 pshufb or XOP vpperm variable permutation.  */
+  if (expand_vec_perm_pshufb (d))
+    return true;
+
+  return false;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
+   in terms of a pair of pshuflw + pshufhw instructions.  */
+
+static bool
+expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
+{
+  unsigned char perm2[MAX_VECT_LEN];
+  unsigned i;
+  bool ok;
+
+  if (d->vmode != V8HImode || d->op0 != d->op1)
+    return false;
+
+  /* The two permutations only operate in 64-bit lanes.  */
+  for (i = 0; i < 4; ++i)
+    if (d->perm[i] >= 4)
+      return false;
+  for (i = 4; i < 8; ++i)
+    if (d->perm[i] < 4)
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  /* Emit the pshuflw.  */
+  memcpy (perm2, d->perm, 4);
+  for (i = 4; i < 8; ++i)
+    perm2[i] = i;
+  ok = expand_vselect (d->target, d->op0, perm2, 8);
+  gcc_assert (ok);
+
+  /* Emit the pshufhw.  */
+  memcpy (perm2 + 4, d->perm + 4, 4);
+  for (i = 0; i < 4; ++i)
+    perm2[i] = i;
+  ok = expand_vselect (d->target, d->target, perm2, 8);
+  gcc_assert (ok);
+
+  return true;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
+   the permutation using the SSSE3 palignr instruction.  This succeeds
+   when all of the elements in PERM fit within one vector and we merely
+   need to shift them down so that a single vector permutation has a
+   chance to succeed.  */
+
+static bool
+expand_vec_perm_palignr (struct expand_vec_perm_d *d)
+{
+  unsigned i, nelt = d->nelt;
+  unsigned min, max;
+  bool in_order, ok;
+  rtx shift;
+
+  /* Even with AVX, palignr only operates on 128-bit vectors.  */
+  if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
+    return false;
+
+  min = nelt, max = 0;
+  for (i = 0; i < nelt; ++i)
+    {
+      unsigned e = d->perm[i];
+      if (e < min)
+       min = e;
+      if (e > max)
+       max = e;
+    }
+  if (min == 0 || max - min >= nelt)
+    return false;
+
+  /* Given that we have SSSE3, we know we'll be able to implement the
+     single operand permutation after the palignr with pshufb.  */
+  if (d->testing_p)
+    return true;
+
+  shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
+  emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
+                                 gen_lowpart (TImode, d->op1),
+                                 gen_lowpart (TImode, d->op0), shift));
+
+  d->op0 = d->op1 = d->target;
+
+  in_order = true;
+  for (i = 0; i < nelt; ++i)
+    {
+      unsigned e = d->perm[i] - min;
+      if (e != i)
+       in_order = false;
+      d->perm[i] = e;
+    }
+
+  /* Test for the degenerate case where the alignment by itself
+     produces the desired permutation.  */
+  if (in_order)
+    return true;
+
+  ok = expand_vec_perm_1 (d);
+  gcc_assert (ok);
+
+  return ok;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
+   a two vector permutation into a single vector permutation by using
+   an interleave operation to merge the vectors.  */
+
+static bool
+expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
+{
+  struct expand_vec_perm_d dremap, dfinal;
+  unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
+  unsigned contents, h1, h2, h3, h4;
+  unsigned char remap[2 * MAX_VECT_LEN];
+  rtx seq;
+  bool ok;
+
+  if (d->op0 == d->op1)
+    return false;
+
+  /* The 256-bit unpck[lh]p[sd] instructions only operate within the 128-bit
+     lanes.  We can use similar techniques with the vperm2f128 instruction,
+     but it requires slightly different logic.  */
+  if (GET_MODE_SIZE (d->vmode) != 16)
+    return false;
+
+  /* Examine from whence the elements come.  */
+  contents = 0;
+  for (i = 0; i < nelt; ++i)
+    contents |= 1u << d->perm[i];
+
+  /* Split the two input vectors into 4 halves.  */
+  h1 = (1u << nelt2) - 1;
+  h2 = h1 << nelt2;
+  h3 = h2 << nelt2;
+  h4 = h3 << nelt2;
+
+  memset (remap, 0xff, sizeof (remap));
+  dremap = *d;
+
+  /* If the elements from the low halves use interleave low, and similarly
+     for interleave high.  If the elements are from mis-matched halves, we
+     can use shufps for V4SF/V4SI or do a DImode shuffle.  */
+  if ((contents & (h1 | h3)) == contents)
+    {
+      for (i = 0; i < nelt2; ++i)
+       {
+         remap[i] = i * 2;
+         remap[i + nelt] = i * 2 + 1;
+         dremap.perm[i * 2] = i;
+         dremap.perm[i * 2 + 1] = i + nelt;
+       }
+    }
+  else if ((contents & (h2 | h4)) == contents)
+    {
+      for (i = 0; i < nelt2; ++i)
+       {
+         remap[i + nelt2] = i * 2;
+         remap[i + nelt + nelt2] = i * 2 + 1;
+         dremap.perm[i * 2] = i + nelt2;
+         dremap.perm[i * 2 + 1] = i + nelt + nelt2;
+       }
+    }
+  else if ((contents & (h1 | h4)) == contents)
+    {
+      for (i = 0; i < nelt2; ++i)
+       {
+         remap[i] = i;
+         remap[i + nelt + nelt2] = i + nelt2;
+         dremap.perm[i] = i;
+         dremap.perm[i + nelt2] = i + nelt + nelt2;
+       }
+      if (nelt != 4)
+       {
+         dremap.vmode = V2DImode;
+         dremap.nelt = 2;
+         dremap.perm[0] = 0;
+         dremap.perm[1] = 3;
+       }
+    }
+  else if ((contents & (h2 | h3)) == contents)
+    {
+      for (i = 0; i < nelt2; ++i)
+       {
+         remap[i + nelt2] = i;
+         remap[i + nelt] = i + nelt2;
+         dremap.perm[i] = i + nelt2;
+         dremap.perm[i + nelt2] = i + nelt;
+       }
+      if (nelt != 4)
+       {
+         dremap.vmode = V2DImode;
+         dremap.nelt = 2;
+         dremap.perm[0] = 1;
+         dremap.perm[1] = 2;
+       }
+    }
+  else
+    return false;
+
+  /* Use the remapping array set up above to move the elements from their
+     swizzled locations into their final destinations.  */
+  dfinal = *d;
+  for (i = 0; i < nelt; ++i)
+    {
+      unsigned e = remap[d->perm[i]];
+      gcc_assert (e < nelt);
+      dfinal.perm[i] = e;
+    }
+  dfinal.op0 = gen_reg_rtx (dfinal.vmode);
+  dfinal.op1 = dfinal.op0;
+  dremap.target = dfinal.op0;
+
+  /* Test if the final remap can be done with a single insn.  For V4SFmode or
+     V4SImode this *will* succeed.  For V8HImode or V16QImode it may not.  */
+  start_sequence ();
+  ok = expand_vec_perm_1 (&dfinal);
+  seq = get_insns ();
+  end_sequence ();
+
+  if (!ok)
+    return false;
+
+  if (dremap.vmode != dfinal.vmode)
+    {
+      dremap.target = gen_lowpart (dremap.vmode, dremap.target);
+      dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
+      dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
+    }
+
+  ok = expand_vec_perm_1 (&dremap);
+  gcc_assert (ok);
+
+  emit_insn (seq);
+  return true;
+}
+
+/* A subroutine of expand_vec_perm_even_odd_1.  Implement the double-word
+   permutation with two pshufb insns and an ior.  We should have already
+   failed all two instruction sequences.  */
+
+static bool
+expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
+{
+  rtx rperm[2][16], vperm, l, h, op, m128;
+  unsigned int i, nelt, eltsz;
+
+  if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
+    return false;
+  gcc_assert (d->op0 != d->op1);
+
+  nelt = d->nelt;
+  eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
+  
+  /* Generate two permutation masks.  If the required element is within
+     the given vector it is shuffled into the proper lane.  If the required
+     element is in the other vector, force a zero into the lane by setting
+     bit 7 in the permutation mask.  */
+  m128 = GEN_INT (-128);
+  for (i = 0; i < nelt; ++i)
+    {
+      unsigned j, e = d->perm[i];
+      unsigned which = (e >= nelt);
+      if (e >= nelt)
+       e -= nelt;
+
+      for (j = 0; j < eltsz; ++j)
+       {
+         rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
+         rperm[1-which][i*eltsz + j] = m128;
+       }
+    }
+
+  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
+  vperm = force_reg (V16QImode, vperm);
+
+  l = gen_reg_rtx (V16QImode);
+  op = gen_lowpart (V16QImode, d->op0);
+  emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
+
+  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
+  vperm = force_reg (V16QImode, vperm);
+
+  h = gen_reg_rtx (V16QImode);
+  op = gen_lowpart (V16QImode, d->op1);
+  emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
+
+  op = gen_lowpart (V16QImode, d->target);
+  emit_insn (gen_iorv16qi3 (op, l, h));
+
+  return true;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement extract-even
+   and extract-odd permutations.  */
+
+static bool
+expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
+{
+  rtx t1, t2, t3, t4;
+
+  switch (d->vmode)
+    {
+    case V4DFmode:
+      t1 = gen_reg_rtx (V4DFmode);
+      t2 = gen_reg_rtx (V4DFmode);
+
+      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
+      emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
+      emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
+
+      /* Now an unpck[lh]pd will produce the result required.  */
+      if (odd)
+       t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
+      else
+       t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
+      emit_insn (t3);
+      break;
+
+    case V8SFmode:
+      {
+       static const unsigned char perm1[8] = { 0, 2, 1, 3, 5, 6, 5, 7 };
+       static const unsigned char perme[8] = { 0, 1,  8,  9, 4, 5, 12, 13 };
+       static const unsigned char permo[8] = { 2, 3, 10, 11, 6, 7, 14, 15 };
+
+       t1 = gen_reg_rtx (V8SFmode);
+       t2 = gen_reg_rtx (V8SFmode);
+       t3 = gen_reg_rtx (V8SFmode);
+       t4 = gen_reg_rtx (V8SFmode);
+
+       /* Shuffle within the 128-bit lanes to produce:
+          { 0 2 1 3 4 6 5 7 } and { 8 a 9 b c e d f }.  */
+       expand_vselect (t1, d->op0, perm1, 8);
+       expand_vselect (t2, d->op1, perm1, 8);
+
+       /* Shuffle the lanes around to produce:
+          { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }.  */
+       emit_insn (gen_avx_vperm2f128v8sf3 (t3, t1, t2, GEN_INT (0x20)));
+       emit_insn (gen_avx_vperm2f128v8sf3 (t4, t1, t2, GEN_INT (0x31)));
+
+       /* Now a vpermil2p will produce the result required.  */
+       /* ??? The vpermil2p requires a vector constant.  Another option
+          is a unpck[lh]ps to merge the two vectors to produce
+          { 0 4 2 6 8 c a e } or { 1 5 3 7 9 d b f }.  Then use another
+          vpermilps to get the elements into the final order.  */
+       d->op0 = t3;
+       d->op1 = t4;
+       memcpy (d->perm, odd ? permo: perme, 8);
+       expand_vec_perm_vpermil (d);
+      }
+      break;
+
+    case V2DFmode:
+    case V4SFmode:
+    case V2DImode:
+    case V4SImode:
+      /* These are always directly implementable by expand_vec_perm_1.  */
+      gcc_unreachable ();
+
+    case V8HImode:
+      if (TARGET_SSSE3)
+       return expand_vec_perm_pshufb2 (d);
+      else
+       {
+         /* We need 2*log2(N)-1 operations to achieve odd/even
+            with interleave. */
+         t1 = gen_reg_rtx (V8HImode);
+         t2 = gen_reg_rtx (V8HImode);
+         emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
+         emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
+         emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
+         emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
+         if (odd)
+           t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
+         else
+           t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
+         emit_insn (t3);
+       }
+      break;
+
+    case V16QImode:
+      if (TARGET_SSSE3)
+       return expand_vec_perm_pshufb2 (d);
+      else
+       {
+         t1 = gen_reg_rtx (V16QImode);
+         t2 = gen_reg_rtx (V16QImode);
+         t3 = gen_reg_rtx (V16QImode);
+         emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
+         emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
+         emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
+         emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
+         emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
+         emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
+         if (odd)
+           t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
+         else
+           t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
+         emit_insn (t3);
+       }
+      break;
+
+    default:
+      gcc_unreachable ();
+    }
+
+  return true;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Pattern match
+   extract-even and extract-odd permutations.  */
+
+static bool
+expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
+{
+  unsigned i, odd, nelt = d->nelt;
+
+  odd = d->perm[0];
+  if (odd != 0 && odd != 1)
+    return false;
+
+  for (i = 1; i < nelt; ++i)
+    if (d->perm[i] != 2 * i + odd)
+      return false;
+
+  return expand_vec_perm_even_odd_1 (d, odd);
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement broadcast
+   permutations.  We assume that expand_vec_perm_1 has already failed.  */
+
+static bool
+expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
+{
+  unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
+  enum machine_mode vmode = d->vmode;
+  unsigned char perm2[4];
+  rtx op0 = d->op0;
+  bool ok;
+
+  switch (vmode)
+    {
+    case V4DFmode:
+    case V8SFmode:
+      /* These are special-cased in sse.md so that we can optionally
+        use the vbroadcast instruction.  They expand to two insns
+        if the input happens to be in a register.  */
+      gcc_unreachable ();
+
+    case V2DFmode:
+    case V2DImode:
+    case V4SFmode:
+    case V4SImode:
+      /* These are always implementable using standard shuffle patterns.  */
+      gcc_unreachable ();
+
+    case V8HImode:
+    case V16QImode:
+      /* These can be implemented via interleave.  We save one insn by
+        stopping once we have promoted to V4SImode and then use pshufd.  */
+      do
+       {
+         optab otab = vec_interleave_low_optab;
+
+         if (elt >= nelt2)
+           {
+             otab = vec_interleave_high_optab;
+             elt -= nelt2;
+           }
+         nelt2 /= 2;
+
+         op0 = expand_binop (vmode, otab, op0, op0, NULL, 0, OPTAB_DIRECT);
+         vmode = get_mode_wider_vector (vmode);
+         op0 = gen_lowpart (vmode, op0);
+       }
+      while (vmode != V4SImode);
+
+      memset (perm2, elt, 4);
+      ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4);
+      gcc_assert (ok);
+      return true;
+
+    default:
+      gcc_unreachable ();
+    }
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Pattern match
+   broadcast permutations.  */
+
+static bool
+expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
+{
+  unsigned i, elt, nelt = d->nelt;
+
+  if (d->op0 != d->op1)
+    return false;
+
+  elt = d->perm[0];
+  for (i = 1; i < nelt; ++i)
+    if (d->perm[i] != elt)
+      return false;
+
+  return expand_vec_perm_broadcast_1 (d);
+}
+
+/* The guts of ix86_expand_vec_perm_builtin, also used by the ok hook.
+   With all of the interface bits taken care of, perform the expansion
+   in D and return true on success.  */
+
+static bool
+ix86_expand_vec_perm_builtin_1 (struct expand_vec_perm_d *d)
+{
+  /* Try a single instruction expansion.  */
+  if (expand_vec_perm_1 (d))
+    return true;
+
+  /* Try sequences of two instructions.  */
+
+  if (expand_vec_perm_pshuflw_pshufhw (d))
+    return true;
+
+  if (expand_vec_perm_palignr (d))
+    return true;
+
+  if (expand_vec_perm_interleave2 (d))
+    return true;
+
+  if (expand_vec_perm_broadcast (d))
+    return true;
+
+  /* Try sequences of three instructions.  */
+
+  if (expand_vec_perm_pshufb2 (d))
+    return true;
+
+  /* ??? Look for narrow permutations whose element orderings would
+     allow the promotion to a wider mode.  */
+
+  /* ??? Look for sequences of interleave or a wider permute that place
+     the data into the correct lanes for a half-vector shuffle like
+     pshuf[lh]w or vpermilps.  */
+
+  /* ??? Look for sequences of interleave that produce the desired results.
+     The combinatorics of punpck[lh] get pretty ugly... */
+
+  if (expand_vec_perm_even_odd (d))
+    return true;
+
+  return false;
+}
+
+/* Extract the values from the vector CST into the permutation array in D.
+   Return 0 on error, 1 if all values from the permutation come from the
+   first vector, 2 if all values from the second vector, and 3 otherwise.  */
+
+static int
+extract_vec_perm_cst (struct expand_vec_perm_d *d, tree cst)
+{
+  tree list = TREE_VECTOR_CST_ELTS (cst);
+  unsigned i, nelt = d->nelt;
+  int ret = 0;
+
+  for (i = 0; i < nelt; ++i, list = TREE_CHAIN (list))
+    {
+      unsigned HOST_WIDE_INT e;
+
+      if (!host_integerp (TREE_VALUE (list), 1))
+       return 0;
+      e = tree_low_cst (TREE_VALUE (list), 1);
+      if (e >= 2 * nelt)
+       return 0;
+
+      ret |= (e < nelt ? 1 : 2);
+      d->perm[i] = e;
+    }
+  gcc_assert (list == NULL);
+
+  /* For all elements from second vector, fold the elements to first.  */
+  if (ret == 2)
+    for (i = 0; i < nelt; ++i)
+      d->perm[i] -= nelt;
+
+  return ret;
+}
+
+static rtx
+ix86_expand_vec_perm_builtin (tree exp)
+{
+  struct expand_vec_perm_d d;
+  tree arg0, arg1, arg2;
+
+  arg0 = CALL_EXPR_ARG (exp, 0);
+  arg1 = CALL_EXPR_ARG (exp, 1);
+  arg2 = CALL_EXPR_ARG (exp, 2);
+
+  d.vmode = TYPE_MODE (TREE_TYPE (arg0));
+  d.nelt = GET_MODE_NUNITS (d.vmode);
+  d.testing_p = false;
+  gcc_assert (VECTOR_MODE_P (d.vmode));
+
+  if (TREE_CODE (arg2) != VECTOR_CST)
+    {
+      error_at (EXPR_LOCATION (exp),
+               "vector permutation requires vector constant");
+      goto exit_error;
+    }
+
+  switch (extract_vec_perm_cst (&d, arg2))
+    {
+    default:
+      gcc_unreachable();
+
+    case 0:
+      error_at (EXPR_LOCATION (exp), "invalid vector permutation constant");
+      goto exit_error;
+
+    case 3:
+      if (!operand_equal_p (arg0, arg1, 0))
+       {
+         d.op0 = expand_expr (arg0, NULL_RTX, d.vmode, EXPAND_NORMAL);
+         d.op0 = force_reg (d.vmode, d.op0);
+         d.op1 = expand_expr (arg1, NULL_RTX, d.vmode, EXPAND_NORMAL);
+         d.op1 = force_reg (d.vmode, d.op1);
+         break;
+       }
+
+      /* The elements of PERM do not suggest that only the first operand
+        is used, but both operands are identical.  Allow easier matching
+        of the permutation by folding the permutation into the single
+        input vector.  */
+      {
+       unsigned i, nelt = d.nelt;
+       for (i = 0; i < nelt; ++i)
+         if (d.perm[i] >= nelt)
+           d.perm[i] -= nelt;
+      }
+      /* FALLTHRU */
+
+    case 1:
+      d.op0 = expand_expr (arg0, NULL_RTX, d.vmode, EXPAND_NORMAL);
+      d.op0 = force_reg (d.vmode, d.op0);
+      d.op1 = d.op0;
+      break;
+
+    case 2:
+      d.op0 = expand_expr (arg1, NULL_RTX, d.vmode, EXPAND_NORMAL);
+      d.op0 = force_reg (d.vmode, d.op0);
+      d.op1 = d.op0;
+      break;
+    }
+ 
+  d.target = gen_reg_rtx (d.vmode);
+  if (ix86_expand_vec_perm_builtin_1 (&d))
+    return d.target;
+
+  /* For compiler generated permutations, we should never got here, because
+     the compiler should also be checking the ok hook.  But since this is a
+     builtin the user has access too, so don't abort.  */
+  switch (d.nelt)
+    {
+    case 2:
+      sorry ("vector permutation (%d %d)", d.perm[0], d.perm[1]);
+      break;
+    case 4:
+      sorry ("vector permutation (%d %d %d %d)",
+            d.perm[0], d.perm[1], d.perm[2], d.perm[3]);
+      break;
+    case 8:
+      sorry ("vector permutation (%d %d %d %d %d %d %d %d)",
+            d.perm[0], d.perm[1], d.perm[2], d.perm[3],
+            d.perm[4], d.perm[5], d.perm[6], d.perm[7]);
+      break;
+    case 16:
+      sorry ("vector permutation "
+            "(%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d)",
+            d.perm[0], d.perm[1], d.perm[2], d.perm[3],
+            d.perm[4], d.perm[5], d.perm[6], d.perm[7],
+            d.perm[8], d.perm[9], d.perm[10], d.perm[11],
+            d.perm[12], d.perm[13], d.perm[14], d.perm[15]);
+      break;
+    default:
+      gcc_unreachable ();
+    }
+ exit_error:
+  return CONST0_RTX (d.vmode);
+}
+
+/* Implement targetm.vectorize.builtin_vec_perm_ok.  */
+
+static bool
+ix86_vectorize_builtin_vec_perm_ok (tree vec_type, tree mask)
+{
+  struct expand_vec_perm_d d;
+  int vec_mask;
+  bool ret, one_vec;
+
+  d.vmode = TYPE_MODE (vec_type);
+  d.nelt = GET_MODE_NUNITS (d.vmode);
+  d.testing_p = true;
+
+  /* Given sufficient ISA support we can just return true here
+     for selected vector modes.  */
+  if (GET_MODE_SIZE (d.vmode) == 16)
+    {
+      /* All implementable with a single vpperm insn.  */
+      if (TARGET_XOP)
+       return true;
+      /* All implementable with 2 pshufb + 1 ior.  */
+      if (TARGET_SSSE3)
+       return true;
+      /* All implementable with shufpd or unpck[lh]pd.  */
+      if (d.nelt == 2)
+       return true;
+    }
+
+  vec_mask = extract_vec_perm_cst (&d, mask);
+
+  /* This hook is cannot be called in response to something that the
+     user does (unlike the builtin expander) so we shouldn't ever see
+     an error generated from the extract.  */
+  gcc_assert (vec_mask > 0 && vec_mask <= 3);
+  one_vec = (vec_mask != 3);
+  
+  /* Implementable with shufps or pshufd.  */
+  if (one_vec && (d.vmode == V4SFmode || d.vmode == V4SImode))
+    return true;
+
+  /* Otherwise we have to go through the motions and see if we can
+     figure out how to generate the requested permutation.  */
+  d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
+  d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
+  if (!one_vec)
+    d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
+
+  start_sequence ();
+  ret = ix86_expand_vec_perm_builtin_1 (&d);
+  end_sequence ();
+
+  return ret;
+}
+
+void
+ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
+{
+  struct expand_vec_perm_d d;
+  unsigned i, nelt;
+
+  d.target = targ;
+  d.op0 = op0;
+  d.op1 = op1;
+  d.vmode = GET_MODE (targ);
+  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
+  d.testing_p = false;
+
+  for (i = 0; i < nelt; ++i)
+    d.perm[i] = i * 2 + odd;
+
+  /* We'll either be able to implement the permutation directly...  */
+  if (expand_vec_perm_1 (&d))
+    return;
+
+  /* ... or we use the special-case patterns.  */
+  expand_vec_perm_even_odd_1 (&d, odd);
+}
+\f
+/* This function returns the calling abi specific va_list type node.
+   It returns  the FNDECL specific va_list type.  */
+
+tree
+ix86_fn_abi_va_list (tree fndecl)
+{
+  if (!TARGET_64BIT)
+    return va_list_type_node;
+  gcc_assert (fndecl != NULL_TREE);
+
+  if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
+    return ms_va_list_type_node;
+  else
+    return sysv_va_list_type_node;
+}
+
+/* Returns the canonical va_list type specified by TYPE. If there
+   is no valid TYPE provided, it return NULL_TREE.  */
+
+tree
+ix86_canonical_va_list_type (tree type)
+{
+  tree wtype, htype;
+
+  /* Resolve references and pointers to va_list type.  */
+  if (INDIRECT_REF_P (type))
+    type = TREE_TYPE (type);
+  else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
+    type = TREE_TYPE (type);
+
+  if (TARGET_64BIT)
+    {
+      wtype = va_list_type_node;
+         gcc_assert (wtype != NULL_TREE);
+      htype = type;
+      if (TREE_CODE (wtype) == ARRAY_TYPE)
+       {
+         /* If va_list is an array type, the argument may have decayed
+            to a pointer type, e.g. by being passed to another function.
+            In that case, unwrap both types so that we can compare the
+            underlying records.  */
+         if (TREE_CODE (htype) == ARRAY_TYPE
+             || POINTER_TYPE_P (htype))
+           {
+             wtype = TREE_TYPE (wtype);
+             htype = TREE_TYPE (htype);
+           }
+       }
+      if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
+       return va_list_type_node;
+      wtype = sysv_va_list_type_node;
+         gcc_assert (wtype != NULL_TREE);
+      htype = type;
+      if (TREE_CODE (wtype) == ARRAY_TYPE)
+       {
           /* If va_list is an array type, the argument may have decayed
              to a pointer type, e.g. by being passed to another function.
              In that case, unwrap both types so that we can compare the
@@ -29156,7 +30439,8 @@ ix86_enum_va_list (int idx, const char **pname, tree *ptree)
  #define TARGET_DEFAULT_TARGET_FLAGS    \
    (TARGET_DEFAULT                      \
     | TARGET_SUBTARGET_DEFAULT          \
-   | TARGET_TLS_DIRECT_SEG_REFS_DEFAULT)
+   | TARGET_TLS_DIRECT_SEG_REFS_DEFAULT \
+   | MASK_FUSED_MADD)
  
  #undef TARGET_HANDLE_OPTION
  #define TARGET_HANDLE_OPTION ix86_handle_option
@@ -29250,7 +30534,14 @@ ix86_enum_va_list (int idx, const char **pname, tree *ptree)
  #define TARGET_SECONDARY_RELOAD ix86_secondary_reload
  
  #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
-#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST x86_builtin_vectorization_cost
+#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
+  ix86_builtin_vectorization_cost
+#undef TARGET_VECTORIZE_BUILTIN_VEC_PERM
+#define TARGET_VECTORIZE_BUILTIN_VEC_PERM \
+  ix86_vectorize_builtin_vec_perm
+#undef TARGET_VECTORIZE_BUILTIN_VEC_PERM_OK
+#define TARGET_VECTORIZE_BUILTIN_VEC_PERM_OK \
+  ix86_vectorize_builtin_vec_perm_ok
  
  #undef TARGET_SET_CURRENT_FUNCTION
  #define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function
@@ -29285,6 +30576,9 @@ ix86_enum_va_list (int idx, const char **pname, tree *ptree)
  #undef TARGET_CAN_ELIMINATE
  #define TARGET_CAN_ELIMINATE ix86_can_eliminate
  
+#undef TARGET_ASM_CODE_END
+#define TARGET_ASM_CODE_END ix86_code_end
+
  struct gcc_target targetm = TARGET_INITIALIZER;
  \f
  #include "gt-i386.h"