2012-04-16 Uros Bizjak <ubizjak@gmail.com>

[pf3gnuchains/gcc-fork.git] / gcc / config / i386 / i386.c
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c

index 490bf4e..6949587 100644 (file)
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -1,6 +1,6 @@
  /* Subroutines used for code generation on IA-32.
     Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
-   2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
+   2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
     Free Software Foundation, Inc.
  
  This file is part of GCC.
@@ -47,6 +47,7 @@ along with GCC; see the file COPYING3.  If not see
  #include "target-def.h"
  #include "common/common-target.h"
  #include "langhooks.h"
+#include "reload.h"
  #include "cgraph.h"
  #include "gimple.h"
  #include "dwarf2.h"
@@ -1672,7 +1673,7 @@ struct processor_costs atom_cost = {
    COSTS_N_INSNS (1),                   /* cost of movzx */
    8,                                   /* "large" insn */
    17,                                  /* MOVE_RATIO */
-  2,                                /* cost for loading QImode using movzbl */
+  4,                                   /* cost for loading QImode using movzbl */
    {4, 4, 4},                           /* cost of loading integer registers
                                            in QImode, HImode and SImode.
                                            Relative to reg-reg move (2).  */
@@ -2509,7 +2510,6 @@ static void ix86_compute_frame_layout (struct ix86_frame *);
  static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
                                                  rtx, rtx, int);
  static void ix86_add_new_builtins (HOST_WIDE_INT);
-static rtx ix86_expand_vec_perm_builtin (tree);
  static tree ix86_canonical_va_list_type (tree);
  static void predict_jump (int);
  static unsigned int split_stack_prologue_scratch_regno (void);
@@ -2596,7 +2596,7 @@ static const struct ptt processor_target_table[PROCESSOR_max] =
    {&bdver1_cost, 32, 24, 32, 7, 32},
    {&bdver2_cost, 32, 24, 32, 7, 32},
    {&btver1_cost, 32, 24, 32, 7, 32},
-  {&atom_cost, 16, 7, 16, 7, 16}
+  {&atom_cost, 16, 15, 16, 7, 16}
  };
  
  static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
@@ -3450,8 +3450,6 @@ ix86_option_override_internal (bool main_args_p)
       in case they weren't overwritten by command line options.  */
    if (TARGET_64BIT)
      {
-      if (optimize > 1 && !global_options_set.x_flag_zee)
-        flag_zee = 1;
        if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
         flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
        if (flag_asynchronous_unwind_tables == 2)
@@ -5029,6 +5027,40 @@ ix86_handle_cconv_attribute (tree *node, tree name,
    return NULL_TREE;
  }
  
+/* The transactional memory builtins are implicitly regparm or fastcall
+   depending on the ABI.  Override the generic do-nothing attribute that
+   these builtins were declared with, and replace it with one of the two
+   attributes that we expect elsewhere.  */
+
+static tree
+ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED,
+                                 tree args ATTRIBUTE_UNUSED,
+                                 int flags ATTRIBUTE_UNUSED,
+                                 bool *no_add_attrs)
+{
+  tree alt;
+
+  /* In no case do we want to add the placeholder attribute.  */
+  *no_add_attrs = true;
+
+  /* The 64-bit ABI is unchanged for transactional memory.  */
+  if (TARGET_64BIT)
+    return NULL_TREE;
+
+  /* ??? Is there a better way to validate 32-bit windows?  We have
+     cfun->machine->call_abi, but that seems to be set only for 64-bit.  */
+  if (CHECK_STACK_LIMIT > 0)
+    alt = tree_cons (get_identifier ("fastcall"), NULL, NULL);
+  else
+    {
+      alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL);
+      alt = tree_cons (get_identifier ("regparm"), alt, NULL);
+    }
+  decl_attributes (node, alt, flags);
+
+  return NULL_TREE;
+}
+
  /* This function determines from TYPE the calling-convention.  */
  
  unsigned int
@@ -8027,7 +8059,7 @@ ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
               else
                 {
                   tree copy
-                   = build_call_expr (implicit_built_in_decls[BUILT_IN_MEMCPY],
+                   = build_call_expr (builtin_decl_implicit (BUILT_IN_MEMCPY),
                                        3, dest_addr, src_addr,
                                        size_int (cur_size));
                   gimplify_and_add (copy, pre_p);
@@ -8391,6 +8423,10 @@ ix86_frame_pointer_required (void)
    if (SUBTARGET_FRAME_POINTER_REQUIRED)
      return true;
  
+  /* For older 32-bit runtimes setjmp requires valid frame-pointer.  */
+  if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp)
+    return true;
+
    /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
       turns off the frame pointer by default.  Turn it back on now if
       we've not got a leaf function.  */
@@ -9891,12 +9927,68 @@ ix86_finalize_stack_realign_flags (void)
        /* After stack_realign_needed is finalized, we can't no longer
          change it.  */
        gcc_assert (crtl->stack_realign_needed == stack_realign);
+      return;
      }
-  else
-    {
-      crtl->stack_realign_needed = stack_realign;
-      crtl->stack_realign_finalized = true;
+
+  /* If the only reason for frame_pointer_needed is that we conservatively
+     assumed stack realignment might be needed, but in the end nothing that
+     needed the stack alignment had been spilled, clear frame_pointer_needed
+     and say we don't need stack realignment.  */
+  if (stack_realign
+      && !crtl->need_drap
+      && frame_pointer_needed
+      && current_function_is_leaf
+      && flag_omit_frame_pointer
+      && current_function_sp_is_unchanging
+      && !ix86_current_function_calls_tls_descriptor
+      && !crtl->accesses_prior_frames
+      && !cfun->calls_alloca
+      && !crtl->calls_eh_return
+      && !(flag_stack_check && STACK_CHECK_MOVING_SP)
+      && !ix86_frame_pointer_required ()
+      && get_frame_size () == 0
+      && ix86_nsaved_sseregs () == 0
+      && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0)
+    {
+      HARD_REG_SET set_up_by_prologue, prologue_used;
+      basic_block bb;
+
+      CLEAR_HARD_REG_SET (prologue_used);
+      CLEAR_HARD_REG_SET (set_up_by_prologue);
+      add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM);
+      add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM);
+      add_to_hard_reg_set (&set_up_by_prologue, Pmode,
+                          HARD_FRAME_POINTER_REGNUM);
+      FOR_EACH_BB (bb)
+        {
+          rtx insn;
+         FOR_BB_INSNS (bb, insn)
+           if (NONDEBUG_INSN_P (insn)
+               && requires_stack_frame_p (insn, prologue_used,
+                                          set_up_by_prologue))
+             {
+               crtl->stack_realign_needed = stack_realign;
+               crtl->stack_realign_finalized = true;
+               return;
+             }
+       }
+
+      frame_pointer_needed = false;
+      stack_realign = false;
+      crtl->max_used_stack_slot_alignment = incoming_stack_boundary;
+      crtl->stack_alignment_needed = incoming_stack_boundary;
+      crtl->stack_alignment_estimated = incoming_stack_boundary;
+      if (crtl->preferred_stack_boundary > incoming_stack_boundary)
+       crtl->preferred_stack_boundary = incoming_stack_boundary;
+      df_finish_pass (true);
+      df_scan_alloc (NULL);
+      df_scan_blocks ();
+      df_compute_regs_ever_live (true);
+      df_analyze ();
      }
+
+  crtl->stack_realign_needed = stack_realign;
+  crtl->stack_realign_finalized = true;
  }
  
  /* Expand the prologue into a bunch of separate insns.  */
@@ -10452,9 +10544,9 @@ ix86_emit_leave (void)
        add_reg_note (insn, REG_CFA_DEF_CFA,
                     plus_constant (stack_pointer_rtx, m->fs.sp_offset));
        RTX_FRAME_RELATED_P (insn) = 1;
-      ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
-                                m->fs.fp_offset);
      }
+  ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
+                            m->fs.fp_offset);
  }
  
  /* Emit code to restore saved registers using MOV insns.
@@ -10521,6 +10613,17 @@ ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
        }
  }
  
+/* Emit vzeroupper if needed.  */
+
+void
+ix86_maybe_emit_epilogue_vzeroupper (void)
+{
+  if (TARGET_VZEROUPPER
+      && !TREE_THIS_VOLATILE (cfun->decl)
+      && !cfun->machine->caller_return_avx256_p)
+    emit_insn (gen_avx_vzeroupper (GEN_INT (call_no_avx256)));
+}
+
  /* Restore function stack, frame, and registers.  */
  
  void
@@ -10818,10 +10921,7 @@ ix86_expand_epilogue (int style)
      }
  
    /* Emit vzeroupper if needed.  */
-  if (TARGET_VZEROUPPER
-      && !TREE_THIS_VOLATILE (cfun->decl)
-      && !cfun->machine->caller_return_avx256_p)
-    emit_insn (gen_avx_vzeroupper (GEN_INT (call_no_avx256)));
+  ix86_maybe_emit_epilogue_vzeroupper ();
  
    if (crtl->args.pops_args && crtl->args.size)
      {
@@ -10876,15 +10976,28 @@ ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
       it looks like we might want one, insert a NOP.  */
    {
      rtx insn = get_last_insn ();
+    rtx deleted_debug_label = NULL_RTX;
      while (insn
            && NOTE_P (insn)
            && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
-      insn = PREV_INSN (insn);
+      {
+       /* Don't insert a nop for NOTE_INSN_DELETED_DEBUG_LABEL
+          notes only, instead set their CODE_LABEL_NUMBER to -1,
+          otherwise there would be code generation differences
+          in between -g and -g0.  */
+       if (NOTE_P (insn) && NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
+         deleted_debug_label = insn;
+       insn = PREV_INSN (insn);
+      }
      if (insn
         && (LABEL_P (insn)
             || (NOTE_P (insn)
                 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
        fputs ("\tnop\n", file);
+    else if (deleted_debug_label)
+      for (insn = deleted_debug_label; insn; insn = NEXT_INSN (insn))
+       if (NOTE_KIND (insn) == NOTE_INSN_DELETED_DEBUG_LABEL)
+         CODE_LABEL_NUMBER (insn) = -1;
    }
  #endif
  
@@ -11711,6 +11824,13 @@ legitimate_pic_address_disp_p (rtx disp)
              break;
           if (GET_CODE (op0) == LABEL_REF)
             return true;
+         if (GET_CODE (op0) == CONST
+             && GET_CODE (XEXP (op0, 0)) == UNSPEC
+             && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL)
+           return true;
+         if (GET_CODE (op0) == UNSPEC
+             && XINT (op0, 1) == UNSPEC_PCREL)
+           return true;
           if (GET_CODE (op0) != SYMBOL_REF)
             break;
           /* FALLTHRU */
@@ -11804,6 +11924,64 @@ legitimate_pic_address_disp_p (rtx disp)
    return false;
  }
  
+/* Our implementation of LEGITIMIZE_RELOAD_ADDRESS.  Returns a value to
+   replace the input X, or the original X if no replacement is called for.
+   The output parameter *WIN is 1 if the calling macro should goto WIN,
+   0 if it should not.  */
+
+bool
+ix86_legitimize_reload_address (rtx x,
+                               enum machine_mode mode ATTRIBUTE_UNUSED,
+                               int opnum, int type,
+                               int ind_levels ATTRIBUTE_UNUSED)
+{
+  /* Reload can generate:
+
+     (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP)
+                      (reg:DI 97))
+             (reg:DI 2 cx))
+
+     This RTX is rejected from ix86_legitimate_address_p due to
+     non-strictness of base register 97.  Following this rejection, 
+     reload pushes all three components into separate registers,
+     creating invalid memory address RTX.
+
+     Following code reloads only the invalid part of the
+     memory address RTX.  */
+
+  if (GET_CODE (x) == PLUS
+      && REG_P (XEXP (x, 1))
+      && GET_CODE (XEXP (x, 0)) == PLUS
+      && REG_P (XEXP (XEXP (x, 0), 1)))
+    {
+      rtx base, index;
+      bool something_reloaded = false;
+
+      base = XEXP (XEXP (x, 0), 1);      
+      if (!REG_OK_FOR_BASE_STRICT_P (base))
+       {
+         push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL,
+                      BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
+                      opnum, (enum reload_type)type);
+         something_reloaded = true;
+       }
+
+      index = XEXP (x, 1);
+      if (!REG_OK_FOR_INDEX_STRICT_P (index))
+       {
+         push_reload (index, NULL_RTX, &XEXP (x, 1), NULL,
+                      INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
+                      opnum, (enum reload_type)type);
+         something_reloaded = true;
+       }
+
+      gcc_assert (something_reloaded);
+      return true;
+    }
+
+  return false;
+}
+
  /* Recognizes RTL expressions that are valid memory addresses for an
     instruction.  The MODE argument is the machine mode for the MEM
     expression that wants to use this address.
@@ -11820,6 +11998,13 @@ ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
    rtx base, index, disp;
    HOST_WIDE_INT scale;
  
+  /* Since constant address in x32 is signed extended to 64bit,
+     we have to prevent addresses from 0x80000000 to 0xffffffff.  */
+  if (TARGET_X32
+      && CONST_INT_P (addr)
+      && INTVAL (addr) < 0)
+    return false;
+
    if (ix86_decompose_address (addr, &parts) <= 0)
      /* Decomposition failed.  */
      return false;
@@ -13115,14 +13300,27 @@ ix86_delegitimize_address (rtx x)
  
    if (TARGET_64BIT)
      {
+      if (GET_CODE (x) == CONST
+          && GET_CODE (XEXP (x, 0)) == PLUS
+          && GET_MODE (XEXP (x, 0)) == Pmode
+          && CONST_INT_P (XEXP (XEXP (x, 0), 1))
+          && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC
+          && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL)
+        {
+         rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0);
+         x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2);
+         if (MEM_P (orig_x))
+           x = replace_equiv_address_nv (orig_x, x);
+         return x;
+       }
        if (GET_CODE (x) != CONST
           || GET_CODE (XEXP (x, 0)) != UNSPEC
           || (XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
               && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL)
-         || !MEM_P (orig_x))
+         || (!MEM_P (orig_x) && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL))
         return ix86_delegitimize_tls_address (orig_x);
        x = XVECEXP (XEXP (x, 0), 0, 0);
-      if (GET_MODE (orig_x) != GET_MODE (x))
+      if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x))
         {
           x = simplify_gen_subreg (GET_MODE (orig_x), x,
                                    GET_MODE (x), 0);
@@ -13433,26 +13631,28 @@ print_reg (rtx x, int code, FILE *file)
      code = GET_MODE_SIZE (GET_MODE (x));
  
    /* Irritatingly, AMD extended registers use different naming convention
-     from the normal registers.  */
+     from the normal registers: "r%d[bwd]"  */
    if (REX_INT_REG_P (x))
      {
        gcc_assert (TARGET_64BIT);
+      putc ('r', file);
+      fprint_ul (file, REGNO (x) - FIRST_REX_INT_REG + 8);
        switch (code)
         {
           case 0:
             error ("extended registers have no high halves");
             break;
           case 1:
-           fprintf (file, "r%ib", REGNO (x) - FIRST_REX_INT_REG + 8);
+           putc ('b', file);
             break;
           case 2:
-           fprintf (file, "r%iw", REGNO (x) - FIRST_REX_INT_REG + 8);
+           putc ('w', file);
             break;
           case 4:
-           fprintf (file, "r%id", REGNO (x) - FIRST_REX_INT_REG + 8);
+           putc ('d', file);
             break;
           case 8:
-           fprintf (file, "r%i", REGNO (x) - FIRST_REX_INT_REG + 8);
+           /* no suffix */
             break;
           default:
             error ("unsupported operand size for extended register");
@@ -13963,6 +14163,13 @@ ix86_print_operand (FILE *file, rtx x, int code)
           return;
  
         case 'H':
+         if (!offsettable_memref_p (x))
+           {
+             output_operand_lossage ("operand is not an offsettable memory "
+                                     "reference, invalid operand "
+                                     "code 'H'");
+             return;
+           }
           /* It doesn't actually matter what mode we use here, as we're
              only going to use this for printing.  */
           x = adjust_address_nv (x, DImode, 8);
@@ -14115,13 +14322,18 @@ ix86_print_operand (FILE *file, rtx x, int code)
               gcc_unreachable ();
             }
  
-         /* Check for explicit size override (codes 'b', 'w' and 'k')  */
+         /* Check for explicit size override (codes 'b', 'w', 'k',
+            'q' and 'x')  */
           if (code == 'b')
             size = "BYTE";
           else if (code == 'w')
             size = "WORD";
           else if (code == 'k')
             size = "DWORD";
+         else if (code == 'q')
+           size = "QWORD";
+         else if (code == 'x')
+           size = "XMMWORD";
  
           fputs (size, file);
           fputs (" PTR ", file);
@@ -14228,7 +14440,20 @@ ix86_print_operand_address (FILE *file, rtx addr)
    struct ix86_address parts;
    rtx base, index, disp;
    int scale;
-  int ok = ix86_decompose_address (addr, &parts);
+  int ok;
+  bool vsib = false;
+
+  if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR)
+    {
+      ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts);
+      gcc_assert (parts.index == NULL_RTX);
+      parts.index = XVECEXP (addr, 0, 1);
+      parts.scale = INTVAL (XVECEXP (addr, 0, 2));
+      addr = XVECEXP (addr, 0, 0);
+      vsib = true;
+    }
+  else
+    ok = ix86_decompose_address (addr, &parts);
  
    gcc_assert (ok);
  
@@ -14325,8 +14550,8 @@ ix86_print_operand_address (FILE *file, rtx addr)
           if (index)
             {
               putc (',', file);
-             print_reg (index, code, file);
-             if (scale != 1)
+             print_reg (index, vsib ? 0 : code, file);
+             if (scale != 1 || vsib)
                 fprintf (file, ",%d", scale);
             }
           putc (')', file);
@@ -14376,8 +14601,8 @@ ix86_print_operand_address (FILE *file, rtx addr)
           if (index)
             {
               putc ('+', file);
-             print_reg (index, code, file);
-             if (scale != 1)
+             print_reg (index, vsib ? 0 : code, file);
+             if (scale != 1 || vsib)
                 fprintf (file, "*%d", scale);
             }
           putc (']', file);
@@ -16139,7 +16364,6 @@ distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
    basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL;
    rtx prev = start;
    rtx next = NULL;
-  enum attr_type insn_type;
  
    *found = false;
  
@@ -16152,8 +16376,8 @@ distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2,
           distance = increase_distance (prev, next, distance);
           if (insn_defines_reg (regno1, regno2, prev))
             {
-             insn_type = get_attr_type (prev);
-             if (insn_type != TYPE_LEA)
+             if (recog_memoized (prev) < 0
+                 || get_attr_type (prev) != TYPE_LEA)
                 {
                   *found = true;
                   return distance;
@@ -16475,6 +16699,29 @@ ix86_avoid_lea_for_add (rtx insn, rtx operands[])
      return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1);
  }
  
+/* Return true if we should emit lea instruction instead of mov
+   instruction.  */
+
+bool
+ix86_use_lea_for_mov (rtx insn, rtx operands[])
+{
+  unsigned int regno0;
+  unsigned int regno1;
+
+  /* Check if we need to optimize.  */
+  if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
+    return false;
+
+  /* Use lea for reg to reg moves only.  */
+  if (!REG_P (operands[0]) || !REG_P (operands[1]))
+    return false;
+
+  regno0 = true_regnum (operands[0]);
+  regno1 = true_regnum (operands[1]);
+
+  return ix86_lea_outperforms (insn, regno0, regno1, -1, 0);
+}
+
  /* Return true if we need to split lea into a sequence of
     instructions to avoid AGU stalls. */
  
@@ -16836,7 +17083,7 @@ ix86_split_convert_uns_si_sse (rtx operands[])
  
    x = gen_rtx_REG (V4SImode, REGNO (value));
    if (vecmode == V4SFmode)
-    emit_insn (gen_sse2_cvttps2dq (x, value));
+    emit_insn (gen_fix_truncv4sfv4si2 (x, value));
    else
      emit_insn (gen_sse2_cvttpd2dq (x, value));
    value = x;
@@ -16995,6 +17242,95 @@ ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
      emit_move_insn (target, fp_hi);
  }
  
+/* floatunsv{4,8}siv{4,8}sf2 expander.  Expand code to convert
+   a vector of unsigned ints VAL to vector of floats TARGET.  */
+
+void
+ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
+{
+  rtx tmp[8];
+  REAL_VALUE_TYPE TWO16r;
+  enum machine_mode intmode = GET_MODE (val);
+  enum machine_mode fltmode = GET_MODE (target);
+  rtx (*cvt) (rtx, rtx);
+
+  if (intmode == V4SImode)
+    cvt = gen_floatv4siv4sf2;
+  else
+    cvt = gen_floatv8siv8sf2;
+  tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
+  tmp[0] = force_reg (intmode, tmp[0]);
+  tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
+                               OPTAB_DIRECT);
+  tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
+                               NULL_RTX, 1, OPTAB_DIRECT);
+  tmp[3] = gen_reg_rtx (fltmode);
+  emit_insn (cvt (tmp[3], tmp[1]));
+  tmp[4] = gen_reg_rtx (fltmode);
+  emit_insn (cvt (tmp[4], tmp[2]));
+  real_ldexp (&TWO16r, &dconst1, 16);
+  tmp[5] = const_double_from_real_value (TWO16r, SFmode);
+  tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
+  tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
+                               OPTAB_DIRECT);
+  tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
+                               OPTAB_DIRECT);
+  if (tmp[7] != target)
+    emit_move_insn (target, tmp[7]);
+}
+
+/* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
+   pattern can be used on it instead of *ufix_trunc* resp. fixuns_trunc*.
+   This is done by doing just signed conversion if < 0x1p31, and otherwise by
+   subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards.  */
+
+rtx
+ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
+{
+  REAL_VALUE_TYPE TWO31r;
+  rtx two31r, tmp[4];
+  enum machine_mode mode = GET_MODE (val);
+  enum machine_mode scalarmode = GET_MODE_INNER (mode);
+  enum machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
+  rtx (*cmp) (rtx, rtx, rtx, rtx);
+  int i;
+
+  for (i = 0; i < 3; i++)
+    tmp[i] = gen_reg_rtx (mode);
+  real_ldexp (&TWO31r, &dconst1, 31);
+  two31r = const_double_from_real_value (TWO31r, scalarmode);
+  two31r = ix86_build_const_vector (mode, 1, two31r);
+  two31r = force_reg (mode, two31r);
+  switch (mode)
+    {
+    case V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
+    case V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
+    case V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
+    case V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
+    default: gcc_unreachable ();
+    }
+  tmp[3] = gen_rtx_LE (mode, two31r, val);
+  emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
+  tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
+                               0, OPTAB_DIRECT);
+  if (intmode == V4SImode || TARGET_AVX2)
+    *xorp = expand_simple_binop (intmode, ASHIFT,
+                                gen_lowpart (intmode, tmp[0]),
+                                GEN_INT (31), NULL_RTX, 0,
+                                OPTAB_DIRECT);
+  else
+    {
+      rtx two31 = GEN_INT ((unsigned HOST_WIDE_INT) 1 << 31);
+      two31 = ix86_build_const_vector (intmode, 1, two31);
+      *xorp = expand_simple_binop (intmode, AND,
+                                  gen_lowpart (intmode, tmp[0]),
+                                  two31, NULL_RTX, 0,
+                                  OPTAB_DIRECT);
+    }
+  return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
+                             0, OPTAB_DIRECT);
+}
+
  /* A subroutine of ix86_build_signbit_mask.  If VECT is true,
     then replicate the value for all elements of the vector
     register.  */
@@ -17008,6 +17344,10 @@ ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
  
    switch (mode)
      {
+    case V32QImode:
+    case V16QImode:
+    case V16HImode:
+    case V8HImode:
      case V8SImode:
      case V4SImode:
      case V4DImode:
@@ -19185,8 +19525,51 @@ ix86_expand_int_vcond (rtx operands[])
    cop0 = operands[4];
    cop1 = operands[5];
  
-  /* XOP supports all of the comparisons on all vector int types.  */
-  if (!TARGET_XOP)
+  /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
+     and x < 0 ? 1 : 0 into (unsigned) x >> 31.  */
+  if ((code == LT || code == GE)
+      && data_mode == mode
+      && cop1 == CONST0_RTX (mode)
+      && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
+      && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1
+      && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8
+      && (GET_MODE_SIZE (data_mode) == 16
+         || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
+    {
+      rtx negop = operands[2 - (code == LT)];
+      int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1;
+      if (negop == CONST1_RTX (data_mode))
+       {
+         rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
+                                        operands[0], 1, OPTAB_DIRECT);
+         if (res != operands[0])
+           emit_move_insn (operands[0], res);
+         return true;
+       }
+      else if (GET_MODE_INNER (data_mode) != DImode
+              && vector_all_ones_operand (negop, data_mode))
+       {
+         rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
+                                        operands[0], 0, OPTAB_DIRECT);
+         if (res != operands[0])
+           emit_move_insn (operands[0], res);
+         return true;
+       }
+    }
+
+  if (!nonimmediate_operand (cop1, mode))
+    cop1 = force_reg (mode, cop1);
+  if (!general_operand (operands[1], data_mode))
+    operands[1] = force_reg (data_mode, operands[1]);
+  if (!general_operand (operands[2], data_mode))
+    operands[2] = force_reg (data_mode, operands[2]);
+
+  /* XOP supports all of the comparisons on all 128-bit vector int types.  */
+  if (TARGET_XOP
+      && (mode == V16QImode || mode == V8HImode
+         || mode == V4SImode || mode == V2DImode))
+    ;
+  else
      {
        /* Canonicalize the comparison to EQ, GT, GTU.  */
        switch (code)
@@ -19334,7 +19717,7 @@ ix86_expand_vec_perm (rtx operands[])
    rtx op0 = operands[1];
    rtx op1 = operands[2];
    rtx mask = operands[3];
-  rtx t1, t2, vt, vec[16];
+  rtx t1, t2, t3, t4, vt, vt2, vec[32];
    enum machine_mode mode = GET_MODE (op0);
    enum machine_mode maskmode = GET_MODE (mask);
    int w, e, i;
@@ -19343,50 +19726,68 @@ ix86_expand_vec_perm (rtx operands[])
    /* Number of elements in the vector.  */
    w = GET_MODE_NUNITS (mode);
    e = GET_MODE_UNIT_SIZE (mode);
-  gcc_assert (w <= 16);
+  gcc_assert (w <= 32);
  
    if (TARGET_AVX2)
      {
-      if (mode == V4DImode || mode == V4DFmode)
+      if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
         {
           /* Unfortunately, the VPERMQ and VPERMPD instructions only support
              an constant shuffle operand.  With a tiny bit of effort we can
              use VPERMD instead.  A re-interpretation stall for V4DFmode is
-            unfortunate but there's no avoiding it.  */
-         t1 = gen_reg_rtx (V8SImode);
+            unfortunate but there's no avoiding it.
+            Similarly for V16HImode we don't have instructions for variable
+            shuffling, while for V32QImode we can use after preparing suitable
+            masks vpshufb; vpshufb; vpermq; vpor.  */
+
+         if (mode == V16HImode)
+           {
+             maskmode = mode = V32QImode;
+             w = 32;
+             e = 1;
+           }
+         else
+           {
+             maskmode = mode = V8SImode;
+             w = 8;
+             e = 4;
+           }
+         t1 = gen_reg_rtx (maskmode);
  
           /* Replicate the low bits of the V4DImode mask into V8SImode:
                mask = { A B C D }
                t1 = { A A B B C C D D }.  */
-         for (i = 0; i < 4; ++i)
+         for (i = 0; i < w / 2; ++i)
             vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
-         vt = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, vec));
-         vt = force_reg (V8SImode, vt);
-         mask = gen_lowpart (V8SImode, mask);
-         emit_insn (gen_avx2_permvarv8si (t1, vt, mask));
+         vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
+         vt = force_reg (maskmode, vt);
+         mask = gen_lowpart (maskmode, mask);
+         if (maskmode == V8SImode)
+           emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
+         else
+           emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
  
           /* Multiply the shuffle indicies by two.  */
-         emit_insn (gen_avx2_lshlv8si3 (t1, t1, const1_rtx));
+         t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
+                                   OPTAB_DIRECT);
  
           /* Add one to the odd shuffle indicies:
                 t1 = { A*2, A*2+1, B*2, B*2+1, ... }.  */
-         for (i = 0; i < 4; ++i)
+         for (i = 0; i < w / 2; ++i)
             {
               vec[i * 2] = const0_rtx;
               vec[i * 2 + 1] = const1_rtx;
             }
-         vt = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, vec));
-         vt = force_const_mem (V8SImode, vt);
-         emit_insn (gen_addv8si3 (t1, t1, vt));
+         vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
+         vt = force_const_mem (maskmode, vt);
+         t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
+                                   OPTAB_DIRECT);
  
-         /* Continue as if V8SImode was used initially.  */
+         /* Continue as if V8SImode (resp. V32QImode) was used initially.  */
           operands[3] = mask = t1;
-         target = gen_lowpart (V8SImode, target);
-         op0 = gen_lowpart (V8SImode, op0);
-         op1 = gen_lowpart (V8SImode, op1);
-         maskmode = mode = V8SImode;
-         w = 8;
-         e = 4;
+         target = gen_lowpart (mode, target);
+         op0 = gen_lowpart (mode, op0);
+         op1 = gen_lowpart (mode, op1);
         }
  
        switch (mode)
@@ -19396,13 +19797,13 @@ ix86_expand_vec_perm (rtx operands[])
              the high bits of the shuffle elements.  No need for us to
              perform an AND ourselves.  */
           if (one_operand_shuffle)
-           emit_insn (gen_avx2_permvarv8si (target, mask, op0));
+           emit_insn (gen_avx2_permvarv8si (target, op0, mask));
           else
             {
               t1 = gen_reg_rtx (V8SImode);
               t2 = gen_reg_rtx (V8SImode);
-             emit_insn (gen_avx2_permvarv8si (t1, mask, op0));
-             emit_insn (gen_avx2_permvarv8si (t2, mask, op1));
+             emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
+             emit_insn (gen_avx2_permvarv8si (t2, op0, mask));
               goto merge_two;
             }
           return;
@@ -19410,13 +19811,13 @@ ix86_expand_vec_perm (rtx operands[])
         case V8SFmode:
           mask = gen_lowpart (V8SFmode, mask);
           if (one_operand_shuffle)
-           emit_insn (gen_avx2_permvarv8sf (target, mask, op0));
+           emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
           else
             {
               t1 = gen_reg_rtx (V8SFmode);
               t2 = gen_reg_rtx (V8SFmode);
-             emit_insn (gen_avx2_permvarv8sf (t1, mask, op0));
-             emit_insn (gen_avx2_permvarv8sf (t2, mask, op1));
+             emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
+             emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
               goto merge_two;
             }
           return;
@@ -19429,7 +19830,7 @@ ix86_expand_vec_perm (rtx operands[])
           t2 = gen_reg_rtx (V8SImode);
           emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
           emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
-         emit_insn (gen_avx2_permvarv8si (t1, t2, t1));
+         emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
           emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
           return;
  
@@ -19439,10 +19840,96 @@ ix86_expand_vec_perm (rtx operands[])
           mask = gen_lowpart (V4SFmode, mask);
           emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
           emit_insn (gen_avx_vec_concatv8sf (t2, mask, mask));
-         emit_insn (gen_avx2_permvarv8sf (t1, t2, t1));
+         emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
           emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
           return;
  
+       case V32QImode:
+         t1 = gen_reg_rtx (V32QImode);
+         t2 = gen_reg_rtx (V32QImode);
+         t3 = gen_reg_rtx (V32QImode);
+         vt2 = GEN_INT (128);
+         for (i = 0; i < 32; i++)
+           vec[i] = vt2;
+         vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
+         vt = force_reg (V32QImode, vt);
+         for (i = 0; i < 32; i++)
+           vec[i] = i < 16 ? vt2 : const0_rtx;
+         vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
+         vt2 = force_reg (V32QImode, vt2);
+         /* From mask create two adjusted masks, which contain the same
+            bits as mask in the low 7 bits of each vector element.
+            The first mask will have the most significant bit clear
+            if it requests element from the same 128-bit lane
+            and MSB set if it requests element from the other 128-bit lane.
+            The second mask will have the opposite values of the MSB,
+            and additionally will have its 128-bit lanes swapped.
+            E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
+            t1   { 07 92 9e 09 ... | 17 19 85 1f ... } and
+            t3   { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
+            stands for other 12 bytes.  */
+         /* The bit whether element is from the same lane or the other
+            lane is bit 4, so shift it up by 3 to the MSB position.  */
+         emit_insn (gen_ashlv4di3 (gen_lowpart (V4DImode, t1),
+                                   gen_lowpart (V4DImode, mask),
+                                   GEN_INT (3)));
+         /* Clear MSB bits from the mask just in case it had them set.  */
+         emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
+         /* After this t1 will have MSB set for elements from other lane.  */
+         emit_insn (gen_xorv32qi3 (t1, t1, vt2));
+         /* Clear bits other than MSB.  */
+         emit_insn (gen_andv32qi3 (t1, t1, vt));
+         /* Or in the lower bits from mask into t3.  */
+         emit_insn (gen_iorv32qi3 (t3, t1, t2));
+         /* And invert MSB bits in t1, so MSB is set for elements from the same
+            lane.  */
+         emit_insn (gen_xorv32qi3 (t1, t1, vt));
+         /* Swap 128-bit lanes in t3.  */
+         emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
+                                         gen_lowpart (V4DImode, t3),
+                                         const2_rtx, GEN_INT (3),
+                                         const0_rtx, const1_rtx));
+         /* And or in the lower bits from mask into t1.  */
+         emit_insn (gen_iorv32qi3 (t1, t1, t2));
+         if (one_operand_shuffle)
+           {
+             /* Each of these shuffles will put 0s in places where
+                element from the other 128-bit lane is needed, otherwise
+                will shuffle in the requested value.  */
+             emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3));
+             emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
+             /* For t3 the 128-bit lanes are swapped again.  */
+             emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
+                                             gen_lowpart (V4DImode, t3),
+                                             const2_rtx, GEN_INT (3),
+                                             const0_rtx, const1_rtx));
+             /* And oring both together leads to the result.  */
+             emit_insn (gen_iorv32qi3 (target, t1, t3));
+             return;
+           }
+
+         t4 = gen_reg_rtx (V32QImode);
+         /* Similarly to the above one_operand_shuffle code,
+            just for repeated twice for each operand.  merge_two:
+            code will merge the two results together.  */
+         emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3));
+         emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3));
+         emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
+         emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
+         emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4),
+                                         gen_lowpart (V4DImode, t4),
+                                         const2_rtx, GEN_INT (3),
+                                         const0_rtx, const1_rtx));
+         emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3),
+                                         gen_lowpart (V4DImode, t3),
+                                         const2_rtx, GEN_INT (3),
+                                         const0_rtx, const1_rtx));
+         emit_insn (gen_iorv32qi3 (t4, t2, t4));
+         emit_insn (gen_iorv32qi3 (t3, t1, t3));
+         t1 = t4;
+         t2 = t3;
+         goto merge_two;
+
         default:
           gcc_assert (GET_MODE_SIZE (mode) <= 16);
           break;
@@ -19555,7 +20042,7 @@ ix86_expand_vec_perm (rtx operands[])
        mask = expand_simple_binop (maskmode, AND, mask, vt,
                                   NULL_RTX, 0, OPTAB_DIRECT);
  
-      xops[0] = operands[0];
+      xops[0] = gen_lowpart (mode, operands[0]);
        xops[1] = gen_lowpart (mode, t2);
        xops[2] = gen_lowpart (mode, t1);
        xops[3] = gen_rtx_EQ (maskmode, mask, vt);
@@ -19579,9 +20066,38 @@ ix86_expand_sse_unpack (rtx operands[2], bool unsigned_p, bool high_p)
    if (TARGET_SSE4_1)
      {
        rtx (*unpack)(rtx, rtx);
+      rtx (*extract)(rtx, rtx) = NULL;
+      enum machine_mode halfmode = BLKmode;
  
        switch (imode)
         {
+       case V32QImode:
+         if (unsigned_p)
+           unpack = gen_avx2_zero_extendv16qiv16hi2;
+         else
+           unpack = gen_avx2_sign_extendv16qiv16hi2;
+         halfmode = V16QImode;
+         extract
+           = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
+         break;
+       case V16HImode:
+         if (unsigned_p)
+           unpack = gen_avx2_zero_extendv8hiv8si2;
+         else
+           unpack = gen_avx2_sign_extendv8hiv8si2;
+         halfmode = V8HImode;
+         extract
+           = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
+         break;
+       case V8SImode:
+         if (unsigned_p)
+           unpack = gen_avx2_zero_extendv4siv4di2;
+         else
+           unpack = gen_avx2_sign_extendv4siv4di2;
+         halfmode = V4SImode;
+         extract
+           = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
+         break;
         case V16QImode:
           if (unsigned_p)
             unpack = gen_sse4_1_zero_extendv8qiv8hi2;
@@ -19604,7 +20120,12 @@ ix86_expand_sse_unpack (rtx operands[2], bool unsigned_p, bool high_p)
           gcc_unreachable ();
         }
  
-      if (high_p)
+      if (GET_MODE_SIZE (imode) == 32)
+       {
+         tmp = gen_reg_rtx (halfmode);
+         emit_insn (extract (tmp, operands[1]));
+       }
+      else if (high_p)
         {
           /* Shift higher 8 bytes to lower 8 bytes.  */
           tmp = gen_reg_rtx (imode);
@@ -24291,6 +24812,7 @@ enum ix86_builtins
    IX86_BUILTIN_CVTTPS2DQ,
  
    IX86_BUILTIN_MOVNTI,
+  IX86_BUILTIN_MOVNTI64,
    IX86_BUILTIN_MOVNTPD,
    IX86_BUILTIN_MOVNTDQ,
  
@@ -24503,22 +25025,32 @@ enum ix86_builtins
    IX86_BUILTIN_PMULDQ128,
    IX86_BUILTIN_PMULLD128,
  
-  IX86_BUILTIN_ROUNDPD,
-  IX86_BUILTIN_ROUNDPS,
    IX86_BUILTIN_ROUNDSD,
    IX86_BUILTIN_ROUNDSS,
  
+  IX86_BUILTIN_ROUNDPD,
+  IX86_BUILTIN_ROUNDPS,
+
    IX86_BUILTIN_FLOORPD,
    IX86_BUILTIN_CEILPD,
    IX86_BUILTIN_TRUNCPD,
    IX86_BUILTIN_RINTPD,
    IX86_BUILTIN_ROUNDPD_AZ,
+
+  IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX,
+  IX86_BUILTIN_CEILPD_VEC_PACK_SFIX,
+  IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX,
+
    IX86_BUILTIN_FLOORPS,
    IX86_BUILTIN_CEILPS,
    IX86_BUILTIN_TRUNCPS,
    IX86_BUILTIN_RINTPS,
    IX86_BUILTIN_ROUNDPS_AZ,
  
+  IX86_BUILTIN_FLOORPS_SFIX,
+  IX86_BUILTIN_CEILPS_SFIX,
+  IX86_BUILTIN_ROUNDPS_AZ_SFIX,
+
    IX86_BUILTIN_PTESTZ,
    IX86_BUILTIN_PTESTC,
    IX86_BUILTIN_PTESTNZC,
@@ -24542,6 +25074,7 @@ enum ix86_builtins
    IX86_BUILTIN_VEC_SET_V16QI,
  
    IX86_BUILTIN_VEC_PACK_SFIX,
+  IX86_BUILTIN_VEC_PACK_SFIX256,
  
    /* SSE4.2.  */
    IX86_BUILTIN_CRC32QI,
@@ -24691,12 +25224,21 @@ enum ix86_builtins
    IX86_BUILTIN_TRUNCPD256,
    IX86_BUILTIN_RINTPD256,
    IX86_BUILTIN_ROUNDPD_AZ256,
+
+  IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256,
+  IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256,
+  IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256,
+
    IX86_BUILTIN_FLOORPS256,
    IX86_BUILTIN_CEILPS256,
    IX86_BUILTIN_TRUNCPS256,
    IX86_BUILTIN_RINTPS256,
    IX86_BUILTIN_ROUNDPS_AZ256,
  
+  IX86_BUILTIN_FLOORPS_SFIX256,
+  IX86_BUILTIN_CEILPS_SFIX256,
+  IX86_BUILTIN_ROUNDPS_AZ_SFIX256,
+
    IX86_BUILTIN_UNPCKHPD256,
    IX86_BUILTIN_UNPCKLPD256,
    IX86_BUILTIN_UNPCKHPS256,
@@ -24902,6 +25444,13 @@ enum ix86_builtins
    IX86_BUILTIN_GATHERDIV4SI,
    IX86_BUILTIN_GATHERDIV8SI,
  
+  /* Alternate 4 element gather for the vectorizer where
+     all operands are 32-byte wide.  */
+  IX86_BUILTIN_GATHERALTSIV4DF,
+  IX86_BUILTIN_GATHERALTDIV8SF,
+  IX86_BUILTIN_GATHERALTSIV4DI,
+  IX86_BUILTIN_GATHERALTDIV8SI,
+
    /* TFmode support builtins.  */
    IX86_BUILTIN_INFQ,
    IX86_BUILTIN_HUGE_VALQ,
@@ -24914,21 +25463,6 @@ enum ix86_builtins
    IX86_BUILTIN_CPYSGNPS256,
    IX86_BUILTIN_CPYSGNPD256,
  
-  IX86_BUILTIN_CVTUDQ2PS,
-
-  IX86_BUILTIN_VEC_PERM_V2DF,
-  IX86_BUILTIN_VEC_PERM_V4SF,
-  IX86_BUILTIN_VEC_PERM_V2DI,
-  IX86_BUILTIN_VEC_PERM_V4SI,
-  IX86_BUILTIN_VEC_PERM_V8HI,
-  IX86_BUILTIN_VEC_PERM_V16QI,
-  IX86_BUILTIN_VEC_PERM_V2DI_U,
-  IX86_BUILTIN_VEC_PERM_V4SI_U,
-  IX86_BUILTIN_VEC_PERM_V8HI_U,
-  IX86_BUILTIN_VEC_PERM_V16QI_U,
-  IX86_BUILTIN_VEC_PERM_V4DF,
-  IX86_BUILTIN_VEC_PERM_V8SF,
-
    /* FMA4 instructions.  */
    IX86_BUILTIN_VFMADDSS,
    IX86_BUILTIN_VFMADDSD,
@@ -25352,7 +25886,7 @@ static const struct builtin_description bdesc_special_args[] =
  
    /* SSE or 3DNow!A  */
    { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
-  { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntdi, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
+  { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
  
    /* SSE2 */
    { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
@@ -25361,7 +25895,8 @@ static const struct builtin_description bdesc_special_args[] =
    { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
    { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
    { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntsi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
+  { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG },
    { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
    { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
  
@@ -25637,25 +26172,11 @@ static const struct builtin_description bdesc_args[] =
    /* SSE2 */
    { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
  
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v2df", IX86_BUILTIN_VEC_PERM_V2DF, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DI },
-  { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v4sf", IX86_BUILTIN_VEC_PERM_V4SF, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SI },
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v2di", IX86_BUILTIN_VEC_PERM_V2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_V2DI },
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v4si", IX86_BUILTIN_VEC_PERM_V4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI },
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v8hi", IX86_BUILTIN_VEC_PERM_V8HI, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_V8HI },
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v16qi", IX86_BUILTIN_VEC_PERM_V16QI, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v2di_u", IX86_BUILTIN_VEC_PERM_V2DI_U, UNKNOWN, (int) V2UDI_FTYPE_V2UDI_V2UDI_V2UDI },
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v4si_u", IX86_BUILTIN_VEC_PERM_V4SI_U, UNKNOWN, (int) V4USI_FTYPE_V4USI_V4USI_V4USI },
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v8hi_u", IX86_BUILTIN_VEC_PERM_V8HI_U, UNKNOWN, (int) V8UHI_FTYPE_V8UHI_V8UHI_V8UHI },
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v16qi_u", IX86_BUILTIN_VEC_PERM_V16QI_U, UNKNOWN, (int) V16UQI_FTYPE_V16UQI_V16UQI_V16UQI },
-  { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v4df", IX86_BUILTIN_VEC_PERM_V4DF, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DI },
-  { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v8sf", IX86_BUILTIN_VEC_PERM_V8SF, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SI },
-
    { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF  },
    { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
    { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
    { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2ps, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtudq2ps, "__builtin_ia32_cvtudq2ps", IX86_BUILTIN_CVTUDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_floatv4siv4sf2, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
  
    { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
    { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
@@ -25672,7 +26193,7 @@ static const struct builtin_description bdesc_args[] =
  
    { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
    { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttps2dq, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_fix_truncv4sfv4si2, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
  
    { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
    { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
@@ -25923,14 +26444,22 @@ static const struct builtin_description bdesc_args[] =
    { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
    { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },
  
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND },
+
    { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF },
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },
  
    { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
    { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
    { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
    { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },
  
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND },
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND },
+
    { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF },
+  { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF },
  
    { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
    { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
@@ -26010,14 +26539,14 @@ static const struct builtin_description bdesc_args[] =
    { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
    { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
    { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
-  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtdq2pd256, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
-  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtdq2ps256, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_floatv4siv4df2, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_floatv8siv8sf2, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
    { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
    { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
    { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
-  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvttpd2dq256, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv4dfv4si2, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
    { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
-  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvttps2dq256, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_fix_truncv8sfv8si2, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
    { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
    { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
    { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
@@ -26050,13 +26579,21 @@ static const struct builtin_description bdesc_args[] =
    { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },
  
    { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
+
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND },
  
    { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
    { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
    { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
    { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },
  
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND },
+
    { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
  
    { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256,  "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
    { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256,  "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
@@ -26092,6 +26629,8 @@ static const struct builtin_description bdesc_args[] =
    { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3,  "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
    { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3,  "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
  
+  { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256 ", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF },
+
    /* AVX2 */
    { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT },
    { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI },
@@ -26174,12 +26713,12 @@ static const struct builtin_description bdesc_args[] =
    { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv16hi3, "__builtin_ia32_psignw256", IX86_BUILTIN_PSIGNW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
    { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psignv8si3 , "__builtin_ia32_psignd256", IX86_BUILTIN_PSIGND256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
    { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlv2ti3, "__builtin_ia32_pslldqi256", IX86_BUILTIN_PSLLDQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_CONVERT },
-  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
-  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
-  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
-  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
-  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
-  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllwi256", IX86_BUILTIN_PSLLWI256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv16hi3, "__builtin_ia32_psllw256", IX86_BUILTIN_PSLLW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslldi256", IX86_BUILTIN_PSLLDI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv8si3, "__builtin_ia32_pslld256", IX86_BUILTIN_PSLLD256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_COUNT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllqi256", IX86_BUILTIN_PSLLQI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT_COUNT },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_ashlv4di3, "__builtin_ia32_psllq256", IX86_BUILTIN_PSLLQ256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_COUNT },
    { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psrawi256", IX86_BUILTIN_PSRAWI256, UNKNOWN, (int) V16HI_FTYPE_V16HI_SI_COUNT },
    { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv16hi3, "__builtin_ia32_psraw256", IX86_BUILTIN_PSRAW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V8HI_COUNT },
    { OPTION_MASK_ISA_AVX2, CODE_FOR_ashrv8si3, "__builtin_ia32_psradi256", IX86_BUILTIN_PSRADI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_SI_COUNT },
@@ -26224,15 +26763,15 @@ static const struct builtin_description bdesc_args[] =
    { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
    { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
    { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
-  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
    { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
    { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
    { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
    { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_inserti128, "__builtin_ia32_insert128i256", IX86_BUILTIN_VINSERT128I256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V2DI_INT },
-  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
-  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
-  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
-  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4di, "__builtin_ia32_psllv4di", IX86_BUILTIN_PSLLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv2di, "__builtin_ia32_psllv2di", IX86_BUILTIN_PSLLVV2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv8si, "__builtin_ia32_psllv8si", IX86_BUILTIN_PSLLVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashlvv4si, "__builtin_ia32_psllv4si", IX86_BUILTIN_PSLLVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
    { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv8si, "__builtin_ia32_psrav8si", IX86_BUILTIN_PSRAVV8SI, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
    { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_ashrvv4si, "__builtin_ia32_psrav4si", IX86_BUILTIN_PSRAVV4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
    { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_lshrvv4di, "__builtin_ia32_psrlv4di", IX86_BUILTIN_PSRLVV4DI, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
@@ -26401,14 +26940,14 @@ static const struct builtin_description bdesc_multi_arg[] =
    { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3,         "__builtin_ia32_vprotdi",     IX86_BUILTIN_VPROTD_IMM,  UNKNOWN,      (int)MULTI_ARG_2_SI_IMM },
    { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3,         "__builtin_ia32_vprotwi",     IX86_BUILTIN_VPROTW_IMM,  UNKNOWN,      (int)MULTI_ARG_2_HI_IMM },
    { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3,        "__builtin_ia32_vprotbi",     IX86_BUILTIN_VPROTB_IMM,  UNKNOWN,      (int)MULTI_ARG_2_QI_IMM },
-  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_ashlv2di3,         "__builtin_ia32_vpshaq",      IX86_BUILTIN_VPSHAQ,      UNKNOWN,      (int)MULTI_ARG_2_DI },
-  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_ashlv4si3,         "__builtin_ia32_vpshad",      IX86_BUILTIN_VPSHAD,      UNKNOWN,      (int)MULTI_ARG_2_SI },
-  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_ashlv8hi3,         "__builtin_ia32_vpshaw",      IX86_BUILTIN_VPSHAW,      UNKNOWN,      (int)MULTI_ARG_2_HI },
-  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_ashlv16qi3,        "__builtin_ia32_vpshab",      IX86_BUILTIN_VPSHAB,      UNKNOWN,      (int)MULTI_ARG_2_QI },
-  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_lshlv2di3,         "__builtin_ia32_vpshlq",      IX86_BUILTIN_VPSHLQ,      UNKNOWN,      (int)MULTI_ARG_2_DI },
-  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_lshlv4si3,         "__builtin_ia32_vpshld",      IX86_BUILTIN_VPSHLD,      UNKNOWN,      (int)MULTI_ARG_2_SI },
-  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_lshlv8hi3,         "__builtin_ia32_vpshlw",      IX86_BUILTIN_VPSHLW,      UNKNOWN,      (int)MULTI_ARG_2_HI },
-  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_lshlv16qi3,        "__builtin_ia32_vpshlb",      IX86_BUILTIN_VPSHLB,      UNKNOWN,      (int)MULTI_ARG_2_QI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav2di3,         "__builtin_ia32_vpshaq",      IX86_BUILTIN_VPSHAQ,      UNKNOWN,      (int)MULTI_ARG_2_DI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav4si3,         "__builtin_ia32_vpshad",      IX86_BUILTIN_VPSHAD,      UNKNOWN,      (int)MULTI_ARG_2_SI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav8hi3,         "__builtin_ia32_vpshaw",      IX86_BUILTIN_VPSHAW,      UNKNOWN,      (int)MULTI_ARG_2_HI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shav16qi3,        "__builtin_ia32_vpshab",      IX86_BUILTIN_VPSHAB,      UNKNOWN,      (int)MULTI_ARG_2_QI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv2di3,         "__builtin_ia32_vpshlq",      IX86_BUILTIN_VPSHLQ,      UNKNOWN,      (int)MULTI_ARG_2_DI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv4si3,         "__builtin_ia32_vpshld",      IX86_BUILTIN_VPSHLD,      UNKNOWN,      (int)MULTI_ARG_2_SI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3,         "__builtin_ia32_vpshlw",      IX86_BUILTIN_VPSHLW,      UNKNOWN,      (int)MULTI_ARG_2_HI },
+  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3,        "__builtin_ia32_vpshlb",      IX86_BUILTIN_VPSHLB,      UNKNOWN,      (int)MULTI_ARG_2_QI },
  
    { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2,       "__builtin_ia32_vfrczss",     IX86_BUILTIN_VFRCZSS,     UNKNOWN,      (int)MULTI_ARG_2_SF },
    { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2,       "__builtin_ia32_vfrczsd",     IX86_BUILTIN_VFRCZSD,     UNKNOWN,      (int)MULTI_ARG_2_DF },
@@ -26521,6 +27060,159 @@ static const struct builtin_description bdesc_multi_arg[] =
    { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3,     "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
  
  };
+\f
+/* TM vector builtins.  */
+
+/* Reuse the existing x86-specific `struct builtin_description' cause
+   we're lazy.  Add casts to make them fit.  */
+static const struct builtin_description bdesc_tm[] =
+{
+  { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
+  { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI },
+
+  { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF },
+
+  { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF },
+
+  { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID },
+  { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID },
+  { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, VOID_FTYPE_PCVOID },
+};
+
+/* TM callbacks.  */
+
+/* Return the builtin decl needed to load a vector of TYPE.  */
+
+static tree
+ix86_builtin_tm_load (tree type)
+{
+  if (TREE_CODE (type) == VECTOR_TYPE)
+    {
+      switch (tree_low_cst (TYPE_SIZE (type), 1))
+       {
+       case 64:
+         return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64);
+       case 128:
+         return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128);
+       case 256:
+         return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256);
+       }
+    }
+  return NULL_TREE;
+}
+
+/* Return the builtin decl needed to store a vector of TYPE.  */
+
+static tree
+ix86_builtin_tm_store (tree type)
+{
+  if (TREE_CODE (type) == VECTOR_TYPE)
+    {
+      switch (tree_low_cst (TYPE_SIZE (type), 1))
+       {
+       case 64:
+         return builtin_decl_explicit (BUILT_IN_TM_STORE_M64);
+       case 128:
+         return builtin_decl_explicit (BUILT_IN_TM_STORE_M128);
+       case 256:
+         return builtin_decl_explicit (BUILT_IN_TM_STORE_M256);
+       }
+    }
+  return NULL_TREE;
+}
+\f
+/* Initialize the transactional memory vector load/store builtins.  */
+
+static void
+ix86_init_tm_builtins (void)
+{
+  enum ix86_builtin_func_type ftype;
+  const struct builtin_description *d;
+  size_t i;
+  tree decl;
+  tree attrs_load, attrs_type_load, attrs_store, attrs_type_store;
+  tree attrs_log, attrs_type_log;
+
+  if (!flag_tm)
+    return;
+
+  /* If there are no builtins defined, we must be compiling in a
+     language without trans-mem support.  */
+  if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1))
+    return;
+
+  /* Use whatever attributes a normal TM load has.  */
+  decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1);
+  attrs_load = DECL_ATTRIBUTES (decl);
+  attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl));
+  /* Use whatever attributes a normal TM store has.  */
+  decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1);
+  attrs_store = DECL_ATTRIBUTES (decl);
+  attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl));
+  /* Use whatever attributes a normal TM log has.  */
+  decl = builtin_decl_explicit (BUILT_IN_TM_LOG);
+  attrs_log = DECL_ATTRIBUTES (decl);
+  attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl));
+
+  for (i = 0, d = bdesc_tm;
+       i < ARRAY_SIZE (bdesc_tm);
+       i++, d++)
+    {
+      if ((d->mask & ix86_isa_flags) != 0
+         || (lang_hooks.builtin_function
+             == lang_hooks.builtin_function_ext_scope))
+       {
+         tree type, attrs, attrs_type;
+         enum built_in_function code = (enum built_in_function) d->code;
+
+         ftype = (enum ix86_builtin_func_type) d->flag;
+         type = ix86_get_builtin_func_type (ftype);
+
+         if (BUILTIN_TM_LOAD_P (code))
+           {
+             attrs = attrs_load;
+             attrs_type = attrs_type_load;
+           }
+         else if (BUILTIN_TM_STORE_P (code))
+           {
+             attrs = attrs_store;
+             attrs_type = attrs_type_store;
+           }
+         else
+           {
+             attrs = attrs_log;
+             attrs_type = attrs_type_log;
+           }
+         decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL,
+                                      /* The builtin without the prefix for
+                                         calling it directly.  */
+                                      d->name + strlen ("__builtin_"),
+                                      attrs);
+         /* add_builtin_function() will set the DECL_ATTRIBUTES, now
+            set the TYPE_ATTRIBUTES.  */
+         decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN);
+
+         set_builtin_decl (code, decl, false);
+       }
+    }
+}
  
  /* Set up all the MMX/SSE builtins, even builtins for instructions that are not
     in the current target ISA to allow the user to compile particular modules
@@ -26709,6 +27401,22 @@ ix86_init_mmx_sse_builtins (void)
                V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT,
                IX86_BUILTIN_GATHERDIV8SI);
  
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ",
+              V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT,
+              IX86_BUILTIN_GATHERALTSIV4DF);
+
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ",
+              V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT,
+              IX86_BUILTIN_GATHERALTDIV8SF);
+
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ",
+              V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT,
+              IX86_BUILTIN_GATHERALTSIV4DI);
+
+  def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ",
+              V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT,
+              IX86_BUILTIN_GATHERALTDIV8SI);
+
    /* MMX access to the vec_init patterns.  */
    def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
                      V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
@@ -26878,6 +27586,7 @@ ix86_init_builtins (void)
    TREE_READONLY (t) = 1;
    ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
  
+  ix86_init_tm_builtins ();
    ix86_init_mmx_sse_builtins ();
  
    if (TARGET_LP64)
@@ -27330,7 +28039,7 @@ ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
    return SUBREG_REG (target);
  }
  
-/* Subroutine of ix86_expand_args_builtin to take care of round insns.  */
+/* Subroutines of ix86_expand_args_builtin to take care of round insns.  */
  
  static rtx
  ix86_expand_sse_round (const struct builtin_description *d, tree exp,
@@ -27363,6 +28072,44 @@ ix86_expand_sse_round (const struct builtin_description *d, tree exp,
    return target;
  }
  
+static rtx
+ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
+                                    tree exp, rtx target)
+{
+  rtx pat;
+  tree arg0 = CALL_EXPR_ARG (exp, 0);
+  tree arg1 = CALL_EXPR_ARG (exp, 1);
+  rtx op0 = expand_normal (arg0);
+  rtx op1 = expand_normal (arg1);
+  rtx op2;
+  enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
+  enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
+  enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
+
+  if (optimize || target == 0
+      || GET_MODE (target) != tmode
+      || !insn_data[d->icode].operand[0].predicate (target, tmode))
+    target = gen_reg_rtx (tmode);
+
+  op0 = safe_vector_operand (op0, mode0);
+  op1 = safe_vector_operand (op1, mode1);
+
+  if ((optimize && !register_operand (op0, mode0))
+      || !insn_data[d->icode].operand[0].predicate (op0, mode0))
+    op0 = copy_to_mode_reg (mode0, op0);
+  if ((optimize && !register_operand (op1, mode1))
+      || !insn_data[d->icode].operand[1].predicate (op1, mode1))
+    op1 = copy_to_mode_reg (mode1, op1);
+
+  op2 = GEN_INT (d->comparison);
+
+  pat = GEN_FCN (d->icode) (target, op0, op1, op2);
+  if (! pat)
+    return 0;
+  emit_insn (pat);
+  return target;
+}
+
  /* Subroutine of ix86_expand_builtin to take care of ptest insns.  */
  
  static rtx
@@ -27636,7 +28383,12 @@ ix86_expand_args_builtin (const struct builtin_description *d,
      case V4DF_FTYPE_V4DF_ROUND:
      case V4SF_FTYPE_V4SF_ROUND:
      case V8SF_FTYPE_V8SF_ROUND:
+    case V4SI_FTYPE_V4SF_ROUND:
+    case V8SI_FTYPE_V8SF_ROUND:
        return ix86_expand_sse_round (d, exp, target);
+    case V4SI_FTYPE_V2DF_V2DF_ROUND:
+    case V8SI_FTYPE_V4DF_V4DF_ROUND:
+      return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
      case INT_FTYPE_V8SF_V8SF_PTEST:
      case INT_FTYPE_V4DI_V4DI_PTEST:
      case INT_FTYPE_V4DF_V4DF_PTEST:
@@ -27766,6 +28518,7 @@ ix86_expand_args_builtin (const struct builtin_description *d,
      case V32QI_FTYPE_V32QI_V32QI:
      case V16HI_FTYPE_V32QI_V32QI:
      case V16HI_FTYPE_V16HI_V16HI:
+    case V8SI_FTYPE_V4DF_V4DF:
      case V8SI_FTYPE_V8SI_V8SI:
      case V8SI_FTYPE_V16HI_V16HI:
      case V4DI_FTYPE_V4DI_V4DI:
@@ -27952,15 +28705,22 @@ ix86_expand_args_builtin (const struct builtin_description *d,
                 error ("the last argument must be an 1-bit immediate");
                 return const0_rtx;
  
-             case CODE_FOR_sse4_1_roundpd:
-             case CODE_FOR_sse4_1_roundps:
               case CODE_FOR_sse4_1_roundsd:
               case CODE_FOR_sse4_1_roundss:
+
+             case CODE_FOR_sse4_1_roundpd:
+             case CODE_FOR_sse4_1_roundps:
+             case CODE_FOR_avx_roundpd256:
+             case CODE_FOR_avx_roundps256:
+
+             case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
+             case CODE_FOR_sse4_1_roundps_sfix:
+             case CODE_FOR_avx_roundpd_vec_pack_sfix256:
+             case CODE_FOR_avx_roundps_sfix256:
+
               case CODE_FOR_sse4_1_blendps:
               case CODE_FOR_avx_blendpd256:
               case CODE_FOR_avx_vpermilv4df:
-             case CODE_FOR_avx_roundpd256:
-             case CODE_FOR_avx_roundps256:
                 error ("the last argument must be a 4-bit immediate");
                 return const0_rtx;
  
@@ -28127,6 +28887,7 @@ ix86_expand_special_args_builtin (const struct builtin_description *d,
      case VOID_FTYPE_PFLOAT_V4SF:
      case VOID_FTYPE_PDOUBLE_V4DF:
      case VOID_FTYPE_PDOUBLE_V2DF:
+    case VOID_FTYPE_PLONGLONG_LONGLONG:
      case VOID_FTYPE_PULONGLONG_ULONGLONG:
      case VOID_FTYPE_PINT_INT:
        nargs = 1;
@@ -28558,20 +29319,6 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
      case IX86_BUILTIN_VEC_SET_V16QI:
        return ix86_expand_vec_set_builtin (exp);
  
-    case IX86_BUILTIN_VEC_PERM_V2DF:
-    case IX86_BUILTIN_VEC_PERM_V4SF:
-    case IX86_BUILTIN_VEC_PERM_V2DI:
-    case IX86_BUILTIN_VEC_PERM_V4SI:
-    case IX86_BUILTIN_VEC_PERM_V8HI:
-    case IX86_BUILTIN_VEC_PERM_V16QI:
-    case IX86_BUILTIN_VEC_PERM_V2DI_U:
-    case IX86_BUILTIN_VEC_PERM_V4SI_U:
-    case IX86_BUILTIN_VEC_PERM_V8HI_U:
-    case IX86_BUILTIN_VEC_PERM_V16QI_U:
-    case IX86_BUILTIN_VEC_PERM_V4DF:
-    case IX86_BUILTIN_VEC_PERM_V8SF:
-      return ix86_expand_vec_perm_builtin (exp);
-
      case IX86_BUILTIN_INFQ:
      case IX86_BUILTIN_HUGE_VALQ:
        {
@@ -28709,7 +29456,7 @@ rdrand_step:
        icode = CODE_FOR_avx2_gatherdiv4sf;
        goto gather_gen;
      case IX86_BUILTIN_GATHERDIV8SF:
-      icode = CODE_FOR_avx2_gatherdiv4sf256;
+      icode = CODE_FOR_avx2_gatherdiv8sf;
        goto gather_gen;
      case IX86_BUILTIN_GATHERSIV2DI:
        icode = CODE_FOR_avx2_gathersiv2di;
@@ -28733,7 +29480,20 @@ rdrand_step:
        icode = CODE_FOR_avx2_gatherdiv4si;
        goto gather_gen;
      case IX86_BUILTIN_GATHERDIV8SI:
-      icode = CODE_FOR_avx2_gatherdiv4si256;
+      icode = CODE_FOR_avx2_gatherdiv8si;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHERALTSIV4DF:
+      icode = CODE_FOR_avx2_gathersiv4df;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHERALTDIV8SF:
+      icode = CODE_FOR_avx2_gatherdiv8sf;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHERALTSIV4DI:
+      icode = CODE_FOR_avx2_gathersiv4di;
+      goto gather_gen;
+    case IX86_BUILTIN_GATHERALTDIV8SI:
+      icode = CODE_FOR_avx2_gatherdiv8si;
+      goto gather_gen;
  
      gather_gen:
        arg0 = CALL_EXPR_ARG (exp, 0);
@@ -28748,13 +29508,43 @@ rdrand_step:
        op4 = expand_normal (arg4);
        /* Note the arg order is different from the operand order.  */
        mode0 = insn_data[icode].operand[1].mode;
-      mode1 = insn_data[icode].operand[2].mode;
        mode2 = insn_data[icode].operand[3].mode;
        mode3 = insn_data[icode].operand[4].mode;
        mode4 = insn_data[icode].operand[5].mode;
  
-      if (target == NULL_RTX)
-       target = gen_reg_rtx (insn_data[icode].operand[0].mode);
+      if (target == NULL_RTX
+         || GET_MODE (target) != insn_data[icode].operand[0].mode)
+       subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
+      else
+       subtarget = target;
+
+      if (fcode == IX86_BUILTIN_GATHERALTSIV4DF
+         || fcode == IX86_BUILTIN_GATHERALTSIV4DI)
+       {
+         rtx half = gen_reg_rtx (V4SImode);
+         if (!nonimmediate_operand (op2, V8SImode))
+           op2 = copy_to_mode_reg (V8SImode, op2);
+         emit_insn (gen_vec_extract_lo_v8si (half, op2));
+         op2 = half;
+       }
+      else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF
+              || fcode == IX86_BUILTIN_GATHERALTDIV8SI)
+       {
+         rtx (*gen) (rtx, rtx);
+         rtx half = gen_reg_rtx (mode0);
+         if (mode0 == V4SFmode)
+           gen = gen_vec_extract_lo_v8sf;
+         else
+           gen = gen_vec_extract_lo_v8si;
+         if (!nonimmediate_operand (op0, GET_MODE (op0)))
+           op0 = copy_to_mode_reg (GET_MODE (op0), op0);
+         emit_insn (gen (half, op0));
+         op0 = half;
+         if (!nonimmediate_operand (op3, GET_MODE (op3)))
+           op3 = copy_to_mode_reg (GET_MODE (op3), op3);
+         emit_insn (gen (half, op3));
+         op3 = half;
+       }
  
        /* Force memory operand only with base register here.  But we
          don't want to do it on memory operand for other builtin
@@ -28762,12 +29552,11 @@ rdrand_step:
        if (GET_MODE (op1) != Pmode)
         op1 = convert_to_mode (Pmode, op1, 1);
        op1 = force_reg (Pmode, op1);
-      op1 = gen_rtx_MEM (mode1, op1);
  
        if (!insn_data[icode].operand[1].predicate (op0, mode0))
         op0 = copy_to_mode_reg (mode0, op0);
-      if (!insn_data[icode].operand[2].predicate (op1, mode1))
-       op1 = copy_to_mode_reg (mode1, op1);
+      if (!insn_data[icode].operand[2].predicate (op1, Pmode))
+       op1 = copy_to_mode_reg (Pmode, op1);
        if (!insn_data[icode].operand[3].predicate (op2, mode2))
         op2 = copy_to_mode_reg (mode2, op2);
        if (!insn_data[icode].operand[4].predicate (op3, mode3))
@@ -28777,10 +29566,91 @@ rdrand_step:
            error ("last argument must be scale 1, 2, 4, 8");
            return const0_rtx;
         }
-      pat = GEN_FCN (icode) (target, op0, op1, op2, op3, op4);
+
+      /* Optimize.  If mask is known to have all high bits set,
+        replace op0 with pc_rtx to signal that the instruction
+        overwrites the whole destination and doesn't use its
+        previous contents.  */
+      if (optimize)
+       {
+         if (TREE_CODE (arg3) == VECTOR_CST)
+           {
+             tree elt;
+             unsigned int negative = 0;
+             for (elt = TREE_VECTOR_CST_ELTS (arg3);
+                  elt; elt = TREE_CHAIN (elt))
+               {
+                 tree cst = TREE_VALUE (elt);
+                 if (TREE_CODE (cst) == INTEGER_CST
+                     && tree_int_cst_sign_bit (cst))
+                   negative++;
+                 else if (TREE_CODE (cst) == REAL_CST
+                          && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
+                   negative++;
+               }
+             if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
+               op0 = pc_rtx;
+           }
+         else if (TREE_CODE (arg3) == SSA_NAME)
+           {
+             /* Recognize also when mask is like:
+                __v2df src = _mm_setzero_pd ();
+                __v2df mask = _mm_cmpeq_pd (src, src);
+                or
+                __v8sf src = _mm256_setzero_ps ();
+                __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
+                as that is a cheaper way to load all ones into
+                a register than having to load a constant from
+                memory.  */
+             gimple def_stmt = SSA_NAME_DEF_STMT (arg3);
+             if (is_gimple_call (def_stmt))
+               {
+                 tree fndecl = gimple_call_fndecl (def_stmt);
+                 if (fndecl
+                     && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD)
+                   switch ((unsigned int) DECL_FUNCTION_CODE (fndecl))
+                     {
+                     case IX86_BUILTIN_CMPPD:
+                     case IX86_BUILTIN_CMPPS:
+                     case IX86_BUILTIN_CMPPD256:
+                     case IX86_BUILTIN_CMPPS256:
+                       if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
+                         break;
+                       /* FALLTHRU */
+                     case IX86_BUILTIN_CMPEQPD:
+                     case IX86_BUILTIN_CMPEQPS:
+                       if (initializer_zerop (gimple_call_arg (def_stmt, 0))
+                           && initializer_zerop (gimple_call_arg (def_stmt,
+                                                                  1)))
+                         op0 = pc_rtx;
+                       break;
+                     default:
+                       break;
+                     }
+               }
+           }
+       }
+
+      pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
        if (! pat)
         return const0_rtx;
        emit_insn (pat);
+
+      if (fcode == IX86_BUILTIN_GATHERDIV8SF
+         || fcode == IX86_BUILTIN_GATHERDIV8SI)
+       {
+         enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode
+                                   ? V4SFmode : V4SImode;
+         if (target == NULL_RTX)
+           target = gen_reg_rtx (tmode);
+         if (tmode == V4SFmode)
+           emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
+         else
+           emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
+       }
+      else
+       target = subtarget;
+
        return target;
  
      default:
@@ -28877,13 +29747,85 @@ ix86_builtin_vectorized_function (tree fndecl, tree type_out,
         }
        break;
  
+    case BUILT_IN_IFLOOR:
+    case BUILT_IN_LFLOOR:
+    case BUILT_IN_LLFLOOR:
+      /* The round insn does not trap on denormals.  */
+      if (flag_trapping_math || !TARGET_ROUND)
+       break;
+
+      if (out_mode == SImode && in_mode == DFmode)
+       {
+         if (out_n == 4 && in_n == 2)
+           return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX];
+         else if (out_n == 8 && in_n == 4)
+           return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256];
+       }
+      break;
+
+    case BUILT_IN_IFLOORF:
+    case BUILT_IN_LFLOORF:
+    case BUILT_IN_LLFLOORF:
+      /* The round insn does not trap on denormals.  */
+      if (flag_trapping_math || !TARGET_ROUND)
+       break;
+
+      if (out_mode == SImode && in_mode == SFmode)
+       {
+         if (out_n == 4 && in_n == 4)
+           return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX];
+         else if (out_n == 8 && in_n == 8)
+           return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256];
+       }
+      break;
+
+    case BUILT_IN_ICEIL:
+    case BUILT_IN_LCEIL:
+    case BUILT_IN_LLCEIL:
+      /* The round insn does not trap on denormals.  */
+      if (flag_trapping_math || !TARGET_ROUND)
+       break;
+
+      if (out_mode == SImode && in_mode == DFmode)
+       {
+         if (out_n == 4 && in_n == 2)
+           return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX];
+         else if (out_n == 8 && in_n == 4)
+           return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256];
+       }
+      break;
+
+    case BUILT_IN_ICEILF:
+    case BUILT_IN_LCEILF:
+    case BUILT_IN_LLCEILF:
+      /* The round insn does not trap on denormals.  */
+      if (flag_trapping_math || !TARGET_ROUND)
+       break;
+
+      if (out_mode == SImode && in_mode == SFmode)
+       {
+         if (out_n == 4 && in_n == 4)
+           return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX];
+         else if (out_n == 8 && in_n == 8)
+           return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256];
+       }
+      break;
+
+    case BUILT_IN_IRINT:
      case BUILT_IN_LRINT:
-      if (out_mode == SImode && out_n == 4
-         && in_mode == DFmode && in_n == 2)
-       return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
+    case BUILT_IN_LLRINT:
+      if (out_mode == SImode && in_mode == DFmode)
+       {
+         if (out_n == 4 && in_n == 2)
+           return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
+         else if (out_n == 8 && in_n == 4)
+           return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256];
+       }
        break;
  
+    case BUILT_IN_IRINTF:
      case BUILT_IN_LRINTF:
+    case BUILT_IN_LLRINTF:
        if (out_mode == SImode && in_mode == SFmode)
         {
           if (out_n == 4 && in_n == 4)
@@ -28893,6 +29835,38 @@ ix86_builtin_vectorized_function (tree fndecl, tree type_out,
         }
        break;
  
+    case BUILT_IN_IROUND:
+    case BUILT_IN_LROUND:
+    case BUILT_IN_LLROUND:
+      /* The round insn does not trap on denormals.  */
+      if (flag_trapping_math || !TARGET_ROUND)
+       break;
+
+      if (out_mode == SImode && in_mode == DFmode)
+       {
+         if (out_n == 4 && in_n == 2)
+           return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX];
+         else if (out_n == 8 && in_n == 4)
+           return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256];
+       }
+      break;
+
+    case BUILT_IN_IROUNDF:
+    case BUILT_IN_LROUNDF:
+    case BUILT_IN_LLROUNDF:
+      /* The round insn does not trap on denormals.  */
+      if (flag_trapping_math || !TARGET_ROUND)
+       break;
+
+      if (out_mode == SImode && in_mode == SFmode)
+       {
+         if (out_n == 4 && in_n == 4)
+           return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX];
+         else if (out_n == 8 && in_n == 8)
+           return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256];
+       }
+      break;
+
      case BUILT_IN_COPYSIGN:
        if (out_mode == DFmode && in_mode == DFmode)
         {
@@ -29160,7 +30134,7 @@ ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
        return NULL_TREE;
      }
  
-  bname = IDENTIFIER_POINTER (DECL_NAME (implicit_built_in_decls[fn]));
+  bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
  
    if (fn == BUILT_IN_LOGF)
      strcpy (name, "vmlsLn4");
@@ -29178,7 +30152,8 @@ ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
    name[4] &= ~0x20;
  
    arity = 0;
-  for (args = DECL_ARGUMENTS (implicit_built_in_decls[fn]); args;
+  for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
+       args;
         args = TREE_CHAIN (args))
      arity++;
  
@@ -29259,11 +30234,12 @@ ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
        return NULL_TREE;
      }
  
-  bname = IDENTIFIER_POINTER (DECL_NAME (implicit_built_in_decls[fn]));
+  bname = IDENTIFIER_POINTER (DECL_NAME (builtin_decl_implicit (fn)));
    sprintf (name + 7, "%s", bname+10);
  
    arity = 0;
-  for (args = DECL_ARGUMENTS (implicit_built_in_decls[fn]); args;
+  for (args = DECL_ARGUMENTS (builtin_decl_implicit (fn));
+       args;
         args = TREE_CHAIN (args))
      arity++;
  
@@ -29283,94 +30259,71 @@ ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
    return new_fndecl;
  }
  
-
-/* Returns a decl of a function that implements conversion of an integer vector
-   into a floating-point vector, or vice-versa.  DEST_TYPE and SRC_TYPE
-   are the types involved when converting according to CODE.
+/* Returns a decl of a function that implements gather load with
+   memory type MEM_VECTYPE and index type INDEX_VECTYPE and SCALE.
     Return NULL_TREE if it is not available.  */
  
  static tree
-ix86_vectorize_builtin_conversion (unsigned int code,
-                                  tree dest_type, tree src_type)
+ix86_vectorize_builtin_gather (const_tree mem_vectype,
+                              const_tree index_type, int scale)
  {
-  if (! TARGET_SSE2)
+  bool si;
+  enum ix86_builtins code;
+
+  if (! TARGET_AVX2)
      return NULL_TREE;
  
-  switch (code)
-    {
-    case FLOAT_EXPR:
-      switch (TYPE_MODE (src_type))
-       {
-       case V4SImode:
-         switch (TYPE_MODE (dest_type))
-           {
-           case V4SFmode:
-             return (TYPE_UNSIGNED (src_type)
-                     ? ix86_builtins[IX86_BUILTIN_CVTUDQ2PS]
-                     : ix86_builtins[IX86_BUILTIN_CVTDQ2PS]);
-           case V4DFmode:
-             return (TYPE_UNSIGNED (src_type)
-                     ? NULL_TREE
-                     : ix86_builtins[IX86_BUILTIN_CVTDQ2PD256]);
-           default:
-             return NULL_TREE;
-           }
-         break;
-       case V8SImode:
-         switch (TYPE_MODE (dest_type))
-           {
-           case V8SFmode:
-             return (TYPE_UNSIGNED (src_type)
-                     ? NULL_TREE
-                     : ix86_builtins[IX86_BUILTIN_CVTDQ2PS256]);
-           default:
-             return NULL_TREE;
-           }
-         break;
-       default:
-         return NULL_TREE;
-       }
+  if ((TREE_CODE (index_type) != INTEGER_TYPE
+       && !POINTER_TYPE_P (index_type))
+      || (TYPE_MODE (index_type) != SImode
+         && TYPE_MODE (index_type) != DImode))
+    return NULL_TREE;
  
-    case FIX_TRUNC_EXPR:
-      switch (TYPE_MODE (dest_type))
-       {
-       case V4SImode:
-         switch (TYPE_MODE (src_type))
-           {
-           case V4SFmode:
-             return (TYPE_UNSIGNED (dest_type)
-                     ? NULL_TREE
-                     : ix86_builtins[IX86_BUILTIN_CVTTPS2DQ]);
-           case V4DFmode:
-             return (TYPE_UNSIGNED (dest_type)
-                     ? NULL_TREE
-                     : ix86_builtins[IX86_BUILTIN_CVTTPD2DQ256]);
-           default:
-             return NULL_TREE;
-           }
-         break;
+  if (TYPE_PRECISION (index_type) > POINTER_SIZE)
+    return NULL_TREE;
  
-       case V8SImode:
-         switch (TYPE_MODE (src_type))
-           {
-           case V8SFmode:
-             return (TYPE_UNSIGNED (dest_type)
-                     ? NULL_TREE
-                     : ix86_builtins[IX86_BUILTIN_CVTTPS2DQ256]);
-           default:
-             return NULL_TREE;
-           }
-         break;
+  /* v*gather* insn sign extends index to pointer mode.  */
+  if (TYPE_PRECISION (index_type) < POINTER_SIZE
+      && TYPE_UNSIGNED (index_type))
+    return NULL_TREE;
  
-       default:
-         return NULL_TREE;
-       }
+  if (scale <= 0
+      || scale > 8
+      || (scale & (scale - 1)) != 0)
+    return NULL_TREE;
  
+  si = TYPE_MODE (index_type) == SImode;
+  switch (TYPE_MODE (mem_vectype))
+    {
+    case V2DFmode:
+      code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF;
+      break;
+    case V4DFmode:
+      code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF;
+      break;
+    case V2DImode:
+      code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI;
+      break;
+    case V4DImode:
+      code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI;
+      break;
+    case V4SFmode:
+      code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF;
+      break;
+    case V8SFmode:
+      code = si ? IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF;
+      break;
+    case V4SImode:
+      code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI;
+      break;
+    case V8SImode:
+      code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI;
+      break;
      default:
        return NULL_TREE;
      }
  
-  return NULL_TREE;
+  return ix86_builtins[code];
  }
  
  /* Returns a code for a target-specific builtin that implements
@@ -29539,7 +30492,6 @@ avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
    return mask + 1;
  }
  \f
-
  /* Store OPERAND to the memory after reload is completed.  This means
     that we can't easily use assign_stack_local.  */
  rtx
@@ -31773,9 +32725,9 @@ x86_emit_floatuns (rtx operands[2])
    emit_label (donelab);
  }
  \f
-/* AVX does not support 32-byte integer vector operations,
-   thus the longest vector we are faced with is V16QImode.  */
-#define MAX_VECT_LEN   16
+/* AVX2 does support 32-byte integer vector operations,
+   thus the longest vector we are faced with is V32QImode.  */
+#define MAX_VECT_LEN   32
  
  struct expand_vec_perm_d
  {
@@ -31788,9 +32740,6 @@ struct expand_vec_perm_d
  
  static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
  static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
-static int extract_vec_perm_cst (struct expand_vec_perm_d *, tree);
-static bool ix86_vectorize_builtin_vec_perm_ok (tree vec_type, tree mask);
-
  
  /* Get a vector mode of the same size as the original but with elements
     twice as wide.  This is only guaranteed to apply to integral vectors.  */
@@ -32672,9 +33621,9 @@ ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
           tmp = gen_reg_rtx (GET_MODE_INNER (mode));
           ix86_expand_vector_extract (true, tmp, target, 1 - elt);
           if (elt == 0)
-           tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
-         else
             tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
+         else
+           tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
           emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
           return;
         }
@@ -32688,9 +33637,9 @@ ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
        tmp = gen_reg_rtx (GET_MODE_INNER (mode));
        ix86_expand_vector_extract (false, tmp, target, 1 - elt);
        if (elt == 0)
-       tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
-      else
         tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
+      else
+       tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
        emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
        return;
  
@@ -33112,72 +34061,100 @@ ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
      }
  }
  
-/* Expand a vector reduction.  FN is the binary pattern to reduce;
-   DEST is the destination; IN is the input vector.  */
+/* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
+   to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
+   The upper bits of DEST are undefined, though they shouldn't cause
+   exceptions (some bits from src or all zeros are ok).  */
  
-void
-ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
+static void
+emit_reduc_half (rtx dest, rtx src, int i)
  {
-  rtx tmp1, tmp2, tmp3, tmp4, tmp5;
-  enum machine_mode mode = GET_MODE (in);
-  int i;
-
-  tmp1 = gen_reg_rtx (mode);
-  tmp2 = gen_reg_rtx (mode);
-  tmp3 = gen_reg_rtx (mode);
-
-  switch (mode)
+  rtx tem;
+  switch (GET_MODE (src))
      {
      case V4SFmode:
-      emit_insn (gen_sse_movhlps (tmp1, in, in));
-      emit_insn (fn (tmp2, tmp1, in));
-      emit_insn (gen_sse_shufps_v4sf (tmp3, tmp2, tmp2,
-                                     const1_rtx, const1_rtx,
-                                     GEN_INT (1+4), GEN_INT (1+4)));
+      if (i == 128)
+       tem = gen_sse_movhlps (dest, src, src);
+      else
+       tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
+                                  GEN_INT (1 + 4), GEN_INT (1 + 4));
+      break;
+    case V2DFmode:
+      tem = gen_vec_interleave_highv2df (dest, src, src);
+      break;
+    case V16QImode:
+    case V8HImode:
+    case V4SImode:
+    case V2DImode:
+      tem = gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, dest),
+                               gen_lowpart (V1TImode, src),
+                               GEN_INT (i / 2));
        break;
      case V8SFmode:
-      tmp4 = gen_reg_rtx (mode);
-      tmp5 = gen_reg_rtx (mode);
-      emit_insn (gen_avx_vperm2f128v8sf3 (tmp4, in, in, const1_rtx));
-      emit_insn (fn (tmp5, tmp4, in));
-      emit_insn (gen_avx_shufps256 (tmp1, tmp5, tmp5, GEN_INT (2+12)));
-      emit_insn (fn (tmp2, tmp1, tmp5));
-      emit_insn (gen_avx_shufps256 (tmp3, tmp2, tmp2, const1_rtx));
+      if (i == 256)
+       tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
+      else
+       tem = gen_avx_shufps256 (dest, src, src,
+                                GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
        break;
      case V4DFmode:
-      emit_insn (gen_avx_vperm2f128v4df3 (tmp1, in, in, const1_rtx));
-      emit_insn (fn (tmp2, tmp1, in));
-      emit_insn (gen_avx_shufpd256 (tmp3, tmp2, tmp2, const1_rtx));
+      if (i == 256)
+       tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
+      else
+       tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
        break;
      case V32QImode:
      case V16HImode:
      case V8SImode:
      case V4DImode:
-      emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, tmp1),
-                                   gen_lowpart (V4DImode, in),
-                                   gen_lowpart (V4DImode, in),
-                                   const1_rtx));
-      tmp4 = in;
-      tmp5 = tmp1;
-      for (i = 64; i >= GET_MODE_BITSIZE (GET_MODE_INNER (mode)); i >>= 1)
-       {
-         if (i != 64)
-           {
-             tmp2 = gen_reg_rtx (mode);
-             tmp3 = gen_reg_rtx (mode);
-           }
-         emit_insn (fn (tmp2, tmp4, tmp5));
-         emit_insn (gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, tmp3),
-                                        gen_lowpart (V2TImode, tmp2),
-                                        GEN_INT (i)));
-         tmp4 = tmp2;
-         tmp5 = tmp3;
-       }
+      if (i == 256)
+       tem = gen_avx2_permv2ti (gen_lowpart (V4DImode, dest),
+                                gen_lowpart (V4DImode, src),
+                                gen_lowpart (V4DImode, src),
+                                const1_rtx);
+      else
+       tem = gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, dest),
+                                 gen_lowpart (V2TImode, src),
+                                 GEN_INT (i / 2));
        break;
      default:
        gcc_unreachable ();
      }
-  emit_insn (fn (dest, tmp2, tmp3));
+  emit_insn (tem);
+}
+
+/* Expand a vector reduction.  FN is the binary pattern to reduce;
+   DEST is the destination; IN is the input vector.  */
+
+void
+ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
+{
+  rtx half, dst, vec = in;
+  enum machine_mode mode = GET_MODE (in);
+  int i;
+
+  /* SSE4 has a special instruction for V8HImode UMIN reduction.  */
+  if (TARGET_SSE4_1
+      && mode == V8HImode
+      && fn == gen_uminv8hi3)
+    {
+      emit_insn (gen_sse4_1_phminposuw (dest, in));
+      return;
+    }
+
+  for (i = GET_MODE_BITSIZE (mode);
+       i > GET_MODE_BITSIZE (GET_MODE_INNER (mode));
+       i >>= 1)
+    {
+      half = gen_reg_rtx (mode);
+      emit_reduc_half (half, vec, i);
+      if (i == GET_MODE_BITSIZE (GET_MODE_INNER (mode)) * 2)
+       dst = dest;
+      else
+       dst = gen_reg_rtx (mode);
+      emit_insn (fn (dst, half, vec));
+      vec = dst;
+    }
  }
  \f
  /* Target hook for scalar_mode_supported_p.  */
@@ -33512,6 +34489,8 @@ void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
  
    /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
  
+  b = force_reg (mode, b);
+
    /* x0 = rcp(b) estimate */
    emit_insn (gen_rtx_SET (VOIDmode, x0,
                           gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
@@ -33567,6 +34546,8 @@ void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
    /* sqrt(a)  = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
       rsqrt(a) = -0.5     * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
  
+  a = force_reg (mode, a);
+
    /* x0 = rsqrt(a) estimate */
    emit_insn (gen_rtx_SET (VOIDmode, x0,
                           gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
@@ -34373,6 +35354,11 @@ static const struct attribute_spec ix86_attribute_table[] =
       for FP arguments.  */
    { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
      true },
+  /* The transactional memory builtins are implicitly regparm or fastcall
+     depending on the ABI.  Override the generic do-nothing attribute that
+     these builtins were declared with.  */
+  { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute,
+    true },
    /* force_align_arg_pointer says this function realigns the stack at entry.  */
    { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
      false, true,  true, ix86_handle_cconv_attribute, false },
@@ -34443,105 +35429,14 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
          return ix86_cost->cond_not_taken_branch_cost;
  
        case vec_perm:
-        return 1;
+      case vec_promote_demote:
+        return ix86_cost->vec_stmt_cost;
  
        default:
          gcc_unreachable ();
      }
  }
  
-
-/* Implement targetm.vectorize.builtin_vec_perm.  */
-
-static tree
-ix86_vectorize_builtin_vec_perm (tree vec_type, tree *mask_type)
-{
-  tree itype = TREE_TYPE (vec_type);
-  bool u = TYPE_UNSIGNED (itype);
-  enum machine_mode vmode = TYPE_MODE (vec_type);
-  enum ix86_builtins fcode;
-  bool ok = TARGET_SSE2;
-
-  switch (vmode)
-    {
-    case V4DFmode:
-      ok = TARGET_AVX;
-      fcode = IX86_BUILTIN_VEC_PERM_V4DF;
-      goto get_di;
-    case V2DFmode:
-      fcode = IX86_BUILTIN_VEC_PERM_V2DF;
-    get_di:
-      itype = ix86_get_builtin_type (IX86_BT_DI);
-      break;
-
-    case V8SFmode:
-      ok = TARGET_AVX;
-      fcode = IX86_BUILTIN_VEC_PERM_V8SF;
-      goto get_si;
-    case V4SFmode:
-      ok = TARGET_SSE;
-      fcode = IX86_BUILTIN_VEC_PERM_V4SF;
-    get_si:
-      itype = ix86_get_builtin_type (IX86_BT_SI);
-      break;
-
-    case V2DImode:
-      fcode = u ? IX86_BUILTIN_VEC_PERM_V2DI_U : IX86_BUILTIN_VEC_PERM_V2DI;
-      break;
-    case V4SImode:
-      fcode = u ? IX86_BUILTIN_VEC_PERM_V4SI_U : IX86_BUILTIN_VEC_PERM_V4SI;
-      break;
-    case V8HImode:
-      fcode = u ? IX86_BUILTIN_VEC_PERM_V8HI_U : IX86_BUILTIN_VEC_PERM_V8HI;
-      break;
-    case V16QImode:
-      fcode = u ? IX86_BUILTIN_VEC_PERM_V16QI_U : IX86_BUILTIN_VEC_PERM_V16QI;
-      break;
-    default:
-      ok = false;
-      break;
-    }
-
-  if (!ok)
-    return NULL_TREE;
-
-  *mask_type = itype;
-  return ix86_builtins[(int) fcode];
-}
-
-/* Return a vector mode with twice as many elements as VMODE.  */
-/* ??? Consider moving this to a table generated by genmodes.c.  */
-
-static enum machine_mode
-doublesize_vector_mode (enum machine_mode vmode)
-{
-  switch (vmode)
-    {
-    case V2SFmode:     return V4SFmode;
-    case V1DImode:     return V2DImode;
-    case V2SImode:     return V4SImode;
-    case V4HImode:     return V8HImode;
-    case V8QImode:     return V16QImode;
-
-    case V2DFmode:     return V4DFmode;
-    case V4SFmode:     return V8SFmode;
-    case V2DImode:     return V4DImode;
-    case V4SImode:     return V8SImode;
-    case V8HImode:     return V16HImode;
-    case V16QImode:    return V32QImode;
-
-    case V4DFmode:     return V8DFmode;
-    case V8SFmode:     return V16SFmode;
-    case V4DImode:     return V8DImode;
-    case V8SImode:     return V16SImode;
-    case V16HImode:    return V32HImode;
-    case V32QImode:    return V64QImode;
-
-    default:
-      gcc_unreachable ();
-    }
-}
-
  /* Construct (set target (vec_select op0 (parallel perm))) and
     return true if that's a valid instruction in the active ISA.  */
  
@@ -34576,13 +35471,13 @@ expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
    enum machine_mode v2mode;
    rtx x;
  
-  v2mode = doublesize_vector_mode (GET_MODE (op0));
+  v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0));
    x = gen_rtx_VEC_CONCAT (v2mode, op0, op1);
    return expand_vselect (target, x, perm, nelt);
  }
  
  /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
-   in terms of blendp[sd] / pblendw / pblendvb.  */
+   in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  */
  
  static bool
  expand_vec_perm_blend (struct expand_vec_perm_d *d)
@@ -34590,10 +35485,17 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
    enum machine_mode vmode = d->vmode;
    unsigned i, mask, nelt = d->nelt;
    rtx target, op0, op1, x;
+  rtx rperm[32], vperm;
  
-  if (!TARGET_SSE4_1 || d->op0 == d->op1)
+  if (d->op0 == d->op1)
      return false;
-  if (!(GET_MODE_SIZE (vmode) == 16 || vmode == V4DFmode || vmode == V8SFmode))
+  if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
+    ;
+  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
+    ;
+  else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
+    ;
+  else
      return false;
  
    /* This is a blend, not a permute.  Elements must stay in their
@@ -34611,30 +35513,6 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
    /* ??? Without SSE4.1, we could implement this with and/andn/or.  This
       decision should be extracted elsewhere, so that we only try that
       sequence once all budget==3 options have been tried.  */
-
-  /* For bytes, see if bytes move in pairs so we can use pblendw with
-     an immediate argument, rather than pblendvb with a vector argument.  */
-  if (vmode == V16QImode)
-    {
-      bool pblendw_ok = true;
-      for (i = 0; i < 16 && pblendw_ok; i += 2)
-       pblendw_ok = (d->perm[i] + 1 == d->perm[i + 1]);
-
-      if (!pblendw_ok)
-       {
-         rtx rperm[16], vperm;
-
-         for (i = 0; i < nelt; ++i)
-           rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
-
-         vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm));
-         vperm = force_reg (V16QImode, vperm);
-
-         emit_insn (gen_sse4_1_pblendvb (d->target, d->op0, d->op1, vperm));
-         return true;
-       }
-    }
-
    target = d->target;
    op0 = d->op0;
    op1 = d->op1;
@@ -34647,6 +35525,7 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
      case V2DFmode:
      case V4SFmode:
      case V8HImode:
+    case V8SImode:
        for (i = 0; i < nelt; ++i)
         mask |= (d->perm[i] >= nelt) << i;
        break;
@@ -34654,24 +35533,122 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
      case V2DImode:
        for (i = 0; i < 2; ++i)
         mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
+      vmode = V8HImode;
        goto do_subreg;
  
      case V4SImode:
        for (i = 0; i < 4; ++i)
         mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
+      vmode = V8HImode;
        goto do_subreg;
  
      case V16QImode:
+      /* See if bytes move in pairs so we can use pblendw with
+        an immediate argument, rather than pblendvb with a vector
+        argument.  */
+      for (i = 0; i < 16; i += 2)
+       if (d->perm[i] + 1 != d->perm[i + 1])
+         {
+         use_pblendvb:
+           for (i = 0; i < nelt; ++i)
+             rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
+
+         finish_pblendvb:
+           vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
+           vperm = force_reg (vmode, vperm);
+
+           if (GET_MODE_SIZE (vmode) == 16)
+             emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
+           else
+             emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
+           return true;
+         }
+
        for (i = 0; i < 8; ++i)
         mask |= (d->perm[i * 2] >= 16) << i;
+      vmode = V8HImode;
+      /* FALLTHRU */
  
      do_subreg:
-      vmode = V8HImode;
        target = gen_lowpart (vmode, target);
        op0 = gen_lowpart (vmode, op0);
        op1 = gen_lowpart (vmode, op1);
        break;
  
+    case V32QImode:
+      /* See if bytes move in pairs.  If not, vpblendvb must be used.  */
+      for (i = 0; i < 32; i += 2)
+       if (d->perm[i] + 1 != d->perm[i + 1])
+         goto use_pblendvb;
+      /* See if bytes move in quadruplets.  If yes, vpblendd
+        with immediate can be used.  */
+      for (i = 0; i < 32; i += 4)
+       if (d->perm[i] + 2 != d->perm[i + 2])
+         break;
+      if (i < 32)
+       {
+         /* See if bytes move the same in both lanes.  If yes,
+            vpblendw with immediate can be used.  */
+         for (i = 0; i < 16; i += 2)
+           if (d->perm[i] + 16 != d->perm[i + 16])
+             goto use_pblendvb;
+
+         /* Use vpblendw.  */
+         for (i = 0; i < 16; ++i)
+           mask |= (d->perm[i * 2] >= 32) << i;
+         vmode = V16HImode;
+         goto do_subreg;
+       }
+
+      /* Use vpblendd.  */
+      for (i = 0; i < 8; ++i)
+       mask |= (d->perm[i * 4] >= 32) << i;
+      vmode = V8SImode;
+      goto do_subreg;
+
+    case V16HImode:
+      /* See if words move in pairs.  If yes, vpblendd can be used.  */
+      for (i = 0; i < 16; i += 2)
+       if (d->perm[i] + 1 != d->perm[i + 1])
+         break;
+      if (i < 16)
+       {
+         /* See if words move the same in both lanes.  If not,
+            vpblendvb must be used.  */
+         for (i = 0; i < 8; i++)
+           if (d->perm[i] + 8 != d->perm[i + 8])
+             {
+               /* Use vpblendvb.  */
+               for (i = 0; i < 32; ++i)
+                 rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);
+
+               vmode = V32QImode;
+               nelt = 32;
+               target = gen_lowpart (vmode, target);
+               op0 = gen_lowpart (vmode, op0);
+               op1 = gen_lowpart (vmode, op1);
+               goto finish_pblendvb;
+             }
+
+         /* Use vpblendw.  */
+         for (i = 0; i < 16; ++i)
+           mask |= (d->perm[i] >= 16) << i;
+         break;
+       }
+
+      /* Use vpblendd.  */
+      for (i = 0; i < 8; ++i)
+       mask |= (d->perm[i * 2] >= 16) << i;
+      vmode = V8SImode;
+      goto do_subreg;
+
+    case V4DImode:
+      /* Use vpblendd.  */
+      for (i = 0; i < 4; ++i)
+       mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
+      vmode = V8SImode;
+      goto do_subreg;
+
      default:
        gcc_unreachable ();
      }
@@ -34732,43 +35709,165 @@ expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
    return true;
  }
  
-/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
-   in terms of pshufb or vpperm.  */
+/* Return true if permutation D can be performed as VMODE permutation
+   instead.  */
  
  static bool
-expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
+valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d)
  {
-  unsigned i, nelt, eltsz;
-  rtx rperm[16], vperm, target, op0, op1;
+  unsigned int i, j, chunk;
  
-  if (!(d->op0 == d->op1 ? TARGET_SSSE3 : TARGET_XOP))
-    return false;
-  if (GET_MODE_SIZE (d->vmode) != 16)
+  if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
+      || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
+      || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
      return false;
  
-  if (d->testing_p)
+  if (GET_MODE_NUNITS (vmode) >= d->nelt)
      return true;
  
+  chunk = d->nelt / GET_MODE_NUNITS (vmode);
+  for (i = 0; i < d->nelt; i += chunk)
+    if (d->perm[i] & (chunk - 1))
+      return false;
+    else
+      for (j = 1; j < chunk; ++j)
+       if (d->perm[i] + j != d->perm[i + j])
+         return false;
+
+  return true;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
+   in terms of pshufb, vpperm, vpermq, vpermd or vperm2i128.  */
+
+static bool
+expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
+{
+  unsigned i, nelt, eltsz, mask;
+  unsigned char perm[32];
+  enum machine_mode vmode = V16QImode;
+  rtx rperm[32], vperm, target, op0, op1;
+
    nelt = d->nelt;
-  eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
  
-  for (i = 0; i < nelt; ++i)
+  if (d->op0 != d->op1)
      {
-      unsigned j, e = d->perm[i];
-      for (j = 0; j < eltsz; ++j)
-       rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
+      if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
+       {
+         if (TARGET_AVX2
+             && valid_perm_using_mode_p (V2TImode, d))
+           {
+             if (d->testing_p)
+               return true;
+
+             /* Use vperm2i128 insn.  The pattern uses
+                V4DImode instead of V2TImode.  */
+             target = gen_lowpart (V4DImode, d->target);
+             op0 = gen_lowpart (V4DImode, d->op0);
+             op1 = gen_lowpart (V4DImode, d->op1);
+             rperm[0]
+               = GEN_INT (((d->perm[0] & (nelt / 2)) ? 1 : 0)
+                          || ((d->perm[nelt / 2] & (nelt / 2)) ? 2 : 0));
+             emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
+             return true;
+           }
+         return false;
+       }
      }
+  else
+    {
+      if (GET_MODE_SIZE (d->vmode) == 16)
+       {
+         if (!TARGET_SSSE3)
+           return false;
+       }
+      else if (GET_MODE_SIZE (d->vmode) == 32)
+       {
+         if (!TARGET_AVX2)
+           return false;
  
-  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm));
-  vperm = force_reg (V16QImode, vperm);
+         /* V4DImode should be already handled through
+            expand_vselect by vpermq instruction.  */
+         gcc_assert (d->vmode != V4DImode);
+
+         vmode = V32QImode;
+         if (d->vmode == V8SImode
+             || d->vmode == V16HImode
+             || d->vmode == V32QImode)
+           {
+             /* First see if vpermq can be used for
+                V8SImode/V16HImode/V32QImode.  */
+             if (valid_perm_using_mode_p (V4DImode, d))
+               {
+                 for (i = 0; i < 4; i++)
+                   perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
+                 if (d->testing_p)
+                   return true;
+                 return expand_vselect (gen_lowpart (V4DImode, d->target),
+                                        gen_lowpart (V4DImode, d->op0),
+                                        perm, 4);
+               }
+
+             /* Next see if vpermd can be used.  */
+             if (valid_perm_using_mode_p (V8SImode, d))
+               vmode = V8SImode;
+           }
+
+         if (vmode == V32QImode)
+           {
+             /* vpshufb only works intra lanes, it is not
+                possible to shuffle bytes in between the lanes.  */
+             for (i = 0; i < nelt; ++i)
+               if ((d->perm[i] ^ i) & (nelt / 2))
+                 return false;
+           }
+       }
+      else
+       return false;
+    }
+
+  if (d->testing_p)
+    return true;
+
+  if (vmode == V8SImode)
+    for (i = 0; i < 8; ++i)
+      rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
+  else
+    {
+      eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
+      if (d->op0 != d->op1)
+       mask = 2 * nelt - 1;
+      else if (vmode == V16QImode)
+       mask = nelt - 1;
+      else
+       mask = nelt / 2 - 1;
+
+      for (i = 0; i < nelt; ++i)
+       {
+         unsigned j, e = d->perm[i] & mask;
+         for (j = 0; j < eltsz; ++j)
+           rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
+       }
+    }
  
-  target = gen_lowpart (V16QImode, d->target);
-  op0 = gen_lowpart (V16QImode, d->op0);
+  vperm = gen_rtx_CONST_VECTOR (vmode,
+                               gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
+  vperm = force_reg (vmode, vperm);
+
+  target = gen_lowpart (vmode, d->target);
+  op0 = gen_lowpart (vmode, d->op0);
    if (d->op0 == d->op1)
-    emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
+    {
+      if (vmode == V16QImode)
+       emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
+      else if (vmode == V32QImode)
+       emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
+      else
+       emit_insn (gen_avx2_permvarv8si (target, vperm, op0));
+    }
    else
      {
-      op1 = gen_lowpart (V16QImode, d->op1);
+      op1 = gen_lowpart (vmode, d->op1);
        emit_insn (gen_xop_pperm (target, op0, op1, vperm));
      }
  
@@ -34790,9 +35889,58 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
    if (d->op0 == d->op1)
      {
        int mask = nelt - 1;
+      bool identity_perm = true;
+      bool broadcast_perm = true;
  
        for (i = 0; i < nelt; i++)
-       perm2[i] = d->perm[i] & mask;
+       {
+         perm2[i] = d->perm[i] & mask;
+         if (perm2[i] != i)
+           identity_perm = false;
+         if (perm2[i])
+           broadcast_perm = false;
+       }
+
+      if (identity_perm)
+       {
+         if (!d->testing_p)
+           emit_move_insn (d->target, d->op0);
+         return true;
+       }
+      else if (broadcast_perm && TARGET_AVX2)
+       {
+         /* Use vpbroadcast{b,w,d}.  */
+         rtx op = d->op0, (*gen) (rtx, rtx) = NULL;
+         switch (d->vmode)
+           {
+           case V32QImode:
+             op = gen_lowpart (V16QImode, op);
+             gen = gen_avx2_pbroadcastv32qi;
+             break;
+           case V16HImode:
+             op = gen_lowpart (V8HImode, op);
+             gen = gen_avx2_pbroadcastv16hi;
+             break;
+           case V8SImode:
+             op = gen_lowpart (V4SImode, op);
+             gen = gen_avx2_pbroadcastv8si;
+             break;
+           case V16QImode:
+             gen = gen_avx2_pbroadcastv16qi;
+             break;
+           case V8HImode:
+             gen = gen_avx2_pbroadcastv8hi;
+             break;
+           /* For other modes prefer other shuffles this function creates.  */
+           default: break;
+           }
+         if (gen != NULL)
+           {
+             if (!d->testing_p)
+               emit_insn (gen (d->target, op));
+             return true;
+           }
+       }
  
        if (expand_vselect (d->target, d->op0, perm2, nelt))
         return true;
@@ -34856,7 +36004,8 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
    if (expand_vec_perm_vpermil (d))
      return true;
  
-  /* Try the SSSE3 pshufb or XOP vpperm variable permutation.  */
+  /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
+     vpshufb, vpermd or vpermq variable permutation.  */
    if (expand_vec_perm_pshufb (d))
      return true;
  
@@ -34966,6 +36115,8 @@ expand_vec_perm_palignr (struct expand_vec_perm_d *d)
    return ok;
  }
  
+static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
+
  /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
     a two vector permutation into a single vector permutation by using
     an interleave operation to merge the vectors.  */
@@ -34975,93 +36126,225 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
  {
    struct expand_vec_perm_d dremap, dfinal;
    unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
-  unsigned contents, h1, h2, h3, h4;
+  unsigned HOST_WIDE_INT contents;
    unsigned char remap[2 * MAX_VECT_LEN];
    rtx seq;
-  bool ok;
-
-  if (d->op0 == d->op1)
-    return false;
+  bool ok, same_halves = false;
  
-  /* The 256-bit unpck[lh]p[sd] instructions only operate within the 128-bit
-     lanes.  We can use similar techniques with the vperm2f128 instruction,
-     but it requires slightly different logic.  */
-  if (GET_MODE_SIZE (d->vmode) != 16)
+  if (GET_MODE_SIZE (d->vmode) == 16)
+    {
+      if (d->op0 == d->op1)
+       return false;
+    }
+  else if (GET_MODE_SIZE (d->vmode) == 32)
+    {
+      if (!TARGET_AVX)
+       return false;
+      /* For 32-byte modes allow even d->op0 == d->op1.
+        The lack of cross-lane shuffling in some instructions
+        might prevent a single insn shuffle.  */
+      dfinal = *d;
+      dfinal.testing_p = true;
+      /* If expand_vec_perm_interleave3 can expand this into
+        a 3 insn sequence, give up and let it be expanded as
+        3 insn sequence.  While that is one insn longer,
+        it doesn't need a memory operand and in the common
+        case that both interleave low and high permutations
+        with the same operands are adjacent needs 4 insns
+        for both after CSE.  */
+      if (expand_vec_perm_interleave3 (&dfinal))
+       return false;
+    }
+  else
      return false;
  
    /* Examine from whence the elements come.  */
    contents = 0;
    for (i = 0; i < nelt; ++i)
-    contents |= 1u << d->perm[i];
-
-  /* Split the two input vectors into 4 halves.  */
-  h1 = (1u << nelt2) - 1;
-  h2 = h1 << nelt2;
-  h3 = h2 << nelt2;
-  h4 = h3 << nelt2;
+    contents |= ((unsigned HOST_WIDE_INT) 1) << d->perm[i];
  
    memset (remap, 0xff, sizeof (remap));
    dremap = *d;
  
-  /* If the elements from the low halves use interleave low, and similarly
-     for interleave high.  If the elements are from mis-matched halves, we
-     can use shufps for V4SF/V4SI or do a DImode shuffle.  */
-  if ((contents & (h1 | h3)) == contents)
+  if (GET_MODE_SIZE (d->vmode) == 16)
      {
-      for (i = 0; i < nelt2; ++i)
+      unsigned HOST_WIDE_INT h1, h2, h3, h4;
+
+      /* Split the two input vectors into 4 halves.  */
+      h1 = (((unsigned HOST_WIDE_INT) 1) << nelt2) - 1;
+      h2 = h1 << nelt2;
+      h3 = h2 << nelt2;
+      h4 = h3 << nelt2;
+
+      /* If the elements from the low halves use interleave low, and similarly
+        for interleave high.  If the elements are from mis-matched halves, we
+        can use shufps for V4SF/V4SI or do a DImode shuffle.  */
+      if ((contents & (h1 | h3)) == contents)
         {
-         remap[i] = i * 2;
-         remap[i + nelt] = i * 2 + 1;
-         dremap.perm[i * 2] = i;
-         dremap.perm[i * 2 + 1] = i + nelt;
+         /* punpckl* */
+         for (i = 0; i < nelt2; ++i)
+           {
+             remap[i] = i * 2;
+             remap[i + nelt] = i * 2 + 1;
+             dremap.perm[i * 2] = i;
+             dremap.perm[i * 2 + 1] = i + nelt;
+           }
+         if (!TARGET_SSE2 && d->vmode == V4SImode)
+           dremap.vmode = V4SFmode;
         }
-    }
-  else if ((contents & (h2 | h4)) == contents)
-    {
-      for (i = 0; i < nelt2; ++i)
+      else if ((contents & (h2 | h4)) == contents)
         {
-         remap[i + nelt2] = i * 2;
-         remap[i + nelt + nelt2] = i * 2 + 1;
-         dremap.perm[i * 2] = i + nelt2;
-         dremap.perm[i * 2 + 1] = i + nelt + nelt2;
+         /* punpckh* */
+         for (i = 0; i < nelt2; ++i)
+           {
+             remap[i + nelt2] = i * 2;
+             remap[i + nelt + nelt2] = i * 2 + 1;
+             dremap.perm[i * 2] = i + nelt2;
+             dremap.perm[i * 2 + 1] = i + nelt + nelt2;
+           }
+         if (!TARGET_SSE2 && d->vmode == V4SImode)
+           dremap.vmode = V4SFmode;
         }
-    }
-  else if ((contents & (h1 | h4)) == contents)
-    {
-      for (i = 0; i < nelt2; ++i)
+      else if ((contents & (h1 | h4)) == contents)
         {
-         remap[i] = i;
-         remap[i + nelt + nelt2] = i + nelt2;
-         dremap.perm[i] = i;
-         dremap.perm[i + nelt2] = i + nelt + nelt2;
+         /* shufps */
+         for (i = 0; i < nelt2; ++i)
+           {
+             remap[i] = i;
+             remap[i + nelt + nelt2] = i + nelt2;
+             dremap.perm[i] = i;
+             dremap.perm[i + nelt2] = i + nelt + nelt2;
+           }
+         if (nelt != 4)
+           {
+             /* shufpd */
+             dremap.vmode = V2DImode;
+             dremap.nelt = 2;
+             dremap.perm[0] = 0;
+             dremap.perm[1] = 3;
+           }
         }
-      if (nelt != 4)
+      else if ((contents & (h2 | h3)) == contents)
         {
-         dremap.vmode = V2DImode;
-         dremap.nelt = 2;
-         dremap.perm[0] = 0;
-         dremap.perm[1] = 3;
+         /* shufps */
+         for (i = 0; i < nelt2; ++i)
+           {
+             remap[i + nelt2] = i;
+             remap[i + nelt] = i + nelt2;
+             dremap.perm[i] = i + nelt2;
+             dremap.perm[i + nelt2] = i + nelt;
+           }
+         if (nelt != 4)
+           {
+             /* shufpd */
+             dremap.vmode = V2DImode;
+             dremap.nelt = 2;
+             dremap.perm[0] = 1;
+             dremap.perm[1] = 2;
+           }
         }
+      else
+       return false;
      }
-  else if ((contents & (h2 | h3)) == contents)
+  else
      {
-      for (i = 0; i < nelt2; ++i)
+      unsigned int nelt4 = nelt / 4, nzcnt = 0;
+      unsigned HOST_WIDE_INT q[8];
+      unsigned int nonzero_halves[4];
+
+      /* Split the two input vectors into 8 quarters.  */
+      q[0] = (((unsigned HOST_WIDE_INT) 1) << nelt4) - 1;
+      for (i = 1; i < 8; ++i)
+       q[i] = q[0] << (nelt4 * i);
+      for (i = 0; i < 4; ++i)
+       if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
+         {
+           nonzero_halves[nzcnt] = i;
+           ++nzcnt;
+         }
+
+      if (nzcnt == 1)
+       {
+         gcc_assert (d->op0 == d->op1);
+         nonzero_halves[1] = nonzero_halves[0];
+         same_halves = true;
+       }
+      else if (d->op0 == d->op1)
         {
-         remap[i + nelt2] = i;
-         remap[i + nelt] = i + nelt2;
-         dremap.perm[i] = i + nelt2;
-         dremap.perm[i + nelt2] = i + nelt;
+         gcc_assert (nonzero_halves[0] == 0);
+         gcc_assert (nonzero_halves[1] == 1);
         }
-      if (nelt != 4)
+
+      if (nzcnt <= 2)
         {
-         dremap.vmode = V2DImode;
-         dremap.nelt = 2;
-         dremap.perm[0] = 1;
-         dremap.perm[1] = 2;
+         if (d->perm[0] / nelt2 == nonzero_halves[1])
+           {
+             /* Attempt to increase the likelyhood that dfinal
+                shuffle will be intra-lane.  */
+             char tmph = nonzero_halves[0];
+             nonzero_halves[0] = nonzero_halves[1];
+             nonzero_halves[1] = tmph;
+           }
+
+         /* vperm2f128 or vperm2i128.  */
+         for (i = 0; i < nelt2; ++i)
+           {
+             remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
+             remap[i + nonzero_halves[0] * nelt2] = i;
+             dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
+             dremap.perm[i] = i + nonzero_halves[0] * nelt2;
+           }
+
+         if (d->vmode != V8SFmode
+             && d->vmode != V4DFmode
+             && d->vmode != V8SImode)
+           {
+             dremap.vmode = V8SImode;
+             dremap.nelt = 8;
+             for (i = 0; i < 4; ++i)
+               {
+                 dremap.perm[i] = i + nonzero_halves[0] * 4;
+                 dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
+               }
+           }
         }
+      else if (d->op0 == d->op1)
+       return false;
+      else if (TARGET_AVX2
+              && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
+       {
+         /* vpunpckl* */
+         for (i = 0; i < nelt4; ++i)
+           {
+             remap[i] = i * 2;
+             remap[i + nelt] = i * 2 + 1;
+             remap[i + nelt2] = i * 2 + nelt2;
+             remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
+             dremap.perm[i * 2] = i;
+             dremap.perm[i * 2 + 1] = i + nelt;
+             dremap.perm[i * 2 + nelt2] = i + nelt2;
+             dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
+           }
+       }
+      else if (TARGET_AVX2
+              && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
+       {
+         /* vpunpckh* */
+         for (i = 0; i < nelt4; ++i)
+           {
+             remap[i + nelt4] = i * 2;
+             remap[i + nelt + nelt4] = i * 2 + 1;
+             remap[i + nelt2 + nelt4] = i * 2 + nelt2;
+             remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
+             dremap.perm[i * 2] = i + nelt4;
+             dremap.perm[i * 2 + 1] = i + nelt + nelt4;
+             dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
+             dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
+           }
+       }
+      else
+       return false;
      }
-  else
-    return false;
  
    /* Use the remapping array set up above to move the elements from their
       swizzled locations into their final destinations.  */
@@ -35070,7 +36353,15 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
      {
        unsigned e = remap[d->perm[i]];
        gcc_assert (e < nelt);
-      dfinal.perm[i] = e;
+      /* If same_halves is true, both halves of the remapped vector are the
+        same.  Avoid cross-lane accesses if possible.  */
+      if (same_halves && i >= nelt2)
+       {
+         gcc_assert (e < nelt2);
+         dfinal.perm[i] = e + nelt2;
+       }
+      else
+       dfinal.perm[i] = e;
      }
    dfinal.op0 = gen_reg_rtx (dfinal.vmode);
    dfinal.op1 = dfinal.op0;
@@ -35086,6 +36377,9 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
    if (!ok)
      return false;
  
+  if (d->testing_p)
+    return true;
+
    if (dremap.vmode != dfinal.vmode)
      {
        dremap.target = gen_lowpart (dremap.vmode, dremap.target);
@@ -35100,6 +36394,159 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
    return true;
  }
  
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
+   a single vector cross-lane permutation into vpermq followed
+   by any of the single insn permutations.  */
+
+static bool
+expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
+{
+  struct expand_vec_perm_d dremap, dfinal;
+  unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
+  unsigned contents[2];
+  bool ok;
+
+  if (!(TARGET_AVX2
+       && (d->vmode == V32QImode || d->vmode == V16HImode)
+       && d->op0 == d->op1))
+    return false;
+
+  contents[0] = 0;
+  contents[1] = 0;
+  for (i = 0; i < nelt2; ++i)
+    {
+      contents[0] |= 1u << (d->perm[i] / nelt4);
+      contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
+    }
+
+  for (i = 0; i < 2; ++i)
+    {
+      unsigned int cnt = 0;
+      for (j = 0; j < 4; ++j)
+       if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
+         return false;
+    }
+
+  if (d->testing_p)
+    return true;
+
+  dremap = *d;
+  dremap.vmode = V4DImode;
+  dremap.nelt = 4;
+  dremap.target = gen_reg_rtx (V4DImode);
+  dremap.op0 = gen_lowpart (V4DImode, d->op0);
+  dremap.op1 = dremap.op0;
+  for (i = 0; i < 2; ++i)
+    {
+      unsigned int cnt = 0;
+      for (j = 0; j < 4; ++j)
+       if ((contents[i] & (1u << j)) != 0)
+         dremap.perm[2 * i + cnt++] = j;
+      for (; cnt < 2; ++cnt)
+       dremap.perm[2 * i + cnt] = 0;
+    }
+
+  dfinal = *d;
+  dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
+  dfinal.op1 = dfinal.op0;
+  for (i = 0, j = 0; i < nelt; ++i)
+    {
+      if (i == nelt2)
+       j = 2;
+      dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
+      if ((d->perm[i] / nelt4) == dremap.perm[j])
+       ;
+      else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
+       dfinal.perm[i] |= nelt4;
+      else
+       gcc_unreachable ();
+    }
+
+  ok = expand_vec_perm_1 (&dremap);
+  gcc_assert (ok);
+
+  ok = expand_vec_perm_1 (&dfinal);
+  gcc_assert (ok);
+
+  return true;
+}
+
+/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
+   a two vector permutation using 2 intra-lane interleave insns
+   and cross-lane shuffle for 32-byte vectors.  */
+
+static bool
+expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
+{
+  unsigned i, nelt;
+  rtx (*gen) (rtx, rtx, rtx);
+
+  if (d->op0 == d->op1)
+    return false;
+  if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
+    ;
+  else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
+    ;
+  else
+    return false;
+
+  nelt = d->nelt;
+  if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
+    return false;
+  for (i = 0; i < nelt; i += 2)
+    if (d->perm[i] != d->perm[0] + i / 2
+       || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  switch (d->vmode)
+    {
+    case V32QImode:
+      if (d->perm[0])
+       gen = gen_vec_interleave_highv32qi;
+      else
+       gen = gen_vec_interleave_lowv32qi;
+      break;
+    case V16HImode:
+      if (d->perm[0])
+       gen = gen_vec_interleave_highv16hi;
+      else
+       gen = gen_vec_interleave_lowv16hi;
+      break;
+    case V8SImode:
+      if (d->perm[0])
+       gen = gen_vec_interleave_highv8si;
+      else
+       gen = gen_vec_interleave_lowv8si;
+      break;
+    case V4DImode:
+      if (d->perm[0])
+       gen = gen_vec_interleave_highv4di;
+      else
+       gen = gen_vec_interleave_lowv4di;
+      break;
+    case V8SFmode:
+      if (d->perm[0])
+       gen = gen_vec_interleave_highv8sf;
+      else
+       gen = gen_vec_interleave_lowv8sf;
+      break;
+    case V4DFmode:
+      if (d->perm[0])
+       gen = gen_vec_interleave_highv4df;
+      else
+       gen = gen_vec_interleave_lowv4df;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  emit_insn (gen (d->target, d->op0, d->op1));
+  return true;
+}
+
  /* A subroutine of expand_vec_perm_even_odd_1.  Implement the double-word
     permutation with two pshufb insns and an ior.  We should have already
     failed all two instruction sequences.  */
@@ -35156,6 +36603,152 @@ expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
    return true;
  }
  
+/* Implement arbitrary permutation of one V32QImode and V16QImode operand
+   with two vpshufb insns, vpermq and vpor.  We should have already failed
+   all two or three instruction sequences.  */
+
+static bool
+expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
+{
+  rtx rperm[2][32], vperm, l, h, hp, op, m128;
+  unsigned int i, nelt, eltsz;
+
+  if (!TARGET_AVX2
+      || d->op0 != d->op1
+      || (d->vmode != V32QImode && d->vmode != V16HImode))
+    return false;
+
+  if (d->testing_p)
+    return true;
+
+  nelt = d->nelt;
+  eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
+
+  /* Generate two permutation masks.  If the required element is within
+     the same lane, it is shuffled in.  If the required element from the
+     other lane, force a zero by setting bit 7 in the permutation mask.
+     In the other mask the mask has non-negative elements if element
+     is requested from the other lane, but also moved to the other lane,
+     so that the result of vpshufb can have the two V2TImode halves
+     swapped.  */
+  m128 = GEN_INT (-128);
+  for (i = 0; i < nelt; ++i)
+    {
+      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
+      unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
+
+      for (j = 0; j < eltsz; ++j)
+       {
+         rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
+         rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
+       }
+    }
+
+  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
+  vperm = force_reg (V32QImode, vperm);
+
+  h = gen_reg_rtx (V32QImode);
+  op = gen_lowpart (V32QImode, d->op0);
+  emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
+
+  /* Swap the 128-byte lanes of h into hp.  */
+  hp = gen_reg_rtx (V4DImode);
+  op = gen_lowpart (V4DImode, h);
+  emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
+                                 const1_rtx));
+
+  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
+  vperm = force_reg (V32QImode, vperm);
+
+  l = gen_reg_rtx (V32QImode);
+  op = gen_lowpart (V32QImode, d->op0);
+  emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
+
+  op = gen_lowpart (V32QImode, d->target);
+  emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
+
+  return true;
+}
+
+/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
+   and extract-odd permutations of two V32QImode and V16QImode operand
+   with two vpshufb insns, vpor and vpermq.  We should have already
+   failed all two or three instruction sequences.  */
+
+static bool
+expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
+{
+  rtx rperm[2][32], vperm, l, h, ior, op, m128;
+  unsigned int i, nelt, eltsz;
+
+  if (!TARGET_AVX2
+      || d->op0 == d->op1
+      || (d->vmode != V32QImode && d->vmode != V16HImode))
+    return false;
+
+  for (i = 0; i < d->nelt; ++i)
+    if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  nelt = d->nelt;
+  eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
+
+  /* Generate two permutation masks.  In the first permutation mask
+     the first quarter will contain indexes for the first half
+     of the op0, the second quarter will contain bit 7 set, third quarter
+     will contain indexes for the second half of the op0 and the
+     last quarter bit 7 set.  In the second permutation mask
+     the first quarter will contain bit 7 set, the second quarter
+     indexes for the first half of the op1, the third quarter bit 7 set
+     and last quarter indexes for the second half of the op1.
+     I.e. the first mask e.g. for V32QImode extract even will be:
+     0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
+     (all values masked with 0xf except for -128) and second mask
+     for extract even will be
+     -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe.  */
+  m128 = GEN_INT (-128);
+  for (i = 0; i < nelt; ++i)
+    {
+      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
+      unsigned which = d->perm[i] >= nelt;
+      unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
+
+      for (j = 0; j < eltsz; ++j)
+       {
+         rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
+         rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
+       }
+    }
+
+  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
+  vperm = force_reg (V32QImode, vperm);
+
+  l = gen_reg_rtx (V32QImode);
+  op = gen_lowpart (V32QImode, d->op0);
+  emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
+
+  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
+  vperm = force_reg (V32QImode, vperm);
+
+  h = gen_reg_rtx (V32QImode);
+  op = gen_lowpart (V32QImode, d->op1);
+  emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
+
+  ior = gen_reg_rtx (V32QImode);
+  emit_insn (gen_iorv32qi3 (ior, l, h));
+
+  /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation.  */
+  op = gen_lowpart (V4DImode, d->target);
+  ior = gen_lowpart (V4DImode, ior);
+  emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
+                                 const1_rtx, GEN_INT (3)));
+
+  return true;
+}
+
  /* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement extract-even
     and extract-odd permutations.  */
  
@@ -35265,6 +36858,81 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
         }
        break;
  
+    case V16HImode:
+    case V32QImode:
+      return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
+
+    case V4DImode:
+      if (!TARGET_AVX2)
+       {
+         struct expand_vec_perm_d d_copy = *d;
+         d_copy.vmode = V4DFmode;
+         d_copy.target = gen_lowpart (V4DFmode, d->target);
+         d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
+         d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
+         return expand_vec_perm_even_odd_1 (&d_copy, odd);
+       }
+
+      t1 = gen_reg_rtx (V4DImode);
+      t2 = gen_reg_rtx (V4DImode);
+
+      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
+      emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
+      emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));
+
+      /* Now an vpunpck[lh]qdq will produce the result required.  */
+      if (odd)
+       t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
+      else
+       t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
+      emit_insn (t3);
+      break;
+
+    case V8SImode:
+      if (!TARGET_AVX2)
+       {
+         struct expand_vec_perm_d d_copy = *d;
+         d_copy.vmode = V8SFmode;
+         d_copy.target = gen_lowpart (V8SFmode, d->target);
+         d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
+         d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
+         return expand_vec_perm_even_odd_1 (&d_copy, odd);
+       }
+
+      t1 = gen_reg_rtx (V8SImode);
+      t2 = gen_reg_rtx (V8SImode);
+
+      /* Shuffle the lanes around into
+        { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }.  */
+      emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1),
+                                   gen_lowpart (V4DImode, d->op0),
+                                   gen_lowpart (V4DImode, d->op1),
+                                   GEN_INT (0x20)));
+      emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2),
+                                   gen_lowpart (V4DImode, d->op0),
+                                   gen_lowpart (V4DImode, d->op1),
+                                   GEN_INT (0x31)));
+
+      /* Swap the 2nd and 3rd position in each lane into
+        { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }.  */
+      emit_insn (gen_avx2_pshufdv3 (t1, t1,
+                                   GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
+      emit_insn (gen_avx2_pshufdv3 (t2, t2,
+                                   GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
+
+      /* Now an vpunpck[lh]qdq will produce
+        { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }.  */
+      if (odd)
+       t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target),
+                                          gen_lowpart (V4DImode, t1),
+                                          gen_lowpart (V4DImode, t2));
+      else
+       t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target),
+                                         gen_lowpart (V4DImode, t1),
+                                         gen_lowpart (V4DImode, t2));
+      emit_insn (t3);
+      break;
+
      default:
        gcc_unreachable ();
      }
@@ -35325,18 +36993,23 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
          stopping once we have promoted to V4SImode and then use pshufd.  */
        do
         {
-         optab otab = vec_interleave_low_optab;
+         rtx dest;
+         rtx (*gen) (rtx, rtx, rtx)
+           = vmode == V16QImode ? gen_vec_interleave_lowv16qi
+                                : gen_vec_interleave_lowv8hi;
  
           if (elt >= nelt2)
             {
-             otab = vec_interleave_high_optab;
+             gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
+                                      : gen_vec_interleave_highv8hi;
               elt -= nelt2;
             }
           nelt2 /= 2;
  
-         op0 = expand_binop (vmode, otab, op0, op0, NULL, 0, OPTAB_DIRECT);
+         dest = gen_reg_rtx (vmode);
+         emit_insn (gen (dest, op0, op0));
           vmode = get_mode_wider_vector (vmode);
-         op0 = gen_lowpart (vmode, op0);
+         op0 = gen_lowpart (vmode, dest);
         }
        while (vmode != V4SImode);
  
@@ -35345,6 +37018,15 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
        gcc_assert (ok);
        return true;
  
+    case V32QImode:
+    case V16HImode:
+    case V8SImode:
+    case V4DImode:
+      /* For AVX2 broadcasts of the first element vpbroadcast* or
+        vpermq should be used by expand_vec_perm_1.  */
+      gcc_assert (!TARGET_AVX2 || d->perm[0]);
+      return false;
+
      default:
        gcc_unreachable ();
      }
@@ -35369,12 +37051,123 @@ expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
    return expand_vec_perm_broadcast_1 (d);
  }
  
-/* The guts of ix86_expand_vec_perm_builtin, also used by the ok hook.
+/* Implement arbitrary permutation of two V32QImode and V16QImode operands
+   with 4 vpshufb insns, 2 vpermq and 3 vpor.  We should have already failed
+   all the shorter instruction sequences.  */
+
+static bool
+expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
+{
+  rtx rperm[4][32], vperm, l[2], h[2], op, m128;
+  unsigned int i, nelt, eltsz;
+  bool used[4];
+
+  if (!TARGET_AVX2
+      || d->op0 == d->op1
+      || (d->vmode != V32QImode && d->vmode != V16HImode))
+    return false;
+
+  if (d->testing_p)
+    return true;
+
+  nelt = d->nelt;
+  eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
+
+  /* Generate 4 permutation masks.  If the required element is within
+     the same lane, it is shuffled in.  If the required element from the
+     other lane, force a zero by setting bit 7 in the permutation mask.
+     In the other mask the mask has non-negative elements if element
+     is requested from the other lane, but also moved to the other lane,
+     so that the result of vpshufb can have the two V2TImode halves
+     swapped.  */
+  m128 = GEN_INT (-128);
+  for (i = 0; i < 32; ++i)
+    {
+      rperm[0][i] = m128;
+      rperm[1][i] = m128;
+      rperm[2][i] = m128;
+      rperm[3][i] = m128;
+    }
+  used[0] = false;
+  used[1] = false;
+  used[2] = false;
+  used[3] = false;
+  for (i = 0; i < nelt; ++i)
+    {
+      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
+      unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
+      unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
+
+      for (j = 0; j < eltsz; ++j)
+       rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
+      used[which] = true;
+    }
+
+  for (i = 0; i < 2; ++i)
+    {
+      if (!used[2 * i + 1])
+       {
+         h[i] = NULL_RTX;
+         continue;
+       }
+      vperm = gen_rtx_CONST_VECTOR (V32QImode,
+                                   gen_rtvec_v (32, rperm[2 * i + 1]));
+      vperm = force_reg (V32QImode, vperm);
+      h[i] = gen_reg_rtx (V32QImode);
+      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
+      emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
+    }
+
+  /* Swap the 128-byte lanes of h[X].  */
+  for (i = 0; i < 2; ++i)
+   {
+     if (h[i] == NULL_RTX)
+       continue;
+     op = gen_reg_rtx (V4DImode);
+     emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
+                                    const2_rtx, GEN_INT (3), const0_rtx,
+                                    const1_rtx));
+     h[i] = gen_lowpart (V32QImode, op);
+   }
+
+  for (i = 0; i < 2; ++i)
+    {
+      if (!used[2 * i])
+       {
+         l[i] = NULL_RTX;
+         continue;
+       }
+      vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
+      vperm = force_reg (V32QImode, vperm);
+      l[i] = gen_reg_rtx (V32QImode);
+      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
+      emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
+    }
+
+  for (i = 0; i < 2; ++i)
+    {
+      if (h[i] && l[i])
+       {
+         op = gen_reg_rtx (V32QImode);
+         emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
+         l[i] = op;
+       }
+      else if (h[i])
+       l[i] = h[i];
+    }
+
+  gcc_assert (l[0] && l[1]);
+  op = gen_lowpart (V32QImode, d->target);
+  emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
+  return true;
+}
+
+/* The guts of ix86_expand_vec_perm_const, also used by the ok hook.
     With all of the interface bits taken care of, perform the expansion
     in D and return true on success.  */
  
  static bool
-ix86_expand_vec_perm_builtin_1 (struct expand_vec_perm_d *d)
+ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
  {
    /* Try a single instruction expansion.  */
    if (expand_vec_perm_1 (d))
@@ -35394,11 +37187,25 @@ ix86_expand_vec_perm_builtin_1 (struct expand_vec_perm_d *d)
    if (expand_vec_perm_broadcast (d))
      return true;
  
+  if (expand_vec_perm_vpermq_perm_1 (d))
+    return true;
+
    /* Try sequences of three instructions.  */
  
    if (expand_vec_perm_pshufb2 (d))
      return true;
  
+  if (expand_vec_perm_interleave3 (d))
+    return true;
+
+  /* Try sequences of four instructions.  */
+
+  if (expand_vec_perm_vpshufb2_vpermq (d))
+    return true;
+
+  if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
+    return true;
+
    /* ??? Look for narrow permutations whose element orderings would
       allow the promotion to a wider mode.  */
  
@@ -35412,156 +37219,114 @@ ix86_expand_vec_perm_builtin_1 (struct expand_vec_perm_d *d)
    if (expand_vec_perm_even_odd (d))
      return true;
  
-  return false;
-}
-
-/* Extract the values from the vector CST into the permutation array in D.
-   Return 0 on error, 1 if all values from the permutation come from the
-   first vector, 2 if all values from the second vector, and 3 otherwise.  */
-
-static int
-extract_vec_perm_cst (struct expand_vec_perm_d *d, tree cst)
-{
-  tree list = TREE_VECTOR_CST_ELTS (cst);
-  unsigned i, nelt = d->nelt;
-  int ret = 0;
-
-  for (i = 0; i < nelt; ++i, list = TREE_CHAIN (list))
-    {
-      unsigned HOST_WIDE_INT e;
-
-      if (!host_integerp (TREE_VALUE (list), 1))
-       return 0;
-      e = tree_low_cst (TREE_VALUE (list), 1);
-      if (e >= 2 * nelt)
-       return 0;
-
-      ret |= (e < nelt ? 1 : 2);
-      d->perm[i] = e;
-    }
-  gcc_assert (list == NULL);
-
-  /* For all elements from second vector, fold the elements to first.  */
-  if (ret == 2)
-    for (i = 0; i < nelt; ++i)
-      d->perm[i] -= nelt;
+  /* Even longer sequences.  */
+  if (expand_vec_perm_vpshufb4_vpermq2 (d))
+    return true;
  
-  return ret;
+  return false;
  }
  
-static rtx
-ix86_expand_vec_perm_builtin (tree exp)
+bool
+ix86_expand_vec_perm_const (rtx operands[4])
  {
    struct expand_vec_perm_d d;
-  tree arg0, arg1, arg2;
+  unsigned char perm[MAX_VECT_LEN];
+  int i, nelt, which;
+  rtx sel;
  
-  arg0 = CALL_EXPR_ARG (exp, 0);
-  arg1 = CALL_EXPR_ARG (exp, 1);
-  arg2 = CALL_EXPR_ARG (exp, 2);
+  d.target = operands[0];
+  d.op0 = operands[1];
+  d.op1 = operands[2];
+  sel = operands[3];
  
-  d.vmode = TYPE_MODE (TREE_TYPE (arg0));
-  d.nelt = GET_MODE_NUNITS (d.vmode);
-  d.testing_p = false;
+  d.vmode = GET_MODE (d.target);
    gcc_assert (VECTOR_MODE_P (d.vmode));
+  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
+  d.testing_p = false;
+
+  gcc_assert (GET_CODE (sel) == CONST_VECTOR);
+  gcc_assert (XVECLEN (sel, 0) == nelt);
+  gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
  
-  if (TREE_CODE (arg2) != VECTOR_CST)
+  for (i = which = 0; i < nelt; ++i)
      {
-      error_at (EXPR_LOCATION (exp),
-               "vector permutation requires vector constant");
-      goto exit_error;
+      rtx e = XVECEXP (sel, 0, i);
+      int ei = INTVAL (e) & (2 * nelt - 1);
+
+      which |= (ei < nelt ? 1 : 2);
+      d.perm[i] = ei;
+      perm[i] = ei;
      }
  
-  switch (extract_vec_perm_cst (&d, arg2))
+  switch (which)
      {
      default:
        gcc_unreachable();
  
-    case 0:
-      error_at (EXPR_LOCATION (exp), "invalid vector permutation constant");
-      goto exit_error;
-
      case 3:
-      if (!operand_equal_p (arg0, arg1, 0))
-       {
-         d.op0 = expand_expr (arg0, NULL_RTX, d.vmode, EXPAND_NORMAL);
-         d.op0 = force_reg (d.vmode, d.op0);
-         d.op1 = expand_expr (arg1, NULL_RTX, d.vmode, EXPAND_NORMAL);
-         d.op1 = force_reg (d.vmode, d.op1);
-         break;
-       }
+      if (!rtx_equal_p (d.op0, d.op1))
+       break;
  
        /* The elements of PERM do not suggest that only the first operand
          is used, but both operands are identical.  Allow easier matching
          of the permutation by folding the permutation into the single
          input vector.  */
-      {
-       unsigned i, nelt = d.nelt;
-       for (i = 0; i < nelt; ++i)
-         if (d.perm[i] >= nelt)
-           d.perm[i] -= nelt;
-      }
+      for (i = 0; i < nelt; ++i)
+       if (d.perm[i] >= nelt)
+         d.perm[i] -= nelt;
        /* FALLTHRU */
  
      case 1:
-      d.op0 = expand_expr (arg0, NULL_RTX, d.vmode, EXPAND_NORMAL);
-      d.op0 = force_reg (d.vmode, d.op0);
        d.op1 = d.op0;
        break;
  
      case 2:
-      d.op0 = expand_expr (arg1, NULL_RTX, d.vmode, EXPAND_NORMAL);
-      d.op0 = force_reg (d.vmode, d.op0);
-      d.op1 = d.op0;
+      for (i = 0; i < nelt; ++i)
+        d.perm[i] -= nelt;
+      d.op0 = d.op1;
        break;
      }
  
-  d.target = gen_reg_rtx (d.vmode);
-  if (ix86_expand_vec_perm_builtin_1 (&d))
-    return d.target;
+  if (ix86_expand_vec_perm_const_1 (&d))
+    return true;
  
-  /* For compiler generated permutations, we should never got here, because
-     the compiler should also be checking the ok hook.  But since this is a
-     builtin the user has access too, so don't abort.  */
-  switch (d.nelt)
+  /* If the mask says both arguments are needed, but they are the same,
+     the above tried to expand with d.op0 == d.op1.  If that didn't work,
+     retry with d.op0 != d.op1 as that is what testing has been done with.  */
+  if (which == 3 && d.op0 == d.op1)
      {
-    case 2:
-      sorry ("vector permutation (%d %d)", d.perm[0], d.perm[1]);
-      break;
-    case 4:
-      sorry ("vector permutation (%d %d %d %d)",
-            d.perm[0], d.perm[1], d.perm[2], d.perm[3]);
-      break;
-    case 8:
-      sorry ("vector permutation (%d %d %d %d %d %d %d %d)",
-            d.perm[0], d.perm[1], d.perm[2], d.perm[3],
-            d.perm[4], d.perm[5], d.perm[6], d.perm[7]);
-      break;
-    case 16:
-      sorry ("vector permutation "
-            "(%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d)",
-            d.perm[0], d.perm[1], d.perm[2], d.perm[3],
-            d.perm[4], d.perm[5], d.perm[6], d.perm[7],
-            d.perm[8], d.perm[9], d.perm[10], d.perm[11],
-            d.perm[12], d.perm[13], d.perm[14], d.perm[15]);
-      break;
-    default:
-      gcc_unreachable ();
+      rtx seq;
+      bool ok;
+
+      memcpy (d.perm, perm, sizeof (perm));
+      d.op1 = gen_reg_rtx (d.vmode);
+      start_sequence ();
+      ok = ix86_expand_vec_perm_const_1 (&d);
+      seq = get_insns ();
+      end_sequence ();
+      if (ok)
+       {
+         emit_move_insn (d.op1, d.op0);
+         emit_insn (seq);
+         return true;
+       }
      }
- exit_error:
-  return CONST0_RTX (d.vmode);
+
+  return false;
  }
  
-/* Implement targetm.vectorize.builtin_vec_perm_ok.  */
+/* Implement targetm.vectorize.vec_perm_const_ok.  */
  
  static bool
-ix86_vectorize_builtin_vec_perm_ok (tree vec_type, tree mask)
+ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
+                                 const unsigned char *sel)
  {
    struct expand_vec_perm_d d;
-  int vec_mask;
+  unsigned int i, nelt, which;
    bool ret, one_vec;
  
-  d.vmode = TYPE_MODE (vec_type);
-  d.nelt = GET_MODE_NUNITS (d.vmode);
+  d.vmode = vmode;
+  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
    d.testing_p = true;
  
    /* Given sufficient ISA support we can just return true here
@@ -35579,13 +37344,23 @@ ix86_vectorize_builtin_vec_perm_ok (tree vec_type, tree mask)
         return true;
      }
  
-  vec_mask = extract_vec_perm_cst (&d, mask);
+  /* Extract the values from the vector CST into the permutation
+     array in D.  */
+  memcpy (d.perm, sel, nelt);
+  for (i = which = 0; i < nelt; ++i)
+    {
+      unsigned char e = d.perm[i];
+      gcc_assert (e < 2 * nelt);
+      which |= (e < nelt ? 1 : 2);
+    }
+
+  /* For all elements from second vector, fold the elements to first.  */
+  if (which == 2)
+    for (i = 0; i < nelt; ++i)
+      d.perm[i] -= nelt;
  
    /* Check whether the mask can be applied to the vector type.  */
-  if (vec_mask < 0 || vec_mask > 3)
-    return false;
-
-  one_vec = (vec_mask != 3);
+  one_vec = (which != 3);
  
    /* Implementable with shufps or pshufd.  */
    if (one_vec && (d.vmode == V4SFmode || d.vmode == V4SImode))
@@ -35599,7 +37374,7 @@ ix86_vectorize_builtin_vec_perm_ok (tree vec_type, tree mask)
      d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
  
    start_sequence ();
-  ret = ix86_expand_vec_perm_builtin_1 (&d);
+  ret = ix86_expand_vec_perm_const_1 (&d);
    end_sequence ();
  
    return ret;
@@ -36678,13 +38453,13 @@ ix86_preferred_simd_mode (enum machine_mode mode)
    switch (mode)
      {
      case QImode:
-      return TARGET_AVX2 ? V32QImode : V16QImode;
+      return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode;
      case HImode:
-      return TARGET_AVX2 ? V16HImode : V8HImode;
+      return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode;
      case SImode:
-      return TARGET_AVX2 ? V8SImode : V4SImode;
+      return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode;
      case DImode:
-      return TARGET_AVX2 ? V4DImode : V2DImode;
+      return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V4DImode : V2DImode;
  
      case SFmode:
        if (TARGET_AVX && !TARGET_PREFER_AVX128)
@@ -36743,8 +38518,14 @@ ix86_autovectorize_vector_sizes (void)
  #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
    ix86_builtin_vectorized_function
  
-#undef TARGET_VECTORIZE_BUILTIN_CONVERSION
-#define TARGET_VECTORIZE_BUILTIN_CONVERSION ix86_vectorize_builtin_conversion
+#undef TARGET_VECTORIZE_BUILTIN_TM_LOAD
+#define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load
+
+#undef TARGET_VECTORIZE_BUILTIN_TM_STORE
+#define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store
+
+#undef TARGET_VECTORIZE_BUILTIN_GATHER
+#define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
  
  #undef TARGET_BUILTIN_RECIPROCAL
  #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
@@ -36937,7 +38718,7 @@ ix86_autovectorize_vector_sizes (void)
  #undef TARGET_MANGLE_TYPE
  #define TARGET_MANGLE_TYPE ix86_mangle_type
  
-#ifndef TARGET_MACHO
+#if !TARGET_MACHO
  #undef TARGET_STACK_PROTECT_FAIL
  #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
  #endif
@@ -36967,12 +38748,9 @@ ix86_autovectorize_vector_sizes (void)
  #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
  #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
    ix86_builtin_vectorization_cost
-#undef TARGET_VECTORIZE_BUILTIN_VEC_PERM
-#define TARGET_VECTORIZE_BUILTIN_VEC_PERM \
-  ix86_vectorize_builtin_vec_perm
-#undef TARGET_VECTORIZE_BUILTIN_VEC_PERM_OK
-#define TARGET_VECTORIZE_BUILTIN_VEC_PERM_OK \
-  ix86_vectorize_builtin_vec_perm_ok
+#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
+#define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
+  ix86_vectorize_vec_perm_const_ok
  #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
  #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
    ix86_preferred_simd_mode