* config/arm/arm.c (arm_print_operand): Fix invalid alignment

[pf3gnuchains/gcc-fork.git] / gcc / config / arm / arm.c
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c

index 951d65c..ba081d1 100644 (file)
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -269,6 +269,9 @@ static unsigned int arm_autovectorize_vector_sizes (void);
  static int arm_default_branch_cost (bool, bool);
  static int arm_cortex_a5_branch_cost (bool, bool);
  
+static bool arm_vectorize_vec_perm_const_ok (enum machine_mode vmode,
+                                            const unsigned char *sel);
+
  \f
  /* Table of machine attributes.  */
  static const struct attribute_spec arm_attribute_table[] =
@@ -612,6 +615,10 @@ static const struct attribute_spec arm_attribute_table[] =
  #define TARGET_PREFERRED_RENAME_CLASS \
    arm_preferred_rename_class
  
+#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
+#define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
+  arm_vectorize_vec_perm_const_ok
+
  struct gcc_target targetm = TARGET_INITIALIZER;
  \f
  /* Obstack for minipool constant handling.  */
@@ -961,17 +968,6 @@ const struct tune_params arm_cortex_a9_tune =
    arm_default_branch_cost
  };
  
-const struct tune_params arm_cortex_a15_tune =
-{
-  arm_9e_rtx_costs,
-  NULL,
-  1,                                           /* Constant limit.  */
-  1,                                           /* Max cond insns.  */
-  ARM_PREFETCH_NOT_BENEFICIAL,                 /* TODO: Calculate correct values.  */
-  false,                                       /* Prefer constant pool.  */
-  arm_cortex_a5_branch_cost
-};
-
  const struct tune_params arm_fa726te_tune =
  {
    arm_9e_rtx_costs,
@@ -3673,6 +3669,10 @@ arm_libcall_uses_aapcs_base (const_rtx libcall)
        add_libcall (libcall_htab,
                    convert_optab_libfunc (trunc_optab, HFmode, SFmode));
        add_libcall (libcall_htab,
+                  convert_optab_libfunc (sfix_optab, SImode, DFmode));
+      add_libcall (libcall_htab,
+                  convert_optab_libfunc (ufix_optab, SImode, DFmode));
+      add_libcall (libcall_htab,
                    convert_optab_libfunc (sfix_optab, DImode, DFmode));
        add_libcall (libcall_htab,
                    convert_optab_libfunc (ufix_optab, DImode, DFmode));
@@ -4264,6 +4264,11 @@ use_vfp_abi (enum arm_pcs pcs_variant, bool is_double)
           (TARGET_VFP_DOUBLE || !is_double));
  }
  
+/* Return true if an argument whose type is TYPE, or mode is MODE, is
+   suitable for passing or returning in VFP registers for the PCS
+   variant selected.  If it is, then *BASE_MODE is updated to contain
+   a machine mode describing each element of the argument's type and
+   *COUNT to hold the number of such elements.  */
  static bool
  aapcs_vfp_is_call_or_return_candidate (enum arm_pcs pcs_variant,
                                        enum machine_mode mode, const_tree type,
@@ -4271,9 +4276,20 @@ aapcs_vfp_is_call_or_return_candidate (enum arm_pcs pcs_variant,
  {
    enum machine_mode new_mode = VOIDmode;
  
-  if (GET_MODE_CLASS (mode) == MODE_FLOAT
-      || GET_MODE_CLASS (mode) == MODE_VECTOR_INT
-      || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
+  /* If we have the type information, prefer that to working things
+     out from the mode.  */
+  if (type)
+    {
+      int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
+
+      if (ag_count > 0 && ag_count <= 4)
+       *count = ag_count;
+      else
+       return false;
+    }
+  else if (GET_MODE_CLASS (mode) == MODE_FLOAT
+          || GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+          || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
      {
        *count = 1;
        new_mode = mode;
@@ -4283,15 +4299,6 @@ aapcs_vfp_is_call_or_return_candidate (enum arm_pcs pcs_variant,
        *count = 2;
        new_mode = (mode == DCmode ? DFmode : SFmode);
      }
-  else if (type && (mode == BLKmode || TREE_CODE (type) == VECTOR_TYPE))
-    {
-      int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
-
-      if (ag_count > 0 && ag_count <= 4)
-       *count = ag_count;
-      else
-       return false;
-    }
    else
      return false;
  
@@ -5571,11 +5578,7 @@ arm_load_pic_register (unsigned long saved_regs ATTRIBUTE_UNUSED)
  
        if (TARGET_32BIT)
         {
-         emit_insn (gen_pic_load_addr_32bit (pic_reg, pic_rtx));
-         if (TARGET_ARM)
-           emit_insn (gen_pic_add_dot_plus_eight (pic_reg, pic_reg, labelno));
-         else
-           emit_insn (gen_pic_add_dot_plus_four (pic_reg, pic_reg, labelno));
+         emit_insn (gen_pic_load_addr_unified (pic_reg, pic_rtx, labelno));
         }
        else /* TARGET_THUMB1 */
         {
@@ -5588,10 +5591,10 @@ arm_load_pic_register (unsigned long saved_regs ATTRIBUTE_UNUSED)
                                      thumb_find_work_register (saved_regs));
               emit_insn (gen_pic_load_addr_thumb1 (pic_tmp, pic_rtx));
               emit_insn (gen_movsi (pic_offset_table_rtx, pic_tmp));
+             emit_insn (gen_pic_add_dot_plus_four (pic_reg, pic_reg, labelno));
             }
           else
-           emit_insn (gen_pic_load_addr_thumb1 (pic_reg, pic_rtx));
-         emit_insn (gen_pic_add_dot_plus_four (pic_reg, pic_reg, labelno));
+           emit_insn (gen_pic_load_addr_unified (pic_reg, pic_rtx, labelno));
         }
      }
  
@@ -5621,20 +5624,7 @@ arm_pic_static_addr (rtx orig, rtx reg)
                                 UNSPEC_SYMBOL_OFFSET);
    offset_rtx = gen_rtx_CONST (Pmode, offset_rtx);
  
-  if (TARGET_32BIT)
-    {
-      emit_insn (gen_pic_load_addr_32bit (reg, offset_rtx));
-      if (TARGET_ARM)
-        insn = emit_insn (gen_pic_add_dot_plus_eight (reg, reg, labelno));
-      else
-        insn = emit_insn (gen_pic_add_dot_plus_four (reg, reg, labelno));
-    }
-  else /* TARGET_THUMB1 */
-    {
-      emit_insn (gen_pic_load_addr_thumb1 (reg, offset_rtx));
-      insn = emit_insn (gen_pic_add_dot_plus_four (reg, reg, labelno));
-    }
-
+  insn = emit_insn (gen_pic_load_addr_unified (reg, offset_rtx, labelno));
    return insn;
  }
  
@@ -5677,7 +5667,7 @@ static bool
  will_be_in_index_register (const_rtx x)
  {
    /* arm.md: calculate_pic_address will split this into a register.  */
-  return GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_PIC_SYM;
+  return GET_CODE (x) == UNSPEC && (XINT (x, 1) == UNSPEC_PIC_SYM);
  }
  
  /* Return nonzero if X is a valid ARM state address operand.  */
@@ -7641,6 +7631,15 @@ arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed)
  
      case SET:
        return false;
+      
+    case UNSPEC:
+      /* We cost this as high as our memory costs to allow this to
+        be hoisted from loops.  */
+      if (XINT (x, 1) == UNSPEC_PIC_UNIFIED)
+       {
+         *total = COSTS_N_INSNS (2 + ARM_NUM_REGS (mode));
+       }
+      return true;
  
      default:
        *total = COSTS_N_INSNS (4);
@@ -10001,7 +10000,8 @@ static int
  arm_note_pic_base (rtx *x, void *date ATTRIBUTE_UNUSED)
  {
    if (GET_CODE (*x) == UNSPEC
-      && XINT (*x, 1) == UNSPEC_PIC_BASE)
+      && (XINT (*x, 1) == UNSPEC_PIC_BASE
+         || XINT (*x, 1) == UNSPEC_PIC_UNIFIED))
      return 1;
    return 0;
  }
@@ -14201,6 +14201,9 @@ output_move_double (rtx *operands, bool emit, int *count)
                     output_asm_insn ("sub%?\t%0, %1, %2", otherops);
                 }
  
+             if (count)
+               *count = 2;
+
               if (TARGET_LDRD)
                 return "ldr%(d%)\t%0, [%1]";
  
@@ -17709,11 +17712,11 @@ arm_print_operand (FILE *stream, rtx x, int code)
         memsize = MEM_SIZE (x);
  
         /* Only certain alignment specifiers are supported by the hardware.  */
-       if (memsize == 16 && (align % 32) == 0)
+       if (memsize == 32 && (align % 32) == 0)
           align_bits = 256;
-       else if ((memsize == 8 || memsize == 16) && (align % 16) == 0)
+       else if ((memsize == 16 || memsize == 32) && (align % 16) == 0)
           align_bits = 128;
-       else if ((align % 8) == 0)
+       else if (memsize >= 8 && (align % 8) == 0)
           align_bits = 64;
         else
           align_bits = 0;
@@ -19101,7 +19104,9 @@ static neon_builtin_datum neon_builtin_data[] =
    VAR3 (BINOP, vsubhn, v8hi, v4si, v2di),
    VAR8 (BINOP, vceq, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf),
    VAR8 (BINOP, vcge, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf),
+  VAR6 (BINOP, vcgeu, v8qi, v4hi, v2si, v16qi, v8hi, v4si),
    VAR8 (BINOP, vcgt, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf),
+  VAR6 (BINOP, vcgtu, v8qi, v4hi, v2si, v16qi, v8hi, v4si),
    VAR2 (BINOP, vcage, v2sf, v4sf),
    VAR2 (BINOP, vcagt, v2sf, v4sf),
    VAR6 (BINOP, vtst, v8qi, v4hi, v2si, v16qi, v8hi, v4si),
@@ -20915,6 +20920,57 @@ neon_disambiguate_copy (rtx *operands, rtx *dest, rtx *src, unsigned int count)
      }
  }
  
+/* Split a vcombine whose inputs are the registers operands[1] and
+   operands[2] into moves that fill the two halves of operands[0]:
+   operands[1] goes to the half at offset zero, operands[2] to the
+   half that follows it.  */
+
+void
+neon_split_vcombine (rtx operands[3])
+{
+  enum machine_mode half_mode = GET_MODE (operands[1]);
+  unsigned int out_regno = REGNO (operands[0]);
+  unsigned int lo_in_regno = REGNO (operands[1]);
+  unsigned int hi_in_regno = REGNO (operands[2]);
+  unsigned int regs_per_half = HARD_REGNO_NREGS (lo_in_regno, half_mode);
+  unsigned int hi_out_regno = out_regno + regs_per_half;
+  bool lo_in_place = (lo_in_regno == out_regno);
+  bool hi_in_place = (hi_in_regno == hi_out_regno);
+  rtx lo_out, hi_out;
+
+  /* Both inputs already sit where they belong.  A split may not
+     produce an empty sequence, so leave a deleted-insn note behind.  */
+  if (lo_in_place && hi_in_place)
+    {
+      emit_note (NOTE_INSN_DELETED);
+      return;
+    }
+
+  /* Build the half registers from operands[0] so that its register
+     attributes survive for variable tracking.  */
+  lo_out = gen_rtx_REG_offset (operands[0], half_mode, out_regno, 0);
+  hi_out = gen_rtx_REG_offset (operands[0], half_mode, hi_out_regno,
+                               GET_MODE_SIZE (half_mode));
+
+  /* The inputs are exactly the two halves of the output, but the wrong
+     way round.  Emit both sets in one PARALLEL (a VSWP).  */
+  if (hi_in_regno == out_regno && lo_in_regno == hi_out_regno)
+    {
+      rtx lo_set = gen_rtx_SET (VOIDmode, lo_out, operands[1]);
+      rtx hi_set = gen_rtx_SET (VOIDmode, hi_out, operands[2]);
+      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, lo_set, hi_set)));
+      return;
+    }
+
+  if (reg_overlap_mentioned_p (operands[2], lo_out))
+    {
+      /* Writing the low half first would clobber operands[2], so move
+         the high half before the low one.  Halves that are already in
+         place need no move.  */
+      if (!hi_in_place)
+        emit_move_insn (hi_out, operands[2]);
+      if (!lo_in_place)
+        emit_move_insn (lo_out, operands[1]);
+    }
+  else
+    {
+      /* No overlap: low half first, again skipping any half that is
+         already in the right place.  */
+      if (!lo_in_place)
+        emit_move_insn (lo_out, operands[1]);
+      if (!hi_in_place)
+        emit_move_insn (hi_out, operands[2]);
+    }
+}
+
  /* Expand an expression EXP that calls a built-in function,
     with result going to TARGET if that's convenient
     (and in mode MODE if that's convenient).
@@ -24617,7 +24673,12 @@ int
  arm_count_output_move_double_insns (rtx *operands)
  {
    int count;
-  output_move_double (operands, false, &count);
+  rtx ops[2];
+  /* output_move_double may modify the operands array, so call it
+     here on a copy of the array.  */
+  ops[0] = operands[0];
+  ops[1] = operands[1];
+  output_move_double (ops, false, &count);
    return count;
  }
  
@@ -24642,7 +24703,7 @@ vfp3_const_double_for_fract_bits (rtx operand)
      }
    return 0;
  }
-
+\f
  /* Emit a memory barrier around an atomic sequence according to MODEL.  */
  
  static void
@@ -24945,6 +25006,515 @@ arm_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
  
    arm_post_atomic_barrier (model);
  }
+\f
+#define MAX_VECT_LEN 16
  
-#include "gt-arm.h"
+/* Description of one constant vector permutation, shared by the
+   arm_evpc_* recognizers.  */
+struct expand_vec_perm_d
+{
+  /* Register receiving the permuted vector.  */
+  rtx target;
+  /* First input vector.  */
+  rtx op0;
+  /* Second input vector; set equal to OP0 when ONE_VECTOR_P.  */
+  rtx op1;
+  /* Selector: one element index per output element.  */
+  unsigned char perm[MAX_VECT_LEN];
+  /* Mode of TARGET and of both inputs.  */
+  enum machine_mode vmode;
+  /* Number of elements in VMODE.  */
+  unsigned char nelt;
+  /* True if the selector refers to a single input vector only.  */
+  bool one_vector_p;
+  /* True if the recognizers should only report whether they match,
+     without emitting the matching insn.  */
+  bool testing_p;
+};
+
+/* Emit a VTBL-based permutation of OP0 and OP1 into TARGET, with the
+   element indices taken from the register SEL.  All four operands
+   must have the same mode, either V8QImode or V16QImode.  No masking
+   of SEL is done here.  */
+
+static void
+arm_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
+{
+  enum machine_mode mode = GET_MODE (target);
+  bool single_input = rtx_equal_p (op0, op1);
+  rtx table;
+
+  gcc_checking_assert (mode == V8QImode || mode == V16QImode);
+  gcc_checking_assert (GET_MODE (op0) == mode);
+  gcc_checking_assert (GET_MODE (op1) == mode);
+  gcc_checking_assert (GET_MODE (sel) == mode);
+  gcc_checking_assert (TARGET_NEON);
+
+  /* Identical inputs: a one-vector table lookup is enough.  */
+  if (single_input)
+    {
+      if (mode == V8QImode)
+        emit_insn (gen_neon_vtbl1v8qi (target, op0, sel));
+      else
+        emit_insn (gen_neon_vtbl1v16qi (target, op0, sel));
+      return;
+    }
+
+  /* Distinct inputs: combine them into one table of twice the size
+     and look up in that.  */
+  if (mode == V8QImode)
+    {
+      table = gen_reg_rtx (V16QImode);
+      emit_insn (gen_neon_vcombinev8qi (table, op0, op1));
+      table = gen_lowpart (TImode, table);
+      emit_insn (gen_neon_vtbl2v8qi (target, table, sel));
+    }
+  else
+    {
+      table = gen_reg_rtx (OImode);
+      emit_insn (gen_neon_vcombinev16qi (table, op0, op1));
+      emit_insn (gen_neon_vtbl2v16qi (target, table, sel));
+    }
+}
+
+/* Expand a variable permutation of OP0 and OP1 into TARGET, selecting
+   elements with the vector SEL.  SEL is first reduced modulo the
+   number of selectable elements, then handed to
+   arm_expand_vec_perm_1.  */
+
+void
+arm_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
+{
+  enum machine_mode vmode = GET_MODE (target);
+  unsigned int nelt = GET_MODE_NUNITS (vmode);
+  bool single_input = rtx_equal_p (op0, op1);
+  unsigned int k;
+  rtx elt_mask, vec_mask;
+  rtx mask_elts[MAX_VECT_LEN];
+
+  /* TODO: ARM's VTBL indexing is little-endian.  GCC numbers the
+     elements the other way round for big-endian, so the order would
+     have to be reversed there.  */
+  gcc_checking_assert (!BYTES_BIG_ENDIAN);
+
+  /* VTBL does not wrap its index, so reduce the selector ourselves:
+     modulo NELT with one input, modulo 2*NELT with two.  */
+  elt_mask = GEN_INT (single_input ? nelt - 1 : 2 * nelt - 1);
+  k = 0;
+  while (k < nelt)
+    mask_elts[k++] = elt_mask;
+
+  vec_mask = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, mask_elts));
+  sel = expand_simple_binop (vmode, AND, sel, vec_mask, NULL, 0,
+                             OPTAB_LIB_WIDEN);
+
+  arm_expand_vec_perm_1 (target, op0, op1, sel);
+}
+
+/* Generate or test for an insn that supports a constant permutation.  */
+
+/* Try to match the constant permutation D against the VUZP insns,
+   i.e. a selector of the form ODD, ODD + 2, ODD + 4, ... with ODD
+   being 0 or 1.  Return true on a match; unless D->testing_p, also
+   emit the insn.  */
+
+static bool
+arm_evpc_neon_vuzp (struct expand_vec_perm_d *d)
+{
+  unsigned int idx, odd, index_mask, nelt = d->nelt;
+  rtx res0, res1, src0, src1, scratch;
+  rtx (*gen_vuzp) (rtx, rtx, rtx, rtx);
+
+  if (GET_MODE_UNIT_SIZE (d->vmode) >= 8)
+    return false;
+
+  /* The checks below are written for little-endian; big-endian is
+     adjusted for when the insn is emitted.  */
+  if (d->perm[0] > 1)
+    return false;
+  odd = d->perm[0];
+  index_mask = d->one_vector_p ? nelt - 1 : 2 * nelt - 1;
+
+  for (idx = 0; idx < nelt; idx++)
+    if (d->perm[idx] != ((idx * 2 + odd) & index_mask))
+      return false;
+
+  /* The selector matches.  */
+  if (d->testing_p)
+    return true;
+
+  switch (d->vmode)
+    {
+    case V16QImode:
+      gen_vuzp = gen_neon_vuzpv16qi_internal;
+      break;
+    case V8QImode:
+      gen_vuzp = gen_neon_vuzpv8qi_internal;
+      break;
+    case V8HImode:
+      gen_vuzp = gen_neon_vuzpv8hi_internal;
+      break;
+    case V4HImode:
+      gen_vuzp = gen_neon_vuzpv4hi_internal;
+      break;
+    case V4SImode:
+      gen_vuzp = gen_neon_vuzpv4si_internal;
+      break;
+    case V2SImode:
+      gen_vuzp = gen_neon_vuzpv2si_internal;
+      break;
+    case V2SFmode:
+      gen_vuzp = gen_neon_vuzpv2sf_internal;
+      break;
+    case V4SFmode:
+      gen_vuzp = gen_neon_vuzpv4sf_internal;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  if (BYTES_BIG_ENDIAN)
+    {
+      src0 = d->op1;
+      src1 = d->op0;
+      odd = !odd;
+    }
+  else
+    {
+      src0 = d->op0;
+      src1 = d->op1;
+    }
+
+  /* The insn has two results.  Put the one we want in D->target and
+     the other in a fresh scratch register.  */
+  scratch = gen_reg_rtx (d->vmode);
+  if (odd)
+    {
+      res0 = scratch;
+      res1 = d->target;
+    }
+  else
+    {
+      res0 = d->target;
+      res1 = scratch;
+    }
+
+  emit_insn (gen_vuzp (res0, src0, src1, res1));
+  return true;
+}
+
+/* Try to match the constant permutation D against the VZIP insns,
+   i.e. a selector of the form HIGH, HIGH + NELT, HIGH + 1,
+   HIGH + 1 + NELT, ... with HIGH being 0 or NELT / 2.  Return true on
+   a match; unless D->testing_p, also emit the insn.  */
+
+static bool
+arm_evpc_neon_vzip (struct expand_vec_perm_d *d)
+{
+  unsigned int idx, high, index_mask, nelt = d->nelt;
+  rtx res0, res1, src0, src1, scratch;
+  rtx (*gen_vzip) (rtx, rtx, rtx, rtx);
+
+  if (GET_MODE_UNIT_SIZE (d->vmode) >= 8)
+    return false;
+
+  /* The checks below are written for little-endian; big-endian is
+     adjusted for when the insn is emitted.  */
+  if (d->perm[0] == nelt / 2)
+    high = nelt / 2;
+  else if (d->perm[0] == 0)
+    high = 0;
+  else
+    return false;
+  index_mask = d->one_vector_p ? nelt - 1 : 2 * nelt - 1;
+
+  for (idx = 0; idx < nelt / 2; idx++)
+    {
+      unsigned int from_first = (idx + high) & index_mask;
+      unsigned int from_second = (from_first + nelt) & index_mask;
+
+      if (d->perm[idx * 2] != from_first)
+        return false;
+      if (d->perm[idx * 2 + 1] != from_second)
+        return false;
+    }
+
+  /* The selector matches.  */
+  if (d->testing_p)
+    return true;
+
+  switch (d->vmode)
+    {
+    case V16QImode:
+      gen_vzip = gen_neon_vzipv16qi_internal;
+      break;
+    case V8QImode:
+      gen_vzip = gen_neon_vzipv8qi_internal;
+      break;
+    case V8HImode:
+      gen_vzip = gen_neon_vzipv8hi_internal;
+      break;
+    case V4HImode:
+      gen_vzip = gen_neon_vzipv4hi_internal;
+      break;
+    case V4SImode:
+      gen_vzip = gen_neon_vzipv4si_internal;
+      break;
+    case V2SImode:
+      gen_vzip = gen_neon_vzipv2si_internal;
+      break;
+    case V2SFmode:
+      gen_vzip = gen_neon_vzipv2sf_internal;
+      break;
+    case V4SFmode:
+      gen_vzip = gen_neon_vzipv4sf_internal;
+      break;
+    default:
+      gcc_unreachable ();
+    }
  
+  if (BYTES_BIG_ENDIAN)
+    {
+      src0 = d->op1;
+      src1 = d->op0;
+      high = !high;
+    }
+  else
+    {
+      src0 = d->op0;
+      src1 = d->op1;
+    }
+
+  /* The insn has two results.  Put the one we want in D->target and
+     the other in a fresh scratch register.  */
+  scratch = gen_reg_rtx (d->vmode);
+  if (high)
+    {
+      res0 = scratch;
+      res1 = d->target;
+    }
+  else
+    {
+      res0 = d->target;
+      res1 = scratch;
+    }
+
+  emit_insn (gen_vzip (res0, src0, src1, res1));
+  return true;
+}
+
+/* Recognize patterns for the VREV insns.
+
+   D must describe a single-input permutation that reverses the
+   elements within each consecutive group of DIFF + 1 elements, where
+   DIFF is the first selector index (7, 3 or 1).  DIFF and D->vmode
+   together choose the VREV16, VREV32 or VREV64 generator.  Return
+   true if D matches; unless D->testing_p, also emit the insn into
+   D->target.  Return false if D does not have this form.  */
+
+static bool
+arm_evpc_neon_vrev (struct expand_vec_perm_d *d)
+{
+  unsigned int i, j, diff, nelt = d->nelt;
+  rtx (*gen)(rtx, rtx, rtx);
+
+  if (!d->one_vector_p)
+    return false;
+
+  /* In a group reversal the first output element is the last element
+     of the first group, so perm[0] is the group size minus one.  */
+  diff = d->perm[0];
+  switch (diff)
+    {
+    case 7:
+      switch (d->vmode)
+       {
+       case V16QImode: gen = gen_neon_vrev64v16qi; break;
+       case V8QImode:  gen = gen_neon_vrev64v8qi;  break;
+       default:
+         return false;
+       }
+      break;
+    case 3:
+      switch (d->vmode)
+       {
+       case V16QImode: gen = gen_neon_vrev32v16qi; break;
+       case V8QImode:  gen = gen_neon_vrev32v8qi;  break;
+       case V8HImode:  gen = gen_neon_vrev64v8hi;  break;
+       case V4HImode:  gen = gen_neon_vrev64v4hi;  break;
+       default:
+         return false;
+       }
+      break;
+    case 1:
+      switch (d->vmode)
+       {
+       case V16QImode: gen = gen_neon_vrev16v16qi; break;
+       case V8QImode:  gen = gen_neon_vrev16v8qi;  break;
+       case V8HImode:  gen = gen_neon_vrev32v8hi;  break;
+       case V4HImode:  gen = gen_neon_vrev32v4hi;  break;
+       case V4SImode:  gen = gen_neon_vrev64v4si;  break;
+       case V2SImode:  gen = gen_neon_vrev64v2si;  break;
+       case V4SFmode:  gen = gen_neon_vrev64v4sf;  break;
+       case V2SFmode:  gen = gen_neon_vrev64v2sf;  break;
+       default:
+         return false;
+       }
+      break;
+    default:
+      return false;
+    }
+
+  /* Check every group.  Each group holds DIFF + 1 elements, so the
+     outer loop must advance by DIFF + 1.  Advancing by DIFF makes
+     consecutive groups overlap: perm[DIFF] would have to equal both 0
+     and 2 * DIFF, so no selector could ever match.  */
+  for (i = 0; i < nelt; i += diff + 1)
+    for (j = 0; j <= diff; j += 1)
+      {
+       /* Every mode accepted above has a multiple of DIFF + 1
+          elements, so a whole group always fits.  */
+       gcc_assert (i + j < nelt);
+       if (d->perm[i + j] != i + diff - j)
+         return false;
+      }
+
+  /* Success! */
+  if (d->testing_p)
+    return true;
+
+  /* ??? The third operand is an artifact of the builtin infrastructure
+     and is ignored by the actual instruction.  */
+  emit_insn (gen (d->target, d->op0, const0_rtx));
+  return true;
+}
+
+/* Try to match the constant permutation D against the VTRN insns,
+   i.e. a selector of the form ODD, ODD + NELT, ODD + 2,
+   ODD + 2 + NELT, ... with ODD being 0 or 1.  Return true on a match;
+   unless D->testing_p, also emit the insn.  */
+
+static bool
+arm_evpc_neon_vtrn (struct expand_vec_perm_d *d)
+{
+  unsigned int idx, odd, index_mask, nelt = d->nelt;
+  rtx res0, res1, src0, src1, scratch;
+  rtx (*gen_vtrn) (rtx, rtx, rtx, rtx);
+
+  if (GET_MODE_UNIT_SIZE (d->vmode) >= 8)
+    return false;
+
+  /* The checks below are written for little-endian; big-endian is
+     adjusted for when the insn is emitted.  */
+  if (d->perm[0] > 1)
+    return false;
+  odd = d->perm[0];
+  index_mask = d->one_vector_p ? nelt - 1 : 2 * nelt - 1;
+
+  for (idx = 0; idx < nelt; idx += 2)
+    {
+      if (d->perm[idx] != idx + odd)
+        return false;
+      if (d->perm[idx + 1] != ((idx + nelt + odd) & index_mask))
+        return false;
+    }
+
+  /* The selector matches.  */
+  if (d->testing_p)
+    return true;
+
+  switch (d->vmode)
+    {
+    case V16QImode:
+      gen_vtrn = gen_neon_vtrnv16qi_internal;
+      break;
+    case V8QImode:
+      gen_vtrn = gen_neon_vtrnv8qi_internal;
+      break;
+    case V8HImode:
+      gen_vtrn = gen_neon_vtrnv8hi_internal;
+      break;
+    case V4HImode:
+      gen_vtrn = gen_neon_vtrnv4hi_internal;
+      break;
+    case V4SImode:
+      gen_vtrn = gen_neon_vtrnv4si_internal;
+      break;
+    case V2SImode:
+      gen_vtrn = gen_neon_vtrnv2si_internal;
+      break;
+    case V2SFmode:
+      gen_vtrn = gen_neon_vtrnv2sf_internal;
+      break;
+    case V4SFmode:
+      gen_vtrn = gen_neon_vtrnv4sf_internal;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  if (BYTES_BIG_ENDIAN)
+    {
+      src0 = d->op1;
+      src1 = d->op0;
+      odd = !odd;
+    }
+  else
+    {
+      src0 = d->op0;
+      src1 = d->op1;
+    }
+
+  /* The insn has two results.  Put the one we want in D->target and
+     the other in a fresh scratch register.  */
+  scratch = gen_reg_rtx (d->vmode);
+  if (odd)
+    {
+      res0 = scratch;
+      res1 = d->target;
+    }
+  else
+    {
+      res0 = d->target;
+      res1 = scratch;
+    }
+
+  emit_insn (gen_vtrn (res0, src0, src1, res1));
+  return true;
+}
+
+/* Fallback for constant permutations: expand D with VTBL.
+
+   The NEON VTBL instruction is a fully variable permutation that is
+   even stronger than what we expose via VEC_PERM_EXPR.  What it does
+   not do is mask the index operand as VEC_PERM_EXPR requires.  With a
+   constant selector no mask is needed, so expanding it here does
+   slightly better than the variable path.  */
+
+static bool
+arm_evpc_neon_vtbl (struct expand_vec_perm_d *d)
+{
+  enum machine_mode mode = d->vmode;
+  unsigned int k, nelt = d->nelt;
+  rtx index_elts[MAX_VECT_LEN];
+  rtx selector;
+
+  /* TODO: ARM's VTBL indexing is little-endian.  GCC numbers the
+     elements the other way round for big-endian, so the order would
+     have to be reversed there.  */
+  if (BYTES_BIG_ENDIAN)
+    return false;
+
+  if (d->testing_p)
+    return true;
+
+  /* Generic code tries a constant permutation twice: once in the
+     original mode and once with the elements lowered to QImode.
+     Decline anything but the byte modes here and wait for the second
+     attempt instead of expanding the selector ourselves.  */
+  if (!(mode == V8QImode || mode == V16QImode))
+    return false;
+
+  /* Build the selector as a constant vector and load it into a
+     register for the table lookup.  */
+  k = 0;
+  while (k < nelt)
+    {
+      index_elts[k] = GEN_INT (d->perm[k]);
+      k++;
+    }
+  selector = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (nelt, index_elts));
+  selector = force_reg (mode, selector);
+
+  arm_expand_vec_perm_1 (d->target, d->op0, d->op1, selector);
+  return true;
+}
+
+/* Run the NEON recognizers over the constant permutation D in turn
+   and return true as soon as one accepts it.  D may be modified: when
+   its first index names the second operand, the operands are swapped
+   and the selector renumbered to match.  */
+
+static bool
+arm_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
+{
+  /* The recognizers expect the selector to begin with a small index
+     (0, 1 or N/2).  If it begins with an index into the second
+     operand, exchange the operands so that it does.  */
+  if (d->perm[0] >= d->nelt)
+    {
+      unsigned int k, nelt = d->nelt;
+      rtx old_op0 = d->op0;
+
+      for (k = 0; k < nelt; ++k)
+        d->perm[k] = (d->perm[k] + nelt) & (2 * nelt - 1);
+
+      d->op0 = d->op1;
+      d->op1 = old_op0;
+    }
+
+  if (!TARGET_NEON)
+    return false;
+
+  /* Specialised insns first, the general table lookup last.  */
+  return (arm_evpc_neon_vuzp (d)
+          || arm_evpc_neon_vzip (d)
+          || arm_evpc_neon_vrev (d)
+          || arm_evpc_neon_vtrn (d)
+          || arm_evpc_neon_vtbl (d));
+}
+
+/* Expand a vec_perm_const pattern: permute OP0 and OP1 into TARGET
+   according to the constant selector vector SEL.  Return true if
+   insns were emitted for it, false if no recognizer accepted it.  */
+
+bool
+arm_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
+{
+  struct expand_vec_perm_d d;
+  int k, nelt, which;
+
+  d.target = target;
+  d.op0 = op0;
+  d.op1 = op1;
+
+  d.vmode = GET_MODE (target);
+  gcc_assert (VECTOR_MODE_P (d.vmode));
+  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
+  d.testing_p = false;
+
+  /* Copy the selector, reduced modulo 2*NELT, and record which inputs
+     it refers to: bit 0 for the first operand, bit 1 for the second.  */
+  which = 0;
+  for (k = 0; k < nelt; ++k)
+    {
+      int index = INTVAL (XVECEXP (sel, 0, k)) & (2 * nelt - 1);
+
+      if (index < nelt)
+        which |= 1;
+      else
+        which |= 2;
+      d.perm[k] = index;
+    }
+
+  if (which == 1)
+    {
+      /* Only the first operand is referenced.  */
+      d.op1 = op0;
+      d.one_vector_p = true;
+    }
+  else if (which == 2 || (which == 3 && rtx_equal_p (op0, op1)))
+    {
+      /* Either only the second operand is referenced, or both are but
+         they are identical.  Fold the selector onto a single input
+         vector, which makes the permutation easier to match.  */
+      for (k = 0; k < nelt; ++k)
+        d.perm[k] &= nelt - 1;
+      d.op0 = op1;
+      d.one_vector_p = true;
+    }
+  else if (which == 3)
+    /* Two distinct operands are really needed.  */
+    d.one_vector_p = false;
+  else
+    gcc_unreachable ();
+
+  return arm_expand_vec_perm_const_1 (&d);
+}
+
+/* Implement TARGET_VECTORIZE_VEC_PERM_CONST_OK.  Return true if the
+   constant selector SEL, holding one index per element of VMODE, is
+   accepted by the recognizers.  They are run in testing mode on
+   placeholder registers inside a sequence that is ended without
+   being emitted.  */
+
+static bool
+arm_vectorize_vec_perm_const_ok (enum machine_mode vmode,
+                                const unsigned char *sel)
+{
+  struct expand_vec_perm_d d;
+  unsigned int k, nelt, which;
+  bool matched;
+
+  d.vmode = vmode;
+  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
+  d.testing_p = true;
+  memcpy (d.perm, sel, nelt);
+
+  /* Record which inputs the selector refers to: bit 0 for the first
+     vector, bit 1 for the second.  */
+  which = 0;
+  for (k = 0; k < nelt; ++k)
+    {
+      unsigned char index = d.perm[k];
+
+      gcc_assert (index < 2 * nelt);
+      if (index < nelt)
+        which |= 1;
+      else
+        which |= 2;
+    }
+
+  /* A selector that uses only the second vector is renumbered to use
+     the first.  */
+  if (which == 2)
+    {
+      for (k = 0; k < nelt; ++k)
+        d.perm[k] -= nelt;
+    }
+
+  /* Anything other than a mix of both vectors is a one-vector
+     permutation.  */
+  d.one_vector_p = (which != 3);
+
+  /* Raw registers numbered just past the virtual registers stand in
+     for the operands.  */
+  d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
+  d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
+  d.op1 = d.op0;
+  if (!d.one_vector_p)
+    d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
+
+  start_sequence ();
+  matched = arm_expand_vec_perm_const_1 (&d);
+  end_sequence ();
+
+  return matched;
+}
+
+\f
+#include "gt-arm.h"