/* Output routines for GCC for ARM.
Copyright (C) 1991, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
- 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
+ 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
Free Software Foundation, Inc.
Contributed by Pieter `Tiggr' Schoenmakers (rcpieter@win.tue.nl)
and Martin Simmons (@harleqn.co.uk).
static bool arm_9e_rtx_costs (rtx, enum rtx_code, enum rtx_code, int *, bool);
static bool arm_rtx_costs (rtx, int, int, int, int *, bool);
static int arm_address_cost (rtx, bool);
+static int arm_register_move_cost (enum machine_mode, reg_class_t, reg_class_t);
+static int arm_memory_move_cost (enum machine_mode, reg_class_t, bool);
static bool arm_memory_load_p (rtx);
static bool arm_cirrus_insn_p (rtx);
static void cirrus_reorg (rtx);
static int arm_default_branch_cost (bool, bool);
static int arm_cortex_a5_branch_cost (bool, bool);
+static bool arm_vectorize_vec_perm_const_ok (enum machine_mode vmode,
+ const unsigned char *sel);
+
\f
/* Table of machine attributes. */
static const struct attribute_spec arm_attribute_table[] =
#undef TARGET_SCHED_ADJUST_COST
#define TARGET_SCHED_ADJUST_COST arm_adjust_cost
+#undef TARGET_REGISTER_MOVE_COST
+#define TARGET_REGISTER_MOVE_COST arm_register_move_cost
+
+#undef TARGET_MEMORY_MOVE_COST
+#define TARGET_MEMORY_MOVE_COST arm_memory_move_cost
+
#undef TARGET_ENCODE_SECTION_INFO
#ifdef ARM_PE
#define TARGET_ENCODE_SECTION_INFO arm_pe_encode_section_info
#define TARGET_PREFERRED_RENAME_CLASS \
arm_preferred_rename_class
+#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
+#define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
+ arm_vectorize_vec_perm_const_ok
+
struct gcc_target targetm = TARGET_INITIALIZER;
\f
/* Obstack for minipool constant handling. */
global_options_set.x_param_values);
/* ARM EABI defaults to strict volatile bitfields. */
- if (TARGET_AAPCS_BASED && flag_strict_volatile_bitfields < 0)
+  if (TARGET_AAPCS_BASED && flag_strict_volatile_bitfields < 0
+      && abi_version_at_least (2))
flag_strict_volatile_bitfields = 1;
/* Enable sw prefetching at -O3 for CPUS that have prefetch, and we have deemed
add_libcall (libcall_htab,
convert_optab_libfunc (trunc_optab, HFmode, SFmode));
add_libcall (libcall_htab,
+ convert_optab_libfunc (sfix_optab, SImode, DFmode));
+ add_libcall (libcall_htab,
+ convert_optab_libfunc (ufix_optab, SImode, DFmode));
+ add_libcall (libcall_htab,
convert_optab_libfunc (sfix_optab, DImode, DFmode));
add_libcall (libcall_htab,
convert_optab_libfunc (ufix_optab, DImode, DFmode));
(TARGET_VFP_DOUBLE || !is_double));
}
+/* Return true if an argument whose type is TYPE, or mode is MODE, is
+ suitable for passing or returning in VFP registers for the PCS
+ variant selected. If it is, then *BASE_MODE is updated to contain
+ a machine mode describing each element of the argument's type and
+ *COUNT to hold the number of such elements. */
static bool
aapcs_vfp_is_call_or_return_candidate (enum arm_pcs pcs_variant,
				       enum machine_mode mode, const_tree type,
				       enum machine_mode *base_mode, int *count)
{
enum machine_mode new_mode = VOIDmode;
- if (GET_MODE_CLASS (mode) == MODE_FLOAT
- || GET_MODE_CLASS (mode) == MODE_VECTOR_INT
- || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
+ /* If we have the type information, prefer that to working things
+ out from the mode. */
+ if (type)
+ {
+ int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
+
+ if (ag_count > 0 && ag_count <= 4)
+ *count = ag_count;
+ else
+ return false;
+ }
+ else if (GET_MODE_CLASS (mode) == MODE_FLOAT
+ || GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+ || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
    {
      *count = 1;
      new_mode = mode;
    }
  else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
    {
      *count = 2;
      new_mode = (mode == DCmode ? DFmode : SFmode);
    }
- else if (type && (mode == BLKmode || TREE_CODE (type) == VECTOR_TYPE))
- {
- int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
-
- if (ag_count > 0 && ag_count <= 4)
- *count = ag_count;
- else
- return false;
- }
else
return false;
if (IS_STACKALIGN (func_type))
return false;
+ /* The AAPCS says that, on bare-metal, calls to unresolved weak
+ references should become a NOP. Don't convert such calls into
+ sibling calls. */
+ if (TARGET_AAPCS_BASED
+ && arm_abi == ARM_ABI_AAPCS
+ && DECL_WEAK (decl))
+ return false;
+
/* Everything else is ok. */
return true;
}
if (TARGET_32BIT)
{
- emit_insn (gen_pic_load_addr_32bit (pic_reg, pic_rtx));
- if (TARGET_ARM)
- emit_insn (gen_pic_add_dot_plus_eight (pic_reg, pic_reg, labelno));
- else
- emit_insn (gen_pic_add_dot_plus_four (pic_reg, pic_reg, labelno));
+ emit_insn (gen_pic_load_addr_unified (pic_reg, pic_rtx, labelno));
}
else /* TARGET_THUMB1 */
{
thumb_find_work_register (saved_regs));
emit_insn (gen_pic_load_addr_thumb1 (pic_tmp, pic_rtx));
emit_insn (gen_movsi (pic_offset_table_rtx, pic_tmp));
+ emit_insn (gen_pic_add_dot_plus_four (pic_reg, pic_reg, labelno));
}
else
- emit_insn (gen_pic_load_addr_thumb1 (pic_reg, pic_rtx));
- emit_insn (gen_pic_add_dot_plus_four (pic_reg, pic_reg, labelno));
+ emit_insn (gen_pic_load_addr_unified (pic_reg, pic_rtx, labelno));
}
}
UNSPEC_SYMBOL_OFFSET);
offset_rtx = gen_rtx_CONST (Pmode, offset_rtx);
- if (TARGET_32BIT)
- {
- emit_insn (gen_pic_load_addr_32bit (reg, offset_rtx));
- if (TARGET_ARM)
- insn = emit_insn (gen_pic_add_dot_plus_eight (reg, reg, labelno));
- else
- insn = emit_insn (gen_pic_add_dot_plus_four (reg, reg, labelno));
- }
- else /* TARGET_THUMB1 */
- {
- emit_insn (gen_pic_load_addr_thumb1 (reg, offset_rtx));
- insn = emit_insn (gen_pic_add_dot_plus_four (reg, reg, labelno));
- }
-
+ insn = emit_insn (gen_pic_load_addr_unified (reg, offset_rtx, labelno));
return insn;
}
will_be_in_index_register (const_rtx x)
{
/* arm.md: calculate_pic_address will split this into a register. */
- return GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_PIC_SYM;
+ return GET_CODE (x) == UNSPEC && (XINT (x, 1) == UNSPEC_PIC_SYM);
}
/* Return nonzero if X is a valid ARM state address operand. */
case SET:
return false;
+
+ case UNSPEC:
+ /* We cost this as high as our memory costs to allow this to
+ be hoisted from loops. */
+ if (XINT (x, 1) == UNSPEC_PIC_UNIFIED)
+ {
+ *total = COSTS_N_INSNS (2 + ARM_NUM_REGS (mode));
+ }
+ return true;
default:
*total = COSTS_N_INSNS (4);
return true;
}
+/* Implement TARGET_REGISTER_MOVE_COST.
+
+ Moves between FPA_REGS and GENERAL_REGS are two memory insns.
+ Moves between VFP_REGS and GENERAL_REGS are a single insn, but
+ it is typically more expensive than a single memory access. We set
+ the cost to less than two memory accesses so that floating
+ point to integer conversion does not go through memory. */
+
+static int
+arm_register_move_cost (enum machine_mode mode ATTRIBUTE_UNUSED,
+ reg_class_t from, reg_class_t to)
+{
+ if (TARGET_32BIT)
+ {
+ if ((from == FPA_REGS && to != FPA_REGS)
+ || (from != FPA_REGS && to == FPA_REGS))
+ return 20;
+ else if ((IS_VFP_CLASS (from) && !IS_VFP_CLASS (to))
+ || (!IS_VFP_CLASS (from) && IS_VFP_CLASS (to)))
+ return 15;
+ else if ((from == IWMMXT_REGS && to != IWMMXT_REGS)
+ || (from != IWMMXT_REGS && to == IWMMXT_REGS))
+ return 4;
+ else if (from == IWMMXT_GR_REGS || to == IWMMXT_GR_REGS)
+ return 20;
+ else if ((from == CIRRUS_REGS && to != CIRRUS_REGS)
+ || (from != CIRRUS_REGS && to == CIRRUS_REGS))
+ return 20;
+ else
+ return 2;
+ }
+ else
+ {
+ if (from == HI_REGS || to == HI_REGS)
+ return 4;
+ else
+ return 2;
+ }
+}
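+
+/* Editorial note, not part of the original patch: with these numbers a
+   VFP<->core transfer (15) is cheaper than a store/load round trip
+   (2 * 10, see arm_memory_move_cost below), so float-to-integer
+   conversions stay in registers, as the comment above intends.  */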
+
+/* Implement TARGET_MEMORY_MOVE_COST. */
+
+static int
+arm_memory_move_cost (enum machine_mode mode, reg_class_t rclass,
+ bool in ATTRIBUTE_UNUSED)
+{
+ if (TARGET_32BIT)
+ return 10;
+ else
+ {
+ if (GET_MODE_SIZE (mode) < 4)
+ return 8;
+ else
+ return ((2 * GET_MODE_SIZE (mode)) * (rclass == LO_REGS ? 1 : 2));
+ }
+}
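+
+/* Editorial illustration, not part of the original patch: on Thumb-1 a
+   DImode value (GET_MODE_SIZE == 8) costs 2 * 8 * 1 == 16 when moved
+   through LO_REGS but 2 * 8 * 2 == 32 through any other class, so wide
+   memory traffic is steered to the low registers.  */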
+
/* This function implements the target macro TARGET_SCHED_ADJUST_COST.
It corrects the value of COST based on the relationship between
INSN and DEP through the dependence LINK. It returns the new
arm_note_pic_base (rtx *x, void *date ATTRIBUTE_UNUSED)
{
if (GET_CODE (*x) == UNSPEC
- && XINT (*x, 1) == UNSPEC_PIC_BASE)
+ && (XINT (*x, 1) == UNSPEC_PIC_BASE
+ || XINT (*x, 1) == UNSPEC_PIC_UNIFIED))
return 1;
return 0;
}
return CC_Zmode;
/* We can do an equality test in three Thumb instructions. */
- if (!TARGET_ARM)
+ if (!TARGET_32BIT)
return CC_Zmode;
/* FALLTHROUGH */
/* DImode unsigned comparisons can be implemented by cmp +
cmpeq without a scratch register. Not worth doing in
Thumb-2. */
- if (TARGET_ARM)
+ if (TARGET_32BIT)
return CC_CZmode;
/* FALLTHROUGH */
return the rtx for register 0 in the proper mode. FP means this is a
floating point compare: I don't think that it is needed on the arm. */
rtx
-arm_gen_compare_reg (enum rtx_code code, rtx x, rtx y)
+arm_gen_compare_reg (enum rtx_code code, rtx x, rtx y, rtx scratch)
{
enum machine_mode mode;
rtx cc_reg;
CC_CZmode is cheaper. */
if (mode == CC_Zmode && y != const0_rtx)
{
+ gcc_assert (!reload_completed);
x = expand_binop (DImode, xor_optab, x, y, NULL_RTX, 0, OPTAB_WIDEN);
y = const0_rtx;
}
+
/* A scratch register is required. */
- clobber = gen_rtx_CLOBBER (VOIDmode, gen_rtx_SCRATCH (SImode));
+ if (reload_completed)
+ gcc_assert (scratch != NULL && GET_MODE (scratch) == SImode);
+ else
+ scratch = gen_rtx_SCRATCH (SImode);
+
+ clobber = gen_rtx_CLOBBER (VOIDmode, scratch);
set = gen_rtx_SET (VOIDmode, cc_reg, gen_rtx_COMPARE (mode, x, y));
emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, set, clobber)));
}
output_asm_insn ("sub%?\t%0, %1, %2", otherops);
}
+ if (count)
+ *count = 2;
+
if (TARGET_LDRD)
return "ldr%(d%)\t%0, [%1]";
memsize = MEM_SIZE (x);
/* Only certain alignment specifiers are supported by the hardware. */
- if (memsize == 16 && (align % 32) == 0)
+ if (memsize == 32 && (align % 32) == 0)
align_bits = 256;
- else if ((memsize == 8 || memsize == 16) && (align % 16) == 0)
+ else if ((memsize == 16 || memsize == 32) && (align % 16) == 0)
align_bits = 128;
- else if ((align % 8) == 0)
+ else if (memsize >= 8 && (align % 8) == 0)
align_bits = 64;
else
align_bits = 0;
}
return;
+ case 'v':
+ gcc_assert (GET_CODE (x) == CONST_DOUBLE);
+ fprintf (stream, "#%d", vfp3_const_double_for_fract_bits (x));
+ return;
+
/* Register specifier for vld1.16/vst1.16. Translate the S register
number into a D register number and element index. */
case 'z':
VAR3 (BINOP, vsubhn, v8hi, v4si, v2di),
VAR8 (BINOP, vceq, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf),
VAR8 (BINOP, vcge, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf),
+ VAR6 (BINOP, vcgeu, v8qi, v4hi, v2si, v16qi, v8hi, v4si),
VAR8 (BINOP, vcgt, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf),
+ VAR6 (BINOP, vcgtu, v8qi, v4hi, v2si, v16qi, v8hi, v4si),
VAR2 (BINOP, vcage, v2sf, v4sf),
VAR2 (BINOP, vcagt, v2sf, v4sf),
VAR6 (BINOP, vtst, v8qi, v4hi, v2si, v16qi, v8hi, v4si),
}
}
+/* Split a vcombine: move op[1] and op[2] into the low and high halves
+   of op[0].  */
+
+void
+neon_split_vcombine (rtx operands[3])
+{
+ unsigned int dest = REGNO (operands[0]);
+ unsigned int src1 = REGNO (operands[1]);
+ unsigned int src2 = REGNO (operands[2]);
+ enum machine_mode halfmode = GET_MODE (operands[1]);
+ unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
+ rtx destlo, desthi;
+
+ if (src1 == dest && src2 == dest + halfregs)
+ {
+ /* No-op move. Can't split to nothing; emit something. */
+ emit_note (NOTE_INSN_DELETED);
+ return;
+ }
+
+ /* Preserve register attributes for variable tracking. */
+ destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
+ desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
+ GET_MODE_SIZE (halfmode));
+
+ /* Special case of reversed high/low parts. Use VSWP. */
+ if (src2 == dest && src1 == dest + halfregs)
+ {
+ rtx x = gen_rtx_SET (VOIDmode, destlo, operands[1]);
+ rtx y = gen_rtx_SET (VOIDmode, desthi, operands[2]);
+ emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, x, y)));
+ return;
+ }
+
+ if (!reg_overlap_mentioned_p (operands[2], destlo))
+ {
+ /* Try to avoid unnecessary moves if part of the result
+ is in the right place already. */
+ if (src1 != dest)
+ emit_move_insn (destlo, operands[1]);
+ if (src2 != dest + halfregs)
+ emit_move_insn (desthi, operands[2]);
+ }
+ else
+ {
+ if (src2 != dest + halfregs)
+ emit_move_insn (desthi, operands[2]);
+ if (src1 != dest)
+ emit_move_insn (destlo, operands[1]);
+ }
+}
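+
+/* Editorial note, not part of the original patch: when the two source
+   halves arrive swapped (src2 == dest, src1 == dest + halfregs), the
+   PARALLEL of both half-sets emitted above lets the backend output a
+   single VSWP instead of three moves through a temporary.  */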
+
/* Expand an expression EXP that calls a built-in function,
with result going to TARGET if that's convenient
(and in mode MODE if that's convenient).
return !TARGET_THUMB1;
}
-/* Legitimize a memory reference for sync primitive implemented using
- ldrex / strex. We currently force the form of the reference to be
- indirect without offset. We do not yet support the indirect offset
- addressing supported by some ARM targets for these
- instructions. */
-static rtx
-arm_legitimize_sync_memory (rtx memory)
+static unsigned int
+arm_autovectorize_vector_sizes (void)
{
- rtx addr = force_reg (Pmode, XEXP (memory, 0));
- rtx legitimate_memory = gen_rtx_MEM (GET_MODE (memory), addr);
-
- set_mem_alias_set (legitimate_memory, ALIAS_SET_MEMORY_BARRIER);
- MEM_VOLATILE_P (legitimate_memory) = MEM_VOLATILE_P (memory);
- return legitimate_memory;
+ return TARGET_NEON_VECTORIZE_DOUBLE ? 0 : (16 | 8);
}
-/* An instruction emitter. */
-typedef void (* emit_f) (int label, const char *, rtx *);
-
-/* An instruction emitter that emits via the conventional
- output_asm_insn. */
-static void
-arm_emit (int label ATTRIBUTE_UNUSED, const char *pattern, rtx *operands)
+static bool
+arm_vector_alignment_reachable (const_tree type, bool is_packed)
{
- output_asm_insn (pattern, operands);
-}
-
-/* Count the number of emitted synchronization instructions. */
-static unsigned arm_insn_count;
+ /* Vectors which aren't in packed structures will not be less aligned than
+ the natural alignment of their element type, so this is safe. */
+ if (TARGET_NEON && !BYTES_BIG_ENDIAN)
+ return !is_packed;
-/* An emitter that counts emitted instructions but does not actually
- emit instruction into the instruction stream. */
-static void
-arm_count (int label,
- const char *pattern ATTRIBUTE_UNUSED,
- rtx *operands ATTRIBUTE_UNUSED)
-{
- if (! label)
- ++ arm_insn_count;
+ return default_builtin_vector_alignment_reachable (type, is_packed);
}
-/* Construct a pattern using conventional output formatting and feed
- it to output_asm_insn. Provides a mechanism to construct the
- output pattern on the fly. Note the hard limit on the pattern
- buffer size. */
-static void ATTRIBUTE_PRINTF_4
-arm_output_asm_insn (emit_f emit, int label, rtx *operands,
- const char *pattern, ...)
+static bool
+arm_builtin_support_vector_misalignment (enum machine_mode mode,
+ const_tree type, int misalignment,
+ bool is_packed)
{
- va_list ap;
- char buffer[256];
+ if (TARGET_NEON && !BYTES_BIG_ENDIAN)
+ {
+ HOST_WIDE_INT align = TYPE_ALIGN_UNIT (type);
- va_start (ap, pattern);
- vsprintf (buffer, pattern, ap);
- va_end (ap);
- emit (label, buffer, operands);
+      if (is_packed)
+	return align == 1;
+
+ /* If the misalignment is unknown, we should be able to handle the access
+ so long as it is not to a member of a packed data structure. */
+ if (misalignment == -1)
+ return true;
+
+ /* Return true if the misalignment is a multiple of the natural alignment
+ of the vector's element type. This is probably always going to be
+ true in practice, since we've already established that this isn't a
+ packed access. */
+ return ((misalignment % align) == 0);
+ }
+
+ return default_builtin_support_vector_misalignment (mode, type, misalignment,
+ is_packed);
}
-/* Emit the memory barrier instruction, if any, provided by this
- target to a specified emitter. */
static void
-arm_process_output_memory_barrier (emit_f emit, rtx *operands)
+arm_conditional_register_usage (void)
{
- if (TARGET_HAVE_DMB)
+ int regno;
+
+ if (TARGET_SOFT_FLOAT || TARGET_THUMB1 || !TARGET_FPA)
{
- /* Note we issue a system level barrier. We should consider
- issuing a inner shareabilty zone barrier here instead, ie.
- "DMB ISH". */
- emit (0, "dmb\tsy", operands);
- return;
+ for (regno = FIRST_FPA_REGNUM;
+ regno <= LAST_FPA_REGNUM; ++regno)
+ fixed_regs[regno] = call_used_regs[regno] = 1;
}
- if (TARGET_HAVE_DMB_MCR)
+ if (TARGET_THUMB1 && optimize_size)
{
- emit (0, "mcr\tp15, 0, r0, c7, c10, 5", operands);
- return;
+ /* When optimizing for size on Thumb-1, it's better not
+ to use the HI regs, because of the overhead of
+ stacking them. */
+ for (regno = FIRST_HI_REGNUM;
+ regno <= LAST_HI_REGNUM; ++regno)
+ fixed_regs[regno] = call_used_regs[regno] = 1;
}
- gcc_unreachable ();
-}
-
-/* Emit the memory barrier instruction, if any, provided by this
- target. */
-const char *
-arm_output_memory_barrier (rtx *operands)
-{
- arm_process_output_memory_barrier (arm_emit, operands);
- return "";
-}
+ /* The link register can be clobbered by any branch insn,
+ but we have no way to track that at present, so mark
+ it as unavailable. */
+ if (TARGET_THUMB1)
+ fixed_regs[LR_REGNUM] = call_used_regs[LR_REGNUM] = 1;
-/* Helper to figure out the instruction suffix required on ldrex/strex
- for operations on an object of the specified mode. */
-static const char *
-arm_ldrex_suffix (enum machine_mode mode)
-{
- switch (mode)
+ if (TARGET_32BIT && TARGET_HARD_FLOAT)
{
- case QImode: return "b";
- case HImode: return "h";
- case SImode: return "";
- case DImode: return "d";
- default:
- gcc_unreachable ();
+ if (TARGET_MAVERICK)
+ {
+ for (regno = FIRST_FPA_REGNUM;
+ regno <= LAST_FPA_REGNUM; ++ regno)
+ fixed_regs[regno] = call_used_regs[regno] = 1;
+ for (regno = FIRST_CIRRUS_FP_REGNUM;
+ regno <= LAST_CIRRUS_FP_REGNUM; ++ regno)
+ {
+ fixed_regs[regno] = 0;
+ call_used_regs[regno] = regno < FIRST_CIRRUS_FP_REGNUM + 4;
+ }
+ }
+ if (TARGET_VFP)
+ {
+ /* VFPv3 registers are disabled when earlier VFP
+ versions are selected due to the definition of
+ LAST_VFP_REGNUM. */
+ for (regno = FIRST_VFP_REGNUM;
+ regno <= LAST_VFP_REGNUM; ++ regno)
+ {
+ fixed_regs[regno] = 0;
+ call_used_regs[regno] = regno < FIRST_VFP_REGNUM + 16
+ || regno >= FIRST_VFP_REGNUM + 32;
+ }
+ }
}
- return "";
-}
-/* Emit an ldrex{b,h,d, } instruction appropriate for the specified
- mode. */
-static void
-arm_output_ldrex (emit_f emit,
- enum machine_mode mode,
- rtx target,
- rtx memory)
-{
- rtx operands[3];
-
- operands[0] = target;
- if (mode != DImode)
+ if (TARGET_REALLY_IWMMXT)
{
- const char *suffix = arm_ldrex_suffix (mode);
- operands[1] = memory;
- arm_output_asm_insn (emit, 0, operands, "ldrex%s\t%%0, %%C1", suffix);
+ regno = FIRST_IWMMXT_GR_REGNUM;
+ /* The 2002/10/09 revision of the XScale ABI has wCG0
+ and wCG1 as call-preserved registers. The 2002/11/21
+ revision changed this so that all wCG registers are
+ scratch registers. */
+ for (regno = FIRST_IWMMXT_GR_REGNUM;
+ regno <= LAST_IWMMXT_GR_REGNUM; ++ regno)
+ fixed_regs[regno] = 0;
+ /* The XScale ABI has wR0 - wR9 as scratch registers,
+ the rest as call-preserved registers. */
+ for (regno = FIRST_IWMMXT_REGNUM;
+ regno <= LAST_IWMMXT_REGNUM; ++ regno)
+ {
+ fixed_regs[regno] = 0;
+ call_used_regs[regno] = regno < FIRST_IWMMXT_REGNUM + 10;
+ }
}
- else
+
+ if ((unsigned) PIC_OFFSET_TABLE_REGNUM != INVALID_REGNUM)
{
- /* The restrictions on target registers in ARM mode are that the two
- registers are consecutive and the first one is even; Thumb is
- actually more flexible, but DI should give us this anyway.
- Note that the 1st register always gets the lowest word in memory. */
- gcc_assert ((REGNO (target) & 1) == 0);
- operands[1] = gen_rtx_REG (SImode, REGNO (target) + 1);
- operands[2] = memory;
- arm_output_asm_insn (emit, 0, operands, "ldrexd\t%%0, %%1, %%C2");
+ fixed_regs[PIC_OFFSET_TABLE_REGNUM] = 1;
+ call_used_regs[PIC_OFFSET_TABLE_REGNUM] = 1;
}
-}
-
-/* Emit a strex{b,h,d, } instruction appropriate for the specified
- mode. */
-static void
-arm_output_strex (emit_f emit,
- enum machine_mode mode,
- const char *cc,
- rtx result,
- rtx value,
- rtx memory)
-{
- rtx operands[4];
-
- operands[0] = result;
- operands[1] = value;
- if (mode != DImode)
+ else if (TARGET_APCS_STACK)
{
- const char *suffix = arm_ldrex_suffix (mode);
- operands[2] = memory;
- arm_output_asm_insn (emit, 0, operands, "strex%s%s\t%%0, %%1, %%C2",
- suffix, cc);
+ fixed_regs[10] = 1;
+ call_used_regs[10] = 1;
}
- else
+ /* -mcaller-super-interworking reserves r11 for calls to
+ _interwork_r11_call_via_rN(). Making the register global
+ is an easy way of ensuring that it remains valid for all
+ calls. */
+ if (TARGET_APCS_FRAME || TARGET_CALLER_INTERWORKING
+ || TARGET_TPCS_FRAME || TARGET_TPCS_LEAF_FRAME)
{
- /* The restrictions on target registers in ARM mode are that the two
- registers are consecutive and the first one is even; Thumb is
- actually more flexible, but DI should give us this anyway.
- Note that the 1st register always gets the lowest word in memory. */
- gcc_assert ((REGNO (value) & 1) == 0 || TARGET_THUMB2);
- operands[2] = gen_rtx_REG (SImode, REGNO (value) + 1);
- operands[3] = memory;
- arm_output_asm_insn (emit, 0, operands, "strexd%s\t%%0, %%1, %%2, %%C3",
- cc);
+ fixed_regs[ARM_HARD_FRAME_POINTER_REGNUM] = 1;
+ call_used_regs[ARM_HARD_FRAME_POINTER_REGNUM] = 1;
+ if (TARGET_CALLER_INTERWORKING)
+ global_regs[ARM_HARD_FRAME_POINTER_REGNUM] = 1;
}
+ SUBTARGET_CONDITIONAL_REGISTER_USAGE
}
-/* Helper to emit an it instruction in Thumb2 mode only; although the assembler
- will ignore it in ARM mode, emitting it will mess up instruction counts we
- sometimes keep 'flags' are the extra t's and e's if it's more than one
- instruction that is conditional. */
-static void
-arm_output_it (emit_f emit, const char *flags, const char *cond)
-{
- rtx operands[1]; /* Don't actually use the operand. */
- if (TARGET_THUMB2)
- arm_output_asm_insn (emit, 0, operands, "it%s\t%s", flags, cond);
-}
-
-/* Helper to emit a two operand instruction. */
-static void
-arm_output_op2 (emit_f emit, const char *mnemonic, rtx d, rtx s)
+static reg_class_t
+arm_preferred_rename_class (reg_class_t rclass)
{
- rtx operands[2];
-
- operands[0] = d;
- operands[1] = s;
- arm_output_asm_insn (emit, 0, operands, "%s\t%%0, %%1", mnemonic);
-}
+  /* Thumb-2 instructions using LO_REGS may be smaller than instructions
+     using GENERAL_REGS.  During the register rename pass we therefore
+     prefer LO_REGS, which can reduce code size. */
+ if (TARGET_THUMB2 && rclass == GENERAL_REGS)
+ return LO_REGS;
+ else
+ return NO_REGS;
+}
-/* Helper to emit a three operand instruction. */
-static void
-arm_output_op3 (emit_f emit, const char *mnemonic, rtx d, rtx a, rtx b)
+/* Compute the attribute "length" of insn "*push_multi".
+   So this function MUST be kept in sync with that insn pattern. */
+int
+arm_attr_length_push_multi (rtx parallel_op, rtx first_op)
{
- rtx operands[3];
+ int i, regno, hi_reg;
+ int num_saves = XVECLEN (parallel_op, 0);
- operands[0] = d;
- operands[1] = a;
- operands[2] = b;
- arm_output_asm_insn (emit, 0, operands, "%s\t%%0, %%1, %%2", mnemonic);
-}
+ /* ARM mode. */
+ if (TARGET_ARM)
+ return 4;
+ /* Thumb1 mode. */
+ if (TARGET_THUMB1)
+ return 2;
-/* Emit a load store exclusive synchronization loop.
+ /* Thumb2 mode. */
+ regno = REGNO (first_op);
+ hi_reg = (REGNO_REG_CLASS (regno) == HI_REGS) && (regno != LR_REGNUM);
+ for (i = 1; i < num_saves && !hi_reg; i++)
+ {
+ regno = REGNO (XEXP (XVECEXP (parallel_op, 0, i), 0));
+ hi_reg |= (REGNO_REG_CLASS (regno) == HI_REGS) && (regno != LR_REGNUM);
+ }
- do
- old_value = [mem]
- if old_value != required_value
- break;
- t1 = sync_op (old_value, new_value)
- [mem] = t1, t2 = [0|1]
- while ! t2
+ if (!hi_reg)
+ return 2;
+ return 4;
+}
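+
+/* Editorial illustration, not part of the original patch: in Thumb-2,
+   "push {r4-r7, lr}" keeps the 16-bit encoding (LR is exempted above),
+   while "push {r4, r8}" drags in a high register and therefore the
+   32-bit encoding, giving a length of 4.  */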
- Note:
- t1 == t2 is not permitted
- t1 == old_value is permitted
+/* Compute the number of instructions emitted by output_move_double. */
+int
+arm_count_output_move_double_insns (rtx *operands)
+{
+ int count;
+ rtx ops[2];
+ /* output_move_double may modify the operands array, so call it
+ here on a copy of the array. */
+ ops[0] = operands[0];
+ ops[1] = operands[1];
+ output_move_double (ops, false, &count);
+ return count;
+}
- required_value:
+int
+vfp3_const_double_for_fract_bits (rtx operand)
+{
+ REAL_VALUE_TYPE r0;
+
+ if (GET_CODE (operand) != CONST_DOUBLE)
+ return 0;
+
+ REAL_VALUE_FROM_CONST_DOUBLE (r0, operand);
+ if (exact_real_inverse (DFmode, &r0))
+ {
+ if (exact_real_truncate (DFmode, &r0))
+ {
+ HOST_WIDE_INT value = real_to_integer (&r0);
+ value = value & 0xffffffff;
+	  if ((value != 0) && ((value & (value - 1)) == 0))
+ return int_log2 (value);
+ }
+ }
+ return 0;
+}
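+
+/* Editorial illustration, not part of the original patch: for the
+   constant 0.0625 the exact inverse is 16.0, an exact power of two, so
+   the function returns int_log2 (16) == 4; a constant such as 0.3 has
+   no exact inverse and yields 0.  */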
+\f
+/* Emit a memory barrier around an atomic sequence according to MODEL. */
- RTX register representing the required old_value for
- the modify to continue, if NULL no comparsion is performed. */
static void
-arm_output_sync_loop (emit_f emit,
- enum machine_mode mode,
- rtx old_value,
- rtx memory,
- rtx required_value,
- rtx new_value,
- rtx t1,
- rtx t2,
- enum attr_sync_op sync_op,
- int early_barrier_required)
+arm_pre_atomic_barrier (enum memmodel model)
{
- rtx operands[2];
- /* We'll use the lo for the normal rtx in the none-DI case
- as well as the least-sig word in the DI case. */
- rtx old_value_lo, required_value_lo, new_value_lo, t1_lo;
- rtx old_value_hi, required_value_hi, new_value_hi, t1_hi;
+ switch (model)
+ {
+ case MEMMODEL_RELAXED:
+ case MEMMODEL_CONSUME:
+ case MEMMODEL_ACQUIRE:
+ break;
+ case MEMMODEL_RELEASE:
+ case MEMMODEL_ACQ_REL:
+ case MEMMODEL_SEQ_CST:
+ emit_insn (gen_memory_barrier ());
+ break;
+ default:
+ gcc_unreachable ();
+ }
+}
- bool is_di = mode == DImode;
+static void
+arm_post_atomic_barrier (enum memmodel model)
+{
+ switch (model)
+ {
+ case MEMMODEL_RELAXED:
+ case MEMMODEL_CONSUME:
+ case MEMMODEL_RELEASE:
+ break;
+ case MEMMODEL_ACQUIRE:
+ case MEMMODEL_ACQ_REL:
+ case MEMMODEL_SEQ_CST:
+ emit_insn (gen_memory_barrier ());
+ break;
+ default:
+ gcc_unreachable ();
+ }
+}
- gcc_assert (t1 != t2);
+/* Emit the load-exclusive and store-exclusive instructions. */
- if (early_barrier_required)
- arm_process_output_memory_barrier (emit, NULL);
+static void
+arm_emit_load_exclusive (enum machine_mode mode, rtx rval, rtx mem)
+{
+ rtx (*gen) (rtx, rtx);
- arm_output_asm_insn (emit, 1, operands, "%sLSYT%%=:", LOCAL_LABEL_PREFIX);
+ switch (mode)
+ {
+ case QImode: gen = gen_arm_load_exclusiveqi; break;
+ case HImode: gen = gen_arm_load_exclusivehi; break;
+ case SImode: gen = gen_arm_load_exclusivesi; break;
+ case DImode: gen = gen_arm_load_exclusivedi; break;
+ default:
+ gcc_unreachable ();
+ }
+
+ emit_insn (gen (rval, mem));
+}
- arm_output_ldrex (emit, mode, old_value, memory);
+static void
+arm_emit_store_exclusive (enum machine_mode mode, rtx bval, rtx rval, rtx mem)
+{
+ rtx (*gen) (rtx, rtx, rtx);
- if (is_di)
+ switch (mode)
{
- old_value_lo = gen_lowpart (SImode, old_value);
- old_value_hi = gen_highpart (SImode, old_value);
- if (required_value)
- {
- required_value_lo = gen_lowpart (SImode, required_value);
- required_value_hi = gen_highpart (SImode, required_value);
- }
- else
- {
- /* Silence false potentially unused warning. */
- required_value_lo = NULL_RTX;
- required_value_hi = NULL_RTX;
- }
- new_value_lo = gen_lowpart (SImode, new_value);
- new_value_hi = gen_highpart (SImode, new_value);
- t1_lo = gen_lowpart (SImode, t1);
- t1_hi = gen_highpart (SImode, t1);
+ case QImode: gen = gen_arm_store_exclusiveqi; break;
+ case HImode: gen = gen_arm_store_exclusivehi; break;
+ case SImode: gen = gen_arm_store_exclusivesi; break;
+ case DImode: gen = gen_arm_store_exclusivedi; break;
+ default:
+ gcc_unreachable ();
}
- else
- {
- old_value_lo = old_value;
- new_value_lo = new_value;
- required_value_lo = required_value;
- t1_lo = t1;
- /* Silence false potentially unused warning. */
- t1_hi = NULL_RTX;
- new_value_hi = NULL_RTX;
- required_value_hi = NULL_RTX;
- old_value_hi = NULL_RTX;
- }
+ emit_insn (gen (bval, rval, mem));
+}
- if (required_value)
- {
- operands[0] = old_value_lo;
- operands[1] = required_value_lo;
+/* Emit a jump instruction and mark it as very unlikely to be taken.  */
- arm_output_asm_insn (emit, 0, operands, "cmp\t%%0, %%1");
- if (is_di)
- {
- arm_output_it (emit, "", "eq");
- arm_output_op2 (emit, "cmpeq", old_value_hi, required_value_hi);
- }
- arm_output_asm_insn (emit, 0, operands, "bne\t%sLSYB%%=", LOCAL_LABEL_PREFIX);
- }
+static void
+emit_unlikely_jump (rtx insn)
+{
+ rtx very_unlikely = GEN_INT (REG_BR_PROB_BASE / 100 - 1);
- switch (sync_op)
- {
- case SYNC_OP_ADD:
- arm_output_op3 (emit, is_di ? "adds" : "add",
- t1_lo, old_value_lo, new_value_lo);
- if (is_di)
- arm_output_op3 (emit, "adc", t1_hi, old_value_hi, new_value_hi);
- break;
+ insn = emit_jump_insn (insn);
+ add_reg_note (insn, REG_BR_PROB, very_unlikely);
+}
- case SYNC_OP_SUB:
- arm_output_op3 (emit, is_di ? "subs" : "sub",
- t1_lo, old_value_lo, new_value_lo);
- if (is_di)
- arm_output_op3 (emit, "sbc", t1_hi, old_value_hi, new_value_hi);
- break;
+/* Expand a compare and swap pattern. */
- case SYNC_OP_IOR:
- arm_output_op3 (emit, "orr", t1_lo, old_value_lo, new_value_lo);
- if (is_di)
- arm_output_op3 (emit, "orr", t1_hi, old_value_hi, new_value_hi);
- break;
+void
+arm_expand_compare_and_swap (rtx operands[])
+{
+ rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
+ enum machine_mode mode;
+ rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
+
+ bval = operands[0];
+ rval = operands[1];
+ mem = operands[2];
+ oldval = operands[3];
+ newval = operands[4];
+ is_weak = operands[5];
+ mod_s = operands[6];
+ mod_f = operands[7];
+ mode = GET_MODE (mem);
- case SYNC_OP_XOR:
- arm_output_op3 (emit, "eor", t1_lo, old_value_lo, new_value_lo);
- if (is_di)
- arm_output_op3 (emit, "eor", t1_hi, old_value_hi, new_value_hi);
- break;
+ switch (mode)
+ {
+ case QImode:
+ case HImode:
+ /* For narrow modes, we're going to perform the comparison in SImode,
+ so do the zero-extension now. */
+ rval = gen_reg_rtx (SImode);
+ oldval = convert_modes (SImode, mode, oldval, true);
+ /* FALLTHRU */
- case SYNC_OP_AND:
- arm_output_op3 (emit,"and", t1_lo, old_value_lo, new_value_lo);
- if (is_di)
- arm_output_op3 (emit, "and", t1_hi, old_value_hi, new_value_hi);
+ case SImode:
+ /* Force the value into a register if needed. We waited until after
+ the zero-extension above to do this properly. */
+      if (!arm_add_operand (oldval, SImode))
+	oldval = force_reg (SImode, oldval);
break;
- case SYNC_OP_NAND:
- arm_output_op3 (emit, "and", t1_lo, old_value_lo, new_value_lo);
- if (is_di)
- arm_output_op3 (emit, "and", t1_hi, old_value_hi, new_value_hi);
- arm_output_op2 (emit, "mvn", t1_lo, t1_lo);
- if (is_di)
- arm_output_op2 (emit, "mvn", t1_hi, t1_hi);
+ case DImode:
+ if (!cmpdi_operand (oldval, mode))
+ oldval = force_reg (mode, oldval);
break;
- case SYNC_OP_NONE:
- t1 = new_value;
- t1_lo = new_value_lo;
- if (is_di)
- t1_hi = new_value_hi;
- break;
+ default:
+ gcc_unreachable ();
}
- /* Note that the result of strex is a 0/1 flag that's always 1 register. */
- if (t2)
+ switch (mode)
{
- arm_output_strex (emit, mode, "", t2, t1, memory);
- operands[0] = t2;
- arm_output_asm_insn (emit, 0, operands, "teq\t%%0, #0");
- arm_output_asm_insn (emit, 0, operands, "bne\t%sLSYT%%=",
- LOCAL_LABEL_PREFIX);
+ case QImode: gen = gen_atomic_compare_and_swapqi_1; break;
+ case HImode: gen = gen_atomic_compare_and_swaphi_1; break;
+ case SImode: gen = gen_atomic_compare_and_swapsi_1; break;
+ case DImode: gen = gen_atomic_compare_and_swapdi_1; break;
+ default:
+ gcc_unreachable ();
}
- else
- {
- /* Use old_value for the return value because for some operations
- the old_value can easily be restored. This saves one register. */
- arm_output_strex (emit, mode, "", old_value_lo, t1, memory);
- operands[0] = old_value_lo;
- arm_output_asm_insn (emit, 0, operands, "teq\t%%0, #0");
- arm_output_asm_insn (emit, 0, operands, "bne\t%sLSYT%%=",
- LOCAL_LABEL_PREFIX);
- /* Note that we only used the _lo half of old_value as a temporary
- so in DI we don't have to restore the _hi part. */
- switch (sync_op)
- {
- case SYNC_OP_ADD:
- arm_output_op3 (emit, "sub", old_value_lo, t1_lo, new_value_lo);
- break;
+ emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
- case SYNC_OP_SUB:
- arm_output_op3 (emit, "add", old_value_lo, t1_lo, new_value_lo);
- break;
+ if (mode == QImode || mode == HImode)
+ emit_move_insn (operands[1], gen_lowpart (mode, rval));
- case SYNC_OP_XOR:
- arm_output_op3 (emit, "eor", old_value_lo, t1_lo, new_value_lo);
- break;
+ /* In all cases, we arrange for success to be signaled by Z set.
+ This arrangement allows for the boolean result to be used directly
+ in a subsequent branch, post optimization. */
+ x = gen_rtx_REG (CCmode, CC_REGNUM);
+ x = gen_rtx_EQ (SImode, x, const0_rtx);
+ emit_insn (gen_rtx_SET (VOIDmode, bval, x));
+}
- case SYNC_OP_NONE:
- arm_output_op2 (emit, "mov", old_value_lo, required_value_lo);
- break;
+/* Split a compare and swap pattern. It is IMPLEMENTATION DEFINED whether
+ another memory store between the load-exclusive and store-exclusive can
+ reset the monitor from Exclusive to Open state. This means we must wait
+ until after reload to split the pattern, lest we get a register spill in
+ the middle of the atomic sequence. */
- default:
- gcc_unreachable ();
- }
+void
+arm_split_compare_and_swap (rtx operands[])
+{
+ rtx rval, mem, oldval, newval, scratch;
+ enum machine_mode mode;
+ enum memmodel mod_s, mod_f;
+ bool is_weak;
+ rtx label1, label2, x, cond;
+
+ rval = operands[0];
+ mem = operands[1];
+ oldval = operands[2];
+ newval = operands[3];
+ is_weak = (operands[4] != const0_rtx);
+ mod_s = (enum memmodel) INTVAL (operands[5]);
+ mod_f = (enum memmodel) INTVAL (operands[6]);
+ scratch = operands[7];
+ mode = GET_MODE (mem);
+
+ arm_pre_atomic_barrier (mod_s);
+
+ label1 = NULL_RTX;
+ if (!is_weak)
+ {
+ label1 = gen_label_rtx ();
+ emit_label (label1);
}
+ label2 = gen_label_rtx ();
- /* Note: label is before barrier so that in cmp failure case we still get
- a barrier to stop subsequent loads floating upwards past the ldrex
- PR target/48126. */
- arm_output_asm_insn (emit, 1, operands, "%sLSYB%%=:", LOCAL_LABEL_PREFIX);
- arm_process_output_memory_barrier (emit, NULL);
-}
+ arm_emit_load_exclusive (mode, rval, mem);
-static rtx
-arm_get_sync_operand (rtx *operands, int index, rtx default_value)
-{
- if (index > 0)
- default_value = operands[index - 1];
+ cond = arm_gen_compare_reg (NE, rval, oldval, scratch);
+ x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
+ x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
+ gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
+ emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
- return default_value;
-}
+ arm_emit_store_exclusive (mode, scratch, mem, newval);
-#define FETCH_SYNC_OPERAND(NAME, DEFAULT) \
- arm_get_sync_operand (operands, (int) get_attr_sync_##NAME (insn), DEFAULT);
+ /* Weak or strong, we want EQ to be true for success, so that we
+ match the flags that we got from the compare above. */
+ cond = gen_rtx_REG (CCmode, CC_REGNUM);
+ x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
+ emit_insn (gen_rtx_SET (VOIDmode, cond, x));
-/* Extract the operands for a synchroniztion instruction from the
- instructions attributes and emit the instruction. */
-static void
-arm_process_output_sync_insn (emit_f emit, rtx insn, rtx *operands)
-{
- rtx result, memory, required_value, new_value, t1, t2;
- int early_barrier;
- enum machine_mode mode;
- enum attr_sync_op sync_op;
+ if (!is_weak)
+ {
+ x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
+ x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
+ gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
+ emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
+ }
- result = FETCH_SYNC_OPERAND(result, 0);
- memory = FETCH_SYNC_OPERAND(memory, 0);
- required_value = FETCH_SYNC_OPERAND(required_value, 0);
- new_value = FETCH_SYNC_OPERAND(new_value, 0);
- t1 = FETCH_SYNC_OPERAND(t1, 0);
- t2 = FETCH_SYNC_OPERAND(t2, 0);
- early_barrier =
- get_attr_sync_release_barrier (insn) == SYNC_RELEASE_BARRIER_YES;
- sync_op = get_attr_sync_op (insn);
- mode = GET_MODE (memory);
+ if (mod_f != MEMMODEL_RELAXED)
+ emit_label (label2);
- arm_output_sync_loop (emit, mode, result, memory, required_value,
- new_value, t1, t2, sync_op, early_barrier);
-}
+ arm_post_atomic_barrier (mod_s);
-/* Emit a synchronization instruction loop. */
-const char *
-arm_output_sync_insn (rtx insn, rtx *operands)
-{
- arm_process_output_sync_insn (arm_emit, insn, operands);
- return "";
+ if (mod_f == MEMMODEL_RELAXED)
+ emit_label (label2);
}
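+
+/* Editorial sketch, not part of the original patch, of the strong
+   (!is_weak) word-sized loop the split above produces, with the
+   barriers elided:
+
+	1:	ldrex	rval, [mem]
+		cmp	rval, oldval
+		bne	2f
+		strex	scratch, newval, [mem]
+		cmp	scratch, #0
+		bne	1b
+	2:  */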
-/* Count the number of machine instruction that will be emitted for a
- synchronization instruction. Note that the emitter used does not
- emit instructions, it just counts instructions being carefull not
- to count labels. */
-unsigned int
-arm_sync_loop_insns (rtx insn, rtx *operands)
+void
+arm_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
+ rtx value, rtx model_rtx, rtx cond)
{
- arm_insn_count = 0;
- arm_process_output_sync_insn (arm_count, insn, operands);
- return arm_insn_count;
-}
+ enum memmodel model = (enum memmodel) INTVAL (model_rtx);
+ enum machine_mode mode = GET_MODE (mem);
+ enum machine_mode wmode = (mode == DImode ? DImode : SImode);
+ rtx label, x;
-/* Helper to call a target sync instruction generator, dealing with
- the variation in operands required by the different generators. */
-static rtx
-arm_call_generator (struct arm_sync_generator *generator, rtx old_value,
- rtx memory, rtx required_value, rtx new_value)
-{
- switch (generator->op)
+ arm_pre_atomic_barrier (model);
+
+ label = gen_label_rtx ();
+ emit_label (label);
+
+ if (new_out)
+ new_out = gen_lowpart (wmode, new_out);
+ if (old_out)
+ old_out = gen_lowpart (wmode, old_out);
+ else
+ old_out = new_out;
+ value = simplify_gen_subreg (wmode, value, mode, 0);
+
+ arm_emit_load_exclusive (mode, old_out, mem);
+
+ switch (code)
{
- case arm_sync_generator_omn:
- gcc_assert (! required_value);
- return generator->u.omn (old_value, memory, new_value);
+ case SET:
+ new_out = value;
+ break;
- case arm_sync_generator_omrn:
- gcc_assert (required_value);
- return generator->u.omrn (old_value, memory, required_value, new_value);
+ case NOT:
+ x = gen_rtx_AND (wmode, old_out, value);
+ emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
+ x = gen_rtx_NOT (wmode, new_out);
+ emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
+ break;
+
+ case MINUS:
+ if (CONST_INT_P (value))
+ {
+ value = GEN_INT (-INTVAL (value));
+ code = PLUS;
+ }
+ /* FALLTHRU */
+
+ case PLUS:
+ if (mode == DImode)
+ {
+ /* DImode plus/minus need to clobber flags. */
+ /* The adddi3 and subdi3 patterns are incorrectly written so that
+ they require matching operands, even when we could easily support
+ three operands. Thankfully, this can be fixed up post-splitting,
+ as the individual add+adc patterns do accept three operands and
+ post-reload cprop can make these moves go away. */
+ emit_move_insn (new_out, old_out);
+ if (code == PLUS)
+ x = gen_adddi3 (new_out, new_out, value);
+ else
+ x = gen_subdi3 (new_out, new_out, value);
+ emit_insn (x);
+ break;
+ }
+ /* FALLTHRU */
+
+ default:
+ x = gen_rtx_fmt_ee (code, wmode, old_out, value);
+ emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
+ break;
}
- return NULL;
+ arm_emit_store_exclusive (mode, cond, mem, gen_lowpart (mode, new_out));
+
+ x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
+ emit_unlikely_jump (gen_cbranchsi4 (x, cond, const0_rtx, label));
+
+ arm_post_atomic_barrier (model);
}
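+
+/* Editorial sketch, not part of the original patch: for a word-sized
+   atomic add the split above boils down to (barriers elided)
+
+	1:	ldrex	old, [mem]
+		add	new, old, value
+		strex	cond, new, [mem]
+		cmp	cond, #0
+		bne	1b  */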
+\f
+#define MAX_VECT_LEN 16
-/* Expand a synchronization loop. The synchronization loop is expanded
- as an opaque block of instructions in order to ensure that we do
- not subsequently get extraneous memory accesses inserted within the
- critical region. The exclusive access property of ldrex/strex is
- only guaranteed in there are no intervening memory accesses. */
-void
-arm_expand_sync (enum machine_mode mode,
- struct arm_sync_generator *generator,
- rtx target, rtx memory, rtx required_value, rtx new_value)
+struct expand_vec_perm_d
{
- if (target == NULL)
- target = gen_reg_rtx (mode);
+ rtx target, op0, op1;
+ unsigned char perm[MAX_VECT_LEN];
+ enum machine_mode vmode;
+ unsigned char nelt;
+ bool one_vector_p;
+ bool testing_p;
+};
- memory = arm_legitimize_sync_memory (memory);
- if (mode != SImode && mode != DImode)
- {
- rtx load_temp = gen_reg_rtx (SImode);
+/* Generate a variable permutation. */
+
+static void
+arm_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
+{
+ enum machine_mode vmode = GET_MODE (target);
+ bool one_vector_p = rtx_equal_p (op0, op1);
- if (required_value)
- required_value = convert_modes (SImode, mode, required_value, true);
+ gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
+ gcc_checking_assert (GET_MODE (op0) == vmode);
+ gcc_checking_assert (GET_MODE (op1) == vmode);
+ gcc_checking_assert (GET_MODE (sel) == vmode);
+ gcc_checking_assert (TARGET_NEON);
- new_value = convert_modes (SImode, mode, new_value, true);
- emit_insn (arm_call_generator (generator, load_temp, memory,
- required_value, new_value));
- emit_move_insn (target, gen_lowpart (mode, load_temp));
+ if (one_vector_p)
+ {
+ if (vmode == V8QImode)
+ emit_insn (gen_neon_vtbl1v8qi (target, op0, sel));
+ else
+ emit_insn (gen_neon_vtbl1v16qi (target, op0, sel));
}
else
{
- emit_insn (arm_call_generator (generator, target, memory, required_value,
- new_value));
+ rtx pair;
+
+ if (vmode == V8QImode)
+ {
+ pair = gen_reg_rtx (V16QImode);
+ emit_insn (gen_neon_vcombinev8qi (pair, op0, op1));
+ pair = gen_lowpart (TImode, pair);
+ emit_insn (gen_neon_vtbl2v8qi (target, pair, sel));
+ }
+ else
+ {
+ pair = gen_reg_rtx (OImode);
+ emit_insn (gen_neon_vcombinev16qi (pair, op0, op1));
+ emit_insn (gen_neon_vtbl2v16qi (target, pair, sel));
+ }
}
}
-static unsigned int
-arm_autovectorize_vector_sizes (void)
+void
+arm_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
{
- return TARGET_NEON_VECTORIZE_DOUBLE ? 0 : (16 | 8);
-}
+ enum machine_mode vmode = GET_MODE (target);
+ unsigned int i, nelt = GET_MODE_NUNITS (vmode);
+ bool one_vector_p = rtx_equal_p (op0, op1);
+ rtx rmask[MAX_VECT_LEN], mask;
-static bool
-arm_vector_alignment_reachable (const_tree type, bool is_packed)
-{
- /* Vectors which aren't in packed structures will not be less aligned than
- the natural alignment of their element type, so this is safe. */
- if (TARGET_NEON && !BYTES_BIG_ENDIAN)
- return !is_packed;
+ /* TODO: ARM's VTBL indexing is little-endian. In order to handle GCC's
+ numbering of elements for big-endian, we must reverse the order. */
+ gcc_checking_assert (!BYTES_BIG_ENDIAN);
- return default_builtin_vector_alignment_reachable (type, is_packed);
+ /* The VTBL instruction does not use a modulo index, so we must take care
+ of that ourselves. */
+ mask = GEN_INT (one_vector_p ? nelt - 1 : 2 * nelt - 1);
+ for (i = 0; i < nelt; ++i)
+ rmask[i] = mask;
+ mask = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rmask));
+ sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
+
+ arm_expand_vec_perm_1 (target, op0, op1, sel);
}
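+
+/* Editorial illustration, not part of the original patch: with a single
+   V8QI input the mask above is 7, so an out-of-range selector byte such
+   as 11 wraps to element 3.  This provides the modulo semantics
+   VEC_PERM_EXPR requires; bare VTBL would instead return 0 for
+   out-of-range indices.  */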
+/* Generate or test for an insn that supports a constant permutation. */
+
+/* Recognize patterns for the VUZP insns. */
+
static bool
-arm_builtin_support_vector_misalignment (enum machine_mode mode,
- const_tree type, int misalignment,
- bool is_packed)
+arm_evpc_neon_vuzp (struct expand_vec_perm_d *d)
{
- if (TARGET_NEON && !BYTES_BIG_ENDIAN)
+ unsigned int i, odd, mask, nelt = d->nelt;
+ rtx out0, out1, in0, in1, x;
+ rtx (*gen)(rtx, rtx, rtx, rtx);
+
+ if (GET_MODE_UNIT_SIZE (d->vmode) >= 8)
+ return false;
+
+ /* Note that these are little-endian tests. Adjust for big-endian later. */
+ if (d->perm[0] == 0)
+ odd = 0;
+ else if (d->perm[0] == 1)
+ odd = 1;
+ else
+ return false;
+ mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
+
+ for (i = 0; i < nelt; i++)
{
- HOST_WIDE_INT align = TYPE_ALIGN_UNIT (type);
+ unsigned elt = (i * 2 + odd) & mask;
+ if (d->perm[i] != elt)
+ return false;
+ }
- if (is_packed)
- return align == 1;
+ /* Success! */
+ if (d->testing_p)
+ return true;
- /* If the misalignment is unknown, we should be able to handle the access
- so long as it is not to a member of a packed data structure. */
- if (misalignment == -1)
- return true;
+ switch (d->vmode)
+ {
+ case V16QImode: gen = gen_neon_vuzpv16qi_internal; break;
+ case V8QImode: gen = gen_neon_vuzpv8qi_internal; break;
+ case V8HImode: gen = gen_neon_vuzpv8hi_internal; break;
+ case V4HImode: gen = gen_neon_vuzpv4hi_internal; break;
+ case V4SImode: gen = gen_neon_vuzpv4si_internal; break;
+ case V2SImode: gen = gen_neon_vuzpv2si_internal; break;
+ case V2SFmode: gen = gen_neon_vuzpv2sf_internal; break;
+ case V4SFmode: gen = gen_neon_vuzpv4sf_internal; break;
+ default:
+ gcc_unreachable ();
+ }
- /* Return true if the misalignment is a multiple of the natural alignment
- of the vector's element type. This is probably always going to be
- true in practice, since we've already established that this isn't a
- packed access. */
- return ((misalignment % align) == 0);
+ in0 = d->op0;
+ in1 = d->op1;
+ if (BYTES_BIG_ENDIAN)
+ {
+ x = in0, in0 = in1, in1 = x;
+ odd = !odd;
}
- return default_builtin_support_vector_misalignment (mode, type, misalignment,
- is_packed);
+ out0 = d->target;
+ out1 = gen_reg_rtx (d->vmode);
+ if (odd)
+ x = out0, out0 = out1, out1 = x;
+
+ emit_insn (gen (out0, in0, in1, out1));
+ return true;
}
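+
+/* Editorial illustration, not part of the original patch: on two V4HI
+   operands the selector {0, 2, 4, 6} passes the check above with
+   odd == 0 and maps to the even-lane VUZP output; {1, 3, 5, 7} selects
+   the odd lanes.  */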
-static void
-arm_conditional_register_usage (void)
+/* Recognize patterns for the VZIP insns. */
+
+static bool
+arm_evpc_neon_vzip (struct expand_vec_perm_d *d)
{
- int regno;
+ unsigned int i, high, mask, nelt = d->nelt;
+ rtx out0, out1, in0, in1, x;
+ rtx (*gen)(rtx, rtx, rtx, rtx);
- if (TARGET_SOFT_FLOAT || TARGET_THUMB1 || !TARGET_FPA)
+ if (GET_MODE_UNIT_SIZE (d->vmode) >= 8)
+ return false;
+
+ /* Note that these are little-endian tests. Adjust for big-endian later. */
+ high = nelt / 2;
+ if (d->perm[0] == high)
+ ;
+ else if (d->perm[0] == 0)
+ high = 0;
+ else
+ return false;
+ mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
+
+ for (i = 0; i < nelt / 2; i++)
{
- for (regno = FIRST_FPA_REGNUM;
- regno <= LAST_FPA_REGNUM; ++regno)
- fixed_regs[regno] = call_used_regs[regno] = 1;
+ unsigned elt = (i + high) & mask;
+ if (d->perm[i * 2] != elt)
+ return false;
+ elt = (elt + nelt) & mask;
+ if (d->perm[i * 2 + 1] != elt)
+ return false;
}
- if (TARGET_THUMB1 && optimize_size)
+ /* Success! */
+ if (d->testing_p)
+ return true;
+
+ switch (d->vmode)
{
- /* When optimizing for size on Thumb-1, it's better not
- to use the HI regs, because of the overhead of
- stacking them. */
- for (regno = FIRST_HI_REGNUM;
- regno <= LAST_HI_REGNUM; ++regno)
- fixed_regs[regno] = call_used_regs[regno] = 1;
+ case V16QImode: gen = gen_neon_vzipv16qi_internal; break;
+ case V8QImode: gen = gen_neon_vzipv8qi_internal; break;
+ case V8HImode: gen = gen_neon_vzipv8hi_internal; break;
+ case V4HImode: gen = gen_neon_vzipv4hi_internal; break;
+ case V4SImode: gen = gen_neon_vzipv4si_internal; break;
+ case V2SImode: gen = gen_neon_vzipv2si_internal; break;
+ case V2SFmode: gen = gen_neon_vzipv2sf_internal; break;
+ case V4SFmode: gen = gen_neon_vzipv4sf_internal; break;
+ default:
+ gcc_unreachable ();
}
- /* The link register can be clobbered by any branch insn,
- but we have no way to track that at present, so mark
- it as unavailable. */
- if (TARGET_THUMB1)
- fixed_regs[LR_REGNUM] = call_used_regs[LR_REGNUM] = 1;
+ in0 = d->op0;
+ in1 = d->op1;
+ if (BYTES_BIG_ENDIAN)
+ {
+ x = in0, in0 = in1, in1 = x;
+ high = !high;
+ }
- if (TARGET_32BIT && TARGET_HARD_FLOAT)
+ out0 = d->target;
+ out1 = gen_reg_rtx (d->vmode);
+ if (high)
+ x = out0, out0 = out1, out1 = x;
+
+ emit_insn (gen (out0, in0, in1, out1));
+ return true;
+}
+
+/* Recognize patterns for the VREV insns. */
+
+static bool
+arm_evpc_neon_vrev (struct expand_vec_perm_d *d)
+{
+ unsigned int i, j, diff, nelt = d->nelt;
+ rtx (*gen)(rtx, rtx, rtx);
+
+ if (!d->one_vector_p)
+ return false;
+
+ diff = d->perm[0];
+ switch (diff)
{
- if (TARGET_MAVERICK)
+ case 7:
+ switch (d->vmode)
{
- for (regno = FIRST_FPA_REGNUM;
- regno <= LAST_FPA_REGNUM; ++ regno)
- fixed_regs[regno] = call_used_regs[regno] = 1;
- for (regno = FIRST_CIRRUS_FP_REGNUM;
- regno <= LAST_CIRRUS_FP_REGNUM; ++ regno)
- {
- fixed_regs[regno] = 0;
- call_used_regs[regno] = regno < FIRST_CIRRUS_FP_REGNUM + 4;
- }
+ case V16QImode: gen = gen_neon_vrev64v16qi; break;
+ case V8QImode: gen = gen_neon_vrev64v8qi; break;
+ default:
+ return false;
}
- if (TARGET_VFP)
+ break;
+ case 3:
+ switch (d->vmode)
{
- /* VFPv3 registers are disabled when earlier VFP
- versions are selected due to the definition of
- LAST_VFP_REGNUM. */
- for (regno = FIRST_VFP_REGNUM;
- regno <= LAST_VFP_REGNUM; ++ regno)
- {
- fixed_regs[regno] = 0;
- call_used_regs[regno] = regno < FIRST_VFP_REGNUM + 16
- || regno >= FIRST_VFP_REGNUM + 32;
- }
+ case V16QImode: gen = gen_neon_vrev32v16qi; break;
+ case V8QImode: gen = gen_neon_vrev32v8qi; break;
+ case V8HImode: gen = gen_neon_vrev64v8hi; break;
+ case V4HImode: gen = gen_neon_vrev64v4hi; break;
+ default:
+ return false;
}
- }
-
- if (TARGET_REALLY_IWMMXT)
- {
- regno = FIRST_IWMMXT_GR_REGNUM;
- /* The 2002/10/09 revision of the XScale ABI has wCG0
- and wCG1 as call-preserved registers. The 2002/11/21
- revision changed this so that all wCG registers are
- scratch registers. */
- for (regno = FIRST_IWMMXT_GR_REGNUM;
- regno <= LAST_IWMMXT_GR_REGNUM; ++ regno)
- fixed_regs[regno] = 0;
- /* The XScale ABI has wR0 - wR9 as scratch registers,
- the rest as call-preserved registers. */
- for (regno = FIRST_IWMMXT_REGNUM;
- regno <= LAST_IWMMXT_REGNUM; ++ regno)
- {
- fixed_regs[regno] = 0;
- call_used_regs[regno] = regno < FIRST_IWMMXT_REGNUM + 10;
+ break;
+ case 1:
+ switch (d->vmode)
+ {
+ case V16QImode: gen = gen_neon_vrev16v16qi; break;
+ case V8QImode: gen = gen_neon_vrev16v8qi; break;
+ case V8HImode: gen = gen_neon_vrev32v8hi; break;
+ case V4HImode: gen = gen_neon_vrev32v4hi; break;
+ case V4SImode: gen = gen_neon_vrev64v4si; break;
+ case V2SImode: gen = gen_neon_vrev64v2si; break;
+ case V4SFmode: gen = gen_neon_vrev64v4sf; break;
+ case V2SFmode: gen = gen_neon_vrev64v2sf; break;
+ default:
+ return false;
}
+ break;
+ default:
+ return false;
}
- if ((unsigned) PIC_OFFSET_TABLE_REGNUM != INVALID_REGNUM)
+  for (i = 0; i < nelt; i += (diff + 1))
+ for (j = 0; j <= diff; j += 1)
+ if (d->perm[i + j] != i + diff - j)
+ return false;
+
+ /* Success! */
+ if (d->testing_p)
+ return true;
+
+ /* ??? The third operand is an artifact of the builtin infrastructure
+ and is ignored by the actual instruction. */
+ emit_insn (gen (d->target, d->op0, const0_rtx));
+ return true;
+}
+
+/* Recognize patterns for the VTRN insns. */
+
+static bool
+arm_evpc_neon_vtrn (struct expand_vec_perm_d *d)
+{
+ unsigned int i, odd, mask, nelt = d->nelt;
+ rtx out0, out1, in0, in1, x;
+ rtx (*gen)(rtx, rtx, rtx, rtx);
+
+ if (GET_MODE_UNIT_SIZE (d->vmode) >= 8)
+ return false;
+
+ /* Note that these are little-endian tests. Adjust for big-endian later. */
+ if (d->perm[0] == 0)
+ odd = 0;
+ else if (d->perm[0] == 1)
+ odd = 1;
+ else
+ return false;
+ mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
+
+ for (i = 0; i < nelt; i += 2)
{
- fixed_regs[PIC_OFFSET_TABLE_REGNUM] = 1;
- call_used_regs[PIC_OFFSET_TABLE_REGNUM] = 1;
+ if (d->perm[i] != i + odd)
+ return false;
+ if (d->perm[i + 1] != ((i + nelt + odd) & mask))
+ return false;
}
- else if (TARGET_APCS_STACK)
+
+ /* Success! */
+ if (d->testing_p)
+ return true;
+
+ switch (d->vmode)
{
- fixed_regs[10] = 1;
- call_used_regs[10] = 1;
+ case V16QImode: gen = gen_neon_vtrnv16qi_internal; break;
+ case V8QImode: gen = gen_neon_vtrnv8qi_internal; break;
+ case V8HImode: gen = gen_neon_vtrnv8hi_internal; break;
+ case V4HImode: gen = gen_neon_vtrnv4hi_internal; break;
+ case V4SImode: gen = gen_neon_vtrnv4si_internal; break;
+ case V2SImode: gen = gen_neon_vtrnv2si_internal; break;
+ case V2SFmode: gen = gen_neon_vtrnv2sf_internal; break;
+ case V4SFmode: gen = gen_neon_vtrnv4sf_internal; break;
+ default:
+ gcc_unreachable ();
}
- /* -mcaller-super-interworking reserves r11 for calls to
- _interwork_r11_call_via_rN(). Making the register global
- is an easy way of ensuring that it remains valid for all
- calls. */
- if (TARGET_APCS_FRAME || TARGET_CALLER_INTERWORKING
- || TARGET_TPCS_FRAME || TARGET_TPCS_LEAF_FRAME)
+
+ in0 = d->op0;
+ in1 = d->op1;
+ if (BYTES_BIG_ENDIAN)
{
- fixed_regs[ARM_HARD_FRAME_POINTER_REGNUM] = 1;
- call_used_regs[ARM_HARD_FRAME_POINTER_REGNUM] = 1;
- if (TARGET_CALLER_INTERWORKING)
- global_regs[ARM_HARD_FRAME_POINTER_REGNUM] = 1;
+ x = in0, in0 = in1, in1 = x;
+ odd = !odd;
}
- SUBTARGET_CONDITIONAL_REGISTER_USAGE
+
+ out0 = d->target;
+ out1 = gen_reg_rtx (d->vmode);
+ if (odd)
+ x = out0, out0 = out1, out1 = x;
+
+ emit_insn (gen (out0, in0, in1, out1));
+ return true;
}
-static reg_class_t
-arm_preferred_rename_class (reg_class_t rclass)
+/* The NEON VTBL instruction is a fully variable permutation that's even
+ stronger than what we expose via VEC_PERM_EXPR. What it doesn't do
+ is mask the index operand as VEC_PERM_EXPR requires. Therefore we
+ can do slightly better by expanding this as a constant where we don't
+ have to apply a mask. */
+
+static bool
+arm_evpc_neon_vtbl (struct expand_vec_perm_d *d)
{
- /* Thumb-2 instructions using LO_REGS may be smaller than instructions
- using GENERIC_REGS. During register rename pass, we prefer LO_REGS,
- and code size can be reduced. */
- if (TARGET_THUMB2 && rclass == GENERAL_REGS)
- return LO_REGS;
- else
- return NO_REGS;
+ rtx rperm[MAX_VECT_LEN], sel;
+ enum machine_mode vmode = d->vmode;
+ unsigned int i, nelt = d->nelt;
+
+ /* TODO: ARM's VTBL indexing is little-endian. In order to handle GCC's
+ numbering of elements for big-endian, we must reverse the order. */
+ if (BYTES_BIG_ENDIAN)
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+ /* Generic code will try constant permutation twice. Once with the
+ original mode and again with the elements lowered to QImode.
+ So wait and don't do the selector expansion ourselves. */
+ if (vmode != V8QImode && vmode != V16QImode)
+ return false;
+
+ for (i = 0; i < nelt; ++i)
+ rperm[i] = GEN_INT (d->perm[i]);
+ sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
+ sel = force_reg (vmode, sel);
+
+ arm_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
+ return true;
}
-/* Compute the atrribute "length" of insn "*push_multi".
- So this function MUST be kept in sync with that insn pattern. */
-int
-arm_attr_length_push_multi(rtx parallel_op, rtx first_op)
+static bool
+arm_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
- int i, regno, hi_reg;
- int num_saves = XVECLEN (parallel_op, 0);
+ /* The pattern matching functions above are written to look for a small
+ number to begin the sequence (0, 1, N/2). If we begin with an index
+ from the second operand, we can swap the operands. */
+ if (d->perm[0] >= d->nelt)
+ {
+ unsigned i, nelt = d->nelt;
+ rtx x;
- /* ARM mode. */
- if (TARGET_ARM)
- return 4;
- /* Thumb1 mode. */
- if (TARGET_THUMB1)
- return 2;
+ for (i = 0; i < nelt; ++i)
+ d->perm[i] = (d->perm[i] + nelt) & (2 * nelt - 1);
- /* Thumb2 mode. */
- regno = REGNO (first_op);
- hi_reg = (REGNO_REG_CLASS (regno) == HI_REGS) && (regno != LR_REGNUM);
- for (i = 1; i < num_saves && !hi_reg; i++)
+ x = d->op0;
+ d->op0 = d->op1;
+ d->op1 = x;
+ }
+
+ if (TARGET_NEON)
{
- regno = REGNO (XEXP (XVECEXP (parallel_op, 0, i), 0));
- hi_reg |= (REGNO_REG_CLASS (regno) == HI_REGS) && (regno != LR_REGNUM);
+ if (arm_evpc_neon_vuzp (d))
+ return true;
+ if (arm_evpc_neon_vzip (d))
+ return true;
+ if (arm_evpc_neon_vrev (d))
+ return true;
+ if (arm_evpc_neon_vtrn (d))
+ return true;
+ return arm_evpc_neon_vtbl (d);
}
+ return false;
+}
- if (!hi_reg)
- return 2;
- return 4;
+/* Expand a vec_perm_const pattern. */
+
+bool
+arm_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
+{
+ struct expand_vec_perm_d d;
+ int i, nelt, which;
+
+ d.target = target;
+ d.op0 = op0;
+ d.op1 = op1;
+
+ d.vmode = GET_MODE (target);
+ gcc_assert (VECTOR_MODE_P (d.vmode));
+ d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
+ d.testing_p = false;
+
+ for (i = which = 0; i < nelt; ++i)
+ {
+ rtx e = XVECEXP (sel, 0, i);
+ int ei = INTVAL (e) & (2 * nelt - 1);
+ which |= (ei < nelt ? 1 : 2);
+ d.perm[i] = ei;
+ }
+
+ switch (which)
+ {
+ default:
+      gcc_unreachable ();
+
+ case 3:
+ d.one_vector_p = false;
+ if (!rtx_equal_p (op0, op1))
+ break;
+
+ /* The elements of PERM do not suggest that only the first operand
+ is used, but both operands are identical. Allow easier matching
+ of the permutation by folding the permutation into the single
+ input vector. */
+ /* FALLTHRU */
+ case 2:
+ for (i = 0; i < nelt; ++i)
+ d.perm[i] &= nelt - 1;
+ d.op0 = op1;
+ d.one_vector_p = true;
+ break;
+
+ case 1:
+ d.op1 = op0;
+ d.one_vector_p = true;
+ break;
+ }
+
+ return arm_expand_vec_perm_const_1 (&d);
}
-/* Compute the number of instructions emitted by output_move_double. */
-int
-arm_count_output_move_double_insns (rtx *operands)
+/* Implement TARGET_VECTORIZE_VEC_PERM_CONST_OK. */
+
+static bool
+arm_vectorize_vec_perm_const_ok (enum machine_mode vmode,
+ const unsigned char *sel)
{
- int count;
- output_move_double (operands, false, &count);
- return count;
+ struct expand_vec_perm_d d;
+ unsigned int i, nelt, which;
+ bool ret;
+
+ d.vmode = vmode;
+ d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
+ d.testing_p = true;
+ memcpy (d.perm, sel, nelt);
+
+ /* Categorize the set of elements in the selector. */
+ for (i = which = 0; i < nelt; ++i)
+ {
+ unsigned char e = d.perm[i];
+ gcc_assert (e < 2 * nelt);
+ which |= (e < nelt ? 1 : 2);
+ }
+
+  /* If all elements are from the second vector, fold them back onto the
+     first.  */
+ if (which == 2)
+ for (i = 0; i < nelt; ++i)
+ d.perm[i] -= nelt;
+
+ /* Check whether the mask can be applied to the vector type. */
+ d.one_vector_p = (which != 3);
+
+ d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
+ d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
+ if (!d.one_vector_p)
+ d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
+
+ start_sequence ();
+ ret = arm_expand_vec_perm_const_1 (&d);
+ end_sequence ();
+
+ return ret;
}
+\f
#include "gt-arm.h"