X-Git-Url: http://git.sourceforge.jp/view?p=pf3gnuchains%2Fgcc-fork.git;a=blobdiff_plain;f=gcc%2Fconfig%2Fi386%2Fi386.c;h=8e0185623a38b3dce0bc96031ec6dcd1703de44c;hp=ca62b229089a70ef61b41f195a495dd6717394e1;hb=e8b33739f8ce01ae1e8733d3f19ac5501d297edd;hpb=63f5ad449bbe0a4d478ae9412461e204533a6206 diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index ca62b229089..8e0185623a3 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -1,6 +1,6 @@ /* Subroutines used for code generation on IA-32. - Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000, - 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011 + Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, + 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc. This file is part of GCC. @@ -47,6 +47,7 @@ along with GCC; see the file COPYING3. If not see #include "target-def.h" #include "common/common-target.h" #include "langhooks.h" +#include "reload.h" #include "cgraph.h" #include "gimple.h" #include "dwarf2.h" @@ -1622,7 +1623,7 @@ struct processor_costs nocona_cost = { 8, /* MMX or SSE register to integer */ 8, /* size of l1 cache. */ 1024, /* size of l2 cache. */ - 128, /* size of prefetch block */ + 64, /* size of prefetch block */ 8, /* number of parallel prefetches */ 1, /* Branch cost */ COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */ @@ -2185,7 +2186,7 @@ unsigned char ix86_arch_features[X86_ARCH_LAST]; /* Feature tests against the various architecture variations, used to create ix86_arch_features based on the processor mask. */ static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = { - /* X86_ARCH_CMOVE: Conditional move was added for pentiumpro. */ + /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */ ~(m_386 | m_486 | m_PENT | m_K6), /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */ @@ -2427,9 +2428,12 @@ enum processor_type ix86_tune; /* Which instruction set architecture to use. */ enum processor_type ix86_arch; -/* true if sse prefetch instruction is not NOOP. */ +/* True if processor has SSE prefetch instruction. */ int x86_prefetch_sse; +/* True if processor has prefetchw instruction. 
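
   As an illustration (an aside, not part of the patch): once
   x86_prefetchw is set for a target, a write prefetch written with
   the GCC builtin below is the kind of code that can pick the AMD
   PREFETCHW instruction instead of an SSE PREFETCHT* form.

     void warm_for_write (int *p)
     {
       __builtin_prefetch (p, 1, 3);   // rw = 1 (write), high locality
     }
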
*/ +int x86_prefetchw; + /* -mstackrealign option */ static const char ix86_force_align_arg_pointer_string[] = "force_align_arg_pointer"; @@ -2930,6 +2934,8 @@ ix86_option_override_internal (bool main_args_p) #define PTA_XOP (HOST_WIDE_INT_1 << 29) #define PTA_AVX2 (HOST_WIDE_INT_1 << 30) #define PTA_BMI2 (HOST_WIDE_INT_1 << 31) +#define PTA_PREFETCHW (HOST_WIDE_INT_1 << 32) + /* if this reaches 64, need to widen struct pta flags below */ static struct pta @@ -2973,7 +2979,7 @@ ix86_option_override_internal (bool main_args_p) | PTA_SSSE3 | PTA_CX16}, {"corei7", PROCESSOR_COREI7_64, CPU_COREI7, PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 - | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16}, + | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_POPCNT}, {"corei7-avx", PROCESSOR_COREI7_64, CPU_COREI7, PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX @@ -2988,12 +2994,12 @@ ix86_option_override_internal (bool main_args_p) | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT - | PTA_FMA | PTA_MOVBE}, + | PTA_FMA | PTA_MOVBE}, {"atom", PROCESSOR_ATOM, CPU_ATOM, PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE}, {"geode", PROCESSOR_GEODE, CPU_GEODE, - PTA_MMX | PTA_3DNOW | PTA_3DNOW_A |PTA_PREFETCH_SSE}, + PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE}, {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX}, {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW}, {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW}, @@ -3019,7 +3025,7 @@ ix86_option_override_internal (bool main_args_p) PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF}, {"opteron-sse3", PROCESSOR_K8, CPU_K8, - PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE + PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF}, {"athlon64", PROCESSOR_K8, CPU_K8, PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE @@ -3037,19 +3043,19 @@ ix86_option_override_internal (bool main_args_p) PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM}, {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1, - PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 - | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1 - | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4 - | PTA_XOP | PTA_LWP}, + PTA_64BIT | PTA_MMX | PTA_PREFETCHW | PTA_SSE | PTA_SSE2 + | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 + | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX + | PTA_FMA4 | PTA_XOP | PTA_LWP}, {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2, - PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 - | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1 - | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX - | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C + PTA_64BIT | PTA_MMX | PTA_PREFETCHW | PTA_SSE | PTA_SSE2 + | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 + | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX + | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C | PTA_FMA}, {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64, - PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 - | PTA_SSSE3 | PTA_SSE4A |PTA_ABM | PTA_CX16}, + PTA_64BIT | PTA_MMX | PTA_PREFETCHW | PTA_SSE | PTA_SSE2 + | PTA_SSE3 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16}, {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO, 0 /* flags 
are only used for -march switch. */ }, {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64, @@ -3205,7 +3211,7 @@ ix86_option_override_internal (bool main_args_p) "large", "32"); else if (TARGET_X32) error ("code model %qs not supported in x32 mode", - "medium"); + "large"); break; case CM_32: @@ -3357,6 +3363,8 @@ ix86_option_override_internal (bool main_args_p) ix86_isa_flags |= OPTION_MASK_ISA_F16C; if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)) x86_prefetch_sse = true; + if (processor_alias_table[i].flags & PTA_PREFETCHW) + x86_prefetchw = true; break; } @@ -3423,7 +3431,7 @@ ix86_option_override_internal (bool main_args_p) -mtune (rather than -march) points us to a processor that has them. However, the VIA C3 gives a SIGILL, so we only do that for i686 and higher processors. */ - if (TARGET_CMOVE + if (TARGET_CMOV && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))) x86_prefetch_sse = true; break; @@ -3449,8 +3457,6 @@ ix86_option_override_internal (bool main_args_p) in case they weren't overwritten by command line options. */ if (TARGET_64BIT) { - if (optimize > 1 && !global_options_set.x_flag_zee) - flag_zee = 1; if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer) flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER; if (flag_asynchronous_unwind_tables == 2) @@ -3701,12 +3707,6 @@ ix86_option_override_internal (bool main_args_p) target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS; } - /* For sane SSE instruction set generation we need fcomi instruction. - It is safe to enable all CMOVE instructions. Also, RDRAND intrinsic - expands to a sequence that includes conditional move. */ - if (TARGET_SSE || TARGET_RDRND) - TARGET_CMOVE = 1; - /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */ { char *p; @@ -5028,6 +5028,40 @@ ix86_handle_cconv_attribute (tree *node, tree name, return NULL_TREE; } +/* The transactional memory builtins are implicitly regparm or fastcall + depending on the ABI. Override the generic do-nothing attribute that + these builtins were declared with, and replace it with one of the two + attributes that we expect elsewhere. */ + +static tree +ix86_handle_tm_regparm_attribute (tree *node, tree name ATTRIBUTE_UNUSED, + tree args ATTRIBUTE_UNUSED, + int flags ATTRIBUTE_UNUSED, + bool *no_add_attrs) +{ + tree alt; + + /* In no case do we want to add the placeholder attribute. */ + *no_add_attrs = true; + + /* The 64-bit ABI is unchanged for transactional memory. */ + if (TARGET_64BIT) + return NULL_TREE; + + /* ??? Is there a better way to validate 32-bit windows? We have + cfun->machine->call_abi, but that seems to be set only for 64-bit. */ + if (CHECK_STACK_LIMIT > 0) + alt = tree_cons (get_identifier ("fastcall"), NULL, NULL); + else + { + alt = tree_cons (NULL, build_int_cst (NULL, 2), NULL); + alt = tree_cons (get_identifier ("regparm"), alt, NULL); + } + decl_attributes (node, alt, flags); + + return NULL_TREE; +} + /* This function determines from TYPE the calling-convention. */ unsigned int @@ -5558,7 +5592,10 @@ init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */ { /* The return value of this function uses 256bit AVX modes. 
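
   For context, a source-level sketch of the situation being tracked
   (callee here is hypothetical; build with -mavx): a caller that
   receives a 256-bit value in %ymm0 must not see a vzeroupper between
   the call and the use of that value, which is why the flag is now
   recorded in CUM and copied back to cfun->machine at the call site.

     #include <immintrin.h>
     extern __m256 callee (void);

     float use_result (void)
     {
       __m256 v = callee ();          // returned in %ymm0
       return _mm256_cvtss_f32 (v);   // upper half still live here
     }
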
*/ if (caller) - cfun->machine->callee_return_avx256_p = true; + { + cfun->machine->callee_return_avx256_p = true; + cum->callee_return_avx256_p = true; + } else cfun->machine->caller_return_avx256_p = true; } @@ -6266,7 +6303,7 @@ construct_container (enum machine_mode mode, enum machine_mode orig_mode, /* Likewise, error if the ABI requires us to return values in the x87 registers and the user specified -mno-80387. */ - if (!TARGET_80387 && in_return) + if (!TARGET_FLOAT_RETURNS_IN_80387 && in_return) for (i = 0; i < n; i++) if (regclass[i] == X86_64_X87_CLASS || regclass[i] == X86_64_X87UP_CLASS @@ -6829,11 +6866,20 @@ ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode, { /* This argument uses 256bit AVX modes. */ if (cum->caller) - cfun->machine->callee_pass_avx256_p = true; + cum->callee_pass_avx256_p = true; else cfun->machine->caller_pass_avx256_p = true; } + if (cum->caller && mode == VOIDmode) + { + /* This function is called with MODE == VOIDmode immediately + before the call instruction is emitted. We copy callee 256bit + AVX info from the current CUM here. */ + cfun->machine->callee_return_avx256_p = cum->callee_return_avx256_p; + cfun->machine->callee_pass_avx256_p = cum->callee_pass_avx256_p; + } + return arg; } @@ -7083,9 +7129,15 @@ ix86_function_value_regno_p (const unsigned int regno) switch (regno) { case AX_REG: + case DX_REG: return true; + case DI_REG: + case SI_REG: + return TARGET_64BIT && ix86_abi != MS_ABI; - case FIRST_FLOAT_REG: + /* Complex values are returned in %st(0)/%st(1) pair. */ + case ST0_REG: + case ST1_REG: /* TODO: The function should depend on current function ABI but builtins.c would need updating then. Therefore we use the default ABI. */ @@ -7093,10 +7145,12 @@ ix86_function_value_regno_p (const unsigned int regno) return false; return TARGET_FLOAT_RETURNS_IN_80387; - case FIRST_SSE_REG: + /* Complex values are returned in %xmm0/%xmm1 pair. */ + case XMM0_REG: + case XMM1_REG: return TARGET_SSE; - case FIRST_MMX_REG: + case MM0_REG: if (TARGET_MACHO || TARGET_64BIT) return false; return TARGET_MMX; @@ -8394,6 +8448,11 @@ ix86_frame_pointer_required (void) if (TARGET_32BIT_MS_ABI && cfun->calls_setjmp) return true; + /* Win64 SEH, very large frames need a frame-pointer as maximum stack + allocation is 4GB. */ + if (TARGET_64BIT_MS_ABI && get_frame_size () > SEH_MAX_FRAME_SIZE) + return true; + /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER turns off the frame pointer by default. Turn it back on now if we've not got a leaf function. */ @@ -8562,17 +8621,12 @@ output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED) if (!flag_pic) { - xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ()); + if (TARGET_MACHO) + /* We don't need a pic base, we're not producing pic. */ + gcc_unreachable (); + xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ()); output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops); - -#if TARGET_MACHO - /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This - is what will be referenced by the Mach-O PIC subsystem. 
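
   Roughly, the pic base sequence referenced here looks like this
   (illustrative sketch; the label number is arbitrary):

     call  L1$pb
   L1$pb:
     popl  %ebx        # %ebx now holds the address of L1$pb
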
*/ - if (!label) - ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME); -#endif - targetm.asm_out.internal_label (asm_out_file, "L", CODE_LABEL_NUMBER (XEXP (xops[2], 0))); } @@ -8585,12 +8639,18 @@ output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED) xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name)); xops[2] = gen_rtx_MEM (QImode, xops[2]); output_asm_insn ("call\t%X2", xops); - /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This - is what will be referenced by the Mach-O PIC subsystem. */ + #if TARGET_MACHO - if (!label) + /* Output the Mach-O "canonical" pic base label name ("Lxx$pb") here. + This is what will be referenced by the Mach-O PIC subsystem. */ + if (machopic_should_output_picbase_label () || !label) ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME); - else + + /* When we are restoring the pic base at the site of a nonlocal label, + and we decided to emit the pic base above, we will still output a + local label used for calculating the correction offset (even though + the offset will be 0 in that case). */ + if (label) targetm.asm_out.internal_label (asm_out_file, "L", CODE_LABEL_NUMBER (label)); #endif @@ -8666,7 +8726,8 @@ ix86_save_reg (unsigned int regno, bool maybe_eh_return) && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM) || crtl->profile || crtl->calls_eh_return - || crtl->uses_const_pool)) + || crtl->uses_const_pool + || cfun->has_nonlocal_label)) return ix86_select_alt_pic_regnum () == INVALID_REGNUM; if (crtl->calls_eh_return && maybe_eh_return) @@ -8880,6 +8941,11 @@ ix86_compute_frame_layout (struct ix86_frame *frame) offset += frame->nregs * UNITS_PER_WORD; frame->reg_save_offset = offset; + /* On SEH target, registers are pushed just before the frame pointer + location. */ + if (TARGET_SEH) + frame->hard_frame_pointer_offset = offset; + /* Align and set SSE register save area. */ if (frame->nsseregs) { @@ -8971,9 +9037,12 @@ ix86_compute_frame_layout (struct ix86_frame *frame) { HOST_WIDE_INT diff; - /* If we can leave the frame pointer where it is, do so. */ + /* If we can leave the frame pointer where it is, do so. Also, returns + the establisher frame for __builtin_frame_address (0). */ diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset; - if (diff > 240 || (diff & 15) != 0) + if (diff <= SEH_MAX_FRAME_SIZE + && (diff > 240 || (diff & 15) != 0) + && !crtl->accesses_prior_frames) { /* Ideally we'd determine what portion of the local stack frame (within the constraint of the lowest 240) is most heavily used. @@ -9511,6 +9580,8 @@ get_scratch_register_on_entry (struct scratch_reg *sr) tree decl = current_function_decl, fntype = TREE_TYPE (decl); bool fastcall_p = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE; + bool thiscall_p + = lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE; bool static_chain_p = DECL_STATIC_CHAIN (decl); int regparm = ix86_function_regparm (fntype, decl); int drap_regno @@ -9521,10 +9592,15 @@ get_scratch_register_on_entry (struct scratch_reg *sr) if ((regparm < 1 || (fastcall_p && !static_chain_p)) && drap_regno != AX_REG) regno = AX_REG; - else if (regparm < 2 && drap_regno != DX_REG) + /* 'thiscall' sets regparm to 1, uses ecx for arguments and edx + for the static chain register. */ + else if (thiscall_p && !static_chain_p && drap_regno != AX_REG) + regno = AX_REG; + else if (regparm < 2 && !thiscall_p && drap_regno != DX_REG) regno = DX_REG; /* ecx is the static chain register. 
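
   A declaration illustrating the convention handled here (the names
   are hypothetical): with thiscall, %ecx carries the first argument
   and %edx the static chain, leaving %eax as the safe scratch.

     struct S;
     int __attribute__ ((thiscall)) method (struct S *self, int x);
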
*/ - else if (regparm < 3 && !fastcall_p && !static_chain_p + else if (regparm < 3 && !fastcall_p && !thiscall_p + && !static_chain_p && drap_regno != CX_REG) regno = CX_REG; else if (ix86_save_reg (BX_REG, true)) @@ -9557,6 +9633,7 @@ release_scratch_register_on_entry (struct scratch_reg *sr) { if (sr->saved) { + struct machine_function *m = cfun->machine; rtx x, insn = emit_insn (gen_pop (sr->reg)); /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */ @@ -9564,6 +9641,7 @@ release_scratch_register_on_entry (struct scratch_reg *sr) x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD)); x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x); add_reg_note (insn, REG_FRAME_RELATED_EXPR, x); + m->fs.sp_offset -= UNITS_PER_WORD; } } @@ -9894,12 +9972,68 @@ ix86_finalize_stack_realign_flags (void) /* After stack_realign_needed is finalized, we can't no longer change it. */ gcc_assert (crtl->stack_realign_needed == stack_realign); + return; } - else - { - crtl->stack_realign_needed = stack_realign; - crtl->stack_realign_finalized = true; + + /* If the only reason for frame_pointer_needed is that we conservatively + assumed stack realignment might be needed, but in the end nothing that + needed the stack alignment had been spilled, clear frame_pointer_needed + and say we don't need stack realignment. */ + if (stack_realign + && !crtl->need_drap + && frame_pointer_needed + && current_function_is_leaf + && flag_omit_frame_pointer + && current_function_sp_is_unchanging + && !ix86_current_function_calls_tls_descriptor + && !crtl->accesses_prior_frames + && !cfun->calls_alloca + && !crtl->calls_eh_return + && !(flag_stack_check && STACK_CHECK_MOVING_SP) + && !ix86_frame_pointer_required () + && get_frame_size () == 0 + && ix86_nsaved_sseregs () == 0 + && ix86_varargs_gpr_size + ix86_varargs_fpr_size == 0) + { + HARD_REG_SET set_up_by_prologue, prologue_used; + basic_block bb; + + CLEAR_HARD_REG_SET (prologue_used); + CLEAR_HARD_REG_SET (set_up_by_prologue); + add_to_hard_reg_set (&set_up_by_prologue, Pmode, STACK_POINTER_REGNUM); + add_to_hard_reg_set (&set_up_by_prologue, Pmode, ARG_POINTER_REGNUM); + add_to_hard_reg_set (&set_up_by_prologue, Pmode, + HARD_FRAME_POINTER_REGNUM); + FOR_EACH_BB (bb) + { + rtx insn; + FOR_BB_INSNS (bb, insn) + if (NONDEBUG_INSN_P (insn) + && requires_stack_frame_p (insn, prologue_used, + set_up_by_prologue)) + { + crtl->stack_realign_needed = stack_realign; + crtl->stack_realign_finalized = true; + return; + } + } + + frame_pointer_needed = false; + stack_realign = false; + crtl->max_used_stack_slot_alignment = incoming_stack_boundary; + crtl->stack_alignment_needed = incoming_stack_boundary; + crtl->stack_alignment_estimated = incoming_stack_boundary; + if (crtl->preferred_stack_boundary > incoming_stack_boundary) + crtl->preferred_stack_boundary = incoming_stack_boundary; + df_finish_pass (true); + df_scan_alloc (NULL); + df_scan_blocks (); + df_compute_regs_ever_live (true); + df_analyze (); } + + crtl->stack_realign_needed = stack_realign; + crtl->stack_realign_finalized = true; } /* Expand the prologue into a bunch of separate insns. 
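
   (An aside on the early-out added just above: the kind of function
   it helps is a leaf that spills nothing and makes no calls, e.g.

     int add2 (int a, int b)
     {
       return a + b;
     }

   which can now drop both the frame pointer and the conservatively
   assumed stack realignment.)
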
*/ @@ -9913,6 +10047,7 @@ ix86_expand_prologue (void) struct ix86_frame frame; HOST_WIDE_INT allocate; bool int_registers_saved; + bool sse_registers_saved; ix86_finalize_stack_realign_flags (); @@ -10065,6 +10200,9 @@ ix86_expand_prologue (void) m->fs.realigned = true; } + int_registers_saved = (frame.nregs == 0); + sse_registers_saved = (frame.nsseregs == 0); + if (frame_pointer_needed && !m->fs.fp_valid) { /* Note: AT&T enter does NOT have reversed args. Enter is probably @@ -10072,6 +10210,17 @@ ix86_expand_prologue (void) insn = emit_insn (gen_push (hard_frame_pointer_rtx)); RTX_FRAME_RELATED_P (insn) = 1; + /* Push registers now, before setting the frame pointer + on SEH target. */ + if (!int_registers_saved + && TARGET_SEH + && !frame.save_regs_using_mov) + { + ix86_emit_save_regs (); + int_registers_saved = true; + gcc_assert (m->fs.sp_offset == frame.reg_save_offset); + } + if (m->fs.sp_offset == frame.hard_frame_pointer_offset) { insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx); @@ -10084,8 +10233,6 @@ ix86_expand_prologue (void) } } - int_registers_saved = (frame.nregs == 0); - if (!int_registers_saved) { /* If saving registers via PUSH, do so now. */ @@ -10162,6 +10309,27 @@ ix86_expand_prologue (void) current_function_static_stack_size = stack_size; } + /* On SEH target with very large frame size, allocate an area to save + SSE registers (as the very large allocation won't be described). */ + if (TARGET_SEH + && frame.stack_pointer_offset > SEH_MAX_FRAME_SIZE + && !sse_registers_saved) + { + HOST_WIDE_INT sse_size = + frame.sse_reg_save_offset - frame.reg_save_offset; + + gcc_assert (int_registers_saved); + + /* No need to do stack checking as the area will be immediately + written. */ + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (-sse_size), -1, + m->fs.cfa_reg == stack_pointer_rtx); + allocate -= sse_size; + ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset); + sse_registers_saved = true; + } + /* The stack has already been decremented by the instruction calling us so probe if the size is non-negative to preserve the protection area. */ if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK) @@ -10202,7 +10370,7 @@ ix86_expand_prologue (void) rtx eax = gen_rtx_REG (Pmode, AX_REG); rtx r10 = NULL; rtx (*adjust_stack_insn)(rtx, rtx, rtx); - + const bool sp_is_cfa_reg = (m->fs.cfa_reg == stack_pointer_rtx); bool eax_live = false; bool r10_live = false; @@ -10211,16 +10379,31 @@ ix86_expand_prologue (void) if (!TARGET_64BIT_MS_ABI) eax_live = ix86_eax_live_at_start_p (); + /* Note that SEH directives need to continue tracking the stack + pointer even after the frame pointer has been set up. 
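
   A sketch of a function that reaches this probed-allocation path
   (sink is a hypothetical consumer; the exact threshold depends on
   -fstack-check and the ABI):

     extern void sink (char *);

     void big_frame (void)
     {
       char buf[1024 * 1024];
       sink (buf);
     }
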
*/ if (eax_live) { - emit_insn (gen_push (eax)); + insn = emit_insn (gen_push (eax)); allocate -= UNITS_PER_WORD; + if (sp_is_cfa_reg || TARGET_SEH) + { + if (sp_is_cfa_reg) + m->fs.cfa_offset += UNITS_PER_WORD; + RTX_FRAME_RELATED_P (insn) = 1; + } } + if (r10_live) { r10 = gen_rtx_REG (Pmode, R10_REG); - emit_insn (gen_push (r10)); + insn = emit_insn (gen_push (r10)); allocate -= UNITS_PER_WORD; + if (sp_is_cfa_reg || TARGET_SEH) + { + if (sp_is_cfa_reg) + m->fs.cfa_offset += UNITS_PER_WORD; + RTX_FRAME_RELATED_P (insn) = 1; + } } emit_move_insn (eax, GEN_INT (allocate)); @@ -10234,13 +10417,10 @@ ix86_expand_prologue (void) insn = emit_insn (adjust_stack_insn (stack_pointer_rtx, stack_pointer_rtx, eax)); - /* Note that SEH directives need to continue tracking the stack - pointer even after the frame pointer has been set up. */ - if (m->fs.cfa_reg == stack_pointer_rtx || TARGET_SEH) + if (sp_is_cfa_reg || TARGET_SEH) { - if (m->fs.cfa_reg == stack_pointer_rtx) + if (sp_is_cfa_reg) m->fs.cfa_offset += allocate; - RTX_FRAME_RELATED_P (insn) = 1; add_reg_note (insn, REG_FRAME_RELATED_EXPR, gen_rtx_SET (VOIDmode, stack_pointer_rtx, @@ -10251,14 +10431,15 @@ ix86_expand_prologue (void) if (r10_live && eax_live) { - t = choose_baseaddr (m->fs.sp_offset - allocate); + t = plus_constant (stack_pointer_rtx, allocate); emit_move_insn (r10, gen_frame_mem (Pmode, t)); - t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD); + t = plus_constant (stack_pointer_rtx, + allocate - UNITS_PER_WORD); emit_move_insn (eax, gen_frame_mem (Pmode, t)); } else if (eax_live || r10_live) { - t = choose_baseaddr (m->fs.sp_offset - allocate); + t = plus_constant (stack_pointer_rtx, allocate); emit_move_insn ((eax_live ? eax : r10), gen_frame_mem (Pmode, t)); } } @@ -10282,7 +10463,7 @@ ix86_expand_prologue (void) if (!int_registers_saved) ix86_emit_save_regs_using_mov (frame.reg_save_offset); - if (frame.nsseregs) + if (!sse_registers_saved) ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset); pic_reg_used = false; @@ -10455,9 +10636,9 @@ ix86_emit_leave (void) add_reg_note (insn, REG_CFA_DEF_CFA, plus_constant (stack_pointer_rtx, m->fs.sp_offset)); RTX_FRAME_RELATED_P (insn) = 1; - ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx, - m->fs.fp_offset); } + ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx, + m->fs.fp_offset); } /* Emit code to restore saved registers using MOV insns. @@ -10524,6 +10705,17 @@ ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset, } } +/* Emit vzeroupper if needed. */ + +void +ix86_maybe_emit_epilogue_vzeroupper (void) +{ + if (TARGET_VZEROUPPER + && !TREE_THIS_VOLATILE (cfun->decl) + && !cfun->machine->caller_return_avx256_p) + emit_insn (gen_avx_vzeroupper (GEN_INT (call_no_avx256))); +} + /* Restore function stack, frame, and registers. */ void @@ -10722,8 +10914,13 @@ ix86_expand_epilogue (int style) } /* First step is to deallocate the stack frame so that we can - pop the registers. */ - if (!m->fs.sp_valid) + pop the registers. Also do it on SEH target for very large + frame as the emitted instructions aren't allowed by the ABI in + epilogues. */ + if (!m->fs.sp_valid + || (TARGET_SEH + && (m->fs.sp_offset - frame.reg_save_offset + >= SEH_MAX_FRAME_SIZE))) { pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx, GEN_INT (m->fs.fp_offset @@ -10821,10 +11018,7 @@ ix86_expand_epilogue (int style) } /* Emit vzeroupper if needed. 
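
   In source terms, the transition the helper guards against looks
   like this (illustration only; build with -mavx):

     #include <immintrin.h>

     void store_and_return (float *dst, __m256 v)
     {
       _mm256_storeu_ps (dst, v);
       _mm256_zeroupper ();   // clear upper halves before SSE-only code
     }
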
*/ - if (TARGET_VZEROUPPER - && !TREE_THIS_VOLATILE (cfun->decl) - && !cfun->machine->caller_return_avx256_p) - emit_insn (gen_avx_vzeroupper (GEN_INT (call_no_avx256))); + ix86_maybe_emit_epilogue_vzeroupper (); if (crtl->args.pops_args && crtl->args.size) { @@ -10919,12 +11113,15 @@ split_stack_prologue_scratch_regno (void) return R11_REG; else { - bool is_fastcall; + bool is_fastcall, is_thiscall; int regparm; is_fastcall = (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl))) != NULL); + is_thiscall = (lookup_attribute ("thiscall", + TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl))) + != NULL); regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl); if (is_fastcall) @@ -10937,6 +11134,12 @@ split_stack_prologue_scratch_regno (void) } return AX_REG; } + else if (is_thiscall) + { + if (!DECL_STATIC_CHAIN (cfun->decl)) + return DX_REG; + return AX_REG; + } else if (regparm < 3) { if (!DECL_STATIC_CHAIN (cfun->decl)) @@ -11232,30 +11435,6 @@ ix86_live_on_entry (bitmap regs) } } -/* Determine if op is suitable SUBREG RTX for address. */ - -static bool -ix86_address_subreg_operand (rtx op) -{ - enum machine_mode mode; - - if (!REG_P (op)) - return false; - - mode = GET_MODE (op); - - if (GET_MODE_CLASS (mode) != MODE_INT) - return false; - - /* Don't allow SUBREGs that span more than a word. It can lead to spill - failures when the register is one word out of a two word structure. */ - if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) - return false; - - /* Allow only SUBREGs of non-eliminable hard registers. */ - return register_no_elim_operand (op, mode); -} - /* Extract the parts of an RTL expression that is a valid memory address for an instruction. Return 0 if the structure of the address is grossly off. Return -1 if the address contains ASHIFT, so it is not @@ -11278,16 +11457,41 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) { if (GET_CODE (addr) == ZERO_EXTEND && GET_MODE (XEXP (addr, 0)) == SImode) - addr = XEXP (addr, 0); + { + addr = XEXP (addr, 0); + if (CONST_INT_P (addr)) + return 0; + } else if (GET_CODE (addr) == AND && const_32bit_mask (XEXP (addr, 1), DImode)) { addr = XEXP (addr, 0); - /* Strip subreg. */ + /* Adjust SUBREGs. */ if (GET_CODE (addr) == SUBREG && GET_MODE (SUBREG_REG (addr)) == SImode) - addr = SUBREG_REG (addr); + { + addr = SUBREG_REG (addr); + if (CONST_INT_P (addr)) + return 0; + } + else if (GET_MODE (addr) == DImode) + addr = gen_rtx_SUBREG (SImode, addr, 0); + else if (GET_MODE (addr) != VOIDmode) + return 0; + } + } + + /* Allow SImode subregs of DImode addresses, + they will be emitted with addr32 prefix. 
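
   For illustration, such an address reaches the assembler as a 32-bit
   register form, which gas encodes with the 0x67 addr32 prefix:

     movl  (%eax), %edx     # low 32 bits of %rax form the address
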
*/ + if (TARGET_64BIT && GET_MODE (addr) == SImode) + { + if (GET_CODE (addr) == SUBREG + && GET_MODE (SUBREG_REG (addr)) == DImode) + { + addr = SUBREG_REG (addr); + if (CONST_INT_P (addr)) + return 0; } } @@ -11295,7 +11499,7 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) base = addr; else if (GET_CODE (addr) == SUBREG) { - if (ix86_address_subreg_operand (SUBREG_REG (addr))) + if (REG_P (SUBREG_REG (addr))) base = addr; else return 0; @@ -11353,7 +11557,7 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) break; case SUBREG: - if (!ix86_address_subreg_operand (SUBREG_REG (op))) + if (!REG_P (SUBREG_REG (op))) return 0; /* FALLTHRU */ @@ -11406,7 +11610,7 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) if (REG_P (index)) ; else if (GET_CODE (index) == SUBREG - && ix86_address_subreg_operand (SUBREG_REG (index))) + && REG_P (SUBREG_REG (index))) ; else return 0; @@ -11727,6 +11931,13 @@ legitimate_pic_address_disp_p (rtx disp) break; if (GET_CODE (op0) == LABEL_REF) return true; + if (GET_CODE (op0) == CONST + && GET_CODE (XEXP (op0, 0)) == UNSPEC + && XINT (XEXP (op0, 0), 1) == UNSPEC_PCREL) + return true; + if (GET_CODE (op0) == UNSPEC + && XINT (op0, 1) == UNSPEC_PCREL) + return true; if (GET_CODE (op0) != SYMBOL_REF) break; /* FALLTHRU */ @@ -11820,6 +12031,103 @@ legitimate_pic_address_disp_p (rtx disp) return false; } +/* Our implementation of LEGITIMIZE_RELOAD_ADDRESS. Returns a value to + replace the input X, or the original X if no replacement is called for. + The output parameter *WIN is 1 if the calling macro should goto WIN, + 0 if it should not. */ + +bool +ix86_legitimize_reload_address (rtx x, + enum machine_mode mode ATTRIBUTE_UNUSED, + int opnum, int type, + int ind_levels ATTRIBUTE_UNUSED) +{ + /* Reload can generate: + + (plus:DI (plus:DI (unspec:DI [(const_int 0 [0])] UNSPEC_TP) + (reg:DI 97)) + (reg:DI 2 cx)) + + This RTX is rejected from ix86_legitimate_address_p due to + non-strictness of base register 97. Following this rejection, + reload pushes all three components into separate registers, + creating invalid memory address RTX. + + Following code reloads only the invalid part of the + memory address RTX. */ + + if (GET_CODE (x) == PLUS + && REG_P (XEXP (x, 1)) + && GET_CODE (XEXP (x, 0)) == PLUS + && REG_P (XEXP (XEXP (x, 0), 1))) + { + rtx base, index; + bool something_reloaded = false; + + base = XEXP (XEXP (x, 0), 1); + if (!REG_OK_FOR_BASE_STRICT_P (base)) + { + push_reload (base, NULL_RTX, &XEXP (XEXP (x, 0), 1), NULL, + BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0, + opnum, (enum reload_type)type); + something_reloaded = true; + } + + index = XEXP (x, 1); + if (!REG_OK_FOR_INDEX_STRICT_P (index)) + { + push_reload (index, NULL_RTX, &XEXP (x, 1), NULL, + INDEX_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0, + opnum, (enum reload_type)type); + something_reloaded = true; + } + + gcc_assert (something_reloaded); + return true; + } + + return false; +} + +/* Determine if op is suitable RTX for an address register. + Return naked register if a register or a register subreg is + found, otherwise return NULL_RTX. */ + +static rtx +ix86_validate_address_register (rtx op) +{ + enum machine_mode mode = GET_MODE (op); + + /* Only SImode or DImode registers can form the address. 
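
   Examples of what this check accepts and rejects (64-bit,
   illustrative):

     (reg:DI ax)                 ok, returned as-is
     (subreg:SI (reg:DI ax) 0)   ok, the naked (reg:DI ax) is returned
     (reg:QI ...)                rejected, wrong mode for an address
     (subreg:SI (reg:TI ...) 0)  rejected, inner reg spans two words
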
*/ + if (mode != SImode && mode != DImode) + return NULL_RTX; + + if (REG_P (op)) + return op; + else if (GET_CODE (op) == SUBREG) + { + rtx reg = SUBREG_REG (op); + + if (!REG_P (reg)) + return NULL_RTX; + + mode = GET_MODE (reg); + + /* Don't allow SUBREGs that span more than a word. It can + lead to spill failures when the register is one word out + of a two word structure. */ + if (GET_MODE_SIZE (mode) > UNITS_PER_WORD) + return NULL_RTX; + + /* Allow only SUBREGs of non-eliminable hard registers. */ + if (register_no_elim_operand (reg, mode)) + return reg; + } + + /* Op is not a register. */ + return NULL_RTX; +} + /* Recognizes RTL expressions that are valid memory addresses for an instruction. The MODE argument is the machine mode for the MEM expression that wants to use this address. @@ -11835,6 +12143,7 @@ ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED, struct ix86_address parts; rtx base, index, disp; HOST_WIDE_INT scale; + enum ix86_address_seg seg; if (ix86_decompose_address (addr, &parts) <= 0) /* Decomposition failed. */ @@ -11844,21 +12153,14 @@ ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED, index = parts.index; disp = parts.disp; scale = parts.scale; + seg = parts.seg; /* Validate base register. */ if (base) { - rtx reg; + rtx reg = ix86_validate_address_register (base); - if (REG_P (base)) - reg = base; - else if (GET_CODE (base) == SUBREG && REG_P (SUBREG_REG (base))) - reg = SUBREG_REG (base); - else - /* Base is not a register. */ - return false; - - if (GET_MODE (base) != SImode && GET_MODE (base) != DImode) + if (reg == NULL_RTX) return false; if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg)) @@ -11870,17 +12172,9 @@ ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED, /* Validate index register. */ if (index) { - rtx reg; - - if (REG_P (index)) - reg = index; - else if (GET_CODE (index) == SUBREG && REG_P (SUBREG_REG (index))) - reg = SUBREG_REG (index); - else - /* Index is not a register. */ - return false; + rtx reg = ix86_validate_address_register (index); - if (GET_MODE (index) != SImode && GET_MODE (index) != DImode) + if (reg == NULL_RTX) return false; if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg)) @@ -11894,6 +12188,12 @@ ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED, && GET_MODE (base) != GET_MODE (index)) return false; + /* Address override works only on the (%reg) part of %fs:(%reg). */ + if (seg != SEG_DEFAULT + && ((base && GET_MODE (base) != word_mode) + || (index && GET_MODE (index) != word_mode))) + return false; + /* Validate scale factor. */ if (scale != 1) { @@ -12015,6 +12315,12 @@ ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED, && !x86_64_immediate_operand (disp, VOIDmode)) /* Displacement is out of range. */ return false; + /* In x32 mode, constant addresses are sign extended to 64bit, so + we have to prevent addresses from 0x80000000 to 0xffffffff. */ + else if (TARGET_X32 && !(index || base) + && CONST_INT_P (disp) + && val_signbit_known_set_p (SImode, INTVAL (disp))) + return false; } /* Everything looks valid. */ @@ -12063,7 +12369,6 @@ legitimize_pic_address (rtx orig, rtx reg) { rtx addr = orig; rtx new_rtx = orig; - rtx base; #if TARGET_MACHO if (TARGET_MACHO && !TARGET_64BIT) @@ -12268,20 +12573,33 @@ legitimize_pic_address (rtx orig, rtx reg) } else { - base = legitimize_pic_address (XEXP (addr, 0), reg); - new_rtx = legitimize_pic_address (XEXP (addr, 1), - base == reg ? 
NULL_RTX : reg); + rtx base = legitimize_pic_address (op0, reg); + enum machine_mode mode = GET_MODE (base); + new_rtx + = legitimize_pic_address (op1, base == reg ? NULL_RTX : reg); if (CONST_INT_P (new_rtx)) - new_rtx = plus_constant (base, INTVAL (new_rtx)); + { + if (INTVAL (new_rtx) < -16*1024*1024 + || INTVAL (new_rtx) >= 16*1024*1024) + { + if (!x86_64_immediate_operand (new_rtx, mode)) + new_rtx = force_reg (mode, new_rtx); + new_rtx + = gen_rtx_PLUS (mode, force_reg (mode, base), new_rtx); + } + else + new_rtx = plus_constant (base, INTVAL (new_rtx)); + } else { - if (GET_CODE (new_rtx) == PLUS && CONSTANT_P (XEXP (new_rtx, 1))) + if (GET_CODE (new_rtx) == PLUS + && CONSTANT_P (XEXP (new_rtx, 1))) { - base = gen_rtx_PLUS (Pmode, base, XEXP (new_rtx, 0)); + base = gen_rtx_PLUS (mode, base, XEXP (new_rtx, 0)); new_rtx = XEXP (new_rtx, 1); } - new_rtx = gen_rtx_PLUS (Pmode, base, new_rtx); + new_rtx = gen_rtx_PLUS (mode, base, new_rtx); } } } @@ -12380,6 +12698,9 @@ legitimize_tls_address (rtx x, enum tls_model model, bool for_mov) tp = get_thread_pointer (true); dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest)); + if (GET_MODE (x) != Pmode) + x = gen_rtx_ZERO_EXTEND (Pmode, x); + set_unique_reg_note (get_last_insn (), REG_EQUAL, x); } else @@ -12388,13 +12709,17 @@ legitimize_tls_address (rtx x, enum tls_model model, bool for_mov) if (TARGET_64BIT) { - rtx rax = gen_rtx_REG (Pmode, AX_REG), insns; + rtx rax = gen_rtx_REG (Pmode, AX_REG); + rtx insns; start_sequence (); emit_call_insn (gen_tls_global_dynamic_64 (rax, x, caddr)); insns = get_insns (); end_sequence (); + if (GET_MODE (x) != Pmode) + x = gen_rtx_ZERO_EXTEND (Pmode, x); + RTL_CONST_CALL_P (insns) = 1; emit_libcall_block (insns, dest, rax, x); } @@ -12436,7 +12761,8 @@ legitimize_tls_address (rtx x, enum tls_model model, bool for_mov) if (TARGET_64BIT) { - rtx rax = gen_rtx_REG (Pmode, AX_REG), insns, eqv; + rtx rax = gen_rtx_REG (Pmode, AX_REG); + rtx insns, eqv; start_sequence (); emit_call_insn (gen_tls_local_dynamic_base_64 (rax, caddr)); @@ -12464,6 +12790,9 @@ legitimize_tls_address (rtx x, enum tls_model model, bool for_mov) { dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp)); + if (GET_MODE (x) != Pmode) + x = gen_rtx_ZERO_EXTEND (Pmode, x); + set_unique_reg_note (get_last_insn (), REG_EQUAL, x); } break; @@ -13131,14 +13460,27 @@ ix86_delegitimize_address (rtx x) if (TARGET_64BIT) { + if (GET_CODE (x) == CONST + && GET_CODE (XEXP (x, 0)) == PLUS + && GET_MODE (XEXP (x, 0)) == Pmode + && CONST_INT_P (XEXP (XEXP (x, 0), 1)) + && GET_CODE (XEXP (XEXP (x, 0), 0)) == UNSPEC + && XINT (XEXP (XEXP (x, 0), 0), 1) == UNSPEC_PCREL) + { + rtx x2 = XVECEXP (XEXP (XEXP (x, 0), 0), 0, 0); + x = gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 1), x2); + if (MEM_P (orig_x)) + x = replace_equiv_address_nv (orig_x, x); + return x; + } if (GET_CODE (x) != CONST || GET_CODE (XEXP (x, 0)) != UNSPEC || (XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL) - || !MEM_P (orig_x)) + || (!MEM_P (orig_x) && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL)) return ix86_delegitimize_tls_address (orig_x); x = XVECEXP (XEXP (x, 0), 0, 0); - if (GET_MODE (orig_x) != GET_MODE (x)) + if (GET_MODE (orig_x) != GET_MODE (x) && MEM_P (orig_x)) { x = simplify_gen_subreg (GET_MODE (orig_x), x, GET_MODE (x), 0); @@ -13321,8 +13663,6 @@ put_condition_code (enum rtx_code code, enum machine_mode mode, int reverse, Those same assemblers have the same but opposite lossage on cmov. */ if (mode == CCmode) suffix = fp ? 
"nbe" : "a"; - else if (mode == CCCmode) - suffix = "b"; else gcc_unreachable (); break; @@ -13344,8 +13684,12 @@ put_condition_code (enum rtx_code code, enum machine_mode mode, int reverse, } break; case LTU: - gcc_assert (mode == CCmode || mode == CCCmode); - suffix = "b"; + if (mode == CCmode) + suffix = "b"; + else if (mode == CCCmode) + suffix = "c"; + else + gcc_unreachable (); break; case GE: switch (mode) @@ -13365,20 +13709,20 @@ put_condition_code (enum rtx_code code, enum machine_mode mode, int reverse, } break; case GEU: - /* ??? As above. */ - gcc_assert (mode == CCmode || mode == CCCmode); - suffix = fp ? "nb" : "ae"; + if (mode == CCmode) + suffix = fp ? "nb" : "ae"; + else if (mode == CCCmode) + suffix = "nc"; + else + gcc_unreachable (); break; case LE: gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode); suffix = "le"; break; case LEU: - /* ??? As above. */ if (mode == CCmode) suffix = "be"; - else if (mode == CCCmode) - suffix = fp ? "nb" : "ae"; else gcc_unreachable (); break; @@ -13410,15 +13754,9 @@ void print_reg (rtx x, int code, FILE *file) { const char *reg; + unsigned int regno; bool duplicated = code == 'd' && TARGET_AVX; - gcc_assert (x == pc_rtx - || (REGNO (x) != ARG_POINTER_REGNUM - && REGNO (x) != FRAME_POINTER_REGNUM - && REGNO (x) != FLAGS_REG - && REGNO (x) != FPSR_REG - && REGNO (x) != FPCR_REG)); - if (ASSEMBLER_DIALECT == ASM_ATT) putc ('%', file); @@ -13429,6 +13767,13 @@ print_reg (rtx x, int code, FILE *file) return; } + regno = true_regnum (x); + gcc_assert (regno != ARG_POINTER_REGNUM + && regno != FRAME_POINTER_REGNUM + && regno != FLAGS_REG + && regno != FPSR_REG + && regno != FPCR_REG); + if (code == 'w' || MMX_REG_P (x)) code = 2; else if (code == 'b') @@ -13449,26 +13794,28 @@ print_reg (rtx x, int code, FILE *file) code = GET_MODE_SIZE (GET_MODE (x)); /* Irritatingly, AMD extended registers use different naming convention - from the normal registers. */ - if (REX_INT_REG_P (x)) + from the normal registers: "r%d[bwd]" */ + if (REX_INT_REGNO_P (regno)) { gcc_assert (TARGET_64BIT); + putc ('r', file); + fprint_ul (file, regno - FIRST_REX_INT_REG + 8); switch (code) { case 0: error ("extended registers have no high halves"); break; case 1: - fprintf (file, "r%ib", REGNO (x) - FIRST_REX_INT_REG + 8); + putc ('b', file); break; case 2: - fprintf (file, "r%iw", REGNO (x) - FIRST_REX_INT_REG + 8); + putc ('w', file); break; case 4: - fprintf (file, "r%id", REGNO (x) - FIRST_REX_INT_REG + 8); + putc ('d', file); break; case 8: - fprintf (file, "r%i", REGNO (x) - FIRST_REX_INT_REG + 8); + /* no suffix */ break; default: error ("unsupported operand size for extended register"); @@ -13496,24 +13843,24 @@ print_reg (rtx x, int code, FILE *file) case 16: case 2: normal: - reg = hi_reg_name[REGNO (x)]; + reg = hi_reg_name[regno]; break; case 1: - if (REGNO (x) >= ARRAY_SIZE (qi_reg_name)) + if (regno >= ARRAY_SIZE (qi_reg_name)) goto normal; - reg = qi_reg_name[REGNO (x)]; + reg = qi_reg_name[regno]; break; case 0: - if (REGNO (x) >= ARRAY_SIZE (qi_high_reg_name)) + if (regno >= ARRAY_SIZE (qi_high_reg_name)) goto normal; - reg = qi_high_reg_name[REGNO (x)]; + reg = qi_high_reg_name[regno]; break; case 32: if (SSE_REG_P (x)) { gcc_assert (!duplicated); putc ('y', file); - fputs (hi_reg_name[REGNO (x)] + 1, file); + fputs (hi_reg_name[regno] + 1, file); return; } break; @@ -13578,6 +13925,7 @@ get_some_local_dynamic_name (void) Z -- likewise, with special suffixes for x87 instructions. 
* -- print a star (in certain assembler syntax) A -- print an absolute memory reference. + E -- print address with DImode register names if TARGET_64BIT. w -- print the operand as if it's a "word" (HImode) even if it isn't. s -- print a shift double count, followed by the assemblers argument delimiter. @@ -13653,7 +14001,14 @@ ix86_print_operand (FILE *file, rtx x, int code) ix86_print_operand (file, x, 0); return; + case 'E': + /* Wrap address in an UNSPEC to declare special handling. */ + if (TARGET_64BIT) + x = gen_rtx_UNSPEC (DImode, gen_rtvec (1, x), UNSPEC_LEA_ADDR); + output_address (x); + return; + case 'L': if (ASSEMBLER_DIALECT == ASM_ATT) putc ('l', file); @@ -13979,6 +14334,13 @@ ix86_print_operand (FILE *file, rtx x, int code) return; case 'H': + if (!offsettable_memref_p (x)) + { + output_operand_lossage ("operand is not an offsettable memory " + "reference, invalid operand " + "code 'H'"); + return; + } /* It doesn't actually matter what mode we use here, as we're only going to use this for printing. */ x = adjust_address_nv (x, DImode, 8); @@ -14171,7 +14533,8 @@ ix86_print_operand (FILE *file, rtx x, int code) putc ('$', file); /* Sign extend 32bit SFmode immediate to 8 bytes. */ if (code == 'q') - fprintf (file, "0x%08llx", (unsigned long long) (int) l); + fprintf (file, "0x%08" HOST_LONG_LONG_FORMAT "x", + (unsigned long long) (int) l); else fprintf (file, "0x%08x", (unsigned int) l); } @@ -14251,6 +14614,7 @@ ix86_print_operand_address (FILE *file, rtx addr) int scale; int ok; bool vsib = false; + int code = 0; if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_VSIBADDR) { @@ -14261,25 +14625,17 @@ ix86_print_operand_address (FILE *file, rtx addr) addr = XVECEXP (addr, 0, 0); vsib = true; } + else if (GET_CODE (addr) == UNSPEC && XINT (addr, 1) == UNSPEC_LEA_ADDR) + { + gcc_assert (TARGET_64BIT); + ok = ix86_decompose_address (XVECEXP (addr, 0, 0), &parts); + code = 'q'; + } else ok = ix86_decompose_address (addr, &parts); gcc_assert (ok); - if (parts.base && GET_CODE (parts.base) == SUBREG) - { - rtx tmp = SUBREG_REG (parts.base); - parts.base = simplify_subreg (GET_MODE (parts.base), - tmp, GET_MODE (tmp), 0); - } - - if (parts.index && GET_CODE (parts.index) == SUBREG) - { - rtx tmp = SUBREG_REG (parts.index); - parts.index = simplify_subreg (GET_MODE (parts.index), - tmp, GET_MODE (tmp), 0); - } - base = parts.base; index = parts.index; disp = parts.disp; @@ -14331,15 +14687,51 @@ ix86_print_operand_address (FILE *file, rtx addr) } else { - int code = 0; - - /* Print SImode registers for zero-extended addresses to force - addr32 prefix. Otherwise print DImode registers to avoid it. */ - if (TARGET_64BIT) - code = ((GET_CODE (addr) == ZERO_EXTEND - || GET_CODE (addr) == AND) - ? 'l' - : 'q'); + /* Print SImode register names to force addr32 prefix. */ + if (SImode_address_operand (addr, VOIDmode)) + { +#ifdef ENABLE_CHECKING + gcc_assert (TARGET_64BIT); + switch (GET_CODE (addr)) + { + case SUBREG: + gcc_assert (GET_MODE (addr) == SImode); + gcc_assert (GET_MODE (SUBREG_REG (addr)) == DImode); + break; + case ZERO_EXTEND: + case AND: + gcc_assert (GET_MODE (addr) == DImode); + break; + default: + gcc_unreachable (); + } +#endif + gcc_assert (!code); + code = 'k'; + } + else if (code == 0 + && TARGET_X32 + && disp + && CONST_INT_P (disp) + && INTVAL (disp) < -16*1024*1024) + { + /* X32 runs in 64-bit mode, where displacement, DISP, in + address DISP(%r64), is encoded as 32-bit immediate sign- + extended from 32-bit to 64-bit. 
For -0x40000300(%r64), + address is %r64 + 0xffffffffbffffd00. When %r64 < + 0x40000300, like 0x37ffe064, address is 0xfffffffff7ffdd64, + which is invalid for x32. The correct address is %r64 + - 0x40000300 == 0xf7ffdd64. To properly encode + -0x40000300(%r64) for x32, we zero-extend negative + displacement by forcing addr32 prefix which truncates + 0xfffffffff7ffdd64 to 0xf7ffdd64. In theory, we should + zero-extend all negative displacements, including -1(%rsp). + However, for small negative displacements, sign-extension + won't cause overflow. We only zero-extend negative + displacements if they < -16*1024*1024, which is also used + to check legitimate address displacements for PIC. */ + code = 'k'; + } if (ASSEMBLER_DIALECT == ASM_ATT) { @@ -15239,8 +15631,7 @@ ix86_expand_move (enum machine_mode mode, rtx operands[]) op0, 1, OPTAB_DIRECT); if (tmp == op0) return; - if (GET_MODE (tmp) != mode) - op1 = convert_to_mode (mode, tmp, 1); + op1 = convert_to_mode (mode, tmp, 1); } } @@ -15404,7 +15795,8 @@ ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1) { rtx m; rtx (*extract) (rtx, rtx, rtx); - rtx (*move_unaligned) (rtx, rtx); + rtx (*load_unaligned) (rtx, rtx); + rtx (*store_unaligned) (rtx, rtx); enum machine_mode mode; switch (GET_MODE (op0)) @@ -15413,39 +15805,52 @@ ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1) gcc_unreachable (); case V32QImode: extract = gen_avx_vextractf128v32qi; - move_unaligned = gen_avx_movdqu256; + load_unaligned = gen_avx_loaddqu256; + store_unaligned = gen_avx_storedqu256; mode = V16QImode; break; case V8SFmode: extract = gen_avx_vextractf128v8sf; - move_unaligned = gen_avx_movups256; + load_unaligned = gen_avx_loadups256; + store_unaligned = gen_avx_storeups256; mode = V4SFmode; break; case V4DFmode: extract = gen_avx_vextractf128v4df; - move_unaligned = gen_avx_movupd256; + load_unaligned = gen_avx_loadupd256; + store_unaligned = gen_avx_storeupd256; mode = V2DFmode; break; } - if (MEM_P (op1) && TARGET_AVX256_SPLIT_UNALIGNED_LOAD) + if (MEM_P (op1)) { - rtx r = gen_reg_rtx (mode); - m = adjust_address (op1, mode, 0); - emit_move_insn (r, m); - m = adjust_address (op1, mode, 16); - r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m); - emit_move_insn (op0, r); + if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD) + { + rtx r = gen_reg_rtx (mode); + m = adjust_address (op1, mode, 0); + emit_move_insn (r, m); + m = adjust_address (op1, mode, 16); + r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m); + emit_move_insn (op0, r); + } + else + emit_insn (load_unaligned (op0, op1)); } - else if (MEM_P (op0) && TARGET_AVX256_SPLIT_UNALIGNED_STORE) + else if (MEM_P (op0)) { - m = adjust_address (op0, mode, 0); - emit_insn (extract (m, op1, const0_rtx)); - m = adjust_address (op0, mode, 16); - emit_insn (extract (m, op1, const1_rtx)); + if (TARGET_AVX256_SPLIT_UNALIGNED_STORE) + { + m = adjust_address (op0, mode, 0); + emit_insn (extract (m, op1, const0_rtx)); + m = adjust_address (op0, mode, 16); + emit_insn (extract (m, op1, const1_rtx)); + } + else + emit_insn (store_unaligned (op0, op1)); } else - emit_insn (move_unaligned (op0, op1)); + gcc_unreachable (); } /* Implement the movmisalign patterns for SSE. 
Non-SSE modes go @@ -15504,6 +15909,7 @@ void ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[]) { rtx op0, op1, m; + rtx (*move_unaligned) (rtx, rtx); op0 = operands[0]; op1 = operands[1]; @@ -15520,14 +15926,28 @@ ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[]) /* If we're optimizing for size, movups is the smallest. */ if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) { + if (MEM_P (op1)) + move_unaligned = gen_sse_loadups; + else if (MEM_P (op0)) + move_unaligned = gen_sse_storeups; + else + gcc_unreachable (); + op0 = gen_lowpart (V4SFmode, op0); op1 = gen_lowpart (V4SFmode, op1); - emit_insn (gen_sse_movups (op0, op1)); + emit_insn (move_unaligned (op0, op1)); return; } + if (MEM_P (op1)) + move_unaligned = gen_sse2_loaddqu; + else if (MEM_P (op0)) + move_unaligned = gen_sse2_storedqu; + else + gcc_unreachable (); + op0 = gen_lowpart (V16QImode, op0); op1 = gen_lowpart (V16QImode, op1); - emit_insn (gen_sse2_movdqu (op0, op1)); + emit_insn (move_unaligned (op0, op1)); break; case 32: op0 = gen_lowpart (V32QImode, op0); @@ -15545,7 +15965,14 @@ ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[]) switch (mode) { case V4SFmode: - emit_insn (gen_sse_movups (op0, op1)); + if (MEM_P (op1)) + move_unaligned = gen_sse_loadups; + else if (MEM_P (op0)) + move_unaligned = gen_sse_storeups; + else + gcc_unreachable (); + + emit_insn (move_unaligned (op0, op1)); break; case V8SFmode: ix86_avx256_split_vector_move_misalign (op0, op1); @@ -15553,12 +15980,26 @@ ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[]) case V2DFmode: if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) { + if (MEM_P (op1)) + move_unaligned = gen_sse_loadups; + else if (MEM_P (op0)) + move_unaligned = gen_sse_storeups; + else + gcc_unreachable (); + op0 = gen_lowpart (V4SFmode, op0); op1 = gen_lowpart (V4SFmode, op1); - emit_insn (gen_sse_movups (op0, op1)); + emit_insn (move_unaligned (op0, op1)); return; } - emit_insn (gen_sse2_movupd (op0, op1)); + if (MEM_P (op1)) + move_unaligned = gen_sse2_loadupd; + else if (MEM_P (op0)) + move_unaligned = gen_sse2_storeupd; + else + gcc_unreachable (); + + emit_insn (move_unaligned (op0, op1)); break; case V4DFmode: ix86_avx256_split_vector_move_misalign (op0, op1); @@ -15583,7 +16024,7 @@ ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[]) { op0 = gen_lowpart (V4SFmode, op0); op1 = gen_lowpart (V4SFmode, op1); - emit_insn (gen_sse_movups (op0, op1)); + emit_insn (gen_sse_loadups (op0, op1)); return; } @@ -15594,7 +16035,7 @@ ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[]) { op0 = gen_lowpart (V16QImode, op0); op1 = gen_lowpart (V16QImode, op1); - emit_insn (gen_sse2_movdqu (op0, op1)); + emit_insn (gen_sse2_loaddqu (op0, op1)); return; } @@ -15606,7 +16047,7 @@ ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[]) { op0 = gen_lowpart (V2DFmode, op0); op1 = gen_lowpart (V2DFmode, op1); - emit_insn (gen_sse2_movupd (op0, op1)); + emit_insn (gen_sse2_loadupd (op0, op1)); return; } @@ -15641,7 +16082,7 @@ ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[]) { op0 = gen_lowpart (V4SFmode, op0); op1 = gen_lowpart (V4SFmode, op1); - emit_insn (gen_sse_movups (op0, op1)); + emit_insn (gen_sse_loadups (op0, op1)); return; } @@ -15666,7 +16107,7 @@ ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[]) { op0 = gen_lowpart (V4SFmode, op0); op1 = gen_lowpart (V4SFmode, op1); - emit_insn (gen_sse_movups (op0, 
op1)); + emit_insn (gen_sse_storeups (op0, op1)); return; } @@ -15677,7 +16118,7 @@ ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[]) { op0 = gen_lowpart (V16QImode, op0); op1 = gen_lowpart (V16QImode, op1); - emit_insn (gen_sse2_movdqu (op0, op1)); + emit_insn (gen_sse2_storedqu (op0, op1)); return; } @@ -15687,7 +16128,7 @@ ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[]) { op0 = gen_lowpart (V2DFmode, op0); op1 = gen_lowpart (V2DFmode, op1); - emit_insn (gen_sse2_movupd (op0, op1)); + emit_insn (gen_sse2_storeupd (op0, op1)); } else { @@ -15705,7 +16146,7 @@ ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[]) if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL) { op0 = gen_lowpart (V4SFmode, op0); - emit_insn (gen_sse_movups (op0, op1)); + emit_insn (gen_sse_storeups (op0, op1)); } else { @@ -16173,7 +16614,6 @@ distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2, basic_block bb = start ? BLOCK_FOR_INSN (start) : NULL; rtx prev = start; rtx next = NULL; - enum attr_type insn_type; *found = false; @@ -16186,8 +16626,8 @@ distance_non_agu_define_in_bb (unsigned int regno1, unsigned int regno2, distance = increase_distance (prev, next, distance); if (insn_defines_reg (regno1, regno2, prev)) { - insn_type = get_attr_type (prev); - if (insn_type != TYPE_LEA) + if (recog_memoized (prev) < 0 + || get_attr_type (prev) != TYPE_LEA) { *found = true; return distance; @@ -16414,9 +16854,9 @@ distance_agu_use (unsigned int regno0, rtx insn) over a sequence of instructions. Instructions sequence has SPLIT_COST cycles higher latency than lea latency. */ -bool +static bool ix86_lea_outperforms (rtx insn, unsigned int regno0, unsigned int regno1, - unsigned int regno2, unsigned int split_cost) + unsigned int regno2, int split_cost) { int dist_define, dist_use; @@ -16509,6 +16949,29 @@ ix86_avoid_lea_for_add (rtx insn, rtx operands[]) return !ix86_lea_outperforms (insn, regno0, regno1, regno2, 1); } +/* Return true if we should emit lea instruction instead of mov + instruction. */ + +bool +ix86_use_lea_for_mov (rtx insn, rtx operands[]) +{ + unsigned int regno0; + unsigned int regno1; + + /* Check if we need to optimize. */ + if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun)) + return false; + + /* Use lea for reg to reg moves only. */ + if (!REG_P (operands[0]) || !REG_P (operands[1])) + return false; + + regno0 = true_regnum (operands[0]); + regno1 = true_regnum (operands[1]); + + return ix86_lea_outperforms (insn, regno0, regno1, INVALID_REGNUM, 0); +} + /* Return true if we need to split lea into a sequence of instructions to avoid AGU stalls. */ @@ -16516,12 +16979,17 @@ bool ix86_avoid_lea_for_addr (rtx insn, rtx operands[]) { unsigned int regno0 = true_regnum (operands[0]) ; - unsigned int regno1 = -1; - unsigned int regno2 = -1; - unsigned int split_cost = 0; + unsigned int regno1 = INVALID_REGNUM; + unsigned int regno2 = INVALID_REGNUM; + int split_cost = 0; struct ix86_address parts; int ok; + /* FIXME: Handle zero-extended addresses. */ + if (GET_CODE (operands[1]) == ZERO_EXTEND + || GET_CODE (operands[1]) == AND) + return false; + /* Check we need to optimize. */ if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun)) return false; @@ -16533,6 +17001,11 @@ ix86_avoid_lea_for_addr (rtx insn, rtx operands[]) ok = ix86_decompose_address (operands[1], &parts); gcc_assert (ok); + /* There should be at least two components in the address. 
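
   Illustration: an address such as 4(%rax,%rbx,2), e.g. from

     long addr_calc (long base, long idx)
     {
       return base + 2 * idx + 4;
     }

   has three components and remains a candidate for splitting, while a
   single-component lea is really just a move and is rejected here
   (ix86_use_lea_for_mov covers that case).
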
*/ + if ((parts.base != NULL_RTX) + (parts.index != NULL_RTX) + + (parts.disp != NULL_RTX) + (parts.scale > 1) < 2) + return false; + /* We should not split into add if non legitimate pic operand is used as displacement. */ if (parts.disp && flag_pic && !LEGITIMATE_PIC_OPERAND_P (parts.disp)) @@ -17597,12 +18070,7 @@ ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1) return CCmode; case GTU: /* CF=0 & ZF=0 */ case LEU: /* CF=1 | ZF=1 */ - /* Detect overflow checks. They need just the carry flag. */ - if (GET_CODE (op0) == MINUS - && rtx_equal_p (op1, XEXP (op0, 0))) - return CCCmode; - else - return CCmode; + return CCmode; /* Codes possibly doable only with sign flag when comparing against zero. */ case GE: /* SF=OF or SF=0 */ @@ -19312,8 +19780,51 @@ ix86_expand_int_vcond (rtx operands[]) cop0 = operands[4]; cop1 = operands[5]; - /* XOP supports all of the comparisons on all vector int types. */ - if (!TARGET_XOP) + /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31 + and x < 0 ? 1 : 0 into (unsigned) x >> 31. */ + if ((code == LT || code == GE) + && data_mode == mode + && cop1 == CONST0_RTX (mode) + && operands[1 + (code == LT)] == CONST0_RTX (data_mode) + && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) > 1 + && GET_MODE_SIZE (GET_MODE_INNER (data_mode)) <= 8 + && (GET_MODE_SIZE (data_mode) == 16 + || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32))) + { + rtx negop = operands[2 - (code == LT)]; + int shift = GET_MODE_BITSIZE (GET_MODE_INNER (data_mode)) - 1; + if (negop == CONST1_RTX (data_mode)) + { + rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift), + operands[0], 1, OPTAB_DIRECT); + if (res != operands[0]) + emit_move_insn (operands[0], res); + return true; + } + else if (GET_MODE_INNER (data_mode) != DImode + && vector_all_ones_operand (negop, data_mode)) + { + rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift), + operands[0], 0, OPTAB_DIRECT); + if (res != operands[0]) + emit_move_insn (operands[0], res); + return true; + } + } + + if (!nonimmediate_operand (cop1, mode)) + cop1 = force_reg (mode, cop1); + if (!general_operand (operands[1], data_mode)) + operands[1] = force_reg (data_mode, operands[1]); + if (!general_operand (operands[2], data_mode)) + operands[2] = force_reg (data_mode, operands[2]); + + /* XOP supports all of the comparisons on all 128-bit vector int types. */ + if (TARGET_XOP + && (mode == V16QImode || mode == V8HImode + || mode == V4SImode || mode == V2DImode)) + ; + else { /* Canonicalize the comparison to EQ, GT, GTU. */ switch (code) @@ -19507,7 +20018,7 @@ ix86_expand_vec_perm (rtx operands[]) vt = force_reg (maskmode, vt); mask = gen_lowpart (maskmode, mask); if (maskmode == V8SImode) - emit_insn (gen_avx2_permvarv8si (t1, vt, mask)); + emit_insn (gen_avx2_permvarv8si (t1, mask, vt)); else emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt)); @@ -19523,7 +20034,7 @@ ix86_expand_vec_perm (rtx operands[]) vec[i * 2 + 1] = const1_rtx; } vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec)); - vt = force_const_mem (maskmode, vt); + vt = validize_mem (force_const_mem (maskmode, vt)); t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1, OPTAB_DIRECT); @@ -19541,13 +20052,13 @@ ix86_expand_vec_perm (rtx operands[]) the high bits of the shuffle elements. No need for us to perform an AND ourselves. 
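
   Illustration of the operand order being fixed below (data vector
   first, selector second, matching the intrinsic; build with -mavx2):

     #include <immintrin.h>

     __m256 reverse8 (__m256 v)
     {
       const __m256i idx = _mm256_setr_epi32 (7, 6, 5, 4, 3, 2, 1, 0);
       return _mm256_permutevar8x32_ps (v, idx);   // vpermps
     }
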
*/ if (one_operand_shuffle) - emit_insn (gen_avx2_permvarv8si (target, mask, op0)); + emit_insn (gen_avx2_permvarv8si (target, op0, mask)); else { t1 = gen_reg_rtx (V8SImode); t2 = gen_reg_rtx (V8SImode); - emit_insn (gen_avx2_permvarv8si (t1, mask, op0)); - emit_insn (gen_avx2_permvarv8si (t2, mask, op1)); + emit_insn (gen_avx2_permvarv8si (t1, op0, mask)); + emit_insn (gen_avx2_permvarv8si (t2, op1, mask)); goto merge_two; } return; @@ -19555,13 +20066,13 @@ ix86_expand_vec_perm (rtx operands[]) case V8SFmode: mask = gen_lowpart (V8SFmode, mask); if (one_operand_shuffle) - emit_insn (gen_avx2_permvarv8sf (target, mask, op0)); + emit_insn (gen_avx2_permvarv8sf (target, op0, mask)); else { t1 = gen_reg_rtx (V8SFmode); t2 = gen_reg_rtx (V8SFmode); - emit_insn (gen_avx2_permvarv8sf (t1, mask, op0)); - emit_insn (gen_avx2_permvarv8sf (t2, mask, op1)); + emit_insn (gen_avx2_permvarv8sf (t1, op0, mask)); + emit_insn (gen_avx2_permvarv8sf (t2, op1, mask)); goto merge_two; } return; @@ -19574,17 +20085,17 @@ ix86_expand_vec_perm (rtx operands[]) t2 = gen_reg_rtx (V8SImode); emit_insn (gen_avx_vec_concatv8si (t1, op0, op1)); emit_insn (gen_avx_vec_concatv8si (t2, mask, mask)); - emit_insn (gen_avx2_permvarv8si (t1, t2, t1)); + emit_insn (gen_avx2_permvarv8si (t1, t1, t2)); emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx)); return; case V4SFmode: t1 = gen_reg_rtx (V8SFmode); - t2 = gen_reg_rtx (V8SFmode); - mask = gen_lowpart (V4SFmode, mask); + t2 = gen_reg_rtx (V8SImode); + mask = gen_lowpart (V4SImode, mask); emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1)); - emit_insn (gen_avx_vec_concatv8sf (t2, mask, mask)); - emit_insn (gen_avx2_permvarv8sf (t1, t2, t1)); + emit_insn (gen_avx_vec_concatv8si (t2, mask, mask)); + emit_insn (gen_avx2_permvarv8sf (t1, t1, t2)); emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx)); return; @@ -19720,7 +20231,7 @@ ix86_expand_vec_perm (rtx operands[]) for (i = 0; i < 16; ++i) vec[i] = GEN_INT (i/e * e); vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec)); - vt = force_const_mem (V16QImode, vt); + vt = validize_mem (force_const_mem (V16QImode, vt)); if (TARGET_XOP) emit_insn (gen_xop_pperm (mask, mask, mask, vt)); else @@ -19731,7 +20242,7 @@ ix86_expand_vec_perm (rtx operands[]) for (i = 0; i < 16; ++i) vec[i] = GEN_INT (i % e); vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec)); - vt = force_const_mem (V16QImode, vt); + vt = validize_mem (force_const_mem (V16QImode, vt)); emit_insn (gen_addv16qi3 (mask, mask, vt)); } @@ -22891,7 +23402,6 @@ ix86_init_machine_status (void) f = ggc_alloc_cleared_machine_function (); f->use_fast_prologue_epilogue_nregs = -1; - f->tls_descriptor_call_expanded_p = 0; f->call_abi = ix86_abi; return f; @@ -22910,9 +23420,6 @@ assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n) gcc_assert (n < MAX_386_STACK_LOCALS); - /* Virtual slot is valid only before vregs are instantiated. */ - gcc_assert ((n == SLOT_VIRTUAL) == !virtuals_instantiated); - for (s = ix86_stack_locals; s; s = s->next) if (s->mode == mode && s->n == n) return validize_mem (copy_rtx (s->rtl)); @@ -22926,13 +23433,23 @@ assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n) ix86_stack_locals = s; return validize_mem (s->rtl); } + +static void +ix86_instantiate_decls (void) +{ + struct stack_local_entry *s; + + for (s = ix86_stack_locals; s; s = s->next) + if (s->rtl != NULL_RTX) + instantiate_decl_rtl (s->rtl); +} /* Calculate the length of the memory address in the instruction encoding. 
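   For example, (%rax) adds no extra bytes, 16(%rax) adds one byte of
   disp8, and 16(%rax,%rbx,4) adds two: the disp8 plus the SIB byte.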
Includes addr32 prefix, does not include the one-byte modrm, opcode, - or other prefixes. */ + or other prefixes. We never generate addr32 prefix for LEA insn. */ int -memory_address_length (rtx addr) +memory_address_length (rtx addr, bool lea) { struct ix86_address parts; rtx base, index, disp; @@ -22948,18 +23465,26 @@ memory_address_length (rtx addr) ok = ix86_decompose_address (addr, &parts); gcc_assert (ok); - if (parts.base && GET_CODE (parts.base) == SUBREG) - parts.base = SUBREG_REG (parts.base); - if (parts.index && GET_CODE (parts.index) == SUBREG) - parts.index = SUBREG_REG (parts.index); + len = (parts.seg == SEG_DEFAULT) ? 0 : 1; + + /* If this is not LEA instruction, add the length of addr32 prefix. */ + if (TARGET_64BIT && !lea + && (SImode_address_operand (addr, VOIDmode) + || (parts.base && GET_MODE (parts.base) == SImode) + || (parts.index && GET_MODE (parts.index) == SImode))) + len++; base = parts.base; index = parts.index; disp = parts.disp; - /* Add length of addr32 prefix. */ - len = (GET_CODE (addr) == ZERO_EXTEND - || GET_CODE (addr) == AND); + if (base && GET_CODE (base) == SUBREG) + base = SUBREG_REG (base); + if (index && GET_CODE (index) == SUBREG) + index = SUBREG_REG (index); + + gcc_assert (base == NULL_RTX || REG_P (base)); + gcc_assert (index == NULL_RTX || REG_P (index)); /* Rule of thumb: - esp as the base always wants an index, @@ -22973,14 +23498,13 @@ memory_address_length (rtx addr) /* esp (for its index) and ebp (for its displacement) need the two-byte modrm form. Similarly for r12 and r13 in 64-bit code. */ - if (REG_P (addr) - && (addr == arg_pointer_rtx - || addr == frame_pointer_rtx - || REGNO (addr) == SP_REG - || REGNO (addr) == BP_REG - || REGNO (addr) == R12_REG - || REGNO (addr) == R13_REG)) - len = 1; + if (base == arg_pointer_rtx + || base == frame_pointer_rtx + || REGNO (base) == SP_REG + || REGNO (base) == BP_REG + || REGNO (base) == R12_REG + || REGNO (base) == R13_REG) + len++; } /* Direct Addressing. In 64-bit mode mod 00 r/m 5 @@ -22990,7 +23514,7 @@ memory_address_length (rtx addr) by UNSPEC. */ else if (disp && !base && !index) { - len = 4; + len += 4; if (TARGET_64BIT) { rtx symbol = disp; @@ -23008,43 +23532,30 @@ memory_address_length (rtx addr) || (XINT (symbol, 1) != UNSPEC_GOTPCREL && XINT (symbol, 1) != UNSPEC_PCREL && XINT (symbol, 1) != UNSPEC_GOTNTPOFF))) - len += 1; + len++; } } - else { /* Find the length of the displacement constant. */ if (disp) { if (base && satisfies_constraint_K (disp)) - len = 1; + len += 1; else - len = 4; + len += 4; } /* ebp always wants a displacement. Similarly r13. */ - else if (base && REG_P (base) - && (REGNO (base) == BP_REG || REGNO (base) == R13_REG)) - len = 1; + else if (base && (REGNO (base) == BP_REG || REGNO (base) == R13_REG)) + len++; /* An index requires the two-byte modrm form.... */ if (index /* ...like esp (or r12), which always wants an index. */ || base == arg_pointer_rtx || base == frame_pointer_rtx - || (base && REG_P (base) - && (REGNO (base) == SP_REG || REGNO (base) == R12_REG))) - len += 1; - } - - switch (parts.seg) - { - case SEG_FS: - case SEG_GS: - len += 1; - break; - default: - break; + || (base && (REGNO (base) == SP_REG || REGNO (base) == R12_REG))) + len++; } return len; @@ -23098,7 +23609,8 @@ ix86_attr_length_immediate_default (rtx insn, bool shortform) case MODE_SI: len = 4; break; - /* Immediates for DImode instructions are encoded as 32bit sign extended values. */ + /* Immediates for DImode instructions are encoded + as 32bit sign extended values. 
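+	 E.g. addq $0x7fffffff, %rax is encodable, while the constant
+	 0x80000000 would sign-extend to 0xffffffff80000000 and so must
+	 be loaded into a scratch register (movabs) first.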
*/ case MODE_DI: len = 4; break; @@ -23108,6 +23620,7 @@ ix86_attr_length_immediate_default (rtx insn, bool shortform) } return len; } + /* Compute default value for "length_address" attribute. */ int ix86_attr_length_address_default (rtx insn) @@ -23124,15 +23637,8 @@ ix86_attr_length_address_default (rtx insn) gcc_assert (GET_CODE (set) == SET); addr = SET_SRC (set); - if (TARGET_64BIT && get_attr_mode (insn) == MODE_SI) - { - if (GET_CODE (addr) == ZERO_EXTEND) - addr = XEXP (addr, 0); - if (GET_CODE (addr) == SUBREG) - addr = SUBREG_REG (addr); - } - return memory_address_length (addr); + return memory_address_length (addr, true); } extract_insn_cached (insn); @@ -23154,7 +23660,7 @@ ix86_attr_length_address_default (rtx insn) if (*constraints == 'X') continue; } - return memory_address_length (XEXP (recog_data.operand[i], 0)); + return memory_address_length (XEXP (recog_data.operand[i], 0), false); } return 0; } @@ -23470,6 +23976,7 @@ ia32_multipass_dfa_lookahead (void) case PROCESSOR_CORE2_64: case PROCESSOR_COREI7_32: case PROCESSOR_COREI7_64: + case PROCESSOR_ATOM: /* Generally, we want haifa-sched:max_issue() to look ahead as far as many instructions can be executed on a cycle, i.e., issue_rate. I wonder why tuning for many CPUs does not do this. */ @@ -23759,7 +24266,8 @@ ix86_constant_alignment (tree exp, int align) int ix86_data_alignment (tree type, int align) { - int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT); + int max_align + = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT); if (AGGREGATE_TYPE_P (type) && TYPE_SIZE (type) @@ -23988,7 +24496,7 @@ ix86_static_chain (const_tree fndecl, bool incoming_p) fntype = TREE_TYPE (fndecl); ccvt = ix86_get_callcvt (fntype); - if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0) + if ((ccvt & IX86_CALLCVT_FASTCALL) != 0) { /* Fastcall functions use ecx/edx for arguments, which leaves us with EAX for the static chain. @@ -23996,6 +24504,13 @@ ix86_static_chain (const_tree fndecl, bool incoming_p) leaves us with EAX for the static chain. */ regno = AX_REG; } + else if ((ccvt & IX86_CALLCVT_THISCALL) != 0) + { + /* Thiscall functions use ecx for arguments, which leaves + us with EAX and EDX for the static chain. + We are using for abi-compatibility EAX. */ + regno = AX_REG; + } else if (ix86_function_regparm (fntype, fndecl) == 3) { /* For regparm 3, we have no free call-clobbered registers in @@ -24556,6 +25071,7 @@ enum ix86_builtins IX86_BUILTIN_CVTTPS2DQ, IX86_BUILTIN_MOVNTI, + IX86_BUILTIN_MOVNTI64, IX86_BUILTIN_MOVNTPD, IX86_BUILTIN_MOVNTDQ, @@ -24768,22 +25284,32 @@ enum ix86_builtins IX86_BUILTIN_PMULDQ128, IX86_BUILTIN_PMULLD128, - IX86_BUILTIN_ROUNDPD, - IX86_BUILTIN_ROUNDPS, IX86_BUILTIN_ROUNDSD, IX86_BUILTIN_ROUNDSS, + IX86_BUILTIN_ROUNDPD, + IX86_BUILTIN_ROUNDPS, + IX86_BUILTIN_FLOORPD, IX86_BUILTIN_CEILPD, IX86_BUILTIN_TRUNCPD, IX86_BUILTIN_RINTPD, IX86_BUILTIN_ROUNDPD_AZ, + + IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, + IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, + IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, + IX86_BUILTIN_FLOORPS, IX86_BUILTIN_CEILPS, IX86_BUILTIN_TRUNCPS, IX86_BUILTIN_RINTPS, IX86_BUILTIN_ROUNDPS_AZ, + IX86_BUILTIN_FLOORPS_SFIX, + IX86_BUILTIN_CEILPS_SFIX, + IX86_BUILTIN_ROUNDPS_AZ_SFIX, + IX86_BUILTIN_PTESTZ, IX86_BUILTIN_PTESTC, IX86_BUILTIN_PTESTNZC, @@ -24807,6 +25333,7 @@ enum ix86_builtins IX86_BUILTIN_VEC_SET_V16QI, IX86_BUILTIN_VEC_PACK_SFIX, + IX86_BUILTIN_VEC_PACK_SFIX256, /* SSE4.2. 
*/ IX86_BUILTIN_CRC32QI, @@ -24956,12 +25483,21 @@ enum ix86_builtins IX86_BUILTIN_TRUNCPD256, IX86_BUILTIN_RINTPD256, IX86_BUILTIN_ROUNDPD_AZ256, + + IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, + IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, + IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, + IX86_BUILTIN_FLOORPS256, IX86_BUILTIN_CEILPS256, IX86_BUILTIN_TRUNCPS256, IX86_BUILTIN_RINTPS256, IX86_BUILTIN_ROUNDPS_AZ256, + IX86_BUILTIN_FLOORPS_SFIX256, + IX86_BUILTIN_CEILPS_SFIX256, + IX86_BUILTIN_ROUNDPS_AZ_SFIX256, + IX86_BUILTIN_UNPCKHPD256, IX86_BUILTIN_UNPCKLPD256, IX86_BUILTIN_UNPCKHPS256, @@ -25167,6 +25703,13 @@ enum ix86_builtins IX86_BUILTIN_GATHERDIV4SI, IX86_BUILTIN_GATHERDIV8SI, + /* Alternate 4 element gather for the vectorizer where + all operands are 32-byte wide. */ + IX86_BUILTIN_GATHERALTSIV4DF, + IX86_BUILTIN_GATHERALTDIV8SF, + IX86_BUILTIN_GATHERALTSIV4DI, + IX86_BUILTIN_GATHERALTDIV8SI, + /* TFmode support builtins. */ IX86_BUILTIN_INFQ, IX86_BUILTIN_HUGE_VALQ, @@ -25591,9 +26134,9 @@ static const struct builtin_description bdesc_special_args[] = { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID }, /* SSE */ - { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storeups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF }, { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF }, - { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT }, + { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT }, { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF }, { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF }, @@ -25602,18 +26145,19 @@ static const struct builtin_description bdesc_special_args[] = /* SSE or 3DNow!A */ { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID }, - { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntdi, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG }, + { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntq, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG }, /* SSE2 */ { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID }, { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID }, - { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF }, - { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storeupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_storedqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI }, { 
OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF }, { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI }, - { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntsi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT }, - { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE }, - { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntisi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT }, + { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_movntidi, "__builtin_ia32_movnti64", IX86_BUILTIN_MOVNTI64, UNKNOWN, (int) VOID_FTYPE_PLONGLONG_LONGLONG }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loaddqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR }, { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE }, { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE }, @@ -25638,12 +26182,12 @@ static const struct builtin_description bdesc_special_args[] = { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF }, { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF }, - { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE }, - { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT }, - { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF }, - { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF }, - { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR }, - { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loadups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_storeups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_loaddqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR }, + { OPTION_MASK_ISA_AVX, 
CODE_FOR_avx_storedqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI }, { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR }, { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI }, @@ -26159,14 +26703,22 @@ static const struct builtin_description bdesc_args[] = { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND }, { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND }, + { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_floorpd_vec_pack_sfix", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V2DF_V2DF_ROUND }, + { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd_vec_pack_sfix, "__builtin_ia32_ceilpd_vec_pack_sfix", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V2DF_V2DF_ROUND }, + { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2, "__builtin_ia32_roundpd_az", IX86_BUILTIN_ROUNDPD_AZ, UNKNOWN, (int) V2DF_FTYPE_V2DF }, + { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv2df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix", IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF }, { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND }, { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND }, { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND }, { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND }, + { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_floorps_sfix", IX86_BUILTIN_FLOORPS_SFIX, (enum rtx_code) ROUND_FLOOR, (int) V4SI_FTYPE_V4SF_ROUND }, + { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_ceilps_sfix", IX86_BUILTIN_CEILPS_SFIX, (enum rtx_code) ROUND_CEIL, (int) V4SI_FTYPE_V4SF_ROUND }, + { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF }, + { OPTION_MASK_ISA_ROUND, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF }, { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST }, { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST }, @@ -26286,13 +26838,21 @@ static const struct builtin_description bdesc_args[] = { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND }, { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2, "__builtin_ia32_roundpd_az256", IX86_BUILTIN_ROUNDPD_AZ256, UNKNOWN, (int) V4DF_FTYPE_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_roundv4df2_vec_pack_sfix, "__builtin_ia32_roundpd_az_vec_pack_sfix256", 
IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF }, + + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_floorpd_vec_pack_sfix256", IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V4DF_V4DF_ROUND }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd_vec_pack_sfix256, "__builtin_ia32_ceilpd_vec_pack_sfix256", IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V4DF_V4DF_ROUND }, { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND }, { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND }, { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND }, { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_floorps_sfix256", IX86_BUILTIN_FLOORPS_SFIX256, (enum rtx_code) ROUND_FLOOR, (int) V8SI_FTYPE_V8SF_ROUND }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps_sfix256, "__builtin_ia32_ceilps_sfix256", IX86_BUILTIN_CEILPS_SFIX256, (enum rtx_code) ROUND_CEIL, (int) V8SI_FTYPE_V8SF_ROUND }, + { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2, "__builtin_ia32_roundps_az256", IX86_BUILTIN_ROUNDPS_AZ256, UNKNOWN, (int) V8SF_FTYPE_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_roundv8sf2_sfix, "__builtin_ia32_roundps_az_sfix256", IX86_BUILTIN_ROUNDPS_AZ_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V8SF }, { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF }, { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF }, @@ -26328,6 +26888,8 @@ static const struct builtin_description bdesc_args[] = { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF }, { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_pack_sfix256 ", IX86_BUILTIN_VEC_PACK_SFIX256, UNKNOWN, (int) V8SI_FTYPE_V4DF_V4DF }, + /* AVX2 */ { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mpsadbw, "__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_INT }, { OPTION_MASK_ISA_AVX2, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI }, @@ -26460,7 +27022,7 @@ static const struct builtin_description bdesc_args[] = { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI }, { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI }, { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT }, - { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) 
V8SF_FTYPE_V8SF_V8SF }, + { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI }, { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT }, { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT }, { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT }, @@ -26646,8 +27208,8 @@ static const struct builtin_description bdesc_multi_arg[] = { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI }, { OPTION_MASK_ISA_XOP, CODE_FOR_xop_shlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI }, - { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF }, - { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_1_SF }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_1_DF }, { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF }, { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF }, { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 }, @@ -26757,6 +27319,159 @@ static const struct builtin_description bdesc_multi_arg[] = { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 }, }; + +/* TM vector builtins. */ + +/* Reuse the existing x86-specific `struct builtin_description' cause + we're lazy. Add casts to make them fit. 
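+   Each entry maps an _ITM_* vector entry point onto the matching TM
+   load/store builtin so that, e.g., a transactional 16-byte store can
+   become a single _ITM_WM128 (addr, val) call instead of four scalar
+   writes.  (Illustrative call only; the actual decls are created in
+   ix86_init_tm_builtins below.)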
*/ +static const struct builtin_description bdesc_tm[] = +{ + { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WM64", (enum ix86_builtins) BUILT_IN_TM_STORE_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI }, + { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaRM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI }, + { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_WaWM64", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M64, UNKNOWN, VOID_FTYPE_PV2SI_V2SI }, + { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_M64, UNKNOWN, V2SI_FTYPE_PCV2SI }, + { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaRM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M64, UNKNOWN, V2SI_FTYPE_PCV2SI }, + { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RaWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI }, + { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_RfWM64", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M64, UNKNOWN, V2SI_FTYPE_PCV2SI }, + + { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WM128", (enum ix86_builtins) BUILT_IN_TM_STORE_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaRM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_WaWM128", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M128, UNKNOWN, VOID_FTYPE_PV4SF_V4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_M128, UNKNOWN, V4SF_FTYPE_PCV4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaRM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M128, UNKNOWN, V4SF_FTYPE_PCV4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RaWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF }, + { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_RfWM128", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M128, UNKNOWN, V4SF_FTYPE_PCV4SF }, + + { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WM256", (enum ix86_builtins) BUILT_IN_TM_STORE_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaRM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAR_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_WaWM256", (enum ix86_builtins) BUILT_IN_TM_STORE_WAW_M256, UNKNOWN, VOID_FTYPE_PV8SF_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_M256, UNKNOWN, V8SF_FTYPE_PCV8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaRM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAR_M256, UNKNOWN, V8SF_FTYPE_PCV8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RaWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RAW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_RfWM256", (enum ix86_builtins) BUILT_IN_TM_LOAD_RFW_M256, UNKNOWN, V8SF_FTYPE_PCV8SF }, + + { OPTION_MASK_ISA_MMX, CODE_FOR_nothing, "__builtin__ITM_LM64", (enum ix86_builtins) BUILT_IN_TM_LOG_M64, UNKNOWN, VOID_FTYPE_PCVOID }, + { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin__ITM_LM128", (enum ix86_builtins) BUILT_IN_TM_LOG_M128, UNKNOWN, VOID_FTYPE_PCVOID }, + { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin__ITM_LM256", (enum ix86_builtins) BUILT_IN_TM_LOG_M256, UNKNOWN, 
VOID_FTYPE_PCVOID }, +}; + +/* TM callbacks. */ + +/* Return the builtin decl needed to load a vector of TYPE. */ + +static tree +ix86_builtin_tm_load (tree type) +{ + if (TREE_CODE (type) == VECTOR_TYPE) + { + switch (tree_low_cst (TYPE_SIZE (type), 1)) + { + case 64: + return builtin_decl_explicit (BUILT_IN_TM_LOAD_M64); + case 128: + return builtin_decl_explicit (BUILT_IN_TM_LOAD_M128); + case 256: + return builtin_decl_explicit (BUILT_IN_TM_LOAD_M256); + } + } + return NULL_TREE; +} + +/* Return the builtin decl needed to store a vector of TYPE. */ + +static tree +ix86_builtin_tm_store (tree type) +{ + if (TREE_CODE (type) == VECTOR_TYPE) + { + switch (tree_low_cst (TYPE_SIZE (type), 1)) + { + case 64: + return builtin_decl_explicit (BUILT_IN_TM_STORE_M64); + case 128: + return builtin_decl_explicit (BUILT_IN_TM_STORE_M128); + case 256: + return builtin_decl_explicit (BUILT_IN_TM_STORE_M256); + } + } + return NULL_TREE; +} + +/* Initialize the transactional memory vector load/store builtins. */ + +static void +ix86_init_tm_builtins (void) +{ + enum ix86_builtin_func_type ftype; + const struct builtin_description *d; + size_t i; + tree decl; + tree attrs_load, attrs_type_load, attrs_store, attrs_type_store; + tree attrs_log, attrs_type_log; + + if (!flag_tm) + return; + + /* If there are no builtins defined, we must be compiling in a + language without trans-mem support. */ + if (!builtin_decl_explicit_p (BUILT_IN_TM_LOAD_1)) + return; + + /* Use whatever attributes a normal TM load has. */ + decl = builtin_decl_explicit (BUILT_IN_TM_LOAD_1); + attrs_load = DECL_ATTRIBUTES (decl); + attrs_type_load = TYPE_ATTRIBUTES (TREE_TYPE (decl)); + /* Use whatever attributes a normal TM store has. */ + decl = builtin_decl_explicit (BUILT_IN_TM_STORE_1); + attrs_store = DECL_ATTRIBUTES (decl); + attrs_type_store = TYPE_ATTRIBUTES (TREE_TYPE (decl)); + /* Use whatever attributes a normal TM log has. */ + decl = builtin_decl_explicit (BUILT_IN_TM_LOG); + attrs_log = DECL_ATTRIBUTES (decl); + attrs_type_log = TYPE_ATTRIBUTES (TREE_TYPE (decl)); + + for (i = 0, d = bdesc_tm; + i < ARRAY_SIZE (bdesc_tm); + i++, d++) + { + if ((d->mask & ix86_isa_flags) != 0 + || (lang_hooks.builtin_function + == lang_hooks.builtin_function_ext_scope)) + { + tree type, attrs, attrs_type; + enum built_in_function code = (enum built_in_function) d->code; + + ftype = (enum ix86_builtin_func_type) d->flag; + type = ix86_get_builtin_func_type (ftype); + + if (BUILTIN_TM_LOAD_P (code)) + { + attrs = attrs_load; + attrs_type = attrs_type_load; + } + else if (BUILTIN_TM_STORE_P (code)) + { + attrs = attrs_store; + attrs_type = attrs_type_store; + } + else + { + attrs = attrs_log; + attrs_type = attrs_type_log; + } + decl = add_builtin_function (d->name, type, code, BUILT_IN_NORMAL, + /* The builtin without the prefix for + calling it directly. */ + d->name + strlen ("__builtin_"), + attrs); + /* add_builtin_function() will set the DECL_ATTRIBUTES, now + set the TYPE_ATTRIBUTES. 
*/ + decl_attributes (&TREE_TYPE (decl), attrs_type, ATTR_FLAG_BUILT_IN); + + set_builtin_decl (code, decl, false); + } + } +} /* Set up all the MMX/SSE builtins, even builtins for instructions that are not in the current target ISA to allow the user to compile particular modules @@ -26945,6 +27660,22 @@ ix86_init_mmx_sse_builtins (void) V4SI_FTYPE_V4SI_PCINT_V4DI_V4SI_INT, IX86_BUILTIN_GATHERDIV8SI); + def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4df ", + V4DF_FTYPE_V4DF_PCDOUBLE_V8SI_V4DF_INT, + IX86_BUILTIN_GATHERALTSIV4DF); + + def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4sf256 ", + V8SF_FTYPE_V8SF_PCFLOAT_V4DI_V8SF_INT, + IX86_BUILTIN_GATHERALTDIV8SF); + + def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltsiv4di ", + V4DI_FTYPE_V4DI_PCINT64_V8SI_V4DI_INT, + IX86_BUILTIN_GATHERALTSIV4DI); + + def_builtin (OPTION_MASK_ISA_AVX2, "__builtin_ia32_gatheraltdiv4si256 ", + V8SI_FTYPE_V8SI_PCINT_V4DI_V8SI_INT, + IX86_BUILTIN_GATHERALTDIV8SI); + /* MMX access to the vec_init patterns. */ def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si", V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI); @@ -27114,6 +27845,7 @@ ix86_init_builtins (void) TREE_READONLY (t) = 1; ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t; + ix86_init_tm_builtins (); ix86_init_mmx_sse_builtins (); if (TARGET_LP64) @@ -27566,7 +28298,7 @@ ix86_expand_sse_comi (const struct builtin_description *d, tree exp, return SUBREG_REG (target); } -/* Subroutine of ix86_expand_args_builtin to take care of round insns. */ +/* Subroutines of ix86_expand_args_builtin to take care of round insns. */ static rtx ix86_expand_sse_round (const struct builtin_description *d, tree exp, @@ -27599,6 +28331,44 @@ ix86_expand_sse_round (const struct builtin_description *d, tree exp, return target; } +static rtx +ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d, + tree exp, rtx target) +{ + rtx pat; + tree arg0 = CALL_EXPR_ARG (exp, 0); + tree arg1 = CALL_EXPR_ARG (exp, 1); + rtx op0 = expand_normal (arg0); + rtx op1 = expand_normal (arg1); + rtx op2; + enum machine_mode tmode = insn_data[d->icode].operand[0].mode; + enum machine_mode mode0 = insn_data[d->icode].operand[1].mode; + enum machine_mode mode1 = insn_data[d->icode].operand[2].mode; + + if (optimize || target == 0 + || GET_MODE (target) != tmode + || !insn_data[d->icode].operand[0].predicate (target, tmode)) + target = gen_reg_rtx (tmode); + + op0 = safe_vector_operand (op0, mode0); + op1 = safe_vector_operand (op1, mode1); + + if ((optimize && !register_operand (op0, mode0)) + || !insn_data[d->icode].operand[0].predicate (op0, mode0)) + op0 = copy_to_mode_reg (mode0, op0); + if ((optimize && !register_operand (op1, mode1)) + || !insn_data[d->icode].operand[1].predicate (op1, mode1)) + op1 = copy_to_mode_reg (mode1, op1); + + op2 = GEN_INT (d->comparison); + + pat = GEN_FCN (d->icode) (target, op0, op1, op2); + if (! pat) + return 0; + emit_insn (pat); + return target; +} + /* Subroutine of ix86_expand_builtin to take care of ptest insns. 
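   ptest sets ZF when the AND of its operands is all zero and CF when
   the AND-NOT is all zero; the comparison code recorded in the builtin
   table (EQ, LTU or GTU) selects which flag combination the emitted
   setcc materializes.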
*/ static rtx @@ -27872,7 +28642,12 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V4DF_FTYPE_V4DF_ROUND: case V4SF_FTYPE_V4SF_ROUND: case V8SF_FTYPE_V8SF_ROUND: + case V4SI_FTYPE_V4SF_ROUND: + case V8SI_FTYPE_V8SF_ROUND: return ix86_expand_sse_round (d, exp, target); + case V4SI_FTYPE_V2DF_V2DF_ROUND: + case V8SI_FTYPE_V4DF_V4DF_ROUND: + return ix86_expand_sse_round_vec_pack_sfix (d, exp, target); case INT_FTYPE_V8SF_V8SF_PTEST: case INT_FTYPE_V4DI_V4DI_PTEST: case INT_FTYPE_V4DF_V4DF_PTEST: @@ -28002,6 +28777,7 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V32QI_FTYPE_V32QI_V32QI: case V16HI_FTYPE_V32QI_V32QI: case V16HI_FTYPE_V16HI_V16HI: + case V8SI_FTYPE_V4DF_V4DF: case V8SI_FTYPE_V8SI_V8SI: case V8SI_FTYPE_V16HI_V16HI: case V4DI_FTYPE_V4DI_V4DI: @@ -28188,15 +28964,22 @@ ix86_expand_args_builtin (const struct builtin_description *d, error ("the last argument must be an 1-bit immediate"); return const0_rtx; - case CODE_FOR_sse4_1_roundpd: - case CODE_FOR_sse4_1_roundps: case CODE_FOR_sse4_1_roundsd: case CODE_FOR_sse4_1_roundss: + + case CODE_FOR_sse4_1_roundpd: + case CODE_FOR_sse4_1_roundps: + case CODE_FOR_avx_roundpd256: + case CODE_FOR_avx_roundps256: + + case CODE_FOR_sse4_1_roundpd_vec_pack_sfix: + case CODE_FOR_sse4_1_roundps_sfix: + case CODE_FOR_avx_roundpd_vec_pack_sfix256: + case CODE_FOR_avx_roundps_sfix256: + case CODE_FOR_sse4_1_blendps: case CODE_FOR_avx_blendpd256: case CODE_FOR_avx_vpermilv4df: - case CODE_FOR_avx_roundpd256: - case CODE_FOR_avx_roundps256: error ("the last argument must be a 4-bit immediate"); return const0_rtx; @@ -28363,6 +29146,7 @@ ix86_expand_special_args_builtin (const struct builtin_description *d, case VOID_FTYPE_PFLOAT_V4SF: case VOID_FTYPE_PDOUBLE_V4DF: case VOID_FTYPE_PDOUBLE_V2DF: + case VOID_FTYPE_PLONGLONG_LONGLONG: case VOID_FTYPE_PULONGLONG_ULONGLONG: case VOID_FTYPE_PINT_INT: nargs = 1; @@ -28436,8 +29220,8 @@ ix86_expand_special_args_builtin (const struct builtin_description *d, arg_adjust = 0; if (optimize || target == 0 - || GET_MODE (target) != tmode - || !insn_p->operand[0].predicate (target, tmode)) + || !register_operand (target, tmode) + || GET_MODE (target) != tmode) target = gen_reg_rtx (tmode); } @@ -28715,13 +29499,13 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, case IX86_BUILTIN_LDMXCSR: op0 = expand_normal (CALL_EXPR_ARG (exp, 0)); - target = assign_386_stack_local (SImode, SLOT_VIRTUAL); + target = assign_386_stack_local (SImode, SLOT_TEMP); emit_move_insn (target, op0); emit_insn (gen_sse_ldmxcsr (target)); return 0; case IX86_BUILTIN_STMXCSR: - target = assign_386_stack_local (SImode, SLOT_VIRTUAL); + target = assign_386_stack_local (SImode, SLOT_TEMP); emit_insn (gen_sse_stmxcsr (target)); return copy_to_mode_reg (SImode, target); @@ -28931,7 +29715,7 @@ rdrand_step: icode = CODE_FOR_avx2_gatherdiv4sf; goto gather_gen; case IX86_BUILTIN_GATHERDIV8SF: - icode = CODE_FOR_avx2_gatherdiv4sf256; + icode = CODE_FOR_avx2_gatherdiv8sf; goto gather_gen; case IX86_BUILTIN_GATHERSIV2DI: icode = CODE_FOR_avx2_gathersiv2di; @@ -28955,7 +29739,20 @@ rdrand_step: icode = CODE_FOR_avx2_gatherdiv4si; goto gather_gen; case IX86_BUILTIN_GATHERDIV8SI: - icode = CODE_FOR_avx2_gatherdiv4si256; + icode = CODE_FOR_avx2_gatherdiv8si; + goto gather_gen; + case IX86_BUILTIN_GATHERALTSIV4DF: + icode = CODE_FOR_avx2_gathersiv4df; + goto gather_gen; + case IX86_BUILTIN_GATHERALTDIV8SF: + icode = CODE_FOR_avx2_gatherdiv8sf; + goto gather_gen; + case 
IX86_BUILTIN_GATHERALTSIV4DI: + icode = CODE_FOR_avx2_gathersiv4di; + goto gather_gen; + case IX86_BUILTIN_GATHERALTDIV8SI: + icode = CODE_FOR_avx2_gatherdiv8si; + goto gather_gen; gather_gen: arg0 = CALL_EXPR_ARG (exp, 0); @@ -28974,8 +29771,41 @@ rdrand_step: mode3 = insn_data[icode].operand[4].mode; mode4 = insn_data[icode].operand[5].mode; - if (target == NULL_RTX) - target = gen_reg_rtx (insn_data[icode].operand[0].mode); + if (target == NULL_RTX + || GET_MODE (target) != insn_data[icode].operand[0].mode + || !insn_data[icode].operand[0].predicate (target, + GET_MODE (target))) + subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode); + else + subtarget = target; + + if (fcode == IX86_BUILTIN_GATHERALTSIV4DF + || fcode == IX86_BUILTIN_GATHERALTSIV4DI) + { + rtx half = gen_reg_rtx (V4SImode); + if (!nonimmediate_operand (op2, V8SImode)) + op2 = copy_to_mode_reg (V8SImode, op2); + emit_insn (gen_vec_extract_lo_v8si (half, op2)); + op2 = half; + } + else if (fcode == IX86_BUILTIN_GATHERALTDIV8SF + || fcode == IX86_BUILTIN_GATHERALTDIV8SI) + { + rtx (*gen) (rtx, rtx); + rtx half = gen_reg_rtx (mode0); + if (mode0 == V4SFmode) + gen = gen_vec_extract_lo_v8sf; + else + gen = gen_vec_extract_lo_v8si; + if (!nonimmediate_operand (op0, GET_MODE (op0))) + op0 = copy_to_mode_reg (GET_MODE (op0), op0); + emit_insn (gen (half, op0)); + op0 = half; + if (!nonimmediate_operand (op3, GET_MODE (op3))) + op3 = copy_to_mode_reg (GET_MODE (op3), op3); + emit_insn (gen (half, op3)); + op3 = half; + } /* Force memory operand only with base register here. But we don't want to do it on memory operand for other builtin @@ -28997,10 +29827,91 @@ rdrand_step: error ("last argument must be scale 1, 2, 4, 8"); return const0_rtx; } - pat = GEN_FCN (icode) (target, op0, op1, op2, op3, op4); + + /* Optimize. If mask is known to have all high bits set, + replace op0 with pc_rtx to signal that the instruction + overwrites the whole destination and doesn't use its + previous contents. */ + if (optimize) + { + if (TREE_CODE (arg3) == VECTOR_CST) + { + tree elt; + unsigned int negative = 0; + for (elt = TREE_VECTOR_CST_ELTS (arg3); + elt; elt = TREE_CHAIN (elt)) + { + tree cst = TREE_VALUE (elt); + if (TREE_CODE (cst) == INTEGER_CST + && tree_int_cst_sign_bit (cst)) + negative++; + else if (TREE_CODE (cst) == REAL_CST + && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst))) + negative++; + } + if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3))) + op0 = pc_rtx; + } + else if (TREE_CODE (arg3) == SSA_NAME) + { + /* Recognize also when mask is like: + __v2df src = _mm_setzero_pd (); + __v2df mask = _mm_cmpeq_pd (src, src); + or + __v8sf src = _mm256_setzero_ps (); + __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ); + as that is a cheaper way to load all ones into + a register than having to load a constant from + memory. 
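+	     (Because the source vector was just zeroed, the comparison
+	     is true in every lane -- no NaN is involved -- and assembles
+	     to a single cmpeqpd/cmpeqps of a register with itself.)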
*/ + gimple def_stmt = SSA_NAME_DEF_STMT (arg3); + if (is_gimple_call (def_stmt)) + { + tree fndecl = gimple_call_fndecl (def_stmt); + if (fndecl + && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD) + switch ((unsigned int) DECL_FUNCTION_CODE (fndecl)) + { + case IX86_BUILTIN_CMPPD: + case IX86_BUILTIN_CMPPS: + case IX86_BUILTIN_CMPPD256: + case IX86_BUILTIN_CMPPS256: + if (!integer_zerop (gimple_call_arg (def_stmt, 2))) + break; + /* FALLTHRU */ + case IX86_BUILTIN_CMPEQPD: + case IX86_BUILTIN_CMPEQPS: + if (initializer_zerop (gimple_call_arg (def_stmt, 0)) + && initializer_zerop (gimple_call_arg (def_stmt, + 1))) + op0 = pc_rtx; + break; + default: + break; + } + } + } + } + + pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4); if (! pat) return const0_rtx; emit_insn (pat); + + if (fcode == IX86_BUILTIN_GATHERDIV8SF + || fcode == IX86_BUILTIN_GATHERDIV8SI) + { + enum machine_mode tmode = GET_MODE (subtarget) == V8SFmode + ? V4SFmode : V4SImode; + if (target == NULL_RTX) + target = gen_reg_rtx (tmode); + if (tmode == V4SFmode) + emit_insn (gen_vec_extract_lo_v8sf (target, subtarget)); + else + emit_insn (gen_vec_extract_lo_v8si (target, subtarget)); + } + else + target = subtarget; + return target; default: @@ -29097,13 +30008,85 @@ ix86_builtin_vectorized_function (tree fndecl, tree type_out, } break; + case BUILT_IN_IFLOOR: + case BUILT_IN_LFLOOR: + case BUILT_IN_LLFLOOR: + /* The round insn does not trap on denormals. */ + if (flag_trapping_math || !TARGET_ROUND) + break; + + if (out_mode == SImode && in_mode == DFmode) + { + if (out_n == 4 && in_n == 2) + return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX]; + else if (out_n == 8 && in_n == 4) + return ix86_builtins[IX86_BUILTIN_FLOORPD_VEC_PACK_SFIX256]; + } + break; + + case BUILT_IN_IFLOORF: + case BUILT_IN_LFLOORF: + case BUILT_IN_LLFLOORF: + /* The round insn does not trap on denormals. */ + if (flag_trapping_math || !TARGET_ROUND) + break; + + if (out_mode == SImode && in_mode == SFmode) + { + if (out_n == 4 && in_n == 4) + return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX]; + else if (out_n == 8 && in_n == 8) + return ix86_builtins[IX86_BUILTIN_FLOORPS_SFIX256]; + } + break; + + case BUILT_IN_ICEIL: + case BUILT_IN_LCEIL: + case BUILT_IN_LLCEIL: + /* The round insn does not trap on denormals. */ + if (flag_trapping_math || !TARGET_ROUND) + break; + + if (out_mode == SImode && in_mode == DFmode) + { + if (out_n == 4 && in_n == 2) + return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX]; + else if (out_n == 8 && in_n == 4) + return ix86_builtins[IX86_BUILTIN_CEILPD_VEC_PACK_SFIX256]; + } + break; + + case BUILT_IN_ICEILF: + case BUILT_IN_LCEILF: + case BUILT_IN_LLCEILF: + /* The round insn does not trap on denormals. 
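+	 A loop like
+
+	   for (i = 0; i < n; i++)
+	     out[i] = (int) ceilf (in[i]);
+
+	 can then be vectorized as one roundps + cvtps2dq pair through
+	 the _sfix builtins instead of a scalar libcall per element.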
*/ + if (flag_trapping_math || !TARGET_ROUND) + break; + + if (out_mode == SImode && in_mode == SFmode) + { + if (out_n == 4 && in_n == 4) + return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX]; + else if (out_n == 8 && in_n == 8) + return ix86_builtins[IX86_BUILTIN_CEILPS_SFIX256]; + } + break; + + case BUILT_IN_IRINT: case BUILT_IN_LRINT: - if (out_mode == SImode && out_n == 4 - && in_mode == DFmode && in_n == 2) - return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX]; + case BUILT_IN_LLRINT: + if (out_mode == SImode && in_mode == DFmode) + { + if (out_n == 4 && in_n == 2) + return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX]; + else if (out_n == 8 && in_n == 4) + return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX256]; + } break; + case BUILT_IN_IRINTF: case BUILT_IN_LRINTF: + case BUILT_IN_LLRINTF: if (out_mode == SImode && in_mode == SFmode) { if (out_n == 4 && in_n == 4) @@ -29113,6 +30096,38 @@ ix86_builtin_vectorized_function (tree fndecl, tree type_out, } break; + case BUILT_IN_IROUND: + case BUILT_IN_LROUND: + case BUILT_IN_LLROUND: + /* The round insn does not trap on denormals. */ + if (flag_trapping_math || !TARGET_ROUND) + break; + + if (out_mode == SImode && in_mode == DFmode) + { + if (out_n == 4 && in_n == 2) + return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX]; + else if (out_n == 8 && in_n == 4) + return ix86_builtins[IX86_BUILTIN_ROUNDPD_AZ_VEC_PACK_SFIX256]; + } + break; + + case BUILT_IN_IROUNDF: + case BUILT_IN_LROUNDF: + case BUILT_IN_LLROUNDF: + /* The round insn does not trap on denormals. */ + if (flag_trapping_math || !TARGET_ROUND) + break; + + if (out_mode == SImode && in_mode == SFmode) + { + if (out_n == 4 && in_n == 4) + return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX]; + else if (out_n == 8 && in_n == 8) + return ix86_builtins[IX86_BUILTIN_ROUNDPS_AZ_SFIX256]; + } + break; + case BUILT_IN_COPYSIGN: if (out_mode == DFmode && in_mode == DFmode) { @@ -29505,6 +30520,73 @@ ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in) return new_fndecl; } +/* Returns a decl of a function that implements gather load with + memory type MEM_VECTYPE and index type INDEX_VECTYPE and SCALE. + Return NULL_TREE if it is not available. */ + +static tree +ix86_vectorize_builtin_gather (const_tree mem_vectype, + const_tree index_type, int scale) +{ + bool si; + enum ix86_builtins code; + + if (! TARGET_AVX2) + return NULL_TREE; + + if ((TREE_CODE (index_type) != INTEGER_TYPE + && !POINTER_TYPE_P (index_type)) + || (TYPE_MODE (index_type) != SImode + && TYPE_MODE (index_type) != DImode)) + return NULL_TREE; + + if (TYPE_PRECISION (index_type) > POINTER_SIZE) + return NULL_TREE; + + /* v*gather* insn sign extends index to pointer mode. */ + if (TYPE_PRECISION (index_type) < POINTER_SIZE + && TYPE_UNSIGNED (index_type)) + return NULL_TREE; + + if (scale <= 0 + || scale > 8 + || (scale & (scale - 1)) != 0) + return NULL_TREE; + + si = TYPE_MODE (index_type) == SImode; + switch (TYPE_MODE (mem_vectype)) + { + case V2DFmode: + code = si ? IX86_BUILTIN_GATHERSIV2DF : IX86_BUILTIN_GATHERDIV2DF; + break; + case V4DFmode: + code = si ? IX86_BUILTIN_GATHERALTSIV4DF : IX86_BUILTIN_GATHERDIV4DF; + break; + case V2DImode: + code = si ? IX86_BUILTIN_GATHERSIV2DI : IX86_BUILTIN_GATHERDIV2DI; + break; + case V4DImode: + code = si ? IX86_BUILTIN_GATHERALTSIV4DI : IX86_BUILTIN_GATHERDIV4DI; + break; + case V4SFmode: + code = si ? IX86_BUILTIN_GATHERSIV4SF : IX86_BUILTIN_GATHERDIV4SF; + break; + case V8SFmode: + code = si ? 
IX86_BUILTIN_GATHERSIV8SF : IX86_BUILTIN_GATHERALTDIV8SF; + break; + case V4SImode: + code = si ? IX86_BUILTIN_GATHERSIV4SI : IX86_BUILTIN_GATHERDIV4SI; + break; + case V8SImode: + code = si ? IX86_BUILTIN_GATHERSIV8SI : IX86_BUILTIN_GATHERALTDIV8SI; + break; + default: + return NULL_TREE; + } + + return ix86_builtins[code]; +} + /* Returns a code for a target-specific builtin that implements reciprocal of the function, or NULL_TREE if not available. */ @@ -29671,7 +30753,6 @@ avx_vperm2f128_parallel (rtx par, enum machine_mode mode) return mask + 1; } - /* Store OPERAND to the memory after reload is completed. This means that we can't easily use assign_stack_local. */ rtx @@ -30552,6 +31633,13 @@ ix86_rtx_costs (rtx x, int code, int outer_code_i, int opno, int *total, { if (CONST_INT_P (XEXP (x, 1))) *total = cost->shift_const; + else if (GET_CODE (XEXP (x, 1)) == SUBREG + && GET_CODE (XEXP (XEXP (x, 1), 0)) == AND) + { + /* Return the cost after shift-and truncation. */ + *total = cost->shift_var; + return true; + } else *total = cost->shift_var; } @@ -31078,8 +32166,7 @@ ix86_handle_struct_attribute (tree *node, tree name, else type = node; - if (!(type && (TREE_CODE (*type) == RECORD_TYPE - || TREE_CODE (*type) == UNION_TYPE))) + if (!(type && RECORD_OR_UNION_TYPE_P (*type))) { warning (OPT_Wattributes, "%qE attribute ignored", name); @@ -31215,6 +32302,20 @@ x86_output_mi_thunk (FILE *file, { rtx this_param = x86_this_parameter (function); rtx this_reg, tmp, fnaddr; + unsigned int tmp_regno; + + if (TARGET_64BIT) + tmp_regno = R10_REG; + else + { + unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function)); + if ((ccvt & IX86_CALLCVT_FASTCALL) != 0) + tmp_regno = AX_REG; + else if ((ccvt & IX86_CALLCVT_THISCALL) != 0) + tmp_regno = DX_REG; + else + tmp_regno = CX_REG; + } emit_note (NOTE_INSN_PROLOGUE_END); @@ -31241,7 +32342,7 @@ x86_output_mi_thunk (FILE *file, { if (!x86_64_general_operand (delta_rtx, Pmode)) { - tmp = gen_rtx_REG (Pmode, R10_REG); + tmp = gen_rtx_REG (Pmode, tmp_regno); emit_move_insn (tmp, delta_rtx); delta_rtx = tmp; } @@ -31254,18 +32355,7 @@ x86_output_mi_thunk (FILE *file, if (vcall_offset) { rtx vcall_addr, vcall_mem, this_mem; - unsigned int tmp_regno; - if (TARGET_64BIT) - tmp_regno = R10_REG; - else - { - unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function)); - if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0) - tmp_regno = AX_REG; - else - tmp_regno = CX_REG; - } tmp = gen_rtx_REG (Pmode, tmp_regno); this_mem = gen_rtx_MEM (ptr_mode, this_reg); @@ -31340,6 +32430,19 @@ x86_output_mi_thunk (FILE *file, emit_jump_insn (gen_indirect_jump (fnaddr)); else { + if (ix86_cmodel == CM_LARGE_PIC && SYMBOLIC_CONST (fnaddr)) + fnaddr = legitimize_pic_address (fnaddr, + gen_rtx_REG (Pmode, tmp_regno)); + + if (!sibcall_insn_operand (fnaddr, Pmode)) + { + tmp = gen_rtx_REG (Pmode, tmp_regno); + if (GET_MODE (fnaddr) != Pmode) + fnaddr = gen_rtx_ZERO_EXTEND (Pmode, fnaddr); + emit_move_insn (tmp, fnaddr); + fnaddr = tmp; + } + tmp = gen_rtx_MEM (QImode, fnaddr); tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx); tmp = emit_call_insn (tmp); @@ -32801,9 +33904,9 @@ ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt) tmp = gen_reg_rtx (GET_MODE_INNER (mode)); ix86_expand_vector_extract (true, tmp, target, 1 - elt); if (elt == 0) - tmp = gen_rtx_VEC_CONCAT (mode, tmp, val); - else tmp = gen_rtx_VEC_CONCAT (mode, val, tmp); + else + tmp = gen_rtx_VEC_CONCAT (mode, tmp, val); emit_insn (gen_rtx_SET (VOIDmode, target, tmp)); return; } @@ 
-32817,9 +33920,9 @@ ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt) tmp = gen_reg_rtx (GET_MODE_INNER (mode)); ix86_expand_vector_extract (false, tmp, target, 1 - elt); if (elt == 0) - tmp = gen_rtx_VEC_CONCAT (mode, tmp, val); - else tmp = gen_rtx_VEC_CONCAT (mode, val, tmp); + else + tmp = gen_rtx_VEC_CONCAT (mode, tmp, val); emit_insn (gen_rtx_SET (VOIDmode, target, tmp)); return; @@ -34534,6 +35637,11 @@ static const struct attribute_spec ix86_attribute_table[] = for FP arguments. */ { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute, true }, + /* The transactional memory builtins are implicitly regparm or fastcall + depending on the ABI. Override the generic do-nothing attribute that + these builtins were declared with. */ + { "*tm regparm", 0, 0, false, true, true, ix86_handle_tm_regparm_attribute, + true }, /* force_align_arg_pointer says this function realigns the stack at entry. */ { (const char *)&ix86_force_align_arg_pointer_string, 0, 0, false, true, true, ix86_handle_cconv_attribute, false }, @@ -34604,47 +35712,14 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, return ix86_cost->cond_not_taken_branch_cost; case vec_perm: - return 1; + case vec_promote_demote: + return ix86_cost->vec_stmt_cost; default: gcc_unreachable (); } } - -/* Return a vector mode with twice as many elements as VMODE. */ -/* ??? Consider moving this to a table generated by genmodes.c. */ - -static enum machine_mode -doublesize_vector_mode (enum machine_mode vmode) -{ - switch (vmode) - { - case V2SFmode: return V4SFmode; - case V1DImode: return V2DImode; - case V2SImode: return V4SImode; - case V4HImode: return V8HImode; - case V8QImode: return V16QImode; - - case V2DFmode: return V4DFmode; - case V4SFmode: return V8SFmode; - case V2DImode: return V4DImode; - case V4SImode: return V8SImode; - case V8HImode: return V16HImode; - case V16QImode: return V32QImode; - - case V4DFmode: return V8DFmode; - case V8SFmode: return V16SFmode; - case V4DImode: return V8DImode; - case V8SImode: return V16SImode; - case V16HImode: return V32HImode; - case V32QImode: return V64QImode; - - default: - gcc_unreachable (); - } -} - /* Construct (set target (vec_select op0 (parallel perm))) and return true if that's a valid instruction in the active ISA. */ @@ -34679,7 +35754,7 @@ expand_vselect_vconcat (rtx target, rtx op0, rtx op1, enum machine_mode v2mode; rtx x; - v2mode = doublesize_vector_mode (GET_MODE (op0)); + v2mode = GET_MODE_2XWIDER_MODE (GET_MODE (op0)); x = gen_rtx_VEC_CONCAT (v2mode, op0, op1); return expand_vselect (target, x, perm, nelt); } @@ -35071,7 +36146,7 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) else if (vmode == V32QImode) emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm)); else - emit_insn (gen_avx2_permvarv8si (target, vperm, op0)); + emit_insn (gen_avx2_permvarv8si (target, op0, vperm)); } else { @@ -35323,6 +36398,8 @@ expand_vec_perm_palignr (struct expand_vec_perm_d *d) return ok; } +static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d); + /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify a two vector permutation into a single vector permutation by using an interleave operation to merge the vectors. */ @@ -35349,6 +36426,17 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d) /* For 32-byte modes allow even d->op0 == d->op1. The lack of cross-lane shuffling in some instructions might prevent a single insn shuffle. 
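     (vpshufb under AVX2, for instance, moves bytes only within each
     128-bit lane, so a V32QI permutation that crosses the lane boundary
     cannot be one such insn even with identical operands.)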
*/ + dfinal = *d; + dfinal.testing_p = true; + /* If expand_vec_perm_interleave3 can expand this into + a 3 insn sequence, give up and let it be expanded as + 3 insn sequence. While that is one insn longer, + it doesn't need a memory operand and in the common + case that both interleave low and high permutations + with the same operands are adjacent needs 4 insns + for both after CSE. */ + if (expand_vec_perm_interleave3 (&dfinal)) + return false; } else return false; @@ -35384,6 +36472,8 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d) dremap.perm[i * 2] = i; dremap.perm[i * 2 + 1] = i + nelt; } + if (!TARGET_SSE2 && d->vmode == V4SImode) + dremap.vmode = V4SFmode; } else if ((contents & (h2 | h4)) == contents) { @@ -35395,6 +36485,8 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d) dremap.perm[i * 2] = i + nelt2; dremap.perm[i * 2 + 1] = i + nelt + nelt2; } + if (!TARGET_SSE2 && d->vmode == V4SImode) + dremap.vmode = V4SFmode; } else if ((contents & (h1 | h4)) == contents) { @@ -36184,18 +37276,23 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d) stopping once we have promoted to V4SImode and then use pshufd. */ do { - optab otab = vec_interleave_low_optab; + rtx dest; + rtx (*gen) (rtx, rtx, rtx) + = vmode == V16QImode ? gen_vec_interleave_lowv16qi + : gen_vec_interleave_lowv8hi; if (elt >= nelt2) { - otab = vec_interleave_high_optab; + gen = vmode == V16QImode ? gen_vec_interleave_highv16qi + : gen_vec_interleave_highv8hi; elt -= nelt2; } nelt2 /= 2; - op0 = expand_binop (vmode, otab, op0, op0, NULL, 0, OPTAB_DIRECT); + dest = gen_reg_rtx (vmode); + emit_insn (gen (dest, op0, op0)); vmode = get_mode_wider_vector (vmode); - op0 = gen_lowpart (vmode, op0); + op0 = gen_lowpart (vmode, dest); } while (vmode != V4SImode); @@ -37639,13 +38736,13 @@ ix86_preferred_simd_mode (enum machine_mode mode) switch (mode) { case QImode: - return TARGET_AVX2 ? V32QImode : V16QImode; + return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V32QImode : V16QImode; case HImode: - return TARGET_AVX2 ? V16HImode : V8HImode; + return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V16HImode : V8HImode; case SImode: - return TARGET_AVX2 ? V8SImode : V4SImode; + return (TARGET_AVX && !TARGET_PREFER_AVX128) ? V8SImode : V4SImode; case DImode: - return TARGET_AVX2 ? V4DImode : V2DImode; + return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 
V4DImode : V2DImode; case SFmode: if (TARGET_AVX && !TARGET_PREFER_AVX128) @@ -37704,6 +38801,15 @@ ix86_autovectorize_vector_sizes (void) #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \ ix86_builtin_vectorized_function +#undef TARGET_VECTORIZE_BUILTIN_TM_LOAD +#define TARGET_VECTORIZE_BUILTIN_TM_LOAD ix86_builtin_tm_load + +#undef TARGET_VECTORIZE_BUILTIN_TM_STORE +#define TARGET_VECTORIZE_BUILTIN_TM_STORE ix86_builtin_tm_store + +#undef TARGET_VECTORIZE_BUILTIN_GATHER +#define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather + #undef TARGET_BUILTIN_RECIPROCAL #define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal @@ -37895,7 +39001,7 @@ ix86_autovectorize_vector_sizes (void) #undef TARGET_MANGLE_TYPE #define TARGET_MANGLE_TYPE ix86_mangle_type -#ifndef TARGET_MACHO +#if !TARGET_MACHO #undef TARGET_STACK_PROTECT_FAIL #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail #endif @@ -37909,6 +39015,9 @@ ix86_autovectorize_vector_sizes (void) #undef TARGET_PROMOTE_FUNCTION_MODE #define TARGET_PROMOTE_FUNCTION_MODE ix86_promote_function_mode +#undef TARGET_INSTANTIATE_DECLS +#define TARGET_INSTANTIATE_DECLS ix86_instantiate_decls + #undef TARGET_SECONDARY_RELOAD #define TARGET_SECONDARY_RELOAD ix86_secondary_reload