X-Git-Url: http://git.sourceforge.jp/view?a=blobdiff_plain;f=gcc%2Fconfig%2Fi386%2Fi386.c;h=a54651da4e71884a7fb0dc961d5db07e442046ac;hb=fa1a750285740ce8a2a7f9e1278ed71e68105331;hp=84b652f2f8129f6fb55257004c049b9ffe4d4934;hpb=080980d0cb9c109b211890d914f3d74248cd2cd2;p=pf3gnuchains%2Fgcc-fork.git diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 84b652f2f81..a54651da4e7 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -53,6 +53,8 @@ along with GCC; see the file COPYING3. If not see #include "tm-constrs.h" #include "params.h" #include "cselib.h" +#include "debug.h" +#include "dwarf2out.h" static rtx legitimize_dllimport_symbol (rtx, bool); @@ -817,6 +819,93 @@ struct processor_costs amdfam10_cost = { 1, /* cond_not_taken_branch_cost. */ }; +struct processor_costs bdver1_cost = { + COSTS_N_INSNS (1), /* cost of an add instruction */ + COSTS_N_INSNS (2), /* cost of a lea instruction */ + COSTS_N_INSNS (1), /* variable shift costs */ + COSTS_N_INSNS (1), /* constant shift costs */ + {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ + COSTS_N_INSNS (4), /* HI */ + COSTS_N_INSNS (3), /* SI */ + COSTS_N_INSNS (4), /* DI */ + COSTS_N_INSNS (5)}, /* other */ + 0, /* cost of multiply per each bit set */ + {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */ + COSTS_N_INSNS (35), /* HI */ + COSTS_N_INSNS (51), /* SI */ + COSTS_N_INSNS (83), /* DI */ + COSTS_N_INSNS (83)}, /* other */ + COSTS_N_INSNS (1), /* cost of movsx */ + COSTS_N_INSNS (1), /* cost of movzx */ + 8, /* "large" insn */ + 9, /* MOVE_RATIO */ + 4, /* cost for loading QImode using movzbl */ + {3, 4, 3}, /* cost of loading integer registers + in QImode, HImode and SImode. + Relative to reg-reg move (2). */ + {3, 4, 3}, /* cost of storing integer registers */ + 4, /* cost of reg,reg fld/fst */ + {4, 4, 12}, /* cost of loading fp registers + in SFmode, DFmode and XFmode */ + {6, 6, 8}, /* cost of storing fp registers + in SFmode, DFmode and XFmode */ + 2, /* cost of moving MMX register */ + {3, 3}, /* cost of loading MMX registers + in SImode and DImode */ + {4, 4}, /* cost of storing MMX registers + in SImode and DImode */ + 2, /* cost of moving SSE register */ + {4, 4, 3}, /* cost of loading SSE registers + in SImode, DImode and TImode */ + {4, 4, 5}, /* cost of storing SSE registers + in SImode, DImode and TImode */ + 3, /* MMX or SSE register to integer */ + /* On K8 + MOVD reg64, xmmreg Double FSTORE 4 + MOVD reg32, xmmreg Double FSTORE 4 + On AMDFAM10 + MOVD reg64, xmmreg Double FADD 3 + 1/1 1/1 + MOVD reg32, xmmreg Double FADD 3 + 1/1 1/1 */ + 64, /* size of l1 cache. */ + 1024, /* size of l2 cache. */ + 64, /* size of prefetch block */ + /* New AMD processors never drop prefetches; if they cannot be performed + immediately, they are queued. We set number of simultaneous prefetches + to a large constant to reflect this (it probably is not a good idea not + to limit number of prefetches at all, as their execution also takes some + time). */ + 100, /* number of parallel prefetches */ + 2, /* Branch cost */ + COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */ + COSTS_N_INSNS (4), /* cost of FMUL instruction. */ + COSTS_N_INSNS (19), /* cost of FDIV instruction. */ + COSTS_N_INSNS (2), /* cost of FABS instruction. */ + COSTS_N_INSNS (2), /* cost of FCHS instruction. */ + COSTS_N_INSNS (35), /* cost of FSQRT instruction. */ + + /* BDVER1 has optimized REP instruction for medium sized blocks, but for + very small blocks it is better to use loop. 
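+     In the memcpy/memset tables below the leading algorithm is used when
+     the block size is unknown at compile time; each following {max, alg}
+     pair handles known sizes up to max bytes, with -1 covering the rest.
+     The first initializer is for 32-bit, the second for 64-bit code.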
For large blocks, libcall can + do nontemporary accesses and beat inline considerably. */ + {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}}, + {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}, + {{libcall, {{8, loop}, {24, unrolled_loop}, + {2048, rep_prefix_4_byte}, {-1, libcall}}}, + {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}, + 4, /* scalar_stmt_cost. */ + 2, /* scalar load_cost. */ + 2, /* scalar_store_cost. */ + 6, /* vec_stmt_cost. */ + 0, /* vec_to_scalar_cost. */ + 2, /* scalar_to_vec_cost. */ + 2, /* vec_align_load_cost. */ + 2, /* vec_unalign_load_cost. */ + 2, /* vec_store_cost. */ + 2, /* cond_taken_branch_cost. */ + 1, /* cond_not_taken_branch_cost. */ +}; + static const struct processor_costs pentium4_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ @@ -1274,7 +1363,8 @@ const struct processor_costs *ix86_cost = &pentium_cost; #define m_ATHLON (1< 1 && TARGET_64BIT) + flag_zee = 1; + if (TARGET_MACHO) /* The Darwin libraries never set errno, so we might as well avoid calling them when that's the only reason we would. */ @@ -4391,8 +4501,8 @@ ix86_function_ok_for_sibcall (tree decl, tree exp) return true; } -/* Handle "cdecl", "stdcall", "fastcall", "regparm" and "sseregparm" - calling convention attributes; +/* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall", + and "sseregparm" calling convention attributes; arguments as in struct attribute_spec.handler. */ static tree @@ -4422,6 +4532,11 @@ ix86_handle_cconv_attribute (tree *node, tree name, error ("fastcall and regparm attributes are not compatible"); } + if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) + { + error ("regparam and thiscall attributes are not compatible"); + } + cst = TREE_VALUE (args); if (TREE_CODE (cst) != INTEGER_CST) { @@ -4443,7 +4558,8 @@ ix86_handle_cconv_attribute (tree *node, tree name, if (TARGET_64BIT) { /* Do not warn when emulating the MS ABI. */ - if (TREE_CODE (*node) != FUNCTION_TYPE + if ((TREE_CODE (*node) != FUNCTION_TYPE + && TREE_CODE (*node) != METHOD_TYPE) || ix86_function_type_abi (*node) != MS_ABI) warning (OPT_Wattributes, "%qE attribute ignored", name); @@ -4466,6 +4582,10 @@ ix86_handle_cconv_attribute (tree *node, tree name, { error ("fastcall and regparm attributes are not compatible"); } + if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) + { + error ("fastcall and thiscall attributes are not compatible"); + } } /* Can combine stdcall with fastcall (redundant), regparm and @@ -4480,6 +4600,10 @@ ix86_handle_cconv_attribute (tree *node, tree name, { error ("stdcall and fastcall attributes are not compatible"); } + if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) + { + error ("stdcall and thiscall attributes are not compatible"); + } } /* Can combine cdecl with regparm and sseregparm. 
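     (regparm and sseregparm only choose which registers carry arguments;
     they leave stack popping to the caller, so they cannot conflict with
     cdecl.)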
*/ @@ -4493,6 +4617,28 @@ ix86_handle_cconv_attribute (tree *node, tree name, { error ("fastcall and cdecl attributes are not compatible"); } + if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node))) + { + error ("cdecl and thiscall attributes are not compatible"); + } + } + else if (is_attribute_p ("thiscall", name)) + { + if (TREE_CODE (*node) != METHOD_TYPE && pedantic) + warning (OPT_Wattributes, "%qE attribute is used for none class-method", + name); + if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node))) + { + error ("stdcall and thiscall attributes are not compatible"); + } + if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node))) + { + error ("fastcall and thiscall attributes are not compatible"); + } + if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node))) + { + error ("cdecl and thiscall attributes are not compatible"); + } } /* Can combine sseregparm with all attributes. */ @@ -4526,6 +4672,11 @@ ix86_comp_type_attributes (const_tree type1, const_tree type2) != !lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type2))) return 0; + /* Check for mismatched thiscall types. */ + if (!lookup_attribute ("thiscall", TYPE_ATTRIBUTES (type1)) + != !lookup_attribute ("thiscall", TYPE_ATTRIBUTES (type2))) + return 0; + /* Check for mismatched return types (cdecl vs stdcall). */ if (!lookup_attribute (rtdstr, TYPE_ATTRIBUTES (type1)) != !lookup_attribute (rtdstr, TYPE_ATTRIBUTES (type2))) @@ -4559,6 +4710,9 @@ ix86_function_regparm (const_tree type, const_tree decl) if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type))) return 2; + if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (type))) + return 1; + /* Use register calling convention for local functions when possible. */ if (decl && TREE_CODE (decl) == FUNCTION_DECL @@ -4696,7 +4850,8 @@ ix86_return_pops_args (tree fundecl, tree funtype, int size) /* Stdcall and fastcall functions will pop the stack if not variable args. */ if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (funtype)) - || lookup_attribute ("fastcall", TYPE_ATTRIBUTES (funtype))) + || lookup_attribute ("fastcall", TYPE_ATTRIBUTES (funtype)) + || lookup_attribute ("thiscall", TYPE_ATTRIBUTES (funtype))) rtd = 1; if (rtd && ! stdarg_p (funtype)) @@ -4959,7 +5114,12 @@ init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */ else look for regparm information. */ if (fntype) { - if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype))) + if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype))) + { + cum->nregs = 1; + cum->fastcall = 1; /* Same first register as in fastcall. */ + } + else if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype))) { cum->nregs = 2; cum->fastcall = 1; @@ -6277,8 +6437,8 @@ ix86_function_arg_boundary (enum machine_mode mode, tree type) /* Return true if N is a possible register number of function value. */ -bool -ix86_function_value_regno_p (int regno) +static bool +ix86_function_value_regno_p (const unsigned int regno) { switch (regno) { @@ -6736,7 +6896,6 @@ setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum) { rtx save_area, mem; rtx label; - rtx label_ref; rtx tmp_reg; rtx nsse_reg; alias_set_type set; @@ -6787,35 +6946,9 @@ setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum) SSE saves. We need some preparation work to get this working. */ label = gen_label_rtx (); - label_ref = gen_rtx_LABEL_REF (Pmode, label); - /* Compute address to jump to : - label - eax*4 + nnamed_sse_arguments*4 Or - label - eax*5 + nnamed_sse_arguments*5 for AVX. 
*/ - tmp_reg = gen_reg_rtx (Pmode); nsse_reg = gen_reg_rtx (Pmode); emit_insn (gen_zero_extendqidi2 (nsse_reg, gen_rtx_REG (QImode, AX_REG))); - emit_insn (gen_rtx_SET (VOIDmode, tmp_reg, - gen_rtx_MULT (Pmode, nsse_reg, - GEN_INT (4)))); - - /* vmovaps is one byte longer than movaps. */ - if (TARGET_AVX) - emit_insn (gen_rtx_SET (VOIDmode, tmp_reg, - gen_rtx_PLUS (Pmode, tmp_reg, - nsse_reg))); - - if (cum->sse_regno) - emit_move_insn - (nsse_reg, - gen_rtx_CONST (DImode, - gen_rtx_PLUS (DImode, - label_ref, - GEN_INT (cum->sse_regno - * (TARGET_AVX ? 5 : 4))))); - else - emit_move_insn (nsse_reg, label_ref); - emit_insn (gen_subdi3 (nsse_reg, nsse_reg, tmp_reg)); /* Compute address of memory block we save into. We always use pointer pointing 127 bytes after first byte to store - this is needed to keep @@ -6828,11 +6961,12 @@ setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum) mem = gen_rtx_MEM (BLKmode, plus_constant (tmp_reg, -127)); MEM_NOTRAP_P (mem) = 1; set_mem_alias_set (mem, set); - set_mem_align (mem, BITS_PER_WORD); + set_mem_align (mem, 64); /* And finally do the dirty job! */ emit_insn (gen_sse_prologue_save (mem, nsse_reg, - GEN_INT (cum->sse_regno), label)); + GEN_INT (cum->sse_regno), label, + gen_reg_rtx (Pmode))); } } @@ -6993,7 +7127,7 @@ ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p, int indirect_p = 0; tree ptrtype; enum machine_mode nat_mode; - int arg_boundary; + unsigned int arg_boundary; /* Only 64bit target needs something special. */ if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist))) @@ -7225,6 +7359,8 @@ ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p, t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t, size_int (-align)); t = fold_convert (TREE_TYPE (ovf), t); + if (crtl->stack_alignment_needed < arg_boundary) + crtl->stack_alignment_needed = arg_boundary; } gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue); gimplify_assign (addr, t, pre_p); @@ -7434,15 +7570,27 @@ standard_sse_constant_opcode (rtx insn, rtx x) case MODE_V4SF: return TARGET_AVX ? "vxorps\t%0, %0, %0" : "xorps\t%0, %0"; case MODE_V2DF: - return TARGET_AVX ? "vxorpd\t%0, %0, %0" : "xorpd\t%0, %0"; + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + return TARGET_AVX ? "vxorps\t%0, %0, %0" : "xorps\t%0, %0"; + else + return TARGET_AVX ? "vxorpd\t%0, %0, %0" : "xorpd\t%0, %0"; case MODE_TI: - return TARGET_AVX ? "vpxor\t%0, %0, %0" : "pxor\t%0, %0"; + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + return TARGET_AVX ? "vxorps\t%0, %0, %0" : "xorps\t%0, %0"; + else + return TARGET_AVX ? "vpxor\t%0, %0, %0" : "pxor\t%0, %0"; case MODE_V8SF: return "vxorps\t%x0, %x0, %x0"; case MODE_V4DF: - return "vxorpd\t%x0, %x0, %x0"; + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + return "vxorps\t%x0, %x0, %x0"; + else + return "vxorpd\t%x0, %x0, %x0"; case MODE_OI: - return "vpxor\t%x0, %x0, %x0"; + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + return "vxorps\t%x0, %x0, %x0"; + else + return "vpxor\t%x0, %x0, %x0"; default: break; } @@ -7576,8 +7724,8 @@ get_pc_thunk_name (char name[32], unsigned int regno) /* This function generates code for -fpic that loads %ebx with the return address of the caller and then returns. */ -void -ix86_file_end (void) +static void +ix86_code_end (void) { rtx xops[2]; int regno; @@ -7585,12 +7733,21 @@ ix86_file_end (void) for (regno = 0; regno < 8; ++regno) { char name[32]; + tree decl; if (! 
((pic_labels_used >> regno) & 1)) continue; get_pc_thunk_name (name, regno); + decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL, + get_identifier (name), + build_function_type (void_type_node, void_list_node)); + DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL, + NULL_TREE, void_type_node); + TREE_PUBLIC (decl) = 1; + TREE_STATIC (decl) = 1; + #if TARGET_MACHO if (TARGET_MACHO) { @@ -7601,18 +7758,12 @@ ix86_file_end (void) assemble_name (asm_out_file, name); fputs ("\n", asm_out_file); ASM_OUTPUT_LABEL (asm_out_file, name); + DECL_WEAK (decl) = 1; } else #endif if (USE_HIDDEN_LINKONCE) { - tree decl; - - decl = build_decl (BUILTINS_LOCATION, - FUNCTION_DECL, get_identifier (name), - error_mark_node); - TREE_PUBLIC (decl) = 1; - TREE_STATIC (decl) = 1; DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl); (*targetm.asm_out.unique_section) (decl, 0); @@ -7630,14 +7781,23 @@ ix86_file_end (void) ASM_OUTPUT_LABEL (asm_out_file, name); } + DECL_INITIAL (decl) = make_node (BLOCK); + current_function_decl = decl; + init_function_start (decl); + first_function_block_is_cold = false; + /* Make sure unwind info is emitted for the thunk if needed. */ + final_start_function (emit_barrier (), asm_out_file, 1); + xops[0] = gen_rtx_REG (Pmode, regno); xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx); output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops); output_asm_insn ("ret", xops); + final_end_function (); + init_insn_lengths (); + free_after_compilation (cfun); + set_cfun (NULL); + current_function_decl = NULL; } - - if (NEED_INDICATE_EXEC_STACK) - file_end_indicate_exec_stack (); } /* Emit code for the SET_GOT patterns. */ @@ -7674,7 +7834,24 @@ output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED) if (!flag_pic) output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops); else - output_asm_insn ("call\t%a2", xops); + { + output_asm_insn ("call\t%a2", xops); +#ifdef DWARF2_UNWIND_INFO + /* The call to next label acts as a push. */ + if (dwarf2out_do_frame ()) + { + rtx insn; + start_sequence (); + insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, + gen_rtx_PLUS (Pmode, + stack_pointer_rtx, + GEN_INT (-4)))); + RTX_FRAME_RELATED_P (insn) = 1; + dwarf2out_frame_debug (insn, true); + end_sequence (); + } +#endif + } #if TARGET_MACHO /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This @@ -7687,7 +7864,27 @@ output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED) CODE_LABEL_NUMBER (XEXP (xops[2], 0))); if (flag_pic) - output_asm_insn ("pop%z0\t%0", xops); + { + output_asm_insn ("pop%z0\t%0", xops); +#ifdef DWARF2_UNWIND_INFO + /* The pop is a pop and clobbers dest, but doesn't restore it + for unwind info purposes. */ + if (dwarf2out_do_frame ()) + { + rtx insn; + start_sequence (); + insn = emit_insn (gen_rtx_SET (VOIDmode, dest, const0_rtx)); + dwarf2out_frame_debug (insn, true); + insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, + gen_rtx_PLUS (Pmode, + stack_pointer_rtx, + GEN_INT (4)))); + RTX_FRAME_RELATED_P (insn) = 1; + dwarf2out_frame_debug (insn, true); + end_sequence (); + } +#endif + } } else { @@ -7695,6 +7892,18 @@ output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED) get_pc_thunk_name (name, REGNO (dest)); pic_labels_used |= 1 << REGNO (dest); +#ifdef DWARF2_UNWIND_INFO + /* Ensure all queued register saves are flushed before the + call. 
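+     Passing the barrier to dwarf2out_frame_debug makes it flush any
+     queued register saves, keeping the CFI accurate at the call site.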
*/ + if (dwarf2out_do_frame ()) + { + rtx insn; + start_sequence (); + insn = emit_barrier (); + end_sequence (); + dwarf2out_frame_debug (insn, false); + } +#endif xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name)); xops[2] = gen_rtx_MEM (QImode, xops[2]); output_asm_insn ("call\t%X2", xops); @@ -7919,6 +8128,7 @@ ix86_compute_frame_layout (struct ix86_frame *frame) && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs) { int count = frame->nregs; + struct cgraph_node *node = cgraph_node (current_function_decl); cfun->machine->use_fast_prologue_epilogue_nregs = count; /* The fast prologue uses move instead of push to save registers. This @@ -7933,9 +8143,9 @@ ix86_compute_frame_layout (struct ix86_frame *frame) slow to use many of them. */ if (count) count = (count - 1) * FAST_PROLOGUE_INSN_COUNT; - if (cfun->function_frequency < FUNCTION_FREQUENCY_NORMAL + if (node->frequency < NODE_FREQUENCY_NORMAL || (flag_branch_probabilities - && cfun->function_frequency < FUNCTION_FREQUENCY_HOT)) + && node->frequency < NODE_FREQUENCY_HOT)) cfun->machine->use_fast_prologue_epilogue = false; else cfun->machine->use_fast_prologue_epilogue @@ -8237,6 +8447,8 @@ find_drap_reg (void) passing. */ if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2 && !lookup_attribute ("fastcall", + TYPE_ATTRIBUTES (TREE_TYPE (decl))) + && !lookup_attribute ("thiscall", TYPE_ATTRIBUTES (TREE_TYPE (decl)))) return CX_REG; else @@ -8331,7 +8543,11 @@ ix86_get_drap_rtx (void) end_sequence (); insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ())); - RTX_FRAME_RELATED_P (insn) = 1; + if (!optimize) + { + add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg); + RTX_FRAME_RELATED_P (insn) = 1; + } return drap_vreg; } else @@ -8559,13 +8775,10 @@ ix86_expand_prologue (void) ix86_cfa_state->reg == stack_pointer_rtx); else { - /* Only valid for Win32. */ rtx eax = gen_rtx_REG (Pmode, AX_REG); bool eax_live; rtx t; - gcc_assert (!TARGET_64BIT || cfun->machine->call_abi == MS_ABI); - if (cfun->machine->call_abi == MS_ABI) eax_live = false; else @@ -9263,6 +9476,7 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) rtx base_reg, index_reg; HOST_WIDE_INT scale = 1; rtx scale_rtx = NULL_RTX; + rtx tmp; int retval = 1; enum ix86_address_seg seg = SEG_DEFAULT; @@ -9298,6 +9512,19 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) scale_rtx = XEXP (op, 1); break; + case ASHIFT: + if (index) + return 0; + index = XEXP (op, 0); + tmp = XEXP (op, 1); + if (!CONST_INT_P (tmp)) + return 0; + scale = INTVAL (tmp); + if ((unsigned HOST_WIDE_INT) scale > 3) + return 0; + scale = 1 << scale; + break; + case UNSPEC: if (XINT (op, 1) == UNSPEC_TP && TARGET_TLS_DIRECT_SEG_REFS @@ -9338,8 +9565,6 @@ ix86_decompose_address (rtx addr, struct ix86_address *out) } else if (GET_CODE (addr) == ASHIFT) { - rtx tmp; - /* We're called for lea too, which implements ashift on occasion. */ index = XEXP (addr, 0); tmp = XEXP (addr, 1); @@ -10794,29 +11019,29 @@ output_pic_addr_const (FILE *file, rtx x, int code) break; case UNSPEC_GOTTPOFF: /* FIXME: This might be @TPOFF in Sun ld too. 
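     (The suffixes below are emitted in lowercase; presumably Sun as only
     accepts the lowercase spellings, while GNU as accepts either form.)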
*/ - fputs ("@GOTTPOFF", file); + fputs ("@gottpoff", file); break; case UNSPEC_TPOFF: - fputs ("@TPOFF", file); + fputs ("@tpoff", file); break; case UNSPEC_NTPOFF: if (TARGET_64BIT) - fputs ("@TPOFF", file); + fputs ("@tpoff", file); else - fputs ("@NTPOFF", file); + fputs ("@ntpoff", file); break; case UNSPEC_DTPOFF: - fputs ("@DTPOFF", file); + fputs ("@dtpoff", file); break; case UNSPEC_GOTNTPOFF: if (TARGET_64BIT) fputs (ASSEMBLER_DIALECT == ASM_ATT ? - "@GOTTPOFF(%rip)": "@GOTTPOFF[rip]", file); + "@gottpoff(%rip)": "@gottpoff[rip]", file); else - fputs ("@GOTNTPOFF", file); + fputs ("@gotntpoff", file); break; case UNSPEC_INDNTPOFF: - fputs ("@INDNTPOFF", file); + fputs ("@indntpoff", file); break; #if TARGET_MACHO case UNSPEC_MACHOPIC_OFFSET: @@ -10843,7 +11068,7 @@ i386_output_dwarf_dtprel (FILE *file, int size, rtx x) { fputs (ASM_LONG, file); output_addr_const (file, x); - fputs ("@DTPOFF", file); + fputs ("@dtpoff", file); switch (size) { case 4: @@ -10884,6 +11109,9 @@ static rtx ix86_delegitimize_address (rtx x) { rtx orig_x = delegitimize_mem_from_attrs (x); + /* addend is NULL or some rtx if x is something+GOTOFF where + something doesn't include the PIC register. */ + rtx addend = NULL_RTX; /* reg_addend is NULL or a multiple of some register. */ rtx reg_addend = NULL_RTX; /* const_addend is NULL or a const_int. */ @@ -10903,7 +11131,10 @@ ix86_delegitimize_address (rtx x) || XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL || !MEM_P (orig_x)) return orig_x; - return XVECEXP (XEXP (x, 0), 0, 0); + x = XVECEXP (XEXP (x, 0), 0, 0); + if (GET_MODE (orig_x) != Pmode) + return simplify_gen_subreg (GET_MODE (orig_x), x, Pmode, 0); + return x; } if (GET_CODE (x) != PLUS @@ -10922,14 +11153,13 @@ ix86_delegitimize_address (rtx x) else if (ix86_pic_register_p (XEXP (reg_addend, 1))) reg_addend = XEXP (reg_addend, 0); else - return orig_x; - if (!REG_P (reg_addend) - && GET_CODE (reg_addend) != MULT - && GET_CODE (reg_addend) != ASHIFT) - return orig_x; + { + reg_addend = NULL_RTX; + addend = XEXP (x, 0); + } } else - return orig_x; + addend = XEXP (x, 0); x = XEXP (XEXP (x, 1), 0); if (GET_CODE (x) == PLUS @@ -10940,7 +11170,7 @@ ix86_delegitimize_address (rtx x) } if (GET_CODE (x) == UNSPEC - && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x)) + && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend) || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x)))) result = XVECEXP (x, 0, 0); @@ -10955,6 +11185,24 @@ ix86_delegitimize_address (rtx x) result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend)); if (reg_addend) result = gen_rtx_PLUS (Pmode, reg_addend, result); + if (addend) + { + /* If the rest of original X doesn't involve the PIC register, add + addend and subtract pic_offset_table_rtx. This can happen e.g. + for code like: + leal (%ebx, %ecx, 4), %ecx + ... + movl foo@GOTOFF(%ecx), %edx + in which case we return (%ecx - %ebx) + foo. 
*/ + if (pic_offset_table_rtx) + result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend), + pic_offset_table_rtx), + result); + else + return orig_x; + } + if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x)) + return simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0); return result; } @@ -11295,7 +11543,7 @@ get_some_local_dynamic_name (void) return cfun->machine->some_ld_name; for (insn = get_insns (); insn ; insn = NEXT_INSN (insn)) - if (INSN_P (insn) + if (NONDEBUG_INSN_P (insn) && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0)) return cfun->machine->some_ld_name; @@ -11306,7 +11554,6 @@ get_some_local_dynamic_name (void) L,W,B,Q,S,T -- print the opcode suffix for specified size of operand. C -- print opcode suffix for set/cmov insn. c -- like C, but print reversed condition - E,e -- likewise, but for compare-and-branch fused insn. F,f -- likewise, but for floating-point. O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.", otherwise nothing @@ -11711,14 +11958,6 @@ print_operand (FILE *file, rtx x, int code) put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 1, file); return; - case 'E': - put_condition_code (GET_CODE (x), CCmode, 0, 0, file); - return; - - case 'e': - put_condition_code (GET_CODE (x), CCmode, 1, 0, file); - return; - case 'H': /* It doesn't actually matter what mode we use here, as we're only going to use this for printing. */ @@ -11817,10 +12056,8 @@ print_operand (FILE *file, rtx x, int code) return; case ';': -#if TARGET_MACHO - fputs (" ; ", file); -#else - putc (' ', file); +#if TARGET_MACHO || !HAVE_AS_IX86_REP_LOCK_PREFIX + fputs (";", file); #endif return; @@ -12102,34 +12339,34 @@ output_addr_const_extra (FILE *file, rtx x) case UNSPEC_GOTTPOFF: output_addr_const (file, op); /* FIXME: This might be @TPOFF in Sun ld. */ - fputs ("@GOTTPOFF", file); + fputs ("@gottpoff", file); break; case UNSPEC_TPOFF: output_addr_const (file, op); - fputs ("@TPOFF", file); + fputs ("@tpoff", file); break; case UNSPEC_NTPOFF: output_addr_const (file, op); if (TARGET_64BIT) - fputs ("@TPOFF", file); + fputs ("@tpoff", file); else - fputs ("@NTPOFF", file); + fputs ("@ntpoff", file); break; case UNSPEC_DTPOFF: output_addr_const (file, op); - fputs ("@DTPOFF", file); + fputs ("@dtpoff", file); break; case UNSPEC_GOTNTPOFF: output_addr_const (file, op); if (TARGET_64BIT) fputs (ASSEMBLER_DIALECT == ASM_ATT ? - "@GOTTPOFF(%rip)" : "@GOTTPOFF[rip]", file); + "@gottpoff(%rip)" : "@gottpoff[rip]", file); else - fputs ("@GOTNTPOFF", file); + fputs ("@gotntpoff", file); break; case UNSPEC_INDNTPOFF: output_addr_const (file, op); - fputs ("@INDNTPOFF", file); + fputs ("@indntpoff", file); break; #if TARGET_MACHO case UNSPEC_MACHOPIC_OFFSET: @@ -13107,6 +13344,14 @@ ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[]) switch (GET_MODE_SIZE (mode)) { case 16: + /* If we're optimizing for size, movups is the smallest. 
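+	 The ps form is also preferred when
+	 TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL is set, since such cores run
+	 packed-single moves at least as fast as the pd/dqu forms.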
*/ + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + { + op0 = gen_lowpart (V4SFmode, op0); + op1 = gen_lowpart (V4SFmode, op1); + emit_insn (gen_avx_movups (op0, op1)); + return; + } op0 = gen_lowpart (V16QImode, op0); op1 = gen_lowpart (V16QImode, op1); emit_insn (gen_avx_movdqu (op0, op1)); @@ -13133,6 +13378,13 @@ ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[]) emit_insn (gen_avx_movups256 (op0, op1)); break; case V2DFmode: + if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) + { + op0 = gen_lowpart (V4SFmode, op0); + op1 = gen_lowpart (V4SFmode, op1); + emit_insn (gen_avx_movups (op0, op1)); + return; + } emit_insn (gen_avx_movupd (op0, op1)); break; case V4DFmode: @@ -13153,7 +13405,8 @@ ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[]) if (MEM_P (op1)) { /* If we're optimizing for size, movups is the smallest. */ - if (optimize_insn_for_size_p ()) + if (optimize_insn_for_size_p () + || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) { op0 = gen_lowpart (V4SFmode, op0); op1 = gen_lowpart (V4SFmode, op1); @@ -13176,13 +13429,13 @@ ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[]) { rtx zero; - if (TARGET_SSE_UNALIGNED_MOVE_OPTIMAL) - { - op0 = gen_lowpart (V2DFmode, op0); - op1 = gen_lowpart (V2DFmode, op1); - emit_insn (gen_sse2_movupd (op0, op1)); - return; - } + if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL) + { + op0 = gen_lowpart (V2DFmode, op0); + op1 = gen_lowpart (V2DFmode, op1); + emit_insn (gen_sse2_movupd (op0, op1)); + return; + } /* When SSE registers are split into halves, we can avoid writing to the top half twice. */ @@ -13211,12 +13464,12 @@ ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[]) } else { - if (TARGET_SSE_UNALIGNED_MOVE_OPTIMAL) - { - op0 = gen_lowpart (V4SFmode, op0); - op1 = gen_lowpart (V4SFmode, op1); - emit_insn (gen_sse_movups (op0, op1)); - return; + if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL) + { + op0 = gen_lowpart (V4SFmode, op0); + op1 = gen_lowpart (V4SFmode, op1); + emit_insn (gen_sse_movups (op0, op1)); + return; } if (TARGET_SSE_PARTIAL_REG_DEPENDENCY) @@ -13235,7 +13488,8 @@ ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[]) else if (MEM_P (op0)) { /* If we're optimizing for size, movups is the smallest. 
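	 The same single-precision replacement is applied below for
	 TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL targets, where the shorter
	 encoding carries no speed penalty.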
*/ - if (optimize_insn_for_size_p ()) + if (optimize_insn_for_size_p () + || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL) { op0 = gen_lowpart (V4SFmode, op0); op1 = gen_lowpart (V4SFmode, op1); @@ -13256,19 +13510,37 @@ ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[]) if (TARGET_SSE2 && mode == V2DFmode) { - m = adjust_address (op0, DFmode, 0); - emit_insn (gen_sse2_storelpd (m, op1)); - m = adjust_address (op0, DFmode, 8); - emit_insn (gen_sse2_storehpd (m, op1)); + if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL) + { + op0 = gen_lowpart (V2DFmode, op0); + op1 = gen_lowpart (V2DFmode, op1); + emit_insn (gen_sse2_movupd (op0, op1)); + } + else + { + m = adjust_address (op0, DFmode, 0); + emit_insn (gen_sse2_storelpd (m, op1)); + m = adjust_address (op0, DFmode, 8); + emit_insn (gen_sse2_storehpd (m, op1)); + } } else { if (mode != V4SFmode) op1 = gen_lowpart (V4SFmode, op1); - m = adjust_address (op0, V2SFmode, 0); - emit_insn (gen_sse_storelps (m, op1)); - m = adjust_address (op0, V2SFmode, 8); - emit_insn (gen_sse_storehps (m, op1)); + + if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL) + { + op0 = gen_lowpart (V4SFmode, op0); + emit_insn (gen_sse_movups (op0, op1)); + } + else + { + m = adjust_address (op0, V2SFmode, 0); + emit_insn (gen_sse_storelps (m, op1)); + m = adjust_address (op0, V2SFmode, 8); + emit_insn (gen_sse_storehps (m, op1)); + } } } else @@ -13550,7 +13822,7 @@ distance_non_agu_define (unsigned int regno1, unsigned int regno2, rtx prev = PREV_INSN (insn); while (prev && distance < LEA_SEARCH_THRESHOLD) { - if (INSN_P (prev)) + if (NONDEBUG_INSN_P (prev)) { distance++; for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++) @@ -13590,7 +13862,7 @@ distance_non_agu_define (unsigned int regno1, unsigned int regno2, && prev != insn && distance < LEA_SEARCH_THRESHOLD) { - if (INSN_P (prev)) + if (NONDEBUG_INSN_P (prev)) { distance++; for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++) @@ -13636,7 +13908,7 @@ distance_agu_use (unsigned int regno0, rtx insn) rtx next = NEXT_INSN (insn); while (next && distance < LEA_SEARCH_THRESHOLD) { - if (INSN_P (next)) + if (NONDEBUG_INSN_P (next)) { distance++; @@ -13685,7 +13957,7 @@ distance_agu_use (unsigned int regno0, rtx insn) && next != insn && distance < LEA_SEARCH_THRESHOLD) { - if (INSN_P (next)) + if (NONDEBUG_INSN_P (next)) { distance++; @@ -15381,7 +15653,7 @@ ix86_expand_int_movcc (rtx operands[]) enum rtx_code code = GET_CODE (operands[1]), compare_code; rtx compare_seq, compare_op; enum machine_mode mode = GET_MODE (operands[0]); - bool sign_bit_compare_p = false;; + bool sign_bit_compare_p = false; start_sequence (); ix86_compare_op0 = XEXP (operands[1], 0); @@ -15422,7 +15694,6 @@ ix86_expand_int_movcc (rtx operands[]) if (!sign_bit_compare_p) { rtx flags; - rtx (*insn)(rtx, rtx, rtx); bool fpcmp = false; compare_code = GET_CODE (compare_op); @@ -15463,11 +15734,10 @@ ix86_expand_int_movcc (rtx operands[]) tmp = gen_reg_rtx (mode); if (mode == DImode) - insn = gen_x86_movdicc_0_m1; + emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op)); else - insn = gen_x86_movsicc_0_m1; - - emit_insn (insn (tmp, flags, compare_op)); + emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp), + flags, compare_op)); } else { @@ -17005,20 +17275,22 @@ ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode) : gen_x86_64_shld) (high[0], low[0], operands[2])); } - emit_insn ((mode == DImode ? gen_ashlsi3 : gen_ashldi3) (low[0], low[0], operands[2])); + emit_insn ((mode == DImode + ? 
gen_ashlsi3 + : gen_ashldi3) (low[0], low[0], operands[2])); if (TARGET_CMOVE && scratch) { ix86_expand_clear (scratch); emit_insn ((mode == DImode - ? gen_x86_shift_adj_1 - : gen_x86_64_shift_adj_1) (high[0], low[0], operands[2], - scratch)); + ? gen_x86_shiftsi_adj_1 + : gen_x86_shiftdi_adj_1) (high[0], low[0], operands[2], + scratch)); } else emit_insn ((mode == DImode - ? gen_x86_shift_adj_2 - : gen_x86_64_shift_adj_2) (high[0], low[0], operands[2])); + ? gen_x86_shiftsi_adj_2 + : gen_x86_shiftdi_adj_2) (high[0], low[0], operands[2])); } void @@ -17091,14 +17363,14 @@ ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode) : gen_ashrdi3) (scratch, scratch, GEN_INT (single_width - 1))); emit_insn ((mode == DImode - ? gen_x86_shift_adj_1 - : gen_x86_64_shift_adj_1) (low[0], high[0], operands[2], - scratch)); + ? gen_x86_shiftsi_adj_1 + : gen_x86_shiftdi_adj_1) (low[0], high[0], operands[2], + scratch)); } else emit_insn ((mode == DImode - ? gen_x86_shift_adj_3 - : gen_x86_64_shift_adj_3) (low[0], high[0], operands[2])); + ? gen_x86_shiftsi_adj_3 + : gen_x86_shiftdi_adj_3) (low[0], high[0], operands[2])); } } @@ -17156,14 +17428,14 @@ ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode) { ix86_expand_clear (scratch); emit_insn ((mode == DImode - ? gen_x86_shift_adj_1 - : gen_x86_64_shift_adj_1) (low[0], high[0], operands[2], - scratch)); + ? gen_x86_shiftsi_adj_1 + : gen_x86_shiftdi_adj_1) (low[0], high[0], operands[2], + scratch)); } else emit_insn ((mode == DImode - ? gen_x86_shift_adj_2 - : gen_x86_64_shift_adj_2) (low[0], high[0], operands[2])); + ? gen_x86_shiftsi_adj_2 + : gen_x86_shiftdi_adj_2) (low[0], high[0], operands[2])); } } @@ -19588,6 +19860,7 @@ ix86_issue_rate (void) case PROCESSOR_NOCONA: case PROCESSOR_GENERIC32: case PROCESSOR_GENERIC64: + case PROCESSOR_BDVER1: return 3; case PROCESSOR_CORE2: @@ -19777,6 +20050,7 @@ ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost) case PROCESSOR_ATHLON: case PROCESSOR_K8: case PROCESSOR_AMDFAM10: + case PROCESSOR_BDVER1: case PROCESSOR_ATOM: case PROCESSOR_GENERIC32: case PROCESSOR_GENERIC64: @@ -19972,10 +20246,26 @@ ix86_local_alignment (tree exp, enum machine_mode mode, } /* x86-64 ABI requires arrays greater than 16 bytes to be aligned - to 16byte boundary. */ - if (TARGET_64BIT) + to 16byte boundary. Exact wording is: + + An array uses the same alignment as its elements, except that a local or + global array variable of length at least 16 bytes or + a C99 variable-length array variable always has alignment of at least 16 bytes. + + This was added to allow use of aligned SSE instructions at arrays. This + rule is meant for static storage (where compiler can not do the analysis + by itself). We follow it for automatic variables only when convenient. + We fully control everything in the function compiled and functions from + other unit can not rely on the alignment. + + Exclude va_list type. It is the common case of local array where + we can not benefit from the alignment. */ + if (TARGET_64BIT && optimize_function_for_speed_p (cfun) + && TARGET_SSE) { if (AGGREGATE_TYPE_P (type) + && (TYPE_MAIN_VARIANT (type) + != TYPE_MAIN_VARIANT (va_list_type_node)) && TYPE_SIZE (type) && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16 @@ -20083,6 +20373,12 @@ ix86_static_chain (const_tree fndecl, bool incoming_p) us with EAX for the static chain. 
*/ regno = AX_REG; } + else if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (fntype))) + { + /* Thiscall functions use ecx for arguments, which leaves + us with EAX for the static chain. */ + regno = AX_REG; + } else if (ix86_function_regparm (fntype, fndecl) == 3) { /* For regparm 3, we have no free call-clobbered registers in @@ -20960,6 +21256,10 @@ enum ix86_builtins IX86_BUILTIN_VPERMILPS, IX86_BUILTIN_VPERMILPD256, IX86_BUILTIN_VPERMILPS256, + IX86_BUILTIN_VPERMIL2PD, + IX86_BUILTIN_VPERMIL2PS, + IX86_BUILTIN_VPERMIL2PD256, + IX86_BUILTIN_VPERMIL2PS256, IX86_BUILTIN_VPERM2F128PD256, IX86_BUILTIN_VPERM2F128PS256, IX86_BUILTIN_VPERM2F128SI256, @@ -22149,6 +22449,10 @@ static const struct builtin_description bdesc_args[] = }; /* FMA4 and XOP. */ +#define MULTI_ARG_4_DF2_DI_I V2DF_FTYPE_V2DF_V2DF_V2DI_INT +#define MULTI_ARG_4_DF2_DI_I1 V4DF_FTYPE_V4DF_V4DF_V4DI_INT +#define MULTI_ARG_4_SF2_SI_I V4SF_FTYPE_V4SF_V4SF_V4SI_INT +#define MULTI_ARG_4_SF2_SI_I1 V8SF_FTYPE_V8SF_V8SF_V8SI_INT #define MULTI_ARG_3_SF V4SF_FTYPE_V4SF_V4SF_V4SF #define MULTI_ARG_3_DF V2DF_FTYPE_V2DF_V2DF_V2DF #define MULTI_ARG_3_SF2 V8SF_FTYPE_V8SF_V8SF_V8SF @@ -22391,6 +22695,11 @@ static const struct builtin_description bdesc_multi_arg[] = { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF }, { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 }, + { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 }, + }; /* Set up all the MMX/SSE builtins, even builtins for instructions that are not @@ -22771,6 +23080,14 @@ ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target, switch (m_type) { + case MULTI_ARG_4_DF2_DI_I: + case MULTI_ARG_4_DF2_DI_I1: + case MULTI_ARG_4_SF2_SI_I: + case MULTI_ARG_4_SF2_SI_I1: + nargs = 4; + last_arg_constant = true; + break; + case MULTI_ARG_3_SF: case MULTI_ARG_3_DF: case MULTI_ARG_3_SF2: @@ -22914,6 +23231,10 @@ ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target, pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op); break; + case 4: + pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op); + break; + default: gcc_unreachable (); } @@ -23532,6 +23853,13 @@ ix86_expand_args_builtin (const struct builtin_description *d, nargs = 3; nargs_constant = 2; break; + case V2DF_FTYPE_V2DF_V2DF_V2DI_INT: + case V4DF_FTYPE_V4DF_V4DF_V4DI_INT: + case V4SF_FTYPE_V4SF_V4SF_V4SI_INT: + case V8SF_FTYPE_V8SF_V8SF_V8SI_INT: + nargs = 4; + nargs_constant = 1; + break; case V2DI_FTYPE_V2DI_V2DI_UINT_UINT: nargs = 4; nargs_constant = 2; @@ -23601,6 +23929,10 @@ ix86_expand_args_builtin (const struct builtin_description *d, case CODE_FOR_sse4_1_blendpd: case CODE_FOR_avx_vpermilv2df: + case CODE_FOR_xop_vpermil2v2df3: + case CODE_FOR_xop_vpermil2v4sf3: + case CODE_FOR_xop_vpermil2v4df3: + case 
CODE_FOR_xop_vpermil2v8sf3: error ("the last argument must be a 2-bit immediate"); return const0_rtx; @@ -24257,14 +24589,16 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED, if it is not available. */ static tree -ix86_builtin_vectorized_function (unsigned int fn, tree type_out, +ix86_builtin_vectorized_function (tree fndecl, tree type_out, tree type_in) { enum machine_mode in_mode, out_mode; int in_n, out_n; + enum built_in_function fn = DECL_FUNCTION_CODE (fndecl); if (TREE_CODE (type_out) != VECTOR_TYPE - || TREE_CODE (type_in) != VECTOR_TYPE) + || TREE_CODE (type_in) != VECTOR_TYPE + || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL) return NULL_TREE; out_mode = TYPE_MODE (TREE_TYPE (type_out)); @@ -24522,43 +24856,92 @@ ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in) /* Returns a decl of a function that implements conversion of an integer vector - into a floating-point vector, or vice-versa. TYPE is the type of the integer - side of the conversion. + into a floating-point vector, or vice-versa. DEST_TYPE and SRC_TYPE + are the types involved when converting according to CODE. Return NULL_TREE if it is not available. */ static tree -ix86_vectorize_builtin_conversion (unsigned int code, tree type) +ix86_vectorize_builtin_conversion (unsigned int code, + tree dest_type, tree src_type) { - if (! (TARGET_SSE2 && TREE_CODE (type) == VECTOR_TYPE)) + if (! TARGET_SSE2) return NULL_TREE; switch (code) { case FLOAT_EXPR: - switch (TYPE_MODE (type)) + switch (TYPE_MODE (src_type)) { case V4SImode: - return TYPE_UNSIGNED (type) - ? ix86_builtins[IX86_BUILTIN_CVTUDQ2PS] - : ix86_builtins[IX86_BUILTIN_CVTDQ2PS]; + switch (TYPE_MODE (dest_type)) + { + case V4SFmode: + return (TYPE_UNSIGNED (src_type) + ? ix86_builtins[IX86_BUILTIN_CVTUDQ2PS] + : ix86_builtins[IX86_BUILTIN_CVTDQ2PS]); + case V4DFmode: + return (TYPE_UNSIGNED (src_type) + ? NULL_TREE + : ix86_builtins[IX86_BUILTIN_CVTDQ2PD256]); + default: + return NULL_TREE; + } + break; + case V8SImode: + switch (TYPE_MODE (dest_type)) + { + case V8SFmode: + return (TYPE_UNSIGNED (src_type) + ? NULL_TREE + : ix86_builtins[IX86_BUILTIN_CVTDQ2PS]); + default: + return NULL_TREE; + } + break; default: return NULL_TREE; } case FIX_TRUNC_EXPR: - switch (TYPE_MODE (type)) + switch (TYPE_MODE (dest_type)) { case V4SImode: - return TYPE_UNSIGNED (type) - ? NULL_TREE - : ix86_builtins[IX86_BUILTIN_CVTTPS2DQ]; + switch (TYPE_MODE (src_type)) + { + case V4SFmode: + return (TYPE_UNSIGNED (dest_type) + ? NULL_TREE + : ix86_builtins[IX86_BUILTIN_CVTTPS2DQ]); + case V4DFmode: + return (TYPE_UNSIGNED (dest_type) + ? NULL_TREE + : ix86_builtins[IX86_BUILTIN_CVTTPD2DQ256]); + default: + return NULL_TREE; + } + break; + + case V8SImode: + switch (TYPE_MODE (src_type)) + { + case V8SFmode: + return (TYPE_UNSIGNED (dest_type) + ? 
NULL_TREE + : ix86_builtins[IX86_BUILTIN_CVTTPS2DQ256]); + default: + return NULL_TREE; + } + break; + default: return NULL_TREE; } + default: return NULL_TREE; - } + + return NULL_TREE; } /* Returns a code for a target-specific builtin that implements @@ -24622,7 +25005,7 @@ avx_vpermilp_parallel (rtx par, enum machine_mode mode) if (!CONST_INT_P (er)) return 0; ei = INTVAL (er); - if (ei >= 2 * nelt) + if (ei >= nelt) return 0; ipar[i] = ei; } @@ -25817,13 +26200,6 @@ machopic_output_stub (FILE *file, const char *symb, const char *stub) fprintf (file, "\t.indirect_symbol %s\n", symbol_name); fprintf (file, ASM_LONG "%s\n", binder_name); } - -void -darwin_x86_file_end (void) -{ - darwin_file_end (); - ix86_file_end (); -} #endif /* TARGET_MACHO */ /* Order the registers for register allocator. */ @@ -26019,6 +26395,13 @@ x86_this_parameter (tree function) if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type))) regno = aggr ? DX_REG : CX_REG; + else if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (type))) + { + regno = CX_REG; + if (aggr) + return gen_rtx_MEM (SImode, + plus_constant (stack_pointer_rtx, 4)); + } else { regno = AX_REG; @@ -26070,7 +26453,7 @@ x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED, *(*this + vcall_offset) should be added to THIS. */ static void -x86_output_mi_thunk (FILE *file ATTRIBUTE_UNUSED, +x86_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta, HOST_WIDE_INT vcall_offset, tree function) { @@ -26078,6 +26461,9 @@ x86_output_mi_thunk (FILE *file ATTRIBUTE_UNUSED, rtx this_param = x86_this_parameter (function); rtx this_reg, tmp; + /* Make sure unwind info is emitted for the thunk if needed. */ + final_start_function (emit_barrier (), file, 1); + /* If VCALL_OFFSET, we'll need THIS in a register. Might as well pull it in now and let DELTA benefit. */ if (REG_P (this_param)) @@ -26095,10 +26481,7 @@ x86_output_mi_thunk (FILE *file ATTRIBUTE_UNUSED, /* Adjust the this parameter by a fixed constant. */ if (delta) { - /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'. - Exceptions: -128 encodes smaller than 128, so swap sign and op. */ - bool sub = delta < 0 || delta == 128; - xops[0] = GEN_INT (sub ? -delta : delta); + xops[0] = GEN_INT (delta); xops[1] = this_reg ? 
this_reg : this_param; if (TARGET_64BIT) { @@ -26110,12 +26493,12 @@ x86_output_mi_thunk (FILE *file ATTRIBUTE_UNUSED, xops[0] = tmp; xops[1] = this_param; } - if (sub) + if (x86_maybe_negate_const_int (&xops[0], DImode)) output_asm_insn ("sub{q}\t{%0, %1|%1, %0}", xops); else output_asm_insn ("add{q}\t{%0, %1|%1, %0}", xops); } - else if (sub) + else if (x86_maybe_negate_const_int (&xops[0], SImode)) output_asm_insn ("sub{l}\t{%0, %1|%1, %0}", xops); else output_asm_insn ("add{l}\t{%0, %1|%1, %0}", xops); @@ -26130,7 +26513,9 @@ x86_output_mi_thunk (FILE *file ATTRIBUTE_UNUSED, { int tmp_regno = CX_REG; if (lookup_attribute ("fastcall", - TYPE_ATTRIBUTES (TREE_TYPE (function)))) + TYPE_ATTRIBUTES (TREE_TYPE (function))) + || lookup_attribute ("thiscall", + TYPE_ATTRIBUTES (TREE_TYPE (function)))) tmp_regno = AX_REG; tmp = gen_rtx_REG (SImode, tmp_regno); } @@ -26206,6 +26591,7 @@ x86_output_mi_thunk (FILE *file ATTRIBUTE_UNUSED, output_asm_insn ("jmp\t{*}%1", xops); } } + final_end_function (); } static void @@ -26247,7 +26633,7 @@ x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED) if (TARGET_64BIT) { #ifndef NO_PROFILE_COUNTERS - fprintf (file, "\tleaq\t" LPREFIX "P%d@(%%rip),%%r11\n", labelno); + fprintf (file, "\tleaq\t" LPREFIX "P%d(%%rip),%%r11\n", labelno); #endif if (DEFAULT_ABI == SYSV_ABI && flag_pic) @@ -26479,7 +26865,7 @@ ix86_pad_returns (void) replace = true; /* Empty functions get branch mispredict even when the jump destination is not visible to us. */ - if (!prev && cfun->function_frequency > FUNCTION_FREQUENCY_UNLIKELY_EXECUTED) + if (!prev && !optimize_function_for_size_p (cfun)) replace = true; } if (replace) @@ -26541,6 +26927,52 @@ x86_extended_reg_mentioned_p (rtx insn) extended_reg_mentioned_1, NULL); } +/* If profitable, negate (without causing overflow) integer constant + of mode MODE at location LOC. Return true in this case. */ +bool +x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode) +{ + HOST_WIDE_INT val; + + if (!CONST_INT_P (*loc)) + return false; + + switch (mode) + { + case DImode: + /* DImode x86_64 constants must fit in 32 bits. */ + gcc_assert (x86_64_immediate_operand (*loc, mode)); + + mode = SImode; + break; + + case SImode: + case HImode: + case QImode: + break; + + default: + gcc_unreachable (); + } + + /* Avoid overflows. */ + if (mode_signbit_p (mode, *loc)) + return false; + + val = INTVAL (*loc); + + /* Make things pretty and `subl $4,%eax' rather than `addl $-4,%eax'. + Exceptions: -128 encodes smaller than 128, so swap sign and op. */ + if ((val < 0 && val != -128) + || val == 128) + { + *loc = GEN_INT (-val); + return true; + } + + return false; +} + /* Generate an unsigned DImode/SImode to FP conversion. This is the same code optabs would emit if we didn't have TFmode patterns. */ @@ -28823,6 +29255,9 @@ static const struct attribute_spec ix86_attribute_table[] = /* Fastcall attribute says callee is responsible for popping arguments if they are not variable. */ { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute }, + /* Thiscall attribute says callee is responsible for popping arguments + if they are not variable. 
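+     The first integer argument (the C++ `this' pointer) is passed in ECX.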
*/ + { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute }, /* Cdecl attribute says the callee is a normal C declaration */ { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute }, /* Regparm attribute specifies how many integer arguments are to be @@ -29093,8 +29528,8 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d) do_subreg: vmode = V8HImode; target = gen_lowpart (vmode, target); - op0 = gen_lowpart (vmode, target); - op1 = gen_lowpart (vmode, target); + op0 = gen_lowpart (vmode, op0); + op1 = gen_lowpart (vmode, op1); break; default: @@ -29102,7 +29537,7 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d) } /* This matches five different patterns with the different modes. */ - x = gen_rtx_VEC_MERGE (vmode, op0, op1, GEN_INT (mask)); + x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask)); x = gen_rtx_SET (VOIDmode, target, x); emit_insn (x); @@ -29214,7 +29649,12 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) input where SEL+CONCAT may not. */ if (d->op0 == d->op1) { - if (expand_vselect (d->target, d->op0, d->perm, nelt)) + int mask = nelt - 1; + + for (i = 0; i < nelt; i++) + perm2[i] = d->perm[i] & mask; + + if (expand_vselect (d->target, d->op0, perm2, nelt)) return true; /* There are plenty of patterns in sse.md that are written for @@ -29225,8 +29665,8 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) every other permutation operand. */ for (i = 0; i < nelt; i += 2) { - perm2[i] = d->perm[i]; - perm2[i+1] = d->perm[i+1] + nelt; + perm2[i] = d->perm[i] & mask; + perm2[i + 1] = (d->perm[i + 1] & mask) + nelt; } if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt)) return true; @@ -29234,11 +29674,12 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */ if (nelt >= 4) { - memcpy (perm2, d->perm, nelt); - for (i = 2; i < nelt; i += 4) + for (i = 0; i < nelt; i += 4) { - perm2[i+0] += nelt; - perm2[i+1] += nelt; + perm2[i + 0] = d->perm[i + 0] & mask; + perm2[i + 1] = d->perm[i + 1] & mask; + perm2[i + 2] = (d->perm[i + 2] & mask) + nelt; + perm2[i + 3] = (d->perm[i + 3] & mask) + nelt; } if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt)) @@ -30373,6 +30814,9 @@ ix86_enum_va_list (int idx, const char **pname, tree *ptree) #undef TARGET_FUNCTION_VALUE #define TARGET_FUNCTION_VALUE ix86_function_value +#undef TARGET_FUNCTION_VALUE_REGNO_P +#define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p + #undef TARGET_SECONDARY_RELOAD #define TARGET_SECONDARY_RELOAD ix86_secondary_reload @@ -30419,6 +30863,9 @@ ix86_enum_va_list (int idx, const char **pname, tree *ptree) #undef TARGET_CAN_ELIMINATE #define TARGET_CAN_ELIMINATE ix86_can_eliminate +#undef TARGET_ASM_CODE_END +#define TARGET_ASM_CODE_END ix86_code_end + struct gcc_target targetm = TARGET_INITIALIZER; #include "gt-i386.h"
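
For reference, a minimal usage sketch of the new "thiscall" convention the
hunks above implement (illustrative only, not part of the patch; the type
and function names are invented).  Built with -m32, the pointer argument
travels in %ecx and the callee pops any remaining stack arguments, as with
stdcall:

	/* Hypothetical example; 32-bit only -- on 64-bit targets the
	   attribute is ignored with a warning unless the MS ABI is used.  */
	struct point { int x, y; };

	extern int __attribute__ ((__thiscall__))
	point_get_x (struct point *obj);

	int
	use_it (struct point *p)
	{
	  return point_get_x (p);	/* p is loaded into %ecx; no stack
					   arguments remain to be popped.  */
	}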