static void
move_or_delete_vzeroupper_2 (basic_block bb, bool upper_128bits_set)
{
- rtx curr_insn, next_insn, prev_insn, insn;
+ rtx insn;
+ rtx vzeroupper_insn = NULL_RTX;
+ rtx pat;
+ int avx256;
if (dump_file)
fprintf (dump_file, " BB [%i] entry: upper 128bits: %d\n",
bb->index, upper_128bits_set);
- for (curr_insn = BB_HEAD (bb);
- curr_insn && curr_insn != NEXT_INSN (BB_END (bb));
- curr_insn = next_insn)
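+ /* Walk the insns in BB, tracking whether the upper 128 bits of
+ any AVX register may be nonzero and remembering a pending
+ vzeroupper so that it can be moved or deleted. */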
+ insn = BB_HEAD (bb);
+ while (insn != BB_END (bb))
{
- int avx256;
+ insn = NEXT_INSN (insn);
- next_insn = NEXT_INSN (curr_insn);
-
- if (!NONDEBUG_INSN_P (curr_insn))
+ if (!NONDEBUG_INSN_P (insn))
continue;
- /* Search for vzeroupper. */
- insn = PATTERN (curr_insn);
- if (GET_CODE (insn) == UNSPEC_VOLATILE
- && XINT (insn, 1) == UNSPECV_VZEROUPPER)
+ /* Move vzeroupper before jump/call. */
+ if (JUMP_P (insn) || CALL_P (insn))
+ {
+ if (!vzeroupper_insn)
+ continue;
+
+ if (PREV_INSN (insn) != vzeroupper_insn)
+ {
+ if (dump_file)
+ {
+ fprintf (dump_file, "Move vzeroupper after:\n");
+ print_rtl_single (dump_file, PREV_INSN (insn));
+ fprintf (dump_file, "before:\n");
+ print_rtl_single (dump_file, insn);
+ }
+ reorder_insns_nobb (vzeroupper_insn, vzeroupper_insn,
+ PREV_INSN (insn));
+ }
+ vzeroupper_insn = NULL_RTX;
+ continue;
+ }
+
+ pat = PATTERN (insn);
+
+ /* Check insn for vzeroupper intrinsic. */
+ if (GET_CODE (pat) == UNSPEC_VOLATILE
+ && XINT (pat, 1) == UNSPECV_VZEROUPPER)
{
- /* Found vzeroupper. */
if (dump_file)
{
+ /* Found vzeroupper intrinsic. */
fprintf (dump_file, "Found vzeroupper:\n");
- print_rtl_single (dump_file, curr_insn);
+ print_rtl_single (dump_file, insn);
}
}
else
{
- /* Check vzeroall intrinsic. */
- if (GET_CODE (insn) == PARALLEL
- && GET_CODE (XVECEXP (insn, 0, 0)) == UNSPEC_VOLATILE
- && XINT (XVECEXP (insn, 0, 0), 1) == UNSPECV_VZEROALL)
- upper_128bits_set = false;
- else if (!upper_128bits_set)
+ /* Check insn for vzeroall intrinsic. */
+ if (GET_CODE (pat) == PARALLEL
+ && GET_CODE (XVECEXP (pat, 0, 0)) == UNSPEC_VOLATILE
+ && XINT (XVECEXP (pat, 0, 0), 1) == UNSPECV_VZEROALL)
{
- /* Check if upper 128bits of AVX registers are used. */
- note_stores (insn, check_avx256_stores,
- &upper_128bits_set);
+ upper_128bits_set = false;
+
+ /* Delete pending vzeroupper insertion. */
+ if (vzeroupper_insn)
+ {
+ delete_insn (vzeroupper_insn);
+ vzeroupper_insn = NULL_RTX;
+ }
}
+ else if (!upper_128bits_set)
+ note_stores (pat, check_avx256_stores, &upper_128bits_set);
continue;
}
- avx256 = INTVAL (XVECEXP (insn, 0, 0));
+ /* Process vzeroupper intrinsic. */
+ avx256 = INTVAL (XVECEXP (pat, 0, 0));
if (!upper_128bits_set)
{
/* Since the upper 128bits are cleared, callee must not pass
256bit AVX register. We only need to check if callee
returns 256bit AVX register. */
- upper_128bits_set = avx256 == callee_return_avx256;
+ upper_128bits_set = (avx256 == callee_return_avx256);
- /* Remove unnecessary vzeroupper since upper 128bits are
- cleared. */
+ /* Remove unnecessary vzeroupper since
+ upper 128bits are cleared. */
if (dump_file)
{
fprintf (dump_file, "Delete redundant vzeroupper:\n");
- print_rtl_single (dump_file, curr_insn);
+ print_rtl_single (dump_file, insn);
}
- delete_insn (curr_insn);
- continue;
+ delete_insn (insn);
}
else if (avx256 == callee_return_pass_avx256
|| avx256 == callee_pass_avx256)
{
/* Callee passes 256bit AVX register. Check if callee
returns 256bit AVX register. */
- upper_128bits_set = avx256 == callee_return_pass_avx256;
+ upper_128bits_set = (avx256 == callee_return_pass_avx256);
- /* Must remove vzeroupper since callee passes 256bit AVX
- register. */
+ /* Must remove vzeroupper since
+ callee passes in 256bit AVX register. */
if (dump_file)
{
fprintf (dump_file, "Delete callee pass vzeroupper:\n");
- print_rtl_single (dump_file, curr_insn);
- }
- delete_insn (curr_insn);
- continue;
- }
-
- /* Find the jump after vzeroupper. */
- prev_insn = curr_insn;
- if (avx256 == vzeroupper_intrinsic)
- {
- /* For vzeroupper intrinsic, check if there is another
- vzeroupper. */
- insn = NEXT_INSN (curr_insn);
- while (insn)
- {
- if (NONJUMP_INSN_P (insn)
- && GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
- && XINT (PATTERN (insn), 1) == UNSPECV_VZEROUPPER)
- {
- if (dump_file)
- {
- fprintf (dump_file,
- "Delete redundant vzeroupper intrinsic:\n");
- print_rtl_single (dump_file, curr_insn);
- }
- delete_insn (curr_insn);
- insn = NULL;
- continue;
- }
-
- if (JUMP_P (insn) || CALL_P (insn))
- break;
- prev_insn = insn;
- insn = NEXT_INSN (insn);
- if (insn == NEXT_INSN (BB_END (bb)))
- break;
+ print_rtl_single (dump_file, insn);
}
-
- /* Continue if redundant vzeroupper intrinsic is deleted. */
- if (!insn)
- continue;
+ delete_insn (insn);
}
else
{
- /* Find the next jump/call. */
- insn = NEXT_INSN (curr_insn);
- while (insn)
- {
- if (JUMP_P (insn) || CALL_P (insn))
- break;
- prev_insn = insn;
- insn = NEXT_INSN (insn);
- if (insn == NEXT_INSN (BB_END (bb)))
- break;
- }
-
- if (!insn)
- gcc_unreachable();
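+ /* Keep vzeroupper; remember it so that it can be moved just
+ before the next jump/call or deleted if it turns out to be
+ redundant. */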
+ upper_128bits_set = false;
+ vzeroupper_insn = insn;
}
-
- /* Keep vzeroupper. */
- upper_128bits_set = false;
-
- /* Also allow label as the next instruction. */
- if (insn == NEXT_INSN (BB_END (bb)) && !LABEL_P (insn))
- gcc_unreachable();
-
- /* Move vzeroupper before jump/call if neeeded. */
- if (curr_insn != prev_insn)
- {
- reorder_insns_nobb (curr_insn, curr_insn, prev_insn);
- if (dump_file)
- {
- fprintf (dump_file, "Move vzeroupper after:\n");
- print_rtl_single (dump_file, prev_insn);
- fprintf (dump_file, "before:\n");
- print_rtl_single (dump_file, insn);
- }
- }
-
- next_insn = NEXT_INSN (insn);
}
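+ /* Record the state at the end of BB for use when its successor
+ blocks are processed. */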
BLOCK_INFO (bb)->upper_128bits_set = upper_128bits_set;
}
- /* Emit vzeroupper if needed. */
+ /* Add UNSPEC_CALL_NEEDS_VZEROUPPER decoration. */
if (TARGET_VZEROUPPER && cfun->machine->use_avx256_p)
{
+ rtx unspec;
int avx256;
+
cfun->machine->use_vzeroupper_p = 1;
if (cfun->machine->callee_pass_avx256_p)
{
if (cfun->machine->callee_return_avx256_p)
avx256 = callee_return_pass_avx256;
else
avx256 = callee_pass_avx256;
}
else if (cfun->machine->callee_return_avx256_p)
avx256 = callee_return_avx256;
else
avx256 = call_no_avx256;
- emit_insn (gen_avx_vzeroupper (GEN_INT (avx256)));
+
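+ /* Attach the AVX256 usage information to the call insn; it is
+ split into a vzeroupper and the call itself later (see
+ ix86_split_call_vzeroupper below). */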
+ unspec = gen_rtx_UNSPEC (VOIDmode,
+ gen_rtvec (1, GEN_INT (avx256)),
+ UNSPEC_CALL_NEEDS_VZEROUPPER);
+ call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, call, unspec));
}
call = emit_call_insn (call);
return call;
}
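+/* Split INSN, a call decorated with UNSPEC_CALL_NEEDS_VZEROUPPER,
+ into a vzeroupper insn followed by the call itself. VZEROUPPER
+ is the AVX256 usage operand for the vzeroupper pattern. */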
+void
+ix86_split_call_vzeroupper (rtx insn, rtx vzeroupper)
+{
+ rtx call = XVECEXP (PATTERN (insn), 0, 0);
+ emit_insn (gen_avx_vzeroupper (vzeroupper));
+ emit_call_insn (call);
+}
+
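+/* As above, but for a decorated call whose PARALLEL also contains
+ a stack-pop set. */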
+void
+ix86_split_call_pop_vzeroupper (rtx insn, rtx vzeroupper)
+{
+ rtx call = XVECEXP (PATTERN (insn), 0, 0);
+ rtx pop = XVECEXP (PATTERN (insn), 0, 1);
+ emit_insn (gen_avx_vzeroupper (vzeroupper));
+ emit_call_insn (gen_rtx_PARALLEL (VOIDmode,
+ gen_rtvec (2, call, pop)));
+}
+
/* Output the assembly for a call instruction. */
const char *
case PROCESSOR_K6:
return 1;
+ case PROCESSOR_CORE2:
+ case PROCESSOR_COREI7_32:
+ case PROCESSOR_COREI7_64:
+ /* Generally, we want haifa-sched:max_issue() to look ahead as far
+ as the number of instructions that can be executed in one cycle,
+ i.e., issue_rate. It is unclear why the tuning for many other
+ CPUs does not do the same. */
+ return ix86_issue_rate ();
+
default:
return 0;
}
}
\f
+
+/* Model the decoder of Core 2/i7.
+ The hooks below for multipass scheduling (see haifa-sched.c:max_issue)
+ track the instruction fetch block boundaries and make sure that long
+ (9+ byte) instructions are assigned to decoder D0. */
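+
+/* For example, with the parameters below a 9-byte insn does not fit
+ a secondary decoder (8 bytes maximum), so it can only be issued
+ as the first insn considered on a cycle, i.e., on D0. */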
+
+/* Maximum length of an insn that can be handled by
+ a secondary decoder unit. '8' for Core 2/i7. */
+static int core2i7_secondary_decoder_max_insn_size;
+
+/* Ifetch block size, i.e., number of bytes decoder reads per cycle.
+ '16' for Core 2/i7. */
+static int core2i7_ifetch_block_size;
+
+/* Maximum number of instructions decoder can handle per cycle.
+ '6' for Core 2/i7. */
+static int core2i7_ifetch_block_max_insns;
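+
+/* The decoder parameters above are initialized in
+ ix86_sched_init_global when tuning for Core 2/i7. */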
+
+typedef struct ix86_first_cycle_multipass_data_ *
+ ix86_first_cycle_multipass_data_t;
+typedef const struct ix86_first_cycle_multipass_data_ *
+ const_ix86_first_cycle_multipass_data_t;
+
+/* A variable to store target state across calls to max_issue within
+ one cycle. */
+static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
+ *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;
+
+/* Initialize DATA. */
+static void
+core2i7_first_cycle_multipass_init (void *_data)
+{
+ ix86_first_cycle_multipass_data_t data
+ = (ix86_first_cycle_multipass_data_t) _data;
+
+ data->ifetch_block_len = 0;
+ data->ifetch_block_n_insns = 0;
+ data->ready_try_change = NULL;
+ data->ready_try_change_size = 0;
+}
+
+/* Called after the scheduler advances the cycle; reset the ifetch
+ block counts. */
+static void
+core2i7_dfa_post_advance_cycle (void)
+{
+ ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;
+
+ gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
+
+ data->ifetch_block_len = 0;
+ data->ifetch_block_n_insns = 0;
+}
+
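+/* Estimate of the minimum encoded size of INSN in bytes; defined
+ later in this file. */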
+static int min_insn_size (rtx);
+
+/* Filter out insns from ready_try that the core will not be able to
+ issue on the current cycle due to decoder restrictions. */
+static void
+core2i7_first_cycle_multipass_filter_ready_try
+(const_ix86_first_cycle_multipass_data_t data,
+ char *ready_try, int n_ready, bool first_cycle_insn_p)
+{
+ while (n_ready--)
+ {
+ rtx insn;
+ int insn_size;
+
+ if (ready_try[n_ready])
+ continue;
+
+ insn = get_ready_element (n_ready);
+ insn_size = min_insn_size (insn);
+
+ if (/* If the insn is too long for a secondary decoder ... */
+ (!first_cycle_insn_p
+ && insn_size > core2i7_secondary_decoder_max_insn_size)
+ /* ... or it would not fit into the ifetch block ... */
+ || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
+ /* ... or the decoder is full already ... */
+ || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
+ /* ... mask the insn out. */
+ {
+ ready_try[n_ready] = 1;
+
+ if (data->ready_try_change)
+ SET_BIT (data->ready_try_change, n_ready);
+ }
+ }
+}
+
+/* Prepare for a new round of multipass lookahead scheduling. */
+static void
+core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
+ bool first_cycle_insn_p)
+{
+ ix86_first_cycle_multipass_data_t data
+ = (ix86_first_cycle_multipass_data_t) _data;
+ const_ix86_first_cycle_multipass_data_t prev_data
+ = ix86_first_cycle_multipass_data;
+
+ /* Restore the state from the end of the previous round. */
+ data->ifetch_block_len = prev_data->ifetch_block_len;
+ data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;
+
+ /* Filter instructions that cannot be issued on current cycle due to
+ decoder restrictions. */
+ core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
+ first_cycle_insn_p);
+}
+
+/* INSN is being issued in the current solution. Account for its
+ impact on the decoder model. */
+static void
+core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
+ rtx insn, const void *_prev_data)
+{
+ ix86_first_cycle_multipass_data_t data
+ = (ix86_first_cycle_multipass_data_t) _data;
+ const_ix86_first_cycle_multipass_data_t prev_data
+ = (const_ix86_first_cycle_multipass_data_t) _prev_data;
+
+ int insn_size = min_insn_size (insn);
+
+ data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
+ data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
+ gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
+ && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);
+
+ /* Allocate or resize the bitmap for storing INSN's effect on ready_try. */
+ if (!data->ready_try_change)
+ {
+ data->ready_try_change = sbitmap_alloc (n_ready);
+ data->ready_try_change_size = n_ready;
+ }
+ else if (data->ready_try_change_size < n_ready)
+ {
+ data->ready_try_change = sbitmap_resize (data->ready_try_change,
+ n_ready, 0);
+ data->ready_try_change_size = n_ready;
+ }
+ sbitmap_zero (data->ready_try_change);
+
+ /* Filter out insns from ready_try that the core will not be able
+ to issue on the current cycle due to decoder restrictions. */
+ core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
+ false);
+}
+
+/* Revert the effect on ready_try. */
+static void
+core2i7_first_cycle_multipass_backtrack (const void *_data,
+ char *ready_try,
+ int n_ready ATTRIBUTE_UNUSED)
+{
+ const_ix86_first_cycle_multipass_data_t data
+ = (const_ix86_first_cycle_multipass_data_t) _data;
+ unsigned int i = 0;
+ sbitmap_iterator sbi;
+
+ gcc_assert (sbitmap_last_set_bit (data->ready_try_change) < n_ready);
+ EXECUTE_IF_SET_IN_SBITMAP (data->ready_try_change, 0, i, sbi)
+ {
+ ready_try[i] = 0;
+ }
+}
+
+/* Save the result of multipass lookahead scheduling for the next round. */
+static void
+core2i7_first_cycle_multipass_end (const void *_data)
+{
+ const_ix86_first_cycle_multipass_data_t data
+ = (const_ix86_first_cycle_multipass_data_t) _data;
+ ix86_first_cycle_multipass_data_t next_data
+ = ix86_first_cycle_multipass_data;
+
+ if (data != NULL)
+ {
+ next_data->ifetch_block_len = data->ifetch_block_len;
+ next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
+ }
+}
+
+/* Deallocate target data. */
+static void
+core2i7_first_cycle_multipass_fini (void *_data)
+{
+ ix86_first_cycle_multipass_data_t data
+ = (ix86_first_cycle_multipass_data_t) _data;
+
+ if (data->ready_try_change)
+ {
+ sbitmap_free (data->ready_try_change);
+ data->ready_try_change = NULL;
+ data->ready_try_change_size = 0;
+ }
+}
+
+/* Prepare for scheduling pass. */
+static void
+ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
+ int verbose ATTRIBUTE_UNUSED,
+ int max_uid ATTRIBUTE_UNUSED)
+{
+ /* Install scheduling hooks for current CPU. Some of these hooks are used
+ in time-critical parts of the scheduler, so we only set them up when
+ they are actually used. */
+ switch (ix86_tune)
+ {
+ case PROCESSOR_CORE2:
+ case PROCESSOR_COREI7_32:
+ case PROCESSOR_COREI7_64:
+ targetm.sched.dfa_post_advance_cycle
+ = core2i7_dfa_post_advance_cycle;
+ targetm.sched.first_cycle_multipass_init
+ = core2i7_first_cycle_multipass_init;
+ targetm.sched.first_cycle_multipass_begin
+ = core2i7_first_cycle_multipass_begin;
+ targetm.sched.first_cycle_multipass_issue
+ = core2i7_first_cycle_multipass_issue;
+ targetm.sched.first_cycle_multipass_backtrack
+ = core2i7_first_cycle_multipass_backtrack;
+ targetm.sched.first_cycle_multipass_end
+ = core2i7_first_cycle_multipass_end;
+ targetm.sched.first_cycle_multipass_fini
+ = core2i7_first_cycle_multipass_fini;
+
+ /* Set decoder parameters. */
+ core2i7_secondary_decoder_max_insn_size = 8;
+ core2i7_ifetch_block_size = 16;
+ core2i7_ifetch_block_max_insns = 6;
+ break;
+
+ default:
+ targetm.sched.dfa_post_advance_cycle = NULL;
+ targetm.sched.first_cycle_multipass_init = NULL;
+ targetm.sched.first_cycle_multipass_begin = NULL;
+ targetm.sched.first_cycle_multipass_issue = NULL;
+ targetm.sched.first_cycle_multipass_backtrack = NULL;
+ targetm.sched.first_cycle_multipass_end = NULL;
+ targetm.sched.first_cycle_multipass_fini = NULL;
+ break;
+ }
+}
+
+\f
/* Compute the alignment given to a constant that is being placed in memory.
EXP is the constant and ALIGN is the alignment that the object would
ordinarily have.
#undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
#define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra
+#undef TARGET_SCHED_INIT_GLOBAL
+#define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
#undef TARGET_SCHED_ADJUST_COST
#define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
#undef TARGET_SCHED_ISSUE_RATE