/* Subroutines used for code generation on IBM S/390 and zSeries
Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
- 2007, 2008, 2009 Free Software Foundation, Inc.
+ 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
Contributed by Hartmut Penner (hpenner@de.ibm.com) and
Ulrich Weigand (uweigand@de.ibm.com) and
Andreas Krebbel (Andreas.Krebbel@de.ibm.com).
#include "optabs.h"
#include "gimple.h"
#include "df.h"
+#include "params.h"
+#include "cfgloop.h"
/* Define the specific costs for a given cpu. */
COSTS_N_INSNS (10), /* MSGFR */
COSTS_N_INSNS (10), /* MSGR */
COSTS_N_INSNS (10), /* MSR */
- COSTS_N_INSNS (10), /* multiplication in DFmode */
+ COSTS_N_INSNS (1) , /* multiplication in DFmode */
COSTS_N_INSNS (50), /* MXBR */
COSTS_N_INSNS (120), /* SQXBR */
COSTS_N_INSNS (52), /* SQDBR */
COSTS_N_INSNS (38), /* SQEBR */
- COSTS_N_INSNS (10), /* MADBR */
- COSTS_N_INSNS (10), /* MAEBR */
+ COSTS_N_INSNS (1), /* MADBR */
+ COSTS_N_INSNS (1), /* MAEBR */
COSTS_N_INSNS (111), /* DXBR */
COSTS_N_INSNS (39), /* DDBR */
COSTS_N_INSNS (32), /* DEBR */
extern int reload_completed;
+/* Kept up to date using the SCHED_VARIABLE_ISSUE hook. */
+static rtx last_scheduled_insn;
+
/* Structure used to hold the components of a S/390 memory
address. A legitimate address on S/390 is of the general
form
#define REGNO_PAIR_OK(REGNO, MODE) \
(HARD_REGNO_NREGS ((REGNO), (MODE)) == 1 || !((REGNO) & 1))
+/* That's the read ahead of the dynamic branch prediction unit in
+ bytes on a z10 CPU. */
+#define Z10_PREDICT_DISTANCE 384
+
static enum machine_mode
s390_libgcc_cmp_return_mode (void)
{
s390_scalar_mode_supported_p (enum machine_mode mode)
{
if (DECIMAL_FLOAT_MODE_P (mode))
- return true;
+ return default_decimal_float_supported_p ();
else
return default_scalar_mode_supported_p (mode);
}
if (!(target_flags_explicit & MASK_LONG_DOUBLE_128))
target_flags |= MASK_LONG_DOUBLE_128;
#endif
+
+ if (s390_tune == PROCESSOR_2097_Z10)
+ {
+ if (!PARAM_SET_P (PARAM_MAX_UNROLLED_INSNS))
+ set_param_value ("max-unrolled-insns", 100);
+ if (!PARAM_SET_P (PARAM_MAX_UNROLL_TIMES))
+ set_param_value ("max-unroll-times", 32);
+ if (!PARAM_SET_P (PARAM_MAX_COMPLETELY_PEELED_INSNS))
+ set_param_value ("max-completely-peeled-insns", 800);
+ if (!PARAM_SET_P (PARAM_MAX_COMPLETELY_PEEL_TIMES))
+ set_param_value ("max-completely-peel-times", 64);
+ }
+
+ set_param_value ("max-pending-list-length", 256);
}
/* Map for smallest class containing reg regno. */
if (!disp)
return true;
+ /* Without the long displacement facility we don't need to
+     distinguish between long and short displacement. */
+ if (!TARGET_LONG_DISPLACEMENT)
+ return true;
+
/* Integer displacement in range. */
if (GET_CODE (disp) == CONST_INT)
return INTVAL (disp) >= 0 && INTVAL (disp) < 4096;
}
-/* Evaluates constraint strings described by the regular expression
- ([A|B](Q|R|S|T))|U|W and returns 1 if OP is a valid operand for the
- constraint given in STR, or 0 else. */
+/* Return true if ADDR is of kind symbol_ref or symbol_ref + const_int
+ and return these parts in SYMREF and ADDEND. You can pass NULL in
+ SYMREF and/or ADDEND if you are not interested in these values. */
-int
-s390_mem_constraint (const char *str, rtx op)
+static bool
+s390_symref_operand_p (rtx addr, rtx *symref, HOST_WIDE_INT *addend)
{
- struct s390_address addr;
- char c = str[0];
+ HOST_WIDE_INT tmpaddend = 0;
+
+ if (GET_CODE (addr) == CONST)
+ addr = XEXP (addr, 0);
- /* Check for offsettable variants of memory constraints. */
- if (c == 'A')
+ if (GET_CODE (addr) == PLUS)
{
- /* Only accept non-volatile MEMs. */
- if (!MEM_P (op) || MEM_VOLATILE_P (op))
- return 0;
+ if (GET_CODE (XEXP (addr, 0)) == SYMBOL_REF
+ && CONST_INT_P (XEXP (addr, 1)))
+ {
+ tmpaddend = INTVAL (XEXP (addr, 1));
+ addr = XEXP (addr, 0);
+ }
+ else
+ return false;
+ }
+ else
+ if (GET_CODE (addr) != SYMBOL_REF)
+ return false;
- if ((reload_completed || reload_in_progress)
- ? !offsettable_memref_p (op) : !offsettable_nonstrict_memref_p (op))
- return 0;
+ if (symref)
+ *symref = addr;
+ if (addend)
+ *addend = tmpaddend;
- c = str[1];
- }
+ return true;
+}
+
+
+/* Return true if the address in OP is valid for constraint letter C
+   if wrapped in a MEM rtx. Set LIT_POOL_OK to true if literal
+ pool MEMs should be accepted. Only the Q, R, S, T constraint
+ letters are allowed for C. */
+
+static int
+s390_check_qrst_address (char c, rtx op, bool lit_pool_ok)
+{
+ struct s390_address addr;
+ bool decomposed = false;
- /* Check for non-literal-pool variants of memory constraints. */
- else if (c == 'B')
+ /* This check makes sure that no symbolic address (except literal
+ pool references) are accepted by the R or T constraints. */
+ if (s390_symref_operand_p (op, NULL, NULL))
{
- if (GET_CODE (op) != MEM)
+ if (!lit_pool_ok)
return 0;
- if (!s390_decompose_address (XEXP (op, 0), &addr))
+ if (!s390_decompose_address (op, &addr))
return 0;
- if (addr.literal_pool)
+ if (!addr.literal_pool)
return 0;
-
- c = str[1];
+ decomposed = true;
}
switch (c)
{
- case 'Q':
- if (GET_CODE (op) != MEM)
- return 0;
- if (!s390_decompose_address (XEXP (op, 0), &addr))
+ case 'Q': /* no index short displacement */
+ if (!decomposed && !s390_decompose_address (op, &addr))
return 0;
if (addr.indx)
return 0;
-
- if (TARGET_LONG_DISPLACEMENT)
- {
- if (!s390_short_displacement (addr.disp))
- return 0;
- }
- break;
-
- case 'R':
- if (GET_CODE (op) != MEM)
+ if (!s390_short_displacement (addr.disp))
return 0;
+ break;
+ case 'R': /* with index short displacement */
if (TARGET_LONG_DISPLACEMENT)
{
- if (!s390_decompose_address (XEXP (op, 0), &addr))
+ if (!decomposed && !s390_decompose_address (op, &addr))
return 0;
if (!s390_short_displacement (addr.disp))
return 0;
}
+ /* Any invalid address here will be fixed up by reload,
+ so accept it for the most generic constraint. */
break;
- case 'S':
+ case 'S': /* no index long displacement */
if (!TARGET_LONG_DISPLACEMENT)
return 0;
- if (GET_CODE (op) != MEM)
- return 0;
- if (!s390_decompose_address (XEXP (op, 0), &addr))
+ if (!decomposed && !s390_decompose_address (op, &addr))
return 0;
if (addr.indx)
return 0;
return 0;
break;
- case 'T':
+ case 'T': /* with index long displacement */
if (!TARGET_LONG_DISPLACEMENT)
return 0;
- if (GET_CODE (op) != MEM)
- return 0;
- if (!s390_decompose_address (XEXP (op, 0), &addr))
- return 0;
- if (s390_short_displacement (addr.disp))
+ /* Any invalid address here will be fixed up by reload,
+ so accept it for the most generic constraint. */
+ if ((decomposed || s390_decompose_address (op, &addr))
+ && s390_short_displacement (addr.disp))
return 0;
break;
+ default:
+ return 0;
+ }
+ return 1;
+}
- case 'U':
- if (TARGET_LONG_DISPLACEMENT)
- {
- if (!s390_decompose_address (op, &addr))
- return 0;
- if (!s390_short_displacement (addr.disp))
- return 0;
- }
- break;
- case 'W':
- if (!TARGET_LONG_DISPLACEMENT)
+/* Evaluates constraint strings described by the regular expression
+ ([A|B|Z](Q|R|S|T))|U|W|Y and returns 1 if OP is a valid operand for
+ the constraint given in STR, or 0 else. */
+
+int
+s390_mem_constraint (const char *str, rtx op)
+{
+ char c = str[0];
+
+ switch (c)
+ {
+ case 'A':
+ /* Check for offsettable variants of memory constraints. */
+ if (!MEM_P (op) || MEM_VOLATILE_P (op))
return 0;
- if (!s390_decompose_address (op, &addr))
+ if ((reload_completed || reload_in_progress)
+ ? !offsettable_memref_p (op) : !offsettable_nonstrict_memref_p (op))
return 0;
- if (s390_short_displacement (addr.disp))
+ return s390_check_qrst_address (str[1], XEXP (op, 0), true);
+ case 'B':
+ /* Check for non-literal-pool variants of memory constraints. */
+ if (!MEM_P (op))
return 0;
- break;
-
+ return s390_check_qrst_address (str[1], XEXP (op, 0), false);
+ case 'Q':
+ case 'R':
+ case 'S':
+ case 'T':
+ if (GET_CODE (op) != MEM)
+ return 0;
+ return s390_check_qrst_address (c, XEXP (op, 0), true);
+ case 'U':
+ return (s390_check_qrst_address ('Q', op, true)
+ || s390_check_qrst_address ('R', op, true));
+ case 'W':
+ return (s390_check_qrst_address ('S', op, true)
+ || s390_check_qrst_address ('T', op, true));
case 'Y':
/* Simply check for the basic form of a shift count. Reload will
take care of making sure we have a proper base register. */
if (!s390_decompose_shift_count (op, NULL, NULL))
return 0;
break;
-
+ case 'Z':
+ return s390_check_qrst_address (str[1], op, true);
default:
return 0;
}
-
return 1;
}
-
/* Evaluates constraint strings starting with letter O. Input
parameter C is the second letter following the "O" in the constraint
string. Returns 1 if VALUE meets the respective constraint and 0
return rclass;
}
-/* Return true if ADDR is of kind symbol_ref or symbol_ref + const_int
- and return these parts in SYMREF and ADDEND. You can pass NULL in
- SYMREF and/or ADDEND if you are not interested in these values. */
-
-static bool
-s390_symref_operand_p (rtx addr, rtx *symref, HOST_WIDE_INT *addend)
-{
- HOST_WIDE_INT tmpaddend = 0;
-
- if (GET_CODE (addr) == CONST)
- addr = XEXP (addr, 0);
-
- if (GET_CODE (addr) == PLUS)
- {
- if (GET_CODE (XEXP (addr, 0)) == SYMBOL_REF
- && CONST_INT_P (XEXP (addr, 1)))
- {
- tmpaddend = INTVAL (XEXP (addr, 1));
- addr = XEXP (addr, 0);
- }
- else
- return false;
- }
- else
- if (GET_CODE (addr) != SYMBOL_REF)
- return false;
-
- if (symref)
- *symref = addr;
- if (addend)
- *addend = tmpaddend;
-
- return true;
-}
-
/* Return true if ADDR is SYMBOL_REF + addend with addend being a
multiple of ALIGNMENT and the SYMBOL_REF being naturally
aligned. */
emit_label (loop_start_label);
+ if (TARGET_Z10
+ && (GET_CODE (len) != CONST_INT || INTVAL (len) > 768))
+ {
+ rtx prefetch;
+
+ /* Issue a read prefetch for the +3 cache line. */
+ prefetch = gen_prefetch (gen_rtx_PLUS (Pmode, src_addr, GEN_INT (768)),
+ const0_rtx, const0_rtx);
+ PREFETCH_SCHEDULE_BARRIER_P (prefetch) = true;
+ emit_insn (prefetch);
+
+ /* Issue a write prefetch for the +3 cache line. */
+ prefetch = gen_prefetch (gen_rtx_PLUS (Pmode, dst_addr, GEN_INT (768)),
+ const1_rtx, const0_rtx);
+ PREFETCH_SCHEDULE_BARRIER_P (prefetch) = true;
+ emit_insn (prefetch);
+ }
+
emit_insn (gen_movmem_short (dst, src, GEN_INT (255)));
s390_load_address (dst_addr,
gen_rtx_PLUS (Pmode, dst_addr, GEN_INT (256)));
emit_label (loop_start_label);
+ if (TARGET_Z10
+ && (GET_CODE (len) != CONST_INT || INTVAL (len) > 1024))
+ {
+ /* Issue a write prefetch for the +4 cache line. */
+ rtx prefetch = gen_prefetch (gen_rtx_PLUS (Pmode, dst_addr,
+ GEN_INT (1024)),
+ const1_rtx, const0_rtx);
+ emit_insn (prefetch);
+ PREFETCH_SCHEDULE_BARRIER_P (prefetch) = true;
+ }
+
if (val == const0_rtx)
emit_insn (gen_clrmem_short (dst, GEN_INT (255)));
else
emit_label (loop_start_label);
+ if (TARGET_Z10
+ && (GET_CODE (len) != CONST_INT || INTVAL (len) > 512))
+ {
+ rtx prefetch;
+
+ /* Issue a read prefetch for the +2 cache line of operand 1. */
+ prefetch = gen_prefetch (gen_rtx_PLUS (Pmode, addr0, GEN_INT (512)),
+ const0_rtx, const0_rtx);
+ emit_insn (prefetch);
+ PREFETCH_SCHEDULE_BARRIER_P (prefetch) = true;
+
+ /* Issue a read prefetch for the +2 cache line of operand 2. */
+ prefetch = gen_prefetch (gen_rtx_PLUS (Pmode, addr1, GEN_INT (512)),
+ const0_rtx, const0_rtx);
+ emit_insn (prefetch);
+ PREFETCH_SCHEDULE_BARRIER_P (prefetch) = true;
+ }
+
emit_insn (gen_cmpmem_short (op0, op1, GEN_INT (255)));
temp = gen_rtx_NE (VOIDmode, ccreg, const0_rtx);
temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
static rtx
s390_delegitimize_address (rtx orig_x)
{
- rtx x = orig_x, y;
+ rtx x, y;
+ orig_x = delegitimize_mem_from_attrs (orig_x);
+ x = orig_x;
if (GET_CODE (x) != MEM)
return orig_x;
'C': print opcode suffix for branch condition.
'D': print opcode suffix for inverse branch condition.
+ 'E': print opcode suffix for branch on index instruction.
'J': print tls_load/tls_gdcall/tls_ldcall suffix
'G': print the size of the operand in bytes.
'O': print only the displacement of a memory reference.
fprintf (file, s390_branch_condition_mnemonic (x, TRUE));
return;
+ case 'E':
+ if (GET_CODE (x) == LE)
+ fprintf (file, "l");
+ else if (GET_CODE (x) == GT)
+ fprintf (file, "h");
+ else
+ gcc_unreachable ();
+ return;
+
case 'J':
if (GET_CODE (x) == SYMBOL_REF)
{
A STD instruction should be scheduled earlier,
in order to use the bypass. */
+
static int
s390_adjust_priority (rtx insn ATTRIBUTE_UNUSED, int priority)
{
return priority;
if (s390_tune != PROCESSOR_2084_Z990
- && s390_tune != PROCESSOR_2094_Z9_109)
+ && s390_tune != PROCESSOR_2094_Z9_109
+ && s390_tune != PROCESSOR_2097_Z10)
return priority;
switch (s390_safe_attr_type (insn))
return priority;
}
+
/* The number of instructions that can be issued per cycle. */
static int
/* Return true if register FROM can be eliminated via register TO. */
-bool
-s390_can_eliminate (int from, int to)
+static bool
+s390_can_eliminate (const int from, const int to)
{
/* On zSeries machines, we have not marked the base register as fixed.
Instead, we have an elimination rule BASE_REGNUM -> BASE_REGNUM.
}
else
{
- HOST_WIDE_INT stack_check_mask = ((s390_stack_size - 1)
- & ~(stack_guard - 1));
- rtx t = gen_rtx_AND (Pmode, stack_pointer_rtx,
- GEN_INT (stack_check_mask));
- if (TARGET_64BIT)
- emit_insn (gen_ctrapdi4 (gen_rtx_EQ (VOIDmode, t, const0_rtx),
- t, const0_rtx, const0_rtx));
+ /* stack_guard has to be smaller than s390_stack_size.
+ Otherwise we would emit an AND with zero which would
+ not match the test under mask pattern. */
+ if (stack_guard >= s390_stack_size)
+ {
+ warning (0, "frame size of function %qs is "
+ HOST_WIDE_INT_PRINT_DEC
+ " bytes which is more than half the stack size. "
+ "The dynamic check would not be reliable. "
+ "No check emitted for this function.",
+ current_function_name(),
+ cfun_frame_layout.frame_size);
+ }
else
- emit_insn (gen_ctrapsi4 (gen_rtx_EQ (VOIDmode, t, const0_rtx),
- t, const0_rtx, const0_rtx));
+ {
+ HOST_WIDE_INT stack_check_mask = ((s390_stack_size - 1)
+ & ~(stack_guard - 1));
+
+ rtx t = gen_rtx_AND (Pmode, stack_pointer_rtx,
+ GEN_INT (stack_check_mask));
+ if (TARGET_64BIT)
+ emit_insn (gen_ctrapdi4 (gen_rtx_EQ (VOIDmode,
+ t, const0_rtx),
+ t, const0_rtx, const0_rtx));
+ else
+ emit_insn (gen_ctrapsi4 (gen_rtx_EQ (VOIDmode,
+ t, const0_rtx),
+ t, const0_rtx, const0_rtx));
+ }
}
}
return true;
}
+/* Function arguments and return values are promoted to word size. */
+
+/* Implement TARGET_PROMOTE_FUNCTION_MODE.  Sub-word integral modes
+   are widened to the full word mode (Pmode); for pointer types the
+   extension is unsigned, recorded through *PUNSIGNEDP.  FNTYPE and
+   FOR_RETURN are unused.  */
+static enum machine_mode
+s390_promote_function_mode (const_tree type, enum machine_mode mode,
+			    int *punsignedp,
+			    const_tree fntype ATTRIBUTE_UNUSED,
+			    int for_return ATTRIBUTE_UNUSED)
+{
+  if (INTEGRAL_MODE_P (mode)
+      && GET_MODE_SIZE (mode) < UNITS_PER_WORD)
+    {
+      if (POINTER_TYPE_P (type))
+	*punsignedp = POINTERS_EXTEND_UNSIGNED;
+      return Pmode;
+    }
+
+  return mode;
+}
+
/* Define where to return a (scalar) value of type TYPE.
If TYPE is null, define where to return a (scalar)
value of mode MODE from a libcall. */
rtx
-s390_function_value (const_tree type, enum machine_mode mode)
+s390_function_value (const_tree type, const_tree fn, enum machine_mode mode)
{
if (type)
{
int unsignedp = TYPE_UNSIGNED (type);
- mode = promote_mode (type, TYPE_MODE (type), &unsignedp, 1);
+ mode = promote_function_mode (type, TYPE_MODE (type), &unsignedp, fn, 1);
}
gcc_assert (GET_MODE_CLASS (mode) == MODE_INT || SCALAR_FLOAT_MODE_P (mode));
On S/390, we use gpr 1 internally in the trampoline code;
gpr 0 is used to hold the static chain. */
-void
-s390_trampoline_template (FILE *file)
+static void
+s390_asm_trampoline_template (FILE *file)
{
rtx op[2];
op[0] = gen_rtx_REG (Pmode, 0);
FNADDR is an RTX for the address of the function's pure code.
CXT is an RTX for the static chain value for the function. */
-void
-s390_initialize_trampoline (rtx addr, rtx fnaddr, rtx cxt)
+static void
+s390_trampoline_init (rtx m_tramp, tree fndecl, rtx cxt)
{
- emit_move_insn (gen_rtx_MEM (Pmode,
- memory_address (Pmode,
- plus_constant (addr, (TARGET_64BIT ? 16 : 8)))), cxt);
- emit_move_insn (gen_rtx_MEM (Pmode,
- memory_address (Pmode,
- plus_constant (addr, (TARGET_64BIT ? 24 : 12)))), fnaddr);
+ rtx fnaddr = XEXP (DECL_RTL (fndecl), 0);
+ rtx mem;
+
+ emit_block_move (m_tramp, assemble_trampoline_template (),
+ GEN_INT (2*UNITS_PER_WORD), BLOCK_OP_NORMAL);
+
+ mem = adjust_address (m_tramp, Pmode, 2*UNITS_PER_WORD);
+ emit_move_insn (mem, cxt);
+ mem = adjust_address (m_tramp, Pmode, 3*UNITS_PER_WORD);
+ emit_move_insn (mem, fnaddr);
}
/* Output assembler code to FILE to increment profiler label # LABELNO
&& GET_CODE (XEXP (rtl, 0)) == SYMBOL_REF
&& TREE_CONSTANT_POOL_ADDRESS_P (XEXP (rtl, 0))
&& (MEM_ALIGN (rtl) == 0
+ || GET_MODE_BITSIZE (GET_MODE (rtl)) == 0
|| MEM_ALIGN (rtl) < GET_MODE_BITSIZE (GET_MODE (rtl))))
SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_NOT_NATURALLY_ALIGNED;
}
rtx op[10];
int nonlocal = 0;
+ /* Make sure unwind info is emitted for the thunk if needed. */
+ final_start_function (emit_barrier (), file, 1);
+
/* Operand 0 is the target function. */
op[0] = XEXP (DECL_RTL (function), 0);
if (flag_pic && !SYMBOL_REF_LOCAL_P (op[0]))
output_asm_insn (".long\t%3", op);
}
}
+ final_end_function ();
}
static bool
}
}
+/* On z10 the dynamic branch prediction must see the backward jump in
+   a window of 384 bytes. If not it falls back to the static
+   prediction. This function rearranges the loop backward branch in a
+   way which makes the static prediction always correct. The function
+   returns true if it added an instruction. */
+static bool
+s390_z10_fix_long_loop_prediction (rtx insn)
+{
+  rtx set = single_set (insn);
+  rtx code_label, label_ref, new_label;
+  rtx uncond_jump;
+  rtx cur_insn;
+  rtx tmp;
+  int distance;
+
+  /* This will exclude branch on count and branch on index patterns
+     since these are correctly statically predicted. */
+  if (!set
+      || SET_DEST (set) != pc_rtx
+      || GET_CODE (SET_SRC(set)) != IF_THEN_ELSE)
+    return false;
+
+  /* Pick whichever arm of the IF_THEN_ELSE holds the branch target;
+     the other arm is the fall-through (pc).  */
+  label_ref = (GET_CODE (XEXP (SET_SRC (set), 1)) == LABEL_REF ?
+	       XEXP (SET_SRC (set), 1) : XEXP (SET_SRC (set), 2));
+
+  gcc_assert (GET_CODE (label_ref) == LABEL_REF);
+
+  code_label = XEXP (label_ref, 0);
+
+  /* Nothing to do unless both addresses are known and the backward
+     branch spans at least Z10_PREDICT_DISTANCE bytes, i.e. the loop
+     head lies outside the predictor's read-ahead window.  */
+  if (INSN_ADDRESSES (INSN_UID (code_label)) == -1
+      || INSN_ADDRESSES (INSN_UID (insn)) == -1
+      || (INSN_ADDRESSES (INSN_UID (insn))
+	  - INSN_ADDRESSES (INSN_UID (code_label)) < Z10_PREDICT_DISTANCE))
+    return false;
+
+  /* Give up if another jump or label occurs in the bytes just before
+     INSN (the -6 slack presumably accounts for the insns added below
+     -- TODO confirm).  */
+  for (distance = 0, cur_insn = PREV_INSN (insn);
+       distance < Z10_PREDICT_DISTANCE - 6;
+       distance += get_attr_length (cur_insn), cur_insn = PREV_INSN (cur_insn))
+    if (!cur_insn || JUMP_P (cur_insn) || LABEL_P (cur_insn))
+      return false;
+
+  /* Emit an unconditional jump back to the loop head right after
+     INSN, followed by a new local label.  */
+  new_label = gen_label_rtx ();
+  uncond_jump = emit_jump_insn_after (
+		  gen_rtx_SET (VOIDmode, pc_rtx,
+			       gen_rtx_LABEL_REF (VOIDmode, code_label)),
+		  insn);
+  emit_label_after (new_label, uncond_jump);
+
+  /* Invert the condition of INSN by swapping the IF_THEN_ELSE arms;
+     INSN_CODE is cleared so the modified insn gets re-recognized.  */
+  tmp = XEXP (SET_SRC (set), 1);
+  XEXP (SET_SRC (set), 1) = XEXP (SET_SRC (set), 2);
+  XEXP (SET_SRC (set), 2) = tmp;
+  INSN_CODE (insn) = -1;
+
+  /* INSN now branches forward over UNCOND_JUMP to NEW_LABEL, while
+     UNCOND_JUMP carries the long backward branch to CODE_LABEL.  */
+  XEXP (label_ref, 0) = new_label;
+  JUMP_LABEL (insn) = new_label;
+  JUMP_LABEL (uncond_jump) = code_label;
+
+  return true;
+}
+
/* Returns 1 if INSN reads the value of REG for purposes not related
to addressing of memory, and 0 otherwise. */
static int
if that register's value is delivered via a bypass, then the
pipeline recycles, thereby causing significant performance decline.
This function locates such situations and exchanges the two
- operands of the compare. */
-static void
-s390_z10_optimize_cmp (void)
+ operands of the compare. The function return true whenever it
+ added an insn. */
+static bool
+s390_z10_optimize_cmp (rtx insn)
{
- rtx insn, prev_insn, next_insn;
- int added_NOPs = 0;
+ rtx prev_insn, next_insn;
+ bool insn_added_p = false;
+ rtx cond, *op0, *op1;
- for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
+ if (GET_CODE (PATTERN (insn)) == PARALLEL)
{
- rtx cond, *op0, *op1;
-
- if (!INSN_P (insn) || INSN_CODE (insn) <= 0)
- continue;
+ /* Handle compare and branch and branch on count
+ instructions. */
+ rtx pattern = single_set (insn);
- if (GET_CODE (PATTERN (insn)) == PARALLEL)
- {
- /* Handle compare and branch and branch on count
- instructions. */
- rtx pattern = single_set (insn);
+ if (!pattern
+ || SET_DEST (pattern) != pc_rtx
+ || GET_CODE (SET_SRC (pattern)) != IF_THEN_ELSE)
+ return false;
- if (!pattern
- || SET_DEST (pattern) != pc_rtx
- || GET_CODE (SET_SRC (pattern)) != IF_THEN_ELSE)
- continue;
+ cond = XEXP (SET_SRC (pattern), 0);
+ op0 = &XEXP (cond, 0);
+ op1 = &XEXP (cond, 1);
+ }
+ else if (GET_CODE (PATTERN (insn)) == SET)
+ {
+ rtx src, dest;
- cond = XEXP (SET_SRC (pattern), 0);
- op0 = &XEXP (cond, 0);
- op1 = &XEXP (cond, 1);
- }
- else if (GET_CODE (PATTERN (insn)) == SET)
- {
- rtx src, dest;
+ /* Handle normal compare instructions. */
+ src = SET_SRC (PATTERN (insn));
+ dest = SET_DEST (PATTERN (insn));
- /* Handle normal compare instructions. */
- src = SET_SRC (PATTERN (insn));
- dest = SET_DEST (PATTERN (insn));
+ if (!REG_P (dest)
+ || !CC_REGNO_P (REGNO (dest))
+ || GET_CODE (src) != COMPARE)
+ return false;
- if (!REG_P (dest)
- || !CC_REGNO_P (REGNO (dest))
- || GET_CODE (src) != COMPARE)
- continue;
+ /* s390_swap_cmp will try to find the conditional
+ jump when passing NULL_RTX as condition. */
+ cond = NULL_RTX;
+ op0 = &XEXP (src, 0);
+ op1 = &XEXP (src, 1);
+ }
+ else
+ return false;
- /* s390_swap_cmp will try to find the conditional
- jump when passing NULL_RTX as condition. */
- cond = NULL_RTX;
- op0 = &XEXP (src, 0);
- op1 = &XEXP (src, 1);
- }
- else
- continue;
+ if (!REG_P (*op0) || !REG_P (*op1))
+ return false;
- if (!REG_P (*op0) || !REG_P (*op1))
- continue;
+ if (GET_MODE_CLASS (GET_MODE (*op0)) != MODE_INT)
+ return false;
- /* Swap the COMPARE arguments and its mask if there is a
- conflicting access in the previous insn. */
- prev_insn = PREV_INSN (insn);
+ /* Swap the COMPARE arguments and its mask if there is a
+ conflicting access in the previous insn. */
+ prev_insn = prev_active_insn (insn);
+ if (prev_insn != NULL_RTX && INSN_P (prev_insn)
+ && reg_referenced_p (*op1, PATTERN (prev_insn)))
+ s390_swap_cmp (cond, op0, op1, insn);
+
+ /* Check if there is a conflict with the next insn. If there
+ was no conflict with the previous insn, then swap the
+ COMPARE arguments and its mask. If we already swapped
+ the operands, or if swapping them would cause a conflict
+ with the previous insn, issue a NOP after the COMPARE in
+     order to separate the two instructions. */
+ next_insn = next_active_insn (insn);
+ if (next_insn != NULL_RTX && INSN_P (next_insn)
+ && s390_non_addr_reg_read_p (*op1, next_insn))
+ {
if (prev_insn != NULL_RTX && INSN_P (prev_insn)
- && reg_referenced_p (*op1, PATTERN (prev_insn)))
- s390_swap_cmp (cond, op0, op1, insn);
-
- /* Check if there is a conflict with the next insn. If there
- was no conflict with the previous insn, then swap the
- COMPARE arguments and its mask. If we already swapped
- the operands, or if swapping them would cause a conflict
- with the previous insn, issue a NOP after the COMPARE in
- order to separate the two instuctions. */
- next_insn = NEXT_INSN (insn);
- if (next_insn != NULL_RTX && INSN_P (next_insn)
- && s390_non_addr_reg_read_p (*op1, next_insn))
+ && s390_non_addr_reg_read_p (*op0, prev_insn))
{
- if (prev_insn != NULL_RTX && INSN_P (prev_insn)
- && s390_non_addr_reg_read_p (*op0, prev_insn))
- {
- if (REGNO (*op1) == 0)
- emit_insn_after (gen_nop1 (), insn);
- else
- emit_insn_after (gen_nop (), insn);
- added_NOPs = 1;
- }
+ if (REGNO (*op1) == 0)
+ emit_insn_after (gen_nop1 (), insn);
else
- s390_swap_cmp (cond, op0, op1, insn);
+ emit_insn_after (gen_nop (), insn);
+ insn_added_p = true;
}
+ else
+ s390_swap_cmp (cond, op0, op1, insn);
}
-
- /* Adjust branches if we added new instructions. */
- if (added_NOPs)
- shorten_branches (get_insns ());
+ return insn_added_p;
}
-
/* Perform machine-dependent processing. */
static void
/* Try to optimize prologue and epilogue further. */
s390_optimize_prologue ();
- /* Eliminate z10-specific pipeline recycles related to some compare
- instructions. */
+ /* Walk over the insns and do some z10 specific changes. */
+ if (s390_tune == PROCESSOR_2097_Z10)
+ {
+ rtx insn;
+ bool insn_added_p = false;
+
+ /* The insn lengths and addresses have to be up to date for the
+ following manipulations. */
+ shorten_branches (get_insns ());
+
+ for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
+ {
+ if (!INSN_P (insn) || INSN_CODE (insn) <= 0)
+ continue;
+
+ if (JUMP_P (insn))
+ insn_added_p |= s390_z10_fix_long_loop_prediction (insn);
+
+ if (GET_CODE (PATTERN (insn)) == PARALLEL
+ || GET_CODE (PATTERN (insn)) == SET)
+ insn_added_p |= s390_z10_optimize_cmp (insn);
+ }
+
+ /* Adjust branches if we added new instructions. */
+ if (insn_added_p)
+ shorten_branches (get_insns ());
+ }
+}
+
+/* Return true if INSN is a fp load insn writing register REGNO. */
+static inline bool
+s390_fpload_toreg (rtx insn, unsigned int regno)
+{
+  rtx set;
+  enum attr_type flag = s390_safe_attr_type (insn);
+
+  /* Only SFmode/DFmode floating point loads qualify.  */
+  if (flag != TYPE_FLOADSF && flag != TYPE_FLOADDF)
+    return false;
+
+  set = single_set (insn);
+
+  if (set == NULL_RTX)
+    return false;
+
+  /* The insn must load from memory into a register ...  */
+  if (!REG_P (SET_DEST (set)) || !MEM_P (SET_SRC (set)))
+    return false;
+
+  /* ... and that register must be REGNO.  */
+  if (REGNO (SET_DEST (set)) != regno)
+    return false;
+
+  return true;
+}
+
+/* This value describes the distance to be avoided between an
+   arithmetic fp instruction and an fp load writing the same register.
+ Z10_EARLYLOAD_DISTANCE - 1 as well as Z10_EARLYLOAD_DISTANCE + 1 is
+ fine but the exact value has to be avoided. Otherwise the FP
+ pipeline will throw an exception causing a major penalty. */
+#define Z10_EARLYLOAD_DISTANCE 7
+
+/* Rearrange the ready list in order to avoid the situation described
+   for Z10_EARLYLOAD_DISTANCE. A problematic load instruction is
+   moved to the very end of the ready list. */
+static void
+s390_z10_prevent_earlyload_conflicts (rtx *ready, int *nready_p)
+{
+  unsigned int regno;
+  int nready = *nready_p;
+  rtx tmp;
+  int i;
+  rtx insn;
+  rtx set;
+  enum attr_type flag;
+  int distance;
+
+  /* Skip DISTANCE - 1 active insns.  Give up when a call or jump
+     intervenes.  */
+  for (insn = last_scheduled_insn, distance = Z10_EARLYLOAD_DISTANCE - 1;
+       distance > 0 && insn != NULL_RTX;
+       distance--, insn = prev_active_insn (insn))
+    if (CALL_P (insn) || JUMP_P (insn))
+      return;
+
+  if (insn == NULL_RTX)
+    return;
+
+  /* INSN is now Z10_EARLYLOAD_DISTANCE - 1 active insns behind the
+     last scheduled one, so a load issued next would land at exactly
+     the problematic distance.  Only an arithmetic fp instruction
+     (i.e. not itself an fp load) setting an fp register matters.  */
+  set = single_set (insn);
+
+  if (set == NULL_RTX || !REG_P (SET_DEST (set))
+      || GET_MODE_CLASS (GET_MODE (SET_DEST (set))) != MODE_FLOAT)
+    return;
+
+  flag = s390_safe_attr_type (insn);
+
+  if (flag == TYPE_FLOADSF || flag == TYPE_FLOADDF)
+    return;
+
+  /* Search the ready list for a load of that register.  The scheduler
+     issues from the end of the list, so slot 0 is considered last.  */
+  regno = REGNO (SET_DEST (set));
+  i = nready - 1;
+
+  while (!s390_fpload_toreg (ready[i], regno) && i > 0)
+    i--;
+
+  /* Either no such load exists, or it already sits in slot 0 where it
+     would be issued last anyway.  */
+  if (!i)
+    return;
+
+  /* Rotate the conflicting load down to slot 0.  */
+  tmp = ready[i];
+  memmove (&ready[1], &ready[0], sizeof (rtx) * i);
+  ready[0] = tmp;
+}
+
+/* This function is called via hook TARGET_SCHED_REORDER before
+   issuing one insn from list READY which contains *NREADYP entries.
+ For target z10 it reorders load instructions to avoid early load
+ conflicts in the floating point pipeline */
+static int
+s390_sched_reorder (FILE *file ATTRIBUTE_UNUSED, int verbose ATTRIBUTE_UNUSED,
+ rtx *ready, int *nreadyp, int clock ATTRIBUTE_UNUSED)
+{
if (s390_tune == PROCESSOR_2097_Z10)
- s390_z10_optimize_cmp ();
+ if (reload_completed && *nreadyp > 1)
+ s390_z10_prevent_earlyload_conflicts (ready, nreadyp);
+
+ return s390_issue_rate ();
+}
+
+/* This function is called via hook TARGET_SCHED_VARIABLE_ISSUE after
+   the scheduler has issued INSN. It stores the last issued insn into
+   last_scheduled_insn in order to make it available for
+   s390_sched_reorder. */
+static int
+s390_sched_variable_issue (FILE *file ATTRIBUTE_UNUSED,
+			   int verbose ATTRIBUTE_UNUSED,
+			   rtx insn, int more)
+{
+  last_scheduled_insn = insn;
+
+  /* USEs and CLOBBERs do not consume an issue slot; anything else
+     reduces the number of insns (MORE) still issuable this cycle.  */
+  if (GET_CODE (PATTERN (insn)) != USE
+      && GET_CODE (PATTERN (insn)) != CLOBBER)
+    return more - 1;
+  else
+    return more;
+}
+
+/* This function is called via hook TARGET_SCHED_INIT when scheduling
+   starts.  It resets last_scheduled_insn so that s390_sched_reorder
+   does not consult stale state from a previously scheduled region.  */
+static void
+s390_sched_init (FILE *file ATTRIBUTE_UNUSED,
+		 int verbose ATTRIBUTE_UNUSED,
+		 int max_ready ATTRIBUTE_UNUSED)
+{
+  last_scheduled_insn = NULL_RTX;
+}
+
+/* This function checks the whole of insn X for memory references. The
+   function always returns zero because the framework it is called
+   from would stop recursively analyzing the insn upon a return value
+   other than zero. The real result of this function is updating
+   counter variable MEM_COUNT. */
+static int
+check_dpu (rtx *x, unsigned *mem_count)
+{
+  /* Callback for for_each_rtx; *X may be null during the walk, hence
+     the explicit check.  */
+  if (*x != NULL_RTX && MEM_P (*x))
+    (*mem_count)++;
+  return 0;
}
+/* This target hook implementation for TARGET_LOOP_UNROLL_ADJUST calculates
+   a new number struct loop *loop should be unrolled if tuned for the z10
+   cpu. The loop is analyzed for memory accesses by calling check_dpu for
+   each rtx of the loop. Depending on the loop_depth and the amount of
+   memory accesses a new number <=nunroll is returned to improve the
+   behaviour of the hardware prefetch unit. */
+static unsigned
+s390_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
+{
+  basic_block *bbs;
+  rtx insn;
+  unsigned i;
+  unsigned mem_count = 0;
+
+  /* Only z10 needs special handling. */
+  if (s390_tune != PROCESSOR_2097_Z10)
+    return nunroll;
+
+  /* Count the number of memory references within the loop body. */
+  bbs = get_loop_body (loop);
+  for (i = 0; i < loop->num_nodes; i++)
+    {
+      /* NOTE(review): the walk stops before BB_END, so the last insn
+	 of each basic block is not counted -- confirm this is
+	 intentional.  */
+      for (insn = BB_HEAD (bbs[i]); insn != BB_END (bbs[i]); insn = NEXT_INSN (insn))
+	if (INSN_P (insn) && INSN_CODE (insn) != -1)
+	  for_each_rtx (&insn, (rtx_function) check_dpu, &mem_count);
+    }
+  free (bbs);
+
+  /* Prevent division by zero, and we do not need to adjust nunroll in this case. */
+  if (mem_count == 0)
+    return nunroll;
+
+  /* The caps (28/22/16 memory accesses across all unrolled copies,
+     depending on nesting depth) are z10 prefetch-unit tuning values.  */
+  switch (loop_depth(loop))
+    {
+    case 1:
+      return MIN (nunroll, 28 / mem_count);
+    case 2:
+      return MIN (nunroll, 22 / mem_count);
+    default:
+      return MIN (nunroll, 16 / mem_count);
+    }
+}
/* Initialize GCC target structure. */
#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD s390_first_cycle_multipass_dfa_lookahead
+#undef TARGET_SCHED_VARIABLE_ISSUE
+#define TARGET_SCHED_VARIABLE_ISSUE s390_sched_variable_issue
+#undef TARGET_SCHED_REORDER
+#define TARGET_SCHED_REORDER s390_sched_reorder
+#undef TARGET_SCHED_INIT
+#define TARGET_SCHED_INIT s390_sched_init
+
#undef TARGET_CANNOT_COPY_INSN_P
#define TARGET_CANNOT_COPY_INSN_P s390_cannot_copy_insn_p
#undef TARGET_RTX_COSTS
#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR s390_gimplify_va_arg
-#undef TARGET_PROMOTE_FUNCTION_ARGS
-#define TARGET_PROMOTE_FUNCTION_ARGS hook_bool_const_tree_true
-#undef TARGET_PROMOTE_FUNCTION_RETURN
-#define TARGET_PROMOTE_FUNCTION_RETURN hook_bool_const_tree_true
+#undef TARGET_PROMOTE_FUNCTION_MODE
+#define TARGET_PROMOTE_FUNCTION_MODE s390_promote_function_mode
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE s390_pass_by_reference
#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P s390_legitimate_address_p
+#undef TARGET_CAN_ELIMINATE
+#define TARGET_CAN_ELIMINATE s390_can_eliminate
+
+#undef TARGET_LOOP_UNROLL_ADJUST
+#define TARGET_LOOP_UNROLL_ADJUST s390_loop_unroll_adjust
+
+#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
+#define TARGET_ASM_TRAMPOLINE_TEMPLATE s390_asm_trampoline_template
+#undef TARGET_TRAMPOLINE_INIT
+#define TARGET_TRAMPOLINE_INIT s390_trampoline_init
+
struct gcc_target targetm = TARGET_INITIALIZER;
#include "gt-s390.h"