/* Swing Modulo Scheduling implementation.
- Copyright (C) 2004
+ Copyright (C) 2004, 2005, 2006
Free Software Foundation, Inc.
Contributed by Ayal Zaks and Mustafa Hagog <zaks,mustafa@il.ibm.com>
You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING. If not, write to the Free
-Software Foundation, 59 Temple Place - Suite 330, Boston, MA
-02111-1307, USA. */
+Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+02110-1301, USA. */
#include "config.h"
#include "rtl.h"
#include "tm_p.h"
#include "hard-reg-set.h"
-#include "basic-block.h"
#include "regs.h"
#include "function.h"
#include "flags.h"
#include "gcov-io.h"
#include "df.h"
#include "ddg.h"
+#include "timevar.h"
+#include "tree-pass.h"
#ifdef INSN_SCHEDULING
#define PS_STAGE_COUNT(ps) ((PS_MAX_CYCLE (ps) - PS_MIN_CYCLE (ps) \
+ 1 + (ps)->ii - 1) / (ps)->ii)
-#define CFG_HOOKS cfg_layout_rtl_cfg_hooks
-
/* A single instruction in the partial schedule. */
struct ps_insn
{
ddg_ptr g; /* The DDG of the insns in the partial schedule. */
};
+/* We use this to record all the register replacements we do in
+ the kernel so we can undo SMS if it is not profitable. */
+struct undo_replace_buff_elem
+{
+ rtx insn;
+ rtx orig_reg;
+ rtx new_reg;
+ struct undo_replace_buff_elem *next;
+};
+
-partial_schedule_ptr create_partial_schedule (int ii, ddg_ptr, int history);
-void free_partial_schedule (partial_schedule_ptr);
-void reset_partial_schedule (partial_schedule_ptr, int new_ii);
+
+static partial_schedule_ptr create_partial_schedule (int ii, ddg_ptr, int history);
+static void free_partial_schedule (partial_schedule_ptr);
+static void reset_partial_schedule (partial_schedule_ptr, int new_ii);
void print_partial_schedule (partial_schedule_ptr, FILE *);
-ps_insn_ptr ps_add_node_check_conflicts (partial_schedule_ptr,
- ddg_node_ptr node, int cycle,
- sbitmap must_precede,
- sbitmap must_follow);
-void rotate_partial_schedule (partial_schedule_ptr, int);
+static int kernel_number_of_cycles (rtx first_insn, rtx last_insn);
+static ps_insn_ptr ps_add_node_check_conflicts (partial_schedule_ptr,
+ ddg_node_ptr node, int cycle,
+ sbitmap must_precede,
+ sbitmap must_follow);
+static void rotate_partial_schedule (partial_schedule_ptr, int);
void set_row_column_for_ps (partial_schedule_ptr);
+static bool ps_unschedule_node (partial_schedule_ptr, ddg_node_ptr );
\f
/* This page defines constants and structures for the modulo scheduling
static int issue_rate;
-/* For printing statistics. */
-static FILE *stats_file;
-
static int sms_order_nodes (ddg_ptr, int, int * result);
static void set_node_sched_params (ddg_ptr);
-static partial_schedule_ptr sms_schedule_by_order (ddg_ptr, int, int,
- int *, FILE*);
+static partial_schedule_ptr sms_schedule_by_order (ddg_ptr, int, int, int *);
static void permute_partial_schedule (partial_schedule_ptr ps, rtx last);
-static void generate_prolog_epilog (partial_schedule_ptr, rtx, rtx, int);
+static void generate_prolog_epilog (partial_schedule_ptr ,struct loop * loop, rtx);
static void duplicate_insns_of_cycles (partial_schedule_ptr ps,
int from_stage, int to_stage,
int is_prolog);
-
#define SCHED_ASAP(x) (((node_sched_params_ptr)(x)->aux.info)->asap)
#define SCHED_TIME(x) (((node_sched_params_ptr)(x)->aux.info)->time)
#define SCHED_FIRST_REG_MOVE(x) \
return tmp;
}
-static int
-contributes_to_priority (rtx next, rtx insn)
-{
- return BLOCK_NUM (next) == BLOCK_NUM (insn);
-}
-
static void
compute_jump_reg_dependencies (rtx insn ATTRIBUTE_UNUSED,
regset cond_exec ATTRIBUTE_UNUSED,
NULL,
NULL,
sms_print_insn,
- contributes_to_priority,
+ NULL,
compute_jump_reg_dependencies,
NULL, NULL,
NULL, NULL,
- 0, 0, 0
+ 0, 0, 0,
+
+ NULL, NULL, NULL, NULL, NULL,
+#ifdef ENABLE_CHECKING
+ NULL,
+#endif
+ 0
};
-/* Return the register decremented and tested or zero if it is not a decrement
- and branch jump insn (similar to doloop_condition_get). */
+/* Return the register decremented and tested in INSN,
+ or zero if it is not a decrement-and-branch insn. */
+
static rtx
-doloop_register_get (rtx insn, rtx *comp)
+doloop_register_get (rtx insn ATTRIBUTE_UNUSED)
{
- rtx pattern, cmp, inc, reg, condition;
+#ifdef HAVE_doloop_end
+ rtx pattern, reg, condition;
- if (!JUMP_P (insn))
+ if (! JUMP_P (insn))
return NULL_RTX;
- pattern = PATTERN (insn);
-
- /* The canonical doloop pattern we expect is:
-
- (parallel [(set (pc) (if_then_else (condition)
- (label_ref (label))
- (pc)))
- (set (reg) (plus (reg) (const_int -1)))
- (additional clobbers and uses)])
-
- where condition is further restricted to be
- (ne (reg) (const_int 1)). */
-
- if (GET_CODE (pattern) != PARALLEL)
- return NULL_RTX;
-
- cmp = XVECEXP (pattern, 0, 0);
- inc = XVECEXP (pattern, 0, 1);
- /* Return the compare rtx. */
- *comp = cmp;
- /* Check for (set (reg) (something)). */
- if (GET_CODE (inc) != SET || ! REG_P (SET_DEST (inc)))
- return NULL_RTX;
-
- /* Extract loop counter register. */
- reg = SET_DEST (inc);
-
- /* Check if something = (plus (reg) (const_int -1)). */
- if (GET_CODE (SET_SRC (inc)) != PLUS
- || XEXP (SET_SRC (inc), 0) != reg
- || XEXP (SET_SRC (inc), 1) != constm1_rtx)
- return NULL_RTX;
-
- /* Check for (set (pc) (if_then_else (condition)
- (label_ref (label))
- (pc))). */
- if (GET_CODE (cmp) != SET
- || SET_DEST (cmp) != pc_rtx
- || GET_CODE (SET_SRC (cmp)) != IF_THEN_ELSE
- || GET_CODE (XEXP (SET_SRC (cmp), 1)) != LABEL_REF
- || XEXP (SET_SRC (cmp), 2) != pc_rtx)
- return NULL_RTX;
-
- /* Extract loop termination condition. */
- condition = XEXP (SET_SRC (cmp), 0);
-
- /* Check if condition = (ne (reg) (const_int 1)), which is more
- restrictive than the check in doloop_condition_get:
- if ((GET_CODE (condition) != GE && GET_CODE (condition) != NE)
- || GET_CODE (XEXP (condition, 1)) != CONST_INT). */
- if (GET_CODE (condition) != NE
- || XEXP (condition, 1) != const1_rtx)
+ pattern = PATTERN (insn);
+ condition = doloop_condition_get (pattern);
+ if (! condition)
return NULL_RTX;
- if (XEXP (condition, 0) == reg)
- return reg;
+ if (REG_P (XEXP (condition, 0)))
+ reg = XEXP (condition, 0);
+ else if (GET_CODE (XEXP (condition, 0)) == PLUS
+ && REG_P (XEXP (XEXP (condition, 0), 0)))
+ reg = XEXP (XEXP (condition, 0), 0);
+ else
+ gcc_unreachable ();
+ return reg;
+#else
return NULL_RTX;
+#endif
}
/* Check if COUNT_REG is set to a constant in the PRE_HEADER block, so
{
rtx insn;
rtx head, tail;
- get_block_head_tail (pre_header->index, &head, &tail);
+
+ if (! pre_header)
+ return NULL_RTX;
+
+ get_ebb_head_tail (pre_header, pre_header, &head, &tail);
for (insn = tail; insn != PREV_INSN (head); insn = PREV_INSN (insn))
if (INSN_P (insn) && single_set (insn) &&
}
static void
-print_node_sched_params (FILE * dump_file, int num_nodes)
+print_node_sched_params (FILE * file, int num_nodes)
{
int i;
+ if (! file)
+ return;
for (i = 0; i < num_nodes; i++)
{
node_sched_params_ptr nsp = &node_sched_params[i];
rtx reg_move = nsp->first_reg_move;
int j;
- fprintf (dump_file, "Node %d:\n", i);
- fprintf (dump_file, " asap = %d:\n", nsp->asap);
- fprintf (dump_file, " time = %d:\n", nsp->time);
- fprintf (dump_file, " nreg_moves = %d:\n", nsp->nreg_moves);
+ fprintf (file, "Node %d:\n", i);
+ fprintf (file, " asap = %d:\n", nsp->asap);
+ fprintf (file, " time = %d:\n", nsp->time);
+ fprintf (file, " nreg_moves = %d:\n", nsp->nreg_moves);
for (j = 0; j < nsp->nreg_moves; j++)
{
- fprintf (dump_file, " reg_move = ");
- print_rtl_single (dump_file, reg_move);
+ fprintf (file, " reg_move = ");
+ print_rtl_single (file, reg_move);
reg_move = PREV_INSN (reg_move);
}
}
return maxii;
}
-
-/* Given the partial schedule, generate register moves when the length
- of the register live range is more than ii; the number of moves is
- determined according to the following equation:
- SCHED_TIME (use) - SCHED_TIME (def) { 1 broken loop-carried
- nreg_moves = ----------------------------------- - { dependence.
- ii { 0 if not.
- This handles the modulo-variable-expansions (mve's) needed for the ps. */
-static void
+/*
+ Breaking intra-loop register anti-dependences:
+ Each intra-loop register anti-dependence implies a cross-iteration true
+ dependence of distance 1. Therefore, we can remove such false dependencies
+ and figure out if the partial schedule broke them by checking if (for a
+ true-dependence of distance 1): SCHED_TIME (def) < SCHED_TIME (use) and
+ if so generate a register move. The number of such moves is equal to:
+ SCHED_TIME (use) - SCHED_TIME (def) { 0 broken
+ nreg_moves = ----------------------------------- + 1 - { dependence.
+ ii { 1 if not.
+*/
+static struct undo_replace_buff_elem *
generate_reg_moves (partial_schedule_ptr ps)
{
ddg_ptr g = ps->g;
int ii = ps->ii;
int i;
+ struct undo_replace_buff_elem *reg_move_replaces = NULL;
for (i = 0; i < g->num_nodes; i++)
{
{
int nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;
+ if (e->distance == 1)
+ nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii;
+
/* If dest precedes src in the schedule of the kernel, then dest
will read before src writes and we can save one reg_copy. */
if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
{
int dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;
+ if (e->distance == 1)
+ dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii;
+
if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
&& SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
dest_copy--;
for (i_reg_move = 0; i_reg_move < nreg_moves; i_reg_move++)
{
- int i_use;
+ unsigned int i_use = 0;
rtx new_reg = gen_reg_rtx (GET_MODE (prev_reg));
rtx reg_move = gen_move_insn (new_reg, prev_reg);
+ sbitmap_iterator sbi;
add_insn_before (reg_move, last_reg_move);
last_reg_move = reg_move;
if (!SCHED_FIRST_REG_MOVE (u))
SCHED_FIRST_REG_MOVE (u) = reg_move;
- EXECUTE_IF_SET_IN_SBITMAP (uses_of_defs[i_reg_move], 0, i_use,
- replace_rtx (g->nodes[i_use].insn, old_reg, new_reg));
+ EXECUTE_IF_SET_IN_SBITMAP (uses_of_defs[i_reg_move], 0, i_use, sbi)
+ {
+ struct undo_replace_buff_elem *rep;
+
+ rep = (struct undo_replace_buff_elem *)
+ xcalloc (1, sizeof (struct undo_replace_buff_elem));
+ rep->insn = g->nodes[i_use].insn;
+ rep->orig_reg = old_reg;
+ rep->new_reg = new_reg;
+
+ if (! reg_move_replaces)
+ reg_move_replaces = rep;
+ else
+ {
+ rep->next = reg_move_replaces;
+ reg_move_replaces = rep;
+ }
+
+ replace_rtx (g->nodes[i_use].insn, old_reg, new_reg);
+ }
prev_reg = new_reg;
}
+ sbitmap_vector_free (uses_of_defs);
+ }
+ return reg_move_replaces;
+}
+
+/* We call this when we want to undo the SMS schedule for a given loop.
+ One of the things that we do is to delete the register moves generated
+ for the sake of SMS; this function deletes the register move instructions
+ recorded in the undo buffer. */
+static void
+undo_generate_reg_moves (partial_schedule_ptr ps,
+ struct undo_replace_buff_elem *reg_move_replaces)
+{
+ int i,j;
+
+ for (i = 0; i < ps->g->num_nodes; i++)
+ {
+ ddg_node_ptr u = &ps->g->nodes[i];
+ rtx prev;
+ rtx crr = SCHED_FIRST_REG_MOVE (u);
+
+ for (j = 0; j < SCHED_NREG_MOVES (u); j++)
+ {
+ prev = PREV_INSN (crr);
+ delete_insn (crr);
+ crr = prev;
+ }
+ SCHED_FIRST_REG_MOVE (u) = NULL_RTX;
+ }
+
+ while (reg_move_replaces)
+ {
+ struct undo_replace_buff_elem *rep = reg_move_replaces;
+
+ reg_move_replaces = reg_move_replaces->next;
+ replace_rtx (rep->insn, rep->new_reg, rep->orig_reg);
+ }
+}
+
+/* Free memory allocated for the undo buffer. */
+static void
+free_undo_replace_buff (struct undo_replace_buff_elem *reg_move_replaces)
+{
+
+ while (reg_move_replaces)
+ {
+ struct undo_replace_buff_elem *rep = reg_move_replaces;
+
+ reg_move_replaces = reg_move_replaces->next;
+ free (rep);
}
}
int amount = PS_MIN_CYCLE (ps);
int ii = ps->ii;
- for (i = 0; i < g->num_nodes; i++)
+ /* Don't include the closing branch assuming that it is the last node. */
+ for (i = 0; i < g->num_nodes - 1; i++)
{
ddg_node_ptr u = &g->nodes[i];
int normalized_time = SCHED_TIME (u) - amount;
- if (normalized_time < 0)
- abort ();
+ gcc_assert (normalized_time >= 0);
SCHED_TIME (u) = normalized_time;
SCHED_ROW (u) = normalized_time % ii;
PREV_INSN (last));
}
+/* As part of undoing SMS we return to the original ordering of the
+ instructions inside the loop kernel. Given the partial schedule PS, this
+ function returns the ordering of the instruction according to their CUID
+ in the DDG (PS->G), which is the original order of the instruction before
+ performing SMS. */
+static void
+undo_permute_partial_schedule (partial_schedule_ptr ps, rtx last)
+{
+ int i;
+
+ for (i = 0 ; i < ps->g->num_nodes; i++)
+ if (last == ps->g->nodes[i].insn
+ || last == ps->g->nodes[i].first_note)
+ break;
+ else if (PREV_INSN (last) != ps->g->nodes[i].insn)
+ reorder_insns_nobb (ps->g->nodes[i].first_note, ps->g->nodes[i].insn,
+ PREV_INSN (last));
+}
+
/* Used to generate the prologue & epilogue. Duplicate the subset of
nodes whose stages are between FROM_STAGE and TO_STAGE (inclusive
of both), together with a prefix/suffix of their reg_moves. */
/* SCHED_STAGE (u_node) >= from_stage == 0. Generate increasing
number of reg_moves starting with the second occurrence of
u_node, which is generated if its SCHED_STAGE <= to_stage. */
- i_reg_moves = to_stage - SCHED_STAGE (u_node);
+ i_reg_moves = to_stage - SCHED_STAGE (u_node) + 1;
i_reg_moves = MAX (i_reg_moves, 0);
i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));
for (j = 0; j < i_reg_moves; j++, reg_move = NEXT_INSN (reg_move))
emit_insn (copy_rtx (PATTERN (reg_move)));
-
if (SCHED_STAGE (u_node) >= from_stage
&& SCHED_STAGE (u_node) <= to_stage)
duplicate_insn_chain (u_node->first_note, u_node->insn);
/* Generate the instructions (including reg_moves) for prolog & epilog. */
static void
-generate_prolog_epilog (partial_schedule_ptr ps, rtx orig_loop_beg,
- rtx orig_loop_end, int unknown_count)
+generate_prolog_epilog (partial_schedule_ptr ps, struct loop * loop, rtx count_reg)
{
int i;
int last_stage = PS_STAGE_COUNT (ps) - 1;
edge e;
- rtx c_reg = NULL_RTX;
- rtx cmp = NULL_RTX;
- rtx precond_jump = NULL_RTX;
- rtx precond_exit_label = NULL_RTX;
- rtx precond_exit_label_insn = NULL_RTX;
- rtx last_epilog_insn = NULL_RTX;
- rtx loop_exit_label = NULL_RTX;
- rtx loop_exit_label_insn = NULL_RTX;
- rtx orig_loop_bct = NULL_RTX;
-
- /* Loop header edge. */
- e = EDGE_PRED (ps->g->bb, 0);
- if (e->src == ps->g->bb)
- e = EDGE_PRED (ps->g->bb, 1);
/* Generate the prolog, inserting its insns on the loop-entry edge. */
start_sequence ();
- /* This is the place where we want to insert the precondition. */
- if (unknown_count)
- precond_jump = emit_note (NOTE_INSN_DELETED);
+ if (count_reg)
+ /* Generate a subtract instruction at the beginning of the prolog to
+ adjust the loop count by STAGE_COUNT. */
+ emit_insn (gen_sub2_insn (count_reg, GEN_INT (last_stage)));
for (i = 0; i < last_stage; i++)
duplicate_insns_of_cycles (ps, 0, i, 1);
- /* No need to call insert_insn_on_edge; we prepared the sequence. */
- e->insns.r = get_insns ();
+ /* Put the prolog , on the one and only entry edge. */
+ e = loop_preheader_edge (loop);
+ loop_split_edge_with(e , get_insns());
+
end_sequence ();
/* Generate the epilog, inserting its insns on the loop-exit edge. */
for (i = 0; i < last_stage; i++)
duplicate_insns_of_cycles (ps, i + 1, last_stage, 0);
- last_epilog_insn = emit_note (NOTE_INSN_DELETED);
+ /* Put the epilogue on the one and only one exit edge. */
+ gcc_assert (loop->single_exit);
+ e = loop->single_exit;
+ loop_split_edge_with(e , get_insns());
+ end_sequence ();
+}
- /* Emit the label where to put the original loop code. */
- if (unknown_count)
- {
- rtx label, cond;
+/* Return the line note insn preceding INSN, for debugging. Taken from
+ emit-rtl.c. */
+static rtx
+find_line_note (rtx insn)
+{
+ for (; insn; insn = PREV_INSN (insn))
+ if (NOTE_P (insn)
+ && NOTE_LINE_NUMBER (insn) >= 0)
+ break;
- precond_exit_label = gen_label_rtx ();
- precond_exit_label_insn = emit_label (precond_exit_label);
+ return insn;
+}
- /* Put the original loop code. */
- reorder_insns_nobb (orig_loop_beg, orig_loop_end, precond_exit_label_insn);
+/* Return true if all the BBs of the loop are empty except the
+ loop header. */
+static bool
+loop_single_full_bb_p (struct loop *loop)
+{
+ unsigned i;
+ basic_block *bbs = get_loop_body (loop);
- /* Change the label of the BCT to be the PRECOND_EXIT_LABEL. */
- orig_loop_bct = get_last_insn ();
- c_reg = doloop_register_get (orig_loop_bct, &cmp);
- label = XEXP (SET_SRC (cmp), 1);
- cond = XEXP (SET_SRC (cmp), 0);
+ for (i = 0; i < loop->num_nodes ; i++)
+ {
+ rtx head, tail;
+ bool empty_bb = true;
- if (! c_reg || GET_CODE (cond) != NE)
- abort ();
+ if (bbs[i] == loop->header)
+ continue;
- XEXP (label, 0) = precond_exit_label;
- JUMP_LABEL (orig_loop_bct) = precond_exit_label_insn;
- LABEL_NUSES (precond_exit_label_insn)++;
+ /* Make sure that basic blocks other than the header
+ have only notes labels or jumps. */
+ get_ebb_head_tail (bbs[i], bbs[i], &head, &tail);
+ for (; head != NEXT_INSN (tail); head = NEXT_INSN (head))
+ {
+ if (NOTE_P (head) || LABEL_P (head)
+ || (INSN_P (head) && JUMP_P (head)))
+ continue;
+ empty_bb = false;
+ break;
+ }
- /* Generate the loop exit label. */
- loop_exit_label = gen_label_rtx ();
- loop_exit_label_insn = emit_label (loop_exit_label);
+ if (! empty_bb)
+ {
+ free (bbs);
+ return false;
+ }
}
+ free (bbs);
+ return true;
+}
- e = EDGE_SUCC (ps->g->bb, 0);
- if (e->dest == ps->g->bb)
- e = EDGE_SUCC (ps->g->bb, 1);
+/* A simple loop from SMS point of view; it is a loop that is composed of
+ either a single basic block or two BBs - a header and a latch. */
+#define SIMPLE_SMS_LOOP_P(loop) ((loop->num_nodes < 3 ) \
+ && (EDGE_COUNT (loop->latch->preds) == 1) \
+ && (EDGE_COUNT (loop->latch->succs) == 1))
- e->insns.r = get_insns ();
- end_sequence ();
+/* Return true if the loop is in its canonical form and false if not.
+ i.e. SIMPLE_SMS_LOOP_P and have one preheader block, and single exit. */
+static bool
+loop_canon_p (struct loop *loop)
+{
- commit_edge_insertions ();
+ if (loop->inner || ! loop->outer)
+ return false;
- if (unknown_count)
+ if (!loop->single_exit)
{
- rtx precond_insns, epilog_jump, insert_after_insn;
- basic_block loop_exit_bb = BLOCK_FOR_INSN (loop_exit_label_insn);
- basic_block epilog_bb = BLOCK_FOR_INSN (last_epilog_insn);
- basic_block precond_bb = BLOCK_FOR_INSN (precond_jump);
- basic_block orig_loop_bb = BLOCK_FOR_INSN (precond_exit_label_insn);
- edge epilog_exit_edge = EDGE_SUCC (epilog_bb, 0);
-
- /* Do loop preconditioning to take care of cases were the loop count is
- less than the stage count. Update the CFG properly. */
- insert_after_insn = precond_jump;
- start_sequence ();
- c_reg = doloop_register_get (ps->g->closing_branch->insn, &cmp);
- emit_cmp_and_jump_insns (c_reg, GEN_INT (PS_STAGE_COUNT (ps)), LT, NULL,
- GET_MODE (c_reg), 1, precond_exit_label);
- precond_insns = get_insns ();
- precond_jump = get_last_insn ();
- end_sequence ();
- reorder_insns (precond_insns, precond_jump, insert_after_insn);
-
- /* Generate a subtract instruction at the beginning of the prolog to
- adjust the loop count by STAGE_COUNT. */
- emit_insn_after (gen_sub2_insn (c_reg, GEN_INT (PS_STAGE_COUNT (ps) - 1)),
- precond_jump);
- update_bb_for_insn (precond_bb);
- delete_insn (insert_after_insn);
-
- /* Update label info for the precondition jump. */
- JUMP_LABEL (precond_jump) = precond_exit_label_insn;
- LABEL_NUSES (precond_exit_label_insn)++;
-
- /* Update the CFG. */
- split_block (precond_bb, precond_jump);
- make_edge (precond_bb, orig_loop_bb, 0);
-
- /* Add a jump at end of the epilog to the LOOP_EXIT_LABEL to jump over the
- original loop copy and update the CFG. */
- epilog_jump = emit_jump_insn_after (gen_jump (loop_exit_label),
- last_epilog_insn);
- delete_insn (last_epilog_insn);
- JUMP_LABEL (epilog_jump) = loop_exit_label_insn;
- LABEL_NUSES (loop_exit_label_insn)++;
-
- redirect_edge_succ (epilog_exit_edge, loop_exit_bb);
- epilog_exit_edge->flags &= ~EDGE_FALLTHRU;
- emit_barrier_after (BB_END (epilog_bb));
+ if (dump_file)
+ {
+ rtx line_note = find_line_note (BB_END (loop->header));
+
+ fprintf (dump_file, "SMS loop many exits ");
+ if (line_note)
+ {
+ expanded_location xloc;
+ NOTE_EXPANDED_LOCATION (xloc, line_note);
+ fprintf (dump_file, " %s %d (file, line)\n",
+ xloc.file, xloc.line);
+ }
+ }
+ return false;
+ }
+
+ if (! SIMPLE_SMS_LOOP_P (loop) && ! loop_single_full_bb_p (loop))
+ {
+ if (dump_file)
+ {
+ rtx line_note = find_line_note (BB_END (loop->header));
+
+ fprintf (dump_file, "SMS loop many BBs. ");
+ if (line_note)
+ {
+ expanded_location xloc;
+ NOTE_EXPANDED_LOCATION (xloc, line_note);
+ fprintf (dump_file, " %s %d (file, line)\n",
+ xloc.file, xloc.line);
+ }
+ }
+ return false;
}
+
+ return true;
}
-/* Return the line note insn preceding INSN, for debugging. Taken from
- emit-rtl.c. */
-static rtx
-find_line_note (rtx insn)
+/* If there are more than one entry for the loop,
+ make it one by splitting the first entry edge and
+ redirecting the others to the new BB. */
+static void
+canon_loop (struct loop *loop)
{
- for (; insn; insn = PREV_INSN (insn))
- if (NOTE_P (insn)
- && NOTE_LINE_NUMBER (insn) >= 0)
- break;
+ edge e;
+ edge_iterator i;
- return insn;
+ /* Avoid annoying special cases of edges going to exit
+ block. */
+ FOR_EACH_EDGE (e, i, EXIT_BLOCK_PTR->preds)
+ if ((e->flags & EDGE_FALLTHRU) && (EDGE_COUNT (e->src->succs) > 1))
+ loop_split_edge_with (e, NULL_RTX);
+
+ if (loop->latch == loop->header
+ || EDGE_COUNT (loop->latch->succs) > 1)
+ {
+ FOR_EACH_EDGE (e, i, loop->header->preds)
+ if (e->src == loop->latch)
+ break;
+ loop_split_edge_with (e, NULL_RTX);
+ }
}
/* Main entry point, perform SMS scheduling on the loops of the function
that consist of single basic blocks. */
-void
-sms_schedule (FILE *dump_file)
+static void
+sms_schedule (void)
{
static int passes = 0;
rtx insn;
ddg_ptr *g_arr, g;
- basic_block bb, pre_header = NULL;
int * node_order;
int maxii;
- int i;
+ unsigned i,num_loops;
partial_schedule_ptr ps;
- int max_bb_index = last_basic_block;
struct df *df;
-
- stats_file = dump_file;
+ struct loops *loops;
+ basic_block bb = NULL;
+ /* vars to the versioning only if needed*/
+ struct loop * nloop;
+ basic_block condition_bb = NULL;
+ edge latch_edge;
+ gcov_type trip_count = 0;
+
+ loops = loop_optimizer_init (LOOPS_HAVE_PREHEADERS
+ | LOOPS_HAVE_MARKED_SINGLE_EXITS);
+ if (!loops)
+ return; /* There is no loops to schedule. */
/* Initialize issue_rate. */
if (targetm.sched.issue_rate)
int temp = reload_completed;
reload_completed = 1;
- issue_rate = (*targetm.sched.issue_rate) ();
+ issue_rate = targetm.sched.issue_rate ();
reload_completed = temp;
}
else
/* Initialize the scheduler. */
current_sched_info = &sms_sched_info;
- sched_init (NULL);
+ sched_init ();
/* Init Data Flow analysis, to be used in interloop dep calculation. */
- df = df_init ();
- df_analyze (df, 0, DF_ALL);
+ df = df_init (DF_HARD_REGS | DF_EQUIV_NOTES | DF_SUBREGS);
+ df_rd_add_problem (df, 0);
+ df_ru_add_problem (df, 0);
+ df_chain_add_problem (df, DF_DU_CHAIN | DF_UD_CHAIN);
+ df_analyze (df);
+
+ if (dump_file)
+ df_dump (df, dump_file);
+
+ /* Allocate memory to hold the DDG array one entry for each loop.
+ We use loop->num as index into this array. */
+ g_arr = XCNEWVEC (ddg_ptr, loops->num);
- /* Allocate memory to hold the DDG array. */
- g_arr = xcalloc (max_bb_index, sizeof (ddg_ptr));
/* Build DDGs for all the relevant loops and hold them in G_ARR
- indexed by the loop BB index. */
- FOR_EACH_BB (bb)
+ indexed by the loop index. */
+ for (i = 0; i < loops->num; i++)
{
rtx head, tail;
- rtx count_reg, comp;
- edge e, pre_header_edge;
+ rtx count_reg;
+ struct loop *loop = loops->parray[i];
- if (bb->index < 0)
- continue;
+ /* For debugging. */
+ if ((passes++ > MAX_SMS_LOOP_NUMBER) && (MAX_SMS_LOOP_NUMBER != -1))
+ {
+ if (dump_file)
+ fprintf (dump_file, "SMS reached MAX_PASSES... \n");
- /* Check if bb has two successors, one being itself. */
- if (EDGE_COUNT (bb->succs) != 2)
- continue;
+ break;
+ }
- if (EDGE_SUCC (bb, 0)->dest != bb && EDGE_SUCC (bb, 1)->dest != bb)
- continue;
+ if (! loop_canon_p (loop))
+ continue;
- if ((EDGE_SUCC (bb, 0)->flags & EDGE_COMPLEX)
- || (EDGE_SUCC (bb, 1)->flags & EDGE_COMPLEX))
+ if (! loop_single_full_bb_p (loop))
continue;
- /* Check if bb has two predecessors, one being itself. */
- if (EDGE_COUNT (bb->preds) != 2)
- continue;
+ bb = loop->header;
- if (EDGE_PRED (bb, 0)->src != bb && EDGE_PRED (bb, 1)->src != bb)
- continue;
+ get_ebb_head_tail (bb, bb, &head, &tail);
+ latch_edge = loop_latch_edge (loop);
+ gcc_assert (loop->single_exit);
+ if (loop->single_exit->count)
+ trip_count = latch_edge->count / loop->single_exit->count;
- if ((EDGE_PRED (bb, 0)->flags & EDGE_COMPLEX)
- || (EDGE_PRED (bb, 1)->flags & EDGE_COMPLEX))
- continue;
+ /* Perfrom SMS only on loops that their average count is above threshold. */
- /* For debugging. */
- if ((passes++ > MAX_SMS_LOOP_NUMBER) && (MAX_SMS_LOOP_NUMBER != -1))
+ if ( latch_edge->count
+ && (latch_edge->count < loop->single_exit->count * SMS_LOOP_AVERAGE_COUNT_THRESHOLD))
{
if (dump_file)
- fprintf (dump_file, "SMS reached MAX_PASSES... \n");
- break;
- }
-
- get_block_head_tail (bb->index, &head, &tail);
- pre_header_edge = EDGE_PRED (bb, 0);
- if (EDGE_PRED (bb, 0)->src != bb)
- pre_header_edge = EDGE_PRED (bb, 1);
-
- /* Perfrom SMS only on loops that their average count is above threshold. */
- if (bb->count < pre_header_edge->count * SMS_LOOP_AVERAGE_COUNT_THRESHOLD)
- {
- if (stats_file)
{
rtx line_note = find_line_note (tail);
{
expanded_location xloc;
NOTE_EXPANDED_LOCATION (xloc, line_note);
- fprintf (stats_file, "SMS bb %s %d (file, line)\n",
+ fprintf (dump_file, "SMS bb %s %d (file, line)\n",
xloc.file, xloc.line);
}
- fprintf (stats_file, "SMS single-bb-loop\n");
+ fprintf (dump_file, "SMS single-bb-loop\n");
if (profile_info && flag_branch_probabilities)
{
- fprintf (stats_file, "SMS loop-count ");
- fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
+ fprintf (dump_file, "SMS loop-count ");
+ fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
(HOST_WIDEST_INT) bb->count);
- fprintf (stats_file, "\n");
- fprintf (stats_file, "SMS preheader-count ");
- fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
- (HOST_WIDEST_INT) pre_header_edge->count);
- fprintf (stats_file, "\n");
- fprintf (stats_file, "SMS profile-sum-max ");
- fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
+ fprintf (dump_file, "\n");
+ fprintf (dump_file, "SMS trip-count ");
+ fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
+ (HOST_WIDEST_INT) trip_count);
+ fprintf (dump_file, "\n");
+ fprintf (dump_file, "SMS profile-sum-max ");
+ fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
(HOST_WIDEST_INT) profile_info->sum_max);
- fprintf (stats_file, "\n");
+ fprintf (dump_file, "\n");
}
}
continue;
}
/* Make sure this is a doloop. */
- if ( !(count_reg = doloop_register_get (tail, &comp)))
+ if ( !(count_reg = doloop_register_get (tail)))
continue;
- e = EDGE_PRED (bb, 0);
- if (e->src == bb)
- pre_header = EDGE_PRED (bb, 1)->src;
- else
- pre_header = e->src;
-
/* Don't handle BBs with calls or barriers, or !single_set insns. */
for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn))
if (CALL_P (insn)
if (insn != NEXT_INSN (tail))
{
- if (stats_file)
+ if (dump_file)
{
if (CALL_P (insn))
- fprintf (stats_file, "SMS loop-with-call\n");
+ fprintf (dump_file, "SMS loop-with-call\n");
else if (BARRIER_P (insn))
- fprintf (stats_file, "SMS loop-with-barrier\n");
+ fprintf (dump_file, "SMS loop-with-barrier\n");
else
- fprintf (stats_file, "SMS loop-with-not-single-set\n");
- print_rtl_single (stats_file, insn);
+ fprintf (dump_file, "SMS loop-with-not-single-set\n");
+ print_rtl_single (dump_file, insn);
}
continue;
if (! (g = create_ddg (bb, df, 0)))
{
- if (stats_file)
- fprintf (stats_file, "SMS doloop\n");
+ if (dump_file)
+ fprintf (dump_file, "SMS doloop\n");
continue;
}
- g_arr[bb->index] = g;
+ g_arr[i] = g;
}
/* Release Data Flow analysis data structures. */
df_finish (df);
+ df = NULL;
+ /* We don't want to perform SMS on new loops - created by versioning. */
+ num_loops = loops->num;
/* Go over the built DDGs and perfrom SMS for each one of them. */
- for (i = 0; i < max_bb_index; i++)
+ for (i = 0; i < num_loops; i++)
{
rtx head, tail;
- rtx count_reg, count_init, comp;
- edge pre_header_edge;
+ rtx count_reg, count_init;
int mii, rec_mii;
- int stage_count = 0;
+ unsigned stage_count = 0;
HOST_WIDEST_INT loop_count = 0;
+ struct loop *loop = loops->parray[i];
if (! (g = g_arr[i]))
continue;
if (dump_file)
print_ddg (dump_file, g);
- get_block_head_tail (g->bb->index, &head, &tail);
+ get_ebb_head_tail (loop->header, loop->header, &head, &tail);
- pre_header_edge = EDGE_PRED (g->bb, 0);
- if (EDGE_PRED (g->bb, 0)->src != g->bb)
- pre_header_edge = EDGE_PRED (g->bb, 1);
+ latch_edge = loop_latch_edge (loop);
+ gcc_assert (loop->single_exit);
+ if (loop->single_exit->count)
+ trip_count = latch_edge->count / loop->single_exit->count;
- if (stats_file)
+ if (dump_file)
{
rtx line_note = find_line_note (tail);
{
expanded_location xloc;
NOTE_EXPANDED_LOCATION (xloc, line_note);
- fprintf (stats_file, "SMS bb %s %d (file, line)\n",
+ fprintf (dump_file, "SMS bb %s %d (file, line)\n",
xloc.file, xloc.line);
}
- fprintf (stats_file, "SMS single-bb-loop\n");
+ fprintf (dump_file, "SMS single-bb-loop\n");
if (profile_info && flag_branch_probabilities)
{
- fprintf (stats_file, "SMS loop-count ");
- fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
+ fprintf (dump_file, "SMS loop-count ");
+ fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
(HOST_WIDEST_INT) bb->count);
- fprintf (stats_file, "\n");
- fprintf (stats_file, "SMS preheader-count ");
- fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
- (HOST_WIDEST_INT) pre_header_edge->count);
- fprintf (stats_file, "\n");
- fprintf (stats_file, "SMS profile-sum-max ");
- fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
+ fprintf (dump_file, "\n");
+ fprintf (dump_file, "SMS profile-sum-max ");
+ fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
(HOST_WIDEST_INT) profile_info->sum_max);
- fprintf (stats_file, "\n");
+ fprintf (dump_file, "\n");
}
- fprintf (stats_file, "SMS doloop\n");
- fprintf (stats_file, "SMS built-ddg %d\n", g->num_nodes);
- fprintf (stats_file, "SMS num-loads %d\n", g->num_loads);
- fprintf (stats_file, "SMS num-stores %d\n", g->num_stores);
+ fprintf (dump_file, "SMS doloop\n");
+ fprintf (dump_file, "SMS built-ddg %d\n", g->num_nodes);
+ fprintf (dump_file, "SMS num-loads %d\n", g->num_loads);
+ fprintf (dump_file, "SMS num-stores %d\n", g->num_stores);
}
- /* Make sure this is a doloop. */
- if ( !(count_reg = doloop_register_get (tail, &comp)))
- abort ();
- /* This should be NULL_RTX if the count is unknown at compile time. */
- count_init = const_iteration_count (count_reg, pre_header, &loop_count);
+ /* In case of th loop have doloop register it gets special
+ handling. */
+ count_init = NULL_RTX;
+ if ((count_reg = doloop_register_get (tail)))
+ {
+ basic_block pre_header;
- if (stats_file && count_init)
+ pre_header = loop_preheader_edge (loop)->src;
+ count_init = const_iteration_count (count_reg, pre_header,
+ &loop_count);
+ }
+ gcc_assert (count_reg);
+
+ if (dump_file && count_init)
{
- fprintf (stats_file, "SMS const-doloop ");
- fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC, loop_count);
- fprintf (stats_file, "\n");
+ fprintf (dump_file, "SMS const-doloop ");
+ fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
+ loop_count);
+ fprintf (dump_file, "\n");
}
- node_order = (int *) xmalloc (sizeof (int) * g->num_nodes);
+ node_order = XNEWVEC (int, g->num_nodes);
mii = 1; /* Need to pass some estimate of mii. */
rec_mii = sms_order_nodes (g, mii, node_order);
mii = MAX (res_MII (g), rec_mii);
maxii = (calculate_maxii (g) * SMS_MAX_II_FACTOR) / 100;
- if (stats_file)
- fprintf (stats_file, "SMS iis %d %d %d (rec_mii, mii, maxii)\n",
+ if (dump_file)
+ fprintf (dump_file, "SMS iis %d %d %d (rec_mii, mii, maxii)\n",
rec_mii, mii, maxii);
/* After sms_order_nodes and before sms_schedule_by_order, to copy over
ASAP. */
set_node_sched_params (g);
- ps = sms_schedule_by_order (g, mii, maxii, node_order, dump_file);
+ ps = sms_schedule_by_order (g, mii, maxii, node_order);
if (ps)
stage_count = PS_STAGE_COUNT (ps);
- if (stage_count == 0 || (count_init && (stage_count > loop_count)))
+ /* Stage count of 1 means that there is no interleaving between
+ iterations, let the scheduling passes do the job. */
+ if (stage_count < 1
+ || (count_init && (loop_count <= stage_count))
+ || (flag_branch_probabilities && (trip_count <= stage_count)))
{
if (dump_file)
- fprintf (dump_file, "SMS failed... \n");
- if (stats_file)
- fprintf (stats_file, "SMS sched-failed %d\n", stage_count);
+ {
+ fprintf (dump_file, "SMS failed... \n");
+ fprintf (dump_file, "SMS sched-failed (stage-count=%d, loop-count=", stage_count);
+ fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, loop_count);
+ fprintf (dump_file, ", trip-count=");
+ fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, trip_count);
+ fprintf (dump_file, ")\n");
+ }
+ continue;
}
else
{
- rtx orig_loop_beg = NULL_RTX;
- rtx orig_loop_end = NULL_RTX;
+ int orig_cycles = kernel_number_of_cycles (BB_HEAD (g->bb), BB_END (g->bb));
+ int new_cycles;
+ struct undo_replace_buff_elem *reg_move_replaces;
- if (stats_file)
+ if (dump_file)
{
- fprintf (stats_file,
+ fprintf (dump_file,
"SMS succeeded %d %d (with ii, sc)\n", ps->ii,
stage_count);
print_partial_schedule (ps, dump_file);
g->closing_branch->cuid, PS_MIN_CYCLE (ps) - 1);
}
- /* Save the original loop if we want to do loop preconditioning in
- case the BCT count is not known. */
- if (! count_init)
- {
- int i;
-
- start_sequence ();
- /* Copy the original loop code before modifying it -
- so we can use it later. */
- for (i = 0; i < ps->g->num_nodes; i++)
- duplicate_insn_chain (ps->g->nodes[i].first_note,
- ps->g->nodes[i].insn);
-
- orig_loop_beg = get_insns ();
- orig_loop_end = get_last_insn ();
- end_sequence ();
- }
/* Set the stage boundaries. If the DDG is built with closing_branch_deps,
the closing_branch was scheduled and should appear in the last (ii-1)
row. Otherwise, we are free to schedule the branch, and we let nodes
rotate_partial_schedule (ps, PS_MIN_CYCLE (ps));
set_columns_for_ps (ps);
+ /* Generate the kernel just to be able to measure its cycles. */
permute_partial_schedule (ps, g->closing_branch->first_note);
+ reg_move_replaces = generate_reg_moves (ps);
- /* Mark this loop as software pipelined so the later
- scheduling passes doesn't touch it. */
- if (! flag_resched_modulo_sched)
- g->bb->flags |= BB_DISABLE_SCHEDULE;
+ /* Get the number of cycles the new kernel expect to execute in. */
+ new_cycles = kernel_number_of_cycles (BB_HEAD (g->bb), BB_END (g->bb));
- generate_reg_moves (ps);
- if (dump_file)
- print_node_sched_params (dump_file, g->num_nodes);
+ /* Get back to the original loop so we can do loop versioning. */
+ undo_permute_partial_schedule (ps, g->closing_branch->first_note);
+ if (reg_move_replaces)
+ undo_generate_reg_moves (ps, reg_move_replaces);
+
+ if ( new_cycles >= orig_cycles)
+ {
+ /* SMS is not profitable so undo the permutation and reg move generation
+ and return the kernel to its original state. */
+ if (dump_file)
+ fprintf (dump_file, "Undoing SMS because it is not profitable.\n");
+
+ }
+ else
+ {
+ canon_loop (loop);
- /* Set new iteration count of loop kernel. */
- if (count_init)
- SET_SRC (single_set (count_init)) = GEN_INT (loop_count
- - stage_count + 1);
+ /* case the BCT count is not known , Do loop-versioning */
+ if (count_reg && ! count_init)
+ {
+ rtx comp_rtx = gen_rtx_fmt_ee (GT, VOIDmode, count_reg,
+ GEN_INT(stage_count));
- /* Generate prolog and epilog. */
- generate_prolog_epilog (ps, orig_loop_beg, orig_loop_end,
- count_init ? 0 : 1);
+ nloop = loop_version (loops, loop, comp_rtx, &condition_bb,
+ true);
+ }
+
+ /* Set new iteration count of loop kernel. */
+ if (count_reg && count_init)
+ SET_SRC (single_set (count_init)) = GEN_INT (loop_count
+ - stage_count + 1);
+
+ /* Now apply the scheduled kernel to the RTL of the loop. */
+ permute_partial_schedule (ps, g->closing_branch->first_note);
+
+ /* Mark this loop as software pipelined so the later
+ scheduling passes doesn't touch it. */
+ if (! flag_resched_modulo_sched)
+ g->bb->flags |= BB_DISABLE_SCHEDULE;
+ /* The life-info is not valid any more. */
+ g->bb->flags |= BB_DIRTY;
+
+ reg_move_replaces = generate_reg_moves (ps);
+ if (dump_file)
+ print_node_sched_params (dump_file, g->num_nodes);
+ /* Generate prolog and epilog. */
+ if (count_reg && !count_init)
+ generate_prolog_epilog (ps, loop, count_reg);
+ else
+ generate_prolog_epilog (ps, loop, NULL_RTX);
+ }
+ free_undo_replace_buff (reg_move_replaces);
}
+
free_partial_schedule (ps);
free (node_sched_params);
free (node_order);
free_ddg (g);
}
+ free (g_arr);
+
/* Release scheduler data, needed until now because of DFA. */
sched_finish ();
+ loop_optimizer_finalize (loops);
}
/* The SMS scheduling algorithm itself
set to 0 to save compile time. */
#define DFA_HISTORY SMS_DFA_HISTORY
+/* Given the partial schedule PS, this function calculates and returns the
+ cycles in which we can schedule the node with the given index I.
+ NOTE: Here we do the backtracking in SMS, in some special cases. We have
+ noticed that there are several cases in which we fail to SMS the loop
+ because the sched window of a node is empty due to tight data-deps. In
+ such cases we want to unschedule some of the predecessors/successors
+ until we get non-empty scheduling window. It returns -1 if the
+ scheduling window is empty and zero otherwise. */
+
+static int
+get_sched_window (partial_schedule_ptr ps, int *nodes_order, int i,
+ sbitmap sched_nodes, int ii, int *start_p, int *step_p, int *end_p)
+{
+ int start, step, end;
+ ddg_edge_ptr e;
+ int u = nodes_order [i];
+ ddg_node_ptr u_node = &ps->g->nodes[u];
+ sbitmap psp = sbitmap_alloc (ps->g->num_nodes);
+ sbitmap pss = sbitmap_alloc (ps->g->num_nodes);
+ sbitmap u_node_preds = NODE_PREDECESSORS (u_node);
+ sbitmap u_node_succs = NODE_SUCCESSORS (u_node);
+ int psp_not_empty;
+ int pss_not_empty;
+
+ /* 1. compute sched window for u (start, end, step). */
+ sbitmap_zero (psp);
+ sbitmap_zero (pss);
+ psp_not_empty = sbitmap_a_and_b_cg (psp, u_node_preds, sched_nodes);
+ pss_not_empty = sbitmap_a_and_b_cg (pss, u_node_succs, sched_nodes);
+
+ if (psp_not_empty && !pss_not_empty)
+ {
+ int early_start = INT_MIN;
+
+ end = INT_MAX;
+ for (e = u_node->in; e != 0; e = e->next_in)
+ {
+ ddg_node_ptr v_node = e->src;
+ if (TEST_BIT (sched_nodes, v_node->cuid))
+ {
+ int node_st = SCHED_TIME (v_node)
+ + e->latency - (e->distance * ii);
+
+ early_start = MAX (early_start, node_st);
+
+ if (e->data_type == MEM_DEP)
+ end = MIN (end, SCHED_TIME (v_node) + ii - 1);
+ }
+ }
+ start = early_start;
+ end = MIN (end, early_start + ii);
+ step = 1;
+ }
+
+ else if (!psp_not_empty && pss_not_empty)
+ {
+ int late_start = INT_MAX;
+
+ end = INT_MIN;
+ for (e = u_node->out; e != 0; e = e->next_out)
+ {
+ ddg_node_ptr v_node = e->dest;
+ if (TEST_BIT (sched_nodes, v_node->cuid))
+ {
+ late_start = MIN (late_start,
+ SCHED_TIME (v_node) - e->latency
+ + (e->distance * ii));
+ if (e->data_type == MEM_DEP)
+ end = MAX (end, SCHED_TIME (v_node) - ii + 1);
+ }
+ }
+ start = late_start;
+ end = MAX (end, late_start - ii);
+ step = -1;
+ }
+
+ else if (psp_not_empty && pss_not_empty)
+ {
+ int early_start = INT_MIN;
+ int late_start = INT_MAX;
+
+ start = INT_MIN;
+ end = INT_MAX;
+ for (e = u_node->in; e != 0; e = e->next_in)
+ {
+ ddg_node_ptr v_node = e->src;
+
+ if (TEST_BIT (sched_nodes, v_node->cuid))
+ {
+ early_start = MAX (early_start,
+ SCHED_TIME (v_node) + e->latency
+ - (e->distance * ii));
+ if (e->data_type == MEM_DEP)
+ end = MIN (end, SCHED_TIME (v_node) + ii - 1);
+ }
+ }
+ for (e = u_node->out; e != 0; e = e->next_out)
+ {
+ ddg_node_ptr v_node = e->dest;
+
+ if (TEST_BIT (sched_nodes, v_node->cuid))
+ {
+ late_start = MIN (late_start,
+ SCHED_TIME (v_node) - e->latency
+ + (e->distance * ii));
+ if (e->data_type == MEM_DEP)
+ start = MAX (start, SCHED_TIME (v_node) - ii + 1);
+ }
+ }
+ start = MAX (start, early_start);
+ end = MIN (end, MIN (early_start + ii, late_start + 1));
+ step = 1;
+ }
+ else /* psp is empty && pss is empty. */
+ {
+ start = SCHED_ASAP (u_node);
+ end = start + ii;
+ step = 1;
+ }
+
+ *start_p = start;
+ *step_p = step;
+ *end_p = end;
+ sbitmap_free (psp);
+ sbitmap_free (pss);
+
+ if ((start >= end && step == 1) || (start <= end && step == -1))
+ return -1;
+ else
+ return 0;
+}
+
+/* This function implements the scheduling algorithm for SMS according to the
+ above algorithm. */
static partial_schedule_ptr
-sms_schedule_by_order (ddg_ptr g, int mii, int maxii, int *nodes_order, FILE *dump_file)
+sms_schedule_by_order (ddg_ptr g, int mii, int maxii, int *nodes_order)
{
int ii = mii;
int i, c, success;
ddg_edge_ptr e;
int start, end, step; /* Place together into one struct? */
sbitmap sched_nodes = sbitmap_alloc (num_nodes);
- sbitmap psp = sbitmap_alloc (num_nodes);
- sbitmap pss = sbitmap_alloc (num_nodes);
sbitmap must_precede = sbitmap_alloc (num_nodes);
sbitmap must_follow = sbitmap_alloc (num_nodes);
+ sbitmap tobe_scheduled = sbitmap_alloc (num_nodes);
partial_schedule_ptr ps = create_partial_schedule (ii, g, DFA_HISTORY);
- while (try_again_with_larger_ii && ii < maxii)
+ sbitmap_ones (tobe_scheduled);
+ sbitmap_zero (sched_nodes);
+
+ while ((! sbitmap_equal (tobe_scheduled, sched_nodes)
+ || try_again_with_larger_ii ) && ii < maxii)
{
+ int j;
+ bool unscheduled_nodes = false;
+
if (dump_file)
fprintf(dump_file, "Starting with ii=%d\n", ii);
- try_again_with_larger_ii = false;
- sbitmap_zero (sched_nodes);
+ if (try_again_with_larger_ii)
+ {
+ try_again_with_larger_ii = false;
+ sbitmap_zero (sched_nodes);
+ }
for (i = 0; i < num_nodes; i++)
{
int u = nodes_order[i];
- ddg_node_ptr u_node = &g->nodes[u];
- sbitmap u_node_preds = NODE_PREDECESSORS (u_node);
- sbitmap u_node_succs = NODE_SUCCESSORS (u_node);
- int psp_not_empty;
- int pss_not_empty;
+ ddg_node_ptr u_node = &ps->g->nodes[u];
rtx insn = u_node->insn;
if (!INSN_P (insn))
- continue;
-
- if (JUMP_P (insn)) /* Closing branch handled later. */
- continue;
-
- /* 1. compute sched window for u (start, end, step). */
- sbitmap_zero (psp);
- sbitmap_zero (pss);
- psp_not_empty = sbitmap_a_and_b_cg (psp, u_node_preds, sched_nodes);
- pss_not_empty = sbitmap_a_and_b_cg (pss, u_node_succs, sched_nodes);
-
- if (psp_not_empty && !pss_not_empty)
{
- int early_start = 0;
-
- end = INT_MAX;
- for (e = u_node->in; e != 0; e = e->next_in)
- {
- ddg_node_ptr v_node = e->src;
- if (TEST_BIT (sched_nodes, v_node->cuid))
- {
- int node_st = SCHED_TIME (v_node)
- + e->latency - (e->distance * ii);
-
- early_start = MAX (early_start, node_st);
-
- if (e->data_type == MEM_DEP)
- end = MIN (end, SCHED_TIME (v_node) + ii - 1);
- }
- }
- start = early_start;
- end = MIN (end, early_start + ii);
- step = 1;
+ RESET_BIT (tobe_scheduled, u);
+ continue;
}
- else if (!psp_not_empty && pss_not_empty)
+ if (JUMP_P (insn)) /* Closing branch handled later. */
{
- int late_start = INT_MAX;
-
- end = INT_MIN;
- for (e = u_node->out; e != 0; e = e->next_out)
- {
- ddg_node_ptr v_node = e->dest;
- if (TEST_BIT (sched_nodes, v_node->cuid))
- {
- late_start = MIN (late_start,
- SCHED_TIME (v_node) - e->latency
- + (e->distance * ii));
- if (e->data_type == MEM_DEP)
- end = MAX (end, SCHED_TIME (v_node) - ii + 1);
- }
- }
- start = late_start;
- end = MAX (end, late_start - ii);
- step = -1;
+ RESET_BIT (tobe_scheduled, u);
+ continue;
}
- else if (psp_not_empty && pss_not_empty)
- {
- int early_start = 0;
- int late_start = INT_MAX;
+ if (TEST_BIT (sched_nodes, u))
+ continue;
- start = INT_MIN;
- end = INT_MAX;
- for (e = u_node->in; e != 0; e = e->next_in)
- {
- ddg_node_ptr v_node = e->src;
-
- if (TEST_BIT (sched_nodes, v_node->cuid))
- {
- early_start = MAX (early_start,
- SCHED_TIME (v_node) + e->latency
- - (e->distance * ii));
- if (e->data_type == MEM_DEP)
- end = MIN (end, SCHED_TIME (v_node) + ii - 1);
- }
- }
- for (e = u_node->out; e != 0; e = e->next_out)
+ /* Try to get non-empty scheduling window. */
+ j = i;
+ while (get_sched_window (ps, nodes_order, i, sched_nodes, ii, &start, &step, &end) < 0
+ && j > 0)
+ {
+ unscheduled_nodes = true;
+ if (TEST_BIT (NODE_PREDECESSORS (u_node), nodes_order[j - 1])
+ || TEST_BIT (NODE_SUCCESSORS (u_node), nodes_order[j - 1]))
{
- ddg_node_ptr v_node = e->dest;
-
- if (TEST_BIT (sched_nodes, v_node->cuid))
- {
- late_start = MIN (late_start,
- SCHED_TIME (v_node) - e->latency
- + (e->distance * ii));
- if (e->data_type == MEM_DEP)
- start = MAX (start, SCHED_TIME (v_node) - ii + 1);
- }
+ ps_unschedule_node (ps, &ps->g->nodes[nodes_order[j - 1]]);
+ RESET_BIT (sched_nodes, nodes_order [j - 1]);
}
- start = MAX (start, early_start);
- end = MIN (end, MIN (early_start + ii, late_start + 1));
- step = 1;
+ j--;
}
- else /* psp is empty && pss is empty. */
+ if (j < 0)
{
- start = SCHED_ASAP (u_node);
- end = start + ii;
- step = 1;
+ /* ??? Try backtracking instead of immediately ii++? */
+ ii++;
+ try_again_with_larger_ii = true;
+ reset_partial_schedule (ps, ii);
+ break;
}
-
/* 2. Try scheduling u in window. */
if (dump_file)
fprintf(dump_file, "Trying to schedule node %d in (%d .. %d) step %d\n",
reset_partial_schedule (ps, ii);
break;
}
+ if (unscheduled_nodes)
+ break;
+
/* ??? If (success), check register pressure estimates. */
} /* Continue with next node. */
} /* While try_again_with_larger_ii. */
sbitmap_free (sched_nodes);
- sbitmap_free (psp);
- sbitmap_free (pss);
+ sbitmap_free (must_precede);
+ sbitmap_free (must_follow);
+ sbitmap_free (tobe_scheduled);
if (ii >= maxii)
{
{
int u = node_order[i];
- if (u >= num_nodes || u < 0 || TEST_BIT (tmp, u))
- abort ();
+ gcc_assert (u < num_nodes && u >= 0 && !TEST_BIT (tmp, u));
SET_BIT (tmp, u);
}
static int
find_max_asap (ddg_ptr g, sbitmap nodes)
{
- int u;
+ unsigned int u = 0;
int max_asap = -1;
int result = -1;
+ sbitmap_iterator sbi;
- EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u,
+ EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
{
ddg_node_ptr u_node = &g->nodes[u];
max_asap = ASAP (u_node);
result = u;
}
- });
+ }
return result;
}
static int
find_max_hv_min_mob (ddg_ptr g, sbitmap nodes)
{
- int u;
+ unsigned int u = 0;
int max_hv = -1;
int min_mob = INT_MAX;
int result = -1;
+ sbitmap_iterator sbi;
- EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u,
+ EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
{
ddg_node_ptr u_node = &g->nodes[u];
min_mob = MOB (u_node);
result = u;
}
- });
+ }
return result;
}
static int
find_max_dv_min_mob (ddg_ptr g, sbitmap nodes)
{
- int u;
+ unsigned int u = 0;
int max_dv = -1;
int min_mob = INT_MAX;
int result = -1;
+ sbitmap_iterator sbi;
- EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u,
+ EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
{
ddg_node_ptr u_node = &g->nodes[u];
min_mob = MOB (u_node);
result = u;
}
- });
+ }
return result;
}
modulo scheduling. */
/* Create a partial schedule and allocate a memory to hold II rows. */
-partial_schedule_ptr
+
+static partial_schedule_ptr
create_partial_schedule (int ii, ddg_ptr g, int history)
{
- partial_schedule_ptr ps = (partial_schedule_ptr)
- xmalloc (sizeof (struct partial_schedule));
+ partial_schedule_ptr ps = XNEW (struct partial_schedule);
ps->rows = (ps_insn_ptr *) xcalloc (ii, sizeof (ps_insn_ptr));
ps->ii = ii;
ps->history = history;
}
/* Free all the memory allocated to the partial schedule. */
-void
+
+static void
free_partial_schedule (partial_schedule_ptr ps)
{
if (!ps)
/* Clear the rows array with its PS_INSNs, and create a new one with
NEW_II rows. */
-void
+
+static void
reset_partial_schedule (partial_schedule_ptr ps, int new_ii)
{
if (!ps)
static ps_insn_ptr
create_ps_insn (ddg_node_ptr node, int rest_count, int cycle)
{
- ps_insn_ptr ps_i = xmalloc (sizeof (struct ps_insn));
+ ps_insn_ptr ps_i = XNEW (struct ps_insn);
ps_i->node = node;
ps_i->next_in_row = NULL;
/* Removes the given PS_INSN from the partial schedule. Returns false if the
node is not found in the partial schedule, else returns true. */
-static int
+static bool
remove_node_from_ps (partial_schedule_ptr ps, ps_insn_ptr ps_i)
{
int row;
/* Find the first must follow and the last must precede
and insert the node immediately after the must precede
- but make sure that it there is no must follow after it. */
+ but make sure that it there is no must follow after it. */
for (next_ps_i = ps->rows[row];
next_ps_i;
next_ps_i = next_ps_i->next_in_row)
{
if (targetm.sched.dfa_pre_cycle_insn)
state_transition (curr_state,
- (*targetm.sched.dfa_pre_cycle_insn) ());
+ targetm.sched.dfa_pre_cycle_insn ());
state_transition (curr_state, NULL);
if (targetm.sched.dfa_post_cycle_insn)
state_transition (curr_state,
- (*targetm.sched.dfa_post_cycle_insn) ());
+ targetm.sched.dfa_post_cycle_insn ());
+}
+
+/* Given the kernel of a loop (from FIRST_INSN to LAST_INSN), finds
+ the number of cycles according to DFA that the kernel fits in,
+ we use this to check if we done well with SMS after we add
+ register moves. In some cases register moves overhead makes
+ it even worse than the original loop. We want SMS to be performed
+ when it gives less cycles after register moves are added. */
+static int
+kernel_number_of_cycles (rtx first_insn, rtx last_insn)
+{
+ int cycles = 0;
+ rtx insn;
+ int can_issue_more = issue_rate;
+
+ state_reset (curr_state);
+
+ for (insn = first_insn;
+ insn != NULL_RTX && insn != last_insn;
+ insn = NEXT_INSN (insn))
+ {
+ if (! INSN_P (insn) || GET_CODE (PATTERN (insn)) == USE)
+ continue;
+
+ /* Check if there is room for the current insn. */
+ if (!can_issue_more || state_dead_lock_p (curr_state))
+ {
+ cycles ++;
+ advance_one_cycle ();
+ can_issue_more = issue_rate;
+ }
+
+ /* Update the DFA state and return with failure if the DFA found
+ recource conflicts. */
+ if (state_transition (curr_state, insn) >= 0)
+ {
+ cycles ++;
+ advance_one_cycle ();
+ can_issue_more = issue_rate;
+ }
+
+ if (targetm.sched.variable_issue)
+ can_issue_more =
+ targetm.sched.variable_issue (sched_dump, sched_verbose,
+ insn, can_issue_more);
+ /* A naked CLOBBER or USE generates no instruction, so don't
+ let them consume issue slots. */
+ else if (GET_CODE (PATTERN (insn)) != USE
+ && GET_CODE (PATTERN (insn)) != CLOBBER)
+ can_issue_more--;
+ }
+ return cycles;
}
/* Checks if PS has resource conflicts according to DFA, starting from
if (targetm.sched.variable_issue)
can_issue_more =
- (*targetm.sched.variable_issue) (sched_dump, sched_verbose,
- insn, can_issue_more);
+ targetm.sched.variable_issue (sched_dump, sched_verbose,
+ insn, can_issue_more);
/* A naked CLOBBER or USE generates no instruction, so don't
let them consume issue slots. */
else if (GET_CODE (PATTERN (insn)) != USE
ps->min_cycle -= start_cycle;
}
-#endif /* INSN_SCHEDULING*/
+/* Remove the node N from the partial schedule PS; because we restart the DFA
+ each time we want to check for resource conflicts; this is equivalent to
+ unscheduling the node N. */
+static bool
+ps_unschedule_node (partial_schedule_ptr ps, ddg_node_ptr n)
+{
+ ps_insn_ptr ps_i;
+ int row = SMODULO (SCHED_TIME (n), ps->ii);
+
+ if (row < 0 || row > ps->ii)
+ return false;
+
+ for (ps_i = ps->rows[row];
+ ps_i && ps_i->node != n;
+ ps_i = ps_i->next_in_row);
+ if (!ps_i)
+ return false;
+
+ return remove_node_from_ps (ps, ps_i);
+}
+#endif /* INSN_SCHEDULING */
+\f
+static bool
+gate_handle_sms (void)
+{
+ return (optimize > 0 && flag_modulo_sched);
+}
+
+
+/* Run instruction scheduler. */
+/* Perform SMS module scheduling. */
+static unsigned int
+rest_of_handle_sms (void)
+{
+#ifdef INSN_SCHEDULING
+ basic_block bb;
+
+ /* We want to be able to create new pseudos. */
+ no_new_pseudos = 0;
+ /* Collect loop information to be used in SMS. */
+ cfg_layout_initialize (CLEANUP_UPDATE_LIFE);
+ sms_schedule ();
+
+ /* Update the life information, because we add pseudos. */
+ max_regno = max_reg_num ();
+ allocate_reg_info (max_regno, FALSE, FALSE);
+ update_life_info (NULL, UPDATE_LIFE_GLOBAL_RM_NOTES,
+ (PROP_DEATH_NOTES
+ | PROP_REG_INFO
+ | PROP_KILL_DEAD_CODE
+ | PROP_SCAN_DEAD_CODE));
+
+ no_new_pseudos = 1;
+
+ /* Finalize layout changes. */
+ FOR_EACH_BB (bb)
+ if (bb->next_bb != EXIT_BLOCK_PTR)
+ bb->aux = bb->next_bb;
+ cfg_layout_finalize ();
+ free_dominance_info (CDI_DOMINATORS);
+#endif /* INSN_SCHEDULING */
+ return 0;
+}
+
+struct tree_opt_pass pass_sms =
+{
+ "sms", /* name */
+ gate_handle_sms, /* gate */
+ rest_of_handle_sms, /* execute */
+ NULL, /* sub */
+ NULL, /* next */
+ 0, /* static_pass_number */
+ TV_SMS, /* tv_id */
+ 0, /* properties_required */
+ 0, /* properties_provided */
+ 0, /* properties_destroyed */
+ TODO_dump_func, /* todo_flags_start */
+ TODO_dump_func |
+ TODO_ggc_collect, /* todo_flags_finish */
+ 'm' /* letter */
+};
+