1 /* Swing Modulo Scheduling implementation.
3 Free Software Foundation, Inc.
4 Contributed by Ayal Zaks and Mustafa Hagog <zaks,mustafa@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 2, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING. If not, write to the Free
20 Software Foundation, 59 Temple Place - Suite 330, Boston, MA
26 #include "coretypes.h"
31 #include "hard-reg-set.h"
35 #include "insn-config.h"
36 #include "insn-attr.h"
40 #include "sched-int.h"
42 #include "cfglayout.h"
51 #ifdef INSN_SCHEDULING
53 /* This file contains the implementation of the Swing Modulo Scheduler,
54 described in the following references:
55 [1] J. Llosa, A. Gonzalez, E. Ayguade, M. Valero., and J. Eckhardt.
56 Lifetime-sensitive modulo scheduling in a production environment.
57 IEEE Trans. on Comps., 50(3), March 2001
58 [2] J. Llosa, A. Gonzalez, E. Ayguade, and M. Valero.
59 Swing Modulo Scheduling: A Lifetime Sensitive Approach.
60 PACT '96 , pages 80-87, October 1996 (Boston - Massachusetts - USA).
62 The basic structure is:
63 1. Build a data-dependence graph (DDG) for each loop.
64 2. Use the DDG to order the insns of a loop (not in topological order
65 necessarily, but rather) trying to place each insn after all its
66 predecessors _or_ after all its successors.
67 3. Compute MII: a lower bound on the number of cycles to schedule the loop.
68 4. Use the ordering to perform list-scheduling of the loop:
69 1. Set II = MII. We will try to schedule the loop within II cycles.
70 2. Try to schedule the insns one by one according to the ordering.
71 For each insn compute an interval of cycles by considering already-
72 scheduled preds and succs (and associated latencies); try to place
73 the insn in the cycles of this window checking for potential
74 resource conflicts (using the DFA interface).
75 Note: this is different from the cycle-scheduling of schedule_insns;
76 here the insns are not scheduled monotonically top-down (nor bottom-
78 3. If failed in scheduling all insns - bump II++ and try again, unless
79 II reaches an upper bound MaxII, in which case report failure.
80 5. If we succeeded in scheduling the loop within II cycles, we now
81 generate prolog and epilog, decrease the counter of the loop, and
82 perform modulo variable expansion for live ranges that span more than
83 II cycles (i.e. use register copies to prevent a def from overwriting
84 itself before reaching the use).
88 /* This page defines partial-schedule structures and functions for
91 typedef struct partial_schedule *partial_schedule_ptr;
92 typedef struct ps_insn *ps_insn_ptr;
94 /* The minimum (absolute) cycle that a node of ps was scheduled in. */
95 #define PS_MIN_CYCLE(ps) (((partial_schedule_ptr)(ps))->min_cycle)
97 /* The maximum (absolute) cycle that a node of ps was scheduled in. */
98 #define PS_MAX_CYCLE(ps) (((partial_schedule_ptr)(ps))->max_cycle)
100 /* Perform signed modulo, always returning a non-negative value. */
/* NOTE(review): SMODULO evaluates both arguments up to three times; do not
   pass expressions with side effects (e.g. SMODULO (i++, ii)).  */
101 #define SMODULO(x,y) ((x) % (y) < 0 ? ((x) % (y) + (y)) : (x) % (y))
103 /* The number of different iterations the nodes in ps span, assuming
104 the stage boundaries are placed efficiently. */
/* Stage count = ceil (span_in_cycles / ii), computed with the usual
   (a + b - 1) / b rounding-up idiom.  */
105 #define PS_STAGE_COUNT(ps) ((PS_MAX_CYCLE (ps) - PS_MIN_CYCLE (ps) \
106 + 1 + (ps)->ii - 1) / (ps)->ii)
/* This pass runs in cfglayout mode, so use the cfglayout CFG hooks.  */
108 #define CFG_HOOKS cfg_layout_rtl_cfg_hooks
110 /* A single instruction in the partial schedule. */
113 /* The corresponding DDG_NODE. */
116 /* The (absolute) cycle in which the PS instruction is scheduled.
117 Same as SCHED_TIME (node). */
120 /* The next/prev PS_INSN in the same row. */
121 ps_insn_ptr next_in_row,
124 /* The number of nodes in the same row that come after this node. */
128 /* Holds the partial schedule as an array of II rows. Each entry of the
129 array points to a linked list of PS_INSNs, which represents the
130 instructions that are scheduled for that row. */
131 struct partial_schedule
133 int ii; /* Number of rows in the partial schedule. */
134 int history; /* Threshold for conflict checking using DFA. */
136 /* rows[i] points to linked list of insns scheduled in row i (0<=i<ii). */
139 /* The earliest absolute cycle of an insn in the partial schedule. */
142 /* The latest absolute cycle of an insn in the partial schedule. */
145 ddg_ptr g; /* The DDG of the insns in the partial schedule. */
/* Forward declarations for the partial-schedule manipulation routines
   defined later in this file.  */
149 static partial_schedule_ptr create_partial_schedule (int ii, ddg_ptr, int history);
150 static void free_partial_schedule (partial_schedule_ptr);
151 static void reset_partial_schedule (partial_schedule_ptr, int new_ii);
152 void print_partial_schedule (partial_schedule_ptr, FILE *);
153 static ps_insn_ptr ps_add_node_check_conflicts (partial_schedule_ptr,
154 ddg_node_ptr node, int cycle,
155 sbitmap must_precede,
156 sbitmap must_follow);
157 static void rotate_partial_schedule (partial_schedule_ptr, int);
158 void set_row_column_for_ps (partial_schedule_ptr);
161 /* This page defines constants and structures for the modulo scheduling
164 /* As in haifa-sched.c: */
165 /* issue_rate is the number of insns that can be scheduled in the same
166 machine cycle. It can be defined in the config/mach/mach.h file,
167 otherwise we set it to 1. */
169 static int issue_rate;
171 /* For printing statistics. */
172 static FILE *stats_file;
/* Forward declarations for the main SMS driver helpers below.  */
174 static int sms_order_nodes (ddg_ptr, int, int * result);
175 static void set_node_sched_params (ddg_ptr);
176 static partial_schedule_ptr sms_schedule_by_order (ddg_ptr, int, int,
178 static void permute_partial_schedule (partial_schedule_ptr ps, rtx last);
179 static void generate_prolog_epilog (partial_schedule_ptr, rtx, rtx, int);
180 static void duplicate_insns_of_cycles (partial_schedule_ptr ps,
181 int from_stage, int to_stage,
/* Accessors for the per-node scheduling data hung off a ddg_node's
   aux.info field.  Note the cast applies to (x)->aux.info (-> binds
   tighter than the cast), i.e. the stored node_sched_params pointer.  */
185 #define SCHED_ASAP(x) (((node_sched_params_ptr)(x)->aux.info)->asap)
186 #define SCHED_TIME(x) (((node_sched_params_ptr)(x)->aux.info)->time)
187 #define SCHED_FIRST_REG_MOVE(x) \
188 (((node_sched_params_ptr)(x)->aux.info)->first_reg_move)
189 #define SCHED_NREG_MOVES(x) \
190 (((node_sched_params_ptr)(x)->aux.info)->nreg_moves)
191 #define SCHED_ROW(x) (((node_sched_params_ptr)(x)->aux.info)->row)
192 #define SCHED_STAGE(x) (((node_sched_params_ptr)(x)->aux.info)->stage)
193 #define SCHED_COLUMN(x) (((node_sched_params_ptr)(x)->aux.info)->column)
195 /* The scheduling parameters held for each node. */
196 typedef struct node_sched_params
198 int asap; /* A lower-bound on the absolute scheduling cycle. */
199 int time; /* The absolute scheduling cycle (time >= asap). */
201 /* The following field (first_reg_move) is a pointer to the first
202 register-move instruction added to handle the modulo-variable-expansion
203 of the register defined by this node. This register-move copies the
204 original register defined by the node. */
207 /* The number of register-move instructions added, immediately preceding
211 int row; /* Holds time % ii. */
212 int stage; /* Holds time / ii. */
214 /* The column of a node inside the ps. If nodes u, v are on the same row,
215 u will precede v if column (u) < column (v). */
217 } *node_sched_params_ptr;
220 /* The following three functions are copied from the current scheduler
221 code in order to use sched_analyze() for computing the dependencies.
222 They are used when initializing the sched_info structure. */
/* Return a string identifying INSN for scheduler dumps ("i" + uid).
   NOTE(review): formats into a fixed buffer declared in elided lines;
   presumably a static char array -- confirm its size covers "i%4d".  */
224 sms_print_insn (rtx insn, int aligned ATTRIBUTE_UNUSED)
228 sprintf (tmp, "i%4d", INSN_UID (insn));
/* NEXT contributes to INSN's priority only when both are in the same
   basic block -- SMS schedules one block at a time.  */
233 contributes_to_priority (rtx next, rtx insn)
235 return BLOCK_NUM (next) == BLOCK_NUM (insn);
/* Empty hook: SMS does not track extra register deps for jumps.  */
239 compute_jump_reg_dependencies (rtx insn ATTRIBUTE_UNUSED,
240 regset cond_exec ATTRIBUTE_UNUSED,
241 regset used ATTRIBUTE_UNUSED,
242 regset set ATTRIBUTE_UNUSED)
/* Callback table handed to the haifa scheduler infrastructure.  */
246 static struct sched_info sms_sched_info =
254 contributes_to_priority,
255 compute_jump_reg_dependencies,
262 /* Return the register decremented and tested or zero if it is not a decrement
263 and branch jump insn (similar to doloop_condition_get). */
/* On success, *COMP presumably receives the compare SET of the branch
   (the elided lines around "Return the compare rtx" below assign it) --
   confirm against the full source.  Failure paths (elided returns after
   each check) return NULL_RTX.  */
265 doloop_register_get (rtx insn, rtx *comp)
267 rtx pattern, cmp, inc, reg, condition;
271 pattern = PATTERN (insn);
273 /* The canonical doloop pattern we expect is:
275 (parallel [(set (pc) (if_then_else (condition)
278 (set (reg) (plus (reg) (const_int -1)))
279 (additional clobbers and uses)])
281 where condition is further restricted to be
282 (ne (reg) (const_int 1)). */
284 if (GET_CODE (pattern) != PARALLEL)
287 cmp = XVECEXP (pattern, 0, 0);
288 inc = XVECEXP (pattern, 0, 1);
289 /* Return the compare rtx. */
292 /* Check for (set (reg) (something)). */
293 if (GET_CODE (inc) != SET || ! REG_P (SET_DEST (inc)))
296 /* Extract loop counter register. */
297 reg = SET_DEST (inc);
299 /* Check if something = (plus (reg) (const_int -1)). */
300 if (GET_CODE (SET_SRC (inc)) != PLUS
301 || XEXP (SET_SRC (inc), 0) != reg
302 || XEXP (SET_SRC (inc), 1) != constm1_rtx)
305 /* Check for (set (pc) (if_then_else (condition)
308 if (GET_CODE (cmp) != SET
309 || SET_DEST (cmp) != pc_rtx
310 || GET_CODE (SET_SRC (cmp)) != IF_THEN_ELSE
311 || GET_CODE (XEXP (SET_SRC (cmp), 1)) != LABEL_REF
312 || XEXP (SET_SRC (cmp), 2) != pc_rtx)
315 /* Extract loop termination condition. */
316 condition = XEXP (SET_SRC (cmp), 0);
318 /* Check if condition = (ne (reg) (const_int 1)), which is more
319 restrictive than the check in doloop_condition_get:
320 if ((GET_CODE (condition) != GE && GET_CODE (condition) != NE)
321 || GET_CODE (XEXP (condition, 1)) != CONST_INT). */
322 if (GET_CODE (condition) != NE
323 || XEXP (condition, 1) != const1_rtx)
/* Success: the tested register must be the decremented one.  */
326 if (XEXP (condition, 0) == reg)
332 /* Check if COUNT_REG is set to a constant in the PRE_HEADER block, so
333 that the number of iterations is a compile-time constant. If so,
334 return the rtx that sets COUNT_REG to a constant, and set COUNT to
335 this constant. Otherwise return 0. */
337 const_iteration_count (rtx count_reg, basic_block pre_header,
338 HOST_WIDEST_INT * count)
342 get_block_head_tail (pre_header->index, &head, &tail);
/* Walk backwards from TAIL so the *last* assignment to COUNT_REG in the
   pre-header is the one examined.  */
344 for (insn = tail; insn != PREV_INSN (head); insn = PREV_INSN (insn))
345 if (INSN_P (insn) && single_set (insn) &&
346 rtx_equal_p (count_reg, SET_DEST (single_set (insn))))
348 rtx pat = single_set (insn);
/* Only a literal CONST_INT qualifies; any other source means the trip
   count is not a compile-time constant.  */
350 if (GET_CODE (SET_SRC (pat)) == CONST_INT)
352 *count = INTVAL (SET_SRC (pat));
362 /* A very simple resource-based lower bound on the initiation interval.
363 ??? Improve the accuracy of this bound by considering the
364 utilization of various units. */
/* NOTE(review): integer division truncates; presumably callers clamp the
   result to >= 1 (see mii = MAX (res_MII (g), rec_mii) in sms_schedule)
   so a small loop does not yield MII == 0 on wide-issue targets.  */
368 return (g->num_nodes / issue_rate);
372 /* Points to the array that contains the sched data for each node. */
373 static node_sched_params_ptr node_sched_params;
375 /* Allocate sched_params for each node and initialize it. Assumes that
376 the aux field of each node contain the asap bound (computed earlier),
377 and copies it into the sched_params field. */
379 set_node_sched_params (ddg_ptr g)
383 /* Allocate for each node in the DDG a place to hold the "sched_data". */
384 /* Initialize ASAP/ALAP/HEIGHT to zero.  (xcalloc zeroes the array.) */
385 node_sched_params = (node_sched_params_ptr)
386 xcalloc (g->num_nodes,
387 sizeof (struct node_sched_params))
389 /* Set the pointer of the general data of the node to point to the
390 appropriate sched_params structure. */
391 for (i = 0; i < g->num_nodes; i++)
393 /* Watch out for aliasing problems? */
394 node_sched_params[i].asap = g->nodes[i].aux.count;
395 g->nodes[i].aux.info = &node_sched_params[i];
/* Dump the per-node scheduling parameters (asap/time/reg-moves) of the
   first NUM_NODES entries of node_sched_params to DUMP_FILE.  */
400 print_node_sched_params (FILE * dump_file, int num_nodes)
404 for (i = 0; i < num_nodes; i++)
406 node_sched_params_ptr nsp = &node_sched_params[i];
407 rtx reg_move = nsp->first_reg_move;
410 fprintf (dump_file, "Node %d:\n", i);
411 fprintf (dump_file, " asap = %d:\n", nsp->asap);
412 fprintf (dump_file, " time = %d:\n", nsp->time);
413 fprintf (dump_file, " nreg_moves = %d:\n", nsp->nreg_moves);
/* Reg-moves are chained backwards from first_reg_move via PREV_INSN
   (see generate_reg_moves, which uses add_insn_before).  */
414 for (j = 0; j < nsp->nreg_moves; j++)
416 fprintf (dump_file, " reg_move = ");
417 print_rtl_single (dump_file, reg_move);
418 reg_move = PREV_INSN (reg_move);
423 /* Calculate an upper bound for II. SMS should not schedule the loop if it
424 requires more cycles than this bound. Currently set to the sum of the
425 longest latency edge for each node. Reset based on experiments. */
427 calculate_maxii (ddg_ptr g)
432 for (i = 0; i < g->num_nodes; i++)
434 ddg_node_ptr u = &g->nodes[i];
436 int max_edge_latency = 0;
/* For each node, contribute the largest latency among its outgoing
   dependence edges.  */
438 for (e = u->out; e; e = e->next_out)
439 max_edge_latency = MAX (max_edge_latency, e->latency);
441 maxii += max_edge_latency;
447 /* Given the partial schedule, generate register moves when the length
448 of the register live range is more than ii; the number of moves is
449 determined according to the following equation:
450 SCHED_TIME (use) - SCHED_TIME (def) { 1 broken loop-carried
451 nreg_moves = ----------------------------------- - { dependence.
453 This handles the modulo-variable-expansions (mve's) needed for the ps. */
455 generate_reg_moves (partial_schedule_ptr ps)
461 for (i = 0; i < g->num_nodes; i++)
463 ddg_node_ptr u = &g->nodes[i];
465 int nreg_moves = 0, i_reg_move;
466 sbitmap *uses_of_defs;
468 rtx prev_reg, old_reg;
470 /* Compute the number of reg_moves needed for u, by looking at life
471 ranges started at u (excluding self-loops). */
472 for (e = u->out; e; e = e->next_out)
473 if (e->type == TRUE_DEP && e->dest != e->src)
475 int nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;
477 /* If dest precedes src in the schedule of the kernel, then dest
478 will read before src writes and we can save one reg_copy. */
479 if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
480 && SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
483 nreg_moves = MAX (nreg_moves, nreg_moves4e);
489 /* Every use of the register defined by node may require a different
490 copy of this register, depending on the time the use is scheduled.
491 Set a bitmap vector, telling which nodes use each copy of this
493 uses_of_defs = sbitmap_vector_alloc (nreg_moves, g->num_nodes);
494 sbitmap_vector_zero (uses_of_defs, nreg_moves);
/* Second pass over the true dependences: record, per copy index, the
   cuids of the uses that must read that copy.  */
495 for (e = u->out; e; e = e->next_out)
496 if (e->type == TRUE_DEP && e->dest != e->src)
498 int dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;
500 if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
501 && SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
505 SET_BIT (uses_of_defs[dest_copy - 1], e->dest->cuid);
508 /* Now generate the reg_moves, attaching relevant uses to them. */
509 SCHED_NREG_MOVES (u) = nreg_moves;
/* NOTE(review): assumes u->insn has a single_set whose SET_DEST is the
   register being expanded -- guaranteed by the !single_set filtering in
   sms_schedule.  */
510 old_reg = prev_reg = copy_rtx (SET_DEST (single_set (u->insn)));
511 last_reg_move = u->insn;
513 for (i_reg_move = 0; i_reg_move < nreg_moves; i_reg_move++)
516 rtx new_reg = gen_reg_rtx (GET_MODE (prev_reg));
517 rtx reg_move = gen_move_insn (new_reg, prev_reg);
/* Emit moves before the def insn, so first_reg_move..later moves are
   reachable from it via PREV_INSN.  */
519 add_insn_before (reg_move, last_reg_move);
520 last_reg_move = reg_move;
522 if (!SCHED_FIRST_REG_MOVE (u))
523 SCHED_FIRST_REG_MOVE (u) = reg_move;
/* Redirect each use assigned to this copy to read the new register.  */
525 EXECUTE_IF_SET_IN_SBITMAP (uses_of_defs[i_reg_move], 0, i_use,
526 replace_rtx (g->nodes[i_use].insn, old_reg, new_reg));
533 /* Bump the SCHED_TIMEs of all nodes to start from zero. Set the values
534 of SCHED_ROW and SCHED_STAGE. */
536 normalize_sched_times (partial_schedule_ptr ps)
540 int amount = PS_MIN_CYCLE (ps);
543 for (i = 0; i < g->num_nodes; i++)
545 ddg_node_ptr u = &g->nodes[i];
546 int normalized_time = SCHED_TIME (u) - amount;
/* After subtracting the minimum cycle, every time must be >= 0.  */
548 if (normalized_time < 0)
551 SCHED_TIME (u) = normalized_time;
552 SCHED_ROW (u) = normalized_time % ii;
553 SCHED_STAGE (u) = normalized_time / ii;
557 /* Set SCHED_COLUMN of each node according to its position in PS. */
559 set_columns_for_ps (partial_schedule_ptr ps)
563 for (row = 0; row < ps->ii; row++)
565 ps_insn_ptr cur_insn = ps->rows[row];
/* Columns number the insns of a row left-to-right starting at 0;
   ordering within a row is the row list order.  */
568 for (; cur_insn; cur_insn = cur_insn->next_in_row)
569 SCHED_COLUMN (cur_insn->node) = column++;
573 /* Permute the insns according to their order in PS, from row 0 to
574 row ii-1, and position them right before LAST. This schedules
575 the insns of the loop kernel. */
577 permute_partial_schedule (partial_schedule_ptr ps, rtx last)
583 for (row = 0; row < ii ; row++)
584 for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
/* Skip the move when the insn is already immediately before LAST.  */
585 if (PREV_INSN (last) != ps_ij->node->insn)
586 reorder_insns_nobb (ps_ij->node->first_note, ps_ij->node->insn,
590 /* Used to generate the prologue & epilogue. Duplicate the subset of
591 nodes whose stages are between FROM_STAGE and TO_STAGE (inclusive
592 of both), together with a prefix/suffix of their reg_moves. */
594 duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
595 int to_stage, int for_prolog)
600 for (row = 0; row < ps->ii; row++)
601 for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
603 ddg_node_ptr u_node = ps_ij->node;
605 rtx reg_move = NULL_RTX;
609 /* SCHED_STAGE (u_node) >= from_stage == 0. Generate increasing
610 number of reg_moves starting with the second occurrence of
611 u_node, which is generated if its SCHED_STAGE <= to_stage. */
612 i_reg_moves = to_stage - SCHED_STAGE (u_node);
/* Clamp to [0, SCHED_NREG_MOVES (u_node)].  */
613 i_reg_moves = MAX (i_reg_moves, 0);
614 i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));
616 /* The reg_moves start from the *first* reg_move backwards. */
619 reg_move = SCHED_FIRST_REG_MOVE (u_node);
620 for (j = 1; j < i_reg_moves; j++)
621 reg_move = PREV_INSN (reg_move);
624 else /* It's for the epilog. */
626 /* SCHED_STAGE (u_node) <= to_stage. Generate all reg_moves,
627 starting to decrease one stage after u_node no longer occurs;
628 that is, generate all reg_moves until
629 SCHED_STAGE (u_node) == from_stage - 1. */
630 i_reg_moves = SCHED_NREG_MOVES (u_node)
631 - (from_stage - SCHED_STAGE (u_node) - 1);
632 i_reg_moves = MAX (i_reg_moves, 0);
633 i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));
635 /* The reg_moves start from the *last* reg_move forwards. */
638 reg_move = SCHED_FIRST_REG_MOVE (u_node);
639 for (j = 1; j < SCHED_NREG_MOVES (u_node); j++)
640 reg_move = PREV_INSN (reg_move);
/* Emit copies of the selected reg_moves, walking forward from the
   starting point chosen above.  */
644 for (j = 0; j < i_reg_moves; j++, reg_move = NEXT_INSN (reg_move))
645 emit_insn (copy_rtx (PATTERN (reg_move)));
/* Finally, emit a copy of the node itself if its stage is in range.  */
647 if (SCHED_STAGE (u_node) >= from_stage
648 && SCHED_STAGE (u_node) <= to_stage)
649 duplicate_insn_chain (u_node->first_note, u_node->insn);
654 /* Generate the instructions (including reg_moves) for prolog & epilog. */
/* ORIG_LOOP_BEG/ORIG_LOOP_END delimit a saved copy of the original loop
   body; when UNKNOWN_COUNT is nonzero the loop count was not a
   compile-time constant, so a runtime precondition test is emitted that
   falls back to the original loop when count < stage count.  */
656 generate_prolog_epilog (partial_schedule_ptr ps, rtx orig_loop_beg,
657 rtx orig_loop_end, int unknown_count)
660 int last_stage = PS_STAGE_COUNT (ps) - 1;
662 rtx c_reg = NULL_RTX;
664 rtx precond_jump = NULL_RTX;
665 rtx precond_exit_label = NULL_RTX;
666 rtx precond_exit_label_insn = NULL_RTX;
667 rtx last_epilog_insn = NULL_RTX;
668 rtx loop_exit_label = NULL_RTX;
669 rtx loop_exit_label_insn = NULL_RTX;
670 rtx orig_loop_bct = NULL_RTX;
672 /* Loop header edge. */
673 e = EDGE_PRED (ps->g->bb, 0);
674 if (e->src == ps->g->bb)
675 e = EDGE_PRED (ps->g->bb, 1);
677 /* Generate the prolog, inserting its insns on the loop-entry edge. */
680 /* This is the place where we want to insert the precondition. */
/* Placeholder note; replaced by the real compare-and-jump later and
   then deleted.  */
682 precond_jump = emit_note (NOTE_INSN_DELETED);
684 for (i = 0; i < last_stage; i++)
685 duplicate_insns_of_cycles (ps, 0, i, 1);
687 /* No need to call insert_insn_on_edge; we prepared the sequence. */
688 e->insns.r = get_insns ();
691 /* Generate the epilog, inserting its insns on the loop-exit edge. */
694 for (i = 0; i < last_stage; i++)
695 duplicate_insns_of_cycles (ps, i + 1, last_stage, 0);
697 last_epilog_insn = emit_note (NOTE_INSN_DELETED);
699 /* Emit the label where to put the original loop code. */
704 precond_exit_label = gen_label_rtx ();
705 precond_exit_label_insn = emit_label (precond_exit_label);
707 /* Put the original loop code. */
708 reorder_insns_nobb (orig_loop_beg, orig_loop_end, precond_exit_label_insn);
710 /* Change the label of the BCT to be the PRECOND_EXIT_LABEL. */
711 orig_loop_bct = get_last_insn ();
712 c_reg = doloop_register_get (orig_loop_bct, &cmp);
713 label = XEXP (SET_SRC (cmp), 1);
714 cond = XEXP (SET_SRC (cmp), 0);
/* The saved copy must still look like a doloop branch; abort otherwise
   (failure handling elided in this listing).  */
716 if (! c_reg || GET_CODE (cond) != NE)
719 XEXP (label, 0) = precond_exit_label;
720 JUMP_LABEL (orig_loop_bct) = precond_exit_label_insn;
721 LABEL_NUSES (precond_exit_label_insn)++;
723 /* Generate the loop exit label. */
724 loop_exit_label = gen_label_rtx ();
725 loop_exit_label_insn = emit_label (loop_exit_label);
/* Loop exit edge (the successor that is not the back edge).  */
728 e = EDGE_SUCC (ps->g->bb, 0);
729 if (e->dest == ps->g->bb)
730 e = EDGE_SUCC (ps->g->bb, 1);
732 e->insns.r = get_insns ();
735 commit_edge_insertions ();
739 rtx precond_insns, epilog_jump, insert_after_insn;
740 basic_block loop_exit_bb = BLOCK_FOR_INSN (loop_exit_label_insn);
741 basic_block epilog_bb = BLOCK_FOR_INSN (last_epilog_insn);
742 basic_block precond_bb = BLOCK_FOR_INSN (precond_jump);
743 basic_block orig_loop_bb = BLOCK_FOR_INSN (precond_exit_label_insn);
744 edge epilog_exit_edge = EDGE_SUCC (epilog_bb, 0);
746 /* Do loop preconditioning to take care of cases were the loop count is
747 less than the stage count. Update the CFG properly. */
748 insert_after_insn = precond_jump;
750 c_reg = doloop_register_get (ps->g->closing_branch->insn, &cmp);
751 emit_cmp_and_jump_insns (c_reg, GEN_INT (PS_STAGE_COUNT (ps)), LT, NULL,
752 GET_MODE (c_reg), 1, precond_exit_label);
753 precond_insns = get_insns ();
754 precond_jump = get_last_insn ();
756 reorder_insns (precond_insns, precond_jump, insert_after_insn);
758 /* Generate a subtract instruction at the beginning of the prolog to
759 adjust the loop count by STAGE_COUNT. */
760 emit_insn_after (gen_sub2_insn (c_reg, GEN_INT (PS_STAGE_COUNT (ps) - 1)),
762 update_bb_for_insn (precond_bb);
/* The placeholder note is no longer needed.  */
763 delete_insn (insert_after_insn);
765 /* Update label info for the precondition jump. */
766 JUMP_LABEL (precond_jump) = precond_exit_label_insn;
767 LABEL_NUSES (precond_exit_label_insn)++;
769 /* Update the CFG. */
770 split_block (precond_bb, precond_jump);
771 make_edge (precond_bb, orig_loop_bb, 0);
773 /* Add a jump at end of the epilog to the LOOP_EXIT_LABEL to jump over the
774 original loop copy and update the CFG. */
775 epilog_jump = emit_jump_insn_after (gen_jump (loop_exit_label),
777 delete_insn (last_epilog_insn);
778 JUMP_LABEL (epilog_jump) = loop_exit_label_insn;
779 LABEL_NUSES (loop_exit_label_insn)++;
781 redirect_edge_succ (epilog_exit_edge, loop_exit_bb);
782 epilog_exit_edge->flags &= ~EDGE_FALLTHRU;
783 emit_barrier_after (BB_END (epilog_bb));
787 /* Return the line note insn preceding INSN, for debugging. Taken from
/* Walks backwards from INSN; returns the note found, presumably NULL
   when none exists before the chain start -- confirm in full source.  */
790 find_line_note (rtx insn)
792 for (; insn; insn = PREV_INSN (insn))
794 && NOTE_LINE_NUMBER (insn) >= 0)
800 /* Main entry point, perform SMS scheduling on the loops of the function
801 that consist of single basic blocks. */
803 sms_schedule (FILE *dump_file)
805 static int passes = 0;
808 basic_block bb, pre_header = NULL;
812 partial_schedule_ptr ps;
813 int max_bb_index = last_basic_block;
816 stats_file = dump_file;
818 /* Initialize issue_rate. */
819 if (targetm.sched.issue_rate)
/* Temporarily pretend reload has run: some targets' issue_rate hooks
   behave differently before reload.  */
821 int temp = reload_completed;
823 reload_completed = 1;
824 issue_rate = (*targetm.sched.issue_rate) ();
825 reload_completed = temp;
830 /* Initialize the scheduler. */
831 current_sched_info = &sms_sched_info;
834 /* Init Data Flow analysis, to be used in interloop dep calculation. */
836 df_analyze (df, 0, DF_ALL);
838 /* Allocate memory to hold the DDG array. */
839 g_arr = xcalloc (max_bb_index, sizeof (ddg_ptr));
841 /* Build DDGs for all the relevant loops and hold them in G_ARR
842 indexed by the loop BB index. */
847 edge e, pre_header_edge;
852 /* Check if bb has two successors, one being itself. */
853 if (EDGE_COUNT (bb->succs) != 2)
856 if (EDGE_SUCC (bb, 0)->dest != bb && EDGE_SUCC (bb, 1)->dest != bb)
859 if ((EDGE_SUCC (bb, 0)->flags & EDGE_COMPLEX)
860 || (EDGE_SUCC (bb, 1)->flags & EDGE_COMPLEX))
863 /* Check if bb has two predecessors, one being itself. */
864 if (EDGE_COUNT (bb->preds) != 2)
867 if (EDGE_PRED (bb, 0)->src != bb && EDGE_PRED (bb, 1)->src != bb)
870 if ((EDGE_PRED (bb, 0)->flags & EDGE_COMPLEX)
871 || (EDGE_PRED (bb, 1)->flags & EDGE_COMPLEX))
/* Debug/limit knob: stop considering loops past MAX_SMS_LOOP_NUMBER
   (-1 means no limit).  */
875 if ((passes++ > MAX_SMS_LOOP_NUMBER) && (MAX_SMS_LOOP_NUMBER != -1))
878 fprintf (dump_file, "SMS reached MAX_PASSES... \n");
882 get_block_head_tail (bb->index, &head, &tail);
883 pre_header_edge = EDGE_PRED (bb, 0);
884 if (EDGE_PRED (bb, 0)->src != bb)
885 pre_header_edge = EDGE_PRED (bb, 1);
887 /* Perform SMS only on loops whose average count is above threshold. */
888 if (bb->count < pre_header_edge->count * SMS_LOOP_AVERAGE_COUNT_THRESHOLD)
892 rtx line_note = find_line_note (tail);
896 expanded_location xloc;
897 NOTE_EXPANDED_LOCATION (xloc, line_note);
898 fprintf (stats_file, "SMS bb %s %d (file, line)\n",
899 xloc.file, xloc.line);
901 fprintf (stats_file, "SMS single-bb-loop\n");
902 if (profile_info && flag_branch_probabilities)
904 fprintf (stats_file, "SMS loop-count ");
905 fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
906 (HOST_WIDEST_INT) bb->count);
907 fprintf (stats_file, "\n");
908 fprintf (stats_file, "SMS preheader-count ");
909 fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
910 (HOST_WIDEST_INT) pre_header_edge->count);
911 fprintf (stats_file, "\n");
912 fprintf (stats_file, "SMS profile-sum-max ");
913 fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
914 (HOST_WIDEST_INT) profile_info->sum_max);
915 fprintf (stats_file, "\n");
921 /* Make sure this is a doloop. */
922 if ( !(count_reg = doloop_register_get (tail, &comp)))
925 e = EDGE_PRED (bb, 0);
927 pre_header = EDGE_PRED (bb, 1)->src;
931 /* Don't handle BBs with calls or barriers, or !single_set insns. */
932 for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn))
935 || (INSN_P (insn) && !JUMP_P (insn)
936 && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE))
/* If the scan stopped early, INSN is the offending insn.  */
939 if (insn != NEXT_INSN (tail))
944 fprintf (stats_file, "SMS loop-with-call\n");
945 else if (BARRIER_P (insn))
946 fprintf (stats_file, "SMS loop-with-barrier\n");
948 fprintf (stats_file, "SMS loop-with-not-single-set\n");
949 print_rtl_single (stats_file, insn);
955 if (! (g = create_ddg (bb, df, 0)))
958 fprintf (stats_file, "SMS doloop\n");
962 g_arr[bb->index] = g;
965 /* Release Data Flow analysis data structures. */
968 /* Go over the built DDGs and perform SMS for each one of them. */
969 for (i = 0; i < max_bb_index; i++)
972 rtx count_reg, count_init, comp;
973 edge pre_header_edge;
976 HOST_WIDEST_INT loop_count = 0;
978 if (! (g = g_arr[i]))
982 print_ddg (dump_file, g);
984 get_block_head_tail (g->bb->index, &head, &tail);
986 pre_header_edge = EDGE_PRED (g->bb, 0);
987 if (EDGE_PRED (g->bb, 0)->src != g->bb)
988 pre_header_edge = EDGE_PRED (g->bb, 1);
992 rtx line_note = find_line_note (tail);
996 expanded_location xloc;
997 NOTE_EXPANDED_LOCATION (xloc, line_note);
998 fprintf (stats_file, "SMS bb %s %d (file, line)\n",
999 xloc.file, xloc.line);
1001 fprintf (stats_file, "SMS single-bb-loop\n");
1002 if (profile_info && flag_branch_probabilities)
1004 fprintf (stats_file, "SMS loop-count ");
/* NOTE(review): BB here is left over from the earlier FOR_EACH_BB
   loop; g->bb->count looks intended -- confirm and fix upstream.  */
1005 fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
1006 (HOST_WIDEST_INT) bb->count);
1007 fprintf (stats_file, "\n");
1008 fprintf (stats_file, "SMS preheader-count ");
1009 fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
1010 (HOST_WIDEST_INT) pre_header_edge->count);
1011 fprintf (stats_file, "\n");
1012 fprintf (stats_file, "SMS profile-sum-max ");
1013 fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
1014 (HOST_WIDEST_INT) profile_info->sum_max);
1015 fprintf (stats_file, "\n");
1017 fprintf (stats_file, "SMS doloop\n");
1018 fprintf (stats_file, "SMS built-ddg %d\n", g->num_nodes);
1019 fprintf (stats_file, "SMS num-loads %d\n", g->num_loads);
1020 fprintf (stats_file, "SMS num-stores %d\n", g->num_stores);
1023 /* Make sure this is a doloop. */
1024 if ( !(count_reg = doloop_register_get (tail, &comp)))
1027 /* This should be NULL_RTX if the count is unknown at compile time. */
1028 count_init = const_iteration_count (count_reg, pre_header, &loop_count);
1030 if (stats_file && count_init)
1032 fprintf (stats_file, "SMS const-doloop ");
1033 fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC, loop_count);
1034 fprintf (stats_file, "\n");
1037 node_order = (int *) xmalloc (sizeof (int) * g->num_nodes);
1039 mii = 1; /* Need to pass some estimate of mii. */
1040 rec_mii = sms_order_nodes (g, mii, node_order);
1041 mii = MAX (res_MII (g), rec_mii);
1042 maxii = (calculate_maxii (g) * SMS_MAX_II_FACTOR) / 100;
1045 fprintf (stats_file, "SMS iis %d %d %d (rec_mii, mii, maxii)\n",
1046 rec_mii, mii, maxii);
1048 /* After sms_order_nodes and before sms_schedule_by_order, to copy over
1050 set_node_sched_params (g);
1052 ps = sms_schedule_by_order (g, mii, maxii, node_order, dump_file);
1055 stage_count = PS_STAGE_COUNT (ps);
/* Reject schedules with no overlap (stage_count == 0) or that need more
   stages than the known constant trip count.  */
1057 if (stage_count == 0 || (count_init && (stage_count > loop_count)))
1060 fprintf (dump_file, "SMS failed... \n");
1062 fprintf (stats_file, "SMS sched-failed %d\n", stage_count);
1066 rtx orig_loop_beg = NULL_RTX;
1067 rtx orig_loop_end = NULL_RTX;
1071 fprintf (stats_file,
1072 "SMS succeeded %d %d (with ii, sc)\n", ps->ii,
1074 print_partial_schedule (ps, dump_file);
1076 "SMS Branch (%d) will later be scheduled at cycle %d.\n",
1077 g->closing_branch->cuid, PS_MIN_CYCLE (ps) - 1);
1080 /* Save the original loop if we want to do loop preconditioning in
1081 case the BCT count is not known. */
1087 /* Copy the original loop code before modifying it -
1088 so we can use it later. */
1089 for (i = 0; i < ps->g->num_nodes; i++)
1090 duplicate_insn_chain (ps->g->nodes[i].first_note,
1091 ps->g->nodes[i].insn);
1093 orig_loop_beg = get_insns ();
1094 orig_loop_end = get_last_insn ();
1097 /* Set the stage boundaries. If the DDG is built with closing_branch_deps,
1098 the closing_branch was scheduled and should appear in the last (ii-1)
1099 row. Otherwise, we are free to schedule the branch, and we let nodes
1100 that were scheduled at the first PS_MIN_CYCLE cycle appear in the first
1101 row; this should reduce stage_count to minimum. */
1102 normalize_sched_times (ps);
1103 rotate_partial_schedule (ps, PS_MIN_CYCLE (ps));
1104 set_columns_for_ps (ps);
1106 permute_partial_schedule (ps, g->closing_branch->first_note);
1108 /* Mark this loop as software pipelined so the later
1109 scheduling passes don't touch it. */
1110 if (! flag_resched_modulo_sched)
1111 g->bb->flags |= BB_DISABLE_SCHEDULE;
1113 generate_reg_moves (ps);
1115 print_node_sched_params (dump_file, g->num_nodes);
1117 /* Set new iteration count of loop kernel. */
/* NOTE(review): the continuation of this statement (the adjustment by
   stage count) is elided in this listing.  */
1119 SET_SRC (single_set (count_init)) = GEN_INT (loop_count
1122 /* Generate prolog and epilog. */
1123 generate_prolog_epilog (ps, orig_loop_beg, orig_loop_end,
1124 count_init ? 0 : 1);
1126 free_partial_schedule (ps);
1127 free (node_sched_params);
1132 /* Release scheduler data, needed until now because of DFA. */
1136 /* The SMS scheduling algorithm itself
1137 -----------------------------------
1138 Input: 'O' an ordered list of insns of a loop.
1139 Output: A scheduling of the loop - kernel, prolog, and epilogue.
1141 'Q' is the empty Set
1142 'PS' is the partial schedule; it holds the currently scheduled nodes with
1144 'PSP' previously scheduled predecessors.
1145 'PSS' previously scheduled successors.
1146 't(u)' the cycle where u is scheduled.
1147 'l(u)' is the latency of u.
1148 'd(v,u)' is the dependence distance from v to u.
1149 'ASAP(u)' the earliest time at which u could be scheduled as computed in
1150 the node ordering phase.
1151 'check_hardware_resources_conflicts(u, PS, c)'
1152 run a trace around cycle/slot through DFA model
1153 to check resource conflicts involving instruction u
1154 at cycle c given the partial schedule PS.
1155 'add_to_partial_schedule_at_time(u, PS, c)'
1156 Add the node/instruction u to the partial schedule
1158 'calculate_register_pressure(PS)'
1159 Given a schedule of instructions, calculate the register
1160 pressure it implies. One implementation could be the
1161 maximum number of overlapping live ranges.
1162 'maxRP' The maximum allowed register pressure, it is usually derived from the number
1163 registers available in the hardware.
1167 3. for each node u in O in pre-computed order
1168 4. if (PSP(u) != Q && PSS(u) == Q) then
1169 5. Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all v in PSP(u).
1170 6. start = Early_start; end = Early_start + II - 1; step = 1
1171 11. else if (PSP(u) == Q && PSS(u) != Q) then
1172 12. Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all v in PSS(u).
1173 13. start = Late_start; end = Late_start - II + 1; step = -1
1174 14. else if (PSP(u) != Q && PSS(u) != Q) then
1175 15. Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all v in PSP(u).
1176 16. Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all v in PSS(u).
1177 17. start = Early_start;
1178 18. end = min(Early_start + II - 1 , Late_start);
1180 20. else "if (PSP(u) == Q && PSS(u) == Q)"
1181 21. start = ASAP(u); end = start + II - 1; step = 1
1185 24. for (c = start ; c != end ; c += step)
1186 25. if check_hardware_resources_conflicts(u, PS, c) then
1187 26. add_to_partial_schedule_at_time(u, PS, c)
1192 31. if (success == false) then
1194 33. if (II > maxII) then
1195 34. finish - failed to schedule
1200 39. if (calculate_register_pressure(PS) > maxRP) then
1203 42. compute epilogue & prologue
1204 43. finish - succeeded to schedule
1207 /* A limit on the number of cycles that resource conflicts can span. ??? Should
1208 be provided by DFA, and be dependent on the type of insn scheduled. Currently
1209 set to 0 to save compile time. */
/* NOTE(review): SMS_DFA_HISTORY is defined outside this listing (presumably a
   target macro or --param value) — confirm its definition before changing.  */
1210 #define DFA_HISTORY SMS_DFA_HISTORY
/* List-schedule the nodes of the DDG G according to the precomputed order
   NODES_ORDER, trying initiation intervals from MII up to MAXII.  Whenever
   some node cannot be placed within the current II the partial schedule is
   reset and the whole process restarts with a larger II.  Returns the
   resulting partial schedule.  NOTE(review): the embedded original line
   numbers in this listing are not contiguous — several declarations, braces
   and statements of this function are not visible here.  */
1212 static partial_schedule_ptr
1213 sms_schedule_by_order (ddg_ptr g, int mii, int maxii, int *nodes_order, FILE *dump_file)
1217 int try_again_with_larger_ii = true;
1218 int num_nodes = g->num_nodes;
1220 int start, end, step; /* Place together into one struct? */
1221 sbitmap sched_nodes = sbitmap_alloc (num_nodes);
1222 sbitmap psp = sbitmap_alloc (num_nodes);
1223 sbitmap pss = sbitmap_alloc (num_nodes);
1224 sbitmap must_precede = sbitmap_alloc (num_nodes);
1225 sbitmap must_follow = sbitmap_alloc (num_nodes);
1227 partial_schedule_ptr ps = create_partial_schedule (ii, g, DFA_HISTORY);
1229 while (try_again_with_larger_ii && ii < maxii)
1232 fprintf(dump_file, "Starting with ii=%d\n", ii);
1233 try_again_with_larger_ii = false;
1234 sbitmap_zero (sched_nodes);
1236 for (i = 0; i < num_nodes; i++)
1238 int u = nodes_order[i];
1239 ddg_node_ptr u_node = &g->nodes[u];
1240 sbitmap u_node_preds = NODE_PREDECESSORS (u_node);
1241 sbitmap u_node_succs = NODE_SUCCESSORS (u_node);
1244 rtx insn = u_node->insn;
1249 if (JUMP_P (insn)) /* Closing branch handled later. */
1252 /* 1. compute sched window for u (start, end, step). */
/* PSP/PSS: previously scheduled predecessors/successors of u (see the
   algorithm description above).  The window is derived from them.  */
1255 psp_not_empty = sbitmap_a_and_b_cg (psp, u_node_preds, sched_nodes);
1256 pss_not_empty = sbitmap_a_and_b_cg (pss, u_node_succs, sched_nodes);
/* Case 1: only predecessors scheduled — schedule u as early as possible,
   scanning upwards from Early_start.  */
1258 if (psp_not_empty && !pss_not_empty)
1260 int early_start = 0;
1263 for (e = u_node->in; e != 0; e = e->next_in)
1265 ddg_node_ptr v_node = e->src;
1266 if (TEST_BIT (sched_nodes, v_node->cuid)
1268 int node_st = SCHED_TIME (v_node)
1269 + e->latency - (e->distance * ii);
1271 early_start = MAX (early_start, node_st);
/* Memory dependences additionally clamp the window to one II.  */
1273 if (e->data_type == MEM_DEP)
1274 end = MIN (end, SCHED_TIME (v_node) + ii - 1);
1277 start = early_start;
1278 end = MIN (end, early_start + ii);
/* Case 2: only successors scheduled — scan downwards from Late_start.  */
1282 else if (!psp_not_empty && pss_not_empty)
1284 int late_start = INT_MAX;
1287 for (e = u_node->out; e != 0; e = e->next_out)
1289 ddg_node_ptr v_node = e->dest;
1290 if (TEST_BIT (sched_nodes, v_node->cuid))
1292 late_start = MIN (late_start,
1293 SCHED_TIME (v_node) - e->latency
1294 + (e->distance * ii));
1295 if (e->data_type == MEM_DEP)
1296 end = MAX (end, SCHED_TIME (v_node) - ii + 1);
1300 end = MAX (end, late_start - ii);
/* Case 3: both predecessors and successors scheduled — window is bounded
   on both sides.  */
1304 else if (psp_not_empty && pss_not_empty)
1306 int early_start = 0;
1307 int late_start = INT_MAX;
1311 for (e = u_node->in; e != 0; e = e->next_in)
1313 ddg_node_ptr v_node = e->src;
1315 if (TEST_BIT (sched_nodes, v_node->cuid))
1317 early_start = MAX (early_start,
1318 SCHED_TIME (v_node) + e->latency
1319 - (e->distance * ii));
1320 if (e->data_type == MEM_DEP)
1321 end = MIN (end, SCHED_TIME (v_node) + ii - 1);
1324 for (e = u_node->out; e != 0; e = e->next_out)
1326 ddg_node_ptr v_node = e->dest;
1328 if (TEST_BIT (sched_nodes, v_node->cuid))
1330 late_start = MIN (late_start,
1331 SCHED_TIME (v_node) - e->latency
1332 + (e->distance * ii));
1333 if (e->data_type == MEM_DEP)
1334 start = MAX (start, SCHED_TIME (v_node) - ii + 1);
1337 start = MAX (start, early_start);
1338 end = MIN (end, MIN (early_start + ii, late_start + 1));
/* Case 4: no scheduled neighbors — fall back to the ASAP time.  */
1341 else /* psp is empty && pss is empty. */
1343 start = SCHED_ASAP (u_node);
1348 /* 2. Try scheduling u in window. */
1350 fprintf(dump_file, "Trying to schedule node %d in (%d .. %d) step %d\n",
1351 u, start, end, step);
1353 /* use must_follow & must_precede bitmaps to determine order
1354 of nodes within the cycle. */
/* A neighbor already placed at the window boundary with zero slack
   (latency == ii * distance) constrains u's position within that row.  */
1355 sbitmap_zero (must_precede);
1356 sbitmap_zero (must_follow);
1357 for (e = u_node->in; e != 0; e = e->next_in)
1358 if (TEST_BIT (sched_nodes, e->src->cuid)
1359 && e->latency == (ii * e->distance)
1360 && start == SCHED_TIME (e->src))
1361 SET_BIT (must_precede, e->src->cuid);
1363 for (e = u_node->out; e != 0; e = e->next_out)
1364 if (TEST_BIT (sched_nodes, e->dest->cuid)
1365 && e->latency == (ii * e->distance)
1366 && end == SCHED_TIME (e->dest))
1367 SET_BIT (must_follow, e->dest->cuid);
/* Scan the candidate cycles of the window; step is +1 or -1.  */
1370 if ((step > 0 && start < end) || (step < 0 && start > end))
1371 for (c = start; c != end; c += step)
1375 psi = ps_add_node_check_conflicts (ps, u_node, c,
1381 SCHED_TIME (u_node) = c;
1382 SET_BIT (sched_nodes, u);
1385 fprintf(dump_file, "Schedule in %d\n", c);
1391 /* ??? Try backtracking instead of immediately ii++? */
/* Scheduling u failed: enlarge II and restart from scratch.  */
1393 try_again_with_larger_ii = true;
1394 reset_partial_schedule (ps, ii);
1397 /* ??? If (success), check register pressure estimates. */
1398 } /* Continue with next node. */
1399 } /* While try_again_with_larger_ii. */
1401 sbitmap_free (sched_nodes);
1407 free_partial_schedule (ps);
1414 /* This page implements the algorithm for ordering the nodes of a DDG
1415 for modulo scheduling, activated through the
1416 "int sms_order_nodes (ddg_ptr, int mii, int * result)" API. */
/* Accessors for the per-node ordering parameters that
   calculate_order_params stashes in each DDG node's aux.info pointer.  */
1418 #define ORDER_PARAMS(x) ((struct node_order_params *) (x)->aux.info)
1419 #define ASAP(x) (ORDER_PARAMS ((x))->asap)
1420 #define ALAP(x) (ORDER_PARAMS ((x))->alap)
1421 #define HEIGHT(x) (ORDER_PARAMS ((x))->height)
/* Mobility: the slack between the latest and earliest start times.  */
1422 #define MOB(x) (ALAP ((x)) - ASAP ((x)))
/* Depth is identified with the ASAP time here.  */
1423 #define DEPTH(x) (ASAP ((x)))
/* Shorthand for a pointer into the node-order-params array.  */
1425 typedef struct node_order_params * nopa;
1427 static void order_nodes_of_sccs (ddg_all_sccs_ptr, int * result);
1428 static int order_nodes_in_scc (ddg_ptr, sbitmap, sbitmap, int*, int);
1429 static nopa calculate_order_params (ddg_ptr, int mii);
1430 static int find_max_asap (ddg_ptr, sbitmap);
1431 static int find_max_hv_min_mob (ddg_ptr, sbitmap);
1432 static int find_max_dv_min_mob (ddg_ptr, sbitmap);
/* Direction in which order_nodes_in_scc extends the current workset.  */
1434 enum sms_direction {BOTTOMUP, TOPDOWN};
/* Per-node ordering parameters (asap/alap/height); the field declarations
   are in lines not visible in this listing.  */
1436 struct node_order_params
1443 /* Check if NODE_ORDER contains a permutation of 0 .. NUM_NODES-1. */
1445 check_nodes_order (int *node_order, int num_nodes)
1448 sbitmap tmp = sbitmap_alloc (num_nodes);
/* Each entry must be in range and not seen before; TMP records the
   entries already encountered.  NOTE(review): the failure handling and
   the SET_BIT/free statements are in lines missing from this listing.  */
1452 for (i = 0; i < num_nodes; i++)
1454 int u = node_order[i];
1456 if (u >= num_nodes || u < 0 || TEST_BIT (tmp, u))
1465 /* Order the nodes of G for scheduling and pass the result in
1466 NODE_ORDER. Also set aux.count of each node to ASAP.
1467 Return the recMII for the given DDG. */
1469 sms_order_nodes (ddg_ptr g, int mii, int * node_order)
/* Decompose the DDG into strongly connected components, compute the
   ASAP/ALAP/HEIGHT parameters, then order SCC by SCC.  */
1473 ddg_all_sccs_ptr sccs = create_ddg_all_sccs (g);
1475 nopa nops = calculate_order_params (g, mii);
1477 order_nodes_of_sccs (sccs, node_order);
1479 if (sccs->num_sccs > 0)
1480 /* First SCC has the largest recurrence_length. */
1481 rec_mii = sccs->sccs[0]->recurrence_length;
1483 /* Save ASAP before destroying node_order_params. */
1484 for (i = 0; i < g->num_nodes; i++)
1486 ddg_node_ptr v = &g->nodes[i];
1487 v->aux.count = ASAP (v);
1491 free_ddg_all_sccs (sccs);
/* Sanity check: the produced order must be a permutation of the nodes.  */
1492 check_nodes_order (node_order, g->num_nodes);
/* Order all nodes of the DDG, one extended SCC at a time (highest recMII
   first), writing the result into NODE_ORDER.  */
1498 order_nodes_of_sccs (ddg_all_sccs_ptr all_sccs, int * node_order)
1501 ddg_ptr g = all_sccs->ddg;
1502 int num_nodes = g->num_nodes;
1503 sbitmap prev_sccs = sbitmap_alloc (num_nodes);
1504 sbitmap on_path = sbitmap_alloc (num_nodes);
1505 sbitmap tmp = sbitmap_alloc (num_nodes);
1506 sbitmap ones = sbitmap_alloc (num_nodes);
1508 sbitmap_zero (prev_sccs);
1509 sbitmap_ones (ones);
1511 /* Perform the node ordering starting from the SCC with the highest recMII.
1512 For each SCC order the nodes according to their ASAP/ALAP/HEIGHT etc. */
1513 for (i = 0; i < all_sccs->num_sccs; i++)
1515 ddg_scc_ptr scc = all_sccs->sccs[i];
1517 /* Add nodes on paths from previous SCCs to the current SCC. */
1518 find_nodes_on_paths (on_path, g, prev_sccs, scc->nodes);
1519 sbitmap_a_or_b (tmp, scc->nodes, on_path);
1521 /* Add nodes on paths from the current SCC to previous SCCs. */
1522 find_nodes_on_paths (on_path, g, scc->nodes, prev_sccs);
1523 sbitmap_a_or_b (tmp, tmp, on_path);
1525 /* Remove nodes of previous SCCs from current extended SCC. */
1526 sbitmap_difference (tmp, tmp, prev_sccs);
1528 pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
1529 /* Above call to order_nodes_in_scc updated prev_sccs |= tmp. */
1532 /* Handle the remaining nodes that do not belong to any scc. Each call
1533 to order_nodes_in_scc handles a single connected component. */
1534 while (pos < g->num_nodes)
1536 sbitmap_difference (tmp, ones, prev_sccs);
1537 pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
/* NOTE(review): the sbitmap_free for TMP appears to be in a line missing
   from this listing — verify against the full source.  */
1539 sbitmap_free (prev_sccs);
1540 sbitmap_free (on_path);
1542 sbitmap_free (ones);
1545 /* MII is needed if we consider backarcs (that do not close recursive cycles). */
/* Compute ASAP, ALAP and HEIGHT for every node of G (treating the DDG as
   a DAG by ignoring cross-iteration arcs, i.e. those with distance > 0)
   and return the freshly allocated parameter array.  */
1546 static struct node_order_params *
1547 calculate_order_params (ddg_ptr g, int mii ATTRIBUTE_UNUSED)
1551 int num_nodes = g->num_nodes;
1553 /* Allocate a place to hold ordering params for each node in the DDG. */
1554 nopa node_order_params_arr;
1556 /* Initialize ASAP/ALAP/HEIGHT to zero (xcalloc zeroes the array). */
1557 node_order_params_arr = (nopa) xcalloc (num_nodes,
1558 sizeof (struct node_order_params));
1560 /* Set the aux pointer of each node to point to its order_params structure. */
1561 for (u = 0; u < num_nodes; u++)
1562 g->nodes[u].aux.info = &node_order_params_arr[u];
1564 /* Disregarding a backarc from each recursive cycle to obtain a DAG,
1565 calculate ASAP, ALAP, mobility, distance, and height for each node
1566 in the dependence (direct acyclic) graph. */
1568 /* We assume that the nodes in the array are in topological order. */
/* Forward pass: ASAP from same-iteration (distance == 0) predecessors.  */
1571 for (u = 0; u < num_nodes; u++)
1573 ddg_node_ptr u_node = &g->nodes[u];
1576 for (e = u_node->in; e; e = e->next_in)
1577 if (e->distance == 0)
1578 ASAP (u_node) = MAX (ASAP (u_node),
1579 ASAP (e->src) + e->latency);
1580 max_asap = MAX (max_asap, ASAP (u_node));
/* Backward pass: ALAP and HEIGHT from same-iteration successors.  */
1583 for (u = num_nodes - 1; u > -1; u--)
1585 ddg_node_ptr u_node = &g->nodes[u];
1587 ALAP (u_node) = max_asap;
1588 HEIGHT (u_node) = 0;
1589 for (e = u_node->out; e; e = e->next_out)
1590 if (e->distance == 0)
1592 ALAP (u_node) = MIN (ALAP (u_node),
1593 ALAP (e->dest) - e->latency);
1594 HEIGHT (u_node) = MAX (HEIGHT (u_node),
1595 HEIGHT (e->dest) + e->latency);
1599 return node_order_params_arr;
/* Return the node in NODES with the maximal ASAP value (the return
   statement is in a line missing from this listing).  */
1603 find_max_asap (ddg_ptr g, sbitmap nodes)
1609 EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u,
1611 ddg_node_ptr u_node = &g->nodes[u];
1613 if (max_asap < ASAP (u_node))
1615 max_asap = ASAP (u_node);
/* Return the node in NODES with maximal HEIGHT, breaking ties by
   minimal mobility (MOB).  Used when ordering BOTTOMUP.  */
1623 find_max_hv_min_mob (ddg_ptr g, sbitmap nodes)
1627 int min_mob = INT_MAX;
1630 EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u,
1632 ddg_node_ptr u_node = &g->nodes[u];
1634 if (max_hv < HEIGHT (u_node))
1636 max_hv = HEIGHT (u_node);
1637 min_mob = MOB (u_node);
/* Equal height: prefer the less mobile (more constrained) node.  */
1640 else if ((max_hv == HEIGHT (u_node))
1641 && (min_mob > MOB (u_node)))
1643 min_mob = MOB (u_node);
/* Return the node in NODES with maximal DEPTH, breaking ties by
   minimal mobility (MOB).  Used when ordering TOPDOWN.  */
1651 find_max_dv_min_mob (ddg_ptr g, sbitmap nodes)
1655 int min_mob = INT_MAX;
1658 EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u,
1660 ddg_node_ptr u_node = &g->nodes[u];
1662 if (max_dv < DEPTH (u_node))
1664 max_dv = DEPTH (u_node);
1665 min_mob = MOB (u_node);
/* Equal depth: prefer the less mobile (more constrained) node.  */
1668 else if ((max_dv == DEPTH (u_node))
1669 && (min_mob > MOB (u_node)))
1671 min_mob = MOB (u_node);
1678 /* Places the nodes of SCC into the NODE_ORDER array starting
1679 at position POS, according to the SMS ordering algorithm.
1680 NODES_ORDERED (in&out parameter) holds the bitset of all nodes in
1681 the NODE_ORDER array, starting from position zero. */
1683 order_nodes_in_scc (ddg_ptr g, sbitmap nodes_ordered, sbitmap scc,
1684 int * node_order, int pos)
1686 enum sms_direction dir;
1687 int num_nodes = g->num_nodes;
1688 sbitmap workset = sbitmap_alloc (num_nodes);
1689 sbitmap tmp = sbitmap_alloc (num_nodes);
1690 sbitmap zero_bitmap = sbitmap_alloc (num_nodes);
1691 sbitmap predecessors = sbitmap_alloc (num_nodes);
1692 sbitmap successors = sbitmap_alloc (num_nodes);
1694 sbitmap_zero (predecessors);
1695 find_predecessors (predecessors, g, nodes_ordered);
1697 sbitmap_zero (successors);
1698 find_successors (successors, g, nodes_ordered);
/* Choose the initial workset and direction: start from already-ordered
   neighbors if any exist.  NOTE(review): the assignments to DIR are in
   lines missing from this listing — presumably BOTTOMUP/TOPDOWN per
   branch; confirm against the full source.  */
1701 if (sbitmap_a_and_b_cg (tmp, predecessors, scc))
1703 sbitmap_copy (workset, tmp);
1706 else if (sbitmap_a_and_b_cg (tmp, successors, scc))
1708 sbitmap_copy (workset, tmp);
/* Neither: seed the workset with the max-ASAP node of the SCC.  */
1715 sbitmap_zero (workset);
1716 if ((u = find_max_asap (g, scc)) >= 0)
1717 SET_BIT (workset, u);
1721 sbitmap_zero (zero_bitmap);
/* Alternate sweep direction until the workset is exhausted.  */
1722 while (!sbitmap_equal (workset, zero_bitmap))
1725 ddg_node_ptr v_node;
1726 sbitmap v_node_preds;
1727 sbitmap v_node_succs;
/* BOTTOMUP sweep: repeatedly pick max-height/min-mobility, then grow the
   workset with its in-SCC successors.  */
1731 while (!sbitmap_equal (workset, zero_bitmap))
1733 v = find_max_hv_min_mob (g, workset);
1734 v_node = &g->nodes[v];
1735 node_order[pos++] = v;
1736 v_node_succs = NODE_SUCCESSORS (v_node);
1737 sbitmap_a_and_b (tmp, v_node_succs, scc);
1739 /* Don't consider the already ordered successors again. */
1740 sbitmap_difference (tmp, tmp, nodes_ordered);
1741 sbitmap_a_or_b (workset, workset, tmp);
1742 RESET_BIT (workset, v);
1743 SET_BIT (nodes_ordered, v);
/* Switch direction: restart from the predecessors of what is ordered.  */
1746 sbitmap_zero (predecessors);
1747 find_predecessors (predecessors, g, nodes_ordered);
1748 sbitmap_a_and_b (workset, predecessors, scc);
/* TOPDOWN sweep: repeatedly pick max-depth/min-mobility, then grow the
   workset with its in-SCC predecessors.  */
1752 while (!sbitmap_equal (workset, zero_bitmap))
1754 v = find_max_dv_min_mob (g, workset);
1755 v_node = &g->nodes[v];
1756 node_order[pos++] = v;
1757 v_node_preds = NODE_PREDECESSORS (v_node);
1758 sbitmap_a_and_b (tmp, v_node_preds, scc);
1760 /* Don't consider the already ordered predecessors again. */
1761 sbitmap_difference (tmp, tmp, nodes_ordered);
1762 sbitmap_a_or_b (workset, workset, tmp);
1763 RESET_BIT (workset, v);
1764 SET_BIT (nodes_ordered, v);
1767 sbitmap_zero (successors);
1768 find_successors (successors, g, nodes_ordered);
1769 sbitmap_a_and_b (workset, successors, scc);
1773 sbitmap_free (workset);
1774 sbitmap_free (zero_bitmap);
1775 sbitmap_free (predecessors);
1776 sbitmap_free (successors);
1781 /* This page contains functions for manipulating partial-schedules during
1782 modulo scheduling. */
1784 /* Create a partial schedule and allocate a memory to hold II rows. */
1785 static partial_schedule_ptr
1786 create_partial_schedule (int ii, ddg_ptr g, int history)
1788 partial_schedule_ptr ps = (partial_schedule_ptr)
1789 xmalloc (sizeof (struct partial_schedule));
/* One (initially empty) linked row of ps_insns per cycle of the II.  */
1790 ps->rows = (ps_insn_ptr *) xcalloc (ii, sizeof (ps_insn_ptr));
1792 ps->history = history;
/* min/max_cycle start inverted so the first scheduled insn sets both.  */
1793 ps->min_cycle = INT_MAX;
1794 ps->max_cycle = INT_MIN;
1800 /* Free the PS_INSNs in rows array of the given partial schedule.
1801 ??? Consider caching the PS_INSN's. */
1803 free_ps_insns (partial_schedule_ptr ps)
/* Walk each row's linked list, freeing node by node; the inner loop and
   the free() call are in lines missing from this listing.  */
1807 for (i = 0; i < ps->ii; i++)
1811 ps_insn_ptr ps_insn = ps->rows[i]->next_in_row;
1814 ps->rows[i] = ps_insn;
1820 /* Free all the memory allocated to the partial schedule. */
/* NOTE(review): the body (freeing the ps_insns, the rows array and PS
   itself, presumably) is in lines missing from this listing.  */
1822 free_partial_schedule (partial_schedule_ptr ps)
1831 /* Clear the rows array with its PS_INSNs, and create a new one with
1834 reset_partial_schedule (partial_schedule_ptr ps, int new_ii)
/* Nothing to resize when the II is unchanged (the early-out statement is
   in a line missing from this listing).  */
1839 if (new_ii == ps->ii)
1841 ps->rows = (ps_insn_ptr *) xrealloc (ps->rows, new_ii
1842 * sizeof (ps_insn_ptr));
/* xrealloc does not zero new storage; clear all rows explicitly.  */
1843 memset (ps->rows, 0, new_ii * sizeof (ps_insn_ptr));
1845 ps->min_cycle = INT_MAX;
1846 ps->max_cycle = INT_MIN;
1849 /* Prints the partial schedule as an ii rows array, for each rows
1850 print the ids of the insns in it. */
1852 print_partial_schedule (partial_schedule_ptr ps, FILE *dump)
1856 for (i = 0; i < ps->ii; i++)
1858 ps_insn_ptr ps_i = ps->rows[i];
1860 fprintf (dump, "\n[CYCLE %d ]: ", i);
/* Dump the UID of every insn placed in this row.  */
1863 fprintf (dump, "%d, ",
1864 INSN_UID (ps_i->node->insn));
1865 ps_i = ps_i->next_in_row;
1870 /* Creates an object of PS_INSN and initializes it to the given parameters. */
1872 create_ps_insn (ddg_node_ptr node, int rest_count, int cycle)
1874 ps_insn_ptr ps_i = xmalloc (sizeof (struct ps_insn));
/* The new insn starts unlinked; callers splice it into a row list.  */
1877 ps_i->next_in_row = NULL;
1878 ps_i->prev_in_row = NULL;
1879 ps_i->row_rest_count = rest_count;
1880 ps_i->cycle = cycle;
1886 /* Removes the given PS_INSN from the partial schedule. Returns false if the
1887 node is not found in the partial schedule, else returns true. */
1889 remove_node_from_ps (partial_schedule_ptr ps, ps_insn_ptr ps_i)
/* The row is the insn's cycle modulo the II.  */
1896 row = SMODULO (ps_i->cycle, ps->ii);
/* Head-of-row case: unlink by updating the row head pointer.  */
1897 if (! ps_i->prev_in_row)
1899 if (ps_i != ps->rows[row])
1902 ps->rows[row] = ps_i->next_in_row;
1904 ps->rows[row]->prev_in_row = NULL;
/* Interior case: unlink from the doubly linked row list.  */
1908 ps_i->prev_in_row->next_in_row = ps_i->next_in_row;
1909 if (ps_i->next_in_row)
1910 ps_i->next_in_row->prev_in_row = ps_i->prev_in_row;
1916 /* Unlike what literature describes for modulo scheduling (which focuses
1917 on VLIW machines) the order of the instructions inside a cycle is
1918 important. Given the bitmaps MUST_FOLLOW and MUST_PRECEDE we know
1919 where the current instruction should go relative to the already
1920 scheduled instructions in the given cycle. Go over these
1921 instructions and find the first possible column to put it in. */
1923 ps_insn_find_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
1924 sbitmap must_precede, sbitmap must_follow)
1926 ps_insn_ptr next_ps_i;
1927 ps_insn_ptr first_must_follow = NULL;
1928 ps_insn_ptr last_must_precede = NULL;
1934 row = SMODULO (ps_i->cycle, ps->ii);
1936 /* Find the first must follow and the last must precede
1937 and insert the node immediately after the must precede
1938 but make sure that there is no must follow after it. */
1939 for (next_ps_i = ps->rows[row];
1941 next_ps_i = next_ps_i->next_in_row)
1943 if (TEST_BIT (must_follow, next_ps_i->node->cuid)
1944 && ! first_must_follow)
1945 first_must_follow = next_ps_i;
1946 if (TEST_BIT (must_precede, next_ps_i->node->cuid))
1948 /* If we have already met a node that must follow, then
1949 there is no possible column. */
1950 if (first_must_follow)
1953 last_must_precede = next_ps_i;
1957 /* Now insert the node after INSERT_AFTER_PSI. */
/* No must-precede constraint: insert at the head of the row.  */
1959 if (! last_must_precede)
1961 ps_i->next_in_row = ps->rows[row];
1962 ps_i->prev_in_row = NULL;
1963 if (ps_i->next_in_row)
1964 ps_i->next_in_row->prev_in_row = ps_i;
1965 ps->rows[row] = ps_i;
/* Otherwise splice right after the last must-precede insn.  */
1969 ps_i->next_in_row = last_must_precede->next_in_row;
1970 last_must_precede->next_in_row = ps_i;
1971 ps_i->prev_in_row = last_must_precede;
1972 if (ps_i->next_in_row)
1973 ps_i->next_in_row->prev_in_row = ps_i;
1979 /* Advances the PS_INSN one column in its current row; returns false
1980 in failure and true in success. Bit N is set in MUST_FOLLOW if
1981 the node with cuid N must come after the node pointed to by
1982 PS_I when scheduled in the same cycle. */
1984 ps_insn_advance_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
1985 sbitmap must_follow)
1987 ps_insn_ptr prev, next;
1989 ddg_node_ptr next_node;
1994 row = SMODULO (ps_i->cycle, ps->ii);
/* Cannot advance past the end of the row.  */
1996 if (! ps_i->next_in_row)
1999 next_node = ps_i->next_in_row->node;
2001 /* Check if next_in_row is dependent on ps_i, both having same sched
2002 times (typically ANTI_DEP). If so, ps_i cannot skip over it. */
2003 if (TEST_BIT (must_follow, next_node->cuid))
2006 /* Advance PS_I over its next_in_row in the doubly linked list. */
2007 prev = ps_i->prev_in_row;
2008 next = ps_i->next_in_row;
/* If PS_I headed the row, NEXT becomes the new head.  */
2010 if (ps_i == ps->rows[row])
2011 ps->rows[row] = next;
/* Swap PS_I and NEXT by relinking the four surrounding pointers.  */
2013 ps_i->next_in_row = next->next_in_row;
2015 if (next->next_in_row)
2016 next->next_in_row->prev_in_row = ps_i;
2018 next->next_in_row = ps_i;
2019 ps_i->prev_in_row = next;
2021 next->prev_in_row = prev;
2023 prev->next_in_row = next;
2028 /* Inserts a DDG_NODE to the given partial schedule at the given cycle.
2029 Returns 0 if this is not possible and a PS_INSN otherwise. Bit N is
2030 set in MUST_PRECEDE/MUST_FOLLOW if the node with cuid N must come
2031 before/after (respectively) the node pointed to by PS_I when scheduled
2032 in the same cycle. */
2034 add_node_to_ps (partial_schedule_ptr ps, ddg_node_ptr node, int cycle,
2035 sbitmap must_precede, sbitmap must_follow)
2039 int row = SMODULO (cycle, ps->ii);
/* Refuse to overfill a row beyond the machine's issue rate.  */
2042 && ps->rows[row]->row_rest_count >= issue_rate)
2046 rest_count += ps->rows[row]->row_rest_count;
2048 ps_i = create_ps_insn (node, rest_count, cycle);
2050 /* Finds and inserts PS_I according to MUST_FOLLOW and
/* On failure the fresh PS_I must be discarded (the free/return statements
   are in lines missing from this listing).  */
2052 if (! ps_insn_find_column (ps, ps_i, must_precede, must_follow))
2061 /* Advance time one cycle. Assumes DFA is being used. */
2063 advance_one_cycle (void)
/* Issue the target's optional pre-cycle insn, cross the cycle boundary
   (state_transition with NULL advances the DFA one cycle), then issue the
   optional post-cycle insn.  */
2065 if (targetm.sched.dfa_pre_cycle_insn)
2066 state_transition (curr_state,
2067 (*targetm.sched.dfa_pre_cycle_insn) ());
2069 state_transition (curr_state, NULL);
2071 if (targetm.sched.dfa_post_cycle_insn)
2072 state_transition (curr_state,
2073 (*targetm.sched.dfa_post_cycle_insn) ());
2076 /* Checks if PS has resource conflicts according to DFA, starting from
2077 FROM cycle to TO cycle; returns true if there are conflicts and false
2078 if there are no conflicts. Assumes DFA is being used. */
2080 ps_has_conflicts (partial_schedule_ptr ps, int from, int to)
2084 state_reset (curr_state);
2086 for (cycle = from; cycle <= to; cycle++)
2088 ps_insn_ptr crr_insn;
2089 /* Holds the remaining issue slots in the current row. */
2090 int can_issue_more = issue_rate;
2092 /* Walk through the DFA for the current row. */
2093 for (crr_insn = ps->rows[SMODULO (cycle, ps->ii)];
2095 crr_insn = crr_insn->next_in_row)
2097 rtx insn = crr_insn->node->insn;
2102 /* Check if there is room for the current insn. */
2103 if (!can_issue_more || state_dead_lock_p (curr_state))
2106 /* Update the DFA state and return with failure if the DFA found
2107 resource conflicts. */
2108 if (state_transition (curr_state, insn) >= 0)
/* Let the target adjust the remaining issue slots if it provides a
   variable_issue hook.  */
2111 if (targetm.sched.variable_issue)
2113 (*targetm.sched.variable_issue) (sched_dump, sched_verbose,
2114 insn, can_issue_more);
2115 /* A naked CLOBBER or USE generates no instruction, so don't
2116 let them consume issue slots. */
2117 else if (GET_CODE (PATTERN (insn)) != USE
2118 && GET_CODE (PATTERN (insn)) != CLOBBER)
2122 /* Advance the DFA to the next cycle. */
2123 advance_one_cycle ();
2128 /* Checks if the given node causes resource conflicts when added to PS at
2129 cycle C. If not the node is added to PS and returned; otherwise zero
2130 is returned. Bit N is set in MUST_PRECEDE/MUST_FOLLOW if the node with
2131 cuid N must come before/after (respectively) the node pointed to by
2132 PS_I when scheduled in the same cycle. */
2134 ps_add_node_check_conflicts (partial_schedule_ptr ps, ddg_node_ptr n,
2135 int c, sbitmap must_precede,
2136 sbitmap must_follow)
2138 int has_conflicts = 0;
2141 /* First add the node to the PS, if this succeeds check for
2142 conflicts, trying different issue slots in the same row. */
2143 if (! (ps_i = add_node_to_ps (ps, n, c, must_precede, must_follow)))
2144 return NULL; /* Failed to insert the node at the given cycle. */
/* Check the insn's own cycle, then (when DFA_HISTORY > 0) a window of
   surrounding cycles — the window bounds are in lines missing from this
   listing.  */
2146 has_conflicts = ps_has_conflicts (ps, c, c)
2148 && ps_has_conflicts (ps,
2152 /* Try different issue slots to find one that the given node can be
2153 scheduled in without conflicts. */
2154 while (has_conflicts)
/* No more columns to try: undo the insertion and report failure.  */
2156 if (! ps_insn_advance_column (ps, ps_i, must_follow))
2158 has_conflicts = ps_has_conflicts (ps, c, c)
2160 && ps_has_conflicts (ps,
2167 remove_node_from_ps (ps, ps_i);
/* Success: widen the schedule's cycle span to include C.  */
2171 ps->min_cycle = MIN (ps->min_cycle, c);
2172 ps->max_cycle = MAX (ps->max_cycle, c);
2176 /* Rotate the rows of PS such that insns scheduled at time
2177 START_CYCLE will appear in row 0. Updates max/min_cycles. */
2179 rotate_partial_schedule (partial_schedule_ptr ps, int start_cycle)
2181 int i, row, backward_rotates;
2182 int last_row = ps->ii - 1;
/* Nothing to do when already normalized.  */
2184 if (start_cycle == 0)
/* SMODULO handles negative start cycles correctly.  */
2187 backward_rotates = SMODULO (start_cycle, ps->ii);
2189 /* Revisit later and optimize this into a single loop. */
2190 for (i = 0; i < backward_rotates; i++)
/* One rotation: shift every row up and wrap row 0 to the last row.  */
2192 ps_insn_ptr first_row = ps->rows[0];
2194 for (row = 0; row < last_row; row++)
2195 ps->rows[row] = ps->rows[row+1];
2197 ps->rows[last_row] = first_row;
/* Keep the absolute cycle bounds consistent with the rotated rows.  */
2200 ps->max_cycle -= start_cycle;
2201 ps->min_cycle -= start_cycle;
2204 #endif /* INSN_SCHEDULING*/