1 /* Swing Modulo Scheduling implementation.
3 Free Software Foundation, Inc.
4 Contributed by Ayal Zaks and Mustafa Hagog <zaks,mustafa@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 2, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING. If not, write to the Free
20 Software Foundation, 59 Temple Place - Suite 330, Boston, MA
26 #include "coretypes.h"
31 #include "hard-reg-set.h"
32 #include "basic-block.h"
36 #include "insn-config.h"
37 #include "insn-attr.h"
41 #include "sched-int.h"
43 #include "cfglayout.h"
53 /* This file contains the implementation of the Swing Modulo Scheduler,
54 described in the following references:
55 [1] J. Llosa, A. Gonzalez, E. Ayguade, M. Valero., and J. Eckhardt.
56 Lifetime--sensitive modulo scheduling in a production environment.
57 IEEE Trans. on Comps., 50(3), March 2001
58 [2] J. Llosa, A. Gonzalez, E. Ayguade, and M. Valero.
59 Swing Modulo Scheduling: A Lifetime Sensitive Approach.
60 PACT '96 , pages 80-87, October 1996 (Boston - Massachusetts - USA).
62 The basic structure is:
63 1. Build a data-dependence graph (DDG) for each loop.
64 2. Use the DDG to order the insns of a loop (not in topological order
65 necessarily, but rather) trying to place each insn after all its
66 predecessors _or_ after all its successors.
67 3. Compute MII: a lower bound on the number of cycles to schedule the loop.
68 4. Use the ordering to perform list-scheduling of the loop:
69 1. Set II = MII. We will try to schedule the loop within II cycles.
70 2. Try to schedule the insns one by one according to the ordering.
71 For each insn compute an interval of cycles by considering already-
72 scheduled preds and succs (and associated latencies); try to place
73 the insn in the cycles of this window checking for potential
74 resource conflicts (using the DFA interface).
75 Note: this is different from the cycle-scheduling of schedule_insns;
76 here the insns are not scheduled monotonically top-down (nor bottom-
78 3. If failed in scheduling all insns - bump II++ and try again, unless
79 II reaches an upper bound MaxII, in which case report failure.
80 5. If we succeeded in scheduling the loop within II cycles, we now
81 generate prolog and epilog, decrease the counter of the loop, and
82 perform modulo variable expansion for live ranges that span more than
83 II cycles (i.e. use register copies to prevent a def from overwriting
84 itself before reaching the use).
88 /* This page defines partial-schedule structures and functions for
91 typedef struct partial_schedule *partial_schedule_ptr;
92 typedef struct ps_insn *ps_insn_ptr;
94 /* The minimum (absolute) cycle that a node of ps was scheduled in. */
95 #define PS_MIN_CYCLE(ps) (((partial_schedule_ptr)(ps))->min_cycle)
97 /* The maximum (absolute) cycle that a node of ps was scheduled in. */
98 #define PS_MAX_CYCLE(ps) (((partial_schedule_ptr)(ps))->max_cycle)
100 /* Perform signed modulo, always returning a non-negative value. */
101 #define SMODULO(x,y) ((x) % (y) < 0 ? ((x) % (y) + (y)) : (x) % (y))
103 /* The number of different iterations the nodes in ps span, assuming
104 the stage boundaries are placed efficiently. */
105 #define PS_STAGE_COUNT(ps) ((PS_MAX_CYCLE (ps) - PS_MIN_CYCLE (ps) \
106 + 1 + (ps)->ii - 1) / (ps)->ii)
108 #define CFG_HOOKS cfg_layout_rtl_cfg_hooks
110 /* A single instruction in the partial schedule. */
113 /* The corresponding DDG_NODE. */
116 /* The (absolute) cycle in which the PS instruction is scheduled.
117 Same as SCHED_TIME (node). */
120 /* The next/prev PS_INSN in the same row. */
121 ps_insn_ptr next_in_row,
124 /* The number of nodes in the same row that come after this node. */
128 /* Holds the partial schedule as an array of II rows. Each entry of the
129 array points to a linked list of PS_INSNs, which represents the
130 instructions that are scheduled for that row. */
131 struct partial_schedule
133 int ii; /* Number of rows in the partial schedule. */
134 int history; /* Threshold for conflict checking using DFA. */
136 /* rows[i] points to linked list of insns scheduled in row i (0<=i<ii). */
139 /* The earliest absolute cycle of an insn in the partial schedule. */
142 /* The latest absolute cycle of an insn in the partial schedule. */
145 ddg_ptr g; /* The DDG of the insns in the partial schedule. */
149 partial_schedule_ptr create_partial_schedule (int ii, ddg_ptr, int history);
150 void free_partial_schedule (partial_schedule_ptr);
151 void reset_partial_schedule (partial_schedule_ptr, int new_ii);
152 void print_partial_schedule (partial_schedule_ptr, FILE *);
153 ps_insn_ptr ps_add_node_check_conflicts (partial_schedule_ptr,
154 ddg_node_ptr node, int cycle);
155 void rotate_partial_schedule (partial_schedule_ptr, int);
156 void set_row_column_for_ps (partial_schedule_ptr);
159 /* This page defines constants and structures for the modulo scheduling
162 /* As in haifa-sched.c: */
163 /* issue_rate is the number of insns that can be scheduled in the same
164 machine cycle. It can be defined in the config/mach/mach.h file,
165 otherwise we set it to 1. */
167 static int issue_rate;
169 /* For printing statistics. */
170 static FILE *stats_file;
172 static int sms_order_nodes (ddg_ptr, int, int * result);
173 static void set_node_sched_params (ddg_ptr);
174 static partial_schedule_ptr sms_schedule_by_order (ddg_ptr, int, int,
176 static void permute_partial_schedule (partial_schedule_ptr ps, rtx last);
177 static void generate_prolog_epilog (partial_schedule_ptr, rtx, rtx, int);
178 static void duplicate_insns_of_cycles (partial_schedule_ptr ps,
179 int from_stage, int to_stage,
183 #define SCHED_ASAP(x) (((node_sched_params_ptr)(x)->aux.info)->asap)
184 #define SCHED_TIME(x) (((node_sched_params_ptr)(x)->aux.info)->time)
185 #define SCHED_FIRST_REG_MOVE(x) \
186 (((node_sched_params_ptr)(x)->aux.info)->first_reg_move)
187 #define SCHED_NREG_MOVES(x) \
188 (((node_sched_params_ptr)(x)->aux.info)->nreg_moves)
189 #define SCHED_ROW(x) (((node_sched_params_ptr)(x)->aux.info)->row)
190 #define SCHED_STAGE(x) (((node_sched_params_ptr)(x)->aux.info)->stage)
191 #define SCHED_COLUMN(x) (((node_sched_params_ptr)(x)->aux.info)->column)
193 /* The scheduling parameters held for each node. */
194 typedef struct node_sched_params
196 int asap; /* A lower-bound on the absolute scheduling cycle. */
197 int time; /* The absolute scheduling cycle (time >= asap). */
199 /* The following field (first_reg_move) is a pointer to the first
200 register-move instruction added to handle the modulo-variable-expansion
201 of the register defined by this node. This register-move copies the
202 original register defined by the node. */
205 /* The number of register-move instructions added, immediately preceding
209 int row; /* Holds time % ii. */
210 int stage; /* Holds time / ii. */
212 /* The column of a node inside the ps. If nodes u, v are on the same row,
213 u will precede v if column (u) < column (v). */
215 } *node_sched_params_ptr;
218 /* The following three functions are copied from the current scheduler
219 code in order to use sched_analyze() for computing the dependecies.
220 They are used when initializing the sched_info structure. */
222 sms_print_insn (rtx insn, int aligned ATTRIBUTE_UNUSED)
226 sprintf (tmp, "i%4d", INSN_UID (insn));
231 contributes_to_priority (rtx next, rtx insn)
233 return BLOCK_NUM (next) == BLOCK_NUM (insn);
237 compute_jump_reg_dependencies (rtx insn ATTRIBUTE_UNUSED,
238 regset cond_exec ATTRIBUTE_UNUSED,
239 regset used ATTRIBUTE_UNUSED,
240 regset set ATTRIBUTE_UNUSED)
244 static struct sched_info sms_sched_info =
252 contributes_to_priority,
253 compute_jump_reg_dependencies,
260 /* Return the register decremented and tested or zero if it is not a decrement
261 and branch jump insn (similar to doloop_condition_get). */
263 doloop_register_get (rtx insn, rtx *comp)
265 rtx pattern, cmp, inc, reg, condition;
267 if (GET_CODE (insn) != JUMP_INSN)
269 pattern = PATTERN (insn);
271 /* The canonical doloop pattern we expect is:
273 (parallel [(set (pc) (if_then_else (condition)
276 (set (reg) (plus (reg) (const_int -1)))
277 (additional clobbers and uses)])
279 where condition is further restricted to be
280 (ne (reg) (const_int 1)). */
282 if (GET_CODE (pattern) != PARALLEL)
285 cmp = XVECEXP (pattern, 0, 0);
286 inc = XVECEXP (pattern, 0, 1);
287 /* Return the compare rtx. */
290 /* Check for (set (reg) (something)). */
291 if (GET_CODE (inc) != SET || ! REG_P (SET_DEST (inc)))
294 /* Extract loop counter register. */
295 reg = SET_DEST (inc);
297 /* Check if something = (plus (reg) (const_int -1)). */
298 if (GET_CODE (SET_SRC (inc)) != PLUS
299 || XEXP (SET_SRC (inc), 0) != reg
300 || XEXP (SET_SRC (inc), 1) != constm1_rtx)
303 /* Check for (set (pc) (if_then_else (condition)
306 if (GET_CODE (cmp) != SET
307 || SET_DEST (cmp) != pc_rtx
308 || GET_CODE (SET_SRC (cmp)) != IF_THEN_ELSE
309 || GET_CODE (XEXP (SET_SRC (cmp), 1)) != LABEL_REF
310 || XEXP (SET_SRC (cmp), 2) != pc_rtx)
313 /* Extract loop termination condition. */
314 condition = XEXP (SET_SRC (cmp), 0);
316 /* Check if condition = (ne (reg) (const_int 1)), which is more
317 restrictive than the check in doloop_condition_get:
318 if ((GET_CODE (condition) != GE && GET_CODE (condition) != NE)
319 || GET_CODE (XEXP (condition, 1)) != CONST_INT). */
320 if (GET_CODE (condition) != NE
321 || XEXP (condition, 1) != const1_rtx)
324 if (XEXP (condition, 0) == reg)
330 /* Check if COUNT_REG is set to a constant in the PRE_HEADER block, so
331 that the number of iterations is a compile-time constant. If so,
332 return the rtx that sets COUNT_REG to a constant, and set COUNT to
333 this constant. Otherwise return 0. */
335 const_iteration_count (rtx count_reg, basic_block pre_header,
336 HOST_WIDEST_INT * count)
340 get_block_head_tail (pre_header->index, &head, &tail);
342 for (insn = tail; insn != PREV_INSN (head); insn = PREV_INSN (insn))
343 if (INSN_P (insn) && single_set (insn) &&
344 rtx_equal_p (count_reg, SET_DEST (single_set (insn))))
346 rtx pat = single_set (insn);
348 if (GET_CODE (SET_SRC (pat)) == CONST_INT)
350 *count = INTVAL (SET_SRC (pat));
360 /* A very simple resource-based lower bound on the initiation interval.
361 ??? Improve the accuracy of this bound by considering the
362 utilization of various units. */
366 return (g->num_nodes / issue_rate);
370 /* Points to the array that contains the sched data for each node. */
371 static node_sched_params_ptr node_sched_params;
373 /* Allocate sched_params for each node and initialize it. Assumes that
374 the aux field of each node contain the asap bound (computed earlier),
375 and copies it into the sched_params field. */
377 set_node_sched_params (ddg_ptr g)
381 /* Allocate for each node in the DDG a place to hold the "sched_data". */
382 /* Initialize ASAP/ALAP/HIGHT to zero. */
383 node_sched_params = (node_sched_params_ptr)
384 xcalloc (g->num_nodes,
385 sizeof (struct node_sched_params));
387 /* Set the pointer of the general data of the node to point to the
388 appropriate sched_params strcture. */
389 for (i = 0; i < g->num_nodes; i++)
391 /* Watch out for aliasing problems? */
392 node_sched_params[i].asap = g->nodes[i].aux.count;
393 g->nodes[i].aux.info = &node_sched_params[i];
398 print_node_sched_params (FILE * dump_file, int num_nodes)
402 for (i = 0; i < num_nodes; i++)
404 node_sched_params_ptr nsp = &node_sched_params[i];
405 rtx reg_move = nsp->first_reg_move;
408 fprintf (dump_file, "Node %d:\n", i);
409 fprintf (dump_file, " asap = %d:\n", nsp->asap);
410 fprintf (dump_file, " time = %d:\n", nsp->time);
411 fprintf (dump_file, " nreg_moves = %d:\n", nsp->nreg_moves);
412 for (j = 0; j < nsp->nreg_moves; j++)
414 fprintf (dump_file, " reg_move = ");
415 print_rtl_single (dump_file, reg_move);
416 reg_move = PREV_INSN (reg_move);
421 /* Calculate an upper bound for II. SMS should not schedule the loop if it
422 requires more cycles than this bound. Currently set to the sum of the
423 longest latency edge for each node. Reset based on experiments. */
425 calculate_maxii (ddg_ptr g)
430 for (i = 0; i < g->num_nodes; i++)
432 ddg_node_ptr u = &g->nodes[i];
434 int max_edge_latency = 0;
436 for (e = u->out; e; e = e->next_out)
437 max_edge_latency = MAX (max_edge_latency, e->latency);
439 maxii += max_edge_latency;
445 /* Given the partial schdule, generate register moves when the length
446 of the register live range is more than ii; the number of moves is
447 determined according to the following equation:
448 SCHED_TIME (use) - SCHED_TIME (def) { 1 broken loop-carried
449 nreg_moves = ----------------------------------- - { dependecnce.
451 This handles the modulo-variable-expansions (mve's) needed for the ps. */
453 generate_reg_moves (partial_schedule_ptr ps)
459 for (i = 0; i < g->num_nodes; i++)
461 ddg_node_ptr u = &g->nodes[i];
463 int nreg_moves = 0, i_reg_move;
464 sbitmap *uses_of_defs;
466 rtx prev_reg, old_reg;
468 /* Compute the number of reg_moves needed for u, by looking at life
469 ranges started at u (excluding self-loops). */
470 for (e = u->out; e; e = e->next_out)
471 if (e->type == TRUE_DEP && e->dest != e->src)
473 int nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;
475 /* If dest precedes src in the schedule of the kernel, then dest
476 will read before src writes and we can save one reg_copy. */
477 if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
478 && SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
481 nreg_moves = MAX (nreg_moves, nreg_moves4e);
487 /* Every use of the register defined by node may require a different
488 copy of this register, depending on the time the use is scheduled.
489 Set a bitmap vector, telling which nodes use each copy of this
491 uses_of_defs = sbitmap_vector_alloc (nreg_moves, g->num_nodes);
492 sbitmap_vector_zero (uses_of_defs, nreg_moves);
493 for (e = u->out; e; e = e->next_out)
494 if (e->type == TRUE_DEP && e->dest != e->src)
496 int dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;
498 if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
499 && SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
503 SET_BIT (uses_of_defs[dest_copy - 1], e->dest->cuid);
506 /* Now generate the reg_moves, attaching relevant uses to them. */
507 SCHED_NREG_MOVES (u) = nreg_moves;
508 old_reg = prev_reg = copy_rtx (SET_DEST (single_set (u->insn)));
509 last_reg_move = u->insn;
511 for (i_reg_move = 0; i_reg_move < nreg_moves; i_reg_move++)
514 rtx new_reg = gen_reg_rtx (GET_MODE (prev_reg));
515 rtx reg_move = gen_move_insn (new_reg, prev_reg);
517 add_insn_before (reg_move, last_reg_move);
518 last_reg_move = reg_move;
520 if (!SCHED_FIRST_REG_MOVE (u))
521 SCHED_FIRST_REG_MOVE (u) = reg_move;
523 EXECUTE_IF_SET_IN_SBITMAP (uses_of_defs[i_reg_move], 0, i_use,
524 replace_rtx (g->nodes[i_use].insn, old_reg, new_reg));
531 /* Bump the SCHED_TIMEs of all nodes to start from zero. Set the values
532 of SCHED_ROW and SCHED_STAGE. */
534 normalize_sched_times (partial_schedule_ptr ps)
538 int amount = PS_MIN_CYCLE (ps);
541 for (i = 0; i < g->num_nodes; i++)
543 ddg_node_ptr u = &g->nodes[i];
544 int normalized_time = SCHED_TIME (u) - amount;
546 if (normalized_time < 0)
549 SCHED_TIME (u) = normalized_time;
550 SCHED_ROW (u) = normalized_time % ii;
551 SCHED_STAGE (u) = normalized_time / ii;
555 /* Set SCHED_COLUMN of each node according to its position in PS. */
557 set_columns_for_ps (partial_schedule_ptr ps)
561 for (row = 0; row < ps->ii; row++)
563 ps_insn_ptr cur_insn = ps->rows[row];
566 for (; cur_insn; cur_insn = cur_insn->next_in_row)
567 SCHED_COLUMN (cur_insn->node) = column++;
571 /* Permute the insns according to their order in PS, from row 0 to
572 row ii-1, and position them right before LAST. This schedules
573 the insns of the loop kernel. */
575 permute_partial_schedule (partial_schedule_ptr ps, rtx last)
581 for (row = 0; row < ii ; row++)
582 for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
583 if (PREV_INSN (last) != ps_ij->node->insn)
584 reorder_insns_nobb (ps_ij->node->first_note, ps_ij->node->insn,
588 /* Used to generate the prologue & epilogue. Duplicate the subset of
589 nodes whose stages are between FROM_STAGE and TO_STAGE (inclusive
590 of both), together with a prefix/suffix of their reg_moves. */
592 duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
593 int to_stage, int for_prolog)
598 for (row = 0; row < ps->ii; row++)
599 for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
601 ddg_node_ptr u_node = ps_ij->node;
603 rtx reg_move = NULL_RTX;
607 /* SCHED_STAGE (u_node) >= from_stage == 0. Generate increasing
608 number of reg_moves starting with the second occurrence of
609 u_node, which is generated if its SCHED_STAGE <= to_stage. */
610 i_reg_moves = to_stage - SCHED_STAGE (u_node);
611 i_reg_moves = MAX (i_reg_moves, 0);
612 i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));
614 /* The reg_moves start from the *first* reg_move backwards. */
617 reg_move = SCHED_FIRST_REG_MOVE (u_node);
618 for (j = 1; j < i_reg_moves; j++)
619 reg_move = PREV_INSN (reg_move);
622 else /* It's for the epilog. */
624 /* SCHED_STAGE (u_node) <= to_stage. Generate all reg_moves,
625 starting to decrease one stage after u_node no longer occurs;
626 that is, generate all reg_moves until
627 SCHED_STAGE (u_node) == from_stage - 1. */
628 i_reg_moves = SCHED_NREG_MOVES (u_node)
629 - (from_stage - SCHED_STAGE (u_node) - 1);
630 i_reg_moves = MAX (i_reg_moves, 0);
631 i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));
633 /* The reg_moves start from the *last* reg_move forwards. */
636 reg_move = SCHED_FIRST_REG_MOVE (u_node);
637 for (j = 1; j < SCHED_NREG_MOVES (u_node); j++)
638 reg_move = PREV_INSN (reg_move);
642 for (j = 0; j < i_reg_moves; j++, reg_move = NEXT_INSN (reg_move))
643 emit_insn (copy_rtx (PATTERN (reg_move)));
645 if (SCHED_STAGE (u_node) >= from_stage
646 && SCHED_STAGE (u_node) <= to_stage)
647 duplicate_insn_chain (u_node->first_note, u_node->insn);
652 /* Generate the instructions (including reg_moves) for prolog & epilog. */
654 generate_prolog_epilog (partial_schedule_ptr ps, rtx orig_loop_beg,
655 rtx orig_loop_end, int unknown_count)
658 int last_stage = PS_STAGE_COUNT (ps) - 1;
660 rtx c_reg = NULL_RTX;
662 rtx precond_jump = NULL_RTX;
663 rtx precond_exit_label = NULL_RTX;
664 rtx precond_exit_label_insn = NULL_RTX;
665 rtx last_epilog_insn = NULL_RTX;
666 rtx loop_exit_label = NULL_RTX;
667 rtx loop_exit_label_insn = NULL_RTX;
668 rtx orig_loop_bct = NULL_RTX;
670 /* Loop header edge. */
672 if (e->src == ps->g->bb)
675 /* Generate the prolog, inserting its insns on the loop-entry edge. */
678 /* This is the place where we want to insert the precondition. */
680 precond_jump = emit_note (NOTE_INSN_DELETED);
682 for (i = 0; i < last_stage; i++)
683 duplicate_insns_of_cycles (ps, 0, i, 1);
685 /* No need to call insert_insn_on_edge; we prepared the sequence. */
686 e->insns.r = get_insns ();
689 /* Generate the epilog, inserting its insns on the loop-exit edge. */
692 for (i = 0; i < last_stage; i++)
693 duplicate_insns_of_cycles (ps, i + 1, last_stage, 0);
695 last_epilog_insn = emit_note (NOTE_INSN_DELETED);
697 /* Emit the label where to put the original loop code. */
702 precond_exit_label = gen_label_rtx ();
703 precond_exit_label_insn = emit_label (precond_exit_label);
705 /* Put the original loop code. */
706 reorder_insns_nobb (orig_loop_beg, orig_loop_end, precond_exit_label_insn);
708 /* Change the label of the BCT to be the PRECOND_EXIT_LABEL. */
709 orig_loop_bct = get_last_insn ();
710 c_reg = doloop_register_get (orig_loop_bct, &cmp);
711 label = XEXP (SET_SRC (cmp), 1);
712 cond = XEXP (SET_SRC (cmp), 0);
714 if (! c_reg || GET_CODE (cond) != NE)
717 XEXP (label, 0) = precond_exit_label;
718 JUMP_LABEL (orig_loop_bct) = precond_exit_label_insn;
719 LABEL_NUSES (precond_exit_label_insn)++;
721 /* Generate the loop exit label. */
722 loop_exit_label = gen_label_rtx ();
723 loop_exit_label_insn = emit_label (loop_exit_label);
727 if (e->dest == ps->g->bb)
730 e->insns.r = get_insns ();
733 commit_edge_insertions ();
737 rtx precond_insns, epilog_jump, insert_after_insn;
738 basic_block loop_exit_bb = BLOCK_FOR_INSN (loop_exit_label_insn);
739 basic_block epilog_bb = BLOCK_FOR_INSN (last_epilog_insn);
740 basic_block precond_bb = BLOCK_FOR_INSN (precond_jump);
741 basic_block orig_loop_bb = BLOCK_FOR_INSN (precond_exit_label_insn);
742 edge epilog_exit_edge = epilog_bb->succ;
744 /* Do loop preconditioning to take care of cases were the loop count is
745 less than the stage count. Update the CFG properly. */
746 insert_after_insn = precond_jump;
748 c_reg = doloop_register_get (ps->g->closing_branch->insn, &cmp);
749 emit_cmp_and_jump_insns (c_reg, GEN_INT (PS_STAGE_COUNT (ps)), LT, NULL,
750 GET_MODE (c_reg), 1, precond_exit_label);
751 precond_insns = get_insns ();
752 precond_jump = get_last_insn ();
754 reorder_insns (precond_insns, precond_jump, insert_after_insn);
756 /* Generate a subtract instruction at the beginning of the prolog to
757 adjust the loop count by STAGE_COUNT. */
758 emit_insn_after (gen_sub2_insn (c_reg, GEN_INT (PS_STAGE_COUNT (ps) - 1)),
760 update_bb_for_insn (precond_bb);
761 delete_insn (insert_after_insn);
763 /* Update label info for the precondition jump. */
764 JUMP_LABEL (precond_jump) = precond_exit_label_insn;
765 LABEL_NUSES (precond_exit_label_insn)++;
767 /* Update the CFG. */
768 split_block (precond_bb, precond_jump);
769 make_edge (precond_bb, orig_loop_bb, 0);
771 /* Add a jump at end of the epilog to the LOOP_EXIT_LABEL to jump over the
772 original loop copy and update the CFG. */
773 epilog_jump = emit_jump_insn_after (gen_jump (loop_exit_label),
775 delete_insn (last_epilog_insn);
776 JUMP_LABEL (epilog_jump) = loop_exit_label_insn;
777 LABEL_NUSES (loop_exit_label_insn)++;
779 redirect_edge_succ (epilog_exit_edge, loop_exit_bb);
780 epilog_exit_edge->flags &= ~EDGE_FALLTHRU;
781 emit_barrier_after (BB_END (epilog_bb));
785 /* Return the line note insn preceding INSN, for debugging. Taken from
788 find_line_note (rtx insn)
790 for (; insn; insn = PREV_INSN (insn))
791 if (GET_CODE (insn) == NOTE
792 && NOTE_LINE_NUMBER (insn) >= 0)
798 /* Main entry point, perform SMS scheduling on the loops of the function
799 that consist of single basic blocks. */
801 sms_schedule (FILE *dump_file)
803 static int passes = 0;
806 basic_block bb, pre_header = NULL;
810 partial_schedule_ptr ps;
811 int max_bb_index = last_basic_block;
814 /* SMS uses the DFA interface. */
815 if (! targetm.sched.use_dfa_pipeline_interface
816 || ! (*targetm.sched.use_dfa_pipeline_interface) ())
819 stats_file = dump_file;
822 /* Initialize issue_rate. */
823 if (targetm.sched.issue_rate)
825 int temp = reload_completed;
827 reload_completed = 1;
828 issue_rate = (*targetm.sched.issue_rate) ();
829 reload_completed = temp;
834 /* Initilize the scheduler. */
835 current_sched_info = &sms_sched_info;
838 /* Init Data Flow analysis, to be used in interloop dep calculation. */
840 df_analyze (df, 0, DF_ALL);
842 /* Allocate memory to hold the DDG array. */
843 g_arr = xcalloc (max_bb_index, sizeof (ddg_ptr));
845 /* Build DDGs for all the relevant loops and hold them in G_ARR
846 indexed by the loop BB index. */
851 edge e, pre_header_edge;
856 /* Check if bb has two successors, one being itself. */
858 if (!e || !e->succ_next || e->succ_next->succ_next)
861 if (e->dest != bb && e->succ_next->dest != bb)
864 if ((e->flags & EDGE_COMPLEX)
865 || (e->succ_next->flags & EDGE_COMPLEX))
868 /* Check if bb has two predecessors, one being itself. */
869 /* In view of above tests, suffices to check e->pred_next->pred_next? */
871 if (!e || !e->pred_next || e->pred_next->pred_next)
874 if (e->src != bb && e->pred_next->src != bb)
877 if ((e->flags & EDGE_COMPLEX)
878 || (e->pred_next->flags & EDGE_COMPLEX))
882 if (passes++ > MAX_SMS_LOOP_NUMBER && MAX_SMS_LOOP_NUMBER != -1)
885 fprintf (dump_file, "SMS reached MAX_PASSES... \n");
889 get_block_head_tail (bb->index, &head, &tail);
890 pre_header_edge = bb->pred;
891 if (bb->pred->src != bb)
892 pre_header_edge = bb->pred->pred_next;
894 /* Perfrom SMS only on loops that their average count is above threshold. */
895 if (bb->count < pre_header_edge->count * SMS_LOOP_AVERAGE_COUNT_THRESHOLD)
899 rtx line_note = find_line_note (tail);
902 fprintf (stats_file, "SMS bb %s %d (file, line)\n",
903 NOTE_SOURCE_FILE (line_note), NOTE_LINE_NUMBER (line_note));
904 fprintf (stats_file, "SMS single-bb-loop\n");
905 if (profile_info && flag_branch_probabilities)
907 fprintf (stats_file, "SMS loop-count ");
908 fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
909 (HOST_WIDEST_INT) bb->count);
910 fprintf (stats_file, "\n");
911 fprintf (stats_file, "SMS preheader-count ");
912 fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
913 (HOST_WIDEST_INT) pre_header_edge->count);
914 fprintf (stats_file, "\n");
915 fprintf (stats_file, "SMS profile-sum-max ");
916 fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
917 (HOST_WIDEST_INT) profile_info->sum_max);
918 fprintf (stats_file, "\n");
924 /* Make sure this is a doloop. */
925 if ( !(count_reg = doloop_register_get (tail, &comp)))
930 pre_header = e->pred_next->src;
934 /* Don't handle BBs with calls or barriers, or !single_set insns. */
935 for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn))
936 if (GET_CODE (insn) == CALL_INSN
937 || GET_CODE (insn) == BARRIER
938 || (INSN_P (insn) && GET_CODE (insn) != JUMP_INSN
939 && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE))
942 if (insn != NEXT_INSN (tail))
946 if (GET_CODE (insn) == CALL_INSN)
947 fprintf (stats_file, "SMS loop-with-call\n");
948 else if (GET_CODE (insn) == BARRIER)
949 fprintf (stats_file, "SMS loop-with-barrier\n");
951 fprintf (stats_file, "SMS loop-with-not-single-set\n");
952 print_rtl_single (stats_file, insn);
958 if (! (g = create_ddg (bb, df, 0)))
961 fprintf (stats_file, "SMS doloop\n");
965 g_arr[bb->index] = g;
968 /* Release Data Flow analysis data structures. */
971 /* Go over the built DDGs and perfrom SMS for each one of them. */
972 for (i = 0; i < max_bb_index; i++)
975 rtx count_reg, count_init, comp;
976 edge pre_header_edge;
979 HOST_WIDEST_INT loop_count = 0;
981 if (! (g = g_arr[i]))
985 print_ddg (dump_file, g);
987 get_block_head_tail (g->bb->index, &head, &tail);
989 pre_header_edge = g->bb->pred;
990 if (g->bb->pred->src != g->bb)
991 pre_header_edge = g->bb->pred->pred_next;
995 rtx line_note = find_line_note (tail);
998 fprintf (stats_file, "SMS bb %s %d (file, line)\n",
999 NOTE_SOURCE_FILE (line_note), NOTE_LINE_NUMBER (line_note));
1000 fprintf (stats_file, "SMS single-bb-loop\n");
1001 if (profile_info && flag_branch_probabilities)
1003 fprintf (stats_file, "SMS loop-count ");
1004 fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
1005 (HOST_WIDEST_INT) bb->count);
1006 fprintf (stats_file, "\n");
1007 fprintf (stats_file, "SMS preheader-count ");
1008 fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
1009 (HOST_WIDEST_INT) pre_header_edge->count);
1010 fprintf (stats_file, "\n");
1011 fprintf (stats_file, "SMS profile-sum-max ");
1012 fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
1013 (HOST_WIDEST_INT) profile_info->sum_max);
1014 fprintf (stats_file, "\n");
1016 fprintf (stats_file, "SMS doloop\n");
1017 fprintf (stats_file, "SMS built-ddg %d\n", g->num_nodes);
1018 fprintf (stats_file, "SMS num-loads %d\n", g->num_loads);
1019 fprintf (stats_file, "SMS num-stores %d\n", g->num_stores);
1022 /* Make sure this is a doloop. */
1023 if ( !(count_reg = doloop_register_get (tail, &comp)))
1026 /* This should be NULL_RTX if the count is unknown at compile time. */
1027 count_init = const_iteration_count (count_reg, pre_header, &loop_count);
1029 if (stats_file && count_init)
1031 fprintf (stats_file, "SMS const-doloop ");
1032 fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC, loop_count);
1033 fprintf (stats_file, "\n");
1036 node_order = (int *) xmalloc (sizeof (int) * g->num_nodes);
1038 mii = 1; /* Need to pass some estimate of mii. */
1039 rec_mii = sms_order_nodes (g, mii, node_order);
1040 mii = MAX (res_MII (g), rec_mii);
1041 maxii = (calculate_maxii (g) * SMS_MAX_II_FACTOR) / 100;
1044 fprintf (stats_file, "SMS iis %d %d %d (rec_mii, mii, maxii)\n",
1045 rec_mii, mii, maxii);
1047 /* After sms_order_nodes and before sms_schedule_by_order, to copy over
1049 set_node_sched_params (g);
1051 ps = sms_schedule_by_order (g, mii, maxii, node_order, dump_file);
1054 stage_count = PS_STAGE_COUNT (ps);
1056 if (stage_count == 0 || (count_init && (stage_count > loop_count)))
1059 fprintf (dump_file, "SMS failed... \n");
1061 fprintf (stats_file, "SMS sched-failed %d\n", stage_count);
1065 rtx orig_loop_beg = NULL_RTX;
1066 rtx orig_loop_end = NULL_RTX;
1070 fprintf (stats_file,
1071 "SMS succeeded %d %d (with ii, sc)\n", ps->ii,
1073 print_partial_schedule (ps, dump_file);
1075 "SMS Branch (%d) will later be scheduled at cycle %d.\n",
1076 g->closing_branch->cuid, PS_MIN_CYCLE (ps) - 1);
1079 /* Save the original loop if we want to do loop preconditioning in
1080 case the BCT count is not known. */
1086 /* Copy the original loop code before modifying it - so we can use
1088 for (i = 0; i < ps->g->num_nodes; i++)
1089 duplicate_insn_chain (ps->g->nodes[i].first_note,
1090 ps->g->nodes[i].insn);
1092 orig_loop_beg = get_insns ();
1093 orig_loop_end = get_last_insn ();
1096 /* Set the stage boundaries. If the DDG is built with closing_branch_deps,
1097 the closing_branch was scheduled and should appear in the last (ii-1)
1098 row. Otherwise, we are free to schedule the branch, and we let nodes
1099 that were scheduled at the first PS_MIN_CYCLE cycle appear in the first
1100 row; this should reduce stage_count to minimum. */
1101 normalize_sched_times (ps);
1102 rotate_partial_schedule (ps, PS_MIN_CYCLE (ps));
1103 set_columns_for_ps (ps);
1105 permute_partial_schedule (ps, g->closing_branch->first_note);
1106 generate_reg_moves (ps);
1108 print_node_sched_params (dump_file, g->num_nodes);
1110 /* Set new iteration count of loop kernel. */
1112 SET_SRC (single_set (count_init)) = GEN_INT (loop_count
1115 /* Generate prolog and epilog. */
1116 generate_prolog_epilog (ps, orig_loop_beg, orig_loop_end,
1117 count_init ? 0 : 1);
1119 free_partial_schedule (ps);
1120 free (node_sched_params);
1125 /* Release scheduler data, needed until now because of DFA. */
1129 /* The SMS scheduling algorithm itself
1130 -----------------------------------
1131 Input: 'O' an ordered list of insns of a loop.
1132 Output: A scheduling of the loop - kernel, prolog, and epilogue.
1134 'Q' is the empty Set
1135 'PS' is the partial schedule; it holds the currently scheduled nodes with
1137 'PSP' previously scheduled predecessors.
1138 'PSS' previously scheduled successors.
1139 't(u)' the cycle where u is scheduled.
1140 'l(u)' is the latency of u.
1141 'd(v,u)' is the dependence distance from v to u.
1142 'ASAP(u)' the earliest time at which u could be scheduled as computed in
1143 the node ordering phase.
1144 'check_hardware_resources_conflicts(u, PS, c)'
1145 run a trace around cycle/slot through DFA model
1146 to check resource conflicts involving instruction u
1147 at cycle c given the partial schedule PS.
1148 'add_to_partial_schedule_at_time(u, PS, c)'
1149 Add the node/instruction u to the partial schedule
1151 'calculate_register_pressure(PS)'
1152 Given a schedule of instructions, calculate the register
1153 pressure it implies. One implementation could be the
1154 maximum number of overlapping live ranges.
1155 'maxRP' The maximum allowed register pressure, it is usually derived from the number
1156 registers available in the hardware.
1160 3. for each node u in O in pre-computed order
1161 4. if (PSP(u) != Q && PSS(u) == Q) then
1162 5. Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all v in PSP(u).
1163 6. start = Early_start; end = Early_start + II - 1; step = 1
1164 11. else if (PSP(u) == Q && PSS(u) != Q) then
1165 12. Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all v in PSS(u).
1166 13. start = Late_start; end = Late_start - II + 1; step = -1
1167 14. else if (PSP(u) != Q && PSS(u) != Q) then
1168 15. Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all v in PSP(u).
1169 16. Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all v in PSS(u).
1170 17. start = Early_start;
1171 18. end = min(Early_start + II - 1 , Late_start);
1173 20. else "if (PSP(u) == Q && PSS(u) == Q)"
1174 21. start = ASAP(u); end = start + II - 1; step = 1
1178 24. for (c = start ; c != end ; c += step)
1179 25. if check_hardware_resources_conflicts(u, PS, c) then
1180 26. add_to_partial_schedule_at_time(u, PS, c)
1185 31. if (success == false) then
1187 33. if (II > maxII) then
1188 34. finish - failed to schedule
1193 39. if (calculate_register_pressure(PS) > maxRP) then
1196 42. compute epilogue & prologue
1197 43. finish - succeeded to schedule
1200 /* A limit on the number of cycles that resource conflicts can span. ??? Should
1201 be provided by DFA, and be dependent on the type of insn scheduled. Currently
1202 set to 0 to save compile time. */
1203 #define DFA_HISTORY SMS_DFA_HISTORY
1205 static partial_schedule_ptr
1206 sms_schedule_by_order (ddg_ptr g, int mii, int maxii, int *nodes_order, FILE *dump_file)
/* List-schedule the nodes of G in the precomputed NODES_ORDER, trying
   successive initiation intervals starting at MII and staying below
   MAXII.  Returns the resulting partial schedule.  */
1210 int try_again_with_larger_ii = true;
1211 int num_nodes = g->num_nodes;
1213 int start, end, step; /* Place together into one struct? */
/* Scratch bitmaps: the set of already-scheduled nodes, and the
   already-scheduled predecessors (PSP) / successors (PSS) of the
   node currently being placed.  */
1214 sbitmap sched_nodes = sbitmap_alloc (num_nodes);
1215 sbitmap psp = sbitmap_alloc (num_nodes);
1216 sbitmap pss = sbitmap_alloc (num_nodes);
1217 partial_schedule_ptr ps = create_partial_schedule (ii, g, DFA_HISTORY);
/* Restart the whole pass with a larger II whenever some node could not
   be placed anywhere in its scheduling window.  */
1219 while (try_again_with_larger_ii && ii < maxii)
1222 fprintf(dump_file, "Starting with ii=%d\n", ii);
1223 try_again_with_larger_ii = false;
1224 sbitmap_zero (sched_nodes);
1226 for (i = 0; i < num_nodes; i++)
1228 int u = nodes_order[i];
1229 ddg_node_ptr u_node = &g->nodes[u];
1230 sbitmap u_node_preds = NODE_PREDECESSORS (u_node);
1231 sbitmap u_node_succs = NODE_SUCCESSORS (u_node);
1234 rtx insn = u_node->insn;
1239 if (GET_CODE (insn) == JUMP_INSN) /* Closing branch handled later. */
1242 /* 1. compute sched window for u (start, end, step). */
/* The _cg variants also report whether the intersection is nonempty.  */
1245 psp_not_empty = sbitmap_a_and_b_cg (psp, u_node_preds, sched_nodes);
1246 pss_not_empty = sbitmap_a_and_b_cg (pss, u_node_succs, sched_nodes);
/* Case 1: only predecessors scheduled - place u as early as its
   scheduled predecessors permit, scanning the window forward.  */
1248 if (psp_not_empty && !pss_not_empty)
1250 int early_start = 0;
1253 for (e = u_node->in; e != 0; e = e->next_in)
1255 ddg_node_ptr v_node = e->src;
1256 if (TEST_BIT (sched_nodes, v_node->cuid))
1258 early_start = MAX (early_start,
1260 + e->latency - (e->distance * ii));
/* Memory dependences additionally clip the window (II-1 bound).  */
1261 if (e->data_type == MEM_DEP)
1262 end = MIN (end, SCHED_TIME (v_node) + ii - 1);
1265 start = early_start;
1266 end = MIN (end, early_start + ii);
/* Case 2: only successors scheduled - place u as late as its
   scheduled successors permit, scanning backward (negative step).  */
1270 else if (!psp_not_empty && pss_not_empty)
1272 int late_start = INT_MAX;
1275 for (e = u_node->out; e != 0; e = e->next_out)
1277 ddg_node_ptr v_node = e->dest;
1278 if (TEST_BIT (sched_nodes, v_node->cuid))
1280 late_start = MIN (late_start,
1281 SCHED_TIME (v_node) - e->latency
1282 + (e->distance * ii));
1283 if (e->data_type == MEM_DEP)
1284 end = MAX (end, SCHED_TIME (v_node) - ii + 1);
1288 end = MAX (end, late_start - ii);
/* Case 3: both predecessors and successors scheduled - the window is
   bounded on both sides.  */
1292 else if (psp_not_empty && pss_not_empty)
1294 int early_start = 0;
1295 int late_start = INT_MAX;
1299 for (e = u_node->in; e != 0; e = e->next_in)
1301 ddg_node_ptr v_node = e->src;
1303 if (TEST_BIT (sched_nodes, v_node->cuid))
1305 early_start = MAX (early_start,
1306 SCHED_TIME (v_node) + e->latency
1307 - (e->distance * ii));
1308 if (e->data_type == MEM_DEP)
1309 end = MIN (end, SCHED_TIME (v_node) + ii - 1);
1312 for (e = u_node->out; e != 0; e = e->next_out)
1314 ddg_node_ptr v_node = e->dest;
1316 if (TEST_BIT (sched_nodes, v_node->cuid))
1318 late_start = MIN (late_start,
1319 SCHED_TIME (v_node) - e->latency
1320 + (e->distance * ii));
1321 if (e->data_type == MEM_DEP)
1322 start = MAX (start, SCHED_TIME (v_node) - ii + 1);
1325 start = MAX (start, early_start);
1326 end = MIN (end, MIN (early_start + ii, late_start + 1));
/* Case 4: no scheduled neighbors yet - start from the node's ASAP.  */
1329 else /* psp is empty && pss is empty. */
1331 start = SCHED_ASAP (u_node);
1336 /* 2. Try scheduling u in window. */
1338 fprintf(dump_file, "Trying to schedule node %d in (%d .. %d) step %d\n",
1339 u, start, end, step);
/* Probe each cycle of the (non-empty) window until one is free of
   resource conflicts; record the node's schedule time on success.  */
1342 if ((step > 0 && start < end) || (step < 0 && start > end))
1343 for (c = start; c != end; c += step)
1345 ps_insn_ptr psi = ps_add_node_check_conflicts (ps, u_node, c);
1349 SCHED_TIME (u_node) = c;
1350 SET_BIT (sched_nodes, u);
1353 fprintf(dump_file, "Schedule in %d\n", c);
1359 /* ??? Try backtracking instead of immediately ii++? */
/* Placement failed: discard the partial schedule and retry the whole
   ordering with a larger II.  */
1361 try_again_with_larger_ii = true;
1362 reset_partial_schedule (ps, ii);
1365 /* ??? If (success), check register pressure estimates. */
1366 } /* Continue with next node. */
1367 } /* While try_again_with_larger_ii. */
1369 sbitmap_free (sched_nodes);
1375 free_partial_schedule (ps);
1382 /* This page implements the algorithm for ordering the nodes of a DDG
1383 for modulo scheduling, activated through the
1384 "int sms_order_nodes (ddg_ptr, int mii, int * result)" API. */
/* Accessors for the ordering parameters attached to each DDG node
   through its aux.info pointer.  */
1386 #define ORDER_PARAMS(x) ((struct node_order_params *) (x)->aux.info)
1387 #define ASAP(x) (ORDER_PARAMS ((x))->asap)
1388 #define ALAP(x) (ORDER_PARAMS ((x))->alap)
1389 #define HEIGHT(x) (ORDER_PARAMS ((x))->height)
/* Mobility: slack between the latest and earliest start times.  */
1390 #define MOB(x) (ALAP ((x)) - ASAP ((x)))
1391 #define DEPTH(x) (ASAP ((x)))
1393 typedef struct node_order_params * nopa;
1395 static void order_nodes_of_sccs (ddg_all_sccs_ptr, int * result);
1396 static int order_nodes_in_scc (ddg_ptr, sbitmap, sbitmap, int*, int);
1397 static nopa calculate_order_params (ddg_ptr, int mii);
1398 static int find_max_asap (ddg_ptr, sbitmap);
1399 static int find_max_hv_min_mob (ddg_ptr, sbitmap);
1400 static int find_max_dv_min_mob (ddg_ptr, sbitmap);
/* Direction in which the ordering sweeps pick up an SCC's nodes.  */
1402 enum sms_direction {BOTTOMUP, TOPDOWN};
/* Per-node ordering parameters (ASAP/ALAP/HEIGHT), reached via
   ORDER_PARAMS above.  */
1404 struct node_order_params
1411 /* Check if NODE_ORDER contains a permutation of 0 .. NUM_NODES-1. */
1413 check_nodes_order (int *node_order, int num_nodes)
/* TMP marks the node indices seen so far.  */
1416 sbitmap tmp = sbitmap_alloc (num_nodes);
1420 for (i = 0; i < num_nodes; i++)
1422 int u = node_order[i];
/* Reject out-of-range indices and duplicates.  */
1424 if (u >= num_nodes || u < 0 || TEST_BIT (tmp, u))
1433 /* Order the nodes of G for scheduling and pass the result in
1434 NODE_ORDER. Also set aux.count of each node to ASAP.
1435 Return the recMII for the given DDG. */
1437 sms_order_nodes (ddg_ptr g, int mii, int * node_order)
1441 ddg_all_sccs_ptr sccs = create_ddg_all_sccs (g);
1443 nopa nops = calculate_order_params (g, mii);
1445 order_nodes_of_sccs (sccs, node_order);
1447 if (sccs->num_sccs > 0)
1448 /* First SCC has the largest recurrence_length. */
1449 rec_mii = sccs->sccs[0]->recurrence_length;
1451 /* Save ASAP before destroying node_order_params. */
1452 for (i = 0; i < g->num_nodes; i++)
1454 ddg_node_ptr v = &g->nodes[i];
1455 v->aux.count = ASAP (v);
/* NOTE(review): NOPS is presumably freed on an elided line near the
   frees below - confirm against the full source.  */
1459 free_ddg_all_sccs (sccs);
/* Sanity check: the result must be a permutation of 0..num_nodes-1.  */
1460 check_nodes_order (node_order, g->num_nodes);
1466 order_nodes_of_sccs (ddg_all_sccs_ptr all_sccs, int * node_order)
/* Emit all DDG nodes into NODE_ORDER, one (extended) SCC at a time,
   starting from the SCC with the highest recurrence MII.  */
1469 ddg_ptr g = all_sccs->ddg;
1470 int num_nodes = g->num_nodes;
1471 sbitmap prev_sccs = sbitmap_alloc (num_nodes);
1472 sbitmap on_path = sbitmap_alloc (num_nodes);
1473 sbitmap tmp = sbitmap_alloc (num_nodes);
1474 sbitmap ones = sbitmap_alloc (num_nodes);
1476 sbitmap_zero (prev_sccs);
1477 sbitmap_ones (ones);
1479 /* Perform the node ordering starting from the SCC with the highest recMII.
1480 For each SCC order the nodes according to their ASAP/ALAP/HEIGHT etc. */
1481 for (i = 0; i < all_sccs->num_sccs; i++)
1483 ddg_scc_ptr scc = all_sccs->sccs[i];
1485 /* Add nodes on paths from previous SCCs to the current SCC. */
1486 find_nodes_on_paths (on_path, g, prev_sccs, scc->nodes);
1487 sbitmap_a_or_b (tmp, scc->nodes, on_path);
1489 /* Add nodes on paths from the current SCC to previous SCCs. */
1490 find_nodes_on_paths (on_path, g, scc->nodes, prev_sccs);
1491 sbitmap_a_or_b (tmp, tmp, on_path);
1493 /* Remove nodes of previous SCCs from current extended SCC. */
1494 sbitmap_difference (tmp, tmp, prev_sccs);
1496 pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
1497 /* Above call to order_nodes_in_scc updated prev_sccs |= tmp. */
1500 /* Handle the remaining nodes that do not belong to any scc. Each call
1501 to order_nodes_in_scc handles a single connected component. */
1502 while (pos < g->num_nodes)
1504 sbitmap_difference (tmp, ones, prev_sccs);
1505 pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
1507 sbitmap_free (prev_sccs);
1508 sbitmap_free (on_path);
1510 sbitmap_free (ones);
1513 /* MII is needed if we consider backarcs (that do not close recursive cycles). */
1514 static struct node_order_params *
1515 calculate_order_params (ddg_ptr g, int mii ATTRIBUTE_UNUSED)
/* Compute ASAP, ALAP and HEIGHT for every node of G, attach the results
   to each node's aux.info, and return the allocated array (caller owns
   the memory).  */
1519 int num_nodes = g->num_nodes;
1521 /* Allocate a place to hold ordering params for each node in the DDG. */
1522 nopa node_order_params_arr;
1524 /* Initialize of ASAP/ALAP/HEIGHT to zero. */
1525 node_order_params_arr = (nopa) xcalloc (num_nodes,
1526 sizeof (struct node_order_params));
1528 /* Set the aux pointer of each node to point to its order_params structure. */
1529 for (u = 0; u < num_nodes; u++)
1530 g->nodes[u].aux.info = &node_order_params_arr[u];
1532 /* Disregarding a backarc from each recursive cycle to obtain a DAG,
1533 calculate ASAP, ALAP, mobility, distance, and height for each node
1534 in the dependence (directed acyclic) graph. */
1536 /* We assume that the nodes in the array are in topological order. */
/* Forward pass: a node's ASAP is the max over its incoming
   zero-distance edges of the source's ASAP plus the edge latency.  */
1539 for (u = 0; u < num_nodes; u++)
1541 ddg_node_ptr u_node = &g->nodes[u];
1544 for (e = u_node->in; e; e = e->next_in)
1545 if (e->distance == 0)
1546 ASAP (u_node) = MAX (ASAP (u_node),
1547 ASAP (e->src) + e->latency);
1548 max_asap = MAX (max_asap, ASAP (u_node));
/* Backward pass over the same order: ALAP and HEIGHT are derived from
   the outgoing zero-distance edges, starting ALAP at max_asap.  */
1551 for (u = num_nodes - 1; u > -1; u--)
1553 ddg_node_ptr u_node = &g->nodes[u];
1555 ALAP (u_node) = max_asap;
1556 HEIGHT (u_node) = 0;
1557 for (e = u_node->out; e; e = e->next_out)
1558 if (e->distance == 0)
1560 ALAP (u_node) = MIN (ALAP (u_node),
1561 ALAP (e->dest) - e->latency);
1562 HEIGHT (u_node) = MAX (HEIGHT (u_node),
1563 HEIGHT (e->dest) + e->latency);
1567 return node_order_params_arr;
/* Find the node in NODES with the maximal ASAP value.  */
1571 find_max_asap (ddg_ptr g, sbitmap nodes)
1577 EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u,
1579 ddg_node_ptr u_node = &g->nodes[u];
/* Track the running maximum over the bitmap members.  */
1581 if (max_asap < ASAP (u_node))
1583 max_asap = ASAP (u_node);
/* Find the node in NODES with the maximal HEIGHT, breaking ties by
   minimal mobility (MOB).  */
1591 find_max_hv_min_mob (ddg_ptr g, sbitmap nodes)
1595 int min_mob = INT_MAX;
1598 EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u,
1600 ddg_node_ptr u_node = &g->nodes[u];
/* New maximum height: reset the mobility tie-breaker.  */
1602 if (max_hv < HEIGHT (u_node))
1604 max_hv = HEIGHT (u_node);
1605 min_mob = MOB (u_node);
/* Equal height: prefer the node with smaller mobility.  */
1608 else if ((max_hv == HEIGHT (u_node))
1609 && (min_mob > MOB (u_node)))
1611 min_mob = MOB (u_node);
/* Find the node in NODES with the maximal DEPTH, breaking ties by
   minimal mobility (MOB).  Mirror image of find_max_hv_min_mob.  */
1619 find_max_dv_min_mob (ddg_ptr g, sbitmap nodes)
1623 int min_mob = INT_MAX;
1626 EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u,
1628 ddg_node_ptr u_node = &g->nodes[u];
/* New maximum depth: reset the mobility tie-breaker.  */
1630 if (max_dv < DEPTH (u_node))
1632 max_dv = DEPTH (u_node);
1633 min_mob = MOB (u_node);
/* Equal depth: prefer the node with smaller mobility.  */
1636 else if ((max_dv == DEPTH (u_node))
1637 && (min_mob > MOB (u_node)))
1639 min_mob = MOB (u_node);
1646 /* Places the nodes of SCC into the NODE_ORDER array starting
1647 at position POS, according to the SMS ordering algorithm.
1648 NODES_ORDERED (in&out parameter) holds the bitset of all nodes in
1649 the NODE_ORDER array, starting from position zero. */
1651 order_nodes_in_scc (ddg_ptr g, sbitmap nodes_ordered, sbitmap scc,
1652 int * node_order, int pos)
1654 enum sms_direction dir;
1655 int num_nodes = g->num_nodes;
1656 sbitmap workset = sbitmap_alloc (num_nodes);
1657 sbitmap tmp = sbitmap_alloc (num_nodes);
1658 sbitmap zero_bitmap = sbitmap_alloc (num_nodes);
1659 sbitmap predecessors = sbitmap_alloc (num_nodes);
1660 sbitmap successors = sbitmap_alloc (num_nodes);
1662 sbitmap_zero (predecessors);
1663 find_predecessors (predecessors, g, nodes_ordered);
1665 sbitmap_zero (successors);
1666 find_successors (successors, g, nodes_ordered);
/* Choose the initial workset and sweep direction: start from the SCC
   nodes adjacent to the already-ordered set if any exist; otherwise
   seed with the SCC node of maximal ASAP.  */
1669 if (sbitmap_a_and_b_cg (tmp, predecessors, scc))
1671 sbitmap_copy (workset, tmp);
1674 else if (sbitmap_a_and_b_cg (tmp, successors, scc))
1676 sbitmap_copy (workset, tmp);
1683 sbitmap_zero (workset);
1684 if ((u = find_max_asap (g, scc)) >= 0)
1685 SET_BIT (workset, u);
1689 sbitmap_zero (zero_bitmap);
/* Alternate top-down / bottom-up sweeps until the workset empties.  */
1690 while (!sbitmap_equal (workset, zero_bitmap))
1693 ddg_node_ptr v_node;
1694 sbitmap v_node_preds;
1695 sbitmap v_node_succs;
/* Top-down sweep: pick the node with maximal height (ties broken by
   minimal mobility), append it, then extend the workset with its
   not-yet-ordered successors inside the SCC.  */
1699 while (!sbitmap_equal (workset, zero_bitmap))
1701 v = find_max_hv_min_mob (g, workset);
1702 v_node = &g->nodes[v];
1703 node_order[pos++] = v;
1704 v_node_succs = NODE_SUCCESSORS (v_node);
1705 sbitmap_a_and_b (tmp, v_node_succs, scc);
1707 /* Don't consider the already ordered successors again. */
1708 sbitmap_difference (tmp, tmp, nodes_ordered);
1709 sbitmap_a_or_b (workset, workset, tmp);
1710 RESET_BIT (workset, v);
1711 SET_BIT (nodes_ordered, v);
/* Flip direction: continue with the predecessors (inside the SCC) of
   everything ordered so far.  */
1714 sbitmap_zero (predecessors);
1715 find_predecessors (predecessors, g, nodes_ordered);
1716 sbitmap_a_and_b (workset, predecessors, scc);
/* Bottom-up sweep, symmetric to the above but keyed on maximal
   depth.  */
1720 while (!sbitmap_equal (workset, zero_bitmap))
1722 v = find_max_dv_min_mob (g, workset);
1723 v_node = &g->nodes[v];
1724 node_order[pos++] = v;
1725 v_node_preds = NODE_PREDECESSORS (v_node);
1726 sbitmap_a_and_b (tmp, v_node_preds, scc);
1728 /* Don't consider the already ordered predecessors again. */
1729 sbitmap_difference (tmp, tmp, nodes_ordered);
1730 sbitmap_a_or_b (workset, workset, tmp);
1731 RESET_BIT (workset, v);
1732 SET_BIT (nodes_ordered, v);
1735 sbitmap_zero (successors);
1736 find_successors (successors, g, nodes_ordered);
1737 sbitmap_a_and_b (workset, successors, scc);
/* NOTE(review): TMP is not freed in the lines visible here - confirm
   it is released (or intentionally leaked) in the full source.  */
1741 sbitmap_free (workset);
1742 sbitmap_free (zero_bitmap);
1743 sbitmap_free (predecessors);
1744 sbitmap_free (successors);
1749 /* This page contains functions for manipulating partial-schedules during
1750 modulo scheduling. */
1752 /* Create a partial schedule and allocate a memory to hold II rows. */
1753 partial_schedule_ptr
1754 create_partial_schedule (int ii, ddg_ptr g, int history)
1756 partial_schedule_ptr ps = (partial_schedule_ptr)
1757 xmalloc (sizeof (struct partial_schedule));
/* One (initially empty) row of scheduled insns per cycle of the II.  */
1758 ps->rows = (ps_insn_ptr *) xcalloc (ii, sizeof (ps_insn_ptr));
1760 ps->history = history;
/* Empty schedule: min/max start inverted so the first insertion
   establishes the real occupied-cycle range.  */
1761 ps->min_cycle = INT_MAX;
1762 ps->max_cycle = INT_MIN;
1768 /* Free the PS_INSNs in rows array of the given partial schedule.
1769 ??? Consider caching the PS_INSN's. */
1771 free_ps_insns (partial_schedule_ptr ps)
/* Walk each of the II rows, releasing the row's chain of PS_INSNs.  */
1775 for (i = 0; i < ps->ii; i++)
/* Remember the next link before the current head is released.  */
1779 ps_insn_ptr ps_insn = ps->rows[i]->next_in_row;
1782 ps->rows[i] = ps_insn;
1788 /* Free all the memory allocated to the partial schedule. */
1790 free_partial_schedule (partial_schedule_ptr ps)
/* NOTE(review): only the signature is visible at this point in the
   file view; the body follows on elided lines.  */
1799 /* Clear the rows array with its PS_INSNs, and create a new one with
1802 reset_partial_schedule (partial_schedule_ptr ps, int new_ii)
/* Nothing to resize when the II is unchanged.  */
1807 if (new_ii == ps->ii)
/* Resize the rows array for the new II and clear all rows.  */
1809 ps->rows = (ps_insn_ptr *) xrealloc (ps->rows, new_ii
1810 * sizeof (ps_insn_ptr));
1811 memset (ps->rows, 0, new_ii * sizeof (ps_insn_ptr));
/* Reset the occupied-cycle range to its initial (empty) state.  */
1813 ps->min_cycle = INT_MAX;
1814 ps->max_cycle = INT_MIN;
1817 /* Prints the partial schedule as an ii rows array, for each rows
1818 print the ids of the insns in it. */
1820 print_partial_schedule (partial_schedule_ptr ps, FILE *dump)
1824 for (i = 0; i < ps->ii; i++)
1826 ps_insn_ptr ps_i = ps->rows[i];
1828 fprintf (dump, "\n[CYCLE %d ]: ", i);
/* Walk the row's chain, printing each insn's UID.  */
1831 fprintf (dump, "%d, ",
1832 INSN_UID (ps_i->node->insn));
1833 ps_i = ps_i->next_in_row;
1838 /* Creates an object of PS_INSN and initializes it to the given parameters. */
1840 create_ps_insn (ddg_node_ptr node, int rest_count, int cycle)
1842 ps_insn_ptr ps_i = xmalloc (sizeof (struct ps_insn));
/* Not yet linked into any row of the partial schedule.  */
1845 ps_i->next_in_row = NULL;
1846 ps_i->prev_in_row = NULL;
1847 ps_i->row_rest_count = rest_count;
1848 ps_i->cycle = cycle;
1854 /* Removes the given PS_INSN from the partial schedule. Returns false if the
1855 node is not found in the partial schedule, else returns true. */
1857 remove_node_from_ps (partial_schedule_ptr ps, ps_insn_ptr ps_i)
/* The insn lives in the row of its cycle modulo II.  */
1864 row = SMODULO (ps_i->cycle, ps->ii);
/* No predecessor in the row: ps_i must be the row head.  */
1865 if (! ps_i->prev_in_row)
1867 if (ps_i != ps->rows[row])
1870 ps->rows[row] = ps_i->next_in_row;
1872 ps->rows[row]->prev_in_row = NULL;
/* Otherwise unlink it from the middle of the doubly linked row.  */
1876 ps_i->prev_in_row->next_in_row = ps_i->next_in_row;
1877 if (ps_i->next_in_row)
1878 ps_i->next_in_row->prev_in_row = ps_i->prev_in_row;
1884 /* Advances the PS_INSN one column in its current row; returns false
1885 in failure and true in success. */
1887 ps_insn_advance_column (partial_schedule_ptr ps, ps_insn_ptr ps_i)
1889 ps_insn_ptr prev, next;
1895 row = SMODULO (ps_i->cycle, ps->ii);
/* Already last in its row - cannot advance.  */
1897 if (! ps_i->next_in_row)
1900 /* Check if next_in_row is dependent on ps_i, both having same sched
1901 times (typically ANTI_DEP). If so, ps_i cannot skip over it. */
1902 if (ps_i->cycle == ps_i->next_in_row->cycle)
1905 ddg_node_ptr next_node = ps_i->next_in_row->node;
1907 for (e = ps_i->node->out; e; e = e->next_out)
1908 if (e->dest == next_node)
1912 /* Advance PS_I over its next_in_row in the doubly linked list. */
1913 prev = ps_i->prev_in_row;
1914 next = ps_i->next_in_row;
/* If ps_i was the row head, NEXT becomes the new head.  */
1916 if (ps_i == ps->rows[row])
1917 ps->rows[row] = next;
/* Splice: prev -> next -> ps_i -> next->next_in_row.  */
1919 ps_i->next_in_row = next->next_in_row;
1921 if (next->next_in_row)
1922 next->next_in_row->prev_in_row = ps_i;
1924 next->next_in_row = ps_i;
1925 ps_i->prev_in_row = next;
1927 next->prev_in_row = prev;
1929 prev->next_in_row = next;
1934 /* Inserts a DDG_NODE to the given partial schedule at the given cycle.
1935 Returns 0 if this is not possible and a PS_INSN otherwise. */
1937 add_node_to_ps (partial_schedule_ptr ps, ddg_node_ptr node, int cycle)
1939 ps_insn_ptr ps_i, next_ps_i, advance_after;
/* The target row is the cycle modulo II.  */
1941 int row = SMODULO (cycle, ps->ii);
/* Refuse if the row has already used up all of its issue slots.  */
1945 && ps->rows[row]->row_rest_count >= issue_rate)
1949 rest_count += ps->rows[row]->row_rest_count;
/* Insert the new PS_INSN at the head of the row's doubly linked list.  */
1951 ps_i = create_ps_insn (node, rest_count, cycle);
1952 ps_i->next_in_row = ps->rows[row];
1953 ps_i->prev_in_row = NULL;
1954 if (ps_i->next_in_row)
1955 ps_i->next_in_row->prev_in_row = ps_i;
1956 ps->rows[row] = ps_i;
1958 /* Check if n is dependent on an insn already in row, having same cycle
1959 (typically ANTI_DEP). If so, n must skip over it. */
1960 advance_after = NULL;
1961 for (next_ps_i = ps_i->next_in_row;
1963 next_ps_i = next_ps_i->next_in_row)
1964 if (next_ps_i->cycle == cycle)
1965 for (e = node->in; e; e = e->next_in)
1966 if (e->src == next_ps_i->node)
1967 advance_after = next_ps_i;
/* Advance the new insn past every same-cycle insn it depends on;
   undo the insertion entirely if a column advance fails.  */
1970 while (ps_i->prev_in_row != advance_after)
1971 if (!ps_insn_advance_column (ps, ps_i))
1973 remove_node_from_ps (ps, ps_i);
1980 /* Advance time one cycle. Assumes DFA is being used. */
1982 advance_one_cycle (void)
/* Only meaningful when the target provides a DFA pipeline model.  */
1984 if (targetm.sched.use_dfa_pipeline_interface
1985 && (*targetm.sched.use_dfa_pipeline_interface) ())
/* Issue the target's pre-cycle insn (if any), advance the DFA state
   one cycle via the NULL transition, then issue the post-cycle insn
   (if any).  */
1987 if (targetm.sched.dfa_pre_cycle_insn)
1988 state_transition (curr_state,
1989 (*targetm.sched.dfa_pre_cycle_insn) ());
1991 state_transition (curr_state, NULL);
1993 if (targetm.sched.dfa_post_cycle_insn)
1994 state_transition (curr_state,
1995 (*targetm.sched.dfa_post_cycle_insn) ());
1999 /* Checks if PS has resource conflicts according to DFA, starting from
2000 FROM cycle to TO cycle; returns true if there are conflicts and false
2001 if there are no conflicts. Assumes DFA is being used. */
2003 ps_has_conflicts (partial_schedule_ptr ps, int from, int to)
/* Without a DFA pipeline model no resource checking is possible.  */
2007 if (! targetm.sched.use_dfa_pipeline_interface
2008 || ! (*targetm.sched.use_dfa_pipeline_interface) ())
/* Replay the schedule through a fresh DFA state.  */
2011 state_reset (curr_state);
2013 for (cycle = from; cycle <= to; cycle++)
2015 ps_insn_ptr crr_insn;
2016 /* Holds the remaining issue slots in the current row. */
2017 int can_issue_more = issue_rate;
2019 /* Walk through the DFA for the current row. */
2020 for (crr_insn = ps->rows[SMODULO (cycle, ps->ii)];
2022 crr_insn = crr_insn->next_in_row)
2024 rtx insn = crr_insn->node->insn;
2029 /* Check if there is room for the current insn. */
2030 if (!can_issue_more || state_dead_lock_p (curr_state))
2033 /* Update the DFA state and return with failure if the DFA found
2034 resource conflicts. */
2035 if (state_transition (curr_state, insn) >= 0)
/* Let the target adjust the remaining issue slots if it can.  */
2038 if (targetm.sched.variable_issue)
2040 (*targetm.sched.variable_issue) (sched_dump, sched_verbose,
2041 insn, can_issue_more)
2042 /* A naked CLOBBER or USE generates no instruction, so don't
2043 let them consume issue slots. */
2044 else if (GET_CODE (PATTERN (insn)) != USE
2045 && GET_CODE (PATTERN (insn)) != CLOBBER)
2049 /* Advance the DFA to the next cycle. */
2050 advance_one_cycle ();
2055 /* Checks if the given node causes resource conflicts when added to PS at
2056 cycle C. If not the node is added to PS and returned; otherwise zero
2059 ps_add_node_check_conflicts (partial_schedule_ptr ps, ddg_node_ptr n, int c)
2061 int has_conflicts = 0;
2064 /* First add the node to the PS, if this succeeds check for conflicts,
2065 trying different issue slots in the same row. */
2066 if (! (ps_i = add_node_to_ps (ps, n, c)))
2067 return NULL; /* Failed to insert the node at the given cycle. */
/* Conflict check at cycle C; a second, wider check follows on elided
   lines (its exact bounds are not visible here).  */
2069 has_conflicts = ps_has_conflicts (ps, c, c)
2071 && ps_has_conflicts (ps,
2075 /* Try different issue slots to find one that the given node can be
2076 scheduled in without conflicts. */
2077 while (has_conflicts)
/* No further column to try - give up on this cycle.  */
2079 if (! ps_insn_advance_column (ps, ps_i))
2081 has_conflicts = ps_has_conflicts (ps, c, c)
2083 && ps_has_conflicts (ps,
/* No conflict-free slot was found - undo the insertion.  */
2090 remove_node_from_ps (ps, ps_i);
/* Success: widen the recorded range of occupied cycles.  */
2094 ps->min_cycle = MIN (ps->min_cycle, c);
2095 ps->max_cycle = MAX (ps->max_cycle, c);
2099 /* Rotate the rows of PS such that insns scheduled at time
2100 START_CYCLE will appear in row 0. Updates max/min_cycles. */
2102 rotate_partial_schedule (partial_schedule_ptr ps, int start_cycle)
2104 int i, row, backward_rotates;
2105 int last_row = ps->ii - 1;
/* No rotation requested - nothing to do.  */
2107 if (start_cycle == 0)
2110 backward_rotates = SMODULO (start_cycle, ps->ii);
2112 /* Revisit later and optimize this into a single loop. */
/* Rotate one position at a time: shift every row up and wrap the old
   row 0 to the last row, repeated backward_rotates times.  */
2113 for (i = 0; i < backward_rotates; i++)
2115 ps_insn_ptr first_row = ps->rows[0];
2117 for (row = 0; row < last_row; row++)
2118 ps->rows[row] = ps->rows[row+1];
2120 ps->rows[last_row] = first_row;
/* Shift the recorded cycle range to the new time origin.  */
2123 ps->max_cycle -= start_cycle;
2124 ps->min_cycle -= start_cycle;