1 /* Swing Modulo Scheduling implementation.
2 Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
3 Free Software Foundation, Inc.
4 Contributed by Ayal Zaks and Mustafa Hagog <zaks,mustafa@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
25 #include "coretypes.h"
27 #include "diagnostic-core.h"
30 #include "hard-reg-set.h"
34 #include "insn-config.h"
35 #include "insn-attr.h"
38 #include "sched-int.h"
40 #include "cfglayout.h"
48 #include "tree-pass.h"
52 #ifdef INSN_SCHEDULING
54 /* This file contains the implementation of the Swing Modulo Scheduler,
55 described in the following references:
56 [1] J. Llosa, A. Gonzalez, E. Ayguade, M. Valero., and J. Eckhardt.
57 Lifetime--sensitive modulo scheduling in a production environment.
58 IEEE Trans. on Comps., 50(3), March 2001
59 [2] J. Llosa, A. Gonzalez, E. Ayguade, and M. Valero.
60 Swing Modulo Scheduling: A Lifetime Sensitive Approach.
61 PACT '96 , pages 80-87, October 1996 (Boston - Massachusetts - USA).
63 The basic structure is:
64 1. Build a data-dependence graph (DDG) for each loop.
65 2. Use the DDG to order the insns of a loop (not in topological order
66 necessarily, but rather) trying to place each insn after all its
67 predecessors _or_ after all its successors.
68 3. Compute MII: a lower bound on the number of cycles to schedule the loop.
69 4. Use the ordering to perform list-scheduling of the loop:
70 1. Set II = MII. We will try to schedule the loop within II cycles.
71 2. Try to schedule the insns one by one according to the ordering.
72 For each insn compute an interval of cycles by considering already-
73 scheduled preds and succs (and associated latencies); try to place
74 the insn in the cycles of this window checking for potential
75 resource conflicts (using the DFA interface).
76 Note: this is different from the cycle-scheduling of schedule_insns;
77 here the insns are not scheduled monotonically top-down (nor bottom-
79 3. If failed in scheduling all insns - bump II++ and try again, unless
80 II reaches an upper bound MaxII, in which case report failure.
81 5. If we succeeded in scheduling the loop within II cycles, we now
82 generate prolog and epilog, decrease the counter of the loop, and
83 perform modulo variable expansion for live ranges that span more than
84 II cycles (i.e. use register copies to prevent a def from overwriting
85 itself before reaching the use).
87 SMS works with countable loops (1) whose control part can be easily
88 decoupled from the rest of the loop and (2) whose loop count can
89 be easily adjusted. This is because we peel a constant number of
90 iterations into a prologue and epilogue for which we want to avoid
91 emitting the control part, and a kernel which is to iterate that
92 constant number of iterations less than the original loop. So the
93 control part should be a set of insns clearly identified and having
94 its own iv, not otherwise used in the loop (at-least for now), which
95 initializes a register before the loop to the number of iterations.
96 Currently SMS relies on the do-loop pattern to recognize such loops,
97 where (1) the control part comprises of all insns defining and/or
98 using a certain 'count' register and (2) the loop count can be
99 adjusted by modifying this register prior to the loop.
100 TODO: Rely on cfgloop analysis instead. */
102 /* This page defines partial-schedule structures and functions for
103 modulo scheduling. */
105 typedef struct partial_schedule *partial_schedule_ptr;
106 typedef struct ps_insn *ps_insn_ptr;
108 /* The minimum (absolute) cycle that a node of ps was scheduled in. */
109 #define PS_MIN_CYCLE(ps) (((partial_schedule_ptr)(ps))->min_cycle)
111 /* The maximum (absolute) cycle that a node of ps was scheduled in. */
112 #define PS_MAX_CYCLE(ps) (((partial_schedule_ptr)(ps))->max_cycle)
114 /* Perform signed modulo, always returning a non-negative value. */
115 #define SMODULO(x,y) ((x) % (y) < 0 ? ((x) % (y) + (y)) : (x) % (y))
117 /* The number of different iterations the nodes in ps span, assuming
118 the stage boundaries are placed efficiently. */
119 #define PS_STAGE_COUNT(ps) ((PS_MAX_CYCLE (ps) - PS_MIN_CYCLE (ps) \
120 + 1 + (ps)->ii - 1) / (ps)->ii)
122 /* A single instruction in the partial schedule. */
/* NOTE(review): the `struct ps_insn { ... }` braces and most member
   declarations (node, cycle, prev_in_row, row_rest_count) are missing
   from this listing; only their comments and next_in_row survive.  */
125 /* The corresponding DDG_NODE. */
128 /* The (absolute) cycle in which the PS instruction is scheduled.
129 Same as SCHED_TIME (node). */
132 /* The next/prev PS_INSN in the same row. */
133 ps_insn_ptr next_in_row,
136 /* The number of nodes in the same row that come after this node. */
140 /* Holds the partial schedule as an array of II rows. Each entry of the
141 array points to a linked list of PS_INSNs, which represents the
142 instructions that are scheduled for that row. */
143 struct partial_schedule
/* The partial schedule: a modulo-II table of linked rows of ps_insns.
   NOTE(review): this listing omits the opening/closing braces and the
   actual declarations of the rows array and min_cycle/max_cycle fields
   that PS_MIN_CYCLE/PS_MAX_CYCLE (defined above) dereference.  */
145 int ii;	/* Number of rows in the partial schedule. */
146 int history;  /* Threshold for conflict checking using DFA. */
148 /* rows[i] points to linked list of insns scheduled in row i (0<=i<ii). */
151 /* The earliest absolute cycle of an insn in the partial schedule. */
154 /* The latest absolute cycle of an insn in the partial schedule. */
157 ddg_ptr g;	/* The DDG of the insns in the partial schedule. */
160 /* We use this to record all the register replacements we do in
161 the kernel so we can undo SMS if it is not profitable. */
162 struct undo_replace_buff_elem
/* Singly-linked record of one register replacement performed by
   generate_reg_moves, kept so SMS can be undone if unprofitable.
   NOTE(review): the insn/orig_reg/new_reg members (assigned in
   generate_reg_moves below) are missing from this listing.  */
167 struct undo_replace_buff_elem *next;
172 static partial_schedule_ptr create_partial_schedule (int ii, ddg_ptr, int history);
173 static void free_partial_schedule (partial_schedule_ptr);
174 static void reset_partial_schedule (partial_schedule_ptr, int new_ii);
175 void print_partial_schedule (partial_schedule_ptr, FILE *);
176 static void verify_partial_schedule (partial_schedule_ptr, sbitmap);
177 static ps_insn_ptr ps_add_node_check_conflicts (partial_schedule_ptr,
178 ddg_node_ptr node, int cycle,
179 sbitmap must_precede,
180 sbitmap must_follow);
181 static void rotate_partial_schedule (partial_schedule_ptr, int);
182 void set_row_column_for_ps (partial_schedule_ptr);
183 static void ps_insert_empty_row (partial_schedule_ptr, int, sbitmap);
184 static int compute_split_row (sbitmap, int, int, int, ddg_node_ptr);
187 /* This page defines constants and structures for the modulo scheduling
190 static int sms_order_nodes (ddg_ptr, int, int *, int *);
191 static void set_node_sched_params (ddg_ptr);
192 static partial_schedule_ptr sms_schedule_by_order (ddg_ptr, int, int, int *);
193 static void permute_partial_schedule (partial_schedule_ptr, rtx);
194 static void generate_prolog_epilog (partial_schedule_ptr, struct loop *,
196 static void duplicate_insns_of_cycles (partial_schedule_ptr,
199 #define SCHED_ASAP(x) (((node_sched_params_ptr)(x)->aux.info)->asap)
200 #define SCHED_TIME(x) (((node_sched_params_ptr)(x)->aux.info)->time)
201 #define SCHED_FIRST_REG_MOVE(x) \
202 (((node_sched_params_ptr)(x)->aux.info)->first_reg_move)
203 #define SCHED_NREG_MOVES(x) \
204 (((node_sched_params_ptr)(x)->aux.info)->nreg_moves)
205 #define SCHED_ROW(x) (((node_sched_params_ptr)(x)->aux.info)->row)
206 #define SCHED_STAGE(x) (((node_sched_params_ptr)(x)->aux.info)->stage)
207 #define SCHED_COLUMN(x) (((node_sched_params_ptr)(x)->aux.info)->column)
209 /* The scheduling parameters held for each node. */
210 typedef struct node_sched_params
/* Per-DDG-node scheduling data, accessed through the SCHED_* macros
   above via each node's aux.info pointer.  */
212 int asap;		/* A lower-bound on the absolute scheduling cycle. */
213 int time;	/* The absolute scheduling cycle (time >= asap). */
215 /* The following field (first_reg_move) is a pointer to the first
216 register-move instruction added to handle the modulo-variable-expansion
217 of the register defined by this node. This register-move copies the
218 original register defined by the node. */
/* NOTE(review): the declarations of first_reg_move and nreg_moves are
   missing from this listing; only their comments survive.  */
221 /* The number of register-move instructions added, immediately preceding
225 int row; /* Holds time % ii. */
226 int stage; /* Holds time / ii. */
228 /* The column of a node inside the ps. If nodes u, v are on the same row,
229 u will precede v if column (u) < column (v). */
231 } *node_sched_params_ptr;
234 /* The following three functions are copied from the current scheduler
235 code in order to use sched_analyze() for computing the dependencies.
236 They are used when initializing the sched_info structure. */
/* Scheduler dump hook: render INSN as "i<uid>" for debug output; the
   ALIGNED argument is ignored.  NOTE(review): the return-type line and
   the static `tmp` buffer declaration/return are missing from this
   listing.  */
238 sms_print_insn (const_rtx insn, int aligned ATTRIBUTE_UNUSED)
242 sprintf (tmp, "i%4d", INSN_UID (insn));
/* Dependence-analysis hook for jumps.  All four parameters are marked
   ATTRIBUTE_UNUSED, so this appears to be a deliberate no-op stub (SMS
   schedules a single block) -- the body is not visible in this listing;
   confirm against the original file.  */
247 compute_jump_reg_dependencies (rtx insn ATTRIBUTE_UNUSED,
248 regset cond_exec ATTRIBUTE_UNUSED,
249 regset used ATTRIBUTE_UNUSED,
250 regset set ATTRIBUTE_UNUSED)
/* Static hook tables handed to the Haifa scheduler framework by
   setup_sched_infos below.  sms_common_sched_info is filled in at run
   time from haifa_common_sched_info.  NOTE(review): most initializer
   entries of the two tables below are missing from this listing.  */
254 static struct common_sched_info_def sms_common_sched_info;
256 static struct sched_deps_info_def sms_sched_deps_info =
258 compute_jump_reg_dependencies,
259 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
264 static struct haifa_sched_info sms_sched_info =
273 NULL, /* insn_finishes_block_p */
278 NULL, NULL, NULL, NULL,
282 /* Given HEAD and TAIL which are the first and last insns in a loop;
283 return the register which controls the loop. Return zero if it has
284 more than one occurrence in the loop besides the control part or the
285 do-loop pattern is not of the form we expect. */
287 doloop_register_get (rtx head ATTRIBUTE_UNUSED, rtx tail ATTRIBUTE_UNUSED)
289 #ifdef HAVE_doloop_end
290 rtx reg, condition, insn, first_insn_not_to_check;
295 /* TODO: Free SMS's dependence on doloop_condition_get. */
/* Extract the branch-on-count condition from the loop's closing branch
   TAIL; the count register is its first operand, possibly inside a
   PLUS (decrement-and-branch form).  NOTE(review): the NULL-condition
   early return is missing from this listing.  */
296 condition = doloop_condition_get (tail);
300 if (REG_P (XEXP (condition, 0)))
301 reg = XEXP (condition, 0);
302 else if (GET_CODE (XEXP (condition, 0)) == PLUS
303 && REG_P (XEXP (XEXP (condition, 0), 0)))
304 reg = XEXP (XEXP (condition, 0), 0);
308 /* Check that the COUNT_REG has no other occurrences in the loop
309 until the decrement. We assume the control part consists of
310 either a single (parallel) branch-on-count or a (non-parallel)
311 branch immediately preceded by a single (decrement) insn. */
/* NOTE(review): the non-PARALLEL arm of this conditional (presumably
   PREV_INSN (tail)) is missing from this listing.  */
312 first_insn_not_to_check = (GET_CODE (PATTERN (tail)) == PARALLEL ? tail
/* Reject the loop if the count register is mentioned anywhere outside
   the control part; dump the offending insn when dumping is enabled.  */
315 for (insn = head; insn != first_insn_not_to_check; insn = NEXT_INSN (insn))
316 if (reg_mentioned_p (reg, insn))
320 fprintf (dump_file, "SMS count_reg found ");
321 print_rtl_single (dump_file, reg);
322 fprintf (dump_file, " outside control in insn:\n");
323 print_rtl_single (dump_file, insn);
335 /* Check if COUNT_REG is set to a constant in the PRE_HEADER block, so
336 that the number of iterations is a compile-time constant. If so,
337 return the rtx that sets COUNT_REG to a constant, and set COUNT to
338 this constant. Otherwise return 0. */
340 const_iteration_count (rtx count_reg, basic_block pre_header,
341 HOST_WIDEST_INT * count)
/* Scan the pre-header backwards for the last single_set whose
   destination is COUNT_REG; if its source is a CONST_INT, store the
   value through COUNT.  NOTE(review): the declarations of head/tail/
   insn, the NULL pre_header check, and the return statements are
   missing from this listing.  */
349 get_ebb_head_tail (pre_header, pre_header, &head, &tail);
351 for (insn = tail; insn != PREV_INSN (head); insn = PREV_INSN (insn))
352 if (NONDEBUG_INSN_P (insn) && single_set (insn) &&
353 rtx_equal_p (count_reg, SET_DEST (single_set (insn))))
355 rtx pat = single_set (insn);
357 if (CONST_INT_P (SET_SRC (pat)))
359 *count = INTVAL (SET_SRC (pat));
369 /* A very simple resource-based lower bound on the initiation interval.
370 ??? Improve the accuracy of this bound by considering the
371 utilization of various units. */
/* Prefer a target-provided resource-MII estimate when the hook exists;
   otherwise fall back to (real insns) / issue_rate.  NOTE(review):
   integer division truncates here, so the fallback can be 0 for small
   loops -- callers appear to clamp via MAX with rec_mii.  */
375 if (targetm.sched.sms_res_mii)
376 return targetm.sched.sms_res_mii (g);
378 return ((g->num_nodes - g->num_debug) / issue_rate);
382 /* Points to the array that contains the sched data for each node. */
383 static node_sched_params_ptr node_sched_params;
385 /* Allocate sched_params for each node and initialize it. Assumes that
386 the aux field of each node contain the asap bound (computed earlier),
387 and copies it into the sched_params field. */
389 set_node_sched_params (ddg_ptr g)
393 /* Allocate for each node in the DDG a place to hold the "sched_data". */
394 /* Initialize ASAP/ALAP/HIGHT to zero. */
/* xcalloc zero-fills, so time/row/stage/column and the reg-move fields
   all start at 0/NULL.  The array is stored in the file-scope
   node_sched_params pointer declared above.  */
395 node_sched_params = (node_sched_params_ptr)
396 xcalloc (g->num_nodes,
397 sizeof (struct node_sched_params));
399 /* Set the pointer of the general data of the node to point to the
400 appropriate sched_params structure. */
401 for (i = 0; i < g->num_nodes; i++)
403 /* Watch out for aliasing problems? */
/* aux is a union: read the ASAP bound (stashed in aux.count by node
   ordering) BEFORE overwriting aux.info with the sched_params pointer.
   The order of these two statements matters.  */
404 node_sched_params[i].asap = g->nodes[i].aux.count;
405 g->nodes[i].aux.info = &node_sched_params[i];
/* Dump the per-node scheduling parameters (asap, time, reg-moves) of
   the NUM_NODES nodes of G to FILE, for debugging.  */
410 print_node_sched_params (FILE *file, int num_nodes, ddg_ptr g)
416 for (i = 0; i < num_nodes; i++)
418 node_sched_params_ptr nsp = &node_sched_params[i];
419 rtx reg_move = nsp->first_reg_move;
422 fprintf (file, "Node = %d; INSN = %d\n", i,
423 (INSN_UID (g->nodes[i].insn)));
424 fprintf (file, " asap = %d:\n", nsp->asap);
425 fprintf (file, " time = %d:\n", nsp->time);
426 fprintf (file, " nreg_moves = %d:\n", nsp->nreg_moves);
/* The reg-moves form a backwards chain from first_reg_move via
   PREV_INSN, mirroring how generate_reg_moves inserts them.  */
427 for (j = 0; j < nsp->nreg_moves; j++)
429 fprintf (file, " reg_move = ");
430 print_rtl_single (file, reg_move);
431 reg_move = PREV_INSN (reg_move);
437 Breaking intra-loop register anti-dependences:
438 Each intra-loop register anti-dependence implies a cross-iteration true
439 dependence of distance 1. Therefore, we can remove such false dependencies
440 and figure out if the partial schedule broke them by checking if (for a
441 true-dependence of distance 1): SCHED_TIME (def) < SCHED_TIME (use) and
442 if so generate a register move. The number of such moves is equal to:
443 SCHED_TIME (use) - SCHED_TIME (def) { 0 broken
444 nreg_moves = ----------------------------------- + 1 - { dependence.
447 static struct undo_replace_buff_elem *
448 generate_reg_moves (partial_schedule_ptr ps, bool rescan)
/* Perform modulo variable expansion: for every node whose defined
   register lives longer than II cycles in the kernel, insert register
   copies and retarget the uses.  Returns the list of replacements so
   the caller can undo them; RESCAN requests df rescans of modified
   insns.  NOTE(review): the declarations of g/ii/i/e and several
   closing braces are missing from this listing.  */
453 struct undo_replace_buff_elem *reg_move_replaces = NULL;
455 for (i = 0; i < g->num_nodes; i++)
457 ddg_node_ptr u = &g->nodes[i];
459 int nreg_moves = 0, i_reg_move;
460 sbitmap *uses_of_defs;
462 rtx prev_reg, old_reg;
464 /* Compute the number of reg_moves needed for u, by looking at life
465 ranges started at u (excluding self-loops). */
466 for (e = u->out; e; e = e->next_out)
467 if (e->type == TRUE_DEP && e->dest != e->src)
/* A distance-1 (cross-iteration) dependence adds II cycles to the
   def-to-use span before dividing by II.  */
469 int nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;
471 if (e->distance == 1)
472 nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii;
474 /* If dest precedes src in the schedule of the kernel, then dest
475 will read before src writes and we can save one reg_copy. */
476 if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
477 && SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
/* NOTE(review): the nreg_moves4e-- that this condition guards is
   missing from this listing.  */
480 nreg_moves = MAX (nreg_moves, nreg_moves4e);
/* NOTE(review): a `if (nreg_moves == 0) continue;` guard presumably
   sits here (missing from this listing); otherwise the
   sbitmap_vector_alloc below would be called with 0 rows.  */
486 /* Every use of the register defined by node may require a different
487 copy of this register, depending on the time the use is scheduled.
488 Set a bitmap vector, telling which nodes use each copy of this
490 uses_of_defs = sbitmap_vector_alloc (nreg_moves, g->num_nodes);
491 sbitmap_vector_zero (uses_of_defs, nreg_moves);
492 for (e = u->out; e; e = e->next_out)
493 if (e->type == TRUE_DEP && e->dest != e->src)
/* dest_copy mirrors the nreg_moves4e computation above: which copy
   (1..nreg_moves) of u's register this use must read.  */
495 int dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;
497 if (e->distance == 1)
498 dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii;
500 if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
501 && SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
/* dest_copy == 0 means the use reads the original register; only
   copies >= 1 are recorded in the bitmap vector.  */
505 SET_BIT (uses_of_defs[dest_copy - 1], e->dest->cuid);
508 /* Now generate the reg_moves, attaching relevant uses to them. */
509 SCHED_NREG_MOVES (u) = nreg_moves;
510 old_reg = prev_reg = copy_rtx (SET_DEST (single_set (u->insn)));
511 /* Insert the reg-moves right before the notes which precede
512 the insn they relates to. */
513 last_reg_move = u->first_note;
515 for (i_reg_move = 0; i_reg_move < nreg_moves; i_reg_move++)
517 unsigned int i_use = 0;
518 rtx new_reg = gen_reg_rtx (GET_MODE (prev_reg));
519 rtx reg_move = gen_move_insn (new_reg, prev_reg);
520 sbitmap_iterator sbi;
522 add_insn_before (reg_move, last_reg_move, NULL);
523 last_reg_move = reg_move;
525 if (!SCHED_FIRST_REG_MOVE (u))
526 SCHED_FIRST_REG_MOVE (u) = reg_move;
/* For each use assigned to this copy: record an undo entry (pushed at
   the front of the replacement list), then rewrite the use to read
   NEW_REG.  */
528 EXECUTE_IF_SET_IN_SBITMAP (uses_of_defs[i_reg_move], 0, i_use, sbi)
530 struct undo_replace_buff_elem *rep;
532 rep = (struct undo_replace_buff_elem *)
533 xcalloc (1, sizeof (struct undo_replace_buff_elem));
534 rep->insn = g->nodes[i_use].insn;
535 rep->orig_reg = old_reg;
536 rep->new_reg = new_reg;
538 if (! reg_move_replaces)
539 reg_move_replaces = rep;
542 rep->next = reg_move_replaces;
543 reg_move_replaces = rep;
546 replace_rtx (g->nodes[i_use].insn, old_reg, new_reg);
548 df_insn_rescan (g->nodes[i_use].insn);
/* NOTE(review): the `prev_reg = new_reg;` chaining step expected at
   the end of this loop is missing from this listing.  */
553 sbitmap_vector_free (uses_of_defs);
555 return reg_move_replaces;
558 /* Free memory allocated for the undo buffer. */
560 free_undo_replace_buff (struct undo_replace_buff_elem *reg_move_replaces)
/* Walk the singly-linked list, freeing each node.  REP is detached
   before the head pointer is advanced, so the traversal is safe.
   NOTE(review): the `free (rep);` call is missing from this listing.  */
563 while (reg_move_replaces)
565 struct undo_replace_buff_elem *rep = reg_move_replaces;
567 reg_move_replaces = reg_move_replaces->next;
572 /* Bump the SCHED_TIMEs of all nodes to start from zero. Set the values
573 of SCHED_ROW and SCHED_STAGE. */
575 normalize_sched_times (partial_schedule_ptr ps)
/* Shift all SCHED_TIMEs so the earliest becomes 0, then derive
   SCHED_ROW (time mod II) and SCHED_STAGE (time div II) for each node.
   NOTE(review): the declarations of row/ii and several braces are
   missing from this listing.  */
578 int amount = PS_MIN_CYCLE (ps);
580 ps_insn_ptr crr_insn;
582 for (row = 0; row < ii; row++)
583 for (crr_insn = ps->rows[row]; crr_insn; crr_insn = crr_insn->next_in_row)
585 ddg_node_ptr u = crr_insn->node;
586 int normalized_time = SCHED_TIME (u) - amount;
589 fprintf (dump_file, "crr_insn->node=%d, crr_insn->cycle=%d,\
590 min_cycle=%d\n", crr_insn->node->cuid, SCHED_TIME
/* Sanity-check that every scheduled time lies within the recorded
   [min_cycle, max_cycle] window before rewriting it.  */
592 gcc_assert (SCHED_TIME (u) >= ps->min_cycle);
593 gcc_assert (SCHED_TIME (u) <= ps->max_cycle);
594 SCHED_TIME (u) = normalized_time;
595 SCHED_ROW (u) = normalized_time % ii;
596 SCHED_STAGE (u) = normalized_time / ii;
600 /* Set SCHED_COLUMN of each node according to its position in PS. */
602 set_columns_for_ps (partial_schedule_ptr ps)
/* Number the insns of each row left-to-right; SCHED_COLUMN encodes
   within-row order (see the node_sched_params comment).  NOTE(review):
   the `int row;` and per-row `column = 0` initialization are missing
   from this listing.  */
606 for (row = 0; row < ps->ii; row++)
608 ps_insn_ptr cur_insn = ps->rows[row];
611 for (; cur_insn; cur_insn = cur_insn->next_in_row)
612 SCHED_COLUMN (cur_insn->node) = column++;
616 /* Permute the insns according to their order in PS, from row 0 to
617 row ii-1, and position them right before LAST. This schedules
618 the insns of the loop kernel. */
620 permute_partial_schedule (partial_schedule_ptr ps, rtx last)
/* Reorder the kernel's RTL in schedule order (row 0 .. ii-1, column
   order within each row), moving each insn together with its preceding
   notes to just before LAST.  Insns already in place are skipped.  */
626 for (row = 0; row < ii ; row++)
627 for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
628 if (PREV_INSN (last) != ps_ij->node->insn)
629 reorder_insns_nobb (ps_ij->node->first_note, ps_ij->node->insn,
634 duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
635 int to_stage, int for_prolog, rtx count_reg)
/* Emit (into the current sequence) copies of the kernel insns whose
   stage lies in [FROM_STAGE, TO_STAGE], plus the appropriate reg-moves,
   to build one prolog (FOR_PROLOG) or epilog iteration.  Insns that
   mention COUNT_REG belong to the control part and are never copied.  */
640 for (row = 0; row < ps->ii; row++)
641 for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
643 ddg_node_ptr u_node = ps_ij->node;
645 rtx reg_move = NULL_RTX;
647 /* Do not duplicate any insn which refers to count_reg as it
648 belongs to the control part.
649 TODO: This should be done by analyzing the control part of
651 if (reg_mentioned_p (count_reg, u_node->insn))
656 /* SCHED_STAGE (u_node) >= from_stage == 0. Generate increasing
657 number of reg_moves starting with the second occurrence of
658 u_node, which is generated if its SCHED_STAGE <= to_stage. */
659 i_reg_moves = to_stage - SCHED_STAGE (u_node) + 1;
660 i_reg_moves = MAX (i_reg_moves, 0);
661 i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));
663 /* The reg_moves start from the *first* reg_move backwards. */
/* Walk PREV_INSN links: generate_reg_moves chained the moves
   backwards from first_reg_move.  */
666 reg_move = SCHED_FIRST_REG_MOVE (u_node);
667 for (j = 1; j < i_reg_moves; j++)
668 reg_move = PREV_INSN (reg_move);
671 else /* It's for the epilog. */
673 /* SCHED_STAGE (u_node) <= to_stage. Generate all reg_moves,
674 starting to decrease one stage after u_node no longer occurs;
675 that is, generate all reg_moves until
676 SCHED_STAGE (u_node) == from_stage - 1. */
677 i_reg_moves = SCHED_NREG_MOVES (u_node)
678 - (from_stage - SCHED_STAGE (u_node) - 1);
679 i_reg_moves = MAX (i_reg_moves, 0);
680 i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));
682 /* The reg_moves start from the *last* reg_move forwards. */
685 reg_move = SCHED_FIRST_REG_MOVE (u_node);
686 for (j = 1; j < SCHED_NREG_MOVES (u_node); j++)
687 reg_move = PREV_INSN (reg_move);
/* Emit the selected reg-move copies, then the insn itself (with its
   notes) if its stage falls inside the requested window.  */
691 for (j = 0; j < i_reg_moves; j++, reg_move = NEXT_INSN (reg_move))
692 emit_insn (copy_rtx (PATTERN (reg_move)));
693 if (SCHED_STAGE (u_node) >= from_stage
694 && SCHED_STAGE (u_node) <= to_stage)
695 duplicate_insn_chain (u_node->first_note, u_node->insn);
700 /* Generate the instructions (including reg_moves) for prolog & epilog. */
702 generate_prolog_epilog (partial_schedule_ptr ps, struct loop *loop,
703 rtx count_reg, rtx count_init)
/* Build and insert the software-pipeline prolog on the loop-entry edge
   and the epilog on the (single) exit edge.  COUNT_INIT non-NULL means
   the trip count is a compile-time constant and is adjusted by the
   caller instead of emitting a runtime subtraction here.  */
706 int last_stage = PS_STAGE_COUNT (ps) - 1;
709 /* Generate the prolog, inserting its insns on the loop-entry edge. */
714 /* Generate instructions at the beginning of the prolog to
715 adjust the loop count by STAGE_COUNT. If loop count is constant
716 (count_init), this constant is adjusted by STAGE_COUNT in
717 generate_prolog_epilog function. */
718 rtx sub_reg = NULL_RTX;
/* count_reg -= last_stage, computed in place when the expander
   allows; otherwise copy the result back.  */
720 sub_reg = expand_simple_binop (GET_MODE (count_reg), MINUS,
721 count_reg, GEN_INT (last_stage),
722 count_reg, 1, OPTAB_DIRECT);
723 gcc_assert (REG_P (sub_reg));
724 if (REGNO (sub_reg) != REGNO (count_reg))
725 emit_move_insn (count_reg, sub_reg);
/* Prolog: stage windows [0,0], [0,1], ..., [0,last_stage-1].  */
728 for (i = 0; i < last_stage; i++)
729 duplicate_insns_of_cycles (ps, 0, i, 1, count_reg);
731 /* Put the prolog on the entry edge. */
732 e = loop_preheader_edge (loop);
733 split_edge_and_insert (e, get_insns ());
737 /* Generate the epilog, inserting its insns on the loop-exit edge. */
/* Epilog: stage windows [1,last], [2,last], ..., [last,last].  */
740 for (i = 0; i < last_stage; i++)
741 duplicate_insns_of_cycles (ps, i + 1, last_stage, 0, count_reg);
743 /* Put the epilogue on the exit edge. */
744 gcc_assert (single_exit (loop));
745 e = single_exit (loop);
746 split_edge_and_insert (e, get_insns ());
750 /* Return true if all the BBs of the loop are empty except the
753 loop_single_full_bb_p (struct loop *loop)
/* Return true iff every basic block of LOOP other than the header
   contains only notes, labels, debug insns, or jumps -- i.e. the loop
   body is effectively a single full block.  NOTE(review): the
   empty_bb-false branch, free of BBS, and return statements are
   missing from this listing.  */
756 basic_block *bbs = get_loop_body (loop);
758 for (i = 0; i < loop->num_nodes ; i++)
761 bool empty_bb = true;
763 if (bbs[i] == loop->header)
766 /* Make sure that basic blocks other than the header
767 have only notes labels or jumps. */
768 get_ebb_head_tail (bbs[i], bbs[i], &head, &tail);
769 for (; head != NEXT_INSN (tail); head = NEXT_INSN (head))
771 if (NOTE_P (head) || LABEL_P (head)
772 || (INSN_P (head) && (DEBUG_INSN_P (head) || JUMP_P (head))))
788 /* A simple loop from SMS point of view; it is a loop that is composed of
789 either a single basic block or two BBs - a header and a latch. */
790 #define SIMPLE_SMS_LOOP_P(loop) ((loop->num_nodes < 3 ) \
791 && (EDGE_COUNT (loop->latch->preds) == 1) \
792 && (EDGE_COUNT (loop->latch->succs) == 1))
794 /* Return true if the loop is in its canonical form and false if not.
795 i.e. SIMPLE_SMS_LOOP_P and have one preheader block, and single exit. */
797 loop_canon_p (struct loop *loop)
/* Return false (with a dump explanation) for loops SMS cannot handle:
   nested or outermost-only loops, loops with multiple exits, and loops
   that are neither SIMPLE_SMS_LOOP_P nor a single full BB.
   NOTE(review): the return statements are missing from this listing.  */
800 if (loop->inner || !loop_outer (loop))
803 fprintf (dump_file, "SMS loop inner or !loop_outer\n");
807 if (!single_exit (loop))
/* Dump the source location of the loop's closing branch to help
   identify which loop was rejected.  */
811 rtx insn = BB_END (loop->header);
813 fprintf (dump_file, "SMS loop many exits ");
814 fprintf (dump_file, " %s %d (file, line)\n",
815 insn_file (insn), insn_line (insn));
820 if (! SIMPLE_SMS_LOOP_P (loop) && ! loop_single_full_bb_p (loop))
824 rtx insn = BB_END (loop->header);
826 fprintf (dump_file, "SMS loop many BBs. ");
827 fprintf (dump_file, " %s %d (file, line)\n",
828 insn_file (insn), insn_line (insn));
836 /* If there are more than one entry for the loop,
837 make it one by splitting the first entry edge and
838 redirecting the others to the new BB. */
840 canon_loop (struct loop *loop)
/* Canonicalize LOOP's CFG: split fallthrough edges into the exit block,
   and split the latch edge when the latch is the header or has multiple
   successors.  NOTE(review): the split_edge calls guarded by these
   conditions are missing from this listing.  */
845 /* Avoid annoying special cases of edges going to exit
847 FOR_EACH_EDGE (e, i, EXIT_BLOCK_PTR->preds)
848 if ((e->flags & EDGE_FALLTHRU) && (EDGE_COUNT (e->src->succs) > 1))
851 if (loop->latch == loop->header
852 || EDGE_COUNT (loop->latch->succs) > 1)
854 FOR_EACH_EDGE (e, i, loop->header->preds)
855 if (e->src == loop->latch)
863 setup_sched_infos (void)
/* Install SMS's scheduler hook tables: clone the common Haifa info,
   tag it with the SMS pass id, and point the three global scheduler
   hook pointers at the SMS-specific tables defined above.  */
865 memcpy (&sms_common_sched_info, &haifa_common_sched_info,
866 sizeof (sms_common_sched_info));
867 sms_common_sched_info.sched_pass_id = SCHED_SMS_PASS;
868 common_sched_info = &sms_common_sched_info;
870 sched_deps_info = &sms_sched_deps_info;
871 current_sched_info = &sms_sched_info;
874 /* Probability in % that the sms-ed loop rolls enough so that optimized
875 version may be entered. Just a guess. */
876 #define PROB_SMS_ENOUGH_ITERATIONS 80
878 /* Used to calculate the upper bound of ii. */
879 #define MAXII_FACTOR 2
881 /* Main entry point, perform SMS scheduling on the loops of the function
882 that consist of single basic blocks. */
891 partial_schedule_ptr ps;
892 basic_block bb = NULL;
894 basic_block condition_bb = NULL;
896 gcov_type trip_count = 0;
898 loop_optimizer_init (LOOPS_HAVE_PREHEADERS
899 | LOOPS_HAVE_RECORDED_EXITS);
900 if (number_of_loops () <= 1)
902 loop_optimizer_finalize ();
903 return; /* There are no loops to schedule. */
906 /* Initialize issue_rate. */
907 if (targetm.sched.issue_rate)
909 int temp = reload_completed;
911 reload_completed = 1;
912 issue_rate = targetm.sched.issue_rate ();
913 reload_completed = temp;
918 /* Initialize the scheduler. */
919 setup_sched_infos ();
922 /* Allocate memory to hold the DDG array one entry for each loop.
923 We use loop->num as index into this array. */
924 g_arr = XCNEWVEC (ddg_ptr, number_of_loops ());
928 fprintf (dump_file, "\n\nSMS analysis phase\n");
929 fprintf (dump_file, "===================\n\n");
932 /* Build DDGs for all the relevant loops and hold them in G_ARR
933 indexed by the loop index. */
934 FOR_EACH_LOOP (li, loop, 0)
940 if (dbg_cnt (sms_sched_loop) == false)
943 fprintf (dump_file, "SMS reached max limit... \n");
950 rtx insn = BB_END (loop->header);
952 fprintf (dump_file, "SMS loop num: %d, file: %s, line: %d\n",
953 loop->num, insn_file (insn), insn_line (insn));
957 if (! loop_canon_p (loop))
960 if (! loop_single_full_bb_p (loop))
963 fprintf (dump_file, "SMS not loop_single_full_bb_p\n");
969 get_ebb_head_tail (bb, bb, &head, &tail);
970 latch_edge = loop_latch_edge (loop);
971 gcc_assert (single_exit (loop));
972 if (single_exit (loop)->count)
973 trip_count = latch_edge->count / single_exit (loop)->count;
975 /* Perform SMS only on loops that their average count is above threshold. */
977 if ( latch_edge->count
978 && (latch_edge->count < single_exit (loop)->count * SMS_LOOP_AVERAGE_COUNT_THRESHOLD))
982 fprintf (dump_file, " %s %d (file, line)\n",
983 insn_file (tail), insn_line (tail));
984 fprintf (dump_file, "SMS single-bb-loop\n");
985 if (profile_info && flag_branch_probabilities)
987 fprintf (dump_file, "SMS loop-count ");
988 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
989 (HOST_WIDEST_INT) bb->count);
990 fprintf (dump_file, "\n");
991 fprintf (dump_file, "SMS trip-count ");
992 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
993 (HOST_WIDEST_INT) trip_count);
994 fprintf (dump_file, "\n");
995 fprintf (dump_file, "SMS profile-sum-max ");
996 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
997 (HOST_WIDEST_INT) profile_info->sum_max);
998 fprintf (dump_file, "\n");
1004 /* Make sure this is a doloop. */
1005 if ( !(count_reg = doloop_register_get (head, tail)))
1008 fprintf (dump_file, "SMS doloop_register_get failed\n");
1012 /* Don't handle BBs with calls or barriers or auto-increment insns
1013 (to avoid creating invalid reg-moves for the auto-increment insns),
1014 or !single_set with the exception of instructions that include
1015 count_reg---these instructions are part of the control part
1016 that do-loop recognizes.
1017 ??? Should handle auto-increment insns.
1018 ??? Should handle insns defining subregs. */
1019 for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn))
1025 || (NONDEBUG_INSN_P (insn) && !JUMP_P (insn)
1026 && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE
1027 && !reg_mentioned_p (count_reg, insn))
1028 || (FIND_REG_INC_NOTE (insn, NULL_RTX) != 0)
1029 || (INSN_P (insn) && (set = single_set (insn))
1030 && GET_CODE (SET_DEST (set)) == SUBREG))
1034 if (insn != NEXT_INSN (tail))
1039 fprintf (dump_file, "SMS loop-with-call\n");
1040 else if (BARRIER_P (insn))
1041 fprintf (dump_file, "SMS loop-with-barrier\n");
1042 else if (FIND_REG_INC_NOTE (insn, NULL_RTX) != 0)
1043 fprintf (dump_file, "SMS reg inc\n");
1044 else if ((NONDEBUG_INSN_P (insn) && !JUMP_P (insn)
1045 && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE))
1046 fprintf (dump_file, "SMS loop-with-not-single-set\n");
1048 fprintf (dump_file, "SMS loop with subreg in lhs\n");
1049 print_rtl_single (dump_file, insn);
1055 if (! (g = create_ddg (bb, 0)))
1058 fprintf (dump_file, "SMS create_ddg failed\n");
1062 g_arr[loop->num] = g;
1064 fprintf (dump_file, "...OK\n");
1069 fprintf (dump_file, "\nSMS transformation phase\n");
1070 fprintf (dump_file, "=========================\n\n");
1073 /* We don't want to perform SMS on new loops - created by versioning. */
1074 FOR_EACH_LOOP (li, loop, 0)
1077 rtx count_reg, count_init;
1079 unsigned stage_count = 0;
1080 HOST_WIDEST_INT loop_count = 0;
1082 if (! (g = g_arr[loop->num]))
1087 rtx insn = BB_END (loop->header);
1089 fprintf (dump_file, "SMS loop num: %d, file: %s, line: %d\n",
1090 loop->num, insn_file (insn), insn_line (insn));
1092 print_ddg (dump_file, g);
1095 get_ebb_head_tail (loop->header, loop->header, &head, &tail);
1097 latch_edge = loop_latch_edge (loop);
1098 gcc_assert (single_exit (loop));
1099 if (single_exit (loop)->count)
1100 trip_count = latch_edge->count / single_exit (loop)->count;
1104 fprintf (dump_file, " %s %d (file, line)\n",
1105 insn_file (tail), insn_line (tail));
1106 fprintf (dump_file, "SMS single-bb-loop\n");
1107 if (profile_info && flag_branch_probabilities)
1109 fprintf (dump_file, "SMS loop-count ");
1110 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1111 (HOST_WIDEST_INT) bb->count);
1112 fprintf (dump_file, "\n");
1113 fprintf (dump_file, "SMS profile-sum-max ");
1114 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1115 (HOST_WIDEST_INT) profile_info->sum_max);
1116 fprintf (dump_file, "\n");
1118 fprintf (dump_file, "SMS doloop\n");
1119 fprintf (dump_file, "SMS built-ddg %d\n", g->num_nodes);
1120 fprintf (dump_file, "SMS num-loads %d\n", g->num_loads);
1121 fprintf (dump_file, "SMS num-stores %d\n", g->num_stores);
1125 /* In case of th loop have doloop register it gets special
1127 count_init = NULL_RTX;
1128 if ((count_reg = doloop_register_get (head, tail)))
1130 basic_block pre_header;
1132 pre_header = loop_preheader_edge (loop)->src;
1133 count_init = const_iteration_count (count_reg, pre_header,
1136 gcc_assert (count_reg);
1138 if (dump_file && count_init)
1140 fprintf (dump_file, "SMS const-doloop ");
1141 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1143 fprintf (dump_file, "\n");
1146 node_order = XNEWVEC (int, g->num_nodes);
1148 mii = 1; /* Need to pass some estimate of mii. */
1149 rec_mii = sms_order_nodes (g, mii, node_order, &max_asap);
1150 mii = MAX (res_MII (g), rec_mii);
1151 maxii = MAX (max_asap, MAXII_FACTOR * mii);
1154 fprintf (dump_file, "SMS iis %d %d %d (rec_mii, mii, maxii)\n",
1155 rec_mii, mii, maxii);
1157 /* After sms_order_nodes and before sms_schedule_by_order, to copy over
1159 set_node_sched_params (g);
1161 ps = sms_schedule_by_order (g, mii, maxii, node_order);
1164 stage_count = PS_STAGE_COUNT (ps);
1165 gcc_assert(stage_count >= 1);
1168 /* The default value of PARAM_SMS_MIN_SC is 2 as stage count of
1169 1 means that there is no interleaving between iterations thus
1170 we let the scheduling passes do the job in this case. */
1171 if (stage_count < (unsigned) PARAM_VALUE (PARAM_SMS_MIN_SC)
1172 || (count_init && (loop_count <= stage_count))
1173 || (flag_branch_probabilities && (trip_count <= stage_count)))
1177 fprintf (dump_file, "SMS failed... \n");
1178 fprintf (dump_file, "SMS sched-failed (stage-count=%d, loop-count=", stage_count);
1179 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, loop_count);
1180 fprintf (dump_file, ", trip-count=");
1181 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, trip_count);
1182 fprintf (dump_file, ")\n");
1187 struct undo_replace_buff_elem *reg_move_replaces;
1192 "SMS succeeded %d %d (with ii, sc)\n", ps->ii,
1194 print_partial_schedule (ps, dump_file);
1196 "SMS Branch (%d) will later be scheduled at cycle %d.\n",
1197 g->closing_branch->cuid, PS_MIN_CYCLE (ps) - 1);
1200 /* Set the stage boundaries. If the DDG is built with closing_branch_deps,
1201 the closing_branch was scheduled and should appear in the last (ii-1)
1202 row. Otherwise, we are free to schedule the branch, and we let nodes
1203 that were scheduled at the first PS_MIN_CYCLE cycle appear in the first
1204 row; this should reduce stage_count to minimum.
1205 TODO: Revisit the issue of scheduling the insns of the
1206 control part relative to the branch when the control part
1207 has more than one insn. */
1208 normalize_sched_times (ps);
1209 rotate_partial_schedule (ps, PS_MIN_CYCLE (ps));
1210 set_columns_for_ps (ps);
1214 /* In case the BCT count is not known, do loop-versioning. */
1215 if (count_reg && ! count_init)
1217 rtx comp_rtx = gen_rtx_fmt_ee (GT, VOIDmode, count_reg,
1218 GEN_INT(stage_count));
1219 unsigned prob = (PROB_SMS_ENOUGH_ITERATIONS
1220 * REG_BR_PROB_BASE) / 100;
1222 loop_version (loop, comp_rtx, &condition_bb,
1223 prob, prob, REG_BR_PROB_BASE - prob,
1227 /* Set new iteration count of loop kernel. */
1228 if (count_reg && count_init)
1229 SET_SRC (single_set (count_init)) = GEN_INT (loop_count
1232 /* Now apply the scheduled kernel to the RTL of the loop. */
1233 permute_partial_schedule (ps, g->closing_branch->first_note);
1235 /* Mark this loop as software pipelined so the later
1236 scheduling passes don't touch it. */
1237 if (! flag_resched_modulo_sched)
1238 g->bb->flags |= BB_DISABLE_SCHEDULE;
1239 /* The life-info is not valid any more. */
1240 df_set_bb_dirty (g->bb);
1242 reg_move_replaces = generate_reg_moves (ps, true);
1244 print_node_sched_params (dump_file, g->num_nodes, g);
1245 /* Generate prolog and epilog. */
1246 generate_prolog_epilog (ps, loop, count_reg, count_init);
1248 free_undo_replace_buff (reg_move_replaces);
1251 free_partial_schedule (ps);
1252 free (node_sched_params);
1259 /* Release scheduler data, needed until now because of DFA. */
1260 haifa_sched_finish ();
1261 loop_optimizer_finalize ();
1264 /* The SMS scheduling algorithm itself
1265 -----------------------------------
1266 Input: 'O' an ordered list of insns of a loop.
1267 Output: A scheduling of the loop - kernel, prolog, and epilogue.
1269 'Q' is the empty Set
1270 'PS' is the partial schedule; it holds the currently scheduled nodes with
1272 'PSP' previously scheduled predecessors.
1273 'PSS' previously scheduled successors.
1274 't(u)' the cycle where u is scheduled.
1275 'l(u)' is the latency of u.
1276 'd(v,u)' is the dependence distance from v to u.
1277 'ASAP(u)' the earliest time at which u could be scheduled as computed in
1278 the node ordering phase.
1279 'check_hardware_resources_conflicts(u, PS, c)'
1280 run a trace around cycle/slot through DFA model
1281 to check resource conflicts involving instruction u
1282 at cycle c given the partial schedule PS.
1283 'add_to_partial_schedule_at_time(u, PS, c)'
1284 Add the node/instruction u to the partial schedule
1286 'calculate_register_pressure(PS)'
1287 Given a schedule of instructions, calculate the register
1288 pressure it implies. One implementation could be the
1289 maximum number of overlapping live ranges.
1290 'maxRP' The maximum allowed register pressure, it is usually derived from the number
1291 registers available in the hardware.
1295 3. for each node u in O in pre-computed order
1296 4. if (PSP(u) != Q && PSS(u) == Q) then
1297 5. Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all every v in PSP(u).
1298 6. start = Early_start; end = Early_start + II - 1; step = 1
1299 11. else if (PSP(u) == Q && PSS(u) != Q) then
1300 12. Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all every v in PSS(u).
1301 13. start = Late_start; end = Late_start - II + 1; step = -1
1302 14. else if (PSP(u) != Q && PSS(u) != Q) then
1303 15. Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all every v in PSP(u).
1304 16. Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all every v in PSS(u).
1305 17. start = Early_start;
1306 18. end = min(Early_start + II - 1 , Late_start);
1308 20. else "if (PSP(u) == Q && PSS(u) == Q)"
1309 21. start = ASAP(u); end = start + II - 1; step = 1
1313 24. for (c = start ; c != end ; c += step)
1314 25. if check_hardware_resources_conflicts(u, PS, c) then
1315 26. add_to_partial_schedule_at_time(u, PS, c)
1320 31. if (success == false) then
1322 33. if (II > maxII) then
1323 34. finish - failed to schedule
1328 39. if (calculate_register_pressure(PS) > maxRP) then
1331 42. compute epilogue & prologue
1332 43. finish - succeeded to schedule
1335 /* A limit on the number of cycles that resource conflicts can span. ??? Should
1336 be provided by DFA, and be dependent on the type of insn scheduled. Currently
1337 set to 0 to save compile time. */
1338 #define DFA_HISTORY SMS_DFA_HISTORY
1340 /* A threshold for the number of repeated unsuccessful attempts to insert
1341 an empty row, before we flush the partial schedule and start over. */
1342 #define MAX_SPLIT_NUM 10
1343 /* Given the partial schedule PS, this function calculates and returns the
1344 cycles in which we can schedule the node with the given index I.
1345 NOTE: Here we do the backtracking in SMS, in some special cases. We have
1346 noticed that there are several cases in which we fail to SMS the loop
1347 because the sched window of a node is empty due to tight data-deps. In
1348 such cases we want to unschedule some of the predecessors/successors
1349 until we get non-empty scheduling window. It returns -1 if the
1350 scheduling window is empty and zero otherwise. */
1353 get_sched_window (partial_schedule_ptr ps, int *nodes_order, int i,
1354 sbitmap sched_nodes, int ii, int *start_p, int *step_p, int *end_p)
1356 int start, step, end;
1358 int u = nodes_order [i];
1359 ddg_node_ptr u_node = &ps->g->nodes[u];
1360 sbitmap psp = sbitmap_alloc (ps->g->num_nodes);
1361 sbitmap pss = sbitmap_alloc (ps->g->num_nodes);
1362 sbitmap u_node_preds = NODE_PREDECESSORS (u_node);
1363 sbitmap u_node_succs = NODE_SUCCESSORS (u_node);
1367 /* 1. compute sched window for u (start, end, step). */
/* PSP/PSS = the predecessors/successors of U that are already in the
   partial schedule (intersection with SCHED_NODES).  The four cases
   below follow steps 4-21 of the SMS pseudo-code above.  */
1370 psp_not_empty = sbitmap_a_and_b_cg (psp, u_node_preds, sched_nodes);
1371 pss_not_empty = sbitmap_a_and_b_cg (pss, u_node_succs, sched_nodes);
/* Case 1: only predecessors are scheduled -- compute the earliest
   start from them and traverse the window upwards (step == 1).  */
1373 if (psp_not_empty && !pss_not_empty)
1375 int early_start = INT_MIN;
1378 for (e = u_node->in; e != 0; e = e->next_in)
1380 ddg_node_ptr v_node = e->src;
1384 fprintf (dump_file, "\nProcessing edge: ");
1385 print_ddg_edge (dump_file, e);
1387 "\nScheduling %d (%d) in psp_not_empty,"
1388 " checking p %d (%d): ", u_node->cuid,
1389 INSN_UID (u_node->insn), v_node->cuid, INSN_UID
1393 if (TEST_BIT (sched_nodes, v_node->cuid))
1395 int p_st = SCHED_TIME (v_node);
/* Early_start(u) = max (t(v) + l(v) - d(v,u)*II) over scheduled preds.  */
1398 MAX (early_start, p_st + e->latency - (e->distance * ii));
1402 "pred st = %d; early_start = %d; latency: %d",
1403 p_st, early_start, e->latency);
/* A memory dependence additionally caps END to one II past the pred.  */
1405 if (e->data_type == MEM_DEP)
1406 end = MIN (end, SCHED_TIME (v_node) + ii - 1);
1409 fprintf (dump_file, "the node is not scheduled\n");
1411 start = early_start;
1412 end = MIN (end, early_start + ii);
1413 /* Schedule the node close to its predecessors. */
1418 "\nScheduling %d (%d) in a window (%d..%d) with step %d\n",
1419 u_node->cuid, INSN_UID (u_node->insn), start, end, step);
/* Case 2: only successors are scheduled -- compute the latest start
   from them and traverse the window downwards (step == -1).  */
1422 else if (!psp_not_empty && pss_not_empty)
1424 int late_start = INT_MAX;
1427 for (e = u_node->out; e != 0; e = e->next_out)
1429 ddg_node_ptr v_node = e->dest;
1433 fprintf (dump_file, "\nProcessing edge:");
1434 print_ddg_edge (dump_file, e);
1436 "\nScheduling %d (%d) in pss_not_empty,"
1437 " checking s %d (%d): ", u_node->cuid,
1438 INSN_UID (u_node->insn), v_node->cuid, INSN_UID
1442 if (TEST_BIT (sched_nodes, v_node->cuid))
1444 int s_st = SCHED_TIME (v_node);
/* Late_start(u) = min (t(v) - l(v) + d(v,u)*II) over scheduled succs.  */
1446 late_start = MIN (late_start,
1447 s_st - e->latency + (e->distance * ii));
1451 "succ st = %d; late_start = %d; latency = %d",
1452 s_st, late_start, e->latency);
1454 if (e->data_type == MEM_DEP)
1455 end = MAX (end, SCHED_TIME (v_node) - ii + 1);
1457 fprintf (dump_file, "end = %d\n", end);
1461 fprintf (dump_file, "the node is not scheduled\n");
1465 end = MAX (end, late_start - ii);
1466 /* Schedule the node close to its successors. */
1471 "\nScheduling %d (%d) in a window (%d..%d) with step %d\n",
1472 u_node->cuid, INSN_UID (u_node->insn), start, end, step);
/* Case 3: both predecessors and successors are scheduled -- the window
   is bounded on both sides; direction chosen by succ/pred counts.  */
1476 else if (psp_not_empty && pss_not_empty)
1478 int early_start = INT_MIN;
1479 int late_start = INT_MAX;
1480 int count_preds = 0;
1481 int count_succs = 0;
1485 for (e = u_node->in; e != 0; e = e->next_in)
1487 ddg_node_ptr v_node = e->src;
1491 fprintf (dump_file, "\nProcessing edge:");
1492 print_ddg_edge (dump_file, e);
1494 "\nScheduling %d (%d) in psp_pss_not_empty,"
1495 " checking p %d (%d): ", u_node->cuid, INSN_UID
1496 (u_node->insn), v_node->cuid, INSN_UID
1500 if (TEST_BIT (sched_nodes, v_node->cuid))
1502 int p_st = SCHED_TIME (v_node);
1504 early_start = MAX (early_start,
1506 - (e->distance * ii));
1510 "pred st = %d; early_start = %d; latency = %d",
1511 p_st, early_start, e->latency);
/* Only true register dependences count for the pred/succ balance.  */
1513 if (e->type == TRUE_DEP && e->data_type == REG_DEP)
1516 if (e->data_type == MEM_DEP)
1517 end = MIN (end, SCHED_TIME (v_node) + ii - 1);
1520 fprintf (dump_file, "the node is not scheduled\n");
1523 for (e = u_node->out; e != 0; e = e->next_out)
1525 ddg_node_ptr v_node = e->dest;
1529 fprintf (dump_file, "\nProcessing edge:");
1530 print_ddg_edge (dump_file, e);
1532 "\nScheduling %d (%d) in psp_pss_not_empty,"
1533 " checking s %d (%d): ", u_node->cuid, INSN_UID
1534 (u_node->insn), v_node->cuid, INSN_UID
1538 if (TEST_BIT (sched_nodes, v_node->cuid))
1540 int s_st = SCHED_TIME (v_node);
1542 late_start = MIN (late_start,
1544 + (e->distance * ii));
1548 "succ st = %d; late_start = %d; latency = %d",
1549 s_st, late_start, e->latency);
1551 if (e->type == TRUE_DEP && e->data_type == REG_DEP)
1554 if (e->data_type == MEM_DEP)
1555 start = MAX (start, SCHED_TIME (v_node) - ii + 1);
1558 fprintf (dump_file, "the node is not scheduled\n");
1561 start = MAX (start, early_start);
1562 end = MIN (end, MIN (early_start + ii, late_start + 1));
1564 /* If there are more successors than predecessors schedule the
1565 node close to its successors. */
1566 if (count_succs >= count_preds)
1568 int old_start = start;
1571 end = old_start - 1;
/* Case 4: neither side scheduled yet -- fall back to the ASAP time
   computed by the node-ordering phase.  */
1575 else /* psp is empty && pss is empty. */
1577 start = SCHED_ASAP (u_node);
/* The window is empty when the traversal from START (exclusive of END)
   covers no cycle; the function returns -1 in that case.  */
1588 if ((start >= end && step == 1) || (start <= end && step == -1))
1591 fprintf (dump_file, "\nEmpty window: start=%d, end=%d, step=%d\n",
1599 /* Calculate MUST_PRECEDE/MUST_FOLLOW bitmaps of U_NODE; which is the
1600 node currently been scheduled. At the end of the calculation
1601 MUST_PRECEDE/MUST_FOLLOW contains all predecessors/successors of
1602 U_NODE which are (1) already scheduled in the first/last row of
1603 U_NODE's scheduling window, (2) whose dependence inequality with U
1604 becomes an equality when U is scheduled in this same row, and (3)
1605 whose dependence latency is zero.
1607 The first and last rows are calculated using the following parameters:
1608 START/END rows - The cycles that begins/ends the traversal on the window;
1609 searching for an empty cycle to schedule U_NODE.
1610 STEP - The direction in which we traverse the window.
1611 II - The initiation interval. */
1614 calculate_must_precede_follow (ddg_node_ptr u_node, int start, int end,
1615 int step, int ii, sbitmap sched_nodes,
1616 sbitmap must_precede, sbitmap must_follow)
1619 int first_cycle_in_window, last_cycle_in_window;
1621 gcc_assert (must_precede && must_follow);
1623 /* Consider the following scheduling window:
1624 {first_cycle_in_window, first_cycle_in_window+1, ...,
1625 last_cycle_in_window}. If step is 1 then the following will be
1626 the order we traverse the window: {start=first_cycle_in_window,
1627 first_cycle_in_window+1, ..., end=last_cycle_in_window+1},
1628 or {start=last_cycle_in_window, last_cycle_in_window-1, ...,
1629 end=first_cycle_in_window-1} if step is -1. */
1630 first_cycle_in_window = (step == 1) ? start : end - step;
1631 last_cycle_in_window = (step == 1) ? end - step : start;
1633 sbitmap_zero (must_precede);
1634 sbitmap_zero (must_follow);
1637 fprintf (dump_file, "\nmust_precede: ");
1639 /* Instead of checking if:
1640 (SMODULO (SCHED_TIME (e->src), ii) == first_row_in_window)
1641 && ((SCHED_TIME (e->src) + e->latency - (e->distance * ii)) ==
1642 first_cycle_in_window)
1644 we use the fact that latency is non-negative:
1645 SCHED_TIME (e->src) - (e->distance * ii) <=
1646 SCHED_TIME (e->src) + e->latency - (e->distance * ii)) <=
1647 first_cycle_in_window
1649 SCHED_TIME (e->src) - (e->distance * ii) == first_cycle_in_window */
/* Collect scheduled predecessors whose dependence becomes an equality
   when U is placed in the first row of the window.  */
1650 for (e = u_node->in; e != 0; e = e->next_in)
1651 if (TEST_BIT (sched_nodes, e->src->cuid)
1652 && ((SCHED_TIME (e->src) - (e->distance * ii)) ==
1653 first_cycle_in_window))
1656 fprintf (dump_file, "%d ", e->src->cuid);
1658 SET_BIT (must_precede, e->src->cuid);
1662 fprintf (dump_file, "\nmust_follow: ");
1664 /* Instead of checking if:
1665 (SMODULO (SCHED_TIME (e->dest), ii) == last_row_in_window)
1666 && ((SCHED_TIME (e->dest) - e->latency + (e->distance * ii)) ==
1667 last_cycle_in_window)
1669 we use the fact that latency is non-negative:
1670 SCHED_TIME (e->dest) + (e->distance * ii) >=
1671 SCHED_TIME (e->dest) - e->latency + (e->distance * ii)) >=
1672 last_cycle_in_window
1674 SCHED_TIME (e->dest) + (e->distance * ii) == last_cycle_in_window */
/* Symmetrically, collect scheduled successors tight against the last
   row of the window.  */
1675 for (e = u_node->out; e != 0; e = e->next_out)
1676 if (TEST_BIT (sched_nodes, e->dest->cuid)
1677 && ((SCHED_TIME (e->dest) + (e->distance * ii)) ==
1678 last_cycle_in_window))
1681 fprintf (dump_file, "%d ", e->dest->cuid);
1683 SET_BIT (must_follow, e->dest->cuid);
1687 fprintf (dump_file, "\n");
1690 /* Return 1 if U_NODE can be scheduled in CYCLE. Use the following
1691 parameters to decide if that's possible:
1692 PS - The partial schedule.
1693 U - The serial number of U_NODE.
1694 NUM_SPLITS - The number of row splits made so far.
1695 MUST_PRECEDE - The nodes that must precede U_NODE. (only valid at
1696 the first row of the scheduling window)
1697 MUST_FOLLOW - The nodes that must follow U_NODE. (only valid at the
1698 last row of the scheduling window) */
1701 try_scheduling_node_in_cycle (partial_schedule_ptr ps, ddg_node_ptr u_node,
1702 int u, int cycle, sbitmap sched_nodes,
1703 int *num_splits, sbitmap must_precede,
1704 sbitmap must_follow)
1709 verify_partial_schedule (ps, sched_nodes);
/* Attempt to place U_NODE at CYCLE; ps_add_node_check_conflicts also
   honors the MUST_PRECEDE/MUST_FOLLOW row-ordering constraints.  */
1710 psi = ps_add_node_check_conflicts (ps, u_node, cycle,
1711 must_precede, must_follow);
/* On success record the cycle and mark U as scheduled.  */
1714 SCHED_TIME (u_node) = cycle;
1715 SET_BIT (sched_nodes, u);
1719 fprintf (dump_file, "Scheduled w/o split in %d\n", cycle);
1726 /* This function implements the scheduling algorithm for SMS according to the
static partial_schedule_ptr
1729 sms_schedule_by_order (ddg_ptr g, int mii, int maxii, int *nodes_order)
1732 int i, c, success, num_splits = 0;
1733 int flush_and_start_over = true;
1734 int num_nodes = g->num_nodes;
1735 int start, end, step; /* Place together into one struct? */
1736 sbitmap sched_nodes = sbitmap_alloc (num_nodes);
1737 sbitmap must_precede = sbitmap_alloc (num_nodes);
1738 sbitmap must_follow = sbitmap_alloc (num_nodes);
1739 sbitmap tobe_scheduled = sbitmap_alloc (num_nodes);
1741 partial_schedule_ptr ps = create_partial_schedule (ii, g, DFA_HISTORY);
1743 sbitmap_ones (tobe_scheduled);
1744 sbitmap_zero (sched_nodes);
/* Outer loop: retry with a fresh schedule (and possibly larger II)
   whenever too many row splits forced a flush.  */
1746 while (flush_and_start_over && (ii < maxii))
1750 fprintf (dump_file, "Starting with ii=%d\n", ii);
1751 flush_and_start_over = false;
1752 sbitmap_zero (sched_nodes);
1754 for (i = 0; i < num_nodes; i++)
1756 int u = nodes_order[i];
1757 ddg_node_ptr u_node = &ps->g->nodes[u];
1758 rtx insn = u_node->insn;
/* Debug insns and the closing branch are not placed here.  */
1760 if (!NONDEBUG_INSN_P (insn))
1762 RESET_BIT (tobe_scheduled, u);
1766 if (JUMP_P (insn)) /* Closing branch handled later. */
1768 RESET_BIT (tobe_scheduled, u);
1772 if (TEST_BIT (sched_nodes, u))
1775 /* Try to get non-empty scheduling window. */
1777 if (get_sched_window (ps, nodes_order, i, sched_nodes, ii, &start,
1781 fprintf (dump_file, "\nTrying to schedule node %d \
1782 INSN = %d in (%d .. %d) step %d\n", u, (INSN_UID
1783 (g->nodes[u].insn)), start, end, step);
1785 gcc_assert ((step > 0 && start < end)
1786 || (step < 0 && start > end));
1788 calculate_must_precede_follow (u_node, start, end, step, ii,
1789 sched_nodes, must_precede,
/* Walk the window; MUST_PRECEDE applies only in the first row of the
   window and MUST_FOLLOW only in the last, hence the tmp_* selection.  */
1792 for (c = start; c != end; c += step)
1794 sbitmap tmp_precede = NULL;
1795 sbitmap tmp_follow = NULL;
1800 tmp_precede = must_precede;
1801 else /* step == -1. */
1802 tmp_follow = must_follow;
1804 if (c == end - step)
1807 tmp_follow = must_follow;
1808 else /* step == -1. */
1809 tmp_precede = must_precede;
1813 try_scheduling_node_in_cycle (ps, u_node, u, c,
1815 &num_splits, tmp_precede,
1821 verify_partial_schedule (ps, sched_nodes);
/* Scheduling failed: either give up on this II (flush and start over
   after MAX_SPLIT_NUM attempts) or insert an empty row and retry U.  */
1830 if (num_splits >= MAX_SPLIT_NUM)
1833 flush_and_start_over = true;
1834 verify_partial_schedule (ps, sched_nodes);
1835 reset_partial_schedule (ps, ii);
1836 verify_partial_schedule (ps, sched_nodes);
1841 /* The scheduling window is exclusive of 'end'
1842 whereas compute_split_window() expects an inclusive,
1845 split_row = compute_split_row (sched_nodes, start, end - 1,
1848 split_row = compute_split_row (sched_nodes, end + 1, start,
1851 ps_insert_empty_row (ps, split_row, sched_nodes);
1852 i--; /* Go back and retry node i. */
1855 fprintf (dump_file, "num_splits=%d\n", num_splits);
1858 /* ??? If (success), check register pressure estimates. */
1859 } /* Continue with next node. */
1860 } /* While flush_and_start_over. */
1863 free_partial_schedule (ps);
1867 gcc_assert (sbitmap_equal (tobe_scheduled, sched_nodes));
1869 sbitmap_free (sched_nodes);
1870 sbitmap_free (must_precede);
1871 sbitmap_free (must_follow);
1872 sbitmap_free (tobe_scheduled);
1877 /* This function inserts a new empty row into PS at the position
1878 according to SPLITROW, keeping all already scheduled instructions
1879 intact and updating their SCHED_TIME and cycle accordingly. */
1881 ps_insert_empty_row (partial_schedule_ptr ps, int split_row,
1882 sbitmap sched_nodes)
1884 ps_insn_ptr crr_insn;
1885 ps_insn_ptr *rows_new;
1887 int new_ii = ii + 1;
1890 verify_partial_schedule (ps, sched_nodes);
1892 /* We normalize sched_time and rotate ps to have only non-negative sched
1893 times, for simplicity of updating cycles after inserting new row. */
1894 split_row -= ps->min_cycle;
1895 split_row = SMODULO (split_row, ii);
1897 fprintf (dump_file, "split_row=%d\n", split_row);
1899 normalize_sched_times (ps);
1900 rotate_partial_schedule (ps, ps->min_cycle);
/* Build a new rows array of NEW_II rows; rows before SPLIT_ROW keep
   their index, rows at/after it shift down by one.  */
1902 rows_new = (ps_insn_ptr *) xcalloc (new_ii, sizeof (ps_insn_ptr));
1903 for (row = 0; row < split_row; row++)
1905 rows_new[row] = ps->rows[row];
1906 ps->rows[row] = NULL;
1907 for (crr_insn = rows_new[row];
1908 crr_insn; crr_insn = crr_insn->next_in_row)
1910 ddg_node_ptr u = crr_insn->node;
/* Each full II-iteration before the split gains one extra cycle.  */
1911 int new_time = SCHED_TIME (u) + (SCHED_TIME (u) / ii);
1913 SCHED_TIME (u) = new_time;
1914 crr_insn->cycle = new_time;
1915 SCHED_ROW (u) = new_time % new_ii;
1916 SCHED_STAGE (u) = new_time / new_ii;
/* The newly inserted row stays empty.  */
1921 rows_new[split_row] = NULL;
1923 for (row = split_row; row < ii; row++)
1925 rows_new[row + 1] = ps->rows[row];
1926 ps->rows[row] = NULL;
1927 for (crr_insn = rows_new[row + 1];
1928 crr_insn; crr_insn = crr_insn->next_in_row)
1930 ddg_node_ptr u = crr_insn->node;
/* Rows at/after the split additionally shift by one in this iteration.  */
1931 int new_time = SCHED_TIME (u) + (SCHED_TIME (u) / ii) + 1;
1933 SCHED_TIME (u) = new_time;
1934 crr_insn->cycle = new_time;
1935 SCHED_ROW (u) = new_time % new_ii;
1936 SCHED_STAGE (u) = new_time / new_ii;
/* Adjust the cycle bounds with the same per-iteration stretch.  */
1941 ps->min_cycle = ps->min_cycle + ps->min_cycle / ii
1942 + (SMODULO (ps->min_cycle, ii) >= split_row ? 1 : 0);
1943 ps->max_cycle = ps->max_cycle + ps->max_cycle / ii
1944 + (SMODULO (ps->max_cycle, ii) >= split_row ? 1 : 0);
1946 ps->rows = rows_new;
1948 gcc_assert (ps->min_cycle >= 0);
1950 verify_partial_schedule (ps, sched_nodes);
1953 fprintf (dump_file, "min_cycle=%d, max_cycle=%d\n", ps->min_cycle,
1957 /* Given U_NODE which is the node that failed to be scheduled; LOW and
1958 UP which are the boundaries of its scheduling window; compute using
1959 SCHED_NODES and II a row in the partial schedule that can be split
1960 which will separate a critical predecessor from a critical successor
1961 thereby expanding the window, and return it. */
1963 compute_split_row (sbitmap sched_nodes, int low, int up, int ii,
1964 ddg_node_ptr u_node)
1967 int lower = INT_MIN, upper = INT_MAX;
1968 ddg_node_ptr crit_pred = NULL;
1969 ddg_node_ptr crit_succ = NULL;
/* Find the latest-scheduled predecessor whose dependence makes LOW
   tight -- splitting right after it expands the window from below.  */
1972 for (e = u_node->in; e != 0; e = e->next_in)
1974 ddg_node_ptr v_node = e->src;
1976 if (TEST_BIT (sched_nodes, v_node->cuid)
1977 && (low == SCHED_TIME (v_node) + e->latency - (e->distance * ii)))
1978 if (SCHED_TIME (v_node) > lower)
1981 lower = SCHED_TIME (v_node);
1985 if (crit_pred != NULL)
1987 crit_cycle = SCHED_TIME (crit_pred) + 1;
1988 return SMODULO (crit_cycle, ii);
/* Otherwise find the earliest-scheduled successor that makes UP
   tight and split just before it.  */
1991 for (e = u_node->out; e != 0; e = e->next_out)
1993 ddg_node_ptr v_node = e->dest;
1994 if (TEST_BIT (sched_nodes, v_node->cuid)
1995 && (up == SCHED_TIME (v_node) - e->latency + (e->distance * ii)))
1996 if (SCHED_TIME (v_node) < upper)
1999 upper = SCHED_TIME (v_node);
2003 if (crit_succ != NULL)
2005 crit_cycle = SCHED_TIME (crit_succ);
2006 return SMODULO (crit_cycle, ii);
/* No critical neighbor found: fall back to the middle of the window.  */
2010 fprintf (dump_file, "Both crit_pred and crit_succ are NULL\n");
2012 return SMODULO ((low + up + 1) / 2, ii);
/* Sanity-check PS: every insn placed in a row must be marked in
   SCHED_NODES and its SCHED_TIME must lie within [min_cycle, max_cycle].  */
2016 verify_partial_schedule (partial_schedule_ptr ps, sbitmap sched_nodes)
2019 ps_insn_ptr crr_insn;
2021 for (row = 0; row < ps->ii; row++)
2022 for (crr_insn = ps->rows[row]; crr_insn; crr_insn = crr_insn->next_in_row)
2024 ddg_node_ptr u = crr_insn->node;
2026 gcc_assert (TEST_BIT (sched_nodes, u->cuid));
2027 /* ??? Test also that all nodes of sched_nodes are in ps, perhaps by
2028 popcount (sched_nodes) == number of insns in ps. */
2029 gcc_assert (SCHED_TIME (u) >= ps->min_cycle);
2030 gcc_assert (SCHED_TIME (u) <= ps->max_cycle);
2035 /* This page implements the algorithm for ordering the nodes of a DDG
2036 for modulo scheduling, activated through the
2037 "int sms_order_nodes (ddg_ptr, int mii, int * result)" API. */
/* Accessors for the per-node ordering parameters, stored through the
   node's aux.info pointer (set up by calculate_order_params).  */
2039 #define ORDER_PARAMS(x) ((struct node_order_params *) (x)->aux.info)
2040 #define ASAP(x) (ORDER_PARAMS ((x))->asap)
2041 #define ALAP(x) (ORDER_PARAMS ((x))->alap)
2042 #define HEIGHT(x) (ORDER_PARAMS ((x))->height)
/* Mobility (slack) of a node: how far its schedule may slide.  */
2043 #define MOB(x) (ALAP ((x)) - ASAP ((x)))
/* Depth is identified with ASAP in this implementation.  */
2044 #define DEPTH(x) (ASAP ((x)))
2046 typedef struct node_order_params * nopa;
2048 static void order_nodes_of_sccs (ddg_all_sccs_ptr, int * result);
2049 static int order_nodes_in_scc (ddg_ptr, sbitmap, sbitmap, int*, int);
2050 static nopa calculate_order_params (ddg_ptr, int, int *);
2051 static int find_max_asap (ddg_ptr, sbitmap);
2052 static int find_max_hv_min_mob (ddg_ptr, sbitmap);
2053 static int find_max_dv_min_mob (ddg_ptr, sbitmap);
/* Direction in which order_nodes_in_scc currently sweeps.  */
2055 enum sms_direction {BOTTOMUP, TOPDOWN};
2057 struct node_order_params
2064 /* Check if NODE_ORDER contains a permutation of 0 .. NUM_NODES-1. */
2066 check_nodes_order (int *node_order, int num_nodes)
2069 sbitmap tmp = sbitmap_alloc (num_nodes);
2074 fprintf (dump_file, "SMS final nodes order: \n");
2076 for (i = 0; i < num_nodes; i++)
2078 int u = node_order[i];
2081 fprintf (dump_file, "%d ", u);
/* Each id must be in range and appear exactly once (TMP tracks seen ids).  */
2082 gcc_assert (u < num_nodes && u >= 0 && !TEST_BIT (tmp, u));
2088 fprintf (dump_file, "\n");
2093 /* Order the nodes of G for scheduling and pass the result in
2094 NODE_ORDER. Also set aux.count of each node to ASAP.
2095 Put maximal ASAP to PMAX_ASAP. Return the recMII for the given DDG. */
2097 sms_order_nodes (ddg_ptr g, int mii, int * node_order, int *pmax_asap)
2101 ddg_all_sccs_ptr sccs = create_ddg_all_sccs (g);
2103 nopa nops = calculate_order_params (g, mii, pmax_asap)
2106 print_sccs (dump_file, sccs, g);
2108 order_nodes_of_sccs (sccs, node_order);
2110 if (sccs->num_sccs > 0)
2111 /* First SCC has the largest recurrence_length. */
2112 rec_mii = sccs->sccs[0]->recurrence_length;
2114 /* Save ASAP before destroying node_order_params. */
2115 for (i = 0; i < g->num_nodes; i++)
2117 ddg_node_ptr v = &g->nodes[i];
2118 v->aux.count = ASAP (v);
2122 free_ddg_all_sccs (sccs);
2123 check_nodes_order (node_order, g->num_nodes);
/* Order all nodes of the DDG, one extended SCC at a time, starting from
   the SCC with the highest recMII; remaining non-SCC nodes come last.  */
2129 order_nodes_of_sccs (ddg_all_sccs_ptr all_sccs, int * node_order)
2132 ddg_ptr g = all_sccs->ddg;
2133 int num_nodes = g->num_nodes;
2134 sbitmap prev_sccs = sbitmap_alloc (num_nodes);
2135 sbitmap on_path = sbitmap_alloc (num_nodes);
2136 sbitmap tmp = sbitmap_alloc (num_nodes);
2137 sbitmap ones = sbitmap_alloc (num_nodes);
2139 sbitmap_zero (prev_sccs);
2140 sbitmap_ones (ones);
2142 /* Perform the node ordering starting from the SCC with the highest recMII.
2143 For each SCC order the nodes according to their ASAP/ALAP/HEIGHT etc. */
2144 for (i = 0; i < all_sccs->num_sccs; i++)
2146 ddg_scc_ptr scc = all_sccs->sccs[i];
2148 /* Add nodes on paths from previous SCCs to the current SCC. */
2149 find_nodes_on_paths (on_path, g, prev_sccs, scc->nodes);
2150 sbitmap_a_or_b (tmp, scc->nodes, on_path);
2152 /* Add nodes on paths from the current SCC to previous SCCs. */
2153 find_nodes_on_paths (on_path, g, scc->nodes, prev_sccs);
2154 sbitmap_a_or_b (tmp, tmp, on_path);
2156 /* Remove nodes of previous SCCs from current extended SCC. */
2157 sbitmap_difference (tmp, tmp, prev_sccs);
2159 pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
2160 /* Above call to order_nodes_in_scc updated prev_sccs |= tmp. */
2163 /* Handle the remaining nodes that do not belong to any scc. Each call
2164 to order_nodes_in_scc handles a single connected component. */
2165 while (pos < g->num_nodes)
2167 sbitmap_difference (tmp, ones, prev_sccs);
2168 pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
2170 sbitmap_free (prev_sccs);
2171 sbitmap_free (on_path);
2173 sbitmap_free (ones);
2176 /* MII is needed if we consider backarcs (that do not close recursive cycles). */
2177 static struct node_order_params *
2178 calculate_order_params (ddg_ptr g, int mii ATTRIBUTE_UNUSED, int *pmax_asap)
2182 int num_nodes = g->num_nodes;
2184 /* Allocate a place to hold ordering params for each node in the DDG. */
2185 nopa node_order_params_arr;
2187 /* Initialize of ASAP/ALAP/HEIGHT to zero. */
2188 node_order_params_arr = (nopa) xcalloc (num_nodes,
2189 sizeof (struct node_order_params));
2191 /* Set the aux pointer of each node to point to its order_params structure. */
2192 for (u = 0; u < num_nodes; u++)
2193 g->nodes[u].aux.info = &node_order_params_arr[u];
2195 /* Disregarding a backarc from each recursive cycle to obtain a DAG,
2196 calculate ASAP, ALAP, mobility, distance, and height for each node
2197 in the dependence (direct acyclic) graph. */
2199 /* We assume that the nodes in the array are in topological order. */
/* Forward pass: ASAP over zero-distance incoming edges.  */
2202 for (u = 0; u < num_nodes; u++)
2204 ddg_node_ptr u_node = &g->nodes[u];
2207 for (e = u_node->in; e; e = e->next_in)
2208 if (e->distance == 0)
2209 ASAP (u_node) = MAX (ASAP (u_node),
2210 ASAP (e->src) + e->latency);
2211 max_asap = MAX (max_asap, ASAP (u_node));
/* Backward pass: ALAP and HEIGHT over zero-distance outgoing edges.  */
2214 for (u = num_nodes - 1; u > -1; u--)
2216 ddg_node_ptr u_node = &g->nodes[u];
2218 ALAP (u_node) = max_asap;
2219 HEIGHT (u_node) = 0;
2220 for (e = u_node->out; e; e = e->next_out)
2221 if (e->distance == 0)
2223 ALAP (u_node) = MIN (ALAP (u_node),
2224 ALAP (e->dest) - e->latency);
2225 HEIGHT (u_node) = MAX (HEIGHT (u_node),
2226 HEIGHT (e->dest) + e->latency);
2231 fprintf (dump_file, "\nOrder params\n");
2232 for (u = 0; u < num_nodes; u++)
2234 ddg_node_ptr u_node = &g->nodes[u];
2236 fprintf (dump_file, "node %d, ASAP: %d, ALAP: %d, HEIGHT: %d\n", u,
2237 ASAP (u_node), ALAP (u_node), HEIGHT (u_node));
2241 *pmax_asap = max_asap;
2242 return node_order_params_arr;
/* Return the node in NODES with the maximal ASAP.  */
2246 find_max_asap (ddg_ptr g, sbitmap nodes)
2251 sbitmap_iterator sbi;
2253 EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
2255 ddg_node_ptr u_node = &g->nodes[u];
2257 if (max_asap < ASAP (u_node))
2259 max_asap = ASAP (u_node);
/* Return the node in NODES with maximal HEIGHT, breaking ties by
   minimal mobility (MOB).  Used for the bottom-up sweep.  */
2267 find_max_hv_min_mob (ddg_ptr g, sbitmap nodes)
2271 int min_mob = INT_MAX;
2273 sbitmap_iterator sbi;
2275 EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
2277 ddg_node_ptr u_node = &g->nodes[u];
2279 if (max_hv < HEIGHT (u_node))
2281 max_hv = HEIGHT (u_node);
2282 min_mob = MOB (u_node);
2285 else if ((max_hv == HEIGHT (u_node))
2286 && (min_mob > MOB (u_node)))
2288 min_mob = MOB (u_node);
/* Return the node in NODES with maximal DEPTH, breaking ties by
   minimal mobility (MOB).  Used for the top-down sweep.  */
2296 find_max_dv_min_mob (ddg_ptr g, sbitmap nodes)
2300 int min_mob = INT_MAX;
2302 sbitmap_iterator sbi;
2304 EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
2306 ddg_node_ptr u_node = &g->nodes[u];
2308 if (max_dv < DEPTH (u_node))
2310 max_dv = DEPTH (u_node);
2311 min_mob = MOB (u_node);
2314 else if ((max_dv == DEPTH (u_node))
2315 && (min_mob > MOB (u_node)))
2317 min_mob = MOB (u_node);
2324 /* Places the nodes of SCC into the NODE_ORDER array starting
2325 at position POS, according to the SMS ordering algorithm.
2326 NODES_ORDERED (in&out parameter) holds the bitset of all nodes in
2327 the NODE_ORDER array, starting from position zero. */
2329 order_nodes_in_scc (ddg_ptr g, sbitmap nodes_ordered, sbitmap scc,
2330 int * node_order, int pos)
2332 enum sms_direction dir;
2333 int num_nodes = g->num_nodes;
2334 sbitmap workset = sbitmap_alloc (num_nodes);
2335 sbitmap tmp = sbitmap_alloc (num_nodes);
2336 sbitmap zero_bitmap = sbitmap_alloc (num_nodes);
2337 sbitmap predecessors = sbitmap_alloc (num_nodes);
2338 sbitmap successors = sbitmap_alloc (num_nodes);
2340 sbitmap_zero (predecessors);
2341 find_predecessors (predecessors, g, nodes_ordered);
2343 sbitmap_zero (successors);
2344 find_successors (successors, g, nodes_ordered);
/* Seed the workset: prefer SCC nodes that are predecessors (or else
   successors) of already-ordered nodes; otherwise start from the SCC
   node with maximal ASAP.  */
2347 if (sbitmap_a_and_b_cg (tmp, predecessors, scc))
2349 sbitmap_copy (workset, tmp);
2352 else if (sbitmap_a_and_b_cg (tmp, successors, scc))
2354 sbitmap_copy (workset, tmp);
2361 sbitmap_zero (workset);
2362 if ((u = find_max_asap (g, scc)) >= 0)
2363 SET_BIT (workset, u);
2367 sbitmap_zero (zero_bitmap);
/* Alternate between bottom-up and top-down sweeps until the whole
   (extended) SCC is ordered.  */
2368 while (!sbitmap_equal (workset, zero_bitmap))
2371 ddg_node_ptr v_node;
2372 sbitmap v_node_preds;
2373 sbitmap v_node_succs;
/* Bottom-up: repeatedly pick max HEIGHT / min MOB, then feed its
   unordered SCC successors back into the workset.  */
2377 while (!sbitmap_equal (workset, zero_bitmap))
2379 v = find_max_hv_min_mob (g, workset);
2380 v_node = &g->nodes[v];
2381 node_order[pos++] = v;
2382 v_node_succs = NODE_SUCCESSORS (v_node);
2383 sbitmap_a_and_b (tmp, v_node_succs, scc);
2385 /* Don't consider the already ordered successors again. */
2386 sbitmap_difference (tmp, tmp, nodes_ordered);
2387 sbitmap_a_or_b (workset, workset, tmp);
2388 RESET_BIT (workset, v);
2389 SET_BIT (nodes_ordered, v);
2392 sbitmap_zero (predecessors);
2393 find_predecessors (predecessors, g, nodes_ordered);
2394 sbitmap_a_and_b (workset, predecessors, scc);
/* Top-down: symmetric sweep using max DEPTH / min MOB over
   predecessors.  */
2398 while (!sbitmap_equal (workset, zero_bitmap))
2400 v = find_max_dv_min_mob (g, workset);
2401 v_node = &g->nodes[v];
2402 node_order[pos++] = v;
2403 v_node_preds = NODE_PREDECESSORS (v_node);
2404 sbitmap_a_and_b (tmp, v_node_preds, scc);
2406 /* Don't consider the already ordered predecessors again. */
2407 sbitmap_difference (tmp, tmp, nodes_ordered);
2408 sbitmap_a_or_b (workset, workset, tmp);
2409 RESET_BIT (workset, v);
2410 SET_BIT (nodes_ordered, v);
2413 sbitmap_zero (successors);
2414 find_successors (successors, g, nodes_ordered);
2415 sbitmap_a_and_b (workset, successors, scc);
2419 sbitmap_free (workset);
2420 sbitmap_free (zero_bitmap);
2421 sbitmap_free (predecessors);
2422 sbitmap_free (successors);
2427 /* This page contains functions for manipulating partial-schedules during
2428 modulo scheduling. */
2430 /* Create a partial schedule and allocate a memory to hold II rows. */
2432 static partial_schedule_ptr
2433 create_partial_schedule (int ii, ddg_ptr g, int history)
/* Allocate a new partial schedule: the ROWS array holds one (initially
   empty) linked list of ps_insns per cycle of the initiation interval II.
   NOTE(review): braces, some field assignments and the return statement
   are not visible in this excerpt.  */
2435 partial_schedule_ptr ps = XNEW (struct partial_schedule);
2436 ps->rows = (ps_insn_ptr *) xcalloc (ii, sizeof (ps_insn_ptr));
/* HISTORY presumably bounds how far back resource-conflict checking
   looks -- TODO confirm against the ps_has_conflicts callers.  */
2438 ps->history = history;
/* Start with an empty occupied-cycle window (min > max); it is widened
   as insns are added to the schedule.  */
2439 ps->min_cycle = INT_MAX;
2440 ps->max_cycle = INT_MIN;
2446 /* Free the PS_INSNs in rows array of the given partial schedule.
2447    ??? Consider caching the PS_INSN's.  */
2449 free_ps_insns (partial_schedule_ptr ps)
/* Walk every row of the schedule.  */
2453 for (i = 0; i < ps->ii; i++)
/* Pop the head of row I: remember its successor, then make the
   successor the new head.  The free of the popped element itself is
   not visible in this excerpt -- presumably done between these two
   statements.  */
2457 ps_insn_ptr ps_insn = ps->rows[i]->next_in_row;
2460 ps->rows[i] = ps_insn;
2466 /* Free all the memory allocated to the partial schedule.  */
/* Body not visible in this excerpt; presumably releases the per-row
   ps_insns, the rows array, and PS itself -- TODO confirm.  */
2469 free_partial_schedule (partial_schedule_ptr ps)
2478 /* Clear the rows array with its PS_INSNs, and create a new one with
/* NOTE(review): the continuation of the above comment is missing from
   this excerpt; presumably "...with NEW_II rows".  */
2482 reset_partial_schedule (partial_schedule_ptr ps, int new_ii)
/* Nothing to resize if the initiation interval is unchanged.  */
2487 if (new_ii == ps->ii)
/* Resize the rows array for NEW_II cycles and empty every row.
   Direct overwrite of ps->rows is safe: xrealloc aborts on failure
   rather than returning NULL.  */
2489 ps->rows = (ps_insn_ptr *) xrealloc (ps->rows, new_ii
2490 * sizeof (ps_insn_ptr));
2491 memset (ps->rows, 0, new_ii * sizeof (ps_insn_ptr));
/* Reset the occupied-cycle window to empty (min > max).  */
2493 ps->min_cycle = INT_MAX;
2494 ps->max_cycle = INT_MIN;
2497 /* Prints the partial schedule as an ii rows array; for each row,
2498    print the ids of the insns in it.  */
2500 print_partial_schedule (partial_schedule_ptr ps, FILE *dump)
2504 for (i = 0; i < ps->ii; i++)
/* Walk row I's list, printing the UID of every insn scheduled in it.  */
2506 ps_insn_ptr ps_i = ps->rows[i];
2508 fprintf (dump, "\n[ROW %d ]: ", i);
2511 fprintf (dump, "%d, ",
2512 INSN_UID (ps_i->node->insn));
2513 ps_i = ps_i->next_in_row;
2518 /* Creates an object of PS_INSN and initializes it to the given parameters.  */
2520 create_ps_insn (ddg_node_ptr node, int rest_count, int cycle)
2522 ps_insn_ptr ps_i = XNEW (struct ps_insn);
/* Freshly created insn is not yet linked into any row.  (The
   assignment of NODE itself is not visible in this excerpt.)  */
2525 ps_i->next_in_row = NULL;
2526 ps_i->prev_in_row = NULL;
/* ROW_REST_COUNT tracks how many issue slots the row holds including
   this insn; it is checked against issue_rate in add_node_to_ps.  */
2527 ps_i->row_rest_count = rest_count;
2528 ps_i->cycle = cycle;
2534 /* Removes the given PS_INSN from the partial schedule.  Returns false if the
2535    node is not found in the partial schedule, else returns true.  */
2537 remove_node_from_ps (partial_schedule_ptr ps, ps_insn_ptr ps_i)
/* The row holding PS_I is its cycle modulo the current II.  */
2544 row = SMODULO (ps_i->cycle, ps->ii);
/* PS_I has no predecessor in its row, so it must be the row head;
   if it is not, it does not belong to this schedule.  */
2545 if (! ps_i->prev_in_row)
2547 if (ps_i != ps->rows[row])
/* Unlink the head: the second element (if any) becomes the new head.
   NOTE(review): the NULL guard on the new head appears elided here.  */
2550 ps->rows[row] = ps_i->next_in_row;
2552 ps->rows[row]->prev_in_row = NULL;
/* Interior node: splice PS_I out of the doubly linked row list.  */
2556 ps_i->prev_in_row->next_in_row = ps_i->next_in_row;
2557 if (ps_i->next_in_row)
2558 ps_i->next_in_row->prev_in_row = ps_i->prev_in_row;
2564 /* Unlike what literature describes for modulo scheduling (which focuses
2565    on VLIW machines) the order of the instructions inside a cycle is
2566    important.  Given the bitmaps MUST_FOLLOW and MUST_PRECEDE we know
2567    where the current instruction should go relative to the already
2568    scheduled instructions in the given cycle.  Go over these
2569    instructions and find the first possible column to put it in.  */
2571 ps_insn_find_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
2572 sbitmap must_precede, sbitmap must_follow)
2574 ps_insn_ptr next_ps_i;
2575 ps_insn_ptr first_must_follow = NULL;
2576 ps_insn_ptr last_must_precede = NULL;
/* Row of PS_I is its cycle modulo the current II.  */
2582 row = SMODULO (ps_i->cycle, ps->ii);
2584 /* Find the first must follow and the last must precede
2585    and insert the node immediately after the must precede
2586    but make sure that there is no must follow after it.  */
2587 for (next_ps_i = ps->rows[row];
2589 next_ps_i = next_ps_i->next_in_row)
/* Remember the first insn in the row that must follow PS_I.  */
2591 if (must_follow && TEST_BIT (must_follow, next_ps_i->node->cuid)
2592 && ! first_must_follow)
2593 first_must_follow = next_ps_i;
2594 if (must_precede && TEST_BIT (must_precede, next_ps_i->node->cuid))
2596 /* If we have already met a node that must follow, then
2597    there is no possible column.  */
2598 if (first_must_follow)
2601 last_must_precede = next_ps_i;
2605 /* Now insert the node after INSERT_AFTER_PSI.  */
/* No must-precede insn found: PS_I becomes the new head of the row.  */
2607 if (! last_must_precede)
2609 ps_i->next_in_row = ps->rows[row];
2610 ps_i->prev_in_row = NULL;
2611 if (ps_i->next_in_row)
2612 ps_i->next_in_row->prev_in_row = ps_i;
2613 ps->rows[row] = ps_i;
/* Otherwise splice PS_I in right after the last must-precede insn.  */
2617 ps_i->next_in_row = last_must_precede->next_in_row;
2618 last_must_precede->next_in_row = ps_i;
2619 ps_i->prev_in_row = last_must_precede;
2620 if (ps_i->next_in_row)
2621 ps_i->next_in_row->prev_in_row = ps_i;
2627 /* Advances the PS_INSN one column in its current row; returns false
2628    in failure and true in success.  Bit N is set in MUST_FOLLOW if
2629    the node with cuid N must come after the node pointed to by
2630    PS_I when scheduled in the same cycle.  */
2632 ps_insn_advance_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
2633 sbitmap must_follow)
2635 ps_insn_ptr prev, next;
2637 ddg_node_ptr next_node;
2642 row = SMODULO (ps_i->cycle, ps->ii);
/* Already in the last column of its row: cannot advance.  */
2644 if (! ps_i->next_in_row)
2647 next_node = ps_i->next_in_row->node;
2649 /* Check if next_in_row is dependent on ps_i, both having same sched
2650    times (typically ANTI_DEP).  If so, ps_i cannot skip over it.  */
2651 if (must_follow && TEST_BIT (must_follow, next_node->cuid))
2654 /* Advance PS_I over its next_in_row in the doubly linked list.  */
2655 prev = ps_i->prev_in_row;
2656 next = ps_i->next_in_row;
/* If PS_I was the row head, NEXT becomes the new head.  */
2658 if (ps_i == ps->rows[row])
2659 ps->rows[row] = next;
/* Swap PS_I with NEXT so the row order becomes ... PREV, NEXT, PS_I ...  */
2661 ps_i->next_in_row = next->next_in_row;
2663 if (next->next_in_row)
2664 next->next_in_row->prev_in_row = ps_i;
2666 next->next_in_row = ps_i;
2667 ps_i->prev_in_row = next;
2669 next->prev_in_row = prev;
/* NOTE(review): an if (prev) guard appears to be elided from this
   excerpt before the following statement -- confirm.  */
2671 prev->next_in_row = next;
2676 /* Inserts a DDG_NODE to the given partial schedule at the given cycle.
2677    Returns 0 if this is not possible and a PS_INSN otherwise.  Bit N is
2678    set in MUST_PRECEDE/MUST_FOLLOW if the node with cuid N must come
2679    before/after (respectively) the node pointed to by PS_I when scheduled
2680    in the same cycle.  */
2682 add_node_to_ps (partial_schedule_ptr ps, ddg_node_ptr node, int cycle,
2683 sbitmap must_precede, sbitmap must_follow)
2687 int row = SMODULO (cycle, ps->ii);
/* Fail when the row is already full: the head's row_rest_count records
   how many issue slots the row consumes, bounded by issue_rate.  */
2690 && ps->rows[row]->row_rest_count >= issue_rate)
/* Account for the insns already occupying this row.  */
2694 rest_count += ps->rows[row]->row_rest_count;
2696 ps_i = create_ps_insn (node, rest_count, cycle);
2698 /* Finds and inserts PS_I according to MUST_FOLLOW and
   MUST_PRECEDE; on failure the insertion is abandoned.  */
2700 if (! ps_insn_find_column (ps, ps_i, must_precede, must_follow))
2709 /* Advance time one cycle.  Assumes DFA is being used.  */
2711 advance_one_cycle (void)
/* Feed the target's special pre-cycle insn (if it provides one) to the
   DFA before closing the current cycle.  */
2713 if (targetm.sched.dfa_pre_cycle_insn)
2714 state_transition (curr_state,
2715 targetm.sched.dfa_pre_cycle_insn ());
/* A NULL insn advances the DFA state to the next cycle.  */
2717 state_transition (curr_state, NULL);
/* Likewise feed the target's post-cycle insn at the start of the new
   cycle.  */
2719 if (targetm.sched.dfa_post_cycle_insn)
2720 state_transition (curr_state,
2721 targetm.sched.dfa_post_cycle_insn ());
2726 /* Checks if PS has resource conflicts according to DFA, starting from
2727    FROM cycle to TO cycle; returns true if there are conflicts and false
2728    if there are no conflicts.  Assumes DFA is being used.  */
2730 ps_has_conflicts (partial_schedule_ptr ps, int from, int to)
/* Re-simulate issuing the scheduled insns cycle by cycle from a clean
   DFA state.  */
2734 state_reset (curr_state);
2736 for (cycle = from; cycle <= to; cycle++)
2738 ps_insn_ptr crr_insn;
2739 /* Holds the remaining issue slots in the current row.  */
2740 int can_issue_more = issue_rate;
2742 /* Walk through the DFA for the current row.  */
2743 for (crr_insn = ps->rows[SMODULO (cycle, ps->ii)];
2745 crr_insn = crr_insn->next_in_row)
2747 rtx insn = crr_insn->node->insn;
/* Debug insns consume no machine resources.  */
2749 if (!NONDEBUG_INSN_P (insn))
2752 /* Check if there is room for the current insn.  */
2753 if (!can_issue_more || state_dead_lock_p (curr_state))
2756 /* Update the DFA state and return with failure if the DFA found
2757    resource conflicts.  */
2758 if (state_transition (curr_state, insn) >= 0)
/* Let the target decide how many issue slots this insn consumed.  */
2761 if (targetm.sched.variable_issue)
2763 targetm.sched.variable_issue (sched_dump, sched_verbose,
2764 insn, can_issue_more);
2765 /* A naked CLOBBER or USE generates no instruction, so don't
2766    let them consume issue slots.  */
2767 else if (GET_CODE (PATTERN (insn)) != USE
2768 && GET_CODE (PATTERN (insn)) != CLOBBER)
2772 /* Advance the DFA to the next cycle.  */
2773 advance_one_cycle ();
2778 /* Checks if the given node causes resource conflicts when added to PS at
2779    cycle C.  If not the node is added to PS and returned; otherwise zero
2780    is returned.  Bit N is set in MUST_PRECEDE/MUST_FOLLOW if the node with
2781    cuid N must come before/after (respectively) the node pointed to by
2782    PS_I when scheduled in the same cycle.  */
2784 ps_add_node_check_conflicts (partial_schedule_ptr ps, ddg_node_ptr n,
2785 int c, sbitmap must_precede,
2786 sbitmap must_follow)
2788 int has_conflicts = 0;
2791 /* First add the node to the PS, if this succeeds check for
2792    conflicts, trying different issue slots in the same row.  */
2793 if (! (ps_i = add_node_to_ps (ps, n, c, must_precede, must_follow)))
2794 return NULL; /* Failed to insert the node at the given cycle.  */
/* Check cycle C alone, plus (arguments elided in this excerpt)
   presumably a wider window of cycles -- TODO confirm it spans
   ps->history around C.  */
2796 has_conflicts = ps_has_conflicts (ps, c, c)
2798 && ps_has_conflicts (ps,
2802 /* Try different issue slots to find one that the given node can be
2803    scheduled in without conflicts.  */
2804 while (has_conflicts)
/* No more columns to try in this row: give up on cycle C.  */
2806 if (! ps_insn_advance_column (ps, ps_i, must_follow))
2808 has_conflicts = ps_has_conflicts (ps, c, c)
2810 && ps_has_conflicts (ps,
/* Reached only when no conflict-free column exists: undo the insert.  */
2817 remove_node_from_ps (ps, ps_i);
/* Success: widen the schedule's occupied-cycle window to include C.  */
2821 ps->min_cycle = MIN (ps->min_cycle, c);
2822 ps->max_cycle = MAX (ps->max_cycle, c);
2826 /* Rotate the rows of PS such that insns scheduled at time
2827    START_CYCLE will appear in row 0.  Updates max/min_cycles.  */
2829 rotate_partial_schedule (partial_schedule_ptr ps, int start_cycle)
2831 int i, row, backward_rotates;
2832 int last_row = ps->ii - 1;
/* Already normalized: nothing to rotate.  */
2834 if (start_cycle == 0)
/* Number of single-row rotations needed; SMODULO presumably yields a
   value in [0, ii) even for negative START_CYCLE -- confirm against
   its definition.  */
2837 backward_rotates = SMODULO (start_cycle, ps->ii);
2839 /* Revisit later and optimize this into a single loop.  */
2840 for (i = 0; i < backward_rotates; i++)
/* One rotation: every row moves up by one, row 0 wraps to the last.  */
2842 ps_insn_ptr first_row = ps->rows[0];
2844 for (row = 0; row < last_row; row++)
2845 ps->rows[row] = ps->rows[row+1];
2847 ps->rows[last_row] = first_row;
/* Shift the cycle window so that row 0 corresponds to cycle 0.  */
2850 ps->max_cycle -= start_cycle;
2851 ps->min_cycle -= start_cycle;
2854 #endif /* INSN_SCHEDULING */
/* Pass gate: run SMS only when optimizing and -fmodulo-sched was
   requested on the command line.  */
2857 gate_handle_sms (void)
2859 return (optimize > 0 && flag_modulo_sched);
2863 /* Run instruction scheduler.  */
2864 /* Perform SMS modulo scheduling.  */
2866 rest_of_handle_sms (void)
2868 #ifdef INSN_SCHEDULING
2871 /* Collect loop information to be used in SMS.  */
2872 cfg_layout_initialize (0);
/* NOTE(review): the call into the scheduler proper (sms_schedule?) is
   not visible in this excerpt.  */
2875 /* Update the life information, because we add pseudos.  */
2876 max_regno = max_reg_num ();
2878 /* Finalize layout changes.  */
/* Chain the basic blocks through bb->aux so cfg_layout_finalize can
   restore a linear insn chain.  (The enclosing basic-block iteration
   appears elided in this excerpt.)  */
2880 if (bb->next_bb != EXIT_BLOCK_PTR)
2881 bb->aux = bb->next_bb;
/* Dominance info is stale after CFG manipulation; drop it before
   leaving cfglayout mode.  */
2882 free_dominance_info (CDI_DOMINATORS);
2883 cfg_layout_finalize ();
2884 #endif /* INSN_SCHEDULING */
2888 struct rtl_opt_pass pass_sms =
2893 gate_handle_sms, /* gate */
2894 rest_of_handle_sms, /* execute */
2897 0, /* static_pass_number */
2899 0, /* properties_required */
2900 0, /* properties_provided */
2901 0, /* properties_destroyed */
2902 TODO_dump_func, /* todo_flags_start */
2905 | TODO_verify_rtl_sharing
2907 | TODO_ggc_collect /* todo_flags_finish */