1 /* Swing Modulo Scheduling implementation.
2 Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
3 Free Software Foundation, Inc.
4 Contributed by Ayal Zaks and Mustafa Hagog <zaks,mustafa@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
25 #include "coretypes.h"
27 #include "diagnostic-core.h"
30 #include "hard-reg-set.h"
34 #include "insn-config.h"
35 #include "insn-attr.h"
38 #include "sched-int.h"
40 #include "cfglayout.h"
48 #include "tree-pass.h"
52 #ifdef INSN_SCHEDULING
54 /* This file contains the implementation of the Swing Modulo Scheduler,
55 described in the following references:
56 [1] J. Llosa, A. Gonzalez, E. Ayguade, M. Valero., and J. Eckhardt.
57 Lifetime--sensitive modulo scheduling in a production environment.
58 IEEE Trans. on Comps., 50(3), March 2001
59 [2] J. Llosa, A. Gonzalez, E. Ayguade, and M. Valero.
60 Swing Modulo Scheduling: A Lifetime Sensitive Approach.
61 PACT '96 , pages 80-87, October 1996 (Boston - Massachusetts - USA).
63 The basic structure is:
64 1. Build a data-dependence graph (DDG) for each loop.
65 2. Use the DDG to order the insns of a loop (not in topological order
66 necessarily, but rather) trying to place each insn after all its
67 predecessors _or_ after all its successors.
68 3. Compute MII: a lower bound on the number of cycles to schedule the loop.
69 4. Use the ordering to perform list-scheduling of the loop:
70 1. Set II = MII. We will try to schedule the loop within II cycles.
71 2. Try to schedule the insns one by one according to the ordering.
72 For each insn compute an interval of cycles by considering already-
73 scheduled preds and succs (and associated latencies); try to place
74 the insn in the cycles of this window checking for potential
75 resource conflicts (using the DFA interface).
76 Note: this is different from the cycle-scheduling of schedule_insns;
77 here the insns are not scheduled monotonically top-down (nor bottom-
79 3. If failed in scheduling all insns - bump II++ and try again, unless
80 II reaches an upper bound MaxII, in which case report failure.
81 5. If we succeeded in scheduling the loop within II cycles, we now
82 generate prolog and epilog, decrease the counter of the loop, and
83 perform modulo variable expansion for live ranges that span more than
84 II cycles (i.e. use register copies to prevent a def from overwriting
85 itself before reaching the use).
87 SMS works with countable loops (1) whose control part can be easily
88 decoupled from the rest of the loop and (2) whose loop count can
89 be easily adjusted. This is because we peel a constant number of
90 iterations into a prologue and epilogue for which we want to avoid
91 emitting the control part, and a kernel which is to iterate that
92 constant number of iterations less than the original loop. So the
93 control part should be a set of insns clearly identified and having
94 its own iv, not otherwise used in the loop (at-least for now), which
95 initializes a register before the loop to the number of iterations.
96 Currently SMS relies on the do-loop pattern to recognize such loops,
97 where (1) the control part comprises of all insns defining and/or
98 using a certain 'count' register and (2) the loop count can be
99 adjusted by modifying this register prior to the loop.
100 TODO: Rely on cfgloop analysis instead. */
102 /* This page defines partial-schedule structures and functions for
103 modulo scheduling. */
105 typedef struct partial_schedule *partial_schedule_ptr;
106 typedef struct ps_insn *ps_insn_ptr;
108 /* The minimum (absolute) cycle that a node of ps was scheduled in. */
109 #define PS_MIN_CYCLE(ps) (((partial_schedule_ptr)(ps))->min_cycle)
111 /* The maximum (absolute) cycle that a node of ps was scheduled in. */
112 #define PS_MAX_CYCLE(ps) (((partial_schedule_ptr)(ps))->max_cycle)
114 /* Perform signed modulo, always returning a non-negative value. */
115 #define SMODULO(x,y) ((x) % (y) < 0 ? ((x) % (y) + (y)) : (x) % (y))
117 /* The number of different iterations the nodes in ps span, assuming
118 the stage boundaries are placed efficiently. */
119 #define CALC_STAGE_COUNT(max_cycle,min_cycle,ii) ((max_cycle - min_cycle \
121 /* The stage count of ps. */
122 #define PS_STAGE_COUNT(ps) (((partial_schedule_ptr)(ps))->stage_count)
124 /* A single instruction in the partial schedule. */
127 /* The corresponding DDG_NODE. */
130 /* The (absolute) cycle in which the PS instruction is scheduled.
131 Same as SCHED_TIME (node). */
134 /* The next/prev PS_INSN in the same row. */
135 ps_insn_ptr next_in_row,
140 /* Holds the partial schedule as an array of II rows. Each entry of the
141 array points to a linked list of PS_INSNs, which represents the
142 instructions that are scheduled for that row. */
143 struct partial_schedule
145 int ii; /* Number of rows in the partial schedule. */
146 int history; /* Threshold for conflict checking using DFA. */
148 /* rows[i] points to linked list of insns scheduled in row i (0<=i<ii). */
151 /* rows_length[i] holds the number of instructions in the row.
152 It is used only (as an optimization) to back off quickly from
153 trying to schedule a node in a full row; that is, to avoid running
154 through futile DFA state transitions. */
157 /* The earliest absolute cycle of an insn in the partial schedule. */
160 /* The latest absolute cycle of an insn in the partial schedule. */
163 ddg_ptr g; /* The DDG of the insns in the partial schedule. */
165 int stage_count; /* The stage count of the partial schedule. */
168 /* We use this to record all the register replacements we do in
169 the kernel so we can undo SMS if it is not profitable. */
170 struct undo_replace_buff_elem
175 struct undo_replace_buff_elem *next;
180 static partial_schedule_ptr create_partial_schedule (int ii, ddg_ptr, int history);
181 static void free_partial_schedule (partial_schedule_ptr);
182 static void reset_partial_schedule (partial_schedule_ptr, int new_ii);
183 void print_partial_schedule (partial_schedule_ptr, FILE *);
184 static void verify_partial_schedule (partial_schedule_ptr, sbitmap);
185 static ps_insn_ptr ps_add_node_check_conflicts (partial_schedule_ptr,
186 ddg_node_ptr node, int cycle,
187 sbitmap must_precede,
188 sbitmap must_follow);
189 static void rotate_partial_schedule (partial_schedule_ptr, int);
190 void set_row_column_for_ps (partial_schedule_ptr);
191 static void ps_insert_empty_row (partial_schedule_ptr, int, sbitmap);
192 static int compute_split_row (sbitmap, int, int, int, ddg_node_ptr);
195 /* This page defines constants and structures for the modulo scheduling
198 static int sms_order_nodes (ddg_ptr, int, int *, int *);
199 static void set_node_sched_params (ddg_ptr);
200 static partial_schedule_ptr sms_schedule_by_order (ddg_ptr, int, int, int *);
201 static void permute_partial_schedule (partial_schedule_ptr, rtx);
202 static void generate_prolog_epilog (partial_schedule_ptr, struct loop *,
204 static void duplicate_insns_of_cycles (partial_schedule_ptr,
206 static int calculate_stage_count (partial_schedule_ptr, int);
207 static void calculate_must_precede_follow (ddg_node_ptr, int, int,
208 int, int, sbitmap, sbitmap, sbitmap);
209 static int get_sched_window (partial_schedule_ptr, ddg_node_ptr,
210 sbitmap, int, int *, int *, int *);
211 static bool try_scheduling_node_in_cycle (partial_schedule_ptr, ddg_node_ptr,
212 int, int, sbitmap, int *, sbitmap,
214 static void remove_node_from_ps (partial_schedule_ptr, ps_insn_ptr);
216 #define SCHED_ASAP(x) (((node_sched_params_ptr)(x)->aux.info)->asap)
217 #define SCHED_TIME(x) (((node_sched_params_ptr)(x)->aux.info)->time)
218 #define SCHED_FIRST_REG_MOVE(x) \
219 (((node_sched_params_ptr)(x)->aux.info)->first_reg_move)
220 #define SCHED_NREG_MOVES(x) \
221 (((node_sched_params_ptr)(x)->aux.info)->nreg_moves)
222 #define SCHED_ROW(x) (((node_sched_params_ptr)(x)->aux.info)->row)
223 #define SCHED_STAGE(x) (((node_sched_params_ptr)(x)->aux.info)->stage)
224 #define SCHED_COLUMN(x) (((node_sched_params_ptr)(x)->aux.info)->column)
226 /* The scheduling parameters held for each node. */
227 typedef struct node_sched_params
229 int asap; /* A lower-bound on the absolute scheduling cycle. */
230 int time; /* The absolute scheduling cycle (time >= asap). */
232 /* The following field (first_reg_move) is a pointer to the first
233 register-move instruction added to handle the modulo-variable-expansion
234 of the register defined by this node. This register-move copies the
235 original register defined by the node. */
238 /* The number of register-move instructions added, immediately preceding
242 int row; /* Holds time % ii. */
243 int stage; /* Holds time / ii. */
245 /* The column of a node inside the ps. If nodes u, v are on the same row,
246 u will precede v if column (u) < column (v). */
248 } *node_sched_params_ptr;
251 /* The following three functions are copied from the current scheduler
252 code in order to use sched_analyze() for computing the dependencies.
253 They are used when initializing the sched_info structure. */
255 sms_print_insn (const_rtx insn, int aligned ATTRIBUTE_UNUSED)
259 sprintf (tmp, "i%4d", INSN_UID (insn));
264 compute_jump_reg_dependencies (rtx insn ATTRIBUTE_UNUSED,
265 regset used ATTRIBUTE_UNUSED)
269 static struct common_sched_info_def sms_common_sched_info;
271 static struct sched_deps_info_def sms_sched_deps_info =
273 compute_jump_reg_dependencies,
274 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
279 static struct haifa_sched_info sms_sched_info =
288 NULL, /* insn_finishes_block_p */
293 NULL, NULL, NULL, NULL,
298 /* Given HEAD and TAIL which are the first and last insns in a loop;
299 return the register which controls the loop. Return zero if it has
300 more than one occurrence in the loop besides the control part or the
301 do-loop pattern is not of the form we expect. */
303 doloop_register_get (rtx head ATTRIBUTE_UNUSED, rtx tail ATTRIBUTE_UNUSED)
305 #ifdef HAVE_doloop_end
306 rtx reg, condition, insn, first_insn_not_to_check;
311 /* TODO: Free SMS's dependence on doloop_condition_get. */
312 condition = doloop_condition_get (tail);
316 if (REG_P (XEXP (condition, 0)))
317 reg = XEXP (condition, 0);
318 else if (GET_CODE (XEXP (condition, 0)) == PLUS
319 && REG_P (XEXP (XEXP (condition, 0), 0)))
320 reg = XEXP (XEXP (condition, 0), 0);
324 /* Check that the COUNT_REG has no other occurrences in the loop
325 until the decrement. We assume the control part consists of
326 either a single (parallel) branch-on-count or a (non-parallel)
327 branch immediately preceded by a single (decrement) insn. */
328 first_insn_not_to_check = (GET_CODE (PATTERN (tail)) == PARALLEL ? tail
329 : prev_nondebug_insn (tail));
331 for (insn = head; insn != first_insn_not_to_check; insn = NEXT_INSN (insn))
332 if (!DEBUG_INSN_P (insn) && reg_mentioned_p (reg, insn))
336 fprintf (dump_file, "SMS count_reg found ");
337 print_rtl_single (dump_file, reg);
338 fprintf (dump_file, " outside control in insn:\n");
339 print_rtl_single (dump_file, insn);
351 /* Check if COUNT_REG is set to a constant in the PRE_HEADER block, so
352 that the number of iterations is a compile-time constant. If so,
353 return the rtx that sets COUNT_REG to a constant, and set COUNT to
354 this constant. Otherwise return 0. */
356 const_iteration_count (rtx count_reg, basic_block pre_header,
357 HOST_WIDEST_INT * count)
365 get_ebb_head_tail (pre_header, pre_header, &head, &tail);
367 for (insn = tail; insn != PREV_INSN (head); insn = PREV_INSN (insn))
368 if (NONDEBUG_INSN_P (insn) && single_set (insn) &&
369 rtx_equal_p (count_reg, SET_DEST (single_set (insn))))
371 rtx pat = single_set (insn);
373 if (CONST_INT_P (SET_SRC (pat)))
375 *count = INTVAL (SET_SRC (pat));
385 /* A very simple resource-based lower bound on the initiation interval.
386 ??? Improve the accuracy of this bound by considering the
387 utilization of various units. */
391 if (targetm.sched.sms_res_mii)
392 return targetm.sched.sms_res_mii (g);
394 return ((g->num_nodes - g->num_debug) / issue_rate);
398 /* Points to the array that contains the sched data for each node. */
399 static node_sched_params_ptr node_sched_params;
401 /* Allocate sched_params for each node and initialize it. Assumes that
402 the aux field of each node contain the asap bound (computed earlier),
403 and copies it into the sched_params field. */
405 set_node_sched_params (ddg_ptr g)
409 /* Allocate for each node in the DDG a place to hold the "sched_data". */
410 /* Initialize ASAP/ALAP/HIGHT to zero. */
411 node_sched_params = (node_sched_params_ptr)
412 xcalloc (g->num_nodes,
413 sizeof (struct node_sched_params));
415 /* Set the pointer of the general data of the node to point to the
416 appropriate sched_params structure. */
417 for (i = 0; i < g->num_nodes; i++)
419 /* Watch out for aliasing problems? */
420 node_sched_params[i].asap = g->nodes[i].aux.count;
421 g->nodes[i].aux.info = &node_sched_params[i];
426 print_node_sched_params (FILE *file, int num_nodes, ddg_ptr g)
432 for (i = 0; i < num_nodes; i++)
434 node_sched_params_ptr nsp = &node_sched_params[i];
435 rtx reg_move = nsp->first_reg_move;
438 fprintf (file, "Node = %d; INSN = %d\n", i,
439 (INSN_UID (g->nodes[i].insn)));
440 fprintf (file, " asap = %d:\n", nsp->asap);
441 fprintf (file, " time = %d:\n", nsp->time);
442 fprintf (file, " nreg_moves = %d:\n", nsp->nreg_moves);
443 for (j = 0; j < nsp->nreg_moves; j++)
445 fprintf (file, " reg_move = ");
446 print_rtl_single (file, reg_move);
447 reg_move = PREV_INSN (reg_move);
453 Breaking intra-loop register anti-dependences:
454 Each intra-loop register anti-dependence implies a cross-iteration true
455 dependence of distance 1. Therefore, we can remove such false dependencies
456 and figure out if the partial schedule broke them by checking if (for a
457 true-dependence of distance 1): SCHED_TIME (def) < SCHED_TIME (use) and
458 if so generate a register move. The number of such moves is equal to:
459 SCHED_TIME (use) - SCHED_TIME (def) { 0 broken
460 nreg_moves = ----------------------------------- + 1 - { dependence.
463 static struct undo_replace_buff_elem *
464 generate_reg_moves (partial_schedule_ptr ps, bool rescan)
469 struct undo_replace_buff_elem *reg_move_replaces = NULL;
471 for (i = 0; i < g->num_nodes; i++)
473 ddg_node_ptr u = &g->nodes[i];
475 int nreg_moves = 0, i_reg_move;
476 sbitmap *uses_of_defs;
478 rtx prev_reg, old_reg;
479 rtx set = single_set (u->insn);
481 /* Skip instructions that do not set a register. */
482 if ((set && !REG_P (SET_DEST (set))))
485 /* Compute the number of reg_moves needed for u, by looking at life
486 ranges started at u (excluding self-loops). */
487 for (e = u->out; e; e = e->next_out)
488 if (e->type == TRUE_DEP && e->dest != e->src)
490 int nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;
492 if (e->distance == 1)
493 nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii;
495 /* If dest precedes src in the schedule of the kernel, then dest
496 will read before src writes and we can save one reg_copy. */
497 if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
498 && SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
501 if (nreg_moves4e >= 1)
503 /* !single_set instructions are not supported yet and
504 thus we do not except to encounter them in the loop
505 except from the doloop part. For the latter case
506 we assume no regmoves are generated as the doloop
507 instructions are tied to the branch with an edge. */
511 nreg_moves = MAX (nreg_moves, nreg_moves4e);
517 /* Every use of the register defined by node may require a different
518 copy of this register, depending on the time the use is scheduled.
519 Set a bitmap vector, telling which nodes use each copy of this
521 uses_of_defs = sbitmap_vector_alloc (nreg_moves, g->num_nodes);
522 sbitmap_vector_zero (uses_of_defs, nreg_moves);
523 for (e = u->out; e; e = e->next_out)
524 if (e->type == TRUE_DEP && e->dest != e->src)
526 int dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;
528 if (e->distance == 1)
529 dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii;
531 if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
532 && SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
536 SET_BIT (uses_of_defs[dest_copy - 1], e->dest->cuid);
539 /* Now generate the reg_moves, attaching relevant uses to them. */
540 SCHED_NREG_MOVES (u) = nreg_moves;
541 old_reg = prev_reg = copy_rtx (SET_DEST (single_set (u->insn)));
542 /* Insert the reg-moves right before the notes which precede
543 the insn they relates to. */
544 last_reg_move = u->first_note;
546 for (i_reg_move = 0; i_reg_move < nreg_moves; i_reg_move++)
548 unsigned int i_use = 0;
549 rtx new_reg = gen_reg_rtx (GET_MODE (prev_reg));
550 rtx reg_move = gen_move_insn (new_reg, prev_reg);
551 sbitmap_iterator sbi;
553 add_insn_before (reg_move, last_reg_move, NULL);
554 last_reg_move = reg_move;
556 if (!SCHED_FIRST_REG_MOVE (u))
557 SCHED_FIRST_REG_MOVE (u) = reg_move;
559 EXECUTE_IF_SET_IN_SBITMAP (uses_of_defs[i_reg_move], 0, i_use, sbi)
561 struct undo_replace_buff_elem *rep;
563 rep = (struct undo_replace_buff_elem *)
564 xcalloc (1, sizeof (struct undo_replace_buff_elem));
565 rep->insn = g->nodes[i_use].insn;
566 rep->orig_reg = old_reg;
567 rep->new_reg = new_reg;
569 if (! reg_move_replaces)
570 reg_move_replaces = rep;
573 rep->next = reg_move_replaces;
574 reg_move_replaces = rep;
577 replace_rtx (g->nodes[i_use].insn, old_reg, new_reg);
579 df_insn_rescan (g->nodes[i_use].insn);
584 sbitmap_vector_free (uses_of_defs);
586 return reg_move_replaces;
589 /* Free memory allocated for the undo buffer. */
591 free_undo_replace_buff (struct undo_replace_buff_elem *reg_move_replaces)
594 while (reg_move_replaces)
596 struct undo_replace_buff_elem *rep = reg_move_replaces;
598 reg_move_replaces = reg_move_replaces->next;
603 /* Update the sched_params (time, row and stage) for node U using the II,
604 the CYCLE of U and MIN_CYCLE.
605 We're not simply taking the following
606 SCHED_STAGE (u) = CALC_STAGE_COUNT (SCHED_TIME (u), min_cycle, ii);
607 because the stages may not be aligned on cycle 0. */
609 update_node_sched_params (ddg_node_ptr u, int ii, int cycle, int min_cycle)
611 int sc_until_cycle_zero;
614 SCHED_TIME (u) = cycle;
615 SCHED_ROW (u) = SMODULO (cycle, ii);
617 /* The calculation of stage count is done adding the number
618 of stages before cycle zero and after cycle zero. */
619 sc_until_cycle_zero = CALC_STAGE_COUNT (-1, min_cycle, ii);
621 if (SCHED_TIME (u) < 0)
623 stage = CALC_STAGE_COUNT (-1, SCHED_TIME (u), ii);
624 SCHED_STAGE (u) = sc_until_cycle_zero - stage;
628 stage = CALC_STAGE_COUNT (SCHED_TIME (u), 0, ii);
629 SCHED_STAGE (u) = sc_until_cycle_zero + stage - 1;
633 /* Bump the SCHED_TIMEs of all nodes by AMOUNT. Set the values of
634 SCHED_ROW and SCHED_STAGE. Instruction scheduled on cycle AMOUNT
635 will move to cycle zero. */
637 reset_sched_times (partial_schedule_ptr ps, int amount)
641 ps_insn_ptr crr_insn;
643 for (row = 0; row < ii; row++)
644 for (crr_insn = ps->rows[row]; crr_insn; crr_insn = crr_insn->next_in_row)
646 ddg_node_ptr u = crr_insn->node;
647 int normalized_time = SCHED_TIME (u) - amount;
648 int new_min_cycle = PS_MIN_CYCLE (ps) - amount;
652 /* Print the scheduling times after the rotation. */
653 fprintf (dump_file, "crr_insn->node=%d (insn id %d), "
654 "crr_insn->cycle=%d, min_cycle=%d", crr_insn->node->cuid,
655 INSN_UID (crr_insn->node->insn), normalized_time,
657 if (JUMP_P (crr_insn->node->insn))
658 fprintf (dump_file, " (branch)");
659 fprintf (dump_file, "\n");
662 gcc_assert (SCHED_TIME (u) >= ps->min_cycle);
663 gcc_assert (SCHED_TIME (u) <= ps->max_cycle);
665 crr_insn->cycle = normalized_time;
666 update_node_sched_params (u, ii, normalized_time, new_min_cycle);
670 /* Set SCHED_COLUMN of each node according to its position in PS. */
672 set_columns_for_ps (partial_schedule_ptr ps)
676 for (row = 0; row < ps->ii; row++)
678 ps_insn_ptr cur_insn = ps->rows[row];
681 for (; cur_insn; cur_insn = cur_insn->next_in_row)
682 SCHED_COLUMN (cur_insn->node) = column++;
686 /* Permute the insns according to their order in PS, from row 0 to
687 row ii-1, and position them right before LAST. This schedules
688 the insns of the loop kernel. */
690 permute_partial_schedule (partial_schedule_ptr ps, rtx last)
696 for (row = 0; row < ii ; row++)
697 for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
698 if (PREV_INSN (last) != ps_ij->node->insn)
699 reorder_insns_nobb (ps_ij->node->first_note, ps_ij->node->insn,
703 /* Set bitmaps TMP_FOLLOW and TMP_PRECEDE to MUST_FOLLOW and MUST_PRECEDE
704 respectively only if cycle C falls on the border of the scheduling
705 window boundaries marked by START and END cycles. STEP is the
706 direction of the window. */
708 set_must_precede_follow (sbitmap *tmp_follow, sbitmap must_follow,
709 sbitmap *tmp_precede, sbitmap must_precede, int c,
710 int start, int end, int step)
718 *tmp_precede = must_precede;
719 else /* step == -1. */
720 *tmp_follow = must_follow;
725 *tmp_follow = must_follow;
726 else /* step == -1. */
727 *tmp_precede = must_precede;
732 /* Return True if the branch can be moved to row ii-1 while
733 normalizing the partial schedule PS to start from cycle zero and thus
734 optimize the SC. Otherwise return False. */
736 optimize_sc (partial_schedule_ptr ps, ddg_ptr g)
738 int amount = PS_MIN_CYCLE (ps);
739 sbitmap sched_nodes = sbitmap_alloc (g->num_nodes);
740 int start, end, step;
743 int stage_count, stage_count_curr;
745 /* Compare the SC after normalization and SC after bringing the branch
746 to row ii-1. If they are equal just bail out. */
747 stage_count = calculate_stage_count (ps, amount);
749 calculate_stage_count (ps, SCHED_TIME (g->closing_branch) - (ii - 1));
751 if (stage_count == stage_count_curr)
754 fprintf (dump_file, "SMS SC already optimized.\n");
762 fprintf (dump_file, "SMS Trying to optimize branch location\n");
763 fprintf (dump_file, "SMS partial schedule before trial:\n");
764 print_partial_schedule (ps, dump_file);
767 /* First, normalize the partial scheduling. */
768 reset_sched_times (ps, amount);
769 rotate_partial_schedule (ps, amount);
773 "SMS partial schedule after normalization (ii, %d, SC %d):\n",
775 print_partial_schedule (ps, dump_file);
778 if (SMODULO (SCHED_TIME (g->closing_branch), ii) == ii - 1)
784 sbitmap_ones (sched_nodes);
786 /* Calculate the new placement of the branch. It should be in row
787 ii-1 and fall into it's scheduling window. */
788 if (get_sched_window (ps, g->closing_branch, sched_nodes, ii, &start,
792 ps_insn_ptr next_ps_i;
793 int branch_cycle = SCHED_TIME (g->closing_branch);
794 int row = SMODULO (branch_cycle, ps->ii);
796 sbitmap must_precede, must_follow, tmp_precede, tmp_follow;
800 fprintf (dump_file, "\nTrying to schedule node %d "
801 "INSN = %d in (%d .. %d) step %d\n",
802 g->closing_branch->cuid,
803 (INSN_UID (g->closing_branch->insn)), start, end, step);
805 gcc_assert ((step > 0 && start < end) || (step < 0 && start > end));
808 c = start + ii - SMODULO (start, ii) - 1;
809 gcc_assert (c >= start);
815 "SMS failed to schedule branch at cycle: %d\n", c);
821 c = start - SMODULO (start, ii) - 1;
822 gcc_assert (c <= start);
828 "SMS failed to schedule branch at cycle: %d\n", c);
834 must_precede = sbitmap_alloc (g->num_nodes);
835 must_follow = sbitmap_alloc (g->num_nodes);
837 /* Try to schedule the branch is it's new cycle. */
838 calculate_must_precede_follow (g->closing_branch, start, end,
839 step, ii, sched_nodes,
840 must_precede, must_follow);
842 set_must_precede_follow (&tmp_follow, must_follow, &tmp_precede,
843 must_precede, c, start, end, step);
845 /* Find the element in the partial schedule related to the closing
846 branch so we can remove it from it's current cycle. */
847 for (next_ps_i = ps->rows[row];
848 next_ps_i; next_ps_i = next_ps_i->next_in_row)
849 if (next_ps_i->node->cuid == g->closing_branch->cuid)
852 remove_node_from_ps (ps, next_ps_i);
854 try_scheduling_node_in_cycle (ps, g->closing_branch,
855 g->closing_branch->cuid, c,
856 sched_nodes, &num_splits,
857 tmp_precede, tmp_follow);
858 gcc_assert (num_splits == 0);
863 "SMS failed to schedule branch at cycle: %d, "
864 "bringing it back to cycle %d\n", c, branch_cycle);
866 /* The branch was failed to be placed in row ii - 1.
867 Put it back in it's original place in the partial
869 set_must_precede_follow (&tmp_follow, must_follow, &tmp_precede,
870 must_precede, branch_cycle, start, end,
873 try_scheduling_node_in_cycle (ps, g->closing_branch,
874 g->closing_branch->cuid,
875 branch_cycle, sched_nodes,
876 &num_splits, tmp_precede,
878 gcc_assert (success && (num_splits == 0));
883 /* The branch is placed in row ii - 1. */
886 "SMS success in moving branch to cycle %d\n", c);
888 update_node_sched_params (g->closing_branch, ii, c,
903 duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
904 int to_stage, int for_prolog, rtx count_reg)
909 for (row = 0; row < ps->ii; row++)
910 for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
912 ddg_node_ptr u_node = ps_ij->node;
914 rtx reg_move = NULL_RTX;
916 /* Do not duplicate any insn which refers to count_reg as it
917 belongs to the control part.
918 The closing branch is scheduled as well and thus should
920 TODO: This should be done by analyzing the control part of
922 if (reg_mentioned_p (count_reg, u_node->insn)
923 || JUMP_P (ps_ij->node->insn))
928 /* SCHED_STAGE (u_node) >= from_stage == 0. Generate increasing
929 number of reg_moves starting with the second occurrence of
930 u_node, which is generated if its SCHED_STAGE <= to_stage. */
931 i_reg_moves = to_stage - SCHED_STAGE (u_node) + 1;
932 i_reg_moves = MAX (i_reg_moves, 0);
933 i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));
935 /* The reg_moves start from the *first* reg_move backwards. */
938 reg_move = SCHED_FIRST_REG_MOVE (u_node);
939 for (j = 1; j < i_reg_moves; j++)
940 reg_move = PREV_INSN (reg_move);
943 else /* It's for the epilog. */
945 /* SCHED_STAGE (u_node) <= to_stage. Generate all reg_moves,
946 starting to decrease one stage after u_node no longer occurs;
947 that is, generate all reg_moves until
948 SCHED_STAGE (u_node) == from_stage - 1. */
949 i_reg_moves = SCHED_NREG_MOVES (u_node)
950 - (from_stage - SCHED_STAGE (u_node) - 1);
951 i_reg_moves = MAX (i_reg_moves, 0);
952 i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));
954 /* The reg_moves start from the *last* reg_move forwards. */
957 reg_move = SCHED_FIRST_REG_MOVE (u_node);
958 for (j = 1; j < SCHED_NREG_MOVES (u_node); j++)
959 reg_move = PREV_INSN (reg_move);
963 for (j = 0; j < i_reg_moves; j++, reg_move = NEXT_INSN (reg_move))
964 emit_insn (copy_rtx (PATTERN (reg_move)));
965 if (SCHED_STAGE (u_node) >= from_stage
966 && SCHED_STAGE (u_node) <= to_stage)
967 duplicate_insn_chain (u_node->first_note, u_node->insn);
972 /* Generate the instructions (including reg_moves) for prolog & epilog. */
974 generate_prolog_epilog (partial_schedule_ptr ps, struct loop *loop,
975 rtx count_reg, rtx count_init)
978 int last_stage = PS_STAGE_COUNT (ps) - 1;
981 /* Generate the prolog, inserting its insns on the loop-entry edge. */
986 /* Generate instructions at the beginning of the prolog to
987 adjust the loop count by STAGE_COUNT. If loop count is constant
988 (count_init), this constant is adjusted by STAGE_COUNT in
989 generate_prolog_epilog function. */
990 rtx sub_reg = NULL_RTX;
992 sub_reg = expand_simple_binop (GET_MODE (count_reg), MINUS,
993 count_reg, GEN_INT (last_stage),
994 count_reg, 1, OPTAB_DIRECT);
995 gcc_assert (REG_P (sub_reg));
996 if (REGNO (sub_reg) != REGNO (count_reg))
997 emit_move_insn (count_reg, sub_reg);
1000 for (i = 0; i < last_stage; i++)
1001 duplicate_insns_of_cycles (ps, 0, i, 1, count_reg);
1003 /* Put the prolog on the entry edge. */
1004 e = loop_preheader_edge (loop);
1005 split_edge_and_insert (e, get_insns ());
1009 /* Generate the epilog, inserting its insns on the loop-exit edge. */
1012 for (i = 0; i < last_stage; i++)
1013 duplicate_insns_of_cycles (ps, i + 1, last_stage, 0, count_reg);
1015 /* Put the epilogue on the exit edge. */
1016 gcc_assert (single_exit (loop));
1017 e = single_exit (loop);
1018 split_edge_and_insert (e, get_insns ());
1022 /* Return true if all the BBs of the loop are empty except the
1025 loop_single_full_bb_p (struct loop *loop)
1028 basic_block *bbs = get_loop_body (loop);
1030 for (i = 0; i < loop->num_nodes ; i++)
1033 bool empty_bb = true;
1035 if (bbs[i] == loop->header)
1038 /* Make sure that basic blocks other than the header
1039 have only notes labels or jumps. */
1040 get_ebb_head_tail (bbs[i], bbs[i], &head, &tail);
1041 for (; head != NEXT_INSN (tail); head = NEXT_INSN (head))
1043 if (NOTE_P (head) || LABEL_P (head)
1044 || (INSN_P (head) && (DEBUG_INSN_P (head) || JUMP_P (head))))
1060 /* A simple loop from SMS point of view; it is a loop that is composed of
1061 either a single basic block or two BBs - a header and a latch. */
1062 #define SIMPLE_SMS_LOOP_P(loop) ((loop->num_nodes < 3 ) \
1063 && (EDGE_COUNT (loop->latch->preds) == 1) \
1064 && (EDGE_COUNT (loop->latch->succs) == 1))
1066 /* Return true if the loop is in its canonical form and false if not.
1067 i.e. SIMPLE_SMS_LOOP_P and have one preheader block, and single exit. */
1069 loop_canon_p (struct loop *loop)
1072 if (loop->inner || !loop_outer (loop))
1075 fprintf (dump_file, "SMS loop inner or !loop_outer\n");
1079 if (!single_exit (loop))
1083 rtx insn = BB_END (loop->header);
1085 fprintf (dump_file, "SMS loop many exits ");
1086 fprintf (dump_file, " %s %d (file, line)\n",
1087 insn_file (insn), insn_line (insn));
1092 if (! SIMPLE_SMS_LOOP_P (loop) && ! loop_single_full_bb_p (loop))
1096 rtx insn = BB_END (loop->header);
1098 fprintf (dump_file, "SMS loop many BBs. ");
1099 fprintf (dump_file, " %s %d (file, line)\n",
1100 insn_file (insn), insn_line (insn));
1108 /* If there are more than one entry for the loop,
1109 make it one by splitting the first entry edge and
1110 redirecting the others to the new BB. */
1112 canon_loop (struct loop *loop)
1117 /* Avoid annoying special cases of edges going to exit
1119 FOR_EACH_EDGE (e, i, EXIT_BLOCK_PTR->preds)
1120 if ((e->flags & EDGE_FALLTHRU) && (EDGE_COUNT (e->src->succs) > 1))
1123 if (loop->latch == loop->header
1124 || EDGE_COUNT (loop->latch->succs) > 1)
1126 FOR_EACH_EDGE (e, i, loop->header->preds)
1127 if (e->src == loop->latch)
1135 setup_sched_infos (void)
1137 memcpy (&sms_common_sched_info, &haifa_common_sched_info,
1138 sizeof (sms_common_sched_info));
1139 sms_common_sched_info.sched_pass_id = SCHED_SMS_PASS;
1140 common_sched_info = &sms_common_sched_info;
1142 sched_deps_info = &sms_sched_deps_info;
1143 current_sched_info = &sms_sched_info;
1146 /* Probability in % that the sms-ed loop rolls enough so that optimized
1147 version may be entered. Just a guess. */
1148 #define PROB_SMS_ENOUGH_ITERATIONS 80
1150 /* Used to calculate the upper bound of ii. */
1151 #define MAXII_FACTOR 2
1153 /* Main entry point, perform SMS scheduling on the loops of the function
1154 that consist of single basic blocks. */
1161 int maxii, max_asap;
1163 partial_schedule_ptr ps;
1164 basic_block bb = NULL;
1166 basic_block condition_bb = NULL;
1168 gcov_type trip_count = 0;
1170 loop_optimizer_init (LOOPS_HAVE_PREHEADERS
1171 | LOOPS_HAVE_RECORDED_EXITS);
1172 if (number_of_loops () <= 1)
1174 loop_optimizer_finalize ();
1175 return; /* There are no loops to schedule. */
1178 /* Initialize issue_rate. */
1179 if (targetm.sched.issue_rate)
1181 int temp = reload_completed;
1183 reload_completed = 1;
1184 issue_rate = targetm.sched.issue_rate ();
1185 reload_completed = temp;
1190 /* Initialize the scheduler. */
1191 setup_sched_infos ();
1192 haifa_sched_init ();
1194 /* Allocate memory to hold the DDG array one entry for each loop.
1195 We use loop->num as index into this array. */
1196 g_arr = XCNEWVEC (ddg_ptr, number_of_loops ());
1200 fprintf (dump_file, "\n\nSMS analysis phase\n");
1201 fprintf (dump_file, "===================\n\n");
1204 /* Build DDGs for all the relevant loops and hold them in G_ARR
1205 indexed by the loop index. */
1206 FOR_EACH_LOOP (li, loop, 0)
1211 /* For debugging. */
1212 if (dbg_cnt (sms_sched_loop) == false)
1215 fprintf (dump_file, "SMS reached max limit... \n");
1222 rtx insn = BB_END (loop->header);
1224 fprintf (dump_file, "SMS loop num: %d, file: %s, line: %d\n",
1225 loop->num, insn_file (insn), insn_line (insn));
1229 if (! loop_canon_p (loop))
1232 if (! loop_single_full_bb_p (loop))
1235 fprintf (dump_file, "SMS not loop_single_full_bb_p\n");
1241 get_ebb_head_tail (bb, bb, &head, &tail);
1242 latch_edge = loop_latch_edge (loop);
1243 gcc_assert (single_exit (loop));
1244 if (single_exit (loop)->count)
1245 trip_count = latch_edge->count / single_exit (loop)->count;
1247 /* Perform SMS only on loops that their average count is above threshold. */
1249 if ( latch_edge->count
1250 && (latch_edge->count < single_exit (loop)->count * SMS_LOOP_AVERAGE_COUNT_THRESHOLD))
1254 fprintf (dump_file, " %s %d (file, line)\n",
1255 insn_file (tail), insn_line (tail));
1256 fprintf (dump_file, "SMS single-bb-loop\n");
1257 if (profile_info && flag_branch_probabilities)
1259 fprintf (dump_file, "SMS loop-count ");
1260 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1261 (HOST_WIDEST_INT) bb->count);
1262 fprintf (dump_file, "\n");
1263 fprintf (dump_file, "SMS trip-count ");
1264 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1265 (HOST_WIDEST_INT) trip_count);
1266 fprintf (dump_file, "\n");
1267 fprintf (dump_file, "SMS profile-sum-max ");
1268 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1269 (HOST_WIDEST_INT) profile_info->sum_max);
1270 fprintf (dump_file, "\n");
1276 /* Make sure this is a doloop. */
1277 if ( !(count_reg = doloop_register_get (head, tail)))
1280 fprintf (dump_file, "SMS doloop_register_get failed\n");
1284 /* Don't handle BBs with calls or barriers or auto-increment insns
1285 (to avoid creating invalid reg-moves for the auto-increment insns),
1286 or !single_set with the exception of instructions that include
1287 count_reg---these instructions are part of the control part
1288 that do-loop recognizes.
1289 ??? Should handle auto-increment insns.
1290 ??? Should handle insns defining subregs. */
1291 for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn))
1297 || (NONDEBUG_INSN_P (insn) && !JUMP_P (insn)
1298 && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE
1299 && !reg_mentioned_p (count_reg, insn))
1300 || (FIND_REG_INC_NOTE (insn, NULL_RTX) != 0)
1301 || (INSN_P (insn) && (set = single_set (insn))
1302 && GET_CODE (SET_DEST (set)) == SUBREG))
1306 if (insn != NEXT_INSN (tail))
1311 fprintf (dump_file, "SMS loop-with-call\n");
1312 else if (BARRIER_P (insn))
1313 fprintf (dump_file, "SMS loop-with-barrier\n");
1314 else if (FIND_REG_INC_NOTE (insn, NULL_RTX) != 0)
1315 fprintf (dump_file, "SMS reg inc\n");
1316 else if ((NONDEBUG_INSN_P (insn) && !JUMP_P (insn)
1317 && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE))
1318 fprintf (dump_file, "SMS loop-with-not-single-set\n");
1320 fprintf (dump_file, "SMS loop with subreg in lhs\n");
1321 print_rtl_single (dump_file, insn);
1327 /* Always schedule the closing branch with the rest of the
1328 instructions. The branch is rotated to be in row ii-1 at the
1329 end of the scheduling procedure to make sure it's the last
1330 instruction in the iteration. */
1331 if (! (g = create_ddg (bb, 1)))
1334 fprintf (dump_file, "SMS create_ddg failed\n");
1338 g_arr[loop->num] = g;
1340 fprintf (dump_file, "...OK\n");
1345 fprintf (dump_file, "\nSMS transformation phase\n");
1346 fprintf (dump_file, "=========================\n\n");
1349 /* We don't want to perform SMS on new loops - created by versioning. */
1350 FOR_EACH_LOOP (li, loop, 0)
1353 rtx count_reg, count_init;
1355 unsigned stage_count = 0;
1356 HOST_WIDEST_INT loop_count = 0;
1357 bool opt_sc_p = false;
1359 if (! (g = g_arr[loop->num]))
1364 rtx insn = BB_END (loop->header);
1366 fprintf (dump_file, "SMS loop num: %d, file: %s, line: %d\n",
1367 loop->num, insn_file (insn), insn_line (insn));
1369 print_ddg (dump_file, g);
1372 get_ebb_head_tail (loop->header, loop->header, &head, &tail);
1374 latch_edge = loop_latch_edge (loop);
1375 gcc_assert (single_exit (loop));
1376 if (single_exit (loop)->count)
1377 trip_count = latch_edge->count / single_exit (loop)->count;
1381 fprintf (dump_file, " %s %d (file, line)\n",
1382 insn_file (tail), insn_line (tail));
1383 fprintf (dump_file, "SMS single-bb-loop\n");
1384 if (profile_info && flag_branch_probabilities)
1386 fprintf (dump_file, "SMS loop-count ");
1387 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1388 (HOST_WIDEST_INT) bb->count);
1389 fprintf (dump_file, "\n");
1390 fprintf (dump_file, "SMS profile-sum-max ");
1391 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1392 (HOST_WIDEST_INT) profile_info->sum_max);
1393 fprintf (dump_file, "\n");
1395 fprintf (dump_file, "SMS doloop\n");
1396 fprintf (dump_file, "SMS built-ddg %d\n", g->num_nodes);
1397 fprintf (dump_file, "SMS num-loads %d\n", g->num_loads);
1398 fprintf (dump_file, "SMS num-stores %d\n", g->num_stores);
1402 /* In case of th loop have doloop register it gets special
1404 count_init = NULL_RTX;
1405 if ((count_reg = doloop_register_get (head, tail)))
1407 basic_block pre_header;
1409 pre_header = loop_preheader_edge (loop)->src;
1410 count_init = const_iteration_count (count_reg, pre_header,
1413 gcc_assert (count_reg);
1415 if (dump_file && count_init)
1417 fprintf (dump_file, "SMS const-doloop ");
1418 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1420 fprintf (dump_file, "\n");
1423 node_order = XNEWVEC (int, g->num_nodes);
1425 mii = 1; /* Need to pass some estimate of mii. */
1426 rec_mii = sms_order_nodes (g, mii, node_order, &max_asap);
1427 mii = MAX (res_MII (g), rec_mii);
1428 maxii = MAX (max_asap, MAXII_FACTOR * mii);
1431 fprintf (dump_file, "SMS iis %d %d %d (rec_mii, mii, maxii)\n",
1432 rec_mii, mii, maxii);
1434 /* After sms_order_nodes and before sms_schedule_by_order, to copy over
1436 set_node_sched_params (g);
1438 ps = sms_schedule_by_order (g, mii, maxii, node_order);
1442 /* Try to achieve optimized SC by normalizing the partial
1443 schedule (having the cycles start from cycle zero).
1444 The branch location must be placed in row ii-1 in the
1445 final scheduling. If failed, shift all instructions to
1446 position the branch in row ii-1. */
1447 opt_sc_p = optimize_sc (ps, g);
1449 stage_count = calculate_stage_count (ps, 0);
1452 /* Bring the branch to cycle ii-1. */
1453 int amount = SCHED_TIME (g->closing_branch) - (ps->ii - 1);
1456 fprintf (dump_file, "SMS schedule branch at cycle ii-1\n");
1458 stage_count = calculate_stage_count (ps, amount);
1461 gcc_assert (stage_count >= 1);
1462 PS_STAGE_COUNT (ps) = stage_count;
1465 /* The default value of PARAM_SMS_MIN_SC is 2 as stage count of
1466 1 means that there is no interleaving between iterations thus
1467 we let the scheduling passes do the job in this case. */
1468 if (stage_count < (unsigned) PARAM_VALUE (PARAM_SMS_MIN_SC)
1469 || (count_init && (loop_count <= stage_count))
1470 || (flag_branch_probabilities && (trip_count <= stage_count)))
1474 fprintf (dump_file, "SMS failed... \n");
1475 fprintf (dump_file, "SMS sched-failed (stage-count=%d, loop-count=", stage_count);
1476 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, loop_count);
1477 fprintf (dump_file, ", trip-count=");
1478 fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, trip_count);
1479 fprintf (dump_file, ")\n");
1484 struct undo_replace_buff_elem *reg_move_replaces;
1488 /* Rotate the partial schedule to have the branch in row ii-1. */
1489 int amount = SCHED_TIME (g->closing_branch) - (ps->ii - 1);
1491 reset_sched_times (ps, amount);
1492 rotate_partial_schedule (ps, amount);
1495 set_columns_for_ps (ps);
1502 "%s:%d SMS succeeded %d %d (with ii, sc)\n",
1503 insn_file (tail), insn_line (tail), ps->ii, stage_count);
1504 print_partial_schedule (ps, dump_file);
1507 /* case the BCT count is not known , Do loop-versioning */
1508 if (count_reg && ! count_init)
1510 rtx comp_rtx = gen_rtx_fmt_ee (GT, VOIDmode, count_reg,
1511 GEN_INT(stage_count));
1512 unsigned prob = (PROB_SMS_ENOUGH_ITERATIONS
1513 * REG_BR_PROB_BASE) / 100;
1515 loop_version (loop, comp_rtx, &condition_bb,
1516 prob, prob, REG_BR_PROB_BASE - prob,
1520 /* Set new iteration count of loop kernel. */
1521 if (count_reg && count_init)
1522 SET_SRC (single_set (count_init)) = GEN_INT (loop_count
1525 /* Now apply the scheduled kernel to the RTL of the loop. */
1526 permute_partial_schedule (ps, g->closing_branch->first_note);
1528 /* Mark this loop as software pipelined so the later
1529 scheduling passes doesn't touch it. */
1530 if (! flag_resched_modulo_sched)
1531 g->bb->flags |= BB_DISABLE_SCHEDULE;
1532 /* The life-info is not valid any more. */
1533 df_set_bb_dirty (g->bb);
1535 reg_move_replaces = generate_reg_moves (ps, true);
1537 print_node_sched_params (dump_file, g->num_nodes, g);
1538 /* Generate prolog and epilog. */
1539 generate_prolog_epilog (ps, loop, count_reg, count_init);
1541 free_undo_replace_buff (reg_move_replaces);
1544 free_partial_schedule (ps);
1545 free (node_sched_params);
1552 /* Release scheduler data, needed until now because of DFA. */
1553 haifa_sched_finish ();
1554 loop_optimizer_finalize ();
1557 /* The SMS scheduling algorithm itself
1558 -----------------------------------
1559 Input: 'O' an ordered list of insns of a loop.
1560 Output: A scheduling of the loop - kernel, prolog, and epilogue.
1562 'Q' is the empty Set
1563 'PS' is the partial schedule; it holds the currently scheduled nodes with
1565 'PSP' previously scheduled predecessors.
1566 'PSS' previously scheduled successors.
1567 't(u)' the cycle where u is scheduled.
1568 'l(u)' is the latency of u.
1569 'd(v,u)' is the dependence distance from v to u.
1570 'ASAP(u)' the earliest time at which u could be scheduled as computed in
1571 the node ordering phase.
1572 'check_hardware_resources_conflicts(u, PS, c)'
1573 run a trace around cycle/slot through DFA model
1574 to check resource conflicts involving instruction u
1575 at cycle c given the partial schedule PS.
1576 'add_to_partial_schedule_at_time(u, PS, c)'
1577 Add the node/instruction u to the partial schedule
1579 'calculate_register_pressure(PS)'
1580 Given a schedule of instructions, calculate the register
1581 pressure it implies. One implementation could be the
1582 maximum number of overlapping live ranges.
1583 'maxRP' The maximum allowed register pressure, it is usually derived from the number
1584 registers available in the hardware.
1588 3. for each node u in O in pre-computed order
1589 4. if (PSP(u) != Q && PSS(u) == Q) then
1590 5. Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all every v in PSP(u).
1591 6. start = Early_start; end = Early_start + II - 1; step = 1
1592 11. else if (PSP(u) == Q && PSS(u) != Q) then
1593 12. Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all every v in PSS(u).
1594 13. start = Late_start; end = Late_start - II + 1; step = -1
1595 14. else if (PSP(u) != Q && PSS(u) != Q) then
1596 15. Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all every v in PSP(u).
1597 16. Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all every v in PSS(u).
1598 17. start = Early_start;
1599 18. end = min(Early_start + II - 1 , Late_start);
1601 20. else "if (PSP(u) == Q && PSS(u) == Q)"
1602 21. start = ASAP(u); end = start + II - 1; step = 1
1606 24. for (c = start ; c != end ; c += step)
1607 25. if check_hardware_resources_conflicts(u, PS, c) then
1608 26. add_to_partial_schedule_at_time(u, PS, c)
1613 31. if (success == false) then
1615 33. if (II > maxII) then
1616 34. finish - failed to schedule
1621 39. if (calculate_register_pressure(PS) > maxRP) then
1624 42. compute epilogue & prologue
1625 43. finish - succeeded to schedule
1628 /* A limit on the number of cycles that resource conflicts can span. ??? Should
1629 be provided by DFA, and be dependent on the type of insn scheduled. Currently
1630 set to 0 to save compile time. */
1631 #define DFA_HISTORY SMS_DFA_HISTORY
1633 /* A threshold for the number of repeated unsuccessful attempts to insert
1634 an empty row, before we flush the partial schedule and start over. */
1635 #define MAX_SPLIT_NUM 10
1636 /* Given the partial schedule PS, this function calculates and returns the
1637 cycles in which we can schedule the node with the given index I.
1638 NOTE: Here we do the backtracking in SMS, in some special cases. We have
1639 noticed that there are several cases in which we fail to SMS the loop
1640 because the sched window of a node is empty due to tight data-deps. In
1641 such cases we want to unschedule some of the predecessors/successors
1642 until we get non-empty scheduling window. It returns -1 if the
1643 scheduling window is empty and zero otherwise. */
1646 get_sched_window (partial_schedule_ptr ps, ddg_node_ptr u_node,
1647 sbitmap sched_nodes, int ii, int *start_p, int *step_p,
1650 int start, step, end;
1651 int early_start, late_start;
1653 sbitmap psp = sbitmap_alloc (ps->g->num_nodes);
1654 sbitmap pss = sbitmap_alloc (ps->g->num_nodes);
1655 sbitmap u_node_preds = NODE_PREDECESSORS (u_node);
1656 sbitmap u_node_succs = NODE_SUCCESSORS (u_node);
1662 /* 1. compute sched window for u (start, end, step). */
1665 psp_not_empty = sbitmap_a_and_b_cg (psp, u_node_preds, sched_nodes);
1666 pss_not_empty = sbitmap_a_and_b_cg (pss, u_node_succs, sched_nodes);
1668 /* We first compute a forward range (start <= end), then decide whether
1670 early_start = INT_MIN;
1671 late_start = INT_MAX;
1679 if (dump_file && (psp_not_empty || pss_not_empty))
1681 fprintf (dump_file, "\nAnalyzing dependencies for node %d (INSN %d)"
1682 "; ii = %d\n\n", u_node->cuid, INSN_UID (u_node->insn), ii);
1683 fprintf (dump_file, "%11s %11s %11s %11s %5s\n",
1684 "start", "early start", "late start", "end", "time");
1685 fprintf (dump_file, "=========== =========== =========== ==========="
1688 /* Calculate early_start and limit end. Both bounds are inclusive. */
1690 for (e = u_node->in; e != 0; e = e->next_in)
1692 ddg_node_ptr v_node = e->src;
1694 if (TEST_BIT (sched_nodes, v_node->cuid))
1696 int p_st = SCHED_TIME (v_node);
1697 int earliest = p_st + e->latency - (e->distance * ii);
1698 int latest = (e->data_type == MEM_DEP ? p_st + ii - 1 : INT_MAX);
1702 fprintf (dump_file, "%11s %11d %11s %11d %5d",
1703 "", earliest, "", latest, p_st);
1704 print_ddg_edge (dump_file, e);
1705 fprintf (dump_file, "\n");
1708 early_start = MAX (early_start, earliest);
1709 end = MIN (end, latest);
1711 if (e->type == TRUE_DEP && e->data_type == REG_DEP)
1716 /* Calculate late_start and limit start. Both bounds are inclusive. */
1718 for (e = u_node->out; e != 0; e = e->next_out)
1720 ddg_node_ptr v_node = e->dest;
1722 if (TEST_BIT (sched_nodes, v_node->cuid))
1724 int s_st = SCHED_TIME (v_node);
1725 int earliest = (e->data_type == MEM_DEP ? s_st - ii + 1 : INT_MIN);
1726 int latest = s_st - e->latency + (e->distance * ii);
1730 fprintf (dump_file, "%11d %11s %11d %11s %5d",
1731 earliest, "", latest, "", s_st);
1732 print_ddg_edge (dump_file, e);
1733 fprintf (dump_file, "\n");
1736 start = MAX (start, earliest);
1737 late_start = MIN (late_start, latest);
1739 if (e->type == TRUE_DEP && e->data_type == REG_DEP)
1744 if (dump_file && (psp_not_empty || pss_not_empty))
1746 fprintf (dump_file, "----------- ----------- ----------- -----------"
1748 fprintf (dump_file, "%11d %11d %11d %11d %5s %s\n",
1749 start, early_start, late_start, end, "",
1750 "(max, max, min, min)");
1753 /* Get a target scheduling window no bigger than ii. */
1754 if (early_start == INT_MIN && late_start == INT_MAX)
1755 early_start = SCHED_ASAP (u_node);
1756 else if (early_start == INT_MIN)
1757 early_start = late_start - (ii - 1);
1758 late_start = MIN (late_start, early_start + (ii - 1));
1760 /* Apply memory dependence limits. */
1761 start = MAX (start, early_start);
1762 end = MIN (end, late_start);
1764 if (dump_file && (psp_not_empty || pss_not_empty))
1765 fprintf (dump_file, "%11s %11d %11d %11s %5s final window\n",
1766 "", start, end, "", "");
1768 /* If there are at least as many successors as predecessors, schedule the
1769 node close to its successors. */
1770 if (pss_not_empty && count_succs >= count_preds)
1778 /* Now that we've finalized the window, make END an exclusive rather
1779 than an inclusive bound. */
1788 if ((start >= end && step == 1) || (start <= end && step == -1))
1791 fprintf (dump_file, "\nEmpty window: start=%d, end=%d, step=%d\n",
1799 /* Calculate MUST_PRECEDE/MUST_FOLLOW bitmaps of U_NODE; which is the
1800 node currently been scheduled. At the end of the calculation
1801 MUST_PRECEDE/MUST_FOLLOW contains all predecessors/successors of
1802 U_NODE which are (1) already scheduled in the first/last row of
1803 U_NODE's scheduling window, (2) whose dependence inequality with U
1804 becomes an equality when U is scheduled in this same row, and (3)
1805 whose dependence latency is zero.
1807 The first and last rows are calculated using the following parameters:
1808 START/END rows - The cycles that begins/ends the traversal on the window;
1809 searching for an empty cycle to schedule U_NODE.
1810 STEP - The direction in which we traverse the window.
1811 II - The initiation interval. */
1814 calculate_must_precede_follow (ddg_node_ptr u_node, int start, int end,
1815 int step, int ii, sbitmap sched_nodes,
1816 sbitmap must_precede, sbitmap must_follow)
1819 int first_cycle_in_window, last_cycle_in_window;
1821 gcc_assert (must_precede && must_follow);
1823 /* Consider the following scheduling window:
1824 {first_cycle_in_window, first_cycle_in_window+1, ...,
1825 last_cycle_in_window}. If step is 1 then the following will be
1826 the order we traverse the window: {start=first_cycle_in_window,
1827 first_cycle_in_window+1, ..., end=last_cycle_in_window+1},
1828 or {start=last_cycle_in_window, last_cycle_in_window-1, ...,
1829 end=first_cycle_in_window-1} if step is -1. */
1830 first_cycle_in_window = (step == 1) ? start : end - step;
1831 last_cycle_in_window = (step == 1) ? end - step : start;
1833 sbitmap_zero (must_precede);
1834 sbitmap_zero (must_follow);
1837 fprintf (dump_file, "\nmust_precede: ");
1839 /* Instead of checking if:
1840 (SMODULO (SCHED_TIME (e->src), ii) == first_row_in_window)
1841 && ((SCHED_TIME (e->src) + e->latency - (e->distance * ii)) ==
1842 first_cycle_in_window)
1844 we use the fact that latency is non-negative:
1845 SCHED_TIME (e->src) - (e->distance * ii) <=
1846 SCHED_TIME (e->src) + e->latency - (e->distance * ii)) <=
1847 first_cycle_in_window
1849 SCHED_TIME (e->src) - (e->distance * ii) == first_cycle_in_window */
1850 for (e = u_node->in; e != 0; e = e->next_in)
1851 if (TEST_BIT (sched_nodes, e->src->cuid)
1852 && ((SCHED_TIME (e->src) - (e->distance * ii)) ==
1853 first_cycle_in_window))
1856 fprintf (dump_file, "%d ", e->src->cuid);
1858 SET_BIT (must_precede, e->src->cuid);
1862 fprintf (dump_file, "\nmust_follow: ");
1864 /* Instead of checking if:
1865 (SMODULO (SCHED_TIME (e->dest), ii) == last_row_in_window)
1866 && ((SCHED_TIME (e->dest) - e->latency + (e->distance * ii)) ==
1867 last_cycle_in_window)
1869 we use the fact that latency is non-negative:
1870 SCHED_TIME (e->dest) + (e->distance * ii) >=
1871 SCHED_TIME (e->dest) - e->latency + (e->distance * ii)) >=
1872 last_cycle_in_window
1874 SCHED_TIME (e->dest) + (e->distance * ii) == last_cycle_in_window */
1875 for (e = u_node->out; e != 0; e = e->next_out)
1876 if (TEST_BIT (sched_nodes, e->dest->cuid)
1877 && ((SCHED_TIME (e->dest) + (e->distance * ii)) ==
1878 last_cycle_in_window))
1881 fprintf (dump_file, "%d ", e->dest->cuid);
1883 SET_BIT (must_follow, e->dest->cuid);
1887 fprintf (dump_file, "\n");
1890 /* Return 1 if U_NODE can be scheduled in CYCLE. Use the following
1891 parameters to decide if that's possible:
1892 PS - The partial schedule.
1893 U - The serial number of U_NODE.
1894 NUM_SPLITS - The number of row splits made so far.
1895 MUST_PRECEDE - The nodes that must precede U_NODE. (only valid at
1896 the first row of the scheduling window)
1897 MUST_FOLLOW - The nodes that must follow U_NODE. (only valid at the
1898 last row of the scheduling window) */
1901 try_scheduling_node_in_cycle (partial_schedule_ptr ps, ddg_node_ptr u_node,
1902 int u, int cycle, sbitmap sched_nodes,
1903 int *num_splits, sbitmap must_precede,
1904 sbitmap must_follow)
1909 verify_partial_schedule (ps, sched_nodes);
1910 psi = ps_add_node_check_conflicts (ps, u_node, cycle,
1911 must_precede, must_follow);
1914 SCHED_TIME (u_node) = cycle;
1915 SET_BIT (sched_nodes, u);
1919 fprintf (dump_file, "Scheduled w/o split in %d\n", cycle);
1926 /* This function implements the scheduling algorithm for SMS according to the
1928 static partial_schedule_ptr
1929 sms_schedule_by_order (ddg_ptr g, int mii, int maxii, int *nodes_order)
1932 int i, c, success, num_splits = 0;
1933 int flush_and_start_over = true;
1934 int num_nodes = g->num_nodes;
1935 int start, end, step; /* Place together into one struct? */
1936 sbitmap sched_nodes = sbitmap_alloc (num_nodes);
1937 sbitmap must_precede = sbitmap_alloc (num_nodes);
1938 sbitmap must_follow = sbitmap_alloc (num_nodes);
1939 sbitmap tobe_scheduled = sbitmap_alloc (num_nodes);
1941 partial_schedule_ptr ps = create_partial_schedule (ii, g, DFA_HISTORY);
1943 sbitmap_ones (tobe_scheduled);
1944 sbitmap_zero (sched_nodes);
1946 while (flush_and_start_over && (ii < maxii))
1950 fprintf (dump_file, "Starting with ii=%d\n", ii);
1951 flush_and_start_over = false;
1952 sbitmap_zero (sched_nodes);
1954 for (i = 0; i < num_nodes; i++)
1956 int u = nodes_order[i];
1957 ddg_node_ptr u_node = &ps->g->nodes[u];
1958 rtx insn = u_node->insn;
1960 if (!NONDEBUG_INSN_P (insn))
1962 RESET_BIT (tobe_scheduled, u);
1966 if (TEST_BIT (sched_nodes, u))
1969 /* Try to get non-empty scheduling window. */
1971 if (get_sched_window (ps, u_node, sched_nodes, ii, &start,
1975 fprintf (dump_file, "\nTrying to schedule node %d "
1976 "INSN = %d in (%d .. %d) step %d\n", u, (INSN_UID
1977 (g->nodes[u].insn)), start, end, step);
1979 gcc_assert ((step > 0 && start < end)
1980 || (step < 0 && start > end));
1982 calculate_must_precede_follow (u_node, start, end, step, ii,
1983 sched_nodes, must_precede,
1986 for (c = start; c != end; c += step)
1988 sbitmap tmp_precede, tmp_follow;
1990 set_must_precede_follow (&tmp_follow, must_follow,
1991 &tmp_precede, must_precede,
1992 c, start, end, step);
1994 try_scheduling_node_in_cycle (ps, u_node, u, c,
1996 &num_splits, tmp_precede,
2002 verify_partial_schedule (ps, sched_nodes);
2011 if (num_splits >= MAX_SPLIT_NUM)
2014 flush_and_start_over = true;
2015 verify_partial_schedule (ps, sched_nodes);
2016 reset_partial_schedule (ps, ii);
2017 verify_partial_schedule (ps, sched_nodes);
2022 /* The scheduling window is exclusive of 'end'
2023 whereas compute_split_window() expects an inclusive,
2026 split_row = compute_split_row (sched_nodes, start, end - 1,
2029 split_row = compute_split_row (sched_nodes, end + 1, start,
2032 ps_insert_empty_row (ps, split_row, sched_nodes);
2033 i--; /* Go back and retry node i. */
2036 fprintf (dump_file, "num_splits=%d\n", num_splits);
2039 /* ??? If (success), check register pressure estimates. */
2040 } /* Continue with next node. */
2041 } /* While flush_and_start_over. */
2044 free_partial_schedule (ps);
2048 gcc_assert (sbitmap_equal (tobe_scheduled, sched_nodes));
2050 sbitmap_free (sched_nodes);
2051 sbitmap_free (must_precede);
2052 sbitmap_free (must_follow);
2053 sbitmap_free (tobe_scheduled);
2058 /* This function inserts a new empty row into PS at the position
2059 according to SPLITROW, keeping all already scheduled instructions
2060 intact and updating their SCHED_TIME and cycle accordingly. */
2062 ps_insert_empty_row (partial_schedule_ptr ps, int split_row,
2063 sbitmap sched_nodes)
2065 ps_insn_ptr crr_insn;
2066 ps_insn_ptr *rows_new;
2068 int new_ii = ii + 1;
2070 int *rows_length_new;
2072 verify_partial_schedule (ps, sched_nodes);
2074 /* We normalize sched_time and rotate ps to have only non-negative sched
2075 times, for simplicity of updating cycles after inserting new row. */
2076 split_row -= ps->min_cycle;
2077 split_row = SMODULO (split_row, ii);
2079 fprintf (dump_file, "split_row=%d\n", split_row);
2081 reset_sched_times (ps, PS_MIN_CYCLE (ps));
2082 rotate_partial_schedule (ps, PS_MIN_CYCLE (ps));
2084 rows_new = (ps_insn_ptr *) xcalloc (new_ii, sizeof (ps_insn_ptr));
2085 rows_length_new = (int *) xcalloc (new_ii, sizeof (int));
2086 for (row = 0; row < split_row; row++)
2088 rows_new[row] = ps->rows[row];
2089 rows_length_new[row] = ps->rows_length[row];
2090 ps->rows[row] = NULL;
2091 for (crr_insn = rows_new[row];
2092 crr_insn; crr_insn = crr_insn->next_in_row)
2094 ddg_node_ptr u = crr_insn->node;
2095 int new_time = SCHED_TIME (u) + (SCHED_TIME (u) / ii);
2097 SCHED_TIME (u) = new_time;
2098 crr_insn->cycle = new_time;
2099 SCHED_ROW (u) = new_time % new_ii;
2100 SCHED_STAGE (u) = new_time / new_ii;
2105 rows_new[split_row] = NULL;
2107 for (row = split_row; row < ii; row++)
2109 rows_new[row + 1] = ps->rows[row];
2110 rows_length_new[row + 1] = ps->rows_length[row];
2111 ps->rows[row] = NULL;
2112 for (crr_insn = rows_new[row + 1];
2113 crr_insn; crr_insn = crr_insn->next_in_row)
2115 ddg_node_ptr u = crr_insn->node;
2116 int new_time = SCHED_TIME (u) + (SCHED_TIME (u) / ii) + 1;
2118 SCHED_TIME (u) = new_time;
2119 crr_insn->cycle = new_time;
2120 SCHED_ROW (u) = new_time % new_ii;
2121 SCHED_STAGE (u) = new_time / new_ii;
2126 ps->min_cycle = ps->min_cycle + ps->min_cycle / ii
2127 + (SMODULO (ps->min_cycle, ii) >= split_row ? 1 : 0);
2128 ps->max_cycle = ps->max_cycle + ps->max_cycle / ii
2129 + (SMODULO (ps->max_cycle, ii) >= split_row ? 1 : 0);
2131 ps->rows = rows_new;
2132 free (ps->rows_length);
2133 ps->rows_length = rows_length_new;
2135 gcc_assert (ps->min_cycle >= 0);
2137 verify_partial_schedule (ps, sched_nodes);
2140 fprintf (dump_file, "min_cycle=%d, max_cycle=%d\n", ps->min_cycle,
2144 /* Given U_NODE which is the node that failed to be scheduled; LOW and
2145 UP which are the boundaries of it's scheduling window; compute using
2146 SCHED_NODES and II a row in the partial schedule that can be split
2147 which will separate a critical predecessor from a critical successor
2148 thereby expanding the window, and return it. */
2150 compute_split_row (sbitmap sched_nodes, int low, int up, int ii,
2151 ddg_node_ptr u_node)
2154 int lower = INT_MIN, upper = INT_MAX;
2155 ddg_node_ptr crit_pred = NULL;
2156 ddg_node_ptr crit_succ = NULL;
2159 for (e = u_node->in; e != 0; e = e->next_in)
2161 ddg_node_ptr v_node = e->src;
2163 if (TEST_BIT (sched_nodes, v_node->cuid)
2164 && (low == SCHED_TIME (v_node) + e->latency - (e->distance * ii)))
2165 if (SCHED_TIME (v_node) > lower)
2168 lower = SCHED_TIME (v_node);
2172 if (crit_pred != NULL)
2174 crit_cycle = SCHED_TIME (crit_pred) + 1;
2175 return SMODULO (crit_cycle, ii);
2178 for (e = u_node->out; e != 0; e = e->next_out)
2180 ddg_node_ptr v_node = e->dest;
2181 if (TEST_BIT (sched_nodes, v_node->cuid)
2182 && (up == SCHED_TIME (v_node) - e->latency + (e->distance * ii)))
2183 if (SCHED_TIME (v_node) < upper)
2186 upper = SCHED_TIME (v_node);
2190 if (crit_succ != NULL)
2192 crit_cycle = SCHED_TIME (crit_succ);
2193 return SMODULO (crit_cycle, ii);
2197 fprintf (dump_file, "Both crit_pred and crit_succ are NULL\n");
2199 return SMODULO ((low + up + 1) / 2, ii);
2203 verify_partial_schedule (partial_schedule_ptr ps, sbitmap sched_nodes)
2206 ps_insn_ptr crr_insn;
2208 for (row = 0; row < ps->ii; row++)
2212 for (crr_insn = ps->rows[row]; crr_insn; crr_insn = crr_insn->next_in_row)
2214 ddg_node_ptr u = crr_insn->node;
2217 gcc_assert (TEST_BIT (sched_nodes, u->cuid));
2218 /* ??? Test also that all nodes of sched_nodes are in ps, perhaps by
2219 popcount (sched_nodes) == number of insns in ps. */
2220 gcc_assert (SCHED_TIME (u) >= ps->min_cycle);
2221 gcc_assert (SCHED_TIME (u) <= ps->max_cycle);
2224 gcc_assert (ps->rows_length[row] == length);
2229 /* This page implements the algorithm for ordering the nodes of a DDG
2230 for modulo scheduling, activated through the
2231 "int sms_order_nodes (ddg_ptr, int mii, int * result)" API. */
2233 #define ORDER_PARAMS(x) ((struct node_order_params *) (x)->aux.info)
2234 #define ASAP(x) (ORDER_PARAMS ((x))->asap)
2235 #define ALAP(x) (ORDER_PARAMS ((x))->alap)
2236 #define HEIGHT(x) (ORDER_PARAMS ((x))->height)
2237 #define MOB(x) (ALAP ((x)) - ASAP ((x)))
2238 #define DEPTH(x) (ASAP ((x)))
2240 typedef struct node_order_params * nopa;
2242 static void order_nodes_of_sccs (ddg_all_sccs_ptr, int * result);
2243 static int order_nodes_in_scc (ddg_ptr, sbitmap, sbitmap, int*, int);
2244 static nopa calculate_order_params (ddg_ptr, int, int *);
2245 static int find_max_asap (ddg_ptr, sbitmap);
2246 static int find_max_hv_min_mob (ddg_ptr, sbitmap);
2247 static int find_max_dv_min_mob (ddg_ptr, sbitmap);
2249 enum sms_direction {BOTTOMUP, TOPDOWN};
2251 struct node_order_params
2258 /* Check if NODE_ORDER contains a permutation of 0 .. NUM_NODES-1. */
2260 check_nodes_order (int *node_order, int num_nodes)
2263 sbitmap tmp = sbitmap_alloc (num_nodes);
2268 fprintf (dump_file, "SMS final nodes order: \n");
2270 for (i = 0; i < num_nodes; i++)
2272 int u = node_order[i];
2275 fprintf (dump_file, "%d ", u);
2276 gcc_assert (u < num_nodes && u >= 0 && !TEST_BIT (tmp, u));
2282 fprintf (dump_file, "\n");
2287 /* Order the nodes of G for scheduling and pass the result in
2288 NODE_ORDER. Also set aux.count of each node to ASAP.
2289 Put maximal ASAP to PMAX_ASAP. Return the recMII for the given DDG. */
2291 sms_order_nodes (ddg_ptr g, int mii, int * node_order, int *pmax_asap)
2295 ddg_all_sccs_ptr sccs = create_ddg_all_sccs (g);
2297 nopa nops = calculate_order_params (g, mii, pmax_asap);
2300 print_sccs (dump_file, sccs, g);
2302 order_nodes_of_sccs (sccs, node_order);
2304 if (sccs->num_sccs > 0)
2305 /* First SCC has the largest recurrence_length. */
2306 rec_mii = sccs->sccs[0]->recurrence_length;
2308 /* Save ASAP before destroying node_order_params. */
2309 for (i = 0; i < g->num_nodes; i++)
2311 ddg_node_ptr v = &g->nodes[i];
2312 v->aux.count = ASAP (v);
2316 free_ddg_all_sccs (sccs);
2317 check_nodes_order (node_order, g->num_nodes);
2323 order_nodes_of_sccs (ddg_all_sccs_ptr all_sccs, int * node_order)
2326 ddg_ptr g = all_sccs->ddg;
2327 int num_nodes = g->num_nodes;
2328 sbitmap prev_sccs = sbitmap_alloc (num_nodes);
2329 sbitmap on_path = sbitmap_alloc (num_nodes);
2330 sbitmap tmp = sbitmap_alloc (num_nodes);
2331 sbitmap ones = sbitmap_alloc (num_nodes);
2333 sbitmap_zero (prev_sccs);
2334 sbitmap_ones (ones);
2336 /* Perform the node ordering starting from the SCC with the highest recMII.
2337 For each SCC order the nodes according to their ASAP/ALAP/HEIGHT etc. */
2338 for (i = 0; i < all_sccs->num_sccs; i++)
2340 ddg_scc_ptr scc = all_sccs->sccs[i];
2342 /* Add nodes on paths from previous SCCs to the current SCC. */
2343 find_nodes_on_paths (on_path, g, prev_sccs, scc->nodes);
2344 sbitmap_a_or_b (tmp, scc->nodes, on_path);
2346 /* Add nodes on paths from the current SCC to previous SCCs. */
2347 find_nodes_on_paths (on_path, g, scc->nodes, prev_sccs);
2348 sbitmap_a_or_b (tmp, tmp, on_path);
2350 /* Remove nodes of previous SCCs from current extended SCC. */
2351 sbitmap_difference (tmp, tmp, prev_sccs);
2353 pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
2354 /* Above call to order_nodes_in_scc updated prev_sccs |= tmp. */
2357 /* Handle the remaining nodes that do not belong to any scc. Each call
2358 to order_nodes_in_scc handles a single connected component. */
2359 while (pos < g->num_nodes)
2361 sbitmap_difference (tmp, ones, prev_sccs);
2362 pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
2364 sbitmap_free (prev_sccs);
2365 sbitmap_free (on_path);
2367 sbitmap_free (ones);
2370 /* MII is needed if we consider backarcs (that do not close recursive cycles). */
2371 static struct node_order_params *
2372 calculate_order_params (ddg_ptr g, int mii ATTRIBUTE_UNUSED, int *pmax_asap)
2376 int num_nodes = g->num_nodes;
2378 /* Allocate a place to hold ordering params for each node in the DDG. */
2379 nopa node_order_params_arr;
2381 /* Initialize of ASAP/ALAP/HEIGHT to zero. */
2382 node_order_params_arr = (nopa) xcalloc (num_nodes,
2383 sizeof (struct node_order_params));
2385 /* Set the aux pointer of each node to point to its order_params structure. */
2386 for (u = 0; u < num_nodes; u++)
2387 g->nodes[u].aux.info = &node_order_params_arr[u];
2389 /* Disregarding a backarc from each recursive cycle to obtain a DAG,
2390 calculate ASAP, ALAP, mobility, distance, and height for each node
2391 in the dependence (direct acyclic) graph. */
2393 /* We assume that the nodes in the array are in topological order. */
2396 for (u = 0; u < num_nodes; u++)
2398 ddg_node_ptr u_node = &g->nodes[u];
2401 for (e = u_node->in; e; e = e->next_in)
2402 if (e->distance == 0)
2403 ASAP (u_node) = MAX (ASAP (u_node),
2404 ASAP (e->src) + e->latency);
2405 max_asap = MAX (max_asap, ASAP (u_node));
2408 for (u = num_nodes - 1; u > -1; u--)
2410 ddg_node_ptr u_node = &g->nodes[u];
2412 ALAP (u_node) = max_asap;
2413 HEIGHT (u_node) = 0;
2414 for (e = u_node->out; e; e = e->next_out)
2415 if (e->distance == 0)
2417 ALAP (u_node) = MIN (ALAP (u_node),
2418 ALAP (e->dest) - e->latency);
2419 HEIGHT (u_node) = MAX (HEIGHT (u_node),
2420 HEIGHT (e->dest) + e->latency);
2425 fprintf (dump_file, "\nOrder params\n");
2426 for (u = 0; u < num_nodes; u++)
2428 ddg_node_ptr u_node = &g->nodes[u];
2430 fprintf (dump_file, "node %d, ASAP: %d, ALAP: %d, HEIGHT: %d\n", u,
2431 ASAP (u_node), ALAP (u_node), HEIGHT (u_node));
2435 *pmax_asap = max_asap;
2436 return node_order_params_arr;
2440 find_max_asap (ddg_ptr g, sbitmap nodes)
2445 sbitmap_iterator sbi;
2447 EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
2449 ddg_node_ptr u_node = &g->nodes[u];
2451 if (max_asap < ASAP (u_node))
2453 max_asap = ASAP (u_node);
2461 find_max_hv_min_mob (ddg_ptr g, sbitmap nodes)
2465 int min_mob = INT_MAX;
2467 sbitmap_iterator sbi;
2469 EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
2471 ddg_node_ptr u_node = &g->nodes[u];
2473 if (max_hv < HEIGHT (u_node))
2475 max_hv = HEIGHT (u_node);
2476 min_mob = MOB (u_node);
2479 else if ((max_hv == HEIGHT (u_node))
2480 && (min_mob > MOB (u_node)))
2482 min_mob = MOB (u_node);
2490 find_max_dv_min_mob (ddg_ptr g, sbitmap nodes)
2494 int min_mob = INT_MAX;
2496 sbitmap_iterator sbi;
2498 EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
2500 ddg_node_ptr u_node = &g->nodes[u];
2502 if (max_dv < DEPTH (u_node))
2504 max_dv = DEPTH (u_node);
2505 min_mob = MOB (u_node);
2508 else if ((max_dv == DEPTH (u_node))
2509 && (min_mob > MOB (u_node)))
2511 min_mob = MOB (u_node);
2518 /* Places the nodes of SCC into the NODE_ORDER array starting
2519 at position POS, according to the SMS ordering algorithm.
2520 NODES_ORDERED (in&out parameter) holds the bitset of all nodes in
2521 the NODE_ORDER array, starting from position zero. */
2523 order_nodes_in_scc (ddg_ptr g, sbitmap nodes_ordered, sbitmap scc,
2524 int * node_order, int pos)
2526 enum sms_direction dir;
2527 int num_nodes = g->num_nodes;
2528 sbitmap workset = sbitmap_alloc (num_nodes);
2529 sbitmap tmp = sbitmap_alloc (num_nodes);
2530 sbitmap zero_bitmap = sbitmap_alloc (num_nodes);
2531 sbitmap predecessors = sbitmap_alloc (num_nodes);
2532 sbitmap successors = sbitmap_alloc (num_nodes);
2534 sbitmap_zero (predecessors);
2535 find_predecessors (predecessors, g, nodes_ordered);
2537 sbitmap_zero (successors);
2538 find_successors (successors, g, nodes_ordered);
2541 if (sbitmap_a_and_b_cg (tmp, predecessors, scc))
2543 sbitmap_copy (workset, tmp);
2546 else if (sbitmap_a_and_b_cg (tmp, successors, scc))
2548 sbitmap_copy (workset, tmp);
2555 sbitmap_zero (workset);
2556 if ((u = find_max_asap (g, scc)) >= 0)
2557 SET_BIT (workset, u);
2561 sbitmap_zero (zero_bitmap);
2562 while (!sbitmap_equal (workset, zero_bitmap))
2565 ddg_node_ptr v_node;
2566 sbitmap v_node_preds;
2567 sbitmap v_node_succs;
2571 while (!sbitmap_equal (workset, zero_bitmap))
2573 v = find_max_hv_min_mob (g, workset);
2574 v_node = &g->nodes[v];
2575 node_order[pos++] = v;
2576 v_node_succs = NODE_SUCCESSORS (v_node);
2577 sbitmap_a_and_b (tmp, v_node_succs, scc);
2579 /* Don't consider the already ordered successors again. */
2580 sbitmap_difference (tmp, tmp, nodes_ordered);
2581 sbitmap_a_or_b (workset, workset, tmp);
2582 RESET_BIT (workset, v);
2583 SET_BIT (nodes_ordered, v);
2586 sbitmap_zero (predecessors);
2587 find_predecessors (predecessors, g, nodes_ordered);
2588 sbitmap_a_and_b (workset, predecessors, scc);
2592 while (!sbitmap_equal (workset, zero_bitmap))
2594 v = find_max_dv_min_mob (g, workset);
2595 v_node = &g->nodes[v];
2596 node_order[pos++] = v;
2597 v_node_preds = NODE_PREDECESSORS (v_node);
2598 sbitmap_a_and_b (tmp, v_node_preds, scc);
2600 /* Don't consider the already ordered predecessors again. */
2601 sbitmap_difference (tmp, tmp, nodes_ordered);
2602 sbitmap_a_or_b (workset, workset, tmp);
2603 RESET_BIT (workset, v);
2604 SET_BIT (nodes_ordered, v);
2607 sbitmap_zero (successors);
2608 find_successors (successors, g, nodes_ordered);
2609 sbitmap_a_and_b (workset, successors, scc);
2613 sbitmap_free (workset);
2614 sbitmap_free (zero_bitmap);
2615 sbitmap_free (predecessors);
2616 sbitmap_free (successors);
2621 /* This page contains functions for manipulating partial-schedules during
2622 modulo scheduling. */
2624 /* Create a partial schedule and allocate a memory to hold II rows. */
2626 static partial_schedule_ptr
2627 create_partial_schedule (int ii, ddg_ptr g, int history)
2629 partial_schedule_ptr ps = XNEW (struct partial_schedule);
2630 ps->rows = (ps_insn_ptr *) xcalloc (ii, sizeof (ps_insn_ptr));
2631 ps->rows_length = (int *) xcalloc (ii, sizeof (int));
2633 ps->history = history;
2634 ps->min_cycle = INT_MAX;
2635 ps->max_cycle = INT_MIN;
2641 /* Free the PS_INSNs in rows array of the given partial schedule.
2642 ??? Consider caching the PS_INSN's. */
2644 free_ps_insns (partial_schedule_ptr ps)
2648 for (i = 0; i < ps->ii; i++)
2652 ps_insn_ptr ps_insn = ps->rows[i]->next_in_row;
2655 ps->rows[i] = ps_insn;
2661 /* Free all the memory allocated to the partial schedule. */
2664 free_partial_schedule (partial_schedule_ptr ps)
2670 free (ps->rows_length);
2674 /* Clear the rows array with its PS_INSNs, and create a new one with
2678 reset_partial_schedule (partial_schedule_ptr ps, int new_ii)
2683 if (new_ii == ps->ii)
2685 ps->rows = (ps_insn_ptr *) xrealloc (ps->rows, new_ii
2686 * sizeof (ps_insn_ptr));
2687 memset (ps->rows, 0, new_ii * sizeof (ps_insn_ptr));
2688 ps->rows_length = (int *) xrealloc (ps->rows_length, new_ii * sizeof (int));
2689 memset (ps->rows_length, 0, new_ii * sizeof (int));
2691 ps->min_cycle = INT_MAX;
2692 ps->max_cycle = INT_MIN;
2695 /* Prints the partial schedule as an ii rows array, for each rows
2696 print the ids of the insns in it. */
2698 print_partial_schedule (partial_schedule_ptr ps, FILE *dump)
2702 for (i = 0; i < ps->ii; i++)
2704 ps_insn_ptr ps_i = ps->rows[i];
2706 fprintf (dump, "\n[ROW %d ]: ", i);
2709 if (JUMP_P (ps_i->node->insn))
2710 fprintf (dump, "%d (branch), ",
2711 INSN_UID (ps_i->node->insn));
2713 fprintf (dump, "%d, ",
2714 INSN_UID (ps_i->node->insn));
2716 ps_i = ps_i->next_in_row;
2721 /* Creates an object of PS_INSN and initializes it to the given parameters. */
2723 create_ps_insn (ddg_node_ptr node, int cycle)
2725 ps_insn_ptr ps_i = XNEW (struct ps_insn);
2728 ps_i->next_in_row = NULL;
2729 ps_i->prev_in_row = NULL;
2730 ps_i->cycle = cycle;
2736 /* Removes the given PS_INSN from the partial schedule. */
2738 remove_node_from_ps (partial_schedule_ptr ps, ps_insn_ptr ps_i)
2742 gcc_assert (ps && ps_i);
2744 row = SMODULO (ps_i->cycle, ps->ii);
2745 if (! ps_i->prev_in_row)
2747 gcc_assert (ps_i == ps->rows[row]);
2748 ps->rows[row] = ps_i->next_in_row;
2750 ps->rows[row]->prev_in_row = NULL;
2754 ps_i->prev_in_row->next_in_row = ps_i->next_in_row;
2755 if (ps_i->next_in_row)
2756 ps_i->next_in_row->prev_in_row = ps_i->prev_in_row;
2759 ps->rows_length[row] -= 1;
2764 /* Unlike what literature describes for modulo scheduling (which focuses
2765 on VLIW machines) the order of the instructions inside a cycle is
2766 important. Given the bitmaps MUST_FOLLOW and MUST_PRECEDE we know
2767 where the current instruction should go relative to the already
2768 scheduled instructions in the given cycle. Go over these
2769 instructions and find the first possible column to put it in. */
2771 ps_insn_find_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
2772 sbitmap must_precede, sbitmap must_follow)
2774 ps_insn_ptr next_ps_i;
2775 ps_insn_ptr first_must_follow = NULL;
2776 ps_insn_ptr last_must_precede = NULL;
2777 ps_insn_ptr last_in_row = NULL;
2783 row = SMODULO (ps_i->cycle, ps->ii);
2785 /* Find the first must follow and the last must precede
2786 and insert the node immediately after the must precede
2787 but make sure that it there is no must follow after it. */
2788 for (next_ps_i = ps->rows[row];
2790 next_ps_i = next_ps_i->next_in_row)
2792 if (must_follow && TEST_BIT (must_follow, next_ps_i->node->cuid)
2793 && ! first_must_follow)
2794 first_must_follow = next_ps_i;
2795 if (must_precede && TEST_BIT (must_precede, next_ps_i->node->cuid))
2797 /* If we have already met a node that must follow, then
2798 there is no possible column. */
2799 if (first_must_follow)
2802 last_must_precede = next_ps_i;
2804 /* The closing branch must be the last in the row. */
2806 && TEST_BIT (must_precede, next_ps_i->node->cuid)
2807 && JUMP_P (next_ps_i->node->insn))
2810 last_in_row = next_ps_i;
2813 /* The closing branch is scheduled as well. Make sure there is no
2814 dependent instruction after it as the branch should be the last
2815 instruction in the row. */
2816 if (JUMP_P (ps_i->node->insn))
2818 if (first_must_follow)
2822 /* Make the branch the last in the row. New instructions
2823 will be inserted at the beginning of the row or after the
2824 last must_precede instruction thus the branch is guaranteed
2825 to remain the last instruction in the row. */
2826 last_in_row->next_in_row = ps_i;
2827 ps_i->prev_in_row = last_in_row;
2828 ps_i->next_in_row = NULL;
2831 ps->rows[row] = ps_i;
2835 /* Now insert the node after INSERT_AFTER_PSI. */
2837 if (! last_must_precede)
2839 ps_i->next_in_row = ps->rows[row];
2840 ps_i->prev_in_row = NULL;
2841 if (ps_i->next_in_row)
2842 ps_i->next_in_row->prev_in_row = ps_i;
2843 ps->rows[row] = ps_i;
2847 ps_i->next_in_row = last_must_precede->next_in_row;
2848 last_must_precede->next_in_row = ps_i;
2849 ps_i->prev_in_row = last_must_precede;
2850 if (ps_i->next_in_row)
2851 ps_i->next_in_row->prev_in_row = ps_i;
2857 /* Advances the PS_INSN one column in its current row; returns false
2858 in failure and true in success. Bit N is set in MUST_FOLLOW if
2859 the node with cuid N must be come after the node pointed to by
2860 PS_I when scheduled in the same cycle. */
2862 ps_insn_advance_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
2863 sbitmap must_follow)
2865 ps_insn_ptr prev, next;
2867 ddg_node_ptr next_node;
2872 row = SMODULO (ps_i->cycle, ps->ii);
2874 if (! ps_i->next_in_row)
2877 next_node = ps_i->next_in_row->node;
2879 /* Check if next_in_row is dependent on ps_i, both having same sched
2880 times (typically ANTI_DEP). If so, ps_i cannot skip over it. */
2881 if (must_follow && TEST_BIT (must_follow, next_node->cuid))
2884 /* Advance PS_I over its next_in_row in the doubly linked list. */
2885 prev = ps_i->prev_in_row;
2886 next = ps_i->next_in_row;
2888 if (ps_i == ps->rows[row])
2889 ps->rows[row] = next;
2891 ps_i->next_in_row = next->next_in_row;
2893 if (next->next_in_row)
2894 next->next_in_row->prev_in_row = ps_i;
2896 next->next_in_row = ps_i;
2897 ps_i->prev_in_row = next;
2899 next->prev_in_row = prev;
2901 prev->next_in_row = next;
2906 /* Inserts a DDG_NODE to the given partial schedule at the given cycle.
2907 Returns 0 if this is not possible and a PS_INSN otherwise. Bit N is
2908 set in MUST_PRECEDE/MUST_FOLLOW if the node with cuid N must be come
2909 before/after (respectively) the node pointed to by PS_I when scheduled
2910 in the same cycle. */
2912 add_node_to_ps (partial_schedule_ptr ps, ddg_node_ptr node, int cycle,
2913 sbitmap must_precede, sbitmap must_follow)
2916 int row = SMODULO (cycle, ps->ii);
2918 if (ps->rows_length[row] >= issue_rate)
2921 ps_i = create_ps_insn (node, cycle);
2923 /* Finds and inserts PS_I according to MUST_FOLLOW and
2925 if (! ps_insn_find_column (ps, ps_i, must_precede, must_follow))
2931 ps->rows_length[row] += 1;
2935 /* Advance time one cycle. Assumes DFA is being used. */
2937 advance_one_cycle (void)
2939 if (targetm.sched.dfa_pre_cycle_insn)
2940 state_transition (curr_state,
2941 targetm.sched.dfa_pre_cycle_insn ());
2943 state_transition (curr_state, NULL);
2945 if (targetm.sched.dfa_post_cycle_insn)
2946 state_transition (curr_state,
2947 targetm.sched.dfa_post_cycle_insn ());
2952 /* Checks if PS has resource conflicts according to DFA, starting from
2953 FROM cycle to TO cycle; returns true if there are conflicts and false
2954 if there are no conflicts. Assumes DFA is being used. */
2956 ps_has_conflicts (partial_schedule_ptr ps, int from, int to)
2960 state_reset (curr_state);
2962 for (cycle = from; cycle <= to; cycle++)
2964 ps_insn_ptr crr_insn;
2965 /* Holds the remaining issue slots in the current row. */
2966 int can_issue_more = issue_rate;
2968 /* Walk through the DFA for the current row. */
2969 for (crr_insn = ps->rows[SMODULO (cycle, ps->ii)];
2971 crr_insn = crr_insn->next_in_row)
2973 rtx insn = crr_insn->node->insn;
2975 if (!NONDEBUG_INSN_P (insn))
2978 /* Check if there is room for the current insn. */
2979 if (!can_issue_more || state_dead_lock_p (curr_state))
2982 /* Update the DFA state and return with failure if the DFA found
2983 resource conflicts. */
2984 if (state_transition (curr_state, insn) >= 0)
2987 if (targetm.sched.variable_issue)
2989 targetm.sched.variable_issue (sched_dump, sched_verbose,
2990 insn, can_issue_more);
2991 /* A naked CLOBBER or USE generates no instruction, so don't
2992 let them consume issue slots. */
2993 else if (GET_CODE (PATTERN (insn)) != USE
2994 && GET_CODE (PATTERN (insn)) != CLOBBER)
2998 /* Advance the DFA to the next cycle. */
2999 advance_one_cycle ();
3004 /* Checks if the given node causes resource conflicts when added to PS at
3005 cycle C. If not the node is added to PS and returned; otherwise zero
3006 is returned. Bit N is set in MUST_PRECEDE/MUST_FOLLOW if the node with
3007 cuid N must be come before/after (respectively) the node pointed to by
3008 PS_I when scheduled in the same cycle. */
3010 ps_add_node_check_conflicts (partial_schedule_ptr ps, ddg_node_ptr n,
3011 int c, sbitmap must_precede,
3012 sbitmap must_follow)
3014 int has_conflicts = 0;
3017 /* First add the node to the PS, if this succeeds check for
3018 conflicts, trying different issue slots in the same row. */
3019 if (! (ps_i = add_node_to_ps (ps, n, c, must_precede, must_follow)))
3020 return NULL; /* Failed to insert the node at the given cycle. */
3022 has_conflicts = ps_has_conflicts (ps, c, c)
3024 && ps_has_conflicts (ps,
3028 /* Try different issue slots to find one that the given node can be
3029 scheduled in without conflicts. */
3030 while (has_conflicts)
3032 if (! ps_insn_advance_column (ps, ps_i, must_follow))
3034 has_conflicts = ps_has_conflicts (ps, c, c)
3036 && ps_has_conflicts (ps,
3043 remove_node_from_ps (ps, ps_i);
3047 ps->min_cycle = MIN (ps->min_cycle, c);
3048 ps->max_cycle = MAX (ps->max_cycle, c);
3052 /* Calculate the stage count of the partial schedule PS. The calculation
3053 takes into account the rotation amount passed in ROTATION_AMOUNT. */
3055 calculate_stage_count (partial_schedule_ptr ps, int rotation_amount)
3057 int new_min_cycle = PS_MIN_CYCLE (ps) - rotation_amount;
3058 int new_max_cycle = PS_MAX_CYCLE (ps) - rotation_amount;
3059 int stage_count = CALC_STAGE_COUNT (-1, new_min_cycle, ps->ii);
3061 /* The calculation of stage count is done adding the number of stages
3062 before cycle zero and after cycle zero. */
3063 stage_count += CALC_STAGE_COUNT (new_max_cycle, 0, ps->ii);
3068 /* Rotate the rows of PS such that insns scheduled at time
3069 START_CYCLE will appear in row 0. Updates max/min_cycles. */
3071 rotate_partial_schedule (partial_schedule_ptr ps, int start_cycle)
3073 int i, row, backward_rotates;
3074 int last_row = ps->ii - 1;
3076 if (start_cycle == 0)
3079 backward_rotates = SMODULO (start_cycle, ps->ii);
3081 /* Revisit later and optimize this into a single loop. */
3082 for (i = 0; i < backward_rotates; i++)
3084 ps_insn_ptr first_row = ps->rows[0];
3085 int first_row_length = ps->rows_length[0];
3087 for (row = 0; row < last_row; row++)
3089 ps->rows[row] = ps->rows[row + 1];
3090 ps->rows_length[row] = ps->rows_length[row + 1];
3093 ps->rows[last_row] = first_row;
3094 ps->rows_length[last_row] = first_row_length;
3097 ps->max_cycle -= start_cycle;
3098 ps->min_cycle -= start_cycle;
3101 #endif /* INSN_SCHEDULING */
3104 gate_handle_sms (void)
3106 return (optimize > 0 && flag_modulo_sched);
3110 /* Run instruction scheduler. */
3111 /* Perform SMS module scheduling. */
3113 rest_of_handle_sms (void)
3115 #ifdef INSN_SCHEDULING
3118 /* Collect loop information to be used in SMS. */
3119 cfg_layout_initialize (0);
3122 /* Update the life information, because we add pseudos. */
3123 max_regno = max_reg_num ();
3125 /* Finalize layout changes. */
3127 if (bb->next_bb != EXIT_BLOCK_PTR)
3128 bb->aux = bb->next_bb;
3129 free_dominance_info (CDI_DOMINATORS);
3130 cfg_layout_finalize ();
3131 #endif /* INSN_SCHEDULING */
3135 struct rtl_opt_pass pass_sms =
3140 gate_handle_sms, /* gate */
3141 rest_of_handle_sms, /* execute */
3144 0, /* static_pass_number */
3146 0, /* properties_required */
3147 0, /* properties_provided */
3148 0, /* properties_destroyed */
3149 0, /* todo_flags_start */
3152 | TODO_verify_rtl_sharing
3153 | TODO_ggc_collect /* todo_flags_finish */