gcc/modulo-sched.c

   1 /* Swing Modulo Scheduling implementation.
   2    Copyright (C) 2004, 2005
   3    Free Software Foundation, Inc.
   4    Contributed by Ayal Zaks and Mustafa Hagog <zaks,mustafa@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 2, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING.  If not, write to the Free
  20 Software Foundation, 59 Temple Place - Suite 330, Boston, MA
  21 02111-1307, USA.  */
  22
  23
  24 #include "config.h"
  25 #include "system.h"
  26 #include "coretypes.h"
  27 #include "tm.h"
  28 #include "toplev.h"
  29 #include "rtl.h"
  30 #include "tm_p.h"
  31 #include "hard-reg-set.h"
  32 #include "regs.h"
  33 #include "function.h"
  34 #include "flags.h"
  35 #include "insn-config.h"
  36 #include "insn-attr.h"
  37 #include "except.h"
  38 #include "toplev.h"
  39 #include "recog.h"
  40 #include "sched-int.h"
  41 #include "target.h"
  42 #include "cfglayout.h"
  43 #include "cfgloop.h"
  44 #include "cfghooks.h"
  45 #include "expr.h"
  46 #include "params.h"
  47 #include "gcov-io.h"
  48 #include "df.h"
  49 #include "ddg.h"
  50
  51 #ifdef INSN_SCHEDULING
  52
  53 /* This file contains the implementation of the Swing Modulo Scheduler,
  54    described in the following references:
  55    [1] J. Llosa, A. Gonzalez, E. Ayguade, M. Valero., and J. Eckhardt.
  56        Lifetime--sensitive modulo scheduling in a production environment.
  57        IEEE Trans. on Comps., 50(3), March 2001
  58    [2] J. Llosa, A. Gonzalez, E. Ayguade, and M. Valero.
  59        Swing Modulo Scheduling: A Lifetime Sensitive Approach.
  60        PACT '96 , pages 80-87, October 1996 (Boston - Massachusetts - USA).
  61
  62    The basic structure is:
  63    1. Build a data-dependence graph (DDG) for each loop.
  64    2. Use the DDG to order the insns of a loop (not in topological order
  65       necessarily, but rather) trying to place each insn after all its
  66       predecessors _or_ after all its successors.
  67    3. Compute MII: a lower bound on the number of cycles to schedule the loop.
  68    4. Use the ordering to perform list-scheduling of the loop:
  69       1. Set II = MII.  We will try to schedule the loop within II cycles.
  70       2. Try to schedule the insns one by one according to the ordering.
  71          For each insn compute an interval of cycles by considering already-
  72          scheduled preds and succs (and associated latencies); try to place
  73          the insn in the cycles of this window checking for potential
  74          resource conflicts (using the DFA interface).
  75          Note: this is different from the cycle-scheduling of schedule_insns;
  76          here the insns are not scheduled monotonically top-down (nor bottom-
  77          up).
  78       3. If failed in scheduling all insns - bump II++ and try again, unless
  79          II reaches an upper bound MaxII, in which case report failure.
  80    5. If we succeeded in scheduling the loop within II cycles, we now
  81       generate prolog and epilog, decrease the counter of the loop, and
  82       perform modulo variable expansion for live ranges that span more than
  83       II cycles (i.e. use register copies to prevent a def from overwriting
  84       itself before reaching the use).
  85 */
  86
  87 \f
  88 /* This page defines partial-schedule structures and functions for
  89    modulo scheduling.  */
  90
  91 typedef struct partial_schedule *partial_schedule_ptr;
  92 typedef struct ps_insn *ps_insn_ptr;
  93
  94 /* The minimum (absolute) cycle that a node of ps was scheduled in.  */
  95 #define PS_MIN_CYCLE(ps) (((partial_schedule_ptr)(ps))->min_cycle)
  96
  97 /* The maximum (absolute) cycle that a node of ps was scheduled in.  */
  98 #define PS_MAX_CYCLE(ps) (((partial_schedule_ptr)(ps))->max_cycle)
  99
 100 /* Perform signed modulo, always returning a non-negative value.  */
 101 #define SMODULO(x,y) ((x) % (y) < 0 ? ((x) % (y) + (y)) : (x) % (y))
 102
 103 /* The number of different iterations the nodes in ps span, assuming
 104    the stage boundaries are placed efficiently.  */
 105 #define PS_STAGE_COUNT(ps) ((PS_MAX_CYCLE (ps) - PS_MIN_CYCLE (ps) \
 106                              + 1 + (ps)->ii - 1) / (ps)->ii)
 107
 108 /* A single instruction in the partial schedule.  */
 109 struct ps_insn
 110 {
 111   /* The corresponding DDG_NODE.  */
 112   ddg_node_ptr node;
 113
 114   /* The (absolute) cycle in which the PS instruction is scheduled.
 115      Same as SCHED_TIME (node).  */
 116   int cycle;
 117
 118   /* The next/prev PS_INSN in the same row.  */
 119   ps_insn_ptr next_in_row,
 120               prev_in_row;
 121
 122   /* The number of nodes in the same row that come after this node.  */
 123   int row_rest_count;
 124 };
 125
 126 /* Holds the partial schedule as an array of II rows.  Each entry of the
 127    array points to a linked list of PS_INSNs, which represents the
 128    instructions that are scheduled for that row.  */
 129 struct partial_schedule
 130 {
 131   int ii;       /* Number of rows in the partial schedule.  */
 132   int history;  /* Threshold for conflict checking using DFA.  */
 133
 134   /* rows[i] points to linked list of insns scheduled in row i (0<=i<ii).  */
 135   ps_insn_ptr *rows;
 136
 137   /* The earliest absolute cycle of an insn in the partial schedule.  */
 138   int min_cycle;
 139
 140   /* The latest absolute cycle of an insn in the partial schedule.  */
 141   int max_cycle;
 142
 143   ddg_ptr g;    /* The DDG of the insns in the partial schedule.  */
 144 };
 145
 146 /* We use this to record all the register replacements we do in
 147    the kernel so we can undo SMS if it is not profitable.  */
 148 struct undo_replace_buff_elem
 149 {
 150   rtx insn;
 151   rtx orig_reg;
 152   rtx new_reg;
 153   struct undo_replace_buff_elem *next;
 154 };
 155
 156
 157
 158 partial_schedule_ptr create_partial_schedule (int ii, ddg_ptr, int history);
 159 void free_partial_schedule (partial_schedule_ptr);
 160 void reset_partial_schedule (partial_schedule_ptr, int new_ii);
 161 void print_partial_schedule (partial_schedule_ptr, FILE *);
 162 static int kernel_number_of_cycles (rtx first_insn, rtx last_insn);
 163 static ps_insn_ptr ps_add_node_check_conflicts (partial_schedule_ptr,
 164                                                 ddg_node_ptr node, int cycle,
 165                                                 sbitmap must_precede,
 166                                                 sbitmap must_follow);
 167 static void rotate_partial_schedule (partial_schedule_ptr, int);
 168 void set_row_column_for_ps (partial_schedule_ptr);
 169 static bool ps_unschedule_node (partial_schedule_ptr, ddg_node_ptr );
 170
 171 \f
 172 /* This page defines constants and structures for the modulo scheduling
 173    driver.  */
 174
 175 /* As in haifa-sched.c:  */
 176 /* issue_rate is the number of insns that can be scheduled in the same
 177    machine cycle.  It can be defined in the config/mach/mach.h file,
 178    otherwise we set it to 1.  */
 179
 180 static int issue_rate;
 181
 182 /* For printing statistics.  */
 183 static FILE *stats_file;
 184
 185 static int sms_order_nodes (ddg_ptr, int, int * result);
 186 static void set_node_sched_params (ddg_ptr);
 187 static partial_schedule_ptr sms_schedule_by_order (ddg_ptr, int, int,
 188                                                    int *, FILE*);
 189 static void permute_partial_schedule (partial_schedule_ptr ps, rtx last);
 190 static void generate_prolog_epilog (partial_schedule_ptr ,struct loop * loop, rtx);
 191 static void duplicate_insns_of_cycles (partial_schedule_ptr ps,
 192                                        int from_stage, int to_stage,
 193                                        int is_prolog);
 194
 195 #define SCHED_ASAP(x) (((node_sched_params_ptr)(x)->aux.info)->asap)
 196 #define SCHED_TIME(x) (((node_sched_params_ptr)(x)->aux.info)->time)
 197 #define SCHED_FIRST_REG_MOVE(x) \
 198         (((node_sched_params_ptr)(x)->aux.info)->first_reg_move)
 199 #define SCHED_NREG_MOVES(x) \
 200         (((node_sched_params_ptr)(x)->aux.info)->nreg_moves)
 201 #define SCHED_ROW(x) (((node_sched_params_ptr)(x)->aux.info)->row)
 202 #define SCHED_STAGE(x) (((node_sched_params_ptr)(x)->aux.info)->stage)
 203 #define SCHED_COLUMN(x) (((node_sched_params_ptr)(x)->aux.info)->column)
 204
 205 /* The scheduling parameters held for each node.  */
 206 typedef struct node_sched_params
 207 {
 208   int asap;     /* A lower-bound on the absolute scheduling cycle.  */
 209   int time;     /* The absolute scheduling cycle (time >= asap).  */
 210
 211   /* The following field (first_reg_move) is a pointer to the first
 212      register-move instruction added to handle the modulo-variable-expansion
 213      of the register defined by this node.  This register-move copies the
 214      original register defined by the node.  */
 215   rtx first_reg_move;
 216
 217   /* The number of register-move instructions added, immediately preceding
 218      first_reg_move.  */
 219   int nreg_moves;
 220
 221   int row;    /* Holds time % ii.  */
 222   int stage;  /* Holds time / ii.  */
 223
 224   /* The column of a node inside the ps.  If nodes u, v are on the same row,
 225      u will precede v if column (u) < column (v).  */
 226   int column;
 227 } *node_sched_params_ptr;
 228
 229 \f
 230 /* The following three functions are copied from the current scheduler
 231    code in order to use sched_analyze() for computing the dependencies.
 232    They are used when initializing the sched_info structure.  */
 233 static const char *
 234 sms_print_insn (rtx insn, int aligned ATTRIBUTE_UNUSED)
 235 {
 236   static char tmp[80];
 237
 238   sprintf (tmp, "i%4d", INSN_UID (insn));
 239   return tmp;
 240 }
 241
 242 static int
 243 contributes_to_priority (rtx next, rtx insn)
 244 {
 245   return BLOCK_NUM (next) == BLOCK_NUM (insn);
 246 }
 247
 248 static void
 249 compute_jump_reg_dependencies (rtx insn ATTRIBUTE_UNUSED,
 250                                regset cond_exec ATTRIBUTE_UNUSED,
 251                                regset used ATTRIBUTE_UNUSED,
 252                                regset set ATTRIBUTE_UNUSED)
 253 {
 254 }
 255
 256 static struct sched_info sms_sched_info =
 257 {
 258   NULL,
 259   NULL,
 260   NULL,
 261   NULL,
 262   NULL,
 263   sms_print_insn,
 264   contributes_to_priority,
 265   compute_jump_reg_dependencies,
 266   NULL, NULL,
 267   NULL, NULL,
 268   0, 0, 0
 269 };
 270
 271
 272 /* Return the register decremented and tested in INSN,
 273    or zero if it is not a decrement-and-branch insn.  */
 274
 275 static rtx
 276 doloop_register_get (rtx insn ATTRIBUTE_UNUSED)
 277 {
 278 #ifdef HAVE_doloop_end
 279   rtx pattern, reg, condition;
 280
 281   if (! JUMP_P (insn))
 282     return NULL_RTX;
 283
 284   pattern = PATTERN (insn);
 285   condition = doloop_condition_get (pattern);
 286   if (! condition)
 287     return NULL_RTX;
 288
 289   if (REG_P (XEXP (condition, 0)))
 290     reg = XEXP (condition, 0);
 291   else if (GET_CODE (XEXP (condition, 0)) == PLUS
 292            && REG_P (XEXP (XEXP (condition, 0), 0)))
 293     reg = XEXP (XEXP (condition, 0), 0);
 294   else
 295     gcc_unreachable ();
 296
 297   return reg;
 298 #else
 299   return NULL_RTX;
 300 #endif
 301 }
 302
 303 /* Check if COUNT_REG is set to a constant in the PRE_HEADER block, so
 304    that the number of iterations is a compile-time constant.  If so,
 305    return the rtx that sets COUNT_REG to a constant, and set COUNT to
 306    this constant.  Otherwise return 0.  */
 307 static rtx
 308 const_iteration_count (rtx count_reg, basic_block pre_header,
 309                        HOST_WIDEST_INT * count)
 310 {
 311   rtx insn;
 312   rtx head, tail;
 313
 314   if (! pre_header)
 315     return NULL_RTX;
 316
 317   get_block_head_tail (pre_header->index, &head, &tail);
 318
 319   for (insn = tail; insn != PREV_INSN (head); insn = PREV_INSN (insn))
 320     if (INSN_P (insn) && single_set (insn) &&
 321         rtx_equal_p (count_reg, SET_DEST (single_set (insn))))
 322       {
 323         rtx pat = single_set (insn);
 324
 325         if (GET_CODE (SET_SRC (pat)) == CONST_INT)
 326           {
 327             *count = INTVAL (SET_SRC (pat));
 328             return insn;
 329           }
 330
 331         return NULL_RTX;
 332       }
 333
 334   return NULL_RTX;
 335 }
 336
 337 /* A very simple resource-based lower bound on the initiation interval.
 338    ??? Improve the accuracy of this bound by considering the
 339    utilization of various units.  */
 340 static int
 341 res_MII (ddg_ptr g)
 342 {
 343   return (g->num_nodes / issue_rate);
 344 }
 345
 346
 347 /* Points to the array that contains the sched data for each node.  */
 348 static node_sched_params_ptr node_sched_params;
 349
 350 /* Allocate sched_params for each node and initialize it.  Assumes that
 351    the aux field of each node contain the asap bound (computed earlier),
 352    and copies it into the sched_params field.  */
 353 static void
 354 set_node_sched_params (ddg_ptr g)
 355 {
 356   int i;
 357
 358   /* Allocate for each node in the DDG a place to hold the "sched_data".  */
 359   /* Initialize ASAP/ALAP/HIGHT to zero.  */
 360   node_sched_params = (node_sched_params_ptr)
 361                        xcalloc (g->num_nodes,
 362                                 sizeof (struct node_sched_params));
 363
 364   /* Set the pointer of the general data of the node to point to the
 365      appropriate sched_params structure.  */
 366   for (i = 0; i < g->num_nodes; i++)
 367     {
 368       /* Watch out for aliasing problems?  */
 369       node_sched_params[i].asap = g->nodes[i].aux.count;
 370       g->nodes[i].aux.info = &node_sched_params[i];
 371     }
 372 }
 373
 374 static void
 375 print_node_sched_params (FILE * dump_file, int num_nodes)
 376 {
 377   int i;
 378
 379   if (! dump_file)
 380     return;
 381   for (i = 0; i < num_nodes; i++)
 382     {
 383       node_sched_params_ptr nsp = &node_sched_params[i];
 384       rtx reg_move = nsp->first_reg_move;
 385       int j;
 386
 387       fprintf (dump_file, "Node %d:\n", i);
 388       fprintf (dump_file, " asap = %d:\n", nsp->asap);
 389       fprintf (dump_file, " time = %d:\n", nsp->time);
 390       fprintf (dump_file, " nreg_moves = %d:\n", nsp->nreg_moves);
 391       for (j = 0; j < nsp->nreg_moves; j++)
 392         {
 393           fprintf (dump_file, " reg_move = ");
 394           print_rtl_single (dump_file, reg_move);
 395           reg_move = PREV_INSN (reg_move);
 396         }
 397     }
 398 }
 399
 400 /* Calculate an upper bound for II.  SMS should not schedule the loop if it
 401    requires more cycles than this bound.  Currently set to the sum of the
 402    longest latency edge for each node.  Reset based on experiments.  */
 403 static int
 404 calculate_maxii (ddg_ptr g)
 405 {
 406   int i;
 407   int maxii = 0;
 408
 409   for (i = 0; i < g->num_nodes; i++)
 410     {
 411       ddg_node_ptr u = &g->nodes[i];
 412       ddg_edge_ptr e;
 413       int max_edge_latency = 0;
 414
 415       for (e = u->out; e; e = e->next_out)
 416         max_edge_latency = MAX (max_edge_latency, e->latency);
 417
 418       maxii += max_edge_latency;
 419     }
 420   return maxii;
 421 }
 422
 423 /*
 424    Breaking intra-loop register anti-dependences:
 425    Each intra-loop register anti-dependence implies a cross-iteration true
 426    dependence of distance 1. Therefore, we can remove such false dependencies
 427    and figure out if the partial schedule broke them by checking if (for a
 428    true-dependence of distance 1): SCHED_TIME (def) < SCHED_TIME (use) and
 429    if so generate a register move.   The number of such moves is equal to:
 430               SCHED_TIME (use) - SCHED_TIME (def)       { 0 broken
 431    nreg_moves = ----------------------------------- + 1 - {   dependence.
 432                             ii                          { 1 if not.
 433 */
 434 static struct undo_replace_buff_elem *
 435 generate_reg_moves (partial_schedule_ptr ps)
 436 {
 437   ddg_ptr g = ps->g;
 438   int ii = ps->ii;
 439   int i;
 440   struct undo_replace_buff_elem *reg_move_replaces = NULL;
 441
 442   for (i = 0; i < g->num_nodes; i++)
 443     {
 444       ddg_node_ptr u = &g->nodes[i];
 445       ddg_edge_ptr e;
 446       int nreg_moves = 0, i_reg_move;
 447       sbitmap *uses_of_defs;
 448       rtx last_reg_move;
 449       rtx prev_reg, old_reg;
 450
 451       /* Compute the number of reg_moves needed for u, by looking at life
 452          ranges started at u (excluding self-loops).  */
 453       for (e = u->out; e; e = e->next_out)
 454         if (e->type == TRUE_DEP && e->dest != e->src)
 455           {
 456             int nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;
 457
 458             if (e->distance == 1)
 459               nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii;
 460
 461             /* If dest precedes src in the schedule of the kernel, then dest
 462                will read before src writes and we can save one reg_copy.  */
 463             if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
 464                 && SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
 465               nreg_moves4e--;
 466
 467             nreg_moves = MAX (nreg_moves, nreg_moves4e);
 468           }
 469
 470       if (nreg_moves == 0)
 471         continue;
 472
 473       /* Every use of the register defined by node may require a different
 474          copy of this register, depending on the time the use is scheduled.
 475          Set a bitmap vector, telling which nodes use each copy of this
 476          register.  */
 477       uses_of_defs = sbitmap_vector_alloc (nreg_moves, g->num_nodes);
 478       sbitmap_vector_zero (uses_of_defs, nreg_moves);
 479       for (e = u->out; e; e = e->next_out)
 480         if (e->type == TRUE_DEP && e->dest != e->src)
 481           {
 482             int dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;
 483
 484             if (e->distance == 1)
 485               dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii;
 486
 487             if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
 488                 && SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
 489               dest_copy--;
 490
 491             if (dest_copy)
 492               SET_BIT (uses_of_defs[dest_copy - 1], e->dest->cuid);
 493           }
 494
 495       /* Now generate the reg_moves, attaching relevant uses to them.  */
 496       SCHED_NREG_MOVES (u) = nreg_moves;
 497       old_reg = prev_reg = copy_rtx (SET_DEST (single_set (u->insn)));
 498       last_reg_move = u->insn;
 499
 500       for (i_reg_move = 0; i_reg_move < nreg_moves; i_reg_move++)
 501         {
 502           int i_use;
 503           rtx new_reg = gen_reg_rtx (GET_MODE (prev_reg));
 504           rtx reg_move = gen_move_insn (new_reg, prev_reg);
 505
 506           add_insn_before (reg_move, last_reg_move);
 507           last_reg_move = reg_move;
 508
 509           if (!SCHED_FIRST_REG_MOVE (u))
 510             SCHED_FIRST_REG_MOVE (u) = reg_move;
 511
 512           EXECUTE_IF_SET_IN_SBITMAP (uses_of_defs[i_reg_move], 0, i_use,
 513             {
 514               struct undo_replace_buff_elem *rep;
 515
 516               rep = (struct undo_replace_buff_elem *)
 517                     xcalloc (1, sizeof (struct undo_replace_buff_elem));
 518               rep->insn = g->nodes[i_use].insn;
 519               rep->orig_reg = old_reg;
 520               rep->new_reg = new_reg;
 521
 522               if (! reg_move_replaces)
 523                 reg_move_replaces = rep;
 524               else
 525                 {
 526                   rep->next = reg_move_replaces;
 527                   reg_move_replaces = rep;
 528                 }
 529
 530               replace_rtx (g->nodes[i_use].insn, old_reg, new_reg);
 531             });
 532
 533           prev_reg = new_reg;
 534         }
 535     }
 536   return reg_move_replaces;
 537 }
 538
 539 /* We call this when we want to undo the SMS schedule for a given loop.
 540    One of the things that we do is to delete the register moves generated
 541    for the sake of SMS; this function deletes the register move instructions
 542    recorded in the undo buffer.  */
 543 static void
 544 undo_generate_reg_moves (partial_schedule_ptr ps,
 545                          struct undo_replace_buff_elem *reg_move_replaces)
 546 {
 547   int i,j;
 548
 549   for (i = 0; i < ps->g->num_nodes; i++)
 550     {
 551       ddg_node_ptr u = &ps->g->nodes[i];
 552       rtx prev;
 553       rtx crr = SCHED_FIRST_REG_MOVE (u);
 554
 555       for (j = 0; j < SCHED_NREG_MOVES (u); j++)
 556         {
 557           prev = PREV_INSN (crr);
 558           delete_insn (crr);
 559           crr = prev;
 560         }
 561       SCHED_FIRST_REG_MOVE (u) = NULL_RTX;
 562     }
 563
 564   while (reg_move_replaces)
 565     {
 566       struct undo_replace_buff_elem *rep = reg_move_replaces;
 567
 568       reg_move_replaces = reg_move_replaces->next;
 569       replace_rtx (rep->insn, rep->new_reg, rep->orig_reg);
 570     }
 571 }
 572
 573 /* Free memory allocated for the undo buffer.  */
 574 static void
 575 free_undo_replace_buff (struct undo_replace_buff_elem *reg_move_replaces)
 576 {
 577
 578   while (reg_move_replaces)
 579     {
 580       struct undo_replace_buff_elem *rep = reg_move_replaces;
 581
 582       reg_move_replaces = reg_move_replaces->next;
 583       free (rep);
 584     }
 585 }
 586
 587 /* Bump the SCHED_TIMEs of all nodes to start from zero.  Set the values
 588    of SCHED_ROW and SCHED_STAGE.  */
 589 static void
 590 normalize_sched_times (partial_schedule_ptr ps)
 591 {
 592   int i;
 593   ddg_ptr g = ps->g;
 594   int amount = PS_MIN_CYCLE (ps);
 595   int ii = ps->ii;
 596
 597   /* Don't include the closing branch assuming that it is the last node.  */
 598   for (i = 0; i < g->num_nodes - 1; i++)
 599     {
 600       ddg_node_ptr u = &g->nodes[i];
 601       int normalized_time = SCHED_TIME (u) - amount;
 602
 603       gcc_assert (normalized_time >= 0);
 604
 605       SCHED_TIME (u) = normalized_time;
 606       SCHED_ROW (u) = normalized_time % ii;
 607       SCHED_STAGE (u) = normalized_time / ii;
 608     }
 609 }
 610
 611 /* Set SCHED_COLUMN of each node according to its position in PS.  */
 612 static void
 613 set_columns_for_ps (partial_schedule_ptr ps)
 614 {
 615   int row;
 616
 617   for (row = 0; row < ps->ii; row++)
 618     {
 619       ps_insn_ptr cur_insn = ps->rows[row];
 620       int column = 0;
 621
 622       for (; cur_insn; cur_insn = cur_insn->next_in_row)
 623         SCHED_COLUMN (cur_insn->node) = column++;
 624     }
 625 }
 626
 627 /* Permute the insns according to their order in PS, from row 0 to
 628    row ii-1, and position them right before LAST.  This schedules
 629    the insns of the loop kernel.  */
 630 static void
 631 permute_partial_schedule (partial_schedule_ptr ps, rtx last)
 632 {
 633   int ii = ps->ii;
 634   int row;
 635   ps_insn_ptr ps_ij;
 636
 637   for (row = 0; row < ii ; row++)
 638     for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
 639       if (PREV_INSN (last) != ps_ij->node->insn)
 640         reorder_insns_nobb (ps_ij->node->first_note, ps_ij->node->insn,
 641                             PREV_INSN (last));
 642 }
 643
 644 /* As part of undoing SMS we return to the original ordering of the
 645    instructions inside the loop kernel.  Given the partial schedule PS, this
 646    function returns the ordering of the instruction according to their CUID
 647    in the DDG (PS->G), which is the original order of the instruction before
 648    performing SMS.  */
 649 static void
 650 undo_permute_partial_schedule (partial_schedule_ptr ps, rtx last)
 651 {
 652   int i;
 653
 654   for (i = 0 ; i < ps->g->num_nodes; i++)
 655     if (last == ps->g->nodes[i].insn
 656         || last == ps->g->nodes[i].first_note)
 657       break;
 658     else if (PREV_INSN (last) != ps->g->nodes[i].insn)
 659       reorder_insns_nobb (ps->g->nodes[i].first_note, ps->g->nodes[i].insn,
 660                           PREV_INSN (last));
 661 }
 662
 663 /* Used to generate the prologue & epilogue.  Duplicate the subset of
 664    nodes whose stages are between FROM_STAGE and TO_STAGE (inclusive
 665    of both), together with a prefix/suffix of their reg_moves.  */
 666 static void
 667 duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
 668                            int to_stage, int for_prolog)
 669 {
 670   int row;
 671   ps_insn_ptr ps_ij;
 672
 673   for (row = 0; row < ps->ii; row++)
 674     for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
 675       {
 676         ddg_node_ptr u_node = ps_ij->node;
 677         int j, i_reg_moves;
 678         rtx reg_move = NULL_RTX;
 679
 680         if (for_prolog)
 681           {
 682             /* SCHED_STAGE (u_node) >= from_stage == 0.  Generate increasing
 683                number of reg_moves starting with the second occurrence of
 684                u_node, which is generated if its SCHED_STAGE <= to_stage.  */
 685             i_reg_moves = to_stage - SCHED_STAGE (u_node) + 1;
 686             i_reg_moves = MAX (i_reg_moves, 0);
 687             i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));
 688
 689             /* The reg_moves start from the *first* reg_move backwards.  */
 690             if (i_reg_moves)
 691               {
 692                 reg_move = SCHED_FIRST_REG_MOVE (u_node);
 693                 for (j = 1; j < i_reg_moves; j++)
 694                   reg_move = PREV_INSN (reg_move);
 695               }
 696           }
 697         else /* It's for the epilog.  */
 698           {
 699             /* SCHED_STAGE (u_node) <= to_stage.  Generate all reg_moves,
 700                starting to decrease one stage after u_node no longer occurs;
 701                that is, generate all reg_moves until
 702                SCHED_STAGE (u_node) == from_stage - 1.  */
 703             i_reg_moves = SCHED_NREG_MOVES (u_node)
 704                        - (from_stage - SCHED_STAGE (u_node) - 1);
 705             i_reg_moves = MAX (i_reg_moves, 0);
 706             i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));
 707
 708             /* The reg_moves start from the *last* reg_move forwards.  */
 709             if (i_reg_moves)
 710               {
 711                 reg_move = SCHED_FIRST_REG_MOVE (u_node);
 712                 for (j = 1; j < SCHED_NREG_MOVES (u_node); j++)
 713                   reg_move = PREV_INSN (reg_move);
 714               }
 715           }
 716
 717         for (j = 0; j < i_reg_moves; j++, reg_move = NEXT_INSN (reg_move))
 718           emit_insn (copy_rtx (PATTERN (reg_move)));
 719         if (SCHED_STAGE (u_node) >= from_stage
 720             && SCHED_STAGE (u_node) <= to_stage)
 721           duplicate_insn_chain (u_node->first_note, u_node->insn);
 722       }
 723 }
 724
 725
 726 /* Generate the instructions (including reg_moves) for prolog & epilog.  */
 727 static void
 728 generate_prolog_epilog (partial_schedule_ptr ps, struct loop * loop, rtx count_reg)
 729 {
 730   int i;
 731   int last_stage = PS_STAGE_COUNT (ps) - 1;
 732   edge e;
 733
 734   /* Generate the prolog, inserting its insns on the loop-entry edge.  */
 735   start_sequence ();
 736
 737   if (count_reg)
 738    /* Generate a subtract instruction at the beginning of the prolog to
 739       adjust the loop count by STAGE_COUNT.  */
 740    emit_insn (gen_sub2_insn (count_reg, GEN_INT (last_stage)));
 741
 742   for (i = 0; i < last_stage; i++)
 743     duplicate_insns_of_cycles (ps, 0, i, 1);
 744
 745   /* Put the prolog ,  on the one and only entry edge.  */
 746   e = loop_preheader_edge (loop);
 747   loop_split_edge_with(e , get_insns());
 748
 749   end_sequence ();
 750
 751   /* Generate the epilog, inserting its insns on the loop-exit edge.  */
 752   start_sequence ();
 753
 754   for (i = 0; i < last_stage; i++)
 755     duplicate_insns_of_cycles (ps, i + 1, last_stage, 0);
 756
 757   /* Put the epilogue on the one and only one exit edge.  */
 758   gcc_assert (loop->single_exit);
 759   e = loop->single_exit;
 760   loop_split_edge_with(e , get_insns());
 761   end_sequence ();
 762 }
 763
 764 /* Return the line note insn preceding INSN, for debugging.  Taken from
 765    emit-rtl.c.  */
 766 static rtx
 767 find_line_note (rtx insn)
 768 {
 769   for (; insn; insn = PREV_INSN (insn))
 770     if (NOTE_P (insn)
 771         && NOTE_LINE_NUMBER (insn) >= 0)
 772       break;
 773
 774   return insn;
 775 }
 776
 777 /* Return true if all the BBs of the loop are empty except the
 778    loop header.  */
 779 static bool
 780 loop_single_full_bb_p (struct loop *loop)
 781 {
 782   unsigned i;
 783   basic_block *bbs = get_loop_body (loop);
 784
 785   for (i = 0; i < loop->num_nodes ; i++)
 786     {
 787       rtx head, tail;
 788       bool empty_bb = true;
 789
 790       if (bbs[i] == loop->header)
 791         continue;
 792
 793       /* Make sure that basic blocks other than the header
 794          have only notes labels or jumps.  */
 795       get_block_head_tail (bbs[i]->index, &head, &tail);
 796       for (; head != NEXT_INSN (tail); head = NEXT_INSN (head))
 797         {
 798           if (NOTE_P (head) || LABEL_P (head)
 799               || (INSN_P (head) && JUMP_P (head)))
 800             continue;
 801           empty_bb = false;
 802           break;
 803         }
 804
 805       if (! empty_bb)
 806         {
 807           free (bbs);
 808           return false;
 809         }
 810     }
 811   free (bbs);
 812   return true;
 813 }
 814
 815 /* A simple loop from SMS point of view; it is a loop that is composed of
 816    either a single basic block or two BBs - a header and a latch.  */
 817 #define SIMPLE_SMS_LOOP_P(loop) ((loop->num_nodes < 3 )                     \
 818                                   && (EDGE_COUNT (loop->latch->preds) == 1) \
 819                                   && (EDGE_COUNT (loop->latch->succs) == 1))
 820
 821 /* Return true if the loop is in its canonical form and false if not.
 822    i.e. SIMPLE_SMS_LOOP_P and have one preheader block, and single exit.  */
 823 static bool
 824 loop_canon_p (struct loop *loop, FILE *dump_file)
 825 {
 826
 827   if (loop->inner || ! loop->outer)
 828     return false;
 829
 830   if (!loop->single_exit)
 831     {
 832       if (dump_file)
 833         {
 834           rtx line_note = find_line_note (BB_END (loop->header));
 835
 836           fprintf (dump_file, "SMS loop many exits ");
 837           if (line_note)
 838             {
 839               expanded_location xloc;
 840               NOTE_EXPANDED_LOCATION (xloc, line_note);
 841               fprintf (stats_file, " %s %d (file, line)\n",
 842                        xloc.file, xloc.line);
 843             }
 844         }
 845       return false;
 846     }
 847
 848   if (! SIMPLE_SMS_LOOP_P (loop) && ! loop_single_full_bb_p (loop))
 849     {
 850       if (dump_file)
 851         {
 852           rtx line_note = find_line_note (BB_END (loop->header));
 853
 854           fprintf (dump_file, "SMS loop many BBs. ");
 855           if (line_note)
 856             {
 857               expanded_location xloc;
 858               NOTE_EXPANDED_LOCATION (xloc, line_note);
 859               fprintf (stats_file, " %s %d (file, line)\n",
 860                        xloc.file, xloc.line);
 861             }
 862         }
 863       return false;
 864     }
 865
 866     return true;
 867 }
 868
 869 /* If there are more than one entry for the loop,
 870    make it one by splitting the first entry edge and
 871    redirecting the others to the new BB.  */
 872 static void
 873 canon_loop (struct loop *loop)
 874 {
 875   edge e;
 876   edge_iterator i;
 877
 878   /* Avoid annoying special cases of edges going to exit
 879      block.  */
 880   FOR_EACH_EDGE (e, i, EXIT_BLOCK_PTR->preds)
 881     if ((e->flags & EDGE_FALLTHRU) && (EDGE_COUNT (e->src->succs) > 1))
 882       loop_split_edge_with (e, NULL_RTX);
 883
 884   if (loop->latch == loop->header
 885       || EDGE_COUNT (loop->latch->succs) > 1)
 886     {
 887       FOR_EACH_EDGE (e, i, loop->header->preds)
 888         if (e->src == loop->latch)
 889           break;
 890       loop_split_edge_with (e, NULL_RTX);
 891     }
 892 }
 893
 894 /* Build the loop information without loop
 895    canonization, the loop canonization will
 896    be performed if the loop is SMSable.  */
 897 static struct loops *
 898 build_loops_structure (FILE *dumpfile)
 899 {
 900   struct loops *loops = xcalloc (1, sizeof (struct loops));
 901
 902   /* Find the loops.  */
 903
 904   if (flow_loops_find (loops) <= 1)
 905     {
 906       /* No loops.  */
 907       flow_loops_free (loops);
 908       free (loops);
 909
 910       return NULL;
 911     }
 912
 913   /* Not going to update these.  */
 914   free (loops->cfg.rc_order);
 915   loops->cfg.rc_order = NULL;
 916   free (loops->cfg.dfs_order);
 917   loops->cfg.dfs_order = NULL;
 918
 919   create_preheaders (loops, CP_SIMPLE_PREHEADERS);
 920   mark_single_exit_loops (loops);
 921   /* Dump loops.  */
 922   flow_loops_dump (loops, dumpfile, NULL, 1);
 923
 924 #ifdef ENABLE_CHECKING
 925   verify_dominators (CDI_DOMINATORS);
 926   verify_loop_structure (loops);
 927 #endif
 928
 929   return loops;
 930 }
 931
 932 /* Main entry point, perform SMS scheduling on the loops of the function
 933    that consist of single basic blocks.  */
 934 void
 935 sms_schedule (FILE *dump_file)
 936 {
 937   static int passes = 0;
 938   rtx insn;
 939   ddg_ptr *g_arr, g;
 940   int * node_order;
 941   int maxii;
 942   unsigned i,num_loops;
 943   partial_schedule_ptr ps;
 944   struct df *df;
 945   struct loops *loops;
 946   basic_block bb = NULL;
 947   /* vars to the versioning only if needed*/
 948   struct loop * nloop;
 949   basic_block condition_bb = NULL;
 950   edge latch_edge;
 951   gcov_type trip_count = 0;
 952
 953   if (! (loops = build_loops_structure (dump_file)))
 954     return;  /* There is no loops to schedule.  */
 955
 956
 957   stats_file = dump_file;
 958
 959   /* Initialize issue_rate.  */
 960   if (targetm.sched.issue_rate)
 961     {
 962       int temp = reload_completed;
 963
 964       reload_completed = 1;
 965       issue_rate = targetm.sched.issue_rate ();
 966       reload_completed = temp;
 967     }
 968   else
 969     issue_rate = 1;
 970
 971   /* Initialize the scheduler.  */
 972   current_sched_info = &sms_sched_info;
 973   sched_init (NULL);
 974
 975   /* Init Data Flow analysis, to be used in interloop dep calculation.  */
 976   df = df_init ();
 977   df_analyze (df, 0, DF_ALL);
 978
 979   /* Allocate memory to hold the DDG array one entry for each loop.
 980      We use loop->num as index into this array.  */
 981   g_arr = xcalloc (loops->num, sizeof (ddg_ptr));
 982
 983
 984   /* Build DDGs for all the relevant loops and hold them in G_ARR
 985      indexed by the loop index.  */
 986   for (i = 0; i < loops->num; i++)
 987     {
 988       rtx head, tail;
 989       rtx count_reg;
 990       struct loop *loop = loops->parray[i];
 991
 992       /* For debugging.  */
 993       if ((passes++ > MAX_SMS_LOOP_NUMBER) && (MAX_SMS_LOOP_NUMBER != -1))
 994         {
 995           if (dump_file)
 996             fprintf (dump_file, "SMS reached MAX_PASSES... \n");
 997
 998           break;
 999         }
1000
1001       if (! loop_canon_p (loop, dump_file))
1002         continue;
1003
1004       if (! loop_single_full_bb_p (loop))
1005         continue;
1006
1007       bb = loop->header;
1008
1009       get_block_head_tail (bb->index, &head, &tail);
1010       latch_edge = loop_latch_edge (loop);
1011       gcc_assert (loop->single_exit);
1012       if (loop->single_exit->count)
1013         trip_count = latch_edge->count / loop->single_exit->count;
1014
1015       /* Perfrom SMS only on loops that their average count is above threshold.  */
1016
1017       if ( latch_edge->count
1018           && (latch_edge->count < loop->single_exit->count * SMS_LOOP_AVERAGE_COUNT_THRESHOLD))
1019         {
1020           if (stats_file)
1021             {
1022               rtx line_note = find_line_note (tail);
1023
1024               if (line_note)
1025                 {
1026                   expanded_location xloc;
1027                   NOTE_EXPANDED_LOCATION (xloc, line_note);
1028                   fprintf (stats_file, "SMS bb %s %d (file, line)\n",
1029                            xloc.file, xloc.line);
1030                 }
1031               fprintf (stats_file, "SMS single-bb-loop\n");
1032               if (profile_info && flag_branch_probabilities)
1033                 {
1034                   fprintf (stats_file, "SMS loop-count ");
1035                   fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
1036                            (HOST_WIDEST_INT) bb->count);
1037                   fprintf (stats_file, "\n");
1038                   fprintf (stats_file, "SMS trip-count ");
1039                   fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
1040                            (HOST_WIDEST_INT) trip_count);
1041                   fprintf (stats_file, "\n");
1042                   fprintf (stats_file, "SMS profile-sum-max ");
1043                   fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
1044                            (HOST_WIDEST_INT) profile_info->sum_max);
1045                   fprintf (stats_file, "\n");
1046                 }
1047             }
1048           continue;
1049         }
1050
1051       /* Make sure this is a doloop.  */
1052       if ( !(count_reg = doloop_register_get (tail)))
1053         continue;
1054
1055       /* Don't handle BBs with calls or barriers, or !single_set insns.  */
1056       for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn))
1057         if (CALL_P (insn)
1058             || BARRIER_P (insn)
1059             || (INSN_P (insn) && !JUMP_P (insn)
1060                 && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE))
1061           break;
1062
1063       if (insn != NEXT_INSN (tail))
1064         {
1065           if (stats_file)
1066             {
1067               if (CALL_P (insn))
1068                 fprintf (stats_file, "SMS loop-with-call\n");
1069               else if (BARRIER_P (insn))
1070                 fprintf (stats_file, "SMS loop-with-barrier\n");
1071               else
1072                 fprintf (stats_file, "SMS loop-with-not-single-set\n");
1073               print_rtl_single (stats_file, insn);
1074             }
1075
1076           continue;
1077         }
1078
1079       if (! (g = create_ddg (bb, df, 0)))
1080         {
1081           if (stats_file)
1082             fprintf (stats_file, "SMS doloop\n");
1083           continue;
1084         }
1085
1086       g_arr[i] = g;
1087     }
1088
1089   /* Release Data Flow analysis data structures.  */
1090   df_finish (df);
1091
1092   /* We don't want to perform SMS on new loops - created by versioning.  */
1093   num_loops = loops->num;
1094   /* Go over the built DDGs and perfrom SMS for each one of them.  */
1095   for (i = 0; i < num_loops; i++)
1096     {
1097       rtx head, tail;
1098       rtx count_reg, count_init;
1099       int mii, rec_mii;
1100       unsigned stage_count = 0;
1101       HOST_WIDEST_INT loop_count = 0;
1102       struct loop *loop = loops->parray[i];
1103
1104       if (! (g = g_arr[i]))
1105         continue;
1106
1107       if (dump_file)
1108         print_ddg (dump_file, g);
1109
1110       get_block_head_tail (loop->header->index, &head, &tail);
1111
1112       latch_edge = loop_latch_edge (loop);
1113       gcc_assert (loop->single_exit);
1114       if (loop->single_exit->count)
1115         trip_count = latch_edge->count / loop->single_exit->count;
1116
1117       if (stats_file)
1118         {
1119           rtx line_note = find_line_note (tail);
1120
1121           if (line_note)
1122             {
1123               expanded_location xloc;
1124               NOTE_EXPANDED_LOCATION (xloc, line_note);
1125               fprintf (stats_file, "SMS bb %s %d (file, line)\n",
1126                        xloc.file, xloc.line);
1127             }
1128           fprintf (stats_file, "SMS single-bb-loop\n");
1129           if (profile_info && flag_branch_probabilities)
1130             {
1131               fprintf (stats_file, "SMS loop-count ");
1132               fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
1133                        (HOST_WIDEST_INT) bb->count);
1134               fprintf (stats_file, "\n");
1135               fprintf (stats_file, "SMS profile-sum-max ");
1136               fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
1137                        (HOST_WIDEST_INT) profile_info->sum_max);
1138               fprintf (stats_file, "\n");
1139             }
1140           fprintf (stats_file, "SMS doloop\n");
1141           fprintf (stats_file, "SMS built-ddg %d\n", g->num_nodes);
1142           fprintf (stats_file, "SMS num-loads %d\n", g->num_loads);
1143           fprintf (stats_file, "SMS num-stores %d\n", g->num_stores);
1144         }
1145
1146
1147       /* In case of th loop have doloop register it gets special
1148          handling.  */
1149       count_init = NULL_RTX;
1150       if ((count_reg = doloop_register_get (tail)))
1151         {
1152           basic_block pre_header;
1153
1154           pre_header = loop_preheader_edge (loop)->src;
1155           count_init = const_iteration_count (count_reg, pre_header,
1156                                               &loop_count);
1157         }
1158       gcc_assert (count_reg);
1159
1160       if (stats_file && count_init)
1161         {
1162           fprintf (stats_file, "SMS const-doloop ");
1163           fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
1164                      loop_count);
1165           fprintf (stats_file, "\n");
1166         }
1167
1168       node_order = (int *) xmalloc (sizeof (int) * g->num_nodes);
1169
1170       mii = 1; /* Need to pass some estimate of mii.  */
1171       rec_mii = sms_order_nodes (g, mii, node_order);
1172       mii = MAX (res_MII (g), rec_mii);
1173       maxii = (calculate_maxii (g) * SMS_MAX_II_FACTOR) / 100;
1174
1175       if (stats_file)
1176         fprintf (stats_file, "SMS iis %d %d %d (rec_mii, mii, maxii)\n",
1177                  rec_mii, mii, maxii);
1178
1179       /* After sms_order_nodes and before sms_schedule_by_order, to copy over
1180          ASAP.  */
1181       set_node_sched_params (g);
1182
1183       ps = sms_schedule_by_order (g, mii, maxii, node_order, dump_file);
1184
1185       if (ps)
1186         stage_count = PS_STAGE_COUNT (ps);
1187
1188       /* Stage count of 1 means that there is no interleaving between
1189          iterations, let the scheduling passes do the job.  */
1190       if (stage_count < 1
1191           || (count_init && (loop_count <= stage_count))
1192           || (flag_branch_probabilities && (trip_count <= stage_count)))
1193         {
1194           if (dump_file)
1195             {
1196               fprintf (dump_file, "SMS failed... \n");
1197               fprintf (dump_file, "SMS sched-failed (stage-count=%d, loop-count=", stage_count);
1198               fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, loop_count);
1199               fprintf (dump_file, ", trip-count=");
1200               fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, trip_count);
1201               fprintf (dump_file, ")\n");
1202             }
1203           continue;
1204         }
1205       else
1206         {
1207           int orig_cycles = kernel_number_of_cycles (BB_HEAD (g->bb), BB_END (g->bb));
1208           int new_cycles;
1209           struct undo_replace_buff_elem *reg_move_replaces;
1210
1211           if (stats_file)
1212             {
1213               fprintf (stats_file,
1214                        "SMS succeeded %d %d (with ii, sc)\n", ps->ii,
1215                        stage_count);
1216               print_partial_schedule (ps, stats_file);
1217               fprintf (stats_file,
1218                        "SMS Branch (%d) will later be scheduled at cycle %d.\n",
1219                        g->closing_branch->cuid, PS_MIN_CYCLE (ps) - 1);
1220             }
1221
1222           /* Set the stage boundaries.  If the DDG is built with closing_branch_deps,
1223              the closing_branch was scheduled and should appear in the last (ii-1)
1224              row.  Otherwise, we are free to schedule the branch, and we let nodes
1225              that were scheduled at the first PS_MIN_CYCLE cycle appear in the first
1226              row; this should reduce stage_count to minimum.  */
1227           normalize_sched_times (ps);
1228           rotate_partial_schedule (ps, PS_MIN_CYCLE (ps));
1229           set_columns_for_ps (ps);
1230
1231           /* Generate the kernel just to be able to measure its cycles.  */
1232           permute_partial_schedule (ps, g->closing_branch->first_note);
1233           reg_move_replaces = generate_reg_moves (ps);
1234
1235           /* Get the number of cycles the new kernel expect to execute in.  */
1236           new_cycles = kernel_number_of_cycles (BB_HEAD (g->bb), BB_END (g->bb));
1237
1238           /* Get back to the original loop so we can do loop versioning.  */
1239           undo_permute_partial_schedule (ps, g->closing_branch->first_note);
1240           if (reg_move_replaces)
1241             undo_generate_reg_moves (ps, reg_move_replaces);
1242
1243           if ( new_cycles >= orig_cycles)
1244             {
1245               /* SMS is not profitable so undo the permutation and reg move generation
1246                  and return the kernel to its original state.  */
1247               if (dump_file)
1248                 fprintf (dump_file, "Undoing SMS because it is not profitable.\n");
1249
1250             }
1251           else
1252             {
1253               canon_loop (loop);
1254
1255               /* case the BCT count is not known , Do loop-versioning */
1256               if (count_reg && ! count_init)
1257                 {
1258                   rtx comp_rtx = gen_rtx_fmt_ee (GT, VOIDmode, count_reg,
1259                                                  GEN_INT(stage_count));
1260
1261                   nloop = loop_version (loops, loop, comp_rtx, &condition_bb);
1262                 }
1263
1264               /* Set new iteration count of loop kernel.  */
1265               if (count_reg && count_init)
1266                 SET_SRC (single_set (count_init)) = GEN_INT (loop_count
1267                                                              - stage_count + 1);
1268
1269               /* Now apply the scheduled kernel to the RTL of the loop.  */
1270               permute_partial_schedule (ps, g->closing_branch->first_note);
1271
1272               /* Mark this loop as software pipelined so the later
1273               scheduling passes doesn't touch it.  */
1274               if (! flag_resched_modulo_sched)
1275                 g->bb->flags |= BB_DISABLE_SCHEDULE;
1276               /* The life-info is not valid any more.  */
1277               g->bb->flags |= BB_DIRTY;
1278
1279               reg_move_replaces = generate_reg_moves (ps);
1280               if (dump_file)
1281                 print_node_sched_params (dump_file, g->num_nodes);
1282               /* Generate prolog and epilog.  */
1283               if (count_reg && !count_init)
1284                 generate_prolog_epilog (ps, loop, count_reg);
1285               else
1286                 generate_prolog_epilog (ps, loop, NULL_RTX);
1287             }
1288           free_undo_replace_buff (reg_move_replaces);
1289         }
1290
1291       free_partial_schedule (ps);
1292       free (node_sched_params);
1293       free (node_order);
1294       free_ddg (g);
1295     }
1296
1297   /* Release scheduler data, needed until now because of DFA.  */
1298   sched_finish ();
1299   loop_optimizer_finalize (loops, dump_file);
1300 }
1301
1302 /* The SMS scheduling algorithm itself
1303    -----------------------------------
1304    Input: 'O' an ordered list of insns of a loop.
1305    Output: A scheduling of the loop - kernel, prolog, and epilogue.
1306
1307    'Q' is the empty Set
1308    'PS' is the partial schedule; it holds the currently scheduled nodes with
1309         their cycle/slot.
1310    'PSP' previously scheduled predecessors.
1311    'PSS' previously scheduled successors.
1312    't(u)' the cycle where u is scheduled.
1313    'l(u)' is the latency of u.
1314    'd(v,u)' is the dependence distance from v to u.
1315    'ASAP(u)' the earliest time at which u could be scheduled as computed in
1316              the node ordering phase.
1317    'check_hardware_resources_conflicts(u, PS, c)'
1318                              run a trace around cycle/slot through DFA model
1319                              to check resource conflicts involving instruction u
1320                              at cycle c given the partial schedule PS.
1321    'add_to_partial_schedule_at_time(u, PS, c)'
1322                              Add the node/instruction u to the partial schedule
1323                              PS at time c.
1324    'calculate_register_pressure(PS)'
1325                              Given a schedule of instructions, calculate the register
1326                              pressure it implies.  One implementation could be the
1327                              maximum number of overlapping live ranges.
1328    'maxRP' The maximum allowed register pressure, it is usually derived from the number
1329            registers available in the hardware.
1330
1331    1. II = MII.
1332    2. PS = empty list
1333    3. for each node u in O in pre-computed order
1334    4.   if (PSP(u) != Q && PSS(u) == Q) then
1335    5.     Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all every v in PSP(u).
1336    6.     start = Early_start; end = Early_start + II - 1; step = 1
1337    11.  else if (PSP(u) == Q && PSS(u) != Q) then
1338    12.      Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all every v in PSS(u).
1339    13.     start = Late_start; end = Late_start - II + 1; step = -1
1340    14.  else if (PSP(u) != Q && PSS(u) != Q) then
1341    15.     Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all every v in PSP(u).
1342    16.     Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all every v in PSS(u).
1343    17.     start = Early_start;
1344    18.     end = min(Early_start + II - 1 , Late_start);
1345    19.     step = 1
1346    20.     else "if (PSP(u) == Q && PSS(u) == Q)"
1347    21.    start = ASAP(u); end = start + II - 1; step = 1
1348    22.  endif
1349
1350    23.  success = false
1351    24.  for (c = start ; c != end ; c += step)
1352    25.     if check_hardware_resources_conflicts(u, PS, c) then
1353    26.       add_to_partial_schedule_at_time(u, PS, c)
1354    27.       success = true
1355    28.       break
1356    29.     endif
1357    30.  endfor
1358    31.  if (success == false) then
1359    32.    II = II + 1
1360    33.    if (II > maxII) then
1361    34.       finish - failed to schedule
1362    35.   endif
1363    36.    goto 2.
1364    37.  endif
1365    38. endfor
1366    39. if (calculate_register_pressure(PS) > maxRP) then
1367    40.    goto 32.
1368    41. endif
1369    42. compute epilogue & prologue
1370    43. finish - succeeded to schedule
1371 */
1372
1373 /* A limit on the number of cycles that resource conflicts can span.  ??? Should
1374    be provided by DFA, and be dependent on the type of insn scheduled.  Currently
1375    set to 0 to save compile time.  */
1376 #define DFA_HISTORY SMS_DFA_HISTORY
1377
1378 /* Given the partial schedule PS, this function calculates and returns the
1379    cycles in which we can schedule the node with the given index I.
1380    NOTE: Here we do the backtracking in SMS, in some special cases. We have
1381    noticed that there are several cases in which we fail    to SMS the loop
1382    because the sched window of a node is empty    due to tight data-deps. In
1383    such cases we want to unschedule    some of the predecessors/successors
1384    until we get non-empty    scheduling window.  It returns -1 if the
1385    scheduling window is empty and zero otherwise.  */
1386
1387 static int
1388 get_sched_window (partial_schedule_ptr ps, int *nodes_order, int i,
1389                   sbitmap sched_nodes, int ii, int *start_p, int *step_p, int *end_p)
1390 {
1391   int start, step, end;
1392   ddg_edge_ptr e;
1393   int u = nodes_order [i];
1394   ddg_node_ptr u_node = &ps->g->nodes[u];
1395   sbitmap psp = sbitmap_alloc (ps->g->num_nodes);
1396   sbitmap pss = sbitmap_alloc (ps->g->num_nodes);
1397   sbitmap u_node_preds = NODE_PREDECESSORS (u_node);
1398   sbitmap u_node_succs = NODE_SUCCESSORS (u_node);
1399   int psp_not_empty;
1400   int pss_not_empty;
1401
1402   /* 1. compute sched window for u (start, end, step).  */
1403   sbitmap_zero (psp);
1404   sbitmap_zero (pss);
1405   psp_not_empty = sbitmap_a_and_b_cg (psp, u_node_preds, sched_nodes);
1406   pss_not_empty = sbitmap_a_and_b_cg (pss, u_node_succs, sched_nodes);
1407
1408   if (psp_not_empty && !pss_not_empty)
1409     {
1410       int early_start = INT_MIN;
1411
1412       end = INT_MAX;
1413       for (e = u_node->in; e != 0; e = e->next_in)
1414         {
1415           ddg_node_ptr v_node = e->src;
1416           if (TEST_BIT (sched_nodes, v_node->cuid))
1417             {
1418               int node_st = SCHED_TIME (v_node)
1419                             + e->latency - (e->distance * ii);
1420
1421               early_start = MAX (early_start, node_st);
1422
1423               if (e->data_type == MEM_DEP)
1424                 end = MIN (end, SCHED_TIME (v_node) + ii - 1);
1425             }
1426         }
1427       start = early_start;
1428       end = MIN (end, early_start + ii);
1429       step = 1;
1430     }
1431
1432   else if (!psp_not_empty && pss_not_empty)
1433     {
1434       int late_start = INT_MAX;
1435
1436       end = INT_MIN;
1437       for (e = u_node->out; e != 0; e = e->next_out)
1438         {
1439           ddg_node_ptr v_node = e->dest;
1440           if (TEST_BIT (sched_nodes, v_node->cuid))
1441             {
1442               late_start = MIN (late_start,
1443                                 SCHED_TIME (v_node) - e->latency
1444                                 + (e->distance * ii));
1445               if (e->data_type == MEM_DEP)
1446                 end = MAX (end, SCHED_TIME (v_node) - ii + 1);
1447             }
1448         }
1449       start = late_start;
1450       end = MAX (end, late_start - ii);
1451       step = -1;
1452     }
1453
1454   else if (psp_not_empty && pss_not_empty)
1455     {
1456       int early_start = INT_MIN;
1457       int late_start = INT_MAX;
1458
1459       start = INT_MIN;
1460       end = INT_MAX;
1461       for (e = u_node->in; e != 0; e = e->next_in)
1462         {
1463           ddg_node_ptr v_node = e->src;
1464
1465           if (TEST_BIT (sched_nodes, v_node->cuid))
1466             {
1467               early_start = MAX (early_start,
1468                                  SCHED_TIME (v_node) + e->latency
1469                                  - (e->distance * ii));
1470               if (e->data_type == MEM_DEP)
1471                 end = MIN (end, SCHED_TIME (v_node) + ii - 1);
1472             }
1473         }
1474       for (e = u_node->out; e != 0; e = e->next_out)
1475         {
1476           ddg_node_ptr v_node = e->dest;
1477
1478           if (TEST_BIT (sched_nodes, v_node->cuid))
1479             {
1480               late_start = MIN (late_start,
1481                                 SCHED_TIME (v_node) - e->latency
1482                                 + (e->distance * ii));
1483               if (e->data_type == MEM_DEP)
1484                 start = MAX (start, SCHED_TIME (v_node) - ii + 1);
1485             }
1486         }
1487       start = MAX (start, early_start);
1488       end = MIN (end, MIN (early_start + ii, late_start + 1));
1489       step = 1;
1490     }
1491   else /* psp is empty && pss is empty.  */
1492     {
1493       start = SCHED_ASAP (u_node);
1494       end = start + ii;
1495       step = 1;
1496     }
1497
1498   *start_p = start;
1499   *step_p = step;
1500   *end_p = end;
1501   sbitmap_free (psp);
1502   sbitmap_free (pss);
1503
1504   if ((start >= end && step == 1) || (start <= end && step == -1))
1505     return -1;
1506   else
1507     return 0;
1508 }
1509
1510 /* This function implements the scheduling algorithm for SMS according to the
1511    above algorithm.  */
1512 static partial_schedule_ptr
1513 sms_schedule_by_order (ddg_ptr g, int mii, int maxii, int *nodes_order, FILE *dump_file)
1514 {
1515   int ii = mii;
1516   int i, c, success;
1517   int try_again_with_larger_ii = true;
1518   int num_nodes = g->num_nodes;
1519   ddg_edge_ptr e;
1520   int start, end, step; /* Place together into one struct?  */
1521   sbitmap sched_nodes = sbitmap_alloc (num_nodes);
1522   sbitmap must_precede = sbitmap_alloc (num_nodes);
1523   sbitmap must_follow = sbitmap_alloc (num_nodes);
1524   sbitmap tobe_scheduled = sbitmap_alloc (num_nodes);
1525
1526   partial_schedule_ptr ps = create_partial_schedule (ii, g, DFA_HISTORY);
1527
1528   sbitmap_ones (tobe_scheduled);
1529   sbitmap_zero (sched_nodes);
1530
1531   while ((! sbitmap_equal (tobe_scheduled, sched_nodes)
1532          || try_again_with_larger_ii ) && ii < maxii)
1533     {
1534       int j;
1535       bool unscheduled_nodes = false;
1536
1537       if (dump_file)
1538         fprintf(dump_file, "Starting with ii=%d\n", ii);
1539       if (try_again_with_larger_ii)
1540         {
1541           try_again_with_larger_ii = false;
1542           sbitmap_zero (sched_nodes);
1543         }
1544
1545       for (i = 0; i < num_nodes; i++)
1546         {
1547           int u = nodes_order[i];
1548           ddg_node_ptr u_node = &ps->g->nodes[u];
1549           rtx insn = u_node->insn;
1550
1551           if (!INSN_P (insn))
1552             {
1553               RESET_BIT (tobe_scheduled, u);
1554               continue;
1555             }
1556
1557           if (JUMP_P (insn)) /* Closing branch handled later.  */
1558             {
1559               RESET_BIT (tobe_scheduled, u);
1560               continue;
1561             }
1562
1563           if (TEST_BIT (sched_nodes, u))
1564             continue;
1565
1566           /* Try to get non-empty scheduling window.  */
1567           j = i;
1568           while (get_sched_window (ps, nodes_order, i, sched_nodes, ii, &start, &step, &end) < 0
1569                  && j > 0)
1570             {
1571               unscheduled_nodes = true;
1572               if (TEST_BIT (NODE_PREDECESSORS (u_node), nodes_order[j - 1])
1573                   || TEST_BIT (NODE_SUCCESSORS (u_node), nodes_order[j - 1]))
1574                 {
1575                   ps_unschedule_node (ps, &ps->g->nodes[nodes_order[j - 1]]);
1576                   RESET_BIT (sched_nodes, nodes_order [j - 1]);
1577                 }
1578               j--;
1579             }
1580           if (j < 0)
1581             {
1582               /* ??? Try backtracking instead of immediately ii++?  */
1583               ii++;
1584               try_again_with_larger_ii = true;
1585               reset_partial_schedule (ps, ii);
1586               break;
1587             }
1588           /* 2. Try scheduling u in window.  */
1589           if (dump_file)
1590             fprintf(dump_file, "Trying to schedule node %d in (%d .. %d) step %d\n",
1591                     u, start, end, step);
1592
1593           /* use must_follow & must_precede bitmaps to determine order
1594              of nodes within the cycle.  */
1595           sbitmap_zero (must_precede);
1596           sbitmap_zero (must_follow);
1597           for (e = u_node->in; e != 0; e = e->next_in)
1598             if (TEST_BIT (sched_nodes, e->src->cuid)
1599                 && e->latency == (ii * e->distance)
1600                 && start == SCHED_TIME (e->src))
1601              SET_BIT (must_precede, e->src->cuid);
1602
1603           for (e = u_node->out; e != 0; e = e->next_out)
1604             if (TEST_BIT (sched_nodes, e->dest->cuid)
1605                 && e->latency == (ii * e->distance)
1606                 && end == SCHED_TIME (e->dest))
1607              SET_BIT (must_follow, e->dest->cuid);
1608
1609           success = 0;
1610           if ((step > 0 && start < end) ||  (step < 0 && start > end))
1611             for (c = start; c != end; c += step)
1612               {
1613                 ps_insn_ptr psi;
1614
1615                 psi = ps_add_node_check_conflicts (ps, u_node, c,
1616                                                    must_precede,
1617                                                    must_follow);
1618
1619                 if (psi)
1620                   {
1621                     SCHED_TIME (u_node) = c;
1622                     SET_BIT (sched_nodes, u);
1623                     success = 1;
1624                     if (dump_file)
1625                       fprintf(dump_file, "Schedule in %d\n", c);
1626                     break;
1627                   }
1628               }
1629           if (!success)
1630             {
1631               /* ??? Try backtracking instead of immediately ii++?  */
1632               ii++;
1633               try_again_with_larger_ii = true;
1634               reset_partial_schedule (ps, ii);
1635               break;
1636             }
1637           if (unscheduled_nodes)
1638             break;
1639
1640           /* ??? If (success), check register pressure estimates.  */
1641         } /* Continue with next node.  */
1642     } /* While try_again_with_larger_ii.  */
1643
1644   sbitmap_free (sched_nodes);
1645
1646   if (ii >= maxii)
1647     {
1648       free_partial_schedule (ps);
1649       ps = NULL;
1650     }
1651   return ps;
1652 }
1653
1654 \f
1655 /* This page implements the algorithm for ordering the nodes of a DDG
1656    for modulo scheduling, activated through the
1657    "int sms_order_nodes (ddg_ptr, int mii, int * result)" API.  */
1658
1659 #define ORDER_PARAMS(x) ((struct node_order_params *) (x)->aux.info)
1660 #define ASAP(x) (ORDER_PARAMS ((x))->asap)
1661 #define ALAP(x) (ORDER_PARAMS ((x))->alap)
1662 #define HEIGHT(x) (ORDER_PARAMS ((x))->height)
1663 #define MOB(x) (ALAP ((x)) - ASAP ((x)))
1664 #define DEPTH(x) (ASAP ((x)))
1665
1666 typedef struct node_order_params * nopa;
1667
1668 static void order_nodes_of_sccs (ddg_all_sccs_ptr, int * result);
1669 static int order_nodes_in_scc (ddg_ptr, sbitmap, sbitmap, int*, int);
1670 static nopa  calculate_order_params (ddg_ptr, int mii);
1671 static int find_max_asap (ddg_ptr, sbitmap);
1672 static int find_max_hv_min_mob (ddg_ptr, sbitmap);
1673 static int find_max_dv_min_mob (ddg_ptr, sbitmap);
1674
1675 enum sms_direction {BOTTOMUP, TOPDOWN};
1676
1677 struct node_order_params
1678 {
1679   int asap;
1680   int alap;
1681   int height;
1682 };
1683
1684 /* Check if NODE_ORDER contains a permutation of 0 .. NUM_NODES-1.  */
1685 static void
1686 check_nodes_order (int *node_order, int num_nodes)
1687 {
1688   int i;
1689   sbitmap tmp = sbitmap_alloc (num_nodes);
1690
1691   sbitmap_zero (tmp);
1692
1693   for (i = 0; i < num_nodes; i++)
1694     {
1695       int u = node_order[i];
1696
1697       gcc_assert (u < num_nodes && u >= 0 && !TEST_BIT (tmp, u));
1698
1699       SET_BIT (tmp, u);
1700     }
1701
1702   sbitmap_free (tmp);
1703 }
1704
1705 /* Order the nodes of G for scheduling and pass the result in
1706    NODE_ORDER.  Also set aux.count of each node to ASAP.
1707    Return the recMII for the given DDG.  */
1708 static int
1709 sms_order_nodes (ddg_ptr g, int mii, int * node_order)
1710 {
1711   int i;
1712   int rec_mii = 0;
1713   ddg_all_sccs_ptr sccs = create_ddg_all_sccs (g);
1714
1715   nopa nops = calculate_order_params (g, mii);
1716
1717   order_nodes_of_sccs (sccs, node_order);
1718
1719   if (sccs->num_sccs > 0)
1720     /* First SCC has the largest recurrence_length.  */
1721     rec_mii = sccs->sccs[0]->recurrence_length;
1722
1723   /* Save ASAP before destroying node_order_params.  */
1724   for (i = 0; i < g->num_nodes; i++)
1725     {
1726       ddg_node_ptr v = &g->nodes[i];
1727       v->aux.count = ASAP (v);
1728     }
1729
1730   free (nops);
1731   free_ddg_all_sccs (sccs);
1732   check_nodes_order (node_order, g->num_nodes);
1733
1734   return rec_mii;
1735 }
1736
1737 static void
1738 order_nodes_of_sccs (ddg_all_sccs_ptr all_sccs, int * node_order)
1739 {
1740   int i, pos = 0;
1741   ddg_ptr g = all_sccs->ddg;
1742   int num_nodes = g->num_nodes;
1743   sbitmap prev_sccs = sbitmap_alloc (num_nodes);
1744   sbitmap on_path = sbitmap_alloc (num_nodes);
1745   sbitmap tmp = sbitmap_alloc (num_nodes);
1746   sbitmap ones = sbitmap_alloc (num_nodes);
1747
1748   sbitmap_zero (prev_sccs);
1749   sbitmap_ones (ones);
1750
1751   /* Perfrom the node ordering starting from the SCC with the highest recMII.
1752      For each SCC order the nodes according to their ASAP/ALAP/HEIGHT etc.  */
1753   for (i = 0; i < all_sccs->num_sccs; i++)
1754     {
1755       ddg_scc_ptr scc = all_sccs->sccs[i];
1756
1757       /* Add nodes on paths from previous SCCs to the current SCC.  */
1758       find_nodes_on_paths (on_path, g, prev_sccs, scc->nodes);
1759       sbitmap_a_or_b (tmp, scc->nodes, on_path);
1760
1761       /* Add nodes on paths from the current SCC to previous SCCs.  */
1762       find_nodes_on_paths (on_path, g, scc->nodes, prev_sccs);
1763       sbitmap_a_or_b (tmp, tmp, on_path);
1764
1765       /* Remove nodes of previous SCCs from current extended SCC.  */
1766       sbitmap_difference (tmp, tmp, prev_sccs);
1767
1768       pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
1769       /* Above call to order_nodes_in_scc updated prev_sccs |= tmp.  */
1770     }
1771
1772   /* Handle the remaining nodes that do not belong to any scc.  Each call
1773      to order_nodes_in_scc handles a single connected component.  */
1774   while (pos < g->num_nodes)
1775     {
1776       sbitmap_difference (tmp, ones, prev_sccs);
1777       pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
1778     }
1779   sbitmap_free (prev_sccs);
1780   sbitmap_free (on_path);
1781   sbitmap_free (tmp);
1782   sbitmap_free (ones);
1783 }
1784
1785 /* MII is needed if we consider backarcs (that do not close recursive cycles).  */
1786 static struct node_order_params *
1787 calculate_order_params (ddg_ptr g, int mii ATTRIBUTE_UNUSED)
1788 {
1789   int u;
1790   int max_asap;
1791   int num_nodes = g->num_nodes;
1792   ddg_edge_ptr e;
1793   /* Allocate a place to hold ordering params for each node in the DDG.  */
1794   nopa node_order_params_arr;
1795
1796   /* Initialize of ASAP/ALAP/HEIGHT to zero.  */
1797   node_order_params_arr = (nopa) xcalloc (num_nodes,
1798                                           sizeof (struct node_order_params));
1799
1800   /* Set the aux pointer of each node to point to its order_params structure.  */
1801   for (u = 0; u < num_nodes; u++)
1802     g->nodes[u].aux.info = &node_order_params_arr[u];
1803
1804   /* Disregarding a backarc from each recursive cycle to obtain a DAG,
1805      calculate ASAP, ALAP, mobility, distance, and height for each node
1806      in the dependence (direct acyclic) graph.  */
1807
1808   /* We assume that the nodes in the array are in topological order.  */
1809
1810   max_asap = 0;
1811   for (u = 0; u < num_nodes; u++)
1812     {
1813       ddg_node_ptr u_node = &g->nodes[u];
1814
1815       ASAP (u_node) = 0;
1816       for (e = u_node->in; e; e = e->next_in)
1817         if (e->distance == 0)
1818           ASAP (u_node) = MAX (ASAP (u_node),
1819                                ASAP (e->src) + e->latency);
1820       max_asap = MAX (max_asap, ASAP (u_node));
1821     }
1822
1823   for (u = num_nodes - 1; u > -1; u--)
1824     {
1825       ddg_node_ptr u_node = &g->nodes[u];
1826
1827       ALAP (u_node) = max_asap;
1828       HEIGHT (u_node) = 0;
1829       for (e = u_node->out; e; e = e->next_out)
1830         if (e->distance == 0)
1831           {
1832             ALAP (u_node) = MIN (ALAP (u_node),
1833                                  ALAP (e->dest) - e->latency);
1834             HEIGHT (u_node) = MAX (HEIGHT (u_node),
1835                                    HEIGHT (e->dest) + e->latency);
1836           }
1837     }
1838
1839   return node_order_params_arr;
1840 }
1841
1842 static int
1843 find_max_asap (ddg_ptr g, sbitmap nodes)
1844 {
1845   int u;
1846   int max_asap = -1;
1847   int result = -1;
1848
1849   EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u,
1850     {
1851       ddg_node_ptr u_node = &g->nodes[u];
1852
1853       if (max_asap < ASAP (u_node))
1854         {
1855           max_asap = ASAP (u_node);
1856           result = u;
1857         }
1858     });
1859   return result;
1860 }
1861
1862 static int
1863 find_max_hv_min_mob (ddg_ptr g, sbitmap nodes)
1864 {
1865   int u;
1866   int max_hv = -1;
1867   int min_mob = INT_MAX;
1868   int result = -1;
1869
1870   EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u,
1871     {
1872       ddg_node_ptr u_node = &g->nodes[u];
1873
1874       if (max_hv < HEIGHT (u_node))
1875         {
1876           max_hv = HEIGHT (u_node);
1877           min_mob = MOB (u_node);
1878           result = u;
1879         }
1880       else if ((max_hv == HEIGHT (u_node))
1881                && (min_mob > MOB (u_node)))
1882         {
1883           min_mob = MOB (u_node);
1884           result = u;
1885         }
1886     });
1887   return result;
1888 }
1889
1890 static int
1891 find_max_dv_min_mob (ddg_ptr g, sbitmap nodes)
1892 {
1893   int u;
1894   int max_dv = -1;
1895   int min_mob = INT_MAX;
1896   int result = -1;
1897
1898   EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u,
1899     {
1900       ddg_node_ptr u_node = &g->nodes[u];
1901
1902       if (max_dv < DEPTH (u_node))
1903         {
1904           max_dv = DEPTH (u_node);
1905           min_mob = MOB (u_node);
1906           result = u;
1907         }
1908       else if ((max_dv == DEPTH (u_node))
1909                && (min_mob > MOB (u_node)))
1910         {
1911           min_mob = MOB (u_node);
1912           result = u;
1913         }
1914     });
1915   return result;
1916 }
1917
1918 /* Places the nodes of SCC into the NODE_ORDER array starting
1919    at position POS, according to the SMS ordering algorithm.
1920    NODES_ORDERED (in&out parameter) holds the bitset of all nodes in
1921    the NODE_ORDER array, starting from position zero.  */
1922 static int
1923 order_nodes_in_scc (ddg_ptr g, sbitmap nodes_ordered, sbitmap scc,
1924                     int * node_order, int pos)
1925 {
1926   enum sms_direction dir;
1927   int num_nodes = g->num_nodes;
1928   sbitmap workset = sbitmap_alloc (num_nodes);
1929   sbitmap tmp = sbitmap_alloc (num_nodes);
1930   sbitmap zero_bitmap = sbitmap_alloc (num_nodes);
1931   sbitmap predecessors = sbitmap_alloc (num_nodes);
1932   sbitmap successors = sbitmap_alloc (num_nodes);
1933
1934   sbitmap_zero (predecessors);
1935   find_predecessors (predecessors, g, nodes_ordered);
1936
1937   sbitmap_zero (successors);
1938   find_successors (successors, g, nodes_ordered);
1939
1940   sbitmap_zero (tmp);
1941   if (sbitmap_a_and_b_cg (tmp, predecessors, scc))
1942     {
1943       sbitmap_copy (workset, tmp);
1944       dir = BOTTOMUP;
1945     }
1946   else if (sbitmap_a_and_b_cg (tmp, successors, scc))
1947     {
1948       sbitmap_copy (workset, tmp);
1949       dir = TOPDOWN;
1950     }
1951   else
1952     {
1953       int u;
1954
1955       sbitmap_zero (workset);
1956       if ((u = find_max_asap (g, scc)) >= 0)
1957         SET_BIT (workset, u);
1958       dir = BOTTOMUP;
1959     }
1960
1961   sbitmap_zero (zero_bitmap);
1962   while (!sbitmap_equal (workset, zero_bitmap))
1963     {
1964       int v;
1965       ddg_node_ptr v_node;
1966       sbitmap v_node_preds;
1967       sbitmap v_node_succs;
1968
1969       if (dir == TOPDOWN)
1970         {
1971           while (!sbitmap_equal (workset, zero_bitmap))
1972             {
1973               v = find_max_hv_min_mob (g, workset);
1974               v_node = &g->nodes[v];
1975               node_order[pos++] = v;
1976               v_node_succs = NODE_SUCCESSORS (v_node);
1977               sbitmap_a_and_b (tmp, v_node_succs, scc);
1978
1979               /* Don't consider the already ordered successors again.  */
1980               sbitmap_difference (tmp, tmp, nodes_ordered);
1981               sbitmap_a_or_b (workset, workset, tmp);
1982               RESET_BIT (workset, v);
1983               SET_BIT (nodes_ordered, v);
1984             }
1985           dir = BOTTOMUP;
1986           sbitmap_zero (predecessors);
1987           find_predecessors (predecessors, g, nodes_ordered);
1988           sbitmap_a_and_b (workset, predecessors, scc);
1989         }
1990       else
1991         {
1992           while (!sbitmap_equal (workset, zero_bitmap))
1993             {
1994               v = find_max_dv_min_mob (g, workset);
1995               v_node = &g->nodes[v];
1996               node_order[pos++] = v;
1997               v_node_preds = NODE_PREDECESSORS (v_node);
1998               sbitmap_a_and_b (tmp, v_node_preds, scc);
1999
2000               /* Don't consider the already ordered predecessors again.  */
2001               sbitmap_difference (tmp, tmp, nodes_ordered);
2002               sbitmap_a_or_b (workset, workset, tmp);
2003               RESET_BIT (workset, v);
2004               SET_BIT (nodes_ordered, v);
2005             }
2006           dir = TOPDOWN;
2007           sbitmap_zero (successors);
2008           find_successors (successors, g, nodes_ordered);
2009           sbitmap_a_and_b (workset, successors, scc);
2010         }
2011     }
2012   sbitmap_free (tmp);
2013   sbitmap_free (workset);
2014   sbitmap_free (zero_bitmap);
2015   sbitmap_free (predecessors);
2016   sbitmap_free (successors);
2017   return pos;
2018 }
2019
2020 \f
2021 /* This page contains functions for manipulating partial-schedules during
2022    modulo scheduling.  */
2023
2024 /* Create a partial schedule and allocate a memory to hold II rows.  */
2025 partial_schedule_ptr
2026 create_partial_schedule (int ii, ddg_ptr g, int history)
2027 {
2028   partial_schedule_ptr ps = (partial_schedule_ptr)
2029                              xmalloc (sizeof (struct partial_schedule));
2030   ps->rows = (ps_insn_ptr *) xcalloc (ii, sizeof (ps_insn_ptr));
2031   ps->ii = ii;
2032   ps->history = history;
2033   ps->min_cycle = INT_MAX;
2034   ps->max_cycle = INT_MIN;
2035   ps->g = g;
2036
2037   return ps;
2038 }
2039
2040 /* Free the PS_INSNs in rows array of the given partial schedule.
2041    ??? Consider caching the PS_INSN's.  */
2042 static void
2043 free_ps_insns (partial_schedule_ptr ps)
2044 {
2045   int i;
2046
2047   for (i = 0; i < ps->ii; i++)
2048     {
2049       while (ps->rows[i])
2050         {
2051           ps_insn_ptr ps_insn = ps->rows[i]->next_in_row;
2052
2053           free (ps->rows[i]);
2054           ps->rows[i] = ps_insn;
2055         }
2056       ps->rows[i] = NULL;
2057     }
2058 }
2059
2060 /* Free all the memory allocated to the partial schedule.  */
2061 void
2062 free_partial_schedule (partial_schedule_ptr ps)
2063 {
2064   if (!ps)
2065     return;
2066   free_ps_insns (ps);
2067   free (ps->rows);
2068   free (ps);
2069 }
2070
2071 /* Clear the rows array with its PS_INSNs, and create a new one with
2072    NEW_II rows.  */
2073 void
2074 reset_partial_schedule (partial_schedule_ptr ps, int new_ii)
2075 {
2076   if (!ps)
2077     return;
2078   free_ps_insns (ps);
2079   if (new_ii == ps->ii)
2080     return;
2081   ps->rows = (ps_insn_ptr *) xrealloc (ps->rows, new_ii
2082                                                  * sizeof (ps_insn_ptr));
2083   memset (ps->rows, 0, new_ii * sizeof (ps_insn_ptr));
2084   ps->ii = new_ii;
2085   ps->min_cycle = INT_MAX;
2086   ps->max_cycle = INT_MIN;
2087 }
2088
2089 /* Prints the partial schedule as an ii rows array, for each rows
2090    print the ids of the insns in it.  */
2091 void
2092 print_partial_schedule (partial_schedule_ptr ps, FILE *dump)
2093 {
2094   int i;
2095
2096   for (i = 0; i < ps->ii; i++)
2097     {
2098       ps_insn_ptr ps_i = ps->rows[i];
2099
2100       fprintf (dump, "\n[CYCLE %d ]: ", i);
2101       while (ps_i)
2102         {
2103           fprintf (dump, "%d, ",
2104                    INSN_UID (ps_i->node->insn));
2105           ps_i = ps_i->next_in_row;
2106         }
2107     }
2108 }
2109
2110 /* Creates an object of PS_INSN and initializes it to the given parameters.  */
2111 static ps_insn_ptr
2112 create_ps_insn (ddg_node_ptr node, int rest_count, int cycle)
2113 {
2114   ps_insn_ptr ps_i = xmalloc (sizeof (struct ps_insn));
2115
2116   ps_i->node = node;
2117   ps_i->next_in_row = NULL;
2118   ps_i->prev_in_row = NULL;
2119   ps_i->row_rest_count = rest_count;
2120   ps_i->cycle = cycle;
2121
2122   return ps_i;
2123 }
2124
2125
2126 /* Removes the given PS_INSN from the partial schedule.  Returns false if the
2127    node is not found in the partial schedule, else returns true.  */
2128 static bool
2129 remove_node_from_ps (partial_schedule_ptr ps, ps_insn_ptr ps_i)
2130 {
2131   int row;
2132
2133   if (!ps || !ps_i)
2134     return false;
2135
2136   row = SMODULO (ps_i->cycle, ps->ii);
2137   if (! ps_i->prev_in_row)
2138     {
2139       if (ps_i != ps->rows[row])
2140         return false;
2141
2142       ps->rows[row] = ps_i->next_in_row;
2143       if (ps->rows[row])
2144         ps->rows[row]->prev_in_row = NULL;
2145     }
2146   else
2147     {
2148       ps_i->prev_in_row->next_in_row = ps_i->next_in_row;
2149       if (ps_i->next_in_row)
2150         ps_i->next_in_row->prev_in_row = ps_i->prev_in_row;
2151     }
2152   free (ps_i);
2153   return true;
2154 }
2155
2156 /* Unlike what literature describes for modulo scheduling (which focuses
2157    on VLIW machines) the order of the instructions inside a cycle is
2158    important.  Given the bitmaps MUST_FOLLOW and MUST_PRECEDE we know
2159    where the current instruction should go relative to the already
2160    scheduled instructions in the given cycle.  Go over these
2161    instructions and find the first possible column to put it in.  */
2162 static bool
2163 ps_insn_find_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
2164                      sbitmap must_precede, sbitmap must_follow)
2165 {
2166   ps_insn_ptr next_ps_i;
2167   ps_insn_ptr first_must_follow = NULL;
2168   ps_insn_ptr last_must_precede = NULL;
2169   int row;
2170
2171   if (! ps_i)
2172     return false;
2173
2174   row = SMODULO (ps_i->cycle, ps->ii);
2175
2176   /* Find the first must follow and the last must precede
2177      and insert the node immediately after the must precede
2178      but make sure that it there is no must follow after it.  */
2179   for (next_ps_i = ps->rows[row];
2180        next_ps_i;
2181        next_ps_i = next_ps_i->next_in_row)
2182     {
2183       if (TEST_BIT (must_follow, next_ps_i->node->cuid)
2184           && ! first_must_follow)
2185         first_must_follow = next_ps_i;
2186       if (TEST_BIT (must_precede, next_ps_i->node->cuid))
2187         {
2188           /* If we have already met a node that must follow, then
2189              there is no possible column.  */
2190           if (first_must_follow)
2191             return false;
2192           else
2193             last_must_precede = next_ps_i;
2194         }
2195     }
2196
2197   /* Now insert the node after INSERT_AFTER_PSI.  */
2198
2199   if (! last_must_precede)
2200     {
2201       ps_i->next_in_row = ps->rows[row];
2202       ps_i->prev_in_row = NULL;
2203       if (ps_i->next_in_row)
2204         ps_i->next_in_row->prev_in_row = ps_i;
2205       ps->rows[row] = ps_i;
2206     }
2207   else
2208     {
2209       ps_i->next_in_row = last_must_precede->next_in_row;
2210       last_must_precede->next_in_row = ps_i;
2211       ps_i->prev_in_row = last_must_precede;
2212       if (ps_i->next_in_row)
2213         ps_i->next_in_row->prev_in_row = ps_i;
2214     }
2215
2216   return true;
2217 }
2218
2219 /* Advances the PS_INSN one column in its current row; returns false
2220    in failure and true in success.  Bit N is set in MUST_FOLLOW if
2221    the node with cuid N must be come after the node pointed to by
2222    PS_I when scheduled in the same cycle.  */
2223 static int
2224 ps_insn_advance_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
2225                         sbitmap must_follow)
2226 {
2227   ps_insn_ptr prev, next;
2228   int row;
2229   ddg_node_ptr next_node;
2230
2231   if (!ps || !ps_i)
2232     return false;
2233
2234   row = SMODULO (ps_i->cycle, ps->ii);
2235
2236   if (! ps_i->next_in_row)
2237     return false;
2238
2239   next_node = ps_i->next_in_row->node;
2240
2241   /* Check if next_in_row is dependent on ps_i, both having same sched
2242      times (typically ANTI_DEP).  If so, ps_i cannot skip over it.  */
2243   if (TEST_BIT (must_follow, next_node->cuid))
2244     return false;
2245
2246   /* Advance PS_I over its next_in_row in the doubly linked list.  */
2247   prev = ps_i->prev_in_row;
2248   next = ps_i->next_in_row;
2249
2250   if (ps_i == ps->rows[row])
2251     ps->rows[row] = next;
2252
2253   ps_i->next_in_row = next->next_in_row;
2254
2255   if (next->next_in_row)
2256     next->next_in_row->prev_in_row = ps_i;
2257
2258   next->next_in_row = ps_i;
2259   ps_i->prev_in_row = next;
2260
2261   next->prev_in_row = prev;
2262   if (prev)
2263     prev->next_in_row = next;
2264
2265   return true;
2266 }
2267
2268 /* Inserts a DDG_NODE to the given partial schedule at the given cycle.
2269    Returns 0 if this is not possible and a PS_INSN otherwise.  Bit N is
2270    set in MUST_PRECEDE/MUST_FOLLOW if the node with cuid N must be come
2271    before/after (respectively) the node pointed to by PS_I when scheduled
2272    in the same cycle.  */
2273 static ps_insn_ptr
2274 add_node_to_ps (partial_schedule_ptr ps, ddg_node_ptr node, int cycle,
2275                 sbitmap must_precede, sbitmap must_follow)
2276 {
2277   ps_insn_ptr ps_i;
2278   int rest_count = 1;
2279   int row = SMODULO (cycle, ps->ii);
2280
2281   if (ps->rows[row]
2282       && ps->rows[row]->row_rest_count >= issue_rate)
2283     return NULL;
2284
2285   if (ps->rows[row])
2286     rest_count += ps->rows[row]->row_rest_count;
2287
2288   ps_i = create_ps_insn (node, rest_count, cycle);
2289
2290   /* Finds and inserts PS_I according to MUST_FOLLOW and
2291      MUST_PRECEDE.  */
2292   if (! ps_insn_find_column (ps, ps_i, must_precede, must_follow))
2293     {
2294       free (ps_i);
2295       return NULL;
2296     }
2297
2298   return ps_i;
2299 }
2300
2301 /* Advance time one cycle.  Assumes DFA is being used.  */
2302 static void
2303 advance_one_cycle (void)
2304 {
2305   if (targetm.sched.dfa_pre_cycle_insn)
2306     state_transition (curr_state,
2307                       targetm.sched.dfa_pre_cycle_insn ());
2308
2309   state_transition (curr_state, NULL);
2310
2311   if (targetm.sched.dfa_post_cycle_insn)
2312     state_transition (curr_state,
2313                       targetm.sched.dfa_post_cycle_insn ());
2314 }
2315
2316 /* Given the kernel of a loop (from FIRST_INSN to LAST_INSN), finds
2317    the number of cycles according to DFA that the kernel fits in,
2318    we use this to check if we done well with SMS after we add
2319    register moves.  In some cases register moves overhead makes
2320    it even worse than the original loop.  We want SMS to be performed
2321    when it gives less cycles after register moves are added.  */
2322 static int
2323 kernel_number_of_cycles (rtx first_insn, rtx last_insn)
2324 {
2325   int cycles = 0;
2326   rtx insn;
2327   int can_issue_more = issue_rate;
2328
2329   state_reset (curr_state);
2330
2331   for (insn = first_insn;
2332        insn != NULL_RTX && insn != last_insn;
2333        insn = NEXT_INSN (insn))
2334     {
2335       if (! INSN_P (insn) || GET_CODE (PATTERN (insn)) == USE)
2336         continue;
2337
2338       /* Check if there is room for the current insn.  */
2339       if (!can_issue_more || state_dead_lock_p (curr_state))
2340         {
2341           cycles ++;
2342           advance_one_cycle ();
2343           can_issue_more = issue_rate;
2344         }
2345
2346         /* Update the DFA state and return with failure if the DFA found
2347            recource conflicts.  */
2348       if (state_transition (curr_state, insn) >= 0)
2349         {
2350           cycles ++;
2351           advance_one_cycle ();
2352           can_issue_more = issue_rate;
2353         }
2354
2355       if (targetm.sched.variable_issue)
2356         can_issue_more =
2357           targetm.sched.variable_issue (sched_dump, sched_verbose,
2358                                         insn, can_issue_more);
2359       /* A naked CLOBBER or USE generates no instruction, so don't
2360          let them consume issue slots.  */
2361       else if (GET_CODE (PATTERN (insn)) != USE
2362                && GET_CODE (PATTERN (insn)) != CLOBBER)
2363         can_issue_more--;
2364     }
2365   return cycles;
2366 }
2367
2368 /* Checks if PS has resource conflicts according to DFA, starting from
2369    FROM cycle to TO cycle; returns true if there are conflicts and false
2370    if there are no conflicts.  Assumes DFA is being used.  */
2371 static int
2372 ps_has_conflicts (partial_schedule_ptr ps, int from, int to)
2373 {
2374   int cycle;
2375
2376   state_reset (curr_state);
2377
2378   for (cycle = from; cycle <= to; cycle++)
2379     {
2380       ps_insn_ptr crr_insn;
2381       /* Holds the remaining issue slots in the current row.  */
2382       int can_issue_more = issue_rate;
2383
2384       /* Walk through the DFA for the current row.  */
2385       for (crr_insn = ps->rows[SMODULO (cycle, ps->ii)];
2386            crr_insn;
2387            crr_insn = crr_insn->next_in_row)
2388         {
2389           rtx insn = crr_insn->node->insn;
2390
2391           if (!INSN_P (insn))
2392             continue;
2393
2394           /* Check if there is room for the current insn.  */
2395           if (!can_issue_more || state_dead_lock_p (curr_state))
2396             return true;
2397
2398           /* Update the DFA state and return with failure if the DFA found
2399              recource conflicts.  */
2400           if (state_transition (curr_state, insn) >= 0)
2401             return true;
2402
2403           if (targetm.sched.variable_issue)
2404             can_issue_more =
2405               targetm.sched.variable_issue (sched_dump, sched_verbose,
2406                                             insn, can_issue_more);
2407           /* A naked CLOBBER or USE generates no instruction, so don't
2408              let them consume issue slots.  */
2409           else if (GET_CODE (PATTERN (insn)) != USE
2410                    && GET_CODE (PATTERN (insn)) != CLOBBER)
2411             can_issue_more--;
2412         }
2413
2414       /* Advance the DFA to the next cycle.  */
2415       advance_one_cycle ();
2416     }
2417   return false;
2418 }
2419
2420 /* Checks if the given node causes resource conflicts when added to PS at
2421    cycle C.  If not the node is added to PS and returned; otherwise zero
2422    is returned.  Bit N is set in MUST_PRECEDE/MUST_FOLLOW if the node with
2423    cuid N must be come before/after (respectively) the node pointed to by
2424    PS_I when scheduled in the same cycle.  */
2425 ps_insn_ptr
2426 ps_add_node_check_conflicts (partial_schedule_ptr ps, ddg_node_ptr n,
2427                              int c, sbitmap must_precede,
2428                              sbitmap must_follow)
2429 {
2430   int has_conflicts = 0;
2431   ps_insn_ptr ps_i;
2432
2433   /* First add the node to the PS, if this succeeds check for
2434      conflicts, trying different issue slots in the same row.  */
2435   if (! (ps_i = add_node_to_ps (ps, n, c, must_precede, must_follow)))
2436     return NULL; /* Failed to insert the node at the given cycle.  */
2437
2438   has_conflicts = ps_has_conflicts (ps, c, c)
2439                   || (ps->history > 0
2440                       && ps_has_conflicts (ps,
2441                                            c - ps->history,
2442                                            c + ps->history));
2443
2444   /* Try different issue slots to find one that the given node can be
2445      scheduled in without conflicts.  */
2446   while (has_conflicts)
2447     {
2448       if (! ps_insn_advance_column (ps, ps_i, must_follow))
2449         break;
2450       has_conflicts = ps_has_conflicts (ps, c, c)
2451                       || (ps->history > 0
2452                           && ps_has_conflicts (ps,
2453                                                c - ps->history,
2454                                                c + ps->history));
2455     }
2456
2457   if (has_conflicts)
2458     {
2459       remove_node_from_ps (ps, ps_i);
2460       return NULL;
2461     }
2462
2463   ps->min_cycle = MIN (ps->min_cycle, c);
2464   ps->max_cycle = MAX (ps->max_cycle, c);
2465   return ps_i;
2466 }
2467
2468 /* Rotate the rows of PS such that insns scheduled at time
2469    START_CYCLE will appear in row 0.  Updates max/min_cycles.  */
2470 void
2471 rotate_partial_schedule (partial_schedule_ptr ps, int start_cycle)
2472 {
2473   int i, row, backward_rotates;
2474   int last_row = ps->ii - 1;
2475
2476   if (start_cycle == 0)
2477     return;
2478
2479   backward_rotates = SMODULO (start_cycle, ps->ii);
2480
2481   /* Revisit later and optimize this into a single loop.  */
2482   for (i = 0; i < backward_rotates; i++)
2483     {
2484       ps_insn_ptr first_row = ps->rows[0];
2485
2486       for (row = 0; row < last_row; row++)
2487         ps->rows[row] = ps->rows[row+1];
2488
2489       ps->rows[last_row] = first_row;
2490     }
2491
2492   ps->max_cycle -= start_cycle;
2493   ps->min_cycle -= start_cycle;
2494 }
2495
2496 /* Remove the node N from the partial schedule PS; because we restart the DFA
2497    each time we want to check for resource conflicts; this is equivalent to
2498    unscheduling the node N.  */
2499 static bool
2500 ps_unschedule_node (partial_schedule_ptr ps, ddg_node_ptr n)
2501 {
2502   ps_insn_ptr ps_i;
2503   int row = SMODULO (SCHED_TIME (n), ps->ii);
2504
2505   if (row < 0 || row > ps->ii)
2506     return false;
2507
2508   for (ps_i = ps->rows[row];
2509        ps_i &&  ps_i->node != n;
2510        ps_i = ps_i->next_in_row);
2511   if (!ps_i)
2512     return false;
2513
2514   return remove_node_from_ps (ps, ps_i);
2515 }
2516 #endif /* INSN_SCHEDULING*/
2517