gcc/loop-unroll.c

   1 /* Loop unrolling and peeling.
   2    Copyright (C) 2002, 2003, 2004, 2005, 2007, 2008, 2010
   3    Free Software Foundation, Inc.
   4
   5 This file is part of GCC.
   6
   7 GCC is free software; you can redistribute it and/or modify it under
   8 the terms of the GNU General Public License as published by the Free
   9 Software Foundation; either version 3, or (at your option) any later
  10 version.
  11
  12 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15 for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GCC; see the file COPYING3.  If not see
  19 <http://www.gnu.org/licenses/>.  */
  20
  21 #include "config.h"
  22 #include "system.h"
  23 #include "coretypes.h"
  24 #include "tm.h"
  25 #include "rtl.h"
  26 #include "hard-reg-set.h"
  27 #include "obstack.h"
  28 #include "basic-block.h"
  29 #include "cfgloop.h"
  30 #include "cfglayout.h"
  31 #include "params.h"
  32 #include "output.h"
  33 #include "expr.h"
  34 #include "hashtab.h"
  35 #include "recog.h"
  36 #include "target.h"
  37
  38 /* This pass performs loop unrolling and peeling.  We only perform these
  39    optimizations on innermost loops (with single exception) because
  40    the impact on performance is greatest here, and we want to avoid
  41    unnecessary code size growth.  The gain is caused by greater sequentiality
  42    of code, better code to optimize for further passes and in some cases
  43    by fewer testings of exit conditions.  The main problem is code growth,
  44    that impacts performance negatively due to effect of caches.
  45
  46    What we do:
  47
  48    -- complete peeling of once-rolling loops; this is the above mentioned
  49       exception, as this causes loop to be cancelled completely and
  50       does not cause code growth
  51    -- complete peeling of loops that roll (small) constant times.
  52    -- simple peeling of first iterations of loops that do not roll much
  53       (according to profile feedback)
  54    -- unrolling of loops that roll constant times; this is almost always
  55       win, as we get rid of exit condition tests.
  56    -- unrolling of loops that roll number of times that we can compute
  57       in runtime; we also get rid of exit condition tests here, but there
  58       is the extra expense for calculating the number of iterations
  59    -- simple unrolling of remaining loops; this is performed only if we
  60       are asked to, as the gain is questionable in this case and often
  61       it may even slow down the code
  62    For more detailed descriptions of each of those, see comments at
  63    appropriate function below.
  64
  65    There is a lot of parameters (defined and described in params.def) that
  66    control how much we unroll/peel.
  67
  68    ??? A great problem is that we don't have a good way how to determine
  69    how many times we should unroll the loop; the experiments I have made
  70    showed that this choice may affect performance in order of several %.
  71    */
  72
  73 /* Information about induction variables to split.  */
  74
  75 struct iv_to_split
  76 {
  77   rtx insn;             /* The insn in that the induction variable occurs.  */
  78   rtx base_var;         /* The variable on that the values in the further
  79                            iterations are based.  */
  80   rtx step;             /* Step of the induction variable.  */
  81   struct iv_to_split *next; /* Next entry in walking order.  */
  82   unsigned n_loc;
  83   unsigned loc[3];      /* Location where the definition of the induction
  84                            variable occurs in the insn.  For example if
  85                            N_LOC is 2, the expression is located at
  86                            XEXP (XEXP (single_set, loc[0]), loc[1]).  */
  87 };
  88
  89 /* Information about accumulators to expand.  */
  90
  91 struct var_to_expand
  92 {
  93   rtx insn;                        /* The insn in that the variable expansion occurs.  */
  94   rtx reg;                         /* The accumulator which is expanded.  */
  95   VEC(rtx,heap) *var_expansions;   /* The copies of the accumulator which is expanded.  */
  96   struct var_to_expand *next;      /* Next entry in walking order.  */
  97   enum rtx_code op;                /* The type of the accumulation - addition, subtraction
  98                                       or multiplication.  */
  99   int expansion_count;             /* Count the number of expansions generated so far.  */
 100   int reuse_expansion;             /* The expansion we intend to reuse to expand
 101                                       the accumulator.  If REUSE_EXPANSION is 0 reuse
 102                                       the original accumulator.  Else use
 103                                       var_expansions[REUSE_EXPANSION - 1].  */
 104   unsigned accum_pos;              /* The position in which the accumulator is placed in
 105                                       the insn src.  For example in x = x + something
 106                                       accum_pos is 0 while in x = something + x accum_pos
 107                                       is 1.  */
 108 };
 109
 110 /* Information about optimization applied in
 111    the unrolled loop.  */
 112
 113 struct opt_info
 114 {
 115   htab_t insns_to_split;           /* A hashtable of insns to split.  */
 116   struct iv_to_split *iv_to_split_head; /* The first iv to split.  */
 117   struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list.  */
 118   htab_t insns_with_var_to_expand; /* A hashtable of insns with accumulators
 119                                       to expand.  */
 120   struct var_to_expand *var_to_expand_head; /* The first var to expand.  */
 121   struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list.  */
 122   unsigned first_new_block;        /* The first basic block that was
 123                                       duplicated.  */
 124   basic_block loop_exit;           /* The loop exit basic block.  */
 125   basic_block loop_preheader;      /* The loop preheader basic block.  */
 126 };
 127
 128 static void decide_unrolling_and_peeling (int);
 129 static void peel_loops_completely (int);
 130 static void decide_peel_simple (struct loop *, int);
 131 static void decide_peel_once_rolling (struct loop *, int);
 132 static void decide_peel_completely (struct loop *, int);
 133 static void decide_unroll_stupid (struct loop *, int);
 134 static void decide_unroll_constant_iterations (struct loop *, int);
 135 static void decide_unroll_runtime_iterations (struct loop *, int);
 136 static void peel_loop_simple (struct loop *);
 137 static void peel_loop_completely (struct loop *);
 138 static void unroll_loop_stupid (struct loop *);
 139 static void unroll_loop_constant_iterations (struct loop *);
 140 static void unroll_loop_runtime_iterations (struct loop *);
 141 static struct opt_info *analyze_insns_in_loop (struct loop *);
 142 static void opt_info_start_duplication (struct opt_info *);
 143 static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
 144 static void free_opt_info (struct opt_info *);
 145 static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx);
 146 static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
 147 static struct iv_to_split *analyze_iv_to_split_insn (rtx);
 148 static void expand_var_during_unrolling (struct var_to_expand *, rtx);
 149 static void insert_var_expansion_initialization (struct var_to_expand *,
 150                                                  basic_block);
 151 static void combine_var_copies_in_loop_exit (struct var_to_expand *,
 152                                              basic_block);
 153 static rtx get_expansion (struct var_to_expand *);
 154
 155 /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
 156 void
 157 unroll_and_peel_loops (int flags)
 158 {
 159   struct loop *loop;
 160   bool check;
 161   loop_iterator li;
 162
 163   /* First perform complete loop peeling (it is almost surely a win,
 164      and affects parameters for further decision a lot).  */
 165   peel_loops_completely (flags);
 166
 167   /* Now decide rest of unrolling and peeling.  */
 168   decide_unrolling_and_peeling (flags);
 169
 170   /* Scan the loops, inner ones first.  */
 171   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 172     {
 173       check = true;
 174       /* And perform the appropriate transformations.  */
 175       switch (loop->lpt_decision.decision)
 176         {
 177         case LPT_PEEL_COMPLETELY:
 178           /* Already done.  */
 179           gcc_unreachable ();
 180         case LPT_PEEL_SIMPLE:
 181           peel_loop_simple (loop);
 182           break;
 183         case LPT_UNROLL_CONSTANT:
 184           unroll_loop_constant_iterations (loop);
 185           break;
 186         case LPT_UNROLL_RUNTIME:
 187           unroll_loop_runtime_iterations (loop);
 188           break;
 189         case LPT_UNROLL_STUPID:
 190           unroll_loop_stupid (loop);
 191           break;
 192         case LPT_NONE:
 193           check = false;
 194           break;
 195         default:
 196           gcc_unreachable ();
 197         }
 198       if (check)
 199         {
 200 #ifdef ENABLE_CHECKING
 201           verify_dominators (CDI_DOMINATORS);
 202           verify_loop_structure ();
 203 #endif
 204         }
 205     }
 206
 207   iv_analysis_done ();
 208 }
 209
 210 /* Check whether exit of the LOOP is at the end of loop body.  */
 211
 212 static bool
 213 loop_exit_at_end_p (struct loop *loop)
 214 {
 215   struct niter_desc *desc = get_simple_loop_desc (loop);
 216   rtx insn;
 217
 218   if (desc->in_edge->dest != loop->latch)
 219     return false;
 220
 221   /* Check that the latch is empty.  */
 222   FOR_BB_INSNS (loop->latch, insn)
 223     {
 224       if (INSN_P (insn))
 225         return false;
 226     }
 227
 228   return true;
 229 }
 230
 231 /* Depending on FLAGS, check whether to peel loops completely and do so.  */
 232 static void
 233 peel_loops_completely (int flags)
 234 {
 235   struct loop *loop;
 236   loop_iterator li;
 237
 238   /* Scan the loops, the inner ones first.  */
 239   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 240     {
 241       loop->lpt_decision.decision = LPT_NONE;
 242
 243       if (dump_file)
 244         fprintf (dump_file,
 245                  "\n;; *** Considering loop %d for complete peeling ***\n",
 246                  loop->num);
 247
 248       loop->ninsns = num_loop_insns (loop);
 249
 250       decide_peel_once_rolling (loop, flags);
 251       if (loop->lpt_decision.decision == LPT_NONE)
 252         decide_peel_completely (loop, flags);
 253
 254       if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 255         {
 256           peel_loop_completely (loop);
 257 #ifdef ENABLE_CHECKING
 258           verify_dominators (CDI_DOMINATORS);
 259           verify_loop_structure ();
 260 #endif
 261         }
 262     }
 263 }
 264
 265 /* Decide whether unroll or peel loops (depending on FLAGS) and how much.  */
 266 static void
 267 decide_unrolling_and_peeling (int flags)
 268 {
 269   struct loop *loop;
 270   loop_iterator li;
 271
 272   /* Scan the loops, inner ones first.  */
 273   FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
 274     {
 275       loop->lpt_decision.decision = LPT_NONE;
 276
 277       if (dump_file)
 278         fprintf (dump_file, "\n;; *** Considering loop %d ***\n", loop->num);
 279
 280       /* Do not peel cold areas.  */
 281       if (optimize_loop_for_size_p (loop))
 282         {
 283           if (dump_file)
 284             fprintf (dump_file, ";; Not considering loop, cold area\n");
 285           continue;
 286         }
 287
 288       /* Can the loop be manipulated?  */
 289       if (!can_duplicate_loop_p (loop))
 290         {
 291           if (dump_file)
 292             fprintf (dump_file,
 293                      ";; Not considering loop, cannot duplicate\n");
 294           continue;
 295         }
 296
 297       /* Skip non-innermost loops.  */
 298       if (loop->inner)
 299         {
 300           if (dump_file)
 301             fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 302           continue;
 303         }
 304
 305       loop->ninsns = num_loop_insns (loop);
 306       loop->av_ninsns = average_num_loop_insns (loop);
 307
 308       /* Try transformations one by one in decreasing order of
 309          priority.  */
 310
 311       decide_unroll_constant_iterations (loop, flags);
 312       if (loop->lpt_decision.decision == LPT_NONE)
 313         decide_unroll_runtime_iterations (loop, flags);
 314       if (loop->lpt_decision.decision == LPT_NONE)
 315         decide_unroll_stupid (loop, flags);
 316       if (loop->lpt_decision.decision == LPT_NONE)
 317         decide_peel_simple (loop, flags);
 318     }
 319 }
 320
 321 /* Decide whether the LOOP is once rolling and suitable for complete
 322    peeling.  */
 323 static void
 324 decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 325 {
 326   struct niter_desc *desc;
 327
 328   if (dump_file)
 329     fprintf (dump_file, "\n;; Considering peeling once rolling loop\n");
 330
 331   /* Is the loop small enough?  */
 332   if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
 333     {
 334       if (dump_file)
 335         fprintf (dump_file, ";; Not considering loop, is too big\n");
 336       return;
 337     }
 338
 339   /* Check for simple loops.  */
 340   desc = get_simple_loop_desc (loop);
 341
 342   /* Check number of iterations.  */
 343   if (!desc->simple_p
 344       || desc->assumptions
 345       || desc->infinite
 346       || !desc->const_iter
 347       || desc->niter != 0)
 348     {
 349       if (dump_file)
 350         fprintf (dump_file,
 351                  ";; Unable to prove that the loop rolls exactly once\n");
 352       return;
 353     }
 354
 355   /* Success.  */
 356   if (dump_file)
 357     fprintf (dump_file, ";; Decided to peel exactly once rolling loop\n");
 358   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 359 }
 360
 361 /* Decide whether the LOOP is suitable for complete peeling.  */
 362 static void
 363 decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 364 {
 365   unsigned npeel;
 366   struct niter_desc *desc;
 367
 368   if (dump_file)
 369     fprintf (dump_file, "\n;; Considering peeling completely\n");
 370
 371   /* Skip non-innermost loops.  */
 372   if (loop->inner)
 373     {
 374       if (dump_file)
 375         fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 376       return;
 377     }
 378
 379   /* Do not peel cold areas.  */
 380   if (optimize_loop_for_size_p (loop))
 381     {
 382       if (dump_file)
 383         fprintf (dump_file, ";; Not considering loop, cold area\n");
 384       return;
 385     }
 386
 387   /* Can the loop be manipulated?  */
 388   if (!can_duplicate_loop_p (loop))
 389     {
 390       if (dump_file)
 391         fprintf (dump_file,
 392                  ";; Not considering loop, cannot duplicate\n");
 393       return;
 394     }
 395
 396   /* npeel = number of iterations to peel.  */
 397   npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
 398   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
 399     npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
 400
 401   /* Is the loop small enough?  */
 402   if (!npeel)
 403     {
 404       if (dump_file)
 405         fprintf (dump_file, ";; Not considering loop, is too big\n");
 406       return;
 407     }
 408
 409   /* Check for simple loops.  */
 410   desc = get_simple_loop_desc (loop);
 411
 412   /* Check number of iterations.  */
 413   if (!desc->simple_p
 414       || desc->assumptions
 415       || !desc->const_iter
 416       || desc->infinite)
 417     {
 418       if (dump_file)
 419         fprintf (dump_file,
 420                  ";; Unable to prove that the loop iterates constant times\n");
 421       return;
 422     }
 423
 424   if (desc->niter > npeel - 1)
 425     {
 426       if (dump_file)
 427         {
 428           fprintf (dump_file,
 429                    ";; Not peeling loop completely, rolls too much (");
 430           fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, desc->niter);
 431           fprintf (dump_file, " iterations > %d [maximum peelings])\n", npeel);
 432         }
 433       return;
 434     }
 435
 436   /* Success.  */
 437   if (dump_file)
 438     fprintf (dump_file, ";; Decided to peel loop completely\n");
 439   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 440 }
 441
 442 /* Peel all iterations of LOOP, remove exit edges and cancel the loop
 443    completely.  The transformation done:
 444
 445    for (i = 0; i < 4; i++)
 446      body;
 447
 448    ==>
 449
 450    i = 0;
 451    body; i++;
 452    body; i++;
 453    body; i++;
 454    body; i++;
 455    */
 456 static void
 457 peel_loop_completely (struct loop *loop)
 458 {
 459   sbitmap wont_exit;
 460   unsigned HOST_WIDE_INT npeel;
 461   unsigned i;
 462   VEC (edge, heap) *remove_edges;
 463   edge ein;
 464   struct niter_desc *desc = get_simple_loop_desc (loop);
 465   struct opt_info *opt_info = NULL;
 466
 467   npeel = desc->niter;
 468
 469   if (npeel)
 470     {
 471       bool ok;
 472
 473       wont_exit = sbitmap_alloc (npeel + 1);
 474       sbitmap_ones (wont_exit);
 475       RESET_BIT (wont_exit, 0);
 476       if (desc->noloop_assumptions)
 477         RESET_BIT (wont_exit, 1);
 478
 479       remove_edges = NULL;
 480
 481       if (flag_split_ivs_in_unroller)
 482         opt_info = analyze_insns_in_loop (loop);
 483
 484       opt_info_start_duplication (opt_info);
 485       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 486                                           npeel,
 487                                           wont_exit, desc->out_edge,
 488                                           &remove_edges,
 489                                           DLTHE_FLAG_UPDATE_FREQ
 490                                           | DLTHE_FLAG_COMPLETTE_PEEL
 491                                           | (opt_info
 492                                              ? DLTHE_RECORD_COPY_NUMBER : 0));
 493       gcc_assert (ok);
 494
 495       free (wont_exit);
 496
 497       if (opt_info)
 498         {
 499           apply_opt_in_copies (opt_info, npeel, false, true);
 500           free_opt_info (opt_info);
 501         }
 502
 503       /* Remove the exit edges.  */
 504       for (i = 0; VEC_iterate (edge, remove_edges, i, ein); i++)
 505         remove_path (ein);
 506       VEC_free (edge, heap, remove_edges);
 507     }
 508
 509   ein = desc->in_edge;
 510   free_simple_loop_desc (loop);
 511
 512   /* Now remove the unreachable part of the last iteration and cancel
 513      the loop.  */
 514   remove_path (ein);
 515
 516   if (dump_file)
 517     fprintf (dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
 518 }
 519
 520 /* Decide whether to unroll LOOP iterating constant number of times
 521    and how much.  */
 522
 523 static void
 524 decide_unroll_constant_iterations (struct loop *loop, int flags)
 525 {
 526   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
 527   struct niter_desc *desc;
 528
 529   if (!(flags & UAP_UNROLL))
 530     {
 531       /* We were not asked to, just return back silently.  */
 532       return;
 533     }
 534
 535   if (dump_file)
 536     fprintf (dump_file,
 537              "\n;; Considering unrolling loop with constant "
 538              "number of iterations\n");
 539
 540   /* nunroll = total number of copies of the original loop body in
 541      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
 542   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 543   nunroll_by_av
 544     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 545   if (nunroll > nunroll_by_av)
 546     nunroll = nunroll_by_av;
 547   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 548     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 549
 550   /* Skip big loops.  */
 551   if (nunroll <= 1)
 552     {
 553       if (dump_file)
 554         fprintf (dump_file, ";; Not considering loop, is too big\n");
 555       return;
 556     }
 557
 558   /* Check for simple loops.  */
 559   desc = get_simple_loop_desc (loop);
 560
 561   /* Check number of iterations.  */
 562   if (!desc->simple_p || !desc->const_iter || desc->assumptions)
 563     {
 564       if (dump_file)
 565         fprintf (dump_file,
 566                  ";; Unable to prove that the loop iterates constant times\n");
 567       return;
 568     }
 569
 570   /* Check whether the loop rolls enough to consider.  */
 571   if (desc->niter < 2 * nunroll)
 572     {
 573       if (dump_file)
 574         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
 575       return;
 576     }
 577
 578   /* Success; now compute number of iterations to unroll.  We alter
 579      nunroll so that as few as possible copies of loop body are
 580      necessary, while still not decreasing the number of unrollings
 581      too much (at most by 1).  */
 582   best_copies = 2 * nunroll + 10;
 583
 584   i = 2 * nunroll + 2;
 585   if (i - 1 >= desc->niter)
 586     i = desc->niter - 2;
 587
 588   for (; i >= nunroll - 1; i--)
 589     {
 590       unsigned exit_mod = desc->niter % (i + 1);
 591
 592       if (!loop_exit_at_end_p (loop))
 593         n_copies = exit_mod + i + 1;
 594       else if (exit_mod != (unsigned) i
 595                || desc->noloop_assumptions != NULL_RTX)
 596         n_copies = exit_mod + i + 2;
 597       else
 598         n_copies = i + 1;
 599
 600       if (n_copies < best_copies)
 601         {
 602           best_copies = n_copies;
 603           best_unroll = i;
 604         }
 605     }
 606
 607   if (dump_file)
 608     fprintf (dump_file, ";; max_unroll %d (%d copies, initial %d).\n",
 609              best_unroll + 1, best_copies, nunroll);
 610
 611   loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
 612   loop->lpt_decision.times = best_unroll;
 613
 614   if (dump_file)
 615     fprintf (dump_file,
 616              ";; Decided to unroll the constant times rolling loop, %d times.\n",
 617              loop->lpt_decision.times);
 618 }
 619
 620 /* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES + 1
 621    times.  The transformation does this:
 622
 623    for (i = 0; i < 102; i++)
 624      body;
 625
 626    ==>
 627
 628    i = 0;
 629    body; i++;
 630    body; i++;
 631    while (i < 102)
 632      {
 633        body; i++;
 634        body; i++;
 635        body; i++;
 636        body; i++;
 637      }
 638   */
 639 static void
 640 unroll_loop_constant_iterations (struct loop *loop)
 641 {
 642   unsigned HOST_WIDE_INT niter;
 643   unsigned exit_mod;
 644   sbitmap wont_exit;
 645   unsigned i;
 646   VEC (edge, heap) *remove_edges;
 647   edge e;
 648   unsigned max_unroll = loop->lpt_decision.times;
 649   struct niter_desc *desc = get_simple_loop_desc (loop);
 650   bool exit_at_end = loop_exit_at_end_p (loop);
 651   struct opt_info *opt_info = NULL;
 652   bool ok;
 653
 654   niter = desc->niter;
 655
 656   /* Should not get here (such loop should be peeled instead).  */
 657   gcc_assert (niter > max_unroll + 1);
 658
 659   exit_mod = niter % (max_unroll + 1);
 660
 661   wont_exit = sbitmap_alloc (max_unroll + 1);
 662   sbitmap_ones (wont_exit);
 663
 664   remove_edges = NULL;
 665   if (flag_split_ivs_in_unroller
 666       || flag_variable_expansion_in_unroller)
 667     opt_info = analyze_insns_in_loop (loop);
 668
 669   if (!exit_at_end)
 670     {
 671       /* The exit is not at the end of the loop; leave exit test
 672          in the first copy, so that the loops that start with test
 673          of exit condition have continuous body after unrolling.  */
 674
 675       if (dump_file)
 676         fprintf (dump_file, ";; Condition on beginning of loop.\n");
 677
 678       /* Peel exit_mod iterations.  */
 679       RESET_BIT (wont_exit, 0);
 680       if (desc->noloop_assumptions)
 681         RESET_BIT (wont_exit, 1);
 682
 683       if (exit_mod)
 684         {
 685           opt_info_start_duplication (opt_info);
 686           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 687                                               exit_mod,
 688                                               wont_exit, desc->out_edge,
 689                                               &remove_edges,
 690                                               DLTHE_FLAG_UPDATE_FREQ
 691                                               | (opt_info && exit_mod > 1
 692                                                  ? DLTHE_RECORD_COPY_NUMBER
 693                                                    : 0));
 694           gcc_assert (ok);
 695
 696           if (opt_info && exit_mod > 1)
 697             apply_opt_in_copies (opt_info, exit_mod, false, false);
 698
 699           desc->noloop_assumptions = NULL_RTX;
 700           desc->niter -= exit_mod;
 701           desc->niter_max -= exit_mod;
 702         }
 703
 704       SET_BIT (wont_exit, 1);
 705     }
 706   else
 707     {
 708       /* Leave exit test in last copy, for the same reason as above if
 709          the loop tests the condition at the end of loop body.  */
 710
 711       if (dump_file)
 712         fprintf (dump_file, ";; Condition on end of loop.\n");
 713
 714       /* We know that niter >= max_unroll + 2; so we do not need to care of
 715          case when we would exit before reaching the loop.  So just peel
 716          exit_mod + 1 iterations.  */
 717       if (exit_mod != max_unroll
 718           || desc->noloop_assumptions)
 719         {
 720           RESET_BIT (wont_exit, 0);
 721           if (desc->noloop_assumptions)
 722             RESET_BIT (wont_exit, 1);
 723
 724           opt_info_start_duplication (opt_info);
 725           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 726                                               exit_mod + 1,
 727                                               wont_exit, desc->out_edge,
 728                                               &remove_edges,
 729                                               DLTHE_FLAG_UPDATE_FREQ
 730                                               | (opt_info && exit_mod > 0
 731                                                  ? DLTHE_RECORD_COPY_NUMBER
 732                                                    : 0));
 733           gcc_assert (ok);
 734
 735           if (opt_info && exit_mod > 0)
 736             apply_opt_in_copies (opt_info, exit_mod + 1, false, false);
 737
 738           desc->niter -= exit_mod + 1;
 739           desc->niter_max -= exit_mod + 1;
 740           desc->noloop_assumptions = NULL_RTX;
 741
 742           SET_BIT (wont_exit, 0);
 743           SET_BIT (wont_exit, 1);
 744         }
 745
 746       RESET_BIT (wont_exit, max_unroll);
 747     }
 748
 749   /* Now unroll the loop.  */
 750
 751   opt_info_start_duplication (opt_info);
 752   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
 753                                       max_unroll,
 754                                       wont_exit, desc->out_edge,
 755                                       &remove_edges,
 756                                       DLTHE_FLAG_UPDATE_FREQ
 757                                       | (opt_info
 758                                          ? DLTHE_RECORD_COPY_NUMBER
 759                                            : 0));
 760   gcc_assert (ok);
 761
 762   if (opt_info)
 763     {
 764       apply_opt_in_copies (opt_info, max_unroll, true, true);
 765       free_opt_info (opt_info);
 766     }
 767
 768   free (wont_exit);
 769
 770   if (exit_at_end)
 771     {
 772       basic_block exit_block = get_bb_copy (desc->in_edge->src);
 773       /* Find a new in and out edge; they are in the last copy we have made.  */
 774
 775       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
 776         {
 777           desc->out_edge = EDGE_SUCC (exit_block, 0);
 778           desc->in_edge = EDGE_SUCC (exit_block, 1);
 779         }
 780       else
 781         {
 782           desc->out_edge = EDGE_SUCC (exit_block, 1);
 783           desc->in_edge = EDGE_SUCC (exit_block, 0);
 784         }
 785     }
 786
 787   desc->niter /= max_unroll + 1;
 788   desc->niter_max /= max_unroll + 1;
 789   desc->niter_expr = GEN_INT (desc->niter);
 790
 791   /* Remove the edges.  */
 792   for (i = 0; VEC_iterate (edge, remove_edges, i, e); i++)
 793     remove_path (e);
 794   VEC_free (edge, heap, remove_edges);
 795
 796   if (dump_file)
 797     fprintf (dump_file,
 798              ";; Unrolled loop %d times, constant # of iterations %i insns\n",
 799              max_unroll, num_loop_insns (loop));
 800 }
 801
 802 /* Decide whether to unroll LOOP iterating runtime computable number of times
 803    and how much.  */
 804 static void
 805 decide_unroll_runtime_iterations (struct loop *loop, int flags)
 806 {
 807   unsigned nunroll, nunroll_by_av, i;
 808   struct niter_desc *desc;
 809
 810   if (!(flags & UAP_UNROLL))
 811     {
 812       /* We were not asked to, just return back silently.  */
 813       return;
 814     }
 815
 816   if (dump_file)
 817     fprintf (dump_file,
 818              "\n;; Considering unrolling loop with runtime "
 819              "computable number of iterations\n");
 820
 821   /* nunroll = total number of copies of the original loop body in
 822      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
 823   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 824   nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 825   if (nunroll > nunroll_by_av)
 826     nunroll = nunroll_by_av;
 827   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 828     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 829
 830   if (targetm.loop_unroll_adjust)
 831     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
 832
 833   /* Skip big loops.  */
 834   if (nunroll <= 1)
 835     {
 836       if (dump_file)
 837         fprintf (dump_file, ";; Not considering loop, is too big\n");
 838       return;
 839     }
 840
 841   /* Check for simple loops.  */
 842   desc = get_simple_loop_desc (loop);
 843
 844   /* Check simpleness.  */
 845   if (!desc->simple_p || desc->assumptions)
 846     {
 847       if (dump_file)
 848         fprintf (dump_file,
 849                  ";; Unable to prove that the number of iterations "
 850                  "can be counted in runtime\n");
 851       return;
 852     }
 853
 854   if (desc->const_iter)
 855     {
 856       if (dump_file)
 857         fprintf (dump_file, ";; Loop iterates constant times\n");
 858       return;
 859     }
 860
 861   /* If we have profile feedback, check whether the loop rolls.  */
 862   if (loop->header->count && expected_loop_iterations (loop) < 2 * nunroll)
 863     {
 864       if (dump_file)
 865         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
 866       return;
 867     }
 868
 869   /* Success; now force nunroll to be power of 2, as we are unable to
 870      cope with overflows in computation of number of iterations.  */
 871   for (i = 1; 2 * i <= nunroll; i *= 2)
 872     continue;
 873
 874   loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
 875   loop->lpt_decision.times = i - 1;
 876
 877   if (dump_file)
 878     fprintf (dump_file,
 879              ";; Decided to unroll the runtime computable "
 880              "times rolling loop, %d times.\n",
 881              loop->lpt_decision.times);
 882 }
 883
 884 /* Splits edge E and inserts the sequence of instructions INSNS on it, and
 885    returns the newly created block.  If INSNS is NULL_RTX, nothing is changed
 886    and NULL is returned instead.  */
 887
 888 basic_block
 889 split_edge_and_insert (edge e, rtx insns)
 890 {
 891   basic_block bb;
 892
 893   if (!insns)
 894     return NULL;
 895   bb = split_edge (e);
 896   emit_insn_after (insns, BB_END (bb));
 897
 898   /* ??? We used to assume that INSNS can contain control flow insns, and
 899      that we had to try to find sub basic blocks in BB to maintain a valid
 900      CFG.  For this purpose we used to set the BB_SUPERBLOCK flag on BB
 901      and call break_superblocks when going out of cfglayout mode.  But it
 902      turns out that this never happens; and that if it does ever happen,
 903      the verify_flow_info call in loop_optimizer_finalize would fail.
 904
 905      There are two reasons why we expected we could have control flow insns
 906      in INSNS.  The first is when a comparison has to be done in parts, and
 907      the second is when the number of iterations is computed for loops with
 908      the number of iterations known at runtime.  In both cases, test cases
 909      to get control flow in INSNS appear to be impossible to construct:
 910
 911       * If do_compare_rtx_and_jump needs several branches to do comparison
 912         in a mode that needs comparison by parts, we cannot analyze the
 913         number of iterations of the loop, and we never get to unrolling it.
 914
 915       * The code in expand_divmod that was suspected to cause creation of
 916         branching code seems to be only accessed for signed division.  The
 917         divisions used by # of iterations analysis are always unsigned.
 918         Problems might arise on architectures that emits branching code
 919         for some operations that may appear in the unroller (especially
 920         for division), but we have no such architectures.
 921
 922      Considering all this, it was decided that we should for now assume
 923      that INSNS can in theory contain control flow insns, but in practice
 924      it never does.  So we don't handle the theoretical case, and should
 925      a real failure ever show up, we have a pretty good clue for how to
 926      fix it.  */
 927
 928   return bb;
 929 }
 930
 931 /* Unroll LOOP for that we are able to count number of iterations in runtime
 932    LOOP->LPT_DECISION.TIMES + 1 times.  The transformation does this (with some
 933    extra care for case n < 0):
 934
 935    for (i = 0; i < n; i++)
 936      body;
 937
 938    ==>
 939
 940    i = 0;
 941    mod = n % 4;
 942
 943    switch (mod)
 944      {
 945        case 3:
 946          body; i++;
 947        case 2:
 948          body; i++;
 949        case 1:
 950          body; i++;
 951        case 0: ;
 952      }
 953
 954    while (i < n)
 955      {
 956        body; i++;
 957        body; i++;
 958        body; i++;
 959        body; i++;
 960      }
 961    */
 962 static void
 963 unroll_loop_runtime_iterations (struct loop *loop)
 964 {
 965   rtx old_niter, niter, init_code, branch_code, tmp;
 966   unsigned i, j, p;
 967   basic_block preheader, *body, swtch, ezc_swtch;
 968   VEC (basic_block, heap) *dom_bbs;
 969   sbitmap wont_exit;
 970   int may_exit_copy;
 971   unsigned n_peel;
 972   VEC (edge, heap) *remove_edges;
 973   edge e;
 974   bool extra_zero_check, last_may_exit;
 975   unsigned max_unroll = loop->lpt_decision.times;
 976   struct niter_desc *desc = get_simple_loop_desc (loop);
 977   bool exit_at_end = loop_exit_at_end_p (loop);
 978   struct opt_info *opt_info = NULL;
 979   bool ok;
 980
 981   if (flag_split_ivs_in_unroller
 982       || flag_variable_expansion_in_unroller)
 983     opt_info = analyze_insns_in_loop (loop);
 984
 985   /* Remember blocks whose dominators will have to be updated.  */
 986   dom_bbs = NULL;
 987
 988   body = get_loop_body (loop);
 989   for (i = 0; i < loop->num_nodes; i++)
 990     {
 991       VEC (basic_block, heap) *ldom;
 992       basic_block bb;
 993
 994       ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
 995       for (j = 0; VEC_iterate (basic_block, ldom, j, bb); j++)
 996         if (!flow_bb_inside_loop_p (loop, bb))
 997           VEC_safe_push (basic_block, heap, dom_bbs, bb);
 998
 999       VEC_free (basic_block, heap, ldom);
1000     }
1001   free (body);
1002
1003   if (!exit_at_end)
1004     {
1005       /* Leave exit in first copy (for explanation why see comment in
1006          unroll_loop_constant_iterations).  */
1007       may_exit_copy = 0;
1008       n_peel = max_unroll - 1;
1009       extra_zero_check = true;
1010       last_may_exit = false;
1011     }
1012   else
1013     {
1014       /* Leave exit in last copy (for explanation why see comment in
1015          unroll_loop_constant_iterations).  */
1016       may_exit_copy = max_unroll;
1017       n_peel = max_unroll;
1018       extra_zero_check = false;
1019       last_may_exit = true;
1020     }
1021
1022   /* Get expression for number of iterations.  */
1023   start_sequence ();
1024   old_niter = niter = gen_reg_rtx (desc->mode);
1025   tmp = force_operand (copy_rtx (desc->niter_expr), niter);
1026   if (tmp != niter)
1027     emit_move_insn (niter, tmp);
1028
1029   /* Count modulo by ANDing it with max_unroll; we use the fact that
1030      the number of unrollings is a power of two, and thus this is correct
1031      even if there is overflow in the computation.  */
1032   niter = expand_simple_binop (desc->mode, AND,
1033                                niter,
1034                                GEN_INT (max_unroll),
1035                                NULL_RTX, 0, OPTAB_LIB_WIDEN);
1036
1037   init_code = get_insns ();
1038   end_sequence ();
1039   unshare_all_rtl_in_chain (init_code);
1040
1041   /* Precondition the loop.  */
1042   split_edge_and_insert (loop_preheader_edge (loop), init_code);
1043
1044   remove_edges = NULL;
1045
1046   wont_exit = sbitmap_alloc (max_unroll + 2);
1047
1048   /* Peel the first copy of loop body (almost always we must leave exit test
1049      here; the only exception is when we have extra zero check and the number
1050      of iterations is reliable.  Also record the place of (possible) extra
1051      zero check.  */
1052   sbitmap_zero (wont_exit);
1053   if (extra_zero_check
1054       && !desc->noloop_assumptions)
1055     SET_BIT (wont_exit, 1);
1056   ezc_swtch = loop_preheader_edge (loop)->src;
1057   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1058                                       1, wont_exit, desc->out_edge,
1059                                       &remove_edges,
1060                                       DLTHE_FLAG_UPDATE_FREQ);
1061   gcc_assert (ok);
1062
1063   /* Record the place where switch will be built for preconditioning.  */
1064   swtch = split_edge (loop_preheader_edge (loop));
1065
1066   for (i = 0; i < n_peel; i++)
1067     {
1068       /* Peel the copy.  */
1069       sbitmap_zero (wont_exit);
1070       if (i != n_peel - 1 || !last_may_exit)
1071         SET_BIT (wont_exit, 1);
1072       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1073                                           1, wont_exit, desc->out_edge,
1074                                           &remove_edges,
1075                                           DLTHE_FLAG_UPDATE_FREQ);
1076       gcc_assert (ok);
1077
1078       /* Create item for switch.  */
1079       j = n_peel - i - (extra_zero_check ? 0 : 1);
1080       p = REG_BR_PROB_BASE / (i + 2);
1081
1082       preheader = split_edge (loop_preheader_edge (loop));
1083       branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
1084                                           block_label (preheader), p,
1085                                           NULL_RTX);
1086
1087       /* We rely on the fact that the compare and jump cannot be optimized out,
1088          and hence the cfg we create is correct.  */
1089       gcc_assert (branch_code != NULL_RTX);
1090
1091       swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
1092       set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1093       single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1094       e = make_edge (swtch, preheader,
1095                      single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1096       e->probability = p;
1097     }
1098
1099   if (extra_zero_check)
1100     {
1101       /* Add branch for zero iterations.  */
1102       p = REG_BR_PROB_BASE / (max_unroll + 1);
1103       swtch = ezc_swtch;
1104       preheader = split_edge (loop_preheader_edge (loop));
1105       branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
1106                                           block_label (preheader), p,
1107                                           NULL_RTX);
1108       gcc_assert (branch_code != NULL_RTX);
1109
1110       swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
1111       set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1112       single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1113       e = make_edge (swtch, preheader,
1114                      single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1115       e->probability = p;
1116     }
1117
1118   /* Recount dominators for outer blocks.  */
1119   iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);
1120
1121   /* And unroll loop.  */
1122
1123   sbitmap_ones (wont_exit);
1124   RESET_BIT (wont_exit, may_exit_copy);
1125   opt_info_start_duplication (opt_info);
1126
1127   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1128                                       max_unroll,
1129                                       wont_exit, desc->out_edge,
1130                                       &remove_edges,
1131                                       DLTHE_FLAG_UPDATE_FREQ
1132                                       | (opt_info
1133                                          ? DLTHE_RECORD_COPY_NUMBER
1134                                            : 0));
1135   gcc_assert (ok);
1136
1137   if (opt_info)
1138     {
1139       apply_opt_in_copies (opt_info, max_unroll, true, true);
1140       free_opt_info (opt_info);
1141     }
1142
1143   free (wont_exit);
1144
1145   if (exit_at_end)
1146     {
1147       basic_block exit_block = get_bb_copy (desc->in_edge->src);
1148       /* Find a new in and out edge; they are in the last copy we have
1149          made.  */
1150
1151       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
1152         {
1153           desc->out_edge = EDGE_SUCC (exit_block, 0);
1154           desc->in_edge = EDGE_SUCC (exit_block, 1);
1155         }
1156       else
1157         {
1158           desc->out_edge = EDGE_SUCC (exit_block, 1);
1159           desc->in_edge = EDGE_SUCC (exit_block, 0);
1160         }
1161     }
1162
1163   /* Remove the edges.  */
1164   for (i = 0; VEC_iterate (edge, remove_edges, i, e); i++)
1165     remove_path (e);
1166   VEC_free (edge, heap, remove_edges);
1167
1168   /* We must be careful when updating the number of iterations due to
1169      preconditioning and the fact that the value must be valid at entry
1170      of the loop.  After passing through the above code, we see that
1171      the correct new number of iterations is this:  */
1172   gcc_assert (!desc->const_iter);
1173   desc->niter_expr =
1174     simplify_gen_binary (UDIV, desc->mode, old_niter,
1175                          GEN_INT (max_unroll + 1));
1176   desc->niter_max /= max_unroll + 1;
1177   if (exit_at_end)
1178     {
1179       desc->niter_expr =
1180         simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
1181       desc->noloop_assumptions = NULL_RTX;
1182       desc->niter_max--;
1183     }
1184
1185   if (dump_file)
1186     fprintf (dump_file,
1187              ";; Unrolled loop %d times, counting # of iterations "
1188              "in runtime, %i insns\n",
1189              max_unroll, num_loop_insns (loop));
1190
1191   VEC_free (basic_block, heap, dom_bbs);
1192 }
1193
1194 /* Decide whether to simply peel LOOP and how much.  */
1195 static void
1196 decide_peel_simple (struct loop *loop, int flags)
1197 {
1198   unsigned npeel;
1199   struct niter_desc *desc;
1200
1201   if (!(flags & UAP_PEEL))
1202     {
1203       /* We were not asked to, just return back silently.  */
1204       return;
1205     }
1206
1207   if (dump_file)
1208     fprintf (dump_file, "\n;; Considering simply peeling loop\n");
1209
1210   /* npeel = number of iterations to peel.  */
1211   npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
1212   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
1213     npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
1214
1215   /* Skip big loops.  */
1216   if (!npeel)
1217     {
1218       if (dump_file)
1219         fprintf (dump_file, ";; Not considering loop, is too big\n");
1220       return;
1221     }
1222
1223   /* Check for simple loops.  */
1224   desc = get_simple_loop_desc (loop);
1225
1226   /* Check number of iterations.  */
1227   if (desc->simple_p && !desc->assumptions && desc->const_iter)
1228     {
1229       if (dump_file)
1230         fprintf (dump_file, ";; Loop iterates constant times\n");
1231       return;
1232     }
1233
1234   /* Do not simply peel loops with branches inside -- it increases number
1235      of mispredicts.  */
1236   if (num_loop_branches (loop) > 1)
1237     {
1238       if (dump_file)
1239         fprintf (dump_file, ";; Not peeling, contains branches\n");
1240       return;
1241     }
1242
1243   if (loop->header->count)
1244     {
1245       unsigned niter = expected_loop_iterations (loop);
1246       if (niter + 1 > npeel)
1247         {
1248           if (dump_file)
1249             {
1250               fprintf (dump_file, ";; Not peeling loop, rolls too much (");
1251               fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1252                        (HOST_WIDEST_INT) (niter + 1));
1253               fprintf (dump_file, " iterations > %d [maximum peelings])\n",
1254                        npeel);
1255             }
1256           return;
1257         }
1258       npeel = niter + 1;
1259     }
1260   else
1261     {
1262       /* For now we have no good heuristics to decide whether loop peeling
1263          will be effective, so disable it.  */
1264       if (dump_file)
1265         fprintf (dump_file,
1266                  ";; Not peeling loop, no evidence it will be profitable\n");
1267       return;
1268     }
1269
1270   /* Success.  */
1271   loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
1272   loop->lpt_decision.times = npeel;
1273
1274   if (dump_file)
1275     fprintf (dump_file, ";; Decided to simply peel the loop, %d times.\n",
1276              loop->lpt_decision.times);
1277 }
1278
1279 /* Peel a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation:
1280    while (cond)
1281      body;
1282
1283    ==>
1284
1285    if (!cond) goto end;
1286    body;
1287    if (!cond) goto end;
1288    body;
1289    while (cond)
1290      body;
1291    end: ;
1292    */
1293 static void
1294 peel_loop_simple (struct loop *loop)
1295 {
1296   sbitmap wont_exit;
1297   unsigned npeel = loop->lpt_decision.times;
1298   struct niter_desc *desc = get_simple_loop_desc (loop);
1299   struct opt_info *opt_info = NULL;
1300   bool ok;
1301
1302   if (flag_split_ivs_in_unroller && npeel > 1)
1303     opt_info = analyze_insns_in_loop (loop);
1304
1305   wont_exit = sbitmap_alloc (npeel + 1);
1306   sbitmap_zero (wont_exit);
1307
1308   opt_info_start_duplication (opt_info);
1309
1310   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1311                                       npeel, wont_exit, NULL,
1312                                       NULL, DLTHE_FLAG_UPDATE_FREQ
1313                                       | (opt_info
1314                                          ? DLTHE_RECORD_COPY_NUMBER
1315                                            : 0));
1316   gcc_assert (ok);
1317
1318   free (wont_exit);
1319
1320   if (opt_info)
1321     {
1322       apply_opt_in_copies (opt_info, npeel, false, false);
1323       free_opt_info (opt_info);
1324     }
1325
1326   if (desc->simple_p)
1327     {
1328       if (desc->const_iter)
1329         {
1330           desc->niter -= npeel;
1331           desc->niter_expr = GEN_INT (desc->niter);
1332           desc->noloop_assumptions = NULL_RTX;
1333         }
1334       else
1335         {
1336           /* We cannot just update niter_expr, as its value might be clobbered
1337              inside loop.  We could handle this by counting the number into
1338              temporary just like we do in runtime unrolling, but it does not
1339              seem worthwhile.  */
1340           free_simple_loop_desc (loop);
1341         }
1342     }
1343   if (dump_file)
1344     fprintf (dump_file, ";; Peeling loop %d times\n", npeel);
1345 }
1346
1347 /* Decide whether to unroll LOOP stupidly and how much.  */
1348 static void
1349 decide_unroll_stupid (struct loop *loop, int flags)
1350 {
1351   unsigned nunroll, nunroll_by_av, i;
1352   struct niter_desc *desc;
1353
1354   if (!(flags & UAP_UNROLL_ALL))
1355     {
1356       /* We were not asked to, just return back silently.  */
1357       return;
1358     }
1359
1360   if (dump_file)
1361     fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");
1362
1363   /* nunroll = total number of copies of the original loop body in
1364      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
1365   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
1366   nunroll_by_av
1367     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
1368   if (nunroll > nunroll_by_av)
1369     nunroll = nunroll_by_av;
1370   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
1371     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
1372
1373   if (targetm.loop_unroll_adjust)
1374     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
1375
1376   /* Skip big loops.  */
1377   if (nunroll <= 1)
1378     {
1379       if (dump_file)
1380         fprintf (dump_file, ";; Not considering loop, is too big\n");
1381       return;
1382     }
1383
1384   /* Check for simple loops.  */
1385   desc = get_simple_loop_desc (loop);
1386
1387   /* Check simpleness.  */
1388   if (desc->simple_p && !desc->assumptions)
1389     {
1390       if (dump_file)
1391         fprintf (dump_file, ";; The loop is simple\n");
1392       return;
1393     }
1394
1395   /* Do not unroll loops with branches inside -- it increases number
1396      of mispredicts.  */
1397   if (num_loop_branches (loop) > 1)
1398     {
1399       if (dump_file)
1400         fprintf (dump_file, ";; Not unrolling, contains branches\n");
1401       return;
1402     }
1403
1404   /* If we have profile feedback, check whether the loop rolls.  */
1405   if (loop->header->count
1406       && expected_loop_iterations (loop) < 2 * nunroll)
1407     {
1408       if (dump_file)
1409         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1410       return;
1411     }
1412
1413   /* Success.  Now force nunroll to be power of 2, as it seems that this
1414      improves results (partially because of better alignments, partially
1415      because of some dark magic).  */
1416   for (i = 1; 2 * i <= nunroll; i *= 2)
1417     continue;
1418
1419   loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1420   loop->lpt_decision.times = i - 1;
1421
1422   if (dump_file)
1423     fprintf (dump_file,
1424              ";; Decided to unroll the loop stupidly, %d times.\n",
1425              loop->lpt_decision.times);
1426 }
1427
1428 /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation:
1429    while (cond)
1430      body;
1431
1432    ==>
1433
1434    while (cond)
1435      {
1436        body;
1437        if (!cond) break;
1438        body;
1439        if (!cond) break;
1440        body;
1441        if (!cond) break;
1442        body;
1443      }
1444    */
1445 static void
1446 unroll_loop_stupid (struct loop *loop)
1447 {
1448   sbitmap wont_exit;
1449   unsigned nunroll = loop->lpt_decision.times;
1450   struct niter_desc *desc = get_simple_loop_desc (loop);
1451   struct opt_info *opt_info = NULL;
1452   bool ok;
1453
1454   if (flag_split_ivs_in_unroller
1455       || flag_variable_expansion_in_unroller)
1456     opt_info = analyze_insns_in_loop (loop);
1457
1458
1459   wont_exit = sbitmap_alloc (nunroll + 1);
1460   sbitmap_zero (wont_exit);
1461   opt_info_start_duplication (opt_info);
1462
1463   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1464                                       nunroll, wont_exit,
1465                                       NULL, NULL,
1466                                       DLTHE_FLAG_UPDATE_FREQ
1467                                       | (opt_info
1468                                          ? DLTHE_RECORD_COPY_NUMBER
1469                                            : 0));
1470   gcc_assert (ok);
1471
1472   if (opt_info)
1473     {
1474       apply_opt_in_copies (opt_info, nunroll, true, true);
1475       free_opt_info (opt_info);
1476     }
1477
1478   free (wont_exit);
1479
1480   if (desc->simple_p)
1481     {
1482       /* We indeed may get here provided that there are nontrivial assumptions
1483          for a loop to be really simple.  We could update the counts, but the
1484          problem is that we are unable to decide which exit will be taken
1485          (not really true in case the number of iterations is constant,
1486          but noone will do anything with this information, so we do not
1487          worry about it).  */
1488       desc->simple_p = false;
1489     }
1490
1491   if (dump_file)
1492     fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
1493              nunroll, num_loop_insns (loop));
1494 }
1495
1496 /* A hash function for information about insns to split.  */
1497
1498 static hashval_t
1499 si_info_hash (const void *ivts)
1500 {
1501   return (hashval_t) INSN_UID (((const struct iv_to_split *) ivts)->insn);
1502 }
1503
1504 /* An equality functions for information about insns to split.  */
1505
1506 static int
1507 si_info_eq (const void *ivts1, const void *ivts2)
1508 {
1509   const struct iv_to_split *const i1 = (const struct iv_to_split *) ivts1;
1510   const struct iv_to_split *const i2 = (const struct iv_to_split *) ivts2;
1511
1512   return i1->insn == i2->insn;
1513 }
1514
1515 /* Return a hash for VES, which is really a "var_to_expand *".  */
1516
1517 static hashval_t
1518 ve_info_hash (const void *ves)
1519 {
1520   return (hashval_t) INSN_UID (((const struct var_to_expand *) ves)->insn);
1521 }
1522
1523 /* Return true if IVTS1 and IVTS2 (which are really both of type
1524    "var_to_expand *") refer to the same instruction.  */
1525
1526 static int
1527 ve_info_eq (const void *ivts1, const void *ivts2)
1528 {
1529   const struct var_to_expand *const i1 = (const struct var_to_expand *) ivts1;
1530   const struct var_to_expand *const i2 = (const struct var_to_expand *) ivts2;
1531
1532   return i1->insn == i2->insn;
1533 }
1534
1535 /* Returns true if REG is referenced in one nondebug insn in LOOP.
1536    Set *DEBUG_USES to the number of debug insns that reference the
1537    variable.  */
1538
1539 bool
1540 referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
1541                                   int *debug_uses)
1542 {
1543   basic_block *body, bb;
1544   unsigned i;
1545   int count_ref = 0;
1546   rtx insn;
1547
1548   body = get_loop_body (loop);
1549   for (i = 0; i < loop->num_nodes; i++)
1550     {
1551       bb = body[i];
1552
1553       FOR_BB_INSNS (bb, insn)
1554         if (!rtx_referenced_p (reg, insn))
1555           continue;
1556         else if (DEBUG_INSN_P (insn))
1557           ++*debug_uses;
1558         else if (++count_ref > 1)
1559           break;
1560     }
1561   free (body);
1562   return (count_ref  == 1);
1563 }
1564
1565 /* Reset the DEBUG_USES debug insns in LOOP that reference REG.  */
1566
1567 static void
1568 reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
1569 {
1570   basic_block *body, bb;
1571   unsigned i;
1572   rtx insn;
1573
1574   body = get_loop_body (loop);
1575   for (i = 0; debug_uses && i < loop->num_nodes; i++)
1576     {
1577       bb = body[i];
1578
1579       FOR_BB_INSNS (bb, insn)
1580         if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
1581           continue;
1582         else
1583           {
1584             validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
1585                              gen_rtx_UNKNOWN_VAR_LOC (), 0);
1586             if (!--debug_uses)
1587               break;
1588           }
1589     }
1590   free (body);
1591 }
1592
1593 /* Determine whether INSN contains an accumulator
1594    which can be expanded into separate copies,
1595    one for each copy of the LOOP body.
1596
1597    for (i = 0 ; i < n; i++)
1598      sum += a[i];
1599
1600    ==>
1601
1602    sum += a[i]
1603    ....
1604    i = i+1;
1605    sum1 += a[i]
1606    ....
1607    i = i+1
1608    sum2 += a[i];
1609    ....
1610
1611    Return NULL if INSN contains no opportunity for expansion of accumulator.
1612    Otherwise, allocate a VAR_TO_EXPAND structure, fill it with the relevant
1613    information and return a pointer to it.
1614 */
1615
1616 static struct var_to_expand *
1617 analyze_insn_to_expand_var (struct loop *loop, rtx insn)
1618 {
1619   rtx set, dest, src, op1, op2, something;
1620   struct var_to_expand *ves;
1621   enum machine_mode mode1, mode2;
1622   unsigned accum_pos;
1623   int debug_uses = 0;
1624
1625   set = single_set (insn);
1626   if (!set)
1627     return NULL;
1628
1629   dest = SET_DEST (set);
1630   src = SET_SRC (set);
1631
1632   if (GET_CODE (src) != PLUS
1633       && GET_CODE (src) != MINUS
1634       && GET_CODE (src) != MULT)
1635     return NULL;
1636
1637   /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
1638      in MD.  But if there is no optab to generate the insn, we can not
1639      perform the variable expansion.  This can happen if an MD provides
1640      an insn but not a named pattern to generate it, for example to avoid
1641      producing code that needs additional mode switches like for x87/mmx.
1642
1643      So we check have_insn_for which looks for an optab for the operation
1644      in SRC.  If it doesn't exist, we can't perform the expansion even
1645      though INSN is valid.  */
1646   if (!have_insn_for (GET_CODE (src), GET_MODE (src)))
1647     return NULL;
1648
1649   op1 = XEXP (src, 0);
1650   op2 = XEXP (src, 1);
1651
1652   if (!REG_P (dest)
1653       && !(GET_CODE (dest) == SUBREG
1654            && REG_P (SUBREG_REG (dest))))
1655     return NULL;
1656
1657   if (rtx_equal_p (dest, op1))
1658     accum_pos = 0;
1659   else if (rtx_equal_p (dest, op2))
1660     accum_pos = 1;
1661   else
1662     return NULL;
1663
1664   /* The method of expansion that we are using; which includes
1665      the initialization of the expansions with zero and the summation of
1666      the expansions at the end of the computation will yield wrong results
1667      for (x = something - x) thus avoid using it in that case.  */
1668   if (accum_pos == 1
1669       && GET_CODE (src) == MINUS)
1670    return NULL;
1671
1672   something = (accum_pos == 0) ? op2 : op1;
1673
1674   if (rtx_referenced_p (dest, something))
1675     return NULL;
1676
1677   if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
1678     return NULL;
1679
1680   mode1 = GET_MODE (dest);
1681   mode2 = GET_MODE (something);
1682   if ((FLOAT_MODE_P (mode1)
1683        || FLOAT_MODE_P (mode2))
1684       && !flag_associative_math)
1685     return NULL;
1686
1687   if (dump_file)
1688   {
1689     fprintf (dump_file,
1690     "\n;; Expanding Accumulator ");
1691     print_rtl (dump_file, dest);
1692     fprintf (dump_file, "\n");
1693   }
1694
1695   if (debug_uses)
1696     /* Instead of resetting the debug insns, we could replace each
1697        debug use in the loop with the sum or product of all expanded
1698        accummulators.  Since we'll only know of all expansions at the
1699        end, we'd have to keep track of which vars_to_expand a debug
1700        insn in the loop references, take note of each copy of the
1701        debug insn during unrolling, and when it's all done, compute
1702        the sum or product of each variable and adjust the original
1703        debug insn and each copy thereof.  What a pain!  */
1704     reset_debug_uses_in_loop (loop, dest, debug_uses);
1705
1706   /* Record the accumulator to expand.  */
1707   ves = XNEW (struct var_to_expand);
1708   ves->insn = insn;
1709   ves->reg = copy_rtx (dest);
1710   ves->var_expansions = VEC_alloc (rtx, heap, 1);
1711   ves->next = NULL;
1712   ves->op = GET_CODE (src);
1713   ves->expansion_count = 0;
1714   ves->reuse_expansion = 0;
1715   ves->accum_pos = accum_pos;
1716   return ves;
1717 }
1718
1719 /* Determine whether there is an induction variable in INSN that
1720    we would like to split during unrolling.
1721
1722    I.e. replace
1723
1724    i = i + 1;
1725    ...
1726    i = i + 1;
1727    ...
1728    i = i + 1;
1729    ...
1730
1731    type chains by
1732
1733    i0 = i + 1
1734    ...
1735    i = i0 + 1
1736    ...
1737    i = i0 + 2
1738    ...
1739
1740    Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
1741    an IV_TO_SPLIT structure, fill it with the relevant information and return a
1742    pointer to it.  */
1743
1744 static struct iv_to_split *
1745 analyze_iv_to_split_insn (rtx insn)
1746 {
1747   rtx set, dest;
1748   struct rtx_iv iv;
1749   struct iv_to_split *ivts;
1750   bool ok;
1751
1752   /* For now we just split the basic induction variables.  Later this may be
1753      extended for example by selecting also addresses of memory references.  */
1754   set = single_set (insn);
1755   if (!set)
1756     return NULL;
1757
1758   dest = SET_DEST (set);
1759   if (!REG_P (dest))
1760     return NULL;
1761
1762   if (!biv_p (insn, dest))
1763     return NULL;
1764
1765   ok = iv_analyze_result (insn, dest, &iv);
1766
1767   /* This used to be an assert under the assumption that if biv_p returns
1768      true that iv_analyze_result must also return true.  However, that
1769      assumption is not strictly correct as evidenced by pr25569.
1770
1771      Returning NULL when iv_analyze_result returns false is safe and
1772      avoids the problems in pr25569 until the iv_analyze_* routines
1773      can be fixed, which is apparently hard and time consuming
1774      according to their author.  */
1775   if (! ok)
1776     return NULL;
1777
1778   if (iv.step == const0_rtx
1779       || iv.mode != iv.extend_mode)
1780     return NULL;
1781
1782   /* Record the insn to split.  */
1783   ivts = XNEW (struct iv_to_split);
1784   ivts->insn = insn;
1785   ivts->base_var = NULL_RTX;
1786   ivts->step = iv.step;
1787   ivts->next = NULL;
1788   ivts->n_loc = 1;
1789   ivts->loc[0] = 1;
1790
1791   return ivts;
1792 }
1793
1794 /* Determines which of insns in LOOP can be optimized.
1795    Return a OPT_INFO struct with the relevant hash tables filled
1796    with all insns to be optimized.  The FIRST_NEW_BLOCK field
1797    is undefined for the return value.  */
1798
1799 static struct opt_info *
1800 analyze_insns_in_loop (struct loop *loop)
1801 {
1802   basic_block *body, bb;
1803   unsigned i;
1804   struct opt_info *opt_info = XCNEW (struct opt_info);
1805   rtx insn;
1806   struct iv_to_split *ivts = NULL;
1807   struct var_to_expand *ves = NULL;
1808   PTR *slot1;
1809   PTR *slot2;
1810   VEC (edge, heap) *edges = get_loop_exit_edges (loop);
1811   edge exit;
1812   bool can_apply = false;
1813
1814   iv_analysis_loop_init (loop);
1815
1816   body = get_loop_body (loop);
1817
1818   if (flag_split_ivs_in_unroller)
1819     {
1820       opt_info->insns_to_split = htab_create (5 * loop->num_nodes,
1821                                               si_info_hash, si_info_eq, free);
1822       opt_info->iv_to_split_head = NULL;
1823       opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
1824     }
1825
1826   /* Record the loop exit bb and loop preheader before the unrolling.  */
1827   opt_info->loop_preheader = loop_preheader_edge (loop)->src;
1828
1829   if (VEC_length (edge, edges) == 1)
1830     {
1831       exit = VEC_index (edge, edges, 0);
1832       if (!(exit->flags & EDGE_COMPLEX))
1833         {
1834           opt_info->loop_exit = split_edge (exit);
1835           can_apply = true;
1836         }
1837     }
1838
1839   if (flag_variable_expansion_in_unroller
1840       && can_apply)
1841     {
1842       opt_info->insns_with_var_to_expand = htab_create (5 * loop->num_nodes,
1843                                                         ve_info_hash,
1844                                                         ve_info_eq, free);
1845       opt_info->var_to_expand_head = NULL;
1846       opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
1847     }
1848
1849   for (i = 0; i < loop->num_nodes; i++)
1850     {
1851       bb = body[i];
1852       if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
1853         continue;
1854
1855       FOR_BB_INSNS (bb, insn)
1856       {
1857         if (!INSN_P (insn))
1858           continue;
1859
1860         if (opt_info->insns_to_split)
1861           ivts = analyze_iv_to_split_insn (insn);
1862
1863         if (ivts)
1864           {
1865             slot1 = htab_find_slot (opt_info->insns_to_split, ivts, INSERT);
1866             gcc_assert (*slot1 == NULL);
1867             *slot1 = ivts;
1868             *opt_info->iv_to_split_tail = ivts;
1869             opt_info->iv_to_split_tail = &ivts->next;
1870             continue;
1871           }
1872
1873         if (opt_info->insns_with_var_to_expand)
1874           ves = analyze_insn_to_expand_var (loop, insn);
1875
1876         if (ves)
1877           {
1878             slot2 = htab_find_slot (opt_info->insns_with_var_to_expand, ves, INSERT);
1879             gcc_assert (*slot2 == NULL);
1880             *slot2 = ves;
1881             *opt_info->var_to_expand_tail = ves;
1882             opt_info->var_to_expand_tail = &ves->next;
1883           }
1884       }
1885     }
1886
1887   VEC_free (edge, heap, edges);
1888   free (body);
1889   return opt_info;
1890 }
1891
1892 /* Called just before loop duplication.  Records start of duplicated area
1893    to OPT_INFO.  */
1894
1895 static void
1896 opt_info_start_duplication (struct opt_info *opt_info)
1897 {
1898   if (opt_info)
1899     opt_info->first_new_block = last_basic_block;
1900 }
1901
1902 /* Determine the number of iterations between initialization of the base
1903    variable and the current copy (N_COPY).  N_COPIES is the total number
1904    of newly created copies.  UNROLLING is true if we are unrolling
1905    (not peeling) the loop.  */
1906
1907 static unsigned
1908 determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
1909 {
1910   if (unrolling)
1911     {
1912       /* If we are unrolling, initialization is done in the original loop
1913          body (number 0).  */
1914       return n_copy;
1915     }
1916   else
1917     {
1918       /* If we are peeling, the copy in that the initialization occurs has
1919          number 1.  The original loop (number 0) is the last.  */
1920       if (n_copy)
1921         return n_copy - 1;
1922       else
1923         return n_copies;
1924     }
1925 }
1926
1927 /* Locate in EXPR the expression corresponding to the location recorded
1928    in IVTS, and return a pointer to the RTX for this location.  */
1929
1930 static rtx *
1931 get_ivts_expr (rtx expr, struct iv_to_split *ivts)
1932 {
1933   unsigned i;
1934   rtx *ret = &expr;
1935
1936   for (i = 0; i < ivts->n_loc; i++)
1937     ret = &XEXP (*ret, ivts->loc[i]);
1938
1939   return ret;
1940 }
1941
1942 /* Allocate basic variable for the induction variable chain.  */
1943
1944 static void
1945 allocate_basic_variable (struct iv_to_split *ivts)
1946 {
1947   rtx expr = *get_ivts_expr (single_set (ivts->insn), ivts);
1948
1949   ivts->base_var = gen_reg_rtx (GET_MODE (expr));
1950 }
1951
1952 /* Insert initialization of basic variable of IVTS before INSN, taking
1953    the initial value from INSN.  */
1954
1955 static void
1956 insert_base_initialization (struct iv_to_split *ivts, rtx insn)
1957 {
1958   rtx expr = copy_rtx (*get_ivts_expr (single_set (insn), ivts));
1959   rtx seq;
1960
1961   start_sequence ();
1962   expr = force_operand (expr, ivts->base_var);
1963   if (expr != ivts->base_var)
1964     emit_move_insn (ivts->base_var, expr);
1965   seq = get_insns ();
1966   end_sequence ();
1967
1968   emit_insn_before (seq, insn);
1969 }
1970
1971 /* Replace the use of induction variable described in IVTS in INSN
1972    by base variable + DELTA * step.  */
1973
1974 static void
1975 split_iv (struct iv_to_split *ivts, rtx insn, unsigned delta)
1976 {
1977   rtx expr, *loc, seq, incr, var;
1978   enum machine_mode mode = GET_MODE (ivts->base_var);
1979   rtx src, dest, set;
1980
1981   /* Construct base + DELTA * step.  */
1982   if (!delta)
1983     expr = ivts->base_var;
1984   else
1985     {
1986       incr = simplify_gen_binary (MULT, mode,
1987                                   ivts->step, gen_int_mode (delta, mode));
1988       expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
1989                                   ivts->base_var, incr);
1990     }
1991
1992   /* Figure out where to do the replacement.  */
1993   loc = get_ivts_expr (single_set (insn), ivts);
1994
1995   /* If we can make the replacement right away, we're done.  */
1996   if (validate_change (insn, loc, expr, 0))
1997     return;
1998
1999   /* Otherwise, force EXPR into a register and try again.  */
2000   start_sequence ();
2001   var = gen_reg_rtx (mode);
2002   expr = force_operand (expr, var);
2003   if (expr != var)
2004     emit_move_insn (var, expr);
2005   seq = get_insns ();
2006   end_sequence ();
2007   emit_insn_before (seq, insn);
2008
2009   if (validate_change (insn, loc, var, 0))
2010     return;
2011
2012   /* The last chance.  Try recreating the assignment in insn
2013      completely from scratch.  */
2014   set = single_set (insn);
2015   gcc_assert (set);
2016
2017   start_sequence ();
2018   *loc = var;
2019   src = copy_rtx (SET_SRC (set));
2020   dest = copy_rtx (SET_DEST (set));
2021   src = force_operand (src, dest);
2022   if (src != dest)
2023     emit_move_insn (dest, src);
2024   seq = get_insns ();
2025   end_sequence ();
2026
2027   emit_insn_before (seq, insn);
2028   delete_insn (insn);
2029 }
2030
2031
2032 /* Return one expansion of the accumulator recorded in struct VE.  */
2033
2034 static rtx
2035 get_expansion (struct var_to_expand *ve)
2036 {
2037   rtx reg;
2038
2039   if (ve->reuse_expansion == 0)
2040     reg = ve->reg;
2041   else
2042     reg = VEC_index (rtx, ve->var_expansions, ve->reuse_expansion - 1);
2043
2044   if (VEC_length (rtx, ve->var_expansions) == (unsigned) ve->reuse_expansion)
2045     ve->reuse_expansion = 0;
2046   else
2047     ve->reuse_expansion++;
2048
2049   return reg;
2050 }
2051
2052
2053 /* Given INSN replace the uses of the accumulator recorded in VE
2054    with a new register.  */
2055
2056 static void
2057 expand_var_during_unrolling (struct var_to_expand *ve, rtx insn)
2058 {
2059   rtx new_reg, set;
2060   bool really_new_expansion = false;
2061
2062   set = single_set (insn);
2063   gcc_assert (set);
2064
2065   /* Generate a new register only if the expansion limit has not been
2066      reached.  Else reuse an already existing expansion.  */
2067   if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
2068     {
2069       really_new_expansion = true;
2070       new_reg = gen_reg_rtx (GET_MODE (ve->reg));
2071     }
2072   else
2073     new_reg = get_expansion (ve);
2074
2075   validate_change (insn, &SET_DEST (set), new_reg, 1);
2076   validate_change (insn, &XEXP (SET_SRC (set), ve->accum_pos), new_reg, 1);
2077
2078   if (apply_change_group ())
2079     if (really_new_expansion)
2080       {
2081         VEC_safe_push (rtx, heap, ve->var_expansions, new_reg);
2082         ve->expansion_count++;
2083       }
2084 }
2085
2086 /* Initialize the variable expansions in loop preheader.  PLACE is the
2087    loop-preheader basic block where the initialization of the
2088    expansions should take place.  The expansions are initialized with
2089    (-0) when the operation is plus or minus to honor sign zero.  This
2090    way we can prevent cases where the sign of the final result is
2091    effected by the sign of the expansion.  Here is an example to
2092    demonstrate this:
2093
2094    for (i = 0 ; i < n; i++)
2095      sum += something;
2096
2097    ==>
2098
2099    sum += something
2100    ....
2101    i = i+1;
2102    sum1 += something
2103    ....
2104    i = i+1
2105    sum2 += something;
2106    ....
2107
2108    When SUM is initialized with -zero and SOMETHING is also -zero; the
2109    final result of sum should be -zero thus the expansions sum1 and sum2
2110    should be initialized with -zero as well (otherwise we will get +zero
2111    as the final result).  */
2112
2113 static void
2114 insert_var_expansion_initialization (struct var_to_expand *ve,
2115                                      basic_block place)
2116 {
2117   rtx seq, var, zero_init, insn;
2118   unsigned i;
2119   enum machine_mode mode = GET_MODE (ve->reg);
2120   bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);
2121
2122   if (VEC_length (rtx, ve->var_expansions) == 0)
2123     return;
2124
2125   start_sequence ();
2126   if (ve->op == PLUS || ve->op == MINUS)
2127     for (i = 0; VEC_iterate (rtx, ve->var_expansions, i, var); i++)
2128       {
2129         if (honor_signed_zero_p)
2130           zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
2131         else
2132           zero_init = CONST0_RTX (mode);
2133
2134         emit_move_insn (var, zero_init);
2135       }
2136   else if (ve->op == MULT)
2137     for (i = 0; VEC_iterate (rtx, ve->var_expansions, i, var); i++)
2138       {
2139         zero_init =  CONST1_RTX (GET_MODE (var));
2140         emit_move_insn (var, zero_init);
2141       }
2142
2143   seq = get_insns ();
2144   end_sequence ();
2145
2146   insn = BB_HEAD (place);
2147   while (!NOTE_INSN_BASIC_BLOCK_P (insn))
2148     insn = NEXT_INSN (insn);
2149
2150   emit_insn_after (seq, insn);
2151 }
2152
2153 /* Combine the variable expansions at the loop exit.  PLACE is the
2154    loop exit basic block where the summation of the expansions should
2155    take place.  */
2156
2157 static void
2158 combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
2159 {
2160   rtx sum = ve->reg;
2161   rtx expr, seq, var, insn;
2162   unsigned i;
2163
2164   if (VEC_length (rtx, ve->var_expansions) == 0)
2165     return;
2166
2167   start_sequence ();
2168   if (ve->op == PLUS || ve->op == MINUS)
2169     for (i = 0; VEC_iterate (rtx, ve->var_expansions, i, var); i++)
2170       {
2171         sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg),
2172                                    var, sum);
2173       }
2174   else if (ve->op == MULT)
2175     for (i = 0; VEC_iterate (rtx, ve->var_expansions, i, var); i++)
2176       {
2177         sum = simplify_gen_binary (MULT, GET_MODE (ve->reg),
2178                                    var, sum);
2179       }
2180
2181   expr = force_operand (sum, ve->reg);
2182   if (expr != ve->reg)
2183     emit_move_insn (ve->reg, expr);
2184   seq = get_insns ();
2185   end_sequence ();
2186
2187   insn = BB_HEAD (place);
2188   while (!NOTE_INSN_BASIC_BLOCK_P (insn))
2189     insn = NEXT_INSN (insn);
2190
2191   emit_insn_after (seq, insn);
2192 }
2193
2194 /* Apply loop optimizations in loop copies using the
2195    data which gathered during the unrolling.  Structure
2196    OPT_INFO record that data.
2197
2198    UNROLLING is true if we unrolled (not peeled) the loop.
2199    REWRITE_ORIGINAL_BODY is true if we should also rewrite the original body of
2200    the loop (as it should happen in complete unrolling, but not in ordinary
2201    peeling of the loop).  */
2202
2203 static void
2204 apply_opt_in_copies (struct opt_info *opt_info,
2205                      unsigned n_copies, bool unrolling,
2206                      bool rewrite_original_loop)
2207 {
2208   unsigned i, delta;
2209   basic_block bb, orig_bb;
2210   rtx insn, orig_insn, next;
2211   struct iv_to_split ivts_templ, *ivts;
2212   struct var_to_expand ve_templ, *ves;
2213
2214   /* Sanity check -- we need to put initialization in the original loop
2215      body.  */
2216   gcc_assert (!unrolling || rewrite_original_loop);
2217
2218   /* Allocate the basic variables (i0).  */
2219   if (opt_info->insns_to_split)
2220     for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2221       allocate_basic_variable (ivts);
2222
2223   for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2224     {
2225       bb = BASIC_BLOCK (i);
2226       orig_bb = get_bb_original (bb);
2227
2228       /* bb->aux holds position in copy sequence initialized by
2229          duplicate_loop_to_header_edge.  */
2230       delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
2231                                         unrolling);
2232       bb->aux = 0;
2233       orig_insn = BB_HEAD (orig_bb);
2234       for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb)); insn = next)
2235         {
2236           next = NEXT_INSN (insn);
2237           if (!INSN_P (insn))
2238             continue;
2239
2240           while (!INSN_P (orig_insn))
2241             orig_insn = NEXT_INSN (orig_insn);
2242
2243           ivts_templ.insn = orig_insn;
2244           ve_templ.insn = orig_insn;
2245
2246           /* Apply splitting iv optimization.  */
2247           if (opt_info->insns_to_split)
2248             {
2249               ivts = (struct iv_to_split *)
2250                 htab_find (opt_info->insns_to_split, &ivts_templ);
2251
2252               if (ivts)
2253                 {
2254                   gcc_assert (GET_CODE (PATTERN (insn))
2255                               == GET_CODE (PATTERN (orig_insn)));
2256
2257                   if (!delta)
2258                     insert_base_initialization (ivts, insn);
2259                   split_iv (ivts, insn, delta);
2260                 }
2261             }
2262           /* Apply variable expansion optimization.  */
2263           if (unrolling && opt_info->insns_with_var_to_expand)
2264             {
2265               ves = (struct var_to_expand *)
2266                 htab_find (opt_info->insns_with_var_to_expand, &ve_templ);
2267               if (ves)
2268                 {
2269                   gcc_assert (GET_CODE (PATTERN (insn))
2270                               == GET_CODE (PATTERN (orig_insn)));
2271                   expand_var_during_unrolling (ves, insn);
2272                 }
2273             }
2274           orig_insn = NEXT_INSN (orig_insn);
2275         }
2276     }
2277
2278   if (!rewrite_original_loop)
2279     return;
2280
2281   /* Initialize the variable expansions in the loop preheader
2282      and take care of combining them at the loop exit.  */
2283   if (opt_info->insns_with_var_to_expand)
2284     {
2285       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2286         insert_var_expansion_initialization (ves, opt_info->loop_preheader);
2287       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2288         combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
2289     }
2290
2291   /* Rewrite also the original loop body.  Find them as originals of the blocks
2292      in the last copied iteration, i.e. those that have
2293      get_bb_copy (get_bb_original (bb)) == bb.  */
2294   for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2295     {
2296       bb = BASIC_BLOCK (i);
2297       orig_bb = get_bb_original (bb);
2298       if (get_bb_copy (orig_bb) != bb)
2299         continue;
2300
2301       delta = determine_split_iv_delta (0, n_copies, unrolling);
2302       for (orig_insn = BB_HEAD (orig_bb);
2303            orig_insn != NEXT_INSN (BB_END (bb));
2304            orig_insn = next)
2305         {
2306           next = NEXT_INSN (orig_insn);
2307
2308           if (!INSN_P (orig_insn))
2309             continue;
2310
2311           ivts_templ.insn = orig_insn;
2312           if (opt_info->insns_to_split)
2313             {
2314               ivts = (struct iv_to_split *)
2315                 htab_find (opt_info->insns_to_split, &ivts_templ);
2316               if (ivts)
2317                 {
2318                   if (!delta)
2319                     insert_base_initialization (ivts, orig_insn);
2320                   split_iv (ivts, orig_insn, delta);
2321                   continue;
2322                 }
2323             }
2324
2325         }
2326     }
2327 }
2328
2329 /* Release OPT_INFO.  */
2330
2331 static void
2332 free_opt_info (struct opt_info *opt_info)
2333 {
2334   if (opt_info->insns_to_split)
2335     htab_delete (opt_info->insns_to_split);
2336   if (opt_info->insns_with_var_to_expand)
2337     {
2338       struct var_to_expand *ves;
2339
2340       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2341         VEC_free (rtx, heap, ves->var_expansions);
2342       htab_delete (opt_info->insns_with_var_to_expand);
2343     }
2344   free (opt_info);
2345 }