gcc/config/i386/i386.c

   1 /* Subroutines used for code generation on IA-32.
   2    Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
   3    2002, 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
   4
   5 This file is part of GCC.
   6
   7 GCC is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU General Public License as published by
   9 the Free Software Foundation; either version 2, or (at your option)
  10 any later version.
  11
  12 GCC is distributed in the hope that it will be useful,
  13 but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15 GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License
  18 along with GCC; see the file COPYING.  If not, write to
  19 the Free Software Foundation, 51 Franklin Street, Fifth Floor,
  20 Boston, MA 02110-1301, USA.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "tm.h"
  26 #include "rtl.h"
  27 #include "tree.h"
  28 #include "tm_p.h"
  29 #include "regs.h"
  30 #include "hard-reg-set.h"
  31 #include "real.h"
  32 #include "insn-config.h"
  33 #include "conditions.h"
  34 #include "output.h"
  35 #include "insn-codes.h"
  36 #include "insn-attr.h"
  37 #include "flags.h"
  38 #include "except.h"
  39 #include "function.h"
  40 #include "recog.h"
  41 #include "expr.h"
  42 #include "optabs.h"
  43 #include "toplev.h"
  44 #include "basic-block.h"
  45 #include "ggc.h"
  46 #include "target.h"
  47 #include "target-def.h"
  48 #include "langhooks.h"
  49 #include "cgraph.h"
  50 #include "tree-gimple.h"
  51 #include "dwarf2.h"
  52 #include "tm-constrs.h"
  53 #include "params.h"
  54
  55 #ifndef CHECK_STACK_LIMIT
  56 #define CHECK_STACK_LIMIT (-1)    /* fallback used only when not already defined */
  57 #endif /* !CHECK_STACK_LIMIT */
  58
  59 /* Return index of MODE in the mult and division cost tables; 4 is "other".  */
  60 #define MODE_INDEX(mode)                                        \
  61   ((mode) == QImode ? 0                                         \
  62    : (mode) == HImode ? 1                                       \
  63    : (mode) == SImode ? 2                                       \
  64    : (mode) == DImode ? 3                                       \
  65    : 4)
  66
  67 /* Processor costs (relative to an add).  */
  68 /* Byte costs: we assume COSTS_N_INSNS is defined as (N)*4 and an add is 2 bytes.  */
  69 #define COSTS_N_BYTES(N) ((N) * 2)
  70
  71 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}  /* filler strategy entry naming only libcall */
  72
  73 static const
  74 struct processor_costs size_cost = {    /* costs used when tuning for size */
  75   COSTS_N_BYTES (2),                    /* cost of an add instruction */
  76   COSTS_N_BYTES (3),                    /* cost of a lea instruction */
  77   COSTS_N_BYTES (2),                    /* variable shift costs */
  78   COSTS_N_BYTES (3),                    /* constant shift costs */
  79   {COSTS_N_BYTES (3),                   /* cost of starting multiply for QI */
  80    COSTS_N_BYTES (3),                   /*                               HI */
  81    COSTS_N_BYTES (3),                   /*                               SI */
  82    COSTS_N_BYTES (3),                   /*                               DI */
  83    COSTS_N_BYTES (5)},                  /*                            other */
  84   0,                                    /* cost of multiply per each bit set */
  85   {COSTS_N_BYTES (3),                   /* cost of a divide/mod for QI */
  86    COSTS_N_BYTES (3),                   /*                          HI */
  87    COSTS_N_BYTES (3),                   /*                          SI */
  88    COSTS_N_BYTES (3),                   /*                          DI */
  89    COSTS_N_BYTES (5)},                  /*                       other */
  90   COSTS_N_BYTES (3),                    /* cost of movsx */
  91   COSTS_N_BYTES (3),                    /* cost of movzx */
  92   0,                                    /* "large" insn */
  93   2,                                    /* MOVE_RATIO */
  94   2,                                    /* cost for loading QImode using movzbl */
  95   {2, 2, 2},                            /* cost of loading integer registers
  96                                            in QImode, HImode and SImode.
  97                                            Relative to reg-reg move (2).  */
  98   {2, 2, 2},                            /* cost of storing integer registers */
  99   2,                                    /* cost of reg,reg fld/fst */
 100   {2, 2, 2},                            /* cost of loading fp registers
 101                                            in SFmode, DFmode and XFmode */
 102   {2, 2, 2},                            /* cost of storing fp registers
 103                                            in SFmode, DFmode and XFmode */
 104   3,                                    /* cost of moving MMX register */
 105   {3, 3},                               /* cost of loading MMX registers
 106                                            in SImode and DImode */
 107   {3, 3},                               /* cost of storing MMX registers
 108                                            in SImode and DImode */
 109   3,                                    /* cost of moving SSE register */
 110   {3, 3, 3},                            /* cost of loading SSE registers
 111                                            in SImode, DImode and TImode */
 112   {3, 3, 3},                            /* cost of storing SSE registers
 113                                            in SImode, DImode and TImode */
 114   3,                                    /* MMX or SSE register to integer */
 115   0,                                    /* size of prefetch block */
 116   0,                                    /* number of parallel prefetches */
 117   2,                                    /* Branch cost */
 118   COSTS_N_BYTES (2),                    /* cost of FADD and FSUB insns.  */
 119   COSTS_N_BYTES (2),                    /* cost of FMUL instruction.  */
 120   COSTS_N_BYTES (2),                    /* cost of FDIV instruction.  */
 121   COSTS_N_BYTES (2),                    /* cost of FABS instruction.  */
 122   COSTS_N_BYTES (2),                    /* cost of FCHS instruction.  */
 123   COSTS_N_BYTES (2),                    /* cost of FSQRT instruction.  */
 124   {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
 125    {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},  /* string-op pair; presumably memcpy -- TODO confirm */
 126   {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
 127    {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}}   /* string-op pair; presumably memset -- TODO confirm */
 128 };
 129
 130 /* Processor costs (relative to an add) */
 131 static const
 132 struct processor_costs i386_cost = {    /* 386 specific costs */
 133   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 134   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 135   COSTS_N_INSNS (3),                    /* variable shift costs */
 136   COSTS_N_INSNS (2),                    /* constant shift costs */
 137   {COSTS_N_INSNS (6),                   /* cost of starting multiply for QI */
 138    COSTS_N_INSNS (6),                   /*                               HI */
 139    COSTS_N_INSNS (6),                   /*                               SI */
 140    COSTS_N_INSNS (6),                   /*                               DI */
 141    COSTS_N_INSNS (6)},                  /*                            other */
 142   COSTS_N_INSNS (1),                    /* cost of multiply per each bit set */
 143   {COSTS_N_INSNS (23),                  /* cost of a divide/mod for QI */
 144    COSTS_N_INSNS (23),                  /*                          HI */
 145    COSTS_N_INSNS (23),                  /*                          SI */
 146    COSTS_N_INSNS (23),                  /*                          DI */
 147    COSTS_N_INSNS (23)},                 /*                       other */
 148   COSTS_N_INSNS (3),                    /* cost of movsx */
 149   COSTS_N_INSNS (2),                    /* cost of movzx */
 150   15,                                   /* "large" insn */
 151   3,                                    /* MOVE_RATIO */
 152   4,                                    /* cost for loading QImode using movzbl */
 153   {2, 4, 2},                            /* cost of loading integer registers
 154                                            in QImode, HImode and SImode.
 155                                            Relative to reg-reg move (2).  */
 156   {2, 4, 2},                            /* cost of storing integer registers */
 157   2,                                    /* cost of reg,reg fld/fst */
 158   {8, 8, 8},                            /* cost of loading fp registers
 159                                            in SFmode, DFmode and XFmode */
 160   {8, 8, 8},                            /* cost of storing fp registers
 161                                            in SFmode, DFmode and XFmode */
 162   2,                                    /* cost of moving MMX register */
 163   {4, 8},                               /* cost of loading MMX registers
 164                                            in SImode and DImode */
 165   {4, 8},                               /* cost of storing MMX registers
 166                                            in SImode and DImode */
 167   2,                                    /* cost of moving SSE register */
 168   {4, 8, 16},                           /* cost of loading SSE registers
 169                                            in SImode, DImode and TImode */
 170   {4, 8, 16},                           /* cost of storing SSE registers
 171                                            in SImode, DImode and TImode */
 172   3,                                    /* MMX or SSE register to integer */
 173   0,                                    /* size of prefetch block */
 174   0,                                    /* number of parallel prefetches */
 175   1,                                    /* Branch cost */
 176   COSTS_N_INSNS (23),                   /* cost of FADD and FSUB insns.  */
 177   COSTS_N_INSNS (27),                   /* cost of FMUL instruction.  */
 178   COSTS_N_INSNS (88),                   /* cost of FDIV instruction.  */
 179   COSTS_N_INSNS (22),                   /* cost of FABS instruction.  */
 180   COSTS_N_INSNS (24),                   /* cost of FCHS instruction.  */
 181   COSTS_N_INSNS (122),                  /* cost of FSQRT instruction.  */
 182   {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
 183    DUMMY_STRINGOP_ALGS},                /* string-op pair; presumably memcpy -- TODO confirm */
 184   {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
 185    DUMMY_STRINGOP_ALGS},                /* string-op pair; presumably memset -- TODO confirm */
 186 };
 187
 188 static const
 189 struct processor_costs i486_cost = {    /* 486 specific costs */
 190   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 191   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 192   COSTS_N_INSNS (3),                    /* variable shift costs */
 193   COSTS_N_INSNS (2),                    /* constant shift costs */
 194   {COSTS_N_INSNS (12),                  /* cost of starting multiply for QI */
 195    COSTS_N_INSNS (12),                  /*                               HI */
 196    COSTS_N_INSNS (12),                  /*                               SI */
 197    COSTS_N_INSNS (12),                  /*                               DI */
 198    COSTS_N_INSNS (12)},                 /*                            other */
 199   1,                                    /* cost of multiply per each bit set; NOTE(review): bare 1, i386_cost uses COSTS_N_INSNS (1) -- confirm */
 200   {COSTS_N_INSNS (40),                  /* cost of a divide/mod for QI */
 201    COSTS_N_INSNS (40),                  /*                          HI */
 202    COSTS_N_INSNS (40),                  /*                          SI */
 203    COSTS_N_INSNS (40),                  /*                          DI */
 204    COSTS_N_INSNS (40)},                 /*                       other */
 205   COSTS_N_INSNS (3),                    /* cost of movsx */
 206   COSTS_N_INSNS (2),                    /* cost of movzx */
 207   15,                                   /* "large" insn */
 208   3,                                    /* MOVE_RATIO */
 209   4,                                    /* cost for loading QImode using movzbl */
 210   {2, 4, 2},                            /* cost of loading integer registers
 211                                            in QImode, HImode and SImode.
 212                                            Relative to reg-reg move (2).  */
 213   {2, 4, 2},                            /* cost of storing integer registers */
 214   2,                                    /* cost of reg,reg fld/fst */
 215   {8, 8, 8},                            /* cost of loading fp registers
 216                                            in SFmode, DFmode and XFmode */
 217   {8, 8, 8},                            /* cost of storing fp registers
 218                                            in SFmode, DFmode and XFmode */
 219   2,                                    /* cost of moving MMX register */
 220   {4, 8},                               /* cost of loading MMX registers
 221                                            in SImode and DImode */
 222   {4, 8},                               /* cost of storing MMX registers
 223                                            in SImode and DImode */
 224   2,                                    /* cost of moving SSE register */
 225   {4, 8, 16},                           /* cost of loading SSE registers
 226                                            in SImode, DImode and TImode */
 227   {4, 8, 16},                           /* cost of storing SSE registers
 228                                            in SImode, DImode and TImode */
 229   3,                                    /* MMX or SSE register to integer */
 230   0,                                    /* size of prefetch block */
 231   0,                                    /* number of parallel prefetches */
 232   1,                                    /* Branch cost */
 233   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
 234   COSTS_N_INSNS (16),                   /* cost of FMUL instruction.  */
 235   COSTS_N_INSNS (73),                   /* cost of FDIV instruction.  */
 236   COSTS_N_INSNS (3),                    /* cost of FABS instruction.  */
 237   COSTS_N_INSNS (3),                    /* cost of FCHS instruction.  */
 238   COSTS_N_INSNS (83),                   /* cost of FSQRT instruction.  */
 239   {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
 240    DUMMY_STRINGOP_ALGS},                /* string-op pair; presumably memcpy -- TODO confirm */
 241   {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
 242    DUMMY_STRINGOP_ALGS}                 /* string-op pair; presumably memset -- TODO confirm */
 243 };
 244
 245 static const
 246 struct processor_costs pentium_cost = { /* Pentium specific costs */
 247   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 248   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 249   COSTS_N_INSNS (4),                    /* variable shift costs */
 250   COSTS_N_INSNS (1),                    /* constant shift costs */
 251   {COSTS_N_INSNS (11),                  /* cost of starting multiply for QI */
 252    COSTS_N_INSNS (11),                  /*                               HI */
 253    COSTS_N_INSNS (11),                  /*                               SI */
 254    COSTS_N_INSNS (11),                  /*                               DI */
 255    COSTS_N_INSNS (11)},                 /*                            other */
 256   0,                                    /* cost of multiply per each bit set */
 257   {COSTS_N_INSNS (25),                  /* cost of a divide/mod for QI */
 258    COSTS_N_INSNS (25),                  /*                          HI */
 259    COSTS_N_INSNS (25),                  /*                          SI */
 260    COSTS_N_INSNS (25),                  /*                          DI */
 261    COSTS_N_INSNS (25)},                 /*                       other */
 262   COSTS_N_INSNS (3),                    /* cost of movsx */
 263   COSTS_N_INSNS (2),                    /* cost of movzx */
 264   8,                                    /* "large" insn */
 265   6,                                    /* MOVE_RATIO */
 266   6,                                    /* cost for loading QImode using movzbl */
 267   {2, 4, 2},                            /* cost of loading integer registers
 268                                            in QImode, HImode and SImode.
 269                                            Relative to reg-reg move (2).  */
 270   {2, 4, 2},                            /* cost of storing integer registers */
 271   2,                                    /* cost of reg,reg fld/fst */
 272   {2, 2, 6},                            /* cost of loading fp registers
 273                                            in SFmode, DFmode and XFmode */
 274   {4, 4, 6},                            /* cost of storing fp registers
 275                                            in SFmode, DFmode and XFmode */
 276   8,                                    /* cost of moving MMX register */
 277   {8, 8},                               /* cost of loading MMX registers
 278                                            in SImode and DImode */
 279   {8, 8},                               /* cost of storing MMX registers
 280                                            in SImode and DImode */
 281   2,                                    /* cost of moving SSE register */
 282   {4, 8, 16},                           /* cost of loading SSE registers
 283                                            in SImode, DImode and TImode */
 284   {4, 8, 16},                           /* cost of storing SSE registers
 285                                            in SImode, DImode and TImode */
 286   3,                                    /* MMX or SSE register to integer */
 287   0,                                    /* size of prefetch block */
 288   0,                                    /* number of parallel prefetches */
 289   2,                                    /* Branch cost */
 290   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
 291   COSTS_N_INSNS (3),                    /* cost of FMUL instruction.  */
 292   COSTS_N_INSNS (39),                   /* cost of FDIV instruction.  */
 293   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
 294   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
 295   COSTS_N_INSNS (70),                   /* cost of FSQRT instruction.  */
 296   {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
 297    DUMMY_STRINGOP_ALGS},                /* string-op pair; presumably memcpy -- TODO confirm */
 298   {{libcall, {{-1, rep_prefix_4_byte}}},
 299    DUMMY_STRINGOP_ALGS}                 /* string-op pair; presumably memset -- TODO confirm */
 300 };
 301
 302 static const
 303 struct processor_costs pentiumpro_cost = { /* PentiumPro specific costs */
 304   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 305   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 306   COSTS_N_INSNS (1),                    /* variable shift costs */
 307   COSTS_N_INSNS (1),                    /* constant shift costs */
 308   {COSTS_N_INSNS (4),                   /* cost of starting multiply for QI */
 309    COSTS_N_INSNS (4),                   /*                               HI */
 310    COSTS_N_INSNS (4),                   /*                               SI */
 311    COSTS_N_INSNS (4),                   /*                               DI */
 312    COSTS_N_INSNS (4)},                  /*                            other */
 313   0,                                    /* cost of multiply per each bit set */
 314   {COSTS_N_INSNS (17),                  /* cost of a divide/mod for QI */
 315    COSTS_N_INSNS (17),                  /*                          HI */
 316    COSTS_N_INSNS (17),                  /*                          SI */
 317    COSTS_N_INSNS (17),                  /*                          DI */
 318    COSTS_N_INSNS (17)},                 /*                       other */
 319   COSTS_N_INSNS (1),                    /* cost of movsx */
 320   COSTS_N_INSNS (1),                    /* cost of movzx */
 321   8,                                    /* "large" insn */
 322   6,                                    /* MOVE_RATIO */
 323   2,                                    /* cost for loading QImode using movzbl */
 324   {4, 4, 4},                            /* cost of loading integer registers
 325                                            in QImode, HImode and SImode.
 326                                            Relative to reg-reg move (2).  */
 327   {2, 2, 2},                            /* cost of storing integer registers */
 328   2,                                    /* cost of reg,reg fld/fst */
 329   {2, 2, 6},                            /* cost of loading fp registers
 330                                            in SFmode, DFmode and XFmode */
 331   {4, 4, 6},                            /* cost of storing fp registers
 332                                            in SFmode, DFmode and XFmode */
 333   2,                                    /* cost of moving MMX register */
 334   {2, 2},                               /* cost of loading MMX registers
 335                                            in SImode and DImode */
 336   {2, 2},                               /* cost of storing MMX registers
 337                                            in SImode and DImode */
 338   2,                                    /* cost of moving SSE register */
 339   {2, 2, 8},                            /* cost of loading SSE registers
 340                                            in SImode, DImode and TImode */
 341   {2, 2, 8},                            /* cost of storing SSE registers
 342                                            in SImode, DImode and TImode */
 343   3,                                    /* MMX or SSE register to integer */
 344   32,                                   /* size of prefetch block */
 345   6,                                    /* number of parallel prefetches */
 346   2,                                    /* Branch cost */
 347   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
 348   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
 349   COSTS_N_INSNS (56),                   /* cost of FDIV instruction.  */
 350   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 351   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 352   COSTS_N_INSNS (56),                   /* cost of FSQRT instruction.  */
 353   /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
 354      (we ensure the alignment).  For small blocks an inline loop is still a
 355      noticeable win; for bigger blocks either rep movsl or rep movsb is the
 356      way to go.  Rep movsb apparently has a more expensive startup time in
 357      the CPU, but after 4K the difference is down in the noise.  */
 358   {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
 359                         {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
 360    DUMMY_STRINGOP_ALGS},                /* string-op pair; presumably memcpy -- TODO confirm */
 361   {{rep_prefix_4_byte, {{1024, unrolled_loop},
 362                         {8192, rep_prefix_4_byte}, {-1, libcall}}},
 363    DUMMY_STRINGOP_ALGS}                 /* string-op pair; presumably memset -- TODO confirm */
 364 };
 365
 366 static const
 367 struct processor_costs geode_cost = {   /* Geode specific costs */
 368   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 369   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 370   COSTS_N_INSNS (2),                    /* variable shift costs */
 371   COSTS_N_INSNS (1),                    /* constant shift costs */
 372   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
 373    COSTS_N_INSNS (4),                   /*                               HI */
 374    COSTS_N_INSNS (7),                   /*                               SI */
 375    COSTS_N_INSNS (7),                   /*                               DI */
 376    COSTS_N_INSNS (7)},                  /*                            other */
 377   0,                                    /* cost of multiply per each bit set */
 378   {COSTS_N_INSNS (15),                  /* cost of a divide/mod for QI */
 379    COSTS_N_INSNS (23),                  /*                          HI */
 380    COSTS_N_INSNS (39),                  /*                          SI */
 381    COSTS_N_INSNS (39),                  /*                          DI */
 382    COSTS_N_INSNS (39)},                 /*                       other */
 383   COSTS_N_INSNS (1),                    /* cost of movsx */
 384   COSTS_N_INSNS (1),                    /* cost of movzx */
 385   8,                                    /* "large" insn */
 386   4,                                    /* MOVE_RATIO */
 387   1,                                    /* cost for loading QImode using movzbl */
 388   {1, 1, 1},                            /* cost of loading integer registers
 389                                            in QImode, HImode and SImode.
 390                                            Relative to reg-reg move (2).  */
 391   {1, 1, 1},                            /* cost of storing integer registers */
 392   1,                                    /* cost of reg,reg fld/fst */
 393   {1, 1, 1},                            /* cost of loading fp registers
 394                                            in SFmode, DFmode and XFmode */
 395   {4, 6, 6},                            /* cost of storing fp registers
 396                                            in SFmode, DFmode and XFmode */
 397
 398   1,                                    /* cost of moving MMX register */
 399   {1, 1},                               /* cost of loading MMX registers
 400                                            in SImode and DImode */
 401   {1, 1},                               /* cost of storing MMX registers
 402                                            in SImode and DImode */
 403   1,                                    /* cost of moving SSE register */
 404   {1, 1, 1},                            /* cost of loading SSE registers
 405                                            in SImode, DImode and TImode */
 406   {1, 1, 1},                            /* cost of storing SSE registers
 407                                            in SImode, DImode and TImode */
 408   1,                                    /* MMX or SSE register to integer */
 409   32,                                   /* size of prefetch block */
 410   1,                                    /* number of parallel prefetches */
 411   1,                                    /* Branch cost */
 412   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
 413   COSTS_N_INSNS (11),                   /* cost of FMUL instruction.  */
 414   COSTS_N_INSNS (47),                   /* cost of FDIV instruction.  */
 415   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
 416   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
 417   COSTS_N_INSNS (54),                   /* cost of FSQRT instruction.  */
 418   {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
 419    DUMMY_STRINGOP_ALGS},                /* string-op pair; presumably memcpy -- TODO confirm */
 420   {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
 421    DUMMY_STRINGOP_ALGS}                 /* string-op pair; presumably memset -- TODO confirm */
 422 };
 423
 424 static const
 425 struct processor_costs k6_cost = {      /* K6 specific costs */
 426   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 427   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
 428   COSTS_N_INSNS (1),                    /* variable shift costs */
 429   COSTS_N_INSNS (1),                    /* constant shift costs */
 430   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
 431    COSTS_N_INSNS (3),                   /*                               HI */
 432    COSTS_N_INSNS (3),                   /*                               SI */
 433    COSTS_N_INSNS (3),                   /*                               DI */
 434    COSTS_N_INSNS (3)},                  /*                            other */
 435   0,                                    /* cost of multiply per each bit set */
 436   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
 437    COSTS_N_INSNS (18),                  /*                          HI */
 438    COSTS_N_INSNS (18),                  /*                          SI */
 439    COSTS_N_INSNS (18),                  /*                          DI */
 440    COSTS_N_INSNS (18)},                 /*                       other */
 441   COSTS_N_INSNS (2),                    /* cost of movsx */
 442   COSTS_N_INSNS (2),                    /* cost of movzx */
 443   8,                                    /* "large" insn */
 444   4,                                    /* MOVE_RATIO */
 445   3,                                    /* cost for loading QImode using movzbl */
 446   {4, 5, 4},                            /* cost of loading integer registers
 447                                            in QImode, HImode and SImode.
 448                                            Relative to reg-reg move (2).  */
 449   {2, 3, 2},                            /* cost of storing integer registers */
 450   4,                                    /* cost of reg,reg fld/fst */
 451   {6, 6, 6},                            /* cost of loading fp registers
 452                                            in SFmode, DFmode and XFmode */
 453   {4, 4, 4},                            /* cost of storing fp registers
 454                                            in SFmode, DFmode and XFmode */
 455   2,                                    /* cost of moving MMX register */
 456   {2, 2},                               /* cost of loading MMX registers
 457                                            in SImode and DImode */
 458   {2, 2},                               /* cost of storing MMX registers
 459                                            in SImode and DImode */
 460   2,                                    /* cost of moving SSE register */
 461   {2, 2, 8},                            /* cost of loading SSE registers
 462                                            in SImode, DImode and TImode */
 463   {2, 2, 8},                            /* cost of storing SSE registers
 464                                            in SImode, DImode and TImode */
 465   6,                                    /* MMX or SSE register to integer */
 466   32,                                   /* size of prefetch block */
 467   1,                                    /* number of parallel prefetches */
 468   1,                                    /* Branch cost */
 469   COSTS_N_INSNS (2),                    /* cost of FADD and FSUB insns.  */
 470   COSTS_N_INSNS (2),                    /* cost of FMUL instruction.  */
 471   COSTS_N_INSNS (56),                   /* cost of FDIV instruction.  */
 472   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 473   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 474   COSTS_N_INSNS (56),                   /* cost of FSQRT instruction.  */
 475   {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
 476    DUMMY_STRINGOP_ALGS},                /* string-op pair; presumably memcpy -- TODO confirm */
 477   {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
 478    DUMMY_STRINGOP_ALGS}                 /* string-op pair; presumably memset -- TODO confirm */
 479 };
 480
 481 static const
 482 struct processor_costs athlon_cost = {
       /* Cost table for the Athlon.  This is a positional initializer: every
          entry is identified only by its trailing comment, so the order here
          must match the field order of struct processor_costs (declared
          outside this part of the file).  */
 483   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 484   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
 485   COSTS_N_INSNS (1),                    /* variable shift costs */
 486   COSTS_N_INSNS (1),                    /* constant shift costs */
 487   {COSTS_N_INSNS (5),                   /* cost of starting multiply for QI */
 488    COSTS_N_INSNS (5),                   /*                               HI */
 489    COSTS_N_INSNS (5),                   /*                               SI */
 490    COSTS_N_INSNS (5),                   /*                               DI */
 491    COSTS_N_INSNS (5)},                  /*                               other */
 492   0,                                    /* cost of multiply per each bit set */
 493   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
 494    COSTS_N_INSNS (26),                  /*                          HI */
 495    COSTS_N_INSNS (42),                  /*                          SI */
 496    COSTS_N_INSNS (74),                  /*                          DI */
 497    COSTS_N_INSNS (74)},                 /*                          other */
 498   COSTS_N_INSNS (1),                    /* cost of movsx */
 499   COSTS_N_INSNS (1),                    /* cost of movzx */
 500   8,                                    /* "large" insn */
 501   9,                                    /* MOVE_RATIO */
 502   4,                                    /* cost for loading QImode using movzbl */
 503   {3, 4, 3},                            /* cost of loading integer registers
 504                                            in QImode, HImode and SImode.
 505                                            Relative to reg-reg move (2).  */
 506   {3, 4, 3},                            /* cost of storing integer registers */
 507   4,                                    /* cost of reg,reg fld/fst */
 508   {4, 4, 12},                           /* cost of loading fp registers
 509                                            in SFmode, DFmode and XFmode */
 510   {6, 6, 8},                            /* cost of storing fp registers
 511                                            in SFmode, DFmode and XFmode */
 512   2,                                    /* cost of moving MMX register */
 513   {4, 4},                               /* cost of loading MMX registers
 514                                            in SImode and DImode */
 515   {4, 4},                               /* cost of storing MMX registers
 516                                            in SImode and DImode */
 517   2,                                    /* cost of moving SSE register */
 518   {4, 4, 6},                            /* cost of loading SSE registers
 519                                            in SImode, DImode and TImode */
 520   {4, 4, 5},                            /* cost of storing SSE registers
 521                                            in SImode, DImode and TImode */
 522   5,                                    /* MMX or SSE register to integer */
 523   64,                                   /* size of prefetch block */
 524   6,                                    /* number of parallel prefetches */
 525   5,                                    /* Branch cost */
 526   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
 527   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
 528   COSTS_N_INSNS (24),                   /* cost of FDIV instruction.  */
 529   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 530   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 531   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
 532   /* For some reason, Athlon deals better with REP prefix (relative to loops)
 533      compared to K8. Alignment becomes important after 8 bytes for memcpy and
 534      128 bytes for memset.  */
       /* NOTE(review): the two groups below have no per-field comments.
          Presumably the first group is the memcpy strategy and the second the
          memset strategy, each with a 32-bit and a 64-bit variant, and each
          {N, alg} pair picks alg for blocks of up to N bytes (-1 = no limit);
          confirm against struct processor_costs.  */
 535   {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
 536    DUMMY_STRINGOP_ALGS},
 537   {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
 538    DUMMY_STRINGOP_ALGS}
 539 };
 540
 541 static const
 542 struct processor_costs k8_cost = {
       /* Cost table for the K8.  This is a positional initializer: every entry
          is identified only by its trailing comment, so the order here must
          match the field order of struct processor_costs (declared outside
          this part of the file).  */
 543   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 544   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
 545   COSTS_N_INSNS (1),                    /* variable shift costs */
 546   COSTS_N_INSNS (1),                    /* constant shift costs */
 547   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
 548    COSTS_N_INSNS (4),                   /*                               HI */
 549    COSTS_N_INSNS (3),                   /*                               SI */
 550    COSTS_N_INSNS (4),                   /*                               DI */
 551    COSTS_N_INSNS (5)},                  /*                               other */
 552   0,                                    /* cost of multiply per each bit set */
 553   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
 554    COSTS_N_INSNS (26),                  /*                          HI */
 555    COSTS_N_INSNS (42),                  /*                          SI */
 556    COSTS_N_INSNS (74),                  /*                          DI */
 557    COSTS_N_INSNS (74)},                 /*                          other */
 558   COSTS_N_INSNS (1),                    /* cost of movsx */
 559   COSTS_N_INSNS (1),                    /* cost of movzx */
 560   8,                                    /* "large" insn */
 561   9,                                    /* MOVE_RATIO */
 562   4,                                    /* cost for loading QImode using movzbl */
 563   {3, 4, 3},                            /* cost of loading integer registers
 564                                            in QImode, HImode and SImode.
 565                                            Relative to reg-reg move (2).  */
 566   {3, 4, 3},                            /* cost of storing integer registers */
 567   4,                                    /* cost of reg,reg fld/fst */
 568   {4, 4, 12},                           /* cost of loading fp registers
 569                                            in SFmode, DFmode and XFmode */
 570   {6, 6, 8},                            /* cost of storing fp registers
 571                                            in SFmode, DFmode and XFmode */
 572   2,                                    /* cost of moving MMX register */
 573   {3, 3},                               /* cost of loading MMX registers
 574                                            in SImode and DImode */
 575   {4, 4},                               /* cost of storing MMX registers
 576                                            in SImode and DImode */
 577   2,                                    /* cost of moving SSE register */
 578   {4, 3, 6},                            /* cost of loading SSE registers
 579                                            in SImode, DImode and TImode */
 580   {4, 4, 5},                            /* cost of storing SSE registers
 581                                            in SImode, DImode and TImode */
 582   5,                                    /* MMX or SSE register to integer */
 583   64,                                   /* size of prefetch block */
 584   /* New AMD processors never drop prefetches; if they cannot be performed
 585      immediately, they are queued.  We set number of simultaneous prefetches
 586      to a large constant to reflect this (it probably is not a good idea not
 587      to limit number of prefetches at all, as their execution also takes some
 588      time).  */
 589   100,                                  /* number of parallel prefetches */
 590   5,                                    /* Branch cost */
 591   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
 592   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
 593   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
 594   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 595   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 596   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
 597   /* K8 has optimized REP instruction for medium sized blocks, but for very small
 598      blocks it is better to use loop. For large blocks, libcall can do
 599      non-temporal accesses and beat inline considerably.  */
       /* NOTE(review): the two groups below have no per-field comments.
          Presumably the first group is the memcpy strategy and the second the
          memset strategy, each with a 32-bit and a 64-bit variant, and each
          {N, alg} pair picks alg for blocks of up to N bytes (-1 = no limit);
          confirm against struct processor_costs.  */
 600   {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
 601    {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
 602   {{libcall, {{8, loop}, {24, unrolled_loop},
 603               {2048, rep_prefix_4_byte}, {-1, libcall}}},
 604    {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}
 605 };
 606
 607 struct processor_costs amdfam10_cost = {
       /* Cost table for AMDFAM10.  This is a positional initializer: every
          entry is identified only by its trailing comment, so the order here
          must match the field order of struct processor_costs (declared
          outside this part of the file).

          NOTE(review): the neighbouring cost tables are all declared
          `static const', but this one is declared with neither qualifier.
          Confirm whether the external linkage and mutability are intentional
          before changing the declaration.  */
 608   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 609   COSTS_N_INSNS (2),                    /* cost of a lea instruction */
 610   COSTS_N_INSNS (1),                    /* variable shift costs */
 611   COSTS_N_INSNS (1),                    /* constant shift costs */
 612   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
 613    COSTS_N_INSNS (4),                   /*                               HI */
 614    COSTS_N_INSNS (3),                   /*                               SI */
 615    COSTS_N_INSNS (4),                   /*                               DI */
 616    COSTS_N_INSNS (5)},                  /*                               other */
 617   0,                                    /* cost of multiply per each bit set */
 618   {COSTS_N_INSNS (19),                  /* cost of a divide/mod for QI */
 619    COSTS_N_INSNS (35),                  /*                          HI */
 620    COSTS_N_INSNS (51),                  /*                          SI */
 621    COSTS_N_INSNS (83),                  /*                          DI */
 622    COSTS_N_INSNS (83)},                 /*                          other */
 623   COSTS_N_INSNS (1),                    /* cost of movsx */
 624   COSTS_N_INSNS (1),                    /* cost of movzx */
 625   8,                                    /* "large" insn */
 626   9,                                    /* MOVE_RATIO */
 627   4,                                    /* cost for loading QImode using movzbl */
 628   {3, 4, 3},                            /* cost of loading integer registers
 629                                            in QImode, HImode and SImode.
 630                                            Relative to reg-reg move (2).  */
 631   {3, 4, 3},                            /* cost of storing integer registers */
 632   4,                                    /* cost of reg,reg fld/fst */
 633   {4, 4, 12},                           /* cost of loading fp registers
 634                                            in SFmode, DFmode and XFmode */
 635   {6, 6, 8},                            /* cost of storing fp registers
 636                                            in SFmode, DFmode and XFmode */
 637   2,                                    /* cost of moving MMX register */
 638   {3, 3},                               /* cost of loading MMX registers
 639                                            in SImode and DImode */
 640   {4, 4},                               /* cost of storing MMX registers
 641                                            in SImode and DImode */
 642   2,                                    /* cost of moving SSE register */
 643   {4, 4, 3},                            /* cost of loading SSE registers
 644                                            in SImode, DImode and TImode */
 645   {4, 4, 5},                            /* cost of storing SSE registers
 646                                            in SImode, DImode and TImode */
 647   3,                                    /* MMX or SSE register to integer */
 648                                         /* On K8
 649                                             MOVD reg64, xmmreg  Double  FSTORE 4
 650                                             MOVD reg32, xmmreg  Double  FSTORE 4
 651                                            On AMDFAM10
 652                                             MOVD reg64, xmmreg  Double  FADD 3
 653                                                                 1/1  1/1
 654                                             MOVD reg32, xmmreg  Double  FADD 3
 655                                                                 1/1  1/1 */
 656   64,                                   /* size of prefetch block */
 657   /* New AMD processors never drop prefetches; if they cannot be performed
 658      immediately, they are queued.  We set number of simultaneous prefetches
 659      to a large constant to reflect this (it probably is not a good idea not
 660      to limit number of prefetches at all, as their execution also takes some
 661      time).  */
 662   100,                                  /* number of parallel prefetches */
 663   5,                                    /* Branch cost */
 664   COSTS_N_INSNS (4),                    /* cost of FADD and FSUB insns.  */
 665   COSTS_N_INSNS (4),                    /* cost of FMUL instruction.  */
 666   COSTS_N_INSNS (19),                   /* cost of FDIV instruction.  */
 667   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 668   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 669   COSTS_N_INSNS (35),                   /* cost of FSQRT instruction.  */
 670
 671   /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
 672      very small blocks it is better to use loop. For large blocks, libcall can
 673      do non-temporal accesses and beat inline considerably.  */
       /* NOTE(review): the two groups below have no per-field comments.
          Presumably the first group is the memcpy strategy and the second the
          memset strategy, each with a 32-bit and a 64-bit variant, and each
          {N, alg} pair picks alg for blocks of up to N bytes (-1 = no limit);
          confirm against struct processor_costs.  */
 674   {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
 675    {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
 676   {{libcall, {{8, loop}, {24, unrolled_loop},
 677               {2048, rep_prefix_4_byte}, {-1, libcall}}},
 678    {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}
 679 };
 680
 681 static const
 682 struct processor_costs pentium4_cost = {
       /* Cost table for the Pentium 4.  This is a positional initializer:
          every entry is identified only by its trailing comment, so the order
          here must match the field order of struct processor_costs (declared
          outside this part of the file).  */
 683   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 684   COSTS_N_INSNS (3),                    /* cost of a lea instruction */
 685   COSTS_N_INSNS (4),                    /* variable shift costs */
 686   COSTS_N_INSNS (4),                    /* constant shift costs */
 687   {COSTS_N_INSNS (15),                  /* cost of starting multiply for QI */
 688    COSTS_N_INSNS (15),                  /*                               HI */
 689    COSTS_N_INSNS (15),                  /*                               SI */
 690    COSTS_N_INSNS (15),                  /*                               DI */
 691    COSTS_N_INSNS (15)},                 /*                               other */
 692   0,                                    /* cost of multiply per each bit set */
 693   {COSTS_N_INSNS (56),                  /* cost of a divide/mod for QI */
 694    COSTS_N_INSNS (56),                  /*                          HI */
 695    COSTS_N_INSNS (56),                  /*                          SI */
 696    COSTS_N_INSNS (56),                  /*                          DI */
 697    COSTS_N_INSNS (56)},                 /*                          other */
 698   COSTS_N_INSNS (1),                    /* cost of movsx */
 699   COSTS_N_INSNS (1),                    /* cost of movzx */
 700   16,                                   /* "large" insn */
 701   6,                                    /* MOVE_RATIO */
 702   2,                                    /* cost for loading QImode using movzbl */
 703   {4, 5, 4},                            /* cost of loading integer registers
 704                                            in QImode, HImode and SImode.
 705                                            Relative to reg-reg move (2).  */
 706   {2, 3, 2},                            /* cost of storing integer registers */
 707   2,                                    /* cost of reg,reg fld/fst */
 708   {2, 2, 6},                            /* cost of loading fp registers
 709                                            in SFmode, DFmode and XFmode */
 710   {4, 4, 6},                            /* cost of storing fp registers
 711                                            in SFmode, DFmode and XFmode */
 712   2,                                    /* cost of moving MMX register */
 713   {2, 2},                               /* cost of loading MMX registers
 714                                            in SImode and DImode */
 715   {2, 2},                               /* cost of storing MMX registers
 716                                            in SImode and DImode */
 717   12,                                   /* cost of moving SSE register */
 718   {12, 12, 12},                         /* cost of loading SSE registers
 719                                            in SImode, DImode and TImode */
 720   {2, 2, 8},                            /* cost of storing SSE registers
 721                                            in SImode, DImode and TImode */
 722   10,                                   /* MMX or SSE register to integer */
 723   64,                                   /* size of prefetch block */
 724   6,                                    /* number of parallel prefetches */
 725   2,                                    /* Branch cost */
 726   COSTS_N_INSNS (5),                    /* cost of FADD and FSUB insns.  */
 727   COSTS_N_INSNS (7),                    /* cost of FMUL instruction.  */
 728   COSTS_N_INSNS (43),                   /* cost of FDIV instruction.  */
 729   COSTS_N_INSNS (2),                    /* cost of FABS instruction.  */
 730   COSTS_N_INSNS (2),                    /* cost of FCHS instruction.  */
 731   COSTS_N_INSNS (43),                   /* cost of FSQRT instruction.  */
       /* NOTE(review): the two groups below have no per-field comments.
          Presumably the first group is the memcpy strategy and the second the
          memset strategy, each with a 32-bit and a 64-bit variant, and each
          {N, alg} pair picks alg for blocks of up to N bytes (-1 = no limit);
          confirm against struct processor_costs.  The trailing comma after
          the last initializer is harmless.  */
 732   {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
 733    DUMMY_STRINGOP_ALGS},
 734   {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
 735    {-1, libcall}}},
 736    DUMMY_STRINGOP_ALGS},
 737 };
 738
 739 static const
 740 struct processor_costs nocona_cost = {
       /* Cost table for Nocona.  This is a positional initializer: every entry
          is identified only by its trailing comment, so the order here must
          match the field order of struct processor_costs (declared outside
          this part of the file).  */
 741   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 742   COSTS_N_INSNS (1),                    /* cost of a lea instruction */
 743   COSTS_N_INSNS (1),                    /* variable shift costs */
 744   COSTS_N_INSNS (1),                    /* constant shift costs */
 745   {COSTS_N_INSNS (10),                  /* cost of starting multiply for QI */
 746    COSTS_N_INSNS (10),                  /*                               HI */
 747    COSTS_N_INSNS (10),                  /*                               SI */
 748    COSTS_N_INSNS (10),                  /*                               DI */
 749    COSTS_N_INSNS (10)},                 /*                               other */
 750   0,                                    /* cost of multiply per each bit set */
 751   {COSTS_N_INSNS (66),                  /* cost of a divide/mod for QI */
 752    COSTS_N_INSNS (66),                  /*                          HI */
 753    COSTS_N_INSNS (66),                  /*                          SI */
 754    COSTS_N_INSNS (66),                  /*                          DI */
 755    COSTS_N_INSNS (66)},                 /*                          other */
 756   COSTS_N_INSNS (1),                    /* cost of movsx */
 757   COSTS_N_INSNS (1),                    /* cost of movzx */
 758   16,                                   /* "large" insn */
 759   17,                                   /* MOVE_RATIO */
 760   4,                                    /* cost for loading QImode using movzbl */
 761   {4, 4, 4},                            /* cost of loading integer registers
 762                                            in QImode, HImode and SImode.
 763                                            Relative to reg-reg move (2).  */
 764   {4, 4, 4},                            /* cost of storing integer registers */
 765   3,                                    /* cost of reg,reg fld/fst */
 766   {12, 12, 12},                         /* cost of loading fp registers
 767                                            in SFmode, DFmode and XFmode */
 768   {4, 4, 4},                            /* cost of storing fp registers
 769                                            in SFmode, DFmode and XFmode */
 770   6,                                    /* cost of moving MMX register */
 771   {12, 12},                             /* cost of loading MMX registers
 772                                            in SImode and DImode */
 773   {12, 12},                             /* cost of storing MMX registers
 774                                            in SImode and DImode */
 775   6,                                    /* cost of moving SSE register */
 776   {12, 12, 12},                         /* cost of loading SSE registers
 777                                            in SImode, DImode and TImode */
 778   {12, 12, 12},                         /* cost of storing SSE registers
 779                                            in SImode, DImode and TImode */
 780   8,                                    /* MMX or SSE register to integer */
 781   128,                                  /* size of prefetch block */
 782   8,                                    /* number of parallel prefetches */
 783   1,                                    /* Branch cost */
 784   COSTS_N_INSNS (6),                    /* cost of FADD and FSUB insns.  */
 785   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
 786   COSTS_N_INSNS (40),                   /* cost of FDIV instruction.  */
 787   COSTS_N_INSNS (3),                    /* cost of FABS instruction.  */
 788   COSTS_N_INSNS (3),                    /* cost of FCHS instruction.  */
 789   COSTS_N_INSNS (44),                   /* cost of FSQRT instruction.  */
       /* NOTE(review): the two groups below have no per-field comments.
          Presumably the first group is the memcpy strategy and the second the
          memset strategy, each with a 32-bit and a 64-bit variant, and each
          {N, alg} pair picks alg for blocks of up to N bytes (-1 = no limit);
          confirm against struct processor_costs.  */
 790   {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
 791    {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
 792               {100000, unrolled_loop}, {-1, libcall}}}},
 793   {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
 794    {-1, libcall}}},
 795    {libcall, {{24, loop}, {64, unrolled_loop},
 796               {8192, rep_prefix_8_byte}, {-1, libcall}}}}
 797 };
 798
 799 static const
 800 struct processor_costs core2_cost = {
       /* Cost table for Core 2.  This is a positional initializer: every entry
          is identified only by its trailing comment, so the order here must
          match the field order of struct processor_costs (declared outside
          this part of the file).  */
 801   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 802   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
 803   COSTS_N_INSNS (1),                    /* variable shift costs */
 804   COSTS_N_INSNS (1),                    /* constant shift costs */
 805   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
 806    COSTS_N_INSNS (3),                   /*                               HI */
 807    COSTS_N_INSNS (3),                   /*                               SI */
 808    COSTS_N_INSNS (3),                   /*                               DI */
 809    COSTS_N_INSNS (3)},                  /*                               other */
 810   0,                                    /* cost of multiply per each bit set */
 811   {COSTS_N_INSNS (22),                  /* cost of a divide/mod for QI */
 812    COSTS_N_INSNS (22),                  /*                          HI */
 813    COSTS_N_INSNS (22),                  /*                          SI */
 814    COSTS_N_INSNS (22),                  /*                          DI */
 815    COSTS_N_INSNS (22)},                 /*                          other */
 816   COSTS_N_INSNS (1),                    /* cost of movsx */
 817   COSTS_N_INSNS (1),                    /* cost of movzx */
 818   8,                                    /* "large" insn */
 819   16,                                   /* MOVE_RATIO */
 820   2,                                    /* cost for loading QImode using movzbl */
 821   {6, 6, 6},                            /* cost of loading integer registers
 822                                            in QImode, HImode and SImode.
 823                                            Relative to reg-reg move (2).  */
 824   {4, 4, 4},                            /* cost of storing integer registers */
 825   2,                                    /* cost of reg,reg fld/fst */
 826   {6, 6, 6},                            /* cost of loading fp registers
 827                                            in SFmode, DFmode and XFmode */
 828   {4, 4, 4},                            /* cost of storing fp registers
                                                in SFmode, DFmode and XFmode */
 829   2,                                    /* cost of moving MMX register */
 830   {6, 6},                               /* cost of loading MMX registers
 831                                            in SImode and DImode */
 832   {4, 4},                               /* cost of storing MMX registers
 833                                            in SImode and DImode */
 834   2,                                    /* cost of moving SSE register */
 835   {6, 6, 6},                            /* cost of loading SSE registers
 836                                            in SImode, DImode and TImode */
 837   {4, 4, 4},                            /* cost of storing SSE registers
 838                                            in SImode, DImode and TImode */
 839   2,                                    /* MMX or SSE register to integer */
 840   128,                                  /* size of prefetch block */
 841   8,                                    /* number of parallel prefetches */
 842   3,                                    /* Branch cost */
 843   COSTS_N_INSNS (3),                    /* cost of FADD and FSUB insns.  */
 844   COSTS_N_INSNS (5),                    /* cost of FMUL instruction.  */
 845   COSTS_N_INSNS (32),                   /* cost of FDIV instruction.  */
 846   COSTS_N_INSNS (1),                    /* cost of FABS instruction.  */
 847   COSTS_N_INSNS (1),                    /* cost of FCHS instruction.  */
 848   COSTS_N_INSNS (58),                   /* cost of FSQRT instruction.  */
       /* NOTE(review): the two groups below have no per-field comments.
          Presumably the first group is the memcpy strategy and the second the
          memset strategy, each with a 32-bit and a 64-bit variant, and each
          {N, alg} pair picks alg for blocks of up to N bytes (-1 = no limit);
          confirm against struct processor_costs.  */
 849   {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
 850    {libcall, {{32, loop}, {64, rep_prefix_4_byte},
 851               {8192, rep_prefix_8_byte}, {-1, libcall}}}},
 852   {{libcall, {{8, loop}, {15, unrolled_loop},
 853               {2048, rep_prefix_4_byte}, {-1, libcall}}},
 854    {libcall, {{24, loop}, {32, unrolled_loop},
 855               {8192, rep_prefix_8_byte}, {-1, libcall}}}}
 856 };
 857
 858 /* Generic64 should produce code tuned for Nocona and K8.  */
 859 static const
 860 struct processor_costs generic64_cost = {
       /* Generic 64-bit cost table.  This is a positional initializer: every
          entry is identified only by its trailing comment, so the order here
          must match the field order of struct processor_costs (declared
          outside this part of the file).  */
 861   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 862   /* On all chips taken into consideration lea is 2 cycles and more.  With
 863      this cost however our current implementation of synth_mult results in
 864      use of unnecessary temporary registers causing regression on several
 865      SPECfp benchmarks.  */
 866   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
 867   COSTS_N_INSNS (1),                    /* variable shift costs */
 868   COSTS_N_INSNS (1),                    /* constant shift costs */
 869   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
 870    COSTS_N_INSNS (4),                   /*                               HI */
 871    COSTS_N_INSNS (3),                   /*                               SI */
 872    COSTS_N_INSNS (4),                   /*                               DI */
 873    COSTS_N_INSNS (2)},                  /*                               other */
 874   0,                                    /* cost of multiply per each bit set */
 875   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
 876    COSTS_N_INSNS (26),                  /*                          HI */
 877    COSTS_N_INSNS (42),                  /*                          SI */
 878    COSTS_N_INSNS (74),                  /*                          DI */
 879    COSTS_N_INSNS (74)},                 /*                          other */
 880   COSTS_N_INSNS (1),                    /* cost of movsx */
 881   COSTS_N_INSNS (1),                    /* cost of movzx */
 882   8,                                    /* "large" insn */
 883   17,                                   /* MOVE_RATIO */
 884   4,                                    /* cost for loading QImode using movzbl */
 885   {4, 4, 4},                            /* cost of loading integer registers
 886                                            in QImode, HImode and SImode.
 887                                            Relative to reg-reg move (2).  */
 888   {4, 4, 4},                            /* cost of storing integer registers */
 889   4,                                    /* cost of reg,reg fld/fst */
 890   {12, 12, 12},                         /* cost of loading fp registers
 891                                            in SFmode, DFmode and XFmode */
 892   {6, 6, 8},                            /* cost of storing fp registers
 893                                            in SFmode, DFmode and XFmode */
 894   2,                                    /* cost of moving MMX register */
 895   {8, 8},                               /* cost of loading MMX registers
 896                                            in SImode and DImode */
 897   {8, 8},                               /* cost of storing MMX registers
 898                                            in SImode and DImode */
 899   2,                                    /* cost of moving SSE register */
 900   {8, 8, 8},                            /* cost of loading SSE registers
 901                                            in SImode, DImode and TImode */
 902   {8, 8, 8},                            /* cost of storing SSE registers
 903                                            in SImode, DImode and TImode */
 904   5,                                    /* MMX or SSE register to integer */
 905   64,                                   /* size of prefetch block */
 906   6,                                    /* number of parallel prefetches */
 907   /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
 908      value is increased to the perhaps more appropriate value of 5.  */
 909   3,                                    /* Branch cost */
 910   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
 911   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
 912   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
 913   COSTS_N_INSNS (8),                    /* cost of FABS instruction.  */
 914   COSTS_N_INSNS (8),                    /* cost of FCHS instruction.  */
 915   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
       /* NOTE(review): the two groups below have no per-field comments.
          Presumably the first group is the memcpy strategy and the second the
          memset strategy, each with a 32-bit and a 64-bit variant (here the
          first variant is DUMMY_STRINGOP_ALGS), and each {N, alg} pair picks
          alg for blocks of up to N bytes (-1 = no limit); confirm against
          struct processor_costs.  */
 916   {DUMMY_STRINGOP_ALGS,
 917    {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
 918   {DUMMY_STRINGOP_ALGS,
 919    {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}
 920 };
 921
 922 /* Generic32 should produce code tuned for Athlon, PPro, Pentium4, Nocona and K8.
     Entries written with COSTS_N_INSNS are instruction costs; the remaining
     integers (move/load/store costs, prefetch parameters, branch cost) are
     used as written.  */
 923 static const
 924 struct processor_costs generic32_cost = {
 925   COSTS_N_INSNS (1),                    /* cost of an add instruction */
 926   COSTS_N_INSNS (1) + 1,                /* cost of a lea instruction */
 927   COSTS_N_INSNS (1),                    /* variable shift costs */
 928   COSTS_N_INSNS (1),                    /* constant shift costs */
 929   {COSTS_N_INSNS (3),                   /* cost of starting multiply for QI */
 930    COSTS_N_INSNS (4),                   /*                               HI */
 931    COSTS_N_INSNS (3),                   /*                               SI */
 932    COSTS_N_INSNS (4),                   /*                               DI */
 933    COSTS_N_INSNS (2)},                  /*                               other */
 934   0,                                    /* cost of multiply per each bit set */
 935   {COSTS_N_INSNS (18),                  /* cost of a divide/mod for QI */
 936    COSTS_N_INSNS (26),                  /*                          HI */
 937    COSTS_N_INSNS (42),                  /*                          SI */
 938    COSTS_N_INSNS (74),                  /*                          DI */
 939    COSTS_N_INSNS (74)},                 /*                          other */
 940   COSTS_N_INSNS (1),                    /* cost of movsx */
 941   COSTS_N_INSNS (1),                    /* cost of movzx */
 942   8,                                    /* "large" insn */
 943   17,                                   /* MOVE_RATIO */
 944   4,                                    /* cost for loading QImode using movzbl */
 945   {4, 4, 4},                            /* cost of loading integer registers
 946                                            in QImode, HImode and SImode.
 947                                            Relative to reg-reg move (2).  */
 948   {4, 4, 4},                            /* cost of storing integer registers */
 949   4,                                    /* cost of reg,reg fld/fst */
 950   {12, 12, 12},                         /* cost of loading fp registers
 951                                            in SFmode, DFmode and XFmode */
 952   {6, 6, 8},                            /* cost of storing fp registers
 953                                            in SFmode, DFmode and XFmode */
 954   2,                                    /* cost of moving MMX register */
 955   {8, 8},                               /* cost of loading MMX registers
 956                                            in SImode and DImode */
 957   {8, 8},                               /* cost of storing MMX registers
 958                                            in SImode and DImode */
 959   2,                                    /* cost of moving SSE register */
 960   {8, 8, 8},                            /* cost of loading SSE registers
 961                                            in SImode, DImode and TImode */
 962   {8, 8, 8},                            /* cost of storing SSE registers
 963                                            in SImode, DImode and TImode */
 964   5,                                    /* MMX or SSE register to integer */
 965   64,                                   /* size of prefetch block */
 966   6,                                    /* number of parallel prefetches */
 967   3,                                    /* Branch cost */
 968   COSTS_N_INSNS (8),                    /* cost of FADD and FSUB insns.  */
 969   COSTS_N_INSNS (8),                    /* cost of FMUL instruction.  */
 970   COSTS_N_INSNS (20),                   /* cost of FDIV instruction.  */
 971   COSTS_N_INSNS (8),                    /* cost of FABS instruction.  */
 972   COSTS_N_INSNS (8),                    /* cost of FCHS instruction.  */
 973   COSTS_N_INSNS (40),                   /* cost of FSQRT instruction.  */
       /* String operation strategies.  Only the first slot of each pair is
          populated here; the second is DUMMY_STRINGOP_ALGS.
          NOTE(review): presumably the first slot is the 32-bit strategy and
          the second the 64-bit one -- confirm against struct processor_costs.  */
 974   {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
 975    DUMMY_STRINGOP_ALGS},
 976   {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
 977    DUMMY_STRINGOP_ALGS},
 978 };
 979
 980 const struct processor_costs *ix86_cost = &pentium_cost;
 981
 982 /* Processor feature/optimization bitmasks.  Each m_* macro is either a
     single bit, 1 << PROCESSOR_*, or the union of several such bits; the
     feature tables below are built from them.  */
 983 #define m_386 (1<<PROCESSOR_I386)
 984 #define m_486 (1<<PROCESSOR_I486)
 985 #define m_PENT (1<<PROCESSOR_PENTIUM)
 986 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
 987 #define m_PENT4  (1<<PROCESSOR_PENTIUM4)
 988 #define m_NOCONA  (1<<PROCESSOR_NOCONA)
 989 #define m_CORE2  (1<<PROCESSOR_CORE2)
 990
 991 #define m_GEODE  (1<<PROCESSOR_GEODE)
 992 #define m_K6  (1<<PROCESSOR_K6)
 993 #define m_K6_GEODE  (m_K6 | m_GEODE)
 994 #define m_K8  (1<<PROCESSOR_K8)
 995 #define m_ATHLON  (1<<PROCESSOR_ATHLON)
 996 #define m_ATHLON_K8  (m_K8 | m_ATHLON)
 997 #define m_AMDFAM10  (1<<PROCESSOR_AMDFAM10)
 998 #define m_ATHLON_K8_AMDFAM10  (m_K8 | m_ATHLON | m_AMDFAM10)
 999
1000 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1001 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1002
1003 /* Generic instruction choice should be the common subset of supported CPUs
1004    (PPro/PENT4/NOCONA/CORE2/Athlon/K8).  */
1005 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
1006
1007 /* Feature tests against the various tunings.  Each element is the set of
     processors (an OR of m_* bits, or the complement of one) for which the
     tuning named in the comment above it is enabled.
     NOTE(review): the elements are presumably in X86_TUNE_* enumeration
     order -- keep the two in sync when adding an entry.  */
1008 unsigned int ix86_tune_features[X86_TUNE_LAST] = {
1009   /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
1010      negatively, so enabling for Generic64 seems like good code size
1011      tradeoff.  We can't enable it for 32bit generic because it does not
1012      work well with PPro based chips.  */
1013   m_386 | m_K6_GEODE | m_ATHLON_K8_AMDFAM10 | m_CORE2 | m_GENERIC64,
1014
1015   /* X86_TUNE_PUSH_MEMORY */
1016   m_386 | m_K6_GEODE | m_ATHLON_K8_AMDFAM10 | m_PENT4
1017   | m_NOCONA | m_CORE2 | m_GENERIC,
1018
1019   /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1020   m_486 | m_PENT,
1021
1022   /* X86_TUNE_USE_BIT_TEST */
1023   m_386,
1024
1025   /* X86_TUNE_UNROLL_STRLEN */
1026   m_486 | m_PENT | m_PPRO | m_ATHLON_K8_AMDFAM10 | m_K6 | m_CORE2 | m_GENERIC,
1027
1028   /* X86_TUNE_DEEP_BRANCH_PREDICTION */
1029   m_PPRO | m_K6_GEODE | m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_GENERIC,
1030
1031   /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
1032      on simulation result. But after P4 was made, no performance benefit
1033      was observed with branch hints.  It also increases the code size.
1034      As a result, icc never generates branch hints.  */
1035   0,
1036
1037   /* X86_TUNE_DOUBLE_WITH_ADD */
1038   ~m_386,
1039
1040   /* X86_TUNE_USE_SAHF */
1041   m_PPRO | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_PENT4
1042   | m_NOCONA | m_CORE2 | m_GENERIC,
1043
1044   /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
1045      partial dependencies.  */
1046   m_ATHLON_K8_AMDFAM10 | m_PPRO | m_PENT4 | m_NOCONA
1047   | m_CORE2 | m_GENERIC | m_GEODE /* m_386 | m_K6 */,
1048
1049   /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
1050      register stalls on Generic32 compilation setting as well.  However
1051      in current implementation the partial register stalls are not eliminated
1052      very well - they can be introduced via subregs synthesized by combine
1053      and can happen in caller/callee saving sequences.  Because this option
1054      pays back little on PPro based chips and is in conflict with partial reg
1055      dependencies used by Athlon/P4 based chips, it is better to leave it off
1056      for generic32 for now.  */
1057   m_PPRO,
1058
1059   /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1060   m_CORE2 | m_GENERIC,
1061
1062   /* X86_TUNE_USE_HIMODE_FIOP */
1063   m_386 | m_486 | m_K6_GEODE,
1064
1065   /* X86_TUNE_USE_SIMODE_FIOP */
1066   ~(m_PPRO | m_ATHLON_K8_AMDFAM10 | m_PENT | m_CORE2 | m_GENERIC),
1067
1068   /* X86_TUNE_USE_MOV0 */
1069   m_K6,
1070
1071   /* X86_TUNE_USE_CLTD */
1072   ~(m_PENT | m_K6 | m_CORE2 | m_GENERIC),
1073
1074   /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx.  */
1075   m_PENT4,
1076
1077   /* X86_TUNE_SPLIT_LONG_MOVES */
1078   m_PPRO,
1079
1080   /* X86_TUNE_READ_MODIFY_WRITE */
1081   ~m_PENT,
1082
1083   /* X86_TUNE_READ_MODIFY */
1084   ~(m_PENT | m_PPRO),
1085
1086   /* X86_TUNE_PROMOTE_QIMODE */
1087   m_K6_GEODE | m_PENT | m_386 | m_486 | m_ATHLON_K8_AMDFAM10 | m_CORE2
1088   | m_GENERIC /* | m_PENT4 ? */,
1089
1090   /* X86_TUNE_FAST_PREFIX */
1091   ~(m_PENT | m_486 | m_386),
1092
1093   /* X86_TUNE_SINGLE_STRINGOP */
1094   m_386 | m_PENT4 | m_NOCONA,
1095
1096   /* X86_TUNE_QIMODE_MATH */
1097   ~0,
1098
1099   /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
1100      register stalls.  Just like X86_TUNE_PARTIAL_REG_STALL this option
1101      might be considered for Generic32 if our scheme for avoiding partial
1102      stalls was more effective.  */
1103   ~m_PPRO,
1104
1105   /* X86_TUNE_PROMOTE_QI_REGS */
1106   0,
1107
1108   /* X86_TUNE_PROMOTE_HI_REGS */
1109   m_PPRO,
1110
1111   /* X86_TUNE_ADD_ESP_4: Enable if add/sub is preferred over 1/2 push/pop.  */
1112   m_ATHLON_K8_AMDFAM10 | m_K6_GEODE | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
1113
1114   /* X86_TUNE_ADD_ESP_8 */
1115   m_ATHLON_K8_AMDFAM10 | m_PPRO | m_K6_GEODE | m_386
1116   | m_486 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
1117
1118   /* X86_TUNE_SUB_ESP_4 */
1119   m_ATHLON_K8_AMDFAM10 | m_PPRO | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
1120
1121   /* X86_TUNE_SUB_ESP_8 */
1122   m_ATHLON_K8_AMDFAM10 | m_PPRO | m_386 | m_486
1123   | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
1124
1125   /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
1126      for DFmode copies */
1127   ~(m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2
1128     | m_GENERIC | m_GEODE),
1129
1130   /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
1131   m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
1132
1133   /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
1134      conflict here between PPro/Pentium4 based chips that treat 128bit
1135      SSE registers as single units versus K8 based chips that divide SSE
1136      registers into two 64bit halves.  This knob promotes all store destinations
1137      to be 128bit to allow register renaming on 128bit SSE units, but usually
1138      results in one extra microop on 64bit SSE units.  Experimental results
1139      show that disabling this option on P4 brings over 20% SPECfp regression,
1140      while enabling it on K8 brings roughly 2.4% regression that can be partly
1141      masked by careful scheduling of moves.  */
1142   m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC | m_AMDFAM10,
1143
1144   /* X86_TUNE_SSE_UNALIGNED_MOVE_OPTIMAL */
1145   m_AMDFAM10,
1146
1147   /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
1148      are resolved on SSE register parts instead of whole registers, so we may
1149      maintain just lower part of scalar values in proper format leaving the
1150      upper part undefined.  */
1151   m_ATHLON_K8,
1152
1153   /* X86_TUNE_SSE_TYPELESS_STORES */
1154   m_ATHLON_K8_AMDFAM10,
1155
1156   /* X86_TUNE_SSE_LOAD0_BY_PXOR */
1157   m_PPRO | m_PENT4 | m_NOCONA,
1158
1159   /* X86_TUNE_MEMORY_MISMATCH_STALL */
1160   m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
1161
1162   /* X86_TUNE_PROLOGUE_USING_MOVE */
1163   m_ATHLON_K8 | m_PPRO | m_CORE2 | m_GENERIC,
1164
1165   /* X86_TUNE_EPILOGUE_USING_MOVE */
1166   m_ATHLON_K8 | m_PPRO | m_CORE2 | m_GENERIC,
1167
1168   /* X86_TUNE_SHIFT1 */
1169   ~m_486,
1170
1171   /* X86_TUNE_USE_FFREEP */
1172   m_ATHLON_K8_AMDFAM10,
1173
1174   /* X86_TUNE_INTER_UNIT_MOVES */
1175   ~(m_ATHLON_K8_AMDFAM10 | m_GENERIC),
1176
1177   /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
1178      than 4 branch instructions in the 16 byte window.  */
1179   m_PPRO | m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
1180
1181   /* X86_TUNE_SCHEDULE */
1182   m_PPRO | m_ATHLON_K8_AMDFAM10 | m_K6_GEODE | m_PENT | m_CORE2 | m_GENERIC,
1183
1184   /* X86_TUNE_USE_BT */
1185   m_ATHLON_K8_AMDFAM10,
1186
1187   /* X86_TUNE_USE_INCDEC */
1188   ~(m_PENT4 | m_NOCONA | m_GENERIC),
1189
1190   /* X86_TUNE_PAD_RETURNS */
1191   m_ATHLON_K8_AMDFAM10 | m_CORE2 | m_GENERIC,
1192
1193   /* X86_TUNE_EXT_80387_CONSTANTS */
1194   m_K6_GEODE | m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC,
1195
1196   /* X86_TUNE_SHORTEN_X87_SSE */
1197   ~m_K8,
1198
1199   /* X86_TUNE_AVOID_VECTOR_DECODE */
1200   m_K8 | m_GENERIC64,
1201
1202   /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode
1203      and SImode multiply, but 386 and 486 do HImode multiply faster.  */
1204   ~(m_386 | m_486),
1205
1206   /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
1207      vector path on AMD machines.  */
1208   m_K8 | m_GENERIC64 | m_AMDFAM10,
1209
1210   /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
1211      machines.  */
1212   m_K8 | m_GENERIC64 | m_AMDFAM10,
1213
1214   /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
1215      than a MOV.  */
1216   m_PENT,
1217
1218   /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
1219      but one byte longer.  */
1220   m_PENT,
1221
1222   /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory
1223      operand that cannot be represented using a modRM byte.  The XOR
1224      replacement is long decoded, so this split helps here as well.  */
1225   m_K6,
1226 };
1227
1228 /* Feature tests against the various architecture variations.  Each element
     is the set of processors (an OR of m_* bits, or the complement of one)
     for which the instruction named in the comment above it is available.
     NOTE(review): the elements are presumably in X86_ARCH_* enumeration
     order -- keep the two in sync when adding an entry.  */
1229 unsigned int ix86_arch_features[X86_ARCH_LAST] = {
1230   /* X86_ARCH_CMOVE */
1231   m_PPRO | m_GEODE | m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA,
1232
1233   /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486.  */
1234   ~m_386,
1235
1236   /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
1237   ~(m_386 | m_486),
1238
1239   /* X86_ARCH_XADD: Exchange and add was added for 80486.  */
1240   ~m_386,
1241
1242   /* X86_ARCH_BSWAP: Byteswap was added for 80486.  */
1243   ~m_386,
1244 };
1245
1246 static const unsigned int x86_accumulate_outgoing_args
1247   = m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC;
1248
1249 static const unsigned int x86_arch_always_fancy_math_387
1250   = m_PENT | m_PPRO | m_ATHLON_K8_AMDFAM10 | m_PENT4
1251     | m_NOCONA | m_CORE2 | m_GENERIC;
1252
1253 static enum stringop_alg stringop_alg = no_stringop;
1254
1255 /* In case the average insn count for single function invocation is
1256    lower than this constant, emit fast (but longer) prologue and
1257    epilogue code.  */
1258 #define FAST_PROLOGUE_INSN_COUNT 20
1259
1260 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively.  */
1261 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
1262 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
1263 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
1264
1265 /* Array of the smallest class containing reg number REGNO, indexed by
1266    REGNO.  Used by REGNO_REG_CLASS in i386.h.  The rows follow the same
     register order as the dbx register maps below.  */
1267
1268 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
1269 {
1270   /* ax, dx, cx, bx */
1271   AREG, DREG, CREG, BREG,
1272   /* si, di, bp, sp */
1273   SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
1274   /* FP registers */
1275   FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
1276   FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
1277   /* arg pointer */
1278   NON_Q_REGS,
1279   /* flags, fpsr, fpcr, frame */
1280   NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
       /* SSE registers */
1281   SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
1282   SSE_REGS, SSE_REGS,
       /* MMX registers */
1283   MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
1284   MMX_REGS, MMX_REGS,
       /* extended integer registers */
1285   NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
1286   NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
       /* extended SSE registers */
1287   SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
1288   SSE_REGS, SSE_REGS,
1289 };
1290
1291 /* The "default" register map used in 32bit mode.  Indexed by gcc register
     number; each entry is the number used for that register in debug output.
     NOTE(review): -1 presumably means the register has no debugger number --
     confirm against the users of this table.  */
1292
1293 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
1294 {
1295   0, 2, 1, 3, 6, 7, 4, 5,               /* general regs */
1296   12, 13, 14, 15, 16, 17, 18, 19,       /* fp regs */
1297   -1, -1, -1, -1, -1,                   /* arg, flags, fpsr, fpcr, frame */
1298   21, 22, 23, 24, 25, 26, 27, 28,       /* SSE */
1299   29, 30, 31, 32, 33, 34, 35, 36,       /* MMX */
1300   -1, -1, -1, -1, -1, -1, -1, -1,       /* extended integer registers */
1301   -1, -1, -1, -1, -1, -1, -1, -1,       /* extended SSE registers */
1302 };
1303
1304 static int const x86_64_int_parameter_registers[6] =
1305 {
       /* Entries are gcc register numbers; the trailing comment names the
          register.  NOTE(review): presumably listed in argument-passing
          order -- confirm against the code that indexes this table.  */
1306   5 /*RDI*/, 4 /*RSI*/, 1 /*RDX*/, 2 /*RCX*/,
1307   FIRST_REX_INT_REG /*R8 */, FIRST_REX_INT_REG + 1 /*R9 */
1308 };
1309
1310 static int const x86_64_ms_abi_int_parameter_registers[4] =
1311 {
       /* Same encoding as above, for the four-register MS ABI variant.  */
1312   2 /*RCX*/, 1 /*RDX*/,
1313   FIRST_REX_INT_REG /*R8 */, FIRST_REX_INT_REG + 1 /*R9 */
1314 };
1315
1316 static int const x86_64_int_return_registers[4] =
1317 {
       /* gcc register numbers of the integer return registers.  */
1318   0 /*RAX*/, 1 /*RDX*/, 5 /*RDI*/, 4 /*RSI*/
1319 };
1320
1321 /* The "default" register map used in 64bit mode.  Indexed by gcc register
     number, laid out in the same row order as dbx_register_map.  Unlike the
     32bit map, the extended integer and extended SSE registers have numbers
     here.  */
1322 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
1323 {
1324   0, 1, 2, 3, 4, 5, 6, 7,               /* general regs */
1325   33, 34, 35, 36, 37, 38, 39, 40,       /* fp regs */
1326   -1, -1, -1, -1, -1,                   /* arg, flags, fpsr, fpcr, frame */
1327   17, 18, 19, 20, 21, 22, 23, 24,       /* SSE */
1328   41, 42, 43, 44, 45, 46, 47, 48,       /* MMX */
1329   8,9,10,11,12,13,14,15,                /* extended integer registers */
1330   25, 26, 27, 28, 29, 30, 31, 32,       /* extended SSE registers */
1331 };
1332
1333 /* Define the register numbers to be used in Dwarf debugging information.
1334    The SVR4 reference port C compiler uses the following register numbers
1335    in its Dwarf output code:
1336         0 for %eax (gcc regno = 0)
1337         1 for %ecx (gcc regno = 2)
1338         2 for %edx (gcc regno = 1)
1339         3 for %ebx (gcc regno = 3)
1340         4 for %esp (gcc regno = 7)
1341         5 for %ebp (gcc regno = 6)
1342         6 for %esi (gcc regno = 4)
1343         7 for %edi (gcc regno = 5)
1344    The following three DWARF register numbers are never generated by
1345    the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
1346    believes these numbers have these meanings.
1347         8  for %eip    (no gcc equivalent)
1348         9  for %eflags (gcc regno = 17)
1349         10 for %trapno (no gcc equivalent)
1350    It is not at all clear how we should number the FP stack registers
1351    for the x86 architecture.  If the version of SDB on x86/svr4 were
1352    a bit less brain dead with respect to floating-point then we would
1353    have a precedent to follow with respect to DWARF register numbers
1354    for x86 FP registers, but the SDB on x86/svr4 is so completely
1355    broken with respect to FP registers that it is hardly worth thinking
1356    of it as something to strive for compatibility with.
1357    The version of x86/svr4 SDB I have at the moment does (partially)
1358    seem to believe that DWARF register number 11 is associated with
1359    the x86 register %st(0), but that's about all.  Higher DWARF
1360    register numbers don't seem to be associated with anything in
1361    particular, and even for DWARF regno 11, SDB only seems to under-
1362    stand that it should say that a variable lives in %st(0) (when
1363    asked via an `=' command) if we said it was in DWARF regno 11,
1364    but SDB still prints garbage when asked for the value of the
1365    variable in question (via a `/' command).
1366    (Also note that the labels SDB prints for various FP stack regs
1367    when doing an `x' command are all wrong.)
1368    Note that these problems generally don't affect the native SVR4
1369    C compiler because it doesn't allow the use of -O with -g and
1370    because when it is *not* optimizing, it allocates a memory
1371    location for each floating-point variable, and the memory
1372    location is what gets described in the DWARF AT_location
1373    attribute for the variable in question.
1374    Regardless of the severe mental illness of the x86/svr4 SDB, we
1375    do something sensible here and we use the following DWARF
1376    register numbers.  Note that these are all stack-top-relative
1377    numbers.
1378         11 for %st(0) (gcc regno = 8)
1379         12 for %st(1) (gcc regno = 9)
1380         13 for %st(2) (gcc regno = 10)
1381         14 for %st(3) (gcc regno = 11)
1382         15 for %st(4) (gcc regno = 12)
1383         16 for %st(5) (gcc regno = 13)
1384         17 for %st(6) (gcc regno = 14)
1385         18 for %st(7) (gcc regno = 15)
1386 */
1387 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
1388 {
       /* Indexed by gcc register number.  Compared with dbx_register_map,
          %ebp and %esp (gcc regno 6 and 7) map to 5 and 4, the flags
          register maps to 9, and the FP stack starts at 11, as laid out in
          the comment above.  */
1389   0, 2, 1, 3, 6, 7, 5, 4,               /* general regs */
1390   11, 12, 13, 14, 15, 16, 17, 18,       /* fp regs */
1391   -1, 9, -1, -1, -1,                    /* arg, flags, fpsr, fpcr, frame */
1392   21, 22, 23, 24, 25, 26, 27, 28,       /* SSE registers */
1393   29, 30, 31, 32, 33, 34, 35, 36,       /* MMX registers */
1394   -1, -1, -1, -1, -1, -1, -1, -1,       /* extended integer registers */
1395   -1, -1, -1, -1, -1, -1, -1, -1,       /* extended SSE registers */
1396 };
1397
1398 /* Test and compare insns in i386.md store the information needed to
1399    generate branch and scc insns here.  */
1400
1401 rtx ix86_compare_op0 = NULL_RTX;
1402 rtx ix86_compare_op1 = NULL_RTX;
1403 rtx ix86_compare_emitted = NULL_RTX;
1404
1405 /* Size of the register save area.  */
1406 #define X86_64_VARARGS_SIZE (REGPARM_MAX * UNITS_PER_WORD + SSE_REGPARM_MAX * 16)
1407
1408 /* Define the structure for the machine field in struct function.
     Entries are chained through NEXT.
     NOTE(review): MODE and N presumably identify the entry (a machine mode
     and a slot number) and RTL holds the rtx for it -- confirm against the
     code that creates and looks up these entries.  */
1409
1410 struct stack_local_entry GTY(())
1411 {
1412   unsigned short mode;
1413   unsigned short n;
1414   rtx rtl;
1415   struct stack_local_entry *next;
1416 };
1417
1418 /* Structure describing stack frame layout.
1419    Stack grows downward:
1420
1421    [arguments]
1422                                               <- ARG_POINTER
1423    saved pc
1424
1425    saved frame pointer if frame_pointer_needed
1426                                               <- HARD_FRAME_POINTER
1427    [saved regs]
1428
1429    [padding1]          \
1430                         )
1431    [va_arg registers]  (
1432                         > to_allocate         <- FRAME_POINTER
1433    [frame]             (
1434                         )
1435    [padding2]          /
1436   */
1437 struct ix86_frame
1438 {
       /* NOTE(review): presumably the number of registers in the
          [saved regs] area of the diagram above -- confirm in
          ix86_compute_frame_layout.  */
1439   int nregs;
       /* padding1, va_arg_size, frame and padding2 correspond to the
          like-named areas of the layout diagram above.  */
1440   int padding1;
1441   int va_arg_size;
1442   HOST_WIDE_INT frame;
1443   int padding2;
1444   int outgoing_arguments_size;
1445   int red_zone_size;
1446
       /* The span marked "to_allocate" in the diagram above.  */
1447   HOST_WIDE_INT to_allocate;
1448   /* The offsets relative to ARG_POINTER.  */
1449   HOST_WIDE_INT frame_pointer_offset;
1450   HOST_WIDE_INT hard_frame_pointer_offset;
1451   HOST_WIDE_INT stack_pointer_offset;
1452
1453   /* When save_regs_using_mov is set, emit prologue using
1454      move instead of push instructions.  */
1455   bool save_regs_using_mov;
1456 };
1457
1458 /* Code model option.  */
1459 enum cmodel ix86_cmodel;
1460 /* Asm dialect.  */
1461 enum asm_dialect ix86_asm_dialect = ASM_ATT;
1462 /* TLS dialects.  */
1463 enum tls_dialect ix86_tls_dialect = TLS_DIALECT_GNU;
1464
1465 /* Which unit we are generating floating point math for.  */
1466 enum fpmath_unit ix86_fpmath;
1467
1468 /* Which cpu are we scheduling for.  */
1469 enum processor_type ix86_tune;
1470
1471 /* Which instruction set architecture to use.  */
1472 enum processor_type ix86_arch;
1473
1474 /* true if sse prefetch instruction is not NOOP.  */
1475 int x86_prefetch_sse;
1476
1477 /* ix86_regparm_string as a number */
1478 static int ix86_regparm;
1479
1480 /* -mstackrealign option */
1481 extern int ix86_force_align_arg_pointer;
1482 static const char ix86_force_align_arg_pointer_string[] = "force_align_arg_pointer";
1483
1484 /* Preferred alignment for stack boundary in bits.  */
1485 unsigned int ix86_preferred_stack_boundary;
1486
1487 /* Values 1-5: see jump.c */
1488 int ix86_branch_cost;
1489
1490 /* Variables which are this size or smaller are put in the data/bss
1491    or ldata/lbss sections.  */
1492
1493 int ix86_section_threshold = 65536;
1494
1495 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL.  */
1496 char internal_label_prefix[16];
1497 int internal_label_prefix_len;
1498
1499 /* Register class used for passing given 64bit part of the argument.
1500    These represent classes as documented by the PS ABI, with the exception
1501    of SSESF, SSEDF classes, that are basically SSE class, just gcc will
1502    use SF or DFmode move instead of DImode to avoid reformatting penalties.
1503
1504    Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
1505    whenever possible (upper half does contain padding).  */
1506 enum x86_64_reg_class
1507   {
1508     X86_64_NO_CLASS,
1509     X86_64_INTEGER_CLASS,
1510     X86_64_INTEGERSI_CLASS,
1511     X86_64_SSE_CLASS,
1512     X86_64_SSESF_CLASS,
1513     X86_64_SSEDF_CLASS,
1514     X86_64_SSEUP_CLASS,
1515     X86_64_X87_CLASS,
1516     X86_64_X87UP_CLASS,
1517     X86_64_COMPLEX_X87_CLASS,
1518     X86_64_MEMORY_CLASS
1519   };
     /* Printable names, one per enumerator of x86_64_reg_class and in the
        same order.
        NOTE(review): the last entry, for X86_64_MEMORY_CLASS, is spelled
        "no", the same as X86_64_NO_CLASS; "memory" may have been intended.
        Check every user of this table before changing the string.  */
1520 static const char * const x86_64_reg_class_name[] =
1521 {
1522   "no", "integer", "integerSI", "sse", "sseSF", "sseDF",
1523   "sseup", "x87", "x87up", "cplx87", "no"
1524 };
1525
1526 #define MAX_CLASSES 4
1527
1528 /* Table of constants used by fldpi, fldln2, etc....  */
1529 static REAL_VALUE_TYPE ext_80387_constants_table [5];
1530 static bool ext_80387_constants_init = 0;
1531
1532 \f
1533 static struct machine_function * ix86_init_machine_status (void);
1534 static rtx ix86_function_value (tree, tree, bool);
1535 static int ix86_function_regparm (tree, tree);
1536 static void ix86_compute_frame_layout (struct ix86_frame *);
1537 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
1538                                                  rtx, rtx, int);
1539
1540 \f
1541 /* The svr4 ABI for the i386 says that records and unions are returned
1542    in memory.  */
1543 #ifndef DEFAULT_PCC_STRUCT_RETURN
1544 #define DEFAULT_PCC_STRUCT_RETURN 1
1545 #endif
1546
1547 /* Implement TARGET_HANDLE_OPTION.
1548    CODE identifies the option being processed and VALUE is zero when the
1549    option is being switched off.  Switching off one of the options below
1550    also clears the listed flags in target_flags and records them in
1551    target_flags_explicit:
1552
1553      OPT_m3dnow   MASK_3DNOW_A
1554      OPT_mmmx     MASK_3DNOW, MASK_3DNOW_A
1555      OPT_msse     MASK_SSE2, MASK_SSE3, MASK_SSSE3, MASK_SSE4A
1556      OPT_msse2    MASK_SSE3, MASK_SSSE3, MASK_SSE4A
1557      OPT_msse3    MASK_SSSE3, MASK_SSE4A
1558      OPT_mssse3   MASK_SSE4A
1559
1560    Switching an option on, or handling any other option, changes nothing.
1561    ARG is unused.  The return value is always true.  */
1562
1563 static bool
1564 ix86_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, int value)
1565 {
1566   int dependent_isa;
1567
1568   /* Only the negative forms have side effects.  */
1569   if (value)
1570     return true;
1571
1572   switch (code)
1573     {
1574     case OPT_m3dnow:
1575       dependent_isa = MASK_3DNOW_A;
1576       break;
1577
1578     case OPT_mmmx:
1579       dependent_isa = MASK_3DNOW | MASK_3DNOW_A;
1580       break;
1581
1582     case OPT_msse:
1583       dependent_isa = MASK_SSE2 | MASK_SSE3 | MASK_SSSE3 | MASK_SSE4A;
1584       break;
1585
1586     case OPT_msse2:
1587       dependent_isa = MASK_SSE3 | MASK_SSSE3 | MASK_SSE4A;
1588       break;
1589
1590     case OPT_msse3:
1591       dependent_isa = MASK_SSSE3 | MASK_SSE4A;
1592       break;
1593
1594     case OPT_mssse3:
1595       dependent_isa = MASK_SSE4A;
1596       break;
1597
1598     default:
1599       /* Nothing depends on the remaining options.  */
1600       return true;
1601     }
1602
1603   /* Drop the dependent extensions and mark them as explicitly set.  */
1604   target_flags &= ~dependent_isa;
1605   target_flags_explicit |= dependent_isa;
1606   return true;
1607 }
1608
1609 /* Sometimes certain combinations of command options do not make
1610    sense on a particular target machine.  You can define a macro
1611    `OVERRIDE_OPTIONS' to take account of this.  This macro, if
1612    defined, is executed once just after all the command options have
1613    been parsed.
1614
1615    Don't use this macro to turn on various extra optimizations for
1616    `-O'.  That is what `OPTIMIZATION_OPTIONS' is for.  */
1617
1618 void
1619 override_options (void)
1620 {
1621   int i;
1622   int ix86_tune_defaulted = 0;
1623   unsigned int ix86_arch_mask, ix86_tune_mask;
1624
1625   /* Comes from final.c -- no real reason to change it.  */
1626 #define MAX_CODE_ALIGN 16
1627
1628   static struct ptt
1629     {
1630       const struct processor_costs *cost;       /* Processor costs */
1631       const int target_enable;                  /* Target flags to enable.  */
1632       const int target_disable;                 /* Target flags to disable.  */
1633       const int align_loop;                     /* Default alignments.  */
1634       const int align_loop_max_skip;
1635       const int align_jump;
1636       const int align_jump_max_skip;
1637       const int align_func;
1638     }
1639   const processor_target_table[PROCESSOR_max] =
1640     {
1641       {&i386_cost, 0, 0, 4, 3, 4, 3, 4},
1642       {&i486_cost, 0, 0, 16, 15, 16, 15, 16},
1643       {&pentium_cost, 0, 0, 16, 7, 16, 7, 16},
1644       {&pentiumpro_cost, 0, 0, 16, 15, 16, 7, 16},
1645       {&geode_cost, 0, 0, 0, 0, 0, 0, 0},
1646       {&k6_cost, 0, 0, 32, 7, 32, 7, 32},
1647       {&athlon_cost, 0, 0, 16, 7, 16, 7, 16},
1648       {&pentium4_cost, 0, 0, 0, 0, 0, 0, 0},
1649       {&k8_cost, 0, 0, 16, 7, 16, 7, 16},
1650       {&nocona_cost, 0, 0, 0, 0, 0, 0, 0},
1651       {&core2_cost, 0, 0, 16, 7, 16, 7, 16},
1652       {&generic32_cost, 0, 0, 16, 7, 16, 7, 16},
1653       {&generic64_cost, 0, 0, 16, 7, 16, 7, 16},
1654       {&amdfam10_cost, 0, 0, 32, 7, 32, 7, 32}
1655     };
1656
1657   static const char * const cpu_names[] = TARGET_CPU_DEFAULT_NAMES;
1658   static struct pta
1659     {
1660       const char *const name;           /* processor name or nickname.  */
1661       const enum processor_type processor;
1662       const enum pta_flags
1663         {
1664           PTA_SSE = 1 << 0,
1665           PTA_SSE2 = 1 << 1,
1666           PTA_SSE3 = 1 << 2,
1667           PTA_MMX = 1 << 3,
1668           PTA_PREFETCH_SSE = 1 << 4,
1669           PTA_3DNOW = 1 << 5,
1670           PTA_3DNOW_A = 1 << 6,
1671           PTA_64BIT = 1 << 7,
1672           PTA_SSSE3 = 1 << 8,
1673           PTA_CX16 = 1 << 9,
1674           PTA_POPCNT = 1 << 10,
1675           PTA_ABM = 1 << 11,
1676           PTA_SSE4A = 1 << 12,
1677           PTA_NO_SAHF = 1 << 13
1678         } flags;
1679     }
1680   const processor_alias_table[] =
1681     {
1682       {"i386", PROCESSOR_I386, 0},
1683       {"i486", PROCESSOR_I486, 0},
1684       {"i586", PROCESSOR_PENTIUM, 0},
1685       {"pentium", PROCESSOR_PENTIUM, 0},
1686       {"pentium-mmx", PROCESSOR_PENTIUM, PTA_MMX},
1687       {"winchip-c6", PROCESSOR_I486, PTA_MMX},
1688       {"winchip2", PROCESSOR_I486, PTA_MMX | PTA_3DNOW},
1689       {"c3", PROCESSOR_I486, PTA_MMX | PTA_3DNOW},
1690       {"c3-2", PROCESSOR_PENTIUMPRO, PTA_MMX | PTA_PREFETCH_SSE | PTA_SSE},
1691       {"i686", PROCESSOR_PENTIUMPRO, 0},
1692       {"pentiumpro", PROCESSOR_PENTIUMPRO, 0},
1693       {"pentium2", PROCESSOR_PENTIUMPRO, PTA_MMX},
1694       {"pentium3", PROCESSOR_PENTIUMPRO, PTA_MMX | PTA_SSE | PTA_PREFETCH_SSE},
1695       {"pentium3m", PROCESSOR_PENTIUMPRO, PTA_MMX | PTA_SSE | PTA_PREFETCH_SSE},
1696       {"pentium-m", PROCESSOR_PENTIUMPRO, PTA_MMX | PTA_SSE | PTA_PREFETCH_SSE | PTA_SSE2},
1697       {"pentium4", PROCESSOR_PENTIUM4, PTA_SSE | PTA_SSE2
1698                                        | PTA_MMX | PTA_PREFETCH_SSE},
1699       {"pentium4m", PROCESSOR_PENTIUM4, PTA_SSE | PTA_SSE2
1700                                         | PTA_MMX | PTA_PREFETCH_SSE},
1701       {"prescott", PROCESSOR_NOCONA, PTA_SSE | PTA_SSE2 | PTA_SSE3
1702                                         | PTA_MMX | PTA_PREFETCH_SSE},
1703       {"nocona", PROCESSOR_NOCONA, PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_64BIT
1704                                         | PTA_MMX | PTA_PREFETCH_SSE
1705                                         | PTA_CX16 | PTA_NO_SAHF},
1706       {"core2", PROCESSOR_CORE2, PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3
1707                                         | PTA_64BIT | PTA_MMX
1708                                         | PTA_PREFETCH_SSE | PTA_CX16},
1709       {"geode", PROCESSOR_GEODE, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW
1710                                    | PTA_3DNOW_A},
1711       {"k6", PROCESSOR_K6, PTA_MMX},
1712       {"k6-2", PROCESSOR_K6, PTA_MMX | PTA_3DNOW},
1713       {"k6-3", PROCESSOR_K6, PTA_MMX | PTA_3DNOW},
1714       {"athlon", PROCESSOR_ATHLON, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW
1715                                    | PTA_3DNOW_A},
1716       {"athlon-tbird", PROCESSOR_ATHLON, PTA_MMX | PTA_PREFETCH_SSE
1717                                          | PTA_3DNOW | PTA_3DNOW_A},
1718       {"athlon-4", PROCESSOR_ATHLON, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW
1719                                     | PTA_3DNOW_A | PTA_SSE},
1720       {"athlon-xp", PROCESSOR_ATHLON, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW
1721                                       | PTA_3DNOW_A | PTA_SSE},
1722       {"athlon-mp", PROCESSOR_ATHLON, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW
1723                                       | PTA_3DNOW_A | PTA_SSE},
1724       {"x86-64", PROCESSOR_K8, PTA_MMX | PTA_PREFETCH_SSE | PTA_64BIT
1725                                | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},
1726       {"k8", PROCESSOR_K8, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW | PTA_64BIT
1727                                       | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
1728                                       | PTA_NO_SAHF},
1729       {"opteron", PROCESSOR_K8, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW
1730                                         | PTA_64BIT | PTA_3DNOW_A | PTA_SSE
1731                                         | PTA_SSE2 | PTA_NO_SAHF},
1732       {"athlon64", PROCESSOR_K8, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW
1733                                          | PTA_64BIT | PTA_3DNOW_A | PTA_SSE
1734                                          | PTA_SSE2 | PTA_NO_SAHF},
1735       {"athlon-fx", PROCESSOR_K8, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW
1736                                           | PTA_64BIT | PTA_3DNOW_A | PTA_SSE
1737                                           | PTA_SSE2 | PTA_NO_SAHF},
1738       {"amdfam10", PROCESSOR_AMDFAM10, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW
1739                                        | PTA_64BIT | PTA_3DNOW_A | PTA_SSE
1740                                        | PTA_SSE2 | PTA_SSE3 | PTA_POPCNT
1741                                        | PTA_ABM | PTA_SSE4A | PTA_CX16},
1742       {"barcelona", PROCESSOR_AMDFAM10, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW
1743                                        | PTA_64BIT | PTA_3DNOW_A | PTA_SSE
1744                                        | PTA_SSE2 | PTA_SSE3 | PTA_POPCNT
1745                                        | PTA_ABM | PTA_SSE4A | PTA_CX16},
1746       {"generic32", PROCESSOR_GENERIC32, 0 /* flags are only used for -march switch.  */ },
1747       {"generic64", PROCESSOR_GENERIC64, PTA_64BIT /* flags are only used for -march switch.  */ },
1748     };
1749
1750   int const pta_size = ARRAY_SIZE (processor_alias_table);
1751
1752 #ifdef SUBTARGET_OVERRIDE_OPTIONS
1753   SUBTARGET_OVERRIDE_OPTIONS;
1754 #endif
1755
1756 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
1757   SUBSUBTARGET_OVERRIDE_OPTIONS;
1758 #endif
1759
1760   /* -fPIC is the default for x86_64.  */
1761   if (TARGET_MACHO && TARGET_64BIT)
1762     flag_pic = 2;
1763
1764   /* Set the default values for switches whose default depends on TARGET_64BIT
1765      in case they weren't overwritten by command line options.  */
1766   if (TARGET_64BIT)
1767     {
1768       /* Mach-O doesn't support omitting the frame pointer for now.  */
1769       if (flag_omit_frame_pointer == 2)
1770         flag_omit_frame_pointer = (TARGET_MACHO ? 0 : 1);
1771       if (flag_asynchronous_unwind_tables == 2)
1772         flag_asynchronous_unwind_tables = 1;
1773       if (flag_pcc_struct_return == 2)
1774         flag_pcc_struct_return = 0;
1775     }
1776   else
1777     {
1778       if (flag_omit_frame_pointer == 2)
1779         flag_omit_frame_pointer = 0;
1780       if (flag_asynchronous_unwind_tables == 2)
1781         flag_asynchronous_unwind_tables = 0;
1782       if (flag_pcc_struct_return == 2)
1783         flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
1784     }
1785
1786   /* Need to check -mtune=generic first.  */
1787   if (ix86_tune_string)
1788     {
1789       if (!strcmp (ix86_tune_string, "generic")
1790           || !strcmp (ix86_tune_string, "i686")
1791           /* As special support for cross compilers we read -mtune=native
1792              as -mtune=generic.  With native compilers we won't see the
1793              -mtune=native, as it was changed by the driver.  */
1794           || !strcmp (ix86_tune_string, "native"))
1795         {
1796           if (TARGET_64BIT)
1797             ix86_tune_string = "generic64";
1798           else
1799             ix86_tune_string = "generic32";
1800         }
1801       else if (!strncmp (ix86_tune_string, "generic", 7))
1802         error ("bad value (%s) for -mtune= switch", ix86_tune_string);
1803     }
1804   else
1805     {
1806       if (ix86_arch_string)
1807         ix86_tune_string = ix86_arch_string;
1808       if (!ix86_tune_string)
1809         {
1810           ix86_tune_string = cpu_names [TARGET_CPU_DEFAULT];
1811           ix86_tune_defaulted = 1;
1812         }
1813
1814       /* ix86_tune_string is set to ix86_arch_string or defaulted.  We
1815          need to use a sensible tune option.  */
1816       if (!strcmp (ix86_tune_string, "generic")
1817           || !strcmp (ix86_tune_string, "x86-64")
1818           || !strcmp (ix86_tune_string, "i686"))
1819         {
1820           if (TARGET_64BIT)
1821             ix86_tune_string = "generic64";
1822           else
1823             ix86_tune_string = "generic32";
1824         }
1825     }
1826   if (ix86_stringop_string)
1827     {
1828       if (!strcmp (ix86_stringop_string, "rep_byte"))
1829         stringop_alg = rep_prefix_1_byte;
1830       else if (!strcmp (ix86_stringop_string, "libcall"))
1831         stringop_alg = libcall;
1832       else if (!strcmp (ix86_stringop_string, "rep_4byte"))
1833         stringop_alg = rep_prefix_4_byte;
1834       else if (!strcmp (ix86_stringop_string, "rep_8byte"))
1835         stringop_alg = rep_prefix_8_byte;
1836       else if (!strcmp (ix86_stringop_string, "byte_loop"))
1837         stringop_alg = loop_1_byte;
1838       else if (!strcmp (ix86_stringop_string, "loop"))
1839         stringop_alg = loop;
1840       else if (!strcmp (ix86_stringop_string, "unrolled_loop"))
1841         stringop_alg = unrolled_loop;
1842       else
1843         error ("bad value (%s) for -mstringop-strategy= switch", ix86_stringop_string);
1844     }
1845   if (!strcmp (ix86_tune_string, "x86-64"))
1846     warning (OPT_Wdeprecated, "-mtune=x86-64 is deprecated.  Use -mtune=k8 or "
1847              "-mtune=generic instead as appropriate.");
1848
1849   if (!ix86_arch_string)
1850     ix86_arch_string = TARGET_64BIT ? "x86-64" : "i386";
1851   if (!strcmp (ix86_arch_string, "generic"))
1852     error ("generic CPU can be used only for -mtune= switch");
1853   if (!strncmp (ix86_arch_string, "generic", 7))
1854     error ("bad value (%s) for -march= switch", ix86_arch_string);
1855
1856   if (ix86_cmodel_string != 0)
1857     {
1858       if (!strcmp (ix86_cmodel_string, "small"))
1859         ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
1860       else if (!strcmp (ix86_cmodel_string, "medium"))
1861         ix86_cmodel = flag_pic ? CM_MEDIUM_PIC : CM_MEDIUM;
1862       else if (!strcmp (ix86_cmodel_string, "large"))
1863         ix86_cmodel = flag_pic ? CM_LARGE_PIC : CM_LARGE;
1864       else if (flag_pic)
1865         error ("code model %s does not support PIC mode", ix86_cmodel_string);
1866       else if (!strcmp (ix86_cmodel_string, "32"))
1867         ix86_cmodel = CM_32;
1868       else if (!strcmp (ix86_cmodel_string, "kernel") && !flag_pic)
1869         ix86_cmodel = CM_KERNEL;
1870       else
1871         error ("bad value (%s) for -mcmodel= switch", ix86_cmodel_string);
1872     }
1873   else
1874     {
1875       /* For TARGET_64BIT_MS_ABI, force pic on, in order to enable the
1876          use of rip-relative addressing.  This eliminates fixups that
1877          would otherwise be needed if this object is to be placed in a
1878          DLL, and is essentially just as efficient as direct addressing.  */
1879       if (TARGET_64BIT_MS_ABI)
1880         ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
1881       else if (TARGET_64BIT)
1882         ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
1883       else
1884         ix86_cmodel = CM_32;
1885     }
1886   if (ix86_asm_string != 0)
1887     {
1888       if (! TARGET_MACHO
1889           && !strcmp (ix86_asm_string, "intel"))
1890         ix86_asm_dialect = ASM_INTEL;
1891       else if (!strcmp (ix86_asm_string, "att"))
1892         ix86_asm_dialect = ASM_ATT;
1893       else
1894         error ("bad value (%s) for -masm= switch", ix86_asm_string);
1895     }
1896   if ((TARGET_64BIT == 0) != (ix86_cmodel == CM_32))
1897     error ("code model %qs not supported in the %s bit mode",
1898            ix86_cmodel_string, TARGET_64BIT ? "64" : "32");
1899   if ((TARGET_64BIT != 0) != ((target_flags & MASK_64BIT) != 0))
1900     sorry ("%i-bit mode not compiled in",
1901            (target_flags & MASK_64BIT) ? 64 : 32);
1902
1903   for (i = 0; i < pta_size; i++)
1904     if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
1905       {
1906         ix86_arch = processor_alias_table[i].processor;
1907         /* Default cpu tuning to the architecture.  */
1908         ix86_tune = ix86_arch;
1909         if (processor_alias_table[i].flags & PTA_MMX
1910             && !(target_flags_explicit & MASK_MMX))
1911           target_flags |= MASK_MMX;
1912         if (processor_alias_table[i].flags & PTA_3DNOW
1913             && !(target_flags_explicit & MASK_3DNOW))
1914           target_flags |= MASK_3DNOW;
1915         if (processor_alias_table[i].flags & PTA_3DNOW_A
1916             && !(target_flags_explicit & MASK_3DNOW_A))
1917           target_flags |= MASK_3DNOW_A;
1918         if (processor_alias_table[i].flags & PTA_SSE
1919             && !(target_flags_explicit & MASK_SSE))
1920           target_flags |= MASK_SSE;
1921         if (processor_alias_table[i].flags & PTA_SSE2
1922             && !(target_flags_explicit & MASK_SSE2))
1923           target_flags |= MASK_SSE2;
1924         if (processor_alias_table[i].flags & PTA_SSE3
1925             && !(target_flags_explicit & MASK_SSE3))
1926           target_flags |= MASK_SSE3;
1927         if (processor_alias_table[i].flags & PTA_SSSE3
1928             && !(target_flags_explicit & MASK_SSSE3))
1929           target_flags |= MASK_SSSE3;
1930         if (processor_alias_table[i].flags & PTA_PREFETCH_SSE)
1931           x86_prefetch_sse = true;
1932         if (processor_alias_table[i].flags & PTA_CX16)
1933           x86_cmpxchg16b = true;
1934         if (processor_alias_table[i].flags & PTA_POPCNT
1935             && !(target_flags_explicit & MASK_POPCNT))
1936           target_flags |= MASK_POPCNT;
1937         if (processor_alias_table[i].flags & PTA_ABM
1938             && !(target_flags_explicit & MASK_ABM))
1939           target_flags |= MASK_ABM;
1940         if (processor_alias_table[i].flags & PTA_SSE4A
1941             && !(target_flags_explicit & MASK_SSE4A))
1942           target_flags |= MASK_SSE4A;
1943         if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF)))
1944           x86_sahf = true;
1945         if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
1946           error ("CPU you selected does not support x86-64 "
1947                  "instruction set");
1948         break;
1949       }
1950
1951   if (i == pta_size)
1952     error ("bad value (%s) for -march= switch", ix86_arch_string);
1953
1954   ix86_arch_mask = 1u << ix86_arch;
1955   for (i = 0; i < X86_ARCH_LAST; ++i)
1956     ix86_arch_features[i] &= ix86_arch_mask;
1957
1958   for (i = 0; i < pta_size; i++)
1959     if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
1960       {
1961         ix86_tune = processor_alias_table[i].processor;
1962         if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
1963           {
1964             if (ix86_tune_defaulted)
1965               {
1966                 ix86_tune_string = "x86-64";
1967                 for (i = 0; i < pta_size; i++)
1968                   if (! strcmp (ix86_tune_string,
1969                                 processor_alias_table[i].name))
1970                     break;
1971                 ix86_tune = processor_alias_table[i].processor;
1972               }
1973             else
1974               error ("CPU you selected does not support x86-64 "
1975                      "instruction set");
1976           }
1977         /* Intel CPUs have always interpreted SSE prefetch instructions as
1978            NOPs; so, we can enable SSE prefetch instructions even when
1979            -mtune (rather than -march) points us to a processor that has them.
1980            However, the VIA C3 gives a SIGILL, so we only do that for i686 and
1981            higher processors.  */
1982         if (TARGET_CMOVE && (processor_alias_table[i].flags & PTA_PREFETCH_SSE))
1983           x86_prefetch_sse = true;
1984         break;
1985       }
1986   if (i == pta_size)
1987     error ("bad value (%s) for -mtune= switch", ix86_tune_string);
1988
1989   ix86_tune_mask = 1u << ix86_tune;
1990   for (i = 0; i < X86_TUNE_LAST; ++i)
1991     ix86_tune_features[i] &= ix86_tune_mask;
1992
1993   if (optimize_size)
1994     ix86_cost = &size_cost;
1995   else
1996     ix86_cost = processor_target_table[ix86_tune].cost;
1997   target_flags |= processor_target_table[ix86_tune].target_enable;
1998   target_flags &= ~processor_target_table[ix86_tune].target_disable;
1999
2000   /* Arrange to set up i386_stack_locals for all functions.  */
2001   init_machine_status = ix86_init_machine_status;
2002
2003   /* Validate -mregparm= value.  */
2004   if (ix86_regparm_string)
2005     {
2006       if (TARGET_64BIT)
2007         warning (0, "-mregparm is ignored in 64-bit mode");
2008       i = atoi (ix86_regparm_string);
2009       if (i < 0 || i > REGPARM_MAX)
2010         error ("-mregparm=%d is not between 0 and %d", i, REGPARM_MAX);
2011       else
2012         ix86_regparm = i;
2013     }
2014   if (TARGET_64BIT)
2015     ix86_regparm = REGPARM_MAX;
2016
2017   /* If the user has provided any of the -malign-* options,
2018      warn and use that value only if -falign-* is not set.
2019      Remove this code in GCC 3.2 or later.  */
2020   if (ix86_align_loops_string)
2021     {
2022       warning (0, "-malign-loops is obsolete, use -falign-loops");
2023       if (align_loops == 0)
2024         {
2025           i = atoi (ix86_align_loops_string);
2026           if (i < 0 || i > MAX_CODE_ALIGN)
2027             error ("-malign-loops=%d is not between 0 and %d", i, MAX_CODE_ALIGN);
2028           else
2029             align_loops = 1 << i;
2030         }
2031     }
2032
2033   if (ix86_align_jumps_string)
2034     {
2035       warning (0, "-malign-jumps is obsolete, use -falign-jumps");
2036       if (align_jumps == 0)
2037         {
2038           i = atoi (ix86_align_jumps_string);
2039           if (i < 0 || i > MAX_CODE_ALIGN)
2040             error ("-malign-loops=%d is not between 0 and %d", i, MAX_CODE_ALIGN);
2041           else
2042             align_jumps = 1 << i;
2043         }
2044     }
2045
2046   if (ix86_align_funcs_string)
2047     {
2048       warning (0, "-malign-functions is obsolete, use -falign-functions");
2049       if (align_functions == 0)
2050         {
2051           i = atoi (ix86_align_funcs_string);
2052           if (i < 0 || i > MAX_CODE_ALIGN)
2053             error ("-malign-loops=%d is not between 0 and %d", i, MAX_CODE_ALIGN);
2054           else
2055             align_functions = 1 << i;
2056         }
2057     }
2058
2059   /* Default align_* from the processor table.  */
2060   if (align_loops == 0)
2061     {
2062       align_loops = processor_target_table[ix86_tune].align_loop;
2063       align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
2064     }
2065   if (align_jumps == 0)
2066     {
2067       align_jumps = processor_target_table[ix86_tune].align_jump;
2068       align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
2069     }
2070   if (align_functions == 0)
2071     {
2072       align_functions = processor_target_table[ix86_tune].align_func;
2073     }
2074
2075   /* Validate -mbranch-cost= value, or provide default.  */
2076   ix86_branch_cost = ix86_cost->branch_cost;
2077   if (ix86_branch_cost_string)
2078     {
2079       i = atoi (ix86_branch_cost_string);
2080       if (i < 0 || i > 5)
2081         error ("-mbranch-cost=%d is not between 0 and 5", i);
2082       else
2083         ix86_branch_cost = i;
2084     }
2085   if (ix86_section_threshold_string)
2086     {
2087       i = atoi (ix86_section_threshold_string);
2088       if (i < 0)
2089         error ("-mlarge-data-threshold=%d is negative", i);
2090       else
2091         ix86_section_threshold = i;
2092     }
2093
2094   if (ix86_tls_dialect_string)
2095     {
2096       if (strcmp (ix86_tls_dialect_string, "gnu") == 0)
2097         ix86_tls_dialect = TLS_DIALECT_GNU;
2098       else if (strcmp (ix86_tls_dialect_string, "gnu2") == 0)
2099         ix86_tls_dialect = TLS_DIALECT_GNU2;
2100       else if (strcmp (ix86_tls_dialect_string, "sun") == 0)
2101         ix86_tls_dialect = TLS_DIALECT_SUN;
2102       else
2103         error ("bad value (%s) for -mtls-dialect= switch",
2104                ix86_tls_dialect_string);
2105     }
2106
2107   if (ix87_precision_string)
2108     {
2109       i = atoi (ix87_precision_string);
2110       if (i != 32 && i != 64 && i != 80)
2111         error ("pc%d is not valid precision setting (32, 64 or 80)", i);
2112     }
2113
2114   /* Keep nonleaf frame pointers.  */
2115   if (flag_omit_frame_pointer)
2116     target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
2117   else if (TARGET_OMIT_LEAF_FRAME_POINTER)
2118     flag_omit_frame_pointer = 1;
2119
2120   /* If we're doing fast math, we don't care about comparison order
2121      wrt NaNs.  This lets us use a shorter comparison sequence.  */
2122   if (flag_finite_math_only)
2123     target_flags &= ~MASK_IEEE_FP;
2124
2125   /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
2126      since the insns won't need emulation.  */
2127   if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
2128     target_flags &= ~MASK_NO_FANCY_MATH_387;
2129
2130   /* Likewise, if the target doesn't have a 387, or we've specified
2131      software floating point, don't use 387 inline intrinsics.  */
2132   if (!TARGET_80387)
2133     target_flags |= MASK_NO_FANCY_MATH_387;
2134
2135   /* Turn on SSE3 builtins for -mssse3.  */
2136   if (TARGET_SSSE3)
2137     target_flags |= MASK_SSE3;
2138
2139   /* Turn on SSE3 builtins for -msse4a.  */
2140   if (TARGET_SSE4A)
2141     target_flags |= MASK_SSE3;
2142
2143   /* Turn on SSE2 builtins for -msse3.  */
2144   if (TARGET_SSE3)
2145     target_flags |= MASK_SSE2;
2146
2147   /* Turn on SSE builtins for -msse2.  */
2148   if (TARGET_SSE2)
2149     target_flags |= MASK_SSE;
2150
2151   /* Turn on MMX builtins for -msse.  */
2152   if (TARGET_SSE)
2153     {
2154       target_flags |= MASK_MMX & ~target_flags_explicit;
2155       x86_prefetch_sse = true;
2156     }
2157
2158   /* Turn on MMX builtins for 3Dnow.  */
2159   if (TARGET_3DNOW)
2160     target_flags |= MASK_MMX;
2161
2162   /* Turn on POPCNT builtins for -mabm.  */
2163   if (TARGET_ABM)
2164     target_flags |= MASK_POPCNT;
2165
2166   if (TARGET_64BIT)
2167     {
2168       if (TARGET_RTD)
2169         warning (0, "-mrtd is ignored in 64bit mode");
2170
2171       /* Enable by default the SSE and MMX builtins.  Do allow the user to
2172          explicitly disable any of these.  In particular, disabling SSE and
2173          MMX for kernel code is extremely useful.  */
2174       target_flags
2175         |= ((MASK_SSE2 | MASK_SSE | MASK_MMX | TARGET_SUBTARGET64_DEFAULT)
2176             & ~target_flags_explicit);
2177     }
2178   else
2179     {
2180       /* i386 ABI does not specify red zone.  It still makes sense to use it
2181          when programmer takes care to stack from being destroyed.  */
2182       if (!(target_flags_explicit & MASK_NO_RED_ZONE))
2183         target_flags |= MASK_NO_RED_ZONE;
2184     }
2185
2186   /* Validate -mpreferred-stack-boundary= value, or provide default.
2187      The default of 128 bits is for Pentium III's SSE __m128.  We can't
2188      change it because of optimize_size.  Otherwise, we can't mix object
2189      files compiled with -Os and -On.  */
2190   ix86_preferred_stack_boundary = 128;
2191   if (ix86_preferred_stack_boundary_string)
2192     {
2193       i = atoi (ix86_preferred_stack_boundary_string);
2194       if (i < (TARGET_64BIT ? 4 : 2) || i > 12)
2195         error ("-mpreferred-stack-boundary=%d is not between %d and 12", i,
2196                TARGET_64BIT ? 4 : 2);
2197       else
2198         ix86_preferred_stack_boundary = (1 << i) * BITS_PER_UNIT;
2199     }
2200
2201   /* Accept -msseregparm only if at least SSE support is enabled.  */
2202   if (TARGET_SSEREGPARM
2203       && ! TARGET_SSE)
2204     error ("-msseregparm used without SSE enabled");
2205
2206   ix86_fpmath = TARGET_FPMATH_DEFAULT;
2207   if (ix86_fpmath_string != 0)
2208     {
2209       if (! strcmp (ix86_fpmath_string, "387"))
2210         ix86_fpmath = FPMATH_387;
2211       else if (! strcmp (ix86_fpmath_string, "sse"))
2212         {
2213           if (!TARGET_SSE)
2214             {
2215               warning (0, "SSE instruction set disabled, using 387 arithmetics");
2216               ix86_fpmath = FPMATH_387;
2217             }
2218           else
2219             ix86_fpmath = FPMATH_SSE;
2220         }
2221       else if (! strcmp (ix86_fpmath_string, "387,sse")
2222                || ! strcmp (ix86_fpmath_string, "sse,387"))
2223         {
2224           if (!TARGET_SSE)
2225             {
2226               warning (0, "SSE instruction set disabled, using 387 arithmetics");
2227               ix86_fpmath = FPMATH_387;
2228             }
2229           else if (!TARGET_80387)
2230             {
2231               warning (0, "387 instruction set disabled, using SSE arithmetics");
2232               ix86_fpmath = FPMATH_SSE;
2233             }
2234           else
2235             ix86_fpmath = FPMATH_SSE | FPMATH_387;
2236         }
2237       else
2238         error ("bad value (%s) for -mfpmath= switch", ix86_fpmath_string);
2239     }
2240
2241   /* If the i387 is disabled, then do not return values in it. */
2242   if (!TARGET_80387)
2243     target_flags &= ~MASK_FLOAT_RETURNS;
2244
2245   if ((x86_accumulate_outgoing_args & ix86_tune_mask)
2246       && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
2247       && !optimize_size)
2248     target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
2249
2250   /* ??? Unwind info is not correct around the CFG unless either a frame
2251      pointer is present or M_A_O_A is set.  Fixing this requires rewriting
2252      unwind info generation to be aware of the CFG and propagating states
2253      around edges.  */
2254   if ((flag_unwind_tables || flag_asynchronous_unwind_tables
2255        || flag_exceptions || flag_non_call_exceptions)
2256       && flag_omit_frame_pointer
2257       && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
2258     {
2259       if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
2260         warning (0, "unwind tables currently require either a frame pointer "
2261                  "or -maccumulate-outgoing-args for correctness");
2262       target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
2263     }
2264
2265   /* For sane SSE instruction set generation we need fcomi instruction.
2266      It is safe to enable all CMOVE instructions.  */
2267   if (TARGET_SSE)
2268     TARGET_CMOVE = 1;
2269
2270   /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix.  */
2271   {
2272     char *p;
2273     ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
2274     p = strchr (internal_label_prefix, 'X');
2275     internal_label_prefix_len = p - internal_label_prefix;
2276     *p = '\0';
2277   }
2278
2279   /* When scheduling description is not available, disable scheduler pass
2280      so it won't slow down the compilation and make x87 code slower.  */
2281   if (!TARGET_SCHEDULE)
2282     flag_schedule_insns_after_reload = flag_schedule_insns = 0;
2283
2284   if (!PARAM_SET_P (PARAM_SIMULTANEOUS_PREFETCHES))
2285     set_param_value ("simultaneous-prefetches",
2286                      ix86_cost->simultaneous_prefetches);
2287   if (!PARAM_SET_P (PARAM_L1_CACHE_LINE_SIZE))
2288     set_param_value ("l1-cache-line-size", ix86_cost->prefetch_block);
2289 }
2290 \f
2291 /* Return true if this goes in large data/bss.  */
2292
2293 static bool
2294 ix86_in_large_data_p (tree exp)
2295 {
2296   if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
2297     return false;
2298
2299   /* Functions are never large data.  */
2300   if (TREE_CODE (exp) == FUNCTION_DECL)
2301     return false;
2302
2303   if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
2304     {
2305       const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
2306       if (strcmp (section, ".ldata") == 0
2307           || strcmp (section, ".lbss") == 0)
2308         return true;
2309       return false;
2310     }
2311   else
2312     {
2313       HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
2314
2315       /* If this is an incomplete type with size 0, then we can't put it
2316          in data because it might be too big when completed.  */
2317       if (!size || size > ix86_section_threshold)
2318         return true;
2319     }
2320
2321   return false;
2322 }
2323
2324 /* Switch to the appropriate section for output of DECL.
2325    DECL is either a `VAR_DECL' node or a constant of some sort.
2326    RELOC indicates whether forming the initial value of DECL requires
2327    link-time relocations.  */
2328
2329 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
2330         ATTRIBUTE_UNUSED;
2331
2332 static section *
2333 x86_64_elf_select_section (tree decl, int reloc,
2334                            unsigned HOST_WIDE_INT align)
2335 {
2336   if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
2337       && ix86_in_large_data_p (decl))
2338     {
2339       const char *sname = NULL;
2340       unsigned int flags = SECTION_WRITE;
2341       switch (categorize_decl_for_section (decl, reloc))
2342         {
2343         case SECCAT_DATA:
2344           sname = ".ldata";
2345           break;
2346         case SECCAT_DATA_REL:
2347           sname = ".ldata.rel";
2348           break;
2349         case SECCAT_DATA_REL_LOCAL:
2350           sname = ".ldata.rel.local";
2351           break;
2352         case SECCAT_DATA_REL_RO:
2353           sname = ".ldata.rel.ro";
2354           break;
2355         case SECCAT_DATA_REL_RO_LOCAL:
2356           sname = ".ldata.rel.ro.local";
2357           break;
2358         case SECCAT_BSS:
2359           sname = ".lbss";
2360           flags |= SECTION_BSS;
2361           break;
2362         case SECCAT_RODATA:
2363         case SECCAT_RODATA_MERGE_STR:
2364         case SECCAT_RODATA_MERGE_STR_INIT:
2365         case SECCAT_RODATA_MERGE_CONST:
2366           sname = ".lrodata";
2367           flags = 0;
2368           break;
2369         case SECCAT_SRODATA:
2370         case SECCAT_SDATA:
2371         case SECCAT_SBSS:
2372           gcc_unreachable ();
2373         case SECCAT_TEXT:
2374         case SECCAT_TDATA:
2375         case SECCAT_TBSS:
2376           /* We don't split these for medium model.  Place them into
2377              default sections and hope for best.  */
2378           break;
2379         }
2380       if (sname)
2381         {
2382           /* We might get called with string constants, but get_named_section
2383              doesn't like them as they are not DECLs.  Also, we need to set
2384              flags in that case.  */
2385           if (!DECL_P (decl))
2386             return get_section (sname, flags, NULL);
2387           return get_named_section (decl, sname, reloc);
2388         }
2389     }
2390   return default_elf_select_section (decl, reloc, align);
2391 }
2392
2393 /* Build up a unique section name, expressed as a
2394    STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
2395    RELOC indicates whether the initial value of EXP requires
2396    link-time relocations.  */
2397
2398 static void ATTRIBUTE_UNUSED
2399 x86_64_elf_unique_section (tree decl, int reloc)
2400 {
2401   if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
2402       && ix86_in_large_data_p (decl))
2403     {
2404       const char *prefix = NULL;
2405       /* We only need to use .gnu.linkonce if we don't have COMDAT groups.  */
2406       bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
2407
2408       switch (categorize_decl_for_section (decl, reloc))
2409         {
2410         case SECCAT_DATA:
2411         case SECCAT_DATA_REL:
2412         case SECCAT_DATA_REL_LOCAL:
2413         case SECCAT_DATA_REL_RO:
2414         case SECCAT_DATA_REL_RO_LOCAL:
2415           prefix = one_only ? ".gnu.linkonce.ld." : ".ldata.";
2416           break;
2417         case SECCAT_BSS:
2418           prefix = one_only ? ".gnu.linkonce.lb." : ".lbss.";
2419           break;
2420         case SECCAT_RODATA:
2421         case SECCAT_RODATA_MERGE_STR:
2422         case SECCAT_RODATA_MERGE_STR_INIT:
2423         case SECCAT_RODATA_MERGE_CONST:
2424           prefix = one_only ? ".gnu.linkonce.lr." : ".lrodata.";
2425           break;
2426         case SECCAT_SRODATA:
2427         case SECCAT_SDATA:
2428         case SECCAT_SBSS:
2429           gcc_unreachable ();
2430         case SECCAT_TEXT:
2431         case SECCAT_TDATA:
2432         case SECCAT_TBSS:
2433           /* We don't split these for medium model.  Place them into
2434              default sections and hope for best.  */
2435           break;
2436         }
2437       if (prefix)
2438         {
2439           const char *name;
2440           size_t nlen, plen;
2441           char *string;
2442           plen = strlen (prefix);
2443
2444           name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
2445           name = targetm.strip_name_encoding (name);
2446           nlen = strlen (name);
2447
2448           string = alloca (nlen + plen + 1);
2449           memcpy (string, prefix, plen);
2450           memcpy (string + plen, name, nlen + 1);
2451
2452           DECL_SECTION_NAME (decl) = build_string (nlen + plen, string);
2453           return;
2454         }
2455     }
2456   default_unique_section (decl, reloc);
2457 }
2458
#ifdef COMMON_ASM_OP
/* Write to FILE the assembler code declaring NAME as an uninitialized
   external linkage data object of SIZE bytes.  ALIGN is divided by
   BITS_PER_UNIT before being printed as the last operand.

   For medium model x86-64, objects larger than ix86_section_threshold
   must be announced with the .largecomm opcode; every other object
   uses COMMON_ASM_OP.  */
void
x86_elf_aligned_common (FILE *file,
                        const char *name, unsigned HOST_WIDE_INT size,
                        int align)
{
  const char *directive = COMMON_ASM_OP;

  if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
      && size > (unsigned int)ix86_section_threshold)
    directive = ".largecomm\t";

  fputs (directive, file);
  assemble_name (file, name);
  fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
           size, align / BITS_PER_UNIT);
}
#endif
2480
2481 /* Utility function for targets to use in implementing
2482    ASM_OUTPUT_ALIGNED_BSS.  */
2483
2484 void
2485 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
2486                         const char *name, unsigned HOST_WIDE_INT size,
2487                         int align)
2488 {
2489   if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
2490       && size > (unsigned int)ix86_section_threshold)
2491     switch_to_section (get_named_section (decl, ".lbss", 0));
2492   else
2493     switch_to_section (bss_section);
2494   ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
2495 #ifdef ASM_DECLARE_OBJECT_NAME
2496   last_assemble_variable_decl = decl;
2497   ASM_DECLARE_OBJECT_NAME (file, name, decl);
2498 #else
2499   /* Standard thing is just output label for the object.  */
2500   ASM_OUTPUT_LABEL (file, name);
2501 #endif /* ASM_DECLARE_OBJECT_NAME */
2502   ASM_OUTPUT_SKIP (file, size ? size : 1);
2503 }
2504 \f
2505 /* Set the i386-specific option defaults; LEVEL is the optimization level.  */
2506 void
2507 optimization_options (int level, int size ATTRIBUTE_UNUSED)
2508 {
2509 #ifdef INSN_SCHEDULING
2510   /* From -O2 upward, -fschedule-insns is off by default: it tends to
2511      make the shortage of registers even worse.  */
2512   if (level >= 2)
2513     flag_schedule_insns = 0;
2514 #endif
2515
2516   /* The Darwin libraries never set errno; don't call them only for that.  */
2517   if (TARGET_MACHO)
2518     flag_errno_math = 0;
2519
2520   /* The defaults of the following switches depend on TARGET_64BIT,
2521      which is not known at this moment.  Mark them with 2 so that the
2522      user can still override them; if no command-line option sets
2523      them, override_options chooses the defaults.  */
2524   flag_asynchronous_unwind_tables = 2;
2525   flag_pcc_struct_return = 2;
2526   if (optimize > 0)
2527     flag_omit_frame_pointer = 2;
2528 #ifdef SUBTARGET_OPTIMIZATION_OPTIONS
2529   SUBTARGET_OPTIMIZATION_OPTIONS;
2530 #endif
2531 }
2532 \f
2533 /* Decide whether a call may be made as a sibling call.  DECL is the
2534    declaration of the function being called (NULL for an indirect call)
2535    and EXP is the CALL_EXPR representing the call.
2536
2537    The call is rejected when it would need the PLT in 32-bit PIC code,
2538    when caller and callee return their values in different places, when
2539    an indirect or dllimport'd call may leave no register free for the
2540    target address, or when the caller forcibly aligned its stack.  */
2541
2542 static bool
2543 ix86_function_ok_for_sibcall (tree decl, tree exp)
2544 {
2545   tree callee;
2546   tree result_type;
2547   rtx callee_val;
2548   rtx caller_val;
2549
2550   /* When generating 32-bit position-independent code the PLT requires
2551      %ebx to be live, so neither an indirect call nor a direct call to a
2552      function that does not bind locally can be sibcall optimized.  */
2553   if (!TARGET_64BIT && flag_pic && !(decl && targetm.binds_local_p (decl)))
2554     return false;
2555
2556   /* Identify the callee by its declaration when there is one, and by the
2557      function type behind the called pointer otherwise.  */
2558   callee = decl;
2559   if (!callee)
2560     {
2561       callee = TREE_TYPE (CALL_EXPR_FN (exp));
2562       if (POINTER_TYPE_P (callee))
2563         callee = TREE_TYPE (callee);
2564     }
2565
2566   /* The return value locations have to match.  For instance, when
2567      floats are returned on the 80387 register stack, a function that
2568      does not return a float cannot sibcall one that does, and vice
2569      versa, because the necessary stack adjustment would not be
2570      executed.  Differences in the return value ABI show up here too.
2571      A caller returning void accepts any callee value, provided that
2572      neither location is a stack register.  */
2573   callee_val = ix86_function_value (TREE_TYPE (exp), callee, false);
2574   result_type = TREE_TYPE (DECL_RESULT (cfun->decl));
2575   caller_val = ix86_function_value (result_type, cfun->decl, false);
2576
2577   if ((STACK_REG_P (callee_val) || STACK_REG_P (caller_val)
2578        || !VOID_TYPE_P (result_type))
2579       && !rtx_equal_p (callee_val, caller_val))
2580     return false;
2581
2582   /* An indirect call needs a call-clobbered register to hold the address
2583      of the target function, so such registers must not all be taken up
2584      by parameters.  */
2585   if (!decl && !TARGET_64BIT)
2586     {
2587       tree fntype;
2588
2589       /* CALL_EXPR_FN yields the pointer expression; peel off the pointer
2590          type to reach the function type.  */
2591       fntype = TREE_TYPE (TREE_TYPE (CALL_EXPR_FN (exp)));
2592
2593       /* ??? This should count the registers actually used rather than
2594          the number that might be used.  Fix later.  */
2595       if (ix86_function_regparm (fntype, NULL) >= 3)
2596         return false;
2597     }
2598
2599   /* Dllimport'd functions are called indirectly as well.  */
2600   if (decl
2601       && TARGET_DLLIMPORT_DECL_ATTRIBUTES
2602       && DECL_DLLIMPORT_P (decl)
2603       && ix86_function_regparm (TREE_TYPE (decl), NULL) >= 3)
2604     return false;
2605
2606   /* Having forcibly aligned the stack, a sibcall would unalign it again,
2607      which may break the called function.  */
2608   if (cfun->machine->force_align_arg_pointer)
2609     return false;
2610
2611   /* Anything left is fine; that includes certain indirect calls.  */
2612   return true;
2613 }
2614
2615 /* Handle "cdecl", "stdcall", "fastcall", "regparm" and "sseregparm"
2616    calling convention attributes;
2617    arguments as in struct attribute_spec.handler.
2618
2619    NAME is the attribute being applied to *NODE and ARGS is its argument
2620    list, which only "regparm" reads.  Incompatible combinations are
2621    reported with error ().  When the attribute cannot be used at all, a
2622    warning is given and *NO_ADD_ATTRS is set.  The return value is
2623    always NULL_TREE.  */
2624
2625 static tree
2626 ix86_handle_cconv_attribute (tree *node, tree name,
2627                                    tree args,
2628                                    int flags ATTRIBUTE_UNUSED,
2629                                    bool *no_add_attrs)
2630 {
2631   /* Only function and method types, fields and type declarations are
2632      accepted here.  */
2633   if (TREE_CODE (*node) != FUNCTION_TYPE
2634       && TREE_CODE (*node) != METHOD_TYPE
2635       && TREE_CODE (*node) != FIELD_DECL
2636       && TREE_CODE (*node) != TYPE_DECL)
2637     {
2638       warning (OPT_Wattributes, "%qs attribute only applies to functions",
2639                IDENTIFIER_POINTER (name));
2640       *no_add_attrs = true;
2641       return NULL_TREE;
2642     }
2643
2644   /* Can combine regparm with all attributes but fastcall.  */
2645   if (is_attribute_p ("regparm", name))
2646     {
2647       tree cst;
2648
2649       if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
2650         error ("fastcall and regparm attributes are not compatible");
2651
2652       cst = TREE_VALUE (args);
2653       if (TREE_CODE (cst) != INTEGER_CST)
2654         {
2655           warning (OPT_Wattributes,
2656                    "%qs attribute requires an integer constant argument",
2657                    IDENTIFIER_POINTER (name));
2658           *no_add_attrs = true;
2659         }
2660       else
2661         {
2662           if (compare_tree_int (cst, REGPARM_MAX) > 0)
2663             {
2664               warning (OPT_Wattributes,
2665                        "argument to %qs attribute larger than %d",
2666                        IDENTIFIER_POINTER (name), REGPARM_MAX);
2667               *no_add_attrs = true;
2668             }
2669
2670           /* Functions carrying the force-align attribute are limited to
2671              REGPARM_MAX-1 register parameters.  compare_tree_int yields
2672              -1, 0 or 1, so test for "greater than" explicitly: a plain
2673              truth test would also reject smaller, valid counts.  The
2674              comparison is only done for an INTEGER_CST, as it is not
2675              meaningful for any other tree.  */
2676           if (!TARGET_64BIT
2677               && lookup_attribute (ix86_force_align_arg_pointer_string,
2678                                    TYPE_ATTRIBUTES (*node))
2679               && compare_tree_int (cst, REGPARM_MAX-1) > 0)
2680             error ("%s functions limited to %d register parameters",
2681                    ix86_force_align_arg_pointer_string, REGPARM_MAX-1);
2682         }
2683
2684       return NULL_TREE;
2685     }
2686
2687   /* In 64-bit mode the remaining attributes are ignored.  */
2688   if (TARGET_64BIT)
2689     {
2690       /* Do not warn when emulating the MS ABI.  */
2691       if (!TARGET_64BIT_MS_ABI)
2692         warning (OPT_Wattributes, "%qs attribute ignored",
2693                  IDENTIFIER_POINTER (name));
2694       *no_add_attrs = true;
2695       return NULL_TREE;
2696     }
2697
2698   /* Can combine fastcall with sseregparm only.  */
2699   if (is_attribute_p ("fastcall", name))
2700     {
2701       if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
2702         error ("fastcall and cdecl attributes are not compatible");
2703       if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
2704         error ("fastcall and stdcall attributes are not compatible");
2705       if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
2706         error ("fastcall and regparm attributes are not compatible");
2707     }
2708
2709   /* Can combine stdcall with regparm and sseregparm.  */
2710   else if (is_attribute_p ("stdcall", name))
2711     {
2712       if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
2713         error ("stdcall and cdecl attributes are not compatible");
2714       if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
2715         error ("stdcall and fastcall attributes are not compatible");
2716     }
2717
2718   /* Can combine cdecl with regparm and sseregparm.  */
2719   else if (is_attribute_p ("cdecl", name))
2720     {
2721       if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
2722         error ("stdcall and cdecl attributes are not compatible");
2723       if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
2724         error ("fastcall and cdecl attributes are not compatible");
2725     }
2726
2727   /* Can combine sseregparm with all attributes.  */
2728
2729   return NULL_TREE;
2730 }
2731
2732 /* Return 0 if the calling convention attributes of TYPE1 and TYPE2 are
2733    incompatible and 1 if they are compatible.  (2, "nearly compatible",
2734    would cause a warning, but is never returned here.)  */
2735
2736 static int
2737 ix86_comp_type_attributes (tree type1, tree type2)
2738 {
2739   const char *const rtdstr = TARGET_RTD ? "cdecl" : "stdcall";
2740   tree attrs1, attrs2;
2741
2742   if (TREE_CODE (type1) != FUNCTION_TYPE)
2743     return 1;
2744
2745   attrs1 = TYPE_ATTRIBUTES (type1);
2746   attrs2 = TYPE_ATTRIBUTES (type2);
2747
2748   /* fastcall and the regparm count must agree between the two types.  */
2749   if ((lookup_attribute ("fastcall", attrs1) != NULL_TREE)
2750       != (lookup_attribute ("fastcall", attrs2) != NULL_TREE))
2751     return 0;
2752   if (ix86_function_regparm (type1, NULL)
2753       != ix86_function_regparm (type2, NULL))
2754     return 0;
2755
2756   /* Likewise sseregparm, then RTDSTR (the non-default one of cdecl and
2757      stdcall), must be on both or on neither.  */
2758   if ((lookup_attribute ("sseregparm", attrs1) != NULL_TREE)
2759       != (lookup_attribute ("sseregparm", attrs2) != NULL_TREE))
2760     return 0;
2761   return ((lookup_attribute (rtdstr, attrs1) != NULL_TREE)
2762           == (lookup_attribute (rtdstr, attrs2) != NULL_TREE));
2763 }
2764 \f
2765 /* Return the number of register parameters for a function with the
2766    indicated TYPE and DECL.  DECL may be NULL when the function is called
2767    indirectly or when considering a libcall.  */
2768
2769 static int
2770 ix86_function_regparm (tree type, tree decl)
2771 {
2772   int regparm = ix86_regparm;
2773   int local_regparm, globals, regno;
2774   struct cgraph_local_info *info;
2775   struct function *f;
2776   tree attr;
2777
2778   if (TARGET_64BIT)
2779     return regparm;
2780
2781   attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
2782   if (attr)
2783     return TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
2784
2785   if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type)))
2786     return 2;
2787
2788   /* Use register calling convention for local functions when possible;
2789      everything else keeps the default.  */
2790   if (!decl || !flag_unit_at_a_time || profile_flag)
2791     return regparm;
2792   info = cgraph_local_info (decl);
2793   if (!info || !info->local)
2794     return regparm;
2795
2796   /* Stop at the first regparm register that is taken by a global
2797      register variable.  */
2798   local_regparm = 0;
2799   while (local_regparm < 3 && !global_regs[local_regparm])
2800     local_regparm++;
2801
2802   if (local_regparm == 3)
2803     {
2804       /* Nested functions cannot use regparm(3), as these use the static
2805          chain pointer in the third argument.  */
2806       if (decl_function_context (decl) && !DECL_NO_STATIC_CHAIN (decl))
2807         local_regparm = 2;
2808       else
2809         {
2810           /* If the function realigns its stack pointer, the prologue
2811              will clobber %ecx.  Once code has been generated for the
2812              callee its DECL_STRUCT_FUNCTION is gone, so fall back to
2813              scanning the attributes for the self-realigning property.  */
2814           f = DECL_STRUCT_FUNCTION (decl);
2815           if (f
2816               ? !!f->machine->force_align_arg_pointer
2817               : !!lookup_attribute (ix86_force_align_arg_pointer_string,
2818                                     TYPE_ATTRIBUTES (TREE_TYPE (decl))))
2819             local_regparm = 2;
2820         }
2821     }
2822
2823   /* Each global register variable increases register pressure, so the
2824      more of them there are, the less the regparm optimization is used,
2825      unless requested by the user explicitly.  */
2826   globals = 0;
2827   for (regno = 0; regno < 6; regno++)
2828     if (global_regs[regno])
2829       globals++;
2830   local_regparm = local_regparm > globals ? local_regparm - globals : 0;
2831
2832   return local_regparm > regparm ? local_regparm : regparm;
2833 }
2834
2835 /* Tell whether SFmode and DFmode arguments of a function with the
2836    indicated TYPE and DECL can be passed in SSE registers (up to
2837    SSE_REGPARM_MAX of them): return 1 for SFmode only, 2 for SFmode and
2838    DFmode, and 0 for neither.  DECL may be NULL when the function is
2839    called indirectly or when considering a libcall.  */
2840
2841 static int
2842 ix86_function_sseregparm (tree type, tree decl)
2843 {
2844   struct cgraph_local_info *info;
2845
2846   gcc_assert (!TARGET_64BIT);
2847
2848   /* The sseregparm attribute (or TARGET_SSEREGPARM) asks for SFmode and
2849      DFmode arguments in SSE registers, which takes SSE support.  */
2850   if (TARGET_SSEREGPARM
2851       || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
2852     {
2853       if (TARGET_SSE)
2854         return 2;
2855
2856       if (decl)
2857         error ("Calling %qD with attribute sseregparm without "
2858                "SSE/SSE2 enabled", decl);
2859       else
2860         error ("Calling %qT with attribute sseregparm without "
2861                "SSE/SSE2 enabled", type);
2862       return 0;
2863     }
2864
2865   /* Functions local to the unit get up to SSE_REGPARM_MAX SFmode (and,
2866      with SSE2, DFmode) arguments in SSE registers.  */
2867   if (!decl || !TARGET_SSE_MATH || !flag_unit_at_a_time || profile_flag)
2868     return 0;
2869
2870   info = cgraph_local_info (decl);
2871   if (!info || !info->local)
2872     return 0;
2873   return TARGET_SSE2 ? 2 : 1;
2874 }
2875
2876 /* Return true if EAX is live at function start.  ix86_expand_prologue uses
2877    this to see if special help is needed before allocate_stack_worker.  */
2878
2879 static bool
2880 ix86_eax_live_at_start_p (void)
2881 {
2882   const int eax = 0;
2883
2884   /* Cheat: instead of working forward from ix86_function_regparm and the
2885      function type to whether an argument really sits in eax, consult the
2886      cfg liveness info, which is still close enough to correct here.  It
2887      gives false positives for broken functions reading uninitialized
2888      data that happens to be allocated in eax, but who cares?  */
2889   return REGNO_REG_SET_P (ENTRY_BLOCK_PTR->il.rtl->global_live_at_end, eax);
2890 }
2891
2892 /* Return true if TYPE has a variable argument list, that is, if its
2893    list of argument types is non-empty and does not end with
2894    void_type_node.  */
2895
2896 static bool
2897 type_has_variadic_args_p (tree type)
2898 {
2899   tree last = TYPE_ARG_TYPES (type);
2900
2901   if (!last)
2902     return false;
2903   while (TREE_CHAIN (last))
2904     last = TREE_CHAIN (last);
2905   return TREE_VALUE (last) != void_type_node;
2906 }
2907
2908 /* Return the number of bytes of arguments that the called function
2909    itself pops off the stack when it returns.
2910    FUNDECL is the declaration node of the function (as a tree); it may
2911    also be NULL or an IDENTIFIER_NODE (presumably for a library call).
2912    FUNTYPE is the data type of the function (as a tree).
2913    SIZE is the number of bytes of arguments passed on the stack.
2914
2915    On the 80386, the RTD insn may be used to pop them if the number
2916    of args is fixed, but if the number is variable then the caller
2917    must pop them all.  RTD is not used for library calls because the
2918    library is compiled with the Unix compiler.  Use of RTD is a
2919    selectable option, since it is incompatible with standard Unix
2920    calling sequences.  If the option is not selected, the caller must
2921    always pop the args.
2922
2923    The attribute stdcall is equivalent to RTD on a per module basis.  */
2924
2925 int
2926 ix86_return_pops_args (tree fundecl, tree funtype, int size)
2927 {
2928   tree attrs;
2929
2930   /* None of the 64-bit ABIs pop arguments.  */
2931   if (TARGET_64BIT)
2932     return 0;
2933
2934   attrs = TYPE_ATTRIBUTES (funtype);
2935
2936   /* Cdecl functions override -mrtd, and never pop the stack.  */
2937   if (!lookup_attribute ("cdecl", attrs))
2938     {
2939       /* Stdcall and fastcall functions pop like -mrtd functions do,
2940          though never when they take variable arguments.  */
2941       bool rtd = (lookup_attribute ("stdcall", attrs)
2942                   || lookup_attribute ("fastcall", attrs)
2943                   || (TARGET_RTD
2944                       && (!fundecl
2945                           || TREE_CODE (fundecl) != IDENTIFIER_NODE)));
2946
2947       if (rtd && !type_has_variadic_args_p (funtype))
2948         return size;
2949     }
2950
2951   /* Lose any fake structure return argument if it is passed on the
2952      stack, i.e. when no argument goes in a register.  */
2953   if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
2954       && !KEEP_AGGREGATE_RETURN_POINTER
2955       && ix86_function_regparm (funtype, fundecl) == 0)
2956     return GET_MODE_SIZE (Pmode);
2957
2958   return 0;
2959 }
2960 \f
2961 /* Argument support functions.  */
2962
2963 /* Return true when register REGNO may be used to pass function parameters.  */
2964 bool
2965 ix86_function_arg_regno_p (int regno)
2966 {
2967   const int *parm_regs;
2968   int i;
2969
2970   if (!TARGET_64BIT)
2971     {
2972       /* The first REGPARM_MAX hard registers always qualify.  */
2973       if (regno < REGPARM_MAX)
2974         return true;
2975       /* Mach-O accepts any SSE register that is not fixed.  */
2976       if (TARGET_MACHO)
2977         return TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno];
2978
2979       /* Elsewhere only the leading MMX_REGPARM_MAX MMX registers and
2980          SSE_REGPARM_MAX SSE registers qualify.  */
2981       if (TARGET_MMX && MMX_REGNO_P (regno)
2982           && regno < FIRST_MMX_REG + MMX_REGPARM_MAX)
2983         return true;
2984       return (TARGET_SSE && SSE_REGNO_P (regno)
2985               && regno < FIRST_SSE_REG + SSE_REGPARM_MAX);
2986     }
2987
2988   /* In 64-bit mode Mach-O again accepts every SSE register, the other
2989      targets only the first SSE_REGPARM_MAX of them.  */
2990   if (TARGET_SSE && SSE_REGNO_P (regno)
2991       && (TARGET_MACHO || regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
2992     return true;
2993
2994   if (TARGET_64BIT_MS_ABI)
2995     parm_regs = x86_64_ms_abi_int_parameter_registers;
2996   else
2997     {
2998       /* RAX is used as hidden argument to va_arg functions.  */
2999       if (regno == 0)
3000         return true;
3001       parm_regs = x86_64_int_parameter_registers;
3002     }
3003   for (i = 0; i < REGPARM_MAX; i++)
3004     if (regno == parm_regs[i])
3005       return true;
3006   return false;
3007 }
3008
3009 /* Return true if we do not know how to pass TYPE solely in registers.  */
3010
3011 static bool
3012 ix86_must_pass_in_stack (enum machine_mode mode, tree type)
3013 {
3014   if (must_pass_in_stack_var_size_or_pad (mode, type))
3015     return true;
3016
3017   /* For 32-bit, TImode aggregates go on the stack.  But layout_type may
3018      use TImode for currently unsupported vector types; exempt those.  */
3019   if (TARGET_64BIT || mode != TImode || !type)
3020     return false;
3021   return TREE_CODE (type) != VECTOR_TYPE;
3022 }
3023
3024 /* Initialize CUM, a variable of type CUMULATIVE_ARGS, for a call to a
3025    function whose data type is FNTYPE and whose declaration, if any, is
3026    FNDECL.  For a library call, FNTYPE is 0 and LIBNAME is the
3027    SYMBOL_REF of the library name.
3028
3029    When there is no FNTYPE, the call is taken to possibly have variable
3030    arguments unless LIBNAME is set.  */
3031
3032 void
3033 init_cumulative_args (CUMULATIVE_ARGS *cum,  /* Argument info to initialize */
3034                       tree fntype,      /* function type, or 0 */
3035                       rtx libname,      /* SYMBOL_REF of library name or 0 */
3036                       tree fndecl)      /* function declaration, or 0 */
3037 {
3038   memset (cum, 0, sizeof (*cum));
3039
3040   cum->maybe_vaarg = (fntype ? type_has_variadic_args_p (fntype) : !libname);
3041
3042   /* If there are variable arguments, then nothing is passed in registers
3043      in 32-bit mode: all register counts and both warning flags keep
3044      the zero value they were just cleared to.  */
3045   if (!TARGET_64BIT && cum->maybe_vaarg)
3046     return;
3047
3048   /* Set up the number of registers to use for passing arguments.  */
3049   cum->nregs = ix86_regparm;
3050   if (TARGET_SSE)
3051     cum->sse_nregs = SSE_REGPARM_MAX;
3052   if (TARGET_MMX)
3053     cum->mmx_nregs = MMX_REGPARM_MAX;
3054   cum->warn_sse = true;
3055   cum->warn_mmx = true;
3056
3057   /* Everything below applies to 32-bit mode only.  */
3058   if (TARGET_64BIT)
3059     return;
3060
3061   /* Use ecx and edx registers if function has fastcall attribute,
3062      else look for regparm information.  */
3063   if (fntype)
3064     {
3065       if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)))
3066         {
3067           cum->nregs = 2;
3068           cum->fastcall = 1;
3069         }
3070       else
3071         cum->nregs = ix86_function_regparm (fntype, fndecl);
3072     }
3073
3074   /* Set up the number of SSE registers used for passing SFmode
3075      and DFmode arguments.  Warn for mismatching ABI.  */
3076   cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl);
3077 }
3078
3079 /* Return the "natural" mode for TYPE.  Usually that is simply TYPE_MODE,
3080    but an 8 or 16 byte vector type whose TYPE_MODE is not a vector mode
3081    gets the vector mode matching its element mode and element count.
3082
3083    With only some of the vector ISA extensions enabled there are modes
3084    for which vector_mode_supported_p is false, and the generic vector
3085    support in gcc then implements the type with some non-vector mode.
3086    Computing the natural mode lets us select the proper ABI location
3087    for the operand regardless of what the middle-end decides to do with
3088    these vector types.  */
3089
3090 static enum machine_mode
3091 type_natural_mode (tree type)
3092 {
3093   enum machine_mode mode = TYPE_MODE (type);
3094   enum machine_mode innermode;
3095   HOST_WIDE_INT size;
3096
3097   if (TREE_CODE (type) != VECTOR_TYPE || VECTOR_MODE_P (mode))
3098     return mode;
3099
3100   /* ??? Generic code allows us to create width 1 vectors.  Ignore.  */
3101   size = int_size_in_bytes (type);
3102   if ((size != 8 && size != 16) || TYPE_VECTOR_SUBPARTS (type) <= 1)
3103     return mode;
3104
3105   innermode = TYPE_MODE (TREE_TYPE (type));
3106   mode = (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE
3107           ? MIN_MODE_VECTOR_FLOAT : MIN_MODE_VECTOR_INT);
3108
3109   /* Get the mode which has this inner mode and number of units.  */
3110   while (mode != VOIDmode)
3111     {
3112       if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
3113           && GET_MODE_INNER (mode) == innermode)
3114         return mode;
3115       mode = GET_MODE_WIDER_MODE (mode);
3116     }
3117
3118   gcc_unreachable ();
3119 }
3120
3121 /* Return the rtx for passing a value in register REGNO.  MODE is the
3122    value's "natural" mode, which may disagree with ORIG_MODE, the mode
3123    that the type system has chosen.  If ORIG_MODE is not BLKmode, a plain
3124    REG in ORIG_MODE is returned; otherwise the result is a PARALLEL
3125    holding a MODE register.  */
3126
3127 static rtx
3128 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
3129                      unsigned int regno)
3130 {
3131   rtx reg, elt;
3132
3133   /* Any mode other than BLKmode can be used for the register directly.  */
3134   if (orig_mode != BLKmode)
3135     return gen_rtx_REG (orig_mode, regno);
3136
3137   /* Otherwise wrap a MODE register, paired with const0_rtx, in a
3138      single-element PARALLEL.  */
3139   reg = gen_rtx_REG (mode, regno);
3140   elt = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
3141   return gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, elt));
3142 }
3143
3144 /* x86-64 register passing implementation.  See the x86-64 ABI for details.
3145    The goal of this code is to classify each 8 bytes of an incoming argument
3146    by register class and to assign registers accordingly.  */
3147
3148 /* Return the union class of CLASS1 and CLASS2; the first of the rules
3149    below that applies decides.  See the x86-64 PS ABI for details.  */
3150
3151 static enum x86_64_reg_class
3152 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
3153 {
3154   /* Rule #1: Equal classes merge to themselves.  */
3155   if (class1 == class2)
3156     return class1;
3157
3158   /* Rule #2: NO_CLASS gives way to the other class.  */
3159   if (class1 == X86_64_NO_CLASS)
3160     return class2;
3161   if (class2 == X86_64_NO_CLASS)
3162     return class1;
3163
3164   /* Rule #3: MEMORY on either side wins.  */
3165   if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
3166     return X86_64_MEMORY_CLASS;
3167
3168   /* Rule #4: INTEGER on either side wins.  The result is the partial
3169      class INTEGERSI only for INTEGERSI merged with SSESF; as the two
3170      classes differ here, one INTEGERSI and one SSESF is exactly that.  */
3171   if (class1 == X86_64_INTEGERSI_CLASS || class2 == X86_64_INTEGERSI_CLASS)
3172     {
3173       if (class1 == X86_64_SSESF_CLASS || class2 == X86_64_SSESF_CLASS)
3174         return X86_64_INTEGERSI_CLASS;
3175       return X86_64_INTEGER_CLASS;
3176     }
3177   if (class1 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGER_CLASS)
3178     return X86_64_INTEGER_CLASS;
3179
3180   /* Rule #5: X87, X87UP or COMPLEX_X87 on either side means MEMORY.  */
3181   if (class1 == X86_64_X87_CLASS || class2 == X86_64_X87_CLASS
3182       || class1 == X86_64_X87UP_CLASS || class2 == X86_64_X87UP_CLASS
3183       || class1 == X86_64_COMPLEX_X87_CLASS
3184       || class2 == X86_64_COMPLEX_X87_CLASS)
3185     return X86_64_MEMORY_CLASS;
3186
3187   /* Rule #6: Otherwise class SSE is used.  */
3188   return X86_64_SSE_CLASS;
3189 }
3190
3191 /* Classify the argument of type TYPE and mode MODE.
3192    CLASSES will be filled by the register class used to pass each word
3193    of the operand.  The number of words is returned.  In case the parameter
3194    should be passed in memory, 0 is returned. As a special case for zero
3195    sized containers, classes[0] will be NO_CLASS and 1 is returned.
3196
3197    BIT_OFFSET is used internally for handling records and specifies offset
3198    of the offset in bits modulo 256 to avoid overflow cases.
3199
3200    See the x86-64 PS ABI for details.
3201 */
3202
3203 static int
3204 classify_argument (enum machine_mode mode, tree type,
3205                    enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
3206 {
3207   HOST_WIDE_INT bytes =
3208     (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
3209   int words = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
3210
3211   /* Variable sized entities are always passed/returned in memory.  */
3212   if (bytes < 0)
3213     return 0;
3214
3215   if (mode != VOIDmode
3216       && targetm.calls.must_pass_in_stack (mode, type))
3217     return 0;
3218
3219   if (type && AGGREGATE_TYPE_P (type))
3220     {
3221       int i;
3222       tree field;
3223       enum x86_64_reg_class subclasses[MAX_CLASSES];
3224
3225       /* On x86-64 we pass structures larger than 16 bytes on the stack.  */
3226       if (bytes > 16)
3227         return 0;
3228
3229       for (i = 0; i < words; i++)
3230         classes[i] = X86_64_NO_CLASS;
3231
3232       /* Zero sized arrays or structures are NO_CLASS.  We return 0 to
3233          signalize memory class, so handle it as special case.  */
3234       if (!words)
3235         {
3236           classes[0] = X86_64_NO_CLASS;
3237           return 1;
3238         }
3239
3240       /* Classify each field of record and merge classes.  */
3241       switch (TREE_CODE (type))
3242         {
3243         case RECORD_TYPE:
3244           /* And now merge the fields of structure.  */
3245           for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
3246             {
3247               if (TREE_CODE (field) == FIELD_DECL)
3248                 {
3249                   int num;
3250
3251                   if (TREE_TYPE (field) == error_mark_node)
3252                     continue;
3253
3254                   /* Bitfields are always classified as integer.  Handle them
3255                      early, since later code would consider them to be
3256                      misaligned integers.  */
3257                   if (DECL_BIT_FIELD (field))
3258                     {
3259                       for (i = (int_bit_position (field) + (bit_offset % 64)) / 8 / 8;
3260                            i < ((int_bit_position (field) + (bit_offset % 64))
3261                                 + tree_low_cst (DECL_SIZE (field), 0)
3262                                 + 63) / 8 / 8; i++)
3263                         classes[i] =
3264                           merge_classes (X86_64_INTEGER_CLASS,
3265                                          classes[i]);
3266                     }
3267                   else
3268                     {
3269                       num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
3270                                                TREE_TYPE (field), subclasses,
3271                                                (int_bit_position (field)
3272                                                 + bit_offset) % 256);
3273                       if (!num)
3274                         return 0;
3275                       for (i = 0; i < num; i++)
3276                         {
3277                           int pos =
3278                             (int_bit_position (field) + (bit_offset % 64)) / 8 / 8;
3279                           classes[i + pos] =
3280                             merge_classes (subclasses[i], classes[i + pos]);
3281                         }
3282                     }
3283                 }
3284             }
3285           break;
3286
3287         case ARRAY_TYPE:
3288           /* Arrays are handled as small records.  */
3289           {
3290             int num;
3291             num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
3292                                      TREE_TYPE (type), subclasses, bit_offset);
3293             if (!num)
3294               return 0;
3295
3296             /* The partial classes are now full classes.  */
3297             if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
3298               subclasses[0] = X86_64_SSE_CLASS;
3299             if (subclasses[0] == X86_64_INTEGERSI_CLASS && bytes != 4)
3300               subclasses[0] = X86_64_INTEGER_CLASS;
3301
3302             for (i = 0; i < words; i++)
3303               classes[i] = subclasses[i % num];
3304
3305             break;
3306           }
3307         case UNION_TYPE:
3308         case QUAL_UNION_TYPE:
3309           /* Unions are similar to RECORD_TYPE but offset is always 0.
3310              */
3311           for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
3312             {
3313               if (TREE_CODE (field) == FIELD_DECL)
3314                 {
3315                   int num;
3316
3317                   if (TREE_TYPE (field) == error_mark_node)
3318                     continue;
3319
3320                   num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
3321                                            TREE_TYPE (field), subclasses,
3322                                            bit_offset);
3323                   if (!num)
3324                     return 0;
3325                   for (i = 0; i < num; i++)
3326                     classes[i] = merge_classes (subclasses[i], classes[i]);
3327                 }
3328             }
3329           break;
3330
3331         default:
3332           gcc_unreachable ();
3333         }
3334
3335       /* Final merger cleanup.  */
3336       for (i = 0; i < words; i++)
3337         {
3338           /* If one class is MEMORY, everything should be passed in
3339              memory.  */
3340           if (classes[i] == X86_64_MEMORY_CLASS)
3341             return 0;
3342
3343           /* The X86_64_SSEUP_CLASS should be always preceded by
3344              X86_64_SSE_CLASS.  */
3345           if (classes[i] == X86_64_SSEUP_CLASS
3346               && (i == 0 || classes[i - 1] != X86_64_SSE_CLASS))
3347             classes[i] = X86_64_SSE_CLASS;
3348
3349           /*  X86_64_X87UP_CLASS should be preceded by X86_64_X87_CLASS.  */
3350           if (classes[i] == X86_64_X87UP_CLASS
3351               && (i == 0 || classes[i - 1] != X86_64_X87_CLASS))
3352             classes[i] = X86_64_SSE_CLASS;
3353         }
3354       return words;
3355     }
3356
3357   /* Compute alignment needed.  We align all types to natural boundaries with
3358      exception of XFmode that is aligned to 64bits.  */
3359   if (mode != VOIDmode && mode != BLKmode)
3360     {
3361       int mode_alignment = GET_MODE_BITSIZE (mode);
3362
3363       if (mode == XFmode)
3364         mode_alignment = 128;
3365       else if (mode == XCmode)
3366         mode_alignment = 256;
3367       if (COMPLEX_MODE_P (mode))
3368         mode_alignment /= 2;
3369       /* Misaligned fields are always returned in memory.  */
3370       if (bit_offset % mode_alignment)
3371         return 0;
3372     }
3373
3374   /* for V1xx modes, just use the base mode */
3375   if (VECTOR_MODE_P (mode)
3376       && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
3377     mode = GET_MODE_INNER (mode);
3378
3379   /* Classification of atomic types.  */
3380   switch (mode)
3381     {
3382     case SDmode:
3383     case DDmode:
3384       classes[0] = X86_64_SSE_CLASS;
3385       return 1;
3386     case TDmode:
3387       classes[0] = X86_64_SSE_CLASS;
3388       classes[1] = X86_64_SSEUP_CLASS;
3389       return 2;
3390     case DImode:
3391     case SImode:
3392     case HImode:
3393     case QImode:
3394     case CSImode:
3395     case CHImode:
3396     case CQImode:
3397       if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
3398         classes[0] = X86_64_INTEGERSI_CLASS;
3399       else
3400         classes[0] = X86_64_INTEGER_CLASS;
3401       return 1;
3402     case CDImode:
3403     case TImode:
3404       classes[0] = classes[1] = X86_64_INTEGER_CLASS;
3405       return 2;
3406     case CTImode:
3407       return 0;
3408     case SFmode:
3409       if (!(bit_offset % 64))
3410         classes[0] = X86_64_SSESF_CLASS;
3411       else
3412         classes[0] = X86_64_SSE_CLASS;
3413       return 1;
3414     case DFmode:
3415       classes[0] = X86_64_SSEDF_CLASS;
3416       return 1;
3417     case XFmode:
3418       classes[0] = X86_64_X87_CLASS;
3419       classes[1] = X86_64_X87UP_CLASS;
3420       return 2;
3421     case TFmode:
3422       classes[0] = X86_64_SSE_CLASS;
3423       classes[1] = X86_64_SSEUP_CLASS;
3424       return 2;
3425     case SCmode:
3426       classes[0] = X86_64_SSE_CLASS;
3427       return 1;
3428     case DCmode:
3429       classes[0] = X86_64_SSEDF_CLASS;
3430       classes[1] = X86_64_SSEDF_CLASS;
3431       return 2;
3432     case XCmode:
3433       classes[0] = X86_64_COMPLEX_X87_CLASS;
3434       return 1;
3435     case TCmode:
3436       /* This modes is larger than 16 bytes.  */
3437       return 0;
3438     case V4SFmode:
3439     case V4SImode:
3440     case V16QImode:
3441     case V8HImode:
3442     case V2DFmode:
3443     case V2DImode:
3444       classes[0] = X86_64_SSE_CLASS;
3445       classes[1] = X86_64_SSEUP_CLASS;
3446       return 2;
3447     case V2SFmode:
3448     case V2SImode:
3449     case V4HImode:
3450     case V8QImode:
3451       classes[0] = X86_64_SSE_CLASS;
3452       return 1;
3453     case BLKmode:
3454     case VOIDmode:
3455       return 0;
3456     default:
3457       gcc_assert (VECTOR_MODE_P (mode));
3458
3459       if (bytes > 16)
3460         return 0;
3461
3462       gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
3463
3464       if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
3465         classes[0] = X86_64_INTEGERSI_CLASS;
3466       else
3467         classes[0] = X86_64_INTEGER_CLASS;
3468       classes[1] = X86_64_INTEGER_CLASS;
3469       return 1 + (bytes > 8);
3470     }
3471 }
3472
3473 /* Examine the argument and store the number of registers required in each
3474    class in *INT_NREGS and *SSE_NREGS.  Return 0 iff the parameter should be
        passed in memory.

        MODE and TYPE are handed to classify_argument with a bit offset of 0.
        IN_RETURN is nonzero when the caller is examining a return value; the
        x87 classes are only accepted in that case.

        The nonzero result is 1, except that an X86_64_COMPLEX_X87_CLASS word
        with IN_RETURN set yields 2.  Both counters are zeroed up front, so
        they are 0 when classify_argument itself reports memory; on the later
        early returns they hold whatever had been counted so far.  */
3475 static int
3476 examine_argument (enum machine_mode mode, tree type, int in_return,
3477                   int *int_nregs, int *sse_nregs)
3478 {
3479   enum x86_64_reg_class class[MAX_CLASSES];
3480   int n = classify_argument (mode, type, class, 0);
3481
3482   *int_nregs = 0;
3483   *sse_nregs = 0;
3484   if (!n)
3485     return 0;
       /* Walk the N classified words from the last one down to the first.  */
3486   for (n--; n >= 0; n--)
3487     switch (class[n])
3488       {
3489       case X86_64_INTEGER_CLASS:
3490       case X86_64_INTEGERSI_CLASS:
3491         (*int_nregs)++;
3492         break;
3493       case X86_64_SSE_CLASS:
3494       case X86_64_SSESF_CLASS:
3495       case X86_64_SSEDF_CLASS:
3496         (*sse_nregs)++;
3497         break;
           /* No register is counted for these words.  */
3498       case X86_64_NO_CLASS:
3499       case X86_64_SSEUP_CLASS:
3500         break;
           /* x87 words are acceptable only for return values.  */
3501       case X86_64_X87_CLASS:
3502       case X86_64_X87UP_CLASS:
3503         if (!in_return)
3504           return 0;
3505         break;
3506       case X86_64_COMPLEX_X87_CLASS:
3507         return in_return ? 2 : 0;
           /* Not expected here: the final cleanup in classify_argument returns
              0 for an aggregate that contains a MEMORY word.  */
3508       case X86_64_MEMORY_CLASS:
3509         gcc_unreachable ();
3510       }
3511   return 1;
3512 }
3513
3514 /* Construct container for the argument used by GCC interface.  See
3515    FUNCTION_ARG for the detailed description.

        MODE is the mode used for classification and for the registers that
        are generated; ORIG_MODE is only forwarded to gen_reg_or_parallel.
        TYPE is the argument's type.  IN_RETURN is nonzero when a return value
        is being described.  NINTREGS and NSSEREGS are the numbers of integer
        and SSE registers still available, INTREG points at the hard register
        numbers of the integer registers to use, and SSE_REGNO is the index of
        the first SSE register to use.

        Return NULL when the value is classified as memory, needs more
        registers than are available, needs a register file that is disabled
        (after reporting an error once), or is empty.  Otherwise return a REG
        for the simple cases or a PARALLEL of (register, byte offset) pairs.  */
3516
3517 static rtx
3518 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
3519                      tree type, int in_return, int nintregs, int nsseregs,
3520                      const int *intreg, int sse_regno)
3521 {
3522   /* The following variables hold the static issued_error state.  */
3523   static bool issued_sse_arg_error;
3524   static bool issued_sse_ret_error;
3525   static bool issued_x87_ret_error;
3526
3527   enum machine_mode tmpmode;
3528   int bytes =
3529     (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
3530   enum x86_64_reg_class class[MAX_CLASSES];
3531   int n;
3532   int i;
3533   int nexps = 0;
3534   int needed_sseregs, needed_intregs;
3535   rtx exp[MAX_CLASSES];
3536   rtx ret;
3537
3538   n = classify_argument (mode, type, class, 0);
3539   if (!n)
3540     return NULL;
3541   if (!examine_argument (mode, type, in_return, &needed_intregs,
3542                          &needed_sseregs))
3543     return NULL;
       /* Not enough registers of one kind are left for the whole value.  */
3544   if (needed_intregs > nintregs || needed_sseregs > nsseregs)
3545     return NULL;
3546
3547   /* We allowed the user to turn off SSE for kernel mode.  Don't crash if
3548      some less clueful developer tries to use floating-point anyway.  */
3549   if (needed_sseregs && !TARGET_SSE)
3550     {
3551       if (in_return)
3552         {
3553           if (!issued_sse_ret_error)
3554             {
3555               error ("SSE register return with SSE disabled");
3556               issued_sse_ret_error = true;
3557             }
3558         }
3559       else if (!issued_sse_arg_error)
3560         {
3561           error ("SSE register argument with SSE disabled");
3562           issued_sse_arg_error = true;
3563         }
3564       return NULL;
3565     }
3566
3567   /* Likewise, error if the ABI requires us to return values in the
3568      x87 registers and the user specified -mno-80387.  */
3569   if (!TARGET_80387 && in_return)
3570     for (i = 0; i < n; i++)
3571       if (class[i] == X86_64_X87_CLASS
3572           || class[i] == X86_64_X87UP_CLASS
3573           || class[i] == X86_64_COMPLEX_X87_CLASS)
3574         {
3575           if (!issued_x87_ret_error)
3576             {
3577               error ("x87 register return with x87 disabled");
3578               issued_x87_ret_error = true;
3579             }
3580           return NULL;
3581         }
3582
3583   /* First construct simple cases.  Avoid SCmode, since we want to use
3584      single register to pass this type.  */
3585   if (n == 1 && mode != SCmode)
3586     switch (class[0])
3587       {
3588       case X86_64_INTEGER_CLASS:
3589       case X86_64_INTEGERSI_CLASS:
3590         return gen_rtx_REG (mode, intreg[0]);
3591       case X86_64_SSE_CLASS:
3592       case X86_64_SSESF_CLASS:
3593       case X86_64_SSEDF_CLASS:
3594         return gen_reg_or_parallel (mode, orig_mode, SSE_REGNO (sse_regno));
3595       case X86_64_X87_CLASS:
3596       case X86_64_COMPLEX_X87_CLASS:
3597         return gen_rtx_REG (mode, FIRST_STACK_REG);
3598       case X86_64_NO_CLASS:
3599         /* Zero sized array, struct or class.  */
3600         return NULL;
3601       default:
3602         gcc_unreachable ();
3603       }
       /* A non-BLKmode SSE/SSEUP pair is described as one SSE register.  */
3604   if (n == 2 && class[0] == X86_64_SSE_CLASS && class[1] == X86_64_SSEUP_CLASS
3605       && mode != BLKmode)
3606     return gen_rtx_REG (mode, SSE_REGNO (sse_regno));
3607
       /* An X87/X87UP pair is an XFmode value in the first stack register.  */
3608   if (n == 2
3609       && class[0] == X86_64_X87_CLASS && class[1] == X86_64_X87UP_CLASS)
3610     return gen_rtx_REG (XFmode, FIRST_STACK_REG);
       /* Two INTEGER words of one of these modes become a single multiword
          REG, but only when the two hard registers are consecutive.  */
3611   if (n == 2 && class[0] == X86_64_INTEGER_CLASS
3612       && class[1] == X86_64_INTEGER_CLASS
3613       && (mode == CDImode || mode == TImode || mode == TFmode)
3614       && intreg[0] + 1 == intreg[1])
3615     return gen_rtx_REG (mode, intreg[0]);
3616
3617   /* Otherwise figure out the entries of the PARALLEL.  */
3618   for (i = 0; i < n; i++)
3619     {
3620       switch (class[i])
3621         {
3622           case X86_64_NO_CLASS:
3623             break;
3624           case X86_64_INTEGER_CLASS:
3625           case X86_64_INTEGERSI_CLASS:
3626             /* Merge TImodes on aligned occasions here too.  */
                 /* A trailing partial word gets the integer mode that matches
                    the bytes that remain.  */
3627             if (i * 8 + 8 > bytes)
3628               tmpmode = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
3629             else if (class[i] == X86_64_INTEGERSI_CLASS)
3630               tmpmode = SImode;
3631             else
3632               tmpmode = DImode;
3633             /* We've requested 24 bytes we don't have mode for.  Use DImode.  */
3634             if (tmpmode == BLKmode)
3635               tmpmode = DImode;
3636             exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
3637                                                gen_rtx_REG (tmpmode, *intreg),
3638                                                GEN_INT (i*8));
                 /* Move on to the next integer register number.  */
3639             intreg++;
3640             break;
3641           case X86_64_SSESF_CLASS:
3642             exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
3643                                                gen_rtx_REG (SFmode,
3644                                                             SSE_REGNO (sse_regno)),
3645                                                GEN_INT (i*8));
3646             sse_regno++;
3647             break;
3648           case X86_64_SSEDF_CLASS:
3649             exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
3650                                                gen_rtx_REG (DFmode,
3651                                                             SSE_REGNO (sse_regno)),
3652                                                GEN_INT (i*8));
3653             sse_regno++;
3654             break;
3655           case X86_64_SSE_CLASS:
                 /* An SSE word followed by SSEUP is described as one TImode
                    register covering both words; the SSEUP word is then
                    skipped by the extra increment of I below.  */
3656             if (i < n - 1 && class[i + 1] == X86_64_SSEUP_CLASS)
3657               tmpmode = TImode;
3658             else
3659               tmpmode = DImode;
3660             exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
3661                                                gen_rtx_REG (tmpmode,
3662                                                             SSE_REGNO (sse_regno)),
3663                                                GEN_INT (i*8));
3664             if (tmpmode == TImode)
3665               i++;
3666             sse_regno++;
3667             break;
3668           default:
3669             gcc_unreachable ();
3670         }
3671     }
3672
3673   /* Empty aligned struct, union or class.  */
3674   if (nexps == 0)
3675     return NULL;
3676
       /* Collect the entries gathered above into the PARALLEL's vector.  */
3677   ret =  gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
3678   for (i = 0; i < nexps; i++)
3679     XVECEXP (ret, 0, i) = exp [i];
3680   return ret;
3681 }
3682
3683 /* Update the data in CUM to advance over an argument of mode MODE
3684    and data type TYPE.  (TYPE is null for libcalls where that information
3685    may not be available.)

        This is the 32-bit worker called by function_arg_advance, which passes
        the argument size in BYTES and that size rounded up to whole words in
        WORDS.  Modes not listed in the switch leave CUM untouched.  */
3686
3687 static void
3688 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
3689                          tree type, HOST_WIDE_INT bytes, HOST_WIDE_INT words)
3690 {
3691   switch (mode)
3692     {
3693     default:
3694       break;
3695
3696     case BLKmode:
           /* NOTE(review): a negative size presumably means a variable-sized
              type (int_size_in_bytes is compared with -1 elsewhere in this
              file) -- confirm.  */
3697       if (bytes < 0)
3698         break;
3699       /* FALLTHRU */
3700
3701     case DImode:
3702     case SImode:
3703     case HImode:
3704     case QImode:
3705       cum->words += words;
3706       cum->nregs -= words;
3707       cum->regno += words;
3708
           /* Clamp the counters once the integer registers are used up.  */
3709       if (cum->nregs <= 0)
3710         {
3711           cum->nregs = 0;
3712           cum->regno = 0;
3713         }
3714       break;
3715
           /* DFmode is handled like the SSE modes only when float_in_sse is
              at least 2, SFmode when it is at least 1.  */
3716     case DFmode:
3717       if (cum->float_in_sse < 2)
3718         break;
           /* FALLTHRU */
3719     case SFmode:
3720       if (cum->float_in_sse < 1)
3721         break;
3722       /* FALLTHRU */
3723
3724     case TImode:
3725     case V16QImode:
3726     case V8HImode:
3727     case V4SImode:
3728     case V2DImode:
3729     case V4SFmode:
3730     case V2DFmode:
           /* Only non-aggregates (or libcall arguments without a type)
              consume an SSE register.  */
3731       if (!type || !AGGREGATE_TYPE_P (type))
3732         {
3733           cum->sse_words += words;
3734           cum->sse_nregs -= 1;
3735           cum->sse_regno += 1;
3736           if (cum->sse_nregs <= 0)
3737             {
3738               cum->sse_nregs = 0;
3739               cum->sse_regno = 0;
3740             }
3741         }
3742       break;
3743
3744     case V8QImode:
3745     case V4HImode:
3746     case V2SImode:
3747     case V2SFmode:
           /* Same bookkeeping as above, for the MMX register counters.  */
3748       if (!type || !AGGREGATE_TYPE_P (type))
3749         {
3750           cum->mmx_words += words;
3751           cum->mmx_nregs -= 1;
3752           cum->mmx_regno += 1;
3753           if (cum->mmx_nregs <= 0)
3754             {
3755               cum->mmx_nregs = 0;
3756               cum->mmx_regno = 0;
3757             }
3758         }
3759       break;
3760     }
3761 }
3762
     /* Advance CUM over an argument of mode MODE and type TYPE that occupies
        WORDS words; used by function_arg_advance for 64-bit targets that do
        not use the MS ABI.

        examine_argument (with IN_RETURN zero) reports how many integer and SSE
        registers the argument needs.  When it must go in memory, or CUM does
        not have enough registers of both kinds left, only cum->words grows.
        Otherwise the register counters are adjusted and cum->words is left
        unchanged.  */
3763 static void
3764 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
3765                          tree type, HOST_WIDE_INT words)
3766 {
3767   int int_nregs, sse_nregs;
3768
3769   if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs))
3770     cum->words += words;
3771   else if (sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
3772     {
3773       cum->nregs -= int_nregs;
3774       cum->sse_nregs -= sse_nregs;
3775       cum->regno += int_nregs;
3776       cum->sse_regno += sse_nregs;
3777     }
3778   else
3779     cum->words += words;
3780 }
3781
     /* Advance CUM over one argument of BYTES bytes / WORDS words for the
        64-bit MS ABI.  BYTES is asserted to be 1, 2, 4 or 8.  cum->words
        always grows by WORDS, and one register position is consumed as long
        as any remain.  */
3782 static void
3783 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
3784                             HOST_WIDE_INT words)
3785 {
3786   /* Otherwise, this should be passed indirect.  */
3787   gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
3788
3789   cum->words += words;
3790   if (cum->nregs > 0)
3791     {
3792       cum->nregs -= 1;
3793       cum->regno += 1;
3794     }
3795 }
3796
     /* Update CUM to advance over an argument of mode MODE and type TYPE by
        dispatching to the worker for the ABI in use.  NAMED is unused.

        BYTES and WORDS are computed from the incoming MODE (or from TYPE for
        BLKmode) before MODE is replaced by type_natural_mode (TYPE) when a
        type is available.  */
3797 void
3798 function_arg_advance (CUMULATIVE_ARGS *cum, enum machine_mode mode,
3799                       tree type, int named ATTRIBUTE_UNUSED)
3800 {
3801   HOST_WIDE_INT bytes, words;
3802
3803   if (mode == BLKmode)
3804     bytes = int_size_in_bytes (type);
3805   else
3806     bytes = GET_MODE_SIZE (mode);
       /* Round the size up to a whole number of words.  */
3807   words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
3808
3809   if (type)
3810     mode = type_natural_mode (type);
3811
       /* The MS ABI test comes first; plain TARGET_64BIT handles the rest of
          the 64-bit cases.  */
3812   if (TARGET_64BIT_MS_ABI)
3813     function_arg_advance_ms_64 (cum, bytes, words);
3814   else if (TARGET_64BIT)
3815     function_arg_advance_64 (cum, mode, type, words);
3816   else
3817     function_arg_advance_32 (cum, mode, type, bytes, words);
3818 }
3819
3820 /* Define where to put the arguments to a function.
3821    Value is zero to push the argument on the stack,
3822    or a hard register in which to store the argument.
3823
3824    MODE is the argument's machine mode.
3825    TYPE is the data type of the argument (as a tree).
3826     This is null for libcalls where that information may
3827     not be available.
3828    CUM is a variable of type CUMULATIVE_ARGS which gives info about
3829     the preceding args and about the function being called.
3830    NAMED is nonzero if this argument is a named parameter
3831     (otherwise it is an extra parameter matching an ellipsis).  */
3832
     /* 32-bit worker for function_arg.  MODE is the mode used to pick the
        register class, ORIG_MODE is forwarded to gen_reg_or_parallel, and
        BYTES / WORDS are the argument size as computed by function_arg.
        Returns constm1_rtx for VOIDmode, a register (or PARALLEL) when the
        argument is passed in a register, and NULL_RTX otherwise.  */
3833 static rtx
3834 function_arg_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
3835                  enum machine_mode orig_mode, tree type,
3836                  HOST_WIDE_INT bytes, HOST_WIDE_INT words)
3837 {
       /* Each warning below is issued at most once per compiler run.  */
3838   static bool warnedsse, warnedmmx;
3839
3840   /* Avoid the AL settings for the Unix64 ABI.  */
3841   if (mode == VOIDmode)
3842     return constm1_rtx;
3843
3844   switch (mode)
3845     {
3846     default:
3847       break;
3848
3849     case BLKmode:
3850       if (bytes < 0)
3851         break;
3852       /* FALLTHRU */
3853     case DImode:
3854     case SImode:
3855     case HImode:
3856     case QImode:
           /* The whole argument must fit in the remaining integer
              registers.  */
3857       if (words <= cum->nregs)
3858         {
3859           int regno = cum->regno;
3860
3861           /* Fastcall allocates the first two DWORD (SImode) or
3862              smaller arguments to ECX and EDX.  */
3863           if (cum->fastcall)
3864             {
3865               if (mode == BLKmode || mode == DImode)
3866                 break;
3867
3868               /* ECX not EAX is the first allocated register.  */
3869               if (regno == 0)
3870                 regno = 2;
3871             }
3872           return gen_rtx_REG (mode, regno);
3873         }
3874       break;
3875
           /* DFmode is treated like the SSE modes only when float_in_sse is
              at least 2, SFmode when it is at least 1.  */
3876     case DFmode:
3877       if (cum->float_in_sse < 2)
3878         break;
           /* FALLTHRU */
3879     case SFmode:
3880       if (cum->float_in_sse < 1)
3881         break;
3882       /* FALLTHRU */
3883     case TImode:
3884     case V16QImode:
3885     case V8HImode:
3886     case V4SImode:
3887     case V2DImode:
3888     case V4SFmode:
3889     case V2DFmode:
           /* Aggregates with these modes are not passed in SSE registers.  */
3890       if (!type || !AGGREGATE_TYPE_P (type))
3891         {
3892           if (!TARGET_SSE && !warnedsse && cum->warn_sse)
3893             {
3894               warnedsse = true;
3895               warning (0, "SSE vector argument without SSE enabled "
3896                        "changes the ABI");
3897             }
3898           if (cum->sse_nregs)
3899             return gen_reg_or_parallel (mode, orig_mode,
3900                                         cum->sse_regno + FIRST_SSE_REG);
3901         }
3902       break;
3903
3904     case V8QImode:
3905     case V4HImode:
3906     case V2SImode:
3907     case V2SFmode:
           /* Same scheme as above, using the MMX registers.  */
3908       if (!type || !AGGREGATE_TYPE_P (type))
3909         {
3910           if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
3911             {
3912               warnedmmx = true;
3913               warning (0, "MMX vector argument without MMX enabled "
3914                        "changes the ABI");
3915             }
3916           if (cum->mmx_nregs)
3917             return gen_reg_or_parallel (mode, orig_mode,
3918                                         cum->mmx_regno + FIRST_MMX_REG);
3919         }
3920       break;
3921     }
3922
       /* Everything that did not return above is not given a register.  */
3923   return NULL_RTX;
3924 }
3925
     /* Worker for function_arg on 64-bit targets that do not use the MS ABI.

        For VOIDmode the result is a CONST_INT: -1 when the function is not
        flagged maybe_vaarg, otherwise SSE_REGPARM_MAX if cum->sse_nregs is
        negative and cum->sse_regno if not.  For any other MODE the result
        is whatever construct_container builds from the registers that CUM
        still has available (possibly NULL).  */
3926 static rtx
3927 function_arg_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
3928                  enum machine_mode orig_mode, tree type)
3929 {
3930   /* Handle a hidden AL argument containing number of registers
3931      for varargs x86-64 functions.  */
3932   if (mode == VOIDmode)
3933     return GEN_INT (cum->maybe_vaarg
3934                     ? (cum->sse_nregs < 0
3935                        ? SSE_REGPARM_MAX
3936                        : cum->sse_regno)
3937                     : -1);
3938
       /* IN_RETURN is 0: this describes an argument, not a return value.  */
3939   return construct_container (mode, orig_mode, type, 0, cum->nregs,
3940                               cum->sse_nregs,
3941                               &x86_64_int_parameter_registers [cum->regno],
3942                               cum->sse_regno);
3943 }
3944
     /* Worker for function_arg under the 64-bit MS ABI.

        Returns constm1_rtx for VOIDmode and NULL_RTX once cum->nregs is 0.
        Otherwise the register is selected by the position cum->regno: the
        integer register from x86_64_ms_abi_int_parameter_registers, or, with
        TARGET_SSE and an SFmode/DFmode argument, the SSE register at the same
        position.  When NAMED is zero such a float is described by a PARALLEL
        naming both registers at offset 0.  */
3945 static rtx
3946 function_arg_ms_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
3947                     enum machine_mode orig_mode, int named)
3948 {
3949   unsigned int regno;
3950
3951   /* Avoid the AL settings for the Unix64 ABI.  */
3952   if (mode == VOIDmode)
3953     return constm1_rtx;
3954
3955   /* If we've run out of registers, it goes on the stack.  */
3956   if (cum->nregs == 0)
3957     return NULL_RTX;
3958
3959   regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
3960
3961   /* Only floating point modes are passed in anything but integer regs.  */
3962   if (TARGET_SSE && (mode == SFmode || mode == DFmode))
3963     {
3964       if (named)
3965         regno = cum->regno + FIRST_SSE_REG;
3966       else
3967         {
3968           rtx t1, t2;
3969
3970           /* Unnamed floating parameters are passed in both the
3971              SSE and integer registers.  */
3972           t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
3973           t2 = gen_rtx_REG (mode, regno);
3974           t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
3975           t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
3976           return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
3977         }
3978     }
3979
3980   return gen_reg_or_parallel (mode, orig_mode, regno);
3981 }
3982
     /* Return where to pass an argument of mode OMODE and type TYPE given the
        state in CUM, by dispatching to the worker for the ABI in use.  NAMED
        is only forwarded to the MS ABI worker.

        The size in BYTES and WORDS is computed from OMODE (or from TYPE for
        BLKmode).  OMODE is passed on unchanged as the workers' ORIG_MODE;
        the MODE they classify with differs only for vector types.  */
3983 rtx
3984 function_arg (CUMULATIVE_ARGS *cum, enum machine_mode omode,
3985               tree type, int named)
3986 {
3987   enum machine_mode mode = omode;
3988   HOST_WIDE_INT bytes, words;
3989
3990   if (mode == BLKmode)
3991     bytes = int_size_in_bytes (type);
3992   else
3993     bytes = GET_MODE_SIZE (mode);
       /* Round the size up to a whole number of words.  */
3994   words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
3995
3996   /* To simplify the code below, represent vector types with a vector mode
3997      even if MMX/SSE are not active.  */
3998   if (type && TREE_CODE (type) == VECTOR_TYPE)
3999     mode = type_natural_mode (type);
4000
4001   if (TARGET_64BIT_MS_ABI)
4002     return function_arg_ms_64 (cum, mode, omode, named);
4003   else if (TARGET_64BIT)
4004     return function_arg_64 (cum, mode, omode, type);
4005   else
4006     return function_arg_32 (cum, mode, omode, type, bytes, words);
4007 }
4008
4009 /* A C expression that indicates when an argument must be passed by
4010    reference.  If nonzero for an argument, a copy of that argument is
4011    made in memory and a pointer to the argument is passed instead of
4012    the argument itself.  The pointer is passed in whatever way is
4013    appropriate for passing a pointer to that type.

        Under the 64-bit MS ABI this is true for arrays, for aggregates whose
        size is not 1, 2, 4 or 8 bytes, and for any other argument whose MODE
        is larger than 8 bytes.  On other 64-bit targets it is true only for
        types whose int_size_in_bytes is -1.  It is always false for 32-bit
        targets.  CUM and NAMED are not used.  Note that MODE is read in the
        MS ABI branch even though it is marked ATTRIBUTE_UNUSED.  */
4014
4015 static bool
4016 ix86_pass_by_reference (CUMULATIVE_ARGS *cum ATTRIBUTE_UNUSED,
4017                         enum machine_mode mode ATTRIBUTE_UNUSED,
4018                         tree type, bool named ATTRIBUTE_UNUSED)
4019 {
4020   if (TARGET_64BIT_MS_ABI)
4021     {
4022       if (type)
4023         {
4024           /* Arrays are passed by reference.  */
4025           if (TREE_CODE (type) == ARRAY_TYPE)
4026             return true;
4027
4028           if (AGGREGATE_TYPE_P (type))
4029             {
4030               /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
4031                  are passed by reference.  */
                   /* EL2 is in 0..3 exactly when the byte size is 1, 2, 4
                      or 8.  */
4032               int el2 = exact_log2 (int_size_in_bytes (type));
4033               return !(el2 >= 0 && el2 <= 3);
4034             }
4035         }
4036
4037       /* __m128 is passed by reference.  */
4038       /* ??? How to handle complex?  For now treat them as structs,
4039          and pass them by reference if they're too large.  */
4040       if (GET_MODE_SIZE (mode) > 8)
4041         return true;
4042     }
       /* NOTE(review): a size of -1 presumably denotes a variable-sized
          type -- confirm against int_size_in_bytes.  */
4043   else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
4044     return 1;
4045
4046   return 0;
4047 }
4048
4049 /* Return true when TYPE should be 128bit aligned for 32bit argument passing
4050    ABI.  Only called if TARGET_SSE.

        The answer is true when TYPE itself has an SSE register mode and either
        its alignment was not set by the user or exceeds 128 bits.  Otherwise a
        type aligned to less than 128 bits yields false, and an aggregate is
        searched recursively: a record or union qualifies if any of its fields
        does, an array if its element type does.  */
4051 static bool
4052 contains_128bit_aligned_vector_p (tree type)
4053 {
4054   enum machine_mode mode = TYPE_MODE (type);
4055   if (SSE_REG_MODE_P (mode)
4056       && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
4057     return true;
       /* Types aligned to less than 128 bits are not searched any further.  */
4058   if (TYPE_ALIGN (type) < 128)
4059     return false;
4060
4061   if (AGGREGATE_TYPE_P (type))
4062     {
4063       /* Walk the aggregates recursively.  */
4064       switch (TREE_CODE (type))
4065         {
4066         case RECORD_TYPE:
4067         case UNION_TYPE:
4068         case QUAL_UNION_TYPE:
4069           {
4070             tree field;
4071
4072             /* Walk all the structure fields.  */
4073             for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
4074               {
                     /* Entries on the field chain that are not FIELD_DECLs
                        are ignored.  */
4075                 if (TREE_CODE (field) == FIELD_DECL
4076                     && contains_128bit_aligned_vector_p (TREE_TYPE (field)))
4077                   return true;
4078               }
4079             break;
4080           }
4081
4082         case ARRAY_TYPE:
4083           /* Just for use if some languages passes arrays by value.  */
4084           if (contains_128bit_aligned_vector_p (TREE_TYPE (type)))
4085             return true;
4086           break;
4087
4088         default:
4089           gcc_unreachable ();
4090         }
4091     }
4092   return false;
4093 }
4094
4095 /* Gives the alignment boundary, in bits, of an argument with the
4096    specified mode and type.

        The starting point is TYPE_ALIGN (TYPE) when TYPE is non-null and
        GET_MODE_ALIGNMENT (MODE) otherwise.  The result is never below
        PARM_BOUNDARY and never above 128.  On 32-bit targets everything is
        reduced to PARM_BOUNDARY except, when SSE is enabled, SSE register
        modes (no type given) and types for which
        contains_128bit_aligned_vector_p is true.  */
4097
4098 int
4099 ix86_function_arg_boundary (enum machine_mode mode, tree type)
4100 {
4101   int align;
4102   if (type)
4103     align = TYPE_ALIGN (type);
4104   else
4105     align = GET_MODE_ALIGNMENT (mode);
4106   if (align < PARM_BOUNDARY)
4107     align = PARM_BOUNDARY;
4108   if (!TARGET_64BIT)
4109     {
4110       /* i386 ABI defines all arguments to be 4 byte aligned.  We have to
4111          make an exception for SSE modes since these require 128bit
4112          alignment.
4113
4114          The handling here differs from field_alignment.  ICC aligns MMX
4115          arguments to 4 byte boundaries, while structure fields are aligned
4116          to 8 byte boundaries.  */
4117       if (!TARGET_SSE)
4118         align = PARM_BOUNDARY;
4119       else if (!type)
4120         {
4121           if (!SSE_REG_MODE_P (mode))
4122             align = PARM_BOUNDARY;
4123         }
4124       else
4125         {
4126           if (!contains_128bit_aligned_vector_p (type))
4127             align = PARM_BOUNDARY;
4128         }
4129     }
       /* Cap the boundary at 128 bits.  */
4130   if (align > 128)
4131     align = 128;
4132   return align;
4133 }
4134
4135 /* Return true if REGNO is a possible register number of function value.
        Any register number not listed in the switch yields false.  */
4136
4137 bool
4138 ix86_function_value_regno_p (int regno)
4139 {
4140   switch (regno)
4141     {
         /* Hard register 0 is always accepted.  */
4142     case 0:
4143       return true;
4144
         /* The first x87 register is never used under the 64-bit MS ABI;
            elsewhere it depends on TARGET_FLOAT_RETURNS_IN_80387.  */
4145     case FIRST_FLOAT_REG:
4146       if (TARGET_64BIT_MS_ABI)
4147         return false;
4148       return TARGET_FLOAT_RETURNS_IN_80387;
4149
4150     case FIRST_SSE_REG:
4151       return TARGET_SSE;
4152
         /* The first MMX register is rejected for Mach-O and for 64-bit
            targets, and otherwise requires TARGET_MMX.  */
4153     case FIRST_MMX_REG:
4154       if (TARGET_MACHO || TARGET_64BIT)
4155         return false;
4156       return TARGET_MMX;
4157     }
4158
4159   return false;
4160 }
4161
4162 /* Define how to find the value returned by a function.
4163    VALTYPE is the data type of the value (as a tree).
4164    If the precise function being called is known, FUNC is its FUNCTION_DECL;
4165    otherwise, FUNC is 0.  */
4166
     /* 32-bit worker: return a REG of mode ORIG_MODE in the register chosen
        for a value whose mode is MODE.  FNTYPE and FN are the type and the
        declaration of the function; either may be null, and they are only
        consulted (through ix86_function_sseregparm) for SFmode and DFmode
        values.  */
4167 static rtx
4168 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
4169                    tree fntype, tree fn)
4170 {
4171   unsigned int regno;
4172
4173   /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
4174      we normally prevent this case when mmx is not available.  However
4175      some ABIs may require the result to be returned like DImode.  */
4176   if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
4177     regno = TARGET_MMX ? FIRST_MMX_REG : 0;
4178
4179   /* 16-byte vector modes in %xmm0.  See ix86_return_in_memory for where
4180      we prevent this case when sse is not available.  However some ABIs
4181      may require the result to be returned like integer TImode.  */
4182   else if (mode == TImode
4183            || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
4184     regno = TARGET_SSE ? FIRST_SSE_REG : 0;
4185
4186   /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387).  */
4187   else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
4188     regno = FIRST_FLOAT_REG;
4189   else
4190     /* Most things go in %eax.  */
4191     regno = 0;
4192
4193   /* Override FP return register with %xmm0 for local functions when
4194      SSE math is enabled or for functions with sseregparm attribute.  */
4195   if ((fn || fntype) && (mode == SFmode || mode == DFmode))
4196     {
           /* SFmode needs a level of at least 1, DFmode a level of exactly
              2.  */
4197       int sse_level = ix86_function_sseregparm (fntype, fn);
4198       if ((sse_level >= 1 && mode == SFmode)
4199           || (sse_level == 2 && mode == DFmode))
4200         regno = FIRST_SSE_REG;
4201     }
4202
4203   return gen_rtx_REG (orig_mode, regno);
4204 }
4205
     /* Return value location for 64-bit targets that do not use the MS ABI.

        When VALTYPE is null (a libcall) the register is picked from MODE
        alone: the first SSE register for the listed float, complex and
        decimal modes, the first x87 register for XFmode and XCmode, NULL for
        TCmode, and hard register 0 otherwise.  With a type, the location is
        built by construct_container with IN_RETURN set; if that yields NULL,
        hard register 0 in ORIG_MODE is returned instead.  */
4206 static rtx
4207 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
4208                    tree valtype)
4209 {
4210   rtx ret;
4211
4212   /* Handle libcalls, which don't provide a type node.  */
4213   if (valtype == NULL)
4214     {
4215       switch (mode)
4216         {
4217         case SFmode:
4218         case SCmode:
4219         case DFmode:
4220         case DCmode:
4221         case TFmode:
4222         case SDmode:
4223         case DDmode:
4224         case TDmode:
4225           return gen_rtx_REG (mode, FIRST_SSE_REG);
4226         case XFmode:
4227         case XCmode:
4228           return gen_rtx_REG (mode, FIRST_FLOAT_REG);
4229         case TCmode:
4230           return NULL;
4231         default:
4232           return gen_rtx_REG (mode, 0);
4233         }
4234     }
4235
       /* All REGPARM_MAX integer and SSE_REGPARM_MAX SSE registers count as
          available, starting with the first return register of each kind.  */
4236   ret = construct_container (mode, orig_mode, valtype, 1,
4237                              REGPARM_MAX, SSE_REGPARM_MAX,
4238                              x86_64_int_return_registers, 0);
4239
4240   /* For zero sized structures, construct_container returns NULL, but we
4241      need to keep rest of compiler happy by returning meaningful value.  */
4242   if (!ret)
4243     ret = gen_rtx_REG (orig_mode, 0);
4244
4245   return ret;
4246 }
4247
4248 static rtx
4249 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode)
4250 {
4251   unsigned int regno = 0;
4252
4253   if (TARGET_SSE)
4254     {
4255       if (mode == SFmode || mode == DFmode)
4256         regno = FIRST_SSE_REG;
4257       else if (VECTOR_MODE_P (mode) || GET_MODE_SIZE (mode) == 16)
4258         regno = FIRST_SSE_REG;
4259     }
4260
4261   return gen_rtx_REG (orig_mode, regno);
4262 }
4263
4264 static rtx
4265 ix86_function_value_1 (tree valtype, tree fntype_or_decl,
4266                        enum machine_mode orig_mode, enum machine_mode mode)
4267 {
4268   tree fn, fntype;
4269
4270   fn = NULL_TREE;
4271   if (fntype_or_decl && DECL_P (fntype_or_decl))
4272     fn = fntype_or_decl;
4273   fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
4274
4275   if (TARGET_64BIT_MS_ABI)
4276     return function_value_ms_64 (orig_mode, mode);
4277   else if (TARGET_64BIT)
4278     return function_value_64 (orig_mode, mode, valtype);
4279   else
4280     return function_value_32 (orig_mode, mode, fntype, fn);
4281 }
4282
4283 static rtx
4284 ix86_function_value (tree valtype, tree fntype_or_decl,
4285                      bool outgoing ATTRIBUTE_UNUSED)
4286 {
4287   enum machine_mode mode, orig_mode;
4288
4289   orig_mode = TYPE_MODE (valtype);
4290   mode = type_natural_mode (valtype);
4291   return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
4292 }
4293
4294 rtx
4295 ix86_libcall_value (enum machine_mode mode)
4296 {
4297   return ix86_function_value_1 (NULL, NULL, mode, mode);
4298 }
4299
4300 /* Return true iff type is returned in memory.  */
4301
4302 static int
4303 return_in_memory_32 (tree type, enum machine_mode mode)
4304 {
4305   HOST_WIDE_INT size;
4306
4307   if (mode == BLKmode)
4308     return 1;
4309
4310   size = int_size_in_bytes (type);
4311
4312   if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
4313     return 0;
4314
4315   if (VECTOR_MODE_P (mode) || mode == TImode)
4316     {
4317       /* User-created vectors small enough to fit in EAX.  */
4318       if (size < 8)
4319         return 0;
4320
4321       /* MMX/3dNow values are returned in MM0,
4322          except when it doesn't exits.  */
4323       if (size == 8)
4324         return (TARGET_MMX ? 0 : 1);
4325
4326       /* SSE values are returned in XMM0, except when it doesn't exist.  */
4327       if (size == 16)
4328         return (TARGET_SSE ? 0 : 1);
4329     }
4330
4331   if (mode == XFmode)
4332     return 0;
4333
4334   if (mode == TDmode)
4335     return 1;
4336
4337   if (size > 12)
4338     return 1;
4339   return 0;
4340 }
4341
4342 static int
4343 return_in_memory_64 (tree type, enum machine_mode mode)
4344 {
4345   int needed_intregs, needed_sseregs;
4346   return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
4347 }
4348
4349 static int
4350 return_in_memory_ms_64 (tree type, enum machine_mode mode)
4351 {
4352   HOST_WIDE_INT size = int_size_in_bytes (type);
4353
4354   /* __m128 and friends are returned in xmm0.  */
4355   if (size == 16 && VECTOR_MODE_P (mode))
4356     return 0;
4357
4358   /* Otherwise, the size must be exactly in [1248].  */
4359   return (size != 1 && size != 2 && size != 4 && size != 8);
4360 }
4361
4362 int
4363 ix86_return_in_memory (tree type)
4364 {
4365   enum machine_mode mode = type_natural_mode (type);
4366
4367   if (TARGET_64BIT_MS_ABI)
4368     return return_in_memory_ms_64 (type, mode);
4369   else if (TARGET_64BIT)
4370     return return_in_memory_64 (type, mode);
4371   else
4372     return return_in_memory_32 (type, mode);
4373 }
4374
4375 /* When returning SSE vector types, we have a choice of either
4376      (1) being abi incompatible with a -march switch, or
4377      (2) generating an error.
4378    Given no good solution, I think the safest thing is one warning.
4379    The user won't be able to use -Werror, but....
4380
4381    Choose the STRUCT_VALUE_RTX hook because that's (at present) only
4382    called in response to actually generating a caller or callee that
4383    uses such a type.  As opposed to RETURN_IN_MEMORY, which is called
4384    via aggregate_value_p for general type probing from tree-ssa.  */
4385
4386 static rtx
4387 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
4388 {
4389   static bool warnedsse, warnedmmx;
4390
4391   if (!TARGET_64BIT && type)
4392     {
4393       /* Look at the return type of the function, not the function type.  */
4394       enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
4395
4396       if (!TARGET_SSE && !warnedsse)
4397         {
4398           if (mode == TImode
4399               || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
4400             {
4401               warnedsse = true;
4402               warning (0, "SSE vector return without SSE enabled "
4403                        "changes the ABI");
4404             }
4405         }
4406
4407       if (!TARGET_MMX && !warnedmmx)
4408         {
4409           if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
4410             {
4411               warnedmmx = true;
4412               warning (0, "MMX vector return without MMX enabled "
4413                        "changes the ABI");
4414             }
4415         }
4416     }
4417
4418   return NULL;
4419 }
4420
4421 \f
4422 /* Create the va_list data type.  */
4423
4424 static tree
4425 ix86_build_builtin_va_list (void)
4426 {
4427   tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
4428
4429   /* For i386 we use plain pointer to argument area.  */
4430   if (!TARGET_64BIT || TARGET_64BIT_MS_ABI)
4431     return build_pointer_type (char_type_node);
4432
4433   record = (*lang_hooks.types.make_type) (RECORD_TYPE);
4434   type_decl = build_decl (TYPE_DECL, get_identifier ("__va_list_tag"), record);
4435
4436   f_gpr = build_decl (FIELD_DECL, get_identifier ("gp_offset"),
4437                       unsigned_type_node);
4438   f_fpr = build_decl (FIELD_DECL, get_identifier ("fp_offset"),
4439                       unsigned_type_node);
4440   f_ovf = build_decl (FIELD_DECL, get_identifier ("overflow_arg_area"),
4441                       ptr_type_node);
4442   f_sav = build_decl (FIELD_DECL, get_identifier ("reg_save_area"),
4443                       ptr_type_node);
4444
4445   va_list_gpr_counter_field = f_gpr;
4446   va_list_fpr_counter_field = f_fpr;
4447
4448   DECL_FIELD_CONTEXT (f_gpr) = record;
4449   DECL_FIELD_CONTEXT (f_fpr) = record;
4450   DECL_FIELD_CONTEXT (f_ovf) = record;
4451   DECL_FIELD_CONTEXT (f_sav) = record;
4452
4453   TREE_CHAIN (record) = type_decl;
4454   TYPE_NAME (record) = type_decl;
4455   TYPE_FIELDS (record) = f_gpr;
4456   TREE_CHAIN (f_gpr) = f_fpr;
4457   TREE_CHAIN (f_fpr) = f_ovf;
4458   TREE_CHAIN (f_ovf) = f_sav;
4459
4460   layout_type (record);
4461
4462   /* The correct type is an array type of one element.  */
4463   return build_array_type (record, build_index_type (size_zero_node));
4464 }
4465
4466 /* Worker function for TARGET_SETUP_INCOMING_VARARGS.  */
4467
4468 static void
4469 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
4470 {
4471   rtx save_area, mem;
4472   rtx label;
4473   rtx label_ref;
4474   rtx tmp_reg;
4475   rtx nsse_reg;
4476   int set;
4477   int i;
4478
4479   if (! cfun->va_list_gpr_size && ! cfun->va_list_fpr_size)
4480     return;
4481
4482   /* Indicate to allocate space on the stack for varargs save area.  */
4483   ix86_save_varrargs_registers = 1;
4484   cfun->stack_alignment_needed = 128;
4485
4486   save_area = frame_pointer_rtx;
4487   set = get_varargs_alias_set ();
4488
4489   for (i = cum->regno;
4490        i < ix86_regparm
4491        && i < cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
4492        i++)
4493     {
4494       mem = gen_rtx_MEM (Pmode,
4495                          plus_constant (save_area, i * UNITS_PER_WORD));
4496       MEM_NOTRAP_P (mem) = 1;
4497       set_mem_alias_set (mem, set);
4498       emit_move_insn (mem, gen_rtx_REG (Pmode,
4499                                         x86_64_int_parameter_registers[i]));
4500     }
4501
4502   if (cum->sse_nregs && cfun->va_list_fpr_size)
4503     {
4504       /* Now emit code to save SSE registers.  The AX parameter contains number
4505          of SSE parameter registers used to call this function.  We use
4506          sse_prologue_save insn template that produces computed jump across
4507          SSE saves.  We need some preparation work to get this working.  */
4508
4509       label = gen_label_rtx ();
4510       label_ref = gen_rtx_LABEL_REF (Pmode, label);
4511
4512       /* Compute address to jump to :
4513          label - 5*eax + nnamed_sse_arguments*5  */
4514       tmp_reg = gen_reg_rtx (Pmode);
4515       nsse_reg = gen_reg_rtx (Pmode);
4516       emit_insn (gen_zero_extendqidi2 (nsse_reg, gen_rtx_REG (QImode, 0)));
4517       emit_insn (gen_rtx_SET (VOIDmode, tmp_reg,
4518                               gen_rtx_MULT (Pmode, nsse_reg,
4519                                             GEN_INT (4))));
4520       if (cum->sse_regno)
4521         emit_move_insn
4522           (nsse_reg,
4523            gen_rtx_CONST (DImode,
4524                           gen_rtx_PLUS (DImode,
4525                                         label_ref,
4526                                         GEN_INT (cum->sse_regno * 4))));
4527       else
4528         emit_move_insn (nsse_reg, label_ref);
4529       emit_insn (gen_subdi3 (nsse_reg, nsse_reg, tmp_reg));
4530
4531       /* Compute address of memory block we save into.  We always use pointer
4532          pointing 127 bytes after first byte to store - this is needed to keep
4533          instruction size limited by 4 bytes.  */
4534       tmp_reg = gen_reg_rtx (Pmode);
4535       emit_insn (gen_rtx_SET (VOIDmode, tmp_reg,
4536                               plus_constant (save_area,
4537                                              8 * REGPARM_MAX + 127)));
4538       mem = gen_rtx_MEM (BLKmode, plus_constant (tmp_reg, -127));
4539       MEM_NOTRAP_P (mem) = 1;
4540       set_mem_alias_set (mem, set);
4541       set_mem_align (mem, BITS_PER_WORD);
4542
4543       /* And finally do the dirty job!  */
4544       emit_insn (gen_sse_prologue_save (mem, nsse_reg,
4545                                         GEN_INT (cum->sse_regno), label));
4546     }
4547 }
4548
4549 static void
4550 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
4551 {
4552   int set = get_varargs_alias_set ();
4553   int i;
4554
4555   for (i = cum->regno; i < REGPARM_MAX; i++)
4556     {
4557       rtx reg, mem;
4558
4559       mem = gen_rtx_MEM (Pmode,
4560                          plus_constant (virtual_incoming_args_rtx,
4561                                         i * UNITS_PER_WORD));
4562       MEM_NOTRAP_P (mem) = 1;
4563       set_mem_alias_set (mem, set);
4564
4565       reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
4566       emit_move_insn (mem, reg);
4567     }
4568 }
4569
4570 static void
4571 ix86_setup_incoming_varargs (CUMULATIVE_ARGS *cum, enum machine_mode mode,
4572                              tree type, int *pretend_size ATTRIBUTE_UNUSED,
4573                              int no_rtl)
4574 {
4575   CUMULATIVE_ARGS next_cum;
4576   tree fntype;
4577   int stdarg_p;
4578
4579   /* This argument doesn't appear to be used anymore.  Which is good,
4580      because the old code here didn't suppress rtl generation.  */
4581   gcc_assert (!no_rtl);
4582
4583   if (!TARGET_64BIT)
4584     return;
4585
4586   fntype = TREE_TYPE (current_function_decl);
4587   stdarg_p = (TYPE_ARG_TYPES (fntype) != 0
4588               && (TREE_VALUE (tree_last (TYPE_ARG_TYPES (fntype)))
4589                   != void_type_node));
4590
4591   /* For varargs, we do not want to skip the dummy va_dcl argument.
4592      For stdargs, we do want to skip the last named argument.  */
4593   next_cum = *cum;
4594   if (stdarg_p)
4595     function_arg_advance (&next_cum, mode, type, 1);
4596
4597   if (TARGET_64BIT_MS_ABI)
4598     setup_incoming_varargs_ms_64 (&next_cum);
4599   else
4600     setup_incoming_varargs_64 (&next_cum);
4601 }
4602
4603 /* Implement va_start.  */
4604
4605 void
4606 ix86_va_start (tree valist, rtx nextarg)
4607 {
4608   HOST_WIDE_INT words, n_gpr, n_fpr;
4609   tree f_gpr, f_fpr, f_ovf, f_sav;
4610   tree gpr, fpr, ovf, sav, t;
4611   tree type;
4612
4613   /* Only 64bit target needs something special.  */
4614   if (!TARGET_64BIT || TARGET_64BIT_MS_ABI)
4615     {
4616       std_expand_builtin_va_start (valist, nextarg);
4617       return;
4618     }
4619
4620   f_gpr = TYPE_FIELDS (TREE_TYPE (va_list_type_node));
4621   f_fpr = TREE_CHAIN (f_gpr);
4622   f_ovf = TREE_CHAIN (f_fpr);
4623   f_sav = TREE_CHAIN (f_ovf);
4624
4625   valist = build1 (INDIRECT_REF, TREE_TYPE (TREE_TYPE (valist)), valist);
4626   gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), valist, f_gpr, NULL_TREE);
4627   fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
4628   ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
4629   sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
4630
4631   /* Count number of gp and fp argument registers used.  */
4632   words = current_function_args_info.words;
4633   n_gpr = current_function_args_info.regno;
4634   n_fpr = current_function_args_info.sse_regno;
4635
4636   if (cfun->va_list_gpr_size)
4637     {
4638       type = TREE_TYPE (gpr);
4639       t = build2 (GIMPLE_MODIFY_STMT, type, gpr,
4640                   build_int_cst (type, n_gpr * 8));
4641       TREE_SIDE_EFFECTS (t) = 1;
4642       expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
4643     }
4644
4645   if (cfun->va_list_fpr_size)
4646     {
4647       type = TREE_TYPE (fpr);
4648       t = build2 (GIMPLE_MODIFY_STMT, type, fpr,
4649                   build_int_cst (type, n_fpr * 16 + 8*REGPARM_MAX));
4650       TREE_SIDE_EFFECTS (t) = 1;
4651       expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
4652     }
4653
4654   /* Find the overflow area.  */
4655   type = TREE_TYPE (ovf);
4656   t = make_tree (type, virtual_incoming_args_rtx);
4657   if (words != 0)
4658     t = build2 (PLUS_EXPR, type, t,
4659                 build_int_cst (type, words * UNITS_PER_WORD));
4660   t = build2 (GIMPLE_MODIFY_STMT, type, ovf, t);
4661   TREE_SIDE_EFFECTS (t) = 1;
4662   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
4663
4664   if (cfun->va_list_gpr_size || cfun->va_list_fpr_size)
4665     {
4666       /* Find the register save area.
4667          Prologue of the function save it right above stack frame.  */
4668       type = TREE_TYPE (sav);
4669       t = make_tree (type, frame_pointer_rtx);
4670       t = build2 (GIMPLE_MODIFY_STMT, type, sav, t);
4671       TREE_SIDE_EFFECTS (t) = 1;
4672       expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
4673     }
4674 }
4675
4676 /* Implement va_arg.  Emit into PRE_P the statements that compute the
        address of the next variable argument of type TYPE taken from VALIST,
        and return an indirect reference through that address.  POST_P is
        only forwarded to std_gimplify_va_arg_expr, which handles every
        target except the 64-bit non-MS ABI.

        For the remaining case construct_container decides whether the
        argument can live in registers.  If so, code is emitted that checks
        the gp_offset/fp_offset fields and, when enough registers remain,
        takes the value from the register save area (copying it piecewise
        into a temporary when its pieces are not laid out consecutively);
        otherwise control reaches lab_false and the value is taken from the
        overflow area, which is also the only path when there is no
        container.  Arguments that pass_by_reference reports as indirect are
        fetched as a pointer and dereferenced once more at the end.  */
4677
4678 static tree
4679 ix86_gimplify_va_arg (tree valist, tree type, tree *pre_p, tree *post_p)
4680 {
       /* Register numbers given to construct_container for the integer
          class.  NOTE(review): these look like slot indices into the integer
          part of the save area rather than hard register numbers (compare
          the REGNO * 8 offsets used below) -- confirm.  */
4681   static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
4682   tree f_gpr, f_fpr, f_ovf, f_sav;
4683   tree gpr, fpr, ovf, sav, t;
4684   int size, rsize;
4685   tree lab_false, lab_over = NULL_TREE;
4686   tree addr, t2;
4687   rtx container;
4688   int indirect_p = 0;
4689   tree ptrtype;
4690   enum machine_mode nat_mode;
4691
4692   /* Only 64bit target needs something special.  */
4693   if (!TARGET_64BIT || TARGET_64BIT_MS_ABI)
4694     return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
4695
       /* The four fields of the va_list record, in declaration order.  */
4696   f_gpr = TYPE_FIELDS (TREE_TYPE (va_list_type_node));
4697   f_fpr = TREE_CHAIN (f_gpr);
4698   f_ovf = TREE_CHAIN (f_fpr);
4699   f_sav = TREE_CHAIN (f_ovf);
4700
4701   valist = build_va_arg_indirect_ref (valist);
4702   gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), valist, f_gpr, NULL_TREE);
4703   fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
4704   ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
4705   sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
4706
       /* An argument passed by reference is fetched as a pointer to TYPE;
          from here on TYPE is that pointer type and the extra dereference is
          added just before returning.  */
4707   indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
4708   if (indirect_p)
4709     type = build_pointer_type (type);
4710   size = int_size_in_bytes (type);
       /* Size rounded up to whole words; used to advance the overflow
          pointer.  */
4711   rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
4712
4713   nat_mode = type_natural_mode (type);
4714   container = construct_container (nat_mode, TYPE_MODE (type), type, 0,
4715                                    REGPARM_MAX, SSE_REGPARM_MAX, intreg, 0);
4716
4717   /* Pull the value out of the saved registers.  */
4718
4719   addr = create_tmp_var (ptr_type_node, "addr");
4720   DECL_POINTER_ALIAS_SET (addr) = get_varargs_alias_set ();
4721
4722   if (container)
4723     {
4724       int needed_intregs, needed_sseregs;
4725       bool need_temp;
4726       tree int_addr, sse_addr;
4727
4728       lab_false = create_artificial_label ();
4729       lab_over = create_artificial_label ();
4730
4731       examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
4732
           /* A value spread over several registers (container is not a
              plain REG) needs a temporary when its alignment is above 64
              bits and integer registers are involved, or above 128 bits in
              any case.  NOTE(review): presumably because the save area only
              guarantees the 8-byte and 16-byte slot spacing used below --
              confirm.  */
4733       need_temp = (!REG_P (container)
4734                    && ((needed_intregs && TYPE_ALIGN (type) > 64)
4735                        || TYPE_ALIGN (type) > 128));
4736
4737       /* In case we are passing structure, verify that it is consecutive block
4738          on the register save area.  If not we need to do moves.  */
4739       if (!need_temp && !REG_P (container))
4740         {
4741           /* Verify that all registers are strictly consecutive  */
4742           if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
4743             {
4744               int i;
4745
                   /* Piece I must be SSE register I at byte offset I * 16.  */
4746               for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
4747                 {
4748                   rtx slot = XVECEXP (container, 0, i);
4749                   if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
4750                       || INTVAL (XEXP (slot, 1)) != i * 16)
4751                     need_temp = 1;
4752                 }
4753             }
4754           else
4755             {
4756               int i;
4757
                   /* Piece I must be register number I at byte offset
                      I * 8.  */
4758               for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
4759                 {
4760                   rtx slot = XVECEXP (container, 0, i);
4761                   if (REGNO (XEXP (slot, 0)) != (unsigned int) i
4762                       || INTVAL (XEXP (slot, 1)) != i * 8)
4763                     need_temp = 1;
4764                 }
4765             }
4766         }
           /* Without a temporary the save-area address is the result
              itself, so both source addresses alias ADDR; with one, separate
              source pointers are used and ADDR will point at the
              temporary.  */
4767       if (!need_temp)
4768         {
4769           int_addr = addr;
4770           sse_addr = addr;
4771         }
4772       else
4773         {
4774           int_addr = create_tmp_var (ptr_type_node, "int_addr");
4775           DECL_POINTER_ALIAS_SET (int_addr) = get_varargs_alias_set ();
4776           sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
4777           DECL_POINTER_ALIAS_SET (sse_addr) = get_varargs_alias_set ();
4778         }
4779
4780       /* First ensure that we fit completely in registers.  */
4781       if (needed_intregs)
4782         {
               /* if (gpr >= (REGPARM_MAX - needed_intregs + 1) * 8)
                    goto lab_false;  */
4783           t = build_int_cst (TREE_TYPE (gpr),
4784                              (REGPARM_MAX - needed_intregs + 1) * 8);
4785           t = build2 (GE_EXPR, boolean_type_node, gpr, t);
4786           t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
4787           t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
4788           gimplify_and_add (t, pre_p);
4789         }
4790       if (needed_sseregs)
4791         {
               /* Same test for fpr, whose values start after the
                  REGPARM_MAX * 8 bytes of integer slots.  */
4792           t = build_int_cst (TREE_TYPE (fpr),
4793                              (SSE_REGPARM_MAX - needed_sseregs + 1) * 16
4794                              + REGPARM_MAX * 8);
4795           t = build2 (GE_EXPR, boolean_type_node, fpr, t);
4796           t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
4797           t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
4798           gimplify_and_add (t, pre_p);
4799         }
4800
4801       /* Compute index to start of area used for integer regs.  */
4802       if (needed_intregs)
4803         {
4804           /* int_addr = gpr + sav; */
4805           t = fold_convert (ptr_type_node, gpr);
4806           t = build2 (PLUS_EXPR, ptr_type_node, sav, t);
4807           t = build2 (GIMPLE_MODIFY_STMT, void_type_node, int_addr, t);
4808           gimplify_and_add (t, pre_p);
4809         }
4810       if (needed_sseregs)
4811         {
4812           /* sse_addr = fpr + sav; */
4813           t = fold_convert (ptr_type_node, fpr);
4814           t = build2 (PLUS_EXPR, ptr_type_node, sav, t);
4815           t = build2 (GIMPLE_MODIFY_STMT, void_type_node, sse_addr, t);
4816           gimplify_and_add (t, pre_p);
4817         }
4818       if (need_temp)
4819         {
4820           int i;
4821           tree temp = create_tmp_var (type, "va_arg_tmp");
4822
4823           /* addr = &temp; */
4824           t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
4825           t = build2 (GIMPLE_MODIFY_STMT, void_type_node, addr, t);
4826           gimplify_and_add (t, pre_p);
4827
               /* Copy every piece of the container from its save-area slot
                  to the offset the container records for it inside the
                  temporary.  */
4828           for (i = 0; i < XVECLEN (container, 0); i++)
4829             {
4830               rtx slot = XVECEXP (container, 0, i);
4831               rtx reg = XEXP (slot, 0);
4832               enum machine_mode mode = GET_MODE (reg);
4833               tree piece_type = lang_hooks.types.type_for_mode (mode, 1);
4834               tree addr_type = build_pointer_type (piece_type);
4835               tree src_addr, src;
4836               int src_offset;
4837               tree dest_addr, dest;
4838
4839               if (SSE_REGNO_P (REGNO (reg)))
4840                 {
4841                   src_addr = sse_addr;
4842                   src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
4843                 }
4844               else
4845                 {
4846                   src_addr = int_addr;
4847                   src_offset = REGNO (reg) * 8;
4848                 }
4849               src_addr = fold_convert (addr_type, src_addr);
4850               src_addr = fold_build2 (PLUS_EXPR, addr_type, src_addr,
4851                                       size_int (src_offset));
4852               src = build_va_arg_indirect_ref (src_addr);
4853
4854               dest_addr = fold_convert (addr_type, addr);
4855               dest_addr = fold_build2 (PLUS_EXPR, addr_type, dest_addr,
4856                                        size_int (INTVAL (XEXP (slot, 1))));
4857               dest = build_va_arg_indirect_ref (dest_addr);
4858
4859               t = build2 (GIMPLE_MODIFY_STMT, void_type_node, dest, src);
4860               gimplify_and_add (t, pre_p);
4861             }
4862         }
4863
           /* Advance gp_offset and fp_offset past the registers just
              consumed.  */
4864       if (needed_intregs)
4865         {
4866           t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
4867                       build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
4868           t = build2 (GIMPLE_MODIFY_STMT, TREE_TYPE (gpr), gpr, t);
4869           gimplify_and_add (t, pre_p);
4870         }
4871       if (needed_sseregs)
4872         {
4873           t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
4874                       build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
4875           t = build2 (GIMPLE_MODIFY_STMT, TREE_TYPE (fpr), fpr, t);
4876           gimplify_and_add (t, pre_p);
4877         }
4878
           /* The register path is complete: skip the overflow code.  The
              overflow code starts at lab_false.  */
4879       t = build1 (GOTO_EXPR, void_type_node, lab_over);
4880       gimplify_and_add (t, pre_p);
4881
4882       t = build1 (LABEL_EXPR, void_type_node, lab_false);
4883       append_to_statement_list (t, pre_p);
4884     }
4885
4886   /* ... otherwise out of the overflow area.  */
4887
4888   /* Care for on-stack alignment if needed.  */
4889   if (FUNCTION_ARG_BOUNDARY (VOIDmode, type) <= 64
4890       || integer_zerop (TYPE_SIZE (type)))
4891     t = ovf;
4892   else
4893     {
           /* Round the overflow pointer up: (ovf + align - 1) & -align.  */
4894       HOST_WIDE_INT align = FUNCTION_ARG_BOUNDARY (VOIDmode, type) / 8;
4895       t = build2 (PLUS_EXPR, TREE_TYPE (ovf), ovf,
4896                   build_int_cst (TREE_TYPE (ovf), align - 1));
4897       t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
4898                   build_int_cst (TREE_TYPE (t), -align));
4899     }
4900   gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
4901
       /* addr = (aligned) overflow pointer.  */
4902   t2 = build2 (GIMPLE_MODIFY_STMT, void_type_node, addr, t);
4903   gimplify_and_add (t2, pre_p);
4904
       /* Move the overflow pointer past this argument, in whole words.  */
4905   t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
4906               build_int_cst (TREE_TYPE (t), rsize * UNITS_PER_WORD));
4907   t = build2 (GIMPLE_MODIFY_STMT, TREE_TYPE (ovf), ovf, t);
4908   gimplify_and_add (t, pre_p);
4909
4910   if (container)
4911     {
4912       t = build1 (LABEL_EXPR, void_type_node, lab_over);
4913       append_to_statement_list (t, pre_p);
4914     }
4915
4916   ptrtype = build_pointer_type (type);
4917   addr = fold_convert (ptrtype, addr);
4918
       /* For a by-reference argument ADDR points at the stored pointer; load
          it first so that the final reference yields the object itself.  */
4919   if (indirect_p)
4920     addr = build_va_arg_indirect_ref (addr);
4921   return build_va_arg_indirect_ref (addr);
4922 }
4923 \f
4924 /* Return nonzero if OPNUM's MEM should be matched
4925    in movabs* patterns.  */
4926
4927 int
4928 ix86_check_movabs (rtx insn, int opnum)
4929 {
4930   rtx set, mem;
4931
4932   set = PATTERN (insn);
4933   if (GET_CODE (set) == PARALLEL)
4934     set = XVECEXP (set, 0, 0);
4935   gcc_assert (GET_CODE (set) == SET);
4936   mem = XEXP (set, opnum);
4937   while (GET_CODE (mem) == SUBREG)
4938     mem = SUBREG_REG (mem);
4939   gcc_assert (MEM_P (mem));
4940   return (volatile_ok || !MEM_VOLATILE_P (mem));
4941 }
4942 \f
4943 /* Initialize the table of extra 80387 mathematical constants.  */
4944
4945 static void
4946 init_ext_80387_constants (void)
4947 {
4948   static const char * cst[5] =
4949   {
4950     "0.3010299956639811952256464283594894482",  /* 0: fldlg2  */
4951     "0.6931471805599453094286904741849753009",  /* 1: fldln2  */
4952     "1.4426950408889634073876517827983434472",  /* 2: fldl2e  */
4953     "3.3219280948873623478083405569094566090",  /* 3: fldl2t  */
4954     "3.1415926535897932385128089594061862044",  /* 4: fldpi   */
4955   };
4956   int i;
4957
4958   for (i = 0; i < 5; i++)
4959     {
4960       real_from_string (&ext_80387_constants_table[i], cst[i]);
4961       /* Ensure each constant is rounded to XFmode precision.  */
4962       real_convert (&ext_80387_constants_table[i],
4963                     XFmode, &ext_80387_constants_table[i]);
4964     }
4965
4966   ext_80387_constants_init = 1;
4967 }
4968
4969 /* Return true if the constant is something that can be loaded with
4970    a special instruction.  */
4971
4972 int
4973 standard_80387_constant_p (rtx x)
4974 {
4975   enum machine_mode mode = GET_MODE (x);
4976
4977   REAL_VALUE_TYPE r;
4978
4979   if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
4980     return -1;
4981
4982   if (x == CONST0_RTX (mode))
4983     return 1;
4984   if (x == CONST1_RTX (mode))
4985     return 2;
4986
4987   REAL_VALUE_FROM_CONST_DOUBLE (r, x);
4988
4989   /* For XFmode constants, try to find a special 80387 instruction when
4990      optimizing for size or on those CPUs that benefit from them.  */
4991   if (mode == XFmode
4992       && (optimize_size || TARGET_EXT_80387_CONSTANTS))
4993     {
4994       int i;
4995
4996       if (! ext_80387_constants_init)
4997         init_ext_80387_constants ();
4998
4999       for (i = 0; i < 5; i++)
5000         if (real_identical (&r, &ext_80387_constants_table[i]))
5001           return i + 3;
5002     }
5003
5004   /* Load of the constant -0.0 or -1.0 will be split as
5005      fldz;fchs or fld1;fchs sequence.  */
5006   if (real_isnegzero (&r))
5007     return 8;
5008   if (real_identical (&r, &dconstm1))
5009     return 9;
5010
5011   return 0;
5012 }
5013
5014 /* Return the opcode of the special instruction to be used to load
5015    the constant X.  */
5016
5017 const char *
5018 standard_80387_constant_opcode (rtx x)
5019 {
5020   switch (standard_80387_constant_p (x))
5021     {
5022     case 1:
5023       return "fldz";
5024     case 2:
5025       return "fld1";
5026     case 3:
5027       return "fldlg2";
5028     case 4:
5029       return "fldln2";
5030     case 5:
5031       return "fldl2e";
5032     case 6:
5033       return "fldl2t";
5034     case 7:
5035       return "fldpi";
5036     case 8:
5037     case 9:
5038       return "#";
5039     default:
5040       gcc_unreachable ();
5041     }
5042 }
5043
5044 /* Return the CONST_DOUBLE representing the 80387 constant that is
5045    loaded by the specified special instruction.  The argument IDX
5046    matches the return value from standard_80387_constant_p.  */
5047
5048 rtx
5049 standard_80387_constant_rtx (int idx)
5050 {
5051   int i;
5052
5053   if (! ext_80387_constants_init)
5054     init_ext_80387_constants ();
5055
5056   switch (idx)
5057     {
5058     case 3:
5059     case 4:
5060     case 5:
5061     case 6:
5062     case 7:
5063       i = idx - 3;
5064       break;
5065
5066     default:
5067       gcc_unreachable ();
5068     }
5069
5070   return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
5071                                        XFmode);
5072 }
5073
5074 /* Return 1 if mode is a valid mode for sse.  */
5075 static int
5076 standard_sse_mode_p (enum machine_mode mode)
5077 {
5078   switch (mode)
5079     {
5080     case V16QImode:
5081     case V8HImode:
5082     case V4SImode:
5083     case V2DImode:
5084     case V4SFmode:
5085     case V2DFmode:
5086       return 1;
5087
5088     default:
5089       return 0;
5090     }
5091 }
5092
5093 /* Return 1 if X is FP constant we can load to SSE register w/o using memory.
5094  */
5095 int
5096 standard_sse_constant_p (rtx x)
5097 {
5098   enum machine_mode mode = GET_MODE (x);
5099
5100   if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
5101     return 1;
5102   if (vector_all_ones_operand (x, mode)
5103       && standard_sse_mode_p (mode))
5104     return TARGET_SSE2 ? 2 : -1;
5105
5106   return 0;
5107 }
5108
5109 /* Return the opcode of the special instruction to be used to load
5110    the constant X.  */
5111
5112 const char *
5113 standard_sse_constant_opcode (rtx insn, rtx x)
5114 {
5115   switch (standard_sse_constant_p (x))
5116     {
5117     case 1:
5118       if (get_attr_mode (insn) == MODE_V4SF)
5119         return "xorps\t%0, %0";
5120       else if (get_attr_mode (insn) == MODE_V2DF)
5121         return "xorpd\t%0, %0";
5122       else
5123         return "pxor\t%0, %0";
5124     case 2:
5125       return "pcmpeqd\t%0, %0";
5126     }
5127   gcc_unreachable ();
5128 }
5129
5130 /* Returns 1 if OP contains a symbol reference */
5131
5132 int
5133 symbolic_reference_mentioned_p (rtx op)
5134 {
5135   const char *fmt;
5136   int i;
5137
5138   if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
5139     return 1;
5140
5141   fmt = GET_RTX_FORMAT (GET_CODE (op));
5142   for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
5143     {
5144       if (fmt[i] == 'E')
5145         {
5146           int j;
5147
5148           for (j = XVECLEN (op, i) - 1; j >= 0; j--)
5149             if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
5150               return 1;
5151         }
5152
5153       else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
5154         return 1;
5155     }
5156
5157   return 0;
5158 }
5159
5160 /* Return 1 if it is appropriate to emit `ret' instructions in the
5161    body of a function.  Do this only if the epilogue is simple, needing a
5162    couple of insns.  Prior to reloading, we can't tell how many registers
5163    must be saved, so return 0 then.  Return 0 if there is no frame
5164    marker to de-allocate.  */
5165
5166 int
5167 ix86_can_use_return_insn_p (void)
5168 {
5169   struct ix86_frame frame;
5170
5171   if (! reload_completed || frame_pointer_needed)
5172     return 0;
5173
5174   /* Don't allow more than 32 pop, since that's all we can do
5175      with one instruction.  */
5176   if (current_function_pops_args
5177       && current_function_args_size >= 32768)
5178     return 0;
5179
5180   ix86_compute_frame_layout (&frame);
5181   return frame.to_allocate == 0 && frame.nregs == 0;
5182 }
5183 \f
5184 /* Value should be nonzero if functions must have frame pointers.
5185    Zero means the frame pointer need not be set up (and parms may
5186    be accessed via the stack pointer) in functions that seem suitable.  */
5187
5188 int
5189 ix86_frame_pointer_required (void)
5190 {
5191   /* If we accessed previous frames, then the generated code expects
5192      to be able to access the saved ebp value in our frame.  */
5193   if (cfun->machine->accesses_prev_frame)
5194     return 1;
5195
5196   /* Several x86 os'es need a frame pointer for other reasons,
5197      usually pertaining to setjmp.  */
5198   if (SUBTARGET_FRAME_POINTER_REQUIRED)
5199     return 1;
5200
5201   /* In override_options, TARGET_OMIT_LEAF_FRAME_POINTER turns off
5202      the frame pointer by default.  Turn it back on now if we've not
5203      got a leaf function.  */
5204   if (TARGET_OMIT_LEAF_FRAME_POINTER
5205       && (!current_function_is_leaf
5206           || ix86_current_function_calls_tls_descriptor))
5207     return 1;
5208
5209   if (current_function_profile)
5210     return 1;
5211
5212   return 0;
5213 }
5214
5215 /* Record that the current function accesses previous call frames.  */
5216
5217 void
5218 ix86_setup_frame_addresses (void)
5219 {
5220   cfun->machine->accesses_prev_frame = 1;
5221 }
5222 \f
     /* USE_HIDDEN_LINKONCE is 1 when either the assembler supports hidden
        symbols together with one-only sections, or the target is Mach-O;
        otherwise it is 0.  It selects how the pc thunks are named
        (get_pc_thunk_name) and emitted (ix86_file_end).  */
5223 #if (defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)) || TARGET_MACHO
5224 # define USE_HIDDEN_LINKONCE 1
5225 #else
5226 # define USE_HIDDEN_LINKONCE 0
5227 #endif
5228
     /* Bitmask of registers for which a pc thunk has been requested: bit N
        is set by output_set_got when it calls the thunk for register N, and
        ix86_file_end emits one thunk per set bit.  */
5229 static int pic_labels_used;
5230
5231 /* Fills in the label name that should be used for a pc thunk for
5232    the given register.  */
5233
5234 static void
5235 get_pc_thunk_name (char name[32], unsigned int regno)
5236 {
5237   gcc_assert (!TARGET_64BIT);
5238
5239   if (USE_HIDDEN_LINKONCE)
5240     sprintf (name, "__i686.get_pc_thunk.%s", reg_names[regno]);
5241   else
5242     ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
5243 }
5244
5245
5246 /* Emit the end-of-file output for this port.
        For every register number 0..7 whose bit is set in pic_labels_used,
        emit the -fpic pc thunk named by get_pc_thunk_name: a tiny routine
        that loads that register from the word at the top of the stack
        (the return address of its caller) and then returns.  Afterwards,
        emit the executable-stack indication when NEED_INDICATE_EXEC_STACK
        is set.  */
5248
5249 void
5250 ix86_file_end (void)
5251 {
5252   rtx xops[2];
5253   int regno;
5254
5255   for (regno = 0; regno < 8; ++regno)
5256     {
5257       char name[32];
5258
            /* Skip registers for which no thunk call was ever emitted.  */
5259       if (! ((pic_labels_used >> regno) & 1))
5260         continue;
5261
5262       get_pc_thunk_name (name, regno);
5263
            /* Open the thunk.  Three forms are possible: a weak,
               private-extern definition in the coalesced text section on
               Mach-O; a public but hidden one-only function in its own
               section when USE_HIDDEN_LINKONCE; or a plain label in the
               ordinary text section.  */
5264 #if TARGET_MACHO
5265       if (TARGET_MACHO)
5266         {
5267           switch_to_section (darwin_sections[text_coal_section]);
5268           fputs ("\t.weak_definition\t", asm_out_file);
5269           assemble_name (asm_out_file, name);
5270           fputs ("\n\t.private_extern\t", asm_out_file);
5271           assemble_name (asm_out_file, name);
5272           fputs ("\n", asm_out_file);
5273           ASM_OUTPUT_LABEL (asm_out_file, name);
5274         }
5275       else
5276 #endif
5277       if (USE_HIDDEN_LINKONCE)
5278         {
5279           tree decl;
5280
                /* Build a throw-away FUNCTION_DECL so the generic section
                   and declaration hooks can be used for the thunk.  */
5281           decl = build_decl (FUNCTION_DECL, get_identifier (name),
5282                              error_mark_node);
5283           TREE_PUBLIC (decl) = 1;
5284           TREE_STATIC (decl) = 1;
5285           DECL_ONE_ONLY (decl) = 1;
5286
5287           (*targetm.asm_out.unique_section) (decl, 0);
5288           switch_to_section (get_named_section (decl, NULL, 0));
5289
5290           (*targetm.asm_out.globalize_label) (asm_out_file, name);
5291           fputs ("\t.hidden\t", asm_out_file);
5292           assemble_name (asm_out_file, name);
5293           fputc ('\n', asm_out_file);
5294           ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
5295         }
5296       else
5297         {
5298           switch_to_section (text_section);
5299           ASM_OUTPUT_LABEL (asm_out_file, name);
5300         }
5301
            /* Thunk body: copy the word addressed by the stack pointer
               into register REGNO, then return.  */
5302       xops[0] = gen_rtx_REG (SImode, regno);
5303       xops[1] = gen_rtx_MEM (SImode, stack_pointer_rtx);
5304       output_asm_insn ("mov{l}\t{%1, %0|%0, %1}", xops);
5305       output_asm_insn ("ret", xops);
5306     }
5307
5308   if (NEED_INDICATE_EXEC_STACK)
5309     file_end_indicate_exec_stack ();
5310 }
5311
5312 /* Emit code for the SET_GOT patterns.
        DEST is the register that receives the GOT address.  LABEL, when
        non-null, is the label to use for the pc-relative computation;
        when null a fresh label is generated where one is needed.  (LABEL
        is marked ATTRIBUTE_UNUSED even though it is referenced below.)
        The assembly is written directly with output_asm_insn, so the
        returned template is always the empty string.  */
5313
5314 const char *
5315 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
5316 {
5317   rtx xops[3];
5318
5319   xops[0] = dest;
5320
5321   if (TARGET_VXWORKS_RTP && flag_pic)
5322     {
5323       /* Load (*VXWORKS_GOTT_BASE) into the PIC register.  */
5324       xops[2] = gen_rtx_MEM (Pmode,
5325                              gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
5326       output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
5327
5328       /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
5329          Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
5330          an unadorned address.  */
5331       xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
5332       SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
5333       output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
5334       return "";
5335     }
5336
5337   xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
5338
        /* Two strategies follow.  Without deep branch prediction, or when
           not generating PIC, the address of a local label is obtained
           inline: non-PIC code moves the label address into DEST, PIC code
           calls the label and pops the pushed return address into DEST.
           Otherwise the per-register pc thunk is called.  */
5339   if (! TARGET_DEEP_BRANCH_PREDICTION || !flag_pic)
5340     {
5341       xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
5342
5343       if (!flag_pic)
5344         output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
5345       else
5346         output_asm_insn ("call\t%a2", xops);
5347
5348 #if TARGET_MACHO
5349       /* Output the Mach-O "canonical" label name ("Lxx$pb") here too.  This
5350          is what will be referenced by the Mach-O PIC subsystem.  */
5351       if (!label)
5352         ASM_OUTPUT_LABEL (asm_out_file, machopic_function_base_name ());
5353 #endif
5354
            /* Define the label referenced by the mov/call above.  */
5355       (*targetm.asm_out.internal_label) (asm_out_file, "L",
5356                                  CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
5357
5358       if (flag_pic)
5359         output_asm_insn ("pop{l}\t%0", xops);
5360     }
5361   else
5362     {
5363       char name[32];
            /* Call the pc thunk for DEST and record in pic_labels_used
               that ix86_file_end must emit it.  */
5364       get_pc_thunk_name (name, REGNO (dest));
5365       pic_labels_used |= 1 << REGNO (dest);
5366
5367       xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
5368       xops[2] = gen_rtx_MEM (QImode, xops[2]);
5369       output_asm_insn ("call\t%X2", xops);
5370       /* Output the Mach-O "canonical" label name ("Lxx$pb") here too.  This
5371          is what will be referenced by the Mach-O PIC subsystem.  */
5372 #if TARGET_MACHO
5373       if (!label)
5374         ASM_OUTPUT_LABEL (asm_out_file, machopic_function_base_name ());
5375       else
5376         targetm.asm_out.internal_label (asm_out_file, "L",
5377                                            CODE_LABEL_NUMBER (label));
5378 #endif
5379     }
5380
        /* On Mach-O nothing is added to DEST; the sequence ends here.  */
5381   if (TARGET_MACHO)
5382     return "";
5383
        /* Add the GOT symbol to DEST.  In the inline PIC call/pop case the
           operand also includes the distance from the label to the current
           location.  */
5384   if (!flag_pic || TARGET_DEEP_BRANCH_PREDICTION)
5385     output_asm_insn ("add{l}\t{%1, %0|%0, %1}", xops);
5386   else
5387     output_asm_insn ("add{l}\t{%1+[.-%a2], %0|%0, %1+(.-%a2)}", xops);
5388
5389   return "";
5390 }
5391
5392 /* Generate an "push" pattern for input ARG.  */
5393
5394 static rtx
5395 gen_push (rtx arg)
5396 {
5397   return gen_rtx_SET (VOIDmode,
5398                       gen_rtx_MEM (Pmode,
5399                                    gen_rtx_PRE_DEC (Pmode,
5400                                                     stack_pointer_rtx)),
5401                       arg);
5402 }
5403
5404 /* Return >= 0 if there is an unused call-clobbered register available
5405    for the entire function.  */
5406
5407 static unsigned int
5408 ix86_select_alt_pic_regnum (void)
5409 {
5410   if (current_function_is_leaf && !current_function_profile
5411       && !ix86_current_function_calls_tls_descriptor)
5412     {
5413       int i;
5414       for (i = 2; i >= 0; --i)
5415         if (!regs_ever_live[i])
5416           return i;
5417     }
5418
5419   return INVALID_REGNUM;
5420 }
5421
5422 /* Return 1 if we need to save REGNO.  */
5423 static int
5424 ix86_save_reg (unsigned int regno, int maybe_eh_return)
5425 {
5426   if (pic_offset_table_rtx
5427       && regno == REAL_PIC_OFFSET_TABLE_REGNUM
5428       && (regs_ever_live[REAL_PIC_OFFSET_TABLE_REGNUM]
5429           || current_function_profile
5430           || current_function_calls_eh_return
5431           || current_function_uses_const_pool))
5432     {
5433       if (ix86_select_alt_pic_regnum () != INVALID_REGNUM)
5434         return 0;
5435       return 1;
5436     }
5437
5438   if (current_function_calls_eh_return && maybe_eh_return)
5439     {
5440       unsigned i;
5441       for (i = 0; ; i++)
5442         {
5443           unsigned test = EH_RETURN_DATA_REGNO (i);
5444           if (test == INVALID_REGNUM)
5445             break;
5446           if (test == regno)
5447             return 1;
5448         }
5449     }
5450
5451   if (cfun->machine->force_align_arg_pointer
5452       && regno == REGNO (cfun->machine->force_align_arg_pointer))
5453     return 1;
5454
5455   return (regs_ever_live[regno]
5456           && !call_used_regs[regno]
5457           && !fixed_regs[regno]
5458           && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
5459 }
5460
5461 /* Return number of registers to be saved on the stack.  */
5462
5463 static int
5464 ix86_nsaved_regs (void)
5465 {
5466   int nregs = 0;
5467   int regno;
5468
5469   for (regno = FIRST_PSEUDO_REGISTER - 1; regno >= 0; regno--)
5470     if (ix86_save_reg (regno, true))
5471       nregs++;
5472   return nregs;
5473 }
5474
5475 /* Return the offset between two registers, one to be eliminated, and the other
5476    its replacement, at the start of a routine.  */
5477
5478 HOST_WIDE_INT
5479 ix86_initial_elimination_offset (int from, int to)
5480 {
5481   struct ix86_frame frame;
5482   ix86_compute_frame_layout (&frame);
5483
5484   if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5485     return frame.hard_frame_pointer_offset;
5486   else if (from == FRAME_POINTER_REGNUM
5487            && to == HARD_FRAME_POINTER_REGNUM)
5488     return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
5489   else
5490     {
5491       gcc_assert (to == STACK_POINTER_REGNUM);
5492
5493       if (from == ARG_POINTER_REGNUM)
5494         return frame.stack_pointer_offset;
5495
5496       gcc_assert (from == FRAME_POINTER_REGNUM);
5497       return frame.stack_pointer_offset - frame.frame_pointer_offset;
5498     }
5499 }
5500
5501 /* Fill structure ix86_frame about frame of currently computed function.  */
5502
5503 static void
5504 ix86_compute_frame_layout (struct ix86_frame *frame)
5505 {
5506   HOST_WIDE_INT total_size;
5507   unsigned int stack_alignment_needed;
5508   HOST_WIDE_INT offset;
5509   unsigned int preferred_alignment;
5510   HOST_WIDE_INT size = get_frame_size ();
5511
5512   frame->nregs = ix86_nsaved_regs ();
5513   total_size = size;
5514
5515   stack_alignment_needed = cfun->stack_alignment_needed / BITS_PER_UNIT;
5516   preferred_alignment = cfun->preferred_stack_boundary / BITS_PER_UNIT;
5517
5518   /* During reload iteration the amount of registers saved can change.
5519      Recompute the value as needed.  Do not recompute when amount of registers
5520      didn't change as reload does multiple calls to the function and does not
5521      expect the decision to change within single iteration.  */
5522   if (!optimize_size
5523       && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
5524     {
5525       int count = frame->nregs;
5526
5527       cfun->machine->use_fast_prologue_epilogue_nregs = count;
5528       /* The fast prologue uses move instead of push to save registers.  This
5529          is significantly longer, but also executes faster as modern hardware
5530          can execute the moves in parallel, but can't do that for push/pop.
5531
5532          Be careful about choosing what prologue to emit:  When function takes
5533          many instructions to execute we may use slow version as well as in
5534          case function is known to be outside hot spot (this is known with
5535          feedback only).  Weight the size of function by number of registers
5536          to save as it is cheap to use one or two push instructions but very
5537          slow to use many of them.  */
5538       if (count)
5539         count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
5540       if (cfun->function_frequency < FUNCTION_FREQUENCY_NORMAL
5541           || (flag_branch_probabilities
5542               && cfun->function_frequency < FUNCTION_FREQUENCY_HOT))
5543         cfun->machine->use_fast_prologue_epilogue = false;
5544       else
5545         cfun->machine->use_fast_prologue_epilogue
5546            = !expensive_function_p (count);
5547     }
5548   if (TARGET_PROLOGUE_USING_MOVE
5549       && cfun->machine->use_fast_prologue_epilogue)
5550     frame->save_regs_using_mov = true;
5551   else
5552     frame->save_regs_using_mov = false;
5553
5554
5555   /* Skip return address and saved base pointer.  */
5556   offset = frame_pointer_needed ? UNITS_PER_WORD * 2 : UNITS_PER_WORD;
5557
5558   frame->hard_frame_pointer_offset = offset;
5559
5560   /* Do some sanity checking of stack_alignment_needed and
5561      preferred_alignment, since i386 port is the only using those features
5562      that may break easily.  */
5563
5564   gcc_assert (!size || stack_alignment_needed);
5565   gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
5566   gcc_assert (preferred_alignment <= PREFERRED_STACK_BOUNDARY / BITS_PER_UNIT);
5567   gcc_assert (stack_alignment_needed
5568               <= PREFERRED_STACK_BOUNDARY / BITS_PER_UNIT);
5569
5570   if (stack_alignment_needed < STACK_BOUNDARY / BITS_PER_UNIT)
5571     stack_alignment_needed = STACK_BOUNDARY / BITS_PER_UNIT;
5572
5573   /* Register save area */
5574   offset += frame->nregs * UNITS_PER_WORD;
5575
5576   /* Va-arg area */
5577   if (ix86_save_varrargs_registers)
5578     {
5579       offset += X86_64_VARARGS_SIZE;
5580       frame->va_arg_size = X86_64_VARARGS_SIZE;
5581     }
5582   else
5583     frame->va_arg_size = 0;
5584
5585   /* Align start of frame for local function.  */
5586   frame->padding1 = ((offset + stack_alignment_needed - 1)
5587                      & -stack_alignment_needed) - offset;
5588
5589   offset += frame->padding1;
5590
5591   /* Frame pointer points here.  */
5592   frame->frame_pointer_offset = offset;
5593
5594   offset += size;
5595
5596   /* Add outgoing arguments area.  Can be skipped if we eliminated
5597      all the function calls as dead code.
5598      Skipping is however impossible when function calls alloca.  Alloca
5599      expander assumes that last current_function_outgoing_args_size
5600      of stack frame are unused.  */
5601   if (ACCUMULATE_OUTGOING_ARGS
5602       && (!current_function_is_leaf || current_function_calls_alloca
5603           || ix86_current_function_calls_tls_descriptor))
5604     {
5605       offset += current_function_outgoing_args_size;
5606       frame->outgoing_arguments_size = current_function_outgoing_args_size;
5607     }
5608   else
5609     frame->outgoing_arguments_size = 0;
5610
5611   /* Align stack boundary.  Only needed if we're calling another function
5612      or using alloca.  */
5613   if (!current_function_is_leaf || current_function_calls_alloca
5614       || ix86_current_function_calls_tls_descriptor)
5615     frame->padding2 = ((offset + preferred_alignment - 1)
5616                        & -preferred_alignment) - offset;
5617   else
5618     frame->padding2 = 0;
5619
5620   offset += frame->padding2;
5621
5622   /* We've reached end of stack frame.  */
5623   frame->stack_pointer_offset = offset;
5624
5625   /* Size prologue needs to allocate.  */
5626   frame->to_allocate =
5627     (size + frame->padding1 + frame->padding2
5628      + frame->outgoing_arguments_size + frame->va_arg_size);
5629
5630   if ((!frame->to_allocate && frame->nregs <= 1)
5631       || (TARGET_64BIT && frame->to_allocate >= (HOST_WIDE_INT) 0x80000000))
5632     frame->save_regs_using_mov = false;
5633
5634   if (TARGET_RED_ZONE && current_function_sp_is_unchanging
5635       && current_function_is_leaf
5636       && !ix86_current_function_calls_tls_descriptor)
5637     {
5638       frame->red_zone_size = frame->to_allocate;
5639       if (frame->save_regs_using_mov)
5640         frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
5641       if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
5642         frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
5643     }
5644   else
5645     frame->red_zone_size = 0;
5646   frame->to_allocate -= frame->red_zone_size;
5647   frame->stack_pointer_offset -= frame->red_zone_size;
5648 #if 0
5649   fprintf (stderr, "\n");
5650   fprintf (stderr, "nregs: %ld\n", (long)frame->nregs);
5651   fprintf (stderr, "size: %ld\n", (long)size);
5652   fprintf (stderr, "alignment1: %ld\n", (long)stack_alignment_needed);
5653   fprintf (stderr, "padding1: %ld\n", (long)frame->padding1);
5654   fprintf (stderr, "va_arg: %ld\n", (long)frame->va_arg_size);
5655   fprintf (stderr, "padding2: %ld\n", (long)frame->padding2);
5656   fprintf (stderr, "to_allocate: %ld\n", (long)frame->to_allocate);
5657   fprintf (stderr, "red_zone_size: %ld\n", (long)frame->red_zone_size);
5658   fprintf (stderr, "frame_pointer_offset: %ld\n", (long)frame->frame_pointer_offset);
5659   fprintf (stderr, "hard_frame_pointer_offset: %ld\n",
5660            (long)frame->hard_frame_pointer_offset);
5661   fprintf (stderr, "stack_pointer_offset: %ld\n", (long)frame->stack_pointer_offset);
5662   fprintf (stderr, "current_function_is_leaf: %ld\n", (long)current_function_is_leaf);
5663   fprintf (stderr, "current_function_calls_alloca: %ld\n", (long)current_function_calls_alloca);
5664   fprintf (stderr, "x86_current_function_calls_tls_descriptor: %ld\n", (long)ix86_current_function_calls_tls_descriptor);
5665 #endif
5666 }
5667
5668 /* Emit code to save registers in the prologue.  */
5669
5670 static void
5671 ix86_emit_save_regs (void)
5672 {
5673   unsigned int regno;
5674   rtx insn;
5675
5676   for (regno = FIRST_PSEUDO_REGISTER; regno-- > 0; )
5677     if (ix86_save_reg (regno, true))
5678       {
5679         insn = emit_insn (gen_push (gen_rtx_REG (Pmode, regno)));
5680         RTX_FRAME_RELATED_P (insn) = 1;
5681       }
5682 }
5683
5684 /* Emit code to save registers using MOV insns.  First register
5685    is restored from POINTER + OFFSET.  */
5686 static void
5687 ix86_emit_save_regs_using_mov (rtx pointer, HOST_WIDE_INT offset)
5688 {
5689   unsigned int regno;
5690   rtx insn;
5691
5692   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
5693     if (ix86_save_reg (regno, true))
5694       {
5695         insn = emit_move_insn (adjust_address (gen_rtx_MEM (Pmode, pointer),
5696                                                Pmode, offset),
5697                                gen_rtx_REG (Pmode, regno));
5698         RTX_FRAME_RELATED_P (insn) = 1;
5699         offset += UNITS_PER_WORD;
5700       }
5701 }
5702
5703 /* Expand prologue or epilogue stack adjustment.
5704    The pattern exist to put a dependency on all ebp-based memory accesses.
5705    STYLE should be negative if instructions should be marked as frame related,
5706    zero if %r11 register is live and cannot be freely used and positive
5707    otherwise.  */
5708
5709 static void
5710 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset, int style)
5711 {
5712   rtx insn;
5713
5714   if (! TARGET_64BIT)
5715     insn = emit_insn (gen_pro_epilogue_adjust_stack_1 (dest, src, offset));
5716   else if (x86_64_immediate_operand (offset, DImode))
5717     insn = emit_insn (gen_pro_epilogue_adjust_stack_rex64 (dest, src, offset));
5718   else
5719     {
5720       rtx r11;
5721       /* r11 is used by indirect sibcall return as well, set before the
5722          epilogue and used after the epilogue.  ATM indirect sibcall
5723          shouldn't be used together with huge frame sizes in one
5724          function because of the frame_size check in sibcall.c.  */
5725       gcc_assert (style);
5726       r11 = gen_rtx_REG (DImode, R11_REG);
5727       insn = emit_insn (gen_rtx_SET (DImode, r11, offset));
5728       if (style < 0)
5729         RTX_FRAME_RELATED_P (insn) = 1;
5730       insn = emit_insn (gen_pro_epilogue_adjust_stack_rex64_2 (dest, src, r11,
5731                                                                offset));
5732     }
5733   if (style < 0)
5734     RTX_FRAME_RELATED_P (insn) = 1;
5735 }
5736
5737 /* Handle the TARGET_INTERNAL_ARG_POINTER hook.  */
5738
5739 static rtx
5740 ix86_internal_arg_pointer (void)
5741 {
5742   bool has_force_align_arg_pointer =
5743     (0 != lookup_attribute (ix86_force_align_arg_pointer_string,
5744                             TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))));
5745   if ((FORCE_PREFERRED_STACK_BOUNDARY_IN_MAIN
5746        && DECL_NAME (current_function_decl)
5747        && MAIN_NAME_P (DECL_NAME (current_function_decl))
5748        && DECL_FILE_SCOPE_P (current_function_decl))
5749       || ix86_force_align_arg_pointer
5750       || has_force_align_arg_pointer)
5751     {
5752       /* Nested functions can't realign the stack due to a register
5753          conflict.  */
5754       if (DECL_CONTEXT (current_function_decl)
5755           && TREE_CODE (DECL_CONTEXT (current_function_decl)) == FUNCTION_DECL)
5756         {
5757           if (ix86_force_align_arg_pointer)
5758             warning (0, "-mstackrealign ignored for nested functions");
5759           if (has_force_align_arg_pointer)
5760             error ("%s not supported for nested functions",
5761                    ix86_force_align_arg_pointer_string);
5762           return virtual_incoming_args_rtx;
5763         }
5764       cfun->machine->force_align_arg_pointer = gen_rtx_REG (Pmode, 2);
5765       return copy_to_reg (cfun->machine->force_align_arg_pointer);
5766     }
5767   else
5768     return virtual_incoming_args_rtx;
5769 }
5770
5771 /* Handle the TARGET_DWARF_HANDLE_FRAME_UNSPEC hook.
5772    This is called from dwarf2out.c to emit call frame instructions
5773    for frame-related insns containing UNSPECs and UNSPEC_VOLATILEs. */
5774 static void
5775 ix86_dwarf_handle_frame_unspec (const char *label, rtx pattern, int index)
5776 {
5777   rtx unspec = SET_SRC (pattern);
5778   gcc_assert (GET_CODE (unspec) == UNSPEC);
5779
5780   switch (index)
5781     {
5782     case UNSPEC_REG_SAVE:
5783       dwarf2out_reg_save_reg (label, XVECEXP (unspec, 0, 0),
5784                               SET_DEST (pattern));
5785       break;
5786     case UNSPEC_DEF_CFA:
5787       dwarf2out_def_cfa (label, REGNO (SET_DEST (pattern)),
5788                          INTVAL (XVECEXP (unspec, 0, 0)));
5789       break;
5790     default:
5791       gcc_unreachable ();
5792     }
5793 }
5794
5795 /* Expand the prologue into a bunch of separate insns.
        In order, this emits: the stack realignment sequence when a
        force_align_arg_pointer register is in use; the frame pointer
        setup when one is needed; the register saves (pushes, or moves
        when frame.save_regs_using_mov); the stack allocation, through
        the stack-probing worker when TARGET_STACK_PROBE applies; the
        load of the PIC register when it is used; and a blockage insn for
        profiled functions.  */
5796
5797 void
5798 ix86_expand_prologue (void)
5799 {
5800   rtx insn;
5801   bool pic_reg_used;
5802   struct ix86_frame frame;
5803   HOST_WIDE_INT allocate;
5804
5805   ix86_compute_frame_layout (&frame);
5806
5807   if (cfun->machine->force_align_arg_pointer)
5808     {
5809       rtx x, y;
5810
5811       /* Grab the argument pointer.  */
5812       x = plus_constant (stack_pointer_rtx, 4);
5813       y = cfun->machine->force_align_arg_pointer;
5814       insn = emit_insn (gen_rtx_SET (VOIDmode, y, x));
5815       RTX_FRAME_RELATED_P (insn) = 1;
5816
5817       /* The unwind info consists of two parts: install the fafp as the cfa,
5818          and record the fafp as the "save register" of the stack pointer.
5819          The latter is there in order that the unwinder can see where it
5820          should restore the stack pointer across the and insn.  */
5821       x = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx), UNSPEC_DEF_CFA);
5822       x = gen_rtx_SET (VOIDmode, y, x);
5823       RTX_FRAME_RELATED_P (x) = 1;
5824       y = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, stack_pointer_rtx),
5825                           UNSPEC_REG_SAVE);
5826       y = gen_rtx_SET (VOIDmode, cfun->machine->force_align_arg_pointer, y);
5827       RTX_FRAME_RELATED_P (y) = 1;
5828       x = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, x, y));
5829       x = gen_rtx_EXPR_LIST (REG_FRAME_RELATED_EXPR, x, NULL);
5830       REG_NOTES (insn) = x;
5831
5832       /* Align the stack.  */
5833       emit_insn (gen_andsi3 (stack_pointer_rtx, stack_pointer_rtx,
5834                              GEN_INT (-16)));
5835
5836       /* And here we cheat like madmen with the unwind info.  We force the
5837          cfa register back to sp+4, which is exactly what it was at the
5838          start of the function.  Re-pushing the return address results in
5839          the return at the same spot relative to the cfa, and thus is
5840          correct wrt the unwind info.  */
5841       x = cfun->machine->force_align_arg_pointer;
5842       x = gen_frame_mem (Pmode, plus_constant (x, -4));
5843       insn = emit_insn (gen_push (x));
5844       RTX_FRAME_RELATED_P (insn) = 1;
5845
5846       x = GEN_INT (4);
5847       x = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, x), UNSPEC_DEF_CFA);
5848       x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
5849       x = gen_rtx_EXPR_LIST (REG_FRAME_RELATED_EXPR, x, NULL);
5850       REG_NOTES (insn) = x;
5851     }
5852
5853   /* Note: AT&T enter does NOT have reversed args.  Enter is probably
5854      slower on all targets.  Also sdb doesn't like it.  */
5855
5856   if (frame_pointer_needed)
5857     {
5858       insn = emit_insn (gen_push (hard_frame_pointer_rtx));
5859       RTX_FRAME_RELATED_P (insn) = 1;
5860
5861       insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
5862       RTX_FRAME_RELATED_P (insn) = 1;
5863     }
5864
5865   allocate = frame.to_allocate;
5866
        /* Registers are either pushed now, or stored with moves later; in
           the latter case the stack adjustment below must also cover the
           register save area.  */
5867   if (!frame.save_regs_using_mov)
5868     ix86_emit_save_regs ();
5869   else
5870     allocate += frame.nregs * UNITS_PER_WORD;
5871
5872   /* When using red zone we may start register saving before allocating
5873      the stack frame saving one cycle of the prologue.  */
5874   if (TARGET_RED_ZONE && frame.save_regs_using_mov)
5875     ix86_emit_save_regs_using_mov (frame_pointer_needed ? hard_frame_pointer_rtx
5876                                    : stack_pointer_rtx,
5877                                    -frame.nregs * UNITS_PER_WORD);
5878
5879   if (allocate == 0)
5880     ;
5881   else if (! TARGET_STACK_PROBE || allocate < CHECK_STACK_LIMIT)
5882     pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
5883                                GEN_INT (-allocate), -1);
5884   else
5885     {
5886       /* Only valid for Win32.  */
            /* Hard register 0 carries the allocation size to the
               allocate_stack_worker pattern.  If it is live on entry it is
               pushed first (that push already accounts for one word of the
               allocation) and reloaded from its stack slot afterwards.  */
5887       rtx eax = gen_rtx_REG (Pmode, 0);
5888       bool eax_live;
5889       rtx t;
5890
5891       gcc_assert (!TARGET_64BIT || TARGET_64BIT_MS_ABI);
5892
5893       if (TARGET_64BIT_MS_ABI)
5894         eax_live = false;
5895       else
5896         eax_live = ix86_eax_live_at_start_p ();
5897
5898       if (eax_live)
5899         {
5900           emit_insn (gen_push (eax));
5901           allocate -= UNITS_PER_WORD;
5902         }
5903
5904       emit_move_insn (eax, GEN_INT (allocate));
5905
5906       if (TARGET_64BIT)
5907         insn = gen_allocate_stack_worker_64 (eax);
5908       else
5909         insn = gen_allocate_stack_worker_32 (eax);
5910       insn = emit_insn (insn);
5911       RTX_FRAME_RELATED_P (insn) = 1;
            /* Describe the worker's effect to the unwinder as a plain
               stack pointer decrement by ALLOCATE.  */
5912       t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (-allocate));
5913       t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
5914       REG_NOTES (insn) = gen_rtx_EXPR_LIST (REG_FRAME_RELATED_EXPR,
5915                                             t, REG_NOTES (insn));
5916
5917       if (eax_live)
5918         {
5919           if (frame_pointer_needed)
5920             t = plus_constant (hard_frame_pointer_rtx,
5921                                allocate
5922                                - frame.to_allocate
5923                                - frame.nregs * UNITS_PER_WORD);
5924           else
5925             t = plus_constant (stack_pointer_rtx, allocate);
5926           emit_move_insn (eax, gen_rtx_MEM (Pmode, t));
5927         }
5928     }
5929
        /* Without a red zone the move-based register saves are emitted only
           after the stack has been allocated.  */
5930   if (frame.save_regs_using_mov && !TARGET_RED_ZONE)
5931     {
5932       if (!frame_pointer_needed || !frame.to_allocate)
5933         ix86_emit_save_regs_using_mov (stack_pointer_rtx, frame.to_allocate);
5934       else
5935         ix86_emit_save_regs_using_mov (hard_frame_pointer_rtx,
5936                                        -frame.nregs * UNITS_PER_WORD);
5937     }
5938
        /* Decide whether the PIC register has to be loaded, and redirect
           pic_offset_table_rtx to an alternate register when one is
           available.  */
5939   pic_reg_used = false;
5940   if (pic_offset_table_rtx
5941       && (regs_ever_live[REAL_PIC_OFFSET_TABLE_REGNUM]
5942           || current_function_profile))
5943     {
5944       unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
5945
5946       if (alt_pic_reg_used != INVALID_REGNUM)
5947         REGNO (pic_offset_table_rtx) = alt_pic_reg_used;
5948
5949       pic_reg_used = true;
5950     }
5951
5952   if (pic_reg_used)
5953     {
5954       if (TARGET_64BIT)
5955         {
5956           if (ix86_cmodel == CM_LARGE_PIC)
5957             {
                    /* Large PIC model: take the address of a local label
                       into the PIC register, load the GOT offset relative
                       to that label into a scratch register, and add the
                       two.  */
5958               rtx tmp_reg = gen_rtx_REG (DImode,
5959                                          FIRST_REX_INT_REG + 3 /* R11 */);
5960               rtx label = gen_label_rtx ();
5961               emit_label (label);
5962               LABEL_PRESERVE_P (label) = 1;
5963               gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
5964               insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx, label));
5965               REG_NOTES (insn) = gen_rtx_EXPR_LIST (REG_MAYBE_DEAD, const0_rtx, NULL);
5966               insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
5967               REG_NOTES (insn) = gen_rtx_EXPR_LIST (REG_MAYBE_DEAD, const0_rtx, NULL);
5968               insn = emit_insn (gen_adddi3 (pic_offset_table_rtx,
5969                                             pic_offset_table_rtx, tmp_reg));
5970             }
5971           else
5972             insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
5973         }
5974       else
5975         insn = emit_insn (gen_set_got (pic_offset_table_rtx));
5976
5977       /* Even with accurate pre-reload life analysis, we can wind up
5978          deleting all references to the pic register after reload.
5979          Consider if cross-jumping unifies two sides of a branch
5980          controlled by a comparison vs the only read from a global.
5981          In which case, allow the set_got to be deleted, though we're
5982          too late to do anything about the ebx save in the prologue.  */
5983       REG_NOTES (insn) = gen_rtx_EXPR_LIST (REG_MAYBE_DEAD, const0_rtx, NULL);
5984     }
5985
5986   /* Prevent function calls from being scheduled before the call to mcount.
5987      In the pic_reg_used case, make sure that the got load isn't deleted.  */
5988   if (current_function_profile)
5989     emit_insn (gen_blockage (pic_reg_used ? pic_offset_table_rtx : const0_rtx));
5990 }
5991
5992 /* Emit code to restore saved registers using MOV insns.  First register
5993    is restored from POINTER + OFFSET.

        Each hard register for which ix86_save_reg (regno, MAYBE_EH_RETURN)
        is nonzero is loaded, in ascending register-number order, from
        successive slots UNITS_PER_WORD apart.  */
5994 static void
5995 ix86_emit_restore_regs_using_mov (rtx pointer, HOST_WIDE_INT offset,
5996                                   int maybe_eh_return)
5997 {
5998   int regno;
5999   rtx base_address = gen_rtx_MEM (Pmode, pointer);
6000
6001   for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
6002     if (ix86_save_reg (regno, maybe_eh_return))
6003       {
6004         /* Ensure that adjust_address won't be forced to produce pointer
6005            out of range allowed by x86-64 instruction set.  When OFFSET
                does not survive truncation to SImode, POINTER + OFFSET is
                computed into r11 once; this slot and all later ones are then
                addressed relative to r11, with OFFSET restarted at 0.  */
6006         if (TARGET_64BIT && offset != trunc_int_for_mode (offset, SImode))
6007           {
6008             rtx r11;
6009
6010             r11 = gen_rtx_REG (DImode, R11_REG);
6011             emit_move_insn (r11, GEN_INT (offset));
6012             emit_insn (gen_adddi3 (r11, r11, pointer));
6013             base_address = gen_rtx_MEM (Pmode, r11);
6014             offset = 0;
6015           }
6016         emit_move_insn (gen_rtx_REG (Pmode, regno),
6017                         adjust_address (base_address, Pmode, offset));
6018         offset += UNITS_PER_WORD;
6019       }
6020 }
6021
6022 /* Restore function stack, frame, and registers.

        STYLE selects the epilogue flavor: 2 is the eh_return epilogue
        (EH_RETURN_STACKADJ_RTX is added into the new stack pointer), 0 is
        a sibcall epilogue (no return instruction is emitted), and any other
        value ends with a normal return.  */
6023
6024 void
6025 ix86_expand_epilogue (int style)
6026 {
6027   int regno;
6028   int sp_valid = !frame_pointer_needed || current_function_sp_is_unchanging;
6029   struct ix86_frame frame;
6030   HOST_WIDE_INT offset;
6031
6032   ix86_compute_frame_layout (&frame);
6033
6034   /* Calculate start of saved registers relative to ebp.  Special care
6035      must be taken for the normal return case of a function using
6036      eh_return: the eax and edx registers are marked as saved, but not
6037      restored along this path.  */
6038   offset = frame.nregs;
6039   if (current_function_calls_eh_return && style != 2)
6040     offset -= 2;
6041   offset *= -UNITS_PER_WORD;
6042
6043   /* If we're only restoring one register and sp is not valid then
6044      use a move instruction to restore the register since it's
6045      less work than reloading sp and popping the register.
6046
6047      The default code results in stack adjustment using add/lea instruction,
6048      while this code results in LEAVE instruction (or discrete equivalent),
6049      so it is profitable in some other cases as well.  Especially when there
6050      are no registers to restore.  We also use this code when TARGET_USE_LEAVE
6051      and there is exactly one register to pop.  This heuristic may need some
6052      tuning in future.  */
6053   if ((!sp_valid && frame.nregs <= 1)
6054       || (TARGET_EPILOGUE_USING_MOVE
6055           && cfun->machine->use_fast_prologue_epilogue
6056           && (frame.nregs > 1 || frame.to_allocate))
6057       || (frame_pointer_needed && !frame.nregs && frame.to_allocate)
6058       || (frame_pointer_needed && TARGET_USE_LEAVE
6059           && cfun->machine->use_fast_prologue_epilogue
6060           && frame.nregs == 1)
6061       || current_function_calls_eh_return)
6062     {
6063       /* Restore registers.  We can use ebp or esp to address the memory
6064          locations.  If both are available, default to ebp, since offsets
6065          are known to be small.  Only exception is esp pointing directly to the
6066          end of block of saved registers, where we may simplify addressing
6067          mode.  */
6068
6069       if (!frame_pointer_needed || (sp_valid && !frame.to_allocate))
6070         ix86_emit_restore_regs_using_mov (stack_pointer_rtx,
6071                                           frame.to_allocate, style == 2);
6072       else
6073         ix86_emit_restore_regs_using_mov (hard_frame_pointer_rtx,
6074                                           offset, style == 2);
6075
6076       /* eh_return epilogues need %ecx added to the stack pointer.  */
6077       if (style == 2)
6078         {
6079           rtx tmp, sa = EH_RETURN_STACKADJ_RTX;
6080
6081           if (frame_pointer_needed)
6082             {
                   /* SA becomes frame pointer + SA + one word; it is copied
                      into the stack pointer below, after the saved frame
                      pointer has been reloaded from the frame.  */
6083               tmp = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
6084               tmp = plus_constant (tmp, UNITS_PER_WORD);
6085               emit_insn (gen_rtx_SET (VOIDmode, sa, tmp));
6086
6087               tmp = gen_rtx_MEM (Pmode, hard_frame_pointer_rtx);
6088               emit_move_insn (hard_frame_pointer_rtx, tmp);
6089
6090               pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
6091                                          const0_rtx, style);
6092             }
6093           else
6094             {
                   /* No frame pointer: pop the allocated frame and the
                      register save area together with SA in one addition.  */
6095               tmp = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
6096               tmp = plus_constant (tmp, (frame.to_allocate
6097                                          + frame.nregs * UNITS_PER_WORD));
6098               emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, tmp));
6099             }
6100         }
6101       else if (!frame_pointer_needed)
6102         pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
6103                                    GEN_INT (frame.to_allocate
6104                                             + frame.nregs * UNITS_PER_WORD),
6105                                    style);
6106       /* If not an i386, mov & pop is faster than "leave".  */
6107       else if (TARGET_USE_LEAVE || optimize_size
6108                || !cfun->machine->use_fast_prologue_epilogue)
6109         emit_insn (TARGET_64BIT ? gen_leave_rex64 () : gen_leave ());
6110       else
6111         {
6112           pro_epilogue_adjust_stack (stack_pointer_rtx,
6113                                      hard_frame_pointer_rtx,
6114                                      const0_rtx, style);
6115           if (TARGET_64BIT)
6116             emit_insn (gen_popdi1 (hard_frame_pointer_rtx));
6117           else
6118             emit_insn (gen_popsi1 (hard_frame_pointer_rtx));
6119         }
6120     }
6121   else
6122     {
6123       /* First step is to deallocate the stack frame so that we can
6124          pop the registers.  */
6125       if (!sp_valid)
6126         {
6127           gcc_assert (frame_pointer_needed);
6128           pro_epilogue_adjust_stack (stack_pointer_rtx,
6129                                      hard_frame_pointer_rtx,
6130                                      GEN_INT (offset), style);
6131         }
6132       else if (frame.to_allocate)
6133         pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
6134                                    GEN_INT (frame.to_allocate), style);
6135
           /* Pop every register that ix86_save_reg reports as saved for a
              non-eh_return path, in ascending register-number order.  */
6136       for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
6137         if (ix86_save_reg (regno, false))
6138           {
6139             if (TARGET_64BIT)
6140               emit_insn (gen_popdi1 (gen_rtx_REG (Pmode, regno)));
6141             else
6142               emit_insn (gen_popsi1 (gen_rtx_REG (Pmode, regno)));
6143           }
6144       if (frame_pointer_needed)
6145         {
6146           /* Leave results in shorter dependency chains on CPUs that are
6147              able to grok it fast.  */
6148           if (TARGET_USE_LEAVE)
6149             emit_insn (TARGET_64BIT ? gen_leave_rex64 () : gen_leave ());
6150           else if (TARGET_64BIT)
6151             emit_insn (gen_popdi1 (hard_frame_pointer_rtx));
6152           else
6153             emit_insn (gen_popsi1 (hard_frame_pointer_rtx));
6154         }
6155     }
6156
       /* When force_align_arg_pointer is set, load the stack pointer with
          that register's value minus 4.  NOTE(review): presumably this
          undoes a realignment done by the prologue, which is outside this
          part of the file -- confirm there.  */
6157   if (cfun->machine->force_align_arg_pointer)
6158     {
6159       emit_insn (gen_addsi3 (stack_pointer_rtx,
6160                              cfun->machine->force_align_arg_pointer,
6161                              GEN_INT (-4)));
6162     }
6163
6164   /* Sibcall epilogues don't want a return instruction.  */
6165   if (style == 0)
6166     return;
6167
6168   if (current_function_pops_args && current_function_args_size)
6169     {
6170       rtx popc = GEN_INT (current_function_pops_args);
6171
6172       /* i386 can only pop 64K bytes.  If asked to pop more, pop
6173          return address, do explicit add, and jump indirectly to the
6174          caller.  */
6175
6176       if (current_function_pops_args >= 65536)
6177         {
6178           rtx ecx = gen_rtx_REG (SImode, 2);
6179
6180           /* There is no "pascal" calling convention in any 64bit ABI.  */
6181           gcc_assert (!TARGET_64BIT);
6182
6183           emit_insn (gen_popsi1 (ecx));
6184           emit_insn (gen_addsi3 (stack_pointer_rtx, stack_pointer_rtx, popc));
6185           emit_jump_insn (gen_return_indirect_internal (ecx));
6186         }
6187       else
6188         emit_jump_insn (gen_return_pop_internal (popc));
6189     }
6190   else
6191     emit_jump_insn (gen_return_internal ());
6192 }
6193
6194 /* Reset from the function's potential modifications.

        Restores the register number of pic_offset_table_rtx (when that rtx
        is non-null) to REAL_PIC_OFFSET_TABLE_REGNUM.  FILE is written to
        only by the TARGET_MACHO code below; SIZE is unused.  */
6195
6196 static void
6197 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
6198                                HOST_WIDE_INT size ATTRIBUTE_UNUSED)
6199 {
6200   if (pic_offset_table_rtx)
6201     REGNO (pic_offset_table_rtx) = REAL_PIC_OFFSET_TABLE_REGNUM;
6202 #if TARGET_MACHO
6203   /* Mach-O doesn't support labels at the end of objects, so if
6204      it looks like we might want one, insert a NOP.  */
6205   {
         /* Walk backwards over trailing notes, stopping at the first insn
            that is not a note or that is a deleted-label note.  */
6206     rtx insn = get_last_insn ();
6207     while (insn
6208            && NOTE_P (insn)
6209            && NOTE_LINE_NUMBER (insn) != NOTE_INSN_DELETED_LABEL)
6210       insn = PREV_INSN (insn);
6211     if (insn
6212         && (LABEL_P (insn)
6213             || (NOTE_P (insn)
6214                 && NOTE_LINE_NUMBER (insn) == NOTE_INSN_DELETED_LABEL)))
6215       fputs ("\tnop\n", file);
6216   }
6217 #endif
6218
6219 }
6220 \f
6221 /* Extract the parts of an RTL expression that is a valid memory address
6222    for an instruction.  Return 0 if the structure of the address is
6223    grossly off.  Return -1 if ADDR itself is an ASHIFT, so it is not
6224    strictly valid, but still used for computing length of lea instruction.
        Otherwise return 1.

        On a nonzero return the base, index, disp, scale and seg fields of
        OUT are filled in; OUT is not written when 0 is returned.  */
6225
6226 int
6227 ix86_decompose_address (rtx addr, struct ix86_address *out)
6228 {
6229   rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
6230   rtx base_reg, index_reg;
6231   HOST_WIDE_INT scale = 1;
6232   rtx scale_rtx = NULL_RTX;
6233   int retval = 1;
6234   enum ix86_address_seg seg = SEG_DEFAULT;
6235
6236   if (REG_P (addr) || GET_CODE (addr) == SUBREG)
6237     base = addr;
6238   else if (GET_CODE (addr) == PLUS)
6239     {
6240       rtx addends[4], op;
6241       int n = 0, i;
6242
           /* Flatten the left-nested PLUS chain into ADDENDS, right operands
              first and the innermost left operand last.  Give up if the
              chain has more than four terms.  */
6243       op = addr;
6244       do
6245         {
6246           if (n >= 4)
6247             return 0;
6248           addends[n++] = XEXP (op, 1);
6249           op = XEXP (op, 0);
6250         }
6251       while (GET_CODE (op) == PLUS);
6252       if (n >= 4)
6253         return 0;
6254       addends[n] = op;
6255
           /* Classify each term, starting with the innermost (leftmost) one.
              A second multiply, a third register, a second displacement or
              any unrecognized code rejects the address.  */
6256       for (i = n; i >= 0; --i)
6257         {
6258           op = addends[i];
6259           switch (GET_CODE (op))
6260             {
6261             case MULT:
6262               if (index)
6263                 return 0;
6264               index = XEXP (op, 0);
6265               scale_rtx = XEXP (op, 1);
6266               break;
6267
6268             case UNSPEC:
                   /* Only the thread pointer is accepted, and only once; it
                      is recorded as a segment override (%fs for 64-bit,
                      %gs otherwise).  */
6269               if (XINT (op, 1) == UNSPEC_TP
6270                   && TARGET_TLS_DIRECT_SEG_REFS
6271                   && seg == SEG_DEFAULT)
6272                 seg = TARGET_64BIT ? SEG_FS : SEG_GS;
6273               else
6274                 return 0;
6275               break;
6276
6277             case REG:
6278             case SUBREG:
6279               if (!base)
6280                 base = op;
6281               else if (!index)
6282                 index = op;
6283               else
6284                 return 0;
6285               break;
6286
6287             case CONST:
6288             case CONST_INT:
6289             case SYMBOL_REF:
6290             case LABEL_REF:
6291               if (disp)
6292                 return 0;
6293               disp = op;
6294               break;
6295
6296             default:
6297               return 0;
6298             }
6299         }
6300     }
6301   else if (GET_CODE (addr) == MULT)
6302     {
6303       index = XEXP (addr, 0);           /* index*scale */
6304       scale_rtx = XEXP (addr, 1);
6305     }
6306   else if (GET_CODE (addr) == ASHIFT)
6307     {
6308       rtx tmp;
6309
6310       /* We're called for lea too, which implements ashift on occasion.
              Only constant shift counts 0..3 are accepted; they become
              scales 1, 2, 4 and 8.  */
6311       index = XEXP (addr, 0);
6312       tmp = XEXP (addr, 1);
6313       if (!CONST_INT_P (tmp))
6314         return 0;
6315       scale = INTVAL (tmp);
6316       if ((unsigned HOST_WIDE_INT) scale > 3)
6317         return 0;
6318       scale = 1 << scale;
6319       retval = -1;
6320     }
6321   else
6322     disp = addr;                        /* displacement */
6323
6324   /* Extract the integral value of scale.  */
6325   if (scale_rtx)
6326     {
6327       if (!CONST_INT_P (scale_rtx))
6328         return 0;
6329       scale = INTVAL (scale_rtx);
6330     }
6331
       /* Look through SUBREGs so the checks below see the inner rtx.  */
6332   base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
6333   index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
6334
6335   /* Allow arg pointer, frame pointer and stack pointer as index if there
          is no scaling, by swapping index and base.  */
6336   if (base_reg && index_reg && scale == 1
6337       && (index_reg == arg_pointer_rtx
6338           || index_reg == frame_pointer_rtx
6339           || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
6340     {
6341       rtx tmp;
6342       tmp = base, base = index, index = tmp;
6343       tmp = base_reg, base_reg = index_reg, index_reg = tmp;
6344     }
6345
6346   /* Special case: %ebp cannot be encoded as a base without a displacement.  */
6347   if ((base_reg == hard_frame_pointer_rtx
6348        || base_reg == frame_pointer_rtx
6349        || base_reg == arg_pointer_rtx) && !disp)
6350     disp = const0_rtx;
6351
6352   /* Special case: on K6, [%esi] makes the instruction vector decoded.
6353      Avoid this by transforming to [%esi+0].  */
6354   if (ix86_tune == PROCESSOR_K6 && !optimize_size
6355       && base_reg && !index_reg && !disp
6356       && REG_P (base_reg)
6357       && REGNO_REG_CLASS (REGNO (base_reg)) == SIREG)
6358     disp = const0_rtx;
6359
6360   /* Special case: encode reg+reg instead of reg*2.  */
6361   if (!base && index && scale && scale == 2)
6362     base = index, base_reg = index_reg, scale = 1;
6363
6364   /* Special case: scaling cannot be encoded without base or displacement.  */
6365   if (!base && !disp && index && scale != 1)
6366     disp = const0_rtx;
6367
6368   out->base = base;
6369   out->index = index;
6370   out->disp = disp;
6371   out->scale = scale;
6372   out->seg = seg;
6373
6374   return retval;
6375 }
6376 \f
6377 /* Return cost of the memory address x.
6378    For i386, it is better to use a complex address than let gcc copy
6379    the address into a reg and make a new pseudo.  But not if the address
6380    requires two regs - that would mean more pseudos with longer
6381    lifetimes.  */
6382 static int
6383 ix86_address_cost (rtx x)
6384 {
6385   struct ix86_address parts;
6386   int cost = 1;
6387   int ok = ix86_decompose_address (x, &parts);
6388
       /* X must decompose; both the 1 and the -1 result are accepted.  */
6389   gcc_assert (ok);
6390
6391   if (parts.base && GET_CODE (parts.base) == SUBREG)
6392     parts.base = SUBREG_REG (parts.base);
6393   if (parts.index && GET_CODE (parts.index) == SUBREG)
6394     parts.index = SUBREG_REG (parts.index);
6395
6396   /* More complex memory references are better.  */
6397   if (parts.disp && parts.disp != const0_rtx)
6398     cost--;
6399   if (parts.seg != SEG_DEFAULT)
6400     cost--;
6401
6402   /* Attempt to minimize number of registers in the address.  A part
          counts here when it is not a REG or is a pseudo register: one
          increment if either base or index is such a part ...  */
6403   if ((parts.base
6404        && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
6405       || (parts.index
6406           && (!REG_P (parts.index)
6407               || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
6408     cost++;
6409
       /* ... and a second increment if both are, and they are distinct.  */
6410   if (parts.base
6411       && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
6412       && parts.index
6413       && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
6414       && parts.base != parts.index)
6415     cost++;
6416
6417   /* AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
6418      since its predecode logic can't detect the length of instructions
6419      and it degenerates to vector decoded.  Increase cost of such
6420      addresses here.  The penalty is minimally 2 cycles.  It may be worthwhile
6421      to split such addresses or even refuse such addresses at all.
6422
6423      Following addressing modes are affected:
6424       [base+scale*index]
6425       [scale*index+disp]
6426       [base+index]
6427
6428      The first and last case may be avoidable by explicitly coding the zero in
6429      memory address, but I don't have AMD-K6 machine handy to check this
6430      theory.  */
6431
6432   if (TARGET_K6
6433       && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
6434           || (parts.disp && !parts.base && parts.index && parts.scale != 1)
6435           || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
6436     cost += 10;
6437
6438   return cost;
6439 }
6440 \f
6441 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
6442    this is used to form addresses to local data when -fPIC is in
6443    use.

        Return true iff DISP is a MINUS whose first operand is a LABEL_REF
        or SYMBOL_REF and whose second operand is a SYMBOL_REF named
        "<pic base>".  */
6444
6445 static bool
6446 darwin_local_data_pic (rtx disp)
6447 {
6448   if (GET_CODE (disp) == MINUS)
6449     {
6450       if (GET_CODE (XEXP (disp, 0)) == LABEL_REF
6451           || GET_CODE (XEXP (disp, 0)) == SYMBOL_REF)
6452         if (GET_CODE (XEXP (disp, 1)) == SYMBOL_REF)
6453           {
                 /* The pic base is recognized purely by its symbol name.  */
6454             const char *sym_name = XSTR (XEXP (disp, 1), 0);
6455             if (! strcmp (sym_name, "<pic base>"))
6456               return true;
6457           }
6458     }
6459
6460   return false;
6461 }
6462
6463 /* Determine if a given RTX is a valid constant.  We already know this
6464    satisfies CONSTANT_P.

        Codes not listed in the switch below are accepted; only CONST,
        SYMBOL_REF, CONST_DOUBLE and CONST_VECTOR are examined further.  */
6465
6466 bool
6467 legitimate_constant_p (rtx x)
6468 {
6469   switch (GET_CODE (x))
6470     {
6471     case CONST:
6472       x = XEXP (x, 0);
6473
           /* Strip an outer (plus ... const_int); any other addend makes
              the constant invalid.  */
6474       if (GET_CODE (x) == PLUS)
6475         {
6476           if (!CONST_INT_P (XEXP (x, 1)))
6477             return false;
6478           x = XEXP (x, 0);
6479         }
6480
6481       if (TARGET_MACHO && darwin_local_data_pic (x))
6482         return true;
6483
6484       /* Only some unspecs are valid as "constants".  */
6485       if (GET_CODE (x) == UNSPEC)
6486         switch (XINT (x, 1))
6487           {
6488           case UNSPEC_GOT:
6489           case UNSPEC_GOTOFF:
6490           case UNSPEC_PLTOFF:
6491             return TARGET_64BIT;
6492           case UNSPEC_TPOFF:
6493           case UNSPEC_NTPOFF:
                 /* Valid only when wrapping a local-exec TLS symbol.  */
6494             x = XVECEXP (x, 0, 0);
6495             return (GET_CODE (x) == SYMBOL_REF
6496                     && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
6497           case UNSPEC_DTPOFF:
                 /* Valid only when wrapping a local-dynamic TLS symbol.  */
6498             x = XVECEXP (x, 0, 0);
6499             return (GET_CODE (x) == SYMBOL_REF
6500                     && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
6501           default:
6502             return false;
6503           }
6504
6505       /* We must have drilled down to a symbol.  */
6506       if (GET_CODE (x) == LABEL_REF)
6507         return true;
6508       if (GET_CODE (x) != SYMBOL_REF)
6509         return false;
6510       /* FALLTHRU */
6511
6512     case SYMBOL_REF:
6513       /* TLS symbols are never valid.  */
6514       if (SYMBOL_REF_TLS_MODEL (x))
6515         return false;
6516
6517       /* DLLIMPORT symbols are never valid.  */
6518       if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
6519           && SYMBOL_REF_DLLIMPORT_P (x))
6520         return false;
6521       break;
6522
6523     case CONST_DOUBLE:
           /* A nonzero TImode constant is rejected unless TARGET_64BIT.  */
6524       if (GET_MODE (x) == TImode
6525           && x != CONST0_RTX (TImode)
6526           && !TARGET_64BIT)
6527         return false;
6528       break;
6529
6530     case CONST_VECTOR:
           /* Only the all-zero vector of its mode is accepted.  */
6531       if (x == CONST0_RTX (GET_MODE (x)))
6532         return true;
6533       return false;
6534
6535     default:
6536       break;
6537     }
6538
6539   /* Otherwise we handle everything else in the move patterns.  */
6540   return true;
6541 }
6542
6543 /* Determine if it's legal to put X into the constant pool.  This
6544    is not possible for the address of thread-local symbols, which
6545    is checked above.

        Return true when X must NOT be forced into memory: that is, X is
        not a CONST_INT, CONST_DOUBLE or CONST_VECTOR and
        legitimate_constant_p rejects it.  */
6546
6547 static bool
6548 ix86_cannot_force_const_mem (rtx x)
6549 {
6550   /* We can always put integral constants and vectors in memory.  */
6551   switch (GET_CODE (x))
6552     {
6553     case CONST_INT:
6554     case CONST_DOUBLE:
6555     case CONST_VECTOR:
6556       return false;
6557
6558     default:
6559       break;
6560     }
6561   return !legitimate_constant_p (x);
6562 }
6563
6564 /* Determine if a given RTX is a valid constant address, i.e. X satisfies
        CONSTANT_P and is accepted by legitimate_address_p for Pmode with
        its STRICT argument set to 1.  */
6565
6566 bool
6567 constant_address_p (rtx x)
6568 {
6569   return CONSTANT_P (x) && legitimate_address_p (Pmode, x, 1);
6570 }
6571
6572 /* Nonzero if the constant value X is a legitimate general operand
6573    when generating PIC code.  It is given that flag_pic is on and
6574    that X satisfies CONSTANT_P or is a CONST_DOUBLE.

        Anything other than a CONST, SYMBOL_REF or LABEL_REF is accepted
        unconditionally.  */
6575
6576 bool
6577 legitimate_pic_operand_p (rtx x)
6578 {
6579   rtx inner;
6580
6581   switch (GET_CODE (x))
6582     {
6583     case CONST:
           /* Look inside the CONST, skipping an outer constant offset.  */
6584       inner = XEXP (x, 0);
6585       if (GET_CODE (inner) == PLUS
6586           && CONST_INT_P (XEXP (inner, 1)))
6587         inner = XEXP (inner, 0);
6588
6589       /* Only some unspecs are valid as "constants".  */
6590       if (GET_CODE (inner) == UNSPEC)
6591         switch (XINT (inner, 1))
6592           {
6593           case UNSPEC_GOT:
6594           case UNSPEC_GOTOFF:
6595           case UNSPEC_PLTOFF:
6596             return TARGET_64BIT;
6597           case UNSPEC_TPOFF:
                 /* Valid only when wrapping a local-exec TLS symbol.  */
6598             x = XVECEXP (inner, 0, 0);
6599             return (GET_CODE (x) == SYMBOL_REF
6600                     && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
6601           default:
6602             return false;
6603           }
6604       /* FALLTHRU */
6605
6606     case SYMBOL_REF:
6607     case LABEL_REF:
           /* X is still the original operand here when reached from the
              CONST case without an UNSPEC inside.  */
6608       return legitimate_pic_address_disp_p (x);
6609
6610     default:
6611       return true;
6612     }
6613 }
6614
6615 /* Determine if a given CONST RTX is a valid memory displacement
6616    in PIC mode.  Return nonzero if DISP is valid, zero otherwise.  */
6617
6618 int
6619 legitimate_pic_address_disp_p (rtx disp)
6620 {
6621   bool saw_plus;
6622
6623   /* In 64bit mode we can allow direct addresses of symbols and labels
6624      when they are not dynamic symbols.  */
6625   if (TARGET_64BIT)
6626     {
6627       rtx op0 = disp, op1;
6628
6629       switch (GET_CODE (disp))
6630         {
6631         case LABEL_REF:
6632           return true;
6633
6634         case CONST:
               /* Only (const (plus X (const_int N))) is handled here, with
                  N restricted to the range [-16MB, 16MB).  */
6635           if (GET_CODE (XEXP (disp, 0)) != PLUS)
6636             break;
6637           op0 = XEXP (XEXP (disp, 0), 0);
6638           op1 = XEXP (XEXP (disp, 0), 1);
6639           if (!CONST_INT_P (op1)
6640               || INTVAL (op1) >= 16*1024*1024
6641               || INTVAL (op1) < -16*1024*1024)
6642             break;
6643           if (GET_CODE (op0) == LABEL_REF)
6644             return true;
6645           if (GET_CODE (op0) != SYMBOL_REF)
6646             break;
6647           /* FALLTHRU */
6648
6649         case SYMBOL_REF:
6650           /* TLS references should always be enclosed in UNSPEC.  */
6651           if (SYMBOL_REF_TLS_MODEL (op0))
6652             return false;
               /* Direct references are allowed only to local, non-far
                  symbols, and never in the large PIC code model.  */
6653           if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0)
6654               && ix86_cmodel != CM_LARGE_PIC)
6655             return true;
6656           break;
6657
6658         default:
6659           break;
6660         }
6661     }
       /* Everything else must be a CONST wrapping an UNSPEC (possibly plus
          a constant offset in 32-bit mode).  */
6662   if (GET_CODE (disp) != CONST)
6663     return 0;
6664   disp = XEXP (disp, 0);
6665
6666   if (TARGET_64BIT)
6667     {
6668       /* It is unsafe to allow PLUS expressions.  This limits the allowed
6669          distance of GOT tables.  We should not need these anyway.  */
6670       if (GET_CODE (disp) != UNSPEC
6671           || (XINT (disp, 1) != UNSPEC_GOTPCREL
6672               && XINT (disp, 1) != UNSPEC_GOTOFF
6673               && XINT (disp, 1) != UNSPEC_PLTOFF))
6674         return 0;
6675
6676       if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
6677           && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
6678         return 0;
6679       return 1;
6680     }
6681
       /* 32-bit only from here on.  Strip a constant offset and remember
          it, since some unspecs below do not tolerate one.  */
6682   saw_plus = false;
6683   if (GET_CODE (disp) == PLUS)
6684     {
6685       if (!CONST_INT_P (XEXP (disp, 1)))
6686         return 0;
6687       disp = XEXP (disp, 0);
6688       saw_plus = true;
6689     }
6690
6691   if (TARGET_MACHO && darwin_local_data_pic (disp))
6692     return 1;
6693
6694   if (GET_CODE (disp) != UNSPEC)
6695     return 0;
6696
6697   switch (XINT (disp, 1))
6698     {
6699     case UNSPEC_GOT:
6700       if (saw_plus)
6701         return false;
6702       /* We need to check for both symbols and labels because VxWorks loads
6703          text labels with @GOT rather than @GOTOFF.  See gotoff_operand for
6704          details.  */
6705       return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
6706               || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
6707     case UNSPEC_GOTOFF:
6708       /* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
6709          While the ABI also specifies a 32bit relocation, we don't produce
6710          it in small PIC model at all.  */
6711       if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
6712            || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
6713           && !TARGET_64BIT)
6714         return gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
6715       return false;
6716     case UNSPEC_GOTTPOFF:
6717     case UNSPEC_GOTNTPOFF:
6718     case UNSPEC_INDNTPOFF:
           /* No offset allowed; the operand must be an initial-exec TLS
              symbol.  */
6719       if (saw_plus)
6720         return false;
6721       disp = XVECEXP (disp, 0, 0);
6722       return (GET_CODE (disp) == SYMBOL_REF
6723               && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
6724     case UNSPEC_NTPOFF:
           /* The operand must be a local-exec TLS symbol.  */
6725       disp = XVECEXP (disp, 0, 0);
6726       return (GET_CODE (disp) == SYMBOL_REF
6727               && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
6728     case UNSPEC_DTPOFF:
           /* The operand must be a local-dynamic TLS symbol.  */
6729       disp = XVECEXP (disp, 0, 0);
6730       return (GET_CODE (disp) == SYMBOL_REF
6731               && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
6732     }
6733
       /* Any other unspec is not a valid PIC displacement.  */
6734   return 0;
6735 }
6736
6737 /* GO_IF_LEGITIMATE_ADDRESS recognizes an RTL expression that is a valid
6738    memory address for an instruction.  The MODE argument is the machine mode
6739    for the MEM expression that wants to use this address.
6740
6741    It only recognizes address in canonical form.  LEGITIMIZE_ADDRESS should
6742    convert common non-canonical forms to canonical form so that they will
6743    be recognized.  */
6744
6745 int
6746 legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
6747                       rtx addr, int strict)
6748 {
6749   struct ix86_address parts;
6750   rtx base, index, disp;
6751   HOST_WIDE_INT scale;
6752   const char *reason = NULL;
6753   rtx reason_rtx = NULL_RTX;
6754
6755   if (ix86_decompose_address (addr, &parts) <= 0)
6756     {
6757       reason = "decomposition failed";
6758       goto report_error;
6759     }
6760
6761   base = parts.base;
6762   index = parts.index;
6763   disp = parts.disp;
6764   scale = parts.scale;
6765
6766   /* Validate base register.
6767
6768      Don't allow SUBREG's that span more than a word here.  It can lead to spill
6769      failures when the base is one word out of a two word structure, which is
6770      represented internally as a DImode int.  */
6771
6772   if (base)
6773     {
6774       rtx reg;
6775       reason_rtx = base;
6776
6777       if (REG_P (base))
6778         reg = base;
6779       else if (GET_CODE (base) == SUBREG
6780                && REG_P (SUBREG_REG (base))
6781                && GET_MODE_SIZE (GET_MODE (SUBREG_REG (base)))
6782                   <= UNITS_PER_WORD)
6783         reg = SUBREG_REG (base);
6784       else
6785         {
6786           reason = "base is not a register";
6787           goto report_error;
6788         }
6789
6790       if (GET_MODE (base) != Pmode)
6791         {
6792           reason = "base is not in Pmode";
6793           goto report_error;
6794         }
6795
6796       if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
6797           || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
6798         {
6799           reason = "base is not valid";
6800           goto report_error;
6801         }
6802     }
6803
6804   /* Validate index register.
6805
6806      Don't allow SUBREG's that span more than a word here -- same as above.  */
6807
6808   if (index)
6809     {
6810       rtx reg;
6811       reason_rtx = index;
6812
6813       if (REG_P (index))
6814         reg = index;
6815       else if (GET_CODE (index) == SUBREG
6816                && REG_P (SUBREG_REG (index))
6817                && GET_MODE_SIZE (GET_MODE (SUBREG_REG (index)))
6818                   <= UNITS_PER_WORD)
6819         reg = SUBREG_REG (index);
6820       else
6821         {
6822           reason = "index is not a register";
6823           goto report_error;
6824         }
6825
6826       if (GET_MODE (index) != Pmode)
6827         {
6828           reason = "index is not in Pmode";
6829           goto report_error;
6830         }
6831
6832       if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
6833           || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
6834         {
6835           reason = "index is not valid";
6836           goto report_error;
6837         }
6838     }
6839
6840   /* Validate scale factor.  */
6841   if (scale != 1)
6842     {
6843       reason_rtx = GEN_INT (scale);
6844       if (!index)
6845         {
6846           reason = "scale without index";
6847           goto report_error;
6848         }
6849
6850       if (scale != 2 && scale != 4 && scale != 8)
6851         {
6852           reason = "scale is not a valid multiplier";
6853           goto report_error;
6854         }
6855     }
6856
6857   /* Validate displacement.  */
6858   if (disp)
6859     {
6860       reason_rtx = disp;
6861
6862       if (GET_CODE (disp) == CONST
6863           && GET_CODE (XEXP (disp, 0)) == UNSPEC)
6864         switch (XINT (XEXP (disp, 0), 1))
6865           {
6866           /* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit when
6867              used.  While ABI specify also 32bit relocations, we don't produce
6868              them at all and use IP relative instead.  */
6869           case UNSPEC_GOT:
6870           case UNSPEC_GOTOFF:
6871             gcc_assert (flag_pic);
6872             if (!TARGET_64BIT)
6873               goto is_legitimate_pic;
6874             reason = "64bit address unspec";
6875             goto report_error;
6876
6877           case UNSPEC_GOTPCREL:
6878             gcc_assert (flag_pic);
6879             goto is_legitimate_pic;
6880
6881           case UNSPEC_GOTTPOFF:
6882           case UNSPEC_GOTNTPOFF:
6883           case UNSPEC_INDNTPOFF:
6884           case UNSPEC_NTPOFF:
6885           case UNSPEC_DTPOFF:
6886             break;
6887
6888           default:
6889             reason = "invalid address unspec";
6890             goto report_error;
6891           }
6892
6893       else if (SYMBOLIC_CONST (disp)
6894                && (flag_pic
6895                    || (TARGET_MACHO
6896 #if TARGET_MACHO
6897                        && MACHOPIC_INDIRECT
6898                        && !machopic_operand_p (disp)
6899 #endif
6900                )))
6901         {
6902
6903         is_legitimate_pic:
6904           if (TARGET_64BIT && (index || base))
6905             {
6906               /* foo@dtpoff(%rX) is ok.  */
6907               if (GET_CODE (disp) != CONST
6908                   || GET_CODE (XEXP (disp, 0)) != PLUS
6909                   || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
6910                   || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
6911                   || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
6912                       && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
6913                 {
6914                   reason = "non-constant pic memory reference";
6915                   goto report_error;
6916                 }
6917             }
6918           else if (! legitimate_pic_address_disp_p (disp))
6919             {
6920               reason = "displacement is an invalid pic construct";
6921               goto report_error;
6922             }
6923
6924           /* This code used to verify that a symbolic pic displacement
6925              includes the pic_offset_table_rtx register.
6926
6927              While this is good idea, unfortunately these constructs may
6928              be created by "adds using lea" optimization for incorrect
6929              code like:
6930
6931              int a;
6932              int foo(int i)
6933                {
6934                  return *(&a+i);
6935                }
6936
6937              This code is nonsensical, but results in addressing
6938              GOT table with pic_offset_table_rtx base.  We can't
6939              just refuse it easily, since it gets matched by
6940              "addsi3" pattern, that later gets split to lea in the
6941              case output register differs from input.  While this
6942              can be handled by separate addsi pattern for this case
6943              that never results in lea, this seems to be easier and
6944              correct fix for crash to disable this test.  */
6945         }
6946       else if (GET_CODE (disp) != LABEL_REF
6947                && !CONST_INT_P (disp)
6948                && (GET_CODE (disp) != CONST
6949                    || !legitimate_constant_p (disp))
6950                && (GET_CODE (disp) != SYMBOL_REF
6951                    || !legitimate_constant_p (disp)))
6952         {
6953           reason = "displacement is not constant";
6954           goto report_error;
6955         }
6956       else if (TARGET_64BIT
6957                && !x86_64_immediate_operand (disp, VOIDmode))
6958         {
6959           reason = "displacement is out of range";
6960           goto report_error;
6961         }
6962     }
6963
6964   /* Everything looks valid.  */
6965   return TRUE;
6966
6967  report_error:
6968   return FALSE;
6969 }
6970 \f
6971 /* Return a unique alias set for the GOT.  */
6972
6973 static HOST_WIDE_INT
6974 ix86_GOT_alias_set (void)
6975 {
6976   static HOST_WIDE_INT set = -1;
6977   if (set == -1)
6978     set = new_alias_set ();
6979   return set;
6980 }
6981
6982 /* Return a legitimate reference for ORIG (an address) using the
6983    register REG.  If REG is 0, a new pseudo is generated.
6984
6985    There are two types of references that must be handled:
6986
6987    1. Global data references must load the address from the GOT, via
6988       the PIC reg.  An insn is emitted to do this load, and the reg is
6989       returned.
6990
6991    2. Static data references, constant pool addresses, and code labels
6992       compute the address as an offset from the GOT, whose base is in
6993       the PIC reg.  Static data objects have SYMBOL_FLAG_LOCAL set to
6994       differentiate them from global data objects.  The returned
6995       address is the PIC reg + an unspec constant.
6996
6997    GO_IF_LEGITIMATE_ADDRESS rejects symbolic references unless the PIC
6998    reg also appears in the address.  */
6999
7000 static rtx
7001 legitimize_pic_address (rtx orig, rtx reg)
7002 {
7003   rtx addr = orig;
7004   rtx new = orig;
7005   rtx base;
7006
7007 #if TARGET_MACHO
7008   if (TARGET_MACHO && !TARGET_64BIT)
7009     {
7010       if (reg == 0)
7011         reg = gen_reg_rtx (Pmode);
7012       /* Use the generic Mach-O PIC machinery.  */
7013       return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
7014     }
7015 #endif
7016
7017   if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
7018     new = addr;
7019   else if (TARGET_64BIT
7020            && ix86_cmodel != CM_SMALL_PIC
7021            && gotoff_operand (addr, Pmode))
7022     {
7023       rtx tmpreg;
7024       /* This symbol may be referenced via a displacement from the PIC
7025          base address (@GOTOFF).  */
7026
7027       if (reload_in_progress)
7028         regs_ever_live[PIC_OFFSET_TABLE_REGNUM] = 1;
7029       if (GET_CODE (addr) == CONST)
7030         addr = XEXP (addr, 0);
7031       if (GET_CODE (addr) == PLUS)
7032           {
7033             new = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
7034                                   UNSPEC_GOTOFF);
7035             new = gen_rtx_PLUS (Pmode, new, XEXP (addr, 1));
7036           }
7037         else
7038           new = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
7039       new = gen_rtx_CONST (Pmode, new);
7040       if (!reg)
7041         tmpreg = gen_reg_rtx (Pmode);
7042       else
7043         tmpreg = reg;
7044       emit_move_insn (tmpreg, new);
7045
7046       if (reg != 0)
7047         {
7048           new = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
7049                                      tmpreg, 1, OPTAB_DIRECT);
7050           new = reg;
7051         }
7052       else new = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
7053     }
7054   else if (!TARGET_64BIT && gotoff_operand (addr, Pmode))
7055     {
7056       /* This symbol may be referenced via a displacement from the PIC
7057          base address (@GOTOFF).  */
7058
7059       if (reload_in_progress)
7060         regs_ever_live[PIC_OFFSET_TABLE_REGNUM] = 1;
7061       if (GET_CODE (addr) == CONST)
7062         addr = XEXP (addr, 0);
7063       if (GET_CODE (addr) == PLUS)
7064           {
7065             new = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
7066                                   UNSPEC_GOTOFF);
7067             new = gen_rtx_PLUS (Pmode, new, XEXP (addr, 1));
7068           }
7069         else
7070           new = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
7071       new = gen_rtx_CONST (Pmode, new);
7072       new = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new);
7073
7074       if (reg != 0)
7075         {
7076           emit_move_insn (reg, new);
7077           new = reg;
7078         }
7079     }
7080   else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
7081            /* We can't use @GOTOFF for text labels on VxWorks;
7082               see gotoff_operand.  */
7083            || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
7084     {
7085       /* Given that we've already handled dllimport variables separately
7086          in legitimize_address, and all other variables should satisfy
7087          legitimate_pic_address_disp_p, we should never arrive here.  */
7088       gcc_assert (!TARGET_64BIT_MS_ABI);
7089
7090       if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
7091         {
7092           new = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
7093           new = gen_rtx_CONST (Pmode, new);
7094           new = gen_const_mem (Pmode, new);
7095           set_mem_alias_set (new, ix86_GOT_alias_set ());
7096
7097           if (reg == 0)
7098             reg = gen_reg_rtx (Pmode);
7099           /* Use directly gen_movsi, otherwise the address is loaded
7100              into register for CSE.  We don't want to CSE this addresses,
7101              instead we CSE addresses from the GOT table, so skip this.  */
7102           emit_insn (gen_movsi (reg, new));
7103           new = reg;
7104         }
7105       else
7106         {
7107           /* This symbol must be referenced via a load from the
7108              Global Offset Table (@GOT).  */
7109
7110           if (reload_in_progress)
7111             regs_ever_live[PIC_OFFSET_TABLE_REGNUM] = 1;
7112           new = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
7113           new = gen_rtx_CONST (Pmode, new);
7114           if (TARGET_64BIT)
7115             new = force_reg (Pmode, new);
7116           new = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new);
7117           new = gen_const_mem (Pmode, new);
7118           set_mem_alias_set (new, ix86_GOT_alias_set ());
7119
7120           if (reg == 0)
7121             reg = gen_reg_rtx (Pmode);
7122           emit_move_insn (reg, new);
7123           new = reg;
7124         }
7125     }
7126   else
7127     {
7128       if (CONST_INT_P (addr)
7129           && !x86_64_immediate_operand (addr, VOIDmode))
7130         {
7131           if (reg)
7132             {
7133               emit_move_insn (reg, addr);
7134               new = reg;
7135             }
7136           else
7137             new = force_reg (Pmode, addr);
7138         }
7139       else if (GET_CODE (addr) == CONST)
7140         {
7141           addr = XEXP (addr, 0);
7142
7143           /* We must match stuff we generate before.  Assume the only
7144              unspecs that can get here are ours.  Not that we could do
7145              anything with them anyway....  */
7146           if (GET_CODE (addr) == UNSPEC
7147               || (GET_CODE (addr) == PLUS
7148                   && GET_CODE (XEXP (addr, 0)) == UNSPEC))
7149             return orig;
7150           gcc_assert (GET_CODE (addr) == PLUS);
7151         }
7152       if (GET_CODE (addr) == PLUS)
7153         {
7154           rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
7155
7156           /* Check first to see if this is a constant offset from a @GOTOFF
7157              symbol reference.  */
7158           if (gotoff_operand (op0, Pmode)
7159               && CONST_INT_P (op1))
7160             {
7161               if (!TARGET_64BIT)
7162                 {
7163                   if (reload_in_progress)
7164                     regs_ever_live[PIC_OFFSET_TABLE_REGNUM] = 1;
7165                   new = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
7166                                         UNSPEC_GOTOFF);
7167                   new = gen_rtx_PLUS (Pmode, new, op1);
7168                   new = gen_rtx_CONST (Pmode, new);
7169                   new = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new);
7170
7171                   if (reg != 0)
7172                     {
7173                       emit_move_insn (reg, new);
7174                       new = reg;
7175                     }
7176                 }
7177               else
7178                 {
7179                   if (INTVAL (op1) < -16*1024*1024
7180                       || INTVAL (op1) >= 16*1024*1024)
7181                     {
7182                       if (!x86_64_immediate_operand (op1, Pmode))
7183                         op1 = force_reg (Pmode, op1);
7184                       new = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
7185                     }
7186                 }
7187             }
7188           else
7189             {
7190               base = legitimize_pic_address (XEXP (addr, 0), reg);
7191               new  = legitimize_pic_address (XEXP (addr, 1),
7192                                              base == reg ? NULL_RTX : reg);
7193
7194               if (CONST_INT_P (new))
7195                 new = plus_constant (base, INTVAL (new));
7196               else
7197                 {
7198                   if (GET_CODE (new) == PLUS && CONSTANT_P (XEXP (new, 1)))
7199                     {
7200                       base = gen_rtx_PLUS (Pmode, base, XEXP (new, 0));
7201                       new = XEXP (new, 1);
7202                     }
7203                   new = gen_rtx_PLUS (Pmode, base, new);
7204                 }
7205             }
7206         }
7207     }
7208   return new;
7209 }
7210 \f
7211 /* Load the thread pointer.  If TO_REG is true, force it into a register.  */
7212
7213 static rtx
7214 get_thread_pointer (int to_reg)
7215 {
7216   rtx tp, reg, insn;
7217
7218   tp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
7219   if (!to_reg)
7220     return tp;
7221
7222   reg = gen_reg_rtx (Pmode);
7223   insn = gen_rtx_SET (VOIDmode, reg, tp);
7224   insn = emit_insn (insn);
7225
7226   return reg;
7227 }
7228
7229 /* A subroutine of legitimize_address and ix86_expand_move.  FOR_MOV is
7230    false if we expect this to be used for a memory address and true if
7231    we expect to load the address into a register.  */
7232
7233 static rtx
7234 legitimize_tls_address (rtx x, enum tls_model model, int for_mov)
7235 {
7236   rtx dest, base, off, pic, tp;
7237   int type;
7238
7239   switch (model)
7240     {
7241     case TLS_MODEL_GLOBAL_DYNAMIC:
7242       dest = gen_reg_rtx (Pmode);
7243       tp = TARGET_GNU2_TLS ? get_thread_pointer (1) : 0;
7244
7245       if (TARGET_64BIT && ! TARGET_GNU2_TLS)
7246         {
7247           rtx rax = gen_rtx_REG (Pmode, 0), insns;
7248
7249           start_sequence ();
7250           emit_call_insn (gen_tls_global_dynamic_64 (rax, x));
7251           insns = get_insns ();
7252           end_sequence ();
7253
7254           emit_libcall_block (insns, dest, rax, x);
7255         }
7256       else if (TARGET_64BIT && TARGET_GNU2_TLS)
7257         emit_insn (gen_tls_global_dynamic_64 (dest, x));
7258       else
7259         emit_insn (gen_tls_global_dynamic_32 (dest, x));
7260
7261       if (TARGET_GNU2_TLS)
7262         {
7263           dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
7264
7265           set_unique_reg_note (get_last_insn (), REG_EQUIV, x);
7266         }
7267       break;
7268
7269     case TLS_MODEL_LOCAL_DYNAMIC:
7270       base = gen_reg_rtx (Pmode);
7271       tp = TARGET_GNU2_TLS ? get_thread_pointer (1) : 0;
7272
7273       if (TARGET_64BIT && ! TARGET_GNU2_TLS)
7274         {
7275           rtx rax = gen_rtx_REG (Pmode, 0), insns, note;
7276
7277           start_sequence ();
7278           emit_call_insn (gen_tls_local_dynamic_base_64 (rax));
7279           insns = get_insns ();
7280           end_sequence ();
7281
7282           note = gen_rtx_EXPR_LIST (VOIDmode, const0_rtx, NULL);
7283           note = gen_rtx_EXPR_LIST (VOIDmode, ix86_tls_get_addr (), note);
7284           emit_libcall_block (insns, base, rax, note);
7285         }
7286       else if (TARGET_64BIT && TARGET_GNU2_TLS)
7287         emit_insn (gen_tls_local_dynamic_base_64 (base));
7288       else
7289         emit_insn (gen_tls_local_dynamic_base_32 (base));
7290
7291       if (TARGET_GNU2_TLS)
7292         {
7293           rtx x = ix86_tls_module_base ();
7294
7295           set_unique_reg_note (get_last_insn (), REG_EQUIV,
7296                                gen_rtx_MINUS (Pmode, x, tp));
7297         }
7298
7299       off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
7300       off = gen_rtx_CONST (Pmode, off);
7301
7302       dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
7303
7304       if (TARGET_GNU2_TLS)
7305         {
7306           dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
7307
7308           set_unique_reg_note (get_last_insn (), REG_EQUIV, x);
7309         }
7310
7311       break;
7312
7313     case TLS_MODEL_INITIAL_EXEC:
7314       if (TARGET_64BIT)
7315         {
7316           pic = NULL;
7317           type = UNSPEC_GOTNTPOFF;
7318         }
7319       else if (flag_pic)
7320         {
7321           if (reload_in_progress)
7322             regs_ever_live[PIC_OFFSET_TABLE_REGNUM] = 1;
7323           pic = pic_offset_table_rtx;
7324           type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
7325         }
7326       else if (!TARGET_ANY_GNU_TLS)
7327         {
7328           pic = gen_reg_rtx (Pmode);
7329           emit_insn (gen_set_got (pic));
7330           type = UNSPEC_GOTTPOFF;
7331         }
7332       else
7333         {
7334           pic = NULL;
7335           type = UNSPEC_INDNTPOFF;
7336         }
7337
7338       off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), type);
7339       off = gen_rtx_CONST (Pmode, off);
7340       if (pic)
7341         off = gen_rtx_PLUS (Pmode, pic, off);
7342       off = gen_const_mem (Pmode, off);
7343       set_mem_alias_set (off, ix86_GOT_alias_set ());
7344
7345       if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
7346         {
7347           base = get_thread_pointer (for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
7348           off = force_reg (Pmode, off);
7349           return gen_rtx_PLUS (Pmode, base, off);
7350         }
7351       else
7352         {
7353           base = get_thread_pointer (true);
7354           dest = gen_reg_rtx (Pmode);
7355           emit_insn (gen_subsi3 (dest, base, off));
7356         }
7357       break;
7358
7359     case TLS_MODEL_LOCAL_EXEC:
7360       off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
7361                             (TARGET_64BIT || TARGET_ANY_GNU_TLS)
7362                             ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
7363       off = gen_rtx_CONST (Pmode, off);
7364
7365       if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
7366         {
7367           base = get_thread_pointer (for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
7368           return gen_rtx_PLUS (Pmode, base, off);
7369         }
7370       else
7371         {
7372           base = get_thread_pointer (true);
7373           dest = gen_reg_rtx (Pmode);
7374           emit_insn (gen_subsi3 (dest, base, off));
7375         }
7376       break;
7377
7378     default:
7379       gcc_unreachable ();
7380     }
7381
7382   return dest;
7383 }
7384
7385 /* Create or return the unique __imp_DECL dllimport symbol corresponding
7386    to symbol DECL.  */
7387
7388 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
7389   htab_t dllimport_map;
7390
7391 static tree
7392 get_dllimport_decl (tree decl)
7393 {
7394   struct tree_map *h, in;
7395   void **loc;
7396   const char *name;
7397   const char *prefix;
7398   size_t namelen, prefixlen;
7399   char *imp_name;
7400   tree to;
7401   rtx rtl;
7402
7403   if (!dllimport_map)
7404     dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
7405
7406   in.hash = htab_hash_pointer (decl);
7407   in.base.from = decl;
7408   loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
7409   h = *loc;
7410   if (h)
7411     return h->to;
7412
7413   *loc = h = ggc_alloc (sizeof (struct tree_map));
7414   h->hash = in.hash;
7415   h->base.from = decl;
7416   h->to = to = build_decl (VAR_DECL, NULL, ptr_type_node);
7417   DECL_ARTIFICIAL (to) = 1;
7418   DECL_IGNORED_P (to) = 1;
7419   DECL_EXTERNAL (to) = 1;
7420   TREE_READONLY (to) = 1;
7421
7422   name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
7423   name = targetm.strip_name_encoding (name);
7424   if (name[0] == FASTCALL_PREFIX)
7425     {
7426       name++;
7427       prefix = "*__imp_";
7428     }
7429   else
7430     prefix = "*__imp__";
7431
7432   namelen = strlen (name);
7433   prefixlen = strlen (prefix);
7434   imp_name = alloca (namelen + prefixlen + 1);
7435   memcpy (imp_name, prefix, prefixlen);
7436   memcpy (imp_name + prefixlen, name, namelen + 1);
7437
7438   name = ggc_alloc_string (imp_name, namelen + prefixlen);
7439   rtl = gen_rtx_SYMBOL_REF (Pmode, name);
7440   SET_SYMBOL_REF_DECL (rtl, to);
7441   SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL;
7442
7443   rtl = gen_const_mem (Pmode, rtl);
7444   set_mem_alias_set (rtl, ix86_GOT_alias_set ());
7445
7446   SET_DECL_RTL (to, rtl);
7447
7448   return to;
7449 }
7450
7451 /* Expand SYMBOL into its corresponding dllimport symbol.  WANT_REG is
7452    true if we require the result be a register.  */
7453
7454 static rtx
7455 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
7456 {
7457   tree imp_decl;
7458   rtx x;
7459
7460   gcc_assert (SYMBOL_REF_DECL (symbol));
7461   imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol));
7462
7463   x = DECL_RTL (imp_decl);
7464   if (want_reg)
7465     x = force_reg (Pmode, x);
7466   return x;
7467 }
7468
7469 /* Try machine-dependent ways of modifying an illegitimate address
7470    to be legitimate.  If we find one, return the new, valid address.
7471    This macro is used in only one place: `memory_address' in explow.c.
7472
7473    OLDX is the address as it was before break_out_memory_refs was called.
7474    In some cases it is useful to look at this to decide what needs to be done.
7475
7476    MODE and WIN are passed so that this macro can use
7477    GO_IF_LEGITIMATE_ADDRESS.
7478
7479    It is always safe for this macro to do nothing.  It exists to recognize
7480    opportunities to optimize the output.
7481
7482    For the 80386, we handle X+REG by loading X into a register R and
7483    using R+REG.  R will go in a general reg and indexing will be used.
7484    However, if REG is a broken-out memory address or multiplication,
7485    nothing needs to be done because REG can certainly go in a general reg.
7486
7487    When -fpic is used, special handling is needed for symbolic references.
7488    See comments by legitimize_pic_address in i386.c for details.  */
7489
7490 rtx
7491 legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED, enum machine_mode mode)
7492 {
7493   int changed = 0;
7494   unsigned log;
7495
7496   log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
7497   if (log)
7498     return legitimize_tls_address (x, log, false);
7499   if (GET_CODE (x) == CONST
7500       && GET_CODE (XEXP (x, 0)) == PLUS
7501       && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
7502       && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
7503     {
7504       rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0), log, false);
7505       return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
7506     }
7507
7508   if (flag_pic && SYMBOLIC_CONST (x))
7509     return legitimize_pic_address (x, 0);
7510
7511   if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
7512     {
7513       if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (x))
7514         return legitimize_dllimport_symbol (x, true);
7515       if (GET_CODE (x) == CONST
7516           && GET_CODE (XEXP (x, 0)) == PLUS
7517           && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
7518           && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (x, 0), 0)))
7519         {
7520           rtx t = legitimize_dllimport_symbol (XEXP (XEXP (x, 0), 0), true);
7521           return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
7522         }
7523     }
7524
7525   /* Canonicalize shifts by 0, 1, 2, 3 into multiply */
7526   if (GET_CODE (x) == ASHIFT
7527       && CONST_INT_P (XEXP (x, 1))
7528       && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
7529     {
7530       changed = 1;
7531       log = INTVAL (XEXP (x, 1));
7532       x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
7533                         GEN_INT (1 << log));
7534     }
7535
7536   if (GET_CODE (x) == PLUS)
7537     {
7538       /* Canonicalize shifts by 0, 1, 2, 3 into multiply.  */
7539
7540       if (GET_CODE (XEXP (x, 0)) == ASHIFT
7541           && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7542           && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
7543         {
7544           changed = 1;
7545           log = INTVAL (XEXP (XEXP (x, 0), 1));
7546           XEXP (x, 0) = gen_rtx_MULT (Pmode,
7547                                       force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
7548                                       GEN_INT (1 << log));
7549         }
7550
7551       if (GET_CODE (XEXP (x, 1)) == ASHIFT
7552           && CONST_INT_P (XEXP (XEXP (x, 1), 1))
7553           && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
7554         {
7555           changed = 1;
7556           log = INTVAL (XEXP (XEXP (x, 1), 1));
7557           XEXP (x, 1) = gen_rtx_MULT (Pmode,
7558                                       force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
7559                                       GEN_INT (1 << log));
7560         }
7561
7562       /* Put multiply first if it isn't already.  */
7563       if (GET_CODE (XEXP (x, 1)) == MULT)
7564         {
7565           rtx tmp = XEXP (x, 0);
7566           XEXP (x, 0) = XEXP (x, 1);
7567           XEXP (x, 1) = tmp;
7568           changed = 1;
7569         }
7570
7571       /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
7572          into (plus (plus (mult (reg) (const)) (reg)) (const)).  This can be
7573          created by virtual register instantiation, register elimination, and
7574          similar optimizations.  */
7575       if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
7576         {
7577           changed = 1;
7578           x = gen_rtx_PLUS (Pmode,
7579                             gen_rtx_PLUS (Pmode, XEXP (x, 0),
7580                                           XEXP (XEXP (x, 1), 0)),
7581                             XEXP (XEXP (x, 1), 1));
7582         }
7583
7584       /* Canonicalize
7585          (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
7586          into (plus (plus (mult (reg) (const)) (reg)) (const)).  */
7587       else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
7588                && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
7589                && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
7590                && CONSTANT_P (XEXP (x, 1)))
7591         {
7592           rtx constant;
7593           rtx other = NULL_RTX;
7594
7595           if (CONST_INT_P (XEXP (x, 1)))
7596             {
7597               constant = XEXP (x, 1);
7598               other = XEXP (XEXP (XEXP (x, 0), 1), 1);
7599             }
7600           else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
7601             {
7602               constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
7603               other = XEXP (x, 1);
7604             }
7605           else
7606             constant = 0;
7607
7608           if (constant)
7609             {
7610               changed = 1;
7611               x = gen_rtx_PLUS (Pmode,
7612                                 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
7613                                               XEXP (XEXP (XEXP (x, 0), 1), 0)),
7614                                 plus_constant (other, INTVAL (constant)));
7615             }
7616         }
7617
7618       if (changed && legitimate_address_p (mode, x, FALSE))
7619         return x;
7620
7621       if (GET_CODE (XEXP (x, 0)) == MULT)
7622         {
7623           changed = 1;
7624           XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
7625         }
7626
7627       if (GET_CODE (XEXP (x, 1)) == MULT)
7628         {
7629           changed = 1;
7630           XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
7631         }
7632
7633       if (changed
7634           && REG_P (XEXP (x, 1))
7635           && REG_P (XEXP (x, 0)))
7636         return x;
7637
7638       if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
7639         {
7640           changed = 1;
7641           x = legitimize_pic_address (x, 0);
7642         }
7643
7644       if (changed && legitimate_address_p (mode, x, FALSE))
7645         return x;
7646
7647       if (REG_P (XEXP (x, 0)))
7648         {
7649           rtx temp = gen_reg_rtx (Pmode);
7650           rtx val  = force_operand (XEXP (x, 1), temp);
7651           if (val != temp)
7652             emit_move_insn (temp, val);
7653
7654           XEXP (x, 1) = temp;
7655           return x;
7656         }
7657
7658       else if (REG_P (XEXP (x, 1)))
7659         {
7660           rtx temp = gen_reg_rtx (Pmode);
7661           rtx val  = force_operand (XEXP (x, 0), temp);
7662           if (val != temp)
7663             emit_move_insn (temp, val);
7664
7665           XEXP (x, 0) = temp;
7666           return x;
7667         }
7668     }
7669
7670   return x;
7671 }
7672 \f
7673 /* Print an integer constant expression in assembler syntax.  Addition
7674    and subtraction are the only arithmetic that may appear in these
7675    expressions.  FILE is the stdio stream to write to, X is the rtx, and
7676    CODE is the operand print code from the output string.  */
7677
7678 static void
7679 output_pic_addr_const (FILE *file, rtx x, int code)
7680 {
7681   char buf[256];
7682
7683   switch (GET_CODE (x))
7684     {
7685     case PC:
7686       gcc_assert (flag_pic);
7687       putc ('.', file);
7688       break;
7689
7690     case SYMBOL_REF:
7691       if (! TARGET_MACHO || TARGET_64BIT)
7692         output_addr_const (file, x);
7693       else
7694         {
7695           const char *name = XSTR (x, 0);
7696
7697           /* Mark the decl as referenced so that cgraph will
7698              output the function.  */
7699           if (SYMBOL_REF_DECL (x))
7700             mark_decl_referenced (SYMBOL_REF_DECL (x));
7701
7702 #if TARGET_MACHO
7703           if (MACHOPIC_INDIRECT
7704               && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
7705             name = machopic_indirection_name (x, /*stub_p=*/true);
7706 #endif
7707           assemble_name (file, name);
7708         }
7709       if (!TARGET_MACHO && !TARGET_64BIT_MS_ABI
7710           && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
7711         fputs ("@PLT", file);
7712       break;
7713
7714     case LABEL_REF:
7715       x = XEXP (x, 0);
7716       /* FALLTHRU */
7717     case CODE_LABEL:
7718       ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
7719       assemble_name (asm_out_file, buf);
7720       break;
7721
7722     case CONST_INT:
7723       fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
7724       break;
7725
7726     case CONST:
7727       /* This used to output parentheses around the expression,
7728          but that does not work on the 386 (either ATT or BSD assembler).  */
7729       output_pic_addr_const (file, XEXP (x, 0), code);
7730       break;
7731
7732     case CONST_DOUBLE:
7733       if (GET_MODE (x) == VOIDmode)
7734         {
7735           /* We can use %d if the number is <32 bits and positive.  */
7736           if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
7737             fprintf (file, "0x%lx%08lx",
7738                      (unsigned long) CONST_DOUBLE_HIGH (x),
7739                      (unsigned long) CONST_DOUBLE_LOW (x));
7740           else
7741             fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
7742         }
7743       else
7744         /* We can't handle floating point constants;
7745            PRINT_OPERAND must handle them.  */
7746         output_operand_lossage ("floating constant misused");
7747       break;
7748
7749     case PLUS:
7750       /* Some assemblers need integer constants to appear first.  */
7751       if (CONST_INT_P (XEXP (x, 0)))
7752         {
7753           output_pic_addr_const (file, XEXP (x, 0), code);
7754           putc ('+', file);
7755           output_pic_addr_const (file, XEXP (x, 1), code);
7756         }
7757       else
7758         {
7759           gcc_assert (CONST_INT_P (XEXP (x, 1)));
7760           output_pic_addr_const (file, XEXP (x, 1), code);
7761           putc ('+', file);
7762           output_pic_addr_const (file, XEXP (x, 0), code);
7763         }
7764       break;
7765
7766     case MINUS:
7767       if (!TARGET_MACHO)
7768         putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
7769       output_pic_addr_const (file, XEXP (x, 0), code);
7770       putc ('-', file);
7771       output_pic_addr_const (file, XEXP (x, 1), code);
7772       if (!TARGET_MACHO)
7773         putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
7774       break;
7775
7776      case UNSPEC:
7777        gcc_assert (XVECLEN (x, 0) == 1);
7778        output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
7779        switch (XINT (x, 1))
7780         {
7781         case UNSPEC_GOT:
7782           fputs ("@GOT", file);
7783           break;
7784         case UNSPEC_GOTOFF:
7785           fputs ("@GOTOFF", file);
7786           break;
7787         case UNSPEC_PLTOFF:
7788           fputs ("@PLTOFF", file);
7789           break;
7790         case UNSPEC_GOTPCREL:
7791           fputs ("@GOTPCREL(%rip)", file);
7792           break;
7793         case UNSPEC_GOTTPOFF:
7794           /* FIXME: This might be @TPOFF in Sun ld too.  */
7795           fputs ("@GOTTPOFF", file);
7796           break;
7797         case UNSPEC_TPOFF:
7798           fputs ("@TPOFF", file);
7799           break;
7800         case UNSPEC_NTPOFF:
7801           if (TARGET_64BIT)
7802             fputs ("@TPOFF", file);
7803           else
7804             fputs ("@NTPOFF", file);
7805           break;
7806         case UNSPEC_DTPOFF:
7807           fputs ("@DTPOFF", file);
7808           break;
7809         case UNSPEC_GOTNTPOFF:
7810           if (TARGET_64BIT)
7811             fputs ("@GOTTPOFF(%rip)", file);
7812           else
7813             fputs ("@GOTNTPOFF", file);
7814           break;
7815         case UNSPEC_INDNTPOFF:
7816           fputs ("@INDNTPOFF", file);
7817           break;
7818         default:
7819           output_operand_lossage ("invalid UNSPEC as operand");
7820           break;
7821         }
7822        break;
7823
7824     default:
7825       output_operand_lossage ("invalid expression as operand");
7826     }
7827 }
7828
7829 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
7830    We need to emit DTP-relative relocations.  */
7831
7832 static void ATTRIBUTE_UNUSED
7833 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
7834 {
7835   fputs (ASM_LONG, file);
7836   output_addr_const (file, x);
7837   fputs ("@DTPOFF", file);
7838   switch (size)
7839     {
7840     case 4:
7841       break;
7842     case 8:
7843       fputs (", 0", file);
7844       break;
7845     default:
7846       gcc_unreachable ();
7847    }
7848 }
7849
7850 /* In the name of slightly smaller debug output, and to cater to
7851    general assembler lossage, recognize PIC+GOTOFF and turn it back
7852    into a direct symbol reference.
7853
7854    On Darwin, this is necessary to avoid a crash, because Darwin
7855    has a different PIC label for each routine but the DWARF debugging
7856    information is not associated with any particular routine, so it's
7857    necessary to remove references to the PIC label from RTL stored by
7858    the DWARF output code.  */
7859
7860 static rtx
7861 ix86_delegitimize_address (rtx orig_x)
7862 {
7863   rtx x = orig_x;
7864   /* reg_addend is NULL or a multiple of some register.  */
7865   rtx reg_addend = NULL_RTX;
7866   /* const_addend is NULL or a const_int.  */
7867   rtx const_addend = NULL_RTX;
7868   /* This is the result, or NULL.  */
7869   rtx result = NULL_RTX;
7870
7871   if (MEM_P (x))
7872     x = XEXP (x, 0);
7873
7874   if (TARGET_64BIT)
7875     {
7876       if (GET_CODE (x) != CONST
7877           || GET_CODE (XEXP (x, 0)) != UNSPEC
7878           || XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
7879           || !MEM_P (orig_x))
7880         return orig_x;
7881       return XVECEXP (XEXP (x, 0), 0, 0);
7882     }
7883
7884   if (GET_CODE (x) != PLUS
7885       || GET_CODE (XEXP (x, 1)) != CONST)
7886     return orig_x;
7887
7888   if (REG_P (XEXP (x, 0))
7889       && REGNO (XEXP (x, 0)) == PIC_OFFSET_TABLE_REGNUM)
7890     /* %ebx + GOT/GOTOFF */
7891     ;
7892   else if (GET_CODE (XEXP (x, 0)) == PLUS)
7893     {
7894       /* %ebx + %reg * scale + GOT/GOTOFF */
7895       reg_addend = XEXP (x, 0);
7896       if (REG_P (XEXP (reg_addend, 0))
7897           && REGNO (XEXP (reg_addend, 0)) == PIC_OFFSET_TABLE_REGNUM)
7898         reg_addend = XEXP (reg_addend, 1);
7899       else if (REG_P (XEXP (reg_addend, 1))
7900                && REGNO (XEXP (reg_addend, 1)) == PIC_OFFSET_TABLE_REGNUM)
7901         reg_addend = XEXP (reg_addend, 0);
7902       else
7903         return orig_x;
7904       if (!REG_P (reg_addend)
7905           && GET_CODE (reg_addend) != MULT
7906           && GET_CODE (reg_addend) != ASHIFT)
7907         return orig_x;
7908     }
7909   else
7910     return orig_x;
7911
7912   x = XEXP (XEXP (x, 1), 0);
7913   if (GET_CODE (x) == PLUS
7914       && CONST_INT_P (XEXP (x, 1)))
7915     {
7916       const_addend = XEXP (x, 1);
7917       x = XEXP (x, 0);
7918     }
7919
7920   if (GET_CODE (x) == UNSPEC
7921       && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x))
7922           || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
7923     result = XVECEXP (x, 0, 0);
7924
7925   if (TARGET_MACHO && darwin_local_data_pic (x)
7926       && !MEM_P (orig_x))
7927     result = XEXP (x, 0);
7928
7929   if (! result)
7930     return orig_x;
7931
7932   if (const_addend)
7933     result = gen_rtx_PLUS (Pmode, result, const_addend);
7934   if (reg_addend)
7935     result = gen_rtx_PLUS (Pmode, reg_addend, result);
7936   return result;
7937 }
7938
7939 /* If X is a machine specific address (i.e. a symbol or label being
7940    referenced as a displacement from the GOT implemented using an
7941    UNSPEC), then return the base term.  Otherwise return X.  */
7942
7943 rtx
7944 ix86_find_base_term (rtx x)
7945 {
7946   rtx term;
7947
7948   if (TARGET_64BIT)
7949     {
7950       if (GET_CODE (x) != CONST)
7951         return x;
7952       term = XEXP (x, 0);
7953       if (GET_CODE (term) == PLUS
7954           && (CONST_INT_P (XEXP (term, 1))
7955               || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
7956         term = XEXP (term, 0);
7957       if (GET_CODE (term) != UNSPEC
7958           || XINT (term, 1) != UNSPEC_GOTPCREL)
7959         return x;
7960
7961       term = XVECEXP (term, 0, 0);
7962
7963       if (GET_CODE (term) != SYMBOL_REF
7964           && GET_CODE (term) != LABEL_REF)
7965         return x;
7966
7967       return term;
7968     }
7969
7970   term = ix86_delegitimize_address (x);
7971
7972   if (GET_CODE (term) != SYMBOL_REF
7973       && GET_CODE (term) != LABEL_REF)
7974     return x;
7975
7976   return term;
7977 }
7978 \f
7979 static void
7980 put_condition_code (enum rtx_code code, enum machine_mode mode, int reverse,
7981                     int fp, FILE *file)
7982 {
7983   const char *suffix;
7984
7985   if (mode == CCFPmode || mode == CCFPUmode)
7986     {
7987       enum rtx_code second_code, bypass_code;
7988       ix86_fp_comparison_codes (code, &bypass_code, &code, &second_code);
7989       gcc_assert (bypass_code == UNKNOWN && second_code == UNKNOWN);
7990       code = ix86_fp_compare_code_to_integer (code);
7991       mode = CCmode;
7992     }
7993   if (reverse)
7994     code = reverse_condition (code);
7995
7996   switch (code)
7997     {
7998     case EQ:
7999       suffix = "e";
8000       break;
8001     case NE:
8002       suffix = "ne";
8003       break;
8004     case GT:
8005       gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
8006       suffix = "g";
8007       break;
8008     case GTU:
8009       /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
8010          Those same assemblers have the same but opposite lossage on cmov.  */
8011       gcc_assert (mode == CCmode);
8012       suffix = fp ? "nbe" : "a";
8013       break;
8014     case LT:
8015       switch (mode)
8016         {
8017         case CCNOmode:
8018         case CCGOCmode:
8019           suffix = "s";
8020           break;
8021
8022         case CCmode:
8023         case CCGCmode:
8024           suffix = "l";
8025           break;
8026
8027         default:
8028           gcc_unreachable ();
8029         }
8030       break;
8031     case LTU:
8032       gcc_assert (mode == CCmode);
8033       suffix = "b";
8034       break;
8035     case GE:
8036       switch (mode)
8037         {
8038         case CCNOmode:
8039         case CCGOCmode:
8040           suffix = "ns";
8041           break;
8042
8043         case CCmode:
8044         case CCGCmode:
8045           suffix = "ge";
8046           break;
8047
8048         default:
8049           gcc_unreachable ();
8050         }
8051       break;
8052     case GEU:
8053       /* ??? As above.  */
8054       gcc_assert (mode == CCmode);
8055       suffix = fp ? "nb" : "ae";
8056       break;
8057     case LE:
8058       gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
8059       suffix = "le";
8060       break;
8061     case LEU:
8062       gcc_assert (mode == CCmode);
8063       suffix = "be";
8064       break;
8065     case UNORDERED:
8066       suffix = fp ? "u" : "p";
8067       break;
8068     case ORDERED:
8069       suffix = fp ? "nu" : "np";
8070       break;
8071     default:
8072       gcc_unreachable ();
8073     }
8074   fputs (suffix, file);
8075 }
8076
8077 /* Print the name of register X to FILE based on its machine mode and number.
8078    If CODE is 'w', pretend the mode is HImode.
8079    If CODE is 'b', pretend the mode is QImode.
8080    If CODE is 'k', pretend the mode is SImode.
8081    If CODE is 'q', pretend the mode is DImode.
8082    If CODE is 'h', pretend the reg is the 'high' byte register.
8083    If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op.  */
8084
8085 void
8086 print_reg (rtx x, int code, FILE *file)
8087 {
8088   gcc_assert (REGNO (x) != ARG_POINTER_REGNUM
8089               && REGNO (x) != FRAME_POINTER_REGNUM
8090               && REGNO (x) != FLAGS_REG
8091               && REGNO (x) != FPSR_REG
8092               && REGNO (x) != FPCR_REG);
8093
8094   if (ASSEMBLER_DIALECT == ASM_ATT || USER_LABEL_PREFIX[0] == 0)
8095     putc ('%', file);
8096
8097   if (code == 'w' || MMX_REG_P (x))
8098     code = 2;
8099   else if (code == 'b')
8100     code = 1;
8101   else if (code == 'k')
8102     code = 4;
8103   else if (code == 'q')
8104     code = 8;
8105   else if (code == 'y')
8106     code = 3;
8107   else if (code == 'h')
8108     code = 0;
8109   else
8110     code = GET_MODE_SIZE (GET_MODE (x));
8111
8112   /* Irritatingly, AMD extended registers use different naming convention
8113      from the normal registers.  */
8114   if (REX_INT_REG_P (x))
8115     {
8116       gcc_assert (TARGET_64BIT);
8117       switch (code)
8118         {
8119           case 0:
8120             error ("extended registers have no high halves");
8121             break;
8122           case 1:
8123             fprintf (file, "r%ib", REGNO (x) - FIRST_REX_INT_REG + 8);
8124             break;
8125           case 2:
8126             fprintf (file, "r%iw", REGNO (x) - FIRST_REX_INT_REG + 8);
8127             break;
8128           case 4:
8129             fprintf (file, "r%id", REGNO (x) - FIRST_REX_INT_REG + 8);
8130             break;
8131           case 8:
8132             fprintf (file, "r%i", REGNO (x) - FIRST_REX_INT_REG + 8);
8133             break;
8134           default:
8135             error ("unsupported operand size for extended register");
8136             break;
8137         }
8138       return;
8139     }
8140   switch (code)
8141     {
8142     case 3:
8143       if (STACK_TOP_P (x))
8144         {
8145           fputs ("st(0)", file);
8146           break;
8147         }
8148       /* FALLTHRU */
8149     case 8:
8150     case 4:
8151     case 12:
8152       if (! ANY_FP_REG_P (x))
8153         putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
8154       /* FALLTHRU */
8155     case 16:
8156     case 2:
8157     normal:
8158       fputs (hi_reg_name[REGNO (x)], file);
8159       break;
8160     case 1:
8161       if (REGNO (x) >= ARRAY_SIZE (qi_reg_name))
8162         goto normal;
8163       fputs (qi_reg_name[REGNO (x)], file);
8164       break;
8165     case 0:
8166       if (REGNO (x) >= ARRAY_SIZE (qi_high_reg_name))
8167         goto normal;
8168       fputs (qi_high_reg_name[REGNO (x)], file);
8169       break;
8170     default:
8171       gcc_unreachable ();
8172     }
8173 }
8174
8175 /* Locate some local-dynamic symbol still in use by this function
8176    so that we can print its name in some tls_local_dynamic_base
8177    pattern.  */
8178
8179 static int
8180 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
8181 {
8182   rtx x = *px;
8183
8184   if (GET_CODE (x) == SYMBOL_REF
8185       && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
8186     {
8187       cfun->machine->some_ld_name = XSTR (x, 0);
8188       return 1;
8189     }
8190
8191   return 0;
8192 }
8193
8194 static const char *
8195 get_some_local_dynamic_name (void)
8196 {
8197   rtx insn;
8198
8199   if (cfun->machine->some_ld_name)
8200     return cfun->machine->some_ld_name;
8201
8202   for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
8203     if (INSN_P (insn)
8204         && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
8205       return cfun->machine->some_ld_name;
8206
8207   gcc_unreachable ();
8208 }
8209
8210 /* Meaning of CODE:
8211    L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
8212    C -- print opcode suffix for set/cmov insn.
8213    c -- like C, but print reversed condition
8214    F,f -- likewise, but for floating-point.
8215    O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
8216         otherwise nothing
8217    R -- print the prefix for register names.
8218    z -- print the opcode suffix for the size of the current operand.
8219    * -- print a star (in certain assembler syntax)
8220    A -- print an absolute memory reference.
8221    w -- print the operand as if it's a "word" (HImode) even if it isn't.
8222    s -- print a shift double count, followed by the assemblers argument
8223         delimiter.
8224    b -- print the QImode name of the register for the indicated operand.
8225         %b0 would print %al if operands[0] is reg 0.
8226    w --  likewise, print the HImode name of the register.
8227    k --  likewise, print the SImode name of the register.
8228    q --  likewise, print the DImode name of the register.
8229    h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
8230    y -- print "st(0)" instead of "st" as a register.
8231    D -- print condition for SSE cmp instruction.
8232    P -- if PIC, print an @PLT suffix.
8233    X -- don't print any sort of PIC '@' suffix for a symbol.
8234    & -- print some in-use local-dynamic symbol name.
8235    H -- print a memory address offset by 8; used for sse high-parts
8236  */
8237
8238 void
8239 print_operand (FILE *file, rtx x, int code)
8240 {
8241   if (code)
8242     {
8243       switch (code)
8244         {
8245         case '*':
8246           if (ASSEMBLER_DIALECT == ASM_ATT)
8247             putc ('*', file);
8248           return;
8249
8250         case '&':
8251           assemble_name (file, get_some_local_dynamic_name ());
8252           return;
8253
8254         case 'A':
8255           switch (ASSEMBLER_DIALECT)
8256             {
8257             case ASM_ATT:
8258               putc ('*', file);
8259               break;
8260
8261             case ASM_INTEL:
8262               /* Intel syntax. For absolute addresses, registers should not
8263                  be surrounded by braces.  */
8264               if (!REG_P (x))
8265                 {
8266                   putc ('[', file);
8267                   PRINT_OPERAND (file, x, 0);
8268                   putc (']', file);
8269                   return;
8270                 }
8271               break;
8272
8273             default:
8274               gcc_unreachable ();
8275             }
8276
8277           PRINT_OPERAND (file, x, 0);
8278           return;
8279
8280
8281         case 'L':
8282           if (ASSEMBLER_DIALECT == ASM_ATT)
8283             putc ('l', file);
8284           return;
8285
8286         case 'W':
8287           if (ASSEMBLER_DIALECT == ASM_ATT)
8288             putc ('w', file);
8289           return;
8290
8291         case 'B':
8292           if (ASSEMBLER_DIALECT == ASM_ATT)
8293             putc ('b', file);
8294           return;
8295
8296         case 'Q':
8297           if (ASSEMBLER_DIALECT == ASM_ATT)
8298             putc ('l', file);
8299           return;
8300
8301         case 'S':
8302           if (ASSEMBLER_DIALECT == ASM_ATT)
8303             putc ('s', file);
8304           return;
8305
8306         case 'T':
8307           if (ASSEMBLER_DIALECT == ASM_ATT)
8308             putc ('t', file);
8309           return;
8310
8311         case 'z':
8312           /* 387 opcodes don't get size suffixes if the operands are
8313              registers.  */
8314           if (STACK_REG_P (x))
8315             return;
8316
8317           /* Likewise if using Intel opcodes.  */
8318           if (ASSEMBLER_DIALECT == ASM_INTEL)
8319             return;
8320
8321           /* This is the size of op from size of operand.  */
8322           switch (GET_MODE_SIZE (GET_MODE (x)))
8323             {
8324             case 1:
8325               putc ('b', file);
8326               return;
8327
8328             case 2:
8329 #ifdef HAVE_GAS_FILDS_FISTS
8330               putc ('s', file);
8331 #endif
8332               return;
8333
8334             case 4:
8335               if (GET_MODE (x) == SFmode)
8336                 {
8337                   putc ('s', file);
8338                   return;
8339                 }
8340               else
8341                 putc ('l', file);
8342               return;
8343
8344             case 12:
8345             case 16:
8346               putc ('t', file);
8347               return;
8348
8349             case 8:
8350               if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
8351                 {
8352 #ifdef GAS_MNEMONICS
8353                   putc ('q', file);
8354 #else
8355                   putc ('l', file);
8356                   putc ('l', file);
8357 #endif
8358                 }
8359               else
8360                 putc ('l', file);
8361               return;
8362
8363             default:
8364               gcc_unreachable ();
8365             }
8366
8367         case 'b':
8368         case 'w':
8369         case 'k':
8370         case 'q':
8371         case 'h':
8372         case 'y':
8373         case 'X':
8374         case 'P':
8375           break;
8376
8377         case 's':
8378           if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
8379             {
8380               PRINT_OPERAND (file, x, 0);
8381               putc (',', file);
8382             }
8383           return;
8384
8385         case 'D':
8386           /* Little bit of braindamage here.  The SSE compare instructions
8387              does use completely different names for the comparisons that the
8388              fp conditional moves.  */
8389           switch (GET_CODE (x))
8390             {
8391             case EQ:
8392             case UNEQ:
8393               fputs ("eq", file);
8394               break;
8395             case LT:
8396             case UNLT:
8397               fputs ("lt", file);
8398               break;
8399             case LE:
8400             case UNLE:
8401               fputs ("le", file);
8402               break;
8403             case UNORDERED:
8404               fputs ("unord", file);
8405               break;
8406             case NE:
8407             case LTGT:
8408               fputs ("neq", file);
8409               break;
8410             case UNGE:
8411             case GE:
8412               fputs ("nlt", file);
8413               break;
8414             case UNGT:
8415             case GT:
8416               fputs ("nle", file);
8417               break;
8418             case ORDERED:
8419               fputs ("ord", file);
8420               break;
8421             default:
8422               gcc_unreachable ();
8423             }
8424           return;
8425         case 'O':
8426 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
8427           if (ASSEMBLER_DIALECT == ASM_ATT)
8428             {
8429               switch (GET_MODE (x))
8430                 {
8431                 case HImode: putc ('w', file); break;
8432                 case SImode:
8433                 case SFmode: putc ('l', file); break;
8434                 case DImode:
8435                 case DFmode: putc ('q', file); break;
8436                 default: gcc_unreachable ();
8437                 }
8438               putc ('.', file);
8439             }
8440 #endif
8441           return;
8442         case 'C':
8443           put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 0, file);
8444           return;
8445         case 'F':
8446 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
8447           if (ASSEMBLER_DIALECT == ASM_ATT)
8448             putc ('.', file);
8449 #endif
8450           put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 1, file);
8451           return;
8452
8453           /* Like above, but reverse condition */
8454         case 'c':
8455           /* Check to see if argument to %c is really a constant
8456              and not a condition code which needs to be reversed.  */
8457           if (!COMPARISON_P (x))
8458           {
8459             output_operand_lossage ("operand is neither a constant nor a condition code, invalid operand code 'c'");
8460              return;
8461           }
8462           put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 0, file);
8463           return;
8464         case 'f':
8465 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
8466           if (ASSEMBLER_DIALECT == ASM_ATT)
8467             putc ('.', file);
8468 #endif
8469           put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 1, file);
8470           return;
8471
8472         case 'H':
8473           /* It doesn't actually matter what mode we use here, as we're
8474              only going to use this for printing.  */
8475           x = adjust_address_nv (x, DImode, 8);
8476           break;
8477
8478         case '+':
8479           {
8480             rtx x;
8481
8482             if (!optimize || optimize_size || !TARGET_BRANCH_PREDICTION_HINTS)
8483               return;
8484
8485             x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
8486             if (x)
8487               {
8488                 int pred_val = INTVAL (XEXP (x, 0));
8489
8490                 if (pred_val < REG_BR_PROB_BASE * 45 / 100
8491                     || pred_val > REG_BR_PROB_BASE * 55 / 100)
8492                   {
8493                     int taken = pred_val > REG_BR_PROB_BASE / 2;
8494                     int cputaken = final_forward_branch_p (current_output_insn) == 0;
8495
8496                     /* Emit hints only in the case default branch prediction
8497                        heuristics would fail.  */
8498                     if (taken != cputaken)
8499                       {
8500                         /* We use 3e (DS) prefix for taken branches and
8501                            2e (CS) prefix for not taken branches.  */
8502                         if (taken)
8503                           fputs ("ds ; ", file);
8504                         else
8505                           fputs ("cs ; ", file);
8506                       }
8507                   }
8508               }
8509             return;
8510           }
8511         default:
8512             output_operand_lossage ("invalid operand code '%c'", code);
8513         }
8514     }
8515
8516   if (REG_P (x))
8517     print_reg (x, code, file);
8518
8519   else if (MEM_P (x))
8520     {
8521       /* No `byte ptr' prefix for call instructions.  */
8522       if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P')
8523         {
8524           const char * size;
8525           switch (GET_MODE_SIZE (GET_MODE (x)))
8526             {
8527             case 1: size = "BYTE"; break;
8528             case 2: size = "WORD"; break;
8529             case 4: size = "DWORD"; break;
8530             case 8: size = "QWORD"; break;
8531             case 12: size = "XWORD"; break;
8532             case 16: size = "XMMWORD"; break;
8533             default:
8534               gcc_unreachable ();
8535             }
8536
8537           /* Check for explicit size override (codes 'b', 'w' and 'k')  */
8538           if (code == 'b')
8539             size = "BYTE";
8540           else if (code == 'w')
8541             size = "WORD";
8542           else if (code == 'k')
8543             size = "DWORD";
8544
8545           fputs (size, file);
8546           fputs (" PTR ", file);
8547         }
8548
8549       x = XEXP (x, 0);
8550       /* Avoid (%rip) for call operands.  */
8551       if (CONSTANT_ADDRESS_P (x) && code == 'P'
8552           && !CONST_INT_P (x))
8553         output_addr_const (file, x);
8554       else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
8555         output_operand_lossage ("invalid constraints for operand");
8556       else
8557         output_address (x);
8558     }
8559
8560   else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
8561     {
8562       REAL_VALUE_TYPE r;
8563       long l;
8564
8565       REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8566       REAL_VALUE_TO_TARGET_SINGLE (r, l);
8567
8568       if (ASSEMBLER_DIALECT == ASM_ATT)
8569         putc ('$', file);
8570       fprintf (file, "0x%08lx", l);
8571     }
8572
8573   /* These float cases don't actually occur as immediate operands.  */
8574   else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
8575     {
8576       char dstr[30];
8577
8578       real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
8579       fprintf (file, "%s", dstr);
8580     }
8581
8582   else if (GET_CODE (x) == CONST_DOUBLE
8583            && GET_MODE (x) == XFmode)
8584     {
8585       char dstr[30];
8586
8587       real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
8588       fprintf (file, "%s", dstr);
8589     }
8590
8591   else
8592     {
8593       /* We have patterns that allow zero sets of memory, for instance.
8594          In 64-bit mode, we should probably support all 8-byte vectors,
8595          since we can in fact encode that into an immediate.  */
8596       if (GET_CODE (x) == CONST_VECTOR)
8597         {
8598           gcc_assert (x == CONST0_RTX (GET_MODE (x)));
8599           x = const0_rtx;
8600         }
8601
8602       if (code != 'P')
8603         {
8604           if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
8605             {
8606               if (ASSEMBLER_DIALECT == ASM_ATT)
8607                 putc ('$', file);
8608             }
8609           else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
8610                    || GET_CODE (x) == LABEL_REF)
8611             {
8612               if (ASSEMBLER_DIALECT == ASM_ATT)
8613                 putc ('$', file);
8614               else
8615                 fputs ("OFFSET FLAT:", file);
8616             }
8617         }
8618       if (CONST_INT_P (x))
8619         fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
8620       else if (flag_pic)
8621         output_pic_addr_const (file, x, code);
8622       else
8623         output_addr_const (file, x);
8624     }
8625 }
8626 \f
8627 /* Print a memory operand whose address is ADDR.  */
8628
8629 void
8630 print_operand_address (FILE *file, rtx addr)
8631 {
8632   struct ix86_address parts;
8633   rtx base, index, disp;
8634   int scale;
8635   int ok = ix86_decompose_address (addr, &parts);
8636
8637   gcc_assert (ok);
8638
8639   base = parts.base;
8640   index = parts.index;
8641   disp = parts.disp;
8642   scale = parts.scale;
8643
8644   switch (parts.seg)
8645     {
8646     case SEG_DEFAULT:
8647       break;
8648     case SEG_FS:
8649     case SEG_GS:
8650       if (USER_LABEL_PREFIX[0] == 0)
8651         putc ('%', file);
8652       fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
8653       break;
8654     default:
8655       gcc_unreachable ();
8656     }
8657
8658   if (!base && !index)
8659     {
8660       /* Displacement only requires special attention.  */
8661
8662       if (CONST_INT_P (disp))
8663         {
8664           if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
8665             {
8666               if (USER_LABEL_PREFIX[0] == 0)
8667                 putc ('%', file);
8668               fputs ("ds:", file);
8669             }
8670           fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
8671         }
8672       else if (flag_pic)
8673         output_pic_addr_const (file, disp, 0);
8674       else
8675         output_addr_const (file, disp);
8676
8677       /* Use one byte shorter RIP relative addressing for 64bit mode.  */
8678       if (TARGET_64BIT)
8679         {
8680           if (GET_CODE (disp) == CONST
8681               && GET_CODE (XEXP (disp, 0)) == PLUS
8682               && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
8683             disp = XEXP (XEXP (disp, 0), 0);
8684           if (GET_CODE (disp) == LABEL_REF
8685               || (GET_CODE (disp) == SYMBOL_REF
8686                   && SYMBOL_REF_TLS_MODEL (disp) == 0))
8687             fputs ("(%rip)", file);
8688         }
8689     }
8690   else
8691     {
8692       if (ASSEMBLER_DIALECT == ASM_ATT)
8693         {
8694           if (disp)
8695             {
8696               if (flag_pic)
8697                 output_pic_addr_const (file, disp, 0);
8698               else if (GET_CODE (disp) == LABEL_REF)
8699                 output_asm_label (disp);
8700               else
8701                 output_addr_const (file, disp);
8702             }
8703
8704           putc ('(', file);
8705           if (base)
8706             print_reg (base, 0, file);
8707           if (index)
8708             {
8709               putc (',', file);
8710               print_reg (index, 0, file);
8711               if (scale != 1)
8712                 fprintf (file, ",%d", scale);
8713             }
8714           putc (')', file);
8715         }
8716       else
8717         {
8718           rtx offset = NULL_RTX;
8719
8720           if (disp)
8721             {
8722               /* Pull out the offset of a symbol; print any symbol itself.  */
8723               if (GET_CODE (disp) == CONST
8724                   && GET_CODE (XEXP (disp, 0)) == PLUS
8725                   && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
8726                 {
8727                   offset = XEXP (XEXP (disp, 0), 1);
8728                   disp = gen_rtx_CONST (VOIDmode,
8729                                         XEXP (XEXP (disp, 0), 0));
8730                 }
8731
8732               if (flag_pic)
8733                 output_pic_addr_const (file, disp, 0);
8734               else if (GET_CODE (disp) == LABEL_REF)
8735                 output_asm_label (disp);
8736               else if (CONST_INT_P (disp))
8737                 offset = disp;
8738               else
8739                 output_addr_const (file, disp);
8740             }
8741
8742           putc ('[', file);
8743           if (base)
8744             {
8745               print_reg (base, 0, file);
8746               if (offset)
8747                 {
8748                   if (INTVAL (offset) >= 0)
8749                     putc ('+', file);
8750                   fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
8751                 }
8752             }
8753           else if (offset)
8754             fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
8755           else
8756             putc ('0', file);
8757
8758           if (index)
8759             {
8760               putc ('+', file);
8761               print_reg (index, 0, file);
8762               if (scale != 1)
8763                 fprintf (file, "*%d", scale);
8764             }
8765           putc (']', file);
8766         }
8767     }
8768 }
8769
8770 bool
8771 output_addr_const_extra (FILE *file, rtx x)
8772 {
8773   rtx op;
8774
8775   if (GET_CODE (x) != UNSPEC)
8776     return false;
8777
8778   op = XVECEXP (x, 0, 0);
8779   switch (XINT (x, 1))
8780     {
8781     case UNSPEC_GOTTPOFF:
8782       output_addr_const (file, op);
8783       /* FIXME: This might be @TPOFF in Sun ld.  */
8784       fputs ("@GOTTPOFF", file);
8785       break;
8786     case UNSPEC_TPOFF:
8787       output_addr_const (file, op);
8788       fputs ("@TPOFF", file);
8789       break;
8790     case UNSPEC_NTPOFF:
8791       output_addr_const (file, op);
8792       if (TARGET_64BIT)
8793         fputs ("@TPOFF", file);
8794       else
8795         fputs ("@NTPOFF", file);
8796       break;
8797     case UNSPEC_DTPOFF:
8798       output_addr_const (file, op);
8799       fputs ("@DTPOFF", file);
8800       break;
8801     case UNSPEC_GOTNTPOFF:
8802       output_addr_const (file, op);
8803       if (TARGET_64BIT)
8804         fputs ("@GOTTPOFF(%rip)", file);
8805       else
8806         fputs ("@GOTNTPOFF", file);
8807       break;
8808     case UNSPEC_INDNTPOFF:
8809       output_addr_const (file, op);
8810       fputs ("@INDNTPOFF", file);
8811       break;
8812
8813     default:
8814       return false;
8815     }
8816
8817   return true;
8818 }
8819 \f
8820 /* Split one or more DImode RTL references into pairs of SImode
8821    references.  The RTL can be REG, offsettable MEM, integer constant, or
8822    CONST_DOUBLE.  "operands" is a pointer to an array of DImode RTL to
8823    split and "num" is its length.  lo_half and hi_half are output arrays
8824    that parallel "operands".  */
8825
8826 void
8827 split_di (rtx operands[], int num, rtx lo_half[], rtx hi_half[])
8828 {
8829   while (num--)
8830     {
8831       rtx op = operands[num];
8832
8833       /* simplify_subreg refuse to split volatile memory addresses,
8834          but we still have to handle it.  */
8835       if (MEM_P (op))
8836         {
8837           lo_half[num] = adjust_address (op, SImode, 0);
8838           hi_half[num] = adjust_address (op, SImode, 4);
8839         }
8840       else
8841         {
8842           lo_half[num] = simplify_gen_subreg (SImode, op,
8843                                               GET_MODE (op) == VOIDmode
8844                                               ? DImode : GET_MODE (op), 0);
8845           hi_half[num] = simplify_gen_subreg (SImode, op,
8846                                               GET_MODE (op) == VOIDmode
8847                                               ? DImode : GET_MODE (op), 4);
8848         }
8849     }
8850 }
8851 /* Split one or more TImode RTL references into pairs of DImode
8852    references.  The RTL can be REG, offsettable MEM, integer constant, or
8853    CONST_DOUBLE.  "operands" is a pointer to an array of DImode RTL to
8854    split and "num" is its length.  lo_half and hi_half are output arrays
8855    that parallel "operands".  */
8856
8857 void
8858 split_ti (rtx operands[], int num, rtx lo_half[], rtx hi_half[])
8859 {
8860   while (num--)
8861     {
8862       rtx op = operands[num];
8863
8864       /* simplify_subreg refuse to split volatile memory addresses, but we
8865          still have to handle it.  */
8866       if (MEM_P (op))
8867         {
8868           lo_half[num] = adjust_address (op, DImode, 0);
8869           hi_half[num] = adjust_address (op, DImode, 8);
8870         }
8871       else
8872         {
8873           lo_half[num] = simplify_gen_subreg (DImode, op, TImode, 0);
8874           hi_half[num] = simplify_gen_subreg (DImode, op, TImode, 8);
8875         }
8876     }
8877 }
8878 \f
8879 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
8880    MINUS, MULT or DIV.  OPERANDS are the insn operands, where operands[3]
8881    is the expression of the binary operation.  The output may either be
8882    emitted here, or returned to the caller, like all output_* functions.
8883
8884    There is no guarantee that the operands are the same mode, as they
8885    might be within FLOAT or FLOAT_EXTEND expressions.  */
8886
/* SYSV386_COMPAT defaults to 1 unless it was already defined (for
   instance with -DSYSV386_COMPAT=0).  When nonzero, the fsub{r} and
   fdiv{r} templates are spelled the way AT&T-derived assemblers expect
   them.  Nobody fixes those assemblers because that would make them
   incompatible with gcc, and nobody fixes gcc because that would make
   it incompatible with the assemblers; only use 0 if gcc and gas are
   both rebuilt that way.  */
#if !defined (SYSV386_COMPAT)
#define SYSV386_COMPAT 1
#endif
8895
8896 const char *
8897 output_387_binary_op (rtx insn, rtx *operands)
8898 {
8899   static char buf[30];
8900   const char *p;
8901   const char *ssep;
8902   int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
8903
8904 #ifdef ENABLE_CHECKING
8905   /* Even if we do not want to check the inputs, this documents input
8906      constraints.  Which helps in understanding the following code.  */
8907   if (STACK_REG_P (operands[0])
8908       && ((REG_P (operands[1])
8909            && REGNO (operands[0]) == REGNO (operands[1])
8910            && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
8911           || (REG_P (operands[2])
8912               && REGNO (operands[0]) == REGNO (operands[2])
8913               && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
8914       && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
8915     ; /* ok */
8916   else
8917     gcc_assert (is_sse);
8918 #endif
8919
8920   switch (GET_CODE (operands[3]))
8921     {
8922     case PLUS:
8923       if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
8924           || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
8925         p = "fiadd";
8926       else
8927         p = "fadd";
8928       ssep = "add";
8929       break;
8930
8931     case MINUS:
8932       if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
8933           || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
8934         p = "fisub";
8935       else
8936         p = "fsub";
8937       ssep = "sub";
8938       break;
8939
8940     case MULT:
8941       if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
8942           || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
8943         p = "fimul";
8944       else
8945         p = "fmul";
8946       ssep = "mul";
8947       break;
8948
8949     case DIV:
8950       if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
8951           || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
8952         p = "fidiv";
8953       else
8954         p = "fdiv";
8955       ssep = "div";
8956       break;
8957
8958     default:
8959       gcc_unreachable ();
8960     }
8961
8962   if (is_sse)
8963    {
8964       strcpy (buf, ssep);
8965       if (GET_MODE (operands[0]) == SFmode)
8966         strcat (buf, "ss\t{%2, %0|%0, %2}");
8967       else
8968         strcat (buf, "sd\t{%2, %0|%0, %2}");
8969       return buf;
8970    }
8971   strcpy (buf, p);
8972
8973   switch (GET_CODE (operands[3]))
8974     {
8975     case MULT:
8976     case PLUS:
8977       if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
8978         {
8979           rtx temp = operands[2];
8980           operands[2] = operands[1];
8981           operands[1] = temp;
8982         }
8983
8984       /* know operands[0] == operands[1].  */
8985
8986       if (MEM_P (operands[2]))
8987         {
8988           p = "%z2\t%2";
8989           break;
8990         }
8991
8992       if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
8993         {
8994           if (STACK_TOP_P (operands[0]))
8995             /* How is it that we are storing to a dead operand[2]?
8996                Well, presumably operands[1] is dead too.  We can't
8997                store the result to st(0) as st(0) gets popped on this
8998                instruction.  Instead store to operands[2] (which I
8999                think has to be st(1)).  st(1) will be popped later.
9000                gcc <= 2.8.1 didn't have this check and generated
9001                assembly code that the Unixware assembler rejected.  */
9002             p = "p\t{%0, %2|%2, %0}";   /* st(1) = st(0) op st(1); pop */
9003           else
9004             p = "p\t{%2, %0|%0, %2}";   /* st(r1) = st(r1) op st(0); pop */
9005           break;
9006         }
9007
9008       if (STACK_TOP_P (operands[0]))
9009         p = "\t{%y2, %0|%0, %y2}";      /* st(0) = st(0) op st(r2) */
9010       else
9011         p = "\t{%2, %0|%0, %2}";        /* st(r1) = st(r1) op st(0) */
9012       break;
9013
9014     case MINUS:
9015     case DIV:
9016       if (MEM_P (operands[1]))
9017         {
9018           p = "r%z1\t%1";
9019           break;
9020         }
9021
9022       if (MEM_P (operands[2]))
9023         {
9024           p = "%z2\t%2";
9025           break;
9026         }
9027
9028       if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
9029         {
9030 #if SYSV386_COMPAT
9031           /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
9032              derived assemblers, confusingly reverse the direction of
9033              the operation for fsub{r} and fdiv{r} when the
9034              destination register is not st(0).  The Intel assembler
9035              doesn't have this brain damage.  Read !SYSV386_COMPAT to
9036              figure out what the hardware really does.  */
9037           if (STACK_TOP_P (operands[0]))
9038             p = "{p\t%0, %2|rp\t%2, %0}";
9039           else
9040             p = "{rp\t%2, %0|p\t%0, %2}";
9041 #else
9042           if (STACK_TOP_P (operands[0]))
9043             /* As above for fmul/fadd, we can't store to st(0).  */
9044             p = "rp\t{%0, %2|%2, %0}";  /* st(1) = st(0) op st(1); pop */
9045           else
9046             p = "p\t{%2, %0|%0, %2}";   /* st(r1) = st(r1) op st(0); pop */
9047 #endif
9048           break;
9049         }
9050
9051       if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
9052         {
9053 #if SYSV386_COMPAT
9054           if (STACK_TOP_P (operands[0]))
9055             p = "{rp\t%0, %1|p\t%1, %0}";
9056           else
9057             p = "{p\t%1, %0|rp\t%0, %1}";
9058 #else
9059           if (STACK_TOP_P (operands[0]))
9060             p = "p\t{%0, %1|%1, %0}";   /* st(1) = st(1) op st(0); pop */
9061           else
9062             p = "rp\t{%1, %0|%0, %1}";  /* st(r2) = st(0) op st(r2); pop */
9063 #endif
9064           break;
9065         }
9066
9067       if (STACK_TOP_P (operands[0]))
9068         {
9069           if (STACK_TOP_P (operands[1]))
9070             p = "\t{%y2, %0|%0, %y2}";  /* st(0) = st(0) op st(r2) */
9071           else
9072             p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
9073           break;
9074         }
9075       else if (STACK_TOP_P (operands[1]))
9076         {
9077 #if SYSV386_COMPAT
9078           p = "{\t%1, %0|r\t%0, %1}";
9079 #else
9080           p = "r\t{%1, %0|%0, %1}";     /* st(r2) = st(0) op st(r2) */
9081 #endif
9082         }
9083       else
9084         {
9085 #if SYSV386_COMPAT
9086           p = "{r\t%2, %0|\t%0, %2}";
9087 #else
9088           p = "\t{%2, %0|%0, %2}";      /* st(r1) = st(r1) op st(0) */
9089 #endif
9090         }
9091       break;
9092
9093     default:
9094       gcc_unreachable ();
9095     }
9096
9097   strcat (buf, p);
9098   return buf;
9099 }
9100
9101 /* Return needed mode for entity in optimize_mode_switching pass.  */
9102
9103 int
9104 ix86_mode_needed (int entity, rtx insn)
9105 {
9106   enum attr_i387_cw mode;
9107
9108   /* The mode UNINITIALIZED is used to store control word after a
9109      function call or ASM pattern.  The mode ANY specify that function
9110      has no requirements on the control word and make no changes in the
9111      bits we are interested in.  */
9112
9113   if (CALL_P (insn)
9114       || (NONJUMP_INSN_P (insn)
9115           && (asm_noperands (PATTERN (insn)) >= 0
9116               || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
9117     return I387_CW_UNINITIALIZED;
9118
9119   if (recog_memoized (insn) < 0)
9120     return I387_CW_ANY;
9121
9122   mode = get_attr_i387_cw (insn);
9123
9124   switch (entity)
9125     {
9126     case I387_TRUNC:
9127       if (mode == I387_CW_TRUNC)
9128         return mode;
9129       break;
9130
9131     case I387_FLOOR:
9132       if (mode == I387_CW_FLOOR)
9133         return mode;
9134       break;
9135
9136     case I387_CEIL:
9137       if (mode == I387_CW_CEIL)
9138         return mode;
9139       break;
9140
9141     case I387_MASK_PM:
9142       if (mode == I387_CW_MASK_PM)
9143         return mode;
9144       break;
9145
9146     default:
9147       gcc_unreachable ();
9148     }
9149
9150   return I387_CW_ANY;
9151 }
9152
9153 /* Output code to initialize control word copies used by trunc?f?i and
9154    rounding patterns.  CURRENT_MODE is set to current control word,
9155    while NEW_MODE is set to new control word.  */
9156
9157 void
9158 emit_i387_cw_initialization (int mode)
9159 {
9160   rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
9161   rtx new_mode;
9162
9163   int slot;
9164
9165   rtx reg = gen_reg_rtx (HImode);
9166
9167   emit_insn (gen_x86_fnstcw_1 (stored_mode));
9168   emit_move_insn (reg, copy_rtx (stored_mode));
9169
9170   if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL || optimize_size)
9171     {
9172       switch (mode)
9173         {
9174         case I387_CW_TRUNC:
9175           /* round toward zero (truncate) */
9176           emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
9177           slot = SLOT_CW_TRUNC;
9178           break;
9179
9180         case I387_CW_FLOOR:
9181           /* round down toward -oo */
9182           emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
9183           emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
9184           slot = SLOT_CW_FLOOR;
9185           break;
9186
9187         case I387_CW_CEIL:
9188           /* round up toward +oo */
9189           emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
9190           emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
9191           slot = SLOT_CW_CEIL;
9192           break;
9193
9194         case I387_CW_MASK_PM:
9195           /* mask precision exception for nearbyint() */
9196           emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
9197           slot = SLOT_CW_MASK_PM;
9198           break;
9199
9200         default:
9201           gcc_unreachable ();
9202         }
9203     }
9204   else
9205     {
9206       switch (mode)
9207         {
9208         case I387_CW_TRUNC:
9209           /* round toward zero (truncate) */
9210           emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
9211           slot = SLOT_CW_TRUNC;
9212           break;
9213
9214         case I387_CW_FLOOR:
9215           /* round down toward -oo */
9216           emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
9217           slot = SLOT_CW_FLOOR;
9218           break;
9219
9220         case I387_CW_CEIL:
9221           /* round up toward +oo */
9222           emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
9223           slot = SLOT_CW_CEIL;
9224           break;
9225
9226         case I387_CW_MASK_PM:
9227           /* mask precision exception for nearbyint() */
9228           emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
9229           slot = SLOT_CW_MASK_PM;
9230           break;
9231
9232         default:
9233           gcc_unreachable ();
9234         }
9235     }
9236
9237   gcc_assert (slot < MAX_386_STACK_LOCALS);
9238
9239   new_mode = assign_386_stack_local (HImode, slot);
9240   emit_move_insn (new_mode, reg);
9241 }
9242
9243 /* Output code for INSN to convert a float to a signed int.  OPERANDS
9244    are the insn operands.  The output may be [HSD]Imode and the input
9245    operand may be [SDX]Fmode.  */
9246
9247 const char *
9248 output_fix_trunc (rtx insn, rtx *operands, int fisttp)
9249 {
9250   int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
9251   int dimode_p = GET_MODE (operands[0]) == DImode;
9252   int round_mode = get_attr_i387_cw (insn);
9253
9254   /* Jump through a hoop or two for DImode, since the hardware has no
9255      non-popping instruction.  We used to do this a different way, but
9256      that was somewhat fragile and broke with post-reload splitters.  */
9257   if ((dimode_p || fisttp) && !stack_top_dies)
9258     output_asm_insn ("fld\t%y1", operands);
9259
9260   gcc_assert (STACK_TOP_P (operands[1]));
9261   gcc_assert (MEM_P (operands[0]));
9262   gcc_assert (GET_MODE (operands[1]) != TFmode);
9263
9264   if (fisttp)
9265       output_asm_insn ("fisttp%z0\t%0", operands);
9266   else
9267     {
9268       if (round_mode != I387_CW_ANY)
9269         output_asm_insn ("fldcw\t%3", operands);
9270       if (stack_top_dies || dimode_p)
9271         output_asm_insn ("fistp%z0\t%0", operands);
9272       else
9273         output_asm_insn ("fist%z0\t%0", operands);
9274       if (round_mode != I387_CW_ANY)
9275         output_asm_insn ("fldcw\t%2", operands);
9276     }
9277
9278   return "";
9279 }
9280
9281 /* Output code for x87 ffreep insn.  The OPNO argument, which may only
9282    have the values zero or one, indicates the ffreep insn's operand
9283    from the OPERANDS array.  */
9284
9285 static const char *
9286 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
9287 {
9288   if (TARGET_USE_FFREEP)
9289 #if HAVE_AS_IX86_FFREEP
9290     return opno ? "ffreep\t%y1" : "ffreep\t%y0";
9291 #else
9292     {
9293       static char retval[] = ".word\t0xc_df";
9294       int regno = REGNO (operands[opno]);
9295
9296       gcc_assert (FP_REGNO_P (regno));
9297
9298       retval[9] = '0' + (regno - FIRST_STACK_REG);
9299       return retval;
9300     }
9301 #endif
9302
9303   return opno ? "fstp\t%y1" : "fstp\t%y0";
9304 }
9305
9306
9307 /* Output code for INSN to compare OPERANDS.  EFLAGS_P is 1 when fcomi
9308    should be used.  UNORDERED_P is true when fucom should be used.  */
9309
9310 const char *
9311 output_fp_compare (rtx insn, rtx *operands, int eflags_p, int unordered_p)
9312 {
9313   int stack_top_dies;
9314   rtx cmp_op0, cmp_op1;
9315   int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
9316
9317   if (eflags_p)
9318     {
9319       cmp_op0 = operands[0];
9320       cmp_op1 = operands[1];
9321     }
9322   else
9323     {
9324       cmp_op0 = operands[1];
9325       cmp_op1 = operands[2];
9326     }
9327
9328   if (is_sse)
9329     {
9330       if (GET_MODE (operands[0]) == SFmode)
9331         if (unordered_p)
9332           return "ucomiss\t{%1, %0|%0, %1}";
9333         else
9334           return "comiss\t{%1, %0|%0, %1}";
9335       else
9336         if (unordered_p)
9337           return "ucomisd\t{%1, %0|%0, %1}";
9338         else
9339           return "comisd\t{%1, %0|%0, %1}";
9340     }
9341
9342   gcc_assert (STACK_TOP_P (cmp_op0));
9343
9344   stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
9345
9346   if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
9347     {
9348       if (stack_top_dies)
9349         {
9350           output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
9351           return output_387_ffreep (operands, 1);
9352         }
9353       else
9354         return "ftst\n\tfnstsw\t%0";
9355     }
9356
9357   if (STACK_REG_P (cmp_op1)
9358       && stack_top_dies
9359       && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
9360       && REGNO (cmp_op1) != FIRST_STACK_REG)
9361     {
9362       /* If both the top of the 387 stack dies, and the other operand
9363          is also a stack register that dies, then this must be a
9364          `fcompp' float compare */
9365
9366       if (eflags_p)
9367         {
9368           /* There is no double popping fcomi variant.  Fortunately,
9369              eflags is immune from the fstp's cc clobbering.  */
9370           if (unordered_p)
9371             output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
9372           else
9373             output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
9374           return output_387_ffreep (operands, 0);
9375         }
9376       else
9377         {
9378           if (unordered_p)
9379             return "fucompp\n\tfnstsw\t%0";
9380           else
9381             return "fcompp\n\tfnstsw\t%0";
9382         }
9383     }
9384   else
9385     {
9386       /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies.  */
9387
9388       static const char * const alt[16] =
9389       {
9390         "fcom%z2\t%y2\n\tfnstsw\t%0",
9391         "fcomp%z2\t%y2\n\tfnstsw\t%0",
9392         "fucom%z2\t%y2\n\tfnstsw\t%0",
9393         "fucomp%z2\t%y2\n\tfnstsw\t%0",
9394
9395         "ficom%z2\t%y2\n\tfnstsw\t%0",
9396         "ficomp%z2\t%y2\n\tfnstsw\t%0",
9397         NULL,
9398         NULL,
9399
9400         "fcomi\t{%y1, %0|%0, %y1}",
9401         "fcomip\t{%y1, %0|%0, %y1}",
9402         "fucomi\t{%y1, %0|%0, %y1}",
9403         "fucomip\t{%y1, %0|%0, %y1}",
9404
9405         NULL,
9406         NULL,
9407         NULL,
9408         NULL
9409       };
9410
9411       int mask;
9412       const char *ret;
9413
9414       mask  = eflags_p << 3;
9415       mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
9416       mask |= unordered_p << 1;
9417       mask |= stack_top_dies;
9418
9419       gcc_assert (mask < 16);
9420       ret = alt[mask];
9421       gcc_assert (ret);
9422
9423       return ret;
9424     }
9425 }
9426
9427 void
9428 ix86_output_addr_vec_elt (FILE *file, int value)
9429 {
9430   const char *directive = ASM_LONG;
9431
9432 #ifdef ASM_QUAD
9433   if (TARGET_64BIT)
9434     directive = ASM_QUAD;
9435 #else
9436   gcc_assert (!TARGET_64BIT);
9437 #endif
9438
9439   fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
9440 }
9441
9442 void
9443 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
9444 {
9445   const char *directive = ASM_LONG;
9446
9447 #ifdef ASM_QUAD
9448   if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
9449     directive = ASM_QUAD;
9450 #else
9451   gcc_assert (!TARGET_64BIT);
9452 #endif
9453   /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand.  */
9454   if (TARGET_64BIT || TARGET_VXWORKS_RTP)
9455     fprintf (file, "%s%s%d-%s%d\n",
9456              directive, LPREFIX, value, LPREFIX, rel);
9457   else if (HAVE_AS_GOTOFF_IN_DATA)
9458     fprintf (file, "%s%s%d@GOTOFF\n", ASM_LONG, LPREFIX, value);
9459 #if TARGET_MACHO
9460   else if (TARGET_MACHO)
9461     {
9462       fprintf (file, "%s%s%d-", ASM_LONG, LPREFIX, value);
9463       machopic_output_function_base_name (file);
9464       fprintf(file, "\n");
9465     }
9466 #endif
9467   else
9468     asm_fprintf (file, "%s%U%s+[.-%s%d]\n",
9469                  ASM_LONG, GOT_SYMBOL_NAME, LPREFIX, value);
9470 }
9471 \f
9472 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
9473    for the target.  */
9474
9475 void
9476 ix86_expand_clear (rtx dest)
9477 {
9478   rtx tmp;
9479
9480   /* We play register width games, which are only valid after reload.  */
9481   gcc_assert (reload_completed);
9482
9483   /* Avoid HImode and its attendant prefix byte.  */
9484   if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
9485     dest = gen_rtx_REG (SImode, REGNO (dest));
9486   tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
9487
9488   /* This predicate should match that for movsi_xor and movdi_xor_rex64.  */
9489   if (reload_completed && (!TARGET_USE_MOV0 || optimize_size))
9490     {
9491       rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, 17));
9492       tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
9493     }
9494
9495   emit_insn (tmp);
9496 }
9497
9498 /* X is an unchanging MEM.  If it is a constant pool reference, return
9499    the constant pool rtx, else NULL.  */
9500
9501 rtx
9502 maybe_get_pool_constant (rtx x)
9503 {
9504   x = ix86_delegitimize_address (XEXP (x, 0));
9505
9506   if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
9507     return get_pool_constant (x);
9508
9509   return NULL_RTX;
9510 }
9511
9512 void
9513 ix86_expand_move (enum machine_mode mode, rtx operands[])
9514 {
9515   int strict = (reload_in_progress || reload_completed);
9516   rtx op0, op1;
9517   enum tls_model model;
9518
9519   op0 = operands[0];
9520   op1 = operands[1];
9521
9522   if (GET_CODE (op1) == SYMBOL_REF)
9523     {
9524       model = SYMBOL_REF_TLS_MODEL (op1);
9525       if (model)
9526         {
9527           op1 = legitimize_tls_address (op1, model, true);
9528           op1 = force_operand (op1, op0);
9529           if (op1 == op0)
9530             return;
9531         }
9532       else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
9533                && SYMBOL_REF_DLLIMPORT_P (op1))
9534         op1 = legitimize_dllimport_symbol (op1, false);
9535     }
9536   else if (GET_CODE (op1) == CONST
9537            && GET_CODE (XEXP (op1, 0)) == PLUS
9538            && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
9539     {
9540       rtx addend = XEXP (XEXP (op1, 0), 1);
9541       rtx symbol = XEXP (XEXP (op1, 0), 0);
9542       rtx tmp = NULL;
9543
9544       model = SYMBOL_REF_TLS_MODEL (symbol);
9545       if (model)
9546         tmp = legitimize_tls_address (symbol, model, true);
9547       else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
9548                && SYMBOL_REF_DLLIMPORT_P (symbol))
9549         tmp = legitimize_dllimport_symbol (symbol, true);
9550
9551       if (tmp)
9552         {
9553           tmp = force_operand (tmp, NULL);
9554           tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
9555                                      op0, 1, OPTAB_DIRECT);
9556           if (tmp == op0)
9557             return;
9558         }
9559     }
9560
9561   if (flag_pic && mode == Pmode && symbolic_operand (op1, Pmode))
9562     {
9563       if (TARGET_MACHO && !TARGET_64BIT)
9564         {
9565 #if TARGET_MACHO
9566           if (MACHOPIC_PURE)
9567             {
9568               rtx temp = ((reload_in_progress
9569                            || ((op0 && REG_P (op0))
9570                                && mode == Pmode))
9571                           ? op0 : gen_reg_rtx (Pmode));
9572               op1 = machopic_indirect_data_reference (op1, temp);
9573               op1 = machopic_legitimize_pic_address (op1, mode,
9574                                                      temp == op1 ? 0 : temp);
9575             }
9576           else if (MACHOPIC_INDIRECT)
9577             op1 = machopic_indirect_data_reference (op1, 0);
9578           if (op0 == op1)
9579             return;
9580 #endif
9581         }
9582       else
9583         {
9584           if (MEM_P (op0))
9585             op1 = force_reg (Pmode, op1);
9586           else if (!TARGET_64BIT || !x86_64_movabs_operand (op1, Pmode))
9587             {
9588               rtx reg = no_new_pseudos ? op0 : NULL_RTX;
9589               op1 = legitimize_pic_address (op1, reg);
9590               if (op0 == op1)
9591                 return;
9592             }
9593         }
9594     }
9595   else
9596     {
9597       if (MEM_P (op0)
9598           && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
9599               || !push_operand (op0, mode))
9600           && MEM_P (op1))
9601         op1 = force_reg (mode, op1);
9602
9603       if (push_operand (op0, mode)
9604           && ! general_no_elim_operand (op1, mode))
9605         op1 = copy_to_mode_reg (mode, op1);
9606
9607       /* Force large constants in 64bit compilation into register
9608          to get them CSEed.  */
9609       if (TARGET_64BIT && mode == DImode
9610           && immediate_operand (op1, mode)
9611           && !x86_64_zext_immediate_operand (op1, VOIDmode)
9612           && !register_operand (op0, mode)
9613           && optimize && !reload_completed && !reload_in_progress)
9614         op1 = copy_to_mode_reg (mode, op1);
9615
9616       if (FLOAT_MODE_P (mode))
9617         {
9618           /* If we are loading a floating point constant to a register,
9619              force the value to memory now, since we'll get better code
9620              out the back end.  */
9621
9622           if (strict)
9623             ;
9624           else if (GET_CODE (op1) == CONST_DOUBLE)
9625             {
9626               op1 = validize_mem (force_const_mem (mode, op1));
9627               if (!register_operand (op0, mode))
9628                 {
9629                   rtx temp = gen_reg_rtx (mode);
9630                   emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
9631                   emit_move_insn (op0, temp);
9632                   return;
9633                 }
9634             }
9635         }
9636     }
9637
9638   emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
9639 }
9640
9641 void
9642 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
9643 {
9644   rtx op0 = operands[0], op1 = operands[1];
9645
9646   /* Force constants other than zero into memory.  We do not know how
9647      the instructions used to build constants modify the upper 64 bits
9648      of the register, once we have that information we may be able
9649      to handle some of them more efficiently.  */
9650   if ((reload_in_progress | reload_completed) == 0
9651       && register_operand (op0, mode)
9652       && CONSTANT_P (op1)
9653       && standard_sse_constant_p (op1) <= 0)
9654     op1 = validize_mem (force_const_mem (mode, op1));
9655
9656   /* Make operand1 a register if it isn't already.  */
9657   if (!no_new_pseudos
9658       && !register_operand (op0, mode)
9659       && !register_operand (op1, mode))
9660     {
9661       emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
9662       return;
9663     }
9664
9665   emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
9666 }
9667
9668 /* Implement the movmisalign patterns for SSE.  Non-SSE modes go
9669    straight to ix86_expand_vector_move.  */
9670 /* Code generation for scalar reg-reg moves of single and double precision data:
9671      if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
9672        movaps reg, reg
9673      else
9674        movss reg, reg
9675      if (x86_sse_partial_reg_dependency == true)
9676        movapd reg, reg
9677      else
9678        movsd reg, reg
9679
9680    Code generation for scalar loads of double precision data:
9681      if (x86_sse_split_regs == true)
9682        movlpd mem, reg      (gas syntax)
9683      else
9684        movsd mem, reg
9685
9686    Code generation for unaligned packed loads of single precision data
9687    (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
9688      if (x86_sse_unaligned_move_optimal)
9689        movups mem, reg
9690
9691      if (x86_sse_partial_reg_dependency == true)
9692        {
9693          xorps  reg, reg
9694          movlps mem, reg
9695          movhps mem+8, reg
9696        }
9697      else
9698        {
9699          movlps mem, reg
9700          movhps mem+8, reg
9701        }
9702
9703    Code generation for unaligned packed loads of double precision data
9704    (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
9705      if (x86_sse_unaligned_move_optimal)
9706        movupd mem, reg
9707
9708      if (x86_sse_split_regs == true)
9709        {
9710          movlpd mem, reg
9711          movhpd mem+8, reg
9712        }
9713      else
9714        {
9715          movsd  mem, reg
9716          movhpd mem+8, reg
9717        }
9718  */
9719
9720 void
9721 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
9722 {
9723   rtx op0, op1, m;
9724
9725   op0 = operands[0];
9726   op1 = operands[1];
9727
9728   if (MEM_P (op1))
9729     {
9730       /* If we're optimizing for size, movups is the smallest.  */
9731       if (optimize_size)
9732         {
9733           op0 = gen_lowpart (V4SFmode, op0);
9734           op1 = gen_lowpart (V4SFmode, op1);
9735           emit_insn (gen_sse_movups (op0, op1));
9736           return;
9737         }
9738
9739       /* ??? If we have typed data, then it would appear that using
9740          movdqu is the only way to get unaligned data loaded with
9741          integer type.  */
9742       if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
9743         {
9744           op0 = gen_lowpart (V16QImode, op0);
9745           op1 = gen_lowpart (V16QImode, op1);
9746           emit_insn (gen_sse2_movdqu (op0, op1));
9747           return;
9748         }
9749
9750       if (TARGET_SSE2 && mode == V2DFmode)
9751         {
9752           rtx zero;
9753
9754           if (TARGET_SSE_UNALIGNED_MOVE_OPTIMAL)
9755             {
9756               op0 = gen_lowpart (V2DFmode, op0);
9757               op1 = gen_lowpart (V2DFmode, op1);
9758               emit_insn (gen_sse2_movupd (op0, op1));
9759               return;
9760             }
9761
9762           /* When SSE registers are split into halves, we can avoid
9763              writing to the top half twice.  */
9764           if (TARGET_SSE_SPLIT_REGS)
9765             {
9766               emit_insn (gen_rtx_CLOBBER (VOIDmode, op0));
9767               zero = op0;
9768             }
9769           else
9770             {
9771               /* ??? Not sure about the best option for the Intel chips.
9772                  The following would seem to satisfy; the register is
9773                  entirely cleared, breaking the dependency chain.  We
9774                  then store to the upper half, with a dependency depth
9775                  of one.  A rumor has it that Intel recommends two movsd
9776                  followed by an unpacklpd, but this is unconfirmed.  And
9777                  given that the dependency depth of the unpacklpd would
9778                  still be one, I'm not sure why this would be better.  */
9779               zero = CONST0_RTX (V2DFmode);
9780             }
9781
9782           m = adjust_address (op1, DFmode, 0);
9783           emit_insn (gen_sse2_loadlpd (op0, zero, m));
9784           m = adjust_address (op1, DFmode, 8);
9785           emit_insn (gen_sse2_loadhpd (op0, op0, m));
9786         }
9787       else
9788         {
9789           if (TARGET_SSE_UNALIGNED_MOVE_OPTIMAL)
9790             {
9791               op0 = gen_lowpart (V4SFmode, op0);
9792               op1 = gen_lowpart (V4SFmode, op1);
9793               emit_insn (gen_sse_movups (op0, op1));
9794               return;
9795             }
9796
9797           if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
9798             emit_move_insn (op0, CONST0_RTX (mode));
9799           else
9800             emit_insn (gen_rtx_CLOBBER (VOIDmode, op0));
9801
9802           if (mode != V4SFmode)
9803             op0 = gen_lowpart (V4SFmode, op0);
9804           m = adjust_address (op1, V2SFmode, 0);
9805           emit_insn (gen_sse_loadlps (op0, op0, m));
9806           m = adjust_address (op1, V2SFmode, 8);
9807           emit_insn (gen_sse_loadhps (op0, op0, m));
9808         }
9809     }
9810   else if (MEM_P (op0))
9811     {
9812       /* If we're optimizing for size, movups is the smallest.  */
9813       if (optimize_size)
9814         {
9815           op0 = gen_lowpart (V4SFmode, op0);
9816           op1 = gen_lowpart (V4SFmode, op1);
9817           emit_insn (gen_sse_movups (op0, op1));
9818           return;
9819         }
9820
9821       /* ??? Similar to above, only less clear because of quote
9822          typeless stores unquote.  */
9823       if (TARGET_SSE2 && !TARGET_SSE_TYPELESS_STORES
9824           && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
9825         {
9826           op0 = gen_lowpart (V16QImode, op0);
9827           op1 = gen_lowpart (V16QImode, op1);
9828           emit_insn (gen_sse2_movdqu (op0, op1));
9829           return;
9830         }
9831
9832       if (TARGET_SSE2 && mode == V2DFmode)
9833         {
9834           m = adjust_address (op0, DFmode, 0);
9835           emit_insn (gen_sse2_storelpd (m, op1));
9836           m = adjust_address (op0, DFmode, 8);
9837           emit_insn (gen_sse2_storehpd (m, op1));
9838         }
9839       else
9840         {
9841           if (mode != V4SFmode)
9842             op1 = gen_lowpart (V4SFmode, op1);
9843           m = adjust_address (op0, V2SFmode, 0);
9844           emit_insn (gen_sse_storelps (m, op1));
9845           m = adjust_address (op0, V2SFmode, 8);
9846           emit_insn (gen_sse_storehps (m, op1));
9847         }
9848     }
9849   else
9850     gcc_unreachable ();
9851 }
9852
9853 /* Expand a push in MODE.  This is some mode for which we do not support
9854    proper push instructions, at least from the registers that we expect
9855    the value to live in.  */
9856
9857 void
9858 ix86_expand_push (enum machine_mode mode, rtx x)
9859 {
9860   rtx tmp;
9861
9862   tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
9863                              GEN_INT (-GET_MODE_SIZE (mode)),
9864                              stack_pointer_rtx, 1, OPTAB_DIRECT);
9865   if (tmp != stack_pointer_rtx)
9866     emit_move_insn (stack_pointer_rtx, tmp);
9867
9868   tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
9869   emit_move_insn (tmp, x);
9870 }
9871
9872 /* Helper function of ix86_fixup_binary_operands to canonicalize
9873    operand order.  Returns true if the operands should be swapped.  */
9874
9875 static bool
9876 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
9877                              rtx operands[])
9878 {
9879   rtx dst = operands[0];
9880   rtx src1 = operands[1];
9881   rtx src2 = operands[2];
9882
9883   /* If the operation is not commutative, we can't do anything.  */
9884   if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
9885     return false;
9886
9887   /* Highest priority is that src1 should match dst.  */
9888   if (rtx_equal_p (dst, src1))
9889     return false;
9890   if (rtx_equal_p (dst, src2))
9891     return true;
9892
9893   /* Next highest priority is that immediate constants come second.  */
9894   if (immediate_operand (src2, mode))
9895     return false;
9896   if (immediate_operand (src1, mode))
9897     return true;
9898
9899   /* Lowest priority is that memory references should come second.  */
9900   if (MEM_P (src2))
9901     return false;
9902   if (MEM_P (src1))
9903     return true;
9904
9905   return false;
9906 }
9907
9908
9909 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok.  Return the
9910    destination to use for the operation.  If different from the true
9911    destination in operands[0], a copy operation will be required.  */
9912
9913 rtx
9914 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
9915                             rtx operands[])
9916 {
9917   rtx dst = operands[0];
9918   rtx src1 = operands[1];
9919   rtx src2 = operands[2];
9920
9921   /* Canonicalize operand order.  */
9922   if (ix86_swap_binary_operands_p (code, mode, operands))
9923     {
9924       rtx temp = src1;
9925       src1 = src2;
9926       src2 = temp;
9927     }
9928
9929   /* Both source operands cannot be in memory.  */
9930   if (MEM_P (src1) && MEM_P (src2))
9931     {
9932       /* Optimization: Only read from memory once.  */
9933       if (rtx_equal_p (src1, src2))
9934         {
9935           src2 = force_reg (mode, src2);
9936           src1 = src2;
9937         }
9938       else
9939         src2 = force_reg (mode, src2);
9940     }
9941
9942   /* If the destination is memory, and we do not have matching source
9943      operands, do things in registers.  */
9944   if (MEM_P (dst) && !rtx_equal_p (dst, src1))
9945     dst = gen_reg_rtx (mode);
9946
9947   /* Source 1 cannot be a constant.  */
9948   if (CONSTANT_P (src1))
9949     src1 = force_reg (mode, src1);
9950
9951   /* Source 1 cannot be a non-matching memory.  */
9952   if (MEM_P (src1) && !rtx_equal_p (dst, src1))
9953     src1 = force_reg (mode, src1);
9954
9955   operands[1] = src1;
9956   operands[2] = src2;
9957   return dst;
9958 }
9959
9960 /* Similarly, but assume that the destination has already been
9961    set up properly.  */
9962
9963 void
9964 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
9965                                     enum machine_mode mode, rtx operands[])
9966 {
9967   rtx dst = ix86_fixup_binary_operands (code, mode, operands);
9968   gcc_assert (dst == operands[0]);
9969 }
9970
9971 /* Attempt to expand a binary operator.  Make the expansion closer to the
9972    actual machine, then just general_operand, which will allow 3 separate
9973    memory references (one output, two input) in a single insn.  */
9974
9975 void
9976 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
9977                              rtx operands[])
9978 {
9979   rtx src1, src2, dst, op, clob;
9980
9981   dst = ix86_fixup_binary_operands (code, mode, operands);
9982   src1 = operands[1];
9983   src2 = operands[2];
9984
9985  /* Emit the instruction.  */
9986
9987   op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
9988   if (reload_in_progress)
9989     {
9990       /* Reload doesn't know about the flags register, and doesn't know that
9991          it doesn't want to clobber it.  We can only do this with PLUS.  */
9992       gcc_assert (code == PLUS);
9993       emit_insn (op);
9994     }
9995   else
9996     {
9997       clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
9998       emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
9999     }
10000
10001   /* Fix up the destination if needed.  */
10002   if (dst != operands[0])
10003     emit_move_insn (operands[0], dst);
10004 }
10005
10006 /* Return TRUE or FALSE depending on whether the binary operator meets the
10007    appropriate constraints.  */
10008
10009 int
10010 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
10011                          rtx operands[3])
10012 {
10013   rtx dst = operands[0];
10014   rtx src1 = operands[1];
10015   rtx src2 = operands[2];
10016
10017   /* Both source operands cannot be in memory.  */
10018   if (MEM_P (src1) && MEM_P (src2))
10019     return 0;
10020
10021   /* Canonicalize operand order for commutative operators.  */
10022   if (ix86_swap_binary_operands_p (code, mode, operands))
10023     {
10024       rtx temp = src1;
10025       src1 = src2;
10026       src2 = temp;
10027     }
10028
10029   /* If the destination is memory, we must have a matching source operand.  */
10030   if (MEM_P (dst) && !rtx_equal_p (dst, src1))
10031       return 0;
10032
10033   /* Source 1 cannot be a constant.  */
10034   if (CONSTANT_P (src1))
10035     return 0;
10036
10037   /* Source 1 cannot be a non-matching memory.  */
10038   if (MEM_P (src1) && !rtx_equal_p (dst, src1))
10039     return 0;
10040
10041   return 1;
10042 }
10043
10044 /* Attempt to expand a unary operator.  Make the expansion closer to the
10045    actual machine, then just general_operand, which will allow 2 separate
10046    memory references (one output, one input) in a single insn.  */
10047
10048 void
10049 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
10050                             rtx operands[])
10051 {
10052   int matching_memory;
10053   rtx src, dst, op, clob;
10054
10055   dst = operands[0];
10056   src = operands[1];
10057
10058   /* If the destination is memory, and we do not have matching source
10059      operands, do things in registers.  */
10060   matching_memory = 0;
10061   if (MEM_P (dst))
10062     {
10063       if (rtx_equal_p (dst, src))
10064         matching_memory = 1;
10065       else
10066         dst = gen_reg_rtx (mode);
10067     }
10068
10069   /* When source operand is memory, destination must match.  */
10070   if (MEM_P (src) && !matching_memory)
10071     src = force_reg (mode, src);
10072
10073   /* Emit the instruction.  */
10074
10075   op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
10076   if (reload_in_progress || code == NOT)
10077     {
10078       /* Reload doesn't know about the flags register, and doesn't know that
10079          it doesn't want to clobber it.  */
10080       gcc_assert (code == NOT);
10081       emit_insn (op);
10082     }
10083   else
10084     {
10085       clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
10086       emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
10087     }
10088
10089   /* Fix up the destination if needed.  */
10090   if (dst != operands[0])
10091     emit_move_insn (operands[0], dst);
10092 }
10093
10094 /* Return TRUE or FALSE depending on whether the unary operator meets the
10095    appropriate constraints.  */
10096
10097 int
10098 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
10099                         enum machine_mode mode ATTRIBUTE_UNUSED,
10100                         rtx operands[2] ATTRIBUTE_UNUSED)
10101 {
10102   /* If one of operands is memory, source and destination must match.  */
10103   if ((MEM_P (operands[0])
10104        || MEM_P (operands[1]))
10105       && ! rtx_equal_p (operands[0], operands[1]))
10106     return FALSE;
10107   return TRUE;
10108 }
10109
10110 /* Post-reload splitter for converting an SF or DFmode value in an
10111    SSE register into an unsigned SImode.  */
10112
10113 void
10114 ix86_split_convert_uns_si_sse (rtx operands[])
10115 {
10116   enum machine_mode vecmode;
10117   rtx value, large, zero_or_two31, input, two31, x;
10118
10119   large = operands[1];
10120   zero_or_two31 = operands[2];
10121   input = operands[3];
10122   two31 = operands[4];
10123   vecmode = GET_MODE (large);
10124   value = gen_rtx_REG (vecmode, REGNO (operands[0]));
10125
10126   /* Load up the value into the low element.  We must ensure that the other
10127      elements are valid floats -- zero is the easiest such value.  */
10128   if (MEM_P (input))
10129     {
10130       if (vecmode == V4SFmode)
10131         emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
10132       else
10133         emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
10134     }
10135   else
10136     {
10137       input = gen_rtx_REG (vecmode, REGNO (input));
10138       emit_move_insn (value, CONST0_RTX (vecmode));
10139       if (vecmode == V4SFmode)
10140         emit_insn (gen_sse_movss (value, value, input));
10141       else
10142         emit_insn (gen_sse2_movsd (value, value, input));
10143     }
10144
10145   emit_move_insn (large, two31);
10146   emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
10147
10148   x = gen_rtx_fmt_ee (LE, vecmode, large, value);
10149   emit_insn (gen_rtx_SET (VOIDmode, large, x));
10150
10151   x = gen_rtx_AND (vecmode, zero_or_two31, large);
10152   emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
10153
10154   x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
10155   emit_insn (gen_rtx_SET (VOIDmode, value, x));
10156
10157   large = gen_rtx_REG (V4SImode, REGNO (large));
10158   emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
10159
10160   x = gen_rtx_REG (V4SImode, REGNO (value));
10161   if (vecmode == V4SFmode)
10162     emit_insn (gen_sse2_cvttps2dq (x, value));
10163   else
10164     emit_insn (gen_sse2_cvttpd2dq (x, value));
10165   value = x;
10166
10167   emit_insn (gen_xorv4si3 (value, value, large));
10168 }
10169
10170 /* Convert an unsigned DImode value into a DFmode, using only SSE.
10171    Expects the 64-bit DImode to be supplied in a pair of integral
10172    registers.  Requires SSE2; will use SSE3 if available.  For x86_32,
10173    -mfpmath=sse, !optimize_size only.  */
10174
10175 void
10176 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
10177 {
10178   REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
10179   rtx int_xmm, fp_xmm;
10180   rtx biases, exponents;
10181   rtx x;
10182
10183   int_xmm = gen_reg_rtx (V4SImode);
10184   if (TARGET_INTER_UNIT_MOVES)
10185     emit_insn (gen_movdi_to_sse (int_xmm, input));
10186   else if (TARGET_SSE_SPLIT_REGS)
10187     {
10188       emit_insn (gen_rtx_CLOBBER (VOIDmode, int_xmm));
10189       emit_move_insn (gen_lowpart (DImode, int_xmm), input);
10190     }
10191   else
10192     {
10193       x = gen_reg_rtx (V2DImode);
10194       ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
10195       emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
10196     }
10197
10198   x = gen_rtx_CONST_VECTOR (V4SImode,
10199                             gen_rtvec (4, GEN_INT (0x43300000UL),
10200                                        GEN_INT (0x45300000UL),
10201                                        const0_rtx, const0_rtx));
10202   exponents = validize_mem (force_const_mem (V4SImode, x));
10203
10204   /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
10205   emit_insn (gen_sse2_punpckldq (int_xmm, int_xmm, exponents));
10206
10207   /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
10208      yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
10209      Similarly (0x45300000UL ## fp_value_hi_xmm) yields
10210      (0x1.0p84 + double(fp_value_hi_xmm)).
10211      Note these exponents differ by 32.  */
10212
10213   fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
10214
10215   /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
10216      in [0,2**32-1] and [0]+[2**32,2**64-1] respectively.  */
10217   real_ldexp (&bias_lo_rvt, &dconst1, 52);
10218   real_ldexp (&bias_hi_rvt, &dconst1, 84);
10219   biases = const_double_from_real_value (bias_lo_rvt, DFmode);
10220   x = const_double_from_real_value (bias_hi_rvt, DFmode);
10221   biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
10222   biases = validize_mem (force_const_mem (V2DFmode, biases));
10223   emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
10224
10225   /* Add the upper and lower DFmode values together.  */
10226   if (TARGET_SSE3)
10227     emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
10228   else
10229     {
10230       x = copy_to_mode_reg (V2DFmode, fp_xmm);
10231       emit_insn (gen_sse2_unpckhpd (fp_xmm, fp_xmm, fp_xmm));
10232       emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
10233     }
10234
10235   ix86_expand_vector_extract (false, target, fp_xmm, 0);
10236 }
10237
10238 /* Convert an unsigned SImode value into a DFmode.  Only currently used
10239    for SSE, but applicable anywhere.  */
10240
10241 void
10242 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
10243 {
10244   REAL_VALUE_TYPE TWO31r;
10245   rtx x, fp;
10246
10247   x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
10248                            NULL, 1, OPTAB_DIRECT);
10249
10250   fp = gen_reg_rtx (DFmode);
10251   emit_insn (gen_floatsidf2 (fp, x));
10252
10253   real_ldexp (&TWO31r, &dconst1, 31);
10254   x = const_double_from_real_value (TWO31r, DFmode);
10255
10256   x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
10257   if (x != target)
10258     emit_move_insn (target, x);
10259 }
10260
10261 /* Convert a signed DImode value into a DFmode.  Only used for SSE in
10262    32-bit mode; otherwise we have a direct convert instruction.  */
10263
10264 void
10265 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
10266 {
10267   REAL_VALUE_TYPE TWO32r;
10268   rtx fp_lo, fp_hi, x;
10269
10270   fp_lo = gen_reg_rtx (DFmode);
10271   fp_hi = gen_reg_rtx (DFmode);
10272
10273   emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
10274
10275   real_ldexp (&TWO32r, &dconst1, 32);
10276   x = const_double_from_real_value (TWO32r, DFmode);
10277   fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
10278
10279   ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
10280
10281   x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
10282                            0, OPTAB_DIRECT);
10283   if (x != target)
10284     emit_move_insn (target, x);
10285 }
10286
10287 /* Convert an unsigned SImode value into a SFmode, using only SSE.
10288    For x86_32, -mfpmath=sse, !optimize_size only.  */
10289 void
10290 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
10291 {
10292   REAL_VALUE_TYPE ONE16r;
10293   rtx fp_hi, fp_lo, int_hi, int_lo, x;
10294
10295   real_ldexp (&ONE16r, &dconst1, 16);
10296   x = const_double_from_real_value (ONE16r, SFmode);
10297   int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
10298                                       NULL, 0, OPTAB_DIRECT);
10299   int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
10300                                       NULL, 0, OPTAB_DIRECT);
10301   fp_hi = gen_reg_rtx (SFmode);
10302   fp_lo = gen_reg_rtx (SFmode);
10303   emit_insn (gen_floatsisf2 (fp_hi, int_hi));
10304   emit_insn (gen_floatsisf2 (fp_lo, int_lo));
10305   fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
10306                                0, OPTAB_DIRECT);
10307   fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
10308                                0, OPTAB_DIRECT);
10309   if (!rtx_equal_p (target, fp_hi))
10310     emit_move_insn (target, fp_hi);
10311 }
10312
10313 /* A subroutine of ix86_build_signbit_mask_vector.  If VECT is true,
10314    then replicate the value for all elements of the vector
10315    register.  */
10316
10317 rtx
10318 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
10319 {
10320   rtvec v;
10321   switch (mode)
10322     {
10323     case SFmode:
10324       if (vect)
10325         v = gen_rtvec (4, value, value, value, value);
10326       else
10327         v = gen_rtvec (4, value, CONST0_RTX (SFmode),
10328                        CONST0_RTX (SFmode), CONST0_RTX (SFmode));
10329       return gen_rtx_CONST_VECTOR (V4SFmode, v);
10330
10331     case DFmode:
10332       if (vect)
10333         v = gen_rtvec (2, value, value);
10334       else
10335         v = gen_rtvec (2, value, CONST0_RTX (DFmode));
10336       return gen_rtx_CONST_VECTOR (V2DFmode, v);
10337
10338     default:
10339       gcc_unreachable ();
10340     }
10341 }
10342
10343 /* A subroutine of ix86_expand_fp_absneg_operator and copysign expanders.
10344    Create a mask for the sign bit in MODE for an SSE register.  If VECT is
10345    true, then replicate the mask for all elements of the vector register.
10346    If INVERT is true, then create a mask excluding the sign bit.  */
10347
10348 rtx
10349 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
10350 {
10351   enum machine_mode vec_mode;
10352   HOST_WIDE_INT hi, lo;
10353   int shift = 63;
10354   rtx v;
10355   rtx mask;
10356
10357   /* Find the sign bit, sign extended to 2*HWI.  */
10358   if (mode == SFmode)
10359     lo = 0x80000000, hi = lo < 0;
10360   else if (HOST_BITS_PER_WIDE_INT >= 64)
10361     lo = (HOST_WIDE_INT)1 << shift, hi = -1;
10362   else
10363     lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
10364
10365   if (invert)
10366     lo = ~lo, hi = ~hi;
10367
10368   /* Force this value into the low part of a fp vector constant.  */
10369   mask = immed_double_const (lo, hi, mode == SFmode ? SImode : DImode);
10370   mask = gen_lowpart (mode, mask);
10371
10372   v = ix86_build_const_vector (mode, vect, mask);
10373   vec_mode = (mode == SFmode) ? V4SFmode : V2DFmode;
10374   return force_reg (vec_mode, v);
10375 }
10376
10377 /* Generate code for floating point ABS or NEG.  */
10378
10379 void
10380 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
10381                                 rtx operands[])
10382 {
10383   rtx mask, set, use, clob, dst, src;
10384   bool matching_memory;
10385   bool use_sse = false;
10386   bool vector_mode = VECTOR_MODE_P (mode);
10387   enum machine_mode elt_mode = mode;
10388
10389   if (vector_mode)
10390     {
10391       elt_mode = GET_MODE_INNER (mode);
10392       use_sse = true;
10393     }
10394   else if (TARGET_SSE_MATH)
10395     use_sse = SSE_FLOAT_MODE_P (mode);
10396
10397   /* NEG and ABS performed with SSE use bitwise mask operations.
10398      Create the appropriate mask now.  */
10399   if (use_sse)
10400     mask = ix86_build_signbit_mask (elt_mode, vector_mode, code == ABS);
10401   else
10402     mask = NULL_RTX;
10403
10404   dst = operands[0];
10405   src = operands[1];
10406
10407   /* If the destination is memory, and we don't have matching source
10408      operands or we're using the x87, do things in registers.  */
10409   matching_memory = false;
10410   if (MEM_P (dst))
10411     {
10412       if (use_sse && rtx_equal_p (dst, src))
10413         matching_memory = true;
10414       else
10415         dst = gen_reg_rtx (mode);
10416     }
10417   if (MEM_P (src) && !matching_memory)
10418     src = force_reg (mode, src);
10419
10420   if (vector_mode)
10421     {
10422       set = gen_rtx_fmt_ee (code == NEG ? XOR : AND, mode, src, mask);
10423       set = gen_rtx_SET (VOIDmode, dst, set);
10424       emit_insn (set);
10425     }
10426   else
10427     {
10428       set = gen_rtx_fmt_e (code, mode, src);
10429       set = gen_rtx_SET (VOIDmode, dst, set);
10430       if (mask)
10431         {
10432           use = gen_rtx_USE (VOIDmode, mask);
10433           clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
10434           emit_insn (gen_rtx_PARALLEL (VOIDmode,
10435                                        gen_rtvec (3, set, use, clob)));
10436         }
10437       else
10438         emit_insn (set);
10439     }
10440
10441   if (dst != operands[0])
10442     emit_move_insn (operands[0], dst);
10443 }
10444
10445 /* Expand a copysign operation.  Special case operand 0 being a constant.  */
10446
10447 void
10448 ix86_expand_copysign (rtx operands[])
10449 {
10450   enum machine_mode mode, vmode;
10451   rtx dest, op0, op1, mask, nmask;
10452
10453   dest = operands[0];
10454   op0 = operands[1];
10455   op1 = operands[2];
10456
10457   mode = GET_MODE (dest);
10458   vmode = mode == SFmode ? V4SFmode : V2DFmode;
10459
10460   if (GET_CODE (op0) == CONST_DOUBLE)
10461     {
10462       rtvec v;
10463
10464       if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
10465         op0 = simplify_unary_operation (ABS, mode, op0, mode);
10466
10467       if (op0 == CONST0_RTX (mode))
10468         op0 = CONST0_RTX (vmode);
10469       else
10470         {
10471           if (mode == SFmode)
10472             v = gen_rtvec (4, op0, CONST0_RTX (SFmode),
10473                            CONST0_RTX (SFmode), CONST0_RTX (SFmode));
10474           else
10475             v = gen_rtvec (2, op0, CONST0_RTX (DFmode));
10476           op0 = force_reg (vmode, gen_rtx_CONST_VECTOR (vmode, v));
10477         }
10478
10479       mask = ix86_build_signbit_mask (mode, 0, 0);
10480
10481       if (mode == SFmode)
10482         emit_insn (gen_copysignsf3_const (dest, op0, op1, mask));
10483       else
10484         emit_insn (gen_copysigndf3_const (dest, op0, op1, mask));
10485     }
10486   else
10487     {
10488       nmask = ix86_build_signbit_mask (mode, 0, 1);
10489       mask = ix86_build_signbit_mask (mode, 0, 0);
10490
10491       if (mode == SFmode)
10492         emit_insn (gen_copysignsf3_var (dest, NULL, op0, op1, nmask, mask));
10493       else
10494         emit_insn (gen_copysigndf3_var (dest, NULL, op0, op1, nmask, mask));
10495     }
10496 }
10497
10498 /* Deconstruct a copysign operation into bit masks.  Operand 0 is known to
10499    be a constant, and so has already been expanded into a vector constant.  */
10500
10501 void
10502 ix86_split_copysign_const (rtx operands[])
10503 {
10504   enum machine_mode mode, vmode;
10505   rtx dest, op0, op1, mask, x;
10506
10507   dest = operands[0];
10508   op0 = operands[1];
10509   op1 = operands[2];
10510   mask = operands[3];
10511
10512   mode = GET_MODE (dest);
10513   vmode = GET_MODE (mask);
10514
10515   dest = simplify_gen_subreg (vmode, dest, mode, 0);
10516   x = gen_rtx_AND (vmode, dest, mask);
10517   emit_insn (gen_rtx_SET (VOIDmode, dest, x));
10518
10519   if (op0 != CONST0_RTX (vmode))
10520     {
10521       x = gen_rtx_IOR (vmode, dest, op0);
10522       emit_insn (gen_rtx_SET (VOIDmode, dest, x));
10523     }
10524 }
10525
10526 /* Deconstruct a copysign operation into bit masks.  Operand 0 is variable,
10527    so we have to do two masks.  */
10528
10529 void
10530 ix86_split_copysign_var (rtx operands[])
10531 {
10532   enum machine_mode mode, vmode;
10533   rtx dest, scratch, op0, op1, mask, nmask, x;
10534
10535   dest = operands[0];
10536   scratch = operands[1];
10537   op0 = operands[2];
10538   op1 = operands[3];
10539   nmask = operands[4];
10540   mask = operands[5];
10541
10542   mode = GET_MODE (dest);
10543   vmode = GET_MODE (mask);
10544
10545   if (rtx_equal_p (op0, op1))
10546     {
10547       /* Shouldn't happen often (it's useless, obviously), but when it does
10548          we'd generate incorrect code if we continue below.  */
10549       emit_move_insn (dest, op0);
10550       return;
10551     }
10552
10553   if (REG_P (mask) && REGNO (dest) == REGNO (mask))     /* alternative 0 */
10554     {
10555       gcc_assert (REGNO (op1) == REGNO (scratch));
10556
10557       x = gen_rtx_AND (vmode, scratch, mask);
10558       emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
10559
10560       dest = mask;
10561       op0 = simplify_gen_subreg (vmode, op0, mode, 0);
10562       x = gen_rtx_NOT (vmode, dest);
10563       x = gen_rtx_AND (vmode, x, op0);
10564       emit_insn (gen_rtx_SET (VOIDmode, dest, x));
10565     }
10566   else
10567     {
10568       if (REGNO (op1) == REGNO (scratch))               /* alternative 1,3 */
10569         {
10570           x = gen_rtx_AND (vmode, scratch, mask);
10571         }
10572       else                                              /* alternative 2,4 */
10573         {
10574           gcc_assert (REGNO (mask) == REGNO (scratch));
10575           op1 = simplify_gen_subreg (vmode, op1, mode, 0);
10576           x = gen_rtx_AND (vmode, scratch, op1);
10577         }
10578       emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
10579
10580       if (REGNO (op0) == REGNO (dest))                  /* alternative 1,2 */
10581         {
10582           dest = simplify_gen_subreg (vmode, op0, mode, 0);
10583           x = gen_rtx_AND (vmode, dest, nmask);
10584         }
10585       else                                              /* alternative 3,4 */
10586         {
10587           gcc_assert (REGNO (nmask) == REGNO (dest));
10588           dest = nmask;
10589           op0 = simplify_gen_subreg (vmode, op0, mode, 0);
10590           x = gen_rtx_AND (vmode, dest, op0);
10591         }
10592       emit_insn (gen_rtx_SET (VOIDmode, dest, x));
10593     }
10594
10595   x = gen_rtx_IOR (vmode, dest, scratch);
10596   emit_insn (gen_rtx_SET (VOIDmode, dest, x));
10597 }
10598
10599 /* Return TRUE or FALSE depending on whether the first SET in INSN
10600    has source and destination with matching CC modes, and that the
10601    CC mode is at least as constrained as REQ_MODE.  */
10602
10603 int
10604 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
10605 {
10606   rtx set;
10607   enum machine_mode set_mode;
10608
10609   set = PATTERN (insn);
10610   if (GET_CODE (set) == PARALLEL)
10611     set = XVECEXP (set, 0, 0);
10612   gcc_assert (GET_CODE (set) == SET);
10613   gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
10614
10615   set_mode = GET_MODE (SET_DEST (set));
10616   switch (set_mode)
10617     {
10618     case CCNOmode:
10619       if (req_mode != CCNOmode
10620           && (req_mode != CCmode
10621               || XEXP (SET_SRC (set), 1) != const0_rtx))
10622         return 0;
10623       break;
10624     case CCmode:
10625       if (req_mode == CCGCmode)
10626         return 0;
10627       /* FALLTHRU */
10628     case CCGCmode:
10629       if (req_mode == CCGOCmode || req_mode == CCNOmode)
10630         return 0;
10631       /* FALLTHRU */
10632     case CCGOCmode:
10633       if (req_mode == CCZmode)
10634         return 0;
10635       /* FALLTHRU */
10636     case CCZmode:
10637       break;
10638
10639     default:
10640       gcc_unreachable ();
10641     }
10642
10643   return (GET_MODE (SET_SRC (set)) == set_mode);
10644 }
10645
10646 /* Generate insn patterns to do an integer compare of OPERANDS.  */
10647
10648 static rtx
10649 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
10650 {
10651   enum machine_mode cmpmode;
10652   rtx tmp, flags;
10653
10654   cmpmode = SELECT_CC_MODE (code, op0, op1);
10655   flags = gen_rtx_REG (cmpmode, FLAGS_REG);
10656
10657   /* This is very simple, but making the interface the same as in the
10658      FP case makes the rest of the code easier.  */
10659   tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
10660   emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
10661
10662   /* Return the test that should be put into the flags user, i.e.
10663      the bcc, scc, or cmov instruction.  */
10664   return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
10665 }
10666
10667 /* Figure out whether to use ordered or unordered fp comparisons.
10668    Return the appropriate mode to use.  */
10669
10670 enum machine_mode
10671 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
10672 {
10673   /* ??? In order to make all comparisons reversible, we do all comparisons
10674      non-trapping when compiling for IEEE.  Once gcc is able to distinguish
10675      all forms trapping and nontrapping comparisons, we can make inequality
10676      comparisons trapping again, since it results in better code when using
10677      FCOM based compares.  */
10678   return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
10679 }
10680
10681 enum machine_mode
10682 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
10683 {
10684   enum machine_mode mode = GET_MODE (op0);
10685
10686   if (SCALAR_FLOAT_MODE_P (mode))
10687     {
10688       gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
10689       return ix86_fp_compare_mode (code);
10690     }
10691
10692   switch (code)
10693     {
10694       /* Only zero flag is needed.  */
10695     case EQ:                    /* ZF=0 */
10696     case NE:                    /* ZF!=0 */
10697       return CCZmode;
10698       /* Codes needing carry flag.  */
10699     case GEU:                   /* CF=0 */
10700     case GTU:                   /* CF=0 & ZF=0 */
10701     case LTU:                   /* CF=1 */
10702     case LEU:                   /* CF=1 | ZF=1 */
10703       return CCmode;
10704       /* Codes possibly doable only with sign flag when
10705          comparing against zero.  */
10706     case GE:                    /* SF=OF   or   SF=0 */
10707     case LT:                    /* SF<>OF  or   SF=1 */
10708       if (op1 == const0_rtx)
10709         return CCGOCmode;
10710       else
10711         /* For other cases Carry flag is not required.  */
10712         return CCGCmode;
10713       /* Codes doable only with sign flag when comparing
10714          against zero, but we miss jump instruction for it
10715          so we need to use relational tests against overflow
10716          that thus needs to be zero.  */
10717     case GT:                    /* ZF=0 & SF=OF */
10718     case LE:                    /* ZF=1 | SF<>OF */
10719       if (op1 == const0_rtx)
10720         return CCNOmode;
10721       else
10722         return CCGCmode;
10723       /* strcmp pattern do (use flags) and combine may ask us for proper
10724          mode.  */
10725     case USE:
10726       return CCmode;
10727     default:
10728       gcc_unreachable ();
10729     }
10730 }
10731
10732 /* Return the fixed registers used for condition codes.  */
10733
10734 static bool
10735 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
10736 {
10737   *p1 = FLAGS_REG;
10738   *p2 = FPSR_REG;
10739   return true;
10740 }
10741
10742 /* If two condition code modes are compatible, return a condition code
10743    mode which is compatible with both.  Otherwise, return
10744    VOIDmode.  */
10745
10746 static enum machine_mode
10747 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
10748 {
10749   if (m1 == m2)
10750     return m1;
10751
10752   if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
10753     return VOIDmode;
10754
10755   if ((m1 == CCGCmode && m2 == CCGOCmode)
10756       || (m1 == CCGOCmode && m2 == CCGCmode))
10757     return CCGCmode;
10758
10759   switch (m1)
10760     {
10761     default:
10762       gcc_unreachable ();
10763
10764     case CCmode:
10765     case CCGCmode:
10766     case CCGOCmode:
10767     case CCNOmode:
10768     case CCZmode:
10769       switch (m2)
10770         {
10771         default:
10772           return VOIDmode;
10773
10774         case CCmode:
10775         case CCGCmode:
10776         case CCGOCmode:
10777         case CCNOmode:
10778         case CCZmode:
10779           return CCmode;
10780         }
10781
10782     case CCFPmode:
10783     case CCFPUmode:
10784       /* These are only compatible with themselves, which we already
10785          checked above.  */
10786       return VOIDmode;
10787     }
10788 }
10789
10790 /* Split comparison code CODE into comparisons we can do using branch
10791    instructions.  BYPASS_CODE is comparison code for branch that will
10792    branch around FIRST_CODE and SECOND_CODE.  If some of branches
10793    is not required, set value to UNKNOWN.
10794    We never require more than two branches.  */
10795
10796 void
10797 ix86_fp_comparison_codes (enum rtx_code code, enum rtx_code *bypass_code,
10798                           enum rtx_code *first_code,
10799                           enum rtx_code *second_code)
10800 {
10801   *first_code = code;
10802   *bypass_code = UNKNOWN;
10803   *second_code = UNKNOWN;
10804
10805   /* The fcomi comparison sets flags as follows:
10806
10807      cmp    ZF PF CF
10808      >      0  0  0
10809      <      0  0  1
10810      =      1  0  0
10811      un     1  1  1 */
10812
10813   switch (code)
10814     {
10815     case GT:                    /* GTU - CF=0 & ZF=0 */
10816     case GE:                    /* GEU - CF=0 */
10817     case ORDERED:               /* PF=0 */
10818     case UNORDERED:             /* PF=1 */
10819     case UNEQ:                  /* EQ - ZF=1 */
10820     case UNLT:                  /* LTU - CF=1 */
10821     case UNLE:                  /* LEU - CF=1 | ZF=1 */
10822     case LTGT:                  /* EQ - ZF=0 */
10823       break;
10824     case LT:                    /* LTU - CF=1 - fails on unordered */
10825       *first_code = UNLT;
10826       *bypass_code = UNORDERED;
10827       break;
10828     case LE:                    /* LEU - CF=1 | ZF=1 - fails on unordered */
10829       *first_code = UNLE;
10830       *bypass_code = UNORDERED;
10831       break;
10832     case EQ:                    /* EQ - ZF=1 - fails on unordered */
10833       *first_code = UNEQ;
10834       *bypass_code = UNORDERED;
10835       break;
10836     case NE:                    /* NE - ZF=0 - fails on unordered */
10837       *first_code = LTGT;
10838       *second_code = UNORDERED;
10839       break;
10840     case UNGE:                  /* GEU - CF=0 - fails on unordered */
10841       *first_code = GE;
10842       *second_code = UNORDERED;
10843       break;
10844     case UNGT:                  /* GTU - CF=0 & ZF=0 - fails on unordered */
10845       *first_code = GT;
10846       *second_code = UNORDERED;
10847       break;
10848     default:
10849       gcc_unreachable ();
10850     }
10851   if (!TARGET_IEEE_FP)
10852     {
10853       *second_code = UNKNOWN;
10854       *bypass_code = UNKNOWN;
10855     }
10856 }
10857
10858 /* Return cost of comparison done fcom + arithmetics operations on AX.
10859    All following functions do use number of instructions as a cost metrics.
10860    In future this should be tweaked to compute bytes for optimize_size and
10861    take into account performance of various instructions on various CPUs.  */
10862 static int
10863 ix86_fp_comparison_arithmetics_cost (enum rtx_code code)
10864 {
10865   if (!TARGET_IEEE_FP)
10866     return 4;
10867   /* The cost of code output by ix86_expand_fp_compare.  */
10868   switch (code)
10869     {
10870     case UNLE:
10871     case UNLT:
10872     case LTGT:
10873     case GT:
10874     case GE:
10875     case UNORDERED:
10876     case ORDERED:
10877     case UNEQ:
10878       return 4;
10879       break;
10880     case LT:
10881     case NE:
10882     case EQ:
10883     case UNGE:
10884       return 5;
10885       break;
10886     case LE:
10887     case UNGT:
10888       return 6;
10889       break;
10890     default:
10891       gcc_unreachable ();
10892     }
10893 }
10894
10895 /* Return cost of comparison done using fcomi operation.
10896    See ix86_fp_comparison_arithmetics_cost for the metrics.  */
10897 static int
10898 ix86_fp_comparison_fcomi_cost (enum rtx_code code)
10899 {
10900   enum rtx_code bypass_code, first_code, second_code;
10901   /* Return arbitrarily high cost when instruction is not supported - this
10902      prevents gcc from using it.  */
10903   if (!TARGET_CMOVE)
10904     return 1024;
10905   ix86_fp_comparison_codes (code, &bypass_code, &first_code, &second_code);
10906   return (bypass_code != UNKNOWN || second_code != UNKNOWN) + 2;
10907 }
10908
10909 /* Return cost of comparison done using sahf operation.
10910    See ix86_fp_comparison_arithmetics_cost for the metrics.  */
10911 static int
10912 ix86_fp_comparison_sahf_cost (enum rtx_code code)
10913 {
10914   enum rtx_code bypass_code, first_code, second_code;
10915   /* Return arbitrarily high cost when instruction is not preferred - this
10916      avoids gcc from using it.  */
10917   if (!(TARGET_SAHF && (TARGET_USE_SAHF || optimize_size)))
10918     return 1024;
10919   ix86_fp_comparison_codes (code, &bypass_code, &first_code, &second_code);
10920   return (bypass_code != UNKNOWN || second_code != UNKNOWN) + 3;
10921 }
10922
10923 /* Compute cost of the comparison done using any method.
10924    See ix86_fp_comparison_arithmetics_cost for the metrics.  */
10925 static int
10926 ix86_fp_comparison_cost (enum rtx_code code)
10927 {
10928   int fcomi_cost, sahf_cost, arithmetics_cost = 1024;
10929   int min;
10930
10931   fcomi_cost = ix86_fp_comparison_fcomi_cost (code);
10932   sahf_cost = ix86_fp_comparison_sahf_cost (code);
10933
10934   min = arithmetics_cost = ix86_fp_comparison_arithmetics_cost (code);
10935   if (min > sahf_cost)
10936     min = sahf_cost;
10937   if (min > fcomi_cost)
10938     min = fcomi_cost;
10939   return min;
10940 }
10941
10942 /* Return true if we should use an FCOMI instruction for this
10943    fp comparison.  */
10944
10945 int
10946 ix86_use_fcomi_compare (enum rtx_code code ATTRIBUTE_UNUSED)
10947 {
10948   enum rtx_code swapped_code = swap_condition (code);
10949
10950   return ((ix86_fp_comparison_cost (code)
10951            == ix86_fp_comparison_fcomi_cost (code))
10952           || (ix86_fp_comparison_cost (swapped_code)
10953               == ix86_fp_comparison_fcomi_cost (swapped_code)));
10954 }
10955
10956 /* Swap, force into registers, or otherwise massage the two operands
10957    to a fp comparison.  The operands are updated in place; the new
10958    comparison code is returned.  */
10959
10960 static enum rtx_code
10961 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
10962 {
10963   enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
10964   rtx op0 = *pop0, op1 = *pop1;
10965   enum machine_mode op_mode = GET_MODE (op0);
10966   int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
10967
10968   /* All of the unordered compare instructions only work on registers.
10969      The same is true of the fcomi compare instructions.  The XFmode
10970      compare instructions require registers except when comparing
10971      against zero or when converting operand 1 from fixed point to
10972      floating point.  */
10973
10974   if (!is_sse
10975       && (fpcmp_mode == CCFPUmode
10976           || (op_mode == XFmode
10977               && ! (standard_80387_constant_p (op0) == 1
10978                     || standard_80387_constant_p (op1) == 1)
10979               && GET_CODE (op1) != FLOAT)
10980           || ix86_use_fcomi_compare (code)))
10981     {
10982       op0 = force_reg (op_mode, op0);
10983       op1 = force_reg (op_mode, op1);
10984     }
10985   else
10986     {
10987       /* %%% We only allow op1 in memory; op0 must be st(0).  So swap
10988          things around if they appear profitable, otherwise force op0
10989          into a register.  */
10990
10991       if (standard_80387_constant_p (op0) == 0
10992           || (MEM_P (op0)
10993               && ! (standard_80387_constant_p (op1) == 0
10994                     || MEM_P (op1))))
10995         {
10996           rtx tmp;
10997           tmp = op0, op0 = op1, op1 = tmp;
10998           code = swap_condition (code);
10999         }
11000
11001       if (!REG_P (op0))
11002         op0 = force_reg (op_mode, op0);
11003
11004       if (CONSTANT_P (op1))
11005         {
11006           int tmp = standard_80387_constant_p (op1);
11007           if (tmp == 0)
11008             op1 = validize_mem (force_const_mem (op_mode, op1));
11009           else if (tmp == 1)
11010             {
11011               if (TARGET_CMOVE)
11012                 op1 = force_reg (op_mode, op1);
11013             }
11014           else
11015             op1 = force_reg (op_mode, op1);
11016         }
11017     }
11018
11019   /* Try to rearrange the comparison to make it cheaper.  */
11020   if (ix86_fp_comparison_cost (code)
11021       > ix86_fp_comparison_cost (swap_condition (code))
11022       && (REG_P (op1) || !no_new_pseudos))
11023     {
11024       rtx tmp;
11025       tmp = op0, op0 = op1, op1 = tmp;
11026       code = swap_condition (code);
11027       if (!REG_P (op0))
11028         op0 = force_reg (op_mode, op0);
11029     }
11030
11031   *pop0 = op0;
11032   *pop1 = op1;
11033   return code;
11034 }
11035
11036 /* Convert comparison codes we use to represent FP comparison to integer
11037    code that will result in proper branch.  Return UNKNOWN if no such code
11038    is available.  */
11039
11040 enum rtx_code
11041 ix86_fp_compare_code_to_integer (enum rtx_code code)
11042 {
11043   switch (code)
11044     {
11045     case GT:
11046       return GTU;
11047     case GE:
11048       return GEU;
11049     case ORDERED:
11050     case UNORDERED:
11051       return code;
11052       break;
11053     case UNEQ:
11054       return EQ;
11055       break;
11056     case UNLT:
11057       return LTU;
11058       break;
11059     case UNLE:
11060       return LEU;
11061       break;
11062     case LTGT:
11063       return NE;
11064       break;
11065     default:
11066       return UNKNOWN;
11067     }
11068 }
11069
11070 /* Generate insn patterns to do a floating point compare of OPERANDS.  */
11071
11072 static rtx
11073 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch,
11074                         rtx *second_test, rtx *bypass_test)
11075 {
11076   enum machine_mode fpcmp_mode, intcmp_mode;
11077   rtx tmp, tmp2;
11078   int cost = ix86_fp_comparison_cost (code);
11079   enum rtx_code bypass_code, first_code, second_code;
11080
11081   fpcmp_mode = ix86_fp_compare_mode (code);
11082   code = ix86_prepare_fp_compare_args (code, &op0, &op1);
11083
11084   if (second_test)
11085     *second_test = NULL_RTX;
11086   if (bypass_test)
11087     *bypass_test = NULL_RTX;
11088
11089   ix86_fp_comparison_codes (code, &bypass_code, &first_code, &second_code);
11090
11091   /* Do fcomi/sahf based test when profitable.  */
11092   if ((TARGET_CMOVE || TARGET_SAHF)
11093       && (bypass_code == UNKNOWN || bypass_test)
11094       && (second_code == UNKNOWN || second_test)
11095       && ix86_fp_comparison_arithmetics_cost (code) > cost)
11096     {
11097       if (TARGET_CMOVE)
11098         {
11099           tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
11100           tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
11101                              tmp);
11102           emit_insn (tmp);
11103         }
11104       else
11105         {
11106           tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
11107           tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
11108           if (!scratch)
11109             scratch = gen_reg_rtx (HImode);
11110           emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
11111           emit_insn (gen_x86_sahf_1 (scratch));
11112         }
11113
11114       /* The FP codes work out to act like unsigned.  */
11115       intcmp_mode = fpcmp_mode;
11116       code = first_code;
11117       if (bypass_code != UNKNOWN)
11118         *bypass_test = gen_rtx_fmt_ee (bypass_code, VOIDmode,
11119                                        gen_rtx_REG (intcmp_mode, FLAGS_REG),
11120                                        const0_rtx);
11121       if (second_code != UNKNOWN)
11122         *second_test = gen_rtx_fmt_ee (second_code, VOIDmode,
11123                                        gen_rtx_REG (intcmp_mode, FLAGS_REG),
11124                                        const0_rtx);
11125     }
11126   else
11127     {
11128       /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first.  */
11129       tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
11130       tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
11131       if (!scratch)
11132         scratch = gen_reg_rtx (HImode);
11133       emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
11134
11135       /* In the unordered case, we have to check C2 for NaN's, which
11136          doesn't happen to work out to anything nice combination-wise.
11137          So do some bit twiddling on the value we've got in AH to come
11138          up with an appropriate set of condition codes.  */
11139
11140       intcmp_mode = CCNOmode;
11141       switch (code)
11142         {
11143         case GT:
11144         case UNGT:
11145           if (code == GT || !TARGET_IEEE_FP)
11146             {
11147               emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
11148               code = EQ;
11149             }
11150           else
11151             {
11152               emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
11153               emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
11154               emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
11155               intcmp_mode = CCmode;
11156               code = GEU;
11157             }
11158           break;
11159         case LT:
11160         case UNLT:
11161           if (code == LT && TARGET_IEEE_FP)
11162             {
11163               emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
11164               emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x01)));
11165               intcmp_mode = CCmode;
11166               code = EQ;
11167             }
11168           else
11169             {
11170               emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x01)));
11171               code = NE;
11172             }
11173           break;
11174         case GE:
11175         case UNGE:
11176           if (code == GE || !TARGET_IEEE_FP)
11177             {
11178               emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
11179               code = EQ;
11180             }
11181           else
11182             {
11183               emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
11184               emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
11185                                              GEN_INT (0x01)));
11186               code = NE;
11187             }
11188           break;
11189         case LE:
11190         case UNLE:
11191           if (code == LE && TARGET_IEEE_FP)
11192             {
11193               emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
11194               emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
11195               emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
11196               intcmp_mode = CCmode;
11197               code = LTU;
11198             }
11199           else
11200             {
11201               emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
11202               code = NE;
11203             }
11204           break;
11205         case EQ:
11206         case UNEQ:
11207           if (code == EQ && TARGET_IEEE_FP)
11208             {
11209               emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
11210               emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
11211               intcmp_mode = CCmode;
11212               code = EQ;
11213             }
11214           else
11215             {
11216               emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
11217               code = NE;
11218               break;
11219             }
11220           break;
11221         case NE:
11222         case LTGT:
11223           if (code == NE && TARGET_IEEE_FP)
11224             {
11225               emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
11226               emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
11227                                              GEN_INT (0x40)));
11228               code = NE;
11229             }
11230           else
11231             {
11232               emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
11233               code = EQ;
11234             }
11235           break;
11236
11237         case UNORDERED:
11238           emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
11239           code = NE;
11240           break;
11241         case ORDERED:
11242           emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
11243           code = EQ;
11244           break;
11245
11246         default:
11247           gcc_unreachable ();
11248         }
11249     }
11250
11251   /* Return the test that should be put into the flags user, i.e.
11252      the bcc, scc, or cmov instruction.  */
11253   return gen_rtx_fmt_ee (code, VOIDmode,
11254                          gen_rtx_REG (intcmp_mode, FLAGS_REG),
11255                          const0_rtx);
11256 }
11257
11258 rtx
11259 ix86_expand_compare (enum rtx_code code, rtx *second_test, rtx *bypass_test)
11260 {
11261   rtx op0, op1, ret;
11262   op0 = ix86_compare_op0;
11263   op1 = ix86_compare_op1;
11264
11265   if (second_test)
11266     *second_test = NULL_RTX;
11267   if (bypass_test)
11268     *bypass_test = NULL_RTX;
11269
11270   if (ix86_compare_emitted)
11271     {
11272       ret = gen_rtx_fmt_ee (code, VOIDmode, ix86_compare_emitted, const0_rtx);
11273       ix86_compare_emitted = NULL_RTX;
11274     }
11275   else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
11276     {
11277       gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
11278       ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX,
11279                                     second_test, bypass_test);
11280     }
11281   else
11282     ret = ix86_expand_int_compare (code, op0, op1);
11283
11284   return ret;
11285 }
11286
11287 /* Return true if the CODE will result in nontrivial jump sequence.  */
11288 bool
11289 ix86_fp_jump_nontrivial_p (enum rtx_code code)
11290 {
11291   enum rtx_code bypass_code, first_code, second_code;
11292   if (!TARGET_CMOVE)
11293     return true;
11294   ix86_fp_comparison_codes (code, &bypass_code, &first_code, &second_code);
11295   return bypass_code != UNKNOWN || second_code != UNKNOWN;
11296 }
11297
11298 void
11299 ix86_expand_branch (enum rtx_code code, rtx label)
11300 {
11301   rtx tmp;
11302
11303   /* If we have emitted a compare insn, go straight to simple.
11304      ix86_expand_compare won't emit anything if ix86_compare_emitted
11305      is non NULL.  */
11306   if (ix86_compare_emitted)
11307     goto simple;
11308
11309   switch (GET_MODE (ix86_compare_op0))
11310     {
11311     case QImode:
11312     case HImode:
11313     case SImode:
11314       simple:
11315       tmp = ix86_expand_compare (code, NULL, NULL);
11316       tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
11317                                   gen_rtx_LABEL_REF (VOIDmode, label),
11318                                   pc_rtx);
11319       emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
11320       return;
11321
11322     case SFmode:
11323     case DFmode:
11324     case XFmode:
11325       {
11326         rtvec vec;
11327         int use_fcomi;
11328         enum rtx_code bypass_code, first_code, second_code;
11329
11330         code = ix86_prepare_fp_compare_args (code, &ix86_compare_op0,
11331                                              &ix86_compare_op1);
11332
11333         ix86_fp_comparison_codes (code, &bypass_code, &first_code, &second_code);
11334
11335         /* Check whether we will use the natural sequence with one jump.  If
11336            so, we can expand jump early.  Otherwise delay expansion by
11337            creating compound insn to not confuse optimizers.  */
11338         if (bypass_code == UNKNOWN && second_code == UNKNOWN
11339             && TARGET_CMOVE)
11340           {
11341             ix86_split_fp_branch (code, ix86_compare_op0, ix86_compare_op1,
11342                                   gen_rtx_LABEL_REF (VOIDmode, label),
11343                                   pc_rtx, NULL_RTX, NULL_RTX);
11344           }
11345         else
11346           {
11347             tmp = gen_rtx_fmt_ee (code, VOIDmode,
11348                                   ix86_compare_op0, ix86_compare_op1);
11349             tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
11350                                         gen_rtx_LABEL_REF (VOIDmode, label),
11351                                         pc_rtx);
11352             tmp = gen_rtx_SET (VOIDmode, pc_rtx, tmp);
11353
11354             use_fcomi = ix86_use_fcomi_compare (code);
11355             vec = rtvec_alloc (3 + !use_fcomi);
11356             RTVEC_ELT (vec, 0) = tmp;
11357             RTVEC_ELT (vec, 1)
11358               = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCFPmode, 18));
11359             RTVEC_ELT (vec, 2)
11360               = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCFPmode, 17));
11361             if (! use_fcomi)
11362               RTVEC_ELT (vec, 3)
11363                 = gen_rtx_CLOBBER (VOIDmode, gen_rtx_SCRATCH (HImode));
11364
11365             emit_jump_insn (gen_rtx_PARALLEL (VOIDmode, vec));
11366           }
11367         return;
11368       }
11369
11370     case DImode:
11371       if (TARGET_64BIT)
11372         goto simple;
11373     case TImode:
11374       /* Expand DImode branch into multiple compare+branch.  */
11375       {
11376         rtx lo[2], hi[2], label2;
11377         enum rtx_code code1, code2, code3;
11378         enum machine_mode submode;
11379
11380         if (CONSTANT_P (ix86_compare_op0) && ! CONSTANT_P (ix86_compare_op1))
11381           {
11382             tmp = ix86_compare_op0;
11383             ix86_compare_op0 = ix86_compare_op1;
11384             ix86_compare_op1 = tmp;
11385             code = swap_condition (code);
11386           }
11387         if (GET_MODE (ix86_compare_op0) == DImode)
11388           {
11389             split_di (&ix86_compare_op0, 1, lo+0, hi+0);
11390             split_di (&ix86_compare_op1, 1, lo+1, hi+1);
11391             submode = SImode;
11392           }
11393         else
11394           {
11395             split_ti (&ix86_compare_op0, 1, lo+0, hi+0);
11396             split_ti (&ix86_compare_op1, 1, lo+1, hi+1);
11397             submode = DImode;
11398           }
11399
11400         /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
11401            avoid two branches.  This costs one extra insn, so disable when
11402            optimizing for size.  */
11403
11404         if ((code == EQ || code == NE)
11405             && (!optimize_size
11406                 || hi[1] == const0_rtx || lo[1] == const0_rtx))
11407           {
11408             rtx xor0, xor1;
11409
11410             xor1 = hi[0];
11411             if (hi[1] != const0_rtx)
11412               xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
11413                                    NULL_RTX, 0, OPTAB_WIDEN);
11414
11415             xor0 = lo[0];
11416             if (lo[1] != const0_rtx)
11417               xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
11418                                    NULL_RTX, 0, OPTAB_WIDEN);
11419
11420             tmp = expand_binop (submode, ior_optab, xor1, xor0,
11421                                 NULL_RTX, 0, OPTAB_WIDEN);
11422
11423             ix86_compare_op0 = tmp;
11424             ix86_compare_op1 = const0_rtx;
11425             ix86_expand_branch (code, label);
11426             return;
11427           }
11428
11429         /* Otherwise, if we are doing less-than or greater-or-equal-than,
11430            op1 is a constant and the low word is zero, then we can just
11431            examine the high word.  */
11432
11433         if (CONST_INT_P (hi[1]) && lo[1] == const0_rtx)
11434           switch (code)
11435             {
11436             case LT: case LTU: case GE: case GEU:
11437               ix86_compare_op0 = hi[0];
11438               ix86_compare_op1 = hi[1];
11439               ix86_expand_branch (code, label);
11440               return;
11441             default:
11442               break;
11443             }
11444
11445         /* Otherwise, we need two or three jumps.  */
11446
11447         label2 = gen_label_rtx ();
11448
11449         code1 = code;
11450         code2 = swap_condition (code);
11451         code3 = unsigned_condition (code);
11452
11453         switch (code)
11454           {
11455           case LT: case GT: case LTU: case GTU:
11456             break;
11457
11458           case LE:   code1 = LT;  code2 = GT;  break;
11459           case GE:   code1 = GT;  code2 = LT;  break;
11460           case LEU:  code1 = LTU; code2 = GTU; break;
11461           case GEU:  code1 = GTU; code2 = LTU; break;
11462
11463           case EQ:   code1 = UNKNOWN; code2 = NE;  break;
11464           case NE:   code2 = UNKNOWN; break;
11465
11466           default:
11467             gcc_unreachable ();
11468           }
11469
11470         /*
11471          * a < b =>
11472          *    if (hi(a) < hi(b)) goto true;
11473          *    if (hi(a) > hi(b)) goto false;
11474          *    if (lo(a) < lo(b)) goto true;
11475          *  false:
11476          */
11477
11478         ix86_compare_op0 = hi[0];
11479         ix86_compare_op1 = hi[1];
11480
11481         if (code1 != UNKNOWN)
11482           ix86_expand_branch (code1, label);
11483         if (code2 != UNKNOWN)
11484           ix86_expand_branch (code2, label2);
11485
11486         ix86_compare_op0 = lo[0];
11487         ix86_compare_op1 = lo[1];
11488         ix86_expand_branch (code3, label);
11489
11490         if (code2 != UNKNOWN)
11491           emit_label (label2);
11492         return;
11493       }
11494
11495     default:
11496       gcc_unreachable ();
11497     }
11498 }
11499
11500 /* Split branch based on floating point condition.  */
11501 void
11502 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
11503                       rtx target1, rtx target2, rtx tmp, rtx pushed)
11504 {
11505   rtx second, bypass;
11506   rtx label = NULL_RTX;
11507   rtx condition;
11508   int bypass_probability = -1, second_probability = -1, probability = -1;
11509   rtx i;
11510
11511   if (target2 != pc_rtx)
11512     {
11513       rtx tmp = target2;
11514       code = reverse_condition_maybe_unordered (code);
11515       target2 = target1;
11516       target1 = tmp;
11517     }
11518
11519   condition = ix86_expand_fp_compare (code, op1, op2,
11520                                       tmp, &second, &bypass);
11521
11522   /* Remove pushed operand from stack.  */
11523   if (pushed)
11524     ix86_free_from_memory (GET_MODE (pushed));
11525
11526   if (split_branch_probability >= 0)
11527     {
11528       /* Distribute the probabilities across the jumps.
11529          Assume the BYPASS and SECOND to be always test
11530          for UNORDERED.  */
11531       probability = split_branch_probability;
11532
11533       /* Value of 1 is low enough to make no need for probability
11534          to be updated.  Later we may run some experiments and see
11535          if unordered values are more frequent in practice.  */
11536       if (bypass)
11537         bypass_probability = 1;
11538       if (second)
11539         second_probability = 1;
11540     }
11541   if (bypass != NULL_RTX)
11542     {
11543       label = gen_label_rtx ();
11544       i = emit_jump_insn (gen_rtx_SET
11545                           (VOIDmode, pc_rtx,
11546                            gen_rtx_IF_THEN_ELSE (VOIDmode,
11547                                                  bypass,
11548                                                  gen_rtx_LABEL_REF (VOIDmode,
11549                                                                     label),
11550                                                  pc_rtx)));
11551       if (bypass_probability >= 0)
11552         REG_NOTES (i)
11553           = gen_rtx_EXPR_LIST (REG_BR_PROB,
11554                                GEN_INT (bypass_probability),
11555                                REG_NOTES (i));
11556     }
11557   i = emit_jump_insn (gen_rtx_SET
11558                       (VOIDmode, pc_rtx,
11559                        gen_rtx_IF_THEN_ELSE (VOIDmode,
11560                                              condition, target1, target2)));
11561   if (probability >= 0)
11562     REG_NOTES (i)
11563       = gen_rtx_EXPR_LIST (REG_BR_PROB,
11564                            GEN_INT (probability),
11565                            REG_NOTES (i));
11566   if (second != NULL_RTX)
11567     {
11568       i = emit_jump_insn (gen_rtx_SET
11569                           (VOIDmode, pc_rtx,
11570                            gen_rtx_IF_THEN_ELSE (VOIDmode, second, target1,
11571                                                  target2)));
11572       if (second_probability >= 0)
11573         REG_NOTES (i)
11574           = gen_rtx_EXPR_LIST (REG_BR_PROB,
11575                                GEN_INT (second_probability),
11576                                REG_NOTES (i));
11577     }
11578   if (label != NULL_RTX)
11579     emit_label (label);
11580 }
11581
11582 int
11583 ix86_expand_setcc (enum rtx_code code, rtx dest)
11584 {
11585   rtx ret, tmp, tmpreg, equiv;
11586   rtx second_test, bypass_test;
11587
11588   if (GET_MODE (ix86_compare_op0) == (TARGET_64BIT ? TImode : DImode))
11589     return 0; /* FAIL */
11590
11591   gcc_assert (GET_MODE (dest) == QImode);
11592
11593   ret = ix86_expand_compare (code, &second_test, &bypass_test);
11594   PUT_MODE (ret, QImode);
11595
11596   tmp = dest;
11597   tmpreg = dest;
11598
11599   emit_insn (gen_rtx_SET (VOIDmode, tmp, ret));
11600   if (bypass_test || second_test)
11601     {
11602       rtx test = second_test;
11603       int bypass = 0;
11604       rtx tmp2 = gen_reg_rtx (QImode);
11605       if (bypass_test)
11606         {
11607           gcc_assert (!second_test);
11608           test = bypass_test;
11609           bypass = 1;
11610           PUT_CODE (test, reverse_condition_maybe_unordered (GET_CODE (test)));
11611         }
11612       PUT_MODE (test, QImode);
11613       emit_insn (gen_rtx_SET (VOIDmode, tmp2, test));
11614
11615       if (bypass)
11616         emit_insn (gen_andqi3 (tmp, tmpreg, tmp2));
11617       else
11618         emit_insn (gen_iorqi3 (tmp, tmpreg, tmp2));
11619     }
11620
11621   /* Attach a REG_EQUAL note describing the comparison result.  */
11622   if (ix86_compare_op0 && ix86_compare_op1)
11623     {
11624       equiv = simplify_gen_relational (code, QImode,
11625                                        GET_MODE (ix86_compare_op0),
11626                                        ix86_compare_op0, ix86_compare_op1);
11627       set_unique_reg_note (get_last_insn (), REG_EQUAL, equiv);
11628     }
11629
11630   return 1; /* DONE */
11631 }
11632
11633 /* Expand comparison setting or clearing carry flag.  Return true when
11634    successful and set pop for the operation.  */
11635 static bool
11636 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
11637 {
11638   enum machine_mode mode =
11639     GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
11640
11641   /* Do not handle DImode compares that go through special path.
11642      Also we can't deal with FP compares yet.  This is possible to add.  */
11643   if (mode == (TARGET_64BIT ? TImode : DImode))
11644     return false;
11645
11646   if (SCALAR_FLOAT_MODE_P (mode))
11647     {
11648       rtx second_test = NULL, bypass_test = NULL;
11649       rtx compare_op, compare_seq;
11650
11651       gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
11652
11653       /* Shortcut:  following common codes never translate
11654          into carry flag compares.  */
11655       if (code == EQ || code == NE || code == UNEQ || code == LTGT
11656           || code == ORDERED || code == UNORDERED)
11657         return false;
11658
11659       /* These comparisons require zero flag; swap operands so they won't.  */
11660       if ((code == GT || code == UNLE || code == LE || code == UNGT)
11661           && !TARGET_IEEE_FP)
11662         {
11663           rtx tmp = op0;
11664           op0 = op1;
11665           op1 = tmp;
11666           code = swap_condition (code);
11667         }
11668
11669       /* Try to expand the comparison and verify that we end up with carry flag
11670          based comparison.  This is fails to be true only when we decide to expand
11671          comparison using arithmetic that is not too common scenario.  */
11672       start_sequence ();
11673       compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX,
11674                                            &second_test, &bypass_test);
11675       compare_seq = get_insns ();
11676       end_sequence ();
11677
11678       if (second_test || bypass_test)
11679         return false;
11680       if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
11681           || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
11682         code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
11683       else
11684         code = GET_CODE (compare_op);
11685       if (code != LTU && code != GEU)
11686         return false;
11687       emit_insn (compare_seq);
11688       *pop = compare_op;
11689       return true;
11690     }
11691   if (!INTEGRAL_MODE_P (mode))
11692     return false;
11693   switch (code)
11694     {
11695     case LTU:
11696     case GEU:
11697       break;
11698
11699     /* Convert a==0 into (unsigned)a<1.  */
11700     case EQ:
11701     case NE:
11702       if (op1 != const0_rtx)
11703         return false;
11704       op1 = const1_rtx;
11705       code = (code == EQ ? LTU : GEU);
11706       break;
11707
11708     /* Convert a>b into b<a or a>=b-1.  */
11709     case GTU:
11710     case LEU:
11711       if (CONST_INT_P (op1))
11712         {
11713           op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
11714           /* Bail out on overflow.  We still can swap operands but that
11715              would force loading of the constant into register.  */
11716           if (op1 == const0_rtx
11717               || !x86_64_immediate_operand (op1, GET_MODE (op1)))
11718             return false;
11719           code = (code == GTU ? GEU : LTU);
11720         }
11721       else
11722         {
11723           rtx tmp = op1;
11724           op1 = op0;
11725           op0 = tmp;
11726           code = (code == GTU ? LTU : GEU);
11727         }
11728       break;
11729
11730     /* Convert a>=0 into (unsigned)a<0x80000000.  */
11731     case LT:
11732     case GE:
11733       if (mode == DImode || op1 != const0_rtx)
11734         return false;
11735       op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
11736       code = (code == LT ? GEU : LTU);
11737       break;
11738     case LE:
11739     case GT:
11740       if (mode == DImode || op1 != constm1_rtx)
11741         return false;
11742       op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
11743       code = (code == LE ? GEU : LTU);
11744       break;
11745
11746     default:
11747       return false;
11748     }
11749   /* Swapping operands may cause constant to appear as first operand.  */
11750   if (!nonimmediate_operand (op0, VOIDmode))
11751     {
11752       if (no_new_pseudos)
11753         return false;
11754       op0 = force_reg (mode, op0);
11755     }
11756   ix86_compare_op0 = op0;
11757   ix86_compare_op1 = op1;
11758   *pop = ix86_expand_compare (code, NULL, NULL);
11759   gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
11760   return true;
11761 }
11762
11763 int
11764 ix86_expand_int_movcc (rtx operands[])
11765 {
11766   enum rtx_code code = GET_CODE (operands[1]), compare_code;
11767   rtx compare_seq, compare_op;
11768   rtx second_test, bypass_test;
11769   enum machine_mode mode = GET_MODE (operands[0]);
11770   bool sign_bit_compare_p = false;;
11771
11772   start_sequence ();
11773   compare_op = ix86_expand_compare (code, &second_test, &bypass_test);
11774   compare_seq = get_insns ();
11775   end_sequence ();
11776
11777   compare_code = GET_CODE (compare_op);
11778
11779   if ((ix86_compare_op1 == const0_rtx && (code == GE || code == LT))
11780       || (ix86_compare_op1 == constm1_rtx && (code == GT || code == LE)))
11781     sign_bit_compare_p = true;
11782
11783   /* Don't attempt mode expansion here -- if we had to expand 5 or 6
11784      HImode insns, we'd be swallowed in word prefix ops.  */
11785
11786   if ((mode != HImode || TARGET_FAST_PREFIX)
11787       && (mode != (TARGET_64BIT ? TImode : DImode))
11788       && CONST_INT_P (operands[2])
11789       && CONST_INT_P (operands[3]))
11790     {
11791       rtx out = operands[0];
11792       HOST_WIDE_INT ct = INTVAL (operands[2]);
11793       HOST_WIDE_INT cf = INTVAL (operands[3]);
11794       HOST_WIDE_INT diff;
11795
11796       diff = ct - cf;
11797       /*  Sign bit compares are better done using shifts than we do by using
11798           sbb.  */
11799       if (sign_bit_compare_p
11800           || ix86_expand_carry_flag_compare (code, ix86_compare_op0,
11801                                              ix86_compare_op1, &compare_op))
11802         {
11803           /* Detect overlap between destination and compare sources.  */
11804           rtx tmp = out;
11805
11806           if (!sign_bit_compare_p)
11807             {
11808               bool fpcmp = false;
11809
11810               compare_code = GET_CODE (compare_op);
11811
11812               if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
11813                   || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
11814                 {
11815                   fpcmp = true;
11816                   compare_code = ix86_fp_compare_code_to_integer (compare_code);
11817                 }
11818
11819               /* To simplify rest of code, restrict to the GEU case.  */
11820               if (compare_code == LTU)
11821                 {
11822                   HOST_WIDE_INT tmp = ct;
11823                   ct = cf;
11824                   cf = tmp;
11825                   compare_code = reverse_condition (compare_code);
11826                   code = reverse_condition (code);
11827                 }
11828               else
11829                 {
11830                   if (fpcmp)
11831                     PUT_CODE (compare_op,
11832                               reverse_condition_maybe_unordered
11833                                 (GET_CODE (compare_op)));
11834                   else
11835                     PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
11836                 }
11837               diff = ct - cf;
11838
11839               if (reg_overlap_mentioned_p (out, ix86_compare_op0)
11840                   || reg_overlap_mentioned_p (out, ix86_compare_op1))
11841                 tmp = gen_reg_rtx (mode);
11842
11843               if (mode == DImode)
11844                 emit_insn (gen_x86_movdicc_0_m1_rex64 (tmp, compare_op));
11845               else
11846                 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp), compare_op));
11847             }
11848           else
11849             {
11850               if (code == GT || code == GE)
11851                 code = reverse_condition (code);
11852               else
11853                 {
11854                   HOST_WIDE_INT tmp = ct;
11855                   ct = cf;
11856                   cf = tmp;
11857                   diff = ct - cf;
11858                 }
11859               tmp = emit_store_flag (tmp, code, ix86_compare_op0,
11860                                      ix86_compare_op1, VOIDmode, 0, -1);
11861             }
11862
11863           if (diff == 1)
11864             {
11865               /*
11866                * cmpl op0,op1
11867                * sbbl dest,dest
11868                * [addl dest, ct]
11869                *
11870                * Size 5 - 8.
11871                */
11872               if (ct)
11873                 tmp = expand_simple_binop (mode, PLUS,
11874                                            tmp, GEN_INT (ct),
11875                                            copy_rtx (tmp), 1, OPTAB_DIRECT);
11876             }
11877           else if (cf == -1)
11878             {
11879               /*
11880                * cmpl op0,op1
11881                * sbbl dest,dest
11882                * orl $ct, dest
11883                *
11884                * Size 8.
11885                */
11886               tmp = expand_simple_binop (mode, IOR,
11887                                          tmp, GEN_INT (ct),
11888                                          copy_rtx (tmp), 1, OPTAB_DIRECT);
11889             }
11890           else if (diff == -1 && ct)
11891             {
11892               /*
11893                * cmpl op0,op1
11894                * sbbl dest,dest
11895                * notl dest
11896                * [addl dest, cf]
11897                *
11898                * Size 8 - 11.
11899                */
11900               tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
11901               if (cf)
11902                 tmp = expand_simple_binop (mode, PLUS,
11903                                            copy_rtx (tmp), GEN_INT (cf),
11904                                            copy_rtx (tmp), 1, OPTAB_DIRECT);
11905             }
11906           else
11907             {
11908               /*
11909                * cmpl op0,op1
11910                * sbbl dest,dest
11911                * [notl dest]
11912                * andl cf - ct, dest
11913                * [addl dest, ct]
11914                *
11915                * Size 8 - 11.
11916                */
11917
11918               if (cf == 0)
11919                 {
11920                   cf = ct;
11921                   ct = 0;
11922                   tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
11923                 }
11924
11925               tmp = expand_simple_binop (mode, AND,
11926                                          copy_rtx (tmp),
11927                                          gen_int_mode (cf - ct, mode),
11928                                          copy_rtx (tmp), 1, OPTAB_DIRECT);
11929               if (ct)
11930                 tmp = expand_simple_binop (mode, PLUS,
11931                                            copy_rtx (tmp), GEN_INT (ct),
11932                                            copy_rtx (tmp), 1, OPTAB_DIRECT);
11933             }
11934
11935           if (!rtx_equal_p (tmp, out))
11936             emit_move_insn (copy_rtx (out), copy_rtx (tmp));
11937
11938           return 1; /* DONE */
11939         }
11940
11941       if (diff < 0)
11942         {
11943           enum machine_mode cmp_mode = GET_MODE (ix86_compare_op0);
11944
11945           HOST_WIDE_INT tmp;
11946           tmp = ct, ct = cf, cf = tmp;
11947           diff = -diff;
11948
11949           if (SCALAR_FLOAT_MODE_P (cmp_mode))
11950             {
11951               gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
11952
11953               /* We may be reversing unordered compare to normal compare, that
11954                  is not valid in general (we may convert non-trapping condition
11955                  to trapping one), however on i386 we currently emit all
11956                  comparisons unordered.  */
11957               compare_code = reverse_condition_maybe_unordered (compare_code);
11958               code = reverse_condition_maybe_unordered (code);
11959             }
11960           else
11961             {
11962               compare_code = reverse_condition (compare_code);
11963               code = reverse_condition (code);
11964             }
11965         }
11966
11967       compare_code = UNKNOWN;
11968       if (GET_MODE_CLASS (GET_MODE (ix86_compare_op0)) == MODE_INT
11969           && CONST_INT_P (ix86_compare_op1))
11970         {
11971           if (ix86_compare_op1 == const0_rtx
11972               && (code == LT || code == GE))
11973             compare_code = code;
11974           else if (ix86_compare_op1 == constm1_rtx)
11975             {
11976               if (code == LE)
11977                 compare_code = LT;
11978               else if (code == GT)
11979                 compare_code = GE;
11980             }
11981         }
11982
11983       /* Optimize dest = (op0 < 0) ? -1 : cf.  */
11984       if (compare_code != UNKNOWN
11985           && GET_MODE (ix86_compare_op0) == GET_MODE (out)
11986           && (cf == -1 || ct == -1))
11987         {
11988           /* If lea code below could be used, only optimize
11989              if it results in a 2 insn sequence.  */
11990
11991           if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
11992                  || diff == 3 || diff == 5 || diff == 9)
11993               || (compare_code == LT && ct == -1)
11994               || (compare_code == GE && cf == -1))
11995             {
11996               /*
11997                * notl op1       (if necessary)
11998                * sarl $31, op1
11999                * orl cf, op1
12000                */
12001               if (ct != -1)
12002                 {
12003                   cf = ct;
12004                   ct = -1;
12005                   code = reverse_condition (code);
12006                 }
12007
12008               out = emit_store_flag (out, code, ix86_compare_op0,
12009                                      ix86_compare_op1, VOIDmode, 0, -1);
12010
12011               out = expand_simple_binop (mode, IOR,
12012                                          out, GEN_INT (cf),
12013                                          out, 1, OPTAB_DIRECT);
12014               if (out != operands[0])
12015                 emit_move_insn (operands[0], out);
12016
12017               return 1; /* DONE */
12018             }
12019         }
12020
12021
12022       if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
12023            || diff == 3 || diff == 5 || diff == 9)
12024           && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
12025           && (mode != DImode
12026               || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
12027         {
12028           /*
12029            * xorl dest,dest
12030            * cmpl op1,op2
12031            * setcc dest
12032            * lea cf(dest*(ct-cf)),dest
12033            *
12034            * Size 14.
12035            *
12036            * This also catches the degenerate setcc-only case.
12037            */
12038
12039           rtx tmp;
12040           int nops;
12041
12042           out = emit_store_flag (out, code, ix86_compare_op0,
12043                                  ix86_compare_op1, VOIDmode, 0, 1);
12044
12045           nops = 0;
12046           /* On x86_64 the lea instruction operates on Pmode, so we need
12047              to get arithmetics done in proper mode to match.  */
12048           if (diff == 1)
12049             tmp = copy_rtx (out);
12050           else
12051             {
12052               rtx out1;
12053               out1 = copy_rtx (out);
12054               tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
12055               nops++;
12056               if (diff & 1)
12057                 {
12058                   tmp = gen_rtx_PLUS (mode, tmp, out1);
12059                   nops++;
12060                 }
12061             }
12062           if (cf != 0)
12063             {
12064               tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
12065               nops++;
12066             }
12067           if (!rtx_equal_p (tmp, out))
12068             {
12069               if (nops == 1)
12070                 out = force_operand (tmp, copy_rtx (out));
12071               else
12072                 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
12073             }
12074           if (!rtx_equal_p (out, operands[0]))
12075             emit_move_insn (operands[0], copy_rtx (out));
12076
12077           return 1; /* DONE */
12078         }
12079
12080       /*
12081        * General case:                  Jumpful:
12082        *   xorl dest,dest               cmpl op1, op2
12083        *   cmpl op1, op2                movl ct, dest
12084        *   setcc dest                   jcc 1f
12085        *   decl dest                    movl cf, dest
12086        *   andl (cf-ct),dest            1:
12087        *   addl ct,dest
12088        *
12089        * Size 20.                       Size 14.
12090        *
12091        * This is reasonably steep, but branch mispredict costs are
12092        * high on modern cpus, so consider failing only if optimizing
12093        * for space.
12094        */
12095
12096       if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
12097           && BRANCH_COST >= 2)
12098         {
12099           if (cf == 0)
12100             {
12101               enum machine_mode cmp_mode = GET_MODE (ix86_compare_op0);
12102
12103               cf = ct;
12104               ct = 0;
12105
12106               if (SCALAR_FLOAT_MODE_P (cmp_mode))
12107                 {
12108                   gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
12109
12110                   /* We may be reversing unordered compare to normal compare,
12111                      that is not valid in general (we may convert non-trapping
12112                      condition to trapping one), however on i386 we currently
12113                      emit all comparisons unordered.  */
12114                   code = reverse_condition_maybe_unordered (code);
12115                 }
12116               else
12117                 {
12118                   code = reverse_condition (code);
12119                   if (compare_code != UNKNOWN)
12120                     compare_code = reverse_condition (compare_code);
12121                 }
12122             }
12123
12124           if (compare_code != UNKNOWN)
12125             {
12126               /* notl op1       (if needed)
12127                  sarl $31, op1
12128                  andl (cf-ct), op1
12129                  addl ct, op1
12130
12131                  For x < 0 (resp. x <= -1) there will be no notl,
12132                  so if possible swap the constants to get rid of the
12133                  complement.
12134                  True/false will be -1/0 while code below (store flag
12135                  followed by decrement) is 0/-1, so the constants need
12136                  to be exchanged once more.  */
12137
12138               if (compare_code == GE || !cf)
12139                 {
12140                   code = reverse_condition (code);
12141                   compare_code = LT;
12142                 }
12143               else
12144                 {
12145                   HOST_WIDE_INT tmp = cf;
12146                   cf = ct;
12147                   ct = tmp;
12148                 }
12149
12150               out = emit_store_flag (out, code, ix86_compare_op0,
12151                                      ix86_compare_op1, VOIDmode, 0, -1);
12152             }
12153           else
12154             {
12155               out = emit_store_flag (out, code, ix86_compare_op0,
12156                                      ix86_compare_op1, VOIDmode, 0, 1);
12157
12158               out = expand_simple_binop (mode, PLUS, copy_rtx (out), constm1_rtx,
12159                                          copy_rtx (out), 1, OPTAB_DIRECT);
12160             }
12161
12162           out = expand_simple_binop (mode, AND, copy_rtx (out),
12163                                      gen_int_mode (cf - ct, mode),
12164                                      copy_rtx (out), 1, OPTAB_DIRECT);
12165           if (ct)
12166             out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
12167                                        copy_rtx (out), 1, OPTAB_DIRECT);
12168           if (!rtx_equal_p (out, operands[0]))
12169             emit_move_insn (operands[0], copy_rtx (out));
12170
12171           return 1; /* DONE */
12172         }
12173     }
12174
12175   if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
12176     {
12177       /* Try a few things more with specific constants and a variable.  */
12178
12179       optab op;
12180       rtx var, orig_out, out, tmp;
12181
12182       if (BRANCH_COST <= 2)
12183         return 0; /* FAIL */
12184
12185       /* If one of the two operands is an interesting constant, load a
12186          constant with the above and mask it in with a logical operation.  */
12187
12188       if (CONST_INT_P (operands[2]))
12189         {
12190           var = operands[3];
12191           if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
12192             operands[3] = constm1_rtx, op = and_optab;
12193           else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
12194             operands[3] = const0_rtx, op = ior_optab;
12195           else
12196             return 0; /* FAIL */
12197         }
12198       else if (CONST_INT_P (operands[3]))
12199         {
12200           var = operands[2];
12201           if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
12202             operands[2] = constm1_rtx, op = and_optab;
12203           else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
12204             operands[2] = const0_rtx, op = ior_optab;
12205           else
12206             return 0; /* FAIL */
12207         }
12208       else
12209         return 0; /* FAIL */
12210
12211       orig_out = operands[0];
12212       tmp = gen_reg_rtx (mode);
12213       operands[0] = tmp;
12214
12215       /* Recurse to get the constant loaded.  */
12216       if (ix86_expand_int_movcc (operands) == 0)
12217         return 0; /* FAIL */
12218
12219       /* Mask in the interesting variable.  */
12220       out = expand_binop (mode, op, var, tmp, orig_out, 0,
12221                           OPTAB_WIDEN);
12222       if (!rtx_equal_p (out, orig_out))
12223         emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
12224
12225       return 1; /* DONE */
12226     }
12227
12228   /*
12229    * For comparison with above,
12230    *
12231    * movl cf,dest
12232    * movl ct,tmp
12233    * cmpl op1,op2
12234    * cmovcc tmp,dest
12235    *
12236    * Size 15.
12237    */
12238
12239   if (! nonimmediate_operand (operands[2], mode))
12240     operands[2] = force_reg (mode, operands[2]);
12241   if (! nonimmediate_operand (operands[3], mode))
12242     operands[3] = force_reg (mode, operands[3]);
12243
12244   if (bypass_test && reg_overlap_mentioned_p (operands[0], operands[3]))
12245     {
12246       rtx tmp = gen_reg_rtx (mode);
12247       emit_move_insn (tmp, operands[3]);
12248       operands[3] = tmp;
12249     }
12250   if (second_test && reg_overlap_mentioned_p (operands[0], operands[2]))
12251     {
12252       rtx tmp = gen_reg_rtx (mode);
12253       emit_move_insn (tmp, operands[2]);
12254       operands[2] = tmp;
12255     }
12256
12257   if (! register_operand (operands[2], VOIDmode)
12258       && (mode == QImode
12259           || ! register_operand (operands[3], VOIDmode)))
12260     operands[2] = force_reg (mode, operands[2]);
12261
12262   if (mode == QImode
12263       && ! register_operand (operands[3], VOIDmode))
12264     operands[3] = force_reg (mode, operands[3]);
12265
12266   emit_insn (compare_seq);
12267   emit_insn (gen_rtx_SET (VOIDmode, operands[0],
12268                           gen_rtx_IF_THEN_ELSE (mode,
12269                                                 compare_op, operands[2],
12270                                                 operands[3])));
12271   if (bypass_test)
12272     emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (operands[0]),
12273                             gen_rtx_IF_THEN_ELSE (mode,
12274                                   bypass_test,
12275                                   copy_rtx (operands[3]),
12276                                   copy_rtx (operands[0]))));
12277   if (second_test)
12278     emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (operands[0]),
12279                             gen_rtx_IF_THEN_ELSE (mode,
12280                                   second_test,
12281                                   copy_rtx (operands[2]),
12282                                   copy_rtx (operands[0]))));
12283
12284   return 1; /* DONE */
12285 }
12286
12287 /* Swap, force into registers, or otherwise massage the two operands
12288    to an sse comparison with a mask result.  Thus we differ a bit from
12289    ix86_prepare_fp_compare_args which expects to produce a flags result.
12290
12291    The DEST operand exists to help determine whether to commute commutative
12292    operators.  The POP0/POP1 operands are updated in place.  The new
12293    comparison code is returned, or UNKNOWN if not implementable.  */
12294
12295 static enum rtx_code
12296 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
12297                                   rtx *pop0, rtx *pop1)
12298 {
12299   rtx tmp;
12300
12301   switch (code)
12302     {
12303     case LTGT:
12304     case UNEQ:
12305       /* We have no LTGT as an operator.  We could implement it with
12306          NE & ORDERED, but this requires an extra temporary.  It's
12307          not clear that it's worth it.  */
12308       return UNKNOWN;
12309
12310     case LT:
12311     case LE:
12312     case UNGT:
12313     case UNGE:
12314       /* These are supported directly.  */
12315       break;
12316
12317     case EQ:
12318     case NE:
12319     case UNORDERED:
12320     case ORDERED:
12321       /* For commutative operators, try to canonicalize the destination
12322          operand to be first in the comparison - this helps reload to
12323          avoid extra moves.  */
12324       if (!dest || !rtx_equal_p (dest, *pop1))
12325         break;
12326       /* FALLTHRU */
12327
12328     case GE:
12329     case GT:
12330     case UNLE:
12331     case UNLT:
12332       /* These are not supported directly.  Swap the comparison operands
12333          to transform into something that is supported.  */
12334       tmp = *pop0;
12335       *pop0 = *pop1;
12336       *pop1 = tmp;
12337       code = swap_condition (code);
12338       break;
12339
12340     default:
12341       gcc_unreachable ();
12342     }
12343
12344   return code;
12345 }
12346
12347 /* Detect conditional moves that exactly match min/max operational
12348    semantics.  Note that this is IEEE safe, as long as we don't
12349    interchange the operands.
12350
12351    Returns FALSE if this conditional move doesn't match a MIN/MAX,
12352    and TRUE if the operation is successful and instructions are emitted.  */
12353
12354 static bool
12355 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
12356                            rtx cmp_op1, rtx if_true, rtx if_false)
12357 {
12358   enum machine_mode mode;
12359   bool is_min;
12360   rtx tmp;
12361
12362   if (code == LT)
12363     ;
12364   else if (code == UNGE)
12365     {
12366       tmp = if_true;
12367       if_true = if_false;
12368       if_false = tmp;
12369     }
12370   else
12371     return false;
12372
12373   if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
12374     is_min = true;
12375   else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
12376     is_min = false;
12377   else
12378     return false;
12379
12380   mode = GET_MODE (dest);
12381
12382   /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
12383      but MODE may be a vector mode and thus not appropriate.  */
12384   if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
12385     {
12386       int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
12387       rtvec v;
12388
12389       if_true = force_reg (mode, if_true);
12390       v = gen_rtvec (2, if_true, if_false);
12391       tmp = gen_rtx_UNSPEC (mode, v, u);
12392     }
12393   else
12394     {
12395       code = is_min ? SMIN : SMAX;
12396       tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
12397     }
12398
12399   emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
12400   return true;
12401 }
12402
12403 /* Expand an sse vector comparison.  Return the register with the result.  */
12404
12405 static rtx
12406 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
12407                      rtx op_true, rtx op_false)
12408 {
12409   enum machine_mode mode = GET_MODE (dest);
12410   rtx x;
12411
12412   cmp_op0 = force_reg (mode, cmp_op0);
12413   if (!nonimmediate_operand (cmp_op1, mode))
12414     cmp_op1 = force_reg (mode, cmp_op1);
12415
12416   if (optimize
12417       || reg_overlap_mentioned_p (dest, op_true)
12418       || reg_overlap_mentioned_p (dest, op_false))
12419     dest = gen_reg_rtx (mode);
12420
12421   x = gen_rtx_fmt_ee (code, mode, cmp_op0, cmp_op1);
12422   emit_insn (gen_rtx_SET (VOIDmode, dest, x));
12423
12424   return dest;
12425 }
12426
12427 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
12428    operations.  This is used for both scalar and vector conditional moves.  */
12429
12430 static void
12431 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
12432 {
12433   enum machine_mode mode = GET_MODE (dest);
12434   rtx t2, t3, x;
12435
12436   if (op_false == CONST0_RTX (mode))
12437     {
12438       op_true = force_reg (mode, op_true);
12439       x = gen_rtx_AND (mode, cmp, op_true);
12440       emit_insn (gen_rtx_SET (VOIDmode, dest, x));
12441     }
12442   else if (op_true == CONST0_RTX (mode))
12443     {
12444       op_false = force_reg (mode, op_false);
12445       x = gen_rtx_NOT (mode, cmp);
12446       x = gen_rtx_AND (mode, x, op_false);
12447       emit_insn (gen_rtx_SET (VOIDmode, dest, x));
12448     }
12449   else
12450     {
12451       op_true = force_reg (mode, op_true);
12452       op_false = force_reg (mode, op_false);
12453
12454       t2 = gen_reg_rtx (mode);
12455       if (optimize)
12456         t3 = gen_reg_rtx (mode);
12457       else
12458         t3 = dest;
12459
12460       x = gen_rtx_AND (mode, op_true, cmp);
12461       emit_insn (gen_rtx_SET (VOIDmode, t2, x));
12462
12463       x = gen_rtx_NOT (mode, cmp);
12464       x = gen_rtx_AND (mode, x, op_false);
12465       emit_insn (gen_rtx_SET (VOIDmode, t3, x));
12466
12467       x = gen_rtx_IOR (mode, t3, t2);
12468       emit_insn (gen_rtx_SET (VOIDmode, dest, x));
12469     }
12470 }
12471
12472 /* Expand a floating-point conditional move.  Return true if successful.  */
12473
12474 int
12475 ix86_expand_fp_movcc (rtx operands[])
12476 {
12477   enum machine_mode mode = GET_MODE (operands[0]);
12478   enum rtx_code code = GET_CODE (operands[1]);
12479   rtx tmp, compare_op, second_test, bypass_test;
12480
12481   if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
12482     {
12483       enum machine_mode cmode;
12484
12485       /* Since we've no cmove for sse registers, don't force bad register
12486          allocation just to gain access to it.  Deny movcc when the
12487          comparison mode doesn't match the move mode.  */
12488       cmode = GET_MODE (ix86_compare_op0);
12489       if (cmode == VOIDmode)
12490         cmode = GET_MODE (ix86_compare_op1);
12491       if (cmode != mode)
12492         return 0;
12493
12494       code = ix86_prepare_sse_fp_compare_args (operands[0], code,
12495                                                &ix86_compare_op0,
12496                                                &ix86_compare_op1);
12497       if (code == UNKNOWN)
12498         return 0;
12499
12500       if (ix86_expand_sse_fp_minmax (operands[0], code, ix86_compare_op0,
12501                                      ix86_compare_op1, operands[2],
12502                                      operands[3]))
12503         return 1;
12504
12505       tmp = ix86_expand_sse_cmp (operands[0], code, ix86_compare_op0,
12506                                  ix86_compare_op1, operands[2], operands[3]);
12507       ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
12508       return 1;
12509     }
12510
12511   /* The floating point conditional move instructions don't directly
12512      support conditions resulting from a signed integer comparison.  */
12513
12514   compare_op = ix86_expand_compare (code, &second_test, &bypass_test);
12515
12516   /* The floating point conditional move instructions don't directly
12517      support signed integer comparisons.  */
12518
12519   if (!fcmov_comparison_operator (compare_op, VOIDmode))
12520     {
12521       gcc_assert (!second_test && !bypass_test);
12522       tmp = gen_reg_rtx (QImode);
12523       ix86_expand_setcc (code, tmp);
12524       code = NE;
12525       ix86_compare_op0 = tmp;
12526       ix86_compare_op1 = const0_rtx;
12527       compare_op = ix86_expand_compare (code,  &second_test, &bypass_test);
12528     }
12529   if (bypass_test && reg_overlap_mentioned_p (operands[0], operands[3]))
12530     {
12531       tmp = gen_reg_rtx (mode);
12532       emit_move_insn (tmp, operands[3]);
12533       operands[3] = tmp;
12534     }
12535   if (second_test && reg_overlap_mentioned_p (operands[0], operands[2]))
12536     {
12537       tmp = gen_reg_rtx (mode);
12538       emit_move_insn (tmp, operands[2]);
12539       operands[2] = tmp;
12540     }
12541
12542   emit_insn (gen_rtx_SET (VOIDmode, operands[0],
12543                           gen_rtx_IF_THEN_ELSE (mode, compare_op,
12544                                                 operands[2], operands[3])));
12545   if (bypass_test)
12546     emit_insn (gen_rtx_SET (VOIDmode, operands[0],
12547                             gen_rtx_IF_THEN_ELSE (mode, bypass_test,
12548                                                   operands[3], operands[0])));
12549   if (second_test)
12550     emit_insn (gen_rtx_SET (VOIDmode, operands[0],
12551                             gen_rtx_IF_THEN_ELSE (mode, second_test,
12552                                                   operands[2], operands[0])));
12553
12554   return 1;
12555 }
12556
12557 /* Expand a floating-point vector conditional move; a vcond operation
12558    rather than a movcc operation.  */
12559
12560 bool
12561 ix86_expand_fp_vcond (rtx operands[])
12562 {
12563   enum rtx_code code = GET_CODE (operands[3]);
12564   rtx cmp;
12565
12566   code = ix86_prepare_sse_fp_compare_args (operands[0], code,
12567                                            &operands[4], &operands[5]);
12568   if (code == UNKNOWN)
12569     return false;
12570
12571   if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
12572                                  operands[5], operands[1], operands[2]))
12573     return true;
12574
12575   cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
12576                              operands[1], operands[2]);
12577   ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
12578   return true;
12579 }
12580
12581 /* Expand a signed integral vector conditional move.  */
12582
12583 bool
12584 ix86_expand_int_vcond (rtx operands[])
12585 {
12586   enum machine_mode mode = GET_MODE (operands[0]);
12587   enum rtx_code code = GET_CODE (operands[3]);
12588   bool negate = false;
12589   rtx x, cop0, cop1;
12590
12591   cop0 = operands[4];
12592   cop1 = operands[5];
12593
12594   /* Canonicalize the comparison to EQ, GT, GTU.  */
12595   switch (code)
12596     {
12597     case EQ:
12598     case GT:
12599     case GTU:
12600       break;
12601
12602     case NE:
12603     case LE:
12604     case LEU:
12605       code = reverse_condition (code);
12606       negate = true;
12607       break;
12608
12609     case GE:
12610     case GEU:
12611       code = reverse_condition (code);
12612       negate = true;
12613       /* FALLTHRU */
12614
12615     case LT:
12616     case LTU:
12617       code = swap_condition (code);
12618       x = cop0, cop0 = cop1, cop1 = x;
12619       break;
12620
12621     default:
12622       gcc_unreachable ();
12623     }
12624
12625   /* Unsigned parallel compare is not supported by the hardware.  Play some
12626      tricks to turn this into a signed comparison against 0.  */
12627   if (code == GTU)
12628     {
12629       cop0 = force_reg (mode, cop0);
12630
12631       switch (mode)
12632         {
12633         case V4SImode:
12634           {
12635             rtx t1, t2, mask;
12636
12637             /* Perform a parallel modulo subtraction.  */
12638             t1 = gen_reg_rtx (mode);
12639             emit_insn (gen_subv4si3 (t1, cop0, cop1));
12640
12641             /* Extract the original sign bit of op0.  */
12642             mask = GEN_INT (-0x80000000);
12643             mask = gen_rtx_CONST_VECTOR (mode,
12644                         gen_rtvec (4, mask, mask, mask, mask));
12645             mask = force_reg (mode, mask);
12646             t2 = gen_reg_rtx (mode);
12647             emit_insn (gen_andv4si3 (t2, cop0, mask));
12648
12649             /* XOR it back into the result of the subtraction.  This results
12650                in the sign bit set iff we saw unsigned underflow.  */
12651             x = gen_reg_rtx (mode);
12652             emit_insn (gen_xorv4si3 (x, t1, t2));
12653
12654             code = GT;
12655           }
12656           break;
12657
12658         case V16QImode:
12659         case V8HImode:
12660           /* Perform a parallel unsigned saturating subtraction.  */
12661           x = gen_reg_rtx (mode);
12662           emit_insn (gen_rtx_SET (VOIDmode, x,
12663                                   gen_rtx_US_MINUS (mode, cop0, cop1)));
12664
12665           code = EQ;
12666           negate = !negate;
12667           break;
12668
12669         default:
12670           gcc_unreachable ();
12671         }
12672
12673       cop0 = x;
12674       cop1 = CONST0_RTX (mode);
12675     }
12676
12677   x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
12678                            operands[1+negate], operands[2-negate]);
12679
12680   ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
12681                          operands[2-negate]);
12682   return true;
12683 }
12684
12685 /* Unpack OP[1] into the next wider integer vector type.  UNSIGNED_P is
12686    true if we should do zero extension, else sign extension.  HIGH_P is
12687    true if we want the N/2 high elements, else the low elements.  */
12688
12689 void
12690 ix86_expand_sse_unpack (rtx operands[2], bool unsigned_p, bool high_p)
12691 {
12692   enum machine_mode imode = GET_MODE (operands[1]);
12693   rtx (*unpack)(rtx, rtx, rtx);
12694   rtx se, dest;
12695
12696   switch (imode)
12697     {
12698     case V16QImode:
12699       if (high_p)
12700         unpack = gen_vec_interleave_highv16qi;
12701       else
12702         unpack = gen_vec_interleave_lowv16qi;
12703       break;
12704     case V8HImode:
12705       if (high_p)
12706         unpack = gen_vec_interleave_highv8hi;
12707       else
12708         unpack = gen_vec_interleave_lowv8hi;
12709       break;
12710     case V4SImode:
12711       if (high_p)
12712         unpack = gen_vec_interleave_highv4si;
12713       else
12714         unpack = gen_vec_interleave_lowv4si;
12715       break;
12716     default:
12717       gcc_unreachable ();
12718     }
12719
12720   dest = gen_lowpart (imode, operands[0]);
12721
12722   if (unsigned_p)
12723     se = force_reg (imode, CONST0_RTX (imode));
12724   else
12725     se = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
12726                               operands[1], pc_rtx, pc_rtx);
12727
12728   emit_insn (unpack (dest, operands[1], se));
12729 }
12730
12731 /* Expand conditional increment or decrement using adb/sbb instructions.
12732    The default case using setcc followed by the conditional move can be
12733    done by generic code.  */
12734 int
12735 ix86_expand_int_addcc (rtx operands[])
12736 {
12737   enum rtx_code code = GET_CODE (operands[1]);
12738   rtx compare_op;
12739   rtx val = const0_rtx;
12740   bool fpcmp = false;
12741   enum machine_mode mode = GET_MODE (operands[0]);
12742
12743   if (operands[3] != const1_rtx
12744       && operands[3] != constm1_rtx)
12745     return 0;
12746   if (!ix86_expand_carry_flag_compare (code, ix86_compare_op0,
12747                                        ix86_compare_op1, &compare_op))
12748      return 0;
12749   code = GET_CODE (compare_op);
12750
12751   if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
12752       || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
12753     {
12754       fpcmp = true;
12755       code = ix86_fp_compare_code_to_integer (code);
12756     }
12757
12758   if (code != LTU)
12759     {
12760       val = constm1_rtx;
12761       if (fpcmp)
12762         PUT_CODE (compare_op,
12763                   reverse_condition_maybe_unordered
12764                     (GET_CODE (compare_op)));
12765       else
12766         PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
12767     }
12768   PUT_MODE (compare_op, mode);
12769
12770   /* Construct either adc or sbb insn.  */
12771   if ((code == LTU) == (operands[3] == constm1_rtx))
12772     {
12773       switch (GET_MODE (operands[0]))
12774         {
12775           case QImode:
12776             emit_insn (gen_subqi3_carry (operands[0], operands[2], val, compare_op));
12777             break;
12778           case HImode:
12779             emit_insn (gen_subhi3_carry (operands[0], operands[2], val, compare_op));
12780             break;
12781           case SImode:
12782             emit_insn (gen_subsi3_carry (operands[0], operands[2], val, compare_op));
12783             break;
12784           case DImode:
12785             emit_insn (gen_subdi3_carry_rex64 (operands[0], operands[2], val, compare_op));
12786             break;
12787           default:
12788             gcc_unreachable ();
12789         }
12790     }
12791   else
12792     {
12793       switch (GET_MODE (operands[0]))
12794         {
12795           case QImode:
12796             emit_insn (gen_addqi3_carry (operands[0], operands[2], val, compare_op));
12797             break;
12798           case HImode:
12799             emit_insn (gen_addhi3_carry (operands[0], operands[2], val, compare_op));
12800             break;
12801           case SImode:
12802             emit_insn (gen_addsi3_carry (operands[0], operands[2], val, compare_op));
12803             break;
12804           case DImode:
12805             emit_insn (gen_adddi3_carry_rex64 (operands[0], operands[2], val, compare_op));
12806             break;
12807           default:
12808             gcc_unreachable ();
12809         }
12810     }
12811   return 1; /* DONE */
12812 }
12813
12814
12815 /* Split operands 0 and 1 into SImode parts.  Similar to split_di, but
12816    works for floating pointer parameters and nonoffsetable memories.
12817    For pushes, it returns just stack offsets; the values will be saved
12818    in the right order.  Maximally three parts are generated.  */
12819
12820 static int
12821 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
12822 {
12823   int size;
12824
12825   if (!TARGET_64BIT)
12826     size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
12827   else
12828     size = (GET_MODE_SIZE (mode) + 4) / 8;
12829
12830   gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
12831   gcc_assert (size >= 2 && size <= 3);
12832
12833   /* Optimize constant pool reference to immediates.  This is used by fp
12834      moves, that force all constants to memory to allow combining.  */
12835   if (MEM_P (operand) && MEM_READONLY_P (operand))
12836     {
12837       rtx tmp = maybe_get_pool_constant (operand);
12838       if (tmp)
12839         operand = tmp;
12840     }
12841
12842   if (MEM_P (operand) && !offsettable_memref_p (operand))
12843     {
12844       /* The only non-offsetable memories we handle are pushes.  */
12845       int ok = push_operand (operand, VOIDmode);
12846
12847       gcc_assert (ok);
12848
12849       operand = copy_rtx (operand);
12850       PUT_MODE (operand, Pmode);
12851       parts[0] = parts[1] = parts[2] = operand;
12852       return size;
12853     }
12854
12855   if (GET_CODE (operand) == CONST_VECTOR)
12856     {
12857       enum machine_mode imode = int_mode_for_mode (mode);
12858       /* Caution: if we looked through a constant pool memory above,
12859          the operand may actually have a different mode now.  That's
12860          ok, since we want to pun this all the way back to an integer.  */
12861       operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
12862       gcc_assert (operand != NULL);
12863       mode = imode;
12864     }
12865
12866   if (!TARGET_64BIT)
12867     {
12868       if (mode == DImode)
12869         split_di (&operand, 1, &parts[0], &parts[1]);
12870       else
12871         {
12872           if (REG_P (operand))
12873             {
12874               gcc_assert (reload_completed);
12875               parts[0] = gen_rtx_REG (SImode, REGNO (operand) + 0);
12876               parts[1] = gen_rtx_REG (SImode, REGNO (operand) + 1);
12877               if (size == 3)
12878                 parts[2] = gen_rtx_REG (SImode, REGNO (operand) + 2);
12879             }
12880           else if (offsettable_memref_p (operand))
12881             {
12882               operand = adjust_address (operand, SImode, 0);
12883               parts[0] = operand;
12884               parts[1] = adjust_address (operand, SImode, 4);
12885               if (size == 3)
12886                 parts[2] = adjust_address (operand, SImode, 8);
12887             }
12888           else if (GET_CODE (operand) == CONST_DOUBLE)
12889             {
12890               REAL_VALUE_TYPE r;
12891               long l[4];
12892
12893               REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
12894               switch (mode)
12895                 {
12896                 case XFmode:
12897                   REAL_VALUE_TO_TARGET_LONG_DOUBLE (r, l);
12898                   parts[2] = gen_int_mode (l[2], SImode);
12899                   break;
12900                 case DFmode:
12901                   REAL_VALUE_TO_TARGET_DOUBLE (r, l);
12902                   break;
12903                 default:
12904                   gcc_unreachable ();
12905                 }
12906               parts[1] = gen_int_mode (l[1], SImode);
12907               parts[0] = gen_int_mode (l[0], SImode);
12908             }
12909           else
12910             gcc_unreachable ();
12911         }
12912     }
12913   else
12914     {
12915       if (mode == TImode)
12916         split_ti (&operand, 1, &parts[0], &parts[1]);
12917       if (mode == XFmode || mode == TFmode)
12918         {
12919           enum machine_mode upper_mode = mode==XFmode ? SImode : DImode;
12920           if (REG_P (operand))
12921             {
12922               gcc_assert (reload_completed);
12923               parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
12924               parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
12925             }
12926           else if (offsettable_memref_p (operand))
12927             {
12928               operand = adjust_address (operand, DImode, 0);
12929               parts[0] = operand;
12930               parts[1] = adjust_address (operand, upper_mode, 8);
12931             }
12932           else if (GET_CODE (operand) == CONST_DOUBLE)
12933             {
12934               REAL_VALUE_TYPE r;
12935               long l[4];
12936
12937               REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
12938               real_to_target (l, &r, mode);
12939
12940               /* Do not use shift by 32 to avoid warning on 32bit systems.  */
12941               if (HOST_BITS_PER_WIDE_INT >= 64)
12942                 parts[0]
12943                   = gen_int_mode
12944                       ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
12945                        + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
12946                        DImode);
12947               else
12948                 parts[0] = immed_double_const (l[0], l[1], DImode);
12949
12950               if (upper_mode == SImode)
12951                 parts[1] = gen_int_mode (l[2], SImode);
12952               else if (HOST_BITS_PER_WIDE_INT >= 64)
12953                 parts[1]
12954                   = gen_int_mode
12955                       ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
12956                        + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
12957                        DImode);
12958               else
12959                 parts[1] = immed_double_const (l[2], l[3], DImode);
12960             }
12961           else
12962             gcc_unreachable ();
12963         }
12964     }
12965
12966   return size;
12967 }
12968
12969 /* Emit insns to perform a move or push of DI, DF, and XF values.
12970    Return false when normal moves are needed; true when all required
12971    insns have been emitted.  Operands 2-4 contain the input values
12972    int the correct order; operands 5-7 contain the output values.  */
12973
12974 void
12975 ix86_split_long_move (rtx operands[])
12976 {
12977   rtx part[2][3];
12978   int nparts;
12979   int push = 0;
12980   int collisions = 0;
12981   enum machine_mode mode = GET_MODE (operands[0]);
12982
12983   /* The DFmode expanders may ask us to move double.
12984      For 64bit target this is single move.  By hiding the fact
12985      here we simplify i386.md splitters.  */
12986   if (GET_MODE_SIZE (GET_MODE (operands[0])) == 8 && TARGET_64BIT)
12987     {
12988       /* Optimize constant pool reference to immediates.  This is used by
12989          fp moves, that force all constants to memory to allow combining.  */
12990
12991       if (MEM_P (operands[1])
12992           && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
12993           && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
12994         operands[1] = get_pool_constant (XEXP (operands[1], 0));
12995       if (push_operand (operands[0], VOIDmode))
12996         {
12997           operands[0] = copy_rtx (operands[0]);
12998           PUT_MODE (operands[0], Pmode);
12999         }
13000       else
13001         operands[0] = gen_lowpart (DImode, operands[0]);
13002       operands[1] = gen_lowpart (DImode, operands[1]);
13003       emit_move_insn (operands[0], operands[1]);
13004       return;
13005     }
13006
13007   /* The only non-offsettable memory we handle is push.  */
13008   if (push_operand (operands[0], VOIDmode))
13009     push = 1;
13010   else
13011     gcc_assert (!MEM_P (operands[0])
13012                 || offsettable_memref_p (operands[0]));
13013
13014   nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
13015   ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
13016
13017   /* When emitting push, take care for source operands on the stack.  */
13018   if (push && MEM_P (operands[1])
13019       && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
13020     {
13021       if (nparts == 3)
13022         part[1][1] = change_address (part[1][1], GET_MODE (part[1][1]),
13023                                      XEXP (part[1][2], 0));
13024       part[1][0] = change_address (part[1][0], GET_MODE (part[1][0]),
13025                                    XEXP (part[1][1], 0));
13026     }
13027
13028   /* We need to do copy in the right order in case an address register
13029      of the source overlaps the destination.  */
13030   if (REG_P (part[0][0]) && MEM_P (part[1][0]))
13031     {
13032       if (reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0)))
13033         collisions++;
13034       if (reg_overlap_mentioned_p (part[0][1], XEXP (part[1][0], 0)))
13035         collisions++;
13036       if (nparts == 3
13037           && reg_overlap_mentioned_p (part[0][2], XEXP (part[1][0], 0)))
13038         collisions++;
13039
13040       /* Collision in the middle part can be handled by reordering.  */
13041       if (collisions == 1 && nparts == 3
13042           && reg_overlap_mentioned_p (part[0][1], XEXP (part[1][0], 0)))
13043         {
13044           rtx tmp;
13045           tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
13046           tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
13047         }
13048
13049       /* If there are more collisions, we can't handle it by reordering.
13050          Do an lea to the last part and use only one colliding move.  */
13051       else if (collisions > 1)
13052         {
13053           rtx base;
13054
13055           collisions = 1;
13056
13057           base = part[0][nparts - 1];
13058
13059           /* Handle the case when the last part isn't valid for lea.
13060              Happens in 64-bit mode storing the 12-byte XFmode.  */
13061           if (GET_MODE (base) != Pmode)
13062             base = gen_rtx_REG (Pmode, REGNO (base));
13063
13064           emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
13065           part[1][0] = replace_equiv_address (part[1][0], base);
13066           part[1][1] = replace_equiv_address (part[1][1],
13067                                       plus_constant (base, UNITS_PER_WORD));
13068           if (nparts == 3)
13069             part[1][2] = replace_equiv_address (part[1][2],
13070                                       plus_constant (base, 8));
13071         }
13072     }
13073
13074   if (push)
13075     {
13076       if (!TARGET_64BIT)
13077         {
13078           if (nparts == 3)
13079             {
13080               if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
13081                 emit_insn (gen_addsi3 (stack_pointer_rtx, stack_pointer_rtx, GEN_INT (-4)));
13082               emit_move_insn (part[0][2], part[1][2]);
13083             }
13084         }
13085       else
13086         {
13087           /* In 64bit mode we don't have 32bit push available.  In case this is
13088              register, it is OK - we will just use larger counterpart.  We also
13089              retype memory - these comes from attempt to avoid REX prefix on
13090              moving of second half of TFmode value.  */
13091           if (GET_MODE (part[1][1]) == SImode)
13092             {
13093               switch (GET_CODE (part[1][1]))
13094                 {
13095                 case MEM:
13096                   part[1][1] = adjust_address (part[1][1], DImode, 0);
13097                   break;
13098
13099                 case REG:
13100                   part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
13101                   break;
13102
13103                 default:
13104                   gcc_unreachable ();
13105                 }
13106
13107               if (GET_MODE (part[1][0]) == SImode)
13108                 part[1][0] = part[1][1];
13109             }
13110         }
13111       emit_move_insn (part[0][1], part[1][1]);
13112       emit_move_insn (part[0][0], part[1][0]);
13113       return;
13114     }
13115
13116   /* Choose correct order to not overwrite the source before it is copied.  */
13117   if ((REG_P (part[0][0])
13118        && REG_P (part[1][1])
13119        && (REGNO (part[0][0]) == REGNO (part[1][1])
13120            || (nparts == 3
13121                && REGNO (part[0][0]) == REGNO (part[1][2]))))
13122       || (collisions > 0
13123           && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
13124     {
13125       if (nparts == 3)
13126         {
13127           operands[2] = part[0][2];
13128           operands[3] = part[0][1];
13129           operands[4] = part[0][0];
13130           operands[5] = part[1][2];
13131           operands[6] = part[1][1];
13132           operands[7] = part[1][0];
13133         }
13134       else
13135         {
13136           operands[2] = part[0][1];
13137           operands[3] = part[0][0];
13138           operands[5] = part[1][1];
13139           operands[6] = part[1][0];
13140         }
13141     }
13142   else
13143     {
13144       if (nparts == 3)
13145         {
13146           operands[2] = part[0][0];
13147           operands[3] = part[0][1];
13148           operands[4] = part[0][2];
13149           operands[5] = part[1][0];
13150           operands[6] = part[1][1];
13151           operands[7] = part[1][2];
13152         }
13153       else
13154         {
13155           operands[2] = part[0][0];
13156           operands[3] = part[0][1];
13157           operands[5] = part[1][0];
13158           operands[6] = part[1][1];
13159         }
13160     }
13161
13162   /* If optimizing for size, attempt to locally unCSE nonzero constants.  */
13163   if (optimize_size)
13164     {
13165       if (CONST_INT_P (operands[5])
13166           && operands[5] != const0_rtx
13167           && REG_P (operands[2]))
13168         {
13169           if (CONST_INT_P (operands[6])
13170               && INTVAL (operands[6]) == INTVAL (operands[5]))
13171             operands[6] = operands[2];
13172
13173           if (nparts == 3
13174               && CONST_INT_P (operands[7])
13175               && INTVAL (operands[7]) == INTVAL (operands[5]))
13176             operands[7] = operands[2];
13177         }
13178
13179       if (nparts == 3
13180           && CONST_INT_P (operands[6])
13181           && operands[6] != const0_rtx
13182           && REG_P (operands[3])
13183           && CONST_INT_P (operands[7])
13184           && INTVAL (operands[7]) == INTVAL (operands[6]))
13185         operands[7] = operands[3];
13186     }
13187
13188   emit_move_insn (operands[2], operands[5]);
13189   emit_move_insn (operands[3], operands[6]);
13190   if (nparts == 3)
13191     emit_move_insn (operands[4], operands[7]);
13192
13193   return;
13194 }
13195
13196 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
13197    left shift by a constant, either using a single shift or
13198    a sequence of add instructions.  */
13199
13200 static void
13201 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
13202 {
13203   if (count == 1)
13204     {
13205       emit_insn ((mode == DImode
13206                   ? gen_addsi3
13207                   : gen_adddi3) (operand, operand, operand));
13208     }
13209   else if (!optimize_size
13210            && count * ix86_cost->add <= ix86_cost->shift_const)
13211     {
13212       int i;
13213       for (i=0; i<count; i++)
13214         {
13215           emit_insn ((mode == DImode
13216                       ? gen_addsi3
13217                       : gen_adddi3) (operand, operand, operand));
13218         }
13219     }
13220   else
13221     emit_insn ((mode == DImode
13222                 ? gen_ashlsi3
13223                 : gen_ashldi3) (operand, operand, GEN_INT (count)));
13224 }
13225
13226 void
13227 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
13228 {
13229   rtx low[2], high[2];
13230   int count;
13231   const int single_width = mode == DImode ? 32 : 64;
13232
13233   if (CONST_INT_P (operands[2]))
13234     {
13235       (mode == DImode ? split_di : split_ti) (operands, 2, low, high);
13236       count = INTVAL (operands[2]) & (single_width * 2 - 1);
13237
13238       if (count >= single_width)
13239         {
13240           emit_move_insn (high[0], low[1]);
13241           emit_move_insn (low[0], const0_rtx);
13242
13243           if (count > single_width)
13244             ix86_expand_ashl_const (high[0], count - single_width, mode);
13245         }
13246       else
13247         {
13248           if (!rtx_equal_p (operands[0], operands[1]))
13249             emit_move_insn (operands[0], operands[1]);
13250           emit_insn ((mode == DImode
13251                      ? gen_x86_shld_1
13252                      : gen_x86_64_shld) (high[0], low[0], GEN_INT (count)));
13253           ix86_expand_ashl_const (low[0], count, mode);
13254         }
13255       return;
13256     }
13257
13258   (mode == DImode ? split_di : split_ti) (operands, 1, low, high);
13259
13260   if (operands[1] == const1_rtx)
13261     {
13262       /* Assuming we've chosen a QImode capable registers, then 1 << N
13263          can be done with two 32/64-bit shifts, no branches, no cmoves.  */
13264       if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
13265         {
13266           rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
13267
13268           ix86_expand_clear (low[0]);
13269           ix86_expand_clear (high[0]);
13270           emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (single_width)));
13271
13272           d = gen_lowpart (QImode, low[0]);
13273           d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
13274           s = gen_rtx_EQ (QImode, flags, const0_rtx);
13275           emit_insn (gen_rtx_SET (VOIDmode, d, s));
13276
13277           d = gen_lowpart (QImode, high[0]);
13278           d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
13279           s = gen_rtx_NE (QImode, flags, const0_rtx);
13280           emit_insn (gen_rtx_SET (VOIDmode, d, s));
13281         }
13282
13283       /* Otherwise, we can get the same results by manually performing
13284          a bit extract operation on bit 5/6, and then performing the two
13285          shifts.  The two methods of getting 0/1 into low/high are exactly
13286          the same size.  Avoiding the shift in the bit extract case helps
13287          pentium4 a bit; no one else seems to care much either way.  */
13288       else
13289         {
13290           rtx x;
13291
13292           if (TARGET_PARTIAL_REG_STALL && !optimize_size)
13293             x = gen_rtx_ZERO_EXTEND (mode == DImode ? SImode : DImode, operands[2]);
13294           else
13295             x = gen_lowpart (mode == DImode ? SImode : DImode, operands[2]);
13296           emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
13297
13298           emit_insn ((mode == DImode
13299                       ? gen_lshrsi3
13300                       : gen_lshrdi3) (high[0], high[0], GEN_INT (mode == DImode ? 5 : 6)));
13301           emit_insn ((mode == DImode
13302                       ? gen_andsi3
13303                       : gen_anddi3) (high[0], high[0], GEN_INT (1)));
13304           emit_move_insn (low[0], high[0]);
13305           emit_insn ((mode == DImode
13306                       ? gen_xorsi3
13307                       : gen_xordi3) (low[0], low[0], GEN_INT (1)));
13308         }
13309
13310       emit_insn ((mode == DImode
13311                     ? gen_ashlsi3
13312                     : gen_ashldi3) (low[0], low[0], operands[2]));
13313       emit_insn ((mode == DImode
13314                     ? gen_ashlsi3
13315                     : gen_ashldi3) (high[0], high[0], operands[2]));
13316       return;
13317     }
13318
13319   if (operands[1] == constm1_rtx)
13320     {
13321       /* For -1 << N, we can avoid the shld instruction, because we
13322          know that we're shifting 0...31/63 ones into a -1.  */
13323       emit_move_insn (low[0], constm1_rtx);
13324       if (optimize_size)
13325         emit_move_insn (high[0], low[0]);
13326       else
13327         emit_move_insn (high[0], constm1_rtx);
13328     }
13329   else
13330     {
13331       if (!rtx_equal_p (operands[0], operands[1]))
13332         emit_move_insn (operands[0], operands[1]);
13333
13334       (mode == DImode ? split_di : split_ti) (operands, 1, low, high);
13335       emit_insn ((mode == DImode
13336                   ? gen_x86_shld_1
13337                   : gen_x86_64_shld) (high[0], low[0], operands[2]));
13338     }
13339
13340   emit_insn ((mode == DImode ? gen_ashlsi3 : gen_ashldi3) (low[0], low[0], operands[2]));
13341
13342   if (TARGET_CMOVE && scratch)
13343     {
13344       ix86_expand_clear (scratch);
13345       emit_insn ((mode == DImode
13346                   ? gen_x86_shift_adj_1
13347                   : gen_x86_64_shift_adj) (high[0], low[0], operands[2], scratch));
13348     }
13349   else
13350     emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
13351 }
13352
13353 void
13354 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
13355 {
13356   rtx low[2], high[2];
13357   int count;
13358   const int single_width = mode == DImode ? 32 : 64;
13359
13360   if (CONST_INT_P (operands[2]))
13361     {
13362       (mode == DImode ? split_di : split_ti) (operands, 2, low, high);
13363       count = INTVAL (operands[2]) & (single_width * 2 - 1);
13364
13365       if (count == single_width * 2 - 1)
13366         {
13367           emit_move_insn (high[0], high[1]);
13368           emit_insn ((mode == DImode
13369                       ? gen_ashrsi3
13370                       : gen_ashrdi3) (high[0], high[0],
13371                                       GEN_INT (single_width - 1)));
13372           emit_move_insn (low[0], high[0]);
13373
13374         }
13375       else if (count >= single_width)
13376         {
13377           emit_move_insn (low[0], high[1]);
13378           emit_move_insn (high[0], low[0]);
13379           emit_insn ((mode == DImode
13380                       ? gen_ashrsi3
13381                       : gen_ashrdi3) (high[0], high[0],
13382                                       GEN_INT (single_width - 1)));
13383           if (count > single_width)
13384             emit_insn ((mode == DImode
13385                         ? gen_ashrsi3
13386                         : gen_ashrdi3) (low[0], low[0],
13387                                         GEN_INT (count - single_width)));
13388         }
13389       else
13390         {
13391           if (!rtx_equal_p (operands[0], operands[1]))
13392             emit_move_insn (operands[0], operands[1]);
13393           emit_insn ((mode == DImode
13394                       ? gen_x86_shrd_1
13395                       : gen_x86_64_shrd) (low[0], high[0], GEN_INT (count)));
13396           emit_insn ((mode == DImode
13397                       ? gen_ashrsi3
13398                       : gen_ashrdi3) (high[0], high[0], GEN_INT (count)));
13399         }
13400     }
13401   else
13402     {
13403       if (!rtx_equal_p (operands[0], operands[1]))
13404         emit_move_insn (operands[0], operands[1]);
13405
13406       (mode == DImode ? split_di : split_ti) (operands, 1, low, high);
13407
13408       emit_insn ((mode == DImode
13409                   ? gen_x86_shrd_1
13410                   : gen_x86_64_shrd) (low[0], high[0], operands[2]));
13411       emit_insn ((mode == DImode
13412                   ? gen_ashrsi3
13413                   : gen_ashrdi3)  (high[0], high[0], operands[2]));
13414
13415       if (TARGET_CMOVE && scratch)
13416         {
13417           emit_move_insn (scratch, high[0]);
13418           emit_insn ((mode == DImode
13419                       ? gen_ashrsi3
13420                       : gen_ashrdi3) (scratch, scratch,
13421                                       GEN_INT (single_width - 1)));
13422           emit_insn ((mode == DImode
13423                       ? gen_x86_shift_adj_1
13424                       : gen_x86_64_shift_adj) (low[0], high[0], operands[2],
13425                                          scratch));
13426         }
13427       else
13428         emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
13429     }
13430 }
13431
13432 void
13433 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
13434 {
13435   rtx low[2], high[2];
13436   int count;
13437   const int single_width = mode == DImode ? 32 : 64;
13438
13439   if (CONST_INT_P (operands[2]))
13440     {
13441       (mode == DImode ? split_di : split_ti) (operands, 2, low, high);
13442       count = INTVAL (operands[2]) & (single_width * 2 - 1);
13443
13444       if (count >= single_width)
13445         {
13446           emit_move_insn (low[0], high[1]);
13447           ix86_expand_clear (high[0]);
13448
13449           if (count > single_width)
13450             emit_insn ((mode == DImode
13451                         ? gen_lshrsi3
13452                         : gen_lshrdi3) (low[0], low[0],
13453                                         GEN_INT (count - single_width)));
13454         }
13455       else
13456         {
13457           if (!rtx_equal_p (operands[0], operands[1]))
13458             emit_move_insn (operands[0], operands[1]);
13459           emit_insn ((mode == DImode
13460                       ? gen_x86_shrd_1
13461                       : gen_x86_64_shrd) (low[0], high[0], GEN_INT (count)));
13462           emit_insn ((mode == DImode
13463                       ? gen_lshrsi3
13464                       : gen_lshrdi3) (high[0], high[0], GEN_INT (count)));
13465         }
13466     }
13467   else
13468     {
13469       if (!rtx_equal_p (operands[0], operands[1]))
13470         emit_move_insn (operands[0], operands[1]);
13471
13472       (mode == DImode ? split_di : split_ti) (operands, 1, low, high);
13473
13474       emit_insn ((mode == DImode
13475                   ? gen_x86_shrd_1
13476                   : gen_x86_64_shrd) (low[0], high[0], operands[2]));
13477       emit_insn ((mode == DImode
13478                   ? gen_lshrsi3
13479                   : gen_lshrdi3) (high[0], high[0], operands[2]));
13480
13481       /* Heh.  By reversing the arguments, we can reuse this pattern.  */
13482       if (TARGET_CMOVE && scratch)
13483         {
13484           ix86_expand_clear (scratch);
13485           emit_insn ((mode == DImode
13486                       ? gen_x86_shift_adj_1
13487                       : gen_x86_64_shift_adj) (low[0], high[0], operands[2],
13488                                                scratch));
13489         }
13490       else
13491         emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
13492     }
13493 }
13494
13495 /* Predict just emitted jump instruction to be taken with probability PROB.  */
13496 static void
13497 predict_jump (int prob)
13498 {
13499   rtx insn = get_last_insn ();
13500   gcc_assert (JUMP_P (insn));
13501   REG_NOTES (insn)
13502     = gen_rtx_EXPR_LIST (REG_BR_PROB,
13503                          GEN_INT (prob),
13504                          REG_NOTES (insn));
13505 }
13506
13507 /* Helper function for the string operations below.  Dest VARIABLE whether
13508    it is aligned to VALUE bytes.  If true, jump to the label.  */
13509 static rtx
13510 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
13511 {
13512   rtx label = gen_label_rtx ();
13513   rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
13514   if (GET_MODE (variable) == DImode)
13515     emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
13516   else
13517     emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
13518   emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
13519                            1, label);
13520   if (epilogue)
13521     predict_jump (REG_BR_PROB_BASE * 50 / 100);
13522   else
13523     predict_jump (REG_BR_PROB_BASE * 90 / 100);
13524   return label;
13525 }
13526
13527 /* Adjust COUNTER by the VALUE.  */
13528 static void
13529 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
13530 {
13531   if (GET_MODE (countreg) == DImode)
13532     emit_insn (gen_adddi3 (countreg, countreg, GEN_INT (-value)));
13533   else
13534     emit_insn (gen_addsi3 (countreg, countreg, GEN_INT (-value)));
13535 }
13536
13537 /* Zero extend possibly SImode EXP to Pmode register.  */
13538 rtx
13539 ix86_zero_extend_to_Pmode (rtx exp)
13540 {
13541   rtx r;
13542   if (GET_MODE (exp) == VOIDmode)
13543     return force_reg (Pmode, exp);
13544   if (GET_MODE (exp) == Pmode)
13545     return copy_to_mode_reg (Pmode, exp);
13546   r = gen_reg_rtx (Pmode);
13547   emit_insn (gen_zero_extendsidi2 (r, exp));
13548   return r;
13549 }
13550
13551 /* Divide COUNTREG by SCALE.  */
13552 static rtx
13553 scale_counter (rtx countreg, int scale)
13554 {
13555   rtx sc;
13556   rtx piece_size_mask;
13557
13558   if (scale == 1)
13559     return countreg;
13560   if (CONST_INT_P (countreg))
13561     return GEN_INT (INTVAL (countreg) / scale);
13562   gcc_assert (REG_P (countreg));
13563
13564   piece_size_mask = GEN_INT (scale - 1);
13565   sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
13566                             GEN_INT (exact_log2 (scale)),
13567                             NULL, 1, OPTAB_DIRECT);
13568   return sc;
13569 }
13570
13571 /* Return mode for the memcpy/memset loop counter.  Prefer SImode over
13572    DImode for constant loop counts.  */
13573
13574 static enum machine_mode
13575 counter_mode (rtx count_exp)
13576 {
13577   if (GET_MODE (count_exp) != VOIDmode)
13578     return GET_MODE (count_exp);
13579   if (GET_CODE (count_exp) != CONST_INT)
13580     return Pmode;
13581   if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
13582     return DImode;
13583   return SImode;
13584 }
13585
/* When SRCMEM is non-NULL, output a simple loop that copies memory from
   the address in SRCPTR to the address in DESTPTR in chunks of MODE,
   unrolled UNROLL times; the overall size is COUNT, specified in bytes.
   When SRCMEM is NULL, output the equivalent loop that stores VALUE
   (supposed to be in MODE) instead.

   The size is rounded down to a whole number of the chunk size moved at
   once (GET_MODE_SIZE (MODE) * UNROLL bytes per iteration).
   SRCMEM and DESTMEM provide MEM rtxes to feed proper aliasing info.

   EXPECTED_SIZE is the expected byte count used to predict the loop
   back-edge; -1 means unknown, in which case the back-edge is predicted
   taken 80% of the time.

   After the loop, DESTPTR (and SRCPTR, when non-NULL) are advanced by the
   number of bytes processed.  */


static void
expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
                               rtx destptr, rtx srcptr, rtx value,
                               rtx count, enum machine_mode mode, int unroll,
                               int expected_size)
{
  rtx out_label, top_label, iter, tmp;
  enum machine_mode iter_mode = counter_mode (count);
  /* Bytes handled by one loop iteration, and the mask that rounds a byte
     count down to a multiple of it.  */
  rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
  rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
  rtx size;
  rtx x_addr;
  rtx y_addr;
  int i;

  top_label = gen_label_rtx ();
  out_label = gen_label_rtx ();
  iter = gen_reg_rtx (iter_mode);

  /* SIZE = COUNT rounded down to a whole number of pieces.  */
  size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
                              NULL, 1, OPTAB_DIRECT);
  /* Those two should combine.  */
  /* The test that skips the loop for a zero SIZE is emitted only when one
     iteration handles a single byte.  */
  if (piece_size == const1_rtx)
    {
      emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
                               true, out_label);
      predict_jump (REG_BR_PROB_BASE * 10 / 100);
    }
  emit_move_insn (iter, const0_rtx);

  emit_label (top_label);

  /* Address the current piece as DESTPTR + ITER (and SRCPTR + ITER).  */
  tmp = convert_modes (Pmode, iter_mode, iter, true);
  x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
  destmem = change_address (destmem, mode, x_addr);

  if (srcmem)
    {
      y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
      srcmem = change_address (srcmem, mode, y_addr);

      /* When unrolling for chips that reorder memory reads and writes,
         we can save registers by using single temporary.
         Also using 4 temporaries is overkill in 32bit mode.  */
      /* The "&& 0" keeps this direct memory-to-memory variant disabled.  */
      if (!TARGET_64BIT && 0)
        {
          for (i = 0; i < unroll; i++)
            {
              if (i)
                {
                  destmem =
                    adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
                  srcmem =
                    adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
                }
              emit_move_insn (destmem, srcmem);
            }
        }
      else
        {
          /* Emit all loads into temporaries first, then all stores.  */
          rtx tmpreg[4];
          gcc_assert (unroll <= 4);
          for (i = 0; i < unroll; i++)
            {
              tmpreg[i] = gen_reg_rtx (mode);
              if (i)
                {
                  srcmem =
                    adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
                }
              emit_move_insn (tmpreg[i], srcmem);
            }
          for (i = 0; i < unroll; i++)
            {
              if (i)
                {
                  destmem =
                    adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
                }
              emit_move_insn (destmem, tmpreg[i]);
            }
        }
    }
  else
    /* Set variant: store VALUE into UNROLL consecutive pieces.  */
    for (i = 0; i < unroll; i++)
      {
        if (i)
          destmem =
            adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
        emit_move_insn (destmem, value);
      }

  /* ITER += piece size; loop while ITER < SIZE (unsigned compare).  */
  tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
                             true, OPTAB_LIB_WIDEN);
  if (tmp != iter)
    emit_move_insn (iter, tmp);

  emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
                           true, top_label);
  if (expected_size != -1)
    {
      /* Convert the expected byte count into an expected iteration count
         and derive the back-edge probability from it.  */
      expected_size /= GET_MODE_SIZE (mode) * unroll;
      if (expected_size == 0)
        predict_jump (0);
      else if (expected_size > REG_BR_PROB_BASE)
        predict_jump (REG_BR_PROB_BASE - 1);
      else
        predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
    }
  else
    predict_jump (REG_BR_PROB_BASE * 80 / 100);
  /* Advance the pointers by the number of bytes processed.  */
  iter = ix86_zero_extend_to_Pmode (iter);
  tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
                             true, OPTAB_LIB_WIDEN);
  if (tmp != destptr)
    emit_move_insn (destptr, tmp);
  if (srcptr)
    {
      tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
                                 true, OPTAB_LIB_WIDEN);
      if (tmp != srcptr)
        emit_move_insn (srcptr, tmp);
    }
  emit_label (out_label);
}
13720
13721 /* Output "rep; mov" instruction.
13722    Arguments have same meaning as for previous function */
13723 static void
13724 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
13725                            rtx destptr, rtx srcptr,
13726                            rtx count,
13727                            enum machine_mode mode)
13728 {
13729   rtx destexp;
13730   rtx srcexp;
13731   rtx countreg;
13732
13733   /* If the size is known, it is shorter to use rep movs.  */
13734   if (mode == QImode && CONST_INT_P (count)
13735       && !(INTVAL (count) & 3))
13736     mode = SImode;
13737
13738   if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
13739     destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
13740   if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
13741     srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
13742   countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
13743   if (mode != QImode)
13744     {
13745       destexp = gen_rtx_ASHIFT (Pmode, countreg,
13746                                 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
13747       destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
13748       srcexp = gen_rtx_ASHIFT (Pmode, countreg,
13749                                GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
13750       srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
13751     }
13752   else
13753     {
13754       destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
13755       srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
13756     }
13757   emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
13758                           destexp, srcexp));
13759 }
13760
13761 /* Output "rep; stos" instruction.
13762    Arguments have same meaning as for previous function */
13763 static void
13764 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
13765                             rtx count,
13766                             enum machine_mode mode)
13767 {
13768   rtx destexp;
13769   rtx countreg;
13770
13771   if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
13772     destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
13773   value = force_reg (mode, gen_lowpart (mode, value));
13774   countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
13775   if (mode != QImode)
13776     {
13777       destexp = gen_rtx_ASHIFT (Pmode, countreg,
13778                                 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
13779       destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
13780     }
13781   else
13782     destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
13783   emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
13784 }
13785
13786 static void
13787 emit_strmov (rtx destmem, rtx srcmem,
13788              rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
13789 {
13790   rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
13791   rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
13792   emit_insn (gen_strmov (destptr, dest, srcptr, src));
13793 }
13794
/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST.

   DESTMEM/SRCMEM are the memory references and DESTPTR/SRCPTR the
   registers holding their addresses.  Three strategies are used:
   - a constant COUNT expands to straight-line string moves, one per set
     bit of COUNT below MAX_SIZE;
   - a variable COUNT with MAX_SIZE above 8 is masked and copied with a
     byte loop;
   - otherwise each of the bits 4, 2 and 1 of COUNT is tested at run time
     and the matching move is skipped when the bit is clear.  */
static void
expand_movmem_epilogue (rtx destmem, rtx srcmem,
                        rtx destptr, rtx srcptr, rtx count, int max_size)
{
  rtx src, dest;
  if (CONST_INT_P (count))
    {
      HOST_WIDE_INT countval = INTVAL (count);
      /* Byte offset of the next piece from the start of the tail.  */
      int offset = 0;

      if ((countval & 0x10) && max_size > 16)
        {
          /* A 16-byte tail piece is only expected on 64-bit targets.  */
          if (TARGET_64BIT)
            {
              emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
              emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
            }
          else
            gcc_unreachable ();
          offset += 16;
        }
      if ((countval & 0x08) && max_size > 8)
        {
          if (TARGET_64BIT)
            emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
          else
            {
              emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
              emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
            }
          offset += 8;
        }
      if ((countval & 0x04) && max_size > 4)
        {
          emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
          offset += 4;
        }
      if ((countval & 0x02) && max_size > 2)
        {
          emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
          offset += 2;
        }
      if ((countval & 0x01) && max_size > 1)
        {
          emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
          offset += 1;
        }
      return;
    }
  if (max_size > 8)
    {
      /* Reduce COUNT to the tail length and copy it byte by byte; the
         final argument (4) is the expected size passed to the loop
         expander.  */
      count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
                                    count, 1, OPTAB_DIRECT);
      expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
                                     count, QImode, 1, 4);
      return;
    }

  /* When there are stringops, we can cheaply increase dest and src pointers.
     Otherwise we save code size by maintaining offset (zero is readily
     available from preceding rep operation) and using x86 addressing modes.
   */
  if (TARGET_SINGLE_STRINGOP)
    {
      /* Each block: test one bit of COUNT, jump over the move when it is
         clear, otherwise emit a strmov of the matching width.  */
      if (max_size > 4)
        {
          rtx label = ix86_expand_aligntest (count, 4, true);
          src = change_address (srcmem, SImode, srcptr);
          dest = change_address (destmem, SImode, destptr);
          emit_insn (gen_strmov (destptr, dest, srcptr, src));
          emit_label (label);
          /* The only reference to LABEL is the jump emitted by
             ix86_expand_aligntest.  */
          LABEL_NUSES (label) = 1;
        }
      if (max_size > 2)
        {
          rtx label = ix86_expand_aligntest (count, 2, true);
          src = change_address (srcmem, HImode, srcptr);
          dest = change_address (destmem, HImode, destptr);
          emit_insn (gen_strmov (destptr, dest, srcptr, src));
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
      if (max_size > 1)
        {
          rtx label = ix86_expand_aligntest (count, 1, true);
          src = change_address (srcmem, QImode, srcptr);
          dest = change_address (destmem, QImode, destptr);
          emit_insn (gen_strmov (destptr, dest, srcptr, src));
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
    }
  else
    {
      /* OFFSET counts the tail bytes copied so far; SRCPTR and DESTPTR
         themselves are left unchanged in this variant.  */
      rtx offset = force_reg (Pmode, const0_rtx);
      rtx tmp;

      if (max_size > 4)
        {
          rtx label = ix86_expand_aligntest (count, 4, true);
          /* OFFSET is still zero here, so the plain pointers are used.  */
          src = change_address (srcmem, SImode, srcptr);
          dest = change_address (destmem, SImode, destptr);
          emit_move_insn (dest, src);
          tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
                                     true, OPTAB_LIB_WIDEN);
          if (tmp != offset)
            emit_move_insn (offset, tmp);
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
      if (max_size > 2)
        {
          rtx label = ix86_expand_aligntest (count, 2, true);
          tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
          src = change_address (srcmem, HImode, tmp);
          tmp = gen_rtx_PLUS (Pmode, destptr, offset);
          dest = change_address (destmem, HImode, tmp);
          emit_move_insn (dest, src);
          /* NOTE(review): TMP, passed as the target below, still holds the
             DESTPTR + OFFSET expression built above, whereas the 4-byte
             step passes NULL -- confirm this is intended.  */
          tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
                                     true, OPTAB_LIB_WIDEN);
          if (tmp != offset)
            emit_move_insn (offset, tmp);
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
      if (max_size > 1)
        {
          /* Last piece: OFFSET does not need updating afterwards.  */
          rtx label = ix86_expand_aligntest (count, 1, true);
          tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
          src = change_address (srcmem, QImode, tmp);
          tmp = gen_rtx_PLUS (Pmode, destptr, offset);
          dest = change_address (destmem, QImode, tmp);
          emit_move_insn (dest, src);
          emit_label (label);
          LABEL_NUSES (label) = 1;
        }
    }
}
13934
13935 /* Output code to set at most count & (max_size - 1) bytes starting by DEST.  */
13936 static void
13937 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
13938                                  rtx count, int max_size)
13939 {
13940   count =
13941     expand_simple_binop (counter_mode (count), AND, count,
13942                          GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
13943   expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
13944                                  gen_lowpart (QImode, value), count, QImode,
13945                                  1, max_size / 2);
13946 }
13947
13948 /* Output code to set at most count & (max_size - 1) bytes starting by DEST.  */
13949 static void
13950 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
13951 {
13952   rtx dest;
13953
13954   if (CONST_INT_P (count))
13955     {
13956       HOST_WIDE_INT countval = INTVAL (count);
13957       int offset = 0;
13958
13959       if ((countval & 0x10) && max_size > 16)
13960         {
13961           if (TARGET_64BIT)
13962             {
13963               dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
13964               emit_insn (gen_strset (destptr, dest, value));
13965               dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
13966               emit_insn (gen_strset (destptr, dest, value));
13967             }
13968           else
13969             gcc_unreachable ();
13970           offset += 16;
13971         }
13972       if ((countval & 0x08) && max_size > 8)
13973         {
13974           if (TARGET_64BIT)
13975             {
13976               dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
13977               emit_insn (gen_strset (destptr, dest, value));
13978             }
13979           else
13980             {
13981               dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
13982               emit_insn (gen_strset (destptr, dest, value));
13983               dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
13984               emit_insn (gen_strset (destptr, dest, value));
13985             }
13986           offset += 8;
13987         }
13988       if ((countval & 0x04) && max_size > 4)
13989         {
13990           dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
13991           emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
13992           offset += 4;
13993         }
13994       if ((countval & 0x02) && max_size > 2)
13995         {
13996           dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
13997           emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
13998           offset += 2;
13999         }
14000       if ((countval & 0x01) && max_size > 1)
14001         {
14002           dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
14003           emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
14004           offset += 1;
14005         }
14006       return;
14007     }
14008   if (max_size > 32)
14009     {
14010       expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
14011       return;
14012     }
14013   if (max_size > 16)
14014     {
14015       rtx label = ix86_expand_aligntest (count, 16, true);
14016       if (TARGET_64BIT)
14017         {
14018           dest = change_address (destmem, DImode, destptr);
14019           emit_insn (gen_strset (destptr, dest, value));
14020           emit_insn (gen_strset (destptr, dest, value));
14021         }
14022       else
14023         {
14024           dest = change_address (destmem, SImode, destptr);
14025           emit_insn (gen_strset (destptr, dest, value));
14026           emit_insn (gen_strset (destptr, dest, value));
14027           emit_insn (gen_strset (destptr, dest, value));
14028           emit_insn (gen_strset (destptr, dest, value));
14029         }
14030       emit_label (label);
14031       LABEL_NUSES (label) = 1;
14032     }
14033   if (max_size > 8)
14034     {
14035       rtx label = ix86_expand_aligntest (count, 8, true);
14036       if (TARGET_64BIT)
14037         {
14038           dest = change_address (destmem, DImode, destptr);
14039           emit_insn (gen_strset (destptr, dest, value));
14040         }
14041       else
14042         {
14043           dest = change_address (destmem, SImode, destptr);
14044           emit_insn (gen_strset (destptr, dest, value));
14045           emit_insn (gen_strset (destptr, dest, value));
14046         }
14047       emit_label (label);
14048       LABEL_NUSES (label) = 1;
14049     }
14050   if (max_size > 4)
14051     {
14052       rtx label = ix86_expand_aligntest (count, 4, true);
14053       dest = change_address (destmem, SImode, destptr);
14054       emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
14055       emit_label (label);
14056       LABEL_NUSES (label) = 1;
14057     }
14058   if (max_size > 2)
14059     {
14060       rtx label = ix86_expand_aligntest (count, 2, true);
14061       dest = change_address (destmem, HImode, destptr);
14062       emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
14063       emit_label (label);
14064       LABEL_NUSES (label) = 1;
14065     }
14066   if (max_size > 1)
14067     {
14068       rtx label = ix86_expand_aligntest (count, 1, true);
14069       dest = change_address (destmem, QImode, destptr);
14070       emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
14071       emit_label (label);
14072       LABEL_NUSES (label) = 1;
14073     }
14074 }
14075
14076 /* Copy enough from DEST to SRC to align DEST known to by aligned by ALIGN to
14077    DESIRED_ALIGNMENT.  */
14078 static void
14079 expand_movmem_prologue (rtx destmem, rtx srcmem,
14080                         rtx destptr, rtx srcptr, rtx count,
14081                         int align, int desired_alignment)
14082 {
14083   if (align <= 1 && desired_alignment > 1)
14084     {
14085       rtx label = ix86_expand_aligntest (destptr, 1, false);
14086       srcmem = change_address (srcmem, QImode, srcptr);
14087       destmem = change_address (destmem, QImode, destptr);
14088       emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
14089       ix86_adjust_counter (count, 1);
14090       emit_label (label);
14091       LABEL_NUSES (label) = 1;
14092     }
14093   if (align <= 2 && desired_alignment > 2)
14094     {
14095       rtx label = ix86_expand_aligntest (destptr, 2, false);
14096       srcmem = change_address (srcmem, HImode, srcptr);
14097       destmem = change_address (destmem, HImode, destptr);
14098       emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
14099       ix86_adjust_counter (count, 2);
14100       emit_label (label);
14101       LABEL_NUSES (label) = 1;
14102     }
14103   if (align <= 4 && desired_alignment > 4)
14104     {
14105       rtx label = ix86_expand_aligntest (destptr, 4, false);
14106       srcmem = change_address (srcmem, SImode, srcptr);
14107       destmem = change_address (destmem, SImode, destptr);
14108       emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
14109       ix86_adjust_counter (count, 4);
14110       emit_label (label);
14111       LABEL_NUSES (label) = 1;
14112     }
14113   gcc_assert (desired_alignment <= 8);
14114 }
14115
14116 /* Set enough from DEST to align DEST known to by aligned by ALIGN to
14117    DESIRED_ALIGNMENT.  */
14118 static void
14119 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
14120                         int align, int desired_alignment)
14121 {
14122   if (align <= 1 && desired_alignment > 1)
14123     {
14124       rtx label = ix86_expand_aligntest (destptr, 1, false);
14125       destmem = change_address (destmem, QImode, destptr);
14126       emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
14127       ix86_adjust_counter (count, 1);
14128       emit_label (label);
14129       LABEL_NUSES (label) = 1;
14130     }
14131   if (align <= 2 && desired_alignment > 2)
14132     {
14133       rtx label = ix86_expand_aligntest (destptr, 2, false);
14134       destmem = change_address (destmem, HImode, destptr);
14135       emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
14136       ix86_adjust_counter (count, 2);
14137       emit_label (label);
14138       LABEL_NUSES (label) = 1;
14139     }
14140   if (align <= 4 && desired_alignment > 4)
14141     {
14142       rtx label = ix86_expand_aligntest (destptr, 4, false);
14143       destmem = change_address (destmem, SImode, destptr);
14144       emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
14145       ix86_adjust_counter (count, 4);
14146       emit_label (label);
14147       LABEL_NUSES (label) = 1;
14148     }
14149   gcc_assert (desired_alignment <= 8);
14150 }
14151
14152 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.  */
14153 static enum stringop_alg
14154 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
14155             int *dynamic_check)
14156 {
14157   const struct stringop_algs * algs;
14158
14159   *dynamic_check = -1;
14160   if (memset)
14161     algs = &ix86_cost->memset[TARGET_64BIT != 0];
14162   else
14163     algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
14164   if (stringop_alg != no_stringop)
14165     return stringop_alg;
14166   /* rep; movq or rep; movl is the smallest variant.  */
14167   else if (optimize_size)
14168     {
14169       if (!count || (count & 3))
14170         return rep_prefix_1_byte;
14171       else
14172         return rep_prefix_4_byte;
14173     }
14174   /* Very tiny blocks are best handled via the loop, REP is expensive to setup.
14175    */
14176   else if (expected_size != -1 && expected_size < 4)
14177     return loop_1_byte;
14178   else if (expected_size != -1)
14179     {
14180       unsigned int i;
14181       enum stringop_alg alg = libcall;
14182       for (i = 0; i < NAX_STRINGOP_ALGS; i++)
14183         {
14184           gcc_assert (algs->size[i].max);
14185           if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
14186             {
14187               if (algs->size[i].alg != libcall)
14188                 alg = algs->size[i].alg;
14189               /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
14190                  last non-libcall inline algorithm.  */
14191               if (TARGET_INLINE_ALL_STRINGOPS)
14192                 {
14193                   /* When the current size is best to be copied by a libcall,
14194                      but we are still forced to inline, run the heuristic bellow
14195                      that will pick code for medium sized blocks.  */
14196                   if (alg != libcall)
14197                     return alg;
14198                   break;
14199                 }
14200               else
14201                 return algs->size[i].alg;
14202             }
14203         }
14204       gcc_assert (TARGET_INLINE_ALL_STRINGOPS);
14205     }
14206   /* When asked to inline the call anyway, try to pick meaningful choice.
14207      We look for maximal size of block that is faster to copy by hand and
14208      take blocks of at most of that size guessing that average size will
14209      be roughly half of the block.
14210
14211      If this turns out to be bad, we might simply specify the preferred
14212      choice in ix86_costs.  */
14213   if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
14214       && algs->unknown_size == libcall)
14215     {
14216       int max = -1;
14217       enum stringop_alg alg;
14218       int i;
14219
14220       for (i = 0; i < NAX_STRINGOP_ALGS; i++)
14221         if (algs->size[i].alg != libcall && algs->size[i].alg)
14222           max = algs->size[i].max;
14223       if (max == -1)
14224         max = 4096;
14225       alg = decide_alg (count, max / 2, memset, dynamic_check);
14226       gcc_assert (*dynamic_check == -1);
14227       gcc_assert (alg != libcall);
14228       if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
14229         *dynamic_check = max;
14230       return alg;
14231     }
14232   return algs->unknown_size;
14233 }
14234
14235 /* Decide on alignment.  We know that the operand is already aligned to ALIGN
14236    (ALIGN can be based on profile feedback and thus it is not 100% guaranteed).  */
14237 static int
14238 decide_alignment (int align,
14239                   enum stringop_alg alg,
14240                   int expected_size)
14241 {
14242   int desired_align = 0;
14243   switch (alg)
14244     {
14245       case no_stringop:
14246         gcc_unreachable ();
14247       case loop:
14248       case unrolled_loop:
14249         desired_align = GET_MODE_SIZE (Pmode);
14250         break;
14251       case rep_prefix_8_byte:
14252         desired_align = 8;
14253         break;
14254       case rep_prefix_4_byte:
14255         /* PentiumPro has special logic triggering for 8 byte aligned blocks.
14256            copying whole cacheline at once.  */
14257         if (TARGET_PENTIUMPRO)
14258           desired_align = 8;
14259         else
14260           desired_align = 4;
14261         break;
14262       case rep_prefix_1_byte:
14263         /* PentiumPro has special logic triggering for 8 byte aligned blocks.
14264            copying whole cacheline at once.  */
14265         if (TARGET_PENTIUMPRO)
14266           desired_align = 8;
14267         else
14268           desired_align = 1;
14269         break;
14270       case loop_1_byte:
14271         desired_align = 1;
14272         break;
14273       case libcall:
14274         return 0;
14275     }
14276
14277   if (optimize_size)
14278     desired_align = 1;
14279   if (desired_align < align)
14280     desired_align = align;
14281   if (expected_size != -1 && expected_size < 4)
14282     desired_align = align;
14283   return desired_align;
14284 }
14285
/* Return the smallest power of 2 that is strictly greater than VAL.
   For VAL below 1 the result is 1.  */
static int
smallest_pow2_greater_than (int val)
{
  int pow2;

  for (pow2 = 1; pow2 <= val; pow2 *= 2)
    continue;
  return pow2;
}
14295
14296 /* Expand string move (memcpy) operation.  Use i386 string operations when
14297    profitable.  expand_clrmem contains similar code. The code depends upon
14298    architecture, block size and alignment, but always has the same
14299    overall structure:
14300
14301    1) Prologue guard: Conditional that jumps up to epilogues for small
14302       blocks that can be handled by epilogue alone.  This is faster but
14303       also needed for correctness, since prologue assume the block is larger
14304       than the desired alignment.
14305
14306       Optional dynamic check for size and libcall for large
14307       blocks is emitted here too, with -minline-stringops-dynamically.
14308
14309    2) Prologue: copy first few bytes in order to get destination aligned
14310       to DESIRED_ALIGN.  It is emitted only when ALIGN is less than
14311       DESIRED_ALIGN and and up to DESIRED_ALIGN - ALIGN bytes can be copied.
14312       We emit either a jump tree on power of two sized blocks, or a byte loop.
14313
14314    3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
14315       with specified algorithm.
14316
14317    4) Epilogue: code copying tail of the block that is too small to be
14318       handled by main body (or up to size guarded by prologue guard).  */
14319
14320 int
14321 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
14322                     rtx expected_align_exp, rtx expected_size_exp)
14323 {
14324   rtx destreg;
14325   rtx srcreg;
14326   rtx label = NULL;
14327   rtx tmp;
14328   rtx jump_around_label = NULL;
14329   HOST_WIDE_INT align = 1;
14330   unsigned HOST_WIDE_INT count = 0;
14331   HOST_WIDE_INT expected_size = -1;
14332   int size_needed = 0, epilogue_size_needed;
14333   int desired_align = 0;
14334   enum stringop_alg alg;
14335   int dynamic_check;
14336
14337   if (CONST_INT_P (align_exp))
14338     align = INTVAL (align_exp);
14339   /* i386 can do misaligned access on reasonably increased cost.  */
14340   if (CONST_INT_P (expected_align_exp)
14341       && INTVAL (expected_align_exp) > align)
14342     align = INTVAL (expected_align_exp);
14343   if (CONST_INT_P (count_exp))
14344     count = expected_size = INTVAL (count_exp);
14345   if (CONST_INT_P (expected_size_exp) && count == 0)
14346     expected_size = INTVAL (expected_size_exp);
14347
14348   /* Step 0: Decide on preferred algorithm, desired alignment and
14349      size of chunks to be copied by main loop.  */
14350
14351   alg = decide_alg (count, expected_size, false, &dynamic_check);
14352   desired_align = decide_alignment (align, alg, expected_size);
14353
14354   if (!TARGET_ALIGN_STRINGOPS)
14355     align = desired_align;
14356
14357   if (alg == libcall)
14358     return 0;
14359   gcc_assert (alg != no_stringop);
14360   if (!count)
14361     count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
14362   destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
14363   srcreg = copy_to_mode_reg (Pmode, XEXP (src, 0));
14364   switch (alg)
14365     {
14366     case libcall:
14367     case no_stringop:
14368       gcc_unreachable ();
14369     case loop:
14370       size_needed = GET_MODE_SIZE (Pmode);
14371       break;
14372     case unrolled_loop:
14373       size_needed = GET_MODE_SIZE (Pmode) * (TARGET_64BIT ? 4 : 2);
14374       break;
14375     case rep_prefix_8_byte:
14376       size_needed = 8;
14377       break;
14378     case rep_prefix_4_byte:
14379       size_needed = 4;
14380       break;
14381     case rep_prefix_1_byte:
14382     case loop_1_byte:
14383       size_needed = 1;
14384       break;
14385     }
14386
14387   epilogue_size_needed = size_needed;
14388
14389   /* Step 1: Prologue guard.  */
14390
14391   /* Alignment code needs count to be in register.  */
14392   if (CONST_INT_P (count_exp) && desired_align > align)
14393     {
14394       enum machine_mode mode = SImode;
14395       if (TARGET_64BIT && (count & ~0xffffffff))
14396         mode = DImode;
14397       count_exp = force_reg (mode, count_exp);
14398     }
14399   gcc_assert (desired_align >= 1 && align >= 1);
14400
14401   /* Ensure that alignment prologue won't copy past end of block.  */
14402   if (size_needed > 1 || (desired_align > 1 && desired_align > align))
14403     {
14404       epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
14405       /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
14406          Make sure it is power of 2.  */
14407       epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
14408
14409       label = gen_label_rtx ();
14410       emit_cmp_and_jump_insns (count_exp,
14411                                GEN_INT (epilogue_size_needed),
14412                                LTU, 0, counter_mode (count_exp), 1, label);
14413       if (GET_CODE (count_exp) == CONST_INT)
14414         ;
14415       else if (expected_size == -1 || expected_size < epilogue_size_needed)
14416         predict_jump (REG_BR_PROB_BASE * 60 / 100);
14417       else
14418         predict_jump (REG_BR_PROB_BASE * 20 / 100);
14419     }
14420   /* Emit code to decide on runtime whether library call or inline should be
14421      used.  */
14422   if (dynamic_check != -1)
14423     {
14424       rtx hot_label = gen_label_rtx ();
14425       jump_around_label = gen_label_rtx ();
14426       emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
14427                                LEU, 0, GET_MODE (count_exp), 1, hot_label);
14428       predict_jump (REG_BR_PROB_BASE * 90 / 100);
14429       emit_block_move_via_libcall (dst, src, count_exp, false);
14430       emit_jump (jump_around_label);
14431       emit_label (hot_label);
14432     }
14433
14434   /* Step 2: Alignment prologue.  */
14435
14436   if (desired_align > align)
14437     {
14438       /* Except for the first move in epilogue, we no longer know
14439          constant offset in aliasing info.  It don't seems to worth
14440          the pain to maintain it for the first move, so throw away
14441          the info early.  */
14442       src = change_address (src, BLKmode, srcreg);
14443       dst = change_address (dst, BLKmode, destreg);
14444       expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
14445                               desired_align);
14446     }
14447   if (label && size_needed == 1)
14448     {
14449       emit_label (label);
14450       LABEL_NUSES (label) = 1;
14451       label = NULL;
14452     }
14453
14454   /* Step 3: Main loop.  */
14455
14456   switch (alg)
14457     {
14458     case libcall:
14459     case no_stringop:
14460       gcc_unreachable ();
14461     case loop_1_byte:
14462       expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
14463                                      count_exp, QImode, 1, expected_size);
14464       break;
14465     case loop:
14466       expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
14467                                      count_exp, Pmode, 1, expected_size);
14468       break;
14469     case unrolled_loop:
14470       /* Unroll only by factor of 2 in 32bit mode, since we don't have enough
14471          registers for 4 temporaries anyway.  */
14472       expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
14473                                      count_exp, Pmode, TARGET_64BIT ? 4 : 2,
14474                                      expected_size);
14475       break;
14476     case rep_prefix_8_byte:
14477       expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
14478                                  DImode);
14479       break;
14480     case rep_prefix_4_byte:
14481       expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
14482                                  SImode);
14483       break;
14484     case rep_prefix_1_byte:
14485       expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
14486                                  QImode);
14487       break;
14488     }
14489   /* Adjust properly the offset of src and dest memory for aliasing.  */
14490   if (CONST_INT_P (count_exp))
14491     {
14492       src = adjust_automodify_address_nv (src, BLKmode, srcreg,
14493                                           (count / size_needed) * size_needed);
14494       dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
14495                                           (count / size_needed) * size_needed);
14496     }
14497   else
14498     {
14499       src = change_address (src, BLKmode, srcreg);
14500       dst = change_address (dst, BLKmode, destreg);
14501     }
14502
14503   /* Step 4: Epilogue to copy the remaining bytes.  */
14504
14505   if (label)
14506     {
14507       /* When the main loop is done, COUNT_EXP might hold original count,
14508          while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
14509          Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
14510          bytes. Compensate if needed.  */
14511
14512       if (size_needed < epilogue_size_needed)
14513         {
14514           tmp =
14515             expand_simple_binop (counter_mode (count_exp), AND, count_exp,
14516                                  GEN_INT (size_needed - 1), count_exp, 1,
14517                                  OPTAB_DIRECT);
14518           if (tmp != count_exp)
14519             emit_move_insn (count_exp, tmp);
14520         }
14521       emit_label (label);
14522       LABEL_NUSES (label) = 1;
14523     }
14524
14525   if (count_exp != const0_rtx && epilogue_size_needed > 1)
14526     expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
14527                             epilogue_size_needed);
14528   if (jump_around_label)
14529     emit_label (jump_around_label);
14530   return 1;
14531 }
14532
14533 /* Helper function for memcpy.  For QImode value 0xXY produce
14534    0xXYXYXYXY of wide specified by MODE.  This is essentially
14535    a * 0x10101010, but we can do slightly better than
14536    synth_mult by unwinding the sequence by hand on CPUs with
14537    slow multiply.  */
14538 static rtx
14539 promote_duplicated_reg (enum machine_mode mode, rtx val)
14540 {
14541   enum machine_mode valmode = GET_MODE (val);
14542   rtx tmp;
14543   int nops = mode == DImode ? 3 : 2;
14544
14545   gcc_assert (mode == SImode || mode == DImode);
14546   if (val == const0_rtx)
14547     return copy_to_mode_reg (mode, const0_rtx);
14548   if (CONST_INT_P (val))
14549     {
14550       HOST_WIDE_INT v = INTVAL (val) & 255;
14551
14552       v |= v << 8;
14553       v |= v << 16;
14554       if (mode == DImode)
14555         v |= (v << 16) << 16;
14556       return copy_to_mode_reg (mode, gen_int_mode (v, mode));
14557     }
14558
14559   if (valmode == VOIDmode)
14560     valmode = QImode;
14561   if (valmode != QImode)
14562     val = gen_lowpart (QImode, val);
14563   if (mode == QImode)
14564     return val;
14565   if (!TARGET_PARTIAL_REG_STALL)
14566     nops--;
14567   if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
14568       + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
14569       <= (ix86_cost->shift_const + ix86_cost->add) * nops
14570           + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
14571     {
14572       rtx reg = convert_modes (mode, QImode, val, true);
14573       tmp = promote_duplicated_reg (mode, const1_rtx);
14574       return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
14575                                   OPTAB_DIRECT);
14576     }
14577   else
14578     {
14579       rtx reg = convert_modes (mode, QImode, val, true);
14580
14581       if (!TARGET_PARTIAL_REG_STALL)
14582         if (mode == SImode)
14583           emit_insn (gen_movsi_insv_1 (reg, reg));
14584         else
14585           emit_insn (gen_movdi_insv_1_rex64 (reg, reg));
14586       else
14587         {
14588           tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
14589                                      NULL, 1, OPTAB_DIRECT);
14590           reg =
14591             expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
14592         }
14593       tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
14594                                  NULL, 1, OPTAB_DIRECT);
14595       reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
14596       if (mode == SImode)
14597         return reg;
14598       tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
14599                                  NULL, 1, OPTAB_DIRECT);
14600       reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
14601       return reg;
14602     }
14603 }
14604
14605 /* Duplicate value VAL using promote_duplicated_reg into maximal size that will
14606    be needed by main loop copying SIZE_NEEDED chunks and prologue getting
14607    alignment from ALIGN to DESIRED_ALIGN.  */
14608 static rtx
14609 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
14610 {
14611   rtx promoted_val;
14612
14613   if (TARGET_64BIT
14614       && (size_needed > 4 || (desired_align > align && desired_align > 4)))
14615     promoted_val = promote_duplicated_reg (DImode, val);
14616   else if (size_needed > 2 || (desired_align > align && desired_align > 2))
14617     promoted_val = promote_duplicated_reg (SImode, val);
14618   else if (size_needed > 1 || (desired_align > align && desired_align > 1))
14619     promoted_val = promote_duplicated_reg (HImode, val);
14620   else
14621     promoted_val = val;
14622
14623   return promoted_val;
14624 }
14625
14626 /* Expand string clear operation (bzero).  Use i386 string operations when
14627    profitable.  See expand_movmem comment for explanation of individual
14628    steps performed.  */
14629 int
14630 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
14631                     rtx expected_align_exp, rtx expected_size_exp)
14632 {
14633   rtx destreg;
14634   rtx label = NULL;
14635   rtx tmp;
14636   rtx jump_around_label = NULL;
14637   HOST_WIDE_INT align = 1;
14638   unsigned HOST_WIDE_INT count = 0;
14639   HOST_WIDE_INT expected_size = -1;
14640   int size_needed = 0, epilogue_size_needed;
14641   int desired_align = 0;
14642   enum stringop_alg alg;
14643   rtx promoted_val = NULL;
14644   bool force_loopy_epilogue = false;
14645   int dynamic_check;
14646
14647   if (CONST_INT_P (align_exp))
14648     align = INTVAL (align_exp);
14649   /* i386 can do misaligned access on reasonably increased cost.  */
14650   if (CONST_INT_P (expected_align_exp)
14651       && INTVAL (expected_align_exp) > align)
14652     align = INTVAL (expected_align_exp);
14653   if (CONST_INT_P (count_exp))
14654     count = expected_size = INTVAL (count_exp);
14655   if (CONST_INT_P (expected_size_exp) && count == 0)
14656     expected_size = INTVAL (expected_size_exp);
14657
14658   /* Step 0: Decide on preferred algorithm, desired alignment and
14659      size of chunks to be copied by main loop.  */
14660
14661   alg = decide_alg (count, expected_size, true, &dynamic_check);
14662   desired_align = decide_alignment (align, alg, expected_size);
14663
14664   if (!TARGET_ALIGN_STRINGOPS)
14665     align = desired_align;
14666
14667   if (alg == libcall)
14668     return 0;
14669   gcc_assert (alg != no_stringop);
14670   if (!count)
14671     count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
14672   destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
14673   switch (alg)
14674     {
14675     case libcall:
14676     case no_stringop:
14677       gcc_unreachable ();
14678     case loop:
14679       size_needed = GET_MODE_SIZE (Pmode);
14680       break;
14681     case unrolled_loop:
14682       size_needed = GET_MODE_SIZE (Pmode) * 4;
14683       break;
14684     case rep_prefix_8_byte:
14685       size_needed = 8;
14686       break;
14687     case rep_prefix_4_byte:
14688       size_needed = 4;
14689       break;
14690     case rep_prefix_1_byte:
14691     case loop_1_byte:
14692       size_needed = 1;
14693       break;
14694     }
14695   epilogue_size_needed = size_needed;
14696
14697   /* Step 1: Prologue guard.  */
14698
14699   /* Alignment code needs count to be in register.  */
14700   if (CONST_INT_P (count_exp) && desired_align > align)
14701     {
14702       enum machine_mode mode = SImode;
14703       if (TARGET_64BIT && (count & ~0xffffffff))
14704         mode = DImode;
14705       count_exp = force_reg (mode, count_exp);
14706     }
14707   /* Do the cheap promotion to allow better CSE across the
14708      main loop and epilogue (ie one load of the big constant in the
14709      front of all code.  */
14710   if (CONST_INT_P (val_exp))
14711     promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
14712                                                    desired_align, align);
14713   /* Ensure that alignment prologue won't copy past end of block.  */
14714   if (size_needed > 1 || (desired_align > 1 && desired_align > align))
14715     {
14716       epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
14717       /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
14718          Make sure it is power of 2.  */
14719       epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
14720
14721       /* To improve performance of small blocks, we jump around the VAL
14722          promoting mode.  This mean that if the promoted VAL is not constant,
14723          we might not use it in the epilogue and have to use byte
14724          loop variant.  */
14725       if (epilogue_size_needed > 2 && !promoted_val)
14726         force_loopy_epilogue = true;
14727       label = gen_label_rtx ();
14728       emit_cmp_and_jump_insns (count_exp,
14729                                GEN_INT (epilogue_size_needed),
14730                                LTU, 0, counter_mode (count_exp), 1, label);
14731       if (GET_CODE (count_exp) == CONST_INT)
14732         ;
14733       else if (expected_size == -1 || expected_size <= epilogue_size_needed)
14734         predict_jump (REG_BR_PROB_BASE * 60 / 100);
14735       else
14736         predict_jump (REG_BR_PROB_BASE * 20 / 100);
14737     }
14738   if (dynamic_check != -1)
14739     {
14740       rtx hot_label = gen_label_rtx ();
14741       jump_around_label = gen_label_rtx ();
14742       emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
14743                                LEU, 0, counter_mode (count_exp), 1, hot_label);
14744       predict_jump (REG_BR_PROB_BASE * 90 / 100);
14745       set_storage_via_libcall (dst, count_exp, val_exp, false);
14746       emit_jump (jump_around_label);
14747       emit_label (hot_label);
14748     }
14749
14750   /* Step 2: Alignment prologue.  */
14751
14752   /* Do the expensive promotion once we branched off the small blocks.  */
14753   if (!promoted_val)
14754     promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
14755                                                    desired_align, align);
14756   gcc_assert (desired_align >= 1 && align >= 1);
14757
14758   if (desired_align > align)
14759     {
14760       /* Except for the first move in epilogue, we no longer know
14761          constant offset in aliasing info.  It don't seems to worth
14762          the pain to maintain it for the first move, so throw away
14763          the info early.  */
14764       dst = change_address (dst, BLKmode, destreg);
14765       expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
14766                               desired_align);
14767     }
14768   if (label && size_needed == 1)
14769     {
14770       emit_label (label);
14771       LABEL_NUSES (label) = 1;
14772       label = NULL;
14773     }
14774
14775   /* Step 3: Main loop.  */
14776
14777   switch (alg)
14778     {
14779     case libcall:
14780     case no_stringop:
14781       gcc_unreachable ();
14782     case loop_1_byte:
14783       expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
14784                                      count_exp, QImode, 1, expected_size);
14785       break;
14786     case loop:
14787       expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
14788                                      count_exp, Pmode, 1, expected_size);
14789       break;
14790     case unrolled_loop:
14791       expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
14792                                      count_exp, Pmode, 4, expected_size);
14793       break;
14794     case rep_prefix_8_byte:
14795       expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
14796                                   DImode);
14797       break;
14798     case rep_prefix_4_byte:
14799       expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
14800                                   SImode);
14801       break;
14802     case rep_prefix_1_byte:
14803       expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
14804                                   QImode);
14805       break;
14806     }
14807   /* Properly adjust the offset of the dest memory for aliasing.  */
14808   if (CONST_INT_P (count_exp))
14809     dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
14810                                         (count / size_needed) * size_needed);
14811   else
14812     dst = change_address (dst, BLKmode, destreg);
14813
14814   /* Step 4: Epilogue to copy the remaining bytes.  */
14815
14816   if (label)
14817     {
14818       /* When the main loop is done, COUNT_EXP might hold original count,
14819          while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
14820          Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
14821          bytes. Compensate if needed.  */
14822
14823       if (size_needed < desired_align - align)
14824         {
14825           tmp =
14826             expand_simple_binop (counter_mode (count_exp), AND, count_exp,
14827                                  GEN_INT (size_needed - 1), count_exp, 1,
14828                                  OPTAB_DIRECT);
14829           size_needed = desired_align - align + 1;
14830           if (tmp != count_exp)
14831             emit_move_insn (count_exp, tmp);
14832         }
14833       emit_label (label);
14834       LABEL_NUSES (label) = 1;
14835     }
14836   if (count_exp != const0_rtx && epilogue_size_needed > 1)
14837     {
14838       if (force_loopy_epilogue)
14839         expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
14840                                          size_needed);
14841       else
14842         expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
14843                                 size_needed);
14844     }
14845   if (jump_around_label)
14846     emit_label (jump_around_label);
14847   return 1;
14848 }
14849
14850 /* Expand the appropriate insns for doing strlen if not just doing
14851    repnz; scasb
14852
14853    OUT = result register.  On entry it must already hold the start
14854       address; on exit it holds the address of the terminating zero byte.
14855    SRC = the string MEM; it is only used as the template from which the
14856       byte and word MEMs addressed by OUT are made with change_address.
14857    ALIGN_RTX = alignment of the address.  Anything that is not a
14858       CONST_INT is treated as alignment 0, i.e. unknown.
14859
         This is just the body.  The caller has to copy the start address into
         OUT beforehand and subtract it again afterwards to obtain the length
         (see ix86_expand_strlen).

         The emitted code has three parts: an optional byte-by-byte prologue
         that runs until OUT is 4-byte aligned, a loop that tests four bytes
         per iteration, and a branch-free fixup that turns the position of
         the zero byte inside the last word into the final address.  */
14860
14861 static void
14862 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
14863 {
14864   int align;
14865   rtx tmp;
14866   rtx align_2_label = NULL_RTX;
14867   rtx align_3_label = NULL_RTX;
14868   rtx align_4_label = gen_label_rtx ();
14869   rtx end_0_label = gen_label_rtx ();
14870   rtx mem;
14871   rtx tmpreg = gen_reg_rtx (SImode);
14872   rtx scratch = gen_reg_rtx (SImode);
14873   rtx cmp;
14874
14875   align = 0;
14876   if (CONST_INT_P (align_rtx))
14877     align = INTVAL (align_rtx);
14878
14879   /* Loop to check 1..3 bytes for null to get an aligned pointer.  */
14880
14881   /* Is the alignment unknown (0) or known to be less than 4?  */
14882   if (align < 4)
14883     {
14884       rtx scratch1 = gen_reg_rtx (Pmode);
14885       emit_move_insn (scratch1, out);
14886       /* Anything but a known alignment of 2 needs the full dispatch.  */
14887       if (align != 2)
14888         {
14889           align_3_label = gen_label_rtx (); /* Entry when address % 4 == 3 */
14890           align_2_label = gen_label_rtx (); /* Entry when address % 4 == 2 */
14891
14892           /* Leave just the 2 lower bits, i.e. the address modulo 4.  */
14893           align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
14894                                     NULL_RTX, 0, OPTAB_WIDEN);
14895
                /* 0 -> aligned loop, 2 -> two byte checks, 3 -> one byte check;
                   a remainder of 1 falls through to all three byte checks.  */
14896           emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
14897                                    Pmode, 1, align_4_label);
14898           emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
14899                                    Pmode, 1, align_2_label);
14900           emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
14901                                    Pmode, 1, align_3_label);
14902         }
14903       else
14904         {
14905           /* Since the alignment is 2, we have to check 2 or 0 bytes;
14906              check if is aligned to 4 - byte.  */
14907
14908           align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
14909                                     NULL_RTX, 0, OPTAB_WIDEN);
14910
14911           emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
14912                                    Pmode, 1, align_4_label);
14913         }
14914
            /* One byte MEM addressed through OUT; the same rtx is reused for
               every byte compare below while OUT itself is incremented.  */
14915       mem = change_address (src, QImode, out);
14916
14917       /* Now compare the bytes.  */
14918
14919       /* Compare the first n unaligned byte on a byte per byte basis.  */
14920       emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
14921                                QImode, 1, end_0_label);
14922
14923       /* Increment the address.  */
14924       if (TARGET_64BIT)
14925         emit_insn (gen_adddi3 (out, out, const1_rtx));
14926       else
14927         emit_insn (gen_addsi3 (out, out, const1_rtx));
14928
14929       /* Not needed with an alignment of 2 */
14930       if (align != 2)
14931         {
14932           emit_label (align_2_label);
14933
14934           emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
14935                                    end_0_label);
14936
14937           if (TARGET_64BIT)
14938             emit_insn (gen_adddi3 (out, out, const1_rtx));
14939           else
14940             emit_insn (gen_addsi3 (out, out, const1_rtx));
14941
14942           emit_label (align_3_label);
14943         }
14944
14945       emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
14946                                end_0_label);
14947
14948       if (TARGET_64BIT)
14949         emit_insn (gen_adddi3 (out, out, const1_rtx));
14950       else
14951         emit_insn (gen_addsi3 (out, out, const1_rtx));
14952     }
14953
14954   /* Generate loop to check 4 bytes at a time.  It is not a good idea to
14955      align this loop.  It gives only huge programs, but does not help to
14956      speed up.  */
14957   emit_label (align_4_label);
14958
        /* Load the next word and advance OUT past it before testing, so after
           the loop OUT points 4 bytes beyond the start of the word that
           contains the zero byte.  */
14959   mem = change_address (src, SImode, out);
14960   emit_move_insn (scratch, mem);
14961   if (TARGET_64BIT)
14962     emit_insn (gen_adddi3 (out, out, GEN_INT (4)));
14963   else
14964     emit_insn (gen_addsi3 (out, out, GEN_INT (4)));
14965
14966   /* This formula yields a nonzero result iff one of the bytes is zero.
14967      This saves three branches inside loop and many cycles.
           It computes (word - 0x01010101) & ~word & 0x80808080.  */
14968
14969   emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
14970   emit_insn (gen_one_cmplsi2 (scratch, scratch));
14971   emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
14972   emit_insn (gen_andsi3 (tmpreg, tmpreg,
14973                          gen_int_mode (0x80808080, SImode)));
14974   emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
14975                            align_4_label);
14976
        /* TMPREG now has bit 7 set in (at least) the byte lane of the first
           zero byte.  Narrow it down to a pair of bytes, either with
           conditional moves or with a branch.  */
14977   if (TARGET_CMOVE)
14978     {
14979        rtx reg = gen_reg_rtx (SImode);
14980        rtx reg2 = gen_reg_rtx (Pmode);
14981        emit_move_insn (reg, tmpreg);
14982        emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
14983
14984        /* If zero is not in the first two bytes, move two bytes forward.  */
14985        emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
14986        tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
14987        tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
14988        emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
14989                                gen_rtx_IF_THEN_ELSE (SImode, tmp,
14990                                                      reg,
14991                                                      tmpreg)));
14992        /* Emit lea manually to avoid clobbering of flags.  The SET itself
                carries VOIDmode like every other SET built here.  */
14993        emit_insn (gen_rtx_SET (VOIDmode, reg2,
14994                                gen_rtx_PLUS (Pmode, out, const2_rtx)));
14995
             /* Second conditional move on the same flags: OUT += 2.  */
14996        tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
14997        tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
14998        emit_insn (gen_rtx_SET (VOIDmode, out,
14999                                gen_rtx_IF_THEN_ELSE (Pmode, tmp,
15000                                                      reg2,
15001                                                      out)));
15002
15003     }
15004   else
15005     {
15006        rtx end_2_label = gen_label_rtx ();
15007        /* Is zero in the first two bytes? */
15008
15009        emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
15010        tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
15011        tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
15012        tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
15013                             gen_rtx_LABEL_REF (VOIDmode, end_2_label),
15014                             pc_rtx);
15015        tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
15016        JUMP_LABEL (tmp) = end_2_label;
15017
15018        /* Not in the first two.  Move two bytes forward.  */
15019        emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
15020        if (TARGET_64BIT)
15021          emit_insn (gen_adddi3 (out, out, const2_rtx));
15022        else
15023          emit_insn (gen_addsi3 (out, out, const2_rtx));
15024
15025        emit_label (end_2_label);
15026
15027     }
15028
15029   /* Avoid branch in fixing the byte.  Doubling the low byte of TMPREG
           carries out exactly when its bit 7 is set, i.e. when the zero is the
           first byte of the remaining pair; OUT - 3 - carry then is the
           address of the zero byte.  */
15030   tmpreg = gen_lowpart (QImode, tmpreg);
15031   emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
15032   cmp = gen_rtx_LTU (Pmode, gen_rtx_REG (CCmode, FLAGS_REG), const0_rtx);
15033   if (TARGET_64BIT)
15034     emit_insn (gen_subdi3_carry_rex64 (out, out, GEN_INT (3), cmp));
15035   else
15036     emit_insn (gen_subsi3_carry (out, out, GEN_INT (3), cmp));
15037
15038   emit_label (end_0_label);
15039 }
15040
15041 /* Expand strlen.
         OUT receives the length, SRC is the string MEM, EOSCHAR is the
         terminator to search for and ALIGN the known alignment of SRC.
         Return 0 without emitting anything when the inline expansion is
         declined, 1 after the insns have been emitted.  */
15042
15043 int
15044 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
15045 {
15046   rtx addr, scratch1, scratch2, scratch3, scratch4;
15047
15048   /* The generic case of the strlen expander is long.  Avoid
15049      expanding it unless TARGET_INLINE_ALL_STRINGOPS: the unrolled
           variant is refused here when ALIGN is not a constant or is
           smaller than 4.  */
15050
15051   if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
15052       && !TARGET_INLINE_ALL_STRINGOPS
15053       && !optimize_size
15054       && (!CONST_INT_P (align) || INTVAL (align) < 4))
15055     return 0;
15056
15057   addr = force_reg (Pmode, XEXP (src, 0));
15058   scratch1 = gen_reg_rtx (Pmode);
15059
        /* Same preconditions as above minus the alignment test: use the
           unrolled word-at-a-time expansion.  */
15060   if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
15061       && !optimize_size)
15062     {
15063       /* Well it seems that some optimizer does not combine a call like
15064          foo(strlen(bar), strlen(bar));
15065          when the move and the subtraction is done here.  It does calculate
15066          the length just once when these instructions are done inside of
15067          output_strlen_unroll().  But I think since &bar[strlen(bar)] is
15068          often used and I use one fewer register for the lifetime of
15069          output_strlen_unroll() this is better.  */
15070
15071       emit_move_insn (out, addr);
15072
15073       ix86_expand_strlensi_unroll_1 (out, src, align);
15074
15075       /* strlensi_unroll_1 returns the address of the zero at the end of
15076          the string, like memchr(), so compute the length by subtracting
15077          the start address.  */
15078       if (TARGET_64BIT)
15079         emit_insn (gen_subdi3 (out, out, addr));
15080       else
15081         emit_insn (gen_subsi3 (out, out, addr));
15082     }
15083   else
15084     {
            /* Otherwise scan through the strlenqi_1 pattern with an
               UNSPEC_SCAS operand.  */
15085       rtx unspec;
15086       scratch2 = gen_reg_rtx (Pmode);
15087       scratch3 = gen_reg_rtx (Pmode);
15088       scratch4 = force_reg (Pmode, constm1_rtx);
15089
15090       emit_move_insn (scratch3, addr);
15091       eoschar = force_reg (QImode, eoschar);
15092
15093       src = replace_equiv_address_nv (src, scratch3);
15094
15095       /* If .md starts supporting :P, this can be done in .md.  */
15096       unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
15097                                                  scratch4), UNSPEC_SCAS);
15098       emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
            /* OUT = ~SCRATCH1 - 1.  NOTE(review): presumably SCRATCH1 is the
               count register left over from a scan that started at -1 --
               confirm against strlenqi_1 in i386.md.  */
15099       if (TARGET_64BIT)
15100         {
15101           emit_insn (gen_one_cmpldi2 (scratch2, scratch1));
15102           emit_insn (gen_adddi3 (out, scratch2, constm1_rtx));
15103         }
15104       else
15105         {
15106           emit_insn (gen_one_cmplsi2 (scratch2, scratch1));
15107           emit_insn (gen_addsi3 (out, scratch2, constm1_rtx));
15108         }
15109     }
15110   return 1;
15111 }
15112
15113 /* For the given SYMBOL (a function), construct code to compute the address
15114    of its PLT entry in the large x86-64 PIC model.  SYMBOL must be a
         SYMBOL_REF and the code model must be CM_LARGE_PIC (both asserted).
         Return a new pseudo register holding the computed address.  */
15115 rtx
15116 construct_plt_address (rtx symbol)
15117 {
15118   rtx tmp = gen_reg_rtx (Pmode);
15119   rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
15120
15121   gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
15122   gcc_assert (ix86_cmodel == CM_LARGE_PIC);
15123
        /* tmp = (const (unspec [symbol] UNSPEC_PLTOFF)) + PIC register.  */
15124   emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
15125   emit_insn (gen_adddi3 (tmp, tmp, pic_offset_table_rtx));
15126   return tmp;
15127 }
15128
      /* Emit a call insn.
         RETVAL, if nonnull, is set from the call.  FNADDR is a MEM whose
         address is the call target.  CALLARG1 becomes the second operand of
         the CALL rtx.  CALLARG2 is read with INTVAL on 64-bit targets (so,
         despite ATTRIBUTE_UNUSED, it must be a CONST_INT there).  POP, if
         nonnull and not const0_rtx, is added to the stack pointer in
         parallel with the call.  SIBCALL is nonzero for a sibling call.  */
15129 void
15130 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
15131                   rtx callarg2 ATTRIBUTE_UNUSED,
15132                   rtx pop, int sibcall)
15133 {
15134   rtx use = NULL, call;
15135
        /* A zero pop amount is the same as none; 64-bit never pops.  */
15136   if (pop == const0_rtx)
15137     pop = NULL;
15138   gcc_assert (!TARGET_64BIT || !pop);
15139
15140   if (TARGET_MACHO && !TARGET_64BIT)
15141     {
15142 #if TARGET_MACHO
15143       if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
15144         fnaddr = machopic_indirect_call_target (fnaddr);
15145 #endif
15146     }
15147   else
15148     {
15149       /* Static functions and indirect calls don't need the pic register.  */
15150       if (flag_pic && (!TARGET_64BIT || ix86_cmodel == CM_LARGE_PIC)
15151           && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
15152           && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
15153         use_reg (&use, pic_offset_table_rtx);
15154     }
15155
        /* On 64-bit, a non-negative CALLARG2 is loaded into hard register 0
           in QImode and recorded as used by the call.
           NOTE(review): presumably this is the varargs SSE register count
           passed in AL -- confirm against the callers.  */
15156   if (TARGET_64BIT && INTVAL (callarg2) >= 0)
15157     {
15158       rtx al = gen_rtx_REG (QImode, 0);
15159       emit_move_insn (al, callarg2);
15160       use_reg (&use, al);
15161     }
15162
        /* Large PIC model: non-local symbols are called through their PLT
           address.  Otherwise force addresses that are not valid call
           operands into a register.  */
15163   if (ix86_cmodel == CM_LARGE_PIC
15164       && GET_CODE (fnaddr) == MEM
15165       && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
15166       && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
15167     fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
15168   else if (! call_insn_operand (XEXP (fnaddr, 0), Pmode))
15169     {
15170       fnaddr = copy_to_mode_reg (Pmode, XEXP (fnaddr, 0));
15171       fnaddr = gen_rtx_MEM (QImode, fnaddr);
15172     }
        /* A 64-bit sibcall to a non-constant address goes through R11.  */
15173   if (sibcall && TARGET_64BIT
15174       && !constant_call_address_operand (XEXP (fnaddr, 0), Pmode))
15175     {
15176       rtx addr;
15177       addr = copy_to_mode_reg (Pmode, XEXP (fnaddr, 0));
15178       fnaddr = gen_rtx_REG (Pmode, R11_REG);
15179       emit_move_insn (fnaddr, addr);
15180       fnaddr = gen_rtx_MEM (QImode, fnaddr);
15181     }
15182
        /* Build (call ...), optionally wrapped in a SET of RETVAL and put in
           a PARALLEL with the stack pointer adjustment.  */
15183   call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
15184   if (retval)
15185     call = gen_rtx_SET (VOIDmode, retval, call);
15186   if (pop)
15187     {
15188       pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
15189       pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
15190       call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, call, pop));
15191     }
15192
15193   call = emit_call_insn (call);
15194   if (use)
15195     CALL_INSN_FUNCTION_USAGE (call) = use;
15196 }
15197
15198 \f
15199 /* Clear stack slot assignments remembered from previous functions.
15200    This is called from INIT_EXPANDERS once before RTL is emitted for each
15201    function.  Return a newly allocated, zero-filled machine_function
         with use_fast_prologue_epilogue_nregs set to -1.  */
15202
15203 static struct machine_function *
15204 ix86_init_machine_status (void)
15205 {
15206   struct machine_function *f;
15207
15208   f = ggc_alloc_cleared (sizeof (struct machine_function));
15209   f->use_fast_prologue_epilogue_nregs = -1;
        /* Already zero from the cleared allocation; kept explicit.  */
15210   f->tls_descriptor_call_expanded_p = 0;
15211
15212   return f;
15213 }
15214
15215 /* Return a MEM corresponding to a stack slot with mode MODE.
15216    Allocate a new slot if necessary.
15217
15218    The RTL for a function can have several slots available: N is
15219    which slot to use.  N must be less than MAX_386_STACK_LOCALS.
         Slots are remembered on the ix86_stack_locals list, keyed by the
         (MODE, N) pair.  */
15220
15221 rtx
15222 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
15223 {
15224   struct stack_local_entry *s;
15225
15226   gcc_assert (n < MAX_386_STACK_LOCALS);
15227
        /* Reuse an existing slot; callers get a copy of the remembered rtx.  */
15228   for (s = ix86_stack_locals; s; s = s->next)
15229     if (s->mode == mode && s->n == n)
15230       return copy_rtx (s->rtl);
15231
        /* None yet: allocate one and push it on the front of the list.  */
15232   s = (struct stack_local_entry *)
15233     ggc_alloc (sizeof (struct stack_local_entry));
15234   s->n = n;
15235   s->mode = mode;
15236   s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
15237
15238   s->next = ix86_stack_locals;
15239   ix86_stack_locals = s;
15240   return s->rtl;
15241 }
15242
15243 /* Construct the SYMBOL_REF for the tls_get_addr function.  The symbol is
         created on the first call and cached in ix86_tls_symbol; its name is
         "___tls_get_addr" for 32-bit GNU TLS and "__tls_get_addr"
         otherwise.  */
15244
15245 static GTY(()) rtx ix86_tls_symbol;
15246 rtx
15247 ix86_tls_get_addr (void)
15248 {
15249
15250   if (!ix86_tls_symbol)
15251     {
15252       ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode,
15253                                             (TARGET_ANY_GNU_TLS
15254                                              && !TARGET_64BIT)
15255                                             ? "___tls_get_addr"
15256                                             : "__tls_get_addr");
15257     }
15258
15259   return ix86_tls_symbol;
15260 }
15261
15262 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol.  It is
         created on the first call, flagged with the global-dynamic TLS model
         and cached in ix86_tls_module_base_symbol.  */
15263
15264 static GTY(()) rtx ix86_tls_module_base_symbol;
15265 rtx
15266 ix86_tls_module_base (void)
15267 {
15268
15269   if (!ix86_tls_module_base_symbol)
15270     {
15271       ix86_tls_module_base_symbol = gen_rtx_SYMBOL_REF (Pmode,
15272                                                         "_TLS_MODULE_BASE_");
15273       SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
15274         |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
15275     }
15276
15277   return ix86_tls_module_base_symbol;
15278 }
15279 \f
15280 /* Calculate the length of the memory address in the instruction
15281    encoding.  Does not include the one-byte modrm, opcode, or prefix.
         ADDR is the address expression (not the MEM).  Auto-modify
         addresses (PRE_DEC, POST_INC, PRE_MODIFY, POST_MODIFY) count as 0.
         Any other ADDR must decompose with ix86_decompose_address.  */
15282
15283 int
15284 memory_address_length (rtx addr)
15285 {
15286   struct ix86_address parts;
15287   rtx base, index, disp;
15288   int len;
15289   int ok;
15290
15291   if (GET_CODE (addr) == PRE_DEC
15292       || GET_CODE (addr) == POST_INC
15293       || GET_CODE (addr) == PRE_MODIFY
15294       || GET_CODE (addr) == POST_MODIFY)
15295     return 0;
15296
15297   ok = ix86_decompose_address (addr, &parts);
15298   gcc_assert (ok);
15299
        /* Look through SUBREGs so the pointer comparisons below see the
           underlying registers.  */
15300   if (parts.base && GET_CODE (parts.base) == SUBREG)
15301     parts.base = SUBREG_REG (parts.base);
15302   if (parts.index && GET_CODE (parts.index) == SUBREG)
15303     parts.index = SUBREG_REG (parts.index);
15304
15305   base = parts.base;
15306   index = parts.index;
15307   disp = parts.disp;
15308   len = 0;
15309
15310   /* Rule of thumb:
15311        - esp as the base always wants an index,
15312        - ebp as the base always wants a displacement.  */
15313
15314   /* Register Indirect.  */
15315   if (base && !index && !disp)
15316     {
15317       /* esp (for its index) and ebp (for its displacement) need
15318          the two-byte modrm form.  */
15319       if (addr == stack_pointer_rtx
15320           || addr == arg_pointer_rtx
15321           || addr == frame_pointer_rtx
15322           || addr == hard_frame_pointer_rtx)
15323         len = 1;
15324     }
15325
15326   /* Direct Addressing.  */
15327   else if (disp && !base && !index)
15328     len = 4;
15329
15330   else
15331     {
15332       /* Find the length of the displacement constant: one byte when
               there is a base and DISP satisfies constraint K, else four.  */
15333       if (disp)
15334         {
15335           if (base && satisfies_constraint_K (disp))
15336             len = 1;
15337           else
15338             len = 4;
15339         }
15340       /* ebp always wants a displacement.  */
15341       else if (base == hard_frame_pointer_rtx)
15342         len = 1;
15343
15344       /* An index requires the two-byte modrm form....  */
15345       if (index
15346           /* ...like esp, which always wants an index.  */
15347           || base == stack_pointer_rtx
15348           || base == arg_pointer_rtx
15349           || base == frame_pointer_rtx)
15350         len += 1;
15351     }
15352
15353   return len;
15354 }
15355
15356 /* Compute default value for "length_immediate" attribute.  When SHORTFORM
15357    is set, expect that insn have 8bit immediate alternative.
         Return the number of immediate bytes of INSN, or 0 when it has no
         constant operand.  At most one constant operand is allowed.  */
15358 int
15359 ix86_attr_length_immediate_default (rtx insn, int shortform)
15360 {
15361   int len = 0;
15362   int i;
15363   extract_insn_cached (insn);
15364   for (i = recog_data.n_operands - 1; i >= 0; --i)
15365     if (CONSTANT_P (recog_data.operand[i]))
15366       {
              /* A second constant operand would trip this assertion.  */
15367         gcc_assert (!len);
15368         if (shortform && satisfies_constraint_K (recog_data.operand[i]))
15369           len = 1;
15370         else
15371           {
                  /* Otherwise the size follows the insn's "mode" attribute.  */
15372             switch (get_attr_mode (insn))
15373               {
15374                 case MODE_QI:
15375                   len+=1;
15376                   break;
15377                 case MODE_HI:
15378                   len+=2;
15379                   break;
15380                 case MODE_SI:
15381                   len+=4;
15382                   break;
15383                 /* Immediates for DImode instructions are encoded as 32bit sign extended values.  */
15384                 case MODE_DI:
15385                   len+=4;
15386                   break;
15387                 default:
15388                   fatal_insn ("unknown insn mode", insn);
15389               }
15390           }
15391       }
15392   return len;
15393 }
15394 /* Compute default value for "length_address" attribute.
         For an lea insn this is the length of the address that forms the
         source of its SET.  For any other insn it is the length of the
         address of the first MEM operand found when scanning the operands
         from last to first, or 0 when there is no MEM operand.  */
15395 int
15396 ix86_attr_length_address_default (rtx insn)
15397 {
15398   int i;
15399
15400   if (get_attr_type (insn) == TYPE_LEA)
15401     {
15402       rtx set = PATTERN (insn);
15403
            /* The SET is expected to be the first element of a PARALLEL.  */
15404       if (GET_CODE (set) == PARALLEL)
15405         set = XVECEXP (set, 0, 0);
15406
15407       gcc_assert (GET_CODE (set) == SET);
15408
15409       return memory_address_length (SET_SRC (set));
15410     }
15411
15412   extract_insn_cached (insn);
15413   for (i = recog_data.n_operands - 1; i >= 0; --i)
15414     if (MEM_P (recog_data.operand[i]))
15415       {
15416         return memory_address_length (XEXP (recog_data.operand[i], 0));
15418       }
15419   return 0;
15420 }
15421 \f
15422 /* Return the maximum number of instructions a cpu can issue.  The value
         is selected by ix86_tune; processors not listed below get 1.  */
15423
15424 static int
15425 ix86_issue_rate (void)
15426 {
15427   switch (ix86_tune)
15428     {
15429     case PROCESSOR_PENTIUM:
15430     case PROCESSOR_K6:
15431       return 2;
15432
15433     case PROCESSOR_PENTIUMPRO:
15434     case PROCESSOR_PENTIUM4:
15435     case PROCESSOR_ATHLON:
15436     case PROCESSOR_K8:
15437     case PROCESSOR_AMDFAM10:
15438     case PROCESSOR_NOCONA:
15439     case PROCESSOR_GENERIC32:
15440     case PROCESSOR_GENERIC64:
15441       return 3;
15442
15443     case PROCESSOR_CORE2:
15444       return 4;
15445
15446     default:
15447       return 1;
15448     }
15449 }
15450
15451 /* A subroutine of ix86_adjust_cost -- return true iff INSN reads flags set
15452    by DEP_INSN and nothing set by DEP_INSN.
         INSN_TYPE is the "type" attribute of INSN; only setcc, integer and
         FP conditional moves and branches are considered.  DEP_INSN must be
         either a single set or a PARALLEL of exactly two SETs whose first
         destination is the flags register.  */
15453
15454 static int
15455 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
15456 {
15457   rtx set, set2;
15458
15459   /* Simplify the test for uninteresting insns.  */
15460   if (insn_type != TYPE_SETCC
15461       && insn_type != TYPE_ICMOV
15462       && insn_type != TYPE_FCMOV
15463       && insn_type != TYPE_IBR)
15464     return 0;
15465
15466   if ((set = single_set (dep_insn)) != 0)
15467     {
15468       set = SET_DEST (set);
15469       set2 = NULL_RTX;
15470     }
15471   else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
15472            && XVECLEN (PATTERN (dep_insn), 0) == 2
15473            && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
15474            && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
15475     {
            /* SET is the destination of the first SET, SET2 that of the
               second one.  Taking both from element 0 would make the SET2
               test below reject every insn that passed the SET test.  */
15476       set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
15477       set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
15478     }
15479   else
15480     return 0;
15481
15482   if (!REG_P (set) || REGNO (set) != FLAGS_REG)
15483     return 0;
15484
15485   /* This test is true if the dependent insn reads the flags but
15486      not any other potentially set register.  */
15487   if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
15488     return 0;
15489
15490   if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
15491     return 0;
15492
15493   return 1;
15494 }
15495
15496 /* A subroutine of ix86_adjust_cost -- return true iff INSN has a memory
15497    address with operands set by DEP_INSN.  INSN_TYPE is the "type"
         attribute of INSN.  Return 0 when INSN has no address to look at.  */
15498
15499 static int
15500 ix86_agi_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
15501 {
15502   rtx addr;
15503
        /* For an lea on Pentium the "address" is the source of its SET.  */
15504   if (insn_type == TYPE_LEA
15505       && TARGET_PENTIUM)
15506     {
15507       addr = PATTERN (insn);
15508
15509       if (GET_CODE (addr) == PARALLEL)
15510         addr = XVECEXP (addr, 0, 0);
15511
15512       gcc_assert (GET_CODE (addr) == SET);
15513
15514       addr = SET_SRC (addr);
15515     }
15516   else
15517     {
            /* Otherwise use the address of the first MEM operand found when
               scanning the operands from last to first.  */
15518       int i;
15519       extract_insn_cached (insn);
15520       for (i = recog_data.n_operands - 1; i >= 0; --i)
15521         if (MEM_P (recog_data.operand[i]))
15522           {
15523             addr = XEXP (recog_data.operand[i], 0);
15524             goto found;
15525           }
15526       return 0;
15527     found:;
15528     }
15529
15530   return modified_in_p (addr, dep_insn);
15531 }
15532
      /* Adjust COST, the scheduling cost of the dependence LINK of INSN on
         DEP_INSN, for the processor selected by ix86_tune, and return the
         adjusted value.  Dependences whose REG_NOTE_KIND is nonzero cost 0.
         COST is returned unchanged when either insn cannot be recognized or
         ix86_tune has no case below.  */
15533 static int
15534 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
15535 {
15536   enum attr_type insn_type, dep_insn_type;
15537   enum attr_memory memory;
15538   rtx set, set2;
15539   int dep_insn_code_number;
15540
15541   /* Anti and output dependencies have zero cost on all CPUs.  */
15542   if (REG_NOTE_KIND (link) != 0)
15543     return 0;
15544
15545   dep_insn_code_number = recog_memoized (dep_insn);
15546
15547   /* If we can't recognize the insns, we can't really do anything.  */
15548   if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
15549     return cost;
15550
15551   insn_type = get_attr_type (insn);
15552   dep_insn_type = get_attr_type (dep_insn);
15553
15554   switch (ix86_tune)
15555     {
15556     case PROCESSOR_PENTIUM:
15557       /* Address Generation Interlock adds a cycle of latency.  */
15558       if (ix86_agi_dependent (insn, dep_insn, insn_type))
15559         cost += 1;
15560
15561       /* ??? Compares pair with jump/setcc.  */
15562       if (ix86_flags_dependent (insn, dep_insn, insn_type))
15563         cost = 0;
15564
15565       /* Floating point stores require value to be ready one cycle earlier.  */
15566       if (insn_type == TYPE_FMOV
15567           && get_attr_memory (insn) == MEMORY_STORE
15568           && !ix86_agi_dependent (insn, dep_insn, insn_type))
15569         cost += 1;
15570       break;
15571
15572     case PROCESSOR_PENTIUMPRO:
15573       memory = get_attr_memory (insn);
15574
15575       /* INT->FP conversion is expensive.  */
15576       if (get_attr_fp_int_src (dep_insn))
15577         cost += 5;
15578
15579       /* There is one cycle extra latency between an FP op and a store.  */
15580       if (insn_type == TYPE_FMOV
15581           && (set = single_set (dep_insn)) != NULL_RTX
15582           && (set2 = single_set (insn)) != NULL_RTX
15583           && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
15584           && MEM_P (SET_DEST (set2)))
15585         cost += 1;
15586
15587       /* Show ability of reorder buffer to hide latency of load by executing
15588          in parallel with previous instruction in case
15589          previous instruction is not needed to compute the address.  */
15590       if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
15591           && !ix86_agi_dependent (insn, dep_insn, insn_type))
15592         {
15593           /* Claim moves to take one cycle, as core can issue one load
15594              at time and the next load can start cycle later.  */
15595           if (dep_insn_type == TYPE_IMOV
15596               || dep_insn_type == TYPE_FMOV)
15597             cost = 1;
15598           else if (cost > 1)
15599             cost--;
15600         }
15601       break;
15602
15603     case PROCESSOR_K6:
15604       memory = get_attr_memory (insn);
15605
15606       /* The esp dependency is resolved before the instruction is really
15607          finished.  */
15608       if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
15609           && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
15610         return 1;
15611
15612       /* INT->FP conversion is expensive.  */
15613       if (get_attr_fp_int_src (dep_insn))
15614         cost += 5;
15615
15616       /* Show ability of reorder buffer to hide latency of load by executing
15617          in parallel with previous instruction in case
15618          previous instruction is not needed to compute the address.  */
15619       if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
15620           && !ix86_agi_dependent (insn, dep_insn, insn_type))
15621         {
15622           /* Claim moves to take one cycle, as core can issue one load
15623              at time and the next load can start cycle later.  */
15624           if (dep_insn_type == TYPE_IMOV
15625               || dep_insn_type == TYPE_FMOV)
15626             cost = 1;
15627           else if (cost > 2)
15628             cost -= 2;
15629           else
15630             cost = 1;
15631         }
15632       break;
15633
15634     case PROCESSOR_ATHLON:
15635     case PROCESSOR_K8:
15636     case PROCESSOR_AMDFAM10:
15637     case PROCESSOR_GENERIC32:
15638     case PROCESSOR_GENERIC64:
15639       memory = get_attr_memory (insn);
15640
15641       /* Show ability of reorder buffer to hide latency of load by executing
15642          in parallel with previous instruction in case
15643          previous instruction is not needed to compute the address.  */
15644       if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
15645           && !ix86_agi_dependent (insn, dep_insn, insn_type))
15646         {
15647           enum attr_unit unit = get_attr_unit (insn);
                /* The initial value is always overwritten just below.  */
15648           int loadcost = 3;
15649
15650           /* Because of the difference between the length of integer and
15651              floating unit pipeline preparation stages, the memory operands
15652              for floating point are cheaper.
15653
15654              ??? For Athlon the difference is most probably 2.  */
15655           if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
15656             loadcost = 3;
15657           else
15658             loadcost = TARGET_ATHLON ? 2 : 0;
15659
                /* Subtract the hidden load latency, clamping at zero.  */
15660           if (cost >= loadcost)
15661             cost -= loadcost;
15662           else
15663             cost = 0;
15664         }
15665
            /* No break here: control falls through into the default case,
               which does nothing but break.  */
15666     default:
15667       break;
15668     }
15669
15670   return cost;
15671 }
15672
15673 /* How many alternative schedules to try.  This should be as wide as the
15674    scheduling freedom in the DFA, but no wider.  Making this value too
15675    large results in extra work for the scheduler.  The value is 2 when
         tuning for Pentium, 1 for PentiumPro and K6, and 0 otherwise.  */
15676
15677 static int
15678 ia32_multipass_dfa_lookahead (void)
15679 {
15680   if (ix86_tune == PROCESSOR_PENTIUM)
15681     return 2;
15682
15683   if (ix86_tune == PROCESSOR_PENTIUMPRO
15684       || ix86_tune == PROCESSOR_K6)
15685     return 1;
15686
15687   else
15688     return 0;
15689 }
15690
15691 \f
15692 /* Return the alignment in bits for constant EXP placed in memory; ALIGN
15693    is the alignment it would ordinarily get.  DFmode reals are raised to
15694    64, ALIGN_MODE_128 reals to 128 and, unless optimizing for size,
15695    strings with TREE_STRING_LENGTH of 31 or more to BITS_PER_WORD.  */
15696
15697 int
15698 ix86_constant_alignment (tree exp, int align)
15699 {
15700   if (TREE_CODE (exp) == REAL_CST)
15701     {
15702       enum machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
15703
15704       if (mode == DFmode && align < 64)
15705         align = 64;
15706       else if (ALIGN_MODE_128 (mode) && align < 128)
15707         align = 128;
15708     }
15709   else if (TREE_CODE (exp) == STRING_CST && !optimize_size
15710            && align < BITS_PER_WORD && TREE_STRING_LENGTH (exp) >= 31)
15711     align = BITS_PER_WORD;
15712   return align;
15713 }
15714
15715 /* Return the alignment in bits to use for a static variable of type TYPE,
15716    where ALIGN is the alignment the object would ordinarily have.  The
15717    result is never below ALIGN; it is raised for large aggregates and for
15718    types whose own, element or TYPE_FIELDS mode asks for 64 or 128 bits.  */
15719
15720 int
15721 ix86_data_alignment (tree type, int align)
15722 {
15723   int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);
15724   enum machine_mode mode;
15725   /* Nonzero for an aggregate whose TYPE_SIZE is an INTEGER_CST.  */
15726   bool const_sized_aggr = (AGGREGATE_TYPE_P (type)
15727                            && TYPE_SIZE (type)
15728                            && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST);
15729
15730   /* Aggregates of at least MAX_ALIGN bits get MAX_ALIGN alignment.  */
15731   if (const_sized_aggr
15732       && align < max_align
15733       && (TREE_INT_CST_HIGH (TYPE_SIZE (type))
15734           || TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align))
15735     align = max_align;
15736
15737   /* x86-64 ABI requires arrays greater than 16 bytes to be aligned
15738      to 16byte boundary.  */
15739   if (TARGET_64BIT
15740       && const_sized_aggr
15741       && align < 128
15742       && (TREE_INT_CST_HIGH (TYPE_SIZE (type))
15743           || TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128))
15744     return 128;
15745
15746   /* Pick the mode that decides any further increase.  */
15747   switch (TREE_CODE (type))
15748     {
15749     case ARRAY_TYPE:
15750       mode = TYPE_MODE (TREE_TYPE (type));
15751       break;
15752     case COMPLEX_TYPE:
15753       if (TYPE_MODE (type) == DCmode && align < 64)
15754         return 64;
15755       if (TYPE_MODE (type) == XCmode && align < 128)
15756         return 128;
15757       return align;
15758     case RECORD_TYPE:
15759     case UNION_TYPE:
15760     case QUAL_UNION_TYPE:
15761       /* Only the decl in TYPE_FIELDS is examined, if there is one.  */
15762       if (!TYPE_FIELDS (type))
15763         return align;
15764       mode = DECL_MODE (TYPE_FIELDS (type));
15765       break;
15766     case REAL_TYPE:
15767     case VECTOR_TYPE:
15768     case INTEGER_TYPE:
15769       mode = TYPE_MODE (type);
15770       break;
15771     default:
15772       return align;
15773     }
15774
15775   if (mode == DFmode && align < 64)
15776     return 64;
15777   if (ALIGN_MODE_128 (mode) && align < 128)
15778     return 128;
15779   return align;
15780 }
15781
15782 /* Compute the alignment for a local variable.
15783    TYPE is the data type, and ALIGN is the alignment in bits that
15784    the object would ordinarily have.  The value of this function is used
15785    instead of that alignment to align the object; it is never below ALIGN.  */
15786
15787 int
15788 ix86_local_alignment (tree type, int align)
15789 {
15790   /* x86-64 ABI wants arrays of 16 bytes or more aligned to 16 bytes; this
15791      is applied to every aggregate.  TYPE_SIZE is in bits, hence 128.  */
15792   if (TARGET_64BIT
15793       && AGGREGATE_TYPE_P (type)
15794       && TYPE_SIZE (type)
15795       && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
15796       && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
15797           || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
15798       && align < 128)
15799     return 128;
15800
15801   /* Otherwise go by the mode of the element, the type or TYPE_FIELDS.  */
15802   if (TREE_CODE (type) == ARRAY_TYPE)
15803     {
15804       if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
15805         return 64;
15806       if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
15807         return 128;
15808     }
15809   else if (TREE_CODE (type) == COMPLEX_TYPE)
15810     {
15811       if (TYPE_MODE (type) == DCmode && align < 64)
15812         return 64;
15813       if (TYPE_MODE (type) == XCmode && align < 128)
15814         return 128;
15815     }
15816   else if ((TREE_CODE (type) == RECORD_TYPE
15817             || TREE_CODE (type) == UNION_TYPE
15818             || TREE_CODE (type) == QUAL_UNION_TYPE)
15819            && TYPE_FIELDS (type))
15820     {
15821       if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
15822         return 64;
15823       if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
15824         return 128;
15825     }
15826   else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
15827            || TREE_CODE (type) == INTEGER_TYPE)
15828     {
15829       if (TYPE_MODE (type) == DFmode && align < 64)
15830         return 64;
15831       if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
15832         return 128;
15833     }
15834   return align;
15835 }
15836 \f
15837 /* Emit RTL insns to initialize the variable parts of the trampoline at
15838    address TRAMP.  FNADDR is an RTX for the address of the function's pure
15839    code.  CXT is an RTX for the static chain value for the function.  */
15840 void
15841 x86_initialize_trampoline (rtx tramp, rtx fnaddr, rtx cxt)
15842 {
15843   if (!TARGET_64BIT)
15844     {
15845       /* Compute offset from the end of the jmp (TRAMP + 10) to FNADDR.  */
15846       rtx disp = expand_binop (SImode, sub_optab, fnaddr,
15847                                plus_constant (tramp, 10),
15848                                NULL_RTX, 1, OPTAB_DIRECT);
15849       emit_move_insn (gen_rtx_MEM (QImode, tramp),
15850                       gen_int_mode (0xb9, QImode));  /* Opcode taking CXT.  */
15851       emit_move_insn (gen_rtx_MEM (SImode, plus_constant (tramp, 1)), cxt);
15852       emit_move_insn (gen_rtx_MEM (QImode, plus_constant (tramp, 5)),
15853                       gen_int_mode (0xe9, QImode));  /* The jmp opcode.  */
15854       emit_move_insn (gen_rtx_MEM (SImode, plus_constant (tramp, 6)), disp);
15855     }
15856   else
15857     {
15858       int offset = 0;
15859       /* Try to load address using shorter movl (6 bytes) instead of movabs
15860          (10 bytes).  We may want to support movq for kernel mode, but kernel
15861          does not use trampolines at the moment.  */
15862       if (x86_64_zext_immediate_operand (fnaddr, VOIDmode))
15863         {
15864           fnaddr = copy_to_mode_reg (DImode, fnaddr);
15865           emit_move_insn (gen_rtx_MEM (HImode, plus_constant (tramp, offset)),
15866                           gen_int_mode (0xbb41, HImode));
15867           emit_move_insn (gen_rtx_MEM (SImode, plus_constant (tramp, offset + 2)),
15868                           gen_lowpart (SImode, fnaddr));
15869           offset += 6;
15870         }
15871       else
15872         {
15873           emit_move_insn (gen_rtx_MEM (HImode, plus_constant (tramp, offset)),
15874                           gen_int_mode (0xbb49, HImode));
15875           emit_move_insn (gen_rtx_MEM (DImode, plus_constant (tramp, offset + 2)),
15876                           fnaddr);
15877           offset += 10;
15878         }
15879       /* Load static chain using movabs to r10 (another 10 bytes).  */
15880       emit_move_insn (gen_rtx_MEM (HImode, plus_constant (tramp, offset)),
15881                       gen_int_mode (0xba49, HImode));
15882       emit_move_insn (gen_rtx_MEM (DImode, plus_constant (tramp, offset + 2)),
15883                       cxt);
15884       offset += 10;
15885       /* Jump to r11, where the function address was loaded above.  */
15886       emit_move_insn (gen_rtx_MEM (HImode, plus_constant (tramp, offset)),
15887                       gen_int_mode (0xff49, HImode));
15888       emit_move_insn (gen_rtx_MEM (QImode, plus_constant (tramp, offset+2)),
15889                       gen_int_mode (0xe3, QImode));
15890       offset += 3;
15891       gcc_assert (offset <= TRAMPOLINE_SIZE);  /* Everything must fit.  */
15892     }
15893
15894 #ifdef ENABLE_EXECUTE_STACK
15895   emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
15896                      LCT_NORMAL, VOIDmode, 1, tramp, Pmode);
15897 #endif
15898 }
15899 \f
15900 /* Codes for the ix86 builtins (MMX, SSE, 3DNow!, SSE2, ...); used to index ix86_builtins.  */
15901 enum ix86_builtins
15902 {
15903   IX86_BUILTIN_ADDPS,
15904   IX86_BUILTIN_ADDSS,
15905   IX86_BUILTIN_DIVPS,
15906   IX86_BUILTIN_DIVSS,
15907   IX86_BUILTIN_MULPS,
15908   IX86_BUILTIN_MULSS,
15909   IX86_BUILTIN_SUBPS,
15910   IX86_BUILTIN_SUBSS,
15911
15912   IX86_BUILTIN_CMPEQPS,
15913   IX86_BUILTIN_CMPLTPS,
15914   IX86_BUILTIN_CMPLEPS,
15915   IX86_BUILTIN_CMPGTPS,
15916   IX86_BUILTIN_CMPGEPS,
15917   IX86_BUILTIN_CMPNEQPS,
15918   IX86_BUILTIN_CMPNLTPS,
15919   IX86_BUILTIN_CMPNLEPS,
15920   IX86_BUILTIN_CMPNGTPS,
15921   IX86_BUILTIN_CMPNGEPS,
15922   IX86_BUILTIN_CMPORDPS,
15923   IX86_BUILTIN_CMPUNORDPS,
15924   IX86_BUILTIN_CMPEQSS,
15925   IX86_BUILTIN_CMPLTSS,
15926   IX86_BUILTIN_CMPLESS,
15927   IX86_BUILTIN_CMPNEQSS,
15928   IX86_BUILTIN_CMPNLTSS,
15929   IX86_BUILTIN_CMPNLESS,
15930   IX86_BUILTIN_CMPNGTSS,
15931   IX86_BUILTIN_CMPNGESS,
15932   IX86_BUILTIN_CMPORDSS,
15933   IX86_BUILTIN_CMPUNORDSS,
15934
15935   IX86_BUILTIN_COMIEQSS,
15936   IX86_BUILTIN_COMILTSS,
15937   IX86_BUILTIN_COMILESS,
15938   IX86_BUILTIN_COMIGTSS,
15939   IX86_BUILTIN_COMIGESS,
15940   IX86_BUILTIN_COMINEQSS,
15941   IX86_BUILTIN_UCOMIEQSS,
15942   IX86_BUILTIN_UCOMILTSS,
15943   IX86_BUILTIN_UCOMILESS,
15944   IX86_BUILTIN_UCOMIGTSS,
15945   IX86_BUILTIN_UCOMIGESS,
15946   IX86_BUILTIN_UCOMINEQSS,
15947
15948   IX86_BUILTIN_CVTPI2PS,
15949   IX86_BUILTIN_CVTPS2PI,
15950   IX86_BUILTIN_CVTSI2SS,
15951   IX86_BUILTIN_CVTSI642SS,
15952   IX86_BUILTIN_CVTSS2SI,
15953   IX86_BUILTIN_CVTSS2SI64,
15954   IX86_BUILTIN_CVTTPS2PI,
15955   IX86_BUILTIN_CVTTSS2SI,
15956   IX86_BUILTIN_CVTTSS2SI64,
15957
15958   IX86_BUILTIN_MAXPS,
15959   IX86_BUILTIN_MAXSS,
15960   IX86_BUILTIN_MINPS,
15961   IX86_BUILTIN_MINSS,
15962
15963   IX86_BUILTIN_LOADUPS,
15964   IX86_BUILTIN_STOREUPS,
15965   IX86_BUILTIN_MOVSS,
15966
15967   IX86_BUILTIN_MOVHLPS,
15968   IX86_BUILTIN_MOVLHPS,
15969   IX86_BUILTIN_LOADHPS,
15970   IX86_BUILTIN_LOADLPS,
15971   IX86_BUILTIN_STOREHPS,
15972   IX86_BUILTIN_STORELPS,
15973
15974   IX86_BUILTIN_MASKMOVQ,
15975   IX86_BUILTIN_MOVMSKPS,
15976   IX86_BUILTIN_PMOVMSKB,
15977
15978   IX86_BUILTIN_MOVNTPS,
15979   IX86_BUILTIN_MOVNTQ,
15980
15981   IX86_BUILTIN_LOADDQU,
15982   IX86_BUILTIN_STOREDQU,
15983
15984   IX86_BUILTIN_PACKSSWB,
15985   IX86_BUILTIN_PACKSSDW,
15986   IX86_BUILTIN_PACKUSWB,
15987
15988   IX86_BUILTIN_PADDB,
15989   IX86_BUILTIN_PADDW,
15990   IX86_BUILTIN_PADDD,
15991   IX86_BUILTIN_PADDQ,
15992   IX86_BUILTIN_PADDSB,
15993   IX86_BUILTIN_PADDSW,
15994   IX86_BUILTIN_PADDUSB,
15995   IX86_BUILTIN_PADDUSW,
15996   IX86_BUILTIN_PSUBB,
15997   IX86_BUILTIN_PSUBW,
15998   IX86_BUILTIN_PSUBD,
15999   IX86_BUILTIN_PSUBQ,
16000   IX86_BUILTIN_PSUBSB,
16001   IX86_BUILTIN_PSUBSW,
16002   IX86_BUILTIN_PSUBUSB,
16003   IX86_BUILTIN_PSUBUSW,
16004
16005   IX86_BUILTIN_PAND,
16006   IX86_BUILTIN_PANDN,
16007   IX86_BUILTIN_POR,
16008   IX86_BUILTIN_PXOR,
16009
16010   IX86_BUILTIN_PAVGB,
16011   IX86_BUILTIN_PAVGW,
16012
16013   IX86_BUILTIN_PCMPEQB,
16014   IX86_BUILTIN_PCMPEQW,
16015   IX86_BUILTIN_PCMPEQD,
16016   IX86_BUILTIN_PCMPGTB,
16017   IX86_BUILTIN_PCMPGTW,
16018   IX86_BUILTIN_PCMPGTD,
16019
16020   IX86_BUILTIN_PMADDWD,
16021
16022   IX86_BUILTIN_PMAXSW,
16023   IX86_BUILTIN_PMAXUB,
16024   IX86_BUILTIN_PMINSW,
16025   IX86_BUILTIN_PMINUB,
16026
16027   IX86_BUILTIN_PMULHUW,
16028   IX86_BUILTIN_PMULHW,
16029   IX86_BUILTIN_PMULLW,
16030
16031   IX86_BUILTIN_PSADBW,
16032   IX86_BUILTIN_PSHUFW,
16033
16034   IX86_BUILTIN_PSLLW,
16035   IX86_BUILTIN_PSLLD,
16036   IX86_BUILTIN_PSLLQ,
16037   IX86_BUILTIN_PSRAW,
16038   IX86_BUILTIN_PSRAD,
16039   IX86_BUILTIN_PSRLW,
16040   IX86_BUILTIN_PSRLD,
16041   IX86_BUILTIN_PSRLQ,
16042   IX86_BUILTIN_PSLLWI,
16043   IX86_BUILTIN_PSLLDI,
16044   IX86_BUILTIN_PSLLQI,
16045   IX86_BUILTIN_PSRAWI,
16046   IX86_BUILTIN_PSRADI,
16047   IX86_BUILTIN_PSRLWI,
16048   IX86_BUILTIN_PSRLDI,
16049   IX86_BUILTIN_PSRLQI,
16050
16051   IX86_BUILTIN_PUNPCKHBW,
16052   IX86_BUILTIN_PUNPCKHWD,
16053   IX86_BUILTIN_PUNPCKHDQ,
16054   IX86_BUILTIN_PUNPCKLBW,
16055   IX86_BUILTIN_PUNPCKLWD,
16056   IX86_BUILTIN_PUNPCKLDQ,
16057
16058   IX86_BUILTIN_SHUFPS,
16059
16060   IX86_BUILTIN_RCPPS,
16061   IX86_BUILTIN_RCPSS,
16062   IX86_BUILTIN_RSQRTPS,
16063   IX86_BUILTIN_RSQRTSS,
16064   IX86_BUILTIN_SQRTPS,
16065   IX86_BUILTIN_SQRTSS,
16066
16067   IX86_BUILTIN_UNPCKHPS,
16068   IX86_BUILTIN_UNPCKLPS,
16069
16070   IX86_BUILTIN_ANDPS,
16071   IX86_BUILTIN_ANDNPS,
16072   IX86_BUILTIN_ORPS,
16073   IX86_BUILTIN_XORPS,
16074
16075   IX86_BUILTIN_EMMS,
16076   IX86_BUILTIN_LDMXCSR,
16077   IX86_BUILTIN_STMXCSR,
16078   IX86_BUILTIN_SFENCE,
16079
16080   /* 3DNow! Original */
16081   IX86_BUILTIN_FEMMS,
16082   IX86_BUILTIN_PAVGUSB,
16083   IX86_BUILTIN_PF2ID,
16084   IX86_BUILTIN_PFACC,
16085   IX86_BUILTIN_PFADD,
16086   IX86_BUILTIN_PFCMPEQ,
16087   IX86_BUILTIN_PFCMPGE,
16088   IX86_BUILTIN_PFCMPGT,
16089   IX86_BUILTIN_PFMAX,
16090   IX86_BUILTIN_PFMIN,
16091   IX86_BUILTIN_PFMUL,
16092   IX86_BUILTIN_PFRCP,
16093   IX86_BUILTIN_PFRCPIT1,
16094   IX86_BUILTIN_PFRCPIT2,
16095   IX86_BUILTIN_PFRSQIT1,
16096   IX86_BUILTIN_PFRSQRT,
16097   IX86_BUILTIN_PFSUB,
16098   IX86_BUILTIN_PFSUBR,
16099   IX86_BUILTIN_PI2FD,
16100   IX86_BUILTIN_PMULHRW,
16101
16102   /* 3DNow! Athlon Extensions */
16103   IX86_BUILTIN_PF2IW,
16104   IX86_BUILTIN_PFNACC,
16105   IX86_BUILTIN_PFPNACC,
16106   IX86_BUILTIN_PI2FW,
16107   IX86_BUILTIN_PSWAPDSI,
16108   IX86_BUILTIN_PSWAPDSF,
16109
16110   /* SSE2 */
16111   IX86_BUILTIN_ADDPD,
16112   IX86_BUILTIN_ADDSD,
16113   IX86_BUILTIN_DIVPD,
16114   IX86_BUILTIN_DIVSD,
16115   IX86_BUILTIN_MULPD,
16116   IX86_BUILTIN_MULSD,
16117   IX86_BUILTIN_SUBPD,
16118   IX86_BUILTIN_SUBSD,
16119
16120   IX86_BUILTIN_CMPEQPD,
16121   IX86_BUILTIN_CMPLTPD,
16122   IX86_BUILTIN_CMPLEPD,
16123   IX86_BUILTIN_CMPGTPD,
16124   IX86_BUILTIN_CMPGEPD,
16125   IX86_BUILTIN_CMPNEQPD,
16126   IX86_BUILTIN_CMPNLTPD,
16127   IX86_BUILTIN_CMPNLEPD,
16128   IX86_BUILTIN_CMPNGTPD,
16129   IX86_BUILTIN_CMPNGEPD,
16130   IX86_BUILTIN_CMPORDPD,
16131   IX86_BUILTIN_CMPUNORDPD,
16132   IX86_BUILTIN_CMPEQSD,
16133   IX86_BUILTIN_CMPLTSD,
16134   IX86_BUILTIN_CMPLESD,
16135   IX86_BUILTIN_CMPNEQSD,
16136   IX86_BUILTIN_CMPNLTSD,
16137   IX86_BUILTIN_CMPNLESD,
16138   IX86_BUILTIN_CMPORDSD,
16139   IX86_BUILTIN_CMPUNORDSD,
16140
16141   IX86_BUILTIN_COMIEQSD,
16142   IX86_BUILTIN_COMILTSD,
16143   IX86_BUILTIN_COMILESD,
16144   IX86_BUILTIN_COMIGTSD,
16145   IX86_BUILTIN_COMIGESD,
16146   IX86_BUILTIN_COMINEQSD,
16147   IX86_BUILTIN_UCOMIEQSD,
16148   IX86_BUILTIN_UCOMILTSD,
16149   IX86_BUILTIN_UCOMILESD,
16150   IX86_BUILTIN_UCOMIGTSD,
16151   IX86_BUILTIN_UCOMIGESD,
16152   IX86_BUILTIN_UCOMINEQSD,
16153
16154   IX86_BUILTIN_MAXPD,
16155   IX86_BUILTIN_MAXSD,
16156   IX86_BUILTIN_MINPD,
16157   IX86_BUILTIN_MINSD,
16158
16159   IX86_BUILTIN_ANDPD,
16160   IX86_BUILTIN_ANDNPD,
16161   IX86_BUILTIN_ORPD,
16162   IX86_BUILTIN_XORPD,
16163
16164   IX86_BUILTIN_SQRTPD,
16165   IX86_BUILTIN_SQRTSD,
16166
16167   IX86_BUILTIN_UNPCKHPD,
16168   IX86_BUILTIN_UNPCKLPD,
16169
16170   IX86_BUILTIN_SHUFPD,
16171
16172   IX86_BUILTIN_LOADUPD,
16173   IX86_BUILTIN_STOREUPD,
16174   IX86_BUILTIN_MOVSD,
16175
16176   IX86_BUILTIN_LOADHPD,
16177   IX86_BUILTIN_LOADLPD,
16178
16179   IX86_BUILTIN_CVTDQ2PD,
16180   IX86_BUILTIN_CVTDQ2PS,
16181
16182   IX86_BUILTIN_CVTPD2DQ,
16183   IX86_BUILTIN_CVTPD2PI,
16184   IX86_BUILTIN_CVTPD2PS,
16185   IX86_BUILTIN_CVTTPD2DQ,
16186   IX86_BUILTIN_CVTTPD2PI,
16187
16188   IX86_BUILTIN_CVTPI2PD,
16189   IX86_BUILTIN_CVTSI2SD,
16190   IX86_BUILTIN_CVTSI642SD,
16191
16192   IX86_BUILTIN_CVTSD2SI,
16193   IX86_BUILTIN_CVTSD2SI64,
16194   IX86_BUILTIN_CVTSD2SS,
16195   IX86_BUILTIN_CVTSS2SD,
16196   IX86_BUILTIN_CVTTSD2SI,
16197   IX86_BUILTIN_CVTTSD2SI64,
16198
16199   IX86_BUILTIN_CVTPS2DQ,
16200   IX86_BUILTIN_CVTPS2PD,
16201   IX86_BUILTIN_CVTTPS2DQ,
16202
16203   IX86_BUILTIN_MOVNTI,
16204   IX86_BUILTIN_MOVNTPD,
16205   IX86_BUILTIN_MOVNTDQ,
16206
16207   /* SSE2 MMX */
16208   IX86_BUILTIN_MASKMOVDQU,
16209   IX86_BUILTIN_MOVMSKPD,
16210   IX86_BUILTIN_PMOVMSKB128,
16211
16212   IX86_BUILTIN_PACKSSWB128,
16213   IX86_BUILTIN_PACKSSDW128,
16214   IX86_BUILTIN_PACKUSWB128,
16215
16216   IX86_BUILTIN_PADDB128,
16217   IX86_BUILTIN_PADDW128,
16218   IX86_BUILTIN_PADDD128,
16219   IX86_BUILTIN_PADDQ128,
16220   IX86_BUILTIN_PADDSB128,
16221   IX86_BUILTIN_PADDSW128,
16222   IX86_BUILTIN_PADDUSB128,
16223   IX86_BUILTIN_PADDUSW128,
16224   IX86_BUILTIN_PSUBB128,
16225   IX86_BUILTIN_PSUBW128,
16226   IX86_BUILTIN_PSUBD128,
16227   IX86_BUILTIN_PSUBQ128,
16228   IX86_BUILTIN_PSUBSB128,
16229   IX86_BUILTIN_PSUBSW128,
16230   IX86_BUILTIN_PSUBUSB128,
16231   IX86_BUILTIN_PSUBUSW128,
16232
16233   IX86_BUILTIN_PAND128,
16234   IX86_BUILTIN_PANDN128,
16235   IX86_BUILTIN_POR128,
16236   IX86_BUILTIN_PXOR128,
16237
16238   IX86_BUILTIN_PAVGB128,
16239   IX86_BUILTIN_PAVGW128,
16240
16241   IX86_BUILTIN_PCMPEQB128,
16242   IX86_BUILTIN_PCMPEQW128,
16243   IX86_BUILTIN_PCMPEQD128,
16244   IX86_BUILTIN_PCMPGTB128,
16245   IX86_BUILTIN_PCMPGTW128,
16246   IX86_BUILTIN_PCMPGTD128,
16247
16248   IX86_BUILTIN_PMADDWD128,
16249
16250   IX86_BUILTIN_PMAXSW128,
16251   IX86_BUILTIN_PMAXUB128,
16252   IX86_BUILTIN_PMINSW128,
16253   IX86_BUILTIN_PMINUB128,
16254
16255   IX86_BUILTIN_PMULUDQ,
16256   IX86_BUILTIN_PMULUDQ128,
16257   IX86_BUILTIN_PMULHUW128,
16258   IX86_BUILTIN_PMULHW128,
16259   IX86_BUILTIN_PMULLW128,
16260
16261   IX86_BUILTIN_PSADBW128,
16262   IX86_BUILTIN_PSHUFHW,
16263   IX86_BUILTIN_PSHUFLW,
16264   IX86_BUILTIN_PSHUFD,
16265
16266   IX86_BUILTIN_PSLLDQI128,
16267   IX86_BUILTIN_PSLLWI128,
16268   IX86_BUILTIN_PSLLDI128,
16269   IX86_BUILTIN_PSLLQI128,
16270   IX86_BUILTIN_PSRAWI128,
16271   IX86_BUILTIN_PSRADI128,
16272   IX86_BUILTIN_PSRLDQI128,
16273   IX86_BUILTIN_PSRLWI128,
16274   IX86_BUILTIN_PSRLDI128,
16275   IX86_BUILTIN_PSRLQI128,
16276
16277   IX86_BUILTIN_PSLLDQ128,
16278   IX86_BUILTIN_PSLLW128,
16279   IX86_BUILTIN_PSLLD128,
16280   IX86_BUILTIN_PSLLQ128,
16281   IX86_BUILTIN_PSRAW128,
16282   IX86_BUILTIN_PSRAD128,
16283   IX86_BUILTIN_PSRLW128,
16284   IX86_BUILTIN_PSRLD128,
16285   IX86_BUILTIN_PSRLQ128,
16286
16287   IX86_BUILTIN_PUNPCKHBW128,
16288   IX86_BUILTIN_PUNPCKHWD128,
16289   IX86_BUILTIN_PUNPCKHDQ128,
16290   IX86_BUILTIN_PUNPCKHQDQ128,
16291   IX86_BUILTIN_PUNPCKLBW128,
16292   IX86_BUILTIN_PUNPCKLWD128,
16293   IX86_BUILTIN_PUNPCKLDQ128,
16294   IX86_BUILTIN_PUNPCKLQDQ128,
16295
16296   IX86_BUILTIN_CLFLUSH,
16297   IX86_BUILTIN_MFENCE,
16298   IX86_BUILTIN_LFENCE,
16299
16300   /* Prescott New Instructions.  */
16301   IX86_BUILTIN_ADDSUBPS,
16302   IX86_BUILTIN_HADDPS,
16303   IX86_BUILTIN_HSUBPS,
16304   IX86_BUILTIN_MOVSHDUP,
16305   IX86_BUILTIN_MOVSLDUP,
16306   IX86_BUILTIN_ADDSUBPD,
16307   IX86_BUILTIN_HADDPD,
16308   IX86_BUILTIN_HSUBPD,
16309   IX86_BUILTIN_LDDQU,
16310
16311   IX86_BUILTIN_MONITOR,
16312   IX86_BUILTIN_MWAIT,
16313
16314   /* SSSE3.  */
16315   IX86_BUILTIN_PHADDW,
16316   IX86_BUILTIN_PHADDD,
16317   IX86_BUILTIN_PHADDSW,
16318   IX86_BUILTIN_PHSUBW,
16319   IX86_BUILTIN_PHSUBD,
16320   IX86_BUILTIN_PHSUBSW,
16321   IX86_BUILTIN_PMADDUBSW,
16322   IX86_BUILTIN_PMULHRSW,
16323   IX86_BUILTIN_PSHUFB,
16324   IX86_BUILTIN_PSIGNB,
16325   IX86_BUILTIN_PSIGNW,
16326   IX86_BUILTIN_PSIGND,
16327   IX86_BUILTIN_PALIGNR,
16328   IX86_BUILTIN_PABSB,
16329   IX86_BUILTIN_PABSW,
16330   IX86_BUILTIN_PABSD,
16331
16332   IX86_BUILTIN_PHADDW128,
16333   IX86_BUILTIN_PHADDD128,
16334   IX86_BUILTIN_PHADDSW128,
16335   IX86_BUILTIN_PHSUBW128,
16336   IX86_BUILTIN_PHSUBD128,
16337   IX86_BUILTIN_PHSUBSW128,
16338   IX86_BUILTIN_PMADDUBSW128,
16339   IX86_BUILTIN_PMULHRSW128,
16340   IX86_BUILTIN_PSHUFB128,
16341   IX86_BUILTIN_PSIGNB128,
16342   IX86_BUILTIN_PSIGNW128,
16343   IX86_BUILTIN_PSIGND128,
16344   IX86_BUILTIN_PALIGNR128,
16345   IX86_BUILTIN_PABSB128,
16346   IX86_BUILTIN_PABSW128,
16347   IX86_BUILTIN_PABSD128,
16348
16349   /* AMDFAM10 - SSE4A New Instructions.  */
16350   IX86_BUILTIN_MOVNTSD,
16351   IX86_BUILTIN_MOVNTSS,
16352   IX86_BUILTIN_EXTRQI,
16353   IX86_BUILTIN_EXTRQ,
16354   IX86_BUILTIN_INSERTQI,
16355   IX86_BUILTIN_INSERTQ,
16356   /* The vec_init, vec_ext and vec_set builtins (not SSE4A specific).  */
16357   IX86_BUILTIN_VEC_INIT_V2SI,
16358   IX86_BUILTIN_VEC_INIT_V4HI,
16359   IX86_BUILTIN_VEC_INIT_V8QI,
16360   IX86_BUILTIN_VEC_EXT_V2DF,
16361   IX86_BUILTIN_VEC_EXT_V2DI,
16362   IX86_BUILTIN_VEC_EXT_V4SF,
16363   IX86_BUILTIN_VEC_EXT_V4SI,
16364   IX86_BUILTIN_VEC_EXT_V8HI,
16365   IX86_BUILTIN_VEC_EXT_V2SI,
16366   IX86_BUILTIN_VEC_EXT_V4HI,
16367   IX86_BUILTIN_VEC_SET_V8HI,
16368   IX86_BUILTIN_VEC_SET_V4HI,
16369   /* Number of codes above; sizes the ix86_builtins table, so keep it last.  */
16370   IX86_BUILTIN_MAX
16371 };
16372
16373 /* Table of the ix86 builtin decls, indexed by enum ix86_builtins code.  */
16374 static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];
16375
16376 /* Register the ix86 target builtin NAME with function type TYPE and
16377    builtin code CODE, provided target_flags includes one of the bits in
16378    MASK (and, if MASK contains MASK_64BIT, TARGET_64BIT holds).  Record
16379    the decl in ix86_builtins and return it; otherwise return NULL_TREE.  */
16380
16381 static inline tree
16382 def_builtin (int mask, const char *name, tree type, enum ix86_builtins code)
16383 {
16384   tree fndecl;
16385
16386   /* Nothing to add: no MASK bit is enabled, or 64-bit only builtin.  */
16387   if (!(mask & target_flags)
16388       || ((mask & MASK_64BIT) && !TARGET_64BIT))
16389     return NULL_TREE;
16390
16391   fndecl = add_builtin_function (name, type, code, BUILT_IN_MD,
16392                                  NULL, NULL_TREE);
16393   ix86_builtins[(int) code] = fndecl;
16394   return fndecl;
16395 }
16396
16397 /* Like def_builtin, but additionally set TREE_READONLY on the decl that
16398    was added, which marks the function "const".  */
16399 static inline tree
16400 def_builtin_const (int mask, const char *name, tree type,
16401                    enum ix86_builtins code)
16402 {
16403   tree fndecl = def_builtin (mask, name, type, code);
16404   if (fndecl != NULL_TREE)
16405     TREE_READONLY (fndecl) = 1;
16406   return fndecl;
16407 }
16408
16409 /* Bits for builtin_description.flag.  BUILTIN_DESC_SWAP_OPERANDS is set
16410    when we don't support the comparison natively; the entry then names
16411    the comparison to use after swapping the two operands.  */
16412 #define BUILTIN_DESC_SWAP_OPERANDS      1
16413
16414 /* Describes one builtin; the bdesc_* tables are arrays of these.  */
16415 struct builtin_description
16416 {
16417   const unsigned int mask;         /* MASK_* target flag bits.  */
16418   const enum insn_code icode;      /* CODE_FOR_* insn code.  */
16419   const char *const name;          /* Name, e.g. "__builtin_ia32_addps".  */
16420   const enum ix86_builtins code;   /* IX86_BUILTIN_* code.  */
16421   const enum rtx_code comparison;  /* rtx comparison code, or 0.  */
16422   const unsigned int flag;         /* 0 or BUILTIN_DESC_* bits.  */
16423 };
16424
16425 static const struct builtin_description bdesc_comi[] =
16426 { /* The comi and ucomi builtins: MASK_SSE entries first, then MASK_SSE2.  */
16427   { MASK_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
16428   { MASK_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
16429   { MASK_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
16430   { MASK_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
16431   { MASK_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
16432   { MASK_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
16433   { MASK_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
16434   { MASK_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
16435   { MASK_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
16436   { MASK_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
16437   { MASK_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
16438   { MASK_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
16439   { MASK_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
16440   { MASK_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
16441   { MASK_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
16442   { MASK_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
16443   { MASK_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
16444   { MASK_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
16445   { MASK_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
16446   { MASK_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
16447   { MASK_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
16448   { MASK_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
16449   { MASK_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
16450   { MASK_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
16451 };
16452
16453 static const struct builtin_description bdesc_2arg[] =
16454 {
16455   /* SSE */
16456   { MASK_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, 0, 0 },
16457   { MASK_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, 0, 0 },
16458   { MASK_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, 0, 0 },
16459   { MASK_SSE, CODE_FOR_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, 0, 0 },
16460   { MASK_SSE, CODE_FOR_sse_vmaddv4sf3,  "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, 0, 0 },
16461   { MASK_SSE, CODE_FOR_sse_vmsubv4sf3,  "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, 0, 0 },
16462   { MASK_SSE, CODE_FOR_sse_vmmulv4sf3,  "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, 0, 0 },
16463   { MASK_SSE, CODE_FOR_sse_vmdivv4sf3,  "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, 0, 0 },
16464
16465   { MASK_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, 0 },
16466   { MASK_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, 0 },
16467   { MASK_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, 0 },
16468   { MASK_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT,
16469     BUILTIN_DESC_SWAP_OPERANDS },
16470   { MASK_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE,
16471     BUILTIN_DESC_SWAP_OPERANDS },
16472   { MASK_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, 0 },
16473   { MASK_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, 0 },
16474   { MASK_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, 0 },
16475   { MASK_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, 0 },
16476   { MASK_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE,
16477     BUILTIN_DESC_SWAP_OPERANDS },
16478   { MASK_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT,
16479     BUILTIN_DESC_SWAP_OPERANDS },
16480   { MASK_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, 0 },
16481   { MASK_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, 0 },
16482   { MASK_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, 0 },
16483   { MASK_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, 0 },
16484   { MASK_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, 0 },
16485   { MASK_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, 0 },
16486   { MASK_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, 0 },
16487   { MASK_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, 0 },
16488   { MASK_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE,
16489     BUILTIN_DESC_SWAP_OPERANDS },
16490   { MASK_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT,
16491     BUILTIN_DESC_SWAP_OPERANDS },
16492   { MASK_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, 0 },
16493
16494   { MASK_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, 0, 0 },
16495   { MASK_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, 0, 0 },
16496   { MASK_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, 0, 0 },
16497   { MASK_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, 0, 0 },
16498
16499   { MASK_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, 0, 0 },
16500   { MASK_SSE, CODE_FOR_sse_nandv4sf3,  "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, 0, 0 },
16501   { MASK_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, 0, 0 },
16502   { MASK_SSE, CODE_FOR_xorv4sf3,  "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, 0, 0 },
16503
16504   { MASK_SSE, CODE_FOR_sse_movss,  "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, 0, 0 },
16505   { MASK_SSE, CODE_FOR_sse_movhlps,  "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, 0, 0 },
16506   { MASK_SSE, CODE_FOR_sse_movlhps,  "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, 0, 0 },
16507   { MASK_SSE, CODE_FOR_sse_unpckhps, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, 0, 0 },
16508   { MASK_SSE, CODE_FOR_sse_unpcklps, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, 0, 0 },
16509
16510   /* MMX */
16511   { MASK_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, 0, 0 },
16512   { MASK_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, 0, 0 },
16513   { MASK_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, 0, 0 },
16514   { MASK_SSE2, CODE_FOR_mmx_adddi3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, 0, 0 },
16515   { MASK_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, 0, 0 },
16516   { MASK_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, 0, 0 },
16517   { MASK_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, 0, 0 },
16518   { MASK_SSE2, CODE_FOR_mmx_subdi3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, 0, 0 },
16519
16520   { MASK_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, 0, 0 },
16521   { MASK_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, 0, 0 },
16522   { MASK_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, 0, 0 },
16523   { MASK_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, 0, 0 },
16524   { MASK_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, 0, 0 },
16525   { MASK_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, 0, 0 },
16526   { MASK_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, 0, 0 },
16527   { MASK_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, 0, 0 },
16528
16529   { MASK_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, 0, 0 },
16530   { MASK_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, 0, 0 },
16531   { MASK_SSE | MASK_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, 0, 0 },
16532
16533   { MASK_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, 0, 0 },
16534   { MASK_MMX, CODE_FOR_mmx_nandv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, 0, 0 },
16535   { MASK_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, 0, 0 },
16536   { MASK_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, 0, 0 },
16537
16538   { MASK_SSE | MASK_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, 0, 0 },
16539   { MASK_SSE | MASK_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, 0, 0 },
16540
16541   { MASK_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, 0, 0 },
16542   { MASK_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, 0, 0 },
16543   { MASK_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, 0, 0 },
16544   { MASK_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, 0, 0 },
16545   { MASK_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, 0, 0 },
16546   { MASK_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, 0, 0 },
16547
16548   { MASK_SSE | MASK_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, 0, 0 },
16549   { MASK_SSE | MASK_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, 0, 0 },
16550   { MASK_SSE | MASK_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, 0, 0 },
16551   { MASK_SSE | MASK_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, 0, 0 },
16552
16553   { MASK_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, 0, 0 },
16554   { MASK_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, 0, 0 },
16555   { MASK_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, 0, 0 },
16556   { MASK_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, 0, 0 },
16557   { MASK_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, 0, 0 },
16558   { MASK_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, 0, 0 },
16559
16560   /* Special.  */
16561   { MASK_MMX, CODE_FOR_mmx_packsswb, 0, IX86_BUILTIN_PACKSSWB, 0, 0 },
16562   { MASK_MMX, CODE_FOR_mmx_packssdw, 0, IX86_BUILTIN_PACKSSDW, 0, 0 },
16563   { MASK_MMX, CODE_FOR_mmx_packuswb, 0, IX86_BUILTIN_PACKUSWB, 0, 0 },
16564
16565   { MASK_SSE, CODE_FOR_sse_cvtpi2ps, 0, IX86_BUILTIN_CVTPI2PS, 0, 0 },
16566   { MASK_SSE, CODE_FOR_sse_cvtsi2ss, 0, IX86_BUILTIN_CVTSI2SS, 0, 0 },
16567   { MASK_SSE | MASK_64BIT, CODE_FOR_sse_cvtsi2ssq, 0, IX86_BUILTIN_CVTSI642SS, 0, 0 },
16568
16569   { MASK_MMX, CODE_FOR_mmx_ashlv4hi3, 0, IX86_BUILTIN_PSLLW, 0, 0 },
16570   { MASK_MMX, CODE_FOR_mmx_ashlv4hi3, 0, IX86_BUILTIN_PSLLWI, 0, 0 },
16571   { MASK_MMX, CODE_FOR_mmx_ashlv2si3, 0, IX86_BUILTIN_PSLLD, 0, 0 },
16572   { MASK_MMX, CODE_FOR_mmx_ashlv2si3, 0, IX86_BUILTIN_PSLLDI, 0, 0 },
16573   { MASK_MMX, CODE_FOR_mmx_ashldi3, 0, IX86_BUILTIN_PSLLQ, 0, 0 },
16574   { MASK_MMX, CODE_FOR_mmx_ashldi3, 0, IX86_BUILTIN_PSLLQI, 0, 0 },
16575
16576   { MASK_MMX, CODE_FOR_mmx_lshrv4hi3, 0, IX86_BUILTIN_PSRLW, 0, 0 },
16577   { MASK_MMX, CODE_FOR_mmx_lshrv4hi3, 0, IX86_BUILTIN_PSRLWI, 0, 0 },
16578   { MASK_MMX, CODE_FOR_mmx_lshrv2si3, 0, IX86_BUILTIN_PSRLD, 0, 0 },
16579   { MASK_MMX, CODE_FOR_mmx_lshrv2si3, 0, IX86_BUILTIN_PSRLDI, 0, 0 },
16580   { MASK_MMX, CODE_FOR_mmx_lshrdi3, 0, IX86_BUILTIN_PSRLQ, 0, 0 },
16581   { MASK_MMX, CODE_FOR_mmx_lshrdi3, 0, IX86_BUILTIN_PSRLQI, 0, 0 },
16582
16583   { MASK_MMX, CODE_FOR_mmx_ashrv4hi3, 0, IX86_BUILTIN_PSRAW, 0, 0 },
16584   { MASK_MMX, CODE_FOR_mmx_ashrv4hi3, 0, IX86_BUILTIN_PSRAWI, 0, 0 },
16585   { MASK_MMX, CODE_FOR_mmx_ashrv2si3, 0, IX86_BUILTIN_PSRAD, 0, 0 },
16586   { MASK_MMX, CODE_FOR_mmx_ashrv2si3, 0, IX86_BUILTIN_PSRADI, 0, 0 },
16587
16588   { MASK_SSE | MASK_3DNOW_A, CODE_FOR_mmx_psadbw, 0, IX86_BUILTIN_PSADBW, 0, 0 },
16589   { MASK_MMX, CODE_FOR_mmx_pmaddwd, 0, IX86_BUILTIN_PMADDWD, 0, 0 },
16590
16591   /* SSE2 */
16592   { MASK_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, 0, 0 },
16593   { MASK_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, 0, 0 },
16594   { MASK_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, 0, 0 },
16595   { MASK_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, 0, 0 },
16596   { MASK_SSE2, CODE_FOR_sse2_vmaddv2df3,  "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, 0, 0 },
16597   { MASK_SSE2, CODE_FOR_sse2_vmsubv2df3,  "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, 0, 0 },
16598   { MASK_SSE2, CODE_FOR_sse2_vmmulv2df3,  "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, 0, 0 },
16599   { MASK_SSE2, CODE_FOR_sse2_vmdivv2df3,  "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, 0, 0 },
16600
16601   { MASK_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, 0 },
16602   { MASK_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, 0 },
16603   { MASK_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, 0 },
16604   { MASK_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT,
16605     BUILTIN_DESC_SWAP_OPERANDS },
16606   { MASK_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE,
16607     BUILTIN_DESC_SWAP_OPERANDS },
16608   { MASK_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, 0 },
16609   { MASK_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, 0 },
16610   { MASK_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, 0 },
16611   { MASK_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, 0 },
16612   { MASK_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE,
16613     BUILTIN_DESC_SWAP_OPERANDS },
16614   { MASK_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT,
16615     BUILTIN_DESC_SWAP_OPERANDS },
16616   { MASK_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, 0 },
16617   { MASK_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, 0 },
16618   { MASK_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, 0 },
16619   { MASK_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, 0 },
16620   { MASK_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, 0 },
16621   { MASK_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, 0 },
16622   { MASK_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, 0 },
16623   { MASK_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, 0 },
16624   { MASK_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, 0 },
16625
16626   { MASK_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, 0, 0 },
16627   { MASK_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, 0, 0 },
16628   { MASK_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, 0, 0 },
16629   { MASK_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, 0, 0 },
16630
16631   { MASK_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, 0, 0 },
16632   { MASK_SSE2, CODE_FOR_sse2_nandv2df3,  "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, 0, 0 },
16633   { MASK_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, 0, 0 },
16634   { MASK_SSE2, CODE_FOR_xorv2df3,  "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, 0, 0 },
16635
16636   { MASK_SSE2, CODE_FOR_sse2_movsd,  "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, 0, 0 },
16637   { MASK_SSE2, CODE_FOR_sse2_unpckhpd, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, 0, 0 },
16638   { MASK_SSE2, CODE_FOR_sse2_unpcklpd, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, 0, 0 },
16639
16640   /* SSE2 MMX */
16641   { MASK_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, 0, 0 },
16642   { MASK_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, 0, 0 },
16643   { MASK_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, 0, 0 },
16644   { MASK_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, 0, 0 },
16645   { MASK_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, 0, 0 },
16646   { MASK_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, 0, 0 },
16647   { MASK_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, 0, 0 },
16648   { MASK_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, 0, 0 },
16649
16650   { MASK_MMX, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, 0, 0 },
16651   { MASK_MMX, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, 0, 0 },
16652   { MASK_MMX, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, 0, 0 },
16653   { MASK_MMX, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, 0, 0 },
16654   { MASK_MMX, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, 0, 0 },
16655   { MASK_MMX, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, 0, 0 },
16656   { MASK_MMX, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, 0, 0 },
16657   { MASK_MMX, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, 0, 0 },
16658
16659   { MASK_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, 0, 0 },
16660   { MASK_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, 0, 0 },
16661
16662   { MASK_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, 0, 0 },
16663   { MASK_SSE2, CODE_FOR_sse2_nandv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, 0, 0 },
16664   { MASK_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, 0, 0 },
16665   { MASK_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, 0, 0 },
16666
16667   { MASK_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, 0, 0 },
16668   { MASK_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, 0, 0 },
16669
16670   { MASK_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, 0, 0 },
16671   { MASK_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, 0, 0 },
16672   { MASK_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, 0, 0 },
16673   { MASK_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, 0, 0 },
16674   { MASK_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, 0, 0 },
16675   { MASK_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, 0, 0 },
16676
16677   { MASK_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, 0, 0 },
16678   { MASK_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, 0, 0 },
16679   { MASK_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, 0, 0 },
16680   { MASK_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, 0, 0 },
16681
16682   { MASK_SSE2, CODE_FOR_sse2_punpckhbw, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, 0, 0 },
16683   { MASK_SSE2, CODE_FOR_sse2_punpckhwd, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, 0, 0 },
16684   { MASK_SSE2, CODE_FOR_sse2_punpckhdq, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, 0, 0 },
16685   { MASK_SSE2, CODE_FOR_sse2_punpckhqdq, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, 0, 0 },
16686   { MASK_SSE2, CODE_FOR_sse2_punpcklbw, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, 0, 0 },
16687   { MASK_SSE2, CODE_FOR_sse2_punpcklwd, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, 0, 0 },
16688   { MASK_SSE2, CODE_FOR_sse2_punpckldq, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, 0, 0 },
16689   { MASK_SSE2, CODE_FOR_sse2_punpcklqdq, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, 0, 0 },
16690
16691   { MASK_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, 0, 0 },
16692   { MASK_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, 0, 0 },
16693   { MASK_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, 0, 0 },
16694
16695   { MASK_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, 0, 0 },
16696   { MASK_SSE2, CODE_FOR_sse2_psadbw, 0, IX86_BUILTIN_PSADBW128, 0, 0 },
16697
16698   { MASK_SSE2, CODE_FOR_sse2_umulsidi3, 0, IX86_BUILTIN_PMULUDQ, 0, 0 },
16699   { MASK_SSE2, CODE_FOR_sse2_umulv2siv2di3, 0, IX86_BUILTIN_PMULUDQ128, 0, 0 },
16700
16701   { MASK_SSE2, CODE_FOR_ashlv8hi3, 0, IX86_BUILTIN_PSLLWI128, 0, 0 },
16702   { MASK_SSE2, CODE_FOR_ashlv4si3, 0, IX86_BUILTIN_PSLLDI128, 0, 0 },
16703   { MASK_SSE2, CODE_FOR_ashlv2di3, 0, IX86_BUILTIN_PSLLQI128, 0, 0 },
16704
16705   { MASK_SSE2, CODE_FOR_lshrv8hi3, 0, IX86_BUILTIN_PSRLWI128, 0, 0 },
16706   { MASK_SSE2, CODE_FOR_lshrv4si3, 0, IX86_BUILTIN_PSRLDI128, 0, 0 },
16707   { MASK_SSE2, CODE_FOR_lshrv2di3, 0, IX86_BUILTIN_PSRLQI128, 0, 0 },
16708
16709   { MASK_SSE2, CODE_FOR_ashrv8hi3, 0, IX86_BUILTIN_PSRAWI128, 0, 0 },
16710   { MASK_SSE2, CODE_FOR_ashrv4si3, 0, IX86_BUILTIN_PSRADI128, 0, 0 },
16711
16712   { MASK_SSE2, CODE_FOR_sse2_pmaddwd, 0, IX86_BUILTIN_PMADDWD128, 0, 0 },
16713
16714   { MASK_SSE2, CODE_FOR_sse2_cvtsi2sd, 0, IX86_BUILTIN_CVTSI2SD, 0, 0 },
16715   { MASK_SSE2 | MASK_64BIT, CODE_FOR_sse2_cvtsi2sdq, 0, IX86_BUILTIN_CVTSI642SD, 0, 0 },
16716   { MASK_SSE2, CODE_FOR_sse2_cvtsd2ss, 0, IX86_BUILTIN_CVTSD2SS, 0, 0 },
16717   { MASK_SSE2, CODE_FOR_sse2_cvtss2sd, 0, IX86_BUILTIN_CVTSS2SD, 0, 0 },
16718
16719   /* SSE3 */
16720   { MASK_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, 0, 0 },
16721   { MASK_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, 0, 0 },
16722   { MASK_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, 0, 0 },
16723   { MASK_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, 0, 0 },
16724   { MASK_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, 0, 0 },
16725   { MASK_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, 0, 0 },
16726
16727   /* SSSE3 */
16728   { MASK_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, 0, 0 },
16729   { MASK_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, 0, 0 },
16730   { MASK_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, 0, 0 },
16731   { MASK_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, 0, 0 },
16732   { MASK_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, 0, 0 },
16733   { MASK_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, 0, 0 },
16734   { MASK_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, 0, 0 },
16735   { MASK_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, 0, 0 },
16736   { MASK_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, 0, 0 },
16737   { MASK_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, 0, 0 },
16738   { MASK_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, 0, 0 },
16739   { MASK_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, 0, 0 },
16740   { MASK_SSSE3, CODE_FOR_ssse3_pmaddubswv8hi3, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, 0, 0 },
16741   { MASK_SSSE3, CODE_FOR_ssse3_pmaddubswv4hi3, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, 0, 0 },
16742   { MASK_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, 0, 0 },
16743   { MASK_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, 0, 0 },
16744   { MASK_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, 0, 0 },
16745   { MASK_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, 0, 0 },
16746   { MASK_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, 0, 0 },
16747   { MASK_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, 0, 0 },
16748   { MASK_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, 0, 0 },
16749   { MASK_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, 0, 0 },
16750   { MASK_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, 0, 0 },
16751   { MASK_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, 0, 0 }
16752 };
16753
/* Table of builtin descriptors which, going by its name, covers the
   one-argument MMX/SSE builtins.  Each initializer lists, in order:
   the MASK_* bits for the entry, the CODE_FOR_* insn pattern, the
   builtin's name string (0 where this table gives no name), the
   IX86_BUILTIN_* code, and two trailing fields that are 0 for every
   entry here (in bdesc_2arg above, the same two positions hold a
   comparison code such as EQ or LT and a flag such as
   BUILTIN_DESC_SWAP_OPERANDS).  Only the SSE3 and SSSE3 entries carry a
   name string.  Every entry that also lists MASK_64BIT uses an insn
   pattern whose name is that of the neighbouring non-64-bit entry with
   a trailing "q", and a builtin code with a "64" suffix.
   NOTE(review): struct builtin_description and the code that walks this
   table are not visible in this part of the file; confirm the field
   meanings, and how the nameless entries get registered, against
   them.  */
16754 static const struct builtin_description bdesc_1arg[] =
16755 { /* SSE; the first entry also lists MASK_3DNOW_A.  */
16756   { MASK_SSE | MASK_3DNOW_A, CODE_FOR_mmx_pmovmskb, 0, IX86_BUILTIN_PMOVMSKB, 0, 0 },
16757   { MASK_SSE, CODE_FOR_sse_movmskps, 0, IX86_BUILTIN_MOVMSKPS, 0, 0 },
16758
16759   { MASK_SSE, CODE_FOR_sqrtv4sf2, 0, IX86_BUILTIN_SQRTPS, 0, 0 },
16760   { MASK_SSE, CODE_FOR_sse_rsqrtv4sf2, 0, IX86_BUILTIN_RSQRTPS, 0, 0 },
16761   { MASK_SSE, CODE_FOR_sse_rcpv4sf2, 0, IX86_BUILTIN_RCPPS, 0, 0 },
16762
16763   { MASK_SSE, CODE_FOR_sse_cvtps2pi, 0, IX86_BUILTIN_CVTPS2PI, 0, 0 },
16764   { MASK_SSE, CODE_FOR_sse_cvtss2si, 0, IX86_BUILTIN_CVTSS2SI, 0, 0 },
16765   { MASK_SSE | MASK_64BIT, CODE_FOR_sse_cvtss2siq, 0, IX86_BUILTIN_CVTSS2SI64, 0, 0 },
16766   { MASK_SSE, CODE_FOR_sse_cvttps2pi, 0, IX86_BUILTIN_CVTTPS2PI, 0, 0 },
16767   { MASK_SSE, CODE_FOR_sse_cvttss2si, 0, IX86_BUILTIN_CVTTSS2SI, 0, 0 },
16768   { MASK_SSE | MASK_64BIT, CODE_FOR_sse_cvttss2siq, 0, IX86_BUILTIN_CVTTSS2SI64, 0, 0 },
16769   /* SSE2 */
16770   { MASK_SSE2, CODE_FOR_sse2_pmovmskb, 0, IX86_BUILTIN_PMOVMSKB128, 0, 0 },
16771   { MASK_SSE2, CODE_FOR_sse2_movmskpd, 0, IX86_BUILTIN_MOVMSKPD, 0, 0 },
16772
16773   { MASK_SSE2, CODE_FOR_sqrtv2df2, 0, IX86_BUILTIN_SQRTPD, 0, 0 },
16774
16775   { MASK_SSE2, CODE_FOR_sse2_cvtdq2pd, 0, IX86_BUILTIN_CVTDQ2PD, 0, 0 },
16776   { MASK_SSE2, CODE_FOR_sse2_cvtdq2ps, 0, IX86_BUILTIN_CVTDQ2PS, 0, 0 },
16777
16778   { MASK_SSE2, CODE_FOR_sse2_cvtpd2dq, 0, IX86_BUILTIN_CVTPD2DQ, 0, 0 },
16779   { MASK_SSE2, CODE_FOR_sse2_cvtpd2pi, 0, IX86_BUILTIN_CVTPD2PI, 0, 0 },
16780   { MASK_SSE2, CODE_FOR_sse2_cvtpd2ps, 0, IX86_BUILTIN_CVTPD2PS, 0, 0 },
16781   { MASK_SSE2, CODE_FOR_sse2_cvttpd2dq, 0, IX86_BUILTIN_CVTTPD2DQ, 0, 0 },
16782   { MASK_SSE2, CODE_FOR_sse2_cvttpd2pi, 0, IX86_BUILTIN_CVTTPD2PI, 0, 0 },
16783
16784   { MASK_SSE2, CODE_FOR_sse2_cvtpi2pd, 0, IX86_BUILTIN_CVTPI2PD, 0, 0 },
16785
16786   { MASK_SSE2, CODE_FOR_sse2_cvtsd2si, 0, IX86_BUILTIN_CVTSD2SI, 0, 0 },
16787   { MASK_SSE2, CODE_FOR_sse2_cvttsd2si, 0, IX86_BUILTIN_CVTTSD2SI, 0, 0 },
16788   { MASK_SSE2 | MASK_64BIT, CODE_FOR_sse2_cvtsd2siq, 0, IX86_BUILTIN_CVTSD2SI64, 0, 0 },
16789   { MASK_SSE2 | MASK_64BIT, CODE_FOR_sse2_cvttsd2siq, 0, IX86_BUILTIN_CVTTSD2SI64, 0, 0 },
16790
16791   { MASK_SSE2, CODE_FOR_sse2_cvtps2dq, 0, IX86_BUILTIN_CVTPS2DQ, 0, 0 },
16792   { MASK_SSE2, CODE_FOR_sse2_cvtps2pd, 0, IX86_BUILTIN_CVTPS2PD, 0, 0 },
16793   { MASK_SSE2, CODE_FOR_sse2_cvttps2dq, 0, IX86_BUILTIN_CVTTPS2DQ, 0, 0 },
16794
16795   /* SSE3 */
16796   { MASK_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, 0, 0 },
16797   { MASK_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, 0, 0 },
16798
16799   /* SSSE3 */
16800   { MASK_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, 0, 0 },
16801   { MASK_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, 0, 0 },
16802   { MASK_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, 0, 0 },
16803   { MASK_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, 0, 0 },
16804   { MASK_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, 0, 0 },
16805   { MASK_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, 0, 0 },
16806 };
16807
16808 /* Set up all the MMX/SSE builtins.  This is not called if TARGET_MMX
16809    is zero.  Otherwise, if TARGET_SSE is not set, only expand the MMX
16810    builtins.  */
16811 static void
16812 ix86_init_mmx_sse_builtins (void)
16813 {
16814   const struct builtin_description * d;
16815   size_t i;
16816
16817   tree V16QI_type_node = build_vector_type_for_mode (char_type_node, V16QImode);
16818   tree V2SI_type_node = build_vector_type_for_mode (intSI_type_node, V2SImode);
16819   tree V2SF_type_node = build_vector_type_for_mode (float_type_node, V2SFmode);
16820   tree V2DI_type_node
16821     = build_vector_type_for_mode (long_long_integer_type_node, V2DImode);
16822   tree V2DF_type_node = build_vector_type_for_mode (double_type_node, V2DFmode);
16823   tree V4SF_type_node = build_vector_type_for_mode (float_type_node, V4SFmode);
16824   tree V4SI_type_node = build_vector_type_for_mode (intSI_type_node, V4SImode);
16825   tree V4HI_type_node = build_vector_type_for_mode (intHI_type_node, V4HImode);
16826   tree V8QI_type_node = build_vector_type_for_mode (char_type_node, V8QImode);
16827   tree V8HI_type_node = build_vector_type_for_mode (intHI_type_node, V8HImode);
16828
16829   tree pchar_type_node = build_pointer_type (char_type_node);
16830   tree pcchar_type_node = build_pointer_type (
16831                              build_type_variant (char_type_node, 1, 0));
16832   tree pfloat_type_node = build_pointer_type (float_type_node);
16833   tree pcfloat_type_node = build_pointer_type (
16834                              build_type_variant (float_type_node, 1, 0));
16835   tree pv2si_type_node = build_pointer_type (V2SI_type_node);
16836   tree pv2di_type_node = build_pointer_type (V2DI_type_node);
16837   tree pdi_type_node = build_pointer_type (long_long_unsigned_type_node);
16838
16839   /* Comparisons.  */
16840   tree int_ftype_v4sf_v4sf
16841     = build_function_type_list (integer_type_node,
16842                                 V4SF_type_node, V4SF_type_node, NULL_TREE);
16843   tree v4si_ftype_v4sf_v4sf
16844     = build_function_type_list (V4SI_type_node,
16845                                 V4SF_type_node, V4SF_type_node, NULL_TREE);
16846   /* MMX/SSE/integer conversions.  */
16847   tree int_ftype_v4sf
16848     = build_function_type_list (integer_type_node,
16849                                 V4SF_type_node, NULL_TREE);
16850   tree int64_ftype_v4sf
16851     = build_function_type_list (long_long_integer_type_node,
16852                                 V4SF_type_node, NULL_TREE);
16853   tree int_ftype_v8qi
16854     = build_function_type_list (integer_type_node, V8QI_type_node, NULL_TREE);
16855   tree v4sf_ftype_v4sf_int
16856     = build_function_type_list (V4SF_type_node,
16857                                 V4SF_type_node, integer_type_node, NULL_TREE);
16858   tree v4sf_ftype_v4sf_int64
16859     = build_function_type_list (V4SF_type_node,
16860                                 V4SF_type_node, long_long_integer_type_node,
16861                                 NULL_TREE);
16862   tree v4sf_ftype_v4sf_v2si
16863     = build_function_type_list (V4SF_type_node,
16864                                 V4SF_type_node, V2SI_type_node, NULL_TREE);
16865
16866   /* Miscellaneous.  */
16867   tree v8qi_ftype_v4hi_v4hi
16868     = build_function_type_list (V8QI_type_node,
16869                                 V4HI_type_node, V4HI_type_node, NULL_TREE);
16870   tree v4hi_ftype_v2si_v2si
16871     = build_function_type_list (V4HI_type_node,
16872                                 V2SI_type_node, V2SI_type_node, NULL_TREE);
16873   tree v4sf_ftype_v4sf_v4sf_int
16874     = build_function_type_list (V4SF_type_node,
16875                                 V4SF_type_node, V4SF_type_node,
16876                                 integer_type_node, NULL_TREE);
16877   tree v2si_ftype_v4hi_v4hi
16878     = build_function_type_list (V2SI_type_node,
16879                                 V4HI_type_node, V4HI_type_node, NULL_TREE);
16880   tree v4hi_ftype_v4hi_int
16881     = build_function_type_list (V4HI_type_node,
16882                                 V4HI_type_node, integer_type_node, NULL_TREE);
16883   tree v4hi_ftype_v4hi_di
16884     = build_function_type_list (V4HI_type_node,
16885                                 V4HI_type_node, long_long_unsigned_type_node,
16886                                 NULL_TREE);
16887   tree v2si_ftype_v2si_di
16888     = build_function_type_list (V2SI_type_node,
16889                                 V2SI_type_node, long_long_unsigned_type_node,
16890                                 NULL_TREE);
16891   tree void_ftype_void
16892     = build_function_type (void_type_node, void_list_node);
16893   tree void_ftype_unsigned
16894     = build_function_type_list (void_type_node, unsigned_type_node, NULL_TREE);
16895   tree void_ftype_unsigned_unsigned
16896     = build_function_type_list (void_type_node, unsigned_type_node,
16897                                 unsigned_type_node, NULL_TREE);
16898   tree void_ftype_pcvoid_unsigned_unsigned
16899     = build_function_type_list (void_type_node, const_ptr_type_node,
16900                                 unsigned_type_node, unsigned_type_node,
16901                                 NULL_TREE);
16902   tree unsigned_ftype_void
16903     = build_function_type (unsigned_type_node, void_list_node);
16904   tree v2si_ftype_v4sf
16905     = build_function_type_list (V2SI_type_node, V4SF_type_node, NULL_TREE);
16906   /* Loads/stores.  */
16907   tree void_ftype_v8qi_v8qi_pchar
16908     = build_function_type_list (void_type_node,
16909                                 V8QI_type_node, V8QI_type_node,
16910                                 pchar_type_node, NULL_TREE);
16911   tree v4sf_ftype_pcfloat
16912     = build_function_type_list (V4SF_type_node, pcfloat_type_node, NULL_TREE);
16913   /* @@@ the type is bogus */
16914   tree v4sf_ftype_v4sf_pv2si
16915     = build_function_type_list (V4SF_type_node,
16916                                 V4SF_type_node, pv2si_type_node, NULL_TREE);
16917   tree void_ftype_pv2si_v4sf
16918     = build_function_type_list (void_type_node,
16919                                 pv2si_type_node, V4SF_type_node, NULL_TREE);
16920   tree void_ftype_pfloat_v4sf
16921     = build_function_type_list (void_type_node,
16922                                 pfloat_type_node, V4SF_type_node, NULL_TREE);
16923   tree void_ftype_pdi_di
16924     = build_function_type_list (void_type_node,
16925                                 pdi_type_node, long_long_unsigned_type_node,
16926                                 NULL_TREE);
16927   tree void_ftype_pv2di_v2di
16928     = build_function_type_list (void_type_node,
16929                                 pv2di_type_node, V2DI_type_node, NULL_TREE);
16930   /* Normal vector unops.  */
16931   tree v4sf_ftype_v4sf
16932     = build_function_type_list (V4SF_type_node, V4SF_type_node, NULL_TREE);
16933   tree v16qi_ftype_v16qi
16934     = build_function_type_list (V16QI_type_node, V16QI_type_node, NULL_TREE);
16935   tree v8hi_ftype_v8hi
16936     = build_function_type_list (V8HI_type_node, V8HI_type_node, NULL_TREE);
16937   tree v4si_ftype_v4si
16938     = build_function_type_list (V4SI_type_node, V4SI_type_node, NULL_TREE);
16939   tree v8qi_ftype_v8qi
16940     = build_function_type_list (V8QI_type_node, V8QI_type_node, NULL_TREE);
16941   tree v4hi_ftype_v4hi
16942     = build_function_type_list (V4HI_type_node, V4HI_type_node, NULL_TREE);
16943
16944   /* Normal vector binops.  */
16945   tree v4sf_ftype_v4sf_v4sf
16946     = build_function_type_list (V4SF_type_node,
16947                                 V4SF_type_node, V4SF_type_node, NULL_TREE);
16948   tree v8qi_ftype_v8qi_v8qi
16949     = build_function_type_list (V8QI_type_node,
16950                                 V8QI_type_node, V8QI_type_node, NULL_TREE);
16951   tree v4hi_ftype_v4hi_v4hi
16952     = build_function_type_list (V4HI_type_node,
16953                                 V4HI_type_node, V4HI_type_node, NULL_TREE);
16954   tree v2si_ftype_v2si_v2si
16955     = build_function_type_list (V2SI_type_node,
16956                                 V2SI_type_node, V2SI_type_node, NULL_TREE);
16957   tree di_ftype_di_di
16958     = build_function_type_list (long_long_unsigned_type_node,
16959                                 long_long_unsigned_type_node,
16960                                 long_long_unsigned_type_node, NULL_TREE);
16961
16962   tree di_ftype_di_di_int
16963     = build_function_type_list (long_long_unsigned_type_node,
16964                                 long_long_unsigned_type_node,
16965                                 long_long_unsigned_type_node,
16966                                 integer_type_node, NULL_TREE);
16967
16968   tree v2si_ftype_v2sf
16969     = build_function_type_list (V2SI_type_node, V2SF_type_node, NULL_TREE);
16970   tree v2sf_ftype_v2si
16971     = build_function_type_list (V2SF_type_node, V2SI_type_node, NULL_TREE);
16972   tree v2si_ftype_v2si
16973     = build_function_type_list (V2SI_type_node, V2SI_type_node, NULL_TREE);
16974   tree v2sf_ftype_v2sf
16975     = build_function_type_list (V2SF_type_node, V2SF_type_node, NULL_TREE);
16976   tree v2sf_ftype_v2sf_v2sf
16977     = build_function_type_list (V2SF_type_node,
16978                                 V2SF_type_node, V2SF_type_node, NULL_TREE);
16979   tree v2si_ftype_v2sf_v2sf
16980     = build_function_type_list (V2SI_type_node,
16981                                 V2SF_type_node, V2SF_type_node, NULL_TREE);
16982   tree pint_type_node    = build_pointer_type (integer_type_node);
16983   tree pdouble_type_node = build_pointer_type (double_type_node);
16984   tree pcdouble_type_node = build_pointer_type (
16985                                 build_type_variant (double_type_node, 1, 0));
16986   tree int_ftype_v2df_v2df
16987     = build_function_type_list (integer_type_node,
16988                                 V2DF_type_node, V2DF_type_node, NULL_TREE);
16989
16990   tree void_ftype_pcvoid
16991     = build_function_type_list (void_type_node, const_ptr_type_node, NULL_TREE);
16992   tree v4sf_ftype_v4si
16993     = build_function_type_list (V4SF_type_node, V4SI_type_node, NULL_TREE);
16994   tree v4si_ftype_v4sf
16995     = build_function_type_list (V4SI_type_node, V4SF_type_node, NULL_TREE);
16996   tree v2df_ftype_v4si
16997     = build_function_type_list (V2DF_type_node, V4SI_type_node, NULL_TREE);
16998   tree v4si_ftype_v2df
16999     = build_function_type_list (V4SI_type_node, V2DF_type_node, NULL_TREE);
17000   tree v2si_ftype_v2df
17001     = build_function_type_list (V2SI_type_node, V2DF_type_node, NULL_TREE);
17002   tree v4sf_ftype_v2df
17003     = build_function_type_list (V4SF_type_node, V2DF_type_node, NULL_TREE);
17004   tree v2df_ftype_v2si
17005     = build_function_type_list (V2DF_type_node, V2SI_type_node, NULL_TREE);
17006   tree v2df_ftype_v4sf
17007     = build_function_type_list (V2DF_type_node, V4SF_type_node, NULL_TREE);
17008   tree int_ftype_v2df
17009     = build_function_type_list (integer_type_node, V2DF_type_node, NULL_TREE);
17010   tree int64_ftype_v2df
17011     = build_function_type_list (long_long_integer_type_node,
17012                                 V2DF_type_node, NULL_TREE);
17013   tree v2df_ftype_v2df_int
17014     = build_function_type_list (V2DF_type_node,
17015                                 V2DF_type_node, integer_type_node, NULL_TREE);
17016   tree v2df_ftype_v2df_int64
17017     = build_function_type_list (V2DF_type_node,
17018                                 V2DF_type_node, long_long_integer_type_node,
17019                                 NULL_TREE);
17020   tree v4sf_ftype_v4sf_v2df
17021     = build_function_type_list (V4SF_type_node,
17022                                 V4SF_type_node, V2DF_type_node, NULL_TREE);
17023   tree v2df_ftype_v2df_v4sf
17024     = build_function_type_list (V2DF_type_node,
17025                                 V2DF_type_node, V4SF_type_node, NULL_TREE);
17026   tree v2df_ftype_v2df_v2df_int
17027     = build_function_type_list (V2DF_type_node,
17028                                 V2DF_type_node, V2DF_type_node,
17029                                 integer_type_node,
17030                                 NULL_TREE);
17031   tree v2df_ftype_v2df_pcdouble
17032     = build_function_type_list (V2DF_type_node,
17033                                 V2DF_type_node, pcdouble_type_node, NULL_TREE);
17034   tree void_ftype_pdouble_v2df
17035     = build_function_type_list (void_type_node,
17036                                 pdouble_type_node, V2DF_type_node, NULL_TREE);
17037   tree void_ftype_pint_int
17038     = build_function_type_list (void_type_node,
17039                                 pint_type_node, integer_type_node, NULL_TREE);
17040   tree void_ftype_v16qi_v16qi_pchar
17041     = build_function_type_list (void_type_node,
17042                                 V16QI_type_node, V16QI_type_node,
17043                                 pchar_type_node, NULL_TREE);
17044   tree v2df_ftype_pcdouble
17045     = build_function_type_list (V2DF_type_node, pcdouble_type_node, NULL_TREE);
17046   tree v2df_ftype_v2df_v2df
17047     = build_function_type_list (V2DF_type_node,
17048                                 V2DF_type_node, V2DF_type_node, NULL_TREE);
17049   tree v16qi_ftype_v16qi_v16qi
17050     = build_function_type_list (V16QI_type_node,
17051                                 V16QI_type_node, V16QI_type_node, NULL_TREE);
17052   tree v8hi_ftype_v8hi_v8hi
17053     = build_function_type_list (V8HI_type_node,
17054                                 V8HI_type_node, V8HI_type_node, NULL_TREE);
17055   tree v4si_ftype_v4si_v4si
17056     = build_function_type_list (V4SI_type_node,
17057                                 V4SI_type_node, V4SI_type_node, NULL_TREE);
17058   tree v2di_ftype_v2di_v2di
17059     = build_function_type_list (V2DI_type_node,
17060                                 V2DI_type_node, V2DI_type_node, NULL_TREE);
17061   tree v2di_ftype_v2df_v2df
17062     = build_function_type_list (V2DI_type_node,
17063                                 V2DF_type_node, V2DF_type_node, NULL_TREE);
17064   tree v2df_ftype_v2df
17065     = build_function_type_list (V2DF_type_node, V2DF_type_node, NULL_TREE);
17066   tree v2di_ftype_v2di_int
17067     = build_function_type_list (V2DI_type_node,
17068                                 V2DI_type_node, integer_type_node, NULL_TREE);
17069   tree v2di_ftype_v2di_v2di_int
17070     = build_function_type_list (V2DI_type_node, V2DI_type_node,
17071                                 V2DI_type_node, integer_type_node, NULL_TREE);
17072   tree v4si_ftype_v4si_int
17073     = build_function_type_list (V4SI_type_node,
17074                                 V4SI_type_node, integer_type_node, NULL_TREE);
17075   tree v8hi_ftype_v8hi_int
17076     = build_function_type_list (V8HI_type_node,
17077                                 V8HI_type_node, integer_type_node, NULL_TREE);
17078   tree v4si_ftype_v8hi_v8hi
17079     = build_function_type_list (V4SI_type_node,
17080                                 V8HI_type_node, V8HI_type_node, NULL_TREE);
17081   tree di_ftype_v8qi_v8qi
17082     = build_function_type_list (long_long_unsigned_type_node,
17083                                 V8QI_type_node, V8QI_type_node, NULL_TREE);
17084   tree di_ftype_v2si_v2si
17085     = build_function_type_list (long_long_unsigned_type_node,
17086                                 V2SI_type_node, V2SI_type_node, NULL_TREE);
17087   tree v2di_ftype_v16qi_v16qi
17088     = build_function_type_list (V2DI_type_node,
17089                                 V16QI_type_node, V16QI_type_node, NULL_TREE);
17090   tree v2di_ftype_v4si_v4si
17091     = build_function_type_list (V2DI_type_node,
17092                                 V4SI_type_node, V4SI_type_node, NULL_TREE);
17093   tree int_ftype_v16qi
17094     = build_function_type_list (integer_type_node, V16QI_type_node, NULL_TREE);
17095   tree v16qi_ftype_pcchar
17096     = build_function_type_list (V16QI_type_node, pcchar_type_node, NULL_TREE);
17097   tree void_ftype_pchar_v16qi
17098     = build_function_type_list (void_type_node,
17099                                 pchar_type_node, V16QI_type_node, NULL_TREE);
17100
17101   tree v2di_ftype_v2di_unsigned_unsigned
17102     = build_function_type_list (V2DI_type_node, V2DI_type_node,
17103                                 unsigned_type_node, unsigned_type_node,
17104                                 NULL_TREE);
17105   tree v2di_ftype_v2di_v2di_unsigned_unsigned
17106     = build_function_type_list (V2DI_type_node, V2DI_type_node, V2DI_type_node,
17107                                 unsigned_type_node, unsigned_type_node,
17108                                 NULL_TREE);
17109   tree v2di_ftype_v2di_v16qi
17110     = build_function_type_list (V2DI_type_node, V2DI_type_node, V16QI_type_node,
17111                                 NULL_TREE);
17112
17113   tree float80_type;
17114   tree float128_type;
17115   tree ftype;
17116
17117   /* The __float80 type.  */
17118   if (TYPE_MODE (long_double_type_node) == XFmode)
17119     (*lang_hooks.types.register_builtin_type) (long_double_type_node,
17120                                                "__float80");
17121   else
17122     {
17123       /* The __float80 type.  */
17124       float80_type = make_node (REAL_TYPE);
17125       TYPE_PRECISION (float80_type) = 80;
17126       layout_type (float80_type);
17127       (*lang_hooks.types.register_builtin_type) (float80_type, "__float80");
17128     }
17129
17130   if (TARGET_64BIT)
17131     {
17132       float128_type = make_node (REAL_TYPE);
17133       TYPE_PRECISION (float128_type) = 128;
17134       layout_type (float128_type);
17135       (*lang_hooks.types.register_builtin_type) (float128_type, "__float128");
17136     }
17137
17138   /* Add all builtins that are more or less simple operations on two
17139      operands.  */
17140   for (i = 0, d = bdesc_2arg; i < ARRAY_SIZE (bdesc_2arg); i++, d++)
17141     {
17142       /* Use one of the operands; the target can have a different mode for
17143          mask-generating compares.  */
17144       enum machine_mode mode;
17145       tree type;
17146
17147       if (d->name == 0)
17148         continue;
17149       mode = insn_data[d->icode].operand[1].mode;
17150
17151       switch (mode)
17152         {
17153         case V16QImode:
17154           type = v16qi_ftype_v16qi_v16qi;
17155           break;
17156         case V8HImode:
17157           type = v8hi_ftype_v8hi_v8hi;
17158           break;
17159         case V4SImode:
17160           type = v4si_ftype_v4si_v4si;
17161           break;
17162         case V2DImode:
17163           type = v2di_ftype_v2di_v2di;
17164           break;
17165         case V2DFmode:
17166           type = v2df_ftype_v2df_v2df;
17167           break;
17168         case V4SFmode:
17169           type = v4sf_ftype_v4sf_v4sf;
17170           break;
17171         case V8QImode:
17172           type = v8qi_ftype_v8qi_v8qi;
17173           break;
17174         case V4HImode:
17175           type = v4hi_ftype_v4hi_v4hi;
17176           break;
17177         case V2SImode:
17178           type = v2si_ftype_v2si_v2si;
17179           break;
17180         case DImode:
17181           type = di_ftype_di_di;
17182           break;
17183
17184         default:
17185           gcc_unreachable ();
17186         }
17187
17188       /* Override for comparisons.  */
17189       if (d->icode == CODE_FOR_sse_maskcmpv4sf3
17190           || d->icode == CODE_FOR_sse_vmmaskcmpv4sf3)
17191         type = v4si_ftype_v4sf_v4sf;
17192
17193       if (d->icode == CODE_FOR_sse2_maskcmpv2df3
17194           || d->icode == CODE_FOR_sse2_vmmaskcmpv2df3)
17195         type = v2di_ftype_v2df_v2df;
17196
17197       def_builtin (d->mask, d->name, type, d->code);
17198     }
17199
17200   /* Add all builtins that are more or less simple operations on 1 operand.  */
17201   for (i = 0, d = bdesc_1arg; i < ARRAY_SIZE (bdesc_1arg); i++, d++)
17202     {
17203       enum machine_mode mode;
17204       tree type;
17205
17206       if (d->name == 0)
17207         continue;
17208       mode = insn_data[d->icode].operand[1].mode;
17209
17210       switch (mode)
17211         {
17212         case V16QImode:
17213           type = v16qi_ftype_v16qi;
17214           break;
17215         case V8HImode:
17216           type = v8hi_ftype_v8hi;
17217           break;
17218         case V4SImode:
17219           type = v4si_ftype_v4si;
17220           break;
17221         case V2DFmode:
17222           type = v2df_ftype_v2df;
17223           break;
17224         case V4SFmode:
17225           type = v4sf_ftype_v4sf;
17226           break;
17227         case V8QImode:
17228           type = v8qi_ftype_v8qi;
17229           break;
17230         case V4HImode:
17231           type = v4hi_ftype_v4hi;
17232           break;
17233         case V2SImode:
17234           type = v2si_ftype_v2si;
17235           break;
17236
17237         default:
17238           abort ();
17239         }
17240
17241       def_builtin (d->mask, d->name, type, d->code);
17242     }
17243
17244   /* Add the remaining MMX insns with somewhat more complicated types.  */
17245   def_builtin (MASK_MMX, "__builtin_ia32_emms", void_ftype_void, IX86_BUILTIN_EMMS);
17246   def_builtin (MASK_MMX, "__builtin_ia32_psllw", v4hi_ftype_v4hi_di, IX86_BUILTIN_PSLLW);
17247   def_builtin (MASK_MMX, "__builtin_ia32_pslld", v2si_ftype_v2si_di, IX86_BUILTIN_PSLLD);
17248   def_builtin (MASK_MMX, "__builtin_ia32_psllq", di_ftype_di_di, IX86_BUILTIN_PSLLQ);
17249
17250   def_builtin (MASK_MMX, "__builtin_ia32_psrlw", v4hi_ftype_v4hi_di, IX86_BUILTIN_PSRLW);
17251   def_builtin (MASK_MMX, "__builtin_ia32_psrld", v2si_ftype_v2si_di, IX86_BUILTIN_PSRLD);
17252   def_builtin (MASK_MMX, "__builtin_ia32_psrlq", di_ftype_di_di, IX86_BUILTIN_PSRLQ);
17253
17254   def_builtin (MASK_MMX, "__builtin_ia32_psraw", v4hi_ftype_v4hi_di, IX86_BUILTIN_PSRAW);
17255   def_builtin (MASK_MMX, "__builtin_ia32_psrad", v2si_ftype_v2si_di, IX86_BUILTIN_PSRAD);
17256
17257   def_builtin (MASK_SSE | MASK_3DNOW_A, "__builtin_ia32_pshufw", v4hi_ftype_v4hi_int, IX86_BUILTIN_PSHUFW);
17258   def_builtin (MASK_MMX, "__builtin_ia32_pmaddwd", v2si_ftype_v4hi_v4hi, IX86_BUILTIN_PMADDWD);
17259
17260   /* comi/ucomi insns.  */
17261   for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
17262     if (d->mask == MASK_SSE2)
17263       def_builtin (d->mask, d->name, int_ftype_v2df_v2df, d->code);
17264     else
17265       def_builtin (d->mask, d->name, int_ftype_v4sf_v4sf, d->code);
17266
17267   def_builtin (MASK_MMX, "__builtin_ia32_packsswb", v8qi_ftype_v4hi_v4hi, IX86_BUILTIN_PACKSSWB);
17268   def_builtin (MASK_MMX, "__builtin_ia32_packssdw", v4hi_ftype_v2si_v2si, IX86_BUILTIN_PACKSSDW);
17269   def_builtin (MASK_MMX, "__builtin_ia32_packuswb", v8qi_ftype_v4hi_v4hi, IX86_BUILTIN_PACKUSWB);
17270
17271   def_builtin (MASK_SSE, "__builtin_ia32_ldmxcsr", void_ftype_unsigned, IX86_BUILTIN_LDMXCSR);
17272   def_builtin (MASK_SSE, "__builtin_ia32_stmxcsr", unsigned_ftype_void, IX86_BUILTIN_STMXCSR);
17273   def_builtin_const (MASK_SSE, "__builtin_ia32_cvtpi2ps", v4sf_ftype_v4sf_v2si, IX86_BUILTIN_CVTPI2PS);
17274   def_builtin_const (MASK_SSE, "__builtin_ia32_cvtps2pi", v2si_ftype_v4sf, IX86_BUILTIN_CVTPS2PI);
17275   def_builtin_const (MASK_SSE, "__builtin_ia32_cvtsi2ss", v4sf_ftype_v4sf_int, IX86_BUILTIN_CVTSI2SS);
17276   def_builtin_const (MASK_SSE | MASK_64BIT, "__builtin_ia32_cvtsi642ss", v4sf_ftype_v4sf_int64, IX86_BUILTIN_CVTSI642SS);
17277   def_builtin_const (MASK_SSE, "__builtin_ia32_cvtss2si", int_ftype_v4sf, IX86_BUILTIN_CVTSS2SI);
17278   def_builtin_const (MASK_SSE | MASK_64BIT, "__builtin_ia32_cvtss2si64", int64_ftype_v4sf, IX86_BUILTIN_CVTSS2SI64);
17279   def_builtin_const (MASK_SSE, "__builtin_ia32_cvttps2pi", v2si_ftype_v4sf, IX86_BUILTIN_CVTTPS2PI);
17280   def_builtin_const (MASK_SSE, "__builtin_ia32_cvttss2si", int_ftype_v4sf, IX86_BUILTIN_CVTTSS2SI);
17281   def_builtin_const (MASK_SSE | MASK_64BIT, "__builtin_ia32_cvttss2si64", int64_ftype_v4sf, IX86_BUILTIN_CVTTSS2SI64);
17282
17283   def_builtin (MASK_SSE | MASK_3DNOW_A, "__builtin_ia32_maskmovq", void_ftype_v8qi_v8qi_pchar, IX86_BUILTIN_MASKMOVQ);
17284
17285   def_builtin (MASK_SSE, "__builtin_ia32_loadups", v4sf_ftype_pcfloat, IX86_BUILTIN_LOADUPS);
17286   def_builtin (MASK_SSE, "__builtin_ia32_storeups", void_ftype_pfloat_v4sf, IX86_BUILTIN_STOREUPS);
17287
17288   def_builtin (MASK_SSE, "__builtin_ia32_loadhps", v4sf_ftype_v4sf_pv2si, IX86_BUILTIN_LOADHPS);
17289   def_builtin (MASK_SSE, "__builtin_ia32_loadlps", v4sf_ftype_v4sf_pv2si, IX86_BUILTIN_LOADLPS);
17290   def_builtin (MASK_SSE, "__builtin_ia32_storehps", void_ftype_pv2si_v4sf, IX86_BUILTIN_STOREHPS);
17291   def_builtin (MASK_SSE, "__builtin_ia32_storelps", void_ftype_pv2si_v4sf, IX86_BUILTIN_STORELPS);
17292
17293   def_builtin (MASK_SSE, "__builtin_ia32_movmskps", int_ftype_v4sf, IX86_BUILTIN_MOVMSKPS);
17294   def_builtin (MASK_SSE | MASK_3DNOW_A, "__builtin_ia32_pmovmskb", int_ftype_v8qi, IX86_BUILTIN_PMOVMSKB);
17295   def_builtin (MASK_SSE, "__builtin_ia32_movntps", void_ftype_pfloat_v4sf, IX86_BUILTIN_MOVNTPS);
17296   def_builtin (MASK_SSE | MASK_3DNOW_A, "__builtin_ia32_movntq", void_ftype_pdi_di, IX86_BUILTIN_MOVNTQ);
17297
17298   def_builtin (MASK_SSE | MASK_3DNOW_A, "__builtin_ia32_sfence", void_ftype_void, IX86_BUILTIN_SFENCE);
17299
17300   def_builtin (MASK_SSE | MASK_3DNOW_A, "__builtin_ia32_psadbw", di_ftype_v8qi_v8qi, IX86_BUILTIN_PSADBW);
17301
17302   def_builtin (MASK_SSE, "__builtin_ia32_rcpps", v4sf_ftype_v4sf, IX86_BUILTIN_RCPPS);
17303   def_builtin (MASK_SSE, "__builtin_ia32_rcpss", v4sf_ftype_v4sf, IX86_BUILTIN_RCPSS);
17304   def_builtin (MASK_SSE, "__builtin_ia32_rsqrtps", v4sf_ftype_v4sf, IX86_BUILTIN_RSQRTPS);
17305   def_builtin (MASK_SSE, "__builtin_ia32_rsqrtss", v4sf_ftype_v4sf, IX86_BUILTIN_RSQRTSS);
17306   def_builtin_const (MASK_SSE, "__builtin_ia32_sqrtps", v4sf_ftype_v4sf, IX86_BUILTIN_SQRTPS);
17307   def_builtin_const (MASK_SSE, "__builtin_ia32_sqrtss", v4sf_ftype_v4sf, IX86_BUILTIN_SQRTSS);
17308
17309   def_builtin (MASK_SSE, "__builtin_ia32_shufps", v4sf_ftype_v4sf_v4sf_int, IX86_BUILTIN_SHUFPS);
17310
17311   /* Original 3DNow!  */
17312   def_builtin (MASK_3DNOW, "__builtin_ia32_femms", void_ftype_void, IX86_BUILTIN_FEMMS);
17313   def_builtin (MASK_3DNOW, "__builtin_ia32_pavgusb", v8qi_ftype_v8qi_v8qi, IX86_BUILTIN_PAVGUSB);
17314   def_builtin (MASK_3DNOW, "__builtin_ia32_pf2id", v2si_ftype_v2sf, IX86_BUILTIN_PF2ID);
17315   def_builtin (MASK_3DNOW, "__builtin_ia32_pfacc", v2sf_ftype_v2sf_v2sf, IX86_BUILTIN_PFACC);
17316   def_builtin (MASK_3DNOW, "__builtin_ia32_pfadd", v2sf_ftype_v2sf_v2sf, IX86_BUILTIN_PFADD);
17317   def_builtin (MASK_3DNOW, "__builtin_ia32_pfcmpeq", v2si_ftype_v2sf_v2sf, IX86_BUILTIN_PFCMPEQ);
17318   def_builtin (MASK_3DNOW, "__builtin_ia32_pfcmpge", v2si_ftype_v2sf_v2sf, IX86_BUILTIN_PFCMPGE);
17319   def_builtin (MASK_3DNOW, "__builtin_ia32_pfcmpgt", v2si_ftype_v2sf_v2sf, IX86_BUILTIN_PFCMPGT);
17320   def_builtin (MASK_3DNOW, "__builtin_ia32_pfmax", v2sf_ftype_v2sf_v2sf, IX86_BUILTIN_PFMAX);
17321   def_builtin (MASK_3DNOW, "__builtin_ia32_pfmin", v2sf_ftype_v2sf_v2sf, IX86_BUILTIN_PFMIN);
17322   def_builtin (MASK_3DNOW, "__builtin_ia32_pfmul", v2sf_ftype_v2sf_v2sf, IX86_BUILTIN_PFMUL);
17323   def_builtin (MASK_3DNOW, "__builtin_ia32_pfrcp", v2sf_ftype_v2sf, IX86_BUILTIN_PFRCP);
17324   def_builtin (MASK_3DNOW, "__builtin_ia32_pfrcpit1", v2sf_ftype_v2sf_v2sf, IX86_BUILTIN_PFRCPIT1);
17325   def_builtin (MASK_3DNOW, "__builtin_ia32_pfrcpit2", v2sf_ftype_v2sf_v2sf, IX86_BUILTIN_PFRCPIT2);
17326   def_builtin (MASK_3DNOW, "__builtin_ia32_pfrsqrt", v2sf_ftype_v2sf, IX86_BUILTIN_PFRSQRT);
17327   def_builtin (MASK_3DNOW, "__builtin_ia32_pfrsqit1", v2sf_ftype_v2sf_v2sf, IX86_BUILTIN_PFRSQIT1);
17328   def_builtin (MASK_3DNOW, "__builtin_ia32_pfsub", v2sf_ftype_v2sf_v2sf, IX86_BUILTIN_PFSUB);
17329   def_builtin (MASK_3DNOW, "__builtin_ia32_pfsubr", v2sf_ftype_v2sf_v2sf, IX86_BUILTIN_PFSUBR);
17330   def_builtin (MASK_3DNOW, "__builtin_ia32_pi2fd", v2sf_ftype_v2si, IX86_BUILTIN_PI2FD);
17331   def_builtin (MASK_3DNOW, "__builtin_ia32_pmulhrw", v4hi_ftype_v4hi_v4hi, IX86_BUILTIN_PMULHRW);
17332
17333   /* 3DNow! extension as used in the Athlon CPU.  */
17334   def_builtin (MASK_3DNOW_A, "__builtin_ia32_pf2iw", v2si_ftype_v2sf, IX86_BUILTIN_PF2IW);
17335   def_builtin (MASK_3DNOW_A, "__builtin_ia32_pfnacc", v2sf_ftype_v2sf_v2sf, IX86_BUILTIN_PFNACC);
17336   def_builtin (MASK_3DNOW_A, "__builtin_ia32_pfpnacc", v2sf_ftype_v2sf_v2sf, IX86_BUILTIN_PFPNACC);
17337   def_builtin (MASK_3DNOW_A, "__builtin_ia32_pi2fw", v2sf_ftype_v2si, IX86_BUILTIN_PI2FW);
17338   def_builtin (MASK_3DNOW_A, "__builtin_ia32_pswapdsf", v2sf_ftype_v2sf, IX86_BUILTIN_PSWAPDSF);
17339   def_builtin (MASK_3DNOW_A, "__builtin_ia32_pswapdsi", v2si_ftype_v2si, IX86_BUILTIN_PSWAPDSI);
17340
17341   /* SSE2 */
17342   def_builtin (MASK_SSE2, "__builtin_ia32_maskmovdqu", void_ftype_v16qi_v16qi_pchar, IX86_BUILTIN_MASKMOVDQU);
17343
17344   def_builtin (MASK_SSE2, "__builtin_ia32_loadupd", v2df_ftype_pcdouble, IX86_BUILTIN_LOADUPD);
17345   def_builtin (MASK_SSE2, "__builtin_ia32_storeupd", void_ftype_pdouble_v2df, IX86_BUILTIN_STOREUPD);
17346
17347   def_builtin (MASK_SSE2, "__builtin_ia32_loadhpd", v2df_ftype_v2df_pcdouble, IX86_BUILTIN_LOADHPD);
17348   def_builtin (MASK_SSE2, "__builtin_ia32_loadlpd", v2df_ftype_v2df_pcdouble, IX86_BUILTIN_LOADLPD);
17349
17350   def_builtin (MASK_SSE2, "__builtin_ia32_movmskpd", int_ftype_v2df, IX86_BUILTIN_MOVMSKPD);
17351   def_builtin (MASK_SSE2, "__builtin_ia32_pmovmskb128", int_ftype_v16qi, IX86_BUILTIN_PMOVMSKB128);
17352   def_builtin (MASK_SSE2, "__builtin_ia32_movnti", void_ftype_pint_int, IX86_BUILTIN_MOVNTI);
17353   def_builtin (MASK_SSE2, "__builtin_ia32_movntpd", void_ftype_pdouble_v2df, IX86_BUILTIN_MOVNTPD);
17354   def_builtin (MASK_SSE2, "__builtin_ia32_movntdq", void_ftype_pv2di_v2di, IX86_BUILTIN_MOVNTDQ);
17355
17356   def_builtin (MASK_SSE2, "__builtin_ia32_pshufd", v4si_ftype_v4si_int, IX86_BUILTIN_PSHUFD);
17357   def_builtin (MASK_SSE2, "__builtin_ia32_pshuflw", v8hi_ftype_v8hi_int, IX86_BUILTIN_PSHUFLW);
17358   def_builtin (MASK_SSE2, "__builtin_ia32_pshufhw", v8hi_ftype_v8hi_int, IX86_BUILTIN_PSHUFHW);
17359   def_builtin (MASK_SSE2, "__builtin_ia32_psadbw128", v2di_ftype_v16qi_v16qi, IX86_BUILTIN_PSADBW128);
17360
17361   def_builtin_const (MASK_SSE2, "__builtin_ia32_sqrtpd", v2df_ftype_v2df, IX86_BUILTIN_SQRTPD);
17362   def_builtin_const (MASK_SSE2, "__builtin_ia32_sqrtsd", v2df_ftype_v2df, IX86_BUILTIN_SQRTSD);
17363
17364   def_builtin (MASK_SSE2, "__builtin_ia32_shufpd", v2df_ftype_v2df_v2df_int, IX86_BUILTIN_SHUFPD);
17365
17366   def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtdq2pd", v2df_ftype_v4si, IX86_BUILTIN_CVTDQ2PD);
17367   def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtdq2ps", v4sf_ftype_v4si, IX86_BUILTIN_CVTDQ2PS);
17368
17369   def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtpd2dq", v4si_ftype_v2df, IX86_BUILTIN_CVTPD2DQ);
17370   def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtpd2pi", v2si_ftype_v2df, IX86_BUILTIN_CVTPD2PI);
17371   def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtpd2ps", v4sf_ftype_v2df, IX86_BUILTIN_CVTPD2PS);
17372   def_builtin_const (MASK_SSE2, "__builtin_ia32_cvttpd2dq", v4si_ftype_v2df, IX86_BUILTIN_CVTTPD2DQ);
17373   def_builtin_const (MASK_SSE2, "__builtin_ia32_cvttpd2pi", v2si_ftype_v2df, IX86_BUILTIN_CVTTPD2PI);
17374
17375   def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtpi2pd", v2df_ftype_v2si, IX86_BUILTIN_CVTPI2PD);
17376
17377   def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtsd2si", int_ftype_v2df, IX86_BUILTIN_CVTSD2SI);
17378   def_builtin_const (MASK_SSE2, "__builtin_ia32_cvttsd2si", int_ftype_v2df, IX86_BUILTIN_CVTTSD2SI);
17379   def_builtin_const (MASK_SSE2 | MASK_64BIT, "__builtin_ia32_cvtsd2si64", int64_ftype_v2df, IX86_BUILTIN_CVTSD2SI64);
17380   def_builtin_const (MASK_SSE2 | MASK_64BIT, "__builtin_ia32_cvttsd2si64", int64_ftype_v2df, IX86_BUILTIN_CVTTSD2SI64);
17381
17382   def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtps2dq", v4si_ftype_v4sf, IX86_BUILTIN_CVTPS2DQ);
17383   def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtps2pd", v2df_ftype_v4sf, IX86_BUILTIN_CVTPS2PD);
17384   def_builtin_const (MASK_SSE2, "__builtin_ia32_cvttps2dq", v4si_ftype_v4sf, IX86_BUILTIN_CVTTPS2DQ);
17385
17386   def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtsi2sd", v2df_ftype_v2df_int, IX86_BUILTIN_CVTSI2SD);
17387   def_builtin_const (MASK_SSE2 | MASK_64BIT, "__builtin_ia32_cvtsi642sd", v2df_ftype_v2df_int64, IX86_BUILTIN_CVTSI642SD);
17388   def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtsd2ss", v4sf_ftype_v4sf_v2df, IX86_BUILTIN_CVTSD2SS);
17389   def_builtin_const (MASK_SSE2, "__builtin_ia32_cvtss2sd", v2df_ftype_v2df_v4sf, IX86_BUILTIN_CVTSS2SD);
17390
17391   def_builtin (MASK_SSE2, "__builtin_ia32_clflush", void_ftype_pcvoid, IX86_BUILTIN_CLFLUSH);
17392   def_builtin (MASK_SSE2, "__builtin_ia32_lfence", void_ftype_void, IX86_BUILTIN_LFENCE);
17393   def_builtin (MASK_SSE2, "__builtin_ia32_mfence", void_ftype_void, IX86_BUILTIN_MFENCE);
17394
17395   def_builtin (MASK_SSE2, "__builtin_ia32_loaddqu", v16qi_ftype_pcchar, IX86_BUILTIN_LOADDQU);
17396   def_builtin (MASK_SSE2, "__builtin_ia32_storedqu", void_ftype_pchar_v16qi, IX86_BUILTIN_STOREDQU);
17397
17398   def_builtin (MASK_SSE2, "__builtin_ia32_pmuludq", di_ftype_v2si_v2si, IX86_BUILTIN_PMULUDQ);
17399   def_builtin (MASK_SSE2, "__builtin_ia32_pmuludq128", v2di_ftype_v4si_v4si, IX86_BUILTIN_PMULUDQ128);
17400
17401   def_builtin (MASK_SSE2, "__builtin_ia32_pslldqi128", v2di_ftype_v2di_int, IX86_BUILTIN_PSLLDQI128);
17402   def_builtin (MASK_SSE2, "__builtin_ia32_psllwi128", v8hi_ftype_v8hi_int, IX86_BUILTIN_PSLLWI128);
17403   def_builtin (MASK_SSE2, "__builtin_ia32_pslldi128", v4si_ftype_v4si_int, IX86_BUILTIN_PSLLDI128);
17404   def_builtin (MASK_SSE2, "__builtin_ia32_psllqi128", v2di_ftype_v2di_int, IX86_BUILTIN_PSLLQI128);
17405   def_builtin (MASK_SSE2, "__builtin_ia32_psllw128", v8hi_ftype_v8hi_v8hi, IX86_BUILTIN_PSLLW128);
17406   def_builtin (MASK_SSE2, "__builtin_ia32_pslld128", v4si_ftype_v4si_v4si, IX86_BUILTIN_PSLLD128);
17407   def_builtin (MASK_SSE2, "__builtin_ia32_psllq128", v2di_ftype_v2di_v2di, IX86_BUILTIN_PSLLQ128);
17408
17409   def_builtin (MASK_SSE2, "__builtin_ia32_psrldqi128", v2di_ftype_v2di_int, IX86_BUILTIN_PSRLDQI128);
17410   def_builtin (MASK_SSE2, "__builtin_ia32_psrlwi128", v8hi_ftype_v8hi_int, IX86_BUILTIN_PSRLWI128);
17411   def_builtin (MASK_SSE2, "__builtin_ia32_psrldi128", v4si_ftype_v4si_int, IX86_BUILTIN_PSRLDI128);
17412   def_builtin (MASK_SSE2, "__builtin_ia32_psrlqi128", v2di_ftype_v2di_int, IX86_BUILTIN_PSRLQI128);
17413   def_builtin (MASK_SSE2, "__builtin_ia32_psrlw128", v8hi_ftype_v8hi_v8hi, IX86_BUILTIN_PSRLW128);
17414   def_builtin (MASK_SSE2, "__builtin_ia32_psrld128", v4si_ftype_v4si_v4si, IX86_BUILTIN_PSRLD128);
17415   def_builtin (MASK_SSE2, "__builtin_ia32_psrlq128", v2di_ftype_v2di_v2di, IX86_BUILTIN_PSRLQ128);
17416
17417   def_builtin (MASK_SSE2, "__builtin_ia32_psrawi128", v8hi_ftype_v8hi_int, IX86_BUILTIN_PSRAWI128);
17418   def_builtin (MASK_SSE2, "__builtin_ia32_psradi128", v4si_ftype_v4si_int, IX86_BUILTIN_PSRADI128);
17419   def_builtin (MASK_SSE2, "__builtin_ia32_psraw128", v8hi_ftype_v8hi_v8hi, IX86_BUILTIN_PSRAW128);
17420   def_builtin (MASK_SSE2, "__builtin_ia32_psrad128", v4si_ftype_v4si_v4si, IX86_BUILTIN_PSRAD128);
17421
17422   def_builtin (MASK_SSE2, "__builtin_ia32_pmaddwd128", v4si_ftype_v8hi_v8hi, IX86_BUILTIN_PMADDWD128);
17423
17424   /* Prescott New Instructions.  */
17425   def_builtin (MASK_SSE3, "__builtin_ia32_monitor",
17426                void_ftype_pcvoid_unsigned_unsigned,
17427                IX86_BUILTIN_MONITOR);
17428   def_builtin (MASK_SSE3, "__builtin_ia32_mwait",
17429                void_ftype_unsigned_unsigned,
17430                IX86_BUILTIN_MWAIT);
17431   def_builtin (MASK_SSE3, "__builtin_ia32_lddqu",
17432                v16qi_ftype_pcchar, IX86_BUILTIN_LDDQU);
17433
17434   /* SSSE3.  */
17435   def_builtin (MASK_SSSE3, "__builtin_ia32_palignr128",
17436                v2di_ftype_v2di_v2di_int, IX86_BUILTIN_PALIGNR128);
17437   def_builtin (MASK_SSSE3, "__builtin_ia32_palignr", di_ftype_di_di_int,
17438                IX86_BUILTIN_PALIGNR);
17439
17440   /* AMDFAM10 SSE4A New built-ins  */
17441   def_builtin (MASK_SSE4A, "__builtin_ia32_movntsd",
17442                void_ftype_pdouble_v2df, IX86_BUILTIN_MOVNTSD);
17443   def_builtin (MASK_SSE4A, "__builtin_ia32_movntss",
17444                void_ftype_pfloat_v4sf, IX86_BUILTIN_MOVNTSS);
17445   def_builtin (MASK_SSE4A, "__builtin_ia32_extrqi",
17446                v2di_ftype_v2di_unsigned_unsigned, IX86_BUILTIN_EXTRQI);
17447   def_builtin (MASK_SSE4A, "__builtin_ia32_extrq",
17448                v2di_ftype_v2di_v16qi,  IX86_BUILTIN_EXTRQ);
17449   def_builtin (MASK_SSE4A, "__builtin_ia32_insertqi",
17450                v2di_ftype_v2di_v2di_unsigned_unsigned, IX86_BUILTIN_INSERTQI);
17451   def_builtin (MASK_SSE4A, "__builtin_ia32_insertq",
17452                v2di_ftype_v2di_v2di, IX86_BUILTIN_INSERTQ);
17453
17454   /* Access to the vec_init patterns.  */
17455   ftype = build_function_type_list (V2SI_type_node, integer_type_node,
17456                                     integer_type_node, NULL_TREE);
17457   def_builtin (MASK_MMX, "__builtin_ia32_vec_init_v2si",
17458                ftype, IX86_BUILTIN_VEC_INIT_V2SI);
17459
17460   ftype = build_function_type_list (V4HI_type_node, short_integer_type_node,
17461                                     short_integer_type_node,
17462                                     short_integer_type_node,
17463                                     short_integer_type_node, NULL_TREE);
17464   def_builtin (MASK_MMX, "__builtin_ia32_vec_init_v4hi",
17465                ftype, IX86_BUILTIN_VEC_INIT_V4HI);
17466
17467   ftype = build_function_type_list (V8QI_type_node, char_type_node,
17468                                     char_type_node, char_type_node,
17469                                     char_type_node, char_type_node,
17470                                     char_type_node, char_type_node,
17471                                     char_type_node, NULL_TREE);
17472   def_builtin (MASK_MMX, "__builtin_ia32_vec_init_v8qi",
17473                ftype, IX86_BUILTIN_VEC_INIT_V8QI);
17474
17475   /* Access to the vec_extract patterns.  */
17476   ftype = build_function_type_list (double_type_node, V2DF_type_node,
17477                                     integer_type_node, NULL_TREE);
17478   def_builtin (MASK_SSE, "__builtin_ia32_vec_ext_v2df",
17479                ftype, IX86_BUILTIN_VEC_EXT_V2DF);
17480
17481   ftype = build_function_type_list (long_long_integer_type_node,
17482                                     V2DI_type_node, integer_type_node,
17483                                     NULL_TREE);
17484   def_builtin (MASK_SSE, "__builtin_ia32_vec_ext_v2di",
17485                ftype, IX86_BUILTIN_VEC_EXT_V2DI);
17486
17487   ftype = build_function_type_list (float_type_node, V4SF_type_node,
17488                                     integer_type_node, NULL_TREE);
17489   def_builtin (MASK_SSE, "__builtin_ia32_vec_ext_v4sf",
17490                ftype, IX86_BUILTIN_VEC_EXT_V4SF);
17491
17492   ftype = build_function_type_list (intSI_type_node, V4SI_type_node,
17493                                     integer_type_node, NULL_TREE);
17494   def_builtin (MASK_SSE, "__builtin_ia32_vec_ext_v4si",
17495                ftype, IX86_BUILTIN_VEC_EXT_V4SI);
17496
17497   ftype = build_function_type_list (intHI_type_node, V8HI_type_node,
17498                                     integer_type_node, NULL_TREE);
17499   def_builtin (MASK_SSE, "__builtin_ia32_vec_ext_v8hi",
17500                ftype, IX86_BUILTIN_VEC_EXT_V8HI);
17501
17502   ftype = build_function_type_list (intHI_type_node, V4HI_type_node,
17503                                     integer_type_node, NULL_TREE);
17504   def_builtin (MASK_SSE | MASK_3DNOW_A, "__builtin_ia32_vec_ext_v4hi",
17505                ftype, IX86_BUILTIN_VEC_EXT_V4HI);
17506
17507   ftype = build_function_type_list (intSI_type_node, V2SI_type_node,
17508                                     integer_type_node, NULL_TREE);
17509   def_builtin (MASK_MMX, "__builtin_ia32_vec_ext_v2si",
17510                ftype, IX86_BUILTIN_VEC_EXT_V2SI);
17511
17512   /* Access to the vec_set patterns.  */
17513   ftype = build_function_type_list (V8HI_type_node, V8HI_type_node,
17514                                     intHI_type_node,
17515                                     integer_type_node, NULL_TREE);
17516   def_builtin (MASK_SSE, "__builtin_ia32_vec_set_v8hi",
17517                ftype, IX86_BUILTIN_VEC_SET_V8HI);
17518
17519   ftype = build_function_type_list (V4HI_type_node, V4HI_type_node,
17520                                     intHI_type_node,
17521                                     integer_type_node, NULL_TREE);
17522   def_builtin (MASK_SSE | MASK_3DNOW_A, "__builtin_ia32_vec_set_v4hi",
17523                ftype, IX86_BUILTIN_VEC_SET_V4HI);
17524 }
17525
17526 static void
17527 ix86_init_builtins (void)
17528 {
17529   if (TARGET_MMX)
17530     ix86_init_mmx_sse_builtins ();
17531 }
17532
17533 /* Errors in the source file can cause expand_expr to return const0_rtx
17534    where we expect a vector.  To avoid crashing, use one of the vector
17535    clear instructions.  */
17536 static rtx
17537 safe_vector_operand (rtx x, enum machine_mode mode)
17538 {
17539   if (x == const0_rtx)
17540     x = CONST0_RTX (mode);
17541   return x;
17542 }
17543
17544 /* Subroutine of ix86_expand_builtin to take care of binop insns.  */
17545
17546 static rtx
17547 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
17548 {
17549   rtx pat, xops[3];
17550   tree arg0 = CALL_EXPR_ARG (exp, 0);
17551   tree arg1 = CALL_EXPR_ARG (exp, 1);
17552   rtx op0 = expand_normal (arg0);
17553   rtx op1 = expand_normal (arg1);
17554   enum machine_mode tmode = insn_data[icode].operand[0].mode;
17555   enum machine_mode mode0 = insn_data[icode].operand[1].mode;
17556   enum machine_mode mode1 = insn_data[icode].operand[2].mode;
17557
17558   if (VECTOR_MODE_P (mode0))
17559     op0 = safe_vector_operand (op0, mode0);
17560   if (VECTOR_MODE_P (mode1))
17561     op1 = safe_vector_operand (op1, mode1);
17562
17563   if (optimize || !target
17564       || GET_MODE (target) != tmode
17565       || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
17566     target = gen_reg_rtx (tmode);
17567
17568   if (GET_MODE (op1) == SImode && mode1 == TImode)
17569     {
17570       rtx x = gen_reg_rtx (V4SImode);
17571       emit_insn (gen_sse2_loadd (x, op1));
17572       op1 = gen_lowpart (TImode, x);
17573     }
17574
17575   /* The insn must want input operands in the same modes as the
17576      result.  */
17577   gcc_assert ((GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
17578               && (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode));
17579
17580   if (!(*insn_data[icode].operand[1].predicate) (op0, mode0))
17581     op0 = copy_to_mode_reg (mode0, op0);
17582   if (!(*insn_data[icode].operand[2].predicate) (op1, mode1))
17583     op1 = copy_to_mode_reg (mode1, op1);
17584
17585   /* ??? Using ix86_fixup_binary_operands is problematic when
17586      we've got mismatched modes.  Fake it.  */
17587
17588   xops[0] = target;
17589   xops[1] = op0;
17590   xops[2] = op1;
17591
17592   if (tmode == mode0 && tmode == mode1)
17593     {
17594       target = ix86_fixup_binary_operands (UNKNOWN, tmode, xops);
17595       op0 = xops[1];
17596       op1 = xops[2];
17597     }
17598   else if (optimize || !ix86_binary_operator_ok (UNKNOWN, tmode, xops))
17599     {
17600       op0 = force_reg (mode0, op0);
17601       op1 = force_reg (mode1, op1);
17602       target = gen_reg_rtx (tmode);
17603     }
17604
17605   pat = GEN_FCN (icode) (target, op0, op1);
17606   if (! pat)
17607     return 0;
17608   emit_insn (pat);
17609   return target;
17610 }
17611
17612 /* Subroutine of ix86_expand_builtin to take care of stores.  */
17613
17614 static rtx
17615 ix86_expand_store_builtin (enum insn_code icode, tree exp)
17616 {
17617   rtx pat;
17618   tree arg0 = CALL_EXPR_ARG (exp, 0);
17619   tree arg1 = CALL_EXPR_ARG (exp, 1);
17620   rtx op0 = expand_normal (arg0);
17621   rtx op1 = expand_normal (arg1);
17622   enum machine_mode mode0 = insn_data[icode].operand[0].mode;
17623   enum machine_mode mode1 = insn_data[icode].operand[1].mode;
17624
17625   if (VECTOR_MODE_P (mode1))
17626     op1 = safe_vector_operand (op1, mode1);
17627
17628   op0 = gen_rtx_MEM (mode0, copy_to_mode_reg (Pmode, op0));
17629   op1 = copy_to_mode_reg (mode1, op1);
17630
17631   pat = GEN_FCN (icode) (op0, op1);
17632   if (pat)
17633     emit_insn (pat);
17634   return 0;
17635 }
17636
17637 /* Subroutine of ix86_expand_builtin to take care of unop insns.  */
17638
17639 static rtx
17640 ix86_expand_unop_builtin (enum insn_code icode, tree exp,
17641                           rtx target, int do_load)
17642 {
17643   rtx pat;
17644   tree arg0 = CALL_EXPR_ARG (exp, 0);
17645   rtx op0 = expand_normal (arg0);
17646   enum machine_mode tmode = insn_data[icode].operand[0].mode;
17647   enum machine_mode mode0 = insn_data[icode].operand[1].mode;
17648
17649   if (optimize || !target
17650       || GET_MODE (target) != tmode
17651       || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
17652     target = gen_reg_rtx (tmode);
17653   if (do_load)
17654     op0 = gen_rtx_MEM (mode0, copy_to_mode_reg (Pmode, op0));
17655   else
17656     {
17657       if (VECTOR_MODE_P (mode0))
17658         op0 = safe_vector_operand (op0, mode0);
17659
17660       if ((optimize && !register_operand (op0, mode0))
17661           || ! (*insn_data[icode].operand[1].predicate) (op0, mode0))
17662         op0 = copy_to_mode_reg (mode0, op0);
17663     }
17664
17665   pat = GEN_FCN (icode) (target, op0);
17666   if (! pat)
17667     return 0;
17668   emit_insn (pat);
17669   return target;
17670 }
17671
17672 /* Subroutine of ix86_expand_builtin to take care of three special unop insns:
17673    sqrtss, rsqrtss, rcpss.  */
17674
17675 static rtx
17676 ix86_expand_unop1_builtin (enum insn_code icode, tree exp, rtx target)
17677 {
17678   rtx pat;
17679   tree arg0 = CALL_EXPR_ARG (exp, 0);
17680   rtx op1, op0 = expand_normal (arg0);
17681   enum machine_mode tmode = insn_data[icode].operand[0].mode;
17682   enum machine_mode mode0 = insn_data[icode].operand[1].mode;
17683
17684   if (optimize || !target
17685       || GET_MODE (target) != tmode
17686       || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
17687     target = gen_reg_rtx (tmode);
17688
17689   if (VECTOR_MODE_P (mode0))
17690     op0 = safe_vector_operand (op0, mode0);
17691
17692   if ((optimize && !register_operand (op0, mode0))
17693       || ! (*insn_data[icode].operand[1].predicate) (op0, mode0))
17694     op0 = copy_to_mode_reg (mode0, op0);
17695
17696   op1 = op0;
17697   if (! (*insn_data[icode].operand[2].predicate) (op1, mode0))
17698     op1 = copy_to_mode_reg (mode0, op1);
17699
17700   pat = GEN_FCN (icode) (target, op0, op1);
17701   if (! pat)
17702     return 0;
17703   emit_insn (pat);
17704   return target;
17705 }
17706
17707 /* Subroutine of ix86_expand_builtin to take care of comparison insns.  */
17708
17709 static rtx
17710 ix86_expand_sse_compare (const struct builtin_description *d, tree exp,
17711                          rtx target)
17712 {
17713   rtx pat;
17714   tree arg0 = CALL_EXPR_ARG (exp, 0);
17715   tree arg1 = CALL_EXPR_ARG (exp, 1);
17716   rtx op0 = expand_normal (arg0);
17717   rtx op1 = expand_normal (arg1);
17718   rtx op2;
17719   enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
17720   enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
17721   enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
17722   enum rtx_code comparison = d->comparison;
17723
17724   if (VECTOR_MODE_P (mode0))
17725     op0 = safe_vector_operand (op0, mode0);
17726   if (VECTOR_MODE_P (mode1))
17727     op1 = safe_vector_operand (op1, mode1);
17728
17729   /* Swap operands if we have a comparison that isn't available in
17730      hardware.  */
17731   if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
17732     {
17733       rtx tmp = gen_reg_rtx (mode1);
17734       emit_move_insn (tmp, op1);
17735       op1 = op0;
17736       op0 = tmp;
17737     }
17738
17739   if (optimize || !target
17740       || GET_MODE (target) != tmode
17741       || ! (*insn_data[d->icode].operand[0].predicate) (target, tmode))
17742     target = gen_reg_rtx (tmode);
17743
17744   if ((optimize && !register_operand (op0, mode0))
17745       || ! (*insn_data[d->icode].operand[1].predicate) (op0, mode0))
17746     op0 = copy_to_mode_reg (mode0, op0);
17747   if ((optimize && !register_operand (op1, mode1))
17748       || ! (*insn_data[d->icode].operand[2].predicate) (op1, mode1))
17749     op1 = copy_to_mode_reg (mode1, op1);
17750
17751   op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
17752   pat = GEN_FCN (d->icode) (target, op0, op1, op2);
17753   if (! pat)
17754     return 0;
17755   emit_insn (pat);
17756   return target;
17757 }
17758
17759 /* Subroutine of ix86_expand_builtin to take care of comi insns.  */
17760
17761 static rtx
17762 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
17763                       rtx target)
17764 {
17765   rtx pat;
17766   tree arg0 = CALL_EXPR_ARG (exp, 0);
17767   tree arg1 = CALL_EXPR_ARG (exp, 1);
17768   rtx op0 = expand_normal (arg0);
17769   rtx op1 = expand_normal (arg1);
17770   rtx op2;
17771   enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
17772   enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
17773   enum rtx_code comparison = d->comparison;
17774
17775   if (VECTOR_MODE_P (mode0))
17776     op0 = safe_vector_operand (op0, mode0);
17777   if (VECTOR_MODE_P (mode1))
17778     op1 = safe_vector_operand (op1, mode1);
17779
17780   /* Swap operands if we have a comparison that isn't available in
17781      hardware.  */
17782   if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
17783     {
17784       rtx tmp = op1;
17785       op1 = op0;
17786       op0 = tmp;
17787     }
17788
17789   target = gen_reg_rtx (SImode);
17790   emit_move_insn (target, const0_rtx);
17791   target = gen_rtx_SUBREG (QImode, target, 0);
17792
17793   if ((optimize && !register_operand (op0, mode0))
17794       || !(*insn_data[d->icode].operand[0].predicate) (op0, mode0))
17795     op0 = copy_to_mode_reg (mode0, op0);
17796   if ((optimize && !register_operand (op1, mode1))
17797       || !(*insn_data[d->icode].operand[1].predicate) (op1, mode1))
17798     op1 = copy_to_mode_reg (mode1, op1);
17799
17800   op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
17801   pat = GEN_FCN (d->icode) (op0, op1);
17802   if (! pat)
17803     return 0;
17804   emit_insn (pat);
17805   emit_insn (gen_rtx_SET (VOIDmode,
17806                           gen_rtx_STRICT_LOW_PART (VOIDmode, target),
17807                           gen_rtx_fmt_ee (comparison, QImode,
17808                                           SET_DEST (pat),
17809                                           const0_rtx)));
17810
17811   return SUBREG_REG (target);
17812 }
17813
17814 /* Return the integer constant in ARG.  Constrain it to be in the range
17815    of the subparts of VEC_TYPE; issue an error if not.  */
17816
17817 static int
17818 get_element_number (tree vec_type, tree arg)
17819 {
17820   unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
17821
17822   if (!host_integerp (arg, 1)
17823       || (elt = tree_low_cst (arg, 1), elt > max))
17824     {
17825       error ("selector must be an integer constant in the range 0..%wi", max);
17826       return 0;
17827     }
17828
17829   return elt;
17830 }
17831
17832 /* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
17833    ix86_expand_vector_init.  We DO have language-level syntax for this, in
17834    the form of  (type){ init-list }.  Except that since we can't place emms
17835    instructions from inside the compiler, we can't allow the use of MMX
17836    registers unless the user explicitly asks for it.  So we do *not* define
17837    vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md.  Instead
17838    we have builtins invoked by mmintrin.h that gives us license to emit
17839    these sorts of instructions.  */
17840
17841 static rtx
17842 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
17843 {
17844   enum machine_mode tmode = TYPE_MODE (type);
17845   enum machine_mode inner_mode = GET_MODE_INNER (tmode);
17846   int i, n_elt = GET_MODE_NUNITS (tmode);
17847   rtvec v = rtvec_alloc (n_elt);
17848
17849   gcc_assert (VECTOR_MODE_P (tmode));
17850   gcc_assert (call_expr_nargs (exp) == n_elt);
17851
17852   for (i = 0; i < n_elt; ++i)
17853     {
17854       rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
17855       RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
17856     }
17857
17858   if (!target || !register_operand (target, tmode))
17859     target = gen_reg_rtx (tmode);
17860
17861   ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
17862   return target;
17863 }
17864
17865 /* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
17866    ix86_expand_vector_extract.  They would be redundant (for non-MMX) if we
17867    had a language-level syntax for referencing vector elements.  */
17868
17869 static rtx
17870 ix86_expand_vec_ext_builtin (tree exp, rtx target)
17871 {
17872   enum machine_mode tmode, mode0;
17873   tree arg0, arg1;
17874   int elt;
17875   rtx op0;
17876
17877   arg0 = CALL_EXPR_ARG (exp, 0);
17878   arg1 = CALL_EXPR_ARG (exp, 1);
17879
17880   op0 = expand_normal (arg0);
17881   elt = get_element_number (TREE_TYPE (arg0), arg1);
17882
17883   tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
17884   mode0 = TYPE_MODE (TREE_TYPE (arg0));
17885   gcc_assert (VECTOR_MODE_P (mode0));
17886
17887   op0 = force_reg (mode0, op0);
17888
17889   if (optimize || !target || !register_operand (target, tmode))
17890     target = gen_reg_rtx (tmode);
17891
17892   ix86_expand_vector_extract (true, target, op0, elt);
17893
17894   return target;
17895 }
17896
17897 /* A subroutine of ix86_expand_builtin.  These builtins are a wrapper around
17898    ix86_expand_vector_set.  They would be redundant (for non-MMX) if we had
17899    a language-level syntax for referencing vector elements.  */
17900
17901 static rtx
17902 ix86_expand_vec_set_builtin (tree exp)
17903 {
17904   enum machine_mode tmode, mode1;
17905   tree arg0, arg1, arg2;
17906   int elt;
17907   rtx op0, op1, target;
17908
17909   arg0 = CALL_EXPR_ARG (exp, 0);
17910   arg1 = CALL_EXPR_ARG (exp, 1);
17911   arg2 = CALL_EXPR_ARG (exp, 2);
17912
17913   tmode = TYPE_MODE (TREE_TYPE (arg0));
17914   mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
17915   gcc_assert (VECTOR_MODE_P (tmode));
17916
17917   op0 = expand_expr (arg0, NULL_RTX, tmode, 0);
17918   op1 = expand_expr (arg1, NULL_RTX, mode1, 0);
17919   elt = get_element_number (TREE_TYPE (arg0), arg2);
17920
17921   if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
17922     op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
17923
17924   op0 = force_reg (tmode, op0);
17925   op1 = force_reg (mode1, op1);
17926
17927   /* OP0 is the source of these builtin functions and shouldn't be
17928      modified.  Create a copy, use it and return it as target.  */
17929   target = gen_reg_rtx (tmode);
17930   emit_move_insn (target, op0);
17931   ix86_expand_vector_set (true, target, op1, elt);
17932
17933   return target;
17934 }
17935
17936 /* Expand an expression EXP that calls a built-in function,
17937    with result going to TARGET if that's convenient
17938    (and in mode MODE if that's convenient).
17939    SUBTARGET may be used as the target for computing one of EXP's operands.
17940    IGNORE is nonzero if the value is to be ignored.  */
17941
17942 static rtx
17943 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
17944                      enum machine_mode mode ATTRIBUTE_UNUSED,
17945                      int ignore ATTRIBUTE_UNUSED)
17946 {
17947   const struct builtin_description *d;
17948   size_t i;
17949   enum insn_code icode;
17950   tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
17951   tree arg0, arg1, arg2, arg3;
17952   rtx op0, op1, op2, op3, pat;
17953   enum machine_mode tmode, mode0, mode1, mode2, mode3, mode4;
17954   unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
17955
17956   switch (fcode)
17957     {
17958     case IX86_BUILTIN_EMMS:
17959       emit_insn (gen_mmx_emms ());
17960       return 0;
17961
17962     case IX86_BUILTIN_SFENCE:
17963       emit_insn (gen_sse_sfence ());
17964       return 0;
17965
17966     case IX86_BUILTIN_MASKMOVQ:
17967     case IX86_BUILTIN_MASKMOVDQU:
17968       icode = (fcode == IX86_BUILTIN_MASKMOVQ
17969                ? CODE_FOR_mmx_maskmovq
17970                : CODE_FOR_sse2_maskmovdqu);
17971       /* Note the arg order is different from the operand order.  */
17972       arg1 = CALL_EXPR_ARG (exp, 0);
17973       arg2 = CALL_EXPR_ARG (exp, 1);
17974       arg0 = CALL_EXPR_ARG (exp, 2);
17975       op0 = expand_normal (arg0);
17976       op1 = expand_normal (arg1);
17977       op2 = expand_normal (arg2);
17978       mode0 = insn_data[icode].operand[0].mode;
17979       mode1 = insn_data[icode].operand[1].mode;
17980       mode2 = insn_data[icode].operand[2].mode;
17981
17982       op0 = force_reg (Pmode, op0);
17983       op0 = gen_rtx_MEM (mode1, op0);
17984
17985       if (! (*insn_data[icode].operand[0].predicate) (op0, mode0))
17986         op0 = copy_to_mode_reg (mode0, op0);
17987       if (! (*insn_data[icode].operand[1].predicate) (op1, mode1))
17988         op1 = copy_to_mode_reg (mode1, op1);
17989       if (! (*insn_data[icode].operand[2].predicate) (op2, mode2))
17990         op2 = copy_to_mode_reg (mode2, op2);
17991       pat = GEN_FCN (icode) (op0, op1, op2);
17992       if (! pat)
17993         return 0;
17994       emit_insn (pat);
17995       return 0;
17996
17997     case IX86_BUILTIN_SQRTSS:
17998       return ix86_expand_unop1_builtin (CODE_FOR_sse_vmsqrtv4sf2, exp, target);
17999     case IX86_BUILTIN_RSQRTSS:
18000       return ix86_expand_unop1_builtin (CODE_FOR_sse_vmrsqrtv4sf2, exp, target);
18001     case IX86_BUILTIN_RCPSS:
18002       return ix86_expand_unop1_builtin (CODE_FOR_sse_vmrcpv4sf2, exp, target);
18003
18004     case IX86_BUILTIN_LOADUPS:
18005       return ix86_expand_unop_builtin (CODE_FOR_sse_movups, exp, target, 1);
18006
18007     case IX86_BUILTIN_STOREUPS:
18008       return ix86_expand_store_builtin (CODE_FOR_sse_movups, exp);
18009
18010     case IX86_BUILTIN_LOADHPS:
18011     case IX86_BUILTIN_LOADLPS:
18012     case IX86_BUILTIN_LOADHPD:
18013     case IX86_BUILTIN_LOADLPD:
18014       icode = (fcode == IX86_BUILTIN_LOADHPS ? CODE_FOR_sse_loadhps
18015                : fcode == IX86_BUILTIN_LOADLPS ? CODE_FOR_sse_loadlps
18016                : fcode == IX86_BUILTIN_LOADHPD ? CODE_FOR_sse2_loadhpd
18017                : CODE_FOR_sse2_loadlpd);
18018       arg0 = CALL_EXPR_ARG (exp, 0);
18019       arg1 = CALL_EXPR_ARG (exp, 1);
18020       op0 = expand_normal (arg0);
18021       op1 = expand_normal (arg1);
18022       tmode = insn_data[icode].operand[0].mode;
18023       mode0 = insn_data[icode].operand[1].mode;
18024       mode1 = insn_data[icode].operand[2].mode;
18025
18026       op0 = force_reg (mode0, op0);
18027       op1 = gen_rtx_MEM (mode1, copy_to_mode_reg (Pmode, op1));
18028       if (optimize || target == 0
18029           || GET_MODE (target) != tmode
18030           || !register_operand (target, tmode))
18031         target = gen_reg_rtx (tmode);
18032       pat = GEN_FCN (icode) (target, op0, op1);
18033       if (! pat)
18034         return 0;
18035       emit_insn (pat);
18036       return target;
18037
18038     case IX86_BUILTIN_STOREHPS:
18039     case IX86_BUILTIN_STORELPS:
18040       icode = (fcode == IX86_BUILTIN_STOREHPS ? CODE_FOR_sse_storehps
18041                : CODE_FOR_sse_storelps);
18042       arg0 = CALL_EXPR_ARG (exp, 0);
18043       arg1 = CALL_EXPR_ARG (exp, 1);
18044       op0 = expand_normal (arg0);
18045       op1 = expand_normal (arg1);
18046       mode0 = insn_data[icode].operand[0].mode;
18047       mode1 = insn_data[icode].operand[1].mode;
18048
18049       op0 = gen_rtx_MEM (mode0, copy_to_mode_reg (Pmode, op0));
18050       op1 = force_reg (mode1, op1);
18051
18052       pat = GEN_FCN (icode) (op0, op1);
18053       if (! pat)
18054         return 0;
18055       emit_insn (pat);
18056       return const0_rtx;
18057
18058     case IX86_BUILTIN_MOVNTPS:
18059       return ix86_expand_store_builtin (CODE_FOR_sse_movntv4sf, exp);
18060     case IX86_BUILTIN_MOVNTQ:
18061       return ix86_expand_store_builtin (CODE_FOR_sse_movntdi, exp);
18062
18063     case IX86_BUILTIN_LDMXCSR:
18064       op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
18065       target = assign_386_stack_local (SImode, SLOT_TEMP);
18066       emit_move_insn (target, op0);
18067       emit_insn (gen_sse_ldmxcsr (target));
18068       return 0;
18069
18070     case IX86_BUILTIN_STMXCSR:
18071       target = assign_386_stack_local (SImode, SLOT_TEMP);
18072       emit_insn (gen_sse_stmxcsr (target));
18073       return copy_to_mode_reg (SImode, target);
18074
18075     case IX86_BUILTIN_SHUFPS:
18076     case IX86_BUILTIN_SHUFPD:
18077       icode = (fcode == IX86_BUILTIN_SHUFPS
18078                ? CODE_FOR_sse_shufps
18079                : CODE_FOR_sse2_shufpd);
18080       arg0 = CALL_EXPR_ARG (exp, 0);
18081       arg1 = CALL_EXPR_ARG (exp, 1);
18082       arg2 = CALL_EXPR_ARG (exp, 2);
18083       op0 = expand_normal (arg0);
18084       op1 = expand_normal (arg1);
18085       op2 = expand_normal (arg2);
18086       tmode = insn_data[icode].operand[0].mode;
18087       mode0 = insn_data[icode].operand[1].mode;
18088       mode1 = insn_data[icode].operand[2].mode;
18089       mode2 = insn_data[icode].operand[3].mode;
18090
18091       if (! (*insn_data[icode].operand[1].predicate) (op0, mode0))
18092         op0 = copy_to_mode_reg (mode0, op0);
18093       if ((optimize && !register_operand (op1, mode1))
18094           || !(*insn_data[icode].operand[2].predicate) (op1, mode1))
18095         op1 = copy_to_mode_reg (mode1, op1);
18096       if (! (*insn_data[icode].operand[3].predicate) (op2, mode2))
18097         {
18098           /* @@@ better error message */
18099           error ("mask must be an immediate");
18100           return gen_reg_rtx (tmode);
18101         }
18102       if (optimize || target == 0
18103           || GET_MODE (target) != tmode
18104           || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
18105         target = gen_reg_rtx (tmode);
18106       pat = GEN_FCN (icode) (target, op0, op1, op2);
18107       if (! pat)
18108         return 0;
18109       emit_insn (pat);
18110       return target;
18111
18112     case IX86_BUILTIN_PSHUFW:
18113     case IX86_BUILTIN_PSHUFD:
18114     case IX86_BUILTIN_PSHUFHW:
18115     case IX86_BUILTIN_PSHUFLW:
18116       icode = (  fcode == IX86_BUILTIN_PSHUFHW ? CODE_FOR_sse2_pshufhw
18117                : fcode == IX86_BUILTIN_PSHUFLW ? CODE_FOR_sse2_pshuflw
18118                : fcode == IX86_BUILTIN_PSHUFD ? CODE_FOR_sse2_pshufd
18119                : CODE_FOR_mmx_pshufw);
18120       arg0 = CALL_EXPR_ARG (exp, 0);
18121       arg1 = CALL_EXPR_ARG (exp, 1);
18122       op0 = expand_normal (arg0);
18123       op1 = expand_normal (arg1);
18124       tmode = insn_data[icode].operand[0].mode;
18125       mode1 = insn_data[icode].operand[1].mode;
18126       mode2 = insn_data[icode].operand[2].mode;
18127
18128       if (! (*insn_data[icode].operand[1].predicate) (op0, mode1))
18129         op0 = copy_to_mode_reg (mode1, op0);
18130       if (! (*insn_data[icode].operand[2].predicate) (op1, mode2))
18131         {
18132           /* @@@ better error message */
18133           error ("mask must be an immediate");
18134           return const0_rtx;
18135         }
18136       if (target == 0
18137           || GET_MODE (target) != tmode
18138           || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
18139         target = gen_reg_rtx (tmode);
18140       pat = GEN_FCN (icode) (target, op0, op1);
18141       if (! pat)
18142         return 0;
18143       emit_insn (pat);
18144       return target;
18145
18146     case IX86_BUILTIN_PSLLWI128:
18147       icode = CODE_FOR_ashlv8hi3;
18148       goto do_pshifti;
18149     case IX86_BUILTIN_PSLLDI128:
18150       icode = CODE_FOR_ashlv4si3;
18151       goto do_pshifti;
18152     case IX86_BUILTIN_PSLLQI128:
18153       icode = CODE_FOR_ashlv2di3;
18154       goto do_pshifti;
18155     case IX86_BUILTIN_PSRAWI128:
18156       icode = CODE_FOR_ashrv8hi3;
18157       goto do_pshifti;
18158     case IX86_BUILTIN_PSRADI128:
18159       icode = CODE_FOR_ashrv4si3;
18160       goto do_pshifti;
18161     case IX86_BUILTIN_PSRLWI128:
18162       icode = CODE_FOR_lshrv8hi3;
18163       goto do_pshifti;
18164     case IX86_BUILTIN_PSRLDI128:
18165       icode = CODE_FOR_lshrv4si3;
18166       goto do_pshifti;
18167     case IX86_BUILTIN_PSRLQI128:
18168       icode = CODE_FOR_lshrv2di3;
18169       goto do_pshifti;
18170     do_pshifti:
18171       arg0 = CALL_EXPR_ARG (exp, 0);
18172       arg1 = CALL_EXPR_ARG (exp, 1);
18173       op0 = expand_normal (arg0);
18174       op1 = expand_normal (arg1);
18175
18176       if (!CONST_INT_P (op1))
18177         {
18178           error ("shift must be an immediate");
18179           return const0_rtx;
18180         }
18181       if (INTVAL (op1) < 0 || INTVAL (op1) > 255)
18182         op1 = GEN_INT (255);
18183
18184       tmode = insn_data[icode].operand[0].mode;
18185       mode1 = insn_data[icode].operand[1].mode;
18186       if (! (*insn_data[icode].operand[1].predicate) (op0, mode1))
18187         op0 = copy_to_reg (op0);
18188
18189       target = gen_reg_rtx (tmode);
18190       pat = GEN_FCN (icode) (target, op0, op1);
18191       if (!pat)
18192         return 0;
18193       emit_insn (pat);
18194       return target;
18195
18196     case IX86_BUILTIN_PSLLW128:
18197       icode = CODE_FOR_ashlv8hi3;
18198       goto do_pshift;
18199     case IX86_BUILTIN_PSLLD128:
18200       icode = CODE_FOR_ashlv4si3;
18201       goto do_pshift;
18202     case IX86_BUILTIN_PSLLQ128:
18203       icode = CODE_FOR_ashlv2di3;
18204       goto do_pshift;
18205     case IX86_BUILTIN_PSRAW128:
18206       icode = CODE_FOR_ashrv8hi3;
18207       goto do_pshift;
18208     case IX86_BUILTIN_PSRAD128:
18209       icode = CODE_FOR_ashrv4si3;
18210       goto do_pshift;
18211     case IX86_BUILTIN_PSRLW128:
18212       icode = CODE_FOR_lshrv8hi3;
18213       goto do_pshift;
18214     case IX86_BUILTIN_PSRLD128:
18215       icode = CODE_FOR_lshrv4si3;
18216       goto do_pshift;
18217     case IX86_BUILTIN_PSRLQ128:
18218       icode = CODE_FOR_lshrv2di3;
18219       goto do_pshift;
18220     do_pshift:
18221       arg0 = CALL_EXPR_ARG (exp, 0);
18222       arg1 = CALL_EXPR_ARG (exp, 1);
18223       op0 = expand_normal (arg0);
18224       op1 = expand_normal (arg1);
18225
18226       tmode = insn_data[icode].operand[0].mode;
18227       mode1 = insn_data[icode].operand[1].mode;
18228
18229       if (! (*insn_data[icode].operand[1].predicate) (op0, mode1))
18230         op0 = copy_to_reg (op0);
18231
18232       op1 = simplify_gen_subreg (TImode, op1, GET_MODE (op1), 0);
18233       if (! (*insn_data[icode].operand[2].predicate) (op1, TImode))
18234         op1 = copy_to_reg (op1);
18235
18236       target = gen_reg_rtx (tmode);
18237       pat = GEN_FCN (icode) (target, op0, op1);
18238       if (!pat)
18239         return 0;
18240       emit_insn (pat);
18241       return target;
18242
18243     case IX86_BUILTIN_PSLLDQI128:
18244     case IX86_BUILTIN_PSRLDQI128:
18245       icode = (fcode == IX86_BUILTIN_PSLLDQI128 ? CODE_FOR_sse2_ashlti3
18246                : CODE_FOR_sse2_lshrti3);
18247       arg0 = CALL_EXPR_ARG (exp, 0);
18248       arg1 = CALL_EXPR_ARG (exp, 1);
18249       op0 = expand_normal (arg0);
18250       op1 = expand_normal (arg1);
18251       tmode = insn_data[icode].operand[0].mode;
18252       mode1 = insn_data[icode].operand[1].mode;
18253       mode2 = insn_data[icode].operand[2].mode;
18254
18255       if (! (*insn_data[icode].operand[1].predicate) (op0, mode1))
18256         {
18257           op0 = copy_to_reg (op0);
18258           op0 = simplify_gen_subreg (mode1, op0, GET_MODE (op0), 0);
18259         }
18260       if (! (*insn_data[icode].operand[2].predicate) (op1, mode2))
18261         {
18262           error ("shift must be an immediate");
18263           return const0_rtx;
18264         }
18265       target = gen_reg_rtx (V2DImode);
18266       pat = GEN_FCN (icode) (simplify_gen_subreg (tmode, target, V2DImode, 0),
18267                              op0, op1);
18268       if (! pat)
18269         return 0;
18270       emit_insn (pat);
18271       return target;
18272
18273     case IX86_BUILTIN_FEMMS:
18274       emit_insn (gen_mmx_femms ());
18275       return NULL_RTX;
18276
18277     case IX86_BUILTIN_PAVGUSB:
18278       return ix86_expand_binop_builtin (CODE_FOR_mmx_uavgv8qi3, exp, target);
18279
18280     case IX86_BUILTIN_PF2ID:
18281       return ix86_expand_unop_builtin (CODE_FOR_mmx_pf2id, exp, target, 0);
18282
18283     case IX86_BUILTIN_PFACC:
18284       return ix86_expand_binop_builtin (CODE_FOR_mmx_haddv2sf3, exp, target);
18285
18286     case IX86_BUILTIN_PFADD:
18287      return ix86_expand_binop_builtin (CODE_FOR_mmx_addv2sf3, exp, target);
18288
18289     case IX86_BUILTIN_PFCMPEQ:
18290       return ix86_expand_binop_builtin (CODE_FOR_mmx_eqv2sf3, exp, target);
18291
18292     case IX86_BUILTIN_PFCMPGE:
18293       return ix86_expand_binop_builtin (CODE_FOR_mmx_gev2sf3, exp, target);
18294
18295     case IX86_BUILTIN_PFCMPGT:
18296       return ix86_expand_binop_builtin (CODE_FOR_mmx_gtv2sf3, exp, target);
18297
18298     case IX86_BUILTIN_PFMAX:
18299       return ix86_expand_binop_builtin (CODE_FOR_mmx_smaxv2sf3, exp, target);
18300
18301     case IX86_BUILTIN_PFMIN:
18302       return ix86_expand_binop_builtin (CODE_FOR_mmx_sminv2sf3, exp, target);
18303
18304     case IX86_BUILTIN_PFMUL:
18305       return ix86_expand_binop_builtin (CODE_FOR_mmx_mulv2sf3, exp, target);
18306
18307     case IX86_BUILTIN_PFRCP:
18308       return ix86_expand_unop_builtin (CODE_FOR_mmx_rcpv2sf2, exp, target, 0);
18309
18310     case IX86_BUILTIN_PFRCPIT1:
18311       return ix86_expand_binop_builtin (CODE_FOR_mmx_rcpit1v2sf3, exp, target);
18312
18313     case IX86_BUILTIN_PFRCPIT2:
18314       return ix86_expand_binop_builtin (CODE_FOR_mmx_rcpit2v2sf3, exp, target);
18315
18316     case IX86_BUILTIN_PFRSQIT1:
18317       return ix86_expand_binop_builtin (CODE_FOR_mmx_rsqit1v2sf3, exp, target);
18318
18319     case IX86_BUILTIN_PFRSQRT:
18320       return ix86_expand_unop_builtin (CODE_FOR_mmx_rsqrtv2sf2, exp, target, 0);
18321
18322     case IX86_BUILTIN_PFSUB:
18323       return ix86_expand_binop_builtin (CODE_FOR_mmx_subv2sf3, exp, target);
18324
18325     case IX86_BUILTIN_PFSUBR:
18326       return ix86_expand_binop_builtin (CODE_FOR_mmx_subrv2sf3, exp, target);
18327
18328     case IX86_BUILTIN_PI2FD:
18329       return ix86_expand_unop_builtin (CODE_FOR_mmx_floatv2si2, exp, target, 0);
18330
18331     case IX86_BUILTIN_PMULHRW:
18332       return ix86_expand_binop_builtin (CODE_FOR_mmx_pmulhrwv4hi3, exp, target);
18333
18334     case IX86_BUILTIN_PF2IW:
18335       return ix86_expand_unop_builtin (CODE_FOR_mmx_pf2iw, exp, target, 0);
18336
18337     case IX86_BUILTIN_PFNACC:
18338       return ix86_expand_binop_builtin (CODE_FOR_mmx_hsubv2sf3, exp, target);
18339
18340     case IX86_BUILTIN_PFPNACC:
18341       return ix86_expand_binop_builtin (CODE_FOR_mmx_addsubv2sf3, exp, target);
18342
18343     case IX86_BUILTIN_PI2FW:
18344       return ix86_expand_unop_builtin (CODE_FOR_mmx_pi2fw, exp, target, 0);
18345
18346     case IX86_BUILTIN_PSWAPDSI:
18347       return ix86_expand_unop_builtin (CODE_FOR_mmx_pswapdv2si2, exp, target, 0);
18348
18349     case IX86_BUILTIN_PSWAPDSF:
18350       return ix86_expand_unop_builtin (CODE_FOR_mmx_pswapdv2sf2, exp, target, 0);
18351
18352     case IX86_BUILTIN_SQRTSD:
18353       return ix86_expand_unop1_builtin (CODE_FOR_sse2_vmsqrtv2df2, exp, target);
18354     case IX86_BUILTIN_LOADUPD:
18355       return ix86_expand_unop_builtin (CODE_FOR_sse2_movupd, exp, target, 1);
18356     case IX86_BUILTIN_STOREUPD:
18357       return ix86_expand_store_builtin (CODE_FOR_sse2_movupd, exp);
18358
18359     case IX86_BUILTIN_MFENCE:
18360         emit_insn (gen_sse2_mfence ());
18361         return 0;
18362     case IX86_BUILTIN_LFENCE:
18363         emit_insn (gen_sse2_lfence ());
18364         return 0;
18365
18366     case IX86_BUILTIN_CLFLUSH:
18367         arg0 = CALL_EXPR_ARG (exp, 0);
18368         op0 = expand_normal (arg0);
18369         icode = CODE_FOR_sse2_clflush;
18370         if (! (*insn_data[icode].operand[0].predicate) (op0, Pmode))
18371             op0 = copy_to_mode_reg (Pmode, op0);
18372
18373         emit_insn (gen_sse2_clflush (op0));
18374         return 0;
18375
18376     case IX86_BUILTIN_MOVNTPD:
18377       return ix86_expand_store_builtin (CODE_FOR_sse2_movntv2df, exp);
18378     case IX86_BUILTIN_MOVNTDQ:
18379       return ix86_expand_store_builtin (CODE_FOR_sse2_movntv2di, exp);
18380     case IX86_BUILTIN_MOVNTI:
18381       return ix86_expand_store_builtin (CODE_FOR_sse2_movntsi, exp);
18382
18383     case IX86_BUILTIN_LOADDQU:
18384       return ix86_expand_unop_builtin (CODE_FOR_sse2_movdqu, exp, target, 1);
18385     case IX86_BUILTIN_STOREDQU:
18386       return ix86_expand_store_builtin (CODE_FOR_sse2_movdqu, exp);
18387
18388     case IX86_BUILTIN_MONITOR:
18389       arg0 = CALL_EXPR_ARG (exp, 0);
18390       arg1 = CALL_EXPR_ARG (exp, 1);
18391       arg2 = CALL_EXPR_ARG (exp, 2);
18392       op0 = expand_normal (arg0);
18393       op1 = expand_normal (arg1);
18394       op2 = expand_normal (arg2);
18395       if (!REG_P (op0))
18396         op0 = copy_to_mode_reg (Pmode, op0);
18397       if (!REG_P (op1))
18398         op1 = copy_to_mode_reg (SImode, op1);
18399       if (!REG_P (op2))
18400         op2 = copy_to_mode_reg (SImode, op2);
18401       if (!TARGET_64BIT)
18402         emit_insn (gen_sse3_monitor (op0, op1, op2));
18403       else
18404         emit_insn (gen_sse3_monitor64 (op0, op1, op2));
18405       return 0;
18406
18407     case IX86_BUILTIN_MWAIT:
18408       arg0 = CALL_EXPR_ARG (exp, 0);
18409       arg1 = CALL_EXPR_ARG (exp, 1);
18410       op0 = expand_normal (arg0);
18411       op1 = expand_normal (arg1);
18412       if (!REG_P (op0))
18413         op0 = copy_to_mode_reg (SImode, op0);
18414       if (!REG_P (op1))
18415         op1 = copy_to_mode_reg (SImode, op1);
18416       emit_insn (gen_sse3_mwait (op0, op1));
18417       return 0;
18418
18419     case IX86_BUILTIN_LDDQU:
18420       return ix86_expand_unop_builtin (CODE_FOR_sse3_lddqu, exp,
18421                                        target, 1);
18422
18423     case IX86_BUILTIN_PALIGNR:
18424     case IX86_BUILTIN_PALIGNR128:
18425       if (fcode == IX86_BUILTIN_PALIGNR)
18426         {
18427           icode = CODE_FOR_ssse3_palignrdi;
18428           mode = DImode;
18429         }
18430       else
18431         {
18432           icode = CODE_FOR_ssse3_palignrti;
18433           mode = V2DImode;
18434         }
18435       arg0 = CALL_EXPR_ARG (exp, 0);
18436       arg1 = CALL_EXPR_ARG (exp, 1);
18437       arg2 = CALL_EXPR_ARG (exp, 2);
18438       op0 = expand_expr (arg0, NULL_RTX, VOIDmode, 0);
18439       op1 = expand_expr (arg1, NULL_RTX, VOIDmode, 0);
18440       op2 = expand_expr (arg2, NULL_RTX, VOIDmode, 0);
18441       tmode = insn_data[icode].operand[0].mode;
18442       mode1 = insn_data[icode].operand[1].mode;
18443       mode2 = insn_data[icode].operand[2].mode;
18444       mode3 = insn_data[icode].operand[3].mode;
18445
18446       if (! (*insn_data[icode].operand[1].predicate) (op0, mode1))
18447         {
18448           op0 = copy_to_reg (op0);
18449           op0 = simplify_gen_subreg (mode1, op0, GET_MODE (op0), 0);
18450         }
18451       if (! (*insn_data[icode].operand[2].predicate) (op1, mode2))
18452         {
18453           op1 = copy_to_reg (op1);
18454           op1 = simplify_gen_subreg (mode2, op1, GET_MODE (op1), 0);
18455         }
18456       if (! (*insn_data[icode].operand[3].predicate) (op2, mode3))
18457         {
18458           error ("shift must be an immediate");
18459           return const0_rtx;
18460         }
18461       target = gen_reg_rtx (mode);
18462       pat = GEN_FCN (icode) (simplify_gen_subreg (tmode, target, mode, 0),
18463                              op0, op1, op2);
18464       if (! pat)
18465         return 0;
18466       emit_insn (pat);
18467       return target;
18468
18469     case IX86_BUILTIN_MOVNTSD:
18470       return ix86_expand_store_builtin (CODE_FOR_sse4a_vmmovntv2df, exp);
18471
18472     case IX86_BUILTIN_MOVNTSS:
18473       return ix86_expand_store_builtin (CODE_FOR_sse4a_vmmovntv4sf, exp);
18474
18475     case IX86_BUILTIN_INSERTQ:
18476     case IX86_BUILTIN_EXTRQ:
18477       icode = (fcode == IX86_BUILTIN_EXTRQ
18478                ? CODE_FOR_sse4a_extrq
18479                : CODE_FOR_sse4a_insertq);
18480       arg0 = CALL_EXPR_ARG (exp, 0);
18481       arg1 = CALL_EXPR_ARG (exp, 1);
18482       op0 = expand_normal (arg0);
18483       op1 = expand_normal (arg1);
18484       tmode = insn_data[icode].operand[0].mode;
18485       mode1 = insn_data[icode].operand[1].mode;
18486       mode2 = insn_data[icode].operand[2].mode;
18487       if (! (*insn_data[icode].operand[1].predicate) (op0, mode1))
18488         op0 = copy_to_mode_reg (mode1, op0);
18489       if (! (*insn_data[icode].operand[2].predicate) (op1, mode2))
18490         op1 = copy_to_mode_reg (mode2, op1);
18491       if (optimize || target == 0
18492           || GET_MODE (target) != tmode
18493           || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
18494         target = gen_reg_rtx (tmode);
18495       pat = GEN_FCN (icode) (target, op0, op1);
18496       if (! pat)
18497         return NULL_RTX;
18498       emit_insn (pat);
18499       return target;
18500
18501     case IX86_BUILTIN_EXTRQI:
18502       icode = CODE_FOR_sse4a_extrqi;
18503       arg0 = CALL_EXPR_ARG (exp, 0);
18504       arg1 = CALL_EXPR_ARG (exp, 1);
18505       arg2 = CALL_EXPR_ARG (exp, 2);
18506       op0 = expand_normal (arg0);
18507       op1 = expand_normal (arg1);
18508       op2 = expand_normal (arg2);
18509       tmode = insn_data[icode].operand[0].mode;
18510       mode1 = insn_data[icode].operand[1].mode;
18511       mode2 = insn_data[icode].operand[2].mode;
18512       mode3 = insn_data[icode].operand[3].mode;
18513       if (! (*insn_data[icode].operand[1].predicate) (op0, mode1))
18514         op0 = copy_to_mode_reg (mode1, op0);
18515       if (! (*insn_data[icode].operand[2].predicate) (op1, mode2))
18516         {
18517           error ("index mask must be an immediate");
18518           return gen_reg_rtx (tmode);
18519         }
18520       if (! (*insn_data[icode].operand[3].predicate) (op2, mode3))
18521         {
18522           error ("length mask must be an immediate");
18523           return gen_reg_rtx (tmode);
18524         }
18525       if (optimize || target == 0
18526           || GET_MODE (target) != tmode
18527           || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
18528         target = gen_reg_rtx (tmode);
18529       pat = GEN_FCN (icode) (target, op0, op1, op2);
18530       if (! pat)
18531         return NULL_RTX;
18532       emit_insn (pat);
18533       return target;
18534
18535     case IX86_BUILTIN_INSERTQI:
18536       icode = CODE_FOR_sse4a_insertqi;
18537       arg0 = CALL_EXPR_ARG (exp, 0);
18538       arg1 = CALL_EXPR_ARG (exp, 1);
18539       arg2 = CALL_EXPR_ARG (exp, 2);
18540       arg3 = CALL_EXPR_ARG (exp, 3);
18541       op0 = expand_normal (arg0);
18542       op1 = expand_normal (arg1);
18543       op2 = expand_normal (arg2);
18544       op3 = expand_normal (arg3);
18545       tmode = insn_data[icode].operand[0].mode;
18546       mode1 = insn_data[icode].operand[1].mode;
18547       mode2 = insn_data[icode].operand[2].mode;
18548       mode3 = insn_data[icode].operand[3].mode;
18549       mode4 = insn_data[icode].operand[4].mode;
18550
18551       if (! (*insn_data[icode].operand[1].predicate) (op0, mode1))
18552         op0 = copy_to_mode_reg (mode1, op0);
18553
18554       if (! (*insn_data[icode].operand[2].predicate) (op1, mode2))
18555         op1 = copy_to_mode_reg (mode2, op1);
18556
18557       if (! (*insn_data[icode].operand[3].predicate) (op2, mode3))
18558         {
18559           error ("index mask must be an immediate");
18560           return gen_reg_rtx (tmode);
18561         }
18562       if (! (*insn_data[icode].operand[4].predicate) (op3, mode4))
18563         {
18564           error ("length mask must be an immediate");
18565           return gen_reg_rtx (tmode);
18566         }
18567       if (optimize || target == 0
18568           || GET_MODE (target) != tmode
18569           || ! (*insn_data[icode].operand[0].predicate) (target, tmode))
18570         target = gen_reg_rtx (tmode);
18571       pat = GEN_FCN (icode) (target, op0, op1, op2, op3);
18572       if (! pat)
18573         return NULL_RTX;
18574       emit_insn (pat);
18575       return target;
18576
18577     case IX86_BUILTIN_VEC_INIT_V2SI:
18578     case IX86_BUILTIN_VEC_INIT_V4HI:
18579     case IX86_BUILTIN_VEC_INIT_V8QI:
18580       return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
18581
18582     case IX86_BUILTIN_VEC_EXT_V2DF:
18583     case IX86_BUILTIN_VEC_EXT_V2DI:
18584     case IX86_BUILTIN_VEC_EXT_V4SF:
18585     case IX86_BUILTIN_VEC_EXT_V4SI:
18586     case IX86_BUILTIN_VEC_EXT_V8HI:
18587     case IX86_BUILTIN_VEC_EXT_V2SI:
18588     case IX86_BUILTIN_VEC_EXT_V4HI:
18589       return ix86_expand_vec_ext_builtin (exp, target);
18590
18591     case IX86_BUILTIN_VEC_SET_V8HI:
18592     case IX86_BUILTIN_VEC_SET_V4HI:
18593       return ix86_expand_vec_set_builtin (exp);
18594
18595     default:
18596       break;
18597     }
18598
18599   for (i = 0, d = bdesc_2arg; i < ARRAY_SIZE (bdesc_2arg); i++, d++)
18600     if (d->code == fcode)
18601       {
18602         /* Compares are treated specially.  */
18603         if (d->icode == CODE_FOR_sse_maskcmpv4sf3
18604             || d->icode == CODE_FOR_sse_vmmaskcmpv4sf3
18605             || d->icode == CODE_FOR_sse2_maskcmpv2df3
18606             || d->icode == CODE_FOR_sse2_vmmaskcmpv2df3)
18607           return ix86_expand_sse_compare (d, exp, target);
18608
18609         return ix86_expand_binop_builtin (d->icode, exp, target);
18610       }
18611
18612   for (i = 0, d = bdesc_1arg; i < ARRAY_SIZE (bdesc_1arg); i++, d++)
18613     if (d->code == fcode)
18614       return ix86_expand_unop_builtin (d->icode, exp, target, 0);
18615
18616   for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
18617     if (d->code == fcode)
18618       return ix86_expand_sse_comi (d, exp, target);
18619
18620   gcc_unreachable ();
18621 }
18622
18623 /* Returns a function decl for a vectorized version of the builtin function
18624    with builtin function code FN and the result vector type TYPE, or NULL_TREE
18625    if it is not available.  */
18626
18627 static tree
18628 ix86_builtin_vectorized_function (enum built_in_function fn, tree type_out,
18629                                   tree type_in)
18630 {
18631   enum machine_mode in_mode, out_mode;
18632   int in_n, out_n;
18633
18634   if (TREE_CODE (type_out) != VECTOR_TYPE
18635       || TREE_CODE (type_in) != VECTOR_TYPE)
18636     return NULL_TREE;
18637
18638   out_mode = TYPE_MODE (TREE_TYPE (type_out));
18639   out_n = TYPE_VECTOR_SUBPARTS (type_out);
18640   in_mode = TYPE_MODE (TREE_TYPE (type_in));
18641   in_n = TYPE_VECTOR_SUBPARTS (type_in);
18642
18643   switch (fn)
18644     {
18645     case BUILT_IN_SQRT:
18646       if (out_mode == DFmode && out_n == 2
18647           && in_mode == DFmode && in_n == 2)
18648         return ix86_builtins[IX86_BUILTIN_SQRTPD];
18649       return NULL_TREE;
18650
18651     case BUILT_IN_SQRTF:
18652       if (out_mode == SFmode && out_n == 4
18653           && in_mode == SFmode && in_n == 4)
18654         return ix86_builtins[IX86_BUILTIN_SQRTPS];
18655       return NULL_TREE;
18656
18657     case BUILT_IN_LRINTF:
18658       if (out_mode == SImode && out_n == 4
18659           && in_mode == SFmode && in_n == 4)
18660         return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
18661       return NULL_TREE;
18662
18663     default:
18664       ;
18665     }
18666
18667   return NULL_TREE;
18668 }
18669
18670 /* Returns a decl of a function that implements conversion of the
18671    input vector of type TYPE, or NULL_TREE if it is not available.  */
18672
18673 static tree
18674 ix86_builtin_conversion (enum tree_code code, tree type)
18675 {
18676   if (TREE_CODE (type) != VECTOR_TYPE)
18677     return NULL_TREE;
18678
18679   switch (code)
18680     {
18681     case FLOAT_EXPR:
18682       switch (TYPE_MODE (type))
18683         {
18684         case V4SImode:
18685           return ix86_builtins[IX86_BUILTIN_CVTDQ2PS];
18686         default:
18687           return NULL_TREE;
18688         }
18689
18690     case FIX_TRUNC_EXPR:
18691       switch (TYPE_MODE (type))
18692         {
18693         case V4SFmode:
18694           return ix86_builtins[IX86_BUILTIN_CVTTPS2DQ];
18695         default:
18696           return NULL_TREE;
18697         }
18698     default:
18699       return NULL_TREE;
18700
18701     }
18702 }
18703
18704 /* Store OPERAND to the memory after reload is completed.  This means
18705    that we can't easily use assign_stack_local.  */
18706 rtx
18707 ix86_force_to_memory (enum machine_mode mode, rtx operand)
18708 {
18709   rtx result;
18710
18711   gcc_assert (reload_completed);
18712   if (TARGET_RED_ZONE)
18713     {
18714       result = gen_rtx_MEM (mode,
18715                             gen_rtx_PLUS (Pmode,
18716                                           stack_pointer_rtx,
18717                                           GEN_INT (-RED_ZONE_SIZE)));
18718       emit_move_insn (result, operand);
18719     }
18720   else if (!TARGET_RED_ZONE && TARGET_64BIT)
18721     {
18722       switch (mode)
18723         {
18724         case HImode:
18725         case SImode:
18726           operand = gen_lowpart (DImode, operand);
18727           /* FALLTHRU */
18728         case DImode:
18729           emit_insn (
18730                       gen_rtx_SET (VOIDmode,
18731                                    gen_rtx_MEM (DImode,
18732                                                 gen_rtx_PRE_DEC (DImode,
18733                                                         stack_pointer_rtx)),
18734                                    operand));
18735           break;
18736         default:
18737           gcc_unreachable ();
18738         }
18739       result = gen_rtx_MEM (mode, stack_pointer_rtx);
18740     }
18741   else
18742     {
18743       switch (mode)
18744         {
18745         case DImode:
18746           {
18747             rtx operands[2];
18748             split_di (&operand, 1, operands, operands + 1);
18749             emit_insn (
18750                         gen_rtx_SET (VOIDmode,
18751                                      gen_rtx_MEM (SImode,
18752                                                   gen_rtx_PRE_DEC (Pmode,
18753                                                         stack_pointer_rtx)),
18754                                      operands[1]));
18755             emit_insn (
18756                         gen_rtx_SET (VOIDmode,
18757                                      gen_rtx_MEM (SImode,
18758                                                   gen_rtx_PRE_DEC (Pmode,
18759                                                         stack_pointer_rtx)),
18760                                      operands[0]));
18761           }
18762           break;
18763         case HImode:
18764           /* Store HImodes as SImodes.  */
18765           operand = gen_lowpart (SImode, operand);
18766           /* FALLTHRU */
18767         case SImode:
18768           emit_insn (
18769                       gen_rtx_SET (VOIDmode,
18770                                    gen_rtx_MEM (GET_MODE (operand),
18771                                                 gen_rtx_PRE_DEC (SImode,
18772                                                         stack_pointer_rtx)),
18773                                    operand));
18774           break;
18775         default:
18776           gcc_unreachable ();
18777         }
18778       result = gen_rtx_MEM (mode, stack_pointer_rtx);
18779     }
18780   return result;
18781 }
18782
18783 /* Free operand from the memory.  */
18784 void
18785 ix86_free_from_memory (enum machine_mode mode)
18786 {
18787   if (!TARGET_RED_ZONE)
18788     {
18789       int size;
18790
18791       if (mode == DImode || TARGET_64BIT)
18792         size = 8;
18793       else
18794         size = 4;
18795       /* Use LEA to deallocate stack space.  In peephole2 it will be converted
18796          to pop or add instruction if registers are available.  */
18797       emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
18798                               gen_rtx_PLUS (Pmode, stack_pointer_rtx,
18799                                             GEN_INT (size))));
18800     }
18801 }
18802
18803 /* Put float CONST_DOUBLE in the constant pool instead of fp regs.
18804    QImode must go into class Q_REGS.
18805    Narrow ALL_REGS to GENERAL_REGS.  This supports allowing movsf and
18806    movdf to do mem-to-mem moves through integer regs.  */
18807 enum reg_class
18808 ix86_preferred_reload_class (rtx x, enum reg_class class)
18809 {
18810   enum machine_mode mode = GET_MODE (x);
18811
18812   /* We're only allowed to return a subclass of CLASS.  Many of the
18813      following checks fail for NO_REGS, so eliminate that early.  */
18814   if (class == NO_REGS)
18815     return NO_REGS;
18816
18817   /* All classes can load zeros.  */
18818   if (x == CONST0_RTX (mode))
18819     return class;
18820
18821   /* Force constants into memory if we are loading a (nonzero) constant into
18822      an MMX or SSE register.  This is because there are no MMX/SSE instructions
18823      to load from a constant.  */
18824   if (CONSTANT_P (x)
18825       && (MAYBE_MMX_CLASS_P (class) || MAYBE_SSE_CLASS_P (class)))
18826     return NO_REGS;
18827
18828   /* Prefer SSE regs only, if we can use them for math.  */
18829   if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
18830     return SSE_CLASS_P (class) ? class : NO_REGS;
18831
18832   /* Floating-point constants need more complex checks.  */
18833   if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
18834     {
18835       /* General regs can load everything.  */
18836       if (reg_class_subset_p (class, GENERAL_REGS))
18837         return class;
18838
18839       /* Floats can load 0 and 1 plus some others.  Note that we eliminated
18840          zero above.  We only want to wind up preferring 80387 registers if
18841          we plan on doing computation with them.  */
18842       if (TARGET_80387
18843           && standard_80387_constant_p (x))
18844         {
18845           /* Limit class to non-sse.  */
18846           if (class == FLOAT_SSE_REGS)
18847             return FLOAT_REGS;
18848           if (class == FP_TOP_SSE_REGS)
18849             return FP_TOP_REG;
18850           if (class == FP_SECOND_SSE_REGS)
18851             return FP_SECOND_REG;
18852           if (class == FLOAT_INT_REGS || class == FLOAT_REGS)
18853             return class;
18854         }
18855
18856       return NO_REGS;
18857     }
18858
18859   /* Generally when we see PLUS here, it's the function invariant
18860      (plus soft-fp const_int).  Which can only be computed into general
18861      regs.  */
18862   if (GET_CODE (x) == PLUS)
18863     return reg_class_subset_p (class, GENERAL_REGS) ? class : NO_REGS;
18864
18865   /* QImode constants are easy to load, but non-constant QImode data
18866      must go into Q_REGS.  */
18867   if (GET_MODE (x) == QImode && !CONSTANT_P (x))
18868     {
18869       if (reg_class_subset_p (class, Q_REGS))
18870         return class;
18871       if (reg_class_subset_p (Q_REGS, class))
18872         return Q_REGS;
18873       return NO_REGS;
18874     }
18875
18876   return class;
18877 }
18878
18879 /* Discourage putting floating-point values in SSE registers unless
18880    SSE math is being used, and likewise for the 387 registers.  */
18881 enum reg_class
18882 ix86_preferred_output_reload_class (rtx x, enum reg_class class)
18883 {
18884   enum machine_mode mode = GET_MODE (x);
18885
18886   /* Restrict the output reload class to the register bank that we are doing
18887      math on.  If we would like not to return a subset of CLASS, reject this
18888      alternative: if reload cannot do this, it will still use its choice.  */
18889   mode = GET_MODE (x);
18890   if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
18891     return MAYBE_SSE_CLASS_P (class) ? SSE_REGS : NO_REGS;
18892
18893   if (X87_FLOAT_MODE_P (mode))
18894     {
18895       if (class == FP_TOP_SSE_REGS)
18896         return FP_TOP_REG;
18897       else if (class == FP_SECOND_SSE_REGS)
18898         return FP_SECOND_REG;
18899       else
18900         return FLOAT_CLASS_P (class) ? class : NO_REGS;
18901     }
18902
18903   return class;
18904 }
18905
18906 /* If we are copying between general and FP registers, we need a memory
18907    location. The same is true for SSE and MMX registers.
18908
18909    The macro can't work reliably when one of the CLASSES is class containing
18910    registers from multiple units (SSE, MMX, integer).  We avoid this by never
18911    combining those units in single alternative in the machine description.
18912    Ensure that this constraint holds to avoid unexpected surprises.
18913
18914    When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
18915    enforce these sanity checks.  */
18916
18917 int
18918 ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
18919                               enum machine_mode mode, int strict)
18920 {
18921   if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
18922       || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
18923       || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
18924       || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
18925       || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
18926       || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
18927     {
18928       gcc_assert (!strict);
18929       return true;
18930     }
18931
18932   if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
18933     return true;
18934
18935   /* ??? This is a lie.  We do have moves between mmx/general, and for
18936      mmx/sse2.  But by saying we need secondary memory we discourage the
18937      register allocator from using the mmx registers unless needed.  */
18938   if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
18939     return true;
18940
18941   if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
18942     {
18943       /* SSE1 doesn't have any direct moves from other classes.  */
18944       if (!TARGET_SSE2)
18945         return true;
18946
18947       /* If the target says that inter-unit moves are more expensive
18948          than moving through memory, then don't generate them.  */
18949       if (!TARGET_INTER_UNIT_MOVES)
18950         return true;
18951
18952       /* Between SSE and general, we have moves no larger than word size.  */
18953       if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
18954         return true;
18955     }
18956
18957   return false;
18958 }
18959
18960 /* Return true if the registers in CLASS cannot represent the change from
18961    modes FROM to TO.  */
18962
18963 bool
18964 ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
18965                                enum reg_class class)
18966 {
18967   if (from == to)
18968     return false;
18969
18970   /* x87 registers can't do subreg at all, as all values are reformatted
18971      to extended precision.  */
18972   if (MAYBE_FLOAT_CLASS_P (class))
18973     return true;
18974
18975   if (MAYBE_SSE_CLASS_P (class) || MAYBE_MMX_CLASS_P (class))
18976     {
18977       /* Vector registers do not support QI or HImode loads.  If we don't
18978          disallow a change to these modes, reload will assume it's ok to
18979          drop the subreg from (subreg:SI (reg:HI 100) 0).  This affects
18980          the vec_dupv4hi pattern.  */
18981       if (GET_MODE_SIZE (from) < 4)
18982         return true;
18983
18984       /* Vector registers do not support subreg with nonzero offsets, which
18985          are otherwise valid for integer registers.  Since we can't see
18986          whether we have a nonzero offset from here, prohibit all
18987          nonparadoxical subregs changing size.  */
18988       if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
18989         return true;
18990     }
18991
18992   return false;
18993 }
18994
18995 /* Return the cost of moving data from a register in class CLASS1 to
18996    one in class CLASS2.
18997
18998    It is not required that the cost always equal 2 when FROM is the same as TO;
18999    on some machines it is expensive to move between registers if they are not
19000    general registers.  */
19001
19002 int
19003 ix86_register_move_cost (enum machine_mode mode, enum reg_class class1,
19004                          enum reg_class class2)
19005 {
19006   /* In case we require secondary memory, compute cost of the store followed
19007      by load.  In order to avoid bad register allocation choices, we need
19008      for this to be *at least* as high as the symmetric MEMORY_MOVE_COST.  */
19009
19010   if (ix86_secondary_memory_needed (class1, class2, mode, 0))
19011     {
19012       int cost = 1;
19013
19014       cost += MAX (MEMORY_MOVE_COST (mode, class1, 0),
19015                    MEMORY_MOVE_COST (mode, class1, 1));
19016       cost += MAX (MEMORY_MOVE_COST (mode, class2, 0),
19017                    MEMORY_MOVE_COST (mode, class2, 1));
19018
19019       /* In case of copying from general_purpose_register we may emit multiple
19020          stores followed by single load causing memory size mismatch stall.
19021          Count this as arbitrarily high cost of 20.  */
19022       if (CLASS_MAX_NREGS (class1, mode) > CLASS_MAX_NREGS (class2, mode))
19023         cost += 20;
19024
19025       /* In the case of FP/MMX moves, the registers actually overlap, and we
19026          have to switch modes in order to treat them differently.  */
19027       if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
19028           || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
19029         cost += 20;
19030
19031       return cost;
19032     }
19033
19034   /* Moves between SSE/MMX and integer unit are expensive.  */
19035   if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
19036       || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
19037     return ix86_cost->mmxsse_to_integer;
19038   if (MAYBE_FLOAT_CLASS_P (class1))
19039     return ix86_cost->fp_move;
19040   if (MAYBE_SSE_CLASS_P (class1))
19041     return ix86_cost->sse_move;
19042   if (MAYBE_MMX_CLASS_P (class1))
19043     return ix86_cost->mmx_move;
19044   return 2;
19045 }
19046
19047 /* Return 1 if hard register REGNO can hold a value of machine-mode MODE.  */
19048
19049 bool
19050 ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
19051 {
19052   /* Flags and only flags can only hold CCmode values.  */
19053   if (CC_REGNO_P (regno))
19054     return GET_MODE_CLASS (mode) == MODE_CC;
19055   if (GET_MODE_CLASS (mode) == MODE_CC
19056       || GET_MODE_CLASS (mode) == MODE_RANDOM
19057       || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
19058     return 0;
19059   if (FP_REGNO_P (regno))
19060     return VALID_FP_MODE_P (mode);
19061   if (SSE_REGNO_P (regno))
19062     {
19063       /* We implement the move patterns for all vector modes into and
19064          out of SSE registers, even when no operation instructions
19065          are available.  */
19066       return (VALID_SSE_REG_MODE (mode)
19067               || VALID_SSE2_REG_MODE (mode)
19068               || VALID_MMX_REG_MODE (mode)
19069               || VALID_MMX_REG_MODE_3DNOW (mode));
19070     }
19071   if (MMX_REGNO_P (regno))
19072     {
19073       /* We implement the move patterns for 3DNOW modes even in MMX mode,
19074          so if the register is available at all, then we can move data of
19075          the given mode into or out of it.  */
19076       return (VALID_MMX_REG_MODE (mode)
19077               || VALID_MMX_REG_MODE_3DNOW (mode));
19078     }
19079
19080   if (mode == QImode)
19081     {
19082       /* Take care for QImode values - they can be in non-QI regs,
19083          but then they do cause partial register stalls.  */
19084       if (regno < 4 || TARGET_64BIT)
19085         return 1;
19086       if (!TARGET_PARTIAL_REG_STALL)
19087         return 1;
19088       return reload_in_progress || reload_completed;
19089     }
19090   /* We handle both integer and floats in the general purpose registers.  */
19091   else if (VALID_INT_MODE_P (mode))
19092     return 1;
19093   else if (VALID_FP_MODE_P (mode))
19094     return 1;
19095   /* Lots of MMX code casts 8 byte vector modes to DImode.  If we then go
19096      on to use that value in smaller contexts, this can easily force a
19097      pseudo to be allocated to GENERAL_REGS.  Since this is no worse than
19098      supporting DImode, allow it.  */
19099   else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
19100     return 1;
19101
19102   return 0;
19103 }
19104
19105 /* A subroutine of ix86_modes_tieable_p.  Return true if MODE is a
19106    tieable integer mode.  */
19107
19108 static bool
19109 ix86_tieable_integer_mode_p (enum machine_mode mode)
19110 {
19111   switch (mode)
19112     {
19113     case HImode:
19114     case SImode:
19115       return true;
19116
19117     case QImode:
19118       return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;
19119
19120     case DImode:
19121       return TARGET_64BIT;
19122
19123     default:
19124       return false;
19125     }
19126 }
19127
19128 /* Return true if MODE1 is accessible in a register that can hold MODE2
19129    without copying.  That is, all register classes that can hold MODE2
19130    can also hold MODE1.  */
19131
19132 bool
19133 ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
19134 {
19135   if (mode1 == mode2)
19136     return true;
19137
19138   if (ix86_tieable_integer_mode_p (mode1)
19139       && ix86_tieable_integer_mode_p (mode2))
19140     return true;
19141
19142   /* MODE2 being XFmode implies fp stack or general regs, which means we
19143      can tie any smaller floating point modes to it.  Note that we do not
19144      tie this with TFmode.  */
19145   if (mode2 == XFmode)
19146     return mode1 == SFmode || mode1 == DFmode;
19147
19148   /* MODE2 being DFmode implies fp stack, general or sse regs, which means
19149      that we can tie it with SFmode.  */
19150   if (mode2 == DFmode)
19151     return mode1 == SFmode;
19152
19153   /* If MODE2 is only appropriate for an SSE register, then tie with
19154      any other mode acceptable to SSE registers.  */
19155   if (GET_MODE_SIZE (mode2) == 16
19156       && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
19157     return (GET_MODE_SIZE (mode1) == 16
19158             && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
19159
19160   /* If MODE2 is appropriate for an MMX register, then tie
19161      with any other mode acceptable to MMX registers.  */
19162   if (GET_MODE_SIZE (mode2) == 8
19163       && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
19164     return (GET_MODE_SIZE (mode1) == 8
19165             && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));
19166
19167   return false;
19168 }
19169
19170 /* Return the cost of moving data of mode M between a
19171    register and memory.  A value of 2 is the default; this cost is
19172    relative to those in `REGISTER_MOVE_COST'.
19173
19174    If moving between registers and memory is more expensive than
19175    between two registers, you should define this macro to express the
19176    relative cost.
19177
19178    Model also increased moving costs of QImode registers in non
19179    Q_REGS classes.
19180  */
19181 int
19182 ix86_memory_move_cost (enum machine_mode mode, enum reg_class class, int in)
19183 {
19184   if (FLOAT_CLASS_P (class))
19185     {
19186       int index;
19187       switch (mode)
19188         {
19189           case SFmode:
19190             index = 0;
19191             break;
19192           case DFmode:
19193             index = 1;
19194             break;
19195           case XFmode:
19196             index = 2;
19197             break;
19198           default:
19199             return 100;
19200         }
19201       return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
19202     }
19203   if (SSE_CLASS_P (class))
19204     {
19205       int index;
19206       switch (GET_MODE_SIZE (mode))
19207         {
19208           case 4:
19209             index = 0;
19210             break;
19211           case 8:
19212             index = 1;
19213             break;
19214           case 16:
19215             index = 2;
19216             break;
19217           default:
19218             return 100;
19219         }
19220       return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
19221     }
19222   if (MMX_CLASS_P (class))
19223     {
19224       int index;
19225       switch (GET_MODE_SIZE (mode))
19226         {
19227           case 4:
19228             index = 0;
19229             break;
19230           case 8:
19231             index = 1;
19232             break;
19233           default:
19234             return 100;
19235         }
19236       return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
19237     }
19238   switch (GET_MODE_SIZE (mode))
19239     {
19240       case 1:
19241         if (in)
19242           return (Q_CLASS_P (class) ? ix86_cost->int_load[0]
19243                   : ix86_cost->movzbl_load);
19244         else
19245           return (Q_CLASS_P (class) ? ix86_cost->int_store[0]
19246                   : ix86_cost->int_store[0] + 4);
19247         break;
19248       case 2:
19249         return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
19250       default:
19251         /* Compute number of 32bit moves needed.  TFmode is moved as XFmode.  */
19252         if (mode == TFmode)
19253           mode = XFmode;
19254         return ((in ? ix86_cost->int_load[2] : ix86_cost->int_store[2])
19255                 * (((int) GET_MODE_SIZE (mode)
19256                     + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
19257     }
19258 }
19259
19260 /* Compute a (partial) cost for rtx X.  Return true if the complete
19261    cost has been computed, and false if subexpressions should be
19262    scanned.  In either case, *TOTAL contains the cost result.  */
19263
19264 static bool
19265 ix86_rtx_costs (rtx x, int code, int outer_code, int *total)
19266 {
19267   enum machine_mode mode = GET_MODE (x);
19268
19269   switch (code)
19270     {
19271     case CONST_INT:
19272     case CONST:
19273     case LABEL_REF:
19274     case SYMBOL_REF:
19275       if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
19276         *total = 3;
19277       else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
19278         *total = 2;
19279       else if (flag_pic && SYMBOLIC_CONST (x)
19280                && (!TARGET_64BIT
19281                    || (!GET_CODE (x) != LABEL_REF
19282                        && (GET_CODE (x) != SYMBOL_REF
19283                            || !SYMBOL_REF_LOCAL_P (x)))))
19284         *total = 1;
19285       else
19286         *total = 0;
19287       return true;
19288
19289     case CONST_DOUBLE:
19290       if (mode == VOIDmode)
19291         *total = 0;
19292       else
19293         switch (standard_80387_constant_p (x))
19294           {
19295           case 1: /* 0.0 */
19296             *total = 1;
19297             break;
19298           default: /* Other constants */
19299             *total = 2;
19300             break;
19301           case 0:
19302           case -1:
19303             /* Start with (MEM (SYMBOL_REF)), since that's where
19304                it'll probably end up.  Add a penalty for size.  */
19305             *total = (COSTS_N_INSNS (1)
19306                       + (flag_pic != 0 && !TARGET_64BIT)
19307                       + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
19308             break;
19309           }
19310       return true;
19311
19312     case ZERO_EXTEND:
19313       /* The zero extensions is often completely free on x86_64, so make
19314          it as cheap as possible.  */
19315       if (TARGET_64BIT && mode == DImode
19316           && GET_MODE (XEXP (x, 0)) == SImode)
19317         *total = 1;
19318       else if (TARGET_ZERO_EXTEND_WITH_AND)
19319         *total = ix86_cost->add;
19320       else
19321         *total = ix86_cost->movzx;
19322       return false;
19323
19324     case SIGN_EXTEND:
19325       *total = ix86_cost->movsx;
19326       return false;
19327
19328     case ASHIFT:
19329       if (CONST_INT_P (XEXP (x, 1))
19330           && (GET_MODE (XEXP (x, 0)) != DImode || TARGET_64BIT))
19331         {
19332           HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
19333           if (value == 1)
19334             {
19335               *total = ix86_cost->add;
19336               return false;
19337             }
19338           if ((value == 2 || value == 3)
19339               && ix86_cost->lea <= ix86_cost->shift_const)
19340             {
19341               *total = ix86_cost->lea;
19342               return false;
19343             }
19344         }
19345       /* FALLTHRU */
19346
19347     case ROTATE:
19348     case ASHIFTRT:
19349     case LSHIFTRT:
19350     case ROTATERT:
19351       if (!TARGET_64BIT && GET_MODE (XEXP (x, 0)) == DImode)
19352         {
19353           if (CONST_INT_P (XEXP (x, 1)))
19354             {
19355               if (INTVAL (XEXP (x, 1)) > 32)
19356                 *total = ix86_cost->shift_const + COSTS_N_INSNS (2);
19357               else
19358                 *total = ix86_cost->shift_const * 2;
19359             }
19360           else
19361             {
19362               if (GET_CODE (XEXP (x, 1)) == AND)
19363                 *total = ix86_cost->shift_var * 2;
19364               else
19365                 *total = ix86_cost->shift_var * 6 + COSTS_N_INSNS (2);
19366             }
19367         }
19368       else
19369         {
19370           if (CONST_INT_P (XEXP (x, 1)))
19371             *total = ix86_cost->shift_const;
19372           else
19373             *total = ix86_cost->shift_var;
19374         }
19375       return false;
19376
19377     case MULT:
19378       if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
19379         {
19380           /* ??? SSE scalar cost should be used here.  */
19381           *total = ix86_cost->fmul;
19382           return false;
19383         }
19384       else if (X87_FLOAT_MODE_P (mode))
19385         {
19386           *total = ix86_cost->fmul;
19387           return false;
19388         }
19389       else if (FLOAT_MODE_P (mode))
19390         {
19391           /* ??? SSE vector cost should be used here.  */
19392           *total = ix86_cost->fmul;
19393           return false;
19394         }
19395       else
19396         {
19397           rtx op0 = XEXP (x, 0);
19398           rtx op1 = XEXP (x, 1);
19399           int nbits;
19400           if (CONST_INT_P (XEXP (x, 1)))
19401             {
19402               unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
19403               for (nbits = 0; value != 0; value &= value - 1)
19404                 nbits++;
19405             }
19406           else
19407             /* This is arbitrary.  */
19408             nbits = 7;
19409
19410           /* Compute costs correctly for widening multiplication.  */
19411           if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op1) == ZERO_EXTEND)
19412               && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
19413                  == GET_MODE_SIZE (mode))
19414             {
19415               int is_mulwiden = 0;
19416               enum machine_mode inner_mode = GET_MODE (op0);
19417
19418               if (GET_CODE (op0) == GET_CODE (op1))
19419                 is_mulwiden = 1, op1 = XEXP (op1, 0);
19420               else if (CONST_INT_P (op1))
19421                 {
19422                   if (GET_CODE (op0) == SIGN_EXTEND)
19423                     is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
19424                                   == INTVAL (op1);
19425                   else
19426                     is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
19427                 }
19428
19429               if (is_mulwiden)
19430                 op0 = XEXP (op0, 0), mode = GET_MODE (op0);
19431             }
19432
19433           *total = (ix86_cost->mult_init[MODE_INDEX (mode)]
19434                     + nbits * ix86_cost->mult_bit
19435                     + rtx_cost (op0, outer_code) + rtx_cost (op1, outer_code));
19436
19437           return true;
19438         }
19439
19440     case DIV:
19441     case UDIV:
19442     case MOD:
19443     case UMOD:
19444       if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
19445         /* ??? SSE cost should be used here.  */
19446         *total = ix86_cost->fdiv;
19447       else if (X87_FLOAT_MODE_P (mode))
19448         *total = ix86_cost->fdiv;
19449       else if (FLOAT_MODE_P (mode))
19450         /* ??? SSE vector cost should be used here.  */
19451         *total = ix86_cost->fdiv;
19452       else
19453         *total = ix86_cost->divide[MODE_INDEX (mode)];
19454       return false;
19455
19456     case PLUS:
19457       if (GET_MODE_CLASS (mode) == MODE_INT
19458                && GET_MODE_BITSIZE (mode) <= GET_MODE_BITSIZE (Pmode))
19459         {
19460           if (GET_CODE (XEXP (x, 0)) == PLUS
19461               && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
19462               && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
19463               && CONSTANT_P (XEXP (x, 1)))
19464             {
19465               HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
19466               if (val == 2 || val == 4 || val == 8)
19467                 {
19468                   *total = ix86_cost->lea;
19469                   *total += rtx_cost (XEXP (XEXP (x, 0), 1), outer_code);
19470                   *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
19471                                       outer_code);
19472                   *total += rtx_cost (XEXP (x, 1), outer_code);
19473                   return true;
19474                 }
19475             }
19476           else if (GET_CODE (XEXP (x, 0)) == MULT
19477                    && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
19478             {
19479               HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
19480               if (val == 2 || val == 4 || val == 8)
19481                 {
19482                   *total = ix86_cost->lea;
19483                   *total += rtx_cost (XEXP (XEXP (x, 0), 0), outer_code);
19484                   *total += rtx_cost (XEXP (x, 1), outer_code);
19485                   return true;
19486                 }
19487             }
19488           else if (GET_CODE (XEXP (x, 0)) == PLUS)
19489             {
19490               *total = ix86_cost->lea;
19491               *total += rtx_cost (XEXP (XEXP (x, 0), 0), outer_code);
19492               *total += rtx_cost (XEXP (XEXP (x, 0), 1), outer_code);
19493               *total += rtx_cost (XEXP (x, 1), outer_code);
19494               return true;
19495             }
19496         }
19497       /* FALLTHRU */
19498
19499     case MINUS:
19500       if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
19501         {
19502           /* ??? SSE cost should be used here.  */
19503           *total = ix86_cost->fadd;
19504           return false;
19505         }
19506       else if (X87_FLOAT_MODE_P (mode))
19507         {
19508           *total = ix86_cost->fadd;
19509           return false;
19510         }
19511       else if (FLOAT_MODE_P (mode))
19512         {
19513           /* ??? SSE vector cost should be used here.  */
19514           *total = ix86_cost->fadd;
19515           return false;
19516         }
19517       /* FALLTHRU */
19518
19519     case AND:
19520     case IOR:
19521     case XOR:
19522       if (!TARGET_64BIT && mode == DImode)
19523         {
19524           *total = (ix86_cost->add * 2
19525                     + (rtx_cost (XEXP (x, 0), outer_code)
19526                        << (GET_MODE (XEXP (x, 0)) != DImode))
19527                     + (rtx_cost (XEXP (x, 1), outer_code)
19528                        << (GET_MODE (XEXP (x, 1)) != DImode)));
19529           return true;
19530         }
19531       /* FALLTHRU */
19532
19533     case NEG:
19534       if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
19535         {
19536           /* ??? SSE cost should be used here.  */
19537           *total = ix86_cost->fchs;
19538           return false;
19539         }
19540       else if (X87_FLOAT_MODE_P (mode))
19541         {
19542           *total = ix86_cost->fchs;
19543           return false;
19544         }
19545       else if (FLOAT_MODE_P (mode))
19546         {
19547           /* ??? SSE vector cost should be used here.  */
19548           *total = ix86_cost->fchs;
19549           return false;
19550         }
19551       /* FALLTHRU */
19552
19553     case NOT:
19554       if (!TARGET_64BIT && mode == DImode)
19555         *total = ix86_cost->add * 2;
19556       else
19557         *total = ix86_cost->add;
19558       return false;
19559
19560     case COMPARE:
19561       if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
19562           && XEXP (XEXP (x, 0), 1) == const1_rtx
19563           && CONST_INT_P (XEXP (XEXP (x, 0), 2))
19564           && XEXP (x, 1) == const0_rtx)
19565         {
19566           /* This kind of construct is implemented using test[bwl].
19567              Treat it as if we had an AND.  */
19568           *total = (ix86_cost->add
19569                     + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code)
19570                     + rtx_cost (const1_rtx, outer_code));
19571           return true;
19572         }
19573       return false;
19574
19575     case FLOAT_EXTEND:
19576       if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
19577         *total = 0;
19578       return false;
19579
19580     case ABS:
19581       if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
19582         /* ??? SSE cost should be used here.  */
19583         *total = ix86_cost->fabs;
19584       else if (X87_FLOAT_MODE_P (mode))
19585         *total = ix86_cost->fabs;
19586       else if (FLOAT_MODE_P (mode))
19587         /* ??? SSE vector cost should be used here.  */
19588         *total = ix86_cost->fabs;
19589       return false;
19590
19591     case SQRT:
19592       if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
19593         /* ??? SSE cost should be used here.  */
19594         *total = ix86_cost->fsqrt;
19595       else if (X87_FLOAT_MODE_P (mode))
19596         *total = ix86_cost->fsqrt;
19597       else if (FLOAT_MODE_P (mode))
19598         /* ??? SSE vector cost should be used here.  */
19599         *total = ix86_cost->fsqrt;
19600       return false;
19601
19602     case UNSPEC:
19603       if (XINT (x, 1) == UNSPEC_TP)
19604         *total = 0;
19605       return false;
19606
19607     default:
19608       return false;
19609     }
19610 }
19611
19612 #if TARGET_MACHO
19613
19614 static int current_machopic_label_num;
19615
19616 /* Given a symbol name and its associated stub, write out the
19617    definition of the stub.  */
19618
19619 void
19620 machopic_output_stub (FILE *file, const char *symb, const char *stub)
19621 {
19622   unsigned int length;
19623   char *binder_name, *symbol_name, lazy_ptr_name[32];
19624   int label = ++current_machopic_label_num;
19625
19626   /* For 64-bit we shouldn't get here.  */
19627   gcc_assert (!TARGET_64BIT);
19628
19629   /* Lose our funky encoding stuff so it doesn't contaminate the stub.  */
19630   symb = (*targetm.strip_name_encoding) (symb);
19631
19632   length = strlen (stub);
19633   binder_name = alloca (length + 32);
19634   GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
19635
19636   length = strlen (symb);
19637   symbol_name = alloca (length + 32);
19638   GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
19639
19640   sprintf (lazy_ptr_name, "L%d$lz", label);
19641
19642   if (MACHOPIC_PURE)
19643     switch_to_section (darwin_sections[machopic_picsymbol_stub_section]);
19644   else
19645     switch_to_section (darwin_sections[machopic_symbol_stub_section]);
19646
19647   fprintf (file, "%s:\n", stub);
19648   fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
19649
19650   if (MACHOPIC_PURE)
19651     {
19652       fprintf (file, "\tcall\tLPC$%d\nLPC$%d:\tpopl\t%%eax\n", label, label);
19653       fprintf (file, "\tmovl\t%s-LPC$%d(%%eax),%%edx\n", lazy_ptr_name, label);
19654       fprintf (file, "\tjmp\t*%%edx\n");
19655     }
19656   else
19657     fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
19658
19659   fprintf (file, "%s:\n", binder_name);
19660
19661   if (MACHOPIC_PURE)
19662     {
19663       fprintf (file, "\tlea\t%s-LPC$%d(%%eax),%%eax\n", lazy_ptr_name, label);
19664       fprintf (file, "\tpushl\t%%eax\n");
19665     }
19666   else
19667     fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
19668
19669   fprintf (file, "\tjmp\tdyld_stub_binding_helper\n");
19670
19671   switch_to_section (darwin_sections[machopic_lazy_symbol_ptr_section]);
19672   fprintf (file, "%s:\n", lazy_ptr_name);
19673   fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
19674   fprintf (file, "\t.long %s\n", binder_name);
19675 }
19676
/* End-of-file processing for x86 Darwin: run the Darwin end-of-file
   code first, then the i386 one.  */

void
darwin_x86_file_end (void)
{
  darwin_file_end ();
  ix86_file_end ();
}
19683 #endif /* TARGET_MACHO */
19684
19685 /* Order the registers for register allocator.  */
19686
19687 void
19688 x86_order_regs_for_local_alloc (void)
19689 {
19690    int pos = 0;
19691    int i;
19692
19693    /* First allocate the local general purpose registers.  */
19694    for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
19695      if (GENERAL_REGNO_P (i) && call_used_regs[i])
19696         reg_alloc_order [pos++] = i;
19697
19698    /* Global general purpose registers.  */
19699    for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
19700      if (GENERAL_REGNO_P (i) && !call_used_regs[i])
19701         reg_alloc_order [pos++] = i;
19702
19703    /* x87 registers come first in case we are doing FP math
19704       using them.  */
19705    if (!TARGET_SSE_MATH)
19706      for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
19707        reg_alloc_order [pos++] = i;
19708
19709    /* SSE registers.  */
19710    for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
19711      reg_alloc_order [pos++] = i;
19712    for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
19713      reg_alloc_order [pos++] = i;
19714
19715    /* x87 registers.  */
19716    if (TARGET_SSE_MATH)
19717      for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
19718        reg_alloc_order [pos++] = i;
19719
19720    for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
19721      reg_alloc_order [pos++] = i;
19722
19723    /* Initialize the rest of array as we do not allocate some registers
19724       at all.  */
19725    while (pos < FIRST_PSEUDO_REGISTER)
19726      reg_alloc_order [pos++] = 0;
19727 }
19728
19729 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
19730    struct attribute_spec.handler.  */
19731 static tree
19732 ix86_handle_struct_attribute (tree *node, tree name,
19733                               tree args ATTRIBUTE_UNUSED,
19734                               int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
19735 {
19736   tree *type = NULL;
19737   if (DECL_P (*node))
19738     {
19739       if (TREE_CODE (*node) == TYPE_DECL)
19740         type = &TREE_TYPE (*node);
19741     }
19742   else
19743     type = node;
19744
19745   if (!(type && (TREE_CODE (*type) == RECORD_TYPE
19746                  || TREE_CODE (*type) == UNION_TYPE)))
19747     {
19748       warning (OPT_Wattributes, "%qs attribute ignored",
19749                IDENTIFIER_POINTER (name));
19750       *no_add_attrs = true;
19751     }
19752
19753   else if ((is_attribute_p ("ms_struct", name)
19754             && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
19755            || ((is_attribute_p ("gcc_struct", name)
19756                 && lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
19757     {
19758       warning (OPT_Wattributes, "%qs incompatible attribute ignored",
19759                IDENTIFIER_POINTER (name));
19760       *no_add_attrs = true;
19761     }
19762
19763   return NULL_TREE;
19764 }
19765
19766 static bool
19767 ix86_ms_bitfield_layout_p (tree record_type)
19768 {
19769   return (TARGET_MS_BITFIELD_LAYOUT &&
19770           !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
19771     || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type));
19772 }
19773
19774 /* Returns an expression indicating where the this parameter is
19775    located on entry to the FUNCTION.  */
19776
19777 static rtx
19778 x86_this_parameter (tree function)
19779 {
19780   tree type = TREE_TYPE (function);
19781   bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
19782
19783   if (TARGET_64BIT)
19784     {
19785       const int *parm_regs;
19786
19787       if (TARGET_64BIT_MS_ABI)
19788         parm_regs = x86_64_ms_abi_int_parameter_registers;
19789       else
19790         parm_regs = x86_64_int_parameter_registers;
19791       return gen_rtx_REG (DImode, parm_regs[aggr]);
19792     }
19793
19794   if (ix86_function_regparm (type, function) > 0
19795       && !type_has_variadic_args_p (type))
19796     {
19797       int regno = 0;
19798       if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type)))
19799         regno = 2;
19800       return gen_rtx_REG (SImode, regno);
19801     }
19802
19803   return gen_rtx_MEM (SImode, plus_constant (stack_pointer_rtx, aggr ? 8 : 4));
19804 }
19805
19806 /* Determine whether x86_output_mi_thunk can succeed.  */
19807
19808 static bool
19809 x86_can_output_mi_thunk (tree thunk ATTRIBUTE_UNUSED,
19810                          HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
19811                          HOST_WIDE_INT vcall_offset, tree function)
19812 {
19813   /* 64-bit can handle anything.  */
19814   if (TARGET_64BIT)
19815     return true;
19816
19817   /* For 32-bit, everything's fine if we have one free register.  */
19818   if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
19819     return true;
19820
19821   /* Need a free register for vcall_offset.  */
19822   if (vcall_offset)
19823     return false;
19824
19825   /* Need a free register for GOT references.  */
19826   if (flag_pic && !(*targetm.binds_local_p) (function))
19827     return false;
19828
19829   /* Otherwise ok.  */
19830   return true;
19831 }
19832
19833 /* Output the assembler code for a thunk function.  THUNK_DECL is the
19834    declaration for the thunk function itself, FUNCTION is the decl for
19835    the target function.  DELTA is an immediate constant offset to be
19836    added to THIS.  If VCALL_OFFSET is nonzero, the word at
19837    *(*this + vcall_offset) should be added to THIS.  */
19838
19839 static void
19840 x86_output_mi_thunk (FILE *file ATTRIBUTE_UNUSED,
19841                      tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
19842                      HOST_WIDE_INT vcall_offset, tree function)
19843 {
19844   rtx xops[3];
19845   rtx this = x86_this_parameter (function);
19846   rtx this_reg, tmp;
19847
19848   /* If VCALL_OFFSET, we'll need THIS in a register.  Might as well
19849      pull it in now and let DELTA benefit.  */
19850   if (REG_P (this))
19851     this_reg = this;
19852   else if (vcall_offset)
19853     {
19854       /* Put the this parameter into %eax.  */
19855       xops[0] = this;
19856       xops[1] = this_reg = gen_rtx_REG (Pmode, 0);
19857       output_asm_insn ("mov{l}\t{%0, %1|%1, %0}", xops);
19858     }
19859   else
19860     this_reg = NULL_RTX;
19861
19862   /* Adjust the this parameter by a fixed constant.  */
19863   if (delta)
19864     {
19865       xops[0] = GEN_INT (delta);
19866       xops[1] = this_reg ? this_reg : this;
19867       if (TARGET_64BIT)
19868         {
19869           if (!x86_64_general_operand (xops[0], DImode))
19870             {
19871               tmp = gen_rtx_REG (DImode, R10_REG);
19872               xops[1] = tmp;
19873               output_asm_insn ("mov{q}\t{%1, %0|%0, %1}", xops);
19874               xops[0] = tmp;
19875               xops[1] = this;
19876             }
19877           output_asm_insn ("add{q}\t{%0, %1|%1, %0}", xops);
19878         }
19879       else
19880         output_asm_insn ("add{l}\t{%0, %1|%1, %0}", xops);
19881     }
19882
19883   /* Adjust the this parameter by a value stored in the vtable.  */
19884   if (vcall_offset)
19885     {
19886       if (TARGET_64BIT)
19887         tmp = gen_rtx_REG (DImode, R10_REG);
19888       else
19889         {
19890           int tmp_regno = 2 /* ECX */;
19891           if (lookup_attribute ("fastcall",
19892                                 TYPE_ATTRIBUTES (TREE_TYPE (function))))
19893             tmp_regno = 0 /* EAX */;
19894           tmp = gen_rtx_REG (SImode, tmp_regno);
19895         }
19896
19897       xops[0] = gen_rtx_MEM (Pmode, this_reg);
19898       xops[1] = tmp;
19899       if (TARGET_64BIT)
19900         output_asm_insn ("mov{q}\t{%0, %1|%1, %0}", xops);
19901       else
19902         output_asm_insn ("mov{l}\t{%0, %1|%1, %0}", xops);
19903
19904       /* Adjust the this parameter.  */
19905       xops[0] = gen_rtx_MEM (Pmode, plus_constant (tmp, vcall_offset));
19906       if (TARGET_64BIT && !memory_operand (xops[0], Pmode))
19907         {
19908           rtx tmp2 = gen_rtx_REG (DImode, R11_REG);
19909           xops[0] = GEN_INT (vcall_offset);
19910           xops[1] = tmp2;
19911           output_asm_insn ("mov{q}\t{%0, %1|%1, %0}", xops);
19912           xops[0] = gen_rtx_MEM (Pmode, gen_rtx_PLUS (Pmode, tmp, tmp2));
19913         }
19914       xops[1] = this_reg;
19915       if (TARGET_64BIT)
19916         output_asm_insn ("add{q}\t{%0, %1|%1, %0}", xops);
19917       else
19918         output_asm_insn ("add{l}\t{%0, %1|%1, %0}", xops);
19919     }
19920
19921   /* If necessary, drop THIS back to its stack slot.  */
19922   if (this_reg && this_reg != this)
19923     {
19924       xops[0] = this_reg;
19925       xops[1] = this;
19926       output_asm_insn ("mov{l}\t{%0, %1|%1, %0}", xops);
19927     }
19928
19929   xops[0] = XEXP (DECL_RTL (function), 0);
19930   if (TARGET_64BIT)
19931     {
19932       if (!flag_pic || (*targetm.binds_local_p) (function))
19933         output_asm_insn ("jmp\t%P0", xops);
19934       /* All thunks should be in the same object as their target,
19935          and thus binds_local_p should be true.  */
19936       else if (TARGET_64BIT_MS_ABI)
19937         gcc_unreachable ();
19938       else
19939         {
19940           tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, xops[0]), UNSPEC_GOTPCREL);
19941           tmp = gen_rtx_CONST (Pmode, tmp);
19942           tmp = gen_rtx_MEM (QImode, tmp);
19943           xops[0] = tmp;
19944           output_asm_insn ("jmp\t%A0", xops);
19945         }
19946     }
19947   else
19948     {
19949       if (!flag_pic || (*targetm.binds_local_p) (function))
19950         output_asm_insn ("jmp\t%P0", xops);
19951       else
19952 #if TARGET_MACHO
19953         if (TARGET_MACHO)
19954           {
19955             rtx sym_ref = XEXP (DECL_RTL (function), 0);
19956             tmp = (gen_rtx_SYMBOL_REF
19957                    (Pmode,
19958                     machopic_indirection_name (sym_ref, /*stub_p=*/true)));
19959             tmp = gen_rtx_MEM (QImode, tmp);
19960             xops[0] = tmp;
19961             output_asm_insn ("jmp\t%0", xops);
19962           }
19963         else
19964 #endif /* TARGET_MACHO */
19965         {
19966           tmp = gen_rtx_REG (SImode, 2 /* ECX */);
19967           output_set_got (tmp, NULL_RTX);
19968
19969           xops[1] = tmp;
19970           output_asm_insn ("mov{l}\t{%0@GOT(%1), %1|%1, %0@GOT[%1]}", xops);
19971           output_asm_insn ("jmp\t{*}%1", xops);
19972         }
19973     }
19974 }
19975
19976 static void
19977 x86_file_start (void)
19978 {
19979   default_file_start ();
19980 #if TARGET_MACHO
19981   darwin_file_start ();
19982 #endif
19983   if (X86_FILE_START_VERSION_DIRECTIVE)
19984     fputs ("\t.version\t\"01.01\"\n", asm_out_file);
19985   if (X86_FILE_START_FLTUSED)
19986     fputs ("\t.global\t__fltused\n", asm_out_file);
19987   if (ix86_asm_dialect == ASM_INTEL)
19988     fputs ("\t.intel_syntax\n", asm_out_file);
19989 }
19990
19991 int
19992 x86_field_alignment (tree field, int computed)
19993 {
19994   enum machine_mode mode;
19995   tree type = TREE_TYPE (field);
19996
19997   if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
19998     return computed;
19999   mode = TYPE_MODE (TREE_CODE (type) == ARRAY_TYPE
20000                     ? get_inner_array_type (type) : type);
20001   if (mode == DFmode || mode == DCmode
20002       || GET_MODE_CLASS (mode) == MODE_INT
20003       || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
20004     return MIN (32, computed);
20005   return computed;
20006 }
20007
20008 /* Output assembler code to FILE to increment profiler label # LABELNO
20009    for profiling a function entry.  */
20010 void
20011 x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
20012 {
20013   if (TARGET_64BIT)
20014     {
20015 #ifndef NO_PROFILE_COUNTERS
20016       fprintf (file, "\tleaq\t%sP%d@(%%rip),%%r11\n", LPREFIX, labelno);
20017 #endif
20018
20019       if (!TARGET_64BIT_MS_ABI && flag_pic)
20020         fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", MCOUNT_NAME);
20021       else
20022         fprintf (file, "\tcall\t%s\n", MCOUNT_NAME);
20023     }
20024   else if (flag_pic)
20025     {
20026 #ifndef NO_PROFILE_COUNTERS
20027       fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%%s\n",
20028                LPREFIX, labelno, PROFILE_COUNT_REGISTER);
20029 #endif
20030       fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", MCOUNT_NAME);
20031     }
20032   else
20033     {
20034 #ifndef NO_PROFILE_COUNTERS
20035       fprintf (file, "\tmovl\t$%sP%d,%%%s\n", LPREFIX, labelno,
20036                PROFILE_COUNT_REGISTER);
20037 #endif
20038       fprintf (file, "\tcall\t%s\n", MCOUNT_NAME);
20039     }
20040 }
20041
20042 /* We don't have exact information about the insn sizes, but we may assume
20043    quite safely that we are informed about all 1 byte insns and memory
20044    address sizes.  This is enough to eliminate unnecessary padding in
20045    99% of cases.  */
20046
20047 static int
20048 min_insn_size (rtx insn)
20049 {
20050   int l = 0;
20051
20052   if (!INSN_P (insn) || !active_insn_p (insn))
20053     return 0;
20054
20055   /* Discard alignments we've emit and jump instructions.  */
20056   if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
20057       && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
20058     return 0;
20059   if (JUMP_P (insn)
20060       && (GET_CODE (PATTERN (insn)) == ADDR_VEC
20061           || GET_CODE (PATTERN (insn)) == ADDR_DIFF_VEC))
20062     return 0;
20063
20064   /* Important case - calls are always 5 bytes.
20065      It is common to have many calls in the row.  */
20066   if (CALL_P (insn)
20067       && symbolic_reference_mentioned_p (PATTERN (insn))
20068       && !SIBLING_CALL_P (insn))
20069     return 5;
20070   if (get_attr_length (insn) <= 1)
20071     return 1;
20072
20073   /* For normal instructions we may rely on the sizes of addresses
20074      and the presence of symbol to require 4 bytes of encoding.
20075      This is not the case for jumps where references are PC relative.  */
20076   if (!JUMP_P (insn))
20077     {
20078       l = get_attr_length_address (insn);
20079       if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
20080         l = 4;
20081     }
20082   if (l)
20083     return 1+l;
20084   else
20085     return 2;
20086 }
20087
20088 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in 16 byte
20089    window.  */
20090
20091 static void
20092 ix86_avoid_jump_misspredicts (void)
20093 {
20094   rtx insn, start = get_insns ();
20095   int nbytes = 0, njumps = 0;
20096   int isjump = 0;
20097
20098   /* Look for all minimal intervals of instructions containing 4 jumps.
20099      The intervals are bounded by START and INSN.  NBYTES is the total
20100      size of instructions in the interval including INSN and not including
20101      START.  When the NBYTES is smaller than 16 bytes, it is possible
20102      that the end of START and INSN ends up in the same 16byte page.
20103
20104      The smallest offset in the page INSN can start is the case where START
20105      ends on the offset 0.  Offset of INSN is then NBYTES - sizeof (INSN).
20106      We add p2align to 16byte window with maxskip 17 - NBYTES + sizeof (INSN).
20107      */
20108   for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
20109     {
20110
20111       nbytes += min_insn_size (insn);
20112       if (dump_file)
20113         fprintf(dump_file, "Insn %i estimated to %i bytes\n",
20114                 INSN_UID (insn), min_insn_size (insn));
20115       if ((JUMP_P (insn)
20116            && GET_CODE (PATTERN (insn)) != ADDR_VEC
20117            && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
20118           || CALL_P (insn))
20119         njumps++;
20120       else
20121         continue;
20122
20123       while (njumps > 3)
20124         {
20125           start = NEXT_INSN (start);
20126           if ((JUMP_P (start)
20127                && GET_CODE (PATTERN (start)) != ADDR_VEC
20128                && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
20129               || CALL_P (start))
20130             njumps--, isjump = 1;
20131           else
20132             isjump = 0;
20133           nbytes -= min_insn_size (start);
20134         }
20135       gcc_assert (njumps >= 0);
20136       if (dump_file)
20137         fprintf (dump_file, "Interval %i to %i has %i bytes\n",
20138                 INSN_UID (start), INSN_UID (insn), nbytes);
20139
20140       if (njumps == 3 && isjump && nbytes < 16)
20141         {
20142           int padsize = 15 - nbytes + min_insn_size (insn);
20143
20144           if (dump_file)
20145             fprintf (dump_file, "Padding insn %i by %i bytes!\n",
20146                      INSN_UID (insn), padsize);
20147           emit_insn_before (gen_align (GEN_INT (padsize)), insn);
20148         }
20149     }
20150 }
20151
20152 /* AMD Athlon works faster
20153    when RET is not destination of conditional jump or directly preceded
20154    by other jump instruction.  We avoid the penalty by inserting NOP just
20155    before the RET instructions in such cases.  */
20156 static void
20157 ix86_pad_returns (void)
20158 {
20159   edge e;
20160   edge_iterator ei;
20161
20162   FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
20163     {
20164       basic_block bb = e->src;
20165       rtx ret = BB_END (bb);
20166       rtx prev;
20167       bool replace = false;
20168
20169       if (!JUMP_P (ret) || GET_CODE (PATTERN (ret)) != RETURN
20170           || !maybe_hot_bb_p (bb))
20171         continue;
20172       for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
20173         if (active_insn_p (prev) || LABEL_P (prev))
20174           break;
20175       if (prev && LABEL_P (prev))
20176         {
20177           edge e;
20178           edge_iterator ei;
20179
20180           FOR_EACH_EDGE (e, ei, bb->preds)
20181             if (EDGE_FREQUENCY (e) && e->src->index >= 0
20182                 && !(e->flags & EDGE_FALLTHRU))
20183               replace = true;
20184         }
20185       if (!replace)
20186         {
20187           prev = prev_active_insn (ret);
20188           if (prev
20189               && ((JUMP_P (prev) && any_condjump_p (prev))
20190                   || CALL_P (prev)))
20191             replace = true;
20192           /* Empty functions get branch mispredict even when the jump destination
20193              is not visible to us.  */
20194           if (!prev && cfun->function_frequency > FUNCTION_FREQUENCY_UNLIKELY_EXECUTED)
20195             replace = true;
20196         }
20197       if (replace)
20198         {
20199           emit_insn_before (gen_return_internal_long (), ret);
20200           delete_insn (ret);
20201         }
20202     }
20203 }
20204
20205 /* Implement machine specific optimizations.  We implement padding of returns
20206    for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window.  */
20207 static void
20208 ix86_reorg (void)
20209 {
20210   if (TARGET_PAD_RETURNS && optimize && !optimize_size)
20211     ix86_pad_returns ();
20212   if (TARGET_FOUR_JUMP_LIMIT && optimize && !optimize_size)
20213     ix86_avoid_jump_misspredicts ();
20214 }
20215
20216 /* Return nonzero when QImode register that must be represented via REX prefix
20217    is used.  */
20218 bool
20219 x86_extended_QIreg_mentioned_p (rtx insn)
20220 {
20221   int i;
20222   extract_insn_cached (insn);
20223   for (i = 0; i < recog_data.n_operands; i++)
20224     if (REG_P (recog_data.operand[i])
20225         && REGNO (recog_data.operand[i]) >= 4)
20226        return true;
20227   return false;
20228 }
20229
20230 /* Return nonzero when P points to register encoded via REX prefix.
20231    Called via for_each_rtx.  */
20232 static int
20233 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
20234 {
20235    unsigned int regno;
20236    if (!REG_P (*p))
20237      return 0;
20238    regno = REGNO (*p);
20239    return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
20240 }
20241
20242 /* Return true when INSN mentions register that must be encoded using REX
20243    prefix.  */
20244 bool
20245 x86_extended_reg_mentioned_p (rtx insn)
20246 {
20247   return for_each_rtx (&PATTERN (insn), extended_reg_mentioned_1, NULL);
20248 }
20249
20250 /* Generate an unsigned DImode/SImode to FP conversion.  This is the same code
20251    optabs would emit if we didn't have TFmode patterns.  */
20252
20253 void
20254 x86_emit_floatuns (rtx operands[2])
20255 {
20256   rtx neglab, donelab, i0, i1, f0, in, out;
20257   enum machine_mode mode, inmode;
20258
20259   inmode = GET_MODE (operands[1]);
20260   gcc_assert (inmode == SImode || inmode == DImode);
20261
20262   out = operands[0];
20263   in = force_reg (inmode, operands[1]);
20264   mode = GET_MODE (out);
20265   neglab = gen_label_rtx ();
20266   donelab = gen_label_rtx ();
20267   f0 = gen_reg_rtx (mode);
20268
20269   emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
20270
20271   expand_float (out, in, 0);
20272
20273   emit_jump_insn (gen_jump (donelab));
20274   emit_barrier ();
20275
20276   emit_label (neglab);
20277
20278   i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
20279                             1, OPTAB_DIRECT);
20280   i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
20281                             1, OPTAB_DIRECT);
20282   i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
20283
20284   expand_float (f0, i0, 0);
20285
20286   emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
20287
20288   emit_label (donelab);
20289 }
20290 \f
20291 /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
20292    with all elements equal to VAR.  Return true if successful.  */
20293
20294 static bool
20295 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
20296                                    rtx target, rtx val)
20297 {
20298   enum machine_mode smode, wsmode, wvmode;
20299   rtx x;
20300
20301   switch (mode)
20302     {
20303     case V2SImode:
20304     case V2SFmode:
20305       if (!mmx_ok)
20306         return false;
20307       /* FALLTHRU */
20308
20309     case V2DFmode:
20310     case V2DImode:
20311     case V4SFmode:
20312     case V4SImode:
20313       val = force_reg (GET_MODE_INNER (mode), val);
20314       x = gen_rtx_VEC_DUPLICATE (mode, val);
20315       emit_insn (gen_rtx_SET (VOIDmode, target, x));
20316       return true;
20317
20318     case V4HImode:
20319       if (!mmx_ok)
20320         return false;
20321       if (TARGET_SSE || TARGET_3DNOW_A)
20322         {
20323           val = gen_lowpart (SImode, val);
20324           x = gen_rtx_TRUNCATE (HImode, val);
20325           x = gen_rtx_VEC_DUPLICATE (mode, x);
20326           emit_insn (gen_rtx_SET (VOIDmode, target, x));
20327           return true;
20328         }
20329       else
20330         {
20331           smode = HImode;
20332           wsmode = SImode;
20333           wvmode = V2SImode;
20334           goto widen;
20335         }
20336
20337     case V8QImode:
20338       if (!mmx_ok)
20339         return false;
20340       smode = QImode;
20341       wsmode = HImode;
20342       wvmode = V4HImode;
20343       goto widen;
20344     case V8HImode:
20345       if (TARGET_SSE2)
20346         {
20347           rtx tmp1, tmp2;
20348           /* Extend HImode to SImode using a paradoxical SUBREG.  */
20349           tmp1 = gen_reg_rtx (SImode);
20350           emit_move_insn (tmp1, gen_lowpart (SImode, val));
20351           /* Insert the SImode value as low element of V4SImode vector. */
20352           tmp2 = gen_reg_rtx (V4SImode);
20353           tmp1 = gen_rtx_VEC_MERGE (V4SImode,
20354                                     gen_rtx_VEC_DUPLICATE (V4SImode, tmp1),
20355                                     CONST0_RTX (V4SImode),
20356                                     const1_rtx);
20357           emit_insn (gen_rtx_SET (VOIDmode, tmp2, tmp1));
20358           /* Cast the V4SImode vector back to a V8HImode vector.  */
20359           tmp1 = gen_reg_rtx (V8HImode);
20360           emit_move_insn (tmp1, gen_lowpart (V8HImode, tmp2));
20361           /* Duplicate the low short through the whole low SImode word.  */
20362           emit_insn (gen_sse2_punpcklwd (tmp1, tmp1, tmp1));
20363           /* Cast the V8HImode vector back to a V4SImode vector.  */
20364           tmp2 = gen_reg_rtx (V4SImode);
20365           emit_move_insn (tmp2, gen_lowpart (V4SImode, tmp1));
20366           /* Replicate the low element of the V4SImode vector.  */
20367           emit_insn (gen_sse2_pshufd (tmp2, tmp2, const0_rtx));
20368           /* Cast the V2SImode back to V8HImode, and store in target.  */
20369           emit_move_insn (target, gen_lowpart (V8HImode, tmp2));
20370           return true;
20371         }
20372       smode = HImode;
20373       wsmode = SImode;
20374       wvmode = V4SImode;
20375       goto widen;
20376     case V16QImode:
20377       if (TARGET_SSE2)
20378         {
20379           rtx tmp1, tmp2;
20380           /* Extend QImode to SImode using a paradoxical SUBREG.  */
20381           tmp1 = gen_reg_rtx (SImode);
20382           emit_move_insn (tmp1, gen_lowpart (SImode, val));
20383           /* Insert the SImode value as low element of V4SImode vector. */
20384           tmp2 = gen_reg_rtx (V4SImode);
20385           tmp1 = gen_rtx_VEC_MERGE (V4SImode,
20386                                     gen_rtx_VEC_DUPLICATE (V4SImode, tmp1),
20387                                     CONST0_RTX (V4SImode),
20388                                     const1_rtx);
20389           emit_insn (gen_rtx_SET (VOIDmode, tmp2, tmp1));
20390           /* Cast the V4SImode vector back to a V16QImode vector.  */
20391           tmp1 = gen_reg_rtx (V16QImode);
20392           emit_move_insn (tmp1, gen_lowpart (V16QImode, tmp2));
20393           /* Duplicate the low byte through the whole low SImode word.  */
20394           emit_insn (gen_sse2_punpcklbw (tmp1, tmp1, tmp1));
20395           emit_insn (gen_sse2_punpcklbw (tmp1, tmp1, tmp1));
20396           /* Cast the V16QImode vector back to a V4SImode vector.  */
20397           tmp2 = gen_reg_rtx (V4SImode);
20398           emit_move_insn (tmp2, gen_lowpart (V4SImode, tmp1));
20399           /* Replicate the low element of the V4SImode vector.  */
20400           emit_insn (gen_sse2_pshufd (tmp2, tmp2, const0_rtx));
20401           /* Cast the V2SImode back to V16QImode, and store in target.  */
20402           emit_move_insn (target, gen_lowpart (V16QImode, tmp2));
20403           return true;
20404         }
20405       smode = QImode;
20406       wsmode = HImode;
20407       wvmode = V8HImode;
20408       goto widen;
20409     widen:
20410       /* Replicate the value once into the next wider mode and recurse.  */
20411       val = convert_modes (wsmode, smode, val, true);
20412       x = expand_simple_binop (wsmode, ASHIFT, val,
20413                                GEN_INT (GET_MODE_BITSIZE (smode)),
20414                                NULL_RTX, 1, OPTAB_LIB_WIDEN);
20415       val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
20416
20417       x = gen_reg_rtx (wvmode);
20418       if (!ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val))
20419         gcc_unreachable ();
20420       emit_move_insn (target, gen_lowpart (mode, x));
20421       return true;
20422
20423     default:
20424       return false;
20425     }
20426 }
20427
20428 /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
20429    whose ONE_VAR element is VAR, and other elements are zero.  Return true
20430    if successful.  */
20431
20432 static bool
20433 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
20434                                      rtx target, rtx var, int one_var)
20435 {
20436   enum machine_mode vsimode;
20437   rtx new_target;
20438   rtx x, tmp;
20439
20440   switch (mode)
20441     {
20442     case V2SFmode:
20443     case V2SImode:
20444       if (!mmx_ok)
20445         return false;
20446       /* FALLTHRU */
20447
20448     case V2DFmode:
20449     case V2DImode:
20450       if (one_var != 0)
20451         return false;
20452       var = force_reg (GET_MODE_INNER (mode), var);
20453       x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
20454       emit_insn (gen_rtx_SET (VOIDmode, target, x));
20455       return true;
20456
20457     case V4SFmode:
20458     case V4SImode:
20459       if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
20460         new_target = gen_reg_rtx (mode);
20461       else
20462         new_target = target;
20463       var = force_reg (GET_MODE_INNER (mode), var);
20464       x = gen_rtx_VEC_DUPLICATE (mode, var);
20465       x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
20466       emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
20467       if (one_var != 0)
20468         {
20469           /* We need to shuffle the value to the correct position, so
20470              create a new pseudo to store the intermediate result.  */
20471
20472           /* With SSE2, we can use the integer shuffle insns.  */
20473           if (mode != V4SFmode && TARGET_SSE2)
20474             {
20475               emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
20476                                             GEN_INT (1),
20477                                             GEN_INT (one_var == 1 ? 0 : 1),
20478                                             GEN_INT (one_var == 2 ? 0 : 1),
20479                                             GEN_INT (one_var == 3 ? 0 : 1)));
20480               if (target != new_target)
20481                 emit_move_insn (target, new_target);
20482               return true;
20483             }
20484
20485           /* Otherwise convert the intermediate result to V4SFmode and
20486              use the SSE1 shuffle instructions.  */
20487           if (mode != V4SFmode)
20488             {
20489               tmp = gen_reg_rtx (V4SFmode);
20490               emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
20491             }
20492           else
20493             tmp = new_target;
20494
20495           emit_insn (gen_sse_shufps_1 (tmp, tmp, tmp,
20496                                        GEN_INT (1),
20497                                        GEN_INT (one_var == 1 ? 0 : 1),
20498                                        GEN_INT (one_var == 2 ? 0+4 : 1+4),
20499                                        GEN_INT (one_var == 3 ? 0+4 : 1+4)));
20500
20501           if (mode != V4SFmode)
20502             emit_move_insn (target, gen_lowpart (V4SImode, tmp));
20503           else if (tmp != target)
20504             emit_move_insn (target, tmp);
20505         }
20506       else if (target != new_target)
20507         emit_move_insn (target, new_target);
20508       return true;
20509
20510     case V8HImode:
20511     case V16QImode:
20512       vsimode = V4SImode;
20513       goto widen;
20514     case V4HImode:
20515     case V8QImode:
20516       if (!mmx_ok)
20517         return false;
20518       vsimode = V2SImode;
20519       goto widen;
20520     widen:
20521       if (one_var != 0)
20522         return false;
20523
20524       /* Zero extend the variable element to SImode and recurse.  */
20525       var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
20526
20527       x = gen_reg_rtx (vsimode);
20528       if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
20529                                                 var, one_var))
20530         gcc_unreachable ();
20531
20532       emit_move_insn (target, gen_lowpart (mode, x));
20533       return true;
20534
20535     default:
20536       return false;
20537     }
20538 }
20539
20540 /* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
20541    consisting of the values in VALS.  It is known that all elements
20542    except ONE_VAR are constants.  Return true if successful.  */
20543
20544 static bool
20545 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
20546                                  rtx target, rtx vals, int one_var)
20547 {
20548   rtx var = XVECEXP (vals, 0, one_var);
20549   enum machine_mode wmode;
20550   rtx const_vec, x;
20551
20552   const_vec = copy_rtx (vals);
20553   XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
20554   const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
20555
20556   switch (mode)
20557     {
20558     case V2DFmode:
20559     case V2DImode:
20560     case V2SFmode:
20561     case V2SImode:
20562       /* For the two element vectors, it's just as easy to use
20563          the general case.  */
20564       return false;
20565
20566     case V4SFmode:
20567     case V4SImode:
20568     case V8HImode:
20569     case V4HImode:
20570       break;
20571
20572     case V16QImode:
20573       wmode = V8HImode;
20574       goto widen;
20575     case V8QImode:
20576       wmode = V4HImode;
20577       goto widen;
20578     widen:
20579       /* There's no way to set one QImode entry easily.  Combine
20580          the variable value with its adjacent constant value, and
20581          promote to an HImode set.  */
20582       x = XVECEXP (vals, 0, one_var ^ 1);
20583       if (one_var & 1)
20584         {
20585           var = convert_modes (HImode, QImode, var, true);
20586           var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
20587                                      NULL_RTX, 1, OPTAB_LIB_WIDEN);
20588           x = GEN_INT (INTVAL (x) & 0xff);
20589         }
20590       else
20591         {
20592           var = convert_modes (HImode, QImode, var, true);
20593           x = gen_int_mode (INTVAL (x) << 8, HImode);
20594         }
20595       if (x != const0_rtx)
20596         var = expand_simple_binop (HImode, IOR, var, x, var,
20597                                    1, OPTAB_LIB_WIDEN);
20598
20599       x = gen_reg_rtx (wmode);
20600       emit_move_insn (x, gen_lowpart (wmode, const_vec));
20601       ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
20602
20603       emit_move_insn (target, gen_lowpart (mode, x));
20604       return true;
20605
20606     default:
20607       return false;
20608     }
20609
20610   emit_move_insn (target, const_vec);
20611   ix86_expand_vector_set (mmx_ok, target, var, one_var);
20612   return true;
20613 }
20614
20615 /* A subroutine of ix86_expand_vector_init.  Handle the most general case:
20616    all values variable, and none identical.  */
20617
20618 static void
20619 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
20620                                  rtx target, rtx vals)
20621 {
20622   enum machine_mode half_mode = GET_MODE_INNER (mode);
20623   rtx op0 = NULL, op1 = NULL;
20624   bool use_vec_concat = false;
20625
20626   switch (mode)
20627     {
20628     case V2SFmode:
20629     case V2SImode:
20630       if (!mmx_ok && !TARGET_SSE)
20631         break;
20632       /* FALLTHRU */
20633
20634     case V2DFmode:
20635     case V2DImode:
20636       /* For the two element vectors, we always implement VEC_CONCAT.  */
20637       op0 = XVECEXP (vals, 0, 0);
20638       op1 = XVECEXP (vals, 0, 1);
20639       use_vec_concat = true;
20640       break;
20641
20642     case V4SFmode:
20643       half_mode = V2SFmode;
20644       goto half;
20645     case V4SImode:
20646       half_mode = V2SImode;
20647       goto half;
20648     half:
20649       {
20650         rtvec v;
20651
20652         /* For V4SF and V4SI, we implement a concat of two V2 vectors.
20653            Recurse to load the two halves.  */
20654
20655         op0 = gen_reg_rtx (half_mode);
20656         v = gen_rtvec (2, XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1));
20657         ix86_expand_vector_init (false, op0, gen_rtx_PARALLEL (half_mode, v));
20658
20659         op1 = gen_reg_rtx (half_mode);
20660         v = gen_rtvec (2, XVECEXP (vals, 0, 2), XVECEXP (vals, 0, 3));
20661         ix86_expand_vector_init (false, op1, gen_rtx_PARALLEL (half_mode, v));
20662
20663         use_vec_concat = true;
20664       }
20665       break;
20666
20667     case V8HImode:
20668     case V16QImode:
20669     case V4HImode:
20670     case V8QImode:
20671       break;
20672
20673     default:
20674       gcc_unreachable ();
20675     }
20676
20677   if (use_vec_concat)
20678     {
20679       if (!register_operand (op0, half_mode))
20680         op0 = force_reg (half_mode, op0);
20681       if (!register_operand (op1, half_mode))
20682         op1 = force_reg (half_mode, op1);
20683
20684       emit_insn (gen_rtx_SET (VOIDmode, target,
20685                               gen_rtx_VEC_CONCAT (mode, op0, op1)));
20686     }
20687   else
20688     {
20689       int i, j, n_elts, n_words, n_elt_per_word;
20690       enum machine_mode inner_mode;
20691       rtx words[4], shift;
20692
20693       inner_mode = GET_MODE_INNER (mode);
20694       n_elts = GET_MODE_NUNITS (mode);
20695       n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
20696       n_elt_per_word = n_elts / n_words;
20697       shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
20698
20699       for (i = 0; i < n_words; ++i)
20700         {
20701           rtx word = NULL_RTX;
20702
20703           for (j = 0; j < n_elt_per_word; ++j)
20704             {
20705               rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
20706               elt = convert_modes (word_mode, inner_mode, elt, true);
20707
20708               if (j == 0)
20709                 word = elt;
20710               else
20711                 {
20712                   word = expand_simple_binop (word_mode, ASHIFT, word, shift,
20713                                               word, 1, OPTAB_LIB_WIDEN);
20714                   word = expand_simple_binop (word_mode, IOR, word, elt,
20715                                               word, 1, OPTAB_LIB_WIDEN);
20716                 }
20717             }
20718
20719           words[i] = word;
20720         }
20721
20722       if (n_words == 1)
20723         emit_move_insn (target, gen_lowpart (mode, words[0]));
20724       else if (n_words == 2)
20725         {
20726           rtx tmp = gen_reg_rtx (mode);
20727           emit_insn (gen_rtx_CLOBBER (VOIDmode, tmp));
20728           emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
20729           emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
20730           emit_move_insn (target, tmp);
20731         }
20732       else if (n_words == 4)
20733         {
20734           rtx tmp = gen_reg_rtx (V4SImode);
20735           vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
20736           ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
20737           emit_move_insn (target, gen_lowpart (mode, tmp));
20738         }
20739       else
20740         gcc_unreachable ();
20741     }
20742 }
20743
20744 /* Initialize vector TARGET via VALS.  Suppress the use of MMX
20745    instructions unless MMX_OK is true.  */
20746
20747 void
20748 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
20749 {
20750   enum machine_mode mode = GET_MODE (target);
20751   enum machine_mode inner_mode = GET_MODE_INNER (mode);
20752   int n_elts = GET_MODE_NUNITS (mode);
20753   int n_var = 0, one_var = -1;
20754   bool all_same = true, all_const_zero = true;
20755   int i;
20756   rtx x;
20757
20758   for (i = 0; i < n_elts; ++i)
20759     {
20760       x = XVECEXP (vals, 0, i);
20761       if (!CONSTANT_P (x))
20762         n_var++, one_var = i;
20763       else if (x != CONST0_RTX (inner_mode))
20764         all_const_zero = false;
20765       if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
20766         all_same = false;
20767     }
20768
20769   /* Constants are best loaded from the constant pool.  */
20770   if (n_var == 0)
20771     {
20772       emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
20773       return;
20774     }
20775
20776   /* If all values are identical, broadcast the value.  */
20777   if (all_same
20778       && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
20779                                             XVECEXP (vals, 0, 0)))
20780     return;
20781
20782   /* Values where only one field is non-constant are best loaded from
20783      the pool and overwritten via move later.  */
20784   if (n_var == 1)
20785     {
20786       if (all_const_zero
20787           && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
20788                                                   XVECEXP (vals, 0, one_var),
20789                                                   one_var))
20790         return;
20791
20792       if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
20793         return;
20794     }
20795
20796   ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
20797 }
20798
20799 void
20800 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
20801 {
20802   enum machine_mode mode = GET_MODE (target);
20803   enum machine_mode inner_mode = GET_MODE_INNER (mode);
20804   bool use_vec_merge = false;
20805   rtx tmp;
20806
20807   switch (mode)
20808     {
20809     case V2SFmode:
20810     case V2SImode:
20811       if (mmx_ok)
20812         {
20813           tmp = gen_reg_rtx (GET_MODE_INNER (mode));
20814           ix86_expand_vector_extract (true, tmp, target, 1 - elt);
20815           if (elt == 0)
20816             tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
20817           else
20818             tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
20819           emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
20820           return;
20821         }
20822       break;
20823
20824     case V2DFmode:
20825     case V2DImode:
20826       {
20827         rtx op0, op1;
20828
20829         /* For the two element vectors, we implement a VEC_CONCAT with
20830            the extraction of the other element.  */
20831
20832         tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
20833         tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
20834
20835         if (elt == 0)
20836           op0 = val, op1 = tmp;
20837         else
20838           op0 = tmp, op1 = val;
20839
20840         tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
20841         emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
20842       }
20843       return;
20844
20845     case V4SFmode:
20846       switch (elt)
20847         {
20848         case 0:
20849           use_vec_merge = true;
20850           break;
20851
20852         case 1:
20853           /* tmp = target = A B C D */
20854           tmp = copy_to_reg (target);
20855           /* target = A A B B */
20856           emit_insn (gen_sse_unpcklps (target, target, target));
20857           /* target = X A B B */
20858           ix86_expand_vector_set (false, target, val, 0);
20859           /* target = A X C D  */
20860           emit_insn (gen_sse_shufps_1 (target, target, tmp,
20861                                        GEN_INT (1), GEN_INT (0),
20862                                        GEN_INT (2+4), GEN_INT (3+4)));
20863           return;
20864
20865         case 2:
20866           /* tmp = target = A B C D */
20867           tmp = copy_to_reg (target);
20868           /* tmp = X B C D */
20869           ix86_expand_vector_set (false, tmp, val, 0);
20870           /* target = A B X D */
20871           emit_insn (gen_sse_shufps_1 (target, target, tmp,
20872                                        GEN_INT (0), GEN_INT (1),
20873                                        GEN_INT (0+4), GEN_INT (3+4)));
20874           return;
20875
20876         case 3:
20877           /* tmp = target = A B C D */
20878           tmp = copy_to_reg (target);
20879           /* tmp = X B C D */
20880           ix86_expand_vector_set (false, tmp, val, 0);
20881           /* target = A B X D */
20882           emit_insn (gen_sse_shufps_1 (target, target, tmp,
20883                                        GEN_INT (0), GEN_INT (1),
20884                                        GEN_INT (2+4), GEN_INT (0+4)));
20885           return;
20886
20887         default:
20888           gcc_unreachable ();
20889         }
20890       break;
20891
20892     case V4SImode:
20893       /* Element 0 handled by vec_merge below.  */
20894       if (elt == 0)
20895         {
20896           use_vec_merge = true;
20897           break;
20898         }
20899
20900       if (TARGET_SSE2)
20901         {
20902           /* With SSE2, use integer shuffles to swap element 0 and ELT,
20903              store into element 0, then shuffle them back.  */
20904
20905           rtx order[4];
20906
20907           order[0] = GEN_INT (elt);
20908           order[1] = const1_rtx;
20909           order[2] = const2_rtx;
20910           order[3] = GEN_INT (3);
20911           order[elt] = const0_rtx;
20912
20913           emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
20914                                         order[1], order[2], order[3]));
20915
20916           ix86_expand_vector_set (false, target, val, 0);
20917
20918           emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
20919                                         order[1], order[2], order[3]));
20920         }
20921       else
20922         {
20923           /* For SSE1, we have to reuse the V4SF code.  */
20924           ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
20925                                   gen_lowpart (SFmode, val), elt);
20926         }
20927       return;
20928
20929     case V8HImode:
20930       use_vec_merge = TARGET_SSE2;
20931       break;
20932     case V4HImode:
20933       use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
20934       break;
20935
20936     case V16QImode:
20937     case V8QImode:
20938     default:
20939       break;
20940     }
20941
20942   if (use_vec_merge)
20943     {
20944       tmp = gen_rtx_VEC_DUPLICATE (mode, val);
20945       tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
20946       emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
20947     }
20948   else
20949     {
20950       rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
20951
20952       emit_move_insn (mem, target);
20953
20954       tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
20955       emit_move_insn (tmp, val);
20956
20957       emit_move_insn (target, mem);
20958     }
20959 }
20960
20961 void
20962 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
20963 {
        /* Emit insns that copy element ELT of the vector VEC into TARGET.
           Depending on the mode of VEC and on the enabled ISA this is done
           either with a single VEC_SELECT (for some modes after the wanted
           element has been shuffled so that element 0 can be selected) or
           by storing VEC to a stack temporary and loading the element
           back.  MMX_OK gates the VEC_SELECT path for V2SImode, V2SFmode
           and V4HImode.  */
20964   enum machine_mode mode = GET_MODE (vec);
20965   enum machine_mode inner_mode = GET_MODE_INNER (mode);
20966   bool use_vec_extr = false;
20967   rtx tmp;
20968
20969   switch (mode)
20970     {
20971     case V2SImode:
20972     case V2SFmode:
20973       if (!mmx_ok)
20974         break;
20975       /* FALLTHRU */
20976
20977     case V2DFmode:
20978     case V2DImode:
20979       use_vec_extr = true;
20980       break;
20981
20982     case V4SFmode:
            /* Build TMP (VEC itself for ELT 0, a shuffled copy otherwise)
               and then select element 0 of it.  */
20983       switch (elt)
20984         {
20985         case 0:
20986           tmp = vec;
20987           break;
20988
20989         case 1:
20990         case 3:
20991           tmp = gen_reg_rtx (mode);
20992           emit_insn (gen_sse_shufps_1 (tmp, vec, vec,
20993                                        GEN_INT (elt), GEN_INT (elt),
20994                                        GEN_INT (elt+4), GEN_INT (elt+4)));
20995           break;
20996
20997         case 2:
20998           tmp = gen_reg_rtx (mode);
20999           emit_insn (gen_sse_unpckhps (tmp, vec, vec));
21000           break;
21001
21002         default:
21003           gcc_unreachable ();
21004         }
21005       vec = tmp;
21006       use_vec_extr = true;
21007       elt = 0;
21008       break;
21009
21010     case V4SImode:
21011       if (TARGET_SSE2)
21012         {
                /* Same scheme as V4SFmode, using the integer shuffles.  */
21013           switch (elt)
21014             {
21015             case 0:
21016               tmp = vec;
21017               break;
21018
21019             case 1:
21020             case 3:
21021               tmp = gen_reg_rtx (mode);
21022               emit_insn (gen_sse2_pshufd_1 (tmp, vec,
21023                                             GEN_INT (elt), GEN_INT (elt),
21024                                             GEN_INT (elt), GEN_INT (elt)));
21025               break;
21026
21027             case 2:
21028               tmp = gen_reg_rtx (mode);
21029               emit_insn (gen_sse2_punpckhdq (tmp, vec, vec));
21030               break;
21031
21032             default:
21033               gcc_unreachable ();
21034             }
21035           vec = tmp;
21036           use_vec_extr = true;
21037           elt = 0;
21038         }
21039       else
21040         {
21041           /* For SSE1, we have to reuse the V4SF code.  */
21042           ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
21043                                       gen_lowpart (V4SFmode, vec), elt);
21044           return;
21045         }
21046       break;
21047
21048     case V8HImode:
21049       use_vec_extr = TARGET_SSE2;
21050       break;
21051     case V4HImode:
21052       use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
21053       break;
21054
21055     case V16QImode:
21056     case V8QImode:
21057       /* ??? Could extract the appropriate HImode element and shift.  */
21058     default:
21059       break;
21060     }
21061
21062   if (use_vec_extr)
21063     {
            /* TARGET = VEC[ELT], expressed as one VEC_SELECT.  */
21064       tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
21065       tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
21066
21067       /* Let the rtl optimizers know about the zero extension performed.  */
21068       if (inner_mode == HImode)
21069         {
21070           tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
21071           target = gen_lowpart (SImode, target);
21072         }
21073
21074       emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
21075     }
21076   else
21077     {
            /* Fallback: store VEC to a stack temporary and load the element
               found at byte offset ELT * GET_MODE_SIZE (INNER_MODE).  */
21078       rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
21079
21080       emit_move_insn (mem, vec);
21081
21082       tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
21083       emit_move_insn (target, tmp);
21084     }
21085 }
21086
21087 /* Expand a vector reduction on V4SFmode for SSE1.  FN is the binary
21088    pattern to reduce; DEST is the destination; IN is the input vector.
         FN is emitted twice: once on IN and a movhlps-permuted copy of IN,
         and once on that partial result and a shufps-permuted copy of it.
         NOTE(review): which element of DEST carries the final value is not
         visible here -- confirm against the callers.  */
21089
21090 void
21091 ix86_expand_reduc_v4sf (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
21092 {
21093   rtx tmp1, tmp2, tmp3;
21094
21095   tmp1 = gen_reg_rtx (V4SFmode);
21096   tmp2 = gen_reg_rtx (V4SFmode);
21097   tmp3 = gen_reg_rtx (V4SFmode);
21098
        /* First step: tmp2 = FN (movhlps (IN, IN), IN).  */
21099   emit_insn (gen_sse_movhlps (tmp1, in, in));
21100   emit_insn (fn (tmp2, tmp1, in));
21101
        /* Second step: DEST = FN (tmp2, tmp2 shuffled with the selectors
           1, 1, 1+4, 1+4).  */
21102   emit_insn (gen_sse_shufps_1 (tmp3, tmp2, tmp2,
21103                                GEN_INT (1), GEN_INT (1),
21104                                GEN_INT (1+4), GEN_INT (1+4)));
21105   emit_insn (fn (dest, tmp2, tmp3));
21106 }
21107 \f
21108 /* Target hook for scalar_mode_supported_p: every decimal floating-point
21109    MODE is accepted, any other mode is left to the default hook.  */
21110 static bool
21111 ix86_scalar_mode_supported_p (enum machine_mode mode)
21112 {
21113   bool supported = DECIMAL_FLOAT_MODE_P (mode) != 0;
21114
21115   return supported || default_scalar_mode_supported_p (mode);
21116 }
21117
21118 /* Implement the vector_mode_supported_p target hook.  MODE is accepted when
21119    an enabled extension (SSE, SSE2, MMX, 3DNow!) lists it as a valid mode.  */
21120 static bool
21121 ix86_vector_mode_supported_p (enum machine_mode mode)
21122 {
21123   bool supported;
21124
21125   /* One short-circuit chain, tested in the order SSE, SSE2, MMX, 3DNow!.  */
21126   supported = ((TARGET_SSE && VALID_SSE_REG_MODE (mode))
21127                || (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
21128                || (TARGET_MMX && VALID_MMX_REG_MODE (mode))
21129                || (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode)));
21130   return supported;
21131 }
21132
21133 /* Worker function for TARGET_MD_ASM_CLOBBERS.  Prepend "flags" and then
21134    "fpsr" to the CLOBBERS list and return the extended list; OUTPUTS and
21135    INPUTS are not examined.  The new i386 backend does this to maintain
21136    source compatibility with the old cc0-based compiler.  */
21137
21138 static tree
21139 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
21140                       tree inputs ATTRIBUTE_UNUSED,
21141                       tree clobbers)
21142 {
21143   tree with_flags, with_fpsr;
21144
21145   with_flags = tree_cons (NULL_TREE, build_string (5, "flags"), clobbers);
21146   with_fpsr = tree_cons (NULL_TREE, build_string (4, "fpsr"), with_flags);
21147   return with_fpsr;
21148 }
21149
21150 /* Implement targetm.asm.encode_section_info (not used by netware): after
21151    the default processing, flag large-data variables as far addresses.  */
21152
21153 static void ATTRIBUTE_UNUSED
21154 ix86_encode_section_info (tree decl, rtx rtl, int first)
21155 {
21156   default_encode_section_info (decl, rtl, first);
21157   if (TREE_CODE (decl) != VAR_DECL
21158       || !(TREE_STATIC (decl) || DECL_EXTERNAL (decl)))
21159     return;
21160   if (ix86_in_large_data_p (decl))
21161     SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
21162 }
21163
21164 /* Worker function for REVERSE_CONDITION: CCFPmode and CCFPUmode go through
21165    reverse_condition_maybe_unordered, all other modes reverse_condition.  */
21166 enum rtx_code
21167 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
21168 {
21169   if (mode == CCFPmode || mode == CCFPUmode)
21170     return reverse_condition_maybe_unordered (code);
21171   return reverse_condition (code);
21172 }
21173
21174 /* Return the assembler template for an x87 FP move of OPERANDS[1] into
21175    OPERANDS[0] done by INSN.  A popping form is chosen whenever INSN
21176    carries a REG_DEAD note for the source register.  */
21177 const char *
21178 output_387_reg_move (rtx insn, rtx *operands)
21179 {
21180   rtx dest = operands[0];
21181   rtx src = operands[1];
21182
21183   if (REG_P (dest))
21184     {
21185       bool src_dies = (REG_P (src)
21186                        && find_regno_note (insn, REG_DEAD, REGNO (src)));
21187
21188       /* A dying source is popped rather than copied.  */
21189       if (src_dies)
21190         return (REGNO (dest) == FIRST_STACK_REG
21191                 ? output_387_ffreep (operands, 0)
21192                 : "fstp\t%y0");
21193
21194       return STACK_TOP_P (dest) ? "fld%z1\t%y1" : "fst\t%y0";
21195     }
21196
21197   /* Only register and memory destinations are expected.  */
21198   if (!MEM_P (dest))
21199     gcc_unreachable ();
21200
21201   gcc_assert (REG_P (src));
21202   if (find_regno_note (insn, REG_DEAD, REGNO (src)))
21203     return "fstp%z0\t%y0";
21204
21205   /* There is no non-popping store to memory for XFmode, so when one
21206      is needed the popping store is followed by a reload.  */
21207   return (GET_MODE (dest) == XFmode
21208           ? "fstp%z0\t%y0\n\tfld%z0\t%y0"
21209           : "fst%z0\t%y0");
21210 }
21211
21212 /* Output code to perform a conditional jump to LABEL, if C2 flag in
21213    FP status register is set.  The status word is first stored into a
         fresh HImode pseudo with fnstsw.  It is then tested either through
         sahf followed by an UNORDERED test of the flags register, or by
         testing it against 0x04 and branching on a nonzero result.  The
         emitted jump is handed to predict_jump with a probability of 10%
         of REG_BR_PROB_BASE.  */
21214
21215 void
21216 ix86_emit_fp_unordered_jump (rtx label)
21217 {
21218   rtx reg = gen_reg_rtx (HImode);
21219   rtx temp;
21220
21221   emit_insn (gen_x86_fnstsw_1 (reg));
21222
        /* Use the sahf path when it is available and either preferred for
           the target or we are optimizing for size.  */
21223   if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_size))
21224     {
21225       emit_insn (gen_x86_sahf_1 (reg));
21226
21227       temp = gen_rtx_REG (CCmode, FLAGS_REG);
21228       temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
21229     }
21230   else
21231     {
            /* NOTE(review): 0x04 presumably selects the C2 bit within the
               byte tested by the testqi_ext pattern -- confirm against
               i386.md.  */
21232       emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
21233
21234       temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
21235       temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
21236     }
21237
        /* pc = TEMP ? LABEL : pc, i.e. fall through when TEMP is false.  */
21238   temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
21239                               gen_rtx_LABEL_REF (VOIDmode, label),
21240                               pc_rtx);
21241   temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
21242
21243   emit_jump_insn (temp);
21244   predict_jump (REG_BR_PROB_BASE * 10 / 100);
21245 }
21246
21247 /* Output code to perform a log1p XFmode calculation.
         The absolute value of OP1 is compared against a threshold of about
         0.2929: below it fyl2xp1 is applied to OP1 directly, otherwise
         fyl2x is applied to 1.0 + OP1.  In both cases the other input is
         the constant loaded with fldln2.
         NOTE(review): OP0 looks like the result and OP1 the argument,
         judging by the operand order of the fyl2x* patterns -- confirm.  */
21248
21249 void ix86_emit_i387_log1p (rtx op0, rtx op1)
21250 {
21251   rtx label1 = gen_label_rtx ();
21252   rtx label2 = gen_label_rtx ();
21253
21254   rtx tmp = gen_reg_rtx (XFmode);
21255   rtx tmp2 = gen_reg_rtx (XFmode);
21256
        /* if (fabs (op1) >= threshold) goto label1;  the threshold is
           numerically 1 - sqrt (2) / 2.  */
21257   emit_insn (gen_absxf2 (tmp, op1));
21258   emit_insn (gen_cmpxf (tmp,
21259     CONST_DOUBLE_FROM_REAL_VALUE (
21260        REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
21261        XFmode)));
21262   emit_jump_insn (gen_bge (label1));
21263
        /* Small argument: fyl2xp1 on OP1 itself, then skip the other arm.  */
21264   emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
21265   emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
21266   emit_jump (label2);
21267
        /* Large argument: form 1.0 + OP1 explicitly and use fyl2x.  */
21268   emit_label (label1);
21269   emit_move_insn (tmp, CONST1_RTX (XFmode));
21270   emit_insn (gen_addxf3 (tmp, op1, tmp));
21271   emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
21272   emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
21273
21274   emit_label (label2);
21275 }
21276
21277 /* Solaris implementation of TARGET_ASM_NAMED_SECTION.  Emit the directive
21278    for section NAME with FLAGS; DECL is only passed on to the default.  */
21279 static void ATTRIBUTE_UNUSED
21280 i386_solaris_elf_named_section (const char *name, unsigned int flags,
21281                                 tree decl)
21282 {
21283   const char *attrs;
21284
21285   if (!TARGET_64BIT || strcmp (name, ".eh_frame") != 0)
21286     {
21287       default_elf_asm_named_section (name, flags, decl);
21288       return;
21289     }
21290
21291   /* Binutils 2.15 wants "@unwind" on every ".eh_frame" occurrence.  */
21292   attrs = (flags & SECTION_WRITE) ? "aw" : "a";
21293   fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name, attrs);
21294 }
21295
21296 /* Return the mangled name of TYPE when it is an extended fundamental
21297    type, identified by its machine mode, or NULL for any other type.  */
21298 static const char *
21299 ix86_mangle_fundamental_type (tree type)
21300 {
21301   enum machine_mode mode = TYPE_MODE (type);
21302
21303   /* __float128 is "g".  */
21304   if (mode == TFmode)
21305     return "g";
21306
21307   /* "long double" or __float80 is "e".  */
21308   if (mode == XFmode)
21309     return "e";
21310
21311   return NULL;
21312 }
21313
21314 /* Pick the stack-protector failure call.  32-bit code saves the PIC
21315    register setup by going through the hidden __stack_chk_fail_local;
21316    64-bit code needs no PIC register, so it calls __stack_chk_fail
21317    directly.  */
21318
21319 static tree
21320 ix86_stack_protect_fail (void)
21321 {
21322   if (!TARGET_64BIT)
21323     return default_hidden_stack_protect_fail ();
21324   return default_external_stack_protect_fail ();
21325 }
21326
21327 /* Choose the DWARF encoding for pointers in exception handling data.
21328    CODE is 0 for data, 1 for code labels and 2 for function pointers;
21329    GLOBAL is true if the symbol may be affected by dynamic relocations.
21330
21331    ??? Every x86 object file format can represent this, since the
21332    relocation needed is the same one a call insn uses.  Whether a given
21333    assembler lets us write it down is something we will only find out
21334    by trying.  */
21335 int
21336 asm_preferred_eh_data_format (int code, int global)
21337 {
21338   int format;
21339
21340   if (!flag_pic)
21341     return (ix86_cmodel == CM_SMALL || (ix86_cmodel == CM_MEDIUM && code)
21342             ? DW_EH_PE_udata4 : DW_EH_PE_absptr);
21343
21344   format = (!TARGET_64BIT
21345             || ix86_cmodel == CM_SMALL_PIC
21346             || (ix86_cmodel == CM_MEDIUM_PIC && (global || code))
21347             ? DW_EH_PE_sdata4 : DW_EH_PE_sdata8);
21348   if (global)
21349     format |= DW_EH_PE_indirect;
21350   return format | DW_EH_PE_pcrel;
21351 }
21352 \f
21353 /* Expand copysign from SIGN to the positive value ABS_VALUE
21354    storing in RESULT.  If MASK is non-null, it shall be a mask to mask out
21355    the sign-bit.  The emitted code computes
           RESULT = ABS_VALUE | (SIGN & signbit-mask)
         in the mode of SIGN.  A non-null MASK is inverted with NOT before
         use; a null MASK makes the function build the mask itself.  */
21356 static void
21357 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
21358 {
21359   enum machine_mode mode = GET_MODE (sign);
21360   rtx sgn = gen_reg_rtx (mode);
21361   if (mask == NULL_RTX)
21362     {
            /* No mask supplied: obtain one from ix86_build_signbit_mask.  */
21363       mask = ix86_build_signbit_mask (mode, VECTOR_MODE_P (mode), false);
21364       if (!VECTOR_MODE_P (mode))
21365         {
21366           /* We need to generate a scalar mode mask in this case.  */
21367           rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
21368           tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
21369           mask = gen_reg_rtx (mode);
21370           emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
21371         }
21372     }
21373   else
          /* The caller's mask clears the sign bit; its complement is what
             is needed to isolate the sign.  */
21374     mask = gen_rtx_NOT (mode, mask);
        /* sgn = mask & sign;  result = abs_value | sgn.  */
21375   emit_insn (gen_rtx_SET (VOIDmode, sgn,
21376                           gen_rtx_AND (mode, mask, sign)));
21377   emit_insn (gen_rtx_SET (VOIDmode, result,
21378                           gen_rtx_IOR (mode, abs_value, sgn)));
21379 }
21380
21381 /* Expand fabs (OP0) and return a new rtx that holds the result.  The
21382    mask for masking out the sign-bit is stored in *SMASK, if that is
21383    non-null.  The result is computed as OP0 & mask into a fresh register
         of the mode of OP0; OP0 itself is not written.  */
21384 static rtx
21385 ix86_expand_sse_fabs (rtx op0, rtx *smask)
21386 {
21387   enum machine_mode mode = GET_MODE (op0);
21388   rtx xa, mask;
21389
21390   xa = gen_reg_rtx (mode);
21391   mask = ix86_build_signbit_mask (mode, VECTOR_MODE_P (mode), true);
21392   if (!VECTOR_MODE_P (mode))
21393     {
21394       /* We need to generate a scalar mode mask in this case.  */
21395       rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
21396       tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
21397       mask = gen_reg_rtx (mode);
21398       emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
21399     }
        /* xa = op0 & mask */
21400   emit_insn (gen_rtx_SET (VOIDmode, xa,
21401                           gen_rtx_AND (mode, op0, mask)));
21402
        /* Hand the mask back so callers can reuse it.  */
21403   if (smask)
21404     *smask = mask;
21405
21406   return xa;
21407 }
21408
21409 /* Emit a comparison of OP0 with OP1 (exchanged first when SWAP_OPERANDS
21410    is true) into the flags register in CCFPUmode, followed by a forward
21411    jump that is taken when comparison code CODE holds.  The jump targets
21412    a label created here; that label is returned without being emitted,
21413    so the caller has to place it.  */
21414 static rtx
21415 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
21416                                   bool swap_operands)
21417 {
21418   /* Apply SWAP_OPERANDS up front so the rest sees the final order.  */
21419   rtx lhs = swap_operands ? op1 : op0;
21420   rtx rhs = swap_operands ? op0 : op1;
21421   rtx label = gen_label_rtx ();
21422   rtx flags = gen_rtx_REG (CCFPUmode, FLAGS_REG);
21423   rtx cond, branch, jump;
21424
21425   /* Set the flags register from comparing LHS with RHS.  */
21426   emit_insn (gen_rtx_SET (VOIDmode, flags,
21427                           gen_rtx_COMPARE (CCFPUmode, lhs, rhs)));
21428
21429   /* Jump to LABEL when the flags satisfy CODE, else fall through.  */
21430   cond = gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
21431   branch = gen_rtx_IF_THEN_ELSE (VOIDmode, cond,
21432                                  gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
21433   jump = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, branch));
21434   JUMP_LABEL (jump) = label;
21435
21436   return label;
21437 }
21438
21439 /* Emit an SSE comparison of OP0 with OP1 under comparison code CODE that
21440    produces a mask, and return the fresh register receiving that mask.
21441    The two operands are exchanged first when SWAP_OPERANDS is true.  The
21442    mode is that of OP0; DFmode selects the SSE2 pattern and every other
21443    mode the SSE SFmode pattern.  */
21444 static rtx
21445 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
21446                               bool swap_operands)
21447 {
21448   /* MODE comes from OP0 as passed in, before any swap.  */
21449   enum machine_mode mode = GET_MODE (op0);
21450   rtx mask = gen_reg_rtx (mode);
21451   /* Operands in the order the comparison will see them.  */
21452   rtx lhs = swap_operands ? op1 : op0;
21453   rtx rhs = swap_operands ? op0 : op1;
21454   rtx cmp = gen_rtx_fmt_ee (code, mode, lhs, rhs);
21455   rtx insn;
21456
21457   if (mode == DFmode)
21458     insn = gen_sse2_maskcmpdf3 (mask, lhs, rhs, cmp);
21459   else
21460     insn = gen_sse_maskcmpsf3 (mask, lhs, rhs, cmp);
21461   emit_insn (insn);
21462
21463   return mask;
21464 }
21465
21466 /* Return an rtx of mode MODE, forced into a register, holding 2**n with
21467    n = 52 for DFmode and n = 23 otherwise (MODE must be DFmode or SFmode).  */
21468 static rtx
21469 ix86_gen_TWO52 (enum machine_mode mode)
21470 {
21471   int exponent = 52;
21472   REAL_VALUE_TYPE value;
21473
21474   if (mode != DFmode)
21475     exponent = 23;
21476   real_ldexp (&value, &dconst1, exponent);
21477
21478   return force_reg (mode, const_double_from_real_value (value, mode));
21479 }
21480
21481 /* Expand SSE sequence for computing lround from OP1 storing
21482    into OP0.  The floating-point mode is taken from OP1; the final
         conversion into OP0 is left to expand_fix.  */
21483 void
21484 ix86_expand_lround (rtx op0, rtx op1)
21485 {
21486   /* C code for the stuff we're doing below:
21487        tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
21488        return (long)tmp;
21489    */
21490   enum machine_mode mode = GET_MODE (op1);
21491   const struct real_format *fmt;
21492   REAL_VALUE_TYPE pred_half, half_minus_pred_half;
21493   rtx adj;
21494
21495   /* load nextafter (0.5, 0.0) */
        /* pred_half = 0.5 - 2**(-p - 1), with p = fmt->p of MODE's format
           (NOTE(review): p is presumably the significand precision in
           bits -- confirm in real.h).  */
21496   fmt = REAL_MODE_FORMAT (mode);
21497   real_2expN (&half_minus_pred_half, -(fmt->p) - 1);
21498   REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
21499
21500   /* adj = copysign (nextafter (0.5, 0.0), op1) */
21501   adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
21502   ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
21503
21504   /* adj = op1 + adj */
21505   adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
21506
21507   /* op0 = (imode)adj */
21508   expand_fix (op0, adj, 0);
21509 }
21510
21511 /* Expand SSE2 sequence for computing lfloor (DO_FLOOR true) or lceil
21512    (DO_FLOOR false) from OP1 storing into OP0.  The integer mode is
         that of OP0 and the floating-point mode that of OP1.  */
21513 void
21514 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
21515 {
21516   /* C code for the stuff we're doing below (for do_floor):
21517         xi = (long)op1;
21518         xi -= (double)xi > op1 ? 1 : 0;
21519         return xi;
           For !do_floor the comparison operands are exchanged and 1 is
           added instead of subtracted.
21520    */
21521   enum machine_mode fmode = GET_MODE (op1);
21522   enum machine_mode imode = GET_MODE (op0);
21523   rtx ireg, freg, label, tmp;
21524
21525   /* reg = (long)op1 */
21526   ireg = gen_reg_rtx (imode);
21527   expand_fix (ireg, op1, 0);
21528
21529   /* freg = (double)reg */
21530   freg = gen_reg_rtx (fmode);
21531   expand_float (freg, ireg, 0);
21532
21533   /* ireg = (freg > op1) ? ireg - 1 : ireg */
        /* The jump skips the adjustment when the UNLE test holds, so the
           adjustment runs only in the remaining case.  */
21534   label = ix86_expand_sse_compare_and_jump (UNLE,
21535                                             freg, op1, !do_floor);
21536   tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
21537                              ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
21538   emit_move_insn (ireg, tmp);
21539
21540   emit_label (label);
21541   LABEL_NUSES (label) = 1;
21542
21543   emit_move_insn (op0, ireg);
21544 }
21545
21546 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
21547    result in OPERAND0.  The mode is taken from OPERAND0; the value
         written as 2**52 in the sketch below is whatever ix86_gen_TWO52
         returns for that mode.  */
21548 void
21549 ix86_expand_rint (rtx operand0, rtx operand1)
21550 {
21551   /* C code for the stuff we're doing below:
21552         xa = fabs (operand1);
21553         if (!isless (xa, 2**52))
21554           return operand1;
21555         xa = xa + 2**52 - 2**52;
21556         return copysign (xa, operand1);
21557    */
21558   enum machine_mode mode = GET_MODE (operand0);
21559   rtx res, xa, label, TWO52, mask;
21560
        /* RES starts as a copy of the input, which is also the value
           stored when the early exit below is taken.  */
21561   res = gen_reg_rtx (mode);
21562   emit_move_insn (res, operand1);
21563
21564   /* xa = abs (operand1) */
21565   xa = ix86_expand_sse_fabs (res, &mask);
21566
21567   /* if (!isless (xa, TWO52)) goto label; */
21568   TWO52 = ix86_gen_TWO52 (mode);
21569   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
21570
        /* xa = xa + TWO52 - TWO52; */
21571   xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
21572   xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
21573
        /* res = copysign (xa, res) */
21574   ix86_sse_copysign_to_positive (res, xa, res, mask);
21575
21576   emit_label (label);
21577   LABEL_NUSES (label) = 1;
21578
21579   emit_move_insn (operand0, res);
21580 }
21581
21582 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
21583    into OPERAND0.  DO_FLOOR selects floor when true and ceil when false.
         This variant stays in floating point: it rounds with the
         TWO52 add/subtract step and then compensates, without converting
         through an integer register.  */
21584 void
21585 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
21586 {
21587   /* C code for the stuff we expand below.
21588         double xa = fabs (x), x2;
21589         if (!isless (xa, TWO52))
21590           return x;
21591         xa = xa + TWO52 - TWO52;
21592         x2 = copysign (xa, x);
21593      Compensate.  Floor:
21594         if (x2 > x)
21595           x2 -= 1;
21596      Compensate.  Ceil:
21597         if (x2 < x)
21598           x2 -= -1;
21599         return x2;
21600    */
21601   enum machine_mode mode = GET_MODE (operand0);
21602   rtx xa, TWO52, tmp, label, one, res, mask;
21603
21604   TWO52 = ix86_gen_TWO52 (mode);
21605
21606   /* Temporary for holding the result, initialized to the input
21607      operand to ease control flow.  */
21608   res = gen_reg_rtx (mode);
21609   emit_move_insn (res, operand1);
21610
21611   /* xa = abs (operand1) */
21612   xa = ix86_expand_sse_fabs (res, &mask);
21613
21614   /* if (!isless (xa, TWO52)) goto label; */
21615   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
21616
21617   /* xa = xa + TWO52 - TWO52; */
21618   xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
21619   xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
21620
21621   /* xa = copysign (xa, operand1) */
21622   ix86_sse_copysign_to_positive (xa, xa, res, mask);
21623
21624   /* generate 1.0 or -1.0 */
21625   one = force_reg (mode,
21626                    const_double_from_real_value (do_floor
21627                                                  ? dconst1 : dconstm1, mode));
21628
21629   /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
        /* The comparison mask is ANDed with ONE, giving ONE where the
           comparison held and all-zero bits elsewhere.  For ceil the
           comparison operands are exchanged and ONE is -1.0.  */
21630   tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
21631   emit_insn (gen_rtx_SET (VOIDmode, tmp,
21632                           gen_rtx_AND (mode, one, tmp)));
21633   /* We always need to subtract here to preserve signed zero.  */
21634   tmp = expand_simple_binop (mode, MINUS,
21635                              xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
21636   emit_move_insn (res, tmp);
21637
21638   emit_label (label);
21639   LABEL_NUSES (label) = 1;
21640
21641   emit_move_insn (operand0, res);
21642 }
21643
21644 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
21645    into OPERAND0.  DO_FLOOR selects floor when true and ceil when false.
         This variant truncates by converting to an integer register
         (DImode for DFmode, SImode otherwise) and back, then compensates.  */
21646 void
21647 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
21648 {
21649   /* C code for the stuff we expand below.
21650         double xa = fabs (x), x2;
21651         if (!isless (xa, TWO52))
21652           return x;
21653         x2 = (double)(long)x;
21654      Compensate.  Floor:
21655         if (x2 > x)
21656           x2 -= 1;
21657      Compensate.  Ceil:
21658         if (x2 < x)
21659           x2 += 1;
21660         if (HONOR_SIGNED_ZEROS (mode))
21661           return copysign (x2, x);
21662         return x2;
21663    */
21664   enum machine_mode mode = GET_MODE (operand0);
21665   rtx xa, xi, TWO52, tmp, label, one, res, mask;
21666
21667   TWO52 = ix86_gen_TWO52 (mode);
21668
21669   /* Temporary for holding the result, initialized to the input
21670      operand to ease control flow.  */
21671   res = gen_reg_rtx (mode);
21672   emit_move_insn (res, operand1);
21673
21674   /* xa = abs (operand1) */
21675   xa = ix86_expand_sse_fabs (res, &mask);
21676
21677   /* if (!isless (xa, TWO52)) goto label; */
21678   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
21679
21680   /* xa = (double)(long)x */
        /* From here on XA no longer holds the absolute value; its
           register is reused for the truncated input.  */
21681   xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
21682   expand_fix (xi, res, 0);
21683   expand_float (xa, xi, 0);
21684
21685   /* generate 1.0 */
21686   one = force_reg (mode, const_double_from_real_value (dconst1, mode));
21687
21688   /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
        /* For ceil the comparison operands are exchanged and the masked
           1.0 is added instead of subtracted.  */
21689   tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
21690   emit_insn (gen_rtx_SET (VOIDmode, tmp,
21691                           gen_rtx_AND (mode, one, tmp)));
21692   tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
21693                              xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
21694   emit_move_insn (res, tmp);
21695
        /* OR the sign bit of the input back into the result.  */
21696   if (HONOR_SIGNED_ZEROS (mode))
21697     ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
21698
21699   emit_label (label);
21700   LABEL_NUSES (label) = 1;
21701
21702   emit_move_insn (operand0, res);
21703 }
21704
21705 /* Expand SSE sequence for computing round from OPERAND1 storing into
21706    OPERAND0, both of the same mode.  The sequence does not rely on DImode
21707    truncation via cvttsd2siq, which is only available on 64bit targets.  */
21708 void
21709 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
21710 {
21711   /* C code for the stuff we expand below.
21712         double xa = fabs (x), xa2, x2;
21713         if (!isless (xa, TWO52))
21714           return x;
21715      Using the absolute value and copying back sign makes
21716      -0.0 -> -0.0 correct.
21717         xa2 = xa + TWO52 - TWO52;
21718      Compensate.
21719         dxa = xa2 - xa;
21720         if (dxa <= -0.5)
21721           xa2 += 1;
21722         else if (dxa > 0.5)
21723           xa2 -= 1;
21724         x2 = copysign (xa2, x);
21725         return x2;
21726    */
21727   enum machine_mode mode = GET_MODE (operand0);
21728   rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
21729
21730   TWO52 = ix86_gen_TWO52 (mode);
21731
21732   /* Temporary for holding the result, initialized to the input
21733      operand to ease control flow.  */
21734   res = gen_reg_rtx (mode);
21735   emit_move_insn (res, operand1);
21736
21737   /* xa = abs (operand1) */
21738   xa = ix86_expand_sse_fabs (res, &mask);
21739
21740   /* if (!isless (xa, TWO52)) goto label; */
21741   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
21742
21743   /* xa2 = xa + TWO52 - TWO52; */
21744   xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
21745   xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
21746
21747   /* dxa = xa2 - xa; */
21748   dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
21749
21750   /* generate 0.5, 1.0 and -0.5 */
21751   half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
21752   one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
21753   mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
21754                                0, OPTAB_DIRECT);
21755
21756   /* Compensate.  ix86_expand_sse_compare_mask allocates the mask
21757      register itself, so no scratch pseudo is created here.  */
21758   /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
21759   tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
21760   emit_insn (gen_rtx_SET (VOIDmode, tmp,
21761                           gen_rtx_AND (mode, one, tmp)));
21762   xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
21763   /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
21764   tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
21765   emit_insn (gen_rtx_SET (VOIDmode, tmp,
21766                           gen_rtx_AND (mode, one, tmp)));
21767   xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
21768
21769   /* res = copysign (xa2, operand1) */
21770   ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
21771
21772   emit_label (label);
21773   LABEL_NUSES (label) = 1;
21774
21775   emit_move_insn (operand0, res);
21776 }
21777
21778 /* Expand SSE sequence for computing trunc from OPERAND1 storing
21779    into OPERAND0.  This variant truncates by converting to an integer
         register (DImode for DFmode, SImode otherwise) and back.  */
21780 void
21781 ix86_expand_trunc (rtx operand0, rtx operand1)
21782 {
21783   /* C code for SSE variant we expand below.
21784         double xa = fabs (x), x2;
21785         if (!isless (xa, TWO52))
21786           return x;
21787         x2 = (double)(long)x;
21788         if (HONOR_SIGNED_ZEROS (mode))
21789           return copysign (x2, x);
21790         return x2;
21791    */
21792   enum machine_mode mode = GET_MODE (operand0);
21793   rtx xa, xi, TWO52, label, res, mask;
21794
21795   TWO52 = ix86_gen_TWO52 (mode);
21796
21797   /* Temporary for holding the result, initialized to the input
21798      operand to ease control flow.  */
21799   res = gen_reg_rtx (mode);
21800   emit_move_insn (res, operand1);
21801
21802   /* xa = abs (operand1) */
21803   xa = ix86_expand_sse_fabs (res, &mask);
21804
21805   /* if (!isless (xa, TWO52)) goto label; */
21806   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
21807
21808   /* x = (double)(long)x */
21809   xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
21810   expand_fix (xi, res, 0);
21811   expand_float (res, xi, 0);
21812
        /* OR the sign bit of the input back into the result.  */
21813   if (HONOR_SIGNED_ZEROS (mode))
21814     ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
21815
21816   emit_label (label);
21817   LABEL_NUSES (label) = 1;
21818
21819   emit_move_insn (operand0, res);
21820 }
21821
21822 /* Expand SSE sequence for computing trunc from OPERAND1 storing
21823    into OPERAND0.  This variant stays in floating point: it rounds the
         absolute value with the TWO52 add/subtract step, subtracts 1.0
         when that rounded up, and copies the sign of the input back.  */
21824 void
21825 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
21826 {
21827   enum machine_mode mode = GET_MODE (operand0);
21828   rtx xa, mask, TWO52, label, one, res, smask, tmp;
21829
21830   /* C code for SSE variant we expand below.
21831         double xa = fabs (x), x2;
21832         if (!isless (xa, TWO52))
21833           return x;
21834         xa2 = xa + TWO52 - TWO52;
21835      Compensate:
21836         if (xa2 > xa)
21837           xa2 -= 1.0;
21838         x2 = copysign (xa2, x);
21839         return x2;
21840    */
21841
21842   TWO52 = ix86_gen_TWO52 (mode);
21843
21844   /* Temporary for holding the result, initialized to the input
21845      operand to ease control flow.  */
21846   res = gen_reg_rtx (mode);
21847   emit_move_insn (res, operand1);
21848
21849   /* xa = abs (operand1) */
        /* SMASK receives the sign-bit mask from fabs; MASK is used
           further down for the comparison result.  */
21850   xa = ix86_expand_sse_fabs (res, &smask);
21851
21852   /* if (!isless (xa, TWO52)) goto label; */
21853   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
21854
21855   /* res = xa + TWO52 - TWO52; */
21856   tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
21857   tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
21858   emit_move_insn (res, tmp);
21859
21860   /* generate 1.0 */
21861   one = force_reg (mode, const_double_from_real_value (dconst1, mode));
21862
21863   /* Compensate: res = xa2 - (res > xa ? 1 : 0)  */
21864   mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
21865   emit_insn (gen_rtx_SET (VOIDmode, mask,
21866                           gen_rtx_AND (mode, mask, one)));
21867   tmp = expand_simple_binop (mode, MINUS,
21868                              res, mask, NULL_RTX, 0, OPTAB_DIRECT);
21869   emit_move_insn (res, tmp);
21870
21871   /* res = copysign (res, operand1) */
21872   ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
21873
21874   emit_label (label);
21875   LABEL_NUSES (label) = 1;
21876
21877   emit_move_insn (operand0, res);
21878 }
21879
21880 /* Expand SSE sequence for computing round from OPERAND1 storing
21881    into OPERAND0.  The mode is taken from OPERAND0.  */
21882 void
21883 ix86_expand_round (rtx operand0, rtx operand1)
21884 {
21885   /* C code for the stuff we're doing below:
21886         double xa = fabs (x);
21887         if (!isless (xa, TWO52))
21888           return x;
21889         xa = (double)(long)(xa + nextafter (0.5, 0.0));
21890         return copysign (xa, x);
21891    */
21892   enum machine_mode mode = GET_MODE (operand0);
21893   rtx res, TWO52, xa, label, xi, half, mask;
21894   const struct real_format *fmt;
21895   REAL_VALUE_TYPE pred_half, half_minus_pred_half;
21896
21897   /* Temporary for holding the result, initialized to the input
21898      operand to ease control flow.  */
21899   res = gen_reg_rtx (mode);
21900   emit_move_insn (res, operand1);
21901
21902   TWO52 = ix86_gen_TWO52 (mode);
21903   xa = ix86_expand_sse_fabs (res, &mask);
21904   label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
21905
21906   /* load nextafter (0.5, 0.0), built as dconsthalf minus real_2expN (-(fmt->p) - 1) */
21907   fmt = REAL_MODE_FORMAT (mode);
21908   real_2expN (&half_minus_pred_half, -(fmt->p) - 1);
21909   REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
21910
21911   /* xa = xa + pred_half (the nextafter (0.5, 0.0) value loaded above, not 0.5 itself) */
21912   half = force_reg (mode, const_double_from_real_value (pred_half, mode));
21913   xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
21914
21915   /* xa = (double)(int64_t)xa; the integer temporary is DImode for DFmode and SImode otherwise */
21916   xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
21917   expand_fix (xi, xa, 0);
21918   expand_float (xa, xi, 0);
21919
21920   /* res = copysign (xa, operand1) */
21921   ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
21922
21923   emit_label (label);  /* Join point for the early-out jump above.  */
21924   LABEL_NUSES (label) = 1;
21925
21926   emit_move_insn (operand0, res);
21927 }
21928
21929 \f
21930 /* Table of valid machine attributes.  The last entry has a NULL name and NULL handler; the table is installed through TARGET_ATTRIBUTE_TABLE.  */
21931 static const struct attribute_spec ix86_attribute_table[] =
21932 {
21933   /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler } */
21934   /* Stdcall attribute says callee is responsible for popping arguments
21935      if they are not variable.  */
21936   { "stdcall",   0, 0, false, true,  true,  ix86_handle_cconv_attribute },
21937   /* Fastcall attribute says callee is responsible for popping arguments
21938      if they are not variable.  */
21939   { "fastcall",  0, 0, false, true,  true,  ix86_handle_cconv_attribute },
21940   /* Cdecl attribute says the callee is a normal C declaration.  */
21941   { "cdecl",     0, 0, false, true,  true,  ix86_handle_cconv_attribute },
21942   /* Regparm attribute specifies how many integer arguments are to be
21943      passed in registers.  It takes exactly one argument.  */
21944   { "regparm",   1, 1, false, true,  true,  ix86_handle_cconv_attribute },
21945   /* Sseregparm attribute says we are using x86_64 calling conventions
21946      for FP arguments.  */
21947   { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute },
21948   /* force_align_arg_pointer says this function realigns the stack at entry.  The name comes from ix86_force_align_arg_pointer_string rather than a literal.  */
21949   { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
21950     false, true,  true, ix86_handle_cconv_attribute },
21951 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
21952   { "dllimport", 0, 0, false, false, false, handle_dll_attribute },
21953   { "dllexport", 0, 0, false, false, false, handle_dll_attribute },
21954   { "shared",    0, 0, true,  false, false, ix86_handle_shared_attribute },
21955 #endif /* TARGET_DLLIMPORT_DECL_ATTRIBUTES */
21956   { "ms_struct", 0, 0, false, false,  false, ix86_handle_struct_attribute },
21957   { "gcc_struct", 0, 0, false, false,  false, ix86_handle_struct_attribute },
21958 #ifdef SUBTARGET_ATTRIBUTE_TABLE
21959   SUBTARGET_ATTRIBUTE_TABLE,
21960 #endif /* SUBTARGET_ATTRIBUTE_TABLE */
21961   { NULL,        0, 0, false, false, false, NULL }
21962 };
21963
21964 /* Initialize the GCC target structure.  Each hook macro below is #undef'd and redefined to its i386 value before TARGET_INITIALIZER is expanded into targetm at the end of this section.  */
21965 #undef TARGET_ATTRIBUTE_TABLE
21966 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
21967 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
21968 #  undef TARGET_MERGE_DECL_ATTRIBUTES
21969 #  define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
21970 #endif /* TARGET_DLLIMPORT_DECL_ATTRIBUTES */
21971
21972 #undef TARGET_COMP_TYPE_ATTRIBUTES
21973 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
21974
21975 #undef TARGET_INIT_BUILTINS
21976 #define TARGET_INIT_BUILTINS ix86_init_builtins
21977 #undef TARGET_EXPAND_BUILTIN
21978 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
21979
21980 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
21981 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION ix86_builtin_vectorized_function
21982 #undef TARGET_VECTORIZE_BUILTIN_CONVERSION
21983 #define TARGET_VECTORIZE_BUILTIN_CONVERSION ix86_builtin_conversion
21984
21985 #undef TARGET_ASM_FUNCTION_EPILOGUE
21986 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
21987
21988 #undef TARGET_ENCODE_SECTION_INFO
21989 #ifndef SUBTARGET_ENCODE_SECTION_INFO
21990 #define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
21991 #else
21992 #define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
21993 #endif /* SUBTARGET_ENCODE_SECTION_INFO */
21994
21995 #undef TARGET_ASM_OPEN_PAREN
21996 #define TARGET_ASM_OPEN_PAREN ""
21997 #undef TARGET_ASM_CLOSE_PAREN
21998 #define TARGET_ASM_CLOSE_PAREN ""
21999
22000 #undef TARGET_ASM_ALIGNED_HI_OP
22001 #define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
22002 #undef TARGET_ASM_ALIGNED_SI_OP
22003 #define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
22004 #ifdef ASM_QUAD
22005 #undef TARGET_ASM_ALIGNED_DI_OP
22006 #define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
22007 #endif /* ASM_QUAD */
22008
22009 #undef TARGET_ASM_UNALIGNED_HI_OP
22010 #define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
22011 #undef TARGET_ASM_UNALIGNED_SI_OP
22012 #define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
22013 #undef TARGET_ASM_UNALIGNED_DI_OP
22014 #define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP
22015
22016 #undef TARGET_SCHED_ADJUST_COST
22017 #define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
22018 #undef TARGET_SCHED_ISSUE_RATE
22019 #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
22020 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
22021 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
22022   ia32_multipass_dfa_lookahead
22023
22024 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
22025 #define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall
22026
22027 #ifdef HAVE_AS_TLS
22028 #undef TARGET_HAVE_TLS
22029 #define TARGET_HAVE_TLS true
22030 #endif /* HAVE_AS_TLS */
22031 #undef TARGET_CANNOT_FORCE_CONST_MEM
22032 #define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
22033 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
22034 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_rtx_true
22035
22036 #undef TARGET_DELEGITIMIZE_ADDRESS
22037 #define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address
22038
22039 #undef TARGET_MS_BITFIELD_LAYOUT_P
22040 #define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
22041
22042 #if TARGET_MACHO
22043 #undef TARGET_BINDS_LOCAL_P
22044 #define TARGET_BINDS_LOCAL_P darwin_binds_local_p
22045 #endif /* TARGET_MACHO */
22046 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
22047 #undef TARGET_BINDS_LOCAL_P
22048 #define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
22049 #endif /* TARGET_DLLIMPORT_DECL_ATTRIBUTES; this definition replaces the TARGET_MACHO one if both conditions hold */
22050
22051 #undef TARGET_ASM_OUTPUT_MI_THUNK
22052 #define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
22053 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
22054 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk
22055
22056 #undef TARGET_ASM_FILE_START
22057 #define TARGET_ASM_FILE_START x86_file_start
22058
22059 #undef TARGET_DEFAULT_TARGET_FLAGS
22060 #define TARGET_DEFAULT_TARGET_FLAGS     \
22061   (TARGET_DEFAULT                       \
22062    | TARGET_64BIT_DEFAULT               \
22063    | TARGET_SUBTARGET_DEFAULT           \
22064    | TARGET_TLS_DIRECT_SEG_REFS_DEFAULT)
22065
22066 #undef TARGET_HANDLE_OPTION
22067 #define TARGET_HANDLE_OPTION ix86_handle_option
22068
22069 #undef TARGET_RTX_COSTS
22070 #define TARGET_RTX_COSTS ix86_rtx_costs
22071 #undef TARGET_ADDRESS_COST
22072 #define TARGET_ADDRESS_COST ix86_address_cost
22073
22074 #undef TARGET_FIXED_CONDITION_CODE_REGS
22075 #define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
22076 #undef TARGET_CC_MODES_COMPATIBLE
22077 #define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible
22078
22079 #undef TARGET_MACHINE_DEPENDENT_REORG
22080 #define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg
22081
22082 #undef TARGET_BUILD_BUILTIN_VA_LIST
22083 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
22084
22085 #undef TARGET_MD_ASM_CLOBBERS
22086 #define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers
22087
22088 #undef TARGET_PROMOTE_PROTOTYPES
22089 #define TARGET_PROMOTE_PROTOTYPES hook_bool_tree_true
22090 #undef TARGET_STRUCT_VALUE_RTX
22091 #define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
22092 #undef TARGET_SETUP_INCOMING_VARARGS
22093 #define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
22094 #undef TARGET_MUST_PASS_IN_STACK
22095 #define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
22096 #undef TARGET_PASS_BY_REFERENCE
22097 #define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
22098 #undef TARGET_INTERNAL_ARG_POINTER
22099 #define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
22100 #undef TARGET_DWARF_HANDLE_FRAME_UNSPEC
22101 #define TARGET_DWARF_HANDLE_FRAME_UNSPEC ix86_dwarf_handle_frame_unspec
22102 #undef TARGET_STRICT_ARGUMENT_NAMING
22103 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
22104
22105 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
22106 #define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg
22107
22108 #undef TARGET_SCALAR_MODE_SUPPORTED_P
22109 #define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p
22110
22111 #undef TARGET_VECTOR_MODE_SUPPORTED_P
22112 #define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p
22113
22114 #ifdef HAVE_AS_TLS
22115 #undef TARGET_ASM_OUTPUT_DWARF_DTPREL
22116 #define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
22117 #endif /* HAVE_AS_TLS */
22118
22119 #ifdef SUBTARGET_INSERT_ATTRIBUTES
22120 #undef TARGET_INSERT_ATTRIBUTES
22121 #define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
22122 #endif /* SUBTARGET_INSERT_ATTRIBUTES */
22123
22124 #undef TARGET_MANGLE_FUNDAMENTAL_TYPE
22125 #define TARGET_MANGLE_FUNDAMENTAL_TYPE ix86_mangle_fundamental_type
22126
22127 #undef TARGET_STACK_PROTECT_FAIL
22128 #define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
22129
22130 #undef TARGET_FUNCTION_VALUE
22131 #define TARGET_FUNCTION_VALUE ix86_function_value
22132
22133 struct gcc_target targetm = TARGET_INITIALIZER;
22134 \f
22135 #include "gt-i386.h"