/* Subroutines used for code generation on IA-32.
   Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
   2002, 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
23 #include "coretypes.h"
29 #include "hard-reg-set.h"
31 #include "insn-config.h"
32 #include "conditions.h"
34 #include "insn-codes.h"
35 #include "insn-attr.h"
43 #include "basic-block.h"
46 #include "target-def.h"
47 #include "langhooks.h"
49 #include "tree-gimple.h"
52 #include "tm-constrs.h"
/* Stack-probe limit used when emitting stack-checking code;
   -1 means "no limit".  Targets may pre-define it.  */
#ifndef CHECK_STACK_LIMIT
#define CHECK_STACK_LIMIT (-1)
#endif
/* Return index of given mode in mult and division cost tables.
   The tables have five entries: QI, HI, SI, DI and "other";
   any mode that is none of the first four maps to index 4.  */
#define MODE_INDEX(mode) \
  ((mode) == QImode ? 0 \
   : (mode) == HImode ? 1 \
   : (mode) == SImode ? 2 \
   : (mode) == DImode ? 3 \
   : 4)
/* Processor costs (relative to an add) */
/* When optimizing for size, costs are measured in bytes rather than cycles.
   We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes,
   so COSTS_N_BYTES keeps the two scales comparable.  */
#define COSTS_N_BYTES(N) ((N) * 2)

/* Placeholder stringop strategy for the half (32-bit or 64-bit) of a
   cost table that a given CPU description does not tune specially.  */
#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
74 struct processor_costs size_cost = { /* costs for tuning for size */
75 COSTS_N_BYTES (2), /* cost of an add instruction */
76 COSTS_N_BYTES (3), /* cost of a lea instruction */
77 COSTS_N_BYTES (2), /* variable shift costs */
78 COSTS_N_BYTES (3), /* constant shift costs */
79 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
80 COSTS_N_BYTES (3), /* HI */
81 COSTS_N_BYTES (3), /* SI */
82 COSTS_N_BYTES (3), /* DI */
83 COSTS_N_BYTES (5)}, /* other */
84 0, /* cost of multiply per each bit set */
85 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
86 COSTS_N_BYTES (3), /* HI */
87 COSTS_N_BYTES (3), /* SI */
88 COSTS_N_BYTES (3), /* DI */
89 COSTS_N_BYTES (5)}, /* other */
90 COSTS_N_BYTES (3), /* cost of movsx */
91 COSTS_N_BYTES (3), /* cost of movzx */
94 2, /* cost for loading QImode using movzbl */
95 {2, 2, 2}, /* cost of loading integer registers
96 in QImode, HImode and SImode.
97 Relative to reg-reg move (2). */
98 {2, 2, 2}, /* cost of storing integer registers */
99 2, /* cost of reg,reg fld/fst */
100 {2, 2, 2}, /* cost of loading fp registers
101 in SFmode, DFmode and XFmode */
102 {2, 2, 2}, /* cost of storing fp registers
103 in SFmode, DFmode and XFmode */
104 3, /* cost of moving MMX register */
105 {3, 3}, /* cost of loading MMX registers
106 in SImode and DImode */
107 {3, 3}, /* cost of storing MMX registers
108 in SImode and DImode */
109 3, /* cost of moving SSE register */
110 {3, 3, 3}, /* cost of loading SSE registers
111 in SImode, DImode and TImode */
112 {3, 3, 3}, /* cost of storing SSE registers
113 in SImode, DImode and TImode */
114 3, /* MMX or SSE register to integer */
115 0, /* size of l1 cache */
116 0, /* size of l2 cache */
117 0, /* size of prefetch block */
118 0, /* number of parallel prefetches */
120 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
121 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
122 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
123 COSTS_N_BYTES (2), /* cost of FABS instruction. */
124 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
125 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
126 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
127 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
128 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
129 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}}
132 /* Processor costs (relative to an add) */
134 struct processor_costs i386_cost = { /* 386 specific costs */
135 COSTS_N_INSNS (1), /* cost of an add instruction */
136 COSTS_N_INSNS (1), /* cost of a lea instruction */
137 COSTS_N_INSNS (3), /* variable shift costs */
138 COSTS_N_INSNS (2), /* constant shift costs */
139 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
140 COSTS_N_INSNS (6), /* HI */
141 COSTS_N_INSNS (6), /* SI */
142 COSTS_N_INSNS (6), /* DI */
143 COSTS_N_INSNS (6)}, /* other */
144 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
145 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
146 COSTS_N_INSNS (23), /* HI */
147 COSTS_N_INSNS (23), /* SI */
148 COSTS_N_INSNS (23), /* DI */
149 COSTS_N_INSNS (23)}, /* other */
150 COSTS_N_INSNS (3), /* cost of movsx */
151 COSTS_N_INSNS (2), /* cost of movzx */
152 15, /* "large" insn */
154 4, /* cost for loading QImode using movzbl */
155 {2, 4, 2}, /* cost of loading integer registers
156 in QImode, HImode and SImode.
157 Relative to reg-reg move (2). */
158 {2, 4, 2}, /* cost of storing integer registers */
159 2, /* cost of reg,reg fld/fst */
160 {8, 8, 8}, /* cost of loading fp registers
161 in SFmode, DFmode and XFmode */
162 {8, 8, 8}, /* cost of storing fp registers
163 in SFmode, DFmode and XFmode */
164 2, /* cost of moving MMX register */
165 {4, 8}, /* cost of loading MMX registers
166 in SImode and DImode */
167 {4, 8}, /* cost of storing MMX registers
168 in SImode and DImode */
169 2, /* cost of moving SSE register */
170 {4, 8, 16}, /* cost of loading SSE registers
171 in SImode, DImode and TImode */
172 {4, 8, 16}, /* cost of storing SSE registers
173 in SImode, DImode and TImode */
174 3, /* MMX or SSE register to integer */
175 0, /* size of l1 cache */
176 0, /* size of l2 cache */
177 0, /* size of prefetch block */
178 0, /* number of parallel prefetches */
180 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
181 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
182 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
183 COSTS_N_INSNS (22), /* cost of FABS instruction. */
184 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
185 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
186 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
187 DUMMY_STRINGOP_ALGS},
188 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
189 DUMMY_STRINGOP_ALGS},
193 struct processor_costs i486_cost = { /* 486 specific costs */
194 COSTS_N_INSNS (1), /* cost of an add instruction */
195 COSTS_N_INSNS (1), /* cost of a lea instruction */
196 COSTS_N_INSNS (3), /* variable shift costs */
197 COSTS_N_INSNS (2), /* constant shift costs */
198 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
199 COSTS_N_INSNS (12), /* HI */
200 COSTS_N_INSNS (12), /* SI */
201 COSTS_N_INSNS (12), /* DI */
202 COSTS_N_INSNS (12)}, /* other */
203 1, /* cost of multiply per each bit set */
204 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
205 COSTS_N_INSNS (40), /* HI */
206 COSTS_N_INSNS (40), /* SI */
207 COSTS_N_INSNS (40), /* DI */
208 COSTS_N_INSNS (40)}, /* other */
209 COSTS_N_INSNS (3), /* cost of movsx */
210 COSTS_N_INSNS (2), /* cost of movzx */
211 15, /* "large" insn */
213 4, /* cost for loading QImode using movzbl */
214 {2, 4, 2}, /* cost of loading integer registers
215 in QImode, HImode and SImode.
216 Relative to reg-reg move (2). */
217 {2, 4, 2}, /* cost of storing integer registers */
218 2, /* cost of reg,reg fld/fst */
219 {8, 8, 8}, /* cost of loading fp registers
220 in SFmode, DFmode and XFmode */
221 {8, 8, 8}, /* cost of storing fp registers
222 in SFmode, DFmode and XFmode */
223 2, /* cost of moving MMX register */
224 {4, 8}, /* cost of loading MMX registers
225 in SImode and DImode */
226 {4, 8}, /* cost of storing MMX registers
227 in SImode and DImode */
228 2, /* cost of moving SSE register */
229 {4, 8, 16}, /* cost of loading SSE registers
230 in SImode, DImode and TImode */
231 {4, 8, 16}, /* cost of storing SSE registers
232 in SImode, DImode and TImode */
233 3, /* MMX or SSE register to integer */
234 4, /* size of l1 cache. 486 has 8kB cache
235 shared for code and data, so 4kB is
236 not really precise. */
237 4, /* size of l2 cache */
238 0, /* size of prefetch block */
239 0, /* number of parallel prefetches */
241 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
242 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
243 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
244 COSTS_N_INSNS (3), /* cost of FABS instruction. */
245 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
246 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
247 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
248 DUMMY_STRINGOP_ALGS},
249 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
254 struct processor_costs pentium_cost = {
255 COSTS_N_INSNS (1), /* cost of an add instruction */
256 COSTS_N_INSNS (1), /* cost of a lea instruction */
257 COSTS_N_INSNS (4), /* variable shift costs */
258 COSTS_N_INSNS (1), /* constant shift costs */
259 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
260 COSTS_N_INSNS (11), /* HI */
261 COSTS_N_INSNS (11), /* SI */
262 COSTS_N_INSNS (11), /* DI */
263 COSTS_N_INSNS (11)}, /* other */
264 0, /* cost of multiply per each bit set */
265 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
266 COSTS_N_INSNS (25), /* HI */
267 COSTS_N_INSNS (25), /* SI */
268 COSTS_N_INSNS (25), /* DI */
269 COSTS_N_INSNS (25)}, /* other */
270 COSTS_N_INSNS (3), /* cost of movsx */
271 COSTS_N_INSNS (2), /* cost of movzx */
272 8, /* "large" insn */
274 6, /* cost for loading QImode using movzbl */
275 {2, 4, 2}, /* cost of loading integer registers
276 in QImode, HImode and SImode.
277 Relative to reg-reg move (2). */
278 {2, 4, 2}, /* cost of storing integer registers */
279 2, /* cost of reg,reg fld/fst */
280 {2, 2, 6}, /* cost of loading fp registers
281 in SFmode, DFmode and XFmode */
282 {4, 4, 6}, /* cost of storing fp registers
283 in SFmode, DFmode and XFmode */
284 8, /* cost of moving MMX register */
285 {8, 8}, /* cost of loading MMX registers
286 in SImode and DImode */
287 {8, 8}, /* cost of storing MMX registers
288 in SImode and DImode */
289 2, /* cost of moving SSE register */
290 {4, 8, 16}, /* cost of loading SSE registers
291 in SImode, DImode and TImode */
292 {4, 8, 16}, /* cost of storing SSE registers
293 in SImode, DImode and TImode */
294 3, /* MMX or SSE register to integer */
295 8, /* size of l1 cache. */
296 8, /* size of l2 cache */
297 0, /* size of prefetch block */
298 0, /* number of parallel prefetches */
300 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
301 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
302 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
303 COSTS_N_INSNS (1), /* cost of FABS instruction. */
304 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
305 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
306 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
307 DUMMY_STRINGOP_ALGS},
308 {{libcall, {{-1, rep_prefix_4_byte}}},
313 struct processor_costs pentiumpro_cost = {
314 COSTS_N_INSNS (1), /* cost of an add instruction */
315 COSTS_N_INSNS (1), /* cost of a lea instruction */
316 COSTS_N_INSNS (1), /* variable shift costs */
317 COSTS_N_INSNS (1), /* constant shift costs */
318 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
319 COSTS_N_INSNS (4), /* HI */
320 COSTS_N_INSNS (4), /* SI */
321 COSTS_N_INSNS (4), /* DI */
322 COSTS_N_INSNS (4)}, /* other */
323 0, /* cost of multiply per each bit set */
324 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
325 COSTS_N_INSNS (17), /* HI */
326 COSTS_N_INSNS (17), /* SI */
327 COSTS_N_INSNS (17), /* DI */
328 COSTS_N_INSNS (17)}, /* other */
329 COSTS_N_INSNS (1), /* cost of movsx */
330 COSTS_N_INSNS (1), /* cost of movzx */
331 8, /* "large" insn */
333 2, /* cost for loading QImode using movzbl */
334 {4, 4, 4}, /* cost of loading integer registers
335 in QImode, HImode and SImode.
336 Relative to reg-reg move (2). */
337 {2, 2, 2}, /* cost of storing integer registers */
338 2, /* cost of reg,reg fld/fst */
339 {2, 2, 6}, /* cost of loading fp registers
340 in SFmode, DFmode and XFmode */
341 {4, 4, 6}, /* cost of storing fp registers
342 in SFmode, DFmode and XFmode */
343 2, /* cost of moving MMX register */
344 {2, 2}, /* cost of loading MMX registers
345 in SImode and DImode */
346 {2, 2}, /* cost of storing MMX registers
347 in SImode and DImode */
348 2, /* cost of moving SSE register */
349 {2, 2, 8}, /* cost of loading SSE registers
350 in SImode, DImode and TImode */
351 {2, 2, 8}, /* cost of storing SSE registers
352 in SImode, DImode and TImode */
353 3, /* MMX or SSE register to integer */
354 8, /* size of l1 cache. */
355 256, /* size of l2 cache */
356 32, /* size of prefetch block */
357 6, /* number of parallel prefetches */
359 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
360 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
361 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
362 COSTS_N_INSNS (2), /* cost of FABS instruction. */
363 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
364 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
365 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes (we ensure
366 the alignment). For small blocks inline loop is still a noticeable win, for bigger
367 blocks either rep movsl or rep movsb is way to go. Rep movsb has apparently
368 more expensive startup time in CPU, but after 4K the difference is down in the noise.
370 {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
371 {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
372 DUMMY_STRINGOP_ALGS},
373 {{rep_prefix_4_byte, {{1024, unrolled_loop},
374 {8192, rep_prefix_4_byte}, {-1, libcall}}},
379 struct processor_costs geode_cost = {
380 COSTS_N_INSNS (1), /* cost of an add instruction */
381 COSTS_N_INSNS (1), /* cost of a lea instruction */
382 COSTS_N_INSNS (2), /* variable shift costs */
383 COSTS_N_INSNS (1), /* constant shift costs */
384 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
385 COSTS_N_INSNS (4), /* HI */
386 COSTS_N_INSNS (7), /* SI */
387 COSTS_N_INSNS (7), /* DI */
388 COSTS_N_INSNS (7)}, /* other */
389 0, /* cost of multiply per each bit set */
390 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
391 COSTS_N_INSNS (23), /* HI */
392 COSTS_N_INSNS (39), /* SI */
393 COSTS_N_INSNS (39), /* DI */
394 COSTS_N_INSNS (39)}, /* other */
395 COSTS_N_INSNS (1), /* cost of movsx */
396 COSTS_N_INSNS (1), /* cost of movzx */
397 8, /* "large" insn */
399 1, /* cost for loading QImode using movzbl */
400 {1, 1, 1}, /* cost of loading integer registers
401 in QImode, HImode and SImode.
402 Relative to reg-reg move (2). */
403 {1, 1, 1}, /* cost of storing integer registers */
404 1, /* cost of reg,reg fld/fst */
405 {1, 1, 1}, /* cost of loading fp registers
406 in SFmode, DFmode and XFmode */
407 {4, 6, 6}, /* cost of storing fp registers
408 in SFmode, DFmode and XFmode */
410 1, /* cost of moving MMX register */
411 {1, 1}, /* cost of loading MMX registers
412 in SImode and DImode */
413 {1, 1}, /* cost of storing MMX registers
414 in SImode and DImode */
415 1, /* cost of moving SSE register */
416 {1, 1, 1}, /* cost of loading SSE registers
417 in SImode, DImode and TImode */
418 {1, 1, 1}, /* cost of storing SSE registers
419 in SImode, DImode and TImode */
420 1, /* MMX or SSE register to integer */
421 64, /* size of l1 cache. */
422 128, /* size of l2 cache. */
423 32, /* size of prefetch block */
424 1, /* number of parallel prefetches */
426 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
427 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
428 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
429 COSTS_N_INSNS (1), /* cost of FABS instruction. */
430 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
431 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
432 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
433 DUMMY_STRINGOP_ALGS},
434 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
439 struct processor_costs k6_cost = {
440 COSTS_N_INSNS (1), /* cost of an add instruction */
441 COSTS_N_INSNS (2), /* cost of a lea instruction */
442 COSTS_N_INSNS (1), /* variable shift costs */
443 COSTS_N_INSNS (1), /* constant shift costs */
444 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
445 COSTS_N_INSNS (3), /* HI */
446 COSTS_N_INSNS (3), /* SI */
447 COSTS_N_INSNS (3), /* DI */
448 COSTS_N_INSNS (3)}, /* other */
449 0, /* cost of multiply per each bit set */
450 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
451 COSTS_N_INSNS (18), /* HI */
452 COSTS_N_INSNS (18), /* SI */
453 COSTS_N_INSNS (18), /* DI */
454 COSTS_N_INSNS (18)}, /* other */
455 COSTS_N_INSNS (2), /* cost of movsx */
456 COSTS_N_INSNS (2), /* cost of movzx */
457 8, /* "large" insn */
459 3, /* cost for loading QImode using movzbl */
460 {4, 5, 4}, /* cost of loading integer registers
461 in QImode, HImode and SImode.
462 Relative to reg-reg move (2). */
463 {2, 3, 2}, /* cost of storing integer registers */
464 4, /* cost of reg,reg fld/fst */
465 {6, 6, 6}, /* cost of loading fp registers
466 in SFmode, DFmode and XFmode */
467 {4, 4, 4}, /* cost of storing fp registers
468 in SFmode, DFmode and XFmode */
469 2, /* cost of moving MMX register */
470 {2, 2}, /* cost of loading MMX registers
471 in SImode and DImode */
472 {2, 2}, /* cost of storing MMX registers
473 in SImode and DImode */
474 2, /* cost of moving SSE register */
475 {2, 2, 8}, /* cost of loading SSE registers
476 in SImode, DImode and TImode */
477 {2, 2, 8}, /* cost of storing SSE registers
478 in SImode, DImode and TImode */
479 6, /* MMX or SSE register to integer */
480 32, /* size of l1 cache. */
481 32, /* size of l2 cache. Some models
482 have integrated l2 cache, but
483 optimizing for k6 is not important
484 enough to worry about that. */
485 32, /* size of prefetch block */
486 1, /* number of parallel prefetches */
488 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
489 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
490 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
491 COSTS_N_INSNS (2), /* cost of FABS instruction. */
492 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
493 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
494 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
495 DUMMY_STRINGOP_ALGS},
496 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
501 struct processor_costs athlon_cost = {
502 COSTS_N_INSNS (1), /* cost of an add instruction */
503 COSTS_N_INSNS (2), /* cost of a lea instruction */
504 COSTS_N_INSNS (1), /* variable shift costs */
505 COSTS_N_INSNS (1), /* constant shift costs */
506 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
507 COSTS_N_INSNS (5), /* HI */
508 COSTS_N_INSNS (5), /* SI */
509 COSTS_N_INSNS (5), /* DI */
510 COSTS_N_INSNS (5)}, /* other */
511 0, /* cost of multiply per each bit set */
512 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
513 COSTS_N_INSNS (26), /* HI */
514 COSTS_N_INSNS (42), /* SI */
515 COSTS_N_INSNS (74), /* DI */
516 COSTS_N_INSNS (74)}, /* other */
517 COSTS_N_INSNS (1), /* cost of movsx */
518 COSTS_N_INSNS (1), /* cost of movzx */
519 8, /* "large" insn */
521 4, /* cost for loading QImode using movzbl */
522 {3, 4, 3}, /* cost of loading integer registers
523 in QImode, HImode and SImode.
524 Relative to reg-reg move (2). */
525 {3, 4, 3}, /* cost of storing integer registers */
526 4, /* cost of reg,reg fld/fst */
527 {4, 4, 12}, /* cost of loading fp registers
528 in SFmode, DFmode and XFmode */
529 {6, 6, 8}, /* cost of storing fp registers
530 in SFmode, DFmode and XFmode */
531 2, /* cost of moving MMX register */
532 {4, 4}, /* cost of loading MMX registers
533 in SImode and DImode */
534 {4, 4}, /* cost of storing MMX registers
535 in SImode and DImode */
536 2, /* cost of moving SSE register */
537 {4, 4, 6}, /* cost of loading SSE registers
538 in SImode, DImode and TImode */
539 {4, 4, 5}, /* cost of storing SSE registers
540 in SImode, DImode and TImode */
541 5, /* MMX or SSE register to integer */
542 64, /* size of l1 cache. */
543 256, /* size of l2 cache. */
544 64, /* size of prefetch block */
545 6, /* number of parallel prefetches */
547 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
548 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
549 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
550 COSTS_N_INSNS (2), /* cost of FABS instruction. */
551 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
552 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
553 /* For some reason, Athlon deals better with REP prefix (relative to loops)
554 compared to K8. Alignment becomes important after 8 bytes for memcpy and
555 128 bytes for memset. */
556 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
557 DUMMY_STRINGOP_ALGS},
558 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
563 struct processor_costs k8_cost = {
564 COSTS_N_INSNS (1), /* cost of an add instruction */
565 COSTS_N_INSNS (2), /* cost of a lea instruction */
566 COSTS_N_INSNS (1), /* variable shift costs */
567 COSTS_N_INSNS (1), /* constant shift costs */
568 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
569 COSTS_N_INSNS (4), /* HI */
570 COSTS_N_INSNS (3), /* SI */
571 COSTS_N_INSNS (4), /* DI */
572 COSTS_N_INSNS (5)}, /* other */
573 0, /* cost of multiply per each bit set */
574 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
575 COSTS_N_INSNS (26), /* HI */
576 COSTS_N_INSNS (42), /* SI */
577 COSTS_N_INSNS (74), /* DI */
578 COSTS_N_INSNS (74)}, /* other */
579 COSTS_N_INSNS (1), /* cost of movsx */
580 COSTS_N_INSNS (1), /* cost of movzx */
581 8, /* "large" insn */
583 4, /* cost for loading QImode using movzbl */
584 {3, 4, 3}, /* cost of loading integer registers
585 in QImode, HImode and SImode.
586 Relative to reg-reg move (2). */
587 {3, 4, 3}, /* cost of storing integer registers */
588 4, /* cost of reg,reg fld/fst */
589 {4, 4, 12}, /* cost of loading fp registers
590 in SFmode, DFmode and XFmode */
591 {6, 6, 8}, /* cost of storing fp registers
592 in SFmode, DFmode and XFmode */
593 2, /* cost of moving MMX register */
594 {3, 3}, /* cost of loading MMX registers
595 in SImode and DImode */
596 {4, 4}, /* cost of storing MMX registers
597 in SImode and DImode */
598 2, /* cost of moving SSE register */
599 {4, 3, 6}, /* cost of loading SSE registers
600 in SImode, DImode and TImode */
601 {4, 4, 5}, /* cost of storing SSE registers
602 in SImode, DImode and TImode */
603 5, /* MMX or SSE register to integer */
604 64, /* size of l1 cache. */
605 512, /* size of l2 cache. */
606 64, /* size of prefetch block */
607 /* New AMD processors never drop prefetches; if they cannot be performed
608 immediately, they are queued. We set number of simultaneous prefetches
609 to a large constant to reflect this (it probably is not a good idea not
610 to limit number of prefetches at all, as their execution also takes some
612 100, /* number of parallel prefetches */
614 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
615 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
616 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
617 COSTS_N_INSNS (2), /* cost of FABS instruction. */
618 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
619 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
620 /* K8 has optimized REP instruction for medium sized blocks, but for very small
621 blocks it is better to use loop. For large blocks, libcall can do
622 nontemporary accesses and beat inline considerably. */
623 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
624 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
625 {{libcall, {{8, loop}, {24, unrolled_loop},
626 {2048, rep_prefix_4_byte}, {-1, libcall}}},
627 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}
630 struct processor_costs amdfam10_cost = {
631 COSTS_N_INSNS (1), /* cost of an add instruction */
632 COSTS_N_INSNS (2), /* cost of a lea instruction */
633 COSTS_N_INSNS (1), /* variable shift costs */
634 COSTS_N_INSNS (1), /* constant shift costs */
635 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
636 COSTS_N_INSNS (4), /* HI */
637 COSTS_N_INSNS (3), /* SI */
638 COSTS_N_INSNS (4), /* DI */
639 COSTS_N_INSNS (5)}, /* other */
640 0, /* cost of multiply per each bit set */
641 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
642 COSTS_N_INSNS (35), /* HI */
643 COSTS_N_INSNS (51), /* SI */
644 COSTS_N_INSNS (83), /* DI */
645 COSTS_N_INSNS (83)}, /* other */
646 COSTS_N_INSNS (1), /* cost of movsx */
647 COSTS_N_INSNS (1), /* cost of movzx */
648 8, /* "large" insn */
650 4, /* cost for loading QImode using movzbl */
651 {3, 4, 3}, /* cost of loading integer registers
652 in QImode, HImode and SImode.
653 Relative to reg-reg move (2). */
654 {3, 4, 3}, /* cost of storing integer registers */
655 4, /* cost of reg,reg fld/fst */
656 {4, 4, 12}, /* cost of loading fp registers
657 in SFmode, DFmode and XFmode */
658 {6, 6, 8}, /* cost of storing fp registers
659 in SFmode, DFmode and XFmode */
660 2, /* cost of moving MMX register */
661 {3, 3}, /* cost of loading MMX registers
662 in SImode and DImode */
663 {4, 4}, /* cost of storing MMX registers
664 in SImode and DImode */
665 2, /* cost of moving SSE register */
666 {4, 4, 3}, /* cost of loading SSE registers
667 in SImode, DImode and TImode */
668 {4, 4, 5}, /* cost of storing SSE registers
669 in SImode, DImode and TImode */
670 3, /* MMX or SSE register to integer */
672 MOVD reg64, xmmreg Double FSTORE 4
673 MOVD reg32, xmmreg Double FSTORE 4
675 MOVD reg64, xmmreg Double FADD 3
677 MOVD reg32, xmmreg Double FADD 3
679 64, /* size of l1 cache. */
680 512, /* size of l2 cache. */
681 64, /* size of prefetch block */
682 /* New AMD processors never drop prefetches; if they cannot be performed
683 immediately, they are queued. We set number of simultaneous prefetches
684 to a large constant to reflect this (it probably is not a good idea not
685 to limit number of prefetches at all, as their execution also takes some
687 100, /* number of parallel prefetches */
689 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
690 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
691 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
692 COSTS_N_INSNS (2), /* cost of FABS instruction. */
693 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
694 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
696 /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
697 very small blocks it is better to use loop. For large blocks, libcall can
698 do nontemporary accesses and beat inline considerably. */
699 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
700 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
701 {{libcall, {{8, loop}, {24, unrolled_loop},
702 {2048, rep_prefix_4_byte}, {-1, libcall}}},
703 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}
707 struct processor_costs pentium4_cost = {
708 COSTS_N_INSNS (1), /* cost of an add instruction */
709 COSTS_N_INSNS (3), /* cost of a lea instruction */
710 COSTS_N_INSNS (4), /* variable shift costs */
711 COSTS_N_INSNS (4), /* constant shift costs */
712 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
713 COSTS_N_INSNS (15), /* HI */
714 COSTS_N_INSNS (15), /* SI */
715 COSTS_N_INSNS (15), /* DI */
716 COSTS_N_INSNS (15)}, /* other */
717 0, /* cost of multiply per each bit set */
718 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
719 COSTS_N_INSNS (56), /* HI */
720 COSTS_N_INSNS (56), /* SI */
721 COSTS_N_INSNS (56), /* DI */
722 COSTS_N_INSNS (56)}, /* other */
723 COSTS_N_INSNS (1), /* cost of movsx */
724 COSTS_N_INSNS (1), /* cost of movzx */
725 16, /* "large" insn */
727 2, /* cost for loading QImode using movzbl */
728 {4, 5, 4}, /* cost of loading integer registers
729 in QImode, HImode and SImode.
730 Relative to reg-reg move (2). */
731 {2, 3, 2}, /* cost of storing integer registers */
732 2, /* cost of reg,reg fld/fst */
733 {2, 2, 6}, /* cost of loading fp registers
734 in SFmode, DFmode and XFmode */
735 {4, 4, 6}, /* cost of storing fp registers
736 in SFmode, DFmode and XFmode */
737 2, /* cost of moving MMX register */
738 {2, 2}, /* cost of loading MMX registers
739 in SImode and DImode */
740 {2, 2}, /* cost of storing MMX registers
741 in SImode and DImode */
742 12, /* cost of moving SSE register */
743 {12, 12, 12}, /* cost of loading SSE registers
744 in SImode, DImode and TImode */
745 {2, 2, 8}, /* cost of storing SSE registers
746 in SImode, DImode and TImode */
747 10, /* MMX or SSE register to integer */
748 8, /* size of l1 cache. */
749 256, /* size of l2 cache. */
750 64, /* size of prefetch block */
751 6, /* number of parallel prefetches */
753 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
754 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
755 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
756 COSTS_N_INSNS (2), /* cost of FABS instruction. */
757 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
758 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
759 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
760 DUMMY_STRINGOP_ALGS},
761 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
763 DUMMY_STRINGOP_ALGS},
767 struct processor_costs nocona_cost = {
768 COSTS_N_INSNS (1), /* cost of an add instruction */
769 COSTS_N_INSNS (1), /* cost of a lea instruction */
770 COSTS_N_INSNS (1), /* variable shift costs */
771 COSTS_N_INSNS (1), /* constant shift costs */
772 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
773 COSTS_N_INSNS (10), /* HI */
774 COSTS_N_INSNS (10), /* SI */
775 COSTS_N_INSNS (10), /* DI */
776 COSTS_N_INSNS (10)}, /* other */
777 0, /* cost of multiply per each bit set */
778 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
779 COSTS_N_INSNS (66), /* HI */
780 COSTS_N_INSNS (66), /* SI */
781 COSTS_N_INSNS (66), /* DI */
782 COSTS_N_INSNS (66)}, /* other */
783 COSTS_N_INSNS (1), /* cost of movsx */
784 COSTS_N_INSNS (1), /* cost of movzx */
785 16, /* "large" insn */
787 4, /* cost for loading QImode using movzbl */
788 {4, 4, 4}, /* cost of loading integer registers
789 in QImode, HImode and SImode.
790 Relative to reg-reg move (2). */
791 {4, 4, 4}, /* cost of storing integer registers */
792 3, /* cost of reg,reg fld/fst */
793 {12, 12, 12}, /* cost of loading fp registers
794 in SFmode, DFmode and XFmode */
795 {4, 4, 4}, /* cost of storing fp registers
796 in SFmode, DFmode and XFmode */
797 6, /* cost of moving MMX register */
798 {12, 12}, /* cost of loading MMX registers
799 in SImode and DImode */
800 {12, 12}, /* cost of storing MMX registers
801 in SImode and DImode */
802 6, /* cost of moving SSE register */
803 {12, 12, 12}, /* cost of loading SSE registers
804 in SImode, DImode and TImode */
805 {12, 12, 12}, /* cost of storing SSE registers
806 in SImode, DImode and TImode */
807 8, /* MMX or SSE register to integer */
808 8, /* size of l1 cache. */
809 1024, /* size of l2 cache. */
810 128, /* size of prefetch block */
811 8, /* number of parallel prefetches */
813 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
814 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
815 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
816 COSTS_N_INSNS (3), /* cost of FABS instruction. */
817 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
818 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
819 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
820 {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
821 {100000, unrolled_loop}, {-1, libcall}}}},
822 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
824 {libcall, {{24, loop}, {64, unrolled_loop},
825 {8192, rep_prefix_8_byte}, {-1, libcall}}}}
829 struct processor_costs core2_cost = {
830 COSTS_N_INSNS (1), /* cost of an add instruction */
831 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
832 COSTS_N_INSNS (1), /* variable shift costs */
833 COSTS_N_INSNS (1), /* constant shift costs */
834 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
835 COSTS_N_INSNS (3), /* HI */
836 COSTS_N_INSNS (3), /* SI */
837 COSTS_N_INSNS (3), /* DI */
838 COSTS_N_INSNS (3)}, /* other */
839 0, /* cost of multiply per each bit set */
840 {COSTS_N_INSNS (22), /* cost of a divide/mod for QI */
841 COSTS_N_INSNS (22), /* HI */
842 COSTS_N_INSNS (22), /* SI */
843 COSTS_N_INSNS (22), /* DI */
844 COSTS_N_INSNS (22)}, /* other */
845 COSTS_N_INSNS (1), /* cost of movsx */
846 COSTS_N_INSNS (1), /* cost of movzx */
847 8, /* "large" insn */
849 2, /* cost for loading QImode using movzbl */
850 {6, 6, 6}, /* cost of loading integer registers
851 in QImode, HImode and SImode.
852 Relative to reg-reg move (2). */
853 {4, 4, 4}, /* cost of storing integer registers */
854 2, /* cost of reg,reg fld/fst */
855 {6, 6, 6}, /* cost of loading fp registers
856 in SFmode, DFmode and XFmode */
857 {4, 4, 4}, /* cost of storing fp registers
 in SFmode, DFmode and XFmode */
858 2, /* cost of moving MMX register */
859 {6, 6}, /* cost of loading MMX registers
860 in SImode and DImode */
861 {4, 4}, /* cost of storing MMX registers
862 in SImode and DImode */
863 2, /* cost of moving SSE register */
864 {6, 6, 6}, /* cost of loading SSE registers
865 in SImode, DImode and TImode */
866 {4, 4, 4}, /* cost of storing SSE registers
867 in SImode, DImode and TImode */
868 2, /* MMX or SSE register to integer */
869 32, /* size of l1 cache. */
870 2048, /* size of l2 cache. */
871 128, /* size of prefetch block */
872 8, /* number of parallel prefetches */
874 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
875 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
876 COSTS_N_INSNS (32), /* cost of FDIV instruction. */
877 COSTS_N_INSNS (1), /* cost of FABS instruction. */
878 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
879 COSTS_N_INSNS (58), /* cost of FSQRT instruction. */
880 {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
881 {libcall, {{32, loop}, {64, rep_prefix_4_byte},
882 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
883 {{libcall, {{8, loop}, {15, unrolled_loop},
884 {2048, rep_prefix_4_byte}, {-1, libcall}}},
885 {libcall, {{24, loop}, {32, unrolled_loop},
886 {8192, rep_prefix_8_byte}, {-1, libcall}}}}
889 /* Generic64 should produce code tuned for Nocona and K8. */
891 struct processor_costs generic64_cost = {
892 COSTS_N_INSNS (1), /* cost of an add instruction */
893 /* On all chips taken into consideration lea is 2 cycles and more. With
894 this cost however our current implementation of synth_mult results in
895 use of unnecessary temporary registers causing regression on several
896 SPECfp benchmarks. */
897 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
898 COSTS_N_INSNS (1), /* variable shift costs */
899 COSTS_N_INSNS (1), /* constant shift costs */
900 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
901 COSTS_N_INSNS (4), /* HI */
902 COSTS_N_INSNS (3), /* SI */
903 COSTS_N_INSNS (4), /* DI */
904 COSTS_N_INSNS (2)}, /* other */
905 0, /* cost of multiply per each bit set */
906 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
907 COSTS_N_INSNS (26), /* HI */
908 COSTS_N_INSNS (42), /* SI */
909 COSTS_N_INSNS (74), /* DI */
910 COSTS_N_INSNS (74)}, /* other */
911 COSTS_N_INSNS (1), /* cost of movsx */
912 COSTS_N_INSNS (1), /* cost of movzx */
913 8, /* "large" insn */
915 4, /* cost for loading QImode using movzbl */
916 {4, 4, 4}, /* cost of loading integer registers
917 in QImode, HImode and SImode.
918 Relative to reg-reg move (2). */
919 {4, 4, 4}, /* cost of storing integer registers */
920 4, /* cost of reg,reg fld/fst */
921 {12, 12, 12}, /* cost of loading fp registers
922 in SFmode, DFmode and XFmode */
923 {6, 6, 8}, /* cost of storing fp registers
924 in SFmode, DFmode and XFmode */
925 2, /* cost of moving MMX register */
926 {8, 8}, /* cost of loading MMX registers
927 in SImode and DImode */
928 {8, 8}, /* cost of storing MMX registers
929 in SImode and DImode */
930 2, /* cost of moving SSE register */
931 {8, 8, 8}, /* cost of loading SSE registers
932 in SImode, DImode and TImode */
933 {8, 8, 8}, /* cost of storing SSE registers
934 in SImode, DImode and TImode */
935 5, /* MMX or SSE register to integer */
936 32, /* size of l1 cache. */
937 512, /* size of l2 cache. */
938 64, /* size of prefetch block */
939 6, /* number of parallel prefetches */
940 /* Benchmarks shows large regressions on K8 sixtrack benchmark when this value
941 is increased to perhaps more appropriate value of 5. */
943 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
944 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
945 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
946 COSTS_N_INSNS (8), /* cost of FABS instruction. */
947 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
948 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
949 {DUMMY_STRINGOP_ALGS,
950 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
951 {DUMMY_STRINGOP_ALGS,
952 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}
955 /* Generic32 should produce code tuned for Athlon, PPro, Pentium4, Nocona and K8. */
957 struct processor_costs generic32_cost = {
958 COSTS_N_INSNS (1), /* cost of an add instruction */
959 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
960 COSTS_N_INSNS (1), /* variable shift costs */
961 COSTS_N_INSNS (1), /* constant shift costs */
962 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
963 COSTS_N_INSNS (4), /* HI */
964 COSTS_N_INSNS (3), /* SI */
965 COSTS_N_INSNS (4), /* DI */
966 COSTS_N_INSNS (2)}, /* other */
967 0, /* cost of multiply per each bit set */
968 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
969 COSTS_N_INSNS (26), /* HI */
970 COSTS_N_INSNS (42), /* SI */
971 COSTS_N_INSNS (74), /* DI */
972 COSTS_N_INSNS (74)}, /* other */
973 COSTS_N_INSNS (1), /* cost of movsx */
974 COSTS_N_INSNS (1), /* cost of movzx */
975 8, /* "large" insn */
977 4, /* cost for loading QImode using movzbl */
978 {4, 4, 4}, /* cost of loading integer registers
979 in QImode, HImode and SImode.
980 Relative to reg-reg move (2). */
981 {4, 4, 4}, /* cost of storing integer registers */
982 4, /* cost of reg,reg fld/fst */
983 {12, 12, 12}, /* cost of loading fp registers
984 in SFmode, DFmode and XFmode */
985 {6, 6, 8}, /* cost of storing fp registers
986 in SFmode, DFmode and XFmode */
987 2, /* cost of moving MMX register */
988 {8, 8}, /* cost of loading MMX registers
989 in SImode and DImode */
990 {8, 8}, /* cost of storing MMX registers
991 in SImode and DImode */
992 2, /* cost of moving SSE register */
993 {8, 8, 8}, /* cost of loading SSE registers
994 in SImode, DImode and TImode */
995 {8, 8, 8}, /* cost of storing SSE registers
996 in SImode, DImode and TImode */
997 5, /* MMX or SSE register to integer */
998 32, /* size of l1 cache. */
999 256, /* size of l2 cache. */
1000 64, /* size of prefetch block */
1001 6, /* number of parallel prefetches */
1002 3, /* Branch cost */
1003 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1004 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1005 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1006 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1007 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1008 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1009 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1010 DUMMY_STRINGOP_ALGS},
1011 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1012 DUMMY_STRINGOP_ALGS},
1015 const struct processor_costs *ix86_cost = &pentium_cost;
1017 /* Processor feature/optimization bitmasks. */
1018 #define m_386 (1<<PROCESSOR_I386)
1019 #define m_486 (1<<PROCESSOR_I486)
1020 #define m_PENT (1<<PROCESSOR_PENTIUM)
1021 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1022 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1023 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1024 #define m_CORE2 (1<<PROCESSOR_CORE2)
1026 #define m_GEODE (1<<PROCESSOR_GEODE)
1027 #define m_K6 (1<<PROCESSOR_K6)
1028 #define m_K6_GEODE (m_K6 | m_GEODE)
1029 #define m_K8 (1<<PROCESSOR_K8)
1030 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1031 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1032 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1033 #define m_ATHLON_K8_AMDFAM10 (m_K8 | m_ATHLON | m_AMDFAM10)
1035 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1036 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1038 /* Generic instruction choice should be common subset of supported CPUs
1039 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
1040 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
1042 /* Feature tests against the various tunings. */
1043 unsigned int ix86_tune_features[X86_TUNE_LAST] = {
1044 /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
1045 negatively, so enabling for Generic64 seems like good code size
1046 tradeoff. We can't enable it for 32bit generic because it does not
1047 work well with PPro base chips. */
1048 m_386 | m_K6_GEODE | m_ATHLON_K8_AMDFAM10 | m_CORE2 | m_GENERIC64,
1050 /* X86_TUNE_PUSH_MEMORY */
1051 m_386 | m_K6_GEODE | m_ATHLON_K8_AMDFAM10 | m_PENT4
1052 | m_NOCONA | m_CORE2 | m_GENERIC,
1054 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1057 /* X86_TUNE_USE_BIT_TEST */
1060 /* X86_TUNE_UNROLL_STRLEN */
1061 m_486 | m_PENT | m_PPRO | m_ATHLON_K8_AMDFAM10 | m_K6 | m_CORE2 | m_GENERIC,
1063 /* X86_TUNE_DEEP_BRANCH_PREDICTION */
1064 m_PPRO | m_K6_GEODE | m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_GENERIC,
1066 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
1067 on simulation result. But after P4 was made, no performance benefit
1068 was observed with branch hints. It also increases the code size.
1069 As a result, icc never generates branch hints. */
1072 /* X86_TUNE_DOUBLE_WITH_ADD */
1075 /* X86_TUNE_USE_SAHF */
1076 m_PPRO | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_PENT4
1077 | m_NOCONA | m_CORE2 | m_GENERIC,
1079 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
1080 partial dependencies. */
1081 m_ATHLON_K8_AMDFAM10 | m_PPRO | m_PENT4 | m_NOCONA
1082 | m_CORE2 | m_GENERIC | m_GEODE /* m_386 | m_K6 */,
1084 /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
1085 register stalls on Generic32 compilation setting as well. However
1086 in current implementation the partial register stalls are not eliminated
1087 very well - they can be introduced via subregs synthesized by combine
1088 and can happen in caller/callee saving sequences. Because this option
1089 pays back little on PPro based chips and is in conflict with partial reg
1090 dependencies used by Athlon/P4 based chips, it is better to leave it off
1091 for generic32 for now. */
1094 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1095 m_CORE2 | m_GENERIC,
1097 /* X86_TUNE_USE_HIMODE_FIOP */
1098 m_386 | m_486 | m_K6_GEODE,
1100 /* X86_TUNE_USE_SIMODE_FIOP */
1101 ~(m_PPRO | m_ATHLON_K8_AMDFAM10 | m_PENT | m_CORE2 | m_GENERIC),
1103 /* X86_TUNE_USE_MOV0 */
1106 /* X86_TUNE_USE_CLTD */
1107 ~(m_PENT | m_K6 | m_CORE2 | m_GENERIC),
1109 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
1112 /* X86_TUNE_SPLIT_LONG_MOVES */
1115 /* X86_TUNE_READ_MODIFY_WRITE */
1118 /* X86_TUNE_READ_MODIFY */
1121 /* X86_TUNE_PROMOTE_QIMODE */
1122 m_K6_GEODE | m_PENT | m_386 | m_486 | m_ATHLON_K8_AMDFAM10 | m_CORE2
1123 | m_GENERIC /* | m_PENT4 ? */,
1125 /* X86_TUNE_FAST_PREFIX */
1126 ~(m_PENT | m_486 | m_386),
1128 /* X86_TUNE_SINGLE_STRINGOP */
1129 m_386 | m_PENT4 | m_NOCONA,
1131 /* X86_TUNE_QIMODE_MATH */
1134 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
1135 register stalls. Just like X86_TUNE_PARTIAL_REG_STALL this option
1136 might be considered for Generic32 if our scheme for avoiding partial
1137 stalls was more effective. */
1140 /* X86_TUNE_PROMOTE_QI_REGS */
1143 /* X86_TUNE_PROMOTE_HI_REGS */
1146 /* X86_TUNE_ADD_ESP_4: Enable if add/sub is preferred over 1/2 push/pop. */
1147 m_ATHLON_K8_AMDFAM10 | m_K6_GEODE | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
1149 /* X86_TUNE_ADD_ESP_8 */
1150 m_ATHLON_K8_AMDFAM10 | m_PPRO | m_K6_GEODE | m_386
1151 | m_486 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
1153 /* X86_TUNE_SUB_ESP_4 */
1154 m_ATHLON_K8_AMDFAM10 | m_PPRO | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
1156 /* X86_TUNE_SUB_ESP_8 */
1157 m_ATHLON_K8_AMDFAM10 | m_PPRO | m_386 | m_486
1158 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
1160 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
1161 for DFmode copies */
1162 ~(m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2
1163 | m_GENERIC | m_GEODE),
1165 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
1166 m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
1168 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
1169 conflict here in between PPro/Pentium4 based chips that thread 128bit
1170 SSE registers as single units versus K8 based chips that divide SSE
1171 registers to two 64bit halves. This knob promotes all store destinations
1172 to be 128bit to allow register renaming on 128bit SSE units, but usually
1173 results in one extra microop on 64bit SSE units. Experimental results
1174 shows that disabling this option on P4 brings over 20% SPECfp regression,
1175 while enabling it on K8 brings roughly 2.4% regression that can be partly
1176 masked by careful scheduling of moves. */
1177 m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC | m_AMDFAM10,
1179 /* X86_TUNE_SSE_UNALIGNED_MOVE_OPTIMAL */
1182 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
1183 are resolved on SSE register parts instead of whole registers, so we may
1184 maintain just lower part of scalar values in proper format leaving the
1185 upper part undefined. */
1188 /* X86_TUNE_SSE_TYPELESS_STORES */
1189 m_ATHLON_K8_AMDFAM10,
1191 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
1192 m_PPRO | m_PENT4 | m_NOCONA,
1194 /* X86_TUNE_MEMORY_MISMATCH_STALL */
1195 m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
1197 /* X86_TUNE_PROLOGUE_USING_MOVE */
1198 m_ATHLON_K8 | m_PPRO | m_CORE2 | m_GENERIC,
1200 /* X86_TUNE_EPILOGUE_USING_MOVE */
1201 m_ATHLON_K8 | m_PPRO | m_CORE2 | m_GENERIC,
1203 /* X86_TUNE_SHIFT1 */
1206 /* X86_TUNE_USE_FFREEP */
1207 m_ATHLON_K8_AMDFAM10,
1209 /* X86_TUNE_INTER_UNIT_MOVES */
1210 ~(m_ATHLON_K8_AMDFAM10 | m_GENERIC),
1212 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
1213 than 4 branch instructions in the 16 byte window. */
1214 m_PPRO | m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
1216 /* X86_TUNE_SCHEDULE */
1217 m_PPRO | m_ATHLON_K8_AMDFAM10 | m_K6_GEODE | m_PENT | m_CORE2 | m_GENERIC,
1219 /* X86_TUNE_USE_BT */
1220 m_ATHLON_K8_AMDFAM10,
1222 /* X86_TUNE_USE_INCDEC */
1223 ~(m_PENT4 | m_NOCONA | m_GENERIC),
1225 /* X86_TUNE_PAD_RETURNS */
1226 m_ATHLON_K8_AMDFAM10 | m_CORE2 | m_GENERIC,
1228 /* X86_TUNE_EXT_80387_CONSTANTS */
1229 m_K6_GEODE | m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC,
1231 /* X86_TUNE_SHORTEN_X87_SSE */
1234 /* X86_TUNE_AVOID_VECTOR_DECODE */
1237 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode
1238 and SImode multiply, but 386 and 486 do HImode multiply faster. */
1241 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
1242 vector path on AMD machines. */
1243 m_K8 | m_GENERIC64 | m_AMDFAM10,
1245 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
1247 m_K8 | m_GENERIC64 | m_AMDFAM10,
1249 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
1253 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
1254 but one byte longer. */
1257 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory
1258 operand that cannot be represented using a modRM byte. The XOR
1259 replacement is long decoded, so this split helps here as well. */
1263 /* Feature tests against the various architecture variations. */
1264 unsigned int ix86_arch_features[X86_ARCH_LAST] = {
1265 /* X86_ARCH_CMOVE: Conditional move was added for pentiumpro. */
1266 ~(m_386 | m_486 | m_PENT | m_K6),
1268 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
1271 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
1274 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
1277 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
1281 /* Processor-tuning bitmask (of the m_* bits defined above) for which
   accumulating outgoing arguments looks preferable to pushing them
   individually -- NOTE(review): semantics inferred from the name;
   confirm against the ACCUMULATE_OUTGOING_ARGS users.  */
static const unsigned int x86_accumulate_outgoing_args
1282 = m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC;
1284 /* Processor-architecture bitmask for chips where the 80387 transcendental
   instructions are always usable -- NOTE(review): inferred from the name;
   verify against the consumers of this mask.  */
static const unsigned int x86_arch_always_fancy_math_387
1285 = m_PENT | m_PPRO | m_ATHLON_K8_AMDFAM10 | m_PENT4
1286 | m_NOCONA | m_CORE2 | m_GENERIC;
1288 static enum stringop_alg stringop_alg = no_stringop;
1290 /* In case the average insn count for single function invocation is
1291 lower than this constant, emit fast (but longer) prologue and
   epilogue code.  */
1293 #define FAST_PROLOGUE_INSN_COUNT 20
1295 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
1296 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
1297 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
1298 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
1300 /* Array of the smallest class containing reg number REGNO, indexed by
1301 REGNO. Used by REGNO_REG_CLASS in i386.h. */
1303 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
1305 /* ax, dx, cx, bx */
1306 AREG, DREG, CREG, BREG,
1307 /* si, di, bp, sp */
1308 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
1310 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
1311 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
1314 /* flags, fpsr, fpcr, frame */
1315 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
1317 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
1320 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
1323 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
1324 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
1325 /* SSE REX registers */
1326 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
1330 /* The "default" register map used in 32bit mode. */
1332 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
1334 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
1335 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
1336 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
1337 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
1338 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
1339 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
1340 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
1343 static int const x86_64_int_parameter_registers[6] =
1345 5 /*RDI*/, 4 /*RSI*/, 1 /*RDX*/, 2 /*RCX*/,
1346 FIRST_REX_INT_REG /*R8 */, FIRST_REX_INT_REG + 1 /*R9 */
1349 static int const x86_64_ms_abi_int_parameter_registers[4] =
1351 2 /*RCX*/, 1 /*RDX*/,
1352 FIRST_REX_INT_REG /*R8 */, FIRST_REX_INT_REG + 1 /*R9 */
1355 static int const x86_64_int_return_registers[4] =
1357 0 /*RAX*/, 1 /*RDX*/, 5 /*RDI*/, 4 /*RSI*/
1360 /* The "default" register map used in 64bit mode. */
1361 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
1363 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
1364 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
1365 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
1366 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
1367 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
1368 8,9,10,11,12,13,14,15, /* extended integer registers */
1369 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
1372 /* Define the register numbers to be used in Dwarf debugging information.
1373 The SVR4 reference port C compiler uses the following register numbers
1374 in its Dwarf output code:
1375 0 for %eax (gcc regno = 0)
1376 1 for %ecx (gcc regno = 2)
1377 2 for %edx (gcc regno = 1)
1378 3 for %ebx (gcc regno = 3)
1379 4 for %esp (gcc regno = 7)
1380 5 for %ebp (gcc regno = 6)
1381 6 for %esi (gcc regno = 4)
1382 7 for %edi (gcc regno = 5)
1383 The following three DWARF register numbers are never generated by
1384 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
1385 believes these numbers have these meanings.
1386 8 for %eip (no gcc equivalent)
1387 9 for %eflags (gcc regno = 17)
1388 10 for %trapno (no gcc equivalent)
1389 It is not at all clear how we should number the FP stack registers
1390 for the x86 architecture. If the version of SDB on x86/svr4 were
1391 a bit less brain dead with respect to floating-point then we would
1392 have a precedent to follow with respect to DWARF register numbers
1393 for x86 FP registers, but the SDB on x86/svr4 is so completely
1394 broken with respect to FP registers that it is hardly worth thinking
1395 of it as something to strive for compatibility with.
1396 The version of x86/svr4 SDB I have at the moment does (partially)
1397 seem to believe that DWARF register number 11 is associated with
1398 the x86 register %st(0), but that's about all. Higher DWARF
1399 register numbers don't seem to be associated with anything in
1400 particular, and even for DWARF regno 11, SDB only seems to under-
1401 stand that it should say that a variable lives in %st(0) (when
1402 asked via an `=' command) if we said it was in DWARF regno 11,
1403 but SDB still prints garbage when asked for the value of the
1404 variable in question (via a `/' command).
1405 (Also note that the labels SDB prints for various FP stack regs
1406 when doing an `x' command are all wrong.)
1407 Note that these problems generally don't affect the native SVR4
1408 C compiler because it doesn't allow the use of -O with -g and
1409 because when it is *not* optimizing, it allocates a memory
1410 location for each floating-point variable, and the memory
1411 location is what gets described in the DWARF AT_location
1412 attribute for the variable in question.
1413 Regardless of the severe mental illness of the x86/svr4 SDB, we
1414 do something sensible here and we use the following DWARF
1415 register numbers. Note that these are all stack-top-relative
1417 11 for %st(0) (gcc regno = 8)
1418 12 for %st(1) (gcc regno = 9)
1419 13 for %st(2) (gcc regno = 10)
1420 14 for %st(3) (gcc regno = 11)
1421 15 for %st(4) (gcc regno = 12)
1422 16 for %st(5) (gcc regno = 13)
1423 17 for %st(6) (gcc regno = 14)
1424 18 for %st(7) (gcc regno = 15)
1426 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
1428 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
1429 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
1430 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
1431 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
1432 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
1433 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
1434 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
1437 /* Test and compare insns in i386.md store the information needed to
1438 generate branch and scc insns here. */
1440 rtx ix86_compare_op0 = NULL_RTX;  /* first comparison operand */
1441 rtx ix86_compare_op1 = NULL_RTX;  /* second comparison operand */
1442 rtx ix86_compare_emitted = NULL_RTX;  /* NOTE(review): presumably the
   already-emitted compare result, when the compare insn has been
   generated up front -- confirm against the i386.md users.  */
1444 /* Size of the register save area. */
1445 #define X86_64_VARARGS_SIZE (REGPARM_MAX * UNITS_PER_WORD + SSE_REGPARM_MAX * 16)
1447 /* Define the structure for the machine field in struct function. */
1449 struct stack_local_entry GTY(())
1451 unsigned short mode;
1454 struct stack_local_entry *next;
1457 /* Structure describing stack frame layout.
1458 Stack grows downward:
1464 saved frame pointer if frame_pointer_needed
1465 <- HARD_FRAME_POINTER
1470 [va_arg registers] (
1471 > to_allocate <- FRAME_POINTER
1481 HOST_WIDE_INT frame;
1483 int outgoing_arguments_size;
1486 HOST_WIDE_INT to_allocate;
1487 /* The offsets relative to ARG_POINTER. */
1488 HOST_WIDE_INT frame_pointer_offset;
1489 HOST_WIDE_INT hard_frame_pointer_offset;
1490 HOST_WIDE_INT stack_pointer_offset;
1492 /* When save_regs_using_mov is set, emit prologue using
1493 move instead of push instructions. */
1494 bool save_regs_using_mov;
1497 /* Code model option. */
1498 enum cmodel ix86_cmodel;
1500 enum asm_dialect ix86_asm_dialect = ASM_ATT;
1502 enum tls_dialect ix86_tls_dialect = TLS_DIALECT_GNU;
1504 /* Which unit we are generating floating point math for. */
1505 enum fpmath_unit ix86_fpmath;
1507 /* Which cpu are we scheduling for. */
1508 enum processor_type ix86_tune;
1510 /* Which instruction set architecture to use. */
1511 enum processor_type ix86_arch;
1513 /* true if sse prefetch instruction is not NOOP. */
1514 int x86_prefetch_sse;
1516 /* ix86_regparm_string as a number */
1517 static int ix86_regparm;
1519 /* -mstackrealign option */
1520 extern int ix86_force_align_arg_pointer;
1521 static const char ix86_force_align_arg_pointer_string[] = "force_align_arg_pointer";
1523 /* Preferred alignment for stack boundary in bits. */
1524 unsigned int ix86_preferred_stack_boundary;
1526 /* Values 1-5: see jump.c */
1527 int ix86_branch_cost;
1529 /* Variables which are this size or smaller are put in the data/bss
1530 or ldata/lbss sections. */
1532 int ix86_section_threshold = 65536;
1534 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
1535 char internal_label_prefix[16];
1536 int internal_label_prefix_len;
1538 /* Fence to use after loop using movnt. */
1541 /* Register class used for passing given 64bit part of the argument.
1542 These represent classes as documented by the PS ABI, with the exception
1543 of SSESF, SSEDF classes, that are basically SSE class, just gcc will
1544 use SF or DFmode move instead of DImode to avoid reformatting penalties.
1546 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
1547 whenever possible (upper half does contain padding). */
1548 enum x86_64_reg_class
1551 X86_64_INTEGER_CLASS,
1552 X86_64_INTEGERSI_CLASS,
1559 X86_64_COMPLEX_X87_CLASS,
1562 static const char * const x86_64_reg_class_name[] =
1564 "no", "integer", "integerSI", "sse", "sseSF", "sseDF",
1565 "sseup", "x87", "x87up", "cplx87", "no"
1568 #define MAX_CLASSES 4
1570 /* Table of constants used by fldpi, fldln2, etc.... */
1571 static REAL_VALUE_TYPE ext_80387_constants_table [5];
1572 static bool ext_80387_constants_init = 0;
1575 static struct machine_function * ix86_init_machine_status (void);
1576 static rtx ix86_function_value (const_tree, const_tree, bool);
1577 static int ix86_function_regparm (const_tree, const_tree);
1578 static void ix86_compute_frame_layout (struct ix86_frame *);
1579 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
1583 /* The svr4 ABI for the i386 says that records and unions are returned
   in memory.  */
1585 #ifndef DEFAULT_PCC_STRUCT_RETURN
1586 #define DEFAULT_PCC_STRUCT_RETURN 1
1589 /* Bit flags that specify the ISA we are compiling for. */
1590 int ix86_isa_flags = TARGET_64BIT_DEFAULT | TARGET_SUBTARGET_ISA_DEFAULT;
1592 /* A mask of ix86_isa_flags that includes bit X if X
1593 was set or cleared on the command line. */
1594 static int ix86_isa_flags_explicit;
1596 /* Define a set of ISAs which aren't available for a given ISA. MMX
1597 and SSE ISAs are handled separately. */
1599 #define OPTION_MASK_ISA_MMX_UNSET \
1600 (OPTION_MASK_ISA_3DNOW | OPTION_MASK_ISA_3DNOW_UNSET)
1601 #define OPTION_MASK_ISA_3DNOW_UNSET OPTION_MASK_ISA_3DNOW_A
1603 #define OPTION_MASK_ISA_SSE_UNSET \
1604 (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE2_UNSET)
1605 #define OPTION_MASK_ISA_SSE2_UNSET \
1606 (OPTION_MASK_ISA_SSE3 | OPTION_MASK_ISA_SSE3_UNSET)
1607 #define OPTION_MASK_ISA_SSE3_UNSET \
1608 (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_SSSE3_UNSET)
1609 #define OPTION_MASK_ISA_SSSE3_UNSET \
1610 (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_SSE4_1_UNSET)
1611 #define OPTION_MASK_ISA_SSE4_1_UNSET \
1612 (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_SSE4_2_UNSET)
1613 #define OPTION_MASK_ISA_SSE4_2_UNSET OPTION_MASK_ISA_SSE4A
1615 /* SSE4 includes both SSE4.1 and SSE4.2. -msse4 should be the same
1616 as -msse4.1 -msse4.2. -mno-sse4 should be the same as -mno-sse4.1. */
1617 #define OPTION_MASK_ISA_SSE4 \
1618 (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_SSE4_2)
1619 #define OPTION_MASK_ISA_SSE4_UNSET OPTION_MASK_ISA_SSE4_1_UNSET
1621 #define OPTION_MASK_ISA_SSE4A_UNSET OPTION_MASK_ISA_SSE4
1623 /* Vectorization library interface and handlers. */
1624 tree (*ix86_veclib_handler)(enum built_in_function, tree, tree) = NULL;
1625 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
1627 /* Implement TARGET_HANDLE_OPTION. */
1630 ix86_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, int value)
1635 ix86_isa_flags_explicit |= OPTION_MASK_ISA_MMX;
1638 ix86_isa_flags &= ~OPTION_MASK_ISA_MMX_UNSET;
1639 ix86_isa_flags_explicit |= OPTION_MASK_ISA_MMX_UNSET;
1644 ix86_isa_flags_explicit |= OPTION_MASK_ISA_3DNOW;
1647 ix86_isa_flags &= ~OPTION_MASK_ISA_3DNOW_UNSET;
1648 ix86_isa_flags_explicit |= OPTION_MASK_ISA_3DNOW_UNSET;
1656 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE;
1659 ix86_isa_flags &= ~OPTION_MASK_ISA_SSE_UNSET;
1660 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE_UNSET;
1665 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE2;
1668 ix86_isa_flags &= ~OPTION_MASK_ISA_SSE2_UNSET;
1669 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE2_UNSET;
1674 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE3;
1677 ix86_isa_flags &= ~OPTION_MASK_ISA_SSE3_UNSET;
1678 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE3_UNSET;
1683 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSSE3;
1686 ix86_isa_flags &= ~OPTION_MASK_ISA_SSSE3_UNSET;
1687 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSSE3_UNSET;
1692 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_1;
1695 ix86_isa_flags &= ~OPTION_MASK_ISA_SSE4_1_UNSET;
1696 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_1_UNSET;
1701 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_2;
1704 ix86_isa_flags &= ~OPTION_MASK_ISA_SSE4_2_UNSET;
1705 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_2_UNSET;
1710 ix86_isa_flags |= OPTION_MASK_ISA_SSE4;
1711 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4;
1715 ix86_isa_flags &= ~OPTION_MASK_ISA_SSE4_UNSET;
1716 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_UNSET;
1720 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4A;
1723 ix86_isa_flags &= ~OPTION_MASK_ISA_SSE4A_UNSET;
1724 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4A_UNSET;
1733 /* Sometimes certain combinations of command options do not make
1734 sense on a particular target machine. You can define a macro
1735 `OVERRIDE_OPTIONS' to take account of this. This macro, if
1736 defined, is executed once just after all the command options have
   been scanned.
1739 Don't use this macro to turn on various extra optimizations for
1740 `-O'. That is what `OPTIMIZATION_OPTIONS' is for. */
/* NOTE(review): this extract embeds original line numbers ("1743", ...) and
   many intermediate lines (braces, else-arms, conditions) are missing; the
   comments added below describe only the statements visible here.  */
/* Validate and reconcile all i386 command-line options: choose -march/-mtune
   defaults, translate processor_alias_table PTA_* feature bits into
   OPTION_MASK_ISA_* flags, and set alignment, stack-boundary, fpmath and
   scheduling parameters.  */
1743 override_options (void)
1746 int ix86_tune_defaulted = 0;
1747 int ix86_arch_specified = 0;
1748 unsigned int ix86_arch_mask, ix86_tune_mask;
1750 /* Comes from final.c -- no real reason to change it. */
1751 #define MAX_CODE_ALIGN 16
/* Per-processor cost table pointer plus default code-alignment values,
   indexed by enum processor_type.  */
1755 const struct processor_costs *cost; /* Processor costs */
1756 const int align_loop; /* Default alignments. */
1757 const int align_loop_max_skip;
1758 const int align_jump;
1759 const int align_jump_max_skip;
1760 const int align_func;
1762 const processor_target_table[PROCESSOR_max] =
1764 {&i386_cost, 4, 3, 4, 3, 4},
1765 {&i486_cost, 16, 15, 16, 15, 16},
1766 {&pentium_cost, 16, 7, 16, 7, 16},
1767 {&pentiumpro_cost, 16, 15, 16, 10, 16},
1768 {&geode_cost, 0, 0, 0, 0, 0},
1769 {&k6_cost, 32, 7, 32, 7, 32},
1770 {&athlon_cost, 16, 7, 16, 7, 16},
1771 {&pentium4_cost, 0, 0, 0, 0, 0},
1772 {&k8_cost, 16, 7, 16, 7, 16},
1773 {&nocona_cost, 0, 0, 0, 0, 0},
1774 {&core2_cost, 16, 10, 16, 10, 16},
1775 {&generic32_cost, 16, 7, 16, 7, 16},
1776 {&generic64_cost, 16, 10, 16, 10, 16},
1777 {&amdfam10_cost, 32, 24, 32, 7, 32}
/* Names corresponding to TARGET_CPU_DEFAULT indices (used when no
   -mtune/-march is given).  */
1780 static const char * const cpu_names[] = TARGET_CPU_DEFAULT_NAMES;
/* Bit flags describing the ISA features of each alias-table entry
   (several enumerators are missing from this extract).  */
1787 PTA_PREFETCH_SSE = 1 << 4,
1789 PTA_3DNOW_A = 1 << 6,
1793 PTA_POPCNT = 1 << 10,
1795 PTA_SSE4A = 1 << 12,
1796 PTA_NO_SAHF = 1 << 13,
1797 PTA_SSE4_1 = 1 << 14,
1798 PTA_SSE4_2 = 1 << 15
/* Alias-table entry: user-visible CPU name, processor enum, PTA_* flags.  */
1803 const char *const name; /* processor name or nickname. */
1804 const enum processor_type processor;
1805 const unsigned /*enum pta_flags*/ flags;
1807 const processor_alias_table[] =
1809 {"i386", PROCESSOR_I386, 0},
1810 {"i486", PROCESSOR_I486, 0},
1811 {"i586", PROCESSOR_PENTIUM, 0},
1812 {"pentium", PROCESSOR_PENTIUM, 0},
1813 {"pentium-mmx", PROCESSOR_PENTIUM, PTA_MMX},
1814 {"winchip-c6", PROCESSOR_I486, PTA_MMX},
1815 {"winchip2", PROCESSOR_I486, PTA_MMX | PTA_3DNOW},
1816 {"c3", PROCESSOR_I486, PTA_MMX | PTA_3DNOW},
1817 {"c3-2", PROCESSOR_PENTIUMPRO, PTA_MMX | PTA_SSE},
1818 {"i686", PROCESSOR_PENTIUMPRO, 0},
1819 {"pentiumpro", PROCESSOR_PENTIUMPRO, 0},
1820 {"pentium2", PROCESSOR_PENTIUMPRO, PTA_MMX},
1821 {"pentium3", PROCESSOR_PENTIUMPRO, PTA_MMX | PTA_SSE},
1822 {"pentium3m", PROCESSOR_PENTIUMPRO, PTA_MMX | PTA_SSE},
1823 {"pentium-m", PROCESSOR_PENTIUMPRO, PTA_MMX | PTA_SSE | PTA_SSE2},
1824 {"pentium4", PROCESSOR_PENTIUM4, PTA_MMX |PTA_SSE | PTA_SSE2},
1825 {"pentium4m", PROCESSOR_PENTIUM4, PTA_MMX | PTA_SSE | PTA_SSE2},
1826 {"prescott", PROCESSOR_NOCONA, PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3},
1827 {"nocona", PROCESSOR_NOCONA, (PTA_64BIT
1828 | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
1829 | PTA_CX16 | PTA_NO_SAHF)},
1830 {"core2", PROCESSOR_CORE2, (PTA_64BIT
1831 | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
1834 {"geode", PROCESSOR_GEODE, (PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
1835 |PTA_PREFETCH_SSE)},
1836 {"k6", PROCESSOR_K6, PTA_MMX},
1837 {"k6-2", PROCESSOR_K6, PTA_MMX | PTA_3DNOW},
1838 {"k6-3", PROCESSOR_K6, PTA_MMX | PTA_3DNOW},
1839 {"athlon", PROCESSOR_ATHLON, (PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
1840 | PTA_PREFETCH_SSE)},
1841 {"athlon-tbird", PROCESSOR_ATHLON, (PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
1842 | PTA_PREFETCH_SSE)},
1843 {"athlon-4", PROCESSOR_ATHLON, (PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
1845 {"athlon-xp", PROCESSOR_ATHLON, (PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
1847 {"athlon-mp", PROCESSOR_ATHLON, (PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
1849 {"x86-64", PROCESSOR_K8, (PTA_64BIT
1850 | PTA_MMX | PTA_SSE | PTA_SSE2
1852 {"k8", PROCESSOR_K8, (PTA_64BIT
1853 | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
1854 | PTA_SSE | PTA_SSE2
1856 {"k8-sse3", PROCESSOR_K8, (PTA_64BIT
1857 | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
1858 | PTA_SSE | PTA_SSE2 | PTA_SSE3
1860 {"opteron", PROCESSOR_K8, (PTA_64BIT
1861 | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
1862 | PTA_SSE | PTA_SSE2
1864 {"opteron-sse3", PROCESSOR_K8, (PTA_64BIT
1865 | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
1866 | PTA_SSE | PTA_SSE2 | PTA_SSE3
1868 {"athlon64", PROCESSOR_K8, (PTA_64BIT
1869 | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
1870 | PTA_SSE | PTA_SSE2
1872 {"athlon64-sse3", PROCESSOR_K8, (PTA_64BIT
1873 | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
1874 | PTA_SSE | PTA_SSE2 | PTA_SSE3
1876 {"athlon-fx", PROCESSOR_K8, (PTA_64BIT
1877 | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
1878 | PTA_SSE | PTA_SSE2
1880 {"amdfam10", PROCESSOR_AMDFAM10, (PTA_64BIT
1881 | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
1882 | PTA_SSE | PTA_SSE2 | PTA_SSE3
1884 | PTA_CX16 | PTA_ABM)},
1885 {"barcelona", PROCESSOR_AMDFAM10, (PTA_64BIT
1886 | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
1887 | PTA_SSE | PTA_SSE2 | PTA_SSE3
1889 | PTA_CX16 | PTA_ABM)},
1890 {"generic32", PROCESSOR_GENERIC32, 0 /* flags are only used for -march switch. */ },
1891 {"generic64", PROCESSOR_GENERIC64, PTA_64BIT /* flags are only used for -march switch. */ },
1894 int const pta_size = ARRAY_SIZE (processor_alias_table);
/* Give the (sub)subtarget a chance to override anything above.  */
1896 #ifdef SUBTARGET_OVERRIDE_OPTIONS
1897 SUBTARGET_OVERRIDE_OPTIONS;
1900 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
1901 SUBSUBTARGET_OVERRIDE_OPTIONS;
1904 /* -fPIC is the default for x86_64. */
1905 if (TARGET_MACHO && TARGET_64BIT)
1908 /* Set the default values for switches whose default depends on TARGET_64BIT
1909 in case they weren't overwritten by command line options. */
/* A value of 2 means "not set on the command line" (see
   optimization_options below).  */
1912 /* Mach-O doesn't support omitting the frame pointer for now. */
1913 if (flag_omit_frame_pointer == 2)
1914 flag_omit_frame_pointer = (TARGET_MACHO ? 0 : 1);
1915 if (flag_asynchronous_unwind_tables == 2)
1916 flag_asynchronous_unwind_tables = 1;
1917 if (flag_pcc_struct_return == 2)
1918 flag_pcc_struct_return = 0;
1922 if (flag_omit_frame_pointer == 2)
1923 flag_omit_frame_pointer = 0;
1924 if (flag_asynchronous_unwind_tables == 2)
1925 flag_asynchronous_unwind_tables = 0;
1926 if (flag_pcc_struct_return == 2)
1927 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
1930 /* Need to check -mtune=generic first. */
1931 if (ix86_tune_string)
1933 if (!strcmp (ix86_tune_string, "generic")
1934 || !strcmp (ix86_tune_string, "i686")
1935 /* As special support for cross compilers we read -mtune=native
1936 as -mtune=generic. With native compilers we won't see the
1937 -mtune=native, as it was changed by the driver. */
1938 || !strcmp (ix86_tune_string, "native"))
1941 ix86_tune_string = "generic64";
1943 ix86_tune_string = "generic32";
/* "generic" with any suffix other than the two handled above is
   rejected.  */
1945 else if (!strncmp (ix86_tune_string, "generic", 7))
1946 error ("bad value (%s) for -mtune= switch", ix86_tune_string);
/* No -mtune given: fall back to -march, then to the configured
   default CPU.  */
1950 if (ix86_arch_string)
1951 ix86_tune_string = ix86_arch_string;
1952 if (!ix86_tune_string)
1954 ix86_tune_string = cpu_names [TARGET_CPU_DEFAULT];
1955 ix86_tune_defaulted = 1;
1958 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
1959 need to use a sensible tune option. */
1960 if (!strcmp (ix86_tune_string, "generic")
1961 || !strcmp (ix86_tune_string, "x86-64")
1962 || !strcmp (ix86_tune_string, "i686"))
1965 ix86_tune_string = "generic64";
1967 ix86_tune_string = "generic32";
/* Parse -mstringop-strategy= into the stringop_alg enum.  */
1970 if (ix86_stringop_string)
1972 if (!strcmp (ix86_stringop_string, "rep_byte"))
1973 stringop_alg = rep_prefix_1_byte;
1974 else if (!strcmp (ix86_stringop_string, "libcall"))
1975 stringop_alg = libcall;
1976 else if (!strcmp (ix86_stringop_string, "rep_4byte"))
1977 stringop_alg = rep_prefix_4_byte;
1978 else if (!strcmp (ix86_stringop_string, "rep_8byte"))
1979 stringop_alg = rep_prefix_8_byte;
1980 else if (!strcmp (ix86_stringop_string, "byte_loop"))
1981 stringop_alg = loop_1_byte;
1982 else if (!strcmp (ix86_stringop_string, "loop"))
1983 stringop_alg = loop;
1984 else if (!strcmp (ix86_stringop_string, "unrolled_loop"))
1985 stringop_alg = unrolled_loop;
1987 error ("bad value (%s) for -mstringop-strategy= switch", ix86_stringop_string);
1989 if (!strcmp (ix86_tune_string, "x86-64"))
1990 warning (OPT_Wdeprecated, "-mtune=x86-64 is deprecated. Use -mtune=k8 or "
1991 "-mtune=generic instead as appropriate.");
/* Default -march per bitness; "generic" is tune-only.  */
1993 if (!ix86_arch_string)
1994 ix86_arch_string = TARGET_64BIT ? "x86-64" : "i386";
1996 ix86_arch_specified = 1;
1998 if (!strcmp (ix86_arch_string, "generic"))
1999 error ("generic CPU can be used only for -mtune= switch");
2000 if (!strncmp (ix86_arch_string, "generic", 7))
2001 error ("bad value (%s) for -march= switch", ix86_arch_string);
/* Parse -mcmodel=, folding in flag_pic for the PIC variants.  */
2003 if (ix86_cmodel_string != 0)
2005 if (!strcmp (ix86_cmodel_string, "small"))
2006 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
2007 else if (!strcmp (ix86_cmodel_string, "medium"))
2008 ix86_cmodel = flag_pic ? CM_MEDIUM_PIC : CM_MEDIUM;
2009 else if (!strcmp (ix86_cmodel_string, "large"))
2010 ix86_cmodel = flag_pic ? CM_LARGE_PIC : CM_LARGE;
2012 error ("code model %s does not support PIC mode", ix86_cmodel_string)
;
2013 else if (!strcmp (ix86_cmodel_string, "32"))
2014 ix86_cmodel = CM_32;
2015 else if (!strcmp (ix86_cmodel_string, "kernel") && !flag_pic)
2016 ix86_cmodel = CM_KERNEL;
2018 error ("bad value (%s) for -mcmodel= switch", ix86_cmodel_string);
2022 /* For TARGET_64BIT_MS_ABI, force pic on, in order to enable the
2023 use of rip-relative addressing. This eliminates fixups that
2024 would otherwise be needed if this object is to be placed in a
2025 DLL, and is essentially just as efficient as direct addressing. */
2026 if (TARGET_64BIT_MS_ABI)
2027 ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
2028 else if (TARGET_64BIT)
2029 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
2031 ix86_cmodel = CM_32;
/* Parse -masm= (intel/att).  */
2033 if (ix86_asm_string != 0)
2036 && !strcmp (ix86_asm_string, "intel"))
2037 ix86_asm_dialect = ASM_INTEL;
2038 else if (!strcmp (ix86_asm_string, "att"))
2039 ix86_asm_dialect = ASM_ATT;
2041 error ("bad value (%s) for -masm= switch", ix86_asm_string);
/* Cross-check code model and ISA bitness against TARGET_64BIT.  */
2043 if ((TARGET_64BIT == 0) != (ix86_cmodel == CM_32))
2044 error ("code model %qs not supported in the %s bit mode",
2045 ix86_cmodel_string, TARGET_64BIT ? "64" : "32");
2046 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
2047 sorry ("%i-bit mode not compiled in",
2048 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
/* Resolve -march: look up the alias-table entry and turn each PTA_*
   feature into the matching OPTION_MASK_ISA_* flag, unless the user
   explicitly set that ISA flag on the command line.  */
2050 for (i = 0; i < pta_size; i++)
2051 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
2053 ix86_arch = processor_alias_table[i].processor;
2054 /* Default cpu tuning to the architecture. */
2055 ix86_tune = ix86_arch;
2057 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
2058 error ("CPU you selected does not support x86-64 "
2061 if (processor_alias_table[i].flags & PTA_MMX
2062 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
2063 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
2064 if (processor_alias_table[i].flags & PTA_3DNOW
2065 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
2066 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
2067 if (processor_alias_table[i].flags & PTA_3DNOW_A
2068 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
2069 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
2070 if (processor_alias_table[i].flags & PTA_SSE
2071 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
2072 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
2073 if (processor_alias_table[i].flags & PTA_SSE2
2074 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
2075 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
2076 if (processor_alias_table[i].flags & PTA_SSE3
2077 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
2078 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
2079 if (processor_alias_table[i].flags & PTA_SSSE3
2080 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
2081 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
2082 if (processor_alias_table[i].flags & PTA_SSE4_1
2083 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
2084 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
2085 if (processor_alias_table[i].flags & PTA_SSE4_2
2086 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
2087 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
2088 if (processor_alias_table[i].flags & PTA_SSE4A
2089 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
2090 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
2092 if (processor_alias_table[i].flags & PTA_ABM)
2094 if (processor_alias_table[i].flags & PTA_CX16)
2095 x86_cmpxchg16b = true;
2096 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM))
2098 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
2099 x86_prefetch_sse = true;
2100 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF)))
2107 error ("bad value (%s) for -march= switch", ix86_arch_string);
/* Mask the per-architecture feature table down to the chosen arch.  */
2109 ix86_arch_mask = 1u << ix86_arch;
2110 for (i = 0; i < X86_ARCH_LAST; ++i)
2111 ix86_arch_features[i] &= ix86_arch_mask;
/* Resolve -mtune the same way; a defaulted tune that lacks PTA_64BIT
   in 64-bit mode silently falls back to "x86-64".  */
2113 for (i = 0; i < pta_size; i++)
2114 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
2116 ix86_tune = processor_alias_table[i].processor;
2117 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
2119 if (ix86_tune_defaulted)
2121 ix86_tune_string = "x86-64";
2122 for (i = 0; i < pta_size; i++)
2123 if (! strcmp (ix86_tune_string,
2124 processor_alias_table[i].name))
2126 ix86_tune = processor_alias_table[i].processor;
2129 error ("CPU you selected does not support x86-64 "
2132 /* Intel CPUs have always interpreted SSE prefetch instructions as
2133 NOPs; so, we can enable SSE prefetch instructions even when
2134 -mtune (rather than -march) points us to a processor that has them.
2135 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
2136 higher processors. */
2138 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
2139 x86_prefetch_sse = true;
2143 error ("bad value (%s) for -mtune= switch", ix86_tune_string);
2145 ix86_tune_mask = 1u << ix86_tune;
2146 for (i = 0; i < X86_TUNE_LAST; ++i)
2147 ix86_tune_features[i] &= ix86_tune_mask;
/* Pick the cost table: size_cost when optimizing for size (condition
   lost in this extract -- presumably optimize_size), else the table for
   the tuned processor.  */
2150 ix86_cost = &size_cost;
2152 ix86_cost = processor_target_table[ix86_tune].cost;
2154 /* Arrange to set up i386_stack_locals for all functions. */
2155 init_machine_status = ix86_init_machine_status;
2157 /* Validate -mregparm= value. */
2158 if (ix86_regparm_string)
2161 warning (0, "-mregparm is ignored in 64-bit mode");
2162 i = atoi (ix86_regparm_string);
2163 if (i < 0 || i > REGPARM_MAX)
2164 error ("-mregparm=%d is not between 0 and %d", i, REGPARM_MAX);
2169 ix86_regparm = REGPARM_MAX;
2171 /* If the user has provided any of the -malign-* options,
2172 warn and use that value only if -falign-* is not set.
2173 Remove this code in GCC 3.2 or later. */
2174 if (ix86_align_loops_string)
2176 warning (0, "-malign-loops is obsolete, use -falign-loops");
2177 if (align_loops == 0)
2179 i = atoi (ix86_align_loops_string);
2180 if (i < 0 || i > MAX_CODE_ALIGN)
2181 error ("-malign-loops=%d is not between 0 and %d", i, MAX_CODE_ALIGN);
2183 align_loops = 1 << i;
2187 if (ix86_align_jumps_string)
2189 warning (0, "-malign-jumps is obsolete, use -falign-jumps");
2190 if (align_jumps == 0)
2192 i = atoi (ix86_align_jumps_string);
2193 if (i < 0 || i > MAX_CODE_ALIGN)
/* NOTE(review): message says "-malign-loops" but this is the
   -malign-jumps path -- looks like a copy/paste slip; confirm against
   upstream before changing the string.  */
2194 error ("-malign-loops=%d is not between 0 and %d", i, MAX_CODE_ALIGN);
2196 align_jumps = 1 << i;
2200 if (ix86_align_funcs_string)
2202 warning (0, "-malign-functions is obsolete, use -falign-functions");
2203 if (align_functions == 0)
2205 i = atoi (ix86_align_funcs_string);
2206 if (i < 0 || i > MAX_CODE_ALIGN)
/* NOTE(review): same copy/paste slip -- message names -malign-loops in
   the -malign-functions path.  */
2207 error ("-malign-loops=%d is not between 0 and %d", i, MAX_CODE_ALIGN);
2209 align_functions = 1 << i;
2213 /* Default align_* from the processor table. */
2214 if (align_loops == 0)
2216 align_loops = processor_target_table[ix86_tune].align_loop;
2217 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
2219 if (align_jumps == 0)
2221 align_jumps = processor_target_table[ix86_tune].align_jump;
2222 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
2224 if (align_functions == 0)
2226 align_functions = processor_target_table[ix86_tune].align_func;
2229 /* Validate -mbranch-cost= value, or provide default. */
2230 ix86_branch_cost = ix86_cost->branch_cost;
2231 if (ix86_branch_cost_string)
2233 i = atoi (ix86_branch_cost_string);
2235 error ("-mbranch-cost=%d is not between 0 and 5", i);
2237 ix86_branch_cost = i;
2239 if (ix86_section_threshold_string)
2241 i = atoi (ix86_section_threshold_string)
;
2243 error ("-mlarge-data-threshold=%d is negative", i);
2245 ix86_section_threshold = i;
/* Parse -mtls-dialect= (gnu/gnu2/sun).  */
2248 if (ix86_tls_dialect_string)
2250 if (strcmp (ix86_tls_dialect_string, "gnu") == 0)
2251 ix86_tls_dialect = TLS_DIALECT_GNU;
2252 else if (strcmp (ix86_tls_dialect_string, "gnu2") == 0)
2253 ix86_tls_dialect = TLS_DIALECT_GNU2;
2254 else if (strcmp (ix86_tls_dialect_string, "sun") == 0)
2255 ix86_tls_dialect = TLS_DIALECT_SUN;
2257 error ("bad value (%s) for -mtls-dialect= switch",
2258 ix86_tls_dialect_string);
2261 if (ix87_precision_string)
2263 i = atoi (ix87_precision_string);
2264 if (i != 32 && i != 64 && i != 80)
2265 error ("pc%d is not valid precision setting (32, 64 or 80)", i);
/* Apply subtarget defaults the user did not explicitly override; the
   64-bit and 32-bit branches mirror each other.  */
2270 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
2272 /* Enable by default the SSE and MMX builtins. Do allow the user to
2273 explicitly disable any of these. In particular, disabling SSE and
2274 MMX for kernel code is extremely useful. */
2275 if (!ix86_arch_specified)
2277 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
2278 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
2281 warning (0, "-mrtd is ignored in 64bit mode");
2285 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
2287 if (!ix86_arch_specified)
2289 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
2291 /* i386 ABI does not specify red zone. It still makes sense to use it
2292 when programmer takes care to stack from being destroyed. */
2293 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
2294 target_flags |= MASK_NO_RED_ZONE;
2297 /* Keep nonleaf frame pointers. */
2298 if (flag_omit_frame_pointer)
2299 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
2300 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
2301 flag_omit_frame_pointer = 1;
2303 /* If we're doing fast math, we don't care about comparison order
2304 wrt NaNs. This lets us use a shorter comparison sequence. */
2305 if (flag_finite_math_only)
2306 target_flags &= ~MASK_IEEE_FP;
2308 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
2309 since the insns won't need emulation. */
2310 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
2311 target_flags &= ~MASK_NO_FANCY_MATH_387;
2313 /* Likewise, if the target doesn't have a 387, or we've specified
2314 software floating point, don't use 387 inline intrinsics. */
2316 target_flags |= MASK_NO_FANCY_MATH_387;
/* Cascade of ISA implications: each higher SSE level pulls in the one
   below it (guard conditions are missing from this extract).  */
2318 /* Turn on SSE4.1 builtins for -msse4.2. */
2320 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
2322 /* Turn on SSSE3 builtins for -msse4.1. */
2324 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
2326 /* Turn on SSE3 builtins for -mssse3. */
2328 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
2330 /* Turn on SSE3 builtins for -msse4a. */
2332 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
2334 /* Turn on SSE2 builtins for -msse3. */
2336 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
2338 /* Turn on SSE builtins for -msse2. */
2340 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
2342 /* Turn on MMX builtins for -msse. */
2345 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
2346 x86_prefetch_sse = true;
2349 /* Turn on MMX builtins for 3Dnow. */
2351 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
2353 /* Turn on popcnt instruction for -msse4.2 or -mabm. */
2354 if (TARGET_SSE4_2 || TARGET_ABM)
2357 /* Validate -mpreferred-stack-boundary= value, or provide default.
2358 The default of 128 bits is for Pentium III's SSE __m128. We can't
2359 change it because of optimize_size. Otherwise, we can't mix object
2360 files compiled with -Os and -On. */
2361 ix86_preferred_stack_boundary = 128;
2362 if (ix86_preferred_stack_boundary_string)
2364 i = atoi (ix86_preferred_stack_boundary_string);
2365 if (i < (TARGET_64BIT ? 4 : 2) || i > 12)
2366 error ("-mpreferred-stack-boundary=%d is not between %d and 12", i,
2367 TARGET_64BIT ? 4 : 2);
2369 ix86_preferred_stack_boundary = (1 << i) * BITS_PER_UNIT;
2372 /* Accept -msseregparm only if at least SSE support is enabled. */
2373 if (TARGET_SSEREGPARM
2375 error ("-msseregparm used without SSE enabled");
/* Parse -mfpmath=: "387", "sse", or the combined "387,sse"/"sse,387";
   either unit being unavailable demotes to the other with a warning.  */
2377 ix86_fpmath = TARGET_FPMATH_DEFAULT;
2378 if (ix86_fpmath_string != 0)
2380 if (! strcmp (ix86_fpmath_string, "387"))
2381 ix86_fpmath = FPMATH_387;
2382 else if (! strcmp (ix86_fpmath_string, "sse"))
2386 warning (0, "SSE instruction set disabled, using 387 arithmetics");
2387 ix86_fpmath = FPMATH_387;
2390 ix86_fpmath = FPMATH_SSE;
2392 else if (! strcmp (ix86_fpmath_string, "387,sse")
2393 || ! strcmp (ix86_fpmath_string, "sse,387"))
2397 warning (0, "SSE instruction set disabled, using 387 arithmetics");
2398 ix86_fpmath = FPMATH_387;
2400 else if (!TARGET_80387)
2402 warning (0, "387 instruction set disabled, using SSE arithmetics");
2403 ix86_fpmath = FPMATH_SSE;
2406 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
2409 error ("bad value (%s) for -mfpmath= switch", ix86_fpmath_string);
2412 /* If the i387 is disabled, then do not return values in it. */
2414 target_flags &= ~MASK_FLOAT_RETURNS;
2416 /* Use external vectorized library in vectorizing intrinsics. */
2417 if (ix86_veclibabi_string)
2419 if (strcmp (ix86_veclibabi_string, "acml") == 0)
2420 ix86_veclib_handler = ix86_veclibabi_acml;
2422 error ("unknown vectorization library ABI type (%s) for "
2423 "-mveclibabi= switch", ix86_veclibabi_string);
2426 if ((x86_accumulate_outgoing_args & ix86_tune_mask)
2427 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
2429 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
2431 /* ??? Unwind info is not correct around the CFG unless either a frame
2432 pointer is present or M_A_O_A is set. Fixing this requires rewriting
2433 unwind info generation to be aware of the CFG and propagating states
2435 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
2436 || flag_exceptions || flag_non_call_exceptions)
2437 && flag_omit_frame_pointer
2438 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
2440 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
2441 warning (0, "unwind tables currently require either a frame pointer "
2442 "or -maccumulate-outgoing-args for correctness")
;
2443 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
2446 /* For sane SSE instruction set generation we need fcomi instruction.
2447 It is safe to enable all CMOVE instructions. */
2451 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
2454 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
2455 p = strchr (internal_label_prefix, 'X');
2456 internal_label_prefix_len = p - internal_label_prefix;
2460 /* When scheduling description is not available, disable scheduler pass
2461 so it won't slow down the compilation and make x87 code slower. */
2462 if (!TARGET_SCHEDULE)
2463 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
/* Seed prefetch/cache --param defaults from the cost table unless the
   user set them explicitly.  */
2465 if (!PARAM_SET_P (PARAM_SIMULTANEOUS_PREFETCHES))
2466 set_param_value ("simultaneous-prefetches",
2467 ix86_cost->simultaneous_prefetches);
2468 if (!PARAM_SET_P (PARAM_L1_CACHE_LINE_SIZE))
2469 set_param_value ("l1-cache-line-size", ix86_cost->prefetch_block);
2470 if (!PARAM_SET_P (PARAM_L1_CACHE_SIZE))
2471 set_param_value ("l1-cache-size", ix86_cost->l1_cache_size);
2472 if (!PARAM_SET_P (PARAM_L2_CACHE_SIZE))
2473 set_param_value ("l2-cache-size", ix86_cost->l2_cache_size);
2476 /* Return true if this goes in large data/bss. */
/* NOTE(review): extract is missing lines (return statements, braces);
   comments describe only the visible checks.  Only relevant for the
   medium code models; EXP is a decl or type node.  */
2479 ix86_in_large_data_p (tree exp)
/* Large sections only exist for the medium code models.  */
2481 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
2484 /* Functions are never large data. */
2485 if (TREE_CODE (exp) == FUNCTION_DECL)
/* An explicit .ldata/.lbss section name forces the large classification.  */
2488 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
2490 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
2491 if (strcmp (section, ".ldata") == 0
2492 || strcmp (section, ".lbss") == 0)
/* Otherwise classify by size against -mlarge-data-threshold.  */
2498 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
2500 /* If this is an incomplete type with size 0, then we can't put it
2501 in data because it might be too big when completed. */
2502 if (!size || size > ix86_section_threshold)
2509 /* Switch to the appropriate section for output of DECL.
2510 DECL is either a `VAR_DECL' node or a constant of some sort.
2511 RELOC indicates whether forming the initial value of DECL requires
2512 link-time relocations. */
/* NOTE(review): extract is missing lines (case labels, breaks, returns);
   comments describe only the visible code.  */
2514 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
2518 x86_64_elf_select_section (tree decl, int reloc,
2519 unsigned HOST_WIDE_INT align)
/* Medium-model large objects get ".l"-prefixed writable sections.  */
2521 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
2522 && ix86_in_large_data_p (decl))
2524 const char *sname = NULL;
2525 unsigned int flags = SECTION_WRITE;
/* Map the decl's section category onto the matching .ldata variant.  */
2526 switch (categorize_decl_for_section (decl, reloc))
2531 case SECCAT_DATA_REL:
2532 sname = ".ldata.rel";
2534 case SECCAT_DATA_REL_LOCAL:
2535 sname = ".ldata.rel.local";
2537 case SECCAT_DATA_REL_RO:
2538 sname = ".ldata.rel.ro";
2540 case SECCAT_DATA_REL_RO_LOCAL:
2541 sname = ".ldata.rel.ro.local";
2545 flags |= SECTION_BSS;
2548 case SECCAT_RODATA_MERGE_STR:
2549 case SECCAT_RODATA_MERGE_STR_INIT:
2550 case SECCAT_RODATA_MERGE_CONST:
2554 case SECCAT_SRODATA:
2561 /* We don't split these for medium model. Place them into
2562 default sections and hope for best. */
2567 /* We might get called with string constants, but get_named_section
2568 doesn't like them as they are not DECLs. Also, we need to set
2569 flags in that case. */
2571 return get_section (sname, flags, NULL);
2572 return get_named_section (decl, sname, reloc);
/* Everything else uses the standard ELF section selection.  */
2575 return default_elf_select_section (decl, reloc, align);
2578 /* Build up a unique section name, expressed as a
2579 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
2580 RELOC indicates whether the initial value of EXP requires
2581 link-time relocations. */
/* NOTE(review): extract is missing lines (breaks, some declarations);
   comments describe only the visible code.  */
2583 static void ATTRIBUTE_UNUSED
2584 x86_64_elf_unique_section (tree decl, int reloc)
/* Medium-model large objects get ".l"-prefixed unique section names.  */
2586 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
2587 && ix86_in_large_data_p (decl))
2589 const char *prefix = NULL;
2590 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
2591 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
/* Pick the section-name prefix by category; one_only selects the
   .gnu.linkonce variants.  */
2593 switch (categorize_decl_for_section (decl, reloc))
2596 case SECCAT_DATA_REL:
2597 case SECCAT_DATA_REL_LOCAL:
2598 case SECCAT_DATA_REL_RO:
2599 case SECCAT_DATA_REL_RO_LOCAL:
2600 prefix = one_only ? ".gnu.linkonce.ld." : ".ldata.";
2603 prefix = one_only ? ".gnu.linkonce.lb." : ".lbss.";
2606 case SECCAT_RODATA_MERGE_STR:
2607 case SECCAT_RODATA_MERGE_STR_INIT:
2608 case SECCAT_RODATA_MERGE_CONST:
2609 prefix = one_only ? ".gnu.linkonce.lr." : ".lrodata.";
2611 case SECCAT_SRODATA:
2618 /* We don't split these for medium model. Place them into
2619 default sections and hope for best. */
/* Concatenate prefix + stripped assembler name on the stack and store
   it as the decl's section name.  */
2627 plen = strlen (prefix);
2629 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
2630 name = targetm.strip_name_encoding (name);
2631 nlen = strlen (name);
2633 string = (char *) alloca (nlen + plen + 1);
2634 memcpy (string, prefix, plen);
2635 memcpy (string + plen, name, nlen + 1);
2637 DECL_SECTION_NAME (decl) = build_string (nlen + plen, string);
/* Everything else falls back to the generic unique-section logic.  */
2641 default_unique_section (decl, reloc);
2644 #ifdef COMMON_ASM_OP
2645 /* This says how to output assembler code to declare an
2646 uninitialized external linkage data object.
2648 For medium model x86-64 we need to use .largecomm opcode for
/* NOTE(review): the return type / signature opening line is missing from
   this extract.  Emits ".largecomm" for medium-model objects above the
   -mlarge-data-threshold, otherwise the normal COMMON_ASM_OP.  */
2651 x86_elf_aligned_common (FILE *file,
2652 const char *name, unsigned HOST_WIDE_INT size,
2655 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
2656 && size > (unsigned int)ix86_section_threshold)
2657 fprintf (file, ".largecomm\t");
2659 fprintf (file, "%s", COMMON_ASM_OP);
/* Emit "name,size,alignment-in-bytes".  */
2660 assemble_name (file, name);
2661 fprintf (file, ","HOST_WIDE_INT_PRINT_UNSIGNED",%u\n",
2662 size, align / BITS_PER_UNIT);
2666 /* Utility function for targets to use in implementing
2667 ASM_OUTPUT_ALIGNED_BSS. */
/* NOTE(review): return type line is missing from this extract.  Places
   medium-model objects above -mlarge-data-threshold into .lbss,
   otherwise the regular .bss section, then emits alignment, label and
   the reserved space.  */
2670 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
2671 const char *name, unsigned HOST_WIDE_INT size,
2674 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
2675 && size > (unsigned int)ix86_section_threshold)
2676 switch_to_section (get_named_section (decl, ".lbss", 0));
2678 switch_to_section (bss_section);
2679 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
2680 #ifdef ASM_DECLARE_OBJECT_NAME
2681 last_assemble_variable_decl = decl;
2682 ASM_DECLARE_OBJECT_NAME (file, name, decl);
2684 /* Standard thing is just output label for the object. */
2685 ASM_OUTPUT_LABEL (file, name);
2686 #endif /* ASM_DECLARE_OBJECT_NAME */
/* Reserve at least one byte so the label refers to real storage.  */
2687 ASM_OUTPUT_SKIP (file, size ? size : 1);
/* Implements OPTIMIZATION_OPTIONS: runs once per -O level, before
   target options are parsed.  NOTE(review): the return type line and
   several guard conditions are missing from this extract.  */
2691 optimization_options (int level, int size ATTRIBUTE_UNUSED)
2693 /* For -O2 and beyond, turn off -fschedule-insns by default. It tends to
2694 make the problem with not enough registers even worse. */
2695 #ifdef INSN_SCHEDULING
2697 flag_schedule_insns = 0;
2701 /* The Darwin libraries never set errno, so we might as well
2702 avoid calling them when that's the only reason we would. */
2703 flag_errno_math = 0;
2705 /* The default values of these switches depend on the TARGET_64BIT
2706 that is not known at this moment. Mark these values with 2 and
2707 let user the to override these. In case there is no command line option
2708 specifying them, we will set the defaults in override_options. */
2710 flag_omit_frame_pointer = 2;
2711 flag_pcc_struct_return = 2;
2712 flag_asynchronous_unwind_tables = 2;
2713 #ifdef SUBTARGET_OPTIMIZATION_OPTIONS
2714 SUBTARGET_OPTIMIZATION_OPTIONS;
2718 /* Decide whether we can make a sibling call to a function. DECL is the
2719 declaration of the function being targeted by the call and EXP is the
2720 CALL_EXPR representing the call. */
/* NOTE(review): return type and several return statements are missing
   from this extract; comments describe only the visible checks.  */
2723 ix86_function_ok_for_sibcall (tree decl, tree exp)
2728 /* If we are generating position-independent code, we cannot sibcall
2729 optimize any indirect call, or a direct call to a global function,
2730 as the PLT requires %ebx be live. */
2731 if (!TARGET_64BIT && flag_pic && (!decl || !targetm.binds_local_p (decl)))
/* Recover the callee's function type from the CALL_EXPR.  */
2738 func = TREE_TYPE (CALL_EXPR_FN (exp));
2739 if (POINTER_TYPE_P (func))
2740 func = TREE_TYPE (func);
2743 /* Check that the return value locations are the same. Like
2744 if we are returning floats on the 80387 register stack, we cannot
2745 make a sibcall from a function that doesn't return a float to a
2746 function that does or, conversely, from a function that does return
2747 a float to a function that doesn't; the necessary stack adjustment
2748 would not be executed. This is also the place we notice
2749 differences in the return value ABI. Note that it is ok for one
2750 of the functions to have void return type as long as the return
2751 value of the other is passed in a register. */
2752 a = ix86_function_value (TREE_TYPE (exp), func, false);
2753 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
/* x87 stack-register returns must match exactly.  */
2755 if (STACK_REG_P (a) || STACK_REG_P (b))
2757 if (!rtx_equal_p (a, b))
2760 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
2762 else if (!rtx_equal_p (a, b))
2765 /* If this call is indirect, we'll need to be able to use a call-clobbered
2766 register for the address of the target function. Make sure that all
2767 such registers are not used for passing parameters. */
2768 if (!decl && !TARGET_64BIT)
2772 /* We're looking at the CALL_EXPR, we need the type of the function. */
2773 type = CALL_EXPR_FN (exp); /* pointer expression */
2774 type = TREE_TYPE (type); /* pointer type */
2775 type = TREE_TYPE (type); /* function type */
/* With 3+ regparm arguments no call-clobbered register is free to
   hold the target address.  */
2777 if (ix86_function_regparm (type, NULL) >= 3)
2779 /* ??? Need to count the actual number of registers to be used,
2780 not the possible number of registers. Fix later. */
2785 /* Dllimport'd functions are also called indirectly. */
2786 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
2787 && decl && DECL_DLLIMPORT_P (decl)
2788 && ix86_function_regparm (TREE_TYPE (decl), NULL) >= 3)
2791 /* If we forced aligned the stack, then sibcalling would unalign the
2792 stack, which may break the called function. */
2793 if (cfun->machine->force_align_arg_pointer)
2796 /* Otherwise okay. That also includes certain types of indirect calls. */
2800 /* Handle "cdecl", "stdcall", "fastcall", "regparm" and "sseregparm"
2801 calling convention attributes;
2802 arguments as in struct attribute_spec.handler. */
/* NOTE(review): return type, some parameters and several returns are
   missing from this extract; comments describe only the visible checks.
   Rejects invalid placements and mutually incompatible combinations of
   the calling-convention attributes.  */
2805 ix86_handle_cconv_attribute (tree *node, tree name,
2807 int flags ATTRIBUTE_UNUSED,
/* These attributes only make sense on function types (FIELD_DECL /
   TYPE_DECL are accepted for function-pointer members/typedefs).  */
2810 if (TREE_CODE (*node) != FUNCTION_TYPE
2811 && TREE_CODE (*node) != METHOD_TYPE
2812 && TREE_CODE (*node) != FIELD_DECL
2813 && TREE_CODE (*node) != TYPE_DECL)
2815 warning (OPT_Wattributes, "%qs attribute only applies to functions",
2816 IDENTIFIER_POINTER (name));
2817 *no_add_attrs = true;
2821 /* Can combine regparm with all attributes but fastcall. */
2822 if (is_attribute_p ("regparm", name))
2826 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
2828 error ("fastcall and regparm attributes are not compatible");
/* regparm takes one integer argument in [0, REGPARM_MAX].  */
2831 cst = TREE_VALUE (args);
2832 if (TREE_CODE (cst) != INTEGER_CST)
2834 warning (OPT_Wattributes,
2835 "%qs attribute requires an integer constant argument",
2836 IDENTIFIER_POINTER (name));
2837 *no_add_attrs = true;
2839 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
2841 warning (OPT_Wattributes, "argument to %qs attribute larger than %d",
2842 IDENTIFIER_POINTER (name), REGPARM_MAX);
2843 *no_add_attrs = true;
/* force_align_arg_pointer reserves a register, reducing the usable
   regparm count by one.  */
2847 && lookup_attribute (ix86_force_align_arg_pointer_string,
2848 TYPE_ATTRIBUTES (*node))
2849 && compare_tree_int (cst, REGPARM_MAX-1))
2851 error ("%s functions limited to %d register parameters",
2852 ix86_force_align_arg_pointer_string, REGPARM_MAX-1);
2860 /* Do not warn when emulating the MS ABI. */
2861 if (!TARGET_64BIT_MS_ABI)
2862 warning (OPT_Wattributes, "%qs attribute ignored",
2863 IDENTIFIER_POINTER (name));
2864 *no_add_attrs = true;
2868 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
2869 if (is_attribute_p ("fastcall", name))
2871 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
2873 error ("fastcall and cdecl attributes are not compatible");
2875 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
2877 error ("fastcall and stdcall attributes are not compatible");
2879 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
2881 error ("fastcall and regparm attributes are not compatible");
2885 /* Can combine stdcall with fastcall (redundant), regparm and
2887 else if (is_attribute_p ("stdcall", name))
2889 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
2891 error ("stdcall and cdecl attributes are not compatible");
2893 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
2895 error ("stdcall and fastcall attributes are not compatible");
2899 /* Can combine cdecl with regparm and sseregparm. */
2900 else if (is_attribute_p ("cdecl", name))
2902 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
2904 error ("stdcall and cdecl attributes are not compatible");
2906 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
2908 error ("fastcall and cdecl attributes are not compatible");
2912 /* Can combine sseregparm with all attributes. */
2917 /* Return 0 if the attributes for two types are incompatible, 1 if they
2918 are compatible, and 2 if they are nearly compatible (which causes a
2919 warning to be generated). */
2922 ix86_comp_type_attributes (const_tree type1, const_tree type2)
2924 /* Check for mismatch of non-default calling convention. */
/* Under -mrtd the default becomes stdcall, so the "non-default"
   attribute to look for flips between "cdecl" and "stdcall".  */
2925 const char *const rtdstr = TARGET_RTD ? "cdecl" : "stdcall";
/* Only function types carry i386 calling-convention attributes.  */
2927 if (TREE_CODE (type1) != FUNCTION_TYPE)
2930 /* Check for mismatched fastcall/regparm types. */
/* The double negation (!) reduces each lookup_attribute result to a
   boolean so only the *presence* of the attribute is compared.  */
2931 if ((!lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type1))
2932 != !lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type2)))
2933 || (ix86_function_regparm (type1, NULL)
2934 != ix86_function_regparm (type2, NULL)))
2937 /* Check for mismatched sseregparm types. */
2938 if (!lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type1))
2939 != !lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type2)))
2942 /* Check for mismatched return types (cdecl vs stdcall). */
2943 if (!lookup_attribute (rtdstr, TYPE_ATTRIBUTES (type1))
2944 != !lookup_attribute (rtdstr, TYPE_ATTRIBUTES (type2)))
2950 /* Return the regparm value for a function with the indicated TYPE and DECL.
2951 DECL may be NULL when calling function indirectly
2952 or considering a libcall. */
2955 ix86_function_regparm (const_tree type, const_tree decl)
/* Start from the command-line (-mregparm) default.  */
2958 int regparm = ix86_regparm;
/* An explicit regparm attribute on the function type wins outright.  */
2963 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type))
2965 return TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
/* fastcall uses its own fixed register convention.  */
2967 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type)))
2970 /* Use register calling convention for local functions when possible. */
2971 if (decl && TREE_CODE (decl) == FUNCTION_DECL
2972 && flag_unit_at_a_time && !profile_flag)
2974 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
2975 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
2978 int local_regparm, globals = 0, regno;
2981 /* Make sure no regparm register is taken by a
2982 global register variable. */
2983 for (local_regparm = 0; local_regparm < 3; local_regparm++)
2984 if (global_regs[local_regparm])
2987 /* We can't use regparm(3) for nested functions as these use
2988 static chain pointer in third argument. */
2989 if (local_regparm == 3
2990 && (decl_function_context (decl)
2991 || ix86_force_align_arg_pointer)
2992 && !DECL_NO_STATIC_CHAIN (decl))
2995 /* If the function realigns its stackpointer, the prologue will
2996 clobber %ecx. If we've already generated code for the callee,
2997 the callee DECL_STRUCT_FUNCTION is gone, so we fall back to
2998 scanning the attributes for the self-realigning property. */
2999 f = DECL_STRUCT_FUNCTION (decl);
3000 if (local_regparm == 3
3001 && (f ? !!f->machine->force_align_arg_pointer
3002 : !!lookup_attribute (ix86_force_align_arg_pointer_string,
3003 TYPE_ATTRIBUTES (TREE_TYPE (decl)))))
3006 /* Each global register variable increases register pressure,
3007 so the more global reg vars there are, the smaller regparm
3008 optimization use, unless requested by the user explicitly. */
3009 for (regno = 0; regno < 6; regno++)
3010 if (global_regs[regno])
3013 = globals < local_regparm ? local_regparm - globals : 0;
/* Only ever raise the regparm count for local functions, never
   lower it below the user-requested default.  */
3015 if (local_regparm > regparm)
3016 regparm = local_regparm;
3023 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
3024 DFmode (2) arguments in SSE registers for a function with the
3025 indicated TYPE and DECL. DECL may be NULL when calling function
3026 indirectly or considering a libcall. Otherwise return 0. */
3029 ix86_function_sseregparm (const_tree type, const_tree decl)
/* The 64-bit ABI passes FP arguments in SSE registers anyway; this
   helper is only meaningful for 32-bit code.  */
3031 gcc_assert (!TARGET_64BIT);
3033 /* Use SSE registers to pass SFmode and DFmode arguments if requested
3034 by the sseregparm attribute. */
3035 if (TARGET_SSEREGPARM
3036 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
/* sseregparm without SSE support is a hard error — the requested ABI
   cannot be honored.  Name the decl when we have one, else the type.  */
3041 error ("Calling %qD with attribute sseregparm without "
3042 "SSE/SSE2 enabled", decl);
3044 error ("Calling %qT with attribute sseregparm without "
3045 "SSE/SSE2 enabled", type);
3052 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
3053 (and DFmode for SSE2) arguments in SSE registers. */
3054 if (decl && TARGET_SSE_MATH && flag_unit_at_a_time && !profile_flag)
3056 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
3057 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
3059 return TARGET_SSE2 ? 2 : 1;
3065 /* Return true if EAX is live at the start of the function. Used by
3066 ix86_expand_prologue to determine if we need special help before
3067 calling allocate_stack_worker. */
3070 ix86_eax_live_at_start_p (void)
3072 /* Cheat. Don't bother working forward from ix86_function_regparm
3073 to the function type to whether an actual argument is located in
3074 eax. Instead just look at cfg info, which is still close enough
3075 to correct at this point. This gives false positives for broken
3076 functions that might use uninitialized data that happens to be
3077 allocated in eax, but who cares? */
/* Hard register 0 is %eax/%rax; test its liveness on exit from the
   CFG entry block.  */
3078 return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
3081 /* Return true if TYPE has a variable argument list. */
3084 type_has_variadic_args_p (tree type)
3086 tree n, t = TYPE_ARG_TYPES (type);
/* Walk to the last node of the argument-type list.  */
3091 while ((n = TREE_CHAIN (t)) != NULL)
/* A prototyped, non-variadic list is terminated by void_type_node;
   any other terminator means the function takes variable arguments.  */
3094 return TREE_VALUE (t) != void_type_node;
3097 /* Value is the number of bytes of arguments automatically
3098 popped when returning from a subroutine call.
3099 FUNDECL is the declaration node of the function (as a tree),
3100 FUNTYPE is the data type of the function (as a tree),
3101 or for a library call it is an identifier node for the subroutine name.
3102 SIZE is the number of bytes of arguments passed on the stack.
3104 On the 80386, the RTD insn may be used to pop them if the number
3105 of args is fixed, but if the number is variable then the caller
3106 must pop them all. RTD can't be used for library calls now
3107 because the library is compiled with the Unix compiler.
3108 Use of RTD is a selectable option, since it is incompatible with
3109 standard Unix calling sequences. If the option is not selected,
3110 the caller must always pop the args.
3112 The attribute stdcall is equivalent to RTD on a per module basis. */
3115 ix86_return_pops_args (tree fundecl, tree funtype, int size)
3119 /* None of the 64-bit ABIs pop arguments. */
/* RTD applies only to real functions, never to library calls, which
   are identified by an IDENTIFIER_NODE instead of a FUNCTION_DECL.  */
3123 rtd = TARGET_RTD && (!fundecl || TREE_CODE (fundecl) != IDENTIFIER_NODE);
3125 /* Cdecl functions override -mrtd, and never pop the stack. */
3126 if (! lookup_attribute ("cdecl", TYPE_ATTRIBUTES (funtype)))
3128 /* Stdcall and fastcall functions will pop the stack if not
   variable args. */
3130 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (funtype))
3131 || lookup_attribute ("fastcall", TYPE_ATTRIBUTES (funtype)))
3134 if (rtd && ! type_has_variadic_args_p (funtype))
3138 /* Lose any fake structure return argument if it is passed on the stack. */
3139 if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
3140 && !KEEP_AGGREGATE_RETURN_POINTER)
3142 int nregs = ix86_function_regparm (funtype, fundecl);
/* The hidden return-pointer slot is one word wide.  */
3144 return GET_MODE_SIZE (Pmode);
3150 /* Argument support functions. */
3152 /* Return true when register may be used to pass function parameters. */
3154 ix86_function_arg_regno_p (int regno)
3157 const int *parm_regs;
/* 32-bit cases: the low integer regparm registers, plus (depending on
   target flags) SSE and MMX argument registers.  */
3162 return (regno < REGPARM_MAX
3163 || (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
3165 return (regno < REGPARM_MAX
3166 || (TARGET_MMX && MMX_REGNO_P (regno)
3167 && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
3168 || (TARGET_SSE && SSE_REGNO_P (regno)
3169 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
3174 if (SSE_REGNO_P (regno) && TARGET_SSE)
3179 if (TARGET_SSE && SSE_REGNO_P (regno)
3180 && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
3184 /* RAX is used as hidden argument to va_arg functions. */
3185 if (!TARGET_64BIT_MS_ABI && regno == 0)
/* 64-bit: check the register against the ABI-specific table of
   integer parameter registers.  */
3188 if (TARGET_64BIT_MS_ABI)
3189 parm_regs = x86_64_ms_abi_int_parameter_registers;
3191 parm_regs = x86_64_int_parameter_registers;
3192 for (i = 0; i < REGPARM_MAX; i++)
3193 if (regno == parm_regs[i])
3198 /* Return true if we do not know how to pass TYPE solely in registers. */
3201 ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
/* Defer to the generic test first (variable-sized or padded types).  */
3203 if (must_pass_in_stack_var_size_or_pad (mode, type))
3206 /* For 32-bit, we want TImode aggregates to go on the stack. But watch out!
3207 The layout_type routine is crafty and tries to trick us into passing
3208 currently unsupported vector types on the stack by using TImode. */
3209 return (!TARGET_64BIT && mode == TImode
3210 && type && TREE_CODE (type) != VECTOR_TYPE);
3213 /* Initialize a variable CUM of type CUMULATIVE_ARGS
3214 for a call to a function whose data type is FNTYPE.
3215 For a library call, FNTYPE is 0. */
3218 init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */
3219 tree fntype, /* tree ptr for function decl */
3220 rtx libname, /* SYMBOL_REF of library name or 0 */
/* Start from a clean slate; fields not assigned below stay zero.  */
3223 memset (cum, 0, sizeof (*cum));
3225 /* Set up the number of registers to use for passing arguments. */
3226 cum->nregs = ix86_regparm;
3228 cum->sse_nregs = SSE_REGPARM_MAX;
3230 cum->mmx_nregs = MMX_REGPARM_MAX;
3231 cum->warn_sse = true;
3232 cum->warn_mmx = true;
/* A call is possibly variadic when the function type is unprototyped
   or its argument list does not end in void.  */
3233 cum->maybe_vaarg = (fntype
3234 ? (!TYPE_ARG_TYPES (fntype)
3235 || type_has_variadic_args_p (fntype))
3240 /* If there are variable arguments, then we won't pass anything
3241 in registers in 32-bit mode. */
3242 if (cum->maybe_vaarg)
3252 /* Use ecx and edx registers if function has fastcall attribute,
3253 else look for regparm information. */
3256 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)))
3262 cum->nregs = ix86_function_regparm (fntype, fndecl);
3265 /* Set up the number of SSE registers used for passing SFmode
3266 and DFmode arguments. Warn for mismatching ABI. */
3267 cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl);
3271 /* Return the "natural" mode for TYPE. In most cases, this is just TYPE_MODE.
3272 But in the case of vector types, it is some vector mode.
3274 When we have only some of our vector isa extensions enabled, then there
3275 are some modes for which vector_mode_supported_p is false. For these
3276 modes, the generic vector support in gcc will choose some non-vector mode
3277 in order to implement the type. By computing the natural mode, we'll
3278 select the proper ABI location for the operand and not depend on whatever
3279 the middle-end decides to do with these vector types. */
3281 static enum machine_mode
3282 type_natural_mode (const_tree type)
3284 enum machine_mode mode = TYPE_MODE (type);
/* Only intervene for vector types the middle-end did not already
   give a vector mode.  */
3286 if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
3288 HOST_WIDE_INT size = int_size_in_bytes (type);
/* Only 64-bit (MMX-sized) and 128-bit (SSE-sized) vectors qualify.  */
3289 if ((size == 8 || size == 16)
3290 /* ??? Generic code allows us to create width 1 vectors. Ignore. */
3291 && TYPE_VECTOR_SUBPARTS (type) > 1)
3293 enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));
/* Search float vector modes for float elements, integer vector
   modes otherwise, starting from the narrowest candidate.  */
3295 if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
3296 mode = MIN_MODE_VECTOR_FLOAT;
3298 mode = MIN_MODE_VECTOR_INT;
3300 /* Get the mode which has this inner mode and number of units. */
3301 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
3302 if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
3303 && GET_MODE_INNER (mode) == innermode)
3313 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
3314 this may not agree with the mode that the type system has chosen for the
3315 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
3316 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
3319 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
/* Non-BLKmode: the type system's mode is usable directly.  */
3324 if (orig_mode != BLKmode)
3325 tmp = gen_rtx_REG (orig_mode, regno);
/* BLKmode: wrap the hard register in a one-element PARALLEL placing
   the value at byte offset 0.  */
3328 tmp = gen_rtx_REG (mode, regno);
3329 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
3330 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
3336 /* x86-64 register passing implementation. See x86-64 ABI for details. Goal
3337 of this code is to classify each 8bytes of incoming argument by the register
3338 class and assign registers accordingly. */
3340 /* Return the union class of CLASS1 and CLASS2.
3341 See the x86-64 PS ABI for details. */
3343 static enum x86_64_reg_class
3344 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
3346 /* Rule #1: If both classes are equal, this is the resulting class. */
3347 if (class1 == class2)
3350 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
   the other class. */
3352 if (class1 == X86_64_NO_CLASS)
3354 if (class2 == X86_64_NO_CLASS)
3357 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
3358 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
3359 return X86_64_MEMORY_CLASS;
3361 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
/* INTEGERSI merged with SSESF stays INTEGERSI: both halves of the
   8-byte chunk are 32 bits wide, so SImode access still suffices.  */
3362 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
3363 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
3364 return X86_64_INTEGERSI_CLASS;
3365 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
3366 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
3367 return X86_64_INTEGER_CLASS;
3369 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
   MEMORY is used as class. */
3371 if (class1 == X86_64_X87_CLASS
3372 || class1 == X86_64_X87UP_CLASS
3373 || class1 == X86_64_COMPLEX_X87_CLASS
3374 || class2 == X86_64_X87_CLASS
3375 || class2 == X86_64_X87UP_CLASS
3376 || class2 == X86_64_COMPLEX_X87_CLASS)
3377 return X86_64_MEMORY_CLASS;
3379 /* Rule #6: Otherwise class SSE is used. */
3380 return X86_64_SSE_CLASS;
3383 /* Classify the argument of type TYPE and mode MODE.
3384 CLASSES will be filled by the register class used to pass each word
3385 of the operand. The number of words is returned. In case the parameter
3386 should be passed in memory, 0 is returned. As a special case for zero
3387 sized containers, classes[0] will be NO_CLASS and 1 is returned.
3389 BIT_OFFSET is used internally for handling records and specifies offset
3390 of the offset in bits modulo 256 to avoid overflow cases.
3392 See the x86-64 PS ABI for details.
3396 classify_argument (enum machine_mode mode, const_tree type,
3397 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
3399 HOST_WIDE_INT bytes =
3400 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
3401 int words = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
3403 /* Variable sized entities are always passed/returned in memory. */
3407 if (mode != VOIDmode
3408 && targetm.calls.must_pass_in_stack (mode, type))
3411 if (type && AGGREGATE_TYPE_P (type))
3415 enum x86_64_reg_class subclasses[MAX_CLASSES];
3417 /* On x86-64 we pass structures larger than 16 bytes on the stack. */
3421 for (i = 0; i < words; i++)
3422 classes[i] = X86_64_NO_CLASS;
3424 /* Zero sized arrays or structures are NO_CLASS. We return 0 to
3425 signalize memory class, so handle it as special case. */