1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
3 2002, 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
12 GCC is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING. If not, write to
19 the Free Software Foundation, 51 Franklin Street, Fifth Floor,
20 Boston, MA 02110-1301, USA. */
24 #include "coretypes.h"
30 #include "hard-reg-set.h"
32 #include "insn-config.h"
33 #include "conditions.h"
35 #include "insn-codes.h"
36 #include "insn-attr.h"
44 #include "basic-block.h"
47 #include "target-def.h"
48 #include "langhooks.h"
50 #include "tree-gimple.h"
52 #include "tm-constrs.h"
/* Default stack-limit sentinel (-1 == no limit) if the target headers did not
   define one.  NOTE(review): the matching #endif is not present in this
   excerpt (embedded numbering jumps 56 -> 59) — restore it from the original
   i386.c before compiling.  */
55 #ifndef CHECK_STACK_LIMIT
56 #define CHECK_STACK_LIMIT (-1)
/* Maps QI/HI/SI/DImode to indices 0..3 of the per-mode mult/div cost arrays.
   NOTE(review): the final alternative (the ": 4)" fallback selecting the
   "other" slot) is missing from this excerpt (numbering jumps 64 -> 67) —
   as written the conditional chain is unterminated; verify upstream.  */
59 /* Return index of given mode in mult and division cost tables. */
60 #define MODE_INDEX(mode) \
61 ((mode) == QImode ? 0 \
62 : (mode) == HImode ? 1 \
63 : (mode) == SImode ? 2 \
64 : (mode) == DImode ? 3 \
/* COSTS_N_BYTES expresses costs in code-size units (bytes) for -Os tuning;
   DUMMY_STRINGOP_ALGS is a placeholder stringop-strategy entry (always
   libcall) used where a cost table has no real 64-bit variant — see its
   uses in the processor cost tables below.  */
67 /* Processor costs (relative to an add) */
68 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
69 #define COSTS_N_BYTES(N) ((N) * 2)
71 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
/* Cost table used when optimizing for size (-Os): all entries are byte
   counts, not cycle counts.  NOTE(review): this excerpt has non-contiguous
   embedded line numbers, so some positional fields (e.g. the "large" insn
   field after movzx, branch costs after the prefetch entries) and the
   closing "};" appear to be missing — verify field order against the
   struct processor_costs declaration in upstream i386.h/i386.c.  */
74 struct processor_costs size_cost = { /* costs for tuning for size */
75 COSTS_N_BYTES (2), /* cost of an add instruction */
76 COSTS_N_BYTES (3), /* cost of a lea instruction */
77 COSTS_N_BYTES (2), /* variable shift costs */
78 COSTS_N_BYTES (3), /* constant shift costs */
79 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
80 COSTS_N_BYTES (3), /* HI */
81 COSTS_N_BYTES (3), /* SI */
82 COSTS_N_BYTES (3), /* DI */
83 COSTS_N_BYTES (5)}, /* other */
84 0, /* cost of multiply per each bit set */
85 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
86 COSTS_N_BYTES (3), /* HI */
87 COSTS_N_BYTES (3), /* SI */
88 COSTS_N_BYTES (3), /* DI */
89 COSTS_N_BYTES (5)}, /* other */
90 COSTS_N_BYTES (3), /* cost of movsx */
91 COSTS_N_BYTES (3), /* cost of movzx */
94 2, /* cost for loading QImode using movzbl */
95 {2, 2, 2}, /* cost of loading integer registers
96 in QImode, HImode and SImode.
97 Relative to reg-reg move (2). */
98 {2, 2, 2}, /* cost of storing integer registers */
99 2, /* cost of reg,reg fld/fst */
100 {2, 2, 2}, /* cost of loading fp registers
101 in SFmode, DFmode and XFmode */
102 {2, 2, 2}, /* cost of storing fp registers
103 in SFmode, DFmode and XFmode */
104 3, /* cost of moving MMX register */
105 {3, 3}, /* cost of loading MMX registers
106 in SImode and DImode */
107 {3, 3}, /* cost of storing MMX registers
108 in SImode and DImode */
109 3, /* cost of moving SSE register */
110 {3, 3, 3}, /* cost of loading SSE registers
111 in SImode, DImode and TImode */
112 {3, 3, 3}, /* cost of storing SSE registers
113 in SImode, DImode and TImode */
114 3, /* MMX or SSE register to integer */
115 0, /* size of prefetch block */
116 0, /* number of parallel prefetches */
118 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
119 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
120 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
121 COSTS_N_BYTES (2), /* cost of FABS instruction. */
122 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
123 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
124 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
125 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
126 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
127 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}}
/* Cycle-count cost table for the original Intel 386.
   NOTE(review): embedded line numbers are non-contiguous here (e.g. the
   "large" insn line is present but a line between 150 and 152 and the
   closing "};" after line 185 are not) — treat this initializer as possibly
   incomplete and check against upstream i386.c.  */
130 /* Processor costs (relative to an add) */
132 struct processor_costs i386_cost = { /* 386 specific costs */
133 COSTS_N_INSNS (1), /* cost of an add instruction */
134 COSTS_N_INSNS (1), /* cost of a lea instruction */
135 COSTS_N_INSNS (3), /* variable shift costs */
136 COSTS_N_INSNS (2), /* constant shift costs */
137 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
138 COSTS_N_INSNS (6), /* HI */
139 COSTS_N_INSNS (6), /* SI */
140 COSTS_N_INSNS (6), /* DI */
141 COSTS_N_INSNS (6)}, /* other */
142 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
143 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
144 COSTS_N_INSNS (23), /* HI */
145 COSTS_N_INSNS (23), /* SI */
146 COSTS_N_INSNS (23), /* DI */
147 COSTS_N_INSNS (23)}, /* other */
148 COSTS_N_INSNS (3), /* cost of movsx */
149 COSTS_N_INSNS (2), /* cost of movzx */
150 15, /* "large" insn */
152 4, /* cost for loading QImode using movzbl */
153 {2, 4, 2}, /* cost of loading integer registers
154 in QImode, HImode and SImode.
155 Relative to reg-reg move (2). */
156 {2, 4, 2}, /* cost of storing integer registers */
157 2, /* cost of reg,reg fld/fst */
158 {8, 8, 8}, /* cost of loading fp registers
159 in SFmode, DFmode and XFmode */
160 {8, 8, 8}, /* cost of storing fp registers
161 in SFmode, DFmode and XFmode */
162 2, /* cost of moving MMX register */
163 {4, 8}, /* cost of loading MMX registers
164 in SImode and DImode */
165 {4, 8}, /* cost of storing MMX registers
166 in SImode and DImode */
167 2, /* cost of moving SSE register */
168 {4, 8, 16}, /* cost of loading SSE registers
169 in SImode, DImode and TImode */
170 {4, 8, 16}, /* cost of storing SSE registers
171 in SImode, DImode and TImode */
172 3, /* MMX or SSE register to integer */
173 0, /* size of prefetch block */
174 0, /* number of parallel prefetches */
176 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
177 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
178 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
179 COSTS_N_INSNS (22), /* cost of FABS instruction. */
180 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
181 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
182 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
183 DUMMY_STRINGOP_ALGS},
184 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
185 DUMMY_STRINGOP_ALGS},
/* Cycle-count cost table for the Intel 486.
   NOTE(review): non-contiguous embedded numbering — the final
   DUMMY_STRINGOP_ALGS line and closing "};" after line 241 are missing
   from this excerpt; verify against upstream i386.c.  */
189 struct processor_costs i486_cost = { /* 486 specific costs */
190 COSTS_N_INSNS (1), /* cost of an add instruction */
191 COSTS_N_INSNS (1), /* cost of a lea instruction */
192 COSTS_N_INSNS (3), /* variable shift costs */
193 COSTS_N_INSNS (2), /* constant shift costs */
194 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
195 COSTS_N_INSNS (12), /* HI */
196 COSTS_N_INSNS (12), /* SI */
197 COSTS_N_INSNS (12), /* DI */
198 COSTS_N_INSNS (12)}, /* other */
199 1, /* cost of multiply per each bit set */
200 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
201 COSTS_N_INSNS (40), /* HI */
202 COSTS_N_INSNS (40), /* SI */
203 COSTS_N_INSNS (40), /* DI */
204 COSTS_N_INSNS (40)}, /* other */
205 COSTS_N_INSNS (3), /* cost of movsx */
206 COSTS_N_INSNS (2), /* cost of movzx */
207 15, /* "large" insn */
209 4, /* cost for loading QImode using movzbl */
210 {2, 4, 2}, /* cost of loading integer registers
211 in QImode, HImode and SImode.
212 Relative to reg-reg move (2). */
213 {2, 4, 2}, /* cost of storing integer registers */
214 2, /* cost of reg,reg fld/fst */
215 {8, 8, 8}, /* cost of loading fp registers
216 in SFmode, DFmode and XFmode */
217 {8, 8, 8}, /* cost of storing fp registers
218 in SFmode, DFmode and XFmode */
219 2, /* cost of moving MMX register */
220 {4, 8}, /* cost of loading MMX registers
221 in SImode and DImode */
222 {4, 8}, /* cost of storing MMX registers
223 in SImode and DImode */
224 2, /* cost of moving SSE register */
225 {4, 8, 16}, /* cost of loading SSE registers
226 in SImode, DImode and TImode */
227 {4, 8, 16}, /* cost of storing SSE registers
228 in SImode, DImode and TImode */
229 3, /* MMX or SSE register to integer */
230 0, /* size of prefetch block */
231 0, /* number of parallel prefetches */
233 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
234 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
235 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
236 COSTS_N_INSNS (3), /* cost of FABS instruction. */
237 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
238 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
239 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
240 DUMMY_STRINGOP_ALGS},
241 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
/* Cycle-count cost table for the Intel Pentium (P5).
   NOTE(review): non-contiguous embedded numbering — the trailing stringop
   entry's DUMMY_STRINGOP_ALGS line and the closing "};" after line 298 are
   missing from this excerpt; verify against upstream i386.c.  */
246 struct processor_costs pentium_cost = {
247 COSTS_N_INSNS (1), /* cost of an add instruction */
248 COSTS_N_INSNS (1), /* cost of a lea instruction */
249 COSTS_N_INSNS (4), /* variable shift costs */
250 COSTS_N_INSNS (1), /* constant shift costs */
251 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
252 COSTS_N_INSNS (11), /* HI */
253 COSTS_N_INSNS (11), /* SI */
254 COSTS_N_INSNS (11), /* DI */
255 COSTS_N_INSNS (11)}, /* other */
256 0, /* cost of multiply per each bit set */
257 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
258 COSTS_N_INSNS (25), /* HI */
259 COSTS_N_INSNS (25), /* SI */
260 COSTS_N_INSNS (25), /* DI */
261 COSTS_N_INSNS (25)}, /* other */
262 COSTS_N_INSNS (3), /* cost of movsx */
263 COSTS_N_INSNS (2), /* cost of movzx */
264 8, /* "large" insn */
266 6, /* cost for loading QImode using movzbl */
267 {2, 4, 2}, /* cost of loading integer registers
268 in QImode, HImode and SImode.
269 Relative to reg-reg move (2). */
270 {2, 4, 2}, /* cost of storing integer registers */
271 2, /* cost of reg,reg fld/fst */
272 {2, 2, 6}, /* cost of loading fp registers
273 in SFmode, DFmode and XFmode */
274 {4, 4, 6}, /* cost of storing fp registers
275 in SFmode, DFmode and XFmode */
276 8, /* cost of moving MMX register */
277 {8, 8}, /* cost of loading MMX registers
278 in SImode and DImode */
279 {8, 8}, /* cost of storing MMX registers
280 in SImode and DImode */
281 2, /* cost of moving SSE register */
282 {4, 8, 16}, /* cost of loading SSE registers
283 in SImode, DImode and TImode */
284 {4, 8, 16}, /* cost of storing SSE registers
285 in SImode, DImode and TImode */
286 3, /* MMX or SSE register to integer */
287 0, /* size of prefetch block */
288 0, /* number of parallel prefetches */
290 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
291 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
292 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
293 COSTS_N_INSNS (1), /* cost of FABS instruction. */
294 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
295 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
296 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
297 DUMMY_STRINGOP_ALGS},
298 {{libcall, {{-1, rep_prefix_4_byte}}},
/* Cycle-count cost table for the Intel Pentium Pro / P6 family.
   NOTE(review): non-contiguous embedded numbering — the end of the block
   comment before the stringop tables (line 357), the trailing
   DUMMY_STRINGOP_ALGS and closing "};" after line 362 are missing from
   this excerpt; verify against upstream i386.c.  */
303 struct processor_costs pentiumpro_cost = {
304 COSTS_N_INSNS (1), /* cost of an add instruction */
305 COSTS_N_INSNS (1), /* cost of a lea instruction */
306 COSTS_N_INSNS (1), /* variable shift costs */
307 COSTS_N_INSNS (1), /* constant shift costs */
308 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
309 COSTS_N_INSNS (4), /* HI */
310 COSTS_N_INSNS (4), /* SI */
311 COSTS_N_INSNS (4), /* DI */
312 COSTS_N_INSNS (4)}, /* other */
313 0, /* cost of multiply per each bit set */
314 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
315 COSTS_N_INSNS (17), /* HI */
316 COSTS_N_INSNS (17), /* SI */
317 COSTS_N_INSNS (17), /* DI */
318 COSTS_N_INSNS (17)}, /* other */
319 COSTS_N_INSNS (1), /* cost of movsx */
320 COSTS_N_INSNS (1), /* cost of movzx */
321 8, /* "large" insn */
323 2, /* cost for loading QImode using movzbl */
324 {4, 4, 4}, /* cost of loading integer registers
325 in QImode, HImode and SImode.
326 Relative to reg-reg move (2). */
327 {2, 2, 2}, /* cost of storing integer registers */
328 2, /* cost of reg,reg fld/fst */
329 {2, 2, 6}, /* cost of loading fp registers
330 in SFmode, DFmode and XFmode */
331 {4, 4, 6}, /* cost of storing fp registers
332 in SFmode, DFmode and XFmode */
333 2, /* cost of moving MMX register */
334 {2, 2}, /* cost of loading MMX registers
335 in SImode and DImode */
336 {2, 2}, /* cost of storing MMX registers
337 in SImode and DImode */
338 2, /* cost of moving SSE register */
339 {2, 2, 8}, /* cost of loading SSE registers
340 in SImode, DImode and TImode */
341 {2, 2, 8}, /* cost of storing SSE registers
342 in SImode, DImode and TImode */
343 3, /* MMX or SSE register to integer */
344 32, /* size of prefetch block */
345 6, /* number of parallel prefetches */
347 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
348 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
349 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
350 COSTS_N_INSNS (2), /* cost of FABS instruction. */
351 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
352 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
353 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes (we ensure
354 the alignment). For small blocks inline loop is still a noticeable win, for bigger
355 blocks either rep movsl or rep movsb is way to go. Rep movsb has apparently
356 more expensive startup time in CPU, but after 4K the difference is down in the noise.
358 {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
359 {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
360 DUMMY_STRINGOP_ALGS},
361 {{rep_prefix_4_byte, {{1024, unrolled_loop},
362 {8192, rep_prefix_4_byte}, {-1, libcall}}},
/* Cycle-count cost table for the AMD Geode.
   NOTE(review): non-contiguous embedded numbering — the trailing
   DUMMY_STRINGOP_ALGS line and closing "};" after line 420 are missing
   from this excerpt; verify against upstream i386.c.  */
367 struct processor_costs geode_cost = {
368 COSTS_N_INSNS (1), /* cost of an add instruction */
369 COSTS_N_INSNS (1), /* cost of a lea instruction */
370 COSTS_N_INSNS (2), /* variable shift costs */
371 COSTS_N_INSNS (1), /* constant shift costs */
372 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
373 COSTS_N_INSNS (4), /* HI */
374 COSTS_N_INSNS (7), /* SI */
375 COSTS_N_INSNS (7), /* DI */
376 COSTS_N_INSNS (7)}, /* other */
377 0, /* cost of multiply per each bit set */
378 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
379 COSTS_N_INSNS (23), /* HI */
380 COSTS_N_INSNS (39), /* SI */
381 COSTS_N_INSNS (39), /* DI */
382 COSTS_N_INSNS (39)}, /* other */
383 COSTS_N_INSNS (1), /* cost of movsx */
384 COSTS_N_INSNS (1), /* cost of movzx */
385 8, /* "large" insn */
387 1, /* cost for loading QImode using movzbl */
388 {1, 1, 1}, /* cost of loading integer registers
389 in QImode, HImode and SImode.
390 Relative to reg-reg move (2). */
391 {1, 1, 1}, /* cost of storing integer registers */
392 1, /* cost of reg,reg fld/fst */
393 {1, 1, 1}, /* cost of loading fp registers
394 in SFmode, DFmode and XFmode */
395 {4, 6, 6}, /* cost of storing fp registers
396 in SFmode, DFmode and XFmode */
398 1, /* cost of moving MMX register */
399 {1, 1}, /* cost of loading MMX registers
400 in SImode and DImode */
401 {1, 1}, /* cost of storing MMX registers
402 in SImode and DImode */
403 1, /* cost of moving SSE register */
404 {1, 1, 1}, /* cost of loading SSE registers
405 in SImode, DImode and TImode */
406 {1, 1, 1}, /* cost of storing SSE registers
407 in SImode, DImode and TImode */
408 1, /* MMX or SSE register to integer */
409 32, /* size of prefetch block */
410 1, /* number of parallel prefetches */
412 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
413 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
414 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
415 COSTS_N_INSNS (1), /* cost of FABS instruction. */
416 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
417 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
418 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
419 DUMMY_STRINGOP_ALGS},
420 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
/* Cycle-count cost table for the AMD K6.
   NOTE(review): non-contiguous embedded numbering — the trailing
   DUMMY_STRINGOP_ALGS line and closing "};" after line 477 are missing
   from this excerpt; verify against upstream i386.c.  */
425 struct processor_costs k6_cost = {
426 COSTS_N_INSNS (1), /* cost of an add instruction */
427 COSTS_N_INSNS (2), /* cost of a lea instruction */
428 COSTS_N_INSNS (1), /* variable shift costs */
429 COSTS_N_INSNS (1), /* constant shift costs */
430 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
431 COSTS_N_INSNS (3), /* HI */
432 COSTS_N_INSNS (3), /* SI */
433 COSTS_N_INSNS (3), /* DI */
434 COSTS_N_INSNS (3)}, /* other */
435 0, /* cost of multiply per each bit set */
436 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
437 COSTS_N_INSNS (18), /* HI */
438 COSTS_N_INSNS (18), /* SI */
439 COSTS_N_INSNS (18), /* DI */
440 COSTS_N_INSNS (18)}, /* other */
441 COSTS_N_INSNS (2), /* cost of movsx */
442 COSTS_N_INSNS (2), /* cost of movzx */
443 8, /* "large" insn */
445 3, /* cost for loading QImode using movzbl */
446 {4, 5, 4}, /* cost of loading integer registers
447 in QImode, HImode and SImode.
448 Relative to reg-reg move (2). */
449 {2, 3, 2}, /* cost of storing integer registers */
450 4, /* cost of reg,reg fld/fst */
451 {6, 6, 6}, /* cost of loading fp registers
452 in SFmode, DFmode and XFmode */
453 {4, 4, 4}, /* cost of storing fp registers
454 in SFmode, DFmode and XFmode */
455 2, /* cost of moving MMX register */
456 {2, 2}, /* cost of loading MMX registers
457 in SImode and DImode */
458 {2, 2}, /* cost of storing MMX registers
459 in SImode and DImode */
460 2, /* cost of moving SSE register */
461 {2, 2, 8}, /* cost of loading SSE registers
462 in SImode, DImode and TImode */
463 {2, 2, 8}, /* cost of storing SSE registers
464 in SImode, DImode and TImode */
465 6, /* MMX or SSE register to integer */
466 32, /* size of prefetch block */
467 1, /* number of parallel prefetches */
469 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
470 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
471 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
472 COSTS_N_INSNS (2), /* cost of FABS instruction. */
473 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
474 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
475 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
476 DUMMY_STRINGOP_ALGS},
477 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
/* Cycle-count cost table for the AMD Athlon (K7).
   NOTE(review): non-contiguous embedded numbering — the trailing
   DUMMY_STRINGOP_ALGS line and closing "};" after line 537 are missing
   from this excerpt; verify against upstream i386.c.  */
482 struct processor_costs athlon_cost = {
483 COSTS_N_INSNS (1), /* cost of an add instruction */
484 COSTS_N_INSNS (2), /* cost of a lea instruction */
485 COSTS_N_INSNS (1), /* variable shift costs */
486 COSTS_N_INSNS (1), /* constant shift costs */
487 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
488 COSTS_N_INSNS (5), /* HI */
489 COSTS_N_INSNS (5), /* SI */
490 COSTS_N_INSNS (5), /* DI */
491 COSTS_N_INSNS (5)}, /* other */
492 0, /* cost of multiply per each bit set */
493 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
494 COSTS_N_INSNS (26), /* HI */
495 COSTS_N_INSNS (42), /* SI */
496 COSTS_N_INSNS (74), /* DI */
497 COSTS_N_INSNS (74)}, /* other */
498 COSTS_N_INSNS (1), /* cost of movsx */
499 COSTS_N_INSNS (1), /* cost of movzx */
500 8, /* "large" insn */
502 4, /* cost for loading QImode using movzbl */
503 {3, 4, 3}, /* cost of loading integer registers
504 in QImode, HImode and SImode.
505 Relative to reg-reg move (2). */
506 {3, 4, 3}, /* cost of storing integer registers */
507 4, /* cost of reg,reg fld/fst */
508 {4, 4, 12}, /* cost of loading fp registers
509 in SFmode, DFmode and XFmode */
510 {6, 6, 8}, /* cost of storing fp registers
511 in SFmode, DFmode and XFmode */
512 2, /* cost of moving MMX register */
513 {4, 4}, /* cost of loading MMX registers
514 in SImode and DImode */
515 {4, 4}, /* cost of storing MMX registers
516 in SImode and DImode */
517 2, /* cost of moving SSE register */
518 {4, 4, 6}, /* cost of loading SSE registers
519 in SImode, DImode and TImode */
520 {4, 4, 5}, /* cost of storing SSE registers
521 in SImode, DImode and TImode */
522 5, /* MMX or SSE register to integer */
523 64, /* size of prefetch block */
524 6, /* number of parallel prefetches */
526 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
527 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
528 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
529 COSTS_N_INSNS (2), /* cost of FABS instruction. */
530 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
531 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
532 /* For some reason, Athlon deals better with REP prefix (relative to loops)
533 compared to K8. Alignment becomes important after 8 bytes for memcpy and
534 128 bytes for memset. */
535 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
536 DUMMY_STRINGOP_ALGS},
537 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
/* Cycle-count cost table for the AMD K8 (Opteron/Athlon 64).
   NOTE(review): non-contiguous embedded numbering — the end of the
   prefetch comment (line 588), any fields between lines 589 and 591, and
   the closing "};" after line 604 are missing from this excerpt; verify
   against upstream i386.c.  */
542 struct processor_costs k8_cost = {
543 COSTS_N_INSNS (1), /* cost of an add instruction */
544 COSTS_N_INSNS (2), /* cost of a lea instruction */
545 COSTS_N_INSNS (1), /* variable shift costs */
546 COSTS_N_INSNS (1), /* constant shift costs */
547 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
548 COSTS_N_INSNS (4), /* HI */
549 COSTS_N_INSNS (3), /* SI */
550 COSTS_N_INSNS (4), /* DI */
551 COSTS_N_INSNS (5)}, /* other */
552 0, /* cost of multiply per each bit set */
553 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
554 COSTS_N_INSNS (26), /* HI */
555 COSTS_N_INSNS (42), /* SI */
556 COSTS_N_INSNS (74), /* DI */
557 COSTS_N_INSNS (74)}, /* other */
558 COSTS_N_INSNS (1), /* cost of movsx */
559 COSTS_N_INSNS (1), /* cost of movzx */
560 8, /* "large" insn */
562 4, /* cost for loading QImode using movzbl */
563 {3, 4, 3}, /* cost of loading integer registers
564 in QImode, HImode and SImode.
565 Relative to reg-reg move (2). */
566 {3, 4, 3}, /* cost of storing integer registers */
567 4, /* cost of reg,reg fld/fst */
568 {4, 4, 12}, /* cost of loading fp registers
569 in SFmode, DFmode and XFmode */
570 {6, 6, 8}, /* cost of storing fp registers
571 in SFmode, DFmode and XFmode */
572 2, /* cost of moving MMX register */
573 {3, 3}, /* cost of loading MMX registers
574 in SImode and DImode */
575 {4, 4}, /* cost of storing MMX registers
576 in SImode and DImode */
577 2, /* cost of moving SSE register */
578 {4, 3, 6}, /* cost of loading SSE registers
579 in SImode, DImode and TImode */
580 {4, 4, 5}, /* cost of storing SSE registers
581 in SImode, DImode and TImode */
582 5, /* MMX or SSE register to integer */
583 64, /* size of prefetch block */
584 /* New AMD processors never drop prefetches; if they cannot be performed
585 immediately, they are queued. We set number of simultaneous prefetches
586 to a large constant to reflect this (it probably is not a good idea not
587 to limit number of prefetches at all, as their execution also takes some
589 100, /* number of parallel prefetches */
591 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
592 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
593 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
594 COSTS_N_INSNS (2), /* cost of FABS instruction. */
595 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
596 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
597 /* K8 has optimized REP instruction for medium sized blocks, but for very small
598 blocks it is better to use loop. For large blocks, libcall can do
599 nontemporary accesses and beat inline considerably. */
600 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
601 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
602 {{libcall, {{8, loop}, {24, unrolled_loop},
603 {2048, rep_prefix_4_byte}, {-1, libcall}}},
604 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}
/* Cycle-count cost table for AMD Family 10h (Barcelona).
   NOTE(review): non-contiguous embedded numbering — the lines framing the
   MOVD latency table (647-656) are partially dropped, leaving fragments of
   a comment outside comment delimiters, and the closing "};" after line
   678 is missing from this excerpt; verify against upstream i386.c.  */
607 struct processor_costs amdfam10_cost = {
608 COSTS_N_INSNS (1), /* cost of an add instruction */
609 COSTS_N_INSNS (2), /* cost of a lea instruction */
610 COSTS_N_INSNS (1), /* variable shift costs */
611 COSTS_N_INSNS (1), /* constant shift costs */
612 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
613 COSTS_N_INSNS (4), /* HI */
614 COSTS_N_INSNS (3), /* SI */
615 COSTS_N_INSNS (4), /* DI */
616 COSTS_N_INSNS (5)}, /* other */
617 0, /* cost of multiply per each bit set */
618 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
619 COSTS_N_INSNS (35), /* HI */
620 COSTS_N_INSNS (51), /* SI */
621 COSTS_N_INSNS (83), /* DI */
622 COSTS_N_INSNS (83)}, /* other */
623 COSTS_N_INSNS (1), /* cost of movsx */
624 COSTS_N_INSNS (1), /* cost of movzx */
625 8, /* "large" insn */
627 4, /* cost for loading QImode using movzbl */
628 {3, 4, 3}, /* cost of loading integer registers
629 in QImode, HImode and SImode.
630 Relative to reg-reg move (2). */
631 {3, 4, 3}, /* cost of storing integer registers */
632 4, /* cost of reg,reg fld/fst */
633 {4, 4, 12}, /* cost of loading fp registers
634 in SFmode, DFmode and XFmode */
635 {6, 6, 8}, /* cost of storing fp registers
636 in SFmode, DFmode and XFmode */
637 2, /* cost of moving MMX register */
638 {3, 3}, /* cost of loading MMX registers
639 in SImode and DImode */
640 {4, 4}, /* cost of storing MMX registers
641 in SImode and DImode */
642 2, /* cost of moving SSE register */
643 {4, 4, 3}, /* cost of loading SSE registers
644 in SImode, DImode and TImode */
645 {4, 4, 5}, /* cost of storing SSE registers
646 in SImode, DImode and TImode */
647 3, /* MMX or SSE register to integer */
649 MOVD reg64, xmmreg Double FSTORE 4
650 MOVD reg32, xmmreg Double FSTORE 4
652 MOVD reg64, xmmreg Double FADD 3
654 MOVD reg32, xmmreg Double FADD 3
656 64, /* size of prefetch block */
657 /* New AMD processors never drop prefetches; if they cannot be performed
658 immediately, they are queued. We set number of simultaneous prefetches
659 to a large constant to reflect this (it probably is not a good idea not
660 to limit number of prefetches at all, as their execution also takes some
662 100, /* number of parallel prefetches */
664 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
665 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
666 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
667 COSTS_N_INSNS (2), /* cost of FABS instruction. */
668 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
669 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
671 /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
672 very small blocks it is better to use loop. For large blocks, libcall can
673 do nontemporary accesses and beat inline considerably. */
674 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
675 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
676 {{libcall, {{8, loop}, {24, unrolled_loop},
677 {2048, rep_prefix_4_byte}, {-1, libcall}}},
678 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}
/* Cycle-count cost table for the Intel Pentium 4 (NetBurst).
   NOTE(review): non-contiguous embedded numbering — line 735 (the tail of
   the second stringop entry) and the closing "};" after line 736 are
   missing from this excerpt; verify against upstream i386.c.  */
682 struct processor_costs pentium4_cost = {
683 COSTS_N_INSNS (1), /* cost of an add instruction */
684 COSTS_N_INSNS (3), /* cost of a lea instruction */
685 COSTS_N_INSNS (4), /* variable shift costs */
686 COSTS_N_INSNS (4), /* constant shift costs */
687 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
688 COSTS_N_INSNS (15), /* HI */
689 COSTS_N_INSNS (15), /* SI */
690 COSTS_N_INSNS (15), /* DI */
691 COSTS_N_INSNS (15)}, /* other */
692 0, /* cost of multiply per each bit set */
693 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
694 COSTS_N_INSNS (56), /* HI */
695 COSTS_N_INSNS (56), /* SI */
696 COSTS_N_INSNS (56), /* DI */
697 COSTS_N_INSNS (56)}, /* other */
698 COSTS_N_INSNS (1), /* cost of movsx */
699 COSTS_N_INSNS (1), /* cost of movzx */
700 16, /* "large" insn */
702 2, /* cost for loading QImode using movzbl */
703 {4, 5, 4}, /* cost of loading integer registers
704 in QImode, HImode and SImode.
705 Relative to reg-reg move (2). */
706 {2, 3, 2}, /* cost of storing integer registers */
707 2, /* cost of reg,reg fld/fst */
708 {2, 2, 6}, /* cost of loading fp registers
709 in SFmode, DFmode and XFmode */
710 {4, 4, 6}, /* cost of storing fp registers
711 in SFmode, DFmode and XFmode */
712 2, /* cost of moving MMX register */
713 {2, 2}, /* cost of loading MMX registers
714 in SImode and DImode */
715 {2, 2}, /* cost of storing MMX registers
716 in SImode and DImode */
717 12, /* cost of moving SSE register */
718 {12, 12, 12}, /* cost of loading SSE registers
719 in SImode, DImode and TImode */
720 {2, 2, 8}, /* cost of storing SSE registers
721 in SImode, DImode and TImode */
722 10, /* MMX or SSE register to integer */
723 64, /* size of prefetch block */
724 6, /* number of parallel prefetches */
726 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
727 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
728 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
729 COSTS_N_INSNS (2), /* cost of FABS instruction. */
730 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
731 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
732 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
733 DUMMY_STRINGOP_ALGS},
734 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
736 DUMMY_STRINGOP_ALGS},
/* Cycle-count cost table for the Intel Nocona (64-bit Pentium 4 / Xeon).
   NOTE(review): non-contiguous embedded numbering — line 794 (the tail of
   the second memset stringop entry) and the closing "};" after line 796
   are missing from this excerpt; verify against upstream i386.c.  */
740 struct processor_costs nocona_cost = {
741 COSTS_N_INSNS (1), /* cost of an add instruction */
742 COSTS_N_INSNS (1), /* cost of a lea instruction */
743 COSTS_N_INSNS (1), /* variable shift costs */
744 COSTS_N_INSNS (1), /* constant shift costs */
745 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
746 COSTS_N_INSNS (10), /* HI */
747 COSTS_N_INSNS (10), /* SI */
748 COSTS_N_INSNS (10), /* DI */
749 COSTS_N_INSNS (10)}, /* other */
750 0, /* cost of multiply per each bit set */
751 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
752 COSTS_N_INSNS (66), /* HI */
753 COSTS_N_INSNS (66), /* SI */
754 COSTS_N_INSNS (66), /* DI */
755 COSTS_N_INSNS (66)}, /* other */
756 COSTS_N_INSNS (1), /* cost of movsx */
757 COSTS_N_INSNS (1), /* cost of movzx */
758 16, /* "large" insn */
760 4, /* cost for loading QImode using movzbl */
761 {4, 4, 4}, /* cost of loading integer registers
762 in QImode, HImode and SImode.
763 Relative to reg-reg move (2). */
764 {4, 4, 4}, /* cost of storing integer registers */
765 3, /* cost of reg,reg fld/fst */
766 {12, 12, 12}, /* cost of loading fp registers
767 in SFmode, DFmode and XFmode */
768 {4, 4, 4}, /* cost of storing fp registers
769 in SFmode, DFmode and XFmode */
770 6, /* cost of moving MMX register */
771 {12, 12}, /* cost of loading MMX registers
772 in SImode and DImode */
773 {12, 12}, /* cost of storing MMX registers
774 in SImode and DImode */
775 6, /* cost of moving SSE register */
776 {12, 12, 12}, /* cost of loading SSE registers
777 in SImode, DImode and TImode */
778 {12, 12, 12}, /* cost of storing SSE registers
779 in SImode, DImode and TImode */
780 8, /* MMX or SSE register to integer */
781 128, /* size of prefetch block */
782 8, /* number of parallel prefetches */
784 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
785 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
786 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
787 COSTS_N_INSNS (3), /* cost of FABS instruction. */
788 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
789 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
790 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
791 {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
792 {100000, unrolled_loop}, {-1, libcall}}}},
793 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
795 {libcall, {{24, loop}, {64, unrolled_loop},
796 {8192, rep_prefix_8_byte}, {-1, libcall}}}}
/* Cycle-count cost table for the Intel Core 2.
   NOTE(review): non-contiguous embedded numbering — lines between 818 and
   820 and the closing "};" after line 855 are missing from this excerpt.
   Also note the comment on line 828 says "loading" where the positional
   slot is the fp-register *store* costs (compare the other tables) —
   likely a stale copy-paste in the original; verify against upstream.  */
800 struct processor_costs core2_cost = {
801 COSTS_N_INSNS (1), /* cost of an add instruction */
802 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
803 COSTS_N_INSNS (1), /* variable shift costs */
804 COSTS_N_INSNS (1), /* constant shift costs */
805 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
806 COSTS_N_INSNS (3), /* HI */
807 COSTS_N_INSNS (3), /* SI */
808 COSTS_N_INSNS (3), /* DI */
809 COSTS_N_INSNS (3)}, /* other */
810 0, /* cost of multiply per each bit set */
811 {COSTS_N_INSNS (22), /* cost of a divide/mod for QI */
812 COSTS_N_INSNS (22), /* HI */
813 COSTS_N_INSNS (22), /* SI */
814 COSTS_N_INSNS (22), /* DI */
815 COSTS_N_INSNS (22)}, /* other */
816 COSTS_N_INSNS (1), /* cost of movsx */
817 COSTS_N_INSNS (1), /* cost of movzx */
818 8, /* "large" insn */
820 2, /* cost for loading QImode using movzbl */
821 {6, 6, 6}, /* cost of loading integer registers
822 in QImode, HImode and SImode.
823 Relative to reg-reg move (2). */
824 {4, 4, 4}, /* cost of storing integer registers */
825 2, /* cost of reg,reg fld/fst */
826 {6, 6, 6}, /* cost of loading fp registers
827 in SFmode, DFmode and XFmode */
828 {4, 4, 4}, /* cost of loading integer registers */
829 2, /* cost of moving MMX register */
830 {6, 6}, /* cost of loading MMX registers
831 in SImode and DImode */
832 {4, 4}, /* cost of storing MMX registers
833 in SImode and DImode */
834 2, /* cost of moving SSE register */
835 {6, 6, 6}, /* cost of loading SSE registers
836 in SImode, DImode and TImode */
837 {4, 4, 4}, /* cost of storing SSE registers
838 in SImode, DImode and TImode */
839 2, /* MMX or SSE register to integer */
840 128, /* size of prefetch block */
841 8, /* number of parallel prefetches */
843 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
844 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
845 COSTS_N_INSNS (32), /* cost of FDIV instruction. */
846 COSTS_N_INSNS (1), /* cost of FABS instruction. */
847 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
848 COSTS_N_INSNS (58), /* cost of FSQRT instruction. */
849 {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
850 {libcall, {{32, loop}, {64, rep_prefix_4_byte},
851 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
852 {{libcall, {{8, loop}, {15, unrolled_loop},
853 {2048, rep_prefix_4_byte}, {-1, libcall}}},
854 {libcall, {{24, loop}, {32, unrolled_loop},
855 {8192, rep_prefix_8_byte}, {-1, libcall}}}}
858 /* Generic64 should produce code tuned for Nocona and K8. */
860 struct processor_costs generic64_cost = {
861 COSTS_N_INSNS (1), /* cost of an add instruction */
862 /* On all chips taken into consideration lea is 2 cycles and more. With
863 this cost however our current implementation of synth_mult results in
864 use of unnecessary temporary registers causing regression on several
865 SPECfp benchmarks. */
866 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
867 COSTS_N_INSNS (1), /* variable shift costs */
868 COSTS_N_INSNS (1), /* constant shift costs */
869 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
870 COSTS_N_INSNS (4), /* HI */
871 COSTS_N_INSNS (3), /* SI */
872 COSTS_N_INSNS (4), /* DI */
873 COSTS_N_INSNS (2)}, /* other */
874 0, /* cost of multiply per each bit set */
875 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
876 COSTS_N_INSNS (26), /* HI */
877 COSTS_N_INSNS (42), /* SI */
878 COSTS_N_INSNS (74), /* DI */
879 COSTS_N_INSNS (74)}, /* other */
880 COSTS_N_INSNS (1), /* cost of movsx */
881 COSTS_N_INSNS (1), /* cost of movzx */
882 8, /* "large" insn */
884 4, /* cost for loading QImode using movzbl */
885 {4, 4, 4}, /* cost of loading integer registers
886 in QImode, HImode and SImode.
887 Relative to reg-reg move (2). */
888 {4, 4, 4}, /* cost of storing integer registers */
889 4, /* cost of reg,reg fld/fst */
890 {12, 12, 12}, /* cost of loading fp registers
891 in SFmode, DFmode and XFmode */
892 {6, 6, 8}, /* cost of storing fp registers
893 in SFmode, DFmode and XFmode */
894 2, /* cost of moving MMX register */
895 {8, 8}, /* cost of loading MMX registers
896 in SImode and DImode */
897 {8, 8}, /* cost of storing MMX registers
898 in SImode and DImode */
899 2, /* cost of moving SSE register */
900 {8, 8, 8}, /* cost of loading SSE registers
901 in SImode, DImode and TImode */
902 {8, 8, 8}, /* cost of storing SSE registers
903 in SImode, DImode and TImode */
904 5, /* MMX or SSE register to integer */
905 64, /* size of prefetch block */
906 6, /* number of parallel prefetches */
907 /* Benchmarks shows large regressions on K8 sixtrack benchmark when this value
908 is increased to perhaps more appropriate value of 5. */
910 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
911 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
912 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
913 COSTS_N_INSNS (8), /* cost of FABS instruction. */
914 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
915 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
916 {DUMMY_STRINGOP_ALGS,
917 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
918 {DUMMY_STRINGOP_ALGS,
919 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}
922 /* Generic32 should produce code tuned for Athlon, PPro, Pentium4, Nocona and K8. */
924 struct processor_costs generic32_cost = {
925 COSTS_N_INSNS (1), /* cost of an add instruction */
926 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
927 COSTS_N_INSNS (1), /* variable shift costs */
928 COSTS_N_INSNS (1), /* constant shift costs */
929 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
930 COSTS_N_INSNS (4), /* HI */
931 COSTS_N_INSNS (3), /* SI */
932 COSTS_N_INSNS (4), /* DI */
933 COSTS_N_INSNS (2)}, /* other */
934 0, /* cost of multiply per each bit set */
935 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
936 COSTS_N_INSNS (26), /* HI */
937 COSTS_N_INSNS (42), /* SI */
938 COSTS_N_INSNS (74), /* DI */
939 COSTS_N_INSNS (74)}, /* other */
940 COSTS_N_INSNS (1), /* cost of movsx */
941 COSTS_N_INSNS (1), /* cost of movzx */
942 8, /* "large" insn */
944 4, /* cost for loading QImode using movzbl */
945 {4, 4, 4}, /* cost of loading integer registers
946 in QImode, HImode and SImode.
947 Relative to reg-reg move (2). */
948 {4, 4, 4}, /* cost of storing integer registers */
949 4, /* cost of reg,reg fld/fst */
950 {12, 12, 12}, /* cost of loading fp registers
951 in SFmode, DFmode and XFmode */
952 {6, 6, 8}, /* cost of storing fp registers
953 in SFmode, DFmode and XFmode */
954 2, /* cost of moving MMX register */
955 {8, 8}, /* cost of loading MMX registers
956 in SImode and DImode */
957 {8, 8}, /* cost of storing MMX registers
958 in SImode and DImode */
959 2, /* cost of moving SSE register */
960 {8, 8, 8}, /* cost of loading SSE registers
961 in SImode, DImode and TImode */
962 {8, 8, 8}, /* cost of storing SSE registers
963 in SImode, DImode and TImode */
964 5, /* MMX or SSE register to integer */
965 64, /* size of prefetch block */
966 6, /* number of parallel prefetches */
968 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
969 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
970 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
971 COSTS_N_INSNS (8), /* cost of FABS instruction. */
972 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
973 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
974 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
975 DUMMY_STRINGOP_ALGS},
976 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
977 DUMMY_STRINGOP_ALGS},
/* Cost table in effect for the current compilation.  Option processing
   (see processor_target_table in override_options below) repoints this at
   the table matching the selected -mtune CPU; Pentium is the initial
   default.  */
980 const struct processor_costs *ix86_cost = &pentium_cost;
982 /* Processor feature/optimization bitmasks. */
/* One bit per processor, taken from the PROCESSOR_* enum.  Intel CPUs.  */
983 #define m_386 (1<<PROCESSOR_I386)
984 #define m_486 (1<<PROCESSOR_I486)
985 #define m_PENT (1<<PROCESSOR_PENTIUM)
986 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
987 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
988 #define m_NOCONA (1<<PROCESSOR_NOCONA)
989 #define m_CORE2 (1<<PROCESSOR_CORE2)
/* AMD and Geode CPUs, plus convenience unions of related cores.  */
991 #define m_GEODE (1<<PROCESSOR_GEODE)
992 #define m_K6 (1<<PROCESSOR_K6)
993 #define m_K6_GEODE (m_K6 | m_GEODE)
994 #define m_K8 (1<<PROCESSOR_K8)
995 #define m_ATHLON (1<<PROCESSOR_ATHLON)
996 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
997 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
998 #define m_ATHLON_K8_AMDFAM10 (m_K8 | m_ATHLON | m_AMDFAM10)
/* Pseudo-processors used as -mtune=generic tuning targets.  */
1000 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1001 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1003 /* Generic instruction choice should be common subset of supported CPUs
1004 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
1005 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
1007 /* Feature tests against the various tunings. */
1008 unsigned int ix86_tune_features[X86_TUNE_LAST] = {
1009 /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
1010 negatively, so enabling for Generic64 seems like good code size
1011 tradeoff. We can't enable it for 32bit generic because it does not
1012 work well with PPro base chips. */
1013 m_386 | m_K6_GEODE | m_ATHLON_K8_AMDFAM10 | m_CORE2 | m_GENERIC64,
1015 /* X86_TUNE_PUSH_MEMORY */
1016 m_386 | m_K6_GEODE | m_ATHLON_K8_AMDFAM10 | m_PENT4
1017 | m_NOCONA | m_CORE2 | m_GENERIC,
1019 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1022 /* X86_TUNE_USE_BIT_TEST */
1025 /* X86_TUNE_UNROLL_STRLEN */
1026 m_486 | m_PENT | m_PPRO | m_ATHLON_K8_AMDFAM10 | m_K6 | m_CORE2 | m_GENERIC,
1028 /* X86_TUNE_DEEP_BRANCH_PREDICTION */
1029 m_PPRO | m_K6_GEODE | m_ATHLON_K8_AMDFAM10 | m_PENT4
1030 | m_NOCONA | m_CORE2 | m_GENERIC,
1032 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
1033 on simulation result. But after P4 was made, no performance benefit
1034 was observed with branch hints. It also increases the code size.
1035 As a result, icc never generates branch hints. */
1038 /* X86_TUNE_DOUBLE_WITH_ADD */
1041 /* X86_TUNE_USE_SAHF */
1042 m_PPRO | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_PENT4
1043 | m_NOCONA | m_CORE2 | m_GENERIC,
1045 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
1046 partial dependencies. */
1047 m_ATHLON_K8_AMDFAM10 | m_PPRO | m_PENT4 | m_NOCONA
1048 | m_CORE2 | m_GENERIC | m_GEODE /* m_386 | m_K6 */,
1050 /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
1051 register stalls on Generic32 compilation setting as well. However
1052 in current implementation the partial register stalls are not eliminated
1053 very well - they can be introduced via subregs synthesized by combine
1054 and can happen in caller/callee saving sequences. Because this option
1055 pays back little on PPro based chips and is in conflict with partial reg
1056 dependencies used by Athlon/P4 based chips, it is better to leave it off
1057 for generic32 for now. */
1060 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1061 m_CORE2 | m_GENERIC,
1063 /* X86_TUNE_USE_HIMODE_FIOP */
1064 m_386 | m_486 | m_K6_GEODE,
1066 /* X86_TUNE_USE_SIMODE_FIOP */
1067 ~(m_PPRO | m_ATHLON_K8_AMDFAM10 | m_PENT | m_CORE2 | m_GENERIC),
1069 /* X86_TUNE_USE_MOV0 */
1072 /* X86_TUNE_USE_CLTD */
1073 ~(m_PENT | m_K6 | m_CORE2 | m_GENERIC),
1075 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
1078 /* X86_TUNE_SPLIT_LONG_MOVES */
1081 /* X86_TUNE_READ_MODIFY_WRITE */
1084 /* X86_TUNE_READ_MODIFY */
1087 /* X86_TUNE_PROMOTE_QIMODE */
1088 m_K6_GEODE | m_PENT | m_386 | m_486 | m_ATHLON_K8_AMDFAM10 | m_CORE2
1089 | m_GENERIC /* | m_PENT4 ? */,
1091 /* X86_TUNE_FAST_PREFIX */
1092 ~(m_PENT | m_486 | m_386),
1094 /* X86_TUNE_SINGLE_STRINGOP */
1095 m_386 | m_PENT4 | m_NOCONA,
1097 /* X86_TUNE_QIMODE_MATH */
1100 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
1101 register stalls. Just like X86_TUNE_PARTIAL_REG_STALL this option
1102 might be considered for Generic32 if our scheme for avoiding partial
1103 stalls was more effective. */
1106 /* X86_TUNE_PROMOTE_QI_REGS */
1109 /* X86_TUNE_PROMOTE_HI_REGS */
1112 /* X86_TUNE_ADD_ESP_4: Enable if add/sub is preferred over 1/2 push/pop. */
1113 m_ATHLON_K8_AMDFAM10 | m_K6_GEODE | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
1115 /* X86_TUNE_ADD_ESP_8 */
1116 m_ATHLON_K8_AMDFAM10 | m_PPRO | m_K6_GEODE | m_386
1117 | m_486 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
1119 /* X86_TUNE_SUB_ESP_4 */
1120 m_ATHLON_K8_AMDFAM10 | m_PPRO | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
1122 /* X86_TUNE_SUB_ESP_8 */
1123 m_ATHLON_K8_AMDFAM10 | m_PPRO | m_386 | m_486
1124 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
1126 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
1127 for DFmode copies */
1128 ~(m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2
1129 | m_GENERIC | m_GEODE),
1131 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
1132 m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
1134 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
1135 conflict here in between PPro/Pentium4 based chips that thread 128bit
1136 SSE registers as single units versus K8 based chips that divide SSE
1137 registers to two 64bit halves. This knob promotes all store destinations
1138 to be 128bit to allow register renaming on 128bit SSE units, but usually
1139 results in one extra microop on 64bit SSE units. Experimental results
1140 shows that disabling this option on P4 brings over 20% SPECfp regression,
1141 while enabling it on K8 brings roughly 2.4% regression that can be partly
1142 masked by careful scheduling of moves. */
1143 m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC | m_AMDFAM10,
1145 /* X86_TUNE_SSE_UNALIGNED_MOVE_OPTIMAL */
1148 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
1149 are resolved on SSE register parts instead of whole registers, so we may
1150 maintain just lower part of scalar values in proper format leaving the
1151 upper part undefined. */
1154 /* X86_TUNE_SSE_TYPELESS_STORES */
1155 m_ATHLON_K8_AMDFAM10,
1157 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
1158 m_PPRO | m_PENT4 | m_NOCONA,
1160 /* X86_TUNE_MEMORY_MISMATCH_STALL */
1161 m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
1163 /* X86_TUNE_PROLOGUE_USING_MOVE */
1164 m_ATHLON_K8 | m_PPRO | m_CORE2 | m_GENERIC,
1166 /* X86_TUNE_EPILOGUE_USING_MOVE */
1167 m_ATHLON_K8 | m_PPRO | m_CORE2 | m_GENERIC,
1169 /* X86_TUNE_SHIFT1 */
1172 /* X86_TUNE_USE_FFREEP */
1173 m_ATHLON_K8_AMDFAM10,
1175 /* X86_TUNE_INTER_UNIT_MOVES */
1176 ~(m_ATHLON_K8_AMDFAM10 | m_GENERIC),
1178 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
1179 than 4 branch instructions in the 16 byte window. */
1180 m_PPRO | m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
1182 /* X86_TUNE_SCHEDULE */
1183 m_PPRO | m_ATHLON_K8_AMDFAM10 | m_K6_GEODE | m_PENT | m_CORE2 | m_GENERIC,
1185 /* X86_TUNE_USE_BT */
1186 m_ATHLON_K8_AMDFAM10,
1188 /* X86_TUNE_USE_INCDEC */
1189 ~(m_PENT4 | m_NOCONA | m_GENERIC),
1191 /* X86_TUNE_PAD_RETURNS */
1192 m_ATHLON_K8_AMDFAM10 | m_CORE2 | m_GENERIC,
1194 /* X86_TUNE_EXT_80387_CONSTANTS */
1195 m_K6_GEODE | m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC,
1197 /* X86_TUNE_SHORTEN_X87_SSE */
1200 /* X86_TUNE_AVOID_VECTOR_DECODE */
1203 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode
1204 and SImode multiply, but 386 and 486 do HImode multiply faster. */
1207 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
1208 vector path on AMD machines. */
1209 m_K8 | m_GENERIC64 | m_AMDFAM10,
1211 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
   machines.  */
1213 m_K8 | m_GENERIC64 | m_AMDFAM10,
1215 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
   with -1 rather than a mov of the constant.  */
1219 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
1220 but one byte longer. */
1223 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory
1224 operand that cannot be represented using a modRM byte. The XOR
1225 replacement is long decoded, so this split helps here as well. */
1229 /* Feature tests against the various architecture variations. */
1230 unsigned int ix86_arch_features[X86_ARCH_LAST] = {
1231 /* X86_ARCH_CMOVE */
1232 m_PPRO | m_GEODE | m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA,
1234 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
1237 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
1240 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
1243 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
/* CPU tunings for which accumulating outgoing stack arguments is preferred
   over individual pushes.  NOTE(review): the consumer of this mask is
   outside this chunk — confirm against its use site.  */
1247 static const unsigned int x86_accumulate_outgoing_args
1248 = m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC;
/* CPU architectures on which the 387 'fancy' math instructions should
   always be assumed available.  NOTE(review): name-based description —
   confirm against the code that tests this mask.  */
1250 static const unsigned int x86_arch_always_fancy_math_387
1251 = m_PENT | m_PPRO | m_ATHLON_K8_AMDFAM10 | m_PENT4
1252 | m_NOCONA | m_CORE2 | m_GENERIC;
/* String-operation strategy forced by -mstringop-strategy; no_stringop
   means choose automatically.  Parsed from ix86_stringop_string in
   override_options below.  */
1254 static enum stringop_alg stringop_alg = no_stringop;
1256 /* In case the average insn count for single function invocation is
1257 lower than this constant, emit fast (but longer) prologue and
   epilogue code.  */
1259 #define FAST_PROLOGUE_INSN_COUNT 20
1261 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
/* Indexed by hard register number; the *_REGISTER_NAMES initializer macros
   are presumably supplied by i386.h — confirm.  */
1262 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
1263 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
1264 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
1266 /* Array of the smallest class containing reg number REGNO, indexed by
1267 REGNO. Used by REGNO_REG_CLASS in i386.h. */
1269 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
1271 /* ax, dx, cx, bx */
1272 AREG, DREG, CREG, BREG,
1273 /* si, di, bp, sp */
1274 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
1276 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
1277 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
1280 /* flags, fpsr, fpcr, frame */
1281 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
1282 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
1284 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
1286 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
1287 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
1288 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
1292 /* The "default" register map used in 32bit mode. */
1294 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
1296 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
1297 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
1298 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
1299 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
1300 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
1301 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
1302 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
1305 static int const x86_64_int_parameter_registers[6] =
1307 5 /*RDI*/, 4 /*RSI*/, 1 /*RDX*/, 2 /*RCX*/,
1308 FIRST_REX_INT_REG /*R8 */, FIRST_REX_INT_REG + 1 /*R9 */
1311 static int const x86_64_ms_abi_int_parameter_registers[4] =
1313 2 /*RCX*/, 1 /*RDX*/,
1314 FIRST_REX_INT_REG /*R8 */, FIRST_REX_INT_REG + 1 /*R9 */
1317 static int const x86_64_int_return_registers[4] =
1319 0 /*RAX*/, 1 /*RDX*/, 5 /*RDI*/, 4 /*RSI*/
1322 /* The "default" register map used in 64bit mode. */
1323 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
1325 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
1326 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
1327 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
1328 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
1329 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
1330 8,9,10,11,12,13,14,15, /* extended integer registers */
1331 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
1334 /* Define the register numbers to be used in Dwarf debugging information.
1335 The SVR4 reference port C compiler uses the following register numbers
1336 in its Dwarf output code:
1337 0 for %eax (gcc regno = 0)
1338 1 for %ecx (gcc regno = 2)
1339 2 for %edx (gcc regno = 1)
1340 3 for %ebx (gcc regno = 3)
1341 4 for %esp (gcc regno = 7)
1342 5 for %ebp (gcc regno = 6)
1343 6 for %esi (gcc regno = 4)
1344 7 for %edi (gcc regno = 5)
1345 The following three DWARF register numbers are never generated by
1346 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
1347 believes these numbers have these meanings.
1348 8 for %eip (no gcc equivalent)
1349 9 for %eflags (gcc regno = 17)
1350 10 for %trapno (no gcc equivalent)
1351 It is not at all clear how we should number the FP stack registers
1352 for the x86 architecture. If the version of SDB on x86/svr4 were
1353 a bit less brain dead with respect to floating-point then we would
1354 have a precedent to follow with respect to DWARF register numbers
1355 for x86 FP registers, but the SDB on x86/svr4 is so completely
1356 broken with respect to FP registers that it is hardly worth thinking
1357 of it as something to strive for compatibility with.
1358 The version of x86/svr4 SDB I have at the moment does (partially)
1359 seem to believe that DWARF register number 11 is associated with
1360 the x86 register %st(0), but that's about all. Higher DWARF
1361 register numbers don't seem to be associated with anything in
1362 particular, and even for DWARF regno 11, SDB only seems to under-
1363 stand that it should say that a variable lives in %st(0) (when
1364 asked via an `=' command) if we said it was in DWARF regno 11,
1365 but SDB still prints garbage when asked for the value of the
1366 variable in question (via a `/' command).
1367 (Also note that the labels SDB prints for various FP stack regs
1368 when doing an `x' command are all wrong.)
1369 Note that these problems generally don't affect the native SVR4
1370 C compiler because it doesn't allow the use of -O with -g and
1371 because when it is *not* optimizing, it allocates a memory
1372 location for each floating-point variable, and the memory
1373 location is what gets described in the DWARF AT_location
1374 attribute for the variable in question.
1375 Regardless of the severe mental illness of the x86/svr4 SDB, we
1376 do something sensible here and we use the following DWARF
1377 register numbers. Note that these are all stack-top-relative
1379 11 for %st(0) (gcc regno = 8)
1380 12 for %st(1) (gcc regno = 9)
1381 13 for %st(2) (gcc regno = 10)
1382 14 for %st(3) (gcc regno = 11)
1383 15 for %st(4) (gcc regno = 12)
1384 16 for %st(5) (gcc regno = 13)
1385 17 for %st(6) (gcc regno = 14)
1386 18 for %st(7) (gcc regno = 15)
*/
1388 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
1390 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
1391 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
1392 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
1393 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
1394 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
1395 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
1396 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
1399 /* Test and compare insns in i386.md store the information needed to
1400 generate branch and scc insns here. */
/* The two operands of the pending comparison.  */
1402 rtx ix86_compare_op0 = NULL_RTX;
1403 rtx ix86_compare_op1 = NULL_RTX;
/* NOTE(review): presumably non-NULL when the compare has already been
   emitted, holding its flags result — confirm against i386.md users.  */
1404 rtx ix86_compare_emitted = NULL_RTX;
1406 /* Size of the register save area. */
/* One word per integer argument register plus a 16-byte slot for each SSE
   argument register.  */
1407 #define X86_64_VARARGS_SIZE (REGPARM_MAX * UNITS_PER_WORD + SSE_REGPARM_MAX * 16)
1409 /* Define the structure for the machine field in struct function. */
1411 struct stack_local_entry GTY(())
1413 unsigned short mode;
1416 struct stack_local_entry *next;
1419 /* Structure describing stack frame layout.
1420 Stack grows downward:
1426 saved frame pointer if frame_pointer_needed
1427 <- HARD_FRAME_POINTER
1432 [va_arg registers] (
1433 > to_allocate <- FRAME_POINTER
   ...  */
1443 HOST_WIDE_INT frame;
1445 int outgoing_arguments_size;
1448 HOST_WIDE_INT to_allocate;
1449 /* The offsets relative to ARG_POINTER. */
1450 HOST_WIDE_INT frame_pointer_offset;
1451 HOST_WIDE_INT hard_frame_pointer_offset;
1452 HOST_WIDE_INT stack_pointer_offset;
1454 /* When save_regs_using_mov is set, emit prologue using
1455 move instead of push instructions. */
1456 bool save_regs_using_mov;
1459 /* Code model option. */
1460 enum cmodel ix86_cmodel;
/* Assembler dialect to emit: AT&T syntax by default.  */
1462 enum asm_dialect ix86_asm_dialect = ASM_ATT;
/* TLS access dialect: GNU by default.  */
1464 enum tls_dialect ix86_tls_dialect = TLS_DIALECT_GNU;
1466 /* Which unit we are generating floating point math for. */
1467 enum fpmath_unit ix86_fpmath;
1469 /* Which cpu are we scheduling for. */
1470 enum processor_type ix86_tune;
1472 /* Which instruction set architecture to use. */
1473 enum processor_type ix86_arch;
1475 /* true if sse prefetch instruction is not NOOP. */
1476 int x86_prefetch_sse;
1478 /* ix86_regparm_string as a number */
1479 static int ix86_regparm;
1481 /* -mstackrealign option */
1482 extern int ix86_force_align_arg_pointer;
1483 static const char ix86_force_align_arg_pointer_string[] = "force_align_arg_pointer";
1485 /* Preferred alignment for stack boundary in bits. */
1486 unsigned int ix86_preferred_stack_boundary;
1488 /* Values 1-5: see jump.c */
1489 int ix86_branch_cost;
1491 /* Variables which are this size or smaller are put in the data/bss
1492 or ldata/lbss sections. */
1494 int ix86_section_threshold = 65536;
1496 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
1497 char internal_label_prefix[16];
1498 int internal_label_prefix_len;
1500 /* Register class used for passing given 64bit part of the argument.
1501 These represent classes as documented by the PS ABI, with the exception
1502 of SSESF, SSEDF classes, that are basically SSE class, just gcc will
1503 use SF or DFmode move instead of DImode to avoid reformatting penalties.
1505 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
1506 whenever possible (upper half does contain padding). */
1507 enum x86_64_reg_class
1510 X86_64_INTEGER_CLASS,
1511 X86_64_INTEGERSI_CLASS,
1518 X86_64_COMPLEX_X87_CLASS,
1521 static const char * const x86_64_reg_class_name[] =
1523 "no", "integer", "integerSI", "sse", "sseSF", "sseDF",
1524 "sseup", "x87", "x87up", "cplx87", "no"
/* NOTE(review): presumably the maximum number of 64-bit register classes an
   argument can be split into by the x86-64 ABI classifier — confirm against
   the classification code.  */
1527 #define MAX_CLASSES 4
1529 /* Table of constants used by fldpi, fldln2, etc.... */
1530 static REAL_VALUE_TYPE ext_80387_constants_table [5];
/* Presumably set nonzero once the table above has been lazily filled in —
   confirm against its initializer.  */
1531 static bool ext_80387_constants_init = 0;
1534 static struct machine_function * ix86_init_machine_status (void);
1535 static rtx ix86_function_value (tree, tree, bool);
1536 static int ix86_function_regparm (tree, tree);
1537 static void ix86_compute_frame_layout (struct ix86_frame *);
1538 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
1542 /* The svr4 ABI for the i386 says that records and unions are returned
   in memory.  */
1544 #ifndef DEFAULT_PCC_STRUCT_RETURN
1545 #define DEFAULT_PCC_STRUCT_RETURN 1
1548 /* Implement TARGET_HANDLE_OPTION. */
1551 ix86_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, int value)
1558 target_flags &= ~MASK_3DNOW_A;
1559 target_flags_explicit |= MASK_3DNOW_A;
1566 target_flags &= ~(MASK_3DNOW | MASK_3DNOW_A);
1567 target_flags_explicit |= MASK_3DNOW | MASK_3DNOW_A;
1574 target_flags &= ~(MASK_SSE2 | MASK_SSE3 | MASK_SSSE3
1576 target_flags_explicit |= (MASK_SSE2 | MASK_SSE3 | MASK_SSSE3
1584 target_flags &= ~(MASK_SSE3 | MASK_SSSE3 | MASK_SSE4A);
1585 target_flags_explicit |= MASK_SSE3 | MASK_SSSE3 | MASK_SSE4A;
1592 target_flags &= ~(MASK_SSSE3 | MASK_SSE4A);
1593 target_flags_explicit |= MASK_SSSE3 | MASK_SSE4A;
1600 target_flags &= ~MASK_SSE4A;
1601 target_flags_explicit |= MASK_SSE4A;
1610 /* Sometimes certain combinations of command options do not make
1611 sense on a particular target machine. You can define a macro
1612 `OVERRIDE_OPTIONS' to take account of this. This macro, if
1613 defined, is executed once just after all the command options have
   been processed.
1616 Don't use this macro to turn on various extra optimizations for
1617 `-O'. That is what `OPTIMIZATION_OPTIONS' is for. */
1620 override_options (void)
1623 int ix86_tune_defaulted = 0;
1624 unsigned int ix86_arch_mask, ix86_tune_mask;
1626 /* Comes from final.c -- no real reason to change it. */
1627 #define MAX_CODE_ALIGN 16
1631 const struct processor_costs *cost; /* Processor costs */
1632 const int target_enable; /* Target flags to enable. */
1633 const int target_disable; /* Target flags to disable. */
1634 const int align_loop; /* Default alignments. */
1635 const int align_loop_max_skip;
1636 const int align_jump;
1637 const int align_jump_max_skip;
1638 const int align_func;
1640 const processor_target_table[PROCESSOR_max] =
1642 {&i386_cost, 0, 0, 4, 3, 4, 3, 4},
1643 {&i486_cost, 0, 0, 16, 15, 16, 15, 16},
1644 {&pentium_cost, 0, 0, 16, 7, 16, 7, 16},
1645 {&pentiumpro_cost, 0, 0, 16, 15, 16, 7, 16},
1646 {&geode_cost, 0, 0, 0, 0, 0, 0, 0},
1647 {&k6_cost, 0, 0, 32, 7, 32, 7, 32},
1648 {&athlon_cost, 0, 0, 16, 7, 16, 7, 16},
1649 {&pentium4_cost, 0, 0, 0, 0, 0, 0, 0},
1650 {&k8_cost, 0, 0, 16, 7, 16, 7, 16},
1651 {&nocona_cost, 0, 0, 0, 0, 0, 0, 0},
1652 {&core2_cost, 0, 0, 16, 7, 16, 7, 16},
1653 {&generic32_cost, 0, 0, 16, 7, 16, 7, 16},
1654 {&generic64_cost, 0, 0, 16, 7, 16, 7, 16},
1655 {&amdfam10_cost, 0, 0, 32, 7, 32, 7, 32}
1658 static const char * const cpu_names[] = TARGET_CPU_DEFAULT_NAMES;
1661 const char *const name; /* processor name or nickname. */
1662 const enum processor_type processor;
1663 const enum pta_flags
1669 PTA_PREFETCH_SSE = 1 << 4,
1671 PTA_3DNOW_A = 1 << 6,
1675 PTA_POPCNT = 1 << 10,
1677 PTA_SSE4A = 1 << 12,
1678 PTA_NO_SAHF = 1 << 13
1681 const processor_alias_table[] =
1683 {"i386", PROCESSOR_I386, 0},
1684 {"i486", PROCESSOR_I486, 0},
1685 {"i586", PROCESSOR_PENTIUM, 0},
1686 {"pentium", PROCESSOR_PENTIUM, 0},
1687 {"pentium-mmx", PROCESSOR_PENTIUM, PTA_MMX},
1688 {"winchip-c6", PROCESSOR_I486, PTA_MMX},
1689 {"winchip2", PROCESSOR_I486, PTA_MMX | PTA_3DNOW},
1690 {"c3", PROCESSOR_I486, PTA_MMX | PTA_3DNOW},
1691 {"c3-2", PROCESSOR_PENTIUMPRO, PTA_MMX | PTA_PREFETCH_SSE | PTA_SSE},
1692 {"i686", PROCESSOR_PENTIUMPRO, 0},
1693 {"pentiumpro", PROCESSOR_PENTIUMPRO, 0},
1694 {"pentium2", PROCESSOR_PENTIUMPRO, PTA_MMX},
1695 {"pentium3", PROCESSOR_PENTIUMPRO, PTA_MMX | PTA_SSE | PTA_PREFETCH_SSE},
1696 {"pentium3m", PROCESSOR_PENTIUMPRO, PTA_MMX | PTA_SSE | PTA_PREFETCH_SSE},
1697 {"pentium-m", PROCESSOR_PENTIUMPRO, PTA_MMX | PTA_SSE | PTA_PREFETCH_SSE | PTA_SSE2},
1698 {"pentium4", PROCESSOR_PENTIUM4, PTA_SSE | PTA_SSE2
1699 | PTA_MMX | PTA_PREFETCH_SSE},
1700 {"pentium4m", PROCESSOR_PENTIUM4, PTA_SSE | PTA_SSE2
1701 | PTA_MMX | PTA_PREFETCH_SSE},
1702 {"prescott", PROCESSOR_NOCONA, PTA_SSE | PTA_SSE2 | PTA_SSE3
1703 | PTA_MMX | PTA_PREFETCH_SSE},
1704 {"nocona", PROCESSOR_NOCONA, PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_64BIT
1705 | PTA_MMX | PTA_PREFETCH_SSE
1706 | PTA_CX16 | PTA_NO_SAHF},
1707 {"core2", PROCESSOR_CORE2, PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3
1708 | PTA_64BIT | PTA_MMX
1709 | PTA_PREFETCH_SSE | PTA_CX16},
1710 {"geode", PROCESSOR_GEODE, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW
1712 {"k6", PROCESSOR_K6, PTA_MMX},
1713 {"k6-2", PROCESSOR_K6, PTA_MMX | PTA_3DNOW},
1714 {"k6-3", PROCESSOR_K6, PTA_MMX | PTA_3DNOW},
1715 {"athlon", PROCESSOR_ATHLON, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW
1717 {"athlon-tbird", PROCESSOR_ATHLON, PTA_MMX | PTA_PREFETCH_SSE
1718 | PTA_3DNOW | PTA_3DNOW_A},
1719 {"athlon-4", PROCESSOR_ATHLON, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW
1720 | PTA_3DNOW_A | PTA_SSE},
1721 {"athlon-xp", PROCESSOR_ATHLON, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW
1722 | PTA_3DNOW_A | PTA_SSE},
1723 {"athlon-mp", PROCESSOR_ATHLON, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW
1724 | PTA_3DNOW_A | PTA_SSE},
1725 {"x86-64", PROCESSOR_K8, PTA_MMX | PTA_PREFETCH_SSE | PTA_64BIT
1726 | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},
1727 {"k8", PROCESSOR_K8, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW | PTA_64BIT
1728 | PTA_3DNOW_A | PTA_SSE | PTA_SSE2
1730 {"opteron", PROCESSOR_K8, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW
1731 | PTA_64BIT | PTA_3DNOW_A | PTA_SSE
1732 | PTA_SSE2 | PTA_NO_SAHF},
1733 {"athlon64", PROCESSOR_K8, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW
1734 | PTA_64BIT | PTA_3DNOW_A | PTA_SSE
1735 | PTA_SSE2 | PTA_NO_SAHF},
1736 {"athlon-fx", PROCESSOR_K8, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW
1737 | PTA_64BIT | PTA_3DNOW_A | PTA_SSE
1738 | PTA_SSE2 | PTA_NO_SAHF},
1739 {"amdfam10", PROCESSOR_AMDFAM10, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW
1740 | PTA_64BIT | PTA_3DNOW_A | PTA_SSE
1741 | PTA_SSE2 | PTA_SSE3 | PTA_POPCNT
1742 | PTA_ABM | PTA_SSE4A | PTA_CX16},
1743 {"barcelona", PROCESSOR_AMDFAM10, PTA_MMX | PTA_PREFETCH_SSE | PTA_3DNOW
1744 | PTA_64BIT | PTA_3DNOW_A | PTA_SSE
1745 | PTA_SSE2 | PTA_SSE3 | PTA_POPCNT
1746 | PTA_ABM | PTA_SSE4A | PTA_CX16},
1747 {"generic32", PROCESSOR_GENERIC32, 0 /* flags are only used for -march switch. */ },
1748 {"generic64", PROCESSOR_GENERIC64, PTA_64BIT /* flags are only used for -march switch. */ },
1751 int const pta_size = ARRAY_SIZE (processor_alias_table);
1753 #ifdef SUBTARGET_OVERRIDE_OPTIONS
1754 SUBTARGET_OVERRIDE_OPTIONS;
1757 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
1758 SUBSUBTARGET_OVERRIDE_OPTIONS;
1761 /* -fPIC is the default for x86_64. */
1762 if (TARGET_MACHO && TARGET_64BIT)
1765 /* Set the default values for switches whose default depends on TARGET_64BIT
1766 in case they weren't overwritten by command line options. */
1769 /* Mach-O doesn't support omitting the frame pointer for now. */
1770 if (flag_omit_frame_pointer == 2)
1771 flag_omit_frame_pointer = (TARGET_MACHO ? 0 : 1);
1772 if (flag_asynchronous_unwind_tables == 2)
1773 flag_asynchronous_unwind_tables = 1;
1774 if (flag_pcc_struct_return == 2)
1775 flag_pcc_struct_return = 0;
1779 if (flag_omit_frame_pointer == 2)
1780 flag_omit_frame_pointer = 0;
1781 if (flag_asynchronous_unwind_tables == 2)
1782 flag_asynchronous_unwind_tables = 0;
1783 if (flag_pcc_struct_return == 2)
1784 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
1787 /* Need to check -mtune=generic first. */
1788 if (ix86_tune_string)
1790 if (!strcmp (ix86_tune_string, "generic")
1791 || !strcmp (ix86_tune_string, "i686")
1792 /* As special support for cross compilers we read -mtune=native
1793 as -mtune=generic. With native compilers we won't see the
1794 -mtune=native, as it was changed by the driver. */
1795 || !strcmp (ix86_tune_string, "native"))
1798 ix86_tune_string = "generic64";
1800 ix86_tune_string = "generic32";
1802 else if (!strncmp (ix86_tune_string, "generic", 7))
1803 error ("bad value (%s) for -mtune= switch", ix86_tune_string);
1807 if (ix86_arch_string)
1808 ix86_tune_string = ix86_arch_string;
1809 if (!ix86_tune_string)
1811 ix86_tune_string = cpu_names [TARGET_CPU_DEFAULT];
1812 ix86_tune_defaulted = 1;
1815 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
1816 need to use a sensible tune option. */
1817 if (!strcmp (ix86_tune_string, "generic")
1818 || !strcmp (ix86_tune_string, "x86-64")
1819 || !strcmp (ix86_tune_string, "i686"))
1822 ix86_tune_string = "generic64";
1824 ix86_tune_string = "generic32";
1827 if (ix86_stringop_string)
1829 if (!strcmp (ix86_stringop_string, "rep_byte"))
1830 stringop_alg = rep_prefix_1_byte;
1831 else if (!strcmp (ix86_stringop_string, "libcall"))
1832 stringop_alg = libcall;
1833 else if (!strcmp (ix86_stringop_string, "rep_4byte"))
1834 stringop_alg = rep_prefix_4_byte;
1835 else if (!strcmp (ix86_stringop_string, "rep_8byte"))
1836 stringop_alg = rep_prefix_8_byte;
1837 else if (!strcmp (ix86_stringop_string, "byte_loop"))
1838 stringop_alg = loop_1_byte;
1839 else if (!strcmp (ix86_stringop_string, "loop"))
1840 stringop_alg = loop;
1841 else if (!strcmp (ix86_stringop_string, "unrolled_loop"))
1842 stringop_alg = unrolled_loop;
1844 error ("bad value (%s) for -mstringop-strategy= switch", ix86_stringop_string);
1846 if (!strcmp (ix86_tune_string, "x86-64"))
1847 warning (OPT_Wdeprecated, "-mtune=x86-64 is deprecated. Use -mtune=k8 or "
1848 "-mtune=generic instead as appropriate.");
1850 if (!ix86_arch_string)
1851 ix86_arch_string = TARGET_64BIT ? "x86-64" : "i386";
1852 if (!strcmp (ix86_arch_string, "generic"))
1853 error ("generic CPU can be used only for -mtune= switch");
1854 if (!strncmp (ix86_arch_string, "generic", 7))
1855 error ("bad value (%s) for -march= switch", ix86_arch_string);
1857 if (ix86_cmodel_string != 0)
1859 if (!strcmp (ix86_cmodel_string, "small"))
1860 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
1861 else if (!strcmp (ix86_cmodel_string, "medium"))
1862 ix86_cmodel = flag_pic ? CM_MEDIUM_PIC : CM_MEDIUM;
1863 else if (!strcmp (ix86_cmodel_string, "large"))
1864 ix86_cmodel = flag_pic ? CM_LARGE_PIC : CM_LARGE;
1866 error ("code model %s does not support PIC mode", ix86_cmodel_string);
1867 else if (!strcmp (ix86_cmodel_string, "32"))
1868 ix86_cmodel = CM_32;
1869 else if (!strcmp (ix86_cmodel_string, "kernel") && !flag_pic)
1870 ix86_cmodel = CM_KERNEL;
1872 error ("bad value (%s) for -mcmodel= switch", ix86_cmodel_string);
1876 /* For TARGET_64BIT_MS_ABI, force pic on, in order to enable the
1877 use of rip-relative addressing. This eliminates fixups that
1878 would otherwise be needed if this object is to be placed in a
1879 DLL, and is essentially just as efficient as direct addressing. */
1880 if (TARGET_64BIT_MS_ABI)
1881 ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
1882 else if (TARGET_64BIT)
1883 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
1885 ix86_cmodel = CM_32;
1887 if (ix86_asm_string != 0)
1890 && !strcmp (ix86_asm_string, "intel"))
1891 ix86_asm_dialect = ASM_INTEL;
1892 else if (!strcmp (ix86_asm_string, "att"))
1893 ix86_asm_dialect = ASM_ATT;
1895 error ("bad value (%s) for -masm= switch", ix86_asm_string);
1897 if ((TARGET_64BIT == 0) != (ix86_cmodel == CM_32))
1898 error ("code model %qs not supported in the %s bit mode",
1899 ix86_cmodel_string, TARGET_64BIT ? "64" : "32");
1900 if ((TARGET_64BIT != 0) != ((target_flags & MASK_64BIT) != 0))
1901 sorry ("%i-bit mode not compiled in",
1902 (target_flags & MASK_64BIT) ? 64 : 32);
1904 for (i = 0; i < pta_size; i++)
1905 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
1907 ix86_arch = processor_alias_table[i].processor;
1908 /* Default cpu tuning to the architecture. */
1909 ix86_tune = ix86_arch;
1910 if (processor_alias_table[i].flags & PTA_MMX
1911 && !(target_flags_explicit & MASK_MMX))
1912 target_flags |= MASK_MMX;
1913 if (processor_alias_table[i].flags & PTA_3DNOW
1914 && !(target_flags_explicit & MASK_3DNOW))
1915 target_flags |= MASK_3DNOW;
1916 if (processor_alias_table[i].flags & PTA_3DNOW_A
1917 && !(target_flags_explicit & MASK_3DNOW_A))
1918 target_flags |= MASK_3DNOW_A;
1919 if (processor_alias_table[i].flags & PTA_SSE
1920 && !(target_flags_explicit & MASK_SSE))
1921 target_flags |= MASK_SSE;
1922 if (processor_alias_table[i].flags & PTA_SSE2
1923 && !(target_flags_explicit & MASK_SSE2))
1924 target_flags |= MASK_SSE2;
1925 if (processor_alias_table[i].flags & PTA_SSE3
1926 && !(target_flags_explicit & MASK_SSE3))
1927 target_flags |= MASK_SSE3;
1928 if (processor_alias_table[i].flags & PTA_SSSE3
1929 && !(target_flags_explicit & MASK_SSSE3))
1930 target_flags |= MASK_SSSE3;
1931 if (processor_alias_table[i].flags & PTA_PREFETCH_SSE)
1932 x86_prefetch_sse = true;
1933 if (processor_alias_table[i].flags & PTA_CX16)
1934 x86_cmpxchg16b = true;
1935 if (processor_alias_table[i].flags & PTA_POPCNT
1936 && !(target_flags_explicit & MASK_POPCNT))
1937 target_flags |= MASK_POPCNT;
1938 if (processor_alias_table[i].flags & PTA_ABM
1939 && !(target_flags_explicit & MASK_ABM))
1940 target_flags |= MASK_ABM;
1941 if (processor_alias_table[i].flags & PTA_SSE4A
1942 && !(target_flags_explicit & MASK_SSE4A))
1943 target_flags |= MASK_SSE4A;
1944 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF)))
1946 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
1947 error ("CPU you selected does not support x86-64 "
1953 error ("bad value (%s) for -march= switch", ix86_arch_string);
1955 ix86_arch_mask = 1u << ix86_arch;
1956 for (i = 0; i < X86_ARCH_LAST; ++i)
1957 ix86_arch_features[i] &= ix86_arch_mask;
1959 for (i = 0; i < pta_size; i++)
1960 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
1962 ix86_tune = processor_alias_table[i].processor;
1963 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
1965 if (ix86_tune_defaulted)
1967 ix86_tune_string = "x86-64";
1968 for (i = 0; i < pta_size; i++)
1969 if (! strcmp (ix86_tune_string,
1970 processor_alias_table[i].name))
1972 ix86_tune = processor_alias_table[i].processor;
1975 error ("CPU you selected does not support x86-64 "
1978 /* Intel CPUs have always interpreted SSE prefetch instructions as
1979 NOPs; so, we can enable SSE prefetch instructions even when
1980 -mtune (rather than -march) points us to a processor that has them.
1981 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
1982 higher processors. */
1983 if (TARGET_CMOVE && (processor_alias_table[i].flags & PTA_PREFETCH_SSE))
1984 x86_prefetch_sse = true;
1988 error ("bad value (%s) for -mtune= switch", ix86_tune_string);
1990 ix86_tune_mask = 1u << ix86_tune;
1991 for (i = 0; i < X86_TUNE_LAST; ++i)
1992 ix86_tune_features[i] &= ix86_tune_mask;
1995 ix86_cost = &size_cost;
1997 ix86_cost = processor_target_table[ix86_tune].cost;
1998 target_flags |= processor_target_table[ix86_tune].target_enable;
1999 target_flags &= ~processor_target_table[ix86_tune].target_disable;
2001 /* Arrange to set up i386_stack_locals for all functions. */
2002 init_machine_status = ix86_init_machine_status;
2004 /* Validate -mregparm= value. */
2005 if (ix86_regparm_string)
2008 warning (0, "-mregparm is ignored in 64-bit mode");
2009 i = atoi (ix86_regparm_string);
2010 if (i < 0 || i > REGPARM_MAX)
2011 error ("-mregparm=%d is not between 0 and %d", i, REGPARM_MAX);
2016 ix86_regparm = REGPARM_MAX;
2018 /* If the user has provided any of the -malign-* options,
2019 warn and use that value only if -falign-* is not set.
2020 Remove this code in GCC 3.2 or later. */
2021 if (ix86_align_loops_string)
2023 warning (0, "-malign-loops is obsolete, use -falign-loops");
2024 if (align_loops == 0)
2026 i = atoi (ix86_align_loops_string);
2027 if (i < 0 || i > MAX_CODE_ALIGN)
2028 error ("-malign-loops=%d is not between 0 and %d", i, MAX_CODE_ALIGN);
2030 align_loops = 1 << i;
2034 if (ix86_align_jumps_string)
2036 warning (0, "-malign-jumps is obsolete, use -falign-jumps");
2037 if (align_jumps == 0)
2039 i = atoi (ix86_align_jumps_string);
2040 if (i < 0 || i > MAX_CODE_ALIGN)
2041 error ("-malign-loops=%d is not between 0 and %d", i, MAX_CODE_ALIGN);
2043 align_jumps = 1 << i;
2047 if (ix86_align_funcs_string)
2049 warning (0, "-malign-functions is obsolete, use -falign-functions");
2050 if (align_functions == 0)
2052 i = atoi (ix86_align_funcs_string);
2053 if (i < 0 || i > MAX_CODE_ALIGN)
2054 error ("-malign-loops=%d is not between 0 and %d", i, MAX_CODE_ALIGN);
2056 align_functions = 1 << i;
2060 /* Default align_* from the processor table. */
2061 if (align_loops == 0)
2063 align_loops = processor_target_table[ix86_tune].align_loop;
2064 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
2066 if (align_jumps == 0)
2068 align_jumps = processor_target_table[ix86_tune].align_jump;
2069 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
2071 if (align_functions == 0)
2073 align_functions = processor_target_table[ix86_tune].align_func;
2076 /* Validate -mbranch-cost= value, or provide default. */
2077 ix86_branch_cost = ix86_cost->branch_cost;
2078 if (ix86_branch_cost_string)
2080 i = atoi (ix86_branch_cost_string);
2082 error ("-mbranch-cost=%d is not between 0 and 5", i);
2084 ix86_branch_cost = i;
2086 if (ix86_section_threshold_string)
2088 i = atoi (ix86_section_threshold_string);
2090 error ("-mlarge-data-threshold=%d is negative", i);
2092 ix86_section_threshold = i;
2095 if (ix86_tls_dialect_string)
2097 if (strcmp (ix86_tls_dialect_string, "gnu") == 0)
2098 ix86_tls_dialect = TLS_DIALECT_GNU;
2099 else if (strcmp (ix86_tls_dialect_string, "gnu2") == 0)
2100 ix86_tls_dialect = TLS_DIALECT_GNU2;
2101 else if (strcmp (ix86_tls_dialect_string, "sun") == 0)
2102 ix86_tls_dialect = TLS_DIALECT_SUN;
2104 error ("bad value (%s) for -mtls-dialect= switch",
2105 ix86_tls_dialect_string);
2108 if (ix87_precision_string)
2110 i = atoi (ix87_precision_string);
2111 if (i != 32 && i != 64 && i != 80)
2112 error ("pc%d is not valid precision setting (32, 64 or 80)", i);
2115 /* Keep nonleaf frame pointers. */
2116 if (flag_omit_frame_pointer)
2117 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
2118 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
2119 flag_omit_frame_pointer = 1;
2121 /* If we're doing fast math, we don't care about comparison order
2122 wrt NaNs. This lets us use a shorter comparison sequence. */
2123 if (flag_finite_math_only)
2124 target_flags &= ~MASK_IEEE_FP;
2126 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
2127 since the insns won't need emulation. */
2128 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
2129 target_flags &= ~MASK_NO_FANCY_MATH_387;
2131 /* Likewise, if the target doesn't have a 387, or we've specified
2132 software floating point, don't use 387 inline intrinsics. */
2134 target_flags |= MASK_NO_FANCY_MATH_387;
2136 /* Turn on SSE3 builtins for -mssse3. */
2138 target_flags |= MASK_SSE3;
2140 /* Turn on SSE3 builtins for -msse4a. */
2142 target_flags |= MASK_SSE3;
2144 /* Turn on SSE2 builtins for -msse3. */
2146 target_flags |= MASK_SSE2;
2148 /* Turn on SSE builtins for -msse2. */
2150 target_flags |= MASK_SSE;
2152 /* Turn on MMX builtins for -msse. */
2155 target_flags |= MASK_MMX & ~target_flags_explicit;
2156 x86_prefetch_sse = true;
2159 /* Turn on MMX builtins for 3Dnow. */
2161 target_flags |= MASK_MMX;
2163 /* Turn on POPCNT builtins for -mabm. */
2165 target_flags |= MASK_POPCNT;
2170 warning (0, "-mrtd is ignored in 64bit mode");
2172 /* Enable by default the SSE and MMX builtins. Do allow the user to
2173 explicitly disable any of these. In particular, disabling SSE and
2174 MMX for kernel code is extremely useful. */
2176 |= ((MASK_SSE2 | MASK_SSE | MASK_MMX | TARGET_SUBTARGET64_DEFAULT)
2177 & ~target_flags_explicit);
2181 /* i386 ABI does not specify red zone. It still makes sense to use it
2182 when programmer takes care to stack from being destroyed. */
2183 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
2184 target_flags |= MASK_NO_RED_ZONE;
2187 /* Validate -mpreferred-stack-boundary= value, or provide default.
2188 The default of 128 bits is for Pentium III's SSE __m128. We can't
2189 change it because of optimize_size. Otherwise, we can't mix object
2190 files compiled with -Os and -On. */
2191 ix86_preferred_stack_boundary = 128;
2192 if (ix86_preferred_stack_boundary_string)
2194 i = atoi (ix86_preferred_stack_boundary_string);
2195 if (i < (TARGET_64BIT ? 4 : 2) || i > 12)
2196 error ("-mpreferred-stack-boundary=%d is not between %d and 12", i,
2197 TARGET_64BIT ? 4 : 2);
2199 ix86_preferred_stack_boundary = (1 << i) * BITS_PER_UNIT;
2202 /* Accept -msseregparm only if at least SSE support is enabled. */
2203 if (TARGET_SSEREGPARM
2205 error ("-msseregparm used without SSE enabled");
2207 ix86_fpmath = TARGET_FPMATH_DEFAULT;
2208 if (ix86_fpmath_string != 0)
2210 if (! strcmp (ix86_fpmath_string, "387"))
2211 ix86_fpmath = FPMATH_387;
2212 else if (! strcmp (ix86_fpmath_string, "sse"))
2216 warning (0, "SSE instruction set disabled, using 387 arithmetics");
2217 ix86_fpmath = FPMATH_387;
2220 ix86_fpmath = FPMATH_SSE;
2222 else if (! strcmp (ix86_fpmath_string, "387,sse")
2223 || ! strcmp (ix86_fpmath_string, "sse,387"))
2227 warning (0, "SSE instruction set disabled, using 387 arithmetics");
2228 ix86_fpmath = FPMATH_387;
2230 else if (!TARGET_80387)
2232 warning (0, "387 instruction set disabled, using SSE arithmetics");
2233 ix86_fpmath = FPMATH_SSE;
2236 ix86_fpmath = FPMATH_SSE | FPMATH_387;
2239 error ("bad value (%s) for -mfpmath= switch", ix86_fpmath_string);
2242 /* If the i387 is disabled, then do not return values in it. */
2244 target_flags &= ~MASK_FLOAT_RETURNS;
2246 if ((x86_accumulate_outgoing_args & ix86_tune_mask)
2247 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
2249 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
2251 /* ??? Unwind info is not correct around the CFG unless either a frame
2252 pointer is present or M_A_O_A is set. Fixing this requires rewriting
2253 unwind info generation to be aware of the CFG and propagating states
2255 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
2256 || flag_exceptions || flag_non_call_exceptions)
2257 && flag_omit_frame_pointer
2258 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
2260 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
2261 warning (0, "unwind tables currently require either a frame pointer "
2262 "or -maccumulate-outgoing-args for correctness");
2263 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
2266 /* For sane SSE instruction set generation we need fcomi instruction.
2267 It is safe to enable all CMOVE instructions. */
2271 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
2274 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
2275 p = strchr (internal_label_prefix, 'X');
2276 internal_label_prefix_len = p - internal_label_prefix;
2280 /* When scheduling description is not available, disable scheduler pass
2281 so it won't slow down the compilation and make x87 code slower. */
2282 if (!TARGET_SCHEDULE)
2283 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
2285 if (!PARAM_SET_P (PARAM_SIMULTANEOUS_PREFETCHES))
2286 set_param_value ("simultaneous-prefetches",
2287 ix86_cost->simultaneous_prefetches);
2288 if (!PARAM_SET_P (PARAM_L1_CACHE_LINE_SIZE))
2289 set_param_value ("l1-cache-line-size", ix86_cost->prefetch_block);
2292 /* Return true if this goes in large data/bss. */
/* Only the medium code models (-mcmodel=medium / medium PIC) split data
   into near and "large" (.ldata/.lbss) sections; everything else is
   never large.  A decl is considered large if it was explicitly placed
   in .ldata/.lbss, or if its size is 0 (incomplete) or exceeds
   -mlarge-data-threshold.
   NOTE(review): this listing elides several lines here (the early
   return statements and closing braces are not visible) — confirm
   against the full source before relying on exact control flow.  */
2295 ix86_in_large_data_p (tree exp)
2297 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
2300 /* Functions are never large data. */
2301 if (TREE_CODE (exp) == FUNCTION_DECL)
2304 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
2306 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
/* A user-specified section name decides directly: .ldata/.lbss are the
   large-model sections.  */
2307 if (strcmp (section, ".ldata") == 0
2308 || strcmp (section, ".lbss") == 0)
2314 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
2316 /* If this is an incomplete type with size 0, then we can't put it
2317 in data because it might be too big when completed. */
2318 if (!size || size > ix86_section_threshold)
2325 /* Switch to the appropriate section for output of DECL.
2326 DECL is either a `VAR_DECL' node or a constant of some sort.
2327 RELOC indicates whether forming the initial value of DECL requires
2328 link-time relocations. */
2330 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
/* Implements TARGET_ASM_SELECT_SECTION for x86-64 ELF.  For the medium
   code model, large decls are redirected to the ".l"-prefixed section
   family (.ldata*, .lbss, .lrodata); everything else falls through to
   default_elf_select_section.
   NOTE(review): listing elides the `break;` statements between cases
   and several case labels (e.g. the BSS and RODATA labels) — the
   visible lines are a partial view of the switch.  */
2334 x86_64_elf_select_section (tree decl, int reloc,
2335 unsigned HOST_WIDE_INT align)
2337 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
2338 && ix86_in_large_data_p (decl))
2340 const char *sname = NULL;
2341 unsigned int flags = SECTION_WRITE;
2342 switch (categorize_decl_for_section (decl, reloc))
2347 case SECCAT_DATA_REL:
2348 sname = ".ldata.rel";
2350 case SECCAT_DATA_REL_LOCAL:
2351 sname = ".ldata.rel.local";
2353 case SECCAT_DATA_REL_RO:
2354 sname = ".ldata.rel.ro";
2356 case SECCAT_DATA_REL_RO_LOCAL:
2357 sname = ".ldata.rel.ro.local";
/* BSS-category decls additionally get the SECTION_BSS flag so the
   assembler emits no contents for them.  */
2361 flags |= SECTION_BSS;
2364 case SECCAT_RODATA_MERGE_STR:
2365 case SECCAT_RODATA_MERGE_STR_INIT:
2366 case SECCAT_RODATA_MERGE_CONST:
2370 case SECCAT_SRODATA:
2377 /* We don't split these for medium model. Place them into
2378 default sections and hope for best. */
2383 /* We might get called with string constants, but get_named_section
2384 doesn't like them as they are not DECLs. Also, we need to set
2385 flags in that case. */
2387 return get_section (sname, flags, NULL);
2388 return get_named_section (decl, sname, reloc);
2391 return default_elf_select_section (decl, reloc, align);
2394 /* Build up a unique section name, expressed as a
2395 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
2396 RELOC indicates whether the initial value of EXP requires
2397 link-time relocations. */
2399 static void ATTRIBUTE_UNUSED
/* Implements TARGET_ASM_UNIQUE_SECTION for x86-64 ELF.  Mirrors
   x86_64_elf_select_section: large-model decls get a per-decl section
   named <prefix><decl-name>, where the prefix is the ".l" variant of
   the normal section (or the .gnu.linkonce.l* variant when COMDAT
   groups are unavailable).  Non-large decls use
   default_unique_section.
   NOTE(review): listing elides the `break;`s, several case labels, and
   the local declarations of `plen`, `name`, `nlen`, `string` — partial
   view of the switch and the name-building tail.  */
2400 x86_64_elf_unique_section (tree decl, int reloc)
2402 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
2403 && ix86_in_large_data_p (decl))
2405 const char *prefix = NULL;
2406 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
2407 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
2409 switch (categorize_decl_for_section (decl, reloc))
2412 case SECCAT_DATA_REL:
2413 case SECCAT_DATA_REL_LOCAL:
2414 case SECCAT_DATA_REL_RO:
2415 case SECCAT_DATA_REL_RO_LOCAL:
2416 prefix = one_only ? ".gnu.linkonce.ld." : ".ldata.";
2419 prefix = one_only ? ".gnu.linkonce.lb." : ".lbss.";
2422 case SECCAT_RODATA_MERGE_STR:
2423 case SECCAT_RODATA_MERGE_STR_INIT:
2424 case SECCAT_RODATA_MERGE_CONST:
2425 prefix = one_only ? ".gnu.linkonce.lr." : ".lrodata.";
2427 case SECCAT_SRODATA:
2434 /* We don't split these for medium model. Place them into
2435 default sections and hope for best. */
2443 plen = strlen (prefix);
2445 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
2446 name = targetm.strip_name_encoding (name);
2447 nlen = strlen (name);
/* Concatenate prefix + stripped decl name into a stack buffer, then
   record it as the decl's section name.  */
2449 string = alloca (nlen + plen + 1);
2450 memcpy (string, prefix, plen);
2451 memcpy (string + plen, name, nlen + 1);
2453 DECL_SECTION_NAME (decl) = build_string (nlen + plen, string);
2457 default_unique_section (decl, reloc);
2460 #ifdef COMMON_ASM_OP
2461 /* This says how to output assembler code to declare an
2462 uninitialized external linkage data object.
2464 For medium model x86-64 we need to use .largecomm opcode for
/* Emits either ".largecomm" (medium model, object above the large-data
   threshold) or the target's COMMON_ASM_OP, followed by
   "name,size,alignment-in-bytes".
   NOTE(review): ".largecomm\t" here has no leading tab while
   COMMON_ASM_OP conventionally includes one ("\t.comm\t") — looks like
   an inconsistency in the emitted assembly; confirm against current
   GCC, which emits "\t.largecomm\t".
   NOTE(review): the (unsigned int) cast narrows; `size` is
   unsigned HOST_WIDE_INT and ix86_section_threshold is a signed int —
   presumably fine for realistic thresholds, but worth confirming.  */
2467 x86_elf_aligned_common (FILE *file,
2468 const char *name, unsigned HOST_WIDE_INT size,
2471 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
2472 && size > (unsigned int)ix86_section_threshold)
2473 fprintf (file, ".largecomm\t");
2475 fprintf (file, "%s", COMMON_ASM_OP);
2476 assemble_name (file, name);
2477 fprintf (file, ","HOST_WIDE_INT_PRINT_UNSIGNED",%u\n",
2478 size, align / BITS_PER_UNIT);
2482 /* Utility function for targets to use in implementing
2483 ASM_OUTPUT_ALIGNED_BSS. */
/* Places large-model oversized objects into .lbss instead of .bss,
   aligns, declares the object name (via ASM_DECLARE_OBJECT_NAME when
   available, otherwise a plain label) and reserves `size` bytes
   (minimum 1, so the label is not ambiguous with the next symbol).  */
2486 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
2487 const char *name, unsigned HOST_WIDE_INT size,
2490 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
2491 && size > (unsigned int)ix86_section_threshold)
2492 switch_to_section (get_named_section (decl, ".lbss", 0));
2494 switch_to_section (bss_section);
2495 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
2496 #ifdef ASM_DECLARE_OBJECT_NAME
2497 last_assemble_variable_decl = decl;
2498 ASM_DECLARE_OBJECT_NAME (file, name, decl);
2500 /* Standard thing is just output label for the object. */
2501 ASM_OUTPUT_LABEL (file, name);
2502 #endif /* ASM_DECLARE_OBJECT_NAME */
2503 ASM_OUTPUT_SKIP (file, size ? size : 1);
/* Implements OPTIMIZATION_OPTIONS: per-level defaults applied before
   command-line parsing.  Several flags are set to the sentinel value 2
   ("not set by user"); override_options later resolves them once
   TARGET_64BIT is known.
   NOTE(review): the conditions guarding flag_schedule_insns and
   flag_errno_math (optimization level / Darwin checks) are elided from
   this listing.  */
2507 optimization_options (int level, int size ATTRIBUTE_UNUSED)
2509 /* For -O2 and beyond, turn off -fschedule-insns by default. It tends to
2510 make the problem with not enough registers even worse. */
2511 #ifdef INSN_SCHEDULING
2513 flag_schedule_insns = 0;
2517 /* The Darwin libraries never set errno, so we might as well
2518 avoid calling them when that's the only reason we would. */
2519 flag_errno_math = 0;
2521 /* The default values of these switches depend on the TARGET_64BIT
2522 that is not known at this moment. Mark these values with 2 and
2523 let user the to override these. In case there is no command line option
2524 specifying them, we will set the defaults in override_options. */
2526 flag_omit_frame_pointer = 2;
2527 flag_pcc_struct_return = 2;
2528 flag_asynchronous_unwind_tables = 2;
2529 #ifdef SUBTARGET_OPTIMIZATION_OPTIONS
2530 SUBTARGET_OPTIMIZATION_OPTIONS;
2534 /* Decide whether we can make a sibling call to a function. DECL is the
2535 declaration of the function being targeted by the call and EXP is the
2536 CALL_EXPR representing the call. */
/* Implements TARGET_FUNCTION_OK_FOR_SIBCALL.  Rejects sibcalls when:
   PIC on ia32 would need %ebx live for the PLT; return-value locations
   differ (especially x87 stack registers); an ia32 indirect call has
   no call-clobbered register left because regparm >= 3; dllimport
   (indirect) calls with regparm >= 3; or the caller force-aligned its
   stack.  NOTE(review): the `return false;` / `return true;` lines and
   some locals' declarations are elided from this listing.  */
2539 ix86_function_ok_for_sibcall (tree decl, tree exp)
2544 /* If we are generating position-independent code, we cannot sibcall
2545 optimize any indirect call, or a direct call to a global function,
2546 as the PLT requires %ebx be live. */
2547 if (!TARGET_64BIT && flag_pic && (!decl || !targetm.binds_local_p (decl)))
2554 func = TREE_TYPE (CALL_EXPR_FN (exp));
2555 if (POINTER_TYPE_P (func))
2556 func = TREE_TYPE (func);
2559 /* Check that the return value locations are the same. Like
2560 if we are returning floats on the 80387 register stack, we cannot
2561 make a sibcall from a function that doesn't return a float to a
2562 function that does or, conversely, from a function that does return
2563 a float to a function that doesn't; the necessary stack adjustment
2564 would not be executed. This is also the place we notice
2565 differences in the return value ABI. Note that it is ok for one
2566 of the functions to have void return type as long as the return
2567 value of the other is passed in a register. */
2568 a = ix86_function_value (TREE_TYPE (exp), func, false);
2569 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
/* x87 stack registers need exact agreement; otherwise a void-returning
   caller is still acceptable.  */
2571 if (STACK_REG_P (a) || STACK_REG_P (b))
2573 if (!rtx_equal_p (a, b))
2576 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
2578 else if (!rtx_equal_p (a, b))
2581 /* If this call is indirect, we'll need to be able to use a call-clobbered
2582 register for the address of the target function. Make sure that all
2583 such registers are not used for passing parameters. */
2584 if (!decl && !TARGET_64BIT)
2588 /* We're looking at the CALL_EXPR, we need the type of the function. */
2589 type = CALL_EXPR_FN (exp); /* pointer expression */
2590 type = TREE_TYPE (type); /* pointer type */
2591 type = TREE_TYPE (type); /* function type */
2593 if (ix86_function_regparm (type, NULL) >= 3)
2595 /* ??? Need to count the actual number of registers to be used,
2596 not the possible number of registers. Fix later. */
2601 /* Dllimport'd functions are also called indirectly. */
2602 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
2603 && decl && DECL_DLLIMPORT_P (decl)
2604 && ix86_function_regparm (TREE_TYPE (decl), NULL) >= 3)
2607 /* If we forced aligned the stack, then sibcalling would unalign the
2608 stack, which may break the called function. */
2609 if (cfun->machine->force_align_arg_pointer)
2612 /* Otherwise okay. That also includes certain types of indirect calls. */
2616 /* Handle "cdecl", "stdcall", "fastcall", "regparm" and "sseregparm"
2617 calling convention attributes;
2618 arguments as in struct attribute_spec.handler. */
/* Attribute handler shared by all the ia32 calling-convention
   attributes.  Validates the attribute target, the regparm argument
   (integer constant, 0..REGPARM_MAX), and mutual-exclusion rules
   between conventions; sets *no_add_attrs to drop invalid uses.
   NOTE(review): several lines (returns, closing braces, the 64-bit
   branch header) are elided from this listing.  */
2621 ix86_handle_cconv_attribute (tree *node, tree name,
2623 int flags ATTRIBUTE_UNUSED,
2626 if (TREE_CODE (*node) != FUNCTION_TYPE
2627 && TREE_CODE (*node) != METHOD_TYPE
2628 && TREE_CODE (*node) != FIELD_DECL
2629 && TREE_CODE (*node) != TYPE_DECL)
2631 warning (OPT_Wattributes, "%qs attribute only applies to functions",
2632 IDENTIFIER_POINTER (name));
2633 *no_add_attrs = true;
2637 /* Can combine regparm with all attributes but fastcall. */
2638 if (is_attribute_p ("regparm", name))
2642 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
2644 error ("fastcall and regparm attributes are not compatible");
2647 cst = TREE_VALUE (args);
2648 if (TREE_CODE (cst) != INTEGER_CST)
2650 warning (OPT_Wattributes,
2651 "%qs attribute requires an integer constant argument",
2652 IDENTIFIER_POINTER (name));
2653 *no_add_attrs = true;
2655 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
2657 warning (OPT_Wattributes, "argument to %qs attribute larger than %d",
2658 IDENTIFIER_POINTER (name), REGPARM_MAX);
2659 *no_add_attrs = true;
/* Self-realigning functions clobber %ecx in the prologue, so they can
   only pass REGPARM_MAX-1 arguments in registers.
   NOTE(review): compare_tree_int(...) is nonzero whenever cst !=
   REGPARM_MAX-1 — presumably the intent is "> REGPARM_MAX-1"; the
   elided guard above may change the effective condition. Confirm.  */
2663 && lookup_attribute (ix86_force_align_arg_pointer_string,
2664 TYPE_ATTRIBUTES (*node))
2665 && compare_tree_int (cst, REGPARM_MAX-1))
2667 error ("%s functions limited to %d register parameters",
2668 ix86_force_align_arg_pointer_string, REGPARM_MAX-1);
2676 /* Do not warn when emulating the MS ABI. */
2677 if (!TARGET_64BIT_MS_ABI)
2678 warning (OPT_Wattributes, "%qs attribute ignored",
2679 IDENTIFIER_POINTER (name));
2680 *no_add_attrs = true;
2684 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
2685 if (is_attribute_p ("fastcall", name))
2687 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
2689 error ("fastcall and cdecl attributes are not compatible");
2691 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
2693 error ("fastcall and stdcall attributes are not compatible");
2695 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
2697 error ("fastcall and regparm attributes are not compatible");
2701 /* Can combine stdcall with fastcall (redundant), regparm and
2703 else if (is_attribute_p ("stdcall", name))
2705 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
2707 error ("stdcall and cdecl attributes are not compatible");
2709 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
2711 error ("stdcall and fastcall attributes are not compatible");
2715 /* Can combine cdecl with regparm and sseregparm. */
2716 else if (is_attribute_p ("cdecl", name))
2718 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
2720 error ("stdcall and cdecl attributes are not compatible");
2722 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
2724 error ("fastcall and cdecl attributes are not compatible");
2728 /* Can combine sseregparm with all attributes. */
2733 /* Return 0 if the attributes for two types are incompatible, 1 if they
2734 are compatible, and 2 if they are nearly compatible (which causes a
2735 warning to be generated). */
/* Implements TARGET_COMP_TYPE_ATTRIBUTES for function types: two types
   are incompatible (0) when they disagree on fastcall, regparm count,
   sseregparm, or the caller-pops convention (cdecl vs stdcall, whose
   meaning flips under -mrtd).  NOTE(review): the `return 0;`/`return
   1;` lines are elided from this listing.  */
2738 ix86_comp_type_attributes (tree type1, tree type2)
2740 /* Check for mismatch of non-default calling convention. */
2741 const char *const rtdstr = TARGET_RTD ? "cdecl" : "stdcall";
2743 if (TREE_CODE (type1) != FUNCTION_TYPE)
2746 /* Check for mismatched fastcall/regparm types. */
2747 if ((!lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type1))
2748 != !lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type2)))
2749 || (ix86_function_regparm (type1, NULL)
2750 != ix86_function_regparm (type2, NULL)))
2753 /* Check for mismatched sseregparm types. */
2754 if (!lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type1))
2755 != !lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type2)))
2758 /* Check for mismatched return types (cdecl vs stdcall). */
2759 if (!lookup_attribute (rtdstr, TYPE_ATTRIBUTES (type1))
2760 != !lookup_attribute (rtdstr, TYPE_ATTRIBUTES (type2)))
2766 /* Return the regparm value for a function with the indicated TYPE and DECL.
2767 DECL may be NULL when calling function indirectly
2768 or considering a libcall. */
/* Resolution order: explicit regparm attribute > fastcall (2) > the
   local-function heuristic (unit-at-a-time, non-profiled, local per
   cgraph) which bumps regparm up to the number of free argument
   registers, minus penalties for global register variables, nested
   functions (static chain in the 3rd register) and stack-realigning
   callees (prologue clobbers %ecx).
   NOTE(review): a few lines (the local_regparm `break`, the i->local
   guard, the final return) are elided from this listing.  */
2771 ix86_function_regparm (tree type, tree decl)
2774 int regparm = ix86_regparm;
2779 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
2781 return TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
2783 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type)))
2786 /* Use register calling convention for local functions when possible. */
2787 if (decl && flag_unit_at_a_time && !profile_flag)
2789 struct cgraph_local_info *i = cgraph_local_info (decl);
2792 int local_regparm, globals = 0, regno;
2795 /* Make sure no regparm register is taken by a
2796 global register variable. */
2797 for (local_regparm = 0; local_regparm < 3; local_regparm++)
2798 if (global_regs[local_regparm])
2801 /* We can't use regparm(3) for nested functions as these use
2802 static chain pointer in third argument. */
2803 if (local_regparm == 3
2804 && decl_function_context (decl)
2805 && !DECL_NO_STATIC_CHAIN (decl))
2808 /* If the function realigns its stackpointer, the prologue will
2809 clobber %ecx. If we've already generated code for the callee,
2810 the callee DECL_STRUCT_FUNCTION is gone, so we fall back to
2811 scanning the attributes for the self-realigning property. */
2812 f = DECL_STRUCT_FUNCTION (decl);
2813 if (local_regparm == 3
2814 && (f ? !!f->machine->force_align_arg_pointer
2815 : !!lookup_attribute (ix86_force_align_arg_pointer_string,
2816 TYPE_ATTRIBUTES (TREE_TYPE (decl)))))
2819 /* Each global register variable increases register preassure,
2820 so the more global reg vars there are, the smaller regparm
2821 optimization use, unless requested by the user explicitly. */
2822 for (regno = 0; regno < 6; regno++)
2823 if (global_regs[regno])
2826 = globals < local_regparm ? local_regparm - globals : 0;
2828 if (local_regparm > regparm)
2829 regparm = local_regparm;
2836 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
2837 DFmode (2) arguments in SSE registers for a function with the
2838 indicated TYPE and DECL. DECL may be NULL when calling function
2839 indirectly or considering a libcall. Otherwise return 0. */
/* ia32-only helper (asserted): the explicit sseregparm attribute (or
   -msseregparm) wins, but is an error without SSE/SSE2; otherwise the
   local-function heuristic allows SSE argument passing when compiling
   with -mfpmath=sse.  NOTE(review): the return statements of the
   attribute branch and the final `return 0;` are elided from this
   listing.  */
2842 ix86_function_sseregparm (tree type, tree decl)
2844 gcc_assert (!TARGET_64BIT);
2846 /* Use SSE registers to pass SFmode and DFmode arguments if requested
2847 by the sseregparm attribute. */
2848 if (TARGET_SSEREGPARM
2849 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
2854 error ("Calling %qD with attribute sseregparm without "
2855 "SSE/SSE2 enabled", decl);
2857 error ("Calling %qT with attribute sseregparm without "
2858 "SSE/SSE2 enabled", type);
2865 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
2866 (and DFmode for SSE2) arguments in SSE registers. */
2867 if (decl && TARGET_SSE_MATH && flag_unit_at_a_time && !profile_flag)
2869 struct cgraph_local_info *i = cgraph_local_info (decl);
2871 return TARGET_SSE2 ? 2 : 1;
2877 /* Return true if EAX is live at the start of the function. Used by
2878 ix86_expand_prologue to determine if we need special help before
2879 calling allocate_stack_worker. */
2882 ix86_eax_live_at_start_p (void)
2884 /* Cheat. Don't bother working forward from ix86_function_regparm
2885 to the function type to whether an actual argument is located in
2886 eax. Instead just look at cfg info, which is still close enough
2887 to correct at this point. This gives false positives for broken
2888 functions that might use uninitialized data that happens to be
2889 allocated in eax, but who cares? */
/* Hard register number 0 is %eax in this backend; test its bit in the
   live-at-end set of the CFG entry block (i.e. live on function entry).  */
2890 return REGNO_REG_SET_P (ENTRY_BLOCK_PTR->il.rtl->global_live_at_end, 0);
2893 /* Return true if TYPE has a variable argument list. */
2896 type_has_variadic_args_p (tree type)
/* A prototyped, non-variadic function type has its TYPE_ARG_TYPES list
   terminated by void_list_node; reaching that node means the argument
   list is fixed.  NOTE(review): the return statements for the found /
   not-found cases are outside the visible lines of this chunk.  */
2900 for (t = TYPE_ARG_TYPES (type); t; t = TREE_CHAIN (t))
2901 if (t == void_list_node)
2906 /* Value is the number of bytes of arguments automatically
2907 popped when returning from a subroutine call.
2908 FUNDECL is the declaration node of the function (as a tree),
2909 FUNTYPE is the data type of the function (as a tree),
2910 or for a library call it is an identifier node for the subroutine name.
2911 SIZE is the number of bytes of arguments passed on the stack.
2913 On the 80386, the RTD insn may be used to pop them if the number
2914 of args is fixed, but if the number is variable then the caller
2915 must pop them all. RTD can't be used for library calls now
2916 because the library is compiled with the Unix compiler.
2917 Use of RTD is a selectable option, since it is incompatible with
2918 standard Unix calling sequences. If the option is not selected,
2919 the caller must always pop the args.
2921 The attribute stdcall is equivalent to RTD on a per module basis. */
2924 ix86_return_pops_args (tree fundecl, tree funtype, int size)
2928 /* None of the 64-bit ABIs pop arguments. */
2932 rtd = TARGET_RTD && (!fundecl || TREE_CODE (fundecl) != IDENTIFIER_NODE);
2934 /* Cdecl functions override -mrtd, and never pop the stack. */
2935 if (! lookup_attribute ("cdecl", TYPE_ATTRIBUTES (funtype)))
2937 /* Stdcall and fastcall functions will pop the stack if not
2939 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (funtype))
2940 || lookup_attribute ("fastcall", TYPE_ATTRIBUTES (funtype)))
2943 if (rtd && ! type_has_variadic_args_p (funtype))