1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
3 2002, 2003, 2004, 2005, 2006, 2007, 2008
4 Free Software Foundation, Inc.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
11 any later version.
13 GCC is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
30 #include "hard-reg-set.h"
32 #include "insn-config.h"
33 #include "conditions.h"
35 #include "insn-codes.h"
36 #include "insn-attr.h"
45 #include "basic-block.h"
48 #include "target-def.h"
49 #include "langhooks.h"
51 #include "tree-gimple.h"
54 #include "tm-constrs.h"
56 /* Forward declarations; the definitions presumably appear later in this
   file (not visible in this chunk) -- confirm before relying on them.  */
57 static int x86_builtin_vectorization_cost (bool);
58 static rtx legitimize_dllimport_symbol (rtx, bool);
60 /* Default stack-limit check threshold; -1 disables the check unless the
   target configuration overrides it.  */
60 #ifndef CHECK_STACK_LIMIT
61 #define CHECK_STACK_LIMIT (-1)
62 #endif
64 /* Return index of given mode in mult and division cost tables.  The final
   arm (index 4) is the "other" column used by the cost tables below.  */
65 #define MODE_INDEX(mode) \
66 ((mode) == QImode ? 0 \
67 : (mode) == HImode ? 1 \
68 : (mode) == SImode ? 2 \
69 : (mode) == DImode ? 3 \
70 : 4)
72 /* Processor costs (relative to an add) */
73 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
74 #define COSTS_N_BYTES(N) ((N) * 2)
75 /* Placeholder stringop strategy: always fall back to a library call.
   Used to fill table slots that are not applicable for a CPU -- presumably
   the second (64-bit) variant on 32-bit-only processors; confirm against
   the consumers of these tables.  */
76 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
79 struct processor_costs size_cost = { /* costs for tuning for size */
80 COSTS_N_BYTES (2), /* cost of an add instruction */
81 COSTS_N_BYTES (3), /* cost of a lea instruction */
82 COSTS_N_BYTES (2), /* variable shift costs */
83 COSTS_N_BYTES (3), /* constant shift costs */
84 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
85 COSTS_N_BYTES (3), /* HI */
86 COSTS_N_BYTES (3), /* SI */
87 COSTS_N_BYTES (3), /* DI */
88 COSTS_N_BYTES (5)}, /* other */
89 0, /* cost of multiply per each bit set */
90 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
91 COSTS_N_BYTES (3), /* HI */
92 COSTS_N_BYTES (3), /* SI */
93 COSTS_N_BYTES (3), /* DI */
94 COSTS_N_BYTES (5)}, /* other */
95 COSTS_N_BYTES (3), /* cost of movsx */
96 COSTS_N_BYTES (3), /* cost of movzx */
97 /* NOTE(review): two initializers appear to have been dropped here (upstream
   has '"large" insn' and "MOVE_RATIO" at this position -- compare i386_cost
   below, which still has its '"large" insn' entry).  Until restored, every
   later field is shifted.  */
99 2, /* cost for loading QImode using movzbl */
100 {2, 2, 2}, /* cost of loading integer registers
101 in QImode, HImode and SImode.
102 Relative to reg-reg move (2). */
103 {2, 2, 2}, /* cost of storing integer registers */
104 2, /* cost of reg,reg fld/fst */
105 {2, 2, 2}, /* cost of loading fp registers
106 in SFmode, DFmode and XFmode */
107 {2, 2, 2}, /* cost of storing fp registers
108 in SFmode, DFmode and XFmode */
109 3, /* cost of moving MMX register */
110 {3, 3}, /* cost of loading MMX registers
111 in SImode and DImode */
112 {3, 3}, /* cost of storing MMX registers
113 in SImode and DImode */
114 3, /* cost of moving SSE register */
115 {3, 3, 3}, /* cost of loading SSE registers
116 in SImode, DImode and TImode */
117 {3, 3, 3}, /* cost of storing SSE registers
118 in SImode, DImode and TImode */
119 3, /* MMX or SSE register to integer */
120 0, /* size of l1 cache */
121 0, /* size of l2 cache */
122 0, /* size of prefetch block */
123 0, /* number of parallel prefetches */
124 /* NOTE(review): one initializer appears missing here (upstream has
   "Branch cost" at this position).  */
125 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
126 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
127 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
128 COSTS_N_BYTES (2), /* cost of FABS instruction. */
129 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
130 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
131 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
132 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
133 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
134 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
135 1, /* scalar_stmt_cost. */
136 1, /* scalar load_cost. */
137 1, /* scalar_store_cost. */
138 1, /* vec_stmt_cost. */
139 1, /* vec_to_scalar_cost. */
140 1, /* scalar_to_vec_cost. */
141 1, /* vec_align_load_cost. */
142 1, /* vec_unalign_load_cost. */
143 1, /* vec_store_cost. */
144 1, /* cond_taken_branch_cost. */
145 1, /* cond_not_taken_branch_cost. */
146 };
148 /* Processor costs (relative to an add) */
150 struct processor_costs i386_cost = { /* 386 specific costs */
151 COSTS_N_INSNS (1), /* cost of an add instruction */
152 COSTS_N_INSNS (1), /* cost of a lea instruction */
153 COSTS_N_INSNS (3), /* variable shift costs */
154 COSTS_N_INSNS (2), /* constant shift costs */
155 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
156 COSTS_N_INSNS (6), /* HI */
157 COSTS_N_INSNS (6), /* SI */
158 COSTS_N_INSNS (6), /* DI */
159 COSTS_N_INSNS (6)}, /* other */
160 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
161 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
162 COSTS_N_INSNS (23), /* HI */
163 COSTS_N_INSNS (23), /* SI */
164 COSTS_N_INSNS (23), /* DI */
165 COSTS_N_INSNS (23)}, /* other */
166 COSTS_N_INSNS (3), /* cost of movsx */
167 COSTS_N_INSNS (2), /* cost of movzx */
168 15, /* "large" insn */
169 /* NOTE(review): one initializer appears missing here (upstream has
   "MOVE_RATIO" at this position); later fields shift by one until it is
   restored.  */
170 4, /* cost for loading QImode using movzbl */
171 {2, 4, 2}, /* cost of loading integer registers
172 in QImode, HImode and SImode.
173 Relative to reg-reg move (2). */
174 {2, 4, 2}, /* cost of storing integer registers */
175 2, /* cost of reg,reg fld/fst */
176 {8, 8, 8}, /* cost of loading fp registers
177 in SFmode, DFmode and XFmode */
178 {8, 8, 8}, /* cost of storing fp registers
179 in SFmode, DFmode and XFmode */
180 2, /* cost of moving MMX register */
181 {4, 8}, /* cost of loading MMX registers
182 in SImode and DImode */
183 {4, 8}, /* cost of storing MMX registers
184 in SImode and DImode */
185 2, /* cost of moving SSE register */
186 {4, 8, 16}, /* cost of loading SSE registers
187 in SImode, DImode and TImode */
188 {4, 8, 16}, /* cost of storing SSE registers
189 in SImode, DImode and TImode */
190 3, /* MMX or SSE register to integer */
191 0, /* size of l1 cache */
192 0, /* size of l2 cache */
193 0, /* size of prefetch block */
194 0, /* number of parallel prefetches */
195 /* NOTE(review): one initializer appears missing here (upstream has
   "Branch cost" at this position).  */
196 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
197 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
198 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
199 COSTS_N_INSNS (22), /* cost of FABS instruction. */
200 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
201 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
202 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
203 DUMMY_STRINGOP_ALGS},
204 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
205 DUMMY_STRINGOP_ALGS},
206 1, /* scalar_stmt_cost. */
207 1, /* scalar load_cost. */
208 1, /* scalar_store_cost. */
209 1, /* vec_stmt_cost. */
210 1, /* vec_to_scalar_cost. */
211 1, /* scalar_to_vec_cost. */
212 1, /* vec_align_load_cost. */
213 2, /* vec_unalign_load_cost. */
214 1, /* vec_store_cost. */
215 3, /* cond_taken_branch_cost. */
216 1, /* cond_not_taken_branch_cost. */
217 };
220 struct processor_costs i486_cost = { /* 486 specific costs */
221 COSTS_N_INSNS (1), /* cost of an add instruction */
222 COSTS_N_INSNS (1), /* cost of a lea instruction */
223 COSTS_N_INSNS (3), /* variable shift costs */
224 COSTS_N_INSNS (2), /* constant shift costs */
225 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
226 COSTS_N_INSNS (12), /* HI */
227 COSTS_N_INSNS (12), /* SI */
228 COSTS_N_INSNS (12), /* DI */
229 COSTS_N_INSNS (12)}, /* other */
230 1, /* cost of multiply per each bit set */
231 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
232 COSTS_N_INSNS (40), /* HI */
233 COSTS_N_INSNS (40), /* SI */
234 COSTS_N_INSNS (40), /* DI */
235 COSTS_N_INSNS (40)}, /* other */
236 COSTS_N_INSNS (3), /* cost of movsx */
237 COSTS_N_INSNS (2), /* cost of movzx */
238 15, /* "large" insn */
239 /* NOTE(review): one initializer appears missing here (upstream has
   "MOVE_RATIO" at this position); later fields shift by one until it is
   restored.  */
240 4, /* cost for loading QImode using movzbl */
241 {2, 4, 2}, /* cost of loading integer registers
242 in QImode, HImode and SImode.
243 Relative to reg-reg move (2). */
244 {2, 4, 2}, /* cost of storing integer registers */
245 2, /* cost of reg,reg fld/fst */
246 {8, 8, 8}, /* cost of loading fp registers
247 in SFmode, DFmode and XFmode */
248 {8, 8, 8}, /* cost of storing fp registers
249 in SFmode, DFmode and XFmode */
250 2, /* cost of moving MMX register */
251 {4, 8}, /* cost of loading MMX registers
252 in SImode and DImode */
253 {4, 8}, /* cost of storing MMX registers
254 in SImode and DImode */
255 2, /* cost of moving SSE register */
256 {4, 8, 16}, /* cost of loading SSE registers
257 in SImode, DImode and TImode */
258 {4, 8, 16}, /* cost of storing SSE registers
259 in SImode, DImode and TImode */
260 3, /* MMX or SSE register to integer */
261 4, /* size of l1 cache. 486 has 8kB cache
262 shared for code and data, so 4kB is
263 not really precise. */
264 4, /* size of l2 cache */
265 0, /* size of prefetch block */
266 0, /* number of parallel prefetches */
267 /* NOTE(review): one initializer appears missing here (upstream has
   "Branch cost" at this position).  */
268 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
269 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
270 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
271 COSTS_N_INSNS (3), /* cost of FABS instruction. */
272 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
273 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
274 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
275 DUMMY_STRINGOP_ALGS},
276 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
277 DUMMY_STRINGOP_ALGS},
278 1, /* scalar_stmt_cost. */
279 1, /* scalar load_cost. */
280 1, /* scalar_store_cost. */
281 1, /* vec_stmt_cost. */
282 1, /* vec_to_scalar_cost. */
283 1, /* scalar_to_vec_cost. */
284 1, /* vec_align_load_cost. */
285 2, /* vec_unalign_load_cost. */
286 1, /* vec_store_cost. */
287 3, /* cond_taken_branch_cost. */
288 1, /* cond_not_taken_branch_cost. */
289 };
292 struct processor_costs pentium_cost = {
293 COSTS_N_INSNS (1), /* cost of an add instruction */
294 COSTS_N_INSNS (1), /* cost of a lea instruction */
295 COSTS_N_INSNS (4), /* variable shift costs */
296 COSTS_N_INSNS (1), /* constant shift costs */
297 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
298 COSTS_N_INSNS (11), /* HI */
299 COSTS_N_INSNS (11), /* SI */
300 COSTS_N_INSNS (11), /* DI */
301 COSTS_N_INSNS (11)}, /* other */
302 0, /* cost of multiply per each bit set */
303 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
304 COSTS_N_INSNS (25), /* HI */
305 COSTS_N_INSNS (25), /* SI */
306 COSTS_N_INSNS (25), /* DI */
307 COSTS_N_INSNS (25)}, /* other */
308 COSTS_N_INSNS (3), /* cost of movsx */
309 COSTS_N_INSNS (2), /* cost of movzx */
310 8, /* "large" insn */
311 /* NOTE(review): one initializer appears missing here (upstream has
   "MOVE_RATIO" at this position); later fields shift by one until it is
   restored.  */
312 6, /* cost for loading QImode using movzbl */
313 {2, 4, 2}, /* cost of loading integer registers
314 in QImode, HImode and SImode.
315 Relative to reg-reg move (2). */
316 {2, 4, 2}, /* cost of storing integer registers */
317 2, /* cost of reg,reg fld/fst */
318 {2, 2, 6}, /* cost of loading fp registers
319 in SFmode, DFmode and XFmode */
320 {4, 4, 6}, /* cost of storing fp registers
321 in SFmode, DFmode and XFmode */
322 8, /* cost of moving MMX register */
323 {8, 8}, /* cost of loading MMX registers
324 in SImode and DImode */
325 {8, 8}, /* cost of storing MMX registers
326 in SImode and DImode */
327 2, /* cost of moving SSE register */
328 {4, 8, 16}, /* cost of loading SSE registers
329 in SImode, DImode and TImode */
330 {4, 8, 16}, /* cost of storing SSE registers
331 in SImode, DImode and TImode */
332 3, /* MMX or SSE register to integer */
333 8, /* size of l1 cache. */
334 8, /* size of l2 cache */
335 0, /* size of prefetch block */
336 0, /* number of parallel prefetches */
337 /* NOTE(review): one initializer appears missing here (upstream has
   "Branch cost" at this position).  */
338 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
339 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
340 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
341 COSTS_N_INSNS (1), /* cost of FABS instruction. */
342 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
343 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
344 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
345 DUMMY_STRINGOP_ALGS},
346 {{libcall, {{-1, rep_prefix_4_byte}}},
347 DUMMY_STRINGOP_ALGS},
348 1, /* scalar_stmt_cost. */
349 1, /* scalar load_cost. */
350 1, /* scalar_store_cost. */
351 1, /* vec_stmt_cost. */
352 1, /* vec_to_scalar_cost. */
353 1, /* scalar_to_vec_cost. */
354 1, /* vec_align_load_cost. */
355 2, /* vec_unalign_load_cost. */
356 1, /* vec_store_cost. */
357 3, /* cond_taken_branch_cost. */
358 1, /* cond_not_taken_branch_cost. */
359 };
362 struct processor_costs pentiumpro_cost = {
363 COSTS_N_INSNS (1), /* cost of an add instruction */
364 COSTS_N_INSNS (1), /* cost of a lea instruction */
365 COSTS_N_INSNS (1), /* variable shift costs */
366 COSTS_N_INSNS (1), /* constant shift costs */
367 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
368 COSTS_N_INSNS (4), /* HI */
369 COSTS_N_INSNS (4), /* SI */
370 COSTS_N_INSNS (4), /* DI */
371 COSTS_N_INSNS (4)}, /* other */
372 0, /* cost of multiply per each bit set */
373 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
374 COSTS_N_INSNS (17), /* HI */
375 COSTS_N_INSNS (17), /* SI */
376 COSTS_N_INSNS (17), /* DI */
377 COSTS_N_INSNS (17)}, /* other */
378 COSTS_N_INSNS (1), /* cost of movsx */
379 COSTS_N_INSNS (1), /* cost of movzx */
380 8, /* "large" insn */
381 /* NOTE(review): one initializer appears missing here (upstream has
   "MOVE_RATIO" at this position); later fields shift by one until it is
   restored.  */
382 2, /* cost for loading QImode using movzbl */
383 {4, 4, 4}, /* cost of loading integer registers
384 in QImode, HImode and SImode.
385 Relative to reg-reg move (2). */
386 {2, 2, 2}, /* cost of storing integer registers */
387 2, /* cost of reg,reg fld/fst */
388 {2, 2, 6}, /* cost of loading fp registers
389 in SFmode, DFmode and XFmode */
390 {4, 4, 6}, /* cost of storing fp registers
391 in SFmode, DFmode and XFmode */
392 2, /* cost of moving MMX register */
393 {2, 2}, /* cost of loading MMX registers
394 in SImode and DImode */
395 {2, 2}, /* cost of storing MMX registers
396 in SImode and DImode */
397 2, /* cost of moving SSE register */
398 {2, 2, 8}, /* cost of loading SSE registers
399 in SImode, DImode and TImode */
400 {2, 2, 8}, /* cost of storing SSE registers
401 in SImode, DImode and TImode */
402 3, /* MMX or SSE register to integer */
403 8, /* size of l1 cache. */
404 256, /* size of l2 cache */
405 32, /* size of prefetch block */
406 6, /* number of parallel prefetches */
407 /* NOTE(review): one initializer appears missing here (upstream has
   "Branch cost" at this position).  */
408 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
409 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
410 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
411 COSTS_N_INSNS (2), /* cost of FABS instruction. */
412 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
413 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
414 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes (we ensure
415 the alignment). For small blocks inline loop is still a noticeable win, for bigger
416 blocks either rep movsl or rep movsb is way to go. Rep movsb has apparently
417 more expensive startup time in CPU, but after 4K the difference is down in the noise.
418 */
419 {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
420 {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
421 DUMMY_STRINGOP_ALGS},
422 {{rep_prefix_4_byte, {{1024, unrolled_loop},
423 {8192, rep_prefix_4_byte}, {-1, libcall}}},
424 DUMMY_STRINGOP_ALGS},
425 1, /* scalar_stmt_cost. */
426 1, /* scalar load_cost. */
427 1, /* scalar_store_cost. */
428 1, /* vec_stmt_cost. */
429 1, /* vec_to_scalar_cost. */
430 1, /* scalar_to_vec_cost. */
431 1, /* vec_align_load_cost. */
432 2, /* vec_unalign_load_cost. */
433 1, /* vec_store_cost. */
434 3, /* cond_taken_branch_cost. */
435 1, /* cond_not_taken_branch_cost. */
436 };
439 struct processor_costs geode_cost = {
440 COSTS_N_INSNS (1), /* cost of an add instruction */
441 COSTS_N_INSNS (1), /* cost of a lea instruction */
442 COSTS_N_INSNS (2), /* variable shift costs */
443 COSTS_N_INSNS (1), /* constant shift costs */
444 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
445 COSTS_N_INSNS (4), /* HI */
446 COSTS_N_INSNS (7), /* SI */
447 COSTS_N_INSNS (7), /* DI */
448 COSTS_N_INSNS (7)}, /* other */
449 0, /* cost of multiply per each bit set */
450 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
451 COSTS_N_INSNS (23), /* HI */
452 COSTS_N_INSNS (39), /* SI */
453 COSTS_N_INSNS (39), /* DI */
454 COSTS_N_INSNS (39)}, /* other */
455 COSTS_N_INSNS (1), /* cost of movsx */
456 COSTS_N_INSNS (1), /* cost of movzx */
457 8, /* "large" insn */
458 /* NOTE(review): one initializer appears missing here (upstream has
   "MOVE_RATIO" at this position); later fields shift by one until it is
   restored.  */
459 1, /* cost for loading QImode using movzbl */
460 {1, 1, 1}, /* cost of loading integer registers
461 in QImode, HImode and SImode.
462 Relative to reg-reg move (2). */
463 {1, 1, 1}, /* cost of storing integer registers */
464 1, /* cost of reg,reg fld/fst */
465 {1, 1, 1}, /* cost of loading fp registers
466 in SFmode, DFmode and XFmode */
467 {4, 6, 6}, /* cost of storing fp registers
468 in SFmode, DFmode and XFmode */
470 1, /* cost of moving MMX register */
471 {1, 1}, /* cost of loading MMX registers
472 in SImode and DImode */
473 {1, 1}, /* cost of storing MMX registers
474 in SImode and DImode */
475 1, /* cost of moving SSE register */
476 {1, 1, 1}, /* cost of loading SSE registers
477 in SImode, DImode and TImode */
478 {1, 1, 1}, /* cost of storing SSE registers
479 in SImode, DImode and TImode */
480 1, /* MMX or SSE register to integer */
481 64, /* size of l1 cache. */
482 128, /* size of l2 cache. */
483 32, /* size of prefetch block */
484 1, /* number of parallel prefetches */
485 /* NOTE(review): one initializer appears missing here (upstream has
   "Branch cost" at this position).  */
486 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
487 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
488 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
489 COSTS_N_INSNS (1), /* cost of FABS instruction. */
490 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
491 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
492 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
493 DUMMY_STRINGOP_ALGS},
494 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
495 DUMMY_STRINGOP_ALGS},
496 1, /* scalar_stmt_cost. */
497 1, /* scalar load_cost. */
498 1, /* scalar_store_cost. */
499 1, /* vec_stmt_cost. */
500 1, /* vec_to_scalar_cost. */
501 1, /* scalar_to_vec_cost. */
502 1, /* vec_align_load_cost. */
503 2, /* vec_unalign_load_cost. */
504 1, /* vec_store_cost. */
505 3, /* cond_taken_branch_cost. */
506 1, /* cond_not_taken_branch_cost. */
507 };
510 struct processor_costs k6_cost = {
511 COSTS_N_INSNS (1), /* cost of an add instruction */
512 COSTS_N_INSNS (2), /* cost of a lea instruction */
513 COSTS_N_INSNS (1), /* variable shift costs */
514 COSTS_N_INSNS (1), /* constant shift costs */
515 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
516 COSTS_N_INSNS (3), /* HI */
517 COSTS_N_INSNS (3), /* SI */
518 COSTS_N_INSNS (3), /* DI */
519 COSTS_N_INSNS (3)}, /* other */
520 0, /* cost of multiply per each bit set */
521 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
522 COSTS_N_INSNS (18), /* HI */
523 COSTS_N_INSNS (18), /* SI */
524 COSTS_N_INSNS (18), /* DI */
525 COSTS_N_INSNS (18)}, /* other */
526 COSTS_N_INSNS (2), /* cost of movsx */
527 COSTS_N_INSNS (2), /* cost of movzx */
528 8, /* "large" insn */
529 /* NOTE(review): one initializer appears missing here (upstream has
   "MOVE_RATIO" at this position); later fields shift by one until it is
   restored.  */
530 3, /* cost for loading QImode using movzbl */
531 {4, 5, 4}, /* cost of loading integer registers
532 in QImode, HImode and SImode.
533 Relative to reg-reg move (2). */
534 {2, 3, 2}, /* cost of storing integer registers */
535 4, /* cost of reg,reg fld/fst */
536 {6, 6, 6}, /* cost of loading fp registers
537 in SFmode, DFmode and XFmode */
538 {4, 4, 4}, /* cost of storing fp registers
539 in SFmode, DFmode and XFmode */
540 2, /* cost of moving MMX register */
541 {2, 2}, /* cost of loading MMX registers
542 in SImode and DImode */
543 {2, 2}, /* cost of storing MMX registers
544 in SImode and DImode */
545 2, /* cost of moving SSE register */
546 {2, 2, 8}, /* cost of loading SSE registers
547 in SImode, DImode and TImode */
548 {2, 2, 8}, /* cost of storing SSE registers
549 in SImode, DImode and TImode */
550 6, /* MMX or SSE register to integer */
551 32, /* size of l1 cache. */
552 32, /* size of l2 cache. Some models
553 have integrated l2 cache, but
554 optimizing for k6 is not important
555 enough to worry about that. */
556 32, /* size of prefetch block */
557 1, /* number of parallel prefetches */
558 /* NOTE(review): one initializer appears missing here (upstream has
   "Branch cost" at this position).  */
559 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
560 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
561 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
562 COSTS_N_INSNS (2), /* cost of FABS instruction. */
563 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
564 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
565 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
566 DUMMY_STRINGOP_ALGS},
567 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
568 DUMMY_STRINGOP_ALGS},
569 1, /* scalar_stmt_cost. */
570 1, /* scalar load_cost. */
571 1, /* scalar_store_cost. */
572 1, /* vec_stmt_cost. */
573 1, /* vec_to_scalar_cost. */
574 1, /* scalar_to_vec_cost. */
575 1, /* vec_align_load_cost. */
576 2, /* vec_unalign_load_cost. */
577 1, /* vec_store_cost. */
578 3, /* cond_taken_branch_cost. */
579 1, /* cond_not_taken_branch_cost. */
580 };
583 struct processor_costs athlon_cost = {
584 COSTS_N_INSNS (1), /* cost of an add instruction */
585 COSTS_N_INSNS (2), /* cost of a lea instruction */
586 COSTS_N_INSNS (1), /* variable shift costs */
587 COSTS_N_INSNS (1), /* constant shift costs */
588 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
589 COSTS_N_INSNS (5), /* HI */
590 COSTS_N_INSNS (5), /* SI */
591 COSTS_N_INSNS (5), /* DI */
592 COSTS_N_INSNS (5)}, /* other */
593 0, /* cost of multiply per each bit set */
594 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
595 COSTS_N_INSNS (26), /* HI */
596 COSTS_N_INSNS (42), /* SI */
597 COSTS_N_INSNS (74), /* DI */
598 COSTS_N_INSNS (74)}, /* other */
599 COSTS_N_INSNS (1), /* cost of movsx */
600 COSTS_N_INSNS (1), /* cost of movzx */
601 8, /* "large" insn */
602 /* NOTE(review): one initializer appears missing here (upstream has
   "MOVE_RATIO" at this position); later fields shift by one until it is
   restored.  */
603 4, /* cost for loading QImode using movzbl */
604 {3, 4, 3}, /* cost of loading integer registers
605 in QImode, HImode and SImode.
606 Relative to reg-reg move (2). */
607 {3, 4, 3}, /* cost of storing integer registers */
608 4, /* cost of reg,reg fld/fst */
609 {4, 4, 12}, /* cost of loading fp registers
610 in SFmode, DFmode and XFmode */
611 {6, 6, 8}, /* cost of storing fp registers
612 in SFmode, DFmode and XFmode */
613 2, /* cost of moving MMX register */
614 {4, 4}, /* cost of loading MMX registers
615 in SImode and DImode */
616 {4, 4}, /* cost of storing MMX registers
617 in SImode and DImode */
618 2, /* cost of moving SSE register */
619 {4, 4, 6}, /* cost of loading SSE registers
620 in SImode, DImode and TImode */
621 {4, 4, 5}, /* cost of storing SSE registers
622 in SImode, DImode and TImode */
623 5, /* MMX or SSE register to integer */
624 64, /* size of l1 cache. */
625 256, /* size of l2 cache. */
626 64, /* size of prefetch block */
627 6, /* number of parallel prefetches */
628 /* NOTE(review): one initializer appears missing here (upstream has
   "Branch cost" at this position).  */
629 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
630 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
631 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
632 COSTS_N_INSNS (2), /* cost of FABS instruction. */
633 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
634 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
635 /* For some reason, Athlon deals better with REP prefix (relative to loops)
636 compared to K8. Alignment becomes important after 8 bytes for memcpy and
637 128 bytes for memset. */
638 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
639 DUMMY_STRINGOP_ALGS},
640 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
641 DUMMY_STRINGOP_ALGS},
642 1, /* scalar_stmt_cost. */
643 1, /* scalar load_cost. */
644 1, /* scalar_store_cost. */
645 1, /* vec_stmt_cost. */
646 1, /* vec_to_scalar_cost. */
647 1, /* scalar_to_vec_cost. */
648 1, /* vec_align_load_cost. */
649 2, /* vec_unalign_load_cost. */
650 1, /* vec_store_cost. */
651 3, /* cond_taken_branch_cost. */
652 1, /* cond_not_taken_branch_cost. */
653 };
656 struct processor_costs k8_cost = {
657 COSTS_N_INSNS (1), /* cost of an add instruction */
658 COSTS_N_INSNS (2), /* cost of a lea instruction */
659 COSTS_N_INSNS (1), /* variable shift costs */
660 COSTS_N_INSNS (1), /* constant shift costs */
661 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
662 COSTS_N_INSNS (4), /* HI */
663 COSTS_N_INSNS (3), /* SI */
664 COSTS_N_INSNS (4), /* DI */
665 COSTS_N_INSNS (5)}, /* other */
666 0, /* cost of multiply per each bit set */
667 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
668 COSTS_N_INSNS (26), /* HI */
669 COSTS_N_INSNS (42), /* SI */
670 COSTS_N_INSNS (74), /* DI */
671 COSTS_N_INSNS (74)}, /* other */
672 COSTS_N_INSNS (1), /* cost of movsx */
673 COSTS_N_INSNS (1), /* cost of movzx */
674 8, /* "large" insn */
675 /* NOTE(review): one initializer appears missing here (upstream has
   "MOVE_RATIO" at this position); later fields shift by one until it is
   restored.  */
676 4, /* cost for loading QImode using movzbl */
677 {3, 4, 3}, /* cost of loading integer registers
678 in QImode, HImode and SImode.
679 Relative to reg-reg move (2). */
680 {3, 4, 3}, /* cost of storing integer registers */
681 4, /* cost of reg,reg fld/fst */
682 {4, 4, 12}, /* cost of loading fp registers
683 in SFmode, DFmode and XFmode */
684 {6, 6, 8}, /* cost of storing fp registers
685 in SFmode, DFmode and XFmode */
686 2, /* cost of moving MMX register */
687 {3, 3}, /* cost of loading MMX registers
688 in SImode and DImode */
689 {4, 4}, /* cost of storing MMX registers
690 in SImode and DImode */
691 2, /* cost of moving SSE register */
692 {4, 3, 6}, /* cost of loading SSE registers
693 in SImode, DImode and TImode */
694 {4, 4, 5}, /* cost of storing SSE registers
695 in SImode, DImode and TImode */
696 5, /* MMX or SSE register to integer */
697 64, /* size of l1 cache. */
698 512, /* size of l2 cache. */
699 64, /* size of prefetch block */
700 /* New AMD processors never drop prefetches; if they cannot be performed
701 immediately, they are queued. We set number of simultaneous prefetches
702 to a large constant to reflect this (it probably is not a good idea not
703 to limit number of prefetches at all, as their execution also takes some
704 time). */
705 100, /* number of parallel prefetches */
706 /* NOTE(review): one initializer appears missing here (upstream has
   "Branch cost" at this position).  */
707 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
708 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
709 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
710 COSTS_N_INSNS (2), /* cost of FABS instruction. */
711 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
712 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
713 /* K8 has optimized REP instruction for medium sized blocks, but for very small
714 blocks it is better to use loop. For large blocks, libcall can do
715 nontemporary accesses and beat inline considerably. */
716 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
717 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
718 {{libcall, {{8, loop}, {24, unrolled_loop},
719 {2048, rep_prefix_4_byte}, {-1, libcall}}},
720 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
721 4, /* scalar_stmt_cost. */
722 2, /* scalar load_cost. */
723 2, /* scalar_store_cost. */
724 5, /* vec_stmt_cost. */
725 0, /* vec_to_scalar_cost. */
726 2, /* scalar_to_vec_cost. */
727 2, /* vec_align_load_cost. */
728 3, /* vec_unalign_load_cost. */
729 3, /* vec_store_cost. */
730 3, /* cond_taken_branch_cost. */
731 2, /* cond_not_taken_branch_cost. */
732 };
734 struct processor_costs amdfam10_cost = {
735 COSTS_N_INSNS (1), /* cost of an add instruction */
736 COSTS_N_INSNS (2), /* cost of a lea instruction */
737 COSTS_N_INSNS (1), /* variable shift costs */
738 COSTS_N_INSNS (1), /* constant shift costs */
739 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
740 COSTS_N_INSNS (4), /* HI */
741 COSTS_N_INSNS (3), /* SI */
742 COSTS_N_INSNS (4), /* DI */
743 COSTS_N_INSNS (5)}, /* other */
744 0, /* cost of multiply per each bit set */
745 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
746 COSTS_N_INSNS (35), /* HI */
747 COSTS_N_INSNS (51), /* SI */
748 COSTS_N_INSNS (83), /* DI */
749 COSTS_N_INSNS (83)}, /* other */
750 COSTS_N_INSNS (1), /* cost of movsx */
751 COSTS_N_INSNS (1), /* cost of movzx */
752 8, /* "large" insn */
753 /* NOTE(review): one initializer appears missing here (upstream has
   "MOVE_RATIO" at this position); later fields shift by one until it is
   restored.  */
754 4, /* cost for loading QImode using movzbl */
755 {3, 4, 3}, /* cost of loading integer registers
756 in QImode, HImode and SImode.
757 Relative to reg-reg move (2). */
758 {3, 4, 3}, /* cost of storing integer registers */
759 4, /* cost of reg,reg fld/fst */
760 {4, 4, 12}, /* cost of loading fp registers
761 in SFmode, DFmode and XFmode */
762 {6, 6, 8}, /* cost of storing fp registers
763 in SFmode, DFmode and XFmode */
764 2, /* cost of moving MMX register */
765 {3, 3}, /* cost of loading MMX registers
766 in SImode and DImode */
767 {4, 4}, /* cost of storing MMX registers
768 in SImode and DImode */
769 2, /* cost of moving SSE register */
770 {4, 4, 3}, /* cost of loading SSE registers
771 in SImode, DImode and TImode */
772 {4, 4, 5}, /* cost of storing SSE registers
773 in SImode, DImode and TImode */
774 3, /* MMX or SSE register to integer */
775 /* NOTE(review): latency notes below lost their comment delimiters in this
   copy; re-wrapped so the following lines do not parse as code:
776 MOVD reg64, xmmreg Double FSTORE 4
777 MOVD reg32, xmmreg Double FSTORE 4
779 MOVD reg64, xmmreg Double FADD 3
781 MOVD reg32, xmmreg Double FADD 3
782 */
783 64, /* size of l1 cache. */
784 512, /* size of l2 cache. */
785 64, /* size of prefetch block */
786 /* New AMD processors never drop prefetches; if they cannot be performed
787 immediately, they are queued. We set number of simultaneous prefetches
788 to a large constant to reflect this (it probably is not a good idea not
789 to limit number of prefetches at all, as their execution also takes some
790 time). */
791 100, /* number of parallel prefetches */
792 /* NOTE(review): one initializer appears missing here (upstream has
   "Branch cost" at this position).  */
793 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
794 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
795 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
796 COSTS_N_INSNS (2), /* cost of FABS instruction. */
797 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
798 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
800 /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
801 very small blocks it is better to use loop. For large blocks, libcall can
802 do nontemporary accesses and beat inline considerably. */
803 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
804 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
805 {{libcall, {{8, loop}, {24, unrolled_loop},
806 {2048, rep_prefix_4_byte}, {-1, libcall}}},
807 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
808 4, /* scalar_stmt_cost. */
809 2, /* scalar load_cost. */
810 2, /* scalar_store_cost. */
811 6, /* vec_stmt_cost. */
812 0, /* vec_to_scalar_cost. */
813 2, /* scalar_to_vec_cost. */
814 2, /* vec_align_load_cost. */
815 2, /* vec_unalign_load_cost. */
816 2, /* vec_store_cost. */
817 2, /* cond_taken_branch_cost. */
818 1, /* cond_not_taken_branch_cost. */
819 };
822 struct processor_costs pentium4_cost = {
823 COSTS_N_INSNS (1), /* cost of an add instruction */
824 COSTS_N_INSNS (3), /* cost of a lea instruction */
825 COSTS_N_INSNS (4), /* variable shift costs */
826 COSTS_N_INSNS (4), /* constant shift costs */
827 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
828 COSTS_N_INSNS (15), /* HI */
829 COSTS_N_INSNS (15), /* SI */
830 COSTS_N_INSNS (15), /* DI */
831 COSTS_N_INSNS (15)}, /* other */
832 0, /* cost of multiply per each bit set */
833 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
834 COSTS_N_INSNS (56), /* HI */
835 COSTS_N_INSNS (56), /* SI */
836 COSTS_N_INSNS (56), /* DI */
837 COSTS_N_INSNS (56)}, /* other */
838 COSTS_N_INSNS (1), /* cost of movsx */
839 COSTS_N_INSNS (1), /* cost of movzx */
840 16, /* "large" insn */
841 /* NOTE(review): one initializer appears missing here (upstream has
   "MOVE_RATIO" at this position); later fields shift by one until it is
   restored.  */
842 2, /* cost for loading QImode using movzbl */
843 {4, 5, 4}, /* cost of loading integer registers
844 in QImode, HImode and SImode.
845 Relative to reg-reg move (2). */
846 {2, 3, 2}, /* cost of storing integer registers */
847 2, /* cost of reg,reg fld/fst */
848 {2, 2, 6}, /* cost of loading fp registers
849 in SFmode, DFmode and XFmode */
850 {4, 4, 6}, /* cost of storing fp registers
851 in SFmode, DFmode and XFmode */
852 2, /* cost of moving MMX register */
853 {2, 2}, /* cost of loading MMX registers
854 in SImode and DImode */
855 {2, 2}, /* cost of storing MMX registers
856 in SImode and DImode */
857 12, /* cost of moving SSE register */
858 {12, 12, 12}, /* cost of loading SSE registers
859 in SImode, DImode and TImode */
860 {2, 2, 8}, /* cost of storing SSE registers
861 in SImode, DImode and TImode */
862 10, /* MMX or SSE register to integer */
863 8, /* size of l1 cache. */
864 256, /* size of l2 cache. */
865 64, /* size of prefetch block */
866 6, /* number of parallel prefetches */
867 /* NOTE(review): one initializer appears missing here (upstream has
   "Branch cost" at this position).  */
868 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
869 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
870 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
871 COSTS_N_INSNS (2), /* cost of FABS instruction. */
872 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
873 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
874 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
875 DUMMY_STRINGOP_ALGS},
876 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
877 {-1, libcall}}},
878 DUMMY_STRINGOP_ALGS},
879 1, /* scalar_stmt_cost. */
880 1, /* scalar load_cost. */
881 1, /* scalar_store_cost. */
882 1, /* vec_stmt_cost. */
883 1, /* vec_to_scalar_cost. */
884 1, /* scalar_to_vec_cost. */
885 1, /* vec_align_load_cost. */
886 2, /* vec_unalign_load_cost. */
887 1, /* vec_store_cost. */
888 3, /* cond_taken_branch_cost. */
889 1, /* cond_not_taken_branch_cost. */
890 };
893 struct processor_costs nocona_cost = {
894 COSTS_N_INSNS (1), /* cost of an add instruction */
895 COSTS_N_INSNS (1), /* cost of a lea instruction */
896 COSTS_N_INSNS (1), /* variable shift costs */
897 COSTS_N_INSNS (1), /* constant shift costs */
898 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
899 COSTS_N_INSNS (10), /* HI */
900 COSTS_N_INSNS (10), /* SI */
901 COSTS_N_INSNS (10), /* DI */
902 COSTS_N_INSNS (10)}, /* other */
903 0, /* cost of multiply per each bit set */
904 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
905 COSTS_N_INSNS (66), /* HI */
906 COSTS_N_INSNS (66), /* SI */
907 COSTS_N_INSNS (66), /* DI */
908 COSTS_N_INSNS (66)}, /* other */
909 COSTS_N_INSNS (1), /* cost of movsx */
910 COSTS_N_INSNS (1), /* cost of movzx */
911 16, /* "large" insn */
913 4, /* cost for loading QImode using movzbl */
914 {4, 4, 4}, /* cost of loading integer registers
915 in QImode, HImode and SImode.
916 Relative to reg-reg move (2). */
917 {4, 4, 4}, /* cost of storing integer registers */
918 3, /* cost of reg,reg fld/fst */
919 {12, 12, 12}, /* cost of loading fp registers
920 in SFmode, DFmode and XFmode */
921 {4, 4, 4}, /* cost of storing fp registers
922 in SFmode, DFmode and XFmode */
923 6, /* cost of moving MMX register */
924 {12, 12}, /* cost of loading MMX registers
925 in SImode and DImode */
926 {12, 12}, /* cost of storing MMX registers
927 in SImode and DImode */
928 6, /* cost of moving SSE register */
929 {12, 12, 12}, /* cost of loading SSE registers
930 in SImode, DImode and TImode */
931 {12, 12, 12}, /* cost of storing SSE registers
932 in SImode, DImode and TImode */
933 8, /* MMX or SSE register to integer */
934 8, /* size of l1 cache. */
935 1024, /* size of l2 cache. */
936 128, /* size of prefetch block */
937 8, /* number of parallel prefetches */
939 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
940 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
941 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
942 COSTS_N_INSNS (3), /* cost of FABS instruction. */
943 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
944 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
945 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
946 {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
947 {100000, unrolled_loop}, {-1, libcall}}}},
948 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
950 {libcall, {{24, loop}, {64, unrolled_loop},
951 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
952 1, /* scalar_stmt_cost. */
953 1, /* scalar_load_cost. */
954 1, /* scalar_store_cost. */
955 1, /* vec_stmt_cost. */
956 1, /* vec_to_scalar_cost. */
957 1, /* scalar_to_vec_cost. */
958 1, /* vec_align_load_cost. */
959 2, /* vec_unalign_load_cost. */
960 1, /* vec_store_cost. */
961 3, /* cond_taken_branch_cost. */
962 1, /* cond_not_taken_branch_cost. */
966 struct processor_costs core2_cost = {
967 COSTS_N_INSNS (1), /* cost of an add instruction */
968 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
969 COSTS_N_INSNS (1), /* variable shift costs */
970 COSTS_N_INSNS (1), /* constant shift costs */
971 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
972 COSTS_N_INSNS (3), /* HI */
973 COSTS_N_INSNS (3), /* SI */
974 COSTS_N_INSNS (3), /* DI */
975 COSTS_N_INSNS (3)}, /* other */
976 0, /* cost of multiply per each bit set */
977 {COSTS_N_INSNS (22), /* cost of a divide/mod for QI */
978 COSTS_N_INSNS (22), /* HI */
979 COSTS_N_INSNS (22), /* SI */
980 COSTS_N_INSNS (22), /* DI */
981 COSTS_N_INSNS (22)}, /* other */
982 COSTS_N_INSNS (1), /* cost of movsx */
983 COSTS_N_INSNS (1), /* cost of movzx */
984 8, /* "large" insn */
986 2, /* cost for loading QImode using movzbl */
987 {6, 6, 6}, /* cost of loading integer registers
988 in QImode, HImode and SImode.
989 Relative to reg-reg move (2). */
990 {4, 4, 4}, /* cost of storing integer registers */
991 2, /* cost of reg,reg fld/fst */
992 {6, 6, 6}, /* cost of loading fp registers
993 in SFmode, DFmode and XFmode */
994 {4, 4, 4}, /* cost of storing fp registers
in SFmode, DFmode and XFmode */
995 2, /* cost of moving MMX register */
996 {6, 6}, /* cost of loading MMX registers
997 in SImode and DImode */
998 {4, 4}, /* cost of storing MMX registers
999 in SImode and DImode */
1000 2, /* cost of moving SSE register */
1001 {6, 6, 6}, /* cost of loading SSE registers
1002 in SImode, DImode and TImode */
1003 {4, 4, 4}, /* cost of storing SSE registers
1004 in SImode, DImode and TImode */
1005 2, /* MMX or SSE register to integer */
1006 32, /* size of l1 cache. */
1007 2048, /* size of l2 cache. */
1008 128, /* size of prefetch block */
1009 8, /* number of parallel prefetches */
1010 3, /* Branch cost */
1011 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
1012 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
1013 COSTS_N_INSNS (32), /* cost of FDIV instruction. */
1014 COSTS_N_INSNS (1), /* cost of FABS instruction. */
1015 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
1016 COSTS_N_INSNS (58), /* cost of FSQRT instruction. */
1017 {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
1018 {libcall, {{32, loop}, {64, rep_prefix_4_byte},
1019 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1020 {{libcall, {{8, loop}, {15, unrolled_loop},
1021 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1022 {libcall, {{24, loop}, {32, unrolled_loop},
1023 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1024 1, /* scalar_stmt_cost. */
1025 1, /* scalar_load_cost. */
1026 1, /* scalar_store_cost. */
1027 1, /* vec_stmt_cost. */
1028 1, /* vec_to_scalar_cost. */
1029 1, /* scalar_to_vec_cost. */
1030 1, /* vec_align_load_cost. */
1031 2, /* vec_unalign_load_cost. */
1032 1, /* vec_store_cost. */
1033 3, /* cond_taken_branch_cost. */
1034 1, /* cond_not_taken_branch_cost. */
1037 /* Generic64 should produce code tuned for Nocona and K8. */
1039 struct processor_costs generic64_cost = {
1040 COSTS_N_INSNS (1), /* cost of an add instruction */
1041 /* On all chips taken into consideration lea is 2 cycles and more. With
1042 this cost however our current implementation of synth_mult results in
1043 use of unnecessary temporary registers causing regression on several
1044 SPECfp benchmarks. */
1045 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1046 COSTS_N_INSNS (1), /* variable shift costs */
1047 COSTS_N_INSNS (1), /* constant shift costs */
1048 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1049 COSTS_N_INSNS (4), /* HI */
1050 COSTS_N_INSNS (3), /* SI */
1051 COSTS_N_INSNS (4), /* DI */
1052 COSTS_N_INSNS (2)}, /* other */
1053 0, /* cost of multiply per each bit set */
1054 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1055 COSTS_N_INSNS (26), /* HI */
1056 COSTS_N_INSNS (42), /* SI */
1057 COSTS_N_INSNS (74), /* DI */
1058 COSTS_N_INSNS (74)}, /* other */
1059 COSTS_N_INSNS (1), /* cost of movsx */
1060 COSTS_N_INSNS (1), /* cost of movzx */
1061 8, /* "large" insn */
1062 17, /* MOVE_RATIO */
1063 4, /* cost for loading QImode using movzbl */
1064 {4, 4, 4}, /* cost of loading integer registers
1065 in QImode, HImode and SImode.
1066 Relative to reg-reg move (2). */
1067 {4, 4, 4}, /* cost of storing integer registers */
1068 4, /* cost of reg,reg fld/fst */
1069 {12, 12, 12}, /* cost of loading fp registers
1070 in SFmode, DFmode and XFmode */
1071 {6, 6, 8}, /* cost of storing fp registers
1072 in SFmode, DFmode and XFmode */
1073 2, /* cost of moving MMX register */
1074 {8, 8}, /* cost of loading MMX registers
1075 in SImode and DImode */
1076 {8, 8}, /* cost of storing MMX registers
1077 in SImode and DImode */
1078 2, /* cost of moving SSE register */
1079 {8, 8, 8}, /* cost of loading SSE registers
1080 in SImode, DImode and TImode */
1081 {8, 8, 8}, /* cost of storing SSE registers
1082 in SImode, DImode and TImode */
1083 5, /* MMX or SSE register to integer */
1084 32, /* size of l1 cache. */
1085 512, /* size of l2 cache. */
1086 64, /* size of prefetch block */
1087 6, /* number of parallel prefetches */
1088 /* Benchmarks shows large regressions on K8 sixtrack benchmark when this value
1089 is increased to perhaps more appropriate value of 5. */
1090 3, /* Branch cost */
1091 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1092 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1093 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1094 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1095 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1096 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1097 {DUMMY_STRINGOP_ALGS,
1098 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1099 {DUMMY_STRINGOP_ALGS,
1100 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1101 1, /* scalar_stmt_cost. */
1102 1, /* scalar_load_cost. */
1103 1, /* scalar_store_cost. */
1104 1, /* vec_stmt_cost. */
1105 1, /* vec_to_scalar_cost. */
1106 1, /* scalar_to_vec_cost. */
1107 1, /* vec_align_load_cost. */
1108 2, /* vec_unalign_load_cost. */
1109 1, /* vec_store_cost. */
1110 3, /* cond_taken_branch_cost. */
1111 1, /* cond_not_taken_branch_cost. */
1114 /* Generic32 should produce code tuned for Athlon, PPro, Pentium4, Nocona and K8. */
1116 struct processor_costs generic32_cost = {
1117 COSTS_N_INSNS (1), /* cost of an add instruction */
1118 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1119 COSTS_N_INSNS (1), /* variable shift costs */
1120 COSTS_N_INSNS (1), /* constant shift costs */
1121 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1122 COSTS_N_INSNS (4), /* HI */
1123 COSTS_N_INSNS (3), /* SI */
1124 COSTS_N_INSNS (4), /* DI */
1125 COSTS_N_INSNS (2)}, /* other */
1126 0, /* cost of multiply per each bit set */
1127 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1128 COSTS_N_INSNS (26), /* HI */
1129 COSTS_N_INSNS (42), /* SI */
1130 COSTS_N_INSNS (74), /* DI */
1131 COSTS_N_INSNS (74)}, /* other */
1132 COSTS_N_INSNS (1), /* cost of movsx */
1133 COSTS_N_INSNS (1), /* cost of movzx */
1134 8, /* "large" insn */
1135 17, /* MOVE_RATIO */
1136 4, /* cost for loading QImode using movzbl */
1137 {4, 4, 4}, /* cost of loading integer registers
1138 in QImode, HImode and SImode.
1139 Relative to reg-reg move (2). */
1140 {4, 4, 4}, /* cost of storing integer registers */
1141 4, /* cost of reg,reg fld/fst */
1142 {12, 12, 12}, /* cost of loading fp registers
1143 in SFmode, DFmode and XFmode */
1144 {6, 6, 8}, /* cost of storing fp registers
1145 in SFmode, DFmode and XFmode */
1146 2, /* cost of moving MMX register */
1147 {8, 8}, /* cost of loading MMX registers
1148 in SImode and DImode */
1149 {8, 8}, /* cost of storing MMX registers
1150 in SImode and DImode */
1151 2, /* cost of moving SSE register */
1152 {8, 8, 8}, /* cost of loading SSE registers
1153 in SImode, DImode and TImode */
1154 {8, 8, 8}, /* cost of storing SSE registers
1155 in SImode, DImode and TImode */
1156 5, /* MMX or SSE register to integer */
1157 32, /* size of l1 cache. */
1158 256, /* size of l2 cache. */
1159 64, /* size of prefetch block */
1160 6, /* number of parallel prefetches */
1161 3, /* Branch cost */
1162 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1163 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1164 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1165 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1166 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1167 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1168 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1169 DUMMY_STRINGOP_ALGS},
1170 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1171 DUMMY_STRINGOP_ALGS},
1172 1, /* scalar_stmt_cost. */
1173 1, /* scalar_load_cost. */
1174 1, /* scalar_store_cost. */
1175 1, /* vec_stmt_cost. */
1176 1, /* vec_to_scalar_cost. */
1177 1, /* scalar_to_vec_cost. */
1178 1, /* vec_align_load_cost. */
1179 2, /* vec_unalign_load_cost. */
1180 1, /* vec_store_cost. */
1181 3, /* cond_taken_branch_cost. */
1182 1, /* cond_not_taken_branch_cost. */
1185 const struct processor_costs *ix86_cost = &pentium_cost; /* Active cost table; defaults to &pentium_cost. NOTE(review): presumably repointed during -mtune option processing — confirm at the assignment site. */
1187 /* Processor feature/optimization bitmasks. */
/* One bit per processor; OR-ed together below and in the tuning tables. */
1188 #define m_386 (1<<PROCESSOR_I386)
1189 #define m_486 (1<<PROCESSOR_I486)
1190 #define m_PENT (1<<PROCESSOR_PENTIUM)
1191 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1192 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1193 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1194 #define m_CORE2 (1<<PROCESSOR_CORE2)
1196 #define m_GEODE (1<<PROCESSOR_GEODE)
1197 #define m_K6 (1<<PROCESSOR_K6)
/* Combined masks covering related CPU families. */
1198 #define m_K6_GEODE (m_K6 | m_GEODE)
1199 #define m_K8 (1<<PROCESSOR_K8)
1200 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1201 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1202 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1203 #define m_AMD_MULTIPLE (m_K8 | m_ATHLON | m_AMDFAM10)
1205 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1206 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1208 /* Generic instruction choice should be common subset of supported CPUs
1209 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
1210 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
1212 /* Feature tests against the various tunings. */
1213 unsigned int ix86_tune_features[X86_TUNE_LAST] = {
1214 /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
1215 negatively, so enabling for Generic64 seems like good code size
1216 tradeoff. We can't enable it for 32bit generic because it does not
1217 work well with PPro base chips. */
1218 m_386 | m_K6_GEODE | m_AMD_MULTIPLE | m_CORE2 | m_GENERIC64,
1220 /* X86_TUNE_PUSH_MEMORY */
1221 m_386 | m_K6_GEODE | m_AMD_MULTIPLE | m_PENT4
1222 | m_NOCONA | m_CORE2 | m_GENERIC,
1224 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1227 /* X86_TUNE_USE_BIT_TEST */
1230 /* X86_TUNE_UNROLL_STRLEN */
1231 m_486 | m_PENT | m_PPRO | m_AMD_MULTIPLE | m_K6 | m_CORE2 | m_GENERIC,
1233 /* X86_TUNE_DEEP_BRANCH_PREDICTION */
1234 m_PPRO | m_K6_GEODE | m_AMD_MULTIPLE | m_PENT4 | m_GENERIC,
1236 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
1237 on simulation result. But after P4 was made, no performance benefit
1238 was observed with branch hints. It also increases the code size.
1239 As a result, icc never generates branch hints. */
1242 /* X86_TUNE_DOUBLE_WITH_ADD */
1245 /* X86_TUNE_USE_SAHF */
1246 m_PPRO | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_PENT4
1247 | m_NOCONA | m_CORE2 | m_GENERIC,
1249 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
1250 partial dependencies. */
1251 m_AMD_MULTIPLE | m_PPRO | m_PENT4 | m_NOCONA
1252 | m_CORE2 | m_GENERIC | m_GEODE /* m_386 | m_K6 */,
1254 /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
1255 register stalls on Generic32 compilation setting as well. However
1256 in current implementation the partial register stalls are not eliminated
1257 very well - they can be introduced via subregs synthesized by combine
1258 and can happen in caller/callee saving sequences. Because this option
1259 pays back little on PPro based chips and is in conflict with partial reg
1260 dependencies used by Athlon/P4 based chips, it is better to leave it off
1261 for generic32 for now. */
1264 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1265 m_CORE2 | m_GENERIC,
1267 /* X86_TUNE_USE_HIMODE_FIOP */
1268 m_386 | m_486 | m_K6_GEODE,
1270 /* X86_TUNE_USE_SIMODE_FIOP */
1271 ~(m_PPRO | m_AMD_MULTIPLE | m_PENT | m_CORE2 | m_GENERIC),
1273 /* X86_TUNE_USE_MOV0 */
1276 /* X86_TUNE_USE_CLTD */
1277 ~(m_PENT | m_K6 | m_CORE2 | m_GENERIC),
1279 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
1282 /* X86_TUNE_SPLIT_LONG_MOVES */
1285 /* X86_TUNE_READ_MODIFY_WRITE */
1288 /* X86_TUNE_READ_MODIFY */
1291 /* X86_TUNE_PROMOTE_QIMODE */
1292 m_K6_GEODE | m_PENT | m_386 | m_486 | m_AMD_MULTIPLE | m_CORE2
1293 | m_GENERIC /* | m_PENT4 ? */,
1295 /* X86_TUNE_FAST_PREFIX */
1296 ~(m_PENT | m_486 | m_386),
1298 /* X86_TUNE_SINGLE_STRINGOP */
1299 m_386 | m_PENT4 | m_NOCONA,
1301 /* X86_TUNE_QIMODE_MATH */
1304 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
1305 register stalls. Just like X86_TUNE_PARTIAL_REG_STALL this option
1306 might be considered for Generic32 if our scheme for avoiding partial
1307 stalls was more effective. */
1310 /* X86_TUNE_PROMOTE_QI_REGS */
1313 /* X86_TUNE_PROMOTE_HI_REGS */
1316 /* X86_TUNE_ADD_ESP_4: Enable if add/sub is preferred over 1/2 push/pop. */
1317 m_AMD_MULTIPLE | m_K6_GEODE | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
1319 /* X86_TUNE_ADD_ESP_8 */
1320 m_AMD_MULTIPLE | m_PPRO | m_K6_GEODE | m_386
1321 | m_486 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
1323 /* X86_TUNE_SUB_ESP_4 */
1324 m_AMD_MULTIPLE | m_PPRO | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
1326 /* X86_TUNE_SUB_ESP_8 */
1327 m_AMD_MULTIPLE | m_PPRO | m_386 | m_486
1328 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
1330 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
1331 for DFmode copies */
1332 ~(m_AMD_MULTIPLE | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2
1333 | m_GENERIC | m_GEODE),
1335 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
1336 m_AMD_MULTIPLE | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
1338 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
1339 conflict here in between PPro/Pentium4 based chips that thread 128bit
1340 SSE registers as single units versus K8 based chips that divide SSE
1341 registers to two 64bit halves. This knob promotes all store destinations
1342 to be 128bit to allow register renaming on 128bit SSE units, but usually
1343 results in one extra microop on 64bit SSE units. Experimental results
1344 shows that disabling this option on P4 brings over 20% SPECfp regression,
1345 while enabling it on K8 brings roughly 2.4% regression that can be partly
1346 masked by careful scheduling of moves. */
1347 m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC | m_AMDFAM10,
1349 /* X86_TUNE_SSE_UNALIGNED_MOVE_OPTIMAL */
1352 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
1353 are resolved on SSE register parts instead of whole registers, so we may
1354 maintain just lower part of scalar values in proper format leaving the
1355 upper part undefined. */
1358 /* X86_TUNE_SSE_TYPELESS_STORES */
1361 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
1362 m_PPRO | m_PENT4 | m_NOCONA,
1364 /* X86_TUNE_MEMORY_MISMATCH_STALL */
1365 m_AMD_MULTIPLE | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
1367 /* X86_TUNE_PROLOGUE_USING_MOVE */
1368 m_ATHLON_K8 | m_PPRO | m_CORE2 | m_GENERIC,
1370 /* X86_TUNE_EPILOGUE_USING_MOVE */
1371 m_ATHLON_K8 | m_PPRO | m_CORE2 | m_GENERIC,
1373 /* X86_TUNE_SHIFT1 */
1376 /* X86_TUNE_USE_FFREEP */
1379 /* X86_TUNE_INTER_UNIT_MOVES */
1380 ~(m_AMD_MULTIPLE | m_GENERIC),
1382 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
1385 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
1386 than 4 branch instructions in the 16 byte window. */
1387 m_PPRO | m_AMD_MULTIPLE | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
1389 /* X86_TUNE_SCHEDULE */
1390 m_PPRO | m_AMD_MULTIPLE | m_K6_GEODE | m_PENT | m_CORE2 | m_GENERIC,
1392 /* X86_TUNE_USE_BT */
1395 /* X86_TUNE_USE_INCDEC */
1396 ~(m_PENT4 | m_NOCONA | m_GENERIC),
1398 /* X86_TUNE_PAD_RETURNS */
1399 m_AMD_MULTIPLE | m_CORE2 | m_GENERIC,
1401 /* X86_TUNE_EXT_80387_CONSTANTS */
1402 m_K6_GEODE | m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC,
1404 /* X86_TUNE_SHORTEN_X87_SSE */
1407 /* X86_TUNE_AVOID_VECTOR_DECODE */
1410 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode
1411 and SImode multiply, but 386 and 486 do HImode multiply faster. */
1414 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
1415 vector path on AMD machines. */
1416 m_K8 | m_GENERIC64 | m_AMDFAM10,
1418 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
1420 m_K8 | m_GENERIC64 | m_AMDFAM10,
1422 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
1426 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
1427 but one byte longer. */
1430 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory
1431 operand that cannot be represented using a modRM byte. The XOR
1432 replacement is long decoded, so this split helps here as well. */
1435 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
1436 from integer to FP. */
1440 /* Feature tests against the various architecture variations. */
1441 unsigned int ix86_arch_features[X86_ARCH_LAST] = {
1442 /* X86_ARCH_CMOVE: Conditional move was added for pentiumpro. */
1443 ~(m_386 | m_486 | m_PENT | m_K6),
1445 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
1448 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
1451 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
1454 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
/* Tunings on which outgoing call arguments are accumulated in a
   pre-allocated stack area instead of being pushed individually.
   NOTE(review): semantics inferred from the name; confirm at the use site. */
1458 static const unsigned int x86_accumulate_outgoing_args
1459 = m_AMD_MULTIPLE | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC;
/* Architectures assumed to have a fully functional 80387 unit for math.
   NOTE(review): semantics inferred from the name; confirm at the use site. */
1461 static const unsigned int x86_arch_always_fancy_math_387
1462 = m_PENT | m_PPRO | m_AMD_MULTIPLE | m_PENT4
1463 | m_NOCONA | m_CORE2 | m_GENERIC;
1465 static enum stringop_alg stringop_alg = no_stringop;
1467 /* In case the average insn count for single function invocation is
1468 lower than this constant, emit fast (but longer) prologue and
1470 #define FAST_PROLOGUE_INSN_COUNT 20
1472 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
1473 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
1474 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
1475 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
1477 /* Array of the smallest class containing reg number REGNO, indexed by
1478 REGNO. Used by REGNO_REG_CLASS in i386.h. */
1480 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
1482 /* ax, dx, cx, bx */
1483 AREG, DREG, CREG, BREG,
1484 /* si, di, bp, sp */
1485 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
1487 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
1488 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
1491 /* flags, fpsr, fpcr, frame */
1492 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
1494 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
1497 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
1500 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
1501 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
1502 /* SSE REX registers */
1503 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
1507 /* The "default" register map used in 32bit mode. */
1509 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
1511 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
1512 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
1513 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
1514 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
1515 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
1516 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
1517 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
1520 static int const x86_64_int_parameter_registers[6] =
1522 5 /*RDI*/, 4 /*RSI*/, 1 /*RDX*/, 2 /*RCX*/,
1523 FIRST_REX_INT_REG /*R8 */, FIRST_REX_INT_REG + 1 /*R9 */
1526 static int const x86_64_ms_abi_int_parameter_registers[4] =
1528 2 /*RCX*/, 1 /*RDX*/,
1529 FIRST_REX_INT_REG /*R8 */, FIRST_REX_INT_REG + 1 /*R9 */
1532 static int const x86_64_int_return_registers[4] =
1534 0 /*RAX*/, 1 /*RDX*/, 5 /*RDI*/, 4 /*RSI*/
1537 /* The "default" register map used in 64bit mode. */
1538 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
1540 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
1541 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
1542 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
1543 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
1544 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
1545 8,9,10,11,12,13,14,15, /* extended integer registers */
1546 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
1549 /* Define the register numbers to be used in Dwarf debugging information.
1550 The SVR4 reference port C compiler uses the following register numbers
1551 in its Dwarf output code:
1552 0 for %eax (gcc regno = 0)
1553 1 for %ecx (gcc regno = 2)
1554 2 for %edx (gcc regno = 1)
1555 3 for %ebx (gcc regno = 3)
1556 4 for %esp (gcc regno = 7)
1557 5 for %ebp (gcc regno = 6)
1558 6 for %esi (gcc regno = 4)
1559 7 for %edi (gcc regno = 5)
1560 The following three DWARF register numbers are never generated by
1561 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
1562 believes these numbers have these meanings.
1563 8 for %eip (no gcc equivalent)
1564 9 for %eflags (gcc regno = 17)
1565 10 for %trapno (no gcc equivalent)
1566 It is not at all clear how we should number the FP stack registers
1567 for the x86 architecture. If the version of SDB on x86/svr4 were
1568 a bit less brain dead with respect to floating-point then we would
1569 have a precedent to follow with respect to DWARF register numbers
1570 for x86 FP registers, but the SDB on x86/svr4 is so completely
1571 broken with respect to FP registers that it is hardly worth thinking
1572 of it as something to strive for compatibility with.
1573 The version of x86/svr4 SDB I have at the moment does (partially)
1574 seem to believe that DWARF register number 11 is associated with
1575 the x86 register %st(0), but that's about all. Higher DWARF
1576 register numbers don't seem to be associated with anything in
1577 particular, and even for DWARF regno 11, SDB only seems to under-
1578 stand that it should say that a variable lives in %st(0) (when
1579 asked via an `=' command) if we said it was in DWARF regno 11,
1580 but SDB still prints garbage when asked for the value of the
1581 variable in question (via a `/' command).
1582 (Also note that the labels SDB prints for various FP stack regs
1583 when doing an `x' command are all wrong.)
1584 Note that these problems generally don't affect the native SVR4
1585 C compiler because it doesn't allow the use of -O with -g and
1586 because when it is *not* optimizing, it allocates a memory
1587 location for each floating-point variable, and the memory
1588 location is what gets described in the DWARF AT_location
1589 attribute for the variable in question.
1590 Regardless of the severe mental illness of the x86/svr4 SDB, we
1591 do something sensible here and we use the following DWARF
1592 register numbers. Note that these are all stack-top-relative
1594 11 for %st(0) (gcc regno = 8)
1595 12 for %st(1) (gcc regno = 9)
1596 13 for %st(2) (gcc regno = 10)
1597 14 for %st(3) (gcc regno = 11)
1598 15 for %st(4) (gcc regno = 12)
1599 16 for %st(5) (gcc regno = 13)
1600 17 for %st(6) (gcc regno = 14)
1601 18 for %st(7) (gcc regno = 15)
1603 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
1605 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
1606 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
1607 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
1608 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
1609 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
1610 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
1611 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
1614 /* Test and compare insns in i386.md store the information needed to
1615 generate branch and scc insns here. */
1617 rtx ix86_compare_op0 = NULL_RTX;
1618 rtx ix86_compare_op1 = NULL_RTX;
1619 rtx ix86_compare_emitted = NULL_RTX;
1621 /* Size of the register save area. */
1622 #define X86_64_VARARGS_SIZE (X86_64_REGPARM_MAX * UNITS_PER_WORD + X86_64_SSE_REGPARM_MAX * 16)
1624 /* Define the structure for the machine field in struct function. */
1626 struct stack_local_entry GTY(())
1628 unsigned short mode;
1631 struct stack_local_entry *next;
1634 /* Structure describing stack frame layout.
1635 Stack grows downward:
1641 saved frame pointer if frame_pointer_needed
1642 <- HARD_FRAME_POINTER
1647 [va_arg registers] (
1648 > to_allocate <- FRAME_POINTER
1658 HOST_WIDE_INT frame;
1660 int outgoing_arguments_size;
1663 HOST_WIDE_INT to_allocate;
1664 /* The offsets relative to ARG_POINTER. */
1665 HOST_WIDE_INT frame_pointer_offset;
1666 HOST_WIDE_INT hard_frame_pointer_offset;
1667 HOST_WIDE_INT stack_pointer_offset;
1669 /* When save_regs_using_mov is set, emit prologue using
1670 move instead of push instructions. */
1671 bool save_regs_using_mov;
1674 /* Code model option. */
1675 enum cmodel ix86_cmodel;
1677 enum asm_dialect ix86_asm_dialect = ASM_ATT;
1679 enum tls_dialect ix86_tls_dialect = TLS_DIALECT_GNU;
1681 /* Which unit we are generating floating point math for. */
1682 enum fpmath_unit ix86_fpmath;
1684 /* Which cpu are we scheduling for. */
1685 enum processor_type ix86_tune;
1687 /* Which instruction set architecture to use. */
1688 enum processor_type ix86_arch;
1690 /* true if sse prefetch instruction is not NOOP. */
1691 int x86_prefetch_sse;
1693 /* ix86_regparm_string as a number */
1694 static int ix86_regparm;
1696 /* -mstackrealign option */
1697 extern int ix86_force_align_arg_pointer;
1698 static const char ix86_force_align_arg_pointer_string[] = "force_align_arg_pointer";
1700 static rtx (*ix86_gen_leave) (void);
1701 static rtx (*ix86_gen_pop1) (rtx);
1702 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
1703 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
1704 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx);
1705 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
1706 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
1708 /* Preferred alignment for stack boundary in bits. */
1709 unsigned int ix86_preferred_stack_boundary;
1711 /* Values 1-5: see jump.c */
1712 int ix86_branch_cost;
1714 /* Variables which are this size or smaller are put in the data/bss
1715 or ldata/lbss sections. */
1717 int ix86_section_threshold = 65536;
1719 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
1720 char internal_label_prefix[16];
1721 int internal_label_prefix_len;
1723 /* Fence to use after loop using movnt. */
1726 /* Register class used for passing given 64bit part of the argument.
1727 These represent classes as documented by the PS ABI, with the exception
1728 of SSESF, SSEDF classes, that are basically SSE class, just gcc will
1729 use SF or DFmode move instead of DImode to avoid reformatting penalties.
1731 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
1732 whenever possible (upper half does contain padding). */
/* NOTE(review): several enumerators of this enum are missing from this
   excerpt (the names array below implies eleven classes in total).  */
1733 enum x86_64_reg_class
1736 X86_64_INTEGER_CLASS,
1737 X86_64_INTEGERSI_CLASS,
1744 X86_64_COMPLEX_X87_CLASS,
/* Human-readable names matching enum x86_64_reg_class, in order.  */
1747 static const char * const x86_64_reg_class_name[] =
1749 "no", "integer", "integerSI", "sse", "sseSF", "sseDF",
1750 "sseup", "x87", "x87up", "cplx87", "no"
/* NOTE(review): presumably the maximum number of 64-bit chunks (and hence
   classes) one argument can occupy -- confirm against classify_argument.  */
1753 #define MAX_CLASSES 4
1755 /* Table of constants used by fldpi, fldln2, etc.... */
1756 static REAL_VALUE_TYPE ext_80387_constants_table [5];
/* Lazily-initialized guard for the table above.  */
1757 static bool ext_80387_constants_init = 0;
/* Forward declarations for helpers defined later in this file.  */
1760 static struct machine_function * ix86_init_machine_status (void);
1761 static rtx ix86_function_value (const_tree, const_tree, bool);
1762 static int ix86_function_regparm (const_tree, const_tree);
1763 static void ix86_compute_frame_layout (struct ix86_frame *);
1764 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
1768 /* The svr4 ABI for the i386 says that records and unions are returned
1770 #ifndef DEFAULT_PCC_STRUCT_RETURN
1771 #define DEFAULT_PCC_STRUCT_RETURN 1
1774 /* Bit flags that specify the ISA we are compiling for. */
1775 int ix86_isa_flags = TARGET_64BIT_DEFAULT | TARGET_SUBTARGET_ISA_DEFAULT;
1777 /* A mask of ix86_isa_flags that includes bit X if X
1778 was set or cleared on the command line. */
1779 static int ix86_isa_flags_explicit;
1781 /* Define a set of ISAs which are available when a given ISA is
1782 enabled. MMX and SSE ISAs are handled separately. */
/* Each *_SET mask is the ISA's own bit OR'ed with the *_SET mask of the
   ISA it builds on, so enabling e.g. SSSE3 transitively enables
   SSE3, SSE2 and SSE.  */
1784 #define OPTION_MASK_ISA_MMX_SET OPTION_MASK_ISA_MMX
1785 #define OPTION_MASK_ISA_3DNOW_SET \
1786 (OPTION_MASK_ISA_3DNOW | OPTION_MASK_ISA_MMX_SET)
1788 #define OPTION_MASK_ISA_SSE_SET OPTION_MASK_ISA_SSE
1789 #define OPTION_MASK_ISA_SSE2_SET \
1790 (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE_SET)
1791 #define OPTION_MASK_ISA_SSE3_SET \
1792 (OPTION_MASK_ISA_SSE3 | OPTION_MASK_ISA_SSE2_SET)
1793 #define OPTION_MASK_ISA_SSSE3_SET \
1794 (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_SSE3_SET)
1795 #define OPTION_MASK_ISA_SSE4_1_SET \
1796 (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_SSSE3_SET)
1797 #define OPTION_MASK_ISA_SSE4_2_SET \
1798 (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_SSE4_1_SET)
1800 /* SSE4 includes both SSE4.1 and SSE4.2. -msse4 should be the same
1802 #define OPTION_MASK_ISA_SSE4_SET OPTION_MASK_ISA_SSE4_2_SET
1804 #define OPTION_MASK_ISA_SSE4A_SET \
1805 (OPTION_MASK_ISA_SSE4A | OPTION_MASK_ISA_SSE3_SET)
1806 #define OPTION_MASK_ISA_SSE5_SET \
1807 (OPTION_MASK_ISA_SSE5 | OPTION_MASK_ISA_SSE4A_SET)
1809 /* Define a set of ISAs which aren't available when a given ISA is
1810 disabled. MMX and SSE ISAs are handled separately. */
/* Each *_UNSET mask cascades the other way: the ISA's own bit OR'ed with
   the *_UNSET masks of everything that depends on it, so disabling
   e.g. SSE2 also disables SSE3, SSSE3, SSE4.x, SSE4A and SSE5.  */
1812 #define OPTION_MASK_ISA_MMX_UNSET \
1813 (OPTION_MASK_ISA_MMX | OPTION_MASK_ISA_3DNOW_UNSET)
1814 #define OPTION_MASK_ISA_3DNOW_UNSET \
1815 (OPTION_MASK_ISA_3DNOW | OPTION_MASK_ISA_3DNOW_A_UNSET)
1816 #define OPTION_MASK_ISA_3DNOW_A_UNSET OPTION_MASK_ISA_3DNOW_A
1818 #define OPTION_MASK_ISA_SSE_UNSET \
1819 (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_SSE2_UNSET)
1820 #define OPTION_MASK_ISA_SSE2_UNSET \
1821 (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE3_UNSET)
1822 #define OPTION_MASK_ISA_SSE3_UNSET \
1823 (OPTION_MASK_ISA_SSE3 \
1824 | OPTION_MASK_ISA_SSSE3_UNSET \
1825 | OPTION_MASK_ISA_SSE4A_UNSET )
1826 #define OPTION_MASK_ISA_SSSE3_UNSET \
1827 (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_SSE4_1_UNSET)
1828 #define OPTION_MASK_ISA_SSE4_1_UNSET \
1829 (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_SSE4_2_UNSET)
1830 #define OPTION_MASK_ISA_SSE4_2_UNSET OPTION_MASK_ISA_SSE4_2
1832 /* SSE4 includes both SSE4.1 and SSE4.2. -mno-sse4 should the same
1834 #define OPTION_MASK_ISA_SSE4_UNSET OPTION_MASK_ISA_SSE4_1_UNSET
1836 #define OPTION_MASK_ISA_SSE4A_UNSET \
1837 (OPTION_MASK_ISA_SSE4A | OPTION_MASK_ISA_SSE5_UNSET)
1839 #define OPTION_MASK_ISA_SSE5_UNSET OPTION_MASK_ISA_SSE5
1841 /* Vectorization library interface and handlers. */
/* Hook installed by override_options when -mveclibabi=svml/acml is given;
   NULL means no external vectorized-math library is used.  */
1842 tree (*ix86_veclib_handler)(enum built_in_function, tree, tree) = NULL;
1843 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
1844 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
1846 /* Implement TARGET_HANDLE_OPTION. */
/* Pattern for every ISA option pair below: enabling -mFOO ORs the
   OPTION_MASK_ISA_FOO_SET mask (the ISA plus its prerequisites) into
   ix86_isa_flags, and records the same bits in ix86_isa_flags_explicit
   so later defaulting code will not override the user's choice;
   -mno-FOO ANDs out the *_UNSET mask (the ISA plus everything that
   depends on it) and likewise records the bits as explicit.
   NOTE(review): the enclosing switch statement and its case labels are
   missing from this excerpt; each pair of statements below presumably
   sits under the matching OPT_m* case.  */
1849 ix86_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, int value)
1856 ix86_isa_flags |= OPTION_MASK_ISA_MMX_SET;
1857 ix86_isa_flags_explicit |= OPTION_MASK_ISA_MMX_SET;
1861 ix86_isa_flags &= ~OPTION_MASK_ISA_MMX_UNSET;
1862 ix86_isa_flags_explicit |= OPTION_MASK_ISA_MMX_UNSET;
1869 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_SET;
1870 ix86_isa_flags_explicit |= OPTION_MASK_ISA_3DNOW_SET;
1874 ix86_isa_flags &= ~OPTION_MASK_ISA_3DNOW_UNSET;
1875 ix86_isa_flags_explicit |= OPTION_MASK_ISA_3DNOW_UNSET;
1885 ix86_isa_flags |= OPTION_MASK_ISA_SSE_SET;
1886 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE_SET;
1890 ix86_isa_flags &= ~OPTION_MASK_ISA_SSE_UNSET;
1891 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE_UNSET;
1898 ix86_isa_flags |= OPTION_MASK_ISA_SSE2_SET;
1899 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE2_SET;
1903 ix86_isa_flags &= ~OPTION_MASK_ISA_SSE2_UNSET;
1904 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE2_UNSET;
1911 ix86_isa_flags |= OPTION_MASK_ISA_SSE3_SET;
1912 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE3_SET;
1916 ix86_isa_flags &= ~OPTION_MASK_ISA_SSE3_UNSET;
1917 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE3_UNSET;
1924 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3_SET;
1925 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSSE3_SET;
1929 ix86_isa_flags &= ~OPTION_MASK_ISA_SSSE3_UNSET;
1930 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSSE3_UNSET;
1937 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1_SET;
1938 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_1_SET;
1942 ix86_isa_flags &= ~OPTION_MASK_ISA_SSE4_1_UNSET;
1943 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_1_UNSET;
1950 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2_SET;
1951 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_2_SET;
1955 ix86_isa_flags &= ~OPTION_MASK_ISA_SSE4_2_UNSET;
1956 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_2_UNSET;
1961 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_SET;
1962 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_SET;
1966 ix86_isa_flags &= ~OPTION_MASK_ISA_SSE4_UNSET;
1967 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_UNSET;
1973 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A_SET;
1974 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4A_SET;
1978 ix86_isa_flags &= ~OPTION_MASK_ISA_SSE4A_UNSET;
1979 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4A_UNSET;
1986 ix86_isa_flags |= OPTION_MASK_ISA_SSE5_SET;
1987 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE5_SET;
1991 ix86_isa_flags &= ~OPTION_MASK_ISA_SSE5_UNSET;
1992 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE5_UNSET;
2001 /* Sometimes certain combinations of command options do not make
2002 sense on a particular target machine. You can define a macro
2003 `OVERRIDE_OPTIONS' to take account of this. This macro, if
2004 defined, is executed once just after all the command options have
2007 Don't use this macro to turn on various extra optimizations for
2008 `-O'. That is what `OPTIMIZATION_OPTIONS' is for. */
/* NOTE(review): many interior lines of this function are missing from this
   excerpt (braces, else-arms and some conditions); the comments added below
   only describe what the visible statements establish.  */
2011 override_options (void)
2014 int ix86_tune_defaulted = 0;
2015 int ix86_arch_specified = 0;
2016 unsigned int ix86_arch_mask, ix86_tune_mask;
2018 /* Comes from final.c -- no real reason to change it. */
2019 #define MAX_CODE_ALIGN 16
/* Per-processor cost table and default code alignments; rows are indexed
   by processor (ix86_tune is used as the index further below).  */
2023 const struct processor_costs *cost; /* Processor costs */
2024 const int align_loop; /* Default alignments. */
2025 const int align_loop_max_skip;
2026 const int align_jump;
2027 const int align_jump_max_skip;
2028 const int align_func;
2030 const processor_target_table[PROCESSOR_max] =
2032 {&i386_cost, 4, 3, 4, 3, 4},
2033 {&i486_cost, 16, 15, 16, 15, 16},
2034 {&pentium_cost, 16, 7, 16, 7, 16},
2035 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2036 {&geode_cost, 0, 0, 0, 0, 0},
2037 {&k6_cost, 32, 7, 32, 7, 32},
2038 {&athlon_cost, 16, 7, 16, 7, 16},
2039 {&pentium4_cost, 0, 0, 0, 0, 0},
2040 {&k8_cost, 16, 7, 16, 7, 16},
2041 {&nocona_cost, 0, 0, 0, 0, 0},
2042 {&core2_cost, 16, 10, 16, 10, 16},
2043 {&generic32_cost, 16, 7, 16, 7, 16},
2044 {&generic64_cost, 16, 10, 16, 10, 16},
2045 {&amdfam10_cost, 32, 24, 32, 7, 32}
2048 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
/* ISA/feature flag bits used in the alias table below (only part of the
   enumeration is visible in this excerpt).  */
2079 PTA_PREFETCH_SSE = 1 << 4,
2081 PTA_3DNOW_A = 1 << 6,
2085 PTA_POPCNT = 1 << 10,
2087 PTA_SSE4A = 1 << 12,
2088 PTA_NO_SAHF = 1 << 13,
2089 PTA_SSE4_1 = 1 << 14,
2090 PTA_SSE4_2 = 1 << 15,
2093 PTA_PCLMUL = 1 << 18
2098 const char *const name; /* processor name or nickname. */
2099 const enum processor_type processor;
2100 const unsigned /*enum pta_flags*/ flags;
/* Map each -march=/-mtune= CPU name to a processor type plus the PTA_*
   feature bits that CPU implies.  */
2102 const processor_alias_table[] =
2104 {"i386", PROCESSOR_I386, 0},
2105 {"i486", PROCESSOR_I486, 0},
2106 {"i586", PROCESSOR_PENTIUM, 0},
2107 {"pentium", PROCESSOR_PENTIUM, 0},
2108 {"pentium-mmx", PROCESSOR_PENTIUM, PTA_MMX},
2109 {"winchip-c6", PROCESSOR_I486, PTA_MMX},
2110 {"winchip2", PROCESSOR_I486, PTA_MMX | PTA_3DNOW},
2111 {"c3", PROCESSOR_I486, PTA_MMX | PTA_3DNOW},
2112 {"c3-2", PROCESSOR_PENTIUMPRO, PTA_MMX | PTA_SSE},
2113 {"i686", PROCESSOR_PENTIUMPRO, 0},
2114 {"pentiumpro", PROCESSOR_PENTIUMPRO, 0},
2115 {"pentium2", PROCESSOR_PENTIUMPRO, PTA_MMX},
2116 {"pentium3", PROCESSOR_PENTIUMPRO, PTA_MMX | PTA_SSE},
2117 {"pentium3m", PROCESSOR_PENTIUMPRO, PTA_MMX | PTA_SSE},
2118 {"pentium-m", PROCESSOR_PENTIUMPRO, PTA_MMX | PTA_SSE | PTA_SSE2},
2119 {"pentium4", PROCESSOR_PENTIUM4, PTA_MMX |PTA_SSE | PTA_SSE2},
2120 {"pentium4m", PROCESSOR_PENTIUM4, PTA_MMX | PTA_SSE | PTA_SSE2},
2121 {"prescott", PROCESSOR_NOCONA, PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3},
2122 {"nocona", PROCESSOR_NOCONA, (PTA_64BIT
2123 | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2124 | PTA_CX16 | PTA_NO_SAHF)},
2125 {"core2", PROCESSOR_CORE2, (PTA_64BIT
2126 | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2129 {"geode", PROCESSOR_GEODE, (PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2130 |PTA_PREFETCH_SSE)},
2131 {"k6", PROCESSOR_K6, PTA_MMX},
2132 {"k6-2", PROCESSOR_K6, PTA_MMX | PTA_3DNOW},
2133 {"k6-3", PROCESSOR_K6, PTA_MMX | PTA_3DNOW},
2134 {"athlon", PROCESSOR_ATHLON, (PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2135 | PTA_PREFETCH_SSE)},
2136 {"athlon-tbird", PROCESSOR_ATHLON, (PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2137 | PTA_PREFETCH_SSE)},
2138 {"athlon-4", PROCESSOR_ATHLON, (PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2140 {"athlon-xp", PROCESSOR_ATHLON, (PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2142 {"athlon-mp", PROCESSOR_ATHLON, (PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2144 {"x86-64", PROCESSOR_K8, (PTA_64BIT
2145 | PTA_MMX | PTA_SSE | PTA_SSE2
2147 {"k8", PROCESSOR_K8, (PTA_64BIT
2148 | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2149 | PTA_SSE | PTA_SSE2
2151 {"k8-sse3", PROCESSOR_K8, (PTA_64BIT
2152 | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2153 | PTA_SSE | PTA_SSE2 | PTA_SSE3
2155 {"opteron", PROCESSOR_K8, (PTA_64BIT
2156 | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2157 | PTA_SSE | PTA_SSE2
2159 {"opteron-sse3", PROCESSOR_K8, (PTA_64BIT
2160 | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2161 | PTA_SSE | PTA_SSE2 | PTA_SSE3
2163 {"athlon64", PROCESSOR_K8, (PTA_64BIT
2164 | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2165 | PTA_SSE | PTA_SSE2
2167 {"athlon64-sse3", PROCESSOR_K8, (PTA_64BIT
2168 | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2169 | PTA_SSE | PTA_SSE2 | PTA_SSE3
2171 {"athlon-fx", PROCESSOR_K8, (PTA_64BIT
2172 | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2173 | PTA_SSE | PTA_SSE2
2175 {"amdfam10", PROCESSOR_AMDFAM10, (PTA_64BIT
2176 | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2177 | PTA_SSE | PTA_SSE2 | PTA_SSE3
2179 | PTA_CX16 | PTA_ABM)},
2180 {"barcelona", PROCESSOR_AMDFAM10, (PTA_64BIT
2181 | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2182 | PTA_SSE | PTA_SSE2 | PTA_SSE3
2184 | PTA_CX16 | PTA_ABM)},
2185 {"generic32", PROCESSOR_GENERIC32, 0 /* flags are only used for -march switch. */ },
2186 {"generic64", PROCESSOR_GENERIC64, PTA_64BIT /* flags are only used for -march switch. */ },
2189 int const pta_size = ARRAY_SIZE (processor_alias_table);
/* Give (sub)targets a chance to adjust options before the generic logic.  */
2191 #ifdef SUBTARGET_OVERRIDE_OPTIONS
2192 SUBTARGET_OVERRIDE_OPTIONS;
2195 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
2196 SUBSUBTARGET_OVERRIDE_OPTIONS;
2199 /* -fPIC is the default for x86_64. */
2200 if (TARGET_MACHO && TARGET_64BIT)
2203 /* Set the default values for switches whose default depends on TARGET_64BIT
2204 in case they weren't overwritten by command line options. */
2207 /* Mach-O doesn't support omitting the frame pointer for now. */
2208 if (flag_omit_frame_pointer == 2)
2209 flag_omit_frame_pointer = (TARGET_MACHO ? 0 : 1);
2210 if (flag_asynchronous_unwind_tables == 2)
2211 flag_asynchronous_unwind_tables = 1;
2212 if (flag_pcc_struct_return == 2)
2213 flag_pcc_struct_return = 0;
2217 if (flag_omit_frame_pointer == 2)
2218 flag_omit_frame_pointer = 0;
2219 if (flag_asynchronous_unwind_tables == 2)
2220 flag_asynchronous_unwind_tables = 0;
2221 if (flag_pcc_struct_return == 2)
2222 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
2225 /* Need to check -mtune=generic first. */
2226 if (ix86_tune_string)
2228 if (!strcmp (ix86_tune_string, "generic")
2229 || !strcmp (ix86_tune_string, "i686")
2230 /* As special support for cross compilers we read -mtune=native
2231 as -mtune=generic. With native compilers we won't see the
2232 -mtune=native, as it was changed by the driver. */
2233 || !strcmp (ix86_tune_string, "native"))
2236 ix86_tune_string = "generic64";
2238 ix86_tune_string = "generic32";
2240 else if (!strncmp (ix86_tune_string, "generic", 7))
2241 error ("bad value (%s) for -mtune= switch", ix86_tune_string);
2245 if (ix86_arch_string)
2246 ix86_tune_string = ix86_arch_string;
2247 if (!ix86_tune_string)
2249 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
2250 ix86_tune_defaulted = 1;
2253 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
2254 need to use a sensible tune option. */
2255 if (!strcmp (ix86_tune_string, "generic")
2256 || !strcmp (ix86_tune_string, "x86-64")
2257 || !strcmp (ix86_tune_string, "i686"))
2260 ix86_tune_string = "generic64";
2262 ix86_tune_string = "generic32";
/* Parse -mstringop-strategy= into stringop_alg.  */
2265 if (ix86_stringop_string)
2267 if (!strcmp (ix86_stringop_string, "rep_byte"))
2268 stringop_alg = rep_prefix_1_byte;
2269 else if (!strcmp (ix86_stringop_string, "libcall"))
2270 stringop_alg = libcall;
2271 else if (!strcmp (ix86_stringop_string, "rep_4byte"))
2272 stringop_alg = rep_prefix_4_byte;
2273 else if (!strcmp (ix86_stringop_string, "rep_8byte"))
2274 stringop_alg = rep_prefix_8_byte;
2275 else if (!strcmp (ix86_stringop_string, "byte_loop"))
2276 stringop_alg = loop_1_byte;
2277 else if (!strcmp (ix86_stringop_string, "loop"))
2278 stringop_alg = loop;
2279 else if (!strcmp (ix86_stringop_string, "unrolled_loop"))
2280 stringop_alg = unrolled_loop;
2282 error ("bad value (%s) for -mstringop-strategy= switch", ix86_stringop_string);
2284 if (!strcmp (ix86_tune_string, "x86-64"))
2285 warning (OPT_Wdeprecated, "-mtune=x86-64 is deprecated. Use -mtune=k8 or "
2286 "-mtune=generic instead as appropriate.");
2288 if (!ix86_arch_string)
2289 ix86_arch_string = TARGET_64BIT ? "x86-64" : "i386";
2291 ix86_arch_specified = 1;
2293 if (!strcmp (ix86_arch_string, "generic"))
2294 error ("generic CPU can be used only for -mtune= switch")
2295 if (!strncmp (ix86_arch_string, "generic", 7))
2296 error ("bad value (%s) for -march= switch", ix86_arch_string);
/* Parse -mcmodel=; the PIC variant of each model is chosen when
   flag_pic is set.  */
2298 if (ix86_cmodel_string != 0)
2300 if (!strcmp (ix86_cmodel_string, "small"))
2301 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
2302 else if (!strcmp (ix86_cmodel_string, "medium"))
2303 ix86_cmodel = flag_pic ? CM_MEDIUM_PIC : CM_MEDIUM;
2304 else if (!strcmp (ix86_cmodel_string, "large"))
2305 ix86_cmodel = flag_pic ? CM_LARGE_PIC : CM_LARGE;
2307 error ("code model %s does not support PIC mode", ix86_cmodel_string);
2308 else if (!strcmp (ix86_cmodel_string, "32"))
2309 ix86_cmodel = CM_32;
2310 else if (!strcmp (ix86_cmodel_string, "kernel") && !flag_pic)
2311 ix86_cmodel = CM_KERNEL;
2313 error ("bad value (%s) for -mcmodel= switch", ix86_cmodel_string);
2317 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
2318 use of rip-relative addressing. This eliminates fixups that
2319 would otherwise be needed if this object is to be placed in a
2320 DLL, and is essentially just as efficient as direct addressing. */
2321 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
2322 ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
2323 else if (TARGET_64BIT)
2324 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
2326 ix86_cmodel = CM_32;
/* Parse -masm=.  */
2328 if (ix86_asm_string != 0)
2331 && !strcmp (ix86_asm_string, "intel"))
2332 ix86_asm_dialect = ASM_INTEL;
2333 else if (!strcmp (ix86_asm_string, "att"))
2334 ix86_asm_dialect = ASM_ATT;
2336 error ("bad value (%s) for -masm= switch", ix86_asm_string);
/* Sanity checks: CM_32 iff 32-bit mode, and the requested word size must
   have been compiled into this compiler.  */
2338 if ((TARGET_64BIT == 0) != (ix86_cmodel == CM_32))
2339 error ("code model %qs not supported in the %s bit mode",
2340 ix86_cmodel_string, TARGET_64BIT ? "64" : "32");
2341 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
2342 sorry ("%i-bit mode not compiled in",
2343 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
/* Resolve -march=: select the processor and enable every ISA it implies,
   except bits the user already set or cleared explicitly.  */
2345 for (i = 0; i < pta_size; i++)
2346 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
2348 ix86_arch = processor_alias_table[i].processor;
2349 /* Default cpu tuning to the architecture. */
2350 ix86_tune = ix86_arch;
2352 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
2353 error ("CPU you selected does not support x86-64 "
2356 if (processor_alias_table[i].flags & PTA_MMX
2357 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
2358 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
2359 if (processor_alias_table[i].flags & PTA_3DNOW
2360 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
2361 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
2362 if (processor_alias_table[i].flags & PTA_3DNOW_A
2363 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
2364 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
2365 if (processor_alias_table[i].flags & PTA_SSE
2366 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
2367 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
2368 if (processor_alias_table[i].flags & PTA_SSE2
2369 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
2370 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
2371 if (processor_alias_table[i].flags & PTA_SSE3
2372 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
2373 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
2374 if (processor_alias_table[i].flags & PTA_SSSE3
2375 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
2376 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
2377 if (processor_alias_table[i].flags & PTA_SSE4_1
2378 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
2379 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
2380 if (processor_alias_table[i].flags & PTA_SSE4_2
2381 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
2382 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
2383 if (processor_alias_table[i].flags & PTA_SSE4A
2384 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
2385 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
2386 if (processor_alias_table[i].flags & PTA_SSE5
2387 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE5))
2388 ix86_isa_flags |= OPTION_MASK_ISA_SSE5;
2390 if (processor_alias_table[i].flags & PTA_ABM)
2392 if (processor_alias_table[i].flags & PTA_CX16)
2393 x86_cmpxchg16b = true;
2394 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM))
2396 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
2397 x86_prefetch_sse = true;
2398 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF)))
2400 if (processor_alias_table[i].flags & PTA_AES)
2402 if (processor_alias_table[i].flags & PTA_PCLMUL)
2409 error ("bad value (%s) for -march= switch", ix86_arch_string);
2411 ix86_arch_mask = 1u << ix86_arch;
2412 for (i = 0; i < X86_ARCH_LAST; ++i)
2413 ix86_arch_features[i] &= ix86_arch_mask;
/* Resolve -mtune= against the same alias table.  */
2415 for (i = 0; i < pta_size; i++)
2416 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
2418 ix86_tune = processor_alias_table[i].processor;
2419 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
2421 if (ix86_tune_defaulted)
2423 ix86_tune_string = "x86-64";
2424 for (i = 0; i < pta_size; i++)
2425 if (! strcmp (ix86_tune_string,
2426 processor_alias_table[i].name))
2428 ix86_tune = processor_alias_table[i].processor;
2431 error ("CPU you selected does not support x86-64 "
2434 /* Intel CPUs have always interpreted SSE prefetch instructions as
2435 NOPs; so, we can enable SSE prefetch instructions even when
2436 -mtune (rather than -march) points us to a processor that has them.
2437 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
2438 higher processors. */
2440 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
2441 x86_prefetch_sse = true;
2445 error ("bad value (%s) for -mtune= switch", ix86_tune_string);
2447 /* Enable SSE2 if AES or PCLMUL is enabled. */
2448 if ((x86_aes || x86_pclmul)
2449 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
2451 ix86_isa_flags |= OPTION_MASK_ISA_SSE2_SET;
2452 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE2_SET;
2455 ix86_tune_mask = 1u << ix86_tune;
2456 for (i = 0; i < X86_TUNE_LAST; ++i)
2457 ix86_tune_features[i] &= ix86_tune_mask;
/* Pick the cost model: size_cost when optimizing for size, else the
   tuned processor's costs.  */
2460 ix86_cost = &size_cost;
2462 ix86_cost = processor_target_table[ix86_tune].cost;
2464 /* Arrange to set up i386_stack_locals for all functions. */
2465 init_machine_status = ix86_init_machine_status;
2467 /* Validate -mregparm= value. */
2468 if (ix86_regparm_string)
2471 warning (0, "-mregparm is ignored in 64-bit mode")
2472 i = atoi (ix86_regparm_string);
2473 if (i < 0 || i > REGPARM_MAX)
2474 error ("-mregparm=%d is not between 0 and %d", i, REGPARM_MAX);
2479 ix86_regparm = REGPARM_MAX;
2481 /* If the user has provided any of the -malign-* options,
2482 warn and use that value only if -falign-* is not set.
2483 Remove this code in GCC 3.2 or later. */
2484 if (ix86_align_loops_string)
2486 warning (0, "-malign-loops is obsolete, use -falign-loops");
2487 if (align_loops == 0)
2489 i = atoi (ix86_align_loops_string);
2490 if (i < 0 || i > MAX_CODE_ALIGN)
2491 error ("-malign-loops=%d is not between 0 and %d", i, MAX_CODE_ALIGN);
2493 align_loops = 1 << i;
2497 if (ix86_align_jumps_string)
2499 warning (0, "-malign-jumps is obsolete, use -falign-jumps");
2500 if (align_jumps == 0)
2502 i = atoi (ix86_align_jumps_string);
2503 if (i < 0 || i > MAX_CODE_ALIGN)
2504 error ("-malign-loops=%d is not between 0 and %d", i, MAX_CODE_ALIGN);
2506 align_jumps = 1 << i;
2510 if (ix86_align_funcs_string)
2512 warning (0, "-malign-functions is obsolete, use -falign-functions");
2513 if (align_functions == 0)
2515 i = atoi (ix86_align_funcs_string);
2516 if (i < 0 || i > MAX_CODE_ALIGN)
2517 error ("-malign-loops=%d is not between 0 and %d", i, MAX_CODE_ALIGN);
2519 align_functions = 1 << i;
2523 /* Default align_* from the processor table. */
2524 if (align_loops == 0)
2526 align_loops = processor_target_table[ix86_tune].align_loop;
2527 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
2529 if (align_jumps == 0)
2531 align_jumps = processor_target_table[ix86_tune].align_jump;
2532 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
2534 if (align_functions == 0)
2536 align_functions = processor_target_table[ix86_tune].align_func;
2539 /* Validate -mbranch-cost= value, or provide default. */
2540 ix86_branch_cost = ix86_cost->branch_cost;
2541 if (ix86_branch_cost_string)
2543 i = atoi (ix86_branch_cost_string);
2545 error ("-mbranch-cost=%d is not between 0 and 5", i);
2547 ix86_branch_cost = i;
/* Validate -mlarge-data-threshold=.  */
2549 if (ix86_section_threshold_string)
2551 i = atoi (ix86_section_threshold_string);
2553 error ("-mlarge-data-threshold=%d is negative", i);
2555 ix86_section_threshold = i;
/* Parse -mtls-dialect=.  */
2558 if (ix86_tls_dialect_string)
2560 if (strcmp (ix86_tls_dialect_string, "gnu") == 0)
2561 ix86_tls_dialect = TLS_DIALECT_GNU;
2562 else if (strcmp (ix86_tls_dialect_string, "gnu2") == 0)
2563 ix86_tls_dialect = TLS_DIALECT_GNU2;
2564 else if (strcmp (ix86_tls_dialect_string, "sun") == 0)
2565 ix86_tls_dialect = TLS_DIALECT_SUN;
2567 error ("bad value (%s) for -mtls-dialect= switch",
2568 ix86_tls_dialect_string);
/* Validate the x87 precision option (-mpc32/-mpc64/-mpc80).  */
2571 if (ix87_precision_string)
2573 i = atoi (ix87_precision_string);
2574 if (i != 32 && i != 64 && i != 80)
2575 error ("pc%d is not valid precision setting (32, 64 or 80)", i);
2580 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
2582 /* Enable by default the SSE and MMX builtins. Do allow the user to
2583 explicitly disable any of these. In particular, disabling SSE and
2584 MMX for kernel code is extremely useful. */
2585 if (!ix86_arch_specified)
2587 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
2588 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
2591 warning (0, "-mrtd is ignored in 64bit mode");
2595 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
2597 if (!ix86_arch_specified)
2599 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
2601 /* i386 ABI does not specify red zone. It still makes sense to use it
2602 when programmer takes care to stack from being destroyed. */
2603 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
2604 target_flags |= MASK_NO_RED_ZONE;
2607 /* Keep nonleaf frame pointers. */
2608 if (flag_omit_frame_pointer)
2609 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
2610 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
2611 flag_omit_frame_pointer = 1;
2613 /* If we're doing fast math, we don't care about comparison order
2614 wrt NaNs. This lets us use a shorter comparison sequence. */
2615 if (flag_finite_math_only)
2616 target_flags &= ~MASK_IEEE_FP;
2618 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
2619 since the insns won't need emulation. */
2620 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
2621 target_flags &= ~MASK_NO_FANCY_MATH_387;
2623 /* Likewise, if the target doesn't have a 387, or we've specified
2624 software floating point, don't use 387 inline intrinsics. */
2626 target_flags |= MASK_NO_FANCY_MATH_387;
2628 /* Turn on MMX builtins for -msse. */
2631 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
2632 x86_prefetch_sse = true;
2635 /* Turn on popcnt instruction for -msse4.2 or -mabm. */
2636 if (TARGET_SSE4_2 || TARGET_ABM)
2639 /* Validate -mpreferred-stack-boundary= value, or provide default.
2640 The default of 128 bits is for Pentium III's SSE __m128. We can't
2641 change it because of optimize_size. Otherwise, we can't mix object
2642 files compiled with -Os and -On. */
2643 ix86_preferred_stack_boundary = 128;
2644 if (ix86_preferred_stack_boundary_string)
2646 i = atoi (ix86_preferred_stack_boundary_string);
2647 if (i < (TARGET_64BIT ? 4 : 2) || i > 12)
2648 error ("-mpreferred-stack-boundary=%d is not between %d and 12", i,
2649 TARGET_64BIT ? 4 : 2);
2651 ix86_preferred_stack_boundary = (1 << i) * BITS_PER_UNIT;
2654 /* Accept -msseregparm only if at least SSE support is enabled. */
2655 if (TARGET_SSEREGPARM
2657 error ("-msseregparm used without SSE enabled");
/* Parse -mfpmath=; fall back to 387 math when SSE is unavailable and
   to SSE when the 387 is disabled.  */
2659 ix86_fpmath = TARGET_FPMATH_DEFAULT;
2660 if (ix86_fpmath_string != 0)
2662 if (! strcmp (ix86_fpmath_string, "387"))
2663 ix86_fpmath = FPMATH_387;
2664 else if (! strcmp (ix86_fpmath_string, "sse"))
2668 warning (0, "SSE instruction set disabled, using 387 arithmetics");
2669 ix86_fpmath = FPMATH_387;
2672 ix86_fpmath = FPMATH_SSE;
2674 else if (! strcmp (ix86_fpmath_string, "387,sse")
2675 || ! strcmp (ix86_fpmath_string, "sse,387"))
2679 warning (0, "SSE instruction set disabled, using 387 arithmetics");
2680 ix86_fpmath = FPMATH_387;
2682 else if (!TARGET_80387)
2684 warning (0, "387 instruction set disabled, using SSE arithmetics");
2685 ix86_fpmath = FPMATH_SSE;
2688 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
2691 error ("bad value (%s) for -mfpmath= switch", ix86_fpmath_string);
2694 /* If the i387 is disabled, then do not return values in it. */
2696 target_flags &= ~MASK_FLOAT_RETURNS;
2698 /* Use external vectorized library in vectorizing intrinsics. */
2699 if (ix86_veclibabi_string)
2701 if (strcmp (ix86_veclibabi_string, "svml") == 0)
2702 ix86_veclib_handler = ix86_veclibabi_svml;
2703 else if (strcmp (ix86_veclibabi_string, "acml") == 0)
2704 ix86_veclib_handler = ix86_veclibabi_acml;
2706 error ("unknown vectorization library ABI type (%s) for "
2707 "-mveclibabi= switch", ix86_veclibabi_string);
2710 if ((x86_accumulate_outgoing_args & ix86_tune_mask)
2711 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
2713 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
2715 /* ??? Unwind info is not correct around the CFG unless either a frame
2716 pointer is present or M_A_O_A is set. Fixing this requires rewriting
2717 unwind info generation to be aware of the CFG and propagating states
2719 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
2720 || flag_exceptions || flag_non_call_exceptions)
2721 && flag_omit_frame_pointer
2722 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
2724 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
2725 warning (0, "unwind tables currently require either a frame pointer "
2726 "or -maccumulate-outgoing-args for correctness");
2727 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
2730 /* If stack probes are required, the space used for large function
2731 arguments on the stack must also be probed, so enable
2732 -maccumulate-outgoing-args so this happens in the prologue. */
2733 if (TARGET_STACK_PROBE
2734 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
2736 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
2737 warning (0, "stack probing requires -maccumulate-outgoing-args "
2739 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
2742 /* For sane SSE instruction set generation we need fcomi instruction.
2743 It is safe to enable all CMOVE instructions. */
2747 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
2750 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
2751 p = strchr (internal_label_prefix, 'X');
2752 internal_label_prefix_len = p - internal_label_prefix;
2756 /* When scheduling description is not available, disable scheduler pass
2757 so it won't slow down the compilation and make x87 code slower. */
2758 if (!TARGET_SCHEDULE)
2759 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
/* Seed --param defaults from the selected cost model, unless the user
   set them on the command line.  */
2761 if (!PARAM_SET_P (PARAM_SIMULTANEOUS_PREFETCHES))
2762 set_param_value ("simultaneous-prefetches",
2763 ix86_cost->simultaneous_prefetches);
2764 if (!PARAM_SET_P (PARAM_L1_CACHE_LINE_SIZE))
2765 set_param_value ("l1-cache-line-size", ix86_cost->prefetch_block);
2766 if (!PARAM_SET_P (PARAM_L1_CACHE_SIZE))
2767 set_param_value ("l1-cache-size", ix86_cost->l1_cache_size);
2768 if (!PARAM_SET_P (PARAM_L2_CACHE_SIZE))
2769 set_param_value ("l2-cache-size", ix86_cost->l2_cache_size);
2771 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
2772 can be optimized to ap = __builtin_next_arg (0).
2773 For abi switching it should be corrected. */
2774 if (!TARGET_64BIT || DEFAULT_ABI == MS_ABI)
2775 targetm.expand_builtin_va_start = NULL;
/* DImode pattern generators, used for 64-bit code (the guarding
   conditional is not visible in this excerpt).  */
2779 ix86_gen_leave = gen_leave_rex64;
2780 ix86_gen_pop1 = gen_popdi1;
2781 ix86_gen_add3 = gen_adddi3;
2782 ix86_gen_sub3 = gen_subdi3;
2783 ix86_gen_sub3_carry = gen_subdi3_carry_rex64;
2784 ix86_gen_one_cmpl2 = gen_one_cmpldi2;
2785 ix86_gen_monitor = gen_sse3_monitor64;
/* SImode pattern generators, used for 32-bit code.  */
2789 ix86_gen_leave = gen_leave;
2790 ix86_gen_pop1 = gen_popsi1;
2791 ix86_gen_add3 = gen_addsi3;
2792 ix86_gen_sub3 = gen_subsi3;
2793 ix86_gen_sub3_carry = gen_subsi3_carry;
2794 ix86_gen_one_cmpl2 = gen_one_cmplsi2;
2795 ix86_gen_monitor = gen_sse3_monitor;
2799 /* Use -mcld by default for 32-bit code if configured with --enable-cld. */
2801 target_flags |= MASK_CLD & ~target_flags_explicit;
2805 /* Return true if this goes in large data/bss. */
2808 ix86_in_large_data_p (tree exp)
2810 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
2813 /* Functions are never large data. */
2814 if (TREE_CODE (exp) == FUNCTION_DECL)
2817 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
2819 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
2820 if (strcmp (section, ".ldata") == 0
2821 || strcmp (section, ".lbss") == 0)
2827 HOST_WIDE_INT size =