/* Subroutines used for code generation on IA-32.
   Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
   2002, 2003, 2004, 2005, 2006, 2007, 2008
   Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "hard-reg-set.h"
#include "insn-config.h"
#include "conditions.h"
#include "insn-codes.h"
#include "insn-attr.h"
#include "basic-block.h"
#include "target-def.h"
#include "langhooks.h"
#include "tree-gimple.h"
#include "tm-constrs.h"

static int x86_builtin_vectorization_cost (bool);

#ifndef CHECK_STACK_LIMIT
#define CHECK_STACK_LIMIT (-1)
#endif

/* Return index of given mode in mult and division cost tables.  */
#define MODE_INDEX(mode)				\
  ((mode) == QImode ? 0					\
   : (mode) == HImode ? 1				\
   : (mode) == SImode ? 2				\
   : (mode) == DImode ? 3				\
   : 4)
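
/* For example, MODE_INDEX (SImode) == 2, selecting the SImode slot of the
   five-entry multiply and divide cost arrays below (QI, HI, SI, DI and
   "other"); a typical lookup has the shape
   cost->mult_init[MODE_INDEX (mode)] (field name shown for illustration
   only).  */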

/* Processor costs (relative to an add) */
/* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes.  */
#define COSTS_N_BYTES(N) ((N) * 2)
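
/* Worked example of that assumption: an add encodes in 2 bytes and is
   charged COSTS_N_BYTES (2) == 4, which equals COSTS_N_INSNS (1), so the
   size tables below stay on the same scale as the speed tables, with each
   byte of encoding worth 2 cost units.  */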

#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
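
/* A stringop strategy entry has the shape
     {alg_for_unknown_size, {{max_size, alg}, ..., {-1, alg}}}
   where each {max_size, alg} pair handles block sizes up to max_size and
   the terminating -1 covers everything larger.  The memcpy/memset fields
   below hold two such entries, the first for 32-bit and the second for
   64-bit code; DUMMY_STRINGOP_ALGS fills whichever slot a tuning never
   uses.  */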

static const
struct processor_costs size_cost = {	/* costs for tuning for size */
  COSTS_N_BYTES (2),		/* cost of an add instruction */
  COSTS_N_BYTES (3),		/* cost of a lea instruction */
  COSTS_N_BYTES (2),		/* variable shift costs */
  COSTS_N_BYTES (3),		/* constant shift costs */
  {COSTS_N_BYTES (3),		/* cost of starting multiply for QI */
   COSTS_N_BYTES (3),		/* HI */
   COSTS_N_BYTES (3),		/* SI */
   COSTS_N_BYTES (3),		/* DI */
   COSTS_N_BYTES (5)},		/* other */
  0,				/* cost of multiply per each bit set */
  {COSTS_N_BYTES (3),		/* cost of a divide/mod for QI */
   COSTS_N_BYTES (3),		/* HI */
   COSTS_N_BYTES (3),		/* SI */
   COSTS_N_BYTES (3),		/* DI */
   COSTS_N_BYTES (5)},		/* other */
  COSTS_N_BYTES (3),		/* cost of movsx */
  COSTS_N_BYTES (3),		/* cost of movzx */
  2,				/* cost for loading QImode using movzbl */
  {2, 2, 2},			/* cost of loading integer registers
				   in QImode, HImode and SImode.
				   Relative to reg-reg move (2).  */
  {2, 2, 2},			/* cost of storing integer registers */
  2,				/* cost of reg,reg fld/fst */
  {2, 2, 2},			/* cost of loading fp registers
				   in SFmode, DFmode and XFmode */
  {2, 2, 2},			/* cost of storing fp registers
				   in SFmode, DFmode and XFmode */
  3,				/* cost of moving MMX register */
  {3, 3},			/* cost of loading MMX registers
				   in SImode and DImode */
  {3, 3},			/* cost of storing MMX registers
				   in SImode and DImode */
  3,				/* cost of moving SSE register */
  {3, 3, 3},			/* cost of loading SSE registers
				   in SImode, DImode and TImode */
  {3, 3, 3},			/* cost of storing SSE registers
				   in SImode, DImode and TImode */
  3,				/* MMX or SSE register to integer */
  0,				/* size of l1 cache */
  0,				/* size of l2 cache */
  0,				/* size of prefetch block */
  0,				/* number of parallel prefetches */
  COSTS_N_BYTES (2),		/* cost of FADD and FSUB insns.  */
  COSTS_N_BYTES (2),		/* cost of FMUL instruction.  */
  COSTS_N_BYTES (2),		/* cost of FDIV instruction.  */
  COSTS_N_BYTES (2),		/* cost of FABS instruction.  */
  COSTS_N_BYTES (2),		/* cost of FCHS instruction.  */
  COSTS_N_BYTES (2),		/* cost of FSQRT instruction.  */
  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
  1,				/* scalar_stmt_cost.  */
  1,				/* scalar_load_cost.  */
  1,				/* scalar_store_cost.  */
  1,				/* vec_stmt_cost.  */
  1,				/* vec_to_scalar_cost.  */
  1,				/* scalar_to_vec_cost.  */
  1,				/* vec_align_load_cost.  */
  1,				/* vec_unalign_load_cost.  */
  1,				/* vec_store_cost.  */
  1,				/* cond_taken_branch_cost.  */
  1,				/* cond_not_taken_branch_cost.  */
};

/* Processor costs (relative to an add) */
static const
struct processor_costs i386_cost = {	/* 386 specific costs */
  COSTS_N_INSNS (1),		/* cost of an add instruction */
  COSTS_N_INSNS (1),		/* cost of a lea instruction */
  COSTS_N_INSNS (3),		/* variable shift costs */
  COSTS_N_INSNS (2),		/* constant shift costs */
  {COSTS_N_INSNS (6),		/* cost of starting multiply for QI */
   COSTS_N_INSNS (6),		/* HI */
   COSTS_N_INSNS (6),		/* SI */
   COSTS_N_INSNS (6),		/* DI */
   COSTS_N_INSNS (6)},		/* other */
  COSTS_N_INSNS (1),		/* cost of multiply per each bit set */
  {COSTS_N_INSNS (23),		/* cost of a divide/mod for QI */
   COSTS_N_INSNS (23),		/* HI */
   COSTS_N_INSNS (23),		/* SI */
   COSTS_N_INSNS (23),		/* DI */
   COSTS_N_INSNS (23)},		/* other */
  COSTS_N_INSNS (3),		/* cost of movsx */
  COSTS_N_INSNS (2),		/* cost of movzx */
  15,				/* "large" insn */
  4,				/* cost for loading QImode using movzbl */
  {2, 4, 2},			/* cost of loading integer registers
				   in QImode, HImode and SImode.
				   Relative to reg-reg move (2).  */
  {2, 4, 2},			/* cost of storing integer registers */
  2,				/* cost of reg,reg fld/fst */
  {8, 8, 8},			/* cost of loading fp registers
				   in SFmode, DFmode and XFmode */
  {8, 8, 8},			/* cost of storing fp registers
				   in SFmode, DFmode and XFmode */
  2,				/* cost of moving MMX register */
  {4, 8},			/* cost of loading MMX registers
				   in SImode and DImode */
  {4, 8},			/* cost of storing MMX registers
				   in SImode and DImode */
  2,				/* cost of moving SSE register */
  {4, 8, 16},			/* cost of loading SSE registers
				   in SImode, DImode and TImode */
  {4, 8, 16},			/* cost of storing SSE registers
				   in SImode, DImode and TImode */
  3,				/* MMX or SSE register to integer */
  0,				/* size of l1 cache */
  0,				/* size of l2 cache */
  0,				/* size of prefetch block */
  0,				/* number of parallel prefetches */
  COSTS_N_INSNS (23),		/* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (27),		/* cost of FMUL instruction.  */
  COSTS_N_INSNS (88),		/* cost of FDIV instruction.  */
  COSTS_N_INSNS (22),		/* cost of FABS instruction.  */
  COSTS_N_INSNS (24),		/* cost of FCHS instruction.  */
  COSTS_N_INSNS (122),		/* cost of FSQRT instruction.  */
  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
   DUMMY_STRINGOP_ALGS},
  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
   DUMMY_STRINGOP_ALGS},
  1,				/* scalar_stmt_cost.  */
  1,				/* scalar_load_cost.  */
  1,				/* scalar_store_cost.  */
  1,				/* vec_stmt_cost.  */
  1,				/* vec_to_scalar_cost.  */
  1,				/* scalar_to_vec_cost.  */
  1,				/* vec_align_load_cost.  */
  2,				/* vec_unalign_load_cost.  */
  1,				/* vec_store_cost.  */
  3,				/* cond_taken_branch_cost.  */
  1,				/* cond_not_taken_branch_cost.  */
};

static const
struct processor_costs i486_cost = {	/* 486 specific costs */
  COSTS_N_INSNS (1),		/* cost of an add instruction */
  COSTS_N_INSNS (1),		/* cost of a lea instruction */
  COSTS_N_INSNS (3),		/* variable shift costs */
  COSTS_N_INSNS (2),		/* constant shift costs */
  {COSTS_N_INSNS (12),		/* cost of starting multiply for QI */
   COSTS_N_INSNS (12),		/* HI */
   COSTS_N_INSNS (12),		/* SI */
   COSTS_N_INSNS (12),		/* DI */
   COSTS_N_INSNS (12)},		/* other */
  1,				/* cost of multiply per each bit set */
  {COSTS_N_INSNS (40),		/* cost of a divide/mod for QI */
   COSTS_N_INSNS (40),		/* HI */
   COSTS_N_INSNS (40),		/* SI */
   COSTS_N_INSNS (40),		/* DI */
   COSTS_N_INSNS (40)},		/* other */
  COSTS_N_INSNS (3),		/* cost of movsx */
  COSTS_N_INSNS (2),		/* cost of movzx */
  15,				/* "large" insn */
  4,				/* cost for loading QImode using movzbl */
  {2, 4, 2},			/* cost of loading integer registers
				   in QImode, HImode and SImode.
				   Relative to reg-reg move (2).  */
  {2, 4, 2},			/* cost of storing integer registers */
  2,				/* cost of reg,reg fld/fst */
  {8, 8, 8},			/* cost of loading fp registers
				   in SFmode, DFmode and XFmode */
  {8, 8, 8},			/* cost of storing fp registers
				   in SFmode, DFmode and XFmode */
  2,				/* cost of moving MMX register */
  {4, 8},			/* cost of loading MMX registers
				   in SImode and DImode */
  {4, 8},			/* cost of storing MMX registers
				   in SImode and DImode */
  2,				/* cost of moving SSE register */
  {4, 8, 16},			/* cost of loading SSE registers
				   in SImode, DImode and TImode */
  {4, 8, 16},			/* cost of storing SSE registers
				   in SImode, DImode and TImode */
  3,				/* MMX or SSE register to integer */
  4,				/* size of l1 cache.  486 has 8kB cache
				   shared for code and data, so 4kB is
				   not really precise.  */
  4,				/* size of l2 cache */
  0,				/* size of prefetch block */
  0,				/* number of parallel prefetches */
  COSTS_N_INSNS (8),		/* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (16),		/* cost of FMUL instruction.  */
  COSTS_N_INSNS (73),		/* cost of FDIV instruction.  */
  COSTS_N_INSNS (3),		/* cost of FABS instruction.  */
  COSTS_N_INSNS (3),		/* cost of FCHS instruction.  */
  COSTS_N_INSNS (83),		/* cost of FSQRT instruction.  */
  {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
   DUMMY_STRINGOP_ALGS},
  {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
   DUMMY_STRINGOP_ALGS},
  1,				/* scalar_stmt_cost.  */
  1,				/* scalar_load_cost.  */
  1,				/* scalar_store_cost.  */
  1,				/* vec_stmt_cost.  */
  1,				/* vec_to_scalar_cost.  */
  1,				/* scalar_to_vec_cost.  */
  1,				/* vec_align_load_cost.  */
  2,				/* vec_unalign_load_cost.  */
  1,				/* vec_store_cost.  */
  3,				/* cond_taken_branch_cost.  */
  1,				/* cond_not_taken_branch_cost.  */
};

static const
struct processor_costs pentium_cost = {
  COSTS_N_INSNS (1),		/* cost of an add instruction */
  COSTS_N_INSNS (1),		/* cost of a lea instruction */
  COSTS_N_INSNS (4),		/* variable shift costs */
  COSTS_N_INSNS (1),		/* constant shift costs */
  {COSTS_N_INSNS (11),		/* cost of starting multiply for QI */
   COSTS_N_INSNS (11),		/* HI */
   COSTS_N_INSNS (11),		/* SI */
   COSTS_N_INSNS (11),		/* DI */
   COSTS_N_INSNS (11)},		/* other */
  0,				/* cost of multiply per each bit set */
  {COSTS_N_INSNS (25),		/* cost of a divide/mod for QI */
   COSTS_N_INSNS (25),		/* HI */
   COSTS_N_INSNS (25),		/* SI */
   COSTS_N_INSNS (25),		/* DI */
   COSTS_N_INSNS (25)},		/* other */
  COSTS_N_INSNS (3),		/* cost of movsx */
  COSTS_N_INSNS (2),		/* cost of movzx */
  8,				/* "large" insn */
  6,				/* cost for loading QImode using movzbl */
  {2, 4, 2},			/* cost of loading integer registers
				   in QImode, HImode and SImode.
				   Relative to reg-reg move (2).  */
  {2, 4, 2},			/* cost of storing integer registers */
  2,				/* cost of reg,reg fld/fst */
  {2, 2, 6},			/* cost of loading fp registers
				   in SFmode, DFmode and XFmode */
  {4, 4, 6},			/* cost of storing fp registers
				   in SFmode, DFmode and XFmode */
  8,				/* cost of moving MMX register */
  {8, 8},			/* cost of loading MMX registers
				   in SImode and DImode */
  {8, 8},			/* cost of storing MMX registers
				   in SImode and DImode */
  2,				/* cost of moving SSE register */
  {4, 8, 16},			/* cost of loading SSE registers
				   in SImode, DImode and TImode */
  {4, 8, 16},			/* cost of storing SSE registers
				   in SImode, DImode and TImode */
  3,				/* MMX or SSE register to integer */
  8,				/* size of l1 cache.  */
  8,				/* size of l2 cache */
  0,				/* size of prefetch block */
  0,				/* number of parallel prefetches */
  COSTS_N_INSNS (3),		/* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (3),		/* cost of FMUL instruction.  */
  COSTS_N_INSNS (39),		/* cost of FDIV instruction.  */
  COSTS_N_INSNS (1),		/* cost of FABS instruction.  */
  COSTS_N_INSNS (1),		/* cost of FCHS instruction.  */
  COSTS_N_INSNS (70),		/* cost of FSQRT instruction.  */
  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  {{libcall, {{-1, rep_prefix_4_byte}}},
   DUMMY_STRINGOP_ALGS},
  1,				/* scalar_stmt_cost.  */
  1,				/* scalar_load_cost.  */
  1,				/* scalar_store_cost.  */
  1,				/* vec_stmt_cost.  */
  1,				/* vec_to_scalar_cost.  */
  1,				/* scalar_to_vec_cost.  */
  1,				/* vec_align_load_cost.  */
  2,				/* vec_unalign_load_cost.  */
  1,				/* vec_store_cost.  */
  3,				/* cond_taken_branch_cost.  */
  1,				/* cond_not_taken_branch_cost.  */
};

static const
struct processor_costs pentiumpro_cost = {
  COSTS_N_INSNS (1),		/* cost of an add instruction */
  COSTS_N_INSNS (1),		/* cost of a lea instruction */
  COSTS_N_INSNS (1),		/* variable shift costs */
  COSTS_N_INSNS (1),		/* constant shift costs */
  {COSTS_N_INSNS (4),		/* cost of starting multiply for QI */
   COSTS_N_INSNS (4),		/* HI */
   COSTS_N_INSNS (4),		/* SI */
   COSTS_N_INSNS (4),		/* DI */
   COSTS_N_INSNS (4)},		/* other */
  0,				/* cost of multiply per each bit set */
  {COSTS_N_INSNS (17),		/* cost of a divide/mod for QI */
   COSTS_N_INSNS (17),		/* HI */
   COSTS_N_INSNS (17),		/* SI */
   COSTS_N_INSNS (17),		/* DI */
   COSTS_N_INSNS (17)},		/* other */
  COSTS_N_INSNS (1),		/* cost of movsx */
  COSTS_N_INSNS (1),		/* cost of movzx */
  8,				/* "large" insn */
  2,				/* cost for loading QImode using movzbl */
  {4, 4, 4},			/* cost of loading integer registers
				   in QImode, HImode and SImode.
				   Relative to reg-reg move (2).  */
  {2, 2, 2},			/* cost of storing integer registers */
  2,				/* cost of reg,reg fld/fst */
  {2, 2, 6},			/* cost of loading fp registers
				   in SFmode, DFmode and XFmode */
  {4, 4, 6},			/* cost of storing fp registers
				   in SFmode, DFmode and XFmode */
  2,				/* cost of moving MMX register */
  {2, 2},			/* cost of loading MMX registers
				   in SImode and DImode */
  {2, 2},			/* cost of storing MMX registers
				   in SImode and DImode */
  2,				/* cost of moving SSE register */
  {2, 2, 8},			/* cost of loading SSE registers
				   in SImode, DImode and TImode */
  {2, 2, 8},			/* cost of storing SSE registers
				   in SImode, DImode and TImode */
  3,				/* MMX or SSE register to integer */
  8,				/* size of l1 cache.  */
  256,				/* size of l2 cache.  */
  32,				/* size of prefetch block */
  6,				/* number of parallel prefetches */
  COSTS_N_INSNS (3),		/* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (5),		/* cost of FMUL instruction.  */
  COSTS_N_INSNS (56),		/* cost of FDIV instruction.  */
  COSTS_N_INSNS (2),		/* cost of FABS instruction.  */
  COSTS_N_INSNS (2),		/* cost of FCHS instruction.  */
  COSTS_N_INSNS (56),		/* cost of FSQRT instruction.  */
  /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
     (we ensure the alignment).  For small blocks an inline loop is still a
     noticeable win; for bigger blocks either rep movsl or rep movsb is the
     way to go.  Rep movsb apparently has more expensive startup time in the
     CPU, but after 4K the difference is down in the noise.  */
  {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
			{8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
   DUMMY_STRINGOP_ALGS},
  {{rep_prefix_4_byte, {{1024, unrolled_loop},
			{8192, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
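  /* Reading the memcpy entry above: blocks of at most 128 bytes get an
     inline loop, up to 1024 bytes an unrolled loop, up to 8192 bytes
     rep movsl, and anything larger rep movsb; when the size is unknown at
     compile time, rep movsl (the leading algorithm) is used.  The
     DUMMY_STRINGOP_ALGS slot is the unused 64-bit entry.  */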
  1,				/* scalar_stmt_cost.  */
  1,				/* scalar_load_cost.  */
  1,				/* scalar_store_cost.  */
  1,				/* vec_stmt_cost.  */
  1,				/* vec_to_scalar_cost.  */
  1,				/* scalar_to_vec_cost.  */
  1,				/* vec_align_load_cost.  */
  2,				/* vec_unalign_load_cost.  */
  1,				/* vec_store_cost.  */
  3,				/* cond_taken_branch_cost.  */
  1,				/* cond_not_taken_branch_cost.  */
};

static const
struct processor_costs geode_cost = {
  COSTS_N_INSNS (1),		/* cost of an add instruction */
  COSTS_N_INSNS (1),		/* cost of a lea instruction */
  COSTS_N_INSNS (2),		/* variable shift costs */
  COSTS_N_INSNS (1),		/* constant shift costs */
  {COSTS_N_INSNS (3),		/* cost of starting multiply for QI */
   COSTS_N_INSNS (4),		/* HI */
   COSTS_N_INSNS (7),		/* SI */
   COSTS_N_INSNS (7),		/* DI */
   COSTS_N_INSNS (7)},		/* other */
  0,				/* cost of multiply per each bit set */
  {COSTS_N_INSNS (15),		/* cost of a divide/mod for QI */
   COSTS_N_INSNS (23),		/* HI */
   COSTS_N_INSNS (39),		/* SI */
   COSTS_N_INSNS (39),		/* DI */
   COSTS_N_INSNS (39)},		/* other */
  COSTS_N_INSNS (1),		/* cost of movsx */
  COSTS_N_INSNS (1),		/* cost of movzx */
  8,				/* "large" insn */
  1,				/* cost for loading QImode using movzbl */
  {1, 1, 1},			/* cost of loading integer registers
				   in QImode, HImode and SImode.
				   Relative to reg-reg move (2).  */
  {1, 1, 1},			/* cost of storing integer registers */
  1,				/* cost of reg,reg fld/fst */
  {1, 1, 1},			/* cost of loading fp registers
				   in SFmode, DFmode and XFmode */
  {4, 6, 6},			/* cost of storing fp registers
				   in SFmode, DFmode and XFmode */

  1,				/* cost of moving MMX register */
  {1, 1},			/* cost of loading MMX registers
				   in SImode and DImode */
  {1, 1},			/* cost of storing MMX registers
				   in SImode and DImode */
  1,				/* cost of moving SSE register */
  {1, 1, 1},			/* cost of loading SSE registers
				   in SImode, DImode and TImode */
  {1, 1, 1},			/* cost of storing SSE registers
				   in SImode, DImode and TImode */
  1,				/* MMX or SSE register to integer */
  64,				/* size of l1 cache.  */
  128,				/* size of l2 cache.  */
  32,				/* size of prefetch block */
  1,				/* number of parallel prefetches */
  COSTS_N_INSNS (6),		/* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (11),		/* cost of FMUL instruction.  */
  COSTS_N_INSNS (47),		/* cost of FDIV instruction.  */
  COSTS_N_INSNS (1),		/* cost of FABS instruction.  */
  COSTS_N_INSNS (1),		/* cost of FCHS instruction.  */
  COSTS_N_INSNS (54),		/* cost of FSQRT instruction.  */
  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  1,				/* scalar_stmt_cost.  */
  1,				/* scalar_load_cost.  */
  1,				/* scalar_store_cost.  */
  1,				/* vec_stmt_cost.  */
  1,				/* vec_to_scalar_cost.  */
  1,				/* scalar_to_vec_cost.  */
  1,				/* vec_align_load_cost.  */
  2,				/* vec_unalign_load_cost.  */
  1,				/* vec_store_cost.  */
  3,				/* cond_taken_branch_cost.  */
  1,				/* cond_not_taken_branch_cost.  */
};

static const
struct processor_costs k6_cost = {
  COSTS_N_INSNS (1),		/* cost of an add instruction */
  COSTS_N_INSNS (2),		/* cost of a lea instruction */
  COSTS_N_INSNS (1),		/* variable shift costs */
  COSTS_N_INSNS (1),		/* constant shift costs */
  {COSTS_N_INSNS (3),		/* cost of starting multiply for QI */
   COSTS_N_INSNS (3),		/* HI */
   COSTS_N_INSNS (3),		/* SI */
   COSTS_N_INSNS (3),		/* DI */
   COSTS_N_INSNS (3)},		/* other */
  0,				/* cost of multiply per each bit set */
  {COSTS_N_INSNS (18),		/* cost of a divide/mod for QI */
   COSTS_N_INSNS (18),		/* HI */
   COSTS_N_INSNS (18),		/* SI */
   COSTS_N_INSNS (18),		/* DI */
   COSTS_N_INSNS (18)},		/* other */
  COSTS_N_INSNS (2),		/* cost of movsx */
  COSTS_N_INSNS (2),		/* cost of movzx */
  8,				/* "large" insn */
  3,				/* cost for loading QImode using movzbl */
  {4, 5, 4},			/* cost of loading integer registers
				   in QImode, HImode and SImode.
				   Relative to reg-reg move (2).  */
  {2, 3, 2},			/* cost of storing integer registers */
  4,				/* cost of reg,reg fld/fst */
  {6, 6, 6},			/* cost of loading fp registers
				   in SFmode, DFmode and XFmode */
  {4, 4, 4},			/* cost of storing fp registers
				   in SFmode, DFmode and XFmode */
  2,				/* cost of moving MMX register */
  {2, 2},			/* cost of loading MMX registers
				   in SImode and DImode */
  {2, 2},			/* cost of storing MMX registers
				   in SImode and DImode */
  2,				/* cost of moving SSE register */
  {2, 2, 8},			/* cost of loading SSE registers
				   in SImode, DImode and TImode */
  {2, 2, 8},			/* cost of storing SSE registers
				   in SImode, DImode and TImode */
  6,				/* MMX or SSE register to integer */
  32,				/* size of l1 cache.  */
  32,				/* size of l2 cache.  Some models
				   have integrated l2 cache, but
				   optimizing for k6 is not important
				   enough to worry about that.  */
  32,				/* size of prefetch block */
  1,				/* number of parallel prefetches */
  COSTS_N_INSNS (2),		/* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (2),		/* cost of FMUL instruction.  */
  COSTS_N_INSNS (56),		/* cost of FDIV instruction.  */
  COSTS_N_INSNS (2),		/* cost of FABS instruction.  */
  COSTS_N_INSNS (2),		/* cost of FCHS instruction.  */
  COSTS_N_INSNS (56),		/* cost of FSQRT instruction.  */
  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  1,				/* scalar_stmt_cost.  */
  1,				/* scalar_load_cost.  */
  1,				/* scalar_store_cost.  */
  1,				/* vec_stmt_cost.  */
  1,				/* vec_to_scalar_cost.  */
  1,				/* scalar_to_vec_cost.  */
  1,				/* vec_align_load_cost.  */
  2,				/* vec_unalign_load_cost.  */
  1,				/* vec_store_cost.  */
  3,				/* cond_taken_branch_cost.  */
  1,				/* cond_not_taken_branch_cost.  */
};

static const
struct processor_costs athlon_cost = {
  COSTS_N_INSNS (1),		/* cost of an add instruction */
  COSTS_N_INSNS (2),		/* cost of a lea instruction */
  COSTS_N_INSNS (1),		/* variable shift costs */
  COSTS_N_INSNS (1),		/* constant shift costs */
  {COSTS_N_INSNS (5),		/* cost of starting multiply for QI */
   COSTS_N_INSNS (5),		/* HI */
   COSTS_N_INSNS (5),		/* SI */
   COSTS_N_INSNS (5),		/* DI */
   COSTS_N_INSNS (5)},		/* other */
  0,				/* cost of multiply per each bit set */
  {COSTS_N_INSNS (18),		/* cost of a divide/mod for QI */
   COSTS_N_INSNS (26),		/* HI */
   COSTS_N_INSNS (42),		/* SI */
   COSTS_N_INSNS (74),		/* DI */
   COSTS_N_INSNS (74)},		/* other */
  COSTS_N_INSNS (1),		/* cost of movsx */
  COSTS_N_INSNS (1),		/* cost of movzx */
  8,				/* "large" insn */
  4,				/* cost for loading QImode using movzbl */
  {3, 4, 3},			/* cost of loading integer registers
				   in QImode, HImode and SImode.
				   Relative to reg-reg move (2).  */
  {3, 4, 3},			/* cost of storing integer registers */
  4,				/* cost of reg,reg fld/fst */
  {4, 4, 12},			/* cost of loading fp registers
				   in SFmode, DFmode and XFmode */
  {6, 6, 8},			/* cost of storing fp registers
				   in SFmode, DFmode and XFmode */
  2,				/* cost of moving MMX register */
  {4, 4},			/* cost of loading MMX registers
				   in SImode and DImode */
  {4, 4},			/* cost of storing MMX registers
				   in SImode and DImode */
  2,				/* cost of moving SSE register */
  {4, 4, 6},			/* cost of loading SSE registers
				   in SImode, DImode and TImode */
  {4, 4, 5},			/* cost of storing SSE registers
				   in SImode, DImode and TImode */
  5,				/* MMX or SSE register to integer */
  64,				/* size of l1 cache.  */
  256,				/* size of l2 cache.  */
  64,				/* size of prefetch block */
  6,				/* number of parallel prefetches */
  COSTS_N_INSNS (4),		/* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (4),		/* cost of FMUL instruction.  */
  COSTS_N_INSNS (24),		/* cost of FDIV instruction.  */
  COSTS_N_INSNS (2),		/* cost of FABS instruction.  */
  COSTS_N_INSNS (2),		/* cost of FCHS instruction.  */
  COSTS_N_INSNS (35),		/* cost of FSQRT instruction.  */
  /* For some reason, Athlon deals better with the REP prefix (relative to
     loops) than K8 does.  Alignment becomes important after 8 bytes for
     memcpy and 128 bytes for memset.  */
  {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  1,				/* scalar_stmt_cost.  */
  1,				/* scalar_load_cost.  */
  1,				/* scalar_store_cost.  */
  1,				/* vec_stmt_cost.  */
  1,				/* vec_to_scalar_cost.  */
  1,				/* scalar_to_vec_cost.  */
  1,				/* vec_align_load_cost.  */
  2,				/* vec_unalign_load_cost.  */
  1,				/* vec_store_cost.  */
  3,				/* cond_taken_branch_cost.  */
  1,				/* cond_not_taken_branch_cost.  */
};

static const
struct processor_costs k8_cost = {
  COSTS_N_INSNS (1),		/* cost of an add instruction */
  COSTS_N_INSNS (2),		/* cost of a lea instruction */
  COSTS_N_INSNS (1),		/* variable shift costs */
  COSTS_N_INSNS (1),		/* constant shift costs */
  {COSTS_N_INSNS (3),		/* cost of starting multiply for QI */
   COSTS_N_INSNS (4),		/* HI */
   COSTS_N_INSNS (3),		/* SI */
   COSTS_N_INSNS (4),		/* DI */
   COSTS_N_INSNS (5)},		/* other */
  0,				/* cost of multiply per each bit set */
  {COSTS_N_INSNS (18),		/* cost of a divide/mod for QI */
   COSTS_N_INSNS (26),		/* HI */
   COSTS_N_INSNS (42),		/* SI */
   COSTS_N_INSNS (74),		/* DI */
   COSTS_N_INSNS (74)},		/* other */
  COSTS_N_INSNS (1),		/* cost of movsx */
  COSTS_N_INSNS (1),		/* cost of movzx */
  8,				/* "large" insn */
  4,				/* cost for loading QImode using movzbl */
  {3, 4, 3},			/* cost of loading integer registers
				   in QImode, HImode and SImode.
				   Relative to reg-reg move (2).  */
  {3, 4, 3},			/* cost of storing integer registers */
  4,				/* cost of reg,reg fld/fst */
  {4, 4, 12},			/* cost of loading fp registers
				   in SFmode, DFmode and XFmode */
  {6, 6, 8},			/* cost of storing fp registers
				   in SFmode, DFmode and XFmode */
  2,				/* cost of moving MMX register */
  {3, 3},			/* cost of loading MMX registers
				   in SImode and DImode */
  {4, 4},			/* cost of storing MMX registers
				   in SImode and DImode */
  2,				/* cost of moving SSE register */
  {4, 3, 6},			/* cost of loading SSE registers
				   in SImode, DImode and TImode */
  {4, 4, 5},			/* cost of storing SSE registers
				   in SImode, DImode and TImode */
  5,				/* MMX or SSE register to integer */
  64,				/* size of l1 cache.  */
  512,				/* size of l2 cache.  */
  64,				/* size of prefetch block */
  /* New AMD processors never drop prefetches; if they cannot be performed
     immediately, they are queued.  We set the number of simultaneous
     prefetches to a large constant to reflect this (it probably is not a
     good idea not to limit the number of prefetches at all, as their
     execution also takes some time).  */
  100,				/* number of parallel prefetches */
  COSTS_N_INSNS (4),		/* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (4),		/* cost of FMUL instruction.  */
  COSTS_N_INSNS (19),		/* cost of FDIV instruction.  */
  COSTS_N_INSNS (2),		/* cost of FABS instruction.  */
  COSTS_N_INSNS (2),		/* cost of FCHS instruction.  */
  COSTS_N_INSNS (35),		/* cost of FSQRT instruction.  */
  /* K8 has an optimized REP instruction for medium-sized blocks, but for
     very small blocks it is better to use a loop.  For large blocks, a
     libcall can do nontemporal accesses and beat inline code
     considerably.  */
  {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
   {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  {{libcall, {{8, loop}, {24, unrolled_loop},
	      {2048, rep_prefix_4_byte}, {-1, libcall}}},
   {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
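  /* Each pair above holds the 32-bit entry first and the 64-bit entry
     second.  E.g. the 64-bit memset entry says: up to 48 bytes use an
     unrolled loop, up to 8192 bytes rep stosq (rep_prefix_8_byte), larger
     blocks a libcall, and a libcall when the size is unknown at compile
     time.  */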
  4,				/* scalar_stmt_cost.  */
  2,				/* scalar_load_cost.  */
  2,				/* scalar_store_cost.  */
  5,				/* vec_stmt_cost.  */
  0,				/* vec_to_scalar_cost.  */
  2,				/* scalar_to_vec_cost.  */
  2,				/* vec_align_load_cost.  */
  3,				/* vec_unalign_load_cost.  */
  3,				/* vec_store_cost.  */
  3,				/* cond_taken_branch_cost.  */
  2,				/* cond_not_taken_branch_cost.  */
};

static const
struct processor_costs amdfam10_cost = {
  COSTS_N_INSNS (1),		/* cost of an add instruction */
  COSTS_N_INSNS (2),		/* cost of a lea instruction */
  COSTS_N_INSNS (1),		/* variable shift costs */
  COSTS_N_INSNS (1),		/* constant shift costs */
  {COSTS_N_INSNS (3),		/* cost of starting multiply for QI */
   COSTS_N_INSNS (4),		/* HI */
   COSTS_N_INSNS (3),		/* SI */
   COSTS_N_INSNS (4),		/* DI */
   COSTS_N_INSNS (5)},		/* other */
  0,				/* cost of multiply per each bit set */
  {COSTS_N_INSNS (19),		/* cost of a divide/mod for QI */
   COSTS_N_INSNS (35),		/* HI */
   COSTS_N_INSNS (51),		/* SI */
   COSTS_N_INSNS (83),		/* DI */
   COSTS_N_INSNS (83)},		/* other */
  COSTS_N_INSNS (1),		/* cost of movsx */
  COSTS_N_INSNS (1),		/* cost of movzx */
  8,				/* "large" insn */
  4,				/* cost for loading QImode using movzbl */
  {3, 4, 3},			/* cost of loading integer registers
				   in QImode, HImode and SImode.
				   Relative to reg-reg move (2).  */
  {3, 4, 3},			/* cost of storing integer registers */
  4,				/* cost of reg,reg fld/fst */
  {4, 4, 12},			/* cost of loading fp registers
				   in SFmode, DFmode and XFmode */
  {6, 6, 8},			/* cost of storing fp registers
				   in SFmode, DFmode and XFmode */
  2,				/* cost of moving MMX register */
  {3, 3},			/* cost of loading MMX registers
				   in SImode and DImode */
  {4, 4},			/* cost of storing MMX registers
				   in SImode and DImode */
  2,				/* cost of moving SSE register */
  {4, 4, 3},			/* cost of loading SSE registers
				   in SImode, DImode and TImode */
  {4, 4, 5},			/* cost of storing SSE registers
				   in SImode, DImode and TImode */
  3,				/* MMX or SSE register to integer:
				   On K8:
				     MOVD reg64, xmmreg  Double  FSTORE 4
				     MOVD reg32, xmmreg  Double  FSTORE 4
				   On AMDFAM10:
				     MOVD reg64, xmmreg  Double  FADD 3
				     MOVD reg32, xmmreg  Double  FADD 3 */
  64,				/* size of l1 cache.  */
  512,				/* size of l2 cache.  */
  64,				/* size of prefetch block */
  /* New AMD processors never drop prefetches; if they cannot be performed
     immediately, they are queued.  We set the number of simultaneous
     prefetches to a large constant to reflect this (it probably is not a
     good idea not to limit the number of prefetches at all, as their
     execution also takes some time).  */
  100,				/* number of parallel prefetches */
  COSTS_N_INSNS (4),		/* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (4),		/* cost of FMUL instruction.  */
  COSTS_N_INSNS (19),		/* cost of FDIV instruction.  */
  COSTS_N_INSNS (2),		/* cost of FABS instruction.  */
  COSTS_N_INSNS (2),		/* cost of FCHS instruction.  */
  COSTS_N_INSNS (35),		/* cost of FSQRT instruction.  */

  /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but
     for very small blocks it is better to use a loop.  For large blocks, a
     libcall can do nontemporal accesses and beat inline code
     considerably.  */
  {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
   {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  {{libcall, {{8, loop}, {24, unrolled_loop},
	      {2048, rep_prefix_4_byte}, {-1, libcall}}},
   {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  4,				/* scalar_stmt_cost.  */
  2,				/* scalar_load_cost.  */
  2,				/* scalar_store_cost.  */
  6,				/* vec_stmt_cost.  */
  0,				/* vec_to_scalar_cost.  */
  2,				/* scalar_to_vec_cost.  */
  2,				/* vec_align_load_cost.  */
  2,				/* vec_unalign_load_cost.  */
  2,				/* vec_store_cost.  */
  2,				/* cond_taken_branch_cost.  */
  1,				/* cond_not_taken_branch_cost.  */
};

static const
struct processor_costs pentium4_cost = {
  COSTS_N_INSNS (1),		/* cost of an add instruction */
  COSTS_N_INSNS (3),		/* cost of a lea instruction */
  COSTS_N_INSNS (4),		/* variable shift costs */
  COSTS_N_INSNS (4),		/* constant shift costs */
  {COSTS_N_INSNS (15),		/* cost of starting multiply for QI */
   COSTS_N_INSNS (15),		/* HI */
   COSTS_N_INSNS (15),		/* SI */
   COSTS_N_INSNS (15),		/* DI */
   COSTS_N_INSNS (15)},		/* other */
  0,				/* cost of multiply per each bit set */
  {COSTS_N_INSNS (56),		/* cost of a divide/mod for QI */
   COSTS_N_INSNS (56),		/* HI */
   COSTS_N_INSNS (56),		/* SI */
   COSTS_N_INSNS (56),		/* DI */
   COSTS_N_INSNS (56)},		/* other */
  COSTS_N_INSNS (1),		/* cost of movsx */
  COSTS_N_INSNS (1),		/* cost of movzx */
  16,				/* "large" insn */
  2,				/* cost for loading QImode using movzbl */
  {4, 5, 4},			/* cost of loading integer registers
				   in QImode, HImode and SImode.
				   Relative to reg-reg move (2).  */
  {2, 3, 2},			/* cost of storing integer registers */
  2,				/* cost of reg,reg fld/fst */
  {2, 2, 6},			/* cost of loading fp registers
				   in SFmode, DFmode and XFmode */
  {4, 4, 6},			/* cost of storing fp registers
				   in SFmode, DFmode and XFmode */
  2,				/* cost of moving MMX register */
  {2, 2},			/* cost of loading MMX registers
				   in SImode and DImode */
  {2, 2},			/* cost of storing MMX registers
				   in SImode and DImode */
  12,				/* cost of moving SSE register */
  {12, 12, 12},			/* cost of loading SSE registers
				   in SImode, DImode and TImode */
  {2, 2, 8},			/* cost of storing SSE registers
				   in SImode, DImode and TImode */
  10,				/* MMX or SSE register to integer */
  8,				/* size of l1 cache.  */
  256,				/* size of l2 cache.  */
  64,				/* size of prefetch block */
  6,				/* number of parallel prefetches */
  COSTS_N_INSNS (5),		/* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (7),		/* cost of FMUL instruction.  */
  COSTS_N_INSNS (43),		/* cost of FDIV instruction.  */
  COSTS_N_INSNS (2),		/* cost of FABS instruction.  */
  COSTS_N_INSNS (2),		/* cost of FCHS instruction.  */
  COSTS_N_INSNS (43),		/* cost of FSQRT instruction.  */
  {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
   DUMMY_STRINGOP_ALGS},
  {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
	      {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  1,				/* scalar_stmt_cost.  */
  1,				/* scalar_load_cost.  */
  1,				/* scalar_store_cost.  */
  1,				/* vec_stmt_cost.  */
  1,				/* vec_to_scalar_cost.  */
  1,				/* scalar_to_vec_cost.  */
  1,				/* vec_align_load_cost.  */
  2,				/* vec_unalign_load_cost.  */
  1,				/* vec_store_cost.  */
  3,				/* cond_taken_branch_cost.  */
  1,				/* cond_not_taken_branch_cost.  */
};

static const
struct processor_costs nocona_cost = {
  COSTS_N_INSNS (1),		/* cost of an add instruction */
  COSTS_N_INSNS (1),		/* cost of a lea instruction */
  COSTS_N_INSNS (1),		/* variable shift costs */
  COSTS_N_INSNS (1),		/* constant shift costs */
  {COSTS_N_INSNS (10),		/* cost of starting multiply for QI */
   COSTS_N_INSNS (10),		/* HI */
   COSTS_N_INSNS (10),		/* SI */
   COSTS_N_INSNS (10),		/* DI */
   COSTS_N_INSNS (10)},		/* other */
  0,				/* cost of multiply per each bit set */
  {COSTS_N_INSNS (66),		/* cost of a divide/mod for QI */
   COSTS_N_INSNS (66),		/* HI */
   COSTS_N_INSNS (66),		/* SI */
   COSTS_N_INSNS (66),		/* DI */
   COSTS_N_INSNS (66)},		/* other */
  COSTS_N_INSNS (1),		/* cost of movsx */
  COSTS_N_INSNS (1),		/* cost of movzx */
  16,				/* "large" insn */
  4,				/* cost for loading QImode using movzbl */
  {4, 4, 4},			/* cost of loading integer registers
				   in QImode, HImode and SImode.
				   Relative to reg-reg move (2).  */
  {4, 4, 4},			/* cost of storing integer registers */
  3,				/* cost of reg,reg fld/fst */
  {12, 12, 12},			/* cost of loading fp registers
				   in SFmode, DFmode and XFmode */
  {4, 4, 4},			/* cost of storing fp registers
				   in SFmode, DFmode and XFmode */
  6,				/* cost of moving MMX register */
  {12, 12},			/* cost of loading MMX registers
				   in SImode and DImode */
  {12, 12},			/* cost of storing MMX registers
				   in SImode and DImode */
  6,				/* cost of moving SSE register */
  {12, 12, 12},			/* cost of loading SSE registers
				   in SImode, DImode and TImode */
  {12, 12, 12},			/* cost of storing SSE registers
				   in SImode, DImode and TImode */
  8,				/* MMX or SSE register to integer */
  8,				/* size of l1 cache.  */
  1024,				/* size of l2 cache.  */
  128,				/* size of prefetch block */
  8,				/* number of parallel prefetches */
  COSTS_N_INSNS (6),		/* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (8),		/* cost of FMUL instruction.  */
  COSTS_N_INSNS (40),		/* cost of FDIV instruction.  */
  COSTS_N_INSNS (3),		/* cost of FABS instruction.  */
  COSTS_N_INSNS (3),		/* cost of FCHS instruction.  */
  COSTS_N_INSNS (44),		/* cost of FSQRT instruction.  */
  {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
   {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
	      {100000, unrolled_loop}, {-1, libcall}}}},
  {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
	      {-1, libcall}}},
   {libcall, {{24, loop}, {64, unrolled_loop},
	      {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  1,				/* scalar_stmt_cost.  */
  1,				/* scalar_load_cost.  */
  1,				/* scalar_store_cost.  */
  1,				/* vec_stmt_cost.  */
  1,				/* vec_to_scalar_cost.  */
  1,				/* scalar_to_vec_cost.  */
  1,				/* vec_align_load_cost.  */
  2,				/* vec_unalign_load_cost.  */
  1,				/* vec_store_cost.  */
  3,				/* cond_taken_branch_cost.  */
  1,				/* cond_not_taken_branch_cost.  */
};

static const
struct processor_costs core2_cost = {
  COSTS_N_INSNS (1),		/* cost of an add instruction */
  COSTS_N_INSNS (1) + 1,	/* cost of a lea instruction */
  COSTS_N_INSNS (1),		/* variable shift costs */
  COSTS_N_INSNS (1),		/* constant shift costs */
  {COSTS_N_INSNS (3),		/* cost of starting multiply for QI */
   COSTS_N_INSNS (3),		/* HI */
   COSTS_N_INSNS (3),		/* SI */
   COSTS_N_INSNS (3),		/* DI */
   COSTS_N_INSNS (3)},		/* other */
  0,				/* cost of multiply per each bit set */
  {COSTS_N_INSNS (22),		/* cost of a divide/mod for QI */
   COSTS_N_INSNS (22),		/* HI */
   COSTS_N_INSNS (22),		/* SI */
   COSTS_N_INSNS (22),		/* DI */
   COSTS_N_INSNS (22)},		/* other */
  COSTS_N_INSNS (1),		/* cost of movsx */
  COSTS_N_INSNS (1),		/* cost of movzx */
  8,				/* "large" insn */
  2,				/* cost for loading QImode using movzbl */
  {6, 6, 6},			/* cost of loading integer registers
				   in QImode, HImode and SImode.
				   Relative to reg-reg move (2).  */
  {4, 4, 4},			/* cost of storing integer registers */
  2,				/* cost of reg,reg fld/fst */
  {6, 6, 6},			/* cost of loading fp registers
				   in SFmode, DFmode and XFmode */
  {4, 4, 4},			/* cost of storing fp registers
				   in SFmode, DFmode and XFmode */
  2,				/* cost of moving MMX register */
  {6, 6},			/* cost of loading MMX registers
				   in SImode and DImode */
  {4, 4},			/* cost of storing MMX registers
				   in SImode and DImode */
  2,				/* cost of moving SSE register */
  {6, 6, 6},			/* cost of loading SSE registers
				   in SImode, DImode and TImode */
  {4, 4, 4},			/* cost of storing SSE registers
				   in SImode, DImode and TImode */
  2,				/* MMX or SSE register to integer */
  32,				/* size of l1 cache.  */
  2048,				/* size of l2 cache.  */
  128,				/* size of prefetch block */
  8,				/* number of parallel prefetches */
  3,				/* Branch cost */
  COSTS_N_INSNS (3),		/* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (5),		/* cost of FMUL instruction.  */
  COSTS_N_INSNS (32),		/* cost of FDIV instruction.  */
  COSTS_N_INSNS (1),		/* cost of FABS instruction.  */
  COSTS_N_INSNS (1),		/* cost of FCHS instruction.  */
  COSTS_N_INSNS (58),		/* cost of FSQRT instruction.  */
  {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
   {libcall, {{32, loop}, {64, rep_prefix_4_byte},
	      {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  {{libcall, {{8, loop}, {15, unrolled_loop},
	      {2048, rep_prefix_4_byte}, {-1, libcall}}},
   {libcall, {{24, loop}, {32, unrolled_loop},
	      {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  1,				/* scalar_stmt_cost.  */
  1,				/* scalar_load_cost.  */
  1,				/* scalar_store_cost.  */
  1,				/* vec_stmt_cost.  */
  1,				/* vec_to_scalar_cost.  */
  1,				/* scalar_to_vec_cost.  */
  1,				/* vec_align_load_cost.  */
  2,				/* vec_unalign_load_cost.  */
  1,				/* vec_store_cost.  */
  3,				/* cond_taken_branch_cost.  */
  1,				/* cond_not_taken_branch_cost.  */
};

/* Generic64 should produce code tuned for Nocona and K8.  */
static const
struct processor_costs generic64_cost = {
  COSTS_N_INSNS (1),		/* cost of an add instruction */
  /* On all chips taken into consideration, lea is 2 cycles or more.  With
     this cost, however, our current implementation of synth_mult results in
     the use of unnecessary temporary registers, causing regressions on
     several SPECfp benchmarks.  */
  COSTS_N_INSNS (1) + 1,	/* cost of a lea instruction */
  COSTS_N_INSNS (1),		/* variable shift costs */
  COSTS_N_INSNS (1),		/* constant shift costs */
  {COSTS_N_INSNS (3),		/* cost of starting multiply for QI */
   COSTS_N_INSNS (4),		/* HI */
   COSTS_N_INSNS (3),		/* SI */
   COSTS_N_INSNS (4),		/* DI */
   COSTS_N_INSNS (2)},		/* other */
  0,				/* cost of multiply per each bit set */
  {COSTS_N_INSNS (18),		/* cost of a divide/mod for QI */
   COSTS_N_INSNS (26),		/* HI */
   COSTS_N_INSNS (42),		/* SI */
   COSTS_N_INSNS (74),		/* DI */
   COSTS_N_INSNS (74)},		/* other */
  COSTS_N_INSNS (1),		/* cost of movsx */
  COSTS_N_INSNS (1),		/* cost of movzx */
  8,				/* "large" insn */
  17,				/* MOVE_RATIO */
  4,				/* cost for loading QImode using movzbl */
  {4, 4, 4},			/* cost of loading integer registers
				   in QImode, HImode and SImode.
				   Relative to reg-reg move (2).  */
  {4, 4, 4},			/* cost of storing integer registers */
  4,				/* cost of reg,reg fld/fst */
  {12, 12, 12},			/* cost of loading fp registers
				   in SFmode, DFmode and XFmode */
  {6, 6, 8},			/* cost of storing fp registers
				   in SFmode, DFmode and XFmode */
  2,				/* cost of moving MMX register */
  {8, 8},			/* cost of loading MMX registers
				   in SImode and DImode */
  {8, 8},			/* cost of storing MMX registers
				   in SImode and DImode */
  2,				/* cost of moving SSE register */
  {8, 8, 8},			/* cost of loading SSE registers
				   in SImode, DImode and TImode */
  {8, 8, 8},			/* cost of storing SSE registers
				   in SImode, DImode and TImode */
  5,				/* MMX or SSE register to integer */
  32,				/* size of l1 cache.  */
  512,				/* size of l2 cache.  */
  64,				/* size of prefetch block */
  6,				/* number of parallel prefetches */
  /* Benchmarks show large regressions on the K8 sixtrack benchmark when
     this value is increased to the perhaps more appropriate value of 5.  */
  3,				/* Branch cost */
  COSTS_N_INSNS (8),		/* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (8),		/* cost of FMUL instruction.  */
  COSTS_N_INSNS (20),		/* cost of FDIV instruction.  */
  COSTS_N_INSNS (8),		/* cost of FABS instruction.  */
  COSTS_N_INSNS (8),		/* cost of FCHS instruction.  */
  COSTS_N_INSNS (40),		/* cost of FSQRT instruction.  */
  {DUMMY_STRINGOP_ALGS,
   {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  {DUMMY_STRINGOP_ALGS,
   {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  1,				/* scalar_stmt_cost.  */
  1,				/* scalar_load_cost.  */
  1,				/* scalar_store_cost.  */
  1,				/* vec_stmt_cost.  */
  1,				/* vec_to_scalar_cost.  */
  1,				/* scalar_to_vec_cost.  */
  1,				/* vec_align_load_cost.  */
  2,				/* vec_unalign_load_cost.  */
  1,				/* vec_store_cost.  */
  3,				/* cond_taken_branch_cost.  */
  1,				/* cond_not_taken_branch_cost.  */
};

/* Generic32 should produce code tuned for Athlon, PPro, Pentium4, Nocona
   and K8.  */
static const
struct processor_costs generic32_cost = {
  COSTS_N_INSNS (1),		/* cost of an add instruction */
  COSTS_N_INSNS (1) + 1,	/* cost of a lea instruction */
  COSTS_N_INSNS (1),		/* variable shift costs */
  COSTS_N_INSNS (1),		/* constant shift costs */
  {COSTS_N_INSNS (3),		/* cost of starting multiply for QI */
   COSTS_N_INSNS (4),		/* HI */
   COSTS_N_INSNS (3),		/* SI */
   COSTS_N_INSNS (4),		/* DI */
   COSTS_N_INSNS (2)},		/* other */
  0,				/* cost of multiply per each bit set */
  {COSTS_N_INSNS (18),		/* cost of a divide/mod for QI */
   COSTS_N_INSNS (26),		/* HI */
   COSTS_N_INSNS (42),		/* SI */
   COSTS_N_INSNS (74),		/* DI */
   COSTS_N_INSNS (74)},		/* other */
  COSTS_N_INSNS (1),		/* cost of movsx */
  COSTS_N_INSNS (1),		/* cost of movzx */
  8,				/* "large" insn */
  17,				/* MOVE_RATIO */
  4,				/* cost for loading QImode using movzbl */
  {4, 4, 4},			/* cost of loading integer registers
				   in QImode, HImode and SImode.
				   Relative to reg-reg move (2).  */
  {4, 4, 4},			/* cost of storing integer registers */
  4,				/* cost of reg,reg fld/fst */
  {12, 12, 12},			/* cost of loading fp registers
				   in SFmode, DFmode and XFmode */
  {6, 6, 8},			/* cost of storing fp registers
				   in SFmode, DFmode and XFmode */
  2,				/* cost of moving MMX register */
  {8, 8},			/* cost of loading MMX registers
				   in SImode and DImode */
  {8, 8},			/* cost of storing MMX registers
				   in SImode and DImode */
  2,				/* cost of moving SSE register */
  {8, 8, 8},			/* cost of loading SSE registers
				   in SImode, DImode and TImode */
  {8, 8, 8},			/* cost of storing SSE registers
				   in SImode, DImode and TImode */
  5,				/* MMX or SSE register to integer */
  32,				/* size of l1 cache.  */
  256,				/* size of l2 cache.  */
  64,				/* size of prefetch block */
  6,				/* number of parallel prefetches */
  3,				/* Branch cost */
  COSTS_N_INSNS (8),		/* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (8),		/* cost of FMUL instruction.  */
  COSTS_N_INSNS (20),		/* cost of FDIV instruction.  */
  COSTS_N_INSNS (8),		/* cost of FABS instruction.  */
  COSTS_N_INSNS (8),		/* cost of FCHS instruction.  */
  COSTS_N_INSNS (40),		/* cost of FSQRT instruction.  */
  {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  1,				/* scalar_stmt_cost.  */
  1,				/* scalar_load_cost.  */
  1,				/* scalar_store_cost.  */
  1,				/* vec_stmt_cost.  */
  1,				/* vec_to_scalar_cost.  */
  1,				/* scalar_to_vec_cost.  */
  1,				/* vec_align_load_cost.  */
  2,				/* vec_unalign_load_cost.  */
  1,				/* vec_store_cost.  */
  3,				/* cond_taken_branch_cost.  */
  1,				/* cond_not_taken_branch_cost.  */
};

const struct processor_costs *ix86_cost = &pentium_cost;
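
/* ix86_cost starts out pointing at the Pentium table only as a placeholder;
   option processing re-points it according to -mtune (or at size_cost when
   optimizing for size), and the cost hooks then read fields such as
   ix86_cost->add through this pointer.  */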

/* Processor feature/optimization bitmasks.  */
#define m_386 (1<<PROCESSOR_I386)
#define m_486 (1<<PROCESSOR_I486)
#define m_PENT (1<<PROCESSOR_PENTIUM)
#define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
#define m_PENT4 (1<<PROCESSOR_PENTIUM4)
#define m_NOCONA (1<<PROCESSOR_NOCONA)
#define m_CORE2 (1<<PROCESSOR_CORE2)

#define m_GEODE (1<<PROCESSOR_GEODE)
#define m_K6 (1<<PROCESSOR_K6)
#define m_K6_GEODE (m_K6 | m_GEODE)
#define m_K8 (1<<PROCESSOR_K8)
#define m_ATHLON (1<<PROCESSOR_ATHLON)
#define m_ATHLON_K8 (m_K8 | m_ATHLON)
#define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
#define m_AMD_MULTIPLE (m_K8 | m_ATHLON | m_AMDFAM10)

#define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
#define m_GENERIC64 (1<<PROCESSOR_GENERIC64)

/* Generic instruction choice should be the common subset of supported CPUs
   (PPro/PENT4/NOCONA/CORE2/Athlon/K8).  */
#define m_GENERIC (m_GENERIC32 | m_GENERIC64)
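
/* Illustrative use of these masks (a sketch; the real accessor macros live
   in i386.h): each table entry below is a bitmask over PROCESSOR_* values,
   so testing a tuning flag for the active CPU reduces to something like

     if (ix86_tune_features[X86_TUNE_PUSH_MEMORY] & (1 << ix86_tune))
       ...push memory operands directly...

   where ix86_tune holds the PROCESSOR_* value chosen by -mtune.  */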

/* Feature tests against the various tunings.  */
unsigned int ix86_tune_features[X86_TUNE_LAST] = {
  /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
     negatively, so enabling for Generic64 seems like a good code size
     tradeoff.  We can't enable it for 32bit generic because it does not
     work well with PPro based chips.  */
  m_386 | m_K6_GEODE | m_AMD_MULTIPLE | m_CORE2 | m_GENERIC64,

  /* X86_TUNE_PUSH_MEMORY */
  m_386 | m_K6_GEODE | m_AMD_MULTIPLE | m_PENT4
  | m_NOCONA | m_CORE2 | m_GENERIC,

  /* X86_TUNE_ZERO_EXTEND_WITH_AND */

  /* X86_TUNE_USE_BIT_TEST */

  /* X86_TUNE_UNROLL_STRLEN */
  m_486 | m_PENT | m_PPRO | m_AMD_MULTIPLE | m_K6 | m_CORE2 | m_GENERIC,

  /* X86_TUNE_DEEP_BRANCH_PREDICTION */
  m_PPRO | m_K6_GEODE | m_AMD_MULTIPLE | m_PENT4 | m_GENERIC,

  /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
     on simulation results.  But after P4 was made, no performance benefit
     was observed with branch hints.  It also increases the code size.
     As a result, icc never generates branch hints.  */

  /* X86_TUNE_DOUBLE_WITH_ADD */

  /* X86_TUNE_USE_SAHF */
  m_PPRO | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_PENT4
  | m_NOCONA | m_CORE2 | m_GENERIC,

  /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
     partial dependencies.  */
  m_AMD_MULTIPLE | m_PPRO | m_PENT4 | m_NOCONA
  | m_CORE2 | m_GENERIC | m_GEODE /* m_386 | m_K6 */,

  /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
     register stalls on the Generic32 compilation setting as well.  However,
     in the current implementation the partial register stalls are not
     eliminated very well - they can be introduced via subregs synthesized
     by combine and can happen in caller/callee saving sequences.  Because
     this option pays back little on PPro based chips and is in conflict
     with the partial register dependencies used by Athlon/P4 based chips,
     it is better to leave it off for generic32 for now.  */

  /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
  m_CORE2 | m_GENERIC,

  /* X86_TUNE_USE_HIMODE_FIOP */
  m_386 | m_486 | m_K6_GEODE,

  /* X86_TUNE_USE_SIMODE_FIOP */
  ~(m_PPRO | m_AMD_MULTIPLE | m_PENT | m_CORE2 | m_GENERIC),

  /* X86_TUNE_USE_MOV0 */

  /* X86_TUNE_USE_CLTD */
  ~(m_PENT | m_K6 | m_CORE2 | m_GENERIC),

  /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx.  */

  /* X86_TUNE_SPLIT_LONG_MOVES */

  /* X86_TUNE_READ_MODIFY_WRITE */

  /* X86_TUNE_READ_MODIFY */

  /* X86_TUNE_PROMOTE_QIMODE */
  m_K6_GEODE | m_PENT | m_386 | m_486 | m_AMD_MULTIPLE | m_CORE2
  | m_GENERIC /* | m_PENT4 ? */,

  /* X86_TUNE_FAST_PREFIX */
  ~(m_PENT | m_486 | m_386),

  /* X86_TUNE_SINGLE_STRINGOP */
  m_386 | m_PENT4 | m_NOCONA,

  /* X86_TUNE_QIMODE_MATH */

  /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
     register stalls.  Just like X86_TUNE_PARTIAL_REG_STALL this option
     might be considered for Generic32 if our scheme for avoiding partial
     stalls were more effective.  */

  /* X86_TUNE_PROMOTE_QI_REGS */

  /* X86_TUNE_PROMOTE_HI_REGS */

  /* X86_TUNE_ADD_ESP_4: Enable if add/sub is preferred over 1/2 push/pop.  */
  m_AMD_MULTIPLE | m_K6_GEODE | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,

  /* X86_TUNE_ADD_ESP_8 */
  m_AMD_MULTIPLE | m_PPRO | m_K6_GEODE | m_386
  | m_486 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,

  /* X86_TUNE_SUB_ESP_4 */
  m_AMD_MULTIPLE | m_PPRO | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,

  /* X86_TUNE_SUB_ESP_8 */
  m_AMD_MULTIPLE | m_PPRO | m_386 | m_486
  | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,

  /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
     for DFmode copies.  */
  ~(m_AMD_MULTIPLE | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2
    | m_GENERIC | m_GEODE),

  /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
  m_AMD_MULTIPLE | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,

  /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
     conflict here in between PPro/Pentium4 based chips that treat 128bit
     SSE registers as single units versus K8 based chips that divide SSE
     registers into two 64bit halves.  This knob promotes all store
     destinations to be 128bit so as to allow register renaming on 128bit
     SSE units, but usually results in one extra microop on 64bit SSE units.
     Experimental results show that disabling this option on P4 brings over
     20% SPECfp regression, while enabling it on K8 brings roughly a 2.4%
     regression that can be partly masked by careful scheduling of moves.  */
  m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC | m_AMDFAM10,

  /* X86_TUNE_SSE_UNALIGNED_MOVE_OPTIMAL */

  /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
     are resolved on SSE register parts instead of whole registers, so we may
     maintain just the lower part of scalar values in the proper format,
     leaving the upper part undefined.  */

  /* X86_TUNE_SSE_TYPELESS_STORES */

  /* X86_TUNE_SSE_LOAD0_BY_PXOR */
  m_PPRO | m_PENT4 | m_NOCONA,

  /* X86_TUNE_MEMORY_MISMATCH_STALL */
  m_AMD_MULTIPLE | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,

  /* X86_TUNE_PROLOGUE_USING_MOVE */
  m_ATHLON_K8 | m_PPRO | m_CORE2 | m_GENERIC,

  /* X86_TUNE_EPILOGUE_USING_MOVE */
  m_ATHLON_K8 | m_PPRO | m_CORE2 | m_GENERIC,

  /* X86_TUNE_SHIFT1 */

  /* X86_TUNE_USE_FFREEP */

  /* X86_TUNE_INTER_UNIT_MOVES */
  ~(m_AMD_MULTIPLE | m_GENERIC),

  /* X86_TUNE_INTER_UNIT_CONVERSIONS */

  /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
     than 4 branch instructions in the 16 byte window.  */
  m_PPRO | m_AMD_MULTIPLE | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,

  /* X86_TUNE_SCHEDULE */
  m_PPRO | m_AMD_MULTIPLE | m_K6_GEODE | m_PENT | m_CORE2 | m_GENERIC,

  /* X86_TUNE_USE_BT */

  /* X86_TUNE_USE_INCDEC */
  ~(m_PENT4 | m_NOCONA | m_GENERIC),

  /* X86_TUNE_PAD_RETURNS */
  m_AMD_MULTIPLE | m_CORE2 | m_GENERIC,

  /* X86_TUNE_EXT_80387_CONSTANTS */
  m_K6_GEODE | m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2
  | m_GENERIC,

  /* X86_TUNE_SHORTEN_X87_SSE */

  /* X86_TUNE_AVOID_VECTOR_DECODE */

  /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for
     HImode and SImode multiplies, but the 386 and 486 do HImode multiplies
     faster.  */

  /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of a 32-bit constant and a memory
     operand is a vector path on AMD machines.  */
  m_K8 | m_GENERIC64 | m_AMDFAM10,

  /* X86_TUNE_SLOW_IMUL_IMM8: Imul of an 8-bit constant is a vector path
     on AMD machines.  */
  m_K8 | m_GENERIC64 | m_AMDFAM10,

  /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
     than via a MOV.  */

  /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
     but one byte longer.  */

  /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with a memory
     operand that cannot be represented using a modRM byte.  The XOR
     replacement is long decoded, so this split helps here as well.  */

  /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
     from integer to FP.  */
};

/* Feature tests against the various architecture variations.  */
unsigned int ix86_arch_features[X86_ARCH_LAST] = {
  /* X86_ARCH_CMOVE: Conditional move was added for pentiumpro.  */
  ~(m_386 | m_486 | m_PENT | m_K6),

  /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486.  */

  /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium.  */

  /* X86_ARCH_XADD: Exchange and add was added for 80486.  */

  /* X86_ARCH_BSWAP: Byteswap was added for 80486.  */
};

static const unsigned int x86_accumulate_outgoing_args
  = m_AMD_MULTIPLE | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC;

static const unsigned int x86_arch_always_fancy_math_387
  = m_PENT | m_PPRO | m_AMD_MULTIPLE | m_PENT4
    | m_NOCONA | m_CORE2 | m_GENERIC;

static enum stringop_alg stringop_alg = no_stringop;

/* In case the average insn count for a single function invocation is
   lower than this constant, emit fast (but longer) prologue and
   epilogue code.  */
#define FAST_PROLOGUE_INSN_COUNT 20
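
/* A sketch of how this threshold is consumed (the real test lives in the
   frame layout code): count the insns in the function body and set
   something like
     use_fast_prologue_epilogue = count < FAST_PROLOGUE_INSN_COUNT;
   so that short functions trade prologue size for speed.  */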

/* Names for the 8-bit (low), 8-bit (high), and 16-bit registers,
   respectively.  */
static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
static const char *const hi_reg_name[] = HI_REGISTER_NAMES;

/* Array of the smallest class containing reg number REGNO, indexed by
   REGNO.  Used by REGNO_REG_CLASS in i386.h.  */
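
/* For example, REGNO_REG_CLASS (1) is DREG (%edx), while
   REGNO_REG_CLASS (7) is NON_Q_REGS because %esp has no QImode part.  */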

enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
{
  /* ax, dx, cx, bx */
  AREG, DREG, CREG, BREG,
  /* si, di, bp, sp */
  SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
  /* FP registers */
  FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
  FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
  /* arg pointer */
  NON_Q_REGS,
  /* flags, fpsr, fpcr, frame */
  NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
  /* SSE registers */
  SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
  SSE_REGS, SSE_REGS,
  /* MMX registers */
  MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
  MMX_REGS, MMX_REGS,
  /* REX registers */
  NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
  NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
  /* SSE REX registers */
  SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
  SSE_REGS, SSE_REGS,
};

/* The "default" register map used in 32bit mode.  */

int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
{
  0, 2, 1, 3, 6, 7, 4, 5,		/* general regs */
  12, 13, 14, 15, 16, 17, 18, 19,	/* fp regs */
  -1, -1, -1, -1, -1,			/* arg, flags, fpsr, fpcr, frame */
  21, 22, 23, 24, 25, 26, 27, 28,	/* SSE */
  29, 30, 31, 32, 33, 34, 35, 36,	/* MMX */
  -1, -1, -1, -1, -1, -1, -1, -1,	/* extended integer registers */
  -1, -1, -1, -1, -1, -1, -1, -1,	/* extended SSE registers */
};

static int const x86_64_int_parameter_registers[6] =
{
  5 /*RDI*/, 4 /*RSI*/, 1 /*RDX*/, 2 /*RCX*/,
  FIRST_REX_INT_REG /*R8 */, FIRST_REX_INT_REG + 1 /*R9 */
};

static int const x86_64_ms_abi_int_parameter_registers[4] =
{
  2 /*RCX*/, 1 /*RDX*/,
  FIRST_REX_INT_REG /*R8 */, FIRST_REX_INT_REG + 1 /*R9 */
};

static int const x86_64_int_return_registers[4] =
{
  0 /*RAX*/, 1 /*RDX*/, 5 /*RDI*/, 4 /*RSI*/
};
1535 /* The "default" register map used in 64bit mode. */
1536 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
1538 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
1539 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
1540 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
1541 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
1542 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
1543 8, 9, 10, 11, 12, 13, 14, 15, /* extended integer registers */
1544 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
1547 /* Define the register numbers to be used in Dwarf debugging information.
1548 The SVR4 reference port C compiler uses the following register numbers
1549 in its Dwarf output code:
1550 0 for %eax (gcc regno = 0)
1551 1 for %ecx (gcc regno = 2)
1552 2 for %edx (gcc regno = 1)
1553 3 for %ebx (gcc regno = 3)
1554 4 for %esp (gcc regno = 7)
1555 5 for %ebp (gcc regno = 6)
1556 6 for %esi (gcc regno = 4)
1557 7 for %edi (gcc regno = 5)
1558 The following three DWARF register numbers are never generated by
1559 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
1560 believes these numbers have these meanings.
1561 8 for %eip (no gcc equivalent)
1562 9 for %eflags (gcc regno = 17)
1563 10 for %trapno (no gcc equivalent)
1564 It is not at all clear how we should number the FP stack registers
1565 for the x86 architecture. If the version of SDB on x86/svr4 were
1566 a bit less brain dead with respect to floating-point then we would
1567 have a precedent to follow with respect to DWARF register numbers
1568 for x86 FP registers, but the SDB on x86/svr4 is so completely
1569 broken with respect to FP registers that it is hardly worth thinking
1570 of it as something to strive for compatibility with.
1571 The version of x86/svr4 SDB I have at the moment does (partially)
1572 seem to believe that DWARF register number 11 is associated with
1573 the x86 register %st(0), but that's about all. Higher DWARF
1574 register numbers don't seem to be associated with anything in
1575 particular, and even for DWARF regno 11, SDB only seems to under-
1576 stand that it should say that a variable lives in %st(0) (when
1577 asked via an `=' command) if we said it was in DWARF regno 11,
1578 but SDB still prints garbage when asked for the value of the
1579 variable in question (via a `/' command).
1580 (Also note that the labels SDB prints for various FP stack regs
1581 when doing an `x' command are all wrong.)
1582 Note that these problems generally don't affect the native SVR4
1583 C compiler because it doesn't allow the use of -O with -g and
1584 because when it is *not* optimizing, it allocates a memory
1585 location for each floating-point variable, and the memory
1586 location is what gets described in the DWARF AT_location
1587 attribute for the variable in question.
1588 Regardless of the severe mental illness of the x86/svr4 SDB, we
1589 do something sensible here and we use the following DWARF
1590 register numbers.  Note that these are all stack-top-relative
1591 numbers:
1592 11 for %st(0) (gcc regno = 8)
1593 12 for %st(1) (gcc regno = 9)
1594 13 for %st(2) (gcc regno = 10)
1595 14 for %st(3) (gcc regno = 11)
1596 15 for %st(4) (gcc regno = 12)
1597 16 for %st(5) (gcc regno = 13)
1598 17 for %st(6) (gcc regno = 14)
1599 18 for %st(7) (gcc regno = 15)
1601 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
1603 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
1604 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
1605 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
1606 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
1607 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
1608 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
1609 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
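/* Illustrative sketch, kept out of the build: reading the SVR4 DWARF
   numbers documented above out of the map (the function name here is
   hypothetical, for illustration only).  */
#if 0
static int
svr4_dwarf_regno_example (void)
{
  int edx = svr4_dbx_register_map[1];	/* %edx, gcc regno 1 -> DWARF 2 */
  int st3 = svr4_dbx_register_map[11];	/* %st(3), gcc regno 11 -> DWARF 14 */
  return edx + st3;
}
#endif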
1612 /* Test and compare insns in i386.md store the information needed to
1613 generate branch and scc insns here. */
1615 rtx ix86_compare_op0 = NULL_RTX;
1616 rtx ix86_compare_op1 = NULL_RTX;
1617 rtx ix86_compare_emitted = NULL_RTX;
1619 /* Size of the register save area. */
1620 #define X86_64_VARARGS_SIZE (REGPARM_MAX * UNITS_PER_WORD + SSE_REGPARM_MAX * 16)
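/* A worked instance of the formula above, using the 64-bit ABI values
   REGPARM_MAX == 6, UNITS_PER_WORD == 8 and SSE_REGPARM_MAX == 8:
   the register save area is 6*8 + 8*16 = 176 bytes.  */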
1622 /* Define the structure for the machine field in struct function. */
1624 struct stack_local_entry GTY(())
1626 unsigned short mode;
1629 struct stack_local_entry *next;
1632 /* Structure describing stack frame layout.
1633    Stack grows downward:
1635    [arguments]
1636 					<- ARG_POINTER
1637    saved pc
1639    saved frame pointer if frame_pointer_needed
1640 					<- HARD_FRAME_POINTER
1641    [saved regs]
1643    [padding1]          \
1644 		        )
1645    [va_arg registers]  (
1646 		        > to_allocate	       <- FRAME_POINTER
1647    [frame]	       (
1648 		        )
1649    [padding2]	       /
1650   */
1651 struct ix86_frame
1652 {
1653 int nregs;
1654 int padding1;
1655 int va_arg_size;
1656 HOST_WIDE_INT frame;
1657 int padding2;
1658 int outgoing_arguments_size;
1659 int red_zone_size;
1661 HOST_WIDE_INT to_allocate;
1662 /* The offsets relative to ARG_POINTER. */
1663 HOST_WIDE_INT frame_pointer_offset;
1664 HOST_WIDE_INT hard_frame_pointer_offset;
1665 HOST_WIDE_INT stack_pointer_offset;
1667 /* When save_regs_using_mov is set, emit prologue using
1668 move instead of push instructions. */
1669 bool save_regs_using_mov;
1672 /* Code model option. */
1673 enum cmodel ix86_cmodel;
1675 enum asm_dialect ix86_asm_dialect = ASM_ATT;
1677 enum tls_dialect ix86_tls_dialect = TLS_DIALECT_GNU;
1679 /* Which unit we are generating floating point math for. */
1680 enum fpmath_unit ix86_fpmath;
1682 /* Which CPU we are scheduling for.  */
1683 enum processor_type ix86_tune;
1685 /* Which instruction set architecture to use. */
1686 enum processor_type ix86_arch;
1688 /* True if the SSE prefetch instruction is not a NOP.  */
1689 int x86_prefetch_sse;
1691 /* ix86_regparm_string as a number */
1692 static int ix86_regparm;
1694 /* -mstackrealign option */
1695 extern int ix86_force_align_arg_pointer;
1696 static const char ix86_force_align_arg_pointer_string[] = "force_align_arg_pointer";
1698 /* Preferred alignment for stack boundary in bits. */
1699 unsigned int ix86_preferred_stack_boundary;
1701 /* Values 1-5: see jump.c */
1702 int ix86_branch_cost;
1704 /* Variables which are this size or smaller are put in the data/bss
1705 or ldata/lbss sections. */
1707 int ix86_section_threshold = 65536;
1709 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
1710 char internal_label_prefix[16];
1711 int internal_label_prefix_len;
1713 /* Fence to use after loop using movnt. */
1716 /* Register class used for passing a given 64bit part of the argument.
1717 These represent classes as documented by the psABI, with the exception
1718 of the SSESF and SSEDF classes, which are basically the SSE class, except
1719 that gcc will use SFmode or DFmode moves instead of DImode to avoid reformatting penalties.
1721 Similarly we play games with the INTEGERSI_CLASS to use cheaper SImode moves
1722 whenever possible (the upper half does contain padding). */
1723 enum x86_64_reg_class
1726 X86_64_INTEGER_CLASS,
1727 X86_64_INTEGERSI_CLASS,
1734 X86_64_COMPLEX_X87_CLASS,
1737 static const char * const x86_64_reg_class_name[] =
1739 "no", "integer", "integerSI", "sse", "sseSF", "sseDF",
1740 "sseup", "x87", "x87up", "cplx87", "no"
1743 #define MAX_CLASSES 4
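/* An illustrative classification under the scheme above (assuming the
   usual x86-64 psABI layout): `struct s { double d; int i; }' occupies
   two eightbytes; the first is classified X86_64_SSEDF_CLASS and is
   passed in an SSE register with a DFmode move, the second is
   X86_64_INTEGERSI_CLASS and is passed in a general register with a
   cheaper SImode move, since its upper half is padding.  */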
1745 /* Table of constants used by fldpi, fldln2, etc.... */
1746 static REAL_VALUE_TYPE ext_80387_constants_table [5];
1747 static bool ext_80387_constants_init = 0;
1750 static struct machine_function * ix86_init_machine_status (void);
1751 static rtx ix86_function_value (const_tree, const_tree, bool);
1752 static int ix86_function_regparm (const_tree, const_tree);
1753 static void ix86_compute_frame_layout (struct ix86_frame *);
1754 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
1758 /* The svr4 ABI for the i386 says that records and unions are returned
1759 in memory.  */
1760 #ifndef DEFAULT_PCC_STRUCT_RETURN
1761 #define DEFAULT_PCC_STRUCT_RETURN 1
1764 /* Bit flags that specify the ISA we are compiling for. */
1765 int ix86_isa_flags = TARGET_64BIT_DEFAULT | TARGET_SUBTARGET_ISA_DEFAULT;
1767 /* A mask of ix86_isa_flags that includes bit X if X
1768 was set or cleared on the command line. */
1769 static int ix86_isa_flags_explicit;
1771 /* Define a set of ISAs which aren't available when a given ISA is
1772 disabled.  MMX and SSE ISAs are handled separately. */
1774 #define OPTION_MASK_ISA_MMX_UNSET \
1775 (OPTION_MASK_ISA_3DNOW | OPTION_MASK_ISA_3DNOW_UNSET)
1776 #define OPTION_MASK_ISA_3DNOW_UNSET OPTION_MASK_ISA_3DNOW_A
1778 #define OPTION_MASK_ISA_SSE_UNSET \
1779 (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE2_UNSET)
1780 #define OPTION_MASK_ISA_SSE2_UNSET \
1781 (OPTION_MASK_ISA_SSE3 | OPTION_MASK_ISA_SSE3_UNSET)
1782 #define OPTION_MASK_ISA_SSE3_UNSET \
1783 (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_SSSE3_UNSET)
1784 #define OPTION_MASK_ISA_SSSE3_UNSET \
1785 (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_SSE4_1_UNSET)
1786 #define OPTION_MASK_ISA_SSE4_1_UNSET \
1787 (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_SSE4_2_UNSET)
1788 #define OPTION_MASK_ISA_SSE4_2_UNSET OPTION_MASK_ISA_SSE4A
1790 /* SSE4 includes both SSE4.1 and SSE4.2. -msse4 should be the same
1791 as -msse4.1 -msse4.2.  -mno-sse4 should be the same as -mno-sse4.1. */
1792 #define OPTION_MASK_ISA_SSE4 \
1793 (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_SSE4_2)
1794 #define OPTION_MASK_ISA_SSE4_UNSET OPTION_MASK_ISA_SSE4_1_UNSET
1796 #define OPTION_MASK_ISA_SSE4A_UNSET OPTION_MASK_ISA_SSE4
1798 #define OPTION_MASK_ISA_SSE5_UNSET \
1799 (OPTION_MASK_ISA_3DNOW | OPTION_MASK_ISA_3DNOW_UNSET)
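/* Expanding the chain above shows everything that becomes unavailable
   when a given ISA is turned off; e.g. for -mno-sse2:

     OPTION_MASK_ISA_SSE2_UNSET
       = SSE3 | SSE3_UNSET
       = SSE3 | SSSE3 | SSSE3_UNSET
       = SSE3 | SSSE3 | SSE4_1 | SSE4_1_UNSET
       = SSE3 | SSSE3 | SSE4_1 | SSE4_2 | SSE4_2_UNSET
       = SSE3 | SSSE3 | SSE4_1 | SSE4_2 | SSE4A

   so disabling SSE2 also disables every ISA that implies it.  */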
1801 /* Vectorization library interface and handlers. */
1802 tree (*ix86_veclib_handler)(enum built_in_function, tree, tree) = NULL;
1803 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
1805 /* Implement TARGET_HANDLE_OPTION. */
1808 ix86_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, int value)
1813 ix86_isa_flags_explicit |= OPTION_MASK_ISA_MMX;
1816 ix86_isa_flags &= ~OPTION_MASK_ISA_MMX_UNSET;
1817 ix86_isa_flags_explicit |= OPTION_MASK_ISA_MMX_UNSET;
1822 ix86_isa_flags_explicit |= OPTION_MASK_ISA_3DNOW;
1825 ix86_isa_flags &= ~OPTION_MASK_ISA_3DNOW_UNSET;
1826 ix86_isa_flags_explicit |= OPTION_MASK_ISA_3DNOW_UNSET;
1834 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE;
1837 ix86_isa_flags &= ~OPTION_MASK_ISA_SSE_UNSET;
1838 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE_UNSET;
1843 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE2;
1846 ix86_isa_flags &= ~OPTION_MASK_ISA_SSE2_UNSET;
1847 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE2_UNSET;
1852 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE3;
1855 ix86_isa_flags &= ~OPTION_MASK_ISA_SSE3_UNSET;
1856 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE3_UNSET;
1861 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSSE3;
1864 ix86_isa_flags &= ~OPTION_MASK_ISA_SSSE3_UNSET;
1865 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSSE3_UNSET;
1870 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_1;
1873 ix86_isa_flags &= ~OPTION_MASK_ISA_SSE4_1_UNSET;
1874 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_1_UNSET;
1879 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_2;
1882 ix86_isa_flags &= ~OPTION_MASK_ISA_SSE4_2_UNSET;
1883 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_2_UNSET;
1888 ix86_isa_flags |= OPTION_MASK_ISA_SSE4;
1889 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4;
1893 ix86_isa_flags &= ~OPTION_MASK_ISA_SSE4_UNSET;
1894 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_UNSET;
1898 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4A;
1901 ix86_isa_flags &= ~OPTION_MASK_ISA_SSE4A_UNSET;
1902 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4A_UNSET;
1907 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE5;
1910 ix86_isa_flags &= ~OPTION_MASK_ISA_SSE5_UNSET;
1911 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE5_UNSET;
1920 /* Sometimes certain combinations of command options do not make
1921 sense on a particular target machine. You can define a macro
1922 `OVERRIDE_OPTIONS' to take account of this. This macro, if
1923 defined, is executed once just after all the command options have
1924 been parsed.
1926 Don't use this macro to turn on various extra optimizations for
1927 `-O'. That is what `OPTIMIZATION_OPTIONS' is for. */
1930 override_options (void)
1933 int ix86_tune_defaulted = 0;
1934 int ix86_arch_specified = 0;
1935 unsigned int ix86_arch_mask, ix86_tune_mask;
1937 /* Comes from final.c -- no real reason to change it. */
1938 #define MAX_CODE_ALIGN 16
1942 const struct processor_costs *cost; /* Processor costs */
1943 const int align_loop; /* Default alignments. */
1944 const int align_loop_max_skip;
1945 const int align_jump;
1946 const int align_jump_max_skip;
1947 const int align_func;
1949 const processor_target_table[PROCESSOR_max] =
1951 {&i386_cost, 4, 3, 4, 3, 4},
1952 {&i486_cost, 16, 15, 16, 15, 16},
1953 {&pentium_cost, 16, 7, 16, 7, 16},
1954 {&pentiumpro_cost, 16, 15, 16, 10, 16},
1955 {&geode_cost, 0, 0, 0, 0, 0},
1956 {&k6_cost, 32, 7, 32, 7, 32},
1957 {&athlon_cost, 16, 7, 16, 7, 16},
1958 {&pentium4_cost, 0, 0, 0, 0, 0},
1959 {&k8_cost, 16, 7, 16, 7, 16},
1960 {&nocona_cost, 0, 0, 0, 0, 0},
1961 {&core2_cost, 16, 10, 16, 10, 16},
1962 {&generic32_cost, 16, 7, 16, 7, 16},
1963 {&generic64_cost, 16, 10, 16, 10, 16},
1964 {&amdfam10_cost, 32, 24, 32, 7, 32}
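/* For instance, with -mtune=core2 and no explicit -falign-* options,
   the row {&core2_cost, 16, 10, 16, 10, 16} above yields
   align_loops = 16 (max skip 10), align_jumps = 16 (max skip 10) and
   align_functions = 16; the defaulting itself happens further down in
   this function.  */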
1967 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
1998 PTA_PREFETCH_SSE = 1 << 4,
2000 PTA_3DNOW_A = 1 << 6,
2004 PTA_POPCNT = 1 << 10,
2006 PTA_SSE4A = 1 << 12,
2007 PTA_NO_SAHF = 1 << 13,
2008 PTA_SSE4_1 = 1 << 14,
2009 PTA_SSE4_2 = 1 << 15,
2015 const char *const name; /* processor name or nickname. */
2016 const enum processor_type processor;
2017 const unsigned /*enum pta_flags*/ flags;
2019 const processor_alias_table[] =
2021 {"i386", PROCESSOR_I386, 0},
2022 {"i486", PROCESSOR_I486, 0},
2023 {"i586", PROCESSOR_PENTIUM, 0},
2024 {"pentium", PROCESSOR_PENTIUM, 0},
2025 {"pentium-mmx", PROCESSOR_PENTIUM, PTA_MMX},
2026 {"winchip-c6", PROCESSOR_I486, PTA_MMX},
2027 {"winchip2", PROCESSOR_I486, PTA_MMX | PTA_3DNOW},
2028 {"c3", PROCESSOR_I486, PTA_MMX | PTA_3DNOW},
2029 {"c3-2", PROCESSOR_PENTIUMPRO, PTA_MMX | PTA_SSE},
2030 {"i686", PROCESSOR_PENTIUMPRO, 0},
2031 {"pentiumpro", PROCESSOR_PENTIUMPRO, 0},
2032 {"pentium2", PROCESSOR_PENTIUMPRO, PTA_MMX},
2033 {"pentium3", PROCESSOR_PENTIUMPRO, PTA_MMX | PTA_SSE},
2034 {"pentium3m", PROCESSOR_PENTIUMPRO, PTA_MMX | PTA_SSE},
2035 {"pentium-m", PROCESSOR_PENTIUMPRO, PTA_MMX | PTA_SSE | PTA_SSE2},
2036 {"pentium4", PROCESSOR_PENTIUM4, PTA_MMX |PTA_SSE | PTA_SSE2},
2037 {"pentium4m", PROCESSOR_PENTIUM4, PTA_MMX | PTA_SSE | PTA_SSE2},
2038 {"prescott", PROCESSOR_NOCONA, PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3},
2039 {"nocona", PROCESSOR_NOCONA, (PTA_64BIT
2040 | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2041 | PTA_CX16 | PTA_NO_SAHF)},
2042 {"core2", PROCESSOR_CORE2, (PTA_64BIT
2043 | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2046 {"geode", PROCESSOR_GEODE, (PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2047 |PTA_PREFETCH_SSE)},
2048 {"k6", PROCESSOR_K6, PTA_MMX},
2049 {"k6-2", PROCESSOR_K6, PTA_MMX | PTA_3DNOW},
2050 {"k6-3", PROCESSOR_K6, PTA_MMX | PTA_3DNOW},
2051 {"athlon", PROCESSOR_ATHLON, (PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2052 | PTA_PREFETCH_SSE)},
2053 {"athlon-tbird", PROCESSOR_ATHLON, (PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2054 | PTA_PREFETCH_SSE)},
2055 {"athlon-4", PROCESSOR_ATHLON, (PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2057 {"athlon-xp", PROCESSOR_ATHLON, (PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2059 {"athlon-mp", PROCESSOR_ATHLON, (PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2061 {"x86-64", PROCESSOR_K8, (PTA_64BIT
2062 | PTA_MMX | PTA_SSE | PTA_SSE2
2064 {"k8", PROCESSOR_K8, (PTA_64BIT
2065 | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2066 | PTA_SSE | PTA_SSE2
2068 {"k8-sse3", PROCESSOR_K8, (PTA_64BIT
2069 | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2070 | PTA_SSE | PTA_SSE2 | PTA_SSE3
2072 {"opteron", PROCESSOR_K8, (PTA_64BIT
2073 | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2074 | PTA_SSE | PTA_SSE2
2076 {"opteron-sse3", PROCESSOR_K8, (PTA_64BIT
2077 | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2078 | PTA_SSE | PTA_SSE2 | PTA_SSE3
2080 {"athlon64", PROCESSOR_K8, (PTA_64BIT
2081 | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2082 | PTA_SSE | PTA_SSE2
2084 {"athlon64-sse3", PROCESSOR_K8, (PTA_64BIT
2085 | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2086 | PTA_SSE | PTA_SSE2 | PTA_SSE3
2088 {"athlon-fx", PROCESSOR_K8, (PTA_64BIT
2089 | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2090 | PTA_SSE | PTA_SSE2
2092 {"amdfam10", PROCESSOR_AMDFAM10, (PTA_64BIT
2093 | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2094 | PTA_SSE | PTA_SSE2 | PTA_SSE3
2096 | PTA_CX16 | PTA_ABM)},
2097 {"barcelona", PROCESSOR_AMDFAM10, (PTA_64BIT
2098 | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2099 | PTA_SSE | PTA_SSE2 | PTA_SSE3
2101 | PTA_CX16 | PTA_ABM)},
2102 {"generic32", PROCESSOR_GENERIC32, 0 /* flags are only used for -march switch. */ },
2103 {"generic64", PROCESSOR_GENERIC64, PTA_64BIT /* flags are only used for -march switch. */ },
2106 int const pta_size = ARRAY_SIZE (processor_alias_table);
2108 #ifdef SUBTARGET_OVERRIDE_OPTIONS
2109 SUBTARGET_OVERRIDE_OPTIONS;
2112 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
2113 SUBSUBTARGET_OVERRIDE_OPTIONS;
2116 /* -fPIC is the default for x86_64. */
2117 if (TARGET_MACHO && TARGET_64BIT)
2120 /* Set the default values for switches whose default depends on TARGET_64BIT
2121 in case they weren't overwritten by command line options. */
2124 /* Mach-O doesn't support omitting the frame pointer for now. */
2125 if (flag_omit_frame_pointer == 2)
2126 flag_omit_frame_pointer = (TARGET_MACHO ? 0 : 1);
2127 if (flag_asynchronous_unwind_tables == 2)
2128 flag_asynchronous_unwind_tables = 1;
2129 if (flag_pcc_struct_return == 2)
2130 flag_pcc_struct_return = 0;
2134 if (flag_omit_frame_pointer == 2)
2135 flag_omit_frame_pointer = 0;
2136 if (flag_asynchronous_unwind_tables == 2)
2137 flag_asynchronous_unwind_tables = 0;
2138 if (flag_pcc_struct_return == 2)
2139 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
2142 /* Need to check -mtune=generic first. */
2143 if (ix86_tune_string)
2145 if (!strcmp (ix86_tune_string, "generic")
2146 || !strcmp (ix86_tune_string, "i686")
2147 /* As special support for cross compilers we read -mtune=native
2148 as -mtune=generic. With native compilers we won't see the
2149 -mtune=native, as it was changed by the driver. */
2150 || !strcmp (ix86_tune_string, "native"))
2153 ix86_tune_string = "generic64";
2155 ix86_tune_string = "generic32";
2157 else if (!strncmp (ix86_tune_string, "generic", 7))
2158 error ("bad value (%s) for -mtune= switch", ix86_tune_string);
2162 if (ix86_arch_string)
2163 ix86_tune_string = ix86_arch_string;
2164 if (!ix86_tune_string)
2166 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
2167 ix86_tune_defaulted = 1;
2170 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
2171 need to use a sensible tune option. */
2172 if (!strcmp (ix86_tune_string, "generic")
2173 || !strcmp (ix86_tune_string, "x86-64")
2174 || !strcmp (ix86_tune_string, "i686"))
2177 ix86_tune_string = "generic64";
2179 ix86_tune_string = "generic32";
2182 if (ix86_stringop_string)
2184 if (!strcmp (ix86_stringop_string, "rep_byte"))
2185 stringop_alg = rep_prefix_1_byte;
2186 else if (!strcmp (ix86_stringop_string, "libcall"))
2187 stringop_alg = libcall;
2188 else if (!strcmp (ix86_stringop_string, "rep_4byte"))
2189 stringop_alg = rep_prefix_4_byte;
2190 else if (!strcmp (ix86_stringop_string, "rep_8byte"))
2191 stringop_alg = rep_prefix_8_byte;
2192 else if (!strcmp (ix86_stringop_string, "byte_loop"))
2193 stringop_alg = loop_1_byte;
2194 else if (!strcmp (ix86_stringop_string, "loop"))
2195 stringop_alg = loop;
2196 else if (!strcmp (ix86_stringop_string, "unrolled_loop"))
2197 stringop_alg = unrolled_loop;
2199 error ("bad value (%s) for -mstringop-strategy= switch", ix86_stringop_string);
2201 if (!strcmp (ix86_tune_string, "x86-64"))
2202 warning (OPT_Wdeprecated, "-mtune=x86-64 is deprecated. Use -mtune=k8 or "
2203 "-mtune=generic instead as appropriate.");
2205 if (!ix86_arch_string)
2206 ix86_arch_string = TARGET_64BIT ? "x86-64" : "i386";
2208 ix86_arch_specified = 1;
2210 if (!strcmp (ix86_arch_string, "generic"))
2211 error ("generic CPU can be used only for -mtune= switch");
2212 if (!strncmp (ix86_arch_string, "generic", 7))
2213 error ("bad value (%s) for -march= switch", ix86_arch_string);
2215 if (ix86_cmodel_string != 0)
2217 if (!strcmp (ix86_cmodel_string, "small"))
2218 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
2219 else if (!strcmp (ix86_cmodel_string, "medium"))
2220 ix86_cmodel = flag_pic ? CM_MEDIUM_PIC : CM_MEDIUM;
2221 else if (!strcmp (ix86_cmodel_string, "large"))
2222 ix86_cmodel = flag_pic ? CM_LARGE_PIC : CM_LARGE;
2224 error ("code model %s does not support PIC mode", ix86_cmodel_string);
2225 else if (!strcmp (ix86_cmodel_string, "32"))
2226 ix86_cmodel = CM_32;
2227 else if (!strcmp (ix86_cmodel_string, "kernel") && !flag_pic)
2228 ix86_cmodel = CM_KERNEL;
2230 error ("bad value (%s) for -mcmodel= switch", ix86_cmodel_string);
2234 /* For TARGET_64BIT_MS_ABI, force pic on, in order to enable the
2235 use of rip-relative addressing. This eliminates fixups that
2236 would otherwise be needed if this object is to be placed in a
2237 DLL, and is essentially just as efficient as direct addressing. */
2238 if (TARGET_64BIT_MS_ABI)
2239 ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
2240 else if (TARGET_64BIT)
2241 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
2243 ix86_cmodel = CM_32;
2245 if (ix86_asm_string != 0)
2248 && !strcmp (ix86_asm_string, "intel"))
2249 ix86_asm_dialect = ASM_INTEL;
2250 else if (!strcmp (ix86_asm_string, "att"))
2251 ix86_asm_dialect = ASM_ATT;
2253 error ("bad value (%s) for -masm= switch", ix86_asm_string);
2255 if ((TARGET_64BIT == 0) != (ix86_cmodel == CM_32))
2256 error ("code model %qs not supported in the %s bit mode",
2257 ix86_cmodel_string, TARGET_64BIT ? "64" : "32");
2258 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
2259 sorry ("%i-bit mode not compiled in",
2260 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
2262 for (i = 0; i < pta_size; i++)
2263 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
2265 ix86_arch = processor_alias_table[i].processor;
2266 /* Default cpu tuning to the architecture. */
2267 ix86_tune = ix86_arch;
2269 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
2270 error ("CPU you selected does not support x86-64 "
2273 if (processor_alias_table[i].flags & PTA_MMX
2274 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
2275 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
2276 if (processor_alias_table[i].flags & PTA_3DNOW
2277 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
2278 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
2279 if (processor_alias_table[i].flags & PTA_3DNOW_A
2280 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
2281 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
2282 if (processor_alias_table[i].flags & PTA_SSE
2283 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
2284 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
2285 if (processor_alias_table[i].flags & PTA_SSE2
2286 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
2287 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
2288 if (processor_alias_table[i].flags & PTA_SSE3
2289 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
2290 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
2291 if (processor_alias_table[i].flags & PTA_SSSE3
2292 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
2293 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
2294 if (processor_alias_table[i].flags & PTA_SSE4_1
2295 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
2296 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
2297 if (processor_alias_table[i].flags & PTA_SSE4_2
2298 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
2299 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
2300 if (processor_alias_table[i].flags & PTA_SSE4A
2301 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
2302 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
2303 if (processor_alias_table[i].flags & PTA_SSE5
2304 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE5))
2305 ix86_isa_flags |= OPTION_MASK_ISA_SSE5;
2307 if (processor_alias_table[i].flags & PTA_ABM)
2309 if (processor_alias_table[i].flags & PTA_CX16)
2310 x86_cmpxchg16b = true;
2311 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM))
2313 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
2314 x86_prefetch_sse = true;
2315 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF)))
2322 error ("bad value (%s) for -march= switch", ix86_arch_string);
2324 ix86_arch_mask = 1u << ix86_arch;
2325 for (i = 0; i < X86_ARCH_LAST; ++i)
2326 ix86_arch_features[i] &= ix86_arch_mask;
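/* A worked instance of this masking: X86_ARCH_CMOVE is initialized
   above to ~(m_386 | m_486 | m_PENT | m_K6), so after the AND with
   ix86_arch_mask the entry is nonzero exactly when the selected -march
   CPU has conditional moves, e.g. nonzero for -march=i686
   (PROCESSOR_PENTIUMPRO) and zero for -march=i486.  */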
2328 for (i = 0; i < pta_size; i++)
2329 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
2331 ix86_tune = processor_alias_table[i].processor;
2332 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
2334 if (ix86_tune_defaulted)
2336 ix86_tune_string = "x86-64";
2337 for (i = 0; i < pta_size; i++)
2338 if (! strcmp (ix86_tune_string,
2339 processor_alias_table[i].name))
2341 ix86_tune = processor_alias_table[i].processor;
2344 error ("CPU you selected does not support x86-64 "
2347 /* Intel CPUs have always interpreted SSE prefetch instructions as
2348 NOPs; so, we can enable SSE prefetch instructions even when
2349 -mtune (rather than -march) points us to a processor that has them.
2350 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
2351 higher processors. */
2353 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
2354 x86_prefetch_sse = true;
2358 error ("bad value (%s) for -mtune= switch", ix86_tune_string);
2360 ix86_tune_mask = 1u << ix86_tune;
2361 for (i = 0; i < X86_TUNE_LAST; ++i)
2362 ix86_tune_features[i] &= ix86_tune_mask;
2365 ix86_cost = &size_cost;
2367 ix86_cost = processor_target_table[ix86_tune].cost;
2369 /* Arrange to set up i386_stack_locals for all functions. */
2370 init_machine_status = ix86_init_machine_status;
2372 /* Validate -mregparm= value. */
2373 if (ix86_regparm_string)
2376 warning (0, "-mregparm is ignored in 64-bit mode");
2377 i = atoi (ix86_regparm_string);
2378 if (i < 0 || i > REGPARM_MAX)
2379 error ("-mregparm=%d is not between 0 and %d", i, REGPARM_MAX);
2384 ix86_regparm = REGPARM_MAX;
2386 /* If the user has provided any of the -malign-* options,
2387 warn and use that value only if -falign-* is not set.
2388 Remove this code in GCC 3.2 or later. */
2389 if (ix86_align_loops_string)
2391 warning (0, "-malign-loops is obsolete, use -falign-loops");
2392 if (align_loops == 0)
2394 i = atoi (ix86_align_loops_string);
2395 if (i < 0 || i > MAX_CODE_ALIGN)
2396 error ("-malign-loops=%d is not between 0 and %d", i, MAX_CODE_ALIGN);
2398 align_loops = 1 << i;
2402 if (ix86_align_jumps_string)
2404 warning (0, "-malign-jumps is obsolete, use -falign-jumps");
2405 if (align_jumps == 0)
2407 i = atoi (ix86_align_jumps_string);
2408 if (i < 0 || i > MAX_CODE_ALIGN)
2409 error ("-malign-loops=%d is not between 0 and %d", i, MAX_CODE_ALIGN);
2411 align_jumps = 1 << i;
2415 if (ix86_align_funcs_string)
2417 warning (0, "-malign-functions is obsolete, use -falign-functions");
2418 if (align_functions == 0)
2420 i = atoi (ix86_align_funcs_string);
2421 if (i < 0 || i > MAX_CODE_ALIGN)
2422 error ("-malign-loops=%d is not between 0 and %d", i, MAX_CODE_ALIGN);
2424 align_functions = 1 << i;
2428 /* Default align_* from the processor table. */
2429 if (align_loops == 0)
2431 align_loops = processor_target_table[ix86_tune].align_loop;
2432 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
2434 if (align_jumps == 0)
2436 align_jumps = processor_target_table[ix86_tune].align_jump;
2437 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
2439 if (align_functions == 0)
2441 align_functions = processor_target_table[ix86_tune].align_func;
2444 /* Validate -mbranch-cost= value, or provide default. */
2445 ix86_branch_cost = ix86_cost->branch_cost;
2446 if (ix86_branch_cost_string)
2448 i = atoi (ix86_branch_cost_string);
2450 error ("-mbranch-cost=%d is not between 0 and 5", i);
2452 ix86_branch_cost = i;
2454 if (ix86_section_threshold_string)
2456 i = atoi (ix86_section_threshold_string);
2458 error ("-mlarge-data-threshold=%d is negative", i);
2460 ix86_section_threshold = i;
2463 if (ix86_tls_dialect_string)
2465 if (strcmp (ix86_tls_dialect_string, "gnu") == 0)
2466 ix86_tls_dialect = TLS_DIALECT_GNU;
2467 else if (strcmp (ix86_tls_dialect_string, "gnu2") == 0)
2468 ix86_tls_dialect = TLS_DIALECT_GNU2;
2469 else if (strcmp (ix86_tls_dialect_string, "sun") == 0)
2470 ix86_tls_dialect = TLS_DIALECT_SUN;
2472 error ("bad value (%s) for -mtls-dialect= switch",
2473 ix86_tls_dialect_string);
2476 if (ix87_precision_string)
2478 i = atoi (ix87_precision_string);
2479 if (i != 32 && i != 64 && i != 80)
2480 error ("pc%d is not valid precision setting (32, 64 or 80)", i);
2485 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
2487 /* Enable by default the SSE and MMX builtins. Do allow the user to
2488 explicitly disable any of these. In particular, disabling SSE and
2489 MMX for kernel code is extremely useful. */
2490 if (!ix86_arch_specified)
2492 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
2493 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
2496 warning (0, "-mrtd is ignored in 64-bit mode");
2500 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
2502 if (!ix86_arch_specified)
2504 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
2506 /* The i386 ABI does not specify a red zone.  It still makes sense to use one
2507 when the programmer takes care to keep the stack from being destroyed. */
2508 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
2509 target_flags |= MASK_NO_RED_ZONE;
2512 /* Keep nonleaf frame pointers. */
2513 if (flag_omit_frame_pointer)
2514 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
2515 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
2516 flag_omit_frame_pointer = 1;
2518 /* If we're doing fast math, we don't care about comparison order
2519 wrt NaNs. This lets us use a shorter comparison sequence. */
2520 if (flag_finite_math_only)
2521 target_flags &= ~MASK_IEEE_FP;
2523 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
2524 since the insns won't need emulation. */
2525 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
2526 target_flags &= ~MASK_NO_FANCY_MATH_387;
2528 /* Likewise, if the target doesn't have a 387, or we've specified
2529 software floating point, don't use 387 inline intrinsics. */
2531 target_flags |= MASK_NO_FANCY_MATH_387;
2533 /* Turn on SSE4A builtins for -msse5. */
2535 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
2537 /* Turn on SSE4.1 builtins for -msse4.2. */
2539 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
2541 /* Turn on SSSE3 builtins for -msse4.1. */
2543 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
2545 /* Turn on SSE3 builtins for -mssse3. */
2547 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
2549 /* Turn on SSE3 builtins for -msse4a. */
2551 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
2553 /* Turn on SSE2 builtins for -msse3. */
2555 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
2557 /* Turn on SSE builtins for -msse2. */
2559 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
2561 /* Turn on MMX builtins for -msse. */
2564 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
2565 x86_prefetch_sse = true;
2568 /* Turn on MMX builtins for 3Dnow. */
2570 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
2572 /* Turn on popcnt instruction for -msse4.2 or -mabm. */
2573 if (TARGET_SSE4_2 || TARGET_ABM)
2576 /* Validate -mpreferred-stack-boundary= value, or provide default.
2577 The default of 128 bits is for Pentium III's SSE __m128.  We can't
2578 lower it even for optimize_size: otherwise object files compiled
2579 with -Os and -On could not be mixed. */
2580 ix86_preferred_stack_boundary = 128;
2581 if (ix86_preferred_stack_boundary_string)
2583 i = atoi (ix86_preferred_stack_boundary_string);
2584 if (i < (TARGET_64BIT ? 4 : 2) || i > 12)
2585 error ("-mpreferred-stack-boundary=%d is not between %d and 12", i,
2586 TARGET_64BIT ? 4 : 2);
2588 ix86_preferred_stack_boundary = (1 << i) * BITS_PER_UNIT;
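/* Worked example: -mpreferred-stack-boundary=4 yields
   (1 << 4) * 8 = 128 bits, i.e. the default 16-byte alignment, while
   the 32-bit minimum of 2 yields only (1 << 2) * 8 = 32 bits.  */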
2591 /* Accept -msseregparm only if at least SSE support is enabled. */
2592 if (TARGET_SSEREGPARM
2594 error ("-msseregparm used without SSE enabled");
2596 ix86_fpmath = TARGET_FPMATH_DEFAULT;
2597 if (ix86_fpmath_string != 0)
2599 if (! strcmp (ix86_fpmath_string, "387"))
2600 ix86_fpmath = FPMATH_387;
2601 else if (! strcmp (ix86_fpmath_string, "sse"))
2605 warning (0, "SSE instruction set disabled, using 387 arithmetics");
2606 ix86_fpmath = FPMATH_387;
2609 ix86_fpmath = FPMATH_SSE;
2611 else if (! strcmp (ix86_fpmath_string, "387,sse")
2612 || ! strcmp (ix86_fpmath_string, "sse,387"))
2616 warning (0, "SSE instruction set disabled, using 387 arithmetics");
2617 ix86_fpmath = FPMATH_387;
2619 else if (!TARGET_80387)
2621 warning (0, "387 instruction set disabled, using SSE arithmetics");
2622 ix86_fpmath = FPMATH_SSE;
2625 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
2628 error ("bad value (%s) for -mfpmath= switch", ix86_fpmath_string);
2631 /* If the i387 is disabled, then do not return values in it. */
2633 target_flags &= ~MASK_FLOAT_RETURNS;
2635 /* Use external vectorized library in vectorizing intrinsics. */
2636 if (ix86_veclibabi_string)
2638 if (strcmp (ix86_veclibabi_string, "acml") == 0)
2639 ix86_veclib_handler = ix86_veclibabi_acml;
2641 error ("unknown vectorization library ABI type (%s) for "
2642 "-mveclibabi= switch", ix86_veclibabi_string);
2645 if ((x86_accumulate_outgoing_args & ix86_tune_mask)
2646 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
2648 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
2650 /* ??? Unwind info is not correct around the CFG unless either a frame
2651 pointer is present or M_A_O_A is set. Fixing this requires rewriting
2652 unwind info generation to be aware of the CFG and propagating states
2653 around edges.  */
2654 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
2655 || flag_exceptions || flag_non_call_exceptions)
2656 && flag_omit_frame_pointer
2657 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
2659 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
2660 warning (0, "unwind tables currently require either a frame pointer "
2661 "or -maccumulate-outgoing-args for correctness");
2662 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
2665 /* For sane SSE instruction set generation we need the fcomi instruction.
2666 It is safe to enable all CMOVE instructions. */
2670 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
2673 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
2674 p = strchr (internal_label_prefix, 'X');
2675 internal_label_prefix_len = p - internal_label_prefix;
2679 /* When the scheduling description is not available, disable the scheduler
2680 pass so it won't slow down the compilation and make x87 code slower. */
2681 if (!TARGET_SCHEDULE)
2682 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
2684 if (!PARAM_SET_P (PARAM_SIMULTANEOUS_PREFETCHES))
2685 set_param_value ("simultaneous-prefetches",
2686 ix86_cost->simultaneous_prefetches);
2687 if (!PARAM_SET_P (PARAM_L1_CACHE_LINE_SIZE))
2688 set_param_value ("l1-cache-line-size", ix86_cost->prefetch_block);
2689 if (!PARAM_SET_P (PARAM_L1_CACHE_SIZE))
2690 set_param_value ("l1-cache-size", ix86_cost->l1_cache_size);
2691 if (!PARAM_SET_P (PARAM_L2_CACHE_SIZE))
2692 set_param_value ("l2-cache-size", ix86_cost->l2_cache_size);
2694 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
2695 can be optimized to ap = __builtin_next_arg (0). */
2696 if (!TARGET_64BIT || TARGET_64BIT_MS_ABI)
2697 targetm.expand_builtin_va_start = NULL;
2700 /* Return true if this goes in large data/bss. */
2703 ix86_in_large_data_p (tree exp)
2705 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
2708 /* Functions are never large data. */
2709 if (TREE_CODE (exp) == FUNCTION_DECL)
2712 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
2714 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
2715 if (strcmp (section, ".ldata") == 0
2716 || strcmp (section, ".lbss") == 0)
2722 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
2724 /* If this is an incomplete type with size 0, then we can't put it
2725 in data because it might be too big when completed. */
2726 if (!size || size > ix86_section_threshold)
2733 /* Switch to the appropriate section for output of DECL.
2734 DECL is either a `VAR_DECL' node or a constant of some sort.
2735 RELOC indicates whether forming the initial value of DECL requires
2736 link-time relocations. */
2738 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
2739 ATTRIBUTE_UNUSED;
2741 static section *
2742 x86_64_elf_select_section (tree decl, int reloc,
2743 unsigned HOST_WIDE_INT align)
2745 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
2746 && ix86_in_large_data_p (decl))
2748 const char *sname = NULL;
2749 unsigned int flags = SECTION_WRITE;
2750 switch (categorize_decl_for_section (decl, reloc))
2755 case SECCAT_DATA_REL:
2756 sname = ".ldata.rel";
2758 case SECCAT_DATA_REL_LOCAL:
2759 sname = ".ldata.rel.local";
2761 case SECCAT_DATA_REL_RO:
2762 sname = ".ldata.rel.ro";
2764 case SECCAT_DATA_REL_RO_LOCAL:
2765 sname = ".ldata.rel.ro.local";
2769 flags |= SECTION_BSS;
2772 case SECCAT_RODATA_MERGE_STR:
2773 case SECCAT_RODATA_MERGE_STR_INIT:
2774 case SECCAT_RODATA_MERGE_CONST:
2778 case SECCAT_SRODATA:
2785 /* We don't split these for the medium model.  Place them into
2786 default sections and hope for the best. */
2791 /* We might get called with string constants, but get_named_section
2792 doesn't like them as they are not DECLs. Also, we need to set
2793 flags in that case. */
2795 return get_section (sname, flags, NULL);
2796 return get_named_section (decl, sname, reloc);
2799 return default_elf_select_section (decl, reloc, align);
2802 /* Build up a unique section name, expressed as a
2803 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
2804 RELOC indicates whether the initial value of EXP requires
2805 link-time relocations. */
2807 static void ATTRIBUTE_UNUSED
2808 x86_64_elf_unique_section (tree decl, int reloc)
2810 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
2811 && ix86_in_large_data_p (decl))
2813 const char *prefix = NULL;
2814 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
2815 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
2817 switch (categorize_decl_for_section (decl, reloc))
2820 case SECCAT_DATA_REL:
2821 case SECCAT_DATA_REL_LOCAL:
2822 case SECCAT_DATA_REL_RO:
2823 case SECCAT_DATA_REL_RO_LOCAL:
2824 prefix = one_only ? ".gnu.linkonce.ld." : ".ldata.";
2827 prefix = one_only ? ".gnu.linkonce.lb." : ".lbss.";
2830 case SECCAT_RODATA_MERGE_STR:
2831 case SECCAT_RODATA_MERGE_STR_INIT:
2832 case SECCAT_RODATA_MERGE_CONST:
2833 prefix = one_only ? ".gnu.linkonce.lr." : ".lrodata.";
2835 case SECCAT_SRODATA:
2842 /* We don't split these for the medium model.  Place them into
2843 default sections and hope for the best. */
2851 plen = strlen (prefix);
2853 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
2854 name = targetm.strip_name_encoding (name);
2855 nlen = strlen (name);
2857 string = (char *) alloca (nlen + plen + 1);
2858 memcpy (string, prefix, plen);
2859 memcpy (string + plen, name, nlen + 1);
2861 DECL_SECTION_NAME (decl) = build_string (nlen + plen, string);
2865 default_unique_section (decl, reloc);
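/* Net effect, with a hypothetical variable `big_table' classified as
   SECCAT_DATA_REL: it is placed in section ".ldata.big_table", or
   ".gnu.linkonce.ld.big_table" when it must be one-only and COMDAT
   groups are unavailable.  */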
2868 #ifdef COMMON_ASM_OP
2869 /* This says how to output assembler code to declare an
2870 uninitialized external linkage data object.
2872 For medium model x86-64 we need to use the .largecomm pseudo-op for
2873 large objects.  */
2874 void
2875 x86_elf_aligned_common (FILE *file,
2876 const char *name, unsigned HOST_WIDE_INT size,
2879 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
2880 && size > (unsigned int)ix86_section_threshold)
2881 fprintf (file, ".largecomm\t");
2883 fprintf (file, "%s", COMMON_ASM_OP);
2884 assemble_name (file, name);
2885 fprintf (file, ","HOST_WIDE_INT_PRINT_UNSIGNED",%u\n",
2886 size, align / BITS_PER_UNIT);
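/* With a hypothetical 128 KiB object `big_arr' aligned to 32 bytes in
   the medium model, the directive emitted here is

     .largecomm	big_arr,131072,32

   while objects at or below the threshold keep the ordinary
   COMMON_ASM_OP form, e.g. ".comm	small_arr,64,8".  */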
2890 /* Utility function for targets to use in implementing
2891 ASM_OUTPUT_ALIGNED_BSS. */
2894 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
2895 const char *name, unsigned HOST_WIDE_INT size,
2898 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
2899 && size > (unsigned int)ix86_section_threshold)
2900 switch_to_section (get_named_section (decl, ".lbss", 0));
2902 switch_to_section (bss_section);
2903 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
2904 #ifdef ASM_DECLARE_OBJECT_NAME
2905 last_assemble_variable_decl = decl;
2906 ASM_DECLARE_OBJECT_NAME (file, name, decl);
2908 /* The standard thing is to just output a label for the object. */
2909 ASM_OUTPUT_LABEL (file, name);
2910 #endif /* ASM_DECLARE_OBJECT_NAME */
2911 ASM_OUTPUT_SKIP (file, size ? size : 1);
2915 optimization_options (int level, int size ATTRIBUTE_UNUSED)
2917 /* For -O2 and beyond, turn off -fschedule-insns by default.  It tends to
2918 make the problem of having too few registers even worse. */
2919 #ifdef INSN_SCHEDULING
2921 flag_schedule_insns = 0;
2925 /* The Darwin libraries never set errno, so we might as well
2926 avoid calling them when that's the only reason we would. */
2927 flag_errno_math = 0;
2929 /* The default values of these switches depend on TARGET_64BIT,
2930 which is not known at this moment.  Mark these values with 2 and
2931 let the user override them.  In case there is no command line option
2932 specifying them, we will set the defaults in override_options. */
2934 flag_omit_frame_pointer = 2;
2935 flag_pcc_struct_return = 2;
2936 flag_asynchronous_unwind_tables = 2;
2937 flag_vect_cost_model = 1;
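/* For instance, flag_omit_frame_pointer keeps the sentinel value 2,
   meaning "not set on the command line", until override_options above
   replaces it with the TARGET_64BIT-dependent default.  */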
2938 #ifdef SUBTARGET_OPTIMIZATION_OPTIONS
2939 SUBTARGET_OPTIMIZATION_OPTIONS;
2943 /* Decide whether we can make a sibling call to a function. DECL is the
2944 declaration of the function being targeted by the call and EXP is the
2945 CALL_EXPR representing the call. */
2948 ix86_function_ok_for_sibcall (tree decl, tree exp)
2953 /* If we are generating position-independent code, we cannot sibcall
2954 optimize any indirect call, or a direct call to a global function,
2955 as the PLT requires %ebx be live. */
2956 if (!TARGET_64BIT && flag_pic && (!decl || !targetm.binds_local_p (decl)))
2963 func = TREE_TYPE (CALL_EXPR_FN (exp));
2964 if (POINTER_TYPE_P (func))
2965 func = TREE_TYPE (func);
2968 /* Check that the return value locations are the same.  For example,
2969 if we are returning floats on the 80387 register stack, we cannot
2970 make a sibcall from a function that doesn't return a float to a
2971 function that does or, conversely, from a function that does return
2972 a float to a function that doesn't; the necessary stack adjustment
2973 would not be executed. This is also the place we notice
2974 differences in the return value ABI. Note that it is ok for one
2975 of the functions to have void return type as long as the return
2976 value of the other is passed in a register. */
2977 a = ix86_function_value (TREE_TYPE (exp), func, false);
2978 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
2980 if (STACK_REG_P (a) || STACK_REG_P (b))
2982 if (!rtx_equal_p (a, b))
2985 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
2987 else if (!rtx_equal_p (a, b))
2990 /* If this call is indirect, we'll need to be able to use a call-clobbered
2991 register for the address of the target function. Make sure that all
2992 such registers are not used for passing parameters. */
2993 if (!decl && !TARGET_64BIT)
2997 /* We're looking at the CALL_EXPR, we need the type of the function. */
2998 type = CALL_EXPR_FN (exp); /* pointer expression */
2999 type = TREE_TYPE (type); /* pointer type */
3000 type = TREE_TYPE (type); /* function type */
3002 if (ix86_function_regparm (type, NULL) >= 3)
3004 /* ??? Need to count the actual number of registers to be used,
3005 not the possible number of registers. Fix later. */
3010 /* Dllimport'd functions are also called indirectly. */
3011 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
3012 && decl && DECL_DLLIMPORT_P (decl)
3013 && ix86_function_regparm (TREE_TYPE (decl), NULL) >= 3)
3016 /* If we force-aligned the stack, then sibcalling would unalign the
3017 stack, which may break the called function. */
3018 if (cfun->machine->force_align_arg_pointer)
3021 /* Otherwise okay. That also includes certain types of indirect calls. */
3025 /* Handle "cdecl", "stdcall", "fastcall", "regparm" and "sseregparm"
3026 calling convention attributes;
3027 arguments as in struct attribute_spec.handler. */
3030 ix86_handle_cconv_attribute (tree *node, tree name,
3032 int flags ATTRIBUTE_UNUSED,
3035 if (TREE_CODE (*node) != FUNCTION_TYPE
3036 && TREE_CODE (*node) != METHOD_TYPE
3037 && TREE_CODE (*node) != FIELD_DECL
3038 && TREE_CODE (*node) != TYPE_DECL)
3040 warning (OPT_Wattributes, "%qs attribute only applies to functions",
3041 IDENTIFIER_POINTER (name));
3042 *no_add_attrs = true;
3046 /* Can combine regparm with all attributes but fastcall. */
3047 if (is_attribute_p ("regparm", name))
3051 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
3053 error ("fastcall and regparm attributes are not compatible");
3056 cst = TREE_VALUE (args);
3057 if (TREE_CODE (cst) != INTEGER_CST)
3059 warning (OPT_Wattributes,
3060 "%qs attribute requires an integer constant argument",
3061 IDENTIFIER_POINTER (name));
3062 *no_add_attrs = true;
3064 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
3066 warning (OPT_Wattributes, "argument to %qs attribute larger than %d",
3067 IDENTIFIER_POINTER (name), REGPARM_MAX);
3068 *no_add_attrs = true;
3072 && lookup_attribute (ix86_force_align_arg_pointer_string,
3073 TYPE_ATTRIBUTES (*node))
3074 && compare_tree_int (cst, REGPARM_MAX-1))
3076 error ("%s functions limited to %d register parameters",
3077 ix86_force_align_arg_pointer_string, REGPARM_MAX-1);
3085 /* Do not warn when emulating the MS ABI. */
3086 if (!TARGET_64BIT_MS_ABI)
3087 warning (OPT_Wattributes, "%qs attribute ignored",
3088 IDENTIFIER_POINTER (name));
3089 *no_add_attrs = true;
3093 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
3094 if (is_attribute_p ("fastcall", name))
3096 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
3098 error ("fastcall and cdecl attributes are not compatible");
3100 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
3102 error ("fastcall and stdcall attributes are not compatible");
3104 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
3106 error ("fastcall and regparm attributes are not compatible");
3110 /* Can combine stdcall with fastcall (redundant), regparm and
3111 sseregparm. */
3112 else if (is_attribute_p ("stdcall", name))
3114 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
3116 error ("stdcall and cdecl attributes are not compatible");
3118 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
3120 error ("stdcall and fastcall attributes are not compatible");
3124 /* Can combine cdecl with regparm and sseregparm. */
3125 else if (is_attribute_p ("cdecl", name))
3127 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
3129 error ("stdcall and cdecl attributes are not compatible");
3131 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
3133 error ("fastcall and cdecl attributes are not compatible");
3137 /* Can combine sseregparm with all attributes. */
3142 /* Return 0 if the attributes for two types are incompatible, 1 if they
3143 are compatible, and 2 if they are nearly compatible (which causes a
3144 warning to be generated). */
3147 ix86_comp_type_attributes (const_tree type1, const_tree type2)
3149 /* Check for mismatch of non-default calling convention. */
3150 const char *const rtdstr = TARGET_RTD ? "cdecl" : "stdcall";
3152 if (TREE_CODE (type1) != FUNCTION_TYPE
3153 && TREE_CODE (type1) != METHOD_TYPE)
3156 /* Check for mismatched fastcall/regparm types. */
3157 if ((!lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type1))
3158 != !lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type2)))
3159 || (ix86_function_regparm (type1, NULL)
3160 != ix86_function_regparm (type2, NULL)))
3163 /* Check for mismatched sseregparm types. */
3164 if (!lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type1))
3165 != !lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type2)))
3168 /* Check for mismatched return types (cdecl vs stdcall). */
3169 if (!lookup_attribute (rtdstr, TYPE_ATTRIBUTES (type1))
3170 != !lookup_attribute (rtdstr, TYPE_ATTRIBUTES (type2)))
3176 /* Return the regparm value for a function with the indicated TYPE and DECL.
3177 DECL may be NULL when calling the function indirectly
3178 or considering a libcall. */
3181 ix86_function_regparm (const_tree type, const_tree decl)
3184 int regparm = ix86_regparm;
3189 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
3191 return TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
3193 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type)))
3196 /* Use register calling convention for local functions when possible. */
3197 if (decl && TREE_CODE (decl) == FUNCTION_DECL
3198 && flag_unit_at_a_time && !profile_flag)
3200 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
3201 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
3204 int local_regparm, globals = 0, regno;
3207 /* Make sure no regparm register is taken by a
3208 fixed register variable. */
3209 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
3210 if (fixed_regs[local_regparm])
3213 /* We can't use regparm(3) for nested functions as these use the
3214 static chain pointer in the third argument. */
3215 if (local_regparm == 3
3216 && (decl_function_context (decl)
3217 || ix86_force_align_arg_pointer)
3218 && !DECL_NO_STATIC_CHAIN (decl))
3221 /* If the function realigns its stack pointer, the prologue will
3222 clobber %ecx. If we've already generated code for the callee,
3223 the callee DECL_STRUCT_FUNCTION is gone, so we fall back to
3224 scanning the attributes for the self-realigning property. */
3225 f = DECL_STRUCT_FUNCTION (decl);
3226 if (local_regparm == 3
3227 && (f ? !!f->machine->force_align_arg_pointer
3228 : !!lookup_attribute (ix86_force_align_arg_pointer_string,
3229 TYPE_ATTRIBUTES (TREE_TYPE (decl)))))
3232 /* Each fixed register usage increases register pressure,
3233 so fewer registers should be used for argument passing.
3234 This functionality can be overridden by an explicit
3235 regparm value. */
3236 for (regno = 0; regno <= DI_REG; regno++)
3237 if (fixed_regs[regno])
3241 = globals < local_regparm ? local_regparm - globals : 0;
3243 if (local_regparm > regparm)
3244 regparm = local_regparm;
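/* An example of the convention computed above (hypothetical
   declaration, kept out of the build): with regparm(3), the three
   integer arguments arrive in %eax, %edx and %ecx.  */
#if 0
int __attribute__ ((regparm (3))) madd_example (int a, int b, int c);
#endif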
3251 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
3252 DFmode (2) arguments in SSE registers for a function with the
3253 indicated TYPE and DECL.  DECL may be NULL when calling the function
3254 indirectly or considering a libcall.  Otherwise return 0. */
3257 ix86_function_sseregparm (const_tree type, const_tree decl)
3259 gcc_assert (!TARGET_64BIT);
3261 /* Use SSE registers to pass SFmode and DFmode arguments if requested
3262 by the sseregparm attribute. */
3263 if (TARGET_SSEREGPARM
3264 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
3269 error ("Calling %qD with attribute sseregparm without "
3270 "SSE/SSE2 enabled", decl);
3272 error ("Calling %qT with attribute sseregparm without "
3273 "SSE/SSE2 enabled", type);
3280 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
3281 (and DFmode for SSE2) arguments in SSE registers. */
3282 if (decl && TARGET_SSE_MATH && flag_unit_at_a_time && !profile_flag)
3284 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
3285 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
3287 return TARGET_SSE2 ? 2 : 1;
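/* Illustrative counterpart for the SSE case (hypothetical declaration,
   kept out of the build): with SSE2 enabled this attribute makes `x'
   arrive in %xmm0 instead of on the 80387 stack.  */
#if 0
double __attribute__ ((sseregparm)) scale_example (double x);
#endif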
3293 /* Return true if EAX is live at the start of the function. Used by
3294 ix86_expand_prologue to determine if we need special help before
3295 calling allocate_stack_worker. */
3298 ix86_eax_live_at_start_p (void)
3300 /* Cheat. Don't bother working forward from ix86_function_regparm
3301 to the function type to whether an actual argument is located in
3302 eax. Instead just look at cfg info, which is still close enough
3303 to correct at this point. This gives false positives for broken