/* Subroutines used for code generation on IA-32.
   Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
   2002, 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "coretypes.h"
#include "hard-reg-set.h"
#include "insn-config.h"
#include "conditions.h"
#include "insn-codes.h"
#include "insn-attr.h"
#include "basic-block.h"
#include "target-def.h"
#include "langhooks.h"
#include "tree-gimple.h"
#include "tm-constrs.h"
static int x86_builtin_vectorization_cost (bool);
#ifndef CHECK_STACK_LIMIT
#define CHECK_STACK_LIMIT (-1)
#endif
/* Return index of given mode in mult and division cost tables.  */
#define MODE_INDEX(mode) \
  ((mode) == QImode ? 0 \
   : (mode) == HImode ? 1 \
   : (mode) == SImode ? 2 \
   : (mode) == DImode ? 3 \
   : 4)
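/* For example, MODE_INDEX (SImode) is 2, so the SImode entry of each
   five-element multiply/divide cost array below sits at index 2, and
   index 4 catches every wider "other" mode.  */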
/* Processor costs (relative to an add) */
/* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes.  */
#define COSTS_N_BYTES(N) ((N) * 2)
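/* Under that assumption the two scales agree for an add: an add is 2
   bytes, so COSTS_N_BYTES (2) == 4 == COSTS_N_INSNS (1), which keeps the
   size-based costs in size_cost comparable with the cycle-based costs in
   the other processor_costs tables.  */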
#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
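/* Each stringop_algs initializer below pairs a fallback algorithm for
   unknown block sizes with a list of {max, alg} entries that apply while
   the known size is at most MAX; a max of -1 terminates the list and
   covers all larger sizes (the exact struct layout is declared in
   i386.h).  So {libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}
   reads: unknown size -> libcall, size <= 256 -> rep movsl, anything
   bigger -> libcall.  */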
static const
struct processor_costs size_cost = {	/* costs for tuning for size */
  COSTS_N_BYTES (2),			/* cost of an add instruction */
  COSTS_N_BYTES (3),			/* cost of a lea instruction */
  COSTS_N_BYTES (2),			/* variable shift costs */
  COSTS_N_BYTES (3),			/* constant shift costs */
  {COSTS_N_BYTES (3),			/* cost of starting multiply for QI */
   COSTS_N_BYTES (3),			/* HI */
   COSTS_N_BYTES (3),			/* SI */
   COSTS_N_BYTES (3),			/* DI */
   COSTS_N_BYTES (5)},			/* other */
  0,					/* cost of multiply per each bit set */
  {COSTS_N_BYTES (3),			/* cost of a divide/mod for QI */
   COSTS_N_BYTES (3),			/* HI */
   COSTS_N_BYTES (3),			/* SI */
   COSTS_N_BYTES (3),			/* DI */
   COSTS_N_BYTES (5)},			/* other */
  COSTS_N_BYTES (3),			/* cost of movsx */
  COSTS_N_BYTES (3),			/* cost of movzx */
  2,					/* cost for loading QImode using movzbl */
  {2, 2, 2},				/* cost of loading integer registers
					   in QImode, HImode and SImode.
					   Relative to reg-reg move (2).  */
  {2, 2, 2},				/* cost of storing integer registers */
  2,					/* cost of reg,reg fld/fst */
  {2, 2, 2},				/* cost of loading fp registers
					   in SFmode, DFmode and XFmode */
  {2, 2, 2},				/* cost of storing fp registers
					   in SFmode, DFmode and XFmode */
  3,					/* cost of moving MMX register */
  {3, 3},				/* cost of loading MMX registers
					   in SImode and DImode */
  {3, 3},				/* cost of storing MMX registers
					   in SImode and DImode */
  3,					/* cost of moving SSE register */
  {3, 3, 3},				/* cost of loading SSE registers
					   in SImode, DImode and TImode */
  {3, 3, 3},				/* cost of storing SSE registers
					   in SImode, DImode and TImode */
  3,					/* MMX or SSE register to integer */
  0,					/* size of l1 cache */
  0,					/* size of l2 cache */
  0,					/* size of prefetch block */
  0,					/* number of parallel prefetches */
  COSTS_N_BYTES (2),			/* cost of FADD and FSUB insns. */
  COSTS_N_BYTES (2),			/* cost of FMUL instruction. */
  COSTS_N_BYTES (2),			/* cost of FDIV instruction. */
  COSTS_N_BYTES (2),			/* cost of FABS instruction. */
  COSTS_N_BYTES (2),			/* cost of FCHS instruction. */
  COSTS_N_BYTES (2),			/* cost of FSQRT instruction. */
  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
  1,					/* scalar_stmt_cost. */
  1,					/* scalar load_cost. */
  1,					/* scalar_store_cost. */
  1,					/* vec_stmt_cost. */
  1,					/* vec_to_scalar_cost. */
  1,					/* scalar_to_vec_cost. */
  1,					/* vec_align_load_cost. */
  1,					/* vec_unalign_load_cost. */
  1,					/* vec_store_cost. */
  1,					/* cond_taken_branch_cost. */
  1,					/* cond_not_taken_branch_cost. */
};
/* Processor costs (relative to an add) */
static const
struct processor_costs i386_cost = {	/* 386 specific costs */
  COSTS_N_INSNS (1),			/* cost of an add instruction */
  COSTS_N_INSNS (1),			/* cost of a lea instruction */
  COSTS_N_INSNS (3),			/* variable shift costs */
  COSTS_N_INSNS (2),			/* constant shift costs */
  {COSTS_N_INSNS (6),			/* cost of starting multiply for QI */
   COSTS_N_INSNS (6),			/* HI */
   COSTS_N_INSNS (6),			/* SI */
   COSTS_N_INSNS (6),			/* DI */
   COSTS_N_INSNS (6)},			/* other */
  COSTS_N_INSNS (1),			/* cost of multiply per each bit set */
  {COSTS_N_INSNS (23),			/* cost of a divide/mod for QI */
   COSTS_N_INSNS (23),			/* HI */
   COSTS_N_INSNS (23),			/* SI */
   COSTS_N_INSNS (23),			/* DI */
   COSTS_N_INSNS (23)},			/* other */
  COSTS_N_INSNS (3),			/* cost of movsx */
  COSTS_N_INSNS (2),			/* cost of movzx */
  15,					/* "large" insn */
  4,					/* cost for loading QImode using movzbl */
  {2, 4, 2},				/* cost of loading integer registers
					   in QImode, HImode and SImode.
					   Relative to reg-reg move (2).  */
  {2, 4, 2},				/* cost of storing integer registers */
  2,					/* cost of reg,reg fld/fst */
  {8, 8, 8},				/* cost of loading fp registers
					   in SFmode, DFmode and XFmode */
  {8, 8, 8},				/* cost of storing fp registers
					   in SFmode, DFmode and XFmode */
  2,					/* cost of moving MMX register */
  {4, 8},				/* cost of loading MMX registers
					   in SImode and DImode */
  {4, 8},				/* cost of storing MMX registers
					   in SImode and DImode */
  2,					/* cost of moving SSE register */
  {4, 8, 16},				/* cost of loading SSE registers
					   in SImode, DImode and TImode */
  {4, 8, 16},				/* cost of storing SSE registers
					   in SImode, DImode and TImode */
  3,					/* MMX or SSE register to integer */
  0,					/* size of l1 cache */
  0,					/* size of l2 cache */
  0,					/* size of prefetch block */
  0,					/* number of parallel prefetches */
  COSTS_N_INSNS (23),			/* cost of FADD and FSUB insns. */
  COSTS_N_INSNS (27),			/* cost of FMUL instruction. */
  COSTS_N_INSNS (88),			/* cost of FDIV instruction. */
  COSTS_N_INSNS (22),			/* cost of FABS instruction. */
  COSTS_N_INSNS (24),			/* cost of FCHS instruction. */
  COSTS_N_INSNS (122),			/* cost of FSQRT instruction. */
  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
   DUMMY_STRINGOP_ALGS},
  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
   DUMMY_STRINGOP_ALGS},
  1,					/* scalar_stmt_cost. */
  1,					/* scalar load_cost. */
  1,					/* scalar_store_cost. */
  1,					/* vec_stmt_cost. */
  1,					/* vec_to_scalar_cost. */
  1,					/* scalar_to_vec_cost. */
  1,					/* vec_align_load_cost. */
  2,					/* vec_unalign_load_cost. */
  1,					/* vec_store_cost. */
  3,					/* cond_taken_branch_cost. */
  1,					/* cond_not_taken_branch_cost. */
};
static const
struct processor_costs i486_cost = {	/* 486 specific costs */
  COSTS_N_INSNS (1),			/* cost of an add instruction */
  COSTS_N_INSNS (1),			/* cost of a lea instruction */
  COSTS_N_INSNS (3),			/* variable shift costs */
  COSTS_N_INSNS (2),			/* constant shift costs */
  {COSTS_N_INSNS (12),			/* cost of starting multiply for QI */
   COSTS_N_INSNS (12),			/* HI */
   COSTS_N_INSNS (12),			/* SI */
   COSTS_N_INSNS (12),			/* DI */
   COSTS_N_INSNS (12)},			/* other */
  1,					/* cost of multiply per each bit set */
  {COSTS_N_INSNS (40),			/* cost of a divide/mod for QI */
   COSTS_N_INSNS (40),			/* HI */
   COSTS_N_INSNS (40),			/* SI */
   COSTS_N_INSNS (40),			/* DI */
   COSTS_N_INSNS (40)},			/* other */
  COSTS_N_INSNS (3),			/* cost of movsx */
  COSTS_N_INSNS (2),			/* cost of movzx */
  15,					/* "large" insn */
  4,					/* cost for loading QImode using movzbl */
  {2, 4, 2},				/* cost of loading integer registers
					   in QImode, HImode and SImode.
					   Relative to reg-reg move (2).  */
  {2, 4, 2},				/* cost of storing integer registers */
  2,					/* cost of reg,reg fld/fst */
  {8, 8, 8},				/* cost of loading fp registers
					   in SFmode, DFmode and XFmode */
  {8, 8, 8},				/* cost of storing fp registers
					   in SFmode, DFmode and XFmode */
  2,					/* cost of moving MMX register */
  {4, 8},				/* cost of loading MMX registers
					   in SImode and DImode */
  {4, 8},				/* cost of storing MMX registers
					   in SImode and DImode */
  2,					/* cost of moving SSE register */
  {4, 8, 16},				/* cost of loading SSE registers
					   in SImode, DImode and TImode */
  {4, 8, 16},				/* cost of storing SSE registers
					   in SImode, DImode and TImode */
  3,					/* MMX or SSE register to integer */
  4,					/* size of l1 cache.  486 has 8kB cache
					   shared for code and data, so 4kB is
					   not really precise.  */
  4,					/* size of l2 cache */
  0,					/* size of prefetch block */
  0,					/* number of parallel prefetches */
  COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns. */
  COSTS_N_INSNS (16),			/* cost of FMUL instruction. */
  COSTS_N_INSNS (73),			/* cost of FDIV instruction. */
  COSTS_N_INSNS (3),			/* cost of FABS instruction. */
  COSTS_N_INSNS (3),			/* cost of FCHS instruction. */
  COSTS_N_INSNS (83),			/* cost of FSQRT instruction. */
  {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
   DUMMY_STRINGOP_ALGS},
  {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
   DUMMY_STRINGOP_ALGS},
  1,					/* scalar_stmt_cost. */
  1,					/* scalar load_cost. */
  1,					/* scalar_store_cost. */
  1,					/* vec_stmt_cost. */
  1,					/* vec_to_scalar_cost. */
  1,					/* scalar_to_vec_cost. */
  1,					/* vec_align_load_cost. */
  2,					/* vec_unalign_load_cost. */
  1,					/* vec_store_cost. */
  3,					/* cond_taken_branch_cost. */
  1,					/* cond_not_taken_branch_cost. */
};
static const
struct processor_costs pentium_cost = {
  COSTS_N_INSNS (1),			/* cost of an add instruction */
  COSTS_N_INSNS (1),			/* cost of a lea instruction */
  COSTS_N_INSNS (4),			/* variable shift costs */
  COSTS_N_INSNS (1),			/* constant shift costs */
  {COSTS_N_INSNS (11),			/* cost of starting multiply for QI */
   COSTS_N_INSNS (11),			/* HI */
   COSTS_N_INSNS (11),			/* SI */
   COSTS_N_INSNS (11),			/* DI */
   COSTS_N_INSNS (11)},			/* other */
  0,					/* cost of multiply per each bit set */
  {COSTS_N_INSNS (25),			/* cost of a divide/mod for QI */
   COSTS_N_INSNS (25),			/* HI */
   COSTS_N_INSNS (25),			/* SI */
   COSTS_N_INSNS (25),			/* DI */
   COSTS_N_INSNS (25)},			/* other */
  COSTS_N_INSNS (3),			/* cost of movsx */
  COSTS_N_INSNS (2),			/* cost of movzx */
  8,					/* "large" insn */
  6,					/* cost for loading QImode using movzbl */
  {2, 4, 2},				/* cost of loading integer registers
					   in QImode, HImode and SImode.
					   Relative to reg-reg move (2).  */
  {2, 4, 2},				/* cost of storing integer registers */
  2,					/* cost of reg,reg fld/fst */
  {2, 2, 6},				/* cost of loading fp registers
					   in SFmode, DFmode and XFmode */
  {4, 4, 6},				/* cost of storing fp registers
					   in SFmode, DFmode and XFmode */
  8,					/* cost of moving MMX register */
  {8, 8},				/* cost of loading MMX registers
					   in SImode and DImode */
  {8, 8},				/* cost of storing MMX registers
					   in SImode and DImode */
  2,					/* cost of moving SSE register */
  {4, 8, 16},				/* cost of loading SSE registers
					   in SImode, DImode and TImode */
  {4, 8, 16},				/* cost of storing SSE registers
					   in SImode, DImode and TImode */
  3,					/* MMX or SSE register to integer */
  8,					/* size of l1 cache. */
  8,					/* size of l2 cache */
  0,					/* size of prefetch block */
  0,					/* number of parallel prefetches */
  COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns. */
  COSTS_N_INSNS (3),			/* cost of FMUL instruction. */
  COSTS_N_INSNS (39),			/* cost of FDIV instruction. */
  COSTS_N_INSNS (1),			/* cost of FABS instruction. */
  COSTS_N_INSNS (1),			/* cost of FCHS instruction. */
  COSTS_N_INSNS (70),			/* cost of FSQRT instruction. */
  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  {{libcall, {{-1, rep_prefix_4_byte}}},
   DUMMY_STRINGOP_ALGS},
  1,					/* scalar_stmt_cost. */
  1,					/* scalar load_cost. */
  1,					/* scalar_store_cost. */
  1,					/* vec_stmt_cost. */
  1,					/* vec_to_scalar_cost. */
  1,					/* scalar_to_vec_cost. */
  1,					/* vec_align_load_cost. */
  2,					/* vec_unalign_load_cost. */
  1,					/* vec_store_cost. */
  3,					/* cond_taken_branch_cost. */
  1,					/* cond_not_taken_branch_cost. */
};
static const
struct processor_costs pentiumpro_cost = {
  COSTS_N_INSNS (1),			/* cost of an add instruction */
  COSTS_N_INSNS (1),			/* cost of a lea instruction */
  COSTS_N_INSNS (1),			/* variable shift costs */
  COSTS_N_INSNS (1),			/* constant shift costs */
  {COSTS_N_INSNS (4),			/* cost of starting multiply for QI */
   COSTS_N_INSNS (4),			/* HI */
   COSTS_N_INSNS (4),			/* SI */
   COSTS_N_INSNS (4),			/* DI */
   COSTS_N_INSNS (4)},			/* other */
  0,					/* cost of multiply per each bit set */
  {COSTS_N_INSNS (17),			/* cost of a divide/mod for QI */
   COSTS_N_INSNS (17),			/* HI */
   COSTS_N_INSNS (17),			/* SI */
   COSTS_N_INSNS (17),			/* DI */
   COSTS_N_INSNS (17)},			/* other */
  COSTS_N_INSNS (1),			/* cost of movsx */
  COSTS_N_INSNS (1),			/* cost of movzx */
  8,					/* "large" insn */
  2,					/* cost for loading QImode using movzbl */
  {4, 4, 4},				/* cost of loading integer registers
					   in QImode, HImode and SImode.
					   Relative to reg-reg move (2).  */
  {2, 2, 2},				/* cost of storing integer registers */
  2,					/* cost of reg,reg fld/fst */
  {2, 2, 6},				/* cost of loading fp registers
					   in SFmode, DFmode and XFmode */
  {4, 4, 6},				/* cost of storing fp registers
					   in SFmode, DFmode and XFmode */
  2,					/* cost of moving MMX register */
  {2, 2},				/* cost of loading MMX registers
					   in SImode and DImode */
  {2, 2},				/* cost of storing MMX registers
					   in SImode and DImode */
  2,					/* cost of moving SSE register */
  {2, 2, 8},				/* cost of loading SSE registers
					   in SImode, DImode and TImode */
  {2, 2, 8},				/* cost of storing SSE registers
					   in SImode, DImode and TImode */
  3,					/* MMX or SSE register to integer */
  8,					/* size of l1 cache. */
  256,					/* size of l2 cache */
  32,					/* size of prefetch block */
  6,					/* number of parallel prefetches */
  COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns. */
  COSTS_N_INSNS (5),			/* cost of FMUL instruction. */
  COSTS_N_INSNS (56),			/* cost of FDIV instruction. */
  COSTS_N_INSNS (2),			/* cost of FABS instruction. */
  COSTS_N_INSNS (2),			/* cost of FCHS instruction. */
  COSTS_N_INSNS (56),			/* cost of FSQRT instruction. */
  /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
     (we ensure the alignment).  For small blocks an inline loop is still a
     noticeable win, for bigger blocks either rep movsl or rep movsb is the
     way to go.  Rep movsb apparently has a more expensive startup time in the
     CPU, but after 4K the difference is down in the noise.  */
  {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
			{8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
   DUMMY_STRINGOP_ALGS},
  {{rep_prefix_4_byte, {{1024, unrolled_loop},
			{8192, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  1,					/* scalar_stmt_cost. */
  1,					/* scalar load_cost. */
  1,					/* scalar_store_cost. */
  1,					/* vec_stmt_cost. */
  1,					/* vec_to_scalar_cost. */
  1,					/* scalar_to_vec_cost. */
  1,					/* vec_align_load_cost. */
  2,					/* vec_unalign_load_cost. */
  1,					/* vec_store_cost. */
  3,					/* cond_taken_branch_cost. */
  1,					/* cond_not_taken_branch_cost. */
};
static const
struct processor_costs geode_cost = {
  COSTS_N_INSNS (1),			/* cost of an add instruction */
  COSTS_N_INSNS (1),			/* cost of a lea instruction */
  COSTS_N_INSNS (2),			/* variable shift costs */
  COSTS_N_INSNS (1),			/* constant shift costs */
  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
   COSTS_N_INSNS (4),			/* HI */
   COSTS_N_INSNS (7),			/* SI */
   COSTS_N_INSNS (7),			/* DI */
   COSTS_N_INSNS (7)},			/* other */
  0,					/* cost of multiply per each bit set */
  {COSTS_N_INSNS (15),			/* cost of a divide/mod for QI */
   COSTS_N_INSNS (23),			/* HI */
   COSTS_N_INSNS (39),			/* SI */
   COSTS_N_INSNS (39),			/* DI */
   COSTS_N_INSNS (39)},			/* other */
  COSTS_N_INSNS (1),			/* cost of movsx */
  COSTS_N_INSNS (1),			/* cost of movzx */
  8,					/* "large" insn */
  1,					/* cost for loading QImode using movzbl */
  {1, 1, 1},				/* cost of loading integer registers
					   in QImode, HImode and SImode.
					   Relative to reg-reg move (2).  */
  {1, 1, 1},				/* cost of storing integer registers */
  1,					/* cost of reg,reg fld/fst */
  {1, 1, 1},				/* cost of loading fp registers
					   in SFmode, DFmode and XFmode */
  {4, 6, 6},				/* cost of storing fp registers
					   in SFmode, DFmode and XFmode */
  1,					/* cost of moving MMX register */
  {1, 1},				/* cost of loading MMX registers
					   in SImode and DImode */
  {1, 1},				/* cost of storing MMX registers
					   in SImode and DImode */
  1,					/* cost of moving SSE register */
  {1, 1, 1},				/* cost of loading SSE registers
					   in SImode, DImode and TImode */
  {1, 1, 1},				/* cost of storing SSE registers
					   in SImode, DImode and TImode */
  1,					/* MMX or SSE register to integer */
  64,					/* size of l1 cache. */
  128,					/* size of l2 cache. */
  32,					/* size of prefetch block */
  1,					/* number of parallel prefetches */
  COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns. */
  COSTS_N_INSNS (11),			/* cost of FMUL instruction. */
  COSTS_N_INSNS (47),			/* cost of FDIV instruction. */
  COSTS_N_INSNS (1),			/* cost of FABS instruction. */
  COSTS_N_INSNS (1),			/* cost of FCHS instruction. */
  COSTS_N_INSNS (54),			/* cost of FSQRT instruction. */
  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  1,					/* scalar_stmt_cost. */
  1,					/* scalar load_cost. */
  1,					/* scalar_store_cost. */
  1,					/* vec_stmt_cost. */
  1,					/* vec_to_scalar_cost. */
  1,					/* scalar_to_vec_cost. */
  1,					/* vec_align_load_cost. */
  2,					/* vec_unalign_load_cost. */
  1,					/* vec_store_cost. */
  3,					/* cond_taken_branch_cost. */
  1,					/* cond_not_taken_branch_cost. */
};
static const
struct processor_costs k6_cost = {
  COSTS_N_INSNS (1),			/* cost of an add instruction */
  COSTS_N_INSNS (2),			/* cost of a lea instruction */
  COSTS_N_INSNS (1),			/* variable shift costs */
  COSTS_N_INSNS (1),			/* constant shift costs */
  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
   COSTS_N_INSNS (3),			/* HI */
   COSTS_N_INSNS (3),			/* SI */
   COSTS_N_INSNS (3),			/* DI */
   COSTS_N_INSNS (3)},			/* other */
  0,					/* cost of multiply per each bit set */
  {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
   COSTS_N_INSNS (18),			/* HI */
   COSTS_N_INSNS (18),			/* SI */
   COSTS_N_INSNS (18),			/* DI */
   COSTS_N_INSNS (18)},			/* other */
  COSTS_N_INSNS (2),			/* cost of movsx */
  COSTS_N_INSNS (2),			/* cost of movzx */
  8,					/* "large" insn */
  3,					/* cost for loading QImode using movzbl */
  {4, 5, 4},				/* cost of loading integer registers
					   in QImode, HImode and SImode.
					   Relative to reg-reg move (2).  */
  {2, 3, 2},				/* cost of storing integer registers */
  4,					/* cost of reg,reg fld/fst */
  {6, 6, 6},				/* cost of loading fp registers
					   in SFmode, DFmode and XFmode */
  {4, 4, 4},				/* cost of storing fp registers
					   in SFmode, DFmode and XFmode */
  2,					/* cost of moving MMX register */
  {2, 2},				/* cost of loading MMX registers
					   in SImode and DImode */
  {2, 2},				/* cost of storing MMX registers
					   in SImode and DImode */
  2,					/* cost of moving SSE register */
  {2, 2, 8},				/* cost of loading SSE registers
					   in SImode, DImode and TImode */
  {2, 2, 8},				/* cost of storing SSE registers
					   in SImode, DImode and TImode */
  6,					/* MMX or SSE register to integer */
  32,					/* size of l1 cache. */
  32,					/* size of l2 cache.  Some models
					   have integrated l2 cache, but
					   optimizing for k6 is not important
					   enough to worry about that.  */
  32,					/* size of prefetch block */
  1,					/* number of parallel prefetches */
  COSTS_N_INSNS (2),			/* cost of FADD and FSUB insns. */
  COSTS_N_INSNS (2),			/* cost of FMUL instruction. */
  COSTS_N_INSNS (56),			/* cost of FDIV instruction. */
  COSTS_N_INSNS (2),			/* cost of FABS instruction. */
  COSTS_N_INSNS (2),			/* cost of FCHS instruction. */
  COSTS_N_INSNS (56),			/* cost of FSQRT instruction. */
  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  1,					/* scalar_stmt_cost. */
  1,					/* scalar load_cost. */
  1,					/* scalar_store_cost. */
  1,					/* vec_stmt_cost. */
  1,					/* vec_to_scalar_cost. */
  1,					/* scalar_to_vec_cost. */
  1,					/* vec_align_load_cost. */
  2,					/* vec_unalign_load_cost. */
  1,					/* vec_store_cost. */
  3,					/* cond_taken_branch_cost. */
  1,					/* cond_not_taken_branch_cost. */
};
static const
struct processor_costs athlon_cost = {
  COSTS_N_INSNS (1),			/* cost of an add instruction */
  COSTS_N_INSNS (2),			/* cost of a lea instruction */
  COSTS_N_INSNS (1),			/* variable shift costs */
  COSTS_N_INSNS (1),			/* constant shift costs */
  {COSTS_N_INSNS (5),			/* cost of starting multiply for QI */
   COSTS_N_INSNS (5),			/* HI */
   COSTS_N_INSNS (5),			/* SI */
   COSTS_N_INSNS (5),			/* DI */
   COSTS_N_INSNS (5)},			/* other */
  0,					/* cost of multiply per each bit set */
  {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
   COSTS_N_INSNS (26),			/* HI */
   COSTS_N_INSNS (42),			/* SI */
   COSTS_N_INSNS (74),			/* DI */
   COSTS_N_INSNS (74)},			/* other */
  COSTS_N_INSNS (1),			/* cost of movsx */
  COSTS_N_INSNS (1),			/* cost of movzx */
  8,					/* "large" insn */
  4,					/* cost for loading QImode using movzbl */
  {3, 4, 3},				/* cost of loading integer registers
					   in QImode, HImode and SImode.
					   Relative to reg-reg move (2).  */
  {3, 4, 3},				/* cost of storing integer registers */
  4,					/* cost of reg,reg fld/fst */
  {4, 4, 12},				/* cost of loading fp registers
					   in SFmode, DFmode and XFmode */
  {6, 6, 8},				/* cost of storing fp registers
					   in SFmode, DFmode and XFmode */
  2,					/* cost of moving MMX register */
  {4, 4},				/* cost of loading MMX registers
					   in SImode and DImode */
  {4, 4},				/* cost of storing MMX registers
					   in SImode and DImode */
  2,					/* cost of moving SSE register */
  {4, 4, 6},				/* cost of loading SSE registers
					   in SImode, DImode and TImode */
  {4, 4, 5},				/* cost of storing SSE registers
					   in SImode, DImode and TImode */
  5,					/* MMX or SSE register to integer */
  64,					/* size of l1 cache. */
  256,					/* size of l2 cache. */
  64,					/* size of prefetch block */
  6,					/* number of parallel prefetches */
  COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns. */
  COSTS_N_INSNS (4),			/* cost of FMUL instruction. */
  COSTS_N_INSNS (24),			/* cost of FDIV instruction. */
  COSTS_N_INSNS (2),			/* cost of FABS instruction. */
  COSTS_N_INSNS (2),			/* cost of FCHS instruction. */
  COSTS_N_INSNS (35),			/* cost of FSQRT instruction. */
  /* For some reason, Athlon deals better with REP prefix (relative to loops)
     compared to K8.  Alignment becomes important after 8 bytes for memcpy and
     128 bytes for memset.  */
  {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  1,					/* scalar_stmt_cost. */
  1,					/* scalar load_cost. */
  1,					/* scalar_store_cost. */
  1,					/* vec_stmt_cost. */
  1,					/* vec_to_scalar_cost. */
  1,					/* scalar_to_vec_cost. */
  1,					/* vec_align_load_cost. */
  2,					/* vec_unalign_load_cost. */
  1,					/* vec_store_cost. */
  3,					/* cond_taken_branch_cost. */
  1,					/* cond_not_taken_branch_cost. */
};
static const
struct processor_costs k8_cost = {
  COSTS_N_INSNS (1),			/* cost of an add instruction */
  COSTS_N_INSNS (2),			/* cost of a lea instruction */
  COSTS_N_INSNS (1),			/* variable shift costs */
  COSTS_N_INSNS (1),			/* constant shift costs */
  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
   COSTS_N_INSNS (4),			/* HI */
   COSTS_N_INSNS (3),			/* SI */
   COSTS_N_INSNS (4),			/* DI */
   COSTS_N_INSNS (5)},			/* other */
  0,					/* cost of multiply per each bit set */
  {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
   COSTS_N_INSNS (26),			/* HI */
   COSTS_N_INSNS (42),			/* SI */
   COSTS_N_INSNS (74),			/* DI */
   COSTS_N_INSNS (74)},			/* other */
  COSTS_N_INSNS (1),			/* cost of movsx */
  COSTS_N_INSNS (1),			/* cost of movzx */
  8,					/* "large" insn */
  4,					/* cost for loading QImode using movzbl */
  {3, 4, 3},				/* cost of loading integer registers
					   in QImode, HImode and SImode.
					   Relative to reg-reg move (2).  */
  {3, 4, 3},				/* cost of storing integer registers */
  4,					/* cost of reg,reg fld/fst */
  {4, 4, 12},				/* cost of loading fp registers
					   in SFmode, DFmode and XFmode */
  {6, 6, 8},				/* cost of storing fp registers
					   in SFmode, DFmode and XFmode */
  2,					/* cost of moving MMX register */
  {3, 3},				/* cost of loading MMX registers
					   in SImode and DImode */
  {4, 4},				/* cost of storing MMX registers
					   in SImode and DImode */
  2,					/* cost of moving SSE register */
  {4, 3, 6},				/* cost of loading SSE registers
					   in SImode, DImode and TImode */
  {4, 4, 5},				/* cost of storing SSE registers
					   in SImode, DImode and TImode */
  5,					/* MMX or SSE register to integer */
  64,					/* size of l1 cache. */
  512,					/* size of l2 cache. */
  64,					/* size of prefetch block */
  /* New AMD processors never drop prefetches; if they cannot be performed
     immediately, they are queued.  We set number of simultaneous prefetches
     to a large constant to reflect this (it probably is not a good idea not
     to limit number of prefetches at all, as their execution also takes some
     time).  */
  100,					/* number of parallel prefetches */
  COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns. */
  COSTS_N_INSNS (4),			/* cost of FMUL instruction. */
  COSTS_N_INSNS (19),			/* cost of FDIV instruction. */
  COSTS_N_INSNS (2),			/* cost of FABS instruction. */
  COSTS_N_INSNS (2),			/* cost of FCHS instruction. */
  COSTS_N_INSNS (35),			/* cost of FSQRT instruction. */
  /* K8 has an optimized REP instruction for medium-sized blocks, but for
     very small blocks it is better to use a loop.  For large blocks, a
     libcall can do non-temporal accesses and beat inline code considerably. */
  {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
   {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  {{libcall, {{8, loop}, {24, unrolled_loop},
	      {2048, rep_prefix_4_byte}, {-1, libcall}}},
   {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  4,					/* scalar_stmt_cost. */
  2,					/* scalar load_cost. */
  2,					/* scalar_store_cost. */
  5,					/* vec_stmt_cost. */
  0,					/* vec_to_scalar_cost. */
  2,					/* scalar_to_vec_cost. */
  2,					/* vec_align_load_cost. */
  3,					/* vec_unalign_load_cost. */
  3,					/* vec_store_cost. */
  6,					/* cond_taken_branch_cost. */
  1,					/* cond_not_taken_branch_cost. */
};
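/* Reading the K8 memcpy table above: the first stringop_algs of each pair
   is used in 32-bit mode, the second in 64-bit mode (tunings without a
   64-bit variant leave that slot as DUMMY_STRINGOP_ALGS).  So a known-size
   32-bit memcpy uses an inline loop up to 6 bytes, an unrolled loop up to
   14 bytes and rep movsl beyond that, while the 64-bit table switches from
   a loop to rep movsq at 16 bytes and to a library call past 8192 bytes. */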
struct processor_costs amdfam10_cost = {
  COSTS_N_INSNS (1),			/* cost of an add instruction */
  COSTS_N_INSNS (2),			/* cost of a lea instruction */
  COSTS_N_INSNS (1),			/* variable shift costs */
  COSTS_N_INSNS (1),			/* constant shift costs */
  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
   COSTS_N_INSNS (4),			/* HI */
   COSTS_N_INSNS (3),			/* SI */
   COSTS_N_INSNS (4),			/* DI */
   COSTS_N_INSNS (5)},			/* other */
  0,					/* cost of multiply per each bit set */
  {COSTS_N_INSNS (19),			/* cost of a divide/mod for QI */
   COSTS_N_INSNS (35),			/* HI */
   COSTS_N_INSNS (51),			/* SI */
   COSTS_N_INSNS (83),			/* DI */
   COSTS_N_INSNS (83)},			/* other */
  COSTS_N_INSNS (1),			/* cost of movsx */
  COSTS_N_INSNS (1),			/* cost of movzx */
  8,					/* "large" insn */
  4,					/* cost for loading QImode using movzbl */
  {3, 4, 3},				/* cost of loading integer registers
					   in QImode, HImode and SImode.
					   Relative to reg-reg move (2).  */
  {3, 4, 3},				/* cost of storing integer registers */
  4,					/* cost of reg,reg fld/fst */
  {4, 4, 12},				/* cost of loading fp registers
					   in SFmode, DFmode and XFmode */
  {6, 6, 8},				/* cost of storing fp registers
					   in SFmode, DFmode and XFmode */
  2,					/* cost of moving MMX register */
  {3, 3},				/* cost of loading MMX registers
					   in SImode and DImode */
  {4, 4},				/* cost of storing MMX registers
					   in SImode and DImode */
  2,					/* cost of moving SSE register */
  {4, 4, 3},				/* cost of loading SSE registers
					   in SImode, DImode and TImode */
  {4, 4, 5},				/* cost of storing SSE registers
					   in SImode, DImode and TImode */
  3,					/* MMX or SSE register to integer */
					/* On K8:
					    MOVD reg64, xmmreg  Double  FSTORE 4
					    MOVD reg32, xmmreg  Double  FSTORE 4
					   On AMDFAM10:
					    MOVD reg64, xmmreg  Double  FADD 3
					    MOVD reg32, xmmreg  Double  FADD 3 */
  64,					/* size of l1 cache. */
  512,					/* size of l2 cache. */
  64,					/* size of prefetch block */
  /* New AMD processors never drop prefetches; if they cannot be performed
     immediately, they are queued.  We set number of simultaneous prefetches
     to a large constant to reflect this (it probably is not a good idea not
     to limit number of prefetches at all, as their execution also takes some
     time).  */
  100,					/* number of parallel prefetches */
  COSTS_N_INSNS (4),			/* cost of FADD and FSUB insns. */
  COSTS_N_INSNS (4),			/* cost of FMUL instruction. */
  COSTS_N_INSNS (19),			/* cost of FDIV instruction. */
  COSTS_N_INSNS (2),			/* cost of FABS instruction. */
  COSTS_N_INSNS (2),			/* cost of FCHS instruction. */
  COSTS_N_INSNS (35),			/* cost of FSQRT instruction. */

  /* AMDFAM10 has an optimized REP instruction for medium-sized blocks, but
     for very small blocks it is better to use a loop.  For large blocks, a
     libcall can do non-temporal accesses and beat inline code considerably. */
  {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
   {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  {{libcall, {{8, loop}, {24, unrolled_loop},
	      {2048, rep_prefix_4_byte}, {-1, libcall}}},
   {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  4,					/* scalar_stmt_cost. */
  2,					/* scalar load_cost. */
  2,					/* scalar_store_cost. */
  6,					/* vec_stmt_cost. */
  0,					/* vec_to_scalar_cost. */
  2,					/* scalar_to_vec_cost. */
  2,					/* vec_align_load_cost. */
  2,					/* vec_unalign_load_cost. */
  2,					/* vec_store_cost. */
  6,					/* cond_taken_branch_cost. */
  1,					/* cond_not_taken_branch_cost. */
};
static const
struct processor_costs pentium4_cost = {
  COSTS_N_INSNS (1),			/* cost of an add instruction */
  COSTS_N_INSNS (3),			/* cost of a lea instruction */
  COSTS_N_INSNS (4),			/* variable shift costs */
  COSTS_N_INSNS (4),			/* constant shift costs */
  {COSTS_N_INSNS (15),			/* cost of starting multiply for QI */
   COSTS_N_INSNS (15),			/* HI */
   COSTS_N_INSNS (15),			/* SI */
   COSTS_N_INSNS (15),			/* DI */
   COSTS_N_INSNS (15)},			/* other */
  0,					/* cost of multiply per each bit set */
  {COSTS_N_INSNS (56),			/* cost of a divide/mod for QI */
   COSTS_N_INSNS (56),			/* HI */
   COSTS_N_INSNS (56),			/* SI */
   COSTS_N_INSNS (56),			/* DI */
   COSTS_N_INSNS (56)},			/* other */
  COSTS_N_INSNS (1),			/* cost of movsx */
  COSTS_N_INSNS (1),			/* cost of movzx */
  16,					/* "large" insn */
  2,					/* cost for loading QImode using movzbl */
  {4, 5, 4},				/* cost of loading integer registers
					   in QImode, HImode and SImode.
					   Relative to reg-reg move (2).  */
  {2, 3, 2},				/* cost of storing integer registers */
  2,					/* cost of reg,reg fld/fst */
  {2, 2, 6},				/* cost of loading fp registers
					   in SFmode, DFmode and XFmode */
  {4, 4, 6},				/* cost of storing fp registers
					   in SFmode, DFmode and XFmode */
  2,					/* cost of moving MMX register */
  {2, 2},				/* cost of loading MMX registers
					   in SImode and DImode */
  {2, 2},				/* cost of storing MMX registers
					   in SImode and DImode */
  12,					/* cost of moving SSE register */
  {12, 12, 12},				/* cost of loading SSE registers
					   in SImode, DImode and TImode */
  {2, 2, 8},				/* cost of storing SSE registers
					   in SImode, DImode and TImode */
  10,					/* MMX or SSE register to integer */
  8,					/* size of l1 cache. */
  256,					/* size of l2 cache. */
  64,					/* size of prefetch block */
  6,					/* number of parallel prefetches */
  COSTS_N_INSNS (5),			/* cost of FADD and FSUB insns. */
  COSTS_N_INSNS (7),			/* cost of FMUL instruction. */
  COSTS_N_INSNS (43),			/* cost of FDIV instruction. */
  COSTS_N_INSNS (2),			/* cost of FABS instruction. */
  COSTS_N_INSNS (2),			/* cost of FCHS instruction. */
  COSTS_N_INSNS (43),			/* cost of FSQRT instruction. */
  {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
   DUMMY_STRINGOP_ALGS},
  {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
	      {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  1,					/* scalar_stmt_cost. */
  1,					/* scalar load_cost. */
  1,					/* scalar_store_cost. */
  1,					/* vec_stmt_cost. */
  1,					/* vec_to_scalar_cost. */
  1,					/* scalar_to_vec_cost. */
  1,					/* vec_align_load_cost. */
  2,					/* vec_unalign_load_cost. */
  1,					/* vec_store_cost. */
  3,					/* cond_taken_branch_cost. */
  1,					/* cond_not_taken_branch_cost. */
};
static const
struct processor_costs nocona_cost = {
  COSTS_N_INSNS (1),			/* cost of an add instruction */
  COSTS_N_INSNS (1),			/* cost of a lea instruction */
  COSTS_N_INSNS (1),			/* variable shift costs */
  COSTS_N_INSNS (1),			/* constant shift costs */
  {COSTS_N_INSNS (10),			/* cost of starting multiply for QI */
   COSTS_N_INSNS (10),			/* HI */
   COSTS_N_INSNS (10),			/* SI */
   COSTS_N_INSNS (10),			/* DI */
   COSTS_N_INSNS (10)},			/* other */
  0,					/* cost of multiply per each bit set */
  {COSTS_N_INSNS (66),			/* cost of a divide/mod for QI */
   COSTS_N_INSNS (66),			/* HI */
   COSTS_N_INSNS (66),			/* SI */
   COSTS_N_INSNS (66),			/* DI */
   COSTS_N_INSNS (66)},			/* other */
  COSTS_N_INSNS (1),			/* cost of movsx */
  COSTS_N_INSNS (1),			/* cost of movzx */
  16,					/* "large" insn */
  4,					/* cost for loading QImode using movzbl */
  {4, 4, 4},				/* cost of loading integer registers
					   in QImode, HImode and SImode.
					   Relative to reg-reg move (2).  */
  {4, 4, 4},				/* cost of storing integer registers */
  3,					/* cost of reg,reg fld/fst */
  {12, 12, 12},				/* cost of loading fp registers
					   in SFmode, DFmode and XFmode */
  {4, 4, 4},				/* cost of storing fp registers
					   in SFmode, DFmode and XFmode */
  6,					/* cost of moving MMX register */
  {12, 12},				/* cost of loading MMX registers
					   in SImode and DImode */
  {12, 12},				/* cost of storing MMX registers
					   in SImode and DImode */
  6,					/* cost of moving SSE register */
  {12, 12, 12},				/* cost of loading SSE registers
					   in SImode, DImode and TImode */
  {12, 12, 12},				/* cost of storing SSE registers
					   in SImode, DImode and TImode */
  8,					/* MMX or SSE register to integer */
  8,					/* size of l1 cache. */
  1024,					/* size of l2 cache. */
  128,					/* size of prefetch block */
  8,					/* number of parallel prefetches */
  COSTS_N_INSNS (6),			/* cost of FADD and FSUB insns. */
  COSTS_N_INSNS (8),			/* cost of FMUL instruction. */
  COSTS_N_INSNS (40),			/* cost of FDIV instruction. */
  COSTS_N_INSNS (3),			/* cost of FABS instruction. */
  COSTS_N_INSNS (3),			/* cost of FCHS instruction. */
  COSTS_N_INSNS (44),			/* cost of FSQRT instruction. */
  {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
   {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
	      {100000, unrolled_loop}, {-1, libcall}}}},
  {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
	      {-1, libcall}}},
   {libcall, {{24, loop}, {64, unrolled_loop},
	      {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  1,					/* scalar_stmt_cost. */
  1,					/* scalar load_cost. */
  1,					/* scalar_store_cost. */
  1,					/* vec_stmt_cost. */
  1,					/* vec_to_scalar_cost. */
  1,					/* scalar_to_vec_cost. */
  1,					/* vec_align_load_cost. */
  2,					/* vec_unalign_load_cost. */
  1,					/* vec_store_cost. */
  3,					/* cond_taken_branch_cost. */
  1,					/* cond_not_taken_branch_cost. */
};
static const
struct processor_costs core2_cost = {
  COSTS_N_INSNS (1),			/* cost of an add instruction */
  COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
  COSTS_N_INSNS (1),			/* variable shift costs */
  COSTS_N_INSNS (1),			/* constant shift costs */
  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
   COSTS_N_INSNS (3),			/* HI */
   COSTS_N_INSNS (3),			/* SI */
   COSTS_N_INSNS (3),			/* DI */
   COSTS_N_INSNS (3)},			/* other */
  0,					/* cost of multiply per each bit set */
  {COSTS_N_INSNS (22),			/* cost of a divide/mod for QI */
   COSTS_N_INSNS (22),			/* HI */
   COSTS_N_INSNS (22),			/* SI */
   COSTS_N_INSNS (22),			/* DI */
   COSTS_N_INSNS (22)},			/* other */
  COSTS_N_INSNS (1),			/* cost of movsx */
  COSTS_N_INSNS (1),			/* cost of movzx */
  8,					/* "large" insn */
  2,					/* cost for loading QImode using movzbl */
  {6, 6, 6},				/* cost of loading integer registers
					   in QImode, HImode and SImode.
					   Relative to reg-reg move (2).  */
  {4, 4, 4},				/* cost of storing integer registers */
  2,					/* cost of reg,reg fld/fst */
  {6, 6, 6},				/* cost of loading fp registers
					   in SFmode, DFmode and XFmode */
  {4, 4, 4},				/* cost of storing fp registers
					   in SFmode, DFmode and XFmode */
  2,					/* cost of moving MMX register */
  {6, 6},				/* cost of loading MMX registers
					   in SImode and DImode */
  {4, 4},				/* cost of storing MMX registers
					   in SImode and DImode */
  2,					/* cost of moving SSE register */
  {6, 6, 6},				/* cost of loading SSE registers
					   in SImode, DImode and TImode */
  {4, 4, 4},				/* cost of storing SSE registers
					   in SImode, DImode and TImode */
  2,					/* MMX or SSE register to integer */
  32,					/* size of l1 cache. */
  2048,					/* size of l2 cache. */
  128,					/* size of prefetch block */
  8,					/* number of parallel prefetches */
  3,					/* Branch cost */
  COSTS_N_INSNS (3),			/* cost of FADD and FSUB insns. */
  COSTS_N_INSNS (5),			/* cost of FMUL instruction. */
  COSTS_N_INSNS (32),			/* cost of FDIV instruction. */
  COSTS_N_INSNS (1),			/* cost of FABS instruction. */
  COSTS_N_INSNS (1),			/* cost of FCHS instruction. */
  COSTS_N_INSNS (58),			/* cost of FSQRT instruction. */
  {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
   {libcall, {{32, loop}, {64, rep_prefix_4_byte},
	      {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  {{libcall, {{8, loop}, {15, unrolled_loop},
	      {2048, rep_prefix_4_byte}, {-1, libcall}}},
   {libcall, {{24, loop}, {32, unrolled_loop},
	      {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  1,					/* scalar_stmt_cost. */
  1,					/* scalar load_cost. */
  1,					/* scalar_store_cost. */
  1,					/* vec_stmt_cost. */
  1,					/* vec_to_scalar_cost. */
  1,					/* scalar_to_vec_cost. */
  1,					/* vec_align_load_cost. */
  2,					/* vec_unalign_load_cost. */
  1,					/* vec_store_cost. */
  3,					/* cond_taken_branch_cost. */
  1,					/* cond_not_taken_branch_cost. */
};
/* Generic64 should produce code tuned for Nocona and K8.  */
static const
struct processor_costs generic64_cost = {
  COSTS_N_INSNS (1),			/* cost of an add instruction */
  /* On all chips taken into consideration, lea takes 2 cycles or more.
     With this cost, however, our current implementation of synth_mult
     uses unnecessary temporary registers, causing a regression on several
     SPECfp benchmarks.  */
  COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
  COSTS_N_INSNS (1),			/* variable shift costs */
  COSTS_N_INSNS (1),			/* constant shift costs */
  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
   COSTS_N_INSNS (4),			/* HI */
   COSTS_N_INSNS (3),			/* SI */
   COSTS_N_INSNS (4),			/* DI */
   COSTS_N_INSNS (2)},			/* other */
  0,					/* cost of multiply per each bit set */
  {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
   COSTS_N_INSNS (26),			/* HI */
   COSTS_N_INSNS (42),			/* SI */
   COSTS_N_INSNS (74),			/* DI */
   COSTS_N_INSNS (74)},			/* other */
  COSTS_N_INSNS (1),			/* cost of movsx */
  COSTS_N_INSNS (1),			/* cost of movzx */
  8,					/* "large" insn */
  17,					/* MOVE_RATIO */
  4,					/* cost for loading QImode using movzbl */
  {4, 4, 4},				/* cost of loading integer registers
					   in QImode, HImode and SImode.
					   Relative to reg-reg move (2).  */
  {4, 4, 4},				/* cost of storing integer registers */
  4,					/* cost of reg,reg fld/fst */
  {12, 12, 12},				/* cost of loading fp registers
					   in SFmode, DFmode and XFmode */
  {6, 6, 8},				/* cost of storing fp registers
					   in SFmode, DFmode and XFmode */
  2,					/* cost of moving MMX register */
  {8, 8},				/* cost of loading MMX registers
					   in SImode and DImode */
  {8, 8},				/* cost of storing MMX registers
					   in SImode and DImode */
  2,					/* cost of moving SSE register */
  {8, 8, 8},				/* cost of loading SSE registers
					   in SImode, DImode and TImode */
  {8, 8, 8},				/* cost of storing SSE registers
					   in SImode, DImode and TImode */
  5,					/* MMX or SSE register to integer */
  32,					/* size of l1 cache. */
  512,					/* size of l2 cache. */
  64,					/* size of prefetch block */
  6,					/* number of parallel prefetches */
  /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
     value is increased to the perhaps more appropriate value of 5.  */
  3,					/* Branch cost */
  COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns. */
  COSTS_N_INSNS (8),			/* cost of FMUL instruction. */
  COSTS_N_INSNS (20),			/* cost of FDIV instruction. */
  COSTS_N_INSNS (8),			/* cost of FABS instruction. */
  COSTS_N_INSNS (8),			/* cost of FCHS instruction. */
  COSTS_N_INSNS (40),			/* cost of FSQRT instruction. */
  {DUMMY_STRINGOP_ALGS,
   {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  {DUMMY_STRINGOP_ALGS,
   {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  1,					/* scalar_stmt_cost. */
  1,					/* scalar load_cost. */
  1,					/* scalar_store_cost. */
  1,					/* vec_stmt_cost. */
  1,					/* vec_to_scalar_cost. */
  1,					/* scalar_to_vec_cost. */
  1,					/* vec_align_load_cost. */
  2,					/* vec_unalign_load_cost. */
  1,					/* vec_store_cost. */
  3,					/* cond_taken_branch_cost. */
  1,					/* cond_not_taken_branch_cost. */
};
/* Generic32 should produce code tuned for Athlon, PPro, Pentium4, Nocona
   and K8.  */
static const
struct processor_costs generic32_cost = {
  COSTS_N_INSNS (1),			/* cost of an add instruction */
  COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
  COSTS_N_INSNS (1),			/* variable shift costs */
  COSTS_N_INSNS (1),			/* constant shift costs */
  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
   COSTS_N_INSNS (4),			/* HI */
   COSTS_N_INSNS (3),			/* SI */
   COSTS_N_INSNS (4),			/* DI */
   COSTS_N_INSNS (2)},			/* other */
  0,					/* cost of multiply per each bit set */
  {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
   COSTS_N_INSNS (26),			/* HI */
   COSTS_N_INSNS (42),			/* SI */
   COSTS_N_INSNS (74),			/* DI */
   COSTS_N_INSNS (74)},			/* other */
  COSTS_N_INSNS (1),			/* cost of movsx */
  COSTS_N_INSNS (1),			/* cost of movzx */
  8,					/* "large" insn */
  17,					/* MOVE_RATIO */
  4,					/* cost for loading QImode using movzbl */
  {4, 4, 4},				/* cost of loading integer registers
					   in QImode, HImode and SImode.
					   Relative to reg-reg move (2).  */
  {4, 4, 4},				/* cost of storing integer registers */
  4,					/* cost of reg,reg fld/fst */
  {12, 12, 12},				/* cost of loading fp registers
					   in SFmode, DFmode and XFmode */
  {6, 6, 8},				/* cost of storing fp registers
					   in SFmode, DFmode and XFmode */
  2,					/* cost of moving MMX register */
  {8, 8},				/* cost of loading MMX registers
					   in SImode and DImode */
  {8, 8},				/* cost of storing MMX registers
					   in SImode and DImode */
  2,					/* cost of moving SSE register */
  {8, 8, 8},				/* cost of loading SSE registers
					   in SImode, DImode and TImode */
  {8, 8, 8},				/* cost of storing SSE registers
					   in SImode, DImode and TImode */
  5,					/* MMX or SSE register to integer */
  32,					/* size of l1 cache. */
  256,					/* size of l2 cache. */
  64,					/* size of prefetch block */
  6,					/* number of parallel prefetches */
  3,					/* Branch cost */
  COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns. */
  COSTS_N_INSNS (8),			/* cost of FMUL instruction. */
  COSTS_N_INSNS (20),			/* cost of FDIV instruction. */
  COSTS_N_INSNS (8),			/* cost of FABS instruction. */
  COSTS_N_INSNS (8),			/* cost of FCHS instruction. */
  COSTS_N_INSNS (40),			/* cost of FSQRT instruction. */
  {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  1,					/* scalar_stmt_cost. */
  1,					/* scalar load_cost. */
  1,					/* scalar_store_cost. */
  1,					/* vec_stmt_cost. */
  1,					/* vec_to_scalar_cost. */
  1,					/* scalar_to_vec_cost. */
  1,					/* vec_align_load_cost. */
  2,					/* vec_unalign_load_cost. */
  1,					/* vec_store_cost. */
  3,					/* cond_taken_branch_cost. */
  1,					/* cond_not_taken_branch_cost. */
};
const struct processor_costs *ix86_cost = &pentium_cost;

/* Processor feature/optimization bitmasks.  */
#define m_386 (1<<PROCESSOR_I386)
#define m_486 (1<<PROCESSOR_I486)
#define m_PENT (1<<PROCESSOR_PENTIUM)
#define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
#define m_PENT4 (1<<PROCESSOR_PENTIUM4)
#define m_NOCONA (1<<PROCESSOR_NOCONA)
#define m_CORE2 (1<<PROCESSOR_CORE2)

#define m_GEODE (1<<PROCESSOR_GEODE)
#define m_K6 (1<<PROCESSOR_K6)
#define m_K6_GEODE (m_K6 | m_GEODE)
#define m_K8 (1<<PROCESSOR_K8)
#define m_ATHLON (1<<PROCESSOR_ATHLON)
#define m_ATHLON_K8 (m_K8 | m_ATHLON)
#define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
#define m_AMD_MULTIPLE (m_K8 | m_ATHLON | m_AMDFAM10)

#define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
#define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
/* Generic instruction choice should be a common subset of the supported
   CPUs (PPro/PENT4/NOCONA/CORE2/Athlon/K8).  */
#define m_GENERIC (m_GENERIC32 | m_GENERIC64)
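/* A rough sketch of how these masks are consumed (the TARGET_* wrappers
   that do this for real live in i386.h): a tuning is active when the bit
   for the processor selected by -mtune, 1 << ix86_tune, is set in the
   feature's entry, e.g.

     if (ix86_tune_features[X86_TUNE_USE_LEAVE] & (1 << ix86_tune))
       ... prefer "leave" in epilogues ...  */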
/* Feature tests against the various tunings.  */
unsigned int ix86_tune_features[X86_TUNE_LAST] = {
  /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
     negatively, so enabling for Generic64 seems like a good code-size
     tradeoff.  We can't enable it for 32bit generic because it does not
     work well with PPro-based chips.  */
  m_386 | m_K6_GEODE | m_AMD_MULTIPLE | m_CORE2 | m_GENERIC64,

  /* X86_TUNE_PUSH_MEMORY */
  m_386 | m_K6_GEODE | m_AMD_MULTIPLE | m_PENT4
  | m_NOCONA | m_CORE2 | m_GENERIC,

  /* X86_TUNE_ZERO_EXTEND_WITH_AND */

  /* X86_TUNE_USE_BIT_TEST */

  /* X86_TUNE_UNROLL_STRLEN */
  m_486 | m_PENT | m_PPRO | m_AMD_MULTIPLE | m_K6 | m_CORE2 | m_GENERIC,

  /* X86_TUNE_DEEP_BRANCH_PREDICTION */
  m_PPRO | m_K6_GEODE | m_AMD_MULTIPLE | m_PENT4 | m_GENERIC,

  /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
     on simulation results.  But after P4 was made, no performance benefit
     was observed with branch hints.  It also increases the code size.
     As a result, icc never generates branch hints.  */

  /* X86_TUNE_DOUBLE_WITH_ADD */

  /* X86_TUNE_USE_SAHF */
  m_PPRO | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_PENT4
  | m_NOCONA | m_CORE2 | m_GENERIC,

  /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
     partial dependencies.  */
  m_AMD_MULTIPLE | m_PPRO | m_PENT4 | m_NOCONA
  | m_CORE2 | m_GENERIC | m_GEODE /* m_386 | m_K6 */,

  /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
     register stalls on the Generic32 compilation setting as well.  However,
     in the current implementation the partial register stalls are not
     eliminated very well; they can be introduced via subregs synthesized
     by combine and can happen in caller/callee saving sequences.  Because
     this option pays back little on PPro-based chips and conflicts with
     the partial reg dependencies used by Athlon/P4-based chips, it is
     better to leave it off for generic32 for now.  */

  /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
  m_CORE2 | m_GENERIC,

  /* X86_TUNE_USE_HIMODE_FIOP */
  m_386 | m_486 | m_K6_GEODE,

  /* X86_TUNE_USE_SIMODE_FIOP */
  ~(m_PPRO | m_AMD_MULTIPLE | m_PENT | m_CORE2 | m_GENERIC),

  /* X86_TUNE_USE_MOV0 */

  /* X86_TUNE_USE_CLTD */
  ~(m_PENT | m_K6 | m_CORE2 | m_GENERIC),

  /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx.  */

  /* X86_TUNE_SPLIT_LONG_MOVES */

  /* X86_TUNE_READ_MODIFY_WRITE */

  /* X86_TUNE_READ_MODIFY */

  /* X86_TUNE_PROMOTE_QIMODE */
  m_K6_GEODE | m_PENT | m_386 | m_486 | m_AMD_MULTIPLE | m_CORE2
  | m_GENERIC /* | m_PENT4 ? */,

  /* X86_TUNE_FAST_PREFIX */
  ~(m_PENT | m_486 | m_386),

  /* X86_TUNE_SINGLE_STRINGOP */
  m_386 | m_PENT4 | m_NOCONA,

  /* X86_TUNE_QIMODE_MATH */

  /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
     register stalls.  Just like X86_TUNE_PARTIAL_REG_STALL, this option
     might be considered for Generic32 if our scheme for avoiding partial
     stalls was more effective.  */

  /* X86_TUNE_PROMOTE_QI_REGS */

  /* X86_TUNE_PROMOTE_HI_REGS */

  /* X86_TUNE_ADD_ESP_4: Enable if add/sub is preferred over 1/2 push/pop.  */
  m_AMD_MULTIPLE | m_K6_GEODE | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,

  /* X86_TUNE_ADD_ESP_8 */
  m_AMD_MULTIPLE | m_PPRO | m_K6_GEODE | m_386
  | m_486 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,

  /* X86_TUNE_SUB_ESP_4 */
  m_AMD_MULTIPLE | m_PPRO | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,

  /* X86_TUNE_SUB_ESP_8 */
  m_AMD_MULTIPLE | m_PPRO | m_386 | m_486
  | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,

  /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
     for DFmode copies */
  ~(m_AMD_MULTIPLE | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2
    | m_GENERIC | m_GEODE),

  /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
  m_AMD_MULTIPLE | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,

  /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
     conflict between PPro/Pentium4-based chips, which treat 128bit SSE
     registers as single units, and K8-based chips, which divide SSE
     registers into two 64bit halves.  This knob promotes all store
     destinations to 128bit to allow register renaming on 128bit SSE units,
     but usually results in one extra micro-op on 64bit SSE units.
     Experimental results show that disabling this option on P4 brings over
     20% SPECfp regression, while enabling it on K8 brings a roughly 2.4%
     regression that can be partly masked by careful scheduling of moves.  */
  m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC | m_AMDFAM10,

  /* X86_TUNE_SSE_UNALIGNED_MOVE_OPTIMAL */

  /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
     are resolved on SSE register parts instead of whole registers, so we may
     maintain just the lower part of scalar values in proper format, leaving
     the upper part undefined.  */

  /* X86_TUNE_SSE_TYPELESS_STORES */

  /* X86_TUNE_SSE_LOAD0_BY_PXOR */
  m_PPRO | m_PENT4 | m_NOCONA,

  /* X86_TUNE_MEMORY_MISMATCH_STALL */
  m_AMD_MULTIPLE | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,

  /* X86_TUNE_PROLOGUE_USING_MOVE */
  m_ATHLON_K8 | m_PPRO | m_CORE2 | m_GENERIC,

  /* X86_TUNE_EPILOGUE_USING_MOVE */
  m_ATHLON_K8 | m_PPRO | m_CORE2 | m_GENERIC,

  /* X86_TUNE_SHIFT1 */

  /* X86_TUNE_USE_FFREEP */

  /* X86_TUNE_INTER_UNIT_MOVES */
  ~(m_AMD_MULTIPLE | m_GENERIC),

  /* X86_TUNE_INTER_UNIT_CONVERSIONS */

  /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
     than 4 branch instructions in the 16-byte window.  */
  m_PPRO | m_AMD_MULTIPLE | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,

  /* X86_TUNE_SCHEDULE */
  m_PPRO | m_AMD_MULTIPLE | m_K6_GEODE | m_PENT | m_CORE2 | m_GENERIC,

  /* X86_TUNE_USE_BT */

  /* X86_TUNE_USE_INCDEC */
  ~(m_PENT4 | m_NOCONA | m_GENERIC),

  /* X86_TUNE_PAD_RETURNS */
  m_AMD_MULTIPLE | m_CORE2 | m_GENERIC,

  /* X86_TUNE_EXT_80387_CONSTANTS */
  m_K6_GEODE | m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC,

  /* X86_TUNE_SHORTEN_X87_SSE */

  /* X86_TUNE_AVOID_VECTOR_DECODE */

  /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for
     HImode and SImode multiply, but 386 and 486 do HImode multiply
     faster.  */

  /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
     vector path on AMD machines.  */
  m_K8 | m_GENERIC64 | m_AMDFAM10,

  /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
     machines.  */
  m_K8 | m_GENERIC64 | m_AMDFAM10,

  /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
     than a MOV.  */

  /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
     but one byte longer.  */

  /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory
     operand that cannot be represented using a modRM byte.  The XOR
     replacement is long decoded, so this split helps here as well.  */

  /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
     from integer to FP.  */
};
/* Feature tests against the various architecture variations.  */
unsigned int ix86_arch_features[X86_ARCH_LAST] = {
  /* X86_ARCH_CMOVE: Conditional move was added for pentiumpro.  */
  ~(m_386 | m_486 | m_PENT | m_K6),

  /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486.  */

  /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for
     pentium.  */

  /* X86_ARCH_XADD: Exchange and add was added for 80486.  */

  /* X86_ARCH_BSWAP: Byteswap was added for 80486.  */
};
static const unsigned int x86_accumulate_outgoing_args
  = m_AMD_MULTIPLE | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC;

static const unsigned int x86_arch_always_fancy_math_387
  = m_PENT | m_PPRO | m_AMD_MULTIPLE | m_PENT4
    | m_NOCONA | m_CORE2 | m_GENERIC;

static enum stringop_alg stringop_alg = no_stringop;
/* In case the average insn count for single function invocation is
   lower than this constant, emit fast (but longer) prologue and
   epilogue code.  */
#define FAST_PROLOGUE_INSN_COUNT 20

/* Names for 8 (low), 8 (high), and 16-bit registers, respectively.  */
static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
/* Array of the smallest class containing reg number REGNO, indexed by
   REGNO.  Used by REGNO_REG_CLASS in i386.h.  */

enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
{
  /* ax, dx, cx, bx */
  AREG, DREG, CREG, BREG,
  /* si, di, bp, sp */
  SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
  /* FP registers */
  FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
  FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
  /* arg pointer */
  NON_Q_REGS,
  /* flags, fpsr, fpcr, frame */
  NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
  /* SSE registers */
  SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
  SSE_REGS, SSE_REGS,
  /* MMX registers */
  MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
  MMX_REGS, MMX_REGS,
  /* REX registers */
  NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
  NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
  /* SSE REX registers */
  SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
  SSE_REGS, SSE_REGS,
};
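/* For instance, REGNO_REG_CLASS (1) is DREG (note the ax, dx, cx, bx
   hard-register order above), and only %st(0) and %st(1) get classes
   more specific than FLOAT_REGS.  */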
/* The "default" register map used in 32bit mode.  */

int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
{
  0, 2, 1, 3, 6, 7, 4, 5,		/* general regs */
  12, 13, 14, 15, 16, 17, 18, 19,	/* fp regs */
  -1, -1, -1, -1, -1,			/* arg, flags, fpsr, fpcr, frame */
  21, 22, 23, 24, 25, 26, 27, 28,	/* SSE */
  29, 30, 31, 32, 33, 34, 35, 36,	/* MMX */
  -1, -1, -1, -1, -1, -1, -1, -1,	/* extended integer registers */
  -1, -1, -1, -1, -1, -1, -1, -1,	/* extended SSE registers */
};
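/* Reading the map: dbx_register_map[2] == 1, i.e. gcc's regno 2 (%ecx)
   is emitted as debug register 1.  The SVR4-specific map at the end of
   this block differs from this default map in only a few entries, e.g.
   the %ebp and %esp encodings are swapped.  */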
static int const x86_64_int_parameter_registers[6] =
{
  5 /*RDI*/, 4 /*RSI*/, 1 /*RDX*/, 2 /*RCX*/,
  FIRST_REX_INT_REG /*R8 */, FIRST_REX_INT_REG + 1 /*R9 */
};

static int const x86_64_ms_abi_int_parameter_registers[4] =
{
  2 /*RCX*/, 1 /*RDX*/,
  FIRST_REX_INT_REG /*R8 */, FIRST_REX_INT_REG + 1 /*R9 */
};

static int const x86_64_int_return_registers[4] =
{
  0 /*RAX*/, 1 /*RDX*/, 5 /*RDI*/, 4 /*RSI*/
};
1534 /* The "default" register map used in 64bit mode. */
1535 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
1537 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
1538 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
1539 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
1540 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
1541 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
1542 8, 9, 10, 11, 12, 13, 14, 15, /* extended integer registers */
1543 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
1546 /* Define the register numbers to be used in Dwarf debugging information.
1547 The SVR4 reference port C compiler uses the following register numbers
1548 in its Dwarf output code:
1549 0 for %eax (gcc regno = 0)
1550 1 for %ecx (gcc regno = 2)
1551 2 for %edx (gcc regno = 1)
1552 3 for %ebx (gcc regno = 3)
1553 4 for %esp (gcc regno = 7)
1554 5 for %ebp (gcc regno = 6)
1555 6 for %esi (gcc regno = 4)
1556 7 for %edi (gcc regno = 5)
1557 The following three DWARF register numbers are never generated by
1558 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
1559 believes these numbers have these meanings.
1560 8 for %eip (no gcc equivalent)
1561 9 for %eflags (gcc regno = 17)
1562 10 for %trapno (no gcc equivalent)
1563 It is not at all clear how we should number the FP stack registers
1564 for the x86 architecture. If the version of SDB on x86/svr4 were
1565 a bit less brain dead with respect to floating-point then we would
1566 have a precedent to follow with respect to DWARF register numbers
1567 for x86 FP registers, but the SDB on x86/svr4 is so completely
1568 broken with respect to FP registers that it is hardly worth thinking
1569 of it as something to strive for compatibility with.
1570 The version of x86/svr4 SDB I have at the moment does (partially)
1571 seem to believe that DWARF register number 11 is associated with
1572 the x86 register %st(0), but that's about all. Higher DWARF
1573 register numbers don't seem to be associated with anything in
1574 particular, and even for DWARF regno 11, SDB only seems to under-
1575 stand that it should say that a variable lives in %st(0) (when
1576 asked via an `=' command) if we said it was in DWARF regno 11,
1577 but SDB still prints garbage when asked for the value of the
1578 variable in question (via a `/' command).
1579 (Also note that the labels SDB prints for various FP stack regs
1580 when doing an `x' command are all wrong.)
1581 Note that these problems generally don't affect the native SVR4
1582 C compiler because it doesn't allow the use of -O with -g and
1583 because when it is *not* optimizing, it allocates a memory
1584 location for each floating-point variable, and the memory
1585 location is what gets described in the DWARF AT_location
1586 attribute for the variable in question.
1587 Regardless of the severe mental illness of the x86/svr4 SDB, we
1588 do something sensible here and we use the following DWARF
1589 register numbers. Note that these are all stack-top-relative
1590 numbers:
1591 11 for %st(0) (gcc regno = 8)
1592 12 for %st(1) (gcc regno = 9)
1593 13 for %st(2) (gcc regno = 10)
1594 14 for %st(3) (gcc regno = 11)
1595 15 for %st(4) (gcc regno = 12)
1596 16 for %st(5) (gcc regno = 13)
1597 17 for %st(6) (gcc regno = 14)
1598 18 for %st(7) (gcc regno = 15)
1600 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
1602 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
1603 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
1604 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
1605 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
1606 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
1607 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
1608 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
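/* Worked example of reading this map: %esi is GCC regno 4, and
   svr4_dbx_register_map[4] == 6, matching the "6 for %esi" line in the
   comment above; %st(0), GCC regno 8, maps to DWARF regno 11. */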
1611 /* Test and compare insns in i386.md store the information needed to
1612 generate branch and scc insns here. */
1614 rtx ix86_compare_op0 = NULL_RTX;
1615 rtx ix86_compare_op1 = NULL_RTX;
1616 rtx ix86_compare_emitted = NULL_RTX;
1618 /* Size of the register save area. */
1619 #define X86_64_VARARGS_SIZE (REGPARM_MAX * UNITS_PER_WORD + SSE_REGPARM_MAX * 16)
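/* Worked arithmetic, assuming the usual x86-64 values REGPARM_MAX == 6 and
   SSE_REGPARM_MAX == 8 (defined elsewhere, in i386.h): 6*8 + 8*16 = 176
   bytes, the standard x86-64 varargs register save area. */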
1621 /* Define the structure for the machine field in struct function. */
1623 struct stack_local_entry GTY(())
1625 unsigned short mode;
1628 struct stack_local_entry *next;
1631 /* Structure describing stack frame layout.
1632 Stack grows downward:
1638 saved frame pointer if frame_pointer_needed
1639 <- HARD_FRAME_POINTER
1644 [va_arg registers] (
1645 > to_allocate <- FRAME_POINTER
1655 HOST_WIDE_INT frame;
1657 int outgoing_arguments_size;
1660 HOST_WIDE_INT to_allocate;
1661 /* The offsets relative to ARG_POINTER. */
1662 HOST_WIDE_INT frame_pointer_offset;
1663 HOST_WIDE_INT hard_frame_pointer_offset;
1664 HOST_WIDE_INT stack_pointer_offset;
1666 /* When save_regs_using_mov is set, emit prologue using
1667 move instead of push instructions. */
1668 bool save_regs_using_mov;
1671 /* Code model option. */
1672 enum cmodel ix86_cmodel;
1674 enum asm_dialect ix86_asm_dialect = ASM_ATT;
1676 enum tls_dialect ix86_tls_dialect = TLS_DIALECT_GNU;
1678 /* Which unit we are generating floating point math for. */
1679 enum fpmath_unit ix86_fpmath;
1681 /* Which CPU we are scheduling for. */
1682 enum processor_type ix86_tune;
1684 /* Which instruction set architecture to use. */
1685 enum processor_type ix86_arch;
1687 /* True if the SSE prefetch instruction is not a NOP. */
1688 int x86_prefetch_sse;
1690 /* ix86_regparm_string as a number */
1691 static int ix86_regparm;
1693 /* -mstackrealign option */
1694 extern int ix86_force_align_arg_pointer;
1695 static const char ix86_force_align_arg_pointer_string[] = "force_align_arg_pointer";
1697 /* Preferred alignment for stack boundary in bits. */
1698 unsigned int ix86_preferred_stack_boundary;
1700 /* Values 1-5: see jump.c */
1701 int ix86_branch_cost;
1703 /* Variables which are this size or smaller are put in the data/bss
1704 or ldata/lbss sections. */
1706 int ix86_section_threshold = 65536;
1708 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
1709 char internal_label_prefix[16];
1710 int internal_label_prefix_len;
1712 /* Fence to use after loop using movnt. */
1715 /* Register class used for passing a given 64-bit part of the argument.
1716 These represent classes as documented by the psABI, with the exception
1717 of the SSESF and SSEDF classes, which are basically the SSE class; GCC
1718 just uses an SFmode or DFmode move instead of a DImode move to avoid
1720 reformatting penalties. Similarly we play games with INTEGERSI_CLASS
1721 to use cheaper SImode moves whenever possible (the upper half contains only padding). */
1722 enum x86_64_reg_class
1725 X86_64_INTEGER_CLASS,
1726 X86_64_INTEGERSI_CLASS,
1733 X86_64_COMPLEX_X87_CLASS,
1736 static const char * const x86_64_reg_class_name[] =
1738 "no", "integer", "integerSI", "sse", "sseSF", "sseDF",
1739 "sseup", "x87", "x87up", "cplx87", "no"
1742 #define MAX_CLASSES 4
1744 /* Table of constants used by fldpi, fldln2, etc. */
1745 static REAL_VALUE_TYPE ext_80387_constants_table [5];
1746 static bool ext_80387_constants_init = 0;
1749 static struct machine_function * ix86_init_machine_status (void);
1750 static rtx ix86_function_value (const_tree, const_tree, bool);
1751 static int ix86_function_regparm (const_tree, const_tree);
1752 static void ix86_compute_frame_layout (struct ix86_frame *);
1753 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
1757 /* The svr4 ABI for the i386 says that records and unions are returned
1758 in memory. */
1759 #ifndef DEFAULT_PCC_STRUCT_RETURN
1760 #define DEFAULT_PCC_STRUCT_RETURN 1
1763 /* Bit flags that specify the ISA we are compiling for. */
1764 int ix86_isa_flags = TARGET_64BIT_DEFAULT | TARGET_SUBTARGET_ISA_DEFAULT;
1766 /* A mask of ix86_isa_flags that includes bit X if X
1767 was set or cleared on the command line. */
1768 static int ix86_isa_flags_explicit;
1770 /* Define a set of ISAs which aren't available when a given ISA is
1771 disabled. MMX and SSE ISAs are handled separately. */
1773 #define OPTION_MASK_ISA_MMX_UNSET \
1774 (OPTION_MASK_ISA_3DNOW | OPTION_MASK_ISA_3DNOW_UNSET)
1775 #define OPTION_MASK_ISA_3DNOW_UNSET OPTION_MASK_ISA_3DNOW_A
1777 #define OPTION_MASK_ISA_SSE_UNSET \
1778 (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE2_UNSET)
1779 #define OPTION_MASK_ISA_SSE2_UNSET \
1780 (OPTION_MASK_ISA_SSE3 | OPTION_MASK_ISA_SSE3_UNSET)
1781 #define OPTION_MASK_ISA_SSE3_UNSET \
1782 (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_SSSE3_UNSET)
1783 #define OPTION_MASK_ISA_SSSE3_UNSET \
1784 (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_SSE4_1_UNSET)
1785 #define OPTION_MASK_ISA_SSE4_1_UNSET \
1786 (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_SSE4_2_UNSET)
1787 #define OPTION_MASK_ISA_SSE4_2_UNSET OPTION_MASK_ISA_SSE4A
1789 /* SSE4 includes both SSE4.1 and SSE4.2. -msse4 should be the same
1790 as -msse4.1 -msse4.2. -mno-sse4 should be the same as -mno-sse4.1. */
1791 #define OPTION_MASK_ISA_SSE4 \
1792 (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_SSE4_2)
1793 #define OPTION_MASK_ISA_SSE4_UNSET OPTION_MASK_ISA_SSE4_1_UNSET
1795 #define OPTION_MASK_ISA_SSE4A_UNSET OPTION_MASK_ISA_SSE4
1797 #define OPTION_MASK_ISA_SSE5_UNSET \
1798 (OPTION_MASK_ISA_3DNOW | OPTION_MASK_ISA_3DNOW_UNSET)
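/* Tracing the cascade above (illustrative): OPTION_MASK_ISA_SSE2_UNSET
   expands to SSE3 | SSSE3 | SSE4_1 | SSE4_2 | SSE4A, so clearing SSE2
   also clears every ISA layered on top of it, each _UNSET macro pulling
   in the next level's _UNSET mask. */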
1800 /* Vectorization library interface and handlers. */
1801 tree (*ix86_veclib_handler)(enum built_in_function, tree, tree) = NULL;
1802 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
1804 /* Implement TARGET_HANDLE_OPTION. */
1807 ix86_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, int value)
1812 ix86_isa_flags_explicit |= OPTION_MASK_ISA_MMX;
1815 ix86_isa_flags &= ~OPTION_MASK_ISA_MMX_UNSET;
1816 ix86_isa_flags_explicit |= OPTION_MASK_ISA_MMX_UNSET;
1821 ix86_isa_flags_explicit |= OPTION_MASK_ISA_3DNOW;
1824 ix86_isa_flags &= ~OPTION_MASK_ISA_3DNOW_UNSET;
1825 ix86_isa_flags_explicit |= OPTION_MASK_ISA_3DNOW_UNSET;
1833 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE;
1836 ix86_isa_flags &= ~OPTION_MASK_ISA_SSE_UNSET;
1837 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE_UNSET;
1842 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE2;
1845 ix86_isa_flags &= ~OPTION_MASK_ISA_SSE2_UNSET;
1846 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE2_UNSET;
1851 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE3;
1854 ix86_isa_flags &= ~OPTION_MASK_ISA_SSE3_UNSET;
1855 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE3_UNSET;
1860 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSSE3;
1863 ix86_isa_flags &= ~OPTION_MASK_ISA_SSSE3_UNSET;
1864 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSSE3_UNSET;
1869 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_1;
1872 ix86_isa_flags &= ~OPTION_MASK_ISA_SSE4_1_UNSET;
1873 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_1_UNSET;
1878 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_2;
1881 ix86_isa_flags &= ~OPTION_MASK_ISA_SSE4_2_UNSET;
1882 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_2_UNSET;
1887 ix86_isa_flags |= OPTION_MASK_ISA_SSE4;
1888 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4;
1892 ix86_isa_flags &= ~OPTION_MASK_ISA_SSE4_UNSET;
1893 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_UNSET;
1897 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4A;
1900 ix86_isa_flags &= ~OPTION_MASK_ISA_SSE4A_UNSET;
1901 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4A_UNSET;
1906 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE5;
1909 ix86_isa_flags &= ~OPTION_MASK_ISA_SSE5_UNSET;
1910 ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE5_UNSET;
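/* Editorial note on the pattern above: each branch mirrors its change into
   ix86_isa_flags_explicit, and the -march handling in override_options
   below consults that mask ("!(ix86_isa_flags_explicit & ...)"), so ISA
   bits the user touched on the command line are not silently re-enabled
   from the PTA_* tables. */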
1919 /* Sometimes certain combinations of command options do not make
1920 sense on a particular target machine. You can define a macro
1921 `OVERRIDE_OPTIONS' to take account of this. This macro, if
1922 defined, is executed once just after all the command options have
1923 been parsed.
1925 Don't use this macro to turn on various extra optimizations for
1926 `-O'. That is what `OPTIMIZATION_OPTIONS' is for. */
1929 override_options (void)
1932 int ix86_tune_defaulted = 0;
1933 int ix86_arch_specified = 0;
1934 unsigned int ix86_arch_mask, ix86_tune_mask;
1936 /* Comes from final.c -- no real reason to change it. */
1937 #define MAX_CODE_ALIGN 16
1941 const struct processor_costs *cost; /* Processor costs */
1942 const int align_loop; /* Default alignments. */
1943 const int align_loop_max_skip;
1944 const int align_jump;
1945 const int align_jump_max_skip;
1946 const int align_func;
1948 const processor_target_table[PROCESSOR_max] =
1950 {&i386_cost, 4, 3, 4, 3, 4},
1951 {&i486_cost, 16, 15, 16, 15, 16},
1952 {&pentium_cost, 16, 7, 16, 7, 16},
1953 {&pentiumpro_cost, 16, 15, 16, 10, 16},
1954 {&geode_cost, 0, 0, 0, 0, 0},
1955 {&k6_cost, 32, 7, 32, 7, 32},
1956 {&athlon_cost, 16, 7, 16, 7, 16},
1957 {&pentium4_cost, 0, 0, 0, 0, 0},
1958 {&k8_cost, 16, 7, 16, 7, 16},
1959 {&nocona_cost, 0, 0, 0, 0, 0},
1960 {&core2_cost, 16, 10, 16, 10, 16},
1961 {&generic32_cost, 16, 7, 16, 7, 16},
1962 {&generic64_cost, 16, 10, 16, 10, 16},
1963 {&amdfam10_cost, 32, 24, 32, 7, 32}
1966 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
1997 PTA_PREFETCH_SSE = 1 << 4,
1999 PTA_3DNOW_A = 1 << 6,
2003 PTA_POPCNT = 1 << 10,
2005 PTA_SSE4A = 1 << 12,
2006 PTA_NO_SAHF = 1 << 13,
2007 PTA_SSE4_1 = 1 << 14,
2008 PTA_SSE4_2 = 1 << 15,
2014 const char *const name; /* processor name or nickname. */
2015 const enum processor_type processor;
2016 const unsigned /*enum pta_flags*/ flags;
2018 const processor_alias_table[] =
2020 {"i386", PROCESSOR_I386, 0},
2021 {"i486", PROCESSOR_I486, 0},
2022 {"i586", PROCESSOR_PENTIUM, 0},
2023 {"pentium", PROCESSOR_PENTIUM, 0},
2024 {"pentium-mmx", PROCESSOR_PENTIUM, PTA_MMX},
2025 {"winchip-c6", PROCESSOR_I486, PTA_MMX},
2026 {"winchip2", PROCESSOR_I486, PTA_MMX | PTA_3DNOW},
2027 {"c3", PROCESSOR_I486, PTA_MMX | PTA_3DNOW},
2028 {"c3-2", PROCESSOR_PENTIUMPRO, PTA_MMX | PTA_SSE},
2029 {"i686", PROCESSOR_PENTIUMPRO, 0},
2030 {"pentiumpro", PROCESSOR_PENTIUMPRO, 0},
2031 {"pentium2", PROCESSOR_PENTIUMPRO, PTA_MMX},
2032 {"pentium3", PROCESSOR_PENTIUMPRO, PTA_MMX | PTA_SSE},
2033 {"pentium3m", PROCESSOR_PENTIUMPRO, PTA_MMX | PTA_SSE},
2034 {"pentium-m", PROCESSOR_PENTIUMPRO, PTA_MMX | PTA_SSE | PTA_SSE2},
2035 {"pentium4", PROCESSOR_PENTIUM4, PTA_MMX |PTA_SSE | PTA_SSE2},
2036 {"pentium4m", PROCESSOR_PENTIUM4, PTA_MMX | PTA_SSE | PTA_SSE2},
2037 {"prescott", PROCESSOR_NOCONA, PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3},
2038 {"nocona", PROCESSOR_NOCONA, (PTA_64BIT
2039 | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2040 | PTA_CX16 | PTA_NO_SAHF)},
2041 {"core2", PROCESSOR_CORE2, (PTA_64BIT
2042 | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2045 {"geode", PROCESSOR_GEODE, (PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2046 | PTA_PREFETCH_SSE)},
2047 {"k6", PROCESSOR_K6, PTA_MMX},
2048 {"k6-2", PROCESSOR_K6, PTA_MMX | PTA_3DNOW},
2049 {"k6-3", PROCESSOR_K6, PTA_MMX | PTA_3DNOW},
2050 {"athlon", PROCESSOR_ATHLON, (PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2051 | PTA_PREFETCH_SSE)},
2052 {"athlon-tbird", PROCESSOR_ATHLON, (PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2053 | PTA_PREFETCH_SSE)},
2054 {"athlon-4", PROCESSOR_ATHLON, (PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2056 {"athlon-xp", PROCESSOR_ATHLON, (PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2058 {"athlon-mp", PROCESSOR_ATHLON, (PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2060 {"x86-64", PROCESSOR_K8, (PTA_64BIT
2061 | PTA_MMX | PTA_SSE | PTA_SSE2
2063 {"k8", PROCESSOR_K8, (PTA_64BIT
2064 | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2065 | PTA_SSE | PTA_SSE2
2067 {"k8-sse3", PROCESSOR_K8, (PTA_64BIT
2068 | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2069 | PTA_SSE | PTA_SSE2 | PTA_SSE3
2071 {"opteron", PROCESSOR_K8, (PTA_64BIT
2072 | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2073 | PTA_SSE | PTA_SSE2
2075 {"opteron-sse3", PROCESSOR_K8, (PTA_64BIT
2076 | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2077 | PTA_SSE | PTA_SSE2 | PTA_SSE3
2079 {"athlon64", PROCESSOR_K8, (PTA_64BIT
2080 | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2081 | PTA_SSE | PTA_SSE2
2083 {"athlon64-sse3", PROCESSOR_K8, (PTA_64BIT
2084 | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2085 | PTA_SSE | PTA_SSE2 | PTA_SSE3
2087 {"athlon-fx", PROCESSOR_K8, (PTA_64BIT
2088 | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2089 | PTA_SSE | PTA_SSE2
2091 {"amdfam10", PROCESSOR_AMDFAM10, (PTA_64BIT
2092 | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2093 | PTA_SSE | PTA_SSE2 | PTA_SSE3
2095 | PTA_CX16 | PTA_ABM)},
2096 {"barcelona", PROCESSOR_AMDFAM10, (PTA_64BIT
2097 | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A
2098 | PTA_SSE | PTA_SSE2 | PTA_SSE3
2100 | PTA_CX16 | PTA_ABM)},
2101 {"generic32", PROCESSOR_GENERIC32, 0 /* flags are only used for -march switch. */ },
2102 {"generic64", PROCESSOR_GENERIC64, PTA_64BIT /* flags are only used for -march switch. */ },
2105 int const pta_size = ARRAY_SIZE (processor_alias_table);
2107 #ifdef SUBTARGET_OVERRIDE_OPTIONS
2108 SUBTARGET_OVERRIDE_OPTIONS;
2111 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
2112 SUBSUBTARGET_OVERRIDE_OPTIONS;
2115 /* -fPIC is the default for x86_64. */
2116 if (TARGET_MACHO && TARGET_64BIT)
2119 /* Set the default values for switches whose default depends on TARGET_64BIT
2120 in case they weren't overridden by command-line options. */
2123 /* Mach-O doesn't support omitting the frame pointer for now. */
2124 if (flag_omit_frame_pointer == 2)
2125 flag_omit_frame_pointer = (TARGET_MACHO ? 0 : 1);
2126 if (flag_asynchronous_unwind_tables == 2)
2127 flag_asynchronous_unwind_tables = 1;
2128 if (flag_pcc_struct_return == 2)
2129 flag_pcc_struct_return = 0;
2133 if (flag_omit_frame_pointer == 2)
2134 flag_omit_frame_pointer = 0;
2135 if (flag_asynchronous_unwind_tables == 2)
2136 flag_asynchronous_unwind_tables = 0;
2137 if (flag_pcc_struct_return == 2)
2138 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
2141 /* Need to check -mtune=generic first. */
2142 if (ix86_tune_string)
2144 if (!strcmp (ix86_tune_string, "generic")
2145 || !strcmp (ix86_tune_string, "i686")
2146 /* As special support for cross compilers we read -mtune=native
2147 as -mtune=generic. With native compilers we won't see -mtune=native,
2148 as it was changed by the driver. */
2149 || !strcmp (ix86_tune_string, "native"))
2152 ix86_tune_string = "generic64";
2154 ix86_tune_string = "generic32";
2156 else if (!strncmp (ix86_tune_string, "generic", 7))
2157 error ("bad value (%s) for -mtune= switch", ix86_tune_string);
2161 if (ix86_arch_string)
2162 ix86_tune_string = ix86_arch_string;
2163 if (!ix86_tune_string)
2165 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
2166 ix86_tune_defaulted = 1;
2169 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
2170 need to use a sensible tune option. */
2171 if (!strcmp (ix86_tune_string, "generic")
2172 || !strcmp (ix86_tune_string, "x86-64")
2173 || !strcmp (ix86_tune_string, "i686"))
2176 ix86_tune_string = "generic64";
2178 ix86_tune_string = "generic32";
2181 if (ix86_stringop_string)
2183 if (!strcmp (ix86_stringop_string, "rep_byte"))
2184 stringop_alg = rep_prefix_1_byte;
2185 else if (!strcmp (ix86_stringop_string, "libcall"))
2186 stringop_alg = libcall;
2187 else if (!strcmp (ix86_stringop_string, "rep_4byte"))
2188 stringop_alg = rep_prefix_4_byte;
2189 else if (!strcmp (ix86_stringop_string, "rep_8byte"))
2190 stringop_alg = rep_prefix_8_byte;
2191 else if (!strcmp (ix86_stringop_string, "byte_loop"))
2192 stringop_alg = loop_1_byte;
2193 else if (!strcmp (ix86_stringop_string, "loop"))
2194 stringop_alg = loop;
2195 else if (!strcmp (ix86_stringop_string, "unrolled_loop"))
2196 stringop_alg = unrolled_loop;
2198 error ("bad value (%s) for -mstringop-strategy= switch", ix86_stringop_string);
2200 if (!strcmp (ix86_tune_string, "x86-64"))
2201 warning (OPT_Wdeprecated, "-mtune=x86-64 is deprecated. Use -mtune=k8 or "
2202 "-mtune=generic instead as appropriate.");
2204 if (!ix86_arch_string)
2205 ix86_arch_string = TARGET_64BIT ? "x86-64" : "i386";
2207 ix86_arch_specified = 1;
2209 if (!strcmp (ix86_arch_string, "generic"))
2210 error ("generic CPU can be used only for -mtune= switch");
2211 if (!strncmp (ix86_arch_string, "generic", 7))
2212 error ("bad value (%s) for -march= switch", ix86_arch_string);
2214 if (ix86_cmodel_string != 0)
2216 if (!strcmp (ix86_cmodel_string, "small"))
2217 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
2218 else if (!strcmp (ix86_cmodel_string, "medium"))
2219 ix86_cmodel = flag_pic ? CM_MEDIUM_PIC : CM_MEDIUM;
2220 else if (!strcmp (ix86_cmodel_string, "large"))
2221 ix86_cmodel = flag_pic ? CM_LARGE_PIC : CM_LARGE;
2223 error ("code model %s does not support PIC mode", ix86_cmodel_string);
2224 else if (!strcmp (ix86_cmodel_string, "32"))
2225 ix86_cmodel = CM_32;
2226 else if (!strcmp (ix86_cmodel_string, "kernel") && !flag_pic)
2227 ix86_cmodel = CM_KERNEL;
2229 error ("bad value (%s) for -mcmodel= switch", ix86_cmodel_string);
2233 /* For TARGET_64BIT_MS_ABI, force pic on, in order to enable the
2234 use of rip-relative addressing. This eliminates fixups that
2235 would otherwise be needed if this object is to be placed in a
2236 DLL, and is essentially just as efficient as direct addressing. */
2237 if (TARGET_64BIT_MS_ABI)
2238 ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
2239 else if (TARGET_64BIT)
2240 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
2242 ix86_cmodel = CM_32;
2244 if (ix86_asm_string != 0)
2247 && !strcmp (ix86_asm_string, "intel"))
2248 ix86_asm_dialect = ASM_INTEL;
2249 else if (!strcmp (ix86_asm_string, "att"))
2250 ix86_asm_dialect = ASM_ATT;
2252 error ("bad value (%s) for -masm= switch", ix86_asm_string);
2254 if ((TARGET_64BIT == 0) != (ix86_cmodel == CM_32))
2255 error ("code model %qs not supported in the %s bit mode",
2256 ix86_cmodel_string, TARGET_64BIT ? "64" : "32");
2257 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
2258 sorry ("%i-bit mode not compiled in",
2259 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
2261 for (i = 0; i < pta_size; i++)
2262 if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
2264 ix86_arch = processor_alias_table[i].processor;
2265 /* Default cpu tuning to the architecture. */
2266 ix86_tune = ix86_arch;
2268 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
2269 error ("CPU you selected does not support x86-64 "
2272 if (processor_alias_table[i].flags & PTA_MMX
2273 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
2274 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
2275 if (processor_alias_table[i].flags & PTA_3DNOW
2276 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
2277 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
2278 if (processor_alias_table[i].flags & PTA_3DNOW_A
2279 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
2280 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
2281 if (processor_alias_table[i].flags & PTA_SSE
2282 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
2283 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
2284 if (processor_alias_table[i].flags & PTA_SSE2
2285 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
2286 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
2287 if (processor_alias_table[i].flags & PTA_SSE3
2288 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
2289 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
2290 if (processor_alias_table[i].flags & PTA_SSSE3
2291 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
2292 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
2293 if (processor_alias_table[i].flags & PTA_SSE4_1
2294 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
2295 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
2296 if (processor_alias_table[i].flags & PTA_SSE4_2
2297 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
2298 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
2299 if (processor_alias_table[i].flags & PTA_SSE4A
2300 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
2301 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
2302 if (processor_alias_table[i].flags & PTA_SSE5
2303 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE5))
2304 ix86_isa_flags |= OPTION_MASK_ISA_SSE5;
2306 if (processor_alias_table[i].flags & PTA_ABM)
2308 if (processor_alias_table[i].flags & PTA_CX16)
2309 x86_cmpxchg16b = true;
2310 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM))
2312 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
2313 x86_prefetch_sse = true;
2314 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF)))
2321 error ("bad value (%s) for -march= switch", ix86_arch_string);
2323 ix86_arch_mask = 1u << ix86_arch;
2324 for (i = 0; i < X86_ARCH_LAST; ++i)
2325 ix86_arch_features[i] &= ix86_arch_mask;
2327 for (i = 0; i < pta_size; i++)
2328 if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
2330 ix86_tune = processor_alias_table[i].processor;
2331 if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
2333 if (ix86_tune_defaulted)
2335 ix86_tune_string = "x86-64";
2336 for (i = 0; i < pta_size; i++)
2337 if (! strcmp (ix86_tune_string,
2338 processor_alias_table[i].name))
2340 ix86_tune = processor_alias_table[i].processor;
2343 error ("CPU you selected does not support x86-64 "
2346 /* Intel CPUs have always interpreted SSE prefetch instructions as
2347 NOPs; so, we can enable SSE prefetch instructions even when
2348 -mtune (rather than -march) points us to a processor that has them.
2349 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
2350 higher processors. */
2352 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
2353 x86_prefetch_sse = true;
2357 error ("bad value (%s) for -mtune= switch", ix86_tune_string);
2359 ix86_tune_mask = 1u << ix86_tune;
2360 for (i = 0; i < X86_TUNE_LAST; ++i)
2361 ix86_tune_features[i] &= ix86_tune_mask;
2364 ix86_cost = &size_cost;
2366 ix86_cost = processor_target_table[ix86_tune].cost;
2368 /* Arrange to set up i386_stack_locals for all functions. */
2369 init_machine_status = ix86_init_machine_status;
2371 /* Validate -mregparm= value. */
2372 if (ix86_regparm_string)
2375 warning (0, "-mregparm is ignored in 64-bit mode");
2376 i = atoi (ix86_regparm_string);
2377 if (i < 0 || i > REGPARM_MAX)
2378 error ("-mregparm=%d is not between 0 and %d", i, REGPARM_MAX);
2383 ix86_regparm = REGPARM_MAX;
2385 /* If the user has provided any of the -malign-* options,
2386 warn and use that value only if -falign-* is not set.
2387 Remove this code in GCC 3.2 or later. */
2388 if (ix86_align_loops_string)
2390 warning (0, "-malign-loops is obsolete, use -falign-loops");
2391 if (align_loops == 0)
2393 i = atoi (ix86_align_loops_string);
2394 if (i < 0 || i > MAX_CODE_ALIGN)
2395 error ("-malign-loops=%d is not between 0 and %d", i, MAX_CODE_ALIGN);
2397 align_loops = 1 << i;
2401 if (ix86_align_jumps_string)
2403 warning (0, "-malign-jumps is obsolete, use -falign-jumps");
2404 if (align_jumps == 0)
2406 i = atoi (ix86_align_jumps_string);
2407 if (i < 0 || i > MAX_CODE_ALIGN)
2408 error ("-malign-loops=%d is not between 0 and %d", i, MAX_CODE_ALIGN);
2410 align_jumps = 1 << i;
2414 if (ix86_align_funcs_string)
2416 warning (0, "-malign-functions is obsolete, use -falign-functions");
2417 if (align_functions == 0)
2419 i = atoi (ix86_align_funcs_string);
2420 if (i < 0 || i > MAX_CODE_ALIGN)
2421 error ("-malign-loops=%d is not between 0 and %d", i, MAX_CODE_ALIGN);
2423 align_functions = 1 << i;
2427 /* Default align_* from the processor table. */
2428 if (align_loops == 0)
2430 align_loops = processor_target_table[ix86_tune].align_loop;
2431 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
2433 if (align_jumps == 0)
2435 align_jumps = processor_target_table[ix86_tune].align_jump;
2436 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
2438 if (align_functions == 0)
2440 align_functions = processor_target_table[ix86_tune].align_func;
2443 /* Validate -mbranch-cost= value, or provide default. */
2444 ix86_branch_cost = ix86_cost->branch_cost;
2445 if (ix86_branch_cost_string)
2447 i = atoi (ix86_branch_cost_string);
2449 error ("-mbranch-cost=%d is not between 0 and 5", i);
2451 ix86_branch_cost = i;
2453 if (ix86_section_threshold_string)
2455 i = atoi (ix86_section_threshold_string);
2457 error ("-mlarge-data-threshold=%d is negative", i);
2459 ix86_section_threshold = i;
2462 if (ix86_tls_dialect_string)
2464 if (strcmp (ix86_tls_dialect_string, "gnu") == 0)
2465 ix86_tls_dialect = TLS_DIALECT_GNU;
2466 else if (strcmp (ix86_tls_dialect_string, "gnu2") == 0)
2467 ix86_tls_dialect = TLS_DIALECT_GNU2;
2468 else if (strcmp (ix86_tls_dialect_string, "sun") == 0)
2469 ix86_tls_dialect = TLS_DIALECT_SUN;
2471 error ("bad value (%s) for -mtls-dialect= switch",
2472 ix86_tls_dialect_string);
2475 if (ix87_precision_string)
2477 i = atoi (ix87_precision_string);
2478 if (i != 32 && i != 64 && i != 80)
2479 error ("pc%d is not valid precision setting (32, 64 or 80)", i);
2484 target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;
2486 /* Enable by default the SSE and MMX builtins. Do allow the user to
2487 explicitly disable any of these. In particular, disabling SSE and
2488 MMX for kernel code is extremely useful. */
2489 if (!ix86_arch_specified)
2491 |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
2492 | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);
2495 warning (0, "-mrtd is ignored in 64bit mode");
2499 target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;
2501 if (!ix86_arch_specified)
2503 |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;
2505 /* The i386 ABI does not specify a red zone. It still makes sense to use one
2506 when the programmer takes care to keep the stack from being destroyed. */
2507 if (!(target_flags_explicit & MASK_NO_RED_ZONE))
2508 target_flags |= MASK_NO_RED_ZONE;
2511 /* Keep nonleaf frame pointers. */
2512 if (flag_omit_frame_pointer)
2513 target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
2514 else if (TARGET_OMIT_LEAF_FRAME_POINTER)
2515 flag_omit_frame_pointer = 1;
2517 /* If we're doing fast math, we don't care about comparison order
2518 wrt NaNs. This lets us use a shorter comparison sequence. */
2519 if (flag_finite_math_only)
2520 target_flags &= ~MASK_IEEE_FP;
2522 /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
2523 since the insns won't need emulation. */
2524 if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
2525 target_flags &= ~MASK_NO_FANCY_MATH_387;
2527 /* Likewise, if the target doesn't have a 387, or we've specified
2528 software floating point, don't use 387 inline intrinsics. */
2530 target_flags |= MASK_NO_FANCY_MATH_387;
2532 /* Turn on SSE4A builtins for -msse5. */
2534 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
2536 /* Turn on SSE4.1 builtins for -msse4.2. */
2538 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
2540 /* Turn on SSSE3 builtins for -msse4.1. */
2542 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
2544 /* Turn on SSE3 builtins for -mssse3. */
2546 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
2548 /* Turn on SSE3 builtins for -msse4a. */
2550 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
2552 /* Turn on SSE2 builtins for -msse3. */
2554 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
2556 /* Turn on SSE builtins for -msse2. */
2558 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
2560 /* Turn on MMX builtins for -msse. */
2563 ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
2564 x86_prefetch_sse = true;
2567 /* Turn on MMX builtins for 3Dnow. */
2569 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
2571 /* Turn on popcnt instruction for -msse4.2 or -mabm. */
2572 if (TARGET_SSE4_2 || TARGET_ABM)
2575 /* Validate -mpreferred-stack-boundary= value, or provide default.
2576 The default of 128 bits is for Pentium III's SSE __m128. We can't
2577 change it because of optimize_size. Otherwise, we can't mix object
2578 files compiled with -Os and -On. */
2579 ix86_preferred_stack_boundary = 128;
2580 if (ix86_preferred_stack_boundary_string)
2582 i = atoi (ix86_preferred_stack_boundary_string);
2583 if (i < (TARGET_64BIT ? 4 : 2) || i > 12)
2584 error ("-mpreferred-stack-boundary=%d is not between %d and 12", i,
2585 TARGET_64BIT ? 4 : 2);
2587 ix86_preferred_stack_boundary = (1 << i) * BITS_PER_UNIT;
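/* Worked example: -mpreferred-stack-boundary=4 gives (1 << 4) * 8 == 128
   bits, i.e. the 16-byte alignment that spilled __m128 values need. */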
2590 /* Accept -msseregparm only if at least SSE support is enabled. */
2591 if (TARGET_SSEREGPARM
2593 error ("-msseregparm used without SSE enabled");
2595 ix86_fpmath = TARGET_FPMATH_DEFAULT;
2596 if (ix86_fpmath_string != 0)
2598 if (! strcmp (ix86_fpmath_string, "387"))
2599 ix86_fpmath = FPMATH_387;
2600 else if (! strcmp (ix86_fpmath_string, "sse"))
2604 warning (0, "SSE instruction set disabled, using 387 arithmetics");
2605 ix86_fpmath = FPMATH_387;
2608 ix86_fpmath = FPMATH_SSE;
2610 else if (! strcmp (ix86_fpmath_string, "387,sse")
2611 || ! strcmp (ix86_fpmath_string, "sse,387"))
2615 warning (0, "SSE instruction set disabled, using 387 arithmetics");
2616 ix86_fpmath = FPMATH_387;
2618 else if (!TARGET_80387)
2620 warning (0, "387 instruction set disabled, using SSE arithmetics");
2621 ix86_fpmath = FPMATH_SSE;
2624 ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
2627 error ("bad value (%s) for -mfpmath= switch", ix86_fpmath_string);
2630 /* If the i387 is disabled, then do not return values in it. */
2632 target_flags &= ~MASK_FLOAT_RETURNS;
2634 /* Use an external vectorized library for vectorizing intrinsics. */
2635 if (ix86_veclibabi_string)
2637 if (strcmp (ix86_veclibabi_string, "acml") == 0)
2638 ix86_veclib_handler = ix86_veclibabi_acml;
2640 error ("unknown vectorization library ABI type (%s) for "
2641 "-mveclibabi= switch", ix86_veclibabi_string);
2644 if ((x86_accumulate_outgoing_args & ix86_tune_mask)
2645 && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
2647 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
2649 /* ??? Unwind info is not correct around the CFG unless either a frame
2650 pointer is present or M_A_O_A is set. Fixing this requires rewriting
2651 unwind info generation to be aware of the CFG and propagating states
2652 around edges. */
2653 if ((flag_unwind_tables || flag_asynchronous_unwind_tables
2654 || flag_exceptions || flag_non_call_exceptions)
2655 && flag_omit_frame_pointer
2656 && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
2658 if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
2659 warning (0, "unwind tables currently require either a frame pointer "
2660 "or -maccumulate-outgoing-args for correctness");
2661 target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
2664 /* For sane SSE instruction set generation we need the fcomi instruction.
2665 It is safe to enable all CMOVE instructions. */
2669 /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
2672 ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
2673 p = strchr (internal_label_prefix, 'X');
2674 internal_label_prefix_len = p - internal_label_prefix;
2678 /* When the scheduling description is not available, disable the scheduler
2679 pass so it won't slow down compilation and make x87 code slower. */
2680 if (!TARGET_SCHEDULE)
2681 flag_schedule_insns_after_reload = flag_schedule_insns = 0;
2683 if (!PARAM_SET_P (PARAM_SIMULTANEOUS_PREFETCHES))
2684 set_param_value ("simultaneous-prefetches",
2685 ix86_cost->simultaneous_prefetches);
2686 if (!PARAM_SET_P (PARAM_L1_CACHE_LINE_SIZE))
2687 set_param_value ("l1-cache-line-size", ix86_cost->prefetch_block);
2688 if (!PARAM_SET_P (PARAM_L1_CACHE_SIZE))
2689 set_param_value ("l1-cache-size", ix86_cost->l1_cache_size);
2690 if (!PARAM_SET_P (PARAM_L2_CACHE_SIZE))
2691 set_param_value ("l2-cache-size", ix86_cost->l2_cache_size);
2693 /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
2694 can be optimized to ap = __builtin_next_arg (0). */
2695 if (!TARGET_64BIT || TARGET_64BIT_MS_ABI)
2696 targetm.expand_builtin_va_start = NULL;
2699 /* Return true if this goes in large data/bss. */
2702 ix86_in_large_data_p (tree exp)
2704 if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
2707 /* Functions are never large data. */
2708 if (TREE_CODE (exp) == FUNCTION_DECL)
2711 if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
2713 const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
2714 if (strcmp (section, ".ldata") == 0
2715 || strcmp (section, ".lbss") == 0)
2721 HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));
2723 /* If this is an incomplete type with size 0, then we can't put it
2724 in data because it might be too big when completed. */
2725 if (!size || size > ix86_section_threshold)
2732 /* Switch to the appropriate section for output of DECL.
2733 DECL is either a `VAR_DECL' node or a constant of some sort.
2734 RELOC indicates whether forming the initial value of DECL requires
2735 link-time relocations. */
2737 static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
2741 x86_64_elf_select_section (tree decl, int reloc,
2742 unsigned HOST_WIDE_INT align)
2744 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
2745 && ix86_in_large_data_p (decl))
2747 const char *sname = NULL;
2748 unsigned int flags = SECTION_WRITE;
2749 switch (categorize_decl_for_section (decl, reloc))
2754 case SECCAT_DATA_REL:
2755 sname = ".ldata.rel";
2757 case SECCAT_DATA_REL_LOCAL:
2758 sname = ".ldata.rel.local";
2760 case SECCAT_DATA_REL_RO:
2761 sname = ".ldata.rel.ro";
2763 case SECCAT_DATA_REL_RO_LOCAL:
2764 sname = ".ldata.rel.ro.local";
2768 flags |= SECTION_BSS;
2771 case SECCAT_RODATA_MERGE_STR:
2772 case SECCAT_RODATA_MERGE_STR_INIT:
2773 case SECCAT_RODATA_MERGE_CONST:
2777 case SECCAT_SRODATA:
2784 /* We don't split these for the medium model. Place them into
2785 default sections and hope for the best. */
2790 /* We might get called with string constants, but get_named_section
2791 doesn't like them as they are not DECLs. Also, we need to set
2792 flags in that case. */
2794 return get_section (sname, flags, NULL);
2795 return get_named_section (decl, sname, reloc);
2798 return default_elf_select_section (decl, reloc, align);
2801 /* Build up a unique section name, expressed as a
2802 STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
2803 RELOC indicates whether the initial value of EXP requires
2804 link-time relocations. */
2806 static void ATTRIBUTE_UNUSED
2807 x86_64_elf_unique_section (tree decl, int reloc)
2809 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
2810 && ix86_in_large_data_p (decl))
2812 const char *prefix = NULL;
2813 /* We only need to use .gnu.linkonce if we don't have COMDAT groups. */
2814 bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;
2816 switch (categorize_decl_for_section (decl, reloc))
2819 case SECCAT_DATA_REL:
2820 case SECCAT_DATA_REL_LOCAL:
2821 case SECCAT_DATA_REL_RO:
2822 case SECCAT_DATA_REL_RO_LOCAL:
2823 prefix = one_only ? ".gnu.linkonce.ld." : ".ldata.";
2826 prefix = one_only ? ".gnu.linkonce.lb." : ".lbss.";
2829 case SECCAT_RODATA_MERGE_STR:
2830 case SECCAT_RODATA_MERGE_STR_INIT:
2831 case SECCAT_RODATA_MERGE_CONST:
2832 prefix = one_only ? ".gnu.linkonce.lr." : ".lrodata.";
2834 case SECCAT_SRODATA:
2841 /* We don't split these for the medium model. Place them into
2842 default sections and hope for the best. */
2850 plen = strlen (prefix);
2852 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
2853 name = targetm.strip_name_encoding (name);
2854 nlen = strlen (name);
2856 string = (char *) alloca (nlen + plen + 1);
2857 memcpy (string, prefix, plen);
2858 memcpy (string + plen, name, nlen + 1);
2860 DECL_SECTION_NAME (decl) = build_string (nlen + plen, string);
2864 default_unique_section (decl, reloc);
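/* Sketch of the resulting names (assuming a hypothetical variable "foo"
   placed in large data): a writable "foo" lands in ".ldata.foo", or in
   ".gnu.linkonce.ld.foo" when it is one-only and COMDAT groups are
   unavailable. */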
2867 #ifdef COMMON_ASM_OP
2868 /* This says how to output assembler code to declare an
2869 uninitialized external linkage data object.
2871 For medium model x86-64 we need to use the .largecomm opcode for
2872 large objects. */
2874 x86_elf_aligned_common (FILE *file,
2875 const char *name, unsigned HOST_WIDE_INT size,
2878 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
2879 && size > (unsigned int)ix86_section_threshold)
2880 fprintf (file, ".largecomm\t");
2882 fprintf (file, "%s", COMMON_ASM_OP);
2883 assemble_name (file, name);
2884 fprintf (file, ","HOST_WIDE_INT_PRINT_UNSIGNED",%u\n",
2885 size, align / BITS_PER_UNIT);
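/* Example of the emitted directive (illustrative values): a 131072-byte
   object above the default 65536-byte ix86_section_threshold prints
   ".largecomm foo,131072,32" -- the trailing field being
   align / BITS_PER_UNIT, i.e. the alignment in bytes. Smaller objects use
   the ordinary COMMON_ASM_OP form. */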
2889 /* Utility function for targets to use in implementing
2890 ASM_OUTPUT_ALIGNED_BSS. */
2893 x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
2894 const char *name, unsigned HOST_WIDE_INT size,
2897 if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
2898 && size > (unsigned int)ix86_section_threshold)
2899 switch_to_section (get_named_section (decl, ".lbss", 0));
2901 switch_to_section (bss_section);
2902 ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
2903 #ifdef ASM_DECLARE_OBJECT_NAME
2904 last_assemble_variable_decl = decl;
2905 ASM_DECLARE_OBJECT_NAME (file, name, decl);
2907 /* Standard thing is just output label for the object. */
2908 ASM_OUTPUT_LABEL (file, name);
2909 #endif /* ASM_DECLARE_OBJECT_NAME */
2910 ASM_OUTPUT_SKIP (file, size ? size : 1);
2914 optimization_options (int level, int size ATTRIBUTE_UNUSED)
2916 /* For -O2 and beyond, turn off -fschedule-insns by default. It tends to
2917 make the problem with not enough registers even worse. */
2918 #ifdef INSN_SCHEDULING
2920 flag_schedule_insns = 0;
2924 /* The Darwin libraries never set errno, so we might as well
2925 avoid calling them when that's the only reason we would. */
2926 flag_errno_math = 0;
2928 /* The default values of these switches depend on TARGET_64BIT,
2929 which is not known at this moment. Mark these values with 2 and
2930 let the user override them. In case no command line option
2931 specifies them, we will set the defaults in override_options. */
2933 flag_omit_frame_pointer = 2;
2934 flag_pcc_struct_return = 2;
2935 flag_asynchronous_unwind_tables = 2;
2936 flag_vect_cost_model = 1;
2937 #ifdef SUBTARGET_OPTIMIZATION_OPTIONS
2938 SUBTARGET_OPTIMIZATION_OPTIONS;
2942 /* Decide whether we can make a sibling call to a function. DECL is the
2943 declaration of the function being targeted by the call and EXP is the
2944 CALL_EXPR representing the call. */
2947 ix86_function_ok_for_sibcall (tree decl, tree exp)
2952 /* If we are generating position-independent code, we cannot sibcall
2953 optimize any indirect call, or a direct call to a global function,
2954 as the PLT requires %ebx be live. */
2955 if (!TARGET_64BIT && flag_pic && (!decl || !targetm.binds_local_p (decl)))
2962 func = TREE_TYPE (CALL_EXPR_FN (exp));
2963 if (POINTER_TYPE_P (func))
2964 func = TREE_TYPE (func);
2967 /* Check that the return value locations are the same. For instance,
2968 if we are returning floats on the 80387 register stack, we cannot
2969 make a sibcall from a function that doesn't return a float to a
2970 function that does or, conversely, from a function that does return
2971 a float to a function that doesn't; the necessary stack adjustment
2972 would not be executed. This is also the place we notice
2973 differences in the return value ABI. Note that it is ok for one
2974 of the functions to have void return type as long as the return
2975 value of the other is passed in a register. */
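/* Concrete instance of the rule above: with 387 math, a callee returning
   float leaves its result on the 80387 stack (STACK_REG_P), so an
   int-returning caller cannot sibcall it -- the necessary FP-stack pop
   would never be executed. */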
2976 a = ix86_function_value (TREE_TYPE (exp), func, false);
2977 b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
2979 if (STACK_REG_P (a) || STACK_REG_P (b))
2981 if (!rtx_equal_p (a, b))
2984 else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
2986 else if (!rtx_equal_p (a, b))
2989 /* If this call is indirect, we'll need to be able to use a call-clobbered
2990 register for the address of the target function. Make sure that all
2991 such registers are not used for passing parameters. */
2992 if (!decl && !TARGET_64BIT)
2996 /* We're looking at the CALL_EXPR, we need the type of the function. */
2997 type = CALL_EXPR_FN (exp); /* pointer expression */
2998 type = TREE_TYPE (type); /* pointer type */
2999 type = TREE_TYPE (type); /* function type */
3001 if (ix86_function_regparm (type, NULL) >= 3)
3003 /* ??? Need to count the actual number of registers to be used,
3004 not the possible number of registers. Fix later. */
3009 /* Dllimport'd functions are also called indirectly. */
3010 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
3011 && decl && DECL_DLLIMPORT_P (decl)
3012 && ix86_function_regparm (TREE_TYPE (decl), NULL) >= 3)
3015 /* If we force-aligned the stack, then sibcalling would unalign the
3016 stack, which may break the called function. */
3017 if (cfun->machine->force_align_arg_pointer)
3020 /* Otherwise okay. That also includes certain types of indirect calls. */
3024 /* Handle "cdecl", "stdcall", "fastcall", "regparm" and "sseregparm"
3025 calling convention attributes;
3026 arguments as in struct attribute_spec.handler. */
3029 ix86_handle_cconv_attribute (tree *node, tree name,
3031 int flags ATTRIBUTE_UNUSED,
3034 if (TREE_CODE (*node) != FUNCTION_TYPE
3035 && TREE_CODE (*node) != METHOD_TYPE
3036 && TREE_CODE (*node) != FIELD_DECL
3037 && TREE_CODE (*node) != TYPE_DECL)
3039 warning (OPT_Wattributes, "%qs attribute only applies to functions",
3040 IDENTIFIER_POINTER (name));
3041 *no_add_attrs = true;
3045 /* Can combine regparm with all attributes but fastcall. */
3046 if (is_attribute_p ("regparm", name))
3050 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
3052 error ("fastcall and regparm attributes are not compatible");
3055 cst = TREE_VALUE (args);
3056 if (TREE_CODE (cst) != INTEGER_CST)
3058 warning (OPT_Wattributes,
3059 "%qs attribute requires an integer constant argument",
3060 IDENTIFIER_POINTER (name));
3061 *no_add_attrs = true;
3063 else if (compare_tree_int (cst, REGPARM_MAX) > 0)
3065 warning (OPT_Wattributes, "argument to %qs attribute larger than %d",
3066 IDENTIFIER_POINTER (name), REGPARM_MAX);
3067 *no_add_attrs = true;
3071 && lookup_attribute (ix86_force_align_arg_pointer_string,
3072 TYPE_ATTRIBUTES (*node))
3073 && compare_tree_int (cst, REGPARM_MAX-1))
3075 error ("%s functions limited to %d register parameters",
3076 ix86_force_align_arg_pointer_string, REGPARM_MAX-1);
3084 /* Do not warn when emulating the MS ABI. */
3085 if (!TARGET_64BIT_MS_ABI)
3086 warning (OPT_Wattributes, "%qs attribute ignored",
3087 IDENTIFIER_POINTER (name));
3088 *no_add_attrs = true;
3092 /* Can combine fastcall with stdcall (redundant) and sseregparm. */
3093 if (is_attribute_p ("fastcall", name))
3095 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
3097 error ("fastcall and cdecl attributes are not compatible");
3099 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
3101 error ("fastcall and stdcall attributes are not compatible");
3103 if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
3105 error ("fastcall and regparm attributes are not compatible");
3109 /* Can combine stdcall with fastcall (redundant), regparm and
3110 sseregparm. */
3111 else if (is_attribute_p ("stdcall", name))
3113 if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
3115 error ("stdcall and cdecl attributes are not compatible");
3117 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
3119 error ("stdcall and fastcall attributes are not compatible");
3123 /* Can combine cdecl with regparm and sseregparm. */
3124 else if (is_attribute_p ("cdecl", name))
3126 if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
3128 error ("stdcall and cdecl attributes are not compatible");
3130 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
3132 error ("fastcall and cdecl attributes are not compatible");
3136 /* Can combine sseregparm with all attributes. */
3141 /* Return 0 if the attributes for two types are incompatible, 1 if they
3142 are compatible, and 2 if they are nearly compatible (which causes a
3143 warning to be generated). */
3146 ix86_comp_type_attributes (const_tree type1, const_tree type2)
3148 /* Check for mismatch of non-default calling convention. */
3149 const char *const rtdstr = TARGET_RTD ? "cdecl" : "stdcall";
3151 if (TREE_CODE (type1) != FUNCTION_TYPE
3152 && TREE_CODE (type1) != METHOD_TYPE)
3155 /* Check for mismatched fastcall/regparm types. */
3156 if ((!lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type1))
3157 != !lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type2)))
3158 || (ix86_function_regparm (type1, NULL)
3159 != ix86_function_regparm (type2, NULL)))
3162 /* Check for mismatched sseregparm types. */
3163 if (!lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type1))
3164 != !lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type2)))
3167 /* Check for mismatched return types (cdecl vs stdcall). */
3168 if (!lookup_attribute (rtdstr, TYPE_ATTRIBUTES (type1))
3169 != !lookup_attribute (rtdstr, TYPE_ATTRIBUTES (type2)))
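/* So, for example, a stdcall type and a cdecl type compare as incompatible
   (0), while two cdecl types with equal regparm counts compare as
   compatible (1). */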
3175 /* Return the regparm value for a function with the indicated TYPE and DECL.
3176 DECL may be NULL when calling function indirectly
3177 or considering a libcall. */
3180 ix86_function_regparm (const_tree type, const_tree decl)
3183 int regparm = ix86_regparm;
3188 attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
3190 return TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
3192 if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (type)))
3195 /* Use register calling convention for local functions when possible. */
3196 if (decl && TREE_CODE (decl) == FUNCTION_DECL
3197 && flag_unit_at_a_time && !profile_flag)
3199 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
3200 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
3203 int local_regparm, globals = 0, regno;
3206 /* Make sure no regparm register is taken by a
3207 fixed register variable. */
3208 for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
3209 if (fixed_regs[local_regparm])
3212 /* We can't use regparm(3) for nested functions as these use the
3213 static chain pointer in the third argument. */
3214 if (local_regparm == 3
3215 && (decl_function_context (decl)
3216 || ix86_force_align_arg_pointer)
3217 && !DECL_NO_STATIC_CHAIN (decl))
3220 /* If the function realigns its stack pointer, the prologue will
3221 clobber %ecx. If we've already generated code for the callee,
3222 the callee DECL_STRUCT_FUNCTION is gone, so we fall back to
3223 scanning the attributes for the self-realigning property. */
3224 f = DECL_STRUCT_FUNCTION (decl);
3225 if (local_regparm == 3
3226 && (f ? !!f->machine->force_align_arg_pointer
3227 : !!lookup_attribute (ix86_force_align_arg_pointer_string,
3228 TYPE_ATTRIBUTES (TREE_TYPE (decl)))))
3231 /* Each fixed register usage increases register pressure,
3232 so fewer registers should be used for argument passing.
3233 This functionality can be overridden by an explicit
3234 regparm value. */
3235 for (regno = 0; regno <= DI_REG; regno++)
3236 if (fixed_regs[regno])
3240 = globals < local_regparm ? local_regparm - globals : 0;
3242 if (local_regparm > regparm)
3243 regparm = local_regparm;
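/* Net effect, sketched: a local function known to cgraph, with no fixed
   registers in the way, gets local_regparm == 3, so up to three integer
   arguments are passed in %eax, %edx and %ecx (the IA-32 regparm
   registers -- stated here as background, not taken from this file)
   instead of on the stack. */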
3250 /* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
3251 DFmode (2) arguments in SSE registers for a function with the
3252 indicated TYPE and DECL. DECL may be NULL when calling function
3253 indirectly or considering a libcall. Otherwise return 0. */
3256 ix86_function_sseregparm (const_tree type, const_tree decl)
3258 gcc_assert (!TARGET_64BIT);
3260 /* Use SSE registers to pass SFmode and DFmode arguments if requested
3261 by the sseregparm attribute. */
3262 if (TARGET_SSEREGPARM
3263 || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
3268 error ("Calling %qD with attribute sseregparm without "
3269 "SSE/SSE2 enabled", decl);
3271 error ("Calling %qT with attribute sseregparm without "
3272 "SSE/SSE2 enabled", type);
3279 /* For local functions, pass up to SSE_REGPARM_MAX SFmode
3280 (and DFmode for SSE2) arguments in SSE registers. */
3281 if (decl && TARGET_SSE_MATH && flag_unit_at_a_time && !profile_flag)
3283 /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified. */
3284 struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE(decl));
3286 return TARGET_SSE2 ? 2 : 1;
3292 /* Return true if EAX is live at the start of the function. Used by
3293 ix86_expand_prologue to determine if we need special help before
3294 calling allocate_stack_worker. */
3297 ix86_eax_live_at_start_p (void)
3299 /* Cheat. Don't bother working forward from ix86_function_regparm
3300 to the function type to whether an actual argument is located in
3301 eax. Instead just look at cfg info, which is still close enough
3302 to correct at this point. This gives false positives for broken
3303 functions that might use uninitialized data that happens to be