1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
3 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013
4 Free Software Foundation, Inc.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
13 GCC is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
30 #include "hard-reg-set.h"
31 #include "insn-config.h"
32 #include "conditions.h"
34 #include "insn-codes.h"
35 #include "insn-attr.h"
42 #include "diagnostic-core.h"
44 #include "basic-block.h"
47 #include "target-def.h"
48 #include "common/common-target.h"
49 #include "langhooks.h"
55 #include "tm-constrs.h"
59 #include "sched-int.h"
63 #include "diagnostic.h"
65 enum upper_128bits_state
72 typedef struct block_info_def
74 /* State of the upper 128bits of AVX registers at exit. */
75 enum upper_128bits_state state;
76 /* TRUE if state of the upper 128bits of AVX registers is unchanged
79 /* TRUE if block has been processed. */
81 /* TRUE if block has been scanned. */
83 /* Previous state of the upper 128bits of AVX registers at entry. */
84 enum upper_128bits_state prev;
87 #define BLOCK_INFO(B) ((block_info) (B)->aux)
89 enum call_avx256_state
91 /* Callee returns 256bit AVX register. */
92 callee_return_avx256 = -1,
93 /* Callee returns and passes 256bit AVX register. */
94 callee_return_pass_avx256,
95 /* Callee passes 256bit AVX register. */
97 /* Callee doesn't return nor passe 256bit AVX register, or no
98 256bit AVX register in function return. */
100 /* vzeroupper intrinsic. */
104 /* Check if a 256bit AVX register is referenced in stores. */
107 check_avx256_stores (rtx dest, const_rtx set, void *data)
110 && VALID_AVX256_REG_MODE (GET_MODE (dest)))
111 || (GET_CODE (set) == SET
112 && REG_P (SET_SRC (set))
113 && VALID_AVX256_REG_MODE (GET_MODE (SET_SRC (set)))))
115 enum upper_128bits_state *state
116 = (enum upper_128bits_state *) data;
121 /* Helper function for move_or_delete_vzeroupper_1. Look for vzeroupper
122 in basic block BB. Delete it if upper 128bit AVX registers are
123 unused. If it isn't deleted, move it to just before a jump insn.
125 STATE is state of the upper 128bits of AVX registers at entry. */
128 move_or_delete_vzeroupper_2 (basic_block bb,
129 enum upper_128bits_state state)
132 rtx vzeroupper_insn = NULL_RTX;
137 if (BLOCK_INFO (bb)->unchanged)
140 fprintf (dump_file, " [bb %i] unchanged: upper 128bits: %d\n",
143 BLOCK_INFO (bb)->state = state;
147 if (BLOCK_INFO (bb)->scanned && BLOCK_INFO (bb)->prev == state)
150 fprintf (dump_file, " [bb %i] scanned: upper 128bits: %d\n",
151 bb->index, BLOCK_INFO (bb)->state);
155 BLOCK_INFO (bb)->prev = state;
158 fprintf (dump_file, " [bb %i] entry: upper 128bits: %d\n",
163 /* BB_END changes when it is deleted. */
164 bb_end = BB_END (bb);
166 while (insn != bb_end)
168 insn = NEXT_INSN (insn);
170 if (!NONDEBUG_INSN_P (insn))
173 /* Move vzeroupper before jump/call. */
174 if (JUMP_P (insn) || CALL_P (insn))
176 if (!vzeroupper_insn)
179 if (PREV_INSN (insn) != vzeroupper_insn)
183 fprintf (dump_file, "Move vzeroupper after:\n");
184 print_rtl_single (dump_file, PREV_INSN (insn));
185 fprintf (dump_file, "before:\n");
186 print_rtl_single (dump_file, insn);
188 reorder_insns_nobb (vzeroupper_insn, vzeroupper_insn,
191 vzeroupper_insn = NULL_RTX;
195 pat = PATTERN (insn);
197 /* Check insn for vzeroupper intrinsic. */
198 if (GET_CODE (pat) == UNSPEC_VOLATILE
199 && XINT (pat, 1) == UNSPECV_VZEROUPPER)
203 /* Found vzeroupper intrinsic. */
204 fprintf (dump_file, "Found vzeroupper:\n");
205 print_rtl_single (dump_file, insn);
210 /* Check insn for vzeroall intrinsic. */
211 if (GET_CODE (pat) == PARALLEL
212 && GET_CODE (XVECEXP (pat, 0, 0)) == UNSPEC_VOLATILE
213 && XINT (XVECEXP (pat, 0, 0), 1) == UNSPECV_VZEROALL)
218 /* Delete pending vzeroupper insertion. */
221 delete_insn (vzeroupper_insn);
222 vzeroupper_insn = NULL_RTX;
225 else if (state != used)
227 note_stores (pat, check_avx256_stores, &state);
234 /* Process vzeroupper intrinsic. */
235 avx256 = INTVAL (XVECEXP (pat, 0, 0));
239 /* Since the upper 128bits are cleared, callee must not pass
240 256bit AVX register. We only need to check if callee
241 returns 256bit AVX register. */
242 if (avx256 == callee_return_avx256)
248 /* Remove unnecessary vzeroupper since upper 128bits are
252 fprintf (dump_file, "Delete redundant vzeroupper:\n");
253 print_rtl_single (dump_file, insn);
259 /* Set state to UNUSED if callee doesn't return 256bit AVX
261 if (avx256 != callee_return_pass_avx256)
264 if (avx256 == callee_return_pass_avx256
265 || avx256 == callee_pass_avx256)
267 /* Must remove vzeroupper since callee passes in 256bit
271 fprintf (dump_file, "Delete callee pass vzeroupper:\n");
272 print_rtl_single (dump_file, insn);
278 vzeroupper_insn = insn;
284 BLOCK_INFO (bb)->state = state;
285 BLOCK_INFO (bb)->unchanged = unchanged;
286 BLOCK_INFO (bb)->scanned = true;
289 fprintf (dump_file, " [bb %i] exit: %s: upper 128bits: %d\n",
290 bb->index, unchanged ? "unchanged" : "changed",
294 /* Helper function for move_or_delete_vzeroupper. Process vzeroupper
295 in BLOCK and check its predecessor blocks. Treat UNKNOWN state
296 as USED if UNKNOWN_IS_UNUSED is true. Return TRUE if the exit
300 move_or_delete_vzeroupper_1 (basic_block block, bool unknown_is_unused)
304 enum upper_128bits_state state, old_state, new_state;
308 fprintf (dump_file, " Process [bb %i]: status: %d\n",
309 block->index, BLOCK_INFO (block)->processed);
311 if (BLOCK_INFO (block)->processed)
316 /* Check all predecessor edges of this block. */
317 seen_unknown = false;
318 FOR_EACH_EDGE (e, ei, block->preds)
322 switch (BLOCK_INFO (e->src)->state)
325 if (!unknown_is_unused)
339 old_state = BLOCK_INFO (block)->state;
340 move_or_delete_vzeroupper_2 (block, state);
341 new_state = BLOCK_INFO (block)->state;
343 if (state != unknown || new_state == used)
344 BLOCK_INFO (block)->processed = true;
346 /* Need to rescan if the upper 128bits of AVX registers are changed
348 if (new_state != old_state)
350 if (new_state == used)
351 cfun->machine->rescan_vzeroupper_p = 1;
358 /* Go through the instruction stream looking for vzeroupper. Delete
359 it if upper 128bit AVX registers are unused. If it isn't deleted,
360 move it to just before a jump insn. */
363 move_or_delete_vzeroupper (void)
368 fibheap_t worklist, pending, fibheap_swap;
369 sbitmap visited, in_worklist, in_pending, sbitmap_swap;
374 /* Set up block info for each basic block. */
375 alloc_aux_for_blocks (sizeof (struct block_info_def));
377 /* Process outgoing edges of entry point. */
379 fprintf (dump_file, "Process outgoing edges of entry point\n");
381 FOR_EACH_EDGE (e, ei, ENTRY_BLOCK_PTR->succs)
383 move_or_delete_vzeroupper_2 (e->dest,
384 cfun->machine->caller_pass_avx256_p
386 BLOCK_INFO (e->dest)->processed = true;
389 /* Compute reverse completion order of depth first search of the CFG
390 so that the data-flow runs faster. */
391 rc_order = XNEWVEC (int, n_basic_blocks - NUM_FIXED_BLOCKS);
392 bb_order = XNEWVEC (int, last_basic_block);
393 pre_and_rev_post_order_compute (NULL, rc_order, false);
394 for (i = 0; i < n_basic_blocks - NUM_FIXED_BLOCKS; i++)
395 bb_order[rc_order[i]] = i;
398 worklist = fibheap_new ();
399 pending = fibheap_new ();
400 visited = sbitmap_alloc (last_basic_block);
401 in_worklist = sbitmap_alloc (last_basic_block);
402 in_pending = sbitmap_alloc (last_basic_block);
403 sbitmap_zero (in_worklist);
405 /* Don't check outgoing edges of entry point. */
406 sbitmap_ones (in_pending);
408 if (BLOCK_INFO (bb)->processed)
409 RESET_BIT (in_pending, bb->index);
412 move_or_delete_vzeroupper_1 (bb, false);
413 fibheap_insert (pending, bb_order[bb->index], bb);
417 fprintf (dump_file, "Check remaining basic blocks\n");
419 while (!fibheap_empty (pending))
421 fibheap_swap = pending;
423 worklist = fibheap_swap;
424 sbitmap_swap = in_pending;
425 in_pending = in_worklist;
426 in_worklist = sbitmap_swap;
428 sbitmap_zero (visited);
430 cfun->machine->rescan_vzeroupper_p = 0;
432 while (!fibheap_empty (worklist))
434 bb = (basic_block) fibheap_extract_min (worklist);
435 RESET_BIT (in_worklist, bb->index);
436 gcc_assert (!TEST_BIT (visited, bb->index));
437 if (!TEST_BIT (visited, bb->index))
441 SET_BIT (visited, bb->index);
443 if (move_or_delete_vzeroupper_1 (bb, false))
444 FOR_EACH_EDGE (e, ei, bb->succs)
446 if (e->dest == EXIT_BLOCK_PTR
447 || BLOCK_INFO (e->dest)->processed)
450 if (TEST_BIT (visited, e->dest->index))
452 if (!TEST_BIT (in_pending, e->dest->index))
454 /* Send E->DEST to next round. */
455 SET_BIT (in_pending, e->dest->index);
456 fibheap_insert (pending,
457 bb_order[e->dest->index],
461 else if (!TEST_BIT (in_worklist, e->dest->index))
463 /* Add E->DEST to current round. */
464 SET_BIT (in_worklist, e->dest->index);
465 fibheap_insert (worklist, bb_order[e->dest->index],
472 if (!cfun->machine->rescan_vzeroupper_p)
477 fibheap_delete (worklist);
478 fibheap_delete (pending);
479 sbitmap_free (visited);
480 sbitmap_free (in_worklist);
481 sbitmap_free (in_pending);
484 fprintf (dump_file, "Process remaining basic blocks\n");
487 move_or_delete_vzeroupper_1 (bb, true);
489 free_aux_for_blocks ();
492 static rtx legitimize_dllimport_symbol (rtx, bool);
494 #ifndef CHECK_STACK_LIMIT
495 #define CHECK_STACK_LIMIT (-1)
498 /* Return index of given mode in mult and division cost tables. */
499 #define MODE_INDEX(mode) \
500 ((mode) == QImode ? 0 \
501 : (mode) == HImode ? 1 \
502 : (mode) == SImode ? 2 \
503 : (mode) == DImode ? 3 \
506 /* Processor costs (relative to an add) */
507 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
508 #define COSTS_N_BYTES(N) ((N) * 2)
510 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
513 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
514 COSTS_N_BYTES (2), /* cost of an add instruction */
515 COSTS_N_BYTES (3), /* cost of a lea instruction */
516 COSTS_N_BYTES (2), /* variable shift costs */
517 COSTS_N_BYTES (3), /* constant shift costs */
518 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
519 COSTS_N_BYTES (3), /* HI */
520 COSTS_N_BYTES (3), /* SI */
521 COSTS_N_BYTES (3), /* DI */
522 COSTS_N_BYTES (5)}, /* other */
523 0, /* cost of multiply per each bit set */
524 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
525 COSTS_N_BYTES (3), /* HI */
526 COSTS_N_BYTES (3), /* SI */
527 COSTS_N_BYTES (3), /* DI */
528 COSTS_N_BYTES (5)}, /* other */
529 COSTS_N_BYTES (3), /* cost of movsx */
530 COSTS_N_BYTES (3), /* cost of movzx */
531 0, /* "large" insn */
533 2, /* cost for loading QImode using movzbl */
534 {2, 2, 2}, /* cost of loading integer registers
535 in QImode, HImode and SImode.
536 Relative to reg-reg move (2). */
537 {2, 2, 2}, /* cost of storing integer registers */
538 2, /* cost of reg,reg fld/fst */
539 {2, 2, 2}, /* cost of loading fp registers
540 in SFmode, DFmode and XFmode */
541 {2, 2, 2}, /* cost of storing fp registers
542 in SFmode, DFmode and XFmode */
543 3, /* cost of moving MMX register */
544 {3, 3}, /* cost of loading MMX registers
545 in SImode and DImode */
546 {3, 3}, /* cost of storing MMX registers
547 in SImode and DImode */
548 3, /* cost of moving SSE register */
549 {3, 3, 3}, /* cost of loading SSE registers
550 in SImode, DImode and TImode */
551 {3, 3, 3}, /* cost of storing SSE registers
552 in SImode, DImode and TImode */
553 3, /* MMX or SSE register to integer */
554 0, /* size of l1 cache */
555 0, /* size of l2 cache */
556 0, /* size of prefetch block */
557 0, /* number of parallel prefetches */
559 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
560 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
561 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
562 COSTS_N_BYTES (2), /* cost of FABS instruction. */
563 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
564 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
565 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
566 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
567 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
568 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
569 1, /* scalar_stmt_cost. */
570 1, /* scalar load_cost. */
571 1, /* scalar_store_cost. */
572 1, /* vec_stmt_cost. */
573 1, /* vec_to_scalar_cost. */
574 1, /* scalar_to_vec_cost. */
575 1, /* vec_align_load_cost. */
576 1, /* vec_unalign_load_cost. */
577 1, /* vec_store_cost. */
578 1, /* cond_taken_branch_cost. */
579 1, /* cond_not_taken_branch_cost. */
582 /* Processor costs (relative to an add) */
584 struct processor_costs i386_cost = { /* 386 specific costs */
585 COSTS_N_INSNS (1), /* cost of an add instruction */
586 COSTS_N_INSNS (1), /* cost of a lea instruction */
587 COSTS_N_INSNS (3), /* variable shift costs */
588 COSTS_N_INSNS (2), /* constant shift costs */
589 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
590 COSTS_N_INSNS (6), /* HI */
591 COSTS_N_INSNS (6), /* SI */
592 COSTS_N_INSNS (6), /* DI */
593 COSTS_N_INSNS (6)}, /* other */
594 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
595 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
596 COSTS_N_INSNS (23), /* HI */
597 COSTS_N_INSNS (23), /* SI */
598 COSTS_N_INSNS (23), /* DI */
599 COSTS_N_INSNS (23)}, /* other */
600 COSTS_N_INSNS (3), /* cost of movsx */
601 COSTS_N_INSNS (2), /* cost of movzx */
602 15, /* "large" insn */
604 4, /* cost for loading QImode using movzbl */
605 {2, 4, 2}, /* cost of loading integer registers
606 in QImode, HImode and SImode.
607 Relative to reg-reg move (2). */
608 {2, 4, 2}, /* cost of storing integer registers */
609 2, /* cost of reg,reg fld/fst */
610 {8, 8, 8}, /* cost of loading fp registers
611 in SFmode, DFmode and XFmode */
612 {8, 8, 8}, /* cost of storing fp registers
613 in SFmode, DFmode and XFmode */
614 2, /* cost of moving MMX register */
615 {4, 8}, /* cost of loading MMX registers
616 in SImode and DImode */
617 {4, 8}, /* cost of storing MMX registers
618 in SImode and DImode */
619 2, /* cost of moving SSE register */
620 {4, 8, 16}, /* cost of loading SSE registers
621 in SImode, DImode and TImode */
622 {4, 8, 16}, /* cost of storing SSE registers
623 in SImode, DImode and TImode */
624 3, /* MMX or SSE register to integer */
625 0, /* size of l1 cache */
626 0, /* size of l2 cache */
627 0, /* size of prefetch block */
628 0, /* number of parallel prefetches */
630 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
631 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
632 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
633 COSTS_N_INSNS (22), /* cost of FABS instruction. */
634 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
635 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
636 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
637 DUMMY_STRINGOP_ALGS},
638 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
639 DUMMY_STRINGOP_ALGS},
640 1, /* scalar_stmt_cost. */
641 1, /* scalar load_cost. */
642 1, /* scalar_store_cost. */
643 1, /* vec_stmt_cost. */
644 1, /* vec_to_scalar_cost. */
645 1, /* scalar_to_vec_cost. */
646 1, /* vec_align_load_cost. */
647 2, /* vec_unalign_load_cost. */
648 1, /* vec_store_cost. */
649 3, /* cond_taken_branch_cost. */
650 1, /* cond_not_taken_branch_cost. */
654 struct processor_costs i486_cost = { /* 486 specific costs */
655 COSTS_N_INSNS (1), /* cost of an add instruction */
656 COSTS_N_INSNS (1), /* cost of a lea instruction */
657 COSTS_N_INSNS (3), /* variable shift costs */
658 COSTS_N_INSNS (2), /* constant shift costs */
659 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
660 COSTS_N_INSNS (12), /* HI */
661 COSTS_N_INSNS (12), /* SI */
662 COSTS_N_INSNS (12), /* DI */
663 COSTS_N_INSNS (12)}, /* other */
664 1, /* cost of multiply per each bit set */
665 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
666 COSTS_N_INSNS (40), /* HI */
667 COSTS_N_INSNS (40), /* SI */
668 COSTS_N_INSNS (40), /* DI */
669 COSTS_N_INSNS (40)}, /* other */
670 COSTS_N_INSNS (3), /* cost of movsx */
671 COSTS_N_INSNS (2), /* cost of movzx */
672 15, /* "large" insn */
674 4, /* cost for loading QImode using movzbl */
675 {2, 4, 2}, /* cost of loading integer registers
676 in QImode, HImode and SImode.
677 Relative to reg-reg move (2). */
678 {2, 4, 2}, /* cost of storing integer registers */
679 2, /* cost of reg,reg fld/fst */
680 {8, 8, 8}, /* cost of loading fp registers
681 in SFmode, DFmode and XFmode */
682 {8, 8, 8}, /* cost of storing fp registers
683 in SFmode, DFmode and XFmode */
684 2, /* cost of moving MMX register */
685 {4, 8}, /* cost of loading MMX registers
686 in SImode and DImode */
687 {4, 8}, /* cost of storing MMX registers
688 in SImode and DImode */
689 2, /* cost of moving SSE register */
690 {4, 8, 16}, /* cost of loading SSE registers
691 in SImode, DImode and TImode */
692 {4, 8, 16}, /* cost of storing SSE registers
693 in SImode, DImode and TImode */
694 3, /* MMX or SSE register to integer */
695 4, /* size of l1 cache. 486 has 8kB cache
696 shared for code and data, so 4kB is
697 not really precise. */
698 4, /* size of l2 cache */
699 0, /* size of prefetch block */
700 0, /* number of parallel prefetches */
702 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
703 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
704 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
705 COSTS_N_INSNS (3), /* cost of FABS instruction. */
706 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
707 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
708 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
709 DUMMY_STRINGOP_ALGS},
710 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
711 DUMMY_STRINGOP_ALGS},
712 1, /* scalar_stmt_cost. */
713 1, /* scalar load_cost. */
714 1, /* scalar_store_cost. */
715 1, /* vec_stmt_cost. */
716 1, /* vec_to_scalar_cost. */
717 1, /* scalar_to_vec_cost. */
718 1, /* vec_align_load_cost. */
719 2, /* vec_unalign_load_cost. */
720 1, /* vec_store_cost. */
721 3, /* cond_taken_branch_cost. */
722 1, /* cond_not_taken_branch_cost. */
726 struct processor_costs pentium_cost = {
727 COSTS_N_INSNS (1), /* cost of an add instruction */
728 COSTS_N_INSNS (1), /* cost of a lea instruction */
729 COSTS_N_INSNS (4), /* variable shift costs */
730 COSTS_N_INSNS (1), /* constant shift costs */
731 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
732 COSTS_N_INSNS (11), /* HI */
733 COSTS_N_INSNS (11), /* SI */
734 COSTS_N_INSNS (11), /* DI */
735 COSTS_N_INSNS (11)}, /* other */
736 0, /* cost of multiply per each bit set */
737 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
738 COSTS_N_INSNS (25), /* HI */
739 COSTS_N_INSNS (25), /* SI */
740 COSTS_N_INSNS (25), /* DI */
741 COSTS_N_INSNS (25)}, /* other */
742 COSTS_N_INSNS (3), /* cost of movsx */
743 COSTS_N_INSNS (2), /* cost of movzx */
744 8, /* "large" insn */
746 6, /* cost for loading QImode using movzbl */
747 {2, 4, 2}, /* cost of loading integer registers
748 in QImode, HImode and SImode.
749 Relative to reg-reg move (2). */
750 {2, 4, 2}, /* cost of storing integer registers */
751 2, /* cost of reg,reg fld/fst */
752 {2, 2, 6}, /* cost of loading fp registers
753 in SFmode, DFmode and XFmode */
754 {4, 4, 6}, /* cost of storing fp registers
755 in SFmode, DFmode and XFmode */
756 8, /* cost of moving MMX register */
757 {8, 8}, /* cost of loading MMX registers
758 in SImode and DImode */
759 {8, 8}, /* cost of storing MMX registers
760 in SImode and DImode */
761 2, /* cost of moving SSE register */
762 {4, 8, 16}, /* cost of loading SSE registers
763 in SImode, DImode and TImode */
764 {4, 8, 16}, /* cost of storing SSE registers
765 in SImode, DImode and TImode */
766 3, /* MMX or SSE register to integer */
767 8, /* size of l1 cache. */
768 8, /* size of l2 cache */
769 0, /* size of prefetch block */
770 0, /* number of parallel prefetches */
772 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
773 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
774 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
775 COSTS_N_INSNS (1), /* cost of FABS instruction. */
776 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
777 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
778 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
779 DUMMY_STRINGOP_ALGS},
780 {{libcall, {{-1, rep_prefix_4_byte}}},
781 DUMMY_STRINGOP_ALGS},
782 1, /* scalar_stmt_cost. */
783 1, /* scalar load_cost. */
784 1, /* scalar_store_cost. */
785 1, /* vec_stmt_cost. */
786 1, /* vec_to_scalar_cost. */
787 1, /* scalar_to_vec_cost. */
788 1, /* vec_align_load_cost. */
789 2, /* vec_unalign_load_cost. */
790 1, /* vec_store_cost. */
791 3, /* cond_taken_branch_cost. */
792 1, /* cond_not_taken_branch_cost. */
796 struct processor_costs pentiumpro_cost = {
797 COSTS_N_INSNS (1), /* cost of an add instruction */
798 COSTS_N_INSNS (1), /* cost of a lea instruction */
799 COSTS_N_INSNS (1), /* variable shift costs */
800 COSTS_N_INSNS (1), /* constant shift costs */
801 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
802 COSTS_N_INSNS (4), /* HI */
803 COSTS_N_INSNS (4), /* SI */
804 COSTS_N_INSNS (4), /* DI */
805 COSTS_N_INSNS (4)}, /* other */
806 0, /* cost of multiply per each bit set */
807 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
808 COSTS_N_INSNS (17), /* HI */
809 COSTS_N_INSNS (17), /* SI */
810 COSTS_N_INSNS (17), /* DI */
811 COSTS_N_INSNS (17)}, /* other */
812 COSTS_N_INSNS (1), /* cost of movsx */
813 COSTS_N_INSNS (1), /* cost of movzx */
814 8, /* "large" insn */
816 2, /* cost for loading QImode using movzbl */
817 {4, 4, 4}, /* cost of loading integer registers
818 in QImode, HImode and SImode.
819 Relative to reg-reg move (2). */
820 {2, 2, 2}, /* cost of storing integer registers */
821 2, /* cost of reg,reg fld/fst */
822 {2, 2, 6}, /* cost of loading fp registers
823 in SFmode, DFmode and XFmode */
824 {4, 4, 6}, /* cost of storing fp registers
825 in SFmode, DFmode and XFmode */
826 2, /* cost of moving MMX register */
827 {2, 2}, /* cost of loading MMX registers
828 in SImode and DImode */
829 {2, 2}, /* cost of storing MMX registers
830 in SImode and DImode */
831 2, /* cost of moving SSE register */
832 {2, 2, 8}, /* cost of loading SSE registers
833 in SImode, DImode and TImode */
834 {2, 2, 8}, /* cost of storing SSE registers
835 in SImode, DImode and TImode */
836 3, /* MMX or SSE register to integer */
837 8, /* size of l1 cache. */
838 256, /* size of l2 cache */
839 32, /* size of prefetch block */
840 6, /* number of parallel prefetches */
842 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
843 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
844 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
845 COSTS_N_INSNS (2), /* cost of FABS instruction. */
846 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
847 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
848 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
849 (we ensure the alignment). For small blocks inline loop is still a
850 noticeable win, for bigger blocks either rep movsl or rep movsb is
851 way to go. Rep movsb has apparently more expensive startup time in CPU,
852 but after 4K the difference is down in the noise. */
853 {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
854 {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
855 DUMMY_STRINGOP_ALGS},
856 {{rep_prefix_4_byte, {{1024, unrolled_loop},
857 {8192, rep_prefix_4_byte}, {-1, libcall}}},
858 DUMMY_STRINGOP_ALGS},
859 1, /* scalar_stmt_cost. */
860 1, /* scalar load_cost. */
861 1, /* scalar_store_cost. */
862 1, /* vec_stmt_cost. */
863 1, /* vec_to_scalar_cost. */
864 1, /* scalar_to_vec_cost. */
865 1, /* vec_align_load_cost. */
866 2, /* vec_unalign_load_cost. */
867 1, /* vec_store_cost. */
868 3, /* cond_taken_branch_cost. */
869 1, /* cond_not_taken_branch_cost. */
873 struct processor_costs geode_cost = {
874 COSTS_N_INSNS (1), /* cost of an add instruction */
875 COSTS_N_INSNS (1), /* cost of a lea instruction */
876 COSTS_N_INSNS (2), /* variable shift costs */
877 COSTS_N_INSNS (1), /* constant shift costs */
878 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
879 COSTS_N_INSNS (4), /* HI */
880 COSTS_N_INSNS (7), /* SI */
881 COSTS_N_INSNS (7), /* DI */
882 COSTS_N_INSNS (7)}, /* other */
883 0, /* cost of multiply per each bit set */
884 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
885 COSTS_N_INSNS (23), /* HI */
886 COSTS_N_INSNS (39), /* SI */
887 COSTS_N_INSNS (39), /* DI */
888 COSTS_N_INSNS (39)}, /* other */
889 COSTS_N_INSNS (1), /* cost of movsx */
890 COSTS_N_INSNS (1), /* cost of movzx */
891 8, /* "large" insn */
893 1, /* cost for loading QImode using movzbl */
894 {1, 1, 1}, /* cost of loading integer registers
895 in QImode, HImode and SImode.
896 Relative to reg-reg move (2). */
897 {1, 1, 1}, /* cost of storing integer registers */
898 1, /* cost of reg,reg fld/fst */
899 {1, 1, 1}, /* cost of loading fp registers
900 in SFmode, DFmode and XFmode */
901 {4, 6, 6}, /* cost of storing fp registers
902 in SFmode, DFmode and XFmode */
904 1, /* cost of moving MMX register */
905 {1, 1}, /* cost of loading MMX registers
906 in SImode and DImode */
907 {1, 1}, /* cost of storing MMX registers
908 in SImode and DImode */
909 1, /* cost of moving SSE register */
910 {1, 1, 1}, /* cost of loading SSE registers
911 in SImode, DImode and TImode */
912 {1, 1, 1}, /* cost of storing SSE registers
913 in SImode, DImode and TImode */
914 1, /* MMX or SSE register to integer */
915 64, /* size of l1 cache. */
916 128, /* size of l2 cache. */
917 32, /* size of prefetch block */
918 1, /* number of parallel prefetches */
920 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
921 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
922 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
923 COSTS_N_INSNS (1), /* cost of FABS instruction. */
924 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
925 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
926 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
927 DUMMY_STRINGOP_ALGS},
928 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
929 DUMMY_STRINGOP_ALGS},
930 1, /* scalar_stmt_cost. */
931 1, /* scalar load_cost. */
932 1, /* scalar_store_cost. */
933 1, /* vec_stmt_cost. */
934 1, /* vec_to_scalar_cost. */
935 1, /* scalar_to_vec_cost. */
936 1, /* vec_align_load_cost. */
937 2, /* vec_unalign_load_cost. */
938 1, /* vec_store_cost. */
939 3, /* cond_taken_branch_cost. */
940 1, /* cond_not_taken_branch_cost. */
944 struct processor_costs k6_cost = {
945 COSTS_N_INSNS (1), /* cost of an add instruction */
946 COSTS_N_INSNS (2), /* cost of a lea instruction */
947 COSTS_N_INSNS (1), /* variable shift costs */
948 COSTS_N_INSNS (1), /* constant shift costs */
949 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
950 COSTS_N_INSNS (3), /* HI */
951 COSTS_N_INSNS (3), /* SI */
952 COSTS_N_INSNS (3), /* DI */
953 COSTS_N_INSNS (3)}, /* other */
954 0, /* cost of multiply per each bit set */
955 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
956 COSTS_N_INSNS (18), /* HI */
957 COSTS_N_INSNS (18), /* SI */
958 COSTS_N_INSNS (18), /* DI */
959 COSTS_N_INSNS (18)}, /* other */
960 COSTS_N_INSNS (2), /* cost of movsx */
961 COSTS_N_INSNS (2), /* cost of movzx */
962 8, /* "large" insn */
964 3, /* cost for loading QImode using movzbl */
965 {4, 5, 4}, /* cost of loading integer registers
966 in QImode, HImode and SImode.
967 Relative to reg-reg move (2). */
968 {2, 3, 2}, /* cost of storing integer registers */
969 4, /* cost of reg,reg fld/fst */
970 {6, 6, 6}, /* cost of loading fp registers
971 in SFmode, DFmode and XFmode */
972 {4, 4, 4}, /* cost of storing fp registers
973 in SFmode, DFmode and XFmode */
974 2, /* cost of moving MMX register */
975 {2, 2}, /* cost of loading MMX registers
976 in SImode and DImode */
977 {2, 2}, /* cost of storing MMX registers
978 in SImode and DImode */
979 2, /* cost of moving SSE register */
980 {2, 2, 8}, /* cost of loading SSE registers
981 in SImode, DImode and TImode */
982 {2, 2, 8}, /* cost of storing SSE registers
983 in SImode, DImode and TImode */
984 6, /* MMX or SSE register to integer */
985 32, /* size of l1 cache. */
986 32, /* size of l2 cache. Some models
987 have integrated l2 cache, but
988 optimizing for k6 is not important
989 enough to worry about that. */
990 32, /* size of prefetch block */
991 1, /* number of parallel prefetches */
993 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
994 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
995 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
996 COSTS_N_INSNS (2), /* cost of FABS instruction. */
997 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
998 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
999 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1000 DUMMY_STRINGOP_ALGS},
1001 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1002 DUMMY_STRINGOP_ALGS},
1003 1, /* scalar_stmt_cost. */
1004 1, /* scalar load_cost. */
1005 1, /* scalar_store_cost. */
1006 1, /* vec_stmt_cost. */
1007 1, /* vec_to_scalar_cost. */
1008 1, /* scalar_to_vec_cost. */
1009 1, /* vec_align_load_cost. */
1010 2, /* vec_unalign_load_cost. */
1011 1, /* vec_store_cost. */
1012 3, /* cond_taken_branch_cost. */
1013 1, /* cond_not_taken_branch_cost. */
1017 struct processor_costs athlon_cost = {
1018 COSTS_N_INSNS (1), /* cost of an add instruction */
1019 COSTS_N_INSNS (2), /* cost of a lea instruction */
1020 COSTS_N_INSNS (1), /* variable shift costs */
1021 COSTS_N_INSNS (1), /* constant shift costs */
1022 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
1023 COSTS_N_INSNS (5), /* HI */
1024 COSTS_N_INSNS (5), /* SI */
1025 COSTS_N_INSNS (5), /* DI */
1026 COSTS_N_INSNS (5)}, /* other */
1027 0, /* cost of multiply per each bit set */
1028 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1029 COSTS_N_INSNS (26), /* HI */
1030 COSTS_N_INSNS (42), /* SI */
1031 COSTS_N_INSNS (74), /* DI */
1032 COSTS_N_INSNS (74)}, /* other */
1033 COSTS_N_INSNS (1), /* cost of movsx */
1034 COSTS_N_INSNS (1), /* cost of movzx */
1035 8, /* "large" insn */
1037 4, /* cost for loading QImode using movzbl */
1038 {3, 4, 3}, /* cost of loading integer registers
1039 in QImode, HImode and SImode.
1040 Relative to reg-reg move (2). */
1041 {3, 4, 3}, /* cost of storing integer registers */
1042 4, /* cost of reg,reg fld/fst */
1043 {4, 4, 12}, /* cost of loading fp registers
1044 in SFmode, DFmode and XFmode */
1045 {6, 6, 8}, /* cost of storing fp registers
1046 in SFmode, DFmode and XFmode */
1047 2, /* cost of moving MMX register */
1048 {4, 4}, /* cost of loading MMX registers
1049 in SImode and DImode */
1050 {4, 4}, /* cost of storing MMX registers
1051 in SImode and DImode */
1052 2, /* cost of moving SSE register */
1053 {4, 4, 6}, /* cost of loading SSE registers
1054 in SImode, DImode and TImode */
1055 {4, 4, 5}, /* cost of storing SSE registers
1056 in SImode, DImode and TImode */
1057 5, /* MMX or SSE register to integer */
1058 64, /* size of l1 cache. */
1059 256, /* size of l2 cache. */
1060 64, /* size of prefetch block */
1061 6, /* number of parallel prefetches */
1062 5, /* Branch cost */
1063 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1064 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1065 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
1066 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1067 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1068 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1069 /* For some reason, Athlon deals better with REP prefix (relative to loops)
1070 compared to K8. Alignment becomes important after 8 bytes for memcpy and
1071 128 bytes for memset. */
1072 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1073 DUMMY_STRINGOP_ALGS},
1074 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1075 DUMMY_STRINGOP_ALGS},
1076 1, /* scalar_stmt_cost. */
1077 1, /* scalar load_cost. */
1078 1, /* scalar_store_cost. */
1079 1, /* vec_stmt_cost. */
1080 1, /* vec_to_scalar_cost. */
1081 1, /* scalar_to_vec_cost. */
1082 1, /* vec_align_load_cost. */
1083 2, /* vec_unalign_load_cost. */
1084 1, /* vec_store_cost. */
1085 3, /* cond_taken_branch_cost. */
1086 1, /* cond_not_taken_branch_cost. */
1090 struct processor_costs k8_cost = {
1091 COSTS_N_INSNS (1), /* cost of an add instruction */
1092 COSTS_N_INSNS (2), /* cost of a lea instruction */
1093 COSTS_N_INSNS (1), /* variable shift costs */
1094 COSTS_N_INSNS (1), /* constant shift costs */
1095 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1096 COSTS_N_INSNS (4), /* HI */
1097 COSTS_N_INSNS (3), /* SI */
1098 COSTS_N_INSNS (4), /* DI */
1099 COSTS_N_INSNS (5)}, /* other */
1100 0, /* cost of multiply per each bit set */
1101 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1102 COSTS_N_INSNS (26), /* HI */
1103 COSTS_N_INSNS (42), /* SI */
1104 COSTS_N_INSNS (74), /* DI */
1105 COSTS_N_INSNS (74)}, /* other */
1106 COSTS_N_INSNS (1), /* cost of movsx */
1107 COSTS_N_INSNS (1), /* cost of movzx */
1108 8, /* "large" insn */
1110 4, /* cost for loading QImode using movzbl */
1111 {3, 4, 3}, /* cost of loading integer registers
1112 in QImode, HImode and SImode.
1113 Relative to reg-reg move (2). */
1114 {3, 4, 3}, /* cost of storing integer registers */
1115 4, /* cost of reg,reg fld/fst */
1116 {4, 4, 12}, /* cost of loading fp registers
1117 in SFmode, DFmode and XFmode */
1118 {6, 6, 8}, /* cost of storing fp registers
1119 in SFmode, DFmode and XFmode */
1120 2, /* cost of moving MMX register */
1121 {3, 3}, /* cost of loading MMX registers
1122 in SImode and DImode */
1123 {4, 4}, /* cost of storing MMX registers
1124 in SImode and DImode */
1125 2, /* cost of moving SSE register */
1126 {4, 3, 6}, /* cost of loading SSE registers
1127 in SImode, DImode and TImode */
1128 {4, 4, 5}, /* cost of storing SSE registers
1129 in SImode, DImode and TImode */
1130 5, /* MMX or SSE register to integer */
1131 64, /* size of l1 cache. */
1132 512, /* size of l2 cache. */
1133 64, /* size of prefetch block */
1134 /* New AMD processors never drop prefetches; if they cannot be performed
1135 immediately, they are queued. We set number of simultaneous prefetches
1136 to a large constant to reflect this (it probably is not a good idea not
1137 to limit number of prefetches at all, as their execution also takes some
1139 100, /* number of parallel prefetches */
1140 3, /* Branch cost */
1141 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1142 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1143 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1144 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1145 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1146 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1147 /* K8 has optimized REP instruction for medium sized blocks, but for very
1148 small blocks it is better to use loop. For large blocks, libcall can
1149 do nontemporary accesses and beat inline considerably. */
1150 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1151 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1152 {{libcall, {{8, loop}, {24, unrolled_loop},
1153 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1154 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1155 4, /* scalar_stmt_cost. */
1156 2, /* scalar load_cost. */
1157 2, /* scalar_store_cost. */
1158 5, /* vec_stmt_cost. */
1159 0, /* vec_to_scalar_cost. */
1160 2, /* scalar_to_vec_cost. */
1161 2, /* vec_align_load_cost. */
1162 3, /* vec_unalign_load_cost. */
1163 3, /* vec_store_cost. */
1164 3, /* cond_taken_branch_cost. */
1165 2, /* cond_not_taken_branch_cost. */
1168 struct processor_costs amdfam10_cost = {
1169 COSTS_N_INSNS (1), /* cost of an add instruction */
1170 COSTS_N_INSNS (2), /* cost of a lea instruction */
1171 COSTS_N_INSNS (1), /* variable shift costs */
1172 COSTS_N_INSNS (1), /* constant shift costs */
1173 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1174 COSTS_N_INSNS (4), /* HI */
1175 COSTS_N_INSNS (3), /* SI */
1176 COSTS_N_INSNS (4), /* DI */
1177 COSTS_N_INSNS (5)}, /* other */
1178 0, /* cost of multiply per each bit set */
1179 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1180 COSTS_N_INSNS (35), /* HI */
1181 COSTS_N_INSNS (51), /* SI */
1182 COSTS_N_INSNS (83), /* DI */
1183 COSTS_N_INSNS (83)}, /* other */
1184 COSTS_N_INSNS (1), /* cost of movsx */
1185 COSTS_N_INSNS (1), /* cost of movzx */
1186 8, /* "large" insn */
1188 4, /* cost for loading QImode using movzbl */
1189 {3, 4, 3}, /* cost of loading integer registers
1190 in QImode, HImode and SImode.
1191 Relative to reg-reg move (2). */
1192 {3, 4, 3}, /* cost of storing integer registers */
1193 4, /* cost of reg,reg fld/fst */
1194 {4, 4, 12}, /* cost of loading fp registers
1195 in SFmode, DFmode and XFmode */
1196 {6, 6, 8}, /* cost of storing fp registers
1197 in SFmode, DFmode and XFmode */
1198 2, /* cost of moving MMX register */
1199 {3, 3}, /* cost of loading MMX registers
1200 in SImode and DImode */
1201 {4, 4}, /* cost of storing MMX registers
1202 in SImode and DImode */
1203 2, /* cost of moving SSE register */
1204 {4, 4, 3}, /* cost of loading SSE registers
1205 in SImode, DImode and TImode */
1206 {4, 4, 5}, /* cost of storing SSE registers
1207 in SImode, DImode and TImode */
1208 3, /* MMX or SSE register to integer */
1210 MOVD reg64, xmmreg Double FSTORE 4
1211 MOVD reg32, xmmreg Double FSTORE 4
1213 MOVD reg64, xmmreg Double FADD 3
1215 MOVD reg32, xmmreg Double FADD 3
1217 64, /* size of l1 cache. */
1218 512, /* size of l2 cache. */
1219 64, /* size of prefetch block */
1220 /* New AMD processors never drop prefetches; if they cannot be performed
1221 immediately, they are queued. We set number of simultaneous prefetches
1222 to a large constant to reflect this (it probably is not a good idea not
1223 to limit number of prefetches at all, as their execution also takes some
1225 100, /* number of parallel prefetches */
1226 2, /* Branch cost */
1227 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1228 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1229 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1230 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1231 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1232 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1234 /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
1235 very small blocks it is better to use loop. For large blocks, libcall can
1236 do nontemporary accesses and beat inline considerably. */
1237 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1238 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1239 {{libcall, {{8, loop}, {24, unrolled_loop},
1240 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1241 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1242 4, /* scalar_stmt_cost. */
1243 2, /* scalar load_cost. */
1244 2, /* scalar_store_cost. */
1245 6, /* vec_stmt_cost. */
1246 0, /* vec_to_scalar_cost. */
1247 2, /* scalar_to_vec_cost. */
1248 2, /* vec_align_load_cost. */
1249 2, /* vec_unalign_load_cost. */
1250 2, /* vec_store_cost. */
1251 2, /* cond_taken_branch_cost. */
1252 1, /* cond_not_taken_branch_cost. */
1255 struct processor_costs bdver1_cost = {
1256 COSTS_N_INSNS (1), /* cost of an add instruction */
1257 COSTS_N_INSNS (1), /* cost of a lea instruction */
1258 COSTS_N_INSNS (1), /* variable shift costs */
1259 COSTS_N_INSNS (1), /* constant shift costs */
1260 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1261 COSTS_N_INSNS (4), /* HI */
1262 COSTS_N_INSNS (4), /* SI */
1263 COSTS_N_INSNS (6), /* DI */
1264 COSTS_N_INSNS (6)}, /* other */
1265 0, /* cost of multiply per each bit set */
1266 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1267 COSTS_N_INSNS (35), /* HI */
1268 COSTS_N_INSNS (51), /* SI */
1269 COSTS_N_INSNS (83), /* DI */
1270 COSTS_N_INSNS (83)}, /* other */
1271 COSTS_N_INSNS (1), /* cost of movsx */
1272 COSTS_N_INSNS (1), /* cost of movzx */
1273 8, /* "large" insn */
1275 4, /* cost for loading QImode using movzbl */
1276 {5, 5, 4}, /* cost of loading integer registers
1277 in QImode, HImode and SImode.
1278 Relative to reg-reg move (2). */
1279 {4, 4, 4}, /* cost of storing integer registers */
1280 2, /* cost of reg,reg fld/fst */
1281 {5, 5, 12}, /* cost of loading fp registers
1282 in SFmode, DFmode and XFmode */
1283 {4, 4, 8}, /* cost of storing fp registers
1284 in SFmode, DFmode and XFmode */
1285 2, /* cost of moving MMX register */
1286 {4, 4}, /* cost of loading MMX registers
1287 in SImode and DImode */
1288 {4, 4}, /* cost of storing MMX registers
1289 in SImode and DImode */
1290 2, /* cost of moving SSE register */
1291 {4, 4, 4}, /* cost of loading SSE registers
1292 in SImode, DImode and TImode */
1293 {4, 4, 4}, /* cost of storing SSE registers
1294 in SImode, DImode and TImode */
1295 2, /* MMX or SSE register to integer */
1297 MOVD reg64, xmmreg Double FSTORE 4
1298 MOVD reg32, xmmreg Double FSTORE 4
1300 MOVD reg64, xmmreg Double FADD 3
1302 MOVD reg32, xmmreg Double FADD 3
1304 16, /* size of l1 cache. */
1305 2048, /* size of l2 cache. */
1306 64, /* size of prefetch block */
1307 /* New AMD processors never drop prefetches; if they cannot be performed
1308 immediately, they are queued. We set number of simultaneous prefetches
1309 to a large constant to reflect this (it probably is not a good idea not
1310 to limit number of prefetches at all, as their execution also takes some
1312 100, /* number of parallel prefetches */
1313 2, /* Branch cost */
1314 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1315 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1316 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1317 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1318 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1319 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1321 /* BDVER1 has optimized REP instruction for medium sized blocks, but for
1322 very small blocks it is better to use loop. For large blocks, libcall
1323 can do nontemporary accesses and beat inline considerably. */
1324 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1325 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1326 {{libcall, {{8, loop}, {24, unrolled_loop},
1327 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1328 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1329 6, /* scalar_stmt_cost. */
1330 4, /* scalar load_cost. */
1331 4, /* scalar_store_cost. */
1332 6, /* vec_stmt_cost. */
1333 0, /* vec_to_scalar_cost. */
1334 2, /* scalar_to_vec_cost. */
1335 4, /* vec_align_load_cost. */
1336 4, /* vec_unalign_load_cost. */
1337 4, /* vec_store_cost. */
1338 2, /* cond_taken_branch_cost. */
1339 1, /* cond_not_taken_branch_cost. */
1342 struct processor_costs bdver2_cost = {
1343 COSTS_N_INSNS (1), /* cost of an add instruction */
1344 COSTS_N_INSNS (1), /* cost of a lea instruction */
1345 COSTS_N_INSNS (1), /* variable shift costs */
1346 COSTS_N_INSNS (1), /* constant shift costs */
1347 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1348 COSTS_N_INSNS (4), /* HI */
1349 COSTS_N_INSNS (4), /* SI */
1350 COSTS_N_INSNS (6), /* DI */
1351 COSTS_N_INSNS (6)}, /* other */
1352 0, /* cost of multiply per each bit set */
1353 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1354 COSTS_N_INSNS (35), /* HI */
1355 COSTS_N_INSNS (51), /* SI */
1356 COSTS_N_INSNS (83), /* DI */
1357 COSTS_N_INSNS (83)}, /* other */
1358 COSTS_N_INSNS (1), /* cost of movsx */
1359 COSTS_N_INSNS (1), /* cost of movzx */
1360 8, /* "large" insn */
1362 4, /* cost for loading QImode using movzbl */
1363 {5, 5, 4}, /* cost of loading integer registers
1364 in QImode, HImode and SImode.
1365 Relative to reg-reg move (2). */
1366 {4, 4, 4}, /* cost of storing integer registers */
1367 2, /* cost of reg,reg fld/fst */
1368 {5, 5, 12}, /* cost of loading fp registers
1369 in SFmode, DFmode and XFmode */
1370 {4, 4, 8}, /* cost of storing fp registers
1371 in SFmode, DFmode and XFmode */
1372 2, /* cost of moving MMX register */
1373 {4, 4}, /* cost of loading MMX registers
1374 in SImode and DImode */
1375 {4, 4}, /* cost of storing MMX registers
1376 in SImode and DImode */
1377 2, /* cost of moving SSE register */
1378 {4, 4, 4}, /* cost of loading SSE registers
1379 in SImode, DImode and TImode */
1380 {4, 4, 4}, /* cost of storing SSE registers
1381 in SImode, DImode and TImode */
1382 2, /* MMX or SSE register to integer */
1384 MOVD reg64, xmmreg Double FSTORE 4
1385 MOVD reg32, xmmreg Double FSTORE 4
1387 MOVD reg64, xmmreg Double FADD 3
1389 MOVD reg32, xmmreg Double FADD 3
1391 16, /* size of l1 cache. */
1392 2048, /* size of l2 cache. */
1393 64, /* size of prefetch block */
1394 /* New AMD processors never drop prefetches; if they cannot be performed
1395 immediately, they are queued. We set number of simultaneous prefetches
1396 to a large constant to reflect this (it probably is not a good idea not
1397 to limit number of prefetches at all, as their execution also takes some
1399 100, /* number of parallel prefetches */
1400 2, /* Branch cost */
1401 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1402 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1403 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1404 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1405 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1406 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1408 /* BDVER2 has optimized REP instruction for medium sized blocks, but for
1409 very small blocks it is better to use loop. For large blocks, libcall
1410 can do nontemporary accesses and beat inline considerably. */
1411 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1412 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1413 {{libcall, {{8, loop}, {24, unrolled_loop},
1414 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1415 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1416 6, /* scalar_stmt_cost. */
1417 4, /* scalar load_cost. */
1418 4, /* scalar_store_cost. */
1419 6, /* vec_stmt_cost. */
1420 0, /* vec_to_scalar_cost. */
1421 2, /* scalar_to_vec_cost. */
1422 4, /* vec_align_load_cost. */
1423 4, /* vec_unalign_load_cost. */
1424 4, /* vec_store_cost. */
1425 2, /* cond_taken_branch_cost. */
1426 1, /* cond_not_taken_branch_cost. */
1429 struct processor_costs btver1_cost = {
1430 COSTS_N_INSNS (1), /* cost of an add instruction */
1431 COSTS_N_INSNS (2), /* cost of a lea instruction */
1432 COSTS_N_INSNS (1), /* variable shift costs */
1433 COSTS_N_INSNS (1), /* constant shift costs */
1434 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1435 COSTS_N_INSNS (4), /* HI */
1436 COSTS_N_INSNS (3), /* SI */
1437 COSTS_N_INSNS (4), /* DI */
1438 COSTS_N_INSNS (5)}, /* other */
1439 0, /* cost of multiply per each bit set */
1440 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1441 COSTS_N_INSNS (35), /* HI */
1442 COSTS_N_INSNS (51), /* SI */
1443 COSTS_N_INSNS (83), /* DI */
1444 COSTS_N_INSNS (83)}, /* other */
1445 COSTS_N_INSNS (1), /* cost of movsx */
1446 COSTS_N_INSNS (1), /* cost of movzx */
1447 8, /* "large" insn */
1449 4, /* cost for loading QImode using movzbl */
1450 {3, 4, 3}, /* cost of loading integer registers
1451 in QImode, HImode and SImode.
1452 Relative to reg-reg move (2). */
1453 {3, 4, 3}, /* cost of storing integer registers */
1454 4, /* cost of reg,reg fld/fst */
1455 {4, 4, 12}, /* cost of loading fp registers
1456 in SFmode, DFmode and XFmode */
1457 {6, 6, 8}, /* cost of storing fp registers
1458 in SFmode, DFmode and XFmode */
1459 2, /* cost of moving MMX register */
1460 {3, 3}, /* cost of loading MMX registers
1461 in SImode and DImode */
1462 {4, 4}, /* cost of storing MMX registers
1463 in SImode and DImode */
1464 2, /* cost of moving SSE register */
1465 {4, 4, 3}, /* cost of loading SSE registers
1466 in SImode, DImode and TImode */
1467 {4, 4, 5}, /* cost of storing SSE registers
1468 in SImode, DImode and TImode */
1469 3, /* MMX or SSE register to integer */
1471 MOVD reg64, xmmreg Double FSTORE 4
1472 MOVD reg32, xmmreg Double FSTORE 4
1474 MOVD reg64, xmmreg Double FADD 3
1476 MOVD reg32, xmmreg Double FADD 3
1478 32, /* size of l1 cache. */
1479 512, /* size of l2 cache. */
1480 64, /* size of prefetch block */
1481 100, /* number of parallel prefetches */
1482 2, /* Branch cost */
1483 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1484 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1485 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1486 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1487 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1488 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1490 /* BTVER1 has optimized REP instruction for medium sized blocks, but for
1491 very small blocks it is better to use loop. For large blocks, libcall can
1492 do nontemporary accesses and beat inline considerably. */
1493 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1494 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1495 {{libcall, {{8, loop}, {24, unrolled_loop},
1496 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1497 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1498 4, /* scalar_stmt_cost. */
1499 2, /* scalar load_cost. */
1500 2, /* scalar_store_cost. */
1501 6, /* vec_stmt_cost. */
1502 0, /* vec_to_scalar_cost. */
1503 2, /* scalar_to_vec_cost. */
1504 2, /* vec_align_load_cost. */
1505 2, /* vec_unalign_load_cost. */
1506 2, /* vec_store_cost. */
1507 2, /* cond_taken_branch_cost. */
1508 1, /* cond_not_taken_branch_cost. */
1512 struct processor_costs pentium4_cost = {
1513 COSTS_N_INSNS (1), /* cost of an add instruction */
1514 COSTS_N_INSNS (3), /* cost of a lea instruction */
1515 COSTS_N_INSNS (4), /* variable shift costs */
1516 COSTS_N_INSNS (4), /* constant shift costs */
1517 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1518 COSTS_N_INSNS (15), /* HI */
1519 COSTS_N_INSNS (15), /* SI */
1520 COSTS_N_INSNS (15), /* DI */
1521 COSTS_N_INSNS (15)}, /* other */
1522 0, /* cost of multiply per each bit set */
1523 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1524 COSTS_N_INSNS (56), /* HI */
1525 COSTS_N_INSNS (56), /* SI */
1526 COSTS_N_INSNS (56), /* DI */
1527 COSTS_N_INSNS (56)}, /* other */
1528 COSTS_N_INSNS (1), /* cost of movsx */
1529 COSTS_N_INSNS (1), /* cost of movzx */
1530 16, /* "large" insn */
1532 2, /* cost for loading QImode using movzbl */
1533 {4, 5, 4}, /* cost of loading integer registers
1534 in QImode, HImode and SImode.
1535 Relative to reg-reg move (2). */
1536 {2, 3, 2}, /* cost of storing integer registers */
1537 2, /* cost of reg,reg fld/fst */
1538 {2, 2, 6}, /* cost of loading fp registers
1539 in SFmode, DFmode and XFmode */
1540 {4, 4, 6}, /* cost of storing fp registers
1541 in SFmode, DFmode and XFmode */
1542 2, /* cost of moving MMX register */
1543 {2, 2}, /* cost of loading MMX registers
1544 in SImode and DImode */
1545 {2, 2}, /* cost of storing MMX registers
1546 in SImode and DImode */
1547 12, /* cost of moving SSE register */
1548 {12, 12, 12}, /* cost of loading SSE registers
1549 in SImode, DImode and TImode */
1550 {2, 2, 8}, /* cost of storing SSE registers
1551 in SImode, DImode and TImode */
1552 10, /* MMX or SSE register to integer */
1553 8, /* size of l1 cache. */
1554 256, /* size of l2 cache. */
1555 64, /* size of prefetch block */
1556 6, /* number of parallel prefetches */
1557 2, /* Branch cost */
1558 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1559 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1560 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1561 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1562 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1563 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1564 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1565 DUMMY_STRINGOP_ALGS},
1566 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1568 DUMMY_STRINGOP_ALGS},
1569 1, /* scalar_stmt_cost. */
1570 1, /* scalar load_cost. */
1571 1, /* scalar_store_cost. */
1572 1, /* vec_stmt_cost. */
1573 1, /* vec_to_scalar_cost. */
1574 1, /* scalar_to_vec_cost. */
1575 1, /* vec_align_load_cost. */
1576 2, /* vec_unalign_load_cost. */
1577 1, /* vec_store_cost. */
1578 3, /* cond_taken_branch_cost. */
1579 1, /* cond_not_taken_branch_cost. */
1583 struct processor_costs nocona_cost = {
1584 COSTS_N_INSNS (1), /* cost of an add instruction */
1585 COSTS_N_INSNS (1), /* cost of a lea instruction */
1586 COSTS_N_INSNS (1), /* variable shift costs */
1587 COSTS_N_INSNS (1), /* constant shift costs */
1588 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1589 COSTS_N_INSNS (10), /* HI */
1590 COSTS_N_INSNS (10), /* SI */
1591 COSTS_N_INSNS (10), /* DI */
1592 COSTS_N_INSNS (10)}, /* other */
1593 0, /* cost of multiply per each bit set */
1594 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1595 COSTS_N_INSNS (66), /* HI */
1596 COSTS_N_INSNS (66), /* SI */
1597 COSTS_N_INSNS (66), /* DI */
1598 COSTS_N_INSNS (66)}, /* other */
1599 COSTS_N_INSNS (1), /* cost of movsx */
1600 COSTS_N_INSNS (1), /* cost of movzx */
1601 16, /* "large" insn */
1602 17, /* MOVE_RATIO */
1603 4, /* cost for loading QImode using movzbl */
1604 {4, 4, 4}, /* cost of loading integer registers
1605 in QImode, HImode and SImode.
1606 Relative to reg-reg move (2). */
1607 {4, 4, 4}, /* cost of storing integer registers */
1608 3, /* cost of reg,reg fld/fst */
1609 {12, 12, 12}, /* cost of loading fp registers
1610 in SFmode, DFmode and XFmode */
1611 {4, 4, 4}, /* cost of storing fp registers
1612 in SFmode, DFmode and XFmode */
1613 6, /* cost of moving MMX register */
1614 {12, 12}, /* cost of loading MMX registers
1615 in SImode and DImode */
1616 {12, 12}, /* cost of storing MMX registers
1617 in SImode and DImode */
1618 6, /* cost of moving SSE register */
1619 {12, 12, 12}, /* cost of loading SSE registers
1620 in SImode, DImode and TImode */
1621 {12, 12, 12}, /* cost of storing SSE registers
1622 in SImode, DImode and TImode */
1623 8, /* MMX or SSE register to integer */
1624 8, /* size of l1 cache. */
1625 1024, /* size of l2 cache. */
1626 128, /* size of prefetch block */
1627 8, /* number of parallel prefetches */
1628 1, /* Branch cost */
1629 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1630 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1631 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1632 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1633 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1634 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1635 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1636 {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
1637 {100000, unrolled_loop}, {-1, libcall}}}},
1638 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1640 {libcall, {{24, loop}, {64, unrolled_loop},
1641 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1642 1, /* scalar_stmt_cost. */
1643 1, /* scalar load_cost. */
1644 1, /* scalar_store_cost. */
1645 1, /* vec_stmt_cost. */
1646 1, /* vec_to_scalar_cost. */
1647 1, /* scalar_to_vec_cost. */
1648 1, /* vec_align_load_cost. */
1649 2, /* vec_unalign_load_cost. */
1650 1, /* vec_store_cost. */
1651 3, /* cond_taken_branch_cost. */
1652 1, /* cond_not_taken_branch_cost. */
1656 struct processor_costs atom_cost = {
1657 COSTS_N_INSNS (1), /* cost of an add instruction */
1658 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1659 COSTS_N_INSNS (1), /* variable shift costs */
1660 COSTS_N_INSNS (1), /* constant shift costs */
1661 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1662 COSTS_N_INSNS (4), /* HI */
1663 COSTS_N_INSNS (3), /* SI */
1664 COSTS_N_INSNS (4), /* DI */
1665 COSTS_N_INSNS (2)}, /* other */
1666 0, /* cost of multiply per each bit set */
1667 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1668 COSTS_N_INSNS (26), /* HI */
1669 COSTS_N_INSNS (42), /* SI */
1670 COSTS_N_INSNS (74), /* DI */
1671 COSTS_N_INSNS (74)}, /* other */
1672 COSTS_N_INSNS (1), /* cost of movsx */
1673 COSTS_N_INSNS (1), /* cost of movzx */
1674 8, /* "large" insn */
1675 17, /* MOVE_RATIO */
1676 4, /* cost for loading QImode using movzbl */
1677 {4, 4, 4}, /* cost of loading integer registers
1678 in QImode, HImode and SImode.
1679 Relative to reg-reg move (2). */
1680 {4, 4, 4}, /* cost of storing integer registers */
1681 4, /* cost of reg,reg fld/fst */
1682 {12, 12, 12}, /* cost of loading fp registers
1683 in SFmode, DFmode and XFmode */
1684 {6, 6, 8}, /* cost of storing fp registers
1685 in SFmode, DFmode and XFmode */
1686 2, /* cost of moving MMX register */
1687 {8, 8}, /* cost of loading MMX registers
1688 in SImode and DImode */
1689 {8, 8}, /* cost of storing MMX registers
1690 in SImode and DImode */
1691 2, /* cost of moving SSE register */
1692 {8, 8, 8}, /* cost of loading SSE registers
1693 in SImode, DImode and TImode */
1694 {8, 8, 8}, /* cost of storing SSE registers
1695 in SImode, DImode and TImode */
1696 5, /* MMX or SSE register to integer */
1697 32, /* size of l1 cache. */
1698 256, /* size of l2 cache. */
1699 64, /* size of prefetch block */
1700 6, /* number of parallel prefetches */
1701 3, /* Branch cost */
1702 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1703 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1704 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1705 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1706 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1707 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1708 {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
1709 {libcall, {{32, loop}, {64, rep_prefix_4_byte},
1710 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1711 {{libcall, {{8, loop}, {15, unrolled_loop},
1712 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1713 {libcall, {{24, loop}, {32, unrolled_loop},
1714 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1715 1, /* scalar_stmt_cost. */
1716 1, /* scalar load_cost. */
1717 1, /* scalar_store_cost. */
1718 1, /* vec_stmt_cost. */
1719 1, /* vec_to_scalar_cost. */
1720 1, /* scalar_to_vec_cost. */
1721 1, /* vec_align_load_cost. */
1722 2, /* vec_unalign_load_cost. */
1723 1, /* vec_store_cost. */
1724 3, /* cond_taken_branch_cost. */
1725 1, /* cond_not_taken_branch_cost. */
1728 /* Generic64 should produce code tuned for Nocona and K8. */
1730 struct processor_costs generic64_cost = {
1731 COSTS_N_INSNS (1), /* cost of an add instruction */
1732 /* On all chips taken into consideration lea is 2 cycles and more. With
1733 this cost however our current implementation of synth_mult results in
1734 use of unnecessary temporary registers causing regression on several
1735 SPECfp benchmarks. */
1736 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1737 COSTS_N_INSNS (1), /* variable shift costs */
1738 COSTS_N_INSNS (1), /* constant shift costs */
1739 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1740 COSTS_N_INSNS (4), /* HI */
1741 COSTS_N_INSNS (3), /* SI */
1742 COSTS_N_INSNS (4), /* DI */
1743 COSTS_N_INSNS (2)}, /* other */
1744 0, /* cost of multiply per each bit set */
1745 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1746 COSTS_N_INSNS (26), /* HI */
1747 COSTS_N_INSNS (42), /* SI */
1748 COSTS_N_INSNS (74), /* DI */
1749 COSTS_N_INSNS (74)}, /* other */
1750 COSTS_N_INSNS (1), /* cost of movsx */
1751 COSTS_N_INSNS (1), /* cost of movzx */
1752 8, /* "large" insn */
1753 17, /* MOVE_RATIO */
1754 4, /* cost for loading QImode using movzbl */
1755 {4, 4, 4}, /* cost of loading integer registers
1756 in QImode, HImode and SImode.
1757 Relative to reg-reg move (2). */
1758 {4, 4, 4}, /* cost of storing integer registers */
1759 4, /* cost of reg,reg fld/fst */
1760 {12, 12, 12}, /* cost of loading fp registers
1761 in SFmode, DFmode and XFmode */
1762 {6, 6, 8}, /* cost of storing fp registers
1763 in SFmode, DFmode and XFmode */
1764 2, /* cost of moving MMX register */
1765 {8, 8}, /* cost of loading MMX registers
1766 in SImode and DImode */
1767 {8, 8}, /* cost of storing MMX registers
1768 in SImode and DImode */
1769 2, /* cost of moving SSE register */
1770 {8, 8, 8}, /* cost of loading SSE registers
1771 in SImode, DImode and TImode */
1772 {8, 8, 8}, /* cost of storing SSE registers
1773 in SImode, DImode and TImode */
1774 5, /* MMX or SSE register to integer */
1775 32, /* size of l1 cache. */
1776 512, /* size of l2 cache. */
1777 64, /* size of prefetch block */
1778 6, /* number of parallel prefetches */
1779 /* Benchmarks shows large regressions on K8 sixtrack benchmark when this
1780 value is increased to perhaps more appropriate value of 5. */
1781 3, /* Branch cost */
1782 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1783 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1784 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1785 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1786 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1787 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1788 {DUMMY_STRINGOP_ALGS,
1789 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1790 {DUMMY_STRINGOP_ALGS,
1791 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1792 1, /* scalar_stmt_cost. */
1793 1, /* scalar load_cost. */
1794 1, /* scalar_store_cost. */
1795 1, /* vec_stmt_cost. */
1796 1, /* vec_to_scalar_cost. */
1797 1, /* scalar_to_vec_cost. */
1798 1, /* vec_align_load_cost. */
1799 2, /* vec_unalign_load_cost. */
1800 1, /* vec_store_cost. */
1801 3, /* cond_taken_branch_cost. */
1802 1, /* cond_not_taken_branch_cost. */
1805 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1808 struct processor_costs generic32_cost = {
1809 COSTS_N_INSNS (1), /* cost of an add instruction */
1810 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1811 COSTS_N_INSNS (1), /* variable shift costs */
1812 COSTS_N_INSNS (1), /* constant shift costs */
1813 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1814 COSTS_N_INSNS (4), /* HI */
1815 COSTS_N_INSNS (3), /* SI */
1816 COSTS_N_INSNS (4), /* DI */
1817 COSTS_N_INSNS (2)}, /* other */
1818 0, /* cost of multiply per each bit set */
1819 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1820 COSTS_N_INSNS (26), /* HI */
1821 COSTS_N_INSNS (42), /* SI */
1822 COSTS_N_INSNS (74), /* DI */
1823 COSTS_N_INSNS (74)}, /* other */
1824 COSTS_N_INSNS (1), /* cost of movsx */
1825 COSTS_N_INSNS (1), /* cost of movzx */
1826 8, /* "large" insn */
1827 17, /* MOVE_RATIO */
1828 4, /* cost for loading QImode using movzbl */
1829 {4, 4, 4}, /* cost of loading integer registers
1830 in QImode, HImode and SImode.
1831 Relative to reg-reg move (2). */
1832 {4, 4, 4}, /* cost of storing integer registers */
1833 4, /* cost of reg,reg fld/fst */
1834 {12, 12, 12}, /* cost of loading fp registers
1835 in SFmode, DFmode and XFmode */
1836 {6, 6, 8}, /* cost of storing fp registers
1837 in SFmode, DFmode and XFmode */
1838 2, /* cost of moving MMX register */
1839 {8, 8}, /* cost of loading MMX registers
1840 in SImode and DImode */
1841 {8, 8}, /* cost of storing MMX registers
1842 in SImode and DImode */
1843 2, /* cost of moving SSE register */
1844 {8, 8, 8}, /* cost of loading SSE registers
1845 in SImode, DImode and TImode */
1846 {8, 8, 8}, /* cost of storing SSE registers
1847 in SImode, DImode and TImode */
1848 5, /* MMX or SSE register to integer */
1849 32, /* size of l1 cache. */
1850 256, /* size of l2 cache. */
1851 64, /* size of prefetch block */
1852 6, /* number of parallel prefetches */
1853 3, /* Branch cost */
1854 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1855 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1856 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1857 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1858 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1859 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1860 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1861 DUMMY_STRINGOP_ALGS},
1862 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1863 DUMMY_STRINGOP_ALGS},
1864 1, /* scalar_stmt_cost. */
1865 1, /* scalar load_cost. */
1866 1, /* scalar_store_cost. */
1867 1, /* vec_stmt_cost. */
1868 1, /* vec_to_scalar_cost. */
1869 1, /* scalar_to_vec_cost. */
1870 1, /* vec_align_load_cost. */
1871 2, /* vec_unalign_load_cost. */
1872 1, /* vec_store_cost. */
1873 3, /* cond_taken_branch_cost. */
1874 1, /* cond_not_taken_branch_cost. */
1877 const struct processor_costs *ix86_cost = &pentium_cost;
1879 /* Processor feature/optimization bitmasks. */
1880 #define m_386 (1<<PROCESSOR_I386)
1881 #define m_486 (1<<PROCESSOR_I486)
1882 #define m_PENT (1<<PROCESSOR_PENTIUM)
1883 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1884 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1885 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1886 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1887 #define m_CORE2_32 (1<<PROCESSOR_CORE2_32)
1888 #define m_CORE2_64 (1<<PROCESSOR_CORE2_64)
1889 #define m_COREI7_32 (1<<PROCESSOR_COREI7_32)
1890 #define m_COREI7_64 (1<<PROCESSOR_COREI7_64)
1891 #define m_COREI7 (m_COREI7_32 | m_COREI7_64)
1892 #define m_CORE2I7_32 (m_CORE2_32 | m_COREI7_32)
1893 #define m_CORE2I7_64 (m_CORE2_64 | m_COREI7_64)
1894 #define m_CORE2I7 (m_CORE2I7_32 | m_CORE2I7_64)
1895 #define m_ATOM (1<<PROCESSOR_ATOM)
1897 #define m_GEODE (1<<PROCESSOR_GEODE)
1898 #define m_K6 (1<<PROCESSOR_K6)
1899 #define m_K6_GEODE (m_K6 | m_GEODE)
1900 #define m_K8 (1<<PROCESSOR_K8)
1901 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1902 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1903 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1904 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1905 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1906 #define m_BDVER (m_BDVER1 | m_BDVER2)
1907 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1908 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1)
1910 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1911 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1913 /* Generic instruction choice should be common subset of supported CPUs
1914 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
1915 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
1917 /* Feature tests against the various tunings. */
1918 unsigned char ix86_tune_features[X86_TUNE_LAST];
1920 /* Feature tests against the various tunings used to create ix86_tune_features
1921 based on the processor mask. */
1922 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1923 /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
1924 negatively, so enabling for Generic64 seems like good code size
1925 tradeoff. We can't enable it for 32bit generic because it does not
1926 work well with PPro base chips. */
1927 m_386 | m_CORE2I7_64 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
1929 /* X86_TUNE_PUSH_MEMORY */
1930 m_386 | m_P4_NOCONA | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1932 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1935 /* X86_TUNE_UNROLL_STRLEN */
1936 m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE2I7 | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
1938 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
1939 on simulation result. But after P4 was made, no performance benefit
1940 was observed with branch hints. It also increases the code size.
1941 As a result, icc never generates branch hints. */
1944 /* X86_TUNE_DOUBLE_WITH_ADD */
1947 /* X86_TUNE_USE_SAHF */
1948 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC,
1950 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
1951 partial dependencies. */
1952 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1954 /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
1955 register stalls on Generic32 compilation setting as well. However
1956 in current implementation the partial register stalls are not eliminated
1957 very well - they can be introduced via subregs synthesized by combine
1958 and can happen in caller/callee saving sequences. Because this option
1959 pays back little on PPro based chips and is in conflict with partial reg
1960 dependencies used by Athlon/P4 based chips, it is better to leave it off
1961 for generic32 for now. */
1964 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1965 m_CORE2I7 | m_GENERIC,
1967 /* X86_TUNE_USE_HIMODE_FIOP */
1968 m_386 | m_486 | m_K6_GEODE,
1970 /* X86_TUNE_USE_SIMODE_FIOP */
1971 ~(m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
1973 /* X86_TUNE_USE_MOV0 */
1976 /* X86_TUNE_USE_CLTD */
1977 ~(m_PENT | m_CORE2I7 | m_ATOM | m_K6 | m_GENERIC),
1979 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
1982 /* X86_TUNE_SPLIT_LONG_MOVES */
1985 /* X86_TUNE_READ_MODIFY_WRITE */
1988 /* X86_TUNE_READ_MODIFY */
1991 /* X86_TUNE_PROMOTE_QIMODE */
1992 m_386 | m_486 | m_PENT | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1994 /* X86_TUNE_FAST_PREFIX */
1995 ~(m_386 | m_486 | m_PENT),
1997 /* X86_TUNE_SINGLE_STRINGOP */
1998 m_386 | m_P4_NOCONA,
2000 /* X86_TUNE_QIMODE_MATH */
2003 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
2004 register stalls. Just like X86_TUNE_PARTIAL_REG_STALL this option
2005 might be considered for Generic32 if our scheme for avoiding partial
2006 stalls was more effective. */
2009 /* X86_TUNE_PROMOTE_QI_REGS */
2012 /* X86_TUNE_PROMOTE_HI_REGS */
2015 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
2016 over esp addition. */
2017 m_386 | m_486 | m_PENT | m_PPRO,
2019 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
2020 over esp addition. */
2023 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
2024 over esp subtraction. */
2025 m_386 | m_486 | m_PENT | m_K6_GEODE,
2027 /* X86_TUNE_DOUBLE_PUSH. Enable if double push insn is preferred
2028 over esp subtraction. */
2029 m_PENT | m_K6_GEODE,
2031 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
2032 for DFmode copies */
2033 ~(m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_ATOM | m_GENERIC),
2035 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
2036 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2038 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
2039 conflict here in between PPro/Pentium4 based chips that thread 128bit
2040 SSE registers as single units versus K8 based chips that divide SSE
2041 registers to two 64bit halves. This knob promotes all store destinations
2042 to be 128bit to allow register renaming on 128bit SSE units, but usually
2043 results in one extra microop on 64bit SSE units. Experimental results
2044 shows that disabling this option on P4 brings over 20% SPECfp regression,
2045 while enabling it on K8 brings roughly 2.4% regression that can be partly
2046 masked by careful scheduling of moves. */
2047 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
2049 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
2050 m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER1,
2052 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
2055 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
2058 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
2059 are resolved on SSE register parts instead of whole registers, so we may
2060 maintain just lower part of scalar values in proper format leaving the
2061 upper part undefined. */
2064 /* X86_TUNE_SSE_TYPELESS_STORES */
2067 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
2068 m_PPRO | m_P4_NOCONA,
2070 /* X86_TUNE_MEMORY_MISMATCH_STALL */
2071 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2073 /* X86_TUNE_PROLOGUE_USING_MOVE */
2074 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2076 /* X86_TUNE_EPILOGUE_USING_MOVE */
2077 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2079 /* X86_TUNE_SHIFT1 */
2082 /* X86_TUNE_USE_FFREEP */
2085 /* X86_TUNE_INTER_UNIT_MOVES */
2086 ~(m_AMD_MULTIPLE | m_GENERIC),
2088 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
2089 ~(m_AMDFAM10 | m_BDVER ),
2091 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
2092 than 4 branch instructions in the 16 byte window. */
2093 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2095 /* X86_TUNE_SCHEDULE */
2096 m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2098 /* X86_TUNE_USE_BT */
2099 m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2101 /* X86_TUNE_USE_INCDEC */
2102 ~(m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GENERIC),
2104 /* X86_TUNE_PAD_RETURNS */
2105 m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC,
2107 /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short funtion. */
2110 /* X86_TUNE_EXT_80387_CONSTANTS */
2111 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
2113 /* X86_TUNE_SHORTEN_X87_SSE */
2116 /* X86_TUNE_AVOID_VECTOR_DECODE */
2117 m_CORE2I7_64 | m_K8 | m_GENERIC64,
2119 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode
2120 and SImode multiply, but 386 and 486 do HImode multiply faster. */
2123 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
2124 vector path on AMD machines. */
2125 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2127 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
2129 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
2131 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
2135 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
2136 but one byte longer. */
2139 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory
2140 operand that cannot be represented using a modRM byte. The XOR
2141 replacement is long decoded, so this split helps here as well. */
2144 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
2146 m_CORE2I7 | m_AMDFAM10 | m_GENERIC,
2148 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
2149 from integer to FP. */
2152 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
2153 with a subsequent conditional jump instruction into a single
2154 compare-and-branch uop. */
2157 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
2158 will impact LEA instruction selection. */
2161 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
2165 /* X86_SOFTARE_PREFETCHING_BENEFICIAL: Enable software prefetching
2166 at -O3. For the moment, the prefetching seems badly tuned for Intel
2168 m_K6_GEODE | m_AMD_MULTIPLE,
2170 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
2171 the auto-vectorizer. */
2174 /* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations
2175 during reassociation of integer computation. */
2178 /* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations
2179 during reassociation of fp computation. */
2183 /* Feature tests against the various architecture variations. */
2184 unsigned char ix86_arch_features[X86_ARCH_LAST];
2186 /* Feature tests against the various architecture variations, used to create
2187 ix86_arch_features based on the processor mask. */
2188 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2189 /* X86_ARCH_CMOV: Conditional move was added for pentiumpro. */
2190 ~(m_386 | m_486 | m_PENT | m_K6),
2192 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2195 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2198 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2201 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2205 static const unsigned int x86_accumulate_outgoing_args
2206 = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC;
2208 static const unsigned int x86_arch_always_fancy_math_387
2209 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
2211 static const unsigned int x86_avx256_split_unaligned_load
2212 = m_COREI7 | m_GENERIC;
2214 static const unsigned int x86_avx256_split_unaligned_store
2215 = m_COREI7 | m_BDVER | m_GENERIC;
2217 /* In case the average insn count for single function invocation is
2218 lower than this constant, emit fast (but longer) prologue and
2220 #define FAST_PROLOGUE_INSN_COUNT 20
2222 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
2223 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2224 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2225 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2227 /* Array of the smallest class containing reg number REGNO, indexed by
2228 REGNO. Used by REGNO_REG_CLASS in i386.h. */
2230 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2232 /* ax, dx, cx, bx */
2233 AREG, DREG, CREG, BREG,
2234 /* si, di, bp, sp */
2235 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2237 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2238 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2241 /* flags, fpsr, fpcr, frame */
2242 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2244 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2247 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2250 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2251 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2252 /* SSE REX registers */
2253 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2257 /* The "default" register map used in 32bit mode. */
2259 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2261 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2262 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2263 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2264 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2265 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2266 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2267 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2270 /* The "default" register map used in 64bit mode. */
2272 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2274 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2275 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2276 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2277 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2278 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2279 8,9,10,11,12,13,14,15, /* extended integer registers */
2280 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2283 /* Define the register numbers to be used in Dwarf debugging information.
2284 The SVR4 reference port C compiler uses the following register numbers
2285 in its Dwarf output code:
2286 0 for %eax (gcc regno = 0)
2287 1 for %ecx (gcc regno = 2)
2288 2 for %edx (gcc regno = 1)
2289 3 for %ebx (gcc regno = 3)
2290 4 for %esp (gcc regno = 7)
2291 5 for %ebp (gcc regno = 6)
2292 6 for %esi (gcc regno = 4)
2293 7 for %edi (gcc regno = 5)
2294 The following three DWARF register numbers are never generated by
2295 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2296 believes these numbers have these meanings.
2297 8 for %eip (no gcc equivalent)
2298 9 for %eflags (gcc regno = 17)
2299 10 for %trapno (no gcc equivalent)
2300 It is not at all clear how we should number the FP stack registers
2301 for the x86 architecture. If the version of SDB on x86/svr4 were
2302 a bit less brain dead with respect to floating-point then we would
2303 have a precedent to follow with respect to DWARF register numbers
2304 for x86 FP registers, but the SDB on x86/svr4 is so completely
2305 broken with respect to FP registers that it is hardly worth thinking
2306 of it as something to strive for compatibility with.
2307 The version of x86/svr4 SDB I have at the moment does (partially)
2308 seem to believe that DWARF register number 11 is associated with
2309 the x86 register %st(0), but that's about all. Higher DWARF
2310 register numbers don't seem to be associated with anything in
2311 particular, and even for DWARF regno 11, SDB only seems to under-
2312 stand that it should say that a variable lives in %st(0) (when
2313 asked via an `=' command) if we said it was in DWARF regno 11,
2314 but SDB still prints garbage when asked for the value of the
2315 variable in question (via a `/' command).
2316 (Also note that the labels SDB prints for various FP stack regs
2317 when doing an `x' command are all wrong.)
2318 Note that these problems generally don't affect the native SVR4
2319 C compiler because it doesn't allow the use of -O with -g and
2320 because when it is *not* optimizing, it allocates a memory
2321 location for each floating-point variable, and the memory
2322 location is what gets described in the DWARF AT_location
2323 attribute for the variable in question.
2324 Regardless of the severe mental illness of the x86/svr4 SDB, we
2325 do something sensible here and we use the following DWARF
2326 register numbers. Note that these are all stack-top-relative
2328 11 for %st(0) (gcc regno = 8)
2329 12 for %st(1) (gcc regno = 9)
2330 13 for %st(2) (gcc regno = 10)
2331 14 for %st(3) (gcc regno = 11)
2332 15 for %st(4) (gcc regno = 12)
2333 16 for %st(5) (gcc regno = 13)
2334 17 for %st(6) (gcc regno = 14)
2335 18 for %st(7) (gcc regno = 15)
2337 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2339 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2340 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2341 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2342 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2343 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2344 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2345 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2348 /* Define parameter passing and return registers. */
2350 static int const x86_64_int_parameter_registers[6] =
2352 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
2355 static int const x86_64_ms_abi_int_parameter_registers[4] =
2357 CX_REG, DX_REG, R8_REG, R9_REG
2360 static int const x86_64_int_return_registers[4] =
2362 AX_REG, DX_REG, DI_REG, SI_REG
2365 /* Define the structure for the machine field in struct function. */
2367 struct GTY(()) stack_local_entry {
2368 unsigned short mode;
2371 struct stack_local_entry *next;
2374 /* Structure describing stack frame layout.
2375 Stack grows downward:
2381 saved static chain if ix86_static_chain_on_stack
2383 saved frame pointer if frame_pointer_needed
2384 <- HARD_FRAME_POINTER
2390 <- sse_regs_save_offset
2393 [va_arg registers] |
2397 [padding2] | = to_allocate
2406 int outgoing_arguments_size;
2407 HOST_WIDE_INT frame;
2409 /* The offsets relative to ARG_POINTER. */
2410 HOST_WIDE_INT frame_pointer_offset;
2411 HOST_WIDE_INT hard_frame_pointer_offset;
2412 HOST_WIDE_INT stack_pointer_offset;
2413 HOST_WIDE_INT hfp_save_offset;
2414 HOST_WIDE_INT reg_save_offset;
2415 HOST_WIDE_INT sse_reg_save_offset;
2417 /* When save_regs_using_mov is set, emit prologue using
2418 move instead of push instructions. */
2419 bool save_regs_using_mov;
2422 /* Which cpu are we scheduling for. */
2423 enum attr_cpu ix86_schedule;
2425 /* Which cpu are we optimizing for. */
2426 enum processor_type ix86_tune;
2428 /* Which instruction set architecture to use. */
2429 enum processor_type ix86_arch;
2431 /* True if processor has SSE prefetch instruction. */
2432 int x86_prefetch_sse;
2434 /* True if processor has prefetchw instruction. */
2437 /* -mstackrealign option */
2438 static const char ix86_force_align_arg_pointer_string[]
2439 = "force_align_arg_pointer";
2441 static rtx (*ix86_gen_leave) (void);
2442 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2443 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2444 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2445 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2446 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2447 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2448 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2449 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2450 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2452 /* Preferred alignment for stack boundary in bits. */
2453 unsigned int ix86_preferred_stack_boundary;
2455 /* Alignment for incoming stack boundary in bits specified at
2457 static unsigned int ix86_user_incoming_stack_boundary;
2459 /* Default alignment for incoming stack boundary in bits. */
2460 static unsigned int ix86_default_incoming_stack_boundary;
2462 /* Alignment for incoming stack boundary in bits. */
2463 unsigned int ix86_incoming_stack_boundary;
2465 /* Calling abi specific va_list type nodes. */
2466 static GTY(()) tree sysv_va_list_type_node;
2467 static GTY(()) tree ms_va_list_type_node;
2469 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2470 char internal_label_prefix[16];
2471 int internal_label_prefix_len;
2473 /* Fence to use after loop using movnt. */
2476 /* Register class used for passing given 64bit part of the argument.
2477 These represent classes as documented by the PS ABI, with the exception
2478 of SSESF, SSEDF classes, that are basically SSE class, just gcc will
2479 use SF or DFmode move instead of DImode to avoid reformatting penalties.
2481 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2482 whenever possible (upper half does contain padding). */
2483 enum x86_64_reg_class
2486 X86_64_INTEGER_CLASS,
2487 X86_64_INTEGERSI_CLASS,
2494 X86_64_COMPLEX_X87_CLASS,
2498 #define MAX_CLASSES 4
2500 /* Table of constants used by fldpi, fldln2, etc.... */
2501 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2502 static bool ext_80387_constants_init = 0;
2505 static struct machine_function * ix86_init_machine_status (void);
2506 static rtx ix86_function_value (const_tree, const_tree, bool);
2507 static bool ix86_function_value_regno_p (const unsigned int);
2508 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2510 static rtx ix86_static_chain (const_tree, bool);
2511 static int ix86_function_regparm (const_tree, const_tree);
2512 static void ix86_compute_frame_layout (struct ix86_frame *);
2513 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2515 static void ix86_add_new_builtins (HOST_WIDE_INT);
2516 static tree ix86_canonical_va_list_type (tree);
2517 static void predict_jump (int);
2518 static unsigned int split_stack_prologue_scratch_regno (void);
2519 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
2521 enum ix86_function_specific_strings
2523 IX86_FUNCTION_SPECIFIC_ARCH,
2524 IX86_FUNCTION_SPECIFIC_TUNE,
2525 IX86_FUNCTION_SPECIFIC_MAX
2528 static char *ix86_target_string (HOST_WIDE_INT, int, const char *,
2529 const char *, enum fpmath_unit, bool);
2530 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2531 static void ix86_function_specific_save (struct cl_target_option *);
2532 static void ix86_function_specific_restore (struct cl_target_option *);
2533 static void ix86_function_specific_print (FILE *, int,
2534 struct cl_target_option *);
2535 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2536 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2537 struct gcc_options *);
2538 static bool ix86_can_inline_p (tree, tree);
2539 static void ix86_set_current_function (tree);
2540 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2542 static enum calling_abi ix86_function_abi (const_tree);
2545 #ifndef SUBTARGET32_DEFAULT_CPU
2546 #define SUBTARGET32_DEFAULT_CPU "i386"
2549 /* The svr4 ABI for the i386 says that records and unions are returned
2551 #ifndef DEFAULT_PCC_STRUCT_RETURN
2552 #define DEFAULT_PCC_STRUCT_RETURN 1
2555 /* Whether -mtune= or -march= were specified */
2556 static int ix86_tune_defaulted;
2557 static int ix86_arch_specified;
2559 /* Vectorization library interface and handlers. */
2560 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2562 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2563 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2565 /* Processor target table, indexed by processor number */
2568 const struct processor_costs *cost; /* Processor costs */
2569 const int align_loop; /* Default alignments. */
2570 const int align_loop_max_skip;
2571 const int align_jump;
2572 const int align_jump_max_skip;
2573 const int align_func;
2576 static const struct ptt processor_target_table[PROCESSOR_max] =
2578 {&i386_cost, 4, 3, 4, 3, 4},
2579 {&i486_cost, 16, 15, 16, 15, 16},
2580 {&pentium_cost, 16, 7, 16, 7, 16},
2581 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2582 {&geode_cost, 0, 0, 0, 0, 0},
2583 {&k6_cost, 32, 7, 32, 7, 32},
2584 {&athlon_cost, 16, 7, 16, 7, 16},
2585 {&pentium4_cost, 0, 0, 0, 0, 0},
2586 {&k8_cost, 16, 7, 16, 7, 16},
2587 {&nocona_cost, 0, 0, 0, 0, 0},
2588 /* Core 2 32-bit. */
2589 {&generic32_cost, 16, 10, 16, 10, 16},
2590 /* Core 2 64-bit. */
2591 {&generic64_cost, 16, 10, 16, 10, 16},
2592 /* Core i7 32-bit. */
2593 {&generic32_cost, 16, 10, 16, 10, 16},
2594 /* Core i7 64-bit. */
2595 {&generic64_cost, 16, 10, 16, 10, 16},
2596 {&generic32_cost, 16, 7, 16, 7, 16},
2597 {&generic64_cost, 16, 10, 16, 10, 16},
2598 {&amdfam10_cost, 32, 24, 32, 7, 32},
2599 {&bdver1_cost, 32, 24, 32, 7, 32},
2600 {&bdver2_cost, 32, 24, 32, 7, 32},
2601 {&btver1_cost, 32, 24, 32, 7, 32},
2602 {&atom_cost, 16, 15, 16, 7, 16}
2605 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2635 /* Return true if a red-zone is in use. */
2638 ix86_using_red_zone (void)
2640 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2643 /* Return a string that documents the current -m options. The caller is
2644 responsible for freeing the string. */
2647 ix86_target_string (HOST_WIDE_INT isa, int flags, const char *arch,
2648 const char *tune, enum fpmath_unit fpmath,
2651 struct ix86_target_opts
2653 const char *option; /* option string */
2654 HOST_WIDE_INT mask; /* isa mask options */
2657 /* This table is ordered so that options like -msse4.2 that imply
2658 preceding options while match those first. */
2659 static struct ix86_target_opts isa_opts[] =
2661 { "-m64", OPTION_MASK_ISA_64BIT },
2662 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2663 { "-mfma", OPTION_MASK_ISA_FMA },
2664 { "-mxop", OPTION_MASK_ISA_XOP },
2665 { "-mlwp", OPTION_MASK_ISA_LWP },
2666 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2667 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2668 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2669 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2670 { "-msse3", OPTION_MASK_ISA_SSE3 },
2671 { "-msse2", OPTION_MASK_ISA_SSE2 },
2672 { "-msse", OPTION_MASK_ISA_SSE },
2673 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2674 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2675 { "-mmmx", OPTION_MASK_ISA_MMX },
2676 { "-mabm", OPTION_MASK_ISA_ABM },
2677 { "-mbmi", OPTION_MASK_ISA_BMI },
2678 { "-mbmi2", OPTION_MASK_ISA_BMI2 },
2679 { "-mlzcnt", OPTION_MASK_ISA_LZCNT },
2680 { "-mtbm", OPTION_MASK_ISA_TBM },
2681 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2682 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2683 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2684 { "-maes", OPTION_MASK_ISA_AES },
2685 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2686 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2687 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2688 { "-mf16c", OPTION_MASK_ISA_F16C },
2692 static struct ix86_target_opts flag_opts[] =
2694 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2695 { "-m80387", MASK_80387 },
2696 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2697 { "-malign-double", MASK_ALIGN_DOUBLE },
2698 { "-mcld", MASK_CLD },
2699 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2700 { "-mieee-fp", MASK_IEEE_FP },
2701 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2702 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2703 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2704 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2705 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2706 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2707 { "-mno-red-zone", MASK_NO_RED_ZONE },
2708 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2709 { "-mrecip", MASK_RECIP },
2710 { "-mrtd", MASK_RTD },
2711 { "-msseregparm", MASK_SSEREGPARM },
2712 { "-mstack-arg-probe", MASK_STACK_PROBE },
2713 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2714 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2715 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2716 { "-mvzeroupper", MASK_VZEROUPPER },
2717 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2718 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2719 { "-mprefer-avx128", MASK_PREFER_AVX128},
2722 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
2725 char target_other[40];
2734 memset (opts, '\0', sizeof (opts));
2736 /* Add -march= option. */
2739 opts[num][0] = "-march=";
2740 opts[num++][1] = arch;
2743 /* Add -mtune= option. */
2746 opts[num][0] = "-mtune=";
2747 opts[num++][1] = tune;
2750 /* Pick out the options in isa options. */
2751 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
2753 if ((isa & isa_opts[i].mask) != 0)
2755 opts[num++][0] = isa_opts[i].option;
2756 isa &= ~ isa_opts[i].mask;
2760 if (isa && add_nl_p)
2762 opts[num++][0] = isa_other;
2763 sprintf (isa_other, "(other isa: %#" HOST_WIDE_INT_PRINT "x)",
2767 /* Add flag options. */
2768 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
2770 if ((flags & flag_opts[i].mask) != 0)
2772 opts[num++][0] = flag_opts[i].option;
2773 flags &= ~ flag_opts[i].mask;
2777 if (flags && add_nl_p)
2779 opts[num++][0] = target_other;
2780 sprintf (target_other, "(other flags: %#x)", flags);
2783 /* Add -fpmath= option. */
2786 opts[num][0] = "-mfpmath=";
2787 switch ((int) fpmath)
2790 opts[num++][1] = "387";
2794 opts[num++][1] = "sse";
2797 case FPMATH_387 | FPMATH_SSE:
2798 opts[num++][1] = "sse+387";
2810 gcc_assert (num < ARRAY_SIZE (opts));
2812 /* Size the string. */
2814 sep_len = (add_nl_p) ? 3 : 1;
2815 for (i = 0; i < num; i++)
2818 for (j = 0; j < 2; j++)
2820 len += strlen (opts[i][j]);
2823 /* Build the string. */
2824 ret = ptr = (char *) xmalloc (len);
2827 for (i = 0; i < num; i++)
2831 for (j = 0; j < 2; j++)
2832 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2839 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2847 for (j = 0; j < 2; j++)
2850 memcpy (ptr, opts[i][j], len2[j]);
2852 line_len += len2[j];
2857 gcc_assert (ret + len >= ptr);
2862 /* Return true, if profiling code should be emitted before
2863 prologue. Otherwise it returns false.
2864 Note: For x86 with "hotfix" it is sorried. */
2866 ix86_profile_before_prologue (void)
2868 return flag_fentry != 0;
2871 /* Function that is callable from the debugger to print the current
2874 ix86_debug_options (void)
2876 char *opts = ix86_target_string (ix86_isa_flags, target_flags,
2877 ix86_arch_string, ix86_tune_string,
2882 fprintf (stderr, "%s\n\n", opts);
2886 fputs ("<no options>\n\n", stderr);
2891 /* Override various settings based on options. If MAIN_ARGS_P, the
2892 options are from the command line, otherwise they are from
2896 ix86_option_override_internal (bool main_args_p)
2899 unsigned int ix86_arch_mask, ix86_tune_mask;
2900 const bool ix86_tune_specified = (ix86_tune_string != NULL);
2905 #define PTA_3DNOW (HOST_WIDE_INT_1 << 0)
2906 #define PTA_3DNOW_A (HOST_WIDE_INT_1 << 1)
2907 #define PTA_64BIT (HOST_WIDE_INT_1 << 2)
2908 #define PTA_ABM (HOST_WIDE_INT_1 << 3)
2909 #define PTA_AES (HOST_WIDE_INT_1 << 4)
2910 #define PTA_AVX (HOST_WIDE_INT_1 << 5)
2911 #define PTA_BMI (HOST_WIDE_INT_1 << 6)
2912 #define PTA_CX16 (HOST_WIDE_INT_1 << 7)
2913 #define PTA_F16C (HOST_WIDE_INT_1 << 8)
2914 #define PTA_FMA (HOST_WIDE_INT_1 << 9)
2915 #define PTA_FMA4 (HOST_WIDE_INT_1 << 10)
2916 #define PTA_FSGSBASE (HOST_WIDE_INT_1 << 11)
2917 #define PTA_LWP (HOST_WIDE_INT_1 << 12)
2918 #define PTA_LZCNT (HOST_WIDE_INT_1 << 13)
2919 #define PTA_MMX (HOST_WIDE_INT_1 << 14)
2920 #define PTA_MOVBE (HOST_WIDE_INT_1 << 15)
2921 #define PTA_NO_SAHF (HOST_WIDE_INT_1 << 16)
2922 #define PTA_PCLMUL (HOST_WIDE_INT_1 << 17)
2923 #define PTA_POPCNT (HOST_WIDE_INT_1 << 18)
2924 #define PTA_PREFETCH_SSE (HOST_WIDE_INT_1 << 19)
2925 #define PTA_RDRND (HOST_WIDE_INT_1 << 20)
2926 #define PTA_SSE (HOST_WIDE_INT_1 << 21)
2927 #define PTA_SSE2 (HOST_WIDE_INT_1 << 22)
2928 #define PTA_SSE3 (HOST_WIDE_INT_1 << 23)
2929 #define PTA_SSE4_1 (HOST_WIDE_INT_1 << 24)
2930 #define PTA_SSE4_2 (HOST_WIDE_INT_1 << 25)
2931 #define PTA_SSE4A (HOST_WIDE_INT_1 << 26)
2932 #define PTA_SSSE3 (HOST_WIDE_INT_1 << 27)
2933 #define PTA_TBM (HOST_WIDE_INT_1 << 28)
2934 #define PTA_XOP (HOST_WIDE_INT_1 << 29)
2935 #define PTA_AVX2 (HOST_WIDE_INT_1 << 30)
2936 #define PTA_BMI2 (HOST_WIDE_INT_1 << 31)
2937 #define PTA_PREFETCHW (HOST_WIDE_INT_1 << 32)
2939 /* if this reaches 64, need to widen struct pta flags below */
2943 const char *const name; /* processor name or nickname. */
2944 const enum processor_type processor;
2945 const enum attr_cpu schedule;
2946 const unsigned HOST_WIDE_INT flags;
2948 const processor_alias_table[] =
2950 {"i386", PROCESSOR_I386, CPU_NONE, 0},
2951 {"i486", PROCESSOR_I486, CPU_NONE, 0},
2952 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2953 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2954 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
2955 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
2956 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2957 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2958 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_SSE},
2959 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2960 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2961 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX},
2962 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2964 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2966 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2967 PTA_MMX | PTA_SSE | PTA_SSE2},
2968 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
2969 PTA_MMX |PTA_SSE | PTA_SSE2},
2970 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
2971 PTA_MMX | PTA_SSE | PTA_SSE2},
2972 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
2973 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3},
2974 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
2975 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2976 | PTA_CX16 | PTA_NO_SAHF},
2977 {"core2", PROCESSOR_CORE2_64, CPU_CORE2,
2978 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2979 | PTA_SSSE3 | PTA_CX16},
2980 {"corei7", PROCESSOR_COREI7_64, CPU_COREI7,
2981 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2982 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16 | PTA_POPCNT},
2983 {"corei7-avx", PROCESSOR_COREI7_64, CPU_COREI7,
2984 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2985 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2986 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL},
2987 {"core-avx-i", PROCESSOR_COREI7_64, CPU_COREI7,
2988 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2989 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2990 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2991 | PTA_RDRND | PTA_F16C},
2992 {"core-avx2", PROCESSOR_COREI7_64, CPU_COREI7,
2993 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2994 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2
2995 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE
2996 | PTA_RDRND | PTA_F16C | PTA_BMI | PTA_BMI2 | PTA_LZCNT
2997 | PTA_FMA | PTA_MOVBE},
2998 {"atom", PROCESSOR_ATOM, CPU_ATOM,
2999 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3000 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE},
3001 {"geode", PROCESSOR_GEODE, CPU_GEODE,
3002 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3003 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
3004 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3005 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
3006 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
3007 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3008 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
3009 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
3010 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
3011 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3012 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
3013 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3014 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
3015 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
3016 {"x86-64", PROCESSOR_K8, CPU_K8,
3017 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},
3018 {"k8", PROCESSOR_K8, CPU_K8,
3019 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3020 | PTA_SSE2 | PTA_NO_SAHF},
3021 {"k8-sse3", PROCESSOR_K8, CPU_K8,
3022 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3023 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3024 {"opteron", PROCESSOR_K8, CPU_K8,
3025 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3026 | PTA_SSE2 | PTA_NO_SAHF},
3027 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3028 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3029 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3030 {"athlon64", PROCESSOR_K8, CPU_K8,
3031 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3032 | PTA_SSE2 | PTA_NO_SAHF},
3033 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3034 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3035 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3036 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3037 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3038 | PTA_SSE2 | PTA_NO_SAHF},
3039 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3040 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3041 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3042 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3043 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3044 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3045 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3046 PTA_64BIT | PTA_MMX | PTA_PREFETCHW | PTA_SSE | PTA_SSE2
3047 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3
3048 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3049 | PTA_FMA4 | PTA_XOP | PTA_LWP},
3050 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3051 PTA_64BIT | PTA_MMX | PTA_PREFETCHW | PTA_SSE | PTA_SSE2
3052 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3
3053 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3054 | PTA_FMA4 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3056 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
3057 PTA_64BIT | PTA_MMX | PTA_PREFETCHW | PTA_SSE | PTA_SSE2
3058 | PTA_SSE3 | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16},
3059 {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
3060 0 /* flags are only used for -march switch. */ },
3061 {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
3062 PTA_64BIT /* flags are only used for -march switch. */ },
3065 /* -mrecip options. */
3068 const char *string; /* option name */
3069 unsigned int mask; /* mask bits to set */
3071 const recip_options[] =
3073 { "all", RECIP_MASK_ALL },
3074 { "none", RECIP_MASK_NONE },
3075 { "div", RECIP_MASK_DIV },
3076 { "sqrt", RECIP_MASK_SQRT },
3077 { "vec-div", RECIP_MASK_VEC_DIV },
3078 { "vec-sqrt", RECIP_MASK_VEC_SQRT },
3081 int const pta_size = ARRAY_SIZE (processor_alias_table);
3083 /* Set up prefix/suffix so the error messages refer to either the command
3084 line argument, or the attribute(target). */
3093 prefix = "option(\"";
3098 #ifdef SUBTARGET_OVERRIDE_OPTIONS
3099 SUBTARGET_OVERRIDE_OPTIONS;
3102 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
3103 SUBSUBTARGET_OVERRIDE_OPTIONS;
3107 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3109 /* -fPIC is the default for x86_64. */
3110 if (TARGET_MACHO && TARGET_64BIT)
3113 /* Need to check -mtune=generic first. */
3114 if (ix86_tune_string)
3116 if (!strcmp (ix86_tune_string, "generic")
3117 || !strcmp (ix86_tune_string, "i686")
3118 /* As special support for cross compilers we read -mtune=native
3119 as -mtune=generic. With native compilers we won't see the
3120 -mtune=native, as it was changed by the driver. */
3121 || !strcmp (ix86_tune_string, "native"))
3124 ix86_tune_string = "generic64";
3126 ix86_tune_string = "generic32";
3128 /* If this call is for setting the option attribute, allow the
3129 generic32/generic64 that was previously set. */
3130 else if (!main_args_p
3131 && (!strcmp (ix86_tune_string, "generic32")
3132 || !strcmp (ix86_tune_string, "generic64")))
3134 else if (!strncmp (ix86_tune_string, "generic", 7))
3135 error ("bad value (%s) for %stune=%s %s",
3136 ix86_tune_string, prefix, suffix, sw);
3137 else if (!strcmp (ix86_tune_string, "x86-64"))
3138 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3139 "%stune=k8%s or %stune=generic%s instead as appropriate",
3140 prefix, suffix, prefix, suffix, prefix, suffix);
3144 if (ix86_arch_string)
3145 ix86_tune_string = ix86_arch_string;
3146 if (!ix86_tune_string)
3148 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3149 ix86_tune_defaulted = 1;
3152 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
3153 need to use a sensible tune option. */
3154 if (!strcmp (ix86_tune_string, "generic")
3155 || !strcmp (ix86_tune_string, "x86-64")
3156 || !strcmp (ix86_tune_string, "i686"))
3159 ix86_tune_string = "generic64";
3161 ix86_tune_string = "generic32";