1 /* Subroutines used for code generation on IA-32.
2 Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
3 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
4 Free Software Foundation, Inc.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3, or (at your option)
any later version.
13 GCC is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
24 #include "coretypes.h"
30 #include "hard-reg-set.h"
31 #include "insn-config.h"
32 #include "conditions.h"
34 #include "insn-codes.h"
35 #include "insn-attr.h"
42 #include "diagnostic-core.h"
44 #include "basic-block.h"
47 #include "target-def.h"
48 #include "langhooks.h"
53 #include "tm-constrs.h"
57 #include "dwarf2out.h"
58 #include "sched-int.h"
62 #include "diagnostic.h"
/* Dataflow lattice for the state of the upper 128 bits of the AVX
   registers.  NOTE(review): the enumerator list is not visible in this
   extraction; the code below uses at least `unknown', `unused' and
   `used' -- confirm against the full file.  */
64 enum upper_128bits_state
/* Per-basic-block record for the vzeroupper optimization, attached to
   each basic block through its AUX field (see BLOCK_INFO below).  */
71 typedef struct block_info_def
73 /* State of the upper 128bits of AVX registers at exit. */
74 enum upper_128bits_state state;
75 /* TRUE if state of the upper 128bits of AVX registers is unchanged
78 /* TRUE if block has been processed. */
80 /* TRUE if block has been scanned. */
82 /* Previous state of the upper 128bits of AVX registers at entry. */
83 enum upper_128bits_state prev;
/* Access the per-block record stored in basic block B's AUX field.  */
86 #define BLOCK_INFO(B) ((block_info) (B)->aux)
/* Encoding of how a call interacts with 256bit AVX registers; the
   value is read back with INTVAL from the vzeroupper pattern in
   move_or_delete_vzeroupper_2.  */
88 enum call_avx256_state
90 /* Callee returns 256bit AVX register. */
91 callee_return_avx256 = -1,
92 /* Callee returns and passes 256bit AVX register. */
93 callee_return_pass_avx256,
94 /* Callee passes 256bit AVX register. */
96 /* Callee doesn't return nor pass 256bit AVX register, or no
97 256bit AVX register in function return. */
99 /* vzeroupper intrinsic. */
103 /* Check if a 256bit AVX register is referenced in stores. */
/* note_stores callback: DEST is the location stored to, SET the
   enclosing rtx, DATA points at the caller's upper_128bits_state.
   Fires when a store writes a 256bit AVX mode, or when a SET copies
   from a register in a 256bit AVX mode.
   NOTE(review): the return type, the leading condition on DEST and the
   assignment through *state are missing from this extraction --
   presumably the match sets *state to `used'; confirm in the full
   file.  */
106 check_avx256_stores (rtx dest, const_rtx set, void *data)
109 && VALID_AVX256_REG_MODE (GET_MODE (dest)))
110 || (GET_CODE (set) == SET
111 && REG_P (SET_SRC (set))
112 && VALID_AVX256_REG_MODE (GET_MODE (SET_SRC (set)))))
114 enum upper_128bits_state *state
115 = (enum upper_128bits_state *) data;
120 /* Helper function for move_or_delete_vzeroupper_1. Look for vzeroupper
121 in basic block BB. Delete it if upper 128bit AVX registers are
122 unused. If it isn't deleted, move it to just before a jump insn.
124 STATE is state of the upper 128bits of AVX registers at entry. */
/* NOTE(review): numerous interior lines (declarations, braces, early
   returns) are missing from this extraction; comments below describe
   only what the surviving lines show.  */
127 move_or_delete_vzeroupper_2 (basic_block bb,
128 enum upper_128bits_state state)
/* Pending vzeroupper awaiting placement before the next jump/call.  */
131 rtx vzeroupper_insn = NULL_RTX;
/* Fast path: block is known not to change the upper-128bit state, so
   just propagate STATE to the exit record.  */
136 if (BLOCK_INFO (bb)->unchanged)
139 fprintf (dump_file, " [bb %i] unchanged: upper 128bits: %d\n",
142 BLOCK_INFO (bb)->state = state;
/* Already scanned with the same entry state: nothing new to learn.  */
146 if (BLOCK_INFO (bb)->scanned && BLOCK_INFO (bb)->prev == state)
149 fprintf (dump_file, " [bb %i] scanned: upper 128bits: %d\n",
150 bb->index, BLOCK_INFO (bb)->state);
/* Remember the entry state this scan was performed with.  */
154 BLOCK_INFO (bb)->prev = state;
157 fprintf (dump_file, " [bb %i] entry: upper 128bits: %d\n",
162 /* BB_END changes when it is deleted. */
163 bb_end = BB_END (bb);
/* Walk every insn in the block up to the cached end.  */
165 while (insn != bb_end)
167 insn = NEXT_INSN (insn);
169 if (!NONDEBUG_INSN_P (insn))
172 /* Move vzeroupper before jump/call. */
173 if (JUMP_P (insn) || CALL_P (insn))
175 if (!vzeroupper_insn)
/* Only reorder when the pending vzeroupper is not already
   immediately before the jump/call.  */
178 if (PREV_INSN (insn) != vzeroupper_insn)
182 fprintf (dump_file, "Move vzeroupper after:\n");
183 print_rtl_single (dump_file, PREV_INSN (insn));
184 fprintf (dump_file, "before:\n");
185 print_rtl_single (dump_file, insn);
187 reorder_insns_nobb (vzeroupper_insn, vzeroupper_insn,
190 vzeroupper_insn = NULL_RTX;
194 pat = PATTERN (insn);
196 /* Check insn for vzeroupper intrinsic. */
197 if (GET_CODE (pat) == UNSPEC_VOLATILE
198 && XINT (pat, 1) == UNSPECV_VZEROUPPER)
202 /* Found vzeroupper intrinsic. */
203 fprintf (dump_file, "Found vzeroupper:\n")
204 print_rtl_single (dump_file, insn);
209 /* Check insn for vzeroall intrinsic. */
210 if (GET_CODE (pat) == PARALLEL
211 && GET_CODE (XVECEXP (pat, 0, 0)) == UNSPEC_VOLATILE
212 && XINT (XVECEXP (pat, 0, 0), 1) == UNSPECV_VZEROALL)
217 /* Delete pending vzeroupper insertion. */
220 delete_insn (vzeroupper_insn);
221 vzeroupper_insn = NULL_RTX;
/* Otherwise scan the pattern's stores for 256bit AVX uses, which
   may flip STATE to `used'.  */
224 else if (state != used)
226 note_stores (pat, check_avx256_stores, &state);
233 /* Process vzeroupper intrinsic. */
/* Operand 0 of the vzeroupper pattern carries the call_avx256_state
   encoding recorded at expansion time.  */
234 avx256 = INTVAL (XVECEXP (pat, 0, 0));
238 /* Since the upper 128bits are cleared, callee must not pass
239 256bit AVX register. We only need to check if callee
240 returns 256bit AVX register. */
241 if (avx256 == callee_return_avx256)
247 /* Remove unnecessary vzeroupper since upper 128bits are
251 fprintf (dump_file, "Delete redundant vzeroupper:\n");
252 print_rtl_single (dump_file, insn);
258 /* Set state to UNUSED if callee doesn't return 256bit AVX
260 if (avx256 != callee_return_pass_avx256)
263 if (avx256 == callee_return_pass_avx256
264 || avx256 == callee_pass_avx256)
266 /* Must remove vzeroupper since callee passes in 256bit
270 fprintf (dump_file, "Delete callee pass vzeroupper:\n");
271 print_rtl_single (dump_file, insn);
/* Defer placement: remember this vzeroupper so it can be moved
   just before the next jump/call seen above.  */
277 vzeroupper_insn = insn;
/* Record the scan results in the per-block dataflow record.  */
283 BLOCK_INFO (bb)->state = state;
284 BLOCK_INFO (bb)->unchanged = unchanged;
285 BLOCK_INFO (bb)->scanned = true;
288 fprintf (dump_file, " [bb %i] exit: %s: upper 128bits: %d\n",
289 bb->index, unchanged ? "unchanged" : "changed",
293 /* Helper function for move_or_delete_vzeroupper. Process vzeroupper
294 in BLOCK and check its predecessor blocks. Treat UNKNOWN state
295 as USED if UNKNOWN_IS_UNUSED is true. Return TRUE if the exit
/* NOTE(review): parts of the comment, the declarations and the state
   merge over predecessors are missing from this extraction.  */
299 move_or_delete_vzeroupper_1 (basic_block block, bool unknown_is_unused)
303 enum upper_128bits_state state, old_state, new_state;
307 fprintf (dump_file, " Process [bb %i]: status: %d\n",
308 block->index, BLOCK_INFO (block)->processed);
/* A processed block's exit state is already final.  */
310 if (BLOCK_INFO (block)->processed)
315 /* Check all predecessor edges of this block. */
316 seen_unknown = false;
317 FOR_EACH_EDGE (e, ei, block->preds)
/* Merge the exit states of all predecessors into STATE.  */
321 switch (BLOCK_INFO (e->src)->state)
324 if (!unknown_is_unused)
/* Rescan the block with the merged entry state and compare exit
   states to decide whether the dataflow has converged here.  */
338 old_state = BLOCK_INFO (block)->state;
339 move_or_delete_vzeroupper_2 (block, state);
340 new_state = BLOCK_INFO (block)->state;
/* Exit state is final once the entry is known or the exit is `used'.  */
342 if (state != unknown || new_state == used)
343 BLOCK_INFO (block)->processed = true;
345 /* Need to rescan if the upper 128bits of AVX registers are changed
347 if (new_state != old_state)
349 if (new_state == used)
350 cfun->machine->rescan_vzeroupper_p = 1;
357 /* Go through the instruction stream looking for vzeroupper. Delete
358 it if upper 128bit AVX registers are unused. If it isn't deleted,
359 move it to just before a jump insn. */
/* Top-level driver: iterative worklist dataflow over the CFG, using a
   Fibonacci heap keyed on reverse-completion order so blocks are
   visited in a cache of the DFS order.  NOTE(review): several interior
   lines (declarations, loop headers, braces) are missing from this
   extraction.  */
362 move_or_delete_vzeroupper (void)
367 fibheap_t worklist, pending, fibheap_swap;
368 sbitmap visited, in_worklist, in_pending, sbitmap_swap;
373 /* Set up block info for each basic block. */
374 alloc_aux_for_blocks (sizeof (struct block_info_def));
376 /* Process outgoing edges of entry point. */
378 fprintf (dump_file, "Process outgoing edges of entry point\n");
/* Successors of the entry block get their entry state from whether
   the caller passes a 256bit AVX register.  */
380 FOR_EACH_EDGE (e, ei, ENTRY_BLOCK_PTR->succs)
382 move_or_delete_vzeroupper_2 (e->dest,
383 cfun->machine->caller_pass_avx256_p
385 BLOCK_INFO (e->dest)->processed = true;
388 /* Compute reverse completion order of depth first search of the CFG
389 so that the data-flow runs faster. */
390 rc_order = XNEWVEC (int, n_basic_blocks - NUM_FIXED_BLOCKS);
391 bb_order = XNEWVEC (int, last_basic_block);
392 pre_and_rev_post_order_compute (NULL, rc_order, false);
/* bb_order maps a block index to its priority in the heaps.  */
393 for (i = 0; i < n_basic_blocks - NUM_FIXED_BLOCKS; i++)
394 bb_order[rc_order[i]] = i;
397 worklist = fibheap_new ();
398 pending = fibheap_new ();
399 visited = sbitmap_alloc (last_basic_block);
400 in_worklist = sbitmap_alloc (last_basic_block);
401 in_pending = sbitmap_alloc (last_basic_block);
402 sbitmap_zero (in_worklist);
404 /* Don't check outgoing edges of entry point. */
/* Seed the pending set with every block except the already-processed
   entry successors.  */
405 sbitmap_ones (in_pending);
407 if (BLOCK_INFO (bb)->processed)
408 RESET_BIT (in_pending, bb->index);
411 move_or_delete_vzeroupper_1 (bb, false);
412 fibheap_insert (pending, bb_order[bb->index], bb);
416 fprintf (dump_file, "Check remaining basic blocks\n");
/* Outer iteration: swap pending/worklist each round until no block
   requests a rescan.  */
418 while (!fibheap_empty (pending))
420 fibheap_swap = pending;
422 worklist = fibheap_swap;
423 sbitmap_swap = in_pending;
424 in_pending = in_worklist;
425 in_worklist = sbitmap_swap;
427 sbitmap_zero (visited);
429 cfun->machine->rescan_vzeroupper_p = 0;
/* Inner iteration: drain this round's worklist in priority order.  */
431 while (!fibheap_empty (worklist))
433 bb = (basic_block) fibheap_extract_min (worklist);
434 RESET_BIT (in_worklist, bb->index);
435 gcc_assert (!TEST_BIT (visited, bb->index));
436 if (!TEST_BIT (visited, bb->index))
440 SET_BIT (visited, bb->index);
/* If BB's exit state changed, reschedule each unprocessed successor,
   either in this round or the next.  */
442 if (move_or_delete_vzeroupper_1 (bb, false))
443 FOR_EACH_EDGE (e, ei, bb->succs)
445 if (e->dest == EXIT_BLOCK_PTR
446 || BLOCK_INFO (e->dest)->processed)
449 if (TEST_BIT (visited, e->dest->index))
451 if (!TEST_BIT (in_pending, e->dest->index))
453 /* Send E->DEST to next round. */
454 SET_BIT (in_pending, e->dest->index);
455 fibheap_insert (pending,
456 bb_order[e->dest->index],
460 else if (!TEST_BIT (in_worklist, e->dest->index))
462 /* Add E->DEST to current round. */
463 SET_BIT (in_worklist, e->dest->index);
464 fibheap_insert (worklist, bb_order[e->dest->index],
/* Converged: no block asked for another scan.  */
471 if (!cfun->machine->rescan_vzeroupper_p)
476 fibheap_delete (worklist);
477 fibheap_delete (pending);
478 sbitmap_free (visited);
479 sbitmap_free (in_worklist);
480 sbitmap_free (in_pending);
483 fprintf (dump_file, "Process remaining basic blocks\n");
/* Final pass treats UNKNOWN as UNUSED to settle leftover blocks.  */
486 move_or_delete_vzeroupper_1 (bb, true);
488 free_aux_for_blocks ();
/* Forward declaration; the definition appears later in this file.  */
491 static rtx legitimize_dllimport_symbol (rtx, bool);
/* Default stack-probe limit when the target headers do not override it;
   -1 means no limit.  */
493 #ifndef CHECK_STACK_LIMIT
494 #define CHECK_STACK_LIMIT (-1)
497 /* Return index of given mode in mult and division cost tables. */
/* NOTE(review): the final arm of MODE_INDEX (the fallthrough index) is
   missing from this extraction.  */
498 #define MODE_INDEX(mode) \
499 ((mode) == QImode ? 0 \
500 : (mode) == HImode ? 1 \
501 : (mode) == SImode ? 2 \
502 : (mode) == DImode ? 3 \
505 /* Processor costs (relative to an add) */
506 /* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes. */
507 #define COSTS_N_BYTES(N) ((N) * 2)
/* Placeholder stringop strategy entry -- always fall back to a libcall;
   used where a table has no meaningful variant for a word size.  */
509 #define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
/* Cost table used when optimizing for size: entries are byte counts
   via COSTS_N_BYTES rather than relative insn latencies.  */
512 struct processor_costs ix86_size_cost = {/* costs for tuning for size */
513 COSTS_N_BYTES (2), /* cost of an add instruction */
514 COSTS_N_BYTES (3), /* cost of a lea instruction */
515 COSTS_N_BYTES (2), /* variable shift costs */
516 COSTS_N_BYTES (3), /* constant shift costs */
517 {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
518 COSTS_N_BYTES (3), /* HI */
519 COSTS_N_BYTES (3), /* SI */
520 COSTS_N_BYTES (3), /* DI */
521 COSTS_N_BYTES (5)}, /* other */
522 0, /* cost of multiply per each bit set */
523 {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
524 COSTS_N_BYTES (3), /* HI */
525 COSTS_N_BYTES (3), /* SI */
526 COSTS_N_BYTES (3), /* DI */
527 COSTS_N_BYTES (5)}, /* other */
528 COSTS_N_BYTES (3), /* cost of movsx */
529 COSTS_N_BYTES (3), /* cost of movzx */
530 0, /* "large" insn */
532 2, /* cost for loading QImode using movzbl */
533 {2, 2, 2}, /* cost of loading integer registers
534 in QImode, HImode and SImode.
535 Relative to reg-reg move (2). */
536 {2, 2, 2}, /* cost of storing integer registers */
537 2, /* cost of reg,reg fld/fst */
538 {2, 2, 2}, /* cost of loading fp registers
539 in SFmode, DFmode and XFmode */
540 {2, 2, 2}, /* cost of storing fp registers
541 in SFmode, DFmode and XFmode */
542 3, /* cost of moving MMX register */
543 {3, 3}, /* cost of loading MMX registers
544 in SImode and DImode */
545 {3, 3}, /* cost of storing MMX registers
546 in SImode and DImode */
547 3, /* cost of moving SSE register */
548 {3, 3, 3}, /* cost of loading SSE registers
549 in SImode, DImode and TImode */
550 {3, 3, 3}, /* cost of storing SSE registers
551 in SImode, DImode and TImode */
552 3, /* MMX or SSE register to integer */
553 0, /* size of l1 cache */
554 0, /* size of l2 cache */
555 0, /* size of prefetch block */
556 0, /* number of parallel prefetches */
558 COSTS_N_BYTES (2), /* cost of FADD and FSUB insns. */
559 COSTS_N_BYTES (2), /* cost of FMUL instruction. */
560 COSTS_N_BYTES (2), /* cost of FDIV instruction. */
561 COSTS_N_BYTES (2), /* cost of FABS instruction. */
562 COSTS_N_BYTES (2), /* cost of FCHS instruction. */
563 COSTS_N_BYTES (2), /* cost of FSQRT instruction. */
/* Stringop strategies: rep-prefixed byte moves are smallest.  */
564 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
565 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
566 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
567 {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
568 1, /* scalar_stmt_cost. */
569 1, /* scalar load_cost. */
570 1, /* scalar_store_cost. */
571 1, /* vec_stmt_cost. */
572 1, /* vec_to_scalar_cost. */
573 1, /* scalar_to_vec_cost. */
574 1, /* vec_align_load_cost. */
575 1, /* vec_unalign_load_cost. */
576 1, /* vec_store_cost. */
577 1, /* cond_taken_branch_cost. */
578 1, /* cond_not_taken_branch_cost. */
581 /* Processor costs (relative to an add) */
/* Speed-tuning cost table for the Intel 386; no 64bit stringop
   variants exist, hence DUMMY_STRINGOP_ALGS for those slots.  */
583 struct processor_costs i386_cost = { /* 386 specific costs */
584 COSTS_N_INSNS (1), /* cost of an add instruction */
585 COSTS_N_INSNS (1), /* cost of a lea instruction */
586 COSTS_N_INSNS (3), /* variable shift costs */
587 COSTS_N_INSNS (2), /* constant shift costs */
588 {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
589 COSTS_N_INSNS (6), /* HI */
590 COSTS_N_INSNS (6), /* SI */
591 COSTS_N_INSNS (6), /* DI */
592 COSTS_N_INSNS (6)}, /* other */
593 COSTS_N_INSNS (1), /* cost of multiply per each bit set */
594 {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
595 COSTS_N_INSNS (23), /* HI */
596 COSTS_N_INSNS (23), /* SI */
597 COSTS_N_INSNS (23), /* DI */
598 COSTS_N_INSNS (23)}, /* other */
599 COSTS_N_INSNS (3), /* cost of movsx */
600 COSTS_N_INSNS (2), /* cost of movzx */
601 15, /* "large" insn */
603 4, /* cost for loading QImode using movzbl */
604 {2, 4, 2}, /* cost of loading integer registers
605 in QImode, HImode and SImode.
606 Relative to reg-reg move (2). */
607 {2, 4, 2}, /* cost of storing integer registers */
608 2, /* cost of reg,reg fld/fst */
609 {8, 8, 8}, /* cost of loading fp registers
610 in SFmode, DFmode and XFmode */
611 {8, 8, 8}, /* cost of storing fp registers
612 in SFmode, DFmode and XFmode */
613 2, /* cost of moving MMX register */
614 {4, 8}, /* cost of loading MMX registers
615 in SImode and DImode */
616 {4, 8}, /* cost of storing MMX registers
617 in SImode and DImode */
618 2, /* cost of moving SSE register */
619 {4, 8, 16}, /* cost of loading SSE registers
620 in SImode, DImode and TImode */
621 {4, 8, 16}, /* cost of storing SSE registers
622 in SImode, DImode and TImode */
623 3, /* MMX or SSE register to integer */
624 0, /* size of l1 cache */
625 0, /* size of l2 cache */
626 0, /* size of prefetch block */
627 0, /* number of parallel prefetches */
629 COSTS_N_INSNS (23), /* cost of FADD and FSUB insns. */
630 COSTS_N_INSNS (27), /* cost of FMUL instruction. */
631 COSTS_N_INSNS (88), /* cost of FDIV instruction. */
632 COSTS_N_INSNS (22), /* cost of FABS instruction. */
633 COSTS_N_INSNS (24), /* cost of FCHS instruction. */
634 COSTS_N_INSNS (122), /* cost of FSQRT instruction. */
635 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
636 DUMMY_STRINGOP_ALGS},
637 {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
638 DUMMY_STRINGOP_ALGS},
639 1, /* scalar_stmt_cost. */
640 1, /* scalar load_cost. */
641 1, /* scalar_store_cost. */
642 1, /* vec_stmt_cost. */
643 1, /* vec_to_scalar_cost. */
644 1, /* scalar_to_vec_cost. */
645 1, /* vec_align_load_cost. */
646 2, /* vec_unalign_load_cost. */
647 1, /* vec_store_cost. */
648 3, /* cond_taken_branch_cost. */
649 1, /* cond_not_taken_branch_cost. */
/* Speed-tuning cost table for the Intel 486.  */
653 struct processor_costs i486_cost = { /* 486 specific costs */
654 COSTS_N_INSNS (1), /* cost of an add instruction */
655 COSTS_N_INSNS (1), /* cost of a lea instruction */
656 COSTS_N_INSNS (3), /* variable shift costs */
657 COSTS_N_INSNS (2), /* constant shift costs */
658 {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
659 COSTS_N_INSNS (12), /* HI */
660 COSTS_N_INSNS (12), /* SI */
661 COSTS_N_INSNS (12), /* DI */
662 COSTS_N_INSNS (12)}, /* other */
663 1, /* cost of multiply per each bit set */
664 {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
665 COSTS_N_INSNS (40), /* HI */
666 COSTS_N_INSNS (40), /* SI */
667 COSTS_N_INSNS (40), /* DI */
668 COSTS_N_INSNS (40)}, /* other */
669 COSTS_N_INSNS (3), /* cost of movsx */
670 COSTS_N_INSNS (2), /* cost of movzx */
671 15, /* "large" insn */
673 4, /* cost for loading QImode using movzbl */
674 {2, 4, 2}, /* cost of loading integer registers
675 in QImode, HImode and SImode.
676 Relative to reg-reg move (2). */
677 {2, 4, 2}, /* cost of storing integer registers */
678 2, /* cost of reg,reg fld/fst */
679 {8, 8, 8}, /* cost of loading fp registers
680 in SFmode, DFmode and XFmode */
681 {8, 8, 8}, /* cost of storing fp registers
682 in SFmode, DFmode and XFmode */
683 2, /* cost of moving MMX register */
684 {4, 8}, /* cost of loading MMX registers
685 in SImode and DImode */
686 {4, 8}, /* cost of storing MMX registers
687 in SImode and DImode */
688 2, /* cost of moving SSE register */
689 {4, 8, 16}, /* cost of loading SSE registers
690 in SImode, DImode and TImode */
691 {4, 8, 16}, /* cost of storing SSE registers
692 in SImode, DImode and TImode */
693 3, /* MMX or SSE register to integer */
694 4, /* size of l1 cache. 486 has 8kB cache
695 shared for code and data, so 4kB is
696 not really precise. */
697 4, /* size of l2 cache */
698 0, /* size of prefetch block */
699 0, /* number of parallel prefetches */
701 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
702 COSTS_N_INSNS (16), /* cost of FMUL instruction. */
703 COSTS_N_INSNS (73), /* cost of FDIV instruction. */
704 COSTS_N_INSNS (3), /* cost of FABS instruction. */
705 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
706 COSTS_N_INSNS (83), /* cost of FSQRT instruction. */
707 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
708 DUMMY_STRINGOP_ALGS},
709 {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
710 DUMMY_STRINGOP_ALGS},
711 1, /* scalar_stmt_cost. */
712 1, /* scalar load_cost. */
713 1, /* scalar_store_cost. */
714 1, /* vec_stmt_cost. */
715 1, /* vec_to_scalar_cost. */
716 1, /* scalar_to_vec_cost. */
717 1, /* vec_align_load_cost. */
718 2, /* vec_unalign_load_cost. */
719 1, /* vec_store_cost. */
720 3, /* cond_taken_branch_cost. */
721 1, /* cond_not_taken_branch_cost. */
/* Speed-tuning cost table for the Intel Pentium.  */
725 struct processor_costs pentium_cost = {
726 COSTS_N_INSNS (1), /* cost of an add instruction */
727 COSTS_N_INSNS (1), /* cost of a lea instruction */
728 COSTS_N_INSNS (4), /* variable shift costs */
729 COSTS_N_INSNS (1), /* constant shift costs */
730 {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
731 COSTS_N_INSNS (11), /* HI */
732 COSTS_N_INSNS (11), /* SI */
733 COSTS_N_INSNS (11), /* DI */
734 COSTS_N_INSNS (11)}, /* other */
735 0, /* cost of multiply per each bit set */
736 {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
737 COSTS_N_INSNS (25), /* HI */
738 COSTS_N_INSNS (25), /* SI */
739 COSTS_N_INSNS (25), /* DI */
740 COSTS_N_INSNS (25)}, /* other */
741 COSTS_N_INSNS (3), /* cost of movsx */
742 COSTS_N_INSNS (2), /* cost of movzx */
743 8, /* "large" insn */
745 6, /* cost for loading QImode using movzbl */
746 {2, 4, 2}, /* cost of loading integer registers
747 in QImode, HImode and SImode.
748 Relative to reg-reg move (2). */
749 {2, 4, 2}, /* cost of storing integer registers */
750 2, /* cost of reg,reg fld/fst */
751 {2, 2, 6}, /* cost of loading fp registers
752 in SFmode, DFmode and XFmode */
753 {4, 4, 6}, /* cost of storing fp registers
754 in SFmode, DFmode and XFmode */
755 8, /* cost of moving MMX register */
756 {8, 8}, /* cost of loading MMX registers
757 in SImode and DImode */
758 {8, 8}, /* cost of storing MMX registers
759 in SImode and DImode */
760 2, /* cost of moving SSE register */
761 {4, 8, 16}, /* cost of loading SSE registers
762 in SImode, DImode and TImode */
763 {4, 8, 16}, /* cost of storing SSE registers
764 in SImode, DImode and TImode */
765 3, /* MMX or SSE register to integer */
766 8, /* size of l1 cache. */
767 8, /* size of l2 cache */
768 0, /* size of prefetch block */
769 0, /* number of parallel prefetches */
771 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
772 COSTS_N_INSNS (3), /* cost of FMUL instruction. */
773 COSTS_N_INSNS (39), /* cost of FDIV instruction. */
774 COSTS_N_INSNS (1), /* cost of FABS instruction. */
775 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
776 COSTS_N_INSNS (70), /* cost of FSQRT instruction. */
777 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
778 DUMMY_STRINGOP_ALGS},
779 {{libcall, {{-1, rep_prefix_4_byte}}},
780 DUMMY_STRINGOP_ALGS},
781 1, /* scalar_stmt_cost. */
782 1, /* scalar load_cost. */
783 1, /* scalar_store_cost. */
784 1, /* vec_stmt_cost. */
785 1, /* vec_to_scalar_cost. */
786 1, /* scalar_to_vec_cost. */
787 1, /* vec_align_load_cost. */
788 2, /* vec_unalign_load_cost. */
789 1, /* vec_store_cost. */
790 3, /* cond_taken_branch_cost. */
791 1, /* cond_not_taken_branch_cost. */
/* Speed-tuning cost table for the Intel PentiumPro / P6 family.  */
795 struct processor_costs pentiumpro_cost = {
796 COSTS_N_INSNS (1), /* cost of an add instruction */
797 COSTS_N_INSNS (1), /* cost of a lea instruction */
798 COSTS_N_INSNS (1), /* variable shift costs */
799 COSTS_N_INSNS (1), /* constant shift costs */
800 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
801 COSTS_N_INSNS (4), /* HI */
802 COSTS_N_INSNS (4), /* SI */
803 COSTS_N_INSNS (4), /* DI */
804 COSTS_N_INSNS (4)}, /* other */
805 0, /* cost of multiply per each bit set */
806 {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
807 COSTS_N_INSNS (17), /* HI */
808 COSTS_N_INSNS (17), /* SI */
809 COSTS_N_INSNS (17), /* DI */
810 COSTS_N_INSNS (17)}, /* other */
811 COSTS_N_INSNS (1), /* cost of movsx */
812 COSTS_N_INSNS (1), /* cost of movzx */
813 8, /* "large" insn */
815 2, /* cost for loading QImode using movzbl */
816 {4, 4, 4}, /* cost of loading integer registers
817 in QImode, HImode and SImode.
818 Relative to reg-reg move (2). */
819 {2, 2, 2}, /* cost of storing integer registers */
820 2, /* cost of reg,reg fld/fst */
821 {2, 2, 6}, /* cost of loading fp registers
822 in SFmode, DFmode and XFmode */
823 {4, 4, 6}, /* cost of storing fp registers
824 in SFmode, DFmode and XFmode */
825 2, /* cost of moving MMX register */
826 {2, 2}, /* cost of loading MMX registers
827 in SImode and DImode */
828 {2, 2}, /* cost of storing MMX registers
829 in SImode and DImode */
830 2, /* cost of moving SSE register */
831 {2, 2, 8}, /* cost of loading SSE registers
832 in SImode, DImode and TImode */
833 {2, 2, 8}, /* cost of storing SSE registers
834 in SImode, DImode and TImode */
835 3, /* MMX or SSE register to integer */
836 8, /* size of l1 cache. */
837 256, /* size of l2 cache */
838 32, /* size of prefetch block */
839 6, /* number of parallel prefetches */
841 COSTS_N_INSNS (3), /* cost of FADD and FSUB insns. */
842 COSTS_N_INSNS (5), /* cost of FMUL instruction. */
843 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
844 COSTS_N_INSNS (2), /* cost of FABS instruction. */
845 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
846 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
847 /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
848 (we ensure the alignment). For small blocks inline loop is still a
849 noticeable win, for bigger blocks either rep movsl or rep movsb is
850 way to go. Rep movsb has apparently more expensive startup time in CPU,
851 but after 4K the difference is down in the noise. */
852 {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
853 {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
854 DUMMY_STRINGOP_ALGS},
855 {{rep_prefix_4_byte, {{1024, unrolled_loop},
856 {8192, rep_prefix_4_byte}, {-1, libcall}}},
857 DUMMY_STRINGOP_ALGS},
858 1, /* scalar_stmt_cost. */
859 1, /* scalar load_cost. */
860 1, /* scalar_store_cost. */
861 1, /* vec_stmt_cost. */
862 1, /* vec_to_scalar_cost. */
863 1, /* scalar_to_vec_cost. */
864 1, /* vec_align_load_cost. */
865 2, /* vec_unalign_load_cost. */
866 1, /* vec_store_cost. */
867 3, /* cond_taken_branch_cost. */
868 1, /* cond_not_taken_branch_cost. */
/* Speed-tuning cost table for the AMD Geode.  */
872 struct processor_costs geode_cost = {
873 COSTS_N_INSNS (1), /* cost of an add instruction */
874 COSTS_N_INSNS (1), /* cost of a lea instruction */
875 COSTS_N_INSNS (2), /* variable shift costs */
876 COSTS_N_INSNS (1), /* constant shift costs */
877 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
878 COSTS_N_INSNS (4), /* HI */
879 COSTS_N_INSNS (7), /* SI */
880 COSTS_N_INSNS (7), /* DI */
881 COSTS_N_INSNS (7)}, /* other */
882 0, /* cost of multiply per each bit set */
883 {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
884 COSTS_N_INSNS (23), /* HI */
885 COSTS_N_INSNS (39), /* SI */
886 COSTS_N_INSNS (39), /* DI */
887 COSTS_N_INSNS (39)}, /* other */
888 COSTS_N_INSNS (1), /* cost of movsx */
889 COSTS_N_INSNS (1), /* cost of movzx */
890 8, /* "large" insn */
892 1, /* cost for loading QImode using movzbl */
893 {1, 1, 1}, /* cost of loading integer registers
894 in QImode, HImode and SImode.
895 Relative to reg-reg move (2). */
896 {1, 1, 1}, /* cost of storing integer registers */
897 1, /* cost of reg,reg fld/fst */
898 {1, 1, 1}, /* cost of loading fp registers
899 in SFmode, DFmode and XFmode */
900 {4, 6, 6}, /* cost of storing fp registers
901 in SFmode, DFmode and XFmode */
903 1, /* cost of moving MMX register */
904 {1, 1}, /* cost of loading MMX registers
905 in SImode and DImode */
906 {1, 1}, /* cost of storing MMX registers
907 in SImode and DImode */
908 1, /* cost of moving SSE register */
909 {1, 1, 1}, /* cost of loading SSE registers
910 in SImode, DImode and TImode */
911 {1, 1, 1}, /* cost of storing SSE registers
912 in SImode, DImode and TImode */
913 1, /* MMX or SSE register to integer */
914 64, /* size of l1 cache. */
915 128, /* size of l2 cache. */
916 32, /* size of prefetch block */
917 1, /* number of parallel prefetches */
919 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
920 COSTS_N_INSNS (11), /* cost of FMUL instruction. */
921 COSTS_N_INSNS (47), /* cost of FDIV instruction. */
922 COSTS_N_INSNS (1), /* cost of FABS instruction. */
923 COSTS_N_INSNS (1), /* cost of FCHS instruction. */
924 COSTS_N_INSNS (54), /* cost of FSQRT instruction. */
925 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
926 DUMMY_STRINGOP_ALGS},
927 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
928 DUMMY_STRINGOP_ALGS},
929 1, /* scalar_stmt_cost. */
930 1, /* scalar load_cost. */
931 1, /* scalar_store_cost. */
932 1, /* vec_stmt_cost. */
933 1, /* vec_to_scalar_cost. */
934 1, /* scalar_to_vec_cost. */
935 1, /* vec_align_load_cost. */
936 2, /* vec_unalign_load_cost. */
937 1, /* vec_store_cost. */
938 3, /* cond_taken_branch_cost. */
939 1, /* cond_not_taken_branch_cost. */
/* Speed-tuning cost table for the AMD K6.  */
943 struct processor_costs k6_cost = {
944 COSTS_N_INSNS (1), /* cost of an add instruction */
945 COSTS_N_INSNS (2), /* cost of a lea instruction */
946 COSTS_N_INSNS (1), /* variable shift costs */
947 COSTS_N_INSNS (1), /* constant shift costs */
948 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
949 COSTS_N_INSNS (3), /* HI */
950 COSTS_N_INSNS (3), /* SI */
951 COSTS_N_INSNS (3), /* DI */
952 COSTS_N_INSNS (3)}, /* other */
953 0, /* cost of multiply per each bit set */
954 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
955 COSTS_N_INSNS (18), /* HI */
956 COSTS_N_INSNS (18), /* SI */
957 COSTS_N_INSNS (18), /* DI */
958 COSTS_N_INSNS (18)}, /* other */
959 COSTS_N_INSNS (2), /* cost of movsx */
960 COSTS_N_INSNS (2), /* cost of movzx */
961 8, /* "large" insn */
963 3, /* cost for loading QImode using movzbl */
964 {4, 5, 4}, /* cost of loading integer registers
965 in QImode, HImode and SImode.
966 Relative to reg-reg move (2). */
967 {2, 3, 2}, /* cost of storing integer registers */
968 4, /* cost of reg,reg fld/fst */
969 {6, 6, 6}, /* cost of loading fp registers
970 in SFmode, DFmode and XFmode */
971 {4, 4, 4}, /* cost of storing fp registers
972 in SFmode, DFmode and XFmode */
973 2, /* cost of moving MMX register */
974 {2, 2}, /* cost of loading MMX registers
975 in SImode and DImode */
976 {2, 2}, /* cost of storing MMX registers
977 in SImode and DImode */
978 2, /* cost of moving SSE register */
979 {2, 2, 8}, /* cost of loading SSE registers
980 in SImode, DImode and TImode */
981 {2, 2, 8}, /* cost of storing SSE registers
982 in SImode, DImode and TImode */
983 6, /* MMX or SSE register to integer */
984 32, /* size of l1 cache. */
985 32, /* size of l2 cache. Some models
986 have integrated l2 cache, but
987 optimizing for k6 is not important
988 enough to worry about that. */
989 32, /* size of prefetch block */
990 1, /* number of parallel prefetches */
992 COSTS_N_INSNS (2), /* cost of FADD and FSUB insns. */
993 COSTS_N_INSNS (2), /* cost of FMUL instruction. */
994 COSTS_N_INSNS (56), /* cost of FDIV instruction. */
995 COSTS_N_INSNS (2), /* cost of FABS instruction. */
996 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
997 COSTS_N_INSNS (56), /* cost of FSQRT instruction. */
998 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
999 DUMMY_STRINGOP_ALGS},
1000 {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
1001 DUMMY_STRINGOP_ALGS},
1002 1, /* scalar_stmt_cost. */
1003 1, /* scalar load_cost. */
1004 1, /* scalar_store_cost. */
1005 1, /* vec_stmt_cost. */
1006 1, /* vec_to_scalar_cost. */
1007 1, /* scalar_to_vec_cost. */
1008 1, /* vec_align_load_cost. */
1009 2, /* vec_unalign_load_cost. */
1010 1, /* vec_store_cost. */
1011 3, /* cond_taken_branch_cost. */
1012 1, /* cond_not_taken_branch_cost. */
/* Speed-tuning cost table for the AMD Athlon.  */
1016 struct processor_costs athlon_cost = {
1017 COSTS_N_INSNS (1), /* cost of an add instruction */
1018 COSTS_N_INSNS (2), /* cost of a lea instruction */
1019 COSTS_N_INSNS (1), /* variable shift costs */
1020 COSTS_N_INSNS (1), /* constant shift costs */
1021 {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
1022 COSTS_N_INSNS (5), /* HI */
1023 COSTS_N_INSNS (5), /* SI */
1024 COSTS_N_INSNS (5), /* DI */
1025 COSTS_N_INSNS (5)}, /* other */
1026 0, /* cost of multiply per each bit set */
1027 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1028 COSTS_N_INSNS (26), /* HI */
1029 COSTS_N_INSNS (42), /* SI */
1030 COSTS_N_INSNS (74), /* DI */
1031 COSTS_N_INSNS (74)}, /* other */
1032 COSTS_N_INSNS (1), /* cost of movsx */
1033 COSTS_N_INSNS (1), /* cost of movzx */
1034 8, /* "large" insn */
1036 4, /* cost for loading QImode using movzbl */
1037 {3, 4, 3}, /* cost of loading integer registers
1038 in QImode, HImode and SImode.
1039 Relative to reg-reg move (2). */
1040 {3, 4, 3}, /* cost of storing integer registers */
1041 4, /* cost of reg,reg fld/fst */
1042 {4, 4, 12}, /* cost of loading fp registers
1043 in SFmode, DFmode and XFmode */
1044 {6, 6, 8}, /* cost of storing fp registers
1045 in SFmode, DFmode and XFmode */
1046 2, /* cost of moving MMX register */
1047 {4, 4}, /* cost of loading MMX registers
1048 in SImode and DImode */
1049 {4, 4}, /* cost of storing MMX registers
1050 in SImode and DImode */
1051 2, /* cost of moving SSE register */
1052 {4, 4, 6}, /* cost of loading SSE registers
1053 in SImode, DImode and TImode */
1054 {4, 4, 5}, /* cost of storing SSE registers
1055 in SImode, DImode and TImode */
1056 5, /* MMX or SSE register to integer */
1057 64, /* size of l1 cache. */
1058 256, /* size of l2 cache. */
1059 64, /* size of prefetch block */
1060 6, /* number of parallel prefetches */
1061 5, /* Branch cost */
1062 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1063 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1064 COSTS_N_INSNS (24), /* cost of FDIV instruction. */
1065 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1066 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1067 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1068 /* For some reason, Athlon deals better with REP prefix (relative to loops)
1069 compared to K8. Alignment becomes important after 8 bytes for memcpy and
1070 128 bytes for memset. */
1071 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1072 DUMMY_STRINGOP_ALGS},
1073 {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
1074 DUMMY_STRINGOP_ALGS},
1075 1, /* scalar_stmt_cost. */
1076 1, /* scalar load_cost. */
1077 1, /* scalar_store_cost. */
1078 1, /* vec_stmt_cost. */
1079 1, /* vec_to_scalar_cost. */
1080 1, /* scalar_to_vec_cost. */
1081 1, /* vec_align_load_cost. */
1082 2, /* vec_unalign_load_cost. */
1083 1, /* vec_store_cost. */
1084 3, /* cond_taken_branch_cost. */
1085 1, /* cond_not_taken_branch_cost. */
/* Cost table used when tuning for AMD K8 ("k8"); see the REP-prefix note
   inside the table for the string-op rationale.
   NOTE(review): this excerpt is elided -- the embedded listing numbers skip
   (e.g. 1107->1109, 1137->1138, 1164->1167), so by comparison with the
   nocona/atom tables below an entry (apparently the MOVE_RATIO slot), a
   comment terminator after the prefetch note, and the closing "};" seem to
   be missing here.  Restore them from the complete i386.c before editing.  */
1089 struct processor_costs k8_cost = {
1090 COSTS_N_INSNS (1), /* cost of an add instruction */
1091 COSTS_N_INSNS (2), /* cost of a lea instruction */
1092 COSTS_N_INSNS (1), /* variable shift costs */
1093 COSTS_N_INSNS (1), /* constant shift costs */
1094 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1095 COSTS_N_INSNS (4), /* HI */
1096 COSTS_N_INSNS (3), /* SI */
1097 COSTS_N_INSNS (4), /* DI */
1098 COSTS_N_INSNS (5)}, /* other */
1099 0, /* cost of multiply per each bit set */
1100 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1101 COSTS_N_INSNS (26), /* HI */
1102 COSTS_N_INSNS (42), /* SI */
1103 COSTS_N_INSNS (74), /* DI */
1104 COSTS_N_INSNS (74)}, /* other */
1105 COSTS_N_INSNS (1), /* cost of movsx */
1106 COSTS_N_INSNS (1), /* cost of movzx */
1107 8, /* "large" insn */
1109 4, /* cost for loading QImode using movzbl */
1110 {3, 4, 3}, /* cost of loading integer registers
1111 in QImode, HImode and SImode.
1112 Relative to reg-reg move (2). */
1113 {3, 4, 3}, /* cost of storing integer registers */
1114 4, /* cost of reg,reg fld/fst */
1115 {4, 4, 12}, /* cost of loading fp registers
1116 in SFmode, DFmode and XFmode */
1117 {6, 6, 8}, /* cost of storing fp registers
1118 in SFmode, DFmode and XFmode */
1119 2, /* cost of moving MMX register */
1120 {3, 3}, /* cost of loading MMX registers
1121 in SImode and DImode */
1122 {4, 4}, /* cost of storing MMX registers
1123 in SImode and DImode */
1124 2, /* cost of moving SSE register */
1125 {4, 3, 6}, /* cost of loading SSE registers
1126 in SImode, DImode and TImode */
1127 {4, 4, 5}, /* cost of storing SSE registers
1128 in SImode, DImode and TImode */
1129 5, /* MMX or SSE register to integer */
1130 64, /* size of l1 cache. */
1131 512, /* size of l2 cache. */
1132 64, /* size of prefetch block */
1133 /* New AMD processors never drop prefetches; if they cannot be performed
1134 immediately, they are queued. We set number of simultaneous prefetches
1135 to a large constant to reflect this (it probably is not a good idea not
1136 to limit number of prefetches at all, as their execution also takes some
1138 100, /* number of parallel prefetches */
1139 3, /* Branch cost */
1140 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1141 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1142 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1143 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1144 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1145 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1146 /* K8 has optimized REP instruction for medium sized blocks, but for very
1147 small blocks it is better to use loop. For large blocks, libcall can
1148 do nontemporary accesses and beat inline considerably. */
1149 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1150 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1151 {{libcall, {{8, loop}, {24, unrolled_loop},
1152 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1153 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1154 4, /* scalar_stmt_cost. */
1155 2, /* scalar load_cost. */
1156 2, /* scalar_store_cost. */
1157 5, /* vec_stmt_cost. */
1158 0, /* vec_to_scalar_cost. */
1159 2, /* scalar_to_vec_cost. */
1160 2, /* vec_align_load_cost. */
1161 3, /* vec_unalign_load_cost. */
1162 3, /* vec_store_cost. */
1163 3, /* cond_taken_branch_cost. */
1164 2, /* cond_not_taken_branch_cost. */
/* Cost table used when tuning for AMD Family 10h ("amdfam10").
   NOTE(review): elided excerpt -- the bare "MOVD ... FSTORE/FADD" lines
   below (listing nos. 1209-1214) look like the interior of a block comment
   whose opening "/*" and closing terminator were lost, and the jumps
   1185->1187 / 1251->1254 suggest a missing field (apparently MOVE_RATIO)
   and the closing "};".  Verify against the complete i386.c.  */
1167 struct processor_costs amdfam10_cost = {
1168 COSTS_N_INSNS (1), /* cost of an add instruction */
1169 COSTS_N_INSNS (2), /* cost of a lea instruction */
1170 COSTS_N_INSNS (1), /* variable shift costs */
1171 COSTS_N_INSNS (1), /* constant shift costs */
1172 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1173 COSTS_N_INSNS (4), /* HI */
1174 COSTS_N_INSNS (3), /* SI */
1175 COSTS_N_INSNS (4), /* DI */
1176 COSTS_N_INSNS (5)}, /* other */
1177 0, /* cost of multiply per each bit set */
1178 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1179 COSTS_N_INSNS (35), /* HI */
1180 COSTS_N_INSNS (51), /* SI */
1181 COSTS_N_INSNS (83), /* DI */
1182 COSTS_N_INSNS (83)}, /* other */
1183 COSTS_N_INSNS (1), /* cost of movsx */
1184 COSTS_N_INSNS (1), /* cost of movzx */
1185 8, /* "large" insn */
1187 4, /* cost for loading QImode using movzbl */
1188 {3, 4, 3}, /* cost of loading integer registers
1189 in QImode, HImode and SImode.
1190 Relative to reg-reg move (2). */
1191 {3, 4, 3}, /* cost of storing integer registers */
1192 4, /* cost of reg,reg fld/fst */
1193 {4, 4, 12}, /* cost of loading fp registers
1194 in SFmode, DFmode and XFmode */
1195 {6, 6, 8}, /* cost of storing fp registers
1196 in SFmode, DFmode and XFmode */
1197 2, /* cost of moving MMX register */
1198 {3, 3}, /* cost of loading MMX registers
1199 in SImode and DImode */
1200 {4, 4}, /* cost of storing MMX registers
1201 in SImode and DImode */
1202 2, /* cost of moving SSE register */
1203 {4, 4, 3}, /* cost of loading SSE registers
1204 in SImode, DImode and TImode */
1205 {4, 4, 5}, /* cost of storing SSE registers
1206 in SImode, DImode and TImode */
1207 3, /* MMX or SSE register to integer */
1209 MOVD reg64, xmmreg Double FSTORE 4
1210 MOVD reg32, xmmreg Double FSTORE 4
1212 MOVD reg64, xmmreg Double FADD 3
1214 MOVD reg32, xmmreg Double FADD 3
1216 64, /* size of l1 cache. */
1217 512, /* size of l2 cache. */
1218 64, /* size of prefetch block */
1219 /* New AMD processors never drop prefetches; if they cannot be performed
1220 immediately, they are queued. We set number of simultaneous prefetches
1221 to a large constant to reflect this (it probably is not a good idea not
1222 to limit number of prefetches at all, as their execution also takes some
1224 100, /* number of parallel prefetches */
1225 2, /* Branch cost */
1226 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1227 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1228 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1229 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1230 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1231 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1233 /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
1234 very small blocks it is better to use loop. For large blocks, libcall can
1235 do nontemporary accesses and beat inline considerably. */
1236 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1237 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1238 {{libcall, {{8, loop}, {24, unrolled_loop},
1239 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1240 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1241 4, /* scalar_stmt_cost. */
1242 2, /* scalar load_cost. */
1243 2, /* scalar_store_cost. */
1244 6, /* vec_stmt_cost. */
1245 0, /* vec_to_scalar_cost. */
1246 2, /* scalar_to_vec_cost. */
1247 2, /* vec_align_load_cost. */
1248 2, /* vec_unalign_load_cost. */
1249 2, /* vec_store_cost. */
1250 2, /* cond_taken_branch_cost. */
1251 1, /* cond_not_taken_branch_cost. */
/* Cost table used when tuning for AMD Bulldozer version 1 ("bdver1").
   NOTE(review): elided excerpt -- same pattern as the amdfam10 table above:
   the bare "MOVD ..." lines have lost their comment wrapper, and the
   numbering gaps (1272->1274, 1294->1296, 1309->1311, 1338->1341) indicate
   a missing field, a comment terminator, and the closing "};".  */
1254 struct processor_costs bdver1_cost = {
1255 COSTS_N_INSNS (1), /* cost of an add instruction */
1256 COSTS_N_INSNS (1), /* cost of a lea instruction */
1257 COSTS_N_INSNS (1), /* variable shift costs */
1258 COSTS_N_INSNS (1), /* constant shift costs */
1259 {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
1260 COSTS_N_INSNS (4), /* HI */
1261 COSTS_N_INSNS (4), /* SI */
1262 COSTS_N_INSNS (6), /* DI */
1263 COSTS_N_INSNS (6)}, /* other */
1264 0, /* cost of multiply per each bit set */
1265 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1266 COSTS_N_INSNS (35), /* HI */
1267 COSTS_N_INSNS (51), /* SI */
1268 COSTS_N_INSNS (83), /* DI */
1269 COSTS_N_INSNS (83)}, /* other */
1270 COSTS_N_INSNS (1), /* cost of movsx */
1271 COSTS_N_INSNS (1), /* cost of movzx */
1272 8, /* "large" insn */
1274 4, /* cost for loading QImode using movzbl */
1275 {5, 5, 4}, /* cost of loading integer registers
1276 in QImode, HImode and SImode.
1277 Relative to reg-reg move (2). */
1278 {4, 4, 4}, /* cost of storing integer registers */
1279 2, /* cost of reg,reg fld/fst */
1280 {5, 5, 12}, /* cost of loading fp registers
1281 in SFmode, DFmode and XFmode */
1282 {4, 4, 8}, /* cost of storing fp registers
1283 in SFmode, DFmode and XFmode */
1284 2, /* cost of moving MMX register */
1285 {4, 4}, /* cost of loading MMX registers
1286 in SImode and DImode */
1287 {4, 4}, /* cost of storing MMX registers
1288 in SImode and DImode */
1289 2, /* cost of moving SSE register */
1290 {4, 4, 4}, /* cost of loading SSE registers
1291 in SImode, DImode and TImode */
1292 {4, 4, 4}, /* cost of storing SSE registers
1293 in SImode, DImode and TImode */
1294 2, /* MMX or SSE register to integer */
1296 MOVD reg64, xmmreg Double FSTORE 4
1297 MOVD reg32, xmmreg Double FSTORE 4
1299 MOVD reg64, xmmreg Double FADD 3
1301 MOVD reg32, xmmreg Double FADD 3
1303 16, /* size of l1 cache. */
1304 2048, /* size of l2 cache. */
1305 64, /* size of prefetch block */
1306 /* New AMD processors never drop prefetches; if they cannot be performed
1307 immediately, they are queued. We set number of simultaneous prefetches
1308 to a large constant to reflect this (it probably is not a good idea not
1309 to limit number of prefetches at all, as their execution also takes some
1311 100, /* number of parallel prefetches */
1312 2, /* Branch cost */
1313 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1314 COSTS_N_INSNS (6), /* cost of FMUL instruction. */
1315 COSTS_N_INSNS (42), /* cost of FDIV instruction. */
1316 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1317 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1318 COSTS_N_INSNS (52), /* cost of FSQRT instruction. */
1320 /* BDVER1 has optimized REP instruction for medium sized blocks, but for
1321 very small blocks it is better to use loop. For large blocks, libcall
1322 can do nontemporary accesses and beat inline considerably. */
1323 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1324 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1325 {{libcall, {{8, loop}, {24, unrolled_loop},
1326 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1327 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1328 6, /* scalar_stmt_cost. */
1329 4, /* scalar load_cost. */
1330 4, /* scalar_store_cost. */
1331 6, /* vec_stmt_cost. */
1332 0, /* vec_to_scalar_cost. */
1333 2, /* scalar_to_vec_cost. */
1334 4, /* vec_align_load_cost. */
1335 4, /* vec_unalign_load_cost. */
1336 4, /* vec_store_cost. */
1337 2, /* cond_taken_branch_cost. */
1338 1, /* cond_not_taken_branch_cost. */
/* Cost table used when tuning for AMD Bobcat version 1 ("btver1").
   NOTE(review): elided excerpt -- the bare "MOVD ..." lines have lost their
   comment wrapper, and numbering gaps (1359->1361, 1400->1402, 1420->1424)
   indicate a missing field and the closing "};".  */
1341 struct processor_costs btver1_cost = {
1342 COSTS_N_INSNS (1), /* cost of an add instruction */
1343 COSTS_N_INSNS (2), /* cost of a lea instruction */
1344 COSTS_N_INSNS (1), /* variable shift costs */
1345 COSTS_N_INSNS (1), /* constant shift costs */
1346 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1347 COSTS_N_INSNS (4), /* HI */
1348 COSTS_N_INSNS (3), /* SI */
1349 COSTS_N_INSNS (4), /* DI */
1350 COSTS_N_INSNS (5)}, /* other */
1351 0, /* cost of multiply per each bit set */
1352 {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
1353 COSTS_N_INSNS (35), /* HI */
1354 COSTS_N_INSNS (51), /* SI */
1355 COSTS_N_INSNS (83), /* DI */
1356 COSTS_N_INSNS (83)}, /* other */
1357 COSTS_N_INSNS (1), /* cost of movsx */
1358 COSTS_N_INSNS (1), /* cost of movzx */
1359 8, /* "large" insn */
1361 4, /* cost for loading QImode using movzbl */
1362 {3, 4, 3}, /* cost of loading integer registers
1363 in QImode, HImode and SImode.
1364 Relative to reg-reg move (2). */
1365 {3, 4, 3}, /* cost of storing integer registers */
1366 4, /* cost of reg,reg fld/fst */
1367 {4, 4, 12}, /* cost of loading fp registers
1368 in SFmode, DFmode and XFmode */
1369 {6, 6, 8}, /* cost of storing fp registers
1370 in SFmode, DFmode and XFmode */
1371 2, /* cost of moving MMX register */
1372 {3, 3}, /* cost of loading MMX registers
1373 in SImode and DImode */
1374 {4, 4}, /* cost of storing MMX registers
1375 in SImode and DImode */
1376 2, /* cost of moving SSE register */
1377 {4, 4, 3}, /* cost of loading SSE registers
1378 in SImode, DImode and TImode */
1379 {4, 4, 5}, /* cost of storing SSE registers
1380 in SImode, DImode and TImode */
1381 3, /* MMX or SSE register to integer */
1383 MOVD reg64, xmmreg Double FSTORE 4
1384 MOVD reg32, xmmreg Double FSTORE 4
1386 MOVD reg64, xmmreg Double FADD 3
1388 MOVD reg32, xmmreg Double FADD 3
1390 32, /* size of l1 cache. */
1391 512, /* size of l2 cache. */
1392 64, /* size of prefetch block */
1393 100, /* number of parallel prefetches */
1394 2, /* Branch cost */
1395 COSTS_N_INSNS (4), /* cost of FADD and FSUB insns. */
1396 COSTS_N_INSNS (4), /* cost of FMUL instruction. */
1397 COSTS_N_INSNS (19), /* cost of FDIV instruction. */
1398 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1399 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1400 COSTS_N_INSNS (35), /* cost of FSQRT instruction. */
1402 /* BTVER1 has optimized REP instruction for medium sized blocks, but for
1403 very small blocks it is better to use loop. For large blocks, libcall can
1404 do nontemporary accesses and beat inline considerably. */
1405 {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
1406 {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1407 {{libcall, {{8, loop}, {24, unrolled_loop},
1408 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1409 {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1410 4, /* scalar_stmt_cost. */
1411 2, /* scalar load_cost. */
1412 2, /* scalar_store_cost. */
1413 6, /* vec_stmt_cost. */
1414 0, /* vec_to_scalar_cost. */
1415 2, /* scalar_to_vec_cost. */
1416 2, /* vec_align_load_cost. */
1417 2, /* vec_unalign_load_cost. */
1418 2, /* vec_store_cost. */
1419 2, /* cond_taken_branch_cost. */
1420 1, /* cond_not_taken_branch_cost. */
/* Cost table used when tuning for Intel Pentium 4 ("pentium4").
   NOTE(review): elided excerpt -- numbering gaps (1442->1444, 1478->1480,
   1491->1495) indicate a missing field, the "{-1, libcall}}}" tail of the
   second string-op table, and the closing "};".  */
1424 struct processor_costs pentium4_cost = {
1425 COSTS_N_INSNS (1), /* cost of an add instruction */
1426 COSTS_N_INSNS (3), /* cost of a lea instruction */
1427 COSTS_N_INSNS (4), /* variable shift costs */
1428 COSTS_N_INSNS (4), /* constant shift costs */
1429 {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
1430 COSTS_N_INSNS (15), /* HI */
1431 COSTS_N_INSNS (15), /* SI */
1432 COSTS_N_INSNS (15), /* DI */
1433 COSTS_N_INSNS (15)}, /* other */
1434 0, /* cost of multiply per each bit set */
1435 {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
1436 COSTS_N_INSNS (56), /* HI */
1437 COSTS_N_INSNS (56), /* SI */
1438 COSTS_N_INSNS (56), /* DI */
1439 COSTS_N_INSNS (56)}, /* other */
1440 COSTS_N_INSNS (1), /* cost of movsx */
1441 COSTS_N_INSNS (1), /* cost of movzx */
1442 16, /* "large" insn */
1444 2, /* cost for loading QImode using movzbl */
1445 {4, 5, 4}, /* cost of loading integer registers
1446 in QImode, HImode and SImode.
1447 Relative to reg-reg move (2). */
1448 {2, 3, 2}, /* cost of storing integer registers */
1449 2, /* cost of reg,reg fld/fst */
1450 {2, 2, 6}, /* cost of loading fp registers
1451 in SFmode, DFmode and XFmode */
1452 {4, 4, 6}, /* cost of storing fp registers
1453 in SFmode, DFmode and XFmode */
1454 2, /* cost of moving MMX register */
1455 {2, 2}, /* cost of loading MMX registers
1456 in SImode and DImode */
1457 {2, 2}, /* cost of storing MMX registers
1458 in SImode and DImode */
1459 12, /* cost of moving SSE register */
1460 {12, 12, 12}, /* cost of loading SSE registers
1461 in SImode, DImode and TImode */
1462 {2, 2, 8}, /* cost of storing SSE registers
1463 in SImode, DImode and TImode */
1464 10, /* MMX or SSE register to integer */
1465 8, /* size of l1 cache. */
1466 256, /* size of l2 cache. */
1467 64, /* size of prefetch block */
1468 6, /* number of parallel prefetches */
1469 2, /* Branch cost */
1470 COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. */
1471 COSTS_N_INSNS (7), /* cost of FMUL instruction. */
1472 COSTS_N_INSNS (43), /* cost of FDIV instruction. */
1473 COSTS_N_INSNS (2), /* cost of FABS instruction. */
1474 COSTS_N_INSNS (2), /* cost of FCHS instruction. */
1475 COSTS_N_INSNS (43), /* cost of FSQRT instruction. */
1476 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1477 DUMMY_STRINGOP_ALGS},
1478 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1480 DUMMY_STRINGOP_ALGS},
1481 1, /* scalar_stmt_cost. */
1482 1, /* scalar load_cost. */
1483 1, /* scalar_store_cost. */
1484 1, /* vec_stmt_cost. */
1485 1, /* vec_to_scalar_cost. */
1486 1, /* scalar_to_vec_cost. */
1487 1, /* vec_align_load_cost. */
1488 2, /* vec_unalign_load_cost. */
1489 1, /* vec_store_cost. */
1490 3, /* cond_taken_branch_cost. */
1491 1, /* cond_not_taken_branch_cost. */
/* Cost table used when tuning for Intel Nocona ("nocona").
   NOTE(review): elided excerpt -- the numbering gaps (1550->1552,
   1564->1568) indicate the "{-1, libcall}}}" tail of the third string-op
   entry and the closing "};" are missing.  */
1495 struct processor_costs nocona_cost = {
1496 COSTS_N_INSNS (1), /* cost of an add instruction */
1497 COSTS_N_INSNS (1), /* cost of a lea instruction */
1498 COSTS_N_INSNS (1), /* variable shift costs */
1499 COSTS_N_INSNS (1), /* constant shift costs */
1500 {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
1501 COSTS_N_INSNS (10), /* HI */
1502 COSTS_N_INSNS (10), /* SI */
1503 COSTS_N_INSNS (10), /* DI */
1504 COSTS_N_INSNS (10)}, /* other */
1505 0, /* cost of multiply per each bit set */
1506 {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
1507 COSTS_N_INSNS (66), /* HI */
1508 COSTS_N_INSNS (66), /* SI */
1509 COSTS_N_INSNS (66), /* DI */
1510 COSTS_N_INSNS (66)}, /* other */
1511 COSTS_N_INSNS (1), /* cost of movsx */
1512 COSTS_N_INSNS (1), /* cost of movzx */
1513 16, /* "large" insn */
1514 17, /* MOVE_RATIO */
1515 4, /* cost for loading QImode using movzbl */
1516 {4, 4, 4}, /* cost of loading integer registers
1517 in QImode, HImode and SImode.
1518 Relative to reg-reg move (2). */
1519 {4, 4, 4}, /* cost of storing integer registers */
1520 3, /* cost of reg,reg fld/fst */
1521 {12, 12, 12}, /* cost of loading fp registers
1522 in SFmode, DFmode and XFmode */
1523 {4, 4, 4}, /* cost of storing fp registers
1524 in SFmode, DFmode and XFmode */
1525 6, /* cost of moving MMX register */
1526 {12, 12}, /* cost of loading MMX registers
1527 in SImode and DImode */
1528 {12, 12}, /* cost of storing MMX registers
1529 in SImode and DImode */
1530 6, /* cost of moving SSE register */
1531 {12, 12, 12}, /* cost of loading SSE registers
1532 in SImode, DImode and TImode */
1533 {12, 12, 12}, /* cost of storing SSE registers
1534 in SImode, DImode and TImode */
1535 8, /* MMX or SSE register to integer */
1536 8, /* size of l1 cache. */
1537 1024, /* size of l2 cache. */
1538 128, /* size of prefetch block */
1539 8, /* number of parallel prefetches */
1540 1, /* Branch cost */
1541 COSTS_N_INSNS (6), /* cost of FADD and FSUB insns. */
1542 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1543 COSTS_N_INSNS (40), /* cost of FDIV instruction. */
1544 COSTS_N_INSNS (3), /* cost of FABS instruction. */
1545 COSTS_N_INSNS (3), /* cost of FCHS instruction. */
1546 COSTS_N_INSNS (44), /* cost of FSQRT instruction. */
1547 {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
1548 {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
1549 {100000, unrolled_loop}, {-1, libcall}}}},
1550 {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
1552 {libcall, {{24, loop}, {64, unrolled_loop},
1553 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1554 1, /* scalar_stmt_cost. */
1555 1, /* scalar load_cost. */
1556 1, /* scalar_store_cost. */
1557 1, /* vec_stmt_cost. */
1558 1, /* vec_to_scalar_cost. */
1559 1, /* scalar_to_vec_cost. */
1560 1, /* vec_align_load_cost. */
1561 2, /* vec_unalign_load_cost. */
1562 1, /* vec_store_cost. */
1563 3, /* cond_taken_branch_cost. */
1564 1, /* cond_not_taken_branch_cost. */
/* Cost table used when tuning for Intel Atom ("atom").
   NOTE(review): elided excerpt -- the numbering gap 1637->1640 indicates
   the closing "};" is missing after the last field.  */
1568 struct processor_costs atom_cost = {
1569 COSTS_N_INSNS (1), /* cost of an add instruction */
1570 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1571 COSTS_N_INSNS (1), /* variable shift costs */
1572 COSTS_N_INSNS (1), /* constant shift costs */
1573 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1574 COSTS_N_INSNS (4), /* HI */
1575 COSTS_N_INSNS (3), /* SI */
1576 COSTS_N_INSNS (4), /* DI */
1577 COSTS_N_INSNS (2)}, /* other */
1578 0, /* cost of multiply per each bit set */
1579 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1580 COSTS_N_INSNS (26), /* HI */
1581 COSTS_N_INSNS (42), /* SI */
1582 COSTS_N_INSNS (74), /* DI */
1583 COSTS_N_INSNS (74)}, /* other */
1584 COSTS_N_INSNS (1), /* cost of movsx */
1585 COSTS_N_INSNS (1), /* cost of movzx */
1586 8, /* "large" insn */
1587 17, /* MOVE_RATIO */
1588 2, /* cost for loading QImode using movzbl */
1589 {4, 4, 4}, /* cost of loading integer registers
1590 in QImode, HImode and SImode.
1591 Relative to reg-reg move (2). */
1592 {4, 4, 4}, /* cost of storing integer registers */
1593 4, /* cost of reg,reg fld/fst */
1594 {12, 12, 12}, /* cost of loading fp registers
1595 in SFmode, DFmode and XFmode */
1596 {6, 6, 8}, /* cost of storing fp registers
1597 in SFmode, DFmode and XFmode */
1598 2, /* cost of moving MMX register */
1599 {8, 8}, /* cost of loading MMX registers
1600 in SImode and DImode */
1601 {8, 8}, /* cost of storing MMX registers
1602 in SImode and DImode */
1603 2, /* cost of moving SSE register */
1604 {8, 8, 8}, /* cost of loading SSE registers
1605 in SImode, DImode and TImode */
1606 {8, 8, 8}, /* cost of storing SSE registers
1607 in SImode, DImode and TImode */
1608 5, /* MMX or SSE register to integer */
1609 32, /* size of l1 cache. */
1610 256, /* size of l2 cache. */
1611 64, /* size of prefetch block */
1612 6, /* number of parallel prefetches */
1613 3, /* Branch cost */
1614 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1615 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1616 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1617 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1618 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1619 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1620 {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
1621 {libcall, {{32, loop}, {64, rep_prefix_4_byte},
1622 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1623 {{libcall, {{8, loop}, {15, unrolled_loop},
1624 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1625 {libcall, {{24, loop}, {32, unrolled_loop},
1626 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1627 1, /* scalar_stmt_cost. */
1628 1, /* scalar load_cost. */
1629 1, /* scalar_store_cost. */
1630 1, /* vec_stmt_cost. */
1631 1, /* vec_to_scalar_cost. */
1632 1, /* scalar_to_vec_cost. */
1633 1, /* vec_align_load_cost. */
1634 2, /* vec_unalign_load_cost. */
1635 1, /* vec_store_cost. */
1636 3, /* cond_taken_branch_cost. */
1637 1, /* cond_not_taken_branch_cost. */
/* NOTE(review): elided excerpt -- the numbering gaps (1640->1642,
   1714->1717) indicate a line after the header comment and the closing
   "};" are missing from this listing.  */
1640 /* Generic64 should produce code tuned for Nocona and K8. */
1642 struct processor_costs generic64_cost = {
1643 COSTS_N_INSNS (1), /* cost of an add instruction */
1644 /* On all chips taken into consideration lea is 2 cycles and more. With
1645 this cost however our current implementation of synth_mult results in
1646 use of unnecessary temporary registers causing regression on several
1647 SPECfp benchmarks. */
1648 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1649 COSTS_N_INSNS (1), /* variable shift costs */
1650 COSTS_N_INSNS (1), /* constant shift costs */
1651 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1652 COSTS_N_INSNS (4), /* HI */
1653 COSTS_N_INSNS (3), /* SI */
1654 COSTS_N_INSNS (4), /* DI */
1655 COSTS_N_INSNS (2)}, /* other */
1656 0, /* cost of multiply per each bit set */
1657 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1658 COSTS_N_INSNS (26), /* HI */
1659 COSTS_N_INSNS (42), /* SI */
1660 COSTS_N_INSNS (74), /* DI */
1661 COSTS_N_INSNS (74)}, /* other */
1662 COSTS_N_INSNS (1), /* cost of movsx */
1663 COSTS_N_INSNS (1), /* cost of movzx */
1664 8, /* "large" insn */
1665 17, /* MOVE_RATIO */
1666 4, /* cost for loading QImode using movzbl */
1667 {4, 4, 4}, /* cost of loading integer registers
1668 in QImode, HImode and SImode.
1669 Relative to reg-reg move (2). */
1670 {4, 4, 4}, /* cost of storing integer registers */
1671 4, /* cost of reg,reg fld/fst */
1672 {12, 12, 12}, /* cost of loading fp registers
1673 in SFmode, DFmode and XFmode */
1674 {6, 6, 8}, /* cost of storing fp registers
1675 in SFmode, DFmode and XFmode */
1676 2, /* cost of moving MMX register */
1677 {8, 8}, /* cost of loading MMX registers
1678 in SImode and DImode */
1679 {8, 8}, /* cost of storing MMX registers
1680 in SImode and DImode */
1681 2, /* cost of moving SSE register */
1682 {8, 8, 8}, /* cost of loading SSE registers
1683 in SImode, DImode and TImode */
1684 {8, 8, 8}, /* cost of storing SSE registers
1685 in SImode, DImode and TImode */
1686 5, /* MMX or SSE register to integer */
1687 32, /* size of l1 cache. */
1688 512, /* size of l2 cache. */
1689 64, /* size of prefetch block */
1690 6, /* number of parallel prefetches */
1691 /* Benchmarks shows large regressions on K8 sixtrack benchmark when this
1692 value is increased to perhaps more appropriate value of 5. */
1693 3, /* Branch cost */
1694 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1695 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1696 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1697 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1698 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1699 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1700 {DUMMY_STRINGOP_ALGS,
1701 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1702 {DUMMY_STRINGOP_ALGS,
1703 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1704 1, /* scalar_stmt_cost. */
1705 1, /* scalar load_cost. */
1706 1, /* scalar_store_cost. */
1707 1, /* vec_stmt_cost. */
1708 1, /* vec_to_scalar_cost. */
1709 1, /* scalar_to_vec_cost. */
1710 1, /* vec_align_load_cost. */
1711 2, /* vec_unalign_load_cost. */
1712 1, /* vec_store_cost. */
1713 3, /* cond_taken_branch_cost. */
1714 1, /* cond_not_taken_branch_cost. */
/* NOTE(review): elided excerpt -- the header comment below is cut off
   mid-sentence (its continuation and "*" terminator, listing nos.
   1718-1719, are missing), and the gap 1786->1789 indicates the closing
   "};" was also dropped.  */
1717 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
1720 struct processor_costs generic32_cost = {
1721 COSTS_N_INSNS (1), /* cost of an add instruction */
1722 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1723 COSTS_N_INSNS (1), /* variable shift costs */
1724 COSTS_N_INSNS (1), /* constant shift costs */
1725 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1726 COSTS_N_INSNS (4), /* HI */
1727 COSTS_N_INSNS (3), /* SI */
1728 COSTS_N_INSNS (4), /* DI */
1729 COSTS_N_INSNS (2)}, /* other */
1730 0, /* cost of multiply per each bit set */
1731 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1732 COSTS_N_INSNS (26), /* HI */
1733 COSTS_N_INSNS (42), /* SI */
1734 COSTS_N_INSNS (74), /* DI */
1735 COSTS_N_INSNS (74)}, /* other */
1736 COSTS_N_INSNS (1), /* cost of movsx */
1737 COSTS_N_INSNS (1), /* cost of movzx */
1738 8, /* "large" insn */
1739 17, /* MOVE_RATIO */
1740 4, /* cost for loading QImode using movzbl */
1741 {4, 4, 4}, /* cost of loading integer registers
1742 in QImode, HImode and SImode.
1743 Relative to reg-reg move (2). */
1744 {4, 4, 4}, /* cost of storing integer registers */
1745 4, /* cost of reg,reg fld/fst */
1746 {12, 12, 12}, /* cost of loading fp registers
1747 in SFmode, DFmode and XFmode */
1748 {6, 6, 8}, /* cost of storing fp registers
1749 in SFmode, DFmode and XFmode */
1750 2, /* cost of moving MMX register */
1751 {8, 8}, /* cost of loading MMX registers
1752 in SImode and DImode */
1753 {8, 8}, /* cost of storing MMX registers
1754 in SImode and DImode */
1755 2, /* cost of moving SSE register */
1756 {8, 8, 8}, /* cost of loading SSE registers
1757 in SImode, DImode and TImode */
1758 {8, 8, 8}, /* cost of storing SSE registers
1759 in SImode, DImode and TImode */
1760 5, /* MMX or SSE register to integer */
1761 32, /* size of l1 cache. */
1762 256, /* size of l2 cache. */
1763 64, /* size of prefetch block */
1764 6, /* number of parallel prefetches */
1765 3, /* Branch cost */
1766 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1767 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1768 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1769 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1770 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1771 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1772 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1773 DUMMY_STRINGOP_ALGS},
1774 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1775 DUMMY_STRINGOP_ALGS},
1776 1, /* scalar_stmt_cost. */
1777 1, /* scalar load_cost. */
1778 1, /* scalar_store_cost. */
1779 1, /* vec_stmt_cost. */
1780 1, /* vec_to_scalar_cost. */
1781 1, /* scalar_to_vec_cost. */
1782 1, /* vec_align_load_cost. */
1783 2, /* vec_unalign_load_cost. */
1784 1, /* vec_store_cost. */
1785 3, /* cond_taken_branch_cost. */
1786 1, /* cond_not_taken_branch_cost. */
/* Pointer to the cost table in effect; statically initialized to
   pentium_cost.  NOTE(review): presumably reassigned during option/tuning
   processing elsewhere in this file -- confirm against the full source.  */
1789 const struct processor_costs *ix86_cost = &pentium_cost;
1791 /* Processor feature/optimization bitmasks. */
/* One bit per PROCESSOR_* enumerator; masks are OR-ed together below to
   form multi-processor tuning sets.  */
1792 #define m_386 (1<<PROCESSOR_I386)
1793 #define m_486 (1<<PROCESSOR_I486)
1794 #define m_PENT (1<<PROCESSOR_PENTIUM)
1795 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1796 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1797 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1798 #define m_CORE2_32 (1<<PROCESSOR_CORE2_32)
1799 #define m_CORE2_64 (1<<PROCESSOR_CORE2_64)
1800 #define m_COREI7_32 (1<<PROCESSOR_COREI7_32)
1801 #define m_COREI7_64 (1<<PROCESSOR_COREI7_64)
/* Combined Core masks: both word sizes of Core i7, and Core2+Core i7.  */
1802 #define m_COREI7 (m_COREI7_32 | m_COREI7_64)
1803 #define m_CORE2I7_32 (m_CORE2_32 | m_COREI7_32)
1804 #define m_CORE2I7_64 (m_CORE2_64 | m_COREI7_64)
1805 #define m_CORE2I7 (m_CORE2I7_32 | m_CORE2I7_64)
1806 #define m_ATOM (1<<PROCESSOR_ATOM)
/* AMD (and Geode) processors and their combined masks.  */
1808 #define m_GEODE (1<<PROCESSOR_GEODE)
1809 #define m_K6 (1<<PROCESSOR_K6)
1810 #define m_K6_GEODE (m_K6 | m_GEODE)
1811 #define m_K8 (1<<PROCESSOR_K8)
1812 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1813 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1814 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1815 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1816 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1817 #define m_AMD_MULTIPLE (m_K8 | m_ATHLON | m_AMDFAM10 | m_BDVER1 | m_BTVER1)
1819 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1820 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1822 /* Generic instruction choice should be common subset of supported CPUs
1823 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
1824 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
1826 /* Feature tests against the various tunings. */
/* Indexed by X86_TUNE_*; per the comment that follows, filled in from
   initial_ix86_tune_features based on the active processor mask.  */
1827 unsigned char ix86_tune_features[X86_TUNE_LAST];
1829 /* Feature tests against the various tunings used to create ix86_tune_features
1830 based on the processor mask. */
1831 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
1832 /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
1833 negatively, so enabling for Generic64 seems like good code size
1834 tradeoff. We can't enable it for 32bit generic because it does not
1835 work well with PPro base chips. */
1836 m_386 | m_K6_GEODE | m_AMD_MULTIPLE | m_CORE2I7_64 | m_GENERIC64,
1838 /* X86_TUNE_PUSH_MEMORY */
1839 m_386 | m_K6_GEODE | m_AMD_MULTIPLE | m_PENT4
1840 | m_NOCONA | m_CORE2I7 | m_GENERIC,
1842 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1845 /* X86_TUNE_UNROLL_STRLEN */
1846 m_486 | m_PENT | m_ATOM | m_PPRO | m_AMD_MULTIPLE | m_K6
1847 | m_CORE2I7 | m_GENERIC,
1849 /* X86_TUNE_DEEP_BRANCH_PREDICTION */
1850 m_ATOM | m_PPRO | m_K6_GEODE | m_AMD_MULTIPLE | m_PENT4
1851 | m_CORE2I7 | m_GENERIC,
1853 /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
1854 on simulation result. But after P4 was made, no performance benefit
1855 was observed with branch hints. It also increases the code size.
1856 As a result, icc never generates branch hints. */
1859 /* X86_TUNE_DOUBLE_WITH_ADD */
1862 /* X86_TUNE_USE_SAHF */
1863 m_ATOM | m_PPRO | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER1 | m_BTVER1
1864 | m_PENT4 | m_NOCONA | m_CORE2I7 | m_GENERIC,
1866 /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
1867 partial dependencies. */
1868 m_AMD_MULTIPLE | m_ATOM | m_PPRO | m_PENT4 | m_NOCONA
1869 | m_CORE2I7 | m_GENERIC | m_GEODE /* m_386 | m_K6 */,
1871 /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
1872 register stalls on Generic32 compilation setting as well. However
1873 in current implementation the partial register stalls are not eliminated
1874 very well - they can be introduced via subregs synthesized by combine
1875 and can happen in caller/callee saving sequences. Because this option
1876 pays back little on PPro based chips and is in conflict with partial reg
1877 dependencies used by Athlon/P4 based chips, it is better to leave it off
1878 for generic32 for now. */
1881 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1882 m_CORE2I7 | m_GENERIC,
1884 /* X86_TUNE_USE_HIMODE_FIOP */
1885 m_386 | m_486 | m_K6_GEODE,
1887 /* X86_TUNE_USE_SIMODE_FIOP */
1888 ~(m_PPRO | m_AMD_MULTIPLE | m_PENT | m_ATOM | m_CORE2I7 | m_GENERIC),
1890 /* X86_TUNE_USE_MOV0 */
1893 /* X86_TUNE_USE_CLTD */
1894 ~(m_PENT | m_ATOM | m_K6 | m_CORE2I7 | m_GENERIC),
1896 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
1899 /* X86_TUNE_SPLIT_LONG_MOVES */
1902 /* X86_TUNE_READ_MODIFY_WRITE */
1905 /* X86_TUNE_READ_MODIFY */
1908 /* X86_TUNE_PROMOTE_QIMODE */
1909 m_K6_GEODE | m_PENT | m_ATOM | m_386 | m_486 | m_AMD_MULTIPLE
1910 | m_CORE2I7 | m_GENERIC /* | m_PENT4 ? */,
1912 /* X86_TUNE_FAST_PREFIX */
1913 ~(m_PENT | m_486 | m_386),
1915 /* X86_TUNE_SINGLE_STRINGOP */
1916 m_386 | m_PENT4 | m_NOCONA,
1918 /* X86_TUNE_QIMODE_MATH */
1921 /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
1922 register stalls. Just like X86_TUNE_PARTIAL_REG_STALL this option
1923 might be considered for Generic32 if our scheme for avoiding partial
1924 stalls was more effective. */
1927 /* X86_TUNE_PROMOTE_QI_REGS */
1930 /* X86_TUNE_PROMOTE_HI_REGS */
1933 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
1934 over esp addition. */
1935 m_386 | m_486 | m_PENT | m_PPRO,
1937 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
1938 over esp addition. */
1941 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
1942 over esp subtraction. */
1943 m_386 | m_486 | m_PENT | m_K6_GEODE,
1945 /* X86_TUNE_DOUBLE_PUSH. Enable if double push insn is preferred
1946 over esp subtraction. */
1947 m_PENT | m_K6_GEODE,
1949 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
1950 for DFmode copies */
1951 ~(m_AMD_MULTIPLE | m_ATOM | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2I7
1952 | m_GENERIC | m_GEODE),
1954 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
1955 m_AMD_MULTIPLE | m_ATOM | m_PENT4 | m_NOCONA | m_CORE2I7 | m_GENERIC,
1957 /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
1958 conflict here in between PPro/Pentium4 based chips that thread 128bit
1959 SSE registers as single units versus K8 based chips that divide SSE
1960 registers to two 64bit halves. This knob promotes all store destinations
1961 to be 128bit to allow register renaming on 128bit SSE units, but usually
1962 results in one extra microop on 64bit SSE units. Experimental results
1963 shows that disabling this option on P4 brings over 20% SPECfp regression,
1964 while enabling it on K8 brings roughly 2.4% regression that can be partly
1965 masked by careful scheduling of moves. */
1966 m_ATOM | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2I7 | m_GENERIC
1967 | m_AMDFAM10 | m_BDVER1,
1969 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
1970 m_AMDFAM10 | m_BDVER1 | m_BTVER1 | m_COREI7,
1972 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
1973 m_BDVER1 | m_COREI7,
1975 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
1978 /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
1979 are resolved on SSE register parts instead of whole registers, so we may
1980 maintain just lower part of scalar values in proper format leaving the
1981 upper part undefined. */
1984 /* X86_TUNE_SSE_TYPELESS_STORES */
1987 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
1988 m_PPRO | m_PENT4 | m_NOCONA,
1990 /* X86_TUNE_MEMORY_MISMATCH_STALL */
1991 m_AMD_MULTIPLE | m_ATOM | m_PENT4 | m_NOCONA | m_CORE2I7 | m_GENERIC,
1993 /* X86_TUNE_PROLOGUE_USING_MOVE */
1994 m_ATHLON_K8 | m_ATOM | m_PPRO | m_CORE2I7 | m_GENERIC,
1996 /* X86_TUNE_EPILOGUE_USING_MOVE */
1997 m_ATHLON_K8 | m_ATOM | m_PPRO | m_CORE2I7 | m_GENERIC,
1999 /* X86_TUNE_SHIFT1 */
2002 /* X86_TUNE_USE_FFREEP */
2005 /* X86_TUNE_INTER_UNIT_MOVES */
2006 ~(m_AMD_MULTIPLE | m_GENERIC),
2008 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
2009 ~(m_AMDFAM10 | m_BDVER1),
2011 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
2012 than 4 branch instructions in the 16 byte window. */
2013 m_ATOM | m_PPRO | m_AMD_MULTIPLE | m_PENT4 | m_NOCONA | m_CORE2I7
2016 /* X86_TUNE_SCHEDULE */
2017 m_PPRO | m_AMD_MULTIPLE | m_K6_GEODE | m_PENT | m_ATOM | m_CORE2I7
2020 /* X86_TUNE_USE_BT */
2021 m_AMD_MULTIPLE | m_ATOM | m_CORE2I7 | m_GENERIC,
2023 /* X86_TUNE_USE_INCDEC */
2024 ~(m_PENT4 | m_NOCONA | m_CORE2I7 | m_GENERIC | m_ATOM),
2026 /* X86_TUNE_PAD_RETURNS */
2027 m_AMD_MULTIPLE | m_CORE2I7 | m_GENERIC,
2029 /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short funtion. */
2032 /* X86_TUNE_EXT_80387_CONSTANTS */
2033 m_K6_GEODE | m_ATHLON_K8 | m_ATOM | m_PENT4 | m_NOCONA | m_PPRO
2034 | m_CORE2I7 | m_GENERIC,
2036 /* X86_TUNE_SHORTEN_X87_SSE */
2039 /* X86_TUNE_AVOID_VECTOR_DECODE */
2040 m_K8 | m_CORE2I7_64 | m_GENERIC64,
2042 /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode
2043 and SImode multiply, but 386 and 486 do HImode multiply faster. */
2046 /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is
2047 vector path on AMD machines. */
2048 m_K8 | m_CORE2I7_64 | m_GENERIC64 | m_AMDFAM10 | m_BDVER1 | m_BTVER1,
2050 /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD
2052 m_K8 | m_CORE2I7_64 | m_GENERIC64 | m_AMDFAM10 | m_BDVER1 | m_BTVER1,
2054 /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
2058 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
2059 but one byte longer. */
2062 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory
2063 operand that cannot be represented using a modRM byte. The XOR
2064 replacement is long decoded, so this split helps here as well. */
2067 /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
2069 m_AMDFAM10 | m_CORE2I7 | m_GENERIC,
2071 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
2072 from integer to FP. */
2075 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
2076 with a subsequent conditional jump instruction into a single
2077 compare-and-branch uop. */
2080 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
2081 will impact LEA instruction selection. */
2084 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
2088 /* X86_SOFTARE_PREFETCHING_BENEFICIAL: Enable software prefetching
2089 at -O3. For the moment, the prefetching seems badly tuned for Intel
2091 m_K6_GEODE | m_AMD_MULTIPLE
2094 /* Feature tests against the various architecture variations. */
/* Filled in from initial_ix86_arch_features, indexed by X86_ARCH_*. */
2095 unsigned char ix86_arch_features[X86_ARCH_LAST];
2097 /* Feature tests against the various architecture variations, used to create
2098 ix86_arch_features based on the processor mask. */
/* NOTE(review): positional table keyed to the X86_ARCH_* enum order.  In
   this excerpt the mask values for several entries (and the closing brace)
   appear elided; verify against the full file. */
2099 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2100 /* X86_ARCH_CMOVE: Conditional move was added for pentiumpro. */
2101 ~(m_386 | m_486 | m_PENT | m_K6),
2103 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2106 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2109 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2112 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
/* CPU mask: targets where accumulating outgoing args is preferred. */
2116 static const unsigned int x86_accumulate_outgoing_args
2117 = m_AMD_MULTIPLE | m_ATOM | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2I7
/* CPU mask: targets where the 80387 "fancy math" insns are always usable. */
2120 static const unsigned int x86_arch_always_fancy_math_387
2121 = m_PENT | m_ATOM | m_PPRO | m_AMD_MULTIPLE | m_PENT4
2122 | m_NOCONA | m_CORE2I7 | m_GENERIC;
2124 /* In case the average insn count for single function invocation is
2125 lower than this constant, emit fast (but longer) prologue and
2127 #define FAST_PROLOGUE_INSN_COUNT 20
2129 /* Names for 8 (low), 8 (high), and 16-bit registers, respectively. */
/* The *_REGISTER_NAMES macros come from the target headers (i386.h). */
2130 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2131 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2132 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
2134 /* Array of the smallest class containing reg number REGNO, indexed by
2135 REGNO. Used by REGNO_REG_CLASS in i386.h. */
/* NOTE(review): positional by hard-register number; some rows and the
   closing brace appear elided in this excerpt. */
2137 enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
2139 /* ax, dx, cx, bx */
2140 AREG, DREG, CREG, BREG,
2141 /* si, di, bp, sp */
2142 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
/* FP stack registers: st(0) and st(1) get their own classes. */
2144 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2145 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2148 /* flags, fpsr, fpcr, frame */
2149 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
/* SSE registers: xmm0 is distinguished as SSE_FIRST_REG. */
2151 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2154 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
/* REX integer registers r8-r15: usable but not byte-addressable as q-regs. */
2157 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2158 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2159 /* SSE REX registers */
2160 SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2164 /* The "default" register map used in 32bit mode. */
/* Maps GCC hard-register numbers to debugger (DBX/stabs) register numbers;
   -1 marks registers with no debugger encoding. */
2166 int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
2168 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2169 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2170 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2171 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2172 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2173 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2174 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2177 /* The "default" register map used in 64bit mode. */
/* Same mapping for 64-bit mode; note the general regs are identity-mapped
   and the REX registers (r8-r15, xmm8-xmm15) have debugger numbers here. */
2179 int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
2181 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2182 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2183 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2184 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2185 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
2186 8,9,10,11,12,13,14,15, /* extended integer registers */
2187 25, 26, 27, 28, 29, 30, 31, 32, /* extended SSE registers */
2190 /* Define the register numbers to be used in Dwarf debugging information.
2191 The SVR4 reference port C compiler uses the following register numbers
2192 in its Dwarf output code:
2193 0 for %eax (gcc regno = 0)
2194 1 for %ecx (gcc regno = 2)
2195 2 for %edx (gcc regno = 1)
2196 3 for %ebx (gcc regno = 3)
2197 4 for %esp (gcc regno = 7)
2198 5 for %ebp (gcc regno = 6)
2199 6 for %esi (gcc regno = 4)
2200 7 for %edi (gcc regno = 5)
2201 The following three DWARF register numbers are never generated by
2202 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2203 believes these numbers have these meanings.
2204 8 for %eip (no gcc equivalent)
2205 9 for %eflags (gcc regno = 17)
2206 10 for %trapno (no gcc equivalent)
2207 It is not at all clear how we should number the FP stack registers
2208 for the x86 architecture. If the version of SDB on x86/svr4 were
2209 a bit less brain dead with respect to floating-point then we would
2210 have a precedent to follow with respect to DWARF register numbers
2211 for x86 FP registers, but the SDB on x86/svr4 is so completely
2212 broken with respect to FP registers that it is hardly worth thinking
2213 of it as something to strive for compatibility with.
2214 The version of x86/svr4 SDB I have at the moment does (partially)
2215 seem to believe that DWARF register number 11 is associated with
2216 the x86 register %st(0), but that's about all. Higher DWARF
2217 register numbers don't seem to be associated with anything in
2218 particular, and even for DWARF regno 11, SDB only seems to under-
2219 stand that it should say that a variable lives in %st(0) (when
2220 asked via an `=' command) if we said it was in DWARF regno 11,
2221 but SDB still prints garbage when asked for the value of the
2222 variable in question (via a `/' command).
2223 (Also note that the labels SDB prints for various FP stack regs
2224 when doing an `x' command are all wrong.)
2225 Note that these problems generally don't affect the native SVR4
2226 C compiler because it doesn't allow the use of -O with -g and
2227 because when it is *not* optimizing, it allocates a memory
2228 location for each floating-point variable, and the memory
2229 location is what gets described in the DWARF AT_location
2230 attribute for the variable in question.
2231 Regardless of the severe mental illness of the x86/svr4 SDB, we
2232 do something sensible here and we use the following DWARF
2233 register numbers. Note that these are all stack-top-relative
2235 11 for %st(0) (gcc regno = 8)
2236 12 for %st(1) (gcc regno = 9)
2237 13 for %st(2) (gcc regno = 10)
2238 14 for %st(3) (gcc regno = 11)
2239 15 for %st(4) (gcc regno = 12)
2240 16 for %st(5) (gcc regno = 13)
2241 17 for %st(6) (gcc regno = 14)
2242 18 for %st(7) (gcc regno = 15)
/* DWARF register numbering used by the SVR4 ABI; see the long comment
   above for the rationale, including the %esi/%edi swap relative to
   dbx_register_map and the use of 9 for %eflags. */
2244 int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
2246 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2247 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2248 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2249 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2250 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2251 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
2252 -1, -1, -1, -1, -1, -1, -1, -1, /* extended SSE registers */
2255 /* Define parameter passing and return registers. */
/* SysV AMD64 ABI: first six integer arguments go in rdi, rsi, rdx, rcx,
   r8, r9 (listed here by GCC register id). */
2257 static int const x86_64_int_parameter_registers[6] =
2259 DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
/* Microsoft x64 ABI: only four integer argument registers. */
2262 static int const x86_64_ms_abi_int_parameter_registers[4] =
2264 CX_REG, DX_REG, R8_REG, R9_REG
2267 static int const x86_64_int_return_registers[4] =
2269 AX_REG, DX_REG, DI_REG, SI_REG
2272 /* Define the structure for the machine field in struct function. */
/* NOTE(review): some members of this struct appear elided in this excerpt. */
2274 struct GTY(()) stack_local_entry {
2275 unsigned short mode;
2278 struct stack_local_entry *next;
2281 /* Structure describing stack frame layout.
2282 Stack grows downward:
2288 saved static chain if ix86_static_chain_on_stack
2290 saved frame pointer if frame_pointer_needed
2291 <- HARD_FRAME_POINTER
2297 <- sse_regs_save_offset
2300 [va_arg registers] |
2304 [padding2] | = to_allocate
/* NOTE(review): the struct's opening line and several size fields appear
   elided in this excerpt; the fields below are offsets computed by
   ix86_compute_frame_layout. */
2313 int outgoing_arguments_size;
2314 HOST_WIDE_INT frame;
2316 /* The offsets relative to ARG_POINTER. */
2317 HOST_WIDE_INT frame_pointer_offset;
2318 HOST_WIDE_INT hard_frame_pointer_offset;
2319 HOST_WIDE_INT stack_pointer_offset;
2320 HOST_WIDE_INT hfp_save_offset;
2321 HOST_WIDE_INT reg_save_offset;
2322 HOST_WIDE_INT sse_reg_save_offset;
2324 /* When save_regs_using_mov is set, emit prologue using
2325 move instead of push instructions. */
2326 bool save_regs_using_mov;
2329 /* Which cpu are we scheduling for. */
2330 enum attr_cpu ix86_schedule;
2332 /* Which cpu are we optimizing for. */
2333 enum processor_type ix86_tune;
2335 /* Which instruction set architecture to use. */
2336 enum processor_type ix86_arch;
2338 /* true if sse prefetch instruction is not NOOP. */
2339 int x86_prefetch_sse;
2341 /* -mstackrealign option */
2342 static const char ix86_force_align_arg_pointer_string[]
2343 = "force_align_arg_pointer";
/* Generator function pointers selected per word size (32- vs 64-bit
   patterns) so the prologue/epilogue code can be mode-agnostic. */
2345 static rtx (*ix86_gen_leave) (void);
2346 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2347 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2348 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2349 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2350 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2351 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2352 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2353 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2354 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2356 /* Preferred alignment for stack boundary in bits. */
2357 unsigned int ix86_preferred_stack_boundary;
2359 /* Alignment for incoming stack boundary in bits specified at
2361 static unsigned int ix86_user_incoming_stack_boundary;
2363 /* Default alignment for incoming stack boundary in bits. */
2364 static unsigned int ix86_default_incoming_stack_boundary;
2366 /* Alignment for incoming stack boundary in bits. */
2367 unsigned int ix86_incoming_stack_boundary;
2369 /* Calling abi specific va_list type nodes. */
2370 static GTY(()) tree sysv_va_list_type_node;
2371 static GTY(()) tree ms_va_list_type_node;
2373 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2374 char internal_label_prefix[16];
2375 int internal_label_prefix_len;
2377 /* Fence to use after loop using movnt. */
2380 /* Register class used for passing given 64bit part of the argument.
2381 These represent classes as documented by the PS ABI, with the exception
2382 of SSESF, SSEDF classes, that are basically SSE class, just gcc will
2383 use SF or DFmode move instead of DImode to avoid reformatting penalties.
2385 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2386 whenever possible (upper half does contain padding). */
/* NOTE(review): several enumerators of this enum appear elided here. */
2387 enum x86_64_reg_class
2390 X86_64_INTEGER_CLASS,
2391 X86_64_INTEGERSI_CLASS,
2398 X86_64_COMPLEX_X87_CLASS,
2402 #define MAX_CLASSES 4
2404 /* Table of constants used by fldpi, fldln2, etc.... */
2405 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2406 static bool ext_80387_constants_init = 0;
/* Forward declarations for file-local helpers defined later in the file. */
2409 static struct machine_function * ix86_init_machine_status (void);
2410 static rtx ix86_function_value (const_tree, const_tree, bool);
2411 static bool ix86_function_value_regno_p (const unsigned int);
2412 static unsigned int ix86_function_arg_boundary (enum machine_mode,
2414 static rtx ix86_static_chain (const_tree, bool);
2415 static int ix86_function_regparm (const_tree, const_tree);
2416 static void ix86_compute_frame_layout (struct ix86_frame *);
2417 static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
2419 static void ix86_add_new_builtins (int);
2420 static rtx ix86_expand_vec_perm_builtin (tree);
2421 static tree ix86_canonical_va_list_type (tree);
2422 static void predict_jump (int);
2423 static unsigned int split_stack_prologue_scratch_regno (void);
2424 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
/* Indices into the array of saved target-attribute option strings. */
2426 enum ix86_function_specific_strings
2428 IX86_FUNCTION_SPECIFIC_ARCH,
2429 IX86_FUNCTION_SPECIFIC_TUNE,
2430 IX86_FUNCTION_SPECIFIC_MAX
2433 static char *ix86_target_string (int, int, const char *, const char *,
2434 enum fpmath_unit, bool);
2435 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2436 static void ix86_function_specific_save (struct cl_target_option *);
2437 static void ix86_function_specific_restore (struct cl_target_option *);
2438 static void ix86_function_specific_print (FILE *, int,
2439 struct cl_target_option *);
2440 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2441 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2442 struct gcc_options *);
2443 static bool ix86_can_inline_p (tree, tree);
2444 static void ix86_set_current_function (tree);
2445 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2447 static enum calling_abi ix86_function_abi (const_tree);
/* Subtargets may pre-define these; provide fallbacks otherwise. */
2450 #ifndef SUBTARGET32_DEFAULT_CPU
2451 #define SUBTARGET32_DEFAULT_CPU "i386"
2454 /* The svr4 ABI for the i386 says that records and unions are returned
2456 #ifndef DEFAULT_PCC_STRUCT_RETURN
2457 #define DEFAULT_PCC_STRUCT_RETURN 1
2460 /* Whether -mtune= or -march= were specified */
2461 static int ix86_tune_defaulted;
2462 static int ix86_arch_specified;
2464 /* Define a set of ISAs which are available when a given ISA is
2465 enabled. MMX and SSE ISAs are handled separately. */
/* Each *_SET mask ORs an ISA bit with the masks of the ISAs it implies,
   so enabling e.g. SSE4.1 transitively enables SSSE3, SSE3, SSE2, SSE. */
2467 #define OPTION_MASK_ISA_MMX_SET OPTION_MASK_ISA_MMX
2468 #define OPTION_MASK_ISA_3DNOW_SET \
2469 (OPTION_MASK_ISA_3DNOW | OPTION_MASK_ISA_MMX_SET)
2471 #define OPTION_MASK_ISA_SSE_SET OPTION_MASK_ISA_SSE
2472 #define OPTION_MASK_ISA_SSE2_SET \
2473 (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE_SET)
2474 #define OPTION_MASK_ISA_SSE3_SET \
2475 (OPTION_MASK_ISA_SSE3 | OPTION_MASK_ISA_SSE2_SET)
2476 #define OPTION_MASK_ISA_SSSE3_SET \
2477 (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_SSE3_SET)
2478 #define OPTION_MASK_ISA_SSE4_1_SET \
2479 (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_SSSE3_SET)
2480 #define OPTION_MASK_ISA_SSE4_2_SET \
2481 (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_SSE4_1_SET)
2482 #define OPTION_MASK_ISA_AVX_SET \
2483 (OPTION_MASK_ISA_AVX | OPTION_MASK_ISA_SSE4_2_SET)
2484 #define OPTION_MASK_ISA_FMA_SET \
2485 (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_AVX_SET)
2487 /* SSE4 includes both SSE4.1 and SSE4.2. -msse4 should be the same
2489 #define OPTION_MASK_ISA_SSE4_SET OPTION_MASK_ISA_SSE4_2_SET
2491 #define OPTION_MASK_ISA_SSE4A_SET \
2492 (OPTION_MASK_ISA_SSE4A | OPTION_MASK_ISA_SSE3_SET)
2493 #define OPTION_MASK_ISA_FMA4_SET \
2494 (OPTION_MASK_ISA_FMA4 | OPTION_MASK_ISA_SSE4A_SET \
2495 | OPTION_MASK_ISA_AVX_SET)
2496 #define OPTION_MASK_ISA_XOP_SET \
2497 (OPTION_MASK_ISA_XOP | OPTION_MASK_ISA_FMA4_SET)
2498 #define OPTION_MASK_ISA_LWP_SET \
2501 /* AES and PCLMUL need SSE2 because they use xmm registers */
2502 #define OPTION_MASK_ISA_AES_SET \
2503 (OPTION_MASK_ISA_AES | OPTION_MASK_ISA_SSE2_SET)
2504 #define OPTION_MASK_ISA_PCLMUL_SET \
2505 (OPTION_MASK_ISA_PCLMUL | OPTION_MASK_ISA_SSE2_SET)
2507 #define OPTION_MASK_ISA_ABM_SET \
2508 (OPTION_MASK_ISA_ABM | OPTION_MASK_ISA_POPCNT)
/* ISAs with no implied prerequisites map to their own bit. */
2510 #define OPTION_MASK_ISA_BMI_SET OPTION_MASK_ISA_BMI
2511 #define OPTION_MASK_ISA_TBM_SET OPTION_MASK_ISA_TBM
2512 #define OPTION_MASK_ISA_POPCNT_SET OPTION_MASK_ISA_POPCNT
2513 #define OPTION_MASK_ISA_CX16_SET OPTION_MASK_ISA_CX16
2514 #define OPTION_MASK_ISA_SAHF_SET OPTION_MASK_ISA_SAHF
2515 #define OPTION_MASK_ISA_MOVBE_SET OPTION_MASK_ISA_MOVBE
2516 #define OPTION_MASK_ISA_CRC32_SET OPTION_MASK_ISA_CRC32
2518 #define OPTION_MASK_ISA_FSGSBASE_SET OPTION_MASK_ISA_FSGSBASE
2519 #define OPTION_MASK_ISA_RDRND_SET OPTION_MASK_ISA_RDRND
2520 #define OPTION_MASK_ISA_F16C_SET \
2521 (OPTION_MASK_ISA_F16C | OPTION_MASK_ISA_AVX_SET)
2523 /* Define a set of ISAs which aren't available when a given ISA is
2524 disabled. MMX and SSE ISAs are handled separately. */
/* The *_UNSET masks are the inverse dependency direction: disabling an
   ISA also disables every ISA that depends on it. */
2526 #define OPTION_MASK_ISA_MMX_UNSET \
2527 (OPTION_MASK_ISA_MMX | OPTION_MASK_ISA_3DNOW_UNSET)
2528 #define OPTION_MASK_ISA_3DNOW_UNSET \
2529 (OPTION_MASK_ISA_3DNOW | OPTION_MASK_ISA_3DNOW_A_UNSET)
2530 #define OPTION_MASK_ISA_3DNOW_A_UNSET OPTION_MASK_ISA_3DNOW_A
2532 #define OPTION_MASK_ISA_SSE_UNSET \
2533 (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_SSE2_UNSET)
2534 #define OPTION_MASK_ISA_SSE2_UNSET \
2535 (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE3_UNSET)
2536 #define OPTION_MASK_ISA_SSE3_UNSET \
2537 (OPTION_MASK_ISA_SSE3 \
2538 | OPTION_MASK_ISA_SSSE3_UNSET \
2539 | OPTION_MASK_ISA_SSE4A_UNSET )
2540 #define OPTION_MASK_ISA_SSSE3_UNSET \
2541 (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_SSE4_1_UNSET)
2542 #define OPTION_MASK_ISA_SSE4_1_UNSET \
2543 (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_SSE4_2_UNSET)
2544 #define OPTION_MASK_ISA_SSE4_2_UNSET \
2545 (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_AVX_UNSET )
2546 #define OPTION_MASK_ISA_AVX_UNSET \
2547 (OPTION_MASK_ISA_AVX | OPTION_MASK_ISA_FMA_UNSET \
2548 | OPTION_MASK_ISA_FMA4_UNSET | OPTION_MASK_ISA_F16C_UNSET)
2549 #define OPTION_MASK_ISA_FMA_UNSET OPTION_MASK_ISA_FMA
2551 /* SSE4 includes both SSE4.1 and SSE4.2. -mno-sse4 should the same
2553 #define OPTION_MASK_ISA_SSE4_UNSET OPTION_MASK_ISA_SSE4_1_UNSET
2555 #define OPTION_MASK_ISA_SSE4A_UNSET \
2556 (OPTION_MASK_ISA_SSE4A | OPTION_MASK_ISA_FMA4_UNSET)
2558 #define OPTION_MASK_ISA_FMA4_UNSET \
2559 (OPTION_MASK_ISA_FMA4 | OPTION_MASK_ISA_XOP_UNSET)
2560 #define OPTION_MASK_ISA_XOP_UNSET OPTION_MASK_ISA_XOP
2561 #define OPTION_MASK_ISA_LWP_UNSET OPTION_MASK_ISA_LWP
2563 #define OPTION_MASK_ISA_AES_UNSET OPTION_MASK_ISA_AES
2564 #define OPTION_MASK_ISA_PCLMUL_UNSET OPTION_MASK_ISA_PCLMUL
2565 #define OPTION_MASK_ISA_ABM_UNSET OPTION_MASK_ISA_ABM
2566 #define OPTION_MASK_ISA_BMI_UNSET OPTION_MASK_ISA_BMI
2567 #define OPTION_MASK_ISA_TBM_UNSET OPTION_MASK_ISA_TBM
2568 #define OPTION_MASK_ISA_POPCNT_UNSET OPTION_MASK_ISA_POPCNT
2569 #define OPTION_MASK_ISA_CX16_UNSET OPTION_MASK_ISA_CX16
2570 #define OPTION_MASK_ISA_SAHF_UNSET OPTION_MASK_ISA_SAHF
2571 #define OPTION_MASK_ISA_MOVBE_UNSET OPTION_MASK_ISA_MOVBE
2572 #define OPTION_MASK_ISA_CRC32_UNSET OPTION_MASK_ISA_CRC32
2574 #define OPTION_MASK_ISA_FSGSBASE_UNSET OPTION_MASK_ISA_FSGSBASE
2575 #define OPTION_MASK_ISA_RDRND_UNSET OPTION_MASK_ISA_RDRND
2576 #define OPTION_MASK_ISA_F16C_UNSET OPTION_MASK_ISA_F16C
2578 /* Vectorization library interface and handlers. */
/* Dispatch pointer set to one of the veclibabi handlers below depending
   on the selected vector math library. */
2579 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2581 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2582 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
2584 /* Processor target table, indexed by processor number */
/* NOTE(review): the 'struct ptt {' opening line appears elided in this
   excerpt; the members below are the per-CPU cost table pointer and the
   default code-alignment values. */
2587 const struct processor_costs *cost; /* Processor costs */
2588 const int align_loop; /* Default alignments. */
2589 const int align_loop_max_skip;
2590 const int align_jump;
2591 const int align_jump_max_skip;
2592 const int align_func;
/* One row per PROCESSOR_* enumerator, in enum order. */
2595 static const struct ptt processor_target_table[PROCESSOR_max] =
2597 {&i386_cost, 4, 3, 4, 3, 4},
2598 {&i486_cost, 16, 15, 16, 15, 16},
2599 {&pentium_cost, 16, 7, 16, 7, 16},
2600 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2601 {&geode_cost, 0, 0, 0, 0, 0},
2602 {&k6_cost, 32, 7, 32, 7, 32},
2603 {&athlon_cost, 16, 7, 16, 7, 16},
2604 {&pentium4_cost, 0, 0, 0, 0, 0},
2605 {&k8_cost, 16, 7, 16, 7, 16},
2606 {&nocona_cost, 0, 0, 0, 0, 0},
2607 /* Core 2 32-bit. */
2608 {&generic32_cost, 16, 10, 16, 10, 16},
2609 /* Core 2 64-bit. */
2610 {&generic64_cost, 16, 10, 16, 10, 16},
2611 /* Core i7 32-bit. */
2612 {&generic32_cost, 16, 10, 16, 10, 16},
2613 /* Core i7 64-bit. */
2614 {&generic64_cost, 16, 10, 16, 10, 16},
2615 {&generic32_cost, 16, 7, 16, 7, 16},
2616 {&generic64_cost, 16, 10, 16, 10, 16},
2617 {&amdfam10_cost, 32, 24, 32, 7, 32},
2618 {&bdver1_cost, 32, 24, 32, 7, 32},
2619 {&btver1_cost, 32, 24, 32, 7, 32},
2620 {&atom_cost, 16, 7, 16, 7, 16}
/* NOTE(review): the cpu_names initializer body appears elided here. */
2623 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2652 /* Return true if a red-zone is in use. */
/* The red zone exists on 64-bit SysV targets but not under the MS ABI. */
2655 ix86_using_red_zone (void)
2657 return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
2660 /* Implement TARGET_HANDLE_OPTION. */
2663 ix86_handle_option (struct gcc_options *opts,
2664 struct gcc_options *opts_set ATTRIBUTE_UNUSED,
2665 const struct cl_decoded_option *decoded,
2668 size_t code = decoded->opt_index;
2669 int value = decoded->value;
2676 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MMX_SET;
2677 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_MMX_SET;
2681 opts->x_ix86_isa_flags &= ~OPTION_MASK_ISA_MMX_UNSET;
2682 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_MMX_UNSET;
2689 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_SET;
2690 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_3DNOW_SET;
2694 opts->x_ix86_isa_flags &= ~OPTION_MASK_ISA_3DNOW_UNSET;
2695 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_3DNOW_UNSET;
2705 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE_SET;
2706 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE_SET;
2710 opts->x_ix86_isa_flags &= ~OPTION_MASK_ISA_SSE_UNSET;
2711 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE_UNSET;
2718 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE2_SET;
2719 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE2_SET;
2723 opts->x_ix86_isa_flags &= ~OPTION_MASK_ISA_SSE2_UNSET;
2724 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE2_UNSET;
2731 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE3_SET;
2732 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE3_SET;
2736 opts->x_ix86_isa_flags &= ~OPTION_MASK_ISA_SSE3_UNSET;
2737 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE3_UNSET;
2744 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSSE3_SET;
2745 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSSE3_SET;
2749 opts->x_ix86_isa_flags &= ~OPTION_MASK_ISA_SSSE3_UNSET;
2750 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSSE3_UNSET;
2757 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1_SET;
2758 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_1_SET;
2762 opts->x_ix86_isa_flags &= ~OPTION_MASK_ISA_SSE4_1_UNSET;
2763 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_1_UNSET;
2770 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2_SET;
2771 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_2_SET;
2775 opts->x_ix86_isa_flags &= ~OPTION_MASK_ISA_SSE4_2_UNSET;
2776 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_2_UNSET;
2783 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AVX_SET;
2784 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_AVX_SET;
2788 opts->x_ix86_isa_flags &= ~OPTION_MASK_ISA_AVX_UNSET;
2789 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_AVX_UNSET;
2796 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA_SET;
2797 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_FMA_SET;
2801 opts->x_ix86_isa_flags &= ~OPTION_MASK_ISA_FMA_UNSET;
2802 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_FMA_UNSET;
2807 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4_SET;
2808 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_SET;
2812 opts->x_ix86_isa_flags &= ~OPTION_MASK_ISA_SSE4_UNSET;
2813 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_UNSET;
2819 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SSE4A_SET;
2820 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4A_SET;
2824 opts->x_ix86_isa_flags &= ~OPTION_MASK_ISA_SSE4A_UNSET;
2825 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4A_UNSET;
2832 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FMA4_SET;
2833 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_FMA4_SET;
2837 opts->x_ix86_isa_flags &= ~OPTION_MASK_ISA_FMA4_UNSET;
2838 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_FMA4_UNSET;
2845 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_XOP_SET;
2846 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_XOP_SET;
2850 opts->x_ix86_isa_flags &= ~OPTION_MASK_ISA_XOP_UNSET;
2851 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_XOP_UNSET;
2858 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_LWP_SET;
2859 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_LWP_SET;
2863 opts->x_ix86_isa_flags &= ~OPTION_MASK_ISA_LWP_UNSET;
2864 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_LWP_UNSET;
2871 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_ABM_SET;
2872 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_ABM_SET;
2876 opts->x_ix86_isa_flags &= ~OPTION_MASK_ISA_ABM_UNSET;
2877 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_ABM_UNSET;
2884 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_BMI_SET;
2885 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_BMI_SET;
2889 opts->x_ix86_isa_flags &= ~OPTION_MASK_ISA_BMI_UNSET;
2890 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_BMI_UNSET;
2897 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_TBM_SET;
2898 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_TBM_SET;
2902 opts->x_ix86_isa_flags &= ~OPTION_MASK_ISA_TBM_UNSET;
2903 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_TBM_UNSET;
2910 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_POPCNT_SET;
2911 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_POPCNT_SET;
2915 opts->x_ix86_isa_flags &= ~OPTION_MASK_ISA_POPCNT_UNSET;
2916 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_POPCNT_UNSET;
2923 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_SAHF_SET;
2924 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_SAHF_SET;
2928 opts->x_ix86_isa_flags &= ~OPTION_MASK_ISA_SAHF_UNSET;
2929 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_SAHF_UNSET;
2936 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CX16_SET;
2937 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_CX16_SET;
2941 opts->x_ix86_isa_flags &= ~OPTION_MASK_ISA_CX16_UNSET;
2942 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_CX16_UNSET;
2949 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_MOVBE_SET;
2950 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_MOVBE_SET;
2954 opts->x_ix86_isa_flags &= ~OPTION_MASK_ISA_MOVBE_UNSET;
2955 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_MOVBE_UNSET;
2962 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_CRC32_SET;
2963 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_CRC32_SET;
2967 opts->x_ix86_isa_flags &= ~OPTION_MASK_ISA_CRC32_UNSET;
2968 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_CRC32_UNSET;
2975 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_AES_SET;
2976 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_AES_SET;
2980 opts->x_ix86_isa_flags &= ~OPTION_MASK_ISA_AES_UNSET;
2981 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_AES_UNSET;
2988 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL_SET;
2989 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_PCLMUL_SET;
2993 opts->x_ix86_isa_flags &= ~OPTION_MASK_ISA_PCLMUL_UNSET;
2994 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_PCLMUL_UNSET;
3001 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE_SET;
3002 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_FSGSBASE_SET;
3006 opts->x_ix86_isa_flags &= ~OPTION_MASK_ISA_FSGSBASE_UNSET;
3007 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_FSGSBASE_UNSET;
3014 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_RDRND_SET;
3015 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_RDRND_SET;
3019 opts->x_ix86_isa_flags &= ~OPTION_MASK_ISA_RDRND_UNSET;
3020 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_RDRND_UNSET;
3027 opts->x_ix86_isa_flags |= OPTION_MASK_ISA_F16C_SET;
3028 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_F16C_SET;
3032 opts->x_ix86_isa_flags &= ~OPTION_MASK_ISA_F16C_UNSET;
3033 opts->x_ix86_isa_flags_explicit |= OPTION_MASK_ISA_F16C_UNSET;
3037 /* Comes from final.c -- no real reason to change it. */
3038 #define MAX_CODE_ALIGN 16
3040 case OPT_malign_loops_:
3041 warning_at (loc, 0, "-malign-loops is obsolete, use -falign-loops");
3042 if (value > MAX_CODE_ALIGN)
3043 error_at (loc, "-malign-loops=%d is not between 0 and %d",
3044 value, MAX_CODE_ALIGN);
3046 opts->x_align_loops = 1 << value;
3049 case OPT_malign_jumps_:
3050 warning_at (loc, 0, "-malign-jumps is obsolete, use -falign-jumps");
3051 if (value > MAX_CODE_ALIGN)
3052 error_at (loc, "-malign-jumps=%d is not between 0 and %d",
3053 value, MAX_CODE_ALIGN);
3055 opts->x_align_jumps = 1 << value;
3058 case OPT_malign_functions_:
3060 "-malign-functions is obsolete, use -falign-functions");
3061 if (value > MAX_CODE_ALIGN)
3062 error_at (loc, "-malign-functions=%d is not between 0 and %d",
3063 value, MAX_CODE_ALIGN);
3065 opts->x_align_functions = 1 << value;
3068 case OPT_mbranch_cost_:
3071 error_at (loc, "-mbranch-cost=%d is not between 0 and 5", value);
3072 opts->x_ix86_branch_cost = 5;
3081 /* Return a string that documents the current -m options. The caller is
3082 responsible for freeing the string. */
3085 ix86_target_string (int isa, int flags, const char *arch, const char *tune,
3086 enum fpmath_unit fpmath, bool add_nl_p)
3088 struct ix86_target_opts
3090 const char *option; /* option string */
3091 int mask; /* isa mask options */
3094 /* This table is ordered so that options like -msse4.2, which imply
3095 preceding options, are matched first. */
3096 static struct ix86_target_opts isa_opts[] =
3098 { "-m64", OPTION_MASK_ISA_64BIT },
3099 { "-mfma4", OPTION_MASK_ISA_FMA4 },
3100 { "-mfma", OPTION_MASK_ISA_FMA },
3101 { "-mxop", OPTION_MASK_ISA_XOP },
3102 { "-mlwp", OPTION_MASK_ISA_LWP },
3103 { "-msse4a", OPTION_MASK_ISA_SSE4A },
3104 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
3105 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
3106 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
3107 { "-msse3", OPTION_MASK_ISA_SSE3 },
3108 { "-msse2", OPTION_MASK_ISA_SSE2 },
3109 { "-msse", OPTION_MASK_ISA_SSE },
3110 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
3111 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
3112 { "-mmmx", OPTION_MASK_ISA_MMX },
3113 { "-mabm", OPTION_MASK_ISA_ABM },
3114 { "-mbmi", OPTION_MASK_ISA_BMI },
3115 { "-mtbm", OPTION_MASK_ISA_TBM },
3116 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
3117 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
3118 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
3119 { "-maes", OPTION_MASK_ISA_AES },
3120 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
3121 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
3122 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
3123 { "-mf16c", OPTION_MASK_ISA_F16C },
3127 static struct ix86_target_opts flag_opts[] =
3129 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
3130 { "-m80387", MASK_80387 },
3131 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
3132 { "-malign-double", MASK_ALIGN_DOUBLE },
3133 { "-mcld", MASK_CLD },
3134 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
3135 { "-mieee-fp", MASK_IEEE_FP },
3136 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
3137 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
3138 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
3139 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
3140 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
3141 { "-mno-push-args", MASK_NO_PUSH_ARGS },
3142 { "-mno-red-zone", MASK_NO_RED_ZONE },
3143 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
3144 { "-mrecip", MASK_RECIP },
3145 { "-mrtd", MASK_RTD },
3146 { "-msseregparm", MASK_SSEREGPARM },
3147 { "-mstack-arg-probe", MASK_STACK_PROBE },
3148 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
3149 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
3150 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
3151 { "-mvzeroupper", MASK_VZEROUPPER },
3152 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
3153 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
3156 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
3159 char target_other[40];
3168 memset (opts, '\0', sizeof (opts));
3170 /* Add -march= option. */
3173 opts[num][0] = "-march=";
3174 opts[num++][1] = arch;
3177 /* Add -mtune= option. */
3180 opts[num][0] = "-mtune=";
3181 opts[num++][1] = tune;
3184 /* Pick out the option strings for the bits set in isa. */
3185 for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
3187 if ((isa & isa_opts[i].mask) != 0)
3189 opts[num++][0] = isa_opts[i].option;
3190 isa &= ~ isa_opts[i].mask;
3194 if (isa && add_nl_p)
3196 opts[num++][0] = isa_other;
3197 sprintf (isa_other, "(other isa: %#x)", isa);
3200 /* Add flag options. */
3201 for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
3203 if ((flags & flag_opts[i].mask) != 0)
3205 opts[num++][0] = flag_opts[i].option;
3206 flags &= ~ flag_opts[i].mask;
3210 if (flags && add_nl_p)
3212 opts[num++][0] = target_other;
3213 sprintf (target_other, "(other flags: %#x)", flags);
3216 /* Add -mfpmath= option. */
3219 opts[num][0] = "-mfpmath=";
3220 switch ((int) fpmath)
3223 opts[num++][1] = "387";
3227 opts[num++][1] = "sse";
3230 case FPMATH_387 | FPMATH_SSE:
3231 opts[num++][1] = "sse+387";
3243 gcc_assert (num < ARRAY_SIZE (opts));
3245 /* Size the string. */
3247 sep_len = (add_nl_p) ? 3 : 1;
3248 for (i = 0; i < num; i++)
3251 for (j = 0; j < 2; j++)
3253 len += strlen (opts[i][j]);
3256 /* Build the string. */
3257 ret = ptr = (char *) xmalloc (len);