/* Subroutines used for code generation on IA-32.
   Copyright (C) 1988, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
   2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
   Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "rtl.h"
#include "tree.h"
#include "tm_p.h"
#include "regs.h"
#include "hard-reg-set.h"
#include "insn-config.h"
#include "conditions.h"
#include "output.h"
#include "insn-codes.h"
#include "insn-attr.h"
#include "flags.h"
#include "except.h"
#include "function.h"
#include "recog.h"
#include "expr.h"
#include "optabs.h"
#include "diagnostic-core.h"
#include "toplev.h"
#include "basic-block.h"
#include "ggc.h"
#include "target.h"
#include "target-def.h"
#include "common/common-target.h"
#include "langhooks.h"
#include "reload.h"
#include "cgraph.h"
#include "gimple.h"
#include "dwarf2.h"
#include "df.h"
#include "tm-constrs.h"
#include "params.h"
#include "cselib.h"
#include "debug.h"
#include "sched-int.h"
#include "sbitmap.h"
#include "fibheap.h"
#include "opts.h"
#include "diagnostic.h"

enum upper_128bits_state
{
  unknown = 0,
  unused,
  used
};

typedef struct block_info_def
{
  /* State of the upper 128bits of AVX registers at exit.  */
  enum upper_128bits_state state;
  /* TRUE if state of the upper 128bits of AVX registers is unchanged
     in this block.  */
  bool unchanged;
  /* TRUE if block has been processed.  */
  bool processed;
  /* TRUE if block has been scanned.  */
  bool scanned;
  /* Previous state of the upper 128bits of AVX registers at entry.  */
  enum upper_128bits_state prev;
} *block_info;

#define BLOCK_INFO(B) ((block_info) (B)->aux)

enum call_avx256_state
{
  /* Callee returns 256bit AVX register.  */
  callee_return_avx256 = -1,
  /* Callee returns and passes 256bit AVX register.  */
  callee_return_pass_avx256,
  /* Callee passes 256bit AVX register.  */
  callee_pass_avx256,
  /* Callee doesn't return nor pass 256bit AVX register, or no
     256bit AVX register in function return.  */
  call_no_avx256,
  /* vzeroupper intrinsic.  */
  vzeroupper_intrinsic
};
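
/* The vzeroupper patterns carry one of the call_avx256_state values
   (presumably attached by the call and return expanders) as the
   operand of the UNSPECV_VZEROUPPER unspec;
   move_or_delete_vzeroupper_2 below reads it back with
   INTVAL (XVECEXP (pat, 0, 0)) and compares it against these
   enumerators to decide whether a vzeroupper is redundant or must be
   kept.  */
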
/* Check if a 256bit AVX register is referenced in stores.  */

static void
check_avx256_stores (rtx dest, const_rtx set, void *data)
{
  if ((REG_P (dest)
       && VALID_AVX256_REG_MODE (GET_MODE (dest)))
      || (GET_CODE (set) == SET
          && REG_P (SET_SRC (set))
          && VALID_AVX256_REG_MODE (GET_MODE (SET_SRC (set)))))
    {
      enum upper_128bits_state *state
        = (enum upper_128bits_state *) data;
      *state = used;
    }
}
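
/* A minimal sketch of how the callback above is driven (assuming the
   standard note_stores contract): for every SET or CLOBBER inside a
   pattern, note_stores invokes the callback with the store
   destination, the enclosing SET/CLOBBER rtx, and the opaque DATA
   pointer:

     enum upper_128bits_state state = unused;
     note_stores (PATTERN (insn), check_avx256_stores, &state);
     if (state == used)
       ... INSN wrote a 256bit AVX register ...  */
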
/* Helper function for move_or_delete_vzeroupper_1.  Look for vzeroupper
   in basic block BB.  Delete it if upper 128bit AVX registers are
   unused.  If it isn't deleted, move it to just before a jump insn.

   STATE is state of the upper 128bits of AVX registers at entry.  */

static void
move_or_delete_vzeroupper_2 (basic_block bb,
                             enum upper_128bits_state state)
{
  rtx insn, bb_end;
  rtx vzeroupper_insn = NULL_RTX;
  rtx pat;
  int avx256;
  bool unchanged;

  if (BLOCK_INFO (bb)->unchanged)
    {
      if (dump_file)
        fprintf (dump_file, " [bb %i] unchanged: upper 128bits: %d\n",
                 bb->index, state);

      BLOCK_INFO (bb)->state = state;
      return;
    }

  if (BLOCK_INFO (bb)->scanned && BLOCK_INFO (bb)->prev == state)
    {
      if (dump_file)
        fprintf (dump_file, " [bb %i] scanned: upper 128bits: %d\n",
                 bb->index, BLOCK_INFO (bb)->state);
      return;
    }

  BLOCK_INFO (bb)->prev = state;

  if (dump_file)
    fprintf (dump_file, " [bb %i] entry: upper 128bits: %d\n",
             bb->index, state);

  unchanged = true;

  /* BB_END changes when it is deleted.  */
  bb_end = BB_END (bb);
  insn = BB_HEAD (bb);
  while (insn != bb_end)
    {
      insn = NEXT_INSN (insn);

      if (!NONDEBUG_INSN_P (insn))
        continue;

      /* Move vzeroupper before jump/call.  */
      if (JUMP_P (insn) || CALL_P (insn))
        {
          if (!vzeroupper_insn)
            continue;

          if (PREV_INSN (insn) != vzeroupper_insn)
            {
              if (dump_file)
                {
                  fprintf (dump_file, "Move vzeroupper after:\n");
                  print_rtl_single (dump_file, PREV_INSN (insn));
                  fprintf (dump_file, "before:\n");
                  print_rtl_single (dump_file, insn);
                }
              reorder_insns_nobb (vzeroupper_insn, vzeroupper_insn,
                                  PREV_INSN (insn));
            }
          vzeroupper_insn = NULL_RTX;
          continue;
        }

      pat = PATTERN (insn);

      /* Check insn for vzeroupper intrinsic.  */
      if (GET_CODE (pat) == UNSPEC_VOLATILE
          && XINT (pat, 1) == UNSPECV_VZEROUPPER)
        {
          if (dump_file)
            {
              /* Found vzeroupper intrinsic.  */
              fprintf (dump_file, "Found vzeroupper:\n");
              print_rtl_single (dump_file, insn);
            }
        }
      else
        {
          /* Check insn for vzeroall intrinsic.  */
          if (GET_CODE (pat) == PARALLEL
              && GET_CODE (XVECEXP (pat, 0, 0)) == UNSPEC_VOLATILE
              && XINT (XVECEXP (pat, 0, 0), 1) == UNSPECV_VZEROALL)
            {
              state = unused;
              unchanged = false;

              /* Delete pending vzeroupper insertion.  */
              if (vzeroupper_insn)
                {
                  delete_insn (vzeroupper_insn);
                  vzeroupper_insn = NULL_RTX;
                }
            }
          else if (state != used)
            {
              note_stores (pat, check_avx256_stores, &state);
              if (state == used)
                unchanged = false;
            }
          continue;
        }

      /* Process vzeroupper intrinsic.  */
      avx256 = INTVAL (XVECEXP (pat, 0, 0));

      if (state == unused)
        {
          /* Since the upper 128bits are cleared, callee must not pass
             256bit AVX register.  We only need to check if callee
             returns 256bit AVX register.  */
          if (avx256 == callee_return_avx256)
            {
              state = used;
              unchanged = false;
            }

          /* Remove unnecessary vzeroupper since upper 128bits are
             cleared.  */
          if (dump_file)
            {
              fprintf (dump_file, "Delete redundant vzeroupper:\n");
              print_rtl_single (dump_file, insn);
            }
          delete_insn (insn);
        }
      else
        {
          /* Set state to UNUSED if callee doesn't return 256bit AVX
             register.  */
          if (avx256 != callee_return_pass_avx256)
            state = unused;

          if (avx256 == callee_return_pass_avx256
              || avx256 == callee_pass_avx256)
            {
              /* Must remove vzeroupper since callee passes in 256bit
                 AVX register.  */
              if (dump_file)
                {
                  fprintf (dump_file, "Delete callee pass vzeroupper:\n");
                  print_rtl_single (dump_file, insn);
                }
              delete_insn (insn);
            }
          else
            {
              vzeroupper_insn = insn;
              unchanged = false;
            }
        }
    }

  BLOCK_INFO (bb)->state = state;
  BLOCK_INFO (bb)->unchanged = unchanged;
  BLOCK_INFO (bb)->scanned = true;

  if (dump_file)
    fprintf (dump_file, " [bb %i] exit: %s: upper 128bits: %d\n",
             bb->index, unchanged ? "unchanged" : "changed",
             state);
}

/* Helper function for move_or_delete_vzeroupper.  Process vzeroupper
   in BLOCK and check its predecessor blocks.  Treat UNKNOWN state
   as USED if UNKNOWN_IS_UNUSED is true.  Return TRUE if the exit
   state is changed.  */

static bool
move_or_delete_vzeroupper_1 (basic_block block, bool unknown_is_unused)
{
  edge e;
  edge_iterator ei;
  enum upper_128bits_state state, old_state, new_state;
  bool seen_unknown;

  if (dump_file)
    fprintf (dump_file, " Process [bb %i]: status: %d\n",
             block->index, BLOCK_INFO (block)->processed);

  if (BLOCK_INFO (block)->processed)
    return false;

  state = unused;

  /* Check all predecessor edges of this block.  */
  seen_unknown = false;
  FOR_EACH_EDGE (e, ei, block->preds)
    {
      if (e->src == block)
        continue;
      switch (BLOCK_INFO (e->src)->state)
        {
        case unknown:
          if (!unknown_is_unused)
            seen_unknown = true;
          break;
        case unused:
          break;
        case used:
          state = used;
          goto done;
        }
    }

  if (seen_unknown)
    state = unknown;

 done:
  old_state = BLOCK_INFO (block)->state;
  move_or_delete_vzeroupper_2 (block, state);
  new_state = BLOCK_INFO (block)->state;

  if (state != unknown || new_state == used)
    BLOCK_INFO (block)->processed = true;

  /* Need to rescan if the upper 128bits of AVX registers are changed
     to USED at exit.  */
  if (new_state != old_state)
    {
      if (new_state == used)
        cfun->machine->rescan_vzeroupper_p = 1;
      return true;
    }
  else
    return false;
}

/* Go through the instruction stream looking for vzeroupper.  Delete
   it if upper 128bit AVX registers are unused.  If it isn't deleted,
   move it to just before a jump insn.  */

static void
move_or_delete_vzeroupper (void)
{
  edge e;
  edge_iterator ei;
  basic_block bb;
  fibheap_t worklist, pending, fibheap_swap;
  sbitmap visited, in_worklist, in_pending, sbitmap_swap;
  int *bb_order;
  int *rc_order;
  int i;

  /* Set up block info for each basic block.  */
  alloc_aux_for_blocks (sizeof (struct block_info_def));

  /* Process outgoing edges of entry point.  */
  if (dump_file)
    fprintf (dump_file, "Process outgoing edges of entry point\n");

  FOR_EACH_EDGE (e, ei, ENTRY_BLOCK_PTR->succs)
    {
      move_or_delete_vzeroupper_2 (e->dest,
                                   cfun->machine->caller_pass_avx256_p
                                   ? used : unused);
      BLOCK_INFO (e->dest)->processed = true;
    }

  /* Compute reverse completion order of depth first search of the CFG
     so that the data-flow runs faster.  */
  rc_order = XNEWVEC (int, n_basic_blocks - NUM_FIXED_BLOCKS);
  bb_order = XNEWVEC (int, last_basic_block);
  pre_and_rev_post_order_compute (NULL, rc_order, false);
  for (i = 0; i < n_basic_blocks - NUM_FIXED_BLOCKS; i++)
    bb_order[rc_order[i]] = i;
  free (rc_order);

  worklist = fibheap_new ();
  pending = fibheap_new ();
  visited = sbitmap_alloc (last_basic_block);
  in_worklist = sbitmap_alloc (last_basic_block);
  in_pending = sbitmap_alloc (last_basic_block);
  sbitmap_zero (in_worklist);

  /* Don't check outgoing edges of entry point.  */
  sbitmap_ones (in_pending);
  FOR_EACH_BB (bb)
    if (BLOCK_INFO (bb)->processed)
      RESET_BIT (in_pending, bb->index);
    else
      {
        move_or_delete_vzeroupper_1 (bb, false);
        fibheap_insert (pending, bb_order[bb->index], bb);
      }

  if (dump_file)
    fprintf (dump_file, "Check remaining basic blocks\n");

  while (!fibheap_empty (pending))
    {
      fibheap_swap = pending;
      pending = worklist;
      worklist = fibheap_swap;
      sbitmap_swap = in_pending;
      in_pending = in_worklist;
      in_worklist = sbitmap_swap;

      sbitmap_zero (visited);

      cfun->machine->rescan_vzeroupper_p = 0;

      while (!fibheap_empty (worklist))
        {
          bb = (basic_block) fibheap_extract_min (worklist);
          RESET_BIT (in_worklist, bb->index);
          gcc_assert (!TEST_BIT (visited, bb->index));
          if (!TEST_BIT (visited, bb->index))
            {
              edge_iterator ei;

              SET_BIT (visited, bb->index);

              if (move_or_delete_vzeroupper_1 (bb, false))
                FOR_EACH_EDGE (e, ei, bb->succs)
                  {
                    if (e->dest == EXIT_BLOCK_PTR
                        || BLOCK_INFO (e->dest)->processed)
                      continue;

                    if (TEST_BIT (visited, e->dest->index))
                      {
                        if (!TEST_BIT (in_pending, e->dest->index))
                          {
                            /* Send E->DEST to next round.  */
                            SET_BIT (in_pending, e->dest->index);
                            fibheap_insert (pending,
                                            bb_order[e->dest->index],
                                            e->dest);
                          }
                      }
                    else if (!TEST_BIT (in_worklist, e->dest->index))
                      {
                        /* Add E->DEST to current round.  */
                        SET_BIT (in_worklist, e->dest->index);
                        fibheap_insert (worklist, bb_order[e->dest->index],
                                        e->dest);
                      }
                  }
            }
        }

      if (!cfun->machine->rescan_vzeroupper_p)
        break;
    }

  free (bb_order);
  fibheap_delete (worklist);
  fibheap_delete (pending);
  sbitmap_free (visited);
  sbitmap_free (in_worklist);
  sbitmap_free (in_pending);

  if (dump_file)
    fprintf (dump_file, "Process remaining basic blocks\n");

  FOR_EACH_BB (bb)
    move_or_delete_vzeroupper_1 (bb, true);

  free_aux_for_blocks ();
}

static rtx legitimize_dllimport_symbol (rtx, bool);

#ifndef CHECK_STACK_LIMIT
#define CHECK_STACK_LIMIT (-1)
#endif

/* Return index of given mode in mult and division cost tables.  */
#define MODE_INDEX(mode) \
  ((mode) == QImode ? 0 \
   : (mode) == HImode ? 1 \
   : (mode) == SImode ? 2 \
   : (mode) == DImode ? 3 \
   : 4)
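
/* For example (a sketch, assuming the mult_init/divide array fields
   of struct processor_costs from i386.h): the cost of starting a
   SImode multiply for the active tuning is looked up as
   ix86_cost->mult_init[MODE_INDEX (SImode)], i.e. index 2, and any
   mode other than QI/HI/SI/DI falls through to the "other" entry at
   index 4.  */
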
/* Processor costs (relative to an add) */
/* We assume COSTS_N_INSNS is defined as (N)*4 and an addition is 2 bytes.  */
#define COSTS_N_BYTES(N) ((N) * 2)
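
/* Worked out: with COSTS_N_INSNS (N) == (N) * 4 and a 2-byte add,
   COSTS_N_BYTES (N) weighs an N-byte instruction as N/2 additions;
   e.g. COSTS_N_BYTES (3) == 6 makes a 3-byte movsx cost 1.5 "adds"
   when tuning for size.  */
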
#define DUMMY_STRINGOP_ALGS {libcall, {{-1, libcall}}}
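
/* How to read the memcpy/memset descriptors in the tables below (a
   sketch, assuming the struct stringop_algs layout from i386.h): the
   first member is the algorithm for blocks whose size is unknown at
   compile time; the {max, alg} pairs that follow are tried in order
   for known sizes, a max of -1 terminating the list.  So

     {libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}}

   means: unknown size -> libcall, size <= 256 -> rep movsl, anything
   larger -> libcall.  Each cost table carries one such entry for
   32bit and one for 64bit code, for memcpy and for memset.  */
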
const
struct processor_costs ix86_size_cost = {/* costs for tuning for size */
  COSTS_N_BYTES (2), /* cost of an add instruction */
  COSTS_N_BYTES (3), /* cost of a lea instruction */
  COSTS_N_BYTES (2), /* variable shift costs */
  COSTS_N_BYTES (3), /* constant shift costs */
  {COSTS_N_BYTES (3), /* cost of starting multiply for QI */
   COSTS_N_BYTES (3), /* HI */
   COSTS_N_BYTES (3), /* SI */
   COSTS_N_BYTES (3), /* DI */
   COSTS_N_BYTES (5)}, /* other */
  0, /* cost of multiply per each bit set */
  {COSTS_N_BYTES (3), /* cost of a divide/mod for QI */
   COSTS_N_BYTES (3), /* HI */
   COSTS_N_BYTES (3), /* SI */
   COSTS_N_BYTES (3), /* DI */
   COSTS_N_BYTES (5)}, /* other */
  COSTS_N_BYTES (3), /* cost of movsx */
  COSTS_N_BYTES (3), /* cost of movzx */
  0, /* "large" insn */
  3, /* MOVE_RATIO */
  2, /* cost for loading QImode using movzbl */
  {2, 2, 2}, /* cost of loading integer registers
                in QImode, HImode and SImode.
                Relative to reg-reg move (2).  */
  {2, 2, 2}, /* cost of storing integer registers */
  2, /* cost of reg,reg fld/fst */
  {2, 2, 2}, /* cost of loading fp registers
                in SFmode, DFmode and XFmode */
  {2, 2, 2}, /* cost of storing fp registers
                in SFmode, DFmode and XFmode */
  3, /* cost of moving MMX register */
  {3, 3}, /* cost of loading MMX registers
             in SImode and DImode */
  {3, 3}, /* cost of storing MMX registers
             in SImode and DImode */
  3, /* cost of moving SSE register */
  {3, 3, 3}, /* cost of loading SSE registers
                in SImode, DImode and TImode */
  {3, 3, 3}, /* cost of storing SSE registers
                in SImode, DImode and TImode */
  3, /* MMX or SSE register to integer */
  0, /* size of l1 cache */
  0, /* size of l2 cache */
  0, /* size of prefetch block */
  0, /* number of parallel prefetches */
  1, /* Branch cost */
  COSTS_N_BYTES (2), /* cost of FADD and FSUB insns.  */
  COSTS_N_BYTES (2), /* cost of FMUL instruction.  */
  COSTS_N_BYTES (2), /* cost of FDIV instruction.  */
  COSTS_N_BYTES (2), /* cost of FABS instruction.  */
  COSTS_N_BYTES (2), /* cost of FCHS instruction.  */
  COSTS_N_BYTES (2), /* cost of FSQRT instruction.  */
  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
  1, /* scalar_stmt_cost.  */
  1, /* scalar load_cost.  */
  1, /* scalar_store_cost.  */
  1, /* vec_stmt_cost.  */
  1, /* vec_to_scalar_cost.  */
  1, /* scalar_to_vec_cost.  */
  1, /* vec_align_load_cost.  */
  1, /* vec_unalign_load_cost.  */
  1, /* vec_store_cost.  */
  1, /* cond_taken_branch_cost.  */
  1, /* cond_not_taken_branch_cost.  */
};

/* Processor costs (relative to an add) */
static const
struct processor_costs i386_cost = { /* 386 specific costs */
  COSTS_N_INSNS (1), /* cost of an add instruction */
  COSTS_N_INSNS (1), /* cost of a lea instruction */
  COSTS_N_INSNS (3), /* variable shift costs */
  COSTS_N_INSNS (2), /* constant shift costs */
  {COSTS_N_INSNS (6), /* cost of starting multiply for QI */
   COSTS_N_INSNS (6), /* HI */
   COSTS_N_INSNS (6), /* SI */
   COSTS_N_INSNS (6), /* DI */
   COSTS_N_INSNS (6)}, /* other */
  COSTS_N_INSNS (1), /* cost of multiply per each bit set */
  {COSTS_N_INSNS (23), /* cost of a divide/mod for QI */
   COSTS_N_INSNS (23), /* HI */
   COSTS_N_INSNS (23), /* SI */
   COSTS_N_INSNS (23), /* DI */
   COSTS_N_INSNS (23)}, /* other */
  COSTS_N_INSNS (3), /* cost of movsx */
  COSTS_N_INSNS (2), /* cost of movzx */
  15, /* "large" insn */
  3, /* MOVE_RATIO */
  4, /* cost for loading QImode using movzbl */
  {2, 4, 2}, /* cost of loading integer registers
                in QImode, HImode and SImode.
                Relative to reg-reg move (2).  */
  {2, 4, 2}, /* cost of storing integer registers */
  2, /* cost of reg,reg fld/fst */
  {8, 8, 8}, /* cost of loading fp registers
                in SFmode, DFmode and XFmode */
  {8, 8, 8}, /* cost of storing fp registers
                in SFmode, DFmode and XFmode */
  2, /* cost of moving MMX register */
  {4, 8}, /* cost of loading MMX registers
             in SImode and DImode */
  {4, 8}, /* cost of storing MMX registers
             in SImode and DImode */
  2, /* cost of moving SSE register */
  {4, 8, 16}, /* cost of loading SSE registers
                 in SImode, DImode and TImode */
  {4, 8, 16}, /* cost of storing SSE registers
                 in SImode, DImode and TImode */
  3, /* MMX or SSE register to integer */
  0, /* size of l1 cache */
  0, /* size of l2 cache */
  0, /* size of prefetch block */
  0, /* number of parallel prefetches */
  1, /* Branch cost */
  COSTS_N_INSNS (23), /* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (27), /* cost of FMUL instruction.  */
  COSTS_N_INSNS (88), /* cost of FDIV instruction.  */
  COSTS_N_INSNS (22), /* cost of FABS instruction.  */
  COSTS_N_INSNS (24), /* cost of FCHS instruction.  */
  COSTS_N_INSNS (122), /* cost of FSQRT instruction.  */
  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
   DUMMY_STRINGOP_ALGS},
  {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
   DUMMY_STRINGOP_ALGS},
  1, /* scalar_stmt_cost.  */
  1, /* scalar load_cost.  */
  1, /* scalar_store_cost.  */
  1, /* vec_stmt_cost.  */
  1, /* vec_to_scalar_cost.  */
  1, /* scalar_to_vec_cost.  */
  1, /* vec_align_load_cost.  */
  2, /* vec_unalign_load_cost.  */
  1, /* vec_store_cost.  */
  3, /* cond_taken_branch_cost.  */
  1, /* cond_not_taken_branch_cost.  */
};

static const
struct processor_costs i486_cost = { /* 486 specific costs */
  COSTS_N_INSNS (1), /* cost of an add instruction */
  COSTS_N_INSNS (1), /* cost of a lea instruction */
  COSTS_N_INSNS (3), /* variable shift costs */
  COSTS_N_INSNS (2), /* constant shift costs */
  {COSTS_N_INSNS (12), /* cost of starting multiply for QI */
   COSTS_N_INSNS (12), /* HI */
   COSTS_N_INSNS (12), /* SI */
   COSTS_N_INSNS (12), /* DI */
   COSTS_N_INSNS (12)}, /* other */
  1, /* cost of multiply per each bit set */
  {COSTS_N_INSNS (40), /* cost of a divide/mod for QI */
   COSTS_N_INSNS (40), /* HI */
   COSTS_N_INSNS (40), /* SI */
   COSTS_N_INSNS (40), /* DI */
   COSTS_N_INSNS (40)}, /* other */
  COSTS_N_INSNS (3), /* cost of movsx */
  COSTS_N_INSNS (2), /* cost of movzx */
  15, /* "large" insn */
  3, /* MOVE_RATIO */
  4, /* cost for loading QImode using movzbl */
  {2, 4, 2}, /* cost of loading integer registers
                in QImode, HImode and SImode.
                Relative to reg-reg move (2).  */
  {2, 4, 2}, /* cost of storing integer registers */
  2, /* cost of reg,reg fld/fst */
  {8, 8, 8}, /* cost of loading fp registers
                in SFmode, DFmode and XFmode */
  {8, 8, 8}, /* cost of storing fp registers
                in SFmode, DFmode and XFmode */
  2, /* cost of moving MMX register */
  {4, 8}, /* cost of loading MMX registers
             in SImode and DImode */
  {4, 8}, /* cost of storing MMX registers
             in SImode and DImode */
  2, /* cost of moving SSE register */
  {4, 8, 16}, /* cost of loading SSE registers
                 in SImode, DImode and TImode */
  {4, 8, 16}, /* cost of storing SSE registers
                 in SImode, DImode and TImode */
  3, /* MMX or SSE register to integer */
  4, /* size of l1 cache.  486 has 8kB cache
        shared for code and data, so 4kB is
        not really precise.  */
  4, /* size of l2 cache */
  0, /* size of prefetch block */
  0, /* number of parallel prefetches */
  1, /* Branch cost */
  COSTS_N_INSNS (8), /* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (16), /* cost of FMUL instruction.  */
  COSTS_N_INSNS (73), /* cost of FDIV instruction.  */
  COSTS_N_INSNS (3), /* cost of FABS instruction.  */
  COSTS_N_INSNS (3), /* cost of FCHS instruction.  */
  COSTS_N_INSNS (83), /* cost of FSQRT instruction.  */
  {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
   DUMMY_STRINGOP_ALGS},
  {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
   DUMMY_STRINGOP_ALGS},
  1, /* scalar_stmt_cost.  */
  1, /* scalar load_cost.  */
  1, /* scalar_store_cost.  */
  1, /* vec_stmt_cost.  */
  1, /* vec_to_scalar_cost.  */
  1, /* scalar_to_vec_cost.  */
  1, /* vec_align_load_cost.  */
  2, /* vec_unalign_load_cost.  */
  1, /* vec_store_cost.  */
  3, /* cond_taken_branch_cost.  */
  1, /* cond_not_taken_branch_cost.  */
};

static const
struct processor_costs pentium_cost = {
  COSTS_N_INSNS (1), /* cost of an add instruction */
  COSTS_N_INSNS (1), /* cost of a lea instruction */
  COSTS_N_INSNS (4), /* variable shift costs */
  COSTS_N_INSNS (1), /* constant shift costs */
  {COSTS_N_INSNS (11), /* cost of starting multiply for QI */
   COSTS_N_INSNS (11), /* HI */
   COSTS_N_INSNS (11), /* SI */
   COSTS_N_INSNS (11), /* DI */
   COSTS_N_INSNS (11)}, /* other */
  0, /* cost of multiply per each bit set */
  {COSTS_N_INSNS (25), /* cost of a divide/mod for QI */
   COSTS_N_INSNS (25), /* HI */
   COSTS_N_INSNS (25), /* SI */
   COSTS_N_INSNS (25), /* DI */
   COSTS_N_INSNS (25)}, /* other */
  COSTS_N_INSNS (3), /* cost of movsx */
  COSTS_N_INSNS (2), /* cost of movzx */
  8, /* "large" insn */
  6, /* MOVE_RATIO */
  6, /* cost for loading QImode using movzbl */
  {2, 4, 2}, /* cost of loading integer registers
                in QImode, HImode and SImode.
                Relative to reg-reg move (2).  */
  {2, 4, 2}, /* cost of storing integer registers */
  2, /* cost of reg,reg fld/fst */
  {2, 2, 6}, /* cost of loading fp registers
                in SFmode, DFmode and XFmode */
  {4, 4, 6}, /* cost of storing fp registers
                in SFmode, DFmode and XFmode */
  8, /* cost of moving MMX register */
  {8, 8}, /* cost of loading MMX registers
             in SImode and DImode */
  {8, 8}, /* cost of storing MMX registers
             in SImode and DImode */
  2, /* cost of moving SSE register */
  {4, 8, 16}, /* cost of loading SSE registers
                 in SImode, DImode and TImode */
  {4, 8, 16}, /* cost of storing SSE registers
                 in SImode, DImode and TImode */
  3, /* MMX or SSE register to integer */
  8, /* size of l1 cache.  */
  8, /* size of l2 cache */
  0, /* size of prefetch block */
  0, /* number of parallel prefetches */
  2, /* Branch cost */
  COSTS_N_INSNS (3), /* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (3), /* cost of FMUL instruction.  */
  COSTS_N_INSNS (39), /* cost of FDIV instruction.  */
  COSTS_N_INSNS (1), /* cost of FABS instruction.  */
  COSTS_N_INSNS (1), /* cost of FCHS instruction.  */
  COSTS_N_INSNS (70), /* cost of FSQRT instruction.  */
  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  {{libcall, {{-1, rep_prefix_4_byte}}},
   DUMMY_STRINGOP_ALGS},
  1, /* scalar_stmt_cost.  */
  1, /* scalar load_cost.  */
  1, /* scalar_store_cost.  */
  1, /* vec_stmt_cost.  */
  1, /* vec_to_scalar_cost.  */
  1, /* scalar_to_vec_cost.  */
  1, /* vec_align_load_cost.  */
  2, /* vec_unalign_load_cost.  */
  1, /* vec_store_cost.  */
  3, /* cond_taken_branch_cost.  */
  1, /* cond_not_taken_branch_cost.  */
};

static const
struct processor_costs pentiumpro_cost = {
  COSTS_N_INSNS (1), /* cost of an add instruction */
  COSTS_N_INSNS (1), /* cost of a lea instruction */
  COSTS_N_INSNS (1), /* variable shift costs */
  COSTS_N_INSNS (1), /* constant shift costs */
  {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
   COSTS_N_INSNS (4), /* HI */
   COSTS_N_INSNS (4), /* SI */
   COSTS_N_INSNS (4), /* DI */
   COSTS_N_INSNS (4)}, /* other */
  0, /* cost of multiply per each bit set */
  {COSTS_N_INSNS (17), /* cost of a divide/mod for QI */
   COSTS_N_INSNS (17), /* HI */
   COSTS_N_INSNS (17), /* SI */
   COSTS_N_INSNS (17), /* DI */
   COSTS_N_INSNS (17)}, /* other */
  COSTS_N_INSNS (1), /* cost of movsx */
  COSTS_N_INSNS (1), /* cost of movzx */
  8, /* "large" insn */
  6, /* MOVE_RATIO */
  2, /* cost for loading QImode using movzbl */
  {4, 4, 4}, /* cost of loading integer registers
                in QImode, HImode and SImode.
                Relative to reg-reg move (2).  */
  {2, 2, 2}, /* cost of storing integer registers */
  2, /* cost of reg,reg fld/fst */
  {2, 2, 6}, /* cost of loading fp registers
                in SFmode, DFmode and XFmode */
  {4, 4, 6}, /* cost of storing fp registers
                in SFmode, DFmode and XFmode */
  2, /* cost of moving MMX register */
  {2, 2}, /* cost of loading MMX registers
             in SImode and DImode */
  {2, 2}, /* cost of storing MMX registers
             in SImode and DImode */
  2, /* cost of moving SSE register */
  {2, 2, 8}, /* cost of loading SSE registers
                in SImode, DImode and TImode */
  {2, 2, 8}, /* cost of storing SSE registers
                in SImode, DImode and TImode */
  3, /* MMX or SSE register to integer */
  8, /* size of l1 cache.  */
  256, /* size of l2 cache */
  32, /* size of prefetch block */
  6, /* number of parallel prefetches */
  2, /* Branch cost */
  COSTS_N_INSNS (3), /* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (5), /* cost of FMUL instruction.  */
  COSTS_N_INSNS (56), /* cost of FDIV instruction.  */
  COSTS_N_INSNS (2), /* cost of FABS instruction.  */
  COSTS_N_INSNS (2), /* cost of FCHS instruction.  */
  COSTS_N_INSNS (56), /* cost of FSQRT instruction.  */
  /* PentiumPro has optimized rep instructions for blocks aligned by 8 bytes
     (we ensure the alignment).  For small blocks an inline loop is still a
     noticeable win, for bigger blocks either rep movsl or rep movsb is the
     way to go.  Rep movsb apparently has more expensive startup time in CPU,
     but after 4K the difference is down in the noise.  */
  {{rep_prefix_4_byte, {{128, loop}, {1024, unrolled_loop},
                        {8192, rep_prefix_4_byte}, {-1, rep_prefix_1_byte}}},
   DUMMY_STRINGOP_ALGS},
  {{rep_prefix_4_byte, {{1024, unrolled_loop},
                        {8192, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  1, /* scalar_stmt_cost.  */
  1, /* scalar load_cost.  */
  1, /* scalar_store_cost.  */
  1, /* vec_stmt_cost.  */
  1, /* vec_to_scalar_cost.  */
  1, /* scalar_to_vec_cost.  */
  1, /* vec_align_load_cost.  */
  2, /* vec_unalign_load_cost.  */
  1, /* vec_store_cost.  */
  3, /* cond_taken_branch_cost.  */
  1, /* cond_not_taken_branch_cost.  */
};

static const
struct processor_costs geode_cost = {
  COSTS_N_INSNS (1), /* cost of an add instruction */
  COSTS_N_INSNS (1), /* cost of a lea instruction */
  COSTS_N_INSNS (2), /* variable shift costs */
  COSTS_N_INSNS (1), /* constant shift costs */
  {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
   COSTS_N_INSNS (4), /* HI */
   COSTS_N_INSNS (7), /* SI */
   COSTS_N_INSNS (7), /* DI */
   COSTS_N_INSNS (7)}, /* other */
  0, /* cost of multiply per each bit set */
  {COSTS_N_INSNS (15), /* cost of a divide/mod for QI */
   COSTS_N_INSNS (23), /* HI */
   COSTS_N_INSNS (39), /* SI */
   COSTS_N_INSNS (39), /* DI */
   COSTS_N_INSNS (39)}, /* other */
  COSTS_N_INSNS (1), /* cost of movsx */
  COSTS_N_INSNS (1), /* cost of movzx */
  8, /* "large" insn */
  4, /* MOVE_RATIO */
  1, /* cost for loading QImode using movzbl */
  {1, 1, 1}, /* cost of loading integer registers
                in QImode, HImode and SImode.
                Relative to reg-reg move (2).  */
  {1, 1, 1}, /* cost of storing integer registers */
  1, /* cost of reg,reg fld/fst */
  {1, 1, 1}, /* cost of loading fp registers
                in SFmode, DFmode and XFmode */
  {4, 6, 6}, /* cost of storing fp registers
                in SFmode, DFmode and XFmode */

  1, /* cost of moving MMX register */
  {1, 1}, /* cost of loading MMX registers
             in SImode and DImode */
  {1, 1}, /* cost of storing MMX registers
             in SImode and DImode */
  1, /* cost of moving SSE register */
  {1, 1, 1}, /* cost of loading SSE registers
                in SImode, DImode and TImode */
  {1, 1, 1}, /* cost of storing SSE registers
                in SImode, DImode and TImode */
  1, /* MMX or SSE register to integer */
  64, /* size of l1 cache.  */
  128, /* size of l2 cache.  */
  32, /* size of prefetch block */
  1, /* number of parallel prefetches */
  1, /* Branch cost */
  COSTS_N_INSNS (6), /* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (11), /* cost of FMUL instruction.  */
  COSTS_N_INSNS (47), /* cost of FDIV instruction.  */
  COSTS_N_INSNS (1), /* cost of FABS instruction.  */
  COSTS_N_INSNS (1), /* cost of FCHS instruction.  */
  COSTS_N_INSNS (54), /* cost of FSQRT instruction.  */
  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  1, /* scalar_stmt_cost.  */
  1, /* scalar load_cost.  */
  1, /* scalar_store_cost.  */
  1, /* vec_stmt_cost.  */
  1, /* vec_to_scalar_cost.  */
  1, /* scalar_to_vec_cost.  */
  1, /* vec_align_load_cost.  */
  2, /* vec_unalign_load_cost.  */
  1, /* vec_store_cost.  */
  3, /* cond_taken_branch_cost.  */
  1, /* cond_not_taken_branch_cost.  */
};

static const
struct processor_costs k6_cost = {
  COSTS_N_INSNS (1), /* cost of an add instruction */
  COSTS_N_INSNS (2), /* cost of a lea instruction */
  COSTS_N_INSNS (1), /* variable shift costs */
  COSTS_N_INSNS (1), /* constant shift costs */
  {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
   COSTS_N_INSNS (3), /* HI */
   COSTS_N_INSNS (3), /* SI */
   COSTS_N_INSNS (3), /* DI */
   COSTS_N_INSNS (3)}, /* other */
  0, /* cost of multiply per each bit set */
  {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
   COSTS_N_INSNS (18), /* HI */
   COSTS_N_INSNS (18), /* SI */
   COSTS_N_INSNS (18), /* DI */
   COSTS_N_INSNS (18)}, /* other */
  COSTS_N_INSNS (2), /* cost of movsx */
  COSTS_N_INSNS (2), /* cost of movzx */
  8, /* "large" insn */
  4, /* MOVE_RATIO */
  3, /* cost for loading QImode using movzbl */
  {4, 5, 4}, /* cost of loading integer registers
                in QImode, HImode and SImode.
                Relative to reg-reg move (2).  */
  {2, 3, 2}, /* cost of storing integer registers */
  4, /* cost of reg,reg fld/fst */
  {6, 6, 6}, /* cost of loading fp registers
                in SFmode, DFmode and XFmode */
  {4, 4, 4}, /* cost of storing fp registers
                in SFmode, DFmode and XFmode */
  2, /* cost of moving MMX register */
  {2, 2}, /* cost of loading MMX registers
             in SImode and DImode */
  {2, 2}, /* cost of storing MMX registers
             in SImode and DImode */
  2, /* cost of moving SSE register */
  {2, 2, 8}, /* cost of loading SSE registers
                in SImode, DImode and TImode */
  {2, 2, 8}, /* cost of storing SSE registers
                in SImode, DImode and TImode */
  6, /* MMX or SSE register to integer */
  32, /* size of l1 cache.  */
  32, /* size of l2 cache.  Some models
         have integrated l2 cache, but
         optimizing for k6 is not important
         enough to worry about that.  */
  32, /* size of prefetch block */
  1, /* number of parallel prefetches */
  1, /* Branch cost */
  COSTS_N_INSNS (2), /* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (2), /* cost of FMUL instruction.  */
  COSTS_N_INSNS (56), /* cost of FDIV instruction.  */
  COSTS_N_INSNS (2), /* cost of FABS instruction.  */
  COSTS_N_INSNS (2), /* cost of FCHS instruction.  */
  COSTS_N_INSNS (56), /* cost of FSQRT instruction.  */
  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  1, /* scalar_stmt_cost.  */
  1, /* scalar load_cost.  */
  1, /* scalar_store_cost.  */
  1, /* vec_stmt_cost.  */
  1, /* vec_to_scalar_cost.  */
  1, /* scalar_to_vec_cost.  */
  1, /* vec_align_load_cost.  */
  2, /* vec_unalign_load_cost.  */
  1, /* vec_store_cost.  */
  3, /* cond_taken_branch_cost.  */
  1, /* cond_not_taken_branch_cost.  */
};

static const
struct processor_costs athlon_cost = {
  COSTS_N_INSNS (1), /* cost of an add instruction */
  COSTS_N_INSNS (2), /* cost of a lea instruction */
  COSTS_N_INSNS (1), /* variable shift costs */
  COSTS_N_INSNS (1), /* constant shift costs */
  {COSTS_N_INSNS (5), /* cost of starting multiply for QI */
   COSTS_N_INSNS (5), /* HI */
   COSTS_N_INSNS (5), /* SI */
   COSTS_N_INSNS (5), /* DI */
   COSTS_N_INSNS (5)}, /* other */
  0, /* cost of multiply per each bit set */
  {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
   COSTS_N_INSNS (26), /* HI */
   COSTS_N_INSNS (42), /* SI */
   COSTS_N_INSNS (74), /* DI */
   COSTS_N_INSNS (74)}, /* other */
  COSTS_N_INSNS (1), /* cost of movsx */
  COSTS_N_INSNS (1), /* cost of movzx */
  8, /* "large" insn */
  9, /* MOVE_RATIO */
  4, /* cost for loading QImode using movzbl */
  {3, 4, 3}, /* cost of loading integer registers
                in QImode, HImode and SImode.
                Relative to reg-reg move (2).  */
  {3, 4, 3}, /* cost of storing integer registers */
  4, /* cost of reg,reg fld/fst */
  {4, 4, 12}, /* cost of loading fp registers
                 in SFmode, DFmode and XFmode */
  {6, 6, 8}, /* cost of storing fp registers
                in SFmode, DFmode and XFmode */
  2, /* cost of moving MMX register */
  {4, 4}, /* cost of loading MMX registers
             in SImode and DImode */
  {4, 4}, /* cost of storing MMX registers
             in SImode and DImode */
  2, /* cost of moving SSE register */
  {4, 4, 6}, /* cost of loading SSE registers
                in SImode, DImode and TImode */
  {4, 4, 5}, /* cost of storing SSE registers
                in SImode, DImode and TImode */
  5, /* MMX or SSE register to integer */
  64, /* size of l1 cache.  */
  256, /* size of l2 cache.  */
  64, /* size of prefetch block */
  6, /* number of parallel prefetches */
  5, /* Branch cost */
  COSTS_N_INSNS (4), /* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (4), /* cost of FMUL instruction.  */
  COSTS_N_INSNS (24), /* cost of FDIV instruction.  */
  COSTS_N_INSNS (2), /* cost of FABS instruction.  */
  COSTS_N_INSNS (2), /* cost of FCHS instruction.  */
  COSTS_N_INSNS (35), /* cost of FSQRT instruction.  */
  /* For some reason, Athlon deals better with REP prefix (relative to loops)
     compared to K8.  Alignment becomes important after 8 bytes for memcpy and
     128 bytes for memset.  */
  {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  1, /* scalar_stmt_cost.  */
  1, /* scalar load_cost.  */
  1, /* scalar_store_cost.  */
  1, /* vec_stmt_cost.  */
  1, /* vec_to_scalar_cost.  */
  1, /* scalar_to_vec_cost.  */
  1, /* vec_align_load_cost.  */
  2, /* vec_unalign_load_cost.  */
  1, /* vec_store_cost.  */
  3, /* cond_taken_branch_cost.  */
  1, /* cond_not_taken_branch_cost.  */
};

static const
struct processor_costs k8_cost = {
  COSTS_N_INSNS (1), /* cost of an add instruction */
  COSTS_N_INSNS (2), /* cost of a lea instruction */
  COSTS_N_INSNS (1), /* variable shift costs */
  COSTS_N_INSNS (1), /* constant shift costs */
  {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
   COSTS_N_INSNS (4), /* HI */
   COSTS_N_INSNS (3), /* SI */
   COSTS_N_INSNS (4), /* DI */
   COSTS_N_INSNS (5)}, /* other */
  0, /* cost of multiply per each bit set */
  {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
   COSTS_N_INSNS (26), /* HI */
   COSTS_N_INSNS (42), /* SI */
   COSTS_N_INSNS (74), /* DI */
   COSTS_N_INSNS (74)}, /* other */
  COSTS_N_INSNS (1), /* cost of movsx */
  COSTS_N_INSNS (1), /* cost of movzx */
  8, /* "large" insn */
  9, /* MOVE_RATIO */
  4, /* cost for loading QImode using movzbl */
  {3, 4, 3}, /* cost of loading integer registers
                in QImode, HImode and SImode.
                Relative to reg-reg move (2).  */
  {3, 4, 3}, /* cost of storing integer registers */
  4, /* cost of reg,reg fld/fst */
  {4, 4, 12}, /* cost of loading fp registers
                 in SFmode, DFmode and XFmode */
  {6, 6, 8}, /* cost of storing fp registers
                in SFmode, DFmode and XFmode */
  2, /* cost of moving MMX register */
  {3, 3}, /* cost of loading MMX registers
             in SImode and DImode */
  {4, 4}, /* cost of storing MMX registers
             in SImode and DImode */
  2, /* cost of moving SSE register */
  {4, 3, 6}, /* cost of loading SSE registers
                in SImode, DImode and TImode */
  {4, 4, 5}, /* cost of storing SSE registers
                in SImode, DImode and TImode */
  5, /* MMX or SSE register to integer */
  64, /* size of l1 cache.  */
  512, /* size of l2 cache.  */
  64, /* size of prefetch block */
  /* New AMD processors never drop prefetches; if they cannot be performed
     immediately, they are queued.  We set number of simultaneous prefetches
     to a large constant to reflect this (it probably is not a good idea not
     to limit number of prefetches at all, as their execution also takes some
     time).  */
  100, /* number of parallel prefetches */
  3, /* Branch cost */
  COSTS_N_INSNS (4), /* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (4), /* cost of FMUL instruction.  */
  COSTS_N_INSNS (19), /* cost of FDIV instruction.  */
  COSTS_N_INSNS (2), /* cost of FABS instruction.  */
  COSTS_N_INSNS (2), /* cost of FCHS instruction.  */
  COSTS_N_INSNS (35), /* cost of FSQRT instruction.  */
  /* K8 has optimized REP instruction for medium sized blocks, but for very
     small blocks it is better to use loop.  For large blocks, libcall can
     do nontemporal accesses and beat inline considerably.  */
  {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
   {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  {{libcall, {{8, loop}, {24, unrolled_loop},
              {2048, rep_prefix_4_byte}, {-1, libcall}}},
   {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  4, /* scalar_stmt_cost.  */
  2, /* scalar load_cost.  */
  2, /* scalar_store_cost.  */
  5, /* vec_stmt_cost.  */
  0, /* vec_to_scalar_cost.  */
  2, /* scalar_to_vec_cost.  */
  2, /* vec_align_load_cost.  */
  3, /* vec_unalign_load_cost.  */
  3, /* vec_store_cost.  */
  3, /* cond_taken_branch_cost.  */
  2, /* cond_not_taken_branch_cost.  */
};

static const
struct processor_costs amdfam10_cost = {
  COSTS_N_INSNS (1), /* cost of an add instruction */
  COSTS_N_INSNS (2), /* cost of a lea instruction */
  COSTS_N_INSNS (1), /* variable shift costs */
  COSTS_N_INSNS (1), /* constant shift costs */
  {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
   COSTS_N_INSNS (4), /* HI */
   COSTS_N_INSNS (3), /* SI */
   COSTS_N_INSNS (4), /* DI */
   COSTS_N_INSNS (5)}, /* other */
  0, /* cost of multiply per each bit set */
  {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
   COSTS_N_INSNS (35), /* HI */
   COSTS_N_INSNS (51), /* SI */
   COSTS_N_INSNS (83), /* DI */
   COSTS_N_INSNS (83)}, /* other */
  COSTS_N_INSNS (1), /* cost of movsx */
  COSTS_N_INSNS (1), /* cost of movzx */
  8, /* "large" insn */
  9, /* MOVE_RATIO */
  4, /* cost for loading QImode using movzbl */
  {3, 4, 3}, /* cost of loading integer registers
                in QImode, HImode and SImode.
                Relative to reg-reg move (2).  */
  {3, 4, 3}, /* cost of storing integer registers */
  4, /* cost of reg,reg fld/fst */
  {4, 4, 12}, /* cost of loading fp registers
                 in SFmode, DFmode and XFmode */
  {6, 6, 8}, /* cost of storing fp registers
                in SFmode, DFmode and XFmode */
  2, /* cost of moving MMX register */
  {3, 3}, /* cost of loading MMX registers
             in SImode and DImode */
  {4, 4}, /* cost of storing MMX registers
             in SImode and DImode */
  2, /* cost of moving SSE register */
  {4, 4, 3}, /* cost of loading SSE registers
                in SImode, DImode and TImode */
  {4, 4, 5}, /* cost of storing SSE registers
                in SImode, DImode and TImode */
  3, /* MMX or SSE register to integer */
       /* On K8:
           MOVD reg64, xmmreg Double FSTORE 4
           MOVD reg32, xmmreg Double FSTORE 4
          On AMDFAM10:
           MOVD reg64, xmmreg Double FADD 3
                              1/1  1/1
           MOVD reg32, xmmreg Double FADD 3
                              1/1  1/1 */
  64, /* size of l1 cache.  */
  512, /* size of l2 cache.  */
  64, /* size of prefetch block */
  /* New AMD processors never drop prefetches; if they cannot be performed
     immediately, they are queued.  We set number of simultaneous prefetches
     to a large constant to reflect this (it probably is not a good idea not
     to limit number of prefetches at all, as their execution also takes some
     time).  */
  100, /* number of parallel prefetches */
  2, /* Branch cost */
  COSTS_N_INSNS (4), /* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (4), /* cost of FMUL instruction.  */
  COSTS_N_INSNS (19), /* cost of FDIV instruction.  */
  COSTS_N_INSNS (2), /* cost of FABS instruction.  */
  COSTS_N_INSNS (2), /* cost of FCHS instruction.  */
  COSTS_N_INSNS (35), /* cost of FSQRT instruction.  */

  /* AMDFAM10 has optimized REP instruction for medium sized blocks, but for
     very small blocks it is better to use loop.  For large blocks, libcall
     can do nontemporal accesses and beat inline considerably.  */
  {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
   {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  {{libcall, {{8, loop}, {24, unrolled_loop},
              {2048, rep_prefix_4_byte}, {-1, libcall}}},
   {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  4, /* scalar_stmt_cost.  */
  2, /* scalar load_cost.  */
  2, /* scalar_store_cost.  */
  6, /* vec_stmt_cost.  */
  0, /* vec_to_scalar_cost.  */
  2, /* scalar_to_vec_cost.  */
  2, /* vec_align_load_cost.  */
  2, /* vec_unalign_load_cost.  */
  2, /* vec_store_cost.  */
  2, /* cond_taken_branch_cost.  */
  1, /* cond_not_taken_branch_cost.  */
};

static const
struct processor_costs bdver1_cost = {
  COSTS_N_INSNS (1), /* cost of an add instruction */
  COSTS_N_INSNS (1), /* cost of a lea instruction */
  COSTS_N_INSNS (1), /* variable shift costs */
  COSTS_N_INSNS (1), /* constant shift costs */
  {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
   COSTS_N_INSNS (4), /* HI */
   COSTS_N_INSNS (4), /* SI */
   COSTS_N_INSNS (6), /* DI */
   COSTS_N_INSNS (6)}, /* other */
  0, /* cost of multiply per each bit set */
  {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
   COSTS_N_INSNS (35), /* HI */
   COSTS_N_INSNS (51), /* SI */
   COSTS_N_INSNS (83), /* DI */
   COSTS_N_INSNS (83)}, /* other */
  COSTS_N_INSNS (1), /* cost of movsx */
  COSTS_N_INSNS (1), /* cost of movzx */
  8, /* "large" insn */
  9, /* MOVE_RATIO */
  4, /* cost for loading QImode using movzbl */
  {5, 5, 4}, /* cost of loading integer registers
                in QImode, HImode and SImode.
                Relative to reg-reg move (2).  */
  {4, 4, 4}, /* cost of storing integer registers */
  2, /* cost of reg,reg fld/fst */
  {5, 5, 12}, /* cost of loading fp registers
                 in SFmode, DFmode and XFmode */
  {4, 4, 8}, /* cost of storing fp registers
                in SFmode, DFmode and XFmode */
  2, /* cost of moving MMX register */
  {4, 4}, /* cost of loading MMX registers
             in SImode and DImode */
  {4, 4}, /* cost of storing MMX registers
             in SImode and DImode */
  2, /* cost of moving SSE register */
  {4, 4, 4}, /* cost of loading SSE registers
                in SImode, DImode and TImode */
  {4, 4, 4}, /* cost of storing SSE registers
                in SImode, DImode and TImode */
  2, /* MMX or SSE register to integer */
       /* On K8:
           MOVD reg64, xmmreg Double FSTORE 4
           MOVD reg32, xmmreg Double FSTORE 4
          On AMDFAM10:
           MOVD reg64, xmmreg Double FADD 3
                              1/1  1/1
           MOVD reg32, xmmreg Double FADD 3
                              1/1  1/1 */
  16, /* size of l1 cache.  */
  2048, /* size of l2 cache.  */
  64, /* size of prefetch block */
  /* New AMD processors never drop prefetches; if they cannot be performed
     immediately, they are queued.  We set number of simultaneous prefetches
     to a large constant to reflect this (it probably is not a good idea not
     to limit number of prefetches at all, as their execution also takes some
     time).  */
  100, /* number of parallel prefetches */
  2, /* Branch cost */
  COSTS_N_INSNS (6), /* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (6), /* cost of FMUL instruction.  */
  COSTS_N_INSNS (42), /* cost of FDIV instruction.  */
  COSTS_N_INSNS (2), /* cost of FABS instruction.  */
  COSTS_N_INSNS (2), /* cost of FCHS instruction.  */
  COSTS_N_INSNS (52), /* cost of FSQRT instruction.  */

  /* BDVER1 has optimized REP instruction for medium sized blocks, but for
     very small blocks it is better to use loop.  For large blocks, libcall
     can do nontemporal accesses and beat inline considerably.  */
  {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
   {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  {{libcall, {{8, loop}, {24, unrolled_loop},
              {2048, rep_prefix_4_byte}, {-1, libcall}}},
   {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  6, /* scalar_stmt_cost.  */
  4, /* scalar load_cost.  */
  4, /* scalar_store_cost.  */
  6, /* vec_stmt_cost.  */
  0, /* vec_to_scalar_cost.  */
  2, /* scalar_to_vec_cost.  */
  4, /* vec_align_load_cost.  */
  4, /* vec_unalign_load_cost.  */
  4, /* vec_store_cost.  */
  2, /* cond_taken_branch_cost.  */
  1, /* cond_not_taken_branch_cost.  */
};

static const
struct processor_costs bdver2_cost = {
  COSTS_N_INSNS (1), /* cost of an add instruction */
  COSTS_N_INSNS (1), /* cost of a lea instruction */
  COSTS_N_INSNS (1), /* variable shift costs */
  COSTS_N_INSNS (1), /* constant shift costs */
  {COSTS_N_INSNS (4), /* cost of starting multiply for QI */
   COSTS_N_INSNS (4), /* HI */
   COSTS_N_INSNS (4), /* SI */
   COSTS_N_INSNS (6), /* DI */
   COSTS_N_INSNS (6)}, /* other */
  0, /* cost of multiply per each bit set */
  {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
   COSTS_N_INSNS (35), /* HI */
   COSTS_N_INSNS (51), /* SI */
   COSTS_N_INSNS (83), /* DI */
   COSTS_N_INSNS (83)}, /* other */
  COSTS_N_INSNS (1), /* cost of movsx */
  COSTS_N_INSNS (1), /* cost of movzx */
  8, /* "large" insn */
  9, /* MOVE_RATIO */
  4, /* cost for loading QImode using movzbl */
  {5, 5, 4}, /* cost of loading integer registers
                in QImode, HImode and SImode.
                Relative to reg-reg move (2).  */
  {4, 4, 4}, /* cost of storing integer registers */
  2, /* cost of reg,reg fld/fst */
  {5, 5, 12}, /* cost of loading fp registers
                 in SFmode, DFmode and XFmode */
  {4, 4, 8}, /* cost of storing fp registers
                in SFmode, DFmode and XFmode */
  2, /* cost of moving MMX register */
  {4, 4}, /* cost of loading MMX registers
             in SImode and DImode */
  {4, 4}, /* cost of storing MMX registers
             in SImode and DImode */
  2, /* cost of moving SSE register */
  {4, 4, 4}, /* cost of loading SSE registers
                in SImode, DImode and TImode */
  {4, 4, 4}, /* cost of storing SSE registers
                in SImode, DImode and TImode */
  2, /* MMX or SSE register to integer */
       /* On K8:
           MOVD reg64, xmmreg Double FSTORE 4
           MOVD reg32, xmmreg Double FSTORE 4
          On AMDFAM10:
           MOVD reg64, xmmreg Double FADD 3
                              1/1  1/1
           MOVD reg32, xmmreg Double FADD 3
                              1/1  1/1 */
  16, /* size of l1 cache.  */
  2048, /* size of l2 cache.  */
  64, /* size of prefetch block */
  /* New AMD processors never drop prefetches; if they cannot be performed
     immediately, they are queued.  We set number of simultaneous prefetches
     to a large constant to reflect this (it probably is not a good idea not
     to limit number of prefetches at all, as their execution also takes some
     time).  */
  100, /* number of parallel prefetches */
  2, /* Branch cost */
  COSTS_N_INSNS (6), /* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (6), /* cost of FMUL instruction.  */
  COSTS_N_INSNS (42), /* cost of FDIV instruction.  */
  COSTS_N_INSNS (2), /* cost of FABS instruction.  */
  COSTS_N_INSNS (2), /* cost of FCHS instruction.  */
  COSTS_N_INSNS (52), /* cost of FSQRT instruction.  */

  /* BDVER2 has optimized REP instruction for medium sized blocks, but for
     very small blocks it is better to use loop.  For large blocks, libcall
     can do nontemporal accesses and beat inline considerably.  */
  {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
   {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  {{libcall, {{8, loop}, {24, unrolled_loop},
              {2048, rep_prefix_4_byte}, {-1, libcall}}},
   {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  6, /* scalar_stmt_cost.  */
  4, /* scalar load_cost.  */
  4, /* scalar_store_cost.  */
  6, /* vec_stmt_cost.  */
  0, /* vec_to_scalar_cost.  */
  2, /* scalar_to_vec_cost.  */
  4, /* vec_align_load_cost.  */
  4, /* vec_unalign_load_cost.  */
  4, /* vec_store_cost.  */
  2, /* cond_taken_branch_cost.  */
  1, /* cond_not_taken_branch_cost.  */
};

static const
struct processor_costs btver1_cost = {
  COSTS_N_INSNS (1), /* cost of an add instruction */
  COSTS_N_INSNS (2), /* cost of a lea instruction */
  COSTS_N_INSNS (1), /* variable shift costs */
  COSTS_N_INSNS (1), /* constant shift costs */
  {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
   COSTS_N_INSNS (4), /* HI */
   COSTS_N_INSNS (3), /* SI */
   COSTS_N_INSNS (4), /* DI */
   COSTS_N_INSNS (5)}, /* other */
  0, /* cost of multiply per each bit set */
  {COSTS_N_INSNS (19), /* cost of a divide/mod for QI */
   COSTS_N_INSNS (35), /* HI */
   COSTS_N_INSNS (51), /* SI */
   COSTS_N_INSNS (83), /* DI */
   COSTS_N_INSNS (83)}, /* other */
  COSTS_N_INSNS (1), /* cost of movsx */
  COSTS_N_INSNS (1), /* cost of movzx */
  8, /* "large" insn */
  9, /* MOVE_RATIO */
  4, /* cost for loading QImode using movzbl */
  {3, 4, 3}, /* cost of loading integer registers
                in QImode, HImode and SImode.
                Relative to reg-reg move (2).  */
  {3, 4, 3}, /* cost of storing integer registers */
  4, /* cost of reg,reg fld/fst */
  {4, 4, 12}, /* cost of loading fp registers
                 in SFmode, DFmode and XFmode */
  {6, 6, 8}, /* cost of storing fp registers
                in SFmode, DFmode and XFmode */
  2, /* cost of moving MMX register */
  {3, 3}, /* cost of loading MMX registers
             in SImode and DImode */
  {4, 4}, /* cost of storing MMX registers
             in SImode and DImode */
  2, /* cost of moving SSE register */
  {4, 4, 3}, /* cost of loading SSE registers
                in SImode, DImode and TImode */
  {4, 4, 5}, /* cost of storing SSE registers
                in SImode, DImode and TImode */
  3, /* MMX or SSE register to integer */
       /* On K8:
           MOVD reg64, xmmreg Double FSTORE 4
           MOVD reg32, xmmreg Double FSTORE 4
          On AMDFAM10:
           MOVD reg64, xmmreg Double FADD 3
                              1/1  1/1
           MOVD reg32, xmmreg Double FADD 3
                              1/1  1/1 */
  32, /* size of l1 cache.  */
  512, /* size of l2 cache.  */
  64, /* size of prefetch block */
  100, /* number of parallel prefetches */
  2, /* Branch cost */
  COSTS_N_INSNS (4), /* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (4), /* cost of FMUL instruction.  */
  COSTS_N_INSNS (19), /* cost of FDIV instruction.  */
  COSTS_N_INSNS (2), /* cost of FABS instruction.  */
  COSTS_N_INSNS (2), /* cost of FCHS instruction.  */
  COSTS_N_INSNS (35), /* cost of FSQRT instruction.  */

  /* BTVER1 has optimized REP instruction for medium sized blocks, but for
     very small blocks it is better to use loop.  For large blocks, libcall
     can do nontemporal accesses and beat inline considerably.  */
  {{libcall, {{6, loop}, {14, unrolled_loop}, {-1, rep_prefix_4_byte}}},
   {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  {{libcall, {{8, loop}, {24, unrolled_loop},
              {2048, rep_prefix_4_byte}, {-1, libcall}}},
   {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  4, /* scalar_stmt_cost.  */
  2, /* scalar load_cost.  */
  2, /* scalar_store_cost.  */
  6, /* vec_stmt_cost.  */
  0, /* vec_to_scalar_cost.  */
  2, /* scalar_to_vec_cost.  */
  2, /* vec_align_load_cost.  */
  2, /* vec_unalign_load_cost.  */
  2, /* vec_store_cost.  */
  2, /* cond_taken_branch_cost.  */
  1, /* cond_not_taken_branch_cost.  */
};

static const
struct processor_costs pentium4_cost = {
  COSTS_N_INSNS (1), /* cost of an add instruction */
  COSTS_N_INSNS (3), /* cost of a lea instruction */
  COSTS_N_INSNS (4), /* variable shift costs */
  COSTS_N_INSNS (4), /* constant shift costs */
  {COSTS_N_INSNS (15), /* cost of starting multiply for QI */
   COSTS_N_INSNS (15), /* HI */
   COSTS_N_INSNS (15), /* SI */
   COSTS_N_INSNS (15), /* DI */
   COSTS_N_INSNS (15)}, /* other */
  0, /* cost of multiply per each bit set */
  {COSTS_N_INSNS (56), /* cost of a divide/mod for QI */
   COSTS_N_INSNS (56), /* HI */
   COSTS_N_INSNS (56), /* SI */
   COSTS_N_INSNS (56), /* DI */
   COSTS_N_INSNS (56)}, /* other */
  COSTS_N_INSNS (1), /* cost of movsx */
  COSTS_N_INSNS (1), /* cost of movzx */
  16, /* "large" insn */
  6, /* MOVE_RATIO */
  2, /* cost for loading QImode using movzbl */
  {4, 5, 4}, /* cost of loading integer registers
                in QImode, HImode and SImode.
                Relative to reg-reg move (2).  */
  {2, 3, 2}, /* cost of storing integer registers */
  2, /* cost of reg,reg fld/fst */
  {2, 2, 6}, /* cost of loading fp registers
                in SFmode, DFmode and XFmode */
  {4, 4, 6}, /* cost of storing fp registers
                in SFmode, DFmode and XFmode */
  2, /* cost of moving MMX register */
  {2, 2}, /* cost of loading MMX registers
             in SImode and DImode */
  {2, 2}, /* cost of storing MMX registers
             in SImode and DImode */
  12, /* cost of moving SSE register */
  {12, 12, 12}, /* cost of loading SSE registers
                   in SImode, DImode and TImode */
  {2, 2, 8}, /* cost of storing SSE registers
                in SImode, DImode and TImode */
  10, /* MMX or SSE register to integer */
  8, /* size of l1 cache.  */
  256, /* size of l2 cache.  */
  64, /* size of prefetch block */
  6, /* number of parallel prefetches */
  2, /* Branch cost */
  COSTS_N_INSNS (5), /* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (7), /* cost of FMUL instruction.  */
  COSTS_N_INSNS (43), /* cost of FDIV instruction.  */
  COSTS_N_INSNS (2), /* cost of FABS instruction.  */
  COSTS_N_INSNS (2), /* cost of FCHS instruction.  */
  COSTS_N_INSNS (43), /* cost of FSQRT instruction.  */
  {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
   DUMMY_STRINGOP_ALGS},
  {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
   {-1, libcall}}},
   DUMMY_STRINGOP_ALGS},
  1, /* scalar_stmt_cost.  */
  1, /* scalar load_cost.  */
  1, /* scalar_store_cost.  */
  1, /* vec_stmt_cost.  */
  1, /* vec_to_scalar_cost.  */
  1, /* scalar_to_vec_cost.  */
  1, /* vec_align_load_cost.  */
  2, /* vec_unalign_load_cost.  */
  1, /* vec_store_cost.  */
  3, /* cond_taken_branch_cost.  */
  1, /* cond_not_taken_branch_cost.  */
};

static const
struct processor_costs nocona_cost = {
  COSTS_N_INSNS (1), /* cost of an add instruction */
  COSTS_N_INSNS (1), /* cost of a lea instruction */
  COSTS_N_INSNS (1), /* variable shift costs */
  COSTS_N_INSNS (1), /* constant shift costs */
  {COSTS_N_INSNS (10), /* cost of starting multiply for QI */
   COSTS_N_INSNS (10), /* HI */
   COSTS_N_INSNS (10), /* SI */
   COSTS_N_INSNS (10), /* DI */
   COSTS_N_INSNS (10)}, /* other */
  0, /* cost of multiply per each bit set */
  {COSTS_N_INSNS (66), /* cost of a divide/mod for QI */
   COSTS_N_INSNS (66), /* HI */
   COSTS_N_INSNS (66), /* SI */
   COSTS_N_INSNS (66), /* DI */
   COSTS_N_INSNS (66)}, /* other */
  COSTS_N_INSNS (1), /* cost of movsx */
  COSTS_N_INSNS (1), /* cost of movzx */
  16, /* "large" insn */
  17, /* MOVE_RATIO */
  4, /* cost for loading QImode using movzbl */
  {4, 4, 4}, /* cost of loading integer registers
                in QImode, HImode and SImode.
                Relative to reg-reg move (2).  */
  {4, 4, 4}, /* cost of storing integer registers */
  3, /* cost of reg,reg fld/fst */
  {12, 12, 12}, /* cost of loading fp registers
                   in SFmode, DFmode and XFmode */
  {4, 4, 4}, /* cost of storing fp registers
                in SFmode, DFmode and XFmode */
  6, /* cost of moving MMX register */
  {12, 12}, /* cost of loading MMX registers
               in SImode and DImode */
  {12, 12}, /* cost of storing MMX registers
               in SImode and DImode */
  6, /* cost of moving SSE register */
  {12, 12, 12}, /* cost of loading SSE registers
                   in SImode, DImode and TImode */
  {12, 12, 12}, /* cost of storing SSE registers
                   in SImode, DImode and TImode */
  8, /* MMX or SSE register to integer */
  8, /* size of l1 cache.  */
  1024, /* size of l2 cache.  */
  128, /* size of prefetch block */
  8, /* number of parallel prefetches */
  1, /* Branch cost */
  COSTS_N_INSNS (6), /* cost of FADD and FSUB insns.  */
  COSTS_N_INSNS (8), /* cost of FMUL instruction.  */
  COSTS_N_INSNS (40), /* cost of FDIV instruction.  */
  COSTS_N_INSNS (3), /* cost of FABS instruction.  */
  COSTS_N_INSNS (3), /* cost of FCHS instruction.  */
  COSTS_N_INSNS (44), /* cost of FSQRT instruction.  */
  {{libcall, {{12, loop_1_byte}, {-1, rep_prefix_4_byte}}},
   {libcall, {{32, loop}, {20000, rep_prefix_8_byte},
              {100000, unrolled_loop}, {-1, libcall}}}},
  {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
   {-1, libcall}}},
   {libcall, {{24, loop}, {64, unrolled_loop},
              {8192, rep_prefix_8_byte}, {-1, libcall}}}},
  1, /* scalar_stmt_cost.  */
  1, /* scalar load_cost.  */
  1, /* scalar_store_cost.  */
  1, /* vec_stmt_cost.  */
  1, /* vec_to_scalar_cost.  */
  1, /* scalar_to_vec_cost.  */
  1, /* vec_align_load_cost.  */
  2, /* vec_unalign_load_cost.  */
  1, /* vec_store_cost.  */
  3, /* cond_taken_branch_cost.  */
  1, /* cond_not_taken_branch_cost.  */
};

static const
1655 struct processor_costs atom_cost = {
1656 COSTS_N_INSNS (1), /* cost of an add instruction */
1657 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1658 COSTS_N_INSNS (1), /* variable shift costs */
1659 COSTS_N_INSNS (1), /* constant shift costs */
1660 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1661 COSTS_N_INSNS (4), /* HI */
1662 COSTS_N_INSNS (3), /* SI */
1663 COSTS_N_INSNS (4), /* DI */
1664 COSTS_N_INSNS (2)}, /* other */
1665 0, /* cost of multiply per each bit set */
1666 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1667 COSTS_N_INSNS (26), /* HI */
1668 COSTS_N_INSNS (42), /* SI */
1669 COSTS_N_INSNS (74), /* DI */
1670 COSTS_N_INSNS (74)}, /* other */
1671 COSTS_N_INSNS (1), /* cost of movsx */
1672 COSTS_N_INSNS (1), /* cost of movzx */
1673 8, /* "large" insn */
1674 17, /* MOVE_RATIO */
1675 2, /* cost for loading QImode using movzbl */
1676 {4, 4, 4}, /* cost of loading integer registers
1677 in QImode, HImode and SImode.
1678 Relative to reg-reg move (2). */
1679 {4, 4, 4}, /* cost of storing integer registers */
1680 4, /* cost of reg,reg fld/fst */
1681 {12, 12, 12}, /* cost of loading fp registers
1682 in SFmode, DFmode and XFmode */
1683 {6, 6, 8}, /* cost of storing fp registers
1684 in SFmode, DFmode and XFmode */
1685 2, /* cost of moving MMX register */
1686 {8, 8}, /* cost of loading MMX registers
1687 in SImode and DImode */
1688 {8, 8}, /* cost of storing MMX registers
1689 in SImode and DImode */
1690 2, /* cost of moving SSE register */
1691 {8, 8, 8}, /* cost of loading SSE registers
1692 in SImode, DImode and TImode */
1693 {8, 8, 8}, /* cost of storing SSE registers
1694 in SImode, DImode and TImode */
1695 5, /* MMX or SSE register to integer */
1696 32, /* size of l1 cache. */
1697 256, /* size of l2 cache. */
1698 64, /* size of prefetch block */
1699 6, /* number of parallel prefetches */
1700 3, /* Branch cost */
1701 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1702 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1703 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1704 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1705 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1706 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1707 {{libcall, {{11, loop}, {-1, rep_prefix_4_byte}}},
1708 {libcall, {{32, loop}, {64, rep_prefix_4_byte},
1709 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1710 {{libcall, {{8, loop}, {15, unrolled_loop},
1711 {2048, rep_prefix_4_byte}, {-1, libcall}}},
1712 {libcall, {{24, loop}, {32, unrolled_loop},
1713 {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1714 1, /* scalar_stmt_cost. */
1715 1, /* scalar load_cost. */
1716 1, /* scalar_store_cost. */
1717 1, /* vec_stmt_cost. */
1718 1, /* vec_to_scalar_cost. */
1719 1, /* scalar_to_vec_cost. */
1720 1, /* vec_align_load_cost. */
1721 2, /* vec_unalign_load_cost. */
1722 1, /* vec_store_cost. */
1723 3, /* cond_taken_branch_cost. */
  1,					/* cond_not_taken_branch_cost.  */
};
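/* Illustrative note (not part of GCC itself): each stringop strategy
   table above has the shape {unknown_size_alg, {{max_size, alg}, ...}}:
   the leading algorithm is used when the copy/set size is not known at
   compile time; otherwise the first {max_size, alg} pair whose max_size
   covers the request picks the algorithm, with -1 meaning "any size".
   E.g. with atom_cost's 64-bit memcpy table
     {libcall, {{32, loop}, {64, rep_prefix_4_byte},
		{8192, rep_prefix_8_byte}, {-1, libcall}}}
   a 24-byte copy expands as an inline loop, an 8192-byte copy uses a
   rep-prefixed 8-byte move, and anything larger calls the library.  */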
1727 /* Generic64 should produce code tuned for Nocona and K8. */
1729 struct processor_costs generic64_cost = {
1730 COSTS_N_INSNS (1), /* cost of an add instruction */
1731 /* On all chips taken into consideration lea is 2 cycles and more. With
1732 this cost however our current implementation of synth_mult results in
1733 use of unnecessary temporary registers causing regression on several
1734 SPECfp benchmarks. */
1735 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1736 COSTS_N_INSNS (1), /* variable shift costs */
1737 COSTS_N_INSNS (1), /* constant shift costs */
1738 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1739 COSTS_N_INSNS (4), /* HI */
1740 COSTS_N_INSNS (3), /* SI */
1741 COSTS_N_INSNS (4), /* DI */
1742 COSTS_N_INSNS (2)}, /* other */
1743 0, /* cost of multiply per each bit set */
1744 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1745 COSTS_N_INSNS (26), /* HI */
1746 COSTS_N_INSNS (42), /* SI */
1747 COSTS_N_INSNS (74), /* DI */
1748 COSTS_N_INSNS (74)}, /* other */
1749 COSTS_N_INSNS (1), /* cost of movsx */
1750 COSTS_N_INSNS (1), /* cost of movzx */
1751 8, /* "large" insn */
1752 17, /* MOVE_RATIO */
1753 4, /* cost for loading QImode using movzbl */
1754 {4, 4, 4}, /* cost of loading integer registers
1755 in QImode, HImode and SImode.
1756 Relative to reg-reg move (2). */
1757 {4, 4, 4}, /* cost of storing integer registers */
1758 4, /* cost of reg,reg fld/fst */
1759 {12, 12, 12}, /* cost of loading fp registers
1760 in SFmode, DFmode and XFmode */
1761 {6, 6, 8}, /* cost of storing fp registers
1762 in SFmode, DFmode and XFmode */
1763 2, /* cost of moving MMX register */
1764 {8, 8}, /* cost of loading MMX registers
1765 in SImode and DImode */
1766 {8, 8}, /* cost of storing MMX registers
1767 in SImode and DImode */
1768 2, /* cost of moving SSE register */
1769 {8, 8, 8}, /* cost of loading SSE registers
1770 in SImode, DImode and TImode */
1771 {8, 8, 8}, /* cost of storing SSE registers
1772 in SImode, DImode and TImode */
1773 5, /* MMX or SSE register to integer */
1774 32, /* size of l1 cache. */
1775 512, /* size of l2 cache. */
1776 64, /* size of prefetch block */
1777 6, /* number of parallel prefetches */
  /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
     value is increased to the perhaps more appropriate value of 5.  */
1780 3, /* Branch cost */
1781 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1782 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1783 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1784 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1785 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1786 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1787 {DUMMY_STRINGOP_ALGS,
1788 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1789 {DUMMY_STRINGOP_ALGS,
1790 {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
1791 1, /* scalar_stmt_cost. */
1792 1, /* scalar load_cost. */
1793 1, /* scalar_store_cost. */
1794 1, /* vec_stmt_cost. */
1795 1, /* vec_to_scalar_cost. */
1796 1, /* scalar_to_vec_cost. */
1797 1, /* vec_align_load_cost. */
1798 2, /* vec_unalign_load_cost. */
1799 1, /* vec_store_cost. */
1800 3, /* cond_taken_branch_cost. */
  1,					/* cond_not_taken_branch_cost.  */
};

/* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
   Core 2 and K8.  */
1807 struct processor_costs generic32_cost = {
1808 COSTS_N_INSNS (1), /* cost of an add instruction */
1809 COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */
1810 COSTS_N_INSNS (1), /* variable shift costs */
1811 COSTS_N_INSNS (1), /* constant shift costs */
1812 {COSTS_N_INSNS (3), /* cost of starting multiply for QI */
1813 COSTS_N_INSNS (4), /* HI */
1814 COSTS_N_INSNS (3), /* SI */
1815 COSTS_N_INSNS (4), /* DI */
1816 COSTS_N_INSNS (2)}, /* other */
1817 0, /* cost of multiply per each bit set */
1818 {COSTS_N_INSNS (18), /* cost of a divide/mod for QI */
1819 COSTS_N_INSNS (26), /* HI */
1820 COSTS_N_INSNS (42), /* SI */
1821 COSTS_N_INSNS (74), /* DI */
1822 COSTS_N_INSNS (74)}, /* other */
1823 COSTS_N_INSNS (1), /* cost of movsx */
1824 COSTS_N_INSNS (1), /* cost of movzx */
1825 8, /* "large" insn */
1826 17, /* MOVE_RATIO */
1827 4, /* cost for loading QImode using movzbl */
1828 {4, 4, 4}, /* cost of loading integer registers
1829 in QImode, HImode and SImode.
1830 Relative to reg-reg move (2). */
1831 {4, 4, 4}, /* cost of storing integer registers */
1832 4, /* cost of reg,reg fld/fst */
1833 {12, 12, 12}, /* cost of loading fp registers
1834 in SFmode, DFmode and XFmode */
1835 {6, 6, 8}, /* cost of storing fp registers
1836 in SFmode, DFmode and XFmode */
1837 2, /* cost of moving MMX register */
1838 {8, 8}, /* cost of loading MMX registers
1839 in SImode and DImode */
1840 {8, 8}, /* cost of storing MMX registers
1841 in SImode and DImode */
1842 2, /* cost of moving SSE register */
1843 {8, 8, 8}, /* cost of loading SSE registers
1844 in SImode, DImode and TImode */
1845 {8, 8, 8}, /* cost of storing SSE registers
1846 in SImode, DImode and TImode */
1847 5, /* MMX or SSE register to integer */
1848 32, /* size of l1 cache. */
1849 256, /* size of l2 cache. */
1850 64, /* size of prefetch block */
1851 6, /* number of parallel prefetches */
1852 3, /* Branch cost */
1853 COSTS_N_INSNS (8), /* cost of FADD and FSUB insns. */
1854 COSTS_N_INSNS (8), /* cost of FMUL instruction. */
1855 COSTS_N_INSNS (20), /* cost of FDIV instruction. */
1856 COSTS_N_INSNS (8), /* cost of FABS instruction. */
1857 COSTS_N_INSNS (8), /* cost of FCHS instruction. */
1858 COSTS_N_INSNS (40), /* cost of FSQRT instruction. */
1859 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1860 DUMMY_STRINGOP_ALGS},
1861 {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
1862 DUMMY_STRINGOP_ALGS},
1863 1, /* scalar_stmt_cost. */
1864 1, /* scalar load_cost. */
1865 1, /* scalar_store_cost. */
1866 1, /* vec_stmt_cost. */
1867 1, /* vec_to_scalar_cost. */
1868 1, /* scalar_to_vec_cost. */
1869 1, /* vec_align_load_cost. */
1870 2, /* vec_unalign_load_cost. */
1871 1, /* vec_store_cost. */
1872 3, /* cond_taken_branch_cost. */
  1,					/* cond_not_taken_branch_cost.  */
};
1876 const struct processor_costs *ix86_cost = &pentium_cost;
1878 /* Processor feature/optimization bitmasks. */
1879 #define m_386 (1<<PROCESSOR_I386)
1880 #define m_486 (1<<PROCESSOR_I486)
1881 #define m_PENT (1<<PROCESSOR_PENTIUM)
1882 #define m_PPRO (1<<PROCESSOR_PENTIUMPRO)
1883 #define m_PENT4 (1<<PROCESSOR_PENTIUM4)
1884 #define m_NOCONA (1<<PROCESSOR_NOCONA)
1885 #define m_P4_NOCONA (m_PENT4 | m_NOCONA)
1886 #define m_CORE2_32 (1<<PROCESSOR_CORE2_32)
1887 #define m_CORE2_64 (1<<PROCESSOR_CORE2_64)
1888 #define m_COREI7_32 (1<<PROCESSOR_COREI7_32)
1889 #define m_COREI7_64 (1<<PROCESSOR_COREI7_64)
1890 #define m_COREI7 (m_COREI7_32 | m_COREI7_64)
1891 #define m_CORE2I7_32 (m_CORE2_32 | m_COREI7_32)
1892 #define m_CORE2I7_64 (m_CORE2_64 | m_COREI7_64)
1893 #define m_CORE2I7 (m_CORE2I7_32 | m_CORE2I7_64)
1894 #define m_ATOM (1<<PROCESSOR_ATOM)
1896 #define m_GEODE (1<<PROCESSOR_GEODE)
1897 #define m_K6 (1<<PROCESSOR_K6)
1898 #define m_K6_GEODE (m_K6 | m_GEODE)
1899 #define m_K8 (1<<PROCESSOR_K8)
1900 #define m_ATHLON (1<<PROCESSOR_ATHLON)
1901 #define m_ATHLON_K8 (m_K8 | m_ATHLON)
1902 #define m_AMDFAM10 (1<<PROCESSOR_AMDFAM10)
1903 #define m_BDVER1 (1<<PROCESSOR_BDVER1)
1904 #define m_BDVER2 (1<<PROCESSOR_BDVER2)
1905 #define m_BDVER (m_BDVER1 | m_BDVER2)
1906 #define m_BTVER1 (1<<PROCESSOR_BTVER1)
1907 #define m_AMD_MULTIPLE (m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1)
1909 #define m_GENERIC32 (1<<PROCESSOR_GENERIC32)
1910 #define m_GENERIC64 (1<<PROCESSOR_GENERIC64)
1912 /* Generic instruction choice should be common subset of supported CPUs
1913 (PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
1914 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
1916 /* Feature tests against the various tunings. */
1917 unsigned char ix86_tune_features[X86_TUNE_LAST];
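/* Illustrative sketch (not part of GCC itself): every m_* macro above
   is a one-bit-per-processor set, so a tuning in the table below applies
   to the -mtune CPU exactly when its bit survives the AND:

     unsigned int tune_mask = 1u << ix86_tune;
     ix86_tune_features[X86_TUNE_SCHEDULE]
       = !!(initial_ix86_tune_features[X86_TUNE_SCHEDULE] & tune_mask);

   which is what the loop over X86_TUNE_LAST further below computes;
   i386.h then wraps each entry in a macro along the lines of

     #define TARGET_SCHEDULE ix86_tune_features[X86_TUNE_SCHEDULE]  */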
1919 /* Feature tests against the various tunings used to create ix86_tune_features
1920 based on the processor mask. */
1921 static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = {
  /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
     negatively, so enabling it for Generic64 seems like a good code-size
     tradeoff.  We can't enable it for 32-bit generic because it does not
     work well with PPro-based chips.  */
1926 m_386 | m_CORE2I7_64 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64,
1928 /* X86_TUNE_PUSH_MEMORY */
1929 m_386 | m_P4_NOCONA | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1931 /* X86_TUNE_ZERO_EXTEND_WITH_AND */
1934 /* X86_TUNE_UNROLL_STRLEN */
1935 m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE2I7 | m_K6 | m_AMD_MULTIPLE | m_GENERIC,
  /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
     on simulation results.  But after P4 was released, no performance
     benefit was observed from branch hints, and they also increase code
     size.  As a result, icc never generates branch hints.  */
1943 /* X86_TUNE_DOUBLE_WITH_ADD */
1946 /* X86_TUNE_USE_SAHF */
1947 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC,
  /* X86_TUNE_MOVX: Enable zero-extension of integer registers to avoid
     partial dependencies.  */
1951 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC,
  /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
     register stalls on the Generic32 compilation setting as well.  However,
     in the current implementation partial register stalls are not
     eliminated very well - they can be introduced via subregs synthesized
     by combine and can happen in caller/callee saving sequences.  Because
     this option pays back little on PPro-based chips and conflicts with
     the partial register dependencies used by Athlon/P4-based chips, it is
     better to leave it off for generic32 for now.  */
1963 /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
1964 m_CORE2I7 | m_GENERIC,
1966 /* X86_TUNE_USE_HIMODE_FIOP */
1967 m_386 | m_486 | m_K6_GEODE,
1969 /* X86_TUNE_USE_SIMODE_FIOP */
1970 ~(m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC),
1972 /* X86_TUNE_USE_MOV0 */
1975 /* X86_TUNE_USE_CLTD */
1976 ~(m_PENT | m_CORE2I7 | m_ATOM | m_K6 | m_GENERIC),
1978 /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
1981 /* X86_TUNE_SPLIT_LONG_MOVES */
1984 /* X86_TUNE_READ_MODIFY_WRITE */
1987 /* X86_TUNE_READ_MODIFY */
1990 /* X86_TUNE_PROMOTE_QIMODE */
1991 m_386 | m_486 | m_PENT | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
1993 /* X86_TUNE_FAST_PREFIX */
1994 ~(m_386 | m_486 | m_PENT),
1996 /* X86_TUNE_SINGLE_STRINGOP */
1997 m_386 | m_P4_NOCONA,
1999 /* X86_TUNE_QIMODE_MATH */
  /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
     register stalls.  Just like X86_TUNE_PARTIAL_REG_STALL this option
     might be considered for Generic32 if our scheme for avoiding partial
     stalls were more effective.  */
2008 /* X86_TUNE_PROMOTE_QI_REGS */
2011 /* X86_TUNE_PROMOTE_HI_REGS */
2014 /* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred
2015 over esp addition. */
2016 m_386 | m_486 | m_PENT | m_PPRO,
2018 /* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred
2019 over esp addition. */
2022 /* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred
2023 over esp subtraction. */
2024 m_386 | m_486 | m_PENT | m_K6_GEODE,
  /* X86_TUNE_DOUBLE_PUSH: Enable if double push insn is preferred
     over esp subtraction.  */
2028 m_PENT | m_K6_GEODE,
2030 /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
2031 for DFmode copies */
  ~(m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC),
2034 /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
2035 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
  /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
     conflict here between PPro/Pentium4-based chips that treat 128-bit
     SSE registers as single units and K8-based chips that divide SSE
     registers into two 64-bit halves.  This knob promotes all store
     destinations to be 128-bit so as to allow register renaming on
     128-bit SSE units, but usually results in one extra microop on
     64-bit SSE units.  Experimental results show that disabling this
     option on P4 brings over a 20% SPECfp regression, while enabling it
     on K8 brings roughly a 2.4% regression that can be partly masked by
     careful scheduling of moves.  */
2046 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC,
2048 /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */
2049 m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER1,
2051 /* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL */
2054 /* X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL */
  /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
     are resolved on SSE register parts instead of whole registers, so we may
     maintain just the lower part of scalar values in proper format, leaving
     the upper part undefined.  */
2063 /* X86_TUNE_SSE_TYPELESS_STORES */
2066 /* X86_TUNE_SSE_LOAD0_BY_PXOR */
2067 m_PPRO | m_P4_NOCONA,
2069 /* X86_TUNE_MEMORY_MISMATCH_STALL */
2070 m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2072 /* X86_TUNE_PROLOGUE_USING_MOVE */
2073 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2075 /* X86_TUNE_EPILOGUE_USING_MOVE */
2076 m_PPRO | m_CORE2I7 | m_ATOM | m_ATHLON_K8 | m_GENERIC,
2078 /* X86_TUNE_SHIFT1 */
2081 /* X86_TUNE_USE_FFREEP */
2084 /* X86_TUNE_INTER_UNIT_MOVES */
2085 ~(m_AMD_MULTIPLE | m_GENERIC),
2087 /* X86_TUNE_INTER_UNIT_CONVERSIONS */
  ~(m_AMDFAM10 | m_BDVER),
2090 /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
2091 than 4 branch instructions in the 16 byte window. */
2092 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2094 /* X86_TUNE_SCHEDULE */
2095 m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC,
2097 /* X86_TUNE_USE_BT */
2098 m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC,
2100 /* X86_TUNE_USE_INCDEC */
2101 ~(m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GENERIC),
2103 /* X86_TUNE_PAD_RETURNS */
2104 m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC,
  /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short functions.  */
2109 /* X86_TUNE_EXT_80387_CONSTANTS */
2110 m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC,
2112 /* X86_TUNE_SHORTEN_X87_SSE */
2115 /* X86_TUNE_AVOID_VECTOR_DECODE */
2116 m_CORE2I7_64 | m_K8 | m_GENERIC64,
  /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have the same latency for
     HImode and SImode multiply, but 386 and 486 do HImode multiply
     faster.  */
  /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of a 32-bit constant and memory is
     a vector path on AMD machines.  */
2124 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
  /* X86_TUNE_SLOW_IMUL_IMM8: Imul of an 8-bit constant is a vector path on
     AMD machines.  */
2128 m_CORE2I7_64 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER1 | m_GENERIC64,
  /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR
     than a MOV.  */
2134 /* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is,
2135 but one byte longer. */
2138 /* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory
2139 operand that cannot be represented using a modRM byte. The XOR
2140 replacement is long decoded, so this split helps here as well. */
  /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion
     from FP to FP.  */
2145 m_CORE2I7 | m_AMDFAM10 | m_GENERIC,
2147 /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion
2148 from integer to FP. */
2151 /* X86_TUNE_FUSE_CMP_AND_BRANCH: Fuse a compare or test instruction
2152 with a subsequent conditional jump instruction into a single
2153 compare-and-branch uop. */
2156 /* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag
2157 will impact LEA instruction selection. */
2160 /* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector
  /* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching
     at -O3.  For the moment, the prefetching seems badly tuned for Intel
     chips.  */
2167 m_K6_GEODE | m_AMD_MULTIPLE,
2169 /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
2170 the auto-vectorizer. */
2174 /* Feature tests against the various architecture variations. */
2175 unsigned char ix86_arch_features[X86_ARCH_LAST];
2177 /* Feature tests against the various architecture variations, used to create
2178 ix86_arch_features based on the processor mask. */
2179 static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = {
2180 /* X86_ARCH_CMOVE: Conditional move was added for pentiumpro. */
2181 ~(m_386 | m_486 | m_PENT | m_K6),
2183 /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
2186 /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
2189 /* X86_ARCH_XADD: Exchange and add was added for 80486. */
2192 /* X86_ARCH_BSWAP: Byteswap was added for 80486. */
2196 static const unsigned int x86_accumulate_outgoing_args
2197 = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC;
2199 static const unsigned int x86_arch_always_fancy_math_387
2200 = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC;
2202 static const unsigned int x86_avx256_split_unaligned_load
2203 = m_COREI7 | m_GENERIC;
2205 static const unsigned int x86_avx256_split_unaligned_store
2206 = m_COREI7 | m_BDVER | m_GENERIC;
/* In case the average insn count for a single function invocation is
   lower than this constant, emit fast (but longer) prologue and
   epilogue code.  */
#define FAST_PROLOGUE_INSN_COUNT 20
/* Names for the 8-bit (low), 8-bit (high), and 16-bit registers,
   respectively.  */
2214 static const char *const qi_reg_name[] = QI_REGISTER_NAMES;
2215 static const char *const qi_high_reg_name[] = QI_HIGH_REGISTER_NAMES;
2216 static const char *const hi_reg_name[] = HI_REGISTER_NAMES;
/* Array of the smallest class containing reg number REGNO, indexed by
   REGNO.  Used by REGNO_REG_CLASS in i386.h.  */

enum reg_class const regclass_map[FIRST_PSEUDO_REGISTER] =
{
2223 /* ax, dx, cx, bx */
2224 AREG, DREG, CREG, BREG,
2225 /* si, di, bp, sp */
2226 SIREG, DIREG, NON_Q_REGS, NON_Q_REGS,
2228 FP_TOP_REG, FP_SECOND_REG, FLOAT_REGS, FLOAT_REGS,
2229 FLOAT_REGS, FLOAT_REGS, FLOAT_REGS, FLOAT_REGS,
2232 /* flags, fpsr, fpcr, frame */
2233 NO_REGS, NO_REGS, NO_REGS, NON_Q_REGS,
2235 SSE_FIRST_REG, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
2238 MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS, MMX_REGS,
2241 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2242 NON_Q_REGS, NON_Q_REGS, NON_Q_REGS, NON_Q_REGS,
2243 /* SSE REX registers */
  SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS, SSE_REGS,
  SSE_REGS, SSE_REGS,
};
2248 /* The "default" register map used in 32bit mode. */
int const dbx_register_map[FIRST_PSEUDO_REGISTER] =
{
2252 0, 2, 1, 3, 6, 7, 4, 5, /* general regs */
2253 12, 13, 14, 15, 16, 17, 18, 19, /* fp regs */
2254 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2255 21, 22, 23, 24, 25, 26, 27, 28, /* SSE */
2256 29, 30, 31, 32, 33, 34, 35, 36, /* MMX */
2257 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
  -1, -1, -1, -1, -1, -1, -1, -1,	/* extended SSE registers */
};
2261 /* The "default" register map used in 64bit mode. */
int const dbx64_register_map[FIRST_PSEUDO_REGISTER] =
{
2265 0, 1, 2, 3, 4, 5, 6, 7, /* general regs */
2266 33, 34, 35, 36, 37, 38, 39, 40, /* fp regs */
2267 -1, -1, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2268 17, 18, 19, 20, 21, 22, 23, 24, /* SSE */
2269 41, 42, 43, 44, 45, 46, 47, 48, /* MMX */
  8, 9, 10, 11, 12, 13, 14, 15,	/* extended integer registers */
  25, 26, 27, 28, 29, 30, 31, 32,	/* extended SSE registers */
};
2274 /* Define the register numbers to be used in Dwarf debugging information.
2275 The SVR4 reference port C compiler uses the following register numbers
2276 in its Dwarf output code:
2277 0 for %eax (gcc regno = 0)
2278 1 for %ecx (gcc regno = 2)
2279 2 for %edx (gcc regno = 1)
2280 3 for %ebx (gcc regno = 3)
2281 4 for %esp (gcc regno = 7)
2282 5 for %ebp (gcc regno = 6)
2283 6 for %esi (gcc regno = 4)
2284 7 for %edi (gcc regno = 5)
2285 The following three DWARF register numbers are never generated by
2286 the SVR4 C compiler or by the GNU compilers, but SDB on x86/svr4
2287 believes these numbers have these meanings.
2288 8 for %eip (no gcc equivalent)
2289 9 for %eflags (gcc regno = 17)
2290 10 for %trapno (no gcc equivalent)
2291 It is not at all clear how we should number the FP stack registers
2292 for the x86 architecture. If the version of SDB on x86/svr4 were
2293 a bit less brain dead with respect to floating-point then we would
2294 have a precedent to follow with respect to DWARF register numbers
2295 for x86 FP registers, but the SDB on x86/svr4 is so completely
2296 broken with respect to FP registers that it is hardly worth thinking
2297 of it as something to strive for compatibility with.
2298 The version of x86/svr4 SDB I have at the moment does (partially)
2299 seem to believe that DWARF register number 11 is associated with
2300 the x86 register %st(0), but that's about all. Higher DWARF
2301 register numbers don't seem to be associated with anything in
2302 particular, and even for DWARF regno 11, SDB only seems to under-
2303 stand that it should say that a variable lives in %st(0) (when
2304 asked via an `=' command) if we said it was in DWARF regno 11,
2305 but SDB still prints garbage when asked for the value of the
2306 variable in question (via a `/' command).
2307 (Also note that the labels SDB prints for various FP stack regs
2308 when doing an `x' command are all wrong.)
2309 Note that these problems generally don't affect the native SVR4
2310 C compiler because it doesn't allow the use of -O with -g and
2311 because when it is *not* optimizing, it allocates a memory
2312 location for each floating-point variable, and the memory
2313 location is what gets described in the DWARF AT_location
2314 attribute for the variable in question.
2315 Regardless of the severe mental illness of the x86/svr4 SDB, we
2316 do something sensible here and we use the following DWARF
2317 register numbers. Note that these are all stack-top-relative
2319 11 for %st(0) (gcc regno = 8)
2320 12 for %st(1) (gcc regno = 9)
2321 13 for %st(2) (gcc regno = 10)
2322 14 for %st(3) (gcc regno = 11)
2323 15 for %st(4) (gcc regno = 12)
2324 16 for %st(5) (gcc regno = 13)
2325 17 for %st(6) (gcc regno = 14)
2326 18 for %st(7) (gcc regno = 15)
int const svr4_dbx_register_map[FIRST_PSEUDO_REGISTER] =
{
2330 0, 2, 1, 3, 6, 7, 5, 4, /* general regs */
2331 11, 12, 13, 14, 15, 16, 17, 18, /* fp regs */
2332 -1, 9, -1, -1, -1, /* arg, flags, fpsr, fpcr, frame */
2333 21, 22, 23, 24, 25, 26, 27, 28, /* SSE registers */
2334 29, 30, 31, 32, 33, 34, 35, 36, /* MMX registers */
2335 -1, -1, -1, -1, -1, -1, -1, -1, /* extended integer registers */
  -1, -1, -1, -1, -1, -1, -1, -1,	/* extended SSE registers */
};
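/* Illustrative note (not part of GCC itself): these maps are indexed by
   gcc register number and yield the debug-format register number; e.g.
   svr4_dbx_register_map[1] == 2 because gcc regno 1 is %edx, which the
   SVR4 numbering documented above assigns DWARF register 2.  */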
/* Define parameter passing and return registers.  */

static int const x86_64_int_parameter_registers[6] =
{
  DI_REG, SI_REG, DX_REG, CX_REG, R8_REG, R9_REG
};
static int const x86_64_ms_abi_int_parameter_registers[4] =
{
  CX_REG, DX_REG, R8_REG, R9_REG
};
static int const x86_64_int_return_registers[4] =
{
  AX_REG, DX_REG, DI_REG, SI_REG
};
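/* Illustrative note (not part of GCC itself): for

     long f (long a, long b);

   a SysV x86-64 call places a in %rdi and b in %rsi (the first two
   entries of x86_64_int_parameter_registers above) and returns the
   result in %rax, whereas the MS ABI would use %rcx and %rdx.  */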
2356 /* Define the structure for the machine field in struct function. */
struct GTY(()) stack_local_entry {
  unsigned short mode;
  unsigned short n;
  rtx rtl;
  struct stack_local_entry *next;
};
/* Structure describing stack frame layout.
   Stack grows downward:

   saved static chain			if ix86_static_chain_on_stack

   saved frame pointer			if frame_pointer_needed
					<- HARD_FRAME_POINTER

					<- sse_regs_save_offset
   [va_arg registers]	|

   [padding2]		| = to_allocate  */

struct ix86_frame
{
  int outgoing_arguments_size;
  HOST_WIDE_INT frame;
2400 /* The offsets relative to ARG_POINTER. */
2401 HOST_WIDE_INT frame_pointer_offset;
2402 HOST_WIDE_INT hard_frame_pointer_offset;
2403 HOST_WIDE_INT stack_pointer_offset;
2404 HOST_WIDE_INT hfp_save_offset;
2405 HOST_WIDE_INT reg_save_offset;
2406 HOST_WIDE_INT sse_reg_save_offset;
2408 /* When save_regs_using_mov is set, emit prologue using
2409 move instead of push instructions. */
  bool save_regs_using_mov;
};
2413 /* Which cpu are we scheduling for. */
2414 enum attr_cpu ix86_schedule;
2416 /* Which cpu are we optimizing for. */
2417 enum processor_type ix86_tune;
2419 /* Which instruction set architecture to use. */
2420 enum processor_type ix86_arch;
/* True if the SSE prefetch instruction is not a NOP.  */
2423 int x86_prefetch_sse;
2425 /* -mstackrealign option */
2426 static const char ix86_force_align_arg_pointer_string[]
2427 = "force_align_arg_pointer";
2429 static rtx (*ix86_gen_leave) (void);
2430 static rtx (*ix86_gen_add3) (rtx, rtx, rtx);
2431 static rtx (*ix86_gen_sub3) (rtx, rtx, rtx);
2432 static rtx (*ix86_gen_sub3_carry) (rtx, rtx, rtx, rtx, rtx);
2433 static rtx (*ix86_gen_one_cmpl2) (rtx, rtx);
2434 static rtx (*ix86_gen_monitor) (rtx, rtx, rtx);
2435 static rtx (*ix86_gen_andsp) (rtx, rtx, rtx);
2436 static rtx (*ix86_gen_allocate_stack_worker) (rtx, rtx);
2437 static rtx (*ix86_gen_adjust_stack_and_probe) (rtx, rtx, rtx);
2438 static rtx (*ix86_gen_probe_stack_range) (rtx, rtx, rtx);
2440 /* Preferred alignment for stack boundary in bits. */
2441 unsigned int ix86_preferred_stack_boundary;
/* Alignment for incoming stack boundary in bits specified at
   command line.  */
2445 static unsigned int ix86_user_incoming_stack_boundary;
2447 /* Default alignment for incoming stack boundary in bits. */
2448 static unsigned int ix86_default_incoming_stack_boundary;
2450 /* Alignment for incoming stack boundary in bits. */
2451 unsigned int ix86_incoming_stack_boundary;
2453 /* Calling abi specific va_list type nodes. */
2454 static GTY(()) tree sysv_va_list_type_node;
2455 static GTY(()) tree ms_va_list_type_node;
2457 /* Prefix built by ASM_GENERATE_INTERNAL_LABEL. */
2458 char internal_label_prefix[16];
2459 int internal_label_prefix_len;
2461 /* Fence to use after loop using movnt. */
2464 /* Register class used for passing given 64bit part of the argument.
2465 These represent classes as documented by the PS ABI, with the exception
2466 of SSESF, SSEDF classes, that are basically SSE class, just gcc will
2467 use SF or DFmode move instead of DImode to avoid reformatting penalties.
2469 Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
2470 whenever possible (upper half does contain padding). */
2471 enum x86_64_reg_class
2474 X86_64_INTEGER_CLASS,
2475 X86_64_INTEGERSI_CLASS,
2482 X86_64_COMPLEX_X87_CLASS,
2486 #define MAX_CLASSES 4
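/* Illustrative note (not part of GCC itself): classification works on
   "eightbytes", each receiving one x86_64_reg_class value, with at most
   MAX_CLASSES of them per argument.  E.g. for

     struct s { long l; double d; };

   the first eightbyte is X86_64_INTEGER_CLASS and the second an SSE
   class, so l travels in a general register and d in an SSE register.  */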
2488 /* Table of constants used by fldpi, fldln2, etc.... */
2489 static REAL_VALUE_TYPE ext_80387_constants_table [5];
2490 static bool ext_80387_constants_init = 0;
2493 static struct machine_function * ix86_init_machine_status (void);
2494 static rtx ix86_function_value (const_tree, const_tree, bool);
2495 static bool ix86_function_value_regno_p (const unsigned int);
static unsigned int ix86_function_arg_boundary (enum machine_mode,
						const_tree);
2498 static rtx ix86_static_chain (const_tree, bool);
2499 static int ix86_function_regparm (const_tree, const_tree);
2500 static void ix86_compute_frame_layout (struct ix86_frame *);
static bool ix86_expand_vector_init_one_nonzero (bool, enum machine_mode,
						 rtx, rtx, int);
2503 static void ix86_add_new_builtins (int);
2504 static rtx ix86_expand_vec_perm_builtin (tree);
2505 static tree ix86_canonical_va_list_type (tree);
2506 static void predict_jump (int);
2507 static unsigned int split_stack_prologue_scratch_regno (void);
2508 static bool i386_asm_output_addr_const_extra (FILE *, rtx);
enum ix86_function_specific_strings
{
  IX86_FUNCTION_SPECIFIC_ARCH,
2513 IX86_FUNCTION_SPECIFIC_TUNE,
  IX86_FUNCTION_SPECIFIC_MAX
};
2517 static char *ix86_target_string (int, int, const char *, const char *,
2518 enum fpmath_unit, bool);
2519 static void ix86_debug_options (void) ATTRIBUTE_UNUSED;
2520 static void ix86_function_specific_save (struct cl_target_option *);
2521 static void ix86_function_specific_restore (struct cl_target_option *);
2522 static void ix86_function_specific_print (FILE *, int,
2523 struct cl_target_option *);
2524 static bool ix86_valid_target_attribute_p (tree, tree, tree, int);
2525 static bool ix86_valid_target_attribute_inner_p (tree, char *[],
2526 struct gcc_options *);
2527 static bool ix86_can_inline_p (tree, tree);
2528 static void ix86_set_current_function (tree);
2529 static unsigned int ix86_minimum_incoming_stack_boundary (bool);
2531 static enum calling_abi ix86_function_abi (const_tree);
2534 #ifndef SUBTARGET32_DEFAULT_CPU
#define SUBTARGET32_DEFAULT_CPU "i386"
#endif
/* The svr4 ABI for the i386 says that records and unions are returned
   in memory.  */
#ifndef DEFAULT_PCC_STRUCT_RETURN
#define DEFAULT_PCC_STRUCT_RETURN 1
#endif
2544 /* Whether -mtune= or -march= were specified */
2545 static int ix86_tune_defaulted;
2546 static int ix86_arch_specified;
2548 /* Vectorization library interface and handlers. */
2549 static tree (*ix86_veclib_handler) (enum built_in_function, tree, tree);
2551 static tree ix86_veclibabi_svml (enum built_in_function, tree, tree);
2552 static tree ix86_veclibabi_acml (enum built_in_function, tree, tree);
/* Processor target table, indexed by processor number.  */
struct ptt
{
  const struct processor_costs *cost;	/* Processor costs */
2558 const int align_loop; /* Default alignments. */
2559 const int align_loop_max_skip;
2560 const int align_jump;
2561 const int align_jump_max_skip;
  const int align_func;
};
static const struct ptt processor_target_table[PROCESSOR_max] =
{
2567 {&i386_cost, 4, 3, 4, 3, 4},
2568 {&i486_cost, 16, 15, 16, 15, 16},
2569 {&pentium_cost, 16, 7, 16, 7, 16},
2570 {&pentiumpro_cost, 16, 15, 16, 10, 16},
2571 {&geode_cost, 0, 0, 0, 0, 0},
2572 {&k6_cost, 32, 7, 32, 7, 32},
2573 {&athlon_cost, 16, 7, 16, 7, 16},
2574 {&pentium4_cost, 0, 0, 0, 0, 0},
2575 {&k8_cost, 16, 7, 16, 7, 16},
2576 {&nocona_cost, 0, 0, 0, 0, 0},
2577 /* Core 2 32-bit. */
2578 {&generic32_cost, 16, 10, 16, 10, 16},
2579 /* Core 2 64-bit. */
2580 {&generic64_cost, 16, 10, 16, 10, 16},
2581 /* Core i7 32-bit. */
2582 {&generic32_cost, 16, 10, 16, 10, 16},
2583 /* Core i7 64-bit. */
2584 {&generic64_cost, 16, 10, 16, 10, 16},
2585 {&generic32_cost, 16, 7, 16, 7, 16},
2586 {&generic64_cost, 16, 10, 16, 10, 16},
2587 {&amdfam10_cost, 32, 24, 32, 7, 32},
2588 {&bdver1_cost, 32, 24, 32, 7, 32},
2589 {&bdver2_cost, 32, 24, 32, 7, 32},
2590 {&btver1_cost, 32, 24, 32, 7, 32},
  {&atom_cost, 16, 7, 16, 7, 16}
};
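/* Illustrative note (not part of GCC itself): the align_* fields above
   become the -falign-* defaults; e.g. when align_loops was not given on
   the command line it is set to
   processor_target_table[ix86_tune].align_loop in the option override
   code further below.  */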
2594 static const char *const cpu_names[TARGET_CPU_DEFAULT_max] =
2624 /* Return true if a red-zone is in use. */
static bool
ix86_using_red_zone (void)
{
  return TARGET_RED_ZONE && !TARGET_64BIT_MS_ABI;
}
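/* Illustrative note (not part of GCC itself): the "red zone" is the
   128 bytes below %rsp that the SysV x86-64 ABI guarantees will not be
   clobbered asynchronously, so leaf code may spill there without
   adjusting %rsp; the MS ABI provides no such zone, hence the check
   above.  */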
2632 /* Return a string that documents the current -m options. The caller is
2633 responsible for freeing the string. */
static char *
ix86_target_string (int isa, int flags, const char *arch, const char *tune,
2637 enum fpmath_unit fpmath, bool add_nl_p)
  struct ix86_target_opts
  {
    const char *option;		/* option string */
    int mask;			/* isa mask options */
  };
  /* This table is ordered so that options like -msse4.2 that imply
     preceding options are matched first.  */
  static struct ix86_target_opts isa_opts[] =
  {
2649 { "-m64", OPTION_MASK_ISA_64BIT },
2650 { "-mfma4", OPTION_MASK_ISA_FMA4 },
2651 { "-mfma", OPTION_MASK_ISA_FMA },
2652 { "-mxop", OPTION_MASK_ISA_XOP },
2653 { "-mlwp", OPTION_MASK_ISA_LWP },
2654 { "-msse4a", OPTION_MASK_ISA_SSE4A },
2655 { "-msse4.2", OPTION_MASK_ISA_SSE4_2 },
2656 { "-msse4.1", OPTION_MASK_ISA_SSE4_1 },
2657 { "-mssse3", OPTION_MASK_ISA_SSSE3 },
2658 { "-msse3", OPTION_MASK_ISA_SSE3 },
2659 { "-msse2", OPTION_MASK_ISA_SSE2 },
2660 { "-msse", OPTION_MASK_ISA_SSE },
2661 { "-m3dnow", OPTION_MASK_ISA_3DNOW },
2662 { "-m3dnowa", OPTION_MASK_ISA_3DNOW_A },
2663 { "-mmmx", OPTION_MASK_ISA_MMX },
2664 { "-mabm", OPTION_MASK_ISA_ABM },
2665 { "-mbmi", OPTION_MASK_ISA_BMI },
2666 { "-mtbm", OPTION_MASK_ISA_TBM },
2667 { "-mpopcnt", OPTION_MASK_ISA_POPCNT },
2668 { "-mmovbe", OPTION_MASK_ISA_MOVBE },
2669 { "-mcrc32", OPTION_MASK_ISA_CRC32 },
2670 { "-maes", OPTION_MASK_ISA_AES },
2671 { "-mpclmul", OPTION_MASK_ISA_PCLMUL },
2672 { "-mfsgsbase", OPTION_MASK_ISA_FSGSBASE },
2673 { "-mrdrnd", OPTION_MASK_ISA_RDRND },
2674 { "-mf16c", OPTION_MASK_ISA_F16C },
  static struct ix86_target_opts flag_opts[] =
  {
2680 { "-m128bit-long-double", MASK_128BIT_LONG_DOUBLE },
2681 { "-m80387", MASK_80387 },
2682 { "-maccumulate-outgoing-args", MASK_ACCUMULATE_OUTGOING_ARGS },
2683 { "-malign-double", MASK_ALIGN_DOUBLE },
2684 { "-mcld", MASK_CLD },
2685 { "-mfp-ret-in-387", MASK_FLOAT_RETURNS },
2686 { "-mieee-fp", MASK_IEEE_FP },
2687 { "-minline-all-stringops", MASK_INLINE_ALL_STRINGOPS },
2688 { "-minline-stringops-dynamically", MASK_INLINE_STRINGOPS_DYNAMICALLY },
2689 { "-mms-bitfields", MASK_MS_BITFIELD_LAYOUT },
2690 { "-mno-align-stringops", MASK_NO_ALIGN_STRINGOPS },
2691 { "-mno-fancy-math-387", MASK_NO_FANCY_MATH_387 },
2692 { "-mno-push-args", MASK_NO_PUSH_ARGS },
2693 { "-mno-red-zone", MASK_NO_RED_ZONE },
2694 { "-momit-leaf-frame-pointer", MASK_OMIT_LEAF_FRAME_POINTER },
2695 { "-mrecip", MASK_RECIP },
2696 { "-mrtd", MASK_RTD },
2697 { "-msseregparm", MASK_SSEREGPARM },
2698 { "-mstack-arg-probe", MASK_STACK_PROBE },
2699 { "-mtls-direct-seg-refs", MASK_TLS_DIRECT_SEG_REFS },
2700 { "-mvect8-ret-in-mem", MASK_VECT8_RETURNS },
2701 { "-m8bit-idiv", MASK_USE_8BIT_IDIV },
2702 { "-mvzeroupper", MASK_VZEROUPPER },
2703 { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD},
2704 { "-mavx256-split-unaligned-store", MASK_AVX256_SPLIT_UNALIGNED_STORE},
2705 { "-mprefer-avx128", MASK_PREFER_AVX128},
2708 const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
  char isa_other[40];
  char target_other[40];
  unsigned num = 0;
  unsigned i, j;
  char *ret;
  char *ptr;
  size_t len;
  size_t line_len;
  size_t sep_len;
2720 memset (opts, '\0', sizeof (opts));
  /* Add -march= option.  */
  if (arch)
    {
      opts[num][0] = "-march=";
      opts[num++][1] = arch;
    }

  /* Add -mtune= option.  */
  if (tune)
    {
      opts[num][0] = "-mtune=";
      opts[num++][1] = tune;
    }
  /* Pick out the ISA options.  */
  for (i = 0; i < ARRAY_SIZE (isa_opts); i++)
    if ((isa & isa_opts[i].mask) != 0)
      {
	opts[num++][0] = isa_opts[i].option;
	isa &= ~ isa_opts[i].mask;
      }
  if (isa && add_nl_p)
    {
      opts[num++][0] = isa_other;
      sprintf (isa_other, "(other isa: %#x)", isa);
    }
2752 /* Add flag options. */
  /* Add flag options.  */
  for (i = 0; i < ARRAY_SIZE (flag_opts); i++)
    if ((flags & flag_opts[i].mask) != 0)
      {
	opts[num++][0] = flag_opts[i].option;
	flags &= ~ flag_opts[i].mask;
      }
  if (flags && add_nl_p)
    {
      opts[num++][0] = target_other;
      sprintf (target_other, "(other flags: %#x)", flags);
    }
  /* Add -mfpmath= option.  */
  if (fpmath)
    {
      opts[num][0] = "-mfpmath=";
      switch ((int) fpmath)
	{
	case FPMATH_387:
	  opts[num++][1] = "387";
	  break;
	case FPMATH_SSE:
	  opts[num++][1] = "sse";
	  break;
	case FPMATH_387 | FPMATH_SSE:
	  opts[num++][1] = "sse+387";
	  break;
	default:
	  gcc_unreachable ();
	}
    }
2795 gcc_assert (num < ARRAY_SIZE (opts));
  /* Size the string.  */
  len = 0;
  sep_len = (add_nl_p) ? 3 : 1;
  for (i = 0; i < num; i++)
    {
      len += sep_len;
      for (j = 0; j < 2; j++)
	if (opts[i][j])
	  len += strlen (opts[i][j]);
    }
2808 /* Build the string. */
2809 ret = ptr = (char *) xmalloc (len);
2812 for (i = 0; i < num; i++)
2816 for (j = 0; j < 2; j++)
2817 len2[j] = (opts[i][j]) ? strlen (opts[i][j]) : 0;
2824 if (add_nl_p && line_len + len2[0] + len2[1] > 70)
2832 for (j = 0; j < 2; j++)
2835 memcpy (ptr, opts[i][j], len2[j]);
2837 line_len += len2[j];
  gcc_assert (ret + len >= ptr);

  return ret;
}
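/* Illustrative note (not part of GCC itself): a call such as

     ix86_target_string (ix86_isa_flags, target_flags,
			 "core2", "generic", FPMATH_SSE, true);

   yields roughly "-march=core2 -mtune=generic -m64 -mssse3 -msse3
   -msse2 -msse -mmmx -mfpmath=sse", depending on which ISA and flag
   bits are actually set.  */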
/* Return true if profiling code should be emitted before the prologue,
   false otherwise.  Note: x86 "hotfix" (ms_hook) prologues cannot be
   profiled this way; they are rejected with sorry () elsewhere.  */
static bool
ix86_profile_before_prologue (void)
{
  return flag_fentry != 0;
}
/* Function that is callable from the debugger to print the current
   options.  */
static void
ix86_debug_options (void)
{
  char *opts = ix86_target_string (ix86_isa_flags, target_flags,
				   ix86_arch_string, ix86_tune_string,
				   ix86_fpmath, true);

  if (opts)
    {
      fprintf (stderr, "%s\n\n", opts);
      free (opts);
    }
  else
    fputs ("<no options>\n\n", stderr);
}
/* Override various settings based on options.  If MAIN_ARGS_P, the
   options are from the command line, otherwise they are from
   attribute(target).  */

static void
ix86_option_override_internal (bool main_args_p)
{
  int i;
2884 unsigned int ix86_arch_mask, ix86_tune_mask;
2885 const bool ix86_tune_specified = (ix86_tune_string != NULL);
2896 PTA_PREFETCH_SSE = 1 << 4,
2898 PTA_3DNOW_A = 1 << 6,
2902 PTA_POPCNT = 1 << 10,
2904 PTA_SSE4A = 1 << 12,
2905 PTA_NO_SAHF = 1 << 13,
2906 PTA_SSE4_1 = 1 << 14,
2907 PTA_SSE4_2 = 1 << 15,
2909 PTA_PCLMUL = 1 << 17,
2912 PTA_MOVBE = 1 << 20,
2916 PTA_FSGSBASE = 1 << 24,
2917 PTA_RDRND = 1 << 25,
2921 /* if this reaches 32, need to widen struct pta flags below */
  static struct pta
    {
      const char *const name;		/* processor name or nickname.  */
      const enum processor_type processor;
      const enum attr_cpu schedule;
      const unsigned /*enum pta_flags*/ flags;
    }
  const processor_alias_table[] =
    {
2933 {"i386", PROCESSOR_I386, CPU_NONE, 0},
2934 {"i486", PROCESSOR_I486, CPU_NONE, 0},
2935 {"i586", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2936 {"pentium", PROCESSOR_PENTIUM, CPU_PENTIUM, 0},
2937 {"pentium-mmx", PROCESSOR_PENTIUM, CPU_PENTIUM, PTA_MMX},
2938 {"winchip-c6", PROCESSOR_I486, CPU_NONE, PTA_MMX},
2939 {"winchip2", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2940 {"c3", PROCESSOR_I486, CPU_NONE, PTA_MMX | PTA_3DNOW},
2941 {"c3-2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX | PTA_SSE},
2942 {"i686", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2943 {"pentiumpro", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, 0},
2944 {"pentium2", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO, PTA_MMX},
2945 {"pentium3", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2947 {"pentium3m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2949 {"pentium-m", PROCESSOR_PENTIUMPRO, CPU_PENTIUMPRO,
2950 PTA_MMX | PTA_SSE | PTA_SSE2},
2951 {"pentium4", PROCESSOR_PENTIUM4, CPU_NONE,
       PTA_MMX | PTA_SSE | PTA_SSE2},
2953 {"pentium4m", PROCESSOR_PENTIUM4, CPU_NONE,
2954 PTA_MMX | PTA_SSE | PTA_SSE2},
2955 {"prescott", PROCESSOR_NOCONA, CPU_NONE,
2956 PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3},
2957 {"nocona", PROCESSOR_NOCONA, CPU_NONE,
2958 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2959 | PTA_CX16 | PTA_NO_SAHF},
2960 {"core2", PROCESSOR_CORE2_64, CPU_CORE2,
2961 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2962 | PTA_SSSE3 | PTA_CX16},
2963 {"corei7", PROCESSOR_COREI7_64, CPU_COREI7,
2964 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2965 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_CX16},
2966 {"corei7-avx", PROCESSOR_COREI7_64, CPU_COREI7,
2967 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2968 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX
2969 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL},
2970 {"atom", PROCESSOR_ATOM, CPU_ATOM,
2971 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
2972 | PTA_SSSE3 | PTA_CX16 | PTA_MOVBE},
2973 {"geode", PROCESSOR_GEODE, CPU_GEODE,
       PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
2975 {"k6", PROCESSOR_K6, CPU_K6, PTA_MMX},
2976 {"k6-2", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
2977 {"k6-3", PROCESSOR_K6, CPU_K6, PTA_MMX | PTA_3DNOW},
2978 {"athlon", PROCESSOR_ATHLON, CPU_ATHLON,
2979 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
2980 {"athlon-tbird", PROCESSOR_ATHLON, CPU_ATHLON,
2981 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE},
2982 {"athlon-4", PROCESSOR_ATHLON, CPU_ATHLON,
2983 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
2984 {"athlon-xp", PROCESSOR_ATHLON, CPU_ATHLON,
2985 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
2986 {"athlon-mp", PROCESSOR_ATHLON, CPU_ATHLON,
2987 PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE},
2988 {"x86-64", PROCESSOR_K8, CPU_K8,
2989 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_NO_SAHF},
2990 {"k8", PROCESSOR_K8, CPU_K8,
2991 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2992 | PTA_SSE2 | PTA_NO_SAHF},
2993 {"k8-sse3", PROCESSOR_K8, CPU_K8,
2994 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2995 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
2996 {"opteron", PROCESSOR_K8, CPU_K8,
2997 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
2998 | PTA_SSE2 | PTA_NO_SAHF},
2999 {"opteron-sse3", PROCESSOR_K8, CPU_K8,
3000 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3001 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3002 {"athlon64", PROCESSOR_K8, CPU_K8,
3003 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3004 | PTA_SSE2 | PTA_NO_SAHF},
3005 {"athlon64-sse3", PROCESSOR_K8, CPU_K8,
3006 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3007 | PTA_SSE2 | PTA_SSE3 | PTA_NO_SAHF},
3008 {"athlon-fx", PROCESSOR_K8, CPU_K8,
3009 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3010 | PTA_SSE2 | PTA_NO_SAHF},
3011 {"amdfam10", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3012 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3013 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3014 {"barcelona", PROCESSOR_AMDFAM10, CPU_AMDFAM10,
3015 PTA_64BIT | PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_SSE
3016 | PTA_SSE2 | PTA_SSE3 | PTA_SSE4A | PTA_CX16 | PTA_ABM},
3017 {"bdver1", PROCESSOR_BDVER1, CPU_BDVER1,
3018 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3019 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3020 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX | PTA_FMA4
3021 | PTA_XOP | PTA_LWP},
3022 {"bdver2", PROCESSOR_BDVER2, CPU_BDVER2,
3023 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
3024 | PTA_SSE4A | PTA_CX16 | PTA_ABM | PTA_SSSE3 | PTA_SSE4_1
3025 | PTA_SSE4_2 | PTA_AES | PTA_PCLMUL | PTA_AVX
3026 | PTA_XOP | PTA_LWP | PTA_BMI | PTA_TBM | PTA_F16C
3028 {"btver1", PROCESSOR_BTVER1, CPU_GENERIC64,
3029 PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3
       | PTA_SSSE3 | PTA_SSE4A | PTA_ABM | PTA_CX16},
3031 {"generic32", PROCESSOR_GENERIC32, CPU_PENTIUMPRO,
3032 0 /* flags are only used for -march switch. */ },
3033 {"generic64", PROCESSOR_GENERIC64, CPU_GENERIC64,
3034 PTA_64BIT /* flags are only used for -march switch. */ },
3037 int const pta_size = ARRAY_SIZE (processor_alias_table);
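/* Illustrative note (not part of GCC itself): resolving -march=core2
   walks processor_alias_table linearly by name; the matching entry
   supplies the scheduling model (CPU_CORE2), the processor_type used
   for tuning defaults, and the PTA_* bits that get merged into
   ix86_isa_flags by the loop over pta_size below.  */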
3039 /* Set up prefix/suffix so the error messages refer to either the command
3040 line argument, or the attribute(target). */
3049 prefix = "option(\"";
3054 #ifdef SUBTARGET_OVERRIDE_OPTIONS
  SUBTARGET_OVERRIDE_OPTIONS;
#endif
3058 #ifdef SUBSUBTARGET_OVERRIDE_OPTIONS
  SUBSUBTARGET_OVERRIDE_OPTIONS;
#endif
3063 ix86_isa_flags |= OPTION_MASK_ISA_64BIT;
3065 /* -fPIC is the default for x86_64. */
  if (TARGET_MACHO && TARGET_64BIT)
    flag_pic = 2;
3069 /* Need to check -mtune=generic first. */
3070 if (ix86_tune_string)
3072 if (!strcmp (ix86_tune_string, "generic")
3073 || !strcmp (ix86_tune_string, "i686")
3074 /* As special support for cross compilers we read -mtune=native
3075 as -mtune=generic. With native compilers we won't see the
3076 -mtune=native, as it was changed by the driver. */
3077 || !strcmp (ix86_tune_string, "native"))
3080 ix86_tune_string = "generic64";
3082 ix86_tune_string = "generic32";
3084 /* If this call is for setting the option attribute, allow the
3085 generic32/generic64 that was previously set. */
3086 else if (!main_args_p
3087 && (!strcmp (ix86_tune_string, "generic32")
3088 || !strcmp (ix86_tune_string, "generic64")))
3090 else if (!strncmp (ix86_tune_string, "generic", 7))
3091 error ("bad value (%s) for %stune=%s %s",
3092 ix86_tune_string, prefix, suffix, sw);
3093 else if (!strcmp (ix86_tune_string, "x86-64"))
3094 warning (OPT_Wdeprecated, "%stune=x86-64%s is deprecated; use "
3095 "%stune=k8%s or %stune=generic%s instead as appropriate",
3096 prefix, suffix, prefix, suffix, prefix, suffix);
3100 if (ix86_arch_string)
3101 ix86_tune_string = ix86_arch_string;
3102 if (!ix86_tune_string)
3104 ix86_tune_string = cpu_names[TARGET_CPU_DEFAULT];
3105 ix86_tune_defaulted = 1;
3108 /* ix86_tune_string is set to ix86_arch_string or defaulted. We
3109 need to use a sensible tune option. */
3110 if (!strcmp (ix86_tune_string, "generic")
3111 || !strcmp (ix86_tune_string, "x86-64")
3112 || !strcmp (ix86_tune_string, "i686"))
3115 ix86_tune_string = "generic64";
3117 ix86_tune_string = "generic32";
  if (ix86_stringop_alg == rep_prefix_8_byte && !TARGET_64BIT)
    {
      /* rep; movq isn't available in 32-bit code.  */
      error ("-mstringop-strategy=rep_8byte not supported for 32-bit code");
      ix86_stringop_alg = no_stringop;
    }
  if (!ix86_arch_string)
    ix86_arch_string = TARGET_64BIT ? "x86-64" : SUBTARGET32_DEFAULT_CPU;
  else
    ix86_arch_specified = 1;
3133 if (!global_options_set.x_ix86_abi)
3134 ix86_abi = DEFAULT_ABI;
3136 if (global_options_set.x_ix86_cmodel)
3138 switch (ix86_cmodel)
3143 ix86_cmodel = CM_SMALL_PIC;
3145 error ("code model %qs not supported in the %s bit mode",
3152 ix86_cmodel = CM_MEDIUM_PIC;
3154 error ("code model %qs not supported in the %s bit mode",
3156 else if (TARGET_X32)
3157 error ("code model %qs not supported in x32 mode",
3164 ix86_cmodel = CM_LARGE_PIC;
3166 error ("code model %qs not supported in the %s bit mode",
3168 else if (TARGET_X32)
3169 error ("code model %qs not supported in x32 mode",
3175 error ("code model %s does not support PIC mode", "32");
3177 error ("code model %qs not supported in the %s bit mode",
3184 error ("code model %s does not support PIC mode", "kernel");
3185 ix86_cmodel = CM_32;
3188 error ("code model %qs not supported in the %s bit mode",
3198 /* For TARGET_64BIT and MS_ABI, force pic on, in order to enable the
3199 use of rip-relative addressing. This eliminates fixups that
3200 would otherwise be needed if this object is to be placed in a
3201 DLL, and is essentially just as efficient as direct addressing. */
3202 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
3203 ix86_cmodel = CM_SMALL_PIC, flag_pic = 1;
3204 else if (TARGET_64BIT)
3205 ix86_cmodel = flag_pic ? CM_SMALL_PIC : CM_SMALL;
3207 ix86_cmodel = CM_32;
  if (TARGET_MACHO && ix86_asm_dialect == ASM_INTEL)
    {
      error ("-masm=intel not supported in this configuration");
      ix86_asm_dialect = ASM_ATT;
    }
3214 if ((TARGET_64BIT != 0) != ((ix86_isa_flags & OPTION_MASK_ISA_64BIT) != 0))
3215 sorry ("%i-bit mode not compiled in",
3216 (ix86_isa_flags & OPTION_MASK_ISA_64BIT) ? 64 : 32);
3218 for (i = 0; i < pta_size; i++)
    if (! strcmp (ix86_arch_string, processor_alias_table[i].name))
      {
3221 ix86_schedule = processor_alias_table[i].schedule;
3222 ix86_arch = processor_alias_table[i].processor;
3223 /* Default cpu tuning to the architecture. */
3224 ix86_tune = ix86_arch;
	if (TARGET_64BIT && !(processor_alias_table[i].flags & PTA_64BIT))
	  error ("CPU you selected does not support x86-64 "
		 "instruction set");
3230 if (processor_alias_table[i].flags & PTA_MMX
3231 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MMX))
3232 ix86_isa_flags |= OPTION_MASK_ISA_MMX;
3233 if (processor_alias_table[i].flags & PTA_3DNOW
3234 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW))
3235 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW;
3236 if (processor_alias_table[i].flags & PTA_3DNOW_A
3237 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_3DNOW_A))
3238 ix86_isa_flags |= OPTION_MASK_ISA_3DNOW_A;
3239 if (processor_alias_table[i].flags & PTA_SSE
3240 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE))
3241 ix86_isa_flags |= OPTION_MASK_ISA_SSE;
3242 if (processor_alias_table[i].flags & PTA_SSE2
3243 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE2))
3244 ix86_isa_flags |= OPTION_MASK_ISA_SSE2;
3245 if (processor_alias_table[i].flags & PTA_SSE3
3246 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE3))
3247 ix86_isa_flags |= OPTION_MASK_ISA_SSE3;
3248 if (processor_alias_table[i].flags & PTA_SSSE3
3249 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSSE3))
3250 ix86_isa_flags |= OPTION_MASK_ISA_SSSE3;
3251 if (processor_alias_table[i].flags & PTA_SSE4_1
3252 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_1))
3253 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_1;
3254 if (processor_alias_table[i].flags & PTA_SSE4_2
3255 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2))
3256 ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2;
3257 if (processor_alias_table[i].flags & PTA_AVX
3258 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX))
3259 ix86_isa_flags |= OPTION_MASK_ISA_AVX;
3260 if (processor_alias_table[i].flags & PTA_FMA
3261 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA))
3262 ix86_isa_flags |= OPTION_MASK_ISA_FMA;
3263 if (processor_alias_table[i].flags & PTA_SSE4A
3264 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A))
3265 ix86_isa_flags |= OPTION_MASK_ISA_SSE4A;
3266 if (processor_alias_table[i].flags & PTA_FMA4
3267 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA4))
3268 ix86_isa_flags |= OPTION_MASK_ISA_FMA4;
3269 if (processor_alias_table[i].flags & PTA_XOP
3270 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_XOP))
3271 ix86_isa_flags |= OPTION_MASK_ISA_XOP;
3272 if (processor_alias_table[i].flags & PTA_LWP
3273 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_LWP))
3274 ix86_isa_flags |= OPTION_MASK_ISA_LWP;
3275 if (processor_alias_table[i].flags & PTA_ABM
3276 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_ABM))
3277 ix86_isa_flags |= OPTION_MASK_ISA_ABM;
3278 if (processor_alias_table[i].flags & PTA_BMI
3279 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_BMI))
3280 ix86_isa_flags |= OPTION_MASK_ISA_BMI;
3281 if (processor_alias_table[i].flags & PTA_TBM
3282 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_TBM))
3283 ix86_isa_flags |= OPTION_MASK_ISA_TBM;
3284 if (processor_alias_table[i].flags & PTA_CX16
3285 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_CX16))
3286 ix86_isa_flags |= OPTION_MASK_ISA_CX16;
3287 if (processor_alias_table[i].flags & (PTA_POPCNT | PTA_ABM)
3288 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_POPCNT))
3289 ix86_isa_flags |= OPTION_MASK_ISA_POPCNT;
3290 if (!(TARGET_64BIT && (processor_alias_table[i].flags & PTA_NO_SAHF))
3291 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SAHF))
3292 ix86_isa_flags |= OPTION_MASK_ISA_SAHF;
3293 if (processor_alias_table[i].flags & PTA_MOVBE
3294 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_MOVBE))
3295 ix86_isa_flags |= OPTION_MASK_ISA_MOVBE;
3296 if (processor_alias_table[i].flags & PTA_AES
3297 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AES))
3298 ix86_isa_flags |= OPTION_MASK_ISA_AES;
3299 if (processor_alias_table[i].flags & PTA_PCLMUL
3300 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_PCLMUL))
3301 ix86_isa_flags |= OPTION_MASK_ISA_PCLMUL;
3302 if (processor_alias_table[i].flags & PTA_FSGSBASE
3303 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FSGSBASE))
3304 ix86_isa_flags |= OPTION_MASK_ISA_FSGSBASE;
3305 if (processor_alias_table[i].flags & PTA_RDRND
3306 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_RDRND))
3307 ix86_isa_flags |= OPTION_MASK_ISA_RDRND;
3308 if (processor_alias_table[i].flags & PTA_F16C
3309 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_F16C))
3310 ix86_isa_flags |= OPTION_MASK_ISA_F16C;
3311 if (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE))
	  x86_prefetch_sse = true;

	break;
      }
3317 if (!strcmp (ix86_arch_string, "generic"))
3318 error ("generic CPU can be used only for %stune=%s %s",
3319 prefix, suffix, sw);
3320 else if (!strncmp (ix86_arch_string, "generic", 7) || i == pta_size)
3321 error ("bad value (%s) for %sarch=%s %s",
3322 ix86_arch_string, prefix, suffix, sw);
3324 ix86_arch_mask = 1u << ix86_arch;
3325 for (i = 0; i < X86_ARCH_LAST; ++i)
3326 ix86_arch_features[i] = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
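/* Illustrative note (not part of GCC itself): e.g. -march=i686 maps to
   PROCESSOR_PENTIUMPRO, so ix86_arch_mask has only the m_PPRO bit set
   and X86_ARCH_CMOVE becomes 1 because ~(m_386 | m_486 | m_PENT | m_K6)
   contains m_PPRO.  */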
3328 for (i = 0; i < pta_size; i++)
    if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
      {
3331 ix86_schedule = processor_alias_table[i].schedule;
3332 ix86_tune = processor_alias_table[i].processor;
3335 if (!(processor_alias_table[i].flags & PTA_64BIT))
3337 if (ix86_tune_defaulted)
3339 ix86_tune_string = "x86-64";
3340 for (i = 0; i < pta_size; i++)
3341 if (! strcmp (ix86_tune_string,
3342 processor_alias_table[i].name))
3344 ix86_schedule = processor_alias_table[i].schedule;
3345 ix86_tune = processor_alias_table[i].processor;
3348 error ("CPU you selected does not support x86-64 "
	/* Adjust tuning when compiling for 32-bit ABI.  */
	switch (ix86_tune)
	  {
	  case PROCESSOR_GENERIC64:
	    ix86_tune = PROCESSOR_GENERIC32;
	    ix86_schedule = CPU_PENTIUMPRO;
	    break;
	  case PROCESSOR_CORE2_64:
	    ix86_tune = PROCESSOR_CORE2_32;
	    break;
	  case PROCESSOR_COREI7_64:
	    ix86_tune = PROCESSOR_COREI7_32;
	    break;
	  default:
	    break;
	  }
3374 /* Intel CPUs have always interpreted SSE prefetch instructions as
3375 NOPs; so, we can enable SSE prefetch instructions even when
3376 -mtune (rather than -march) points us to a processor that has them.
3377 However, the VIA C3 gives a SIGILL, so we only do that for i686 and
3378 higher processors. */
3380 && (processor_alias_table[i].flags & (PTA_PREFETCH_SSE | PTA_SSE)))
3381 x86_prefetch_sse = true;
3385 if (ix86_tune_specified && i == pta_size)
3386 error ("bad value (%s) for %stune=%s %s",
3387 ix86_tune_string, prefix, suffix, sw);
3389 ix86_tune_mask = 1u << ix86_tune;
3390 for (i = 0; i < X86_TUNE_LAST; ++i)
3391 ix86_tune_features[i] = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
3393 #ifndef USE_IX86_FRAME_POINTER
#define USE_IX86_FRAME_POINTER 0
#endif
3397 #ifndef USE_X86_64_FRAME_POINTER
#define USE_X86_64_FRAME_POINTER 0
#endif
3401 /* Set the default values for switches whose default depends on TARGET_64BIT
3402 in case they weren't overwritten by command line options. */
      if (optimize > 1 && !global_options_set.x_flag_zee)
	flag_zee = 1;
3407 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3408 flag_omit_frame_pointer = !USE_X86_64_FRAME_POINTER;
3409 if (flag_asynchronous_unwind_tables == 2)
3410 flag_unwind_tables = flag_asynchronous_unwind_tables = 1;
3411 if (flag_pcc_struct_return == 2)
3412 flag_pcc_struct_return = 0;
3416 if (optimize >= 1 && !global_options_set.x_flag_omit_frame_pointer)
3417 flag_omit_frame_pointer = !(USE_IX86_FRAME_POINTER || optimize_size);
3418 if (flag_asynchronous_unwind_tables == 2)
3419 flag_asynchronous_unwind_tables = !USE_IX86_FRAME_POINTER;
3420 if (flag_pcc_struct_return == 2)
3421 flag_pcc_struct_return = DEFAULT_PCC_STRUCT_RETURN;
3425 ix86_cost = &ix86_size_cost;
3427 ix86_cost = processor_target_table[ix86_tune].cost;
3429 /* Arrange to set up i386_stack_locals for all functions. */
3430 init_machine_status = ix86_init_machine_status;
3432 /* Validate -mregparm= value. */
3433 if (global_options_set.x_ix86_regparm)
3436 warning (0, "-mregparm is ignored in 64-bit mode");
3437 if (ix86_regparm > REGPARM_MAX)
3439 error ("-mregparm=%d is not between 0 and %d",
3440 ix86_regparm, REGPARM_MAX);
3445 ix86_regparm = REGPARM_MAX;
3447 /* Default align_* from the processor table. */
3448 if (align_loops == 0)
3450 align_loops = processor_target_table[ix86_tune].align_loop;
3451 align_loops_max_skip = processor_target_table[ix86_tune].align_loop_max_skip;
3453 if (align_jumps == 0)
3455 align_jumps = processor_target_table[ix86_tune].align_jump;
3456 align_jumps_max_skip = processor_target_table[ix86_tune].align_jump_max_skip;
3458 if (align_functions == 0)
3460 align_functions = processor_target_table[ix86_tune].align_func;
3463 /* Provide default for -mbranch-cost= value. */
3464 if (!global_options_set.x_ix86_branch_cost)
3465 ix86_branch_cost = ix86_cost->branch_cost;
  if (TARGET_64BIT)
    {
      target_flags |= TARGET_SUBTARGET64_DEFAULT & ~target_flags_explicit;

      /* Enable by default the SSE and MMX builtins.  Do allow the user to
	 explicitly disable any of these.  In particular, disabling SSE and
	 MMX for kernel code is extremely useful.  */
      if (!ix86_arch_specified)
	ix86_isa_flags
	  |= ((OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_MMX
	       | TARGET_SUBTARGET64_ISA_DEFAULT) & ~ix86_isa_flags_explicit);

      if (TARGET_RTD)
	warning (0, "%srtd%s is ignored in 64bit mode", prefix, suffix);
    }
  else
    {
      target_flags |= TARGET_SUBTARGET32_DEFAULT & ~target_flags_explicit;

      if (!ix86_arch_specified)
	ix86_isa_flags
	  |= TARGET_SUBTARGET32_ISA_DEFAULT & ~ix86_isa_flags_explicit;

      /* The i386 ABI does not specify a red zone.  It still makes sense to
	 use it when the programmer takes care to keep the stack from being
	 destroyed.  */
      if (!(target_flags_explicit & MASK_NO_RED_ZONE))
	target_flags |= MASK_NO_RED_ZONE;
    }
  /* Keep nonleaf frame pointers.  */
  if (flag_omit_frame_pointer)
    target_flags &= ~MASK_OMIT_LEAF_FRAME_POINTER;
  else if (TARGET_OMIT_LEAF_FRAME_POINTER)
    flag_omit_frame_pointer = 1;

  /* If we're doing fast math, we don't care about comparison order
     wrt NaNs.  This lets us use a shorter comparison sequence.  */
  if (flag_finite_math_only)
    target_flags &= ~MASK_IEEE_FP;

  /* If the architecture always has an FPU, turn off NO_FANCY_MATH_387,
     since the insns won't need emulation.  */
  if (x86_arch_always_fancy_math_387 & ix86_arch_mask)
    target_flags &= ~MASK_NO_FANCY_MATH_387;

  /* Likewise, if the target doesn't have a 387, or we've specified
     software floating point, don't use 387 inline intrinsics.  */
  else if (!TARGET_80387)
    target_flags |= MASK_NO_FANCY_MATH_387;

  /* Turn on MMX builtins for -msse.  */
  if (TARGET_SSE)
    {
      ix86_isa_flags |= OPTION_MASK_ISA_MMX & ~ix86_isa_flags_explicit;
      x86_prefetch_sse = true;
    }

  /* Turn on popcnt instruction for -msse4.2 or -mabm.  */
  if (TARGET_SSE4_2 || TARGET_ABM)
    ix86_isa_flags |= OPTION_MASK_ISA_POPCNT & ~ix86_isa_flags_explicit;
  /* Validate -mpreferred-stack-boundary= value or default it to
     PREFERRED_STACK_BOUNDARY_DEFAULT.  */
  ix86_preferred_stack_boundary = PREFERRED_STACK_BOUNDARY_DEFAULT;
  if (global_options_set.x_ix86_preferred_stack_boundary_arg)
    {
      int min = (TARGET_64BIT ? 4 : 2);
      int max = (TARGET_SEH ? 4 : 12);

      if (ix86_preferred_stack_boundary_arg < min
	  || ix86_preferred_stack_boundary_arg > max)
	{
	  if (min == max)
	    error ("-mpreferred-stack-boundary is not supported "
		   "for this target");
	  else
	    error ("-mpreferred-stack-boundary=%d is not between %d and %d",
		   ix86_preferred_stack_boundary_arg, min, max);
	}
      else
	ix86_preferred_stack_boundary
	  = (1 << ix86_preferred_stack_boundary_arg) * BITS_PER_UNIT;
    }
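#if 0
/* Editor's illustration, not compiled: the option argument is a log2 of
   bytes, so -mpreferred-stack-boundary=N yields (1 << N) * BITS_PER_UNIT
   bits.  With BITS_PER_UNIT == 8, N == 4 gives (1 << 4) * 8 == 128 bits,
   i.e. a 16-byte aligned stack.  */
static unsigned int
boundary_bits_example (int arg)
{
  return (1u << arg) * 8;	/* BITS_PER_UNIT is 8 on ia32 */
}
#endif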
  /* Set the default value for -mstackrealign.  */
  if (ix86_force_align_arg_pointer == -1)
    ix86_force_align_arg_pointer = STACK_REALIGN_DEFAULT;

  ix86_default_incoming_stack_boundary = PREFERRED_STACK_BOUNDARY;

  /* Validate -mincoming-stack-boundary= value or default it to
     MIN_STACK_BOUNDARY/PREFERRED_STACK_BOUNDARY.  */
  ix86_incoming_stack_boundary = ix86_default_incoming_stack_boundary;
  if (global_options_set.x_ix86_incoming_stack_boundary_arg)
    {
      if (ix86_incoming_stack_boundary_arg < (TARGET_64BIT ? 4 : 2)
	  || ix86_incoming_stack_boundary_arg > 12)
	error ("-mincoming-stack-boundary=%d is not between %d and 12",
	       ix86_incoming_stack_boundary_arg, TARGET_64BIT ? 4 : 2);
      else
	{
	  ix86_user_incoming_stack_boundary
	    = (1 << ix86_incoming_stack_boundary_arg) * BITS_PER_UNIT;
	  ix86_incoming_stack_boundary
	    = ix86_user_incoming_stack_boundary;
	}
    }
  /* Accept -msseregparm only if at least SSE support is enabled.  */
  if (TARGET_SSEREGPARM
      && ! TARGET_SSE)
    error ("%ssseregparm%s used without SSE enabled", prefix, suffix);

  if (global_options_set.x_ix86_fpmath)
    {
      if (ix86_fpmath & FPMATH_SSE)
	{
	  if (!TARGET_SSE)
	    {
	      warning (0, "SSE instruction set disabled, using 387 arithmetics");
	      ix86_fpmath = FPMATH_387;
	    }
	  else if ((ix86_fpmath & FPMATH_387) && !TARGET_80387)
	    {
	      warning (0, "387 instruction set disabled, using SSE arithmetics");
	      ix86_fpmath = FPMATH_SSE;
	    }
	}
    }
  else
    ix86_fpmath = TARGET_FPMATH_DEFAULT;

  /* If the i387 is disabled, then do not return values in it.  */
  if (!TARGET_80387)
    target_flags &= ~MASK_FLOAT_RETURNS;
  /* Use an external vectorized library when vectorizing intrinsics.  */
  if (global_options_set.x_ix86_veclibabi_type)
    switch (ix86_veclibabi_type)
      {
      case ix86_veclibabi_type_svml:
	ix86_veclib_handler = ix86_veclibabi_svml;
	break;

      case ix86_veclibabi_type_acml:
	ix86_veclib_handler = ix86_veclibabi_acml;
	break;

      default:
	gcc_unreachable ();
      }

  if ((!USE_IX86_FRAME_POINTER
       || (x86_accumulate_outgoing_args & ix86_tune_mask))
      && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
      && !optimize_size)
    target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
  /* ??? Unwind info is not correct around the CFG unless either a frame
     pointer is present or M_A_O_A is set.  Fixing this requires rewriting
     unwind info generation to be aware of the CFG and propagating states
     around edges.  */
  if ((flag_unwind_tables || flag_asynchronous_unwind_tables
       || flag_exceptions || flag_non_call_exceptions)
      && flag_omit_frame_pointer
      && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
    {
      if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
	warning (0, "unwind tables currently require either a frame pointer "
		 "or %saccumulate-outgoing-args%s for correctness",
		 prefix, suffix);
      target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
    }

  /* If stack probes are required, the space used for large function
     arguments on the stack must also be probed, so enable
     -maccumulate-outgoing-args so this happens in the prologue.  */
  if (TARGET_STACK_PROBE
      && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS))
    {
      if (target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS)
	warning (0, "stack probing requires %saccumulate-outgoing-args%s "
		 "for correctness", prefix, suffix);
      target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
    }
  /* For sane SSE instruction set generation we need the fcomi instruction.
     It is safe to enable all CMOVE instructions.  Also, the RDRAND intrinsic
     expands to a sequence that includes a conditional move.  */
  if (TARGET_SSE || TARGET_RDRND)
    TARGET_CMOVE = 1;

  /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix.  */
  {
    char *p;
    ASM_GENERATE_INTERNAL_LABEL (internal_label_prefix, "LX", 0);
    p = strchr (internal_label_prefix, 'X');
    internal_label_prefix_len = p - internal_label_prefix;
    *p = '\0';
  }

  /* When a scheduling description is not available, disable the scheduler
     pass so it won't slow down compilation and make x87 code slower.  */
  if (!TARGET_SCHEDULE)
    flag_schedule_insns_after_reload = flag_schedule_insns = 0;
  maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
			 ix86_cost->simultaneous_prefetches,
			 global_options.x_param_values,
			 global_options_set.x_param_values);
  maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE, ix86_cost->prefetch_block,
			 global_options.x_param_values,
			 global_options_set.x_param_values);
  maybe_set_param_value (PARAM_L1_CACHE_SIZE, ix86_cost->l1_cache_size,
			 global_options.x_param_values,
			 global_options_set.x_param_values);
  maybe_set_param_value (PARAM_L2_CACHE_SIZE, ix86_cost->l2_cache_size,
			 global_options.x_param_values,
			 global_options_set.x_param_values);

  /* Enable software prefetching at -O3 for CPUs where prefetching is
     beneficial.  */
  if (flag_prefetch_loop_arrays < 0
      && HAVE_prefetch
      && optimize >= 3
      && TARGET_SOFTWARE_PREFETCHING_BENEFICIAL)
    flag_prefetch_loop_arrays = 1;

  /* If using typedef char *va_list, signal that __builtin_va_start (&ap, 0)
     can be optimized to ap = __builtin_next_arg (0).  */
  if (!TARGET_64BIT && !flag_split_stack)
    targetm.expand_builtin_va_start = NULL;
  if (TARGET_64BIT)
    {
      ix86_gen_leave = gen_leave_rex64;
      ix86_gen_add3 = gen_adddi3;
      ix86_gen_sub3 = gen_subdi3;
      ix86_gen_sub3_carry = gen_subdi3_carry;
      ix86_gen_one_cmpl2 = gen_one_cmpldi2;
      ix86_gen_monitor = gen_sse3_monitor64;
      ix86_gen_andsp = gen_anddi3;
      ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_di;
      ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probedi;
      ix86_gen_probe_stack_range = gen_probe_stack_rangedi;
    }
  else
    {
      ix86_gen_leave = gen_leave;
      ix86_gen_add3 = gen_addsi3;
      ix86_gen_sub3 = gen_subsi3;
      ix86_gen_sub3_carry = gen_subsi3_carry;
      ix86_gen_one_cmpl2 = gen_one_cmplsi2;
      ix86_gen_monitor = gen_sse3_monitor;
      ix86_gen_andsp = gen_andsi3;
      ix86_gen_allocate_stack_worker = gen_allocate_stack_worker_probe_si;
      ix86_gen_adjust_stack_and_probe = gen_adjust_stack_and_probesi;
      ix86_gen_probe_stack_range = gen_probe_stack_rangesi;
    }
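#if 0
/* Editor's illustration, not compiled: code elsewhere in this file emits
   word-size-agnostic insns through the hooks set above, e.g. an addition
   that is adddi3 in 64-bit mode and addsi3 in 32-bit mode.  DEST, SRC and
   OFF are hypothetical operands.  */
static void
emit_pointer_add_example (rtx dest, rtx src, rtx off)
{
  emit_insn (ix86_gen_add3 (dest, src, off));
}
#endif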
#ifdef USE_IX86_CLD
  /* Use -mcld by default for 32-bit code if configured with --enable-cld.  */
  if (!TARGET_64BIT)
    target_flags |= MASK_CLD & ~target_flags_explicit;
#endif

  if (!TARGET_64BIT && flag_pic)
    {
      if (flag_fentry > 0)
	sorry ("-mfentry isn%'t supported for 32-bit in combination "
	       "with -fpic");
      flag_fentry = 0;
    }
  else if (TARGET_SEH)
    {
      if (flag_fentry == 0)
	sorry ("-mno-fentry isn%'t compatible with SEH");
      flag_fentry = 1;
    }
  else if (flag_fentry < 0)
    {
#if defined(PROFILE_BEFORE_PROLOGUE)
      flag_fentry = 1;
#else
      flag_fentry = 0;
#endif
    }
  if (TARGET_AVX)
    {
      /* When not optimizing for size, enable the vzeroupper optimization for
	 TARGET_AVX with -fexpensive-optimizations and split 32-byte
	 AVX unaligned loads/stores.  */
      if (!optimize_size)
	{
	  if (flag_expensive_optimizations
	      && !(target_flags_explicit & MASK_VZEROUPPER))
	    target_flags |= MASK_VZEROUPPER;
	  if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
	      && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
	    target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
	  if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
	      && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
	    target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
	  /* Enable 128-bit AVX instruction generation for the auto-vectorizer.  */
	  if (TARGET_AVX128_OPTIMAL && !(target_flags_explicit & MASK_PREFER_AVX128))
	    target_flags |= MASK_PREFER_AVX128;
	}
    }
  else
    {
      /* Disable the vzeroupper pass if TARGET_AVX is disabled.  */
      target_flags &= ~MASK_VZEROUPPER;
    }

  /* Save the initial options in case the user does function specific
     options.  */
  if (main_args_p)
    target_option_default_node = target_option_current_node
      = build_target_option_node ();
}
/* Return TRUE if VAL is passed in a register with 256bit AVX modes.  */

static bool
function_pass_avx256_p (const_rtx val)
{
  if (!val)
    return false;

  if (REG_P (val) && VALID_AVX256_REG_MODE (GET_MODE (val)))
    return true;

  if (GET_CODE (val) == PARALLEL)
    {
      int i;
      rtx r;

      for (i = XVECLEN (val, 0) - 1; i >= 0; i--)
	{
	  r = XVECEXP (val, 0, i);
	  if (GET_CODE (r) == EXPR_LIST
	      && XEXP (r, 0)
	      && REG_P (XEXP (r, 0))
	      && (GET_MODE (XEXP (r, 0)) == OImode
		  || VALID_AVX256_REG_MODE (GET_MODE (XEXP (r, 0)))))
	    return true;
	}
    }

  return false;
}
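#if 0
/* Editor's illustration, not compiled: the PARALLEL shape matched above is
   the one used for values returned in registers, roughly

     (parallel [(expr_list (reg:V8SF 21 xmm0)
			   (const_int 0))])

   so inspecting XEXP (r, 0) of each EXPR_LIST element finds the hard
   registers and their modes.  */
#endif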
/* Implement the TARGET_OPTION_OVERRIDE hook.  */

static void
ix86_option_override (void)
{
  ix86_option_override_internal (true);
}
/* Update register usage after having seen the compiler flags.  */

void
ix86_conditional_register_usage (void)
{
  int i;
  unsigned int j;

  for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
    {
      if (fixed_regs[i] > 1)
	fixed_regs[i] = (fixed_regs[i] == (TARGET_64BIT ? 3 : 2));
      if (call_used_regs[i] > 1)
	call_used_regs[i] = (call_used_regs[i] == (TARGET_64BIT ? 3 : 2));
    }

  /* The PIC register, if it exists, is fixed.  */
  j = PIC_OFFSET_TABLE_REGNUM;
  if (j != INVALID_REGNUM)
    fixed_regs[j] = call_used_regs[j] = 1;

  /* The 64-bit MS_ABI changes the set of call-used registers.  */
  if (TARGET_64BIT_MS_ABI)
    {
      call_used_regs[SI_REG] = 0;
      call_used_regs[DI_REG] = 0;
      call_used_regs[XMM6_REG] = 0;
      call_used_regs[XMM7_REG] = 0;
      for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
	call_used_regs[i] = 0;
    }

  /* The default setting of CLOBBERED_REGS is for 32-bit; add in the
     other call-clobbered regs for 64-bit.  */
  if (TARGET_64BIT)
    {
      CLEAR_HARD_REG_SET (reg_class_contents[(int)CLOBBERED_REGS]);

      for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
	if (TEST_HARD_REG_BIT (reg_class_contents[(int)GENERAL_REGS], i)
	    && call_used_regs[i])
	  SET_HARD_REG_BIT (reg_class_contents[(int)CLOBBERED_REGS], i);
    }

  /* If MMX is disabled, squash the registers.  */
  if (! TARGET_MMX)
    for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
      if (TEST_HARD_REG_BIT (reg_class_contents[(int)MMX_REGS], i))
	fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";

  /* If SSE is disabled, squash the registers.  */
  if (! TARGET_SSE)
    for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
      if (TEST_HARD_REG_BIT (reg_class_contents[(int)SSE_REGS], i))
	fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";

  /* If the FPU is disabled, squash the registers.  */
  if (! (TARGET_80387 || TARGET_FLOAT_RETURNS_IN_80387))
    for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
      if (TEST_HARD_REG_BIT (reg_class_contents[(int)FLOAT_REGS], i))
	fixed_regs[i] = call_used_regs[i] = 1, reg_names[i] = "";

  /* If 32-bit, squash the 64-bit registers.  */
  if (! TARGET_64BIT)
    {
      for (i = FIRST_REX_INT_REG; i <= LAST_REX_INT_REG; i++)
	reg_names[i] = "";
      for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
	reg_names[i] = "";
    }
}
/* Save the current options.  */

static void
ix86_function_specific_save (struct cl_target_option *ptr)
{
  ptr->arch = ix86_arch;
  ptr->schedule = ix86_schedule;
  ptr->tune = ix86_tune;
  ptr->branch_cost = ix86_branch_cost;
  ptr->tune_defaulted = ix86_tune_defaulted;
  ptr->arch_specified = ix86_arch_specified;
  ptr->x_ix86_isa_flags_explicit = ix86_isa_flags_explicit;
  ptr->ix86_target_flags_explicit = target_flags_explicit;

  /* The fields are char but the variables are not; make sure the
     values fit in the fields.  */
  gcc_assert (ptr->arch == ix86_arch);
  gcc_assert (ptr->schedule == ix86_schedule);
  gcc_assert (ptr->tune == ix86_tune);
  gcc_assert (ptr->branch_cost == ix86_branch_cost);
}
/* Restore the current options.  */

static void
ix86_function_specific_restore (struct cl_target_option *ptr)
{
  enum processor_type old_tune = ix86_tune;
  enum processor_type old_arch = ix86_arch;
  unsigned int ix86_arch_mask, ix86_tune_mask;
  int i;

  ix86_arch = (enum processor_type) ptr->arch;
  ix86_schedule = (enum attr_cpu) ptr->schedule;
  ix86_tune = (enum processor_type) ptr->tune;
  ix86_branch_cost = ptr->branch_cost;
  ix86_tune_defaulted = ptr->tune_defaulted;
  ix86_arch_specified = ptr->arch_specified;
  ix86_isa_flags_explicit = ptr->x_ix86_isa_flags_explicit;
  target_flags_explicit = ptr->ix86_target_flags_explicit;

  /* Recreate the arch feature tests if the arch changed.  */
  if (old_arch != ix86_arch)
    {
      ix86_arch_mask = 1u << ix86_arch;
      for (i = 0; i < X86_ARCH_LAST; ++i)
	ix86_arch_features[i]
	  = !!(initial_ix86_arch_features[i] & ix86_arch_mask);
    }

  /* Recreate the tune optimization tests.  */
  if (old_tune != ix86_tune)
    {
      ix86_tune_mask = 1u << ix86_tune;
      for (i = 0; i < X86_TUNE_LAST; ++i)
	ix86_tune_features[i]
	  = !!(initial_ix86_tune_features[i] & ix86_tune_mask);
    }
}
/* Print the current options.  */

static void
ix86_function_specific_print (FILE *file, int indent,
			      struct cl_target_option *ptr)
{
  char *target_string
    = ix86_target_string (ptr->x_ix86_isa_flags, ptr->x_target_flags,
			  NULL, NULL, ptr->x_ix86_fpmath, false);

  fprintf (file, "%*sarch = %d (%s)\n",
	   indent, "",
	   ptr->arch,
	   ((ptr->arch < TARGET_CPU_DEFAULT_max)
	    ? cpu_names[ptr->arch]
	    : "<unknown>"));

  fprintf (file, "%*stune = %d (%s)\n",
	   indent, "",
	   ptr->tune,
	   ((ptr->tune < TARGET_CPU_DEFAULT_max)
	    ? cpu_names[ptr->tune]
	    : "<unknown>"));

  fprintf (file, "%*sbranch_cost = %d\n", indent, "", ptr->branch_cost);

  if (target_string)
    {
      fprintf (file, "%*s%s\n", indent, "", target_string);
      free (target_string);
    }
}
/* Inner function to process the attribute((target(...))): take one argument
   and set the current options from it.  If the argument is a list, recurse
   over its elements.  */

static bool
ix86_valid_target_attribute_inner_p (tree args, char *p_strings[],
				     struct gcc_options *enum_opts_set)
{
  char *next_optstr;
  bool ret = true;

#define IX86_ATTR_ISA(S,O)   { S, sizeof (S)-1, ix86_opt_isa, O, 0 }
#define IX86_ATTR_STR(S,O)   { S, sizeof (S)-1, ix86_opt_str, O, 0 }
#define IX86_ATTR_ENUM(S,O)  { S, sizeof (S)-1, ix86_opt_enum, O, 0 }
#define IX86_ATTR_YES(S,O,M) { S, sizeof (S)-1, ix86_opt_yes, O, M }
#define IX86_ATTR_NO(S,O,M)  { S, sizeof (S)-1, ix86_opt_no,  O, M }
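#if 0
/* Editor's illustration, not compiled: one table entry after macro
   expansion.  IX86_ATTR_ISA ("avx", OPT_mavx) becomes

     { "avx", sizeof ("avx") - 1, ix86_opt_isa, OPT_mavx, 0 }

   i.e. the string, its length (3), the option kind, the option index, and
   an unused mask.  */
#endif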
  enum ix86_opt_type
  {
    ix86_opt_unknown,
    ix86_opt_yes,
    ix86_opt_no,
    ix86_opt_str,
    ix86_opt_enum,
    ix86_opt_isa
  };

  static const struct
  {
    const char *string;
    size_t len;
    enum ix86_opt_type type;
    int opt;
    int mask;
  } attrs[] = {
    /* isa options */
    IX86_ATTR_ISA ("3dnow",	OPT_m3dnow),
    IX86_ATTR_ISA ("abm",	OPT_mabm),
    IX86_ATTR_ISA ("bmi",	OPT_mbmi),
    IX86_ATTR_ISA ("tbm",	OPT_mtbm),
    IX86_ATTR_ISA ("aes",	OPT_maes),
    IX86_ATTR_ISA ("avx",	OPT_mavx),
    IX86_ATTR_ISA ("mmx",	OPT_mmmx),
    IX86_ATTR_ISA ("pclmul",	OPT_mpclmul),
    IX86_ATTR_ISA ("popcnt",	OPT_mpopcnt),
    IX86_ATTR_ISA ("sse",	OPT_msse),
    IX86_ATTR_ISA ("sse2",	OPT_msse2),
    IX86_ATTR_ISA ("sse3",	OPT_msse3),
    IX86_ATTR_ISA ("sse4",	OPT_msse4),
    IX86_ATTR_ISA ("sse4.1",	OPT_msse4_1),
    IX86_ATTR_ISA ("sse4.2",	OPT_msse4_2),
    IX86_ATTR_ISA ("sse4a",	OPT_msse4a),
    IX86_ATTR_ISA ("ssse3",	OPT_mssse3),
    IX86_ATTR_ISA ("fma4",	OPT_mfma4),
    IX86_ATTR_ISA ("xop",	OPT_mxop),
    IX86_ATTR_ISA ("lwp",	OPT_mlwp),
    IX86_ATTR_ISA ("fsgsbase",	OPT_mfsgsbase),
    IX86_ATTR_ISA ("rdrnd",	OPT_mrdrnd),
    IX86_ATTR_ISA ("f16c",	OPT_mf16c),

    /* enum options */
    IX86_ATTR_ENUM ("fpmath=",	OPT_mfpmath_),

    /* string options */
    IX86_ATTR_STR ("arch=",	IX86_FUNCTION_SPECIFIC_ARCH),
    IX86_ATTR_STR ("tune=",	IX86_FUNCTION_SPECIFIC_TUNE),

    /* flag options */
    IX86_ATTR_YES ("cld",
		   OPT_mcld,
		   MASK_CLD),

    IX86_ATTR_NO ("fancy-math-387",
		  OPT_mfancy_math_387,
		  MASK_NO_FANCY_MATH_387),

    IX86_ATTR_YES ("ieee-fp",
		   OPT_mieee_fp,
		   MASK_IEEE_FP),

    IX86_ATTR_YES ("inline-all-stringops",
		   OPT_minline_all_stringops,
		   MASK_INLINE_ALL_STRINGOPS),

    IX86_ATTR_YES ("inline-stringops-dynamically",
		   OPT_minline_stringops_dynamically,
		   MASK_INLINE_STRINGOPS_DYNAMICALLY),

    IX86_ATTR_NO ("align-stringops",
		  OPT_mno_align_stringops,
		  MASK_NO_ALIGN_STRINGOPS),

    IX86_ATTR_YES ("recip",
		   OPT_mrecip,
		   MASK_RECIP),
  };
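#if 0
/* Editor's illustration, not compiled: source-level use of the strings in
   the table above.  Each comma-separated item is matched against attrs[],
   and a "no-" prefix clears the option instead of setting it.  */
__attribute__((target ("avx,no-fancy-math-387")))
void example_fn (void);
#endif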
  /* If this is a list, recurse to get the options.  */
  if (TREE_CODE (args) == TREE_LIST)
    {
      bool ret = true;

      for (; args; args = TREE_CHAIN (args))
	if (TREE_VALUE (args)
	    && !ix86_valid_target_attribute_inner_p (TREE_VALUE (args),
						     p_strings, enum_opts_set))
	  ret = false;

      return ret;
    }

  else if (TREE_CODE (args) != STRING_CST)
    gcc_unreachable ();

  /* Handle multiple arguments separated by commas.  */
  next_optstr = ASTRDUP (TREE_STRING_POINTER (args));

  while (next_optstr && *next_optstr != '\0')
    {
      char *p = next_optstr;
      char *orig_p = p;
      char *comma = strchr (next_optstr, ',');
      const char *opt_string;
      size_t len, opt_len;
      int opt;
      bool opt_set_p;
      char ch;
      unsigned i;
      enum ix86_opt_type type = ix86_opt_unknown;
      int mask = 0;

      if (comma)
	{
	  *comma = '\0';
	  len = comma - next_optstr;
	  next_optstr = comma + 1;
	}
      else
	{
	  len = strlen (p);
	  next_optstr = NULL;
	}

      /* Recognize no-xxx.  */
      if (len > 3 && p[0] == 'n' && p[1] == 'o' && p[2] == '-')
	{
	  opt_set_p = false;
	  p += 3;
	  len -= 3;
	}
      else
	opt_set_p = true;

      /* Find the option.  */
      ch = *p;
      opt = N_OPTS;
      for (i = 0; i < ARRAY_SIZE (attrs); i++)
	{
	  type = attrs[i].type;
	  opt_len = attrs[i].len;
	  if (ch == attrs[i].string[0]
	      && ((type != ix86_opt_str && type != ix86_opt_enum)
		  ? len == opt_len
		  : len > opt_len)
	      && memcmp (p, attrs[i].string, opt_len) == 0)
	    {
	      opt = attrs[i].opt;
	      mask = attrs[i].mask;
	      opt_string = attrs[i].string;
	      break;
	    }
	}

      /* Process the option.  */
      if (opt == N_OPTS)
	{
	  error ("attribute(target(\"%s\")) is unknown", orig_p);
	  ret = false;
	}

      else if (type == ix86_opt_isa)
	{
	  struct cl_decoded_option decoded;

	  generate_option (opt, NULL, opt_set_p, CL_TARGET, &decoded);
	  ix86_handle_option (&global_options, &global_options_set,
			      &decoded, input_location);
	}

      else if (type == ix86_opt_yes || type == ix86_opt_no)
	{
	  if (type == ix86_opt_no)
	    opt_set_p = !opt_set_p;

	  if (opt_set_p)
	    target_flags |= mask;
	  else
	    target_flags &= ~mask;
	}

      else if (type == ix86_opt_str)
	{
	  if (p_strings[opt])
	    {
	      error ("option(\"%s\") was already specified", opt_string);
	      ret = false;
	    }
	  else
	    p_strings[opt] = xstrdup (p + opt_len);
	}

      else if (type == ix86_opt_enum)
	{
	  bool arg_ok;
	  int value;

	  arg_ok = opt_enum_arg_to_value (opt, p + opt_len, &value, CL_TARGET);
	  if (arg_ok)
	    set_option (&global_options, enum_opts_set, opt, value,
			p + opt_len, DK_UNSPECIFIED, input_location,
			global_dc);
	  else
	    {
	      error ("attribute(target(\"%s\")) is unknown", orig_p);
	      ret = false;
	    }
	}

      else
	gcc_unreachable ();
    }

  return ret;
}
/* Return a TARGET_OPTION_NODE tree of the target options listed or NULL.  */

tree
ix86_valid_target_attribute_tree (tree args)
{
  const char *orig_arch_string = ix86_arch_string;
  const char *orig_tune_string = ix86_tune_string;
  enum fpmath_unit orig_fpmath_set = global_options_set.x_ix86_fpmath;
  int orig_tune_defaulted = ix86_tune_defaulted;
  int orig_arch_specified = ix86_arch_specified;
  char *option_strings[IX86_FUNCTION_SPECIFIC_MAX] = { NULL, NULL };
  tree t = NULL_TREE;
  int i;
  struct cl_target_option *def
    = TREE_TARGET_OPTION (target_option_default_node);
  struct gcc_options enum_opts_set;

  memset (&enum_opts_set, 0, sizeof (enum_opts_set));

  /* Process each of the options on the chain.  */
  if (! ix86_valid_target_attribute_inner_p (args, option_strings,
					     &enum_opts_set))
    return NULL_TREE;

  /* If the changed options are different from the default, rerun
     ix86_option_override_internal, and then save the options away.
     The string options are attribute options, and will be undone
     when we copy the save structure.  */
  if (ix86_isa_flags != def->x_ix86_isa_flags
      || target_flags != def->x_target_flags
      || option_strings[IX86_FUNCTION_SPECIFIC_ARCH]
      || option_strings[IX86_FUNCTION_SPECIFIC_TUNE]
      || enum_opts_set.x_ix86_fpmath)
    {
      /* If we are using the default tune= or arch=, undo the string assigned,
	 and use the default.  */
      if (option_strings[IX86_FUNCTION_SPECIFIC_ARCH])
	ix86_arch_string = option_strings[IX86_FUNCTION_SPECIFIC_ARCH];
      else if (!orig_arch_specified)
	ix86_arch_string = NULL;

      if (option_strings[IX86_FUNCTION_SPECIFIC_TUNE])
	ix86_tune_string = option_strings[IX86_FUNCTION_SPECIFIC_TUNE];
      else if (orig_tune_defaulted)
	ix86_tune_string = NULL;

      /* If fpmath= is not set, and we now have sse2 on 32-bit, use it.  */
      if (enum_opts_set.x_ix86_fpmath)
	global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
      else if (!TARGET_64BIT && TARGET_SSE)
	{
	  ix86_fpmath = (enum fpmath_unit) (FPMATH_SSE | FPMATH_387);
	  global_options_set.x_ix86_fpmath = (enum fpmath_unit) 1;
	}

      /* Do any overrides, such as arch=xxx, or tune=xxx support.  */
      ix86_option_override_internal (false);

      /* Add any builtin functions with the new isa if any.  */
      ix86_add_new_builtins (ix86_isa_flags);

      /* Save the current options.  */
      t = build_target_option_node ();

      ix86_arch_string = orig_arch_string;
      ix86_tune_string = orig_tune_string;
      global_options_set.x_ix86_fpmath = orig_fpmath_set;

      /* Free up the memory allocated to hold the strings.  */
      for (i = 0; i < IX86_FUNCTION_SPECIFIC_MAX; i++)
	free (option_strings[i]);
    }

  return t;
}
/* Hook to validate attribute((target("string"))).  */

static bool
ix86_valid_target_attribute_p (tree fndecl,
			       tree ARG_UNUSED (name),
			       tree args,
			       int ARG_UNUSED (flags))
{
  struct cl_target_option cur_target;
  bool ret = true;
  tree old_optimize = build_optimization_node ();
  tree new_target, new_optimize;
  tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);

  /* If the function changed the optimization levels as well as setting target
     options, start with the optimizations specified.  */
  if (func_optimize && func_optimize != old_optimize)
    cl_optimization_restore (&global_options,
			     TREE_OPTIMIZATION (func_optimize));

  /* The target attributes may also change some optimization flags, so update
     the optimization options if necessary.  */
  cl_target_option_save (&cur_target, &global_options);
  new_target = ix86_valid_target_attribute_tree (args);
  new_optimize = build_optimization_node ();

  if (!new_target)
    ret = false;

  else if (fndecl)
    {
      DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;

      if (old_optimize != new_optimize)
	DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
    }

  cl_target_option_restore (&global_options, &cur_target);

  if (old_optimize != new_optimize)
    cl_optimization_restore (&global_options,
			     TREE_OPTIMIZATION (old_optimize));

  return ret;
}
/* Hook to determine if one function can safely inline another.  */

static bool
ix86_can_inline_p (tree caller, tree callee)
{
  bool ret = false;
  tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
  tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);

  /* If the callee has no option attributes, then it is ok to inline.  */
  if (!callee_tree)
    ret = true;

  /* If the caller has no option attributes, but the callee does, then it is
     not ok to inline.  */
  else if (!caller_tree)
    ret = false;

  else
    {
      struct cl_target_option *caller_opts = TREE_TARGET_OPTION (caller_tree);
      struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);

      /* The callee's isa options should be a subset of the caller's, i.e. an
	 SSE4 function can inline an SSE2 function, but an SSE2 function can't
	 inline an SSE4 function.  */
      if ((caller_opts->x_ix86_isa_flags & callee_opts->x_ix86_isa_flags)
	  != callee_opts->x_ix86_isa_flags)
	ret = false;

      /* See if we have the same non-isa options.  */
      else if (caller_opts->x_target_flags != callee_opts->x_target_flags)
	ret = false;

      /* See if arch, tune, etc. are the same.  */
      else if (caller_opts->arch != callee_opts->arch)
	ret = false;

      else if (caller_opts->tune != callee_opts->tune)
	ret = false;

      else if (caller_opts->x_ix86_fpmath != callee_opts->x_ix86_fpmath)
	ret = false;

      else if (caller_opts->branch_cost != callee_opts->branch_cost)
	ret = false;

      else
	ret = true;
    }

  return ret;
}
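#if 0
/* Editor's illustration, not compiled: under the subset rule above, an
   SSE4.2 caller may inline an SSE2 callee, but not the reverse, because
   every isa flag of the callee must also be set in the caller.  */
__attribute__((target ("sse2")))
static int callee_fn (void) { return 0; }

__attribute__((target ("sse4.2")))
static int caller_fn (void) { return callee_fn (); }	/* inlinable */
#endif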
/* Remember the last target of ix86_set_current_function.  */
static GTY(()) tree ix86_previous_fndecl;

/* Establish appropriate back-end context for processing the function
   FNDECL.  The argument might be NULL to indicate processing at top
   level, outside of any function scope.  */
static void
ix86_set_current_function (tree fndecl)
{
  /* Only change the context if the function changes.  This hook is called
     several times in the course of compiling a function, and we don't want to
     slow things down too much or call target_reinit when it isn't safe.  */
  if (fndecl && fndecl != ix86_previous_fndecl)
    {
      tree old_tree = (ix86_previous_fndecl
		       ? DECL_FUNCTION_SPECIFIC_TARGET (ix86_previous_fndecl)
		       : NULL_TREE);

      tree new_tree = (fndecl
		       ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
		       : NULL_TREE);

      ix86_previous_fndecl = fndecl;
      if (old_tree == new_tree)
	;

      else if (new_tree)
	{
	  cl_target_option_restore (&global_options,
				    TREE_TARGET_OPTION (new_tree));
	  target_reinit ();
	}

      else if (old_tree)
	{
	  struct cl_target_option *def
	    = TREE_TARGET_OPTION (target_option_current_node);

	  cl_target_option_restore (&global_options, def);
	  target_reinit ();
	}
    }
}
/* Return true if this goes in large data/bss.  */

static bool
ix86_in_large_data_p (tree exp)
{
  if (ix86_cmodel != CM_MEDIUM && ix86_cmodel != CM_MEDIUM_PIC)
    return false;

  /* Functions are never large data.  */
  if (TREE_CODE (exp) == FUNCTION_DECL)
    return false;

  if (TREE_CODE (exp) == VAR_DECL && DECL_SECTION_NAME (exp))
    {
      const char *section = TREE_STRING_POINTER (DECL_SECTION_NAME (exp));
      if (strcmp (section, ".ldata") == 0
	  || strcmp (section, ".lbss") == 0)
	return true;
      return false;
    }
  else
    {
      HOST_WIDE_INT size = int_size_in_bytes (TREE_TYPE (exp));

      /* If this is an incomplete type with size 0, then we can't put it
	 in data because it might be too big when completed.  */
      if (!size || size > ix86_section_threshold)
	return true;
    }

  return false;
}
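#if 0
/* Editor's illustration, not compiled: with -mcmodel=medium, an object
   larger than ix86_section_threshold (65536 bytes by default, adjustable
   with -mlarge-data-threshold=) is treated as large data, so an array like
   this would be placed in .ldata rather than .data.  */
static char big_example[1 << 17];
#endif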
/* Switch to the appropriate section for output of DECL.
   DECL is either a `VAR_DECL' node or a constant of some sort.
   RELOC indicates whether forming the initial value of DECL requires
   link-time relocations.  */

static section * x86_64_elf_select_section (tree, int, unsigned HOST_WIDE_INT)
	ATTRIBUTE_UNUSED;

static section *
x86_64_elf_select_section (tree decl, int reloc,
			   unsigned HOST_WIDE_INT align)
{
  if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
      && ix86_in_large_data_p (decl))
    {
      const char *sname = NULL;
      unsigned int flags = SECTION_WRITE;
      switch (categorize_decl_for_section (decl, reloc))
	{
	case SECCAT_DATA:
	  sname = ".ldata";
	  break;
	case SECCAT_DATA_REL:
	  sname = ".ldata.rel";
	  break;
	case SECCAT_DATA_REL_LOCAL:
	  sname = ".ldata.rel.local";
	  break;
	case SECCAT_DATA_REL_RO:
	  sname = ".ldata.rel.ro";
	  break;
	case SECCAT_DATA_REL_RO_LOCAL:
	  sname = ".ldata.rel.ro.local";
	  break;
	case SECCAT_BSS:
	  sname = ".lbss";
	  flags |= SECTION_BSS;
	  break;
	case SECCAT_RODATA:
	case SECCAT_RODATA_MERGE_STR:
	case SECCAT_RODATA_MERGE_STR_INIT:
	case SECCAT_RODATA_MERGE_CONST:
	  sname = ".lrodata";
	  flags = 0;
	  break;
	case SECCAT_SRODATA:
	case SECCAT_SDATA:
	case SECCAT_SBSS:
	  gcc_unreachable ();
	case SECCAT_TEXT:
	case SECCAT_TDATA:
	case SECCAT_TBSS:
	  /* We don't split these for the medium model.  Place them into
	     default sections and hope for the best.  */
	  break;
	}
      if (sname)
	{
	  /* We might get called with string constants, but get_named_section
	     doesn't like them as they are not DECLs.  Also, we need to set
	     flags in that case.  */
	  if (!DECL_P (decl))
	    return get_section (sname, flags, NULL);
	  return get_named_section (decl, sname, reloc);
	}
    }
  return default_elf_select_section (decl, reloc, align);
}
/* Build up a unique section name, expressed as a
   STRING_CST node, and assign it to DECL_SECTION_NAME (decl).
   RELOC indicates whether the initial value of EXP requires
   link-time relocations.  */

static void ATTRIBUTE_UNUSED
x86_64_elf_unique_section (tree decl, int reloc)
{
  if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
      && ix86_in_large_data_p (decl))
    {
      const char *prefix = NULL;
      /* We only need to use .gnu.linkonce if we don't have COMDAT groups.  */
      bool one_only = DECL_ONE_ONLY (decl) && !HAVE_COMDAT_GROUP;

      switch (categorize_decl_for_section (decl, reloc))
	{
	case SECCAT_DATA:
	case SECCAT_DATA_REL:
	case SECCAT_DATA_REL_LOCAL:
	case SECCAT_DATA_REL_RO:
	case SECCAT_DATA_REL_RO_LOCAL:
	  prefix = one_only ? ".ld" : ".ldata";
	  break;
	case SECCAT_BSS:
	  prefix = one_only ? ".lb" : ".lbss";
	  break;
	case SECCAT_RODATA:
	case SECCAT_RODATA_MERGE_STR:
	case SECCAT_RODATA_MERGE_STR_INIT:
	case SECCAT_RODATA_MERGE_CONST:
	  prefix = one_only ? ".lr" : ".lrodata";
	  break;
	case SECCAT_SRODATA:
	case SECCAT_SDATA:
	case SECCAT_SBSS:
	  gcc_unreachable ();
	case SECCAT_TEXT:
	case SECCAT_TDATA:
	case SECCAT_TBSS:
	  /* We don't split these for the medium model.  Place them into
	     default sections and hope for the best.  */
	  break;
	}
      if (prefix)
	{
	  const char *name, *linkonce;
	  char *string;

	  name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
	  name = targetm.strip_name_encoding (name);

	  /* If we're using one_only, then there needs to be a .gnu.linkonce
	     prefix to the section name.  */
	  linkonce = one_only ? ".gnu.linkonce" : "";

	  string = ACONCAT ((linkonce, prefix, ".", name, NULL));

	  DECL_SECTION_NAME (decl) = build_string (strlen (string), string);
	  return;
	}
    }
  default_unique_section (decl, reloc);
}
#ifdef COMMON_ASM_OP
/* This says how to output assembler code to declare an
   uninitialized external linkage data object.

   For the medium model x86-64 we need to use the .largecomm opcode for
   large objects.  */
void
x86_elf_aligned_common (FILE *file,
			const char *name, unsigned HOST_WIDE_INT size,
			int align)
{
  if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
      && size > (unsigned int)ix86_section_threshold)
    fputs (".largecomm\t", file);
  else
    fputs (COMMON_ASM_OP, file);
  assemble_name (file, name);
  fprintf (file, "," HOST_WIDE_INT_PRINT_UNSIGNED ",%u\n",
	   size, align / BITS_PER_UNIT);
}
#endif
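#if 0
/* Editor's illustration, not compiled: for a 128 KiB common symbol under
   -mcmodel=medium the routine above emits roughly

	.largecomm	big_example,131072,32

   while a small symbol gets the ordinary COMMON_ASM_OP form

	.comm	small_example,16,4

   (sizes in bytes; the alignment is converted from bits with
   align / BITS_PER_UNIT).  */
#endif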
/* Utility function for targets to use in implementing
   ASM_OUTPUT_ALIGNED_BSS.  */

void
x86_output_aligned_bss (FILE *file, tree decl ATTRIBUTE_UNUSED,
			const char *name, unsigned HOST_WIDE_INT size,
			int align)
{
  if ((ix86_cmodel == CM_MEDIUM || ix86_cmodel == CM_MEDIUM_PIC)
      && size > (unsigned int)ix86_section_threshold)
    switch_to_section (get_named_section (decl, ".lbss", 0));
  else
    switch_to_section (bss_section);
  ASM_OUTPUT_ALIGN (file, floor_log2 (align / BITS_PER_UNIT));
#ifdef ASM_DECLARE_OBJECT_NAME
  last_assemble_variable_decl = decl;
  ASM_DECLARE_OBJECT_NAME (file, name, decl);
#else
  /* The standard thing is to just output a label for the object.  */
  ASM_OUTPUT_LABEL (file, name);
#endif /* ASM_DECLARE_OBJECT_NAME */
  ASM_OUTPUT_SKIP (file, size ? size : 1);
}
/* Decide whether we must probe the stack before any space allocation
   on this target.  It's essentially TARGET_STACK_PROBE except when
   -fstack-check causes the stack to be already probed differently.  */

bool
ix86_target_stack_probe (void)
{
  /* Do not probe the stack twice if static stack checking is enabled.  */
  if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
    return false;

  return TARGET_STACK_PROBE;
}
/* Decide whether we can make a sibling call to a function.  DECL is the
   declaration of the function being targeted by the call and EXP is the
   CALL_EXPR representing the call.  */

static bool
ix86_function_ok_for_sibcall (tree decl, tree exp)
{
  tree type, decl_or_type;
  rtx a, b;

  /* If we are generating position-independent code, we cannot sibcall
     optimize any indirect call, or a direct call to a global function,
     as the PLT requires %ebx be live. (Darwin does not have a PLT.)  */
  if (!TARGET_MACHO
      && !TARGET_64BIT
      && flag_pic
      && (!decl || !targetm.binds_local_p (decl)))
    return false;

  /* If we need to align the outgoing stack, then sibcalling would
     unalign the stack, which may break the called function.  */
  if (ix86_minimum_incoming_stack_boundary (true)
      < PREFERRED_STACK_BOUNDARY)
    return false;

  if (decl)
    {
      decl_or_type = decl;
      type = TREE_TYPE (decl);
    }
  else
    {
      /* We're looking at the CALL_EXPR, we need the type of the function.  */
      type = CALL_EXPR_FN (exp);		/* pointer expression */
      type = TREE_TYPE (type);			/* pointer type */
      type = TREE_TYPE (type);			/* function type */
      decl_or_type = type;
    }

  /* Check that the return value locations are the same.  For example,
     if we are returning floats on the 80387 register stack, we cannot
     make a sibcall from a function that doesn't return a float to a
     function that does or, conversely, from a function that does return
     a float to a function that doesn't; the necessary stack adjustment
     would not be executed.  This is also the place we notice
     differences in the return value ABI.  Note that it is ok for one
     of the functions to have void return type as long as the return
     value of the other is passed in a register.  */
  a = ix86_function_value (TREE_TYPE (exp), decl_or_type, false);
  b = ix86_function_value (TREE_TYPE (DECL_RESULT (cfun->decl)),
			   cfun->decl, false);
  if (STACK_REG_P (a) || STACK_REG_P (b))
    {
      if (!rtx_equal_p (a, b))
	return false;
    }
  else if (VOID_TYPE_P (TREE_TYPE (DECL_RESULT (cfun->decl))))
    {
      /* Disable sibcall if we need to generate vzeroupper after
	 callee returns.  */
      if (TARGET_VZEROUPPER
	  && cfun->machine->callee_return_avx256_p
	  && !cfun->machine->caller_return_avx256_p)
	return false;
    }
  else if (!rtx_equal_p (a, b))
    return false;

  if (TARGET_64BIT)
    {
      /* The SYSV ABI has more call-clobbered registers;
	 disallow sibcalls from MS to SYSV.  */
      if (cfun->machine->call_abi == MS_ABI
	  && ix86_function_type_abi (type) == SYSV_ABI)
	return false;
    }
  else
    {
      /* If this call is indirect, we'll need to be able to use a
	 call-clobbered register for the address of the target function.
	 Make sure that all such registers are not used for passing
	 parameters.  Note that DLLIMPORT functions are indirect.  */
      if (!decl
	  || (TARGET_DLLIMPORT_DECL_ATTRIBUTES && DECL_DLLIMPORT_P (decl)))
	{
	  if (ix86_function_regparm (type, NULL) >= 3)
	    {
	      /* ??? Need to count the actual number of registers to be used,
		 not the possible number of registers.  Fix later.  */
	      return false;
	    }
	}
    }

  /* Otherwise okay.  That also includes certain types of indirect calls.  */
  return true;
}
/* Handle "cdecl", "stdcall", "fastcall", "regparm", "thiscall",
   and "sseregparm" calling convention attributes;
   arguments as in struct attribute_spec.handler.  */

static tree
ix86_handle_cconv_attribute (tree *node, tree name,
			     tree args,
			     int flags ATTRIBUTE_UNUSED,
			     bool *no_add_attrs)
{
  if (TREE_CODE (*node) != FUNCTION_TYPE
      && TREE_CODE (*node) != METHOD_TYPE
      && TREE_CODE (*node) != FIELD_DECL
      && TREE_CODE (*node) != TYPE_DECL)
    {
      warning (OPT_Wattributes, "%qE attribute only applies to functions",
	       name);
      *no_add_attrs = true;
      return NULL_TREE;
    }

  /* Can combine regparm with all attributes but fastcall and thiscall.  */
  if (is_attribute_p ("regparm", name))
    {
      tree cst;

      if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
	{
	  error ("fastcall and regparm attributes are not compatible");
	}

      if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
	{
	  error ("regparm and thiscall attributes are not compatible");
	}

      cst = TREE_VALUE (args);
      if (TREE_CODE (cst) != INTEGER_CST)
	{
	  warning (OPT_Wattributes,
		   "%qE attribute requires an integer constant argument",
		   name);
	  *no_add_attrs = true;
	}
      else if (compare_tree_int (cst, REGPARM_MAX) > 0)
	{
	  warning (OPT_Wattributes, "argument to %qE attribute larger than %d",
		   name, REGPARM_MAX);
	  *no_add_attrs = true;
	}

      return NULL_TREE;
    }

  if (TARGET_64BIT)
    {
      /* Do not warn when emulating the MS ABI.  */
      if ((TREE_CODE (*node) != FUNCTION_TYPE
	   && TREE_CODE (*node) != METHOD_TYPE)
	  || ix86_function_type_abi (*node) != MS_ABI)
	warning (OPT_Wattributes, "%qE attribute ignored",
		 name);
      *no_add_attrs = true;
      return NULL_TREE;
    }

  /* Can combine fastcall with stdcall (redundant) and sseregparm.  */
  if (is_attribute_p ("fastcall", name))
    {
      if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
	{
	  error ("fastcall and cdecl attributes are not compatible");
	}
      if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
	{
	  error ("fastcall and stdcall attributes are not compatible");
	}
      if (lookup_attribute ("regparm", TYPE_ATTRIBUTES (*node)))
	{
	  error ("fastcall and regparm attributes are not compatible");
	}
      if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
	{
	  error ("fastcall and thiscall attributes are not compatible");
	}
    }

  /* Can combine stdcall with fastcall (redundant), regparm and
     sseregparm.  */
  else if (is_attribute_p ("stdcall", name))
    {
      if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
	{
	  error ("stdcall and cdecl attributes are not compatible");
	}
      if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
	{
	  error ("stdcall and fastcall attributes are not compatible");
	}
      if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
	{
	  error ("stdcall and thiscall attributes are not compatible");
	}
    }

  /* Can combine cdecl with regparm and sseregparm.  */
  else if (is_attribute_p ("cdecl", name))
    {
      if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
	{
	  error ("stdcall and cdecl attributes are not compatible");
	}
      if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
	{
	  error ("fastcall and cdecl attributes are not compatible");
	}
      if (lookup_attribute ("thiscall", TYPE_ATTRIBUTES (*node)))
	{
	  error ("cdecl and thiscall attributes are not compatible");
	}
    }
  else if (is_attribute_p ("thiscall", name))
    {
      if (TREE_CODE (*node) != METHOD_TYPE && pedantic)
	warning (OPT_Wattributes, "%qE attribute is used for non-class method",
		 name);
      if (lookup_attribute ("stdcall", TYPE_ATTRIBUTES (*node)))
	{
	  error ("stdcall and thiscall attributes are not compatible");
	}
      if (lookup_attribute ("fastcall", TYPE_ATTRIBUTES (*node)))
	{
	  error ("fastcall and thiscall attributes are not compatible");
	}
      if (lookup_attribute ("cdecl", TYPE_ATTRIBUTES (*node)))
	{
	  error ("cdecl and thiscall attributes are not compatible");
	}
    }

  /* Can combine sseregparm with all attributes.  */

  return NULL_TREE;
}
/* This function determines the calling convention from TYPE.  */

unsigned int
ix86_get_callcvt (const_tree type)
{
  unsigned int ret = 0;
  bool is_stdarg;
  tree attrs;

  if (TARGET_64BIT)
    return IX86_CALLCVT_CDECL;

  attrs = TYPE_ATTRIBUTES (type);
  if (attrs != NULL_TREE)
    {
      if (lookup_attribute ("cdecl", attrs))
	ret |= IX86_CALLCVT_CDECL;
      else if (lookup_attribute ("stdcall", attrs))
	ret |= IX86_CALLCVT_STDCALL;
      else if (lookup_attribute ("fastcall", attrs))
	ret |= IX86_CALLCVT_FASTCALL;
      else if (lookup_attribute ("thiscall", attrs))
	ret |= IX86_CALLCVT_THISCALL;

      /* Regparm isn't allowed for thiscall and fastcall.  */
      if ((ret & (IX86_CALLCVT_THISCALL | IX86_CALLCVT_FASTCALL)) == 0)
	{
	  if (lookup_attribute ("regparm", attrs))
	    ret |= IX86_CALLCVT_REGPARM;
	  if (lookup_attribute ("sseregparm", attrs))
	    ret |= IX86_CALLCVT_SSEREGPARM;
	}

      if (IX86_BASE_CALLCVT(ret) != 0)
	return ret;
    }

  is_stdarg = stdarg_p (type);
  if (TARGET_RTD && !is_stdarg)
    return IX86_CALLCVT_STDCALL | ret;

  if (ret != 0
      || is_stdarg
      || TREE_CODE (type) != METHOD_TYPE
      || ix86_function_type_abi (type) != MS_ABI)
    return IX86_CALLCVT_CDECL | ret;

  return IX86_CALLCVT_THISCALL;
}
/* Return 0 if the attributes for two types are incompatible, 1 if they
   are compatible, and 2 if they are nearly compatible (which causes a
   warning to be generated).  */

static int
ix86_comp_type_attributes (const_tree type1, const_tree type2)
{
  unsigned int ccvt1, ccvt2;

  if (TREE_CODE (type1) != FUNCTION_TYPE
      && TREE_CODE (type1) != METHOD_TYPE)
    return 1;

  ccvt1 = ix86_get_callcvt (type1);
  ccvt2 = ix86_get_callcvt (type2);
  if (ccvt1 != ccvt2)
    return 0;
  if (ix86_function_regparm (type1, NULL)
      != ix86_function_regparm (type2, NULL))
    return 0;

  return 1;
}
/* Return the regparm value for a function with the indicated TYPE and DECL.
   DECL may be NULL when calling the function indirectly
   or considering a libcall.  */

static int
ix86_function_regparm (const_tree type, const_tree decl)
{
  tree attr;
  int regparm;
  unsigned int ccvt;

  if (TARGET_64BIT)
    return (ix86_function_type_abi (type) == SYSV_ABI
	    ? X86_64_REGPARM_MAX : X86_64_MS_REGPARM_MAX);
  ccvt = ix86_get_callcvt (type);
  regparm = ix86_regparm;

  if ((ccvt & IX86_CALLCVT_REGPARM) != 0)
    {
      attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type));
      if (attr)
	{
	  regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr)));
	  return regparm;
	}
    }
  else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
    return 2;
  else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
    return 1;

  /* Use the register calling convention for local functions when possible.  */
  if (decl
      && TREE_CODE (decl) == FUNCTION_DECL
      && optimize
      && !(profile_flag && !flag_fentry))
    {
      /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified.  */
      struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
      if (i && i->local && i->can_change_signature)
	{
	  int local_regparm, globals = 0, regno;

	  /* Make sure no regparm register is taken by a
	     fixed register variable.  */
	  for (local_regparm = 0; local_regparm < REGPARM_MAX; local_regparm++)
	    if (fixed_regs[local_regparm])
	      break;

	  /* We don't want to use regparm(3) for nested functions as
	     these use a static chain pointer in the third argument.  */
	  if (local_regparm == 3 && DECL_STATIC_CHAIN (decl))
	    local_regparm = 2;

	  /* In 32-bit mode save a register for the split stack.  */
	  if (!TARGET_64BIT && local_regparm == 3 && flag_split_stack)
	    local_regparm = 2;

	  /* Each fixed register usage increases register pressure,
	     so fewer registers should be used for argument passing.
	     This functionality can be overridden by an explicit
	     regparm value.  */
	  for (regno = 0; regno <= DI_REG; regno++)
	    if (fixed_regs[regno])
	      globals++;

	  local_regparm
	    = globals < local_regparm ? local_regparm - globals : 0;

	  if (local_regparm > regparm)
	    regparm = local_regparm;
	}
    }

  return regparm;
}
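#if 0
/* Editor's illustration, not compiled: an explicit regparm attribute takes
   the first branch above and fixes the count, here to 3 (EAX, EDX, ECX),
   bypassing the local-function heuristics.  */
__attribute__((regparm (3)))
int regparm_example (int a, int b, int c);
#endif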
/* Return 1 or 2, if we can pass up to SSE_REGPARM_MAX SFmode (1) and
   DFmode (2) arguments in SSE registers for a function with the
   indicated TYPE and DECL.  DECL may be NULL when calling the function
   indirectly or considering a libcall.  Otherwise return 0.  */

static int
ix86_function_sseregparm (const_tree type, const_tree decl, bool warn)
{
  gcc_assert (!TARGET_64BIT);

  /* Use SSE registers to pass SFmode and DFmode arguments if requested
     by the sseregparm attribute.  */
  if (TARGET_SSEREGPARM
      || (type && lookup_attribute ("sseregparm", TYPE_ATTRIBUTES (type))))
    {
      if (!TARGET_SSE)
	{
	  if (warn)
	    {
	      if (decl)
		error ("calling %qD with attribute sseregparm without "
		       "SSE/SSE2 enabled", decl);
	      else
		error ("calling %qT with attribute sseregparm without "
		       "SSE/SSE2 enabled", type);
	    }
	  return 0;
	}

      return 2;
    }

  /* For local functions, pass up to SSE_REGPARM_MAX SFmode
     (and DFmode for SSE2) arguments in SSE registers.  */
  if (decl && TARGET_SSE_MATH && optimize
      && !(profile_flag && !flag_fentry))
    {
      /* FIXME: remove this CONST_CAST when cgraph.[ch] is constified.  */
      struct cgraph_local_info *i = cgraph_local_info (CONST_CAST_TREE (decl));
      if (i && i->local && i->can_change_signature)
	return TARGET_SSE2 ? 2 : 1;
    }

  return 0;
}
/* Return true if EAX is live at the start of the function.  Used by
   ix86_expand_prologue to determine if we need special help before
   calling allocate_stack_worker.  */

static bool
ix86_eax_live_at_start_p (void)
{
  /* Cheat.  Don't bother working forward from ix86_function_regparm
     to the function type to whether an actual argument is located in
     eax.  Instead just look at cfg info, which is still close enough
     to correct at this point.  This gives false positives for broken
     functions that might use uninitialized data that happens to be
     allocated in eax, but who cares?  */
  return REGNO_REG_SET_P (df_get_live_out (ENTRY_BLOCK_PTR), 0);
}
static bool
ix86_keep_aggregate_return_pointer (tree fntype)
{
  tree attr;

  if (!TARGET_64BIT)
    {
      attr = lookup_attribute ("callee_pop_aggregate_return",
			       TYPE_ATTRIBUTES (fntype));
      if (attr)
	return (TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))) == 0);

      /* For the 32-bit MS-ABI the default is to keep the aggregate
	 return pointer.  */
      if (ix86_function_type_abi (fntype) == MS_ABI)
	return true;
    }
  return KEEP_AGGREGATE_RETURN_POINTER != 0;
}
/* Value is the number of bytes of arguments automatically
   popped when returning from a subroutine call.
   FUNDECL is the declaration node of the function (as a tree),
   FUNTYPE is the data type of the function (as a tree),
   or for a library call it is an identifier node for the subroutine name.
   SIZE is the number of bytes of arguments passed on the stack.

   On the 80386, the RTD insn may be used to pop them if the number
   of args is fixed, but if the number is variable then the caller
   must pop them all.  RTD can't be used for library calls now
   because the library is compiled with the Unix compiler.
   Use of RTD is a selectable option, since it is incompatible with
   standard Unix calling sequences.  If the option is not selected,
   the caller must always pop the args.

   The attribute stdcall is equivalent to RTD on a per module basis.  */

static int
ix86_return_pops_args (tree fundecl, tree funtype, int size)
{
  unsigned int ccvt;

  /* None of the 64-bit ABIs pop arguments.  */
  if (TARGET_64BIT)
    return 0;

  ccvt = ix86_get_callcvt (funtype);

  if ((ccvt & (IX86_CALLCVT_STDCALL | IX86_CALLCVT_FASTCALL
	       | IX86_CALLCVT_THISCALL)) != 0
      && ! stdarg_p (funtype))
    return size;

  /* Lose any fake structure return argument if it is passed on the stack.  */
  if (aggregate_value_p (TREE_TYPE (funtype), fundecl)
      && !ix86_keep_aggregate_return_pointer (funtype))
    {
      int nregs = ix86_function_regparm (funtype, fundecl);
      if (nregs == 0)
	return GET_MODE_SIZE (Pmode);
    }

  return 0;
}
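#if 0
/* Editor's illustration, not compiled: for this 32-bit stdcall function the
   routine above returns 8 (two 4-byte stack arguments), so the callee's
   epilogue pops its own arguments with "ret $8" instead of a plain "ret".  */
__attribute__((stdcall))
int stdcall_example (int a, int b);
#endif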
/* Argument support functions.  */

/* Return true when a register may be used to pass function parameters.  */

bool
ix86_function_arg_regno_p (int regno)
{
  int i;
  const int *parm_regs;

  if (!TARGET_64BIT)
    {
      if (TARGET_MACHO)
	return (regno < REGPARM_MAX
		|| (TARGET_SSE && SSE_REGNO_P (regno) && !fixed_regs[regno]));
      else
	return (regno < REGPARM_MAX
		|| (TARGET_MMX && MMX_REGNO_P (regno)
		    && (regno < FIRST_MMX_REG + MMX_REGPARM_MAX))
		|| (TARGET_SSE && SSE_REGNO_P (regno)
		    && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX)));
    }

  if (TARGET_MACHO)
    {
      if (SSE_REGNO_P (regno) && TARGET_SSE)
	return true;
    }
  else
    {
      if (TARGET_SSE && SSE_REGNO_P (regno)
	  && (regno < FIRST_SSE_REG + SSE_REGPARM_MAX))
	return true;
    }

  /* TODO: The function should depend on the current function ABI but
     builtins.c would need updating then.  Therefore we use the
     default ABI.  */

  /* RAX is used as a hidden argument to va_arg functions.  */
  if (ix86_abi == SYSV_ABI && regno == AX_REG)
    return true;

  if (ix86_abi == MS_ABI)
    parm_regs = x86_64_ms_abi_int_parameter_registers;
  else
    parm_regs = x86_64_int_parameter_registers;
  for (i = 0; i < (ix86_abi == MS_ABI
		   ? X86_64_MS_REGPARM_MAX : X86_64_REGPARM_MAX); i++)
    if (regno == parm_regs[i])
      return true;
  return false;
}
/* Return true if we do not know how to pass TYPE solely in registers.  */

static bool
ix86_must_pass_in_stack (enum machine_mode mode, const_tree type)
{
  if (must_pass_in_stack_var_size_or_pad (mode, type))
    return true;

  /* For 32-bit, we want TImode aggregates to go on the stack.  But watch out!
     The layout_type routine is crafty and tries to trick us into passing
     currently unsupported vector types on the stack by using TImode.  */
  return (!TARGET_64BIT && mode == TImode
	  && type && TREE_CODE (type) != VECTOR_TYPE);
}
/* Return the size, in bytes, of the area reserved for arguments passed
   in registers for the function represented by FNDECL, depending on the
   ABI used.  */
int
ix86_reg_parm_stack_space (const_tree fndecl)
{
  enum calling_abi call_abi = SYSV_ABI;
  if (fndecl != NULL_TREE && TREE_CODE (fndecl) == FUNCTION_DECL)
    call_abi = ix86_function_abi (fndecl);
  else
    call_abi = ix86_function_type_abi (fndecl);
  if (TARGET_64BIT && call_abi == MS_ABI)
    return 32;
  return 0;
}
/* Returns SYSV_ABI or MS_ABI depending on FNTYPE, specifying the
   call abi used.  */
enum calling_abi
ix86_function_type_abi (const_tree fntype)
{
  if (fntype != NULL_TREE && TYPE_ATTRIBUTES (fntype) != NULL_TREE)
    {
      enum calling_abi abi = ix86_abi;
      if (abi == SYSV_ABI)
	{
	  if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (fntype)))
	    abi = MS_ABI;
	}
      else if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (fntype)))
	abi = SYSV_ABI;
      return abi;
    }
  return ix86_abi;
}
static bool
ix86_function_ms_hook_prologue (const_tree fn)
{
  if (fn && lookup_attribute ("ms_hook_prologue", DECL_ATTRIBUTES (fn)))
    {
      if (decl_function_context (fn) != NULL_TREE)
	error_at (DECL_SOURCE_LOCATION (fn),
		  "ms_hook_prologue is not compatible with nested function");
      else
	return true;
    }
  return false;
}
static enum calling_abi
ix86_function_abi (const_tree fndecl)
{
  if (! fndecl)
    return ix86_abi;
  return ix86_function_type_abi (TREE_TYPE (fndecl));
}

/* Returns SYSV_ABI or MS_ABI depending on cfun, specifying the
   call abi used.  */
enum calling_abi
ix86_cfun_abi (void)
{
  if (! cfun)
    return ix86_abi;
  return cfun->machine->call_abi;
}
/* Write the extra assembler code needed to declare a function properly.  */

void
ix86_asm_output_function_label (FILE *asm_out_file, const char *fname,
				tree decl)
{
  bool is_ms_hook = ix86_function_ms_hook_prologue (decl);

  if (is_ms_hook)
    {
      int i, filler_count = (TARGET_64BIT ? 32 : 16);
      unsigned int filler_cc = 0xcccccccc;

      for (i = 0; i < filler_count; i += 4)
	fprintf (asm_out_file, ASM_LONG " %#x\n", filler_cc);
    }

#ifdef SUBTARGET_ASM_UNWIND_INIT
  SUBTARGET_ASM_UNWIND_INIT (asm_out_file);
#endif

  ASM_OUTPUT_LABEL (asm_out_file, fname);

  /* Output the magic byte marker, if the hot-patch attribute is set.  */
  if (is_ms_hook)
    {
      if (TARGET_64BIT)
	{
	  /* leaq [%rsp + 0], %rsp  */
	  asm_fprintf (asm_out_file, ASM_BYTE
		       "0x48, 0x8d, 0xa4, 0x24, 0x00, 0x00, 0x00, 0x00\n");
	}
      else
	{
	  /* movl.s %edi, %edi
	     push   %ebp
	     movl.s %esp, %ebp  */
	  asm_fprintf (asm_out_file, ASM_BYTE
		       "0x8b, 0xff, 0x55, 0x8b, 0xec\n");
	}
    }
}
extern void init_regs (void);

/* Implementation of the call abi switching target hook.  Set the call
   register sets specific to FNDECL.  See also
   ix86_conditional_register_usage for more details.  */
void
ix86_call_abi_override (const_tree fndecl)
{
  if (fndecl == NULL_TREE)
    cfun->machine->call_abi = ix86_abi;
  else
    cfun->machine->call_abi = ix86_function_type_abi (TREE_TYPE (fndecl));
}

/* The 64-bit MS and SYSV ABIs have different sets of call-used registers.
   Avoid expensive re-initialization of init_regs each time we switch
   function context, since this is needed only during RTL expansion.  */
static void
ix86_maybe_switch_abi (void)
{
  if (TARGET_64BIT &&
      call_used_regs[SI_REG] == (cfun->machine->call_abi == MS_ABI))
    reinit_regs ();
}
/* Initialize a variable CUM of type CUMULATIVE_ARGS
   for a call to a function whose data type is FNTYPE.
   For a library call, FNTYPE is 0.  */

void
init_cumulative_args (CUMULATIVE_ARGS *cum,  /* Argument info to initialize */
		      tree fntype,	/* tree ptr for function decl */
		      rtx libname,	/* SYMBOL_REF of library name or 0 */
		      tree fndecl,
		      int caller)
{
  struct cgraph_local_info *i;
  tree fnret_type;

  memset (cum, 0, sizeof (*cum));

  /* Initialize for the current callee.  */
  if (caller)
    {
      cfun->machine->callee_pass_avx256_p = false;
      cfun->machine->callee_return_avx256_p = false;
    }

  if (fndecl)
    {
      i = cgraph_local_info (fndecl);
      cum->call_abi = ix86_function_abi (fndecl);
      fnret_type = TREE_TYPE (TREE_TYPE (fndecl));
    }
  else
    {
      i = NULL;
      cum->call_abi = ix86_function_type_abi (fntype);
      if (fntype)
	fnret_type = TREE_TYPE (fntype);
      else
	fnret_type = NULL;
    }

  if (TARGET_VZEROUPPER && fnret_type)
    {
      rtx fnret_value = ix86_function_value (fnret_type, fntype,
					     false);
      if (function_pass_avx256_p (fnret_value))
	{
	  /* The return value of this function uses 256bit AVX modes.  */
	  if (caller)
	    cfun->machine->callee_return_avx256_p = true;
	  else
	    cfun->machine->caller_return_avx256_p = true;
	}
    }

  cum->caller = caller;

  /* Set up the number of registers to use for passing arguments.  */

  if (TARGET_64BIT && cum->call_abi == MS_ABI && !ACCUMULATE_OUTGOING_ARGS)
    sorry ("ms_abi attribute requires -maccumulate-outgoing-args "
	   "or subtarget optimization implying it");
  cum->nregs = ix86_regparm;
  if (TARGET_64BIT)
    {
      cum->nregs = (cum->call_abi == SYSV_ABI
		    ? X86_64_REGPARM_MAX
		    : X86_64_MS_REGPARM_MAX);
    }
  if (TARGET_SSE)
    {
      cum->sse_nregs = SSE_REGPARM_MAX;
      if (TARGET_64BIT)
	{
	  cum->sse_nregs = (cum->call_abi == SYSV_ABI
			    ? X86_64_SSE_REGPARM_MAX
			    : X86_64_MS_SSE_REGPARM_MAX);
	}
    }
  if (TARGET_MMX)
    cum->mmx_nregs = MMX_REGPARM_MAX;
  cum->warn_avx = true;
  cum->warn_sse = true;
  cum->warn_mmx = true;

  /* Because the type might mismatch between caller and callee, we need to
     use the actual type of the function for local calls.
     FIXME: cgraph_analyze can be told to actually record if a function uses
     va_start, so for local functions maybe_vaarg can be made more aggressive,
     helping K&R code.
     FIXME: once the type system is fixed, we won't need this code anymore.  */
  if (i && i->local && i->can_change_signature)
    fntype = TREE_TYPE (fndecl);
  cum->maybe_vaarg = (fntype
		      ? (!prototype_p (fntype) || stdarg_p (fntype))
		      : !libname);

  if (!TARGET_64BIT)
    {
      /* If there are variable arguments, then we won't pass anything
	 in registers in 32-bit mode.  */
      if (stdarg_p (fntype))
	{
	  cum->nregs = 0;
	  cum->sse_nregs = 0;
	  cum->mmx_nregs = 0;
	  cum->warn_avx = 0;
	  cum->warn_sse = 0;
	  cum->warn_mmx = 0;
	  return;
	}

      /* Use ecx and edx registers if the function has the fastcall attribute,
	 else look for regparm information.  */
      if (fntype)
	{
	  unsigned int ccvt = ix86_get_callcvt (fntype);
	  if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
	    {
	      cum->nregs = 1;
	      cum->fastcall = 1; /* Same first register as in fastcall.  */
	    }
	  else if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
	    {
	      cum->nregs = 2;
	      cum->fastcall = 1;
	    }
	  else
	    cum->nregs = ix86_function_regparm (fntype, fndecl);
	}

      /* Set up the number of SSE registers used for passing SFmode
	 and DFmode arguments.  Warn for mismatching ABI.  */
      cum->float_in_sse = ix86_function_sseregparm (fntype, fndecl, true);
    }
}
/* Return the "natural" mode for TYPE.  In most cases, this is just TYPE_MODE.
   But in the case of vector types, it is some vector mode.

   When we have only some of our vector isa extensions enabled, then there
   are some modes for which vector_mode_supported_p is false.  For these
   modes, the generic vector support in gcc will choose some non-vector mode
   in order to implement the type.  By computing the natural mode, we'll
   select the proper ABI location for the operand and not depend on whatever
   the middle-end decides to do with these vector types.

   The middle-end can't deal with vector types > 16 bytes.  In this
   case, we return the original mode and warn of the ABI change if CUM isn't
   NULL.  */

static enum machine_mode
type_natural_mode (const_tree type, const CUMULATIVE_ARGS *cum)
{
  enum machine_mode mode = TYPE_MODE (type);

  if (TREE_CODE (type) == VECTOR_TYPE && !VECTOR_MODE_P (mode))
    {
      HOST_WIDE_INT size = int_size_in_bytes (type);
      if ((size == 8 || size == 16 || size == 32)
	  /* ??? Generic code allows us to create width 1 vectors.  Ignore.  */
	  && TYPE_VECTOR_SUBPARTS (type) > 1)
	{
	  enum machine_mode innermode = TYPE_MODE (TREE_TYPE (type));

	  if (TREE_CODE (TREE_TYPE (type)) == REAL_TYPE)
	    mode = MIN_MODE_VECTOR_FLOAT;
	  else
	    mode = MIN_MODE_VECTOR_INT;

	  /* Get the mode which has this inner mode and number of units.  */
	  for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
	    if (GET_MODE_NUNITS (mode) == TYPE_VECTOR_SUBPARTS (type)
		&& GET_MODE_INNER (mode) == innermode)
	      {
		if (size == 32 && !TARGET_AVX)
		  {
		    static bool warnedavx;

		    if (cum
			&& !warnedavx
			&& cum->warn_avx)
		      {
			warnedavx = true;
			warning (0, "AVX vector argument without AVX "
				 "enabled changes the ABI");
		      }
		    return TYPE_MODE (type);
		  }
		else
		  return mode;
	      }

	  gcc_unreachable ();
	}
    }

  return mode;
}
5605 /* We want to pass a value in REGNO whose "natural" mode is MODE. However,
5606 this may not agree with the mode that the type system has chosen for the
5607 register, which is ORIG_MODE. If ORIG_MODE is not BLKmode, then we can
5608 go ahead and use it. Otherwise we have to build a PARALLEL instead. */
5611 gen_reg_or_parallel (enum machine_mode mode, enum machine_mode orig_mode,
5616 if (orig_mode != BLKmode)
5617 tmp = gen_rtx_REG (orig_mode, regno);
5620 tmp = gen_rtx_REG (mode, regno);
5621 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, const0_rtx);
5622 tmp = gen_rtx_PARALLEL (orig_mode, gen_rtvec (1, tmp));
5628 /* x86-64 register passing implementation. See x86-64 ABI for details. Goal
5629 of this code is to classify each 8bytes of incoming argument by the register
5630 class and assign registers accordingly. */
5632 /* Return the union class of CLASS1 and CLASS2.
5633 See the x86-64 PS ABI for details. */
5635 static enum x86_64_reg_class
5636 merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
5638 /* Rule #1: If both classes are equal, this is the resulting class. */
5639 if (class1 == class2)
5642 /* Rule #2: If one of the classes is NO_CLASS, the resulting class is
5644 if (class1 == X86_64_NO_CLASS)
5646 if (class2 == X86_64_NO_CLASS)
5649 /* Rule #3: If one of the classes is MEMORY, the result is MEMORY. */
5650 if (class1 == X86_64_MEMORY_CLASS || class2 == X86_64_MEMORY_CLASS)
5651 return X86_64_MEMORY_CLASS;
5653 /* Rule #4: If one of the classes is INTEGER, the result is INTEGER. */
5654 if ((class1 == X86_64_INTEGERSI_CLASS && class2 == X86_64_SSESF_CLASS)
5655 || (class2 == X86_64_INTEGERSI_CLASS && class1 == X86_64_SSESF_CLASS))
5656 return X86_64_INTEGERSI_CLASS;
5657 if (class1 == X86_64_INTEGER_CLASS || class1 == X86_64_INTEGERSI_CLASS
5658 || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
5659 return X86_64_INTEGER_CLASS;
5661 /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
5663 if (class1 == X86_64_X87_CLASS
5664 || class1 == X86_64_X87UP_CLASS
5665 || class1 == X86_64_COMPLEX_X87_CLASS
5666 || class2 == X86_64_X87_CLASS
5667 || class2 == X86_64_X87UP_CLASS
5668 || class2 == X86_64_COMPLEX_X87_CLASS)
5669 return X86_64_MEMORY_CLASS;
5671 /* Rule #6: Otherwise class SSE is used. */
5672 return X86_64_SSE_CLASS;
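/* Examples of the rules above (illustrative only): merging
   X86_64_INTEGERSI_CLASS with X86_64_SSESF_CLASS yields
   X86_64_INTEGERSI_CLASS (rule #4), merging X86_64_SSE_CLASS with
   X86_64_X87_CLASS yields X86_64_MEMORY_CLASS (rule #5), and merging
   X86_64_SSE_CLASS with X86_64_NO_CLASS yields X86_64_SSE_CLASS
   (rule #2). */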
5675 /* Classify the argument of type TYPE and mode MODE.
5676 CLASSES will be filled by the register class used to pass each word
5677 of the operand. The number of words is returned. In case the parameter
5678 should be passed in memory, 0 is returned. As a special case for zero
5679 sized containers, classes[0] will be NO_CLASS and 1 is returned.
5681 BIT_OFFSET is used internally for handling records and specifies the
5682 offset in bits modulo 256 to avoid overflow cases.
5684 See the x86-64 PS ABI for details.
5688 classify_argument (enum machine_mode mode, const_tree type,
5689 enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset)
5691 HOST_WIDE_INT bytes =
5692 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
5693 int words = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5695 /* Variable sized entities are always passed/returned in memory. */
5699 if (mode != VOIDmode
5700 && targetm.calls.must_pass_in_stack (mode, type))
5703 if (type && AGGREGATE_TYPE_P (type))
5707 enum x86_64_reg_class subclasses[MAX_CLASSES];
5709 /* On x86-64 we pass structures larger than 32 bytes on the stack. */
5713 for (i = 0; i < words; i++)
5714 classes[i] = X86_64_NO_CLASS;
5716 /* Zero sized arrays or structures are NO_CLASS. We return 0 to
5717 signal the memory class, so handle it as a special case. */
5720 classes[0] = X86_64_NO_CLASS;
5724 /* Classify each field of record and merge classes. */
5725 switch (TREE_CODE (type))
5728 /* And now merge the fields of the structure. */
5729 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5731 if (TREE_CODE (field) == FIELD_DECL)
5735 if (TREE_TYPE (field) == error_mark_node)
5738 /* Bitfields are always classified as integer. Handle them
5739 early, since later code would consider them to be
5740 misaligned integers. */
5741 if (DECL_BIT_FIELD (field))
5743 for (i = (int_bit_position (field) + (bit_offset % 64)) / 8 / 8;
5744 i < ((int_bit_position (field) + (bit_offset % 64))
5745 + tree_low_cst (DECL_SIZE (field), 0)
5748 merge_classes (X86_64_INTEGER_CLASS,
5755 type = TREE_TYPE (field);
5757 /* Flexible array member is ignored. */
5758 if (TYPE_MODE (type) == BLKmode
5759 && TREE_CODE (type) == ARRAY_TYPE
5760 && TYPE_SIZE (type) == NULL_TREE
5761 && TYPE_DOMAIN (type) != NULL_TREE
5762 && (TYPE_MAX_VALUE (TYPE_DOMAIN (type))
5767 if (!warned && warn_psabi)
5770 inform (input_location,
5771 "the ABI of passing struct with"
5772 " a flexible array member has"
5773 " changed in GCC 4.4");
5777 num = classify_argument (TYPE_MODE (type), type,
5779 (int_bit_position (field)
5780 + bit_offset) % 256);
5783 pos = (int_bit_position (field) + (bit_offset % 64)) / 8 / 8;
5784 for (i = 0; i < num && (i + pos) < words; i++)
5786 merge_classes (subclasses[i], classes[i + pos]);
5793 /* Arrays are handled as small records. */
5796 num = classify_argument (TYPE_MODE (TREE_TYPE (type)),
5797 TREE_TYPE (type), subclasses, bit_offset);
5801 /* The partial classes are now full classes. */
5802 if (subclasses[0] == X86_64_SSESF_CLASS && bytes != 4)
5803 subclasses[0] = X86_64_SSE_CLASS;
5804 if (subclasses[0] == X86_64_INTEGERSI_CLASS
5805 && !((bit_offset % 64) == 0 && bytes == 4))
5806 subclasses[0] = X86_64_INTEGER_CLASS;
5808 for (i = 0; i < words; i++)
5809 classes[i] = subclasses[i % num];
5814 case QUAL_UNION_TYPE:
5815 /* Unions are similar to RECORD_TYPE but offset is always 0.
5817 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5819 if (TREE_CODE (field) == FIELD_DECL)
5823 if (TREE_TYPE (field) == error_mark_node)
5826 num = classify_argument (TYPE_MODE (TREE_TYPE (field)),
5827 TREE_TYPE (field), subclasses,
5831 for (i = 0; i < num; i++)
5832 classes[i] = merge_classes (subclasses[i], classes[i]);
5843 /* When size > 16 bytes, if the first class isn't
5844 X86_64_SSE_CLASS or any of the other classes isn't
5845 X86_64_SSEUP_CLASS, everything should be passed in memory. */
5847 if (classes[0] != X86_64_SSE_CLASS)
5850 for (i = 1; i < words; i++)
5851 if (classes[i] != X86_64_SSEUP_CLASS)
5855 /* Final merger cleanup. */
5856 for (i = 0; i < words; i++)
5858 /* If one class is MEMORY, everything should be passed in memory. */
5860 if (classes[i] == X86_64_MEMORY_CLASS)
5863 /* X86_64_SSEUP_CLASS should always be preceded by
5864 X86_64_SSE_CLASS or X86_64_SSEUP_CLASS. */
5865 if (classes[i] == X86_64_SSEUP_CLASS
5866 && classes[i - 1] != X86_64_SSE_CLASS
5867 && classes[i - 1] != X86_64_SSEUP_CLASS)
5869 /* The first one should never be X86_64_SSEUP_CLASS. */
5870 gcc_assert (i != 0);
5871 classes[i] = X86_64_SSE_CLASS;
5874 /* If X86_64_X87UP_CLASS isn't preceded by X86_64_X87_CLASS,
5875 everything should be passed in memory. */
5876 if (classes[i] == X86_64_X87UP_CLASS
5877 && (classes[i - 1] != X86_64_X87_CLASS))
5881 /* The first one should never be X86_64_X87UP_CLASS. */
5882 gcc_assert (i != 0);
5883 if (!warned && warn_psabi)
5886 inform (input_location,
5887 "the ABI of passing union with long double"
5888 " has changed in GCC 4.4");
5896 /* Compute the alignment needed. We align all types to natural boundaries,
5897 with the exception of XFmode, which is aligned to 64 bits. */
5898 if (mode != VOIDmode && mode != BLKmode)
5900 int mode_alignment = GET_MODE_BITSIZE (mode);
5903 mode_alignment = 128;
5904 else if (mode == XCmode)
5905 mode_alignment = 256;
5906 if (COMPLEX_MODE_P (mode))
5907 mode_alignment /= 2;
5908 /* Misaligned fields are always returned in memory. */
5909 if (bit_offset % mode_alignment)
5913 /* For V1xx modes, just use the base mode. */
5914 if (VECTOR_MODE_P (mode) && mode != V1DImode && mode != V1TImode
5915 && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
5916 mode = GET_MODE_INNER (mode);
5918 /* Classification of atomic types. */
5923 classes[0] = X86_64_SSE_CLASS;
5926 classes[0] = X86_64_SSE_CLASS;
5927 classes[1] = X86_64_SSEUP_CLASS;
5937 int size = (bit_offset % 64) + (int) GET_MODE_BITSIZE (mode);
5941 classes[0] = X86_64_INTEGERSI_CLASS;
5944 else if (size <= 64)
5946 classes[0] = X86_64_INTEGER_CLASS;
5949 else if (size <= 64+32)
5951 classes[0] = X86_64_INTEGER_CLASS;
5952 classes[1] = X86_64_INTEGERSI_CLASS;
5955 else if (size <= 64+64)
5957 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
5965 classes[0] = classes[1] = X86_64_INTEGER_CLASS;
5969 /* OImode shouldn't be used directly. */
5974 if (!(bit_offset % 64))
5975 classes[0] = X86_64_SSESF_CLASS;
5977 classes[0] = X86_64_SSE_CLASS;
5980 classes[0] = X86_64_SSEDF_CLASS;
5983 classes[0] = X86_64_X87_CLASS;
5984 classes[1] = X86_64_X87UP_CLASS;
5987 classes[0] = X86_64_SSE_CLASS;
5988 classes[1] = X86_64_SSEUP_CLASS;
5991 classes[0] = X86_64_SSE_CLASS;
5992 if (!(bit_offset % 64))
5998 if (!warned && warn_psabi)
6001 inform (input_location,
6002 "the ABI of passing structure with complex float"
6003 " member has changed in GCC 4.4");
6005 classes[1] = X86_64_SSESF_CLASS;
6009 classes[0] = X86_64_SSEDF_CLASS;
6010 classes[1] = X86_64_SSEDF_CLASS;
6013 classes[0] = X86_64_COMPLEX_X87_CLASS;
6016 /* These modes are larger than 16 bytes. */
6024 classes[0] = X86_64_SSE_CLASS;
6025 classes[1] = X86_64_SSEUP_CLASS;
6026 classes[2] = X86_64_SSEUP_CLASS;
6027 classes[3] = X86_64_SSEUP_CLASS;
6035 classes[0] = X86_64_SSE_CLASS;
6036 classes[1] = X86_64_SSEUP_CLASS;
6044 classes[0] = X86_64_SSE_CLASS;
6050 gcc_assert (VECTOR_MODE_P (mode));
6055 gcc_assert (GET_MODE_CLASS (GET_MODE_INNER (mode)) == MODE_INT);
6057 if (bit_offset + GET_MODE_BITSIZE (mode) <= 32)
6058 classes[0] = X86_64_INTEGERSI_CLASS;
6060 classes[0] = X86_64_INTEGER_CLASS;
6061 classes[1] = X86_64_INTEGER_CLASS;
6062 return 1 + (bytes > 8);
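/* Worked example (illustrative; follows the psABI rules encoded
   above): for

       struct s { double d; int i; };

   the first eightbyte (the double) classifies as X86_64_SSEDF_CLASS
   and the second (the int) as X86_64_INTEGERSI_CLASS, so
   classify_argument returns 2 and the struct can travel in one SSE
   register plus one integer register. */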
6066 /* Examine the argument and set the number of registers required in each
6067 class. Return 0 iff the parameter should be passed in memory. */
6069 examine_argument (enum machine_mode mode, const_tree type, int in_return,
6070 int *int_nregs, int *sse_nregs)
6072 enum x86_64_reg_class regclass[MAX_CLASSES];
6073 int n = classify_argument (mode, type, regclass, 0);
6079 for (n--; n >= 0; n--)
6080 switch (regclass[n])
6082 case X86_64_INTEGER_CLASS:
6083 case X86_64_INTEGERSI_CLASS:
6086 case X86_64_SSE_CLASS:
6087 case X86_64_SSESF_CLASS:
6088 case X86_64_SSEDF_CLASS:
6091 case X86_64_NO_CLASS:
6092 case X86_64_SSEUP_CLASS:
6094 case X86_64_X87_CLASS:
6095 case X86_64_X87UP_CLASS:
6099 case X86_64_COMPLEX_X87_CLASS:
6100 return in_return ? 2 : 0;
6101 case X86_64_MEMORY_CLASS:
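/* Continuing the struct s example (illustrative): classes
   { SSEDF, INTEGERSI } give *int_nregs = 1 and *sse_nregs = 1 with a
   nonzero return value, so the struct goes in registers whenever
   enough registers of both kinds remain. */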
6107 /* Construct container for the argument used by GCC interface. See
6108 FUNCTION_ARG for the detailed description. */
6111 construct_container (enum machine_mode mode, enum machine_mode orig_mode,
6112 const_tree type, int in_return, int nintregs, int nsseregs,
6113 const int *intreg, int sse_regno)
6115 /* The following variables hold the static issued_error state. */
6116 static bool issued_sse_arg_error;
6117 static bool issued_sse_ret_error;
6118 static bool issued_x87_ret_error;
6120 enum machine_mode tmpmode;
6122 (mode == BLKmode) ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
6123 enum x86_64_reg_class regclass[MAX_CLASSES];
6127 int needed_sseregs, needed_intregs;
6128 rtx exp[MAX_CLASSES];
6131 n = classify_argument (mode, type, regclass, 0);
6134 if (!examine_argument (mode, type, in_return, &needed_intregs,
6137 if (needed_intregs > nintregs || needed_sseregs > nsseregs)
6140 /* We allowed the user to turn off SSE for kernel mode. Don't crash if
6141 some less clueful developer tries to use floating-point anyway. */
6142 if (needed_sseregs && !TARGET_SSE)
6146 if (!issued_sse_ret_error)
6148 error ("SSE register return with SSE disabled");
6149 issued_sse_ret_error = true;
6152 else if (!issued_sse_arg_error)
6154 error ("SSE register argument with SSE disabled");
6155 issued_sse_arg_error = true;
6160 /* Likewise, error if the ABI requires us to return values in the
6161 x87 registers and the user specified -mno-80387. */
6162 if (!TARGET_80387 && in_return)
6163 for (i = 0; i < n; i++)
6164 if (regclass[i] == X86_64_X87_CLASS
6165 || regclass[i] == X86_64_X87UP_CLASS
6166 || regclass[i] == X86_64_COMPLEX_X87_CLASS)
6168 if (!issued_x87_ret_error)
6170 error ("x87 register return with x87 disabled");
6171 issued_x87_ret_error = true;
6176 /* First construct simple cases. Avoid SCmode, since we want to use
6177 a single register to pass this type. */
6178 if (n == 1 && mode != SCmode)
6179 switch (regclass[0])
6181 case X86_64_INTEGER_CLASS:
6182 case X86_64_INTEGERSI_CLASS:
6183 return gen_rtx_REG (mode, intreg[0]);
6184 case X86_64_SSE_CLASS:
6185 case X86_64_SSESF_CLASS:
6186 case X86_64_SSEDF_CLASS:
6187 if (mode != BLKmode)
6188 return gen_reg_or_parallel (mode, orig_mode,
6189 SSE_REGNO (sse_regno));
6191 case X86_64_X87_CLASS:
6192 case X86_64_COMPLEX_X87_CLASS:
6193 return gen_rtx_REG (mode, FIRST_STACK_REG);
6194 case X86_64_NO_CLASS:
6195 /* Zero sized array, struct or class. */
6200 if (n == 2 && regclass[0] == X86_64_SSE_CLASS
6201 && regclass[1] == X86_64_SSEUP_CLASS && mode != BLKmode)
6202 return gen_rtx_REG (mode, SSE_REGNO (sse_regno));
6204 && regclass[0] == X86_64_SSE_CLASS
6205 && regclass[1] == X86_64_SSEUP_CLASS
6206 && regclass[2] == X86_64_SSEUP_CLASS
6207 && regclass[3] == X86_64_SSEUP_CLASS
6209 return gen_rtx_REG (mode, SSE_REGNO (sse_regno));
6212 && regclass[0] == X86_64_X87_CLASS && regclass[1] == X86_64_X87UP_CLASS)
6213 return gen_rtx_REG (XFmode, FIRST_STACK_REG);
6214 if (n == 2 && regclass[0] == X86_64_INTEGER_CLASS
6215 && regclass[1] == X86_64_INTEGER_CLASS
6216 && (mode == CDImode || mode == TImode || mode == TFmode)
6217 && intreg[0] + 1 == intreg[1])
6218 return gen_rtx_REG (mode, intreg[0]);
6220 /* Otherwise figure out the entries of the PARALLEL. */
6221 for (i = 0; i < n; i++)
6225 switch (regclass[i])
6227 case X86_64_NO_CLASS:
6229 case X86_64_INTEGER_CLASS:
6230 case X86_64_INTEGERSI_CLASS:
6231 /* Merge TImodes on aligned occasions here too. */
6232 if (i * 8 + 8 > bytes)
6233 tmpmode = mode_for_size ((bytes - i * 8) * BITS_PER_UNIT, MODE_INT, 0);
6234 else if (regclass[i] == X86_64_INTEGERSI_CLASS)
6238 /* We've requested 24 bytes we don't have a mode for. Use DImode. */
6239 if (tmpmode == BLKmode)
6241 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6242 gen_rtx_REG (tmpmode, *intreg),
6246 case X86_64_SSESF_CLASS:
6247 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6248 gen_rtx_REG (SFmode,
6249 SSE_REGNO (sse_regno)),
6253 case X86_64_SSEDF_CLASS:
6254 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6255 gen_rtx_REG (DFmode,
6256 SSE_REGNO (sse_regno)),
6260 case X86_64_SSE_CLASS:
6268 if (i == 0 && regclass[1] == X86_64_SSEUP_CLASS)
6278 && regclass[1] == X86_64_SSEUP_CLASS
6279 && regclass[2] == X86_64_SSEUP_CLASS
6280 && regclass[3] == X86_64_SSEUP_CLASS);
6287 exp [nexps++] = gen_rtx_EXPR_LIST (VOIDmode,
6288 gen_rtx_REG (tmpmode,
6289 SSE_REGNO (sse_regno)),
6298 /* Empty aligned struct, union or class. */
6302 ret = gen_rtx_PARALLEL (mode, rtvec_alloc (nexps));
6303 for (i = 0; i < nexps; i++)
6304 XVECEXP (ret, 0, i) = exp [i];
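/* Illustrative shape of the result (register names assumed): for
   struct s { double d; int i; } as the first argument, the PARALLEL
   built above looks roughly like

       (parallel:BLK [(expr_list (reg:DF xmm0) (const_int 0))
                      (expr_list (reg:SI di) (const_int 8))])

   where each EXPR_LIST pairs a hard register with the byte offset of
   the eightbyte it carries. */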
6308 /* Update the data in CUM to advance over an argument of mode MODE
6309 and data type TYPE. (TYPE is null for libcalls where that information
6310 may not be available.) */
6313 function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6314 const_tree type, HOST_WIDE_INT bytes,
6315 HOST_WIDE_INT words)
6331 cum->words += words;
6332 cum->nregs -= words;
6333 cum->regno += words;
6335 if (cum->nregs <= 0)
6343 /* OImode shouldn't be used directly. */
6347 if (cum->float_in_sse < 2)
6350 if (cum->float_in_sse < 1)
6367 if (!type || !AGGREGATE_TYPE_P (type))
6369 cum->sse_words += words;
6370 cum->sse_nregs -= 1;
6371 cum->sse_regno += 1;
6372 if (cum->sse_nregs <= 0)
6386 if (!type || !AGGREGATE_TYPE_P (type))
6388 cum->mmx_words += words;
6389 cum->mmx_nregs -= 1;
6390 cum->mmx_regno += 1;
6391 if (cum->mmx_nregs <= 0)
6402 function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode,
6403 const_tree type, HOST_WIDE_INT words, bool named)
6405 int int_nregs, sse_nregs;
6407 /* Unnamed 256bit vector mode parameters are passed on the stack. */
6408 if (!named && VALID_AVX256_REG_MODE (mode))
6411 if (examine_argument (mode, type, 0, &int_nregs, &sse_nregs)
6412 && sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs)
6414 cum->nregs -= int_nregs;
6415 cum->sse_nregs -= sse_nregs;
6416 cum->regno += int_nregs;
6417 cum->sse_regno += sse_nregs;
6421 int align = ix86_function_arg_boundary (mode, type) / BITS_PER_WORD;
6422 cum->words = (cum->words + align - 1) & ~(align - 1);
6423 cum->words += words;
6428 function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes,
6429 HOST_WIDE_INT words)
6431 /* Otherwise, this should be passed indirectly. */
6432 gcc_assert (bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8);
6434 cum->words += words;
6442 /* Update the data in CUM to advance over an argument of mode MODE and
6443 data type TYPE. (TYPE is null for libcalls where that information
6444 may not be available.) */
6447 ix86_function_arg_advance (cumulative_args_t cum_v, enum machine_mode mode,
6448 const_tree type, bool named)
6450 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6451 HOST_WIDE_INT bytes, words;
6453 if (mode == BLKmode)
6454 bytes = int_size_in_bytes (type);
6456 bytes = GET_MODE_SIZE (mode);
6457 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6460 mode = type_natural_mode (type, NULL);
6462 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6463 function_arg_advance_ms_64 (cum, bytes, words);
6464 else if (TARGET_64BIT)
6465 function_arg_advance_64 (cum, mode, type, words, named);
6467 function_arg_advance_32 (cum, mode, type, bytes, words);
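/* Illustrative walk-through (assuming the SysV x86-64 ABI): when
   advancing over f (int, double), function_arg_advance_64 decrements
   cum->nregs and bumps cum->regno for the int, then decrements
   cum->sse_nregs and bumps cum->sse_regno for the double; the 32-bit
   path instead consumes stack words via function_arg_advance_32. */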
6470 /* Define where to put the arguments to a function.
6471 Value is zero to push the argument on the stack,
6472 or a hard register in which to store the argument.
6474 MODE is the argument's machine mode.
6475 TYPE is the data type of the argument (as a tree).
6476 This is null for libcalls where that information may
6478 CUM is a variable of type CUMULATIVE_ARGS which gives info about
6479 the preceding args and about the function being called.
6480 NAMED is nonzero if this argument is a named parameter
6481 (otherwise it is an extra parameter matching an ellipsis). */
6484 function_arg_32 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6485 enum machine_mode orig_mode, const_tree type,
6486 HOST_WIDE_INT bytes, HOST_WIDE_INT words)
6488 static bool warnedsse, warnedmmx;
6490 /* Avoid the AL settings for the Unix64 ABI. */
6491 if (mode == VOIDmode)
6507 if (words <= cum->nregs)
6509 int regno = cum->regno;
6511 /* Fastcall allocates the first two DWORD (SImode) or
6512 smaller arguments to ECX and EDX if it isn't an
6518 || (type && AGGREGATE_TYPE_P (type)))
6521 /* ECX, not EAX, is the first allocated register. */
6522 if (regno == AX_REG)
6525 return gen_rtx_REG (mode, regno);
6530 if (cum->float_in_sse < 2)
6533 if (cum->float_in_sse < 1)
6537 /* In 32bit, we pass TImode in xmm registers. */
6544 if (!type || !AGGREGATE_TYPE_P (type))
6546 if (!TARGET_SSE && !warnedsse && cum->warn_sse)
6549 warning (0, "SSE vector argument without SSE enabled "
6553 return gen_reg_or_parallel (mode, orig_mode,
6554 cum->sse_regno + FIRST_SSE_REG);
6559 /* OImode shouldn't be used directly. */
6568 if (!type || !AGGREGATE_TYPE_P (type))
6571 return gen_reg_or_parallel (mode, orig_mode,
6572 cum->sse_regno + FIRST_SSE_REG);
6582 if (!type || !AGGREGATE_TYPE_P (type))
6584 if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
6587 warning (0, "MMX vector argument without MMX enabled "
6591 return gen_reg_or_parallel (mode, orig_mode,
6592 cum->mmx_regno + FIRST_MMX_REG);
6601 function_arg_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6602 enum machine_mode orig_mode, const_tree type, bool named)
6604 /* Handle a hidden AL argument containing the number of registers
6605 for varargs x86-64 functions. */
6606 if (mode == VOIDmode)
6607 return GEN_INT (cum->maybe_vaarg
6608 ? (cum->sse_nregs < 0
6609 ? X86_64_SSE_REGPARM_MAX
6624 /* Unnamed 256bit vector mode parameters are passed on the stack. */
6630 return construct_container (mode, orig_mode, type, 0, cum->nregs,
6632 &x86_64_int_parameter_registers [cum->regno],
6637 function_arg_ms_64 (const CUMULATIVE_ARGS *cum, enum machine_mode mode,
6638 enum machine_mode orig_mode, bool named,
6639 HOST_WIDE_INT bytes)
6643 /* We need to add a clobber for MS_ABI->SYSV ABI calls in expand_call.
6644 We use the value -2 to specify that the current function call is MSABI. */
6645 if (mode == VOIDmode)
6646 return GEN_INT (-2);
6648 /* If we've run out of registers, it goes on the stack. */
6649 if (cum->nregs == 0)
6652 regno = x86_64_ms_abi_int_parameter_registers[cum->regno];
6654 /* Only floating point modes are passed in anything but integer regs. */
6655 if (TARGET_SSE && (mode == SFmode || mode == DFmode))
6658 regno = cum->regno + FIRST_SSE_REG;
6663 /* Unnamed floating parameters are passed in both the
6664 SSE and integer registers. */
6665 t1 = gen_rtx_REG (mode, cum->regno + FIRST_SSE_REG);
6666 t2 = gen_rtx_REG (mode, regno);
6667 t1 = gen_rtx_EXPR_LIST (VOIDmode, t1, const0_rtx);
6668 t2 = gen_rtx_EXPR_LIST (VOIDmode, t2, const0_rtx);
6669 return gen_rtx_PARALLEL (mode, gen_rtvec (2, t1, t2));
6672 /* Handle aggregate types passed in registers. */
6673 if (orig_mode == BLKmode)
6675 if (bytes > 0 && bytes <= 8)
6676 mode = (bytes > 4 ? DImode : SImode);
6677 if (mode == BLKmode)
6681 return gen_reg_or_parallel (mode, orig_mode, regno);
6684 /* Return where to put the arguments to a function.
6685 Return zero to push the argument on the stack, or a hard register in which to store the argument.
6687 MODE is the argument's machine mode. TYPE is the data type of the
6688 argument. It is null for libcalls where that information may not be
6689 available. CUM gives information about the preceding args and about
6690 the function being called. NAMED is nonzero if this argument is a
6691 named parameter (otherwise it is an extra parameter matching an
6695 ix86_function_arg (cumulative_args_t cum_v, enum machine_mode omode,
6696 const_tree type, bool named)
6698 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6699 enum machine_mode mode = omode;
6700 HOST_WIDE_INT bytes, words;
6703 if (mode == BLKmode)
6704 bytes = int_size_in_bytes (type);
6706 bytes = GET_MODE_SIZE (mode);
6707 words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
6709 /* To simplify the code below, represent vector types with a vector mode
6710 even if MMX/SSE are not active. */
6711 if (type && TREE_CODE (type) == VECTOR_TYPE)
6712 mode = type_natural_mode (type, cum);
6714 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6715 arg = function_arg_ms_64 (cum, mode, omode, named, bytes);
6716 else if (TARGET_64BIT)
6717 arg = function_arg_64 (cum, mode, omode, type, named);
6719 arg = function_arg_32 (cum, mode, omode, type, bytes, words);
6721 if (TARGET_VZEROUPPER && function_pass_avx256_p (arg))
6723 /* This argument uses 256bit AVX modes. */
6725 cfun->machine->callee_pass_avx256_p = true;
6727 cfun->machine->caller_pass_avx256_p = true;
6733 /* A C expression that indicates when an argument must be passed by
6734 reference. If nonzero for an argument, a copy of that argument is
6735 made in memory and a pointer to the argument is passed instead of
6736 the argument itself. The pointer is passed in whatever way is
6737 appropriate for passing a pointer to that type. */
6740 ix86_pass_by_reference (cumulative_args_t cum_v ATTRIBUTE_UNUSED,
6741 enum machine_mode mode ATTRIBUTE_UNUSED,
6742 const_tree type, bool named ATTRIBUTE_UNUSED)
6744 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
6746 /* See Windows x64 Software Convention. */
6747 if (TARGET_64BIT && (cum ? cum->call_abi : ix86_abi) == MS_ABI)
6749 int msize = (int) GET_MODE_SIZE (mode);
6752 /* Arrays are passed by reference. */
6753 if (TREE_CODE (type) == ARRAY_TYPE)
6756 if (AGGREGATE_TYPE_P (type))
6758 /* Structs/unions of sizes other than 8, 16, 32, or 64 bits
6759 are passed by reference. */
6760 msize = int_size_in_bytes (type);
6764 /* __m128 is passed by reference. */
6766 case 1: case 2: case 4: case 8:
6772 else if (TARGET_64BIT && type && int_size_in_bytes (type) == -1)
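/* Illustrative consequences of the MS ABI checks above: a 12-byte
   struct (size not 1, 2, 4, or 8) and __m128 are passed by
   reference, while a plain int or an 8-byte struct is passed by
   value. */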
6778 /* Return true when TYPE should be 128bit aligned for 32bit argument
6779 passing ABI. XXX: This function is obsolete and is only used for
6780 checking psABI compatibility with previous versions of GCC. */
6783 ix86_compat_aligned_value_p (const_tree type)
6785 enum machine_mode mode = TYPE_MODE (type);
6786 if (((TARGET_SSE && SSE_REG_MODE_P (mode))
6790 && (!TYPE_USER_ALIGN (type) || TYPE_ALIGN (type) > 128))
6792 if (TYPE_ALIGN (type) < 128)
6795 if (AGGREGATE_TYPE_P (type))
6797 /* Walk the aggregates recursively. */
6798 switch (TREE_CODE (type))
6802 case QUAL_UNION_TYPE:
6806 /* Walk all the structure fields. */
6807 for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6809 if (TREE_CODE (field) == FIELD_DECL
6810 && ix86_compat_aligned_value_p (TREE_TYPE (field)))
6817 /* Just for use if some language passes arrays by value. */
6818 if (ix86_compat_aligned_value_p (TREE_TYPE (type)))
6829 /* Return the alignment boundary for MODE and TYPE with alignment ALIGN.
6830 XXX: This function is obsolete and is only used for checking psABI
6831 compatibility with previous versions of GCC. */
6834 ix86_compat_function_arg_boundary (enum machine_mode mode,
6835 const_tree type, unsigned int align)
6837 /* In 32bit, only _Decimal128 and __float128 are aligned to their
6838 natural boundaries. */
6839 if (!TARGET_64BIT && mode != TDmode && mode != TFmode)
6841 /* i386 ABI defines all arguments to be 4 byte aligned. We have to
6842 make an exception for SSE modes since these require 128bit
6845 The handling here differs from field_alignment. ICC aligns MMX
6846 arguments to 4 byte boundaries, while structure fields are aligned
6847 to 8 byte boundaries. */
6850 if (!(TARGET_SSE && SSE_REG_MODE_P (mode)))
6851 align = PARM_BOUNDARY;
6855 if (!ix86_compat_aligned_value_p (type))
6856 align = PARM_BOUNDARY;
6859 if (align > BIGGEST_ALIGNMENT)
6860 align = BIGGEST_ALIGNMENT;
6864 /* Return true when TYPE should be 128bit aligned for 32bit argument
6868 ix86_contains_aligned_value_p (const_tree type)
6870 enum machine_mode mode = TYPE_MODE (type);
6872 if (mode == XFmode || mode == XCmode)
6875 if (TYPE_ALIGN (type) < 128)
6878 if (AGGREGATE_TYPE_P (type))
6880 /* Walk the aggregates recursively. */
6881 switch (TREE_CODE (type))
6885 case QUAL_UNION_TYPE:
6889 /* Walk all the structure fields. */
6890 for (field = TYPE_FIELDS (type);
6892 field = DECL_CHAIN (field))
6894 if (TREE_CODE (field) == FIELD_DECL
6895 && ix86_contains_aligned_value_p (TREE_TYPE (field)))
6902 /* Just for use if some language passes arrays by value. */
6903 if (ix86_contains_aligned_value_p (TREE_TYPE (type)))
6912 return TYPE_ALIGN (type) >= 128;
6917 /* Gives the alignment boundary, in bits, of an argument with the
6918 specified mode and type. */
6921 ix86_function_arg_boundary (enum machine_mode mode, const_tree type)
6926 /* Since the main variant type is used for the call, convert the
6927 type to its main variant. */
6928 type = TYPE_MAIN_VARIANT (type);
6929 align = TYPE_ALIGN (type);
6932 align = GET_MODE_ALIGNMENT (mode);
6933 if (align < PARM_BOUNDARY)
6934 align = PARM_BOUNDARY;
6938 unsigned int saved_align = align;
6942 /* i386 ABI defines XFmode arguments to be 4 byte aligned. */
6945 if (mode == XFmode || mode == XCmode)
6946 align = PARM_BOUNDARY;
6948 else if (!ix86_contains_aligned_value_p (type))
6949 align = PARM_BOUNDARY;
6952 align = PARM_BOUNDARY;
6957 && align != ix86_compat_function_arg_boundary (mode, type,
6961 inform (input_location,
6962 "The ABI for passing parameters with %d-byte"
6963 " alignment has changed in GCC 4.6",
6964 align / BITS_PER_UNIT);
6971 /* Return true if N is a possible register number of function value. */
6974 ix86_function_value_regno_p (const unsigned int regno)
6981 case FIRST_FLOAT_REG:
6982 /* TODO: The function should depend on the current function's ABI, but
6983 builtins.c would need updating then. Therefore we use the
6985 if (TARGET_64BIT && ix86_abi == MS_ABI)
6987 return TARGET_FLOAT_RETURNS_IN_80387;
6993 if (TARGET_MACHO || TARGET_64BIT)
7001 /* Define how to find the value returned by a function.
7002 VALTYPE is the data type of the value (as a tree).
7003 If the precise function being called is known, FUNC is its FUNCTION_DECL;
7004 otherwise, FUNC is 0. */
7007 function_value_32 (enum machine_mode orig_mode, enum machine_mode mode,
7008 const_tree fntype, const_tree fn)
7012 /* 8-byte vector modes in %mm0. See ix86_return_in_memory for where
7013 we normally prevent this case when mmx is not available. However
7014 some ABIs may require the result to be returned like DImode. */
7015 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7016 regno = TARGET_MMX ? FIRST_MMX_REG : 0;
7018 /* 16-byte vector modes in %xmm0. See ix86_return_in_memory for where
7019 we prevent this case when sse is not available. However some ABIs
7020 may require the result to be returned like integer TImode. */
7021 else if (mode == TImode
7022 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7023 regno = TARGET_SSE ? FIRST_SSE_REG : 0;
7025 /* 32-byte vector modes in %ymm0. */
7026 else if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 32)
7027 regno = TARGET_AVX ? FIRST_SSE_REG : 0;
7029 /* Floating point return values in %st(0) (unless -mno-fp-ret-in-387). */
7030 else if (X87_FLOAT_MODE_P (mode) && TARGET_FLOAT_RETURNS_IN_80387)
7031 regno = FIRST_FLOAT_REG;
7033 /* Most things go in %eax. */
7036 /* Override FP return register with %xmm0 for local functions when
7037 SSE math is enabled or for functions with sseregparm attribute. */
7038 if ((fn || fntype) && (mode == SFmode || mode == DFmode))
7040 int sse_level = ix86_function_sseregparm (fntype, fn, false);
7041 if ((sse_level >= 1 && mode == SFmode)
7042 || (sse_level == 2 && mode == DFmode))
7043 regno = FIRST_SSE_REG;
7046 /* OImode shouldn't be used directly. */
7047 gcc_assert (mode != OImode);
7049 return gen_rtx_REG (orig_mode, regno);
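/* Illustrative 32-bit mapping (default options assumed): an int
   returns in %eax, a float or double in %st(0), and a __m128 in
   %xmm0; for local functions compiled with SSE math, the
   SFmode/DFmode override above may move the float return into
   %xmm0 instead. */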
7053 function_value_64 (enum machine_mode orig_mode, enum machine_mode mode,
7058 /* Handle libcalls, which don't provide a type node. */
7059 if (valtype == NULL)
7071 return gen_rtx_REG (mode, FIRST_SSE_REG);
7074 return gen_rtx_REG (mode, FIRST_FLOAT_REG);
7078 return gen_rtx_REG (mode, AX_REG);
7082 ret = construct_container (mode, orig_mode, valtype, 1,
7083 X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX,
7084 x86_64_int_return_registers, 0);
7086 /* For zero sized structures, construct_container returns NULL, but we
7087 need to keep the rest of the compiler happy by returning a meaningful value. */
7089 ret = gen_rtx_REG (orig_mode, AX_REG);
7095 function_value_ms_64 (enum machine_mode orig_mode, enum machine_mode mode)
7097 unsigned int regno = AX_REG;
7101 switch (GET_MODE_SIZE (mode))
7104 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7105 && !COMPLEX_MODE_P (mode))
7106 regno = FIRST_SSE_REG;
7110 if (mode == SFmode || mode == DFmode)
7111 regno = FIRST_SSE_REG;
7117 return gen_rtx_REG (orig_mode, regno);
7121 ix86_function_value_1 (const_tree valtype, const_tree fntype_or_decl,
7122 enum machine_mode orig_mode, enum machine_mode mode)
7124 const_tree fn, fntype;
7127 if (fntype_or_decl && DECL_P (fntype_or_decl))
7128 fn = fntype_or_decl;
7129 fntype = fn ? TREE_TYPE (fn) : fntype_or_decl;
7131 if (TARGET_64BIT && ix86_function_type_abi (fntype) == MS_ABI)
7132 return function_value_ms_64 (orig_mode, mode);
7133 else if (TARGET_64BIT)
7134 return function_value_64 (orig_mode, mode, valtype);
7136 return function_value_32 (orig_mode, mode, fntype, fn);
7140 ix86_function_value (const_tree valtype, const_tree fntype_or_decl,
7141 bool outgoing ATTRIBUTE_UNUSED)
7143 enum machine_mode mode, orig_mode;
7145 orig_mode = TYPE_MODE (valtype);
7146 mode = type_natural_mode (valtype, NULL);
7147 return ix86_function_value_1 (valtype, fntype_or_decl, orig_mode, mode);
7151 ix86_libcall_value (enum machine_mode mode)
7153 return ix86_function_value_1 (NULL, NULL, mode, mode);
7156 /* Return true iff type is returned in memory. */
7158 static bool ATTRIBUTE_UNUSED
7159 return_in_memory_32 (const_tree type, enum machine_mode mode)
7163 if (mode == BLKmode)
7166 size = int_size_in_bytes (type);
7168 if (MS_AGGREGATE_RETURN && AGGREGATE_TYPE_P (type) && size <= 8)
7171 if (VECTOR_MODE_P (mode) || mode == TImode)
7173 /* User-created vectors small enough to fit in EAX. */
7177 /* MMX/3dNow values are returned in MM0,
7178 except when it doesn't exist or the ABI prescribes otherwise. */
7180 return !TARGET_MMX || TARGET_VECT8_RETURNS;
7182 /* SSE values are returned in XMM0, except when it doesn't exist. */
7186 /* AVX values are returned in YMM0, except when it doesn't exist. */
7197 /* OImode shouldn't be used directly. */
7198 gcc_assert (mode != OImode);
7203 static bool ATTRIBUTE_UNUSED
7204 return_in_memory_64 (const_tree type, enum machine_mode mode)
7206 int needed_intregs, needed_sseregs;
7207 return !examine_argument (mode, type, 1, &needed_intregs, &needed_sseregs);
7210 static bool ATTRIBUTE_UNUSED
7211 return_in_memory_ms_64 (const_tree type, enum machine_mode mode)
7213 HOST_WIDE_INT size = int_size_in_bytes (type);
7215 /* __m128 is returned in xmm0. */
7216 if ((SCALAR_INT_MODE_P (mode) || VECTOR_MODE_P (mode))
7217 && !COMPLEX_MODE_P (mode) && (GET_MODE_SIZE (mode) == 16 || size == 16))
7220 /* Otherwise, the size must be exactly 1, 2, 4, or 8 bytes. */
7221 return size != 1 && size != 2 && size != 4 && size != 8;
7225 ix86_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
7227 #ifdef SUBTARGET_RETURN_IN_MEMORY
7228 return SUBTARGET_RETURN_IN_MEMORY (type, fntype);
7230 const enum machine_mode mode = type_natural_mode (type, NULL);
7234 if (ix86_function_type_abi (fntype) == MS_ABI)
7235 return return_in_memory_ms_64 (type, mode);
7237 return return_in_memory_64 (type, mode);
7240 return return_in_memory_32 (type, mode);
7244 /* When returning SSE vector types, we have a choice of either
7245 (1) being ABI incompatible with a -march switch, or
7246 (2) generating an error.
7247 Given no good solution, I think the safest thing is one warning.
7248 The user won't be able to use -Werror, but....
7250 Choose the STRUCT_VALUE_RTX hook because that's (at present) only
7251 called in response to actually generating a caller or callee that
7252 uses such a type. As opposed to TARGET_RETURN_IN_MEMORY, which is called
7253 via aggregate_value_p for general type probing from tree-ssa. */
7256 ix86_struct_value_rtx (tree type, int incoming ATTRIBUTE_UNUSED)
7258 static bool warnedsse, warnedmmx;
7260 if (!TARGET_64BIT && type)
7262 /* Look at the return type of the function, not the function type. */
7263 enum machine_mode mode = TYPE_MODE (TREE_TYPE (type));
7265 if (!TARGET_SSE && !warnedsse)
7268 || (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 16))
7271 warning (0, "SSE vector return without SSE enabled "
7276 if (!TARGET_MMX && !warnedmmx)
7278 if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 8)
7281 warning (0, "MMX vector return without MMX enabled "
7291 /* Create the va_list data type. */
7293 /* Returns the calling convention specific va_list data type.
7294 The argument ABI can be DEFAULT_ABI, MS_ABI, or SYSV_ABI. */
7297 ix86_build_builtin_va_list_abi (enum calling_abi abi)
7299 tree f_gpr, f_fpr, f_ovf, f_sav, record, type_decl;
7301 /* For i386 we use a plain pointer to the argument area. */
7302 if (!TARGET_64BIT || abi == MS_ABI)
7303 return build_pointer_type (char_type_node);
7305 record = lang_hooks.types.make_type (RECORD_TYPE);
7306 type_decl = build_decl (BUILTINS_LOCATION,
7307 TYPE_DECL, get_identifier ("__va_list_tag"), record);
7309 f_gpr = build_decl (BUILTINS_LOCATION,
7310 FIELD_DECL, get_identifier ("gp_offset"),
7311 unsigned_type_node);
7312 f_fpr = build_decl (BUILTINS_LOCATION,
7313 FIELD_DECL, get_identifier ("fp_offset"),
7314 unsigned_type_node);
7315 f_ovf = build_decl (BUILTINS_LOCATION,
7316 FIELD_DECL, get_identifier ("overflow_arg_area"),
7318 f_sav = build_decl (BUILTINS_LOCATION,
7319 FIELD_DECL, get_identifier ("reg_save_area"),
7322 va_list_gpr_counter_field = f_gpr;
7323 va_list_fpr_counter_field = f_fpr;
7325 DECL_FIELD_CONTEXT (f_gpr) = record;
7326 DECL_FIELD_CONTEXT (f_fpr) = record;
7327 DECL_FIELD_CONTEXT (f_ovf) = record;
7328 DECL_FIELD_CONTEXT (f_sav) = record;
7330 TYPE_STUB_DECL (record) = type_decl;
7331 TYPE_NAME (record) = type_decl;
7332 TYPE_FIELDS (record) = f_gpr;
7333 DECL_CHAIN (f_gpr) = f_fpr;
7334 DECL_CHAIN (f_fpr) = f_ovf;
7335 DECL_CHAIN (f_ovf) = f_sav;
7337 layout_type (record);
7339 /* The correct type is an array type of one element. */
7340 return build_array_type (record, build_index_type (size_zero_node));
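/* For reference (shown only as an illustration), the record built
   above corresponds to the familiar SysV x86-64 declaration:

       typedef struct __va_list_tag {
         unsigned int gp_offset;
         unsigned int fp_offset;
         void *overflow_arg_area;
         void *reg_save_area;
       } __va_list_tag;
       typedef __va_list_tag __builtin_va_list[1];
*/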
7343 /* Set up the builtin va_list data type and, for 64-bit, the additional
7344 calling convention specific va_list data types. */
7347 ix86_build_builtin_va_list (void)
7349 tree ret = ix86_build_builtin_va_list_abi (ix86_abi);
7351 /* Initialize ABI specific va_list builtin types. */
7355 if (ix86_abi == MS_ABI)
7357 t = ix86_build_builtin_va_list_abi (SYSV_ABI);
7358 if (TREE_CODE (t) != RECORD_TYPE)
7359 t = build_variant_type_copy (t);
7360 sysv_va_list_type_node = t;
7365 if (TREE_CODE (t) != RECORD_TYPE)
7366 t = build_variant_type_copy (t);
7367 sysv_va_list_type_node = t;
7369 if (ix86_abi != MS_ABI)
7371 t = ix86_build_builtin_va_list_abi (MS_ABI);
7372 if (TREE_CODE (t) != RECORD_TYPE)
7373 t = build_variant_type_copy (t);
7374 ms_va_list_type_node = t;
7379 if (TREE_CODE (t) != RECORD_TYPE)
7380 t = build_variant_type_copy (t);
7381 ms_va_list_type_node = t;
7388 /* Worker function for TARGET_SETUP_INCOMING_VARARGS. */
7391 setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum)
7397 /* GPR size of varargs save area. */
7398 if (cfun->va_list_gpr_size)
7399 ix86_varargs_gpr_size = X86_64_REGPARM_MAX * UNITS_PER_WORD;
7401 ix86_varargs_gpr_size = 0;
7403 /* FPR size of varargs save area. We don't need it if we don't pass
7404 anything in SSE registers. */
7405 if (TARGET_SSE && cfun->va_list_fpr_size)
7406 ix86_varargs_fpr_size = X86_64_SSE_REGPARM_MAX * 16;
7408 ix86_varargs_fpr_size = 0;
7410 if (! ix86_varargs_gpr_size && ! ix86_varargs_fpr_size)
7413 save_area = frame_pointer_rtx;
7414 set = get_varargs_alias_set ();
7416 max = cum->regno + cfun->va_list_gpr_size / UNITS_PER_WORD;
7417 if (max > X86_64_REGPARM_MAX)
7418 max = X86_64_REGPARM_MAX;
7420 for (i = cum->regno; i < max; i++)
7422 mem = gen_rtx_MEM (Pmode,
7423 plus_constant (save_area, i * UNITS_PER_WORD));
7424 MEM_NOTRAP_P (mem) = 1;
7425 set_mem_alias_set (mem, set);
7426 emit_move_insn (mem, gen_rtx_REG (Pmode,
7427 x86_64_int_parameter_registers[i]));
7430 if (ix86_varargs_fpr_size)
7432 enum machine_mode smode;
7435 /* Now emit code to save SSE registers. The AX parameter contains the
7436 number of SSE parameter registers used to call this function, though all we
7437 actually check here is the zero/non-zero status. */
7439 label = gen_label_rtx ();
7440 test = gen_rtx_EQ (VOIDmode, gen_rtx_REG (QImode, AX_REG), const0_rtx);
7441 emit_jump_insn (gen_cbranchqi4 (test, XEXP (test, 0), XEXP (test, 1),
7444 /* ??? If !TARGET_SSE_TYPELESS_STORES, would we perform better if
7445 we used movdqa (i.e. TImode) instead? Perhaps even better would
7446 be if we could determine the real mode of the data, via a hook
7447 into pass_stdarg. Ignore all that for now. */
7449 if (crtl->stack_alignment_needed < GET_MODE_ALIGNMENT (smode))
7450 crtl->stack_alignment_needed = GET_MODE_ALIGNMENT (smode);
7452 max = cum->sse_regno + cfun->va_list_fpr_size / 16;
7453 if (max > X86_64_SSE_REGPARM_MAX)
7454 max = X86_64_SSE_REGPARM_MAX;
7456 for (i = cum->sse_regno; i < max; ++i)
7458 mem = plus_constant (save_area, i * 16 + ix86_varargs_gpr_size);
7459 mem = gen_rtx_MEM (smode, mem);
7460 MEM_NOTRAP_P (mem) = 1;
7461 set_mem_alias_set (mem, set);
7462 set_mem_align (mem, GET_MODE_ALIGNMENT (smode));
7464 emit_move_insn (mem, gen_rtx_REG (smode, SSE_REGNO (i)));
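/* Illustrative layout of the save area built above (assuming the
   usual X86_64_REGPARM_MAX == 6 and X86_64_SSE_REGPARM_MAX == 8):
   48 bytes of GPR slots (%rdi, %rsi, %rdx, %rcx, %r8, %r9 at offsets
   0 through 40) followed by eight 16-byte SSE slots for %xmm0-%xmm7,
   matching the gp_offset/fp_offset bounds that va_start and va_arg
   use below. */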
7472 setup_incoming_varargs_ms_64 (CUMULATIVE_ARGS *cum)
7474 alias_set_type set = get_varargs_alias_set ();
7477 for (i = cum->regno; i < X86_64_MS_REGPARM_MAX; i++)
7481 mem = gen_rtx_MEM (Pmode,
7482 plus_constant (virtual_incoming_args_rtx,
7483 i * UNITS_PER_WORD));
7484 MEM_NOTRAP_P (mem) = 1;
7485 set_mem_alias_set (mem, set);
7487 reg = gen_rtx_REG (Pmode, x86_64_ms_abi_int_parameter_registers[i]);
7488 emit_move_insn (mem, reg);
7493 ix86_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7494 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7497 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7498 CUMULATIVE_ARGS next_cum;
7501 /* This argument doesn't appear to be used anymore. Which is good,
7502 because the old code here didn't suppress rtl generation. */
7503 gcc_assert (!no_rtl);
7508 fntype = TREE_TYPE (current_function_decl);
7510 /* For varargs, we do not want to skip the dummy va_dcl argument.
7511 For stdargs, we do want to skip the last named argument. */
7513 if (stdarg_p (fntype))
7514 ix86_function_arg_advance (pack_cumulative_args (&next_cum), mode, type,
7517 if (cum->call_abi == MS_ABI)
7518 setup_incoming_varargs_ms_64 (&next_cum);
7520 setup_incoming_varargs_64 (&next_cum);
7523 /* Check whether TYPE is the plain char * kind of va_list. */
7526 is_va_list_char_pointer (tree type)
7530 /* For 32-bit it is always true. */
7533 canonic = ix86_canonical_va_list_type (type);
7534 return (canonic == ms_va_list_type_node
7535 || (ix86_abi == MS_ABI && canonic == va_list_type_node));
7538 /* Implement va_start. */
7541 ix86_va_start (tree valist, rtx nextarg)
7543 HOST_WIDE_INT words, n_gpr, n_fpr;
7544 tree f_gpr, f_fpr, f_ovf, f_sav;
7545 tree gpr, fpr, ovf, sav, t;
7549 if (flag_split_stack
7550 && cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7552 unsigned int scratch_regno;
7554 /* When we are splitting the stack, we can't refer to the stack
7555 arguments using internal_arg_pointer, because they may be on
7556 the old stack. The split stack prologue will arrange to
7557 leave a pointer to the old stack arguments in a scratch
7558 register, which we here copy to a pseudo-register. The split
7559 stack prologue can't set the pseudo-register directly because
7560 it (the prologue) runs before any registers have been saved. */
7562 scratch_regno = split_stack_prologue_scratch_regno ();
7563 if (scratch_regno != INVALID_REGNUM)
7567 reg = gen_reg_rtx (Pmode);
7568 cfun->machine->split_stack_varargs_pointer = reg;
7571 emit_move_insn (reg, gen_rtx_REG (Pmode, scratch_regno));
7575 push_topmost_sequence ();
7576 emit_insn_after (seq, entry_of_function ());
7577 pop_topmost_sequence ();
7581 /* Only the 64-bit target needs something special. */
7582 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7584 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7585 std_expand_builtin_va_start (valist, nextarg);
7590 va_r = expand_expr (valist, NULL_RTX, VOIDmode, EXPAND_WRITE);
7591 next = expand_binop (ptr_mode, add_optab,
7592 cfun->machine->split_stack_varargs_pointer,
7593 crtl->args.arg_offset_rtx,
7594 NULL_RTX, 0, OPTAB_LIB_WIDEN);
7595 convert_move (va_r, next, 0);
7600 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7601 f_fpr = DECL_CHAIN (f_gpr);
7602 f_ovf = DECL_CHAIN (f_fpr);
7603 f_sav = DECL_CHAIN (f_ovf);
7605 valist = build_simple_mem_ref (valist);
7606 TREE_TYPE (valist) = TREE_TYPE (sysv_va_list_type_node);
7607 /* The following should be folded into the MEM_REF offset. */
7608 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr), unshare_expr (valist),
7610 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), unshare_expr (valist),
7612 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), unshare_expr (valist),
7614 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), unshare_expr (valist),
7617 /* Count number of gp and fp argument registers used. */
7618 words = crtl->args.info.words;
7619 n_gpr = crtl->args.info.regno;
7620 n_fpr = crtl->args.info.sse_regno;
7622 if (cfun->va_list_gpr_size)
7624 type = TREE_TYPE (gpr);
7625 t = build2 (MODIFY_EXPR, type,
7626 gpr, build_int_cst (type, n_gpr * 8));
7627 TREE_SIDE_EFFECTS (t) = 1;
7628 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7631 if (TARGET_SSE && cfun->va_list_fpr_size)
7633 type = TREE_TYPE (fpr);
7634 t = build2 (MODIFY_EXPR, type, fpr,
7635 build_int_cst (type, n_fpr * 16 + 8*X86_64_REGPARM_MAX));
7636 TREE_SIDE_EFFECTS (t) = 1;
7637 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7640 /* Find the overflow area. */
7641 type = TREE_TYPE (ovf);
7642 if (cfun->machine->split_stack_varargs_pointer == NULL_RTX)
7643 ovf_rtx = crtl->args.internal_arg_pointer;
7645 ovf_rtx = cfun->machine->split_stack_varargs_pointer;
7646 t = make_tree (type, ovf_rtx);
7648 t = build2 (POINTER_PLUS_EXPR, type, t,
7649 size_int (words * UNITS_PER_WORD));
7650 t = build2 (MODIFY_EXPR, type, ovf, t);
7651 TREE_SIDE_EFFECTS (t) = 1;
7652 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7654 if (ix86_varargs_gpr_size || ix86_varargs_fpr_size)
7656 /* Find the register save area.
7657 The function prologue saves it right above the stack frame. */
7658 type = TREE_TYPE (sav);
7659 t = make_tree (type, frame_pointer_rtx);
7660 if (!ix86_varargs_gpr_size)
7661 t = build2 (POINTER_PLUS_EXPR, type, t,
7662 size_int (-8 * X86_64_REGPARM_MAX));
7663 t = build2 (MODIFY_EXPR, type, sav, t);
7664 TREE_SIDE_EFFECTS (t) = 1;
7665 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
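/* Illustrative expansion (assumed SysV x86-64): for f (int n, ...)
   that consumed one integer register, the code above behaves roughly
   like

       ap->gp_offset = 1 * 8;
       ap->fp_offset = 8 * X86_64_REGPARM_MAX;   /* 48 when no FP args */
       ap->overflow_arg_area = incoming_args + words * UNITS_PER_WORD;
       ap->reg_save_area = frame;   /* biased when no GPRs were saved */
*/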
7669 /* Implement va_arg. */
7672 ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p,
7675 static const int intreg[6] = { 0, 1, 2, 3, 4, 5 };
7676 tree f_gpr, f_fpr, f_ovf, f_sav;
7677 tree gpr, fpr, ovf, sav, t;
7679 tree lab_false, lab_over = NULL_TREE;
7684 enum machine_mode nat_mode;
7685 unsigned int arg_boundary;
7687 /* Only the 64-bit target needs something special. */
7688 if (!TARGET_64BIT || is_va_list_char_pointer (TREE_TYPE (valist)))
7689 return std_gimplify_va_arg_expr (valist, type, pre_p, post_p);
7691 f_gpr = TYPE_FIELDS (TREE_TYPE (sysv_va_list_type_node));
7692 f_fpr = DECL_CHAIN (f_gpr);
7693 f_ovf = DECL_CHAIN (f_fpr);
7694 f_sav = DECL_CHAIN (f_ovf);
7696 gpr = build3 (COMPONENT_REF, TREE_TYPE (f_gpr),
7697 build_va_arg_indirect_ref (valist), f_gpr, NULL_TREE);
7698 valist = build_va_arg_indirect_ref (valist);
7699 fpr = build3 (COMPONENT_REF, TREE_TYPE (f_fpr), valist, f_fpr, NULL_TREE);
7700 ovf = build3 (COMPONENT_REF, TREE_TYPE (f_ovf), valist, f_ovf, NULL_TREE);
7701 sav = build3 (COMPONENT_REF, TREE_TYPE (f_sav), valist, f_sav, NULL_TREE);
7703 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7705 type = build_pointer_type (type);
7706 size = int_size_in_bytes (type);
7707 rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
7709 nat_mode = type_natural_mode (type, NULL);
7718 /* Unnamed 256bit vector mode parameters are passed on the stack. */
7719 if (!TARGET_64BIT_MS_ABI)
7726 container = construct_container (nat_mode, TYPE_MODE (type),
7727 type, 0, X86_64_REGPARM_MAX,
7728 X86_64_SSE_REGPARM_MAX, intreg,
7733 /* Pull the value out of the saved registers. */
7735 addr = create_tmp_var (ptr_type_node, "addr");
7739 int needed_intregs, needed_sseregs;
7741 tree int_addr, sse_addr;
7743 lab_false = create_artificial_label (UNKNOWN_LOCATION);
7744 lab_over = create_artificial_label (UNKNOWN_LOCATION);
7746 examine_argument (nat_mode, type, 0, &needed_intregs, &needed_sseregs);
7748 need_temp = (!REG_P (container)
7749 && ((needed_intregs && TYPE_ALIGN (type) > 64)
7750 || TYPE_ALIGN (type) > 128));
7752 /* In case we are passing a structure, verify that it is a consecutive
7753 block in the register save area. If not, we need to do moves. */
7754 if (!need_temp && !REG_P (container))
7756 /* Verify that all registers are strictly consecutive. */
7757 if (SSE_REGNO_P (REGNO (XEXP (XVECEXP (container, 0, 0), 0))))
7761 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
7763 rtx slot = XVECEXP (container, 0, i);
7764 if (REGNO (XEXP (slot, 0)) != FIRST_SSE_REG + (unsigned int) i
7765 || INTVAL (XEXP (slot, 1)) != i * 16)
7773 for (i = 0; i < XVECLEN (container, 0) && !need_temp; i++)
7775 rtx slot = XVECEXP (container, 0, i);
7776 if (REGNO (XEXP (slot, 0)) != (unsigned int) i
7777 || INTVAL (XEXP (slot, 1)) != i * 8)
7789 int_addr = create_tmp_var (ptr_type_node, "int_addr");
7790 sse_addr = create_tmp_var (ptr_type_node, "sse_addr");
7793 /* First ensure that we fit completely in registers. */
7796 t = build_int_cst (TREE_TYPE (gpr),
7797 (X86_64_REGPARM_MAX - needed_intregs + 1) * 8);
7798 t = build2 (GE_EXPR, boolean_type_node, gpr, t);
7799 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
7800 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
7801 gimplify_and_add (t, pre_p);
7805 t = build_int_cst (TREE_TYPE (fpr),
7806 (X86_64_SSE_REGPARM_MAX - needed_sseregs + 1) * 16
7807 + X86_64_REGPARM_MAX * 8);
7808 t = build2 (GE_EXPR, boolean_type_node, fpr, t);
7809 t2 = build1 (GOTO_EXPR, void_type_node, lab_false);
7810 t = build3 (COND_EXPR, void_type_node, t, t2, NULL_TREE);
7811 gimplify_and_add (t, pre_p);
7814 /* Compute index to start of area used for integer regs. */
7817 /* int_addr = gpr + sav; */
7818 t = fold_convert (sizetype, gpr);
7819 t = build2 (POINTER_PLUS_EXPR, ptr_type_node, sav, t);
7820 gimplify_assign (int_addr, t, pre_p);
7824 /* sse_addr = fpr + sav; */
7825 t = fold_convert (sizetype, fpr);
7826 t = build2 (POINTER_PLUS_EXPR, ptr_type_node, sav, t);
7827 gimplify_assign (sse_addr, t, pre_p);
7831 int i, prev_size = 0;
7832 tree temp = create_tmp_var (type, "va_arg_tmp");
7835 t = build1 (ADDR_EXPR, build_pointer_type (type), temp);
7836 gimplify_assign (addr, t, pre_p);
7838 for (i = 0; i < XVECLEN (container, 0); i++)
7840 rtx slot = XVECEXP (container, 0, i);
7841 rtx reg = XEXP (slot, 0);
7842 enum machine_mode mode = GET_MODE (reg);
7848 tree dest_addr, dest;
7849 int cur_size = GET_MODE_SIZE (mode);
7851 gcc_assert (prev_size <= INTVAL (XEXP (slot, 1)));
7852 prev_size = INTVAL (XEXP (slot, 1));
7853 if (prev_size + cur_size > size)
7855 cur_size = size - prev_size;
7856 mode = mode_for_size (cur_size * BITS_PER_UNIT, MODE_INT, 1);
7857 if (mode == BLKmode)
7860 piece_type = lang_hooks.types.type_for_mode (mode, 1);
7861 if (mode == GET_MODE (reg))
7862 addr_type = build_pointer_type (piece_type);
7864 addr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
7866 daddr_type = build_pointer_type_for_mode (piece_type, ptr_mode,
7869 if (SSE_REGNO_P (REGNO (reg)))
7871 src_addr = sse_addr;
7872 src_offset = (REGNO (reg) - FIRST_SSE_REG) * 16;
7876 src_addr = int_addr;
7877 src_offset = REGNO (reg) * 8;
7879 src_addr = fold_convert (addr_type, src_addr);
7880 src_addr = fold_build2 (POINTER_PLUS_EXPR, addr_type, src_addr,
7881 size_int (src_offset));
7883 dest_addr = fold_convert (daddr_type, addr);
7884 dest_addr = fold_build2 (POINTER_PLUS_EXPR, daddr_type, dest_addr,
7885 size_int (prev_size));
7886 if (cur_size == GET_MODE_SIZE (mode))
7888 src = build_va_arg_indirect_ref (src_addr);
7889 dest = build_va_arg_indirect_ref (dest_addr);
7891 gimplify_assign (dest, src, pre_p);
7896 = build_call_expr (implicit_built_in_decls[BUILT_IN_MEMCPY],
7897 3, dest_addr, src_addr,
7898 size_int (cur_size));
7899 gimplify_and_add (copy, pre_p);
7901 prev_size += cur_size;
7907 t = build2 (PLUS_EXPR, TREE_TYPE (gpr), gpr,
7908 build_int_cst (TREE_TYPE (gpr), needed_intregs * 8));
7909 gimplify_assign (gpr, t, pre_p);
7914 t = build2 (PLUS_EXPR, TREE_TYPE (fpr), fpr,
7915 build_int_cst (TREE_TYPE (fpr), needed_sseregs * 16));
7916 gimplify_assign (fpr, t, pre_p);
7919 gimple_seq_add_stmt (pre_p, gimple_build_goto (lab_over));
7921 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_false));
7924 /* ... otherwise out of the overflow area. */
7926 /* When we align a parameter on the stack for the caller, if its
7927 alignment is beyond MAX_SUPPORTED_STACK_ALIGNMENT, it will be
7928 aligned at MAX_SUPPORTED_STACK_ALIGNMENT. We match the callee
7929 here with the caller. */
7930 arg_boundary = ix86_function_arg_boundary (VOIDmode, type);
7931 if ((unsigned int) arg_boundary > MAX_SUPPORTED_STACK_ALIGNMENT)
7932 arg_boundary = MAX_SUPPORTED_STACK_ALIGNMENT;
7934 /* Care for on-stack alignment if needed. */
7935 if (arg_boundary <= 64 || size == 0)
7939 HOST_WIDE_INT align = arg_boundary / 8;
7940 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (ovf), ovf,
7941 size_int (align - 1));
7942 t = fold_convert (sizetype, t);
7943 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7945 t = fold_convert (TREE_TYPE (ovf), t);
7948 gimplify_expr (&t, pre_p, NULL, is_gimple_val, fb_rvalue);
7949 gimplify_assign (addr, t, pre_p);
7951 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (t), t,
7952 size_int (rsize * UNITS_PER_WORD));
7953 gimplify_assign (unshare_expr (ovf), t, pre_p);
7956 gimple_seq_add_stmt (pre_p, gimple_build_label (lab_over));
7958 ptrtype = build_pointer_type_for_mode (type, ptr_mode, true);
7959 addr = fold_convert (ptrtype, addr);
7962 addr = build_va_arg_indirect_ref (addr);
7963 return build_va_arg_indirect_ref (addr);
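/* Sketch of the sequence gimplified above for va_arg (ap, int)
   (illustrative pseudo-code; one integer register needed):

       if (ap->gp_offset >= 6 * 8) goto lab_false;
       addr = ap->reg_save_area + ap->gp_offset;
       ap->gp_offset += 8;
       goto lab_over;
     lab_false:
       addr = ap->overflow_arg_area;       /* plus alignment fixup */
       ap->overflow_arg_area = addr + 8;
     lab_over:
       result = *(int *) addr;
*/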
7966 /* Return true if OPNUM's MEM should be matched
7967 in movabs* patterns. */
7970 ix86_check_movabs (rtx insn, int opnum)
7974 set = PATTERN (insn);
7975 if (GET_CODE (set) == PARALLEL)
7976 set = XVECEXP (set, 0, 0);
7977 gcc_assert (GET_CODE (set) == SET);
7978 mem = XEXP (set, opnum);
7979 while (GET_CODE (mem) == SUBREG)
7980 mem = SUBREG_REG (mem);
7981 gcc_assert (MEM_P (mem));
7982 return volatile_ok || !MEM_VOLATILE_P (mem);
7985 /* Initialize the table of extra 80387 mathematical constants. */
7988 init_ext_80387_constants (void)
7990 static const char * cst[5] =
7992 "0.3010299956639811952256464283594894482", /* 0: fldlg2 */
7993 "0.6931471805599453094286904741849753009", /* 1: fldln2 */
7994 "1.4426950408889634073876517827983434472", /* 2: fldl2e */
7995 "3.3219280948873623478083405569094566090", /* 3: fldl2t */
7996 "3.1415926535897932385128089594061862044", /* 4: fldpi */
8000 for (i = 0; i < 5; i++)
8002 real_from_string (&ext_80387_constants_table[i], cst[i]);
8003 /* Ensure each constant is rounded to XFmode precision. */
8004 real_convert (&ext_80387_constants_table[i],
8005 XFmode, &ext_80387_constants_table[i]);
8008 ext_80387_constants_init = 1;
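/* Usage note (illustrative): once this table is initialized,
   standard_80387_constant_p recognizes, e.g., the XFmode constant pi
   and standard_80387_constant_opcode then selects "fldpi"; likewise
   log10(2) -> "fldlg2", ln(2) -> "fldln2", log2(e) -> "fldl2e" and
   log2(10) -> "fldl2t", matching the table above. */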
8011 /* Return non-zero if the constant is something that
8012 can be loaded with a special instruction. */
8015 standard_80387_constant_p (rtx x)
8017 enum machine_mode mode = GET_MODE (x);
8021 if (!(X87_FLOAT_MODE_P (mode) && (GET_CODE (x) == CONST_DOUBLE)))
8024 if (x == CONST0_RTX (mode))
8026 if (x == CONST1_RTX (mode))
8029 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8031 /* For XFmode constants, try to find a special 80387 instruction when
8032 optimizing for size or on those CPUs that benefit from them. */
8034 && (optimize_function_for_size_p (cfun) || TARGET_EXT_80387_CONSTANTS))
8038 if (! ext_80387_constants_init)
8039 init_ext_80387_constants ();
8041 for (i = 0; i < 5; i++)
8042 if (real_identical (&r, &ext_80387_constants_table[i]))
8046 /* A load of the constant -0.0 or -1.0 will be split into an
8047 fldz;fchs or fld1;fchs sequence. */
8048 if (real_isnegzero (&r))
8050 if (real_identical (&r, &dconstm1))
8056 /* Return the opcode of the special instruction to be used to load
8060 standard_80387_constant_opcode (rtx x)
8062 switch (standard_80387_constant_p (x))
8086 /* Return the CONST_DOUBLE representing the 80387 constant that is
8087 loaded by the specified special instruction. The argument IDX
8088 matches the return value from standard_80387_constant_p. */
8091 standard_80387_constant_rtx (int idx)
8095 if (! ext_80387_constants_init)
8096 init_ext_80387_constants ();
8112 return CONST_DOUBLE_FROM_REAL_VALUE (ext_80387_constants_table[i],
8116 /* Return 1 if X is all 0s and 2 if X is all 1s
8117 in a supported SSE vector mode. */
8120 standard_sse_constant_p (rtx x)
8122 enum machine_mode mode = GET_MODE (x);
8124 if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x)))
8126 if (vector_all_ones_operand (x, mode))
8142 /* Return the opcode of the special instruction to be used to load
8146 standard_sse_constant_opcode (rtx insn, rtx x)
8148 switch (standard_sse_constant_p (x))
8151 switch (get_attr_mode (insn))
8154 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8155 return "%vpxor\t%0, %d0";
8157 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8158 return "%vxorpd\t%0, %d0";
8160 return "%vxorps\t%0, %d0";
8163 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8164 return "vpxor\t%x0, %x0, %x0";
8166 if (!TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
8167 return "vxorpd\t%x0, %x0, %x0";
8169 return "vxorps\t%x0, %x0, %x0";
8176 return "%vpcmpeqd\t%0, %d0";
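/* For example (an illustrative expansion of the templates returned
   above, AT&T syntax): an all-zeros vector is loaded by xor'ing the
   destination with itself,

	xorps	%xmm0, %xmm0

   and an all-ones vector by comparing the destination against itself,

	pcmpeqd	%xmm0, %xmm0

   neither of which needs a constant pool load.  */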
8183 /* Returns true if OP contains a symbol reference. */
8186 symbolic_reference_mentioned_p (rtx op)
8191 if (GET_CODE (op) == SYMBOL_REF || GET_CODE (op) == LABEL_REF)
8194 fmt = GET_RTX_FORMAT (GET_CODE (op));
8195 for (i = GET_RTX_LENGTH (GET_CODE (op)) - 1; i >= 0; i--)
8201 for (j = XVECLEN (op, i) - 1; j >= 0; j--)
8202 if (symbolic_reference_mentioned_p (XVECEXP (op, i, j)))
8206 else if (fmt[i] == 'e' && symbolic_reference_mentioned_p (XEXP (op, i)))
8213 /* Return true if it is appropriate to emit `ret' instructions in the
8214 body of a function. Do this only if the epilogue is simple, needing a
8215 couple of insns. Prior to reloading, we can't tell how many registers
8216 must be saved, so return false then. Return false if there is no frame
8217 marker to de-allocate. */
8220 ix86_can_use_return_insn_p (void)
8222 struct ix86_frame frame;
8224 if (! reload_completed || frame_pointer_needed)
8227 /* Don't allow more than 32k pop, since that's all we can do
8228 with one instruction. */
8229 if (crtl->args.pops_args && crtl->args.size >= 32768)
8232 ix86_compute_frame_layout (&frame);
8233 return (frame.stack_pointer_offset == UNITS_PER_WORD
8234 && (frame.nregs + frame.nsseregs) == 0);
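/* Illustrative note: the single-instruction pop referred to above is
   `ret imm16', which pops that many bytes of arguments after popping
   the return address, e.g.

	ret	$12

   Its immediate is only 16 bits wide, hence the limit on popped
   argument bytes checked above.  */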
8237 /* Value should be nonzero if functions must have frame pointers.
8238 Zero means the frame pointer need not be set up (and parms may
8239 be accessed via the stack pointer) in functions that seem suitable. */
8242 ix86_frame_pointer_required (void)
8244 /* If we accessed previous frames, then the generated code expects
8245 to be able to access the saved ebp value in our frame. */
8246 if (cfun->machine->accesses_prev_frame)
8249 /* Several x86 OSes need a frame pointer for other reasons,
8250 usually pertaining to setjmp. */
8251 if (SUBTARGET_FRAME_POINTER_REQUIRED)
8254 /* In ix86_option_override_internal, TARGET_OMIT_LEAF_FRAME_POINTER
8255 turns off the frame pointer by default. Turn it back on now if
8256 we're not compiling a leaf function.
8257 if (TARGET_OMIT_LEAF_FRAME_POINTER
8258 && (!current_function_is_leaf
8259 || ix86_current_function_calls_tls_descriptor))
8262 if (crtl->profile && !flag_fentry)
8268 /* Record that the current function accesses previous call frames. */
8271 ix86_setup_frame_addresses (void)
8273 cfun->machine->accesses_prev_frame = 1;
8276 #ifndef USE_HIDDEN_LINKONCE
8277 # if defined(HAVE_GAS_HIDDEN) && (SUPPORTS_ONE_ONLY - 0)
8278 # define USE_HIDDEN_LINKONCE 1
8280 # define USE_HIDDEN_LINKONCE 0
8284 static int pic_labels_used;
8286 /* Fills in the label name that should be used for a pc thunk for
8287 the given register. */
8290 get_pc_thunk_name (char name[32], unsigned int regno)
8292 gcc_assert (!TARGET_64BIT);
8294 if (USE_HIDDEN_LINKONCE)
8295 sprintf (name, "__i686.get_pc_thunk.%s", reg_names[regno]);
8297 ASM_GENERATE_INTERNAL_LABEL (name, "LPR", regno);
8301 /* This function generates code for -fpic that loads %ebx with
8302 the return address of the caller and then returns. */
8305 ix86_code_end (void)
8310 #ifdef TARGET_SOLARIS
8311 solaris_code_end ();
8314 for (regno = AX_REG; regno <= SP_REG; regno++)
8319 if (!(pic_labels_used & (1 << regno)))
8322 get_pc_thunk_name (name, regno);
8324 decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
8325 get_identifier (name),
8326 build_function_type_list (void_type_node, NULL_TREE));
8327 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
8328 NULL_TREE, void_type_node);
8329 TREE_PUBLIC (decl) = 1;
8330 TREE_STATIC (decl) = 1;
8335 switch_to_section (darwin_sections[text_coal_section]);
8336 fputs ("\t.weak_definition\t", asm_out_file);
8337 assemble_name (asm_out_file, name);
8338 fputs ("\n\t.private_extern\t", asm_out_file);
8339 assemble_name (asm_out_file, name);
8340 putc ('\n', asm_out_file);
8341 ASM_OUTPUT_LABEL (asm_out_file, name);
8342 DECL_WEAK (decl) = 1;
8346 if (USE_HIDDEN_LINKONCE)
8348 DECL_COMDAT_GROUP (decl) = DECL_ASSEMBLER_NAME (decl);
8350 targetm.asm_out.unique_section (decl, 0);
8351 switch_to_section (get_named_section (decl, NULL, 0));
8353 targetm.asm_out.globalize_label (asm_out_file, name);
8354 fputs ("\t.hidden\t", asm_out_file);
8355 assemble_name (asm_out_file, name);
8356 putc ('\n', asm_out_file);
8357 ASM_DECLARE_FUNCTION_NAME (asm_out_file, name, decl);
8361 switch_to_section (text_section);
8362 ASM_OUTPUT_LABEL (asm_out_file, name);
8365 DECL_INITIAL (decl) = make_node (BLOCK);
8366 current_function_decl = decl;
8367 init_function_start (decl);
8368 first_function_block_is_cold = false;
8369 /* Make sure unwind info is emitted for the thunk if needed. */
8370 final_start_function (emit_barrier (), asm_out_file, 1);
8372 /* Pad stack IP move with 4 instructions (two NOPs count
8373 as one instruction). */
8374 if (TARGET_PAD_SHORT_FUNCTION)
8379 fputs ("\tnop\n", asm_out_file);
8382 xops[0] = gen_rtx_REG (Pmode, regno);
8383 xops[1] = gen_rtx_MEM (Pmode, stack_pointer_rtx);
8384 output_asm_insn ("mov%z0\t{%1, %0|%0, %1}", xops);
8385 fputs ("\tret\n", asm_out_file);
8386 final_end_function ();
8387 init_insn_lengths ();
8388 free_after_compilation (cfun);
8390 current_function_decl = NULL;
8393 if (flag_split_stack)
8394 file_end_indicate_split_stack ();
8397 /* Emit code for the SET_GOT patterns. */
8400 output_set_got (rtx dest, rtx label ATTRIBUTE_UNUSED)
8406 if (TARGET_VXWORKS_RTP && flag_pic)
8408 /* Load (*VXWORKS_GOTT_BASE) into the PIC register. */
8409 xops[2] = gen_rtx_MEM (Pmode,
8410 gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_BASE));
8411 output_asm_insn ("mov{l}\t{%2, %0|%0, %2}", xops);
8413 /* Load (*VXWORKS_GOTT_BASE)[VXWORKS_GOTT_INDEX] into the PIC register.
8414 Use %P and a local symbol in order to print VXWORKS_GOTT_INDEX as
8415 an unadorned address. */
8416 xops[2] = gen_rtx_SYMBOL_REF (Pmode, VXWORKS_GOTT_INDEX);
8417 SYMBOL_REF_FLAGS (xops[2]) |= SYMBOL_FLAG_LOCAL;
8418 output_asm_insn ("mov{l}\t{%P2(%0), %0|%0, DWORD PTR %P2[%0]}", xops);
8422 xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME);
8426 xops[2] = gen_rtx_LABEL_REF (Pmode, label ? label : gen_label_rtx ());
8428 output_asm_insn ("mov%z0\t{%2, %0|%0, %2}", xops);
8431 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8432 is what will be referenced by the Mach-O PIC subsystem. */
8434 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8437 targetm.asm_out.internal_label (asm_out_file, "L",
8438 CODE_LABEL_NUMBER (XEXP (xops[2], 0)));
8443 get_pc_thunk_name (name, REGNO (dest));
8444 pic_labels_used |= 1 << REGNO (dest);
8446 xops[2] = gen_rtx_SYMBOL_REF (Pmode, ggc_strdup (name));
8447 xops[2] = gen_rtx_MEM (QImode, xops[2]);
8448 output_asm_insn ("call\t%X2", xops);
8449 /* Output the Mach-O "canonical" label name ("Lxx$pb") here too. This
8450 is what will be referenced by the Mach-O PIC subsystem. */
8453 ASM_OUTPUT_LABEL (asm_out_file, MACHOPIC_FUNCTION_BASE_NAME);
8455 targetm.asm_out.internal_label (asm_out_file, "L",
8456 CODE_LABEL_NUMBER (label));
8461 output_asm_insn ("add%z0\t{%1, %0|%0, %1}", xops);
8466 /* Generate a "push" pattern for input ARG. */
8471 struct machine_function *m = cfun->machine;
8473 if (m->fs.cfa_reg == stack_pointer_rtx)
8474 m->fs.cfa_offset += UNITS_PER_WORD;
8475 m->fs.sp_offset += UNITS_PER_WORD;
8477 return gen_rtx_SET (VOIDmode,
8479 gen_rtx_PRE_DEC (Pmode,
8480 stack_pointer_rtx)),
8484 /* Generate a "pop" pattern for input ARG. */
8489 return gen_rtx_SET (VOIDmode,
8492 gen_rtx_POST_INC (Pmode,
8493 stack_pointer_rtx)));
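/* A sketch of the RTL these two helpers produce (illustrative only;
   SImode shown, DImode in 64-bit mode):

	gen_push (reg):  (set (mem:SI (pre_dec:SI (reg:SI sp))) (reg))
	gen_pop (reg):   (set (reg) (mem:SI (post_inc:SI (reg:SI sp))))

   i.e. the stack pointer adjustment is folded into the memory address
   via the pre-decrement/post-increment side effects.  */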
8496 /* Return >= 0 if there is an unused call-clobbered register available
8497 for the entire function. */
8500 ix86_select_alt_pic_regnum (void)
8502 if (current_function_is_leaf
8504 && !ix86_current_function_calls_tls_descriptor)
8507 /* Can't use the same register for both PIC and DRAP. */
8509 drap = REGNO (crtl->drap_reg);
8512 for (i = 2; i >= 0; --i)
8513 if (i != drap && !df_regs_ever_live_p (i))
8517 return INVALID_REGNUM;
8520 /* Return TRUE if we need to save REGNO. */
8523 ix86_save_reg (unsigned int regno, bool maybe_eh_return)
8525 if (pic_offset_table_rtx
8526 && regno == REAL_PIC_OFFSET_TABLE_REGNUM
8527 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
8529 || crtl->calls_eh_return
8530 || crtl->uses_const_pool))
8531 return ix86_select_alt_pic_regnum () == INVALID_REGNUM;
8533 if (crtl->calls_eh_return && maybe_eh_return)
8538 unsigned test = EH_RETURN_DATA_REGNO (i);
8539 if (test == INVALID_REGNUM)
8546 if (crtl->drap_reg && regno == REGNO (crtl->drap_reg))
8549 return (df_regs_ever_live_p (regno)
8550 && !call_used_regs[regno]
8551 && !fixed_regs[regno]
8552 && (regno != HARD_FRAME_POINTER_REGNUM || !frame_pointer_needed));
8555 /* Return the number of saved general purpose registers. */
8558 ix86_nsaved_regs (void)
8563 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8564 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8569 /* Return the number of saved SSE registers. */
8572 ix86_nsaved_sseregs (void)
8577 if (!TARGET_64BIT_MS_ABI)
8579 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
8580 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8585 /* Given FROM and TO register numbers, say whether this elimination is
8586 allowed. If stack alignment is needed, we can only replace argument
8587 pointer with hard frame pointer, or replace frame pointer with stack
8588 pointer. Otherwise, frame pointer elimination is automatically
8589 handled and all other eliminations are valid. */
8592 ix86_can_eliminate (const int from, const int to)
8594 if (stack_realign_fp)
8595 return ((from == ARG_POINTER_REGNUM
8596 && to == HARD_FRAME_POINTER_REGNUM)
8597 || (from == FRAME_POINTER_REGNUM
8598 && to == STACK_POINTER_REGNUM));
8600 return to == STACK_POINTER_REGNUM ? !frame_pointer_needed : true;
8603 /* Return the offset between two registers, one to be eliminated, and the other
8604 its replacement, at the start of a routine. */
8607 ix86_initial_elimination_offset (int from, int to)
8609 struct ix86_frame frame;
8610 ix86_compute_frame_layout (&frame);
8612 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
8613 return frame.hard_frame_pointer_offset;
8614 else if (from == FRAME_POINTER_REGNUM
8615 && to == HARD_FRAME_POINTER_REGNUM)
8616 return frame.hard_frame_pointer_offset - frame.frame_pointer_offset;
8619 gcc_assert (to == STACK_POINTER_REGNUM);
8621 if (from == ARG_POINTER_REGNUM)
8622 return frame.stack_pointer_offset;
8624 gcc_assert (from == FRAME_POINTER_REGNUM);
8625 return frame.stack_pointer_offset - frame.frame_pointer_offset;
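/* A worked example with hypothetical numbers: with
   hard_frame_pointer_offset == 8, frame_pointer_offset == 16 and
   stack_pointer_offset == 40, eliminating ARG_POINTER to the hard
   frame pointer yields 8, FRAME_POINTER to the hard frame pointer
   yields 8 - 16 = -8, ARG_POINTER to STACK_POINTER yields 40, and
   FRAME_POINTER to STACK_POINTER yields 40 - 16 = 24.  */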
8629 /* In a dynamically-aligned function, we can't know the offset from
8630 stack pointer to frame pointer, so we must ensure that setjmp
8631 eliminates fp against the hard fp (%ebp) rather than trying to
8632 index from %esp up to the top of the frame across a gap that is
8633 of unknown (at compile-time) size. */
8635 ix86_builtin_setjmp_frame_value (void)
8637 return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
8640 /* When using -fsplit-stack, the allocation routines set a field in
8641 the TCB to the bottom of the stack plus this much space, measured
8644 #define SPLIT_STACK_AVAILABLE 256
8646 /* Fill the ix86_frame structure describing the frame of the currently compiled function. */
8649 ix86_compute_frame_layout (struct ix86_frame *frame)
8651 unsigned int stack_alignment_needed;
8652 HOST_WIDE_INT offset;
8653 unsigned int preferred_alignment;
8654 HOST_WIDE_INT size = get_frame_size ();
8655 HOST_WIDE_INT to_allocate;
8657 frame->nregs = ix86_nsaved_regs ();
8658 frame->nsseregs = ix86_nsaved_sseregs ();
8660 stack_alignment_needed = crtl->stack_alignment_needed / BITS_PER_UNIT;
8661 preferred_alignment = crtl->preferred_stack_boundary / BITS_PER_UNIT;
8663 /* The 64-bit MS ABI seems to require the stack alignment to be 16
8664 everywhere, except in function prologues and leaf functions. */
8665 if ((TARGET_64BIT_MS_ABI && preferred_alignment < 16)
8666 && (!current_function_is_leaf || cfun->calls_alloca != 0
8667 || ix86_current_function_calls_tls_descriptor))
8669 preferred_alignment = 16;
8670 stack_alignment_needed = 16;
8671 crtl->preferred_stack_boundary = 128;
8672 crtl->stack_alignment_needed = 128;
8675 gcc_assert (!size || stack_alignment_needed);
8676 gcc_assert (preferred_alignment >= STACK_BOUNDARY / BITS_PER_UNIT);
8677 gcc_assert (preferred_alignment <= stack_alignment_needed);
8679 /* For SEH we have to limit the amount of code movement into the prologue.
8680 At present we do this via a BLOCKAGE, at which point there's very little
8681 scheduling that can be done, which means that there's very little point
8682 in doing anything except PUSHs. */
8684 cfun->machine->use_fast_prologue_epilogue = false;
8686 /* During the reload iteration the number of registers saved can change.
8687 Recompute the value as needed. Do not recompute when the number of
8688 registers didn't change, as reload makes multiple calls to this function
8689 and does not expect the decision to change within a single iteration. */
8690 else if (!optimize_function_for_size_p (cfun)
8691 && cfun->machine->use_fast_prologue_epilogue_nregs != frame->nregs)
8693 int count = frame->nregs;
8694 struct cgraph_node *node = cgraph_get_node (current_function_decl);
8696 cfun->machine->use_fast_prologue_epilogue_nregs = count;
8698 /* The fast prologue uses move instead of push to save registers. This
8699 is significantly longer, but also executes faster as modern hardware
8700 can execute the moves in parallel, but can't do that for push/pop.
8702 Be careful about choosing which prologue to emit: when the function
8703 takes many instructions to execute, we may as well use the slow
8704 version, likewise when the function is known to be outside a hot spot
8705 (this is known with feedback only). Weight the size of the function
8706 by the number of registers to save, as it is cheap to use one or two
8707 push instructions but very slow to use many of them. */
8709 count = (count - 1) * FAST_PROLOGUE_INSN_COUNT;
8710 if (node->frequency < NODE_FREQUENCY_NORMAL
8711 || (flag_branch_probabilities
8712 && node->frequency < NODE_FREQUENCY_HOT))
8713 cfun->machine->use_fast_prologue_epilogue = false;
8715 cfun->machine->use_fast_prologue_epilogue
8716 = !expensive_function_p (count);
8718 if (TARGET_PROLOGUE_USING_MOVE
8719 && cfun->machine->use_fast_prologue_epilogue)
8720 frame->save_regs_using_mov = true;
8722 frame->save_regs_using_mov = false;
8724 /* If static stack checking is enabled and done with probes, the registers
8725 need to be saved before allocating the frame. */
8726 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
8727 frame->save_regs_using_mov = false;
8729 /* Skip return address. */
8730 offset = UNITS_PER_WORD;
8732 /* Skip pushed static chain. */
8733 if (ix86_static_chain_on_stack)
8734 offset += UNITS_PER_WORD;
8736 /* Skip saved base pointer. */
8737 if (frame_pointer_needed)
8738 offset += UNITS_PER_WORD;
8739 frame->hfp_save_offset = offset;
8741 /* The traditional frame pointer location is at the top of the frame. */
8742 frame->hard_frame_pointer_offset = offset;
8744 /* Register save area */
8745 offset += frame->nregs * UNITS_PER_WORD;
8746 frame->reg_save_offset = offset;
8748 /* Align and set SSE register save area. */
8749 if (frame->nsseregs)
8751 /* The only ABI that has saved SSE registers (Win64) also has a
8752 16-byte aligned default stack, and thus we don't need to be
8753 within the re-aligned local stack frame to save them. */
8754 gcc_assert (INCOMING_STACK_BOUNDARY >= 128);
8755 offset = (offset + 16 - 1) & -16;
8756 offset += frame->nsseregs * 16;
8758 frame->sse_reg_save_offset = offset;
8760 /* The re-aligned stack starts here. Values before this point are not
8761 directly comparable with values below this point. In order to make
8762 sure that no value happens to be the same before and after, force
8763 the alignment computation below to add a non-zero value. */
8764 if (stack_realign_fp)
8765 offset = (offset + stack_alignment_needed) & -stack_alignment_needed;
8768 frame->va_arg_size = ix86_varargs_gpr_size + ix86_varargs_fpr_size;
8769 offset += frame->va_arg_size;
8771 /* Align start of frame for local function. */
8772 if (stack_realign_fp
8773 || offset != frame->sse_reg_save_offset
8775 || !current_function_is_leaf
8776 || cfun->calls_alloca
8777 || ix86_current_function_calls_tls_descriptor)
8778 offset = (offset + stack_alignment_needed - 1) & -stack_alignment_needed;
8780 /* Frame pointer points here. */
8781 frame->frame_pointer_offset = offset;
8785 /* Add the outgoing arguments area. This can be skipped if we
8786 eliminated all the function calls as dead code.
8787 Skipping is however impossible when the function calls alloca: the
8788 alloca expander assumes that the last crtl->outgoing_args_size bytes
8789 of the stack frame are unused. */
8790 if (ACCUMULATE_OUTGOING_ARGS
8791 && (!current_function_is_leaf || cfun->calls_alloca
8792 || ix86_current_function_calls_tls_descriptor))
8794 offset += crtl->outgoing_args_size;
8795 frame->outgoing_arguments_size = crtl->outgoing_args_size;
8798 frame->outgoing_arguments_size = 0;
8800 /* Align stack boundary. Only needed if we're calling another function
8802 if (!current_function_is_leaf || cfun->calls_alloca
8803 || ix86_current_function_calls_tls_descriptor)
8804 offset = (offset + preferred_alignment - 1) & -preferred_alignment;
8806 /* We've reached end of stack frame. */
8807 frame->stack_pointer_offset = offset;
8809 /* The size the prologue needs to allocate. */
8810 to_allocate = offset - frame->sse_reg_save_offset;
8812 if ((!to_allocate && frame->nregs <= 1)
8813 || (TARGET_64BIT && to_allocate >= (HOST_WIDE_INT) 0x80000000))
8814 frame->save_regs_using_mov = false;
8816 if (ix86_using_red_zone ()
8817 && current_function_sp_is_unchanging
8818 && current_function_is_leaf
8819 && !ix86_current_function_calls_tls_descriptor)
8821 frame->red_zone_size = to_allocate;
8822 if (frame->save_regs_using_mov)
8823 frame->red_zone_size += frame->nregs * UNITS_PER_WORD;
8824 if (frame->red_zone_size > RED_ZONE_SIZE - RED_ZONE_RESERVE)
8825 frame->red_zone_size = RED_ZONE_SIZE - RED_ZONE_RESERVE;
8828 frame->red_zone_size = 0;
8829 frame->stack_pointer_offset -= frame->red_zone_size;
8831 /* The SEH frame pointer location is near the bottom of the frame.
8832 This is enforced by the fact that the difference between the
8833 stack pointer and the frame pointer is limited to 240 bytes in
8834 the unwind data structure. */
8839 /* If we can leave the frame pointer where it is, do so. */
8840 diff = frame->stack_pointer_offset - frame->hard_frame_pointer_offset;
8841 if (diff > 240 || (diff & 15) != 0)
8843 /* Ideally we'd determine what portion of the local stack frame
8844 (within the constraint of the lowest 240) is most heavily used.
8845 But without that complication, simply bias the frame pointer
8846 by 128 bytes so as to maximize the amount of the local stack
8847 frame that is addressable with 8-bit offsets. */
8848 frame->hard_frame_pointer_offset = frame->stack_pointer_offset - 128;
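/* The layout code above repeatedly rounds offsets up with the idiom
   (offset + align - 1) & -align, valid for power-of-two ALIGN.  A
   stand-alone sketch of the idiom (illustrative, not part of the
   build):  */
#if 0
static HOST_WIDE_INT
align_up (HOST_WIDE_INT offset, HOST_WIDE_INT align)
{
  /* E.g. align_up (13, 16) == 16 and align_up (32, 16) == 32.  */
  return (offset + align - 1) & -align;
}
#endif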
8853 /* This is semi-inlined memory_address_length, but simplified
8854 since we know that we're always dealing with reg+offset, and
8855 to avoid having to create and discard all that rtl. */
8858 choose_baseaddr_len (unsigned int regno, HOST_WIDE_INT offset)
8864 /* EBP and R13 cannot be encoded without an offset. */
8865 len = (regno == BP_REG || regno == R13_REG);
8867 else if (IN_RANGE (offset, -128, 127))
8870 /* ESP and R12 must be encoded with a SIB byte. */
8871 if (regno == SP_REG || regno == R12_REG)
8877 /* Return an RTX that points to CFA_OFFSET within the stack frame.
8878 The valid base registers are taken from CFUN->MACHINE->FS. */
8881 choose_baseaddr (HOST_WIDE_INT cfa_offset)
8883 const struct machine_function *m = cfun->machine;
8884 rtx base_reg = NULL;
8885 HOST_WIDE_INT base_offset = 0;
8887 if (m->use_fast_prologue_epilogue)
8889 /* Choose the base register most likely to allow the most scheduling
8890 opportunities. Generally FP is valid throughout the function,
8891 while DRAP must be reloaded within the epilogue. But choose either
8892 over the SP due to increased encoding size. */
8896 base_reg = hard_frame_pointer_rtx;
8897 base_offset = m->fs.fp_offset - cfa_offset;
8899 else if (m->fs.drap_valid)
8901 base_reg = crtl->drap_reg;
8902 base_offset = 0 - cfa_offset;
8904 else if (m->fs.sp_valid)
8906 base_reg = stack_pointer_rtx;
8907 base_offset = m->fs.sp_offset - cfa_offset;
8912 HOST_WIDE_INT toffset;
8915 /* Choose the base register with the smallest address encoding.
8916 With a tie, choose FP > DRAP > SP. */
8919 base_reg = stack_pointer_rtx;
8920 base_offset = m->fs.sp_offset - cfa_offset;
8921 len = choose_baseaddr_len (STACK_POINTER_REGNUM, base_offset);
8923 if (m->fs.drap_valid)
8925 toffset = 0 - cfa_offset;
8926 tlen = choose_baseaddr_len (REGNO (crtl->drap_reg), toffset);
8929 base_reg = crtl->drap_reg;
8930 base_offset = toffset;
8936 toffset = m->fs.fp_offset - cfa_offset;
8937 tlen = choose_baseaddr_len (HARD_FRAME_POINTER_REGNUM, toffset);
8940 base_reg = hard_frame_pointer_rtx;
8941 base_offset = toffset;
8946 gcc_assert (base_reg != NULL);
8948 return plus_constant (base_reg, base_offset);
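/* A worked example with hypothetical offsets: addressing CFA - 8 with
   m->fs.sp_offset == 24 gives base_offset == 16, which fits in a
   disp8; since ESP also needs a SIB byte, choose_baseaddr_len returns
   2 for it, while EBP with the same offset needs only the disp8,
   length 1.  */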
8951 /* Emit code to save registers in the prologue. */
8954 ix86_emit_save_regs (void)
8959 for (regno = FIRST_PSEUDO_REGISTER - 1; regno-- > 0; )
8960 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
8962 insn = emit_insn (gen_push (gen_rtx_REG (Pmode, regno)));
8963 RTX_FRAME_RELATED_P (insn) = 1;
8967 /* Emit a single register save at CFA - CFA_OFFSET. */
8970 ix86_emit_save_reg_using_mov (enum machine_mode mode, unsigned int regno,
8971 HOST_WIDE_INT cfa_offset)
8973 struct machine_function *m = cfun->machine;
8974 rtx reg = gen_rtx_REG (mode, regno);
8975 rtx mem, addr, base, insn;
8977 addr = choose_baseaddr (cfa_offset);
8978 mem = gen_frame_mem (mode, addr);
8980 /* For SSE saves, we need to indicate the 128-bit alignment. */
8981 set_mem_align (mem, GET_MODE_ALIGNMENT (mode));
8983 insn = emit_move_insn (mem, reg);
8984 RTX_FRAME_RELATED_P (insn) = 1;
8987 if (GET_CODE (base) == PLUS)
8988 base = XEXP (base, 0);
8989 gcc_checking_assert (REG_P (base));
8991 /* When saving registers into a re-aligned local stack frame, avoid
8992 any tricky guessing by dwarf2out. */
8993 if (m->fs.realigned)
8995 gcc_checking_assert (stack_realign_drap);
8997 if (regno == REGNO (crtl->drap_reg))
8999 /* A bit of a hack. We force the DRAP register to be saved in
9000 the re-aligned stack frame, which provides us with a copy
9001 of the CFA that will last past the prologue. Install it. */
9002 gcc_checking_assert (cfun->machine->fs.fp_valid);
9003 addr = plus_constant (hard_frame_pointer_rtx,
9004 cfun->machine->fs.fp_offset - cfa_offset);
9005 mem = gen_rtx_MEM (mode, addr);
9006 add_reg_note (insn, REG_CFA_DEF_CFA, mem);
9010 /* The frame pointer is a stable reference within the
9011 aligned frame. Use it. */
9012 gcc_checking_assert (cfun->machine->fs.fp_valid);
9013 addr = plus_constant (hard_frame_pointer_rtx,
9014 cfun->machine->fs.fp_offset - cfa_offset);
9015 mem = gen_rtx_MEM (mode, addr);
9016 add_reg_note (insn, REG_CFA_EXPRESSION,
9017 gen_rtx_SET (VOIDmode, mem, reg));
9021 /* The memory may not be relative to the current CFA register,
9022 which means that we may need to generate a new pattern for
9023 use by the unwind info. */
9024 else if (base != m->fs.cfa_reg)
9026 addr = plus_constant (m->fs.cfa_reg, m->fs.cfa_offset - cfa_offset);
9027 mem = gen_rtx_MEM (mode, addr);
9028 add_reg_note (insn, REG_CFA_OFFSET, gen_rtx_SET (VOIDmode, mem, reg));
9032 /* Emit code to save registers using MOV insns.
9033 First register is stored at CFA - CFA_OFFSET. */
9035 ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
9039 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9040 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9042 ix86_emit_save_reg_using_mov (Pmode, regno, cfa_offset);
9043 cfa_offset -= UNITS_PER_WORD;
9047 /* Emit code to save SSE registers using MOV insns.
9048 First register is stored at CFA - CFA_OFFSET. */
9050 ix86_emit_save_sse_regs_using_mov (HOST_WIDE_INT cfa_offset)
9054 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
9055 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
9057 ix86_emit_save_reg_using_mov (V4SFmode, regno, cfa_offset);
9062 static GTY(()) rtx queued_cfa_restores;
9064 /* Add a REG_CFA_RESTORE REG note to INSN, or queue it until the next
9065 stack manipulation insn. The value is on the stack at CFA - CFA_OFFSET.
9066 Don't add the note if the previously saved value will be left untouched
9067 within the stack red zone until return, as unwinders can find the same
9068 value in the register and on the stack. */
9071 ix86_add_cfa_restore_note (rtx insn, rtx reg, HOST_WIDE_INT cfa_offset)
9073 if (cfa_offset <= cfun->machine->fs.red_zone_offset)
9078 add_reg_note (insn, REG_CFA_RESTORE, reg);
9079 RTX_FRAME_RELATED_P (insn) = 1;
9083 = alloc_reg_note (REG_CFA_RESTORE, reg, queued_cfa_restores);
9086 /* Add queued REG_CFA_RESTORE notes if any to INSN. */
9089 ix86_add_queued_cfa_restore_notes (rtx insn)
9092 if (!queued_cfa_restores)
9094 for (last = queued_cfa_restores; XEXP (last, 1); last = XEXP (last, 1))
9096 XEXP (last, 1) = REG_NOTES (insn);
9097 REG_NOTES (insn) = queued_cfa_restores;
9098 queued_cfa_restores = NULL_RTX;
9099 RTX_FRAME_RELATED_P (insn) = 1;
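/* Illustrative note: queued_cfa_restores is a note chain; the loop
   above walks to its last element and splices the whole queue in
   front of INSN's existing notes, so a queue of {A, B} attached to an
   insn that already has notes {C} yields notes {A, B, C}.  */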
9102 /* Expand prologue or epilogue stack adjustment.
9103 The pattern exists to put a dependency on all ebp-based memory accesses.
9104 STYLE should be negative if instructions should be marked as frame related,
9105 zero if the %r11 register is live and cannot be freely used, and positive
9109 pro_epilogue_adjust_stack (rtx dest, rtx src, rtx offset,
9110 int style, bool set_cfa)
9112 struct machine_function *m = cfun->machine;
9114 bool add_frame_related_expr = false;
9117 insn = gen_pro_epilogue_adjust_stack_si_add (dest, src, offset);
9118 else if (x86_64_immediate_operand (offset, DImode))
9119 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, offset);
9123 /* r11 is used by indirect sibcall return as well, set before the
9124 epilogue and used after the epilogue. */
9126 tmp = gen_rtx_REG (DImode, R11_REG);
9129 gcc_assert (src != hard_frame_pointer_rtx
9130 && dest != hard_frame_pointer_rtx);
9131 tmp = hard_frame_pointer_rtx;
9133 insn = emit_insn (gen_rtx_SET (DImode, tmp, offset));
9135 add_frame_related_expr = true;
9137 insn = gen_pro_epilogue_adjust_stack_di_add (dest, src, tmp);
9140 insn = emit_insn (insn);
9142 ix86_add_queued_cfa_restore_notes (insn);
9148 gcc_assert (m->fs.cfa_reg == src);
9149 m->fs.cfa_offset += INTVAL (offset);
9150 m->fs.cfa_reg = dest;
9152 r = gen_rtx_PLUS (Pmode, src, offset);
9153 r = gen_rtx_SET (VOIDmode, dest, r);
9154 add_reg_note (insn, REG_CFA_ADJUST_CFA, r);
9155 RTX_FRAME_RELATED_P (insn) = 1;
9159 RTX_FRAME_RELATED_P (insn) = 1;
9160 if (add_frame_related_expr)
9162 rtx r = gen_rtx_PLUS (Pmode, src, offset);
9163 r = gen_rtx_SET (VOIDmode, dest, r);
9164 add_reg_note (insn, REG_FRAME_RELATED_EXPR, r);
9168 if (dest == stack_pointer_rtx)
9170 HOST_WIDE_INT ooffset = m->fs.sp_offset;
9171 bool valid = m->fs.sp_valid;
9173 if (src == hard_frame_pointer_rtx)
9175 valid = m->fs.fp_valid;
9176 ooffset = m->fs.fp_offset;
9178 else if (src == crtl->drap_reg)
9180 valid = m->fs.drap_valid;
9185 /* Else there are two possibilities: SP itself, which we set
9186 up as the default above, or EH_RETURN_STACKADJ_RTX, which is
9187 taken care of by hand along the eh_return path. */
9188 gcc_checking_assert (src == stack_pointer_rtx
9189 || offset == const0_rtx);
9192 m->fs.sp_offset = ooffset - INTVAL (offset);
9193 m->fs.sp_valid = valid;
9197 /* Find an available register to be used as the dynamic realign
9198 argument pointer register. Such a register will be written in the
9199 prologue and used at the beginning of the body, so it must not be
9200 1. a parameter passing register.
9202 We reuse the static-chain register if it is available. Otherwise, we
9203 use DI for i386 and R13 for x86-64. We chose R13 since it has
9206 Return: the regno of the chosen register. */
9209 find_drap_reg (void)
9211 tree decl = cfun->decl;
9215 /* Use R13 for a nested function or a function that needs a static
9216 chain. Since a function with a tail call may use any caller-saved
9217 register in the epilogue, DRAP must not use a caller-saved
9218 register in that case. */
9219 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9226 /* Use DI for a nested function or a function that needs a static
9227 chain. Since a function with a tail call may use any caller-saved
9228 register in the epilogue, DRAP must not use a caller-saved
9229 register in that case. */
9230 if (DECL_STATIC_CHAIN (decl) || crtl->tail_call_emit)
9233 /* Reuse the static chain register if it isn't used for parameter
9235 if (ix86_function_regparm (TREE_TYPE (decl), decl) <= 2)
9237 unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (decl));
9238 if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) == 0)
9245 /* Return minimum incoming stack alignment. */
9248 ix86_minimum_incoming_stack_boundary (bool sibcall)
9250 unsigned int incoming_stack_boundary;
9252 /* Prefer the one specified at command line. */
9253 if (ix86_user_incoming_stack_boundary)
9254 incoming_stack_boundary = ix86_user_incoming_stack_boundary;
9255 /* In 32-bit mode, use MIN_STACK_BOUNDARY for the incoming stack boundary
9256 if -mstackrealign is used, this isn't a sibcall check, and the
9257 estimated stack alignment is 128 bits. */
9260 && ix86_force_align_arg_pointer
9261 && crtl->stack_alignment_estimated == 128)
9262 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9264 incoming_stack_boundary = ix86_default_incoming_stack_boundary;
9266 /* Incoming stack alignment can be changed on individual functions
9267 via force_align_arg_pointer attribute. We use the smallest
9268 incoming stack boundary. */
9269 if (incoming_stack_boundary > MIN_STACK_BOUNDARY
9270 && lookup_attribute (ix86_force_align_arg_pointer_string,
9271 TYPE_ATTRIBUTES (TREE_TYPE (current_function_decl))))
9272 incoming_stack_boundary = MIN_STACK_BOUNDARY;
9274 /* The incoming stack frame has to be aligned at least at
9275 parm_stack_boundary. */
9276 if (incoming_stack_boundary < crtl->parm_stack_boundary)
9277 incoming_stack_boundary = crtl->parm_stack_boundary;
9279 /* The stack at the entry of main is aligned by the runtime. We use
9280 the smallest incoming stack boundary. */
9281 if (incoming_stack_boundary > MAIN_STACK_BOUNDARY
9282 && DECL_NAME (current_function_decl)
9283 && MAIN_NAME_P (DECL_NAME (current_function_decl))
9284 && DECL_FILE_SCOPE_P (current_function_decl))
9285 incoming_stack_boundary = MAIN_STACK_BOUNDARY;
9287 return incoming_stack_boundary;
9290 /* Update incoming stack boundary and estimated stack alignment. */
9293 ix86_update_stack_boundary (void)
9295 ix86_incoming_stack_boundary
9296 = ix86_minimum_incoming_stack_boundary (false);
9298 /* x86_64 varargs need 16-byte stack alignment for the register save
9302 && crtl->stack_alignment_estimated < 128)
9303 crtl->stack_alignment_estimated = 128;
9306 /* Handle the TARGET_GET_DRAP_RTX hook. Return NULL if no DRAP is
9307 needed or an rtx for DRAP otherwise. */
9310 ix86_get_drap_rtx (void)
9312 if (ix86_force_drap || !ACCUMULATE_OUTGOING_ARGS)
9313 crtl->need_drap = true;
9315 if (stack_realign_drap)
9317 /* Assign DRAP to vDRAP and return vDRAP. */
9318 unsigned int regno = find_drap_reg ();
9323 arg_ptr = gen_rtx_REG (Pmode, regno);
9324 crtl->drap_reg = arg_ptr;
9327 drap_vreg = copy_to_reg (arg_ptr);
9331 insn = emit_insn_before (seq, NEXT_INSN (entry_of_function ()));
9334 add_reg_note (insn, REG_CFA_SET_VDRAP, drap_vreg);
9335 RTX_FRAME_RELATED_P (insn) = 1;
9343 /* Handle the TARGET_INTERNAL_ARG_POINTER hook. */
9346 ix86_internal_arg_pointer (void)
9348 return virtual_incoming_args_rtx;
9351 struct scratch_reg {
9356 /* Return a short-lived scratch register for use on function entry.
9357 In 32-bit mode, it is valid only after the registers are saved
9358 in the prologue. This register must be released by means of
9359 release_scratch_register_on_entry once it is dead. */
9362 get_scratch_register_on_entry (struct scratch_reg *sr)
9370 /* We always use R11 in 64-bit mode. */
9375 tree decl = current_function_decl, fntype = TREE_TYPE (decl);
9377 = lookup_attribute ("fastcall", TYPE_ATTRIBUTES (fntype)) != NULL_TREE;
9378 bool static_chain_p = DECL_STATIC_CHAIN (decl);
9379 int regparm = ix86_function_regparm (fntype, decl);
9381 = crtl->drap_reg ? REGNO (crtl->drap_reg) : INVALID_REGNUM;
9383 /* 'fastcall' sets regparm to 2, uses ecx/edx for arguments and eax
9384 for the static chain register. */
9385 if ((regparm < 1 || (fastcall_p && !static_chain_p))
9386 && drap_regno != AX_REG)
9388 else if (regparm < 2 && drap_regno != DX_REG)
9390 /* ecx is the static chain register. */
9391 else if (regparm < 3 && !fastcall_p && !static_chain_p
9392 && drap_regno != CX_REG)
9394 else if (ix86_save_reg (BX_REG, true))
9396 /* esi is the static chain register. */
9397 else if (!(regparm == 3 && static_chain_p)
9398 && ix86_save_reg (SI_REG, true))
9400 else if (ix86_save_reg (DI_REG, true))
9404 regno = (drap_regno == AX_REG ? DX_REG : AX_REG);
9409 sr->reg = gen_rtx_REG (Pmode, regno);
9412 rtx insn = emit_insn (gen_push (sr->reg));
9413 RTX_FRAME_RELATED_P (insn) = 1;
9417 /* Release a scratch register obtained from the preceding function. */
9420 release_scratch_register_on_entry (struct scratch_reg *sr)
9424 rtx x, insn = emit_insn (gen_pop (sr->reg));
9426 /* The RTX_FRAME_RELATED_P mechanism doesn't know about pop. */
9427 RTX_FRAME_RELATED_P (insn) = 1;
9428 x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, GEN_INT (UNITS_PER_WORD));
9429 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
9430 add_reg_note (insn, REG_FRAME_RELATED_EXPR, x);
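/* An illustrative usage sketch (not from the original source); the
   stack-probing code below uses the pair in exactly this
   get/use/release pattern:  */
#if 0
{
  struct scratch_reg sr;
  get_scratch_register_on_entry (&sr);
  /* ... use sr.reg as a temporary, e.g. as a probe loop counter ...  */
  emit_move_insn (sr.reg, GEN_INT (-4096));
  release_scratch_register_on_entry (&sr);
}
#endif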
9434 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
9436 /* Emit code to adjust the stack pointer by SIZE bytes while probing it. */
9439 ix86_adjust_stack_and_probe (const HOST_WIDE_INT size)
9441 /* We skip the probe for the first interval + a small dope of 4 words and
9442 probe that many bytes past the specified size to maintain a protection
9443 area at the bottom of the stack. */
9444 const int dope = 4 * UNITS_PER_WORD;
9445 rtx size_rtx = GEN_INT (size), last;
9447 /* See if we have a constant small number of probes to generate. If so,
9448 that's the easy case. The run-time loop is made up of 11 insns in the
9449 generic case while the compile-time loop is made up of 3+2*(n-1) insns
9450 for n # of intervals. */
9451 if (size <= 5 * PROBE_INTERVAL)
9453 HOST_WIDE_INT i, adjust;
9454 bool first_probe = true;
9456 /* Adjust SP and probe at PROBE_INTERVAL + N * PROBE_INTERVAL for
9457 values of N from 1 until it exceeds SIZE. If only one probe is
9458 needed, this will not generate any code. Then adjust and probe
9459 to PROBE_INTERVAL + SIZE. */
9460 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9464 adjust = 2 * PROBE_INTERVAL + dope;
9465 first_probe = false;
9468 adjust = PROBE_INTERVAL;
9470 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9471 plus_constant (stack_pointer_rtx, -adjust)));
9472 emit_stack_probe (stack_pointer_rtx);
9476 adjust = size + PROBE_INTERVAL + dope;
9478 adjust = size + PROBE_INTERVAL - i;
9480 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9481 plus_constant (stack_pointer_rtx, -adjust)));
9482 emit_stack_probe (stack_pointer_rtx);
9484 /* Adjust back to account for the additional first interval. */
9485 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9486 plus_constant (stack_pointer_rtx,
9487 PROBE_INTERVAL + dope)));
9490 /* Otherwise, do the same as above, but in a loop. Note that we must be
9491 extra careful with variables wrapping around because we might be at
9492 the very top (or the very bottom) of the address space and we have
9493 to be able to handle this case properly; in particular, we use an
9494 equality test for the loop condition. */
9497 HOST_WIDE_INT rounded_size;
9498 struct scratch_reg sr;
9500 get_scratch_register_on_entry (&sr);
9503 /* Step 1: round SIZE to the previous multiple of the interval. */
9505 rounded_size = size & -PROBE_INTERVAL;
9508 /* Step 2: compute initial and final value of the loop counter. */
9510 /* SP = SP_0 + PROBE_INTERVAL. */
9511 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9512 plus_constant (stack_pointer_rtx,
9513 - (PROBE_INTERVAL + dope))));
9515 /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE. */
9516 emit_move_insn (sr.reg, GEN_INT (-rounded_size));
9517 emit_insn (gen_rtx_SET (VOIDmode, sr.reg,
9518 gen_rtx_PLUS (Pmode, sr.reg,
9519 stack_pointer_rtx)));
9524 while (SP != LAST_ADDR)
9526 SP = SP + PROBE_INTERVAL
9530 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
9531 values of N from 1 until it is equal to ROUNDED_SIZE. */
9533 emit_insn (ix86_gen_adjust_stack_and_probe (sr.reg, sr.reg, size_rtx));
9536 /* Step 4: adjust SP and probe at PROBE_INTERVAL + SIZE if we cannot
9537 assert at compile-time that SIZE is equal to ROUNDED_SIZE. */
9539 if (size != rounded_size)
9541 emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9542 plus_constant (stack_pointer_rtx,
9543 rounded_size - size)));
9544 emit_stack_probe (stack_pointer_rtx);
9547 /* Adjust back to account for the additional first interval. */
9548 last = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9549 plus_constant (stack_pointer_rtx,
9550 PROBE_INTERVAL + dope)));
9552 release_scratch_register_on_entry (&sr);
9555 gcc_assert (cfun->machine->fs.cfa_reg != stack_pointer_rtx);
9557 /* Even if the stack pointer isn't the CFA register, we need to correctly
9558 describe the adjustments made to it, in particular differentiate the
9559 frame-related ones from the frame-unrelated ones. */
9562 rtx expr = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (2));
9563 XVECEXP (expr, 0, 0)
9564 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9565 plus_constant (stack_pointer_rtx, -size));
9566 XVECEXP (expr, 0, 1)
9567 = gen_rtx_SET (VOIDmode, stack_pointer_rtx,
9568 plus_constant (stack_pointer_rtx,
9569 PROBE_INTERVAL + dope + size));
9570 add_reg_note (last, REG_FRAME_RELATED_EXPR, expr);
9571 RTX_FRAME_RELATED_P (last) = 1;
9573 cfun->machine->fs.sp_offset += size;
9576 /* Make sure nothing is scheduled before we are done. */
9577 emit_insn (gen_blockage ());
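/* For illustration, with PROBE_INTERVAL == 4096 the loop emitted via
   ix86_gen_adjust_stack_and_probe expands (see
   output_adjust_stack_and_probe below) to roughly:

	.LPSRL0:	cmp	%reg, %esp
			je	.LPSRE0
			sub	$4096, %esp
			orl	$0, (%esp)
			jmp	.LPSRL0
	.LPSRE0:

   i.e. the stack pointer is moved one interval at a time and each new
   page is touched before moving on.  */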
9580 /* Adjust the stack pointer up to REG while probing it. */
9583 output_adjust_stack_and_probe (rtx reg)
9585 static int labelno = 0;
9586 char loop_lab[32], end_lab[32];
9589 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9590 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9592 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9594 /* Jump to END_LAB if SP == LAST_ADDR. */
9595 xops[0] = stack_pointer_rtx;
9597 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9598 fputs ("\tje\t", asm_out_file);
9599 assemble_name_raw (asm_out_file, end_lab);
9600 fputc ('\n', asm_out_file);
9602 /* SP = SP + PROBE_INTERVAL. */
9603 xops[1] = GEN_INT (PROBE_INTERVAL);
9604 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9607 xops[1] = const0_rtx;
9608 output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
9610 fprintf (asm_out_file, "\tjmp\t");
9611 assemble_name_raw (asm_out_file, loop_lab);
9612 fputc ('\n', asm_out_file);
9614 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9619 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
9620 inclusive. These are offsets from the current stack pointer. */
9623 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
9625 /* See if we have a constant small number of probes to generate. If so,
9626 that's the easy case. The run-time loop is made up of 7 insns in the
9627 generic case while the compile-time loop is made up of n insns for n #
9629 if (size <= 7 * PROBE_INTERVAL)
9633 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 1 until
9634 it exceeds SIZE. If only one probe is needed, this will not
9635 generate any code. Then probe at FIRST + SIZE. */
9636 for (i = PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
9637 emit_stack_probe (plus_constant (stack_pointer_rtx, -(first + i)));
9639 emit_stack_probe (plus_constant (stack_pointer_rtx, -(first + size)));
9642 /* Otherwise, do the same as above, but in a loop. Note that we must be
9643 extra careful with variables wrapping around because we might be at
9644 the very top (or the very bottom) of the address space and we have
9645 to be able to handle this case properly; in particular, we use an
9646 equality test for the loop condition. */
9649 HOST_WIDE_INT rounded_size, last;
9650 struct scratch_reg sr;
9652 get_scratch_register_on_entry (&sr);
9655 /* Step 1: round SIZE to the previous multiple of the interval. */
9657 rounded_size = size & -PROBE_INTERVAL;
9660 /* Step 2: compute initial and final value of the loop counter. */
9662 /* TEST_OFFSET = FIRST. */
9663 emit_move_insn (sr.reg, GEN_INT (-first));
9665 /* LAST_OFFSET = FIRST + ROUNDED_SIZE. */
9666 last = first + rounded_size;
9671 while (TEST_ADDR != LAST_ADDR)
9673 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
9677 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
9678 until it is equal to ROUNDED_SIZE. */
9680 emit_insn (ix86_gen_probe_stack_range (sr.reg, sr.reg, GEN_INT (-last)));
9683 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
9684 that SIZE is equal to ROUNDED_SIZE. */
9686 if (size != rounded_size)
9687 emit_stack_probe (plus_constant (gen_rtx_PLUS (Pmode,
9690 rounded_size - size));
9692 release_scratch_register_on_entry (&sr);
9695 /* Make sure nothing is scheduled before we are done. */
9696 emit_insn (gen_blockage ());
9699 /* Probe a range of stack addresses from REG to END, inclusive. These are
9700 offsets from the current stack pointer. */
9703 output_probe_stack_range (rtx reg, rtx end)
9705 static int labelno = 0;
9706 char loop_lab[32], end_lab[32];
9709 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
9710 ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
9712 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
9714 /* Jump to END_LAB if TEST_ADDR == LAST_ADDR. */
9717 output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
9718 fputs ("\tje\t", asm_out_file);
9719 assemble_name_raw (asm_out_file, end_lab);
9720 fputc ('\n', asm_out_file);
9722 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
9723 xops[1] = GEN_INT (PROBE_INTERVAL);
9724 output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
9726 /* Probe at TEST_ADDR. */
9727 xops[0] = stack_pointer_rtx;
9729 xops[2] = const0_rtx;
9730 output_asm_insn ("or%z0\t{%2, (%0,%1)|DWORD PTR [%0+%1], %2}", xops);
9732 fprintf (asm_out_file, "\tjmp\t");
9733 assemble_name_raw (asm_out_file, loop_lab);
9734 fputc ('\n', asm_out_file);
9736 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
9741 /* Finalize the stack_realign_needed flag, which guides the prologue and
9742 epilogue to be generated in the correct form. */
9744 ix86_finalize_stack_realign_flags (void)
9746 /* Check if stack realignment is really needed after reload, and
9747 store the result in cfun. */
9748 unsigned int incoming_stack_boundary
9749 = (crtl->parm_stack_boundary > ix86_incoming_stack_boundary
9750 ? crtl->parm_stack_boundary : ix86_incoming_stack_boundary);
9751 unsigned int stack_realign = (incoming_stack_boundary
9752 < (current_function_is_leaf
9753 ? crtl->max_used_stack_slot_alignment
9754 : crtl->stack_alignment_needed));
9756 if (crtl->stack_realign_finalized)
9758 /* After stack_realign_needed is finalized, we can no longer
9760 gcc_assert (crtl->stack_realign_needed == stack_realign);
9764 crtl->stack_realign_needed = stack_realign;
9765 crtl->stack_realign_finalized = true;
9769 /* Expand the prologue into a bunch of separate insns. */
9772 ix86_expand_prologue (void)
9774 struct machine_function *m = cfun->machine;
9777 struct ix86_frame frame;
9778 HOST_WIDE_INT allocate;
9779 bool int_registers_saved;
9781 ix86_finalize_stack_realign_flags ();
9783 /* DRAP should not coexist with stack_realign_fp. */
9784 gcc_assert (!(crtl->drap_reg && stack_realign_fp));
9786 memset (&m->fs, 0, sizeof (m->fs));
9788 /* Initialize CFA state for before the prologue. */
9789 m->fs.cfa_reg = stack_pointer_rtx;
9790 m->fs.cfa_offset = INCOMING_FRAME_SP_OFFSET;
9792 /* Track SP offset to the CFA. We continue tracking this after we've
9793 swapped the CFA register away from SP. In the case of re-alignment
9794 this is fudged; we're interested in offsets within the local frame. */
9795 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
9796 m->fs.sp_valid = true;
9798 ix86_compute_frame_layout (&frame);
9800 if (!TARGET_64BIT && ix86_function_ms_hook_prologue (current_function_decl))
9802 /* We should have already generated an error for any use of
9803 ms_hook on a nested function. */
9804 gcc_checking_assert (!ix86_static_chain_on_stack);
9806 /* Check if profiling is active and whether we shall use the
9807 profiling-before-prologue variant. If so, sorry. */
9808 if (crtl->profile && flag_fentry != 0)
9809 sorry ("ms_hook_prologue attribute isn%'t compatible "
9810 "with -mfentry for 32-bit");
9812 /* In ix86_asm_output_function_label we emitted:
9813 8b ff movl.s %edi,%edi
9815 8b ec movl.s %esp,%ebp
9817 This matches the hookable function prologue in Win32 API
9818 functions in Microsoft Windows XP Service Pack 2 and newer.
9819 Wine uses this to enable Windows apps to hook the Win32 API
9820 functions provided by Wine.
9822 What that means is that we've already set up the frame pointer. */
9824 if (frame_pointer_needed
9825 && !(crtl->drap_reg && crtl->stack_realign_needed))
9829 /* We've decided to use the frame pointer already set up.
9830 Describe this to the unwinder by pretending that both
9831 push and mov insns happen right here.
9833 Putting the unwind info here at the end of the ms_hook
9834 is done so that we can make absolutely certain we get
9835 the required byte sequence at the start of the function,
9836 rather than relying on an assembler that can produce
9837 the exact encoding required.
9839 However it does mean (in the unpatched case) that we have
9840 a 1 insn window where the asynchronous unwind info is
9841 incorrect. However, if we placed the unwind info at
9842 its correct location we would have incorrect unwind info
9843 in the patched case. Which is probably all moot since
9844 I don't expect Wine generates dwarf2 unwind info for the
9845 system libraries that use this feature. */
9847 insn = emit_insn (gen_blockage ());
9849 push = gen_push (hard_frame_pointer_rtx);
9850 mov = gen_rtx_SET (VOIDmode, hard_frame_pointer_rtx,
9852 RTX_FRAME_RELATED_P (push) = 1;
9853 RTX_FRAME_RELATED_P (mov) = 1;
9855 RTX_FRAME_RELATED_P (insn) = 1;
9856 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
9857 gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, push, mov)));
9859 /* Note that gen_push incremented m->fs.cfa_offset, even
9860 though we didn't emit the push insn here. */
9861 m->fs.cfa_reg = hard_frame_pointer_rtx;
9862 m->fs.fp_offset = m->fs.cfa_offset;
9863 m->fs.fp_valid = true;
9867 /* The frame pointer is not needed so pop %ebp again.
9868 This leaves us with a pristine state. */
9869 emit_insn (gen_pop (hard_frame_pointer_rtx));
9873 /* The first insn of a function that accepts its static chain on the
9874 stack is to push the register that would be filled in by a direct
9875 call. This insn will be skipped by the trampoline. */
9876 else if (ix86_static_chain_on_stack)
9878 insn = emit_insn (gen_push (ix86_static_chain (cfun->decl, false)));
9879 emit_insn (gen_blockage ());
9881 /* We don't want to interpret this push insn as a register save,
9882 only as a stack adjustment. The real copy of the register as
9883 a save will be done later, if needed. */
9884 t = plus_constant (stack_pointer_rtx, -UNITS_PER_WORD);
9885 t = gen_rtx_SET (VOIDmode, stack_pointer_rtx, t);
9886 add_reg_note (insn, REG_CFA_ADJUST_CFA, t);
9887 RTX_FRAME_RELATED_P (insn) = 1;
9890 /* Emit prologue code to adjust the stack alignment and set up DRAP,
9891 in case DRAP is needed and stack realignment is really needed after reload. */
9892 if (stack_realign_drap)
9894 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
9896 /* Only need to push the parameter pointer reg if it is caller-saved. */
9897 if (!call_used_regs[REGNO (crtl->drap_reg)])
9899 /* Push the arg pointer reg. */
9900 insn = emit_insn (gen_push (crtl->drap_reg));
9901 RTX_FRAME_RELATED_P (insn) = 1;
9904 /* Grab the argument pointer. */
9905 t = plus_constant (stack_pointer_rtx, m->fs.sp_offset);
9906 insn = emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
9907 RTX_FRAME_RELATED_P (insn) = 1;
9908 m->fs.cfa_reg = crtl->drap_reg;
9909 m->fs.cfa_offset = 0;
9911 /* Align the stack. */
9912 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
9914 GEN_INT (-align_bytes)));
9915 RTX_FRAME_RELATED_P (insn) = 1;
9917 /* Replicate the return address on the stack so that the return
9918 address can be reached via the (argp - 1) slot. This is needed
9919 to implement the RETURN_ADDR_RTX macro, the intrinsic function
9920 expand_builtin_return_addr, etc. */
9921 t = plus_constant (crtl->drap_reg, -UNITS_PER_WORD);
9922 t = gen_frame_mem (Pmode, t);
9923 insn = emit_insn (gen_push (t));
9924 RTX_FRAME_RELATED_P (insn) = 1;
9926 /* For the purposes of frame and register save area addressing,
9927 we've started over with a new frame. */
9928 m->fs.sp_offset = INCOMING_FRAME_SP_OFFSET;
9929 m->fs.realigned = true;
9932 if (frame_pointer_needed && !m->fs.fp_valid)
9934 /* Note: AT&T enter does NOT have reversed args. Enter is probably
9935 slower on all targets. Also sdb doesn't like it. */
9936 insn = emit_insn (gen_push (hard_frame_pointer_rtx));
9937 RTX_FRAME_RELATED_P (insn) = 1;
9939 if (m->fs.sp_offset == frame.hard_frame_pointer_offset)
9941 insn = emit_move_insn (hard_frame_pointer_rtx, stack_pointer_rtx);
9942 RTX_FRAME_RELATED_P (insn) = 1;
9944 if (m->fs.cfa_reg == stack_pointer_rtx)
9945 m->fs.cfa_reg = hard_frame_pointer_rtx;
9946 m->fs.fp_offset = m->fs.sp_offset;
9947 m->fs.fp_valid = true;
9951 int_registers_saved = (frame.nregs == 0);
9953 if (!int_registers_saved)
9955 /* If saving registers via PUSH, do so now. */
9956 if (!frame.save_regs_using_mov)
9958 ix86_emit_save_regs ();
9959 int_registers_saved = true;
9960 gcc_assert (m->fs.sp_offset == frame.reg_save_offset);
9963 /* When using the red zone we may start saving registers before
9964 allocating the stack frame, saving one cycle of the prologue. However, avoid
9965 doing this if we have to probe the stack; at least on x86_64 the
9966 stack probe can turn into a call that clobbers a red zone location. */
9967 else if (ix86_using_red_zone ()
9968 && (! TARGET_STACK_PROBE
9969 || frame.stack_pointer_offset < CHECK_STACK_LIMIT))
9971 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
9972 int_registers_saved = true;
9976 if (stack_realign_fp)
9978 int align_bytes = crtl->stack_alignment_needed / BITS_PER_UNIT;
9979 gcc_assert (align_bytes > MIN_STACK_BOUNDARY / BITS_PER_UNIT);
9981 /* The computation of the size of the re-aligned stack frame means
9982 that we must allocate the size of the register save area before
9983 performing the actual alignment. Otherwise we cannot guarantee
9984 that there's enough storage above the realignment point. */
9985 if (m->fs.sp_offset != frame.sse_reg_save_offset)
9986 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
9987 GEN_INT (m->fs.sp_offset
9988 - frame.sse_reg_save_offset),
9991 /* Align the stack. */
9992 insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx,
9994 GEN_INT (-align_bytes)));
9996 /* For the purposes of register save area addressing, the stack
9997 pointer is no longer valid. As for the value of sp_offset,
9998 see ix86_compute_frame_layout, which we need to match in order
9999 to pass verification of stack_pointer_offset at the end. */
10000 m->fs.sp_offset = (m->fs.sp_offset + align_bytes) & -align_bytes;
10001 m->fs.sp_valid = false;
10004 allocate = frame.stack_pointer_offset - m->fs.sp_offset;
10006 if (flag_stack_usage_info)
10008 /* We start to count from ARG_POINTER. */
10009 HOST_WIDE_INT stack_size = frame.stack_pointer_offset;
10011 /* If it was realigned, take into account the fake frame. */
10012 if (stack_realign_drap)
10014 if (ix86_static_chain_on_stack)
10015 stack_size += UNITS_PER_WORD;
10017 if (!call_used_regs[REGNO (crtl->drap_reg)])
10018 stack_size += UNITS_PER_WORD;
10020 /* This over-estimates by 1 minimal-stack-alignment-unit but
10021 mitigates that by counting in the new return address slot. */
10022 current_function_dynamic_stack_size
10023 += crtl->stack_alignment_needed / BITS_PER_UNIT;
10026 current_function_static_stack_size = stack_size;
10029 /* The stack has already been decremented by the instruction calling us
10030 so probe if the size is non-negative to preserve the protection area. */
10031 if (allocate >= 0 && flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10033 /* We expect the registers to be saved when probes are used. */
10034 gcc_assert (int_registers_saved);
10036 if (STACK_CHECK_MOVING_SP)
10038 ix86_adjust_stack_and_probe (allocate);
10043 HOST_WIDE_INT size = allocate;
10045 if (TARGET_64BIT && size >= (HOST_WIDE_INT) 0x80000000)
10046 size = 0x80000000 - STACK_CHECK_PROTECT - 1;
10048 if (TARGET_STACK_PROBE)
10049 ix86_emit_probe_stack_range (0, size + STACK_CHECK_PROTECT);
10051 ix86_emit_probe_stack_range (STACK_CHECK_PROTECT, size);
10057 else if (!ix86_target_stack_probe ()
10058 || frame.stack_pointer_offset < CHECK_STACK_LIMIT)
10060 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10061 GEN_INT (-allocate), -1,
10062 m->fs.cfa_reg == stack_pointer_rtx);
10066 rtx eax = gen_rtx_REG (Pmode, AX_REG);
10068 rtx (*adjust_stack_insn)(rtx, rtx, rtx);
10070 bool eax_live = false;
10071 bool r10_live = false;
10074 r10_live = (DECL_STATIC_CHAIN (current_function_decl) != 0);
10075 if (!TARGET_64BIT_MS_ABI)
10076 eax_live = ix86_eax_live_at_start_p ();
10080 emit_insn (gen_push (eax));
10081 allocate -= UNITS_PER_WORD;
10085 r10 = gen_rtx_REG (Pmode, R10_REG);
10086 emit_insn (gen_push (r10));
10087 allocate -= UNITS_PER_WORD;
10090 emit_move_insn (eax, GEN_INT (allocate));
10091 emit_insn (ix86_gen_allocate_stack_worker (eax, eax));
10093 /* Use the fact that AX still contains ALLOCATE. */
10094 adjust_stack_insn = (TARGET_64BIT
10095 ? gen_pro_epilogue_adjust_stack_di_sub
10096 : gen_pro_epilogue_adjust_stack_si_sub);
10098 insn = emit_insn (adjust_stack_insn (stack_pointer_rtx,
10099 stack_pointer_rtx, eax));
10101 /* Note that SEH directives need to continue tracking the stack
10102 pointer even after the frame pointer has been set up. */
10103 if (m->fs.cfa_reg == stack_pointer_rtx || TARGET_SEH)
10105 if (m->fs.cfa_reg == stack_pointer_rtx)
10106 m->fs.cfa_offset += allocate;
10108 RTX_FRAME_RELATED_P (insn) = 1;
10109 add_reg_note (insn, REG_FRAME_RELATED_EXPR,
10110 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
10111 plus_constant (stack_pointer_rtx,
10114 m->fs.sp_offset += allocate;
10116 if (r10_live && eax_live)
10118 t = choose_baseaddr (m->fs.sp_offset - allocate);
10119 emit_move_insn (r10, gen_frame_mem (Pmode, t));
10120 t = choose_baseaddr (m->fs.sp_offset - allocate - UNITS_PER_WORD);
10121 emit_move_insn (eax, gen_frame_mem (Pmode, t));
10123 else if (eax_live || r10_live)
10125 t = choose_baseaddr (m->fs.sp_offset - allocate);
10126 emit_move_insn ((eax_live ? eax : r10), gen_frame_mem (Pmode, t));
10129 gcc_assert (m->fs.sp_offset == frame.stack_pointer_offset);
10131 /* If we haven't already set up the frame pointer, do so now. */
10132 if (frame_pointer_needed && !m->fs.fp_valid)
10134 insn = ix86_gen_add3 (hard_frame_pointer_rtx, stack_pointer_rtx,
10135 GEN_INT (frame.stack_pointer_offset
10136 - frame.hard_frame_pointer_offset));
10137 insn = emit_insn (insn);
10138 RTX_FRAME_RELATED_P (insn) = 1;
10139 add_reg_note (insn, REG_CFA_ADJUST_CFA, NULL);
10141 if (m->fs.cfa_reg == stack_pointer_rtx)
10142 m->fs.cfa_reg = hard_frame_pointer_rtx;
10143 m->fs.fp_offset = frame.hard_frame_pointer_offset;
10144 m->fs.fp_valid = true;
10147 if (!int_registers_saved)
10148 ix86_emit_save_regs_using_mov (frame.reg_save_offset);
10149 if (frame.nsseregs)
10150 ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset);
10152 pic_reg_used = false;
10153 if (pic_offset_table_rtx
10154 && (df_regs_ever_live_p (REAL_PIC_OFFSET_TABLE_REGNUM)
10157 unsigned int alt_pic_reg_used = ix86_select_alt_pic_regnum ();
10159 if (alt_pic_reg_used != INVALID_REGNUM)
10160 SET_REGNO (pic_offset_table_rtx, alt_pic_reg_used);
10162 pic_reg_used = true;
10169 if (ix86_cmodel == CM_LARGE_PIC)
10171 rtx tmp_reg = gen_rtx_REG (DImode, R11_REG);
10172 rtx label = gen_label_rtx ();
10173 emit_label (label);
10174 LABEL_PRESERVE_P (label) = 1;
10175 gcc_assert (REGNO (pic_offset_table_rtx) != REGNO (tmp_reg));
10176 insn = emit_insn (gen_set_rip_rex64 (pic_offset_table_rtx, label));
10177 insn = emit_insn (gen_set_got_offset_rex64 (tmp_reg, label));
10178 insn = emit_insn (gen_adddi3 (pic_offset_table_rtx,
10179 pic_offset_table_rtx, tmp_reg));
10182 insn = emit_insn (gen_set_got_rex64 (pic_offset_table_rtx));
10186 insn = emit_insn (gen_set_got (pic_offset_table_rtx));
10187 RTX_FRAME_RELATED_P (insn) = 1;
10188 add_reg_note (insn, REG_CFA_FLUSH_QUEUE, NULL_RTX);
/* In the pic_reg_used case, make sure that the GOT load isn't deleted
   when mcount needs it.  Blockage to avoid call movement across mcount
   call is emitted in generic code after the NOTE_INSN_PROLOGUE_END
   note.  */
10196 if (crtl->profile && !flag_fentry && pic_reg_used)
10197 emit_insn (gen_prologue_use (pic_offset_table_rtx));
10199 if (crtl->drap_reg && !crtl->stack_realign_needed)
/* vDRAP is set up, but after reload it turns out that stack realignment
   isn't necessary; here we emit the prologue to set up DRAP without the
   stack-realignment adjustment.  */
10204 t = choose_baseaddr (0);
10205 emit_insn (gen_rtx_SET (VOIDmode, crtl->drap_reg, t));
10208 /* Prevent instructions from being scheduled into register save push
10209 sequence when access to the redzone area is done through frame pointer.
10210 The offset between the frame pointer and the stack pointer is calculated
10211 relative to the value of the stack pointer at the end of the function
10212 prologue, and moving instructions that access redzone area via frame
10213 pointer inside push sequence violates this assumption. */
10214 if (frame_pointer_needed && frame.red_zone_size)
10215 emit_insn (gen_memory_blockage ());
10217 /* Emit cld instruction if stringops are used in the function. */
10218 if (TARGET_CLD && ix86_current_function_needs_cld)
10219 emit_insn (gen_cld ());
10221 /* SEH requires that the prologue end within 256 bytes of the start of
10222 the function. Prevent instruction schedules that would extend that. */
10224 emit_insn (gen_blockage ());
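/* Illustration only (a sketch, not the literal output of the code
   above): with a frame pointer and a small frame, the prologue built
   here boils down to something like

	pushl	%ebp		; save and establish the frame pointer
	movl	%esp, %ebp
	pushl	%ebx		; int register saves
	subl	$N, %esp	; allocate locals (N is a placeholder)

   with RTX_FRAME_RELATED_P set and CFA notes attached so the unwind
   tables track each stack adjustment.  */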
10227 /* Emit code to restore REG using a POP insn. */
10230 ix86_emit_restore_reg_using_pop (rtx reg)
10232 struct machine_function *m = cfun->machine;
10233 rtx insn = emit_insn (gen_pop (reg));
10235 ix86_add_cfa_restore_note (insn, reg, m->fs.sp_offset);
10236 m->fs.sp_offset -= UNITS_PER_WORD;
10238 if (m->fs.cfa_reg == crtl->drap_reg
10239 && REGNO (reg) == REGNO (crtl->drap_reg))
10241 /* Previously we'd represented the CFA as an expression
10242 like *(%ebp - 8). We've just popped that value from
10243 the stack, which means we need to reset the CFA to
10244 the drap register. This will remain until we restore
10245 the stack pointer. */
10246 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10247 RTX_FRAME_RELATED_P (insn) = 1;
10249 /* This means that the DRAP register is valid for addressing too. */
10250 m->fs.drap_valid = true;
10254 if (m->fs.cfa_reg == stack_pointer_rtx)
10256 rtx x = plus_constant (stack_pointer_rtx, UNITS_PER_WORD);
10257 x = gen_rtx_SET (VOIDmode, stack_pointer_rtx, x);
10258 add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
10259 RTX_FRAME_RELATED_P (insn) = 1;
10261 m->fs.cfa_offset -= UNITS_PER_WORD;
10264 /* When the frame pointer is the CFA, and we pop it, we are
10265 swapping back to the stack pointer as the CFA. This happens
10266 for stack frames that don't allocate other data, so we assume
10267 the stack pointer is now pointing at the return address, i.e.
10268 the function entry state, which makes the offset be 1 word. */
10269 if (reg == hard_frame_pointer_rtx)
10271 m->fs.fp_valid = false;
10272 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10274 m->fs.cfa_reg = stack_pointer_rtx;
10275 m->fs.cfa_offset -= UNITS_PER_WORD;
10277 add_reg_note (insn, REG_CFA_DEF_CFA,
10278 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10279 GEN_INT (m->fs.cfa_offset)));
10280 RTX_FRAME_RELATED_P (insn) = 1;
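/* Illustration only: a pop both restores the register and releases one
   word of stack, so when the CFA is based on the stack pointer, e.g.

	popl	%ebx		; esp += 4 on ia32

   the REG_CFA_ADJUST_CFA note above shrinks the recorded CFA offset by
   UNITS_PER_WORD to keep the unwind info consistent.  */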
10285 /* Emit code to restore saved registers using POP insns. */
10288 ix86_emit_restore_regs_using_pop (void)
10290 unsigned int regno;
10292 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10293 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, false))
10294 ix86_emit_restore_reg_using_pop (gen_rtx_REG (Pmode, regno));
10297 /* Emit code and notes for the LEAVE instruction. */
10300 ix86_emit_leave (void)
10302 struct machine_function *m = cfun->machine;
10303 rtx insn = emit_insn (ix86_gen_leave ());
10305 ix86_add_queued_cfa_restore_notes (insn);
10307 gcc_assert (m->fs.fp_valid);
10308 m->fs.sp_valid = true;
10309 m->fs.sp_offset = m->fs.fp_offset - UNITS_PER_WORD;
10310 m->fs.fp_valid = false;
10312 if (m->fs.cfa_reg == hard_frame_pointer_rtx)
10314 m->fs.cfa_reg = stack_pointer_rtx;
10315 m->fs.cfa_offset = m->fs.sp_offset;
10317 add_reg_note (insn, REG_CFA_DEF_CFA,
10318 plus_constant (stack_pointer_rtx, m->fs.sp_offset));
10319 RTX_FRAME_RELATED_P (insn) = 1;
10320 ix86_add_cfa_restore_note (insn, hard_frame_pointer_rtx,
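/* Illustration only: "leave" is architecturally equivalent to

	movl	%ebp, %esp
	popl	%ebp

   which is why the frame state above revalidates the stack pointer,
   rebases sp_offset from fp_offset, and invalidates the frame
   pointer.  */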
10325 /* Emit code to restore saved registers using MOV insns.
10326 First register is restored from CFA - CFA_OFFSET. */
10328 ix86_emit_restore_regs_using_mov (HOST_WIDE_INT cfa_offset,
10329 bool maybe_eh_return)
10331 struct machine_function *m = cfun->machine;
10332 unsigned int regno;
10334 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10335 if (!SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10337 rtx reg = gen_rtx_REG (Pmode, regno);
10340 mem = choose_baseaddr (cfa_offset);
10341 mem = gen_frame_mem (Pmode, mem);
10342 insn = emit_move_insn (reg, mem);
10344 if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
/* Previously we'd represented the CFA as an expression
   like *(%ebp - 8).  We've just reloaded that value from
   the stack (via a move this time rather than a pop), which
   means we need to reset the CFA to the drap register.  This
   will remain until we restore the stack pointer.  */
10351 add_reg_note (insn, REG_CFA_DEF_CFA, reg);
10352 RTX_FRAME_RELATED_P (insn) = 1;
10354 /* This means that the DRAP register is valid for addressing. */
10355 m->fs.drap_valid = true;
10358 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
10360 cfa_offset -= UNITS_PER_WORD;
10364 /* Emit code to restore saved registers using MOV insns.
10365 First register is restored from CFA - CFA_OFFSET. */
10367 ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset,
10368 bool maybe_eh_return)
10370 unsigned int regno;
10372 for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
10373 if (SSE_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return))
10375 rtx reg = gen_rtx_REG (V4SFmode, regno);
10378 mem = choose_baseaddr (cfa_offset);
10379 mem = gen_rtx_MEM (V4SFmode, mem);
10380 set_mem_align (mem, 128);
10381 emit_move_insn (reg, mem);
10383 ix86_add_cfa_restore_note (NULL_RTX, reg, cfa_offset);
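/* Illustration only: there is no "pop" for vector registers, so SSE
   saves come back as aligned 128-bit loads, e.g.

	movaps	16(%esp), %xmm6		; the offset is a placeholder

   hence the V4SFmode MEM above with its alignment forced to 128
   bits.  */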
10389 /* Restore function stack, frame, and registers. */
10392 ix86_expand_epilogue (int style)
10394 struct machine_function *m = cfun->machine;
10395 struct machine_frame_state frame_state_save = m->fs;
10396 struct ix86_frame frame;
10397 bool restore_regs_via_mov;
10400 ix86_finalize_stack_realign_flags ();
10401 ix86_compute_frame_layout (&frame);
10403 m->fs.sp_valid = (!frame_pointer_needed
10404 || (current_function_sp_is_unchanging
10405 && !stack_realign_fp));
10406 gcc_assert (!m->fs.sp_valid
10407 || m->fs.sp_offset == frame.stack_pointer_offset);
10409 /* The FP must be valid if the frame pointer is present. */
10410 gcc_assert (frame_pointer_needed == m->fs.fp_valid);
10411 gcc_assert (!m->fs.fp_valid
10412 || m->fs.fp_offset == frame.hard_frame_pointer_offset);
10414 /* We must have *some* valid pointer to the stack frame. */
10415 gcc_assert (m->fs.sp_valid || m->fs.fp_valid);
10417 /* The DRAP is never valid at this point. */
10418 gcc_assert (!m->fs.drap_valid);
10420 /* See the comment about red zone and frame
10421 pointer usage in ix86_expand_prologue. */
10422 if (frame_pointer_needed && frame.red_zone_size)
10423 emit_insn (gen_memory_blockage ());
10425 using_drap = crtl->drap_reg && crtl->stack_realign_needed;
10426 gcc_assert (!using_drap || m->fs.cfa_reg == crtl->drap_reg);
10428 /* Determine the CFA offset of the end of the red-zone. */
10429 m->fs.red_zone_offset = 0;
10430 if (ix86_using_red_zone () && crtl->args.pops_args < 65536)
10432 /* The red-zone begins below the return address. */
10433 m->fs.red_zone_offset = RED_ZONE_SIZE + UNITS_PER_WORD;
10435 /* When the register save area is in the aligned portion of
10436 the stack, determine the maximum runtime displacement that
10437 matches up with the aligned frame. */
10438 if (stack_realign_drap)
10439 m->fs.red_zone_offset -= (crtl->stack_alignment_needed / BITS_PER_UNIT
10443 /* Special care must be taken for the normal return case of a function
10444 using eh_return: the eax and edx registers are marked as saved, but
10445 not restored along this path. Adjust the save location to match. */
10446 if (crtl->calls_eh_return && style != 2)
10447 frame.reg_save_offset -= 2 * UNITS_PER_WORD;
10449 /* EH_RETURN requires the use of moves to function properly. */
10450 if (crtl->calls_eh_return)
10451 restore_regs_via_mov = true;
10452 /* SEH requires the use of pops to identify the epilogue. */
10453 else if (TARGET_SEH)
10454 restore_regs_via_mov = false;
/* If we're only restoring one register and sp is not valid then
   use a move instruction to restore the register, since it's
   less work than reloading sp and popping the register.  */
10458 else if (!m->fs.sp_valid && frame.nregs <= 1)
10459 restore_regs_via_mov = true;
10460 else if (TARGET_EPILOGUE_USING_MOVE
10461 && cfun->machine->use_fast_prologue_epilogue
10462 && (frame.nregs > 1
10463 || m->fs.sp_offset != frame.reg_save_offset))
10464 restore_regs_via_mov = true;
10465 else if (frame_pointer_needed
10467 && m->fs.sp_offset != frame.reg_save_offset)
10468 restore_regs_via_mov = true;
10469 else if (frame_pointer_needed
10470 && TARGET_USE_LEAVE
10471 && cfun->machine->use_fast_prologue_epilogue
10472 && frame.nregs == 1)
10473 restore_regs_via_mov = true;
10475 restore_regs_via_mov = false;
10477 if (restore_regs_via_mov || frame.nsseregs)
10479 /* Ensure that the entire register save area is addressable via
10480 the stack pointer, if we will restore via sp. */
10482 && m->fs.sp_offset > 0x7fffffff
10483 && !(m->fs.fp_valid || m->fs.drap_valid)
10484 && (frame.nsseregs + frame.nregs) != 0)
10486 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10487 GEN_INT (m->fs.sp_offset
10488 - frame.sse_reg_save_offset),
10490 m->fs.cfa_reg == stack_pointer_rtx);
10494 /* If there are any SSE registers to restore, then we have to do it
10495 via moves, since there's obviously no pop for SSE regs. */
10496 if (frame.nsseregs)
10497 ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset,
10500 if (restore_regs_via_mov)
10505 ix86_emit_restore_regs_using_mov (frame.reg_save_offset, style == 2);
10507 /* eh_return epilogues need %ecx added to the stack pointer. */
10510 rtx insn, sa = EH_RETURN_STACKADJ_RTX;
10512 /* Stack align doesn't work with eh_return. */
10513 gcc_assert (!stack_realign_drap);
/* Neither do regparm nested functions.  */
10515 gcc_assert (!ix86_static_chain_on_stack);
10517 if (frame_pointer_needed)
10519 t = gen_rtx_PLUS (Pmode, hard_frame_pointer_rtx, sa);
10520 t = plus_constant (t, m->fs.fp_offset - UNITS_PER_WORD);
10521 emit_insn (gen_rtx_SET (VOIDmode, sa, t));
10523 t = gen_frame_mem (Pmode, hard_frame_pointer_rtx);
10524 insn = emit_move_insn (hard_frame_pointer_rtx, t);
10526 /* Note that we use SA as a temporary CFA, as the return
10527 address is at the proper place relative to it. We
10528 pretend this happens at the FP restore insn because
10529 prior to this insn the FP would be stored at the wrong
10530 offset relative to SA, and after this insn we have no
10531 other reasonable register to use for the CFA. We don't
10532 bother resetting the CFA to the SP for the duration of
10533 the return insn. */
10534 add_reg_note (insn, REG_CFA_DEF_CFA,
10535 plus_constant (sa, UNITS_PER_WORD));
10536 ix86_add_queued_cfa_restore_notes (insn);
10537 add_reg_note (insn, REG_CFA_RESTORE, hard_frame_pointer_rtx);
10538 RTX_FRAME_RELATED_P (insn) = 1;
10540 m->fs.cfa_reg = sa;
10541 m->fs.cfa_offset = UNITS_PER_WORD;
10542 m->fs.fp_valid = false;
10544 pro_epilogue_adjust_stack (stack_pointer_rtx, sa,
10545 const0_rtx, style, false);
10549 t = gen_rtx_PLUS (Pmode, stack_pointer_rtx, sa);
10550 t = plus_constant (t, m->fs.sp_offset - UNITS_PER_WORD);
10551 insn = emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx, t));
10552 ix86_add_queued_cfa_restore_notes (insn);
10554 gcc_assert (m->fs.cfa_reg == stack_pointer_rtx);
10555 if (m->fs.cfa_offset != UNITS_PER_WORD)
10557 m->fs.cfa_offset = UNITS_PER_WORD;
10558 add_reg_note (insn, REG_CFA_DEF_CFA,
10559 plus_constant (stack_pointer_rtx,
10561 RTX_FRAME_RELATED_P (insn) = 1;
10564 m->fs.sp_offset = UNITS_PER_WORD;
10565 m->fs.sp_valid = true;
10570 /* SEH requires that the function end with (1) a stack adjustment
10571 if necessary, (2) a sequence of pops, and (3) a return or
10572 jump instruction. Prevent insns from the function body from
10573 being scheduled into this sequence. */
/* Prevent a catch region from being adjacent to the standard
   epilogue sequence.  Unfortunately, neither crtl->uses_eh_lsda nor
   several other flags that would be interesting to test are
   set up yet.  */
10580 if (flag_non_call_exceptions)
10581 emit_insn (gen_nops (const1_rtx));
10583 emit_insn (gen_blockage ());
10586 /* First step is to deallocate the stack frame so that we can
10587 pop the registers. */
10588 if (!m->fs.sp_valid)
10590 pro_epilogue_adjust_stack (stack_pointer_rtx, hard_frame_pointer_rtx,
10591 GEN_INT (m->fs.fp_offset
10592 - frame.reg_save_offset),
10595 else if (m->fs.sp_offset != frame.reg_save_offset)
10597 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10598 GEN_INT (m->fs.sp_offset
10599 - frame.reg_save_offset),
10601 m->fs.cfa_reg == stack_pointer_rtx);
10604 ix86_emit_restore_regs_using_pop ();
/* If we used a frame pointer and haven't already got rid of it,
   then do so now.  */
10609 if (m->fs.fp_valid)
10611 /* If the stack pointer is valid and pointing at the frame
10612 pointer store address, then we only need a pop. */
10613 if (m->fs.sp_valid && m->fs.sp_offset == frame.hfp_save_offset)
10614 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
10615 /* Leave results in shorter dependency chains on CPUs that are
10616 able to grok it fast. */
10617 else if (TARGET_USE_LEAVE
10618 || optimize_function_for_size_p (cfun)
10619 || !cfun->machine->use_fast_prologue_epilogue)
10620 ix86_emit_leave ();
10623 pro_epilogue_adjust_stack (stack_pointer_rtx,
10624 hard_frame_pointer_rtx,
10625 const0_rtx, style, !using_drap);
10626 ix86_emit_restore_reg_using_pop (hard_frame_pointer_rtx);
10632 int param_ptr_offset = UNITS_PER_WORD;
10635 gcc_assert (stack_realign_drap);
10637 if (ix86_static_chain_on_stack)
10638 param_ptr_offset += UNITS_PER_WORD;
10639 if (!call_used_regs[REGNO (crtl->drap_reg)])
10640 param_ptr_offset += UNITS_PER_WORD;
10642 insn = emit_insn (gen_rtx_SET
10643 (VOIDmode, stack_pointer_rtx,
10644 gen_rtx_PLUS (Pmode,
10646 GEN_INT (-param_ptr_offset))));
10647 m->fs.cfa_reg = stack_pointer_rtx;
10648 m->fs.cfa_offset = param_ptr_offset;
10649 m->fs.sp_offset = param_ptr_offset;
10650 m->fs.realigned = false;
10652 add_reg_note (insn, REG_CFA_DEF_CFA,
10653 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10654 GEN_INT (param_ptr_offset)));
10655 RTX_FRAME_RELATED_P (insn) = 1;
10657 if (!call_used_regs[REGNO (crtl->drap_reg)])
10658 ix86_emit_restore_reg_using_pop (crtl->drap_reg);
10661 /* At this point the stack pointer must be valid, and we must have
10662 restored all of the registers. We may not have deallocated the
10663 entire stack frame. We've delayed this until now because it may
10664 be possible to merge the local stack deallocation with the
10665 deallocation forced by ix86_static_chain_on_stack. */
10666 gcc_assert (m->fs.sp_valid);
10667 gcc_assert (!m->fs.fp_valid);
10668 gcc_assert (!m->fs.realigned);
10669 if (m->fs.sp_offset != UNITS_PER_WORD)
10671 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10672 GEN_INT (m->fs.sp_offset - UNITS_PER_WORD),
10676 /* Sibcall epilogues don't want a return instruction. */
10679 m->fs = frame_state_save;
10683 /* Emit vzeroupper if needed. */
10684 if (TARGET_VZEROUPPER
10685 && !TREE_THIS_VOLATILE (cfun->decl)
10686 && !cfun->machine->caller_return_avx256_p)
10687 emit_insn (gen_avx_vzeroupper (GEN_INT (call_no_avx256)));
10689 if (crtl->args.pops_args && crtl->args.size)
10691 rtx popc = GEN_INT (crtl->args.pops_args);
/* i386 can only pop 64K bytes.  If asked to pop more, pop the return
   address, do an explicit add, and jump indirectly to the caller.  */
10696 if (crtl->args.pops_args >= 65536)
10698 rtx ecx = gen_rtx_REG (SImode, CX_REG);
10701 /* There is no "pascal" calling convention in any 64bit ABI. */
10702 gcc_assert (!TARGET_64BIT);
10704 insn = emit_insn (gen_pop (ecx));
10705 m->fs.cfa_offset -= UNITS_PER_WORD;
10706 m->fs.sp_offset -= UNITS_PER_WORD;
10708 add_reg_note (insn, REG_CFA_ADJUST_CFA,
10709 copy_rtx (XVECEXP (PATTERN (insn), 0, 1)));
10710 add_reg_note (insn, REG_CFA_REGISTER,
10711 gen_rtx_SET (VOIDmode, ecx, pc_rtx));
10712 RTX_FRAME_RELATED_P (insn) = 1;
10714 pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
10716 emit_jump_insn (gen_return_indirect_internal (ecx));
10719 emit_jump_insn (gen_return_pop_internal (popc));
10722 emit_jump_insn (gen_return_internal ());
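/* Illustration only: "ret $imm16" takes a 16-bit immediate, so a pop
   count of 65536 or more cannot be encoded directly and is emitted as

	popl	%ecx		; fetch the return address
	addl	$POPC, %esp	; drop the argument bytes (POPC placeholder)
	jmp	*%ecx		; return indirectly

   instead of a single "ret $POPC".  */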
10724 /* Restore the state back to the state from the prologue,
10725 so that it's correct for the next epilogue. */
10726 m->fs = frame_state_save;
10729 /* Reset from the function's potential modifications. */
10732 ix86_output_function_epilogue (FILE *file ATTRIBUTE_UNUSED,
10733 HOST_WIDE_INT size ATTRIBUTE_UNUSED)
10735 if (pic_offset_table_rtx)
10736 SET_REGNO (pic_offset_table_rtx, REAL_PIC_OFFSET_TABLE_REGNUM);
10738 /* Mach-O doesn't support labels at the end of objects, so if
10739 it looks like we might want one, insert a NOP. */
10741 rtx insn = get_last_insn ();
10744 && NOTE_KIND (insn) != NOTE_INSN_DELETED_LABEL)
10745 insn = PREV_INSN (insn);
10749 && NOTE_KIND (insn) == NOTE_INSN_DELETED_LABEL)))
10750 fputs ("\tnop\n", file);
10756 /* Return a scratch register to use in the split stack prologue. The
split stack prologue is used for -fsplit-stack.  It consists of the
first instructions in the function, even before the regular prologue.
10759 The scratch register can be any caller-saved register which is not
10760 used for parameters or for the static chain. */
10762 static unsigned int
10763 split_stack_prologue_scratch_regno (void)
10772 is_fastcall = (lookup_attribute ("fastcall",
10773 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
10775 regparm = ix86_function_regparm (TREE_TYPE (cfun->decl), cfun->decl);
10779 if (DECL_STATIC_CHAIN (cfun->decl))
10781 sorry ("-fsplit-stack does not support fastcall with "
10782 "nested function");
10783 return INVALID_REGNUM;
10787 else if (regparm < 3)
10789 if (!DECL_STATIC_CHAIN (cfun->decl))
sorry ("-fsplit-stack does not support 2 register "
       "parameters for a nested function");
10797 return INVALID_REGNUM;
10804 /* FIXME: We could make this work by pushing a register
10805 around the addition and comparison. */
10806 sorry ("-fsplit-stack does not support 3 register parameters");
10807 return INVALID_REGNUM;
/* A SYMBOL_REF for the function which allocates new stack space for
   -fsplit-stack.  */
10815 static GTY(()) rtx split_stack_fn;
/* A SYMBOL_REF for the more-stack function when using the large
   code model.  */
10820 static GTY(()) rtx split_stack_fn_large;
10822 /* Handle -fsplit-stack. These are the first instructions in the
10823 function, even before the regular prologue. */
10826 ix86_expand_split_stack_prologue (void)
10828 struct ix86_frame frame;
10829 HOST_WIDE_INT allocate;
10830 unsigned HOST_WIDE_INT args_size;
10831 rtx label, limit, current, jump_insn, allocate_rtx, call_insn, call_fusage;
10832 rtx scratch_reg = NULL_RTX;
10833 rtx varargs_label = NULL_RTX;
10836 gcc_assert (flag_split_stack && reload_completed);
10838 ix86_finalize_stack_realign_flags ();
10839 ix86_compute_frame_layout (&frame);
10840 allocate = frame.stack_pointer_offset - INCOMING_FRAME_SP_OFFSET;
10842 /* This is the label we will branch to if we have enough stack
10843 space. We expect the basic block reordering pass to reverse this
10844 branch if optimizing, so that we branch in the unlikely case. */
10845 label = gen_label_rtx ();
10847 /* We need to compare the stack pointer minus the frame size with
10848 the stack boundary in the TCB. The stack boundary always gives
10849 us SPLIT_STACK_AVAILABLE bytes, so if we need less than that we
10850 can compare directly. Otherwise we need to do an addition. */
10852 limit = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
10853 UNSPEC_STACK_CHECK);
10854 limit = gen_rtx_CONST (Pmode, limit);
10855 limit = gen_rtx_MEM (Pmode, limit);
10856 if (allocate < SPLIT_STACK_AVAILABLE)
10857 current = stack_pointer_rtx;
10860 unsigned int scratch_regno;
10863 /* We need a scratch register to hold the stack pointer minus
10864 the required frame size. Since this is the very start of the
10865 function, the scratch register can be any caller-saved
10866 register which is not used for parameters. */
10867 offset = GEN_INT (- allocate);
10868 scratch_regno = split_stack_prologue_scratch_regno ();
10869 if (scratch_regno == INVALID_REGNUM)
10871 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
10872 if (!TARGET_64BIT || x86_64_immediate_operand (offset, Pmode))
10874 /* We don't use ix86_gen_add3 in this case because it will
10875 want to split to lea, but when not optimizing the insn
10876 will not be split after this point. */
10877 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
10878 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
10883 emit_move_insn (scratch_reg, offset);
10884 emit_insn (gen_adddi3 (scratch_reg, scratch_reg,
10885 stack_pointer_rtx));
10887 current = scratch_reg;
10890 ix86_expand_branch (GEU, current, limit, label);
10891 jump_insn = get_last_insn ();
10892 JUMP_LABEL (jump_insn) = label;
10894 /* Mark the jump as very likely to be taken. */
10895 add_reg_note (jump_insn, REG_BR_PROB,
10896 GEN_INT (REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100));
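/* REG_BR_PROB_BASE is 10000, so the note above marks the branch as
   taken with probability 9900/10000, i.e. 99%.  */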
10898 if (split_stack_fn == NULL_RTX)
10899 split_stack_fn = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
10900 fn = split_stack_fn;
10902 /* Get more stack space. We pass in the desired stack space and the
10903 size of the arguments to copy to the new stack. In 32-bit mode
10904 we push the parameters; __morestack will return on a new stack
anyhow.  In 64-bit mode we pass the parameters in r10 and r11.  */
10907 allocate_rtx = GEN_INT (allocate);
10908 args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
10909 call_fusage = NULL_RTX;
10914 reg10 = gen_rtx_REG (Pmode, R10_REG);
10915 reg11 = gen_rtx_REG (Pmode, R11_REG);
10917 /* If this function uses a static chain, it will be in %r10.
10918 Preserve it across the call to __morestack. */
10919 if (DECL_STATIC_CHAIN (cfun->decl))
10923 rax = gen_rtx_REG (Pmode, AX_REG);
10924 emit_move_insn (rax, reg10);
10925 use_reg (&call_fusage, rax);
10928 if (ix86_cmodel == CM_LARGE || ix86_cmodel == CM_LARGE_PIC)
10930 HOST_WIDE_INT argval;
10932 /* When using the large model we need to load the address
10933 into a register, and we've run out of registers. So we
10934 switch to a different calling convention, and we call a
different function: __morestack_large_model.  We pass the
10936 argument size in the upper 32 bits of r10 and pass the
10937 frame size in the lower 32 bits. */
10938 gcc_assert ((allocate & (HOST_WIDE_INT) 0xffffffff) == allocate);
10939 gcc_assert ((args_size & 0xffffffff) == args_size);
10941 if (split_stack_fn_large == NULL_RTX)
10942 split_stack_fn_large =
10943 gen_rtx_SYMBOL_REF (Pmode, "__morestack_large_model");
10945 if (ix86_cmodel == CM_LARGE_PIC)
10949 label = gen_label_rtx ();
10950 emit_label (label);
10951 LABEL_PRESERVE_P (label) = 1;
10952 emit_insn (gen_set_rip_rex64 (reg10, label));
10953 emit_insn (gen_set_got_offset_rex64 (reg11, label));
10954 emit_insn (gen_adddi3 (reg10, reg10, reg11));
10955 x = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, split_stack_fn_large),
10957 x = gen_rtx_CONST (Pmode, x);
10958 emit_move_insn (reg11, x);
10959 x = gen_rtx_PLUS (Pmode, reg10, reg11);
10960 x = gen_const_mem (Pmode, x);
10961 emit_move_insn (reg11, x);
10964 emit_move_insn (reg11, split_stack_fn_large);
10968 argval = ((args_size << 16) << 16) + allocate;
10969 emit_move_insn (reg10, GEN_INT (argval));
10973 emit_move_insn (reg10, allocate_rtx);
10974 emit_move_insn (reg11, GEN_INT (args_size));
10975 use_reg (&call_fusage, reg11);
10978 use_reg (&call_fusage, reg10);
10982 emit_insn (gen_push (GEN_INT (args_size)));
10983 emit_insn (gen_push (allocate_rtx));
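/* Illustration only (a sketch, with ALLOCATE and ARGS_SIZE standing in
   for the computed constants): the two conventions used above
   correspond to

   64-bit:	movq	$ALLOCATE, %r10
		movq	$ARGS_SIZE, %r11
		call	__morestack

   32-bit:	pushl	$ARGS_SIZE
		pushl	$ALLOCATE
		call	__morestack

   __morestack returns on a new stack anyhow, as the comment above
   notes.  */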
10985 call_insn = ix86_expand_call (NULL_RTX, gen_rtx_MEM (QImode, fn),
10986 GEN_INT (UNITS_PER_WORD), constm1_rtx,
10988 add_function_usage_to (call_insn, call_fusage);
10990 /* In order to make call/return prediction work right, we now need
10991 to execute a return instruction. See
10992 libgcc/config/i386/morestack.S for the details on how this works.
10994 For flow purposes gcc must not see this as a return
10995 instruction--we need control flow to continue at the subsequent
10996 label. Therefore, we use an unspec. */
10997 gcc_assert (crtl->args.pops_args < 65536);
10998 emit_insn (gen_split_stack_return (GEN_INT (crtl->args.pops_args)));
11000 /* If we are in 64-bit mode and this function uses a static chain,
we saved %r10 in %rax before calling __morestack.  */
11002 if (TARGET_64BIT && DECL_STATIC_CHAIN (cfun->decl))
11003 emit_move_insn (gen_rtx_REG (Pmode, R10_REG),
11004 gen_rtx_REG (Pmode, AX_REG));
11006 /* If this function calls va_start, we need to store a pointer to
11007 the arguments on the old stack, because they may not have been
11008 all copied to the new stack. At this point the old stack can be
11009 found at the frame pointer value used by __morestack, because
11010 __morestack has set that up before calling back to us. Here we
11011 store that pointer in a scratch register, and in
11012 ix86_expand_prologue we store the scratch register in a stack
11014 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11016 unsigned int scratch_regno;
11020 scratch_regno = split_stack_prologue_scratch_regno ();
11021 scratch_reg = gen_rtx_REG (Pmode, scratch_regno);
11022 frame_reg = gen_rtx_REG (Pmode, BP_REG);
/* The stack layout at the frame pointer set up by __morestack is:

   64-bit:
      fp -> old fp value
	    return address within this function
	    return address of caller of this function
	    stack arguments

   So we add three words to get to the stack arguments.

   32-bit:
      fp -> old fp value
	    return address within this function
	    first argument to __morestack
	    second argument to __morestack
	    return address of caller of this function
	    stack arguments

   So we add five words to get to the stack arguments.  */
11040 words = TARGET_64BIT ? 3 : 5;
11041 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11042 gen_rtx_PLUS (Pmode, frame_reg,
11043 GEN_INT (words * UNITS_PER_WORD))));
11045 varargs_label = gen_label_rtx ();
11046 emit_jump_insn (gen_jump (varargs_label));
11047 JUMP_LABEL (get_last_insn ()) = varargs_label;
11052 emit_label (label);
11053 LABEL_NUSES (label) = 1;
11055 /* If this function calls va_start, we now have to set the scratch
11056 register for the case where we do not call __morestack. In this
11057 case we need to set it based on the stack pointer. */
11058 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11060 emit_insn (gen_rtx_SET (VOIDmode, scratch_reg,
11061 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
11062 GEN_INT (UNITS_PER_WORD))));
11064 emit_label (varargs_label);
11065 LABEL_NUSES (varargs_label) = 1;
11069 /* We may have to tell the dataflow pass that the split stack prologue
11070 is initializing a scratch register. */
11073 ix86_live_on_entry (bitmap regs)
11075 if (cfun->machine->split_stack_varargs_pointer != NULL_RTX)
11077 gcc_assert (flag_split_stack);
11078 bitmap_set_bit (regs, split_stack_prologue_scratch_regno ());
11082 /* Extract the parts of an RTL expression that is a valid memory address
11083 for an instruction. Return 0 if the structure of the address is
grossly off.  Return -1 if the address contains ASHIFT, so it is not
strictly valid, but is still used for computing the length of the lea
instruction.  */
11088 ix86_decompose_address (rtx addr, struct ix86_address *out)
11090 rtx base = NULL_RTX, index = NULL_RTX, disp = NULL_RTX;
11091 rtx base_reg, index_reg;
11092 HOST_WIDE_INT scale = 1;
11093 rtx scale_rtx = NULL_RTX;
11096 enum ix86_address_seg seg = SEG_DEFAULT;
11098 if (REG_P (addr) || GET_CODE (addr) == SUBREG)
11100 else if (GET_CODE (addr) == PLUS)
11102 rtx addends[4], op;
11110 addends[n++] = XEXP (op, 1);
11113 while (GET_CODE (op) == PLUS);
11118 for (i = n; i >= 0; --i)
11121 switch (GET_CODE (op))
11126 index = XEXP (op, 0);
11127 scale_rtx = XEXP (op, 1);
11133 index = XEXP (op, 0);
11134 tmp = XEXP (op, 1);
11135 if (!CONST_INT_P (tmp))
11137 scale = INTVAL (tmp);
11138 if ((unsigned HOST_WIDE_INT) scale > 3)
11140 scale = 1 << scale;
11144 if (XINT (op, 1) == UNSPEC_TP
11145 && TARGET_TLS_DIRECT_SEG_REFS
11146 && seg == SEG_DEFAULT)
11147 seg = TARGET_64BIT ? SEG_FS : SEG_GS;
11176 else if (GET_CODE (addr) == MULT)
11178 index = XEXP (addr, 0); /* index*scale */
11179 scale_rtx = XEXP (addr, 1);
11181 else if (GET_CODE (addr) == ASHIFT)
11183 /* We're called for lea too, which implements ashift on occasion. */
11184 index = XEXP (addr, 0);
11185 tmp = XEXP (addr, 1);
11186 if (!CONST_INT_P (tmp))
11188 scale = INTVAL (tmp);
11189 if ((unsigned HOST_WIDE_INT) scale > 3)
11191 scale = 1 << scale;
11195 disp = addr; /* displacement */
11197 /* Extract the integral value of scale. */
11200 if (!CONST_INT_P (scale_rtx))
11202 scale = INTVAL (scale_rtx);
11205 base_reg = base && GET_CODE (base) == SUBREG ? SUBREG_REG (base) : base;
11206 index_reg = index && GET_CODE (index) == SUBREG ? SUBREG_REG (index) : index;
11208 /* Avoid useless 0 displacement. */
11209 if (disp == const0_rtx && (base || index))
/* Allow the arg pointer and the stack pointer as index if there is no
   scaling.  */
11213 if (base_reg && index_reg && scale == 1
11214 && (index_reg == arg_pointer_rtx
11215 || index_reg == frame_pointer_rtx
11216 || (REG_P (index_reg) && REGNO (index_reg) == STACK_POINTER_REGNUM)))
11219 tmp = base, base = index, index = tmp;
11220 tmp = base_reg, base_reg = index_reg, index_reg = tmp;
11223 /* Special case: %ebp cannot be encoded as a base without a displacement.
11227 && (base_reg == hard_frame_pointer_rtx
11228 || base_reg == frame_pointer_rtx
11229 || base_reg == arg_pointer_rtx
11230 || (REG_P (base_reg)
11231 && (REGNO (base_reg) == HARD_FRAME_POINTER_REGNUM
11232 || REGNO (base_reg) == R13_REG))))
/* Special case: on K6, [%esi] forces the instruction to be vector
   decoded.  Avoid this by transforming it to [%esi+0].
11237 Reload calls address legitimization without cfun defined, so we need
11238 to test cfun for being non-NULL. */
11239 if (TARGET_K6 && cfun && optimize_function_for_speed_p (cfun)
11240 && base_reg && !index_reg && !disp
11241 && REG_P (base_reg) && REGNO (base_reg) == SI_REG)
11244 /* Special case: encode reg+reg instead of reg*2. */
11245 if (!base && index && scale == 2)
11246 base = index, base_reg = index_reg, scale = 1;
11248 /* Special case: scaling cannot be encoded without base or displacement. */
11249 if (!base && !disp && index && scale != 1)
11253 out->index = index;
11255 out->scale = scale;
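/* Illustration only: for the ia32 address "12(%ebx,%ecx,4)", i.e. the
   canonical RTL

	(plus (plus (mult (reg %ecx) (const_int 4))
		    (reg %ebx))
	      (const_int 12))

   this function fills OUT with base = %ebx, index = %ecx, scale = 4
   and disp = (const_int 12).  */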
/* Return the cost of the memory address X.
   For i386, it is better to use a complex address than let gcc copy
   the address into a reg and make a new pseudo.  But not if the address
   requires two regs - that would mean more pseudos with longer
   lifetimes.  */
11267 ix86_address_cost (rtx x, bool speed ATTRIBUTE_UNUSED)
11269 struct ix86_address parts;
11271 int ok = ix86_decompose_address (x, &parts);
11275 if (parts.base && GET_CODE (parts.base) == SUBREG)
11276 parts.base = SUBREG_REG (parts.base);
11277 if (parts.index && GET_CODE (parts.index) == SUBREG)
11278 parts.index = SUBREG_REG (parts.index);
11280 /* Attempt to minimize number of registers in the address. */
11282 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER))
11284 && (!REG_P (parts.index)
11285 || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)))
11289 && (!REG_P (parts.base) || REGNO (parts.base) >= FIRST_PSEUDO_REGISTER)
11291 && (!REG_P (parts.index) || REGNO (parts.index) >= FIRST_PSEUDO_REGISTER)
11292 && parts.base != parts.index)
/* AMD-K6 doesn't like addresses with ModR/M set to 00_xxx_100b,
   since its predecode logic can't detect the length of instructions
   and it degenerates to vector decoded.  Increase the cost of such
   addresses here.  The penalty is minimally 2 cycles.  It may be
   worthwhile to split such addresses or even refuse them entirely.

   The following addressing modes are affected:
    [base+scale*index]
    [scale*index+disp]
    [base+index]

   The first and last cases may be avoidable by explicitly coding the
   zero into the memory address, but I don't have an AMD-K6 machine
   handy to check this theory.  */
11311 && ((!parts.disp && parts.base && parts.index && parts.scale != 1)
11312 || (parts.disp && !parts.base && parts.index && parts.scale != 1)
11313 || (!parts.disp && parts.base && parts.index && parts.scale == 1)))
11319 /* Allow {LABEL | SYMBOL}_REF - SYMBOL_REF-FOR-PICBASE for Mach-O as
this is used to form addresses to local data when -fPIC is in
effect.  */
11324 darwin_local_data_pic (rtx disp)
11326 return (GET_CODE (disp) == UNSPEC
11327 && XINT (disp, 1) == UNSPEC_MACHOPIC_OFFSET);
11330 /* Determine if a given RTX is a valid constant. We already know this
11331 satisfies CONSTANT_P. */
11334 ix86_legitimate_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
11336 switch (GET_CODE (x))
11341 if (GET_CODE (x) == PLUS)
11343 if (!CONST_INT_P (XEXP (x, 1)))
11348 if (TARGET_MACHO && darwin_local_data_pic (x))
11351 /* Only some unspecs are valid as "constants". */
11352 if (GET_CODE (x) == UNSPEC)
11353 switch (XINT (x, 1))
11356 case UNSPEC_GOTOFF:
11357 case UNSPEC_PLTOFF:
11358 return TARGET_64BIT;
11360 case UNSPEC_NTPOFF:
11361 x = XVECEXP (x, 0, 0);
11362 return (GET_CODE (x) == SYMBOL_REF
11363 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11364 case UNSPEC_DTPOFF:
11365 x = XVECEXP (x, 0, 0);
11366 return (GET_CODE (x) == SYMBOL_REF
11367 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
11372 /* We must have drilled down to a symbol. */
11373 if (GET_CODE (x) == LABEL_REF)
11375 if (GET_CODE (x) != SYMBOL_REF)
11380 /* TLS symbols are never valid. */
11381 if (SYMBOL_REF_TLS_MODEL (x))
11384 /* DLLIMPORT symbols are never valid. */
11385 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
11386 && SYMBOL_REF_DLLIMPORT_P (x))
11390 /* mdynamic-no-pic */
11391 if (MACHO_DYNAMIC_NO_PIC_P)
11392 return machopic_symbol_defined_p (x);
11397 if (GET_MODE (x) == TImode
11398 && x != CONST0_RTX (TImode)
11404 if (!standard_sse_constant_p (x))
11411 /* Otherwise we handle everything else in the move patterns. */
11415 /* Determine if it's legal to put X into the constant pool. This
11416 is not possible for the address of thread-local symbols, which
11417 is checked above. */
11420 ix86_cannot_force_const_mem (enum machine_mode mode, rtx x)
11422 /* We can always put integral constants and vectors in memory. */
11423 switch (GET_CODE (x))
11433 return !ix86_legitimate_constant_p (mode, x);
11437 /* Nonzero if the constant value X is a legitimate general operand
11438 when generating PIC code. It is given that flag_pic is on and
11439 that X satisfies CONSTANT_P or is a CONST_DOUBLE. */
11442 legitimate_pic_operand_p (rtx x)
11446 switch (GET_CODE (x))
11449 inner = XEXP (x, 0);
11450 if (GET_CODE (inner) == PLUS
11451 && CONST_INT_P (XEXP (inner, 1)))
11452 inner = XEXP (inner, 0);
11454 /* Only some unspecs are valid as "constants". */
11455 if (GET_CODE (inner) == UNSPEC)
11456 switch (XINT (inner, 1))
11459 case UNSPEC_GOTOFF:
11460 case UNSPEC_PLTOFF:
11461 return TARGET_64BIT;
11463 x = XVECEXP (inner, 0, 0);
11464 return (GET_CODE (x) == SYMBOL_REF
11465 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
11466 case UNSPEC_MACHOPIC_OFFSET:
11467 return legitimate_pic_address_disp_p (x);
11475 return legitimate_pic_address_disp_p (x);
/* Determine if a given CONST RTX is a valid memory displacement
   in PIC mode.  */
11486 legitimate_pic_address_disp_p (rtx disp)
11490 /* In 64bit mode we can allow direct addresses of symbols and labels
11491 when they are not dynamic symbols. */
11494 rtx op0 = disp, op1;
11496 switch (GET_CODE (disp))
11502 if (GET_CODE (XEXP (disp, 0)) != PLUS)
11504 op0 = XEXP (XEXP (disp, 0), 0);
11505 op1 = XEXP (XEXP (disp, 0), 1);
11506 if (!CONST_INT_P (op1)
11507 || INTVAL (op1) >= 16*1024*1024
11508 || INTVAL (op1) < -16*1024*1024)
11510 if (GET_CODE (op0) == LABEL_REF)
11512 if (GET_CODE (op0) != SYMBOL_REF)
11517 /* TLS references should always be enclosed in UNSPEC. */
11518 if (SYMBOL_REF_TLS_MODEL (op0))
11520 if (!SYMBOL_REF_FAR_ADDR_P (op0) && SYMBOL_REF_LOCAL_P (op0)
11521 && ix86_cmodel != CM_LARGE_PIC)
11529 if (GET_CODE (disp) != CONST)
11531 disp = XEXP (disp, 0);
/* It is unsafe to allow PLUS expressions.  This limits the allowed
   distance of GOT tables.  We should not need these anyway.  */
11537 if (GET_CODE (disp) != UNSPEC
11538 || (XINT (disp, 1) != UNSPEC_GOTPCREL
11539 && XINT (disp, 1) != UNSPEC_GOTOFF
11540 && XINT (disp, 1) != UNSPEC_PCREL
11541 && XINT (disp, 1) != UNSPEC_PLTOFF))
11544 if (GET_CODE (XVECEXP (disp, 0, 0)) != SYMBOL_REF
11545 && GET_CODE (XVECEXP (disp, 0, 0)) != LABEL_REF)
11551 if (GET_CODE (disp) == PLUS)
11553 if (!CONST_INT_P (XEXP (disp, 1)))
11555 disp = XEXP (disp, 0);
11559 if (TARGET_MACHO && darwin_local_data_pic (disp))
11562 if (GET_CODE (disp) != UNSPEC)
11565 switch (XINT (disp, 1))
11570 /* We need to check for both symbols and labels because VxWorks loads
text labels with @GOT rather than @GOTOFF.  See gotoff_operand for
details.  */
11573 return (GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
11574 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF);
11575 case UNSPEC_GOTOFF:
/* Refuse GOTOFF in 64bit mode since it is always 64bit when used.
   While the ABI also specifies a 32bit relocation, we don't produce
   it in the small PIC model at all.  */
11579 if ((GET_CODE (XVECEXP (disp, 0, 0)) == SYMBOL_REF
11580 || GET_CODE (XVECEXP (disp, 0, 0)) == LABEL_REF)
11582 return gotoff_operand (XVECEXP (disp, 0, 0), Pmode);
11584 case UNSPEC_GOTTPOFF:
11585 case UNSPEC_GOTNTPOFF:
11586 case UNSPEC_INDNTPOFF:
11589 disp = XVECEXP (disp, 0, 0);
11590 return (GET_CODE (disp) == SYMBOL_REF
11591 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_INITIAL_EXEC);
11592 case UNSPEC_NTPOFF:
11593 disp = XVECEXP (disp, 0, 0);
11594 return (GET_CODE (disp) == SYMBOL_REF
11595 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_EXEC);
11596 case UNSPEC_DTPOFF:
11597 disp = XVECEXP (disp, 0, 0);
11598 return (GET_CODE (disp) == SYMBOL_REF
11599 && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
11605 /* Recognizes RTL expressions that are valid memory addresses for an
11606 instruction. The MODE argument is the machine mode for the MEM
11607 expression that wants to use this address.
It only recognizes addresses in canonical form.  LEGITIMIZE_ADDRESS should
convert common non-canonical forms to canonical form so that they will
be recognized.  */
11614 ix86_legitimate_address_p (enum machine_mode mode ATTRIBUTE_UNUSED,
11615 rtx addr, bool strict)
11617 struct ix86_address parts;
11618 rtx base, index, disp;
11619 HOST_WIDE_INT scale;
11621 if (ix86_decompose_address (addr, &parts) <= 0)
11622 /* Decomposition failed. */
11626 index = parts.index;
11628 scale = parts.scale;
11630 /* Validate base register.
11632 Don't allow SUBREG's that span more than a word here. It can lead to spill
11633 failures when the base is one word out of a two word structure, which is
11634 represented internally as a DImode int. */
11642 else if (GET_CODE (base) == SUBREG
11643 && REG_P (SUBREG_REG (base))
11644 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (base)))
11646 reg = SUBREG_REG (base);
11648 /* Base is not a register. */
11651 if (GET_MODE (base) != Pmode)
11652 /* Base is not in Pmode. */
11655 if ((strict && ! REG_OK_FOR_BASE_STRICT_P (reg))
11656 || (! strict && ! REG_OK_FOR_BASE_NONSTRICT_P (reg)))
11657 /* Base is not valid. */
11661 /* Validate index register.
11663 Don't allow SUBREG's that span more than a word here -- same as above. */
11671 else if (GET_CODE (index) == SUBREG
11672 && REG_P (SUBREG_REG (index))
11673 && GET_MODE_SIZE (GET_MODE (SUBREG_REG (index)))
11675 reg = SUBREG_REG (index);
11677 /* Index is not a register. */
11680 if (GET_MODE (index) != Pmode)
11681 /* Index is not in Pmode. */
11684 if ((strict && ! REG_OK_FOR_INDEX_STRICT_P (reg))
11685 || (! strict && ! REG_OK_FOR_INDEX_NONSTRICT_P (reg)))
11686 /* Index is not valid. */
11690 /* Validate scale factor. */
11694 /* Scale without index. */
11697 if (scale != 2 && scale != 4 && scale != 8)
11698 /* Scale is not a valid multiplier. */
11702 /* Validate displacement. */
11705 if (GET_CODE (disp) == CONST
11706 && GET_CODE (XEXP (disp, 0)) == UNSPEC
11707 && XINT (XEXP (disp, 0), 1) != UNSPEC_MACHOPIC_OFFSET)
11708 switch (XINT (XEXP (disp, 0), 1))
/* Refuse GOTOFF and GOT in 64bit mode since it is always 64bit when
   used.  While the ABI also specifies 32bit relocations, we don't
   produce them at all and use IP-relative addressing instead.  */
11714 case UNSPEC_GOTOFF:
11715 gcc_assert (flag_pic);
11717 goto is_legitimate_pic;
11719 /* 64bit address unspec. */
11722 case UNSPEC_GOTPCREL:
11724 gcc_assert (flag_pic);
11725 goto is_legitimate_pic;
11727 case UNSPEC_GOTTPOFF:
11728 case UNSPEC_GOTNTPOFF:
11729 case UNSPEC_INDNTPOFF:
11730 case UNSPEC_NTPOFF:
11731 case UNSPEC_DTPOFF:
11734 case UNSPEC_STACK_CHECK:
11735 gcc_assert (flag_split_stack);
11739 /* Invalid address unspec. */
11743 else if (SYMBOLIC_CONST (disp)
11747 && MACHOPIC_INDIRECT
11748 && !machopic_operand_p (disp)
11754 if (TARGET_64BIT && (index || base))
11756 /* foo@dtpoff(%rX) is ok. */
11757 if (GET_CODE (disp) != CONST
11758 || GET_CODE (XEXP (disp, 0)) != PLUS
11759 || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
11760 || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
11761 || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
11762 && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
11763 /* Non-constant pic memory reference. */
11766 else if ((!TARGET_MACHO || flag_pic)
11767 && ! legitimate_pic_address_disp_p (disp))
11768 /* Displacement is an invalid pic construct. */
11771 else if (MACHO_DYNAMIC_NO_PIC_P
11772 && !ix86_legitimate_constant_p (Pmode, disp))
/* Displacement must be referenced via non_lazy_pointer.  */
11777 /* This code used to verify that a symbolic pic displacement
11778 includes the pic_offset_table_rtx register.
While this is a good idea, unfortunately these constructs may
be created by the "adds using lea" optimization for incorrect code.

Such code is nonsensical, but results in addressing the
GOT table with a pic_offset_table_rtx base.  We can't
just refuse it easily, since it gets matched by the
"addsi3" pattern, which later gets split to lea in case
the output register differs from the input.  While this
could be handled by a separate addsi pattern for this case
that never results in lea, disabling this test seems to be the
easier and correct fix for the crash.  */
11799 else if (GET_CODE (disp) != LABEL_REF
11800 && !CONST_INT_P (disp)
11801 && (GET_CODE (disp) != CONST
11802 || !ix86_legitimate_constant_p (Pmode, disp))
11803 && (GET_CODE (disp) != SYMBOL_REF
11804 || !ix86_legitimate_constant_p (Pmode, disp)))
11805 /* Displacement is not constant. */
11807 else if (TARGET_64BIT
11808 && !x86_64_immediate_operand (disp, VOIDmode))
11809 /* Displacement is out of range. */
11813 /* Everything looks valid. */
11817 /* Determine if a given RTX is a valid constant address. */
11820 constant_address_p (rtx x)
11822 return CONSTANT_P (x) && ix86_legitimate_address_p (Pmode, x, 1);
11825 /* Return a unique alias set for the GOT. */
11827 static alias_set_type
11828 ix86_GOT_alias_set (void)
11830 static alias_set_type set = -1;
11832 set = new_alias_set ();
11836 /* Return a legitimate reference for ORIG (an address) using the
11837 register REG. If REG is 0, a new pseudo is generated.
11839 There are two types of references that must be handled:
11841 1. Global data references must load the address from the GOT, via
the PIC reg.  An insn is emitted to do this load, and the reg is
returned.
11845 2. Static data references, constant pool addresses, and code labels
11846 compute the address as an offset from the GOT, whose base is in
11847 the PIC reg. Static data objects have SYMBOL_FLAG_LOCAL set to
11848 differentiate them from global data objects. The returned
11849 address is the PIC reg + an unspec constant.
11851 TARGET_LEGITIMATE_ADDRESS_P rejects symbolic references unless the PIC
11852 reg also appears in the address. */
11855 legitimize_pic_address (rtx orig, rtx reg)
11858 rtx new_rtx = orig;
11862 if (TARGET_MACHO && !TARGET_64BIT)
11865 reg = gen_reg_rtx (Pmode);
11866 /* Use the generic Mach-O PIC machinery. */
11867 return machopic_legitimize_pic_address (orig, GET_MODE (orig), reg);
11871 if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
11873 else if (TARGET_64BIT
11874 && ix86_cmodel != CM_SMALL_PIC
11875 && gotoff_operand (addr, Pmode))
11878 /* This symbol may be referenced via a displacement from the PIC
11879 base address (@GOTOFF). */
11881 if (reload_in_progress)
11882 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
11883 if (GET_CODE (addr) == CONST)
11884 addr = XEXP (addr, 0);
11885 if (GET_CODE (addr) == PLUS)
11887 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
11889 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
11892 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
11893 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
11895 tmpreg = gen_reg_rtx (Pmode);
11898 emit_move_insn (tmpreg, new_rtx);
11902 new_rtx = expand_simple_binop (Pmode, PLUS, reg, pic_offset_table_rtx,
11903 tmpreg, 1, OPTAB_DIRECT);
11906 else new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, tmpreg);
11908 else if (!TARGET_64BIT && gotoff_operand (addr, Pmode))
11910 /* This symbol may be referenced via a displacement from the PIC
11911 base address (@GOTOFF). */
11913 if (reload_in_progress)
11914 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
11915 if (GET_CODE (addr) == CONST)
11916 addr = XEXP (addr, 0);
11917 if (GET_CODE (addr) == PLUS)
11919 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, XEXP (addr, 0)),
11921 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, XEXP (addr, 1));
11924 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTOFF);
11925 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
11926 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
11930 emit_move_insn (reg, new_rtx);
11934 else if ((GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (addr) == 0)
11935 /* We can't use @GOTOFF for text labels on VxWorks;
11936 see gotoff_operand. */
11937 || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
11939 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
11941 if (GET_CODE (addr) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (addr))
11942 return legitimize_dllimport_symbol (addr, true);
11943 if (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS
11944 && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF
11945 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (addr, 0), 0)))
11947 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (addr, 0), 0), true);
11948 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (addr, 0), 1));
/* For x64 PE-COFF there is no GOT table, so we use the address
   directly.  */
11954 if (TARGET_64BIT && DEFAULT_ABI == MS_ABI)
11956 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_PCREL);
11957 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
11960 reg = gen_reg_rtx (Pmode);
11961 emit_move_insn (reg, new_rtx);
11964 else if (TARGET_64BIT && ix86_cmodel != CM_LARGE_PIC)
11966 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOTPCREL);
11967 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
11968 new_rtx = gen_const_mem (Pmode, new_rtx);
11969 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
11972 reg = gen_reg_rtx (Pmode);
/* Use gen_movsi directly; otherwise the address is loaded
   into a register for CSE.  We don't want to CSE these addresses;
   instead we CSE addresses from the GOT table, so skip this.  */
11976 emit_insn (gen_movsi (reg, new_rtx));
11981 /* This symbol must be referenced via a load from the
11982 Global Offset Table (@GOT). */
11984 if (reload_in_progress)
11985 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
11986 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr), UNSPEC_GOT);
11987 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
11989 new_rtx = force_reg (Pmode, new_rtx);
11990 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
11991 new_rtx = gen_const_mem (Pmode, new_rtx);
11992 set_mem_alias_set (new_rtx, ix86_GOT_alias_set ());
11995 reg = gen_reg_rtx (Pmode);
11996 emit_move_insn (reg, new_rtx);
12002 if (CONST_INT_P (addr)
12003 && !x86_64_immediate_operand (addr, VOIDmode))
12007 emit_move_insn (reg, addr);
12011 new_rtx = force_reg (Pmode, addr);
12013 else if (GET_CODE (addr) == CONST)
12015 addr = XEXP (addr, 0);
12017 /* We must match stuff we generate before. Assume the only
12018 unspecs that can get here are ours. Not that we could do
12019 anything with them anyway.... */
12020 if (GET_CODE (addr) == UNSPEC
12021 || (GET_CODE (addr) == PLUS
12022 && GET_CODE (XEXP (addr, 0)) == UNSPEC))
12024 gcc_assert (GET_CODE (addr) == PLUS);
12026 if (GET_CODE (addr) == PLUS)
12028 rtx op0 = XEXP (addr, 0), op1 = XEXP (addr, 1);
12030 /* Check first to see if this is a constant offset from a @GOTOFF
12031 symbol reference. */
12032 if (gotoff_operand (op0, Pmode)
12033 && CONST_INT_P (op1))
12037 if (reload_in_progress)
12038 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12039 new_rtx = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op0),
12041 new_rtx = gen_rtx_PLUS (Pmode, new_rtx, op1);
12042 new_rtx = gen_rtx_CONST (Pmode, new_rtx);
12043 new_rtx = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, new_rtx);
12047 emit_move_insn (reg, new_rtx);
12053 if (INTVAL (op1) < -16*1024*1024
12054 || INTVAL (op1) >= 16*1024*1024)
12056 if (!x86_64_immediate_operand (op1, Pmode))
12057 op1 = force_reg (Pmode, op1);
12058 new_rtx = gen_rtx_PLUS (Pmode, force_reg (Pmode, op0), op1);
12064 base = legitimize_pic_address (XEXP (addr, 0), reg);
12065 new_rtx = legitimize_pic_address (XEXP (addr, 1),
12066 base == reg ? NULL_RTX : reg);
12068 if (CONST_INT_P (new_rtx))
12069 new_rtx = plus_constant (base, INTVAL (new_rtx));
12072 if (GET_CODE (new_rtx) == PLUS && CONSTANT_P (XEXP (new_rtx, 1)))
12074 base = gen_rtx_PLUS (Pmode, base, XEXP (new_rtx, 0));
12075 new_rtx = XEXP (new_rtx, 1);
12077 new_rtx = gen_rtx_PLUS (Pmode, base, new_rtx);
12085 /* Load the thread pointer. If TO_REG is true, force it into a register. */
12088 get_thread_pointer (bool to_reg)
12092 tp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx), UNSPEC_TP);
12096 reg = gen_reg_rtx (Pmode);
12097 insn = gen_rtx_SET (VOIDmode, reg, tp);
12098 insn = emit_insn (insn);
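/* Illustration only: on GNU/Linux the thread pointer is the %gs
   segment base on ia32 and the %fs segment base on x86-64, so forcing
   UNSPEC_TP into a register comes out as

	movl	%gs:0, %eax	; ia32
	movq	%fs:0, %rax	; x86-64  */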
12103 /* Construct the SYMBOL_REF for the tls_get_addr function. */
12105 static GTY(()) rtx ix86_tls_symbol;
12108 ix86_tls_get_addr (void)
12110 if (!ix86_tls_symbol)
12113 = ((TARGET_ANY_GNU_TLS && !TARGET_64BIT)
12114 ? "___tls_get_addr" : "__tls_get_addr");
12116 ix86_tls_symbol = gen_rtx_SYMBOL_REF (Pmode, sym);
12119 return ix86_tls_symbol;
12122 /* Construct the SYMBOL_REF for the _TLS_MODULE_BASE_ symbol. */
12124 static GTY(()) rtx ix86_tls_module_base_symbol;
12127 ix86_tls_module_base (void)
12129 if (!ix86_tls_module_base_symbol)
12131 ix86_tls_module_base_symbol
12132 = gen_rtx_SYMBOL_REF (Pmode, "_TLS_MODULE_BASE_");
12134 SYMBOL_REF_FLAGS (ix86_tls_module_base_symbol)
12135 |= TLS_MODEL_GLOBAL_DYNAMIC << SYMBOL_FLAG_TLS_SHIFT;
12138 return ix86_tls_module_base_symbol;
12141 /* A subroutine of ix86_legitimize_address and ix86_expand_move. FOR_MOV is
12142 false if we expect this to be used for a memory address and true if
12143 we expect to load the address into a register. */
12146 legitimize_tls_address (rtx x, enum tls_model model, bool for_mov)
12148 rtx dest, base, off;
12149 rtx pic = NULL_RTX, tp = NULL_RTX;
12154 case TLS_MODEL_GLOBAL_DYNAMIC:
12155 dest = gen_reg_rtx (Pmode);
12160 pic = pic_offset_table_rtx;
12163 pic = gen_reg_rtx (Pmode);
12164 emit_insn (gen_set_got (pic));
12168 if (TARGET_GNU2_TLS)
12171 emit_insn (gen_tls_dynamic_gnu2_64 (dest, x));
12173 emit_insn (gen_tls_dynamic_gnu2_32 (dest, x, pic));
12175 tp = get_thread_pointer (true);
12176 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, tp, dest));
12178 set_unique_reg_note (get_last_insn (), REG_EQUIV, x);
12182 rtx caddr = ix86_tls_get_addr ();
12186 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns;
12189 emit_call_insn (gen_tls_global_dynamic_64 (rax, x, caddr));
12190 insns = get_insns ();
12193 RTL_CONST_CALL_P (insns) = 1;
12194 emit_libcall_block (insns, dest, rax, x);
12197 emit_insn (gen_tls_global_dynamic_32 (dest, x, pic, caddr));
12201 case TLS_MODEL_LOCAL_DYNAMIC:
12202 base = gen_reg_rtx (Pmode);
12207 pic = pic_offset_table_rtx;
12210 pic = gen_reg_rtx (Pmode);
12211 emit_insn (gen_set_got (pic));
12215 if (TARGET_GNU2_TLS)
12217 rtx tmp = ix86_tls_module_base ();
12220 emit_insn (gen_tls_dynamic_gnu2_64 (base, tmp));
12222 emit_insn (gen_tls_dynamic_gnu2_32 (base, tmp, pic));
12224 tp = get_thread_pointer (true);
12225 set_unique_reg_note (get_last_insn (), REG_EQUIV,
12226 gen_rtx_MINUS (Pmode, tmp, tp));
12230 rtx caddr = ix86_tls_get_addr ();
12234 rtx rax = gen_rtx_REG (Pmode, AX_REG), insns, eqv;
12237 emit_call_insn (gen_tls_local_dynamic_base_64 (rax, caddr));
12238 insns = get_insns ();
12241 /* Attach a unique REG_EQUIV, to allow the RTL optimizers to
12242 share the LD_BASE result with other LD model accesses. */
12243 eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
12244 UNSPEC_TLS_LD_BASE);
12246 RTL_CONST_CALL_P (insns) = 1;
12247 emit_libcall_block (insns, base, rax, eqv);
12250 emit_insn (gen_tls_local_dynamic_base_32 (base, pic, caddr));
12253 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_DTPOFF);
12254 off = gen_rtx_CONST (Pmode, off);
12256 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, base, off));
12258 if (TARGET_GNU2_TLS)
12260 dest = force_reg (Pmode, gen_rtx_PLUS (Pmode, dest, tp));
12262 set_unique_reg_note (get_last_insn (), REG_EQUIV, x);
12266 case TLS_MODEL_INITIAL_EXEC:
12269 if (TARGET_SUN_TLS)
/* The Sun linker took the AMD64 TLS spec literally
   and can only handle %rax as the destination of the
   initial-exec code sequence.  */
12275 dest = gen_reg_rtx (Pmode);
12276 emit_insn (gen_tls_initial_exec_64_sun (dest, x));
12281 type = UNSPEC_GOTNTPOFF;
12285 if (reload_in_progress)
12286 df_set_regs_ever_live (PIC_OFFSET_TABLE_REGNUM, true);
12287 pic = pic_offset_table_rtx;
12288 type = TARGET_ANY_GNU_TLS ? UNSPEC_GOTNTPOFF : UNSPEC_GOTTPOFF;
12290 else if (!TARGET_ANY_GNU_TLS)
12292 pic = gen_reg_rtx (Pmode);
12293 emit_insn (gen_set_got (pic));
12294 type = UNSPEC_GOTTPOFF;
12299 type = UNSPEC_INDNTPOFF;
12302 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), type);
12303 off = gen_rtx_CONST (Pmode, off);
12305 off = gen_rtx_PLUS (Pmode, pic, off);
12306 off = gen_const_mem (Pmode, off);
12307 set_mem_alias_set (off, ix86_GOT_alias_set ());
12309 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12311 base = get_thread_pointer (for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12312 off = force_reg (Pmode, off);
12313 return gen_rtx_PLUS (Pmode, base, off);
12317 base = get_thread_pointer (true);
12318 dest = gen_reg_rtx (Pmode);
12319 emit_insn (gen_subsi3 (dest, base, off));
12323 case TLS_MODEL_LOCAL_EXEC:
12324 off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x),
12325 (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12326 ? UNSPEC_NTPOFF : UNSPEC_TPOFF);
12327 off = gen_rtx_CONST (Pmode, off);
12329 if (TARGET_64BIT || TARGET_ANY_GNU_TLS)
12331 base = get_thread_pointer (for_mov || !TARGET_TLS_DIRECT_SEG_REFS);
12332 return gen_rtx_PLUS (Pmode, base, off);
12336 base = get_thread_pointer (true);
12337 dest = gen_reg_rtx (Pmode);
12338 emit_insn (gen_subsi3 (dest, base, off));
12343 gcc_unreachable ();
12349 /* Create or return the unique __imp_DECL dllimport symbol corresponding
12352 static GTY((if_marked ("tree_map_marked_p"), param_is (struct tree_map)))
12353 htab_t dllimport_map;
12356 get_dllimport_decl (tree decl)
12358 struct tree_map *h, in;
12361 const char *prefix;
12362 size_t namelen, prefixlen;
12367 if (!dllimport_map)
12368 dllimport_map = htab_create_ggc (512, tree_map_hash, tree_map_eq, 0);
12370 in.hash = htab_hash_pointer (decl);
12371 in.base.from = decl;
12372 loc = htab_find_slot_with_hash (dllimport_map, &in, in.hash, INSERT);
12373 h = (struct tree_map *) *loc;
12377 *loc = h = ggc_alloc_tree_map ();
12379 h->base.from = decl;
12380 h->to = to = build_decl (DECL_SOURCE_LOCATION (decl),
12381 VAR_DECL, NULL, ptr_type_node);
12382 DECL_ARTIFICIAL (to) = 1;
12383 DECL_IGNORED_P (to) = 1;
12384 DECL_EXTERNAL (to) = 1;
12385 TREE_READONLY (to) = 1;
12387 name = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl));
12388 name = targetm.strip_name_encoding (name);
12389 prefix = name[0] == FASTCALL_PREFIX || user_label_prefix[0] == 0
12390 ? "*__imp_" : "*__imp__";
12391 namelen = strlen (name);
12392 prefixlen = strlen (prefix);
12393 imp_name = (char *) alloca (namelen + prefixlen + 1);
12394 memcpy (imp_name, prefix, prefixlen);
12395 memcpy (imp_name + prefixlen, name, namelen + 1);
12397 name = ggc_alloc_string (imp_name, namelen + prefixlen);
12398 rtl = gen_rtx_SYMBOL_REF (Pmode, name);
12399 SET_SYMBOL_REF_DECL (rtl, to);
12400 SYMBOL_REF_FLAGS (rtl) = SYMBOL_FLAG_LOCAL;
12402 rtl = gen_const_mem (Pmode, rtl);
12403 set_mem_alias_set (rtl, ix86_GOT_alias_set ());
12405 SET_DECL_RTL (to, rtl);
12406 SET_DECL_ASSEMBLER_NAME (to, get_identifier (name));
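/* Illustrative example (the symbol name is hypothetical): for a
   dllimport'ed decl whose assembler name is "foo", the code above
   builds a pointer-typed VAR_DECL whose DECL_RTL is a constant load
   through "*__imp_foo" (or "*__imp__foo" when a user label prefix is
   in use), i.e. the import-table slot that the loader fills in.  */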
12411 /* Expand SYMBOL into its corresponding dllimport symbol. WANT_REG is
12412 true if we require the result be a register. */
12415 legitimize_dllimport_symbol (rtx symbol, bool want_reg)
12420 gcc_assert (SYMBOL_REF_DECL (symbol));
12421 imp_decl = get_dllimport_decl (SYMBOL_REF_DECL (symbol));
12423 x = DECL_RTL (imp_decl);
12425 x = force_reg (Pmode, x);
12429 /* Try machine-dependent ways of modifying an illegitimate address
12430 to be legitimate. If we find one, return the new, valid address.
12431 This macro is used in only one place: `memory_address' in explow.c.
12433 OLDX is the address as it was before break_out_memory_refs was called.
12434 In some cases it is useful to look at this to decide what needs to be done.
12436 It is always safe for this macro to do nothing. It exists to recognize
12437 opportunities to optimize the output.
12439 For the 80386, we handle X+REG by loading X into a register R and
12440 using R+REG. R will go in a general reg and indexing will be used.
12441 However, if REG is a broken-out memory address or multiplication,
12442 nothing needs to be done because REG can certainly go in a general reg.
12444 When -fpic is used, special handling is needed for symbolic references.
12445 See comments by legitimize_pic_address in i386.c for details. */
12448 ix86_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
12449 enum machine_mode mode)
12454 log = GET_CODE (x) == SYMBOL_REF ? SYMBOL_REF_TLS_MODEL (x) : 0;
12456 return legitimize_tls_address (x, (enum tls_model) log, false);
12457 if (GET_CODE (x) == CONST
12458 && GET_CODE (XEXP (x, 0)) == PLUS
12459 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
12460 && (log = SYMBOL_REF_TLS_MODEL (XEXP (XEXP (x, 0), 0))))
12462 rtx t = legitimize_tls_address (XEXP (XEXP (x, 0), 0),
12463 (enum tls_model) log, false);
12464 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
12467 if (TARGET_DLLIMPORT_DECL_ATTRIBUTES)
12469 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_DLLIMPORT_P (x))
12470 return legitimize_dllimport_symbol (x, true);
12471 if (GET_CODE (x) == CONST
12472 && GET_CODE (XEXP (x, 0)) == PLUS
12473 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF
12474 && SYMBOL_REF_DLLIMPORT_P (XEXP (XEXP (x, 0), 0)))
12476 rtx t = legitimize_dllimport_symbol (XEXP (XEXP (x, 0), 0), true);
12477 return gen_rtx_PLUS (Pmode, t, XEXP (XEXP (x, 0), 1));
12481 if (flag_pic && SYMBOLIC_CONST (x))
12482 return legitimize_pic_address (x, 0);
12485 if (MACHO_DYNAMIC_NO_PIC_P && SYMBOLIC_CONST (x))
12486 return machopic_indirect_data_reference (x, 0);
12489 /* Canonicalize shifts by 0, 1, 2, 3 into multiply.  */
12490 if (GET_CODE (x) == ASHIFT
12491 && CONST_INT_P (XEXP (x, 1))
12492 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (x, 1)) < 4)
12495 log = INTVAL (XEXP (x, 1));
12496 x = gen_rtx_MULT (Pmode, force_reg (Pmode, XEXP (x, 0)),
12497 GEN_INT (1 << log));
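/* For example, (ashift (reg) (const_int 3)) is rewritten above as
   (mult (reg) (const_int 8)), which matches the index*scale component
   of the x86 addressing form base + index*scale + disp.  */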
12500 if (GET_CODE (x) == PLUS)
12502 /* Canonicalize shifts by 0, 1, 2, 3 into multiply. */
12504 if (GET_CODE (XEXP (x, 0)) == ASHIFT
12505 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
12506 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 0), 1)) < 4)
12509 log = INTVAL (XEXP (XEXP (x, 0), 1));
12510 XEXP (x, 0) = gen_rtx_MULT (Pmode,
12511 force_reg (Pmode, XEXP (XEXP (x, 0), 0)),
12512 GEN_INT (1 << log));
12515 if (GET_CODE (XEXP (x, 1)) == ASHIFT
12516 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
12517 && (unsigned HOST_WIDE_INT) INTVAL (XEXP (XEXP (x, 1), 1)) < 4)
12520 log = INTVAL (XEXP (XEXP (x, 1), 1));
12521 XEXP (x, 1) = gen_rtx_MULT (Pmode,
12522 force_reg (Pmode, XEXP (XEXP (x, 1), 0)),
12523 GEN_INT (1 << log));
12526 /* Put multiply first if it isn't already. */
12527 if (GET_CODE (XEXP (x, 1)) == MULT)
12529 rtx tmp = XEXP (x, 0);
12530 XEXP (x, 0) = XEXP (x, 1);
12535 /* Canonicalize (plus (mult (reg) (const)) (plus (reg) (const)))
12536 into (plus (plus (mult (reg) (const)) (reg)) (const)). This can be
12537 created by virtual register instantiation, register elimination, and
12538 similar optimizations. */
12539 if (GET_CODE (XEXP (x, 0)) == MULT && GET_CODE (XEXP (x, 1)) == PLUS)
12542 x = gen_rtx_PLUS (Pmode,
12543 gen_rtx_PLUS (Pmode, XEXP (x, 0),
12544 XEXP (XEXP (x, 1), 0)),
12545 XEXP (XEXP (x, 1), 1));
12549 (plus (plus (mult (reg) (const)) (plus (reg) (const))) const)
12550 into (plus (plus (mult (reg) (const)) (reg)) (const)). */
12551 else if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 0)) == PLUS
12552 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
12553 && GET_CODE (XEXP (XEXP (x, 0), 1)) == PLUS
12554 && CONSTANT_P (XEXP (x, 1)))
12557 rtx other = NULL_RTX;
12559 if (CONST_INT_P (XEXP (x, 1)))
12561 constant = XEXP (x, 1);
12562 other = XEXP (XEXP (XEXP (x, 0), 1), 1);
12564 else if (CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 1), 1)))
12566 constant = XEXP (XEXP (XEXP (x, 0), 1), 1);
12567 other = XEXP (x, 1);
12575 x = gen_rtx_PLUS (Pmode,
12576 gen_rtx_PLUS (Pmode, XEXP (XEXP (x, 0), 0),
12577 XEXP (XEXP (XEXP (x, 0), 1), 0)),
12578 plus_constant (other, INTVAL (constant)));
12582 if (changed && ix86_legitimate_address_p (mode, x, false))
12585 if (GET_CODE (XEXP (x, 0)) == MULT)
12588 XEXP (x, 0) = force_operand (XEXP (x, 0), 0);
12591 if (GET_CODE (XEXP (x, 1)) == MULT)
12594 XEXP (x, 1) = force_operand (XEXP (x, 1), 0);
12598 && REG_P (XEXP (x, 1))
12599 && REG_P (XEXP (x, 0)))
12602 if (flag_pic && SYMBOLIC_CONST (XEXP (x, 1)))
12605 x = legitimize_pic_address (x, 0);
12608 if (changed && ix86_legitimate_address_p (mode, x, false))
12611 if (REG_P (XEXP (x, 0)))
12613 rtx temp = gen_reg_rtx (Pmode);
12614 rtx val = force_operand (XEXP (x, 1), temp);
12616 emit_move_insn (temp, val);
12618 XEXP (x, 1) = temp;
12622 else if (REG_P (XEXP (x, 1)))
12624 rtx temp = gen_reg_rtx (Pmode);
12625 rtx val = force_operand (XEXP (x, 0), temp);
12627 emit_move_insn (temp, val);
12629 XEXP (x, 0) = temp;
12637 /* Print an integer constant expression in assembler syntax. Addition
12638 and subtraction are the only arithmetic that may appear in these
12639 expressions. FILE is the stdio stream to write to, X is the rtx, and
12640 CODE is the operand print code from the output string. */
12643 output_pic_addr_const (FILE *file, rtx x, int code)
12647 switch (GET_CODE (x))
12650 gcc_assert (flag_pic);
12655 if (TARGET_64BIT || ! TARGET_MACHO_BRANCH_ISLANDS)
12656 output_addr_const (file, x);
12659 const char *name = XSTR (x, 0);
12661 /* Mark the decl as referenced so that cgraph will
12662 output the function. */
12663 if (SYMBOL_REF_DECL (x))
12664 mark_decl_referenced (SYMBOL_REF_DECL (x));
12667 if (MACHOPIC_INDIRECT
12668 && machopic_classify_symbol (x) == MACHOPIC_UNDEFINED_FUNCTION)
12669 name = machopic_indirection_name (x, /*stub_p=*/true);
12671 assemble_name (file, name);
12673 if (!TARGET_MACHO && !(TARGET_64BIT && DEFAULT_ABI == MS_ABI)
12674 && code == 'P' && ! SYMBOL_REF_LOCAL_P (x))
12675 fputs ("@PLT", file);
12682 ASM_GENERATE_INTERNAL_LABEL (buf, "L", CODE_LABEL_NUMBER (x));
12683 assemble_name (asm_out_file, buf);
12687 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
12691 /* This used to output parentheses around the expression,
12692 but that does not work on the 386 (either ATT or BSD assembler). */
12693 output_pic_addr_const (file, XEXP (x, 0), code);
12697 if (GET_MODE (x) == VOIDmode)
12699 /* We can use %d if the number is <32 bits and positive. */
12700 if (CONST_DOUBLE_HIGH (x) || CONST_DOUBLE_LOW (x) < 0)
12701 fprintf (file, "0x%lx%08lx",
12702 (unsigned long) CONST_DOUBLE_HIGH (x),
12703 (unsigned long) CONST_DOUBLE_LOW (x));
12705 fprintf (file, HOST_WIDE_INT_PRINT_DEC, CONST_DOUBLE_LOW (x));
12708 /* We can't handle floating point constants;
12709 TARGET_PRINT_OPERAND must handle them. */
12710 output_operand_lossage ("floating constant misused");
12714 /* Some assemblers need integer constants to appear first. */
12715 if (CONST_INT_P (XEXP (x, 0)))
12717 output_pic_addr_const (file, XEXP (x, 0), code);
12719 output_pic_addr_const (file, XEXP (x, 1), code);
12723 gcc_assert (CONST_INT_P (XEXP (x, 1)));
12724 output_pic_addr_const (file, XEXP (x, 1), code);
12726 output_pic_addr_const (file, XEXP (x, 0), code);
12732 putc (ASSEMBLER_DIALECT == ASM_INTEL ? '(' : '[', file);
12733 output_pic_addr_const (file, XEXP (x, 0), code);
12735 output_pic_addr_const (file, XEXP (x, 1), code);
12737 putc (ASSEMBLER_DIALECT == ASM_INTEL ? ')' : ']', file);
12741 if (XINT (x, 1) == UNSPEC_STACK_CHECK)
12743 bool f = i386_asm_output_addr_const_extra (file, x);
12748 gcc_assert (XVECLEN (x, 0) == 1);
12749 output_pic_addr_const (file, XVECEXP (x, 0, 0), code);
12750 switch (XINT (x, 1))
12753 fputs ("@GOT", file);
12755 case UNSPEC_GOTOFF:
12756 fputs ("@GOTOFF", file);
12758 case UNSPEC_PLTOFF:
12759 fputs ("@PLTOFF", file);
12762 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
12763 "(%rip)" : "[rip]", file);
12765 case UNSPEC_GOTPCREL:
12766 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
12767 "@GOTPCREL(%rip)" : "@GOTPCREL[rip]", file);
12769 case UNSPEC_GOTTPOFF:
12770 /* FIXME: This might be @TPOFF in Sun ld too. */
12771 fputs ("@gottpoff", file);
12774 fputs ("@tpoff", file);
12776 case UNSPEC_NTPOFF:
12778 fputs ("@tpoff", file);
12780 fputs ("@ntpoff", file);
12782 case UNSPEC_DTPOFF:
12783 fputs ("@dtpoff", file);
12785 case UNSPEC_GOTNTPOFF:
12787 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
12788 "@gottpoff(%rip)": "@gottpoff[rip]", file);
12790 fputs ("@gotntpoff", file);
12792 case UNSPEC_INDNTPOFF:
12793 fputs ("@indntpoff", file);
12796 case UNSPEC_MACHOPIC_OFFSET:
12798 machopic_output_function_base_name (file);
12802 output_operand_lossage ("invalid UNSPEC as operand");
12808 output_operand_lossage ("invalid expression as operand");
12812 /* This is called from dwarf2out.c via TARGET_ASM_OUTPUT_DWARF_DTPREL.
12813 We need to emit DTP-relative relocations. */
12815 static void ATTRIBUTE_UNUSED
12816 i386_output_dwarf_dtprel (FILE *file, int size, rtx x)
12818 fputs (ASM_LONG, file);
12819 output_addr_const (file, x);
12820 fputs ("@dtpoff", file);
12826 fputs (", 0", file);
12829 gcc_unreachable ();
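/* Illustrative output (symbol name hypothetical): for a 4-byte
   request the function above emits something like ".long foo@dtpoff";
   for an 8-byte request it appends ", 0" so the 32-bit DTP-relative
   value is zero-extended to 8 bytes.  */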
12833 /* Return true if X is a representation of the PIC register. This copes
12834 with calls from ix86_find_base_term, where the register might have
12835 been replaced by a cselib value. */
12838 ix86_pic_register_p (rtx x)
12840 if (GET_CODE (x) == VALUE && CSELIB_VAL_PTR (x))
12841 return (pic_offset_table_rtx
12842 && rtx_equal_for_cselib_p (x, pic_offset_table_rtx));
12844 return REG_P (x) && REGNO (x) == PIC_OFFSET_TABLE_REGNUM;
12847 /* Helper function for ix86_delegitimize_address.
12848 Attempt to delegitimize TLS local-exec accesses. */
12851 ix86_delegitimize_tls_address (rtx orig_x)
12853 rtx x = orig_x, unspec;
12854 struct ix86_address addr;
12856 if (!TARGET_TLS_DIRECT_SEG_REFS)
12860 if (GET_CODE (x) != PLUS || GET_MODE (x) != Pmode)
12862 if (ix86_decompose_address (x, &addr) == 0
12863 || addr.seg != (TARGET_64BIT ? SEG_FS : SEG_GS)
12864 || addr.disp == NULL_RTX
12865 || GET_CODE (addr.disp) != CONST)
12867 unspec = XEXP (addr.disp, 0);
12868 if (GET_CODE (unspec) == PLUS && CONST_INT_P (XEXP (unspec, 1)))
12869 unspec = XEXP (unspec, 0);
12870 if (GET_CODE (unspec) != UNSPEC || XINT (unspec, 1) != UNSPEC_NTPOFF)
12872 x = XVECEXP (unspec, 0, 0);
12873 gcc_assert (GET_CODE (x) == SYMBOL_REF);
12874 if (unspec != XEXP (addr.disp, 0))
12875 x = gen_rtx_PLUS (Pmode, x, XEXP (XEXP (addr.disp, 0), 1));
12878 rtx idx = addr.index;
12879 if (addr.scale != 1)
12880 idx = gen_rtx_MULT (Pmode, idx, GEN_INT (addr.scale));
12881 x = gen_rtx_PLUS (Pmode, idx, x);
12884 x = gen_rtx_PLUS (Pmode, addr.base, x);
12885 if (MEM_P (orig_x))
12886 x = replace_equiv_address_nv (orig_x, x);
12890 /* In the name of slightly smaller debug output, and to cater to
12891 general assembler lossage, recognize PIC+GOTOFF and turn it back
12892 into a direct symbol reference.
12894 On Darwin, this is necessary to avoid a crash, because Darwin
12895 has a different PIC label for each routine but the DWARF debugging
12896 information is not associated with any particular routine, so it's
12897 necessary to remove references to the PIC label from RTL stored by
12898 the DWARF output code. */
12901 ix86_delegitimize_address (rtx x)
12903 rtx orig_x = delegitimize_mem_from_attrs (x);
12904 /* addend is NULL or some rtx if x is something+GOTOFF where
12905 something doesn't include the PIC register. */
12906 rtx addend = NULL_RTX;
12907 /* reg_addend is NULL or a multiple of some register. */
12908 rtx reg_addend = NULL_RTX;
12909 /* const_addend is NULL or a const_int. */
12910 rtx const_addend = NULL_RTX;
12911 /* This is the result, or NULL. */
12912 rtx result = NULL_RTX;
12921 if (GET_CODE (x) != CONST
12922 || GET_CODE (XEXP (x, 0)) != UNSPEC
12923 || (XINT (XEXP (x, 0), 1) != UNSPEC_GOTPCREL
12924 && XINT (XEXP (x, 0), 1) != UNSPEC_PCREL)
12925 || !MEM_P (orig_x))
12926 return ix86_delegitimize_tls_address (orig_x);
12927 x = XVECEXP (XEXP (x, 0), 0, 0);
12928 if (GET_MODE (orig_x) != Pmode)
12930 x = simplify_gen_subreg (GET_MODE (orig_x), x, Pmode, 0);
12937 if (GET_CODE (x) != PLUS
12938 || GET_CODE (XEXP (x, 1)) != CONST)
12939 return ix86_delegitimize_tls_address (orig_x);
12941 if (ix86_pic_register_p (XEXP (x, 0)))
12942 /* %ebx + GOT/GOTOFF */
12944 else if (GET_CODE (XEXP (x, 0)) == PLUS)
12946 /* %ebx + %reg * scale + GOT/GOTOFF */
12947 reg_addend = XEXP (x, 0);
12948 if (ix86_pic_register_p (XEXP (reg_addend, 0)))
12949 reg_addend = XEXP (reg_addend, 1);
12950 else if (ix86_pic_register_p (XEXP (reg_addend, 1)))
12951 reg_addend = XEXP (reg_addend, 0);
12954 reg_addend = NULL_RTX;
12955 addend = XEXP (x, 0);
12959 addend = XEXP (x, 0);
12961 x = XEXP (XEXP (x, 1), 0);
12962 if (GET_CODE (x) == PLUS
12963 && CONST_INT_P (XEXP (x, 1)))
12965 const_addend = XEXP (x, 1);
12969 if (GET_CODE (x) == UNSPEC
12970 && ((XINT (x, 1) == UNSPEC_GOT && MEM_P (orig_x) && !addend)
12971 || (XINT (x, 1) == UNSPEC_GOTOFF && !MEM_P (orig_x))))
12972 result = XVECEXP (x, 0, 0);
12974 if (TARGET_MACHO && darwin_local_data_pic (x)
12975 && !MEM_P (orig_x))
12976 result = XVECEXP (x, 0, 0);
12979 return ix86_delegitimize_tls_address (orig_x);
12982 result = gen_rtx_CONST (Pmode, gen_rtx_PLUS (Pmode, result, const_addend));
12984 result = gen_rtx_PLUS (Pmode, reg_addend, result);
12987 /* If the rest of original X doesn't involve the PIC register, add
12988 addend and subtract pic_offset_table_rtx. This can happen e.g.
12990 leal (%ebx, %ecx, 4), %ecx
12992 movl foo@GOTOFF(%ecx), %edx
12993 in which case we return (%ecx - %ebx) + foo. */
12994 if (pic_offset_table_rtx)
12995 result = gen_rtx_PLUS (Pmode, gen_rtx_MINUS (Pmode, copy_rtx (addend),
12996 pic_offset_table_rtx),
13001 if (GET_MODE (orig_x) != Pmode && MEM_P (orig_x))
13003 result = simplify_gen_subreg (GET_MODE (orig_x), result, Pmode, 0);
13004 if (result == NULL_RTX)
13010 /* If X is a machine specific address (i.e. a symbol or label being
13011 referenced as a displacement from the GOT implemented using an
13012 UNSPEC), then return the base term. Otherwise return X. */
13015 ix86_find_base_term (rtx x)
13021 if (GET_CODE (x) != CONST)
13023 term = XEXP (x, 0);
13024 if (GET_CODE (term) == PLUS
13025 && (CONST_INT_P (XEXP (term, 1))
13026 || GET_CODE (XEXP (term, 1)) == CONST_DOUBLE))
13027 term = XEXP (term, 0);
13028 if (GET_CODE (term) != UNSPEC
13029 || (XINT (term, 1) != UNSPEC_GOTPCREL
13030 && XINT (term, 1) != UNSPEC_PCREL))
13033 return XVECEXP (term, 0, 0);
13036 return ix86_delegitimize_address (x);
13040 put_condition_code (enum rtx_code code, enum machine_mode mode, int reverse,
13041 int fp, FILE *file)
13043 const char *suffix;
13045 if (mode == CCFPmode || mode == CCFPUmode)
13047 code = ix86_fp_compare_code_to_integer (code);
13051 code = reverse_condition (code);
13102 gcc_assert (mode == CCmode || mode == CCNOmode || mode == CCGCmode);
13106 /* ??? Use "nbe" instead of "a" for fcmov lossage on some assemblers.
13107 Those same assemblers have the same but opposite lossage on cmov. */
13108 if (mode == CCmode)
13109 suffix = fp ? "nbe" : "a";
13110 else if (mode == CCCmode)
13113 gcc_unreachable ();
13129 gcc_unreachable ();
13133 gcc_assert (mode == CCmode || mode == CCCmode);
13150 gcc_unreachable ();
13154 /* ??? As above. */
13155 gcc_assert (mode == CCmode || mode == CCCmode);
13156 suffix = fp ? "nb" : "ae";
13159 gcc_assert (mode == CCmode || mode == CCGCmode || mode == CCNOmode);
13163 /* ??? As above. */
13164 if (mode == CCmode)
13166 else if (mode == CCCmode)
13167 suffix = fp ? "nb" : "ae";
13169 gcc_unreachable ();
13172 suffix = fp ? "u" : "p";
13175 suffix = fp ? "nu" : "np";
13178 gcc_unreachable ();
13180 fputs (suffix, file);
13183 /* Print the name of register X to FILE based on its machine mode and number.
13184 If CODE is 'w', pretend the mode is HImode.
13185 If CODE is 'b', pretend the mode is QImode.
13186 If CODE is 'k', pretend the mode is SImode.
13187 If CODE is 'q', pretend the mode is DImode.
13188 If CODE is 'x', pretend the mode is V4SFmode.
13189 If CODE is 't', pretend the mode is V8SFmode.
13190 If CODE is 'h', pretend the reg is the 'high' byte register.
13191 If CODE is 'y', print "st(0)" instead of "st", if the reg is a stack op.
13192 If CODE is 'd', duplicate the operand for an AVX instruction.
13196 print_reg (rtx x, int code, FILE *file)
13199 bool duplicated = code == 'd' && TARGET_AVX;
13201 gcc_assert (x == pc_rtx
13202 || (REGNO (x) != ARG_POINTER_REGNUM
13203 && REGNO (x) != FRAME_POINTER_REGNUM
13204 && REGNO (x) != FLAGS_REG
13205 && REGNO (x) != FPSR_REG
13206 && REGNO (x) != FPCR_REG));
13208 if (ASSEMBLER_DIALECT == ASM_ATT)
13213 gcc_assert (TARGET_64BIT);
13214 fputs ("rip", file);
13218 if (code == 'w' || MMX_REG_P (x))
13220 else if (code == 'b')
13222 else if (code == 'k')
13224 else if (code == 'q')
13226 else if (code == 'y')
13228 else if (code == 'h')
13230 else if (code == 'x')
13232 else if (code == 't')
13235 code = GET_MODE_SIZE (GET_MODE (x));
13237 /* Irritatingly, AMD extended registers use a different naming convention
13238 from the normal registers.  */
13239 if (REX_INT_REG_P (x))
13241 gcc_assert (TARGET_64BIT);
13245 error ("extended registers have no high halves");
13248 fprintf (file, "r%ib", REGNO (x) - FIRST_REX_INT_REG + 8);
13251 fprintf (file, "r%iw", REGNO (x) - FIRST_REX_INT_REG + 8);
13254 fprintf (file, "r%id", REGNO (x) - FIRST_REX_INT_REG + 8);
13257 fprintf (file, "r%i", REGNO (x) - FIRST_REX_INT_REG + 8);
13260 error ("unsupported operand size for extended register");
13270 if (STACK_TOP_P (x))
13279 if (! ANY_FP_REG_P (x))
13280 putc (code == 8 && TARGET_64BIT ? 'r' : 'e', file);
13285 reg = hi_reg_name[REGNO (x)];
13288 if (REGNO (x) >= ARRAY_SIZE (qi_reg_name))
13290 reg = qi_reg_name[REGNO (x)];
13293 if (REGNO (x) >= ARRAY_SIZE (qi_high_reg_name))
13295 reg = qi_high_reg_name[REGNO (x)];
13300 gcc_assert (!duplicated);
13302 fputs (hi_reg_name[REGNO (x)] + 1, file);
13307 gcc_unreachable ();
13313 if (ASSEMBLER_DIALECT == ASM_ATT)
13314 fprintf (file, ", %%%s", reg);
13316 fprintf (file, ", %s", reg);
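/* A few illustrative expansions of the codes handled above, assuming
   AT&T syntax: given (reg:DI ax), code 'b' prints "%al", 'w' prints
   "%ax", 'k' prints "%eax" and 'q' prints "%rax"; with TARGET_AVX,
   code 'd' additionally prints ", %xmm0" after "%xmm0", so the same
   register appears twice in the three-operand VEX form.  */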
13320 /* Locate some local-dynamic symbol still in use by this function
13321 so that we can print its name in some tls_local_dynamic_base
13325 get_some_local_dynamic_name_1 (rtx *px, void *data ATTRIBUTE_UNUSED)
13329 if (GET_CODE (x) == SYMBOL_REF
13330 && SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC)
13332 cfun->machine->some_ld_name = XSTR (x, 0);
13339 static const char *
13340 get_some_local_dynamic_name (void)
13344 if (cfun->machine->some_ld_name)
13345 return cfun->machine->some_ld_name;
13347 for (insn = get_insns (); insn ; insn = NEXT_INSN (insn))
13348 if (NONDEBUG_INSN_P (insn)
13349 && for_each_rtx (&PATTERN (insn), get_some_local_dynamic_name_1, 0))
13350 return cfun->machine->some_ld_name;
13355 /* Meaning of CODE:
13356 L,W,B,Q,S,T -- print the opcode suffix for specified size of operand.
13357 C -- print opcode suffix for set/cmov insn.
13358 c -- like C, but print reversed condition
13359 F,f -- likewise, but for floating-point.
13360 O -- if HAVE_AS_IX86_CMOV_SUN_SYNTAX, expand to "w.", "l." or "q.",
13362 R -- print the prefix for register names.
13363 z -- print the opcode suffix for the size of the current operand.
13364 Z -- likewise, with special suffixes for x87 instructions.
13365 * -- print a star (in certain assembler syntax)
13366 A -- print an absolute memory reference.
13367 w -- print the operand as if it's a "word" (HImode) even if it isn't.
13368 s -- print a shift double count, followed by the assembler's argument
13370 b -- print the QImode name of the register for the indicated operand.
13371 %b0 would print %al if operands[0] is reg 0.
13372 w -- likewise, print the HImode name of the register.
13373 k -- likewise, print the SImode name of the register.
13374 q -- likewise, print the DImode name of the register.
13375 x -- likewise, print the V4SFmode name of the register.
13376 t -- likewise, print the V8SFmode name of the register.
13377 h -- print the QImode name for a "high" register, either ah, bh, ch or dh.
13378 y -- print "st(0)" instead of "st" as a register.
13379 d -- print duplicated register operand for an AVX instruction.
13380 D -- print condition for SSE cmp instruction.
13381 P -- if PIC, print an @PLT suffix.
13382 p -- print raw symbol name.
13383 X -- don't print any sort of PIC '@' suffix for a symbol.
13384 & -- print some in-use local-dynamic symbol name.
13385 H -- print a memory address offset by 8; used for sse high-parts
13386 Y -- print condition for XOP pcom* instruction.
13387 + -- print a branch hint as 'cs' or 'ds' prefix
13388 ; -- print a semicolon (after prefixes, due to a bug in older gas).
13389 @ -- print the segment register of a thread base pointer load
13393 ix86_print_operand (FILE *file, rtx x, int code)
13400 if (ASSEMBLER_DIALECT == ASM_ATT)
13406 const char *name = get_some_local_dynamic_name ();
13408 output_operand_lossage ("'%%&' used without any "
13409 "local dynamic TLS references");
13411 assemble_name (file, name);
13416 switch (ASSEMBLER_DIALECT)
13423 /* Intel syntax. For absolute addresses, registers should not
13424 be surrounded by braces. */
13428 ix86_print_operand (file, x, 0);
13435 gcc_unreachable ();
13438 ix86_print_operand (file, x, 0);
13443 if (ASSEMBLER_DIALECT == ASM_ATT)
13448 if (ASSEMBLER_DIALECT == ASM_ATT)
13453 if (ASSEMBLER_DIALECT == ASM_ATT)
13458 if (ASSEMBLER_DIALECT == ASM_ATT)
13463 if (ASSEMBLER_DIALECT == ASM_ATT)
13468 if (ASSEMBLER_DIALECT == ASM_ATT)
13473 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
13475 /* Opcodes don't get size suffixes if using Intel opcodes. */
13476 if (ASSEMBLER_DIALECT == ASM_INTEL)
13479 switch (GET_MODE_SIZE (GET_MODE (x)))
13498 output_operand_lossage
13499 ("invalid operand size for operand code '%c'", code);
13504 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
13506 (0, "non-integer operand used with operand code '%c'", code);
13510 /* 387 opcodes don't get size suffixes if using Intel opcodes. */
13511 if (ASSEMBLER_DIALECT == ASM_INTEL)
13514 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
13516 switch (GET_MODE_SIZE (GET_MODE (x)))
13519 #ifdef HAVE_AS_IX86_FILDS
13529 #ifdef HAVE_AS_IX86_FILDQ
13532 fputs ("ll", file);
13540 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
13542 /* 387 opcodes don't get size suffixes
13543 if the operands are registers. */
13544 if (STACK_REG_P (x))
13547 switch (GET_MODE_SIZE (GET_MODE (x)))
13568 output_operand_lossage
13569 ("invalid operand type used with operand code '%c'", code);
13573 output_operand_lossage
13574 ("invalid operand size for operand code '%c'", code);
13592 if (CONST_INT_P (x) || ! SHIFT_DOUBLE_OMITS_COUNT)
13594 ix86_print_operand (file, x, 0);
13595 fputs (", ", file);
13600 /* Little bit of braindamage here.  The SSE compare instructions
13601 use completely different names for the comparisons than the
13602 fp conditional moves do.  */
13605 switch (GET_CODE (x))
13608 fputs ("eq", file);
13611 fputs ("eq_us", file);
13614 fputs ("lt", file);
13617 fputs ("nge", file);
13620 fputs ("le", file);
13623 fputs ("ngt", file);
13626 fputs ("unord", file);
13629 fputs ("neq", file);
13632 fputs ("neq_oq", file);
13635 fputs ("ge", file);
13638 fputs ("nlt", file);
13641 fputs ("gt", file);
13644 fputs ("nle", file);
13647 fputs ("ord", file);
13650 output_operand_lossage ("operand is not a condition code, "
13651 "invalid operand code 'D'");
13657 switch (GET_CODE (x))
13661 fputs ("eq", file);
13665 fputs ("lt", file);
13669 fputs ("le", file);
13672 fputs ("unord", file);
13676 fputs ("neq", file);
13680 fputs ("nlt", file);
13684 fputs ("nle", file);
13687 fputs ("ord", file);
13690 output_operand_lossage ("operand is not a condition code, "
13691 "invalid operand code 'D'");
13697 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
13698 if (ASSEMBLER_DIALECT == ASM_ATT)
13700 switch (GET_MODE (x))
13702 case HImode: putc ('w', file); break;
13704 case SFmode: putc ('l', file); break;
13706 case DFmode: putc ('q', file); break;
13707 default: gcc_unreachable ();
13714 if (!COMPARISON_P (x))
13716 output_operand_lossage ("operand is neither a constant nor a "
13717 "condition code, invalid operand code "
13721 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 0, file);
13724 if (!COMPARISON_P (x))
13726 output_operand_lossage ("operand is neither a constant nor a "
13727 "condition code, invalid operand code "
13731 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
13732 if (ASSEMBLER_DIALECT == ASM_ATT)
13735 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 0, 1, file);
13738 /* Like above, but reverse condition */
13740 /* Check to see if argument to %c is really a constant
13741 and not a condition code which needs to be reversed. */
13742 if (!COMPARISON_P (x))
13744 output_operand_lossage ("operand is neither a constant nor a "
13745 "condition code, invalid operand "
13749 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 0, file);
13752 if (!COMPARISON_P (x))
13754 output_operand_lossage ("operand is neither a constant nor a "
13755 "condition code, invalid operand "
13759 #ifdef HAVE_AS_IX86_CMOV_SUN_SYNTAX
13760 if (ASSEMBLER_DIALECT == ASM_ATT)
13763 put_condition_code (GET_CODE (x), GET_MODE (XEXP (x, 0)), 1, 1, file);
13767 /* It doesn't actually matter what mode we use here, as we're
13768 only going to use this for printing. */
13769 x = adjust_address_nv (x, DImode, 8);
13777 || optimize_function_for_size_p (cfun) || !TARGET_BRANCH_PREDICTION_HINTS)
13780 x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
13783 int pred_val = INTVAL (XEXP (x, 0));
13785 if (pred_val < REG_BR_PROB_BASE * 45 / 100
13786 || pred_val > REG_BR_PROB_BASE * 55 / 100)
13788 int taken = pred_val > REG_BR_PROB_BASE / 2;
13789 int cputaken = final_forward_branch_p (current_output_insn) == 0;
13791 /* Emit hints only in the case the default branch prediction
13792 heuristics would fail.  */
13793 if (taken != cputaken)
13795 /* We use 3e (DS) prefix for taken branches and
13796 2e (CS) prefix for not taken branches. */
13798 fputs ("ds ; ", file);
13800 fputs ("cs ; ", file);
13808 switch (GET_CODE (x))
13811 fputs ("neq", file);
13814 fputs ("eq", file);
13818 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "ge" : "unlt", file);
13822 fputs (INTEGRAL_MODE_P (GET_MODE (x)) ? "gt" : "unle", file);
13826 fputs ("le", file);
13830 fputs ("lt", file);
13833 fputs ("unord", file);
13836 fputs ("ord", file);
13839 fputs ("ueq", file);
13842 fputs ("nlt", file);
13845 fputs ("nle", file);
13848 fputs ("ule", file);
13851 fputs ("ult", file);
13854 fputs ("une", file);
13857 output_operand_lossage ("operand is not a condition code, "
13858 "invalid operand code 'Y'");
13864 #ifndef HAVE_AS_IX86_REP_LOCK_PREFIX
13870 if (ASSEMBLER_DIALECT == ASM_ATT)
13873 /* The kernel uses a different segment register for performance
13874 reasons; a system call would not have to trash the userspace
13875 segment register, which would be expensive. */
13876 if (TARGET_64BIT && ix86_cmodel != CM_KERNEL)
13877 fputs ("fs", file);
13879 fputs ("gs", file);
13883 output_operand_lossage ("invalid operand code '%c'", code);
13888 print_reg (x, code, file);
13890 else if (MEM_P (x))
13892 /* No `byte ptr' prefix for call instructions or BLKmode operands. */
13893 if (ASSEMBLER_DIALECT == ASM_INTEL && code != 'X' && code != 'P'
13894 && GET_MODE (x) != BLKmode)
13897 switch (GET_MODE_SIZE (GET_MODE (x)))
13899 case 1: size = "BYTE"; break;
13900 case 2: size = "WORD"; break;
13901 case 4: size = "DWORD"; break;
13902 case 8: size = "QWORD"; break;
13903 case 12: size = "TBYTE"; break;
13905 if (GET_MODE (x) == XFmode)
13910 case 32: size = "YMMWORD"; break;
13912 gcc_unreachable ();
13915 /* Check for explicit size override (codes 'b', 'w' and 'k').  */
13918 else if (code == 'w')
13920 else if (code == 'k')
13923 fputs (size, file);
13924 fputs (" PTR ", file);
13928 /* Avoid (%rip) for call operands. */
13929 if (CONSTANT_ADDRESS_P (x) && code == 'P'
13930 && !CONST_INT_P (x))
13931 output_addr_const (file, x);
13932 else if (this_is_asm_operands && ! address_operand (x, VOIDmode))
13933 output_operand_lossage ("invalid constraints for operand");
13935 output_address (x);
13938 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == SFmode)
13943 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
13944 REAL_VALUE_TO_TARGET_SINGLE (r, l);
13946 if (ASSEMBLER_DIALECT == ASM_ATT)
13948 /* Sign extend 32bit SFmode immediate to 8 bytes. */
13950 fprintf (file, "0x%08llx", (unsigned long long) (int) l);
13952 fprintf (file, "0x%08x", (unsigned int) l);
13955 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode)
13960 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
13961 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
13963 if (ASSEMBLER_DIALECT == ASM_ATT)
13965 fprintf (file, "0x%lx%08lx", l[1] & 0xffffffff, l[0] & 0xffffffff);
13968 /* These float cases don't actually occur as immediate operands. */
13969 else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == XFmode)
13973 real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1);
13974 fputs (dstr, file);
13979 /* We have patterns that allow zero sets of memory, for instance.
13980 In 64-bit mode, we should probably support all 8-byte vectors,
13981 since we can in fact encode that into an immediate. */
13982 if (GET_CODE (x) == CONST_VECTOR)
13984 gcc_assert (x == CONST0_RTX (GET_MODE (x)));
13988 if (code != 'P' && code != 'p')
13990 if (CONST_INT_P (x) || GET_CODE (x) == CONST_DOUBLE)
13992 if (ASSEMBLER_DIALECT == ASM_ATT)
13995 else if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF
13996 || GET_CODE (x) == LABEL_REF)
13998 if (ASSEMBLER_DIALECT == ASM_ATT)
14001 fputs ("OFFSET FLAT:", file);
14004 if (CONST_INT_P (x))
14005 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
14006 else if (flag_pic || MACHOPIC_INDIRECT)
14007 output_pic_addr_const (file, x, code);
14009 output_addr_const (file, x);
14014 ix86_print_operand_punct_valid_p (unsigned char code)
14016 return (code == '@' || code == '*' || code == '+'
14017 || code == '&' || code == ';');
14020 /* Print a memory operand whose address is ADDR. */
14023 ix86_print_operand_address (FILE *file, rtx addr)
14025 struct ix86_address parts;
14026 rtx base, index, disp;
14028 int ok = ix86_decompose_address (addr, &parts);
14033 index = parts.index;
14035 scale = parts.scale;
14043 if (ASSEMBLER_DIALECT == ASM_ATT)
14045 fputs ((parts.seg == SEG_FS ? "fs:" : "gs:"), file);
14048 gcc_unreachable ();
14051 /* Use one byte shorter RIP relative addressing for 64bit mode. */
14052 if (TARGET_64BIT && !base && !index)
14056 if (GET_CODE (disp) == CONST
14057 && GET_CODE (XEXP (disp, 0)) == PLUS
14058 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14059 symbol = XEXP (XEXP (disp, 0), 0);
14061 if (GET_CODE (symbol) == LABEL_REF
14062 || (GET_CODE (symbol) == SYMBOL_REF
14063 && SYMBOL_REF_TLS_MODEL (symbol) == 0))
14066 if (!base && !index)
14068 /* Displacement only requires special attention. */
14070 if (CONST_INT_P (disp))
14072 if (ASSEMBLER_DIALECT == ASM_INTEL && parts.seg == SEG_DEFAULT)
14073 fputs ("ds:", file);
14074 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (disp));
14077 output_pic_addr_const (file, disp, 0);
14079 output_addr_const (file, disp);
14083 if (ASSEMBLER_DIALECT == ASM_ATT)
14088 output_pic_addr_const (file, disp, 0);
14089 else if (GET_CODE (disp) == LABEL_REF)
14090 output_asm_label (disp);
14092 output_addr_const (file, disp);
14097 print_reg (base, 0, file);
14101 print_reg (index, 0, file);
14103 fprintf (file, ",%d", scale);
14109 rtx offset = NULL_RTX;
14113 /* Pull out the offset of a symbol; print any symbol itself. */
14114 if (GET_CODE (disp) == CONST
14115 && GET_CODE (XEXP (disp, 0)) == PLUS
14116 && CONST_INT_P (XEXP (XEXP (disp, 0), 1)))
14118 offset = XEXP (XEXP (disp, 0), 1);
14119 disp = gen_rtx_CONST (VOIDmode,
14120 XEXP (XEXP (disp, 0), 0));
14124 output_pic_addr_const (file, disp, 0);
14125 else if (GET_CODE (disp) == LABEL_REF)
14126 output_asm_label (disp);
14127 else if (CONST_INT_P (disp))
14130 output_addr_const (file, disp);
14136 print_reg (base, 0, file);
14139 if (INTVAL (offset) >= 0)
14141 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14145 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset));
14152 print_reg (index, 0, file);
14154 fprintf (file, "*%d", scale);
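/* Putting the two dialects side by side: an address of the form
   base + index*4 + 16 comes out as "16(%ebx,%ecx,4)" on the AT&T path
   above and as "[ebx+ecx*4+16]" on the bracketed Intel path (the
   register names here are placeholders).  */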
14161 /* Implementation of TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
14164 i386_asm_output_addr_const_extra (FILE *file, rtx x)
14168 if (GET_CODE (x) != UNSPEC)
14171 op = XVECEXP (x, 0, 0);
14172 switch (XINT (x, 1))
14174 case UNSPEC_GOTTPOFF:
14175 output_addr_const (file, op);
14176 /* FIXME: This might be @TPOFF in Sun ld. */
14177 fputs ("@gottpoff", file);
14180 output_addr_const (file, op);
14181 fputs ("@tpoff", file);
14183 case UNSPEC_NTPOFF:
14184 output_addr_const (file, op);
14186 fputs ("@tpoff", file);
14188 fputs ("@ntpoff", file);
14190 case UNSPEC_DTPOFF:
14191 output_addr_const (file, op);
14192 fputs ("@dtpoff", file);
14194 case UNSPEC_GOTNTPOFF:
14195 output_addr_const (file, op);
14197 fputs (ASSEMBLER_DIALECT == ASM_ATT ?
14198 "@gottpoff(%rip)" : "@gottpoff[rip]", file);
14200 fputs ("@gotntpoff", file);
14202 case UNSPEC_INDNTPOFF:
14203 output_addr_const (file, op);
14204 fputs ("@indntpoff", file);
14207 case UNSPEC_MACHOPIC_OFFSET:
14208 output_addr_const (file, op);
14210 machopic_output_function_base_name (file);
14214 case UNSPEC_STACK_CHECK:
14218 gcc_assert (flag_split_stack);
14220 #ifdef TARGET_THREAD_SPLIT_STACK_OFFSET
14221 offset = TARGET_THREAD_SPLIT_STACK_OFFSET;
14223 gcc_unreachable ();
14226 fprintf (file, "%s:%d", TARGET_64BIT ? "%fs" : "%gs", offset);
14237 /* Split one or more double-mode RTL references into pairs of half-mode
14238 references. The RTL can be REG, offsettable MEM, integer constant, or
14239 CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
14240 split and "num" is its length. lo_half and hi_half are output arrays
14241 that parallel "operands". */
14244 split_double_mode (enum machine_mode mode, rtx operands[],
14245 int num, rtx lo_half[], rtx hi_half[])
14247 enum machine_mode half_mode;
14253 half_mode = DImode;
14256 half_mode = SImode;
14259 gcc_unreachable ();
14262 byte = GET_MODE_SIZE (half_mode);
14266 rtx op = operands[num];
14268 /* simplify_subreg refuses to split volatile memory addresses,
14269 but we still have to handle them.  */
14272 lo_half[num] = adjust_address (op, half_mode, 0);
14273 hi_half[num] = adjust_address (op, half_mode, byte);
14277 lo_half[num] = simplify_gen_subreg (half_mode, op,
14278 GET_MODE (op) == VOIDmode
14279 ? mode : GET_MODE (op), 0);
14280 hi_half[num] = simplify_gen_subreg (half_mode, op,
14281 GET_MODE (op) == VOIDmode
14282 ? mode : GET_MODE (op), byte);
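/* Rough example for a 32-bit compilation: splitting a DImode pseudo
   yields lo = (subreg:SI (reg:DI N) 0) and hi = (subreg:SI (reg:DI N) 4),
   while an offsettable DImode MEM is split via adjust_address into two
   SImode references 4 bytes apart.  */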
14287 /* Output code to perform a 387 binary operation in INSN, one of PLUS,
14288 MINUS, MULT or DIV. OPERANDS are the insn operands, where operands[3]
14289 is the expression of the binary operation. The output may either be
14290 emitted here, or returned to the caller, like all output_* functions.
14292 There is no guarantee that the operands are the same mode, as they
14293 might be within FLOAT or FLOAT_EXTEND expressions. */
14295 #ifndef SYSV386_COMPAT
14296 /* Set to 1 for compatibility with brain-damaged assemblers. No-one
14297 wants to fix the assemblers because that causes incompatibility
14298 with gcc. No-one wants to fix gcc because that causes
14299 incompatibility with assemblers... You can use the option of
14300 -DSYSV386_COMPAT=0 if you recompile both gcc and gas this way. */
14301 #define SYSV386_COMPAT 1
14305 output_387_binary_op (rtx insn, rtx *operands)
14307 static char buf[40];
14310 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]);
14312 #ifdef ENABLE_CHECKING
14313 /* Even if we do not want to check the inputs, this documents the input
14314 constraints, which helps in understanding the following code.  */
14315 if (STACK_REG_P (operands[0])
14316 && ((REG_P (operands[1])
14317 && REGNO (operands[0]) == REGNO (operands[1])
14318 && (STACK_REG_P (operands[2]) || MEM_P (operands[2])))
14319 || (REG_P (operands[2])
14320 && REGNO (operands[0]) == REGNO (operands[2])
14321 && (STACK_REG_P (operands[1]) || MEM_P (operands[1]))))
14322 && (STACK_TOP_P (operands[1]) || STACK_TOP_P (operands[2])))
14325 gcc_assert (is_sse);
14328 switch (GET_CODE (operands[3]))
14331 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14332 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14340 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14341 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14349 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14350 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14358 if (GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_INT
14359 || GET_MODE_CLASS (GET_MODE (operands[2])) == MODE_INT)
14367 gcc_unreachable ();
14374 strcpy (buf, ssep);
14375 if (GET_MODE (operands[0]) == SFmode)
14376 strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}");
14378 strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}");
14382 strcpy (buf, ssep + 1);
14383 if (GET_MODE (operands[0]) == SFmode)
14384 strcat (buf, "ss\t{%2, %0|%0, %2}");
14386 strcat (buf, "sd\t{%2, %0|%0, %2}");
14392 switch (GET_CODE (operands[3]))
14396 if (REG_P (operands[2]) && REGNO (operands[0]) == REGNO (operands[2]))
14398 rtx temp = operands[2];
14399 operands[2] = operands[1];
14400 operands[1] = temp;
14403 /* We know operands[0] == operands[1] at this point.  */
14405 if (MEM_P (operands[2]))
14411 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
14413 if (STACK_TOP_P (operands[0]))
14414 /* How is it that we are storing to a dead operand[2]?
14415 Well, presumably operands[1] is dead too. We can't
14416 store the result to st(0) as st(0) gets popped on this
14417 instruction. Instead store to operands[2] (which I
14418 think has to be st(1)). st(1) will be popped later.
14419 gcc <= 2.8.1 didn't have this check and generated
14420 assembly code that the Unixware assembler rejected. */
14421 p = "p\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
14423 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
14427 if (STACK_TOP_P (operands[0]))
14428 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
14430 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
14435 if (MEM_P (operands[1]))
14441 if (MEM_P (operands[2]))
14447 if (find_regno_note (insn, REG_DEAD, REGNO (operands[2])))
14450 /* The SystemV/386 SVR3.2 assembler, and probably all AT&T
14451 derived assemblers, confusingly reverse the direction of
14452 the operation for fsub{r} and fdiv{r} when the
14453 destination register is not st(0). The Intel assembler
14454 doesn't have this brain damage. Read !SYSV386_COMPAT to
14455 figure out what the hardware really does. */
14456 if (STACK_TOP_P (operands[0]))
14457 p = "{p\t%0, %2|rp\t%2, %0}";
14459 p = "{rp\t%2, %0|p\t%0, %2}";
14461 if (STACK_TOP_P (operands[0]))
14462 /* As above for fmul/fadd, we can't store to st(0). */
14463 p = "rp\t{%0, %2|%2, %0}"; /* st(1) = st(0) op st(1); pop */
14465 p = "p\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0); pop */
14470 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
14473 if (STACK_TOP_P (operands[0]))
14474 p = "{rp\t%0, %1|p\t%1, %0}";
14476 p = "{p\t%1, %0|rp\t%0, %1}";
14478 if (STACK_TOP_P (operands[0]))
14479 p = "p\t{%0, %1|%1, %0}"; /* st(1) = st(1) op st(0); pop */
14481 p = "rp\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2); pop */
14486 if (STACK_TOP_P (operands[0]))
14488 if (STACK_TOP_P (operands[1]))
14489 p = "\t{%y2, %0|%0, %y2}"; /* st(0) = st(0) op st(r2) */
14491 p = "r\t{%y1, %0|%0, %y1}"; /* st(0) = st(r1) op st(0) */
14494 else if (STACK_TOP_P (operands[1]))
14497 p = "{\t%1, %0|r\t%0, %1}";
14499 p = "r\t{%1, %0|%0, %1}"; /* st(r2) = st(0) op st(r2) */
14505 p = "{r\t%2, %0|\t%0, %2}";
14507 p = "\t{%2, %0|%0, %2}"; /* st(r1) = st(r1) op st(0) */
14513 gcc_unreachable ();
14520 /* Return needed mode for entity in optimize_mode_switching pass. */
14523 ix86_mode_needed (int entity, rtx insn)
14525 enum attr_i387_cw mode;
14527 /* The mode UNINITIALIZED is used to store the control word after a
14528 function call or ASM pattern.  The mode ANY specifies that the function
14529 has no requirements on the control word and makes no changes in the
14530 bits we are interested in.  */
14533 || (NONJUMP_INSN_P (insn)
14534 && (asm_noperands (PATTERN (insn)) >= 0
14535 || GET_CODE (PATTERN (insn)) == ASM_INPUT)))
14536 return I387_CW_UNINITIALIZED;
14538 if (recog_memoized (insn) < 0)
14539 return I387_CW_ANY;
14541 mode = get_attr_i387_cw (insn);
14546 if (mode == I387_CW_TRUNC)
14551 if (mode == I387_CW_FLOOR)
14556 if (mode == I387_CW_CEIL)
14561 if (mode == I387_CW_MASK_PM)
14566 gcc_unreachable ();
14569 return I387_CW_ANY;
14572 /* Output code to initialize control word copies used by trunc?f?i and
14573 rounding patterns.  CURRENT_MODE is set to the current control word,
14574 while NEW_MODE is set to the new control word.  */
14577 emit_i387_cw_initialization (int mode)
14579 rtx stored_mode = assign_386_stack_local (HImode, SLOT_CW_STORED);
14582 enum ix86_stack_slot slot;
14584 rtx reg = gen_reg_rtx (HImode);
14586 emit_insn (gen_x86_fnstcw_1 (stored_mode));
14587 emit_move_insn (reg, copy_rtx (stored_mode));
14589 if (TARGET_64BIT || TARGET_PARTIAL_REG_STALL
14590 || optimize_function_for_size_p (cfun))
14594 case I387_CW_TRUNC:
14595 /* round toward zero (truncate) */
14596 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0c00)));
14597 slot = SLOT_CW_TRUNC;
14600 case I387_CW_FLOOR:
14601 /* round down toward -oo */
14602 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
14603 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0400)));
14604 slot = SLOT_CW_FLOOR;
14608 /* round up toward +oo */
14609 emit_insn (gen_andhi3 (reg, reg, GEN_INT (~0x0c00)));
14610 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0800)));
14611 slot = SLOT_CW_CEIL;
14614 case I387_CW_MASK_PM:
14615 /* mask precision exception for nearbyint() */
14616 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
14617 slot = SLOT_CW_MASK_PM;
14621 gcc_unreachable ();
14628 case I387_CW_TRUNC:
14629 /* round toward zero (truncate) */
14630 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0xc)));
14631 slot = SLOT_CW_TRUNC;
14634 case I387_CW_FLOOR:
14635 /* round down toward -oo */
14636 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x4)));
14637 slot = SLOT_CW_FLOOR;
14641 /* round up toward +oo */
14642 emit_insn (gen_movsi_insv_1 (reg, GEN_INT (0x8)));
14643 slot = SLOT_CW_CEIL;
14646 case I387_CW_MASK_PM:
14647 /* mask precision exception for nearbyint() */
14648 emit_insn (gen_iorhi3 (reg, reg, GEN_INT (0x0020)));
14649 slot = SLOT_CW_MASK_PM;
14653 gcc_unreachable ();
14657 gcc_assert (slot < MAX_386_STACK_LOCALS);
14659 new_mode = assign_386_stack_local (HImode, slot);
14660 emit_move_insn (new_mode, reg);
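/* For reference, the magic numbers above manipulate the x87 control
   word rounding-control field, bits 10-11 (mask 0x0c00): 00 = round
   to nearest, 01 = round down/floor (0x0400), 10 = round up/ceil
   (0x0800), 11 = truncate/chop (0x0c00).  Bit 5 (0x0020) is the
   precision exception mask, which is all nearbyint() needs to set.  */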
14663 /* Output code for INSN to convert a float to a signed int. OPERANDS
14664 are the insn operands. The output may be [HSD]Imode and the input
14665 operand may be [SDX]Fmode. */
14668 output_fix_trunc (rtx insn, rtx *operands, bool fisttp)
14670 int stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
14671 int dimode_p = GET_MODE (operands[0]) == DImode;
14672 int round_mode = get_attr_i387_cw (insn);
14674 /* Jump through a hoop or two for DImode, since the hardware has no
14675 non-popping instruction. We used to do this a different way, but
14676 that was somewhat fragile and broke with post-reload splitters. */
14677 if ((dimode_p || fisttp) && !stack_top_dies)
14678 output_asm_insn ("fld\t%y1", operands);
14680 gcc_assert (STACK_TOP_P (operands[1]));
14681 gcc_assert (MEM_P (operands[0]));
14682 gcc_assert (GET_MODE (operands[1]) != TFmode);
14685 output_asm_insn ("fisttp%Z0\t%0", operands);
14688 if (round_mode != I387_CW_ANY)
14689 output_asm_insn ("fldcw\t%3", operands);
14690 if (stack_top_dies || dimode_p)
14691 output_asm_insn ("fistp%Z0\t%0", operands);
14693 output_asm_insn ("fist%Z0\t%0", operands);
14694 if (round_mode != I387_CW_ANY)
14695 output_asm_insn ("fldcw\t%2", operands);
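/* So a DImode truncation typically comes out roughly as

       fldcw   %3        # load the truncating control word
       fistpll %0        # convert, store and pop
       fldcw   %2        # restore the original control word

   with a leading "fld %y1" duplicating st(0) first when it does not
   die (the mnemonic spellings here are illustrative).  */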
14701 /* Output code for x87 ffreep insn. The OPNO argument, which may only
14702 have the values zero or one, indicates the ffreep insn's operand
14703 from the OPERANDS array. */
14705 static const char *
14706 output_387_ffreep (rtx *operands ATTRIBUTE_UNUSED, int opno)
14708 if (TARGET_USE_FFREEP)
14709 #ifdef HAVE_AS_IX86_FFREEP
14710 return opno ? "ffreep\t%y1" : "ffreep\t%y0";
14713 static char retval[32];
14714 int regno = REGNO (operands[opno]);
14716 gcc_assert (FP_REGNO_P (regno));
14718 regno -= FIRST_STACK_REG;
14720 snprintf (retval, sizeof (retval), ASM_SHORT "0xc%ddf", regno);
14725 return opno ? "fstp\t%y1" : "fstp\t%y0";
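/* Illustration: for st(0), the fallback path above emits
   ASM_SHORT "0xc0df", i.e. the bytes 0xdf 0xc0 in memory order, which
   is the machine encoding of "ffreep %st(0)" spelled out by hand for
   assemblers that lack the mnemonic.  */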
14729 /* Output code for INSN to compare OPERANDS. EFLAGS_P is 1 when fcomi
14730 should be used. UNORDERED_P is true when fucom should be used. */
14733 output_fp_compare (rtx insn, rtx *operands, bool eflags_p, bool unordered_p)
14735 int stack_top_dies;
14736 rtx cmp_op0, cmp_op1;
14737 int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]);
14741 cmp_op0 = operands[0];
14742 cmp_op1 = operands[1];
14746 cmp_op0 = operands[1];
14747 cmp_op1 = operands[2];
14752 static const char ucomiss[] = "vucomiss\t{%1, %0|%0, %1}";
14753 static const char ucomisd[] = "vucomisd\t{%1, %0|%0, %1}";
14754 static const char comiss[] = "vcomiss\t{%1, %0|%0, %1}";
14755 static const char comisd[] = "vcomisd\t{%1, %0|%0, %1}";
14757 if (GET_MODE (operands[0]) == SFmode)
14759 return &ucomiss[TARGET_AVX ? 0 : 1];
14761 return &comiss[TARGET_AVX ? 0 : 1];
14764 return &ucomisd[TARGET_AVX ? 0 : 1];
14766 return &comisd[TARGET_AVX ? 0 : 1];
14769 gcc_assert (STACK_TOP_P (cmp_op0));
14771 stack_top_dies = find_regno_note (insn, REG_DEAD, FIRST_STACK_REG) != 0;
14773 if (cmp_op1 == CONST0_RTX (GET_MODE (cmp_op1)))
14775 if (stack_top_dies)
14777 output_asm_insn ("ftst\n\tfnstsw\t%0", operands);
14778 return output_387_ffreep (operands, 1);
14781 return "ftst\n\tfnstsw\t%0";
14784 if (STACK_REG_P (cmp_op1)
14786 && find_regno_note (insn, REG_DEAD, REGNO (cmp_op1))
14787 && REGNO (cmp_op1) != FIRST_STACK_REG)
14789 /* If the top of the 387 stack dies, and the other operand
14790 is also a stack register that dies, then this must be a
14791 `fcompp' float compare.  */
14795 /* There is no double popping fcomi variant. Fortunately,
14796 eflags is immune from the fstp's cc clobbering. */
14798 output_asm_insn ("fucomip\t{%y1, %0|%0, %y1}", operands);
14800 output_asm_insn ("fcomip\t{%y1, %0|%0, %y1}", operands);
14801 return output_387_ffreep (operands, 0);
14806 return "fucompp\n\tfnstsw\t%0";
14808 return "fcompp\n\tfnstsw\t%0";
14813 /* Encoded here as eflags_p | intmode | unordered_p | stack_top_dies. */
14815 static const char * const alt[16] =
14817 "fcom%Z2\t%y2\n\tfnstsw\t%0",
14818 "fcomp%Z2\t%y2\n\tfnstsw\t%0",
14819 "fucom%Z2\t%y2\n\tfnstsw\t%0",
14820 "fucomp%Z2\t%y2\n\tfnstsw\t%0",
14822 "ficom%Z2\t%y2\n\tfnstsw\t%0",
14823 "ficomp%Z2\t%y2\n\tfnstsw\t%0",
14827 "fcomi\t{%y1, %0|%0, %y1}",
14828 "fcomip\t{%y1, %0|%0, %y1}",
14829 "fucomi\t{%y1, %0|%0, %y1}",
14830 "fucomip\t{%y1, %0|%0, %y1}",
14841 mask = eflags_p << 3;
14842 mask |= (GET_MODE_CLASS (GET_MODE (cmp_op1)) == MODE_INT) << 2;
14843 mask |= unordered_p << 1;
14844 mask |= stack_top_dies;
14846 gcc_assert (mask < 16);
14855 ix86_output_addr_vec_elt (FILE *file, int value)
14857 const char *directive = ASM_LONG;
14861 directive = ASM_QUAD;
14863 gcc_assert (!TARGET_64BIT);
14866 fprintf (file, "%s%s%d\n", directive, LPREFIX, value);
14870 ix86_output_addr_diff_elt (FILE *file, int value, int rel)
14872 const char *directive = ASM_LONG;
14875 if (TARGET_64BIT && CASE_VECTOR_MODE == DImode)
14876 directive = ASM_QUAD;
14878 gcc_assert (!TARGET_64BIT);
14880 /* We can't use @GOTOFF for text labels on VxWorks; see gotoff_operand. */
14881 if (TARGET_64BIT || TARGET_VXWORKS_RTP)
14882 fprintf (file, "%s%s%d-%s%d\n",
14883 directive, LPREFIX, value, LPREFIX, rel);
14884 else if (HAVE_AS_GOTOFF_IN_DATA)
14885 fprintf (file, ASM_LONG "%s%d@GOTOFF\n", LPREFIX, value);
14887 else if (TARGET_MACHO)
14889 fprintf (file, ASM_LONG "%s%d-", LPREFIX, value);
14890 machopic_output_function_base_name (file);
14895 asm_fprintf (file, ASM_LONG "%U%s+[.-%s%d]\n",
14896 GOT_SYMBOL_NAME, LPREFIX, value);
14899 /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
14903 ix86_expand_clear (rtx dest)
14907 /* We play register width games, which are only valid after reload. */
14908 gcc_assert (reload_completed);
14910 /* Avoid HImode and its attendant prefix byte. */
14911 if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
14912 dest = gen_rtx_REG (SImode, REGNO (dest));
14913 tmp = gen_rtx_SET (VOIDmode, dest, const0_rtx);
14915 /* This predicate should match that for movsi_xor and movdi_xor_rex64. */
14916 if (!TARGET_USE_MOV0 || optimize_insn_for_speed_p ())
14918 rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
14919 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
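/* Usage sketch: clearing (reg:QI 0) is widened to SImode above and,
   on the xor path, becomes "xorl %eax, %eax" inside a PARALLEL with a
   flags clobber -- unlike "movl $0, %eax", xor clobbers the condition
   codes, so the clobber must be made explicit.  */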
14925 /* X is an unchanging MEM. If it is a constant pool reference, return
14926 the constant pool rtx, else NULL. */
14929 maybe_get_pool_constant (rtx x)
14931 x = ix86_delegitimize_address (XEXP (x, 0));
14933 if (GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x))
14934 return get_pool_constant (x);
14940 ix86_expand_move (enum machine_mode mode, rtx operands[])
14943 enum tls_model model;
14948 if (GET_CODE (op1) == SYMBOL_REF)
14950 model = SYMBOL_REF_TLS_MODEL (op1);
14953 op1 = legitimize_tls_address (op1, model, true);
14954 op1 = force_operand (op1, op0);
14958 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
14959 && SYMBOL_REF_DLLIMPORT_P (op1))
14960 op1 = legitimize_dllimport_symbol (op1, false);
14962 else if (GET_CODE (op1) == CONST
14963 && GET_CODE (XEXP (op1, 0)) == PLUS
14964 && GET_CODE (XEXP (XEXP (op1, 0), 0)) == SYMBOL_REF)
14966 rtx addend = XEXP (XEXP (op1, 0), 1);
14967 rtx symbol = XEXP (XEXP (op1, 0), 0);
14970 model = SYMBOL_REF_TLS_MODEL (symbol);
14972 tmp = legitimize_tls_address (symbol, model, true);
14973 else if (TARGET_DLLIMPORT_DECL_ATTRIBUTES
14974 && SYMBOL_REF_DLLIMPORT_P (symbol))
14975 tmp = legitimize_dllimport_symbol (symbol, true);
14979 tmp = force_operand (tmp, NULL);
14980 tmp = expand_simple_binop (Pmode, PLUS, tmp, addend,
14981 op0, 1, OPTAB_DIRECT);
14987 if ((flag_pic || MACHOPIC_INDIRECT)
14988 && mode == Pmode && symbolic_operand (op1, Pmode))
14990 if (TARGET_MACHO && !TARGET_64BIT)
14993 /* dynamic-no-pic */
14994 if (MACHOPIC_INDIRECT)
14996 rtx temp = ((reload_in_progress
14997 || ((op0 && REG_P (op0))
14999 ? op0 : gen_reg_rtx (Pmode));
15000 op1 = machopic_indirect_data_reference (op1, temp);
15002 op1 = machopic_legitimize_pic_address (op1, mode,
15003 temp == op1 ? 0 : temp);
15005 if (op0 != op1 && GET_CODE (op0) != MEM)
15007 rtx insn = gen_rtx_SET (VOIDmode, op0, op1);
15011 if (GET_CODE (op0) == MEM)
15012 op1 = force_reg (Pmode, op1);
15016 if (GET_CODE (temp) != REG)
15017 temp = gen_reg_rtx (Pmode);
15018 temp = legitimize_pic_address (op1, temp);
15023 /* dynamic-no-pic */
15029 op1 = force_reg (Pmode, op1);
15030 else if (!TARGET_64BIT || !x86_64_movabs_operand (op1, Pmode))
15032 rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
15033 op1 = legitimize_pic_address (op1, reg);
15042 && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
15043 || !push_operand (op0, mode))
15045 op1 = force_reg (mode, op1);
15047 if (push_operand (op0, mode)
15048 && ! general_no_elim_operand (op1, mode))
15049 op1 = copy_to_mode_reg (mode, op1);
15051 /* Force large constants in 64bit compilation into a register
15052 to get them CSEed.  */
15053 if (can_create_pseudo_p ()
15054 && (mode == DImode) && TARGET_64BIT
15055 && immediate_operand (op1, mode)
15056 && !x86_64_zext_immediate_operand (op1, VOIDmode)
15057 && !register_operand (op0, mode)
15059 op1 = copy_to_mode_reg (mode, op1);
15061 if (can_create_pseudo_p ()
15062 && FLOAT_MODE_P (mode)
15063 && GET_CODE (op1) == CONST_DOUBLE)
15065 /* If we are loading a floating point constant to a register,
15066 force the value to memory now, since we'll get better code
15067 out of the back end.  */
15069 op1 = validize_mem (force_const_mem (mode, op1));
15070 if (!register_operand (op0, mode))
15072 rtx temp = gen_reg_rtx (mode);
15073 emit_insn (gen_rtx_SET (VOIDmode, temp, op1));
15074 emit_move_insn (op0, temp);
15080 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15084 ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
15086 rtx op0 = operands[0], op1 = operands[1];
15087 unsigned int align = GET_MODE_ALIGNMENT (mode);
15089 /* Force constants other than zero into memory.  We do not know how
15090 the instructions used to build constants modify the upper 64 bits
15091 of the register; once we have that information we may be able
15092 to handle some of them more efficiently.  */
15093 if (can_create_pseudo_p ()
15094 && register_operand (op0, mode)
15095 && (CONSTANT_P (op1)
15096 || (GET_CODE (op1) == SUBREG
15097 && CONSTANT_P (SUBREG_REG (op1))))
15098 && !standard_sse_constant_p (op1))
15099 op1 = validize_mem (force_const_mem (mode, op1));
15101 /* We need to check memory alignment for SSE mode since an attribute
15102 can make operands unaligned.  */
15103 if (can_create_pseudo_p ()
15104 && SSE_REG_MODE_P (mode)
15105 && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
15106 || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
15110 /* ix86_expand_vector_move_misalign() does not like constants ... */
15111 if (CONSTANT_P (op1)
15112 || (GET_CODE (op1) == SUBREG
15113 && CONSTANT_P (SUBREG_REG (op1))))
15114 op1 = validize_mem (force_const_mem (mode, op1));
15116 /* ... nor both arguments in memory. */
15117 if (!register_operand (op0, mode)
15118 && !register_operand (op1, mode))
15119 op1 = force_reg (mode, op1);
15121 tmp[0] = op0; tmp[1] = op1;
15122 ix86_expand_vector_move_misalign (mode, tmp);
15126 /* Make operand1 a register if it isn't already. */
15127 if (can_create_pseudo_p ()
15128 && !register_operand (op0, mode)
15129 && !register_operand (op1, mode))
15131 emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
15135 emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
15138 /* Split 32-byte AVX unaligned load and store if needed. */
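/* Illustrative sketch (added annotation, AT&T syntax assumed): with the
   store split enabled, an unaligned 32-byte store such as
       vmovups %ymm0, (addr)
   is emitted instead as
       vextractf128 $0x0, %ymm0, (addr)
       vextractf128 $0x1, %ymm0, 16(addr)
   and, with the load split enabled, an unaligned 32-byte load becomes a
   16-byte load plus a VEC_CONCAT of the two halves, which is faster on
   some early AVX implementations.  */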
15141 ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
15144 rtx (*extract) (rtx, rtx, rtx);
15145 rtx (*move_unaligned) (rtx, rtx);
15146 enum machine_mode mode;
15148 switch (GET_MODE (op0))
15151 gcc_unreachable ();
15153 extract = gen_avx_vextractf128v32qi;
15154 move_unaligned = gen_avx_movdqu256;
15158 extract = gen_avx_vextractf128v8sf;
15159 move_unaligned = gen_avx_movups256;
15163 extract = gen_avx_vextractf128v4df;
15164 move_unaligned = gen_avx_movupd256;
15169 if (MEM_P (op1) && TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
15171 rtx r = gen_reg_rtx (mode);
15172 m = adjust_address (op1, mode, 0);
15173 emit_move_insn (r, m);
15174 m = adjust_address (op1, mode, 16);
15175 r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
15176 emit_move_insn (op0, r);
15178 else if (MEM_P (op0) && TARGET_AVX256_SPLIT_UNALIGNED_STORE)
15180 m = adjust_address (op0, mode, 0);
15181 emit_insn (extract (m, op1, const0_rtx));
15182 m = adjust_address (op0, mode, 16);
15183 emit_insn (extract (m, op1, const1_rtx));
15186 emit_insn (move_unaligned (op0, op1));
15189 /* Implement the movmisalign patterns for SSE. Non-SSE modes go
15190 straight to ix86_expand_vector_move. */
15191 /* Code generation for scalar reg-reg moves of single and double precision data:
15192      if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
             movaps reg, reg;  else movss reg, reg
15196      if (x86_sse_partial_reg_dependency == true)
             movapd reg, reg;  else movsd reg, reg

15201    Code generation for scalar loads of double precision data:
15202      if (x86_sse_split_regs == true)
15203        movlpd mem, reg      (gas syntax)
           else movsd mem, reg

15207    Code generation for unaligned packed loads of single precision data
15208    (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
15209      if (x86_sse_unaligned_move_optimal)
             movups mem, reg
15212      if (x86_sse_partial_reg_dependency == true)
             xorps reg, reg; movlps mem, reg; movhps mem+8, reg
           else movlps mem, reg; movhps mem+8, reg

15224    Code generation for unaligned packed loads of double precision data
15225    (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
15226      if (x86_sse_unaligned_move_optimal)
             movupd mem, reg
15229      if (x86_sse_split_regs == true)
             movlpd mem, reg; movhpd mem+8, reg
           else movsd mem, reg; movhpd mem+8, reg  */
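/* Background note (added annotation): movlps/movhps and movlpd/movhpd
   write only half of the destination register and so carry a false
   dependency on its previous contents; the xorps (or the clobber/zeroing
   emitted by the expander below) exists purely to break that dependency
   chain.  */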
15242 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
15251 switch (GET_MODE_CLASS (mode))
15253 case MODE_VECTOR_INT:
15255 switch (GET_MODE_SIZE (mode))
15258 /* If we're optimizing for size, movups is the smallest. */
15259 if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15261 op0 = gen_lowpart (V4SFmode, op0);
15262 op1 = gen_lowpart (V4SFmode, op1);
15263 emit_insn (gen_sse_movups (op0, op1));
15266 op0 = gen_lowpart (V16QImode, op0);
15267 op1 = gen_lowpart (V16QImode, op1);
15268 emit_insn (gen_sse2_movdqu (op0, op1));
15271 op0 = gen_lowpart (V32QImode, op0);
15272 op1 = gen_lowpart (V32QImode, op1);
15273 ix86_avx256_split_vector_move_misalign (op0, op1);
15276 gcc_unreachable ();
15279 case MODE_VECTOR_FLOAT:
15280 op0 = gen_lowpart (mode, op0);
15281 op1 = gen_lowpart (mode, op1);
15286 emit_insn (gen_sse_movups (op0, op1));
15289 ix86_avx256_split_vector_move_misalign (op0, op1);
15292 if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15294 op0 = gen_lowpart (V4SFmode, op0);
15295 op1 = gen_lowpart (V4SFmode, op1);
15296 emit_insn (gen_sse_movups (op0, op1));
15299 emit_insn (gen_sse2_movupd (op0, op1));
15302 ix86_avx256_split_vector_move_misalign (op0, op1);
15305 gcc_unreachable ();
15310 gcc_unreachable ();
15318 /* If we're optimizing for size, movups is the smallest. */
15319 if (optimize_insn_for_size_p ()
15320 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15322 op0 = gen_lowpart (V4SFmode, op0);
15323 op1 = gen_lowpart (V4SFmode, op1);
15324 emit_insn (gen_sse_movups (op0, op1));
15328 /* ??? If we have typed data, then it would appear that using
15329 	     movdqu is the only way to get unaligned data loaded with
15330 	     integer type.  */
15331 if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
15333 op0 = gen_lowpart (V16QImode, op0);
15334 op1 = gen_lowpart (V16QImode, op1);
15335 emit_insn (gen_sse2_movdqu (op0, op1));
15339 if (TARGET_SSE2 && mode == V2DFmode)
15343 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
15345 op0 = gen_lowpart (V2DFmode, op0);
15346 op1 = gen_lowpart (V2DFmode, op1);
15347 emit_insn (gen_sse2_movupd (op0, op1));
15351 /* When SSE registers are split into halves, we can avoid
15352 writing to the top half twice. */
15353 if (TARGET_SSE_SPLIT_REGS)
15355 emit_clobber (op0);
15360 /* ??? Not sure about the best option for the Intel chips.
15361 The following would seem to satisfy; the register is
15362 entirely cleared, breaking the dependency chain. We
15363 then store to the upper half, with a dependency depth
15364 of one. A rumor has it that Intel recommends two movsd
15365 followed by an unpacklpd, but this is unconfirmed. And
15366 given that the dependency depth of the unpacklpd would
15367 still be one, I'm not sure why this would be better. */
15368 zero = CONST0_RTX (V2DFmode);
15371 m = adjust_address (op1, DFmode, 0);
15372 emit_insn (gen_sse2_loadlpd (op0, zero, m));
15373 m = adjust_address (op1, DFmode, 8);
15374 emit_insn (gen_sse2_loadhpd (op0, op0, m));
15378 if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL)
15380 op0 = gen_lowpart (V4SFmode, op0);
15381 op1 = gen_lowpart (V4SFmode, op1);
15382 emit_insn (gen_sse_movups (op0, op1));
15386 if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
15387 emit_move_insn (op0, CONST0_RTX (mode));
15389 emit_clobber (op0);
15391 if (mode != V4SFmode)
15392 op0 = gen_lowpart (V4SFmode, op0);
15393 m = adjust_address (op1, V2SFmode, 0);
15394 emit_insn (gen_sse_loadlps (op0, op0, m));
15395 m = adjust_address (op1, V2SFmode, 8);
15396 emit_insn (gen_sse_loadhps (op0, op0, m));
15399 else if (MEM_P (op0))
15401 /* If we're optimizing for size, movups is the smallest. */
15402 if (optimize_insn_for_size_p ()
15403 || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
15405 op0 = gen_lowpart (V4SFmode, op0);
15406 op1 = gen_lowpart (V4SFmode, op1);
15407 emit_insn (gen_sse_movups (op0, op1));
15411 	  /* ??? Similar to above, only less clear because of
15412 	     "typeless stores".  */
15413 if (TARGET_SSE2 && !TARGET_SSE_TYPELESS_STORES
15414 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
15416 op0 = gen_lowpart (V16QImode, op0);
15417 op1 = gen_lowpart (V16QImode, op1);
15418 emit_insn (gen_sse2_movdqu (op0, op1));
15422 if (TARGET_SSE2 && mode == V2DFmode)
15424 if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
15426 op0 = gen_lowpart (V2DFmode, op0);
15427 op1 = gen_lowpart (V2DFmode, op1);
15428 emit_insn (gen_sse2_movupd (op0, op1));
15432 m = adjust_address (op0, DFmode, 0);
15433 emit_insn (gen_sse2_storelpd (m, op1));
15434 m = adjust_address (op0, DFmode, 8);
15435 emit_insn (gen_sse2_storehpd (m, op1));
15440 if (mode != V4SFmode)
15441 op1 = gen_lowpart (V4SFmode, op1);
15443 if (TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
15445 op0 = gen_lowpart (V4SFmode, op0);
15446 emit_insn (gen_sse_movups (op0, op1));
15450 m = adjust_address (op0, V2SFmode, 0);
15451 emit_insn (gen_sse_storelps (m, op1));
15452 m = adjust_address (op0, V2SFmode, 8);
15453 emit_insn (gen_sse_storehps (m, op1));
15458 gcc_unreachable ();
15461 /* Expand a push in MODE. This is some mode for which we do not support
15462 proper push instructions, at least from the registers that we expect
15463 the value to live in. */
15466 ix86_expand_push (enum machine_mode mode, rtx x)
15470 tmp = expand_simple_binop (Pmode, PLUS, stack_pointer_rtx,
15471 GEN_INT (-GET_MODE_SIZE (mode)),
15472 stack_pointer_rtx, 1, OPTAB_DIRECT);
15473 if (tmp != stack_pointer_rtx)
15474 emit_move_insn (stack_pointer_rtx, tmp);
15476 tmp = gen_rtx_MEM (mode, stack_pointer_rtx);
15478   /* When we push an operand onto the stack, it has to be aligned at least
15479      at the function argument boundary.  However since we don't have
15480      the argument type, we can't determine the actual argument
15481      boundary.  */
15482 emit_move_insn (tmp, x);
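  /* Net effect (added annotation): the expander synthesizes the push as an
     explicit stack-pointer adjustment followed by a plain store, roughly
	 sub  $size, %sp
	 mov  value, (%sp)
     for modes that have no usable native push instruction.  */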
15485 /* Helper function of ix86_fixup_binary_operands to canonicalize
15486 operand order. Returns true if the operands should be swapped. */
15489 ix86_swap_binary_operands_p (enum rtx_code code, enum machine_mode mode,
15492 rtx dst = operands[0];
15493 rtx src1 = operands[1];
15494 rtx src2 = operands[2];
15496 /* If the operation is not commutative, we can't do anything. */
15497 if (GET_RTX_CLASS (code) != RTX_COMM_ARITH)
15500 /* Highest priority is that src1 should match dst. */
15501 if (rtx_equal_p (dst, src1))
15503 if (rtx_equal_p (dst, src2))
15506 /* Next highest priority is that immediate constants come second. */
15507 if (immediate_operand (src2, mode))
15509 if (immediate_operand (src1, mode))
15512 /* Lowest priority is that memory references should come second. */
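/* Example of the effect (added annotation): for "x = 5 + y" the routine
   requests a swap so the expander sees "x = y + 5", placing the immediate
   in the second slot where the insn encodings expect it.  */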
15522 /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
15523 destination to use for the operation. If different from the true
15524 destination in operands[0], a copy operation will be required. */
15527 ix86_fixup_binary_operands (enum rtx_code code, enum machine_mode mode,
15530 rtx dst = operands[0];
15531 rtx src1 = operands[1];
15532 rtx src2 = operands[2];
15534 /* Canonicalize operand order. */
15535 if (ix86_swap_binary_operands_p (code, mode, operands))
15539 /* It is invalid to swap operands of different modes. */
15540 gcc_assert (GET_MODE (src1) == GET_MODE (src2));
15547 /* Both source operands cannot be in memory. */
15548 if (MEM_P (src1) && MEM_P (src2))
15550 /* Optimization: Only read from memory once. */
15551 if (rtx_equal_p (src1, src2))
15553 src2 = force_reg (mode, src2);
15557 src2 = force_reg (mode, src2);
15560 /* If the destination is memory, and we do not have matching source
15561 operands, do things in registers. */
15562 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
15563 dst = gen_reg_rtx (mode);
15565 /* Source 1 cannot be a constant. */
15566 if (CONSTANT_P (src1))
15567 src1 = force_reg (mode, src1);
15569 /* Source 1 cannot be a non-matching memory. */
15570 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
15571 src1 = force_reg (mode, src1);
15573 operands[1] = src1;
15574 operands[2] = src2;
15578 /* Similarly, but assume that the destination has already been
15579 set up properly. */
15582 ix86_fixup_binary_operands_no_copy (enum rtx_code code,
15583 enum machine_mode mode, rtx operands[])
15585 rtx dst = ix86_fixup_binary_operands (code, mode, operands);
15586 gcc_assert (dst == operands[0]);
15589 /* Attempt to expand a binary operator. Make the expansion closer to the
15590    actual machine than just general_operand, which will allow 3 separate
15591 memory references (one output, two input) in a single insn. */
15594 ix86_expand_binary_operator (enum rtx_code code, enum machine_mode mode,
15597 rtx src1, src2, dst, op, clob;
15599 dst = ix86_fixup_binary_operands (code, mode, operands);
15600 src1 = operands[1];
15601 src2 = operands[2];
15603 /* Emit the instruction. */
15605 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_ee (code, mode, src1, src2));
15606 if (reload_in_progress)
15608 /* Reload doesn't know about the flags register, and doesn't know that
15609 it doesn't want to clobber it. We can only do this with PLUS. */
15610 gcc_assert (code == PLUS);
15613 else if (reload_completed
15615 && !rtx_equal_p (dst, src1))
15617 /* This is going to be an LEA; avoid splitting it later. */
15622 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15623 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
15626 /* Fix up the destination if needed. */
15627 if (dst != operands[0])
15628 emit_move_insn (operands[0], dst);
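  /* Note (added annotation): apart from the reload-time PLUS and LEA paths
     above, which emit the bare SET, the emitted PARALLEL pairs the
     arithmetic SET with a clobber of the flags register, since nearly every
     x86 ALU instruction overwrites EFLAGS as a side effect.  */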
15631 /* Return TRUE or FALSE depending on whether the binary operator meets the
15632 appropriate constraints. */
15635 ix86_binary_operator_ok (enum rtx_code code, enum machine_mode mode,
15638 rtx dst = operands[0];
15639 rtx src1 = operands[1];
15640 rtx src2 = operands[2];
15642 /* Both source operands cannot be in memory. */
15643 if (MEM_P (src1) && MEM_P (src2))
15646 /* Canonicalize operand order for commutative operators. */
15647 if (ix86_swap_binary_operands_p (code, mode, operands))
15654 /* If the destination is memory, we must have a matching source operand. */
15655 if (MEM_P (dst) && !rtx_equal_p (dst, src1))
15658 /* Source 1 cannot be a constant. */
15659 if (CONSTANT_P (src1))
15662 /* Source 1 cannot be a non-matching memory. */
15663 if (MEM_P (src1) && !rtx_equal_p (dst, src1))
15665 /* Support "andhi/andsi/anddi" as a zero-extending move. */
15666 return (code == AND
15669 || (TARGET_64BIT && mode == DImode))
15670 && CONST_INT_P (src2)
15671 && (INTVAL (src2) == 0xff
15672 || INTVAL (src2) == 0xffff));
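  /* Illustration (added annotation): the AND form is accepted because e.g.
     "andl $0xff, %eax" performs the same zero extension as
     "movzbl %al, %eax" while also permitting a memory destination.  */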
15678 /* Attempt to expand a unary operator. Make the expansion closer to the
15679    actual machine than just general_operand, which will allow 2 separate
15680 memory references (one output, one input) in a single insn. */
15683 ix86_expand_unary_operator (enum rtx_code code, enum machine_mode mode,
15686 int matching_memory;
15687 rtx src, dst, op, clob;
15692 /* If the destination is memory, and we do not have matching source
15693 operands, do things in registers. */
15694 matching_memory = 0;
15697 if (rtx_equal_p (dst, src))
15698 matching_memory = 1;
15700 dst = gen_reg_rtx (mode);
15703 /* When source operand is memory, destination must match. */
15704 if (MEM_P (src) && !matching_memory)
15705 src = force_reg (mode, src);
15707 /* Emit the instruction. */
15709 op = gen_rtx_SET (VOIDmode, dst, gen_rtx_fmt_e (code, mode, src));
15710 if (reload_in_progress || code == NOT)
15712 /* Reload doesn't know about the flags register, and doesn't know that
15713 it doesn't want to clobber it. */
15714 gcc_assert (code == NOT);
15719 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
15720 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
15723 /* Fix up the destination if needed. */
15724 if (dst != operands[0])
15725 emit_move_insn (operands[0], dst);
15728 /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
15729 divisor are within the range [0-255]. */
15732 ix86_split_idivmod (enum machine_mode mode, rtx operands[],
15735 rtx end_label, qimode_label;
15736 rtx insn, div, mod;
15737 rtx scratch, tmp0, tmp1, tmp2;
15738 rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
15739 rtx (*gen_zero_extend) (rtx, rtx);
15740 rtx (*gen_test_ccno_1) (rtx, rtx);
15745 gen_divmod4_1 = signed_p ? gen_divmodsi4_1 : gen_udivmodsi4_1;
15746 gen_test_ccno_1 = gen_testsi_ccno_1;
15747 gen_zero_extend = gen_zero_extendqisi2;
15750 gen_divmod4_1 = signed_p ? gen_divmoddi4_1 : gen_udivmoddi4_1;
15751 gen_test_ccno_1 = gen_testdi_ccno_1;
15752 gen_zero_extend = gen_zero_extendqidi2;
15755 gcc_unreachable ();
15758 end_label = gen_label_rtx ();
15759 qimode_label = gen_label_rtx ();
15761 scratch = gen_reg_rtx (mode);
15763   /* Use 8bit unsigned divmod if dividend and divisor are within
15764 the range [0-255]. */
15765 emit_move_insn (scratch, operands[2]);
15766 scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
15767 scratch, 1, OPTAB_DIRECT);
15768 emit_insn (gen_test_ccno_1 (scratch, GEN_INT (-0x100)));
15769 tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
15770 tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
15771 tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
15772 gen_rtx_LABEL_REF (VOIDmode, qimode_label),
15774 insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp0));
15775 predict_jump (REG_BR_PROB_BASE * 50 / 100);
15776 JUMP_LABEL (insn) = qimode_label;
15778   /* Generate original signed/unsigned divmod.  */
15779 div = gen_divmod4_1 (operands[0], operands[1],
15780 operands[2], operands[3]);
15783 /* Branch to the end. */
15784 emit_jump_insn (gen_jump (end_label));
15787 /* Generate 8bit unsigned divide. */
15788 emit_label (qimode_label);
15789 /* Don't use operands[0] for result of 8bit divide since not all
15790 registers support QImode ZERO_EXTRACT. */
15791 tmp0 = simplify_gen_subreg (HImode, scratch, mode, 0);
15792 tmp1 = simplify_gen_subreg (HImode, operands[2], mode, 0);
15793 tmp2 = simplify_gen_subreg (QImode, operands[3], mode, 0);
15794 emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
15798 div = gen_rtx_DIV (SImode, operands[2], operands[3]);
15799 mod = gen_rtx_MOD (SImode, operands[2], operands[3]);
15803 div = gen_rtx_UDIV (SImode, operands[2], operands[3]);
15804 mod = gen_rtx_UMOD (SImode, operands[2], operands[3]);
15807 /* Extract remainder from AH. */
15808 tmp1 = gen_rtx_ZERO_EXTRACT (mode, tmp0, GEN_INT (8), GEN_INT (8));
15809 if (REG_P (operands[1]))
15810 insn = emit_move_insn (operands[1], tmp1);
15813 	  /* Need a new scratch register since the old one has result
15814 	     of 8bit divide.  */
15815 scratch = gen_reg_rtx (mode);
15816 emit_move_insn (scratch, tmp1);
15817 insn = emit_move_insn (operands[1], scratch);
15819 set_unique_reg_note (insn, REG_EQUAL, mod);
15821 /* Zero extend quotient from AL. */
15822 tmp1 = gen_lowpart (QImode, tmp0);
15823 insn = emit_insn (gen_zero_extend (operands[0], tmp1));
15824 set_unique_reg_note (insn, REG_EQUAL, div);
15826 emit_label (end_label);
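  /* Shape of the emitted code (added annotation, illustrative only):
	 mov    op2, scratch
	 or     op3, scratch
	 test   $-0x100, scratch
	 je     .Lqimode          ; both operands fit in 8 bits
	 div/idiv ...             ; full-width divide
	 jmp    .Lend
     .Lqimode:
	 divb   ...               ; 8-bit divide: AL = quotient, AH = remainder
     .Lend:  */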
15829 #define LEA_SEARCH_THRESHOLD 12
15831 /* Search backward for non-agu definition of register number REGNO1
15832 or register number REGNO2 in INSN's basic block until
15833 1. Pass LEA_SEARCH_THRESHOLD instructions, or
15834 2. Reach BB boundary, or
15835 3. Reach agu definition.
15836 Returns the distance between the non-agu definition point and INSN.
15837 If no definition point, returns -1. */
15840 distance_non_agu_define (unsigned int regno1, unsigned int regno2,
15843 basic_block bb = BLOCK_FOR_INSN (insn);
15846 enum attr_type insn_type;
15848 if (insn != BB_HEAD (bb))
15850 rtx prev = PREV_INSN (insn);
15851 while (prev && distance < LEA_SEARCH_THRESHOLD)
15853 if (NONDEBUG_INSN_P (prev))
15856 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
15857 if (DF_REF_TYPE (*def_rec) == DF_REF_REG_DEF
15858 && !DF_REF_IS_ARTIFICIAL (*def_rec)
15859 && (regno1 == DF_REF_REGNO (*def_rec)
15860 || regno2 == DF_REF_REGNO (*def_rec)))
15862 insn_type = get_attr_type (prev);
15863 if (insn_type != TYPE_LEA)
15867 if (prev == BB_HEAD (bb))
15869 prev = PREV_INSN (prev);
15873 if (distance < LEA_SEARCH_THRESHOLD)
15877 bool simple_loop = false;
15879 FOR_EACH_EDGE (e, ei, bb->preds)
15882 simple_loop = true;
15888 rtx prev = BB_END (bb);
15891 && distance < LEA_SEARCH_THRESHOLD)
15893 if (NONDEBUG_INSN_P (prev))
15896 for (def_rec = DF_INSN_DEFS (prev); *def_rec; def_rec++)
15897 if (DF_REF_TYPE (*def_rec) == DF_REF_REG_DEF
15898 && !DF_REF_IS_ARTIFICIAL (*def_rec)
15899 && (regno1 == DF_REF_REGNO (*def_rec)
15900 || regno2 == DF_REF_REGNO (*def_rec)))
15902 insn_type = get_attr_type (prev);
15903 if (insn_type != TYPE_LEA)
15907 prev = PREV_INSN (prev);
15915 /* get_attr_type may modify recog data. We want to make sure
15916 that recog data is valid for instruction INSN, on which
15917 distance_non_agu_define is called. INSN is unchanged here. */
15918 extract_insn_cached (insn);
15922 /* Return the distance between INSN and the next insn that uses
15923    register number REGNO0 in a memory address.  Return -1 if no such
15924    use is found within LEA_SEARCH_THRESHOLD or REGNO0 is set.  */
15927 distance_agu_use (unsigned int regno0, rtx insn)
15929 basic_block bb = BLOCK_FOR_INSN (insn);
15934 if (insn != BB_END (bb))
15936 rtx next = NEXT_INSN (insn);
15937 while (next && distance < LEA_SEARCH_THRESHOLD)
15939 if (NONDEBUG_INSN_P (next))
15943 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
15944 if ((DF_REF_TYPE (*use_rec) == DF_REF_REG_MEM_LOAD
15945 || DF_REF_TYPE (*use_rec) == DF_REF_REG_MEM_STORE)
15946 && regno0 == DF_REF_REGNO (*use_rec))
15948 /* Return DISTANCE if OP0 is used in memory
15949 address in NEXT. */
15953 for (def_rec = DF_INSN_DEFS (next); *def_rec; def_rec++)
15954 if (DF_REF_TYPE (*def_rec) == DF_REF_REG_DEF
15955 && !DF_REF_IS_ARTIFICIAL (*def_rec)
15956 && regno0 == DF_REF_REGNO (*def_rec))
15958 /* Return -1 if OP0 is set in NEXT. */
15962 if (next == BB_END (bb))
15964 next = NEXT_INSN (next);
15968 if (distance < LEA_SEARCH_THRESHOLD)
15972 bool simple_loop = false;
15974 FOR_EACH_EDGE (e, ei, bb->succs)
15977 simple_loop = true;
15983 rtx next = BB_HEAD (bb);
15986 && distance < LEA_SEARCH_THRESHOLD)
15988 if (NONDEBUG_INSN_P (next))
15992 for (use_rec = DF_INSN_USES (next); *use_rec; use_rec++)
15993 if ((DF_REF_TYPE (*use_rec) == DF_REF_REG_MEM_LOAD
15994 || DF_REF_TYPE (*use_rec) == DF_REF_REG_MEM_STORE)
15995 && regno0 == DF_REF_REGNO (*use_rec))
15997 /* Return DISTANCE if OP0 is used in memory
15998 address in NEXT. */
16002 for (def_rec = DF_INSN_DEFS (next); *def_rec; def_rec++)
16003 if (DF_REF_TYPE (*def_rec) == DF_REF_REG_DEF
16004 && !DF_REF_IS_ARTIFICIAL (*def_rec)
16005 && regno0 == DF_REF_REGNO (*def_rec))
16007 /* Return -1 if OP0 is set in NEXT. */
16012 next = NEXT_INSN (next);
16020 /* Define this macro to tune LEA priority vs ADD; it takes effect when
16021    there is a dilemma of choosing LEA or ADD.
16022    Negative value: ADD is preferred over LEA
16023    Zero: Neutral
16024    Positive value: LEA is preferred over ADD  */
16025 #define IX86_LEA_PRIORITY 2
16027 /* Return true if it is ok to optimize an ADD operation to LEA
16028    operation to avoid flag register consumption.  For most processors,
16029    ADD is faster than LEA.  For processors like ATOM, if the
16030    destination register of LEA holds an actual address which will be
16031    used soon, LEA is better; otherwise ADD is better.  */
16034 ix86_lea_for_add_ok (rtx insn, rtx operands[])
16036 unsigned int regno0 = true_regnum (operands[0]);
16037 unsigned int regno1 = true_regnum (operands[1]);
16038 unsigned int regno2 = true_regnum (operands[2]);
16040 /* If a = b + c, (a!=b && a!=c), must use lea form. */
16041 if (regno0 != regno1 && regno0 != regno2)
16044 if (!TARGET_OPT_AGU || optimize_function_for_size_p (cfun))
16048 int dist_define, dist_use;
16050 /* Return false if REGNO0 isn't used in memory address. */
16051 dist_use = distance_agu_use (regno0, insn);
16055 dist_define = distance_non_agu_define (regno1, regno2, insn);
16056 if (dist_define <= 0)
16059   /* If this insn has both backward non-agu dependence and forward
16060      agu dependence, the one with the shorter distance takes effect.  */
16061 if ((dist_define + IX86_LEA_PRIORITY) < dist_use)
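  /* Worked example (added annotation): with IX86_LEA_PRIORITY == 2, a
     non-AGU definition 3 insns back and an address use 6 insns ahead give
     3 + 2 < 6, so the plain ADD is kept; were the address use only 4 insns
     ahead, the LEA form would win.  */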
16068 /* Return true if destination reg of SET_BODY is shift count of
16069    USE_BODY.  */
16072 ix86_dep_by_shift_count_body (const_rtx set_body, const_rtx use_body)
16078 /* Retrieve destination of SET_BODY. */
16079 switch (GET_CODE (set_body))
16082 set_dest = SET_DEST (set_body);
16083 if (!set_dest || !REG_P (set_dest))
16087 for (i = XVECLEN (set_body, 0) - 1; i >= 0; i--)
16088 if (ix86_dep_by_shift_count_body (XVECEXP (set_body, 0, i),
16096 /* Retrieve shift count of USE_BODY. */
16097 switch (GET_CODE (use_body))
16100 shift_rtx = XEXP (use_body, 1);
16103 for (i = XVECLEN (use_body, 0) - 1; i >= 0; i--)
16104 if (ix86_dep_by_shift_count_body (set_body,
16105 XVECEXP (use_body, 0, i)))
16113 && (GET_CODE (shift_rtx) == ASHIFT
16114 || GET_CODE (shift_rtx) == LSHIFTRT
16115 || GET_CODE (shift_rtx) == ASHIFTRT
16116 || GET_CODE (shift_rtx) == ROTATE
16117 || GET_CODE (shift_rtx) == ROTATERT))
16119 rtx shift_count = XEXP (shift_rtx, 1);
16121 /* Return true if shift count is dest of SET_BODY. */
16122 if (REG_P (shift_count)
16123 && true_regnum (set_dest) == true_regnum (shift_count))
16130 /* Return true if destination reg of SET_INSN is shift count of
16131    USE_INSN.  */
16134 ix86_dep_by_shift_count (const_rtx set_insn, const_rtx use_insn)
16136 return ix86_dep_by_shift_count_body (PATTERN (set_insn),
16137 PATTERN (use_insn));
16140 /* Return TRUE or FALSE depending on whether the unary operator meets the
16141 appropriate constraints. */
16144 ix86_unary_operator_ok (enum rtx_code code ATTRIBUTE_UNUSED,
16145 enum machine_mode mode ATTRIBUTE_UNUSED,
16146 rtx operands[2] ATTRIBUTE_UNUSED)
16148 /* If one of operands is memory, source and destination must match. */
16149 if ((MEM_P (operands[0])
16150 || MEM_P (operands[1]))
16151 && ! rtx_equal_p (operands[0], operands[1]))
16156 /* Return TRUE if the operands to a vec_interleave_{high,low}v2df
16157 are ok, keeping in mind the possible movddup alternative. */
16160 ix86_vec_interleave_v2df_operator_ok (rtx operands[3], bool high)
16162 if (MEM_P (operands[0]))
16163 return rtx_equal_p (operands[0], operands[1 + high]);
16164 if (MEM_P (operands[1]) && MEM_P (operands[2]))
16165 return TARGET_SSE3 && rtx_equal_p (operands[1], operands[2]);
16169 /* Post-reload splitter for converting an SF or DFmode value in an
16170 SSE register into an unsigned SImode. */
16173 ix86_split_convert_uns_si_sse (rtx operands[])
16175 enum machine_mode vecmode;
16176 rtx value, large, zero_or_two31, input, two31, x;
16178 large = operands[1];
16179 zero_or_two31 = operands[2];
16180 input = operands[3];
16181 two31 = operands[4];
16182 vecmode = GET_MODE (large);
16183 value = gen_rtx_REG (vecmode, REGNO (operands[0]));
16185 /* Load up the value into the low element. We must ensure that the other
16186 elements are valid floats -- zero is the easiest such value. */
16189 if (vecmode == V4SFmode)
16190 emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
16192 emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
16196 input = gen_rtx_REG (vecmode, REGNO (input));
16197 emit_move_insn (value, CONST0_RTX (vecmode));
16198 if (vecmode == V4SFmode)
16199 emit_insn (gen_sse_movss (value, value, input));
16201 emit_insn (gen_sse2_movsd (value, value, input));
16204 emit_move_insn (large, two31);
16205 emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);
16207 x = gen_rtx_fmt_ee (LE, vecmode, large, value);
16208 emit_insn (gen_rtx_SET (VOIDmode, large, x));
16210 x = gen_rtx_AND (vecmode, zero_or_two31, large);
16211 emit_insn (gen_rtx_SET (VOIDmode, zero_or_two31, x));
16213 x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
16214 emit_insn (gen_rtx_SET (VOIDmode, value, x));
16216 large = gen_rtx_REG (V4SImode, REGNO (large));
16217 emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
16219 x = gen_rtx_REG (V4SImode, REGNO (value));
16220 if (vecmode == V4SFmode)
16221 emit_insn (gen_sse2_cvttps2dq (x, value));
16223 emit_insn (gen_sse2_cvttpd2dq (x, value));
16226 emit_insn (gen_xorv4si3 (value, value, large));
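  /* The trick in brief (added annotation): lanes whose value is >= 2^31
     have 2^31 subtracted before the signed cvttps2dq/cvttpd2dq, and the
     comparison mask, shifted into the sign-bit position, is XORed back in
     afterwards, re-adding 2^31 to exactly those lanes.  */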
16229 /* Convert an unsigned DImode value into a DFmode, using only SSE.
16230 Expects the 64-bit DImode to be supplied in a pair of integral
16231 registers. Requires SSE2; will use SSE3 if available. For x86_32,
16232 -mfpmath=sse, !optimize_size only. */
16235 ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
16237 REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
16238 rtx int_xmm, fp_xmm;
16239 rtx biases, exponents;
16242 int_xmm = gen_reg_rtx (V4SImode);
16243 if (TARGET_INTER_UNIT_MOVES)
16244 emit_insn (gen_movdi_to_sse (int_xmm, input));
16245 else if (TARGET_SSE_SPLIT_REGS)
16247 emit_clobber (int_xmm);
16248 emit_move_insn (gen_lowpart (DImode, int_xmm), input);
16252 x = gen_reg_rtx (V2DImode);
16253 ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
16254 emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
16257 x = gen_rtx_CONST_VECTOR (V4SImode,
16258 gen_rtvec (4, GEN_INT (0x43300000UL),
16259 GEN_INT (0x45300000UL),
16260 const0_rtx, const0_rtx));
16261 exponents = validize_mem (force_const_mem (V4SImode, x));
16263 /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
16264 emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
16266   /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_lo_xmm)
16267 yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
16268 Similarly (0x45300000UL ## fp_value_hi_xmm) yields
16269 (0x1.0p84 + double(fp_value_hi_xmm)).
16270 Note these exponents differ by 32. */
16272 fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
16274 /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
16275 in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
16276 real_ldexp (&bias_lo_rvt, &dconst1, 52);
16277 real_ldexp (&bias_hi_rvt, &dconst1, 84);
16278 biases = const_double_from_real_value (bias_lo_rvt, DFmode);
16279 x = const_double_from_real_value (bias_hi_rvt, DFmode);
16280 biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
16281 biases = validize_mem (force_const_mem (V2DFmode, biases));
16282 emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
16284 /* Add the upper and lower DFmode values together. */
16286 emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
16289 x = copy_to_mode_reg (V2DFmode, fp_xmm);
16290 emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
16291 emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
16294 ix86_expand_vector_extract (false, target, fp_xmm, 0);
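  /* Worked example (added annotation): for the input 2^33 + 5 we get
     lo = 5 and hi = 2; (0x43300000 ## lo) is the double 2^52 + 5 and
     (0x45300000 ## hi) is 2^84 + 2*2^32.  Subtracting the biases 2^52 and
     2^84 and summing the two halves reconstructs 2^33 + 5 exactly.  */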
16297 /* Not used, but eases macroization of patterns. */
16299 ix86_expand_convert_uns_sixf_sse (rtx target ATTRIBUTE_UNUSED,
16300 rtx input ATTRIBUTE_UNUSED)
16302 gcc_unreachable ();
16305 /* Convert an unsigned SImode value into a DFmode. Only currently used
16306 for SSE, but applicable anywhere. */
16309 ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
16311 REAL_VALUE_TYPE TWO31r;
16314 x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
16315 NULL, 1, OPTAB_DIRECT);
16317 fp = gen_reg_rtx (DFmode);
16318 emit_insn (gen_floatsidf2 (fp, x));
16320 real_ldexp (&TWO31r, &dconst1, 31);
16321 x = const_double_from_real_value (TWO31r, DFmode);
16323 x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
16325 emit_move_insn (target, x);
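  /* Worked example (added annotation): adding INT_MIN flips the sign bit,
     so input 5 becomes the signed value -2147483643 and input 3000000000
     becomes 852516352; in both cases the signed conversion plus 2^31.0
     restores the original unsigned value exactly.  */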
16328 /* Convert a signed DImode value into a DFmode. Only used for SSE in
16329 32-bit mode; otherwise we have a direct convert instruction. */
16332 ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
16334 REAL_VALUE_TYPE TWO32r;
16335 rtx fp_lo, fp_hi, x;
16337 fp_lo = gen_reg_rtx (DFmode);
16338 fp_hi = gen_reg_rtx (DFmode);
16340 emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
16342 real_ldexp (&TWO32r, &dconst1, 32);
16343 x = const_double_from_real_value (TWO32r, DFmode);
16344 fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
16346 ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
16348 x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
16351 emit_move_insn (target, x);
16354 /* Convert an unsigned SImode value into a SFmode, using only SSE.
16355 For x86_32, -mfpmath=sse, !optimize_size only. */
16357 ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
16359 REAL_VALUE_TYPE ONE16r;
16360 rtx fp_hi, fp_lo, int_hi, int_lo, x;
16362 real_ldexp (&ONE16r, &dconst1, 16);
16363 x = const_double_from_real_value (ONE16r, SFmode);
16364 int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
16365 NULL, 0, OPTAB_DIRECT);
16366 int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
16367 NULL, 0, OPTAB_DIRECT);
16368 fp_hi = gen_reg_rtx (SFmode);
16369 fp_lo = gen_reg_rtx (SFmode);
16370 emit_insn (gen_floatsisf2 (fp_hi, int_hi));
16371 emit_insn (gen_floatsisf2 (fp_lo, int_lo));
16372 fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
16374 fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
16376 if (!rtx_equal_p (target, fp_hi))
16377 emit_move_insn (target, fp_hi);
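  /* In short (added annotation): the input is split as hi*2^16 + lo so both
     halves convert exactly with the signed cvtsi2ss; the result is then
     (float) hi * 65536.0f + (float) lo.  */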
16380 /* A subroutine of ix86_build_signbit_mask. If VECT is true,
16381    then replicate the value for all elements of the vector
16382    register.  */
16385 ix86_build_const_vector (enum machine_mode mode, bool vect, rtx value)
16392 v = gen_rtvec (4, value, value, value, value);
16393 return gen_rtx_CONST_VECTOR (V4SImode, v);
16397 v = gen_rtvec (2, value, value);
16398 return gen_rtx_CONST_VECTOR (V2DImode, v);
16402 v = gen_rtvec (8, value, value, value, value,
16403 value, value, value, value);
16405 v = gen_rtvec (8, value, CONST0_RTX (SFmode),
16406 CONST0_RTX (SFmode), CONST0_RTX (SFmode),
16407 CONST0_RTX (SFmode), CONST0_RTX (SFmode),
16408 CONST0_RTX (SFmode), CONST0_RTX (SFmode));
16409 return gen_rtx_CONST_VECTOR (V8SFmode, v);
16413 v = gen_rtvec (4, value, value, value, value);
16415 v = gen_rtvec (4, value, CONST0_RTX (SFmode),
16416 CONST0_RTX (SFmode), CONST0_RTX (SFmode));
16417 return gen_rtx_CONST_VECTOR (V4SFmode, v);
16421 v = gen_rtvec (4, value, value, value, value);
16423 v = gen_rtvec (4, value, CONST0_RTX (DFmode),
16424 CONST0_RTX (DFmode), CONST0_RTX (DFmode));
16425 return gen_rtx_CONST_VECTOR (V4DFmode, v);
16429 v = gen_rtvec (2, value, value);
16431 v = gen_rtvec (2, value, CONST0_RTX (DFmode));
16432 return gen_rtx_CONST_VECTOR (V2DFmode, v);
16435 gcc_unreachable ();
16439 /* A subroutine of ix86_expand_fp_absneg_operator, copysign expanders
16440 and ix86_expand_int_vcond. Create a mask for the sign bit in MODE
16441 for an SSE register. If VECT is true, then replicate the mask for
16442 all elements of the vector register. If INVERT is true, then create
16443 a mask excluding the sign bit. */
16446 ix86_build_signbit_mask (enum machine_mode mode, bool vect, bool invert)
16448 enum machine_mode vec_mode, imode;
16449 HOST_WIDE_INT hi, lo;
16454 /* Find the sign bit, sign extended to 2*HWI. */
16461 mode = GET_MODE_INNER (mode);
16463 lo = 0x80000000, hi = lo < 0;
16470 mode = GET_MODE_INNER (mode);
16472 if (HOST_BITS_PER_WIDE_INT >= 64)
16473 lo = (HOST_WIDE_INT)1 << shift, hi = -1;
16475 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
16480 vec_mode = VOIDmode;
16481 if (HOST_BITS_PER_WIDE_INT >= 64)
16484 lo = 0, hi = (HOST_WIDE_INT)1 << shift;
16491 lo = 0, hi = (HOST_WIDE_INT)1 << (shift - HOST_BITS_PER_WIDE_INT);
16495 lo = ~lo, hi = ~hi;
16501 mask = immed_double_const (lo, hi, imode);
16503 vec = gen_rtvec (2, v, mask);
16504 v = gen_rtx_CONST_VECTOR (V2DImode, vec);
16505 v = copy_to_mode_reg (mode, gen_lowpart (mode, v));
16512 gcc_unreachable ();
16516 lo = ~lo, hi = ~hi;
16518 /* Force this value into the low part of a fp vector constant. */
16519 mask = immed_double_const (lo, hi, imode);
16520 mask = gen_lowpart (mode, mask);
16522 if (vec_mode == VOIDmode)
16523 return force_reg (mode, mask);
16525 v = ix86_build_const_vector (vec_mode, vect, mask);
16526 return force_reg (vec_mode, v);
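/* Example (added annotation): for DFmode with INVERT false the mask is
   0x8000000000000000 (just the sign bit); with INVERT true it is
   0x7fffffffffffffff.  With VECT true the mask is replicated into every
   vector element, otherwise the upper elements are left zero.  */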
16529 /* Generate code for floating point ABS or NEG. */
16532 ix86_expand_fp_absneg_operator (enum rtx_code code, enum machine_mode mode,
16535 rtx mask, set, dst, src;
16536 bool use_sse = false;
16537 bool vector_mode = VECTOR_MODE_P (mode);
16538 enum machine_mode vmode = mode;
16542 else if (mode == TFmode)
16544 else if (TARGET_SSE_MATH)
16546 use_sse = SSE_FLOAT_MODE_P (mode);
16547 if (mode == SFmode)
16549 else if (mode == DFmode)
16553 /* NEG and ABS performed with SSE use bitwise mask operations.
16554 Create the appropriate mask now. */
16556 mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
16563 set = gen_rtx_fmt_e (code, mode, src);
16564 set = gen_rtx_SET (VOIDmode, dst, set);
16571 use = gen_rtx_USE (VOIDmode, mask);
16573 par = gen_rtvec (2, set, use);
16576 clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
16577 par = gen_rtvec (3, set, use, clob);
16579 emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
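  /* Resulting code (added annotation): with SSE math, NEG becomes an
     xorps/xorpd with the sign-bit mask and ABS an andps/andpd with its
     complement; the x87 path carries no mask and ends up using the native
     fchs/fabs instructions instead.  */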
16585 /* Expand a copysign operation. Special case operand 0 being a constant. */
16588 ix86_expand_copysign (rtx operands[])
16590 enum machine_mode mode, vmode;
16591 rtx dest, op0, op1, mask, nmask;
16593 dest = operands[0];
16597 mode = GET_MODE (dest);
16599 if (mode == SFmode)
16601 else if (mode == DFmode)
16606 if (GET_CODE (op0) == CONST_DOUBLE)
16608 rtx (*copysign_insn)(rtx, rtx, rtx, rtx);
16610 if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
16611 op0 = simplify_unary_operation (ABS, mode, op0, mode);
16613 if (mode == SFmode || mode == DFmode)
16615 if (op0 == CONST0_RTX (mode))
16616 op0 = CONST0_RTX (vmode);
16619 rtx v = ix86_build_const_vector (vmode, false, op0);
16621 op0 = force_reg (vmode, v);
16624 else if (op0 != CONST0_RTX (mode))
16625 op0 = force_reg (mode, op0);
16627 mask = ix86_build_signbit_mask (vmode, 0, 0);
16629 if (mode == SFmode)
16630 copysign_insn = gen_copysignsf3_const;
16631 else if (mode == DFmode)
16632 copysign_insn = gen_copysigndf3_const;
16634 copysign_insn = gen_copysigntf3_const;
16636 emit_insn (copysign_insn (dest, op0, op1, mask));
16640 rtx (*copysign_insn)(rtx, rtx, rtx, rtx, rtx, rtx);
16642 nmask = ix86_build_signbit_mask (vmode, 0, 1);
16643 mask = ix86_build_signbit_mask (vmode, 0, 0);
16645 if (mode == SFmode)
16646 copysign_insn = gen_copysignsf3_var;
16647 else if (mode == DFmode)
16648 copysign_insn = gen_copysigndf3_var;
16650 copysign_insn = gen_copysigntf3_var;
16652 emit_insn (copysign_insn (dest, NULL_RTX, op0, op1, nmask, mask));
16656 /* Deconstruct a copysign operation into bit masks. Operand 0 is known to
16657 be a constant, and so has already been expanded into a vector constant. */
16660 ix86_split_copysign_const (rtx operands[])
16662 enum machine_mode mode, vmode;
16663 rtx dest, op0, mask, x;
16665 dest = operands[0];
16667 mask = operands[3];
16669 mode = GET_MODE (dest);
16670 vmode = GET_MODE (mask);
16672 dest = simplify_gen_subreg (vmode, dest, mode, 0);
16673 x = gen_rtx_AND (vmode, dest, mask);
16674 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
16676 if (op0 != CONST0_RTX (vmode))
16678 x = gen_rtx_IOR (vmode, dest, op0);
16679 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
16683 /* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
16684 so we have to do two masks. */
16687 ix86_split_copysign_var (rtx operands[])
16689 enum machine_mode mode, vmode;
16690 rtx dest, scratch, op0, op1, mask, nmask, x;
16692 dest = operands[0];
16693 scratch = operands[1];
16696 nmask = operands[4];
16697 mask = operands[5];
16699 mode = GET_MODE (dest);
16700 vmode = GET_MODE (mask);
16702 if (rtx_equal_p (op0, op1))
16704 /* Shouldn't happen often (it's useless, obviously), but when it does
16705 we'd generate incorrect code if we continue below. */
16706 emit_move_insn (dest, op0);
16710 if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
16712 gcc_assert (REGNO (op1) == REGNO (scratch));
16714 x = gen_rtx_AND (vmode, scratch, mask);
16715 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
16718 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
16719 x = gen_rtx_NOT (vmode, dest);
16720 x = gen_rtx_AND (vmode, x, op0);
16721 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
16725 if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
16727 x = gen_rtx_AND (vmode, scratch, mask);
16729 else /* alternative 2,4 */
16731 gcc_assert (REGNO (mask) == REGNO (scratch));
16732 op1 = simplify_gen_subreg (vmode, op1, mode, 0);
16733 x = gen_rtx_AND (vmode, scratch, op1);
16735 emit_insn (gen_rtx_SET (VOIDmode, scratch, x));
16737 if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
16739 dest = simplify_gen_subreg (vmode, op0, mode, 0);
16740 x = gen_rtx_AND (vmode, dest, nmask);
16742 else /* alternative 3,4 */
16744 gcc_assert (REGNO (nmask) == REGNO (dest));
16746 op0 = simplify_gen_subreg (vmode, op0, mode, 0);
16747 x = gen_rtx_AND (vmode, dest, op0);
16749 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
16752 x = gen_rtx_IOR (vmode, dest, scratch);
16753 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
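  /* Net effect (added annotation): dest = (op0 & nmask) | (op1 & mask),
     i.e. the magnitude bits of op0 combined with the sign bit of op1,
     which is exactly C99 copysign semantics.  */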
16756 /* Return TRUE or FALSE depending on whether the first SET in INSN
16757 has source and destination with matching CC modes, and that the
16758 CC mode is at least as constrained as REQ_MODE. */
16761 ix86_match_ccmode (rtx insn, enum machine_mode req_mode)
16764 enum machine_mode set_mode;
16766 set = PATTERN (insn);
16767 if (GET_CODE (set) == PARALLEL)
16768 set = XVECEXP (set, 0, 0);
16769 gcc_assert (GET_CODE (set) == SET);
16770 gcc_assert (GET_CODE (SET_SRC (set)) == COMPARE);
16772 set_mode = GET_MODE (SET_DEST (set));
16776 if (req_mode != CCNOmode
16777 && (req_mode != CCmode
16778 || XEXP (SET_SRC (set), 1) != const0_rtx))
16782 if (req_mode == CCGCmode)
16786 if (req_mode == CCGOCmode || req_mode == CCNOmode)
16790 if (req_mode == CCZmode)
16800 if (set_mode != req_mode)
16805 gcc_unreachable ();
16808 return GET_MODE (SET_SRC (set)) == set_mode;
16811 /* Generate insn patterns to do an integer compare of OPERANDS. */
16814 ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
16816 enum machine_mode cmpmode;
16819 cmpmode = SELECT_CC_MODE (code, op0, op1);
16820 flags = gen_rtx_REG (cmpmode, FLAGS_REG);
16822 /* This is very simple, but making the interface the same as in the
16823 FP case makes the rest of the code easier. */
16824 tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
16825 emit_insn (gen_rtx_SET (VOIDmode, flags, tmp));
16827 /* Return the test that should be put into the flags user, i.e.
16828 the bcc, scc, or cmov instruction. */
16829 return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
16832 /* Figure out whether to use ordered or unordered fp comparisons.
16833 Return the appropriate mode to use. */
16836 ix86_fp_compare_mode (enum rtx_code code ATTRIBUTE_UNUSED)
16838 /* ??? In order to make all comparisons reversible, we do all comparisons
16839 non-trapping when compiling for IEEE. Once gcc is able to distinguish
16840    between all forms of trapping and nontrapping comparisons, we can make inequality
16841 comparisons trapping again, since it results in better code when using
16842 FCOM based compares. */
16843 return TARGET_IEEE_FP ? CCFPUmode : CCFPmode;
16847 ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
16849 enum machine_mode mode = GET_MODE (op0);
16851 if (SCALAR_FLOAT_MODE_P (mode))
16853 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
16854 return ix86_fp_compare_mode (code);
16859 /* Only zero flag is needed. */
16860 case EQ: /* ZF=0 */
16861 case NE: /* ZF!=0 */
16863 /* Codes needing carry flag. */
16864 case GEU: /* CF=0 */
16865 case LTU: /* CF=1 */
16866 /* Detect overflow checks. They need just the carry flag. */
16867 if (GET_CODE (op0) == PLUS
16868 && rtx_equal_p (op1, XEXP (op0, 0)))
16872 case GTU: /* CF=0 & ZF=0 */
16873 case LEU: /* CF=1 | ZF=1 */
16874 /* Detect overflow checks. They need just the carry flag. */
16875 if (GET_CODE (op0) == MINUS
16876 && rtx_equal_p (op1, XEXP (op0, 0)))
16880 /* Codes possibly doable only with sign flag when
16881 comparing against zero. */
16882 case GE: /* SF=OF or SF=0 */
16883 case LT: /* SF<>OF or SF=1 */
16884 if (op1 == const0_rtx)
16887 /* For other cases Carry flag is not required. */
16889 /* Codes doable only with sign flag when comparing
16890    against zero, but we miss the jump instruction for it,
16891    so we need to use relational tests against overflow,
16892    which thus needs to be zero.  */
16893 case GT: /* ZF=0 & SF=OF */
16894 case LE: /* ZF=1 | SF<>OF */
16895 if (op1 == const0_rtx)
16899       /* The strcmp pattern does (use flags), and combine may ask us
16900 	 for a proper mode.  */
16901       return CCmode;
16903     default:
16904 gcc_unreachable ();
16908 /* Return the fixed registers used for condition codes. */
16911 ix86_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
16918 /* If two condition code modes are compatible, return a condition code
16919 mode which is compatible with both. Otherwise, return
16922 static enum machine_mode
16923 ix86_cc_modes_compatible (enum machine_mode m1, enum machine_mode m2)
16928 if (GET_MODE_CLASS (m1) != MODE_CC || GET_MODE_CLASS (m2) != MODE_CC)
16931 if ((m1 == CCGCmode && m2 == CCGOCmode)
16932 || (m1 == CCGOCmode && m2 == CCGCmode))
16938 gcc_unreachable ();
16968 	  /* These are only compatible with themselves, which we already
16969 	     know.  */
16975 /* Return a comparison we can do and that it is equivalent to
16976 swap_condition (code) apart possibly from orderedness.
16977 But, never change orderedness if TARGET_IEEE_FP, returning
16978 UNKNOWN in that case if necessary. */
16980 static enum rtx_code
16981 ix86_fp_swap_condition (enum rtx_code code)
16985 case GT: /* GTU - CF=0 & ZF=0 */
16986 return TARGET_IEEE_FP ? UNKNOWN : UNLT;
16987 case GE: /* GEU - CF=0 */
16988 return TARGET_IEEE_FP ? UNKNOWN : UNLE;
16989 case UNLT: /* LTU - CF=1 */
16990 return TARGET_IEEE_FP ? UNKNOWN : GT;
16991 case UNLE: /* LEU - CF=1 | ZF=1 */
16992 return TARGET_IEEE_FP ? UNKNOWN : GE;
16994 return swap_condition (code);
16998 /* Return cost of comparison CODE using the best strategy for performance.
16999    All following functions use the number of instructions as a cost metric.
17000    In the future this should be tweaked to compute bytes for optimize_size and
17001    take into account performance of various instructions on various CPUs.  */
17004 ix86_fp_comparison_cost (enum rtx_code code)
17008 /* The cost of code using bit-twiddling on %ah. */
17025 arith_cost = TARGET_IEEE_FP ? 5 : 4;
17029 arith_cost = TARGET_IEEE_FP ? 6 : 4;
17032 gcc_unreachable ();
17035 switch (ix86_fp_comparison_strategy (code))
17037 case IX86_FPCMP_COMI:
17038 return arith_cost > 4 ? 3 : 2;
17039 case IX86_FPCMP_SAHF:
17040 return arith_cost > 4 ? 4 : 3;
17046 /* Return strategy to use for floating-point. We assume that fcomi is always
17047    preferable where available, since that is also true when looking at size
17048 (2 bytes, vs. 3 for fnstsw+sahf and at least 5 for fnstsw+test). */
17050 enum ix86_fpcmp_strategy
17051 ix86_fp_comparison_strategy (enum rtx_code code ATTRIBUTE_UNUSED)
17053 /* Do fcomi/sahf based test when profitable. */
17056 return IX86_FPCMP_COMI;
17058 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_function_for_size_p (cfun)))
17059 return IX86_FPCMP_SAHF;
17061 return IX86_FPCMP_ARITH;
17064 /* Swap, force into registers, or otherwise massage the two operands
17065 to a fp comparison. The operands are updated in place; the new
17066 comparison code is returned. */
17068 static enum rtx_code
17069 ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
17071 enum machine_mode fpcmp_mode = ix86_fp_compare_mode (code);
17072 rtx op0 = *pop0, op1 = *pop1;
17073 enum machine_mode op_mode = GET_MODE (op0);
17074 int is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
17076 /* All of the unordered compare instructions only work on registers.
17077 The same is true of the fcomi compare instructions. The XFmode
17078 compare instructions require registers except when comparing
17079 against zero or when converting operand 1 from fixed point to
17083 && (fpcmp_mode == CCFPUmode
17084 || (op_mode == XFmode
17085 && ! (standard_80387_constant_p (op0) == 1
17086 || standard_80387_constant_p (op1) == 1)
17087 && GET_CODE (op1) != FLOAT)
17088 || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
17090 op0 = force_reg (op_mode, op0);
17091 op1 = force_reg (op_mode, op1);
17095 /* %%% We only allow op1 in memory; op0 must be st(0). So swap
17096 things around if they appear profitable, otherwise force op0
17097 into a register. */
17099 if (standard_80387_constant_p (op0) == 0
17101 && ! (standard_80387_constant_p (op1) == 0
17104 enum rtx_code new_code = ix86_fp_swap_condition (code);
17105 if (new_code != UNKNOWN)
17108 tmp = op0, op0 = op1, op1 = tmp;
17114 op0 = force_reg (op_mode, op0);
17116 if (CONSTANT_P (op1))
17118 int tmp = standard_80387_constant_p (op1);
17120 op1 = validize_mem (force_const_mem (op_mode, op1));
17124 op1 = force_reg (op_mode, op1);
17127 op1 = force_reg (op_mode, op1);
17131 /* Try to rearrange the comparison to make it cheaper. */
17132 if (ix86_fp_comparison_cost (code)
17133 > ix86_fp_comparison_cost (swap_condition (code))
17134 && (REG_P (op1) || can_create_pseudo_p ()))
17137 tmp = op0, op0 = op1, op1 = tmp;
17138 code = swap_condition (code);
17140 op0 = force_reg (op_mode, op0);
17148 /* Convert comparison codes we use to represent FP comparison to integer
17149 code that will result in proper branch. Return UNKNOWN if no such code
17153 ix86_fp_compare_code_to_integer (enum rtx_code code)
17182 /* Generate insn patterns to do a floating point compare of OPERANDS. */
17185 ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1, rtx scratch)
17187 enum machine_mode fpcmp_mode, intcmp_mode;
17190 fpcmp_mode = ix86_fp_compare_mode (code);
17191 code = ix86_prepare_fp_compare_args (code, &op0, &op1);
17193 /* Do fcomi/sahf based test when profitable. */
17194 switch (ix86_fp_comparison_strategy (code))
17196 case IX86_FPCMP_COMI:
17197 intcmp_mode = fpcmp_mode;
17198 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
17199 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
17204 case IX86_FPCMP_SAHF:
17205 intcmp_mode = fpcmp_mode;
17206 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
17207 tmp = gen_rtx_SET (VOIDmode, gen_rtx_REG (fpcmp_mode, FLAGS_REG),
17211 scratch = gen_reg_rtx (HImode);
17212 tmp2 = gen_rtx_CLOBBER (VOIDmode, scratch);
17213 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, tmp2)));
17216 case IX86_FPCMP_ARITH:
17217 /* Sadness wrt reg-stack pops killing fpsr -- gotta get fnstsw first. */
17218 tmp = gen_rtx_COMPARE (fpcmp_mode, op0, op1);
17219 tmp2 = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
17221 scratch = gen_reg_rtx (HImode);
17222 emit_insn (gen_rtx_SET (VOIDmode, scratch, tmp2));
17224       /* In the unordered case, we have to check C2 for NaNs, which
17225 doesn't happen to work out to anything nice combination-wise.
17226 So do some bit twiddling on the value we've got in AH to come
17227 up with an appropriate set of condition codes. */
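      /* Added annotation: after fnstsw the x87 condition bits sit in AH as
	 C0 = 0x01, C2 = 0x04 and C3 = 0x40 (so 0x45 selects all three); the
	 test/and/cmp games below map those bits onto the integer flags so an
	 ordinary jcc or setcc can consume the result.  */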
17229 intcmp_mode = CCNOmode;
17234 if (code == GT || !TARGET_IEEE_FP)
17236 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
17241 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
17242 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
17243 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
17244 intcmp_mode = CCmode;
17250 if (code == LT && TARGET_IEEE_FP)
17252 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
17253 emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
17254 intcmp_mode = CCmode;
17259 emit_insn (gen_testqi_ext_ccno_0 (scratch, const1_rtx));
17265 if (code == GE || !TARGET_IEEE_FP)
17267 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x05)));
17272 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
17273 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch, const1_rtx));
17279 if (code == LE && TARGET_IEEE_FP)
17281 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
17282 emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
17283 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
17284 intcmp_mode = CCmode;
17289 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x45)));
17295 if (code == EQ && TARGET_IEEE_FP)
17297 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
17298 emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
17299 intcmp_mode = CCmode;
17304 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
17310 if (code == NE && TARGET_IEEE_FP)
17312 emit_insn (gen_andqi_ext_0 (scratch, scratch, GEN_INT (0x45)));
17313 emit_insn (gen_xorqi_cc_ext_1 (scratch, scratch,
17319 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x40)));
17325 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
17329 emit_insn (gen_testqi_ext_ccno_0 (scratch, GEN_INT (0x04)));
17334 gcc_unreachable ();
17342 /* Return the test that should be put into the flags user, i.e.
17343 the bcc, scc, or cmov instruction. */
17344 return gen_rtx_fmt_ee (code, VOIDmode,
17345 gen_rtx_REG (intcmp_mode, FLAGS_REG),
17350 ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
17354 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
17355 ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
17357 else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
17359 gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
17360 ret = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
17363 ret = ix86_expand_int_compare (code, op0, op1);
17369 ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
17371 enum machine_mode mode = GET_MODE (op0);
17383 tmp = ix86_expand_compare (code, op0, op1);
17384 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
17385 gen_rtx_LABEL_REF (VOIDmode, label),
17387 emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
17394 /* Expand DImode branch into multiple compare+branch. */
17396 rtx lo[2], hi[2], label2;
17397 enum rtx_code code1, code2, code3;
17398 enum machine_mode submode;
17400 if (CONSTANT_P (op0) && !CONSTANT_P (op1))
17402 tmp = op0, op0 = op1, op1 = tmp;
17403 code = swap_condition (code);
17406 split_double_mode (mode, &op0, 1, lo+0, hi+0);
17407 split_double_mode (mode, &op1, 1, lo+1, hi+1);
17409 submode = mode == DImode ? SImode : DImode;
17411 /* When comparing for equality, we can use (hi0^hi1)|(lo0^lo1) to
17412 avoid two branches. This costs one extra insn, so disable when
17413 optimizing for size. */
17415 if ((code == EQ || code == NE)
17416 && (!optimize_insn_for_size_p ()
17417 || hi[1] == const0_rtx || lo[1] == const0_rtx))
17422 if (hi[1] != const0_rtx)
17423 xor1 = expand_binop (submode, xor_optab, xor1, hi[1],
17424 NULL_RTX, 0, OPTAB_WIDEN);
17427 if (lo[1] != const0_rtx)
17428 xor0 = expand_binop (submode, xor_optab, xor0, lo[1],
17429 NULL_RTX, 0, OPTAB_WIDEN);
17431 tmp = expand_binop (submode, ior_optab, xor1, xor0,
17432 NULL_RTX, 0, OPTAB_WIDEN);
17434 ix86_expand_branch (code, tmp, const0_rtx, label);
17438 /* Otherwise, if we are doing less-than or greater-or-equal-than,
17439 op1 is a constant and the low word is zero, then we can just
17440 examine the high word. Similarly for low word -1 and
17441 less-or-equal-than or greater-than. */
17443 if (CONST_INT_P (hi[1]))
17446 case LT: case LTU: case GE: case GEU:
17447 if (lo[1] == const0_rtx)
17449 ix86_expand_branch (code, hi[0], hi[1], label);
17453 case LE: case LEU: case GT: case GTU:
17454 if (lo[1] == constm1_rtx)
17456 ix86_expand_branch (code, hi[0], hi[1], label);
17464 /* Otherwise, we need two or three jumps. */
17466 label2 = gen_label_rtx ();
17469 code2 = swap_condition (code);
17470 code3 = unsigned_condition (code);
17474 case LT: case GT: case LTU: case GTU:
17477 case LE: code1 = LT; code2 = GT; break;
17478 case GE: code1 = GT; code2 = LT; break;
17479 case LEU: code1 = LTU; code2 = GTU; break;
17480 case GEU: code1 = GTU; code2 = LTU; break;
17482 case EQ: code1 = UNKNOWN; code2 = NE; break;
17483 case NE: code2 = UNKNOWN; break;
17486 gcc_unreachable ();
17489 	  /*
17490 	   * a < b =>
17491 	   *    if (hi(a) < hi(b)) goto true;
17492 	   *    if (hi(a) > hi(b)) goto false;
17493 	   *    if (lo(a) < lo(b)) goto true;
17494 	   * false:
17495 	   */
17497 if (code1 != UNKNOWN)
17498 ix86_expand_branch (code1, hi[0], hi[1], label);
17499 if (code2 != UNKNOWN)
17500 ix86_expand_branch (code2, hi[0], hi[1], label2);
17502 ix86_expand_branch (code3, lo[0], lo[1], label);
17504 if (code2 != UNKNOWN)
17505 emit_label (label2);
17510 gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
17515 /* Split branch based on floating point condition. */
17517 ix86_split_fp_branch (enum rtx_code code, rtx op1, rtx op2,
17518 rtx target1, rtx target2, rtx tmp, rtx pushed)
17523 if (target2 != pc_rtx)
17526 code = reverse_condition_maybe_unordered (code);
17531 condition = ix86_expand_fp_compare (code, op1, op2,
17534 /* Remove pushed operand from stack. */
17536 ix86_free_from_memory (GET_MODE (pushed));
17538 i = emit_jump_insn (gen_rtx_SET
17540 gen_rtx_IF_THEN_ELSE (VOIDmode,
17541 condition, target1, target2)));
17542 if (split_branch_probability >= 0)
17543 add_reg_note (i, REG_BR_PROB, GEN_INT (split_branch_probability));
17547 ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
17551 gcc_assert (GET_MODE (dest) == QImode);
17553 ret = ix86_expand_compare (code, op0, op1);
17554 PUT_MODE (ret, QImode);
17555 emit_insn (gen_rtx_SET (VOIDmode, dest, ret));
17558 /* Expand comparison setting or clearing carry flag. Return true when
17559 successful and set pop for the operation. */
17561 ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
17563 enum machine_mode mode =
17564 GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
17566 /* Do not handle double-mode compares that go through special path. */
17567 if (mode == (TARGET_64BIT ? TImode : DImode))
17570 if (SCALAR_FLOAT_MODE_P (mode))
17572 rtx compare_op, compare_seq;
17574 gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
17576 /* Shortcut: following common codes never translate
17577 into carry flag compares. */
17578 if (code == EQ || code == NE || code == UNEQ || code == LTGT
17579 || code == ORDERED || code == UNORDERED)
17582 /* These comparisons require zero flag; swap operands so they won't. */
17583 if ((code == GT || code == UNLE || code == LE || code == UNGT)
17584 && !TARGET_IEEE_FP)
17589 code = swap_condition (code);
17592 /* Try to expand the comparison and verify that we end up with
17593 carry flag based comparison. This fails to be true only when
17594    we decide to expand the comparison using arithmetic, which is
17595    not a common scenario.  */
17597 compare_op = ix86_expand_fp_compare (code, op0, op1, NULL_RTX);
17598 compare_seq = get_insns ();
17601 if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode
17602 || GET_MODE (XEXP (compare_op, 0)) == CCFPUmode)
17603 code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
17605 code = GET_CODE (compare_op);
17607 if (code != LTU && code != GEU)
17610 emit_insn (compare_seq);
17615 if (!INTEGRAL_MODE_P (mode))
17624 /* Convert a==0 into (unsigned)a<1. */
17627 if (op1 != const0_rtx)
17630 code = (code == EQ ? LTU : GEU);
17633 /* Convert a>b into b<a or a>=b-1. */
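/* E.g. with a constant operand, `a >u 5' becomes `a >=u 6', and GEU is
   decided by the carry flag alone (illustrative; constant overflow is
   rejected just below).  */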
17636 if (CONST_INT_P (op1))
17638 op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
17639 /* Bail out on overflow. We still can swap operands but that
17640 would force loading of the constant into a register. */
17641 if (op1 == const0_rtx
17642 || !x86_64_immediate_operand (op1, GET_MODE (op1)))
17644 code = (code == GTU ? GEU : LTU);
17651 code = (code == GTU ? LTU : GEU);
17655 /* Convert a>=0 into (unsigned)a<0x80000000. */
17658 if (mode == DImode || op1 != const0_rtx)
17660 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
17661 code = (code == LT ? GEU : LTU);
17665 if (mode == DImode || op1 != constm1_rtx)
17667 op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
17668 code = (code == LE ? GEU : LTU);
17674 /* Swapping operands may cause a constant to appear as the first operand. */
17675 if (!nonimmediate_operand (op0, VOIDmode))
17677 if (!can_create_pseudo_p ())
17679 op0 = force_reg (mode, op0);
17681 *pop = ix86_expand_compare (code, op0, op1);
17682 gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
17687 ix86_expand_int_movcc (rtx operands[])
17689 enum rtx_code code = GET_CODE (operands[1]), compare_code;
17690 rtx compare_seq, compare_op;
17691 enum machine_mode mode = GET_MODE (operands[0]);
17692 bool sign_bit_compare_p = false;
17693 rtx op0 = XEXP (operands[1], 0);
17694 rtx op1 = XEXP (operands[1], 1);
17697 compare_op = ix86_expand_compare (code, op0, op1);
17698 compare_seq = get_insns ();
17701 compare_code = GET_CODE (compare_op);
17703 if ((op1 == const0_rtx && (code == GE || code == LT))
17704 || (op1 == constm1_rtx && (code == GT || code == LE)))
17705 sign_bit_compare_p = true;
17707 /* Don't attempt mode expansion here -- if we had to expand 5 or 6
17708 HImode insns, we'd be swallowed in word prefix ops. */
17710 if ((mode != HImode || TARGET_FAST_PREFIX)
17711 && (mode != (TARGET_64BIT ? TImode : DImode))
17712 && CONST_INT_P (operands[2])
17713 && CONST_INT_P (operands[3]))
17715 rtx out = operands[0];
17716 HOST_WIDE_INT ct = INTVAL (operands[2]);
17717 HOST_WIDE_INT cf = INTVAL (operands[3]);
17718 HOST_WIDE_INT diff;
17721 /* Sign bit compares are better done using shifts than by using an sbb insn. */
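/* E.g. dest = (x < 0) ? -1 : 0 is just an arithmetic right shift by 31
   on IA-32 (`sarl $31, x'); the AND/PLUS steps below then map the
   resulting -1/0 mask onto ct/cf.  Illustrative sketch, not literal
   output.  */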
17723 if (sign_bit_compare_p
17724 || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
17726 /* Detect overlap between destination and compare sources. */
17729 if (!sign_bit_compare_p)
17732 bool fpcmp = false;
17734 compare_code = GET_CODE (compare_op);
17736 flags = XEXP (compare_op, 0);
17738 if (GET_MODE (flags) == CCFPmode
17739 || GET_MODE (flags) == CCFPUmode)
17743 = ix86_fp_compare_code_to_integer (compare_code);
17746 /* To simplify the rest of the code, restrict to the GEU case. */
17747 if (compare_code == LTU)
17749 HOST_WIDE_INT tmp = ct;
17752 compare_code = reverse_condition (compare_code);
17753 code = reverse_condition (code);
17758 PUT_CODE (compare_op,
17759 reverse_condition_maybe_unordered
17760 (GET_CODE (compare_op)));
17762 PUT_CODE (compare_op,
17763 reverse_condition (GET_CODE (compare_op)));
17767 if (reg_overlap_mentioned_p (out, op0)
17768 || reg_overlap_mentioned_p (out, op1))
17769 tmp = gen_reg_rtx (mode);
17771 if (mode == DImode)
17772 emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
17774 emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
17775 flags, compare_op));
17779 if (code == GT || code == GE)
17780 code = reverse_condition (code);
17783 HOST_WIDE_INT tmp = ct;
17788 tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
17801 tmp = expand_simple_binop (mode, PLUS,
17803 copy_rtx (tmp), 1, OPTAB_DIRECT);
17814 tmp = expand_simple_binop (mode, IOR,
17816 copy_rtx (tmp), 1, OPTAB_DIRECT);
17818 else if (diff == -1 && ct)
17828 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
17830 tmp = expand_simple_binop (mode, PLUS,
17831 copy_rtx (tmp), GEN_INT (cf),
17832 copy_rtx (tmp), 1, OPTAB_DIRECT);
17840 * andl cf - ct, dest
17850 tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
17853 tmp = expand_simple_binop (mode, AND,
17855 gen_int_mode (cf - ct, mode),
17856 copy_rtx (tmp), 1, OPTAB_DIRECT);
17858 tmp = expand_simple_binop (mode, PLUS,
17859 copy_rtx (tmp), GEN_INT (ct),
17860 copy_rtx (tmp), 1, OPTAB_DIRECT);
17863 if (!rtx_equal_p (tmp, out))
17864 emit_move_insn (copy_rtx (out), copy_rtx (tmp));
17871 enum machine_mode cmp_mode = GET_MODE (op0);
17874 tmp = ct, ct = cf, cf = tmp;
17877 if (SCALAR_FLOAT_MODE_P (cmp_mode))
17879 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
17881 /* We may be reversing an unordered compare to a normal compare,
17882 which is not valid in general (we may convert a non-trapping
17883 condition to a trapping one); however, on i386 we currently
17884 emit all comparisons unordered. */
17885 compare_code = reverse_condition_maybe_unordered (compare_code);
17886 code = reverse_condition_maybe_unordered (code);
17890 compare_code = reverse_condition (compare_code);
17891 code = reverse_condition (code);
17895 compare_code = UNKNOWN;
17896 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
17897 && CONST_INT_P (op1))
17899 if (op1 == const0_rtx
17900 && (code == LT || code == GE))
17901 compare_code = code;
17902 else if (op1 == constm1_rtx)
17906 else if (code == GT)
17911 /* Optimize dest = (op0 < 0) ? -1 : cf. */
17912 if (compare_code != UNKNOWN
17913 && GET_MODE (op0) == GET_MODE (out)
17914 && (cf == -1 || ct == -1))
17916 /* If the lea code below could be used, only optimize
17917 if it results in a 2 insn sequence. */
17919 if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
17920 || diff == 3 || diff == 5 || diff == 9)
17921 || (compare_code == LT && ct == -1)
17922 || (compare_code == GE && cf == -1))
17925 * notl op1 (if necessary)
17933 code = reverse_condition (code);
17936 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
17938 out = expand_simple_binop (mode, IOR,
17940 out, 1, OPTAB_DIRECT);
17941 if (out != operands[0])
17942 emit_move_insn (operands[0], out);
17949 if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
17950 || diff == 3 || diff == 5 || diff == 9)
17951 && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
17953 || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
17959 * lea cf(dest*(ct-cf)),dest
17963 * This also catches the degenerate setcc-only case.
17969 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
17972 /* On x86_64 the lea instruction operates on Pmode, so we need
17973 to do the arithmetic in the proper mode to match. */
17975 tmp = copy_rtx (out);
17979 out1 = copy_rtx (out);
17980 tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
17984 tmp = gen_rtx_PLUS (mode, tmp, out1);
17990 tmp = gen_rtx_PLUS (mode, tmp, GEN_INT (cf));
17993 if (!rtx_equal_p (tmp, out))
17996 out = force_operand (tmp, copy_rtx (out));
17998 emit_insn (gen_rtx_SET (VOIDmode, copy_rtx (out), copy_rtx (tmp)));
18000 if (!rtx_equal_p (out, operands[0]))
18001 emit_move_insn (operands[0], copy_rtx (out));
18007 * General case: Jumpful:
18008 * xorl dest,dest cmpl op1, op2
18009 * cmpl op1, op2 movl ct, dest
18010 * setcc dest jcc 1f
18011 * decl dest movl cf, dest
18012 * andl (cf-ct),dest 1:
18015 * Size 20. Size 14.
18017 * This is reasonably steep, but branch mispredict costs are
18018 * high on modern cpus, so consider failing only if optimizing for space. */
18022 if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
18023 && BRANCH_COST (optimize_insn_for_speed_p (),
18028 enum machine_mode cmp_mode = GET_MODE (op0);
18033 if (SCALAR_FLOAT_MODE_P (cmp_mode))
18035 gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
18037 /* We may be reversing unordered compare to normal compare,
18038 that is not valid in general (we may convert non-trapping
18039 condition to trapping one), however on i386 we currently
18040 emit all comparisons unordered. */
18041 code = reverse_condition_maybe_unordered (code);
18045 code = reverse_condition (code);
18046 if (compare_code != UNKNOWN)
18047 compare_code = reverse_condition (compare_code);
18051 if (compare_code != UNKNOWN)
18053 /* notl op1 (if needed)
18058 For x < 0 (resp. x <= -1) there will be no notl,
18059 so if possible swap the constants to get rid of the complement.
18061 True/false will be -1/0 while code below (store flag
18062 followed by decrement) is 0/-1, so the constants need
18063 to be exchanged once more. */
18065 if (compare_code == GE || !cf)
18067 code = reverse_condition (code);
18072 HOST_WIDE_INT tmp = cf;
18077 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
18081 out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
18083 out = expand_simple_binop (mode, PLUS, copy_rtx (out),
18085 copy_rtx (out), 1, OPTAB_DIRECT);
18088 out = expand_simple_binop (mode, AND, copy_rtx (out),
18089 gen_int_mode (cf - ct, mode),
18090 copy_rtx (out), 1, OPTAB_DIRECT);
18092 out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
18093 copy_rtx (out), 1, OPTAB_DIRECT);
18094 if (!rtx_equal_p (out, operands[0]))
18095 emit_move_insn (operands[0], copy_rtx (out));
18101 if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
18103 /* Try a few more things with specific constants and a variable. */
18106 rtx var, orig_out, out, tmp;
18108 if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
18111 /* If one of the two operands is an interesting constant, load a
18112 constant with the above and mask it in with a logical operation. */
18114 if (CONST_INT_P (operands[2]))
18117 if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
18118 operands[3] = constm1_rtx, op = and_optab;
18119 else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
18120 operands[3] = const0_rtx, op = ior_optab;
18124 else if (CONST_INT_P (operands[3]))
18127 if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
18128 operands[2] = constm1_rtx, op = and_optab;
18129 else if (INTVAL (operands[3]) == -1 && operands[2] != const0_rtx)
18130 operands[2] = const0_rtx, op = ior_optab;
18137 orig_out = operands[0];
18138 tmp = gen_reg_rtx (mode);
18141 /* Recurse to get the constant loaded. */
18142 if (ix86_expand_int_movcc (operands) == 0)
18145 /* Mask in the interesting variable. */
18146 out = expand_binop (mode, op, var, tmp, orig_out, 0,
18148 if (!rtx_equal_p (out, orig_out))
18149 emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
18155 * For comparison with above,
18165 if (! nonimmediate_operand (operands[2], mode))
18166 operands[2] = force_reg (mode, operands[2]);
18167 if (! nonimmediate_operand (operands[3], mode))
18168 operands[3] = force_reg (mode, operands[3]);
18170 if (! register_operand (operands[2], VOIDmode)
18172 || ! register_operand (operands[3], VOIDmode)))
18173 operands[2] = force_reg (mode, operands[2]);
18176 && ! register_operand (operands[3], VOIDmode))
18177 operands[3] = force_reg (mode, operands[3]);
18179 emit_insn (compare_seq);
18180 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
18181 gen_rtx_IF_THEN_ELSE (mode,
18182 compare_op, operands[2],
18187 /* Swap, force into registers, or otherwise massage the two operands
18188 to an sse comparison with a mask result. Thus we differ a bit from
18189 ix86_prepare_fp_compare_args which expects to produce a flags result.
18191 The DEST operand exists to help determine whether to commute commutative
18192 operators. The POP0/POP1 operands are updated in place. The new
18193 comparison code is returned, or UNKNOWN if not implementable. */
18195 static enum rtx_code
18196 ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
18197 rtx *pop0, rtx *pop1)
18205 /* We have no LTGT as an operator. We could implement it with
18206 NE & ORDERED, but this requires an extra temporary. It's
18207 not clear that it's worth it. */
18214 /* These are supported directly. */
18221 /* For commutative operators, try to canonicalize the destination
18222 operand to be first in the comparison - this helps reload to
18223 avoid extra moves. */
18224 if (!dest || !rtx_equal_p (dest, *pop1))
18232 /* These are not supported directly. Swap the comparison operands
18233 to transform into something that is supported. */
18237 code = swap_condition (code);
18241 gcc_unreachable ();
18247 /* Detect conditional moves that exactly match min/max operational
18248 semantics. Note that this is IEEE safe, as long as we don't
18249 interchange the operands.
18251 Returns FALSE if this conditional move doesn't match a MIN/MAX,
18252 and TRUE if the operation is successful and instructions are emitted. */
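/* For example, dest = (a < b) ? a : b matches SMIN and can become a
   single minss/minps.  Operand order matters for IEEE semantics: the
   hardware min/max return the second operand when the comparison is
   unordered, hence the restriction on interchanging operands
   (illustrative note).  */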
18255 ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
18256 rtx cmp_op1, rtx if_true, rtx if_false)
18258 enum machine_mode mode;
18264 else if (code == UNGE)
18267 if_true = if_false;
18273 if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
18275 else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
18280 mode = GET_MODE (dest);
18282 /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
18283 but MODE may be a vector mode and thus not appropriate. */
18284 if (!flag_finite_math_only || !flag_unsafe_math_optimizations)
18286 int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
18289 if_true = force_reg (mode, if_true);
18290 v = gen_rtvec (2, if_true, if_false);
18291 tmp = gen_rtx_UNSPEC (mode, v, u);
18295 code = is_min ? SMIN : SMAX;
18296 tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
18299 emit_insn (gen_rtx_SET (VOIDmode, dest, tmp));
18303 /* Expand an sse vector comparison. Return the register with the result. */
18306 ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
18307 rtx op_true, rtx op_false)
18309 enum machine_mode mode = GET_MODE (dest);
18312 cmp_op0 = force_reg (mode, cmp_op0);
18313 if (!nonimmediate_operand (cmp_op1, mode))
18314 cmp_op1 = force_reg (mode, cmp_op1);
18317 || reg_overlap_mentioned_p (dest, op_true)
18318 || reg_overlap_mentioned_p (dest, op_false))
18319 dest = gen_reg_rtx (mode);
18321 x = gen_rtx_fmt_ee (code, mode, cmp_op0, cmp_op1);
18322 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18327 /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
18328 operations. This is used for both scalar and vector conditional moves. */
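/* The general form computed here is the classic mask merge,

	dest = (cmp & op_true) | (~cmp & op_false)

   i.e. pand/pandn/por in SSE terms; the special cases below avoid part
   of that work when one arm is zero (illustrative summary).  */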
18331 ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
18333 enum machine_mode mode = GET_MODE (dest);
18336 if (op_false == CONST0_RTX (mode))
18338 op_true = force_reg (mode, op_true);
18339 x = gen_rtx_AND (mode, cmp, op_true);
18340 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18342 else if (op_true == CONST0_RTX (mode))
18344 op_false = force_reg (mode, op_false);
18345 x = gen_rtx_NOT (mode, cmp);
18346 x = gen_rtx_AND (mode, x, op_false);
18347 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18349 else if (TARGET_XOP)
18351 rtx pcmov = gen_rtx_SET (mode, dest,
18352 gen_rtx_IF_THEN_ELSE (mode, cmp,
18359 op_true = force_reg (mode, op_true);
18360 op_false = force_reg (mode, op_false);
18362 t2 = gen_reg_rtx (mode);
18364 t3 = gen_reg_rtx (mode);
18368 x = gen_rtx_AND (mode, op_true, cmp);
18369 emit_insn (gen_rtx_SET (VOIDmode, t2, x));
18371 x = gen_rtx_NOT (mode, cmp);
18372 x = gen_rtx_AND (mode, x, op_false);
18373 emit_insn (gen_rtx_SET (VOIDmode, t3, x));
18375 x = gen_rtx_IOR (mode, t3, t2);
18376 emit_insn (gen_rtx_SET (VOIDmode, dest, x));
18380 /* Expand a floating-point conditional move. Return true if successful. */
18383 ix86_expand_fp_movcc (rtx operands[])
18385 enum machine_mode mode = GET_MODE (operands[0]);
18386 enum rtx_code code = GET_CODE (operands[1]);
18387 rtx tmp, compare_op;
18388 rtx op0 = XEXP (operands[1], 0);
18389 rtx op1 = XEXP (operands[1], 1);
18391 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
18393 enum machine_mode cmode;
18395 /* Since we have no cmove for sse registers, don't force bad register
18396 allocation just to gain access to it. Deny movcc when the
18397 comparison mode doesn't match the move mode. */
18398 cmode = GET_MODE (op0);
18399 if (cmode == VOIDmode)
18400 cmode = GET_MODE (op1);
18404 code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
18405 if (code == UNKNOWN)
18408 if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
18409 operands[2], operands[3]))
18412 tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
18413 operands[2], operands[3]);
18414 ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
18418 /* The floating point conditional move instructions don't directly
18419 support conditions resulting from a signed integer comparison. */
18421 compare_op = ix86_expand_compare (code, op0, op1);
18422 if (!fcmov_comparison_operator (compare_op, VOIDmode))
18424 tmp = gen_reg_rtx (QImode);
18425 ix86_expand_setcc (tmp, code, op0, op1);
18427 compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
18430 emit_insn (gen_rtx_SET (VOIDmode, operands[0],
18431 gen_rtx_IF_THEN_ELSE (mode, compare_op,
18432 operands[2], operands[3])));
18437 /* Expand a floating-point vector conditional move; a vcond operation
18438 rather than a movcc operation. */
18441 ix86_expand_fp_vcond (rtx operands[])
18443 enum rtx_code code = GET_CODE (operands[3]);
18446 code = ix86_prepare_sse_fp_compare_args (operands[0], code,
18447 &operands[4], &operands[5]);
18448 if (code == UNKNOWN)
18451 if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
18452 operands[5], operands[1], operands[2]))
18455 cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
18456 operands[1], operands[2]);
18457 ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
18461 /* Expand a signed/unsigned integral vector conditional move. */
18464 ix86_expand_int_vcond (rtx operands[])
18466 enum machine_mode mode = GET_MODE (operands[0]);
18467 enum rtx_code code = GET_CODE (operands[3]);
18468 bool negate = false;
18471 cop0 = operands[4];
18472 cop1 = operands[5];
18474 /* XOP supports all of the comparisons on all vector int types. */
18477 /* Canonicalize the comparison to EQ, GT, GTU. */
18488 code = reverse_condition (code);
18494 code = reverse_condition (code);
18500 code = swap_condition (code);
18501 x = cop0, cop0 = cop1, cop1 = x;
18505 gcc_unreachable ();
18508 /* Only SSE4.1/SSE4.2 supports V2DImode. */
18509 if (mode == V2DImode)
18514 /* SSE4.1 supports EQ. */
18515 if (!TARGET_SSE4_1)
18521 /* SSE4.2 supports GT/GTU. */
18522 if (!TARGET_SSE4_2)
18527 gcc_unreachable ();
18531 /* Unsigned parallel compare is not supported by the hardware.
18532 Play some tricks to turn this into a signed comparison against 0. */
18536 cop0 = force_reg (mode, cop0);
18544 rtx (*gen_sub3) (rtx, rtx, rtx);
18546 /* Subtract (-(INT MAX) - 1) from both operands to make them signed. */
18548 mask = ix86_build_signbit_mask (mode, true, false);
18549 gen_sub3 = (mode == V4SImode
18550 ? gen_subv4si3 : gen_subv2di3);
18551 t1 = gen_reg_rtx (mode);
18552 emit_insn (gen_sub3 (t1, cop0, mask));
18554 t2 = gen_reg_rtx (mode);
18555 emit_insn (gen_sub3 (t2, cop1, mask));
18565 /* Perform a parallel unsigned saturating subtraction. */
18566 x = gen_reg_rtx (mode);
18567 emit_insn (gen_rtx_SET (VOIDmode, x,
18568 gen_rtx_US_MINUS (mode, cop0, cop1)));
18571 cop1 = CONST0_RTX (mode);
18577 gcc_unreachable ();
18582 x = ix86_expand_sse_cmp (operands[0], code, cop0, cop1,
18583 operands[1+negate], operands[2-negate]);
18585 ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
18586 operands[2-negate]);
18590 /* Unpack OP[1] into the next wider integer vector type. UNSIGNED_P is
18591 true if we should do zero extension, else sign extension. HIGH_P is
18592 true if we want the N/2 high elements, else the low elements. */
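/* For instance, zero extending the low half of a V16QImode register to
   V8HImode can be done by interleaving it with a zero vector
   (punpcklbw), which is the non-SSE4.1 path below; with SSE4.1 the
   pmovzx/pmovsx family does the extension directly (illustrative
   note).  */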
18595 ix86_expand_sse_unpack (rtx operands[2], bool unsigned_p, bool high_p)
18597 enum machine_mode imode = GET_MODE (operands[1]);
18602 rtx (*unpack)(rtx, rtx);
18608 unpack = gen_sse4_1_zero_extendv8qiv8hi2;
18610 unpack = gen_sse4_1_sign_extendv8qiv8hi2;
18614 unpack = gen_sse4_1_zero_extendv4hiv4si2;
18616 unpack = gen_sse4_1_sign_extendv4hiv4si2;
18620 unpack = gen_sse4_1_zero_extendv2siv2di2;
18622 unpack = gen_sse4_1_sign_extendv2siv2di2;
18625 gcc_unreachable ();
18630 /* Shift higher 8 bytes to lower 8 bytes. */
18631 tmp = gen_reg_rtx (imode);
18632 emit_insn (gen_sse2_lshrv1ti3 (gen_lowpart (V1TImode, tmp),
18633 gen_lowpart (V1TImode, operands[1]),
18639 emit_insn (unpack (operands[0], tmp));
18643 rtx (*unpack)(rtx, rtx, rtx);
18649 unpack = gen_vec_interleave_highv16qi;
18651 unpack = gen_vec_interleave_lowv16qi;
18655 unpack = gen_vec_interleave_highv8hi;
18657 unpack = gen_vec_interleave_lowv8hi;
18661 unpack = gen_vec_interleave_highv4si;
18663 unpack = gen_vec_interleave_lowv4si;
18666 gcc_unreachable ();
18669 dest = gen_lowpart (imode, operands[0]);
18672 tmp = force_reg (imode, CONST0_RTX (imode));
18674 tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
18675 operands[1], pc_rtx, pc_rtx);
18677 emit_insn (unpack (dest, operands[1], tmp));
18681 /* Expand conditional increment or decrement using adc/sbb instructions.
18682 The default case using setcc followed by the conditional move can be
18683 done by generic code. */
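/* Illustrative sketch of the idea: for `if (a < b) x++;' with unsigned
   operands the expansion is

	cmpl	b, a		; CF = 1 iff a <u b
	adcl	$0, x		; x += CF

   and the symmetric sbb form handles the decrement case (assumed
   encoding, not literal output).  */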
18685 ix86_expand_int_addcc (rtx operands[])
18687 enum rtx_code code = GET_CODE (operands[1]);
18689 rtx (*insn)(rtx, rtx, rtx, rtx, rtx);
18691 rtx val = const0_rtx;
18692 bool fpcmp = false;
18693 enum machine_mode mode;
18694 rtx op0 = XEXP (operands[1], 0);
18695 rtx op1 = XEXP (operands[1], 1);
18697 if (operands[3] != const1_rtx
18698 && operands[3] != constm1_rtx)
18700 if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
18702 code = GET_CODE (compare_op);
18704 flags = XEXP (compare_op, 0);
18706 if (GET_MODE (flags) == CCFPmode
18707 || GET_MODE (flags) == CCFPUmode)
18710 code = ix86_fp_compare_code_to_integer (code);
18717 PUT_CODE (compare_op,
18718 reverse_condition_maybe_unordered
18719 (GET_CODE (compare_op)));
18721 PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
18724 mode = GET_MODE (operands[0]);
18726 /* Construct either adc or sbb insn. */
18727 if ((code == LTU) == (operands[3] == constm1_rtx))
18732 insn = gen_subqi3_carry;
18735 insn = gen_subhi3_carry;
18738 insn = gen_subsi3_carry;
18741 insn = gen_subdi3_carry;
18744 gcc_unreachable ();
18752 insn = gen_addqi3_carry;
18755 insn = gen_addhi3_carry;
18758 insn = gen_addsi3_carry;
18761 insn = gen_adddi3_carry;
18764 gcc_unreachable ();
18767 emit_insn (insn (operands[0], operands[2], val, flags, compare_op));
18773 /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
18774 but works for floating point parameters and non-offsettable memories.
18775 For pushes, it returns just stack offsets; the values will be saved
18776 in the right order. At most four parts are generated. */
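/* E.g. a DFmode value on a 32-bit target yields two SImode parts and an
   XFmode value three; a CONST_DOUBLE source is decomposed into the
   equivalent SImode immediates (illustrative summary of the cases
   below).  */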
18779 ix86_split_to_parts (rtx operand, rtx *parts, enum machine_mode mode)
18784 size = mode == XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
18786 size = (GET_MODE_SIZE (mode) + 4) / 8;
18788 gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
18789 gcc_assert (size >= 2 && size <= 4);
18791 /* Optimize constant pool reference to immediates. This is used by fp
18792 moves, which force all constants to memory to allow combining. */
18793 if (MEM_P (operand) && MEM_READONLY_P (operand))
18795 rtx tmp = maybe_get_pool_constant (operand);
18800 if (MEM_P (operand) && !offsettable_memref_p (operand))
18802 /* The only non-offsettable memories we handle are pushes. */
18803 int ok = push_operand (operand, VOIDmode);
18807 operand = copy_rtx (operand);
18808 PUT_MODE (operand, Pmode);
18809 parts[0] = parts[1] = parts[2] = parts[3] = operand;
18813 if (GET_CODE (operand) == CONST_VECTOR)
18815 enum machine_mode imode = int_mode_for_mode (mode);
18816 /* Caution: if we looked through a constant pool memory above,
18817 the operand may actually have a different mode now. That's
18818 ok, since we want to pun this all the way back to an integer. */
18819 operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
18820 gcc_assert (operand != NULL);
18826 if (mode == DImode)
18827 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
18832 if (REG_P (operand))
18834 gcc_assert (reload_completed);
18835 for (i = 0; i < size; i++)
18836 parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
18838 else if (offsettable_memref_p (operand))
18840 operand = adjust_address (operand, SImode, 0);
18841 parts[0] = operand;
18842 for (i = 1; i < size; i++)
18843 parts[i] = adjust_address (operand, SImode, 4 * i);
18845 else if (GET_CODE (operand) == CONST_DOUBLE)
18850 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
18854 real_to_target (l, &r, mode);
18855 parts[3] = gen_int_mode (l[3], SImode);
18856 parts[2] = gen_int_mode (l[2], SImode);
18859 REAL_VALUE_TO_TARGET_LONG_DOUBLE (r, l);
18860 parts[2] = gen_int_mode (l[2], SImode);
18863 REAL_VALUE_TO_TARGET_DOUBLE (r, l);
18866 gcc_unreachable ();
18868 parts[1] = gen_int_mode (l[1], SImode);
18869 parts[0] = gen_int_mode (l[0], SImode);
18872 gcc_unreachable ();
18877 if (mode == TImode)
18878 split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
18879 if (mode == XFmode || mode == TFmode)
18881 enum machine_mode upper_mode = mode == XFmode ? SImode : DImode;
18882 if (REG_P (operand))
18884 gcc_assert (reload_completed);
18885 parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
18886 parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
18888 else if (offsettable_memref_p (operand))
18890 operand = adjust_address (operand, DImode, 0);
18891 parts[0] = operand;
18892 parts[1] = adjust_address (operand, upper_mode, 8);
18894 else if (GET_CODE (operand) == CONST_DOUBLE)
18899 REAL_VALUE_FROM_CONST_DOUBLE (r, operand);
18900 real_to_target (l, &r, mode);
18902 /* Do not use shift by 32 to avoid warning on 32bit systems. */
18903 if (HOST_BITS_PER_WIDE_INT >= 64)
18906 ((l[0] & (((HOST_WIDE_INT) 2 << 31) - 1))
18907 + ((((HOST_WIDE_INT) l[1]) << 31) << 1),
18910 parts[0] = immed_double_const (l[0], l[1], DImode);
18912 if (upper_mode == SImode)
18913 parts[1] = gen_int_mode (l[2], SImode);
18914 else if (HOST_BITS_PER_WIDE_INT >= 64)
18917 ((l[2] & (((HOST_WIDE_INT) 2 << 31) - 1))
18918 + ((((HOST_WIDE_INT) l[3]) << 31) << 1),
18921 parts[1] = immed_double_const (l[2], l[3], DImode);
18924 gcc_unreachable ();
18931 /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
18932 Return false when normal moves are needed; true when all required
18933 insns have been emitted. Operands 2-4 contain the input values
18934 in the correct order; operands 5-7 contain the output values. */
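/* E.g. a DImode move on a 32-bit target becomes two SImode moves; when
   a destination register overlaps a source address, the parts are
   reordered or an lea is emitted first, as handled below (illustrative
   summary).  */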
18937 ix86_split_long_move (rtx operands[])
18942 int collisions = 0;
18943 enum machine_mode mode = GET_MODE (operands[0]);
18944 bool collisionparts[4];
18946 /* The DFmode expanders may ask us to move a double.
18947 For a 64bit target this is a single move. By hiding the fact
18948 here we simplify the i386.md splitters. */
18949 if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
18951 /* Optimize constant pool reference to immediates. This is used by
18952 fp moves, which force all constants to memory to allow combining. */
18954 if (MEM_P (operands[1])
18955 && GET_CODE (XEXP (operands[1], 0)) == SYMBOL_REF
18956 && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
18957 operands[1] = get_pool_constant (XEXP (operands[1], 0));
18958 if (push_operand (operands[0], VOIDmode))
18960 operands[0] = copy_rtx (operands[0]);
18961 PUT_MODE (operands[0], Pmode);
18964 operands[0] = gen_lowpart (DImode, operands[0]);
18965 operands[1] = gen_lowpart (DImode, operands[1]);
18966 emit_move_insn (operands[0], operands[1]);
18970 /* The only non-offsettable memory we handle is push. */
18971 if (push_operand (operands[0], VOIDmode))
18974 gcc_assert (!MEM_P (operands[0])
18975 || offsettable_memref_p (operands[0]));
18977 nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
18978 ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
18980 /* When emitting a push, take care of source operands on the stack. */
18981 if (push && MEM_P (operands[1])
18982 && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
18984 rtx src_base = XEXP (part[1][nparts - 1], 0);
18986 /* Compensate for the stack decrement by 4. */
18987 if (!TARGET_64BIT && nparts == 3
18988 && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
18989 src_base = plus_constant (src_base, 4);
18991 /* src_base refers to the stack pointer and is
18992 automatically decreased by emitted push. */
18993 for (i = 0; i < nparts; i++)
18994 part[1][i] = change_address (part[1][i],
18995 GET_MODE (part[1][i]), src_base);
18998 /* We need to do the copy in the right order in case an address register
18999 of the source overlaps the destination. */
19000 if (REG_P (part[0][0]) && MEM_P (part[1][0]))
19004 for (i = 0; i < nparts; i++)
19007 = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
19008 if (collisionparts[i])
19012 /* Collision in the middle part can be handled by reordering. */
19013 if (collisions == 1 && nparts == 3 && collisionparts [1])
19015 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
19016 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
19018 else if (collisions == 1
19020 && (collisionparts [1] || collisionparts [2]))
19022 if (collisionparts [1])
19024 tmp = part[0][1]; part[0][1] = part[0][2]; part[0][2] = tmp;
19025 tmp = part[1][1]; part[1][1] = part[1][2]; part[1][2] = tmp;
19029 tmp = part[0][2]; part[0][2] = part[0][3]; part[0][3] = tmp;
19030 tmp = part[1][2]; part[1][2] = part[1][3]; part[1][3] = tmp;
19034 /* If there are more collisions, we can't handle them by reordering.
19035 Do an lea to the last part and use only one colliding move. */
19036 else if (collisions > 1)
19042 base = part[0][nparts - 1];
19044 /* Handle the case when the last part isn't valid for lea.
19045 Happens in 64-bit mode storing the 12-byte XFmode. */
19046 if (GET_MODE (base) != Pmode)
19047 base = gen_rtx_REG (Pmode, REGNO (base));
19049 emit_insn (gen_rtx_SET (VOIDmode, base, XEXP (part[1][0], 0)));
19050 part[1][0] = replace_equiv_address (part[1][0], base);
19051 for (i = 1; i < nparts; i++)
19053 tmp = plus_constant (base, UNITS_PER_WORD * i);
19054 part[1][i] = replace_equiv_address (part[1][i], tmp);
19065 if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
19066 emit_insn (gen_addsi3 (stack_pointer_rtx,
19067 stack_pointer_rtx, GEN_INT (-4)));
19068 emit_move_insn (part[0][2], part[1][2]);
19070 else if (nparts == 4)
19072 emit_move_insn (part[0][3], part[1][3]);
19073 emit_move_insn (part[0][2], part[1][2]);
19078 /* In 64bit mode we don't have a 32bit push available. In case this is
19079 a register, it is OK - we will just use the larger counterpart. We also
19080 retype memory - these come from an attempt to avoid the REX prefix on
19081 moving the second half of a TFmode value. */
19082 if (GET_MODE (part[1][1]) == SImode)
19084 switch (GET_CODE (part[1][1]))
19087 part[1][1] = adjust_address (part[1][1], DImode, 0);
19091 part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
19095 gcc_unreachable ();
19098 if (GET_MODE (part[1][0]) == SImode)
19099 part[1][0] = part[1][1];
19102 emit_move_insn (part[0][1], part[1][1]);
19103 emit_move_insn (part[0][0], part[1][0]);
19107 /* Choose the correct order so as not to overwrite the source before it is copied. */
19108 if ((REG_P (part[0][0])
19109 && REG_P (part[1][1])
19110 && (REGNO (part[0][0]) == REGNO (part[1][1])
19112 && REGNO (part[0][0]) == REGNO (part[1][2]))
19114 && REGNO (part[0][0]) == REGNO (part[1][3]))))
19116 && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
19118 for (i = 0, j = nparts - 1; i < nparts; i++, j--)
19120 operands[2 + i] = part[0][j];
19121 operands[6 + i] = part[1][j];
19126 for (i = 0; i < nparts; i++)
19128 operands[2 + i] = part[0][i];
19129 operands[6 + i] = part[1][i];
19133 /* If optimizing for size, attempt to locally unCSE nonzero constants. */
19134 if (optimize_insn_for_size_p ())
19136 for (j = 0; j < nparts - 1; j++)
19137 if (CONST_INT_P (operands[6 + j])
19138 && operands[6 + j] != const0_rtx
19139 && REG_P (operands[2 + j]))
19140 for (i = j; i < nparts - 1; i++)
19141 if (CONST_INT_P (operands[7 + i])
19142 && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
19143 operands[7 + i] = operands[2 + j];
19146 for (i = 0; i < nparts; i++)
19147 emit_move_insn (operands[2 + i], operands[6 + i]);
19152 /* Helper function of ix86_split_ashl used to generate an SImode/DImode
19153 left shift by a constant, either using a single shift or
19154 a sequence of add instructions. */
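/* E.g. a shift left by 1 can be emitted as `addl %eax, %eax' when the
   add is no more costly than the shift; larger counts fall back to a
   single shift insn (illustrative; the trade-off comes from ix86_cost
   below).  */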
19157 ix86_expand_ashl_const (rtx operand, int count, enum machine_mode mode)
19159 rtx (*insn)(rtx, rtx, rtx);
19162 || (count * ix86_cost->add <= ix86_cost->shift_const
19163 && !optimize_insn_for_size_p ()))
19165 insn = mode == DImode ? gen_addsi3 : gen_adddi3;
19166 while (count-- > 0)
19167 emit_insn (insn (operand, operand, operand));
19171 insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
19172 emit_insn (insn (operand, operand, GEN_INT (count)));
19177 ix86_split_ashl (rtx *operands, rtx scratch, enum machine_mode mode)
19179 rtx (*gen_ashl3)(rtx, rtx, rtx);
19180 rtx (*gen_shld)(rtx, rtx, rtx);
19181 int half_width = GET_MODE_BITSIZE (mode) >> 1;
19183 rtx low[2], high[2];
19186 if (CONST_INT_P (operands[2]))
19188 split_double_mode (mode, operands, 2, low, high);
19189 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
19191 if (count >= half_width)
19193 emit_move_insn (high[0], low[1]);
19194 emit_move_insn (low[0], const0_rtx);
19196 if (count > half_width)
19197 ix86_expand_ashl_const (high[0], count - half_width, mode);
19201 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
19203 if (!rtx_equal_p (operands[0], operands[1]))
19204 emit_move_insn (operands[0], operands[1]);
19206 emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
19207 ix86_expand_ashl_const (low[0], count, mode);
19212 split_double_mode (mode, operands, 1, low, high);
19214 gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
19216 if (operands[1] == const1_rtx)
19218 /* Assuming we've chosen QImode-capable registers, 1 << N
19219 can be done with two 32/64-bit shifts, no branches, no cmoves. */
19220 if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
19222 rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
19224 ix86_expand_clear (low[0]);
19225 ix86_expand_clear (high[0]);
19226 emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
19228 d = gen_lowpart (QImode, low[0]);
19229 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
19230 s = gen_rtx_EQ (QImode, flags, const0_rtx);
19231 emit_insn (gen_rtx_SET (VOIDmode, d, s));
19233 d = gen_lowpart (QImode, high[0]);
19234 d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
19235 s = gen_rtx_NE (QImode, flags, const0_rtx);
19236 emit_insn (gen_rtx_SET (VOIDmode, d, s));
19239 /* Otherwise, we can get the same results by manually performing
19240 a bit extract operation on bit 5/6, and then performing the two
19241 shifts. The two methods of getting 0/1 into low/high are exactly
19242 the same size. Avoiding the shift in the bit extract case helps
19243 pentium4 a bit; no one else seems to care much either way. */
19246 enum machine_mode half_mode;
19247 rtx (*gen_lshr3)(rtx, rtx, rtx);
19248 rtx (*gen_and3)(rtx, rtx, rtx);
19249 rtx (*gen_xor3)(rtx, rtx, rtx);
19250 HOST_WIDE_INT bits;
19253 if (mode == DImode)
19255 half_mode = SImode;
19256 gen_lshr3 = gen_lshrsi3;
19257 gen_and3 = gen_andsi3;
19258 gen_xor3 = gen_xorsi3;
19263 half_mode = DImode;
19264 gen_lshr3 = gen_lshrdi3;
19265 gen_and3 = gen_anddi3;
19266 gen_xor3 = gen_xordi3;
19270 if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
19271 x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
19273 x = gen_lowpart (half_mode, operands[2]);
19274 emit_insn (gen_rtx_SET (VOIDmode, high[0], x));
19276 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
19277 emit_insn (gen_and3 (high[0], high[0], const1_rtx));
19278 emit_move_insn (low[0], high[0]);
19279 emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
19282 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
19283 emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
19287 if (operands[1] == constm1_rtx)
19289 /* For -1 << N, we can avoid the shld instruction, because we
19290 know that we're shifting 0...31/63 ones into a -1. */
19291 emit_move_insn (low[0], constm1_rtx);
19292 if (optimize_insn_for_size_p ())
19293 emit_move_insn (high[0], low[0]);
19295 emit_move_insn (high[0], constm1_rtx);
19299 gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
19301 if (!rtx_equal_p (operands[0], operands[1]))
19302 emit_move_insn (operands[0], operands[1]);
19304 split_double_mode (mode, operands, 1, low, high);
19305 emit_insn (gen_shld (high[0], low[0], operands[2]));
19308 emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
19310 if (TARGET_CMOVE && scratch)
19312 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
19313 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
19315 ix86_expand_clear (scratch);
19316 emit_insn (gen_x86_shift_adj_1 (high[0], low[0], operands[2], scratch));
19320 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
19321 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
19323 emit_insn (gen_x86_shift_adj_2 (high[0], low[0], operands[2]));
19328 ix86_split_ashr (rtx *operands, rtx scratch, enum machine_mode mode)
19330 rtx (*gen_ashr3)(rtx, rtx, rtx)
19331 = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
19332 rtx (*gen_shrd)(rtx, rtx, rtx);
19333 int half_width = GET_MODE_BITSIZE (mode) >> 1;
19335 rtx low[2], high[2];
19338 if (CONST_INT_P (operands[2]))
19340 split_double_mode (mode, operands, 2, low, high);
19341 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
19343 if (count == GET_MODE_BITSIZE (mode) - 1)
19345 emit_move_insn (high[0], high[1]);
19346 emit_insn (gen_ashr3 (high[0], high[0],
19347 GEN_INT (half_width - 1)));
19348 emit_move_insn (low[0], high[0]);
19351 else if (count >= half_width)
19353 emit_move_insn (low[0], high[1]);
19354 emit_move_insn (high[0], low[0]);
19355 emit_insn (gen_ashr3 (high[0], high[0],
19356 GEN_INT (half_width - 1)));
19358 if (count > half_width)
19359 emit_insn (gen_ashr3 (low[0], low[0],
19360 GEN_INT (count - half_width)));
19364 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
19366 if (!rtx_equal_p (operands[0], operands[1]))
19367 emit_move_insn (operands[0], operands[1]);
19369 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
19370 emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
19375 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
19377 if (!rtx_equal_p (operands[0], operands[1]))
19378 emit_move_insn (operands[0], operands[1]);
19380 split_double_mode (mode, operands, 1, low, high);
19382 emit_insn (gen_shrd (low[0], high[0], operands[2]));
19383 emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
19385 if (TARGET_CMOVE && scratch)
19387 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
19388 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
19390 emit_move_insn (scratch, high[0]);
19391 emit_insn (gen_ashr3 (scratch, scratch,
19392 GEN_INT (half_width - 1)));
19393 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
19398 rtx (*gen_x86_shift_adj_3)(rtx, rtx, rtx)
19399 = mode == DImode ? gen_x86_shiftsi_adj_3 : gen_x86_shiftdi_adj_3;
19401 emit_insn (gen_x86_shift_adj_3 (low[0], high[0], operands[2]));
19407 ix86_split_lshr (rtx *operands, rtx scratch, enum machine_mode mode)
19409 rtx (*gen_lshr3)(rtx, rtx, rtx)
19410 = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
19411 rtx (*gen_shrd)(rtx, rtx, rtx);
19412 int half_width = GET_MODE_BITSIZE (mode) >> 1;
19414 rtx low[2], high[2];
19417 if (CONST_INT_P (operands[2]))
19419 split_double_mode (mode, operands, 2, low, high);
19420 count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
19422 if (count >= half_width)
19424 emit_move_insn (low[0], high[1]);
19425 ix86_expand_clear (high[0]);
19427 if (count > half_width)
19428 emit_insn (gen_lshr3 (low[0], low[0],
19429 GEN_INT (count - half_width)));
19433 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
19435 if (!rtx_equal_p (operands[0], operands[1]))
19436 emit_move_insn (operands[0], operands[1]);
19438 emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
19439 emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
19444 gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
19446 if (!rtx_equal_p (operands[0], operands[1]))
19447 emit_move_insn (operands[0], operands[1]);
19449 split_double_mode (mode, operands, 1, low, high);
19451 emit_insn (gen_shrd (low[0], high[0], operands[2]));
19452 emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
19454 if (TARGET_CMOVE && scratch)
19456 rtx (*gen_x86_shift_adj_1)(rtx, rtx, rtx, rtx)
19457 = mode == DImode ? gen_x86_shiftsi_adj_1 : gen_x86_shiftdi_adj_1;
19459 ix86_expand_clear (scratch);
19460 emit_insn (gen_x86_shift_adj_1 (low[0], high[0], operands[2],
19465 rtx (*gen_x86_shift_adj_2)(rtx, rtx, rtx)
19466 = mode == DImode ? gen_x86_shiftsi_adj_2 : gen_x86_shiftdi_adj_2;
19468 emit_insn (gen_x86_shift_adj_2 (low[0], high[0], operands[2]));
19473 /* Predict the just-emitted jump instruction to be taken with probability PROB. */
19475 predict_jump (int prob)
19477 rtx insn = get_last_insn ();
19478 gcc_assert (JUMP_P (insn));
19479 add_reg_note (insn, REG_BR_PROB, GEN_INT (prob));
19482 /* Helper function for the string operations below. Test whether VARIABLE
19483 is aligned to VALUE bytes. If true, jump to the label. */
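/* Illustrative sketch: for VALUE == 4 this emits the equivalent of

	testl	$4, variable
	jz	label

   letting the caller branch around the fix-up move (assumed encoding;
   the actual insns come from the and/cmp expanders below).  */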
19485 ix86_expand_aligntest (rtx variable, int value, bool epilogue)
19487 rtx label = gen_label_rtx ();
19488 rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
19489 if (GET_MODE (variable) == DImode)
19490 emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
19492 emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
19493 emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
19496 predict_jump (REG_BR_PROB_BASE * 50 / 100);
19498 predict_jump (REG_BR_PROB_BASE * 90 / 100);
19502 /* Adjust COUNTREG by VALUE. */
19504 ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
19506 rtx (*gen_add)(rtx, rtx, rtx)
19507 = GET_MODE (countreg) == DImode ? gen_adddi3 : gen_addsi3;
19509 emit_insn (gen_add (countreg, countreg, GEN_INT (-value)));
19512 /* Zero extend EXP, which is possibly in SImode, to a Pmode register. */
19514 ix86_zero_extend_to_Pmode (rtx exp)
19517 if (GET_MODE (exp) == VOIDmode)
19518 return force_reg (Pmode, exp);
19519 if (GET_MODE (exp) == Pmode)
19520 return copy_to_mode_reg (Pmode, exp);
19521 r = gen_reg_rtx (Pmode);
19522 emit_insn (gen_zero_extendsidi2 (r, exp));
19526 /* Divide COUNTREG by SCALE. */
19528 scale_counter (rtx countreg, int scale)
19534 if (CONST_INT_P (countreg))
19535 return GEN_INT (INTVAL (countreg) / scale);
19536 gcc_assert (REG_P (countreg));
19538 sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
19539 GEN_INT (exact_log2 (scale)),
19540 NULL, 1, OPTAB_DIRECT);
19544 /* Return mode for the memcpy/memset loop counter. Prefer SImode over
19545 DImode for constant loop counts. */
19547 static enum machine_mode
19548 counter_mode (rtx count_exp)
19550 if (GET_MODE (count_exp) != VOIDmode)
19551 return GET_MODE (count_exp);
19552 if (!CONST_INT_P (count_exp))
19554 if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
19559 /* When SRCPTR is non-NULL, output a simple loop to move memory from
19560 SRCPTR to DESTPTR in chunks of MODE, unrolled UNROLL times; the
19561 overall size is COUNT, specified in bytes. When SRCPTR is NULL, output the
19562 equivalent loop to set memory to VALUE (supposed to be in MODE).
19564 The size is rounded down to a whole number of chunks moved at once.
19565 SRCMEM and DESTMEM provide MEM rtx to feed proper aliasing info. */
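/* The generated loop has roughly this shape (illustrative, for the copy
   case with UNROLL == 1):

	size = count & -piece;  iter = 0;
     top:
	*(dest + iter) = *(src + iter);
	iter += piece;
	if (iter < size) goto top;

   after which DESTPTR (and SRCPTR) are advanced by ITER.  */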
19569 expand_set_or_movmem_via_loop (rtx destmem, rtx srcmem,
19570 rtx destptr, rtx srcptr, rtx value,
19571 rtx count, enum machine_mode mode, int unroll,
19574 rtx out_label, top_label, iter, tmp;
19575 enum machine_mode iter_mode = counter_mode (count);
19576 rtx piece_size = GEN_INT (GET_MODE_SIZE (mode) * unroll);
19577 rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
19583 top_label = gen_label_rtx ();
19584 out_label = gen_label_rtx ();
19585 iter = gen_reg_rtx (iter_mode);
19587 size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
19588 NULL, 1, OPTAB_DIRECT);
19589 /* Those two should combine. */
19590 if (piece_size == const1_rtx)
19592 emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
19594 predict_jump (REG_BR_PROB_BASE * 10 / 100);
19596 emit_move_insn (iter, const0_rtx);
19598 emit_label (top_label);
19600 tmp = convert_modes (Pmode, iter_mode, iter, true);
19601 x_addr = gen_rtx_PLUS (Pmode, destptr, tmp);
19602 destmem = change_address (destmem, mode, x_addr);
19606 y_addr = gen_rtx_PLUS (Pmode, srcptr, copy_rtx (tmp));
19607 srcmem = change_address (srcmem, mode, y_addr);
19609 /* When unrolling for chips that reorder memory reads and writes,
19610 we can save registers by using a single temporary.
19611 Using 4 temporaries is also overkill in 32bit mode. */
19612 if (!TARGET_64BIT && 0)
19614 for (i = 0; i < unroll; i++)
19619 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
19621 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
19623 emit_move_insn (destmem, srcmem);
19629 gcc_assert (unroll <= 4);
19630 for (i = 0; i < unroll; i++)
19632 tmpreg[i] = gen_reg_rtx (mode);
19636 adjust_address (copy_rtx (srcmem), mode, GET_MODE_SIZE (mode));
19638 emit_move_insn (tmpreg[i], srcmem);
19640 for (i = 0; i < unroll; i++)
19645 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
19647 emit_move_insn (destmem, tmpreg[i]);
19652 for (i = 0; i < unroll; i++)
19656 adjust_address (copy_rtx (destmem), mode, GET_MODE_SIZE (mode));
19657 emit_move_insn (destmem, value);
19660 tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
19661 true, OPTAB_LIB_WIDEN);
19663 emit_move_insn (iter, tmp);
19665 emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
19667 if (expected_size != -1)
19669 expected_size /= GET_MODE_SIZE (mode) * unroll;
19670 if (expected_size == 0)
19672 else if (expected_size > REG_BR_PROB_BASE)
19673 predict_jump (REG_BR_PROB_BASE - 1);
19675 predict_jump (REG_BR_PROB_BASE - (REG_BR_PROB_BASE + expected_size / 2) / expected_size);
19678 predict_jump (REG_BR_PROB_BASE * 80 / 100);
19679 iter = ix86_zero_extend_to_Pmode (iter);
19680 tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
19681 true, OPTAB_LIB_WIDEN);
19682 if (tmp != destptr)
19683 emit_move_insn (destptr, tmp);
19686 tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
19687 true, OPTAB_LIB_WIDEN);
19689 emit_move_insn (srcptr, tmp);
19691 emit_label (out_label);
19694 /* Output a "rep; mov" instruction.
19695 Arguments have the same meaning as for the previous function. */
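/* E.g. for MODE == SImode the emitted insn behaves like

	movl	count/4, %ecx
	rep movsl

   with DESTEXP/SRCEXP describing the final pointer values to the
   pattern (illustrative sketch, AT&T syntax assumed).  */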
19697 expand_movmem_via_rep_mov (rtx destmem, rtx srcmem,
19698 rtx destptr, rtx srcptr,
19700 enum machine_mode mode)
19706 /* If the size is known, it is shorter to use rep movs. */
19707 if (mode == QImode && CONST_INT_P (count)
19708 && !(INTVAL (count) & 3))
19711 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
19712 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
19713 if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
19714 srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
19715 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
19716 if (mode != QImode)
19718 destexp = gen_rtx_ASHIFT (Pmode, countreg,
19719 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
19720 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
19721 srcexp = gen_rtx_ASHIFT (Pmode, countreg,
19722 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
19723 srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
19727 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
19728 srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
19730 if (CONST_INT_P (count))
19732 count = GEN_INT (INTVAL (count)
19733 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
19734 destmem = shallow_copy_rtx (destmem);
19735 srcmem = shallow_copy_rtx (srcmem);
19736 set_mem_size (destmem, count);
19737 set_mem_size (srcmem, count);
19741 if (MEM_SIZE (destmem))
19742 set_mem_size (destmem, NULL_RTX);
19743 if (MEM_SIZE (srcmem))
19744 set_mem_size (srcmem, NULL_RTX);
19746 emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
19750 /* Output a "rep; stos" instruction.
19751 Arguments have the same meaning as for the previous function. */
19753 expand_setmem_via_rep_stos (rtx destmem, rtx destptr, rtx value,
19754 rtx count, enum machine_mode mode,
19760 if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
19761 destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
19762 value = force_reg (mode, gen_lowpart (mode, value));
19763 countreg = ix86_zero_extend_to_Pmode (scale_counter (count, GET_MODE_SIZE (mode)));
19764 if (mode != QImode)
19766 destexp = gen_rtx_ASHIFT (Pmode, countreg,
19767 GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
19768 destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
19771 destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
19772 if (orig_value == const0_rtx && CONST_INT_P (count))
19774 count = GEN_INT (INTVAL (count)
19775 & ~((HOST_WIDE_INT) GET_MODE_SIZE (mode) - 1));
19776 destmem = shallow_copy_rtx (destmem);
19777 set_mem_size (destmem, count);
19779 else if (MEM_SIZE (destmem))
19780 set_mem_size (destmem, NULL_RTX);
19781 emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
19785 emit_strmov (rtx destmem, rtx srcmem,
19786 rtx destptr, rtx srcptr, enum machine_mode mode, int offset)
19788 rtx src = adjust_automodify_address_nv (srcmem, mode, srcptr, offset);
19789 rtx dest = adjust_automodify_address_nv (destmem, mode, destptr, offset);
19790 emit_insn (gen_strmov (destptr, dest, srcptr, src));
19793 /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
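/* E.g. with a known COUNT, a residue of 7 bytes is copied as one SImode,
   one HImode and one QImode move, selected by the bit tests on COUNTVAL
   below (illustrative note).  */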
19795 expand_movmem_epilogue (rtx destmem, rtx srcmem,
19796 rtx destptr, rtx srcptr, rtx count, int max_size)
19799 if (CONST_INT_P (count))
19801 HOST_WIDE_INT countval = INTVAL (count);
19804 if ((countval & 0x10) && max_size > 16)
19808 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
19809 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset + 8);
19812 gcc_unreachable ();
19815 if ((countval & 0x08) && max_size > 8)
19818 emit_strmov (destmem, srcmem, destptr, srcptr, DImode, offset);
19821 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
19822 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset + 4);
19826 if ((countval & 0x04) && max_size > 4)
19828 emit_strmov (destmem, srcmem, destptr, srcptr, SImode, offset);
19831 if ((countval & 0x02) && max_size > 2)
19833 emit_strmov (destmem, srcmem, destptr, srcptr, HImode, offset);
19836 if ((countval & 0x01) && max_size > 1)
19838 emit_strmov (destmem, srcmem, destptr, srcptr, QImode, offset);
19845 count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
19846 count, 1, OPTAB_DIRECT);
19847 expand_set_or_movmem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
19848 count, QImode, 1, 4);
19852 /* When there are stringops, we can cheaply increase dest and src pointers.
19853 Otherwise we save code size by maintaining offset (zero is readily
19854 available from the preceding rep operation) and using x86 addressing modes. */
19856 if (TARGET_SINGLE_STRINGOP)
19860 rtx label = ix86_expand_aligntest (count, 4, true);
19861 src = change_address (srcmem, SImode, srcptr);
19862 dest = change_address (destmem, SImode, destptr);
19863 emit_insn (gen_strmov (destptr, dest, srcptr, src));
19864 emit_label (label);
19865 LABEL_NUSES (label) = 1;
19869 rtx label = ix86_expand_aligntest (count, 2, true);
19870 src = change_address (srcmem, HImode, srcptr);
19871 dest = change_address (destmem, HImode, destptr);
19872 emit_insn (gen_strmov (destptr, dest, srcptr, src));
19873 emit_label (label);
19874 LABEL_NUSES (label) = 1;
19878 rtx label = ix86_expand_aligntest (count, 1, true);
19879 src = change_address (srcmem, QImode, srcptr);
19880 dest = change_address (destmem, QImode, destptr);
19881 emit_insn (gen_strmov (destptr, dest, srcptr, src));
19882 emit_label (label);
19883 LABEL_NUSES (label) = 1;
19888 rtx offset = force_reg (Pmode, const0_rtx);
19893 rtx label = ix86_expand_aligntest (count, 4, true);
19894 src = change_address (srcmem, SImode, srcptr);
19895 dest = change_address (destmem, SImode, destptr);
19896 emit_move_insn (dest, src);
19897 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
19898 true, OPTAB_LIB_WIDEN);
19900 emit_move_insn (offset, tmp);
19901 emit_label (label);
19902 LABEL_NUSES (label) = 1;
19906 rtx label = ix86_expand_aligntest (count, 2, true);
19907 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
19908 src = change_address (srcmem, HImode, tmp);
19909 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
19910 dest = change_address (destmem, HImode, tmp);
19911 emit_move_insn (dest, src);
19912 tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
19913 true, OPTAB_LIB_WIDEN);
19915 emit_move_insn (offset, tmp);
19916 emit_label (label);
19917 LABEL_NUSES (label) = 1;
19921 rtx label = ix86_expand_aligntest (count, 1, true);
19922 tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
19923 src = change_address (srcmem, QImode, tmp);
19924 tmp = gen_rtx_PLUS (Pmode, destptr, offset);
19925 dest = change_address (destmem, QImode, tmp);
19926 emit_move_insn (dest, src);
19927 emit_label (label);
19928 LABEL_NUSES (label) = 1;
19933 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
19935 expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
19936 rtx count, int max_size)
19939 expand_simple_binop (counter_mode (count), AND, count,
19940 GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
19941 expand_set_or_movmem_via_loop (destmem, NULL, destptr, NULL,
19942 gen_lowpart (QImode, value), count, QImode,
19946 /* Output code to set at most count & (max_size - 1) bytes starting at DEST. */
19948 expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx count, int max_size)
19952 if (CONST_INT_P (count))
19954 HOST_WIDE_INT countval = INTVAL (count);
19957 if ((countval & 0x10) && max_size > 16)
19961 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
19962 emit_insn (gen_strset (destptr, dest, value));
19963 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset + 8);
19964 emit_insn (gen_strset (destptr, dest, value));
19967 gcc_unreachable ();
19970 if ((countval & 0x08) && max_size > 8)
19974 dest = adjust_automodify_address_nv (destmem, DImode, destptr, offset);
19975 emit_insn (gen_strset (destptr, dest, value));
19979 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
19980 emit_insn (gen_strset (destptr, dest, value));
19981 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset + 4);
19982 emit_insn (gen_strset (destptr, dest, value));
19986 if ((countval & 0x04) && max_size > 4)
19988 dest = adjust_automodify_address_nv (destmem, SImode, destptr, offset);
19989 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
19992 if ((countval & 0x02) && max_size > 2)
19994 dest = adjust_automodify_address_nv (destmem, HImode, destptr, offset);
19995 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
19998 if ((countval & 0x01) && max_size > 1)
20000 dest = adjust_automodify_address_nv (destmem, QImode, destptr, offset);
20001 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
20008 expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
20013 rtx label = ix86_expand_aligntest (count, 16, true);
20016 dest = change_address (destmem, DImode, destptr);
20017 emit_insn (gen_strset (destptr, dest, value));
20018 emit_insn (gen_strset (destptr, dest, value));
20022 dest = change_address (destmem, SImode, destptr);
20023 emit_insn (gen_strset (destptr, dest, value));
20024 emit_insn (gen_strset (destptr, dest, value));
20025 emit_insn (gen_strset (destptr, dest, value));
20026 emit_insn (gen_strset (destptr, dest, value));
20028 emit_label (label);
20029 LABEL_NUSES (label) = 1;
20033 rtx label = ix86_expand_aligntest (count, 8, true);
20036 dest = change_address (destmem, DImode, destptr);
20037 emit_insn (gen_strset (destptr, dest, value));
20041 dest = change_address (destmem, SImode, destptr);
20042 emit_insn (gen_strset (destptr, dest, value));
20043 emit_insn (gen_strset (destptr, dest, value));
20045 emit_label (label);
20046 LABEL_NUSES (label) = 1;
20050 rtx label = ix86_expand_aligntest (count, 4, true);
20051 dest = change_address (destmem, SImode, destptr);
20052 emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
20053 emit_label (label);
20054 LABEL_NUSES (label) = 1;
20058 rtx label = ix86_expand_aligntest (count, 2, true);
20059 dest = change_address (destmem, HImode, destptr);
20060 emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
20061 emit_label (label);
20062 LABEL_NUSES (label) = 1;
20066 rtx label = ix86_expand_aligntest (count, 1, true);
20067 dest = change_address (destmem, QImode, destptr);
20068 emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
20069 emit_label (label);
20070 LABEL_NUSES (label) = 1;
20074 /* Copy enough bytes from SRC to DEST to align DEST, known to be aligned
20075 by ALIGN, to DESIRED_ALIGNMENT. */
20077 expand_movmem_prologue (rtx destmem, rtx srcmem,
20078 rtx destptr, rtx srcptr, rtx count,
20079 int align, int desired_alignment)
20081 if (align <= 1 && desired_alignment > 1)
20083 rtx label = ix86_expand_aligntest (destptr, 1, false);
20084 srcmem = change_address (srcmem, QImode, srcptr);
20085 destmem = change_address (destmem, QImode, destptr);
20086 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
20087 ix86_adjust_counter (count, 1);
20088 emit_label (label);
20089 LABEL_NUSES (label) = 1;
20091 if (align <= 2 && desired_alignment > 2)
20093 rtx label = ix86_expand_aligntest (destptr, 2, false);
20094 srcmem = change_address (srcmem, HImode, srcptr);
20095 destmem = change_address (destmem, HImode, destptr);
20096 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
20097 ix86_adjust_counter (count, 2);
20098 emit_label (label);
20099 LABEL_NUSES (label) = 1;
20101 if (align <= 4 && desired_alignment > 4)
20103 rtx label = ix86_expand_aligntest (destptr, 4, false);
20104 srcmem = change_address (srcmem, SImode, srcptr);
20105 destmem = change_address (destmem, SImode, destptr);
20106 emit_insn (gen_strmov (destptr, destmem, srcptr, srcmem));
20107 ix86_adjust_counter (count, 4);
20108 emit_label (label);
20109 LABEL_NUSES (label) = 1;
20111 gcc_assert (desired_alignment <= 8);
20114 /* Copy enough bytes from SRC to DST to align DST to DESIRED_ALIGN.
20115 ALIGN_BYTES is how many bytes need to be copied. */
20117 expand_constant_movmem_prologue (rtx dst, rtx *srcp, rtx destreg, rtx srcreg,
20118 int desired_align, int align_bytes)
20121 rtx src_size, dst_size;
20123 int src_align_bytes = get_mem_align_offset (src, desired_align * BITS_PER_UNIT);
20124 if (src_align_bytes >= 0)
20125 src_align_bytes = desired_align - src_align_bytes;
20126 src_size = MEM_SIZE (src);
20127 dst_size = MEM_SIZE (dst);
20128 if (align_bytes & 1)
20130 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
20131 src = adjust_automodify_address_nv (src, QImode, srcreg, 0);
20133 emit_insn (gen_strmov (destreg, dst, srcreg, src));
20135 if (align_bytes & 2)
20137 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
20138 src = adjust_automodify_address_nv (src, HImode, srcreg, off);
20139 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
20140 set_mem_align (dst, 2 * BITS_PER_UNIT);
20141 if (src_align_bytes >= 0
20142 && (src_align_bytes & 1) == (align_bytes & 1)
20143 && MEM_ALIGN (src) < 2 * BITS_PER_UNIT)
20144 set_mem_align (src, 2 * BITS_PER_UNIT);
20146 emit_insn (gen_strmov (destreg, dst, srcreg, src));
20148 if (align_bytes & 4)
20150 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
20151 src = adjust_automodify_address_nv (src, SImode, srcreg, off);
20152 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
20153 set_mem_align (dst, 4 * BITS_PER_UNIT);
20154 if (src_align_bytes >= 0)
20156 unsigned int src_align = 0;
20157 if ((src_align_bytes & 3) == (align_bytes & 3))
20158 src_align = 4;
20159 else if ((src_align_bytes & 1) == (align_bytes & 1))
20160 src_align = 2;
20161 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
20162 set_mem_align (src, src_align * BITS_PER_UNIT);
20165 emit_insn (gen_strmov (destreg, dst, srcreg, src));
20167 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
20168 src = adjust_automodify_address_nv (src, BLKmode, srcreg, off);
20169 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
20170 set_mem_align (dst, desired_align * BITS_PER_UNIT);
20171 if (src_align_bytes >= 0)
20173 unsigned int src_align = 0;
20174 if ((src_align_bytes & 7) == (align_bytes & 7))
20175 src_align = 8;
20176 else if ((src_align_bytes & 3) == (align_bytes & 3))
20177 src_align = 4;
20178 else if ((src_align_bytes & 1) == (align_bytes & 1))
20179 src_align = 2;
20180 if (src_align > (unsigned int) desired_align)
20181 src_align = desired_align;
20182 if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
20183 set_mem_align (src, src_align * BITS_PER_UNIT);
20186 set_mem_size (dst, GEN_INT (INTVAL (dst_size) - align_bytes));
20188 set_mem_size (src, GEN_INT (INTVAL (src_size) - align_bytes));
20193 /* Set enough bytes at DEST to align DEST, known to be aligned
20194 by ALIGN, to DESIRED_ALIGNMENT. */
20196 expand_setmem_prologue (rtx destmem, rtx destptr, rtx value, rtx count,
20197 int align, int desired_alignment)
20199 if (align <= 1 && desired_alignment > 1)
20201 rtx label = ix86_expand_aligntest (destptr, 1, false);
20202 destmem = change_address (destmem, QImode, destptr);
20203 emit_insn (gen_strset (destptr, destmem, gen_lowpart (QImode, value)));
20204 ix86_adjust_counter (count, 1);
20205 emit_label (label);
20206 LABEL_NUSES (label) = 1;
20208 if (align <= 2 && desired_alignment > 2)
20210 rtx label = ix86_expand_aligntest (destptr, 2, false);
20211 destmem = change_address (destmem, HImode, destptr);
20212 emit_insn (gen_strset (destptr, destmem, gen_lowpart (HImode, value)));
20213 ix86_adjust_counter (count, 2);
20214 emit_label (label);
20215 LABEL_NUSES (label) = 1;
20217 if (align <= 4 && desired_alignment > 4)
20219 rtx label = ix86_expand_aligntest (destptr, 4, false);
20220 destmem = change_address (destmem, SImode, destptr);
20221 emit_insn (gen_strset (destptr, destmem, gen_lowpart (SImode, value)));
20222 ix86_adjust_counter (count, 4);
20223 emit_label (label);
20224 LABEL_NUSES (label) = 1;
20226 gcc_assert (desired_alignment <= 8);
20229 /* Set enough bytes at DST to align DST, known to be aligned by ALIGN,
20230 to DESIRED_ALIGN. ALIGN_BYTES is how many bytes need to be stored. */
20232 expand_constant_setmem_prologue (rtx dst, rtx destreg, rtx value,
20233 int desired_align, int align_bytes)
20236 rtx dst_size = MEM_SIZE (dst);
20237 if (align_bytes & 1)
20239 dst = adjust_automodify_address_nv (dst, QImode, destreg, 0);
20241 emit_insn (gen_strset (destreg, dst,
20242 gen_lowpart (QImode, value)));
20244 if (align_bytes & 2)
20246 dst = adjust_automodify_address_nv (dst, HImode, destreg, off);
20247 if (MEM_ALIGN (dst) < 2 * BITS_PER_UNIT)
20248 set_mem_align (dst, 2 * BITS_PER_UNIT);
20250 emit_insn (gen_strset (destreg, dst,
20251 gen_lowpart (HImode, value)));
20253 if (align_bytes & 4)
20255 dst = adjust_automodify_address_nv (dst, SImode, destreg, off);
20256 if (MEM_ALIGN (dst) < 4 * BITS_PER_UNIT)
20257 set_mem_align (dst, 4 * BITS_PER_UNIT);
20259 emit_insn (gen_strset (destreg, dst,
20260 gen_lowpart (SImode, value)));
20262 dst = adjust_automodify_address_nv (dst, BLKmode, destreg, off);
20263 if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
20264 set_mem_align (dst, desired_align * BITS_PER_UNIT);
20266 set_mem_size (dst, GEN_INT (INTVAL (dst_size) - align_bytes));
20270 /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. */
20271 static enum stringop_alg
20272 decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, bool memset,
20273 int *dynamic_check)
20275 const struct stringop_algs * algs;
20276 bool optimize_for_speed;
20277 /* Algorithms using the rep prefix want at least edi and ecx;
20278 additionally, memset wants eax and memcpy wants esi. Don't
20279 consider such algorithms if the user has appropriated those
20280 registers for their own purposes. */
20281 bool rep_prefix_usable = !(fixed_regs[CX_REG] || fixed_regs[DI_REG]
20283 ? fixed_regs[AX_REG] : fixed_regs[SI_REG]));
20285 #define ALG_USABLE_P(alg) (rep_prefix_usable \
20286 || (alg != rep_prefix_1_byte \
20287 && alg != rep_prefix_4_byte \
20288 && alg != rep_prefix_8_byte))
20289 const struct processor_costs *cost;
20291 /* Even if the string operation call is cold, we still might spend a lot
20292 of time processing large blocks. */
20293 if (optimize_function_for_size_p (cfun)
20294 || (optimize_insn_for_size_p ()
20295 && expected_size != -1 && expected_size < 256))
20296 optimize_for_speed = false;
20297 else
20298 optimize_for_speed = true;
20300 cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
20302 *dynamic_check = -1;
20303 if (memset)
20304 algs = &cost->memset[TARGET_64BIT != 0];
20305 else
20306 algs = &cost->memcpy[TARGET_64BIT != 0];
20307 if (ix86_stringop_alg != no_stringop && ALG_USABLE_P (ix86_stringop_alg))
20308 return ix86_stringop_alg;
20309 /* rep; movq or rep; movl is the smallest variant. */
20310 else if (!optimize_for_speed)
20312 if (!count || (count & 3))
20313 return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
20314 else
20315 return rep_prefix_usable ? rep_prefix_4_byte : loop;
20317 /* Very tiny blocks are best handled via the loop; REP is expensive to
20318 set up. */
20319 else if (expected_size != -1 && expected_size < 4)
20320 return loop_1_byte;
20321 else if (expected_size != -1)
20324 enum stringop_alg alg = libcall;
20325 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
20327 /* We get here if the algorithms that were not libcall-based
20328 were rep-prefix based and we are unable to use rep prefixes
20329 based on global register usage. Break out of the loop and
20330 use the heuristic below. */
20331 if (algs->size[i].max == 0)
20333 if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
20335 enum stringop_alg candidate = algs->size[i].alg;
20337 if (candidate != libcall && ALG_USABLE_P (candidate))
20339 /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
20340 last non-libcall inline algorithm. */
20341 if (TARGET_INLINE_ALL_STRINGOPS)
20343 /* When the current size is best copied by a libcall,
20344 but we are still forced to inline, run the heuristic below
20345 that will pick code for medium-sized blocks. */
20346 if (alg != libcall)
20350 else if (ALG_USABLE_P (candidate))
20354 gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
20356 /* When asked to inline the call anyway, try to pick a meaningful choice.
20357 We look for the maximal size of block that is faster to copy by hand
20358 and take blocks of at most that size, guessing that the average size
20359 will be roughly half of the block.

20361 If this turns out to be bad, we might simply specify the preferred
20362 choice in ix86_costs. */
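/* For example (illustrative): if the largest non-libcall entry in the
   cost table has max == 256, we recurse with expected_size == 128 and let
   the table pick the algorithm for a typical block of that size.  */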
20363 if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
20364 && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
20367 enum stringop_alg alg;
20369 bool any_alg_usable_p = true;
20371 for (i = 0; i < MAX_STRINGOP_ALGS; i++)
20373 enum stringop_alg candidate = algs->size[i].alg;
20374 any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
20376 if (candidate != libcall && candidate
20377 && ALG_USABLE_P (candidate))
20378 max = algs->size[i].max;
20380 /* If there aren't any usable algorithms, then recursing on
20381 smaller sizes isn't going to find anything. Just return the
20382 simple byte-at-a-time copy loop. */
20383 if (!any_alg_usable_p)
20385 /* Pick something reasonable. */
20386 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
20387 *dynamic_check = 128;
20388 return loop_1_byte;
20392 alg = decide_alg (count, max / 2, memset, dynamic_check);
20393 gcc_assert (*dynamic_check == -1);
20394 gcc_assert (alg != libcall);
20395 if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
20396 *dynamic_check = max;
20399 return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
20400 #undef ALG_USABLE_P
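/* For illustration (a hypothetical stringop_algs entry, not taken from any
   real cost table): with memcpy sizes
     {{24, loop_1_byte}, {128, rep_prefix_4_byte}, {-1, libcall}},
   an expected_size of 100 selects rep_prefix_4_byte, while an unknown size
   falls back to algs->unknown_size (or to the recursion above when inlining
   is forced and unknown_size would be a libcall).  */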
20403 /* Decide on alignment. We know that the operand is already aligned to ALIGN
20404 (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
20406 decide_alignment (int align,
20407 enum stringop_alg alg,
20410 int desired_align = 0;
20414 gcc_unreachable ();
20416 case unrolled_loop:
20417 desired_align = GET_MODE_SIZE (Pmode);
20418 break;
20419 case rep_prefix_8_byte:
20420 desired_align = 8;
20421 break;
20422 case rep_prefix_4_byte:
20423 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
20424 copying whole cachelines at once. */
20425 if (TARGET_PENTIUMPRO)
20426 desired_align = 8;
20427 else
20428 desired_align = 4;
20429 break;
20430 case rep_prefix_1_byte:
20431 /* PentiumPro has special logic triggering for 8 byte aligned blocks,
20432 copying whole cachelines at once. */
20433 if (TARGET_PENTIUMPRO)
20434 desired_align = 8;
20435 else
20436 desired_align = 1;
20437 break;
20447 if (desired_align < align)
20448 desired_align = align;
20449 if (expected_size != -1 && expected_size < 4)
20450 desired_align = align;
20451 return desired_align;
20454 /* Return the smallest power of 2 greater than VAL. */
20455 static int
20456 smallest_pow2_greater_than (int val)
20457 {
20458 int ret = 1;
20459 while (ret <= val)
20460 ret <<= 1;
20461 return ret;
20462 }
20464 /* Expand string move (memcpy) operation. Use i386 string operations
20465 when profitable. expand_setmem contains similar code. The code
20466 depends upon architecture, block size and alignment, but always has
20467 the same overall structure:
20469 1) Prologue guard: a conditional that jumps up to the epilogue for small
20470 blocks that can be handled by the epilogue alone. This is faster
20471 but also needed for correctness, since the prologue assumes the block
20472 is larger than the desired alignment.
20474 Optional dynamic check for size and libcall for large
20475 blocks is emitted here too, with -minline-stringops-dynamically.
20477 2) Prologue: copy first few bytes in order to get destination
20478 aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
20479 than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
20480 copied. We emit either a jump tree on power of two sized
20481 blocks, or a byte loop.
20483 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
20484 with specified algorithm.
20486 4) Epilogue: code copying tail of the block that is too small to be
20487 handled by main body (or up to size guarded by prologue guard). */
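/* As a concrete illustration (a sketch of the emitted shape, not verbatim
   output), with alg == rep_prefix_4_byte and desired_align == 4 on ia32:

       cmpl    $4, %ecx                ; 1) prologue guard: small blocks
       jb      .Lepilogue              ;    go straight to the epilogue
       ...copy 0-3 bytes...            ; 2) align the destination
       ...%ecx = count / 4...          ; 3) main body
       rep movsl
   .Lepilogue:
       ...copy count & 3 bytes...      ; 4) epilogue
*/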
20490 ix86_expand_movmem (rtx dst, rtx src, rtx count_exp, rtx align_exp,
20491 rtx expected_align_exp, rtx expected_size_exp)
20497 rtx jump_around_label = NULL;
20498 HOST_WIDE_INT align = 1;
20499 unsigned HOST_WIDE_INT count = 0;
20500 HOST_WIDE_INT expected_size = -1;
20501 int size_needed = 0, epilogue_size_needed;
20502 int desired_align = 0, align_bytes = 0;
20503 enum stringop_alg alg;
20505 bool need_zero_guard = false;
20507 if (CONST_INT_P (align_exp))
20508 align = INTVAL (align_exp);
20509 /* i386 can do misaligned access at a reasonably increased cost. */
20510 if (CONST_INT_P (expected_align_exp)
20511 && INTVAL (expected_align_exp) > align)
20512 align = INTVAL (expected_align_exp);
20513 /* ALIGN is the minimum of destination and source alignment, but we care here
20514 just about destination alignment. */
20515 else if (MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
20516 align = MEM_ALIGN (dst) / BITS_PER_UNIT;
20518 if (CONST_INT_P (count_exp))
20519 count = expected_size = INTVAL (count_exp);
20520 if (CONST_INT_P (expected_size_exp) && count == 0)
20521 expected_size = INTVAL (expected_size_exp);
20523 /* Make sure we don't need to care about overflow later on. */
20524 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
20527 /* Step 0: Decide on preferred algorithm, desired alignment and
20528 size of chunks to be copied by main loop. */
20530 alg = decide_alg (count, expected_size, false, &dynamic_check);
20531 desired_align = decide_alignment (align, alg, expected_size);
20533 if (!TARGET_ALIGN_STRINGOPS)
20534 align = desired_align;
20536 if (alg == libcall)
20537 return false;
20538 gcc_assert (alg != no_stringop);
20540 count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
20541 destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
20542 srcreg = copy_to_mode_reg (Pmode, XEXP (src, 0));
20547 gcc_unreachable ();
20549 need_zero_guard = true;
20550 size_needed = GET_MODE_SIZE (Pmode);
20552 case unrolled_loop:
20553 need_zero_guard = true;
20554 size_needed = GET_MODE_SIZE (Pmode) * (TARGET_64BIT ? 4 : 2);
20556 case rep_prefix_8_byte:
20559 case rep_prefix_4_byte:
20562 case rep_prefix_1_byte:
20566 need_zero_guard = true;
20571 epilogue_size_needed = size_needed;
20573 /* Step 1: Prologue guard. */
20575 /* Alignment code needs count to be in register. */
20576 if (CONST_INT_P (count_exp) && desired_align > align)
20578 if (INTVAL (count_exp) > desired_align
20579 && INTVAL (count_exp) > size_needed)
20582 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
20583 if (align_bytes <= 0)
20584 align_bytes = 0;
20585 else
20586 align_bytes = desired_align - align_bytes;
20588 if (align_bytes == 0)
20589 count_exp = force_reg (counter_mode (count_exp), count_exp);
20591 gcc_assert (desired_align >= 1 && align >= 1);
20593 /* Ensure that alignment prologue won't copy past end of block. */
20594 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
20596 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
20597 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
20598 Make sure it is a power of 2. */
20599 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
20603 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
20605 /* If the main algorithm works on QImode, no epilogue is needed.
20606 For small sizes just don't align anything. */
20607 if (size_needed == 1)
20608 desired_align = align;
20615 label = gen_label_rtx ();
20616 emit_cmp_and_jump_insns (count_exp,
20617 GEN_INT (epilogue_size_needed),
20618 LTU, 0, counter_mode (count_exp), 1, label);
20619 if (expected_size == -1 || expected_size < epilogue_size_needed)
20620 predict_jump (REG_BR_PROB_BASE * 60 / 100);
20622 predict_jump (REG_BR_PROB_BASE * 20 / 100);
20626 /* Emit code to decide at runtime whether library call or inline should
20627 be used. */
20628 if (dynamic_check != -1)
20630 if (CONST_INT_P (count_exp))
20632 if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
20634 emit_block_move_via_libcall (dst, src, count_exp, false);
20635 count_exp = const0_rtx;
20641 rtx hot_label = gen_label_rtx ();
20642 jump_around_label = gen_label_rtx ();
20643 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
20644 LEU, 0, GET_MODE (count_exp), 1, hot_label);
20645 predict_jump (REG_BR_PROB_BASE * 90 / 100);
20646 emit_block_move_via_libcall (dst, src, count_exp, false);
20647 emit_jump (jump_around_label);
20648 emit_label (hot_label);
20652 /* Step 2: Alignment prologue. */
20654 if (desired_align > align)
20656 if (align_bytes == 0)
20658 /* Except for the first move in epilogue, we no longer know
20659 constant offset in aliasing info. It doesn't seem worth the pain
20660 to maintain it for the first move, so throw away the info early. */
20662 src = change_address (src, BLKmode, srcreg);
20663 dst = change_address (dst, BLKmode, destreg);
20664 expand_movmem_prologue (dst, src, destreg, srcreg, count_exp, align,
20669 /* If we know how many bytes need to be stored before dst is
20670 sufficiently aligned, maintain aliasing info accurately. */
20671 dst = expand_constant_movmem_prologue (dst, &src, destreg, srcreg,
20672 desired_align, align_bytes);
20673 count_exp = plus_constant (count_exp, -align_bytes);
20674 count -= align_bytes;
20676 if (need_zero_guard
20677 && (count < (unsigned HOST_WIDE_INT) size_needed
20678 || (align_bytes == 0
20679 && count < ((unsigned HOST_WIDE_INT) size_needed
20680 + desired_align - align))))
20682 /* It is possible that we copied enough so the main loop will not
20683 execute. */
20684 gcc_assert (size_needed > 1);
20685 if (label == NULL_RTX)
20686 label = gen_label_rtx ();
20687 emit_cmp_and_jump_insns (count_exp,
20688 GEN_INT (size_needed),
20689 LTU, 0, counter_mode (count_exp), 1, label);
20690 if (expected_size == -1
20691 || expected_size < (desired_align - align) / 2 + size_needed)
20692 predict_jump (REG_BR_PROB_BASE * 20 / 100);
20694 predict_jump (REG_BR_PROB_BASE * 60 / 100);
20697 if (label && size_needed == 1)
20699 emit_label (label);
20700 LABEL_NUSES (label) = 1;
20702 epilogue_size_needed = 1;
20704 else if (label == NULL_RTX)
20705 epilogue_size_needed = size_needed;
20707 /* Step 3: Main loop. */
20713 gcc_unreachable ();
20715 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
20716 count_exp, QImode, 1, expected_size);
20719 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
20720 count_exp, Pmode, 1, expected_size);
20722 case unrolled_loop:
20723 /* Unroll only by a factor of 2 in 32bit mode, since we don't have
20724 enough registers for 4 temporaries anyway. */
20725 expand_set_or_movmem_via_loop (dst, src, destreg, srcreg, NULL,
20726 count_exp, Pmode, TARGET_64BIT ? 4 : 2,
20729 case rep_prefix_8_byte:
20730 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
20733 case rep_prefix_4_byte:
20734 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
20737 case rep_prefix_1_byte:
20738 expand_movmem_via_rep_mov (dst, src, destreg, srcreg, count_exp,
20742 /* Properly adjust the offsets of src and dest memory for aliasing. */
20743 if (CONST_INT_P (count_exp))
20745 src = adjust_automodify_address_nv (src, BLKmode, srcreg,
20746 (count / size_needed) * size_needed);
20747 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
20748 (count / size_needed) * size_needed);
20752 src = change_address (src, BLKmode, srcreg);
20753 dst = change_address (dst, BLKmode, destreg);
20756 /* Step 4: Epilogue to copy the remaining bytes. */
20760 /* When the main loop is done, COUNT_EXP might hold the original count,
20761 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
20762 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
20763 bytes. Compensate if needed. */
20765 if (size_needed < epilogue_size_needed)
20768 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
20769 GEN_INT (size_needed - 1), count_exp, 1,
20771 if (tmp != count_exp)
20772 emit_move_insn (count_exp, tmp);
20774 emit_label (label);
20775 LABEL_NUSES (label) = 1;
20778 if (count_exp != const0_rtx && epilogue_size_needed > 1)
20779 expand_movmem_epilogue (dst, src, destreg, srcreg, count_exp,
20780 epilogue_size_needed);
20781 if (jump_around_label)
20782 emit_label (jump_around_label);
20786 /* Helper function for memset. For the QImode value 0xXY produce
20787 0xXYXYXYXY of the width specified by MODE. This is essentially
20788 a * 0x01010101, but we can do slightly better than
20789 synth_mult by unwinding the sequence by hand on CPUs with
20790 slow multiply. */
20791 static rtx
20792 promote_duplicated_reg (enum machine_mode mode, rtx val)
20794 enum machine_mode valmode = GET_MODE (val);
20796 int nops = mode == DImode ? 3 : 2;
20798 gcc_assert (mode == SImode || mode == DImode);
20799 if (val == const0_rtx)
20800 return copy_to_mode_reg (mode, const0_rtx);
20801 if (CONST_INT_P (val))
20803 HOST_WIDE_INT v = INTVAL (val) & 255;

20805 v |= v << 8;
20806 v |= v << 16;
20807 if (mode == DImode)
20808 v |= (v << 16) << 16;
20809 return copy_to_mode_reg (mode, gen_int_mode (v, mode));
20812 if (valmode == VOIDmode)
20813 valmode = QImode;
20814 if (valmode != QImode)
20815 val = gen_lowpart (QImode, val);
20816 if (mode == QImode)
20817 return val;

20819 if (!TARGET_PARTIAL_REG_STALL)
20820 nops--;
20820 if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
20821 + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
20822 <= (ix86_cost->shift_const + ix86_cost->add) * nops
20823 + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
20825 rtx reg = convert_modes (mode, QImode, val, true);
20826 tmp = promote_duplicated_reg (mode, const1_rtx);
20827 return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
20832 rtx reg = convert_modes (mode, QImode, val, true);
20834 if (!TARGET_PARTIAL_REG_STALL)
20835 if (mode == SImode)
20836 emit_insn (gen_movsi_insv_1 (reg, reg));
20838 emit_insn (gen_movdi_insv_1 (reg, reg));
20841 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
20842 NULL, 1, OPTAB_DIRECT);
20844 expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
20846 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
20847 NULL, 1, OPTAB_DIRECT);
20848 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
20849 if (mode == SImode)
20850 return reg;
20851 tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
20852 NULL, 1, OPTAB_DIRECT);
20853 reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
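/* A minimal host-side sketch (plain C, for documentation; the helper name
   is ours, not a GCC entry point) of the shift/or identity the code above
   emits as RTL: duplicate the low byte across an SImode-sized value.  */
static unsigned int
promote_duplicated_byte_si_sketch (unsigned char byte)
{
  unsigned int v = byte;        /* 0x000000XY */
  v |= v << 8;                  /* 0x0000XYXY */
  v |= v << 16;                 /* 0xXYXYXYXY == 0xXY * 0x01010101 */
  return v;
}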
20858 /* Duplicate value VAL using promote_duplicated_reg into the maximal size
20859 needed by the main loop copying SIZE_NEEDED chunks and by the prologue
20860 getting alignment from ALIGN to DESIRED_ALIGN. */
20862 promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align, int align)
20867 && (size_needed > 4 || (desired_align > align && desired_align > 4)))
20868 promoted_val = promote_duplicated_reg (DImode, val);
20869 else if (size_needed > 2 || (desired_align > align && desired_align > 2))
20870 promoted_val = promote_duplicated_reg (SImode, val);
20871 else if (size_needed > 1 || (desired_align > align && desired_align > 1))
20872 promoted_val = promote_duplicated_reg (HImode, val);
20874 promoted_val = val;
20876 return promoted_val;
20879 /* Expand string clear operation (bzero). Use i386 string operations
20880 when profitable. See the ix86_expand_movmem comment for an explanation
20881 of the individual steps performed. */
20883 ix86_expand_setmem (rtx dst, rtx count_exp, rtx val_exp, rtx align_exp,
20884 rtx expected_align_exp, rtx expected_size_exp)
20889 rtx jump_around_label = NULL;
20890 HOST_WIDE_INT align = 1;
20891 unsigned HOST_WIDE_INT count = 0;
20892 HOST_WIDE_INT expected_size = -1;
20893 int size_needed = 0, epilogue_size_needed;
20894 int desired_align = 0, align_bytes = 0;
20895 enum stringop_alg alg;
20896 rtx promoted_val = NULL;
20897 bool force_loopy_epilogue = false;
20899 bool need_zero_guard = false;
20901 if (CONST_INT_P (align_exp))
20902 align = INTVAL (align_exp);
20903 /* i386 can do misaligned access at a reasonably increased cost. */
20904 if (CONST_INT_P (expected_align_exp)
20905 && INTVAL (expected_align_exp) > align)
20906 align = INTVAL (expected_align_exp);
20907 if (CONST_INT_P (count_exp))
20908 count = expected_size = INTVAL (count_exp);
20909 if (CONST_INT_P (expected_size_exp) && count == 0)
20910 expected_size = INTVAL (expected_size_exp);
20912 /* Make sure we don't need to care about overflow later on. */
20913 if (count > ((unsigned HOST_WIDE_INT) 1 << 30))
20916 /* Step 0: Decide on preferred algorithm, desired alignment and
20917 size of chunks to be copied by main loop. */
20919 alg = decide_alg (count, expected_size, true, &dynamic_check);
20920 desired_align = decide_alignment (align, alg, expected_size);
20922 if (!TARGET_ALIGN_STRINGOPS)
20923 align = desired_align;
20925 if (alg == libcall)
20926 return false;
20927 gcc_assert (alg != no_stringop);
20929 count_exp = copy_to_mode_reg (counter_mode (count_exp), count_exp);
20930 destreg = copy_to_mode_reg (Pmode, XEXP (dst, 0));
20935 gcc_unreachable ();
20937 need_zero_guard = true;
20938 size_needed = GET_MODE_SIZE (Pmode);
20940 case unrolled_loop:
20941 need_zero_guard = true;
20942 size_needed = GET_MODE_SIZE (Pmode) * 4;
20944 case rep_prefix_8_byte:
20947 case rep_prefix_4_byte:
20950 case rep_prefix_1_byte:
20954 need_zero_guard = true;
20958 epilogue_size_needed = size_needed;
20960 /* Step 1: Prologue guard. */
20962 /* Alignment code needs count to be in register. */
20963 if (CONST_INT_P (count_exp) && desired_align > align)
20965 if (INTVAL (count_exp) > desired_align
20966 && INTVAL (count_exp) > size_needed)
20969 = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
20970 if (align_bytes <= 0)
20971 align_bytes = 0;
20972 else
20973 align_bytes = desired_align - align_bytes;
20975 if (align_bytes == 0)
20977 enum machine_mode mode = SImode;
20978 if (TARGET_64BIT && (count & ~0xffffffff))
20979 mode = DImode;
20980 count_exp = force_reg (mode, count_exp);
20983 /* Do the cheap promotion to allow better CSE across the
20984 main loop and epilogue (i.e. one load of the big constant in
20985 front of all code). */
20986 if (CONST_INT_P (val_exp))
20987 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
20988 desired_align, align);
20989 /* Ensure that alignment prologue won't copy past end of block. */
20990 if (size_needed > 1 || (desired_align > 1 && desired_align > align))
20992 epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
20993 /* Epilogue always copies COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1) bytes.
20994 Make sure it is a power of 2. */
20995 epilogue_size_needed = smallest_pow2_greater_than (epilogue_size_needed);
20997 /* To improve performance of small blocks, we jump around the VAL
20998 promoting code. This means that if the promoted VAL is not constant,
20999 we might not use it in the epilogue and have to use the byte
21000 loop variant. */
21001 if (epilogue_size_needed > 2 && !promoted_val)
21002 force_loopy_epilogue = true;
21005 if (count < (unsigned HOST_WIDE_INT)epilogue_size_needed)
21007 /* If the main algorithm works on QImode, no epilogue is needed.
21008 For small sizes just don't align anything. */
21009 if (size_needed == 1)
21010 desired_align = align;
21017 label = gen_label_rtx ();
21018 emit_cmp_and_jump_insns (count_exp,
21019 GEN_INT (epilogue_size_needed),
21020 LTU, 0, counter_mode (count_exp), 1, label);
21021 if (expected_size == -1 || expected_size <= epilogue_size_needed)
21022 predict_jump (REG_BR_PROB_BASE * 60 / 100);
21024 predict_jump (REG_BR_PROB_BASE * 20 / 100);
21027 if (dynamic_check != -1)
21029 rtx hot_label = gen_label_rtx ();
21030 jump_around_label = gen_label_rtx ();
21031 emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
21032 LEU, 0, counter_mode (count_exp), 1, hot_label);
21033 predict_jump (REG_BR_PROB_BASE * 90 / 100);
21034 set_storage_via_libcall (dst, count_exp, val_exp, false);
21035 emit_jump (jump_around_label);
21036 emit_label (hot_label);
21039 /* Step 2: Alignment prologue. */
21041 /* Do the expensive promotion once we have branched off the small blocks. */
21043 promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
21044 desired_align, align);
21045 gcc_assert (desired_align >= 1 && align >= 1);
21047 if (desired_align > align)
21049 if (align_bytes == 0)
21051 /* Except for the first move in epilogue, we no longer know
21052 constant offset in aliasing info. It doesn't seem worth the pain
21053 to maintain it for the first move, so throw away the info early. */
21055 dst = change_address (dst, BLKmode, destreg);
21056 expand_setmem_prologue (dst, destreg, promoted_val, count_exp, align,
21061 /* If we know how many bytes need to be stored before dst is
21062 sufficiently aligned, maintain aliasing info accurately. */
21063 dst = expand_constant_setmem_prologue (dst, destreg, promoted_val,
21064 desired_align, align_bytes);
21065 count_exp = plus_constant (count_exp, -align_bytes);
21066 count -= align_bytes;
21068 if (need_zero_guard
21069 && (count < (unsigned HOST_WIDE_INT) size_needed
21070 || (align_bytes == 0
21071 && count < ((unsigned HOST_WIDE_INT) size_needed
21072 + desired_align - align))))
21074 /* It is possible that we copied enough so the main loop will not
21075 execute. */
21076 gcc_assert (size_needed > 1);
21077 if (label == NULL_RTX)
21078 label = gen_label_rtx ();
21079 emit_cmp_and_jump_insns (count_exp,
21080 GEN_INT (size_needed),
21081 LTU, 0, counter_mode (count_exp), 1, label);
21082 if (expected_size == -1
21083 || expected_size < (desired_align - align) / 2 + size_needed)
21084 predict_jump (REG_BR_PROB_BASE * 20 / 100);
21086 predict_jump (REG_BR_PROB_BASE * 60 / 100);
21089 if (label && size_needed == 1)
21091 emit_label (label);
21092 LABEL_NUSES (label) = 1;
21094 promoted_val = val_exp;
21095 epilogue_size_needed = 1;
21097 else if (label == NULL_RTX)
21098 epilogue_size_needed = size_needed;
21100 /* Step 3: Main loop. */
21106 gcc_unreachable ();
21108 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
21109 count_exp, QImode, 1, expected_size);
21112 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
21113 count_exp, Pmode, 1, expected_size);
21115 case unrolled_loop:
21116 expand_set_or_movmem_via_loop (dst, NULL, destreg, NULL, promoted_val,
21117 count_exp, Pmode, 4, expected_size);
21119 case rep_prefix_8_byte:
21120 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
21123 case rep_prefix_4_byte:
21124 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
21127 case rep_prefix_1_byte:
21128 expand_setmem_via_rep_stos (dst, destreg, promoted_val, count_exp,
21132 /* Properly adjust the offset of the dest memory for aliasing. */
21133 if (CONST_INT_P (count_exp))
21134 dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
21135 (count / size_needed) * size_needed);
21137 dst = change_address (dst, BLKmode, destreg);
21139 /* Step 4: Epilogue to copy the remaining bytes. */
21143 /* When the main loop is done, COUNT_EXP might hold the original count,
21144 while we want to copy only COUNT_EXP & (SIZE_NEEDED - 1) bytes.
21145 Epilogue code will actually copy COUNT_EXP & (EPILOGUE_SIZE_NEEDED - 1)
21146 bytes. Compensate if needed. */
21148 if (size_needed < epilogue_size_needed)
21151 expand_simple_binop (counter_mode (count_exp), AND, count_exp,
21152 GEN_INT (size_needed - 1), count_exp, 1,
21154 if (tmp != count_exp)
21155 emit_move_insn (count_exp, tmp);
21157 emit_label (label);
21158 LABEL_NUSES (label) = 1;
21161 if (count_exp != const0_rtx && epilogue_size_needed > 1)
21163 if (force_loopy_epilogue)
21164 expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
21165 epilogue_size_needed);
21167 expand_setmem_epilogue (dst, destreg, promoted_val, count_exp,
21168 epilogue_size_needed);
21170 if (jump_around_label)
21171 emit_label (jump_around_label);
21175 /* Expand the appropriate insns for doing strlen if not just doing
21176 repnz; scasb
21178 out = result, initialized with the start address
21179 align_rtx = alignment of the address.
21180 scratch = scratch register, initialized with the start address when
21181 not aligned, otherwise undefined
21183 This is just the body. It needs the initializations mentioned above and
21184 some address computing at the end. These things are done in i386.md. */
21187 ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
21191 rtx align_2_label = NULL_RTX;
21192 rtx align_3_label = NULL_RTX;
21193 rtx align_4_label = gen_label_rtx ();
21194 rtx end_0_label = gen_label_rtx ();
21196 rtx tmpreg = gen_reg_rtx (SImode);
21197 rtx scratch = gen_reg_rtx (SImode);
21201 if (CONST_INT_P (align_rtx))
21202 align = INTVAL (align_rtx);
21204 /* Loop to check 1..3 bytes for null to get an aligned pointer. */
21206 /* Is there a known alignment and is it less than 4? */
21209 rtx scratch1 = gen_reg_rtx (Pmode);
21210 emit_move_insn (scratch1, out);
21211 /* Is there a known alignment and is it not 2? */
21214 align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
21215 align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
21217 /* Leave just the 3 lower bits. */
21218 align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
21219 NULL_RTX, 0, OPTAB_WIDEN);
21221 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
21222 Pmode, 1, align_4_label);
21223 emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
21224 Pmode, 1, align_2_label);
21225 emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
21226 Pmode, 1, align_3_label);
21230 /* Since the alignment is 2, we have to check 2 or 0 bytes;
21231 check if it is aligned to 4 bytes. */
21233 align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
21234 NULL_RTX, 0, OPTAB_WIDEN);
21236 emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
21237 Pmode, 1, align_4_label);
21240 mem = change_address (src, QImode, out);
21242 /* Now compare the bytes. */
21244 /* Compare the first n unaligned bytes on a byte-by-byte basis. */
21245 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
21246 QImode, 1, end_0_label);
21248 /* Increment the address. */
21249 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
21251 /* Not needed with an alignment of 2 */
21254 emit_label (align_2_label);
21256 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
21259 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
21261 emit_label (align_3_label);
21264 emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
21267 emit_insn (ix86_gen_add3 (out, out, const1_rtx));
21270 /* Generate a loop to check 4 bytes at a time. It is not a good idea
21271 to align this loop: it only enlarges the code and does not help
21272 speed. */
21273 emit_label (align_4_label);
21275 mem = change_address (src, SImode, out);
21276 emit_move_insn (scratch, mem);
21277 emit_insn (ix86_gen_add3 (out, out, GEN_INT (4)));
21279 /* This formula yields a nonzero result iff one of the bytes is zero.
21280 This saves three branches inside the loop and many cycles. */
21282 emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
21283 emit_insn (gen_one_cmplsi2 (scratch, scratch));
21284 emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
21285 emit_insn (gen_andsi3 (tmpreg, tmpreg,
21286 gen_int_mode (0x80808080, SImode)));
21287 emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
21292 rtx reg = gen_reg_rtx (SImode);
21293 rtx reg2 = gen_reg_rtx (Pmode);
21294 emit_move_insn (reg, tmpreg);
21295 emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
21297 /* If zero is not in the first two bytes, move two bytes forward. */
21298 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
21299 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
21300 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
21301 emit_insn (gen_rtx_SET (VOIDmode, tmpreg,
21302 gen_rtx_IF_THEN_ELSE (SImode, tmp,
21305 /* Emit lea manually to avoid clobbering of flags. */
21306 emit_insn (gen_rtx_SET (SImode, reg2,
21307 gen_rtx_PLUS (Pmode, out, const2_rtx)));
21309 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
21310 tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
21311 emit_insn (gen_rtx_SET (VOIDmode, out,
21312 gen_rtx_IF_THEN_ELSE (Pmode, tmp,
21318 rtx end_2_label = gen_label_rtx ();
21319 /* Is zero in the first two bytes? */
21321 emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
21322 tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
21323 tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
21324 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
21325 gen_rtx_LABEL_REF (VOIDmode, end_2_label),
21327 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
21328 JUMP_LABEL (tmp) = end_2_label;
21330 /* Not in the first two. Move two bytes forward. */
21331 emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
21332 emit_insn (ix86_gen_add3 (out, out, const2_rtx));
21334 emit_label (end_2_label);
21338 /* Avoid branch in fixing the byte. */
21339 tmpreg = gen_lowpart (QImode, tmpreg);
21340 emit_insn (gen_addqi3_cc (tmpreg, tmpreg, tmpreg));
21341 tmp = gen_rtx_REG (CCmode, FLAGS_REG);
21342 cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
21343 emit_insn (ix86_gen_sub3_carry (out, out, GEN_INT (3), tmp, cmp));
21345 emit_label (end_0_label);
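/* A plain-C sketch of the zero-byte test generated above (for documentation
   only; the helper name is ours): nonzero iff some byte of X is zero.
   E.g. for x == 0x41420043, (x - 0x01010101) & ~x & 0x80808080 yields
   0x00008000, flagging the zero in byte 1.  */
static int
has_zero_byte_sketch (unsigned int x)
{
  return ((x - 0x01010101U) & ~x & 0x80808080U) != 0;
}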
21348 /* Expand strlen. */
21351 ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
21353 rtx addr, scratch1, scratch2, scratch3, scratch4;
21355 /* The generic case of the strlen expander is long. Avoid expanding it
21356 unless TARGET_INLINE_ALL_STRINGOPS. */
21358 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
21359 && !TARGET_INLINE_ALL_STRINGOPS
21360 && !optimize_insn_for_size_p ()
21361 && (!CONST_INT_P (align) || INTVAL (align) < 4))
21364 addr = force_reg (Pmode, XEXP (src, 0));
21365 scratch1 = gen_reg_rtx (Pmode);
21367 if (TARGET_UNROLL_STRLEN && eoschar == const0_rtx && optimize > 1
21368 && !optimize_insn_for_size_p ())
21370 /* Well it seems that some optimizer does not combine a call like
21371 foo(strlen(bar), strlen(bar));
21372 when the move and the subtraction are done here. It does calculate
21373 the length just once when these instructions are done inside of
21374 output_strlen_unroll(). But I think since &bar[strlen(bar)] is
21375 often used and I use one fewer register for the lifetime of
21376 output_strlen_unroll() this is better. */
21378 emit_move_insn (out, addr);
21380 ix86_expand_strlensi_unroll_1 (out, src, align);
21382 /* strlensi_unroll_1 returns the address of the zero at the end of
21383 the string, like memchr(), so compute the length by subtracting
21384 the start address. */
21385 emit_insn (ix86_gen_sub3 (out, out, addr));
21391 /* Can't use this if the user has appropriated eax, ecx, or edi. */
21392 if (fixed_regs[AX_REG] || fixed_regs[CX_REG] || fixed_regs[DI_REG])
21395 scratch2 = gen_reg_rtx (Pmode);
21396 scratch3 = gen_reg_rtx (Pmode);
21397 scratch4 = force_reg (Pmode, constm1_rtx);
21399 emit_move_insn (scratch3, addr);
21400 eoschar = force_reg (QImode, eoschar);
21402 src = replace_equiv_address_nv (src, scratch3);
21404 /* If .md starts supporting :P, this can be done in .md. */
21405 unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (4, src, eoschar, align,
21406 scratch4), UNSPEC_SCAS);
21407 emit_insn (gen_strlenqi_1 (scratch1, scratch3, unspec));
21408 emit_insn (ix86_gen_one_cmpl2 (scratch2, scratch1));
21409 emit_insn (ix86_gen_add3 (out, scratch2, constm1_rtx));
21414 /* For a given symbol (function) construct code to compute the address
21415 of its PLT entry in the large x86-64 PIC model. */
21417 construct_plt_address (rtx symbol)
21419 rtx tmp = gen_reg_rtx (Pmode);
21420 rtx unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
21422 gcc_assert (GET_CODE (symbol) == SYMBOL_REF);
21423 gcc_assert (ix86_cmodel == CM_LARGE_PIC);
21425 emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
21426 emit_insn (gen_adddi3 (tmp, tmp, pic_offset_table_rtx));
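/* The emitted sequence is shaped roughly like (illustrative only, with %r15
   standing in for whichever register holds pic_offset_table_rtx):

       movabs  $func@PLTOFF, %rax
       add     %r15, %rax
*/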
21431 ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
21433 rtx pop, bool sibcall)
21435 rtx use = NULL, call;
21437 if (pop == const0_rtx)
21438 pop = NULL;
21439 gcc_assert (!TARGET_64BIT || !pop);
21441 if (TARGET_MACHO && !TARGET_64BIT)
21444 if (flag_pic && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
21445 fnaddr = machopic_indirect_call_target (fnaddr);
21450 /* Static functions and indirect calls don't need the pic register. */
21451 if (flag_pic && (!TARGET_64BIT || ix86_cmodel == CM_LARGE_PIC)
21452 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
21453 && ! SYMBOL_REF_LOCAL_P (XEXP (fnaddr, 0)))
21454 use_reg (&use, pic_offset_table_rtx);
21457 if (TARGET_64BIT && INTVAL (callarg2) >= 0)
21459 rtx al = gen_rtx_REG (QImode, AX_REG);
21460 emit_move_insn (al, callarg2);
21461 use_reg (&use, al);
21464 if (ix86_cmodel == CM_LARGE_PIC
21466 && GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF
21467 && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
21468 fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
21469 if (sibcall
21470 ? !sibcall_insn_operand (XEXP (fnaddr, 0), Pmode)
21471 : !call_insn_operand (XEXP (fnaddr, 0), Pmode))
21473 fnaddr = copy_to_mode_reg (Pmode, XEXP (fnaddr, 0));
21474 fnaddr = gen_rtx_MEM (QImode, fnaddr);
21477 call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
21478 if (retval)
21479 call = gen_rtx_SET (VOIDmode, retval, call);
21480 if (pop)
21481 {
21482 pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
21483 pop = gen_rtx_SET (VOIDmode, stack_pointer_rtx, pop);
21484 call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, call, pop));
21485 }
21486 if (TARGET_64BIT_MS_ABI
21487 && (!callarg2 || INTVAL (callarg2) != -2))
21489 /* We need to represent that SI and DI registers are clobbered
21490 by the function call. */
21491 static int clobbered_registers[] = {
21492 XMM6_REG, XMM7_REG, XMM8_REG,
21493 XMM9_REG, XMM10_REG, XMM11_REG,
21494 XMM12_REG, XMM13_REG, XMM14_REG,
21495 XMM15_REG, SI_REG, DI_REG
21498 rtx vec[ARRAY_SIZE (clobbered_registers) + 2];
21499 rtx unspec = gen_rtx_UNSPEC (VOIDmode, gen_rtvec (1, const0_rtx),
21500 UNSPEC_MS_TO_SYSV_CALL);
21504 for (i = 0; i < ARRAY_SIZE (clobbered_registers); i++)
21505 vec[i + 2] = gen_rtx_CLOBBER (SSE_REGNO_P (clobbered_registers[i])
21508 (SSE_REGNO_P (clobbered_registers[i])
21510 clobbered_registers[i]));
21512 call = gen_rtx_PARALLEL (VOIDmode,
21513 gen_rtvec_v (ARRAY_SIZE (clobbered_registers)
21517 /* Add UNSPEC_CALL_NEEDS_VZEROUPPER decoration. */
21518 if (TARGET_VZEROUPPER)
21523 if (cfun->machine->callee_pass_avx256_p)
21525 if (cfun->machine->callee_return_avx256_p)
21526 avx256 = callee_return_pass_avx256;
21528 avx256 = callee_pass_avx256;
21530 else if (cfun->machine->callee_return_avx256_p)
21531 avx256 = callee_return_avx256;
21533 avx256 = call_no_avx256;
21535 if (reload_completed)
21536 emit_insn (gen_avx_vzeroupper (GEN_INT (avx256)));
21539 unspec = gen_rtx_UNSPEC (VOIDmode,
21540 gen_rtvec (1, GEN_INT (avx256)),
21541 UNSPEC_CALL_NEEDS_VZEROUPPER);
21542 call = gen_rtx_PARALLEL (VOIDmode,
21543 gen_rtvec (2, call, unspec));
21547 call = emit_call_insn (call);
21549 CALL_INSN_FUNCTION_USAGE (call) = use;
21555 ix86_split_call_vzeroupper (rtx insn, rtx vzeroupper)
21557 rtx call = XVECEXP (PATTERN (insn), 0, 0);
21558 emit_insn (gen_avx_vzeroupper (vzeroupper));
21559 emit_call_insn (call);
21562 /* Output the assembly for a call instruction. */
21565 ix86_output_call_insn (rtx insn, rtx call_op)
21567 bool direct_p = constant_call_address_operand (call_op, Pmode);
21568 bool seh_nop_p = false;
21571 if (SIBLING_CALL_P (insn))
21575 /* SEH epilogue detection requires the indirect branch case
21576 to include REX.W. */
21577 else if (TARGET_SEH)
21578 xasm = "rex.W jmp %A0";
21582 output_asm_insn (xasm, &call_op);
21586 /* SEH unwinding can require an extra nop to be emitted in several
21587 circumstances. Determine if we have one of those. */
21592 for (i = NEXT_INSN (insn); i ; i = NEXT_INSN (i))
21594 /* If we get to another real insn, we don't need the nop. */
21598 /* If we get to the epilogue note, prevent a catch region from
21599 being adjacent to the standard epilogue sequence. If non-
21600 call-exceptions, we'll have done this during epilogue emission. */
21601 if (NOTE_P (i) && NOTE_KIND (i) == NOTE_INSN_EPILOGUE_BEG
21602 && !flag_non_call_exceptions
21603 && !can_throw_internal (insn))
21610 /* If we didn't find a real insn following the call, prevent the
21611 unwinder from looking into the next function. */
21617 xasm = "call\t%P0";
21619 xasm = "call\t%A0";
21621 output_asm_insn (xasm, &call_op);
21629 /* Clear stack slot assignments remembered from previous functions.
21630 This is called from INIT_EXPANDERS once before RTL is emitted for each
21631 function. */
21633 static struct machine_function *
21634 ix86_init_machine_status (void)
21636 struct machine_function *f;
21638 f = ggc_alloc_cleared_machine_function ();
21639 f->use_fast_prologue_epilogue_nregs = -1;
21640 f->tls_descriptor_call_expanded_p = 0;
21641 f->call_abi = ix86_abi;
21646 /* Return a MEM corresponding to a stack slot with mode MODE.
21647 Allocate a new slot if necessary.
21649 The RTL for a function can have several slots available: N is
21650 which slot to use. */
21653 assign_386_stack_local (enum machine_mode mode, enum ix86_stack_slot n)
21655 struct stack_local_entry *s;
21657 gcc_assert (n < MAX_386_STACK_LOCALS);
21659 /* Virtual slot is valid only before vregs are instantiated. */
21660 gcc_assert ((n == SLOT_VIRTUAL) == !virtuals_instantiated);
21662 for (s = ix86_stack_locals; s; s = s->next)
21663 if (s->mode == mode && s->n == n)
21664 return copy_rtx (s->rtl);
21666 s = ggc_alloc_stack_local_entry ();
21669 s->rtl = assign_stack_local (mode, GET_MODE_SIZE (mode), 0);
21671 s->next = ix86_stack_locals;
21672 ix86_stack_locals = s;
21676 /* Calculate the length of the memory address in the instruction
21677 encoding. Does not include the one-byte modrm, opcode, or prefix. */
21680 memory_address_length (rtx addr)
21682 struct ix86_address parts;
21683 rtx base, index, disp;
21687 if (GET_CODE (addr) == PRE_DEC
21688 || GET_CODE (addr) == POST_INC
21689 || GET_CODE (addr) == PRE_MODIFY
21690 || GET_CODE (addr) == POST_MODIFY)
21693 ok = ix86_decompose_address (addr, &parts);
21696 if (parts.base && GET_CODE (parts.base) == SUBREG)
21697 parts.base = SUBREG_REG (parts.base);
21698 if (parts.index && GET_CODE (parts.index) == SUBREG)
21699 parts.index = SUBREG_REG (parts.index);
21702 index = parts.index;
21706 /* Rule of thumb:
21707 - esp as the base always wants an index,
21708 - ebp as the base always wants a displacement,
21709 - r12 as the base always wants an index,
21710 - r13 as the base always wants a displacement. */
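/* Worked examples (illustrative; the returned length excludes the modrm
   byte itself):
     (%eax)          -> 0  (one-byte modrm suffices)
     (%esp)          -> 1  (SIB byte required)
     (%ebp)          -> 1  (disp8 of 0 required)
     4(%eax)         -> 1  (disp8)
     0x1000(%eax)    -> 4  (disp32)
     (%eax,%ecx,2)   -> 1  (SIB byte)  */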
21712 /* Register Indirect. */
21713 if (base && !index && !disp)
21715 /* esp (for its index) and ebp (for its displacement) need
21716 the two-byte modrm form. Similarly for r12 and r13 in 64-bit
21717 mode. */
21719 && (addr == arg_pointer_rtx
21720 || addr == frame_pointer_rtx
21721 || REGNO (addr) == SP_REG
21722 || REGNO (addr) == BP_REG
21723 || REGNO (addr) == R12_REG
21724 || REGNO (addr) == R13_REG))
21728 /* Direct Addressing. In 64-bit mode mod 00 r/m 5
21729 is not disp32, but disp32(%rip), so for disp32
21730 SIB byte is needed, unless print_operand_address
21731 optimizes it into disp32(%rip) or (%rip) is implied
21732 by UNSPEC. */
21733 else if (disp && !base && !index)
21740 if (GET_CODE (disp) == CONST)
21741 symbol = XEXP (disp, 0);
21742 if (GET_CODE (symbol) == PLUS
21743 && CONST_INT_P (XEXP (symbol, 1)))
21744 symbol = XEXP (symbol, 0);
21746 if (GET_CODE (symbol) != LABEL_REF
21747 && (GET_CODE (symbol) != SYMBOL_REF
21748 || SYMBOL_REF_TLS_MODEL (symbol) != 0)
21749 && (GET_CODE (symbol) != UNSPEC
21750 || (XINT (symbol, 1) != UNSPEC_GOTPCREL
21751 && XINT (symbol, 1) != UNSPEC_PCREL
21752 && XINT (symbol, 1) != UNSPEC_GOTNTPOFF)))
21759 /* Find the length of the displacement constant. */
21762 if (base && satisfies_constraint_K (disp))
21767 /* ebp always wants a displacement. Similarly r13. */
21768 else if (base && REG_P (base)
21769 && (REGNO (base) == BP_REG || REGNO (base) == R13_REG))
21772 /* An index requires the two-byte modrm form.... */
21774 /* ...like esp (or r12), which always wants an index. */
21775 || base == arg_pointer_rtx
21776 || base == frame_pointer_rtx
21777 || (base && REG_P (base)
21778 && (REGNO (base) == SP_REG || REGNO (base) == R12_REG)))
21795 /* Compute default value for "length_immediate" attribute. When SHORTFORM
21796 is set, expect that the insn has an 8-bit immediate alternative. */
21798 ix86_attr_length_immediate_default (rtx insn, bool shortform)
21802 extract_insn_cached (insn);
21803 for (i = recog_data.n_operands - 1; i >= 0; --i)
21804 if (CONSTANT_P (recog_data.operand[i]))
21806 enum attr_mode mode = get_attr_mode (insn);
21809 if (shortform && CONST_INT_P (recog_data.operand[i]))
21811 HOST_WIDE_INT ival = INTVAL (recog_data.operand[i]);
21818 ival = trunc_int_for_mode (ival, HImode);
21821 ival = trunc_int_for_mode (ival, SImode);
21826 if (IN_RANGE (ival, -128, 127))
21843 /* Immediates for DImode instructions are encoded as 32-bit sign-extended values. */
21848 fatal_insn ("unknown insn mode", insn);
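/* E.g. (illustrative): with SHORTFORM, "addl $100, %eax" can use the
   sign-extended imm8 alternative (length 1), while "addl $1000, %eax"
   needs a full imm32 (length 4).  */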
21853 /* Compute default value for "length_address" attribute. */
21855 ix86_attr_length_address_default (rtx insn)
21859 if (get_attr_type (insn) == TYPE_LEA)
21861 rtx set = PATTERN (insn), addr;
21863 if (GET_CODE (set) == PARALLEL)
21864 set = XVECEXP (set, 0, 0);
21866 gcc_assert (GET_CODE (set) == SET);
21868 addr = SET_SRC (set);
21869 if (TARGET_64BIT && get_attr_mode (insn) == MODE_SI)
21871 if (GET_CODE (addr) == ZERO_EXTEND)
21872 addr = XEXP (addr, 0);
21873 if (GET_CODE (addr) == SUBREG)
21874 addr = SUBREG_REG (addr);
21877 return memory_address_length (addr);
21880 extract_insn_cached (insn);
21881 for (i = recog_data.n_operands - 1; i >= 0; --i)
21882 if (MEM_P (recog_data.operand[i]))
21884 constrain_operands_cached (reload_completed);
21885 if (which_alternative != -1)
21887 const char *constraints = recog_data.constraints[i];
21888 int alt = which_alternative;
21890 while (*constraints == '=' || *constraints == '+')
21893 while (*constraints++ != ',')
21895 /* Skip ignored operands. */
21896 if (*constraints == 'X')
21899 return memory_address_length (XEXP (recog_data.operand[i], 0));
21904 /* Compute default value for "length_vex" attribute. It includes
21905 2 or 3 byte VEX prefix and 1 opcode byte. */
21908 ix86_attr_length_vex_default (rtx insn, bool has_0f_opcode, bool has_vex_w)
21912 /* Only the 0f opcode can use the 2-byte VEX prefix; the VEX.W bit
21913 requires the 3-byte form. */
21914 if (!has_0f_opcode || has_vex_w)
21917 /* We can always use 2 byte VEX prefix in 32bit. */
21921 extract_insn_cached (insn);
21923 for (i = recog_data.n_operands - 1; i >= 0; --i)
21924 if (REG_P (recog_data.operand[i]))
21926 /* REX.W bit uses 3 byte VEX prefix. */
21927 if (GET_MODE (recog_data.operand[i]) == DImode
21928 && GENERAL_REG_P (recog_data.operand[i]))
21933 /* REX.X or REX.B bits use 3 byte VEX prefix. */
21934 if (MEM_P (recog_data.operand[i])
21935 && x86_extended_reg_mentioned_p (recog_data.operand[i]))
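/* E.g. (illustrative): a 0f-opcode VEX insn whose memory operand is
   addressed through %r13 mentions an extended register, so REX.B is needed
   and the 3-byte (C4) prefix is used; with only %rax-based addressing and
   no DImode general registers the 2-byte (C5) prefix suffices.  */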
21942 /* Return the maximum number of instructions a cpu can issue. */
21945 ix86_issue_rate (void)
21949 case PROCESSOR_PENTIUM:
21950 case PROCESSOR_ATOM:
21954 case PROCESSOR_PENTIUMPRO:
21955 case PROCESSOR_PENTIUM4:
21956 case PROCESSOR_CORE2_32:
21957 case PROCESSOR_CORE2_64:
21958 case PROCESSOR_COREI7_32:
21959 case PROCESSOR_COREI7_64:
21960 case PROCESSOR_ATHLON:
21962 case PROCESSOR_AMDFAM10:
21963 case PROCESSOR_NOCONA:
21964 case PROCESSOR_GENERIC32:
21965 case PROCESSOR_GENERIC64:
21966 case PROCESSOR_BDVER1:
21967 case PROCESSOR_BDVER2:
21968 case PROCESSOR_BTVER1:
21976 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
21977 by DEP_INSN and nothing else set by DEP_INSN. */
21980 ix86_flags_dependent (rtx insn, rtx dep_insn, enum attr_type insn_type)
21984 /* Simplify the test for uninteresting insns. */
21985 if (insn_type != TYPE_SETCC
21986 && insn_type != TYPE_ICMOV
21987 && insn_type != TYPE_FCMOV
21988 && insn_type != TYPE_IBR)
21991 if ((set = single_set (dep_insn)) != 0)
21993 set = SET_DEST (set);
21996 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
21997 && XVECLEN (PATTERN (dep_insn), 0) == 2
21998 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
21999 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
22001 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
22002 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
22007 if (!REG_P (set) || REGNO (set) != FLAGS_REG)
22010 /* This test is true if the dependent insn reads the flags but
22011 not any other potentially set register. */
22012 if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
22015 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
22021 /* Return true iff USE_INSN has a memory address with operands set by
22022 SET_INSN. */
22024 static bool
22025 ix86_agi_dependent (rtx set_insn, rtx use_insn)
22028 extract_insn_cached (use_insn);
22029 for (i = recog_data.n_operands - 1; i >= 0; --i)
22030 if (MEM_P (recog_data.operand[i]))
22032 rtx addr = XEXP (recog_data.operand[i], 0);
22033 return modified_in_p (addr, set_insn) != 0;
22039 ix86_adjust_cost (rtx insn, rtx link, rtx dep_insn, int cost)
22041 enum attr_type insn_type, dep_insn_type;
22042 enum attr_memory memory;
22044 int dep_insn_code_number;
22046 /* Anti and output dependencies have zero cost on all CPUs. */
22047 if (REG_NOTE_KIND (link) != 0)
22050 dep_insn_code_number = recog_memoized (dep_insn);
22052 /* If we can't recognize the insns, we can't really do anything. */
22053 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
22056 insn_type = get_attr_type (insn);
22057 dep_insn_type = get_attr_type (dep_insn);
22061 case PROCESSOR_PENTIUM:
22062 /* Address Generation Interlock adds a cycle of latency. */
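/* E.g. (illustrative) on Pentium:
       addl    $4, %eax
       movl    (%eax), %ebx
   the load stalls an extra cycle while %eax reaches the address
   generation unit.  */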
22063 if (insn_type == TYPE_LEA)
22065 rtx addr = PATTERN (insn);
22067 if (GET_CODE (addr) == PARALLEL)
22068 addr = XVECEXP (addr, 0, 0);
22070 gcc_assert (GET_CODE (addr) == SET);
22072 addr = SET_SRC (addr);
22073 if (modified_in_p (addr, dep_insn))
22076 else if (ix86_agi_dependent (dep_insn, insn))
22079 /* ??? Compares pair with jump/setcc. */
22080 if (ix86_flags_dependent (insn, dep_insn, insn_type))
22083 /* Floating point stores require the value to be ready one cycle earlier. */
22084 if (insn_type == TYPE_FMOV
22085 && get_attr_memory (insn) == MEMORY_STORE
22086 && !ix86_agi_dependent (dep_insn, insn))
22090 case PROCESSOR_PENTIUMPRO:
22091 memory = get_attr_memory (insn);
22093 /* INT->FP conversion is expensive. */
22094 if (get_attr_fp_int_src (dep_insn))
22097 /* There is one cycle extra latency between an FP op and a store. */
22098 if (insn_type == TYPE_FMOV
22099 && (set = single_set (dep_insn)) != NULL_RTX
22100 && (set2 = single_set (insn)) != NULL_RTX
22101 && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
22102 && MEM_P (SET_DEST (set2)))
22105 /* Show ability of reorder buffer to hide latency of load by executing
22106 in parallel with previous instruction in case
22107 previous instruction is not needed to compute the address. */
22108 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
22109 && !ix86_agi_dependent (dep_insn, insn))
22111 /* Claim moves to take one cycle, as core can issue one load
22112 at time and the next load can start cycle later. */
22113 if (dep_insn_type == TYPE_IMOV
22114 || dep_insn_type == TYPE_FMOV)
22122 memory = get_attr_memory (insn);
22124 /* The esp dependency is resolved before the instruction is really
22126 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
22127 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
22130 /* INT->FP conversion is expensive. */
22131 if (get_attr_fp_int_src (dep_insn))
22134 /* Show ability of reorder buffer to hide latency of load by executing
22135 in parallel with previous instruction in case
22136 previous instruction is not needed to compute the address. */
22137 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
22138 && !ix86_agi_dependent (dep_insn, insn))
22140 /* Claim moves to take one cycle, as core can issue one load
22141 at time and the next load can start cycle later. */
22142 if (dep_insn_type == TYPE_IMOV
22143 || dep_insn_type == TYPE_FMOV)
22152 case PROCESSOR_ATHLON:
22154 case PROCESSOR_AMDFAM10:
22155 case PROCESSOR_BDVER1:
22156 case PROCESSOR_BDVER2:
22157 case PROCESSOR_BTVER1:
22158 case PROCESSOR_ATOM:
22159 case PROCESSOR_GENERIC32:
22160 case PROCESSOR_GENERIC64:
22161 memory = get_attr_memory (insn);
22163 /* Show ability of reorder buffer to hide latency of load by executing
22164 in parallel with previous instruction in case
22165 previous instruction is not needed to compute the address. */
22166 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
22167 && !ix86_agi_dependent (dep_insn, insn))
22169 enum attr_unit unit = get_attr_unit (insn);
22172 /* Because of the difference between the length of integer and
22173 floating unit pipeline preparation stages, the memory operands
22174 for floating point are cheaper.
22176 ??? For Athlon it the difference is most probably 2. */
22177 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
22180 loadcost = TARGET_ATHLON ? 2 : 0;
22182 if (cost >= loadcost)
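
/* Editorial illustration: on Pentium a flags consumer pairs with the
   compare that produces its input, so the dependence in

	cmpl	$0, %eax
	je	.L1

   is costed at 0 by the ix86_flags_dependent test above, which lets the
   scheduler keep the pair adjacent.  */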
/* How many alternative schedules to try.  This should be as wide as the
   scheduling freedom in the DFA, but no wider.  Making this value too
   large results in extra work for the scheduler.  */

static int
ia32_multipass_dfa_lookahead (void)
{
  switch (ix86_tune)
    {
    case PROCESSOR_PENTIUM:
      return 2;

    case PROCESSOR_PENTIUMPRO:
    case PROCESSOR_K6:
      return 1;

    case PROCESSOR_CORE2_32:
    case PROCESSOR_CORE2_64:
    case PROCESSOR_COREI7_32:
    case PROCESSOR_COREI7_64:
      /* Generally, we want haifa-sched:max_issue() to look as far ahead as
	 the number of instructions that can be executed in one cycle, i.e.,
	 issue_rate.  I wonder why tuning for many CPUs does not do this.  */
      return ix86_issue_rate ();

    default:
      return 0;
    }
}
/* Model decoder of Core 2/i7.
   Below hooks for multipass scheduling (see haifa-sched.c:max_issue)
   track the instruction fetch block boundaries and make sure that long
   (9+ bytes) instructions are assigned to D0.  */

/* Maximum length of an insn that can be handled by
   a secondary decoder unit.  '8' for Core 2/i7.  */
static int core2i7_secondary_decoder_max_insn_size;

/* Ifetch block size, i.e., number of bytes decoder reads per cycle.
   '16' for Core 2/i7.  */
static int core2i7_ifetch_block_size;

/* Maximum number of instructions decoder can handle per cycle.
   '6' for Core 2/i7.  */
static int core2i7_ifetch_block_max_insns;
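
/* Editorial note: with the values installed by ix86_sched_init_global
   below (16-byte ifetch block, at most 6 insns per block, 8-byte limit
   for the secondary decoders), a 9-byte insn can only be taken by
   decoder D0, i.e. as the first insn of a group, and two 9-byte insns
   can never share one 16-byte block.  */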
typedef struct ix86_first_cycle_multipass_data_ *
  ix86_first_cycle_multipass_data_t;
typedef const struct ix86_first_cycle_multipass_data_ *
  const_ix86_first_cycle_multipass_data_t;

/* A variable to store target state across calls to max_issue within
   one cycle.  */
static struct ix86_first_cycle_multipass_data_ _ix86_first_cycle_multipass_data,
  *ix86_first_cycle_multipass_data = &_ix86_first_cycle_multipass_data;

/* Initialize DATA.  */
static void
core2i7_first_cycle_multipass_init (void *_data)
{
  ix86_first_cycle_multipass_data_t data
    = (ix86_first_cycle_multipass_data_t) _data;

  data->ifetch_block_len = 0;
  data->ifetch_block_n_insns = 0;
  data->ready_try_change = NULL;
  data->ready_try_change_size = 0;
}

/* Advancing the cycle; reset ifetch block counts.  */
static void
core2i7_dfa_post_advance_cycle (void)
{
  ix86_first_cycle_multipass_data_t data = ix86_first_cycle_multipass_data;

  gcc_assert (data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);

  data->ifetch_block_len = 0;
  data->ifetch_block_n_insns = 0;
}
static int min_insn_size (rtx);

/* Filter out insns from ready_try that the core will not be able to issue
   on the current cycle due to decoder restrictions.  */
static void
core2i7_first_cycle_multipass_filter_ready_try
  (const_ix86_first_cycle_multipass_data_t data,
   char *ready_try, int n_ready, bool first_cycle_insn_p)
{
  while (n_ready--)
    {
      rtx insn;
      int insn_size;

      if (ready_try[n_ready])
	continue;

      insn = get_ready_element (n_ready);
      insn_size = min_insn_size (insn);

      if (/* If this insn is too long for a secondary decoder ...  */
	  (!first_cycle_insn_p
	   && insn_size > core2i7_secondary_decoder_max_insn_size)
	  /* ... or it would not fit into the ifetch block ...  */
	  || data->ifetch_block_len + insn_size > core2i7_ifetch_block_size
	  /* ... or the decoder is full already ...  */
	  || data->ifetch_block_n_insns + 1 > core2i7_ifetch_block_max_insns)
	/* ... mask the insn out.  */
	{
	  ready_try[n_ready] = 1;

	  if (data->ready_try_change)
	    SET_BIT (data->ready_try_change, n_ready);
	}
    }
}
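
/* Editorial note: ready_try[] is shared with haifa-sched's max_issue; a
   nonzero entry means "do not try to issue this candidate now".  The bits
   recorded in ready_try_change let
   core2i7_first_cycle_multipass_backtrack undo exactly the entries masked
   here and nothing else.  */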
/* Prepare for a new round of multipass lookahead scheduling.  */
static void
core2i7_first_cycle_multipass_begin (void *_data, char *ready_try, int n_ready,
				     bool first_cycle_insn_p)
{
  ix86_first_cycle_multipass_data_t data
    = (ix86_first_cycle_multipass_data_t) _data;
  const_ix86_first_cycle_multipass_data_t prev_data
    = ix86_first_cycle_multipass_data;

  /* Restore the state from the end of the previous round.  */
  data->ifetch_block_len = prev_data->ifetch_block_len;
  data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns;

  /* Filter instructions that cannot be issued on the current cycle due to
     decoder restrictions.  */
  core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
						  first_cycle_insn_p);
}
/* INSN is being issued in the current solution.  Account for its impact on
   the decoder model.  */
static void
core2i7_first_cycle_multipass_issue (void *_data, char *ready_try, int n_ready,
				     rtx insn, const void *_prev_data)
{
  ix86_first_cycle_multipass_data_t data
    = (ix86_first_cycle_multipass_data_t) _data;
  const_ix86_first_cycle_multipass_data_t prev_data
    = (const_ix86_first_cycle_multipass_data_t) _prev_data;

  int insn_size = min_insn_size (insn);

  data->ifetch_block_len = prev_data->ifetch_block_len + insn_size;
  data->ifetch_block_n_insns = prev_data->ifetch_block_n_insns + 1;
  gcc_assert (data->ifetch_block_len <= core2i7_ifetch_block_size
	      && data->ifetch_block_n_insns <= core2i7_ifetch_block_max_insns);

  /* Allocate or resize the bitmap for storing INSN's effect on ready_try.  */
  if (!data->ready_try_change)
    {
      data->ready_try_change = sbitmap_alloc (n_ready);
      data->ready_try_change_size = n_ready;
    }
  else if (data->ready_try_change_size < n_ready)
    {
      data->ready_try_change = sbitmap_resize (data->ready_try_change,
					       n_ready, 0);
      data->ready_try_change_size = n_ready;
    }
  sbitmap_zero (data->ready_try_change);

  /* Filter out insns from ready_try that the core will not be able to issue
     on the current cycle due to decoder restrictions.  */
  core2i7_first_cycle_multipass_filter_ready_try (data, ready_try, n_ready,
						  false);
}
/* Revert the effect on ready_try.  */
static void
core2i7_first_cycle_multipass_backtrack (const void *_data,
					 char *ready_try,
					 int n_ready ATTRIBUTE_UNUSED)
{
  const_ix86_first_cycle_multipass_data_t data
    = (const_ix86_first_cycle_multipass_data_t) _data;
  unsigned int i = 0;
  sbitmap_iterator sbi;

  gcc_assert (sbitmap_last_set_bit (data->ready_try_change) < n_ready);
  EXECUTE_IF_SET_IN_SBITMAP (data->ready_try_change, 0, i, sbi)
    {
      ready_try[i] = 0;
    }
}
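
/* Editorial note: only the bits set during the matching _issue call are
   cleared here, so entries masked by the scheduler itself (or in an
   earlier pass) survive the backtrack unchanged.  */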
/* Save the result of multipass lookahead scheduling for the next round.  */
static void
core2i7_first_cycle_multipass_end (const void *_data)
{
  const_ix86_first_cycle_multipass_data_t data
    = (const_ix86_first_cycle_multipass_data_t) _data;
  ix86_first_cycle_multipass_data_t next_data
    = ix86_first_cycle_multipass_data;

  if (data != NULL)
    {
      next_data->ifetch_block_len = data->ifetch_block_len;
      next_data->ifetch_block_n_insns = data->ifetch_block_n_insns;
    }
}

/* Deallocate target data.  */
static void
core2i7_first_cycle_multipass_fini (void *_data)
{
  ix86_first_cycle_multipass_data_t data
    = (ix86_first_cycle_multipass_data_t) _data;

  if (data->ready_try_change)
    {
      sbitmap_free (data->ready_try_change);
      data->ready_try_change = NULL;
      data->ready_try_change_size = 0;
    }
}
/* Prepare for scheduling pass.  */
static void
ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED,
			int verbose ATTRIBUTE_UNUSED,
			int max_uid ATTRIBUTE_UNUSED)
{
  /* Install scheduling hooks for current CPU.  Some of these hooks are used
     in time-critical parts of the scheduler, so we only set them up when
     they are actually used.  */
  switch (ix86_tune)
    {
    case PROCESSOR_CORE2_32:
    case PROCESSOR_CORE2_64:
    case PROCESSOR_COREI7_32:
    case PROCESSOR_COREI7_64:
      targetm.sched.dfa_post_advance_cycle
	= core2i7_dfa_post_advance_cycle;
      targetm.sched.first_cycle_multipass_init
	= core2i7_first_cycle_multipass_init;
      targetm.sched.first_cycle_multipass_begin
	= core2i7_first_cycle_multipass_begin;
      targetm.sched.first_cycle_multipass_issue
	= core2i7_first_cycle_multipass_issue;
      targetm.sched.first_cycle_multipass_backtrack
	= core2i7_first_cycle_multipass_backtrack;
      targetm.sched.first_cycle_multipass_end
	= core2i7_first_cycle_multipass_end;
      targetm.sched.first_cycle_multipass_fini
	= core2i7_first_cycle_multipass_fini;

      /* Set decoder parameters.  */
      core2i7_secondary_decoder_max_insn_size = 8;
      core2i7_ifetch_block_size = 16;
      core2i7_ifetch_block_max_insns = 6;
      break;

    default:
      targetm.sched.dfa_post_advance_cycle = NULL;
      targetm.sched.first_cycle_multipass_init = NULL;
      targetm.sched.first_cycle_multipass_begin = NULL;
      targetm.sched.first_cycle_multipass_issue = NULL;
      targetm.sched.first_cycle_multipass_backtrack = NULL;
      targetm.sched.first_cycle_multipass_end = NULL;
      targetm.sched.first_cycle_multipass_fini = NULL;
      break;
    }
}
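
/* Editorial summary of the hook protocol assumed above, sketched as one
   lookahead round would run it (begin -> {issue -> backtrack?}* -> end,
   with fini at the end of scheduling):

	core2i7_first_cycle_multipass_begin (&d, ready_try, n, true);
	core2i7_first_cycle_multipass_issue (&d, ready_try, n, insn, &prev);
	core2i7_first_cycle_multipass_backtrack (&d, ready_try, n);
	core2i7_first_cycle_multipass_end (&d);

   See haifa-sched.c:max_issue for the authoritative call order.  */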
/* Compute the alignment given to a constant that is being placed in memory.
   EXP is the constant and ALIGN is the alignment that the object would
   ordinarily have.
   The value of this function is used instead of that alignment to align
   the object.  */

int
ix86_constant_alignment (tree exp, int align)
{
  if (TREE_CODE (exp) == REAL_CST || TREE_CODE (exp) == VECTOR_CST
      || TREE_CODE (exp) == INTEGER_CST)
    {
      if (TYPE_MODE (TREE_TYPE (exp)) == DFmode && align < 64)
	return 64;
      else if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (exp))) && align < 128)
	return 128;
    }
  else if (!optimize_size && TREE_CODE (exp) == STRING_CST
	   && TREE_STRING_LENGTH (exp) >= 31 && align < BITS_PER_WORD)
    return BITS_PER_WORD;

  return align;
}
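
/* Editorial example: on ia32 a constant such as

	static const double pi = 3.1415926535;

   would ordinarily get the ABI's 32-bit double alignment; the hook above
   raises it to 64 so the FPU load does not straddle an alignment
   boundary, and long string constants are word-aligned to speed up block
   copies.  */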
/* Compute the alignment for a static variable.
   TYPE is the data type, and ALIGN is the alignment that
   the object would ordinarily have.  The value of this function is used
   instead of that alignment to align the object.  */

int
ix86_data_alignment (tree type, int align)
{
  int max_align = optimize_size ? BITS_PER_WORD : MIN (256, MAX_OFILE_ALIGNMENT);

  if (AGGREGATE_TYPE_P (type)
      && TYPE_SIZE (type)
      && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
      && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= (unsigned) max_align
	  || TREE_INT_CST_HIGH (TYPE_SIZE (type)))
      && align < max_align)
    align = max_align;

  /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
     to a 16-byte boundary.  */
  if (TARGET_64BIT)
    {
      if (AGGREGATE_TYPE_P (type)
	  && TYPE_SIZE (type)
	  && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
	  && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 128
	      || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
	return 128;
    }

  if (TREE_CODE (type) == ARRAY_TYPE)
    {
      if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
	return 64;
      if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
	return 128;
    }
  else if (TREE_CODE (type) == COMPLEX_TYPE)
    {
      if (TYPE_MODE (type) == DCmode && align < 64)
	return 64;
      if ((TYPE_MODE (type) == XCmode
	   || TYPE_MODE (type) == TCmode) && align < 128)
	return 128;
    }
  else if ((TREE_CODE (type) == RECORD_TYPE
	    || TREE_CODE (type) == UNION_TYPE
	    || TREE_CODE (type) == QUAL_UNION_TYPE)
	   && TYPE_FIELDS (type))
    {
      if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
	return 64;
      if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
	return 128;
    }
  else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
	   || TREE_CODE (type) == INTEGER_TYPE)
    {
      if (TYPE_MODE (type) == DFmode && align < 64)
	return 64;
      if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
	return 128;
    }

  return align;
}
/* Compute the alignment for a local variable or a stack slot.  EXP is
   the data type or decl itself, MODE is the widest mode available and
   ALIGN is the alignment that the object would ordinarily have.  The
   value of this macro is used instead of that alignment to align the
   object.  */

unsigned int
ix86_local_alignment (tree exp, enum machine_mode mode,
		      unsigned int align)
{
  tree type, decl;

  if (exp && DECL_P (exp))
    {
      type = TREE_TYPE (exp);
      decl = exp;
    }
  else
    {
      type = exp;
      decl = NULL;
    }

  /* Don't do dynamic stack realignment for long long objects with
     -mpreferred-stack-boundary=2.  */
  if (!TARGET_64BIT
      && align == 64
      && ix86_preferred_stack_boundary < 64
      && (mode == DImode || (type && TYPE_MODE (type) == DImode))
      && (!type || !TYPE_USER_ALIGN (type))
      && (!decl || !DECL_USER_ALIGN (decl)))
    align = 32;

  /* If TYPE is NULL, we are allocating a stack slot for a caller-save
     register in MODE.  We will return the largest alignment of XF
     and DF.  */
  if (!type)
    {
      if (mode == XFmode && align < GET_MODE_ALIGNMENT (DFmode))
	align = GET_MODE_ALIGNMENT (DFmode);
      return align;
    }

  /* The x86-64 ABI requires arrays greater than 16 bytes to be aligned
     to a 16-byte boundary.  The exact wording is:

       An array uses the same alignment as its elements, except that a
       local or global array variable of length at least 16 bytes or a
       C99 variable-length array variable always has alignment of at
       least 16 bytes.

     This was added to allow use of aligned SSE instructions on arrays.
     The rule is meant for static storage (where the compiler cannot do
     the analysis by itself).  We follow it for automatic variables only
     when convenient: we fully control everything in the function being
     compiled, and functions from other units cannot rely on the
     alignment.

     Exclude va_list type.  It is the common case of a local array where
     we cannot benefit from the alignment.  */
  if (TARGET_64BIT && optimize_function_for_speed_p (cfun)
      && TARGET_SSE)
    {
      if (AGGREGATE_TYPE_P (type)
	  && (va_list_type_node == NULL_TREE
	      || (TYPE_MAIN_VARIANT (type)
		  != TYPE_MAIN_VARIANT (va_list_type_node)))
	  && TYPE_SIZE (type)
	  && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
	  && (TREE_INT_CST_LOW (TYPE_SIZE (type)) >= 16
	      || TREE_INT_CST_HIGH (TYPE_SIZE (type))) && align < 128)
	return 128;
    }
  if (TREE_CODE (type) == ARRAY_TYPE)
    {
      if (TYPE_MODE (TREE_TYPE (type)) == DFmode && align < 64)
	return 64;
      if (ALIGN_MODE_128 (TYPE_MODE (TREE_TYPE (type))) && align < 128)
	return 128;
    }
  else if (TREE_CODE (type) == COMPLEX_TYPE)
    {
      if (TYPE_MODE (type) == DCmode && align < 64)
	return 64;
      if ((TYPE_MODE (type) == XCmode
	   || TYPE_MODE (type) == TCmode) && align < 128)
	return 128;
    }
  else if ((TREE_CODE (type) == RECORD_TYPE
	    || TREE_CODE (type) == UNION_TYPE
	    || TREE_CODE (type) == QUAL_UNION_TYPE)
	   && TYPE_FIELDS (type))
    {
      if (DECL_MODE (TYPE_FIELDS (type)) == DFmode && align < 64)
	return 64;
      if (ALIGN_MODE_128 (DECL_MODE (TYPE_FIELDS (type))) && align < 128)
	return 128;
    }
  else if (TREE_CODE (type) == REAL_TYPE || TREE_CODE (type) == VECTOR_TYPE
	   || TREE_CODE (type) == INTEGER_TYPE)
    {
      if (TYPE_MODE (type) == DFmode && align < 64)
	return 64;
      if (ALIGN_MODE_128 (TYPE_MODE (type)) && align < 128)
	return 128;
    }
  return align;
}
/* Compute the minimum required alignment for dynamic stack realignment
   purposes for a local variable, parameter or a stack slot.  EXP is
   the data type or decl itself, MODE is its mode and ALIGN is the
   alignment that the object would ordinarily have.  */

unsigned int
ix86_minimum_alignment (tree exp, enum machine_mode mode,
			unsigned int align)
{
  tree type, decl;

  if (exp && DECL_P (exp))
    {
      type = TREE_TYPE (exp);
      decl = exp;
    }
  else
    {
      type = exp;
      decl = NULL;
    }

  if (TARGET_64BIT || align != 64 || ix86_preferred_stack_boundary >= 64)
    return align;

  /* Don't do dynamic stack realignment for long long objects with
     -mpreferred-stack-boundary=2.  */
  if ((mode == DImode || (type && TYPE_MODE (type) == DImode))
      && (!type || !TYPE_USER_ALIGN (type))
      && (!decl || !DECL_USER_ALIGN (decl)))
    return 32;

  return align;
}
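
/* Editorial example: with -m32 -mpreferred-stack-boundary=2, a local

	long long counter;

   only requires 32-bit alignment here, matching the reduced alignment
   chosen by ix86_local_alignment above, so the variable does not force
   dynamic stack realignment on its own.  */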
/* Find a location for the static chain incoming to a nested function.
   This is a register, unless all free registers are used by arguments.  */

static rtx
ix86_static_chain (const_tree fndecl, bool incoming_p)
{
  unsigned regno;

  if (!DECL_STATIC_CHAIN (fndecl))
    return NULL;

  if (TARGET_64BIT)
    {
      /* We always use R10 in 64-bit mode.  */
      regno = R10_REG;
    }
  else
    {
      tree fntype;
      unsigned int ccvt;

      /* By default in 32-bit mode we use ECX to pass the static chain.  */
      regno = CX_REG;

      fntype = TREE_TYPE (fndecl);
      ccvt = ix86_get_callcvt (fntype);
      if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
	{
	  /* Fastcall functions use ecx/edx for arguments, which leaves
	     us with EAX for the static chain.
	     Thiscall functions use ecx for arguments, which also
	     leaves us with EAX for the static chain.  */
	  regno = AX_REG;
	}
      else if (ix86_function_regparm (fntype, fndecl) == 3)
	{
	  /* For regparm 3, we have no free call-clobbered registers in
	     which to store the static chain.  In order to implement this,
	     we have the trampoline push the static chain to the stack.
	     However, we can't push a value below the return address when
	     we call the nested function directly, so we have to use an
	     alternate entry point.  For this we use ESI, and have the
	     alternate entry point push ESI, so that things appear the
	     same once we're executing the nested function.  */
	  if (incoming_p)
	    {
	      if (fndecl == current_function_decl)
		ix86_static_chain_on_stack = true;
	      return gen_frame_mem (SImode,
				    plus_constant (arg_pointer_rtx, -8));
	    }
	  regno = SI_REG;
	}
    }

  return gen_rtx_REG (Pmode, regno);
}
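
/* Editorial example: for a GNU C nested function

	void outer (void)
	{
	  int x = 0;
	  void inner (void) { x++; }
	  inner ();
	}

   the 32-bit default passes INNER's static chain (the address of
   OUTER's frame) in %ecx; if the nested function is fastcall/thiscall
   the chain moves to %eax, and with regparm(3) it goes to the stack via
   the alternate-entry trick described above.  */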
/* Emit RTL insns to initialize the variable parts of a trampoline.
   FNDECL is the decl of the target address; M_TRAMP is a MEM for
   the trampoline, and CHAIN_VALUE is an RTX for the static chain
   to be passed to the target function.  */

static void
ix86_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
{
  rtx mem, fnaddr;
  int opcode;
  int offset = 0;

  fnaddr = XEXP (DECL_RTL (fndecl), 0);

  if (TARGET_64BIT)
    {
      int size;

      /* Load the function address to r11.  Try to load address using
	 the shorter movl instead of movabs.  We may want to support
	 movq for kernel mode, but kernel does not use trampolines at
	 the moment.  */
      if (x86_64_zext_immediate_operand (fnaddr, VOIDmode))
	{
	  fnaddr = copy_to_mode_reg (DImode, fnaddr);

	  mem = adjust_address (m_tramp, HImode, offset);
	  emit_move_insn (mem, gen_int_mode (0xbb41, HImode));

	  mem = adjust_address (m_tramp, SImode, offset + 2);
	  emit_move_insn (mem, gen_lowpart (SImode, fnaddr));
	  offset += 6;
	}
      else
	{
	  mem = adjust_address (m_tramp, HImode, offset);
	  emit_move_insn (mem, gen_int_mode (0xbb49, HImode));

	  mem = adjust_address (m_tramp, DImode, offset + 2);
	  emit_move_insn (mem, fnaddr);

	  offset += 10;
	}

      /* Load static chain using movabs to r10.  Use the
	 shorter movl instead of movabs for x32.  */
      if (TARGET_X32)
	{
	  opcode = 0xba41;
	  size = 6;
	}
      else
	{
	  opcode = 0xba49;
	  size = 10;
	}

      mem = adjust_address (m_tramp, HImode, offset);
      emit_move_insn (mem, gen_int_mode (opcode, HImode));

      mem = adjust_address (m_tramp, ptr_mode, offset + 2);
      emit_move_insn (mem, chain_value);
      offset += size;

      /* Jump to r11; the last (unused) byte is a nop, only there to
	 pad the write out to a single 32-bit store.  */
      mem = adjust_address (m_tramp, SImode, offset);
      emit_move_insn (mem, gen_int_mode (0x90e3ff49, SImode));
      offset += 4;
    }
  else
    {
      rtx disp, chain;

      /* Depending on the static chain location, either load a register
	 with a constant, or push the constant to the stack.  All of the
	 instructions are the same size.  */
      chain = ix86_static_chain (fndecl, true);
      if (REG_P (chain))
	{
	  switch (REGNO (chain))
	    {
	    case AX_REG:
	      opcode = 0xb8; break;
	    case CX_REG:
	      opcode = 0xb9; break;
	    default:
	      gcc_unreachable ();
	    }
	}
      else
	opcode = 0x68;

      mem = adjust_address (m_tramp, QImode, offset);
      emit_move_insn (mem, gen_int_mode (opcode, QImode));

      mem = adjust_address (m_tramp, SImode, offset + 1);
      emit_move_insn (mem, chain_value);
      offset += 5;

      mem = adjust_address (m_tramp, QImode, offset);
      emit_move_insn (mem, gen_int_mode (0xe9, QImode));

      mem = adjust_address (m_tramp, SImode, offset + 1);

      /* Compute offset from the end of the jmp to the target function.
	 In the case in which the trampoline stores the static chain on
	 the stack, we need to skip the first insn which pushes the
	 (call-saved) register static chain; this push is 1 byte.  */
      offset += 5;
      disp = expand_binop (SImode, sub_optab, fnaddr,
			   plus_constant (XEXP (m_tramp, 0),
					  offset - (MEM_P (chain) ? 1 : 0)),
			   NULL_RTX, 1, OPTAB_DIRECT);
      emit_move_insn (mem, disp);
    }

  gcc_assert (offset <= TRAMPOLINE_SIZE);

#ifdef HAVE_ENABLE_EXECUTE_STACK
#ifdef CHECK_EXECUTE_STACK_ENABLED
  if (CHECK_EXECUTE_STACK_ENABLED)
#endif
  emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__enable_execute_stack"),
		     LCT_NORMAL, VOIDmode, 1, XEXP (m_tramp, 0), Pmode);
#endif
}
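
/* Editorial decode of the 64-bit trampoline emitted above (movabs form;
   the movl form is shorter when the immediates fit in 32 bits):

	49 BB <8-byte fnaddr>	movabs $fnaddr, %r11
	49 BA <8-byte chain>	movabs $chain,  %r10
	49 FF E3		jmp    *%r11
	90			nop (pads the final 32-bit store)

   which matches the little-endian constants 0xbb49, 0xba49 and
   0x90e3ff49 used in the code.  */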
/* The following file contains several enumerations and data structures
   built from the definitions in i386-builtin-types.def.  */

#include "i386-builtin-types.inc"

/* Table for the ix86 builtin non-function types.  */
static GTY(()) tree ix86_builtin_type_tab[(int) IX86_BT_LAST_CPTR + 1];

/* Retrieve an element from the above table, building some of
   the types lazily.  */

static tree
ix86_get_builtin_type (enum ix86_builtin_type tcode)
{
  unsigned int index;
  tree type, itype;

  gcc_assert ((unsigned) tcode < ARRAY_SIZE (ix86_builtin_type_tab));

  type = ix86_builtin_type_tab[(int) tcode];
  if (type != NULL)
    return type;

  gcc_assert (tcode > IX86_BT_LAST_PRIM);
  if (tcode <= IX86_BT_LAST_VECT)
    {
      enum machine_mode mode;

      index = tcode - IX86_BT_LAST_PRIM - 1;
      itype = ix86_get_builtin_type (ix86_builtin_type_vect_base[index]);
      mode = ix86_builtin_type_vect_mode[index];

      type = build_vector_type_for_mode (itype, mode);
    }
  else
    {
      int quals;

      index = tcode - IX86_BT_LAST_VECT - 1;
      if (tcode <= IX86_BT_LAST_PTR)
	quals = TYPE_UNQUALIFIED;
      else
	quals = TYPE_QUAL_CONST;

      itype = ix86_get_builtin_type (ix86_builtin_type_ptr_base[index]);
      if (quals != TYPE_UNQUALIFIED)
	itype = build_qualified_type (itype, quals);

      type = build_pointer_type (itype);
    }

  ix86_builtin_type_tab[(int) tcode] = type;
  return type;
}
/* Table for the ix86 builtin function types.  */
static GTY(()) tree ix86_builtin_func_type_tab[(int) IX86_BT_LAST_ALIAS + 1];

/* Retrieve an element from the above table, building some of
   the types lazily.  */

static tree
ix86_get_builtin_func_type (enum ix86_builtin_func_type tcode)
{
  tree type;

  gcc_assert ((unsigned) tcode < ARRAY_SIZE (ix86_builtin_func_type_tab));

  type = ix86_builtin_func_type_tab[(int) tcode];
  if (type != NULL)
    return type;

  if (tcode <= IX86_BT_LAST_FUNC)
    {
      unsigned start = ix86_builtin_func_start[(int) tcode];
      unsigned after = ix86_builtin_func_start[(int) tcode + 1];
      tree rtype, atype, args = void_list_node;
      unsigned i;

      rtype = ix86_get_builtin_type (ix86_builtin_func_args[start]);
      for (i = after - 1; i > start; --i)
	{
	  atype = ix86_get_builtin_type (ix86_builtin_func_args[i]);
	  args = tree_cons (NULL, atype, args);
	}

      type = build_function_type (rtype, args);
    }
  else
    {
      unsigned index = tcode - IX86_BT_LAST_FUNC - 1;
      enum ix86_builtin_func_type icode;

      icode = ix86_builtin_func_alias_base[index];
      type = ix86_get_builtin_func_type (icode);
    }

  ix86_builtin_func_type_tab[(int) tcode] = type;
  return type;
}
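
/* Editorial example: for a function-type code such as
   V4SF_FTYPE_V4SF_V4SF, ix86_builtin_func_args holds the sequence
   { V4SF, V4SF, V4SF }; the first element becomes the return type and
   the loop above walks the operands backwards, building the TREE_LIST
   (V4SF, (V4SF, void_list_node)) that build_function_type expects.
   (The type-code name is illustrative; the real tables come from
   i386-builtin-types.inc.)  */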
/* Codes for all the SSE/MMX builtins.  */
enum ix86_builtins
{
  IX86_BUILTIN_ADDPS,
  IX86_BUILTIN_ADDSS,
  IX86_BUILTIN_DIVPS,
  IX86_BUILTIN_DIVSS,
  IX86_BUILTIN_MULPS,
  IX86_BUILTIN_MULSS,
  IX86_BUILTIN_SUBPS,
  IX86_BUILTIN_SUBSS,

  IX86_BUILTIN_CMPEQPS,
  IX86_BUILTIN_CMPLTPS,
  IX86_BUILTIN_CMPLEPS,
  IX86_BUILTIN_CMPGTPS,
  IX86_BUILTIN_CMPGEPS,
  IX86_BUILTIN_CMPNEQPS,
  IX86_BUILTIN_CMPNLTPS,
  IX86_BUILTIN_CMPNLEPS,
  IX86_BUILTIN_CMPNGTPS,
  IX86_BUILTIN_CMPNGEPS,
  IX86_BUILTIN_CMPORDPS,
  IX86_BUILTIN_CMPUNORDPS,
  IX86_BUILTIN_CMPEQSS,
  IX86_BUILTIN_CMPLTSS,
  IX86_BUILTIN_CMPLESS,
  IX86_BUILTIN_CMPNEQSS,
  IX86_BUILTIN_CMPNLTSS,
  IX86_BUILTIN_CMPNLESS,
  IX86_BUILTIN_CMPNGTSS,
  IX86_BUILTIN_CMPNGESS,
  IX86_BUILTIN_CMPORDSS,
  IX86_BUILTIN_CMPUNORDSS,

  IX86_BUILTIN_COMIEQSS,
  IX86_BUILTIN_COMILTSS,
  IX86_BUILTIN_COMILESS,
  IX86_BUILTIN_COMIGTSS,
  IX86_BUILTIN_COMIGESS,
  IX86_BUILTIN_COMINEQSS,
  IX86_BUILTIN_UCOMIEQSS,
  IX86_BUILTIN_UCOMILTSS,
  IX86_BUILTIN_UCOMILESS,
  IX86_BUILTIN_UCOMIGTSS,
  IX86_BUILTIN_UCOMIGESS,
  IX86_BUILTIN_UCOMINEQSS,

  IX86_BUILTIN_CVTPI2PS,
  IX86_BUILTIN_CVTPS2PI,
  IX86_BUILTIN_CVTSI2SS,
  IX86_BUILTIN_CVTSI642SS,
  IX86_BUILTIN_CVTSS2SI,
  IX86_BUILTIN_CVTSS2SI64,
  IX86_BUILTIN_CVTTPS2PI,
  IX86_BUILTIN_CVTTSS2SI,
  IX86_BUILTIN_CVTTSS2SI64,

  IX86_BUILTIN_MAXPS,
  IX86_BUILTIN_MAXSS,
  IX86_BUILTIN_MINPS,
  IX86_BUILTIN_MINSS,

  IX86_BUILTIN_LOADUPS,
  IX86_BUILTIN_STOREUPS,
  IX86_BUILTIN_MOVSS,

  IX86_BUILTIN_MOVHLPS,
  IX86_BUILTIN_MOVLHPS,
  IX86_BUILTIN_LOADHPS,
  IX86_BUILTIN_LOADLPS,
  IX86_BUILTIN_STOREHPS,
  IX86_BUILTIN_STORELPS,

  IX86_BUILTIN_MASKMOVQ,
  IX86_BUILTIN_MOVMSKPS,
  IX86_BUILTIN_PMOVMSKB,

  IX86_BUILTIN_MOVNTPS,
  IX86_BUILTIN_MOVNTQ,

  IX86_BUILTIN_LOADDQU,
  IX86_BUILTIN_STOREDQU,

  IX86_BUILTIN_PACKSSWB,
  IX86_BUILTIN_PACKSSDW,
  IX86_BUILTIN_PACKUSWB,

  IX86_BUILTIN_PADDB,
  IX86_BUILTIN_PADDW,
  IX86_BUILTIN_PADDD,
  IX86_BUILTIN_PADDQ,
  IX86_BUILTIN_PADDSB,
  IX86_BUILTIN_PADDSW,
  IX86_BUILTIN_PADDUSB,
  IX86_BUILTIN_PADDUSW,
  IX86_BUILTIN_PSUBB,
  IX86_BUILTIN_PSUBW,
  IX86_BUILTIN_PSUBD,
  IX86_BUILTIN_PSUBQ,
  IX86_BUILTIN_PSUBSB,
  IX86_BUILTIN_PSUBSW,
  IX86_BUILTIN_PSUBUSB,
  IX86_BUILTIN_PSUBUSW,

  IX86_BUILTIN_PAND,
  IX86_BUILTIN_PANDN,
  IX86_BUILTIN_POR,
  IX86_BUILTIN_PXOR,

  IX86_BUILTIN_PAVGB,
  IX86_BUILTIN_PAVGW,

  IX86_BUILTIN_PCMPEQB,
  IX86_BUILTIN_PCMPEQW,
  IX86_BUILTIN_PCMPEQD,
  IX86_BUILTIN_PCMPGTB,
  IX86_BUILTIN_PCMPGTW,
  IX86_BUILTIN_PCMPGTD,

  IX86_BUILTIN_PMADDWD,

  IX86_BUILTIN_PMAXSW,
  IX86_BUILTIN_PMAXUB,
  IX86_BUILTIN_PMINSW,
  IX86_BUILTIN_PMINUB,

  IX86_BUILTIN_PMULHUW,
  IX86_BUILTIN_PMULHW,
  IX86_BUILTIN_PMULLW,

  IX86_BUILTIN_PSADBW,
  IX86_BUILTIN_PSHUFW,

  IX86_BUILTIN_PSLLW,
  IX86_BUILTIN_PSLLD,
  IX86_BUILTIN_PSLLQ,
  IX86_BUILTIN_PSRAW,
  IX86_BUILTIN_PSRAD,
  IX86_BUILTIN_PSRLW,
  IX86_BUILTIN_PSRLD,
  IX86_BUILTIN_PSRLQ,
  IX86_BUILTIN_PSLLWI,
  IX86_BUILTIN_PSLLDI,
  IX86_BUILTIN_PSLLQI,
  IX86_BUILTIN_PSRAWI,
  IX86_BUILTIN_PSRADI,
  IX86_BUILTIN_PSRLWI,
  IX86_BUILTIN_PSRLDI,
  IX86_BUILTIN_PSRLQI,

  IX86_BUILTIN_PUNPCKHBW,
  IX86_BUILTIN_PUNPCKHWD,
  IX86_BUILTIN_PUNPCKHDQ,
  IX86_BUILTIN_PUNPCKLBW,
  IX86_BUILTIN_PUNPCKLWD,
  IX86_BUILTIN_PUNPCKLDQ,

  IX86_BUILTIN_SHUFPS,

  IX86_BUILTIN_RCPPS,
  IX86_BUILTIN_RCPSS,
  IX86_BUILTIN_RSQRTPS,
  IX86_BUILTIN_RSQRTPS_NR,
  IX86_BUILTIN_RSQRTSS,
  IX86_BUILTIN_RSQRTF,
  IX86_BUILTIN_SQRTPS,
  IX86_BUILTIN_SQRTPS_NR,
  IX86_BUILTIN_SQRTSS,

  IX86_BUILTIN_UNPCKHPS,
  IX86_BUILTIN_UNPCKLPS,

  IX86_BUILTIN_ANDPS,
  IX86_BUILTIN_ANDNPS,
  IX86_BUILTIN_ORPS,
  IX86_BUILTIN_XORPS,

  IX86_BUILTIN_LDMXCSR,
  IX86_BUILTIN_STMXCSR,
  IX86_BUILTIN_SFENCE,

  /* 3DNow! Original */
  IX86_BUILTIN_FEMMS,
  IX86_BUILTIN_PAVGUSB,
  IX86_BUILTIN_PF2ID,
  IX86_BUILTIN_PFACC,
  IX86_BUILTIN_PFADD,
  IX86_BUILTIN_PFCMPEQ,
  IX86_BUILTIN_PFCMPGE,
  IX86_BUILTIN_PFCMPGT,
  IX86_BUILTIN_PFMAX,
  IX86_BUILTIN_PFMIN,
  IX86_BUILTIN_PFMUL,
  IX86_BUILTIN_PFRCP,
  IX86_BUILTIN_PFRCPIT1,
  IX86_BUILTIN_PFRCPIT2,
  IX86_BUILTIN_PFRSQIT1,
  IX86_BUILTIN_PFRSQRT,
  IX86_BUILTIN_PFSUB,
  IX86_BUILTIN_PFSUBR,
  IX86_BUILTIN_PI2FD,
  IX86_BUILTIN_PMULHRW,

  /* 3DNow! Athlon Extensions */
  IX86_BUILTIN_PF2IW,
  IX86_BUILTIN_PFNACC,
  IX86_BUILTIN_PFPNACC,
  IX86_BUILTIN_PI2FW,
  IX86_BUILTIN_PSWAPDSI,
  IX86_BUILTIN_PSWAPDSF,

  /* SSE2 */
  IX86_BUILTIN_ADDPD,
  IX86_BUILTIN_ADDSD,
  IX86_BUILTIN_DIVPD,
  IX86_BUILTIN_DIVSD,
  IX86_BUILTIN_MULPD,
  IX86_BUILTIN_MULSD,
  IX86_BUILTIN_SUBPD,
  IX86_BUILTIN_SUBSD,

  IX86_BUILTIN_CMPEQPD,
  IX86_BUILTIN_CMPLTPD,
  IX86_BUILTIN_CMPLEPD,
  IX86_BUILTIN_CMPGTPD,
  IX86_BUILTIN_CMPGEPD,
  IX86_BUILTIN_CMPNEQPD,
  IX86_BUILTIN_CMPNLTPD,
  IX86_BUILTIN_CMPNLEPD,
  IX86_BUILTIN_CMPNGTPD,
  IX86_BUILTIN_CMPNGEPD,
  IX86_BUILTIN_CMPORDPD,
  IX86_BUILTIN_CMPUNORDPD,
  IX86_BUILTIN_CMPEQSD,
  IX86_BUILTIN_CMPLTSD,
  IX86_BUILTIN_CMPLESD,
  IX86_BUILTIN_CMPNEQSD,
  IX86_BUILTIN_CMPNLTSD,
  IX86_BUILTIN_CMPNLESD,
  IX86_BUILTIN_CMPORDSD,
  IX86_BUILTIN_CMPUNORDSD,

  IX86_BUILTIN_COMIEQSD,
  IX86_BUILTIN_COMILTSD,
  IX86_BUILTIN_COMILESD,
  IX86_BUILTIN_COMIGTSD,
  IX86_BUILTIN_COMIGESD,
  IX86_BUILTIN_COMINEQSD,
  IX86_BUILTIN_UCOMIEQSD,
  IX86_BUILTIN_UCOMILTSD,
  IX86_BUILTIN_UCOMILESD,
  IX86_BUILTIN_UCOMIGTSD,
  IX86_BUILTIN_UCOMIGESD,
  IX86_BUILTIN_UCOMINEQSD,

  IX86_BUILTIN_MAXPD,
  IX86_BUILTIN_MAXSD,
  IX86_BUILTIN_MINPD,
  IX86_BUILTIN_MINSD,

  IX86_BUILTIN_ANDPD,
  IX86_BUILTIN_ANDNPD,
  IX86_BUILTIN_ORPD,
  IX86_BUILTIN_XORPD,

  IX86_BUILTIN_SQRTPD,
  IX86_BUILTIN_SQRTSD,

  IX86_BUILTIN_UNPCKHPD,
  IX86_BUILTIN_UNPCKLPD,

  IX86_BUILTIN_SHUFPD,

  IX86_BUILTIN_LOADUPD,
  IX86_BUILTIN_STOREUPD,
  IX86_BUILTIN_MOVSD,

  IX86_BUILTIN_LOADHPD,
  IX86_BUILTIN_LOADLPD,

  IX86_BUILTIN_CVTDQ2PD,
  IX86_BUILTIN_CVTDQ2PS,

  IX86_BUILTIN_CVTPD2DQ,
  IX86_BUILTIN_CVTPD2PI,
  IX86_BUILTIN_CVTPD2PS,
  IX86_BUILTIN_CVTTPD2DQ,
  IX86_BUILTIN_CVTTPD2PI,

  IX86_BUILTIN_CVTPI2PD,
  IX86_BUILTIN_CVTSI2SD,
  IX86_BUILTIN_CVTSI642SD,

  IX86_BUILTIN_CVTSD2SI,
  IX86_BUILTIN_CVTSD2SI64,
  IX86_BUILTIN_CVTSD2SS,
  IX86_BUILTIN_CVTSS2SD,
  IX86_BUILTIN_CVTTSD2SI,
  IX86_BUILTIN_CVTTSD2SI64,

  IX86_BUILTIN_CVTPS2DQ,
  IX86_BUILTIN_CVTPS2PD,
  IX86_BUILTIN_CVTTPS2DQ,

  IX86_BUILTIN_MOVNTI,
  IX86_BUILTIN_MOVNTPD,
  IX86_BUILTIN_MOVNTDQ,

  IX86_BUILTIN_MOVQ128,

  IX86_BUILTIN_MASKMOVDQU,
  IX86_BUILTIN_MOVMSKPD,
  IX86_BUILTIN_PMOVMSKB128,

  IX86_BUILTIN_PACKSSWB128,
  IX86_BUILTIN_PACKSSDW128,
  IX86_BUILTIN_PACKUSWB128,

  IX86_BUILTIN_PADDB128,
  IX86_BUILTIN_PADDW128,
  IX86_BUILTIN_PADDD128,
  IX86_BUILTIN_PADDQ128,
  IX86_BUILTIN_PADDSB128,
  IX86_BUILTIN_PADDSW128,
  IX86_BUILTIN_PADDUSB128,
  IX86_BUILTIN_PADDUSW128,
  IX86_BUILTIN_PSUBB128,
  IX86_BUILTIN_PSUBW128,
  IX86_BUILTIN_PSUBD128,
  IX86_BUILTIN_PSUBQ128,
  IX86_BUILTIN_PSUBSB128,
  IX86_BUILTIN_PSUBSW128,
  IX86_BUILTIN_PSUBUSB128,
  IX86_BUILTIN_PSUBUSW128,

  IX86_BUILTIN_PAND128,
  IX86_BUILTIN_PANDN128,
  IX86_BUILTIN_POR128,
  IX86_BUILTIN_PXOR128,

  IX86_BUILTIN_PAVGB128,
  IX86_BUILTIN_PAVGW128,

  IX86_BUILTIN_PCMPEQB128,
  IX86_BUILTIN_PCMPEQW128,
  IX86_BUILTIN_PCMPEQD128,
  IX86_BUILTIN_PCMPGTB128,
  IX86_BUILTIN_PCMPGTW128,
  IX86_BUILTIN_PCMPGTD128,

  IX86_BUILTIN_PMADDWD128,

  IX86_BUILTIN_PMAXSW128,
  IX86_BUILTIN_PMAXUB128,
  IX86_BUILTIN_PMINSW128,
  IX86_BUILTIN_PMINUB128,

  IX86_BUILTIN_PMULUDQ,
  IX86_BUILTIN_PMULUDQ128,
  IX86_BUILTIN_PMULHUW128,
  IX86_BUILTIN_PMULHW128,
  IX86_BUILTIN_PMULLW128,

  IX86_BUILTIN_PSADBW128,
  IX86_BUILTIN_PSHUFHW,
  IX86_BUILTIN_PSHUFLW,
  IX86_BUILTIN_PSHUFD,

  IX86_BUILTIN_PSLLDQI128,
  IX86_BUILTIN_PSLLWI128,
  IX86_BUILTIN_PSLLDI128,
  IX86_BUILTIN_PSLLQI128,
  IX86_BUILTIN_PSRAWI128,
  IX86_BUILTIN_PSRADI128,
  IX86_BUILTIN_PSRLDQI128,
  IX86_BUILTIN_PSRLWI128,
  IX86_BUILTIN_PSRLDI128,
  IX86_BUILTIN_PSRLQI128,

  IX86_BUILTIN_PSLLDQ128,
  IX86_BUILTIN_PSLLW128,
  IX86_BUILTIN_PSLLD128,
  IX86_BUILTIN_PSLLQ128,
  IX86_BUILTIN_PSRAW128,
  IX86_BUILTIN_PSRAD128,
  IX86_BUILTIN_PSRLW128,
  IX86_BUILTIN_PSRLD128,
  IX86_BUILTIN_PSRLQ128,

  IX86_BUILTIN_PUNPCKHBW128,
  IX86_BUILTIN_PUNPCKHWD128,
  IX86_BUILTIN_PUNPCKHDQ128,
  IX86_BUILTIN_PUNPCKHQDQ128,
  IX86_BUILTIN_PUNPCKLBW128,
  IX86_BUILTIN_PUNPCKLWD128,
  IX86_BUILTIN_PUNPCKLDQ128,
  IX86_BUILTIN_PUNPCKLQDQ128,

  IX86_BUILTIN_CLFLUSH,
  IX86_BUILTIN_MFENCE,
  IX86_BUILTIN_LFENCE,
  IX86_BUILTIN_PAUSE,

  IX86_BUILTIN_BSRSI,
  IX86_BUILTIN_BSRDI,
  IX86_BUILTIN_RDPMC,
  IX86_BUILTIN_RDTSC,
  IX86_BUILTIN_RDTSCP,
  IX86_BUILTIN_ROLQI,
  IX86_BUILTIN_ROLHI,
  IX86_BUILTIN_RORQI,
  IX86_BUILTIN_RORHI,

  /* SSE3 */
  IX86_BUILTIN_ADDSUBPS,
  IX86_BUILTIN_HADDPS,
  IX86_BUILTIN_HSUBPS,
  IX86_BUILTIN_MOVSHDUP,
  IX86_BUILTIN_MOVSLDUP,
  IX86_BUILTIN_ADDSUBPD,
  IX86_BUILTIN_HADDPD,
  IX86_BUILTIN_HSUBPD,
  IX86_BUILTIN_LDDQU,

  IX86_BUILTIN_MONITOR,
  IX86_BUILTIN_MWAIT,

  /* SSSE3 */
  IX86_BUILTIN_PHADDW,
  IX86_BUILTIN_PHADDD,
  IX86_BUILTIN_PHADDSW,
  IX86_BUILTIN_PHSUBW,
  IX86_BUILTIN_PHSUBD,
  IX86_BUILTIN_PHSUBSW,
  IX86_BUILTIN_PMADDUBSW,
  IX86_BUILTIN_PMULHRSW,
  IX86_BUILTIN_PSHUFB,
  IX86_BUILTIN_PSIGNB,
  IX86_BUILTIN_PSIGNW,
  IX86_BUILTIN_PSIGND,
  IX86_BUILTIN_PALIGNR,
  IX86_BUILTIN_PABSB,
  IX86_BUILTIN_PABSW,
  IX86_BUILTIN_PABSD,

  IX86_BUILTIN_PHADDW128,
  IX86_BUILTIN_PHADDD128,
  IX86_BUILTIN_PHADDSW128,
  IX86_BUILTIN_PHSUBW128,
  IX86_BUILTIN_PHSUBD128,
  IX86_BUILTIN_PHSUBSW128,
  IX86_BUILTIN_PMADDUBSW128,
  IX86_BUILTIN_PMULHRSW128,
  IX86_BUILTIN_PSHUFB128,
  IX86_BUILTIN_PSIGNB128,
  IX86_BUILTIN_PSIGNW128,
  IX86_BUILTIN_PSIGND128,
  IX86_BUILTIN_PALIGNR128,
  IX86_BUILTIN_PABSB128,
  IX86_BUILTIN_PABSW128,
  IX86_BUILTIN_PABSD128,

  /* AMDFAM10 - SSE4A New Instructions.  */
  IX86_BUILTIN_MOVNTSD,
  IX86_BUILTIN_MOVNTSS,
  IX86_BUILTIN_EXTRQI,
  IX86_BUILTIN_EXTRQ,
  IX86_BUILTIN_INSERTQI,
  IX86_BUILTIN_INSERTQ,

  /* SSE4.1 */
  IX86_BUILTIN_BLENDPD,
  IX86_BUILTIN_BLENDPS,
  IX86_BUILTIN_BLENDVPD,
  IX86_BUILTIN_BLENDVPS,
  IX86_BUILTIN_PBLENDVB128,
  IX86_BUILTIN_PBLENDW128,

  IX86_BUILTIN_DPPD,
  IX86_BUILTIN_DPPS,

  IX86_BUILTIN_INSERTPS128,

  IX86_BUILTIN_MOVNTDQA,
  IX86_BUILTIN_MPSADBW128,
  IX86_BUILTIN_PACKUSDW128,
  IX86_BUILTIN_PCMPEQQ,
  IX86_BUILTIN_PHMINPOSUW128,

  IX86_BUILTIN_PMAXSB128,
  IX86_BUILTIN_PMAXSD128,
  IX86_BUILTIN_PMAXUD128,
  IX86_BUILTIN_PMAXUW128,

  IX86_BUILTIN_PMINSB128,
  IX86_BUILTIN_PMINSD128,
  IX86_BUILTIN_PMINUD128,
  IX86_BUILTIN_PMINUW128,

  IX86_BUILTIN_PMOVSXBW128,
  IX86_BUILTIN_PMOVSXBD128,
  IX86_BUILTIN_PMOVSXBQ128,
  IX86_BUILTIN_PMOVSXWD128,
  IX86_BUILTIN_PMOVSXWQ128,
  IX86_BUILTIN_PMOVSXDQ128,

  IX86_BUILTIN_PMOVZXBW128,
  IX86_BUILTIN_PMOVZXBD128,
  IX86_BUILTIN_PMOVZXBQ128,
  IX86_BUILTIN_PMOVZXWD128,
  IX86_BUILTIN_PMOVZXWQ128,
  IX86_BUILTIN_PMOVZXDQ128,

  IX86_BUILTIN_PMULDQ128,
  IX86_BUILTIN_PMULLD128,

  IX86_BUILTIN_ROUNDPD,
  IX86_BUILTIN_ROUNDPS,
  IX86_BUILTIN_ROUNDSD,
  IX86_BUILTIN_ROUNDSS,

  IX86_BUILTIN_FLOORPD,
  IX86_BUILTIN_CEILPD,
  IX86_BUILTIN_TRUNCPD,
  IX86_BUILTIN_RINTPD,
  IX86_BUILTIN_FLOORPS,
  IX86_BUILTIN_CEILPS,
  IX86_BUILTIN_TRUNCPS,
  IX86_BUILTIN_RINTPS,

  IX86_BUILTIN_PTESTZ,
  IX86_BUILTIN_PTESTC,
  IX86_BUILTIN_PTESTNZC,

  IX86_BUILTIN_VEC_INIT_V2SI,
  IX86_BUILTIN_VEC_INIT_V4HI,
  IX86_BUILTIN_VEC_INIT_V8QI,
  IX86_BUILTIN_VEC_EXT_V2DF,
  IX86_BUILTIN_VEC_EXT_V2DI,
  IX86_BUILTIN_VEC_EXT_V4SF,
  IX86_BUILTIN_VEC_EXT_V4SI,
  IX86_BUILTIN_VEC_EXT_V8HI,
  IX86_BUILTIN_VEC_EXT_V2SI,
  IX86_BUILTIN_VEC_EXT_V4HI,
  IX86_BUILTIN_VEC_EXT_V16QI,
  IX86_BUILTIN_VEC_SET_V2DI,
  IX86_BUILTIN_VEC_SET_V4SF,
  IX86_BUILTIN_VEC_SET_V4SI,
  IX86_BUILTIN_VEC_SET_V8HI,
  IX86_BUILTIN_VEC_SET_V4HI,
  IX86_BUILTIN_VEC_SET_V16QI,

  IX86_BUILTIN_VEC_PACK_SFIX,

  /* SSE4.2 */
  IX86_BUILTIN_CRC32QI,
  IX86_BUILTIN_CRC32HI,
  IX86_BUILTIN_CRC32SI,
  IX86_BUILTIN_CRC32DI,

  IX86_BUILTIN_PCMPESTRI128,
  IX86_BUILTIN_PCMPESTRM128,
  IX86_BUILTIN_PCMPESTRA128,
  IX86_BUILTIN_PCMPESTRC128,
  IX86_BUILTIN_PCMPESTRO128,
  IX86_BUILTIN_PCMPESTRS128,
  IX86_BUILTIN_PCMPESTRZ128,
  IX86_BUILTIN_PCMPISTRI128,
  IX86_BUILTIN_PCMPISTRM128,
  IX86_BUILTIN_PCMPISTRA128,
  IX86_BUILTIN_PCMPISTRC128,
  IX86_BUILTIN_PCMPISTRO128,
  IX86_BUILTIN_PCMPISTRS128,
  IX86_BUILTIN_PCMPISTRZ128,

  IX86_BUILTIN_PCMPGTQ,

  /* AES instructions */
  IX86_BUILTIN_AESENC128,
  IX86_BUILTIN_AESENCLAST128,
  IX86_BUILTIN_AESDEC128,
  IX86_BUILTIN_AESDECLAST128,
  IX86_BUILTIN_AESIMC128,
  IX86_BUILTIN_AESKEYGENASSIST128,

  /* PCLMUL instruction */
  IX86_BUILTIN_PCLMULQDQ128,

  /* AVX */
  IX86_BUILTIN_ADDPD256,
  IX86_BUILTIN_ADDPS256,
  IX86_BUILTIN_ADDSUBPD256,
  IX86_BUILTIN_ADDSUBPS256,
  IX86_BUILTIN_ANDPD256,
  IX86_BUILTIN_ANDPS256,
  IX86_BUILTIN_ANDNPD256,
  IX86_BUILTIN_ANDNPS256,
  IX86_BUILTIN_BLENDPD256,
  IX86_BUILTIN_BLENDPS256,
  IX86_BUILTIN_BLENDVPD256,
  IX86_BUILTIN_BLENDVPS256,
  IX86_BUILTIN_DIVPD256,
  IX86_BUILTIN_DIVPS256,
  IX86_BUILTIN_DPPS256,
  IX86_BUILTIN_HADDPD256,
  IX86_BUILTIN_HADDPS256,
  IX86_BUILTIN_HSUBPD256,
  IX86_BUILTIN_HSUBPS256,
  IX86_BUILTIN_MAXPD256,
  IX86_BUILTIN_MAXPS256,
  IX86_BUILTIN_MINPD256,
  IX86_BUILTIN_MINPS256,
  IX86_BUILTIN_MULPD256,
  IX86_BUILTIN_MULPS256,
  IX86_BUILTIN_ORPD256,
  IX86_BUILTIN_ORPS256,
  IX86_BUILTIN_SHUFPD256,
  IX86_BUILTIN_SHUFPS256,
  IX86_BUILTIN_SUBPD256,
  IX86_BUILTIN_SUBPS256,
  IX86_BUILTIN_XORPD256,
  IX86_BUILTIN_XORPS256,
  IX86_BUILTIN_CMPSD,
  IX86_BUILTIN_CMPSS,
  IX86_BUILTIN_CMPPD,
  IX86_BUILTIN_CMPPS,
  IX86_BUILTIN_CMPPD256,
  IX86_BUILTIN_CMPPS256,
  IX86_BUILTIN_CVTDQ2PD256,
  IX86_BUILTIN_CVTDQ2PS256,
  IX86_BUILTIN_CVTPD2PS256,
  IX86_BUILTIN_CVTPS2DQ256,
  IX86_BUILTIN_CVTPS2PD256,
  IX86_BUILTIN_CVTTPD2DQ256,
  IX86_BUILTIN_CVTPD2DQ256,
  IX86_BUILTIN_CVTTPS2DQ256,
  IX86_BUILTIN_EXTRACTF128PD256,
  IX86_BUILTIN_EXTRACTF128PS256,
  IX86_BUILTIN_EXTRACTF128SI256,
  IX86_BUILTIN_VZEROALL,
  IX86_BUILTIN_VZEROUPPER,
  IX86_BUILTIN_VPERMILVARPD,
  IX86_BUILTIN_VPERMILVARPS,
  IX86_BUILTIN_VPERMILVARPD256,
  IX86_BUILTIN_VPERMILVARPS256,
  IX86_BUILTIN_VPERMILPD,
  IX86_BUILTIN_VPERMILPS,
  IX86_BUILTIN_VPERMILPD256,
  IX86_BUILTIN_VPERMILPS256,
  IX86_BUILTIN_VPERMIL2PD,
  IX86_BUILTIN_VPERMIL2PS,
  IX86_BUILTIN_VPERMIL2PD256,
  IX86_BUILTIN_VPERMIL2PS256,
  IX86_BUILTIN_VPERM2F128PD256,
  IX86_BUILTIN_VPERM2F128PS256,
  IX86_BUILTIN_VPERM2F128SI256,
  IX86_BUILTIN_VBROADCASTSS,
  IX86_BUILTIN_VBROADCASTSD256,
  IX86_BUILTIN_VBROADCASTSS256,
  IX86_BUILTIN_VBROADCASTPD256,
  IX86_BUILTIN_VBROADCASTPS256,
  IX86_BUILTIN_VINSERTF128PD256,
  IX86_BUILTIN_VINSERTF128PS256,
  IX86_BUILTIN_VINSERTF128SI256,
  IX86_BUILTIN_LOADUPD256,
  IX86_BUILTIN_LOADUPS256,
  IX86_BUILTIN_STOREUPD256,
  IX86_BUILTIN_STOREUPS256,
  IX86_BUILTIN_LDDQU256,
  IX86_BUILTIN_MOVNTDQ256,
  IX86_BUILTIN_MOVNTPD256,
  IX86_BUILTIN_MOVNTPS256,
  IX86_BUILTIN_LOADDQU256,
  IX86_BUILTIN_STOREDQU256,
  IX86_BUILTIN_MASKLOADPD,
  IX86_BUILTIN_MASKLOADPS,
  IX86_BUILTIN_MASKSTOREPD,
  IX86_BUILTIN_MASKSTOREPS,
  IX86_BUILTIN_MASKLOADPD256,
  IX86_BUILTIN_MASKLOADPS256,
  IX86_BUILTIN_MASKSTOREPD256,
  IX86_BUILTIN_MASKSTOREPS256,
  IX86_BUILTIN_MOVSHDUP256,
  IX86_BUILTIN_MOVSLDUP256,
  IX86_BUILTIN_MOVDDUP256,

  IX86_BUILTIN_SQRTPD256,
  IX86_BUILTIN_SQRTPS256,
  IX86_BUILTIN_SQRTPS_NR256,
  IX86_BUILTIN_RSQRTPS256,
  IX86_BUILTIN_RSQRTPS_NR256,

  IX86_BUILTIN_RCPPS256,

  IX86_BUILTIN_ROUNDPD256,
  IX86_BUILTIN_ROUNDPS256,

  IX86_BUILTIN_FLOORPD256,
  IX86_BUILTIN_CEILPD256,
  IX86_BUILTIN_TRUNCPD256,
  IX86_BUILTIN_RINTPD256,
  IX86_BUILTIN_FLOORPS256,
  IX86_BUILTIN_CEILPS256,
  IX86_BUILTIN_TRUNCPS256,
  IX86_BUILTIN_RINTPS256,

  IX86_BUILTIN_UNPCKHPD256,
  IX86_BUILTIN_UNPCKLPD256,
  IX86_BUILTIN_UNPCKHPS256,
  IX86_BUILTIN_UNPCKLPS256,

  IX86_BUILTIN_SI256_SI,
  IX86_BUILTIN_PS256_PS,
  IX86_BUILTIN_PD256_PD,
  IX86_BUILTIN_SI_SI256,
  IX86_BUILTIN_PS_PS256,
  IX86_BUILTIN_PD_PD256,

  IX86_BUILTIN_VTESTZPD,
  IX86_BUILTIN_VTESTCPD,
  IX86_BUILTIN_VTESTNZCPD,
  IX86_BUILTIN_VTESTZPS,
  IX86_BUILTIN_VTESTCPS,
  IX86_BUILTIN_VTESTNZCPS,
  IX86_BUILTIN_VTESTZPD256,
  IX86_BUILTIN_VTESTCPD256,
  IX86_BUILTIN_VTESTNZCPD256,
  IX86_BUILTIN_VTESTZPS256,
  IX86_BUILTIN_VTESTCPS256,
  IX86_BUILTIN_VTESTNZCPS256,
  IX86_BUILTIN_PTESTZ256,
  IX86_BUILTIN_PTESTC256,
  IX86_BUILTIN_PTESTNZC256,

  IX86_BUILTIN_MOVMSKPD256,
  IX86_BUILTIN_MOVMSKPS256,

  /* TFmode support builtins.  */
  IX86_BUILTIN_INFQ,
  IX86_BUILTIN_HUGE_VALQ,
  IX86_BUILTIN_FABSQ,
  IX86_BUILTIN_COPYSIGNQ,

  /* Vectorizer support builtins.  */
  IX86_BUILTIN_CPYSGNPS,
  IX86_BUILTIN_CPYSGNPD,
  IX86_BUILTIN_CPYSGNPS256,
  IX86_BUILTIN_CPYSGNPD256,

  IX86_BUILTIN_CVTUDQ2PS,

  IX86_BUILTIN_VEC_PERM_V2DF,
  IX86_BUILTIN_VEC_PERM_V4SF,
  IX86_BUILTIN_VEC_PERM_V2DI,
  IX86_BUILTIN_VEC_PERM_V4SI,
  IX86_BUILTIN_VEC_PERM_V8HI,
  IX86_BUILTIN_VEC_PERM_V16QI,
  IX86_BUILTIN_VEC_PERM_V2DI_U,
  IX86_BUILTIN_VEC_PERM_V4SI_U,
  IX86_BUILTIN_VEC_PERM_V8HI_U,
  IX86_BUILTIN_VEC_PERM_V16QI_U,
  IX86_BUILTIN_VEC_PERM_V4DF,
  IX86_BUILTIN_VEC_PERM_V8SF,

  /* FMA4 and XOP instructions.  */
  IX86_BUILTIN_VFMADDSS,
  IX86_BUILTIN_VFMADDSD,
  IX86_BUILTIN_VFMADDPS,
  IX86_BUILTIN_VFMADDPD,
  IX86_BUILTIN_VFMADDPS256,
  IX86_BUILTIN_VFMADDPD256,
  IX86_BUILTIN_VFMADDSUBPS,
  IX86_BUILTIN_VFMADDSUBPD,
  IX86_BUILTIN_VFMADDSUBPS256,
  IX86_BUILTIN_VFMADDSUBPD256,

  IX86_BUILTIN_VPCMOV,
  IX86_BUILTIN_VPCMOV_V2DI,
  IX86_BUILTIN_VPCMOV_V4SI,
  IX86_BUILTIN_VPCMOV_V8HI,
  IX86_BUILTIN_VPCMOV_V16QI,
  IX86_BUILTIN_VPCMOV_V4SF,
  IX86_BUILTIN_VPCMOV_V2DF,
  IX86_BUILTIN_VPCMOV256,
  IX86_BUILTIN_VPCMOV_V4DI256,
  IX86_BUILTIN_VPCMOV_V8SI256,
  IX86_BUILTIN_VPCMOV_V16HI256,
  IX86_BUILTIN_VPCMOV_V32QI256,
  IX86_BUILTIN_VPCMOV_V8SF256,
  IX86_BUILTIN_VPCMOV_V4DF256,

  IX86_BUILTIN_VPPERM,

  IX86_BUILTIN_VPMACSSWW,
  IX86_BUILTIN_VPMACSWW,
  IX86_BUILTIN_VPMACSSWD,
  IX86_BUILTIN_VPMACSWD,
  IX86_BUILTIN_VPMACSSDD,
  IX86_BUILTIN_VPMACSDD,
  IX86_BUILTIN_VPMACSSDQL,
  IX86_BUILTIN_VPMACSSDQH,
  IX86_BUILTIN_VPMACSDQL,
  IX86_BUILTIN_VPMACSDQH,
  IX86_BUILTIN_VPMADCSSWD,
  IX86_BUILTIN_VPMADCSWD,

  IX86_BUILTIN_VPHADDBW,
  IX86_BUILTIN_VPHADDBD,
  IX86_BUILTIN_VPHADDBQ,
  IX86_BUILTIN_VPHADDWD,
  IX86_BUILTIN_VPHADDWQ,
  IX86_BUILTIN_VPHADDDQ,
  IX86_BUILTIN_VPHADDUBW,
  IX86_BUILTIN_VPHADDUBD,
  IX86_BUILTIN_VPHADDUBQ,
  IX86_BUILTIN_VPHADDUWD,
  IX86_BUILTIN_VPHADDUWQ,
  IX86_BUILTIN_VPHADDUDQ,
  IX86_BUILTIN_VPHSUBBW,
  IX86_BUILTIN_VPHSUBWD,
  IX86_BUILTIN_VPHSUBDQ,

  IX86_BUILTIN_VPROTB,
  IX86_BUILTIN_VPROTW,
  IX86_BUILTIN_VPROTD,
  IX86_BUILTIN_VPROTQ,
  IX86_BUILTIN_VPROTB_IMM,
  IX86_BUILTIN_VPROTW_IMM,
  IX86_BUILTIN_VPROTD_IMM,
  IX86_BUILTIN_VPROTQ_IMM,

  IX86_BUILTIN_VPSHLB,
  IX86_BUILTIN_VPSHLW,
  IX86_BUILTIN_VPSHLD,
  IX86_BUILTIN_VPSHLQ,
  IX86_BUILTIN_VPSHAB,
  IX86_BUILTIN_VPSHAW,
  IX86_BUILTIN_VPSHAD,
  IX86_BUILTIN_VPSHAQ,

  IX86_BUILTIN_VFRCZSS,
  IX86_BUILTIN_VFRCZSD,
  IX86_BUILTIN_VFRCZPS,
  IX86_BUILTIN_VFRCZPD,
  IX86_BUILTIN_VFRCZPS256,
  IX86_BUILTIN_VFRCZPD256,

  IX86_BUILTIN_VPCOMEQUB,
  IX86_BUILTIN_VPCOMNEUB,
  IX86_BUILTIN_VPCOMLTUB,
  IX86_BUILTIN_VPCOMLEUB,
  IX86_BUILTIN_VPCOMGTUB,
  IX86_BUILTIN_VPCOMGEUB,
  IX86_BUILTIN_VPCOMFALSEUB,
  IX86_BUILTIN_VPCOMTRUEUB,

  IX86_BUILTIN_VPCOMEQUW,
  IX86_BUILTIN_VPCOMNEUW,
  IX86_BUILTIN_VPCOMLTUW,
  IX86_BUILTIN_VPCOMLEUW,
  IX86_BUILTIN_VPCOMGTUW,
  IX86_BUILTIN_VPCOMGEUW,
  IX86_BUILTIN_VPCOMFALSEUW,
  IX86_BUILTIN_VPCOMTRUEUW,

  IX86_BUILTIN_VPCOMEQUD,
  IX86_BUILTIN_VPCOMNEUD,
  IX86_BUILTIN_VPCOMLTUD,
  IX86_BUILTIN_VPCOMLEUD,
  IX86_BUILTIN_VPCOMGTUD,
  IX86_BUILTIN_VPCOMGEUD,
  IX86_BUILTIN_VPCOMFALSEUD,
  IX86_BUILTIN_VPCOMTRUEUD,

  IX86_BUILTIN_VPCOMEQUQ,
  IX86_BUILTIN_VPCOMNEUQ,
  IX86_BUILTIN_VPCOMLTUQ,
  IX86_BUILTIN_VPCOMLEUQ,
  IX86_BUILTIN_VPCOMGTUQ,
  IX86_BUILTIN_VPCOMGEUQ,
  IX86_BUILTIN_VPCOMFALSEUQ,
  IX86_BUILTIN_VPCOMTRUEUQ,

  IX86_BUILTIN_VPCOMEQB,
  IX86_BUILTIN_VPCOMNEB,
  IX86_BUILTIN_VPCOMLTB,
  IX86_BUILTIN_VPCOMLEB,
  IX86_BUILTIN_VPCOMGTB,
  IX86_BUILTIN_VPCOMGEB,
  IX86_BUILTIN_VPCOMFALSEB,
  IX86_BUILTIN_VPCOMTRUEB,

  IX86_BUILTIN_VPCOMEQW,
  IX86_BUILTIN_VPCOMNEW,
  IX86_BUILTIN_VPCOMLTW,
  IX86_BUILTIN_VPCOMLEW,
  IX86_BUILTIN_VPCOMGTW,
  IX86_BUILTIN_VPCOMGEW,
  IX86_BUILTIN_VPCOMFALSEW,
  IX86_BUILTIN_VPCOMTRUEW,

  IX86_BUILTIN_VPCOMEQD,
  IX86_BUILTIN_VPCOMNED,
  IX86_BUILTIN_VPCOMLTD,
  IX86_BUILTIN_VPCOMLED,
  IX86_BUILTIN_VPCOMGTD,
  IX86_BUILTIN_VPCOMGED,
  IX86_BUILTIN_VPCOMFALSED,
  IX86_BUILTIN_VPCOMTRUED,

  IX86_BUILTIN_VPCOMEQQ,
  IX86_BUILTIN_VPCOMNEQ,
  IX86_BUILTIN_VPCOMLTQ,
  IX86_BUILTIN_VPCOMLEQ,
  IX86_BUILTIN_VPCOMGTQ,
  IX86_BUILTIN_VPCOMGEQ,
  IX86_BUILTIN_VPCOMFALSEQ,
  IX86_BUILTIN_VPCOMTRUEQ,

  /* LWP instructions.  */
  IX86_BUILTIN_LLWPCB,
  IX86_BUILTIN_SLWPCB,
  IX86_BUILTIN_LWPVAL32,
  IX86_BUILTIN_LWPVAL64,
  IX86_BUILTIN_LWPINS32,
  IX86_BUILTIN_LWPINS64,

  IX86_BUILTIN_CLZS,

  /* BMI instructions.  */
  IX86_BUILTIN_BEXTR32,
  IX86_BUILTIN_BEXTR64,
  IX86_BUILTIN_CTZS,

  /* TBM instructions.  */
  IX86_BUILTIN_BEXTRI32,
  IX86_BUILTIN_BEXTRI64,

  /* FSGSBASE instructions.  */
  IX86_BUILTIN_RDFSBASE32,
  IX86_BUILTIN_RDFSBASE64,
  IX86_BUILTIN_RDGSBASE32,
  IX86_BUILTIN_RDGSBASE64,
  IX86_BUILTIN_WRFSBASE32,
  IX86_BUILTIN_WRFSBASE64,
  IX86_BUILTIN_WRGSBASE32,
  IX86_BUILTIN_WRGSBASE64,

  /* RDRND instructions.  */
  IX86_BUILTIN_RDRAND16_STEP,
  IX86_BUILTIN_RDRAND32_STEP,
  IX86_BUILTIN_RDRAND64_STEP,

  /* F16C instructions.  */
  IX86_BUILTIN_CVTPH2PS,
  IX86_BUILTIN_CVTPH2PS256,
  IX86_BUILTIN_CVTPS2PH,
  IX86_BUILTIN_CVTPS2PH256,

  /* CFString built-in for darwin */
  IX86_BUILTIN_CFSTRING,

  IX86_BUILTIN_MAX
};
/* Table for the ix86 builtin decls.  */
static GTY(()) tree ix86_builtins[(int) IX86_BUILTIN_MAX];

/* Table of all of the builtin functions that are possible with different ISA's
   but are waiting to be built until a function is declared to use that
   ISA.  */
struct builtin_isa {
  const char *name;		/* function name */
  enum ix86_builtin_func_type tcode; /* type to use in the declaration */
  int isa;			/* isa_flags this builtin is defined for */
  bool const_p;			/* true if the declaration is constant */
  bool set_and_not_built_p;
};

static struct builtin_isa ix86_builtins_isa[(int) IX86_BUILTIN_MAX];

/* Add an ix86 target builtin function with CODE, NAME and TYPE.  Save the
   MASK of which isa_flags to use in the ix86_builtins_isa array.  Stores the
   function decl in the ix86_builtins array.  Returns the function decl or
   NULL_TREE, if the builtin was not added.

   If the front end has a special hook for builtin functions, delay adding
   builtin functions that aren't in the current ISA until the ISA is changed
   with function specific optimization.  Doing so can save about 300K for the
   default compiler.  When the builtin is expanded, check at that time whether
   it is valid.

   If the front end doesn't have a special hook, record all builtins, even if
   they aren't in the current ISA, in case the user uses function specific
   options for a different ISA, so that we don't get scope errors if a
   builtin is added in the middle of a function scope.  */

static inline tree
def_builtin (int mask, const char *name, enum ix86_builtin_func_type tcode,
	     enum ix86_builtins code)
{
  tree decl = NULL_TREE;

  if (!(mask & OPTION_MASK_ISA_64BIT) || TARGET_64BIT)
    {
      ix86_builtins_isa[(int) code].isa = mask;

      mask &= ~OPTION_MASK_ISA_64BIT;
      if (mask == 0
	  || (mask & ix86_isa_flags) != 0
	  || (lang_hooks.builtin_function
	      == lang_hooks.builtin_function_ext_scope))
	{
	  tree type = ix86_get_builtin_func_type (tcode);
	  decl = add_builtin_function (name, type, code, BUILT_IN_MD,
				       NULL, NULL_TREE);
	  ix86_builtins[(int) code] = decl;
	  ix86_builtins_isa[(int) code].set_and_not_built_p = false;
	}
      else
	{
	  ix86_builtins[(int) code] = NULL_TREE;
	  ix86_builtins_isa[(int) code].tcode = tcode;
	  ix86_builtins_isa[(int) code].name = name;
	  ix86_builtins_isa[(int) code].const_p = false;
	  ix86_builtins_isa[(int) code].set_and_not_built_p = true;
	}
    }

  return decl;
}
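
/* Editorial usage sketch (the exact arguments are illustrative, modeled
   on the bdesc tables below):

	def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_addps",
			   V4SF_FTYPE_V4SF_V4SF, IX86_BUILTIN_ADDPS);

   registers the builtin immediately when SSE is enabled, and otherwise
   parks it in ix86_builtins_isa until ix86_add_new_builtins sees a
   function compiled with that ISA via function-specific options.  */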
/* Like def_builtin, but also marks the function decl "const".  */

static inline tree
def_builtin_const (int mask, const char *name,
		   enum ix86_builtin_func_type tcode, enum ix86_builtins code)
{
  tree decl = def_builtin (mask, name, tcode, code);
  if (decl)
    TREE_READONLY (decl) = 1;
  else
    ix86_builtins_isa[(int) code].const_p = true;

  return decl;
}

/* Add any new builtin functions for a given ISA that may not have been
   declared.  This saves a bit of space compared to adding all of the
   declarations to the tree, even if we didn't use them.  */

static void
ix86_add_new_builtins (int isa)
{
  int i;

  for (i = 0; i < (int) IX86_BUILTIN_MAX; i++)
    {
      if ((ix86_builtins_isa[i].isa & isa) != 0
	  && ix86_builtins_isa[i].set_and_not_built_p)
	{
	  tree decl, type;

	  /* Don't define the builtin again.  */
	  ix86_builtins_isa[i].set_and_not_built_p = false;

	  type = ix86_get_builtin_func_type (ix86_builtins_isa[i].tcode);
	  decl = add_builtin_function_ext_scope (ix86_builtins_isa[i].name,
						 type, i, BUILT_IN_MD, NULL,
						 NULL_TREE);

	  ix86_builtins[i] = decl;
	  if (ix86_builtins_isa[i].const_p)
	    TREE_READONLY (decl) = 1;
	}
    }
}
/* Bits for builtin_description.flag.  */

/* Set when we don't support the comparison natively, and should
   swap_comparison in order to support it.  */
#define BUILTIN_DESC_SWAP_OPERANDS	1

struct builtin_description
{
  const unsigned int mask;
  const enum insn_code icode;
  const char *const name;
  const enum ix86_builtins code;
  const enum rtx_code comparison;
  const int flag;
};
24094 static const struct builtin_description bdesc_comi[] =
24096 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 },
24097 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 },
24098 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 },
24099 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 },
24100 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 },
24101 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 },
24102 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 },
24103 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 },
24104 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 },
24105 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 },
24106 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 },
24107 { OPTION_MASK_ISA_SSE, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 },
24108 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 },
24109 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 },
24110 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 },
24111 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 },
24112 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 },
24113 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 },
24114 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 },
24115 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 },
24116 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 },
24117 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 },
24118 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 },
24119 { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 },
24122 static const struct builtin_description bdesc_pcmpestr[] =
  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestri128", IX86_BUILTIN_PCMPESTRI128, UNKNOWN, 0 },
  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrm128", IX86_BUILTIN_PCMPESTRM128, UNKNOWN, 0 },
  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestria128", IX86_BUILTIN_PCMPESTRA128, UNKNOWN, (int) CCAmode },
  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestric128", IX86_BUILTIN_PCMPESTRC128, UNKNOWN, (int) CCCmode },
  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestrio128", IX86_BUILTIN_PCMPESTRO128, UNKNOWN, (int) CCOmode },
  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestris128", IX86_BUILTIN_PCMPESTRS128, UNKNOWN, (int) CCSmode },
  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpestr, "__builtin_ia32_pcmpestriz128", IX86_BUILTIN_PCMPESTRZ128, UNKNOWN, (int) CCZmode },
};
static const struct builtin_description bdesc_pcmpistr[] =
{
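  /* SSE4.2 */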
  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistri128", IX86_BUILTIN_PCMPISTRI128, UNKNOWN, 0 },
  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrm128", IX86_BUILTIN_PCMPISTRM128, UNKNOWN, 0 },
  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistria128", IX86_BUILTIN_PCMPISTRA128, UNKNOWN, (int) CCAmode },
  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistric128", IX86_BUILTIN_PCMPISTRC128, UNKNOWN, (int) CCCmode },
  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistrio128", IX86_BUILTIN_PCMPISTRO128, UNKNOWN, (int) CCOmode },
  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistris128", IX86_BUILTIN_PCMPISTRS128, UNKNOWN, (int) CCSmode },
  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_pcmpistr, "__builtin_ia32_pcmpistriz128", IX86_BUILTIN_PCMPISTRZ128, UNKNOWN, (int) CCZmode },
};
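
/* Illustrative sketch, not part of this file: the pcmpestr/pcmpistr
   entries whose last field is a CC mode (CCAmode, CCCmode, ...) share
   one insn pattern and differ only in which EFLAGS condition the
   expander reads back.  <smmintrin.h> is expected to wrap them
   roughly like so.  */
#if 0
extern __inline int
_mm_cmpistra (__m128i __X, __m128i __Y, const int __M)
{
  /* Same PCMPISTRI instruction as __builtin_ia32_pcmpistri128; the
     CCAmode field above selects the "above" flags test instead of
     the ECX index result.  */
  return __builtin_ia32_pcmpistria128 ((__v16qi) __X, (__v16qi) __Y, __M);
}
#endif
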
/* Special builtins with variable number of arguments. */
static const struct builtin_description bdesc_special_args[] =
{
  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtsc, "__builtin_ia32_rdtsc", IX86_BUILTIN_RDTSC, UNKNOWN, (int) UINT64_FTYPE_VOID },
  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdtscp, "__builtin_ia32_rdtscp", IX86_BUILTIN_RDTSCP, UNKNOWN, (int) UINT64_FTYPE_PUNSIGNED },
  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_pause, "__builtin_ia32_pause", IX86_BUILTIN_PAUSE, UNKNOWN, (int) VOID_FTYPE_VOID },
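
  /* MMX */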
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_emms, "__builtin_ia32_emms", IX86_BUILTIN_EMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
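
  /* 3DNow! */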
  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_femms, "__builtin_ia32_femms", IX86_BUILTIN_FEMMS, UNKNOWN, (int) VOID_FTYPE_VOID },
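
  /* SSE */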
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movups, "__builtin_ia32_loadups", IX86_BUILTIN_LOADUPS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },

  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadhps_exp, "__builtin_ia32_loadhps", IX86_BUILTIN_LOADHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_loadlps_exp, "__builtin_ia32_loadlps", IX86_BUILTIN_LOADLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_PCV2SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storehps, "__builtin_ia32_storehps", IX86_BUILTIN_STOREHPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_storelps, "__builtin_ia32_storelps", IX86_BUILTIN_STORELPS, UNKNOWN, (int) VOID_FTYPE_PV2SF_V4SF },
  /* SSE or 3DNow!A */
  { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_sfence, "__builtin_ia32_sfence", IX86_BUILTIN_SFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
  { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_sse_movntdi, "__builtin_ia32_movntq", IX86_BUILTIN_MOVNTQ, UNKNOWN, (int) VOID_FTYPE_PULONGLONG_ULONGLONG },
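
  /* SSE2 */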
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lfence, "__builtin_ia32_lfence", IX86_BUILTIN_LFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_mfence, 0, IX86_BUILTIN_MFENCE, UNKNOWN, (int) VOID_FTYPE_VOID },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_storeupd", IX86_BUILTIN_STOREUPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_storedqu", IX86_BUILTIN_STOREDQU, UNKNOWN, (int) VOID_FTYPE_PCHAR_V16QI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2df, "__builtin_ia32_movntpd", IX86_BUILTIN_MOVNTPD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntv2di, "__builtin_ia32_movntdq", IX86_BUILTIN_MOVNTDQ, UNKNOWN, (int) VOID_FTYPE_PV2DI_V2DI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movntsi, "__builtin_ia32_movnti", IX86_BUILTIN_MOVNTI, UNKNOWN, (int) VOID_FTYPE_PINT_INT },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movupd, "__builtin_ia32_loadupd", IX86_BUILTIN_LOADUPD, UNKNOWN, (int) V2DF_FTYPE_PCDOUBLE },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movdqu, "__builtin_ia32_loaddqu", IX86_BUILTIN_LOADDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },

  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadhpd_exp, "__builtin_ia32_loadhpd", IX86_BUILTIN_LOADHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_loadlpd_exp, "__builtin_ia32_loadlpd", IX86_BUILTIN_LOADLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_PCDOUBLE },
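
  /* SSE3 */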
  { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_lddqu, "__builtin_ia32_lddqu", IX86_BUILTIN_LDDQU, UNKNOWN, (int) V16QI_FTYPE_PCCHAR },
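
  /* SSE4.1 */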
  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_movntdqa, "__builtin_ia32_movntdqa", IX86_BUILTIN_MOVNTDQA, UNKNOWN, (int) V2DI_FTYPE_PV2DI },
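
  /* SSE4A */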
  { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF },
  { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF },
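
  /* AVX */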
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, "__builtin_ia32_vzeroupper", IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID },

  { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4sf, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT },
  { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv4df, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
  { OPTION_MASK_ISA_AVX, CODE_FOR_vec_dupv8sf, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v4df, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_v8sf, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF },

  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR },

  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4di, "__builtin_ia32_movntdq256", IX86_BUILTIN_MOVNTDQ256, UNKNOWN, (int) VOID_FTYPE_PV4DI_V4DI },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv4df, "__builtin_ia32_movntpd256", IX86_BUILTIN_MOVNTPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movntv8sf, "__builtin_ia32_movntps256", IX86_BUILTIN_MOVNTPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF },

  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DI },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SI },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DI },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) V8SF_FTYPE_PCV8SF_V8SI },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DI_V2DF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SI_V4SF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DI_V4DF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SI_V8SF },
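
  /* LWP */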
  { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_llwpcb, "__builtin_ia32_llwpcb", IX86_BUILTIN_LLWPCB, UNKNOWN, (int) VOID_FTYPE_PVOID },
  { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_slwpcb, "__builtin_ia32_slwpcb", IX86_BUILTIN_SLWPCB, UNKNOWN, (int) PVOID_FTYPE_VOID },
  { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvalsi3, "__builtin_ia32_lwpval32", IX86_BUILTIN_LWPVAL32, UNKNOWN, (int) VOID_FTYPE_UINT_UINT_UINT },
  { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpvaldi3, "__builtin_ia32_lwpval64", IX86_BUILTIN_LWPVAL64, UNKNOWN, (int) VOID_FTYPE_UINT64_UINT_UINT },
  { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinssi3, "__builtin_ia32_lwpins32", IX86_BUILTIN_LWPINS32, UNKNOWN, (int) UCHAR_FTYPE_UINT_UINT_UINT },
  { OPTION_MASK_ISA_LWP, CODE_FOR_lwp_lwpinsdi3, "__builtin_ia32_lwpins64", IX86_BUILTIN_LWPINS64, UNKNOWN, (int) UCHAR_FTYPE_UINT64_UINT_UINT },
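
  /* FSGSBASE */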
  { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasesi, "__builtin_ia32_rdfsbase32", IX86_BUILTIN_RDFSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
  { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdfsbasedi, "__builtin_ia32_rdfsbase64", IX86_BUILTIN_RDFSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
  { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasesi, "__builtin_ia32_rdgsbase32", IX86_BUILTIN_RDGSBASE32, UNKNOWN, (int) UNSIGNED_FTYPE_VOID },
  { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_rdgsbasedi, "__builtin_ia32_rdgsbase64", IX86_BUILTIN_RDGSBASE64, UNKNOWN, (int) UINT64_FTYPE_VOID },
  { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasesi, "__builtin_ia32_wrfsbase32", IX86_BUILTIN_WRFSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
  { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrfsbasedi, "__builtin_ia32_wrfsbase64", IX86_BUILTIN_WRFSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
  { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasesi, "__builtin_ia32_wrgsbase32", IX86_BUILTIN_WRGSBASE32, UNKNOWN, (int) VOID_FTYPE_UNSIGNED },
  { OPTION_MASK_ISA_FSGSBASE | OPTION_MASK_ISA_64BIT, CODE_FOR_wrgsbasedi, "__builtin_ia32_wrgsbase64", IX86_BUILTIN_WRGSBASE64, UNKNOWN, (int) VOID_FTYPE_UINT64 },
};
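
/* Illustrative sketch, not part of this file: a special-args entry
   stores a function-type code such as UINT64_FTYPE_VOID in its last
   field rather than a comparison code, so one expander can handle
   loads, stores and no-operand intrinsics alike.  <ia32intrin.h> is
   expected to wrap the first entry roughly like so.  */
#if 0
extern __inline unsigned long long
__rdtsc (void)
{
  /* IX86_BUILTIN_RDTSC: no operands, returns the 64-bit TSC.  */
  return __builtin_ia32_rdtsc ();
}
#endif
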
/* Builtins with variable number of arguments. */
static const struct builtin_description bdesc_args[] =
{
  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_bsr, "__builtin_ia32_bsrsi", IX86_BUILTIN_BSRSI, UNKNOWN, (int) INT_FTYPE_INT },
  { OPTION_MASK_ISA_64BIT, CODE_FOR_bsr_rex64, "__builtin_ia32_bsrdi", IX86_BUILTIN_BSRDI, UNKNOWN, (int) INT64_FTYPE_INT64 },
  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rdpmc, "__builtin_ia32_rdpmc", IX86_BUILTIN_RDPMC, UNKNOWN, (int) UINT64_FTYPE_INT },
  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlqi3, "__builtin_ia32_rolqi", IX86_BUILTIN_ROLQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotlhi3, "__builtin_ia32_rolhi", IX86_BUILTIN_ROLHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrqi3, "__builtin_ia32_rorqi", IX86_BUILTIN_RORQI, UNKNOWN, (int) UINT8_FTYPE_UINT8_INT },
  { ~OPTION_MASK_ISA_64BIT, CODE_FOR_rotrhi3, "__builtin_ia32_rorhi", IX86_BUILTIN_RORHI, UNKNOWN, (int) UINT16_FTYPE_UINT16_INT },
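
  /* MMX */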
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },

  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv8qi3, "__builtin_ia32_psubsb", IX86_BUILTIN_PSUBSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_sssubv4hi3, "__builtin_ia32_psubsw", IX86_BUILTIN_PSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv8qi3, "__builtin_ia32_paddusb", IX86_BUILTIN_PADDUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_usaddv4hi3, "__builtin_ia32_paddusw", IX86_BUILTIN_PADDUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv8qi3, "__builtin_ia32_psubusb", IX86_BUILTIN_PSUBUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ussubv4hi3, "__builtin_ia32_psubusw", IX86_BUILTIN_PSUBUSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },

  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_mulv4hi3, "__builtin_ia32_pmullw", IX86_BUILTIN_PMULLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_smulv4hi3_highpart, "__builtin_ia32_pmulhw", IX86_BUILTIN_PMULHW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },

  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andv2si3, "__builtin_ia32_pand", IX86_BUILTIN_PAND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_andnotv2si3, "__builtin_ia32_pandn", IX86_BUILTIN_PANDN, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_iorv2si3, "__builtin_ia32_por", IX86_BUILTIN_POR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_xorv2si3, "__builtin_ia32_pxor", IX86_BUILTIN_PXOR, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },

  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv8qi3, "__builtin_ia32_pcmpeqb", IX86_BUILTIN_PCMPEQB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv4hi3, "__builtin_ia32_pcmpeqw", IX86_BUILTIN_PCMPEQW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_eqv2si3, "__builtin_ia32_pcmpeqd", IX86_BUILTIN_PCMPEQD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv8qi3, "__builtin_ia32_pcmpgtb", IX86_BUILTIN_PCMPGTB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv4hi3, "__builtin_ia32_pcmpgtw", IX86_BUILTIN_PCMPGTW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_gtv2si3, "__builtin_ia32_pcmpgtd", IX86_BUILTIN_PCMPGTD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },

  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhbw, "__builtin_ia32_punpckhbw", IX86_BUILTIN_PUNPCKHBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhwd, "__builtin_ia32_punpckhwd", IX86_BUILTIN_PUNPCKHWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckhdq, "__builtin_ia32_punpckhdq", IX86_BUILTIN_PUNPCKHDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklbw, "__builtin_ia32_punpcklbw", IX86_BUILTIN_PUNPCKLBW, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpcklwd, "__builtin_ia32_punpcklwd", IX86_BUILTIN_PUNPCKLWD, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_punpckldq, "__builtin_ia32_punpckldq", IX86_BUILTIN_PUNPCKLDQ, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },

  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packsswb, "__builtin_ia32_packsswb", IX86_BUILTIN_PACKSSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packssdw, "__builtin_ia32_packssdw", IX86_BUILTIN_PACKSSDW, UNKNOWN, (int) V4HI_FTYPE_V2SI_V2SI },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_packuswb, "__builtin_ia32_packuswb", IX86_BUILTIN_PACKUSWB, UNKNOWN, (int) V8QI_FTYPE_V4HI_V4HI },

  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, "__builtin_ia32_pmaddwd", IX86_BUILTIN_PMADDWD, UNKNOWN, (int) V2SI_FTYPE_V4HI_V4HI },

  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllwi", IX86_BUILTIN_PSLLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslldi", IX86_BUILTIN_PSLLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllqi", IX86_BUILTIN_PSLLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, "__builtin_ia32_psllw", IX86_BUILTIN_PSLLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, "__builtin_ia32_pslld", IX86_BUILTIN_PSLLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv1di3, "__builtin_ia32_psllq", IX86_BUILTIN_PSLLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },

  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlwi", IX86_BUILTIN_PSRLWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrldi", IX86_BUILTIN_PSRLDI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlqi", IX86_BUILTIN_PSRLQI, UNKNOWN, (int) V1DI_FTYPE_V1DI_SI_COUNT },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, "__builtin_ia32_psrlw", IX86_BUILTIN_PSRLW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, "__builtin_ia32_psrld", IX86_BUILTIN_PSRLD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv1di3, "__builtin_ia32_psrlq", IX86_BUILTIN_PSRLQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_COUNT },

  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psrawi", IX86_BUILTIN_PSRAWI, UNKNOWN, (int) V4HI_FTYPE_V4HI_SI_COUNT },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psradi", IX86_BUILTIN_PSRADI, UNKNOWN, (int) V2SI_FTYPE_V2SI_SI_COUNT },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, "__builtin_ia32_psraw", IX86_BUILTIN_PSRAW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI_COUNT },
  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, "__builtin_ia32_psrad", IX86_BUILTIN_PSRAD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI_COUNT },
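
  /* 3DNow! */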
  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pf2id, "__builtin_ia32_pf2id", IX86_BUILTIN_PF2ID, UNKNOWN, (int) V2SI_FTYPE_V2SF },
  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_floatv2si2, "__builtin_ia32_pi2fd", IX86_BUILTIN_PI2FD, UNKNOWN, (int) V2SF_FTYPE_V2SI },
  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpv2sf2, "__builtin_ia32_pfrcp", IX86_BUILTIN_PFRCP, UNKNOWN, (int) V2SF_FTYPE_V2SF },
  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqrtv2sf2, "__builtin_ia32_pfrsqrt", IX86_BUILTIN_PFRSQRT, UNKNOWN, (int) V2SF_FTYPE_V2SF },

  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgusb", IX86_BUILTIN_PAVGUSB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_haddv2sf3, "__builtin_ia32_pfacc", IX86_BUILTIN_PFACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_addv2sf3, "__builtin_ia32_pfadd", IX86_BUILTIN_PFADD, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_eqv2sf3, "__builtin_ia32_pfcmpeq", IX86_BUILTIN_PFCMPEQ, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gev2sf3, "__builtin_ia32_pfcmpge", IX86_BUILTIN_PFCMPGE, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_gtv2sf3, "__builtin_ia32_pfcmpgt", IX86_BUILTIN_PFCMPGT, UNKNOWN, (int) V2SI_FTYPE_V2SF_V2SF },
  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_smaxv2sf3, "__builtin_ia32_pfmax", IX86_BUILTIN_PFMAX, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_sminv2sf3, "__builtin_ia32_pfmin", IX86_BUILTIN_PFMIN, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_mulv2sf3, "__builtin_ia32_pfmul", IX86_BUILTIN_PFMUL, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit1v2sf3, "__builtin_ia32_pfrcpit1", IX86_BUILTIN_PFRCPIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rcpit2v2sf3, "__builtin_ia32_pfrcpit2", IX86_BUILTIN_PFRCPIT2, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_rsqit1v2sf3, "__builtin_ia32_pfrsqit1", IX86_BUILTIN_PFRSQIT1, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subv2sf3, "__builtin_ia32_pfsub", IX86_BUILTIN_PFSUB, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_subrv2sf3, "__builtin_ia32_pfsubr", IX86_BUILTIN_PFSUBR, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
  { OPTION_MASK_ISA_3DNOW, CODE_FOR_mmx_pmulhrwv4hi3, "__builtin_ia32_pmulhrw", IX86_BUILTIN_PMULHRW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
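
  /* 3DNow!A */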
  { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pf2iw, "__builtin_ia32_pf2iw", IX86_BUILTIN_PF2IW, UNKNOWN, (int) V2SI_FTYPE_V2SF },
  { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pi2fw, "__builtin_ia32_pi2fw", IX86_BUILTIN_PI2FW, UNKNOWN, (int) V2SF_FTYPE_V2SI },
  { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2si2, "__builtin_ia32_pswapdsi", IX86_BUILTIN_PSWAPDSI, UNKNOWN, (int) V2SI_FTYPE_V2SI },
  { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pswapdv2sf2, "__builtin_ia32_pswapdsf", IX86_BUILTIN_PSWAPDSF, UNKNOWN, (int) V2SF_FTYPE_V2SF },
  { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_hsubv2sf3, "__builtin_ia32_pfnacc", IX86_BUILTIN_PFNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
  { OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_addsubv2sf3, "__builtin_ia32_pfpnacc", IX86_BUILTIN_PFPNACC, UNKNOWN, (int) V2SF_FTYPE_V2SF_V2SF },
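
  /* SSE */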
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, "__builtin_ia32_movmskps", IX86_BUILTIN_MOVMSKPS, UNKNOWN, (int) INT_FTYPE_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_sqrtv4sf2, "__builtin_ia32_sqrtps", IX86_BUILTIN_SQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sqrtv4sf2, "__builtin_ia32_sqrtps_nr", IX86_BUILTIN_SQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rsqrtv4sf2, "__builtin_ia32_rsqrtps", IX86_BUILTIN_RSQRTPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtv4sf2, "__builtin_ia32_rsqrtps_nr", IX86_BUILTIN_RSQRTPS_NR, UNKNOWN, (int) V4SF_FTYPE_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_rcpv4sf2, "__builtin_ia32_rcpps", IX86_BUILTIN_RCPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtps2pi, "__builtin_ia32_cvtps2pi", IX86_BUILTIN_CVTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtss2si, "__builtin_ia32_cvtss2si", IX86_BUILTIN_CVTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
  { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtss2siq, "__builtin_ia32_cvtss2si64", IX86_BUILTIN_CVTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttps2pi, "__builtin_ia32_cvttps2pi", IX86_BUILTIN_CVTTPS2PI, UNKNOWN, (int) V2SI_FTYPE_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, "__builtin_ia32_cvttss2si", IX86_BUILTIN_CVTTSS2SI, UNKNOWN, (int) INT_FTYPE_V4SF },
  { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, "__builtin_ia32_cvttss2si64", IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, (int) INT64_FTYPE_V4SF },

  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_shufps, "__builtin_ia32_shufps", IX86_BUILTIN_SHUFPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },

  { OPTION_MASK_ISA_SSE, CODE_FOR_addv4sf3, "__builtin_ia32_addps", IX86_BUILTIN_ADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_subv4sf3, "__builtin_ia32_subps", IX86_BUILTIN_SUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_mulv4sf3, "__builtin_ia32_mulps", IX86_BUILTIN_MULPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_divv4sf3, "__builtin_ia32_divps", IX86_BUILTIN_DIVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmaddv4sf3, "__builtin_ia32_addss", IX86_BUILTIN_ADDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsubv4sf3, "__builtin_ia32_subss", IX86_BUILTIN_SUBSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmulv4sf3, "__builtin_ia32_mulss", IX86_BUILTIN_MULSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmdivv4sf3, "__builtin_ia32_divss", IX86_BUILTIN_DIVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },

  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpeqps", IX86_BUILTIN_CMPEQPS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpltps", IX86_BUILTIN_CMPLTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpleps", IX86_BUILTIN_CMPLEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgtps", IX86_BUILTIN_CMPGTPS, LT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpgeps", IX86_BUILTIN_CMPGEPS, LE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpunordps", IX86_BUILTIN_CMPUNORDPS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpneqps", IX86_BUILTIN_CMPNEQPS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnltps", IX86_BUILTIN_CMPNLTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpnleps", IX86_BUILTIN_CMPNLEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngtps", IX86_BUILTIN_CMPNGTPS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpngeps", IX86_BUILTIN_CMPNGEPS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_maskcmpv4sf3, "__builtin_ia32_cmpordps", IX86_BUILTIN_CMPORDPS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, (int) V4SF_FTYPE_V4SF_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, (int) V4SF_FTYPE_V4SF_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, (int) V4SF_FTYPE_V4SF_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, (int) V4SF_FTYPE_V4SF_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, NE, (int) V4SF_FTYPE_V4SF_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, UNGE, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, UNGT, (int) V4SF_FTYPE_V4SF_V4SF_SWAP },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmmaskcmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, ORDERED, (int) V4SF_FTYPE_V4SF_V4SF },

  { OPTION_MASK_ISA_SSE, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_smaxv4sf3, "__builtin_ia32_maxps", IX86_BUILTIN_MAXPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },

  { OPTION_MASK_ISA_SSE, CODE_FOR_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_andnotv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },

  { OPTION_MASK_ISA_SSE, CODE_FOR_copysignv4sf3, "__builtin_ia32_copysignps", IX86_BUILTIN_CPYSGNPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },

  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movhlps_exp, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movlhps_exp, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_highv4sf, "__builtin_ia32_unpckhps", IX86_BUILTIN_UNPCKHPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
  { OPTION_MASK_ISA_SSE, CODE_FOR_vec_interleave_lowv4sf, "__builtin_ia32_unpcklps", IX86_BUILTIN_UNPCKLPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },

  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtpi2ps, "__builtin_ia32_cvtpi2ps", IX86_BUILTIN_CVTPI2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2SI },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, "__builtin_ia32_cvtsi2ss", IX86_BUILTIN_CVTSI2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_SI },
  { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, "__builtin_ia32_cvtsi642ss", IX86_BUILTIN_CVTSI642SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_DI },

  { OPTION_MASK_ISA_SSE, CODE_FOR_rsqrtsf2, "__builtin_ia32_rsqrtf", IX86_BUILTIN_RSQRTF, UNKNOWN, (int) FLOAT_FTYPE_FLOAT },

  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmsqrtv4sf2, "__builtin_ia32_sqrtss", IX86_BUILTIN_SQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrsqrtv4sf2, "__builtin_ia32_rsqrtss", IX86_BUILTIN_RSQRTSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },
  { OPTION_MASK_ISA_SSE, CODE_FOR_sse_vmrcpv4sf2, "__builtin_ia32_rcpss", IX86_BUILTIN_RCPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_VEC_MERGE },

  /* SSE MMX or 3DNow!A */
  { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv8qi3, "__builtin_ia32_pavgb", IX86_BUILTIN_PAVGB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
  { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uavgv4hi3, "__builtin_ia32_pavgw", IX86_BUILTIN_PAVGW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
  { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umulv4hi3_highpart, "__builtin_ia32_pmulhuw", IX86_BUILTIN_PMULHUW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },

  { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_umaxv8qi3, "__builtin_ia32_pmaxub", IX86_BUILTIN_PMAXUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
  { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_smaxv4hi3, "__builtin_ia32_pmaxsw", IX86_BUILTIN_PMAXSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
  { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_uminv8qi3, "__builtin_ia32_pminub", IX86_BUILTIN_PMINUB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
  { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_sminv4hi3, "__builtin_ia32_pminsw", IX86_BUILTIN_PMINSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },

  { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, "__builtin_ia32_psadbw", IX86_BUILTIN_PSADBW, UNKNOWN, (int) V1DI_FTYPE_V8QI_V8QI },
  { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, "__builtin_ia32_pmovmskb", IX86_BUILTIN_PMOVMSKB, UNKNOWN, (int) INT_FTYPE_V8QI },

  { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pshufw, "__builtin_ia32_pshufw", IX86_BUILTIN_PSHUFW, UNKNOWN, (int) V4HI_FTYPE_V4HI_INT },
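
  /* SSE2 */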
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_shufpd, "__builtin_ia32_shufpd", IX86_BUILTIN_SHUFPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },

  { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v2df", IX86_BUILTIN_VEC_PERM_V2DF, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DI },
  { OPTION_MASK_ISA_SSE, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v4sf", IX86_BUILTIN_VEC_PERM_V4SF, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v2di", IX86_BUILTIN_VEC_PERM_V2DI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_V2DI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v4si", IX86_BUILTIN_VEC_PERM_V4SI, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_V4SI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v8hi", IX86_BUILTIN_VEC_PERM_V8HI, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_V8HI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v16qi", IX86_BUILTIN_VEC_PERM_V16QI, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v2di_u", IX86_BUILTIN_VEC_PERM_V2DI_U, UNKNOWN, (int) V2UDI_FTYPE_V2UDI_V2UDI_V2UDI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v4si_u", IX86_BUILTIN_VEC_PERM_V4SI_U, UNKNOWN, (int) V4USI_FTYPE_V4USI_V4USI_V4USI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v8hi_u", IX86_BUILTIN_VEC_PERM_V8HI_U, UNKNOWN, (int) V8UHI_FTYPE_V8UHI_V8UHI_V8UHI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v16qi_u", IX86_BUILTIN_VEC_PERM_V16QI_U, UNKNOWN, (int) V16UQI_FTYPE_V16UQI_V16UQI_V16UQI },
  { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v4df", IX86_BUILTIN_VEC_PERM_V4DF, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DI },
  { OPTION_MASK_ISA_AVX, CODE_FOR_nothing, "__builtin_ia32_vec_perm_v8sf", IX86_BUILTIN_VEC_PERM_V8SF, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SI },

  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, "__builtin_ia32_movmskpd", IX86_BUILTIN_MOVMSKPD, UNKNOWN, (int) INT_FTYPE_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, "__builtin_ia32_pmovmskb128", IX86_BUILTIN_PMOVMSKB128, UNKNOWN, (int) INT_FTYPE_V16QI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sqrtv2df2, "__builtin_ia32_sqrtpd", IX86_BUILTIN_SQRTPD, UNKNOWN, (int) V2DF_FTYPE_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2pd, "__builtin_ia32_cvtdq2pd", IX86_BUILTIN_CVTDQ2PD, UNKNOWN, (int) V2DF_FTYPE_V4SI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtdq2ps, "__builtin_ia32_cvtdq2ps", IX86_BUILTIN_CVTDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtudq2ps, "__builtin_ia32_cvtudq2ps", IX86_BUILTIN_CVTUDQ2PS, UNKNOWN, (int) V4SF_FTYPE_V4SI },

  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2dq, "__builtin_ia32_cvtpd2dq", IX86_BUILTIN_CVTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2pi, "__builtin_ia32_cvtpd2pi", IX86_BUILTIN_CVTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpd2ps, "__builtin_ia32_cvtpd2ps", IX86_BUILTIN_CVTPD2PS, UNKNOWN, (int) V4SF_FTYPE_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2dq, "__builtin_ia32_cvttpd2dq", IX86_BUILTIN_CVTTPD2DQ, UNKNOWN, (int) V4SI_FTYPE_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttpd2pi, "__builtin_ia32_cvttpd2pi", IX86_BUILTIN_CVTTPD2PI, UNKNOWN, (int) V2SI_FTYPE_V2DF },

  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtpi2pd, "__builtin_ia32_cvtpi2pd", IX86_BUILTIN_CVTPI2PD, UNKNOWN, (int) V2DF_FTYPE_V2SI },

  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2si, "__builtin_ia32_cvtsd2si", IX86_BUILTIN_CVTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttsd2si, "__builtin_ia32_cvttsd2si", IX86_BUILTIN_CVTTSD2SI, UNKNOWN, (int) INT_FTYPE_V2DF },
  { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsd2siq, "__builtin_ia32_cvtsd2si64", IX86_BUILTIN_CVTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },
  { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvttsd2siq, "__builtin_ia32_cvttsd2si64", IX86_BUILTIN_CVTTSD2SI64, UNKNOWN, (int) INT64_FTYPE_V2DF },

  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2dq, "__builtin_ia32_cvtps2dq", IX86_BUILTIN_CVTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtps2pd, "__builtin_ia32_cvtps2pd", IX86_BUILTIN_CVTPS2PD, UNKNOWN, (int) V2DF_FTYPE_V4SF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvttps2dq, "__builtin_ia32_cvttps2dq", IX86_BUILTIN_CVTTPS2DQ, UNKNOWN, (int) V4SI_FTYPE_V4SF },

  { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2df3, "__builtin_ia32_addpd", IX86_BUILTIN_ADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2df3, "__builtin_ia32_subpd", IX86_BUILTIN_SUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv2df3, "__builtin_ia32_mulpd", IX86_BUILTIN_MULPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_divv2df3, "__builtin_ia32_divpd", IX86_BUILTIN_DIVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmaddv2df3, "__builtin_ia32_addsd", IX86_BUILTIN_ADDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsubv2df3, "__builtin_ia32_subsd", IX86_BUILTIN_SUBSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmulv2df3, "__builtin_ia32_mulsd", IX86_BUILTIN_MULSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmdivv2df3, "__builtin_ia32_divsd", IX86_BUILTIN_DIVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },

  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpeqpd", IX86_BUILTIN_CMPEQPD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpltpd", IX86_BUILTIN_CMPLTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmplepd", IX86_BUILTIN_CMPLEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgtpd", IX86_BUILTIN_CMPGTPD, LT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpgepd", IX86_BUILTIN_CMPGEPD, LE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpunordpd", IX86_BUILTIN_CMPUNORDPD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpneqpd", IX86_BUILTIN_CMPNEQPD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnltpd", IX86_BUILTIN_CMPNLTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpnlepd", IX86_BUILTIN_CMPNLEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngtpd", IX86_BUILTIN_CMPNGTPD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpngepd", IX86_BUILTIN_CMPNGEPD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF_SWAP },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_maskcmpv2df3, "__builtin_ia32_cmpordpd", IX86_BUILTIN_CMPORDPD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, (int) V2DF_FTYPE_V2DF_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, (int) V2DF_FTYPE_V2DF_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, (int) V2DF_FTYPE_V2DF_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, (int) V2DF_FTYPE_V2DF_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, NE, (int) V2DF_FTYPE_V2DF_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, UNGE, (int) V2DF_FTYPE_V2DF_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, UNGT, (int) V2DF_FTYPE_V2DF_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmmaskcmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, ORDERED, (int) V2DF_FTYPE_V2DF_V2DF },

  { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv2df3, "__builtin_ia32_maxpd", IX86_BUILTIN_MAXPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },

  { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },

  { OPTION_MASK_ISA_SSE2, CODE_FOR_copysignv2df3, "__builtin_ia32_copysignpd", IX86_BUILTIN_CPYSGNPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },

  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2df, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2df, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },

  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, (int) V4SI_FTYPE_V2DF_V2DF },

  { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_addv4si3, "__builtin_ia32_paddd128", IX86_BUILTIN_PADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_addv2di3, "__builtin_ia32_paddq128", IX86_BUILTIN_PADDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_subv16qi3, "__builtin_ia32_psubb128", IX86_BUILTIN_PSUBB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_subv8hi3, "__builtin_ia32_psubw128", IX86_BUILTIN_PSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_subv4si3, "__builtin_ia32_psubd128", IX86_BUILTIN_PSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_subv2di3, "__builtin_ia32_psubq128", IX86_BUILTIN_PSUBQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },

  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv16qi3, "__builtin_ia32_paddsb128", IX86_BUILTIN_PADDSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ssaddv8hi3, "__builtin_ia32_paddsw128", IX86_BUILTIN_PADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv16qi3, "__builtin_ia32_psubsb128", IX86_BUILTIN_PSUBSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_sssubv8hi3, "__builtin_ia32_psubsw128", IX86_BUILTIN_PSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv16qi3, "__builtin_ia32_paddusb128", IX86_BUILTIN_PADDUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_usaddv8hi3, "__builtin_ia32_paddusw128", IX86_BUILTIN_PADDUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv16qi3, "__builtin_ia32_psubusb128", IX86_BUILTIN_PSUBUSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ussubv8hi3, "__builtin_ia32_psubusw128", IX86_BUILTIN_PSUBUSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },

  { OPTION_MASK_ISA_SSE2, CODE_FOR_mulv8hi3, "__builtin_ia32_pmullw128", IX86_BUILTIN_PMULLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_smulv8hi3_highpart, "__builtin_ia32_pmulhw128", IX86_BUILTIN_PMULHW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },

  { OPTION_MASK_ISA_SSE2, CODE_FOR_andv2di3, "__builtin_ia32_pand128", IX86_BUILTIN_PAND128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_andnotv2di3, "__builtin_ia32_pandn128", IX86_BUILTIN_PANDN128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_iorv2di3, "__builtin_ia32_por128", IX86_BUILTIN_POR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_xorv2di3, "__builtin_ia32_pxor128", IX86_BUILTIN_PXOR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },

  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv16qi3, "__builtin_ia32_pavgb128", IX86_BUILTIN_PAVGB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_uavgv8hi3, "__builtin_ia32_pavgw128", IX86_BUILTIN_PAVGW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },

  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv16qi3, "__builtin_ia32_pcmpeqb128", IX86_BUILTIN_PCMPEQB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv8hi3, "__builtin_ia32_pcmpeqw128", IX86_BUILTIN_PCMPEQW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_eqv4si3, "__builtin_ia32_pcmpeqd128", IX86_BUILTIN_PCMPEQD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv16qi3, "__builtin_ia32_pcmpgtb128", IX86_BUILTIN_PCMPGTB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv8hi3, "__builtin_ia32_pcmpgtw128", IX86_BUILTIN_PCMPGTW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_gtv4si3, "__builtin_ia32_pcmpgtd128", IX86_BUILTIN_PCMPGTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },

  { OPTION_MASK_ISA_SSE2, CODE_FOR_umaxv16qi3, "__builtin_ia32_pmaxub128", IX86_BUILTIN_PMAXUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_smaxv8hi3, "__builtin_ia32_pmaxsw128", IX86_BUILTIN_PMAXSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_uminv16qi3, "__builtin_ia32_pminub128", IX86_BUILTIN_PMINUB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sminv8hi3, "__builtin_ia32_pminsw128", IX86_BUILTIN_PMINSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },

  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv16qi, "__builtin_ia32_punpckhbw128", IX86_BUILTIN_PUNPCKHBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv8hi, "__builtin_ia32_punpckhwd128", IX86_BUILTIN_PUNPCKHWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv4si, "__builtin_ia32_punpckhdq128", IX86_BUILTIN_PUNPCKHDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_highv2di, "__builtin_ia32_punpckhqdq128", IX86_BUILTIN_PUNPCKHQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv16qi, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv8hi, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv4si, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_interleave_lowv2di, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },

  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_packuswb, "__builtin_ia32_packuswb128", IX86_BUILTIN_PACKUSWB128, UNKNOWN, (int) V16QI_FTYPE_V8HI_V8HI },

  { OPTION_MASK_ISA_SSE2, CODE_FOR_umulv8hi3_highpart, "__builtin_ia32_pmulhuw128", IX86_BUILTIN_PMULHUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_psadbw, "__builtin_ia32_psadbw128", IX86_BUILTIN_PSADBW128, UNKNOWN, (int) V2DI_FTYPE_V16QI_V16QI },

  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },

  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },

  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, "__builtin_ia32_cvtsi2sd", IX86_BUILTIN_CVTSI2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_SI },
  { OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse2_cvtsi2sdq, "__builtin_ia32_cvtsi642sd", IX86_BUILTIN_CVTSI642SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_DI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsd2ss, "__builtin_ia32_cvtsd2ss", IX86_BUILTIN_CVTSD2SS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V2DF },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtss2sd, "__builtin_ia32_cvtss2sd", IX86_BUILTIN_CVTSS2SD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V4SF },

  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_ashlv1ti3, "__builtin_ia32_pslldqi128", IX86_BUILTIN_PSLLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllwi128", IX86_BUILTIN_PSLLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslldi128", IX86_BUILTIN_PSLLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllqi128", IX86_BUILTIN_PSLLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, "__builtin_ia32_psllw128", IX86_BUILTIN_PSLLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, "__builtin_ia32_pslld128", IX86_BUILTIN_PSLLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, "__builtin_ia32_psllq128", IX86_BUILTIN_PSLLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },

  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_lshrv1ti3, "__builtin_ia32_psrldqi128", IX86_BUILTIN_PSRLDQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT_CONVERT },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlwi128", IX86_BUILTIN_PSRLWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrldi128", IX86_BUILTIN_PSRLDI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlqi128", IX86_BUILTIN_PSRLQI128, UNKNOWN, (int) V2DI_FTYPE_V2DI_SI_COUNT },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, "__builtin_ia32_psrlw128", IX86_BUILTIN_PSRLW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, "__builtin_ia32_psrld128", IX86_BUILTIN_PSRLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, "__builtin_ia32_psrlq128", IX86_BUILTIN_PSRLQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_COUNT },

  { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psrawi128", IX86_BUILTIN_PSRAWI128, UNKNOWN, (int) V8HI_FTYPE_V8HI_SI_COUNT },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psradi128", IX86_BUILTIN_PSRADI128, UNKNOWN, (int) V4SI_FTYPE_V4SI_SI_COUNT },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, "__builtin_ia32_psraw128", IX86_BUILTIN_PSRAW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_COUNT },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, "__builtin_ia32_psrad128", IX86_BUILTIN_PSRAD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI_COUNT },

  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufd, "__builtin_ia32_pshufd", IX86_BUILTIN_PSHUFD, UNKNOWN, (int) V4SI_FTYPE_V4SI_INT },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshuflw, "__builtin_ia32_pshuflw", IX86_BUILTIN_PSHUFLW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pshufhw, "__builtin_ia32_pshufhw", IX86_BUILTIN_PSHUFHW, UNKNOWN, (int) V8HI_FTYPE_V8HI_INT },

  { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_vmsqrtv2df2, "__builtin_ia32_sqrtsd", IX86_BUILTIN_SQRTSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_VEC_MERGE },

  { OPTION_MASK_ISA_SSE2, CODE_FOR_abstf2, 0, IX86_BUILTIN_FABSQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128 },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_copysigntf3, 0, IX86_BUILTIN_COPYSIGNQ, UNKNOWN, (int) FLOAT128_FTYPE_FLOAT128_FLOAT128 },

  { OPTION_MASK_ISA_SSE, CODE_FOR_sse2_movq128, "__builtin_ia32_movq128", IX86_BUILTIN_MOVQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
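
  /* SSE2 MMX */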
  { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI },
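
  /* SSE3 */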
  { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },
  { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF },

  { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
  { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
  { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
  { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
  { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF },
  { OPTION_MASK_ISA_SSE3, CODE_FOR_sse3_hsubv2df3, "__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF },
24645 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv16qi2, "__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI },
24646 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI },
24647 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },
24648 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI },
24649 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
24650 { OPTION_MASK_ISA_SSSE3, CODE_FOR_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI },
24652 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv8hi3, "__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
24653 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
24654 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv4si3, "__builtin_ia32_phaddd128", IX86_BUILTIN_PHADDD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
24655 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phadddv2si3, "__builtin_ia32_phaddd", IX86_BUILTIN_PHADDD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
24656 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv8hi3, "__builtin_ia32_phaddsw128", IX86_BUILTIN_PHADDSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
24657 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phaddswv4hi3, "__builtin_ia32_phaddsw", IX86_BUILTIN_PHADDSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
24658 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv8hi3, "__builtin_ia32_phsubw128", IX86_BUILTIN_PHSUBW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
24659 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubwv4hi3, "__builtin_ia32_phsubw", IX86_BUILTIN_PHSUBW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
24660 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv4si3, "__builtin_ia32_phsubd128", IX86_BUILTIN_PHSUBD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
24661 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubdv2si3, "__builtin_ia32_phsubd", IX86_BUILTIN_PHSUBD, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
24662 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv8hi3, "__builtin_ia32_phsubsw128", IX86_BUILTIN_PHSUBSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
24663 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_phsubswv4hi3, "__builtin_ia32_phsubsw", IX86_BUILTIN_PHSUBSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
24664 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw128, "__builtin_ia32_pmaddubsw128", IX86_BUILTIN_PMADDUBSW128, UNKNOWN, (int) V8HI_FTYPE_V16QI_V16QI },
24665 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmaddubsw, "__builtin_ia32_pmaddubsw", IX86_BUILTIN_PMADDUBSW, UNKNOWN, (int) V4HI_FTYPE_V8QI_V8QI },
24666 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv8hi3, "__builtin_ia32_pmulhrsw128", IX86_BUILTIN_PMULHRSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
24667 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pmulhrswv4hi3, "__builtin_ia32_pmulhrsw", IX86_BUILTIN_PMULHRSW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
24668 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv16qi3, "__builtin_ia32_pshufb128", IX86_BUILTIN_PSHUFB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
24669 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_pshufbv8qi3, "__builtin_ia32_pshufb", IX86_BUILTIN_PSHUFB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
24670 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv16qi3, "__builtin_ia32_psignb128", IX86_BUILTIN_PSIGNB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
24671 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8qi3, "__builtin_ia32_psignb", IX86_BUILTIN_PSIGNB, UNKNOWN, (int) V8QI_FTYPE_V8QI_V8QI },
24672 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv8hi3, "__builtin_ia32_psignw128", IX86_BUILTIN_PSIGNW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
24673 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4hi3, "__builtin_ia32_psignw", IX86_BUILTIN_PSIGNW, UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI },
24674 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv4si3, "__builtin_ia32_psignd128", IX86_BUILTIN_PSIGND128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
24675 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_psignv2si3, "__builtin_ia32_psignd", IX86_BUILTIN_PSIGND, UNKNOWN, (int) V2SI_FTYPE_V2SI_V2SI },
24678 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrti, "__builtin_ia32_palignr128", IX86_BUILTIN_PALIGNR128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT_CONVERT },
24679 { OPTION_MASK_ISA_SSSE3, CODE_FOR_ssse3_palignrdi, "__builtin_ia32_palignr", IX86_BUILTIN_PALIGNR, UNKNOWN, (int) V1DI_FTYPE_V1DI_V1DI_INT_CONVERT },
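
/* The shift count taken by the two palignr builtins above is in bits,
   not bytes (hence the TImode/DImode shift patterns behind them); a
   wrapper macro in the style of <tmmintrin.h> therefore scales its byte
   offset, roughly (an illustrative sketch, not a definition made here):

     #define _mm_alignr_epi8(X, Y, N) \
       ((__m128i) __builtin_ia32_palignr128 ((__v2di)(X), (__v2di)(Y), (N) * 8))
*/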

  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF },
  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF },
  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT },
  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI },
  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT },

  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4qiv4si2, "__builtin_ia32_pmovsxbd128", IX86_BUILTIN_PMOVSXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2qiv2di2, "__builtin_ia32_pmovsxbq128", IX86_BUILTIN_PMOVSXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv4hiv4si2, "__builtin_ia32_pmovsxwd128", IX86_BUILTIN_PMOVSXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2hiv2di2, "__builtin_ia32_pmovsxwq128", IX86_BUILTIN_PMOVSXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_sign_extendv2siv2di2, "__builtin_ia32_pmovsxdq128", IX86_BUILTIN_PMOVSXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv8qiv8hi2, "__builtin_ia32_pmovzxbw128", IX86_BUILTIN_PMOVZXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI },
  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4qiv4si2, "__builtin_ia32_pmovzxbd128", IX86_BUILTIN_PMOVZXBD128, UNKNOWN, (int) V4SI_FTYPE_V16QI },
  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2qiv2di2, "__builtin_ia32_pmovzxbq128", IX86_BUILTIN_PMOVZXBQ128, UNKNOWN, (int) V2DI_FTYPE_V16QI },
  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv4hiv4si2, "__builtin_ia32_pmovzxwd128", IX86_BUILTIN_PMOVZXWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI },
  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2hiv2di2, "__builtin_ia32_pmovzxwq128", IX86_BUILTIN_PMOVZXWQ128, UNKNOWN, (int) V2DI_FTYPE_V8HI },
  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_zero_extendv2siv2di2, "__builtin_ia32_pmovzxdq128", IX86_BUILTIN_PMOVZXDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI },
  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_phminposuw, "__builtin_ia32_phminposuw128", IX86_BUILTIN_PHMINPOSUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI },

  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_packusdw, "__builtin_ia32_packusdw128", IX86_BUILTIN_PACKUSDW128, UNKNOWN, (int) V8HI_FTYPE_V4SI_V4SI },
  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_eqv2di3, "__builtin_ia32_pcmpeqq", IX86_BUILTIN_PCMPEQQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv16qi3, "__builtin_ia32_pmaxsb128", IX86_BUILTIN_PMAXSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_smaxv4si3, "__builtin_ia32_pmaxsd128", IX86_BUILTIN_PMAXSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv4si3, "__builtin_ia32_pmaxud128", IX86_BUILTIN_PMAXUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_umaxv8hi3, "__builtin_ia32_pmaxuw128", IX86_BUILTIN_PMAXUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv16qi3, "__builtin_ia32_pminsb128", IX86_BUILTIN_PMINSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI },
  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sminv4si3, "__builtin_ia32_pminsd128", IX86_BUILTIN_PMINSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
  { OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },

  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_roundpd", IX86_BUILTIN_ROUNDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_roundps", IX86_BUILTIN_ROUNDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundsd, "__builtin_ia32_roundsd", IX86_BUILTIN_ROUNDSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundss, "__builtin_ia32_roundss", IX86_BUILTIN_ROUNDSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },

  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_floorpd", IX86_BUILTIN_FLOORPD, (enum rtx_code) ROUND_FLOOR, (int) V2DF_FTYPE_V2DF_ROUND },
  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_ceilpd", IX86_BUILTIN_CEILPD, (enum rtx_code) ROUND_CEIL, (int) V2DF_FTYPE_V2DF_ROUND },
  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_truncpd", IX86_BUILTIN_TRUNCPD, (enum rtx_code) ROUND_TRUNC, (int) V2DF_FTYPE_V2DF_ROUND },
  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundpd, "__builtin_ia32_rintpd", IX86_BUILTIN_RINTPD, (enum rtx_code) ROUND_MXCSR, (int) V2DF_FTYPE_V2DF_ROUND },

  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_floorps", IX86_BUILTIN_FLOORPS, (enum rtx_code) ROUND_FLOOR, (int) V4SF_FTYPE_V4SF_ROUND },
  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_ceilps", IX86_BUILTIN_CEILPS, (enum rtx_code) ROUND_CEIL, (int) V4SF_FTYPE_V4SF_ROUND },
  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_truncps", IX86_BUILTIN_TRUNCPS, (enum rtx_code) ROUND_TRUNC, (int) V4SF_FTYPE_V4SF_ROUND },
  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_roundps, "__builtin_ia32_rintps", IX86_BUILTIN_RINTPS, (enum rtx_code) ROUND_MXCSR, (int) V4SF_FTYPE_V4SF_ROUND },

  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST },
  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
  { OPTION_MASK_ISA_ROUND, CODE_FOR_sse4_1_ptest, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST },
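
/* For the three ptest entries above the comparison field selects which
   flag of PTEST the builtin returns: EQ tests ZF (set when (a & b) == 0),
   LTU tests CF (set when (~a & b) == 0), and GTU tests that neither flag
   is set.  A sketch of the intended use (operand names illustrative):

     int disjoint = __builtin_ia32_ptestz128 ((__v2di) a, (__v2di) b);
*/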

  { OPTION_MASK_ISA_SSE4_2, CODE_FOR_sse4_2_gtv2di3, "__builtin_ia32_pcmpgtq", IX86_BUILTIN_PCMPGTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
  { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32qi, "__builtin_ia32_crc32qi", IX86_BUILTIN_CRC32QI, UNKNOWN, (int) UINT_FTYPE_UINT_UCHAR },
  { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32hi, "__builtin_ia32_crc32hi", IX86_BUILTIN_CRC32HI, UNKNOWN, (int) UINT_FTYPE_UINT_USHORT },
  { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32, CODE_FOR_sse4_2_crc32si, "__builtin_ia32_crc32si", IX86_BUILTIN_CRC32SI, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
  { OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32 | OPTION_MASK_ISA_64BIT, CODE_FOR_sse4_2_crc32di, "__builtin_ia32_crc32di", IX86_BUILTIN_CRC32DI, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },

  { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrqi, "__builtin_ia32_extrqi", IX86_BUILTIN_EXTRQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_UINT_UINT },
  { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_extrq, "__builtin_ia32_extrq", IX86_BUILTIN_EXTRQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V16QI },
  { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertqi, "__builtin_ia32_insertqi", IX86_BUILTIN_INSERTQI, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_UINT_UINT },
  { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_insertq, "__builtin_ia32_insertq", IX86_BUILTIN_INSERTQ, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },

  { OPTION_MASK_ISA_SSE2, CODE_FOR_aeskeygenassist, 0, IX86_BUILTIN_AESKEYGENASSIST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_INT },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_aesimc, 0, IX86_BUILTIN_AESIMC128, UNKNOWN, (int) V2DI_FTYPE_V2DI },

  { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenc, 0, IX86_BUILTIN_AESENC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_aesenclast, 0, IX86_BUILTIN_AESENCLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdec, 0, IX86_BUILTIN_AESDEC128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },
  { OPTION_MASK_ISA_SSE2, CODE_FOR_aesdeclast, 0, IX86_BUILTIN_AESDECLAST128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI },

  { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT },
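
/* The AES and PCLMUL entries above carry a null name on purpose: the
   generic registration loops in ix86_init_mmx_sse_builtins skip entries
   whose name is 0, and these builtins are instead defined by hand with
   def_builtin_const near the end of that function.  */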

  { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_andnotv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },

  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },

  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vmcmpv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF_INT },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtdq2pd256, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtdq2ps256, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvttpd2dq256, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvttps2dq256, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT },

  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF },

  { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF },

  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF },

  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT },

  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_floorpd256", IX86_BUILTIN_FLOORPD256, (enum rtx_code) ROUND_FLOOR, (int) V4DF_FTYPE_V4DF_ROUND },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_ceilpd256", IX86_BUILTIN_CEILPD256, (enum rtx_code) ROUND_CEIL, (int) V4DF_FTYPE_V4DF_ROUND },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_truncpd256", IX86_BUILTIN_TRUNCPD256, (enum rtx_code) ROUND_TRUNC, (int) V4DF_FTYPE_V4DF_ROUND },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_rintpd256", IX86_BUILTIN_RINTPD256, (enum rtx_code) ROUND_MXCSR, (int) V4DF_FTYPE_V4DF_ROUND },

  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_floorps256", IX86_BUILTIN_FLOORPS256, (enum rtx_code) ROUND_FLOOR, (int) V8SF_FTYPE_V8SF_ROUND },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_ceilps256", IX86_BUILTIN_CEILPS256, (enum rtx_code) ROUND_CEIL, (int) V8SF_FTYPE_V8SF_ROUND },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_truncps256", IX86_BUILTIN_TRUNCPS256, (enum rtx_code) ROUND_TRUNC, (int) V8SF_FTYPE_V8SF_ROUND },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_rintps256", IX86_BUILTIN_RINTPS256, (enum rtx_code) ROUND_MXCSR, (int) V8SF_FTYPE_V8SF_ROUND },

  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },

  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8si, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI },
  { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v8sf, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_vec_extract_lo_v4df, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF },

  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST },

  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF },

  { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv8sf3, "__builtin_ia32_copysignps256", IX86_BUILTIN_CPYSGNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
  { OPTION_MASK_ISA_AVX, CODE_FOR_copysignv4df3, "__builtin_ia32_copysignpd256", IX86_BUILTIN_CPYSGNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF },

  { OPTION_MASK_ISA_ABM, CODE_FOR_clzhi2_abm, "__builtin_clzs", IX86_BUILTIN_CLZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },

  { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_si, "__builtin_ia32_bextr_u32", IX86_BUILTIN_BEXTR32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
  { OPTION_MASK_ISA_BMI, CODE_FOR_bmi_bextr_di, "__builtin_ia32_bextr_u64", IX86_BUILTIN_BEXTR64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },
  { OPTION_MASK_ISA_BMI, CODE_FOR_ctzhi2, "__builtin_ctzs", IX86_BUILTIN_CTZS, UNKNOWN, (int) UINT16_FTYPE_UINT16 },

  { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_si, "__builtin_ia32_bextri_u32", IX86_BUILTIN_BEXTRI32, UNKNOWN, (int) UINT_FTYPE_UINT_UINT },
  { OPTION_MASK_ISA_TBM, CODE_FOR_tbm_bextri_di, "__builtin_ia32_bextri_u64", IX86_BUILTIN_BEXTRI64, UNKNOWN, (int) UINT64_FTYPE_UINT64_UINT64 },

  { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps, "__builtin_ia32_vcvtph2ps", IX86_BUILTIN_CVTPH2PS, UNKNOWN, (int) V4SF_FTYPE_V8HI },
  { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtph2ps256, "__builtin_ia32_vcvtph2ps256", IX86_BUILTIN_CVTPH2PS256, UNKNOWN, (int) V8SF_FTYPE_V8HI },
  { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph, "__builtin_ia32_vcvtps2ph", IX86_BUILTIN_CVTPS2PH, UNKNOWN, (int) V8HI_FTYPE_V4SF_INT },
  { OPTION_MASK_ISA_F16C, CODE_FOR_vcvtps2ph256, "__builtin_ia32_vcvtps2ph256", IX86_BUILTIN_CVTPS2PH256, UNKNOWN, (int) V8HI_FTYPE_V8SF_INT },
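
/* Sketch of how the F16C conversions above pair up (types taken from the
   table; the variable names and the 0x04 rounding value, nominally
   _MM_FROUND_CUR_DIRECTION, are illustrative assumptions):

     __v8hi half = __builtin_ia32_vcvtps2ph (four_floats, 0x04);
     __v4sf back = __builtin_ia32_vcvtph2ps (half);

   where the second operand of vcvtps2ph is the rounding-control
   immediate.  */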

/* FMA4 and XOP.  */
#define MULTI_ARG_4_DF2_DI_I	V2DF_FTYPE_V2DF_V2DF_V2DI_INT
#define MULTI_ARG_4_DF2_DI_I1	V4DF_FTYPE_V4DF_V4DF_V4DI_INT
#define MULTI_ARG_4_SF2_SI_I	V4SF_FTYPE_V4SF_V4SF_V4SI_INT
#define MULTI_ARG_4_SF2_SI_I1	V8SF_FTYPE_V8SF_V8SF_V8SI_INT
#define MULTI_ARG_3_SF		V4SF_FTYPE_V4SF_V4SF_V4SF
#define MULTI_ARG_3_DF		V2DF_FTYPE_V2DF_V2DF_V2DF
#define MULTI_ARG_3_SF2		V8SF_FTYPE_V8SF_V8SF_V8SF
#define MULTI_ARG_3_DF2		V4DF_FTYPE_V4DF_V4DF_V4DF
#define MULTI_ARG_3_DI		V2DI_FTYPE_V2DI_V2DI_V2DI
#define MULTI_ARG_3_SI		V4SI_FTYPE_V4SI_V4SI_V4SI
#define MULTI_ARG_3_SI_DI	V4SI_FTYPE_V4SI_V4SI_V2DI
#define MULTI_ARG_3_HI		V8HI_FTYPE_V8HI_V8HI_V8HI
#define MULTI_ARG_3_HI_SI	V8HI_FTYPE_V8HI_V8HI_V4SI
#define MULTI_ARG_3_QI		V16QI_FTYPE_V16QI_V16QI_V16QI
#define MULTI_ARG_3_DI2		V4DI_FTYPE_V4DI_V4DI_V4DI
#define MULTI_ARG_3_SI2		V8SI_FTYPE_V8SI_V8SI_V8SI
#define MULTI_ARG_3_HI2		V16HI_FTYPE_V16HI_V16HI_V16HI
#define MULTI_ARG_3_QI2		V32QI_FTYPE_V32QI_V32QI_V32QI
#define MULTI_ARG_2_SF		V4SF_FTYPE_V4SF_V4SF
#define MULTI_ARG_2_DF		V2DF_FTYPE_V2DF_V2DF
#define MULTI_ARG_2_DI		V2DI_FTYPE_V2DI_V2DI
#define MULTI_ARG_2_SI		V4SI_FTYPE_V4SI_V4SI
#define MULTI_ARG_2_HI		V8HI_FTYPE_V8HI_V8HI
#define MULTI_ARG_2_QI		V16QI_FTYPE_V16QI_V16QI
#define MULTI_ARG_2_DI_IMM	V2DI_FTYPE_V2DI_SI
#define MULTI_ARG_2_SI_IMM	V4SI_FTYPE_V4SI_SI
#define MULTI_ARG_2_HI_IMM	V8HI_FTYPE_V8HI_SI
#define MULTI_ARG_2_QI_IMM	V16QI_FTYPE_V16QI_SI
#define MULTI_ARG_2_DI_CMP	V2DI_FTYPE_V2DI_V2DI_CMP
#define MULTI_ARG_2_SI_CMP	V4SI_FTYPE_V4SI_V4SI_CMP
#define MULTI_ARG_2_HI_CMP	V8HI_FTYPE_V8HI_V8HI_CMP
#define MULTI_ARG_2_QI_CMP	V16QI_FTYPE_V16QI_V16QI_CMP
#define MULTI_ARG_2_SF_TF	V4SF_FTYPE_V4SF_V4SF_TF
#define MULTI_ARG_2_DF_TF	V2DF_FTYPE_V2DF_V2DF_TF
#define MULTI_ARG_2_DI_TF	V2DI_FTYPE_V2DI_V2DI_TF
#define MULTI_ARG_2_SI_TF	V4SI_FTYPE_V4SI_V4SI_TF
#define MULTI_ARG_2_HI_TF	V8HI_FTYPE_V8HI_V8HI_TF
#define MULTI_ARG_2_QI_TF	V16QI_FTYPE_V16QI_V16QI_TF
#define MULTI_ARG_1_SF		V4SF_FTYPE_V4SF
#define MULTI_ARG_1_DF		V2DF_FTYPE_V2DF
#define MULTI_ARG_1_SF2		V8SF_FTYPE_V8SF
#define MULTI_ARG_1_DF2		V4DF_FTYPE_V4DF
#define MULTI_ARG_1_DI		V2DI_FTYPE_V2DI
#define MULTI_ARG_1_SI		V4SI_FTYPE_V4SI
#define MULTI_ARG_1_HI		V8HI_FTYPE_V8HI
#define MULTI_ARG_1_QI		V16QI_FTYPE_V16QI
#define MULTI_ARG_1_SI_DI	V2DI_FTYPE_V4SI
#define MULTI_ARG_1_HI_DI	V2DI_FTYPE_V8HI
#define MULTI_ARG_1_HI_SI	V4SI_FTYPE_V8HI
#define MULTI_ARG_1_QI_DI	V2DI_FTYPE_V16QI
#define MULTI_ARG_1_QI_SI	V4SI_FTYPE_V16QI
#define MULTI_ARG_1_QI_HI	V8HI_FTYPE_V16QI
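
/* Reading the aliases above: MULTI_ARG_<n>_<suffix> names the function
   type of an <n>-operand builtin, e.g. MULTI_ARG_3_SF stands for
   V4SF_FTYPE_V4SF_V4SF_V4SF, the type of a three-operand single-float
   builtin such as __builtin_ia32_vfmaddss below; the _CMP and _TF
   variants append the extra comparison operand used by the vpcom*
   entries.  */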

static const struct builtin_description bdesc_multi_arg[] =
{
  { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v4sf,
    "__builtin_ia32_vfmaddss", IX86_BUILTIN_VFMADDSS,
    UNKNOWN, (int)MULTI_ARG_3_SF },
  { OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_vmfmadd_v2df,
    "__builtin_ia32_vfmaddsd", IX86_BUILTIN_VFMADDSD,
    UNKNOWN, (int)MULTI_ARG_3_DF },

  { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4sf,
    "__builtin_ia32_vfmaddps", IX86_BUILTIN_VFMADDPS,
    UNKNOWN, (int)MULTI_ARG_3_SF },
  { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v2df,
    "__builtin_ia32_vfmaddpd", IX86_BUILTIN_VFMADDPD,
    UNKNOWN, (int)MULTI_ARG_3_DF },
  { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v8sf,
    "__builtin_ia32_vfmaddps256", IX86_BUILTIN_VFMADDPS256,
    UNKNOWN, (int)MULTI_ARG_3_SF2 },
  { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fma4i_fmadd_v4df,
    "__builtin_ia32_vfmaddpd256", IX86_BUILTIN_VFMADDPD256,
    UNKNOWN, (int)MULTI_ARG_3_DF2 },

  { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4sf,
    "__builtin_ia32_vfmaddsubps", IX86_BUILTIN_VFMADDSUBPS,
    UNKNOWN, (int)MULTI_ARG_3_SF },
  { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v2df,
    "__builtin_ia32_vfmaddsubpd", IX86_BUILTIN_VFMADDSUBPD,
    UNKNOWN, (int)MULTI_ARG_3_DF },
  { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v8sf,
    "__builtin_ia32_vfmaddsubps256", IX86_BUILTIN_VFMADDSUBPS256,
    UNKNOWN, (int)MULTI_ARG_3_SF2 },
  { OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4, CODE_FOR_fmaddsub_v4df,
    "__builtin_ia32_vfmaddsubpd256", IX86_BUILTIN_VFMADDSUBPD256,
    UNKNOWN, (int)MULTI_ARG_3_DF2 },

  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov", IX86_BUILTIN_VPCMOV, UNKNOWN, (int)MULTI_ARG_3_DI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2di, "__builtin_ia32_vpcmov_v2di", IX86_BUILTIN_VPCMOV_V2DI, UNKNOWN, (int)MULTI_ARG_3_DI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4si, "__builtin_ia32_vpcmov_v4si", IX86_BUILTIN_VPCMOV_V4SI, UNKNOWN, (int)MULTI_ARG_3_SI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8hi, "__builtin_ia32_vpcmov_v8hi", IX86_BUILTIN_VPCMOV_V8HI, UNKNOWN, (int)MULTI_ARG_3_HI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16qi, "__builtin_ia32_vpcmov_v16qi", IX86_BUILTIN_VPCMOV_V16QI, UNKNOWN, (int)MULTI_ARG_3_QI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v2df, "__builtin_ia32_vpcmov_v2df", IX86_BUILTIN_VPCMOV_V2DF, UNKNOWN, (int)MULTI_ARG_3_DF },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4sf, "__builtin_ia32_vpcmov_v4sf", IX86_BUILTIN_VPCMOV_V4SF, UNKNOWN, (int)MULTI_ARG_3_SF },

  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov256", IX86_BUILTIN_VPCMOV256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4di256, "__builtin_ia32_vpcmov_v4di256", IX86_BUILTIN_VPCMOV_V4DI256, UNKNOWN, (int)MULTI_ARG_3_DI2 },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8si256, "__builtin_ia32_vpcmov_v8si256", IX86_BUILTIN_VPCMOV_V8SI256, UNKNOWN, (int)MULTI_ARG_3_SI2 },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v16hi256, "__builtin_ia32_vpcmov_v16hi256", IX86_BUILTIN_VPCMOV_V16HI256, UNKNOWN, (int)MULTI_ARG_3_HI2 },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v32qi256, "__builtin_ia32_vpcmov_v32qi256", IX86_BUILTIN_VPCMOV_V32QI256, UNKNOWN, (int)MULTI_ARG_3_QI2 },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v4df256, "__builtin_ia32_vpcmov_v4df256", IX86_BUILTIN_VPCMOV_V4DF256, UNKNOWN, (int)MULTI_ARG_3_DF2 },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcmov_v8sf256, "__builtin_ia32_vpcmov_v8sf256", IX86_BUILTIN_VPCMOV_V8SF256, UNKNOWN, (int)MULTI_ARG_3_SF2 },

  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pperm, "__builtin_ia32_vpperm", IX86_BUILTIN_VPPERM, UNKNOWN, (int)MULTI_ARG_3_QI },

  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssww, "__builtin_ia32_vpmacssww", IX86_BUILTIN_VPMACSSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsww, "__builtin_ia32_vpmacsww", IX86_BUILTIN_VPMACSWW, UNKNOWN, (int)MULTI_ARG_3_HI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsswd, "__builtin_ia32_vpmacsswd", IX86_BUILTIN_VPMACSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacswd, "__builtin_ia32_vpmacswd", IX86_BUILTIN_VPMACSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdd, "__builtin_ia32_vpmacssdd", IX86_BUILTIN_VPMACSSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdd, "__builtin_ia32_vpmacsdd", IX86_BUILTIN_VPMACSDD, UNKNOWN, (int)MULTI_ARG_3_SI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdql, "__builtin_ia32_vpmacssdql", IX86_BUILTIN_VPMACSSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacssdqh, "__builtin_ia32_vpmacssdqh", IX86_BUILTIN_VPMACSSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdql, "__builtin_ia32_vpmacsdql", IX86_BUILTIN_VPMACSDQL, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmacsdqh, "__builtin_ia32_vpmacsdqh", IX86_BUILTIN_VPMACSDQH, UNKNOWN, (int)MULTI_ARG_3_SI_DI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcsswd, "__builtin_ia32_vpmadcsswd", IX86_BUILTIN_VPMADCSSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pmadcswd, "__builtin_ia32_vpmadcswd", IX86_BUILTIN_VPMADCSWD, UNKNOWN, (int)MULTI_ARG_3_HI_SI },

  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv2di3, "__builtin_ia32_vprotq", IX86_BUILTIN_VPROTQ, UNKNOWN, (int)MULTI_ARG_2_DI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv4si3, "__builtin_ia32_vprotd", IX86_BUILTIN_VPROTD, UNKNOWN, (int)MULTI_ARG_2_SI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv8hi3, "__builtin_ia32_vprotw", IX86_BUILTIN_VPROTW, UNKNOWN, (int)MULTI_ARG_2_HI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vrotlv16qi3, "__builtin_ia32_vprotb", IX86_BUILTIN_VPROTB, UNKNOWN, (int)MULTI_ARG_2_QI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv2di3, "__builtin_ia32_vprotqi", IX86_BUILTIN_VPROTQ_IMM, UNKNOWN, (int)MULTI_ARG_2_DI_IMM },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv4si3, "__builtin_ia32_vprotdi", IX86_BUILTIN_VPROTD_IMM, UNKNOWN, (int)MULTI_ARG_2_SI_IMM },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv8hi3, "__builtin_ia32_vprotwi", IX86_BUILTIN_VPROTW_IMM, UNKNOWN, (int)MULTI_ARG_2_HI_IMM },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_rotlv16qi3, "__builtin_ia32_vprotbi", IX86_BUILTIN_VPROTB_IMM, UNKNOWN, (int)MULTI_ARG_2_QI_IMM },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_ashlv2di3, "__builtin_ia32_vpshaq", IX86_BUILTIN_VPSHAQ, UNKNOWN, (int)MULTI_ARG_2_DI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_ashlv4si3, "__builtin_ia32_vpshad", IX86_BUILTIN_VPSHAD, UNKNOWN, (int)MULTI_ARG_2_SI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_ashlv8hi3, "__builtin_ia32_vpshaw", IX86_BUILTIN_VPSHAW, UNKNOWN, (int)MULTI_ARG_2_HI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_ashlv16qi3, "__builtin_ia32_vpshab", IX86_BUILTIN_VPSHAB, UNKNOWN, (int)MULTI_ARG_2_QI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_lshlv2di3, "__builtin_ia32_vpshlq", IX86_BUILTIN_VPSHLQ, UNKNOWN, (int)MULTI_ARG_2_DI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_lshlv4si3, "__builtin_ia32_vpshld", IX86_BUILTIN_VPSHLD, UNKNOWN, (int)MULTI_ARG_2_SI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_lshlv8hi3, "__builtin_ia32_vpshlw", IX86_BUILTIN_VPSHLW, UNKNOWN, (int)MULTI_ARG_2_HI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_lshlv16qi3, "__builtin_ia32_vpshlb", IX86_BUILTIN_VPSHLB, UNKNOWN, (int)MULTI_ARG_2_QI },

  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv4sf2, "__builtin_ia32_vfrczss", IX86_BUILTIN_VFRCZSS, UNKNOWN, (int)MULTI_ARG_2_SF },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vmfrczv2df2, "__builtin_ia32_vfrczsd", IX86_BUILTIN_VFRCZSD, UNKNOWN, (int)MULTI_ARG_2_DF },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4sf2, "__builtin_ia32_vfrczps", IX86_BUILTIN_VFRCZPS, UNKNOWN, (int)MULTI_ARG_1_SF },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv2df2, "__builtin_ia32_vfrczpd", IX86_BUILTIN_VFRCZPD, UNKNOWN, (int)MULTI_ARG_1_DF },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv8sf2, "__builtin_ia32_vfrczps256", IX86_BUILTIN_VFRCZPS256, UNKNOWN, (int)MULTI_ARG_1_SF2 },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_frczv4df2, "__builtin_ia32_vfrczpd256", IX86_BUILTIN_VFRCZPD256, UNKNOWN, (int)MULTI_ARG_1_DF2 },

  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbw, "__builtin_ia32_vphaddbw", IX86_BUILTIN_VPHADDBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbd, "__builtin_ia32_vphaddbd", IX86_BUILTIN_VPHADDBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddbq, "__builtin_ia32_vphaddbq", IX86_BUILTIN_VPHADDBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwd, "__builtin_ia32_vphaddwd", IX86_BUILTIN_VPHADDWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddwq, "__builtin_ia32_vphaddwq", IX86_BUILTIN_VPHADDWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadddq, "__builtin_ia32_vphadddq", IX86_BUILTIN_VPHADDDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubw, "__builtin_ia32_vphaddubw", IX86_BUILTIN_VPHADDUBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubd, "__builtin_ia32_vphaddubd", IX86_BUILTIN_VPHADDUBD, UNKNOWN, (int)MULTI_ARG_1_QI_SI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddubq, "__builtin_ia32_vphaddubq", IX86_BUILTIN_VPHADDUBQ, UNKNOWN, (int)MULTI_ARG_1_QI_DI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwd, "__builtin_ia32_vphadduwd", IX86_BUILTIN_VPHADDUWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phadduwq, "__builtin_ia32_vphadduwq", IX86_BUILTIN_VPHADDUWQ, UNKNOWN, (int)MULTI_ARG_1_HI_DI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phaddudq, "__builtin_ia32_vphaddudq", IX86_BUILTIN_VPHADDUDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubbw, "__builtin_ia32_vphsubbw", IX86_BUILTIN_VPHSUBBW, UNKNOWN, (int)MULTI_ARG_1_QI_HI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubwd, "__builtin_ia32_vphsubwd", IX86_BUILTIN_VPHSUBWD, UNKNOWN, (int)MULTI_ARG_1_HI_SI },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_phsubdq, "__builtin_ia32_vphsubdq", IX86_BUILTIN_VPHSUBDQ, UNKNOWN, (int)MULTI_ARG_1_SI_DI },

  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomeqb", IX86_BUILTIN_VPCOMEQB, EQ, (int)MULTI_ARG_2_QI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomneqb", IX86_BUILTIN_VPCOMNEB, NE, (int)MULTI_ARG_2_QI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomltb", IX86_BUILTIN_VPCOMLTB, LT, (int)MULTI_ARG_2_QI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomleb", IX86_BUILTIN_VPCOMLEB, LE, (int)MULTI_ARG_2_QI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgtb", IX86_BUILTIN_VPCOMGTB, GT, (int)MULTI_ARG_2_QI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv16qi3, "__builtin_ia32_vpcomgeb", IX86_BUILTIN_VPCOMGEB, GE, (int)MULTI_ARG_2_QI_CMP },

  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomeqw", IX86_BUILTIN_VPCOMEQW, EQ, (int)MULTI_ARG_2_HI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomnew", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomneqw", IX86_BUILTIN_VPCOMNEW, NE, (int)MULTI_ARG_2_HI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomltw", IX86_BUILTIN_VPCOMLTW, LT, (int)MULTI_ARG_2_HI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomlew", IX86_BUILTIN_VPCOMLEW, LE, (int)MULTI_ARG_2_HI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgtw", IX86_BUILTIN_VPCOMGTW, GT, (int)MULTI_ARG_2_HI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv8hi3, "__builtin_ia32_vpcomgew", IX86_BUILTIN_VPCOMGEW, GE, (int)MULTI_ARG_2_HI_CMP },

  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomeqd", IX86_BUILTIN_VPCOMEQD, EQ, (int)MULTI_ARG_2_SI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomned", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomneqd", IX86_BUILTIN_VPCOMNED, NE, (int)MULTI_ARG_2_SI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomltd", IX86_BUILTIN_VPCOMLTD, LT, (int)MULTI_ARG_2_SI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomled", IX86_BUILTIN_VPCOMLED, LE, (int)MULTI_ARG_2_SI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomgtd", IX86_BUILTIN_VPCOMGTD, GT, (int)MULTI_ARG_2_SI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv4si3, "__builtin_ia32_vpcomged", IX86_BUILTIN_VPCOMGED, GE, (int)MULTI_ARG_2_SI_CMP },

  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomeqq", IX86_BUILTIN_VPCOMEQQ, EQ, (int)MULTI_ARG_2_DI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomneqq", IX86_BUILTIN_VPCOMNEQ, NE, (int)MULTI_ARG_2_DI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomltq", IX86_BUILTIN_VPCOMLTQ, LT, (int)MULTI_ARG_2_DI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomleq", IX86_BUILTIN_VPCOMLEQ, LE, (int)MULTI_ARG_2_DI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgtq", IX86_BUILTIN_VPCOMGTQ, GT, (int)MULTI_ARG_2_DI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmpv2di3, "__builtin_ia32_vpcomgeq", IX86_BUILTIN_VPCOMGEQ, GE, (int)MULTI_ARG_2_DI_CMP },

  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3, "__builtin_ia32_vpcomequb", IX86_BUILTIN_VPCOMEQUB, EQ, (int)MULTI_ARG_2_QI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3, "__builtin_ia32_vpcomneub", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v16qi3, "__builtin_ia32_vpcomnequb", IX86_BUILTIN_VPCOMNEUB, NE, (int)MULTI_ARG_2_QI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomltub", IX86_BUILTIN_VPCOMLTUB, LTU, (int)MULTI_ARG_2_QI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomleub", IX86_BUILTIN_VPCOMLEUB, LEU, (int)MULTI_ARG_2_QI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgtub", IX86_BUILTIN_VPCOMGTUB, GTU, (int)MULTI_ARG_2_QI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv16qi3, "__builtin_ia32_vpcomgeub", IX86_BUILTIN_VPCOMGEUB, GEU, (int)MULTI_ARG_2_QI_CMP },

  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomequw", IX86_BUILTIN_VPCOMEQUW, EQ, (int)MULTI_ARG_2_HI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomneuw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v8hi3, "__builtin_ia32_vpcomnequw", IX86_BUILTIN_VPCOMNEUW, NE, (int)MULTI_ARG_2_HI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomltuw", IX86_BUILTIN_VPCOMLTUW, LTU, (int)MULTI_ARG_2_HI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomleuw", IX86_BUILTIN_VPCOMLEUW, LEU, (int)MULTI_ARG_2_HI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgtuw", IX86_BUILTIN_VPCOMGTUW, GTU, (int)MULTI_ARG_2_HI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv8hi3, "__builtin_ia32_vpcomgeuw", IX86_BUILTIN_VPCOMGEUW, GEU, (int)MULTI_ARG_2_HI_CMP },

  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomequd", IX86_BUILTIN_VPCOMEQUD, EQ, (int)MULTI_ARG_2_SI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomneud", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v4si3, "__builtin_ia32_vpcomnequd", IX86_BUILTIN_VPCOMNEUD, NE, (int)MULTI_ARG_2_SI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomltud", IX86_BUILTIN_VPCOMLTUD, LTU, (int)MULTI_ARG_2_SI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomleud", IX86_BUILTIN_VPCOMLEUD, LEU, (int)MULTI_ARG_2_SI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgtud", IX86_BUILTIN_VPCOMGTUD, GTU, (int)MULTI_ARG_2_SI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv4si3, "__builtin_ia32_vpcomgeud", IX86_BUILTIN_VPCOMGEUD, GEU, (int)MULTI_ARG_2_SI_CMP },

  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomequq", IX86_BUILTIN_VPCOMEQUQ, EQ, (int)MULTI_ARG_2_DI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomneuq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_uns2v2di3, "__builtin_ia32_vpcomnequq", IX86_BUILTIN_VPCOMNEUQ, NE, (int)MULTI_ARG_2_DI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomltuq", IX86_BUILTIN_VPCOMLTUQ, LTU, (int)MULTI_ARG_2_DI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomleuq", IX86_BUILTIN_VPCOMLEUQ, LEU, (int)MULTI_ARG_2_DI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgtuq", IX86_BUILTIN_VPCOMGTUQ, GTU, (int)MULTI_ARG_2_DI_CMP },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_maskcmp_unsv2di3, "__builtin_ia32_vpcomgeuq", IX86_BUILTIN_VPCOMGEUQ, GEU, (int)MULTI_ARG_2_DI_CMP },

  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseb", IX86_BUILTIN_VPCOMFALSEB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalsew", IX86_BUILTIN_VPCOMFALSEW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalsed", IX86_BUILTIN_VPCOMFALSED, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseq", IX86_BUILTIN_VPCOMFALSEQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomfalseub", IX86_BUILTIN_VPCOMFALSEUB, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_QI_TF },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomfalseuw", IX86_BUILTIN_VPCOMFALSEUW, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_HI_TF },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomfalseud", IX86_BUILTIN_VPCOMFALSEUD, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_SI_TF },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomfalseuq", IX86_BUILTIN_VPCOMFALSEUQ, (enum rtx_code) PCOM_FALSE, (int)MULTI_ARG_2_DI_TF },

  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueb", IX86_BUILTIN_VPCOMTRUEB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtruew", IX86_BUILTIN_VPCOMTRUEW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrued", IX86_BUILTIN_VPCOMTRUED, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueq", IX86_BUILTIN_VPCOMTRUEQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv16qi3, "__builtin_ia32_vpcomtrueub", IX86_BUILTIN_VPCOMTRUEUB, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_QI_TF },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv8hi3, "__builtin_ia32_vpcomtrueuw", IX86_BUILTIN_VPCOMTRUEUW, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_HI_TF },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv4si3, "__builtin_ia32_vpcomtrueud", IX86_BUILTIN_VPCOMTRUEUD, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_SI_TF },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_pcom_tfv2di3, "__builtin_ia32_vpcomtrueuq", IX86_BUILTIN_VPCOMTRUEUQ, (enum rtx_code) PCOM_TRUE, (int)MULTI_ARG_2_DI_TF },

  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I },
  { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int)MULTI_ARG_4_DF2_DI_I1 },
25155 { OPTION_MASK_ISA_XOP, CODE_FOR_xop_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int)MULTI_ARG_4_SF2_SI_I1 },
25159 /* Set up all the MMX/SSE builtins, even builtins for instructions that are not
25160 in the current target ISA to allow the user to compile particular modules
25161 with different target specific options that differ from the command line options. */
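/* For illustration (a hedged sketch, not part of this file): with
   function-specific options, a translation unit compiled without
   -mxop may still contain

     __attribute__ ((target ("xop")))
     __m128i f (__m128i a, __m128i b) { return _mm_comeq_epu64 (a, b); }

   so every builtin must be registered up front; whether it may
   actually be used is checked later, in ix86_expand_builtin.  */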
25164 ix86_init_mmx_sse_builtins (void)
25166 const struct builtin_description * d;
25167 enum ix86_builtin_func_type ftype;
25170 /* Add all special builtins with variable number of operands. */
25171 for (i = 0, d = bdesc_special_args;
25172 i < ARRAY_SIZE (bdesc_special_args);
25178 ftype = (enum ix86_builtin_func_type) d->flag;
25179 def_builtin (d->mask, d->name, ftype, d->code);
25182 /* Add all builtins with variable number of operands. */
25183 for (i = 0, d = bdesc_args;
25184 i < ARRAY_SIZE (bdesc_args);
25190 ftype = (enum ix86_builtin_func_type) d->flag;
25191 def_builtin_const (d->mask, d->name, ftype, d->code);
25194 /* pcmpestr[im] insns. */
25195 for (i = 0, d = bdesc_pcmpestr;
25196 i < ARRAY_SIZE (bdesc_pcmpestr);
25199 if (d->code == IX86_BUILTIN_PCMPESTRM128)
25200 ftype = V16QI_FTYPE_V16QI_INT_V16QI_INT_INT;
25202 ftype = INT_FTYPE_V16QI_INT_V16QI_INT_INT;
25203 def_builtin_const (d->mask, d->name, ftype, d->code);
25206 /* pcmpistr[im] insns. */
25207 for (i = 0, d = bdesc_pcmpistr;
25208 i < ARRAY_SIZE (bdesc_pcmpistr);
25211 if (d->code == IX86_BUILTIN_PCMPISTRM128)
25212 ftype = V16QI_FTYPE_V16QI_V16QI_INT;
25214 ftype = INT_FTYPE_V16QI_V16QI_INT;
25215 def_builtin_const (d->mask, d->name, ftype, d->code);
25218 /* comi/ucomi insns. */
25219 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
25221 if (d->mask == OPTION_MASK_ISA_SSE2)
25222 ftype = INT_FTYPE_V2DF_V2DF;
25224 ftype = INT_FTYPE_V4SF_V4SF;
25225 def_builtin_const (d->mask, d->name, ftype, d->code);
25229 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_ldmxcsr",
25230 VOID_FTYPE_UNSIGNED, IX86_BUILTIN_LDMXCSR);
25231 def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_stmxcsr",
25232 UNSIGNED_FTYPE_VOID, IX86_BUILTIN_STMXCSR);
25234 /* SSE or 3DNow!A */
25235 def_builtin (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
25236 "__builtin_ia32_maskmovq", VOID_FTYPE_V8QI_V8QI_PCHAR,
25237 IX86_BUILTIN_MASKMOVQ);
25240 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_maskmovdqu",
25241 VOID_FTYPE_V16QI_V16QI_PCHAR, IX86_BUILTIN_MASKMOVDQU);
25243 def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_clflush",
25244 VOID_FTYPE_PCVOID, IX86_BUILTIN_CLFLUSH);
25245 x86_mfence = def_builtin (OPTION_MASK_ISA_SSE2, "__builtin_ia32_mfence",
25246 VOID_FTYPE_VOID, IX86_BUILTIN_MFENCE);
25249 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_monitor",
25250 VOID_FTYPE_PCVOID_UNSIGNED_UNSIGNED, IX86_BUILTIN_MONITOR);
25251 def_builtin (OPTION_MASK_ISA_SSE3, "__builtin_ia32_mwait",
25252 VOID_FTYPE_UNSIGNED_UNSIGNED, IX86_BUILTIN_MWAIT);
25255 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenc128",
25256 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENC128);
25257 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesenclast128",
25258 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESENCLAST128);
25259 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdec128",
25260 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDEC128);
25261 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesdeclast128",
25262 V2DI_FTYPE_V2DI_V2DI, IX86_BUILTIN_AESDECLAST128);
25263 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aesimc128",
25264 V2DI_FTYPE_V2DI, IX86_BUILTIN_AESIMC128);
25265 def_builtin_const (OPTION_MASK_ISA_AES, "__builtin_ia32_aeskeygenassist128",
25266 V2DI_FTYPE_V2DI_INT, IX86_BUILTIN_AESKEYGENASSIST128);
25269 def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128",
25270 V2DI_FTYPE_V2DI_V2DI_INT, IX86_BUILTIN_PCLMULQDQ128);
25273 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand16_step",
25274 INT_FTYPE_PUSHORT, IX86_BUILTIN_RDRAND16_STEP);
25275 def_builtin (OPTION_MASK_ISA_RDRND, "__builtin_ia32_rdrand32_step",
25276 INT_FTYPE_PUNSIGNED, IX86_BUILTIN_RDRAND32_STEP);
25277 def_builtin (OPTION_MASK_ISA_RDRND | OPTION_MASK_ISA_64BIT,
25278 "__builtin_ia32_rdrand64_step", INT_FTYPE_PULONGLONG,
25279 IX86_BUILTIN_RDRAND64_STEP);
25281 /* MMX access to the vec_init patterns. */
25282 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v2si",
25283 V2SI_FTYPE_INT_INT, IX86_BUILTIN_VEC_INIT_V2SI);
25285 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v4hi",
25286 V4HI_FTYPE_HI_HI_HI_HI,
25287 IX86_BUILTIN_VEC_INIT_V4HI);
25289 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_init_v8qi",
25290 V8QI_FTYPE_QI_QI_QI_QI_QI_QI_QI_QI,
25291 IX86_BUILTIN_VEC_INIT_V8QI);
25293 /* Access to the vec_extract patterns. */
25294 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2df",
25295 DOUBLE_FTYPE_V2DF_INT, IX86_BUILTIN_VEC_EXT_V2DF);
25296 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v2di",
25297 DI_FTYPE_V2DI_INT, IX86_BUILTIN_VEC_EXT_V2DI);
25298 def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_vec_ext_v4sf",
25299 FLOAT_FTYPE_V4SF_INT, IX86_BUILTIN_VEC_EXT_V4SF);
25300 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v4si",
25301 SI_FTYPE_V4SI_INT, IX86_BUILTIN_VEC_EXT_V4SI);
25302 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v8hi",
25303 HI_FTYPE_V8HI_INT, IX86_BUILTIN_VEC_EXT_V8HI);
25305 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
25306 "__builtin_ia32_vec_ext_v4hi",
25307 HI_FTYPE_V4HI_INT, IX86_BUILTIN_VEC_EXT_V4HI);
25309 def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_vec_ext_v2si",
25310 SI_FTYPE_V2SI_INT, IX86_BUILTIN_VEC_EXT_V2SI);
25312 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_ext_v16qi",
25313 QI_FTYPE_V16QI_INT, IX86_BUILTIN_VEC_EXT_V16QI);
25315 /* Access to the vec_set patterns. */
25316 def_builtin_const (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_64BIT,
25317 "__builtin_ia32_vec_set_v2di",
25318 V2DI_FTYPE_V2DI_DI_INT, IX86_BUILTIN_VEC_SET_V2DI);
25320 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4sf",
25321 V4SF_FTYPE_V4SF_FLOAT_INT, IX86_BUILTIN_VEC_SET_V4SF);
25323 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v4si",
25324 V4SI_FTYPE_V4SI_SI_INT, IX86_BUILTIN_VEC_SET_V4SI);
25326 def_builtin_const (OPTION_MASK_ISA_SSE2, "__builtin_ia32_vec_set_v8hi",
25327 V8HI_FTYPE_V8HI_HI_INT, IX86_BUILTIN_VEC_SET_V8HI);
25329 def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A,
25330 "__builtin_ia32_vec_set_v4hi",
25331 V4HI_FTYPE_V4HI_HI_INT, IX86_BUILTIN_VEC_SET_V4HI);
25333 def_builtin_const (OPTION_MASK_ISA_SSE4_1, "__builtin_ia32_vec_set_v16qi",
25334 V16QI_FTYPE_V16QI_QI_INT, IX86_BUILTIN_VEC_SET_V16QI);
25336 /* Add FMA4 and XOP multi-argument instructions. */
25337 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
25342 ftype = (enum ix86_builtin_func_type) d->flag;
25343 def_builtin_const (d->mask, d->name, ftype, d->code);
25347 /* Internal method for ix86_init_builtins. */
25350 ix86_init_builtins_va_builtins_abi (void)
25352 tree ms_va_ref, sysv_va_ref;
25353 tree fnvoid_va_end_ms, fnvoid_va_end_sysv;
25354 tree fnvoid_va_start_ms, fnvoid_va_start_sysv;
25355 tree fnvoid_va_copy_ms, fnvoid_va_copy_sysv;
25356 tree fnattr_ms = NULL_TREE, fnattr_sysv = NULL_TREE;
25360 fnattr_ms = build_tree_list (get_identifier ("ms_abi"), NULL_TREE);
25361 fnattr_sysv = build_tree_list (get_identifier ("sysv_abi"), NULL_TREE);
25362 ms_va_ref = build_reference_type (ms_va_list_type_node);
25364 build_pointer_type (TREE_TYPE (sysv_va_list_type_node));
25367 build_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
25368 fnvoid_va_start_ms =
25369 build_varargs_function_type_list (void_type_node, ms_va_ref, NULL_TREE);
25370 fnvoid_va_end_sysv =
25371 build_function_type_list (void_type_node, sysv_va_ref, NULL_TREE);
25372 fnvoid_va_start_sysv =
25373 build_varargs_function_type_list (void_type_node, sysv_va_ref,
25375 fnvoid_va_copy_ms =
25376 build_function_type_list (void_type_node, ms_va_ref, ms_va_list_type_node,
25378 fnvoid_va_copy_sysv =
25379 build_function_type_list (void_type_node, sysv_va_ref,
25380 sysv_va_ref, NULL_TREE);
25382 add_builtin_function ("__builtin_ms_va_start", fnvoid_va_start_ms,
25383 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_ms);
25384 add_builtin_function ("__builtin_ms_va_end", fnvoid_va_end_ms,
25385 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_ms);
25386 add_builtin_function ("__builtin_ms_va_copy", fnvoid_va_copy_ms,
25387 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_ms);
25388 add_builtin_function ("__builtin_sysv_va_start", fnvoid_va_start_sysv,
25389 BUILT_IN_VA_START, BUILT_IN_NORMAL, NULL, fnattr_sysv);
25390 add_builtin_function ("__builtin_sysv_va_end", fnvoid_va_end_sysv,
25391 BUILT_IN_VA_END, BUILT_IN_NORMAL, NULL, fnattr_sysv);
25392 add_builtin_function ("__builtin_sysv_va_copy", fnvoid_va_copy_sysv,
25393 BUILT_IN_VA_COPY, BUILT_IN_NORMAL, NULL, fnattr_sysv);
25397 ix86_init_builtin_types (void)
25399 tree float128_type_node, float80_type_node;
25401 /* The __float80 type. */
25402 float80_type_node = long_double_type_node;
25403 if (TYPE_MODE (float80_type_node) != XFmode)
25405 /* The __float80 type. */
25406 float80_type_node = make_node (REAL_TYPE);
25408 TYPE_PRECISION (float80_type_node) = 80;
25409 layout_type (float80_type_node);
25411 lang_hooks.types.register_builtin_type (float80_type_node, "__float80");
25413 /* The __float128 type. */
25414 float128_type_node = make_node (REAL_TYPE);
25415 TYPE_PRECISION (float128_type_node) = 128;
25416 layout_type (float128_type_node);
25417 lang_hooks.types.register_builtin_type (float128_type_node, "__float128");
25419 /* This macro is built by i386-builtin-types.awk. */
25420 DEFINE_BUILTIN_PRIMITIVE_TYPES;
25424 ix86_init_builtins (void)
25428 ix86_init_builtin_types ();
25430 /* TFmode support builtins. */
25431 def_builtin_const (0, "__builtin_infq",
25432 FLOAT128_FTYPE_VOID, IX86_BUILTIN_INFQ);
25433 def_builtin_const (0, "__builtin_huge_valq",
25434 FLOAT128_FTYPE_VOID, IX86_BUILTIN_HUGE_VALQ);
25436 /* We will expand them into normal calls if SSE2 isn't available, since
25437 they are used by libgcc. */
25438 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128);
25439 t = add_builtin_function ("__builtin_fabsq", t, IX86_BUILTIN_FABSQ,
25440 BUILT_IN_MD, "__fabstf2", NULL_TREE);
25441 TREE_READONLY (t) = 1;
25442 ix86_builtins[(int) IX86_BUILTIN_FABSQ] = t;
25444 t = ix86_get_builtin_func_type (FLOAT128_FTYPE_FLOAT128_FLOAT128);
25445 t = add_builtin_function ("__builtin_copysignq", t, IX86_BUILTIN_COPYSIGNQ,
25446 BUILT_IN_MD, "__copysigntf3", NULL_TREE);
25447 TREE_READONLY (t) = 1;
25448 ix86_builtins[(int) IX86_BUILTIN_COPYSIGNQ] = t;
25450 ix86_init_mmx_sse_builtins ();
25453 ix86_init_builtins_va_builtins_abi ();
25455 #ifdef SUBTARGET_INIT_BUILTINS
25456 SUBTARGET_INIT_BUILTINS;
25460 /* Return the ix86 builtin for CODE. */
25463 ix86_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
25465 if (code >= IX86_BUILTIN_MAX)
25466 return error_mark_node;
25468 return ix86_builtins[code];
25471 /* Errors in the source file can cause expand_expr to return const0_rtx
25472 where we expect a vector. To avoid crashing, use one of the vector
25473 clear instructions. */
25475 safe_vector_operand (rtx x, enum machine_mode mode)
25477 if (x == const0_rtx)
25478 x = CONST0_RTX (mode);
25482 /* Subroutine of ix86_expand_builtin to take care of binop insns. */
25485 ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
25488 tree arg0 = CALL_EXPR_ARG (exp, 0);
25489 tree arg1 = CALL_EXPR_ARG (exp, 1);
25490 rtx op0 = expand_normal (arg0);
25491 rtx op1 = expand_normal (arg1);
25492 enum machine_mode tmode = insn_data[icode].operand[0].mode;
25493 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
25494 enum machine_mode mode1 = insn_data[icode].operand[2].mode;
25496 if (VECTOR_MODE_P (mode0))
25497 op0 = safe_vector_operand (op0, mode0);
25498 if (VECTOR_MODE_P (mode1))
25499 op1 = safe_vector_operand (op1, mode1);
25501 if (optimize || !target
25502 || GET_MODE (target) != tmode
25503 || !insn_data[icode].operand[0].predicate (target, tmode))
25504 target = gen_reg_rtx (tmode);
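  /* A SImode operand feeding a TImode insn operand is widened by
     loading it into the low element of a V4SI register (the upper
     elements are zeroed) and then taking the TImode lowpart.  */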
25506 if (GET_MODE (op1) == SImode && mode1 == TImode)
25508 rtx x = gen_reg_rtx (V4SImode);
25509 emit_insn (gen_sse2_loadd (x, op1));
25510 op1 = gen_lowpart (TImode, x);
25513 if (!insn_data[icode].operand[1].predicate (op0, mode0))
25514 op0 = copy_to_mode_reg (mode0, op0);
25515 if (!insn_data[icode].operand[2].predicate (op1, mode1))
25516 op1 = copy_to_mode_reg (mode1, op1);
25518 pat = GEN_FCN (icode) (target, op0, op1);
25527 /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
25530 ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
25531 enum ix86_builtin_func_type m_type,
25532 enum rtx_code sub_code)
25537 bool comparison_p = false;
25539 bool last_arg_constant = false;
25540 int num_memory = 0;
25543 enum machine_mode mode;
25546 enum machine_mode tmode = insn_data[icode].operand[0].mode;
25550 case MULTI_ARG_4_DF2_DI_I:
25551 case MULTI_ARG_4_DF2_DI_I1:
25552 case MULTI_ARG_4_SF2_SI_I:
25553 case MULTI_ARG_4_SF2_SI_I1:
25555 last_arg_constant = true;
25558 case MULTI_ARG_3_SF:
25559 case MULTI_ARG_3_DF:
25560 case MULTI_ARG_3_SF2:
25561 case MULTI_ARG_3_DF2:
25562 case MULTI_ARG_3_DI:
25563 case MULTI_ARG_3_SI:
25564 case MULTI_ARG_3_SI_DI:
25565 case MULTI_ARG_3_HI:
25566 case MULTI_ARG_3_HI_SI:
25567 case MULTI_ARG_3_QI:
25568 case MULTI_ARG_3_DI2:
25569 case MULTI_ARG_3_SI2:
25570 case MULTI_ARG_3_HI2:
25571 case MULTI_ARG_3_QI2:
25575 case MULTI_ARG_2_SF:
25576 case MULTI_ARG_2_DF:
25577 case MULTI_ARG_2_DI:
25578 case MULTI_ARG_2_SI:
25579 case MULTI_ARG_2_HI:
25580 case MULTI_ARG_2_QI:
25584 case MULTI_ARG_2_DI_IMM:
25585 case MULTI_ARG_2_SI_IMM:
25586 case MULTI_ARG_2_HI_IMM:
25587 case MULTI_ARG_2_QI_IMM:
25589 last_arg_constant = true;
25592 case MULTI_ARG_1_SF:
25593 case MULTI_ARG_1_DF:
25594 case MULTI_ARG_1_SF2:
25595 case MULTI_ARG_1_DF2:
25596 case MULTI_ARG_1_DI:
25597 case MULTI_ARG_1_SI:
25598 case MULTI_ARG_1_HI:
25599 case MULTI_ARG_1_QI:
25600 case MULTI_ARG_1_SI_DI:
25601 case MULTI_ARG_1_HI_DI:
25602 case MULTI_ARG_1_HI_SI:
25603 case MULTI_ARG_1_QI_DI:
25604 case MULTI_ARG_1_QI_SI:
25605 case MULTI_ARG_1_QI_HI:
25609 case MULTI_ARG_2_DI_CMP:
25610 case MULTI_ARG_2_SI_CMP:
25611 case MULTI_ARG_2_HI_CMP:
25612 case MULTI_ARG_2_QI_CMP:
25614 comparison_p = true;
25617 case MULTI_ARG_2_SF_TF:
25618 case MULTI_ARG_2_DF_TF:
25619 case MULTI_ARG_2_DI_TF:
25620 case MULTI_ARG_2_SI_TF:
25621 case MULTI_ARG_2_HI_TF:
25622 case MULTI_ARG_2_QI_TF:
25628 gcc_unreachable ();
25631 if (optimize || !target
25632 || GET_MODE (target) != tmode
25633 || !insn_data[icode].operand[0].predicate (target, tmode))
25634 target = gen_reg_rtx (tmode);
25636 gcc_assert (nargs <= 4);
25638 for (i = 0; i < nargs; i++)
25640 tree arg = CALL_EXPR_ARG (exp, i);
25641 rtx op = expand_normal (arg);
25642 int adjust = (comparison_p) ? 1 : 0;
25643 enum machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
25645 if (last_arg_constant && i == nargs - 1)
25647 if (!insn_data[icode].operand[i + 1].predicate (op, mode))
25649 enum insn_code new_icode = icode;
25652 case CODE_FOR_xop_vpermil2v2df3:
25653 case CODE_FOR_xop_vpermil2v4sf3:
25654 case CODE_FOR_xop_vpermil2v4df3:
25655 case CODE_FOR_xop_vpermil2v8sf3:
25656 error ("the last argument must be a 2-bit immediate");
25657 return gen_reg_rtx (tmode);
25658 case CODE_FOR_xop_rotlv2di3:
25659 new_icode = CODE_FOR_rotlv2di3;
25661 case CODE_FOR_xop_rotlv4si3:
25662 new_icode = CODE_FOR_rotlv4si3;
25664 case CODE_FOR_xop_rotlv8hi3:
25665 new_icode = CODE_FOR_rotlv8hi3;
25667 case CODE_FOR_xop_rotlv16qi3:
25668 new_icode = CODE_FOR_rotlv16qi3;
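	      /* XOP rotate patterns want an element-width immediate, so a
		 constant count can simply be reduced modulo the element
		 size (e.g. masked with 15 for V8HI); variable counts fall
		 back to the generic rotl patterns selected above.  */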
25670 if (CONST_INT_P (op))
25672 int mask = GET_MODE_BITSIZE (GET_MODE_INNER (tmode)) - 1;
25673 op = GEN_INT (INTVAL (op) & mask);
25674 gcc_checking_assert
25675 (insn_data[icode].operand[i + 1].predicate (op, mode));
25679 gcc_checking_assert
25681 && insn_data[new_icode].operand[0].mode == tmode
25682 && insn_data[new_icode].operand[1].mode == tmode
25683 && insn_data[new_icode].operand[2].mode == mode
25684 && insn_data[new_icode].operand[0].predicate
25685 == insn_data[icode].operand[0].predicate
25686 && insn_data[new_icode].operand[1].predicate
25687 == insn_data[icode].operand[1].predicate);
25693 gcc_unreachable ();
25700 if (VECTOR_MODE_P (mode))
25701 op = safe_vector_operand (op, mode);
25703 /* If we aren't optimizing, only allow one memory operand to be generated. */
25705 if (memory_operand (op, mode))
25708 gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
25711 || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
25713 op = force_reg (mode, op);
25717 args[i].mode = mode;
25723 pat = GEN_FCN (icode) (target, args[0].op);
25728 pat = GEN_FCN (icode) (target, args[0].op, args[1].op,
25729 GEN_INT ((int)sub_code));
25730 else if (! comparison_p)
25731 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
25734 rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
25738 pat = GEN_FCN (icode) (target, cmp_op, args[0].op, args[1].op);
25743 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
25747 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op, args[3].op);
25751 gcc_unreachable ();
25761 /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
25762 insns with vec_merge. */
25765 ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
25769 tree arg0 = CALL_EXPR_ARG (exp, 0);
25770 rtx op1, op0 = expand_normal (arg0);
25771 enum machine_mode tmode = insn_data[icode].operand[0].mode;
25772 enum machine_mode mode0 = insn_data[icode].operand[1].mode;
25774 if (optimize || !target
25775 || GET_MODE (target) != tmode
25776 || !insn_data[icode].operand[0].predicate (target, tmode))
25777 target = gen_reg_rtx (tmode);
25779 if (VECTOR_MODE_P (mode0))
25780 op0 = safe_vector_operand (op0, mode0);
25782 if ((optimize && !register_operand (op0, mode0))
25783 || !insn_data[icode].operand[1].predicate (op0, mode0))
25784 op0 = copy_to_mode_reg (mode0, op0);
25787 if (!insn_data[icode].operand[2].predicate (op1, mode0))
25788 op1 = copy_to_mode_reg (mode0, op1);
25790 pat = GEN_FCN (icode) (target, op0, op1);
25797 /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
25800 ix86_expand_sse_compare (const struct builtin_description *d,
25801 tree exp, rtx target, bool swap)
25804 tree arg0 = CALL_EXPR_ARG (exp, 0);
25805 tree arg1 = CALL_EXPR_ARG (exp, 1);
25806 rtx op0 = expand_normal (arg0);
25807 rtx op1 = expand_normal (arg1);
25809 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
25810 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
25811 enum machine_mode mode1 = insn_data[d->icode].operand[2].mode;
25812 enum rtx_code comparison = d->comparison;
25814 if (VECTOR_MODE_P (mode0))
25815 op0 = safe_vector_operand (op0, mode0);
25816 if (VECTOR_MODE_P (mode1))
25817 op1 = safe_vector_operand (op1, mode1);
25819 /* Swap operands if we have a comparison that isn't available in hardware. */
25823 rtx tmp = gen_reg_rtx (mode1);
25824 emit_move_insn (tmp, op1);
25829 if (optimize || !target
25830 || GET_MODE (target) != tmode
25831 || !insn_data[d->icode].operand[0].predicate (target, tmode))
25832 target = gen_reg_rtx (tmode);
25834 if ((optimize && !register_operand (op0, mode0))
25835 || !insn_data[d->icode].operand[1].predicate (op0, mode0))
25836 op0 = copy_to_mode_reg (mode0, op0);
25837 if ((optimize && !register_operand (op1, mode1))
25838 || !insn_data[d->icode].operand[2].predicate (op1, mode1))
25839 op1 = copy_to_mode_reg (mode1, op1);
25841 op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
25842 pat = GEN_FCN (d->icode) (target, op0, op1, op2);
25849 /* Subroutine of ix86_expand_builtin to take care of comi insns. */
25852 ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
25856 tree arg0 = CALL_EXPR_ARG (exp, 0);
25857 tree arg1 = CALL_EXPR_ARG (exp, 1);
25858 rtx op0 = expand_normal (arg0);
25859 rtx op1 = expand_normal (arg1);
25860 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
25861 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
25862 enum rtx_code comparison = d->comparison;
25864 if (VECTOR_MODE_P (mode0))
25865 op0 = safe_vector_operand (op0, mode0);
25866 if (VECTOR_MODE_P (mode1))
25867 op1 = safe_vector_operand (op1, mode1);
25869 /* Swap operands if we have a comparison that isn't available in hardware. */
25871 if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
25878 target = gen_reg_rtx (SImode);
25879 emit_move_insn (target, const0_rtx);
25880 target = gen_rtx_SUBREG (QImode, target, 0);
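      /* The 0/1 result is assembled in the low byte of a zeroed SImode
	 pseudo via STRICT_LOW_PART below; SUBREG_REG then hands back the
	 whole SImode value.  */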
25882 if ((optimize && !register_operand (op0, mode0))
25883 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
25884 op0 = copy_to_mode_reg (mode0, op0);
25885 if ((optimize && !register_operand (op1, mode1))
25886 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
25887 op1 = copy_to_mode_reg (mode1, op1);
25889 pat = GEN_FCN (d->icode) (op0, op1);
25893 emit_insn (gen_rtx_SET (VOIDmode,
25894 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
25895 gen_rtx_fmt_ee (comparison, QImode,
25899 return SUBREG_REG (target);
25902 /* Subroutine of ix86_expand_args_builtin to take care of round insns. */
25905 ix86_expand_sse_round (const struct builtin_description *d, tree exp,
25909 tree arg0 = CALL_EXPR_ARG (exp, 0);
25910 rtx op1, op0 = expand_normal (arg0);
25911 enum machine_mode tmode = insn_data[d->icode].operand[0].mode;
25912 enum machine_mode mode0 = insn_data[d->icode].operand[1].mode;
25914 if (optimize || target == 0
25915 || GET_MODE (target) != tmode
25916 || !insn_data[d->icode].operand[0].predicate (target, tmode))
25917 target = gen_reg_rtx (tmode);
25919 if (VECTOR_MODE_P (mode0))
25920 op0 = safe_vector_operand (op0, mode0);
25922 if ((optimize && !register_operand (op0, mode0))
25923 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
25924 op0 = copy_to_mode_reg (mode0, op0);
25926 op1 = GEN_INT (d->comparison);
25928 pat = GEN_FCN (d->icode) (target, op0, op1);
25935 /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
25938 ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
25942 tree arg0 = CALL_EXPR_ARG (exp, 0);
25943 tree arg1 = CALL_EXPR_ARG (exp, 1);
25944 rtx op0 = expand_normal (arg0);
25945 rtx op1 = expand_normal (arg1);
25946 enum machine_mode mode0 = insn_data[d->icode].operand[0].mode;
25947 enum machine_mode mode1 = insn_data[d->icode].operand[1].mode;
25948 enum rtx_code comparison = d->comparison;
25950 if (VECTOR_MODE_P (mode0))
25951 op0 = safe_vector_operand (op0, mode0);
25952 if (VECTOR_MODE_P (mode1))
25953 op1 = safe_vector_operand (op1, mode1);
25955 target = gen_reg_rtx (SImode);
25956 emit_move_insn (target, const0_rtx);
25957 target = gen_rtx_SUBREG (QImode, target, 0);
25959 if ((optimize && !register_operand (op0, mode0))
25960 || !insn_data[d->icode].operand[0].predicate (op0, mode0))
25961 op0 = copy_to_mode_reg (mode0, op0);
25962 if ((optimize && !register_operand (op1, mode1))
25963 || !insn_data[d->icode].operand[1].predicate (op1, mode1))
25964 op1 = copy_to_mode_reg (mode1, op1);
25966 pat = GEN_FCN (d->icode) (op0, op1);
25970 emit_insn (gen_rtx_SET (VOIDmode,
25971 gen_rtx_STRICT_LOW_PART (VOIDmode, target),
25972 gen_rtx_fmt_ee (comparison, QImode,
25976 return SUBREG_REG (target);
25979 /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
25982 ix86_expand_sse_pcmpestr (const struct builtin_description *d,
25983 tree exp, rtx target)
25986 tree arg0 = CALL_EXPR_ARG (exp, 0);
25987 tree arg1 = CALL_EXPR_ARG (exp, 1);
25988 tree arg2 = CALL_EXPR_ARG (exp, 2);
25989 tree arg3 = CALL_EXPR_ARG (exp, 3);
25990 tree arg4 = CALL_EXPR_ARG (exp, 4);
25991 rtx scratch0, scratch1;
25992 rtx op0 = expand_normal (arg0);
25993 rtx op1 = expand_normal (arg1);
25994 rtx op2 = expand_normal (arg2);
25995 rtx op3 = expand_normal (arg3);
25996 rtx op4 = expand_normal (arg4);
25997 enum machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;
25999 tmode0 = insn_data[d->icode].operand[0].mode;
26000 tmode1 = insn_data[d->icode].operand[1].mode;
26001 modev2 = insn_data[d->icode].operand[2].mode;
26002 modei3 = insn_data[d->icode].operand[3].mode;
26003 modev4 = insn_data[d->icode].operand[4].mode;
26004 modei5 = insn_data[d->icode].operand[5].mode;
26005 modeimm = insn_data[d->icode].operand[6].mode;
26007 if (VECTOR_MODE_P (modev2))
26008 op0 = safe_vector_operand (op0, modev2);
26009 if (VECTOR_MODE_P (modev4))
26010 op2 = safe_vector_operand (op2, modev4);
26012 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
26013 op0 = copy_to_mode_reg (modev2, op0);
26014 if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
26015 op1 = copy_to_mode_reg (modei3, op1);
26016 if ((optimize && !register_operand (op2, modev4))
26017 || !insn_data[d->icode].operand[4].predicate (op2, modev4))
26018 op2 = copy_to_mode_reg (modev4, op2);
26019 if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
26020 op3 = copy_to_mode_reg (modei5, op3);
26022 if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
26024 error ("the fifth argument must be an 8-bit immediate");
26028 if (d->code == IX86_BUILTIN_PCMPESTRI128)
26030 if (optimize || !target
26031 || GET_MODE (target) != tmode0
26032 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
26033 target = gen_reg_rtx (tmode0);
26035 scratch1 = gen_reg_rtx (tmode1);
26037 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
26039 else if (d->code == IX86_BUILTIN_PCMPESTRM128)
26041 if (optimize || !target
26042 || GET_MODE (target) != tmode1
26043 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
26044 target = gen_reg_rtx (tmode1);
26046 scratch0 = gen_reg_rtx (tmode0);
26048 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
26052 gcc_assert (d->flag);
26054 scratch0 = gen_reg_rtx (tmode0);
26055 scratch1 = gen_reg_rtx (tmode1);
26057 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
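      /* Flag-extraction variants (pcmpestra/c/o/s/z): both vector results
	 land in scratch registers, and d->flag records, as a CC machine
	 mode, which EFLAGS condition is read back below.  */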
26067 target = gen_reg_rtx (SImode);
26068 emit_move_insn (target, const0_rtx);
26069 target = gen_rtx_SUBREG (QImode, target, 0);
26072 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
26073 gen_rtx_fmt_ee (EQ, QImode,
26074 gen_rtx_REG ((enum machine_mode) d->flag,
26077 return SUBREG_REG (target);
26084 /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
26087 ix86_expand_sse_pcmpistr (const struct builtin_description *d,
26088 tree exp, rtx target)
26091 tree arg0 = CALL_EXPR_ARG (exp, 0);
26092 tree arg1 = CALL_EXPR_ARG (exp, 1);
26093 tree arg2 = CALL_EXPR_ARG (exp, 2);
26094 rtx scratch0, scratch1;
26095 rtx op0 = expand_normal (arg0);
26096 rtx op1 = expand_normal (arg1);
26097 rtx op2 = expand_normal (arg2);
26098 enum machine_mode tmode0, tmode1, modev2, modev3, modeimm;
26100 tmode0 = insn_data[d->icode].operand[0].mode;
26101 tmode1 = insn_data[d->icode].operand[1].mode;
26102 modev2 = insn_data[d->icode].operand[2].mode;
26103 modev3 = insn_data[d->icode].operand[3].mode;
26104 modeimm = insn_data[d->icode].operand[4].mode;
26106 if (VECTOR_MODE_P (modev2))
26107 op0 = safe_vector_operand (op0, modev2);
26108 if (VECTOR_MODE_P (modev3))
26109 op1 = safe_vector_operand (op1, modev3);
26111 if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
26112 op0 = copy_to_mode_reg (modev2, op0);
26113 if ((optimize && !register_operand (op1, modev3))
26114 || !insn_data[d->icode].operand[3].predicate (op1, modev3))
26115 op1 = copy_to_mode_reg (modev3, op1);
26117 if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
26119 error ("the third argument must be an 8-bit immediate");
26123 if (d->code == IX86_BUILTIN_PCMPISTRI128)
26125 if (optimize || !target
26126 || GET_MODE (target) != tmode0
26127 || !insn_data[d->icode].operand[0].predicate (target, tmode0))
26128 target = gen_reg_rtx (tmode0);
26130 scratch1 = gen_reg_rtx (tmode1);
26132 pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
26134 else if (d->code == IX86_BUILTIN_PCMPISTRM128)
26136 if (optimize || !target
26137 || GET_MODE (target) != tmode1
26138 || !insn_data[d->icode].operand[1].predicate (target, tmode1))
26139 target = gen_reg_rtx (tmode1);
26141 scratch0 = gen_reg_rtx (tmode0);
26143 pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
26147 gcc_assert (d->flag);
26149 scratch0 = gen_reg_rtx (tmode0);
26150 scratch1 = gen_reg_rtx (tmode1);
26152 pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
26162 target = gen_reg_rtx (SImode);
26163 emit_move_insn (target, const0_rtx);
26164 target = gen_rtx_SUBREG (QImode, target, 0);
26167 (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target),
26168 gen_rtx_fmt_ee (EQ, QImode,
26169 gen_rtx_REG ((enum machine_mode) d->flag,
26172 return SUBREG_REG (target);
26178 /* Subroutine of ix86_expand_builtin to take care of insns with
26179 variable number of operands. */
26182 ix86_expand_args_builtin (const struct builtin_description *d,
26183 tree exp, rtx target)
26185 rtx pat, real_target;
26186 unsigned int i, nargs;
26187 unsigned int nargs_constant = 0;
26188 int num_memory = 0;
26192 enum machine_mode mode;
26194 bool last_arg_count = false;
26195 enum insn_code icode = d->icode;
26196 const struct insn_data_d *insn_p = &insn_data[icode];
26197 enum machine_mode tmode = insn_p->operand[0].mode;
26198 enum machine_mode rmode = VOIDmode;
26200 enum rtx_code comparison = d->comparison;
26202 switch ((enum ix86_builtin_func_type) d->flag)
26204 case V2DF_FTYPE_V2DF_ROUND:
26205 case V4DF_FTYPE_V4DF_ROUND:
26206 case V4SF_FTYPE_V4SF_ROUND:
26207 case V8SF_FTYPE_V8SF_ROUND:
26208 return ix86_expand_sse_round (d, exp, target);
26209 case INT_FTYPE_V8SF_V8SF_PTEST:
26210 case INT_FTYPE_V4DI_V4DI_PTEST:
26211 case INT_FTYPE_V4DF_V4DF_PTEST:
26212 case INT_FTYPE_V4SF_V4SF_PTEST:
26213 case INT_FTYPE_V2DI_V2DI_PTEST:
26214 case INT_FTYPE_V2DF_V2DF_PTEST:
26215 return ix86_expand_sse_ptest (d, exp, target);
26216 case FLOAT128_FTYPE_FLOAT128:
26217 case FLOAT_FTYPE_FLOAT:
26218 case INT_FTYPE_INT:
26219 case UINT64_FTYPE_INT:
26220 case UINT16_FTYPE_UINT16:
26221 case INT64_FTYPE_INT64:
26222 case INT64_FTYPE_V4SF:
26223 case INT64_FTYPE_V2DF:
26224 case INT_FTYPE_V16QI:
26225 case INT_FTYPE_V8QI:
26226 case INT_FTYPE_V8SF:
26227 case INT_FTYPE_V4DF:
26228 case INT_FTYPE_V4SF:
26229 case INT_FTYPE_V2DF:
26230 case V16QI_FTYPE_V16QI:
26231 case V8SI_FTYPE_V8SF:
26232 case V8SI_FTYPE_V4SI:
26233 case V8HI_FTYPE_V8HI:
26234 case V8HI_FTYPE_V16QI:
26235 case V8QI_FTYPE_V8QI:
26236 case V8SF_FTYPE_V8SF:
26237 case V8SF_FTYPE_V8SI:
26238 case V8SF_FTYPE_V4SF:
26239 case V8SF_FTYPE_V8HI:
26240 case V4SI_FTYPE_V4SI:
26241 case V4SI_FTYPE_V16QI:
26242 case V4SI_FTYPE_V4SF:
26243 case V4SI_FTYPE_V8SI:
26244 case V4SI_FTYPE_V8HI:
26245 case V4SI_FTYPE_V4DF:
26246 case V4SI_FTYPE_V2DF:
26247 case V4HI_FTYPE_V4HI:
26248 case V4DF_FTYPE_V4DF:
26249 case V4DF_FTYPE_V4SI:
26250 case V4DF_FTYPE_V4SF:
26251 case V4DF_FTYPE_V2DF:
26252 case V4SF_FTYPE_V4SF:
26253 case V4SF_FTYPE_V4SI:
26254 case V4SF_FTYPE_V8SF:
26255 case V4SF_FTYPE_V4DF:
26256 case V4SF_FTYPE_V8HI:
26257 case V4SF_FTYPE_V2DF:
26258 case V2DI_FTYPE_V2DI:
26259 case V2DI_FTYPE_V16QI:
26260 case V2DI_FTYPE_V8HI:
26261 case V2DI_FTYPE_V4SI:
26262 case V2DF_FTYPE_V2DF:
26263 case V2DF_FTYPE_V4SI:
26264 case V2DF_FTYPE_V4DF:
26265 case V2DF_FTYPE_V4SF:
26266 case V2DF_FTYPE_V2SI:
26267 case V2SI_FTYPE_V2SI:
26268 case V2SI_FTYPE_V4SF:
26269 case V2SI_FTYPE_V2SF:
26270 case V2SI_FTYPE_V2DF:
26271 case V2SF_FTYPE_V2SF:
26272 case V2SF_FTYPE_V2SI:
26275 case V4SF_FTYPE_V4SF_VEC_MERGE:
26276 case V2DF_FTYPE_V2DF_VEC_MERGE:
26277 return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
26278 case FLOAT128_FTYPE_FLOAT128_FLOAT128:
26279 case V16QI_FTYPE_V16QI_V16QI:
26280 case V16QI_FTYPE_V8HI_V8HI:
26281 case V8QI_FTYPE_V8QI_V8QI:
26282 case V8QI_FTYPE_V4HI_V4HI:
26283 case V8HI_FTYPE_V8HI_V8HI:
26284 case V8HI_FTYPE_V16QI_V16QI:
26285 case V8HI_FTYPE_V4SI_V4SI:
26286 case V8SF_FTYPE_V8SF_V8SF:
26287 case V8SF_FTYPE_V8SF_V8SI:
26288 case V4SI_FTYPE_V4SI_V4SI:
26289 case V4SI_FTYPE_V8HI_V8HI:
26290 case V4SI_FTYPE_V4SF_V4SF:
26291 case V4SI_FTYPE_V2DF_V2DF:
26292 case V4HI_FTYPE_V4HI_V4HI:
26293 case V4HI_FTYPE_V8QI_V8QI:
26294 case V4HI_FTYPE_V2SI_V2SI:
26295 case V4DF_FTYPE_V4DF_V4DF:
26296 case V4DF_FTYPE_V4DF_V4DI:
26297 case V4SF_FTYPE_V4SF_V4SF:
26298 case V4SF_FTYPE_V4SF_V4SI:
26299 case V4SF_FTYPE_V4SF_V2SI:
26300 case V4SF_FTYPE_V4SF_V2DF:
26301 case V4SF_FTYPE_V4SF_DI:
26302 case V4SF_FTYPE_V4SF_SI:
26303 case V2DI_FTYPE_V2DI_V2DI:
26304 case V2DI_FTYPE_V16QI_V16QI:
26305 case V2DI_FTYPE_V4SI_V4SI:
26306 case V2DI_FTYPE_V2DI_V16QI:
26307 case V2DI_FTYPE_V2DF_V2DF:
26308 case V2SI_FTYPE_V2SI_V2SI:
26309 case V2SI_FTYPE_V4HI_V4HI:
26310 case V2SI_FTYPE_V2SF_V2SF:
26311 case V2DF_FTYPE_V2DF_V2DF:
26312 case V2DF_FTYPE_V2DF_V4SF:
26313 case V2DF_FTYPE_V2DF_V2DI:
26314 case V2DF_FTYPE_V2DF_DI:
26315 case V2DF_FTYPE_V2DF_SI:
26316 case V2SF_FTYPE_V2SF_V2SF:
26317 case V1DI_FTYPE_V1DI_V1DI:
26318 case V1DI_FTYPE_V8QI_V8QI:
26319 case V1DI_FTYPE_V2SI_V2SI:
26320 if (comparison == UNKNOWN)
26321 return ix86_expand_binop_builtin (icode, exp, target);
26324 case V4SF_FTYPE_V4SF_V4SF_SWAP:
26325 case V2DF_FTYPE_V2DF_V2DF_SWAP:
26326 gcc_assert (comparison != UNKNOWN);
26330 case V8HI_FTYPE_V8HI_V8HI_COUNT:
26331 case V8HI_FTYPE_V8HI_SI_COUNT:
26332 case V4SI_FTYPE_V4SI_V4SI_COUNT:
26333 case V4SI_FTYPE_V4SI_SI_COUNT:
26334 case V4HI_FTYPE_V4HI_V4HI_COUNT:
26335 case V4HI_FTYPE_V4HI_SI_COUNT:
26336 case V2DI_FTYPE_V2DI_V2DI_COUNT:
26337 case V2DI_FTYPE_V2DI_SI_COUNT:
26338 case V2SI_FTYPE_V2SI_V2SI_COUNT:
26339 case V2SI_FTYPE_V2SI_SI_COUNT:
26340 case V1DI_FTYPE_V1DI_V1DI_COUNT:
26341 case V1DI_FTYPE_V1DI_SI_COUNT:
26343 last_arg_count = true;
26345 case UINT64_FTYPE_UINT64_UINT64:
26346 case UINT_FTYPE_UINT_UINT:
26347 case UINT_FTYPE_UINT_USHORT:
26348 case UINT_FTYPE_UINT_UCHAR:
26349 case UINT16_FTYPE_UINT16_INT:
26350 case UINT8_FTYPE_UINT8_INT:
26353 case V2DI_FTYPE_V2DI_INT_CONVERT:
26356 nargs_constant = 1;
26358 case V8HI_FTYPE_V8HI_INT:
26359 case V8HI_FTYPE_V8SF_INT:
26360 case V8HI_FTYPE_V4SF_INT:
26361 case V8SF_FTYPE_V8SF_INT:
26362 case V4SI_FTYPE_V4SI_INT:
26363 case V4SI_FTYPE_V8SI_INT:
26364 case V4HI_FTYPE_V4HI_INT:
26365 case V4DF_FTYPE_V4DF_INT:
26366 case V4SF_FTYPE_V4SF_INT:
26367 case V4SF_FTYPE_V8SF_INT:
26368 case V2DI_FTYPE_V2DI_INT:
26369 case V2DF_FTYPE_V2DF_INT:
26370 case V2DF_FTYPE_V4DF_INT:
26372 nargs_constant = 1;
26374 case V16QI_FTYPE_V16QI_V16QI_V16QI:
26375 case V8SF_FTYPE_V8SF_V8SF_V8SF:
26376 case V4DF_FTYPE_V4DF_V4DF_V4DF:
26377 case V4SF_FTYPE_V4SF_V4SF_V4SF:
26378 case V2DF_FTYPE_V2DF_V2DF_V2DF:
26381 case V16QI_FTYPE_V16QI_V16QI_INT:
26382 case V8HI_FTYPE_V8HI_V8HI_INT:
26383 case V8SI_FTYPE_V8SI_V8SI_INT:
26384 case V8SI_FTYPE_V8SI_V4SI_INT:
26385 case V8SF_FTYPE_V8SF_V8SF_INT:
26386 case V8SF_FTYPE_V8SF_V4SF_INT:
26387 case V4SI_FTYPE_V4SI_V4SI_INT:
26388 case V4DF_FTYPE_V4DF_V4DF_INT:
26389 case V4DF_FTYPE_V4DF_V2DF_INT:
26390 case V4SF_FTYPE_V4SF_V4SF_INT:
26391 case V2DI_FTYPE_V2DI_V2DI_INT:
26392 case V2DF_FTYPE_V2DF_V2DF_INT:
26394 nargs_constant = 1;
26396 case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
26399 nargs_constant = 1;
26401 case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
26404 nargs_constant = 1;
26406 case V2DI_FTYPE_V2DI_UINT_UINT:
26408 nargs_constant = 2;
26410 case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
26411 case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
26412 case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
26413 case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
26415 nargs_constant = 1;
26417 case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
26419 nargs_constant = 2;
26422 gcc_unreachable ();
26425 gcc_assert (nargs <= ARRAY_SIZE (args));
26427 if (comparison != UNKNOWN)
26429 gcc_assert (nargs == 2);
26430 return ix86_expand_sse_compare (d, exp, target, swap);
26433 if (rmode == VOIDmode || rmode == tmode)
26437 || GET_MODE (target) != tmode
26438 || !insn_p->operand[0].predicate (target, tmode))
26439 target = gen_reg_rtx (tmode);
26440 real_target = target;
26444 target = gen_reg_rtx (rmode);
26445 real_target = simplify_gen_subreg (tmode, target, rmode, 0);
26448 for (i = 0; i < nargs; i++)
26450 tree arg = CALL_EXPR_ARG (exp, i);
26451 rtx op = expand_normal (arg);
26452 enum machine_mode mode = insn_p->operand[i + 1].mode;
26453 bool match = insn_p->operand[i + 1].predicate (op, mode);
26455 if (last_arg_count && (i + 1) == nargs)
26457 /* SIMD shift insns take either an 8-bit immediate or
26458 register as count. But builtin functions take int as
26459 count. If the count doesn't match, we put it in a register. */
26462 op = simplify_gen_subreg (SImode, op, GET_MODE (op), 0);
26463 if (!insn_p->operand[i + 1].predicate (op, mode))
26464 op = copy_to_reg (op);
26467 else if ((nargs - i) <= nargs_constant)
26472 case CODE_FOR_sse4_1_roundpd:
26473 case CODE_FOR_sse4_1_roundps:
26474 case CODE_FOR_sse4_1_roundsd:
26475 case CODE_FOR_sse4_1_roundss:
26476 case CODE_FOR_sse4_1_blendps:
26477 case CODE_FOR_avx_blendpd256:
26478 case CODE_FOR_avx_vpermilv4df:
26479 case CODE_FOR_avx_roundpd256:
26480 case CODE_FOR_avx_roundps256:
26481 error ("the last argument must be a 4-bit immediate");
26484 case CODE_FOR_sse4_1_blendpd:
26485 case CODE_FOR_avx_vpermilv2df:
26486 case CODE_FOR_xop_vpermil2v2df3:
26487 case CODE_FOR_xop_vpermil2v4sf3:
26488 case CODE_FOR_xop_vpermil2v4df3:
26489 case CODE_FOR_xop_vpermil2v8sf3:
26490 error ("the last argument must be a 2-bit immediate");
26493 case CODE_FOR_avx_vextractf128v4df:
26494 case CODE_FOR_avx_vextractf128v8sf:
26495 case CODE_FOR_avx_vextractf128v8si:
26496 case CODE_FOR_avx_vinsertf128v4df:
26497 case CODE_FOR_avx_vinsertf128v8sf:
26498 case CODE_FOR_avx_vinsertf128v8si:
26499 error ("the last argument must be a 1-bit immediate");
26502 case CODE_FOR_avx_vmcmpv2df3:
26503 case CODE_FOR_avx_vmcmpv4sf3:
26504 case CODE_FOR_avx_cmpv2df3:
26505 case CODE_FOR_avx_cmpv4sf3:
26506 case CODE_FOR_avx_cmpv4df3:
26507 case CODE_FOR_avx_cmpv8sf3:
26508 error ("the last argument must be a 5-bit immediate");
26512 switch (nargs_constant)
26515 if ((nargs - i) == nargs_constant)
26517 error ("the next to last argument must be an 8-bit immediate");
26521 error ("the last argument must be an 8-bit immediate");
26524 gcc_unreachable ();
26531 if (VECTOR_MODE_P (mode))
26532 op = safe_vector_operand (op, mode);
26534 /* If we aren't optimizing, only allow one memory operand to be generated. */
26536 if (memory_operand (op, mode))
26539 if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
26541 if (optimize || !match || num_memory > 1)
26542 op = copy_to_mode_reg (mode, op);
26546 op = copy_to_reg (op);
26547 op = simplify_gen_subreg (mode, op, GET_MODE (op), 0);
26552 args[i].mode = mode;
26558 pat = GEN_FCN (icode) (real_target, args[0].op);
26561 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op);
26564 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
26568 pat = GEN_FCN (icode) (real_target, args[0].op, args[1].op,
26569 args[2].op, args[3].op);
26572 gcc_unreachable ();
26582 /* Subroutine of ix86_expand_builtin to take care of special insns
26583 with variable number of operands. */
26586 ix86_expand_special_args_builtin (const struct builtin_description *d,
26587 tree exp, rtx target)
26591 unsigned int i, nargs, arg_adjust, memory;
26595 enum machine_mode mode;
26597 enum insn_code icode = d->icode;
26598 bool last_arg_constant = false;
26599 const struct insn_data_d *insn_p = &insn_data[icode];
26600 enum machine_mode tmode = insn_p->operand[0].mode;
26601 enum { load, store } klass;
26603 switch ((enum ix86_builtin_func_type) d->flag)
26605 case VOID_FTYPE_VOID:
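      /* vzeroupper gets the vzeroupper_intrinsic marker as its operand
	 so the later vzeroupper optimization pass can recognize, move,
	 or delete it.  */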
26606 if (icode == CODE_FOR_avx_vzeroupper)
26607 target = GEN_INT (vzeroupper_intrinsic);
26608 emit_insn (GEN_FCN (icode) (target));
26610 case VOID_FTYPE_UINT64:
26611 case VOID_FTYPE_UNSIGNED:
26617 case UINT64_FTYPE_VOID:
26618 case UNSIGNED_FTYPE_VOID:
26623 case UINT64_FTYPE_PUNSIGNED:
26624 case V2DI_FTYPE_PV2DI:
26625 case V32QI_FTYPE_PCCHAR:
26626 case V16QI_FTYPE_PCCHAR:
26627 case V8SF_FTYPE_PCV4SF:
26628 case V8SF_FTYPE_PCFLOAT:
26629 case V4SF_FTYPE_PCFLOAT:
26630 case V4DF_FTYPE_PCV2DF:
26631 case V4DF_FTYPE_PCDOUBLE:
26632 case V2DF_FTYPE_PCDOUBLE:
26633 case VOID_FTYPE_PVOID:
26638 case VOID_FTYPE_PV2SF_V4SF:
26639 case VOID_FTYPE_PV4DI_V4DI:
26640 case VOID_FTYPE_PV2DI_V2DI:
26641 case VOID_FTYPE_PCHAR_V32QI:
26642 case VOID_FTYPE_PCHAR_V16QI:
26643 case VOID_FTYPE_PFLOAT_V8SF:
26644 case VOID_FTYPE_PFLOAT_V4SF:
26645 case VOID_FTYPE_PDOUBLE_V4DF:
26646 case VOID_FTYPE_PDOUBLE_V2DF:
26647 case VOID_FTYPE_PULONGLONG_ULONGLONG:
26648 case VOID_FTYPE_PINT_INT:
26651 /* Reserve memory operand for target. */
26652 memory = ARRAY_SIZE (args);
26654 case V4SF_FTYPE_V4SF_PCV2SF:
26655 case V2DF_FTYPE_V2DF_PCDOUBLE:
26660 case V8SF_FTYPE_PCV8SF_V8SI:
26661 case V4DF_FTYPE_PCV4DF_V4DI:
26662 case V4SF_FTYPE_PCV4SF_V4SI:
26663 case V2DF_FTYPE_PCV2DF_V2DI:
26668 case VOID_FTYPE_PV8SF_V8SI_V8SF:
26669 case VOID_FTYPE_PV4DF_V4DI_V4DF:
26670 case VOID_FTYPE_PV4SF_V4SI_V4SF:
26671 case VOID_FTYPE_PV2DF_V2DI_V2DF:
26674 /* Reserve memory operand for target. */
26675 memory = ARRAY_SIZE (args);
26677 case VOID_FTYPE_UINT_UINT_UINT:
26678 case VOID_FTYPE_UINT64_UINT_UINT:
26679 case UCHAR_FTYPE_UINT_UINT_UINT:
26680 case UCHAR_FTYPE_UINT64_UINT_UINT:
26683 memory = ARRAY_SIZE (args);
26684 last_arg_constant = true;
26687 gcc_unreachable ();
26690 gcc_assert (nargs <= ARRAY_SIZE (args));
26692 if (klass == store)
26694 arg = CALL_EXPR_ARG (exp, 0);
26695 op = expand_normal (arg);
26696 gcc_assert (target == 0);
26698 target = gen_rtx_MEM (tmode, copy_to_mode_reg (Pmode, op));
26700 target = force_reg (tmode, op);
26708 || GET_MODE (target) != tmode
26709 || !insn_p->operand[0].predicate (target, tmode))
26710 target = gen_reg_rtx (tmode);
26713 for (i = 0; i < nargs; i++)
26715 enum machine_mode mode = insn_p->operand[i + 1].mode;
26718 arg = CALL_EXPR_ARG (exp, i + arg_adjust);
26719 op = expand_normal (arg);
26720 match = insn_p->operand[i + 1].predicate (op, mode);
26722 if (last_arg_constant && (i + 1) == nargs)
26726 if (icode == CODE_FOR_lwp_lwpvalsi3
26727 || icode == CODE_FOR_lwp_lwpinssi3
26728 || icode == CODE_FOR_lwp_lwpvaldi3
26729 || icode == CODE_FOR_lwp_lwpinsdi3)
26730 error ("the last argument must be a 32-bit immediate");
26732 error ("the last argument must be an 8-bit immediate");
26740 /* This must be the memory operand. */
26741 op = gen_rtx_MEM (mode, copy_to_mode_reg (Pmode, op));
26742 gcc_assert (GET_MODE (op) == mode
26743 || GET_MODE (op) == VOIDmode);
26747 /* This must be a register. */
26748 if (VECTOR_MODE_P (mode))
26749 op = safe_vector_operand (op, mode);
26751 gcc_assert (GET_MODE (op) == mode
26752 || GET_MODE (op) == VOIDmode);
26753 op = copy_to_mode_reg (mode, op);
26758 args[i].mode = mode;
26764 pat = GEN_FCN (icode) (target);
26767 pat = GEN_FCN (icode) (target, args[0].op);
26770 pat = GEN_FCN (icode) (target, args[0].op, args[1].op);
26773 pat = GEN_FCN (icode) (target, args[0].op, args[1].op, args[2].op);
26776 gcc_unreachable ();
26782 return klass == store ? 0 : target;
26785 /* Return the integer constant in ARG. Constrain it to be in the range
26786 of the subparts of VEC_TYPE; issue an error if not. */
26789 get_element_number (tree vec_type, tree arg)
26791 unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
26793 if (!host_integerp (arg, 1)
26794 || (elt = tree_low_cst (arg, 1), elt > max))
26796 error ("selector must be an integer constant in the range 0..%wi", max);
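/* For example, a V4SF vector type has TYPE_VECTOR_SUBPARTS equal to 4,
   so the only selectors accepted here are the constants 0 through 3.  */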
26803 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
26804 ix86_expand_vector_init. We DO have language-level syntax for this, in
26805 the form of (type){ init-list }. Except that since we can't place emms
26806 instructions from inside the compiler, we can't allow the use of MMX
26807 registers unless the user explicitly asks for it. So we do *not* define
26808 vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
26809 we have builtins invoked by mmintrin.h that give us license to emit
26810 these sorts of instructions. */
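/* For instance (an illustrative note, not from this file), mmintrin.h
   implements _mm_set_pi32 on top of __builtin_ia32_vec_init_v2si, so

     __m64 v = _mm_set_pi32 (hi, lo);

   ends up in ix86_expand_vec_init_builtin below.  */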
26813 ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
26815 enum machine_mode tmode = TYPE_MODE (type);
26816 enum machine_mode inner_mode = GET_MODE_INNER (tmode);
26817 int i, n_elt = GET_MODE_NUNITS (tmode);
26818 rtvec v = rtvec_alloc (n_elt);
26820 gcc_assert (VECTOR_MODE_P (tmode));
26821 gcc_assert (call_expr_nargs (exp) == n_elt);
26823 for (i = 0; i < n_elt; ++i)
26825 rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
26826 RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
26829 if (!target || !register_operand (target, tmode))
26830 target = gen_reg_rtx (tmode);
26832 ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
26836 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
26837 ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
26838 had a language-level syntax for referencing vector elements. */
26841 ix86_expand_vec_ext_builtin (tree exp, rtx target)
26843 enum machine_mode tmode, mode0;
26848 arg0 = CALL_EXPR_ARG (exp, 0);
26849 arg1 = CALL_EXPR_ARG (exp, 1);
26851 op0 = expand_normal (arg0);
26852 elt = get_element_number (TREE_TYPE (arg0), arg1);
26854 tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
26855 mode0 = TYPE_MODE (TREE_TYPE (arg0));
26856 gcc_assert (VECTOR_MODE_P (mode0));
26858 op0 = force_reg (mode0, op0);
26860 if (optimize || !target || !register_operand (target, tmode))
26861 target = gen_reg_rtx (tmode);
26863 ix86_expand_vector_extract (true, target, op0, elt);
26868 /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
26869 ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
26870 a language-level syntax for referencing vector elements. */
26873 ix86_expand_vec_set_builtin (tree exp)
26875 enum machine_mode tmode, mode1;
26876 tree arg0, arg1, arg2;
26878 rtx op0, op1, target;
26880 arg0 = CALL_EXPR_ARG (exp, 0);
26881 arg1 = CALL_EXPR_ARG (exp, 1);
26882 arg2 = CALL_EXPR_ARG (exp, 2);
26884 tmode = TYPE_MODE (TREE_TYPE (arg0));
26885 mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
26886 gcc_assert (VECTOR_MODE_P (tmode));
26888 op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
26889 op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
26890 elt = get_element_number (TREE_TYPE (arg0), arg2);
26892 if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
26893 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
26895 op0 = force_reg (tmode, op0);
26896 op1 = force_reg (mode1, op1);
26898 /* OP0 is the source of these builtin functions and shouldn't be
26899 modified. Create a copy, use it and return it as target. */
26900 target = gen_reg_rtx (tmode);
26901 emit_move_insn (target, op0);
26902 ix86_expand_vector_set (true, target, op1, elt);
26907 /* Expand an expression EXP that calls a built-in function,
26908 with result going to TARGET if that's convenient
26909 (and in mode MODE if that's convenient).
26910 SUBTARGET may be used as the target for computing one of EXP's operands.
26911 IGNORE is nonzero if the value is to be ignored. */
26914 ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
26915 enum machine_mode mode ATTRIBUTE_UNUSED,
26916 int ignore ATTRIBUTE_UNUSED)
26918 const struct builtin_description *d;
26920 enum insn_code icode;
26921 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
26922 tree arg0, arg1, arg2;
26923 rtx op0, op1, op2, pat;
26924 enum machine_mode mode0, mode1, mode2;
26925 unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
26927 /* Determine whether the builtin function is available under the current ISA.
26928 Originally the builtin was not created if it wasn't applicable to the
26929 current ISA based on the command line switches. With function specific
26930 options, we need to check in the context of the function making the call
26931 whether it is supported. */
26932 if (ix86_builtins_isa[fcode].isa
26933 && !(ix86_builtins_isa[fcode].isa & ix86_isa_flags))
26935 char *opts = ix86_target_string (ix86_builtins_isa[fcode].isa, 0, NULL,
26936 NULL, (enum fpmath_unit) 0, false);
26939 error ("%qE needs unknown isa option", fndecl);
26942 gcc_assert (opts != NULL);
26943 error ("%qE needs isa option %s", fndecl, opts);
26951 case IX86_BUILTIN_MASKMOVQ:
26952 case IX86_BUILTIN_MASKMOVDQU:
26953 icode = (fcode == IX86_BUILTIN_MASKMOVQ
26954 ? CODE_FOR_mmx_maskmovq
26955 : CODE_FOR_sse2_maskmovdqu);
26956 /* Note the arg order is different from the operand order. */
26957 arg1 = CALL_EXPR_ARG (exp, 0);
26958 arg2 = CALL_EXPR_ARG (exp, 1);
26959 arg0 = CALL_EXPR_ARG (exp, 2);
26960 op0 = expand_normal (arg0);
26961 op1 = expand_normal (arg1);
26962 op2 = expand_normal (arg2);
26963 mode0 = insn_data[icode].operand[0].mode;
26964 mode1 = insn_data[icode].operand[1].mode;
26965 mode2 = insn_data[icode].operand[2].mode;
26967 op0 = force_reg (Pmode, op0);
26968 op0 = gen_rtx_MEM (mode1, op0);
26970 if (!insn_data[icode].operand[0].predicate (op0, mode0))
26971 op0 = copy_to_mode_reg (mode0, op0);
26972 if (!insn_data[icode].operand[1].predicate (op1, mode1))
26973 op1 = copy_to_mode_reg (mode1, op1);
26974 if (!insn_data[icode].operand[2].predicate (op2, mode2))
26975 op2 = copy_to_mode_reg (mode2, op2);
26976 pat = GEN_FCN (icode) (op0, op1, op2);
26982 case IX86_BUILTIN_LDMXCSR:
26983 op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
26984 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
26985 emit_move_insn (target, op0);
26986 emit_insn (gen_sse_ldmxcsr (target));
26989 case IX86_BUILTIN_STMXCSR:
26990 target = assign_386_stack_local (SImode, SLOT_VIRTUAL);
26991 emit_insn (gen_sse_stmxcsr (target));
26992 return copy_to_mode_reg (SImode, target);
26994 case IX86_BUILTIN_CLFLUSH:
26995 arg0 = CALL_EXPR_ARG (exp, 0);
26996 op0 = expand_normal (arg0);
26997 icode = CODE_FOR_sse2_clflush;
26998 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
26999 op0 = copy_to_mode_reg (Pmode, op0);
27001 emit_insn (gen_sse2_clflush (op0));
27004 case IX86_BUILTIN_MONITOR:
27005 arg0 = CALL_EXPR_ARG (exp, 0);
27006 arg1 = CALL_EXPR_ARG (exp, 1);
27007 arg2 = CALL_EXPR_ARG (exp, 2);
27008 op0 = expand_normal (arg0);
27009 op1 = expand_normal (arg1);
27010 op2 = expand_normal (arg2);
27012 op0 = copy_to_mode_reg (Pmode, op0);
27014 op1 = copy_to_mode_reg (SImode, op1);
27016 op2 = copy_to_mode_reg (SImode, op2);
27017 emit_insn (ix86_gen_monitor (op0, op1, op2));
27020 case IX86_BUILTIN_MWAIT:
27021 arg0 = CALL_EXPR_ARG (exp, 0);
27022 arg1 = CALL_EXPR_ARG (exp, 1);
27023 op0 = expand_normal (arg0);
27024 op1 = expand_normal (arg1);
27026 op0 = copy_to_mode_reg (SImode, op0);
27028 op1 = copy_to_mode_reg (SImode, op1);
27029 emit_insn (gen_sse3_mwait (op0, op1));
27032 case IX86_BUILTIN_VEC_INIT_V2SI:
27033 case IX86_BUILTIN_VEC_INIT_V4HI:
27034 case IX86_BUILTIN_VEC_INIT_V8QI:
27035 return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
27037 case IX86_BUILTIN_VEC_EXT_V2DF:
27038 case IX86_BUILTIN_VEC_EXT_V2DI:
27039 case IX86_BUILTIN_VEC_EXT_V4SF:
27040 case IX86_BUILTIN_VEC_EXT_V4SI:
27041 case IX86_BUILTIN_VEC_EXT_V8HI:
27042 case IX86_BUILTIN_VEC_EXT_V2SI:
27043 case IX86_BUILTIN_VEC_EXT_V4HI:
27044 case IX86_BUILTIN_VEC_EXT_V16QI:
27045 return ix86_expand_vec_ext_builtin (exp, target);
27047 case IX86_BUILTIN_VEC_SET_V2DI:
27048 case IX86_BUILTIN_VEC_SET_V4SF:
27049 case IX86_BUILTIN_VEC_SET_V4SI:
27050 case IX86_BUILTIN_VEC_SET_V8HI:
27051 case IX86_BUILTIN_VEC_SET_V4HI:
27052 case IX86_BUILTIN_VEC_SET_V16QI:
27053 return ix86_expand_vec_set_builtin (exp);
27055 case IX86_BUILTIN_VEC_PERM_V2DF:
27056 case IX86_BUILTIN_VEC_PERM_V4SF:
27057 case IX86_BUILTIN_VEC_PERM_V2DI:
27058 case IX86_BUILTIN_VEC_PERM_V4SI:
27059 case IX86_BUILTIN_VEC_PERM_V8HI:
27060 case IX86_BUILTIN_VEC_PERM_V16QI:
27061 case IX86_BUILTIN_VEC_PERM_V2DI_U:
27062 case IX86_BUILTIN_VEC_PERM_V4SI_U:
27063 case IX86_BUILTIN_VEC_PERM_V8HI_U:
27064 case IX86_BUILTIN_VEC_PERM_V16QI_U:
27065 case IX86_BUILTIN_VEC_PERM_V4DF:
27066 case IX86_BUILTIN_VEC_PERM_V8SF:
27067 return ix86_expand_vec_perm_builtin (exp);
27069 case IX86_BUILTIN_INFQ:
27070 case IX86_BUILTIN_HUGE_VALQ:
27072 REAL_VALUE_TYPE inf;
27076 tmp = CONST_DOUBLE_FROM_REAL_VALUE (inf, mode);
27078 tmp = validize_mem (force_const_mem (mode, tmp));
27081 target = gen_reg_rtx (mode);
27083 emit_move_insn (target, tmp);
27087 case IX86_BUILTIN_LLWPCB:
27088 arg0 = CALL_EXPR_ARG (exp, 0);
27089 op0 = expand_normal (arg0);
27090 icode = CODE_FOR_lwp_llwpcb;
27091 if (!insn_data[icode].operand[0].predicate (op0, Pmode))
27092 op0 = copy_to_mode_reg (Pmode, op0);
27093 emit_insn (gen_lwp_llwpcb (op0));
27096 case IX86_BUILTIN_SLWPCB:
27097 icode = CODE_FOR_lwp_slwpcb;
27099 || !insn_data[icode].operand[0].predicate (target, Pmode))
27100 target = gen_reg_rtx (Pmode);
27101 emit_insn (gen_lwp_slwpcb (target));
27104 case IX86_BUILTIN_BEXTRI32:
27105 case IX86_BUILTIN_BEXTRI64:
27106 arg0 = CALL_EXPR_ARG (exp, 0);
27107 arg1 = CALL_EXPR_ARG (exp, 1);
27108 op0 = expand_normal (arg0);
27109 op1 = expand_normal (arg1);
27110 icode = (fcode == IX86_BUILTIN_BEXTRI32
27111 ? CODE_FOR_tbm_bextri_si
27112 : CODE_FOR_tbm_bextri_di);
27113 if (!CONST_INT_P (op1))
27115 error ("last argument must be an immediate");
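      /* The BEXTRI control word packs the field description as
	 bits [7:0] = starting bit index and bits [15:8] = field length;
	 e.g. 0x0804 extracts 8 bits beginning at bit 4.  */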
27120 unsigned char length = (INTVAL (op1) >> 8) & 0xFF;
27121 unsigned char lsb_index = INTVAL (op1) & 0xFF;
27122 op1 = GEN_INT (length);
27123 op2 = GEN_INT (lsb_index);
27124 pat = GEN_FCN (icode) (target, op0, op1, op2);
27130 case IX86_BUILTIN_RDRAND16_STEP:
27131 icode = CODE_FOR_rdrandhi_1;
27135 case IX86_BUILTIN_RDRAND32_STEP:
27136 icode = CODE_FOR_rdrandsi_1;
27140 case IX86_BUILTIN_RDRAND64_STEP:
27141 icode = CODE_FOR_rdranddi_1;
27145 op0 = gen_reg_rtx (mode0);
27146 emit_insn (GEN_FCN (icode) (op0));
27148 arg0 = CALL_EXPR_ARG (exp, 0);
27149 op1 = expand_normal (arg0);
27150 if (!address_operand (op1, VOIDmode))
27151 op1 = copy_addr_to_reg (op1);
27152 emit_move_insn (gen_rtx_MEM (mode0, op1), op0);
27154 op1 = gen_reg_rtx (SImode);
27155 emit_move_insn (op1, CONST1_RTX (SImode));
27157 /* Emit SImode conditional move: RDRAND sets CF on success and zeroes the destination on failure, so select the constant 1 when the carry is set and the zeroed result otherwise. */
27158 if (mode0 == HImode)
27160 op2 = gen_reg_rtx (SImode);
27161 emit_insn (gen_zero_extendhisi2 (op2, op0));
27163 else if (mode0 == SImode)
27166 op2 = gen_rtx_SUBREG (SImode, op0, 0);
27169 target = gen_reg_rtx (SImode);
27171 pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
27173 emit_insn (gen_rtx_SET (VOIDmode, target,
27174 gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
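      /* A typical use of the step builtins (an illustrative sketch, not
	 from this file) retries until the hardware reports success:

	   unsigned int r;
	   while (!__builtin_ia32_rdrand32_step (&r))
	     continue;
      */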
27181 for (i = 0, d = bdesc_special_args;
27182 i < ARRAY_SIZE (bdesc_special_args);
27184 if (d->code == fcode)
27185 return ix86_expand_special_args_builtin (d, exp, target);
27187 for (i = 0, d = bdesc_args;
27188 i < ARRAY_SIZE (bdesc_args);
27190 if (d->code == fcode)
27193 case IX86_BUILTIN_FABSQ:
27194 case IX86_BUILTIN_COPYSIGNQ:
27196 /* Emit a normal call if SSE2 isn't available. */
27197 return expand_call (exp, target, ignore);
27199 return ix86_expand_args_builtin (d, exp, target);
27202 for (i = 0, d = bdesc_comi; i < ARRAY_SIZE (bdesc_comi); i++, d++)
27203 if (d->code == fcode)
27204 return ix86_expand_sse_comi (d, exp, target);
27206 for (i = 0, d = bdesc_pcmpestr;
27207 i < ARRAY_SIZE (bdesc_pcmpestr);
27209 if (d->code == fcode)
27210 return ix86_expand_sse_pcmpestr (d, exp, target);
27212 for (i = 0, d = bdesc_pcmpistr;
27213 i < ARRAY_SIZE (bdesc_pcmpistr);
27215 if (d->code == fcode)
27216 return ix86_expand_sse_pcmpistr (d, exp, target);
27218 for (i = 0, d = bdesc_multi_arg; i < ARRAY_SIZE (bdesc_multi_arg); i++, d++)
27219 if (d->code == fcode)
27220 return ix86_expand_multi_arg_builtin (d->icode, exp, target,
27221 (enum ix86_builtin_func_type)
27222 d->flag, d->comparison);
27224 gcc_unreachable ();
27227 /* Returns a function decl for a vectorized version of the builtin function
27228 with builtin function code FN and the result vector type TYPE, or NULL_TREE
27229 if it is not available. */
static tree
ix86_builtin_vectorized_function (tree fndecl, tree type_out,
				  tree type_in)
{
  enum machine_mode in_mode, out_mode;
  int in_n, out_n;
  enum built_in_function fn = DECL_FUNCTION_CODE (fndecl);
27239 if (TREE_CODE (type_out) != VECTOR_TYPE
27240 || TREE_CODE (type_in) != VECTOR_TYPE
      || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
    return NULL_TREE;
27244 out_mode = TYPE_MODE (TREE_TYPE (type_out));
27245 out_n = TYPE_VECTOR_SUBPARTS (type_out);
27246 in_mode = TYPE_MODE (TREE_TYPE (type_in));
  in_n = TYPE_VECTOR_SUBPARTS (type_in);

  switch (fn)
    {
    case BUILT_IN_SQRT:
      if (out_mode == DFmode && in_mode == DFmode)
	{
	  if (out_n == 2 && in_n == 2)
	    return ix86_builtins[IX86_BUILTIN_SQRTPD];
	  else if (out_n == 4 && in_n == 4)
	    return ix86_builtins[IX86_BUILTIN_SQRTPD256];
	}
      break;

    case BUILT_IN_SQRTF:
      if (out_mode == SFmode && in_mode == SFmode)
	{
	  if (out_n == 4 && in_n == 4)
	    return ix86_builtins[IX86_BUILTIN_SQRTPS_NR];
	  else if (out_n == 8 && in_n == 8)
	    return ix86_builtins[IX86_BUILTIN_SQRTPS_NR256];
	}
      break;
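      /* Illustrative note: every case in this switch matches purely on
	 element mode and lane count.  E.g. vectorizing sqrtf for V4SF
	 (out_n == in_n == 4) above returns the SQRTPS_NR builtin, so
	 the vectorizer emits one packed sqrt per four lanes.  */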
    case BUILT_IN_LRINT:
      if (out_mode == SImode && out_n == 4
	  && in_mode == DFmode && in_n == 2)
	return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
      break;

    case BUILT_IN_LRINTF:
      if (out_mode == SImode && in_mode == SFmode)
	{
	  if (out_n == 4 && in_n == 4)
	    return ix86_builtins[IX86_BUILTIN_CVTPS2DQ];
	  else if (out_n == 8 && in_n == 8)
	    return ix86_builtins[IX86_BUILTIN_CVTPS2DQ256];
	}
      break;
    case BUILT_IN_COPYSIGN:
      if (out_mode == DFmode && in_mode == DFmode)
	{
	  if (out_n == 2 && in_n == 2)
	    return ix86_builtins[IX86_BUILTIN_CPYSGNPD];
	  else if (out_n == 4 && in_n == 4)
	    return ix86_builtins[IX86_BUILTIN_CPYSGNPD256];
	}
      break;

    case BUILT_IN_COPYSIGNF:
      if (out_mode == SFmode && in_mode == SFmode)
	{
	  if (out_n == 4 && in_n == 4)
	    return ix86_builtins[IX86_BUILTIN_CPYSGNPS];
	  else if (out_n == 8 && in_n == 8)
	    return ix86_builtins[IX86_BUILTIN_CPYSGNPS256];
	}
      break;
    case BUILT_IN_FLOOR:
      /* The round insn does not trap on denormals.  */
      if (flag_trapping_math || !TARGET_ROUND)
	break;

      if (out_mode == DFmode && in_mode == DFmode)
	{
	  if (out_n == 2 && in_n == 2)
	    return ix86_builtins[IX86_BUILTIN_FLOORPD];
	  else if (out_n == 4 && in_n == 4)
	    return ix86_builtins[IX86_BUILTIN_FLOORPD256];
	}
      break;

    case BUILT_IN_FLOORF:
      /* The round insn does not trap on denormals.  */
      if (flag_trapping_math || !TARGET_ROUND)
	break;

      if (out_mode == SFmode && in_mode == SFmode)
	{
	  if (out_n == 4 && in_n == 4)
	    return ix86_builtins[IX86_BUILTIN_FLOORPS];
	  else if (out_n == 8 && in_n == 8)
	    return ix86_builtins[IX86_BUILTIN_FLOORPS256];
	}
      break;
    case BUILT_IN_CEIL:
      /* The round insn does not trap on denormals.  */
      if (flag_trapping_math || !TARGET_ROUND)
	break;

      if (out_mode == DFmode && in_mode == DFmode)
	{
	  if (out_n == 2 && in_n == 2)
	    return ix86_builtins[IX86_BUILTIN_CEILPD];
	  else if (out_n == 4 && in_n == 4)
	    return ix86_builtins[IX86_BUILTIN_CEILPD256];
	}
      break;

    case BUILT_IN_CEILF:
      /* The round insn does not trap on denormals.  */
      if (flag_trapping_math || !TARGET_ROUND)
	break;

      if (out_mode == SFmode && in_mode == SFmode)
	{
	  if (out_n == 4 && in_n == 4)
	    return ix86_builtins[IX86_BUILTIN_CEILPS];
	  else if (out_n == 8 && in_n == 8)
	    return ix86_builtins[IX86_BUILTIN_CEILPS256];
	}
      break;
    case BUILT_IN_TRUNC:
      /* The round insn does not trap on denormals.  */
      if (flag_trapping_math || !TARGET_ROUND)
	break;

      if (out_mode == DFmode && in_mode == DFmode)
	{
	  if (out_n == 2 && in_n == 2)
	    return ix86_builtins[IX86_BUILTIN_TRUNCPD];
	  else if (out_n == 4 && in_n == 4)
	    return ix86_builtins[IX86_BUILTIN_TRUNCPD256];
	}
      break;

    case BUILT_IN_TRUNCF:
      /* The round insn does not trap on denormals.  */
      if (flag_trapping_math || !TARGET_ROUND)
	break;

      if (out_mode == SFmode && in_mode == SFmode)
	{
	  if (out_n == 4 && in_n == 4)
	    return ix86_builtins[IX86_BUILTIN_TRUNCPS];
	  else if (out_n == 8 && in_n == 8)
	    return ix86_builtins[IX86_BUILTIN_TRUNCPS256];
	}
      break;
    case BUILT_IN_RINT:
      /* The round insn does not trap on denormals.  */
      if (flag_trapping_math || !TARGET_ROUND)
	break;

      if (out_mode == DFmode && in_mode == DFmode)
	{
	  if (out_n == 2 && in_n == 2)
	    return ix86_builtins[IX86_BUILTIN_RINTPD];
	  else if (out_n == 4 && in_n == 4)
	    return ix86_builtins[IX86_BUILTIN_RINTPD256];
	}
      break;

    case BUILT_IN_RINTF:
      /* The round insn does not trap on denormals.  */
      if (flag_trapping_math || !TARGET_ROUND)
	break;

      if (out_mode == SFmode && in_mode == SFmode)
	{
	  if (out_n == 4 && in_n == 4)
	    return ix86_builtins[IX86_BUILTIN_RINTPS];
	  else if (out_n == 8 && in_n == 8)
	    return ix86_builtins[IX86_BUILTIN_RINTPS256];
	}
      break;
    case BUILT_IN_FMA:
      if (out_mode == DFmode && in_mode == DFmode)
	{
	  if (out_n == 2 && in_n == 2)
	    return ix86_builtins[IX86_BUILTIN_VFMADDPD];
	  if (out_n == 4 && in_n == 4)
	    return ix86_builtins[IX86_BUILTIN_VFMADDPD256];
	}
      break;

    case BUILT_IN_FMAF:
      if (out_mode == SFmode && in_mode == SFmode)
	{
	  if (out_n == 4 && in_n == 4)
	    return ix86_builtins[IX86_BUILTIN_VFMADDPS];
	  if (out_n == 8 && in_n == 8)
	    return ix86_builtins[IX86_BUILTIN_VFMADDPS256];
	}
      break;

    default:
      break;
    }
27443 /* Dispatch to a handler for a vectorization library. */
27444 if (ix86_veclib_handler)
    return ix86_veclib_handler ((enum built_in_function) fn, type_out,
				type_in);

  return NULL_TREE;
}
27451 /* Handler for an SVML-style interface to
27452 a library with vectorized intrinsics. */
static tree
ix86_veclibabi_svml (enum built_in_function fn, tree type_out, tree type_in)
{
  char name[20];
  tree fntype, new_fndecl, args;
  unsigned arity;
  const char *bname;
  enum machine_mode el_mode, in_mode;
  int n, in_n;
27464 /* The SVML is suitable for unsafe math only. */
  if (!flag_unsafe_math_optimizations)
    return NULL_TREE;
27468 el_mode = TYPE_MODE (TREE_TYPE (type_out));
27469 n = TYPE_VECTOR_SUBPARTS (type_out);
27470 in_mode = TYPE_MODE (TREE_TYPE (type_in));
27471 in_n = TYPE_VECTOR_SUBPARTS (type_in);
  if (el_mode != in_mode
      || n != in_n)
    return NULL_TREE;

  switch (fn)
    {
    case BUILT_IN_EXP:
    case BUILT_IN_LOG:
    case BUILT_IN_LOG10:
    case BUILT_IN_POW:
    case BUILT_IN_TANH:
    case BUILT_IN_TAN:
    case BUILT_IN_ATAN:
    case BUILT_IN_ATAN2:
    case BUILT_IN_ATANH:
    case BUILT_IN_CBRT:
    case BUILT_IN_SINH:
    case BUILT_IN_SIN:
    case BUILT_IN_ASINH:
    case BUILT_IN_ASIN:
    case BUILT_IN_COSH:
    case BUILT_IN_COS:
    case BUILT_IN_ACOSH:
    case BUILT_IN_ACOS:
      if (el_mode != DFmode || n != 2)
	return NULL_TREE;
      break;
27500 case BUILT_IN_EXPF:
27501 case BUILT_IN_LOGF:
27502 case BUILT_IN_LOG10F:
27503 case BUILT_IN_POWF:
27504 case BUILT_IN_TANHF:
27505 case BUILT_IN_TANF:
27506 case BUILT_IN_ATANF:
27507 case BUILT_IN_ATAN2F:
27508 case BUILT_IN_ATANHF:
27509 case BUILT_IN_CBRTF:
27510 case BUILT_IN_SINHF:
27511 case BUILT_IN_SINF:
27512 case BUILT_IN_ASINHF:
27513 case BUILT_IN_ASINF:
27514 case BUILT_IN_COSHF:
27515 case BUILT_IN_COSF:
27516 case BUILT_IN_ACOSHF:
27517 case BUILT_IN_ACOSF:
      if (el_mode != SFmode || n != 4)
	return NULL_TREE;
      break;

    default:
      return NULL_TREE;
    }
  bname = IDENTIFIER_POINTER (DECL_NAME (implicit_built_in_decls[fn]));

  if (fn == BUILT_IN_LOGF)
    strcpy (name, "vmlsLn4");
  else if (fn == BUILT_IN_LOG)
    strcpy (name, "vmldLn2");
  else if (n == 4)
    {
      sprintf (name, "vmls%s", bname+10);
      name[strlen (name)-1] = '4';
    }
  else
    sprintf (name, "vmld%s2", bname+10);
  /* Convert to uppercase.  */
  name[4] &= ~0x20;
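  /* Worked example (illustrative): for BUILT_IN_SINF with n == 4,
     bname + 10 skips the "__builtin_" prefix leaving "sinf", so the
     steps above produce "vmlssinf" -> "vmlssin4" -> "vmlsSin4", the
     SVML entry point that is declared below.  */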
  arity = 0;
  for (args = DECL_ARGUMENTS (implicit_built_in_decls[fn]); args;
       args = TREE_CHAIN (args))
    arity++;

  if (arity == 1)
    fntype = build_function_type_list (type_out, type_in, NULL);
  else
    fntype = build_function_type_list (type_out, type_in, type_in, NULL);
27553 /* Build a function declaration for the vectorized function. */
27554 new_fndecl = build_decl (BUILTINS_LOCATION,
27555 FUNCTION_DECL, get_identifier (name), fntype);
27556 TREE_PUBLIC (new_fndecl) = 1;
27557 DECL_EXTERNAL (new_fndecl) = 1;
27558 DECL_IS_NOVOPS (new_fndecl) = 1;
  TREE_READONLY (new_fndecl) = 1;

  return new_fndecl;
}
27564 /* Handler for an ACML-style interface to
27565 a library with vectorized intrinsics. */
static tree
ix86_veclibabi_acml (enum built_in_function fn, tree type_out, tree type_in)
{
  char name[20] = "__vr.._";
  tree fntype, new_fndecl, args;
  unsigned arity;
  const char *bname;
  enum machine_mode el_mode, in_mode;
  int n, in_n;
27577 /* The ACML is 64bits only and suitable for unsafe math only as
27578 it does not correctly support parts of IEEE with the required
27579 precision such as denormals. */
  if (!TARGET_64BIT
      || !flag_unsafe_math_optimizations)
    return NULL_TREE;
27584 el_mode = TYPE_MODE (TREE_TYPE (type_out));
27585 n = TYPE_VECTOR_SUBPARTS (type_out);
27586 in_mode = TYPE_MODE (TREE_TYPE (type_in));
27587 in_n = TYPE_VECTOR_SUBPARTS (type_in);
  if (el_mode != in_mode
      || n != in_n)
    return NULL_TREE;

  switch (fn)
    {
    case BUILT_IN_SIN:
    case BUILT_IN_COS:
    case BUILT_IN_EXP:
    case BUILT_IN_LOG:
27598 case BUILT_IN_LOG2:
27599 case BUILT_IN_LOG10:
      name[4] = 'd';
      name[5] = '2';
      if (el_mode != DFmode
	  || n != 2)
	return NULL_TREE;
      break;
27607 case BUILT_IN_SINF:
27608 case BUILT_IN_COSF:
27609 case BUILT_IN_EXPF:
27610 case BUILT_IN_POWF:
27611 case BUILT_IN_LOGF:
27612 case BUILT_IN_LOG2F:
27613 case BUILT_IN_LOG10F:
      name[4] = 's';
      name[5] = '4';
      if (el_mode != SFmode
	  || n != 4)
	return NULL_TREE;
      break;

    default:
      return NULL_TREE;
    }
27625 bname = IDENTIFIER_POINTER (DECL_NAME (implicit_built_in_decls[fn]));
27626 sprintf (name + 7, "%s", bname+10);
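  /* Worked example (illustrative): for BUILT_IN_SINF the template
     "__vr.._" was patched to "__vrs4_" in the case above, and copying
     bname + 10 == "sinf" at offset 7 yields the ACML entry point
     "__vrs4_sinf".  */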
  arity = 0;
  for (args = DECL_ARGUMENTS (implicit_built_in_decls[fn]); args;
       args = TREE_CHAIN (args))
    arity++;

  if (arity == 1)
    fntype = build_function_type_list (type_out, type_in, NULL);
  else
    fntype = build_function_type_list (type_out, type_in, type_in, NULL);
27638 /* Build a function declaration for the vectorized function. */
27639 new_fndecl = build_decl (BUILTINS_LOCATION,
27640 FUNCTION_DECL, get_identifier (name), fntype);
27641 TREE_PUBLIC (new_fndecl) = 1;
27642 DECL_EXTERNAL (new_fndecl) = 1;
27643 DECL_IS_NOVOPS (new_fndecl) = 1;
  TREE_READONLY (new_fndecl) = 1;

  return new_fndecl;
}
27650 /* Returns a decl of a function that implements conversion of an integer vector
27651 into a floating-point vector, or vice-versa. DEST_TYPE and SRC_TYPE
27652 are the types involved when converting according to CODE.
27653 Return NULL_TREE if it is not available. */
static tree
ix86_vectorize_builtin_conversion (unsigned int code,
				   tree dest_type, tree src_type)
{
  if (!TARGET_SSE2)
    return NULL_TREE;

  switch (code)
    {
    case FLOAT_EXPR:
      switch (TYPE_MODE (src_type))
	{
	case V4SImode:
	  switch (TYPE_MODE (dest_type))
	    {
	    case V4SFmode:
	      return (TYPE_UNSIGNED (src_type)
		      ? ix86_builtins[IX86_BUILTIN_CVTUDQ2PS]
		      : ix86_builtins[IX86_BUILTIN_CVTDQ2PS]);
	    case V4DFmode:
	      return (TYPE_UNSIGNED (src_type)
		      ? NULL_TREE
		      : ix86_builtins[IX86_BUILTIN_CVTDQ2PD256]);
	    default:
	      return NULL_TREE;
	    }
	  break;

	case V8SImode:
	  switch (TYPE_MODE (dest_type))
	    {
	    case V8SFmode:
	      return (TYPE_UNSIGNED (src_type)
		      ? NULL_TREE
		      : ix86_builtins[IX86_BUILTIN_CVTDQ2PS256]);
	    default:
	      return NULL_TREE;
	    }
	  break;

	default:
	  return NULL_TREE;
	}

    case FIX_TRUNC_EXPR:
      switch (TYPE_MODE (dest_type))
	{
	case V4SImode:
	  switch (TYPE_MODE (src_type))
	    {
	    case V4SFmode:
	      return (TYPE_UNSIGNED (dest_type)
		      ? NULL_TREE
		      : ix86_builtins[IX86_BUILTIN_CVTTPS2DQ]);
	    case V4DFmode:
	      return (TYPE_UNSIGNED (dest_type)
		      ? NULL_TREE
		      : ix86_builtins[IX86_BUILTIN_CVTTPD2DQ256]);
	    default:
	      return NULL_TREE;
	    }
	  break;

	case V8SImode:
	  switch (TYPE_MODE (src_type))
	    {
	    case V8SFmode:
	      return (TYPE_UNSIGNED (dest_type)
		      ? NULL_TREE
		      : ix86_builtins[IX86_BUILTIN_CVTTPS2DQ256]);
	    default:
	      return NULL_TREE;
	    }
	  break;

	default:
	  return NULL_TREE;
	}

    default:
      return NULL_TREE;
    }
}
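/* Illustrative note for the conversion table above: a truncating
   V4SF -> V4SI conversion (FIX_TRUNC_EXPR) maps to the CVTTPS2DQ
   builtin, i.e. a single cvttps2dq instruction, while the unsigned
   variants return NULL_TREE because SSE has no direct instruction
   for them and the vectorizer must fall back to scalar code.  */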
27739 /* Returns a code for a target-specific builtin that implements
27740 reciprocal of the function, or NULL_TREE if not available. */
static tree
ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
			 bool sqrt ATTRIBUTE_UNUSED)
{
  if (! (TARGET_SSE_MATH && !optimize_insn_for_size_p ()
	 && flag_finite_math_only && !flag_trapping_math
	 && flag_unsafe_math_optimizations))
    return NULL_TREE;

  if (md_fn)
    /* Machine dependent builtins.  */
    switch (fn)
      {
	/* Vectorized version of sqrt to rsqrt conversion.  */
      case IX86_BUILTIN_SQRTPS_NR:
	return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR];

      case IX86_BUILTIN_SQRTPS_NR256:
	return ix86_builtins[IX86_BUILTIN_RSQRTPS_NR256];

      default:
	return NULL_TREE;
      }
  else
    /* Normal builtins.  */
    switch (fn)
      {
	/* Sqrt to rsqrt conversion.  */
      case BUILT_IN_SQRTF:
	return ix86_builtins[IX86_BUILTIN_RSQRTF];

      default:
	return NULL_TREE;
      }
}
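/* Illustrative note: the mapping above is what allows, e.g.,
   1.0f / sqrtf (x) under -ffast-math to be expanded via rsqrt; the
   "_NR" in the builtin names refers to the Newton-Raphson refinement
   step used to recover precision.  The guards mirror that: the
   rewrite is only valid with unsafe, finite-only, non-trapping
   math.  */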
27778 /* Helper for avx_vpermilps256_operand et al. This is also used by
27779 the expansion functions to turn the parallel back into a mask.
27780 The return value is 0 for no match and the imm8+1 for a match. */
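/* Illustrative example (not from the original source): for V8SFmode
   the parallel (1 0 2 3 5 4 6 7) swaps the first two elements of each
   128-bit lane; the low-lane selectors are packed two bits apiece
   into imm8 0xE1, so the function returns 0xE1 + 1 == 0xE2.  */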
int
avx_vpermilp_parallel (rtx par, enum machine_mode mode)
{
  unsigned i, nelt = GET_MODE_NUNITS (mode);
  unsigned mask = 0;
  unsigned char ipar[8];
  if (XVECLEN (par, 0) != (int) nelt)
    return 0;
27792 /* Validate that all of the elements are constants, and not totally
27793 out of range. Copy the data into an integral array to make the
27794 subsequent checks easier. */
27795 for (i = 0; i < nelt; ++i)
27797 rtx er = XVECEXP (par, 0, i);
27798 unsigned HOST_WIDE_INT ei;
      if (!CONST_INT_P (er))
	return 0;
      ei = INTVAL (er);
      if (ei >= nelt)
	return 0;
      ipar[i] = ei;
    }

  switch (mode)
    {
    case V4DFmode:
      /* In the 256-bit DFmode case, we can only move elements within
	 a 128-bit lane.  */
      for (i = 0; i < 2; ++i)
	{
	  if (ipar[i] >= 2)
	    return 0;
	  mask |= ipar[i] << i;
	}
      for (i = 2; i < 4; ++i)
	{
	  if (ipar[i] < 2)
	    return 0;
	  mask |= (ipar[i] - 2) << i;
	}
      break;

    case V8SFmode:
      /* In the 256-bit SFmode case, we have full freedom of movement
	 within the low 128-bit lane, but the high 128-bit lane must
	 mirror the exact same pattern.  */
      for (i = 0; i < 4; ++i)
	if (ipar[i] + 4 != ipar[i + 4])
	  return 0;
      nelt = 4;
      /* FALLTHRU */

    case V2DFmode:
    case V4SFmode:
      /* In the 128-bit case, we've full freedom in the placement of
	 the elements from the source operand.  */
      for (i = 0; i < nelt; ++i)
	mask |= ipar[i] << (i * (nelt / 2));
      break;

    default:
      gcc_unreachable ();
    }

  /* Make sure success has a non-zero value by adding one.  */
  return mask + 1;
}
27853 /* Helper for avx_vperm2f128_v4df_operand et al. This is also used by
27854 the expansion functions to turn the parallel back into a mask.
27855 The return value is 0 for no match and the imm8+1 for a match. */
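/* Illustrative example (not from the original source): for V4DFmode
   the parallel (0 1 4 5) takes the low half of each source operand;
   the reconstruction below yields imm8 0x20, the usual vperm2f128
   "low lane of src1 : low lane of src2" selector, returned as
   0x20 + 1 == 0x21.  */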
int
avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
{
  unsigned i, nelt = GET_MODE_NUNITS (mode), nelt2 = nelt / 2;
  unsigned mask = 0;
  unsigned char ipar[8];

  if (XVECLEN (par, 0) != (int) nelt)
    return 0;
27867 /* Validate that all of the elements are constants, and not totally
27868 out of range. Copy the data into an integral array to make the
27869 subsequent checks easier. */
27870 for (i = 0; i < nelt; ++i)
27872 rtx er = XVECEXP (par, 0, i);
27873 unsigned HOST_WIDE_INT ei;
      if (!CONST_INT_P (er))
	return 0;
      ei = INTVAL (er);
      if (ei >= 2 * nelt)
	return 0;
      ipar[i] = ei;
    }
27883 /* Validate that the halves of the permute are halves. */
27884 for (i = 0; i < nelt2 - 1; ++i)
    if (ipar[i] + 1 != ipar[i + 1])
      return 0;
  for (i = nelt2; i < nelt - 1; ++i)
    if (ipar[i] + 1 != ipar[i + 1])
      return 0;
27891 /* Reconstruct the mask. */
27892 for (i = 0; i < 2; ++i)
      unsigned e = ipar[i * nelt2];
      if (e % nelt2)
	return 0;
      e /= nelt2;
      mask |= e << (i * 4);
    }

  /* Make sure success has a non-zero value by adding one.  */
  return mask + 1;
}
27906 /* Store OPERAND to the memory after reload is completed. This means
27907 that we can't easily use assign_stack_local. */
rtx
ix86_force_to_memory (enum machine_mode mode, rtx operand)
{
  rtx result;

  gcc_assert (reload_completed);
  if (ix86_using_red_zone ())
    {
      result = gen_rtx_MEM (mode,
			    gen_rtx_PLUS (Pmode,
					  stack_pointer_rtx,
					  GEN_INT (-RED_ZONE_SIZE)));
      emit_move_insn (result, operand);
    }
  else if (TARGET_64BIT)
    {
      switch (mode)
	{
	case HImode:
	case SImode:
	  operand = gen_lowpart (DImode, operand);
	  /* FALLTHRU */
	case DImode:
	  emit_insn (gen_rtx_SET (VOIDmode,
				  gen_rtx_MEM (DImode,
					       gen_rtx_PRE_DEC (DImode,
								stack_pointer_rtx)),
				  operand));
	  break;
	default:
	  gcc_unreachable ();
	}
      result = gen_rtx_MEM (mode, stack_pointer_rtx);
    }
  else
    {
      switch (mode)
	{
	case DImode:
	  {
	    rtx operands[2];
	    split_double_mode (mode, &operand, 1, operands, operands + 1);
	    emit_insn (gen_rtx_SET (VOIDmode,
				    gen_rtx_MEM (SImode,
						 gen_rtx_PRE_DEC (Pmode,
								  stack_pointer_rtx)),
				    operands[1]));
	    emit_insn (gen_rtx_SET (VOIDmode,
				    gen_rtx_MEM (SImode,
						 gen_rtx_PRE_DEC (Pmode,
								  stack_pointer_rtx)),
				    operands[0]));
	  }
	  break;
	case HImode:
	  /* Store HImodes as SImodes.  */
	  operand = gen_lowpart (SImode, operand);
	  /* FALLTHRU */
	case SImode:
	  emit_insn (gen_rtx_SET (VOIDmode,
				  gen_rtx_MEM (GET_MODE (operand),
					       gen_rtx_PRE_DEC (SImode,
								stack_pointer_rtx)),
				  operand));
	  break;
	default:
	  gcc_unreachable ();
	}
      result = gen_rtx_MEM (mode, stack_pointer_rtx);
    }
  return result;
}
27985 /* Free operand from the memory. */
void
ix86_free_from_memory (enum machine_mode mode)
{
  gcc_assert (reload_completed);
  if (!ix86_using_red_zone ())
    {
      int size;

      if (mode == DImode || TARGET_64BIT)
	size = 8;
      else
	size = 4;
      /* Use LEA to deallocate stack space.  In peephole2 it will be
	 converted to pop or add instruction if registers are available.  */
      emit_insn (gen_rtx_SET (VOIDmode, stack_pointer_rtx,
			      gen_rtx_PLUS (Pmode, stack_pointer_rtx,
					    GEN_INT (size))));
    }
}
28005 /* Implement TARGET_PREFERRED_RELOAD_CLASS.
28007 Put float CONST_DOUBLE in the constant pool instead of fp regs.
28008 QImode must go into class Q_REGS.
28009 Narrow ALL_REGS to GENERAL_REGS. This supports allowing movsf and
28010 movdf to do mem-to-mem moves through integer regs. */
static reg_class_t
ix86_preferred_reload_class (rtx x, reg_class_t regclass)
{
28015 enum machine_mode mode = GET_MODE (x);
28017 /* We're only allowed to return a subclass of CLASS. Many of the
28018 following checks fail for NO_REGS, so eliminate that early. */
  if (regclass == NO_REGS)
    return NO_REGS;
28022 /* All classes can load zeros. */
  if (x == CONST0_RTX (mode))
    return regclass;
28026 /* Force constants into memory if we are loading a (nonzero) constant into
28027 an MMX or SSE register. This is because there are no MMX/SSE instructions
28028 to load from a constant. */
  if (CONSTANT_P (x)
      && (MAYBE_MMX_CLASS_P (regclass) || MAYBE_SSE_CLASS_P (regclass)))
    return NO_REGS;
28033 /* Prefer SSE regs only, if we can use them for math. */
28034 if (TARGET_SSE_MATH && !TARGET_MIX_SSE_I387 && SSE_FLOAT_MODE_P (mode))
28035 return SSE_CLASS_P (regclass) ? regclass : NO_REGS;
28037 /* Floating-point constants need more complex checks. */
  if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode)
    {
      /* General regs can load everything.  */
      if (reg_class_subset_p (regclass, GENERAL_REGS))
	return regclass;

      /* Floats can load 0 and 1 plus some others.  Note that we eliminated
	 zero above.  We only want to wind up preferring 80387 registers if
	 we plan on doing computation with them.  */
      if (TARGET_80387
	  && standard_80387_constant_p (x) > 0)
	{
	  /* Limit class to non-sse.  */
	  if (regclass == FLOAT_SSE_REGS)
	    return FLOAT_REGS;
	  if (regclass == FP_TOP_SSE_REGS)
	    return FP_TOP_REG;
	  if (regclass == FP_SECOND_SSE_REGS)
	    return FP_SECOND_REG;
	  if (regclass == FLOAT_INT_REGS || regclass == FLOAT_REGS)
	    return regclass;
	}

      return NO_REGS;
    }
  /* Generally when we see PLUS here, it's the function invariant
     (plus soft-fp const_int).  Which can only be computed into general
     regs.  */
  if (GET_CODE (x) == PLUS)
    return reg_class_subset_p (regclass, GENERAL_REGS) ? regclass : NO_REGS;
28070 /* QImode constants are easy to load, but non-constant QImode data
28071 must go into Q_REGS. */
  if (GET_MODE (x) == QImode && !CONSTANT_P (x))
    {
      if (reg_class_subset_p (regclass, Q_REGS))
	return regclass;
      if (reg_class_subset_p (Q_REGS, regclass))
	return Q_REGS;
      return NO_REGS;
    }

  return regclass;
}
28084 /* Discourage putting floating-point values in SSE registers unless
28085 SSE math is being used, and likewise for the 387 registers. */
static reg_class_t
ix86_preferred_output_reload_class (rtx x, reg_class_t regclass)
{
28089 enum machine_mode mode = GET_MODE (x);
28091 /* Restrict the output reload class to the register bank that we are doing
28092 math on. If we would like not to return a subset of CLASS, reject this
28093 alternative: if reload cannot do this, it will still use its choice. */
28095 if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
28096 return MAYBE_SSE_CLASS_P (regclass) ? SSE_REGS : NO_REGS;
  if (X87_FLOAT_MODE_P (mode))
    {
      if (regclass == FP_TOP_SSE_REGS)
	return FP_TOP_REG;
      else if (regclass == FP_SECOND_SSE_REGS)
	return FP_SECOND_REG;
      else
	return FLOAT_CLASS_P (regclass) ? regclass : NO_REGS;
    }

  return regclass;
}
static reg_class_t
ix86_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
		       enum machine_mode mode,
		       secondary_reload_info *sri ATTRIBUTE_UNUSED)
{
  /* QImode spills from non-QI registers require
     intermediate register on 32bit targets.  */
  if (!TARGET_64BIT
      && !in_p && mode == QImode
      && (rclass == GENERAL_REGS
	  || rclass == LEGACY_REGS
	  || rclass == INDEX_REGS))
    {
      int regno;

      if (REG_P (x))
	regno = REGNO (x);
      else
	regno = -1;

      if (regno >= FIRST_PSEUDO_REGISTER || GET_CODE (x) == SUBREG)
	regno = true_regnum (x);

      /* Return Q_REGS if the operand is in memory.  */
      if (regno == -1)
	return Q_REGS;
    }
28139 /* This condition handles corner case where an expression involving
28140 pointers gets vectorized. We're trying to use the address of a
28141 stack slot as a vector initializer.
28143 (set (reg:V2DI 74 [ vect_cst_.2 ])
28144 (vec_duplicate:V2DI (reg/f:DI 20 frame)))
28146 Eventually frame gets turned into sp+offset like this:
28148 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
28149 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
28150 (const_int 392 [0x188]))))
28152 That later gets turned into:
28154 (set (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
28155 (vec_duplicate:V2DI (plus:DI (reg/f:DI 7 sp)
28156 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))))
28158 We'll have the following reload recorded:
28160 Reload 0: reload_in (DI) =
28161 (plus:DI (reg/f:DI 7 sp)
28162 (mem/u/c/i:DI (symbol_ref/u:DI ("*.LC0") [flags 0x2]) [0 S8 A64]))
28163 reload_out (V2DI) = (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
28164 SSE_REGS, RELOAD_OTHER (opnum = 0), can't combine
28165 reload_in_reg: (plus:DI (reg/f:DI 7 sp) (const_int 392 [0x188]))
28166 reload_out_reg: (reg:V2DI 21 xmm0 [orig:74 vect_cst_.2 ] [74])
28167 reload_reg_rtx: (reg:V2DI 22 xmm1)
28169 Which isn't going to work since SSE instructions can't handle scalar
28170 additions. Returning GENERAL_REGS forces the addition into integer
28171 register and reload can handle subsequent reloads without problems. */
28173 if (in_p && GET_CODE (x) == PLUS
28174 && SSE_CLASS_P (rclass)
28175 && SCALAR_INT_MODE_P (mode))
    return GENERAL_REGS;

  return NO_REGS;
}
28181 /* Implement TARGET_CLASS_LIKELY_SPILLED_P. */
static bool
ix86_class_likely_spilled_p (reg_class_t rclass)
{
  switch (rclass)
    {
      case AREG:
      case DREG:
      case CREG:
      case BREG:
      case AD_REGS:
      case CLOBBERED_REGS:
      case Q_REGS:
      case SSE_FIRST_REG:
      case FP_TOP_REG:
      case FP_SECOND_REG:
	return true;

      default:
	break;
    }

  return false;
}
28207 /* If we are copying between general and FP registers, we need a memory
28208 location. The same is true for SSE and MMX registers.
28210 To optimize register_move_cost performance, allow inline variant.
28212 The macro can't work reliably when one of the CLASSES is class containing
28213 registers from multiple units (SSE, MMX, integer). We avoid this by never
28214 combining those units in single alternative in the machine description.
28215 Ensure that this constraint holds to avoid unexpected surprises.
28217 When STRICT is false, we are being called from REGISTER_MOVE_COST, so do not
28218 enforce these sanity checks. */
28221 inline_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
28222 enum machine_mode mode, int strict)
28224 if (MAYBE_FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class1)
28225 || MAYBE_FLOAT_CLASS_P (class2) != FLOAT_CLASS_P (class2)
28226 || MAYBE_SSE_CLASS_P (class1) != SSE_CLASS_P (class1)
28227 || MAYBE_SSE_CLASS_P (class2) != SSE_CLASS_P (class2)
28228 || MAYBE_MMX_CLASS_P (class1) != MMX_CLASS_P (class1)
      || MAYBE_MMX_CLASS_P (class2) != MMX_CLASS_P (class2))
    {
      gcc_assert (!strict);
      return true;
    }

  if (FLOAT_CLASS_P (class1) != FLOAT_CLASS_P (class2))
    return true;
28238 /* ??? This is a lie. We do have moves between mmx/general, and for
28239 mmx/sse2. But by saying we need secondary memory we discourage the
28240 register allocator from using the mmx registers unless needed. */
  if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2))
    return true;

  if (SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
    {
      /* SSE1 doesn't have any direct moves from other classes.  */
      if (!TARGET_SSE2)
	return true;

      /* If the target says that inter-unit moves are more expensive
	 than moving through memory, then don't generate them.  */
      if (!TARGET_INTER_UNIT_MOVES)
	return true;

      /* Between SSE and general, we have moves no larger than word size.  */
      if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
	return true;
    }

  return false;
}
bool
ix86_secondary_memory_needed (enum reg_class class1, enum reg_class class2,
28265 enum machine_mode mode, int strict)
28267 return inline_secondary_memory_needed (class1, class2, mode, strict);
28270 /* Return true if the registers in CLASS cannot represent the change from
28271 modes FROM to TO. */
static bool
ix86_cannot_change_mode_class (enum machine_mode from, enum machine_mode to,
			       enum reg_class regclass)
{
  if (from == to)
    return false;

  /* x87 registers can't do subreg at all, as all values are reformatted
     to extended precision.  */
  if (MAYBE_FLOAT_CLASS_P (regclass))
    return true;
  if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
    {
28287 /* Vector registers do not support QI or HImode loads. If we don't
28288 disallow a change to these modes, reload will assume it's ok to
28289 drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
28290 the vec_dupv4hi pattern. */
      if (GET_MODE_SIZE (from) < 4)
	return true;
28294 /* Vector registers do not support subreg with nonzero offsets, which
28295 are otherwise valid for integer registers. Since we can't see
28296 whether we have a nonzero offset from here, prohibit all
28297 nonparadoxical subregs changing size. */
      if (GET_MODE_SIZE (to) < GET_MODE_SIZE (from))
	return true;
    }

  return false;
}
28305 /* Return the cost of moving data of mode M between a
28306 register and memory. A value of 2 is the default; this cost is
28307 relative to those in `REGISTER_MOVE_COST'.
28309 This function is used extensively by register_move_cost that is used to
28310 build tables at startup. Make it inline in this case.
28311 When IN is 2, return maximum of in and out move cost.
28313 If moving between registers and memory is more expensive than
28314 between two registers, you should define this macro to express the
   Model also increased moving costs of QImode registers in non
   Q_REGS classes.
 */
static inline int
inline_memory_move_cost (enum machine_mode mode, enum reg_class regclass,
			 int in)
{
  int cost;

  if (FLOAT_CLASS_P (regclass))
    {
      int index;
      switch (mode)
	{
	case SFmode:
	  index = 0;
	  break;
	case DFmode:
	  index = 1;
	  break;
	case XFmode:
	  index = 2;
	  break;
	default:
	  return 100;
	}
      if (in == 2)
	return MAX (ix86_cost->fp_load [index], ix86_cost->fp_store [index]);
      return in ? ix86_cost->fp_load [index] : ix86_cost->fp_store [index];
    }
  if (SSE_CLASS_P (regclass))
    {
      int index;
      switch (GET_MODE_SIZE (mode))
	{
	case 4:
	  index = 0;
	  break;
	case 8:
	  index = 1;
	  break;
	case 16:
	  index = 2;
	  break;
	default:
	  return 100;
	}
      if (in == 2)
	return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store [index]);
      return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store [index];
    }
  if (MMX_CLASS_P (regclass))
    {
      int index;
      switch (GET_MODE_SIZE (mode))
	{
	case 4:
	  index = 0;
	  break;
	case 8:
	  index = 1;
	  break;
	default:
	  return 100;
	}
      if (in == 2)
	return MAX (ix86_cost->mmx_load [index], ix86_cost->mmx_store [index]);
      return in ? ix86_cost->mmx_load [index] : ix86_cost->mmx_store [index];
    }
  switch (GET_MODE_SIZE (mode))
    {
    case 1:
      if (Q_CLASS_P (regclass) || TARGET_64BIT)
	{
	  if (!in)
	    return ix86_cost->int_store[0];
	  if (TARGET_PARTIAL_REG_DEPENDENCY
	      && optimize_function_for_speed_p (cfun))
	    cost = ix86_cost->movzbl_load;
	  else
	    cost = ix86_cost->int_load[0];
	  if (in == 2)
	    return MAX (cost, ix86_cost->int_store[0]);
	  return cost;
	}
      else
	{
	  if (in == 2)
	    return MAX (ix86_cost->movzbl_load, ix86_cost->int_store[0] + 4);
	  if (in)
	    return ix86_cost->movzbl_load;
	  else
	    return ix86_cost->int_store[0] + 4;
	}
      break;
    case 2:
      if (in == 2)
	return MAX (ix86_cost->int_load[1], ix86_cost->int_store[1]);
      return in ? ix86_cost->int_load[1] : ix86_cost->int_store[1];
    default:
      /* Compute number of 32bit moves needed.  TFmode is moved as XFmode.  */
      if (mode == TFmode)
	mode = XFmode;
      if (in == 2)
	cost = MAX (ix86_cost->int_load[2], ix86_cost->int_store[2]);
      else if (in)
	cost = ix86_cost->int_load[2];
      else
	cost = ix86_cost->int_store[2];
      return (cost * (((int) GET_MODE_SIZE (mode)
		       + UNITS_PER_WORD - 1) / UNITS_PER_WORD));
    }
}
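/* Worked example (illustrative): for TFmode in general registers the
   default branch above first rewrites the mode to XFmode, then bills
   int_load[2]/int_store[2] once per word, i.e. three 32-bit moves on
   a 32-bit target (12 bytes in 4-byte words).  */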
static int
ix86_memory_move_cost (enum machine_mode mode, reg_class_t regclass,
		       bool in)
{
  return inline_memory_move_cost (mode, (enum reg_class) regclass, in ? 1 : 0);
}
28438 /* Return the cost of moving data from a register in class CLASS1 to
28439 one in class CLASS2.
28441 It is not required that the cost always equal 2 when FROM is the same as TO;
28442 on some machines it is expensive to move between registers if they are not
28443 general registers. */
static int
ix86_register_move_cost (enum machine_mode mode, reg_class_t class1_i,
28447 reg_class_t class2_i)
28449 enum reg_class class1 = (enum reg_class) class1_i;
28450 enum reg_class class2 = (enum reg_class) class2_i;
28452 /* In case we require secondary memory, compute cost of the store followed
28453 by load. In order to avoid bad register allocation choices, we need
28454 for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */
  if (inline_secondary_memory_needed (class1, class2, mode, 0))
    {
      int cost = 1;

      cost += inline_memory_move_cost (mode, class1, 2);
      cost += inline_memory_move_cost (mode, class2, 2);

      /* In case of copying from general_purpose_register we may emit multiple
	 stores followed by single load causing memory size mismatch stall.
	 Count this as arbitrarily high cost of 20.  */
      if (CLASS_MAX_NREGS (class1, mode) > CLASS_MAX_NREGS (class2, mode))
	cost += 20;

      /* In the case of FP/MMX moves, the registers actually overlap, and we
	 have to switch modes in order to treat them differently.  */
      if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2))
	  || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1)))
	cost += 20;

      return cost;
    }
28478 /* Moves between SSE/MMX and integer unit are expensive. */
28479 if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2)
28480 || SSE_CLASS_P (class1) != SSE_CLASS_P (class2))
28482 /* ??? By keeping returned value relatively high, we limit the number
28483 of moves between integer and MMX/SSE registers for all targets.
28484 Additionally, high value prevents problem with x86_modes_tieable_p(),
28485 where integer modes in MMX/SSE registers are not tieable
28486 because of missing QImode and HImode moves to, from or between
28487 MMX/SSE registers. */
28488 return MAX (8, ix86_cost->mmxsse_to_integer);
28490 if (MAYBE_FLOAT_CLASS_P (class1))
28491 return ix86_cost->fp_move;
28492 if (MAYBE_SSE_CLASS_P (class1))
28493 return ix86_cost->sse_move;
28494 if (MAYBE_MMX_CLASS_P (class1))
    return ix86_cost->mmx_move;

  return 2;
}
/* Return TRUE if hard register REGNO can hold a value of machine-mode
   MODE.  */
bool
ix86_hard_regno_mode_ok (int regno, enum machine_mode mode)
{
28505 /* Flags and only flags can only hold CCmode values. */
28506 if (CC_REGNO_P (regno))
28507 return GET_MODE_CLASS (mode) == MODE_CC;
28508 if (GET_MODE_CLASS (mode) == MODE_CC
28509 || GET_MODE_CLASS (mode) == MODE_RANDOM
      || GET_MODE_CLASS (mode) == MODE_PARTIAL_INT)
    return false;
28512 if (FP_REGNO_P (regno))
28513 return VALID_FP_MODE_P (mode);
  if (SSE_REGNO_P (regno))
    {
      /* We implement the move patterns for all vector modes into and
	 out of SSE registers, even when no operation instructions
	 are available.  OImode move is available only when AVX is
	 enabled.  */
      return ((TARGET_AVX && mode == OImode)
28521 || VALID_AVX256_REG_MODE (mode)
28522 || VALID_SSE_REG_MODE (mode)
28523 || VALID_SSE2_REG_MODE (mode)
28524 || VALID_MMX_REG_MODE (mode)
	      || VALID_MMX_REG_MODE_3DNOW (mode));
    }
  if (MMX_REGNO_P (regno))
    {
      /* We implement the move patterns for 3DNOW modes even in MMX mode,
	 so if the register is available at all, then we can move data of
	 the given mode into or out of it.  */
      return (VALID_MMX_REG_MODE (mode)
	      || VALID_MMX_REG_MODE_3DNOW (mode));
    }
  if (mode == QImode)
    {
      /* Take care for QImode values - they can be in non-QI regs,
	 but then they do cause partial register stalls.  */
      if (regno <= BX_REG || TARGET_64BIT)
	return true;
      if (!TARGET_PARTIAL_REG_STALL)
	return true;
      return !can_create_pseudo_p ();
    }
  /* We handle both integer and floats in the general purpose registers.  */
  else if (VALID_INT_MODE_P (mode))
    return true;
  else if (VALID_FP_MODE_P (mode))
    return true;
  else if (VALID_DFP_MODE_P (mode))
    return true;
  /* Lots of MMX code casts 8 byte vector modes to DImode.  If we then go
     on to use that value in smaller contexts, this can easily force a
     pseudo to be allocated to GENERAL_REGS.  Since this is no worse than
     supporting DImode, allow it.  */
  else if (VALID_MMX_REG_MODE_3DNOW (mode) || VALID_MMX_REG_MODE (mode))
    return true;

  return false;
}
28563 /* A subroutine of ix86_modes_tieable_p. Return true if MODE is a
28564 tieable integer mode. */
static bool
ix86_tieable_integer_mode_p (enum machine_mode mode)
{
  switch (mode)
    {
    case HImode:
    case SImode:
      return true;

    case QImode:
      return TARGET_64BIT || !TARGET_PARTIAL_REG_STALL;

    case DImode:
      return TARGET_64BIT;

    default:
      return false;
    }
}
28586 /* Return true if MODE1 is accessible in a register that can hold MODE2
28587 without copying. That is, all register classes that can hold MODE2
28588 can also hold MODE1. */
static bool
ix86_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
{
  if (mode1 == mode2)
    return true;
28596 if (ix86_tieable_integer_mode_p (mode1)
      && ix86_tieable_integer_mode_p (mode2))
    return true;
28600 /* MODE2 being XFmode implies fp stack or general regs, which means we
28601 can tie any smaller floating point modes to it. Note that we do not
28602 tie this with TFmode. */
28603 if (mode2 == XFmode)
28604 return mode1 == SFmode || mode1 == DFmode;
28606 /* MODE2 being DFmode implies fp stack, general or sse regs, which means
28607 that we can tie it with SFmode. */
28608 if (mode2 == DFmode)
28609 return mode1 == SFmode;
28611 /* If MODE2 is only appropriate for an SSE register, then tie with
28612 any other mode acceptable to SSE registers. */
28613 if (GET_MODE_SIZE (mode2) == 16
28614 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
28615 return (GET_MODE_SIZE (mode1) == 16
28616 && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
28618 /* If MODE2 is appropriate for an MMX register, then tie
28619 with any other mode acceptable to MMX registers. */
28620 if (GET_MODE_SIZE (mode2) == 8
28621 && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode2))
28622 return (GET_MODE_SIZE (mode1) == 8
	    && ix86_hard_regno_mode_ok (FIRST_MMX_REG, mode1));

  return false;
}
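/* Illustrative examples: SImode and HImode tie (any register class
   that holds one can hold the other), and DImode ties with V2SFmode
   through the size-8 MMX rule above; XFmode only ties with the
   smaller scalar float modes, never with TFmode.  */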
28628 /* Compute a (partial) cost for rtx X. Return true if the complete
28629 cost has been computed, and false if subexpressions should be
28630 scanned. In either case, *TOTAL contains the cost result. */
static bool
ix86_rtx_costs (rtx x, int code, int outer_code_i, int *total, bool speed)
{
  enum rtx_code outer_code = (enum rtx_code) outer_code_i;
  enum machine_mode mode = GET_MODE (x);
  const struct processor_costs *cost = speed ? ix86_cost : &ix86_size_cost;

  switch (code)
    {
    case CONST_INT:
    case CONST:
    case LABEL_REF:
    case SYMBOL_REF:
      if (TARGET_64BIT && !x86_64_immediate_operand (x, VOIDmode))
	*total = 3;
      else if (TARGET_64BIT && !x86_64_zext_immediate_operand (x, VOIDmode))
	*total = 2;
      else if (flag_pic && SYMBOLIC_CONST (x)
	       && (!TARGET_64BIT
		   || (GET_CODE (x) != LABEL_REF
		       && (GET_CODE (x) != SYMBOL_REF
			   || !SYMBOL_REF_LOCAL_P (x)))))
	*total = 1;
      else
	*total = 0;
      return false;

    case CONST_DOUBLE:
      if (mode == VOIDmode)
	*total = 0;
      else
	switch (standard_80387_constant_p (x))
	  {
	  case 1: /* 0.0 */
	    *total = 1;
	    break;
	  default: /* Other constants */
	    *total = 2;
	    break;
	  case 0:
	  case -1:
	    /* Start with (MEM (SYMBOL_REF)), since that's where
	       it'll probably end up.  Add a penalty for size.  */
	    *total = (COSTS_N_INSNS (1)
		      + (flag_pic != 0 && !TARGET_64BIT)
		      + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
	    break;
	  }
      return true;

    case ZERO_EXTEND:
      /* Zero extension is often completely free on x86_64, so make
	 it as cheap as possible.  */
      if (TARGET_64BIT && mode == DImode
	  && GET_MODE (XEXP (x, 0)) == SImode)
	*total = 1;
      else if (TARGET_ZERO_EXTEND_WITH_AND)
	*total = cost->add;
      else
	*total = cost->movzx;
      return false;

    case SIGN_EXTEND:
      *total = cost->movsx;
      return false;
    case ASHIFT:
      if (CONST_INT_P (XEXP (x, 1))
	  && (GET_MODE (XEXP (x, 0)) != DImode || TARGET_64BIT))
	{
	  HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
	  if (value == 1)
	    {
	      *total = cost->add;
	      return false;
	    }
	  if ((value == 2 || value == 3)
	      && cost->lea <= cost->shift_const)
	    {
	      *total = cost->lea;
	      return false;
	    }
	}
      /* FALLTHRU */

    case ROTATE:
    case ASHIFTRT:
    case LSHIFTRT:
    case ROTATERT:
      if (!TARGET_64BIT && GET_MODE (XEXP (x, 0)) == DImode)
	{
	  if (CONST_INT_P (XEXP (x, 1)))
	    {
	      if (INTVAL (XEXP (x, 1)) > 32)
		*total = cost->shift_const + COSTS_N_INSNS (2);
	      else
		*total = cost->shift_const * 2;
	    }
	  else
	    {
	      if (GET_CODE (XEXP (x, 1)) == AND)
		*total = cost->shift_var * 2;
	      else
		*total = cost->shift_var * 6 + COSTS_N_INSNS (2);
	    }
	}
      else
	{
	  if (CONST_INT_P (XEXP (x, 1)))
	    *total = cost->shift_const;
	  else
	    *total = cost->shift_var;
	}
      return false;

    case FMA:
      {
	rtx sub;
28751 gcc_assert (FLOAT_MODE_P (mode));
28752 gcc_assert (TARGET_FMA || TARGET_FMA4);
28754 /* ??? SSE scalar/vector cost should be used here. */
28755 /* ??? Bald assumption that fma has the same cost as fmul. */
28756 *total = cost->fmul;
28757 *total += rtx_cost (XEXP (x, 1), FMA, speed);
	/* Negate in op0 or op2 is free: FMS, FNMA, FNMS.  */
	sub = XEXP (x, 0);
	if (GET_CODE (sub) == NEG)
28762 sub = XEXP (sub, 0);
28763 *total += rtx_cost (sub, FMA, speed);
	sub = XEXP (x, 2);
	if (GET_CODE (sub) == NEG)
28767 sub = XEXP (sub, 0);
	*total += rtx_cost (sub, FMA, speed);
	return true;
      }

    case MULT:
      if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
	{
	  /* ??? SSE scalar cost should be used here.  */
	  *total = cost->fmul;
	  return false;
	}
      else if (X87_FLOAT_MODE_P (mode))
	{
	  *total = cost->fmul;
	  return false;
	}
      else if (FLOAT_MODE_P (mode))
	{
	  /* ??? SSE vector cost should be used here.  */
	  *total = cost->fmul;
	  return false;
	}
      else
	{
	  rtx op0 = XEXP (x, 0);
	  rtx op1 = XEXP (x, 1);
	  int nbits;
	  if (CONST_INT_P (XEXP (x, 1)))
	    {
	      unsigned HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
	      for (nbits = 0; value != 0; value &= value - 1)
		nbits++;
	    }
	  else
	    /* This is arbitrary.  */
	    nbits = 7;

	  /* Compute costs correctly for widening multiplication.  */
	  if ((GET_CODE (op0) == SIGN_EXTEND || GET_CODE (op0) == ZERO_EXTEND)
	      && GET_MODE_SIZE (GET_MODE (XEXP (op0, 0))) * 2
		 == GET_MODE_SIZE (mode))
	    {
	      int is_mulwiden = 0;
	      enum machine_mode inner_mode = GET_MODE (op0);

	      if (GET_CODE (op0) == GET_CODE (op1))
		is_mulwiden = 1, op1 = XEXP (op1, 0);
	      else if (CONST_INT_P (op1))
		{
		  if (GET_CODE (op0) == SIGN_EXTEND)
		    is_mulwiden = trunc_int_for_mode (INTVAL (op1), inner_mode)
				  == INTVAL (op1);
		  else
		    is_mulwiden = !(INTVAL (op1) & ~GET_MODE_MASK (inner_mode));
		}

	      if (is_mulwiden)
		op0 = XEXP (op0, 0), mode = GET_MODE (op0);
	    }

	  *total = (cost->mult_init[MODE_INDEX (mode)]
		    + nbits * cost->mult_bit
		    + rtx_cost (op0, outer_code, speed)
		    + rtx_cost (op1, outer_code, speed));
	  return true;
	}
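      /* Worked example (illustrative): a multiply by the constant 10
	 (binary 1010, two set bits) in SImode is costed above as
	 mult_init[MODE_INDEX (SImode)] + 2 * mult_bit plus the operand
	 costs, approximating a shift-and-add expansion.  */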
    case DIV:
    case UDIV:
    case MOD:
    case UMOD:
      if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
	/* ??? SSE cost should be used here.  */
	*total = cost->fdiv;
      else if (X87_FLOAT_MODE_P (mode))
	*total = cost->fdiv;
      else if (FLOAT_MODE_P (mode))
	/* ??? SSE vector cost should be used here.  */
	*total = cost->fdiv;
      else
	*total = cost->divide[MODE_INDEX (mode)];
      return false;

    case PLUS:
      if (GET_MODE_CLASS (mode) == MODE_INT
	  && GET_MODE_BITSIZE (mode) <= GET_MODE_BITSIZE (Pmode))
	{
	  if (GET_CODE (XEXP (x, 0)) == PLUS
	      && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
	      && CONST_INT_P (XEXP (XEXP (XEXP (x, 0), 0), 1))
	      && CONSTANT_P (XEXP (x, 1)))
	    {
	      HOST_WIDE_INT val = INTVAL (XEXP (XEXP (XEXP (x, 0), 0), 1));
	      if (val == 2 || val == 4 || val == 8)
		{
		  *total = cost->lea;
		  *total += rtx_cost (XEXP (XEXP (x, 0), 1), outer_code, speed);
		  *total += rtx_cost (XEXP (XEXP (XEXP (x, 0), 0), 0),
				      outer_code, speed);
		  *total += rtx_cost (XEXP (x, 1), outer_code, speed);
		  return true;
		}
	    }
	  else if (GET_CODE (XEXP (x, 0)) == MULT
		   && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
	    {
	      HOST_WIDE_INT val = INTVAL (XEXP (XEXP (x, 0), 1));
	      if (val == 2 || val == 4 || val == 8)
		{
		  *total = cost->lea;
		  *total += rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, speed);
		  *total += rtx_cost (XEXP (x, 1), outer_code, speed);
		  return true;
		}
	    }
	  else if (GET_CODE (XEXP (x, 0)) == PLUS)
	    {
	      *total = cost->lea;
	      *total += rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, speed);
	      *total += rtx_cost (XEXP (XEXP (x, 0), 1), outer_code, speed);
	      *total += rtx_cost (XEXP (x, 1), outer_code, speed);
	      return true;
	    }
	}
      /* FALLTHRU */

    case MINUS:
      if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
	{
	  /* ??? SSE cost should be used here.  */
	  *total = cost->fadd;
	  return false;
	}
      else if (X87_FLOAT_MODE_P (mode))
	{
	  *total = cost->fadd;
	  return false;
	}
      else if (FLOAT_MODE_P (mode))
	{
	  /* ??? SSE vector cost should be used here.  */
	  *total = cost->fadd;
	  return false;
	}
      /* FALLTHRU */

    case AND:
    case IOR:
    case XOR:
      if (!TARGET_64BIT && mode == DImode)
	{
	  *total = (cost->add * 2
		    + (rtx_cost (XEXP (x, 0), outer_code, speed)
		       << (GET_MODE (XEXP (x, 0)) != DImode))
		    + (rtx_cost (XEXP (x, 1), outer_code, speed)
		       << (GET_MODE (XEXP (x, 1)) != DImode)));
	  return true;
	}
      /* FALLTHRU */

    case NEG:
      if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
	{
	  /* ??? SSE cost should be used here.  */
	  *total = cost->fchs;
	  return false;
	}
      else if (X87_FLOAT_MODE_P (mode))
	{
	  *total = cost->fchs;
	  return false;
	}
      else if (FLOAT_MODE_P (mode))
	{
	  /* ??? SSE vector cost should be used here.  */
	  *total = cost->fchs;
	  return false;
	}
      /* FALLTHRU */

    case NOT:
      if (!TARGET_64BIT && mode == DImode)
	*total = cost->add * 2;
      else
	*total = cost->add;
      return false;

    case COMPARE:
      if (GET_CODE (XEXP (x, 0)) == ZERO_EXTRACT
	  && XEXP (XEXP (x, 0), 1) == const1_rtx
	  && CONST_INT_P (XEXP (XEXP (x, 0), 2))
	  && XEXP (x, 1) == const0_rtx)
	{
	  /* This kind of construct is implemented using test[bwl].
	     Treat it as if we had an AND.  */
	  *total = (cost->add
		    + rtx_cost (XEXP (XEXP (x, 0), 0), outer_code, speed)
		    + rtx_cost (const1_rtx, outer_code, speed));
	  return true;
	}
      return false;

    case FLOAT_EXTEND:
      if (!(SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH))
	*total = 0;
      return false;

    case ABS:
      if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
	/* ??? SSE cost should be used here.  */
	*total = cost->fabs;
      else if (X87_FLOAT_MODE_P (mode))
	*total = cost->fabs;
      else if (FLOAT_MODE_P (mode))
	/* ??? SSE vector cost should be used here.  */
	*total = cost->fabs;
      return false;

    case SQRT:
      if (SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH)
	/* ??? SSE cost should be used here.  */
	*total = cost->fsqrt;
      else if (X87_FLOAT_MODE_P (mode))
	*total = cost->fsqrt;
      else if (FLOAT_MODE_P (mode))
	/* ??? SSE vector cost should be used here.  */
	*total = cost->fsqrt;
      return false;

    case UNSPEC:
      if (XINT (x, 1) == UNSPEC_TP)
	*total = 0;
      return false;

    case VEC_SELECT:
    case VEC_EXTRACT:
    case VEC_MERGE:
    case VEC_DUPLICATE:
      /* ??? Assume all of these vector manipulation patterns are
	 recognizable.  In which case they all pretty much have the
	 same cost.  */
      *total = COSTS_N_INSNS (1);
      return true;

    default:
      return false;
    }
}
#if TARGET_MACHO

static int current_machopic_label_num;
29021 /* Given a symbol name and its associated stub, write out the
29022 definition of the stub. */
void
machopic_output_stub (FILE *file, const char *symb, const char *stub)
{
  unsigned int length;
29028 char *binder_name, *symbol_name, lazy_ptr_name[32];
29029 int label = ++current_machopic_label_num;
29031 /* For 64-bit we shouldn't get here. */
29032 gcc_assert (!TARGET_64BIT);
29034 /* Lose our funky encoding stuff so it doesn't contaminate the stub. */
29035 symb = targetm.strip_name_encoding (symb);
29037 length = strlen (stub);
29038 binder_name = XALLOCAVEC (char, length + 32);
29039 GEN_BINDER_NAME_FOR_STUB (binder_name, stub, length);
29041 length = strlen (symb);
29042 symbol_name = XALLOCAVEC (char, length + 32);
29043 GEN_SYMBOL_NAME_FOR_SYMBOL (symbol_name, symb, length);
29045 sprintf (lazy_ptr_name, "L%d$lz", label);
29047 if (MACHOPIC_ATT_STUB)
29048 switch_to_section (darwin_sections[machopic_picsymbol_stub3_section]);
29049 else if (MACHOPIC_PURE)
29050 switch_to_section (darwin_sections[machopic_picsymbol_stub2_section]);
  else
    switch_to_section (darwin_sections[machopic_symbol_stub_section]);
29054 fprintf (file, "%s:\n", stub);
29055 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
  if (MACHOPIC_ATT_STUB)
    {
      fprintf (file, "\thlt ; hlt ; hlt ; hlt ; hlt\n");
    }
  else if (MACHOPIC_PURE)
    {
29064 /* 25-byte PIC stub using "CALL get_pc_thunk". */
29065 rtx tmp = gen_rtx_REG (SImode, 2 /* ECX */);
29066 output_set_got (tmp, NULL_RTX); /* "CALL ___<cpu>.get_pc_thunk.cx". */
29067 fprintf (file, "LPC$%d:\tmovl\t%s-LPC$%d(%%ecx),%%ecx\n",
29068 label, lazy_ptr_name, label);
      fprintf (file, "\tjmp\t*%%ecx\n");
    }
  else
    fprintf (file, "\tjmp\t*%s\n", lazy_ptr_name);
29074 /* The AT&T-style ("self-modifying") stub is not lazily bound, thus
29075 it needs no stub-binding-helper. */
  if (MACHOPIC_ATT_STUB)
    return;

  fprintf (file, "%s:\n", binder_name);

  if (MACHOPIC_PURE)
    {
      fprintf (file, "\tlea\t%s-%s(%%ecx),%%ecx\n", lazy_ptr_name, binder_name);
      fprintf (file, "\tpushl\t%%ecx\n");
    }
  else
    fprintf (file, "\tpushl\t$%s\n", lazy_ptr_name);
29089 fputs ("\tjmp\tdyld_stub_binding_helper\n", file);
29091 /* N.B. Keep the correspondence of these
29092 'symbol_ptr/symbol_ptr2/symbol_ptr3' sections consistent with the
29093 old-pic/new-pic/non-pic stubs; altering this will break
29094 compatibility with existing dylibs. */
  if (MACHOPIC_PURE)
    {
      /* 25-byte PIC stub using "CALL get_pc_thunk".  */
      switch_to_section (darwin_sections[machopic_lazy_symbol_ptr2_section]);
    }
  else
    /* 16-byte -mdynamic-no-pic stub.  */
    switch_to_section (darwin_sections[machopic_lazy_symbol_ptr3_section]);
29104 fprintf (file, "%s:\n", lazy_ptr_name);
29105 fprintf (file, "\t.indirect_symbol %s\n", symbol_name);
  fprintf (file, ASM_LONG "%s\n", binder_name);
}
#endif /* TARGET_MACHO */
29110 /* Order the registers for register allocator. */
void
x86_order_regs_for_local_alloc (void)
{
   int pos = 0;
   int i;

   /* First allocate the local general purpose registers.  */
29119 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
29120 if (GENERAL_REGNO_P (i) && call_used_regs[i])
29121 reg_alloc_order [pos++] = i;
29123 /* Global general purpose registers. */
29124 for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
29125 if (GENERAL_REGNO_P (i) && !call_used_regs[i])
29126 reg_alloc_order [pos++] = i;
   /* x87 registers come first in case we are doing FP math
      using them.  */
   if (!TARGET_SSE_MATH)
29131 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
29132 reg_alloc_order [pos++] = i;
29134 /* SSE registers. */
29135 for (i = FIRST_SSE_REG; i <= LAST_SSE_REG; i++)
29136 reg_alloc_order [pos++] = i;
29137 for (i = FIRST_REX_SSE_REG; i <= LAST_REX_SSE_REG; i++)
29138 reg_alloc_order [pos++] = i;
29140 /* x87 registers. */
29141 if (TARGET_SSE_MATH)
29142 for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
29143 reg_alloc_order [pos++] = i;
29145 for (i = FIRST_MMX_REG; i <= LAST_MMX_REG; i++)
29146 reg_alloc_order [pos++] = i;
   /* Initialize the rest of array as we do not allocate some registers
      at all.  */
   while (pos < FIRST_PSEUDO_REGISTER)
     reg_alloc_order [pos++] = 0;
}
29154 /* Handle a "callee_pop_aggregate_return" attribute; arguments as
29155 in struct attribute_spec handler. */
static tree
ix86_handle_callee_pop_aggregate_return (tree *node, tree name,
					 tree args,
					 int flags ATTRIBUTE_UNUSED,
					 bool *no_add_attrs)
{
29162 if (TREE_CODE (*node) != FUNCTION_TYPE
29163 && TREE_CODE (*node) != METHOD_TYPE
29164 && TREE_CODE (*node) != FIELD_DECL
      && TREE_CODE (*node) != TYPE_DECL)
    {
      warning (OPT_Wattributes, "%qE attribute only applies to functions",
	       name);
      *no_add_attrs = true;
      return NULL_TREE;
    }
  if (TARGET_64BIT)
    {
      warning (OPT_Wattributes, "%qE attribute only available for 32-bit",
	       name);
      *no_add_attrs = true;
      return NULL_TREE;
    }
  if (is_attribute_p ("callee_pop_aggregate_return", name))
    {
      tree cst;

      cst = TREE_VALUE (args);
      if (TREE_CODE (cst) != INTEGER_CST)
	{
	  warning (OPT_Wattributes,
		   "%qE attribute requires an integer constant argument",
		   name);
	  *no_add_attrs = true;
	}
      else if (compare_tree_int (cst, 0) != 0
	       && compare_tree_int (cst, 1) != 0)
	{
	  warning (OPT_Wattributes,
		   "argument to %qE attribute is neither zero, nor one",
		   name);
	  *no_add_attrs = true;
	}

      return NULL_TREE;
    }

  return NULL_TREE;
}
/* Handle a "ms_abi" or "sysv_abi" attribute; arguments as in
29207 struct attribute_spec.handler. */
static tree
ix86_handle_abi_attribute (tree *node, tree name,
			   tree args ATTRIBUTE_UNUSED,
			   int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
{
29213 if (TREE_CODE (*node) != FUNCTION_TYPE
29214 && TREE_CODE (*node) != METHOD_TYPE
29215 && TREE_CODE (*node) != FIELD_DECL
      && TREE_CODE (*node) != TYPE_DECL)
    {
      warning (OPT_Wattributes, "%qE attribute only applies to functions",
	       name);
      *no_add_attrs = true;
      return NULL_TREE;
    }
  if (!TARGET_64BIT)
    {
      warning (OPT_Wattributes, "%qE attribute only available for 64-bit",
	       name);
      *no_add_attrs = true;
      return NULL_TREE;
    }

  /* Can combine regparm with all attributes but fastcall.  */
  if (is_attribute_p ("ms_abi", name))
    {
      if (lookup_attribute ("sysv_abi", TYPE_ATTRIBUTES (*node)))
	{
	  error ("ms_abi and sysv_abi attributes are not compatible");
	}

      return NULL_TREE;
    }
  else if (is_attribute_p ("sysv_abi", name))
    {
      if (lookup_attribute ("ms_abi", TYPE_ATTRIBUTES (*node)))
	{
	  error ("ms_abi and sysv_abi attributes are not compatible");
	}

      return NULL_TREE;
    }

  return NULL_TREE;
}
29254 /* Handle a "ms_struct" or "gcc_struct" attribute; arguments as in
29255 struct attribute_spec.handler. */
static tree
ix86_handle_struct_attribute (tree *node, tree name,
			      tree args ATTRIBUTE_UNUSED,
			      int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
{
  tree *type = NULL;
  if (DECL_P (*node))
    {
      if (TREE_CODE (*node) == TYPE_DECL)
	type = &TREE_TYPE (*node);
    }
  else
    type = node;

  if (!(type && (TREE_CODE (*type) == RECORD_TYPE
		 || TREE_CODE (*type) == UNION_TYPE)))
    {
      warning (OPT_Wattributes, "%qE attribute ignored",
	       name);
      *no_add_attrs = true;
    }
  else if ((is_attribute_p ("ms_struct", name)
29279 && lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (*type)))
29280 || ((is_attribute_p ("gcc_struct", name)
		&& lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (*type)))))
    {
      warning (OPT_Wattributes, "%qE incompatible attribute ignored",
	       name);
      *no_add_attrs = true;
    }

  return NULL_TREE;
}
static tree
ix86_handle_fndecl_attribute (tree *node, tree name,
			      tree args ATTRIBUTE_UNUSED,
			      int flags ATTRIBUTE_UNUSED, bool *no_add_attrs)
{
  if (TREE_CODE (*node) != FUNCTION_DECL)
    {
      warning (OPT_Wattributes, "%qE attribute only applies to functions",
	       name);
      *no_add_attrs = true;
    }
  return NULL_TREE;
}
static bool
ix86_ms_bitfield_layout_p (const_tree record_type)
{
29308 return ((TARGET_MS_BITFIELD_LAYOUT
29309 && !lookup_attribute ("gcc_struct", TYPE_ATTRIBUTES (record_type)))
	  || lookup_attribute ("ms_struct", TYPE_ATTRIBUTES (record_type)));
}
29313 /* Returns an expression indicating where the this parameter is
29314 located on entry to the FUNCTION. */
static rtx
x86_this_parameter (tree function)
{
  tree type = TREE_TYPE (function);
  bool aggr = aggregate_value_p (TREE_TYPE (type), type) != 0;
  int nregs;

  if (TARGET_64BIT)
    {
      const int *parm_regs;
29327 if (ix86_function_type_abi (type) == MS_ABI)
29328 parm_regs = x86_64_ms_abi_int_parameter_registers;
      else
	parm_regs = x86_64_int_parameter_registers;
      return gen_rtx_REG (DImode, parm_regs[aggr]);
    }
29334 nregs = ix86_function_regparm (type, function);
  if (nregs > 0 && !stdarg_p (type))
    {
      int regno;
      unsigned int ccvt = ix86_get_callcvt (type);

      if ((ccvt & IX86_CALLCVT_FASTCALL) != 0)
	regno = aggr ? DX_REG : CX_REG;
      else if ((ccvt & IX86_CALLCVT_THISCALL) != 0)
	{
	  regno = CX_REG;
	  if (aggr)
	    return gen_rtx_MEM (SImode,
				plus_constant (stack_pointer_rtx, 4));
	}
      else
	{
	  regno = AX_REG;
	  if (aggr)
	    {
	      regno = DX_REG;
	      if (nregs == 1)
		return gen_rtx_MEM (SImode,
				    plus_constant (stack_pointer_rtx, 4));
	    }
	}
      return gen_rtx_REG (SImode, regno);
    }
29364 return gen_rtx_MEM (SImode, plus_constant (stack_pointer_rtx, aggr ? 8 : 4));
29367 /* Determine whether x86_output_mi_thunk can succeed. */
static bool
x86_can_output_mi_thunk (const_tree thunk ATTRIBUTE_UNUSED,
			 HOST_WIDE_INT delta ATTRIBUTE_UNUSED,
			 HOST_WIDE_INT vcall_offset, const_tree function)
{
  /* 64-bit can handle anything.  */
  if (TARGET_64BIT)
    return true;

  /* For 32-bit, everything's fine if we have one free register.  */
  if (ix86_function_regparm (TREE_TYPE (function), function) < 3)
    return true;

  /* Need a free register for vcall_offset.  */
  if (vcall_offset)
    return false;

  /* Need a free register for GOT references.  */
  if (flag_pic && !targetm.binds_local_p (function))
    return false;

  /* Otherwise ok.  */
  return true;
}
29394 /* Output the assembler code for a thunk function. THUNK_DECL is the
29395 declaration for the thunk function itself, FUNCTION is the decl for
29396 the target function. DELTA is an immediate constant offset to be
29397 added to THIS. If VCALL_OFFSET is nonzero, the word at
29398 *(*this + vcall_offset) should be added to THIS. */
static void
x86_output_mi_thunk (FILE *file,
		     tree thunk ATTRIBUTE_UNUSED, HOST_WIDE_INT delta,
		     HOST_WIDE_INT vcall_offset, tree function)
{
29405 rtx this_param = x86_this_parameter (function);
29406 rtx this_reg, tmp, fnaddr;
29408 emit_note (NOTE_INSN_PROLOGUE_END);
29410 /* If VCALL_OFFSET, we'll need THIS in a register. Might as well
29411 pull it in now and let DELTA benefit. */
29412 if (REG_P (this_param))
29413 this_reg = this_param;
  else if (vcall_offset)
    {
      /* Put the this parameter into %eax.  */
      this_reg = gen_rtx_REG (Pmode, AX_REG);
      emit_move_insn (this_reg, this_param);
    }
  else
    this_reg = NULL_RTX;

  /* Adjust the this parameter by a fixed constant.  */
  if (delta)
    {
      rtx delta_rtx = GEN_INT (delta);
      rtx delta_dst = this_reg ? this_reg : this_param;

      if (TARGET_64BIT)
	{
	  if (!x86_64_general_operand (delta_rtx, Pmode))
	    {
	      tmp = gen_rtx_REG (Pmode, R10_REG);
	      emit_move_insn (tmp, delta_rtx);
	      delta_rtx = tmp;
	    }
	}

      emit_insn (ix86_gen_add3 (delta_dst, delta_dst, delta_rtx));
    }

  /* Adjust the this parameter by a value stored in the vtable.  */
  if (vcall_offset)
    {
      rtx vcall_addr, vcall_mem;
      unsigned int tmp_regno;

      if (TARGET_64BIT)
	tmp_regno = R10_REG;
      else
	{
	  unsigned int ccvt = ix86_get_callcvt (TREE_TYPE (function));
	  if ((ccvt & (IX86_CALLCVT_FASTCALL | IX86_CALLCVT_THISCALL)) != 0)
	    tmp_regno = AX_REG;
	  else
	    tmp_regno = CX_REG;
	}
      tmp = gen_rtx_REG (Pmode, tmp_regno);

      emit_move_insn (tmp, gen_rtx_MEM (ptr_mode, this_reg));

      /* Adjust the this parameter.  */
      vcall_addr = plus_constant (tmp, vcall_offset);
      if (TARGET_64BIT
	  && !ix86_legitimate_address_p (ptr_mode, vcall_addr, true))
	{
	  rtx tmp2 = gen_rtx_REG (Pmode, R11_REG);
	  emit_move_insn (tmp2, GEN_INT (vcall_offset));
	  vcall_addr = gen_rtx_PLUS (Pmode, tmp, tmp2);
	}

      vcall_mem = gen_rtx_MEM (Pmode, vcall_addr);
      emit_insn (ix86_gen_add3 (this_reg, this_reg, vcall_mem));
    }
29476 /* If necessary, drop THIS back to its stack slot. */
29477 if (this_reg && this_reg != this_param)
29478 emit_move_insn (this_param, this_reg);
  fnaddr = XEXP (DECL_RTL (function), 0);
  if (TARGET_64BIT)
    {
      if (!flag_pic || targetm.binds_local_p (function)
	  || cfun->machine->call_abi == MS_ABI)
	;
      else
	{
	  tmp = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOTPCREL);
	  tmp = gen_rtx_CONST (Pmode, tmp);
	  fnaddr = gen_rtx_MEM (Pmode, tmp);
	}
    }
  else
    {
      if (!flag_pic || targetm.binds_local_p (function))
	;
#if TARGET_MACHO
      else if (TARGET_MACHO)
	{
	  fnaddr = machopic_indirect_call_target (DECL_RTL (function));
	  fnaddr = XEXP (fnaddr, 0);
	}
#endif /* TARGET_MACHO */
      else
	{
	  tmp = gen_rtx_REG (Pmode, CX_REG);
	  output_set_got (tmp, NULL_RTX);

	  fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, fnaddr), UNSPEC_GOT);
	  fnaddr = gen_rtx_PLUS (Pmode, fnaddr, tmp);
	  fnaddr = gen_rtx_MEM (Pmode, fnaddr);
	}
    }
29515 /* Our sibling call patterns do not allow memories, because we have no
29516 predicate that can distinguish between frame and non-frame memory.
29517 For our purposes here, we can get away with (ab)using a jump pattern,
29518 because we're going to do no optimization. */
29519 if (MEM_P (fnaddr))
29520 emit_jump_insn (gen_indirect_jump (fnaddr));
29523 tmp = gen_rtx_MEM (QImode, fnaddr);
29524 tmp = gen_rtx_CALL (VOIDmode, tmp, const0_rtx);
29525 tmp = emit_call_insn (tmp);
29526 SIBLING_CALL_P (tmp) = 1;
29530 /* Emit just enough of rest_of_compilation to get the insns emitted.
29531 Note that use_thunk calls assemble_start_function et al. */
29532 tmp = get_insns ();
29533 insn_locators_alloc ();
29534 shorten_branches (tmp);
29535 final_start_function (tmp, file, 1);
29536 final (tmp, file, 1);
  final_end_function ();
}
static void
x86_file_start (void)
{
  default_file_start ();
#if TARGET_MACHO
  darwin_file_start ();
#endif
29547 if (X86_FILE_START_VERSION_DIRECTIVE)
29548 fputs ("\t.version\t\"01.01\"\n", asm_out_file);
29549 if (X86_FILE_START_FLTUSED)
29550 fputs ("\t.global\t__fltused\n", asm_out_file);
29551 if (ix86_asm_dialect == ASM_INTEL)
    fputs ("\t.intel_syntax noprefix\n", asm_out_file);
}
int
x86_field_alignment (tree field, int computed)
{
  enum machine_mode mode;
  tree type = TREE_TYPE (field);

  if (TARGET_64BIT || TARGET_ALIGN_DOUBLE)
    return computed;
29563 mode = TYPE_MODE (strip_array_types (type));
29564 if (mode == DFmode || mode == DCmode
29565 || GET_MODE_CLASS (mode) == MODE_INT
29566 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
    return MIN (32, computed);
  return computed;
}
29571 /* Output assembler code to FILE to increment profiler label # LABELNO
29572 for profiling a function entry. */
void
x86_function_profiler (FILE *file, int labelno ATTRIBUTE_UNUSED)
{
  const char *mcount_name = (flag_fentry ? MCOUNT_NAME_BEFORE_PROLOGUE
					 : MCOUNT_NAME);

  if (TARGET_64BIT)
    {
#ifndef NO_PROFILE_COUNTERS
      fprintf (file, "\tleaq\t%sP%d(%%rip),%%r11\n", LPREFIX, labelno);
#endif

      if (DEFAULT_ABI == SYSV_ABI && flag_pic)
	fprintf (file, "\tcall\t*%s@GOTPCREL(%%rip)\n", mcount_name);
      else
	fprintf (file, "\tcall\t%s\n", mcount_name);
    }
  else if (flag_pic)
    {
#ifndef NO_PROFILE_COUNTERS
      fprintf (file, "\tleal\t%sP%d@GOTOFF(%%ebx),%%" PROFILE_COUNT_REGISTER "\n",
	       LPREFIX, labelno);
#endif
      fprintf (file, "\tcall\t*%s@GOT(%%ebx)\n", mcount_name);
    }
  else
    {
#ifndef NO_PROFILE_COUNTERS
      fprintf (file, "\tmovl\t$%sP%d,%%" PROFILE_COUNT_REGISTER "\n",
	       LPREFIX, labelno);
#endif
      fprintf (file, "\tcall\t%s\n", mcount_name);
    }
}
/* We don't have exact information about the insn sizes, but we may assume
   quite safely that we are informed about all 1 byte insns and memory
   address sizes.  This is enough to eliminate unnecessary padding in
   the vast majority of cases.  */

static int
min_insn_size (rtx insn)
{
  int l = 0, len;
  if (!INSN_P (insn) || !active_insn_p (insn))
    return 0;

  /* Discard alignments we've emitted and jump instructions.  */
  if (GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
      && XINT (PATTERN (insn), 1) == UNSPECV_ALIGN)
    return 0;
  if (JUMP_TABLE_DATA_P (insn))
    return 0;

  /* Important case - calls are always 5 bytes.
     It is common to have many calls in a row.  */
  if (CALL_P (insn)
      && symbolic_reference_mentioned_p (PATTERN (insn))
      && !SIBLING_CALL_P (insn))
    return 5;
  len = get_attr_length (insn);
  if (len <= 1)
    return 1;
  /* For normal instructions we rely on get_attr_length being exact,
     with a few exceptions.  */
  if (!JUMP_P (insn))
    {
      enum attr_type type = get_attr_type (insn);

      switch (type)
	{
	case TYPE_MULTI:
	  if (GET_CODE (PATTERN (insn)) == ASM_INPUT
	      || asm_noperands (PATTERN (insn)) >= 0)
	    return 0;
	  break;
	default:
	  /* Otherwise trust get_attr_length.  */
	  break;
	}

      l = get_attr_length_address (insn);
      if (l < 4 && symbolic_reference_mentioned_p (PATTERN (insn)))
	l = 4;
    }
  if (l)
    return 1+l;
  else
    return len;
}
29669 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
29671 /* AMD K8 core mispredicts jumps when there are more than 3 jumps in a 16 byte window. */
29675 ix86_avoid_jump_mispredicts (void)
29677 rtx insn, start = get_insns ();
29678 int nbytes = 0, njumps = 0;
29681 /* Look for all minimal intervals of instructions containing 4 jumps.
29682 The intervals are bounded by START and INSN. NBYTES is the total
29683 size of the instructions in the interval, including INSN but not
29684 including START. When NBYTES is smaller than 16, it is possible
29685 that the end of START and the end of INSN land in the same 16 byte page.
29687 The smallest offset in the page at which INSN can start occurs when START
29688 ends at offset 0; the offset of INSN is then NBYTES - sizeof (INSN).
29689 We add a p2align to the 16 byte window with max_skip 15 - NBYTES + sizeof (INSN).
29691 for (insn = start; insn; insn = NEXT_INSN (insn))
29695 if (LABEL_P (insn))
29697 int align = label_to_alignment (insn);
29698 int max_skip = label_to_max_skip (insn);
29702 /* If align > 3, only up to 16 - max_skip - 1 bytes can be
29703 already in the current 16 byte page, because otherwise
29704 ASM_OUTPUT_MAX_SKIP_ALIGN could skip max_skip or fewer
29705 bytes to reach 16 byte boundary. */
29707 || (align <= 3 && max_skip != (1 << align) - 1))
29710 fprintf (dump_file, "Label %i with max_skip %i\n",
29711 INSN_UID (insn), max_skip);
29714 while (nbytes + max_skip >= 16)
29716 start = NEXT_INSN (start);
29717 if ((JUMP_P (start)
29718 && GET_CODE (PATTERN (start)) != ADDR_VEC
29719 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
29721 njumps--, isjump = 1;
29724 nbytes -= min_insn_size (start);
29730 min_size = min_insn_size (insn);
29731 nbytes += min_size;
29733 fprintf (dump_file, "Insn %i estimated to %i bytes\n",
29734 INSN_UID (insn), min_size);
29736 && GET_CODE (PATTERN (insn)) != ADDR_VEC
29737 && GET_CODE (PATTERN (insn)) != ADDR_DIFF_VEC)
29745 start = NEXT_INSN (start);
29746 if ((JUMP_P (start)
29747 && GET_CODE (PATTERN (start)) != ADDR_VEC
29748 && GET_CODE (PATTERN (start)) != ADDR_DIFF_VEC)
29750 njumps--, isjump = 1;
29753 nbytes -= min_insn_size (start);
29755 gcc_assert (njumps >= 0);
29757 fprintf (dump_file, "Interval %i to %i has %i bytes\n",
29758 INSN_UID (start), INSN_UID (insn), nbytes);
29760 if (njumps == 3 && isjump && nbytes < 16)
29762 int padsize = 15 - nbytes + min_insn_size (insn);
29765 fprintf (dump_file, "Padding insn %i by %i bytes!\n",
29766 INSN_UID (insn), padsize);
29767 emit_insn_before (gen_pad (GEN_INT (padsize)), insn);
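/* Illustrative, non-GCC sketch of the sliding-window bookkeeping above,
   assuming per-insn size estimates and jump flags are supplied as plain
   arrays.  The window [start, i] is shrunk from the front once it would
   hold a 4th jump; padding fires when 3 earlier jumps plus the current
   jump could still share one 16-byte fetch block.  */
#if 0
static void
four_jump_window_model (const int *size, const int *is_jump, int n)
{
  int start = 0, nbytes = 0, njumps = 0, i;
  for (i = 0; i < n; i++)
    {
      nbytes += size[i];
      if (is_jump[i])
	njumps++;
      while (njumps > 3)
	{
	  /* Drop instructions from the front of the window.  */
	  if (is_jump[start])
	    njumps--;
	  nbytes -= size[start++];
	}
      if (njumps == 3 && is_jump[i] && nbytes < 16)
	/* Pad by 15 - nbytes + size[i] so insn I starts the next block. */;
    }
}
#endif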
29773 /* AMD Athlon works faster
29774 when RET is not the destination of a conditional jump or directly preceded
29775 by another jump instruction. We avoid the penalty by inserting a NOP just
29776 before the RET instructions in such cases. */
29778 ix86_pad_returns (void)
29783 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
29785 basic_block bb = e->src;
29786 rtx ret = BB_END (bb);
29788 bool replace = false;
29790 if (!JUMP_P (ret) || GET_CODE (PATTERN (ret)) != RETURN
29791 || optimize_bb_for_size_p (bb))
29793 for (prev = PREV_INSN (ret); prev; prev = PREV_INSN (prev))
29794 if (active_insn_p (prev) || LABEL_P (prev))
29796 if (prev && LABEL_P (prev))
29801 FOR_EACH_EDGE (e, ei, bb->preds)
29802 if (EDGE_FREQUENCY (e) && e->src->index >= 0
29803 && !(e->flags & EDGE_FALLTHRU))
29808 prev = prev_active_insn (ret);
29810 && ((JUMP_P (prev) && any_condjump_p (prev))
29813 /* Empty functions get a branch mispredict even when
29814 the jump destination is not visible to us. */
29815 if (!prev && !optimize_function_for_size_p (cfun))
29820 emit_jump_insn_before (gen_return_internal_long (), ret);
29826 /* Count the minimum number of instructions in BB. Return 4 if the
29827 number of instructions >= 4. */
29830 ix86_count_insn_bb (basic_block bb)
29833 int insn_count = 0;
29835 /* Count number of instructions in this block. Return 4 if the number
29836 of instructions >= 4. */
29837 FOR_BB_INSNS (bb, insn)
29839 /* This only happens in exit blocks. */
29841 && GET_CODE (PATTERN (insn)) == RETURN)
29844 if (NONDEBUG_INSN_P (insn)
29845 && GET_CODE (PATTERN (insn)) != USE
29846 && GET_CODE (PATTERN (insn)) != CLOBBER)
29849 if (insn_count >= 4)
29858 /* Count the minimum number of instructions in a code path ending in BB.
29859 Return 4 if the number of instructions >= 4. */
29862 ix86_count_insn (basic_block bb)
29866 int min_prev_count;
29868 /* Only bother counting instructions along paths with no
29869 more than 2 basic blocks between entry and exit. Given
29870 that BB has an edge to exit, determine if a predecessor
29871 of BB has an edge from entry. If so, compute the number
29872 of instructions in the predecessor block. If there
29873 happen to be multiple such blocks, compute the minimum. */
29874 min_prev_count = 4;
29875 FOR_EACH_EDGE (e, ei, bb->preds)
29878 edge_iterator prev_ei;
29880 if (e->src == ENTRY_BLOCK_PTR)
29882 min_prev_count = 0;
29885 FOR_EACH_EDGE (prev_e, prev_ei, e->src->preds)
29887 if (prev_e->src == ENTRY_BLOCK_PTR)
29889 int count = ix86_count_insn_bb (e->src);
29890 if (count < min_prev_count)
29891 min_prev_count = count;
29897 if (min_prev_count < 4)
29898 min_prev_count += ix86_count_insn_bb (bb);
29900 return min_prev_count;
29903 /* Pad a short function to 4 instructions. */
29906 ix86_pad_short_function (void)
29911 FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR->preds)
29913 rtx ret = BB_END (e->src);
29914 if (JUMP_P (ret) && GET_CODE (PATTERN (ret)) == RETURN)
29916 int insn_count = ix86_count_insn (e->src);
29918 /* Pad short function. */
29919 if (insn_count < 4)
29923 /* Find epilogue. */
29926 || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG))
29927 insn = PREV_INSN (insn);
29932 /* Two NOPs count as one instruction. */
29933 insn_count = 2 * (4 - insn_count);
29934 emit_insn_before (gen_nops (GEN_INT (insn_count)), insn);
29940 /* Implement machine specific optimizations. We implement padding of returns
29941 for K8 CPUs and a pass to avoid 4 jumps in a single 16 byte window. */
29945 /* We are freeing block_for_insn in the toplev to keep compatibility
29946 with old MDEP_REORGS that are not CFG based. Recompute it now. */
29947 compute_bb_for_insn ();
29949 /* Run the vzeroupper optimization if needed. */
29950 if (TARGET_VZEROUPPER)
29951 move_or_delete_vzeroupper ();
29953 if (optimize && optimize_function_for_speed_p (cfun))
29955 if (TARGET_PAD_SHORT_FUNCTION)
29956 ix86_pad_short_function ();
29957 else if (TARGET_PAD_RETURNS)
29958 ix86_pad_returns ();
29959 #ifdef ASM_OUTPUT_MAX_SKIP_PAD
29960 if (TARGET_FOUR_JUMP_LIMIT)
29961 ix86_avoid_jump_mispredicts ();
29966 /* Return nonzero when a QImode register that must be represented via a REX prefix is used. */
29969 x86_extended_QIreg_mentioned_p (rtx insn)
29972 extract_insn_cached (insn);
29973 for (i = 0; i < recog_data.n_operands; i++)
29974 if (REG_P (recog_data.operand[i])
29975 && REGNO (recog_data.operand[i]) > BX_REG)
29980 /* Return nonzero when P points to a register encoded via a REX prefix.
29981 Called via for_each_rtx. */
29983 extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED)
29985 unsigned int regno;
29988 regno = REGNO (*p);
29989 return REX_INT_REGNO_P (regno) || REX_SSE_REGNO_P (regno);
29992 /* Return true when INSN mentions a register that must be encoded using a REX prefix. */
29995 x86_extended_reg_mentioned_p (rtx insn)
29997 return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn,
29998 extended_reg_mentioned_1, NULL);
30001 /* If profitable, negate (without causing overflow) integer constant
30002 of mode MODE at location LOC. Return true in this case. */
30004 x86_maybe_negate_const_int (rtx *loc, enum machine_mode mode)
30008 if (!CONST_INT_P (*loc))
30014 /* DImode x86_64 constants must fit in 32 bits. */
30015 gcc_assert (x86_64_immediate_operand (*loc, mode));
30026 gcc_unreachable ();
30029 /* Avoid overflows. */
30030 if (mode_signbit_p (mode, *loc))
30033 val = INTVAL (*loc);
30035 /* Make things pretty: use `subl $4,%eax' rather than `addl $-4,%eax'.
30036 Exception: -128 encodes smaller than 128, so swap the sign and the op. */
30037 if ((val < 0 && val != -128)
30040 *loc = GEN_INT (-val);
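/* Example of the transformation above: (plus:SI (reg) (const_int -4))
   becomes (minus:SI (reg) (const_int 4)), so the assembler output is
   `subl $4, %eax' instead of `addl $-4, %eax'.  The -128 exception goes
   the other way: -128 still fits in a sign-extended imm8 while +128
   would need an imm32, so `addl $-128, %eax' is the shorter encoding
   and the constant is left alone.  */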
30047 /* Generate an unsigned DImode/SImode to FP conversion. This is the same code
30048 optabs would emit if we didn't have TFmode patterns. */
30051 x86_emit_floatuns (rtx operands[2])
30053 rtx neglab, donelab, i0, i1, f0, in, out;
30054 enum machine_mode mode, inmode;
30056 inmode = GET_MODE (operands[1]);
30057 gcc_assert (inmode == SImode || inmode == DImode);
30060 in = force_reg (inmode, operands[1]);
30061 mode = GET_MODE (out);
30062 neglab = gen_label_rtx ();
30063 donelab = gen_label_rtx ();
30064 f0 = gen_reg_rtx (mode);
30066 emit_cmp_and_jump_insns (in, const0_rtx, LT, const0_rtx, inmode, 0, neglab);
30068 expand_float (out, in, 0);
30070 emit_jump_insn (gen_jump (donelab));
30073 emit_label (neglab);
30075 i0 = expand_simple_binop (inmode, LSHIFTRT, in, const1_rtx, NULL,
30077 i1 = expand_simple_binop (inmode, AND, in, const1_rtx, NULL,
30079 i0 = expand_simple_binop (inmode, IOR, i0, i1, i0, 1, OPTAB_DIRECT);
30081 expand_float (f0, i0, 0);
30083 emit_insn (gen_rtx_SET (VOIDmode, out, gen_rtx_PLUS (mode, f0, f0)));
30085 emit_label (donelab);
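/* Illustrative C model of the expansion above (not GCC code).  When the
   sign bit of the unsigned input is set, the value cannot go through the
   signed conversion path directly, so it is halved with the low
   (rounding) bit preserved, converted, and then doubled.  */
#if 0
static double
floatuns_model (unsigned long long u)
{
  if ((long long) u >= 0)
    return (double) (long long) u;		/* Fits the signed range.  */
  unsigned long long i0 = (u >> 1) | (u & 1);	/* Halve, keep round bit.  */
  double f0 = (double) (long long) i0;
  return f0 + f0;				/* Double the result.  */
}
#endif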
30088 /* AVX does not support 32-byte integer vector operations,
30089 thus the longest vector we are faced with is V16QImode. */
30090 #define MAX_VECT_LEN 16
30092 struct expand_vec_perm_d
30094 rtx target, op0, op1;
30095 unsigned char perm[MAX_VECT_LEN];
30096 enum machine_mode vmode;
30097 unsigned char nelt;
30101 static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
30102 static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
30104 /* Get a vector mode of the same size as the original but with elements
30105 twice as wide. This is only guaranteed to apply to integral vectors. */
30107 static inline enum machine_mode
30108 get_mode_wider_vector (enum machine_mode o)
30110 /* ??? Rely on the ordering that genmodes.c gives to vectors. */
30111 enum machine_mode n = GET_MODE_WIDER_MODE (o);
30112 gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
30113 gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
30117 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
30118 with all elements equal to VAR. Return true if successful. */
30121 ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode,
30122 rtx target, rtx val)
30145 /* First attempt to recognize VAL as-is. */
30146 dup = gen_rtx_VEC_DUPLICATE (mode, val);
30147 insn = emit_insn (gen_rtx_SET (VOIDmode, target, dup));
30148 if (recog_memoized (insn) < 0)
30151 /* If that fails, force VAL into a register. */
30154 XEXP (dup, 0) = force_reg (GET_MODE_INNER (mode), val);
30155 seq = get_insns ();
30158 emit_insn_before (seq, insn);
30160 ok = recog_memoized (insn) >= 0;
30169 if (TARGET_SSE || TARGET_3DNOW_A)
30173 val = gen_lowpart (SImode, val);
30174 x = gen_rtx_TRUNCATE (HImode, val);
30175 x = gen_rtx_VEC_DUPLICATE (mode, x);
30176 emit_insn (gen_rtx_SET (VOIDmode, target, x));
30189 struct expand_vec_perm_d dperm;
30193 memset (&dperm, 0, sizeof (dperm));
30194 dperm.target = target;
30195 dperm.vmode = mode;
30196 dperm.nelt = GET_MODE_NUNITS (mode);
30197 dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
30199 /* Extend to SImode using a paradoxical SUBREG. */
30200 tmp1 = gen_reg_rtx (SImode);
30201 emit_move_insn (tmp1, gen_lowpart (SImode, val));
30203 /* Insert the SImode value as low element of a V4SImode vector. */
30204 tmp2 = gen_lowpart (V4SImode, dperm.op0);
30205 emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
30207 ok = (expand_vec_perm_1 (&dperm)
30208 || expand_vec_perm_broadcast_1 (&dperm));
30220 /* Replicate the value once into the next wider mode and recurse. */
30222 enum machine_mode smode, wsmode, wvmode;
30225 smode = GET_MODE_INNER (mode);
30226 wvmode = get_mode_wider_vector (mode);
30227 wsmode = GET_MODE_INNER (wvmode);
30229 val = convert_modes (wsmode, smode, val, true);
30230 x = expand_simple_binop (wsmode, ASHIFT, val,
30231 GEN_INT (GET_MODE_BITSIZE (smode)),
30232 NULL_RTX, 1, OPTAB_LIB_WIDEN);
30233 val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
30235 x = gen_lowpart (wvmode, target);
30236 ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
30244 enum machine_mode hvmode = (mode == V16HImode ? V8HImode : V16QImode);
30245 rtx x = gen_reg_rtx (hvmode);
30247 ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
30250 x = gen_rtx_VEC_CONCAT (mode, x, x);
30251 emit_insn (gen_rtx_SET (VOIDmode, target, x));
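/* Illustrative model of the widening broadcast path above (the
   "replicate into the next wider mode and recurse" case), assuming
   <stdint.h>.  To splat an HImode value across V8HI, the scalar is
   first paired with itself into an SImode value, which the recursive
   call then splats across V4SI.  */
#if 0
static uint32_t
widen_splat_pair (uint16_t h)
{
  return ((uint32_t) h << 16) | h;	/* val = (val << bits) | val */
}
#endif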
30260 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
30261 whose ONE_VAR element is VAR, and other elements are zero. Return true
30265 ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode,
30266 rtx target, rtx var, int one_var)
30268 enum machine_mode vsimode;
30271 bool use_vector_set = false;
30276 /* For SSE4.1, we normally use vector set. But if the second
30277 element is zero and inter-unit moves are OK, we use movq instead. */
30279 use_vector_set = (TARGET_64BIT
30281 && !(TARGET_INTER_UNIT_MOVES
30287 use_vector_set = TARGET_SSE4_1;
30290 use_vector_set = TARGET_SSE2;
30293 use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
30300 use_vector_set = TARGET_AVX;
30303 /* Use ix86_expand_vector_set in 64bit mode only. */
30304 use_vector_set = TARGET_AVX && TARGET_64BIT;
30310 if (use_vector_set)
30312 emit_insn (gen_rtx_SET (VOIDmode, target, CONST0_RTX (mode)));
30313 var = force_reg (GET_MODE_INNER (mode), var);
30314 ix86_expand_vector_set (mmx_ok, target, var, one_var);
30330 var = force_reg (GET_MODE_INNER (mode), var);
30331 x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
30332 emit_insn (gen_rtx_SET (VOIDmode, target, x));
30337 if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
30338 new_target = gen_reg_rtx (mode);
30340 new_target = target;
30341 var = force_reg (GET_MODE_INNER (mode), var);
30342 x = gen_rtx_VEC_DUPLICATE (mode, var);
30343 x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
30344 emit_insn (gen_rtx_SET (VOIDmode, new_target, x));
30347 /* We need to shuffle the value to the correct position, so
30348 create a new pseudo to store the intermediate result. */
30350 /* With SSE2, we can use the integer shuffle insns. */
30351 if (mode != V4SFmode && TARGET_SSE2)
30353 emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
30355 GEN_INT (one_var == 1 ? 0 : 1),
30356 GEN_INT (one_var == 2 ? 0 : 1),
30357 GEN_INT (one_var == 3 ? 0 : 1)));
30358 if (target != new_target)
30359 emit_move_insn (target, new_target);
30363 /* Otherwise convert the intermediate result to V4SFmode and
30364 use the SSE1 shuffle instructions. */
30365 if (mode != V4SFmode)
30367 tmp = gen_reg_rtx (V4SFmode);
30368 emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
30373 emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
30375 GEN_INT (one_var == 1 ? 0 : 1),
30376 GEN_INT (one_var == 2 ? 0+4 : 1+4),
30377 GEN_INT (one_var == 3 ? 0+4 : 1+4)));
30379 if (mode != V4SFmode)
30380 emit_move_insn (target, gen_lowpart (V4SImode, tmp));
30381 else if (tmp != target)
30382 emit_move_insn (target, tmp);
30384 else if (target != new_target)
30385 emit_move_insn (target, new_target);
30390 vsimode = V4SImode;
30396 vsimode = V2SImode;
30402 /* Zero extend the variable element to SImode and recurse. */
30403 var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
30405 x = gen_reg_rtx (vsimode);
30406 if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
30408 gcc_unreachable ();
30410 emit_move_insn (target, gen_lowpart (mode, x));
30418 /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
30419 consisting of the values in VALS. It is known that all elements
30420 except ONE_VAR are constants. Return true if successful. */
30423 ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode,
30424 rtx target, rtx vals, int one_var)
30426 rtx var = XVECEXP (vals, 0, one_var);
30427 enum machine_mode wmode;
30430 const_vec = copy_rtx (vals);
30431 XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
30432 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
30440 /* For the two element vectors, it's just as easy to use
30441 the general case. */
30445 /* Use ix86_expand_vector_set in 64bit mode only. */
30468 /* There's no way to set one QImode entry easily. Combine
30469 the variable value with its adjacent constant value, and
30470 promote to an HImode set. */
30471 x = XVECEXP (vals, 0, one_var ^ 1);
30474 var = convert_modes (HImode, QImode, var, true);
30475 var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
30476 NULL_RTX, 1, OPTAB_LIB_WIDEN);
30477 x = GEN_INT (INTVAL (x) & 0xff);
30481 var = convert_modes (HImode, QImode, var, true);
30482 x = gen_int_mode (INTVAL (x) << 8, HImode);
30484 if (x != const0_rtx)
30485 var = expand_simple_binop (HImode, IOR, var, x, var,
30486 1, OPTAB_LIB_WIDEN);
30488 x = gen_reg_rtx (wmode);
30489 emit_move_insn (x, gen_lowpart (wmode, const_vec));
30490 ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
30492 emit_move_insn (target, gen_lowpart (mode, x));
30499 emit_move_insn (target, const_vec);
30500 ix86_expand_vector_set (mmx_ok, target, var, one_var);
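/* Example for the QImode path above: mode V16QI, one_var = 5.  The
   variable byte is combined with constant element 4 (one_var ^ 1) into
   the HImode value (var << 8) | c4 -- on little-endian the odd element
   is the high byte -- and stored into HImode slot one_var >> 1 == 2 of
   the V8HI view of the constant vector.  */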
30504 /* A subroutine of ix86_expand_vector_init_general. Use vector
30505 concatenate to handle the most general case: all values variable,
30506 and none identical. */
30509 ix86_expand_vector_init_concat (enum machine_mode mode,
30510 rtx target, rtx *ops, int n)
30512 enum machine_mode cmode, hmode = VOIDmode;
30513 rtx first[8], second[4];
30553 gcc_unreachable ();
30556 if (!register_operand (ops[1], cmode))
30557 ops[1] = force_reg (cmode, ops[1]);
30558 if (!register_operand (ops[0], cmode))
30559 ops[0] = force_reg (cmode, ops[0]);
30560 emit_insn (gen_rtx_SET (VOIDmode, target,
30561 gen_rtx_VEC_CONCAT (mode, ops[0],
30581 gcc_unreachable ();
30597 gcc_unreachable ();
30602 /* FIXME: We process inputs backward to help RA. PR 36222. */
30605 for (; i > 0; i -= 2, j--)
30607 first[j] = gen_reg_rtx (cmode);
30608 v = gen_rtvec (2, ops[i - 1], ops[i]);
30609 ix86_expand_vector_init (false, first[j],
30610 gen_rtx_PARALLEL (cmode, v));
30616 gcc_assert (hmode != VOIDmode);
30617 for (i = j = 0; i < n; i += 2, j++)
30619 second[j] = gen_reg_rtx (hmode);
30620 ix86_expand_vector_init_concat (hmode, second [j],
30624 ix86_expand_vector_init_concat (mode, target, second, n);
30627 ix86_expand_vector_init_concat (mode, target, first, n);
30631 gcc_unreachable ();
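/* Shape of the recursion above for n == 8 (e.g. a V8SF build): ops[7..0]
   are paired backward into four 2-element vectors first[3..0], and the
   function recurses to concatenate those into two 4-element halves and
   finally into the full-width vector.  */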
30635 /* A subroutine of ix86_expand_vector_init_general. Use vector
30636 interleave to handle the most general case: all values variable,
30637 and none identical. */
30640 ix86_expand_vector_init_interleave (enum machine_mode mode,
30641 rtx target, rtx *ops, int n)
30643 enum machine_mode first_imode, second_imode, third_imode, inner_mode;
30646 rtx (*gen_load_even) (rtx, rtx, rtx);
30647 rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
30648 rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
30653 gen_load_even = gen_vec_setv8hi;
30654 gen_interleave_first_low = gen_vec_interleave_lowv4si;
30655 gen_interleave_second_low = gen_vec_interleave_lowv2di;
30656 inner_mode = HImode;
30657 first_imode = V4SImode;
30658 second_imode = V2DImode;
30659 third_imode = VOIDmode;
30662 gen_load_even = gen_vec_setv16qi;
30663 gen_interleave_first_low = gen_vec_interleave_lowv8hi;
30664 gen_interleave_second_low = gen_vec_interleave_lowv4si;
30665 inner_mode = QImode;
30666 first_imode = V8HImode;
30667 second_imode = V4SImode;
30668 third_imode = V2DImode;
30671 gcc_unreachable ();
30674 for (i = 0; i < n; i++)
30676 /* Extend the odd element to SImode using a paradoxical SUBREG. */
30677 op0 = gen_reg_rtx (SImode);
30678 emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
30681 /* Insert the SImode value as the low element of a V4SImode vector. */
30681 op1 = gen_reg_rtx (V4SImode);
30682 op0 = gen_rtx_VEC_MERGE (V4SImode,
30683 gen_rtx_VEC_DUPLICATE (V4SImode,
30685 CONST0_RTX (V4SImode),
30687 emit_insn (gen_rtx_SET (VOIDmode, op1, op0));
30689 /* Cast the V4SImode vector back to a vector in the original mode. */
30690 op0 = gen_reg_rtx (mode);
30691 emit_move_insn (op0, gen_lowpart (mode, op1));
30693 /* Load the even elements into the second position. */
30694 emit_insn (gen_load_even (op0,
30695 force_reg (inner_mode,
30699 /* Cast vector to FIRST_IMODE vector. */
30700 ops[i] = gen_reg_rtx (first_imode);
30701 emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
30704 /* Interleave low FIRST_IMODE vectors. */
30705 for (i = j = 0; i < n; i += 2, j++)
30707 op0 = gen_reg_rtx (first_imode);
30708 emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
30710 /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
30711 ops[j] = gen_reg_rtx (second_imode);
30712 emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
30715 /* Interleave low SECOND_IMODE vectors. */
30716 switch (second_imode)
30719 for (i = j = 0; i < n / 2; i += 2, j++)
30721 op0 = gen_reg_rtx (second_imode);
30722 emit_insn (gen_interleave_second_low (op0, ops[i],
30725 /* Cast the SECOND_IMODE vector to the THIRD_IMODE
30727 ops[j] = gen_reg_rtx (third_imode);
30728 emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
30730 second_imode = V2DImode;
30731 gen_interleave_second_low = gen_vec_interleave_lowv2di;
30735 op0 = gen_reg_rtx (second_imode);
30736 emit_insn (gen_interleave_second_low (op0, ops[0],
30739 /* Cast the SECOND_IMODE vector back to a vector in the original mode. */
30741 emit_insn (gen_rtx_SET (VOIDmode, target,
30742 gen_lowpart (mode, op0)));
30746 gcc_unreachable ();
30750 /* A subroutine of ix86_expand_vector_init. Handle the most general case:
30751 all values variable, and none identical. */
30754 ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode,
30755 rtx target, rtx vals)
30757 rtx ops[32], op0, op1;
30758 enum machine_mode half_mode = VOIDmode;
30765 if (!mmx_ok && !TARGET_SSE)
30777 n = GET_MODE_NUNITS (mode);
30778 for (i = 0; i < n; i++)
30779 ops[i] = XVECEXP (vals, 0, i);
30780 ix86_expand_vector_init_concat (mode, target, ops, n);
30784 half_mode = V16QImode;
30788 half_mode = V8HImode;
30792 n = GET_MODE_NUNITS (mode);
30793 for (i = 0; i < n; i++)
30794 ops[i] = XVECEXP (vals, 0, i);
30795 op0 = gen_reg_rtx (half_mode);
30796 op1 = gen_reg_rtx (half_mode);
30797 ix86_expand_vector_init_interleave (half_mode, op0, ops,
30799 ix86_expand_vector_init_interleave (half_mode, op1,
30800 &ops [n >> 1], n >> 2);
30801 emit_insn (gen_rtx_SET (VOIDmode, target,
30802 gen_rtx_VEC_CONCAT (mode, op0, op1)));
30806 if (!TARGET_SSE4_1)
30814 /* Don't use ix86_expand_vector_init_interleave if we can't
30815 move from GPR to SSE register directly. */
30816 if (!TARGET_INTER_UNIT_MOVES)
30819 n = GET_MODE_NUNITS (mode);
30820 for (i = 0; i < n; i++)
30821 ops[i] = XVECEXP (vals, 0, i);
30822 ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
30830 gcc_unreachable ();
30834 int i, j, n_elts, n_words, n_elt_per_word;
30835 enum machine_mode inner_mode;
30836 rtx words[4], shift;
30838 inner_mode = GET_MODE_INNER (mode);
30839 n_elts = GET_MODE_NUNITS (mode);
30840 n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
30841 n_elt_per_word = n_elts / n_words;
30842 shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
30844 for (i = 0; i < n_words; ++i)
30846 rtx word = NULL_RTX;
30848 for (j = 0; j < n_elt_per_word; ++j)
30850 rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
30851 elt = convert_modes (word_mode, inner_mode, elt, true);
30857 word = expand_simple_binop (word_mode, ASHIFT, word, shift,
30858 word, 1, OPTAB_LIB_WIDEN);
30859 word = expand_simple_binop (word_mode, IOR, word, elt,
30860 word, 1, OPTAB_LIB_WIDEN);
30868 emit_move_insn (target, gen_lowpart (mode, words[0]));
30869 else if (n_words == 2)
30871 rtx tmp = gen_reg_rtx (mode);
30872 emit_clobber (tmp);
30873 emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
30874 emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
30875 emit_move_insn (target, tmp);
30877 else if (n_words == 4)
30879 rtx tmp = gen_reg_rtx (V4SImode);
30880 gcc_assert (word_mode == SImode);
30881 vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
30882 ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
30883 emit_move_insn (target, gen_lowpart (mode, tmp));
30886 gcc_unreachable ();
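/* Illustrative model of the word-building loop above, assuming
   <stdint.h>: packing two HImode elements into one SImode word on a
   32-bit target.  The element with the highest index inside the word is
   loaded first, then shifted up as each lower-indexed element is IORed
   in, so element i*2 ends up in the low half of word i.  */
#if 0
static uint32_t
pack_v8hi_word (uint16_t lo_elt, uint16_t hi_elt)
{
  uint32_t word = hi_elt;		/* element (i+1)*2 - 1 */
  word = (word << 16) | lo_elt;		/* shift, IOR the next element */
  return word;
}
#endif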
30890 /* Initialize vector TARGET via VALS. Suppress the use of MMX
30891 instructions unless MMX_OK is true. */
30894 ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
30896 enum machine_mode mode = GET_MODE (target);
30897 enum machine_mode inner_mode = GET_MODE_INNER (mode);
30898 int n_elts = GET_MODE_NUNITS (mode);
30899 int n_var = 0, one_var = -1;
30900 bool all_same = true, all_const_zero = true;
30904 for (i = 0; i < n_elts; ++i)
30906 x = XVECEXP (vals, 0, i);
30907 if (!(CONST_INT_P (x)
30908 || GET_CODE (x) == CONST_DOUBLE
30909 || GET_CODE (x) == CONST_FIXED))
30910 n_var++, one_var = i;
30911 else if (x != CONST0_RTX (inner_mode))
30912 all_const_zero = false;
30913 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
30917 /* Constants are best loaded from the constant pool. */
30920 emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
30924 /* If all values are identical, broadcast the value. */
30926 && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
30927 XVECEXP (vals, 0, 0)))
30930 /* Values where only one field is non-constant are best loaded from
30931 the pool and overwritten via move later. */
30935 && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
30936 XVECEXP (vals, 0, one_var),
30940 if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
30944 ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
30948 ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
30950 enum machine_mode mode = GET_MODE (target);
30951 enum machine_mode inner_mode = GET_MODE_INNER (mode);
30952 enum machine_mode half_mode;
30953 bool use_vec_merge = false;
30955 static rtx (*gen_extract[6][2]) (rtx, rtx)
30957 { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
30958 { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
30959 { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
30960 { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
30961 { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
30962 { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
30964 static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
30966 { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
30967 { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
30968 { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
30969 { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
30970 { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
30971 { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
30981 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
30982 ix86_expand_vector_extract (true, tmp, target, 1 - elt);
30984 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
30986 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
30987 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
30993 use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
30997 tmp = gen_reg_rtx (GET_MODE_INNER (mode));
30998 ix86_expand_vector_extract (false, tmp, target, 1 - elt);
31000 tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
31002 tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
31003 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
31010 /* For the two element vectors, we implement a VEC_CONCAT with
31011 the extraction of the other element. */
31013 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
31014 tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
31017 op0 = val, op1 = tmp;
31019 op0 = tmp, op1 = val;
31021 tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
31022 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
31027 use_vec_merge = TARGET_SSE4_1;
31034 use_vec_merge = true;
31038 /* tmp = target = A B C D */
31039 tmp = copy_to_reg (target);
31040 /* target = A A B B */
31041 emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
31042 /* target = X A B B */
31043 ix86_expand_vector_set (false, target, val, 0);
31044 /* target = A X C D */
31045 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
31046 const1_rtx, const0_rtx,
31047 GEN_INT (2+4), GEN_INT (3+4)));
31051 /* tmp = target = A B C D */
31052 tmp = copy_to_reg (target);
31053 /* tmp = X B C D */
31054 ix86_expand_vector_set (false, tmp, val, 0);
31055 /* target = A B X D */
31056 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
31057 const0_rtx, const1_rtx,
31058 GEN_INT (0+4), GEN_INT (3+4)));
31062 /* tmp = target = A B C D */
31063 tmp = copy_to_reg (target);
31064 /* tmp = X B C D */
31065 ix86_expand_vector_set (false, tmp, val, 0);
31066 /* target = A B X D */
31067 emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
31068 const0_rtx, const1_rtx,
31069 GEN_INT (2+4), GEN_INT (0+4)));
31073 gcc_unreachable ();
31078 use_vec_merge = TARGET_SSE4_1;
31082 /* Element 0 handled by vec_merge below. */
31085 use_vec_merge = true;
31091 /* With SSE2, use integer shuffles to swap element 0 and ELT,
31092 store into element 0, then shuffle them back. */
31096 order[0] = GEN_INT (elt);
31097 order[1] = const1_rtx;
31098 order[2] = const2_rtx;
31099 order[3] = GEN_INT (3);
31100 order[elt] = const0_rtx;
31102 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
31103 order[1], order[2], order[3]));
31105 ix86_expand_vector_set (false, target, val, 0);
31107 emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
31108 order[1], order[2], order[3]));
31112 /* For SSE1, we have to reuse the V4SF code. */
31113 ix86_expand_vector_set (false, gen_lowpart (V4SFmode, target),
31114 gen_lowpart (SFmode, val), elt);
31119 use_vec_merge = TARGET_SSE2;
31122 use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
31126 use_vec_merge = TARGET_SSE4_1;
31133 half_mode = V16QImode;
31139 half_mode = V8HImode;
31145 half_mode = V4SImode;
31151 half_mode = V2DImode;
31157 half_mode = V4SFmode;
31163 half_mode = V2DFmode;
31169 /* Compute offset. */
31173 gcc_assert (i <= 1);
31175 /* Extract the half. */
31176 tmp = gen_reg_rtx (half_mode);
31177 emit_insn (gen_extract[j][i] (tmp, target));
31179 /* Put val in tmp at elt. */
31180 ix86_expand_vector_set (false, tmp, val, elt);
31183 emit_insn (gen_insert[j][i] (target, target, tmp));
31192 tmp = gen_rtx_VEC_DUPLICATE (mode, val);
31193 tmp = gen_rtx_VEC_MERGE (mode, tmp, target, GEN_INT (1 << elt));
31194 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
31198 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
31200 emit_move_insn (mem, target);
31202 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
31203 emit_move_insn (tmp, val);
31205 emit_move_insn (target, mem);
31210 ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
31212 enum machine_mode mode = GET_MODE (vec);
31213 enum machine_mode inner_mode = GET_MODE_INNER (mode);
31214 bool use_vec_extr = false;
31227 use_vec_extr = true;
31231 use_vec_extr = TARGET_SSE4_1;
31243 tmp = gen_reg_rtx (mode);
31244 emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
31245 GEN_INT (elt), GEN_INT (elt),
31246 GEN_INT (elt+4), GEN_INT (elt+4)));
31250 tmp = gen_reg_rtx (mode);
31251 emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
31255 gcc_unreachable ();
31258 use_vec_extr = true;
31263 use_vec_extr = TARGET_SSE4_1;
31277 tmp = gen_reg_rtx (mode);
31278 emit_insn (gen_sse2_pshufd_1 (tmp, vec,
31279 GEN_INT (elt), GEN_INT (elt),
31280 GEN_INT (elt), GEN_INT (elt)));
31284 tmp = gen_reg_rtx (mode);
31285 emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
31289 gcc_unreachable ();
31292 use_vec_extr = true;
31297 /* For SSE1, we have to reuse the V4SF code. */
31298 ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
31299 gen_lowpart (V4SFmode, vec), elt);
31305 use_vec_extr = TARGET_SSE2;
31308 use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
31312 use_vec_extr = TARGET_SSE4_1;
31316 /* ??? Could extract the appropriate HImode element and shift. */
31323 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
31324 tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
31326 /* Let the rtl optimizers know about the zero extension performed. */
31327 if (inner_mode == QImode || inner_mode == HImode)
31329 tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
31330 target = gen_lowpart (SImode, target);
31333 emit_insn (gen_rtx_SET (VOIDmode, target, tmp));
31337 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode), false);
31339 emit_move_insn (mem, vec);
31341 tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
31342 emit_move_insn (target, tmp);
31346 /* Expand a vector reduction on V4SFmode for SSE1. FN is the binary
31347 pattern to reduce; DEST is the destination; IN is the input vector. */
31350 ix86_expand_reduc_v4sf (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
31352 rtx tmp1, tmp2, tmp3;
31354 tmp1 = gen_reg_rtx (V4SFmode);
31355 tmp2 = gen_reg_rtx (V4SFmode);
31356 tmp3 = gen_reg_rtx (V4SFmode);
31358 emit_insn (gen_sse_movhlps (tmp1, in, in));
31359 emit_insn (fn (tmp2, tmp1, in));
31361 emit_insn (gen_sse_shufps_v4sf (tmp3, tmp2, tmp2,
31362 const1_rtx, const1_rtx,
31363 GEN_INT (1+4), GEN_INT (1+4)));
31364 emit_insn (fn (dest, tmp2, tmp3));
31367 /* Target hook for scalar_mode_supported_p. */
31369 ix86_scalar_mode_supported_p (enum machine_mode mode)
31371 if (DECIMAL_FLOAT_MODE_P (mode))
31372 return default_decimal_float_supported_p ();
31373 else if (mode == TFmode)
31376 return default_scalar_mode_supported_p (mode);
31379 /* Implements target hook vector_mode_supported_p. */
31381 ix86_vector_mode_supported_p (enum machine_mode mode)
31383 if (TARGET_SSE && VALID_SSE_REG_MODE (mode))
31385 if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode))
31387 if (TARGET_AVX && VALID_AVX256_REG_MODE (mode))
31389 if (TARGET_MMX && VALID_MMX_REG_MODE (mode))
31391 if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode))
31396 /* Target hook for c_mode_for_suffix. */
31397 static enum machine_mode
31398 ix86_c_mode_for_suffix (char suffix)
31408 /* Worker function for TARGET_MD_ASM_CLOBBERS.
31410 We do this in the new i386 backend to maintain source compatibility
31411 with the old cc0-based compiler. */
31414 ix86_md_asm_clobbers (tree outputs ATTRIBUTE_UNUSED,
31415 tree inputs ATTRIBUTE_UNUSED,
31418 clobbers = tree_cons (NULL_TREE, build_string (5, "flags"),
31420 clobbers = tree_cons (NULL_TREE, build_string (4, "fpsr"),
31425 /* Implements target vector targetm.asm.encode_section_info. This
31426 is not used by NetWare. */
31428 static void ATTRIBUTE_UNUSED
31429 ix86_encode_section_info (tree decl, rtx rtl, int first)
31431 default_encode_section_info (decl, rtl, first);
31433 if (TREE_CODE (decl) == VAR_DECL
31434 && (TREE_STATIC (decl) || DECL_EXTERNAL (decl))
31435 && ix86_in_large_data_p (decl))
31436 SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_FAR_ADDR;
31439 /* Worker function for REVERSE_CONDITION. */
31442 ix86_reverse_condition (enum rtx_code code, enum machine_mode mode)
31444 return (mode != CCFPmode && mode != CCFPUmode
31445 ? reverse_condition (code)
31446 : reverse_condition_maybe_unordered (code));
31449 /* Output code to perform an x87 FP register move, from OPERANDS[1] to OPERANDS[0]. */
31453 output_387_reg_move (rtx insn, rtx *operands)
31455 if (REG_P (operands[0]))
31457 if (REG_P (operands[1])
31458 && find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
31460 if (REGNO (operands[0]) == FIRST_STACK_REG)
31461 return output_387_ffreep (operands, 0);
31462 return "fstp\t%y0";
31464 if (STACK_TOP_P (operands[0]))
31465 return "fld%Z1\t%y1";
31468 else if (MEM_P (operands[0]))
31470 gcc_assert (REG_P (operands[1]));
31471 if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
31472 return "fstp%Z0\t%y0";
31475 /* There is no non-popping store to memory for XFmode.
31476 So if we need one, follow the store with a load. */
31477 if (GET_MODE (operands[0]) == XFmode)
31478 return "fstp%Z0\t%y0\n\tfld%Z0\t%y0";
31480 return "fst%Z0\t%y0";
31487 /* Output code to perform a conditional jump to LABEL if the C2 flag in
31488 the FP status register is set. */
31491 ix86_emit_fp_unordered_jump (rtx label)
31493 rtx reg = gen_reg_rtx (HImode);
31496 emit_insn (gen_x86_fnstsw_1 (reg));
31498 if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
31500 emit_insn (gen_x86_sahf_1 (reg));
31502 temp = gen_rtx_REG (CCmode, FLAGS_REG);
31503 temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
31507 emit_insn (gen_testqi_ext_ccno_0 (reg, GEN_INT (0x04)));
31509 temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
31510 temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
31513 temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
31514 gen_rtx_LABEL_REF (VOIDmode, label),
31516 temp = gen_rtx_SET (VOIDmode, pc_rtx, temp);
31518 emit_jump_insn (temp);
31519 predict_jump (REG_BR_PROB_BASE * 10 / 100);
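/* The two sequences emitted above look roughly like this (AT&T syntax;
   C2 is bit 10 of the FP status word, i.e. bit 2 of %ah):

	fnstsw	%ax		# copy FP status word into %ax
	sahf			# %ah -> EFLAGS, C2 lands in PF
	jp	label		# taken when C2 was set

   or, when sahf is unavailable or unprofitable:

	fnstsw	%ax
	testb	$4, %ah		# test C2 directly
	jne	label  */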
31522 /* Output code to perform a log1p XFmode calculation. */
31524 void ix86_emit_i387_log1p (rtx op0, rtx op1)
31526 rtx label1 = gen_label_rtx ();
31527 rtx label2 = gen_label_rtx ();
31529 rtx tmp = gen_reg_rtx (XFmode);
31530 rtx tmp2 = gen_reg_rtx (XFmode);
31533 emit_insn (gen_absxf2 (tmp, op1));
31534 test = gen_rtx_GE (VOIDmode, tmp,
31535 CONST_DOUBLE_FROM_REAL_VALUE (
31536 REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode),
31538 emit_jump_insn (gen_cbranchxf4 (test, XEXP (test, 0), XEXP (test, 1), label1));
31540 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
31541 emit_insn (gen_fyl2xp1xf3_i387 (op0, op1, tmp2));
31542 emit_jump (label2);
31544 emit_label (label1);
31545 emit_move_insn (tmp, CONST1_RTX (XFmode));
31546 emit_insn (gen_addxf3 (tmp, op1, tmp));
31547 emit_move_insn (tmp2, standard_80387_constant_rtx (4)); /* fldln2 */
31548 emit_insn (gen_fyl2xxf3_i387 (op0, tmp, tmp2));
31550 emit_label (label2);
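/* Illustrative C model of the expansion above (not GCC code).  fyl2xp1
   computes y * log2 (x + 1) without explicitly forming 1 + x, but its
   argument range is limited to |x| < 1 - sqrt(2)/2 ~= 0.29289, the
   constant tested above; larger inputs take the fyl2x path.
   log2_1p () below is a hypothetical helper standing in for fyl2xp1.  */
#if 0
static long double
log1p_model (long double x)
{
  const long double ln2 = 0.693147180559945309417L;	/* fldln2 */
  if (fabsl (x) < 0.29289321881345247561810596348408353L)
    return ln2 * log2_1p (x);		/* fyl2xp1: no cancellation */
  return ln2 * log2l (1.0L + x);	/* fyl2x on the explicit 1 + x */
}
#endif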
31553 /* Output code to perform a Newton-Raphson approximation of a single precision
31554 floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
31556 void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
31558 rtx x0, x1, e0, e1;
31560 x0 = gen_reg_rtx (mode);
31561 e0 = gen_reg_rtx (mode);
31562 e1 = gen_reg_rtx (mode);
31563 x1 = gen_reg_rtx (mode);
31565 /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
31567 /* x0 = rcp(b) estimate */
31568 emit_insn (gen_rtx_SET (VOIDmode, x0,
31569 gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
31572 emit_insn (gen_rtx_SET (VOIDmode, e0,
31573 gen_rtx_MULT (mode, x0, b)));
31576 emit_insn (gen_rtx_SET (VOIDmode, e0,
31577 gen_rtx_MULT (mode, x0, e0)));
31580 emit_insn (gen_rtx_SET (VOIDmode, e1,
31581 gen_rtx_PLUS (mode, x0, x0)));
31584 emit_insn (gen_rtx_SET (VOIDmode, x1,
31585 gen_rtx_MINUS (mode, e1, e0)));
31588 emit_insn (gen_rtx_SET (VOIDmode, res,
31589 gen_rtx_MULT (mode, a, x1)));
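/* Illustrative C model of the expansion above (not GCC code);
   rcp_estimate () is a hypothetical stand-in for the ~12-bit rcpps
   estimate.  One Newton-Raphson step roughly doubles the correct bits:
   x1 = x0 * (2 - b*x0), computed here as (x0 + x0) - (b * x0 * x0).  */
#if 0
static float
swdiv_model (float a, float b)
{
  float x0 = rcp_estimate (b);	/* x0 ~= 1/b */
  float e0 = x0 * b;
  e0 = x0 * e0;			/* e0 = b * x0 * x0 */
  float e1 = x0 + x0;		/* e1 = 2 * x0 */
  float x1 = e1 - e0;		/* refined reciprocal */
  return a * x1;
}
#endif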
31592 /* Output code to perform a Newton-Raphson approximation of a
31593 single precision floating point [reciprocal] square root. */
31595 void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
31598 rtx x0, e0, e1, e2, e3, mthree, mhalf;
31601 x0 = gen_reg_rtx (mode);
31602 e0 = gen_reg_rtx (mode);
31603 e1 = gen_reg_rtx (mode);
31604 e2 = gen_reg_rtx (mode);
31605 e3 = gen_reg_rtx (mode);
31607 real_from_integer (&r, VOIDmode, -3, -1, 0);
31608 mthree = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
31610 real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
31611 mhalf = CONST_DOUBLE_FROM_REAL_VALUE (r, SFmode);
31613 if (VECTOR_MODE_P (mode))
31615 mthree = ix86_build_const_vector (mode, true, mthree);
31616 mhalf = ix86_build_const_vector (mode, true, mhalf);
31619 /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
31620 rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
31622 /* x0 = rsqrt(a) estimate */
31623 emit_insn (gen_rtx_SET (VOIDmode, x0,
31624 gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
31627 /* If a == 0.0, filter out the infinite rsqrt estimate to prevent a NaN for sqrt(0.0). */
31632 zero = gen_reg_rtx (mode);
31633 mask = gen_reg_rtx (mode);
31635 zero = force_reg (mode, CONST0_RTX(mode));
31636 emit_insn (gen_rtx_SET (VOIDmode, mask,
31637 gen_rtx_NE (mode, zero, a)));
31639 emit_insn (gen_rtx_SET (VOIDmode, x0,
31640 gen_rtx_AND (mode, x0, mask)));
31644 emit_insn (gen_rtx_SET (VOIDmode, e0,
31645 gen_rtx_MULT (mode, x0, a)));
31647 emit_insn (gen_rtx_SET (VOIDmode, e1,
31648 gen_rtx_MULT (mode, e0, x0)));
31651 mthree = force_reg (mode, mthree);
31652 emit_insn (gen_rtx_SET (VOIDmode, e2,
31653 gen_rtx_PLUS (mode, e1, mthree)));
31655 mhalf = force_reg (mode, mhalf);
31657 /* e3 = -.5 * x0 */
31658 emit_insn (gen_rtx_SET (VOIDmode, e3,
31659 gen_rtx_MULT (mode, x0, mhalf)));
31661 /* e3 = -.5 * e0 */
31662 emit_insn (gen_rtx_SET (VOIDmode, e3,
31663 gen_rtx_MULT (mode, e0, mhalf)));
31664 /* ret = e2 * e3 */
31665 emit_insn (gen_rtx_SET (VOIDmode, res,
31666 gen_rtx_MULT (mode, e2, e3)));
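/* Illustrative C model of the expansion above (not GCC code);
   rsqrt_estimate () is a hypothetical stand-in for rsqrtps.  Both
   variants share the factor (a*x0*x0 - 3); multiplying it by -0.5*x0
   yields rsqrt(a), and by -0.5*(a*x0) yields sqrt(a).  */
#if 0
static float
swsqrt_model (float a, int recip)
{
  float x0 = rsqrt_estimate (a);	/* x0 ~= 1/sqrt(a) */
  float e0 = x0 * a;
  float e1 = e0 * x0;			/* a * x0 * x0 */
  float e2 = e1 - 3.0f;
  float e3 = (recip ? x0 : e0) * -0.5f;
  return e2 * e3;
}
#endif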
31669 #ifdef TARGET_SOLARIS
31670 /* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
31673 i386_solaris_elf_named_section (const char *name, unsigned int flags,
31676 /* With Binutils 2.15, the "@unwind" marker must be specified on
31677 every occurrence of the ".eh_frame" section, not just the first one. */
31680 && strcmp (name, ".eh_frame") == 0)
31682 fprintf (asm_out_file, "\t.section\t%s,\"%s\",@unwind\n", name,
31683 flags & SECTION_WRITE ? "aw" : "a");
31688 if (HAVE_COMDAT_GROUP && flags & SECTION_LINKONCE)
31690 solaris_elf_asm_comdat_section (name, flags, decl);
31695 default_elf_asm_named_section (name, flags, decl);
31697 #endif /* TARGET_SOLARIS */
31699 /* Return the mangling of TYPE if it is an extended fundamental type. */
31701 static const char *
31702 ix86_mangle_type (const_tree type)
31704 type = TYPE_MAIN_VARIANT (type);
31706 if (TREE_CODE (type) != VOID_TYPE && TREE_CODE (type) != BOOLEAN_TYPE
31707 && TREE_CODE (type) != INTEGER_TYPE && TREE_CODE (type) != REAL_TYPE)
31710 switch (TYPE_MODE (type))
31713 /* __float128 is "g". */
31716 /* "long double" or __float80 is "e". */
31723 /* For 32-bit code we can save PIC register setup by using
31724 the __stack_chk_fail_local hidden function instead of calling
31725 __stack_chk_fail directly. 64-bit code doesn't need to set up any PIC
31726 register, so it is better to call __stack_chk_fail directly. */
31728 static tree ATTRIBUTE_UNUSED
31729 ix86_stack_protect_fail (void)
31731 return TARGET_64BIT
31732 ? default_external_stack_protect_fail ()
31733 : default_hidden_stack_protect_fail ();
31736 /* Select a format to encode pointers in exception handling data. CODE
31737 is 0 for data, 1 for code labels, 2 for function pointers. GLOBAL is
31738 true if the symbol may be affected by dynamic relocations.
31740 ??? All x86 object file formats are capable of representing this.
31741 After all, the relocation needed is the same as for the call insn.
31742 Whether or not a particular assembler allows us to enter such, I
31743 guess we'll have to see. */
31745 asm_preferred_eh_data_format (int code, int global)
31749 int type = DW_EH_PE_sdata8;
31751 || ix86_cmodel == CM_SMALL_PIC
31752 || (ix86_cmodel == CM_MEDIUM_PIC && (global || code)))
31753 type = DW_EH_PE_sdata4;
31754 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
31756 if (ix86_cmodel == CM_SMALL
31757 || (ix86_cmodel == CM_MEDIUM && code))
31758 return DW_EH_PE_udata4;
31759 return DW_EH_PE_absptr;
31762 /* Expand copysign from SIGN to the positive value ABS_VALUE
31763 storing in RESULT. If MASK is non-null, it shall be a mask to mask out the sign bit. */
31766 ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
31768 enum machine_mode mode = GET_MODE (sign);
31769 rtx sgn = gen_reg_rtx (mode);
31770 if (mask == NULL_RTX)
31772 enum machine_mode vmode;
31774 if (mode == SFmode)
31776 else if (mode == DFmode)
31781 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
31782 if (!VECTOR_MODE_P (mode))
31784 /* We need to generate a scalar mode mask in this case. */
31785 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
31786 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
31787 mask = gen_reg_rtx (mode);
31788 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
31792 mask = gen_rtx_NOT (mode, mask);
31793 emit_insn (gen_rtx_SET (VOIDmode, sgn,
31794 gen_rtx_AND (mode, mask, sign)));
31795 emit_insn (gen_rtx_SET (VOIDmode, result,
31796 gen_rtx_IOR (mode, abs_value, sgn)));
31799 /* Expand fabs (OP0) and return a new rtx that holds the result. The
31800 mask for masking out the sign-bit is stored in *SMASK, if that is non-null. */
31803 ix86_expand_sse_fabs (rtx op0, rtx *smask)
31805 enum machine_mode vmode, mode = GET_MODE (op0);
31808 xa = gen_reg_rtx (mode);
31809 if (mode == SFmode)
31811 else if (mode == DFmode)
31815 mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
31816 if (!VECTOR_MODE_P (mode))
31818 /* We need to generate a scalar mode mask in this case. */
31819 rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
31820 tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
31821 mask = gen_reg_rtx (mode);
31822 emit_insn (gen_rtx_SET (VOIDmode, mask, tmp));
31824 emit_insn (gen_rtx_SET (VOIDmode, xa,
31825 gen_rtx_AND (mode, op0, mask)));
31833 /* Expands a comparison of OP0 with OP1 using comparison code CODE,
31834 swapping the operands if SWAP_OPERANDS is true. The expanded
31835 code is a forward jump to a newly created label in case the
31836 comparison is true. The generated label rtx is returned. */
31838 ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
31839 bool swap_operands)
31850 label = gen_label_rtx ();
31851 tmp = gen_rtx_REG (CCFPUmode, FLAGS_REG);
31852 emit_insn (gen_rtx_SET (VOIDmode, tmp,
31853 gen_rtx_COMPARE (CCFPUmode, op0, op1)));
31854 tmp = gen_rtx_fmt_ee (code, VOIDmode, tmp, const0_rtx);
31855 tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
31856 gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
31857 tmp = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx, tmp));
31858 JUMP_LABEL (tmp) = label;
31863 /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
31864 using comparison code CODE. Operands are swapped for the comparison if
31865 SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
31867 ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
31868 bool swap_operands)
31870 rtx (*insn)(rtx, rtx, rtx, rtx);
31871 enum machine_mode mode = GET_MODE (op0);
31872 rtx mask = gen_reg_rtx (mode);
31881 insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
31883 emit_insn (insn (mask, op0, op1,
31884 gen_rtx_fmt_ee (code, mode, op0, op1)));
31888 /* Generate and return a rtx of mode MODE for 2**n where n is the number
31889 of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
31891 ix86_gen_TWO52 (enum machine_mode mode)
31893 REAL_VALUE_TYPE TWO52r;
31896 real_ldexp (&TWO52r, &dconst1, mode == DFmode ? 52 : 23);
31897 TWO52 = const_double_from_real_value (TWO52r, mode);
31898 TWO52 = force_reg (mode, TWO52);
31903 /* Expand SSE sequence for computing lround from OP1 storing into OP0. */
31906 ix86_expand_lround (rtx op0, rtx op1)
31908 /* C code for the stuff we're doing below:
31909 tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
31912 enum machine_mode mode = GET_MODE (op1);
31913 const struct real_format *fmt;
31914 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
31917 /* load nextafter (0.5, 0.0) */
31918 fmt = REAL_MODE_FORMAT (mode);
31919 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
31920 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
31922 /* adj = copysign (0.5, op1) */
31923 adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
31924 ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
31926 /* adj = op1 + adj */
31927 adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
31929 /* op0 = (imode)adj */
31930 expand_fix (op0, adj, 0);
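/* Illustrative C model of the lround expansion above (not GCC code,
   assuming <math.h>).  Adding the predecessor of 0.5 rather than 0.5
   itself keeps inputs like 0.49999999999999994 from rounding up, while
   half-way cases still round away from zero.  */
#if 0
static long
lround_model (double x)
{
  double adj = nextafter (0.5, 0.0);	/* largest double < 0.5 */
  adj = copysign (adj, x);
  return (long) (x + adj);		/* truncating conversion */
}
#endif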
31933 /* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1, storing into OPERAND0. */
31936 ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
31938 /* C code for the stuff we're doing below (for do_floor): xi = (long)op1;
31940 xi -= (double)xi > op1 ? 1 : 0;
31943 enum machine_mode fmode = GET_MODE (op1);
31944 enum machine_mode imode = GET_MODE (op0);
31945 rtx ireg, freg, label, tmp;
31947 /* reg = (long)op1 */
31948 ireg = gen_reg_rtx (imode);
31949 expand_fix (ireg, op1, 0);
31951 /* freg = (double)reg */
31952 freg = gen_reg_rtx (fmode);
31953 expand_float (freg, ireg, 0);
31955 /* ireg = (freg > op1) ? ireg - 1 : ireg */
31956 label = ix86_expand_sse_compare_and_jump (UNLE,
31957 freg, op1, !do_floor);
31958 tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
31959 ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
31960 emit_move_insn (ireg, tmp);
31962 emit_label (label);
31963 LABEL_NUSES (label) = 1;
31965 emit_move_insn (op0, ireg);
31968 /* Expand rint (IEEE round to nearest) rounding OPERAND1 and storing the
31969 result in OPERAND0. */
31971 ix86_expand_rint (rtx operand0, rtx operand1)
31973 /* C code for the stuff we're doing below:
31974 xa = fabs (operand1);
31975 if (!isless (xa, 2**52)) return operand1;
31977 xa = xa + 2**52 - 2**52;
31978 return copysign (xa, operand1);
31980 enum machine_mode mode = GET_MODE (operand0);
31981 rtx res, xa, label, TWO52, mask;
31983 res = gen_reg_rtx (mode);
31984 emit_move_insn (res, operand1);
31986 /* xa = abs (operand1) */
31987 xa = ix86_expand_sse_fabs (res, &mask);
31989 /* if (!isless (xa, TWO52)) goto label; */
31990 TWO52 = ix86_gen_TWO52 (mode);
31991 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
31993 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
31994 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
31996 ix86_sse_copysign_to_positive (res, xa, res, mask);
31998 emit_label (label);
31999 LABEL_NUSES (label) = 1;
32001 emit_move_insn (operand0, res);
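/* Illustrative C model of the rint expansion above (not GCC code,
   assuming <math.h>).  Adding and then subtracting 2**52 pushes the
   fraction bits out of the 52-bit double mantissa, so the addition
   itself rounds to an integer in the current rounding mode; values with
   |x| >= 2**52 are already integral and are returned unchanged.  */
#if 0
static double
rint_model (double x)
{
  const double two52 = 4503599627370496.0;	/* 2**52 */
  double xa = fabs (x);
  if (!(xa < two52))		/* !isless: also catches NaN */
    return x;
  xa = (xa + two52) - two52;
  return copysign (xa, x);	/* restore the sign; keeps -0.0 */
}
#endif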
32004 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing into OPERAND0. */
32007 ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
32009 /* C code for the stuff we expand below.
32010 double xa = fabs (x), x2;
32011 if (!isless (xa, TWO52)) return x;
32013 xa = xa + TWO52 - TWO52;
32014 x2 = copysign (xa, x);
32023 enum machine_mode mode = GET_MODE (operand0);
32024 rtx xa, TWO52, tmp, label, one, res, mask;
32026 TWO52 = ix86_gen_TWO52 (mode);
32028 /* Temporary for holding the result, initialized to the input
32029 operand to ease control flow. */
32030 res = gen_reg_rtx (mode);
32031 emit_move_insn (res, operand1);
32033 /* xa = abs (operand1) */
32034 xa = ix86_expand_sse_fabs (res, &mask);
32036 /* if (!isless (xa, TWO52)) goto label; */
32037 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
32039 /* xa = xa + TWO52 - TWO52; */
32040 xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
32041 xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
32043 /* xa = copysign (xa, operand1) */
32044 ix86_sse_copysign_to_positive (xa, xa, res, mask);
32046 /* generate 1.0 or -1.0 */
32047 one = force_reg (mode,
32048 const_double_from_real_value (do_floor
32049 ? dconst1 : dconstm1, mode));
32051 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
32052 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
32053 emit_insn (gen_rtx_SET (VOIDmode, tmp,
32054 gen_rtx_AND (mode, one, tmp)));
32055 /* We always need to subtract here to preserve signed zero. */
32056 tmp = expand_simple_binop (mode, MINUS,
32057 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
32058 emit_move_insn (res, tmp);
32060 emit_label (label);
32061 LABEL_NUSES (label) = 1;
32063 emit_move_insn (operand0, res);
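/* Illustrative C model of the floor variant above (not GCC code,
   assuming <math.h>).  The 2**52 trick rounds to nearest, so the result
   may overshoot the input by up to 0.5; the compensation step subtracts
   1.0 exactly when it did.  (The real expansion always subtracts a
   masked +/-1.0 so that a signed zero survives.)  */
#if 0
static double
floor_model (double x)
{
  const double two52 = 4503599627370496.0;
  double xa = fabs (x), x2;
  if (!(xa < two52))
    return x;
  xa = (xa + two52) - two52;
  x2 = copysign (xa, x);	/* round-to-nearest result */
  if (x2 > x)			/* overshot: e.g. x = 2.7 -> x2 = 3.0 */
    x2 -= 1.0;
  return x2;
}
#endif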
32066 /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing into OPERAND0. */
32069 ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
32071 /* C code for the stuff we expand below.
32072 double xa = fabs (x), x2;
32073 if (!isless (xa, TWO52)) return x;
32075 x2 = (double)(long)x;
32082 if (HONOR_SIGNED_ZEROS (mode))
32083 return copysign (x2, x);
32086 enum machine_mode mode = GET_MODE (operand0);
32087 rtx xa, xi, TWO52, tmp, label, one, res, mask;
32089 TWO52 = ix86_gen_TWO52 (mode);
32091 /* Temporary for holding the result, initialized to the input
32092 operand to ease control flow. */
32093 res = gen_reg_rtx (mode);
32094 emit_move_insn (res, operand1);
32096 /* xa = abs (operand1) */
32097 xa = ix86_expand_sse_fabs (res, &mask);
32099 /* if (!isless (xa, TWO52)) goto label; */
32100 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
32102 /* xa = (double)(long)x */
32103 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
32104 expand_fix (xi, res, 0);
32105 expand_float (xa, xi, 0);
32108 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
32110 /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
32111 tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
32112 emit_insn (gen_rtx_SET (VOIDmode, tmp,
32113 gen_rtx_AND (mode, one, tmp)));
32114 tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
32115 xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
32116 emit_move_insn (res, tmp);
32118 if (HONOR_SIGNED_ZEROS (mode))
32119 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
32121 emit_label (label);
32122 LABEL_NUSES (label) = 1;
32124 emit_move_insn (operand0, res);
32127 /* Expand SSE sequence for computing round from OPERAND1 storing
32128 into OPERAND0. Sequence that works without relying on DImode truncation
32129 via cvttsd2siq, which is only available on 64-bit targets. */
32131 ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
32133 /* C code for the stuff we expand below.
32134 double xa = fabs (x), xa2, x2;
32135 if (!isless (xa, TWO52)) return x;
32137 Using the absolute value and copying back sign makes
32138 -0.0 -> -0.0 correct.
32139 xa2 = xa + TWO52 - TWO52;
32144 else if (dxa > 0.5)
32146 x2 = copysign (xa2, x);
32149 enum machine_mode mode = GET_MODE (operand0);
32150 rtx xa, xa2, dxa, TWO52, tmp, label, half, mhalf, one, res, mask;
32152 TWO52 = ix86_gen_TWO52 (mode);
32154 /* Temporary for holding the result, initialized to the input
32155 operand to ease control flow. */
32156 res = gen_reg_rtx (mode);
32157 emit_move_insn (res, operand1);
32159 /* xa = abs (operand1) */
32160 xa = ix86_expand_sse_fabs (res, &mask);
32162 /* if (!isless (xa, TWO52)) goto label; */
32163 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
32165 /* xa2 = xa + TWO52 - TWO52; */
32166 xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
32167 xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
32169 /* dxa = xa2 - xa; */
32170 dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
32172 /* generate 0.5, 1.0 and -0.5 */
32173 half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
32174 one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
32175 mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
32179 tmp = gen_reg_rtx (mode);
32180 /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
32181 tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
32182 emit_insn (gen_rtx_SET (VOIDmode, tmp,
32183 gen_rtx_AND (mode, one, tmp)));
32184 xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
32185 /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
32186 tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
32187 emit_insn (gen_rtx_SET (VOIDmode, tmp,
32188 gen_rtx_AND (mode, one, tmp)));
32189 xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
32191 /* res = copysign (xa2, operand1) */
32192 ix86_sse_copysign_to_positive (res, xa2, force_reg (mode, operand1), mask);
32194 emit_label (label);
32195 LABEL_NUSES (label) = 1;
32197 emit_move_insn (operand0, res);
32200 /* Expand SSE sequence for computing trunc from OPERAND1 storing into OPERAND0. */
32203 ix86_expand_trunc (rtx operand0, rtx operand1)
32205 /* C code for SSE variant we expand below.
32206 double xa = fabs (x), x2;
32207 if (!isless (xa, TWO52)) return x;
32209 x2 = (double)(long)x;
32210 if (HONOR_SIGNED_ZEROS (mode))
32211 return copysign (x2, x);
32214 enum machine_mode mode = GET_MODE (operand0);
32215 rtx xa, xi, TWO52, label, res, mask;
32217 TWO52 = ix86_gen_TWO52 (mode);
32219 /* Temporary for holding the result, initialized to the input
32220 operand to ease control flow. */
32221 res = gen_reg_rtx (mode);
32222 emit_move_insn (res, operand1);
32224 /* xa = abs (operand1) */
32225 xa = ix86_expand_sse_fabs (res, &mask);
32227 /* if (!isless (xa, TWO52)) goto label; */
32228 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
32230 /* x = (double)(long)x */
32231 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
32232 expand_fix (xi, res, 0);
32233 expand_float (res, xi, 0);
32235 if (HONOR_SIGNED_ZEROS (mode))
32236 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), mask);
32238 emit_label (label);
32239 LABEL_NUSES (label) = 1;
32241 emit_move_insn (operand0, res);
32244 /* Expand SSE sequence for computing trunc from OPERAND1 storing
32247 ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
32249 enum machine_mode mode = GET_MODE (operand0);
32250 rtx xa, mask, TWO52, label, one, res, smask, tmp;
32252 /* C code for SSE variant we expand below.
32253 double xa = fabs (x), x2;
32254 if (!isless (xa, TWO52))
32255 return x;
32256 xa2 = xa + TWO52 - TWO52;
32257 Correction:
32258 if (xa2 > xa)
32259 xa2 -= 1;
32260 x2 = copysign (xa2, x);
32261 return x2;
32262 */
32264 TWO52 = ix86_gen_TWO52 (mode);
32266 /* Temporary for holding the result, initialized to the input
32267 operand to ease control flow. */
32268 res = gen_reg_rtx (mode);
32269 emit_move_insn (res, operand1);
32271 /* xa = abs (operand1) */
32272 xa = ix86_expand_sse_fabs (res, &smask);
32274 /* if (!isless (xa, TWO52)) goto label; */
32275 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
32277 /* res = xa + TWO52 - TWO52; */
32278 tmp = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
32279 tmp = expand_simple_binop (mode, MINUS, tmp, TWO52, tmp, 0, OPTAB_DIRECT);
32280 emit_move_insn (res, tmp);
32283 one = force_reg (mode, const_double_from_real_value (dconst1, mode));
32285 /* Compensate: res = xa2 - (res > xa ? 1 : 0) */
32286 mask = ix86_expand_sse_compare_mask (UNGT, res, xa, false);
32287 emit_insn (gen_rtx_SET (VOIDmode, mask,
32288 gen_rtx_AND (mode, mask, one)));
32289 tmp = expand_simple_binop (mode, MINUS,
32290 res, mask, NULL_RTX, 0, OPTAB_DIRECT);
32291 emit_move_insn (res, tmp);
32293 /* res = copysign (res, operand1) */
32294 ix86_sse_copysign_to_positive (res, res, force_reg (mode, operand1), smask);
32296 emit_label (label);
32297 LABEL_NUSES (label) = 1;
32299 emit_move_insn (operand0, res);
32302 /* Expand SSE sequence for computing round from OPERAND1 storing
32305 ix86_expand_round (rtx operand0, rtx operand1)
32307 /* C code for the stuff we're doing below:
32308 double xa = fabs (x);
32309 if (!isless (xa, TWO52))
32310 return x;
32311 xa = (double)(long)(xa + nextafter (0.5, 0.0));
32312 return copysign (xa, x);
32313 */
32314 enum machine_mode mode = GET_MODE (operand0);
32315 rtx res, TWO52, xa, label, xi, half, mask;
32316 const struct real_format *fmt;
32317 REAL_VALUE_TYPE pred_half, half_minus_pred_half;
32319 /* Temporary for holding the result, initialized to the input
32320 operand to ease control flow. */
32321 res = gen_reg_rtx (mode);
32322 emit_move_insn (res, operand1);
32324 TWO52 = ix86_gen_TWO52 (mode);
32325 xa = ix86_expand_sse_fabs (res, &mask);
32326 label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
32328 /* load nextafter (0.5, 0.0) */
32329 fmt = REAL_MODE_FORMAT (mode);
32330 real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
32331 REAL_ARITHMETIC (pred_half, MINUS_EXPR, dconsthalf, half_minus_pred_half);
32333 /* xa = xa + 0.5 */
32334 half = force_reg (mode, const_double_from_real_value (pred_half, mode));
32335 xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
32337 /* xa = (double)(int64_t)xa */
32338 xi = gen_reg_rtx (mode == DFmode ? DImode : SImode);
32339 expand_fix (xi, xa, 0);
32340 expand_float (xa, xi, 0);
32342 /* res = copysign (xa, operand1) */
32343 ix86_sse_copysign_to_positive (res, xa, force_reg (mode, operand1), mask);
32345 emit_label (label);
32346 LABEL_NUSES (label) = 1;
32348 emit_move_insn (operand0, res);
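/* An illustrative scalar sketch of the expansion above, not from the
original sources; it assumes IEEE double, <math.h> and <stdint.h>, with
0x1p52 playing the role of TWO52. nextafter (0.5, 0.0) is used instead
of 0.5 so that values just below one half (such as that constant
itself) are not rounded up to 1 by the round-to-nearest addition:

static double round_sketch (double x)
{
double xa = fabs (x);
if (!isless (xa, 0x1p52))
return x;
xa = (double) (int64_t) (xa + nextafter (0.5, 0.0));
return copysign (xa, x);
}
*/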
32352 /* Table of valid machine attributes. */
32353 static const struct attribute_spec ix86_attribute_table[] =
32355 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
32356 affects_type_identity } */
32357 /* Stdcall attribute says callee is responsible for popping arguments
32358 if they are not variable. */
32359 { "stdcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
32361 /* Fastcall attribute says callee is responsible for popping arguments
32362 if they are not variable. */
32363 { "fastcall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
32365 /* Thiscall attribute says callee is responsible for popping arguments
32366 if they are not variable. */
32367 { "thiscall", 0, 0, false, true, true, ix86_handle_cconv_attribute,
32369 /* Cdecl attribute says the callee is a normal C declaration */
32370 { "cdecl", 0, 0, false, true, true, ix86_handle_cconv_attribute,
32372 /* Regparm attribute specifies how many integer arguments are to be
32373 passed in registers. */
32374 { "regparm", 1, 1, false, true, true, ix86_handle_cconv_attribute,
32376 /* Sseregparm attribute says we are using x86_64 calling conventions
32377 for FP arguments. */
32378 { "sseregparm", 0, 0, false, true, true, ix86_handle_cconv_attribute,
32380 /* force_align_arg_pointer says this function realigns the stack at entry. */
32381 { (const char *)&ix86_force_align_arg_pointer_string, 0, 0,
32382 false, true, true, ix86_handle_cconv_attribute, false },
32383 #if TARGET_DLLIMPORT_DECL_ATTRIBUTES
32384 { "dllimport", 0, 0, false, false, false, handle_dll_attribute, false },
32385 { "dllexport", 0, 0, false, false, false, handle_dll_attribute, false },
32386 { "shared", 0, 0, true, false, false, ix86_handle_shared_attribute,
32389 { "ms_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
32391 { "gcc_struct", 0, 0, false, false, false, ix86_handle_struct_attribute,
32393 #ifdef SUBTARGET_ATTRIBUTE_TABLE
32394 SUBTARGET_ATTRIBUTE_TABLE,
32396 /* ms_abi and sysv_abi calling convention function attributes. */
32397 { "ms_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
32398 { "sysv_abi", 0, 0, false, true, true, ix86_handle_abi_attribute, true },
32399 { "ms_hook_prologue", 0, 0, true, false, false, ix86_handle_fndecl_attribute,
32401 { "callee_pop_aggregate_return", 1, 1, false, true, true,
32402 ix86_handle_callee_pop_aggregate_return, true },
32404 { NULL, 0, 0, false, false, false, NULL, false }
32407 /* Implement targetm.vectorize.builtin_vectorization_cost. */
32409 ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
32410 tree vectype ATTRIBUTE_UNUSED,
32411 int misalign ATTRIBUTE_UNUSED)
32413 switch (type_of_cost)
32415 case scalar_stmt:
32416 return ix86_cost->scalar_stmt_cost;
32418 case scalar_load:
32419 return ix86_cost->scalar_load_cost;
32421 case scalar_store:
32422 return ix86_cost->scalar_store_cost;
32424 case vector_stmt:
32425 return ix86_cost->vec_stmt_cost;
32427 case vector_load:
32428 return ix86_cost->vec_align_load_cost;
32430 case vector_store:
32431 return ix86_cost->vec_store_cost;
32433 case vec_to_scalar:
32434 return ix86_cost->vec_to_scalar_cost;
32436 case scalar_to_vec:
32437 return ix86_cost->scalar_to_vec_cost;
32439 case unaligned_load:
32440 case unaligned_store:
32441 return ix86_cost->vec_unalign_load_cost;
32443 case cond_branch_taken:
32444 return ix86_cost->cond_taken_branch_cost;
32446 case cond_branch_not_taken:
32447 return ix86_cost->cond_not_taken_branch_cost;
32452 default:
32453 gcc_unreachable ();
32458 /* Implement targetm.vectorize.builtin_vec_perm. */
32461 ix86_vectorize_builtin_vec_perm (tree vec_type, tree *mask_type)
32463 tree itype = TREE_TYPE (vec_type);
32464 bool u = TYPE_UNSIGNED (itype);
32465 enum machine_mode vmode = TYPE_MODE (vec_type);
32466 enum ix86_builtins fcode;
32467 bool ok = TARGET_SSE2;
32473 fcode = IX86_BUILTIN_VEC_PERM_V4DF;
32476 fcode = IX86_BUILTIN_VEC_PERM_V2DF;
32478 itype = ix86_get_builtin_type (IX86_BT_DI);
32483 fcode = IX86_BUILTIN_VEC_PERM_V8SF;
32487 fcode = IX86_BUILTIN_VEC_PERM_V4SF;
32489 itype = ix86_get_builtin_type (IX86_BT_SI);
32493 fcode = u ? IX86_BUILTIN_VEC_PERM_V2DI_U : IX86_BUILTIN_VEC_PERM_V2DI;
32496 fcode = u ? IX86_BUILTIN_VEC_PERM_V4SI_U : IX86_BUILTIN_VEC_PERM_V4SI;
32499 fcode = u ? IX86_BUILTIN_VEC_PERM_V8HI_U : IX86_BUILTIN_VEC_PERM_V8HI;
32502 fcode = u ? IX86_BUILTIN_VEC_PERM_V16QI_U : IX86_BUILTIN_VEC_PERM_V16QI;
32512 *mask_type = itype;
32513 return ix86_builtins[(int) fcode];
32516 /* Return a vector mode with twice as many elements as VMODE. */
32517 /* ??? Consider moving this to a table generated by genmodes.c. */
32519 static enum machine_mode
32520 doublesize_vector_mode (enum machine_mode vmode)
32524 case V2SFmode: return V4SFmode;
32525 case V1DImode: return V2DImode;
32526 case V2SImode: return V4SImode;
32527 case V4HImode: return V8HImode;
32528 case V8QImode: return V16QImode;
32530 case V2DFmode: return V4DFmode;
32531 case V4SFmode: return V8SFmode;
32532 case V2DImode: return V4DImode;
32533 case V4SImode: return V8SImode;
32534 case V8HImode: return V16HImode;
32535 case V16QImode: return V32QImode;
32537 case V4DFmode: return V8DFmode;
32538 case V8SFmode: return V16SFmode;
32539 case V4DImode: return V8DImode;
32540 case V8SImode: return V16SImode;
32541 case V16HImode: return V32HImode;
32542 case V32QImode: return V64QImode;
32545 gcc_unreachable ();
32549 /* Construct (set target (vec_select op0 (parallel perm))) and
32550 return true if that's a valid instruction in the active ISA. */
32553 expand_vselect (rtx target, rtx op0, const unsigned char *perm, unsigned nelt)
32555 rtx rperm[MAX_VECT_LEN], x;
32558 for (i = 0; i < nelt; ++i)
32559 rperm[i] = GEN_INT (perm[i]);
32561 x = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (nelt, rperm));
32562 x = gen_rtx_VEC_SELECT (GET_MODE (target), op0, x);
32563 x = gen_rtx_SET (VOIDmode, target, x);
32566 if (recog_memoized (x) < 0)
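/* For illustration (not from the original sources): reversing a V4SF
register through expand_vselect builds RTL of the shape

(set (reg:V4SF x)
(vec_select:V4SF (reg:V4SF y)
(parallel [(const_int 3) (const_int 2)
(const_int 1) (const_int 0)])))

which is emitted only if recog finds a matching pattern in sse.md. */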
32574 /* Similar, but generate a vec_concat from op0 and op1 as well. */
32577 expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
32578 const unsigned char *perm, unsigned nelt)
32580 enum machine_mode v2mode;
32583 v2mode = doublesize_vector_mode (GET_MODE (op0));
32584 x = gen_rtx_VEC_CONCAT (v2mode, op0, op1);
32585 return expand_vselect (target, x, perm, nelt);
32588 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
32589 in terms of blendp[sd] / pblendw / pblendvb. */
32592 expand_vec_perm_blend (struct expand_vec_perm_d *d)
32594 enum machine_mode vmode = d->vmode;
32595 unsigned i, mask, nelt = d->nelt;
32596 rtx target, op0, op1, x;
32598 if (!TARGET_SSE4_1 || d->op0 == d->op1)
32600 if (!(GET_MODE_SIZE (vmode) == 16 || vmode == V4DFmode || vmode == V8SFmode))
32603 /* This is a blend, not a permute. Elements must stay in their
32604 respective lanes. */
32605 for (i = 0; i < nelt; ++i)
32607 unsigned e = d->perm[i];
32608 if (!(e == i || e == i + nelt))
32615 /* ??? Without SSE4.1, we could implement this with and/andn/or. This
32616 decision should be extracted elsewhere, so that we only try that
32617 sequence once all budget==3 options have been tried. */
32619 /* For bytes, see if bytes move in pairs so we can use pblendw with
32620 an immediate argument, rather than pblendvb with a vector argument. */
32621 if (vmode == V16QImode)
32623 bool pblendw_ok = true;
32624 for (i = 0; i < 16 && pblendw_ok; i += 2)
32625 pblendw_ok = (d->perm[i] + 1 == d->perm[i + 1]);
32629 rtx rperm[16], vperm;
32631 for (i = 0; i < nelt; ++i)
32632 rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);
32634 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm));
32635 vperm = force_reg (V16QImode, vperm);
32637 emit_insn (gen_sse4_1_pblendvb (d->target, d->op0, d->op1, vperm));
32642 target = d->target;
32654 for (i = 0; i < nelt; ++i)
32655 mask |= (d->perm[i] >= nelt) << i;
32659 for (i = 0; i < 2; ++i)
32660 mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
32664 for (i = 0; i < 4; ++i)
32665 mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
32669 for (i = 0; i < 8; ++i)
32670 mask |= (d->perm[i * 2] >= 16) << i;
32674 target = gen_lowpart (vmode, target);
32675 op0 = gen_lowpart (vmode, op0);
32676 op1 = gen_lowpart (vmode, op1);
32680 gcc_unreachable ();
32683 /* This matches five different patterns with the different modes. */
32684 x = gen_rtx_VEC_MERGE (vmode, op1, op0, GEN_INT (mask));
32685 x = gen_rtx_SET (VOIDmode, target, x);
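/* Worked example (illustrative): for V4SF with d->perm = { 0, 5, 2, 7 },
elements 1 and 3 come from OP1, so the loop above computes
mask = (1 << 1) | (1 << 3) = 0xa, and the vec_merge built above
becomes a blendps with that immediate. */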
32691 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
32692 in terms of the variable form of vpermilps.
32694 Note that we will have already failed the immediate input vpermilps,
32695 which requires that the high and low part shuffle be identical; the
32696 variable form doesn't require that. */
32699 expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
32701 rtx rperm[8], vperm;
32704 if (!TARGET_AVX || d->vmode != V8SFmode || d->op0 != d->op1)
32707 /* We can only permute within the 128-bit lane. */
32708 for (i = 0; i < 8; ++i)
32710 unsigned e = d->perm[i];
32711 if (i < 4 ? e >= 4 : e < 4)
32718 for (i = 0; i < 8; ++i)
32720 unsigned e = d->perm[i];
32722 /* Within each 128-bit lane, the elements of op0 are numbered
32723 from 0 and the elements of op1 are numbered from 4. */
32729 rperm[i] = GEN_INT (e);
32732 vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
32733 vperm = force_reg (V8SImode, vperm);
32734 emit_insn (gen_avx_vpermilvarv8sf3 (d->target, d->op0, vperm));
32739 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
32740 in terms of pshufb or vpperm. */
32743 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
32745 unsigned i, nelt, eltsz;
32746 rtx rperm[16], vperm, target, op0, op1;
32748 if (!(d->op0 == d->op1 ? TARGET_SSSE3 : TARGET_XOP))
32750 if (GET_MODE_SIZE (d->vmode) != 16)
32757 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
32759 for (i = 0; i < nelt; ++i)
32761 unsigned j, e = d->perm[i];
32762 for (j = 0; j < eltsz; ++j)
32763 rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
32766 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm));
32767 vperm = force_reg (V16QImode, vperm);
32769 target = gen_lowpart (V16QImode, d->target);
32770 op0 = gen_lowpart (V16QImode, d->op0);
32771 if (d->op0 == d->op1)
32772 emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
32775 op1 = gen_lowpart (V16QImode, d->op1);
32776 emit_insn (gen_xop_pperm (target, op0, op1, vperm));
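/* Worked example (illustrative): for a one-operand V4SI permutation
{ 2, 1, 0, 3 }, eltsz is 4, so element index E expands to bytes
E*4 .. E*4+3 and the pshufb selector built above is
{ 8,9,10,11, 4,5,6,7, 0,1,2,3, 12,13,14,15 }. */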
32782 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
32783 in a single instruction. */
32786 expand_vec_perm_1 (struct expand_vec_perm_d *d)
32788 unsigned i, nelt = d->nelt;
32789 unsigned char perm2[MAX_VECT_LEN];
32791 /* Check plain VEC_SELECT first, because AVX has instructions that could
32792 match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
32793 input where SEL+CONCAT may not. */
32794 if (d->op0 == d->op1)
32796 int mask = nelt - 1;
32798 for (i = 0; i < nelt; i++)
32799 perm2[i] = d->perm[i] & mask;
32801 if (expand_vselect (d->target, d->op0, perm2, nelt))
32804 /* There are plenty of patterns in sse.md that are written for
32805 SEL+CONCAT and are not replicated for a single op. Perhaps
32806 that should be changed, to avoid the nastiness here. */
32808 /* Recognize interleave style patterns, which means incrementing
32809 every other permutation operand. */
32810 for (i = 0; i < nelt; i += 2)
32812 perm2[i] = d->perm[i] & mask;
32813 perm2[i + 1] = (d->perm[i + 1] & mask) + nelt;
32815 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt))
32818 /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
32821 for (i = 0; i < nelt; i += 4)
32823 perm2[i + 0] = d->perm[i + 0] & mask;
32824 perm2[i + 1] = d->perm[i + 1] & mask;
32825 perm2[i + 2] = (d->perm[i + 2] & mask) + nelt;
32826 perm2[i + 3] = (d->perm[i + 3] & mask) + nelt;
32829 if (expand_vselect_vconcat (d->target, d->op0, d->op0, perm2, nelt))
32834 /* Finally, try the fully general two operand permute. */
32835 if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt))
32838 /* Recognize interleave style patterns with reversed operands. */
32839 if (d->op0 != d->op1)
32841 for (i = 0; i < nelt; ++i)
32843 unsigned e = d->perm[i];
32851 if (expand_vselect_vconcat (d->target, d->op1, d->op0, perm2, nelt))
32855 /* Try the SSE4.1 blend variable merge instructions. */
32856 if (expand_vec_perm_blend (d))
32859 /* Try one of the AVX vpermil variable permutations. */
32860 if (expand_vec_perm_vpermil (d))
32863 /* Try the SSSE3 pshufb or XOP vpperm variable permutation. */
32864 if (expand_vec_perm_pshufb (d))
32870 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
32871 in terms of a pair of pshuflw + pshufhw instructions. */
32874 expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
32876 unsigned char perm2[MAX_VECT_LEN];
32880 if (d->vmode != V8HImode || d->op0 != d->op1)
32883 /* The two permutations only operate in 64-bit lanes. */
32884 for (i = 0; i < 4; ++i)
32885 if (d->perm[i] >= 4)
32887 for (i = 4; i < 8; ++i)
32888 if (d->perm[i] < 4)
32894 /* Emit the pshuflw. */
32895 memcpy (perm2, d->perm, 4);
32896 for (i = 4; i < 8; ++i)
32898 ok = expand_vselect (d->target, d->op0, perm2, 8);
32901 /* Emit the pshufhw. */
32902 memcpy (perm2 + 4, d->perm + 4, 4);
32903 for (i = 0; i < 4; ++i)
32905 ok = expand_vselect (d->target, d->target, perm2, 8);
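/* Worked example (illustrative): for V8HI d->perm = { 3, 2, 1, 0, 4,
5, 6, 7 }, the pshuflw step uses perm2 = { 3, 2, 1, 0, 4, 5, 6, 7 }
(immediate 0x1b) and the pshufhw step is then the identity. */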
32911 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
32912 the permutation using the SSSE3 palignr instruction. This succeeds
32913 when all of the elements in PERM fit within one vector and we merely
32914 need to shift them down so that a single vector permutation has a
32915 chance to succeed. */
32918 expand_vec_perm_palignr (struct expand_vec_perm_d *d)
32920 unsigned i, nelt = d->nelt;
32925 /* Even with AVX, palignr only operates on 128-bit vectors. */
32926 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
32929 min = nelt, max = 0;
32930 for (i = 0; i < nelt; ++i)
32932 unsigned e = d->perm[i];
32938 if (min == 0 || max - min >= nelt)
32941 /* Given that we have SSSE3, we know we'll be able to implement the
32942 single operand permutation after the palignr with pshufb. */
32946 shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
32947 emit_insn (gen_ssse3_palignrti (gen_lowpart (TImode, d->target),
32948 gen_lowpart (TImode, d->op1),
32949 gen_lowpart (TImode, d->op0), shift));
32951 d->op0 = d->op1 = d->target;
32954 for (i = 0; i < nelt; ++i)
32956 unsigned e = d->perm[i] - min;
32962 /* Test for the degenerate case where the alignment by itself
32963 produces the desired permutation. */
32967 ok = expand_vec_perm_1 (d);
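/* Worked example (illustrative): the two-operand V8HI permutation
{ 2, 3, 4, 5, 6, 7, 8, 9 } has min = 2 and max = 9, so all elements
fit in one vector. The palignr above shifts the concatenated operands
down by 2*16 bits, leaving the identity permutation, i.e. the
degenerate case in which the alignment alone produces the result. */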
32973 /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
32974 a two vector permutation into a single vector permutation by using
32975 an interleave operation to merge the vectors. */
32978 expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
32980 struct expand_vec_perm_d dremap, dfinal;
32981 unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
32982 unsigned contents, h1, h2, h3, h4;
32983 unsigned char remap[2 * MAX_VECT_LEN];
32987 if (d->op0 == d->op1)
32990 /* The 256-bit unpck[lh]p[sd] instructions only operate within the 128-bit
32991 lanes. We can use similar techniques with the vperm2f128 instruction,
32992 but it requires slightly different logic. */
32993 if (GET_MODE_SIZE (d->vmode) != 16)
32996 /* Examine from whence the elements come. */
32998 for (i = 0; i < nelt; ++i)
32999 contents |= 1u << d->perm[i];
33001 /* Split the two input vectors into 4 halves. */
33002 h1 = (1u << nelt2) - 1;
33003 h2 = h1 << nelt2;
33004 h3 = h2 << nelt2;
33005 h4 = h3 << nelt2;
33007 memset (remap, 0xff, sizeof (remap));
33010 /* If all the elements are from the low halves, use interleave low;
33011 similarly for interleave high. If the elements are from mis-matched
33012 halves, we can use shufps for V4SF/V4SI or do a DImode shuffle. */
33013 if ((contents & (h1 | h3)) == contents)
33015 for (i = 0; i < nelt2; ++i)
33018 remap[i + nelt] = i * 2 + 1;
33019 dremap.perm[i * 2] = i;
33020 dremap.perm[i * 2 + 1] = i + nelt;
33023 else if ((contents & (h2 | h4)) == contents)
33025 for (i = 0; i < nelt2; ++i)
33027 remap[i + nelt2] = i * 2;
33028 remap[i + nelt + nelt2] = i * 2 + 1;
33029 dremap.perm[i * 2] = i + nelt2;
33030 dremap.perm[i * 2 + 1] = i + nelt + nelt2;
33033 else if ((contents & (h1 | h4)) == contents)
33035 for (i = 0; i < nelt2; ++i)
33038 remap[i + nelt + nelt2] = i + nelt2;
33039 dremap.perm[i] = i;
33040 dremap.perm[i + nelt2] = i + nelt + nelt2;
33044 dremap.vmode = V2DImode;
33046 dremap.perm[0] = 0;
33047 dremap.perm[1] = 3;
33050 else if ((contents & (h2 | h3)) == contents)
33052 for (i = 0; i < nelt2; ++i)
33054 remap[i + nelt2] = i;
33055 remap[i + nelt] = i + nelt2;
33056 dremap.perm[i] = i + nelt2;
33057 dremap.perm[i + nelt2] = i + nelt;
33061 dremap.vmode = V2DImode;
33063 dremap.perm[0] = 1;
33064 dremap.perm[1] = 2;
33070 /* Use the remapping array set up above to move the elements from their
33071 swizzled locations into their final destinations. */
33073 for (i = 0; i < nelt; ++i)
33075 unsigned e = remap[d->perm[i]];
33076 gcc_assert (e < nelt);
33077 dfinal.perm[i] = e;
33079 dfinal.op0 = gen_reg_rtx (dfinal.vmode);
33080 dfinal.op1 = dfinal.op0;
33081 dremap.target = dfinal.op0;
33083 /* Test if the final remap can be done with a single insn. For V4SFmode or
33084 V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
33086 ok = expand_vec_perm_1 (&dfinal);
33087 seq = get_insns ();
33093 if (dremap.vmode != dfinal.vmode)
33095 dremap.target = gen_lowpart (dremap.vmode, dremap.target);
33096 dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
33097 dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
33100 ok = expand_vec_perm_1 (&dremap);
33107 /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
33108 permutation with two pshufb insns and an ior. We should have already
33109 failed all two instruction sequences. */
33112 expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
33114 rtx rperm[2][16], vperm, l, h, op, m128;
33115 unsigned int i, nelt, eltsz;
33117 if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
33119 gcc_assert (d->op0 != d->op1);
33122 eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode));
33124 /* Generate two permutation masks. If the required element is within
33125 the given vector it is shuffled into the proper lane. If the required
33126 element is in the other vector, force a zero into the lane by setting
33127 bit 7 in the permutation mask. */
33128 m128 = GEN_INT (-128);
33129 for (i = 0; i < nelt; ++i)
33131 unsigned j, e = d->perm[i];
33132 unsigned which = (e >= nelt);
33136 for (j = 0; j < eltsz; ++j)
33138 rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
33139 rperm[1-which][i*eltsz + j] = m128;
33143 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
33144 vperm = force_reg (V16QImode, vperm);
33146 l = gen_reg_rtx (V16QImode);
33147 op = gen_lowpart (V16QImode, d->op0);
33148 emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
33150 vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
33151 vperm = force_reg (V16QImode, vperm);
33153 h = gen_reg_rtx (V16QImode);
33154 op = gen_lowpart (V16QImode, d->op1);
33155 emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
33157 op = gen_lowpart (V16QImode, d->target);
33158 emit_insn (gen_iorv16qi3 (op, l, h));
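/* Worked example (illustrative): if result byte 0 comes from OP0's
byte 0 and result byte 1 from OP1's byte 1 (perm = { 0, 17, ... }),
the first selector starts { 0, 0x80, ... } and the second
{ 0x80, 1, ... }; the 0x80 entries zero their lanes, so the final
ior merges the two partial results without collisions. */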
33163 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
33164 and extract-odd permutations. */
33167 expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
33174 t1 = gen_reg_rtx (V4DFmode);
33175 t2 = gen_reg_rtx (V4DFmode);
33177 /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */
33178 emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
33179 emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));
33181 /* Now an unpck[lh]pd will produce the result required. */
33183 t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
33185 t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
33191 int mask = odd ? 0xdd : 0x88;
33193 t1 = gen_reg_rtx (V8SFmode);
33194 t2 = gen_reg_rtx (V8SFmode);
33195 t3 = gen_reg_rtx (V8SFmode);
33197 /* Shuffle within the 128-bit lanes to produce:
33198 { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }. */
33199 emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
33202 /* Shuffle the lanes around to produce:
33203 { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }. */
33204 emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
33207 /* Shuffle within the 128-bit lanes to produce:
33208 { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }. */
33209 emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));
33211 /* Shuffle within the 128-bit lanes to produce:
33212 { 8 a c e c e 8 a } | { 9 b d f d f 9 b }. */
33213 emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));
33215 /* Shuffle the lanes around to produce:
33216 { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }. */
33217 emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
33226 /* These are always directly implementable by expand_vec_perm_1. */
33227 gcc_unreachable ();
33231 return expand_vec_perm_pshufb2 (d);
33234 /* We need 2*log2(N)-1 operations to achieve odd/even
33235 with interleave, i.e. five insns for V8HI here. */
33236 t1 = gen_reg_rtx (V8HImode);
33237 t2 = gen_reg_rtx (V8HImode);
33238 emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
33239 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
33240 emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
33241 emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
33243 t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
33245 t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
33252 return expand_vec_perm_pshufb2 (d);
33255 t1 = gen_reg_rtx (V16QImode);
33256 t2 = gen_reg_rtx (V16QImode);
33257 t3 = gen_reg_rtx (V16QImode);
33258 emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
33259 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
33260 emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
33261 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
33262 emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
33263 emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
33265 t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
33267 t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
33273 gcc_unreachable ();
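/* Illustrative trace for V8HI extract-even, perm { 0 2 4 6 8 a c e }:
round 1 yields { 0 8 1 9 2 a 3 b } and { 4 c 5 d 6 e 7 f },
round 2 yields { 0 4 8 c 1 5 9 d } and { 2 6 a e 3 7 b f },
and the final interleave-low produces { 0 2 4 6 8 a c e }; the
interleave-high variant produces the odd elements instead. */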
33279 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
33280 extract-even and extract-odd permutations. */
33283 expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
33285 unsigned i, odd, nelt = d->nelt;
33288 if (odd != 0 && odd != 1)
33291 for (i = 1; i < nelt; ++i)
33292 if (d->perm[i] != 2 * i + odd)
33295 return expand_vec_perm_even_odd_1 (d, odd);
33298 /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
33299 permutations. We assume that expand_vec_perm_1 has already failed. */
33302 expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
33304 unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
33305 enum machine_mode vmode = d->vmode;
33306 unsigned char perm2[4];
33314 /* These are special-cased in sse.md so that we can optionally
33315 use the vbroadcast instruction. They expand to two insns
33316 if the input happens to be in a register. */
33317 gcc_unreachable ();
33323 /* These are always implementable using standard shuffle patterns. */
33324 gcc_unreachable ();
33328 /* These can be implemented via interleave. We save one insn by
33329 stopping once we have promoted to V4SImode and then use pshufd. */
33332 optab otab = vec_interleave_low_optab;
33336 otab = vec_interleave_high_optab;
33341 op0 = expand_binop (vmode, otab, op0, op0, NULL, 0, OPTAB_DIRECT);
33342 vmode = get_mode_wider_vector (vmode);
33343 op0 = gen_lowpart (vmode, op0);
33345 while (vmode != V4SImode);
33347 memset (perm2, elt, 4);
33348 ok = expand_vselect (gen_lowpart (V4SImode, d->target), op0, perm2, 4);
33353 gcc_unreachable ();
33357 /* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
33358 broadcast permutations. */
33361 expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
33363 unsigned i, elt, nelt = d->nelt;
33365 if (d->op0 != d->op1)
33369 for (i = 1; i < nelt; ++i)
33370 if (d->perm[i] != elt)
33373 return expand_vec_perm_broadcast_1 (d);
33376 /* The guts of ix86_expand_vec_perm_builtin, also used by the ok hook.
33377 With all of the interface bits taken care of, perform the expansion
33378 in D and return true on success. */
33381 ix86_expand_vec_perm_builtin_1 (struct expand_vec_perm_d *d)
33383 /* Try a single instruction expansion. */
33384 if (expand_vec_perm_1 (d))
33387 /* Try sequences of two instructions. */
33389 if (expand_vec_perm_pshuflw_pshufhw (d))
33392 if (expand_vec_perm_palignr (d))
33395 if (expand_vec_perm_interleave2 (d))
33398 if (expand_vec_perm_broadcast (d))
33401 /* Try sequences of three instructions. */
33403 if (expand_vec_perm_pshufb2 (d))
33406 /* ??? Look for narrow permutations whose element orderings would
33407 allow the promotion to a wider mode. */
33409 /* ??? Look for sequences of interleave or a wider permute that place
33410 the data into the correct lanes for a half-vector shuffle like
33411 pshuf[lh]w or vpermilps. */
33413 /* ??? Look for sequences of interleave that produce the desired results.
33414 The combinatorics of punpck[lh] get pretty ugly... */
33416 if (expand_vec_perm_even_odd (d))
33422 /* Extract the values from the vector CST into the permutation array in D.
33423 Return 0 on error, 1 if all values from the permutation come from the
33424 first vector, 2 if all values from the second vector, and 3 otherwise. */
33427 extract_vec_perm_cst (struct expand_vec_perm_d *d, tree cst)
33429 tree list = TREE_VECTOR_CST_ELTS (cst);
33430 unsigned i, nelt = d->nelt;
33433 for (i = 0; i < nelt; ++i, list = TREE_CHAIN (list))
33435 unsigned HOST_WIDE_INT e;
33437 if (!host_integerp (TREE_VALUE (list), 1))
33439 e = tree_low_cst (TREE_VALUE (list), 1);
33443 ret |= (e < nelt ? 1 : 2);
33446 gcc_assert (list == NULL);
33448 /* For all elements from second vector, fold the elements to first. */
33450 for (i = 0; i < nelt; ++i)
33451 d->perm[i] -= nelt;
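/* Illustrative example: for a V4SI mask { 4, 5, 6, 7 } every element
comes from the second vector, so RET is 2 and the loop above folds
the permutation to { 0, 1, 2, 3 } on a single operand. */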
33457 ix86_expand_vec_perm_builtin (tree exp)
33459 struct expand_vec_perm_d d;
33460 tree arg0, arg1, arg2;
33462 arg0 = CALL_EXPR_ARG (exp, 0);
33463 arg1 = CALL_EXPR_ARG (exp, 1);
33464 arg2 = CALL_EXPR_ARG (exp, 2);
33466 d.vmode = TYPE_MODE (TREE_TYPE (arg0));
33467 d.nelt = GET_MODE_NUNITS (d.vmode);
33468 d.testing_p = false;
33469 gcc_assert (VECTOR_MODE_P (d.vmode));
33471 if (TREE_CODE (arg2) != VECTOR_CST)
33473 error_at (EXPR_LOCATION (exp),
33474 "vector permutation requires vector constant");
33478 switch (extract_vec_perm_cst (&d, arg2))
33484 error_at (EXPR_LOCATION (exp), "invalid vector permutation constant");
33488 if (!operand_equal_p (arg0, arg1, 0))
33490 d.op0 = expand_expr (arg0, NULL_RTX, d.vmode, EXPAND_NORMAL);
33491 d.op0 = force_reg (d.vmode, d.op0);
33492 d.op1 = expand_expr (arg1, NULL_RTX, d.vmode, EXPAND_NORMAL);
33493 d.op1 = force_reg (d.vmode, d.op1);
33497 /* The elements of PERM do not suggest that only the first operand
33498 is used, but both operands are identical. Allow easier matching
33499 of the permutation by folding the permutation into the single
33500 input vector. */
33502 unsigned i, nelt = d.nelt;
33503 for (i = 0; i < nelt; ++i)
33504 if (d.perm[i] >= nelt)
33510 d.op0 = expand_expr (arg0, NULL_RTX, d.vmode, EXPAND_NORMAL);
33511 d.op0 = force_reg (d.vmode, d.op0);
33516 d.op0 = expand_expr (arg1, NULL_RTX, d.vmode, EXPAND_NORMAL);
33517 d.op0 = force_reg (d.vmode, d.op0);
33522 d.target = gen_reg_rtx (d.vmode);
33523 if (ix86_expand_vec_perm_builtin_1 (&d))
33526 /* For compiler generated permutations, we should never get here, because
33527 the compiler should also be checking the ok hook. But since this is a
33528 builtin the user has access to, don't abort. */
33532 sorry ("vector permutation (%d %d)", d.perm[0], d.perm[1]);
33535 sorry ("vector permutation (%d %d %d %d)",
33536 d.perm[0], d.perm[1], d.perm[2], d.perm[3]);
33539 sorry ("vector permutation (%d %d %d %d %d %d %d %d)",
33540 d.perm[0], d.perm[1], d.perm[2], d.perm[3],
33541 d.perm[4], d.perm[5], d.perm[6], d.perm[7]);
33544 sorry ("vector permutation "
33545 "(%d %d %d %d %d %d %d %d %d %d %d %d %d %d %d %d)",
33546 d.perm[0], d.perm[1], d.perm[2], d.perm[3],
33547 d.perm[4], d.perm[5], d.perm[6], d.perm[7],
33548 d.perm[8], d.perm[9], d.perm[10], d.perm[11],
33549 d.perm[12], d.perm[13], d.perm[14], d.perm[15]);
33552 gcc_unreachable ();
33555 return CONST0_RTX (d.vmode);
33558 /* Implement targetm.vectorize.builtin_vec_perm_ok. */
33561 ix86_vectorize_builtin_vec_perm_ok (tree vec_type, tree mask)
33563 struct expand_vec_perm_d d;
33567 d.vmode = TYPE_MODE (vec_type);
33568 d.nelt = GET_MODE_NUNITS (d.vmode);
33569 d.testing_p = true;
33571 /* Given sufficient ISA support we can just return true here
33572 for selected vector modes. */
33573 if (GET_MODE_SIZE (d.vmode) == 16)
33575 /* All implementable with a single vpperm insn. */
33578 /* All implementable with 2 pshufb + 1 ior. */
33581 /* All implementable with shufpd or unpck[lh]pd. */
33586 vec_mask = extract_vec_perm_cst (&d, mask);
33588 /* This hook cannot be called in response to something that the
33589 user does (unlike the builtin expander), so we shouldn't ever see
33590 an error generated from the extract. */
33591 gcc_assert (vec_mask > 0 && vec_mask <= 3);
33592 one_vec = (vec_mask != 3);
33594 /* Implementable with shufps or pshufd. */
33595 if (one_vec && (d.vmode == V4SFmode || d.vmode == V4SImode))
33598 /* Otherwise we have to go through the motions and see if we can
33599 figure out how to generate the requested permutation. */
33600 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
33601 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
33603 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
33606 ret = ix86_expand_vec_perm_builtin_1 (&d);
33613 ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
33615 struct expand_vec_perm_d d;
33621 d.vmode = GET_MODE (targ);
33622 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
33623 d.testing_p = false;
33625 for (i = 0; i < nelt; ++i)
33626 d.perm[i] = i * 2 + odd;
33628 /* We'll either be able to implement the permutation directly... */
33629 if (expand_vec_perm_1 (&d))
33632 /* ... or we use the special-case patterns. */
33633 expand_vec_perm_even_odd_1 (&d, odd);
33636 /* Expand an insert into a vector register through pinsr insn.
33637 Return true if successful. */
33640 ix86_expand_pinsr (rtx *operands)
33642 rtx dst = operands[0];
33643 rtx src = operands[3];
33645 unsigned int size = INTVAL (operands[1]);
33646 unsigned int pos = INTVAL (operands[2]);
33648 if (GET_CODE (dst) == SUBREG)
33650 pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
33651 dst = SUBREG_REG (dst);
33654 if (GET_CODE (src) == SUBREG)
33655 src = SUBREG_REG (src);
33657 switch (GET_MODE (dst))
33664 enum machine_mode srcmode, dstmode;
33665 rtx (*pinsr)(rtx, rtx, rtx, rtx);
33667 srcmode = mode_for_size (size, MODE_INT, 0);
33672 if (!TARGET_SSE4_1)
33674 dstmode = V16QImode;
33675 pinsr = gen_sse4_1_pinsrb;
33681 dstmode = V8HImode;
33682 pinsr = gen_sse2_pinsrw;
33686 if (!TARGET_SSE4_1)
33688 dstmode = V4SImode;
33689 pinsr = gen_sse4_1_pinsrd;
33693 gcc_assert (TARGET_64BIT);
33694 if (!TARGET_SSE4_1)
33696 dstmode = V2DImode;
33697 pinsr = gen_sse4_1_pinsrq;
33704 dst = gen_lowpart (dstmode, dst);
33705 src = gen_lowpart (srcmode, src);
33707 pos /= size;
33709 emit_insn (pinsr (dst, dst, src, GEN_INT (1 << pos)));
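/* Illustrative example: inserting a QImode value at bit position 8
selects V16QImode and (with SSE4.1) pinsrb; POS is scaled from a bit
offset to an element index first, so the immediate above is 1 << 1,
i.e. byte lane 1. */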
33718 /* This function returns the calling-ABI-specific va_list type node.
33719 It returns the FNDECL-specific va_list type. */
33722 ix86_fn_abi_va_list (tree fndecl)
33725 return va_list_type_node;
33726 gcc_assert (fndecl != NULL_TREE);
33728 if (ix86_function_abi ((const_tree) fndecl) == MS_ABI)
33729 return ms_va_list_type_node;
33731 return sysv_va_list_type_node;
33734 /* Returns the canonical va_list type specified by TYPE. If there
33735 is no valid TYPE provided, it returns NULL_TREE. */
33738 ix86_canonical_va_list_type (tree type)
33742 /* Resolve references and pointers to va_list type. */
33743 if (TREE_CODE (type) == MEM_REF)
33744 type = TREE_TYPE (type);
33745 else if (POINTER_TYPE_P (type) && POINTER_TYPE_P (TREE_TYPE(type)))
33746 type = TREE_TYPE (type);
33747 else if (POINTER_TYPE_P (type) && TREE_CODE (TREE_TYPE (type)) == ARRAY_TYPE)
33748 type = TREE_TYPE (type);
33750 if (TARGET_64BIT && va_list_type_node != NULL_TREE)
33752 wtype = va_list_type_node;
33753 gcc_assert (wtype != NULL_TREE);
33755 if (TREE_CODE (wtype) == ARRAY_TYPE)
33757 /* If va_list is an array type, the argument may have decayed
33758 to a pointer type, e.g. by being passed to another function.
33759 In that case, unwrap both types so that we can compare the
33760 underlying records. */
33761 if (TREE_CODE (htype) == ARRAY_TYPE
33762 || POINTER_TYPE_P (htype))
33764 wtype = TREE_TYPE (wtype);
33765 htype = TREE_TYPE (htype);
33768 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
33769 return va_list_type_node;
33770 wtype = sysv_va_list_type_node;
33771 gcc_assert (wtype != NULL_TREE);
33773 if (TREE_CODE (wtype) == ARRAY_TYPE)
33775 /* If va_list is an array type, the argument may have decayed
33776 to a pointer type, e.g. by being passed to another function.
33777 In that case, unwrap both types so that we can compare the
33778 underlying records. */
33779 if (TREE_CODE (htype) == ARRAY_TYPE
33780 || POINTER_TYPE_P (htype))
33782 wtype = TREE_TYPE (wtype);
33783 htype = TREE_TYPE (htype);
33786 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
33787 return sysv_va_list_type_node;
33788 wtype = ms_va_list_type_node;
33789 gcc_assert (wtype != NULL_TREE);
33791 if (TREE_CODE (wtype) == ARRAY_TYPE)
33793 /* If va_list is an array type, the argument may have decayed
33794 to a pointer type, e.g. by being passed to another function.
33795 In that case, unwrap both types so that we can compare the
33796 underlying records. */
33797 if (TREE_CODE (htype) == ARRAY_TYPE
33798 || POINTER_TYPE_P (htype))
33800 wtype = TREE_TYPE (wtype);
33801 htype = TREE_TYPE (htype);
33804 if (TYPE_MAIN_VARIANT (wtype) == TYPE_MAIN_VARIANT (htype))
33805 return ms_va_list_type_node;
33808 return std_canonical_va_list_type (type);
33811 /* Iterate through the target-specific builtin types for va_list.
33812 IDX denotes the iterator, *PTREE is set to the result type of
33813 the va_list builtin, and *PNAME to its internal type.
33814 Returns zero if there is no element for this index, otherwise
33815 IDX should be increased upon the next call.
33816 Note, do not iterate a base builtin's name like __builtin_va_list.
33817 Used from c_common_nodes_and_builtins. */
33820 ix86_enum_va_list (int idx, const char **pname, tree *ptree)
33830 *ptree = ms_va_list_type_node;
33831 *pname = "__builtin_ms_va_list";
33835 *ptree = sysv_va_list_type_node;
33836 *pname = "__builtin_sysv_va_list";
33844 #undef TARGET_SCHED_DISPATCH
33845 #define TARGET_SCHED_DISPATCH has_dispatch
33846 #undef TARGET_SCHED_DISPATCH_DO
33847 #define TARGET_SCHED_DISPATCH_DO do_dispatch
33849 /* The size of the dispatch window is the total number of bytes of
33850 object code allowed in a window. */
33851 #define DISPATCH_WINDOW_SIZE 16
33853 /* Number of dispatch windows considered for scheduling. */
33854 #define MAX_DISPATCH_WINDOWS 3
33856 /* Maximum number of instructions in a window. */
33857 #define MAX_INSN 4
33859 /* Maximum number of immediate operands in a window. */
33860 #define MAX_IMM 4
33862 /* Maximum number of immediate bits allowed in a window. */
33863 #define MAX_IMM_SIZE 128
33865 /* Maximum number of 32 bit immediates allowed in a window. */
33866 #define MAX_IMM_32 4
33868 /* Maximum number of 64 bit immediates allowed in a window. */
33869 #define MAX_IMM_64 2
33871 /* Maximum total of loads or prefetches allowed in a window. */
33872 #define MAX_LOAD 2
33874 /* Maximum total of stores allowed in a window. */
33875 #define MAX_STORE 1
33877 #undef BIG
33878 #define BIG 100
33881 /* Dispatch groups. Instructions that affect the mix in a dispatch window. */
33882 enum dispatch_group {
33897 /* Number of allowable groups in a dispatch window. It is an array
33898 indexed by dispatch_group enum. 100 is used as a big number,
33899 because the number of these kinds of operations does not have any
33900 effect in a dispatch window, but we need them for other reasons in
33901 a window. */
33902 static unsigned int num_allowable_groups[disp_last] = {
33903 0, 2, 1, 1, 2, 4, 4, 2, 1, BIG, BIG
33906 char group_name[disp_last + 1][16] = {
33907 "disp_no_group", "disp_load", "disp_store", "disp_load_store",
33908 "disp_prefetch", "disp_imm", "disp_imm_32", "disp_imm_64",
33909 "disp_branch", "disp_cmp", "disp_jcc", "disp_last"
33912 /* Instruction path. */
33915 path_single, /* Single micro op. */
33916 path_double, /* Double micro op. */
33917 path_multi, /* Instructions with more than 2 micro ops. */
33921 /* sched_insn_info defines a window to the instructions scheduled in
33922 the basic block. It contains a pointer to the insn_info table and
33923 the instruction scheduled.
33925 Windows are allocated for each basic block and are linked
33926 with each other. */
33927 typedef struct sched_insn_info_s {
33929 enum dispatch_group group;
33930 enum insn_path path;
33935 /* Linked list of dispatch windows. This is a two way list of
33936 dispatch windows of a basic block. It contains information about
33937 the number of uops in the window and the total number of
33938 instructions and of bytes in the object code for this dispatch
33939 window. */
33940 typedef struct dispatch_windows_s {
33941 int num_insn; /* Number of insn in the window. */
33942 int num_uops; /* Number of uops in the window. */
33943 int window_size; /* Number of bytes in the window. */
33944 int window_num; /* Window number, 0 or 1. */
33945 int num_imm; /* Number of immediates in an insn. */
33946 int num_imm_32; /* Number of 32 bit immediates in an insn. */
33947 int num_imm_64; /* Number of 64 bit immediates in an insn. */
33948 int imm_size; /* Total immediates in the window. */
33949 int num_loads; /* Total memory loads in the window. */
33950 int num_stores; /* Total memory stores in the window. */
33951 int violation; /* Violation exists in window. */
33952 sched_insn_info *window; /* Pointer to the window. */
33953 struct dispatch_windows_s *next;
33954 struct dispatch_windows_s *prev;
33955 } dispatch_windows;
33957 /* Immediate values used in an insn. */
33958 typedef struct imm_info_s
33965 static dispatch_windows *dispatch_window_list;
33966 static dispatch_windows *dispatch_window_list1;
33968 /* Get dispatch group of insn. */
33970 static enum dispatch_group
33971 get_mem_group (rtx insn)
33973 enum attr_memory memory;
33975 if (INSN_CODE (insn) < 0)
33976 return disp_no_group;
33977 memory = get_attr_memory (insn);
33978 if (memory == MEMORY_STORE)
33981 if (memory == MEMORY_LOAD)
33984 if (memory == MEMORY_BOTH)
33985 return disp_load_store;
33987 return disp_no_group;
33990 /* Return true if insn is a compare instruction. */
33992 static bool
33993 is_cmp (rtx insn)
33995 enum attr_type type;
33997 type = get_attr_type (insn);
33998 return (type == TYPE_TEST
33999 || type == TYPE_ICMP
34000 || type == TYPE_FCMP
34001 || GET_CODE (PATTERN (insn)) == COMPARE);
34004 /* Return true if a dispatch violation was encountered. */
34007 dispatch_violation (void)
34009 if (dispatch_window_list->next)
34010 return dispatch_window_list->next->violation;
34011 return dispatch_window_list->violation;
34014 /* Return true if insn is a branch instruction. */
34017 is_branch (rtx insn)
34019 return (CALL_P (insn) || JUMP_P (insn));
34022 /* Return true if insn is a prefetch instruction. */
34025 is_prefetch (rtx insn)
34027 return NONJUMP_INSN_P (insn) && GET_CODE (PATTERN (insn)) == PREFETCH;
34030 /* This function initializes a dispatch window and the list container holding a
34031 pointer to the window. */
34034 init_window (int window_num)
34037 dispatch_windows *new_list;
34039 if (window_num == 0)
34040 new_list = dispatch_window_list;
34042 new_list = dispatch_window_list1;
34044 new_list->num_insn = 0;
34045 new_list->num_uops = 0;
34046 new_list->window_size = 0;
34047 new_list->next = NULL;
34048 new_list->prev = NULL;
34049 new_list->window_num = window_num;
34050 new_list->num_imm = 0;
34051 new_list->num_imm_32 = 0;
34052 new_list->num_imm_64 = 0;
34053 new_list->imm_size = 0;
34054 new_list->num_loads = 0;
34055 new_list->num_stores = 0;
34056 new_list->violation = false;
34058 for (i = 0; i < MAX_INSN; i++)
34060 new_list->window[i].insn = NULL;
34061 new_list->window[i].group = disp_no_group;
34062 new_list->window[i].path = no_path;
34063 new_list->window[i].byte_len = 0;
34064 new_list->window[i].imm_bytes = 0;
34069 /* This function allocates and initializes a dispatch window and the
34070 list container holding a pointer to the window. */
34072 static dispatch_windows *
34073 allocate_window (void)
34075 dispatch_windows *new_list = XNEW (struct dispatch_windows_s);
34076 new_list->window = XNEWVEC (struct sched_insn_info_s, MAX_INSN + 1);
34081 /* This routine initializes the dispatch scheduling information. It
34082 initiates building dispatch scheduler tables and constructs the
34083 first dispatch window. */
34086 init_dispatch_sched (void)
34088 /* Allocate a dispatch list and a window. */
34089 dispatch_window_list = allocate_window ();
34090 dispatch_window_list1 = allocate_window ();
34095 /* This function returns true if a branch is detected. End of a basic block
34096 does not have to be a branch, but here we assume only branches end a
34097 basic block. */
34100 is_end_basic_block (enum dispatch_group group)
34102 return group == disp_branch;
34105 /* This function is called when the end of a window's processing is reached. */
34108 process_end_window (void)
34110 gcc_assert (dispatch_window_list->num_insn <= MAX_INSN);
34111 if (dispatch_window_list->next)
34113 gcc_assert (dispatch_window_list1->num_insn <= MAX_INSN);
34114 gcc_assert (dispatch_window_list->window_size
34115 + dispatch_window_list1->window_size <= 48);
34121 /* Allocates a new dispatch window and adds it to WINDOW_LIST.
34122 WINDOW_NUM is either 0 or 1. A maximum of two windows are generated
34123 for 48 bytes of instructions. Note that these windows are not dispatch
34124 windows; their sizes are not DISPATCH_WINDOW_SIZE. */
34126 static dispatch_windows *
34127 allocate_next_window (int window_num)
34129 if (window_num == 0)
34131 if (dispatch_window_list->next)
34134 return dispatch_window_list;
34137 dispatch_window_list->next = dispatch_window_list1;
34138 dispatch_window_list1->prev = dispatch_window_list;
34140 return dispatch_window_list1;
34143 /* Increment the number of immediate operands of an instruction. */
34146 find_constant_1 (rtx *in_rtx, imm_info *imm_values)
34151 switch ( GET_CODE (*in_rtx))
34156 (imm_values->imm)++;
34157 if (x86_64_immediate_operand (*in_rtx, SImode))
34158 (imm_values->imm32)++;
34160 (imm_values->imm64)++;
34164 (imm_values->imm)++;
34165 (imm_values->imm64)++;
34169 if (LABEL_KIND (*in_rtx) == LABEL_NORMAL)
34171 (imm_values->imm)++;
34172 (imm_values->imm32)++;
34183 /* Compute number of immediate operands of an instruction. */
34186 find_constant (rtx in_rtx, imm_info *imm_values)
34188 for_each_rtx (INSN_P (in_rtx) ? &PATTERN (in_rtx) : &in_rtx,
34189 (rtx_function) find_constant_1, (void *) imm_values);
34192 /* Return total size of immediate operands of an instruction along with number
34193 of corresponding immediate-operands. It initializes its parameters to zero
34194 before calling FIND_CONSTANT.
34195 INSN is the input instruction. IMM is the total of immediates.
34196 IMM32 is the number of 32 bit immediates. IMM64 is the number of 64
34197 bit immediates. */
34200 get_num_immediates (rtx insn, int *imm, int *imm32, int *imm64)
34202 imm_info imm_values = {0, 0, 0};
34204 find_constant (insn, &imm_values);
34205 *imm = imm_values.imm;
34206 *imm32 = imm_values.imm32;
34207 *imm64 = imm_values.imm64;
34208 return imm_values.imm32 * 4 + imm_values.imm64 * 8;
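/* Illustrative example: an insn with one 32-bit and one 64-bit
immediate yields *IMM = 2, *IMM32 = 1, *IMM64 = 1 and a return
value of 4 + 8 = 12 bytes of immediate data. */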
34211 /* This function indicates if an operand of an instruction is an
34212 immediate. */
34215 has_immediate (rtx insn)
34217 int num_imm_operand;
34218 int num_imm32_operand;
34219 int num_imm64_operand;
34222 return get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
34223 &num_imm64_operand);
34227 /* Return single or double path for instructions. */
34229 static enum insn_path
34230 get_insn_path (rtx insn)
34232 enum attr_amdfam10_decode path = get_attr_amdfam10_decode (insn);
34234 if ((int)path == 0)
34235 return path_single;
34237 if ((int)path == 1)
34238 return path_double;
34240 return path_multi;
34243 /* Return insn dispatch group. */
34245 static enum dispatch_group
34246 get_insn_group (rtx insn)
34248 enum dispatch_group group = get_mem_group (insn);
34252 if (is_branch (insn))
34253 return disp_branch;
34258 if (has_immediate (insn))
34261 if (is_prefetch (insn))
34262 return disp_prefetch;
34264 return disp_no_group;
34267 /* Count number of GROUP restricted instructions in a dispatch
34268 window WINDOW_LIST. */
34271 count_num_restricted (rtx insn, dispatch_windows *window_list)
34273 enum dispatch_group group = get_insn_group (insn);
34275 int num_imm_operand;
34276 int num_imm32_operand;
34277 int num_imm64_operand;
34279 if (group == disp_no_group)
34282 if (group == disp_imm)
34284 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
34285 &num_imm64_operand);
34286 if (window_list->imm_size + imm_size > MAX_IMM_SIZE
34287 || num_imm_operand + window_list->num_imm > MAX_IMM
34288 || (num_imm32_operand > 0
34289 && (window_list->num_imm_32 + num_imm32_operand > MAX_IMM_32
34290 || window_list->num_imm_64 * 2 + num_imm32_operand > MAX_IMM_32))
34291 || (num_imm64_operand > 0
34292 && (window_list->num_imm_64 + num_imm64_operand > MAX_IMM_64
34293 || window_list->num_imm_32 + num_imm64_operand * 2 > MAX_IMM_32))
34294 || (window_list->imm_size + imm_size == MAX_IMM_SIZE
34295 && num_imm64_operand > 0
34296 && ((window_list->num_imm_64 > 0
34297 && window_list->num_insn >= 2)
34298 || window_list->num_insn >= 3)))
34304 if ((group == disp_load_store
34305 && (window_list->num_loads >= MAX_LOAD
34306 || window_list->num_stores >= MAX_STORE))
34307 || ((group == disp_load
34308 || group == disp_prefetch)
34309 && window_list->num_loads >= MAX_LOAD)
34310 || (group == disp_store
34311 && window_list->num_stores >= MAX_STORE))
34317 /* This function returns true if insn satisfies dispatch rules on the
34318 last window scheduled. */
34321 fits_dispatch_window (rtx insn)
34323 dispatch_windows *window_list = dispatch_window_list;
34324 dispatch_windows *window_list_next = dispatch_window_list->next;
34325 unsigned int num_restrict;
34326 enum dispatch_group group = get_insn_group (insn);
34327 enum insn_path path = get_insn_path (insn);
34330 /* Make disp_cmp and disp_jcc get scheduled at the latest. These
34331 instructions should be given the lowest priority in the
34332 scheduling process in the Haifa scheduler to make sure they will be
34333 scheduled in the same dispatch window as the reference to them. */
34334 if (group == disp_jcc || group == disp_cmp)
34337 /* Check nonrestricted. */
34338 if (group == disp_no_group || group == disp_branch)
34341 /* Get last dispatch window. */
34342 if (window_list_next)
34343 window_list = window_list_next;
34345 if (window_list->window_num == 1)
34347 sum = window_list->prev->window_size + window_list->window_size;
34349 if (sum == 32
34350 || (min_insn_size (insn) + sum) >= 48)
34351 /* Window 1 is full. Go for next window. */
34355 num_restrict = count_num_restricted (insn, window_list);
34357 if (num_restrict > num_allowable_groups[group])
34360 /* See if it fits in the first window. */
34361 if (window_list->window_num == 0)
34363 /* The first window should have only single and double path
34364 uops in it. */
34365 if (path == path_double
34366 && (window_list->num_uops + 2) > MAX_INSN)
34368 else if (path != path_single)
34374 /* Add an instruction INSN with NUM_UOPS micro-operations to the
34375 dispatch window WINDOW_LIST. */
34378 add_insn_window (rtx insn, dispatch_windows *window_list, int num_uops)
34380 int byte_len = min_insn_size (insn);
34381 int num_insn = window_list->num_insn;
34383 sched_insn_info *window = window_list->window;
34384 enum dispatch_group group = get_insn_group (insn);
34385 enum insn_path path = get_insn_path (insn);
34386 int num_imm_operand;
34387 int num_imm32_operand;
34388 int num_imm64_operand;
34390 if (!window_list->violation && group != disp_cmp
34391 && !fits_dispatch_window (insn))
34392 window_list->violation = true;
34394 imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
34395 &num_imm64_operand);
34397 /* Initialize window with new instruction. */
34398 window[num_insn].insn = insn;
34399 window[num_insn].byte_len = byte_len;
34400 window[num_insn].group = group;
34401 window[num_insn].path = path;
34402 window[num_insn].imm_bytes = imm_size;
34404 window_list->window_size += byte_len;
34405 window_list->num_insn = num_insn + 1;
34406 window_list->num_uops = window_list->num_uops + num_uops;
34407 window_list->imm_size += imm_size;
34408 window_list->num_imm += num_imm_operand;
34409 window_list->num_imm_32 += num_imm32_operand;
34410 window_list->num_imm_64 += num_imm64_operand;
34412 if (group == disp_store)
34413 window_list->num_stores += 1;
34414 else if (group == disp_load
34415 || group == disp_prefetch)
34416 window_list->num_loads += 1;
34417 else if (group == disp_load_store)
34419 window_list->num_stores += 1;
34420 window_list->num_loads += 1;
34424 /* Adds a scheduled instruction, INSN, to the current dispatch window.
34425 If the total bytes of instructions or the number of instructions in
34426 the window exceeds the allowed maximum, it allocates a new window. */
34429 add_to_dispatch_window (rtx insn)
34432 dispatch_windows *window_list;
34433 dispatch_windows *next_list;
34434 dispatch_windows *window0_list;
34435 enum insn_path path;
34436 enum dispatch_group insn_group;
34444 if (INSN_CODE (insn) < 0)
34447 byte_len = min_insn_size (insn);
34448 window_list = dispatch_window_list;
34449 next_list = window_list->next;
34450 path = get_insn_path (insn);
34451 insn_group = get_insn_group (insn);
34453 /* Get the last dispatch window. */
34455 window_list = dispatch_window_list->next;
34457 if (path == path_single)
34459 else if (path == path_double)
34462 insn_num_uops = (int) path;
34464 /* If current window is full, get a new window.
34465 Window number zero is full if MAX_INSN uops are scheduled in it.
34466 Window number one is full if window zero's bytes plus window
34467 one's bytes equal 32, or if adding the bytes of the new instruction
34468 makes the total greater than 48, or if it already has MAX_INSN
34469 instructions in it. */
34470 num_insn = window_list->num_insn;
34471 num_uops = window_list->num_uops;
34472 window_num = window_list->window_num;
34473 insn_fits = fits_dispatch_window (insn);
34475 if (num_insn >= MAX_INSN
34476 || num_uops + insn_num_uops > MAX_INSN
34477 || !(insn_fits))
34479 window_num = ~window_num & 1;
34480 window_list = allocate_next_window (window_num);
34483 if (window_num == 0)
34485 add_insn_window (insn, window_list, insn_num_uops);
34486 if (window_list->num_insn >= MAX_INSN
34487 && insn_group == disp_branch)
34489 process_end_window ();
34493 else if (window_num == 1)
34495 window0_list = window_list->prev;
34496 sum = window0_list->window_size + window_list->window_size;
34497 if (sum == 32
34498 || (byte_len + sum) >= 48)
34500 process_end_window ();
34501 window_list = dispatch_window_list;
34504 add_insn_window (insn, window_list, insn_num_uops);
34507 gcc_unreachable ();
34509 if (is_end_basic_block (insn_group))
34511 /* The end of a basic block is reached; do end-basic-block processing. */
34512 process_end_window ();
34517 /* Print the dispatch window, WINDOW_NUM, to FILE. */
34519 DEBUG_FUNCTION static void
34520 debug_dispatch_window_file (FILE *file, int window_num)
34522 dispatch_windows *list;
34525 if (window_num == 0)
34526 list = dispatch_window_list;
34528 list = dispatch_window_list1;
34530 fprintf (file, "Window #%d:\n", list->window_num);
34531 fprintf (file, " num_insn = %d, num_uops = %d, window_size = %d\n",
34532 list->num_insn, list->num_uops, list->window_size);
34533 fprintf (file, " num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
34534 list->num_imm, list->num_imm_32, list->num_imm_64, list->imm_size);
34536 fprintf (file, " num_loads = %d, num_stores = %d\n", list->num_loads,
34538 fprintf (file, " insn info:\n");
34540 for (i = 0; i < MAX_INSN; i++)
34542 if (!list->window[i].insn)
34544 fprintf (file, " group[%d] = %s, insn[%d] = %p, path[%d] = %d byte_len[%d] = %d, imm_bytes[%d] = %d\n",
34545 i, group_name[list->window[i].group],
34546 i, (void *)list->window[i].insn,
34547 i, list->window[i].path,
34548 i, list->window[i].byte_len,
34549 i, list->window[i].imm_bytes);
/* Print a dispatch window to stdout.  */

DEBUG_FUNCTION void
debug_dispatch_window (int window_num)
{
  debug_dispatch_window_file (stdout, window_num);
}
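
/* The DEBUG_FUNCTION helpers here are intended to be called by hand
   from a debugger while stepping through the dispatch scheduler,
   e.g.:

     (gdb) call debug_dispatch_window (0)
     (gdb) call debug_ready_dispatch ()
*/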
/* Print INSN dispatch information to FILE.  */

DEBUG_FUNCTION static void
debug_insn_dispatch_info_file (FILE *file, rtx insn)
{
  int byte_len;
  enum insn_path path;
  enum dispatch_group group;
  int imm_size;
  int num_imm_operand;
  int num_imm32_operand;
  int num_imm64_operand;

  if (INSN_CODE (insn) < 0)
    return;

  byte_len = min_insn_size (insn);
  path = get_insn_path (insn);
  group = get_insn_group (insn);
  imm_size = get_num_immediates (insn, &num_imm_operand, &num_imm32_operand,
                                 &num_imm64_operand);

  fprintf (file, " insn info:\n");
  fprintf (file, "  group = %s, path = %d, byte_len = %d\n",
           group_name[group], path, byte_len);
  fprintf (file, "  num_imm = %d, num_imm_32 = %d, num_imm_64 = %d, imm_size = %d\n",
           num_imm_operand, num_imm32_operand, num_imm64_operand, imm_size);
}
/* Print to stdout the status of the ready list with respect to
   dispatch windows.  */

DEBUG_FUNCTION void
debug_ready_dispatch (void)
{
  int i;
  int no_ready = number_in_ready ();

  fprintf (stdout, "Number of ready: %d\n", no_ready);

  for (i = 0; i < no_ready; i++)
    debug_insn_dispatch_info_file (stdout, get_ready_element (i));
}
/* This routine is the driver of the dispatch scheduler.  */

static void
do_dispatch (rtx insn, int mode)
{
  if (mode == DISPATCH_INIT)
    init_dispatch_sched ();
  else if (mode == ADD_TO_DISPATCH_WINDOW)
    add_to_dispatch_window (insn);
}
/* Return TRUE if Dispatch Scheduling is supported.  */

static bool
has_dispatch (rtx insn, int action)
{
  if ((ix86_tune == PROCESSOR_BDVER1 || ix86_tune == PROCESSOR_BDVER2)
      && flag_dispatch_scheduler)
    switch (action)
      {
      default:
        return false;
      case IS_DISPATCH_ON:
        return true;
      case IS_CMP:
        return is_cmp (insn);
      case DISPATCH_VIOLATION:
        return dispatch_violation ();
      case FITS_DISPATCH_WINDOW:
        return fits_dispatch_window (insn);
      }
  return false;
}
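
/* A minimal sketch of how the generic scheduler is expected to drive
   the two entry points above; the real call sites live outside this
   file (via the TARGET_SCHED_DISPATCH hooks), so this is illustration
   only and is kept under #if 0.  */
#if 0
  if (has_dispatch (insn, IS_DISPATCH_ON))
    {
      /* Reset the dispatch window state once ...  */
      do_dispatch (NULL_RTX, DISPATCH_INIT);
      /* ... then account for each instruction as it is scheduled.  */
      do_dispatch (insn, ADD_TO_DISPATCH_WINDOW);
    }
#endif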
/* ??? No autovectorization into MMX or 3DNOW until we can reliably
   place emms and femms instructions.  */

static enum machine_mode
ix86_preferred_simd_mode (enum machine_mode mode)
{
  if (!TARGET_SSE)
    return word_mode;

  switch (mode)
    {
    case QImode:
      return TARGET_AVX2 ? V32QImode : V16QImode;
    case HImode:
      return TARGET_AVX2 ? V16HImode : V8HImode;
    case SImode:
      return TARGET_AVX2 ? V8SImode : V4SImode;
    case DImode:
      return TARGET_AVX2 ? V4DImode : V2DImode;

    case SFmode:
      if (TARGET_AVX && !TARGET_PREFER_AVX128)
        return V8SFmode;
      else
        return V4SFmode;

    case DFmode:
      if (!TARGET_VECTORIZE_DOUBLE)
        return word_mode;
      else if (TARGET_AVX && !TARGET_PREFER_AVX128)
        return V4DFmode;
      else if (TARGET_SSE2)
        return V2DFmode;
      return word_mode;

    default:
      return word_mode;
    }
}
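
/* TARGET_PREFER_AVX128 reflects the -mprefer-avx128 option, which
   keeps auto-vectorization at 128 bits even when the 256-bit AVX
   modes above are available.  */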
/* If AVX is enabled then try vectorizing with both 256bit and 128bit
   vectors.  */

static unsigned int
ix86_autovectorize_vector_sizes (void)
{
  return (TARGET_AVX && !TARGET_PREFER_AVX128) ? 32 | 16 : 0;
}
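
/* The returned value is a bit mask of the supported vector sizes in
   bytes: 32 | 16 lets the vectorizer try 256-bit vectors first and
   fall back to 128-bit ones, while 0 means only the size implied by
   the preferred SIMD mode is tried.  */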
/* Initialize the GCC target structure.  */
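/* target-def.h supplies a default definition for every TARGET_* hook
   macro, and TARGET_INITIALIZER (expanded at the bottom of this file)
   builds the targetm vector from those macros.  Each #undef/#define
   pair below therefore swaps one default hook for its i386-specific
   implementation before that expansion happens.  */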
#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY ix86_return_in_memory

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS ix86_legitimize_address

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
#if TARGET_DLLIMPORT_DECL_ATTRIBUTES
# undef TARGET_MERGE_DECL_ATTRIBUTES
# define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
#endif
#undef TARGET_COMP_TYPE_ATTRIBUTES
#define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS ix86_init_builtins
#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL ix86_builtin_decl
#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN ix86_expand_builtin

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
  ix86_builtin_vectorized_function

#undef TARGET_VECTORIZE_BUILTIN_CONVERSION
#define TARGET_VECTORIZE_BUILTIN_CONVERSION ix86_vectorize_builtin_conversion

#undef TARGET_BUILTIN_RECIPROCAL
#define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal

#undef TARGET_ASM_FUNCTION_EPILOGUE
#define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue

#undef TARGET_ENCODE_SECTION_INFO
#ifndef SUBTARGET_ENCODE_SECTION_INFO
#define TARGET_ENCODE_SECTION_INFO ix86_encode_section_info
#else
#define TARGET_ENCODE_SECTION_INFO SUBTARGET_ENCODE_SECTION_INFO
#endif
#undef TARGET_ASM_OPEN_PAREN
#define TARGET_ASM_OPEN_PAREN ""
#undef TARGET_ASM_CLOSE_PAREN
#define TARGET_ASM_CLOSE_PAREN ""

#undef TARGET_ASM_BYTE_OP
#define TARGET_ASM_BYTE_OP ASM_BYTE

#undef TARGET_ASM_ALIGNED_HI_OP
#define TARGET_ASM_ALIGNED_HI_OP ASM_SHORT
#undef TARGET_ASM_ALIGNED_SI_OP
#define TARGET_ASM_ALIGNED_SI_OP ASM_LONG
#ifdef ASM_QUAD
#undef TARGET_ASM_ALIGNED_DI_OP
#define TARGET_ASM_ALIGNED_DI_OP ASM_QUAD
#endif
#undef TARGET_PROFILE_BEFORE_PROLOGUE
#define TARGET_PROFILE_BEFORE_PROLOGUE ix86_profile_before_prologue

#undef TARGET_ASM_UNALIGNED_HI_OP
#define TARGET_ASM_UNALIGNED_HI_OP TARGET_ASM_ALIGNED_HI_OP
#undef TARGET_ASM_UNALIGNED_SI_OP
#define TARGET_ASM_UNALIGNED_SI_OP TARGET_ASM_ALIGNED_SI_OP
#undef TARGET_ASM_UNALIGNED_DI_OP
#define TARGET_ASM_UNALIGNED_DI_OP TARGET_ASM_ALIGNED_DI_OP

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND ix86_print_operand
#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS ix86_print_operand_address
#undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
#define TARGET_PRINT_OPERAND_PUNCT_VALID_P ix86_print_operand_punct_valid_p
#undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
#define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA i386_asm_output_addr_const_extra

#undef TARGET_SCHED_INIT_GLOBAL
#define TARGET_SCHED_INIT_GLOBAL ix86_sched_init_global
#undef TARGET_SCHED_ADJUST_COST
#define TARGET_SCHED_ADJUST_COST ix86_adjust_cost
#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE ix86_issue_rate
#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  ia32_multipass_dfa_lookahead

#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall

#undef TARGET_HAVE_TLS
#define TARGET_HAVE_TLS true

#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM ix86_cannot_force_const_mem
#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true

#undef TARGET_DELEGITIMIZE_ADDRESS
#define TARGET_DELEGITIMIZE_ADDRESS ix86_delegitimize_address

#undef TARGET_MS_BITFIELD_LAYOUT_P
#define TARGET_MS_BITFIELD_LAYOUT_P ix86_ms_bitfield_layout_p
#if TARGET_MACHO
#undef TARGET_BINDS_LOCAL_P
#define TARGET_BINDS_LOCAL_P darwin_binds_local_p
#endif
#if TARGET_DLLIMPORT_DECL_ATTRIBUTES
#undef TARGET_BINDS_LOCAL_P
#define TARGET_BINDS_LOCAL_P i386_pe_binds_local_p
#endif
#undef TARGET_ASM_OUTPUT_MI_THUNK
#define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk
#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
#define TARGET_ASM_CAN_OUTPUT_MI_THUNK x86_can_output_mi_thunk

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START x86_file_start

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE ix86_option_override

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST ix86_register_move_cost
#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST ix86_memory_move_cost
#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS ix86_rtx_costs
#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST ix86_address_cost

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS ix86_fixed_condition_code_regs
#undef TARGET_CC_MODES_COMPATIBLE
#define TARGET_CC_MODES_COMPATIBLE ix86_cc_modes_compatible

#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG ix86_reorg

#undef TARGET_BUILTIN_SETJMP_FRAME_VALUE
#define TARGET_BUILTIN_SETJMP_FRAME_VALUE ix86_builtin_setjmp_frame_value

#undef TARGET_BUILD_BUILTIN_VA_LIST
#define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list

#undef TARGET_ENUM_VA_LIST_P
#define TARGET_ENUM_VA_LIST_P ix86_enum_va_list

#undef TARGET_FN_ABI_VA_LIST
#define TARGET_FN_ABI_VA_LIST ix86_fn_abi_va_list

#undef TARGET_CANONICAL_VA_LIST_TYPE
#define TARGET_CANONICAL_VA_LIST_TYPE ix86_canonical_va_list_type

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START ix86_va_start

#undef TARGET_MD_ASM_CLOBBERS
#define TARGET_MD_ASM_CLOBBERS ix86_md_asm_clobbers

#undef TARGET_PROMOTE_PROTOTYPES
#define TARGET_PROMOTE_PROTOTYPES hook_bool_const_tree_true
#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX ix86_struct_value_rtx
#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS ix86_setup_incoming_varargs
#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK ix86_must_pass_in_stack
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE ix86_function_arg_advance
#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG ix86_function_arg
#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY ix86_function_arg_boundary
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE ix86_pass_by_reference
#undef TARGET_INTERNAL_ARG_POINTER
#define TARGET_INTERNAL_ARG_POINTER ix86_internal_arg_pointer
#undef TARGET_UPDATE_STACK_BOUNDARY
#define TARGET_UPDATE_STACK_BOUNDARY ix86_update_stack_boundary
#undef TARGET_GET_DRAP_RTX
#define TARGET_GET_DRAP_RTX ix86_get_drap_rtx
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
#undef TARGET_STATIC_CHAIN
#define TARGET_STATIC_CHAIN ix86_static_chain
#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT ix86_trampoline_init
#undef TARGET_RETURN_POPS_ARGS
#define TARGET_RETURN_POPS_ARGS ix86_return_pops_args

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR ix86_gimplify_va_arg

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P ix86_scalar_mode_supported_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P ix86_vector_mode_supported_p

#undef TARGET_C_MODE_FOR_SUFFIX
#define TARGET_C_MODE_FOR_SUFFIX ix86_c_mode_for_suffix
#ifdef HAVE_AS_TLS
#undef TARGET_ASM_OUTPUT_DWARF_DTPREL
#define TARGET_ASM_OUTPUT_DWARF_DTPREL i386_output_dwarf_dtprel
#endif
#ifdef SUBTARGET_INSERT_ATTRIBUTES
#undef TARGET_INSERT_ATTRIBUTES
#define TARGET_INSERT_ATTRIBUTES SUBTARGET_INSERT_ATTRIBUTES
#endif
#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE ix86_mangle_type
#ifndef TARGET_MACHO
#undef TARGET_STACK_PROTECT_FAIL
#define TARGET_STACK_PROTECT_FAIL ix86_stack_protect_fail
#endif
#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE ix86_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P ix86_function_value_regno_p

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD ix86_secondary_reload

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS ix86_preferred_reload_class
#undef TARGET_PREFERRED_OUTPUT_RELOAD_CLASS
#define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
#undef TARGET_CLASS_LIKELY_SPILLED_P
#define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  ix86_builtin_vectorization_cost
#undef TARGET_VECTORIZE_BUILTIN_VEC_PERM
#define TARGET_VECTORIZE_BUILTIN_VEC_PERM \
  ix86_vectorize_builtin_vec_perm
#undef TARGET_VECTORIZE_BUILTIN_VEC_PERM_OK
#define TARGET_VECTORIZE_BUILTIN_VEC_PERM_OK \
  ix86_vectorize_builtin_vec_perm_ok
#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
  ix86_preferred_simd_mode
#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  ix86_autovectorize_vector_sizes

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION ix86_set_current_function

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P ix86_valid_target_attribute_p

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE ix86_function_specific_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE ix86_function_specific_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT ix86_function_specific_print

#undef TARGET_CAN_INLINE_P
#define TARGET_CAN_INLINE_P ix86_can_inline_p

#undef TARGET_EXPAND_TO_RTL_HOOK
#define TARGET_EXPAND_TO_RTL_HOOK ix86_maybe_switch_abi

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P ix86_legitimate_address_p

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P ix86_legitimate_constant_p

#undef TARGET_FRAME_POINTER_REQUIRED
#define TARGET_FRAME_POINTER_REQUIRED ix86_frame_pointer_required

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE ix86_can_eliminate

#undef TARGET_EXTRA_LIVE_ON_ENTRY
#define TARGET_EXTRA_LIVE_ON_ENTRY ix86_live_on_entry

#undef TARGET_ASM_CODE_END
#define TARGET_ASM_CODE_END ix86_code_end

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE ix86_conditional_register_usage
#if TARGET_MACHO
#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS darwin_rename_builtins
#endif
struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-i386.h"