/* Copyright (C) 2000, 2001, 2003, 2005 Free Software Foundation, Inc.
   Contributed by James E. Wilson.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING.  If not, write to
   the Free Software Foundation, 51 Franklin Street, Fifth Floor,
   Boston, MA 02110-1301, USA.  */

/* As a special exception, if you link this library with other files,
   some of which are compiled with GCC, to produce an executable,
   this library does not by itself cause the resulting executable
   to be covered by the GNU General Public License.
   This exception does not however invalidate any other reasons why
   the executable file might be covered by the GNU General Public
   License.  */

#ifdef L__divxf3
// Compute an 80-bit IEEE double-extended quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.
//
// __divtf3 is an alternate symbol name for backward compatibility.

	.text
	.align 16
	.global __divxf3
	.global __divtf3
	.proc __divxf3
__divxf3:
__divtf3:
	// Assume frcpa delivers the final result (p7); if it asks for
	// refinement instead (p6), clear p7 below.
	cmp.eq p7, p0 = r0, r0
	// f10 = y0, an initial approximation of 1/farg1.
	frcpa.s0 f10, p6 = farg0, farg1
	;;
(p6)	cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
	// Refine the reciprocal and the quotient with fused
	// multiply-adds until the last step rounds correctly.
(p6)	fnma.s1 f11 = farg1, f10, f1
(p6)	fma.s1 f12 = farg0, f10, f0
	;;
(p6)	fma.s1 f13 = f11, f11, f0
(p6)	fma.s1 f14 = f11, f11, f11
	;;
(p6)	fma.s1 f11 = f13, f13, f11
(p6)	fma.s1 f13 = f14, f10, f10
	;;
(p6)	fma.s1 f10 = f13, f11, f10
(p6)	fnma.s1 f11 = farg1, f12, farg0
	;;
(p6)	fma.s1 f11 = f11, f10, f12
(p6)	fnma.s1 f12 = farg1, f10, f1
	;;
(p6)	fma.s1 f10 = f12, f10, f10
(p6)	fnma.s1 f12 = farg1, f11, farg0
	;;
(p6)	fma.s0 fret0 = f12, f10, f11
(p7)	mov fret0 = f10
	br.ret.sptk rp
	.endp __divxf3
#endif

#ifdef L__divdf3
// Compute a 64-bit IEEE double quotient.  Same frcpa/Newton-Raphson
// scheme as __divxf3, with double-precision final rounding.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.

	.text
	.align 16
	.global __divdf3
	.proc __divdf3
__divdf3:
	cmp.eq p7, p0 = r0, r0
	frcpa.s0 f10, p6 = farg0, farg1
	;;
(p6)	cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
(p6)	fmpy.s1 f11 = farg0, f10
(p6)	fnma.s1 f12 = farg1, f10, f1
	;;
(p6)	fma.s1 f11 = f12, f11, f11
(p6)	fmpy.s1 f13 = f12, f12
	;;
(p6)	fma.s1 f10 = f12, f10, f10
(p6)	fma.s1 f11 = f13, f11, f11
	;;
(p6)	fmpy.s1 f12 = f13, f13
(p6)	fma.s1 f10 = f13, f10, f10
	;;
(p6)	fma.d.s1 f11 = f12, f11, f11
(p6)	fma.s1 f10 = f12, f10, f10
	;;
(p6)	fnma.d.s1 f8 = farg1, f11, farg0
	;;
(p6)	fma.d fret0 = f8, f10, f11
(p7)	mov fret0 = f10
	br.ret.sptk rp
	;;
	.endp __divdf3
#endif
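// The FP division routines above (and __divsf3 below) share one scheme:
// frcpa supplies a short reciprocal approximation and fused multiply-adds
// refine it until the quotient rounds correctly.  A rough C model of the
// idea follows; it is illustrative only: the names are invented, 1.0L/b
// stands in for the frcpa seed, and the real correctness argument rests
// on the 82-bit register format, not on C long double semantics.
//
//	#include <math.h>
//
//	long double
//	nr_divide (long double a, long double b)
//	{
//	  long double y = 1.0L / b;           /* stands in for frcpa */
//	  long double q = a * y;              /* first quotient guess */
//	  long double e = fmal (-b, y, 1.0L); /* e = 1 - b*y, one fnma */
//	  for (int i = 0; i < 3; i++)
//	    {
//	      q = fmal (e, q, q);             /* q *= 1 + e */
//	      y = fmal (e, y, y);             /* y *= 1 + e */
//	      e = e * e;                      /* error squares each pass */
//	    }
//	  /* Final correction: q + y*(a - b*q) rounds correctly when the
//	     multiply-adds are fused, as they are in hardware.  */
//	  return fmal (fmal (-b, q, a), y, q);
//	}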
#ifdef L__divsf3
// Compute a 32-bit IEEE float quotient.  Same frcpa/Newton-Raphson
// scheme as __divxf3, with single-precision final rounding.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.

	.text
	.align 16
	.global __divsf3
	.proc __divsf3
__divsf3:
	cmp.eq p7, p0 = r0, r0
	frcpa.s0 f10, p6 = farg0, farg1
	;;
(p6)	cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
(p6)	fmpy.s1 f8 = farg0, f10
(p6)	fnma.s1 f9 = farg1, f10, f1
	;;
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fmpy.s1 f9 = f9, f9
	;;
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fmpy.s1 f9 = f9, f9
	;;
(p6)	fma.d.s1 f10 = f9, f8, f8
	;;
(p6)	fnorm.s.s0 fret0 = f10
(p7)	mov fret0 = f10
	br.ret.sptk rp
	;;
	.endp __divsf3
#endif

#ifdef L__divdi3
// Compute a 64-bit integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.

	.text
	.align 16
	.global __divdi3
	.proc __divdi3
__divdi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Convert the inputs to FP, so that they won't be treated as
	// unsigned.
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
(p7)	break 1
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
(p6)	fnma.s1 f11 = f9, f10, f1
(p6)	fmpy.s1 f12 = f8, f10
	;;
(p6)	fmpy.s1 f13 = f11, f11
(p6)	fma.s1 f12 = f11, f12, f12
	;;
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	// Round quotient to an integer.
	fcvt.fx.trunc.s1 f10 = f10
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __divdi3
#endif

#ifdef L__moddi3
// Compute a 64-bit integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a).  in1 holds the divisor (b).

	.text
	.align 16
	.global __moddi3
	.proc __moddi3
__moddi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f14 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Convert the inputs to FP, so that they won't be treated as
	// unsigned.
	fcvt.xf f8 = f14
	fcvt.xf f9 = f9
(p7)	break 1
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
(p6)	fmpy.s1 f12 = f8, f10
(p6)	fnma.s1 f11 = f9, f10, f1
	;;
(p6)	fma.s1 f12 = f11, f12, f12
(p6)	fmpy.s1 f13 = f11, f11
	;;
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
	sub in1 = r0, in1
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
	setf.sig f9 = in1
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	fcvt.fx.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f14
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __moddi3
#endif
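// A note on the __moddi3 tail above: the quotient is computed essentially
// as in __divdi3, and the remainder then falls out of one fixed-point
// multiply-add on the negated divisor (the xma.l).  In C terms, with
// mod_from_quotient an invented name:
//
//	long long
//	mod_from_quotient (long long a, long long b, long long q)
//	{
//	  return q * -b + a;	/* low 64 bits only, like xma.l */
//	}
//
// Negating b in a GP register before setf.sig is what lets the single
// xma.l compute a - q*b without a separate subtract.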
#ifdef L__udivdi3
// Compute a 64-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.

	.text
	.align 16
	.global __udivdi3
	.proc __udivdi3
__udivdi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Convert the inputs to FP, to avoid FP software-assist faults.
	fcvt.xuf.s1 f8 = f8
	fcvt.xuf.s1 f9 = f9
(p7)	break 1
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
(p6)	fnma.s1 f11 = f9, f10, f1
(p6)	fmpy.s1 f12 = f8, f10
	;;
(p6)	fmpy.s1 f13 = f11, f11
(p6)	fma.s1 f12 = f11, f12, f12
	;;
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __udivdi3
#endif

#ifdef L__umoddi3
// Compute a 64-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a).  in1 holds the divisor (b).

	.text
	.align 16
	.global __umoddi3
	.proc __umoddi3
__umoddi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f14 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Convert the inputs to FP, to avoid FP software-assist faults.
	fcvt.xuf.s1 f8 = f14
	fcvt.xuf.s1 f9 = f9
(p7)	break 1
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
(p6)	fmpy.s1 f12 = f8, f10
(p6)	fnma.s1 f11 = f9, f10, f1
	;;
(p6)	fma.s1 f12 = f11, f12, f12
(p6)	fmpy.s1 f13 = f11, f11
	;;
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
	sub in1 = r0, in1
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
	setf.sig f9 = in1
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f14
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __umoddi3
#endif

#ifdef L__divsi3
// Compute a 32-bit integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.

	.text
	.align 16
	.global __divsi3
	.proc __divsi3
__divsi3:
	.regstk 2,0,0,0
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	sxt4 in0 = in0
	sxt4 in1 = in1
	;;
	setf.sig f8 = in0
	setf.sig f9 = in1
(p7)	break 1
	;;
	// 0x0ffdd is the biased exponent of 2^-34; setf.exp below turns
	// it into the FP constant that pads the final error term.
	mov r2 = 0x0ffdd
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	;;
	setf.exp f11 = r2
	frcpa.s1 f10, p6 = f8, f9
	;;
(p6)	fmpy.s1 f8 = f8, f10
(p6)	fnma.s1 f9 = f9, f10, f1
	;;
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fma.s1 f9 = f9, f9, f11
	;;
(p6)	fma.s1 f10 = f9, f8, f8
	;;
	fcvt.fx.trunc.s1 f10 = f10
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __divsi3
#endif

#ifdef L__modsi3
// Compute a 32-bit integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.

	.text
	.align 16
	.global __modsi3
	.proc __modsi3
__modsi3:
	.regstk 2,0,0,0
	mov r2 = 0x0ffdd		// biased exponent of 2^-34
	sxt4 in0 = in0
	sxt4 in1 = in1
	;;
	setf.sig f13 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	sub in1 = r0, in1
	fcvt.xf f8 = f13
	fcvt.xf f9 = f9
	;;
	setf.exp f11 = r2
	frcpa.s1 f10, p6 = f8, f9
(p7)	break 1
	;;
(p6)	fmpy.s1 f12 = f8, f10
(p6)	fnma.s1 f10 = f9, f10, f1
	;;
	setf.sig f9 = in1
(p6)	fma.s1 f12 = f10, f12, f12
(p6)	fma.s1 f10 = f10, f10, f11
	;;
(p6)	fma.s1 f10 = f10, f12, f12
	;;
	fcvt.fx.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f13
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __modsi3
#endif
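// The 32-bit routines above and below get away with a single
// Newton-Raphson step because 32-bit operands are exact in the 82-bit
// register format.  The constant 0x0ffdd is a biased exponent: setf.exp
// turns it into the FP value 2^(0xffdd - 0xffff) = 2^-34, which pads
// the squared error term so that truncation still lands on the exact
// quotient.  A hedged C sketch of __divsi3's data flow (invented names;
// the rounding guarantee itself comes from the register-format
// analysis, not from this model):
//
//	#include <math.h>
//
//	int
//	div32 (int a, int b)
//	{
//	  long double fa = a, fb = b;
//	  long double y = 1.0L / fb;           /* frcpa seed */
//	  long double q = fa * y;
//	  long double e = fmal (-fb, y, 1.0L); /* e = 1 - b*y */
//	  q = fmal (e, q, q);                  /* one refinement step */
//	  e = fmal (e, e, 0x1p-34L);           /* e^2 + 2^-34 pad */
//	  return (int) fmal (e, q, q);         /* truncate toward zero */
//	}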
#ifdef L__udivsi3
// Compute a 32-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.

	.text
	.align 16
	.global __udivsi3
	.proc __udivsi3
__udivsi3:
	.regstk 2,0,0,0
	mov r2 = 0x0ffdd		// biased exponent of 2^-34
	zxt4 in0 = in0
	zxt4 in1 = in1
	;;
	setf.sig f8 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
(p7)	break 1
	;;
	setf.exp f11 = r2
	frcpa.s1 f10, p6 = f8, f9
	;;
(p6)	fmpy.s1 f8 = f8, f10
(p6)	fnma.s1 f9 = f9, f10, f1
	;;
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fma.s1 f9 = f9, f9, f11
	;;
(p6)	fma.s1 f10 = f9, f8, f8
	;;
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __udivsi3
#endif

#ifdef L__umodsi3
// Compute a 32-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.

	.text
	.align 16
	.global __umodsi3
	.proc __umodsi3
__umodsi3:
	.regstk 2,0,0,0
	mov r2 = 0x0ffdd		// biased exponent of 2^-34
	zxt4 in0 = in0
	zxt4 in1 = in1
	;;
	setf.sig f13 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	sub in1 = r0, in1
	fcvt.xf f8 = f13
	fcvt.xf f9 = f9
	;;
	setf.exp f11 = r2
	frcpa.s1 f10, p6 = f8, f9
(p7)	break 1
	;;
(p6)	fmpy.s1 f12 = f8, f10
(p6)	fnma.s1 f10 = f9, f10, f1
	;;
	setf.sig f9 = in1
(p6)	fma.s1 f12 = f10, f12, f12
(p6)	fma.s1 f10 = f10, f10, f11
	;;
(p6)	fma.s1 f10 = f10, f12, f12
	;;
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f13
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __umodsi3
#endif

#ifdef L__save_stack_nonlocal
// Notes on save/restore stack nonlocal: We read ar.bsp but write
// ar.bspstore.  This is because ar.bsp can be read at all times
// (independent of the RSE mode) but since it's read-only we need to
// restore the value via ar.bspstore.  This is OK because
// ar.bsp==ar.bspstore after executing "flushrs".

// void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer)

	.text
	.align 16
	.global __ia64_save_stack_nonlocal
	.proc __ia64_save_stack_nonlocal
__ia64_save_stack_nonlocal:
	{ .mmf
	  alloc r18 = ar.pfs, 2, 0, 0, 0
	  mov r19 = ar.rsc
	  ;;
	}
	{ .mmi
	  flushrs
	  st8 [in0] = in1, 24
	  and r19 = 0x1c, r19
	  ;;
	}
	{ .mmi
	  st8 [in0] = r18, -16
	  mov ar.rsc = r19
	  or r19 = 0x3, r19
	  ;;
	}
	{ .mmi
	  mov r16 = ar.bsp
	  mov r17 = ar.rnat
	  adds r2 = 8, in0
	  ;;
	}
	{ .mmi
	  st8 [in0] = r16
	  st8 [r2] = r17
	}
	{ .mib
	  mov ar.rsc = r19
	  br.ret.sptk.few rp
	  ;;
	}
	.endp __ia64_save_stack_nonlocal
#endif

#ifdef L__nonlocal_goto
// void __ia64_nonlocal_goto(void *target_label, void *save_area,
//			     void *static_chain);

	.text
	.align 16
	.global __ia64_nonlocal_goto
	.proc __ia64_nonlocal_goto
__ia64_nonlocal_goto:
	{ .mmi
	  alloc r20 = ar.pfs, 3, 0, 0, 0
	  ld8 r12 = [in1], 8
	  mov.ret.sptk rp = in0, .L0
	  ;;
	}
	{ .mmf
	  ld8 r16 = [in1], 8
	  mov r19 = ar.rsc
	  ;;
	}
	{ .mmi
	  flushrs
	  ld8 r17 = [in1], 8
	  and r19 = 0x1c, r19
	  ;;
	}
	{ .mmi
	  ld8 r18 = [in1]
	  mov ar.rsc = r19
	  or r19 = 0x3, r19
	  ;;
	}
	{ .mmi
	  mov ar.bspstore = r16
	  ;;
	  mov ar.rnat = r17
	  ;;
	}
	{ .mmi
	  loadrs
	  invala
	  mov r15 = in2
	  ;;
	}
.L0:
	{ .mib
	  mov ar.rsc = r19
	  mov ar.pfs = r18
	  br.ret.sptk.few rp
	  ;;
	}
	.endp __ia64_nonlocal_goto
#endif
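// Layout of the save area written by __ia64_save_stack_nonlocal and read
// back by __ia64_nonlocal_goto and __ia64_restore_stack_nonlocal, shown
// as a hypothetical C struct (the type and field names are invented;
// only the 8-byte offsets, taken from the stores above, matter):
//
//	struct ia64_nonlocal_save
//	{
//	  void *sp;		/* offset  0: memory stack pointer */
//	  void *bsp;		/* offset  8: ar.bsp, restored via
//				   ar.bspstore */
//	  unsigned long rnat;	/* offset 16: ar.rnat collection */
//	  unsigned long pfs;	/* offset 24: ar.pfs of the saver */
//	};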
#ifdef L__restore_stack_nonlocal
// This is mostly the same as nonlocal_goto above.
// ??? This has not been tested yet.

// void __ia64_restore_stack_nonlocal(void *save_area)

	.text
	.align 16
	.global __ia64_restore_stack_nonlocal
	.proc __ia64_restore_stack_nonlocal
__ia64_restore_stack_nonlocal:
	{ .mmf
	  alloc r20 = ar.pfs, 4, 0, 0, 0
	  ld8 r12 = [in0], 8
	  ;;
	}
	{ .mmb
	  ld8 r16 = [in0], 8
	  mov r19 = ar.rsc
	  ;;
	}
	{ .mmi
	  flushrs
	  ld8 r17 = [in0], 8
	  and r19 = 0x1c, r19
	  ;;
	}
	{ .mmf
	  ld8 r18 = [in0]
	  mov ar.rsc = r19
	  ;;
	}
	{ .mmi
	  mov ar.bspstore = r16
	  ;;
	  mov ar.rnat = r17
	  or r19 = 0x3, r19
	  ;;
	}
	{ .mmf
	  loadrs
	  invala
	  ;;
	}
.L0:
	{ .mib
	  mov ar.rsc = r19
	  mov ar.pfs = r18
	  br.ret.sptk.few rp
	  ;;
	}
	.endp __ia64_restore_stack_nonlocal
#endif

#ifdef L__trampoline
// Implement the nested function trampoline.  This is out of line
// so that we don't have to bother with flushing the icache, as
// well as making the on-stack trampoline smaller.
//
// The trampoline has the following form:
//
//		+-------------------+ >
//	TRAMP:	| __ia64_trampoline | |
//		+-------------------+ > fake function descriptor
//		| TRAMP+16          | |
//		+-------------------+ >
//		| target descriptor |
//		+-------------------+
//		| static link       |
//		+-------------------+

	.text
	.align 16
	.global __ia64_trampoline
	.proc __ia64_trampoline
__ia64_trampoline:
	{ .mmi
	  ld8 r2 = [r1], 8
	  ;;
	  ld8 r15 = [r1]
	}
	{ .mmi
	  ld8 r3 = [r2], 8
	  ;;
	  ld8 r1 = [r2]
	  mov b6 = r3
	}
	{ .bbb
	  br.sptk.many b6
	  ;;
	}
	.endp __ia64_trampoline
#endif

// Thunks for backward compatibility.
#ifdef L_fixtfdi
	.text
	.align 16
	.global __fixtfti
	.proc __fixtfti
__fixtfti:
	{ .bbb
	  br.sptk.many __fixxfti
	  ;;
	}
	.endp __fixtfti
#endif

#ifdef L_fixunstfdi
	.align 16
	.global __fixunstfti
	.proc __fixunstfti
__fixunstfti:
	{ .bbb
	  br.sptk.many __fixunsxfti
	  ;;
	}
	.endp __fixunstfti
#endif

#ifdef L_floatditf
	.align 16
	.global __floattitf
	.proc __floattitf
__floattitf:
	{ .bbb
	  br.sptk.many __floattixf
	  ;;
	}
	.endp __floattitf
#endif
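// The on-stack trampoline consumed by __ia64_trampoline above can be
// pictured as a hypothetical C struct (invented names; the first two
// words form the fake function descriptor, so an indirect call through
// it enters __ia64_trampoline with r1 = &tramp.target_fndesc):
//
//	struct ia64_trampoline
//	{
//	  void *entry;		/* __ia64_trampoline itself */
//	  void *fake_gp;	/* TRAMP+16, points at the next field */
//	  void *target_fndesc;	/* descriptor of the nested function */
//	  void *static_link;	/* loaded into the static chain reg, r15 */
//	};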