/* Copyright (C) 2000, 2001, 2003, 2005 Free Software Foundation, Inc.
   Contributed by James E. Wilson.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING.  If not, write to
   the Free Software Foundation, 51 Franklin Street, Fifth Floor,
   Boston, MA 02110-1301, USA.  */

/* As a special exception, if you link this library with other files,
   some of which are compiled with GCC, to produce an executable,
   this library does not by itself cause the resulting executable
   to be covered by the GNU General Public License.
   This exception does not however invalidate any other reasons why
   the executable file might be covered by the GNU General Public
   License.  */

#ifdef L__divxf3
// Compute an 80-bit IEEE double-extended quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.
//
// __divtf3 is an alternate symbol name for backward compatibility.

	.text
	.align 16
	.global __divxf3
	.global __divtf3
	.proc __divxf3
__divxf3:
__divtf3:
	// Assume frcpa delivers the final result (p7); if it asks for
	// refinement instead (p6), clear p7 below.
	cmp.eq p7, p0 = r0, r0
	// f10 = y0, an initial approximation of 1/farg1.
	frcpa.s0 f10, p6 = farg0, farg1
	;;
(p6)	cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
	// Refine the reciprocal and the quotient with fused
	// multiply-adds until the last step rounds correctly.
(p6)	fnma.s1 f11 = farg1, f10, f1
(p6)	fma.s1 f12 = farg0, f10, f0
	;;
(p6)	fma.s1 f13 = f11, f11, f0
(p6)	fma.s1 f14 = f11, f11, f11
	;;
(p6)	fma.s1 f11 = f13, f13, f11
(p6)	fma.s1 f13 = f14, f10, f10
	;;
(p6)	fma.s1 f10 = f13, f11, f10
(p6)	fnma.s1 f11 = farg1, f12, farg0
	;;
(p6)	fma.s1 f11 = f11, f10, f12
(p6)	fnma.s1 f12 = farg1, f10, f1
	;;
(p6)	fma.s1 f10 = f12, f10, f10
(p6)	fnma.s1 f12 = farg1, f11, farg0
	;;
(p6)	fma.s0 fret0 = f12, f10, f11
(p7)	mov fret0 = f10
	br.ret.sptk rp
	.endp __divxf3
#endif

#ifdef L__divdf3
// Compute a 64-bit IEEE double quotient.  Same frcpa/Newton-Raphson
// scheme as __divxf3, with double-precision final rounding.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.

	.text
	.align 16
	.global __divdf3
	.proc __divdf3
__divdf3:
	cmp.eq p7, p0 = r0, r0
	frcpa.s0 f10, p6 = farg0, farg1
	;;
(p6)	cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
(p6)	fmpy.s1 f11 = farg0, f10
(p6)	fnma.s1 f12 = farg1, f10, f1
	;;
(p6)	fma.s1 f11 = f12, f11, f11
(p6)	fmpy.s1 f13 = f12, f12
	;;
(p6)	fma.s1 f10 = f12, f10, f10
(p6)	fma.s1 f11 = f13, f11, f11
	;;
(p6)	fmpy.s1 f12 = f13, f13
(p6)	fma.s1 f10 = f13, f10, f10
	;;
(p6)	fma.d.s1 f11 = f12, f11, f11
(p6)	fma.s1 f10 = f12, f10, f10
	;;
(p6)	fnma.d.s1 f8 = farg1, f11, farg0
	;;
(p6)	fma.d fret0 = f8, f10, f11
(p7)	mov fret0 = f10
	br.ret.sptk rp
	;;
	.endp __divdf3
#endif
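// The FP division routines above (and __divsf3 below) share one scheme:
// frcpa supplies a short reciprocal approximation and fused multiply-adds
// refine it until the quotient rounds correctly.  A rough C model of the
// idea follows; it is illustrative only: the names are invented, 1.0L/b
// stands in for the frcpa seed, and the real correctness argument rests
// on the 82-bit register format, not on C long double semantics.
//
//	#include <math.h>
//
//	long double
//	nr_divide (long double a, long double b)
//	{
//	  long double y = 1.0L / b;           /* stands in for frcpa */
//	  long double q = a * y;              /* first quotient guess */
//	  long double e = fmal (-b, y, 1.0L); /* e = 1 - b*y, one fnma */
//	  for (int i = 0; i < 3; i++)
//	    {
//	      q = fmal (e, q, q);             /* q *= 1 + e */
//	      y = fmal (e, y, y);             /* y *= 1 + e */
//	      e = e * e;                      /* error squares each pass */
//	    }
//	  /* Final correction: q + y*(a - b*q) rounds correctly when the
//	     multiply-adds are fused, as they are in hardware.  */
//	  return fmal (fmal (-b, q, a), y, q);
//	}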
#ifdef L__divsf3
// Compute a 32-bit IEEE float quotient.  Same frcpa/Newton-Raphson
// scheme as __divxf3, with single-precision final rounding.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.

	.text
	.align 16
	.global __divsf3
	.proc __divsf3
__divsf3:
	cmp.eq p7, p0 = r0, r0
	frcpa.s0 f10, p6 = farg0, farg1
	;;
(p6)	cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
(p6)	fmpy.s1 f8 = farg0, f10
(p6)	fnma.s1 f9 = farg1, f10, f1
	;;
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fmpy.s1 f9 = f9, f9
	;;
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fmpy.s1 f9 = f9, f9
	;;
(p6)	fma.d.s1 f10 = f9, f8, f8
	;;
(p6)	fnorm.s.s0 fret0 = f10
(p7)	mov fret0 = f10
	br.ret.sptk rp
	;;
	.endp __divsf3
#endif

#ifdef L__divdi3
// Compute a 64-bit integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.

	.text
	.align 16
	.global __divdi3
	.proc __divdi3
__divdi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Convert the inputs to FP, so that they won't be treated as
	// unsigned.
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
(p7)	break 1
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
(p6)	fnma.s1 f11 = f9, f10, f1
(p6)	fmpy.s1 f12 = f8, f10
	;;
(p6)	fmpy.s1 f13 = f11, f11
(p6)	fma.s1 f12 = f11, f12, f12
	;;
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	// Round quotient to an integer.
	fcvt.fx.trunc.s1 f10 = f10
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __divdi3
#endif

#ifdef L__moddi3
// Compute a 64-bit integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a).  in1 holds the divisor (b).

	.text
	.align 16
	.global __moddi3
	.proc __moddi3
__moddi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f14 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Convert the inputs to FP, so that they won't be treated as
	// unsigned.
	fcvt.xf f8 = f14
	fcvt.xf f9 = f9
(p7)	break 1
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
(p6)	fmpy.s1 f12 = f8, f10
(p6)	fnma.s1 f11 = f9, f10, f1
	;;
(p6)	fma.s1 f12 = f11, f12, f12
(p6)	fmpy.s1 f13 = f11, f11
	;;
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
	sub in1 = r0, in1
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
	setf.sig f9 = in1
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	fcvt.fx.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f14
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __moddi3
#endif
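// A note on the __moddi3 tail above: the quotient is computed essentially
// as in __divdi3, and the remainder then falls out of one fixed-point
// multiply-add on the negated divisor (the xma.l).  In C terms, with
// mod_from_quotient an invented name:
//
//	long long
//	mod_from_quotient (long long a, long long b, long long q)
//	{
//	  return q * -b + a;	/* low 64 bits only, like xma.l */
//	}
//
// Negating b in a GP register before setf.sig is what lets the single
// xma.l compute a - q*b without a separate subtract.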
#ifdef L__udivdi3
// Compute a 64-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.

	.text
	.align 16
	.global __udivdi3
	.proc __udivdi3
__udivdi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f8 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Convert the inputs to FP, to avoid FP software-assist faults.
	fcvt.xuf.s1 f8 = f8
	fcvt.xuf.s1 f9 = f9
(p7)	break 1
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
(p6)	fnma.s1 f11 = f9, f10, f1
(p6)	fmpy.s1 f12 = f8, f10
	;;
(p6)	fmpy.s1 f13 = f11, f11
(p6)	fma.s1 f12 = f11, f12, f12
	;;
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __udivdi3
#endif

#ifdef L__umoddi3
// Compute a 64-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a).  in1 holds the divisor (b).

	.text
	.align 16
	.global __umoddi3
	.proc __umoddi3
__umoddi3:
	.regstk 2,0,0,0
	// Transfer inputs to FP registers.
	setf.sig f14 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	// Convert the inputs to FP, to avoid FP software-assist faults.
	fcvt.xuf.s1 f8 = f14
	fcvt.xuf.s1 f9 = f9
(p7)	break 1
	;;
	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9
	;;
	// 3 Newton-Raphson iterations.
(p6)	fmpy.s1 f12 = f8, f10
(p6)	fnma.s1 f11 = f9, f10, f1
	;;
(p6)	fma.s1 f12 = f11, f12, f12
(p6)	fmpy.s1 f13 = f11, f11
	;;
(p6)	fma.s1 f10 = f11, f10, f10
(p6)	fma.s1 f11 = f13, f12, f12
	;;
	sub in1 = r0, in1
(p6)	fma.s1 f10 = f13, f10, f10
(p6)	fnma.s1 f12 = f9, f11, f8
	;;
	setf.sig f9 = in1
(p6)	fma.s1 f10 = f12, f10, f11
	;;
	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f14
	;;
	// Transfer result to GP registers.
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __umoddi3
#endif

#ifdef L__divsi3
// Compute a 32-bit integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.

	.text
	.align 16
	.global __divsi3
	.proc __divsi3
__divsi3:
	.regstk 2,0,0,0
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	sxt4 in0 = in0
	sxt4 in1 = in1
	;;
	setf.sig f8 = in0
	setf.sig f9 = in1
(p7)	break 1
	;;
	// 0x0ffdd is the biased exponent of 2^-34; setf.exp below turns
	// it into the FP constant that pads the final error term.
	mov r2 = 0x0ffdd
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
	;;
	setf.exp f11 = r2
	frcpa.s1 f10, p6 = f8, f9
	;;
(p6)	fmpy.s1 f8 = f8, f10
(p6)	fnma.s1 f9 = f9, f10, f1
	;;
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fma.s1 f9 = f9, f9, f11
	;;
(p6)	fma.s1 f10 = f9, f8, f8
	;;
	fcvt.fx.trunc.s1 f10 = f10
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __divsi3
#endif

#ifdef L__modsi3
// Compute a 32-bit integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.

	.text
	.align 16
	.global __modsi3
	.proc __modsi3
__modsi3:
	.regstk 2,0,0,0
	mov r2 = 0x0ffdd		// biased exponent of 2^-34
	sxt4 in0 = in0
	sxt4 in1 = in1
	;;
	setf.sig f13 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	sub in1 = r0, in1
	fcvt.xf f8 = f13
	fcvt.xf f9 = f9
	;;
	setf.exp f11 = r2
	frcpa.s1 f10, p6 = f8, f9
(p7)	break 1
	;;
(p6)	fmpy.s1 f12 = f8, f10
(p6)	fnma.s1 f10 = f9, f10, f1
	;;
	setf.sig f9 = in1
(p6)	fma.s1 f12 = f10, f12, f12
(p6)	fma.s1 f10 = f10, f10, f11
	;;
(p6)	fma.s1 f10 = f10, f12, f12
	;;
	fcvt.fx.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f13
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __modsi3
#endif
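// The 32-bit routines above and below get away with a single
// Newton-Raphson step because 32-bit operands are exact in the 82-bit
// register format.  The constant 0x0ffdd is a biased exponent: setf.exp
// turns it into the FP value 2^(0xffdd - 0xffff) = 2^-34, which pads
// the squared error term so that truncation still lands on the exact
// quotient.  A hedged C sketch of __divsi3's data flow (invented names;
// the rounding guarantee itself comes from the register-format
// analysis, not from this model):
//
//	#include <math.h>
//
//	int
//	div32 (int a, int b)
//	{
//	  long double fa = a, fb = b;
//	  long double y = 1.0L / fb;           /* frcpa seed */
//	  long double q = fa * y;
//	  long double e = fmal (-fb, y, 1.0L); /* e = 1 - b*y */
//	  q = fmal (e, q, q);                  /* one refinement step */
//	  e = fmal (e, e, 0x1p-34L);           /* e^2 + 2^-34 pad */
//	  return (int) fmal (e, q, q);         /* truncate toward zero */
//	}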
#ifdef L__udivsi3
// Compute a 32-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.

	.text
	.align 16
	.global __udivsi3
	.proc __udivsi3
__udivsi3:
	.regstk 2,0,0,0
	mov r2 = 0x0ffdd		// biased exponent of 2^-34
	zxt4 in0 = in0
	zxt4 in1 = in1
	;;
	setf.sig f8 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	fcvt.xf f8 = f8
	fcvt.xf f9 = f9
(p7)	break 1
	;;
	setf.exp f11 = r2
	frcpa.s1 f10, p6 = f8, f9
	;;
(p6)	fmpy.s1 f8 = f8, f10
(p6)	fnma.s1 f9 = f9, f10, f1
	;;
(p6)	fma.s1 f8 = f9, f8, f8
(p6)	fma.s1 f9 = f9, f9, f11
	;;
(p6)	fma.s1 f10 = f9, f8, f8
	;;
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __udivsi3
#endif

#ifdef L__umodsi3
// Compute a 32-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.

	.text
	.align 16
	.global __umodsi3
	.proc __umodsi3
__umodsi3:
	.regstk 2,0,0,0
	mov r2 = 0x0ffdd		// biased exponent of 2^-34
	zxt4 in0 = in0
	zxt4 in1 = in1
	;;
	setf.sig f13 = in0
	setf.sig f9 = in1
	// Check divide by zero.
	cmp.ne.unc p0,p7=0,in1
	;;
	sub in1 = r0, in1
	fcvt.xf f8 = f13
	fcvt.xf f9 = f9
	;;
	setf.exp f11 = r2
	frcpa.s1 f10, p6 = f8, f9
(p7)	break 1
	;;
(p6)	fmpy.s1 f12 = f8, f10
(p6)	fnma.s1 f10 = f9, f10, f1
	;;
	setf.sig f9 = in1
(p6)	fma.s1 f12 = f10, f12, f12
(p6)	fma.s1 f10 = f10, f10, f11
	;;
(p6)	fma.s1 f10 = f10, f12, f12
	;;
	fcvt.fxu.trunc.s1 f10 = f10
	;;
	// r = q * (-b) + a
	xma.l f10 = f10, f9, f13
	;;
	getf.sig ret0 = f10
	br.ret.sptk rp
	;;
	.endp __umodsi3
#endif

#ifdef L__save_stack_nonlocal
// Notes on save/restore stack nonlocal: We read ar.bsp but write
// ar.bspstore.  This is because ar.bsp can be read at all times
// (independent of the RSE mode) but since it's read-only we need to
// restore the value via ar.bspstore.  This is OK because
// ar.bsp==ar.bspstore after executing "flushrs".

// void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer)

	.text
	.align 16
	.global __ia64_save_stack_nonlocal
	.proc __ia64_save_stack_nonlocal
__ia64_save_stack_nonlocal:
	{ .mmf
	  alloc r18 = ar.pfs, 2, 0, 0, 0
	  mov r19 = ar.rsc
	  ;;
	}
	{ .mmi
	  flushrs
	  st8 [in0] = in1, 24
	  and r19 = 0x1c, r19
	  ;;
	}
	{ .mmi
	  st8 [in0] = r18, -16
	  mov ar.rsc = r19
	  or r19 = 0x3, r19
	  ;;
	}
	{ .mmi
	  mov r16 = ar.bsp
	  mov r17 = ar.rnat
	  adds r2 = 8, in0
	  ;;
	}
	{ .mmi
	  st8 [in0] = r16
	  st8 [r2] = r17
	}
	{ .mib
	  mov ar.rsc = r19
	  br.ret.sptk.few rp
	  ;;
	}
	.endp __ia64_save_stack_nonlocal
#endif

#ifdef L__nonlocal_goto
// void __ia64_nonlocal_goto(void *target_label, void *save_area,
//			     void *static_chain);

	.text
	.align 16
	.global __ia64_nonlocal_goto
	.proc __ia64_nonlocal_goto
__ia64_nonlocal_goto:
	{ .mmi
	  alloc r20 = ar.pfs, 3, 0, 0, 0
	  ld8 r12 = [in1], 8
	  mov.ret.sptk rp = in0, .L0
	  ;;
	}
	{ .mmf
	  ld8 r16 = [in1], 8
	  mov r19 = ar.rsc
	  ;;
	}
	{ .mmi
	  flushrs
	  ld8 r17 = [in1], 8
	  and r19 = 0x1c, r19
	  ;;
	}
	{ .mmi
	  ld8 r18 = [in1]
	  mov ar.rsc = r19
	  or r19 = 0x3, r19
	  ;;
	}
	{ .mmi
	  mov ar.bspstore = r16
	  ;;
	  mov ar.rnat = r17
	  ;;
	}
	{ .mmi
	  loadrs
	  invala
	  mov r15 = in2
	  ;;
	}
.L0:
	{ .mib
	  mov ar.rsc = r19
	  mov ar.pfs = r18
	  br.ret.sptk.few rp
	  ;;
	}
	.endp __ia64_nonlocal_goto
#endif
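// Layout of the save area written by __ia64_save_stack_nonlocal and read
// back by __ia64_nonlocal_goto and __ia64_restore_stack_nonlocal, shown
// as a hypothetical C struct (the type and field names are invented;
// only the 8-byte offsets, taken from the stores above, matter):
//
//	struct ia64_nonlocal_save
//	{
//	  void *sp;		/* offset  0: memory stack pointer */
//	  void *bsp;		/* offset  8: ar.bsp, restored via
//				   ar.bspstore */
//	  unsigned long rnat;	/* offset 16: ar.rnat collection */
//	  unsigned long pfs;	/* offset 24: ar.pfs of the saver */
//	};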
#ifdef L__restore_stack_nonlocal
// This is mostly the same as nonlocal_goto above.
// ??? This has not been tested yet.

// void __ia64_restore_stack_nonlocal(void *save_area)

	.text
	.align 16
	.global __ia64_restore_stack_nonlocal
	.proc __ia64_restore_stack_nonlocal
__ia64_restore_stack_nonlocal:
	{ .mmf
	  alloc r20 = ar.pfs, 4, 0, 0, 0
	  ld8 r12 = [in0], 8
	  ;;
	}
	{ .mmb
	  ld8 r16 = [in0], 8
	  mov r19 = ar.rsc
	  ;;
	}
	{ .mmi
	  flushrs
	  ld8 r17 = [in0], 8
	  and r19 = 0x1c, r19
	  ;;
	}
	{ .mmf
	  ld8 r18 = [in0]
	  mov ar.rsc = r19
	  ;;
	}
	{ .mmi
	  mov ar.bspstore = r16
	  ;;
	  mov ar.rnat = r17
	  or r19 = 0x3, r19
	  ;;
	}
	{ .mmf
	  loadrs
	  invala
	  ;;
	}
.L0:
	{ .mib
	  mov ar.rsc = r19
	  mov ar.pfs = r18
	  br.ret.sptk.few rp
	  ;;
	}
	.endp __ia64_restore_stack_nonlocal
#endif

#ifdef L__trampoline
// Implement the nested function trampoline.  This is out of line
// so that we don't have to bother with flushing the icache, as
// well as making the on-stack trampoline smaller.
//
// The trampoline has the following form:
//
//		+-------------------+ >
//	TRAMP:	| __ia64_trampoline | |
//		+-------------------+ > fake function descriptor
//		| TRAMP+16          | |
//		+-------------------+ >
//		| target descriptor |
//		+-------------------+
//		| static link       |
//		+-------------------+

	.text
	.align 16
	.global __ia64_trampoline
	.proc __ia64_trampoline
__ia64_trampoline:
	{ .mmi
	  ld8 r2 = [r1], 8
	  ;;
	  ld8 r15 = [r1]
	}
	{ .mmi
	  ld8 r3 = [r2], 8
	  ;;
	  ld8 r1 = [r2]
	  mov b6 = r3
	}
	{ .bbb
	  br.sptk.many b6
	  ;;
	}
	.endp __ia64_trampoline
#endif

// Thunks for backward compatibility.
#ifdef L_fixtfdi
	.text
	.align 16
	.global __fixtfti
	.proc __fixtfti
__fixtfti:
	{ .bbb
	  br.sptk.many __fixxfti
	  ;;
	}
	.endp __fixtfti
#endif

#ifdef L_fixunstfdi
	.align 16
	.global __fixunstfti
	.proc __fixunstfti
__fixunstfti:
	{ .bbb
	  br.sptk.many __fixunsxfti
	  ;;
	}
	.endp __fixunstfti
#endif

#ifdef L_floatditf
	.align 16
	.global __floattitf
	.proc __floattitf
__floattitf:
	{ .bbb
	  br.sptk.many __floattixf
	  ;;
	}
	.endp __floattitf
#endif
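// The on-stack trampoline consumed by __ia64_trampoline above can be
// pictured as a hypothetical C struct (invented names; the first two
// words form the fake function descriptor, so an indirect call through
// it enters __ia64_trampoline with r1 = &tramp.target_fndesc):
//
//	struct ia64_trampoline
//	{
//	  void *entry;		/* __ia64_trampoline itself */
//	  void *fake_gp;	/* TRAMP+16, points at the next field */
//	  void *target_fndesc;	/* descriptor of the nested function */
//	  void *static_link;	/* loaded into the static chain reg, r15 */
//	};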