2 // Compute an 80-bit IEEE double-extended quotient.
4 // From the Intel IA-64 Optimization Guide, choose the minimum latency
7 // farg0 holds the dividend. farg1 holds the divisor.
//
// Scheme: frcpa supplies y0 ~= 1/farg1 and sets p6 when the Newton-Raphson
// refinement below must run; when p6 stays clear, frcpa itself produced the
// final result (special-case operands). In the comments: a = farg0,
// b = farg1, e = 1 - b*y0. f0/f1 are the architectural constants 0.0/1.0.
14 cmp.eq p7, p0 = r0, r0  // p7 = 1 by default (frcpa result is final)
15 frcpa.s0 f10, p6 = farg0, farg1  // f10 = y0; p6 = refinement required
17 (p6) cmp.ne p7, p0 = r0, r0  // p7 = !p6; the p7 path is not shown in this excerpt
18 .pred.rel.mutex p6, p7  // tell the assembler p6/p7 are mutually exclusive
19 (p6) fnma.s1 f11 = farg1, f10, f1  // f11 = 1 - b*y0 = e
20 (p6) fma.s1 f12 = farg0, f10, f0  // f12 = a*y0 = q0 (first quotient estimate)
22 (p6) fma.s1 f13 = f11, f11, f0  // f13 = e^2
23 (p6) fma.s1 f14 = f11, f11, f11  // f14 = e + e^2
25 (p6) fma.s1 f11 = f13, f13, f11  // f11 = e + e^4
26 (p6) fma.s1 f13 = f14, f10, f10  // f13 = y0*(1 + e + e^2) = y1
28 (p6) fma.s1 f10 = f13, f11, f10  // f10 = y0 + y1*(e + e^4) = y2
29 (p6) fnma.s1 f11 = farg1, f12, farg0  // f11 = a - b*q0 = r0 (exact remainder)
31 (p6) fma.s1 f11 = f11, f10, f12  // f11 = q0 + r0*y2 = q1
32 (p6) fnma.s1 f12 = farg1, f10, f1  // f12 = 1 - b*y2 = e2
34 (p6) fma.s1 f10 = f12, f10, f10  // f10 = y2 + e2*y2 = y3
35 (p6) fnma.s1 f12 = farg1, f11, farg0  // f12 = a - b*q1 = r1 (exact remainder)
37 (p6) fma.s0 fret0 = f12, f10, f11  // fret0 = q1 + r1*y3; .s0 = user rounding/traps
44 // Compute a 64-bit IEEE double quotient.
46 // From the Intel IA-64 Optimization Guide, choose the minimum latency
49 // farg0 holds the dividend. farg1 holds the divisor.
//
// Same frcpa + Newton-Raphson scheme as the extended-precision routine:
// a = farg0, b = farg1, y0 = frcpa(b), e = 1 - b*y0. The ".d" completers
// round the final quotient steps to double precision.
56 cmp.eq p7, p0 = r0, r0  // p7 = 1 by default (frcpa result is final)
57 frcpa.s0 f10, p6 = farg0, farg1  // f10 = y0; p6 = refinement required
59 (p6) cmp.ne p7, p0 = r0, r0  // p7 = !p6; the p7 path is not shown in this excerpt
60 .pred.rel.mutex p6, p7
61 (p6) fmpy.s1 f11 = farg0, f10  // f11 = a*y0 = q0
62 (p6) fnma.s1 f12 = farg1, f10, f1  // f12 = 1 - b*y0 = e
64 (p6) fma.s1 f11 = f12, f11, f11  // f11 = q0 + e*q0 = q1
65 (p6) fmpy.s1 f13 = f12, f12  // f13 = e^2
67 (p6) fma.s1 f10 = f12, f10, f10  // f10 = y0 + e*y0 = y1
68 (p6) fma.s1 f11 = f13, f11, f11  // f11 = q1 + e^2*q1 = q2
70 (p6) fmpy.s1 f12 = f13, f13  // f12 = e^4
71 (p6) fma.s1 f10 = f13, f10, f10  // f10 = y1 + e^2*y1 = y2
73 (p6) fma.d.s1 f11 = f12, f11, f11  // f11 = q2 + e^4*q2, rounded to double = q3
74 (p6) fma.s1 f10 = f12, f10, f10  // f10 = y2 + e^4*y2 = y3
76 (p6) fnma.d.s1 f8 = farg1, f11, farg0  // f8 = a - b*q3 = r (exact remainder)
78 (p6) fma.d fret0 = f8, f10, f11  // fret0 = q3 + r*y3, final double rounding
86 // Compute a 32-bit IEEE float quotient.
88 // From the Intel IA-64 Optimization Guide, choose the minimum latency
91 // farg0 holds the dividend. farg1 holds the divisor.
//
// Shorter Newton-Raphson chain than the double routine, since single
// precision needs fewer refinement steps. a = farg0, b = farg1,
// y0 = frcpa(b), e = 1 - b*y0.
98 cmp.eq p7, p0 = r0, r0  // p7 = 1 by default (frcpa result is final)
99 frcpa.s0 f10, p6 = farg0, farg1  // f10 = y0; p6 = refinement required
101 (p6) cmp.ne p7, p0 = r0, r0  // p7 = !p6; the p7 path is not shown in this excerpt
102 .pred.rel.mutex p6, p7
103 (p6) fmpy.s1 f8 = farg0, f10  // f8 = a*y0 = q0
104 (p6) fnma.s1 f9 = farg1, f10, f1  // f9 = 1 - b*y0 = e
106 (p6) fma.s1 f8 = f9, f8, f8  // f8 = q0 + e*q0 = q1
107 (p6) fmpy.s1 f9 = f9, f9  // f9 = e^2
109 (p6) fma.s1 f8 = f9, f8, f8  // f8 = q1 + e^2*q1 = q2
110 (p6) fmpy.s1 f9 = f9, f9  // f9 = e^4
112 (p6) fma.d.s1 f10 = f9, f8, f8  // f10 = q2 + e^4*q2, rounded to double
114 (p6) fnorm.s.s0 fret0 = f10  // final rounding to single, user traps (.s0)
122 // Compute a 64-bit integer quotient.
124 // From the Intel IA-64 Optimization Guide, choose the minimum latency
127 // in0 holds the dividend. in1 holds the divisor.
//
// The quotient is computed in floating point and truncated back to an
// integer. a = dividend (f8), b = divisor (f9) after the FP conversion.
135 // Transfer inputs to FP registers.
// (setf.sig instructions elided from this excerpt.)
139 // Convert the inputs to FP, so that they won't be treated as unsigned.
// (fcvt.xf instructions elided from this excerpt.)
143 // Compute the reciprocal approximation.
144 frcpa.s1 f10, p6 = f8, f9  // f10 = y0 ~= 1/b; p6 = refinement required
146 // 3 Newton-Raphson iterations.
147 (p6) fnma.s1 f11 = f9, f10, f1  // f11 = 1 - b*y0 = e
148 (p6) fmpy.s1 f12 = f8, f10  // f12 = a*y0 = q0
150 (p6) fmpy.s1 f13 = f11, f11  // f13 = e^2
151 (p6) fma.s1 f12 = f11, f12, f12  // f12 = q0 + e*q0 = q1
153 (p6) fma.s1 f10 = f11, f10, f10  // f10 = y0 + e*y0 = y1
154 (p6) fma.s1 f11 = f13, f12, f12  // f11 = q1 + e^2*q1 = q2
156 (p6) fma.s1 f10 = f13, f10, f10  // f10 = y1 + e^2*y1 = y2
157 (p6) fnma.s1 f12 = f9, f11, f8  // f12 = a - b*q2 = r (exact remainder)
159 (p6) fma.s1 f10 = f12, f10, f11  // f10 = q2 + r*y2 = q3 (accurate quotient)
161 // Round quotient to an integer.
162 fcvt.fx.trunc.s1 f10 = f10  // signed truncation toward zero
164 // Transfer result to GP registers.
// (getf.sig / br.ret elided from this excerpt.)
172 // Compute a 64-bit integer modulus.
174 // From the Intel IA-64 Optimization Guide, choose the minimum latency
177 // in0 holds the dividend (a). in1 holds the divisor (b).
//
// Computes trunc(a/b) exactly as the quotient routine does, then forms the
// remainder with an integer multiply-add (xma). f8 = a, f9 = b in FP form.
185 // Transfer inputs to FP registers.
// (setf.sig instructions elided from this excerpt.)
189 // Convert the inputs to FP, so that they won't be treated as unsigned.
// (fcvt.xf instructions elided from this excerpt.)
193 // Compute the reciprocal approximation.
194 frcpa.s1 f10, p6 = f8, f9  // f10 = y0 ~= 1/b; p6 = refinement required
196 // 3 Newton-Raphson iterations.
197 (p6) fmpy.s1 f12 = f8, f10  // f12 = a*y0 = q0
198 (p6) fnma.s1 f11 = f9, f10, f1  // f11 = 1 - b*y0 = e
200 (p6) fma.s1 f12 = f11, f12, f12  // f12 = q0 + e*q0 = q1
201 (p6) fmpy.s1 f13 = f11, f11  // f13 = e^2
203 (p6) fma.s1 f10 = f11, f10, f10  // f10 = y0 + e*y0 = y1
204 (p6) fma.s1 f11 = f13, f12, f12  // f11 = q1 + e^2*q1 = q2
207 (p6) fma.s1 f10 = f13, f10, f10  // f10 = y1 + e^2*y1 = y2
208 (p6) fnma.s1 f12 = f9, f11, f8  // f12 = a - b*q2 = r (exact remainder)
211 (p6) fma.s1 f10 = f12, f10, f11  // f10 = q2 + r*y2 = accurate quotient
213 fcvt.fx.trunc.s1 f10 = f10  // f10 = trunc(a/b) as a signed 64-bit integer
// xma.l: integer multiply-add on significands, low 64 bits:
// f10 = f10*f9 + f14. The integer-form operands in f9/f14 are set up in
// lines not shown here — presumably the (negated) divisor and the dividend,
// so the result is a - trunc(a/b)*b. TODO(review): confirm against full file.
216 xma.l f10 = f10, f9, f14
218 // Transfer result to GP registers.
// (getf.sig / br.ret elided from this excerpt.)
226 // Compute a 64-bit unsigned integer quotient.
228 // From the Intel IA-64 Optimization Guide, choose the minimum latency
231 // in0 holds the dividend. in1 holds the divisor.
//
// Identical Newton-Raphson chain to the signed 64-bit quotient; only the
// final truncation differs (fcvt.fxu = unsigned). a = f8, b = f9 in FP form.
239 // Transfer inputs to FP registers.
// (setf.sig instructions elided from this excerpt.)
243 // Convert the inputs to FP, to avoid FP software-assist faults.
// (fcvt instructions elided from this excerpt.)
247 // Compute the reciprocal approximation.
248 frcpa.s1 f10, p6 = f8, f9  // f10 = y0 ~= 1/b; p6 = refinement required
250 // 3 Newton-Raphson iterations.
251 (p6) fnma.s1 f11 = f9, f10, f1  // f11 = 1 - b*y0 = e
252 (p6) fmpy.s1 f12 = f8, f10  // f12 = a*y0 = q0
254 (p6) fmpy.s1 f13 = f11, f11  // f13 = e^2
255 (p6) fma.s1 f12 = f11, f12, f12  // f12 = q0 + e*q0 = q1
257 (p6) fma.s1 f10 = f11, f10, f10  // f10 = y0 + e*y0 = y1
258 (p6) fma.s1 f11 = f13, f12, f12  // f11 = q1 + e^2*q1 = q2
260 (p6) fma.s1 f10 = f13, f10, f10  // f10 = y1 + e^2*y1 = y2
261 (p6) fnma.s1 f12 = f9, f11, f8  // f12 = a - b*q2 = r (exact remainder)
263 (p6) fma.s1 f10 = f12, f10, f11  // f10 = q2 + r*y2 = accurate quotient
265 // Round quotient to an unsigned integer.
266 fcvt.fxu.trunc.s1 f10 = f10  // unsigned truncation toward zero
268 // Transfer result to GP registers.
// (getf.sig / br.ret elided from this excerpt.)
276 // Compute a 64-bit unsigned integer modulus.
278 // From the Intel IA-64 Optimization Guide, choose the minimum latency
281 // in0 holds the dividend (a). in1 holds the divisor (b).
//
// Unsigned counterpart of the 64-bit modulus: compute trunc(a/b) in FP,
// then recover the remainder with an integer multiply-add (xma).
289 // Transfer inputs to FP registers.
// (setf.sig instructions elided from this excerpt.)
293 // Convert the inputs to FP, to avoid FP software assist faults.
// (fcvt instructions elided from this excerpt.)
297 // Compute the reciprocal approximation.
298 frcpa.s1 f10, p6 = f8, f9  // f10 = y0 ~= 1/b; p6 = refinement required
300 // 3 Newton-Raphson iterations.
301 (p6) fmpy.s1 f12 = f8, f10  // f12 = a*y0 = q0
302 (p6) fnma.s1 f11 = f9, f10, f1  // f11 = 1 - b*y0 = e
304 (p6) fma.s1 f12 = f11, f12, f12  // f12 = q0 + e*q0 = q1
305 (p6) fmpy.s1 f13 = f11, f11  // f13 = e^2
307 (p6) fma.s1 f10 = f11, f10, f10  // f10 = y0 + e*y0 = y1
308 (p6) fma.s1 f11 = f13, f12, f12  // f11 = q1 + e^2*q1 = q2
311 (p6) fma.s1 f10 = f13, f10, f10  // f10 = y1 + e^2*y1 = y2
312 (p6) fnma.s1 f12 = f9, f11, f8  // f12 = a - b*q2 = r (exact remainder)
315 (p6) fma.s1 f10 = f12, f10, f11  // f10 = q2 + r*y2 = accurate quotient
317 // Round quotient to an unsigned integer.
318 fcvt.fxu.trunc.s1 f10 = f10  // f10 = trunc(a/b), unsigned 64-bit integer
// xma.l: f10 = f10*f9 + f14, low 64 bits. Operand setup for f9/f14 in
// integer form is not shown here — presumably (negated) divisor and
// dividend, yielding a - trunc(a/b)*b. TODO(review): confirm against full file.
321 xma.l f10 = f10, f9, f14
323 // Transfer result to GP registers.
// (getf.sig / br.ret elided from this excerpt.)
331 // Compute a 32-bit integer quotient.
333 // From the Intel IA-64 Optimization Guide, choose the minimum latency
336 // in0 holds the dividend. in1 holds the divisor.
//
// 32-bit precision needs a shorter refinement chain than the 64-bit
// routines. a = f8, b = f9 in FP form (setup elided from this excerpt).
// f11 is read below but never written in the visible lines — presumably a
// small rounding-correction constant loaded off-view (e.g. via setf.exp);
// TODO(review): confirm against full file.
355 frcpa.s1 f10, p6 = f8, f9  // f10 = y0 ~= 1/b; p6 = refinement required
357 (p6) fmpy.s1 f8 = f8, f10  // f8 = a*y0 = q0
358 (p6) fnma.s1 f9 = f9, f10, f1  // f9 = 1 - b*y0 = e
360 (p6) fma.s1 f8 = f9, f8, f8  // f8 = q0 + e*q0 = q1
361 (p6) fma.s1 f9 = f9, f9, f11  // f9 = e^2 + f11 (e^2 plus correction term)
363 (p6) fma.s1 f10 = f9, f8, f8  // f10 = q1 + (e^2+c)*q1 = final quotient
365 fcvt.fx.trunc.s1 f10 = f10  // signed truncation toward zero
// (getf.sig / br.ret elided from this excerpt.)
374 // Compute a 32-bit integer modulus.
376 // From the Intel IA-64 Optimization Guide, choose the minimum latency
379 // in0 holds the dividend. in1 holds the divisor.
//
// Compute trunc(a/b) as in the 32-bit quotient, then recover the remainder
// via xma. a = f8, b = f9 in FP form (setup elided). As in the quotient
// routine, f11 is presumably a correction constant loaded off-view;
// TODO(review): confirm against full file.
399 frcpa.s1 f10, p6 = f8, f9  // f10 = y0 ~= 1/b; p6 = refinement required
401 (p6) fmpy.s1 f12 = f8, f10  // f12 = a*y0 = q0
402 (p6) fnma.s1 f10 = f9, f10, f1  // f10 = 1 - b*y0 = e
405 (p6) fma.s1 f12 = f10, f12, f12  // f12 = q0 + e*q0 = q1
406 (p6) fma.s1 f10 = f10, f10, f11  // f10 = e^2 + f11 (e^2 plus correction term)
408 (p6) fma.s1 f10 = f10, f12, f12  // f10 = q1 + (e^2+c)*q1 = final quotient
410 fcvt.fx.trunc.s1 f10 = f10  // f10 = trunc(a/b), signed integer
// xma.l: f10 = f10*f9 + f13, low 64 bits. Integer-form setup of f9/f13 is
// not shown — presumably (negated) divisor and dividend, giving the
// remainder a - trunc(a/b)*b. TODO(review): confirm against full file.
412 xma.l f10 = f10, f9, f13
// (getf.sig / br.ret elided from this excerpt.)
421 // Compute a 32-bit unsigned integer quotient.
423 // From the Intel IA-64 Optimization Guide, choose the minimum latency
426 // in0 holds the dividend. in1 holds the divisor.
//
// Same refinement chain as the signed 32-bit quotient; only the final
// truncation differs (fcvt.fxu = unsigned). a = f8, b = f9 in FP form
// (setup elided). f11 is presumably a correction constant loaded off-view;
// TODO(review): confirm against full file.
445 frcpa.s1 f10, p6 = f8, f9  // f10 = y0 ~= 1/b; p6 = refinement required
447 (p6) fmpy.s1 f8 = f8, f10  // f8 = a*y0 = q0
448 (p6) fnma.s1 f9 = f9, f10, f1  // f9 = 1 - b*y0 = e
450 (p6) fma.s1 f8 = f9, f8, f8  // f8 = q0 + e*q0 = q1
451 (p6) fma.s1 f9 = f9, f9, f11  // f9 = e^2 + f11 (e^2 plus correction term)
453 (p6) fma.s1 f10 = f9, f8, f8  // f10 = q1 + (e^2+c)*q1 = final quotient
455 fcvt.fxu.trunc.s1 f10 = f10  // unsigned truncation toward zero
// (getf.sig / br.ret elided from this excerpt.)
464 // Compute a 32-bit unsigned integer modulus.
466 // From the Intel IA-64 Optimization Guide, choose the minimum latency
469 // in0 holds the dividend. in1 holds the divisor.
//
// Unsigned counterpart of the 32-bit modulus: compute trunc(a/b) in FP,
// then recover the remainder via xma. a = f8, b = f9 in FP form (setup
// elided). f11 is presumably a correction constant loaded off-view;
// TODO(review): confirm against full file.
489 frcpa.s1 f10, p6 = f8, f9  // f10 = y0 ~= 1/b; p6 = refinement required
491 (p6) fmpy.s1 f12 = f8, f10  // f12 = a*y0 = q0
492 (p6) fnma.s1 f10 = f9, f10, f1  // f10 = 1 - b*y0 = e
495 (p6) fma.s1 f12 = f10, f12, f12  // f12 = q0 + e*q0 = q1
496 (p6) fma.s1 f10 = f10, f10, f11  // f10 = e^2 + f11 (e^2 plus correction term)
498 (p6) fma.s1 f10 = f10, f12, f12  // f10 = q1 + (e^2+c)*q1 = final quotient
500 fcvt.fxu.trunc.s1 f10 = f10  // f10 = trunc(a/b), unsigned integer
// xma.l: f10 = f10*f9 + f13, low 64 bits. Integer-form setup of f9/f13 is
// not shown — presumably (negated) divisor and dividend, giving the
// remainder a - trunc(a/b)*b. TODO(review): confirm against full file.
502 xma.l f10 = f10, f9, f13
// (getf.sig / br.ret elided from this excerpt.)
510 #ifdef L__save_stack_nonlocal
511 // Notes on save/restore stack nonlocal: We read ar.bsp but write
512 // ar.bspstore. This is because ar.bsp can be read at all times
513 // (independent of the RSE mode) but since it's read-only we need to
514 // restore the value via ar.bspstore. This is OK because
515 // ar.bsp==ar.bspstore after executing "flushrs".
517 // void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer)
521 .global __ia64_save_stack_nonlocal
522 .proc __ia64_save_stack_nonlocal
523 __ia64_save_stack_nonlocal:
// Register frame: 2 inputs (save_area, stack_pointer), no locals/outputs;
// the previous function state (ar.pfs) is preserved in r18.
525 alloc r18 = ar.pfs, 2, 0, 0, 0
// (Function body between alloc and .endp is not shown in this excerpt.)
556 .endp __ia64_save_stack_nonlocal
559 #ifdef L__nonlocal_goto
560 // void __ia64_nonlocal_goto(void *target_label, void *save_area,
561 // void *static_chain);
565 .global __ia64_nonlocal_goto
566 .proc __ia64_nonlocal_goto
567 __ia64_nonlocal_goto:
// Register frame: 3 inputs (target_label, save_area, static_chain);
// ar.pfs preserved in r20.
569 alloc r20 = ar.pfs, 3, 0, 0, 0
// Load the return branch register with the target label so the eventual
// return branch transfers control to it (mov.ret hints a return).
571 mov.ret.sptk rp = in0, .L0
// (Intervening body not shown in this excerpt.)
// Restore the RSE backing-store pointer from the saved value (see the
// save/restore notes above __ia64_save_stack_nonlocal).
592 mov ar.bspstore = r16
// (Remaining body not shown in this excerpt.)
609 .endp __ia64_nonlocal_goto
612 #ifdef L__restore_stack_nonlocal
613 // This is mostly the same as nonlocal_goto above.
614 // ??? This has not been tested yet.
616 // void __ia64_restore_stack_nonlocal(void *save_area)
620 .global __ia64_restore_stack_nonlocal
621 .proc __ia64_restore_stack_nonlocal
622 __ia64_restore_stack_nonlocal:
// Register frame of 4 (only save_area is documented above; the extra slots
// mirror the nonlocal_goto layout — TODO(review): confirm against full file).
// ar.pfs preserved in r20.
624 alloc r20 = ar.pfs, 4, 0, 0, 0
// (Intervening body not shown in this excerpt.)
// Restore the RSE backing-store pointer from the saved value.
645 mov ar.bspstore = r16
// (Remaining body not shown in this excerpt.)
662 .endp __ia64_restore_stack_nonlocal
666 // Implement the nested function trampoline. This is out of line
667 // so that we don't have to bother with flushing the icache, as
668 // well as making the on-stack trampoline smaller.
670 // The trampoline has the following form:
672 // +-------------------+ >
673 // TRAMP: | __ia64_trampoline | |
674 // +-------------------+ > fake function descriptor
676 // +-------------------+ >
677 // | target descriptor |
678 // +-------------------+
680 // +-------------------+
684 .global __ia64_trampoline
685 .proc __ia64_trampoline
// (The trampoline entry label and body are not shown in this excerpt.)
702 .endp __ia64_trampoline