1 /* Assembly functions for the Xtensa version of libgcc1.
2 Copyright (C) 2001, 2002, 2003, 2005, 2006, 2007
3 Free Software Foundation, Inc.
4 Contributed by Bob Wilson (bwilson@tensilica.com) at Tensilica.
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 2, or (at your option) any later version.
13 In addition to the permissions in the GNU General Public License, the
14 Free Software Foundation gives you unlimited permission to link the
15 compiled version of this file into combinations with other programs,
16 and to distribute those combinations without any restriction coming
17 from the use of this file. (The General Public License restrictions
18 do apply in other respects; for example, they cover modification of
19 the file, and distribution when not linked into a combined executable.)
22 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
23 WARRANTY; without even the implied warranty of MERCHANTABILITY or
24 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 You should have received a copy of the GNU General Public License
28 along with GCC; see the file COPYING. If not, write to the Free
29 Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
32 #include "xtensa-config.h"
34 /* Define macros for the ABS and ADDX* instructions to handle cases
35 where they are not included in the Xtensa processor configuration. */
/* do_abs: \dst = abs (\src), emulating the ABS instruction for Xtensa
   configurations that omit it; \tmp is a scratch register.
   NOTE(review): only a fragment is visible in this chunk -- the negate
   step, the final select into \dst, and .endm are not shown.  */
37 .macro do_abs dst, src, tmp
/* Keep \src unchanged in \tmp when it is already non-negative.  */
42 movgez \tmp, \src, \src
/* do_addx2/do_addx4/do_addx8: \dst = (\as << 1|2|3) + \at, emulating the
   ADDX2/ADDX4/ADDX8 instructions when the processor configuration lacks
   them; \tmp is scratch.  NOTE(review): only the macro headers are
   visible in this chunk -- the shift/add bodies and .endm are missing.  */
47 .macro do_addx2 dst, as, at, tmp
56 .macro do_addx4 dst, as, at, tmp
65 .macro do_addx8 dst, as, at, tmp
74 /* Define macros for leaf function entry and return, supporting either the
75 standard register windowed ABI or the non-windowed call0 ABI. These
76 macros do not allocate any extra stack space, so they only work for
77 leaf functions that do not need to spill anything to the stack. */
/* leaf_entry: open a stack frame of \size bytes using \reg as the stack
   pointer, via ENTRY under the windowed ABI (instruction body not visible
   in this chunk).  */
79 .macro leaf_entry reg, size
80 #if XCHAL_HAVE_WINDOWED && !__XTENSA_CALL0_ABI__
/* NOTE(review): the second #if below presumably guards the matching
   leaf_return macro; its .macro header is not visible in this chunk.  */
88 #if XCHAL_HAVE_WINDOWED && !__XTENSA_CALL0_ABI__
/* __mulsi3: 32-bit integer multiply (a2 = a2 * a3).  Selects an
   implementation by hardware: MUL32, MUL16, MAC16, or a pure shift-add
   fallback.  Only the fallback path is (partially) visible here.  */
99 .type __mulsi3, @function
106 #elif XCHAL_HAVE_MUL16
122 #elif XCHAL_HAVE_MAC16
131 #else /* !MUL32 && !MUL16 && !MAC16 */
133 /* Multiply one bit at a time, but unroll the loop 4x to better
134 exploit the addx instructions and avoid overhead.
135 Peel the first iteration to save a cycle on init. */
137 /* Avoid negative numbers. */
138 xor a5, a2, a3 /* Top bit is 1 if one input is negative. */
142 /* Swap so the second argument is smaller. */
/* a7 holds a2 - a3 at this point (computation not visible in chunk):
   select max into a4 and min into a3 so the loop runs fewer iterations.  */
145 movgez a4, a2, a7 /* a4 = max (a2, a3) */
146 movltz a3, a2, a7 /* a3 = min (a2, a3) */
/* Peeled first iteration: accumulate one partial product per bit of a3
   using the addx emulation macros.  */
152 do_addx2 a7, a4, a2, a7
156 do_addx4 a7, a4, a2, a7
160 do_addx8 a7, a4, a2, a7
164 bgeui a3, 16, .Lmult_main_loop
/* Main loop body: same 4-bit unrolled accumulate, repeated while at
   least 16 remains in the (shrinking) multiplier a3.  */
178 do_addx2 a7, a4, a2, a7
182 do_addx4 a7, a4, a2, a7
186 do_addx8 a7, a4, a2, a7
190 bgeui a3, 16, .Lmult_main_loop
195 #endif /* !MUL32 && !MUL16 && !MAC16 */
198 .size __mulsi3, . - __mulsi3
200 #endif /* L_mulsi3 */
/* XCHAL_NO_MUL is set when the configuration has no integer multiply
   hardware at all, so __umulsidi3 must fall back to a software helper.  */
205 #if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
206 #define XCHAL_NO_MUL 1
/* __umulsidi3: unsigned 32x32 -> 64-bit multiply.  Inputs in a2/a3;
   64-bit result returned with endian-dependent word order (wh/wl macros,
   defined in lines not visible here, select which of a2/a3 is high).
   The product is built from four 16x16 partial products pp0..pp3 when
   no MUL32_HIGH instruction is available.  */
211 .type __umulsidi3, @function
213 #if __XTENSA_CALL0_ABI__
221 /* This is not really a leaf function; allocate enough stack space
222 to allow CALL12s to a helper function. */
234 #endif /* __XTENSA_EB__ */
236 /* This code is taken from the mulsf3 routine in ieee754-sf.S.
237 See more comments there. */
239 #if XCHAL_HAVE_MUL32_HIGH
244 #else /* ! MUL32_HIGH */
246 #if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
247 /* a0 and a8 will be clobbered by calling the multiply function
248 but a8 is not used here and need not be saved. */
252 #if XCHAL_HAVE_MUL16 || XCHAL_HAVE_MUL32
257 /* Get the high halves of the inputs into registers. */
264 #if XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MUL16
265 /* Clear the high halves of the inputs. This does not matter
266 for MUL16 because the high bits are ignored. */
270 #endif /* MUL16 || MUL32 */
/* do_mul(dst, xreg, xhalf, yreg, yhalf): 16x16 unsigned multiply of the
   selected (l/h) halves of xreg and yreg into dst; one definition per
   available multiply facility.  */
275 #define do_mul(dst, xreg, xhalf, yreg, yhalf) \
276 mul16u dst, xreg ## xhalf, yreg ## yhalf
278 #elif XCHAL_HAVE_MUL32
280 #define do_mul(dst, xreg, xhalf, yreg, yhalf) \
281 mull dst, xreg ## xhalf, yreg ## yhalf
283 #elif XCHAL_HAVE_MAC16
285 /* The preprocessor insists on inserting a space when concatenating after
286 a period in the definition of do_mul below. These macros are a workaround
287 using underscores instead of periods when doing the concatenation. */
288 #define umul_aa_ll umul.aa.ll
289 #define umul_aa_lh umul.aa.lh
290 #define umul_aa_hl umul.aa.hl
291 #define umul_aa_hh umul.aa.hh
293 #define do_mul(dst, xreg, xhalf, yreg, yhalf) \
294 umul_aa_ ## xhalf ## yhalf xreg, yreg; \
297 #else /* no multiply hardware */
/* With no hardware multiply, marshal the 16-bit halves into the helper's
   argument registers and call .Lmul_mulsi3 (custom ABI, see below).  */
299 #define set_arg_l(dst, src) \
300 extui dst, src, 0, 16
301 #define set_arg_h(dst, src) \
304 #if __XTENSA_CALL0_ABI__
305 #define do_mul(dst, xreg, xhalf, yreg, yhalf) \
306 set_arg_ ## xhalf (a13, xreg); \
307 set_arg_ ## yhalf (a14, yreg); \
308 call0 .Lmul_mulsi3; \
311 #define do_mul(dst, xreg, xhalf, yreg, yhalf) \
312 set_arg_ ## xhalf (a14, xreg); \
313 set_arg_ ## yhalf (a15, yreg); \
314 call12 .Lmul_mulsi3; \
316 #endif /* __XTENSA_CALL0_ABI__ */
318 #endif /* no multiply hardware */
320 /* Add pp1 and pp2 into a6 with carry-out in a9. */
321 do_mul(a6, a2, l, a3, h) /* pp 1 */
322 do_mul(a11, a2, h, a3, l) /* pp 2 */
328 /* Shift the high half of a9/a6 into position in a9. Note that
329 this value can be safely incremented without any carry-outs. */
333 /* Compute the low word into a6. */
334 do_mul(a11, a2, l, a3, l) /* pp 0 */
340 /* Compute the high word into wh. */
341 do_mul(wh, a2, h, a3, h) /* pp 3 */
345 #endif /* !MUL32_HIGH */
347 #if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
348 /* Restore the original return address. */
351 #if __XTENSA_CALL0_ABI__
362 /* For Xtensa processors with no multiply hardware, this simplified
363 version of _mulsi3 is used for multiplying 16-bit chunks of
364 the floating-point mantissas. When using CALL0, this function
365 uses a custom ABI: the inputs are passed in a13 and a14, the
366 result is returned in a12, and a8 and a15 are clobbered. */
/* mul_mulsi3_body: shift-add multiply of \src1 * \src2 into \dst,
   processed four multiplier bits per (partially visible) iteration;
   \tmp1/\tmp2 are scratch.  Each step conditionally adds a shifted copy
   of \src2 depending on one bit of \src1.  */
370 .macro mul_mulsi3_body dst, src1, src2, tmp1, tmp2
372 1: add \tmp1, \src2, \dst
373 extui \tmp2, \src1, 0, 1
374 movnez \dst, \tmp1, \tmp2
376 do_addx2 \tmp1, \src2, \dst, \tmp1
377 extui \tmp2, \src1, 1, 1
378 movnez \dst, \tmp1, \tmp2
380 do_addx4 \tmp1, \src2, \dst, \tmp1
381 extui \tmp2, \src1, 2, 1
382 movnez \dst, \tmp1, \tmp2
384 do_addx8 \tmp1, \src2, \dst, \tmp1
385 extui \tmp2, \src1, 3, 1
386 movnez \dst, \tmp1, \tmp2
/* Instantiate the helper with the register set matching the ABI:
   custom a12/a13/a14 registers under CALL0, standard a2/a3 otherwise.  */
392 #if __XTENSA_CALL0_ABI__
393 mul_mulsi3_body a12, a13, a14, a15, a8
395 /* The result will be written into a2, so save that argument in a4. */
397 mul_mulsi3_body a2, a4, a3, a5, a6
400 #endif /* XCHAL_NO_MUL */
402 .size __umulsidi3, . - __umulsidi3
404 #endif /* L_umulsidi3 */
407 /* Define a macro for the NSAU (unsigned normalize shift amount)
408 instruction, which computes the number of leading zero bits,
409 to handle cases where it is not included in the Xtensa processor
/* do_nsau: \cnt = count of leading zero bits in \val, emulated via a
   binary search on 16-bit/8-bit chunks plus the __nsau_data lookup
   table; \tmp and \a are scratch.  Fragmentary in this chunk.  */
412 .macro do_nsau cnt, val, tmp, a
418 extui \tmp, \a, 16, 16
423 extui \tmp, \a, 24, 8
428 movi \tmp, __nsau_data
433 #endif /* !XCHAL_HAVE_NSA */
/* __nsau_data: 256-entry table mapping a byte value to its number of
   leading zero bits within 8 bits (8 for 0, 7 for 1, ... 0 for >= 128).
   Used by do_nsau when the NSA/NSAU instructions are not configured.  */
440 .type __nsau_data, @object
443 .byte 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4
444 .byte 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
445 .byte 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
446 .byte 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
447 .byte 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
448 .byte 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
449 .byte 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
450 .byte 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
451 .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
452 .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
453 .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
454 .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
455 .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
456 .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
457 .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
458 .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
459 #endif /* !XCHAL_HAVE_NSA */
460 .size __nsau_data, . - __nsau_data
/* __clzsi2: a2 = number of leading zero bits in a2 (a3/a4 scratch).
   Entry/return sequences are not visible in this chunk.  */
468 .type __clzsi2, @function
471 do_nsau a2, a2, a3, a4
473 .size __clzsi2, . - __clzsi2
475 #endif /* L_clzsi2 */
/* __ctzsi2: a2 = number of trailing zero bits in a2.  The lowest set
   bit is isolated into a3 (isolation step not visible in this chunk)
   and its position recovered via do_nsau.  */
481 .type __ctzsi2, @function
486 do_nsau a2, a3, a4, a5
490 .size __ctzsi2, . - __ctzsi2
492 #endif /* L_ctzsi2 */
/* __ffssi2: a2 = index of the least significant set bit of a2, where
   ffs(0) == 0 and the lowest bit is index 1.  Same structure as
   __ctzsi2; the isolation and adjustment steps are not visible here.  */
498 .type __ffssi2, @function
503 do_nsau a2, a3, a4, a5
507 .size __ffssi2, . - __ffssi2
509 #endif /* L_ffssi2 */
/* __udivsi3: unsigned 32-bit division, a2 = a2 / a3.  Software
   shift-subtract algorithm used when there is no DIV32 hardware: align
   the divisor to the dividend, then produce one quotient bit per
   iteration.  Division by zero traps via an illegal instruction.  */
515 .type __udivsi3, @function
521 bltui a3, 2, .Lle_one /* check if the divisor <= 1 */
523 mov a6, a2 /* keep dividend in a6 */
524 do_nsau a5, a6, a2, a7 /* dividend_shift = nsau (dividend) */
525 do_nsau a4, a3, a2, a7 /* divisor_shift = nsau (divisor) */
526 bgeu a5, a4, .Lspecial
528 sub a4, a4, a5 /* count = divisor_shift - dividend_shift */
530 sll a3, a3 /* divisor <<= count */
531 movi a2, 0 /* quotient = 0 */
533 /* test-subtract-and-shift loop; one quotient bit on each iteration */
535 loopnez a4, .Lloopend
536 #endif /* XCHAL_HAVE_LOOPS */
538 bltu a6, a3, .Lzerobit
544 #if !XCHAL_HAVE_LOOPS
547 #endif /* !XCHAL_HAVE_LOOPS */
550 bltu a6, a3, .Lreturn
551 addi a2, a2, 1 /* increment quotient if dividend >= divisor */
556 beqz a3, .Lerror /* if divisor == 1, return the dividend */
560 /* return dividend >= divisor */
561 bltu a6, a3, .Lreturn0
566 /* Divide by zero: Use an illegal instruction to force an exception.
567 The subsequent "DIV0" string can be recognized by the exception
568 handler to identify the real cause of the exception. */
574 #endif /* XCHAL_HAVE_DIV32 */
576 .size __udivsi3, . - __udivsi3
/* __divsi3: signed 32-bit division, a2 = a2 / a3.  Computes the sign
   of the result from the XOR of the operands, divides the absolute
   values with the same shift-subtract loop as __udivsi3, then negates
   the quotient if the signs differed.  Division by zero traps.  */
584 .type __divsi3, @function
590 xor a7, a2, a3 /* sign = dividend ^ divisor */
591 do_abs a6, a2, a4 /* udividend = abs (dividend) */
592 do_abs a3, a3, a4 /* udivisor = abs (divisor) */
593 bltui a3, 2, .Lle_one /* check if udivisor <= 1 */
594 do_nsau a5, a6, a2, a8 /* udividend_shift = nsau (udividend) */
595 do_nsau a4, a3, a2, a8 /* udivisor_shift = nsau (udivisor) */
596 bgeu a5, a4, .Lspecial
598 sub a4, a4, a5 /* count = udivisor_shift - udividend_shift */
600 sll a3, a3 /* udivisor <<= count */
601 movi a2, 0 /* quotient = 0 */
603 /* test-subtract-and-shift loop; one quotient bit on each iteration */
605 loopnez a4, .Lloopend
606 #endif /* XCHAL_HAVE_LOOPS */
608 bltu a6, a3, .Lzerobit
614 #if !XCHAL_HAVE_LOOPS
617 #endif /* !XCHAL_HAVE_LOOPS */
620 bltu a6, a3, .Lreturn
621 addi a2, a2, 1 /* increment if udividend >= udivisor */
624 movltz a2, a5, a7 /* return (sign < 0) ? -quotient : quotient */
629 neg a2, a6 /* if udivisor == 1, then return... */
630 movgez a2, a6, a7 /* (sign < 0) ? -udividend : udividend */
634 bltu a6, a3, .Lreturn0 /* if dividend < divisor, return 0 */
637 movltz a2, a4, a7 /* else return (sign < 0) ? -1 : 1 */
641 /* Divide by zero: Use an illegal instruction to force an exception.
642 The subsequent "DIV0" string can be recognized by the exception
643 handler to identify the real cause of the exception. */
649 #endif /* XCHAL_HAVE_DIV32 */
651 .size __divsi3, . - __divsi3
653 #endif /* L_divsi3 */
/* __umodsi3: unsigned 32-bit remainder, a2 = a2 % a3.  Same
   shift-subtract structure as __udivsi3 but the dividend register a2 is
   reduced in place and no quotient is accumulated.  Division by zero
   traps via an illegal instruction.  */
659 .type __umodsi3, @function
665 bltui a3, 2, .Lle_one /* check if the divisor is <= 1 */
667 do_nsau a5, a2, a6, a7 /* dividend_shift = nsau (dividend) */
668 do_nsau a4, a3, a6, a7 /* divisor_shift = nsau (divisor) */
669 bgeu a5, a4, .Lspecial
671 sub a4, a4, a5 /* count = divisor_shift - dividend_shift */
673 sll a3, a3 /* divisor <<= count */
675 /* test-subtract-and-shift loop */
677 loopnez a4, .Lloopend
678 #endif /* XCHAL_HAVE_LOOPS */
680 bltu a2, a3, .Lzerobit
684 #if !XCHAL_HAVE_LOOPS
687 #endif /* !XCHAL_HAVE_LOOPS */
691 bltu a2, a3, .Lreturn
692 sub a2, a2, a3 /* subtract once more if dividend >= divisor */
699 /* Divide by zero: Use an illegal instruction to force an exception.
700 The subsequent "DIV0" string can be recognized by the exception
701 handler to identify the real cause of the exception. */
707 #endif /* XCHAL_HAVE_DIV32 */
709 .size __umodsi3, . - __umodsi3
711 #endif /* L_umodsi3 */
/* __modsi3: signed 32-bit remainder, a2 = a2 % a3.  Reduces the
   absolute values with the __umodsi3 loop, then negates the result when
   the original dividend (saved in a7) was negative, matching C's
   truncated-division remainder sign.  Division by zero traps.  */
717 .type __modsi3, @function
723 mov a7, a2 /* save original (signed) dividend */
724 do_abs a2, a2, a4 /* udividend = abs (dividend) */
725 do_abs a3, a3, a4 /* udivisor = abs (divisor) */
726 bltui a3, 2, .Lle_one /* check if udivisor <= 1 */
727 do_nsau a5, a2, a6, a8 /* udividend_shift = nsau (udividend) */
728 do_nsau a4, a3, a6, a8 /* udivisor_shift = nsau (udivisor) */
729 bgeu a5, a4, .Lspecial
731 sub a4, a4, a5 /* count = udivisor_shift - udividend_shift */
733 sll a3, a3 /* udivisor <<= count */
735 /* test-subtract-and-shift loop */
737 loopnez a4, .Lloopend
738 #endif /* XCHAL_HAVE_LOOPS */
740 bltu a2, a3, .Lzerobit
744 #if !XCHAL_HAVE_LOOPS
747 #endif /* !XCHAL_HAVE_LOOPS */
751 bltu a2, a3, .Lreturn
752 sub a2, a2, a3 /* subtract again if udividend >= udivisor */
755 neg a2, a2 /* if (dividend < 0), return -udividend */
762 /* Divide by zero: Use an illegal instruction to force an exception.
763 The subsequent "DIV0" string can be recognized by the exception
764 handler to identify the real cause of the exception. */
770 #endif /* XCHAL_HAVE_DIV32 */
772 .size __modsi3, . - __modsi3
774 #endif /* L_modsi3 */
783 #endif /* __XTENSA_EB__ */
/* __ashldi3: 64-bit left shift of the a2/a3 pair by a4 bits; word order
   is endian-dependent (selected by the #if above, not visible here).
   Shifts >= 32 take the .Llow_only path where only one result word is
   nonzero.  Shift bodies are not visible in this chunk.  */
789 .type __ashldi3, @function
793 bgei a4, 32, .Llow_only
802 .size __ashldi3, . - __ashldi3
804 #endif /* L_ashldi3 */
/* __ashrdi3: 64-bit arithmetic (sign-propagating) right shift of the
   a2/a3 pair by a4 bits.  Shifts >= 32 take the .Lhigh_only path.
   Shift bodies are not visible in this chunk.  */
810 .type __ashrdi3, @function
814 bgei a4, 32, .Lhigh_only
823 .size __ashrdi3, . - __ashrdi3
825 #endif /* L_ashrdi3 */
/* __lshrdi3: 64-bit logical (zero-filling) right shift of the a2/a3
   pair by a4 bits.  Shifts >= 32 take the .Lhigh_only1 path (label
   suffixed to avoid clashing with __ashrdi3's .Lhigh_only).  Shift
   bodies are not visible in this chunk.  */
831 .type __lshrdi3, @function
835 bgei a4, 32, .Lhigh_only1
844 .size __lshrdi3, . - __lshrdi3
846 #endif /* L_lshrdi3 */
849 #include "ieee754-df.S"
850 #include "ieee754-sf.S"