/* -*- Mode: Asm -*- */
;; Copyright (C) 2012-2016 Free Software Foundation, Inc.
;; Contributed by Sean D'Epagnier (sean@depagnier.com)
;;                Georg-Johann Lay (avr@gjlay.de)

;; This file is free software; you can redistribute it and/or modify it
;; under the terms of the GNU General Public License as published by the
;; Free Software Foundation; either version 3, or (at your option) any
;; later version.

;; In addition to the permissions in the GNU General Public License, the
;; Free Software Foundation gives you unlimited permission to link the
;; compiled version of this file into combinations with other programs,
;; and to distribute those combinations without any restriction coming
;; from the use of this file.  (The General Public License restrictions
;; do apply in other respects; for example, they cover modification of
;; the file, and distribution when not linked into a combine
;; executable.)

;; This file is distributed in the hope that it will be useful, but
;; WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;; General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with this program; see the file COPYING.  If not, write to
;; the Free Software Foundation, 51 Franklin Street, Fifth Floor,
;; Boston, MA 02110-1301, USA.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Fixed point library routines for AVR
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#if defined __AVR_TINY__
#define __zero_reg__ r17
#define __tmp_reg__ r16
#else
#define __zero_reg__ r1
#define __tmp_reg__ r0
#endif

.section .text.libgcc.fixed, "ax", @progbits
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Conversions to float
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

#if defined (L_fractqqsf)
    ;; Move in place for SA -> SF conversion
#endif  /* L_fractqqsf */

#if defined (L_fractuqqsf)
    ;; Move in place for USA -> SF conversion
#endif  /* L_fractuqqsf */

#if defined (L_fracthqsf)
    ;; Move in place for SA -> SF conversion
#endif  /* L_fracthqsf */

#if defined (L_fractuhqsf)
    ;; Move in place for USA -> SF conversion
#endif  /* L_fractuhqsf */

#if defined (L_fracthasf)
    ;; Move in place for SA -> SF conversion
#endif  /* L_fracthasf */

#if defined (L_fractuhasf)
    ;; Move in place for USA -> SF conversion
#endif  /* L_fractuhasf */

#if defined (L_fractsqsf)
    ;; Divide non-zero results by 2^31 to move the
    ;; decimal point into place
    subi    r24, exp_lo (31)
    sbci    r25, exp_hi (31)
#endif  /* L_fractsqsf */
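
;; A C sketch (not part of this library) of the exponent trick used above:
;; subtracting k from the biased exponent of a non-zero, normal binary32
;; value divides it by 2^k, so the int32 -> float conversion (elided here)
;; can be reused and the decimal point moved afterwards.  Names below are
;; illustrative only.
;;
;;    #include <stdint.h>
;;    #include <string.h>
;;
;;    static float div_pow2 (float x, int k)
;;    {
;;      uint32_t u;
;;      memcpy (&u, &x, sizeof (u));
;;      if (u << 1)                    /* not +0.0 / -0.0 */
;;        u -= (uint32_t) k << 23;     /* exponent field starts at bit 23 */
;;      memcpy (&x, &u, sizeof (u));
;;      return x;
;;    }
;;
;;    /* SQ (s.31) to float: convert the raw integer, then scale by 2^-31. */
;;    static float fractsqsf_model (int32_t a)
;;    {
;;      return div_pow2 ((float) a, 31);
;;    }
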
#if defined (L_fractusqsf)
    ;; Divide non-zero results by 2^32 to move the
    ;; decimal point into place
    cpse    r25, __zero_reg__
    subi    r25, exp_hi (32)
#endif  /* L_fractusqsf */

#if defined (L_fractsasf)
    ;; Divide non-zero results by 2^15 to move the
    ;; decimal point into place
    subi    r24, exp_lo (15)
    sbci    r25, exp_hi (15)
#endif  /* L_fractsasf */

#if defined (L_fractusasf)
    ;; Divide non-zero results by 2^16 to move the
    ;; decimal point into place
    cpse    r25, __zero_reg__
    subi    r25, exp_hi (16)
#endif  /* L_fractusasf */
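
;; Taken together, the conversions above implement "fixed-point to float"
;; as "convert the raw integer, then scale by 2^-FBIT".  A C sketch of the
;; intended values for some of the types handled here (reference models
;; only, not the asm register interfaces):
;;
;;    #include <stdint.h>
;;
;;    static float qq_to_float  (int8_t   r) { return (float) r / 128.0f;        }  /* QQ:  s.7    */
;;    static float uqq_to_float (uint8_t  r) { return (float) r / 256.0f;        }  /* UQQ: .8     */
;;    static float ha_to_float  (int16_t  r) { return (float) r / 128.0f;        }  /* HA:  s8.7   */
;;    static float sq_to_float  (int32_t  r) { return (float) r / 2147483648.0f; }  /* SQ:  s.31   */
;;    static float usq_to_float (uint32_t r) { return (float) r / 4294967296.0f; }  /* USQ: .32    */
;;    static float sa_to_float  (int32_t  r) { return (float) r / 32768.0f;      }  /* SA:  s16.15 */
;;    static float usa_to_float (uint32_t r) { return (float) r / 65536.0f;      }  /* USA: 16.16  */
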
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Conversions from float
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

#if defined (L_fractsfqq)
    ;; Multiply with 2^{24+7} to get a QQ result in r25
    subi    r24, exp_lo (-31)
    sbci    r25, exp_hi (-31)
#endif  /* L_fractsfqq */

#if defined (L_fractsfuqq)
    ;; Multiply with 2^{24+8} to get a UQQ result in r25
    subi    r25, exp_hi (-32)
#endif  /* L_fractsfuqq */

#if defined (L_fractsfha)
    ;; Multiply with 2^{16+7} to get a HA result in r25:r24
    subi    r24, exp_lo (-23)
    sbci    r25, exp_hi (-23)
#endif  /* L_fractsfha */

#if defined (L_fractsfuha)
    ;; Multiply with 2^24 to get a UHA result in r25:r24
    subi    r25, exp_hi (-24)
#endif  /* L_fractsfuha */

#if defined (L_fractsfhq)
    ;; Multiply with 2^{16+15} to get a HQ result in r25:r24
    ;; resp. with 2^31 to get a SQ result in r25:r22
    subi    r24, exp_lo (-31)
    sbci    r25, exp_hi (-31)
#endif  /* L_fractsfhq */

#if defined (L_fractsfuhq)
    ;; Multiply with 2^{16+16} to get a UHQ result in r25:r24
    ;; resp. with 2^32 to get a USQ result in r25:r22
    subi    r25, exp_hi (-32)
#endif  /* L_fractsfuhq */

#if defined (L_fractsfsa)
    ;; Multiply with 2^15 to get a SA result in r25:r22
    subi    r24, exp_lo (-15)
    sbci    r25, exp_hi (-15)
#endif  /* L_fractsfsa */

#if defined (L_fractsfusa)
    ;; Multiply with 2^16 to get a USA result in r25:r22
    subi    r25, exp_hi (-16)
#endif  /* L_fractsfusa */
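
;; The comments above suggest the reverse direction: pre-scale the float by
;; 2^FBIT (plus a shift into the top bytes of the 4-byte integer result) by
;; adjusting its exponent, then let a truncating float-to-integer conversion
;; (elided here) do the rest.  A hedged C sketch of two cases, assuming
;; in-range inputs; names are illustrative only.
;;
;;    #include <stdint.h>
;;
;;    static int8_t fractsfqq_model (float x)
;;    {
;;      int32_t t = (int32_t) (x * 2147483648.0f);  /* scale by 2^{24+7} = 2^31 */
;;      return (int8_t) (t >> 24);                  /* QQ result = high byte    */
;;    }
;;
;;    static int32_t fractsfsa_model (float x)      /* SA: only the 2^15 scaling */
;;    {
;;      return (int32_t) (x * 32768.0f);
;;    }
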
;; For multiplication the functions here are called directly from
;; avr-fixed.md instead of using the standard libcall mechanisms.
;; This can make better code because GCC knows exactly which
;; of the call-used registers (not all of them) are clobbered.
/*******************************************************
    Fractional Multiplication 8 x 8 without MUL
*******************************************************/

#if defined (L_mulqq3) && !defined (__AVR_HAVE_MUL__)
;;; Clobbers: __tmp_reg__, R22, R24, R25
    ;; TR 18037 requires that (-1) * (-1) does not overflow
    ;; The only input that can produce -1 is (-1)^2.
#endif  /* L_mulqq3 && ! HAVE_MUL */
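
;; A C sketch of the Q.7 multiply semantics of this module: the raw product
;; has 14 fraction bits, the result keeps 7, and the single overflowing
;; input combination (-1) * (-1) saturates as TR 18037 requires.  This is a
;; truncating model; the asm's exact rounding is not shown in this excerpt.
;;
;;    #include <stdint.h>
;;
;;    static int8_t mulqq_model (int8_t a, int8_t b)
;;    {
;;      if (a == -128 && b == -128)       /* (-1) * (-1) would be +1.0  */
;;        return 127;                     /* saturate to 1 - 2^-7       */
;;      return (int8_t) ((a * b) >> 7);   /* drop 7 of 14 fraction bits */
;;    }
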
/*******************************************************
    Fractional Multiply .16 x .16 with and without MUL
*******************************************************/

#if defined (L_mulhq3)
;;; Same code with and without MUL, but the interfaces differ:
;;; no MUL: (R25:R24) = (R22:R23) * (R24:R25)
;;;         Clobbers: ABI, called by optabs
;;; MUL:    (R25:R24) = (R19:R18) * (R27:R26)
;;;         Clobbers: __tmp_reg__, R22, R23
;;; Rounding: -0.5 LSB <= error <= 0.5 LSB
    ;; Shift result into place
1:  ;; Overflow.  TR 18037 requires (-1)^2 not to overflow
    ldi     r24, lo8 (0x7fff)
    ldi     r25, hi8 (0x7fff)
#endif  /* defined (L_mulhq3) */
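
;; A C sketch of the Q.15 multiply with the rounding and the (-1)^2
;; handling described above.  Round-half-up model: the asm guarantees
;; -0.5 LSB <= error <= 0.5 LSB, its exact tie behaviour may differ.
;;
;;    #include <stdint.h>
;;
;;    static int16_t mulhq_model (int16_t a, int16_t b)
;;    {
;;      if (a == INT16_MIN && b == INT16_MIN)        /* (-1)^2: saturate to 0x7fff */
;;        return INT16_MAX;
;;      int32_t p = (int32_t) a * b;                 /* 30 fraction bits */
;;      return (int16_t) ((p + (1L << 14)) >> 15);   /* round, keep 15   */
;;    }
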
#if defined (L_muluhq3)
;;; Same code with and without MUL, but the interfaces differ:
;;; no MUL: (R25:R24) *= (R23:R22)
;;;         Clobbers: ABI, called by optabs
;;; MUL:    (R25:R24) = (R19:R18) * (R27:R26)
;;;         Clobbers: __tmp_reg__, R22, R23
;;; Rounding: -0.5 LSB < error <= 0.5 LSB
#endif  /* L_muluhq3 */
/*******************************************************
    Fixed Multiply 8.8 x 8.8 with and without MUL
*******************************************************/

#if defined (L_mulha3)
;;; Same code with and without MUL, but the interfaces differ:
;;; no MUL: (R25:R24) = (R22:R23) * (R24:R25)
;;;         Clobbers: ABI, called by optabs
;;; MUL:    (R25:R24) = (R19:R18) * (R27:R26)
;;;         Clobbers: __tmp_reg__, R22, R23
;;; Rounding: -0.5 LSB <= error <= 0.5 LSB
#endif  /* L_mulha3 */

#if defined (L_muluha3)
;;; Same code with and without MUL, but the interfaces differ:
;;; no MUL: (R25:R24) *= (R23:R22)
;;;         Clobbers: ABI, called by optabs
;;; MUL:    (R25:R24) = (R19:R18) * (R27:R26)
;;;         Clobbers: __tmp_reg__, R22, R23
;;; Rounding: -0.5 LSB < error <= 0.5 LSB
#endif  /* L_muluha3 */

#if defined (L_muluha3_round)
DEFUN __muluha3_round
    ;; Shift result into place
#endif  /* L_muluha3_round */
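
;; For the 8.8 types the product of the two 16-bit raw values is a 16.16
;; value, and "shift result into place" keeps its middle 16 bits, with
;; rounding.  A C sketch of the unsigned, non-saturating case (overflow of
;; the 8.8 range simply wraps, as in the routines above):
;;
;;    #include <stdint.h>
;;
;;    static uint16_t muluha_model (uint16_t a, uint16_t b)
;;    {
;;      uint32_t p = (uint32_t) a * b;          /* 16.16 product            */
;;      return (uint16_t) ((p + 0x80) >> 8);    /* round, keep the 8.8 part */
;;    }
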
/*******************************************************
    Fixed Multiplication 16.16 x 16.16
*******************************************************/

;; Bits outside the result (below LSB), used in the signed version
#define GUARD __tmp_reg__

#if defined (__AVR_HAVE_MUL__)

#if defined (L_mulusa3)
;;; (C3:C0) = (A3:A0) * (B3:B0)
;;; Round for last digit iff T = 1
;;; Return guard bits in GUARD (__tmp_reg__).
;;; Rounding, T = 0: -1.0 LSB < error <= 0 LSB
;;; Rounding, T = 1: -0.5 LSB < error <= 0.5 LSB
DEFUN __mulusa3_round
    ;; Some of the MUL instructions have LSBs outside the result.
    ;; Keep track of these LSBs in order to tame the rounding error.
    ;; Use C2/C3 for these LSBs.

    mul A0, B0 $ movw C2, r0

    mul A1, B0 $ add C3, r0 $ adc C0, r1
    mul A0, B1 $ add C3, r0 $ adc C0, r1 $ rol C1

    ;; Round if T = 1.  Store the guard bits outside the result; they are
    ;; used for rounding and for the left shift by the signed version
    ;; (function below).

    ;; The following MULs don't have LSBs outside the result.
    ;; C2/C3 is the high part.

    mul A0, B2 $ add C0, r0 $ adc C1, r1 $ sbc C2, C2
    mul A1, B1 $ add C0, r0 $ adc C1, r1 $ sbci C2, 0
    mul A2, B0 $ add C0, r0 $ adc C1, r1 $ sbci C2, 0

    mul A0, B3 $ add C1, r0 $ adc C2, r1 $ sbc C3, C3
    mul A1, B2 $ add C1, r0 $ adc C2, r1 $ sbci C3, 0
    mul A2, B1 $ add C1, r0 $ adc C2, r1 $ sbci C3, 0
    mul A3, B0 $ add C1, r0 $ adc C2, r1 $ sbci C3, 0

    mul A1, B3 $ add C2, r0 $ adc C3, r1
    mul A2, B2 $ add C2, r0 $ adc C3, r1
    mul A3, B1 $ add C2, r0 $ adc C3, r1

    mul A2, B3 $ add C3, r0
    mul A3, B2 $ add C3, r0

    ;; Guard bits used in the signed version below.
#endif  /* L_mulusa3 */
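
;; The MUL-based routine above is a schoolbook 32 x 32 multiplication that
;; keeps bits 16..47 of the 64-bit product, with extra guard bits below
;; them.  A C sketch of the value it computes, with the optional rounding
;; selected by the T flag (idealised; the guard-bit bookkeeping itself is
;; not modelled):
;;
;;    #include <stdint.h>
;;
;;    static uint32_t mulusa_round_model (uint32_t a, uint32_t b, int t)
;;    {
;;      uint64_t p = (uint64_t) a * b;    /* 32.32 product                          */
;;      if (t)
;;        p += 1u << 15;                  /* round: add 0.5 LSB of the 16.16 result */
;;      return (uint32_t) (p >> 16);      /* keep bits 16..47                       */
;;    }
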
#if defined (L_mulsa3)
;;; (C3:C0) = (A3:A0) * (B3:B0)
;;; Clobbers: __tmp_reg__, T
;;; Rounding: -0.5 LSB <= error <= 0.5 LSB
    XCALL   __mulusa3_round
    ;; A posteriori sign extension of the operands
    ;; Shift 1 bit left to adjust for 15 fractional bits
#endif  /* L_mulsa3 */
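
;; "A posteriori sign extension" works because, in two's complement,
;; reading a negative A as unsigned adds 2^32 to it, so the unsigned
;; product is too large by B * 2^32 (and by A * 2^32 when B is negative);
;; only the upper half of the product is affected.  A C sketch of that
;; identity, applied to the whole 64-bit product for clarity, followed by
;; the 1-bit shift for the 15 fraction bits (wrapping, round-half-up
;; model):
;;
;;    #include <stdint.h>
;;
;;    static int32_t mulsa_model (int32_t a, int32_t b)       /* s16.15 x s16.15 */
;;    {
;;      uint64_t p = (uint64_t) (uint32_t) a * (uint32_t) b;  /* unsigned product   */
;;      if (a < 0) p -= (uint64_t) (uint32_t) b << 32;        /* undo the 2^32 bias */
;;      if (b < 0) p -= (uint64_t) (uint32_t) a << 32;        /* of each operand    */
;;      /* p now equals the signed product modulo 2^64; round and keep
;;         15 fraction bits.  Overflow of the s16.15 range wraps.  */
;;      return (int32_t) (uint32_t) ((p + (1u << 14)) >> 15);
;;    }
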
#else /* __AVR_HAVE_MUL__ */

#if defined (L_mulsa3)
;;; (R25:R22) *= (R21:R18)
;;; Clobbers: ABI, called by optabs
;;; Rounding: -1 LSB <= error <= 1 LSB
    XCALL   __mulusa3_round
    ;; A1, A0 survived in R27:R26
    ;; sign-extend A.  A3 survived in R31
    ;; Shift 1 bit left to adjust for 15 fractional bits
#endif  /* L_mulsa3 */
#if defined (L_mulusa3)
;;; (R25:R22) *= (R21:R18)
;;; Clobbers: ABI, called by optabs
;;; Rounding: -1 LSB <= error <= 1 LSB
;;; A[] survives in 26, 27, 30, 31
;;; Also used by __mulsa3 with T = 0
;;; Return Guard bits in GUARD (__tmp_reg__), used by signed version.
DEFUN __mulusa3_round
    ;; Loop the integral part
1:  ;; CC += A * 2^n; n >= 0
    add CC0,A0 $ adc CC1,A1 $ adc CC2,A2 $ adc CC3,A3
    lsl A0 $ rol A1 $ rol A2 $ rol A3
    ;; Carry = n-th bit of B; n >= 0
    ;; Loop the fractional part
    ;; B2/B3 is 0 now, use as guard bits for rounding
    ;; Restore multiplicand
4:  ;; CC += A:Guard * 2^n; n < 0
    add B3,B2 $ adc CC0,A0 $ adc CC1,A1 $ adc CC2,A2 $ adc CC3,A3
    lsr A3 $ ror A2 $ ror A1 $ ror A0 $ ror B2
    ;; Carry = n-th bit of B; n < 0
    ;; Save guard bits and set carry for rounding
    ;; Move result into place
#endif  /* L_mulusa3 */
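
;; Without a hardware multiplier the 16.16 product is accumulated bit by
;; bit: integral bits of B add a left-shifted A, fractional bits add a
;; right-shifted A whose bits falling off the end are caught in guard
;; bytes.  A C sketch of that loop; it keeps all guard bits, whereas the
;; asm keeps one guard byte, hence the +/-1 LSB bound quoted above.
;;
;;    #include <stdint.h>
;;
;;    static uint32_t mulusa_loop_model (uint32_t a, uint32_t b)
;;    {
;;      uint64_t acc    = 0;
;;      uint64_t addend = (uint64_t) a << 16;  /* A with 16 guard bits below the point */
;;
;;      for (int n = 0; n < 16; n++)           /* integral bits of B: weight 2^n    */
;;        if (b & (1ul << (16 + n)))
;;          acc += addend << n;
;;
;;      for (int n = 1; n <= 16; n++)          /* fractional bits of B: weight 2^-n */
;;        if (b & (1ul << (16 - n)))
;;          acc += addend >> n;
;;
;;      return (uint32_t) (acc >> 16);         /* drop the guard bits (T = 0 case)  */
;;    }
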
#endif /* __AVR_HAVE_MUL__ */
/***********************************************************
    Fixed unsigned saturated Multiplication 8.8 x 8.8
***********************************************************/

#define SS __tmp_reg__

#if defined (L_usmuluha3)
#ifdef __AVR_HAVE_MUL__
#endif /* HAVE MUL */
    ;; Round, target is in C1..C2
    ;; Move result into place
#endif  /* L_usmuluha3 */

/***********************************************************
    Fixed signed saturated Multiplication s8.7 x s8.7
***********************************************************/

#if defined (L_ssmulha3)
#ifdef __AVR_HAVE_MUL__
#endif /* HAVE MUL */
    ;; Adjust decimal point
    ;; The 9 MSBs must be the same
    ;; Move result into place
    ;; C3 >= 0 --> 0x7fff
    ;; Load min / max value:
    ;; SS = -1 --> 0x8000
#endif  /* L_ssmulha3 */
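
;; C sketches of the two saturated 8.8-class multiplies above: the
;; unsigned one clamps to 0xffff, the signed s8.7 one shifts to adjust the
;; decimal point and clamps to 0x7fff / 0x8000 (the range check behind the
;; "9 MSBs must be the same" comment).  Round-half-up models, arithmetic
;; right shift assumed; the exact rounding of the asm may differ.
;;
;;    #include <stdint.h>
;;
;;    static uint16_t usmuluha_model (uint16_t a, uint16_t b)     /* 8.8, unsigned */
;;    {
;;      uint32_t p = ((uint32_t) a * b + 0x80) >> 8;
;;      return p > 0xffff ? 0xffff : (uint16_t) p;
;;    }
;;
;;    static int16_t ssmulha_model (int16_t a, int16_t b)         /* s8.7, signed */
;;    {
;;      int32_t p = ((int32_t) a * b + (1 << 6)) >> 7;            /* adjust decimal point */
;;      if (p > INT16_MAX) return INT16_MAX;                      /* 0x7fff */
;;      if (p < INT16_MIN) return INT16_MIN;                      /* 0x8000 */
;;      return (int16_t) p;
;;    }
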
/***********************************************************
    Fixed unsigned saturated Multiplication 16.16 x 16.16
***********************************************************/

#define SS __tmp_reg__

#if defined (L_usmulusa3)
;; R22[4] = R22[4] *{usat} R18[4]
;; Ordinary ABI function
    ;; Round, target is in C2..C5
    ;; Move result into place
#endif  /* L_usmulusa3 */

/***********************************************************
    Fixed signed saturated Multiplication s16.15 x s16.15
***********************************************************/

#if defined (L_ssmulsa3)
;; R22[4] = R22[4] *{ssat} R18[4]
;; Ordinary ABI function
    ;; Adjust decimal point
    ;; The 17 MSBs must be the same
    ;; Move result into place
    ;; C7 < 0  --> 0x80000000
    ;; C7 >= 0 --> 0x7fffffff
    ;; Load min / max value:
    ;; SS = -1 --> 0x80000000
    ;; SS = 0  --> 0x7fffffff
#endif  /* L_ssmulsa3 */
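
;; The 16.16-wide saturated multiplies follow the same pattern with 64-bit
;; intermediates.  A C sketch of the signed s16.15 case, clamping to the
;; values listed above (round-half-up model, arithmetic right shift of a
;; negative int64 assumed):
;;
;;    #include <stdint.h>
;;
;;    static int32_t ssmulsa_model (int32_t a, int32_t b)     /* s16.15, signed */
;;    {
;;      int64_t p = ((int64_t) a * b + (1 << 14)) >> 15;      /* adjust decimal point */
;;      if (p > INT32_MAX) return INT32_MAX;                  /* 0x7fffffff */
;;      if (p < INT32_MIN) return INT32_MIN;                  /* 0x80000000 */
;;      return (int32_t) p;
;;    }
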
/*******************************************************
    Fractional Division 8 / 8
*******************************************************/

#define r_divd  r25     /* dividend */
#define r_quo   r24     /* quotient */
#define r_div   r22     /* divisor */
#define r_sign  __tmp_reg__

#if defined (L_divqq3)
    sbrc    r_sign, 7           ; negate result if needed
#endif  /* L_divqq3 */

#if defined (L_udivuqq3)
    ;; Result is out of [0, 1)  ==>  Return 1 - eps.
#endif  /* L_udivuqq3 */

#if defined (L_divqq_helper)
    clr     r_quo               ; clear quotient
    inc     __zero_reg__        ; init loop counter, used per shift
    lsl     r_divd              ; shift dividend
    brcs    0f                  ; dividend overflow
    cp      r_divd,r_div        ; compare dividend & divisor
    brcc    0f                  ; dividend >= divisor
    rol     r_quo               ; shift quotient (with CARRY)
    sub     r_divd,r_div        ; restore dividend
    lsl     r_quo               ; shift quotient (without CARRY)
    lsl     __zero_reg__        ; shift loop-counter bit
    com     r_quo               ; complement result
                                ; because C flag was complemented in loop
#endif  /* L_divqq_helper */
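
;; The helper above is a shift-and-subtract division that produces one
;; quotient bit per iteration, MSB first; the "dividend overflow" branch
;; (carry set after the shift) also counts as "dividend >= divisor".  A C
;; sketch of the loop for the unsigned case; the asm additionally runs the
;; loop counter through __zero_reg__ and builds the quotient complemented
;; (hence the final com), details omitted here.
;;
;;    #include <stdint.h>
;;
;;    /* floor (dividend * 2^8 / divisor), valid for dividend < divisor.  */
;;    static uint8_t udivuqq_model (uint8_t dividend, uint8_t divisor)
;;    {
;;      uint8_t quo = 0;
;;      for (int i = 0; i < 8; i++)
;;        {
;;          unsigned d = (unsigned) dividend << 1;   /* shift dividend            */
;;          quo = (uint8_t) (quo << 1);
;;          if (d >= divisor)                        /* covers the carry-out case */
;;            {
;;              d -= divisor;
;;              quo |= 1;
;;            }
;;          dividend = (uint8_t) d;
;;        }
;;      return quo;
;;    }
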
/*******************************************************
    Fractional Division 16 / 16
*******************************************************/

#define r_divdL 26      /* dividend Low */
#define r_divdH 27      /* dividend High */
#define r_quoL  24      /* quotient Low */
#define r_quoH  25      /* quotient High */
#define r_divL  22      /* divisor Low */
#define r_divH  23      /* divisor High */

#if defined (L_divhq3)
    breq    __divhq3_minus1     ; if equal return -1
    ;; negate result if needed
#endif  /* defined (L_divhq3) */
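
;; Around the unsigned loop (next module) the signed Q.15 division records
;; the result sign, works on magnitudes, and special-cases equal
;; magnitudes: |a/b| = 1 is not representable and, per the comment above,
;; the routine returns -1 (0x8000) in that case.  A C sketch of that
;; wrapper, using plain C division in place of the loop; behaviour for
;; |a| > |b| or b = 0 is not modelled.
;;
;;    #include <stdint.h>
;;
;;    static int16_t divhq_model (int16_t a, int16_t b)          /* Q.15 / Q.15 */
;;    {
;;      int      neg = (a < 0) != (b < 0);
;;      uint16_t ua  = (uint16_t) (a < 0 ? -a : a);
;;      uint16_t ub  = (uint16_t) (b < 0 ? -b : b);
;;      if (ua == ub)
;;        return INT16_MIN;                                      /* return -1 (0x8000) */
;;      uint16_t q = (uint16_t) (((uint32_t) ua << 15) / ub);    /* needs ua < ub */
;;      return neg ? (int16_t) -q : (int16_t) q;
;;    }
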
#if defined (L_udivuhq3)
    sub     r_quoH,r_quoH       ; clear quotient and carry

DEFUN __udivuha3_common
    clr     r_quoL              ; clear quotient
    ldi     r_cnt,16            ; init loop counter
    rol     r_divdL             ; shift dividend (with CARRY)
    brcs    __udivuhq3_ep       ; dividend overflow
    cp      r_divdL,r_divL      ; compare dividend & divisor
    brcc    __udivuhq3_ep       ; dividend >= divisor
    rol     r_quoL              ; shift quotient (with CARRY)
    rjmp    __udivuhq3_cont
    sub     r_divdL,r_divL      ; restore dividend
    lsl     r_quoL              ; shift quotient (without CARRY)
    rol     r_quoH              ; shift quotient
    dec     r_cnt               ; decrement loop counter
    brne    __udivuhq3_loop
    com     r_quoL              ; complement result
    com     r_quoH              ; because C flag was complemented in loop
ENDF __udivuha3_common
#endif  /* defined (L_udivuhq3) */
/*******************************************************
    Fixed Division 8.8 / 8.8
*******************************************************/

#if defined (L_divha3)
    lsr     r_quoH              ; adjust to 7 fractional bits
    sbrs    r0, 7               ; negate result if needed
#endif  /* defined (L_divha3) */

#if defined (L_udivuha3)
    mov     r_divdL, r_divdH
    lsl     r_quoH              ; shift quotient into carry
    XJMP    __udivuha3_common   ; same as fractional after rearrange
#endif  /* defined (L_udivuha3) */
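
;; An accum division reduces to a fractional one after pre-shifting the
;; dividend, which is what "same as fractional after rearrange" refers to;
;; the signed s8.7 variant additionally shifts the quotient right once
;; ("adjust to 7 fractional bits") and fixes the sign.  A C sketch of the
;; unsigned 8.8 case; the 16.16 routines below are analogous with a 64-bit
;; intermediate.
;;
;;    #include <stdint.h>
;;
;;    static uint16_t udivuha_model (uint16_t a, uint16_t b)   /* unsigned 8.8 / 8.8 */
;;    {
;;      /* (a / 2^8) / (b / 2^8) == (a << 8) / b, re-read as 8.8.  Quotients
;;         >= 256.0 overflow the format and are not modelled here.  */
;;      return (uint16_t) (((uint32_t) a << 8) / b);
;;    }
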
/*******************************************************
    Fixed Division 16.16 / 16.16
*******************************************************/

#define r_arg1L  24     /* arg1 gets passed already in place */
#define r_divdL  26     /* dividend Low */
#define r_divdHH 31     /* dividend High */
#define r_quoL   22     /* quotient Low */
#define r_quoHH  25     /* quotient High */
#define r_divL   18     /* divisor Low */
#define r_divHH  21     /* divisor High */
#define r_cnt    __zero_reg__  /* loop count (0 after the loop!) */

#if defined (L_divsa3)
    lsr     r_quoHH             ; adjust to 15 fractional bits
    sbrs    r0, 7               ; negate result if needed
#endif  /* defined (L_divsa3) */

#if defined (L_udivusa3)
    ldi     r_divdHL, 32        ; init loop counter
    wmov    r_quoL, r_divdHL
    lsl     r_quoHL             ; shift quotient into carry
    rol     r_divdL             ; shift dividend (with CARRY)
    brcs    __udivusa3_ep       ; dividend overflow
    cp      r_divdL,r_divL      ; compare dividend & divisor
    cpc     r_divdHL,r_divHL
    cpc     r_divdHH,r_divHH
    brcc    __udivusa3_ep       ; dividend >= divisor
    rol     r_quoL              ; shift quotient (with CARRY)
    rjmp    __udivusa3_cont
    sub     r_divdL,r_divL      ; restore dividend
    sbc     r_divdHL,r_divHL
    sbc     r_divdHH,r_divHH
    lsl     r_quoL              ; shift quotient (without CARRY)
    rol     r_quoH              ; shift quotient
    dec     r_cnt               ; decrement loop counter
    brne    __udivusa3_loop
    com     r_quoL              ; complement result
    com     r_quoH              ; because C flag was complemented in loop
#endif  /* defined (L_udivusa3) */
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Saturation, 1 Byte
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; First Argument and Return Register
#if defined (L_ssabs_1)
#endif /* L_ssabs_1 */
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Saturation, 2 Bytes
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; First Argument and Return Register
#if defined (L_ssneg_2)
#endif /* L_ssneg_2 */

#if defined (L_ssabs_2)
#endif /* L_ssabs_2 */

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Saturation, 4 Bytes
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; First Argument and Return Register
#if defined (L_ssneg_4)
#endif /* L_ssneg_4 */

#if defined (L_ssabs_4)
#endif /* L_ssabs_4 */
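
;; The saturated negate / absolute-value helpers only need to special-case
;; the most negative value, whose negation is not representable.  A C
;; sketch for the 2-byte case; the 1-, 4- and 8-byte versions are the same
;; idea at other widths.
;;
;;    #include <stdint.h>
;;
;;    static int16_t ssneg16_model (int16_t a)
;;    {
;;      return a == INT16_MIN ? INT16_MAX : (int16_t) -a;   /* -(-1.0) saturates */
;;    }
;;
;;    static int16_t ssabs16_model (int16_t a)
;;    {
;;      return a < 0 ? ssneg16_model (a) : a;
;;    }
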
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Saturation, 8 Bytes
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; First Argument and Return Register
#if defined (L_clr_8)
    ;; Clear Carry and all Bytes
    ;; Clear Carry and set Z
    ;; Propagate Carry to all Bytes, Carry unaltered
#endif /* L_clr_8 */

#if defined (L_ssneg_8)
#endif /* L_ssneg_8 */

#if defined (L_ssabs_8)
#endif /* L_ssabs_8 */

#if defined (L_usadd_8)
0:  ;; A[] = 0xffffffff
#endif /* L_usadd_8 */

#if defined (L_ussub_8)
#endif /* L_ussub_8 */

#if defined (L_ssadd_8)
    ;; A = (B >= 0) ? INT64_MAX : INT64_MIN
#endif /* L_ssadd_8 */

#if defined (L_sssub_8)
    ;; A = (B < 0) ? INT64_MAX : INT64_MIN
#endif /* L_sssub_8 */
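
;; C sketches of the 8-byte saturated additions above: unsigned overflow
;; clamps to all-ones, signed overflow clamps to INT64_MAX or INT64_MIN
;; depending on the sign of the addend B, as the comments state.
;; (__builtin_add_overflow is a GCC/Clang builtin.)
;;
;;    #include <stdint.h>
;;
;;    static uint64_t usadd8_model (uint64_t a, uint64_t b)
;;    {
;;      uint64_t s = a + b;
;;      return s < a ? UINT64_MAX : s;              /* A[] = 0xff... on overflow */
;;    }
;;
;;    static int64_t ssadd8_model (int64_t a, int64_t b)
;;    {
;;      int64_t s;
;;      if (__builtin_add_overflow (a, b, &s))
;;        return b >= 0 ? INT64_MAX : INT64_MIN;    /* matches the comment above */
;;      return s;
;;    }
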
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; R25 = 1 << (R24 & 7)
;; CC = 1 << (AA & 7)
#endif /* L_mask1 */

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; The rounding point.  Any bits smaller than
;; 2^{-RP} will be cleared.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; R24 = round (R22, R24)
;; Clobbers: R22, __tmp_reg__
    subi    RP, __QQ_FBIT__ - 1
    ;; R25 = 1 << RP  (Total offset is FBIT-1 - RP)
    ;; Add-Saturate 2^{-RP-1}
0:  ;; Mask out bits beyond RP
9:  mov     C1, __tmp_reg__
#endif  /* L_roundqq3 */

;; R24 = round (R22, R24)
;; Clobbers: R22, __tmp_reg__
    subi    RP, __UQQ_FBIT__ - 1
    ;; R25 = 1 << RP  (Total offset is FBIT-1 - RP)
    ;; Add-Saturate 2^{-RP-1}
0:  ;; Mask out bits beyond RP
9:  mov     C1, __tmp_reg__
#endif  /* L_rounduqq3 */
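
;; Rounding at point RP means: add 2^{-RP-1} with saturation, then clear
;; every bit below 2^{-RP}.  A C sketch for the unsigned 1-byte case
;; (UQQ, FBIT = 8), assuming 0 < RP < 8 and that the saturated sum is the
;; all-ones pattern before masking, matching the add-saturate-then-mask
;; sequence above.
;;
;;    #include <stdint.h>
;;
;;    static uint8_t rounduqq_model (uint8_t x, unsigned rp)    /* 0 < rp < 8 */
;;    {
;;      unsigned step = 1u << (8 - rp);          /* weight of 2^{-RP}   */
;;      unsigned sum  = x + (step >> 1);         /* add 2^{-RP-1} ...   */
;;      if (sum > 0xff)
;;        sum = 0xff;                            /* ... with saturation */
;;      return (uint8_t) (sum & ~(step - 1));    /* clear bits below RP */
;;    }
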
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Rounding, 2 Bytes
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; [ R25:R24 = 1 << (R24 & 15)
;;   R23:R22 += 1 << (R24 & 15) ]
;; SREG is set according to the addition
    ;; R25 = 1 << (R24 & 7)
    ;; Swap C0 and C1 if RP.3 was set
    ;; Finally, add the power-of-two:  A[] += C[]
#endif  /* L_addmask_2 */

;; R25:R24 = round (R23:R22, R24)
;; Clobbers: R23, R22
    subi    RP, __HQ_FBIT__ - __HA_FBIT__
    subi    RP, __HA_FBIT__ - 1
    ;; [ R25:R24 = 1 << (FBIT-1 - RP)
    ;;   R23:R22 += 1 << (FBIT-1 - RP) ]
    XJMP    __round_s2_const
#endif  /* L_round_s2 */

;; R25:R24 = round (R23:R22, R24)
;; Clobbers: R23, R22
    subi    RP, __UHQ_FBIT__ - __UHA_FBIT__
    subi    RP, __UHA_FBIT__ - 1
    ;; [ R25:R24 = 1 << (FBIT-1 - RP)
    ;;   R23:R22 += 1 << (FBIT-1 - RP) ]
    XJMP    __round_u2_const
#endif  /* L_round_u2 */

#ifdef L_round_2_const

;; Helpers for 2 byte wide rounding

DEFUN __round_s2_const
    ;; FALLTHRU (Barrier)
ENDF __round_s2_const

DEFUN __round_u2_const
    ;; Saturation is performed now.
    ;; Currently, we have C[] = 2^{-RP-1}
    ;; Clear the bits beyond the rounding point.
ENDF __round_u2_const

#endif /* L_round_2_const */
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Rounding, 4 Bytes
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; [ R25:R22 = 1 << (R24 & 31)
;;   R21:R18 += 1 << (R24 & 31) ]
;; SREG is set according to the addition
    ;; R25 = 1 << (R24 & 7)
    ;; Swap C2 with C3 if RP.3 is not set
    ;; Swap C3:C2 with C1:C0 if RP.4 is not set
    and C0, C2 $ eor C2, C0
    and C1, C3 $ eor C3, C1
    ;; Finally, add the power-of-two:  A[] += C[]
#endif  /* L_addmask_4 */
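
;; __addmask_4 builds the 32-bit power of two 1 << (RP & 31) without a
;; variable 32-bit shift: it forms the one-byte mask 1 << (RP & 7) and then
;; routes that byte into position using bits 3 and 4 of RP via conditional
;; byte swaps (the and/eor pairs above).  A C sketch of the idea; the asm
;; stages the swaps in the opposite direction, but the resulting mask is
;; the same.
;;
;;    #include <stdint.h>
;;
;;    static uint32_t mask32_model (uint8_t rp)
;;    {
;;      uint8_t b[4] = { (uint8_t) (1u << (rp & 7)), 0, 0, 0 };
;;
;;      if (rp & 0x08)                      /* bit 3: move into the odd byte   */
;;        { uint8_t t = b[0];  b[0] = b[1];  b[1] = t; }
;;
;;      if (rp & 0x10)                      /* bit 4: move into the upper pair */
;;        {
;;          uint8_t t0 = b[0], t1 = b[1];
;;          b[0] = b[2];  b[1] = b[3];
;;          b[2] = t0;    b[3] = t1;
;;        }
;;
;;      return (uint32_t) b[0]
;;           | ((uint32_t) b[1] << 8)
;;           | ((uint32_t) b[2] << 16)
;;           | ((uint32_t) b[3] << 24);
;;    }
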
;; R25:R22 = round (R21:R18, R24)
;; Clobbers: R18...R21
    subi    RP, __SQ_FBIT__ - __SA_FBIT__
    subi    RP, __SA_FBIT__ - 1
    ;; [ R25:R22 = 1 << (FBIT-1 - RP)
    ;;   R21:R18 += 1 << (FBIT-1 - RP) ]
    XJMP    __round_s4_const
#endif  /* L_round_s4 */

;; R25:R22 = round (R21:R18, R24)
;; Clobbers: R18...R21
    subi    RP, __USQ_FBIT__ - __USA_FBIT__
    subi    RP, __USA_FBIT__ - 1
    ;; [ R25:R22 = 1 << (FBIT-1 - RP)
    ;;   R21:R18 += 1 << (FBIT-1 - RP) ]
    XJMP    __round_u4_const
#endif  /* L_round_u4 */

#ifdef L_round_4_const

;; Helpers for 4 byte wide rounding

DEFUN __round_s4_const
    ;; FALLTHRU (Barrier)
ENDF __round_s4_const

DEFUN __round_u4_const
    ;; Saturation is performed now.
    ;; Currently, we have C[] = 2^{-RP-1}
    ;; Clear the bits beyond the rounding point.
ENDF __round_u4_const

#endif /* L_round_4_const */
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Rounding, 8 Bytes
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; R25:R18 = round (R25:R18, R16)
    ldi     FBITm1, __DQ_FBIT__ - 1
#endif  /* L_rounddq3 */

;; R25:R18 = round (R25:R18, R16)
    ldi     FBITm1, __UDQ_FBIT__ - 1
#endif  /* L_roundudq3 */

;; R25:R18 = round (R25:R18, R16)
    ldi     FBITm1, __DA_FBIT__ - 1
#endif  /* L_roundda3 */

;; R25:R18 = round (R25:R18, R16)
    ldi     FBITm1, __UDA_FBIT__ - 1
#endif  /* L_rounduda3 */

;; R25:R18 = round (R25:R18, R16)
    ldi     FBITm1, __TA_FBIT__ - 1
#endif  /* L_roundta3 */

;; R25:R18 = round (R25:R18, R16)
    ldi     FBITm1, __UTA_FBIT__ - 1
#endif  /* L_rounduta3 */

    ;; Compute log2 of addend from rounding point
    ;; Move input to work register A[]
    ;; C[] = 1 << (FBIT-1 - RP)
    ;; Signed overflow: A[] = 0x7f...
    ;; Unsigned overflow: A[] = 0xff...
    ;; Clear the bits beyond the rounding point.
#endif  /* L_round_x8 */
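
;; The six entry points above only differ in the FBIT constant they load
;; before falling into the common handler, which adds 2^{-RP-1} =
;; 1 << (FBIT-1 - RP), saturates on overflow (0x7f... signed, 0xff...
;; unsigned) and clears the bits beyond the rounding point.  A C sketch of
;; that handler, assuming 0 < RP <= FBIT-1 and that the saturation value
;; is masked as well.
;;
;;    #include <stdint.h>
;;
;;    static uint64_t round_x8_model (uint64_t a, unsigned rp,
;;                                    unsigned fbitm1, int is_signed)
;;    {
;;      uint64_t half = 1ull << (fbitm1 - rp);   /* 2^{-RP-1}             */
;;      uint64_t mask = ~(2 * half - 1);         /* clears bits beyond RP */
;;      uint64_t sum  = a + half;
;;
;;      if (!is_signed && sum < a)               /* unsigned overflow     */
;;        return UINT64_MAX & mask;              /* A[] = 0xff...         */
;;      if (is_signed && (int64_t) a >= 0 && (int64_t) sum < 0)
;;        return (uint64_t) INT64_MAX & mask;    /* signed: A[] = 0x7f... */
;;
;;      return sum & mask;
;;    }
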
;; Supply implementations / symbols for the bit-banging functions
;; __builtin_avr_bitsfx and __builtin_avr_fxbits

#endif /* if not __AVR_TINY__ */