1 /***************************************************************************
3 * Open \______ \ ____ ____ | | _\_ |__ _______ ___
4 * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
5 * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
6 * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
10 * Copyright (C) 2008 by Jens Arnold
11 * Copyright (C) 2009 by Andrew Mahone
13 * Optimised unsigned integer division for ARMv4
15 * Based on: libgcc routines for ARM cpu, additional algorithms from ARM System
17 * Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk)
18 * Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005
19 * Free Software Foundation, Inc.
21 * This program is free software; you can redistribute it and/or
22 * modify it under the terms of the GNU General Public License
23 * as published by the Free Software Foundation; either version 2
24 * of the License, or (at your option) any later version.
26 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
27 * KIND, either express or implied.
29 ****************************************************************************/
32 /* On targets with codec iram, a header file will be generated after an initial
33 link of the APE codec, stating the amount of IRAM remaining for use by the
34 reciprocal lookup table. */
35 #if !defined(APE_PRE) && defined(USE_IRAM) && ARM_ARCH < 5
36 #include "apps/codecs/ape_free_iram.h"
39 /* Codecs should not normally do this, but we need to check a macro, and
40 * codecs.h would confuse the assembler. */
44 .section .icode,"ax",%progbits
50 .type udiv32_arm,%function
53 /* Adapted from an algorithm given in ARM System Developer's Guide (7.3.1.2)
54 for dividing a 30-bit value by a 15-bit value, with two operations per
55 iteration by storing quotient and remainder together and adding the previous
56 quotient bit during trial subtraction. Modified to work with any dividend
57 and divisor both less than 1 << 30, and skipping trials by calculating bits
/* ARM_DIV_31_BODY: shift-and-subtract unsigned divide core.
   Operands (all registers):
     dividend  - numerator; only read by the steps shown here
     divisor   - shifted left in place while aligning, so it is clobbered
     result    - working register holding remainder and quotient packed
                 together
     bits      - running total of the alignment shifts (16, 8, 4, 2, 1 are
                 added whenever the corresponding test sets carry)
     curbit    - scratch: 31 - bits, used as the index of a computed jump
     quotient  - output: the low "bits" bits of the packed result
     remainder - output: the packed result shifted right by "bits"
   The computed jump adds curbit * 8 to pc, i.e. every unit of curbit skips
   two 4-byte ARM instructions of the adcs/subcc sequence that follows.
   Condition flags are modified throughout (cmn, adds, adcs).
   NOTE(review): the divisor is only ever added (cmn/adds/adcs), so it is
   presumably passed in already negated, as the comment near the invocation
   about inverting the divisor suggests - confirm.
   NOTE(review): bits is accumulated here but never initialised by the
   instructions listed below - confirm where its starting value is set. */
59 .macro ARM_DIV_31_BODY dividend, divisor, result, bits, curbit, quotient, remainder
62 /* Shift the divisor left until it aligns with the numerator. If it already
63 has the high bit set, this is fine, everything inside .rept will be
64 skipped, and the add before and adcs after will set the one-bit result
66 cmn \divisor, \dividend, lsr #16
67 movcs \divisor, \divisor, lsl #16
68 addcs \bits, \bits, #16
69 cmn \divisor, \dividend, lsr #8
70 movcs \divisor, \divisor, lsl #8
71 addcs \bits, \bits, #8
72 cmn \divisor, \dividend, lsr #4
73 movcs \divisor, \divisor, lsl #4
74 addcs \bits, \bits, #4
75 cmn \divisor, \dividend, lsr #2
76 movcs \divisor, \divisor, lsl #2
77 addcs \bits, \bits, #2
78 cmn \divisor, \dividend, lsr #1
79 movcs \divisor, \divisor, lsl #1
80 addcs \bits, \bits, #1
81 adds \result, \dividend, \divisor
82 subcc \result, \result, \divisor
83 rsb \curbit, \bits, #31
84 add pc, pc, \curbit, lsl #3
87 adcs \result, \divisor, \result, lsl #1
88 /* Fix the remainder portion of the result. This must be done because the
89 handler for 32-bit numerators needs the remainder. */
90 subcc \result, \result, \divisor
92 /* Shift remainder/quotient left one, add final quotient bit */
93 adc \result, \result, \result
94 mov \remainder, \result, lsr \bits /* remainder = packed result with the low quotient bits shifted out */
95 eor \quotient, \result, \remainder, lsl \bits /* cancel the remainder part, leaving only the low quotient bits */
/* Assemble-time computation of recip_max and recip_mask_shift.
   recip_max starts as the number of one-word entries that fit in FREE_IRAM
   bytes plus 2, and is finally truncated to eight significant bits placed
   at an even bit position so it can be encoded as an immediate.
   NOTE(review): the steps using tmp_shift look like a halving search for
   the highest set bit of recip_max >> 8; the directives that initialise
   tmp_shift and repeat/close these conditionals are not among the lines
   below - confirm against the full file. */
101 /* Each table entry is one word. Since a compare is done against the maximum
102 entry as an immediate, the maximum entry must be a valid ARM immediate,
103 which means a byte shifted by an even number of places. */
104 .set recip_max, 2 + FREE_IRAM / 4 /* one 4-byte word per entry; the +2 presumably matches the biased table base used by the lookup - confirm */
105 .set recip_max_tmp, recip_max >> 8 /* the part of recip_max above its low byte */
106 .set recip_mask_shift, 0 /* accumulated shift, starts at zero */
109 .if recip_max_tmp >> tmp_shift /* anything left at or above bit tmp_shift? */
110 .set recip_max_tmp, recip_max_tmp >> tmp_shift /* yes: drop those tmp_shift low bits */
111 .set recip_mask_shift, recip_mask_shift + tmp_shift /* and account for them in the shift */
113 .set tmp_shift, tmp_shift >> 1 /* halve the step for the next test */
116 .set recip_mask_shift, recip_mask_shift + 1 /* one extra position */
118 .set recip_mask_shift, (recip_mask_shift + 1) & 62 /* round the shift up to an even value */
119 .set recip_max, recip_max & (255 << recip_mask_shift) /* keep only the eight bits starting at that even shift */
/* Fetch a word from the reciprocal table using r1 as the index. The table
   base is biased by -12 bytes (three words), so r1 = 3 addresses the first
   word of .L_udiv_recip_table. */
129 adr r3, .L_udiv_recip_table-12 /* r3 = table address minus three words */
130 ldr r2, [r3, r1, lsl #2] /* r2 = word at r3 + r1 * 4 */
145 /* Invert divisor. ARM_DIV_31_BODY uses adc to both subtract the divisor
146 and add the next bit of the result. The correction code at .L_udiv32
147 does not need the divisor inverted, but can be modified to work with it,
148 and this allows the zero divisor test to be done early and without an
149 explicit comparison. */
155 /* High bit must be unset, otherwise shift numerator right, calculate,
156 and correct results. As this case is very uncommon we want to avoid
157 any other delays on the main path in handling it, so the long divide
158 calls the short divide as a function. */
/* Operand mapping for this expansion: dividend = r0, divisor = r1,
   result = r2, bits = r3, curbit = ip. The quotient is written back to r0
   and the remainder to r1, replacing the inputs. */
161 ARM_DIV_31_BODY r0, r1, r2, r3, ip, r0, r1
/* Save/restore around the long divide: stmdb stores r0, r1 and lr in the
   three words just below sp without writeback, and the matching ldmdb later
   reloads the same words into r2, r3 and lr, so the saved r0/r1 values end
   up in r2/r3 for the correction steps described by the comments below.
   NOTE(review): sp itself is never moved, so the saved words live below the
   stack pointer; this presumably relies on nothing pushing onto this stack
   between the store and the reload - confirm. */
164 /* store original numerator and divisor, we'll need them to correct the
166 stmdb sp, { r0, r1, lr }
167 /* Call __div0 here if divisor is zero, otherwise it would report the wrong
171 ldmdb sp, { r2, r3, lr }
172 /* Move the low bit of the original numerator to the carry bit */
174 /* Shift the remainder left one and add in the carry bit */
176 /* Subtract the original divisor from the remainder, setting carry if the
177 result is non-negative */
179 /* Shift quotient left one and add carry bit */
183 /* __div0 expects the calling address on the top of the stack */
186 #if defined(__ARM_EABI__) || !defined(USE_IRAM)
/* Assemble-time arithmetic for one reciprocal table entry, with div as the
   divisor the entry is generated for.
   NOTE(review): the first two lines and the third line appear to belong to
   different conditional branches; the surrounding loop and .if/.else
   directives are not among these lines - confirm against the full file. */
197 .set q, 0x40000000 / div /* q = integer quotient of 2^30 by div */
198 .set r, (0x40000000 - (q * div))<<1 /* r = twice the remainder of that division */
212 .set q, 0x40000000 / div * 4 /* (2^30 / div) * 4, assuming left-to-right evaluation of / and * */
218 .size udiv32_arm, . - udiv32_arm
/* ARMV5_UDIV32_BODY: unsigned 32-bit divide built on multiply instructions.
   Operands:
     numerator - dividend register
     divisor   - divisor register; overwritten during the computation
     quotient  - register that receives the result (adjusted at the end by
                 conditional adds of 1 or 2)
     bits      - scratch shift count; also used as the discarded low word of
                 the long multiplies
     inv       - scratch: byte fetched from .L_udiv_est_table, later various
                 products
     neg       - scratch: first a table address, then 0 - divisor
     div0label - optional label; the .ifnc test only assembles its section
                 when this argument is non-empty
   Uses smlawt, smmla, smmul, smlal, umull and mla, and modifies the
   condition flags.
   NOTE(review): the sequence looks like a table-seeded reciprocal estimate
   that is refined by multiplication and then corrected; several
   instructions and local labels are not among the lines below, so treat
   the per-line notes as literal descriptions only. */
221 .macro ARMV5_UDIV32_BODY numerator, divisor, quotient, bits, inv, neg, div0label
222 cmp \numerator, \divisor /* compare the operands; the flags presumably drive the conditional (hi) load below - confirm */
225 mov \inv, \divisor, lsl \bits /* inv = divisor shifted left by bits */
226 add \neg, pc, \inv, lsr #25 /* neg = pc + top seven bits of inv, used as a byte address */
228 ldrhib \inv, [\neg, #.L_udiv_est_table-.-64] /* if hi: load one table byte; the -64 biases the index so 64 selects the first byte */
230 subs \bits, \bits, #7 /* bits -= 7, setting flags for movpl */
231 rsb \neg, \divisor, #0 /* neg = 0 - divisor */
232 movpl \divisor, \inv, lsl \bits /* if the result was not negative: divisor = inv << bits (the divisor register is reused) */
234 mul \inv, \divisor, \neg /* inv = low 32 bits of divisor * neg */
235 smlawt \divisor, \divisor, \inv, \divisor /* divisor += (divisor * top halfword of inv) >> 16 */
236 mul \inv, \divisor, \neg /* inv = low 32 bits of divisor * neg again, with the updated value */
237 /* This will save a cycle on ARMv6, but requires that the numerator sign
238 bit is not set (that of inv is guaranteed unset). The branch should
239 predict very well, making it typically 1 cycle, and thus both the branch
240 and test fill delay cycles for the multiplies. Based on logging of
241 numerator sizes in the APE codec, the branch is taken about 1/10^7 of
244 tst \numerator, \numerator
245 smmla \divisor, \divisor, \inv, \divisor
247 smmul \inv, \numerator, \divisor
250 smlal \bits, \divisor, \inv, \divisor
251 umull \bits, \inv, \numerator, \divisor
253 add \numerator, \numerator, \neg
254 mla \divisor, \inv, \neg, \numerator
257 addcc \quotient, \quotient, #1
258 addpl \quotient, \quotient, #2
263 mov \divisor, \inv, lsr \bits
264 umull \bits, \inv, \numerator, \divisor
265 mla \divisor, \inv, \neg, \numerator
267 cmn \neg, \divisor, lsr #1
268 addcs \divisor, \divisor, \neg, lsl #1
269 addcs \quotient, \quotient, #2
271 addcs \quotient, \quotient, #1
274 .ifnc "", "\div0label"
275 rsb \bits, \bits, #31
278 mov \quotient, \numerator, lsr \bits
285 umull \bits, \inv, \numerator, \divisor
286 add \numerator, \numerator, \neg
287 mla \divisor, \inv, \neg, \numerator
290 addcc \quotient, \quotient, #1
291 addpl \quotient, \quotient, #2
/* Operand mapping for this expansion: numerator = r0, divisor = r1,
   quotient = r0 (the result replaces the numerator), bits = r2, inv = r3,
   neg = ip, div0label = .L_div0. */
297 ARMV5_UDIV32_BODY r0, r1, r0, r2, r3, ip, .L_div0
299 /* __div0 expects the calling address on the top of the stack */
302 #if defined(__ARM_EABI__) || !defined(USE_IRAM)
/* 64 one-byte estimates. Entry i (0..63) equals 0x4000 / (64 + i) rounded
   down and capped at 0xff, i.e. an 8-bit reciprocal estimate for a 7-bit
   value in the range 64..127.
   NOTE(review): presumably this is the data read through the
   .L_udiv_est_table reference in ARMV5_UDIV32_BODY, whose -64 index bias
   matches that range - confirm the label placement. */
309 .byte 0xff, 0xfc, 0xf8, 0xf4, 0xf0, 0xed, 0xea, 0xe6
310 .byte 0xe3, 0xe0, 0xdd, 0xda, 0xd7, 0xd4, 0xd2, 0xcf
311 .byte 0xcc, 0xca, 0xc7, 0xc5, 0xc3, 0xc0, 0xbe, 0xbc
312 .byte 0xba, 0xb8, 0xb6, 0xb4, 0xb2, 0xb0, 0xae, 0xac
313 .byte 0xaa, 0xa8, 0xa7, 0xa5, 0xa3, 0xa2, 0xa0, 0x9f
314 .byte 0x9d, 0x9c, 0x9a, 0x99, 0x97, 0x96, 0x94, 0x93
315 .byte 0x92, 0x90, 0x8f, 0x8e, 0x8d, 0x8c, 0x8a, 0x89
316 .byte 0x88, 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81
318 .size udiv32_arm, . - udiv32_arm