apps/codecs/demac/libdemac/udiv32_arm.S

   1 /***************************************************************************
   2  *             __________               __   ___.
   3  *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
   4  *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
   5  *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
   6  *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
   7  *                     \/            \/     \/    \/            \/
   8  * $Id$
   9  *
  10  * Copyright (C) 2008 by Jens Arnold
  11  * Copyright (C) 2009 by Andrew Mahone
  12  *
  13  * Optimised unsigned integer division for ARMv4
  14  *
  15  * Based on: libgcc routines for ARM cpu, additional algorithms from ARM System
  16  *           Developer's Guide
  17  * Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk)
  18  * Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005
  19  * Free Software Foundation, Inc.
  20  *
  21  * This program is free software; you can redistribute it and/or
  22  * modify it under the terms of the GNU General Public License
  23  * as published by the Free Software Foundation; either version 2
  24  * of the License, or (at your option) any later version.
  25  *
  26  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
  27  * KIND, either express or implied.
  28  *
  29  ****************************************************************************/
  30
  31 #include "config.h"
  32 /* On targets with codec iram, a header file will be generated after an initial
  33    link of the APE codec, stating the amount of IRAM remaining for use by the
  34    reciprocal lookup table. */
  35 #if !defined(APE_PRE) && defined(USE_IRAM) && ARM_ARCH < 5
  36 #include "apps/codecs/ape_free_iram.h"
  37 #endif
  38
  39 /* Codecs should not normally include config.h directly (as done above), but
  40  * we need to check macros such as USE_IRAM, and codecs.h would confuse the assembler. */
  41
  42 #ifdef USE_IRAM
     /* When USE_IRAM is defined, enable the reciprocal-table fast path
        (DIV_RECIP) and emit the routine into the .icode section. */
  43 #define DIV_RECIP
  44     .section    .icode,"ax",%progbits
  45 #else
     /* Otherwise the routine goes into the ordinary text section and DIV_RECIP
        stays undefined. */
  46     .text
  47 #endif
  48     .align
     /* Export udiv32_arm and mark the symbol as a function. */
  49     .global udiv32_arm
  50     .type   udiv32_arm,%function
  51
  52 #if ARM_ARCH < 5
  53 /* Adapted from an algorithm given in ARM System Developer's Guide (7.3.1.2)
  54    for dividing a 30-bit value by a 15-bit value, with two operations per
  55    iteration by storing quotient and remainder together and adding the previous
  56    quotient bit during trial subtraction. Modified to work with any dividend
  57    and divisor both less than 1 << 30, and skipping trials by calculating bits
  58    in output. */
     /* Macro arguments (all registers):
          dividend  - unsigned numerator (read only, unless aliased by quotient)
          divisor   - divisor as supplied by the caller; the invocation in this
                      file passes it already negated (rsbs at .L_udiv). It is
                      shifted left in place, so its value is destroyed.
          result    - scratch holding remainder and quotient bits together
          bits      - scratch; ends up as 1 + the total left shift applied to
                      divisor, i.e. the number of quotient bits produced
          curbit    - scratch; 31 - bits, used only to compute the jump offset
          quotient  - receives the low `bits` bits of result
          remainder - receives result >> bits
        Flags are clobbered.
        NOTE(review): the text above says operands must be below 1 << 30, while
        the caller at .L_udiv only ensures that bit 31 of the numerator is clear
        and the macro name says 31 - confirm which bound actually applies. */
  59 .macro ARM_DIV_31_BODY dividend, divisor, result, bits, curbit, quotient, remainder
  60
  61     mov     \bits, #1
  62     /* Shift the divisor left until it aligns with the numerator. If it already
  63        has the high bit set, this is fine, everything inside .rept will be
  64        skipped, and the add before and adcs after will set the one-bit result
  65        to zero. */
         /* Each cmn adds the (negated) divisor to dividend >> n; carry set
            means the divisor can still be shifted left by n, so shift it and
            account for n more result bits. Steps go 16, 8, 4, 2, 1. */
  66     cmn     \divisor, \dividend, lsr #16
  67     movcs   \divisor, \divisor, lsl #16
  68     addcs   \bits, \bits, #16
  69     cmn     \divisor, \dividend, lsr #8
  70     movcs   \divisor, \divisor, lsl #8
  71     addcs   \bits, \bits, #8
  72     cmn     \divisor, \dividend, lsr #4
  73     movcs   \divisor, \divisor, lsl #4
  74     addcs   \bits, \bits, #4
  75     cmn     \divisor, \dividend, lsr #2
  76     movcs   \divisor, \divisor, lsl #2
  77     addcs   \bits, \bits, #2
  78     cmn     \divisor, \dividend, lsr #1
  79     movcs   \divisor, \divisor, lsl #1
  80     addcs   \bits, \bits, #1
         /* First trial step: add the divisor operand to the dividend and undo
            the addition again if it produced no carry. The carry is the
            first quotient bit and is consumed by the next adcs/adc. */
  81     adds    \result, \dividend, \divisor
  82     subcc   \result, \result, \divisor
         /* Jump into the unrolled loop so that only the remaining bits - 1
            steps run. Every step is two instructions (8 bytes), hence lsl #3;
            pc reads two instructions ahead of the add, which is why one nop
            pads the gap before the first step. */
  83     rsb     \curbit, \bits, #31
  84     add     pc, pc, \curbit, lsl #3
  85     nop
  86     .rept   30
         /* result = divisor + (result << 1) + carry: shift in the previous
            quotient bit and try the next subtraction in one instruction. */
  87     adcs    \result, \divisor, \result, lsl #1
  88     /* Fix the remainder portion of the result. This must be done because the
  89        handler for 32-bit numerators needs the remainder. */
  90     subcc   \result, \result, \divisor
  91     .endr
  92     /* Shift remainder/quotient left one, add final quotient bit */
  93     adc     \result, \result, \result
         /* Split the combined word: the upper part is the remainder, the low
            `bits` bits (isolated by the eor) are the quotient. */
  94     mov     \remainder, \result, lsr \bits
  95     eor     \quotient, \result, \remainder, lsl \bits
  96 .endm
  97
  98 #ifndef FREE_IRAM
     /* No spare IRAM reported: recip_max = 2 makes the reciprocal table empty
        (it is emitted with .rept recip_max - 2). */
  99 .set recip_max, 2
 100 #else
 101 /* Each table entry is one word. Since a compare is done against the maximum
 102    entry as an immediate, the maximum entry must be a valid ARM immediate,
 103    which means a byte shifted by an even number of places. */
     /* Start from the largest divisor the free space allows: the table begins
        at divisor 3 and uses FREE_IRAM / 4 words. */
 104 .set recip_max, 2 + FREE_IRAM / 4
     /* Find the bit length of recip_max >> 8 by binary search with shifts of
        16, 8, 4, 2 and 1; recip_mask_shift accumulates the shifts taken. */
 105 .set recip_max_tmp, recip_max >> 8
 106 .set recip_mask_shift, 0
 107 .set tmp_shift, 16
 108 .rept 5
 109     .if recip_max_tmp >> tmp_shift
 110         .set recip_max_tmp, recip_max_tmp >> tmp_shift
 111         .set recip_mask_shift, recip_mask_shift + tmp_shift
 112     .endif
 113     .set tmp_shift, tmp_shift >> 1
 114 .endr
     /* A remaining non-zero value accounts for one more bit. */
 115 .if recip_max_tmp
 116     .set recip_mask_shift, recip_mask_shift + 1
 117 .endif
     /* Round the shift up to an even number, then keep only the top eight
        significant bits of recip_max so it fits an ARM immediate. */
 118 .set recip_mask_shift, (recip_mask_shift + 1) & 62
 119 .set recip_max, recip_max & (255 << recip_mask_shift)
 120 //.set recip_max, 2
 121 #endif
 122
 123 udiv32_arm:
     /* Unsigned 32-bit divide (ARMv4 variant).
        In:  r0 = numerator, r1 = divisor
        Out: r0 = quotient
        r1, r2, r3, ip and the flags are used as scratch. A zero divisor ends
        up at .L_div0. */
 124 #ifdef DIV_RECIP
     /* Divisors 0, 1 and 2 are handled separately at .L_udiv_tiny. */
 125     cmp     r1, #3
 126     bcc     .L_udiv_tiny
     /* Divisors above recip_max have no table entry: use the general path. */
 127     cmp     r1, #recip_max
 128     bhi     .L_udiv
     /* The table starts with the entry for divisor 3 and holds one word per
        divisor, so bias the base by 3 * 4 = 12 and index with r1 * 4. */
 129     adr     r3, .L_udiv_recip_table-12
 130     ldr     r2, [r3, r1, lsl #2]
     /* Keep the numerator in r3; r0 becomes the high word of
        reciprocal * numerator, which is the quotient estimate. */
 131     mov     r3, r0
 132     umull   ip, r0, r2, r0
     /* Multiply the estimate back: if estimate * divisor does not exceed the
        numerator, the estimate is returned as is, otherwise it is reduced by
        one first. */
 133     mul     r2, r0, r1
 134     cmp     r3, r2
 135     bxcs    lr
 136     sub     r0, r0, #1
 137     bx      lr
 138 .L_udiv_tiny:
     /* r1 == 2: halve and return; r1 == 1: return r0 unchanged;
        r1 == 0: neither condition holds, fall through to the div0 branch. */
 139     cmp     r1, #1
 140     movhi   r0, r0, lsr #1
 141     bxcs    lr
 142     b       .L_div0
 143 #endif
 144 .L_udiv:
 145     /* Invert divisor. ARM_DIV_31_BODY uses adc to both subtract the divisor
 146        and add the next bit of the result. The correction code at .L_udiv32
 147        does not need the divisor inverted, but can be modified to work with it,
 148        and this allows the zero divisor test to be done early and without an
 149        explicit comparison. */
 150     rsbs    r1, r1, #0
     /* With DIV_RECIP a zero divisor has already been diverted at
        .L_udiv_tiny, so the check is only assembled without it. */
 151 #ifndef DIV_RECIP
 152     beq .L_div0
 153 #endif
     /* Set N from bit 31 of the numerator. */
 154     tst     r0, r0
 155     /* High bit must be unset, otherwise shift numerator right, calculate,
 156        and correct results. As this case is very uncommon we want to avoid
 157        any other delays on the main path in handling it, so the long divide
 158        calls the short divide as a function. */
 159     bmi     .L_udiv32
 160 .L_udiv31:
     /* Also entered via bl from .L_udiv32. Expects r0 = numerator with bit 31
        clear and r1 = negated divisor; leaves the quotient in r0 and the
        remainder in r1, using r2, r3 and ip as scratch. */
 161     ARM_DIV_31_BODY r0, r1, r2, r3, ip, r0, r1
 162     bx      lr
 163 .L_udiv32:
 164     /* Push the original numerator, the negated divisor and lr; they are needed
 165        to correct the result. sp is written back so nothing lives below it. */
 166     stmdb   sp!, { r0, r1, lr }
 167     /* Halve the numerator so bit 31 is clear, then run the 31-bit divide as a
 168        subroutine: r0 = quotient, r1 = remainder of (numerator >> 1). */
 169     mov     r0, r0, lsr #1
 170     bl      .L_udiv31
 171     ldmia   sp!, { r2, r3, lr }
 172     /* Move the low bit of the original numerator to the carry bit */
 173     movs    r2, r2, lsr #1
 174     /* Shift the remainder left one and add in the carry bit */
 175     adc     r1, r1, r1
 176     /* Subtract the original divisor from the remainder (r3 holds its negation,
 177        so this is an add), setting carry if the result is non-negative */
 178     adds    r1, r1, r3
 179     /* Shift quotient left one and add carry bit */
 180     adc     r0, r0, r0
 181     bx      lr
 182 .L_div0:
     /* Division by zero: hand over to __div0 with r0 = 0.
        NOTE(review): nothing follows the call/jump below, so this relies on
        __div0 not returning here - confirm against its definition. */
 183     /* __div0 expects the calling address on the top of the stack */
 184     stmdb sp!, { lr }
 185     mov     r0, #0
 186 #if defined(__ARM_EABI__) || !defined(USE_IRAM)
 187     bl      __div0
 188 #else
     /* pc reads as this instruction + 8, so [pc, #-4] is the literal word
        right below: load the absolute address of __div0 into pc. This is a
        jump, lr is left untouched. */
 189     ldr     pc, [pc, #-4]
 190     .word   __div0
 191 #endif
 192 #ifdef DIV_RECIP
     /* Reciprocal table, generated at assembly time: one word for every
        divisor from 3 up to recip_max (recip_max - 2 entries, none when
        recip_max is 2). */
 193 .L_udiv_recip_table:
 194     .set div, 3
 195     .rept recip_max - 2
         /* (div - 1) & div is non-zero exactly when div is not a power of
            two. */
 196         .if (div - 1) & div
             /* Long division of 1 << 32 by div, done in steps: start from
                (1 << 30) / div and develop two more quotient bits by hand,
                carrying the remainder r along. Presumably split this way to
                keep the intermediate values small - the reason is not stated
                here. */
 197             .set q, 0x40000000 / div
 198             .set r, (0x40000000 - (q * div))<<1
 199             .set q, q << 1
 200             .if r >= div
 201                 .set q, q + 1
 202                 .set r, r - div
 203             .endif
 204             .set r, r << 1
 205             .set q, q << 1
 206             .if r >= div
 207                 .set q, q + 1
 208                 .set r, r - div
 209             .endif
             /* Store the quotient plus one; the lookup code in udiv32_arm
                corrects an estimate that comes out one too high. */
 210             .set q, q + 1
 211         .else
             /* Power of two: the division is exact, no rounding up. */
 212             .set q, 0x40000000 / div * 4
 213         .endif
 214         .word q
 215         .set div, div+1
 216     .endr
 217 #endif
     /* Symbol size covers the code and the table above.
        NOTE(review): another .size udiv32_arm directive follows the closing
        #endif of the ARM_ARCH conditional, so this one looks redundant for
        ARM_ARCH < 5 builds - confirm before removing. */
 218     .size udiv32_arm, . - udiv32_arm
 219
 220 #else
     /* ARMV5_UDIV32_BODY: unsigned 32-bit divide for ARMv5 and later, built on
        clz, the byte table .L_udiv_est_table and multiply-based refinement of
        the reciprocal estimate. Every path ends in bx lr except the branch to
        div0label.
        Arguments (registers unless noted):
          numerator - dividend; modified on some paths
          divisor   - divisor; reused as scratch
          quotient  - receives the result (the invocation in this file passes
                      the same register as numerator)
          bits, inv, neg - scratch
          div0label - label branched to when the divisor is zero; may be left
                      empty, in which case no branch is emitted
        Flags are clobbered. */
 221 .macro ARMV5_UDIV32_BODY numerator, divisor, quotient, bits, inv, neg, div0label
 222     cmp     \numerator, \divisor
 223     clz     \bits, \divisor
         /* numerator < divisor: the quotient is zero. */
 224     bcc     30f
         /* Normalise the divisor so that its highest set bit is bit 31 (the
            result is 0 when the divisor is 0). */
 225     mov     \inv, \divisor, lsl \bits
         /* Table pointer: pc plus the top seven bits of the normalised divisor
            (64..127 when bit 31 is set). The load below subtracts the 64 again
            and adds the distance to the table. */
 226     add     \neg, pc, \inv, lsr #25
         /* Equal: the divisor is a power of two. Lower: only possible when
            inv is 0, i.e. the divisor is zero. Both cases go to 20. */
 227     cmp     \inv, #1<<31
 228     ldrhib  \inv, [\neg, #.L_udiv_est_table-.-64]
 229     bls     20f
         /* bits - 7 goes negative when the divisor has fewer than seven
            leading zeros; that case is handled at 10. Otherwise scale the
            table estimate into \divisor and refine it. neg = -divisor. */
 230     subs    \bits, \bits, #7
 231     rsb     \neg, \divisor, #0
 232     movpl   \divisor, \inv, lsl \bits
 233     bmi     10f
         /* Refinement of the reciprocal estimate held in \divisor, using
            products with -divisor (presumably Newton-Raphson style steps -
            the derivation is not given here). */
 234     mul     \inv, \divisor, \neg
 235     smlawt  \divisor, \divisor, \inv, \divisor
 236     mul     \inv, \divisor, \neg
 237     /* This will save a cycle on ARMv6, but requires that the numerator sign
 238        bit is not set (that of inv is guaranteed unset). The branch should
 239        predict very well, making it typically 1 cycle, and thus both the branch
 240        and test fill delay cycles for the multiplies. Based on logging of
 241        numerator sizes in the APE codec, the branch is taken about 1/10^7 of
 242        the time. */
 243 #if ARM_ARCH >= 6
 244     tst     \numerator, \numerator
 245     smmla   \divisor, \divisor, \inv, \divisor
 246     bmi     40f
 247     smmul   \inv, \numerator, \divisor
 248 #else
 249     mov     \bits, #0
 250     smlal   \bits, \divisor, \inv, \divisor
 251     umull   \bits, \inv, \numerator, \divisor
 252 #endif
         /* \inv is the quotient estimate. Form a remainder-like value from
            numerator - divisor - estimate * divisor and bump the estimate by
            one and/or two depending on the flags of the final cmn. */
 253     add     \numerator, \numerator, \neg
 254     mla     \divisor, \inv, \neg, \numerator
 255     mov     \quotient, \inv
 256     cmn     \divisor, \neg
 257     addcc   \quotient, \quotient, #1
 258     addpl   \quotient, \quotient, #2
 259     bx      lr
 260 10:
         /* Large divisor (fewer than seven leading zeros): derive the estimate
            by shifting the table byte right instead of left, then correct the
            quotient by 2 and by 1 with two compare-and-subtract steps. */
 261     rsb     \bits, \bits, #0
 262     sub     \inv, \inv, #4
 263     mov     \divisor, \inv, lsr \bits
 264     umull   \bits, \inv, \numerator, \divisor
 265     mla     \divisor, \inv, \neg, \numerator
 266     mov     \quotient, \inv
 267     cmn     \neg, \divisor, lsr #1
 268     addcs   \divisor, \divisor, \neg, lsl #1
 269     addcs   \quotient, \quotient, #2
 270     cmn     \neg, \divisor
 271     addcs   \quotient, \quotient, #1
 272     bx      lr
 273 20:
         /* Power-of-two divisor, or zero. Turn the leading-zero count into the
            shift amount 31 - clz. This must happen whether or not a div0 label
            was given, otherwise the shift below would use the raw clz value.
            rsb does not touch the flags, so Z from the cmp above is still
            valid: it is clear only for a zero divisor. */
 274     rsb     \bits, \bits, #31
 275 .ifnc "", "\div0label"
 276     bne     \div0label
 277 .endif
 278     mov     \quotient, \numerator, lsr \bits
 279     bx      lr
 280 30:
 281     mov     \quotient, #0
 282     bx      lr
 283 #if ARM_ARCH >= 6
 284 40:
         /* Numerator has its sign bit set, so the signed smmul shortcut is not
            usable: redo the estimate with an unsigned multiply and finish with
            the same correction as on the main path. */
 285     umull   \bits, \inv, \numerator, \divisor
 286     add     \numerator, \numerator, \neg
 287     mla     \divisor, \inv, \neg, \numerator
 288     mov     \quotient, \inv
 289     cmn     \divisor, \neg
 290     addcc   \quotient, \quotient, #1
 291     addpl   \quotient, \quotient, #2
 292     bx      lr
 293 #endif
 294 .endm
 295
 296 udiv32_arm:
     /* Unsigned 32-bit divide (ARMv5+ variant).
        In:  r0 = numerator, r1 = divisor
        Out: r0 = quotient
        r1, r2, r3, ip and the flags are used as scratch. The macro returns
        with bx lr on its own; only a zero divisor reaches .L_div0 below. */
 297     ARMV5_UDIV32_BODY r0, r1, r0, r2, r3, ip, .L_div0
 298 .L_div0:
     /* NOTE(review): the estimate table follows directly after this code, so
        this relies on __div0 not returning here - confirm against its
        definition. */
 299     /* __div0 expects the calling address on the top of the stack */
 300     stmdb sp!, { lr }
 301     mov     r0, #0
 302 #if defined(__ARM_EABI__) || !defined(USE_IRAM)
 303     bl      __div0
 304 #else
     /* pc reads as this instruction + 8, so [pc, #-4] is the literal word
        right below: load the absolute address of __div0 into pc. */
 305     ldr     pc, [pc, #-4]
 306     .word   __div0
 307 #endif
     /* Reciprocal estimate table used by ARMV5_UDIV32_BODY: 64 bytes, indexed
        by the top seven bits of the normalised divisor minus 64. The values
        look like 2^14 / (64 + index) limited to 0xff - presumably generated
        that way; the source of the table is not stated here. */
 308 .L_udiv_est_table:
 309     .byte 0xff, 0xfc, 0xf8, 0xf4, 0xf0, 0xed, 0xea, 0xe6
 310     .byte 0xe3, 0xe0, 0xdd, 0xda, 0xd7, 0xd4, 0xd2, 0xcf
 311     .byte 0xcc, 0xca, 0xc7, 0xc5, 0xc3, 0xc0, 0xbe, 0xbc
 312     .byte 0xba, 0xb8, 0xb6, 0xb4, 0xb2, 0xb0, 0xae, 0xac
 313     .byte 0xaa, 0xa8, 0xa7, 0xa5, 0xa3, 0xa2, 0xa0, 0x9f
 314     .byte 0x9d, 0x9c, 0x9a, 0x99, 0x97, 0x96, 0x94, 0x93
 315     .byte 0x92, 0x90, 0x8f, 0x8e, 0x8d, 0x8c, 0x8a, 0x89
 316     .byte 0x88, 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81
 317 #endif
     /* Size of udiv32_arm for whichever variant was assembled, including the
        data that follows the code. */
 318     .size udiv32_arm, . - udiv32_arm