apps/codecs/lib/udiv32_arm.S

   1 /***************************************************************************
   2  *             __________               __   ___.
   3  *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
   4  *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
   5  *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
   6  *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
   7  *                     \/            \/     \/    \/            \/
   8  * $Id$
   9  *
  10  * Copyright (C) 2008 by Jens Arnold
  11  * Copyright (C) 2009 by Andrew Mahone
  12  *
  13  * Optimised unsigned integer division for ARMv4 and ARMv5+
  14  *
  15  * Based on: libgcc routines for ARM cpu, additional algorithms from ARM System
  16  *           Developer's Guide
  17  * Division routines, written by Richard Earnshaw, (rearnsha@armltd.co.uk)
  18  * Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005
  19  * Free Software Foundation, Inc.
  20  *
  21  * This program is free software; you can redistribute it and/or
  22  * modify it under the terms of the GNU General Public License
  23  * as published by the Free Software Foundation; either version 2
  24  * of the License, or (at your option) any later version.
  25  *
  26  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
  27  * KIND, either express or implied.
  28  *
  29  ****************************************************************************/
  30
  31 #include "config.h"
  32 /* Codecs should not normally do this, but we need to check a macro, and
  33  * codecs.h would confuse the assembler. */
  34
  35 #ifdef USE_IRAM                         /* USE_IRAM builds also turn on DIV_RECIP, which selects the reciprocal-table path in the ARM_ARCH < 5 udiv32_arm */
  36 #define DIV_RECIP
  37     .section    .icode,"ax",%progbits   /* allocatable + executable section named .icode (presumably placed in IRAM by the linker script - confirm) */
  38 #else
  39     .text                               /* otherwise the code goes to the ordinary text section and DIV_RECIP stays undefined */
  40 #endif
  41     .align                              /* NOTE(review): bare .align; GNU as documents this as 4-byte alignment on ARM - confirm */
  42     .global udiv32_arm                  /* export the entry point */
  43     .type   udiv32_arm,%function        /* mark the symbol as a function for the ELF symbol table */
  44
  45 #if ARM_ARCH < 5
  46 /* Adapted from an algorithm given in ARM System Developer's Guide (7.3.1.2)
  47    for dividing a 30-bit value by a 15-bit value, with two operations per
  48    iteration by storing quotient and remainder together and adding the previous
  49    quotient bit during trial subtraction. Modified to work with any dividend
  50    and divisor both less than 1 << 30, and skipping trials by calculating bits
  51    in output. */
  52 .macro ARM_DIV_31_BODY dividend, divisor, result, bits, curbit, quotient, remainder
  53     /* In: \dividend (caller guarantees top bit clear), \divisor = NEGATED non-zero divisor. Out: \quotient, \remainder. Clobbers \divisor, \result, \bits, \curbit and the flags. */
  54     mov     \bits, #1                           /* quotient width so far: one bit */
  55     /* Shift the divisor left until it aligns with the numerator. If it already
  56        has the high bit set, this is fine, everything inside .rept will be
  57        skipped, and the add before and adcs after will set the one-bit result
  58        to zero. */
  59     cmn     \divisor, \dividend, lsr #16        /* adds the negated divisor: carry set when (dividend >> 16) >= divisor */
  60     movcs   \divisor, \divisor, lsl #16
  61     addcs   \bits, \bits, #16                   /* 16 more quotient bits will be produced */
  62     cmn     \divisor, \dividend, lsr #8         /* same test and shift for 8, 4, 2 and 1 positions */
  63     movcs   \divisor, \divisor, lsl #8
  64     addcs   \bits, \bits, #8
  65     cmn     \divisor, \dividend, lsr #4
  66     movcs   \divisor, \divisor, lsl #4
  67     addcs   \bits, \bits, #4
  68     cmn     \divisor, \dividend, lsr #2
  69     movcs   \divisor, \divisor, lsl #2
  70     addcs   \bits, \bits, #2
  71     cmn     \divisor, \dividend, lsr #1
  72     movcs   \divisor, \divisor, lsl #1
  73     addcs   \bits, \bits, #1
  74     adds    \result, \dividend, \divisor        /* first trial subtraction; the carry out is the top quotient bit */
  75     subcc   \result, \result, \divisor          /* on borrow undo it; sub without S leaves the carry untouched */
  76     rsb     \curbit, \bits, #31                 /* number of unrolled steps to skip, so \bits - 1 steps run */
  77     add     pc, pc, \curbit, lsl #3             /* pc reads as this instruction + 8 = first step after the nop; each step is 2 instructions = 8 bytes */
  78     nop                                         /* fills the slot between the add and its pc + 8 base */
  79     .rept   30
  80     adcs    \result, \divisor, \result, lsl #1  /* shift in the previous quotient bit and trial-subtract in one instruction */
  81     /* Fix the remainder portion of the result. This must be done because the
  82        handler for 32-bit numerators needs the remainder. */
  83     subcc   \result, \result, \divisor
  84     .endr
  85     /* Shift remainder/quotient left one, add final quotient bit */
  86     adc     \result, \result, \result
  87     mov     \remainder, \result, lsr \bits      /* everything above the low \bits bits is the remainder */
  88     eor     \quotient, \result, \remainder, lsl \bits   /* cancel the remainder bits, leaving the low \bits quotient bits */
  89 .endm
  90
  91 /* recip_max: largest divisor served by the reciprocal table; the table at
  92    .L_udiv_recip_table holds recip_max - 2 words. One value per CPU. */
  93 #if defined(CPU_PP) && (CONFIG_CPU == PP5020)
  94 .equ recip_max, 8384
  95 #elif defined(CPU_PP) && (CONFIG_CPU == PP5002)
  96 .equ recip_max, 4608
  97 #elif defined(CPU_PP)
  98 .equ recip_max, 16384
  99 #elif CONFIG_CPU == AS3525
 100 .equ recip_max, 42752
 101 #elif CONFIG_CPU == S5L8701
 102 .equ recip_max, 13184
 103 #elif CONFIG_CPU == S5L8700
 104 .equ recip_max, 9088
 105 #endif
 106
 107 udiv32_arm:                               /* in: r0 = numerator, r1 = divisor; out: r0 = quotient */
 108 #ifdef DIV_RECIP
 109     cmp     r1, #3
 110     bcc     .L_udiv_tiny                  /* divisors 0, 1 and 2 have no table entry */
 111     cmp     r1, #recip_max
 112     bhi     .L_udiv                       /* divisor beyond the table: general shift-and-subtract path */
 113     adr     r3, .L_udiv_recip_table-12    /* base biased by 3 words so that divisor 3 indexes the first entry */
 114     ldr     r2, [r3, r1, lsl #2]          /* r2 = table word for this divisor */
 115     mov     r3, r0                        /* keep the numerator for the check below */
 116     umull   ip, r0, r2, r0                /* r0 = high word of numerator * table word = quotient estimate; ip = discarded low word */
 117     mul     r2, r0, r1                    /* r2 = estimate * divisor */
 118     cmp     r3, r2
 119     bxcs    lr                            /* numerator >= estimate * divisor: return the estimate unchanged */
 120     sub     r0, r0, #1                    /* otherwise the estimate is lowered by one */
 121     bx      lr
 122 .L_udiv_tiny:
 123     cmp     r1, #1                        /* HI: divisor 2, EQ: divisor 1, CC: divisor 0 */
 124     movhi   r0, r0, lsr #1                /* divisor 2: quotient = numerator >> 1 */
 125     bxcs    lr                            /* divisor 1 or 2: return (r0 is untouched for divisor 1) */
 126     b       .L_div0                       /* divisor 0 */
 127 #endif
 128 .L_udiv:
 129     /* Invert divisor. ARM_DIV_31_BODY uses adc to both subtract the divisor
 130        and add the next bit of the result. The correction code at .L_udiv32
 131        does not need the divisor inverted, but can be modified to work with it,
 132        and this allows the zero divisor test to be done early and without an
 133        explicit comparison. */
 134     rsbs    r1, r1, #0                    /* r1 = -divisor; Z is set when the divisor was zero */
 135 #ifndef DIV_RECIP
 136     beq .L_div0                           /* with DIV_RECIP a zero divisor was already sent to .L_div0 by .L_udiv_tiny */
 137 #endif
 138     tst     r0, r0                        /* N = top bit of the numerator */
 139     /* High bit must be unset, otherwise shift numerator right, calculate,
 140        and correct results. As this case is very uncommon we want to avoid
 141        any other delays on the main path in handling it, so the long divide
 142        calls the short divide as a function. */
 143     bmi     .L_udiv32
 144 .L_udiv31:                                /* also entered with bl from .L_udiv32 */
 145     ARM_DIV_31_BODY r0, r1, r2, r3, ip, r0, r1    /* r0 = quotient, r1 = remainder; r2, r3 and ip are scratch */
 146     bx      lr
 147 .L_udiv32:
 148     /* Numerator has its top bit set. Push the original numerator, the negated
 149        divisor and lr; writeback keeps the saved words at or above sp. */
 150     stmdb   sp!, { r0, r1, lr }
 151     /* Divide numerator >> 1 with the 31-bit routine. The divisor cannot be
 152        zero here: a zero divisor was already branched to .L_div0. */
 153     mov     r0, r0, lsr #1
 154     bl      .L_udiv31                     /* internal leaf call: it never touches sp */
 155     ldmia   sp!, { r2, r3, lr }           /* r2 = numerator, r3 = -divisor; sp is back at its entry value */
 156     /* Move the low bit of the original numerator to the carry bit */
 157     movs    r2, r2, lsr #1
 158     /* Shift the remainder left one and add in the carry bit */
 159     adc     r1, r1, r1
 160     /* Add the negated divisor (i.e. subtract the divisor) from the remainder,
 161        setting carry if the result is non-negative */
 162     adds    r1, r1, r3
 163     /* Shift quotient left one and add carry bit */
 164     adc     r0, r0, r0
 165     bx      lr                            /* r0 = quotient; r1 is left as remainder - divisor when the subtraction borrowed */
 166 .L_div0:
 167     /* __div0 expects the calling address on the top of the stack */
 168     stmdb sp!, { lr }
 169     mov     r0, #0                        /* r0 = 0 on entry to __div0 */
 170 #if defined(__ARM_EABI__) || !defined(USE_IRAM)
 171     bl      __div0                        /* NOTE(review): nothing handles a return from __div0; presumably it never returns - confirm */
 172 #else
 173     ldr     pc, [pc, #-4]                 /* pc reads as this instruction + 8, so this loads the .word on the next line and jumps to it (lr is not changed) */
 174     .word   __div0                        /* absolute address of __div0 (presumably out of bl range from this section - confirm) */
 175 #endif
 176 #ifdef DIV_RECIP
 177 .L_udiv_recip_table:                      /* one word per divisor 3..recip_max, read by udiv32_arm through a base biased by 12 bytes */
 178     .set div, 3
 179     .rept recip_max - 2
 180         .if (div - 1) & div               /* non-zero when div is not a power of two */
 181             .set q, 0x40000000 / div      /* start from floor(2^30 / div) and its remainder */
 182             .set r, (0x40000000 - (q * div))<<1
 183             .set q, q << 1
 184             .if r >= div                  /* long-division step: develops one more quotient bit */
 185                 .set q, q + 1
 186                 .set r, r - div
 187             .endif
 188             .set r, r << 1
 189             .set q, q << 1
 190             .if r >= div                  /* second long-division step: q is now floor(2^32 / div) */
 191                 .set q, q + 1
 192                 .set r, r - div
 193             .endif
 194             .set q, q + 1                 /* stored value is floor(2^32 / div) + 1 */
 195         .else
 196             .set q, 0x40000000 / div * 4  /* power of two: exactly 2^32 / div */
 197         .endif
 198         .word q
 199         .set div, div+1
 200     .endr
 201 #endif
 202     .size udiv32_arm, . - udiv32_arm      /* symbol size = bytes from the entry label to here (table included when present) */
 203
 204 #else
 205 .macro ARMV5_UDIV32_BODY numerator, divisor, quotient, bits, inv, neg, div0label
 206     cmp     \numerator, \divisor            /* In: \numerator, \divisor. Out: \quotient. Clobbers \numerator, \divisor, \bits, \inv, \neg, flags. Every path ends in bx lr or a branch to \div0label. */
 207     clz     \bits, \divisor                 /* leading zeros of the divisor; clz leaves the flags from the cmp intact */
 208     bcc     30f                             /* numerator < divisor: quotient is 0 */
 209     mov     \inv, \divisor, lsl \bits       /* normalise: highest set bit moved to bit 31 (result is 0 for a zero divisor) */
 210     add     \neg, pc, \inv, lsr #25         /* pc here is the address of the ldrhib below; add the top 7 bits of the normalised divisor */
 211     cmp     \inv, #1<<31                    /* EQ: power of two, LO: zero divisor, HI: everything else */
 212     ldrhib  \inv, [\neg, #.L_udiv_est_table-.-64]   /* HI only: \inv = .L_udiv_est_table[top 7 bits - 64] */
 213     bls     20f
 214     subs    \bits, \bits, #7
 215     rsb     \neg, \divisor, #0              /* \neg = -divisor */
 216     movpl   \divisor, \inv, lsl \bits       /* clz >= 7: scale the table estimate into \divisor */
 217     bmi     10f                             /* clz < 7: large divisor, handled at 10 */
 218     mul     \inv, \divisor, \neg            /* two refinement rounds of the estimate follow (looks like Newton-Raphson - TODO confirm) */
 219     smlawt  \divisor, \divisor, \inv, \divisor
 220     mul     \inv, \divisor, \neg
 221     /* This will save a cycle on ARMv6, but does not produce a correct result
 222        if numerator sign bit is set. This case accounts for about 1 in 10^7 of
 223        divisions, done by the APE decoder, so we specialize for the more common
 224        case and handle the uncommon large-numerator separately */
 225 #if ARM_ARCH >= 6
 226     tst     \numerator, \numerator          /* N = top bit of the numerator */
 227     smmla   \divisor, \divisor, \inv, \divisor
 228     bmi     40f                             /* top bit set: use the unsigned multiply at 40 */
 229     smmul   \inv, \numerator, \divisor      /* \inv = high word of numerator * reciprocal = quotient estimate */
 230 #else
 231     mov     \bits, #0
 232     smlal   \bits, \divisor, \inv, \divisor
 233     umull   \bits, \inv, \numerator, \divisor   /* \inv = high word of numerator * reciprocal = quotient estimate */
 234 #endif
 235     add     \numerator, \numerator, \neg    /* numerator - divisor */
 236     mla     \divisor, \inv, \neg, \numerator    /* residual = numerator - (estimate + 1) * divisor */
 237     mov     \quotient, \inv
 238     cmn     \divisor, \neg                  /* flags of residual - divisor select the corrections below */
 239     addcc   \quotient, \quotient, #1        /* +1 when the carry is clear */
 240     addpl   \quotient, \quotient, #2        /* +2 when the result is not negative */
 241     bx      lr
 242 10:
 243     rsb     \bits, \bits, #0                /* \bits = 7 - clz, used as a right-shift count */
 244     sub     \inv, \inv, #4
 245     mov     \divisor, \inv, lsr \bits       /* scaled reciprocal estimate */
 246     umull   \bits, \inv, \numerator, \divisor   /* \inv = high word of numerator * estimate */
 247     mla     \divisor, \inv, \neg, \numerator    /* residual = numerator - \inv * divisor */
 248     mov     \quotient, \inv
 249     cmn     \neg, \divisor, lsr #1          /* carry when (residual >> 1) >= divisor */
 250     addcs   \divisor, \divisor, \neg, lsl #1    /* residual -= 2 * divisor */
 251     addcs   \quotient, \quotient, #2
 252     cmn     \neg, \divisor                  /* carry when residual >= divisor */
 253     addcs   \quotient, \quotient, #1
 254     bx      lr
 255 20:                                         /* divisor is a power of two (EQ) or zero (LO) */
 256     rsb     \bits, \bits, #31               /* shift count = 31 - clz; kept outside the .ifnc so the shift is right even when no \div0label is given */
 257 .ifnc "", "\div0label"
 258     bne     \div0label                      /* flags still come from the cmp against 1<<31: NE means the divisor was zero */
 259 .endif
 260     mov     \quotient, \numerator, lsr \bits
 261     bx      lr
 262 30:
 263     mov     \quotient, #0
 264     bx      lr
 265 #if ARM_ARCH >= 6
 266 40:
 267     umull   \bits, \inv, \numerator, \divisor   /* unsigned high-word multiply for numerators with the top bit set */
 268     add     \numerator, \numerator, \neg
 269     mla     \divisor, \inv, \neg, \numerator
 270     mov     \quotient, \inv
 271     cmn     \divisor, \neg
 272     addcc   \quotient, \quotient, #1
 273     addpl   \quotient, \quotient, #2
 274     bx      lr
 275 #endif
 276 .endm
 277
 278 udiv32_arm:                               /* in: r0 = numerator, r1 = divisor; out: r0 = quotient; r1, r2, r3 and ip are clobbered */
 279     ARMV5_UDIV32_BODY r0, r1, r0, r2, r3, ip, .L_div0    /* every path in the macro ends with bx lr or a branch to .L_div0 */
 280 .L_div0:
 281     /* __div0 expects the calling address on the top of the stack */
 282     stmdb sp!, { lr }
 283     mov     r0, #0                        /* r0 = 0 on entry to __div0 */
 284 #if defined(__ARM_EABI__) || !defined(USE_IRAM)
 285     bl      __div0                        /* NOTE(review): nothing handles a return from __div0; presumably it never returns - confirm */
 286 #else
 287     ldr     pc, [pc, #-4]                 /* pc reads as this instruction + 8, so this loads the .word on the next line and jumps to it */
 288     .word   __div0
 289 #endif
 290 .L_udiv_est_table:                        /* 64 bytes, read by ARMV5_UDIV32_BODY at index (top 7 bits of the normalised divisor) - 64 */
 291     .byte 0xff, 0xfc, 0xf8, 0xf4, 0xf0, 0xed, 0xea, 0xe6    /* values look like floor(2^14 / (64 + i)) capped at 0xff - TODO confirm */
 292     .byte 0xe3, 0xe0, 0xdd, 0xda, 0xd7, 0xd4, 0xd2, 0xcf
 293     .byte 0xcc, 0xca, 0xc7, 0xc5, 0xc3, 0xc0, 0xbe, 0xbc
 294     .byte 0xba, 0xb8, 0xb6, 0xb4, 0xb2, 0xb0, 0xae, 0xac
 295     .byte 0xaa, 0xa8, 0xa7, 0xa5, 0xa3, 0xa2, 0xa0, 0x9f
 296     .byte 0x9d, 0x9c, 0x9a, 0x99, 0x97, 0x96, 0x94, 0x93
 297     .byte 0x92, 0x90, 0x8f, 0x8e, 0x8d, 0x8c, 0x8a, 0x89
 298     .byte 0x88, 0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81
 299 #endif
 300     .size udiv32_arm, . - udiv32_arm      /* symbol size = bytes from the entry label to here; ARM_ARCH < 5 builds also emit this directive just before the #else */