apps/codecs/libwavpack/arm.S

   1 /***************************************************************************
   2  *             __________               __   ___.
   3  *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
   4  *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
   5  *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
   6  *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
   7  *                     \/            \/     \/    \/            \/
   8  * $Id$
   9  *
  10  * Copyright (C) 2006 by David Bryant
  11  *
  12  * This program is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU General Public License
  14  * as published by the Free Software Foundation; either version 2
  15  * of the License, or (at your option) any later version.
  16  *
  17  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
  18  * KIND, either express or implied.
  19  *
  20  ****************************************************************************/
  21
  22 /* This is an assembly optimized version of the following WavPack function:
  23  *
  24  * void decorr_stereo_pass_cont_arm (struct decorr_pass *dpp,
  25  *                                   long *buffer, long sample_count);
  26  *
  27  * It performs a single pass of stereo decorrelation on the provided buffer.
  28  * Note that this version of the function requires that the 8 previous stereo
  29  * samples are visible and correct. In other words, it ignores the "samples_*"
  30  * fields in the decorr_pass structure and gets the history data directly
  31  * from the buffer. It does, however, return the appropriate history samples
  32  * to the decorr_pass structure before returning.
  33  *
  34  * This is written to work on an ARM7TDMI processor. This version only uses the
  35  * 32-bit multiply-accumulate instruction and so will overflow with 24-bit
  36  * WavPack files.
  37  */
  38
  39 #include "config.h"
  40
  41         .text
  42         .align
  43         .global         decorr_stereo_pass_cont_arm
  44
  45 /*
  46  * on entry:
  47  *
  48  * r0 = struct decorr_pass *dpp
  49  * r1 = long *buffer
  50  * r2 = long sample_count
  51  */
  52
  53 decorr_stereo_pass_cont_arm:
  54
  55         stmfd   sp!, {r4 - r8, r10, r11, lr}   @ save working registers and lr
  56         mov     r5, r0                  @ r5 = dpp
  57         mov     r11, #512               @ r11 = 512 for rounding
  58         ldrsh   r6, [r0, #2]            @ r6 = dpp->delta
  59         ldrsh   r4, [r0, #4]            @ r4 = dpp->weight_A
  60         ldrsh   r0, [r0, #6]            @ r0 = dpp->weight_B
  61         cmp     r2, #0                  @ exit if sample_count <= 0 (the loops
  62         ble     common_exit             @  below always run at least once)
  63
  64         add     r7, r1, r2, asl #3      @ r7 = buffer ending position
  65         ldrsh   r2, [r5, #0]            @ r2 = dpp->term
  66         cmp     r2, #0                  @ negative terms are dispatched separately
  67         bmi     minus_term
  68
  69         ldr     lr, [r1, #-16]          @ load 2 sample history from buffer
  70         ldr     r10, [r1, #-12]         @  for terms 2, 17, and 18
  71         ldr     r8, [r1, #-8]           @ (lr, r10 = 2nd previous left, right;
  72         ldr     r3, [r1, #-4]           @   r8, r3 = previous left, right)
  73         cmp     r2, #17
  74         beq     term_17_loop
  75         cmp     r2, #18
  76         beq     term_18_loop
  77         cmp     r2, #2
  78         beq     term_2_loop
  79         b       term_default_loop       @ else handle default (1-8, except 2)
  80
  81 minus_term:
  82         mov     r10, #1024              @ r10 = -1024 for weight clipping
  83         rsb     r10, r10, #0            @  (only used for negative terms)
  84         cmn     r2, #1                  @ term == -1 ?
  85         beq     term_minus_1
  86         cmn     r2, #2                  @ term == -2 ?
  87         beq     term_minus_2
  88         cmn     r2, #3                  @ term == -3 ?
  89         beq     term_minus_3
  90         b       common_exit             @ any other negative term: just exit
  91
  92 /*
  93  ******************************************************************************
  94  * Loop to handle term = 17 condition
  95  *
  96  * r0 = dpp->weight_B           r8 = previous left sample
  97  * r1 = bptr                    r9 =
  98  * r2 = current sample          r10 = second previous right sample
  99  * r3 = previous right sample   r11 = 512 (for rounding)
 100  * r4 = dpp->weight_A           ip = current decorrelation value
 101  * r5 = dpp                     sp =
 102  * r6 = dpp->delta              lr = second previous left sample
 103  * r7 = eptr                    pc =
 104  *******************************************************************************
 105  */
 106
 107 term_17_loop:
 108         rsbs    ip, lr, r8, asl #1      @ decorr value = (2 * prev) - 2nd prev
 109         mov     lr, r8                  @ previous becomes 2nd previous
 110         ldr     r2, [r1], #4            @ get sample & update pointer
 111         mla     r8, ip, r4, r11         @ mult decorr value by weight, round,
 112         add     r8, r2, r8, asr #10     @  shift, and add to new sample
 113         strne   r8, [r1, #-4]           @ if change possible, store sample back
 114         cmpne   r2, #0                  @ skip weight update if decorr value
 115         beq     .L325                   @  or original sample is zero
 116         teq     ip, r2                  @ update weight based on signs
 117         submi   r4, r4, r6              @  signs differ: weight_A -= delta
 118         addpl   r4, r4, r6              @  signs match:  weight_A += delta
 119
 120 .L325:  rsbs    ip, r10, r3, asl #1     @ do same thing for right channel
 121         mov     r10, r3                 @  (r3 = previous, r10 = 2nd previous,
 122         ldr     r2, [r1], #4            @   r0 = weight_B)
 123         mla     r3, ip, r0, r11
 124         add     r3, r2, r3, asr #10
 125         strne   r3, [r1, #-4]           @ flags still from rsbs: skip if value is 0
 126         cmpne   r2, #0
 127         beq     .L329
 128         teq     ip, r2
 129         submi   r0, r0, r6              @  signs differ: weight_B -= delta
 130         addpl   r0, r0, r6              @  signs match:  weight_B += delta
 131
 132 .L329:  cmp     r7, r1                  @ loop back if more samples to do
 133         bhi     term_17_loop
 134         b       store_1718              @ common exit for terms 17 & 18
 135
 136 /*
 137  ******************************************************************************
 138  * Loop to handle term = 18 condition
 139  *
 140  * r0 = dpp->weight_B           r8 = previous left sample
 141  * r1 = bptr                    r9 =
 142  * r2 = current sample          r10 = second previous right sample
 143  * r3 = previous right sample   r11 = 512 (for rounding)
 144  * r4 = dpp->weight_A           ip = decorrelation value
 145  * r5 = dpp                     sp =
 146  * r6 = dpp->delta              lr = second previous left sample
 147  * r7 = eptr                    pc =
 148  *******************************************************************************
 149  */
 150
 151 term_18_loop:
 152         sub     ip, r8, lr              @ decorr value =
 153         mov     lr, r8                  @  ((3 * prev) - 2nd prev) >> 1
 154         adds    ip, r8, ip, asr #1      @  computed as prev + ((prev - 2nd) >> 1)
 155         ldr     r2, [r1], #4            @ get sample & update pointer
 156         mla     r8, ip, r4, r11         @ mult decorr value by weight, round,
 157         add     r8, r2, r8, asr #10     @  shift, and add to new sample
 158         strne   r8, [r1, #-4]           @ if change possible, store sample back
 159         cmpne   r2, #0                  @ skip weight update if decorr value
 160         beq     .L337                   @  or original sample is zero
 161         teq     ip, r2                  @ update weight based on signs
 162         submi   r4, r4, r6              @  signs differ: weight_A -= delta
 163         addpl   r4, r4, r6              @  signs match:  weight_A += delta
 164
 165 .L337:  sub     ip, r3, r10             @ do same thing for right channel
 166         mov     r10, r3                 @  (r3 = previous, r10 = 2nd previous,
 167         adds    ip, r3, ip, asr #1      @   r0 = weight_B)
 168         ldr     r2, [r1], #4
 169         mla     r3, ip, r0, r11
 170         add     r3, r2, r3, asr #10
 171         strne   r3, [r1, #-4]           @ flags still from adds: skip if value is 0
 172         cmpne   r2, #0
 173         beq     .L341
 174         teq     ip, r2
 175         submi   r0, r0, r6              @  signs differ: weight_B -= delta
 176         addpl   r0, r0, r6              @  signs match:  weight_B += delta
 177
 178 .L341:  cmp     r7, r1                  @ loop back if more samples to do
 179         bhi     term_18_loop
 180
 181 /* common exit for terms 17 & 18 */
 182
 183 store_1718:
 184         str     r3, [r5, #40]           @ store sample history into struct:
 185         str     r8, [r5, #8]            @  offset 40 / 8 = previous right / left,
 186         str     r10, [r5, #44]          @  offset 44 / 12 = 2nd previous right /
 187         str     lr, [r5, #12]           @  left
 188         b       common_exit             @ and return
 189
 190 /*
 191  ******************************************************************************
 192  * Loop to handle term = 2 condition
 193  * (note that this case can be handled by the default term handler (1-8), but
 194  * this special case is faster because it doesn't have to read memory twice)
 195  *
 196  * r0 = dpp->weight_B           r8 = previous left sample
 197  * r1 = bptr                    r9 =
 198  * r2 = current sample          r10 = second previous right sample
 199  * r3 = previous right sample   r11 = 512 (for rounding)
 200  * r4 = dpp->weight_A           ip = decorrelation value
 201  * r5 = dpp                     sp =
 202  * r6 = dpp->delta              lr = second previous left sample
 203  * r7 = eptr                    pc =
 204  *******************************************************************************
 205  */
 206
 207 term_2_loop:
 208         movs    ip, lr                  @ get decorrelation value & test
 209         mov     lr, r8                  @ previous becomes 2nd previous
 210         ldr     r2, [r1], #4            @ get sample & update pointer
 211         mla     r8, ip, r4, r11         @ mult decorr value by weight, round,
 212         add     r8, r2, r8, asr #10     @  shift, and add to new sample
 213         strne   r8, [r1, #-4]           @ if change possible, store sample back
 214         cmpne   r2, #0                  @ skip weight update if decorr value
 215         beq     .L225                   @  or original sample is zero
 216         teq     ip, r2                  @ update weight based on signs
 217         submi   r4, r4, r6              @  signs differ: weight_A -= delta
 218         addpl   r4, r4, r6              @  signs match:  weight_A += delta
 219
 220 .L225:  movs    ip, r10                 @ do same thing for right channel
 221         mov     r10, r3                 @  (r3 = previous, r10 = 2nd previous,
 222         ldr     r2, [r1], #4            @   r0 = weight_B)
 223         mla     r3, ip, r0, r11
 224         add     r3, r2, r3, asr #10
 225         strne   r3, [r1, #-4]           @ flags still from movs: skip if value is 0
 226         cmpne   r2, #0
 227         beq     .L229
 228         teq     ip, r2
 229         submi   r0, r0, r6              @  signs differ: weight_B -= delta
 230         addpl   r0, r0, r6              @  signs match:  weight_B += delta
 231
 232 .L229:  cmp     r7, r1                  @ loop back if more samples to do
 233         bhi     term_2_loop
 234         b       default_term_exit       @ this exit updates all dpp->samples
 235
 236 /*
 237  ******************************************************************************
 238  * Loop to handle default term condition
 239  *
 240  * r0 = dpp->weight_B           r8 = result accumulator
 241  * r1 = bptr                    r9 =
 242  * r2 = dpp->term               r10 =
 243  * r3 = decorrelation value     r11 = 512 (for rounding)
 244  * r4 = dpp->weight_A           ip = current sample
 245  * r5 = dpp                     sp =
 246  * r6 = dpp->delta              lr =
 247  * r7 = eptr                    pc =
 248  *******************************************************************************
 249  */
 250
 251 term_default_loop:
 252         ldr     ip, [r1]                @ get original sample
 253         ldr     r3, [r1, -r2, asl #3]   @ get decorrelation value based on term
 254         mla     r8, r3, r4, r11         @ mult decorr value by weight, round,
 255         add     r8, ip, r8, asr #10     @  shift and add to new sample
 256         str     r8, [r1], #4            @ store updated sample & update pointer
 257         cmp     r3, #0                  @ skip weight update if decorr value
 258         cmpne   ip, #0                  @  or original sample is zero
 259         beq     .L350
 260         teq     ip, r3                  @ update weight based on signs
 261         submi   r4, r4, r6              @  signs differ: weight_A -= delta
 262         addpl   r4, r4, r6              @  signs match:  weight_A += delta
 263
 264 .L350:  ldr     ip, [r1]                @ do the same thing for right channel
 265         ldr     r3, [r1, -r2, asl #3]   @  (same channel, term samples back)
 266         mla     r8, r3, r0, r11
 267         add     r8, ip, r8, asr #10
 268         str     r8, [r1], #4
 269         cmp     r3, #0
 270         cmpne   ip, #0
 271         beq     .L354
 272         teq     ip, r3
 273         submi   r0, r0, r6              @  signs differ: weight_B -= delta
 274         addpl   r0, r0, r6              @  signs match:  weight_B += delta
 275
 276 .L354:  cmp     r7, r1                  @ loop back if more samples to do
 277         bhi     term_default_loop
 278
 279 /*
 280  * This exit is used by terms 1-8 to store the previous 8 samples into the decorr
 281  * structure (even if they are not all used for the given term)
 282  */
 283
 284 default_term_exit:
 285         ldrsh   r3, [r5, #0]            @ r3 = dpp->term
 286         sub     ip, r3, #1              @ ip = history index, starts at term - 1
 287         mov     lr, #7                  @ lr = loop counter (7 down to 0)
 288
 289 .L358:  and     r3, ip, #7              @ wrap index into 0-7
 290         add     r3, r5, r3, asl #2      @ r3 = dpp + (4 * index)
 291         ldr     r2, [r1, #-4]           @ copy right sample to history slot at
 292         str     r2, [r3, #40]           @  struct offset 40 + (4 * index)
 293         ldr     r2, [r1, #-8]!          @ step back one stereo sample, copy left
 294         str     r2, [r3, #8]            @  sample to offset 8 + (4 * index)
 295         sub     ip, ip, #1              @ next (older) history index
 296         sub     lr, lr, #1
 297         cmn     lr, #1                  @ done when counter reaches -1
 298         bne     .L358
 299         b       common_exit
 300
 301 /*
 302  ******************************************************************************
 303  * Loop to handle term = -1 condition
 304  *
 305  * r0 = dpp->weight_B           r8 =
 306  * r1 = bptr                    r9 =
 307  * r2 = intermediate result     r10 = -1024 (for clipping)
 308  * r3 = previous right sample   r11 = 512 (for rounding)
 309  * r4 = dpp->weight_A           ip = current sample
 310  * r5 = dpp                     sp =
 311  * r6 = dpp->delta              lr = updated left sample
 312  * r7 = eptr                    pc =
 313  *******************************************************************************
 314  */
 315
 316 term_minus_1:
 317         ldr     r3, [r1, #-4]           @ r3 = previous right sample
 318
 319 term_minus_1_loop:
 320         ldr     ip, [r1]                @ for left channel the decorrelation value
 321         mla     r2, r3, r4, r11         @  is the previous right sample (in r3)
 322         add     lr, ip, r2, asr #10     @ lr = updated left sample
 323         str     lr, [r1], #8            @ store it & step to next stereo sample
 324         cmp     r3, #0                  @ skip weight update if decorr value
 325         cmpne   ip, #0                  @  or original sample is zero
 326         beq     .L361
 327         teq     ip, r3                  @ update weight based on signs
 328         submi   r4, r4, r6
 329         addpl   r4, r4, r6
 330         cmp     r4, #1024               @ then clip weight to +/-1024
 331         movgt   r4, #1024
 332         cmp     r4, r10
 333         movlt   r4, r10
 334
 335 .L361:  ldr     r2, [r1, #-4]           @ for right channel the decorrelation value
 336         mla     r3, lr, r0, r11         @  is the just updated left sample (in lr)
 337         add     r3, r2, r3, asr #10     @ r3 = updated right sample (the previous
 338         str     r3, [r1, #-4]           @  right sample for the next iteration)
 339         cmp     lr, #0
 340         cmpne   r2, #0
 341         beq     .L369
 342         teq     r2, lr
 343         submi   r0, r0, r6
 344         addpl   r0, r0, r6
 345         cmp     r0, #1024               @ then clip weight to +/-1024
 346         movgt   r0, #1024
 347         cmp     r0, r10
 348         movlt   r0, r10
 349
 350 .L369:  cmp     r7, r1                  @ loop back if more samples to do
 351         bhi     term_minus_1_loop
 352
 353         str     r3, [r5, #8]            @ else store right sample and exit
 354         b       common_exit
 355
 356 /*
 357  ******************************************************************************
 358  * Loop to handle term = -2 condition
 359  * (note that the channels are processed in the reverse order here)
 360  *
 361  * r0 = dpp->weight_B           r8 =
 362  * r1 = bptr                    r9 =
 363  * r2 = intermediate result     r10 = -1024 (for clipping)
 364  * r3 = previous left sample    r11 = 512 (for rounding)
 365  * r4 = dpp->weight_A           ip = current sample
 366  * r5 = dpp                     sp =
 367  * r6 = dpp->delta              lr = updated right sample
 368  * r7 = eptr                    pc =
 369  *******************************************************************************
 370  */
 371
 372 term_minus_2:
 373         ldr     r3, [r1, #-8]           @ r3 = previous left sample
 374
 375 term_minus_2_loop:
 376         ldr     ip, [r1, #4]            @ for right channel the decorrelation value
 377         mla     r2, r3, r0, r11         @  is the previous left sample (in r3)
 378         add     lr, ip, r2, asr #10     @ lr = updated right sample
 379         str     lr, [r1, #4]
 380         cmp     r3, #0                  @ skip weight update if decorr value
 381         cmpne   ip, #0                  @  or original sample is zero
 382         beq     .L380
 383         teq     ip, r3                  @ update weight based on signs
 384         submi   r0, r0, r6
 385         addpl   r0, r0, r6
 386         cmp     r0, #1024               @ then clip weight to +/-1024
 387         movgt   r0, #1024
 388         cmp     r0, r10
 389         movlt   r0, r10
 390
 391 .L380:  ldr     r2, [r1, #0]            @ for left channel the decorrelation value
 392         mla     r3, lr, r4, r11         @  is the just updated right sample (in lr)
 393         add     r3, r2, r3, asr #10     @ r3 = updated left sample (the previous
 394         str     r3, [r1], #8            @  left sample for the next iteration)
 395         cmp     lr, #0
 396         cmpne   r2, #0
 397         beq     .L388
 398         teq     r2, lr
 399         submi   r4, r4, r6
 400         addpl   r4, r4, r6
 401         cmp     r4, #1024               @ then clip weight to +/-1024
 402         movgt   r4, #1024
 403         cmp     r4, r10
 404         movlt   r4, r10
 405
 406 .L388:  cmp     r7, r1                  @ loop back if more samples to do
 407         bhi     term_minus_2_loop
 408
 409         str     r3, [r5, #40]           @ else store left channel and exit
 410         b       common_exit
 411
 412 /*
 413  ******************************************************************************
 414  * Loop to handle term = -3 condition
 415  *
 416  * r0 = dpp->weight_B           r8 = previous left sample
 417  * r1 = bptr                    r9 =
 418  * r2 = new left, then right    r10 = -1024 (for clipping)
 419  * r3 = previous right sample   r11 = 512 (for rounding)
 420  * r4 = dpp->weight_A           ip = left, then previous left
 421  * r5 = dpp                     sp =
 422  * r6 = dpp->delta              lr =
 423  * r7 = eptr                    pc =
 424  *******************************************************************************
 425  */
 426
 427 term_minus_3:
 428         ldr     r3, [r1, #-4]           @ load previous samples
 429         ldr     r8, [r1, #-8]
 430
 431 term_minus_3_loop:
 432         ldr     ip, [r1]                @ for left channel the decorrelation value
 433         mla     r2, r3, r4, r11         @  is the previous right sample (in r3)
 434         add     r2, ip, r2, asr #10     @ r2 = updated left sample
 435         str     r2, [r1], #4
 436         cmp     r3, #0                  @ skip weight update if decorr value
 437         cmpne   ip, #0                  @  or original sample is zero
 438         beq     .L399
 439         teq     ip, r3                  @ update weight based on signs
 440         submi   r4, r4, r6
 441         addpl   r4, r4, r6
 442         cmp     r4, #1024               @ then clip weight to +/-1024
 443         movgt   r4, #1024
 444         cmp     r4, r10
 445         movlt   r4, r10
 446
 447 .L399:  movs    ip, r8                  @ ip = previous left we use now
 448         mov     r8, r2                  @ r8 = current left we use next time
 449         ldr     r2, [r1], #4            @ r2 = original right sample
 450         mla     r3, ip, r0, r11         @ for right channel the decorrelation value
 451         add     r3, r2, r3, asr #10     @  is the previous left sample (in ip)
 452         strne   r3, [r1, #-4]           @ flags still from movs: skip if value is 0
 453         cmpne   r2, #0
 454         beq     .L407
 455         teq     ip, r2
 456         submi   r0, r0, r6
 457         addpl   r0, r0, r6
 458         cmp     r0, #1024               @ then clip weight to +/-1024
 459         movgt   r0, #1024
 460         cmp     r0, r10
 461         movlt   r0, r10
 462
 463 .L407:  cmp     r7, r1                  @ loop back if more samples to do
 464         bhi     term_minus_3_loop
 465
 466         str     r3, [r5, #8]            @ else store previous samples & exit
 467         str     r8, [r5, #40]           @  (falls through into common_exit)
 468
 469 /*
 470  * Before finally exiting we must store weights back for next time
 471  */
 472
 473 common_exit:
 474         strh    r4, [r5, #4]            @ dpp->weight_A = r4
 475         strh    r0, [r5, #6]            @ dpp->weight_B = r0
 476         ldmpc   regs="r4-r8, r10-r11"   @ macro from outside this file; presumably pops the saved regs and returns
 477