Submit interim version of FS#10565. Performance optimization of atrac3 decoder for...
[kugel-rb.git] / apps / codecs / libatrac / atrac3_arm.S
blobbe8b2a0e0e4aeecc04fc4bf06fe2f15b4f675ea7
1 /***************************************************************************\r
2  *             __________               __   ___.\r
3  *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___\r
4  *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /\r
5  *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <\r
6  *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \\r
7  *                     \/            \/     \/    \/            \/\r
8  * $Id: \r
9  *\r
10  * Copyright (C) 2009 by Andree Buschmann\r
11  *\r
12  * This program is free software; you can redistribute it and/or\r
13  * modify it under the terms of the GNU General Public License\r
14  * as published by the Free Software Foundation; either version 2\r
15  * of the License, or (at your option) any later version.\r
16  *\r
17  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY\r
18  * KIND, either express or implied.\r
19  *\r
20  ****************************************************************************/\r
22     .section .text, "ax", %progbits\r
/****************************************************************************
 * void atrac3_iqmf_matrixing(int32_t *dest,
 *                            int32_t *inlo,
 *                            int32_t *inhi,
 *                            unsigned int count);
 *
 * Matrixing step within iqmf of atrac3 synthesis. Reference implementation:
 *
 * for(i=0; i<count; i+=2){
 *      dest[2*i+0] = inlo[i  ] + inhi[i  ];
 *      dest[2*i+1] = inlo[i  ] - inhi[i  ];
 *      dest[2*i+2] = inlo[i+1] + inhi[i+1];
 *      dest[2*i+3] = inlo[i+1] - inhi[i+1];
 * }
 *
 * Each pass of the loop below consumes four words from inlo and inhi and
 * writes eight words to dest, so count is consumed in steps of 4. A count
 * that is not a multiple of 4 is effectively rounded up to the next multiple;
 * the buffers must be large enough for that. count == 0 returns without
 * touching any buffer, matching the reference loop.
 *
 * In:    r0 = dest, r1 = inlo, r2 = inhi, r3 = count
 * Out:   nothing
 * Clobb: r0-r3, r12, flags (r4-r9 and lr are saved and restored)
 * Note: r12 is a scratch register and can be used without restoring it.
 ****************************************************************************/
    .align  2
    .global atrac3_iqmf_matrixing
    .type   atrac3_iqmf_matrixing, %function

atrac3_iqmf_matrixing:
    /* r0 = dest */
    /* r1 = inlo */
    /* r2 = inhi */
    /* r3 = count */
    stmfd   sp!, {r4-r9, lr}        /* save non-scratch registers */
    cmp     r3, #0                  /* the loop is bottom-tested, so an    */
    beq     .Liqmf_matrixing_exit   /* empty request must be skipped here  */

.Liqmf_matrixing_loop:
    ldmia   r1!, {r4, r6, r8, r12}  /* load inlo[0...3] */
    ldmia   r2!, {r5, r7, r9, lr}   /* load inhi[0...3] */
    /* The sum is formed first; the difference is then derived from it as
     * (lo + hi) - 2*hi, which equals lo - hi modulo 2^32 and needs no
     * additional register. */
    add     r4, r4, r5              /* r4  = inlo[0] + inhi[0] */
    sub     r5, r4, r5, lsl #1      /* r5  = inlo[0] - inhi[0] */
    add     r6, r6, r7              /* r6  = inlo[1] + inhi[1] */
    sub     r7, r6, r7, lsl #1      /* r7  = inlo[1] - inhi[1] */
    add     r8, r8, r9              /* r8  = inlo[2] + inhi[2] */
    sub     r9, r8, r9, lsl #1      /* r9  = inlo[2] - inhi[2] */
    add     r12, r12, lr            /* r12 = inlo[3] + inhi[3] */
    sub     lr, r12, lr, lsl #1     /* lr  = inlo[3] - inhi[3] */
    /* stmia stores in ascending register number, which yields exactly the
     * interleaved sum/difference order required in dest. */
    stmia   r0!, {r4-r9, r12, lr}   /* store 8 results to dest */
    subs    r3, r3, #4              /* count -= 4 */
    bgt     .Liqmf_matrixing_loop

.Liqmf_matrixing_exit:
    ldmfd   sp!, {r4-r9, pc}        /* restore registers and return */

.Latrac3_iqmf_matrixing_end:
    .size   atrac3_iqmf_matrixing,.Latrac3_iqmf_matrixing_end-atrac3_iqmf_matrixing
71     \r
/****************************************************************************
 * atrac3_iqmf_dewindowing(int32_t *out,
 *                         int32_t *in,
 *                         int32_t *win,
 *                         unsigned int nIn);
 *
 * Dewindowing step within iqmf of atrac3 synthesis. Reference implementation:
 *
 * for (j = nIn; j != 0; j--) {
 *          s1 = fixmul32(in[0], win[0]);
 *          s2 = fixmul32(in[1], win[1]);
 *          for (i = 2; i < 48; i += 2) {
 *              s1 += fixmul32(in[i  ], win[i  ]);
 *              s2 += fixmul32(in[i+1], win[i+1]);
 *          }
 *          out[0] = s2 << 1;
 *          out[1] = s1 << 1;
 *          in += 2;
 *          out += 2;
 *      }
 *
 * Implementation notes:
 *  - s1 and s2 are accumulated as full 64-bit sums of the signed 32x32
 *    products (smull/smlal). Each output word is bits [62:31] of its sum,
 *    i.e. (hi << 1) | (lo >> 31).
 *    NOTE(review): fixmul32 is not visible here; if it truncates each product
 *    to its high word, the low bit of the result can differ from the
 *    reference code - confirm against the C implementation.
 *  - Every outer pass reads 48 words from in and 48 words from win, then
 *    advances in by 2 words and rewinds win to win[0]. in therefore has to
 *    provide 48 + 2*(nIn-1) words, and win 48 words.
 *  - nIn == 0 returns without touching any buffer, matching the reference
 *    loop.
 *
 * In:    r0 = out, r1 = in, r2 = win, r3 = nIn
 * Out:   nothing
 * Clobb: r0-r3, r12, flags (r4-r10 and lr are saved and restored)
 * Note: r12 is a scratch register and can be used without restoring it.
 ****************************************************************************/
    .equ    DEWIN_TAPS,      48                     /* window length in words */
    .equ    DEWIN_WIN_BYTES, DEWIN_TAPS * 4         /* 192: whole window      */
    .equ    DEWIN_IN_REWIND, (DEWIN_TAPS - 2) * 4   /* 184: window - 2 words  */

    .align  2
    .global atrac3_iqmf_dewindowing
    .type   atrac3_iqmf_dewindowing, %function

atrac3_iqmf_dewindowing:
    /* r0 = dest */
    /* r1 = input samples */
    /* r2 = window coefficients */
    /* r3 = counter */
    stmfd   sp!, {r4-r10, lr}       /* save non-scratch registers */
    cmp     r3, #0                  /* the loop is bottom-tested, so an    */
    beq     .Liqmf_dewindow_exit    /* empty request must be skipped here  */

.Liqmf_dewindow_outer_loop:         /* one pass per output pair */
    ldmia   r2!, {r5, r6}           /* load win[0..1] */
    ldmia   r1!, {r7, r8}           /* load in[0..1] */
    smull   lr , r10, r5, r7        /* s1 (r10:lr) = win[0] * in[0] */
    smull   r12, r9 , r6, r8        /* s2 (r9:r12) = win[1] * in[1] */

    mov     r4, #(DEWIN_TAPS - 2)   /* taps left for the inner loop */
.Liqmf_dewindow_inner_loop:         /* inner loop i=2...46, two taps per pass */
    ldmia   r2!, {r5, r6}           /* load win[i...i+1] */
    ldmia   r1!, {r7, r8}           /* load in[i...i+1] */
    smlal   lr , r10, r5, r7        /* s1 += win[i  ] * in[i  ] */
    smlal   r12, r9 , r6, r8        /* s2 += win[i+1] * in[i+1] */

    subs    r4, r4, #2              /* two taps done */
    bgt     .Liqmf_dewindow_inner_loop

    mov     lr , lr , lsr #31
    orr     r10, lr , r10, lsl #1   /* s1 = low>>31 || hi<<1 */
    mov     r12, r12, lsr #31
    orr     r9 , r12, r9 , lsl #1   /* s2 = low>>31 || hi<<1 */

    /* stmia stores the lower-numbered register first: r9 (s2), then r10 (s1) */
    stmia   r0!, {r9, r10}          /* store result out[0]=s2, out[1]=s1 */

    /* r1 advanced by 48 words above; rewinding 46 words leaves in += 2 */
    sub     r1, r1, #DEWIN_IN_REWIND    /* roll back 46 entries = 184 bytes */
    sub     r2, r2, #DEWIN_WIN_BYTES    /* roll back 48 entries = 192 bytes = win[0] */

    subs    r3, r3, #1              /* outer loop -= 1 */
    bgt     .Liqmf_dewindow_outer_loop

.Liqmf_dewindow_exit:
    ldmfd   sp!, {r4-r10, pc}       /* restore registers and return */

.Latrac3_iqmf_dewindowing_end:
    .size   atrac3_iqmf_dewindowing,.Latrac3_iqmf_dewindowing_end-atrac3_iqmf_dewindowing