FS#11335 by me: make ARM assembly functions thumb-friendly
[kugel-rb.git] / apps / codecs / libmad / imdct_l_arm.S
blobb511ff169d54758f355d76cd4353ff785c076e40
1 /*****************************************************************************
2 * Copyright (C) 2000-2001 Andre McCurdy  <armccurdy@yahoo.co.uk>
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18 *****************************************************************************
20 * Notes:
23 *****************************************************************************
25 * $Id$
27 * 2001/03/24:  Andre McCurdy <armccurdy@yahoo.co.uk>
28 *   - Corrected PIC unsafe loading of address of 'imdct36_long_karray'
30 * 2000/09/20:  Robert Leslie <rob@mars.org>
31 *   - Added a global symbol with leading underscore per suggestion of
32 *     Simon Burge to support linking with the a.out format.
34 * 2000/09/15:  Robert Leslie <rob@mars.org>
35 *   - Fixed a small bug where flags were changed before a conditional branch.
37 * 2000/09/15:  Andre McCurdy <armccurdy@yahoo.co.uk>
38 *   - Applied Nicolas Pitre's rounding optimisation in all remaining places.
40 * 2000/09/09:  Nicolas Pitre <nico@cam.org>
41 *   - Optimized rounding + scaling operations.
43 * 2000/08/09:  Andre McCurdy <armccurdy@yahoo.co.uk>
44 *   - Original created.
46 ****************************************************************************/
48 #include "config.h"
51    On entry:
53       r0 = pointer to 18 element input  array
54       r1 = pointer to 36 element output array
55       r2 = windowing block type
58    Stack frame created during execution of the function:
60    Initial   Holds:
61    Stack
62    pointer
63    minus:
65        0
66        4     lr
67        8     r11
68       12     r10
69       16     r9
70       20     r8
71       24     r7
72       28     r6
73       32     r5
74       36     r4
76       40     r2 : windowing block type
78       44     ct00 high
79       48     ct00 low
80       52     ct01 high
81       56     ct01 low
82       60     ct04 high
83       64     ct04 low
84       68     ct06 high
85       72     ct06 low
86       76     ct05 high
87       80     ct05 low
88       84     ct03 high
89       88     ct03 low
90       92    -ct05 high
91       96    -ct05 low
92      100    -ct07 high
93      104    -ct07 low
94      108     ct07 high
95      112     ct07 low
96      116     ct02 high
97      120     ct02 low
100 #define BLOCK_MODE_NORMAL   0
101 #define BLOCK_MODE_START    1
102 #define BLOCK_MODE_STOP     3
105 #define X0   0x00
106 #define X1   0x04
107 #define X2   0x08
108 #define X3   0x0C
109 #define X4   0x10
110 #define X5   0x14
111 #define X6   0x18
112 #define X7   0x1c
113 #define X8   0x20
114 #define X9   0x24
115 #define X10  0x28
116 #define X11  0x2c
117 #define X12  0x30
118 #define X13  0x34
119 #define X14  0x38
120 #define X15  0x3c
121 #define X16  0x40
122 #define X17  0x44
124 #define x0   0x00
125 #define x1   0x04
126 #define x2   0x08
127 #define x3   0x0C
128 #define x4   0x10
129 #define x5   0x14
130 #define x6   0x18
131 #define x7   0x1c
132 #define x8   0x20
133 #define x9   0x24
134 #define x10  0x28
135 #define x11  0x2c
136 #define x12  0x30
137 #define x13  0x34
138 #define x14  0x38
139 #define x15  0x3c
140 #define x16  0x40
141 #define x17  0x44
142 #define x18  0x48
143 #define x19  0x4c
144 #define x20  0x50
145 #define x21  0x54
146 #define x22  0x58
147 #define x23  0x5c
148 #define x24  0x60
149 #define x25  0x64
150 #define x26  0x68
151 #define x27  0x6c
152 #define x28  0x70
153 #define x29  0x74
154 #define x30  0x78
155 #define x31  0x7c
156 #define x32  0x80
157 #define x33  0x84
158 #define x34  0x88
159 #define x35  0x8c
161 #define K00  0x0ffc19fd
162 #define K01  0x00b2aa3e
163 #define K02  0x0fdcf549
164 #define K03  0x0216a2a2
165 #define K04  0x0f9ee890
166 #define K05  0x03768962
167 #define K06  0x0f426cb5
168 #define K07  0x04cfb0e2
169 #define K08  0x0ec835e8
170 #define K09  0x061f78aa
171 #define K10  0x0e313245
172 #define K11  0x07635284
173 #define K12  0x0d7e8807
174 #define K13  0x0898c779
175 #define K14  0x0cb19346
176 #define K15  0x09bd7ca0
177 #define K16  0x0bcbe352
178 #define K17  0x0acf37ad
180 #define minus_K02 0xf0230ab7
182 #define WL0  0x00b2aa3e
183 #define WL1  0x0216a2a2
184 #define WL2  0x03768962
185 #define WL3  0x04cfb0e2
186 #define WL4  0x061f78aa
187 #define WL5  0x07635284
188 #define WL6  0x0898c779
189 #define WL7  0x09bd7ca0
190 #define WL8  0x0acf37ad
191 #define WL9  0x0bcbe352
192 #define WL10 0x0cb19346
193 #define WL11 0x0d7e8807
194 #define WL12 0x0e313245
195 #define WL13 0x0ec835e8
196 #define WL14 0x0f426cb5
197 #define WL15 0x0f9ee890
198 #define WL16 0x0fdcf549
199 #define WL17 0x0ffc19fd
202 @*****************************************************************************
205     .text
206     .align
208     .global III_imdct_l
209     .global _III_imdct_l
@ III_imdct_l / _III_imdct_l (two names for the same address): function entry.
@ Per the "On entry" notes earlier in this file: r0 = 18 element input array,
@ r1 = 36 element output array, r2 = windowing block type.
@ This section pushes r2, r4-r11 and lr, builds the 64-bit ctxx partial sums
@ on the stack (20 words in total) and stores results x22, x4, x7, x1, x25
@ and x19 directly into the output array.
@ Every 64-bit accumulator lo..hi is reduced to a 32-bit result with the pair
@     movs lo, lo, lsr #28  /  adc lo, lo, hi, lsl #4
@ which yields bits [59..28] of the sum plus the last bit shifted out (bit 27,
@ left in the carry flag by movs) as a rounding increment.
211 III_imdct_l:
212 _III_imdct_l:
214     stmdb   sp!, { r2, r4 - r11, lr }   @ all callee saved regs, plus arg3
216     ldr     r4, =K08                    @ r4 =  K08
217     ldr     r5, =K09                    @ r5 =  K09
218     ldr     r8, [r0, #X4]               @ r8 =  X4
219     ldr     r9, [r0, #X13]              @ r9 =  X13
220     rsb     r6, r4, #0                  @ r6 = -K08
221     rsb     r7, r5, #0                  @ r7 = -K09
223     smull   r2, r3, r4, r8              @ r2..r3  = (X4 * K08)
224     smlal   r2, r3, r5, r9              @ r2..r3  = (X4 * K08) + (X13 *  K09) = ct01
226     smull   r10, lr, r8, r5             @ r10..lr = (X4 * K09)
227     smlal   r10, lr, r9, r6             @ r10..lr = (X4 * K09) + (X13 * -K08) = ct00
229     ldr     r8, [r0, #X7]               @ r8 = X7
230     ldr     r9, [r0, #X16]              @ r9 = X16
    @ stmdb stores the lowest numbered register at the lowest address, so the
    @ low word of each pair ends up below its high word
232     stmdb   sp!, { r2, r3, r10, lr }    @ stack ct00_h, ct00_l, ct01_h, ct01_l
234     add     r8, r8, r9                  @ r8 = (X7 + X16)
235     ldr     r9, [r0, #X1]               @ r9 = X1
237     smlal   r2, r3, r6, r8              @ r2..r3  = ct01 + ((X7 + X16) * -K08)
238     smlal   r2, r3, r7, r9              @ r2..r3 += (X1  * -K09)
240     ldr     r7, [r0, #X10]              @ r7 = X10
    @ 64-bit negate: rsbs negates the low word and sets carry, rsc completes
    @ the high word
242     rsbs    r10, r10, #0
243     rsc     lr, lr, #0                  @ r10..lr  = -ct00
245     smlal   r2, r3, r5, r7              @ r2..r3  += (X10 *  K09) = ct06
247     smlal   r10, lr, r9, r6             @ r10..lr  = -ct00 + ( X1        * -K08)
248     smlal   r10, lr, r8, r5             @ r10..lr +=         ((X7 + X16) *  K09)
249     smlal   r10, lr, r7, r4             @ r10..lr +=         ( X10       *  K08) = ct04
251     stmdb   sp!, { r2, r3, r10, lr }    @ stack ct04_h, ct04_l, ct06_h, ct06_l
253     @----
255     ldr     r7, [r0, #X0]
256     ldr     r8, [r0, #X11]
257     ldr     r9, [r0, #X12]
258     sub     r7, r7, r8
259     sub     r7, r7, r9                  @ r7 = (X0 - X11 -X12) = ct14
261     ldr     r9,  [r0, #X3]
262     ldr     r8,  [r0, #X8]
263     ldr     r11, [r0, #X15]
264     sub     r8, r8, r9
265     add     r8, r8, r11                 @ r8 = (X8 - X3 + X15) = ct16
267     add     r11, r7, r8                 @ r11 = ct14 + ct16 = ct18
269     smlal   r2, r3, r6, r11             @ r2..r3 = ct06 + ((X0 - X11 - X3 + X15 + X8 - X12) * -K08)
271     ldr     r6,  [r0, #X2]
272     ldr     r9,  [r0, #X9]
273     ldr     r12, [r0, #X14]
274     sub     r6, r6, r9
275     sub     r6, r6, r12                 @ r6 = (X2 - X9 - X14) = ct15
277     ldr     r9,  [r0, #X5]
278     ldr     r12, [r0, #X6]
279     sub     r9, r9, r12
280     ldr     r12, [r0, #X17]
281     sub     r9, r9, r12                 @ r9 = (X5 - X6 - X17) = ct17
283     add     r12, r9, r6                 @ r12 = ct15 + ct17 = ct19
285     smlal   r2, r3, r5, r12             @ r2..r3 += ((X2 - X9 + X5 - X6 - X17 - X14) * K09)
287     smlal   r10, lr, r11, r5            @ r10..lr = ct04 + (ct18 * K09)
288     smlal   r10, lr, r12, r4            @ r10..lr = ct04 + (ct18 * K09) + (ct19 * K08)
290     movs    r2, r2, lsr #28
291     adc     r2, r2, r3, lsl #4          @ r2 = bits[59..28] of r2..r3
292     str     r2, [r1, #x22]              @ store result x22
294     movs    r10, r10, lsr #28
295     adc     r10, r10, lr, lsl #4        @ r10 = bits[59..28] of r10..lr
296     str     r10, [r1, #x4]              @ store result x4
298     @----
300     ldmia   sp, { r2, r3, r4, r5 }      @ r2..r3 = ct06, r4..r5 = ct04 (dont update sp)
302     @ r2..r3 = ct06
303     @ r4..r5 = ct04
304     @ r6     = ct15
305     @ r7     = ct14
306     @ r8     = ct16
307     @ r9     = ct17
308     @ r10    = .
309     @ r11    = .
310     @ r12    = .
311     @ lr     = .
313     ldr     r10, =K03                   @ r10 = K03
314     ldr     lr,  =K15                   @ lr  = K15
316     smlal   r2, r3, r10, r7             @ r2..r3 = ct06 + (ct14 * K03)
317     smlal   r4, r5,  lr, r7             @ r4..r5 = ct04 + (ct14 * K15)
319     ldr     r12, =K14                   @ r12 =  K14
320     rsb     r10, r10, #0                @ r10 = -K03
322     smlal   r2, r3,  lr, r6             @ r2..r3 += (ct15 *  K15)
323     smlal   r4, r5, r10, r6             @ r4..r5 += (ct15 * -K03)
324     smlal   r2, r3, r12, r8             @ r2..r3 += (ct16 *  K14)
326     ldr     r11, =minus_K02             @ r11 = -K02
327     rsb     r12, r12, #0                @ r12 = -K14
329     smlal   r4, r5, r12, r9             @ r4..r5 += (ct17 * -K14)
330     smlal   r2, r3, r11, r9             @ r2..r3 += (ct17 * -K02)
331     smlal   r4, r5, r11, r8             @ r4..r5 += (ct16 * -K02)
333     movs    r2, r2, lsr #28
334     adc     r2, r2, r3, lsl #4          @ r2 = bits[59..28] of r2..r3
335     str     r2, [r1, #x7]               @ store result x7
337     movs    r4, r4, lsr #28
338     adc     r4, r4, r5, lsl #4          @ r4 = bits[59..28] of r4..r5
339     str     r4, [r1, #x1]               @ store result x1
341     @----
343     ldmia   sp, { r2, r3, r4, r5 }      @ r2..r3 = ct06, r4..r5 = ct04 (dont update sp)
345     @ r2..r3 = ct06
346     @ r4..r5 = ct04
347     @ r6     = ct15
348     @ r7     = ct14
349     @ r8     = ct16
350     @ r9     = ct17
351     @ r10    = -K03
352     @ r11    = -K02
353     @ r12    = -K14
354     @ lr     =  K15
356     rsbs    r2, r2, #0
357     rsc     r3, r3, #0                  @ r2..r3 = -ct06
359     smlal   r2, r3, r12, r7             @ r2..r3  = -ct06 + (ct14 * -K14)
360     smlal   r2, r3, r10, r8             @ r2..r3 += (ct16 * -K03)
362     smlal   r4, r5, r12, r6             @ r4..r5  =  ct04 + (ct15 * -K14)
363     smlal   r4, r5, r10, r9             @ r4..r5 += (ct17 * -K03)
364     smlal   r4, r5,  lr, r8             @ r4..r5 += (ct16 *  K15)
365     smlal   r4, r5, r11, r7             @ r4..r5 += (ct14 * -K02)
367     rsb     lr, lr, #0                  @ lr  = -K15
368     rsb     r11, r11, #0                @ r11 =  K02
370     smlal   r2, r3,  lr, r9             @ r2..r3 += (ct17 * -K15)
371     smlal   r2, r3, r11, r6             @ r2..r3 += (ct15 *  K02)
373     movs    r4, r4, lsr #28
374     adc     r4, r4, r5, lsl #4          @ r4 = bits[59..28] of r4..r5
375     str     r4, [r1, #x25]              @ store result x25
377     movs    r2, r2, lsr #28
378     adc     r2, r2, r3, lsl #4          @ r2 = bits[59..28] of r2..r3
379     str     r2, [r1, #x19]              @ store result x19
381     @----
    @ sp still points at ct06_l here (ct06 and ct04 occupy sp+0 .. sp+15),
    @ so ct01 sits 16 bytes above sp
383     ldr     r2, [sp, #16]               @ r2 = ct01_l
384     ldr     r3, [sp, #20]               @ r3 = ct01_h
386     ldr     r6, [r0, #X1]
387     ldr     r8, [r0, #X7]
388     ldr     r9, [r0, #X10]
389     ldr     r7, [r0, #X16]
391     rsbs    r2, r2, #0
392     rsc     r3, r3, #0                  @ r2..r3 = -ct01
394     mov     r4, r2
395     mov     r5, r3                      @ r4..r5 = -ct01
397     @ r2..r3 = -ct01
398     @ r4..r5 = -ct01
399     @ r6     =  X1
400     @ r7     =  X16
401     @ r8     =  X7
402     @ r9     =  X10
403     @ r10    = -K03
404     @ r11    =  K02
405     @ r12    = -K14
406     @ lr     = -K15
408     smlal   r4, r5, r12, r7             @ r4..r5 = -ct01 + (X16 * -K14)
409     smlal   r2, r3,  lr, r9             @ r2..r3 = -ct01 + (X10 * -K15)
411     smlal   r4, r5, r10, r8             @ r4..r5 += (X7  * -K03)
412     smlal   r2, r3, r10, r7             @ r2..r3 += (X16 * -K03)
414     smlal   r4, r5, r11, r9             @ r4..r5 += (X10 *  K02)
415     smlal   r2, r3, r12, r8             @ r2..r3 += (X7  * -K14)
417     rsb     lr, lr, #0                  @ lr  =  K15
418     rsb     r11, r11, #0                @ r11 = -K02
420     smlal   r4, r5,  lr, r6             @ r4..r5 += (X1  *  K15) = ct05
421     smlal   r2, r3, r11, r6             @ r2..r3 += (X1  * -K02) = ct03
423     stmdb   sp!, { r2, r3, r4, r5 }     @ stack ct05_h, ct05_l, ct03_h, ct03_l
425     rsbs    r4, r4, #0
426     rsc     r5, r5, #0                  @ r4..r5 = -ct05
428     stmdb   sp!, { r4, r5 }             @ stack -ct05_h, -ct05_l
    @ 24 bytes (ct03, ct05, -ct05) have been pushed since ct01 was read at
    @ sp+16, so ct00 (previously at sp+24) is now at sp+48
430     ldr     r2, [sp, #48]               @ r2 = ct00_l
431     ldr     r3, [sp, #52]               @ r3 = ct00_h
433     rsb     r10, r10, #0                @ r10 = K03
435     rsbs    r4, r2, #0
436     rsc     r5, r3, #0                  @ r4..r5 = -ct00
438     @ r2..r3 =  ct00
439     @ r4..r5 = -ct00
440     @ r6     =  X1
441     @ r7     =  X16
442     @ r8     =  X7
443     @ r9     =  X10
444     @ r10    =  K03
445     @ r11    = -K02
446     @ r12    = -K14
447     @ lr     =  K15
449     smlal   r4, r5, r10, r6             @ r4..r5 = -ct00 + (X1  * K03)
450     smlal   r2, r3, r10, r9             @ r2..r3 =  ct00 + (X10 * K03)
452     smlal   r4, r5, r12, r9             @ r4..r5 += (X10 * -K14)
453     smlal   r2, r3, r12, r6             @ r2..r3 += (X1  * -K14)
455     smlal   r4, r5, r11, r7             @ r4..r5 += (X16 * -K02)
456     smlal   r4, r5,  lr, r8             @ r4..r5 += (X7  *  K15) = ct07
458     rsb     lr, lr, #0                  @ lr  = -K15
459     rsb     r11, r11, #0                @ r11 =  K02
461     smlal   r2, r3, r11, r8             @ r2..r3 += (X7  *  K02)
462     smlal   r2, r3,  lr, r7             @ r2..r3 += (X16 * -K15) = ct02
464     rsbs    r6, r4, #0
465     rsc     r7, r5, #0                  @ r6..r7 = -ct07
467     stmdb   sp!, { r2 - r7 }            @ stack -ct07_h, -ct07_l, ct07_h, ct07_l, ct02_h, ct02_l
470     @----
    @ In ARM state pc reads as the address of the current instruction + 8,
    @ which is why 8 is subtracted from the label-relative offset below
472     add     r2, pc, #(imdct36_long_karray-.-8)  @ r2 = base address of Knn array (PIC safe ?)
@ loop: one pass per 13-word row of imdct36_long_karray (r2 walks the table,
@ advanced by the two ldmia r2! instructions: 7 + 6 words).
@ Each pass accumulates twelve Kxx * Xn products (X0, X2, X3, X5, X6, X8, X9,
@ X11, X12, X14, X15, X17) in r3..r4, adds the stacked 64-bit ctxx term picked
@ by the row's control word, and stores the rounded bits [59..28] of the sum
@ into the output array.
@ Control word (13th word of the row, loaded into r10):
@   bits [31..16] = byte offset from sp of the low word of the ctxx term
@   bits [15..8]  = byte offset of the result within the output array
@   bits [7..0]   = non-zero only in the row that terminates the loop
475 loop:
476     ldr     r12, [r0, #X0]
478     ldmia   r2!, { r5 - r11 }           @ first 7 words from Karray element
480     smull   r3, r4, r5, r12             @ sum =  (Kxx * X0)
481     ldr     r12, [r0, #X2]
482     ldr     r5,  [r0, #X3]
483     smlal   r3, r4, r6, r12             @ sum += (Kxx * X2)
484     ldr     r12, [r0, #X5]
485     ldr     r6,  [r0, #X6]
486     smlal   r3, r4, r7, r5              @ sum += (Kxx * X3)
487     smlal   r3, r4, r8, r12             @ sum += (Kxx * X5)
488     ldr     r12, [r0, #X8]
489     ldr     r5,  [r0, #X9]
490     smlal   r3, r4, r9, r6              @ sum += (Kxx * X6)
491     smlal   r3, r4, r10, r12            @ sum += (Kxx * X8)
492     smlal   r3, r4, r11, r5             @ sum += (Kxx * X9)
    @ r5..r9 receive the remaining five coefficients, r10 the control word
494     ldmia   r2!, { r5 - r10 }           @ final 6 words from Karray element
496     ldr     r11, [r0, #X11]
497     ldr     r12, [r0, #X12]
498     smlal   r3, r4, r5, r11             @ sum += (Kxx * X11)
499     ldr     r11, [r0, #X14]
500     ldr     r5,  [r0, #X15]
501     smlal   r3, r4, r6, r12             @ sum += (Kxx * X12)
502     smlal   r3, r4, r7, r11             @ sum += (Kxx * X14)
503     ldr     r11, [r0, #X17]
504     smlal   r3, r4, r8, r5              @ sum += (Kxx * X15)
505     smlal   r3, r4, r9, r11             @ sum += (Kxx * X17)
507     add     r5, sp, r10, lsr #16        @ create index back into stack for required ctxx
509     ldmia   r5, { r6, r7 }              @ r6..r7 = ctxx
511     mov     r8, r10, lsl #16            @ push ctxx index off the top end
513     adds    r3, r3, r6                  @ add low words
514     adc     r4, r4, r7                  @ add high words, with carry
    @ movs leaves bit 27 (the last bit shifted out) in carry; adc adds it
    @ back in, rounding the 32-bit result
515     movs    r3, r3, lsr #28
516     adc     r3, r3, r4, lsl #4          @ r3 = bits[59..28] of r3..r4
    @ r8 lsr #24 = control word bits [15..8] = output byte offset
518     str     r3, [r1, r8, lsr #24]       @ push completion flag off the bottom end
    @ after the shift only control word bits [7..0] remain in r8; Z is set
    @ when they are all zero
520     movs    r8, r8, lsl #8              @ push result location index off the top end
521     beq     loop                        @ loop back if completion flag not set
522     b       imdct_l_windowing           @ branch to windowing stage if looping finished
@ imdct36_long_karray: coefficient table consumed row by row by 'loop'.
@ Each row is 13 words: twelve signed Knn coefficients (multiplied, in order,
@ with X0, X2, X3, X5, X6, X8, X9, X11, X12, X14, X15, X17) followed by a
@ control word decoded by 'loop' as:
@   bits [31..16] = byte offset from sp of the stacked ctxx term to add
@   bits [15..8]  = byte offset of the result in the output array
@   bits [7..0]   = non-zero in the last row only (ends the loop)
@ The table sits in .text so that it can be addressed pc-relative.
524 imdct36_long_karray:
526     .word   K17, -K13,  K10, -K06, -K05,  K01, -K00,  K04, -K07,  K11,  K12, -K16, 0x00000000
527     .word   K13,  K07,  K16,  K01,  K10, -K05,  K04, -K11,  K00, -K17,  K06, -K12, 0x00200800
528     .word   K11,  K17,  K05,  K12, -K01,  K06, -K07,  K00, -K13,  K04, -K16,  K10, 0x00200c00
529     .word   K07,  K00, -K12,  K05, -K16, -K10,  K11, -K17,  K04,  K13,  K01,  K06, 0x00001400
530     .word   K05,  K10, -K00, -K17,  K07, -K13,  K12,  K06, -K16,  K01, -K11, -K04, 0x00181800
531     .word   K01,  K05, -K07, -K11,  K13,  K17, -K16, -K12,  K10,  K06, -K04, -K00, 0x00102000
532     .word  -K16,  K12, -K11,  K07,  K04, -K00, -K01,  K05, -K06,  K10,  K13, -K17, 0x00284800
533     .word  -K12,  K06,  K17, -K00, -K11,  K04,  K05, -K10,  K01,  K16, -K07, -K13, 0x00085000
534     .word  -K10,  K16,  K04, -K13, -K00,  K07,  K06, -K01, -K12, -K05,  K17,  K11, 0x00105400
535     .word  -K06, -K01,  K13,  K04,  K17, -K11, -K10, -K16, -K05,  K12,  K00,  K07, 0x00185c00
536     .word  -K04, -K11, -K01,  K16,  K06,  K12,  K13, -K07, -K17, -K00, -K10, -K05, 0x00006000
537     .word  -K00, -K04, -K06, -K10, -K12, -K16, -K17, -K13, -K11, -K07, -K05, -K01, 0x00206801
540     @----
541     @-------------------------------------------------------------------------
542     @----
@ imdct_l_windowing: entry to the windowing stage, reached from 'loop'.
@ Reloads the block type saved at function entry, loads x[0]..x[8] and
@ branches to stop_block_x0_to_x17 for a stop block; otherwise execution
@ falls through into normal_block_x0_to_x17.
544 imdct_l_windowing:
    @ 20 words (80 bytes) of ctxx data lie between sp and the r2 saved by the
    @ initial stmdb, hence the #80 offset
546     ldr     r11, [sp, #80]              @ fetch function parameter 3 from out of the stack
547     ldmia   r1!, { r0, r2 - r9 }        @ load 9 words from x0, update pointer
549     @ r0     = x0
550     @ r1     = &x[9]
551     @ r2     = x1
552     @ r3     = x2
553     @ r4     = x3
554     @ r5     = x4
555     @ r6     = x5
556     @ r7     = x6
557     @ r8     = x7
558     @ r9     = x8
559     @ r10    = .
560     @ r11    = window mode: (0 == normal), (1 == start block), (3 == stop block)
561     @ r12    = .
562     @ lr     = .
    @ the rsb between cmp and beq has no 's' suffix, so the flags set by cmp
    @ are still intact when beq tests them
564     cmp     r11, #BLOCK_MODE_STOP       @ setup flags
565     rsb     r10, r0, #0                 @ r10 = -x0 (DONT change flags !!)
566     beq     stop_block_x0_to_x17
569     @ start and normal blocks are treated the same for x[0]..x[17]
@ normal_block_x0_to_x17: windowing of x[0]..x[17] for normal and start blocks.
@ On entry r0, r2-r9 = x0..x8, r10 = -x0, r1 = &x[9], r11 = window mode.
@ First x[9]..x[17] are produced as window_l[9..17] times the negated, mirrored
@ values -x8..-x0 and stored; then x[0]..x[8] are reloaded, multiplied by
@ window_l[0..8] and stored back in place. Each product is reduced to bits
@ [59..28] with rounding (movs leaves bit 27 in carry, adc adds it in).
@ Ends by branching to start_block_x18_to_x35 for a start block, otherwise
@ falls through to the normal-block handling of x[18]..x[35].
571 normal_block_x0_to_x17:
573     ldr     r12, =WL9                   @ r12 = window_l[9]
575     rsb     r0,  r9, #0                 @ r0  = -x8
576     rsb     r9,  r2, #0                 @ r9  = -x1
577     rsb     r2,  r8, #0                 @ r2  = -x7
578     rsb     r8,  r3, #0                 @ r8  = -x2
579     rsb     r3,  r7, #0                 @ r3  = -x6
580     rsb     r7,  r4, #0                 @ r7  = -x3
581     rsb     r4,  r6, #0                 @ r4  = -x5
582     rsb     r6,  r5, #0                 @ r6  = -x4
584     @ r0     = -x8
585     @ r1     = &x[9]
586     @ r2     = -x7
587     @ r3     = -x6
588     @ r4     = -x5
589     @ r5     = .
590     @ r6     = -x4
591     @ r7     = -x3
592     @ r8     = -x2
593     @ r9     = -x1
594     @ r10    = -x0
595     @ r11    = window mode: (0 == normal), (1 == start block), (3 == stop block)
596     @ r12    = window_l[9]
597     @ lr     = .
599     smull   r5, lr, r12, r0             @ r5..lr = (window_l[9]  * (x[9]  == -x[8]))
600     ldr     r12, =WL10                  @ r12 = window_l[10]
601     movs    r5, r5, lsr #28
602     adc     r0, r5, lr, lsl #4          @ r0 = bits[59..28] of windowed x9
604     smull   r5, lr, r12, r2             @ r5..lr = (window_l[10] * (x[10] == -x[7]))
605     ldr     r12, =WL11                  @ r12 = window_l[11]
606     movs    r5, r5, lsr #28
607     adc     r2, r5, lr, lsl #4          @ r2 = bits[59..28] of windowed x10
609     smull   r5, lr, r12, r3             @ r5..lr = (window_l[11] * (x[11] == -x[6]))
610     ldr     r12, =WL12                  @ r12 = window_l[12]
611     movs    r5, r5, lsr #28
612     adc     r3, r5, lr, lsl #4          @ r3 = bits[59..28] of windowed x11
614     smull   r5, lr, r12, r4             @ r5..lr = (window_l[12] * (x[12] == -x[5]))
615     ldr     r12, =WL13                  @ r12 = window_l[13]
616     movs    r5, r5, lsr #28
617     adc     r4, r5, lr, lsl #4          @ r4 = bits[59..28] of windowed x12
619     smull   r5, lr, r12, r6             @ r5..lr = (window_l[13] * (x[13] == -x[4]))
620     ldr     r12, =WL14                  @ r12 = window_l[14]
621     movs    r5, r5, lsr #28
622     adc     r6, r5, lr, lsl #4          @ r6 = bits[59..28] of windowed x13
624     smull   r5, lr, r12, r7             @ r5..lr = (window_l[14] * (x[14] == -x[3]))
625     ldr     r12, =WL15                  @ r12 = window_l[15]
626     movs    r5, r5, lsr #28
627     adc     r7, r5, lr, lsl #4          @ r7 = bits[59..28] of windowed x14
629     smull   r5, lr, r12, r8             @ r5..lr = (window_l[15] * (x[15] == -x[2]))
630     ldr     r12, =WL16                  @ r12 = window_l[16]
631     movs    r5, r5, lsr #28
632     adc     r8, r5, lr, lsl #4          @ r8 = bits[59..28] of windowed x15
634     smull   r5, lr, r12, r9             @ r5..lr = (window_l[16] * (x[16] == -x[1]))
635     ldr     r12, =WL17                  @ r12 = window_l[17]
636     movs    r5, r5, lsr #28
637     adc     r9, r5, lr, lsl #4          @ r9 = bits[59..28] of windowed x16
639     smull   r5, lr, r12, r10            @ r5..lr = (window_l[17] * (x[17] == -x[0]))
640     ldr     r12, =WL0                   @ r12 = window_l[0]
641     movs    r5, r5, lsr #28
642     adc     r10, r5, lr, lsl #4         @ r10 = bits[59..28] of windowed x17
    @ r5 is skipped in the store list because it only holds scratch data;
    @ ldmdb with writeback then leaves r1 = &x[0]
645     stmia   r1,  { r0, r2 - r4, r6 - r10 } @ store windowed x[9] .. x[17]
646     ldmdb   r1!, { r0, r2 - r9 }           @ load 9 words downto (and including) x0
649     smull   r10, lr, r12, r0            @ r10..lr = (window_l[0] * x[0])
650     ldr     r12, =WL1                   @ r12 = window_l[1]
651     movs    r10, r10, lsr #28
652     adc     r0, r10, lr, lsl #4         @ r0 = bits[59..28] of windowed x0
654     smull   r10, lr, r12, r2            @ r10..lr = (window_l[1] * x[1])
655     ldr     r12, =WL2                   @ r12 = window_l[2]
656     movs    r10, r10, lsr #28
657     adc     r2, r10, lr, lsl #4         @ r2 = bits[59..28] of windowed x1
659     smull   r10, lr, r12, r3            @ r10..lr = (window_l[2] * x[2])
660     ldr     r12, =WL3                   @ r12 = window_l[3]
661     movs    r10, r10, lsr #28
662     adc     r3, r10, lr, lsl #4         @ r3 = bits[59..28] of windowed x2
664     smull   r10, lr, r12, r4            @ r10..lr = (window_l[3] * x[3])
665     ldr     r12, =WL4                   @ r12 = window_l[4]
666     movs    r10, r10, lsr #28
667     adc     r4, r10, lr, lsl #4         @ r4 = bits[59..28] of windowed x3
669     smull   r10, lr, r12, r5            @ r10..lr = (window_l[4] * x[4])
670     ldr     r12, =WL5                   @ r12 = window_l[5]
671     movs    r10, r10, lsr #28
672     adc     r5, r10, lr, lsl #4         @ r5 = bits[59..28] of windowed x4
674     smull   r10, lr, r12, r6            @ r10..lr = (window_l[5] * x[5])
675     ldr     r12, =WL6                   @ r12 = window_l[6]
676     movs    r10, r10, lsr #28
677     adc     r6, r10, lr, lsl #4         @ r6 = bits[59..28] of windowed x5
679     smull   r10, lr, r12, r7            @ r10..lr = (window_l[6] * x[6])
680     ldr     r12, =WL7                   @ r12 = window_l[7]
681     movs    r10, r10, lsr #28
682     adc     r7, r10, lr, lsl #4         @ r7 = bits[59..28] of windowed x6
684     smull   r10, lr, r12, r8            @ r10..lr = (window_l[7] * x[7])
685     ldr     r12, =WL8                   @ r12 = window_l[8]
686     movs    r10, r10, lsr #28
687     adc     r8, r10, lr, lsl #4         @ r8 = bits[59..28] of windowed x7
689     smull   r10, lr, r12, r9            @ r10..lr = (window_l[8] * x[8])
690     movs    r10, r10, lsr #28
691     adc     r9, r10, lr, lsl #4         @ r9 = bits[59..28] of windowed x8
693     stmia   r1, { r0, r2 - r9 }         @ store windowed x[0] .. x[8]
    @ r11 still holds the window mode loaded in imdct_l_windowing
695     cmp     r11, #BLOCK_MODE_START
696     beq     start_block_x18_to_x35
699     @----
@ normal_block_x18_to_x35: windowing of x[18]..x[35] for a normal block.
@ On entry r1 = &x[0]. x[18]..x[26] are loaded; x[27]..x[35] are produced as
@ the mirrored values x26..x18 times window_l[8..0] and stored; then
@ x[18]..x[26] are reloaded, multiplied by window_l[17..9] and stored back in
@ place. Each product is reduced to bits [59..28] with rounding (movs leaves
@ bit 27 in carry, adc adds it in).
@ This is one of the function's exits: it releases the stack frame and returns.
702 normal_block_x18_to_x35:
704     ldr     r11, =WL3                   @ r11 = window_l[3]
705     ldr     r12, =WL4                   @ r12 = window_l[4]
707     add     r1, r1, #(18*4)             @ r1 = &x[18]
709     ldmia   r1!, { r0, r2 - r4, r6 - r10 }  @ load 9 words from x18, update pointer
711     @ r0     = x18
712     @ r1     = &x[27]
713     @ r2     = x19
714     @ r3     = x20
715     @ r4     = x21
716     @ r5     = .
717     @ r6     = x22
718     @ r7     = x23
719     @ r8     = x24
720     @ r9     = x25
721     @ r10    = x26
722     @ r11    = window_l[3]
723     @ r12    = window_l[4]
724     @ lr     = .
726     smull   r5, lr, r12, r6             @ r5..lr = (window_l[4] * (x[22] == x[31]))
727     movs    r5, r5, lsr #28
728     adc     r5, r5, lr, lsl #4          @ r5 = bits[59..28] of windowed x31
730     smull   r6, lr, r11, r4             @ r6..lr = (window_l[3] * (x[21] == x[32]))
731     ldr     r12, =WL5                   @ r12    =  window_l[5]
732     movs    r6, r6, lsr #28
733     adc     r6, r6, lr, lsl #4          @ r6 = bits[59..28] of windowed x32
735     smull   r4, lr, r12, r7             @ r4..lr = (window_l[5] * (x[23] == x[30]))
736     ldr     r11, =WL1                   @ r11    =  window_l[1]
737     ldr     r12, =WL2                   @ r12    =  window_l[2]
738     movs    r4, r4, lsr #28
739     adc     r4, r4, lr, lsl #4          @ r4 = bits[59..28] of windowed x30
741     smull   r7, lr, r12, r3             @ r7..lr = (window_l[2] * (x[20] == x[33]))
742     ldr     r12, =WL6                   @ r12 = window_l[6]
743     movs    r7, r7, lsr #28
744     adc     r7, r7, lr, lsl #4          @ r7 = bits[59..28] of windowed x33
746     smull   r3, lr, r12, r8             @ r3..lr = (window_l[6] * (x[24] == x[29]))
747     movs    r3, r3, lsr #28
748     adc     r3, r3, lr, lsl #4          @ r3 = bits[59..28] of windowed x29
750     smull   r8, lr, r11, r2             @ r8..lr = (window_l[1] * (x[19] == x[34]))
751     ldr     r12, =WL7                   @ r12    =  window_l[7]
752     ldr     r11, =WL8                   @ r11    =  window_l[8]
753     movs    r8, r8, lsr #28
754     adc     r8, r8, lr, lsl #4          @ r8 = bits[59..28] of windowed x34
756     smull   r2, lr, r12, r9             @ r2..lr = (window_l[7] * (x[25] == x[28]))
757     ldr     r12, =WL0                   @ r12 = window_l[0]
758     movs    r2, r2, lsr #28
759     adc     r2, r2, lr, lsl #4          @ r2 = bits[59..28] of windowed x28
761     smull   r9, lr, r12, r0             @ r9..lr = (window_l[0] * (x[18] == x[35]))
762     movs    r9, r9, lsr #28
763     adc     r9, r9, lr, lsl #4          @ r9 = bits[59..28] of windowed x35
765     smull   r0, lr, r11, r10            @ r0..lr = (window_l[8] * (x[26] == x[27]))
766     ldr     r11, =WL16                  @ r11    =  window_l[16]
767     ldr     r12, =WL17                  @ r12    =  window_l[17]
768     movs    r0, r0, lsr #28
769     adc     r0, r0, lr, lsl #4          @ r0 = bits[59..28] of windowed x27
    @ r0, r2-r9 now hold windowed x27..x35 in ascending order; ldmdb with
    @ writeback then reloads x18..x26 and leaves r1 = &x[18]
772     stmia   r1,  { r0, r2 - r9 }        @ store windowed x[27] .. x[35]
773     ldmdb   r1!, { r0, r2 - r9 }        @ load 9 words downto (and including) x18
776     smull   r10, lr, r12, r0            @ r10..lr = (window_l[17] * x[18])
777     movs    r10, r10, lsr #28
778     adc     r0,  r10, lr, lsl #4        @ r0 = bits[59..28] of windowed x18
780     smull   r10, lr, r11, r2            @ r10..lr = (window_l[16] * x[19])
781     ldr     r11, =WL14                  @ r11     =  window_l[14]
782     ldr     r12, =WL15                  @ r12     =  window_l[15]
783     movs    r10, r10, lsr #28
784     adc     r2,  r10, lr, lsl #4        @ r2 = bits[59..28] of windowed x19
786     smull   r10, lr, r12, r3            @ r10..lr = (window_l[15] * x[20])
787     movs    r10, r10, lsr #28
788     adc     r3,  r10, lr, lsl #4        @ r3 = bits[59..28] of windowed x20
790     smull   r10, lr, r11, r4            @ r10..lr = (window_l[14] * x[21])
791     ldr     r11, =WL12                  @ r11     =  window_l[12]
792     ldr     r12, =WL13                  @ r12     =  window_l[13]
793     movs    r10, r10, lsr #28
794     adc     r4,  r10, lr, lsl #4        @ r4 = bits[59..28] of windowed x21
796     smull   r10, lr, r12, r5            @ r10..lr = (window_l[13] * x[22])
797     movs    r10, r10, lsr #28
798     adc     r5,  r10, lr, lsl #4        @ r5 = bits[59..28] of windowed x22
800     smull   r10, lr, r11, r6            @ r10..lr = (window_l[12] * x[23])
801     ldr     r11, =WL10                  @ r11 = window_l[10]
802     ldr     r12, =WL11                  @ r12 = window_l[11]
803     movs    r10, r10, lsr #28
804     adc     r6,  r10, lr, lsl #4        @ r6 = bits[59..28] of windowed x23
806     smull   r10, lr, r12, r7            @ r10..lr = (window_l[11] * x[24])
807     movs    r10, r10, lsr #28
808     adc     r7,  r10, lr, lsl #4        @ r7 = bits[59..28] of windowed x24
810     smull   r10, lr, r11, r8            @ r10..lr = (window_l[10] * x[25])
811     ldr     r12, =WL9                   @ r12 = window_l[9]
812     movs    r10, r10, lsr #28
813     adc     r8,  r10, lr, lsl #4        @ r8 = bits[59..28] of windowed x25
815     smull   r10, lr, r12, r9            @ r10..lr = (window_l[9] * x[26])
817     movs    r10, r10, lsr #28
818     adc     r9,  r10, lr, lsl #4        @ r9 = bits[59..28] of windowed x26
820     stmia   r1, { r0, r2 - r9 }         @ store windowed x[18] .. x[26]
822     @----
823     @ NB there are 2 possible exits from this function - this is only one of them
824     @----
    @ 21 words = 20 words of stacked ctxx data + the saved copy of r2
826     add     sp, sp, #(21*4)             @ return stack frame
    @ NOTE(review): ldmpc is not defined in this file; presumably a macro
    @ provided via config.h that pops the listed registers and returns - confirm
827     ldmpc   regs=r4-r11                 @ restore callee saved regs, and return
829     @----
832 stop_block_x0_to_x17:
        @ Stop-block handling of x[0] .. x[17]:
        @   x[12] .. x[17] are stored as -x5 .. -x0 with no window multiply,
        @   x[6]  .. x[11] are multiplied by WL1, WL4, WL7, WL10, WL13, WL16,
        @   x[0]  .. x[5]  are stored as zero,
        @ after which control branches to normal_block_x18_to_x35.
        @
        @ Register state on entry:
834     @ r0     =  x0
835     @ r1     = &x[9]
836     @ r2     =  x1
837     @ r3     =  x2
838     @ r4     =  x3
839     @ r5     =  x4
840     @ r6     =  x5
841     @ r7     =  x6
842     @ r8     =  x7
843     @ r9     =  x8
844     @ r10    = -x0
845     @ r11    =  window mode: (0 == normal), (1 == start block), (3 == stop block)
846     @ r12    =  .
847     @ lr     =  .
        @ Negate x1 .. x5 into the registers that stmia will write in ascending
        @ register order, so that memory receives -x5, -x4, -x3, -x2, -x1, -x0.
849     rsb     r0, r6, #0                  @ r0 = -x5
850     rsb     r6, r2, #0                  @ r6 = -x1
851     rsb     r2, r5, #0                  @ r2 = -x4
852     rsb     r5, r3, #0                  @ r5 = -x2
853     rsb     r3, r4, #0                  @ r3 = -x3
855     add     r1, r1, #(3*4)                      @ r1 = &x[12]
856     stmia   r1, { r0, r2, r3, r5, r6, r10 }     @ store unchanged x[12] .. x[17] (no write-back, r1 stays &x[12])
858     ldr     r0, =WL1                    @ r0 = window_l[1]  == window_s[0]
860     rsb     r10, r9, #0                 @ r10 = -x8
861     rsb     r12, r8, #0                 @ r12 = -x7
862     rsb     lr,  r7, #0                 @ lr  = -x6
864     @ r0     =  WL1
865     @ r1     = &x[12]
866     @ r2     =  .
867     @ r3     =  .
868     @ r4     =  .
869     @ r5     =  .
870     @ r6     =  .
871     @ r7     =  x6
872     @ r8     =  x7
873     @ r9     =  x8
874     @ r10    = -x8
875     @ r11    =  window mode: (0 == normal), (1 == start block), (3 == stop block)
876     @ r12    = -x7
877     @ lr     = -x6
        @ Each product below is reduced to bits[59..28] with rounding:
        @ "movs ..., lsr #28" leaves bit 27 of the low word in the carry flag and
        @ the following adc adds that carry to (high word << 4) + (low word >> 28).
        @ The ldr placed between smull and movs does not touch the flags.
879     smull   r5, r6, r0, r7              @ r5..r6 = (window_l[1] * x[6])
880     ldr     r2, =WL4                    @ r2     =  window_l[4] == window_s[1]
881     movs    r5, r5, lsr #28
882     adc     r7, r5, r6, lsl #4          @ r7 = bits[59..28] of windowed x6
884     smull   r5, r6, r2, r8              @ r5..r6 = (window_l[4] * x[7])
885     ldr     r3, =WL7                    @ r3     =  window_l[7] == window_s[2]
886     movs    r5, r5, lsr #28
887     adc     r8, r5, r6, lsl #4          @ r8 = bits[59..28] of windowed x7
889     smull   r5, r6, r3, r9              @ r5..r6 = (window_l[7] * x[8])
890     ldr     r4, =WL10                   @ r4     =  window_l[10] == window_s[3]
891     movs    r5, r5, lsr #28
892     adc     r9, r5, r6, lsl #4          @ r9 = bits[59..28] of windowed x8
894     smull   r5, r6, r4, r10             @ r5..r6 = (window_l[10] * (x[9] == -x[8]))
895     ldr     r0, =WL13                   @ r0     =  window_l[13] == window_s[4]
896     movs    r5, r5, lsr #28
897     adc     r10, r5, r6, lsl #4         @ r10 = bits[59..28] of windowed x9
899     smull   r5, r6, r0, r12             @ r5..r6 = (window_l[13] * (x[10] == -x[7]))
900     ldr     r2, =WL16                   @ r2     =  window_l[16] == window_s[5]
901     movs    r5, r5, lsr #28
902     adc     r12, r5, r6, lsl #4         @ r12 = bits[59..28] of windowed x10
904     smull   r5, r6, r2, lr              @ r5..r6 = (window_l[16] * (x[11] == -x[6]))
906     ldr     r0, =0x00                   @ r0 = 0 (WL13 no longer needed), first of the zeros for x[0] .. x[5]
908     movs    r5, r5, lsr #28
909     adc     lr, r5, r6, lsl #4          @ lr = bits[59..28] of windowed x11
911     stmdb   r1!, { r7 - r10, r12, lr }  @ store windowed x[6] .. x[11], r1 = &x[6] after write-back
        @ r5/r6 held the last product and r2..r4 held window constants; all are
        @ dead now, so they are reused as the remaining zero words.
913     ldr     r5, =0x00
914     ldr     r6, =0x00
915     ldr     r2, =0x00
916     ldr     r3, =0x00
917     ldr     r4, =0x00
919     stmdb   r1!, { r0, r2 - r6 }        @ store windowed x[0] .. x[5] (all zero), r1 = &x[0] after write-back
921     b       normal_block_x18_to_x35
924     @----
927 start_block_x18_to_x35:
        @ Start-block handling of the upper part of the output:
        @   x[24] .. x[26] are multiplied by WL16, WL13, WL10,
        @   x[27] .. x[29] are produced from x[26], x[25], x[24] multiplied by
        @                  WL7, WL4, WL1,
        @   x[30] .. x[35] are stored as zero.
        @ Nothing in this block writes x[18] .. x[23].
        @ This is one of the two exits of the function: it releases the stack
        @ frame and returns.
929     ldr     r4, =WL1                    @ r4 = window_l[1]  == window_s[0]
931     add     r1, r1, #(24*4)             @ r1 = &x[24]
933     ldmia   r1, { r0, r2, r3 }          @ load 3 words from x24, dont update pointer
935     @ r0     = x24
936     @ r1     = &x[24]
937     @ r2     = x25
938     @ r3     = x26
939     @ r4     = WL1
940     @ r5     = WL4    (not yet loaded - see the ldr instructions below)
941     @ r6     = WL7    (not yet loaded - see the ldr instructions below)
942     @ r7     = WL10   (not yet loaded - see the ldr instructions below)
943     @ r8     = WL13   (not yet loaded - see the ldr instructions below)
944     @ r9     = WL16   (not yet loaded - see the ldr instructions below)
945     @ r10    = .
946     @ r11    = .
947     @ r12    = .
948     @ lr     = .
950     ldr     r5, =WL4                    @ r5 = window_l[4] == window_s[1]
        @ Each product below is reduced to bits[59..28] with rounding:
        @ "movs ..., lsr #28" leaves bit 27 of the low word in the carry flag and
        @ the following adc adds that carry to (high word << 4) + (low word >> 28).
        @ The ldr placed between smull and movs does not touch the flags.
952     smull   r10, r11, r4, r0            @ r10..r11 = (window_l[1] * (x[24] == x[29]))
953     ldr     r6, =WL7                    @ r6       =  window_l[7]  == window_s[2]
954     movs    r10, r10, lsr #28
955     adc     lr, r10, r11, lsl #4        @ lr = bits[59..28] of windowed x29
957     smull   r10, r11, r5, r2            @ r10..r11 = (window_l[4] * (x[25] == x[28]))
958     ldr     r7, =WL10                   @ r7       =  window_l[10] == window_s[3]
959     movs    r10, r10, lsr #28
960     adc     r12, r10, r11, lsl #4       @ r12 = bits[59..28] of windowed x28
962     smull   r10, r11, r6, r3            @ r10..r11 = (window_l[7] * (x[26] == x[27]))
963     ldr     r8, =WL13                   @ r8       =  window_l[13] == window_s[4]
964     movs    r10, r10, lsr #28
965     adc     r4, r10, r11, lsl #4        @ r4 = bits[59..28] of windowed x27 (WL1 in r4 is no longer needed)
967     smull   r10, r11, r7, r3            @ r10..r11 = (window_l[10] * x[26])
968     ldr     r9, =WL16                   @ r9       =  window_l[16] == window_s[5]
969     movs    r10, r10, lsr #28
970     adc     r3, r10, r11, lsl #4        @ r3 = bits[59..28] of windowed x26
972     smull   r10, r11, r8, r2            @ r10..r11 = (window_l[13] * x[25])
973     ldr     r5, =0x00                   @ r5 = 0 (WL4 already consumed), zero word for x[30]
974     movs    r10, r10, lsr #28
975     adc     r2, r10, r11, lsl #4        @ r2 = bits[59..28] of windowed x25
977     smull   r10, r11, r9, r0            @ r10..r11 = (window_l[16] * x[24])
978     ldr     r6, =0x00                   @ r6 = 0 (WL7 already consumed), zero word for x[31]
979     movs    r10, r10, lsr #28
980     adc     r0, r10, r11, lsl #4        @ r0 = bits[59..28] of windowed x24
982     stmia   r1!, { r0, r2, r3, r4, r12, lr }    @ store windowed x[24] .. x[29], r1 = &x[30] after write-back
        @ r7..r9 (window constants) and r10 (product low word) are dead now and
        @ are reused as the remaining zero words.
984     ldr     r7, =0x00
985     ldr     r8, =0x00
986     ldr     r9, =0x00
987     ldr     r10, =0x00
989     stmia   r1!, { r5 - r10 }           @ store windowed x[30] .. x[35] (all zero)
991     @----
992     @ NB there are 2 possible exits from this function - this is only one of them
993     @----
995     add     sp, sp, #(21*4)             @ return stack frame
996     ldmpc   regs=r4-r11                 @ restore callee saved regs, and return
998     @----
999     @END
1000     @----