Improved bitrev with approach suggested by Jens Arnold, gives 0.5%-1% speedup for...
[kugel-rb.git] / apps / codecs / libmad / imdct_l_arm.S
blob b86ba1120da3b77ee32d3e558ca875c6101b2784
1 /*****************************************************************************
2 * Copyright (C) 2000-2001 Andre McCurdy  <armccurdy@yahoo.co.uk>
4 * This program is free software. you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation@ either version 2 of the License, or
7 * (at your option) any later version.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY, without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program@ if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18 *****************************************************************************
20 * Notes:
23 *****************************************************************************
25 * $Id$
27 * 2001/03/24:  Andre McCurdy <armccurdy@yahoo.co.uk>
28 *   - Corrected PIC unsafe loading of address of 'imdct36_long_karray'
30 * 2000/09/20:  Robert Leslie <rob@mars.org>
31 *   - Added a global symbol with leading underscore per suggestion of
32 *     Simon Burge to support linking with the a.out format.
34 * 2000/09/15:  Robert Leslie <rob@mars.org>
35 *   - Fixed a small bug where flags were changed before a conditional branch.
37 * 2000/09/15:  Andre McCurdy <armccurdy@yahoo.co.uk>
38 *   - Applied Nicolas Pitre's rounding optimisation in all remaining places.
40 * 2000/09/09:  Nicolas Pitre <nico@cam.org>
41 *   - Optimized rounding + scaling operations.
43 * 2000/08/09:  Andre McCurdy <armccurdy@yahoo.co.uk>
44 *   - Original created.
46 ****************************************************************************/
50    On entry:
52       r0 = pointer to 18 element input  array
53       r1 = pointer to 36 element output array
54       r2 = windowing block type
57    Stack frame created during execution of the function:
59    Initial   Holds:
60    Stack
61    pointer
62    minus:
64        0
65        4     lr
66        8     r11
67       12     r10
68       16     r9
69       20     r8
70       24     r7
71       28     r6
72       32     r5
73       36     r4
75       40     r2 : windowing block type
77       44     ct00 high
78       48     ct00 low
79       52     ct01 high
80       56     ct01 low
81       60     ct04 high
82       64     ct04 low
83       68     ct06 high
84       72     ct06 low
85       76     ct05 high
86       80     ct05 low
87       84     ct03 high
88       88     ct03 low
89       92    -ct05 high
90       96    -ct05 low
91      100    -ct07 high
92      104    -ct07 low
93      108     ct07 high
94      112     ct07 low
95      116     ct02 high
96      120     ct02 low
/* Values of the windowing block type argument (r2 on entry).  The windowing
 * stage compares the saved argument against BLOCK_MODE_STOP and
 * BLOCK_MODE_START and sends every other value down the normal block path. */
99 #define BLOCK_MODE_NORMAL   0
100 #define BLOCK_MODE_START    1
101 #define BLOCK_MODE_STOP     3
/* Byte offsets (element index * 4) of the 18 input words, used as immediate
 * offsets from r0, the input array pointer. */
104 #define X0   0x00
105 #define X1   0x04
106 #define X2   0x08
107 #define X3   0x0C
108 #define X4   0x10
109 #define X5   0x14
110 #define X6   0x18
111 #define X7   0x1c
112 #define X8   0x20
113 #define X9   0x24
114 #define X10  0x28
115 #define X11  0x2c
116 #define X12  0x30
117 #define X13  0x34
118 #define X14  0x38
119 #define X15  0x3c
120 #define X16  0x40
121 #define X17  0x44
/* Byte offsets (element index * 4) of the 36 output words, used as immediate
 * offsets from r1, the output array pointer. */
123 #define x0   0x00
124 #define x1   0x04
125 #define x2   0x08
126 #define x3   0x0C
127 #define x4   0x10
128 #define x5   0x14
129 #define x6   0x18
130 #define x7   0x1c
131 #define x8   0x20
132 #define x9   0x24
133 #define x10  0x28
134 #define x11  0x2c
135 #define x12  0x30
136 #define x13  0x34
137 #define x14  0x38
138 #define x15  0x3c
139 #define x16  0x40
140 #define x17  0x44
141 #define x18  0x48
142 #define x19  0x4c
143 #define x20  0x50
144 #define x21  0x54
145 #define x22  0x58
146 #define x23  0x5c
147 #define x24  0x60
148 #define x25  0x64
149 #define x26  0x68
150 #define x27  0x6c
151 #define x28  0x70
152 #define x29  0x74
153 #define x30  0x78
154 #define x31  0x7c
155 #define x32  0x80
156 #define x33  0x84
157 #define x34  0x88
158 #define x35  0x8c
/* Multiplier constants for the smull/smlal sums.  Every 64 bit sum is reduced
 * by taking bits[59..28], so these act as fixed point values with 28
 * fractional bits (0x10000000 == 1.0). */
160 #define K00  0x0ffc19fd
161 #define K01  0x00b2aa3e
162 #define K02  0x0fdcf549
163 #define K03  0x0216a2a2
164 #define K04  0x0f9ee890
165 #define K05  0x03768962
166 #define K06  0x0f426cb5
167 #define K07  0x04cfb0e2
168 #define K08  0x0ec835e8
169 #define K09  0x061f78aa
170 #define K10  0x0e313245
171 #define K11  0x07635284
172 #define K12  0x0d7e8807
173 #define K13  0x0898c779
174 #define K14  0x0cb19346
175 #define K15  0x09bd7ca0
176 #define K16  0x0bcbe352
177 #define K17  0x0acf37ad
/* 32 bit negation of K02 (K02 + minus_K02 == 0x100000000), so that -K02 can
 * be loaded as a literal instead of being derived with rsb. */
179 #define minus_K02 0xf0230ab7
/* Long block window coefficients window_l[0] .. window_l[17], in the same
 * scaling as the Knn constants.  The values are the Knn constants in another
 * order: WL0..WL8 equal K01, K03, .. K17 and WL9..WL17 equal K16, K14, .. K00. */
181 #define WL0  0x00b2aa3e
182 #define WL1  0x0216a2a2
183 #define WL2  0x03768962
184 #define WL3  0x04cfb0e2
185 #define WL4  0x061f78aa
186 #define WL5  0x07635284
187 #define WL6  0x0898c779
188 #define WL7  0x09bd7ca0
189 #define WL8  0x0acf37ad
190 #define WL9  0x0bcbe352
191 #define WL10 0x0cb19346
192 #define WL11 0x0d7e8807
193 #define WL12 0x0e313245
194 #define WL13 0x0ec835e8
195 #define WL14 0x0f426cb5
196 #define WL15 0x0f9ee890
197 #define WL16 0x0fdcf549
198 #define WL17 0x0ffc19fd
201 @*****************************************************************************
204     .text
205     .align
207     .global III_imdct_l
208     .global _III_imdct_l
210 III_imdct_l:
211 _III_imdct_l:
    @ Entry: r0 = 18 word input array, r1 = 36 word output array,
    @        r2 = windowing block type (saved on the stack for later).
    @ This section builds the 64 bit partial sums ct00 .. ct07 on the stack
    @ for the table driven loop that follows, and stores the outputs
    @ x[22], x[4], x[7], x[1], x[25] and x[19] directly.
    @ A 64 bit sum lo..hi is reduced to 32 bits with the pair
    @     movs lo, lo, lsr #28  /  adc lo, lo, hi, lsl #4
    @ where the carry (bit 27, the last bit shifted out) rounds the result.
213     stmdb   sp!, { r2, r4 - r11, lr }   @ all callee saved regs, plus arg3
215     ldr     r4, =K08                    @ r4 =  K08
216     ldr     r5, =K09                    @ r5 =  K09
217     ldr     r8, [r0, #X4]               @ r8 =  X4
218     ldr     r9, [r0, #X13]              @ r9 =  X13
219     rsb     r6, r4, #0                  @ r6 = -K08
220     rsb     r7, r5, #0                  @ r7 = -K09
222     smull   r2, r3, r4, r8              @ r2..r3  = (X4 * K08)
223     smlal   r2, r3, r5, r9              @ r2..r3  = (X4 * K08) + (X13 *  K09) = ct01
225     smull   r10, lr, r8, r5             @ r10..lr = (X4 * K09)
226     smlal   r10, lr, r9, r6             @ r10..lr = (X4 * K09) + (X13 * -K08) = ct00
228     ldr     r8, [r0, #X7]               @ r8 = X7
229     ldr     r9, [r0, #X16]              @ r9 = X16
231     stmdb   sp!, { r2, r3, r10, lr }    @ stack ct00_h, ct00_l, ct01_h, ct01_l
    @ in memory, ascending from sp: ct01 low, ct01 high, ct00 low, ct00 high
233     add     r8, r8, r9                  @ r8 = (X7 + X16)
234     ldr     r9, [r0, #X1]               @ r9 = X1
236     smlal   r2, r3, r6, r8              @ r2..r3  = ct01 + ((X7 + X16) * -K08)
237     smlal   r2, r3, r7, r9              @ r2..r3 += (X1  * -K09)
239     ldr     r7, [r0, #X10]              @ r7 = X10
241     rsbs    r10, r10, #0                @ negate low word, carry feeds the rsc below
242     rsc     lr, lr, #0                  @ r10..lr  = -ct00
244     smlal   r2, r3, r5, r7              @ r2..r3  += (X10 *  K09) = ct06
246     smlal   r10, lr, r9, r6             @ r10..lr  = -ct00 + ( X1        * -K08)
247     smlal   r10, lr, r8, r5             @ r10..lr +=         ((X7 + X16) *  K09)
248     smlal   r10, lr, r7, r4             @ r10..lr +=         ( X10       *  K08) = ct04
250     stmdb   sp!, { r2, r3, r10, lr }    @ stack ct04_h, ct04_l, ct06_h, ct06_l
    @ in memory, ascending from sp: ct06 low, ct06 high, ct04 low, ct04 high
252     @----
254     ldr     r7, [r0, #X0]
255     ldr     r8, [r0, #X11]
256     ldr     r9, [r0, #X12]
257     sub     r7, r7, r8
258     sub     r7, r7, r9                  @ r7 = (X0 - X11 -X12) = ct14
260     ldr     r9,  [r0, #X3]
261     ldr     r8,  [r0, #X8]
262     ldr     r11, [r0, #X15]
263     sub     r8, r8, r9
264     add     r8, r8, r11                 @ r8 = (X8 - X3 + X15) = ct16
266     add     r11, r7, r8                 @ r11 = ct14 + ct16 = ct18
268     smlal   r2, r3, r6, r11             @ r2..r3 = ct06 + ((X0 - X11 - X3 + X15 + X8 - X12) * -K08)
270     ldr     r6,  [r0, #X2]
271     ldr     r9,  [r0, #X9]
272     ldr     r12, [r0, #X14]
273     sub     r6, r6, r9
274     sub     r6, r6, r12                 @ r6 = (X2 - X9 - X14) = ct15
276     ldr     r9,  [r0, #X5]
277     ldr     r12, [r0, #X6]
278     sub     r9, r9, r12
279     ldr     r12, [r0, #X17]
280     sub     r9, r9, r12                 @ r9 = (X5 - X6 - X17) = ct17
282     add     r12, r9, r6                 @ r12 = ct15 + ct17 = ct19
284     smlal   r2, r3, r5, r12             @ r2..r3 += ((X2 - X9 + X5 - X6 - X17 - X14) * K09)
286     smlal   r10, lr, r11, r5            @ r10..lr = ct04 + (ct18 * K09)
287     smlal   r10, lr, r12, r4            @ r10..lr = ct04 + (ct18 * K09) + (ct19 * K08)
289     movs    r2, r2, lsr #28
290     adc     r2, r2, r3, lsl #4          @ r2 = bits[59..28] of r2..r3
291     str     r2, [r1, #x22]              @ store result x22
293     movs    r10, r10, lsr #28
294     adc     r10, r10, lr, lsl #4        @ r10 = bits[59..28] of r10..lr
295     str     r10, [r1, #x4]              @ store result x4
297     @----
299     ldmia   sp, { r2, r3, r4, r5 }      @ r2..r3 = ct06, r4..r5 = ct04 (dont update sp)
301     @ r2..r3 = ct06
302     @ r4..r5 = ct04
303     @ r6     = ct15
304     @ r7     = ct14
305     @ r8     = ct16
306     @ r9     = ct17
307     @ r10    = .
308     @ r11    = .
309     @ r12    = .
310     @ lr     = .
312     ldr     r10, =K03                   @ r10 = K03
313     ldr     lr,  =K15                   @ lr  = K15
315     smlal   r2, r3, r10, r7             @ r2..r3 = ct06 + (ct14 * K03)
316     smlal   r4, r5,  lr, r7             @ r4..r5 = ct04 + (ct14 * K15)
318     ldr     r12, =K14                   @ r12 =  K14
319     rsb     r10, r10, #0                @ r10 = -K03
321     smlal   r2, r3,  lr, r6             @ r2..r3 += (ct15 *  K15)
322     smlal   r4, r5, r10, r6             @ r4..r5 += (ct15 * -K03)
323     smlal   r2, r3, r12, r8             @ r2..r3 += (ct16 *  K14)
325     ldr     r11, =minus_K02             @ r11 = -K02
326     rsb     r12, r12, #0                @ r12 = -K14
328     smlal   r4, r5, r12, r9             @ r4..r5 += (ct17 * -K14)
329     smlal   r2, r3, r11, r9             @ r2..r3 += (ct17 * -K02)
330     smlal   r4, r5, r11, r8             @ r4..r5 += (ct16 * -K02)
332     movs    r2, r2, lsr #28
333     adc     r2, r2, r3, lsl #4          @ r2 = bits[59..28] of r2..r3
334     str     r2, [r1, #x7]               @ store result x7
336     movs    r4, r4, lsr #28
337     adc     r4, r4, r5, lsl #4          @ r4 = bits[59..28] of r4..r5
338     str     r4, [r1, #x1]               @ store result x1
340     @----
342     ldmia   sp, { r2, r3, r4, r5 }      @ r2..r3 = ct06, r4..r5 = ct04 (dont update sp)
344     @ r2..r3 = ct06
345     @ r4..r5 = ct04
346     @ r6     = ct15
347     @ r7     = ct14
348     @ r8     = ct16
349     @ r9     = ct17
350     @ r10    = -K03
351     @ r11    = -K02
352     @ r12    = -K14
353     @ lr     =  K15
355     rsbs    r2, r2, #0
356     rsc     r3, r3, #0                  @ r2..r3 = -ct06
358     smlal   r2, r3, r12, r7             @ r2..r3  = -ct06 + (ct14 * -K14)
359     smlal   r2, r3, r10, r8             @ r2..r3 += (ct16 * -K03)
361     smlal   r4, r5, r12, r6             @ r4..r5  =  ct04 + (ct15 * -K14)
362     smlal   r4, r5, r10, r9             @ r4..r5 += (ct17 * -K03)
363     smlal   r4, r5,  lr, r8             @ r4..r5 += (ct16 *  K15)
364     smlal   r4, r5, r11, r7             @ r4..r5 += (ct14 * -K02)
366     rsb     lr, lr, #0                  @ lr  = -K15
367     rsb     r11, r11, #0                @ r11 =  K02
369     smlal   r2, r3,  lr, r9             @ r2..r3 += (ct17 * -K15)
370     smlal   r2, r3, r11, r6             @ r2..r3 += (ct15 *  K02)
372     movs    r4, r4, lsr #28
373     adc     r4, r4, r5, lsl #4          @ r4 = bits[59..28] of r4..r5
374     str     r4, [r1, #x25]              @ store result x25
376     movs    r2, r2, lsr #28
377     adc     r2, r2, r3, lsl #4          @ r2 = bits[59..28] of r2..r3
378     str     r2, [r1, #x19]              @ store result x19
380     @----
    @ sp still points at the ct06/ct04 pair, so ct01 sits 16 bytes above it
382     ldr     r2, [sp, #16]               @ r2 = ct01_l
383     ldr     r3, [sp, #20]               @ r3 = ct01_h
385     ldr     r6, [r0, #X1]
386     ldr     r8, [r0, #X7]
387     ldr     r9, [r0, #X10]
388     ldr     r7, [r0, #X16]
390     rsbs    r2, r2, #0
391     rsc     r3, r3, #0                  @ r2..r3 = -ct01
393     mov     r4, r2
394     mov     r5, r3                      @ r4..r5 = -ct01
396     @ r2..r3 = -ct01
397     @ r4..r5 = -ct01
398     @ r6     =  X1
399     @ r7     =  X16
400     @ r8     =  X7
401     @ r9     =  X10
402     @ r10    = -K03
403     @ r11    =  K02
404     @ r12    = -K14
405     @ lr     = -K15
407     smlal   r4, r5, r12, r7             @ r4..r5 = -ct01 + (X16 * -K14)
408     smlal   r2, r3,  lr, r9             @ r2..r3 = -ct01 + (X10 * -K15)
410     smlal   r4, r5, r10, r8             @ r4..r5 += (X7  * -K03)
411     smlal   r2, r3, r10, r7             @ r2..r3 += (X16 * -K03)
413     smlal   r4, r5, r11, r9             @ r4..r5 += (X10 *  K02)
414     smlal   r2, r3, r12, r8             @ r2..r3 += (X7  * -K14)
416     rsb     lr, lr, #0                  @ lr  =  K15
417     rsb     r11, r11, #0                @ r11 = -K02
419     smlal   r4, r5,  lr, r6             @ r4..r5 += (X1  *  K15) = ct05
420     smlal   r2, r3, r11, r6             @ r2..r3 += (X1  * -K02) = ct03
422     stmdb   sp!, { r2, r3, r4, r5 }     @ stack ct05_h, ct05_l, ct03_h, ct03_l
    @ in memory, ascending from sp: ct03 low, ct03 high, ct05 low, ct05 high
424     rsbs    r4, r4, #0
425     rsc     r5, r5, #0                  @ r4..r5 = -ct05
427     stmdb   sp!, { r4, r5 }             @ stack -ct05_h, -ct05_l
    @ 24 more bytes are now stacked, so ct00 has moved to sp + 48
429     ldr     r2, [sp, #48]               @ r2 = ct00_l
430     ldr     r3, [sp, #52]               @ r3 = ct00_h
432     rsb     r10, r10, #0                @ r10 = K03
434     rsbs    r4, r2, #0
435     rsc     r5, r3, #0                  @ r4..r5 = -ct00
437     @ r2..r3 =  ct00
438     @ r4..r5 = -ct00
439     @ r6     =  X1
440     @ r7     =  X16
441     @ r8     =  X7
442     @ r9     =  X10
443     @ r10    =  K03
444     @ r11    = -K02
445     @ r12    = -K14
446     @ lr     =  K15
448     smlal   r4, r5, r10, r6             @ r4..r5 = -ct00 + (X1  * K03)
449     smlal   r2, r3, r10, r9             @ r2..r3 =  ct00 + (X10 * K03)
451     smlal   r4, r5, r12, r9             @ r4..r5 += (X10 * -K14)
452     smlal   r2, r3, r12, r6             @ r2..r3 += (X1  * -K14)
454     smlal   r4, r5, r11, r7             @ r4..r5 += (X16 * -K02)
455     smlal   r4, r5,  lr, r8             @ r4..r5 += (X7  *  K15) = ct07
457     rsb     lr, lr, #0                  @ lr  = -K15
458     rsb     r11, r11, #0                @ r11 =  K02
460     smlal   r2, r3, r11, r8             @ r2..r3 += (X7  *  K02)
461     smlal   r2, r3,  lr, r7             @ r2..r3 += (X16 * -K15) = ct02
463     rsbs    r6, r4, #0
464     rsc     r7, r5, #0                  @ r6..r7 = -ct07
466     stmdb   sp!, { r2 - r7 }            @ stack -ct07_h, -ct07_l, ct07_h, ct07_l, ct02_h, ct02_l
    @ in memory, ascending from sp: ct02 low/high, ct07 low/high, -ct07 low/high
469     @----
    @ pc reads as the address of this instruction plus 8, hence the -8 in the offset
471     add     r2, pc, #(imdct36_long_karray-.-8)  @ r2 = base address of Knn array (PIC safe ?)
474 loop:
    @ One pass per 13 word element of imdct36_long_karray, with r2 walking the
    @ table.  Twelve coefficients are multiplied with inputs 0, 2, 3, 5, 6, 8,
    @ 9, 11, 12, 14, 15 and 17 and summed in r3..r4.  The thirteenth word,
    @ loaded into r10, steers the tail of the pass:
    @   bits[31..16] = byte offset from sp of the stacked 64 bit ctxx to add
    @   bits[15..8]  = byte offset into the output array for the result
    @   bits[7..0]   = zero to run another pass, non zero to finish
475     ldr     r12, [r0, #X0]
477     ldmia   r2!, { r5 - r11 }           @ first 7 words from Karray element
479     smull   r3, r4, r5, r12             @ sum =  (Kxx * X0)
480     ldr     r12, [r0, #X2]
481     ldr     r5,  [r0, #X3]
482     smlal   r3, r4, r6, r12             @ sum += (Kxx * X2)
483     ldr     r12, [r0, #X5]
484     ldr     r6,  [r0, #X6]
485     smlal   r3, r4, r7, r5              @ sum += (Kxx * X3)
486     smlal   r3, r4, r8, r12             @ sum += (Kxx * X5)
487     ldr     r12, [r0, #X8]
488     ldr     r5,  [r0, #X9]
489     smlal   r3, r4, r9, r6              @ sum += (Kxx * X6)
490     smlal   r3, r4, r10, r12            @ sum += (Kxx * X8)
491     smlal   r3, r4, r11, r5             @ sum += (Kxx * X9)
493     ldmia   r2!, { r5 - r10 }           @ final 6 words from Karray element (r10 = control word)
495     ldr     r11, [r0, #X11]
496     ldr     r12, [r0, #X12]
497     smlal   r3, r4, r5, r11             @ sum += (Kxx * X11)
498     ldr     r11, [r0, #X14]
499     ldr     r5,  [r0, #X15]
500     smlal   r3, r4, r6, r12             @ sum += (Kxx * X12)
501     smlal   r3, r4, r7, r11             @ sum += (Kxx * X14)
502     ldr     r11, [r0, #X17]
503     smlal   r3, r4, r8, r5              @ sum += (Kxx * X15)
504     smlal   r3, r4, r9, r11             @ sum += (Kxx * X17)
506     add     r5, sp, r10, lsr #16        @ create index back into stack for required ctxx
508     ldmia   r5, { r6, r7 }              @ r6..r7 = ctxx
510     mov     r8, r10, lsl #16            @ push ctxx index off the top end
512     adds    r3, r3, r6                  @ add low words
513     adc     r4, r4, r7                  @ add high words, with carry
514     movs    r3, r3, lsr #28
515     adc     r3, r3, r4, lsl #4          @ r3 = bits[59..28] of r3..r4
517     str     r3, [r1, r8, lsr #24]       @ push completion flag off the bottom end
519     movs    r8, r8, lsl #8              @ push result location index off the top end
520     beq     loop                        @ loop back if completion flag not set
521     b       imdct_l_windowing           @ branch to windowing stage if looping finished
523 imdct36_long_karray:
    @ Coefficient table read by the loop above: 12 elements of 13 words each.
    @ Words 0..11 multiply inputs 0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15 and 17,
    @ in that order.  Word 12 is a control word:
    @   bits[31..16] = byte offset from sp of the stacked 64 bit term to add
    @                  (0x00 ct02, 0x08 ct07, 0x10 -ct07, 0x18 -ct05,
    @                   0x20 ct03, 0x28 ct05)
    @   bits[15..8]  = byte offset of the result in the output array
    @   bits[7..0]   = 1 in the last element only, which ends the loop
    @ The trailing note on each row decodes its control word.
525     .word   K17, -K13,  K10, -K06, -K05,  K01, -K00,  K04, -K07,  K11,  K12, -K16, 0x00000000   @  ct02 -> x[0]
526     .word   K13,  K07,  K16,  K01,  K10, -K05,  K04, -K11,  K00, -K17,  K06, -K12, 0x00200800   @  ct03 -> x[2]
527     .word   K11,  K17,  K05,  K12, -K01,  K06, -K07,  K00, -K13,  K04, -K16,  K10, 0x00200c00   @  ct03 -> x[3]
528     .word   K07,  K00, -K12,  K05, -K16, -K10,  K11, -K17,  K04,  K13,  K01,  K06, 0x00001400   @  ct02 -> x[5]
529     .word   K05,  K10, -K00, -K17,  K07, -K13,  K12,  K06, -K16,  K01, -K11, -K04, 0x00181800   @ -ct05 -> x[6]
530     .word   K01,  K05, -K07, -K11,  K13,  K17, -K16, -K12,  K10,  K06, -K04, -K00, 0x00102000   @ -ct07 -> x[8]
531     .word  -K16,  K12, -K11,  K07,  K04, -K00, -K01,  K05, -K06,  K10,  K13, -K17, 0x00284800   @  ct05 -> x[18]
532     .word  -K12,  K06,  K17, -K00, -K11,  K04,  K05, -K10,  K01,  K16, -K07, -K13, 0x00085000   @  ct07 -> x[20]
533     .word  -K10,  K16,  K04, -K13, -K00,  K07,  K06, -K01, -K12, -K05,  K17,  K11, 0x00105400   @ -ct07 -> x[21]
534     .word  -K06, -K01,  K13,  K04,  K17, -K11, -K10, -K16, -K05,  K12,  K00,  K07, 0x00185c00   @ -ct05 -> x[23]
535     .word  -K04, -K11, -K01,  K16,  K06,  K12,  K13, -K07, -K17, -K00, -K10, -K05, 0x00006000   @  ct02 -> x[24]
536     .word  -K00, -K04, -K06, -K10, -K12, -K16, -K17, -K13, -K11, -K07, -K05, -K01, 0x00206801   @  ct03 -> x[26], last element
539     @----
540     @-------------------------------------------------------------------------
541     @----
543 imdct_l_windowing:
    @ Windowing stage.  The code above stored only x[0]..x[8] and
    @ x[18]..x[26].  The remaining outputs are derived in the branches below.
    @ The saved block type is at sp + 80: 20 words of ct sums were stacked
    @ after the initial register save, and r2 is the lowest word of that save.
545     ldr     r11, [sp, #80]              @ fetch function parameter 3 from out of the stack
546     ldmia   r1!, { r0, r2 - r9 }        @ load 9 words from x0, update pointer
548     @ r0     = x0
549     @ r1     = &x[9]
550     @ r2     = x1
551     @ r3     = x2
552     @ r4     = x3
553     @ r5     = x4
554     @ r6     = x5
555     @ r7     = x6
556     @ r8     = x7
557     @ r9     = x8
558     @ r10    = .
559     @ r11    = window mode: (0 == normal), (1 == start block), (3 == stop block)
560     @ r12    = .
561     @ lr     = .
563     cmp     r11, #BLOCK_MODE_STOP       @ setup flags
564     rsb     r10, r0, #0                 @ r10 = -x0 (DONT change flags !!)
565     beq     stop_block_x0_to_x17        @ stop blocks window x[0]..x[17] differently
568     @ start and normal blocks are treated the same for x[0]..x[17]
570 normal_block_x0_to_x17:
    @ Normal and start blocks.  First x[9]..x[17] are produced as
    @ window_l[9..17] times the negated x[8]..x[0], then x[0]..x[8] are
    @ reloaded and multiplied by window_l[0..8].  Every product is reduced to
    @ bits[59..28] with rounding.  r11 (window mode) is left untouched for
    @ the test at the end of this section.
572     ldr     r12, =WL9                   @ r12 = window_l[9]
574     rsb     r0,  r9, #0                 @ r0  = -x8
575     rsb     r9,  r2, #0                 @ r9  = -x1
576     rsb     r2,  r8, #0                 @ r2  = -x7
577     rsb     r8,  r3, #0                 @ r8  = -x2
578     rsb     r3,  r7, #0                 @ r3  = -x6
579     rsb     r7,  r4, #0                 @ r7  = -x3
580     rsb     r4,  r6, #0                 @ r4  = -x5
581     rsb     r6,  r5, #0                 @ r6  = -x4
583     @ r0     = -x8
584     @ r1     = &x[9]
585     @ r2     = -x7
586     @ r3     = -x6
587     @ r4     = -x5
588     @ r5     = .
589     @ r6     = -x4
590     @ r7     = -x3
591     @ r8     = -x2
592     @ r9     = -x1
593     @ r10    = -x0
594     @ r11    = window mode: (0 == normal), (1 == start block), (3 == stop block)
595     @ r12    = window_l[9]
596     @ lr     = .
598     smull   r5, lr, r12, r0             @ r5..lr = (window_l[9]  * (x[9]  == -x[8]))
599     ldr     r12, =WL10                  @ r12 = window_l[10]
600     movs    r5, r5, lsr #28
601     adc     r0, r5, lr, lsl #4          @ r0 = bits[59..28] of windowed x9
603     smull   r5, lr, r12, r2             @ r5..lr = (window_l[10] * (x[10] == -x[7]))
604     ldr     r12, =WL11                  @ r12 = window_l[11]
605     movs    r5, r5, lsr #28
606     adc     r2, r5, lr, lsl #4          @ r2 = bits[59..28] of windowed x10
608     smull   r5, lr, r12, r3             @ r5..lr = (window_l[11] * (x[11] == -x[6]))
609     ldr     r12, =WL12                  @ r12 = window_l[12]
610     movs    r5, r5, lsr #28
611     adc     r3, r5, lr, lsl #4          @ r3 = bits[59..28] of windowed x11
613     smull   r5, lr, r12, r4             @ r5..lr = (window_l[12] * (x[12] == -x[5]))
614     ldr     r12, =WL13                  @ r12 = window_l[13]
615     movs    r5, r5, lsr #28
616     adc     r4, r5, lr, lsl #4          @ r4 = bits[59..28] of windowed x12
618     smull   r5, lr, r12, r6             @ r5..lr = (window_l[13] * (x[13] == -x[4]))
619     ldr     r12, =WL14                  @ r12 = window_l[14]
620     movs    r5, r5, lsr #28
621     adc     r6, r5, lr, lsl #4          @ r6 = bits[59..28] of windowed x13
623     smull   r5, lr, r12, r7             @ r5..lr = (window_l[14] * (x[14] == -x[3]))
624     ldr     r12, =WL15                  @ r12 = window_l[15]
625     movs    r5, r5, lsr #28
626     adc     r7, r5, lr, lsl #4          @ r7 = bits[59..28] of windowed x14
628     smull   r5, lr, r12, r8             @ r5..lr = (window_l[15] * (x[15] == -x[2]))
629     ldr     r12, =WL16                  @ r12 = window_l[16]
630     movs    r5, r5, lsr #28
631     adc     r8, r5, lr, lsl #4          @ r8 = bits[59..28] of windowed x15
633     smull   r5, lr, r12, r9             @ r5..lr = (window_l[16] * (x[16] == -x[1]))
634     ldr     r12, =WL17                  @ r12 = window_l[17]
635     movs    r5, r5, lsr #28
636     adc     r9, r5, lr, lsl #4          @ r9 = bits[59..28] of windowed x16
638     smull   r5, lr, r12, r10            @ r5..lr = (window_l[17] * (x[17] == -x[0]))
639     ldr     r12, =WL0                   @ r12 = window_l[0]
640     movs    r5, r5, lsr #28
641     adc     r10, r5, lr, lsl #4         @ r10 = bits[59..28] of windowed x17
644     stmia   r1,  { r0, r2 - r4, r6 - r10 } @ store windowed x[9] .. x[17]
645     ldmdb   r1!, { r0, r2 - r9 }           @ load 9 words downto (and including) x0
    @ r1 = &x[0] from here on
648     smull   r10, lr, r12, r0            @ r10..lr = (window_l[0] * x[0])
649     ldr     r12, =WL1                   @ r12 = window_l[1]
650     movs    r10, r10, lsr #28
651     adc     r0, r10, lr, lsl #4         @ r0 = bits[59..28] of windowed x0
653     smull   r10, lr, r12, r2            @ r10..lr = (window_l[1] * x[1])
654     ldr     r12, =WL2                   @ r12 = window_l[2]
655     movs    r10, r10, lsr #28
656     adc     r2, r10, lr, lsl #4         @ r2 = bits[59..28] of windowed x1
658     smull   r10, lr, r12, r3            @ r10..lr = (window_l[2] * x[2])
659     ldr     r12, =WL3                   @ r12 = window_l[3]
660     movs    r10, r10, lsr #28
661     adc     r3, r10, lr, lsl #4         @ r3 = bits[59..28] of windowed x2
663     smull   r10, lr, r12, r4            @ r10..lr = (window_l[3] * x[3])
664     ldr     r12, =WL4                   @ r12 = window_l[4]
665     movs    r10, r10, lsr #28
666     adc     r4, r10, lr, lsl #4         @ r4 = bits[59..28] of windowed x3
668     smull   r10, lr, r12, r5            @ r10..lr = (window_l[4] * x[4])
669     ldr     r12, =WL5                   @ r12 = window_l[5]
670     movs    r10, r10, lsr #28
671     adc     r5, r10, lr, lsl #4         @ r5 = bits[59..28] of windowed x4
673     smull   r10, lr, r12, r6            @ r10..lr = (window_l[5] * x[5])
674     ldr     r12, =WL6                   @ r12 = window_l[6]
675     movs    r10, r10, lsr #28
676     adc     r6, r10, lr, lsl #4         @ r6 = bits[59..28] of windowed x5
678     smull   r10, lr, r12, r7            @ r10..lr = (window_l[6] * x[6])
679     ldr     r12, =WL7                   @ r12 = window_l[7]
680     movs    r10, r10, lsr #28
681     adc     r7, r10, lr, lsl #4         @ r7 = bits[59..28] of windowed x6
683     smull   r10, lr, r12, r8            @ r10..lr = (window_l[7] * x[7])
684     ldr     r12, =WL8                   @ r12 = window_l[8]
685     movs    r10, r10, lsr #28
686     adc     r8, r10, lr, lsl #4         @ r8 = bits[59..28] of windowed x7
688     smull   r10, lr, r12, r9            @ r10..lr = (window_l[8] * x[8])
689     movs    r10, r10, lsr #28
690     adc     r9, r10, lr, lsl #4         @ r9 = bits[59..28] of windowed x8
692     stmia   r1, { r0, r2 - r9 }         @ store windowed x[0] .. x[8]
694     cmp     r11, #BLOCK_MODE_START
695     beq     start_block_x18_to_x35      @ start blocks window x[18]..x[35] differently
698     @----
701 normal_block_x18_to_x35:
    @ Normal blocks, upper half.  Entered with r1 = &x[0].  First
    @ x[27]..x[35] are produced from x[26]..x[18] (x[27+k] mirrors x[26-k])
    @ using window_l[8]..window_l[0], then x[18]..x[26] are reloaded and
    @ multiplied by window_l[17]..window_l[9].  This path ends by releasing
    @ the stack frame and returning to the caller.
703     ldr     r11, =WL3                   @ r11 = window_l[3]
704     ldr     r12, =WL4                   @ r12 = window_l[4]
706     add     r1, r1, #(18*4)             @ r1 = &x[18]
708     ldmia   r1!, { r0, r2 - r4, r6 - r10 }  @ load 9 words from x18, update pointer
710     @ r0     = x18
711     @ r1     = &x[27]
712     @ r2     = x19
713     @ r3     = x20
714     @ r4     = x21
715     @ r5     = .
716     @ r6     = x22
717     @ r7     = x23
718     @ r8     = x24
719     @ r9     = x25
720     @ r10    = x26
721     @ r11    = window_l[3]
722     @ r12    = window_l[4]
723     @ lr     = .
725     smull   r5, lr, r12, r6             @ r5..lr = (window_l[4] * (x[22] == x[31]))
726     movs    r5, r5, lsr #28
727     adc     r5, r5, lr, lsl #4          @ r5 = bits[59..28] of windowed x31
729     smull   r6, lr, r11, r4             @ r6..lr = (window_l[3] * (x[21] == x[32]))
730     ldr     r12, =WL5                   @ r12    =  window_l[5]
731     movs    r6, r6, lsr #28
732     adc     r6, r6, lr, lsl #4          @ r6 = bits[59..28] of windowed x32
734     smull   r4, lr, r12, r7             @ r4..lr = (window_l[5] * (x[23] == x[30]))
735     ldr     r11, =WL1                   @ r11    =  window_l[1]
736     ldr     r12, =WL2                   @ r12    =  window_l[2]
737     movs    r4, r4, lsr #28
738     adc     r4, r4, lr, lsl #4          @ r4 = bits[59..28] of windowed x30
740     smull   r7, lr, r12, r3             @ r7..lr = (window_l[2] * (x[20] == x[33]))
741     ldr     r12, =WL6                   @ r12 = window_l[6]
742     movs    r7, r7, lsr #28
743     adc     r7, r7, lr, lsl #4          @ r7 = bits[59..28] of windowed x33
745     smull   r3, lr, r12, r8             @ r3..lr = (window_l[6] * (x[24] == x[29]))
746     movs    r3, r3, lsr #28
747     adc     r3, r3, lr, lsl #4          @ r3 = bits[59..28] of windowed x29
749     smull   r8, lr, r11, r2             @ r8..lr = (window_l[1] * (x[19] == x[34]))
750     ldr     r12, =WL7                   @ r12    =  window_l[7]
751     ldr     r11, =WL8                   @ r11    =  window_l[8]
752     movs    r8, r8, lsr #28
753     adc     r8, r8, lr, lsl #4          @ r8 = bits[59..28] of windowed x34
755     smull   r2, lr, r12, r9             @ r2..lr = (window_l[7] * (x[25] == x[28]))
756     ldr     r12, =WL0                   @ r12 = window_l[0]
757     movs    r2, r2, lsr #28
758     adc     r2, r2, lr, lsl #4          @ r2 = bits[59..28] of windowed x28
760     smull   r9, lr, r12, r0             @ r9..lr = (window_l[0] * (x[18] == x[35]))
761     movs    r9, r9, lsr #28
762     adc     r9, r9, lr, lsl #4          @ r9 = bits[59..28] of windowed x35
764     smull   r0, lr, r11, r10            @ r0..lr = (window_l[8] * (x[26] == x[27]))
765     ldr     r11, =WL16                  @ r11    =  window_l[16]
766     ldr     r12, =WL17                  @ r12    =  window_l[17]
767     movs    r0, r0, lsr #28
768     adc     r0, r0, lr, lsl #4          @ r0 = bits[59..28] of windowed x27
771     stmia   r1,  { r0, r2 - r9 }        @ store windowed x[27] .. x[35]
772     ldmdb   r1!, { r0, r2 - r9 }        @ load 9 words downto (and including) x18
    @ r1 = &x[18], r0 = x18, r2..r9 = x19..x26
775     smull   r10, lr, r12, r0            @ r10..lr = (window_l[17] * x[18])
776     movs    r10, r10, lsr #28
777     adc     r0,  r10, lr, lsl #4        @ r0 = bits[59..28] of windowed x18
779     smull   r10, lr, r11, r2            @ r10..lr = (window_l[16] * x[19])
780     ldr     r11, =WL14                  @ r11     =  window_l[14]
781     ldr     r12, =WL15                  @ r12     =  window_l[15]
782     movs    r10, r10, lsr #28
783     adc     r2,  r10, lr, lsl #4        @ r2 = bits[59..28] of windowed x19
785     smull   r10, lr, r12, r3            @ r10..lr = (window_l[15] * x[20])
786     movs    r10, r10, lsr #28
787     adc     r3,  r10, lr, lsl #4        @ r3 = bits[59..28] of windowed x20
789     smull   r10, lr, r11, r4            @ r10..lr = (window_l[14] * x[21])
790     ldr     r11, =WL12                  @ r11     =  window_l[12]
791     ldr     r12, =WL13                  @ r12     =  window_l[13]
792     movs    r10, r10, lsr #28
793     adc     r4,  r10, lr, lsl #4        @ r4 = bits[59..28] of windowed x21
795     smull   r10, lr, r12, r5            @ r10..lr = (window_l[13] * x[22])
796     movs    r10, r10, lsr #28
797     adc     r5,  r10, lr, lsl #4        @ r5 = bits[59..28] of windowed x22
799     smull   r10, lr, r11, r6            @ r10..lr = (window_l[12] * x[23])
800     ldr     r11, =WL10                  @ r11 = window_l[10]
801     ldr     r12, =WL11                  @ r12 = window_l[11]
802     movs    r10, r10, lsr #28
803     adc     r6,  r10, lr, lsl #4        @ r6 = bits[59..28] of windowed x23
805     smull   r10, lr, r12, r7            @ r10..lr = (window_l[11] * x[24])
806     movs    r10, r10, lsr #28
807     adc     r7,  r10, lr, lsl #4        @ r7 = bits[59..28] of windowed x24
809     smull   r10, lr, r11, r8            @ r10..lr = (window_l[10] * x[25])
810     ldr     r12, =WL9                   @ r12 = window_l[9]
811     movs    r10, r10, lsr #28
812     adc     r8,  r10, lr, lsl #4        @ r8 = bits[59..28] of windowed x25
814     smull   r10, lr, r12, r9            @ r10..lr = (window_l[9] * x[26])
816     movs    r10, r10, lsr #28
817     adc     r9,  r10, lr, lsl #4        @ r9 = bits[59..28] of windowed x26
819     stmia   r1, { r0, r2 - r9 }         @ store windowed x[18] .. x[26]
821     @----
822     @ NB there are 2 possible exits from this function - this is only one of them
823     @----
    @ 21 words = 20 words of stacked ct sums plus the saved copy of r2
825     add     sp, sp, #(21*4)             @ return stack frame
826     ldmia   sp!, { r4 - r11, pc }       @ restore callee saved regs, and return
828     @----
stop_block_x0_to_x17:

    @----
    @ Stop-block handling of the first half of the output, x[0] .. x[17]:
    @
    @   x[0]  .. x[5]   are stored as zero
    @   x[6]  .. x[11]  are multiplied by WL1, WL4, WL7, WL10, WL13, WL16
    @   x[12] .. x[17]  are stored without windowing
    @
    @ x[9] .. x[17] are generated from x[8] .. x[0] by negation.
    @
    @ Every windowed value is formed the same way:
    @   smull        lo..hi = 64 bit signed product
    @   movs lsr #28 lo >>= 28, C = bit 27 of the product (last bit shifted out)
    @   adc          result = lo + (hi << 4) + C
    @ i.e. bits[59..28] of the product, rounded using bit 27.
    @
    @ Leaves with r1 == &x[0] and r11 untouched, then continues at
    @ normal_block_x18_to_x35.
    @----

    @ r0     =  x0
    @ r1     = &x[9]
    @ r2     =  x1
    @ r3     =  x2
    @ r4     =  x3
    @ r5     =  x4
    @ r6     =  x5
    @ r7     =  x6
    @ r8     =  x7
    @ r9     =  x8
    @ r10    = -x0
    @ r11    =  window mode: (0 == normal), (1 == start block), (3 == stop block)
    @ r12    =  .
    @ lr     =  .

    @ Negate x5 .. x1 while rotating them into ascending register order for
    @ the stmia below. Each rsb overwrites a register whose old value has
    @ already been consumed, so the order of these five instructions matters.

    rsb     r0, r6, #0                  @ r0 = -x5  (becomes x[12])
    rsb     r6, r2, #0                  @ r6 = -x1  (becomes x[16])
    rsb     r2, r5, #0                  @ r2 = -x4  (becomes x[13])
    rsb     r5, r3, #0                  @ r5 = -x2  (becomes x[15])
    rsb     r3, r4, #0                  @ r3 = -x3  (becomes x[14])

    add     r1, r1, #(3*4)                      @ r1 = &x[12]
    stmia   r1, { r0, r2, r3, r5, r6, r10 }     @ store unwindowed x[12] .. x[17]

    ldr     r0, =WL1                    @ r0 = window_l[1]  == window_s[0]

    rsb     r10, r9, #0                 @ r10 = -x8  (x[9]  before windowing)
    rsb     r12, r8, #0                 @ r12 = -x7  (x[10] before windowing)
    rsb     lr,  r7, #0                 @ lr  = -x6  (x[11] before windowing)

    @ r0     =  WL1
    @ r1     = &x[12]
    @ r2     =  .
    @ r3     =  .
    @ r4     =  .
    @ r5     =  .
    @ r6     =  .
    @ r7     =  x6
    @ r8     =  x7
    @ r9     =  x8
    @ r10    = -x8
    @ r11    =  window mode: (0 == normal), (1 == start block), (3 == stop block)
    @ r12    = -x7
    @ lr     = -x6

    @ The next window coefficient is loaded between each smull and the
    @ movs/adc pair that consumes its result.

    smull   r5, r6, r0, r7              @ r5..r6 = (window_l[1] * x[6])
    ldr     r2, =WL4                    @ r2     =  window_l[4] == window_s[1]
    movs    r5, r5, lsr #28
    adc     r7, r5, r6, lsl #4          @ r7 = bits[59..28] of windowed x6

    smull   r5, r6, r2, r8              @ r5..r6 = (window_l[4] * x[7])
    ldr     r3, =WL7                    @ r3     =  window_l[7] == window_s[2]
    movs    r5, r5, lsr #28
    adc     r8, r5, r6, lsl #4          @ r8 = bits[59..28] of windowed x7

    smull   r5, r6, r3, r9              @ r5..r6 = (window_l[7] * x[8])
    ldr     r4, =WL10                   @ r4     =  window_l[10] == window_s[3]
    movs    r5, r5, lsr #28
    adc     r9, r5, r6, lsl #4          @ r9 = bits[59..28] of windowed x8

    smull   r5, r6, r4, r10             @ r5..r6 = (window_l[10] * (x[9] == -x[8]))
    ldr     r0, =WL13                   @ r0     =  window_l[13] == window_s[4]
    movs    r5, r5, lsr #28
    adc     r10, r5, r6, lsl #4         @ r10 = bits[59..28] of windowed x9

    smull   r5, r6, r0, r12             @ r5..r6 = (window_l[13] * (x[10] == -x[7]))
    ldr     r2, =WL16                   @ r2     =  window_l[16] == window_s[5]
    movs    r5, r5, lsr #28
    adc     r12, r5, r6, lsl #4         @ r12 = bits[59..28] of windowed x10

    smull   r5, r6, r2, lr              @ r5..r6 = (window_l[16] * (x[11] == -x[6]))

    mov     r0, #0                      @ r0 = 0 (WL13 is dead; mov leaves the flags alone)

    movs    r5, r5, lsr #28
    adc     lr, r5, r6, lsl #4          @ lr = bits[59..28] of windowed x11

    stmdb   r1!, { r7 - r10, r12, lr }  @ store windowed x[6] .. x[11], r1 = &x[6]

    mov     r5, #0                      @ r2 - r6 = 0, together with r0 these are
    mov     r6, #0                      @ the six zero words for x[0] .. x[5]
    mov     r2, #0
    mov     r3, #0
    mov     r4, #0

    stmdb   r1!, { r0, r2 - r6 }        @ store zeroed x[0] .. x[5], r1 = &x[0]

    b       normal_block_x18_to_x35     @ second half is handled by the normal-block code
923     @----
start_block_x18_to_x35:

    @----
    @ Start-block handling of the tail of the output:
    @
    @   x[24] .. x[29]  are multiplied by WL16, WL13, WL10, WL7, WL4, WL1
    @   x[30] .. x[35]  are stored as zero
    @
    @ Only x[24] .. x[26] are read from memory; x[27], x[28], x[29] are
    @ produced from x[26], x[25], x[24] respectively (mirror image before
    @ windowing). x[18] .. x[23] are not touched by this code.
    @
    @ Every windowed value is formed the same way:
    @   smull        lo..hi = 64 bit signed product
    @   movs lsr #28 lo >>= 28, C = bit 27 of the product (last bit shifted out)
    @   adc          result = lo + (hi << 4) + C
    @ i.e. bits[59..28] of the product, rounded using bit 27.
    @
    @ NOTE(review): the pointer adjustment below yields &x[24] only if
    @ r1 == &x[0] on entry - confirm at the branch site(s).
    @----

    ldr     r4, =WL1                    @ r4 = window_l[1]  == window_s[0]

    add     r1, r1, #(24*4)             @ r1 = &x[24]

    ldmia   r1, { r0, r2, r3 }          @ load 3 words from x24, dont update pointer

    @ r0     = x24
    @ r1     = &x[24]
    @ r2     = x25
    @ r3     = x26
    @ r4     = WL1
    @ r5     = .   (WL4,  loaded below)
    @ r6     = .   (WL7,  loaded below)
    @ r7     = .   (WL10, loaded below)
    @ r8     = .   (WL13, loaded below)
    @ r9     = .   (WL16, loaded below)
    @ r10    = .   (low  word of each product)
    @ r11    = .   (high word of each product)
    @ r12    = .
    @ lr     = .

    ldr     r5, =WL4                    @ r5 = window_l[4] == window_s[1]

    @ The next window coefficient is loaded between each smull and the
    @ movs/adc pair that consumes its result.

    smull   r10, r11, r4, r0            @ r10..r11 = (window_l[1] * (x[24] == x[29]))
    ldr     r6, =WL7                    @ r6       =  window_l[7]  == window_s[2]
    movs    r10, r10, lsr #28
    adc     lr, r10, r11, lsl #4        @ lr = bits[59..28] of windowed x29

    smull   r10, r11, r5, r2            @ r10..r11 = (window_l[4] * (x[25] == x[28]))
    ldr     r7, =WL10                   @ r7       =  window_l[10] == window_s[3]
    movs    r10, r10, lsr #28
    adc     r12, r10, r11, lsl #4       @ r12 = bits[59..28] of windowed x28

    smull   r10, r11, r6, r3            @ r10..r11 = (window_l[7] * (x[26] == x[27]))
    ldr     r8, =WL13                   @ r8       =  window_l[13] == window_s[4]
    movs    r10, r10, lsr #28
    adc     r4, r10, r11, lsl #4        @ r4 = bits[59..28] of windowed x27 (WL1 is dead)

    smull   r10, r11, r7, r3            @ r10..r11 = (window_l[10] * x[26])
    ldr     r9, =WL16                   @ r9       =  window_l[16] == window_s[5]
    movs    r10, r10, lsr #28
    adc     r3, r10, r11, lsl #4        @ r3 = bits[59..28] of windowed x26

    smull   r10, r11, r8, r2            @ r10..r11 = (window_l[13] * x[25])
    mov     r5, #0                      @ r5 = 0 (WL4 is dead; mov leaves the flags alone)
    movs    r10, r10, lsr #28
    adc     r2, r10, r11, lsl #4        @ r2 = bits[59..28] of windowed x25

    smull   r10, r11, r9, r0            @ r10..r11 = (window_l[16] * x[24])
    mov     r6, #0                      @ r6 = 0 (WL7 is dead; mov leaves the flags alone)
    movs    r10, r10, lsr #28
    adc     r0, r10, r11, lsl #4        @ r0 = bits[59..28] of windowed x24

    stmia   r1!, { r0, r2, r3, r4, r12, lr }    @ store windowed x[24] .. x[29], r1 = &x[30]

    mov     r7, #0                      @ r7 - r10 = 0, together with r5 and r6 these
    mov     r8, #0                      @ are the six zero words for x[30] .. x[35]
    mov     r9, #0
    mov     r10, #0

    stmia   r1!, { r5 - r10 }           @ store zeroed x[30] .. x[35]

    @----
    @ NB there are 2 possible exits from this function - this is only one of them
    @----

    add     sp, sp, #(21*4)             @ return stack frame
    ldmia   sp!, { r4 - r11, pc }       @ restore callee saved regs, and return
997     @----
998     @END
999     @----