fix YUV Dither for the other targets too, like in r26064.
[kugel-rb.git] / firmware / target / arm / as3525 / lcd-as-e200v2-fuze-fuzev2.S
blobc5f85baa44956f45ab9c04df8ca2015fcb39f190
1 /***************************************************************************
2  *             __________               __   ___.
3  *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
4  *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
5  *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
6  *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
7  *                     \/            \/     \/    \/            \/
8  * $Id$
9  *
10  * Copyright (C) 2007 by Jens Arnold
11  * Heavily based on lcd-as-memframe.c by Michael Sevakis
12  * Adapted for Sansa Fuze/e200v2 by Rafaël Carré
13  *
14  * This program is free software; you can redistribute it and/or
15  * modify it under the terms of the GNU General Public License
16  * as published by the Free Software Foundation; either version 2
17  * of the License, or (at your option) any later version.
18  *
19  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
20  * KIND, either express or implied.
21  *
22  ****************************************************************************/
24 #include "config.h"
25 #include "cpu.h"
27 #define DBOP_BUSY (1<<10) /* DBOP_STATUS bit 10. NOTE(review): the wait loops spin while this bit is CLEAR, so despite the name it looks like a "fifo empty" flag - confirm */
29 /****************************************************************************
30  * void lcd_write_yuv420_lines(unsigned char const * const src[3],
31  *                             int width,
32  *                             int stride);
33  * Converts 2x2 pixel blocks (4 Y', 1 Cb, 1 Cr) and writes them to DBOP_BASE+0x10.
34  *   |R|   |1.000000 -0.000001  1.402000| |Y'|
35  *   |G| = |1.000000 -0.334136 -0.714136| |Pb|
36  *   |B|   |1.000000  1.772000  0.000000| |Pr|
37  *   Scaled, normalized, rounded and tweaked to yield RGB 565:
38  *   |R|   |74   0 101| |Y' -  16| >> 9
39  *   |G| = |74 -24 -51| |Cb - 128| >> 8
40  *   |B|   |74 128   0| |Cr - 128| >> 9
41  * Two pixels are packed per 32-bit write; width drops by 2 per loop pass.
42  * Write four RGB565 pixels in the following order on each loop:
43  * 1 3 + > down
44  * 2 4 \/ left
45  */
46     .section    .icode, "ax", %progbits
47     .align      2
48     .global     lcd_write_yuv420_lines
49     .type       lcd_write_yuv420_lines, %function
50 lcd_write_yuv420_lines:
51                                         @ r0 = yuv_src
52                                         @ r1 = width
53                                         @ r2 = stride
54     stmfd       sp!, { r4-r11, lr }     @ save non-scratch
56     mov         r3, #0xC8000000         @
57     orr         r3, r3, #0x120000       @ r3 = DBOP_BASE
59     ldmia       r0, { r4, r5, r6 }      @ r4 = yuv_src[0] = Y'_p
60                                         @ r5 = yuv_src[1] = Cb_p
61                                         @ r6 = yuv_src[2] = Cr_p
62                                         @ r0 = scratch
63     ldr         r12, [r3, #8]           @ r12 = DBOP_CTRL
64     sub         r2, r2, #1              @ stride -= 1 (Y'_p is post-incremented)
65     orr         r12, r12, #3<<13        @ DBOP_CTRL |= (1<<13|1<<14) (32bit mode)
66 #ifdef SANSA_FUZEV2
67     bic         r12, r12, #1<<13        @ DBOP_CTRL &= ~(1<<13),still 32bit mode
68 #endif
69     str         r12, [r3, #8]           @ write DBOP_CTRL back
70 10: @ loop line                         @
71     ldrb        r7, [r4], #1            @ r7 = *Y'_p++;
72     ldrb        r8, [r5], #1            @ r8 = *Cb_p++;
73     ldrb        r9, [r6], #1            @ r9 = *Cr_p++;
74                                         @
75     sub         r7, r7, #16             @ r7 = Y = (Y' - 16)*74
76     add         r12, r7, r7, asl #2     @ actually (Y' - 16)*37 and shift right
77     add         r7, r12, r7, asl #5     @ by one less when adding - same for all
78                                         @
79     sub         r8, r8, #128            @ Cb -= 128
80     sub         r9, r9, #128            @ Cr -= 128
81                                         @
82     add         r10, r9, r9, asl #1     @ r10 = Cr*51 + Cb*24
83     add         r10, r10, r10, asl #4   @
84     add         r10, r10, r8, asl #3    @
85     add         r10, r10, r8, asl #4    @
86                                         @
87     add         lr, r9, r9, asl #2      @ r9 = Cr*101
88     add         lr, lr, r9, asl #5      @
89     add         r9, lr, r9, asl #6      @
90                                         @ chroma terms are shared by all 4 pixels
91     add         r8, r8, #2              @ r8 = bu = (Cb*128 + 128) >> 8
92     mov         r8, r8, asr #2          @
93     add         r9, r9, #256            @ r9 = rv = (r9 + 256) >> 9
94     mov         r9, r9, asr #9          @
95     rsb         r10, r10, #128          @ r10 = guv = (-r10 + 128) >> 8
96     mov         r10, r10, asr #8        @
97                                         @ compute R, G, and B
98     add         r0, r8, r7, asr #8      @ r0  = b = (Y >> 9) + bu
99     add         lr, r9, r7, asr #8      @ lr = r = (Y >> 9) + rv
100     add         r7, r10, r7, asr #7     @ r7  = g = (Y >> 8) + guv
101                                         @
102     orr         r12, r0, lr             @ check if clamping is needed...
103     orr         r12, r12, r7, asr #1    @ ...at all
104     cmp         r12, #31                @ unsigned: negatives compare high too
105     bls         15f @ no clamp          @
106     cmp         r0, #31                 @ clamp b
107     mvnhi       r0, r0, asr #31         @ negative -> 0, too big -> all ones
108     andhi       r0, r0, #31             @
109     cmp         lr, #31                 @ clamp r
110     mvnhi       lr, lr, asr #31         @
111     andhi       lr, lr, #31             @
112     cmp         r7, #63                 @ clamp g
113     mvnhi       r7, r7, asr #31         @
114     andhi       r7, r7, #63             @
115 15: @ no clamp                          @
116                                         @
117     ldrb        r12, [r4, r2]           @ r12 = Y' = *(Y'_p + stride)
118                                         @
119     orr         r0, r0, lr, lsl #11     @ r0 = (r << 11) | b
120     orr         r11, r0, r7, lsl #5     @ r11 = (r << 11) | (g << 5) | b
122 #ifdef SANSA_FUZEV2
123     mov         r0, r11, lsr #8         @
124     bic         r11, r11, #0xff00       @
125     orr         r11, r0, r11, lsl #8    @ swap bytes
126 #endif
127     sub         r7, r12, #16            @ r7 = Y = (Y' - 16)*74
128     add         r12, r7, r7, asl #2     @
129     add         r7, r12, r7, asl #5     @
130                                         @ compute R, G, and B
131     add         r0, r8, r7, asr #8      @ r0  = b = (Y >> 9) + bu
132     add         lr, r9, r7, asr #8      @ lr = r = (Y >> 9) + rv
133     add         r7, r10, r7, asr #7     @ r7  = g = (Y >> 8) + guv
134                                         @
135     orr         r12, r0, lr             @ check if clamping is needed...
136     orr         r12, r12, r7, asr #1    @ ...at all
137     cmp         r12, #31                @
138     bls         15f @ no clamp          @
139     cmp         r0, #31                 @ clamp b
140     mvnhi       r0, r0, asr #31         @
141     andhi       r0, r0, #31             @
142     cmp         lr, #31                 @ clamp r
143     mvnhi       lr, lr, asr #31         @
144     andhi       lr, lr, #31             @
145     cmp         r7, #63                 @ clamp g
146     mvnhi       r7, r7, asr #31         @
147     andhi       r7, r7, #63             @
148 15: @ no clamp                          @
149                                         @
150     ldrb        r12, [r4], #1           @ r12 = Y' = *(Y'_p++)
151                                         @
152     orr         r0, r0, lr, lsl #11     @ r0 = (r << 11) | b
153     orr         r0, r0, r7, lsl #5      @ r0 = (r << 11) | (g << 5) | b
155 #ifdef SANSA_FUZEV2
156     mov         r7, r0, lsr #8          @
157     bic         r0, r0, #0xff00         @ clear high byte before shifting it up
158     orr         r0, r7, r0, lsl #8      @ swap bytes
159 #endif
161     orr         r0, r11, r0, lsl#16     @ pack with 2nd pixel
162     str         r0, [r3, #0x10]         @ write pixel
163                                         @
164     sub         r7, r12, #16            @ r7 = Y = (Y' - 16)*74
165     add         r12, r7, r7, asl #2     @
166     add         r7, r12, r7, asl #5     @
167                                         @ compute R, G, and B
168     add         r0, r8, r7, asr #8      @ r0  = b = (Y >> 9) + bu
169     add         lr, r9, r7, asr #8      @ lr = r = (Y >> 9) + rv
170     add         r7, r10, r7, asr #7     @ r7  = g = (Y >> 8) + guv
171                                         @
172     orr         r12, r0, lr             @ check if clamping is needed...
173     orr         r12, r12, r7, asr #1    @ ...at all
174     cmp         r12, #31                @
175     bls         15f @ no clamp          @
176     cmp         r0, #31                 @ clamp b
177     mvnhi       r0, r0, asr #31         @
178     andhi       r0, r0, #31             @
179     cmp         lr, #31                 @ clamp r
180     mvnhi       lr, lr, asr #31         @
181     andhi       lr, lr, #31             @
182     cmp         r7, #63                 @ clamp g
183     mvnhi       r7, r7, asr #31         @
184     andhi       r7, r7, #63             @
185 15: @ no clamp                          @
186                                         @
187     ldrb        r12, [r4, r2]           @ r12 = Y' = *(Y'_p + stride)
188                                         @
189                                         @
190     orr         r0, r0, lr, lsl #11     @ r0 = (r << 11) | b
191     orr         r11, r0, r7, lsl #5     @ r11 = (r << 11) | (g << 5) | b
193 #ifdef SANSA_FUZEV2
194     mov         r0, r11, lsr #8         @
195     bic         r11, r11, #0xff00       @
196     orr         r11, r0, r11, lsl #8    @ swap bytes
197 #endif
199     sub         r7, r12, #16            @ r7 = Y = (Y' - 16)*74
200     add         r12, r7, r7, asl #2     @
201     add         r7, r12, r7, asl #5     @
202                                         @ compute R, G, and B
203     add         r0, r8, r7, asr #8      @ r0  = b = (Y >> 9) + bu
204     add         lr, r9, r7, asr #8      @ lr = r = (Y >> 9) + rv
205     add         r7, r10, r7, asr #7     @ r7  = g = (Y >> 8) + guv
206                                         @
207     orr         r12, r0, lr             @ check if clamping is needed...
208     orr         r12, r12, r7, asr #1    @ ...at all
209     cmp         r12, #31                @
210     bls         15f @ no clamp          @
211     cmp         r0, #31                 @ clamp b
212     mvnhi       r0, r0, asr #31         @
213     andhi       r0, r0, #31             @
214     cmp         lr, #31                 @ clamp r
215     mvnhi       lr, lr, asr #31         @
216     andhi       lr, lr, #31             @
217     cmp         r7, #63                 @ clamp g
218     mvnhi       r7, r7, asr #31         @
219     andhi       r7, r7, #63             @
220 15: @ no clamp                          @
221                                         @
222     orr         r0, r0, lr, lsl #11     @ r0 = (r << 11) | b
223     orr         r0, r0, r7, lsl #5      @ r0 = (r << 11) | (g << 5) | b
224     
225 #ifdef SANSA_FUZEV2
226     mov         r7, r0, lsr #8          @
227     bic         r0, r0, #0xff00         @ clear high byte before shifting it up
228     orr         r0, r7, r0, lsl #8      @ swap bytes
229 #endif
231     orr         r0, r11, r0, lsl#16     @ pack with 2nd pixel
232     str         r0, [r3, #0x10]         @ write pixel
233                                         @
234     subs        r1, r1, #2              @ subtract block from width
235     bgt         10b @ loop line         @
236                                         @
237 1: @ busy
238     @ writing at max 110*32 (LCD_WIDTH/2), the fifo is bigger
239     @ so polling fifo empty only after each line is safe
240     ldr         r7, [r3,#0xc]           @ r7 = DBOP_STATUS
241     tst         r7, #DBOP_BUSY          @ fifo not empty?
242     beq         1b                      @
244     ldmfd       sp!, { r4-r11, pc }     @ restore registers and return
246     .ltorg                              @ dump constant pool
247     .size   lcd_write_yuv420_lines, .-lcd_write_yuv420_lines
249 /****************************************************************************
250  * void lcd_write_yuv420_lines_odither(unsigned char const * const src[3],
251  *                                     int width,
252  *                                     int stride,
253  *                                     int x_screen,
254  *                                     int y_screen);
256  *   |R|   |1.000000 -0.000001  1.402000| |Y'|
257  *   |G| = |1.000000 -0.334136 -0.714136| |Pb|
258  *   |B|   |1.000000  1.772000  0.000000| |Pr|
259  *   Red scaled at twice g & b but at same precision to place it in correct
260  *   bit position after multiply and leave instruction count lower.
261  *   |R|   |258   0  408| |Y' -  16|
262  *   |G| = |149 -49 -104| |Cb - 128|
263  *   |B|   |149 258    0| |Cr - 128|
265  * Write four dithered RGB565 pixels in the following order on each loop:
266  * 1 3 + > down
267  * 2 4 \/ left
269  * Kernel pattern (raw|rotated|use order):
270  * 5 3 4 2   2 6 3 7     row0   row2          > down
271  * 1 7 0 6 | 4 0 5 1 | 2 4 6 0 3 5 7 1 col0     left
272  * 4 2 5 3 | 3 7 2 6 | 3 5 7 1 2 4 6 0 col2  \/
273  * 0 6 1 7   5 1 4 0
274  */
275     .section    .icode, "ax", %progbits
276     .align      2
277     .global     lcd_write_yuv420_lines_odither
278     .type       lcd_write_yuv420_lines_odither, %function
279 lcd_write_yuv420_lines_odither:
280                                         @ r0 = yuv_src
281                                         @ r1 = width
282                                         @ r2 = stride
283                                         @ r3 = x_screen
284                                         @ [sp] = y_screen
285     stmfd       sp!, { r4-r11, lr }     @ save non-scratch
286     ldmia       r0, { r4, r5, r6 }      @ r4 = yuv_src[0] = Y'_p
287                                         @ r5 = yuv_src[1] = Cb_p
288                                         @ r6 = yuv_src[2] = Cr_p
289                                         @ y_screen is at sp+36 after the 9-reg push
290     ldr         r14, [sp, #36]          @ Line up pattern and kernel quadrant
291     sub         r2, r2, #1              @ stride -= 1
292     eor         r14, r14, r3            @ r14 = y_screen ^ x_screen
293     and         r14, r14, #0x2          @
294     mov         r14, r14, lsl #6        @ 0x00 or 0x80
296     mov         r3, #0xC8000000         @
297     orr         r3, r3, #0x120000       @ r3 = DBOP_BASE, need to be redone
298                                         @ due to lack of registers
299     ldr         r12, [r3, #8]           @ r12 = DBOP_CTRL
300     orr         r12, r12, #3<<13        @ DBOP_CTRL |= (1<<13|1<<14)
301 #ifdef SANSA_FUZEV2
302     bic         r12, r12, #1<<13        @ DBOP_CTRL &= ~(1<<13), still 32bit mode
303 #endif
304     str         r12, [r3, #8]           @ (32bit mode)
305 10: @ loop line                         @
306                                         @ r3 is reused for pixel data from here on
307     ldrb        r7, [r4], #1            @ r7 = *Y'_p++;
308     ldrb        r8, [r5], #1            @ r8 = *Cb_p++;
309     ldrb        r9, [r6], #1            @ r9 = *Cr_p++;
310                                         @
311     eor         r14, r14, #0x80         @ flip pattern quadrant
312                                         @
313     sub         r7, r7, #16             @ r7 = Y = (Y' - 16)*149
314     add         r12, r7, r7, asl #2     @
315     add         r12, r12, r12, asl #4   @
316     add         r7, r12, r7, asl #6     @
317                                         @
318     sub         r8, r8, #128            @ Cb -= 128
319     sub         r9, r9, #128            @ Cr -= 128
320                                         @
321     add         r10, r8, r8, asl #4     @ r10 = guv = Cr*104 + Cb*49
322     add         r10, r10, r8, asl #5    @
323     add         r10, r10, r9, asl #3    @
324     add         r10, r10, r9, asl #5    @
325     add         r10, r10, r9, asl #6    @
326                                         @
327     mov         r8, r8, asl #1          @ r8 = bu = Cb*258
328     add         r8, r8, r8, asl #7      @
329                                         @
330     add         r9, r9, r9, asl #1      @ r9 = rv = Cr*408
331     add         r9, r9, r9, asl #4      @
332     mov         r9, r9, asl #3          @
333                                         @
334                                         @ compute R, G, and B
335     add         r0, r8, r7              @ r0  = b' = Y + bu
336     add         r11, r9, r7, asl #1     @ r11 = r' = Y*2 + rv
337     rsb         r7, r10, r7             @ r7  = g' = Y - guv
338                                         @
339                                         @ r8 = bu, r9 = rv, r10 = guv
340                                         @ (kept live for the other 3 pixels)
341     sub         r12, r0, r0, lsr #5     @ r0 = 31/32*b + b/256
342     add         r0, r12, r0, lsr #8     @
343                                         @
344     sub         r12, r11, r11, lsr #5   @ r11 = 31/32*r + r/256
345     add         r11, r12, r11, lsr #8   @
346                                         @
347     sub         r12, r7, r7, lsr #6     @ r7 = 63/64*g + g/256
348     add         r7, r12, r7, lsr #8     @
349                                         @
350     add         r12, r14, #0x100        @ r12 = delta = quadrant + 0x100
351                                         @
352     add         r0, r0, r12             @ b = r0 + delta
353     add         r11, r11, r12, lsl #1   @ r = r11 + delta*2
354     add         r7, r7, r12, lsr #1     @ g = r7 + delta/2
355                                         @
356     orr         r12, r0, r11, asr #1    @ check if clamping is needed...
357     orr         r12, r12, r7            @ ...at all
358     movs        r12, r12, asr #15       @
359     beq         15f @ no clamp          @
360     movs        r12, r0, asr #15        @ clamp b
361     mvnne       r0, r12, lsr #15        @
362     andne       r0, r0, #0x7c00         @ mask b only if clamped
363     movs        r12, r11, asr #16       @ clamp r
364     mvnne       r11, r12, lsr #16       @
365     movs        r12, r7, asr #15        @ clamp g
366     mvnne       r7, r12, lsr #15        @
367 15: @ no clamp                          @
368                                         @
369     ldrb        r12, [r4, r2]           @ r12 = Y' = *(Y'_p + stride)
370                                         @
371     and         r11, r11, #0xf800       @ pack pixel
372     and         r7, r7, #0x7e00         @ r3 = pixel = (r & 0xf800) |
373     orr         r11, r11, r7, lsr #4    @              ((g & 0x7e00) >> 4) |
374     orr         r3, r11, r0, lsr #10    @              (b >> 10)
375 #ifdef SANSA_FUZEV2
376     mov         r7, r3, lsr #8          @
377     bic         r3, r3, #0xff00         @
378     orr         r3, r7, r3, lsl #8      @ swap pixel
379 #endif
380                                         @ save pixel (r3 = 1st pixel of the pair)
381     sub         r7, r12, #16            @ r7 = Y = (Y' - 16)*149
382     add         r12, r7, r7, asl #2     @
383     add         r12, r12, r12, asl #4   @
384     add         r7, r12, r7, asl #6     @
385                                         @ compute R, G, and B
386     add         r0, r8, r7              @ r0  = b' = Y + bu
387     add         r11, r9, r7, asl #1     @ r11 = r' = Y*2 + rv
388     rsb         r7, r10, r7             @ r7  = g' = Y - guv
389                                         @
390     sub         r12, r0, r0, lsr #5     @ r0  = 31/32*b' + b'/256
391     add         r0, r12, r0, lsr #8     @
392                                         @
393     sub         r12, r11, r11, lsr #5   @ r11 = 31/32*r' + r'/256
394     add         r11, r12, r11, lsr #8   @
395                                         @
396     sub         r12, r7, r7, lsr #6     @ r7  = 63/64*g' + g'/256
397     add         r7, r12, r7, lsr #8     @
398                                         @
399     add         r12, r14, #0x200        @ r12 = delta = quadrant + 0x200
400                                         @
401     add         r0, r0, r12             @ b = r0 + delta
402     add         r11, r11, r12, lsl #1   @ r = r11 + delta*2
403     add         r7, r7, r12, lsr #1     @ g = r7 + delta/2
404                                         @
405     orr         r12, r0, r11, asr #1    @ check if clamping is needed...
406     orr         r12, r12, r7            @ ...at all
407     movs        r12, r12, asr #15       @
408     beq         15f @ no clamp          @
409     movs        r12, r0, asr #15        @ clamp b
410     mvnne       r0, r12, lsr #15        @
411     andne       r0, r0, #0x7c00         @ mask b only if clamped
412     movs        r12, r11, asr #16       @ clamp r
413     mvnne       r11, r12, lsr #16       @
414     movs        r12, r7, asr #15        @ clamp g
415     mvnne       r7, r12, lsr #15        @
416 15: @ no clamp                          @
417                                         @
418     ldrb        r12, [r4], #1           @ r12 = Y' = *(Y'_p++)
420     and         r11, r11, #0xf800       @ pack pixel
421     and         r7, r7, #0x7e00         @ r0 = pixel = (r & 0xf800) |
422     orr         r11, r11, r7, lsr #4    @              ((g & 0x7e00) >> 4) |
423     orr         r0, r11, r0, lsr #10    @              (b >> 10)
424 #ifdef SANSA_FUZEV2
425     mov         r7, r0, lsr #8          @
426     bic         r0, r0, #0xff00         @
427     orr         r0, r7, r0, lsl #8      @ swap pixel
428 #endif
429     orr         r3, r3, r0, lsl#16      @ pack with 2nd pixel
430     mov         r0, #0xC8000000         @
431     orr         r0, r0, #0x120000       @ r0 = DBOP_BASE
433     str         r3, [r0, #0x10]         @ write pixel
434                                         @
435     sub         r7, r12, #16            @ r7 = Y = (Y' - 16)*149
436     add         r12, r7, r7, asl #2     @
437     add         r12, r12, r12, asl #4   @
438     add         r7, r12, r7, asl #6     @
439                                         @ compute R, G, and B
440     add         r0, r8, r7              @ r0  = b' = Y + bu
441     add         r11, r9, r7, asl #1     @ r11 = r' = Y*2 + rv
442     rsb         r7, r10, r7             @ r7  = g' = Y - guv
443                                         @
444                                         @ r8 = bu, r9 = rv, r10 = guv
445                                         @
446     sub         r12, r0, r0, lsr #5     @ r0  = 31/32*b' + b'/256
447     add         r0, r12, r0, lsr #8     @
448                                         @
449     sub         r12, r11, r11, lsr #5   @ r11 = 31/32*r' + r'/256
450     add         r11, r12, r11, lsr #8   @
451                                         @
452     sub         r12, r7, r7, lsr #6     @ r7  = 63/64*g' + g'/256
453     add         r7, r12, r7, lsr #8     @
454                                         @
455     add         r12, r14, #0x300        @ r12 = delta = quadrant + 0x300
456                                         @
457     add         r0, r0, r12             @ b = r0 + delta
458     add         r11, r11, r12, lsl #1   @ r = r11 + delta*2
459     add         r7, r7, r12, lsr #1     @ g = r7 + delta/2
460                                         @
461     orr         r12, r0, r11, asr #1    @ check if clamping is needed...
462     orr         r12, r12, r7            @ ...at all
463     movs        r12, r12, asr #15       @
464     beq         15f @ no clamp          @
465     movs        r12, r0, asr #15        @ clamp b
466     mvnne       r0, r12, lsr #15        @
467     andne       r0, r0, #0x7c00         @ mask b only if clamped
468     movs        r12, r11, asr #16       @ clamp r
469     mvnne       r11, r12, lsr #16       @
470     movs        r12, r7, asr #15        @ clamp g
471     mvnne       r7, r12, lsr #15        @
472 15: @ no clamp                          @
473                                         @
474     ldrb        r12, [r4, r2]           @ r12 = Y' = *(Y'_p + stride)
475                                         @
476     and         r11, r11, #0xf800       @ pack pixel
477     and         r7, r7, #0x7e00         @ r3 = pixel = (r & 0xf800) |
478     orr         r11, r11, r7, lsr #4    @              ((g & 0x7e00) >> 4) |
479     orr         r3, r11, r0, lsr #10    @              (b >> 10)
480 #ifdef SANSA_FUZEV2
481     mov         r7, r3, lsr #8          @
482     bic         r3, r3, #0xff00         @
483     orr         r3, r7, r3, lsl #8      @ swap pixel
484 #endif
485                                         @ save pixel (r3 = 1st pixel of the pair)
486                                         @
487     sub         r7, r12, #16            @ r7 = Y = (Y' - 16)*149
488     add         r12, r7, r7, asl #2     @
489     add         r12, r12, r12, asl #4   @
490     add         r7, r12, r7, asl #6     @
491                                         @ compute R, G, and B
492     add         r0, r8, r7              @ r0  = b' = Y + bu
493     add         r11, r9, r7, asl #1     @ r11 = r' = Y*2 + rv
494     rsb         r7, r10, r7             @ r7  = g' = Y - guv
495                                         @
496     sub         r12, r0, r0, lsr #5     @ r0 = 31/32*b + b/256
497     add         r0, r12, r0, lsr #8     @
498                                         @
499     sub         r12, r11, r11, lsr #5   @ r11 = 31/32*r + r/256
500     add         r11, r12, r11, lsr #8   @
501                                         @
502     sub         r12, r7, r7, lsr #6     @ r7 = 63/64*g + g/256
503     add         r7, r12, r7, lsr #8     @
504                                         @
505     @ This element is zero - use r14    @
506                                         @
507     add         r0, r0, r14             @ b = r0 + delta
508     add         r11, r11, r14, lsl #1   @ r = r11 + delta*2
509     add         r7, r7, r14, lsr #1     @ g = r7 + delta/2
510                                         @
511     orr         r12, r0, r11, asr #1    @ check if clamping is needed...
512     orr         r12, r12, r7            @ ...at all
513     movs        r12, r12, asr #15       @
514     beq         15f @ no clamp          @
515     movs        r12, r0, asr #15        @ clamp b
516     mvnne       r0, r12, lsr #15        @
517     andne       r0, r0, #0x7c00         @ mask b only if clamped
518     movs        r12, r11, asr #16       @ clamp r
519     mvnne       r11, r12, lsr #16       @
520     movs        r12, r7, asr #15        @ clamp g
521     mvnne       r7, r12, lsr #15        @
522 15: @ no clamp                          @
523                                         @
524     and         r11, r11, #0xf800       @ pack pixel
525     and         r7, r7, #0x7e00         @ r0 = pixel = (r & 0xf800) |
526     orr         r11, r11, r7, lsr #4    @              ((g & 0x7e00) >> 4) |
527     orr         r0, r11, r0, lsr #10    @              (b >> 10)
528 #ifdef SANSA_FUZEV2
529     mov         r7, r0, lsr #8          @
530     bic         r0, r0, #0xff00         @
531     orr         r0, r7, r0, lsl #8      @ swap pixel
532 #endif
533     orr         r3, r3, r0, lsl#16      @ pack with 2nd pixel
534     mov         r0, #0xC8000000         @
535     orr         r0, r0, #0x120000       @ r0 = DBOP_BASE
537     str         r3, [r0, #0x10]         @ write pixel
538                                         @
539     subs        r1, r1, #2              @ subtract block from width
540     bgt         10b @ loop line         @
541                                         @ r0 still = DBOP_BASE from the last write
542 1: @ busy                               @
543     @ writing at max 110*32 (LCD_WIDTH/2), the fifo is bigger (128*32)
544     @ so polling fifo empty only after each line is safe
545     ldr         r7, [r0,#0xc]           @ r7 = DBOP_STATUS
546     tst         r7, #DBOP_BUSY          @ fifo not empty?
547     beq         1b                      @
549     ldmfd       sp!, { r4-r11, pc }     @ restore registers and return
550     .ltorg                              @ dump constant pool
551     .size   lcd_write_yuv420_lines_odither, .-lcd_write_yuv420_lines_odither