fix YUV Dither for the other targets too, like in r26064.
[kugel-rb.git] / firmware / target / arm / pbell / vibe500 / lcd-as-vibe500.S
blobd5d51575a7ec3042d4ed3c4995f13e500fd70794
1 /***************************************************************************
2  *             __________               __   ___.
3  *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
4  *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
5  *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
6  *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
7  *                     \/            \/     \/    \/            \/
8  * $Id:$
9  *
10  * Copyright (C) 2007-2008 by Michael Sevakis
11  * Adapted for the Packard Bell Vibe 500 by Szymon Dziok
12  * 
13  * Packard Bell Vibe 500 LCD assembly routines
14  *
15  * This program is free software; you can redistribute it and/or
16  * modify it under the terms of the GNU General Public License
17  * as published by the Free Software Foundation; either version 2
18  * of the License, or (at your option) any later version.
19  *
20  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
21  * KIND, either express or implied.
22  *
23  ****************************************************************************/
25 #include "config.h"
26 #include "cpu.h"
28 /****************************************************************************
29  * void lcd_write_yuv420_lines(unsigned char const * const src[3],
30  *                             int width,
31  *                             int stride);
32  *
33  *   |R|   |1.000000 -0.000001  1.402000| |Y'|
34  *   |G| = |1.000000 -0.334136 -0.714136| |Pb|
35  *   |B|   |1.000000  1.772000  0.000000| |Pr|
36  *   Scaled, normalized, rounded and tweaked to yield RGB 565:
37  *   |R|   |74   0 101| |Y' -  16| >> 9
38  *   |G| = |74 -24 -51| |Cb - 128| >> 8
39  *   |B|   |74 128   0| |Cr - 128| >> 9
40  *
41  * Write four RGB565 pixels in the following order on each loop:
42  * 1 3 + > down
43  * 2 4 \/ left
44  */
45     .section    .icode, "ax", %progbits
46     .align      2
47     .global     lcd_write_yuv420_lines
48     .type       lcd_write_yuv420_lines, %function
49 lcd_write_yuv420_lines:
    @ Converts YUV 4:2:0 samples to RGB565 and writes them to the LCD
    @ through the register block at LCD1_BASE. Every pass of loop 10 reads
    @ one Cb sample, one Cr sample and a 2x2 block of Y' samples (two from
    @ the current line, two from the line 'stride' bytes further on) and
    @ emits four pixels. The loop body always runs at least once and
    @ repeats while the remaining width is still > 0 after subtracting 2.
    @
    @ Register roles inside the loop:
    @   r0            = LCD1_BASE ([r0] is polled against LCD1_BUSY_MASK,
    @                   pixel data is stored to [r0, #0x10])
    @   r1            = remaining width
    @   r2            = stride - 1 (Y'_p is already post-incremented when
    @                   the next-line sample is fetched)
    @   r4, r5, r6    = Y', Cb, Cr read pointers
    @   r8, r9, r10   = bu, rv, guv chroma terms shared by all four pixels
    @   r3, r7, r11   = b, g, r of the pixel being built
    @   r12           = temporary / prefetched Y' of the following pixel
    @
    @ The numeric labels 15, 20 and 25 are defined several times; every
    @ 15f / 20b / 25b refers to the nearest definition in that direction.
50                                         @ r0 = yuv_src
51                                         @ r1 = width
52                                         @ r2 = stride
53     stmfd       sp!, { r4-r11, lr }     @ save non-scratch
54     ldmia       r0, { r4, r5, r6 }      @ r4 = yuv_src[0] = Y'_p
55                                         @ r5 = yuv_src[1] = Cb_p
56                                         @ r6 = yuv_src[2] = Cr_p
57                                         @
58     ldr         r0, =LCD1_BASE          @ r0 is free now that src[] is loaded
59                                         @
60     sub         r2, r2, #1              @ Adjust stride because of increment
61 10: @ loop line                         @
62     ldrb        r7, [r4], #1            @ r7 = *Y'_p++;
63     ldrb        r8, [r5], #1            @ r8 = *Cb_p++;
64     ldrb        r9, [r6], #1            @ r9 = *Cr_p++;
65                                         @
66     sub         r7, r7, #16             @ r7 = Y = (Y' - 16)*74
67     add         r12, r7, r7, asl #2     @ actually (Y' - 16)*37 and shift right
68     add         r7, r12, r7, asl #5     @ by one less when adding - same for all
69                                         @
70     sub         r8, r8, #128            @ Cb -= 128
71     sub         r9, r9, #128            @ Cr -= 128
72                                         @
73     add         r10, r9, r9, asl #1     @ r10 = Cr*51 + Cb*24
74     add         r10, r10, r10, asl #4   @   (Cr*3)*17 = Cr*51
75     add         r10, r10, r8, asl #3    @   + Cb*8
76     add         r10, r10, r8, asl #4    @   + Cb*16
77                                         @
78     add         r11, r9, r9, asl #2     @ r9 = Cr*101
79     add         r11, r11, r9, asl #5    @   Cr*5 + Cr*32
80     add         r9, r11, r9, asl #6     @   + Cr*64
81                                         @
82     add         r8, r8, #2              @ r8 = bu = (Cb*128 + 256) >> 9
83     mov         r8, r8, asr #2          @   evaluated as (Cb + 2) >> 2
84     add         r9, r9, #256            @ r9 = rv = (r9 + 256) >> 9
85     mov         r9, r9, asr #9          @
86     rsb         r10, r10, #128          @ r10 = guv = (-r10 + 128) >> 8
87     mov         r10, r10, asr #8        @
88                                         @ compute R, G, and B
89     add         r3, r8, r7, asr #8      @ r3 = b = (Y >> 9) + bu
90     add         r11, r9, r7, asr #8     @ r11 = r = (Y >> 9) + rv
91     add         r7, r10, r7, asr #7     @ r7  = g = (Y >> 8) + guv
92                                         @
93     orr         r12, r3, r11            @ check if clamping is needed...
94     orr         r12, r12, r7, asr #1    @ ...at all
95     cmp         r12, #31                @ unsigned test: negatives count as > 31
96     bls         15f @ no clamp          @
97     cmp         r3, #31                 @ clamp b
98     mvnhi       r3, r3, asr #31         @   negative -> 0, positive -> ~0
99     andhi       r3, r3, #31             @   giving 0 or 31
100     cmp         r11, #31                @ clamp r
101     mvnhi       r11, r11, asr #31       @
102     andhi       r11, r11, #31           @
103     cmp         r7, #63                 @ clamp g
104     mvnhi       r7, r7, asr #31         @
105     andhi       r7, r7, #63             @
106 15: @ no clamp                          @
107                                         @
108     ldrb        r12, [r4, r2]           @ r12 = Y' = *(Y'_p + stride)
109                                         @
110     orr         r3, r3, r11, lsl #11    @ r3 = b | (r << 11)
111     orr         r3, r3, r7, lsl #5      @ r3 |= (g << 5)
112                                         @
113     movs         r7, r3, lsr #8         @ store pixel: r7 = pixel >> 8
114 20:                                     @ spin while LCD1_BUSY_MASK is set
115     ldr         r11, [r0]               @
116     tst         r11, #LCD1_BUSY_MASK    @
117     bne         20b                     @
118     str         r7, [r0, #0x10]         @ upper byte goes out first
119 25:                                     @ spin again before the second store
120     ldr         r11, [r0]               @
121     tst         r11, #LCD1_BUSY_MASK    @
122     bne         25b                     @
123     str         r3, [r0, #0x10]         @ full pixel word; presumably only the
                                        @ low byte is used - TODO confirm
124                                         @
125     sub         r7, r12, #16            @ r7 = Y = (Y' - 16)*74
126     add         r12, r7, r7, asl #2     @   (really *37, see first pixel)
127     add         r7, r12, r7, asl #5     @
128                                         @ compute R, G, and B
129     add         r3, r8, r7, asr #8      @ r3  = b = (Y >> 9) + bu
130     add         r11, r9, r7, asr #8     @ r11 = r = (Y >> 9) + rv
131     add         r7, r10, r7, asr #7     @ r7  = g = (Y >> 8) + guv
132                                         @
133     orr         r12, r3, r11            @ check if clamping is needed...
134     orr         r12, r12, r7, asr #1    @ ...at all
135     cmp         r12, #31                @
136     bls         15f @ no clamp          @
137     cmp         r3, #31                 @ clamp b
138     mvnhi       r3, r3, asr #31         @
139     andhi       r3, r3, #31             @
140     cmp         r11, #31                @ clamp r
141     mvnhi       r11, r11, asr #31       @
142     andhi       r11, r11, #31           @
143     cmp         r7, #63                 @ clamp g
144     mvnhi       r7, r7, asr #31         @
145     andhi       r7, r7, #63             @
146 15: @ no clamp                          @
147                                         @
148     ldrb        r12, [r4], #1           @ r12 = Y' = *(Y'_p++)
149                                         @
150     orr         r3, r3, r11, lsl #11    @ r3 = b | (r << 11)
151     orr         r3, r3, r7, lsl #5      @ r3 |= (g << 5)
152                                         @
153     movs         r7, r3, lsr #8         @ store pixel: r7 = pixel >> 8
154 20:                                     @
155     ldr         r11, [r0]               @
156     tst         r11, #LCD1_BUSY_MASK    @
157     bne         20b                     @
158     str         r7, [r0, #0x10]         @ upper byte
159 25:                                     @
160     ldr         r11, [r0]               @
161     tst         r11, #LCD1_BUSY_MASK    @
162     bne         25b                     @
163     str         r3, [r0, #0x10]         @ full pixel word
164                                         @
165     sub         r7, r12, #16            @ r7 = Y = (Y' - 16)*74
166     add         r12, r7, r7, asl #2     @
167     add         r7, r12, r7, asl #5     @
168                                         @ compute R, G, and B
169     add         r3, r8, r7, asr #8      @ r3  = b = (Y >> 9) + bu
170     add         r11, r9, r7, asr #8     @ r11 = r = (Y >> 9) + rv
171     add         r7, r10, r7, asr #7     @ r7  = g = (Y >> 8) + guv
172                                         @
173     orr         r12, r3, r11            @ check if clamping is needed...
174     orr         r12, r12, r7, asr #1    @ ...at all
175     cmp         r12, #31                @
176     bls         15f @ no clamp          @
177     cmp         r3, #31                 @ clamp b
178     mvnhi       r3, r3, asr #31         @
179     andhi       r3, r3, #31             @
180     cmp         r11, #31                @ clamp r
181     mvnhi       r11, r11, asr #31       @
182     andhi       r11, r11, #31           @
183     cmp         r7, #63                 @ clamp g
184     mvnhi       r7, r7, asr #31         @
185     andhi       r7, r7, #63             @
186 15: @ no clamp                          @
187                                         @
188     ldrb        r12, [r4, r2]           @ r12 = Y' = *(Y'_p + stride)
189                                         @
190     orr         r3, r3, r7, lsl #5      @ r3 = b | (g << 5)
191     orr         r3, r3, r11, lsl #11    @ r3 |= (r << 11)
192                                         @
193     movs         r7, r3, lsr #8         @ store pixel: r7 = pixel >> 8
194 20:                                     @
195     ldr         r11, [r0]               @
196     tst         r11, #LCD1_BUSY_MASK    @
197     bne         20b                     @
198     str         r7, [r0, #0x10]         @ upper byte
199 25:                                     @
200     ldr         r11, [r0]               @
201     tst         r11, #LCD1_BUSY_MASK    @
202     bne         25b                     @
203     str         r3, [r0, #0x10]         @ full pixel word
204                                         @
205     sub         r7, r12, #16            @ r7 = Y = (Y' - 16)*74
206     add         r12, r7, r7, asl #2     @
207     add         r7, r12, r7, asl #5     @
208                                         @ compute R, G, and B
209     add         r3, r8, r7, asr #8      @ r3  = b = (Y >> 9) + bu
210     add         r11, r9, r7, asr #8     @ r11 = r = (Y >> 9) + rv
211     add         r7, r10, r7, asr #7     @ r7  = g = (Y >> 8) + guv
212                                         @
213     orr         r12, r3, r11            @ check if clamping is needed...
214     orr         r12, r12, r7, asr #1    @ ...at all
215     cmp         r12, #31                @
216     bls         15f @ no clamp          @
217     cmp         r3, #31                 @ clamp b
218     mvnhi       r3, r3, asr #31         @
219     andhi       r3, r3, #31             @
220     cmp         r11, #31                @ clamp r
221     mvnhi       r11, r11, asr #31       @
222     andhi       r11, r11, #31           @
223     cmp         r7, #63                 @ clamp g
224     mvnhi       r7, r7, asr #31         @
225     andhi       r7, r7, #63             @
226 15: @ no clamp                          @
227                                         @ (last pixel: no further Y' to fetch)
228     orr         r3, r3, r11, lsl #11    @ r3 = b | (r << 11)
229     orr         r3, r3, r7, lsl #5      @ r3 |= (g << 5)
230                                         @
231     movs         r7, r3, lsr #8         @ store pixel: r7 = pixel >> 8
232 20:                                     @
233     ldr         r11, [r0]               @
234     tst         r11, #LCD1_BUSY_MASK    @
235     bne         20b                     @
236     str         r7, [r0, #0x10]         @ upper byte
237 25:                                     @
238     ldr         r11, [r0]               @
239     tst         r11, #LCD1_BUSY_MASK    @
240     bne         25b                     @
241     str         r3, [r0, #0x10]         @ full pixel word
242                                         @
243     subs        r1, r1, #2              @ subtract block from width
244     bgt         10b @ loop line         @
245                                         @
246     ldmfd       sp!, { r4-r11, pc }     @ restore registers and return
247     .ltorg                              @ dump constant pool
248     .size   lcd_write_yuv420_lines, .-lcd_write_yuv420_lines
251 /****************************************************************************
252  * void lcd_write_yuv420_lines_odither(unsigned char const * const src[3],
253  *                                     int width,
254  *                                     int stride,
255  *                                     int x_screen,
256  *                                     int y_screen);
258  *   |R|   |1.000000 -0.000001  1.402000| |Y'|
259  *   |G| = |1.000000 -0.334136 -0.714136| |Pb|
260  *   |B|   |1.000000  1.772000  0.000000| |Pr|
261  *   Red scaled at twice g & b but at same precision to place it in correct
262  *   bit position after multiply and leave instruction count lower.
263  *   |R|   |258   0  408| |Y' -  16|
264  *   |G| = |149 -49 -104| |Cb - 128|
265  *   |B|   |149 258    0| |Cr - 128|
267  * Write four RGB565 pixels in the following order on each loop:
268  * 1 3 + > down
269  * 2 4 \/ left
271  * Kernel pattern (raw|use order):
272  * 5 3 4 2     row0    row2         > down
273  * 1 7 0 6 | 5 1 3 7 4 0 2 6 col0     left
274  * 4 2 5 3 | 4 0 2 6 5 1 3 7 col2  \/
275  * 0 6 1 7
276  */
277     .section    .icode, "ax", %progbits
278     .align      2
279     .global     lcd_write_yuv420_lines_odither
280     .type       lcd_write_yuv420_lines_odither, %function
281 lcd_write_yuv420_lines_odither:
    @ Converts YUV 4:2:0 samples to RGB565 with an ordered-dither offset
    @ added to every component before truncation, and writes the pixels to
    @ the LCD through the register block at LCD1_BASE. Every pass of loop
    @ 10 reads one Cb sample, one Cr sample and a 2x2 block of Y' samples
    @ (two from the current line, two from the line 'stride' bytes further
    @ on) and emits four pixels. The loop body always runs at least once.
    @
    @ Register roles inside the loop:
    @   r0            = LCD1_BASE ([r0] is polled against LCD1_BUSY_MASK,
    @                   pixel data is stored to [r0, #0x10])
    @   r1            = remaining width
    @   r2            = stride - 1
    @   r4, r5, r6    = Y', Cb, Cr read pointers
    @   r8, r9, r10   = bu, rv, guv chroma terms shared by all four pixels
    @   r3, r7, r11   = b, g, r of the pixel being built
    @   r12           = temporary / dither delta / prefetched Y'
    @   r14           = dither quadrant offset, 0x00 or 0x80, toggled on
    @                   every pass (lr is saved on entry, so it is free)
    @
    @ Per-pixel delta = r14 + 0x200, r14, r14 + 0x100, r14 + 0x300 for the
    @ four pixels in output order; b gets delta, r delta*2, g delta/2.
    @ b and g are in range when (value asr #15) == 0, r when
    @ (value asr #16) == 0.
    @
    @ The numeric labels 15, 20 and 25 are defined several times; every
    @ 15f / 20b / 25b refers to the nearest definition in that direction.
282                                         @ r0   = yuv_src
283                                         @ r1   = width
284                                         @ r2   = stride
285                                         @ r3   = x_screen
286                                         @ [sp] = y_screen
287     stmfd       sp!, { r4-r11, lr }     @ save non-scratch
288     ldmia       r0, { r4, r5, r6 }      @ r4 = yuv_src[0] = Y'_p
289                                         @ r5 = yuv_src[1] = Cb_p
290                                         @ r6 = yuv_src[2] = Cr_p
291                                         @
292     ldr         r0, [sp, #36]           @ Line up pattern and kernel quadrant
293     eor         r14, r3, r0             @   r0 = y_screen (above 9 saved words)
294     and         r14, r14, #0x2          @   r14 = (x_screen ^ y_screen) & 2
295     mov         r14, r14, lsl #6        @ 0x00 or 0x80
296                                         @
297     ldr         r0, =LCD1_BASE          @
298                                         @
299     sub         r2, r2, #1              @ Adjust stride because of increment
300 10: @ loop line                         @
301                                         @
302     ldrb        r7, [r4], #1            @ r7 = *Y'_p++;
303     ldrb        r8, [r5], #1            @ r8 = *Cb_p++;
304     ldrb        r9, [r6], #1            @ r9 = *Cr_p++;
305                                         @
306     eor         r14, r14, #0x80         @ flip pattern quadrant
307                                         @
308     sub         r7, r7, #16             @ r7 = Y = (Y' - 16)*149
309     add         r12, r7, r7, asl #2     @   *5
310     add         r12, r12, r12, asl #4   @   *85
311     add         r7, r12, r7, asl #6     @   *85 + *64
312                                         @
313     sub         r8, r8, #128            @ Cb -= 128
314     sub         r9, r9, #128            @ Cr -= 128
315                                         @
316     add         r10, r8, r8, asl #4     @ r10 = guv = Cr*104 + Cb*49
317     add         r10, r10, r8, asl #5    @   Cb*17 + Cb*32
318     add         r10, r10, r9, asl #3    @   + Cr*8
319     add         r10, r10, r9, asl #5    @   + Cr*32
320     add         r10, r10, r9, asl #6    @   + Cr*64
321                                         @
322     mov         r8, r8, asl #1          @ r8 = bu = Cb*258
323     add         r8, r8, r8, asl #7      @   (Cb*2)*129
324                                         @
325     add         r9, r9, r9, asl #1      @ r9 = rv = Cr*408
326     add         r9, r9, r9, asl #4      @   (Cr*3)*17 = Cr*51
327     mov         r9, r9, asl #3          @   *8
328                                         @
329                                         @ compute R, G, and B
330     add         r3, r8, r7              @ r3  = b' = Y + bu
331     add         r11, r9, r7, asl #1     @ r11 = r' = Y*2 + rv
332     rsb         r7, r10, r7             @ r7  = g' = Y - guv
333                                         @
334                                         @ r8 = bu, r9 = rv, r10 = guv
335                                         @
336     sub         r12, r3, r3, lsr #5     @ r3 = 31/32*b + b/256
337     add         r3, r12, r3, lsr #8     @
338                                         @
339     sub         r12, r11, r11, lsr #5   @ r11 = 31/32*r + r/256
340     add         r11, r12, r11, lsr #8   @
341                                         @
342     sub         r12, r7, r7, lsr #6     @ r7 = 63/64*g + g/256
343     add         r7, r12, r7, lsr #8     @
344                                         @
345     add         r12, r14, #0x200        @ r12 = delta for the first pixel
346                                         @
347     add         r3, r3, r12             @ b = r3 + delta
348     add         r11, r11, r12, lsl #1   @ r = r11 + delta*2
349     add         r7, r7, r12, lsr #1     @ g = r7 + delta/2
350                                         @
351     orr         r12, r3, r11, asr #1    @ check if clamping is needed...
352     orr         r12, r12, r7            @ ...at all
353     movs        r12, r12, asr #15       @ any bit at or above 15 set?
354     beq         15f @ no clamp          @
355     movs        r12, r3, asr #15        @ clamp b
356     mvnne       r3, r12, lsr #15        @   negative -> low bits 0, else ~0
357     andne       r3, r3, #0x7c00         @ mask b only if clamped
358     movs        r12, r11, asr #16       @ clamp r
359     mvnne       r11, r12, lsr #16       @   masked with 0xf800 when packing
360     movs        r12, r7, asr #15        @ clamp g
361     mvnne       r7, r12, lsr #15        @   masked with 0x7e00 when packing
362 15: @ no clamp                          @
363                                         @
364     ldrb        r12, [r4, r2]           @ r12 = Y' = *(Y'_p + stride)
365                                         @
366     and         r11, r11, #0xf800       @ pack pixel
367     and         r7, r7, #0x7e00         @ r3 = pixel = (r & 0xf800) |
368     orr         r11, r11, r7, lsr #4    @              ((g & 0x7e00) >> 4) |
369     orr         r3, r11, r3, lsr #10    @              (b >> 10)
370                                         @
371     movs         r7, r3, lsr #8         @ store pixel: r7 = pixel >> 8
372 20:                                     @ spin while LCD1_BUSY_MASK is set
373     ldr         r11, [r0]               @
374     tst         r11, #LCD1_BUSY_MASK    @
375     bne         20b                     @
376     str         r7, [r0, #0x10]         @ upper byte goes out first
377 25:                                     @ spin again before the second store
378     ldr         r11, [r0]               @
379     tst         r11, #LCD1_BUSY_MASK    @
380     bne         25b                     @
381     str         r3, [r0, #0x10]         @ full pixel word; presumably only the
                                        @ low byte is used - TODO confirm
382                                         @
383     sub         r7, r12, #16            @ r7 = Y = (Y' - 16)*149
384     add         r12, r7, r7, asl #2     @
385     add         r12, r12, r12, asl #4   @
386     add         r7, r12, r7, asl #6     @
387                                         @ compute R, G, and B
388     add         r3, r8, r7              @ r3  = b' = Y + bu
389     add         r11, r9, r7, asl #1     @ r11 = r' = Y*2 + rv
390     rsb         r7, r10, r7             @ r7  = g' = Y - guv
391                                         @
392     sub         r12, r3, r3, lsr #5     @ r3  = 31/32*b' + b'/256
393     add         r3, r12, r3, lsr #8     @
394                                         @
395     sub         r12, r11, r11, lsr #5   @ r11 = 31/32*r' + r'/256
396     add         r11, r12, r11, lsr #8   @
397                                         @
398     sub         r12, r7, r7, lsr #6     @ r7  = 63/64*g' + g'/256
399     add         r7, r12, r7, lsr #8     @
400                                         @
401     @ This element is zero - use r14    @
402                                         @
403     add         r3, r3, r14             @ b = r3 + delta
404     add         r11, r11, r14, lsl #1   @ r = r11 + delta*2
405     add         r7, r7, r14, lsr #1     @ g = r7 + delta/2
406                                         @
407     orr         r12, r3, r11, asr #1    @ check if clamping is needed...
408     orr         r12, r12, r7            @ ...at all
409     movs        r12, r12, asr #15       @
410     beq         15f @ no clamp          @
411     movs        r12, r3, asr #15        @ clamp b
412     mvnne       r3, r12, lsr #15        @
413     andne       r3, r3, #0x7c00         @ mask b only if clamped
414     movs        r12, r11, asr #16       @ clamp r
415     mvnne       r11, r12, lsr #16       @
416     movs        r12, r7, asr #15        @ clamp g
417     mvnne       r7, r12, lsr #15        @
418 15: @ no clamp                          @
419                                         @
420     ldrb        r12, [r4], #1           @ r12 = Y' = *(Y'_p++)
421                                         @
422     and         r11, r11, #0xf800       @ pack pixel
423     and         r7, r7, #0x7e00         @ r3 = pixel = (r & 0xf800) |
424     orr         r11, r11, r7, lsr #4    @              ((g & 0x7e00) >> 4) |
425     orr         r3, r11, r3, lsr #10    @              (b >> 10)
426                                         @
427     movs         r7, r3, lsr #8         @ store pixel: r7 = pixel >> 8
428 20:                                     @
429     ldr         r11, [r0]               @
430     tst         r11, #LCD1_BUSY_MASK    @
431     bne         20b                     @
432     str         r7, [r0, #0x10]         @ upper byte
433 25:                                     @
434     ldr         r11, [r0]               @
435     tst         r11, #LCD1_BUSY_MASK    @
436     bne         25b                     @
437     str         r3, [r0, #0x10]         @ full pixel word
438                                         @
439     sub         r7, r12, #16            @ r7 = Y = (Y' - 16)*149
440     add         r12, r7, r7, asl #2     @
441     add         r12, r12, r12, asl #4   @
442     add         r7, r12, r7, asl #6     @
443                                         @ compute R, G, and B
444     add         r3, r8, r7              @ r3  = b' = Y + bu
445     add         r11, r9, r7, asl #1     @ r11 = r' = Y*2 + rv
446     rsb         r7, r10, r7             @ r7  = g' = Y - guv
447                                         @
448                                         @ r8 = bu, r9 = rv, r10 = guv
449                                         @
450     sub         r12, r3, r3, lsr #5     @ r3  = 31/32*b' + b'/256
451     add         r3, r12, r3, lsr #8     @
452                                         @
453     sub         r12, r11, r11, lsr #5   @ r11 = 31/32*r' + r'/256
454     add         r11, r12, r11, lsr #8   @
455                                         @
456     sub         r12, r7, r7, lsr #6     @ r7  = 63/64*g' + g'/256
457     add         r7, r12, r7, lsr #8     @
458                                         @
459     add         r12, r14, #0x100        @ r12 = delta for the third pixel
460                                         @
461     add         r3, r3, r12             @ b = r3 + delta
462     add         r11, r11, r12, lsl #1   @ r = r11 + delta*2
463     add         r7, r7, r12, lsr #1     @ g = r7 + delta/2
464                                         @
465     orr         r12, r3, r11, asr #1    @ check if clamping is needed...
466     orr         r12, r12, r7            @ ...at all
467     movs        r12, r12, asr #15       @
468     beq         15f @ no clamp          @
469     movs        r12, r3, asr #15        @ clamp b
470     mvnne       r3, r12, lsr #15        @
471     andne       r3, r3, #0x7c00         @ mask b only if clamped
472     movs        r12, r11, asr #16       @ clamp r
473     mvnne       r11, r12, lsr #16       @
474     movs        r12, r7, asr #15        @ clamp g
475     mvnne       r7, r12, lsr #15        @
476 15: @ no clamp                          @
477                                         @
478     ldrb        r12, [r4, r2]           @ r12 = Y' = *(Y'_p + stride)
479                                         @
480     and         r11, r11, #0xf800       @ pack pixel
481     and         r7, r7, #0x7e00         @ r3 = pixel = (r & 0xf800) |
482     orr         r11, r11, r7, lsr #4    @              ((g & 0x7e00) >> 4) |
483     orr         r3, r11, r3, lsr #10    @              (b >> 10)
484                                         @
485     movs         r7, r3, lsr #8         @ store pixel: r7 = pixel >> 8
486 20:                                     @
487     ldr         r11, [r0]               @
488     tst         r11, #LCD1_BUSY_MASK    @
489     bne         20b                     @
490     str         r7, [r0, #0x10]         @ upper byte
491 25:                                     @
492     ldr         r11, [r0]               @
493     tst         r11, #LCD1_BUSY_MASK    @
494     bne         25b                     @
495     str         r3, [r0, #0x10]         @ full pixel word
496                                         @
497     sub         r7, r12, #16            @ r7 = Y = (Y' - 16)*149
498     add         r12, r7, r7, asl #2     @
499     add         r12, r12, r12, asl #4   @
500     add         r7, r12, r7, asl #6     @
501                                         @ compute R, G, and B
502     add         r3, r8, r7              @ r3  = b' = Y + bu
503     add         r11, r9, r7, asl #1     @ r11 = r' = Y*2 + rv
504     rsb         r7, r10, r7             @ r7  = g' = Y - guv
505                                         @
506     sub         r12, r3, r3, lsr #5     @ r3 = 31/32*b + b/256
507     add         r3, r12, r3, lsr #8     @
508                                         @
509     sub         r12, r11, r11, lsr #5   @ r11 = 31/32*r + r/256
510     add         r11, r12, r11, lsr #8   @
511                                         @
512     sub         r12, r7, r7, lsr #6     @ r7 = 63/64*g + g/256
513     add         r7, r12, r7, lsr #8     @
514                                         @
515     add         r12, r14, #0x300        @ r12 = delta for the fourth pixel
516                                         @
517     add         r3, r3, r12             @ b = r3 + delta
518     add         r11, r11, r12, lsl #1   @ r = r11 + delta*2
519     add         r7, r7, r12, lsr #1     @ g = r7 + delta/2
520                                         @
521     orr         r12, r3, r11, asr #1    @ check if clamping is needed...
522     orr         r12, r12, r7            @ ...at all
523     movs        r12, r12, asr #15       @
524     beq         15f @ no clamp          @
525     movs        r12, r3, asr #15        @ clamp b
526     mvnne       r3, r12, lsr #15        @
527     andne       r3, r3, #0x7c00         @ mask b only if clamped
528     movs        r12, r11, asr #16       @ clamp r
529     mvnne       r11, r12, lsr #16       @
530     movs        r12, r7, asr #15        @ clamp g
531     mvnne       r7, r12, lsr #15        @
532 15: @ no clamp                          @
533                                         @ (last pixel: no further Y' to fetch)
534     and         r11, r11, #0xf800       @ pack pixel
535     and         r7, r7, #0x7e00         @ r3 = pixel = (r & 0xf800) |
536     orr         r11, r11, r7, lsr #4    @              ((g & 0x7e00) >> 4) |
537     orr         r3, r11, r3, lsr #10    @              (b >> 10)
538                                         @
539     movs         r7, r3, lsr #8         @ store pixel: r7 = pixel >> 8
540 20:                                     @
541     ldr         r11, [r0]               @
542     tst         r11, #LCD1_BUSY_MASK    @
543     bne         20b                     @
544     str         r7, [r0, #0x10]         @ upper byte
545 25:                                     @
546     ldr         r11, [r0]               @
547     tst         r11, #LCD1_BUSY_MASK    @
548     bne         25b                     @
549     str         r3, [r0, #0x10]         @ full pixel word
550                                         @
551     subs        r1, r1, #2              @ subtract block from width
552     bgt         10b @ loop line         @
553                                         @
554     ldmfd       sp!, { r4-r11, pc }     @ restore registers and return
555     .ltorg                              @ dump constant pool
556     .size   lcd_write_yuv420_lines_odither, .-lcd_write_yuv420_lines_odither