Hopefully fix FS#8506 (OF can't be loaded on some PP targets). Also hopefully fixes...
[Rockbox.git] / firmware / target / arm / lcd-as-memframe.S
blob20950c8a4b3ee09bbde3ab5dc0a29da10775e59b
1 /***************************************************************************
2  *             __________               __   ___.
3  *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
4  *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
5  *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
6  *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
7  *                     \/            \/     \/    \/            \/
8  * $Id$
9  *
10  * Copyright (C) 2007 by Michael Sevakis
11  *
12  * ARM code for memory framebuffer LCDs
13  *
14  * All files in this archive are subject to the GNU General Public License.
15  * See the file COPYING in the source tree root for full license agreement.
16  *
17  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
18  * KIND, either express or implied.
19  *
20  ****************************************************************************/
22 #include "config.h"
23 #include "cpu.h"
25 /****************************************************************************
26  * void lcd_copy_buffer_rect(fb_data *dst, fb_data *src, int width,
27  *                           int height);
 *
 * Copy a width x height rectangle of halfword (16-bit) pixels from src to
 * dst. Both buffers are walked with a pitch of LCD_WIDTH pixels: after each
 * line, (LCD_WIDTH - width) pixels are skipped in src and in dst.
 *
 * Per line: an optional leading halfword brings dst to a word boundary, a
 * computed jump copies the part of the line that does not fill a whole
 * 16-pixel group (0-14 pixels, in words), a loop then moves 16 pixels
 * (32 bytes) per pass, and an optional trailing halfword finishes the line.
 *
 * Notes:
 *  - Both the line loop and the 16-pixel loop are bottom-tested and nothing
 *    checks for zero, so width and height must each be at least 1.
 *  - Only dst is tested for word alignment before the word and multi-word
 *    loads from src; presumably src is expected to share dst's alignment -
 *    TODO confirm against callers.
28  */
29      .section    .icode, "ax", %progbits
30      .align      2
31      .global     lcd_copy_buffer_rect
32      .type       lcd_copy_buffer_rect, %function
33                                         @ r0 = dst (advanced as pixels are stored)
34                                         @ r1 = src (advanced as pixels are loaded)
35                                         @ r2 = width in pixels; reused per line
36                                         @ r3 = height = lines still to copy
37 lcd_copy_buffer_rect:                   @
38     stmfd   sp!, { r4-r12, lr }         @ save r4-r12 and lr (r14 is scratch below)
39     mov     r5, r2                      @ r5 = cached width
40     rsb     r4, r2, #LCD_WIDTH          @ r4 = LCD_WIDTH - width = pixels to skip
41 10: @ copy line                         @ r2 is kept as (pixels left - 1)
42     subs    r2, r5, #1                  @ r2 = width - 1
43     beq     40f @ finish line           @ one halfword? skip to trailing copy
44     tst     r0, #2                      @ dst word aligned?
45     beq     20f @ rem copy              @ yes? skip to word copy
46     ldrh    r6, [r1], #2                @ copy leading halfword to align dst
47     subs    r2, r2, #1                  @ one pixel fewer; flags feed the ble
48     strh    r6, [r0], #2                @ (a store leaves the flags untouched)
49     ble     40f @ finish line           @ next line if lt or finish
50                                         @ trailing halfword if eq
51 20: @ rem copy                          @
52     add     r14, r2, #1                 @ get remaining width mod 16 after word
53                                         @ align (rw)
54     and     r14, r14, #0xe              @ r14 = 0 (16), 2, 4, 6, 8, 10, 12, 14
55     add     pc, pc, r14, lsl #3         @ jump r14*8 bytes = r14/2 table entries
56     nop                                 @ filler: pc reads as the add + 8, so
57     b       30f                         @ entry 0 is here: rw % 16 = 0 or 1?
58     nop                                 @ use octword loop. The nops pad entry 0
59     nop                                 @ to 4 instructions (16 bytes), the size
60     nop                                 @ of every other entry
61     ldr     r6, [r1], #4                @ rw % 16 = 2 or 3
62     subs    r2, r2, #2                  @ flags are tested at 25:
63     str     r6, [r0], #4                @
64     b       25f @ copy up done          @
65     ldmia   r1!, { r6-r7 }              @ rw % 16 = 4 or 5
66     subs    r2, r2, #4                  @
67     stmia   r0!, { r6-r7 }              @
68     b       25f @ copy up done          @
69     ldmia   r1!, { r6-r8 }              @ rw % 16 = 6 or 7
70     subs    r2, r2, #6                  @
71     stmia   r0!, { r6-r8 }              @
72     b       25f @ copy up done          @
73     ldmia   r1!, { r6-r9 }              @ rw % 16 = 8 or 9
74     subs    r2, r2, #8                  @
75     stmia   r0!, { r6-r9 }              @
76     b       25f @ copy up done          @
77     ldmia   r1!, { r6-r10 }             @ rw % 16 = 10 or 11
78     subs    r2, r2, #10                 @
79     stmia   r0!, { r6-r10 }             @
80     b       25f @ copy up done          @
81     ldmia   r1!, { r6-r11 }             @ rw % 16 = 12 or 13
82     subs    r2, r2, #12                 @
83     stmia   r0!, { r6-r11 }             @
84     b       25f @ copy up done          @
85     ldmia   r1!, { r6-r12 }             @ rw % 16 = 14 or 15
86     subs    r2, r2, #14                 @ last entry falls through to 25:
87     stmia   r0!, { r6-r12 }             @
88 25: @ copy up done                      @ flags still come from the subs above
89     ble     40f @ finish line           @ no 32-byte segments remaining?
90 30: @ octword loop                      @ copy 16 pixels per loop
91     ldmia   r1!, { r6-r12, r14 }        @ eight words = 16 pixels
92     subs    r2, r2, #16                 @
93     stmia   r0!, { r6-r12, r14 }        @
94     bgt     30b @ octword loop          @ r2 > 0: at least 16 pixels remain
95 40: @ finish line                       @ eq here means r2 == 0: one pixel left
96     ldreqh  r6, [r1], #2                @ finish last halfword if eq ...
97     add     r1, r1, r4, lsl #1          @ src += (LCD_WIDTH - width) halfwords
98     streqh  r6, [r0], #2                @ ... (add without s keeps the flags)
99     add     r0, r0, r4, lsl #1          @ dst += (LCD_WIDTH - width) halfwords
100     subs    r3, r3, #1                  @ next line
101     bgt     10b @ copy line             @ loop while lines remain
102     ldmfd   sp!, { r4-r12, pc }         @ restore regs and return
103     .ltorg                              @ dump constant pool
104     .size   lcd_copy_buffer_rect, .-lcd_copy_buffer_rect
106 /****************************************************************************
107  * void lcd_write_yuv420_lines(fb_data *dst,
108  *                             unsigned char const * const src[3],
109  *                             int width,
110  *                             int stride);
112  *   |R|   |1.000000 -0.000001  1.402000| |Y'|
113  *   |G| = |1.000000 -0.334136 -0.714136| |Pb|
114  *   |B|   |1.000000  1.772000  0.000000| |Pr|
115  *   Scaled, normalized, rounded and tweaked to yield RGB 565:
116  *   |R|   |74   0 101| |Y' -  16| >> 9
117  *   |G| = |74 -24 -51| |Cb - 128| >> 8
118  *   |B|   |74 128   0| |Cr - 128| >> 9
120  * Write four RGB565 pixels in the following order on each loop:
121  * 1 3 + > down
122  * 2 4 \/ left
 *
 * src[0], src[1] and src[2] are the Y', Cb and Cr byte pointers. Each pass
 * of the loop reads one Cb and one Cr byte and four Y' bytes: two adjacent
 * ones from the current position and the two lying 'stride' bytes further
 * on. 'width' is reduced by 2 per pass and the loop is bottom-tested, so at
 * least one pass always runs.
 *
 * Destination addressing (byte offsets from dst at the start of a pass):
 *   pixel 1: 0                  pixel 3: 2*LCD_WIDTH
 *   pixel 2: -2                 pixel 4: 2*LCD_WIDTH - 2
 * and dst ends the pass 4*LCD_WIDTH bytes (two halfword rows) further on.
 * The LCD_WIDTH < 256 variant folds part of each step into the strh
 * addressing modes (whose immediate offset cannot exceed 255); the other
 * variant uses explicit adds and must step by the same 2*LCD_WIDTH bytes.
123  */
124     .section    .icode, "ax", %progbits
125     .align      2
126     .global     lcd_write_yuv420_lines
127     .type       lcd_write_yuv420_lines, %function
128 lcd_write_yuv420_lines:
129                                         @ r0 = dst
130                                         @ r1 = yuv_src
131                                         @ r2 = width
132                                         @ r3 = stride
133     stmfd       sp!, { r4-r12 }         @ save non-scratch
134     ldmia       r1, { r4, r5, r6 }      @ r4 = yuv_src[0] = Y'_p
135                                         @ r5 = yuv_src[1] = Cb_p
136                                         @ r6 = yuv_src[2] = Cr_p
137                                         @ r1 = scratch
138     sub         r3, r3, #1              @ stride - 1: Y'_p is already one past
                                        @ the sample when [r4, r3] is read
139 10: @ loop line                         @
140     ldrb        r7, [r4], #1            @ r7 = *Y'_p++;
141     ldrb        r8, [r5], #1            @ r8 = *Cb_p++;
142     ldrb        r9, [r6], #1            @ r9 = *Cr_p++;
143                                         @
144     sub         r7, r7, #16             @ r7 = Y = (Y' - 16)*74
145     add         r12, r7, r7, asl #2     @ actually (Y' - 16)*37 and shift right
146     add         r7, r12, r7, asl #5     @ by one less when adding - same for all
147                                         @
148     sub         r8, r8, #128            @ Cb -= 128
149     sub         r9, r9, #128            @ Cr -= 128
150                                         @
151     add         r10, r9, r9, asl #1     @ r10 = Cr*51 + Cb*24
152     add         r10, r10, r10, asl #4   @ (Cr*3)*17 = Cr*51
153     add         r10, r10, r8, asl #3    @ + Cb*8
154     add         r10, r10, r8, asl #4    @ + Cb*16
155                                         @
156     add         r11, r9, r9, asl #2     @ r9 = Cr*101
157     add         r11, r11, r9, asl #5    @ Cr*5 + Cr*32
158     add         r9, r11, r9, asl #6     @ + Cr*64
159                                         @
160     add         r8, r8, #2              @ r8 = bu = (Cb*128 + 256) >> 9
161     mov         r8, r8, asr #2          @       = (Cb + 2) >> 2
162     add         r9, r9, #256            @ r9 = rv = (r9 + 256) >> 9
163     mov         r9, r9, asr #9          @
164     rsb         r10, r10, #128          @ r10 = guv = (-r10 + 128) >> 8
165     mov         r10, r10, asr #8        @
166                                         @ compute R, G, and B
167     add         r1, r8, r7, asr #8      @ r1  = b = (Y >> 9) + bu
168     add         r11, r9, r7, asr #8     @ r11 = r = (Y >> 9) + rv
169     add         r7, r10, r7, asr #7     @ r7  = g = (Y >> 8) + guv
170                                         @
171     orr         r12, r1, r11            @ check if clamping is needed...
172     orr         r12, r12, r7, asr #1    @ ...at all
173     cmp         r12, #31                @ unsigned test also catches negatives
174     bls         15f @ no clamp          @
175     cmp         r1, #31                 @ clamp b
176     mvnhi       r1, r1, asr #31         @ negative -> 0, too large -> all ones
177     andhi       r1, r1, #31             @ ... which masks down to 31
178     cmp         r11, #31                @ clamp r
179     mvnhi       r11, r11, asr #31       @
180     andhi       r11, r11, #31           @
181     cmp         r7, #63                 @ clamp g
182     mvnhi       r7, r7, asr #31         @
183     andhi       r7, r7, #63             @
184 15: @ no clamp                          @
185                                         @
186     ldrb        r12, [r4, r3]           @ r12 = Y' = *(Y'_p + stride)
187                                         @
188     orr         r1, r1, r7, lsl #5      @ r1 = b | (g << 5)
189     orr         r1, r1, r11, lsl #11    @ r1 |= (r << 11)
190 #if LCD_WIDTH < 256
191     strh        r1, [r0], #LCD_WIDTH    @ store pixel
192 #else
193     strh        r1, [r0]                @ store pixel 1
194 #endif
195                                         @
196     sub         r7, r12, #16            @ r7 = Y = (Y' - 16)*74
197     add         r12, r7, r7, asl #2     @
198     add         r7, r12, r7, asl #5     @
199                                         @ compute R, G, and B
200     add         r1, r8, r7, asr #8      @ r1  = b = (Y >> 9) + bu
201     add         r11, r9, r7, asr #8     @ r11 = r = (Y >> 9) + rv
202     add         r7, r10, r7, asr #7     @ r7  = g = (Y >> 8) + guv
203                                         @
204     orr         r12, r1, r11            @ check if clamping is needed...
205     orr         r12, r12, r7, asr #1    @ ...at all
206     cmp         r12, #31                @
207     bls         15f @ no clamp          @
208     cmp         r1, #31                 @ clamp b
209     mvnhi       r1, r1, asr #31         @
210     andhi       r1, r1, #31             @
211     cmp         r11, #31                @ clamp r
212     mvnhi       r11, r11, asr #31       @
213     andhi       r11, r11, #31           @
214     cmp         r7, #63                 @ clamp g
215     mvnhi       r7, r7, asr #31         @
216     andhi       r7, r7, #63             @
217 15: @ no clamp                          @
218                                         @
219     ldrb        r12, [r4], #1           @ r12 = Y' = *(Y'_p++)
220                                         @
221     orr         r1, r1, r11, lsl #11    @ r1 = b | (r << 11)
222     orr         r1, r1, r7, lsl #5      @ r1 |= (g << 5)
223 #if LCD_WIDTH < 256
224     strh        r1, [r0, #-LCD_WIDTH-2] @ store pixel
225 #else
226     strh        r1, [r0, #-2]           @ store pixel 2, one halfword left
227     add         r0, r0, #2*LCD_WIDTH    @ down one row of halfword pixels
228 #endif
229                                         @
230     sub         r7, r12, #16            @ r7 = Y = (Y' - 16)*74
231     add         r12, r7, r7, asl #2     @
232     add         r7, r12, r7, asl #5     @
233                                         @ compute R, G, and B
234     add         r1, r8, r7, asr #8      @ r1  = b = (Y >> 9) + bu
235     add         r11, r9, r7, asr #8     @ r11 = r = (Y >> 9) + rv
236     add         r7, r10, r7, asr #7     @ r7  = g = (Y >> 8) + guv
237                                         @
238     orr         r12, r1, r11            @ check if clamping is needed...
239     orr         r12, r12, r7, asr #1    @ ...at all
240     cmp         r12, #31                @
241     bls         15f @ no clamp          @
242     cmp         r1, #31                 @ clamp b
243     mvnhi       r1, r1, asr #31         @
244     andhi       r1, r1, #31             @
245     cmp         r11, #31                @ clamp r
246     mvnhi       r11, r11, asr #31       @
247     andhi       r11, r11, #31           @
248     cmp         r7, #63                 @ clamp g
249     mvnhi       r7, r7, asr #31         @
250     andhi       r7, r7, #63             @
251 15: @ no clamp                          @
252                                         @
253     ldrb        r12, [r4, r3]           @ r12 = Y' = *(Y'_p + stride)
254                                         @
255     orr         r1, r1, r7, lsl #5      @ r1 = b | (g << 5)
256     orr         r1, r1, r11, lsl #11    @ r1 |= (r << 11)
257 #if LCD_WIDTH <  256
258     strh        r1, [r0, #LCD_WIDTH]!   @ store pixel
259 #else
260     strh        r1, [r0]                @ store pixel 3
261 #endif
262                                         @
263     sub         r7, r12, #16            @ r7 = Y = (Y' - 16)*74
264     add         r12, r7, r7, asl #2     @
265     add         r7, r12, r7, asl #5     @
266                                         @ compute R, G, and B
267     add         r1, r8, r7, asr #8      @ r1  = b = (Y >> 9) + bu
268     add         r11, r9, r7, asr #8     @ r11 = r = (Y >> 9) + rv
269     add         r7, r10, r7, asr #7     @ r7  = g = (Y >> 8) + guv
270                                         @
271     orr         r12, r1, r11            @ check if clamping is needed...
272     orr         r12, r12, r7, asr #1    @ ...at all
273     cmp         r12, #31                @
274     bls         15f @ no clamp          @
275     cmp         r1, #31                 @ clamp b
276     mvnhi       r1, r1, asr #31         @
277     andhi       r1, r1, #31             @
278     cmp         r11, #31                @ clamp r
279     mvnhi       r11, r11, asr #31       @
280     andhi       r11, r11, #31           @
281     cmp         r7, #63                 @ clamp g
282     mvnhi       r7, r7, asr #31         @
283     andhi       r7, r7, #63             @
284 15: @ no clamp                          @
285                                         @
286     orr         r12, r1, r11, lsl #11   @ r12 = b | (r << 11)
287     orr         r12, r12, r7, lsl #5    @ r12 |= (g << 5)
288     strh        r12, [r0, #-2]          @ store pixel
289 #if LCD_WIDTH < 256
290     add         r0, r0, #2*LCD_WIDTH    @
291 #else
292     add         r0, r0, #2*LCD_WIDTH    @ down one more row for the next pass
293 #endif
294                                         @
295     subs        r2, r2, #2              @ subtract block from width
296     bgt         10b @ loop line         @
297                                         @
298     ldmfd       sp!, { r4-r12 }         @ restore registers and return
299     bx          lr                      @
300     .ltorg                              @ dump constant pool
301     .size   lcd_write_yuv420_lines, .-lcd_write_yuv420_lines
304 /****************************************************************************
305  * void lcd_write_yuv420_lines_odither(fb_data *dst,
306  *                                     unsigned char const * const src[3],
307  *                                     int width,
308  *                                     int stride,
309  *                                     int x_screen,
310  *                                     int y_screen);
312  *   |R|   |1.000000 -0.000001  1.402000| |Y'|
313  *   |G| = |1.000000 -0.334136 -0.714136| |Pb|
314  *   |B|   |1.000000  1.772000  0.000000| |Pr|
315  *   Red scaled at twice g & b but at same precision to place it in correct
316  *   bit position after multiply and leave instruction count lower.
317  *   |R|   |258   0  408| |Y' -  16|
318  *   |G| = |149 -49 -104| |Cb - 128|
319  *   |B|   |149 258    0| |Cr - 128|
321  * Write four RGB565 pixels in the following order on each loop:
322  * 1 3 + > down
323  * 2 4 \/ left
325  * Kernel pattern (raw|rotated|use order):
326  * 5 3 4 2   2 6 3 7     row0   row2          > down
327  * 1 7 0 6 | 4 0 5 1 | 2 4 6 0 3 5 7 1 col0     left
328  * 4 2 5 3 | 3 7 2 6 | 3 5 7 1 2 4 6 0 col2  \/
329  * 0 6 1 7   5 1 4 0
 *
 * The four pixels of a pass get dither deltas 0x100, 0x200, 0x300 and 0,
 * plus a quadrant bias in r14 that is 0 or 0x80 - i.e. kernel values
 * 2 4 6 0 or 3 5 7 1 in units of 0x80. The bias starts from
 * (x_screen ^ y_screen) & 2 and is flipped at the top of every pass.
 *
 * Source and destination addressing: each pass reads one Cb, one Cr and
 * four Y' bytes (two adjacent, plus the two 'stride' bytes further on) and
 * stores, relative to dst at the start of the pass, at byte offsets
 *   pixel 1: 0                  pixel 3: 2*LCD_WIDTH
 *   pixel 2: -2                 pixel 4: 2*LCD_WIDTH - 2
 * leaving dst 4*LCD_WIDTH bytes (two halfword rows) further on. The
 * LCD_WIDTH < 256 variant folds part of each step into the strh addressing
 * modes (whose immediate offset cannot exceed 255); the other variant uses
 * explicit adds and must step by the same 2*LCD_WIDTH bytes.
 * 'width' is reduced by 2 per pass; the loop is bottom-tested.
330  */
331     .section    .icode, "ax", %progbits
332     .align      2
333     .global     lcd_write_yuv420_lines_odither
334     .type       lcd_write_yuv420_lines_odither, %function
335 lcd_write_yuv420_lines_odither:
336                                         @ r0 = dst
337                                         @ r1 = yuv_src
338                                         @ r2 = width
339                                         @ r3 = stride
340                                         @ [sp]   = x_screen
341                                         @ [sp+4] = y_screen
342     stmfd       sp!, { r4-r12, lr }     @ save non-scratch (10 words = 40 bytes)
343     ldmia       r1, { r4, r5, r6 }      @ r4 = yuv_src[0] = Y'_p
344                                         @ r5 = yuv_src[1] = Cb_p
345                                         @ r6 = yuv_src[2] = Cr_p
346                                         @
347     sub         r3, r3, #1              @ stride - 1: Y'_p is already one past
                                        @ the sample when [r4, r3] is read
348     add         r1, sp, #40             @ Line up pattern and kernel quadrant
349     ldmia       r1, { r12, r14 }        @ r12 = x_screen, r14 = y_screen
350     eor         r14, r14, r12           @
351     and         r14, r14, #0x2          @ (x_screen ^ y_screen) & 2
352     mov         r14, r14, lsl #6        @ 0x00 or 0x80
353 10: @ loop line                         @
354                                         @
355     ldrb        r7, [r4], #1            @ r7 = *Y'_p++;
356     ldrb        r8, [r5], #1            @ r8 = *Cb_p++;
357     ldrb        r9, [r6], #1            @ r9 = *Cr_p++;
358                                         @
359     eor         r14, r14, #0x80         @ flip pattern quadrant
360                                         @
361     sub         r7, r7, #16             @ r7 = Y = (Y' - 16)*149
362     add         r12, r7, r7, asl #2     @ *5
363     add         r12, r12, r12, asl #4   @ *17 -> *85
364     add         r7, r12, r7, asl #6     @ + *64 -> *149
365                                         @
366     sub         r8, r8, #128            @ Cb -= 128
367     sub         r9, r9, #128            @ Cr -= 128
368                                         @
369     add         r10, r8, r8, asl #4     @ r10 = guv = Cr*104 + Cb*49
370     add         r10, r10, r8, asl #5    @ Cb*17 + Cb*32
371     add         r10, r10, r9, asl #3    @ + Cr*8
372     add         r10, r10, r9, asl #5    @ + Cr*32
373     add         r10, r10, r9, asl #6    @ + Cr*64
374                                         @
375     mov         r8, r8, asl #1          @ r8 = bu = Cb*258
376     add         r8, r8, r8, asl #7      @ (Cb*2)*129
377                                         @
378     add         r9, r9, r9, asl #1      @ r9 = rv = Cr*408
379     add         r9, r9, r9, asl #4      @ (Cr*3)*17 = Cr*51
380     mov         r9, r9, asl #3          @ *8
381                                         @
382                                         @ compute R, G, and B
383     add         r1, r8, r7              @ r1  = b' = Y + bu
384     add         r11, r9, r7, asl #1     @ r11 = r' = Y*2 + rv
385     rsb         r7, r10, r7             @ r7  = g' = Y - guv
386                                         @
387                                         @ r8 = bu, r9 = rv, r10 = guv
388                                         @
389     sub         r12, r1, r1, lsr #5     @ r1 = 31/32*b + b/256
390     add         r1, r12, r1, lsr #8     @
391                                         @
392     sub         r12, r11, r11, lsr #5   @ r11 = 31/32*r + r/256
393     add         r11, r12, r11, lsr #8   @
394                                         @
395     sub         r12, r7, r7, lsr #6     @ r7 = 63/64*g + g/256
396     add         r7, r12, r7, lsr #8     @
397                                         @
398     add         r12, r14, #0x100        @ delta = kernel element 2 or 3
399                                         @
400     add         r1, r1, r12             @ b = r1 + delta
401     add         r11, r11, r12, lsl #1   @ r = r11 + delta*2
402     add         r7, r7, r12, lsr #1     @ g = r7 + delta/2
403                                         @
404     orr         r12, r1, r11, asr #1    @ check if clamping is needed...
405     orr         r12, r12, r7            @ ...at all
406     movs        r12, r12, asr #15       @ any bit at or above bit 15?
407     beq         15f @ no clamp          @
408     movs        r12, r1, asr #15        @ clamp b
409     mvnne       r1, r12, lsr #15        @
410     andne       r1, r1, #0x7c00         @ mask b only if clamped
411     movs        r12, r11, asr #16       @ clamp r
412     mvnne       r11, r12, lsr #16       @
413     movs        r12, r7, asr #15        @ clamp g
414     mvnne       r7, r12, lsr #15        @
415 15: @ no clamp                          @
416                                         @
417     ldrb        r12, [r4, r3]           @ r12 = Y' = *(Y'_p + stride)
418                                         @
419     and         r11, r11, #0xf800       @ pack pixel
420     and         r7, r7, #0x7e00         @ r1 = pixel = (r & 0xf800) |
421     orr         r11, r11, r7, lsr #4    @              ((g & 0x7e00) >> 4) |
422     orr         r1, r11, r1, lsr #10    @              (b >> 10)
423                                         @
424 #if LCD_WIDTH < 256
425     strh        r1, [r0], #LCD_WIDTH    @ store pixel
426 #else
427     strh        r1, [r0]                @ store pixel 1
428 #endif
429                                         @
430     sub         r7, r12, #16            @ r7 = Y = (Y' - 16)*149
431     add         r12, r7, r7, asl #2     @
432     add         r12, r12, r12, asl #4   @
433     add         r7, r12, r7, asl #6     @
434                                         @ compute R, G, and B
435     add         r1, r8, r7              @ r1  = b' = Y + bu
436     add         r11, r9, r7, asl #1     @ r11 = r' = Y*2 + rv
437     rsb         r7, r10, r7             @ r7  = g' = Y - guv
438                                         @
439     sub         r12, r1, r1, lsr #5     @ r1  = 31/32*b' + b'/256
440     add         r1, r12, r1, lsr #8     @
441                                         @
442     sub         r12, r11, r11, lsr #5   @ r11 = 31/32*r' + r'/256
443     add         r11, r12, r11, lsr #8   @
444                                         @
445     sub         r12, r7, r7, lsr #6     @ r7  = 63/64*g' + g'/256
446     add         r7, r12, r7, lsr #8     @
447                                         @
448     add         r12, r14, #0x200        @ delta = kernel element 4 or 5
449                                         @
450     add         r1, r1, r12             @ b = r1 + delta
451     add         r11, r11, r12, lsl #1   @ r = r11 + delta*2
452     add         r7, r7, r12, lsr #1     @ g = r7 + delta/2
453                                         @
454     orr         r12, r1, r11, asr #1    @ check if clamping is needed...
455     orr         r12, r12, r7            @ ...at all
456     movs        r12, r12, asr #15       @
457     beq         15f @ no clamp          @
458     movs        r12, r1, asr #15        @ clamp b
459     mvnne       r1, r12, lsr #15        @
460     andne       r1, r1, #0x7c00         @ mask b only if clamped
461     movs        r12, r11, asr #16       @ clamp r
462     mvnne       r11, r12, lsr #16       @
463     movs        r12, r7, asr #15        @ clamp g
464     mvnne       r7, r12, lsr #15        @
465 15: @ no clamp                          @
466                                         @
467     ldrb        r12, [r4], #1           @ r12 = Y' = *(Y'_p++)
468                                         @
469     and         r11, r11, #0xf800       @ pack pixel
470     and         r7, r7, #0x7e00         @ r1 = pixel = (r & 0xf800) |
471     orr         r11, r11, r7, lsr #4    @              ((g & 0x7e00) >> 4) |
472     orr         r1, r11, r1, lsr #10    @              (b >> 10)
473                                         @
474 #if LCD_WIDTH < 256
475     strh        r1, [r0, #-LCD_WIDTH-2] @ store pixel
476 #else
477     strh        r1, [r0, #-2]           @ store pixel
478     add         r0, r0, #2*LCD_WIDTH    @ down one row of halfword pixels
479 #endif
480                                         @
481     sub         r7, r12, #16            @ r7 = Y = (Y' - 16)*149
482     add         r12, r7, r7, asl #2     @
483     add         r12, r12, r12, asl #4   @
484     add         r7, r12, r7, asl #6     @
485                                         @ compute R, G, and B
486     add         r1, r8, r7              @ r1  = b' = Y + bu
487     add         r11, r9, r7, asl #1     @ r11 = r' = Y*2 + rv
488     rsb         r7, r10, r7             @ r7  = g' = Y - guv
489                                         @
490                                         @ r8 = bu, r9 = rv, r10 = guv
491                                         @
492     sub         r12, r1, r1, lsr #5     @ r1  = 31/32*b' + b'/256
493     add         r1, r12, r1, lsr #8     @
494                                         @
495     sub         r12, r11, r11, lsr #5   @ r11 = 31/32*r' + r'/256
496     add         r11, r12, r11, lsr #8   @
497                                         @
498     sub         r12, r7, r7, lsr #6     @ r7  = 63/64*g' + g'/256
499     add         r7, r12, r7, lsr #8     @
500                                         @
501     add         r12, r14, #0x300        @ delta = kernel element 6 or 7
502                                         @
503     add         r1, r1, r12             @ b = r1 + delta
504     add         r11, r11, r12, lsl #1   @ r = r11 + delta*2
505     add         r7, r7, r12, lsr #1     @ g = r7 + delta/2
506                                         @
507     orr         r12, r1, r11, asr #1    @ check if clamping is needed...
508     orr         r12, r12, r7            @ ...at all
509     movs        r12, r12, asr #15       @
510     beq         15f @ no clamp          @
511     movs        r12, r1, asr #15        @ clamp b
512     mvnne       r1, r12, lsr #15        @
513     andne       r1, r1, #0x7c00         @ mask b only if clamped
514     movs        r12, r11, asr #16       @ clamp r
515     mvnne       r11, r12, lsr #16       @
516     movs        r12, r7, asr #15        @ clamp g
517     mvnne       r7, r12, lsr #15        @
518 15: @ no clamp                          @
519                                         @
520     ldrb        r12, [r4, r3]           @ r12 = Y' = *(Y'_p + stride)
521                                         @
522     and         r11, r11, #0xf800       @ pack pixel
523     and         r7, r7, #0x7e00         @ r1 = pixel = (r & 0xf800) |
524     orr         r11, r11, r7, lsr #4    @              ((g & 0x7e00) >> 4) |
525     orr         r1, r11, r1, lsr #10    @              (b >> 10)
526                                         @
527 #if LCD_WIDTH < 256
528     strh        r1, [r0, #LCD_WIDTH]!   @ store pixel
529 #else
530     strh        r1, [r0]                @ store pixel 3
531 #endif
532                                         @
533     sub         r7, r12, #16            @ r7 = Y = (Y' - 16)*149
534     add         r12, r7, r7, asl #2     @
535     add         r12, r12, r12, asl #4   @
536     add         r7, r12, r7, asl #6     @
537                                         @ compute R, G, and B
538     add         r1, r8, r7              @ r1  = b' = Y + bu
539     add         r11, r9, r7, asl #1     @ r11 = r' = Y*2 + rv
540     rsb         r7, r10, r7             @ r7  = g' = Y - guv
541                                         @
542     sub         r12, r1, r1, lsr #5     @ r1 = 31/32*b + b/256
543     add         r1, r12, r1, lsr #8     @
544                                         @
545     sub         r12, r11, r11, lsr #5   @ r11 = 31/32*r + r/256
546     add         r11, r12, r11, lsr #8   @
547                                         @
548     sub         r12, r7, r7, lsr #6     @ r7 = 63/64*g + g/256
549     add         r7, r12, r7, lsr #8     @
550                                         @
551     @ This element is zero - use r14    @ delta = kernel element 0 or 1
552                                         @
553     add         r1, r1, r14             @ b = r1 + delta
554     add         r11, r11, r14, lsl #1   @ r = r11 + delta*2
555     add         r7, r7, r14, lsr #1     @ g = r7 + delta/2
556                                         @
557     orr         r12, r1, r11, asr #1    @ check if clamping is needed...
558     orr         r12, r12, r7            @ ...at all
559     movs        r12, r12, asr #15       @
560     beq         15f @ no clamp          @
561     movs        r12, r1, asr #15        @ clamp b
562     mvnne       r1, r12, lsr #15        @
563     andne       r1, r1, #0x7c00         @ mask b only if clamped
564     movs        r12, r11, asr #16       @ clamp r
565     mvnne       r11, r12, lsr #16       @
566     movs        r12, r7, asr #15        @ clamp g
567     mvnne       r7, r12, lsr #15        @
568 15: @ no clamp                          @
569                                         @
570     and         r11, r11, #0xf800       @ pack pixel
571     and         r7, r7, #0x7e00         @ r1 = pixel = (r & 0xf800) |
572     orr         r11, r11, r7, lsr #4    @              ((g & 0x7e00) >> 4) |
573     orr         r1, r11, r1, lsr #10    @              (b >> 10)
574                                         @
575     strh        r1, [r0, #-2]           @ store pixel
576 #if LCD_WIDTH < 256
577     add         r0, r0, #2*LCD_WIDTH    @
578 #else
579     add         r0, r0, #2*LCD_WIDTH    @ down one more row for the next pass
580 #endif
581                                         @
582     subs        r2, r2, #2              @ subtract block from width
583     bgt         10b @ loop line         @
584                                         @
585     ldmfd       sp!, { r4-r12, pc }     @ restore registers and return
586     .ltorg                              @ dump constant pool
587     .size   lcd_write_yuv420_lines_odither, .-lcd_write_yuv420_lines_odither