2 * Copyright © 2009 Nokia Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
23 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
27 * This file contains implementations of NEON optimized pixel processing
28 * functions. There is no full and detailed tutorial, but some functions
29 * (those which are exposing some new or interesting features) are
30 * extensively commented and can be used as examples.
32 * You may want to have a look at the comments for following functions:
33 * - pixman_composite_over_8888_0565_asm_neon
34 * - pixman_composite_over_n_8_0565_asm_neon
37 /* Prevent the stack from becoming executable for no reason... */
38 #if defined(__linux__) && defined(__ELF__)
39 .section .note.GNU-stack,"",%progbits
46 .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
47 .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
52 #include "pixman-private.h"
53 #include "pixman-arm-asm.h"
54 #include "pixman-arm-neon-asm.h"
56 /* Global configuration options and preferences */
59 * The code can optionally make use of unaligned memory accesses to improve
60 * performance of handling leading/trailing pixels for each scanline.
61 * Configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0 for
62 * example in linux if unaligned memory accesses are not configured to
 * generate exceptions.
65 .set RESPECT_STRICT_ALIGNMENT, 1
68 * Set default prefetch type. There is a choice between the following options:
70 * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
71 * as NOP to workaround some HW bugs or for whatever other reason)
73 * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
 * advanced prefetch introduces heavy overhead)
76 * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
77 * which can run ARM and NEON instructions simultaneously so that extra ARM
78 * instructions do not add (many) extra cycles, but improve prefetch efficiency)
80 * Note: some types of function can't support advanced prefetch and fallback
81 * to simple one (those which handle 24bpp pixels)
83 .set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
85 /* Prefetch distance in pixels for simple prefetch */
86 .set PREFETCH_DISTANCE_SIMPLE, 64
89 * Implementation of pixman_composite_over_8888_0565_asm_neon
91 * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
92 * performs OVER compositing operation. Function fast_composite_over_8888_0565
93 * from pixman-fast-path.c does the same in C and can be used as a reference.
95 * First we need to have some NEON assembly code which can do the actual
96 * operation on the pixels and provide it to the template macro.
98 * Template macro quite conveniently takes care of emitting all the necessary
99 * code for memory reading and writing (including quite tricky cases of
100 * handling unaligned leading/trailing pixels), so we only need to deal with
101 * the data in NEON registers.
 * NEON registers allocation in general is recommended to be the following:
104 * d0, d1, d2, d3 - contain loaded source pixel data
105 * d4, d5, d6, d7 - contain loaded destination pixels (if they are needed)
106 * d24, d25, d26, d27 - contain loading mask pixel data (if mask is used)
107 * d28, d29, d30, d31 - place for storing the result (destination pixels)
109 * As can be seen above, four 64-bit NEON registers are used for keeping
110 * intermediate pixel data and up to 8 pixels can be processed in one step
111 * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
113 * This particular function uses the following registers allocation:
114 * d0, d1, d2, d3 - contain loaded source pixel data
115 * d4, d5 - contain loaded destination pixels (they are needed)
116 * d28, d29 - place for storing the result (destination pixels)
120 * Step one. We need to have some code to do some arithmetics on pixel data.
121 * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
122 * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
123 * perform all the needed calculations and write the result to {d28, d29}.
124 * The rationale for having two macros and not just one will be explained
 * later. In practice, any single monolithic function which does the work can
126 * be split into two parts in any arbitrary way without affecting correctness.
128 * There is one special trick here too. Common template macro can optionally
129 * make our life a bit easier by doing R, G, B, A color components
130 * deinterleaving for 32bpp pixel formats (and this feature is used in
131 * 'pixman_composite_over_8888_0565_asm_neon' function). So it means that
132 * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
133 * actually use d0 register for blue channel (a vector of eight 8-bit
134 * values), d1 register for green, d2 for red and d3 for alpha. This
135 * simple conversion can be also done with a few NEON instructions:
137 * Packed to planar conversion:
143 * Planar to packed conversion:
149 * But pixel can be loaded directly in planar format using VLD4.8 NEON
150 * instruction. It is 1 cycle slower than VLD1.32, so this is not always
151 * desirable, that's why deinterleaving is optional.
153 * But anyway, here is the code:
.macro pixman_composite_over_8888_0565_process_pixblock_head
    /* NOTE(review): several instructions and the closing .endm are
       elided from this excerpt of the file; the comments below describe
       only what is visible here. */
    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
       and put data into d6 - red, d7 - green, d30 - blue */
    vmvn.8      d3, d3          /* invert source alpha */
    vshrn.u16   d30, q2, #2     /* part of the 0565 -> planar conversion
                                   (other steps elided in this excerpt) */
    /* now do alpha blending, storing results in 8-bit planar format
       into d16 - red, d19 - green, d18 - blue */
    vmull.u8    q12, d3, d30    /* (255 - srcA) * dst_blue, 16-bit */
    /* rounding approximation of x / 255:
       narrow((x + round(x >> 8)) >> 8), done for each channel */
    vrshr.u16   q13, q10, #8
    vrshr.u16   q3, q11, #8
    vrshr.u16   q15, q12, #8
    vraddhn.u16 d20, q10, q13
    vraddhn.u16 d23, q11, q3
    vraddhn.u16 d22, q12, q15

.macro pixman_composite_over_8888_0565_process_pixblock_tail
    /* ... continue alpha blending */
    vqadd.u8    d16, d2, d20    /* add source channel, saturating */
    /* convert the result to r5g6b5 and store it into {d28, d29} */
    vshll.u8    q14, d16, #8
    vsri.u16    q14, q9, #11    /* shift-insert packs fields into 0565 */
191 * OK, now we got almost everything that we need. Using the above two
192 * macros, the work can be done right. But now we want to optimize
193 * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
194 * a lot from good code scheduling and software pipelining.
196 * Let's construct some code, which will run in the core main loop.
197 * Some pseudo-code of the main loop will look like this:
205 * It may look a bit weird, but this setup allows to hide instruction
206 * latencies better and also utilize dual-issue capability more
207 * efficiently (make pairs of load-store and ALU instructions).
209 * So what we need now is a '*_tail_head' macro, which will be used
210 * in the core main loop. A trivial straightforward implementation
211 * of this macro would look like this:
213 * pixman_composite_over_8888_0565_process_pixblock_tail
214 * vst1.16 {d28, d29}, [DST_W, :128]!
215 * vld1.16 {d4, d5}, [DST_R, :128]!
216 * vld4.32 {d0, d1, d2, d3}, [SRC]!
217 * pixman_composite_over_8888_0565_process_pixblock_head
220 * Now it also got some VLD/VST instructions. We simply can't move from
221 * processing one block of pixels to the other one with just arithmetics.
222 * The previously processed data needs to be written to memory and new
223 * data needs to be fetched. Fortunately, this main loop does not deal
224 * with partial leading/trailing pixels and can load/store a full block
225 * of pixels in a bulk. Additionally, destination buffer is already
226 * 16 bytes aligned here (which is good for performance).
228 * New things here are DST_R, DST_W, SRC and MASK identifiers. These
229 * are the aliases for ARM registers which are used as pointers for
230 * accessing data. We maintain separate pointers for reading and writing
231 * destination buffer (DST_R and DST_W).
233 * Another new thing is 'cache_preload' macro. It is used for prefetching
234 * data into CPU L2 cache and improve performance when dealing with large
235 * images which are far larger than cache size. It uses one argument
236 * (actually two, but they need to be the same here) - number of pixels
237 * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
238 * details about this macro. Moreover, if good performance is needed
239 * the code from this macro needs to be copied into '*_tail_head' macro
240 * and mixed with the rest of code for optimal instructions scheduling.
241 * We are actually doing it below.
243 * Now after all the explanations, here is the optimized code.
 * Different instruction streams (originating from '*_head', '*_tail'
245 * and 'cache_preload' macro) use different indentation levels for
246 * better readability. Actually taking the code from one of these
247 * indentation levels and ignoring a few VLD/VST instructions would
248 * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
    /* Software-pipelined main-loop body: the '*_tail' instructions for
       pixel block N are interleaved with the store of block N, the load
       of destination block N+1 and the '*_head' instructions for block
       N+1.  Per the file's convention, each instruction stream uses its
       own indentation level.  The 'PF' prefix makes the prefetch
       instructions conditional on the configured prefetch type (see
       PREFETCH_TYPE_DEFAULT above). */
    vqadd.u8    d16, d2, d20
    vld1.16     {d4, d5}, [DST_R, :128]!
    vshll.u8    q14, d16, #8
        PF add PF_X, PF_X, #8
        PF addne PF_X, PF_X, #8
        PF subne PF_CTL, PF_CTL, #1
    vshrn.u16   d30, q2, #2
        PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vmull.u8    q12, d3, d30
        PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vrshr.u16   q13, q10, #8
        PF subge PF_X, PF_X, ORIG_W    /* wrap prefetch X at scanline end */
    vrshr.u16   q3, q11, #8
    vrshr.u16   q15, q12, #8
        PF subges PF_CTL, PF_CTL, #0x10
    vsri.u16    q14, q9, #11
        PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vraddhn.u16 d20, q10, q13
    vraddhn.u16 d23, q11, q3
        PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vraddhn.u16 d22, q12, q15
    vst1.16     {d28, d29}, [DST_W, :128]!
296 /* If we did not care much about the performance, we would just use this... */
/* NOTE(review): this redefines the macro immediately above and would be
   an assembler error if both were active; in the upstream file this
   reference (unoptimized) version is kept disabled — confirm that the
   disabling construct was merely elided from this excerpt. */
.macro pixman_composite_over_8888_0565_process_pixblock_tail_head
    pixman_composite_over_8888_0565_process_pixblock_tail
    vst1.16     {d28, d29}, [DST_W, :128]!
    vld1.16     {d4, d5}, [DST_R, :128]!
    pixman_composite_over_8888_0565_process_pixblock_head
309 * And now the final part. We are using 'generate_composite_function' macro
310 * to put all the stuff together. We are specifying the name of the function
311 * which we want to get, number of bits per pixel for the source, mask and
312 * destination (0 if unused, like mask in this case). Next come some bit
314 * FLAG_DST_READWRITE - tells that the destination buffer is both read
315 * and written, for write-only buffer we would use
316 * FLAG_DST_WRITEONLY flag instead
317 * FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
318 * and separate color channels for 32bpp format.
319 * The next things are:
320 * - the number of pixels processed per iteration (8 in this case, because
321 * that's the maximum what can fit into four 64-bit NEON registers).
322 * - prefetch distance, measured in pixel blocks. In this case it is 5 times
323 * by 8 pixels. That would be 40 pixels, or up to 160 bytes. Optimal
324 * prefetch distance can be selected by running some benchmarks.
326 * After that we specify some macros, these are 'default_init',
327 * 'default_cleanup' here which are empty (but it is possible to have custom
328 * init/cleanup macros to be able to save/restore some extra NEON registers
329 * like d8-d15 or do anything else) followed by
330 * 'pixman_composite_over_8888_0565_process_pixblock_head',
331 * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
332 * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
333 * which we got implemented above.
335 * The last part is the NEON registers allocation scheme.
/* Emit the full composite function: a8r8g8b8 source (32bpp), no mask
   (0), r5g6b5 destination (16bpp).
   NOTE(review): the default_init/default_cleanup argument lines appear
   to be elided from this excerpt of the invocation — confirm against
   the full file. */
generate_composite_function \
    pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_8888_0565_process_pixblock_head, \
    pixman_composite_over_8888_0565_process_pixblock_tail, \
    pixman_composite_over_8888_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4, /* dst_r_basereg */ \
    0, /* src_basereg */ \
    24 /* mask_basereg */
352 /******************************************************************************/
.macro pixman_composite_over_n_0565_process_pixblock_head
    /* Solid-source ('n') variant of the over_8888_0565 head: the source
       color is loaded once by the 'init' macro below, so no per-block
       source load or alpha inversion is needed here. */
    /* convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
       and put data into d6 - red, d7 - green, d30 - blue */
    vshrn.u16   d30, q2, #2
    /* now do alpha blending, storing results in 8-bit planar format
       into d16 - red, d19 - green, d18 - blue */
    vmull.u8    q12, d3, d30
    /* rounding approximation of x / 255:
       narrow((x + round(x >> 8)) >> 8) */
    vrshr.u16   q13, q10, #8
    vrshr.u16   q3, q11, #8
    vrshr.u16   q15, q12, #8
    vraddhn.u16 d20, q10, q13
    vraddhn.u16 d23, q11, q3
    vraddhn.u16 d22, q12, q15

.macro pixman_composite_over_n_0565_process_pixblock_tail
    /* ... continue alpha blending */
    vqadd.u8    d16, d2, d20    /* add source channel, saturating */
    /* convert the result to r5g6b5 and store it into {d28, d29} */
    vshll.u8    q14, d16, #8
    vsri.u16    q14, q9, #11

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_over_n_0565_process_pixblock_tail_head
    /* Unscheduled tail+head: finish block N, load destination block
       N+1, store block N, then start processing block N+1. */
    pixman_composite_over_n_0565_process_pixblock_tail
    vld1.16     {d4, d5}, [DST_R, :128]!
    vst1.16     {d28, d29}, [DST_W, :128]!
    pixman_composite_over_n_0565_process_pixblock_head

.macro pixman_composite_over_n_0565_init
    /* Fetch the solid source color from the stack into d3 (lane 0) and
       pre-invert it for the blending code above; any replication steps
       are elided from this excerpt. */
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d3[0]}, [DUMMY]
    vmvn.8      d3, d3 /* invert source alpha */
/* Solid source (src bpp = 0), no mask, r5g6b5 destination.
   NOTE(review): the cleanup-macro argument line appears to be elided
   from this excerpt of the invocation — confirm against the full
   file. */
generate_composite_function \
    pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_0565_init, \
    pixman_composite_over_n_0565_process_pixblock_head, \
    pixman_composite_over_n_0565_process_pixblock_tail, \
    pixman_composite_over_n_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4, /* dst_r_basereg */ \
    0, /* src_basereg */ \
    24 /* mask_basereg */
422 /******************************************************************************/
/* SRC operator, a8r8g8b8 -> r5g6b5: a pure format conversion.
   The head macro body is elided from this excerpt. */
.macro pixman_composite_src_8888_0565_process_pixblock_head

.macro pixman_composite_src_8888_0565_process_pixblock_tail
    vsri.u16    q14, q9, #11    /* final packing step into {d28, d29} */

.macro pixman_composite_src_8888_0565_process_pixblock_tail_head
    /* Pipelined tail+head with the source prefetch stream interleaved;
       the destination is write-only, so there is no PF_DST stream. */
        PF add PF_X, PF_X, #8
        PF addne PF_X, PF_X, #8
        PF subne PF_CTL, PF_CTL, #1
    vsri.u16    q14, q9, #11
        PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    vst1.16     {d28, d29}, [DST_W, :128]!
        PF subge PF_X, PF_X, ORIG_W
        PF subges PF_CTL, PF_CTL, #0x10
        PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!

generate_composite_function \
    pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    pixman_composite_src_8888_0565_process_pixblock_head, \
    pixman_composite_src_8888_0565_process_pixblock_tail, \
    pixman_composite_src_8888_0565_process_pixblock_tail_head
465 /******************************************************************************/
.macro pixman_composite_src_0565_8888_process_pixblock_head
    /* SRC operator, r5g6b5 -> a8r8g8b8: widen the 0565 fields in q0 to
       planar 8-bit channels.  Some conversion steps (and the alpha
       fill) are elided from this excerpt. */
    vshrn.u16   d30, q0, #8
    vshrn.u16   d29, q0, #3
    vshrn.u16   d28, q0, #2

.macro pixman_composite_src_0565_8888_process_pixblock_tail

/* TODO: expand macros and do better instructions scheduling */
.macro pixman_composite_src_0565_8888_process_pixblock_tail_head
    /* Unscheduled tail+head: store the converted block and start the
       next one. */
    pixman_composite_src_0565_8888_process_pixblock_tail
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    pixman_composite_src_0565_8888_process_pixblock_head

generate_composite_function \
    pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    pixman_composite_src_0565_8888_process_pixblock_head, \
    pixman_composite_src_0565_8888_process_pixblock_tail, \
    pixman_composite_src_0565_8888_process_pixblock_tail_head
500 /******************************************************************************/
/* ADD operator for a8 (8bpp) buffers.  The head/tail macro bodies are
   elided from this excerpt. */
.macro pixman_composite_add_8_8_process_pixblock_head

.macro pixman_composite_add_8_8_process_pixblock_tail

.macro pixman_composite_add_8_8_process_pixblock_tail_head
    /* 32 one-byte pixels per block, hence PF_X advances by 32. */
        PF add PF_X, PF_X, #32
    vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
        PF addne PF_X, PF_X, #32
        PF subne PF_CTL, PF_CTL, #1
    vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
        PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
        PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
        PF subge PF_X, PF_X, ORIG_W
        PF subges PF_CTL, PF_CTL, #0x10
        PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
        PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!

generate_composite_function \
    pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
    FLAG_DST_READWRITE, \
    32, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    pixman_composite_add_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_process_pixblock_tail, \
    pixman_composite_add_8_8_process_pixblock_tail_head
540 /******************************************************************************/
.macro pixman_composite_add_8888_8888_process_pixblock_tail_head
    /* Same structure as the add_8_8 tail_head above, but with 32-bit
       element loads/stores and an 8-pixel block (PF_X steps by 8). */
        PF add PF_X, PF_X, #8
    vld1.32     {d4, d5, d6, d7}, [DST_R, :128]!
        PF addne PF_X, PF_X, #8
        PF subne PF_CTL, PF_CTL, #1
    vst1.32     {d28, d29, d30, d31}, [DST_W, :128]!
        PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
        PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
        PF subge PF_X, PF_X, ORIG_W
        PF subges PF_CTL, PF_CTL, #0x10
        PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
        PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!

/* ADD for a8r8g8b8: byte-wise addition is format-agnostic, so the
   add_8_8 head/tail macros are reused. */
generate_composite_function \
    pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    pixman_composite_add_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_process_pixblock_tail, \
    pixman_composite_add_8888_8888_process_pixblock_tail_head

/* Single-scanline variant (note: no prefetch-distance argument). */
generate_composite_function_single_scanline \
    pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    pixman_composite_add_8_8_process_pixblock_head, \
    pixman_composite_add_8_8_process_pixblock_tail, \
    pixman_composite_add_8888_8888_process_pixblock_tail_head
582 /******************************************************************************/
.macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
    /* OUT_REVERSE: dst = (1 - src.alpha) * dst.  Multiply the planar
       destination channels by the inverted source alpha; some channel
       multiplies are elided from this excerpt. */
    vmvn.8      d24, d3     /* get inverted alpha */
    /* do alpha blending */
    vmull.u8    q10, d24, d6
    vmull.u8    q11, d24, d7

.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
    /* Rounding x/255 division of the q8-q11 products, narrowed into the
       result registers d28-d31. */
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11

.macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
    /* Software-pipelined tail+head with the PF prefetch stream
       interleaved. */
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    vrshr.u16   q14, q8, #8
        PF add PF_X, PF_X, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
        PF addne PF_X, PF_X, #8
        PF subne PF_CTL, PF_CTL, #1
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
        PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
        PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
        PF subge PF_X, PF_X, ORIG_W
        PF subges PF_CTL, PF_CTL, #0x10
        PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vmull.u8    q10, d22, d6    /* NOTE(review): d22 presumably holds the
                                   inverted alpha, set up in lines elided
                                   from this excerpt — confirm. */
        PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vmull.u8    q11, d22, d7

generate_composite_function_single_scanline \
    pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
    pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
    pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
644 /******************************************************************************/
.macro pixman_composite_over_8888_8888_process_pixblock_head
    /* OVER = src + (1 - src.alpha) * dst: the multiply part is exactly
       the OUT_REVERSE head above, so reuse it. */
    pixman_composite_out_reverse_8888_8888_process_pixblock_head

.macro pixman_composite_over_8888_8888_process_pixblock_tail
    pixman_composite_out_reverse_8888_8888_process_pixblock_tail
    /* ... then add the source channels, saturating */
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15

.macro pixman_composite_over_8888_8888_process_pixblock_tail_head
    /* Same pipelined structure as the out_reverse tail_head above,
       plus the final 'vqadd' source additions. */
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    vrshr.u16   q14, q8, #8
        PF add PF_X, PF_X, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
        PF addne PF_X, PF_X, #8
        PF subne PF_CTL, PF_CTL, #1
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
        PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
        PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
        PF subge PF_X, PF_X, ORIG_W
        PF subges PF_CTL, PF_CTL, #0x10
        PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    vmull.u8    q10, d22, d6    /* NOTE(review): d22 presumably holds the
                                   inverted alpha, set up in lines elided
                                   from this excerpt — confirm. */
        PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vmull.u8    q11, d22, d7

generate_composite_function \
    pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_process_pixblock_tail_head

/* Single-scanline variant (no prefetch-distance argument). */
generate_composite_function_single_scanline \
    pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_process_pixblock_tail_head
709 /******************************************************************************/
.macro pixman_composite_over_n_8888_process_pixblock_head
    /* OVER with a solid source color kept in registers: */
    /* deinterleaved source pixels in {d0, d1, d2, d3} */
    /* inverted alpha in {d24} */
    /* destination pixels in {d4, d5, d6, d7} */
    vmull.u8    q10, d24, d6
    vmull.u8    q11, d24, d7

.macro pixman_composite_over_n_8888_process_pixblock_tail
    /* Rounding x/255 division of the products, narrow into d28-d31,
       then add the solid source channels with saturation. */
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q2, q10, #8
    vrshr.u16   q3, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q2, q10
    vraddhn.u16 d31, q3, q11
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15

.macro pixman_composite_over_n_8888_process_pixblock_tail_head
    /* Pipelined tail+head.  Only a PF_DST prefetch stream is present:
       the source is a solid color in registers, so there is nothing to
       prefetch on the source side. */
    vrshr.u16   q14, q8, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q2, q10, #8
    vrshr.u16   q3, q11, #8
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q2, q10
    vraddhn.u16 d31, q3, q11
    vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    vqadd.u8    q14, q0, q14
        PF add PF_X, PF_X, #8
        PF addne PF_X, PF_X, #8
        PF subne PF_CTL, PF_CTL, #1
    vqadd.u8    q15, q1, q15
        PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
        PF subge PF_X, PF_X, ORIG_W
    vmull.u8    q10, d24, d6
        PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q11, d24, d7
        PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!

.macro pixman_composite_over_n_8888_init
    /* Load the solid source color from the stack into d3 (lane 0) and
       precompute the inverted alpha in d24; per-channel replication
       steps are elided from this excerpt. */
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d3[0]}, [DUMMY]
    vmvn.8      d24, d3 /* get inverted alpha */

generate_composite_function \
    pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8888_init, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_n_8888_process_pixblock_tail_head
783 /******************************************************************************/
.macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
    /* OVER_REVERSE with a solid source: destination pixels are loaded
       into {d0, d1, d2, d3} — note the swapped dst_r/src basereg
       arguments passed to generate_composite_function below. */
    vrshr.u16   q14, q8, #8
        PF add PF_X, PF_X, #8
    vrshr.u16   q15, q9, #8
    vrshr.u16   q12, q10, #8
    vrshr.u16   q13, q11, #8
        PF addne PF_X, PF_X, #8
        PF subne PF_CTL, PF_CTL, #1
    vraddhn.u16 d28, q14, q8
    vraddhn.u16 d29, q15, q9
    vraddhn.u16 d30, q12, q10
    vraddhn.u16 d31, q13, q11
    vqadd.u8    q14, q0, q14
    vqadd.u8    q15, q1, q15
    vld4.8      {d0, d1, d2, d3}, [DST_R, :128]!
        PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
        PF subge PF_X, PF_X, ORIG_W
        PF subges PF_CTL, PF_CTL, #0x10
    vmull.u8    q10, d22, d6    /* NOTE(review): d22 presumably holds the
                                   inverted alpha, set up in lines elided
                                   from this excerpt — confirm. */
        PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    vmull.u8    q11, d22, d7

.macro pixman_composite_over_reverse_n_8888_init
    /* Load the solid source color into d7 (lane 0); replication and
       alpha-handling steps are elided from this excerpt. */
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d7[0]}, [DUMMY]

generate_composite_function \
    pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_reverse_n_8888_init, \
    pixman_composite_over_8888_8888_process_pixblock_head, \
    pixman_composite_over_8888_8888_process_pixblock_tail, \
    pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    4, /* src_basereg */ \
    24 /* mask_basereg */
838 /******************************************************************************/
.macro pixman_composite_over_8888_8_0565_process_pixblock_head
    /* OVER with an a8 mask onto r5g6b5: first apply the mask alpha
       (d24) to the source channels (the IN step), then convert the
       destination to planar 8-bit and alpha-blend.  Several conversion
       steps are elided from this excerpt. */
    vmull.u8    q0, d24, d8     /* IN for SRC pixels (part1) */
    vmull.u8    q6, d24, d10
    vmull.u8    q7, d24, d11
    vshrn.u16   d6, q2, #8      /* convert DST_R data to 32-bpp (part1) */
    vrshr.u16   q8, q0, #8      /* IN for SRC pixels (part2) */
    vrshr.u16   q10, q6, #8
    vrshr.u16   q11, q7, #8
    vraddhn.u16 d0, q0, q8
    vraddhn.u16 d1, q1, q9
    vraddhn.u16 d2, q6, q10
    vraddhn.u16 d3, q7, q11
    vsri.u8     d6, d6, #5      /* convert DST_R data to 32-bpp (part2) */
    vshrn.u16   d30, q2, #2
    vmull.u8    q8, d3, d6      /* now do alpha blending */
    vmull.u8    q10, d3, d30

.macro pixman_composite_over_8888_8_0565_process_pixblock_tail
    /* 3 cycle bubble (after vmull.u8) */
    vrshr.u16   q13, q8, #8     /* rounding x/255 division of the blend */
    vrshr.u16   q11, q9, #8
    vrshr.u16   q15, q10, #8
    vraddhn.u16 d16, q8, q13
    vraddhn.u16 d27, q9, q11
    vraddhn.u16 d26, q10, q15
    vqadd.u8    d16, d2, d16    /* add masked source, saturating */
    vshll.u8    q14, d16, #8    /* convert to 16bpp */
    vsri.u16    q14, q9, #11    /* pack result into {d28, d29} */

.macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
    /* Software-pipelined tail+head: stores block N while computing
       block N+1 (some interleaved lines are elided from this
       excerpt). */
    vld1.16     {d4, d5}, [DST_R, :128]!
        vmull.u8    q6, d24, d10
    vrshr.u16   q13, q8, #8
    vrshr.u16   q11, q9, #8
    vrshr.u16   q15, q10, #8
    vraddhn.u16 d16, q8, q13
    vraddhn.u16 d27, q9, q11
    vraddhn.u16 d26, q10, q15
    vqadd.u8    d16, d2, d16
    vshll.u8    q14, d16, #8
        vmull.u8    q7, d24, d11
    vsri.u16    q14, q9, #11
        vrshr.u16   q10, q6, #8
        vrshr.u16   q11, q7, #8
        vraddhn.u16 d0, q0, q8
        vraddhn.u16 d1, q1, q9
        vraddhn.u16 d2, q6, q10
        vraddhn.u16 d3, q7, q11
        vshrn.u16   d30, q2, #2
    vst1.16     {d28, d29}, [DST_W, :128]!
        vmull.u8    q10, d3, d30

generate_composite_function \
    pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    default_init_need_all_regs, \
    default_cleanup_need_all_regs, \
    pixman_composite_over_8888_8_0565_process_pixblock_head, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4, /* dst_r_basereg */ \
    8, /* src_basereg */ \
    24 /* mask_basereg */
944 /******************************************************************************/
947 * This function needs a special initialization of solid mask.
948 * Solid source pixel data is fetched from stack at ARGS_STACK_OFFSET
949 * offset, split into color components and replicated in d8-d11
950 * registers. Additionally, this function needs all the NEON registers,
951 * so it has to save d8-d15 registers which are callee saved according
952 * to ABI. These registers are restored from 'cleanup' macro. All the
953 * other NEON registers are caller saved, so can be clobbered freely
954 * without introducing any problems.
.macro pixman_composite_over_n_8_0565_init
    /* Per the comment above: fetch the solid source color from the
       stack; the splitting into d8-d11 and the saving of the
       callee-saved d8-d15 registers are elided from this excerpt. */
    add         DUMMY, sp, #ARGS_STACK_OFFSET
    vld1.32     {d11[0]}, [DUMMY]

.macro pixman_composite_over_n_8_0565_cleanup
    /* Restores the callee-saved d8-d15 registers (body elided from this
       excerpt; see the comment above this function). */

generate_composite_function \
    pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
    FLAG_DST_READWRITE, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_n_8_0565_init, \
    pixman_composite_over_n_8_0565_cleanup, \
    pixman_composite_over_8888_8_0565_process_pixblock_head, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail_head
981 /******************************************************************************/
.macro pixman_composite_over_8888_n_0565_init
    /* Solid-mask variant: fetch a value from the stack (note the +8
       offset relative to ARGS_STACK_OFFSET — presumably the mask
       argument) into d24 (lane 0); replication across d24 is elided
       from this excerpt. */
    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
    vld1.32     {d24[0]}, [DUMMY]

.macro pixman_composite_over_8888_n_0565_cleanup
    /* Body elided from this excerpt. */

generate_composite_function \
    pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \
    FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels, processed in a single block */ \
    5, /* prefetch distance */ \
    pixman_composite_over_8888_n_0565_init, \
    pixman_composite_over_8888_n_0565_cleanup, \
    pixman_composite_over_8888_8_0565_process_pixblock_head, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail, \
    pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
    28, /* dst_w_basereg */ \
    4, /* dst_r_basereg */ \
    8, /* src_basereg */ \
    24 /* mask_basereg */
1009 /******************************************************************************/
/* Plain 16bpp copy (SRC operator, r5g6b5 -> r5g6b5): no per-pixel
   arithmetic, so the head/tail macros have no visible work here. */
.macro pixman_composite_src_0565_0565_process_pixblock_head

.macro pixman_composite_src_0565_0565_process_pixblock_tail

.macro pixman_composite_src_0565_0565_process_pixblock_tail_head
    /* Store 16 pixels and prefetch ahead (source load elided from this
       excerpt). */
    vst1.16     {d0, d1, d2, d3}, [DST_W, :128]!
    cache_preload 16, 16

generate_composite_function \
    pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
    FLAG_DST_WRITEONLY, \
    16, /* number of pixels, processed in a single block */ \
    10, /* prefetch distance */ \
    pixman_composite_src_0565_0565_process_pixblock_head, \
    pixman_composite_src_0565_0565_process_pixblock_tail, \
    pixman_composite_src_0565_0565_process_pixblock_tail_head, \
    0, /* dst_w_basereg */ \
    0, /* dst_r_basereg */ \
    0, /* src_basereg */ \
    0 /* mask_basereg */
1038 /******************************************************************************/
/* Solid fill of an a8 (8bpp) destination: replicate the color once in init,
 * then stream 32-byte stores */
1040 .macro pixman_composite_src_n_8_process_pixblock_head
1043 .macro pixman_composite_src_n_8_process_pixblock_tail
1046 .macro pixman_composite_src_n_8_process_pixblock_tail_head
/* store 32 a8 pixels of the pre-replicated solid value */
1047 vst1.8 {d0, d1, d2, d3}, [DST_W, :128]!
1050 .macro pixman_composite_src_n_8_init
1051 add DUMMY, sp, #ARGS_STACK_OFFSET
1052 vld1.32 {d0[0]}, [DUMMY]
/* replicate the value across all of d0 (shift-left-and-insert doubles
 * the populated width each step); further replication into d1-d3 is
 * presumably done by lines elided here -- verify against init in full file */
1054 vsli.u64 d0, d0, #16
1055 vsli.u64 d0, d0, #32
1060 .macro pixman_composite_src_n_8_cleanup
1063 generate_composite_function \
1064 pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
1065 FLAG_DST_WRITEONLY, \
1066 32, /* number of pixels, processed in a single block */ \
1067 0, /* prefetch distance */ \
1068 pixman_composite_src_n_8_init, \
1069 pixman_composite_src_n_8_cleanup, \
1070 pixman_composite_src_n_8_process_pixblock_head, \
1071 pixman_composite_src_n_8_process_pixblock_tail, \
1072 pixman_composite_src_n_8_process_pixblock_tail_head, \
1073 0, /* dst_w_basereg */ \
1074 0, /* dst_r_basereg */ \
1075 0, /* src_basereg */ \
1076 0 /* mask_basereg */
1078 /******************************************************************************/
/* Solid fill of an r5g6b5 (16bpp) destination */
1080 .macro pixman_composite_src_n_0565_process_pixblock_head
1083 .macro pixman_composite_src_n_0565_process_pixblock_tail
1086 .macro pixman_composite_src_n_0565_process_pixblock_tail_head
/* store 16 r5g6b5 pixels of the pre-replicated solid value */
1087 vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
1090 .macro pixman_composite_src_n_0565_init
1091 add DUMMY, sp, #ARGS_STACK_OFFSET
1092 vld1.32 {d0[0]}, [DUMMY]
/* duplicate the 16-bit color across all four lanes of d0 */
1093 vsli.u64 d0, d0, #16
1094 vsli.u64 d0, d0, #32
1099 .macro pixman_composite_src_n_0565_cleanup
1102 generate_composite_function \
1103 pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
1104 FLAG_DST_WRITEONLY, \
1105 16, /* number of pixels, processed in a single block */ \
1106 0, /* prefetch distance */ \
1107 pixman_composite_src_n_0565_init, \
1108 pixman_composite_src_n_0565_cleanup, \
1109 pixman_composite_src_n_0565_process_pixblock_head, \
1110 pixman_composite_src_n_0565_process_pixblock_tail, \
1111 pixman_composite_src_n_0565_process_pixblock_tail_head, \
1112 0, /* dst_w_basereg */ \
1113 0, /* dst_r_basereg */ \
1114 0, /* src_basereg */ \
1115 0 /* mask_basereg */
1117 /******************************************************************************/
/* Solid fill of an a8r8g8b8 (32bpp) destination */
1119 .macro pixman_composite_src_n_8888_process_pixblock_head
1122 .macro pixman_composite_src_n_8888_process_pixblock_tail
1125 .macro pixman_composite_src_n_8888_process_pixblock_tail_head
/* store 8 a8r8g8b8 pixels of the pre-replicated solid value */
1126 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
1129 .macro pixman_composite_src_n_8888_init
1130 add DUMMY, sp, #ARGS_STACK_OFFSET
1131 vld1.32 {d0[0]}, [DUMMY]
/* duplicate the 32-bit color into both lanes of d0 */
1132 vsli.u64 d0, d0, #32
1137 .macro pixman_composite_src_n_8888_cleanup
1140 generate_composite_function \
1141 pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
1142 FLAG_DST_WRITEONLY, \
1143 8, /* number of pixels, processed in a single block */ \
1144 0, /* prefetch distance */ \
1145 pixman_composite_src_n_8888_init, \
1146 pixman_composite_src_n_8888_cleanup, \
1147 pixman_composite_src_n_8888_process_pixblock_head, \
1148 pixman_composite_src_n_8888_process_pixblock_tail, \
1149 pixman_composite_src_n_8888_process_pixblock_tail_head, \
1150 0, /* dst_w_basereg */ \
1151 0, /* dst_r_basereg */ \
1152 0, /* src_basereg */ \
1153 0 /* mask_basereg */
1155 /******************************************************************************/
/* SRC copy for a8r8g8b8: a plain 32bpp blit, no per-pixel math */
1157 .macro pixman_composite_src_8888_8888_process_pixblock_head
1160 .macro pixman_composite_src_8888_8888_process_pixblock_tail
1163 .macro pixman_composite_src_8888_8888_process_pixblock_tail_head
/* store 8 pixels (32 bytes) previously loaded into d0-d3 */
1164 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
1169 generate_composite_function \
1170 pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
1171 FLAG_DST_WRITEONLY, \
1172 8, /* number of pixels, processed in a single block */ \
1173 10, /* prefetch distance */ \
1176 pixman_composite_src_8888_8888_process_pixblock_head, \
1177 pixman_composite_src_8888_8888_process_pixblock_tail, \
1178 pixman_composite_src_8888_8888_process_pixblock_tail_head, \
1179 0, /* dst_w_basereg */ \
1180 0, /* dst_r_basereg */ \
1181 0, /* src_basereg */ \
1182 0 /* mask_basereg */
1184 /******************************************************************************/
/* x8r8g8b8 -> a8r8g8b8: copy pixels while forcing the alpha byte to 0xFF
 * (the OR mask is prepared once in init, in q2) */
1186 .macro pixman_composite_src_x888_8888_process_pixblock_head
1191 .macro pixman_composite_src_x888_8888_process_pixblock_tail
1194 .macro pixman_composite_src_x888_8888_process_pixblock_tail_head
1195 vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
1202 .macro pixman_composite_src_x888_8888_init
/* move the per-lane constant into the top (alpha) byte of each 32-bit
 * lane; the instruction seeding q2 is elided here -- confirm in full file */
1204 vshl.u32 q2, q2, #24
1207 generate_composite_function \
1208 pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
1209 FLAG_DST_WRITEONLY, \
1210 8, /* number of pixels, processed in a single block */ \
1211 10, /* prefetch distance */ \
1212 pixman_composite_src_x888_8888_init, \
1214 pixman_composite_src_x888_8888_process_pixblock_head, \
1215 pixman_composite_src_x888_8888_process_pixblock_tail, \
1216 pixman_composite_src_x888_8888_process_pixblock_tail_head, \
1217 0, /* dst_w_basereg */ \
1218 0, /* dst_r_basereg */ \
1219 0, /* src_basereg */ \
1220 0 /* mask_basereg */
1222 /******************************************************************************/
1224 .macro pixman_composite_src_n_8_8888_process_pixblock_head
1225 /* expecting solid source in {d0, d1, d2, d3} */
1226 /* mask is in d24 (d25, d26, d27 are unused) */
/* widen: qN = src_channel * mask (u8 x u8 -> u16) */
1229 vmull.u8 q8, d24, d0
1230 vmull.u8 q9, d24, d1
1231 vmull.u8 q10, d24, d2
1232 vmull.u8 q11, d24, d3
/* t += (t >> 8) with rounding: first half of the t/255 approximation,
 * finished by the vrshrn in the tail */
1233 vrsra.u16 q8, q8, #8
1234 vrsra.u16 q9, q9, #8
1235 vrsra.u16 q10, q10, #8
1236 vrsra.u16 q11, q11, #8
1239 .macro pixman_composite_src_n_8_8888_process_pixblock_tail
/* rounding narrow shift by 8 completes the /255; results go to d28-d31
 * which is where the vst4 in tail_head writes from */
1240 vrshrn.u16 d28, q8, #8
1241 vrshrn.u16 d29, q9, #8
1242 vrshrn.u16 d30, q10, #8
1243 vrshrn.u16 d31, q11, #8
/* Software-pipelined combination of tail (finish previous pixblock) and
 * head (start next one). The PF lines interleave the prefetch state
 * machine bookkeeping with the NEON stream; PF is defined in
 * pixman-arm-neon-asm.h and is expected to expand to nothing when
 * prefetch is disabled. */
1246 .macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
1248 PF add PF_X, PF_X, #8
1249 vrshrn.u16 d28, q8, #8
1250 PF tst PF_CTL, #0x0F
1251 vrshrn.u16 d29, q9, #8
1252 PF addne PF_X, PF_X, #8
1253 vrshrn.u16 d30, q10, #8
1254 PF subne PF_CTL, PF_CTL, #1
1255 vrshrn.u16 d31, q11, #8
/* start multiplying the next block while the previous one drains */
1257 vmull.u8 q8, d24, d0
1258 PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
1259 vmull.u8 q9, d24, d1
1260 PF subge PF_X, PF_X, ORIG_W
1261 vmull.u8 q10, d24, d2
1262 PF subges PF_CTL, PF_CTL, #0x10
1263 vmull.u8 q11, d24, d3
1264 PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
/* write the 8 finished pixels, interleaving B/G/R/A back together */
1265 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1266 vrsra.u16 q8, q8, #8
1267 vrsra.u16 q9, q9, #8
1268 vrsra.u16 q10, q10, #8
1269 vrsra.u16 q11, q11, #8
1272 .macro pixman_composite_src_n_8_8888_init
1273 add DUMMY, sp, #ARGS_STACK_OFFSET
/* load the solid a8r8g8b8 source; d0-d3 replication is presumably done
 * by the lines elided here -- verify against the full file */
1274 vld1.32 {d3[0]}, [DUMMY]
1281 .macro pixman_composite_src_n_8_8888_cleanup
/* NOTE(review): the argument list below ends with a trailing
 * backslash-continuation, so the four basereg parameters fall back to
 * the defaults declared by generate_composite_function -- confirm those
 * defaults match the d24 mask / d28-d31 output layout used above. */
1284 generate_composite_function \
1285 pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, \
1286 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
1287 8, /* number of pixels, processed in a single block */ \
1288 5, /* prefetch distance */ \
1289 pixman_composite_src_n_8_8888_init, \
1290 pixman_composite_src_n_8_8888_cleanup, \
1291 pixman_composite_src_n_8_8888_process_pixblock_head, \
1292 pixman_composite_src_n_8_8888_process_pixblock_tail, \
1293 pixman_composite_src_n_8_8888_process_pixblock_tail_head, \
1295 /******************************************************************************/
/* src_n_8_8: solid 8-bit value (replicated into d16 by init) scaled by
 * the a8 mask in d24-d27; per-pixel result = n * m / 255 */
1297 .macro pixman_composite_src_n_8_8_process_pixblock_head
1298 vmull.u8 q0, d24, d16
1299 vmull.u8 q1, d25, d16
1300 vmull.u8 q2, d26, d16
1301 vmull.u8 q3, d27, d16
/* t += (t >> 8) rounded: first half of /255, finished in the tail */
1302 vrsra.u16 q0, q0, #8
1303 vrsra.u16 q1, q1, #8
1304 vrsra.u16 q2, q2, #8
1305 vrsra.u16 q3, q3, #8
1308 .macro pixman_composite_src_n_8_8_process_pixblock_tail
/* rounding narrow shift completes the /255; output in d28-d31 */
1309 vrshrn.u16 d28, q0, #8
1310 vrshrn.u16 d29, q1, #8
1311 vrshrn.u16 d30, q2, #8
1312 vrshrn.u16 d31, q3, #8
/* Pipelined tail+head: finish previous 32 pixels (narrowing shifts and
 * store) while starting the multiplies for the next 32, with mask
 * prefetch bookkeeping (PF lines) interleaved for free slots. */
1315 .macro pixman_composite_src_n_8_8_process_pixblock_tail_head
1317 PF add PF_X, PF_X, #8
1318 vrshrn.u16 d28, q0, #8
1319 PF tst PF_CTL, #0x0F
1320 vrshrn.u16 d29, q1, #8
1321 PF addne PF_X, PF_X, #8
1322 vrshrn.u16 d30, q2, #8
1323 PF subne PF_CTL, PF_CTL, #1
1324 vrshrn.u16 d31, q3, #8
1326 vmull.u8 q0, d24, d16
1327 PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
1328 vmull.u8 q1, d25, d16
1329 PF subge PF_X, PF_X, ORIG_W
1330 vmull.u8 q2, d26, d16
1331 PF subges PF_CTL, PF_CTL, #0x10
1332 vmull.u8 q3, d27, d16
1333 PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
/* store the finished 32 a8 pixels */
1334 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1335 vrsra.u16 q0, q0, #8
1336 vrsra.u16 q1, q1, #8
1337 vrsra.u16 q2, q2, #8
1338 vrsra.u16 q3, q3, #8
1341 .macro pixman_composite_src_n_8_8_init
1342 add DUMMY, sp, #ARGS_STACK_OFFSET
/* load the solid value into d16; replication across the lanes is
 * presumably done by lines elided here -- verify against full file */
1343 vld1.32 {d16[0]}, [DUMMY]
1347 .macro pixman_composite_src_n_8_8_cleanup
1350 generate_composite_function \
1351 pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, \
1352 FLAG_DST_WRITEONLY, \
1353 32, /* number of pixels, processed in a single block */ \
1354 5, /* prefetch distance */ \
1355 pixman_composite_src_n_8_8_init, \
1356 pixman_composite_src_n_8_8_cleanup, \
1357 pixman_composite_src_n_8_8_process_pixblock_head, \
1358 pixman_composite_src_n_8_8_process_pixblock_tail, \
1359 pixman_composite_src_n_8_8_process_pixblock_tail_head
1361 /******************************************************************************/
1363 .macro pixman_composite_over_n_8_8888_process_pixblock_head
1364 /* expecting deinterleaved source data in {d8, d9, d10, d11} */
1365 /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
1366 /* and destination data in {d4, d5, d6, d7} */
1367 /* mask is in d24 (d25, d26, d27 are unused) */
/* in = src * mask: widen each channel to 16 bits */
1370 vmull.u8 q6, d24, d8
1371 vmull.u8 q7, d24, d9
1372 vmull.u8 q8, d24, d10
1373 vmull.u8 q9, d24, d11
/* /255 with rounding: d = (t + ((t + 128) >> 8) + 128) >> 8 */
1374 vrshr.u16 q10, q6, #8
1375 vrshr.u16 q11, q7, #8
1376 vrshr.u16 q12, q8, #8
1377 vrshr.u16 q13, q9, #8
1378 vraddhn.u16 d0, q6, q10
1379 vraddhn.u16 d1, q7, q11
1380 vraddhn.u16 d2, q8, q12
1381 vraddhn.u16 d3, q9, q13
1382 vmvn.8 d25, d3 /* get inverted alpha */
1383 /* source: d0 - blue, d1 - green, d2 - red, d3 - alpha */
1384 /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
1385 /* now do alpha blending */
/* dst * (255 - alpha): normalized and added to src in the tail macro */
1386 vmull.u8 q8, d25, d4
1387 vmull.u8 q9, d25, d5
1388 vmull.u8 q10, d25, d6
1389 vmull.u8 q11, d25, d7
1392 .macro pixman_composite_over_n_8_8888_process_pixblock_tail
/* normalize dst*(255-a) products from the head by /255 (vrshr+vraddhn) */
1393 vrshr.u16 q14, q8, #8
1394 vrshr.u16 q15, q9, #8
1395 vrshr.u16 q6, q10, #8
1396 vrshr.u16 q7, q11, #8
1397 vraddhn.u16 d28, q14, q8
1398 vraddhn.u16 d29, q15, q9
1399 vraddhn.u16 d30, q6, q10
1400 vraddhn.u16 d31, q7, q11
/* OVER: result = src + dst*(255-a)/255, with saturation */
1401 vqadd.u8 q14, q0, q14
1402 vqadd.u8 q15, q1, q15
/* Pipelined tail+head for over_n_8_8888: finishes the previous pixblock,
 * loads the next destination pixels, and interleaves dst/mask prefetch
 * bookkeeping (PF lines) into the NEON instruction stream. */
1405 .macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
1406 vrshr.u16 q14, q8, #8
/* fetch next 8 destination pixels, deinterleaved into B/G/R/A planes */
1407 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1408 vrshr.u16 q15, q9, #8
1410 vrshr.u16 q6, q10, #8
1411 PF add PF_X, PF_X, #8
1412 vrshr.u16 q7, q11, #8
1413 PF tst PF_CTL, #0x0F
1414 vraddhn.u16 d28, q14, q8
1415 PF addne PF_X, PF_X, #8
1416 vraddhn.u16 d29, q15, q9
1417 PF subne PF_CTL, PF_CTL, #1
1418 vraddhn.u16 d30, q6, q10
1420 vraddhn.u16 d31, q7, q11
1421 PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
/* head of the next block: src * mask */
1422 vmull.u8 q6, d24, d8
1423 PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
1424 vmull.u8 q7, d24, d9
1425 PF subge PF_X, PF_X, ORIG_W
1426 vmull.u8 q8, d24, d10
1427 PF subges PF_CTL, PF_CTL, #0x10
1428 vmull.u8 q9, d24, d11
1429 PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
/* saturating add finishes the previous block's OVER */
1430 vqadd.u8 q14, q0, q14
1431 PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
1432 vqadd.u8 q15, q1, q15
1433 vrshr.u16 q10, q6, #8
1434 vrshr.u16 q11, q7, #8
1435 vrshr.u16 q12, q8, #8
1436 vrshr.u16 q13, q9, #8
1437 vraddhn.u16 d0, q6, q10
1438 vraddhn.u16 d1, q7, q11
1439 vraddhn.u16 d2, q8, q12
1440 vraddhn.u16 d3, q9, q13
/* store the 8 finished pixels, re-interleaving B/G/R/A */
1441 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1443 vmull.u8 q8, d25, d4
1444 vmull.u8 q9, d25, d5
1445 vmull.u8 q10, d25, d6
1446 vmull.u8 q11, d25, d7
1449 .macro pixman_composite_over_n_8_8888_init
1450 add DUMMY, sp, #ARGS_STACK_OFFSET
/* load the solid a8r8g8b8 source into d11; callee-saved d8-d15 handling
 * and channel deinterleaving are in lines elided here */
1452 vld1.32 {d11[0]}, [DUMMY]
1459 .macro pixman_composite_over_n_8_8888_cleanup
/* OVER, solid source, 8-bit mask, a8r8g8b8 destination */
1463 generate_composite_function \
1464 pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
1465 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1466 8, /* number of pixels, processed in a single block */ \
1467 5, /* prefetch distance */ \
1468 pixman_composite_over_n_8_8888_init, \
1469 pixman_composite_over_n_8_8888_cleanup, \
1470 pixman_composite_over_n_8_8888_process_pixblock_head, \
1471 pixman_composite_over_n_8_8888_process_pixblock_tail, \
1472 pixman_composite_over_n_8_8888_process_pixblock_tail_head
1474 /******************************************************************************/
/* over_n_8_8: solid value in d8, a8 mask in d24-d27, a8 dest in d4-d7 */
1476 .macro pixman_composite_over_n_8_8_process_pixblock_head
/* in = n * mask, widened to 16 bits */
1477 vmull.u8 q0, d24, d8
1478 vmull.u8 q1, d25, d8
1479 vmull.u8 q6, d26, d8
1480 vmull.u8 q7, d27, d8
/* normalize by /255: d = (t + ((t + 128) >> 8) + 128) >> 8 */
1481 vrshr.u16 q10, q0, #8
1482 vrshr.u16 q11, q1, #8
1483 vrshr.u16 q12, q6, #8
1484 vrshr.u16 q13, q7, #8
1485 vraddhn.u16 d0, q0, q10
1486 vraddhn.u16 d1, q1, q11
1487 vraddhn.u16 d2, q6, q12
1488 vraddhn.u16 d3, q7, q13
/* dst * mask-complement term (inversion done in lines elided here) */
1491 vmull.u8 q8, d24, d4
1492 vmull.u8 q9, d25, d5
1493 vmull.u8 q10, d26, d6
1494 vmull.u8 q11, d27, d7
1497 .macro pixman_composite_over_n_8_8_process_pixblock_tail
1498 vrshr.u16 q14, q8, #8
1499 vrshr.u16 q15, q9, #8
1500 vrshr.u16 q12, q10, #8
1501 vrshr.u16 q13, q11, #8
1502 vraddhn.u16 d28, q14, q8
1503 vraddhn.u16 d29, q15, q9
1504 vraddhn.u16 d30, q12, q10
1505 vraddhn.u16 d31, q13, q11
/* saturating add of the two terms completes the OVER */
1506 vqadd.u8 q14, q0, q14
1507 vqadd.u8 q15, q1, q15
1510 /* TODO: expand macros and do better instructions scheduling */
/* Simple (non-pipelined) tail_head: tail, reload dest, prefetch, store,
 * then head of the next block. */
1511 .macro pixman_composite_over_n_8_8_process_pixblock_tail_head
1512 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1513 pixman_composite_over_n_8_8_process_pixblock_tail
1515 cache_preload 32, 32
1516 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1517 pixman_composite_over_n_8_8_process_pixblock_head
1520 .macro pixman_composite_over_n_8_8_init
1521 add DUMMY, sp, #ARGS_STACK_OFFSET
/* load the solid source into d8 (callee-saved register, see note at top) */
1523 vld1.32 {d8[0]}, [DUMMY]
1527 .macro pixman_composite_over_n_8_8_cleanup
1531 generate_composite_function \
1532 pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
1533 FLAG_DST_READWRITE, \
1534 32, /* number of pixels, processed in a single block */ \
1535 5, /* prefetch distance */ \
1536 pixman_composite_over_n_8_8_init, \
1537 pixman_composite_over_n_8_8_cleanup, \
1538 pixman_composite_over_n_8_8_process_pixblock_head, \
1539 pixman_composite_over_n_8_8_process_pixblock_tail, \
1540 pixman_composite_over_n_8_8_process_pixblock_tail_head
1542 /******************************************************************************/
/* Component-alpha OVER: both src*mask (per channel) and src_alpha*mask
 * (per channel) are computed, i.e. the mask scales each channel
 * independently. */
1544 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
1546 * 'combine_mask_ca' replacement
1548 * input: solid src (n) in {d8, d9, d10, d11}
1549 * dest in {d4, d5, d6, d7 }
1550 * mask in {d24, d25, d26, d27}
1551 * output: updated src in {d0, d1, d2, d3 }
1552 * updated mask in {d24, d25, d26, d3 }
/* src channel * mask channel */
1554 vmull.u8 q0, d24, d8
1555 vmull.u8 q1, d25, d9
1556 vmull.u8 q6, d26, d10
1557 vmull.u8 q7, d27, d11
/* src alpha * mask channel (per-channel effective alpha) */
1558 vmull.u8 q9, d11, d25
1559 vmull.u8 q12, d11, d24
1560 vmull.u8 q13, d11, d26
/* normalize all products by /255 via vrshr + vraddhn */
1561 vrshr.u16 q8, q0, #8
1562 vrshr.u16 q10, q1, #8
1563 vrshr.u16 q11, q6, #8
1564 vraddhn.u16 d0, q0, q8
1565 vraddhn.u16 d1, q1, q10
1566 vraddhn.u16 d2, q6, q11
1567 vrshr.u16 q11, q12, #8
1568 vrshr.u16 q8, q9, #8
1569 vrshr.u16 q6, q13, #8
1570 vrshr.u16 q10, q7, #8
1571 vraddhn.u16 d24, q12, q11
1572 vraddhn.u16 d25, q9, q8
1573 vraddhn.u16 d26, q13, q6
1574 vraddhn.u16 d3, q7, q10
1576 * 'combine_over_ca' replacement
1578 * output: updated dest in {d28, d29, d30, d31}
/* dst * per-channel alpha complement (inversion in lines elided here) */
1582 vmull.u8 q8, d24, d4
1583 vmull.u8 q9, d25, d5
1585 vmull.u8 q10, d26, d6
1586 vmull.u8 q11, d27, d7
1589 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
1590 /* ... continue 'combine_over_ca' replacement */
/* normalize dst*inv_mask products by /255 */
1591 vrshr.u16 q14, q8, #8
1592 vrshr.u16 q15, q9, #8
1593 vrshr.u16 q6, q10, #8
1594 vrshr.u16 q7, q11, #8
1595 vraddhn.u16 d28, q14, q8
1596 vraddhn.u16 d29, q15, q9
1597 vraddhn.u16 d30, q6, q10
1598 vraddhn.u16 d31, q7, q11
/* saturating add of masked source completes the component-alpha OVER */
1599 vqadd.u8 q14, q0, q14
1600 vqadd.u8 q15, q1, q15
/* tail_head: finish previous pixblock, reload dest, then expand the full
 * head macro and store -- scheduling relies on the vld4 overlapping with
 * the tail arithmetic */
1603 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
1604 vrshr.u16 q14, q8, #8
1605 vrshr.u16 q15, q9, #8
1606 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
1607 vrshr.u16 q6, q10, #8
1608 vrshr.u16 q7, q11, #8
1609 vraddhn.u16 d28, q14, q8
1610 vraddhn.u16 d29, q15, q9
1611 vraddhn.u16 d30, q6, q10
1612 vraddhn.u16 d31, q7, q11
1614 vqadd.u8 q14, q0, q14
1615 vqadd.u8 q15, q1, q15
1617 pixman_composite_over_n_8888_8888_ca_process_pixblock_head
1618 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
1621 .macro pixman_composite_over_n_8888_8888_ca_init
1622 add DUMMY, sp, #ARGS_STACK_OFFSET
/* load the solid source; deinterleave into d8-d11 is in lines elided here */
1624 vld1.32 {d11[0]}, [DUMMY]
1631 .macro pixman_composite_over_n_8888_8888_ca_cleanup
/* component-alpha OVER: solid src, 32bpp mask, a8r8g8b8 destination */
1635 generate_composite_function \
1636 pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
1637 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1638 8, /* number of pixels, processed in a single block */ \
1639 5, /* prefetch distance */ \
1640 pixman_composite_over_n_8888_8888_ca_init, \
1641 pixman_composite_over_n_8888_8888_ca_cleanup, \
1642 pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
1643 pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
1644 pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
1646 /******************************************************************************/
/* Component-alpha OVER onto r5g6b5: like the 8888 ca variant but the
 * destination is unpacked from 16bpp to planar 8-bit and repacked. */
1648 .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head
1650 * 'combine_mask_ca' replacement
1652 * input: solid src (n) in {d8, d9, d10, d11} [B, G, R, A]
1653 * mask in {d24, d25, d26} [B, G, R]
1654 * output: updated src in {d0, d1, d2 } [B, G, R]
1655 * updated mask in {d24, d25, d26} [B, G, R]
/* src channel * mask channel, and src alpha * mask channel */
1657 vmull.u8 q0, d24, d8
1658 vmull.u8 q1, d25, d9
1659 vmull.u8 q6, d26, d10
1660 vmull.u8 q9, d11, d25
1661 vmull.u8 q12, d11, d24
1662 vmull.u8 q13, d11, d26
/* normalize by /255 (vrshr + vraddhn) */
1663 vrshr.u16 q8, q0, #8
1664 vrshr.u16 q10, q1, #8
1665 vrshr.u16 q11, q6, #8
1666 vraddhn.u16 d0, q0, q8
1667 vraddhn.u16 d1, q1, q10
1668 vraddhn.u16 d2, q6, q11
1669 vrshr.u16 q11, q12, #8
1670 vrshr.u16 q8, q9, #8
1671 vrshr.u16 q6, q13, #8
1672 vraddhn.u16 d24, q12, q11
1673 vraddhn.u16 d25, q9, q8
1675 * convert 8 r5g6b5 pixel data from {d4, d5} to planar 8-bit format
1676 * and put data into d16 - blue, d17 - green, d18 - red
/* extract the 6-bit green and 5-bit red fields into the top bits... */
1678 vshrn.u16 d17, q2, #3
1679 vshrn.u16 d18, q2, #8
1680 vraddhn.u16 d26, q13, q6
/* ...then replicate the high bits into the low bits to fill 8 bits */
1682 vsri.u8 d18, d18, #5
1683 vsri.u8 d17, d17, #6
1685 * 'combine_over_ca' replacement
1687 * output: updated dest in d16 - blue, d17 - green, d18 - red
/* blue channel: 5 bits shifted up to the top of the byte */
1690 vshrn.u16 d16, q2, #2
/* dst * per-channel alpha complement (inversion in lines elided here) */
1692 vmull.u8 q6, d16, d24
1693 vmull.u8 q7, d17, d25
1694 vmull.u8 q11, d18, d26
1697 .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail
1698 /* ... continue 'combine_over_ca' replacement */
/* normalize the dst products by /255 */
1699 vrshr.u16 q10, q6, #8
1700 vrshr.u16 q14, q7, #8
1701 vrshr.u16 q15, q11, #8
1702 vraddhn.u16 d16, q10, q6
1703 vraddhn.u16 d17, q14, q7
1704 vraddhn.u16 d18, q15, q11
/* saturating add of the masked red source channel */
1706 vqadd.u8 d18, d2, d18
1708 * convert the results in d16, d17, d18 to r5g6b5 and store
1709 * them into {d28, d29}
/* widen each channel, then insert: red[15:11], green[10:5], blue[4:0] */
1711 vshll.u8 q14, d18, #8
1712 vshll.u8 q10, d17, #8
1713 vshll.u8 q15, d16, #8
1714 vsri.u16 q14, q10, #5
1715 vsri.u16 q14, q15, #11
/* Hand-scheduled fusion of the tail and head above: the head's long
 * multiply chains are interleaved with the previous block's 0565
 * repacking so the NEON pipeline stays busy. Note d22 takes the role
 * d18 plays in the standalone tail. Do not reorder. */
1718 .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
1720 vrshr.u16 q10, q6, #8
1721 vrshr.u16 q14, q7, #8
/* load the next 8 r5g6b5 destination pixels */
1722 vld1.16 {d4, d5}, [DST_R, :128]!
1723 vrshr.u16 q15, q11, #8
1724 vraddhn.u16 d16, q10, q6
1725 vraddhn.u16 d17, q14, q7
1726 vraddhn.u16 d22, q15, q11
1727 /* process_pixblock_head */
1729 * 'combine_mask_ca' replacement
1731 * input: solid src (n) in {d8, d9, d10, d11} [B, G, R, A]
1732 * mask in {d24, d25, d26} [B, G, R]
1733 * output: updated src in {d0, d1, d2 } [B, G, R]
1734 * updated mask in {d24, d25, d26} [B, G, R]
1736 vmull.u8 q6, d26, d10
1738 vmull.u8 q0, d24, d8
/* finish previous block's red channel (d22 instead of d18 here) */
1739 vqadd.u8 d22, d2, d22
1740 vmull.u8 q1, d25, d9
1742 * convert the result in d16, d17, d22 to r5g6b5 and store
1743 * it into {d28, d29}
1745 vshll.u8 q14, d22, #8
1746 vshll.u8 q10, d17, #8
1747 vshll.u8 q15, d16, #8
1748 vmull.u8 q9, d11, d25
1749 vsri.u16 q14, q10, #5
1750 vmull.u8 q12, d11, d24
1751 vmull.u8 q13, d11, d26
1752 vsri.u16 q14, q15, #11
1754 vrshr.u16 q8, q0, #8
1755 vrshr.u16 q10, q1, #8
1756 vrshr.u16 q11, q6, #8
1757 vraddhn.u16 d0, q0, q8
1758 vraddhn.u16 d1, q1, q10
1759 vraddhn.u16 d2, q6, q11
1760 vrshr.u16 q11, q12, #8
1761 vrshr.u16 q8, q9, #8
1762 vrshr.u16 q6, q13, #8
1763 vraddhn.u16 d24, q12, q11
1764 vraddhn.u16 d25, q9, q8
1766 * convert 8 r5g6b5 pixel data from {d4, d5} to planar
1767 * 8-bit format and put data into d16 - blue, d17 - green,
/* unpack new destination pixels to planar 8-bit */
1770 vshrn.u16 d17, q2, #3
1771 vshrn.u16 d18, q2, #8
1772 vraddhn.u16 d26, q13, q6
1774 vsri.u8 d17, d17, #6
1775 vsri.u8 d18, d18, #5
1777 * 'combine_over_ca' replacement
1779 * output: updated dest in d16 - blue, d17 - green, d18 - red
1782 vshrn.u16 d16, q2, #2
1784 vmull.u8 q7, d17, d25
1785 vmull.u8 q6, d16, d24
1786 vmull.u8 q11, d18, d26
/* store the 8 repacked r5g6b5 pixels from the previous block */
1787 vst1.16 {d28, d29}, [DST_W, :128]!
1790 .macro pixman_composite_over_n_8888_0565_ca_init
1791 add DUMMY, sp, #ARGS_STACK_OFFSET
/* load the solid source; deinterleave into d8-d11 is in lines elided here */
1793 vld1.32 {d11[0]}, [DUMMY]
1800 .macro pixman_composite_over_n_8888_0565_ca_cleanup
/* component-alpha OVER: solid src, 32bpp mask, r5g6b5 destination */
1804 generate_composite_function \
1805 pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \
1806 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
1807 8, /* number of pixels, processed in a single block */ \
1808 5, /* prefetch distance */ \
1809 pixman_composite_over_n_8888_0565_ca_init, \
1810 pixman_composite_over_n_8888_0565_ca_cleanup, \
1811 pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \
1812 pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \
1813 pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
1815 /******************************************************************************/
/* IN operator, solid source over a8 destination: dst = dst * n / 255 */
1817 .macro pixman_composite_in_n_8_process_pixblock_head
1818 /* expecting source data in {d0, d1, d2, d3} */
1819 /* and destination data in {d4, d5, d6, d7} */
/* dst * solid value (q8/q9 for d4/d5 are produced in lines elided here) */
1822 vmull.u8 q10, d6, d3
1823 vmull.u8 q11, d7, d3
1826 .macro pixman_composite_in_n_8_process_pixblock_tail
/* normalize all four products by /255 into the output registers d28-d31 */
1827 vrshr.u16 q14, q8, #8
1828 vrshr.u16 q15, q9, #8
1829 vrshr.u16 q12, q10, #8
1830 vrshr.u16 q13, q11, #8
1831 vraddhn.u16 d28, q8, q14
1832 vraddhn.u16 d29, q9, q15
1833 vraddhn.u16 d30, q10, q12
1834 vraddhn.u16 d31, q11, q13
/* simple (non-pipelined) tail_head */
1837 .macro pixman_composite_in_n_8_process_pixblock_tail_head
1838 pixman_composite_in_n_8_process_pixblock_tail
1839 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1840 cache_preload 32, 32
1841 pixman_composite_in_n_8_process_pixblock_head
1842 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1845 .macro pixman_composite_in_n_8_init
1846 add DUMMY, sp, #ARGS_STACK_OFFSET
/* load the solid value into d3; lane replication is elided here */
1847 vld1.32 {d3[0]}, [DUMMY]
1851 .macro pixman_composite_in_n_8_cleanup
1854 generate_composite_function \
1855 pixman_composite_in_n_8_asm_neon, 0, 0, 8, \
1856 FLAG_DST_READWRITE, \
1857 32, /* number of pixels, processed in a single block */ \
1858 5, /* prefetch distance */ \
1859 pixman_composite_in_n_8_init, \
1860 pixman_composite_in_n_8_cleanup, \
1861 pixman_composite_in_n_8_process_pixblock_head, \
1862 pixman_composite_in_n_8_process_pixblock_tail, \
1863 pixman_composite_in_n_8_process_pixblock_tail_head, \
1864 28, /* dst_w_basereg */ \
1865 4, /* dst_r_basereg */ \
1866 0, /* src_basereg */ \
1867 24 /* mask_basereg */
/* ADD, solid source, a8 mask, a8 dest: dst = sat(dst + n.a * m / 255) */
1869 .macro pixman_composite_add_n_8_8_process_pixblock_head
1870 /* expecting source data in {d8, d9, d10, d11} */
1871 /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
1872 /* and destination data in {d4, d5, d6, d7} */
1873 /* mask is in d24, d25, d26, d27 */
/* mask * source alpha, widened */
1874 vmull.u8 q0, d24, d11
1875 vmull.u8 q1, d25, d11
1876 vmull.u8 q6, d26, d11
1877 vmull.u8 q7, d27, d11
/* normalize by /255 */
1878 vrshr.u16 q10, q0, #8
1879 vrshr.u16 q11, q1, #8
1880 vrshr.u16 q12, q6, #8
1881 vrshr.u16 q13, q7, #8
1882 vraddhn.u16 d0, q0, q10
1883 vraddhn.u16 d1, q1, q11
1884 vraddhn.u16 d2, q6, q12
1885 vraddhn.u16 d3, q7, q13
/* saturating add onto the destination */
1886 vqadd.u8 q14, q0, q2
1887 vqadd.u8 q15, q1, q3
1890 .macro pixman_composite_add_n_8_8_process_pixblock_tail
1893 /* TODO: expand macros and do better instructions scheduling */
1894 .macro pixman_composite_add_n_8_8_process_pixblock_tail_head
1895 pixman_composite_add_n_8_8_process_pixblock_tail
1896 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1897 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1899 cache_preload 32, 32
1900 pixman_composite_add_n_8_8_process_pixblock_head
1903 .macro pixman_composite_add_n_8_8_init
1904 add DUMMY, sp, #ARGS_STACK_OFFSET
/* load the solid source; replication of alpha into d11 lanes is elided */
1906 vld1.32 {d11[0]}, [DUMMY]
1910 .macro pixman_composite_add_n_8_8_cleanup
1914 generate_composite_function \
1915 pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
1916 FLAG_DST_READWRITE, \
1917 32, /* number of pixels, processed in a single block */ \
1918 5, /* prefetch distance */ \
1919 pixman_composite_add_n_8_8_init, \
1920 pixman_composite_add_n_8_8_cleanup, \
1921 pixman_composite_add_n_8_8_process_pixblock_head, \
1922 pixman_composite_add_n_8_8_process_pixblock_tail, \
1923 pixman_composite_add_n_8_8_process_pixblock_tail_head
1925 /******************************************************************************/
/* ADD with a8 source, a8 mask, a8 dest: dst = sat(dst + s * m / 255) */
1927 .macro pixman_composite_add_8_8_8_process_pixblock_head
1928 /* expecting source data in {d0, d1, d2, d3} */
1929 /* destination data in {d4, d5, d6, d7} */
1930 /* mask in {d24, d25, d26, d27} */
/* s * m, widened */
1931 vmull.u8 q8, d24, d0
1932 vmull.u8 q9, d25, d1
1933 vmull.u8 q10, d26, d2
1934 vmull.u8 q11, d27, d3
/* normalize by /255 */
1935 vrshr.u16 q0, q8, #8
1936 vrshr.u16 q1, q9, #8
1937 vrshr.u16 q12, q10, #8
1938 vrshr.u16 q13, q11, #8
1939 vraddhn.u16 d0, q0, q8
1940 vraddhn.u16 d1, q1, q9
1941 vraddhn.u16 d2, q12, q10
1942 vraddhn.u16 d3, q13, q11
/* saturating add onto the destination */
1943 vqadd.u8 q14, q0, q2
1944 vqadd.u8 q15, q1, q3
1947 .macro pixman_composite_add_8_8_8_process_pixblock_tail
1950 /* TODO: expand macros and do better instructions scheduling */
1951 .macro pixman_composite_add_8_8_8_process_pixblock_tail_head
1952 pixman_composite_add_8_8_8_process_pixblock_tail
1953 vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
1954 vld1.8 {d4, d5, d6, d7}, [DST_R, :128]!
1957 cache_preload 32, 32
1958 pixman_composite_add_8_8_8_process_pixblock_head
1961 .macro pixman_composite_add_8_8_8_init
1964 .macro pixman_composite_add_8_8_8_cleanup
1967 generate_composite_function \
1968 pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
1969 FLAG_DST_READWRITE, \
1970 32, /* number of pixels, processed in a single block */ \
1971 5, /* prefetch distance */ \
1972 pixman_composite_add_8_8_8_init, \
1973 pixman_composite_add_8_8_8_cleanup, \
1974 pixman_composite_add_8_8_8_process_pixblock_head, \
1975 pixman_composite_add_8_8_8_process_pixblock_tail, \
1976 pixman_composite_add_8_8_8_process_pixblock_tail_head
1978 /******************************************************************************/
/* ADD with 32bpp source, mask alpha in d27: dst = sat(dst + s * m.a/255) */
1980 .macro pixman_composite_add_8888_8888_8888_process_pixblock_head
1981 /* expecting source data in {d0, d1, d2, d3} */
1982 /* destination data in {d4, d5, d6, d7} */
1983 /* mask in {d24, d25, d26, d27} */
/* each src channel * mask alpha (d27) */
1984 vmull.u8 q8, d27, d0
1985 vmull.u8 q9, d27, d1
1986 vmull.u8 q10, d27, d2
1987 vmull.u8 q11, d27, d3
1988 /* 1 cycle bubble */
/* t += (t >> 8) rounded: first half of /255, finished in the tail */
1989 vrsra.u16 q8, q8, #8
1990 vrsra.u16 q9, q9, #8
1991 vrsra.u16 q10, q10, #8
1992 vrsra.u16 q11, q11, #8
1995 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
1996 /* 2 cycle bubble */
1997 vrshrn.u16 d28, q8, #8
1998 vrshrn.u16 d29, q9, #8
1999 vrshrn.u16 d30, q10, #8
2000 vrshrn.u16 d31, q11, #8
/* saturating add onto destination */
2001 vqadd.u8 q14, q2, q14
2002 /* 1 cycle bubble */
2003 vqadd.u8 q15, q3, q15
/* Pipelined tail+head: the narrowing shifts of the previous block are
 * interleaved with the multiplies of the next one; load/store overlap
 * with the vrsra accumulation. Do not reorder. */
2006 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
2008 vrshrn.u16 d28, q8, #8
2010 vrshrn.u16 d29, q9, #8
2011 vmull.u8 q8, d27, d0
2012 vrshrn.u16 d30, q10, #8
2013 vmull.u8 q9, d27, d1
2014 vrshrn.u16 d31, q11, #8
2015 vmull.u8 q10, d27, d2
2016 vqadd.u8 q14, q2, q14
2017 vmull.u8 q11, d27, d3
2018 vqadd.u8 q15, q3, q15
2019 vrsra.u16 q8, q8, #8
/* fetch next destination pixels, deinterleaved B/G/R/A */
2020 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
2021 vrsra.u16 q9, q9, #8
/* store the previous block's result */
2022 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
2023 vrsra.u16 q10, q10, #8
2027 vrsra.u16 q11, q11, #8
/* full composite function: 32bpp src, 32bpp mask, 32bpp dst */
2030 generate_composite_function \
2031 pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
2032 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2033 8, /* number of pixels, processed in a single block */ \
2034 10, /* prefetch distance */ \
2037 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
2038 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
2039 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
/* single-scanline variant used by the generic combiner path */
2041 generate_composite_function_single_scanline \
2042 pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
2043 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2044 8, /* number of pixels, processed in a single block */ \
2047 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
2048 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
2049 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
2051 /******************************************************************************/
/*
 * ADD with an a8 mask: reuses the 8888_8888_8888 pixblock macros;
 * mask_basereg 27 places the 8-bit mask where those macros read the
 * mask alpha (d27).
 */
2053 generate_composite_function \
2054 pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \
2055 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2056 8, /* number of pixels, processed in a single block */ \
2057 5, /* prefetch distance */ \
2060 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
2061 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
2062 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
2063 28, /* dst_w_basereg */ \
2064 4, /* dst_r_basereg */ \
2065 0, /* src_basereg */ \
2066 27 /* mask_basereg */
2068 /******************************************************************************/
2070 .macro pixman_composite_add_n_8_8888_init
2071 add DUMMY, sp, #ARGS_STACK_OFFSET
/* load the solid source into d3 (src_basereg 0 => src in d0-d3);
 * channel replication is in lines elided here */
2072 vld1.32 {d3[0]}, [DUMMY]
2079 .macro pixman_composite_add_n_8_8888_cleanup
/* ADD, solid source, a8 mask (in d27 via mask_basereg), 32bpp dest */
2082 generate_composite_function \
2083 pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \
2084 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2085 8, /* number of pixels, processed in a single block */ \
2086 5, /* prefetch distance */ \
2087 pixman_composite_add_n_8_8888_init, \
2088 pixman_composite_add_n_8_8888_cleanup, \
2089 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
2090 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
2091 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
2092 28, /* dst_w_basereg */ \
2093 4, /* dst_r_basereg */ \
2094 0, /* src_basereg */ \
2095 27 /* mask_basereg */
2097 /******************************************************************************/
2099 .macro pixman_composite_add_8888_n_8888_init
/* the solid mask is the second stack argument, hence the +8 offset */
2100 add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
/* mask alpha goes into d27, where the shared pixblock macros read it */
2101 vld1.32 {d27[0]}, [DUMMY]
2105 .macro pixman_composite_add_8888_n_8888_cleanup
/* ADD, 32bpp source, solid mask, 32bpp dest */
2108 generate_composite_function \
2109 pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \
2110 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2111 8, /* number of pixels, processed in a single block */ \
2112 5, /* prefetch distance */ \
2113 pixman_composite_add_8888_n_8888_init, \
2114 pixman_composite_add_8888_n_8888_cleanup, \
2115 pixman_composite_add_8888_8888_8888_process_pixblock_head, \
2116 pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
2117 pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
2118 28, /* dst_w_basereg */ \
2119 4, /* dst_r_basereg */ \
2120 0, /* src_basereg */ \
2121 27 /* mask_basereg */
2123 /******************************************************************************/
/*
 * out_reverse_8888_n_8888: OUT_reverse operator with an a8r8g8b8 source
 * and a solid mask.  The head multiplies the source by the mask (with
 * rounded 8-bit renormalization via vrshr+vraddhn), then multiplies the
 * destination channels by the inverted masked source alpha; the tail
 * renormalizes those products into d28-d31.
 */
2125 .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
2126 /* expecting source data in {d0, d1, d2, d3} */
2127 /* destination data in {d4, d5, d6, d7} */
2128 /* solid mask is in d15 */
/* src *= mask: widening multiplies per channel (16-bit products) */
2131 vmull.u8 q8, d15, d3
2132 vmull.u8 q6, d15, d2
2133 vmull.u8 q5, d15, d1
2134 vmull.u8 q4, d15, d0
/* renormalize: x = (p + ((p + 128) >> 8)) >> 8, i.e. rounded /255 */
2135 vrshr.u16 q13, q8, #8
2136 vrshr.u16 q12, q6, #8
2137 vrshr.u16 q11, q5, #8
2138 vrshr.u16 q10, q4, #8
2139 vraddhn.u16 d3, q8, q13
2140 vraddhn.u16 d2, q6, q12
2141 vraddhn.u16 d1, q5, q11
2142 vraddhn.u16 d0, q4, q10
2143 vmvn.8 d24, d3 /* get inverted alpha */
2144 /* now do alpha blending */
2145 vmull.u8 q8, d24, d4
2146 vmull.u8 q9, d24, d5
2147 vmull.u8 q10, d24, d6
2148 vmull.u8 q11, d24, d7
/* tail: renormalize dst * (1 - a) products into the write-back regs d28-d31 */
2151 .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
2152 vrshr.u16 q14, q8, #8
2153 vrshr.u16 q15, q9, #8
2154 vrshr.u16 q12, q10, #8
2155 vrshr.u16 q13, q11, #8
2156 vraddhn.u16 d28, q14, q8
2157 vraddhn.u16 d29, q15, q9
2158 vraddhn.u16 d30, q12, q10
2159 vraddhn.u16 d31, q13, q11
2162 /* TODO: expand macros and do better instructions scheduling */
2163 .macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
2164 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
2165 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
2169 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
2170 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!

/* per-scanline (unscaled) variant used by the generic pipeline */
2173 generate_composite_function_single_scanline \
2174 pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
2175 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2176 8, /* number of pixels, processed in a single block */ \
2177 default_init_need_all_regs, \
2178 default_cleanup_need_all_regs, \
2179 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
2180 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
2181 pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head \
2182 28, /* dst_w_basereg */ \
2183 4, /* dst_r_basereg */ \
2184 0, /* src_basereg */ \
2185 12 /* mask_basereg */
2187 /******************************************************************************/
/*
 * over_8888_n_8888: OVER operator with an a8r8g8b8 source and a solid
 * mask.  Built on top of out_reverse: OVER = src*mask + dst*(1 - a),
 * so the tail just saturating-adds the masked source (q0, q1) to the
 * out_reverse result (q14, q15).
 */
2189 .macro pixman_composite_over_8888_n_8888_process_pixblock_head
2190 pixman_composite_out_reverse_8888_n_8888_process_pixblock_head

2193 .macro pixman_composite_over_8888_n_8888_process_pixblock_tail
2194 pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
/* result = masked_src + dst*(1-a), with saturation */
2195 vqadd.u8 q14, q0, q14
2196 vqadd.u8 q15, q1, q15

2199 /* TODO: expand macros and do better instructions scheduling */
2200 .macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
2201 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
2202 pixman_composite_over_8888_n_8888_process_pixblock_tail
2205 pixman_composite_over_8888_n_8888_process_pixblock_head
2206 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!

2209 .macro pixman_composite_over_8888_n_8888_init
/* load the 32-bit solid mask into lane 0 of d15 (mask_basereg for this op) */
2212 vld1.32 {d15[0]}, [DUMMY]

/* NOTE(review): cleanup body elided in this listing — presumably restores
 * the d8-d15 VFP registers saved by init; confirm against full source */
2216 .macro pixman_composite_over_8888_n_8888_cleanup

/* src_bpp=32, mask_bpp=0 (solid), dst_bpp=32 */
2220 generate_composite_function \
2221 pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
2222 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2223 8, /* number of pixels, processed in a single block */ \
2224 5, /* prefetch distance */ \
2225 pixman_composite_over_8888_n_8888_init, \
2226 pixman_composite_over_8888_n_8888_cleanup, \
2227 pixman_composite_over_8888_n_8888_process_pixblock_head, \
2228 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
2229 pixman_composite_over_8888_n_8888_process_pixblock_tail_head
2231 /******************************************************************************/
/*
 * over_8888_8888_8888: OVER with an a8r8g8b8 source AND an a8r8g8b8
 * mask.  Reuses the over_8888_n_8888 head/tail (which consume the mask
 * alpha from d15); only the tail_head differs, reloading per-pixel data.
 */
2233 /* TODO: expand macros and do better instructions scheduling */
2234 .macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
2235 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
2236 pixman_composite_over_8888_n_8888_process_pixblock_tail
2240 pixman_composite_over_8888_n_8888_process_pixblock_head
2241 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!

/* src_bpp=32, mask_bpp=32, dst_bpp=32 */
2244 generate_composite_function \
2245 pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
2246 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2247 8, /* number of pixels, processed in a single block */ \
2248 5, /* prefetch distance */ \
2249 default_init_need_all_regs, \
2250 default_cleanup_need_all_regs, \
2251 pixman_composite_over_8888_n_8888_process_pixblock_head, \
2252 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
2253 pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
2254 28, /* dst_w_basereg */ \
2255 4, /* dst_r_basereg */ \
2256 0, /* src_basereg */ \
2257 12 /* mask_basereg */

/* per-scanline (unscaled) variant used by the generic pipeline */
2259 generate_composite_function_single_scanline \
2260 pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
2261 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2262 8, /* number of pixels, processed in a single block */ \
2263 default_init_need_all_regs, \
2264 default_cleanup_need_all_regs, \
2265 pixman_composite_over_8888_n_8888_process_pixblock_head, \
2266 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
2267 pixman_composite_over_8888_8888_8888_process_pixblock_tail_head \
2268 28, /* dst_w_basereg */ \
2269 4, /* dst_r_basereg */ \
2270 0, /* src_basereg */ \
2271 12 /* mask_basereg */
2273 /******************************************************************************/
/*
 * over_8888_8_8888: OVER with an a8r8g8b8 source, an a8 mask and an
 * a8r8g8b8 destination.  Same pipeline as over_8888_n_8888, but with
 * the mask coming from memory (mask_basereg 15 = d15 holds 8 mask
 * bytes per block).
 */
2275 /* TODO: expand macros and do better instructions scheduling */
2276 .macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
2277 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
2278 pixman_composite_over_8888_n_8888_process_pixblock_tail
2282 pixman_composite_over_8888_n_8888_process_pixblock_head
2283 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!

/* src_bpp=32, mask_bpp=8, dst_bpp=32 */
2286 generate_composite_function \
2287 pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
2288 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2289 8, /* number of pixels, processed in a single block */ \
2290 5, /* prefetch distance */ \
2291 default_init_need_all_regs, \
2292 default_cleanup_need_all_regs, \
2293 pixman_composite_over_8888_n_8888_process_pixblock_head, \
2294 pixman_composite_over_8888_n_8888_process_pixblock_tail, \
2295 pixman_composite_over_8888_8_8888_process_pixblock_tail_head \
2296 28, /* dst_w_basereg */ \
2297 4, /* dst_r_basereg */ \
2298 0, /* src_basereg */ \
2299 15 /* mask_basereg */
2301 /******************************************************************************/
/*
 * src_0888_0888: plain 24bpp -> 24bpp copy.  No per-pixel arithmetic is
 * needed, so head/tail are empty and the tail_head just stores the
 * three interleaved channel registers loaded by the template.
 */
2303 .macro pixman_composite_src_0888_0888_process_pixblock_head

2306 .macro pixman_composite_src_0888_0888_process_pixblock_tail

2309 .macro pixman_composite_src_0888_0888_process_pixblock_tail_head
/* write 8 r8g8b8 pixels (3 bytes each) */
2310 vst3.8 {d0, d1, d2}, [DST_W]!

/* src_bpp=24, no mask, dst_bpp=24; destination is write-only */
2315 generate_composite_function \
2316 pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
2317 FLAG_DST_WRITEONLY, \
2318 8, /* number of pixels, processed in a single block */ \
2319 10, /* prefetch distance */ \
2322 pixman_composite_src_0888_0888_process_pixblock_head, \
2323 pixman_composite_src_0888_0888_process_pixblock_tail, \
2324 pixman_composite_src_0888_0888_process_pixblock_tail_head, \
2325 0, /* dst_w_basereg */ \
2326 0, /* dst_r_basereg */ \
2327 0, /* src_basereg */ \
2328 0 /* mask_basereg */
2330 /******************************************************************************/
/*
 * src_0888_8888_rev: convert 24bpp (channel-reversed b8g8r8) to 32bpp
 * x8r8g8b8.  The visible tail_head stores four deinterleaved channel
 * registers; the init macro (body elided in this listing) presumably
 * sets up the constant alpha channel — confirm against full source.
 */
2332 .macro pixman_composite_src_0888_8888_rev_process_pixblock_head

2336 .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail

2339 .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
/* write 8 pixels as 4 deinterleaved planes (b, g, r, x) */
2340 vst4.8 {d0, d1, d2, d3}, [DST_W]!

2346 .macro pixman_composite_src_0888_8888_rev_init

/* src_bpp=24, no mask, dst_bpp=32; destination is write-only */
2350 generate_composite_function \
2351 pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
2352 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2353 8, /* number of pixels, processed in a single block */ \
2354 10, /* prefetch distance */ \
2355 pixman_composite_src_0888_8888_rev_init, \
2357 pixman_composite_src_0888_8888_rev_process_pixblock_head, \
2358 pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
2359 pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
2360 0, /* dst_w_basereg */ \
2361 0, /* dst_r_basereg */ \
2362 0, /* src_basereg */ \
2363 0 /* mask_basereg */
2365 /******************************************************************************/
/*
 * src_0888_0565_rev: convert channel-reversed 24bpp to r5g6b5.  The
 * 0565 value is packed by shifting one channel into the top bits
 * (vshll #8) and right-shift-inserting the other two (vsri #5, #11).
 * The head body is elided in this listing; q8/q9 are presumably the
 * widened g/b channels it prepares — confirm against full source.
 */
2367 .macro pixman_composite_src_0888_0565_rev_process_pixblock_head

2372 .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
/* pack r (top 5 bits), then insert g at bit 5 and b at bit 0 */
2373 vshll.u8 q14, d0, #8
2374 vsri.u16 q14, q8, #5
2375 vsri.u16 q14, q9, #11

2378 .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
2379 vshll.u8 q14, d0, #8
2381 vsri.u16 q14, q8, #5
2382 vsri.u16 q14, q9, #11
/* write 8 r5g6b5 pixels */
2384 vst1.16 {d28, d29}, [DST_W, :128]!

/* src_bpp=24, no mask, dst_bpp=16; destination is write-only */
2388 generate_composite_function \
2389 pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
2390 FLAG_DST_WRITEONLY, \
2391 8, /* number of pixels, processed in a single block */ \
2392 10, /* prefetch distance */ \
2395 pixman_composite_src_0888_0565_rev_process_pixblock_head, \
2396 pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
2397 pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
2398 28, /* dst_w_basereg */ \
2399 0, /* dst_r_basereg */ \
2400 0, /* src_basereg */ \
2401 0 /* mask_basereg */
2403 /******************************************************************************/
/*
 * src_pixbuf_8888: convert a non-premultiplied pixbuf to premultiplied
 * a8r8g8b8 — each color channel is multiplied by the alpha in d3 and
 * renormalized (vrshr + vraddhn, rounded /255).  The PF-prefixed lines
 * are the software-prefetch state machine shared by this file's
 * templates (advance PF_X, issue pld, step to the next scanline).
 */
2405 .macro pixman_composite_src_pixbuf_8888_process_pixblock_head
/* channel * alpha (16-bit products); companion q8/q9 multiplies are
 * elided in this listing */
2408 vmull.u8 q10, d3, d2

2411 .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
/* rounded renormalization of the products into d28..d30
 * (note d30/d28 order — channel swap relative to rpixbuf below) */
2412 vrshr.u16 q11, q8, #8
2414 vrshr.u16 q12, q9, #8
2415 vrshr.u16 q13, q10, #8
2416 vraddhn.u16 d30, q11, q8
2417 vraddhn.u16 d29, q12, q9
2418 vraddhn.u16 d28, q13, q10

2421 .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
2422 vrshr.u16 q11, q8, #8
2424 vrshr.u16 q12, q9, #8
2425 vrshr.u16 q13, q10, #8
2427 vraddhn.u16 d30, q11, q8
2428 PF add PF_X, PF_X, #8
2430 PF addne PF_X, PF_X, #8
2431 PF subne PF_CTL, PF_CTL, #1
2432 vraddhn.u16 d29, q12, q9
2433 vraddhn.u16 d28, q13, q10
2436 vmull.u8 q10, d3, d2
/* write 8 premultiplied pixels */
2437 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
2439 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
2440 PF subge PF_X, PF_X, ORIG_W
2441 PF subges PF_CTL, PF_CTL, #0x10
2442 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!

/* src_bpp=32, no mask, dst_bpp=32; destination is write-only */
2445 generate_composite_function \
2446 pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
2447 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2448 8, /* number of pixels, processed in a single block */ \
2449 10, /* prefetch distance */ \
2452 pixman_composite_src_pixbuf_8888_process_pixblock_head, \
2453 pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
2454 pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
2455 28, /* dst_w_basereg */ \
2456 0, /* dst_r_basereg */ \
2457 0, /* src_basereg */ \
2458 0 /* mask_basereg */
2460 /******************************************************************************/
/*
 * src_rpixbuf_8888: identical pipeline to src_pixbuf_8888 above, except
 * the renormalized products land in d28..d30 in the opposite order
 * (d28 <- q8-product, d30 <- q10-product), i.e. the red/blue channels
 * are swapped in the output.
 */
2462 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_head
/* channel * alpha; companion q8/q9 multiplies are elided in this listing */
2465 vmull.u8 q10, d3, d2

2468 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail
/* rounded renormalization; d28/d30 assignment order is the only
 * difference from src_pixbuf_8888 */
2469 vrshr.u16 q11, q8, #8
2471 vrshr.u16 q12, q9, #8
2472 vrshr.u16 q13, q10, #8
2473 vraddhn.u16 d28, q11, q8
2474 vraddhn.u16 d29, q12, q9
2475 vraddhn.u16 d30, q13, q10

2478 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
2479 vrshr.u16 q11, q8, #8
2481 vrshr.u16 q12, q9, #8
2482 vrshr.u16 q13, q10, #8
2484 vraddhn.u16 d28, q11, q8
2485 PF add PF_X, PF_X, #8
2487 PF addne PF_X, PF_X, #8
2488 PF subne PF_CTL, PF_CTL, #1
2489 vraddhn.u16 d29, q12, q9
2490 vraddhn.u16 d30, q13, q10
2493 vmull.u8 q10, d3, d2
/* write 8 premultiplied pixels */
2494 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
2496 PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
2497 PF subge PF_X, PF_X, ORIG_W
2498 PF subges PF_CTL, PF_CTL, #0x10
2499 PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!

/* src_bpp=32, no mask, dst_bpp=32; destination is write-only */
2502 generate_composite_function \
2503 pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
2504 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2505 8, /* number of pixels, processed in a single block */ \
2506 10, /* prefetch distance */ \
2509 pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
2510 pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
2511 pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
2512 28, /* dst_w_basereg */ \
2513 0, /* dst_r_basereg */ \
2514 0, /* src_basereg */ \
2515 0 /* mask_basereg */
2517 /******************************************************************************/
/*
 * over_0565_8_0565: OVER with an r5g6b5 source, an a8 mask and an
 * r5g6b5 destination.  Both 0565 operands are first expanded to planar
 * x888, the source is multiplied by the mask (rounded /255), the
 * destination is faded by the inverted alpha, and the sum is repacked
 * to 0565 in the tail.
 */
2519 .macro pixman_composite_over_0565_8_0565_process_pixblock_head
2520 /* mask is in d15 */
2521 convert_0565_to_x888 q4, d2, d1, d0
2522 convert_0565_to_x888 q5, d6, d5, d4
2523 /* source pixel data is in {d0, d1, d2, XX} */
2524 /* destination pixel data is in {d4, d5, d6, XX} */
/* src *= mask (widening multiplies) */
2526 vmull.u8 q6, d15, d2
2527 vmull.u8 q5, d15, d1
2528 vmull.u8 q4, d15, d0
/* NOTE(review): d7 is multiplied into q13 here but its producer (the
 * inverted-alpha computation and the q8/q9 dst multiplies) is elided
 * in this listing — confirm against full source */
2531 vmull.u8 q13, d7, d6
/* renormalize masked source back to 8 bits (rounded /255) */
2532 vrshr.u16 q12, q6, #8
2533 vrshr.u16 q11, q5, #8
2534 vrshr.u16 q10, q4, #8
2535 vraddhn.u16 d2, q6, q12
2536 vraddhn.u16 d1, q5, q11
2537 vraddhn.u16 d0, q4, q10

2540 .macro pixman_composite_over_0565_8_0565_process_pixblock_tail
/* renormalize faded destination, add masked source, repack to 0565 */
2541 vrshr.u16 q14, q8, #8
2542 vrshr.u16 q15, q9, #8
2543 vrshr.u16 q12, q13, #8
2544 vraddhn.u16 d28, q14, q8
2545 vraddhn.u16 d29, q15, q9
2546 vraddhn.u16 d30, q12, q13
2547 vqadd.u8 q0, q0, q14
2548 vqadd.u8 q1, q1, q15
2549 /* 32bpp result is in {d0, d1, d2, XX} */
2550 convert_8888_to_0565 d2, d1, d0, q14, q15, q3

2553 /* TODO: expand macros and do better instructions scheduling */
2554 .macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
2556 pixman_composite_over_0565_8_0565_process_pixblock_tail
/* read next 8 destination 0565 pixels */
2558 vld1.16 {d10, d11}, [DST_R, :128]!
2560 pixman_composite_over_0565_8_0565_process_pixblock_head
2561 vst1.16 {d28, d29}, [DST_W, :128]!

/* src_bpp=16, mask_bpp=8, dst_bpp=16 */
2564 generate_composite_function \
2565 pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
2566 FLAG_DST_READWRITE, \
2567 8, /* number of pixels, processed in a single block */ \
2568 5, /* prefetch distance */ \
2569 default_init_need_all_regs, \
2570 default_cleanup_need_all_regs, \
2571 pixman_composite_over_0565_8_0565_process_pixblock_head, \
2572 pixman_composite_over_0565_8_0565_process_pixblock_tail, \
2573 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
2574 28, /* dst_w_basereg */ \
2575 10, /* dst_r_basereg */ \
2576 8, /* src_basereg */ \
2577 15 /* mask_basereg */
2579 /******************************************************************************/
/*
 * over_0565_n_0565: same operation as over_0565_8_0565 but with a
 * solid mask — init loads the mask value into d15 (the register the
 * shared pixblock macros read the mask from) instead of loading it
 * per pixel.
 */
2581 .macro pixman_composite_over_0565_n_0565_init
/* solid mask is the second stack argument */
2582 add DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
2584 vld1.32 {d15[0]}, [DUMMY]

/* NOTE(review): cleanup body elided in this listing — presumably restores
 * saved VFP registers; confirm against full source */
2588 .macro pixman_composite_over_0565_n_0565_cleanup

/* src_bpp=16, mask_bpp=0 (solid), dst_bpp=16 */
2592 generate_composite_function \
2593 pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \
2594 FLAG_DST_READWRITE, \
2595 8, /* number of pixels, processed in a single block */ \
2596 5, /* prefetch distance */ \
2597 pixman_composite_over_0565_n_0565_init, \
2598 pixman_composite_over_0565_n_0565_cleanup, \
2599 pixman_composite_over_0565_8_0565_process_pixblock_head, \
2600 pixman_composite_over_0565_8_0565_process_pixblock_tail, \
2601 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
2602 28, /* dst_w_basereg */ \
2603 10, /* dst_r_basereg */ \
2604 8, /* src_basereg */ \
2605 15 /* mask_basereg */
2607 /******************************************************************************/
/*
 * add_0565_8_0565: ADD with an r5g6b5 source, an a8 mask and an r5g6b5
 * destination.  Both 0565 operands are expanded to planar x888, the
 * source is multiplied by the mask and renormalized (rounded /255),
 * and the tail repacks the result to 0565.  (The saturating add of the
 * destination falls in lines elided from this listing.)
 */
2609 .macro pixman_composite_add_0565_8_0565_process_pixblock_head
2610 /* mask is in d15 */
2611 convert_0565_to_x888 q4, d2, d1, d0
2612 convert_0565_to_x888 q5, d6, d5, d4
2613 /* source pixel data is in {d0, d1, d2, XX} */
2614 /* destination pixel data is in {d4, d5, d6, XX} */
/* src *= mask, then rounded renormalization back to 8 bits */
2615 vmull.u8 q6, d15, d2
2616 vmull.u8 q5, d15, d1
2617 vmull.u8 q4, d15, d0
2618 vrshr.u16 q12, q6, #8
2619 vrshr.u16 q11, q5, #8
2620 vrshr.u16 q10, q4, #8
2621 vraddhn.u16 d2, q6, q12
2622 vraddhn.u16 d1, q5, q11
2623 vraddhn.u16 d0, q4, q10

2626 .macro pixman_composite_add_0565_8_0565_process_pixblock_tail
2629 /* 32bpp result is in {d0, d1, d2, XX} */
2630 convert_8888_to_0565 d2, d1, d0, q14, q15, q3

2633 /* TODO: expand macros and do better instructions scheduling */
2634 .macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
2636 pixman_composite_add_0565_8_0565_process_pixblock_tail
/* read next 8 destination 0565 pixels */
2638 vld1.16 {d10, d11}, [DST_R, :128]!
2640 pixman_composite_add_0565_8_0565_process_pixblock_head
2641 vst1.16 {d28, d29}, [DST_W, :128]!

/* src_bpp=16, mask_bpp=8, dst_bpp=16 */
2644 generate_composite_function \
2645 pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
2646 FLAG_DST_READWRITE, \
2647 8, /* number of pixels, processed in a single block */ \
2648 5, /* prefetch distance */ \
2649 default_init_need_all_regs, \
2650 default_cleanup_need_all_regs, \
2651 pixman_composite_add_0565_8_0565_process_pixblock_head, \
2652 pixman_composite_add_0565_8_0565_process_pixblock_tail, \
2653 pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
2654 28, /* dst_w_basereg */ \
2655 10, /* dst_r_basereg */ \
2656 8, /* src_basereg */ \
2657 15 /* mask_basereg */
2659 /******************************************************************************/
/*
 * out_reverse_8_0565: OUT_reverse with an a8 source and an r5g6b5
 * destination: dst = dst * (1 - src).  The destination is expanded to
 * planar x888, each channel multiplied by the inverted source alpha,
 * renormalized (rounded /255) and repacked to 0565.
 */
2661 .macro pixman_composite_out_reverse_8_0565_process_pixblock_head
2662 /* mask is in d15 */
2663 convert_0565_to_x888 q5, d6, d5, d4
2664 /* destination pixel data is in {d4, d5, d6, xx} */
2665 vmvn.8 d24, d15 /* get inverted alpha */
2666 /* now do alpha blending */
2667 vmull.u8 q8, d24, d4
2668 vmull.u8 q9, d24, d5
2669 vmull.u8 q10, d24, d6

2672 .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
/* rounded renormalization, then repack to 0565 */
2673 vrshr.u16 q14, q8, #8
2674 vrshr.u16 q15, q9, #8
2675 vrshr.u16 q12, q10, #8
2676 vraddhn.u16 d0, q14, q8
2677 vraddhn.u16 d1, q15, q9
2678 vraddhn.u16 d2, q12, q10
2679 /* 32bpp result is in {d0, d1, d2, XX} */
2680 convert_8888_to_0565 d2, d1, d0, q14, q15, q3

2683 /* TODO: expand macros and do better instructions scheduling */
2684 .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
2686 pixman_composite_out_reverse_8_0565_process_pixblock_tail
2687 vld1.16 {d10, d11}, [DST_R, :128]!
2689 pixman_composite_out_reverse_8_0565_process_pixblock_head
2690 vst1.16 {d28, d29}, [DST_W, :128]!

/* src_bpp=8, no mask, dst_bpp=16 */
2693 generate_composite_function \
2694 pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
2695 FLAG_DST_READWRITE, \
2696 8, /* number of pixels, processed in a single block */ \
2697 5, /* prefetch distance */ \
2698 default_init_need_all_regs, \
2699 default_cleanup_need_all_regs, \
2700 pixman_composite_out_reverse_8_0565_process_pixblock_head, \
2701 pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
2702 pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
2703 28, /* dst_w_basereg */ \
2704 10, /* dst_r_basereg */ \
2705 15, /* src_basereg */ \
2706 0 /* mask_basereg */
2708 /******************************************************************************/
/*
 * out_reverse_8_8888: OUT_reverse with an a8 source and an a8r8g8b8
 * destination: dst = dst * (1 - src).  Each destination channel is
 * multiplied by the inverted source alpha (d1 = ~d0) and renormalized
 * with the rounded /255 idiom in the tail.
 */
2710 .macro pixman_composite_out_reverse_8_8888_process_pixblock_head
2712 /* destination pixel data is in {d4, d5, d6, d7} */
2713 vmvn.8 d1, d0 /* get inverted alpha */
2714 /* now do alpha blending */
/* NOTE(review): q8/q9 multiplies for the first two channels are elided
 * in this listing — confirm against full source */
2717 vmull.u8 q10, d1, d6
2718 vmull.u8 q11, d1, d7

2721 .macro pixman_composite_out_reverse_8_8888_process_pixblock_tail
/* rounded renormalization into write-back regs d28-d31 */
2722 vrshr.u16 q14, q8, #8
2723 vrshr.u16 q15, q9, #8
2724 vrshr.u16 q12, q10, #8
2725 vrshr.u16 q13, q11, #8
2726 vraddhn.u16 d28, q14, q8
2727 vraddhn.u16 d29, q15, q9
2728 vraddhn.u16 d30, q12, q10
2729 vraddhn.u16 d31, q13, q11
2730 /* 32bpp result is in {d28, d29, d30, d31} */

2733 /* TODO: expand macros and do better instructions scheduling */
2734 .macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head
2736 pixman_composite_out_reverse_8_8888_process_pixblock_tail
2737 vld4.8 {d4, d5, d6, d7}, [DST_R, :128]!
2739 pixman_composite_out_reverse_8_8888_process_pixblock_head
2740 vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!

/* src_bpp=8, no mask, dst_bpp=32 */
2743 generate_composite_function \
2744 pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, \
2745 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2746 8, /* number of pixels, processed in a single block */ \
2747 5, /* prefetch distance */ \
2750 pixman_composite_out_reverse_8_8888_process_pixblock_head, \
2751 pixman_composite_out_reverse_8_8888_process_pixblock_tail, \
2752 pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \
2753 28, /* dst_w_basereg */ \
2754 4, /* dst_r_basereg */ \
2755 0, /* src_basereg */ \
2756 0 /* mask_basereg */
2758 /******************************************************************************/
/*
 * Nearest-neighbour scaled scanline entry points.  Each invocation
 * reuses the pixblock head/tail/tail_head macros of the corresponding
 * unscaled operation defined earlier in this file; only the fetch
 * pattern differs (handled inside the template macro).
 */
/* OVER a8r8g8b8 -> a8r8g8b8 */
2760 generate_composite_function_nearest_scanline \
2761 pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
2762 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2763 8, /* number of pixels, processed in a single block */ \
2766 pixman_composite_over_8888_8888_process_pixblock_head, \
2767 pixman_composite_over_8888_8888_process_pixblock_tail, \
2768 pixman_composite_over_8888_8888_process_pixblock_tail_head

/* OVER a8r8g8b8 -> r5g6b5 */
2770 generate_composite_function_nearest_scanline \
2771 pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
2772 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2773 8, /* number of pixels, processed in a single block */ \
2776 pixman_composite_over_8888_0565_process_pixblock_head, \
2777 pixman_composite_over_8888_0565_process_pixblock_tail, \
2778 pixman_composite_over_8888_0565_process_pixblock_tail_head, \
2779 28, /* dst_w_basereg */ \
2780 4, /* dst_r_basereg */ \
2781 0, /* src_basereg */ \
2782 24 /* mask_basereg */

/* SRC a8r8g8b8 -> r5g6b5 (pure conversion, write-only destination) */
2784 generate_composite_function_nearest_scanline \
2785 pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
2786 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2787 8, /* number of pixels, processed in a single block */ \
2790 pixman_composite_src_8888_0565_process_pixblock_head, \
2791 pixman_composite_src_8888_0565_process_pixblock_tail, \
2792 pixman_composite_src_8888_0565_process_pixblock_tail_head

/* SRC r5g6b5 -> a8r8g8b8 (pure conversion, write-only destination) */
2794 generate_composite_function_nearest_scanline \
2795 pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
2796 FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
2797 8, /* number of pixels, processed in a single block */ \
2800 pixman_composite_src_0565_8888_process_pixblock_head, \
2801 pixman_composite_src_0565_8888_process_pixblock_tail, \
2802 pixman_composite_src_0565_8888_process_pixblock_tail_head

/* OVER a8r8g8b8 with a8 mask -> r5g6b5 */
2804 generate_composite_function_nearest_scanline \
2805 pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \
2806 FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
2807 8, /* number of pixels, processed in a single block */ \
2808 default_init_need_all_regs, \
2809 default_cleanup_need_all_regs, \
2810 pixman_composite_over_8888_8_0565_process_pixblock_head, \
2811 pixman_composite_over_8888_8_0565_process_pixblock_tail, \
2812 pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
2813 28, /* dst_w_basereg */ \
2814 4, /* dst_r_basereg */ \
2815 8, /* src_basereg */ \
2816 24 /* mask_basereg */

/* OVER r5g6b5 with a8 mask -> r5g6b5 */
2818 generate_composite_function_nearest_scanline \
2819 pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \
2820 FLAG_DST_READWRITE, \
2821 8, /* number of pixels, processed in a single block */ \
2822 default_init_need_all_regs, \
2823 default_cleanup_need_all_regs, \
2824 pixman_composite_over_0565_8_0565_process_pixblock_head, \
2825 pixman_composite_over_0565_8_0565_process_pixblock_tail, \
2826 pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
2827 28, /* dst_w_basereg */ \
2828 10, /* dst_r_basereg */ \
2829 8, /* src_basereg */ \
2830 15 /* mask_basereg */
2832 /******************************************************************************/
2835 * Bilinear scaling support code which tries to provide pixel fetching, color
2836 * format conversion, and interpolation as separate macros which can be used
2837 * as the basic building blocks for constructing bilinear scanline functions.
/*
 * Bilinear building blocks.  Conventions visible in this code:
 *   X      - current source x coordinate in 16.16 fixed point
 *   TOP    - pointer to the top source scanline; STRIDE reaches the
 *            bottom scanline from it
 *   d28/d29 - vertical interpolation weights (top/bottom)
 * Each load macro fetches the two vertically adjacent pixels for one
 * (or two/four) output pixel(s).
 */
2840 .macro bilinear_load_8888 reg1, reg2, tmp
/* integer part of X; *4 bytes per 8888 pixel */
2841 mov TMP1, X, asr #16
2843 add TMP1, TOP, TMP1, asl #2
/* top pixel, then bottom pixel (STRIDE away) */
2844 vld1.32 {reg1}, [TMP1], STRIDE
2845 vld1.32 {reg2}, [TMP1]

2848 .macro bilinear_load_0565 reg1, reg2, tmp
/* integer part of X; *2 bytes per 0565 pixel */
2849 mov TMP1, X, asr #16
2851 add TMP1, TOP, TMP1, asl #1
2852 vld1.32 {reg2[0]}, [TMP1], STRIDE
2853 vld1.32 {reg2[1]}, [TMP1]
/* widen both 0565 pixels to x888 before interpolation */
2854 convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp

/* two pixels: load + vertical lerp (acc = top*d28 + bottom*d29) */
2857 .macro bilinear_load_and_vertical_interpolate_two_8888 \
2858 acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
2860 bilinear_load_8888 reg1, reg2, tmp1
2861 vmull.u8 acc1, reg1, d28
2862 vmlal.u8 acc1, reg2, d29
2863 bilinear_load_8888 reg3, reg4, tmp2
2864 vmull.u8 acc2, reg3, d28
2865 vmlal.u8 acc2, reg4, d29

/* four pixels: simply two invocations of the two-pixel variant */
2868 .macro bilinear_load_and_vertical_interpolate_four_8888 \
2869 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
2870 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
2872 bilinear_load_and_vertical_interpolate_two_8888 \
2873 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
2874 bilinear_load_and_vertical_interpolate_two_8888 \
2875 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi

/* 0565 two-pixel variant: gathers four 16-bit pixels into lanes,
 * widens them to x888, then does the same vertical lerp */
2878 .macro bilinear_load_and_vertical_interpolate_two_0565 \
2879 acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
2881 mov TMP1, X, asr #16
2883 add TMP1, TOP, TMP1, asl #1
2884 mov TMP2, X, asr #16
2886 add TMP2, TOP, TMP2, asl #1
2887 vld1.32 {acc2lo[0]}, [TMP1], STRIDE
2888 vld1.32 {acc2hi[0]}, [TMP2], STRIDE
2889 vld1.32 {acc2lo[1]}, [TMP1]
2890 vld1.32 {acc2hi[1]}, [TMP2]
2891 convert_0565_to_x888 acc2, reg3, reg2, reg1
2896 vmull.u8 acc1, reg1, d28
2897 vmlal.u8 acc1, reg2, d29
2898 vmull.u8 acc2, reg3, d28
2899 vmlal.u8 acc2, reg4, d29

/* 0565 four-pixel variant: loads for the second pair are interleaved
 * with vzip shuffles of the first pair to hide load latency */
2902 .macro bilinear_load_and_vertical_interpolate_four_0565 \
2903 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
2904 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
2906 mov TMP1, X, asr #16
2908 add TMP1, TOP, TMP1, asl #1
2909 mov TMP2, X, asr #16
2911 add TMP2, TOP, TMP2, asl #1
2912 vld1.32 {xacc2lo[0]}, [TMP1], STRIDE
2913 vld1.32 {xacc2hi[0]}, [TMP2], STRIDE
2914 vld1.32 {xacc2lo[1]}, [TMP1]
2915 vld1.32 {xacc2hi[1]}, [TMP2]
2916 convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
2917 mov TMP1, X, asr #16
2919 add TMP1, TOP, TMP1, asl #1
2920 mov TMP2, X, asr #16
2922 add TMP2, TOP, TMP2, asl #1
2923 vld1.32 {yacc2lo[0]}, [TMP1], STRIDE
2924 vzip.u8 xreg1, xreg3
2925 vld1.32 {yacc2hi[0]}, [TMP2], STRIDE
2926 vzip.u8 xreg2, xreg4
2927 vld1.32 {yacc2lo[1]}, [TMP1]
2928 vzip.u8 xreg3, xreg4
2929 vld1.32 {yacc2hi[1]}, [TMP2]
2930 vzip.u8 xreg1, xreg2
2931 convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
2932 vmull.u8 xacc1, xreg1, d28
2933 vzip.u8 yreg1, yreg3
2934 vmlal.u8 xacc1, xreg2, d29
2935 vzip.u8 yreg2, yreg4
2936 vmull.u8 xacc2, xreg3, d28
2937 vzip.u8 yreg3, yreg4
2938 vmlal.u8 xacc2, xreg4, d29
2939 vzip.u8 yreg1, yreg2
2940 vmull.u8 yacc1, yreg1, d28
2941 vmlal.u8 yacc1, yreg2, d29
2942 vmull.u8 yacc2, yreg3, d28
2943 vmlal.u8 yacc2, yreg4, d29

/* store numpix interpolated 8888 pixels from d0/d1
 * (the .if/.elseif selectors between cases are elided in this listing) */
2946 .macro bilinear_store_8888 numpix, tmp1, tmp2
2948 vst1.32 {d0, d1}, [OUT, :128]!
2950 vst1.32 {d0}, [OUT, :64]!
2952 vst1.32 {d0[0]}, [OUT, :32]!
2954 .error bilinear_store_8888 numpix is unsupported

/* store numpix interpolated pixels repacked to 0565 */
2958 .macro bilinear_store_0565 numpix, tmp1, tmp2
2963 convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
2965 vst1.16 {d2}, [OUT, :64]!
2967 vst1.32 {d2[0]}, [OUT, :32]!
2969 vst1.16 {d2[0]}, [OUT, :16]!
2971 .error bilinear_store_0565 numpix is unsupported
/*
 * Bilinear interpolation of 1/2/4/8 output pixels.  After the vertical
 * lerp (done by the load macros), the horizontal lerp is computed as
 *   result = (v << BITS) - v*wl + v_next*wl   (vshll / vmlsl / vmlal)
 * then narrowed by 2*BILINEAR_INTERPOLATION_BITS.  q12 holds the
 * per-pixel horizontal weights, q13 the per-step increment, q15 the
 * extracted weights (d30/d31).
 */
2975 .macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
2976 bilinear_load_&src_fmt d0, d1, d2
/* vertical lerp: top*d28 + bottom*d29 */
2977 vmull.u8 q1, d0, d28
2978 vmlal.u8 q1, d1, d29
2979 /* 5 cycles bubble */
/* horizontal lerp */
2980 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
2981 vmlsl.u16 q0, d2, d30
2982 vmlal.u16 q0, d3, d30
2983 /* 5 cycles bubble */
2984 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
2985 /* 3 cycles bubble */
2987 /* 1 cycle bubble */
2988 bilinear_store_&dst_fmt 1, q2, q3

2991 .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
2992 bilinear_load_and_vertical_interpolate_two_&src_fmt \
2993 q1, q11, d0, d1, d20, d21, d22, d23
/* horizontal lerp for both pixels (d30 and d31 weights) */
2994 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
2995 vmlsl.u16 q0, d2, d30
2996 vmlal.u16 q0, d3, d30
2997 vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
2998 vmlsl.u16 q10, d22, d31
2999 vmlal.u16 q10, d23, d31
3000 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
3001 vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
/* advance the horizontal weight accumulator for the next step */
3002 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3003 vadd.u16 q12, q12, q13
3005 bilinear_store_&dst_fmt 2, q2, q3

3008 .macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
3009 bilinear_load_and_vertical_interpolate_four_&src_fmt \
3010 q1, q11, d0, d1, d20, d21, d22, d23 \
3011 q3, q9, d4, d5, d16, d17, d18, d19
3013 sub TMP1, TMP1, STRIDE
3014 vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
3015 vmlsl.u16 q0, d2, d30
3016 vmlal.u16 q0, d3, d30
3017 vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
3018 vmlsl.u16 q10, d22, d31
3019 vmlal.u16 q10, d23, d31
3020 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3021 vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS
3022 vmlsl.u16 q2, d6, d30
3023 vmlal.u16 q2, d7, d30
3024 vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
3026 vmlsl.u16 q8, d18, d31
3027 vmlal.u16 q8, d19, d31
3028 vadd.u16 q12, q12, q13
3029 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
3030 vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
3031 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
3032 vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
3033 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3036 vadd.u16 q12, q12, q13
3037 bilinear_store_&dst_fmt 4, q2, q3

/*
 * The *_head/_tail/_tail_head wrappers below dispatch to a hand-tuned
 * per-format implementation when one is defined (detected via .ifdef),
 * and fall back to the generic four/two-pixel macros otherwise.
 */
3040 .macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
3041 .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
3042 bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
3044 bilinear_interpolate_four_pixels src_fmt, dst_fmt

3048 .macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
3049 .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
3050 bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail

3054 .macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
3055 .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
3056 bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
3058 bilinear_interpolate_four_pixels src_fmt, dst_fmt

3062 .macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
3063 .ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
3064 bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
/* generic fallback: eight pixels = two four-pixel stages */
3066 bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
3067 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt

3071 .macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
3072 .ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
3073 bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
3075 bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt

3079 .macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
3080 .ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
3081 bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
3083 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
3084 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
/*
 * Flag bits for the 'flags' argument of generate_bilinear_scanline_func:
 * UNROLL_8 selects the 8-pixels-per-iteration main loop (otherwise 4),
 * USE_ALL_NEON_REGS guards save/restore of the extra NEON registers.
 */
3088 .set BILINEAR_FLAG_UNROLL_4, 0
3089 .set BILINEAR_FLAG_UNROLL_8, 1
3090 .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
3093 * Main template macro for generating NEON optimized bilinear scanline
3096 * Bilinear scanline scaler macro template uses the following arguments:
3097 * fname - name of the function to generate
3098 * src_fmt - source color format (8888 or 0565)
3099 * dst_fmt - destination color format (8888 or 0565)
3100 * bpp_shift - (1 << bpp_shift) is the size of source pixel in bytes
3101 * prefetch_distance - prefetch in the source image by that many
/*
 * Template for a complete bilinear scanline scaler.  Structure:
 *   1. prologue: save r4-r9, load (WB, X, UX, WIDTH) args, scale the
 *      prefetch offset by UX
 *   2. alignment peeling: emit 1-, then 2- (and for UNROLL_8 also 4-)
 *      pixel steps until OUT is aligned for the main loop's stores
 *   3. main loop: head / tail_head / tail software pipeline over 8 or
 *      4 pixels per iteration, chosen by BILINEAR_FLAG_UNROLL_8
 *   4. trailing 2- and 1-pixel leftovers, then epilogue.
 * (Branch instructions between the visible steps are elided in this
 * listing.)
 */
3105 .macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
3106 src_bpp_shift, dst_bpp_shift, \
3107 prefetch_distance, flags
3109 pixman_asm_function fname
/* save callee-saved core registers used below */
3126 push {r4, r5, r6, r7, r8, r9}
3127 mov PF_OFFS, #prefetch_distance
3128 ldmia ip, {WB, X, UX, WIDTH}
/* scale prefetch distance by the per-pixel x increment */
3129 mul PF_OFFS, PF_OFFS, UX
3131 .if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
/* STRIDE = byte offset from the top to the bottom source scanline */
3135 sub STRIDE, BOTTOM, TOP
3145 vadd.u16 d25, d25, d26
3147 /* ensure good destination alignment */
/* peel a single pixel if OUT is oddly aligned for this bpp */
3150 tst OUT, #(1 << dst_bpp_shift)
3152 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3153 vadd.u16 q12, q12, q13
3154 bilinear_interpolate_last_pixel src_fmt, dst_fmt
3155 sub WIDTH, WIDTH, #1
/* weight increment doubles once pixels are processed in pairs */
3157 vadd.u16 q13, q13, q13
3158 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3159 vadd.u16 q12, q12, q13
/* peel a pixel pair for the next alignment level */
3163 tst OUT, #(1 << (dst_bpp_shift + 1))
3165 bilinear_interpolate_two_pixels src_fmt, dst_fmt
3166 sub WIDTH, WIDTH, #2
3168 .if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
3169 /*********** 8 pixels per iteration *****************/
3172 tst OUT, #(1 << (dst_bpp_shift + 2))
3174 bilinear_interpolate_four_pixels src_fmt, dst_fmt
3175 sub WIDTH, WIDTH, #4
3177 subs WIDTH, WIDTH, #8
3179 mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
/* software-pipelined main loop: head, repeated tail_head, tail */
3180 bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
3181 subs WIDTH, WIDTH, #8
3184 bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
3185 subs WIDTH, WIDTH, #8
3188 bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
/* a leftover group of 4 after the 8-wide loop */
3192 bilinear_interpolate_four_pixels src_fmt, dst_fmt
3195 /*********** 4 pixels per iteration *****************/
3196 subs WIDTH, WIDTH, #4
3198 mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
3199 bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
3200 subs WIDTH, WIDTH, #4
3203 bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
3204 subs WIDTH, WIDTH, #4
3207 bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
3209 /****************************************************/
3211 /* handle the remaining trailing pixels */
3214 bilinear_interpolate_two_pixels src_fmt, dst_fmt
3218 bilinear_interpolate_last_pixel src_fmt, dst_fmt
3220 .if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
/* epilogue: restore saved registers */
3223 pop {r4, r5, r6, r7, r8, r9}
3243 /*****************************************************************************/
/* Mark that a specialized 8888->8888 four-pixel implementation exists,
 * so the generic dispatch macros pick it instead of the fallback. */
3245 .set have_bilinear_interpolate_four_pixels_8888_8888, 1

/*
 * Pipeline head for four a8r8g8b8 pixels.  For each pixel: TMPn = TOP +
 * (X >> 16) * 4 addresses the left source pixel; two adjacent 32-bit pixels
 * are loaded from the top (TMPn) and bottom (TMPn + STRIDE) scanlines into
 * d22/d23, then horizontally weighted with vmull/vmlal by d28/d29 into
 * q8..q11 (one q register per pixel).  Vertical weighting (d30/d31) of the
 * first two pixels into q0/q1 is also started here; it completes in _tail.
 */
3247 .macro bilinear_interpolate_four_pixels_8888_8888_head
3248 mov TMP1, X, asr #16
3250 add TMP1, TOP, TMP1, asl #2
3251 mov TMP2, X, asr #16
3253 add TMP2, TOP, TMP2, asl #2
3255 vld1.32 {d22}, [TMP1], STRIDE
3256 vld1.32 {d23}, [TMP1]
3257 mov TMP3, X, asr #16
3259 add TMP3, TOP, TMP3, asl #2
3260 vmull.u8 q8, d22, d28
3261 vmlal.u8 q8, d23, d29
3263 vld1.32 {d22}, [TMP2], STRIDE
3264 vld1.32 {d23}, [TMP2]
3265 mov TMP4, X, asr #16
3267 add TMP4, TOP, TMP4, asl #2
3268 vmull.u8 q9, d22, d28
3269 vmlal.u8 q9, d23, d29
3271 vld1.32 {d22}, [TMP3], STRIDE
3272 vld1.32 {d23}, [TMP3]
3273 vmull.u8 q10, d22, d28
3274 vmlal.u8 q10, d23, d29
/* Begin vertical interpolation of pixel 0: q0 = top*(1<<BITS - w) + bot*w. */
3276 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
3277 vmlsl.u16 q0, d16, d30
3278 vmlal.u16 q0, d17, d30
3281 vld1.32 {d16}, [TMP4], STRIDE
3282 vld1.32 {d17}, [TMP4]
3284 vmull.u8 q11, d16, d28
3285 vmlal.u8 q11, d17, d29
3287 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
3288 vmlsl.u16 q1, d18, d31
/*
 * Pipeline tail for four a8r8g8b8 pixels: finish the vertical interpolation
 * started in _head (q0..q3), narrow each 32-bit lane back to 8-bit color
 * components with vshrn, advance the weight accumulator q12 by the step q13
 * (recomputing q15 for the next batch), and store four pixels to OUT.
 */
3291 .macro bilinear_interpolate_four_pixels_8888_8888_tail
3292 vmlal.u16 q1, d19, d31
3293 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3294 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
3295 vmlsl.u16 q2, d20, d30
3296 vmlal.u16 q2, d21, d30
3297 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
3298 vmlsl.u16 q3, d22, d31
3299 vmlal.u16 q3, d23, d31
3300 vadd.u16 q12, q12, q13
/* Drop the 2*BITS fractional bits accumulated by the two weighting passes. */
3301 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
3302 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
3303 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
3304 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3305 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
3308 vadd.u16 q12, q12, q13
/* 128-bit aligned store of 4 pixels (alignment ensured by caller). */
3309 vst1.32 {d6, d7}, [OUT, :128]!
/*
 * Steady-state iteration for four a8r8g8b8 pixels: the tail of the previous
 * batch (finishing q0..q3, narrowing, storing) is interleaved with the head
 * of the next batch (address generation, loads, horizontal weighting) to
 * hide memory latency.  The instruction order is deliberately scheduled for
 * dual-issue on Cortex-A8/A9 class cores and must not be reordered.
 */
3312 .macro bilinear_interpolate_four_pixels_8888_8888_tail_head
3313 mov TMP1, X, asr #16
3315 add TMP1, TOP, TMP1, asl #2
3316 mov TMP2, X, asr #16
3318 add TMP2, TOP, TMP2, asl #2
3319 vmlal.u16 q1, d19, d31
3320 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3321 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
3322 vmlsl.u16 q2, d20, d30
3323 vmlal.u16 q2, d21, d30
3324 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
/* From here, loads/vmull belong to the NEXT batch of four pixels. */
3325 vld1.32 {d20}, [TMP1], STRIDE
3326 vmlsl.u16 q3, d22, d31
3327 vmlal.u16 q3, d23, d31
3328 vld1.32 {d21}, [TMP1]
3329 vmull.u8 q8, d20, d28
3330 vmlal.u8 q8, d21, d29
3331 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
3332 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
3333 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
3334 vld1.32 {d22}, [TMP2], STRIDE
3335 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
3336 vadd.u16 q12, q12, q13
3337 vld1.32 {d23}, [TMP2]
3338 vmull.u8 q9, d22, d28
3339 mov TMP3, X, asr #16
3341 add TMP3, TOP, TMP3, asl #2
3342 mov TMP4, X, asr #16
3344 add TMP4, TOP, TMP4, asl #2
3345 vmlal.u8 q9, d23, d29
3346 vld1.32 {d22}, [TMP3], STRIDE
3347 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3348 vld1.32 {d23}, [TMP3]
3349 vmull.u8 q10, d22, d28
3350 vmlal.u8 q10, d23, d29
3352 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
3354 vmlsl.u16 q0, d16, d30
3355 vmlal.u16 q0, d17, d30
3357 vld1.32 {d16}, [TMP4], STRIDE
3358 vadd.u16 q12, q12, q13
3359 vld1.32 {d17}, [TMP4]
3361 vmull.u8 q11, d16, d28
3362 vmlal.u8 q11, d17, d29
/* Store the PREVIOUS batch's four finished pixels. */
3363 vst1.32 {d6, d7}, [OUT, :128]!
3364 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
3365 vmlsl.u16 q1, d18, d31
3368 /*****************************************************************************/
/* Mark that a specialized 8888->0565 eight-pixel implementation exists. */
3370 .set have_bilinear_interpolate_eight_pixels_8888_0565, 1

/*
 * Pipeline head for eight a8r8g8b8 source pixels converted to r5g6b5 on
 * output.  Structurally this is the four-pixel 8888 head followed by a
 * four-pixel tail_head-style stage, so that eight pixels are in flight
 * before the first store; the r5g6b5 packing happens in _tail/_tail_head.
 * Per pixel: TMPn = TOP + (X >> 16) * 4, top/bottom rows loaded and
 * horizontally weighted (d28/d29) into q8..q11, then vertically weighted
 * (d30/d31) into q0..q3.
 */
3372 .macro bilinear_interpolate_eight_pixels_8888_0565_head
3373 mov TMP1, X, asr #16
3375 add TMP1, TOP, TMP1, asl #2
3376 mov TMP2, X, asr #16
3378 add TMP2, TOP, TMP2, asl #2
3379 vld1.32 {d20}, [TMP1], STRIDE
3380 vld1.32 {d21}, [TMP1]
3381 vmull.u8 q8, d20, d28
3382 vmlal.u8 q8, d21, d29
3383 vld1.32 {d22}, [TMP2], STRIDE
3384 vld1.32 {d23}, [TMP2]
3385 vmull.u8 q9, d22, d28
3386 mov TMP3, X, asr #16
3388 add TMP3, TOP, TMP3, asl #2
3389 mov TMP4, X, asr #16
3391 add TMP4, TOP, TMP4, asl #2
3392 vmlal.u8 q9, d23, d29
3393 vld1.32 {d22}, [TMP3], STRIDE
3394 vld1.32 {d23}, [TMP3]
3395 vmull.u8 q10, d22, d28
3396 vmlal.u8 q10, d23, d29
3397 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
3398 vmlsl.u16 q0, d16, d30
3399 vmlal.u16 q0, d17, d30
3401 vld1.32 {d16}, [TMP4], STRIDE
3402 vld1.32 {d17}, [TMP4]
3404 vmull.u8 q11, d16, d28
3405 vmlal.u8 q11, d17, d29
3406 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
3407 vmlsl.u16 q1, d18, d31
/* Second stage: start pixels 4..7 while finishing pixels 0..3. */
3409 mov TMP1, X, asr #16
3411 add TMP1, TOP, TMP1, asl #2
3412 mov TMP2, X, asr #16
3414 add TMP2, TOP, TMP2, asl #2
3415 vmlal.u16 q1, d19, d31
3416 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3417 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
3418 vmlsl.u16 q2, d20, d30
3419 vmlal.u16 q2, d21, d30
3420 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
3421 vld1.32 {d20}, [TMP1], STRIDE
3422 vmlsl.u16 q3, d22, d31
3423 vmlal.u16 q3, d23, d31
3424 vld1.32 {d21}, [TMP1]
3425 vmull.u8 q8, d20, d28
3426 vmlal.u8 q8, d21, d29
3427 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
3428 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
3429 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
3430 vld1.32 {d22}, [TMP2], STRIDE
3431 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
3432 vadd.u16 q12, q12, q13
3433 vld1.32 {d23}, [TMP2]
3434 vmull.u8 q9, d22, d28
3435 mov TMP3, X, asr #16
3437 add TMP3, TOP, TMP3, asl #2
3438 mov TMP4, X, asr #16
3440 add TMP4, TOP, TMP4, asl #2
3441 vmlal.u8 q9, d23, d29
3442 vld1.32 {d22}, [TMP3], STRIDE
3443 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3444 vld1.32 {d23}, [TMP3]
3445 vmull.u8 q10, d22, d28
3446 vmlal.u8 q10, d23, d29
3448 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
3450 vmlsl.u16 q0, d16, d30
3451 vmlal.u16 q0, d17, d30
3453 vld1.32 {d16}, [TMP4], STRIDE
3454 vadd.u16 q12, q12, q13
3455 vld1.32 {d17}, [TMP4]
3457 vmull.u8 q11, d16, d28
3458 vmlal.u8 q11, d17, d29
3459 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
3460 vmlsl.u16 q1, d18, d31
/*
 * Pipeline tail for eight 8888->0565 pixels: finish vertical interpolation
 * of the last four pixels, narrow to 8-bit components, then pack all eight
 * pixels to r5g6b5 and store 16 bytes.  The vshll.u8/vsri.u16 pair below is
 * part of the a8r8g8b8 -> r5g6b5 bit-packing sequence (the remainder of
 * that sequence lies between these lines in the surrounding file).
 */
3463 .macro bilinear_interpolate_eight_pixels_8888_0565_tail
3464 vmlal.u16 q1, d19, d31
3465 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3466 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
3467 vmlsl.u16 q2, d20, d30
3468 vmlal.u16 q2, d21, d30
3469 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
3470 vmlsl.u16 q3, d22, d31
3471 vmlal.u16 q3, d23, d31
3472 vadd.u16 q12, q12, q13
3473 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
3474 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
3475 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
3476 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3477 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
3480 vadd.u16 q12, q12, q13
3487 vshll.u8 q5, d10, #8
3490 vsri.u16 q5, q7, #11
/* Store eight packed r5g6b5 pixels (16 bytes, 128-bit aligned). */
3491 vst1.32 {d10, d11}, [OUT, :128]!
/*
 * Steady-state iteration for eight 8888->0565 pixels: interleaves the tail
 * of the previous eight-pixel batch (finish interpolation, pack to r5g6b5,
 * store) with the head of the next batch (addresses, loads, horizontal
 * weighting).  Two four-pixel stages back to back; scheduling is hand-tuned
 * and order-sensitive — do not reorder instructions.
 */
3494 .macro bilinear_interpolate_eight_pixels_8888_0565_tail_head
3495 mov TMP1, X, asr #16
3497 add TMP1, TOP, TMP1, asl #2
3498 mov TMP2, X, asr #16
3500 add TMP2, TOP, TMP2, asl #2
3501 vmlal.u16 q1, d19, d31
3502 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3504 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
3505 vmlsl.u16 q2, d20, d30
3506 vmlal.u16 q2, d21, d30
3507 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
3508 vld1.32 {d20}, [TMP1], STRIDE
3509 vmlsl.u16 q3, d22, d31
3510 vmlal.u16 q3, d23, d31
3511 vld1.32 {d21}, [TMP1]
3512 vmull.u8 q8, d20, d28
3513 vmlal.u8 q8, d21, d29
3514 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
3515 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
3516 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
3517 vld1.32 {d22}, [TMP2], STRIDE
3518 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
3519 vadd.u16 q12, q12, q13
3520 vld1.32 {d23}, [TMP2]
3521 vmull.u8 q9, d22, d28
3522 mov TMP3, X, asr #16
3524 add TMP3, TOP, TMP3, asl #2
3525 mov TMP4, X, asr #16
3527 add TMP4, TOP, TMP4, asl #2
3528 vmlal.u8 q9, d23, d29
3529 vld1.32 {d22}, [TMP3], STRIDE
3530 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3531 vld1.32 {d23}, [TMP3]
3532 vmull.u8 q10, d22, d28
3533 vmlal.u8 q10, d23, d29
3535 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
3537 vmlsl.u16 q0, d16, d30
3538 vmlal.u16 q0, d17, d30
3540 vld1.32 {d16}, [TMP4], STRIDE
3541 vadd.u16 q12, q12, q13
3542 vld1.32 {d17}, [TMP4]
3544 vmull.u8 q11, d16, d28
3545 vmlal.u8 q11, d17, d29
3547 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
3548 vmlsl.u16 q1, d18, d31
/* Second four-pixel stage of the next batch. */
3550 mov TMP1, X, asr #16
3552 add TMP1, TOP, TMP1, asl #2
3553 mov TMP2, X, asr #16
3555 add TMP2, TOP, TMP2, asl #2
3556 vmlal.u16 q1, d19, d31
3558 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3559 vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
3561 vmlsl.u16 q2, d20, d30
3562 vmlal.u16 q2, d21, d30
3563 vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
3564 vld1.32 {d20}, [TMP1], STRIDE
3565 vmlsl.u16 q3, d22, d31
3566 vmlal.u16 q3, d23, d31
3567 vld1.32 {d21}, [TMP1]
3568 vmull.u8 q8, d20, d28
3569 vmlal.u8 q8, d21, d29
/* r5g6b5 packing of the previous batch (part of a longer sequence). */
3571 vshll.u8 q5, d10, #8
3573 vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
3575 vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
3576 vsri.u16 q5, q7, #11
3577 vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
3578 vld1.32 {d22}, [TMP2], STRIDE
3579 vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
3580 vadd.u16 q12, q12, q13
3581 vld1.32 {d23}, [TMP2]
3582 vmull.u8 q9, d22, d28
3583 mov TMP3, X, asr #16
3585 add TMP3, TOP, TMP3, asl #2
3586 mov TMP4, X, asr #16
3588 add TMP4, TOP, TMP4, asl #2
3589 vmlal.u8 q9, d23, d29
3590 vld1.32 {d22}, [TMP3], STRIDE
3591 vshr.u16 q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
3592 vld1.32 {d23}, [TMP3]
3593 vmull.u8 q10, d22, d28
3594 vmlal.u8 q10, d23, d29
3596 vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
3598 vmlsl.u16 q0, d16, d30
3599 vmlal.u16 q0, d17, d30
3601 vld1.32 {d16}, [TMP4], STRIDE
3602 vadd.u16 q12, q12, q13
3603 vld1.32 {d17}, [TMP4]
3605 vmull.u8 q11, d16, d28
3606 vmlal.u8 q11, d17, d29
3607 vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
/* Store the previous batch's eight packed r5g6b5 pixels. */
3608 vst1.32 {d10, d11}, [OUT, :128]!
3609 vmlsl.u16 q1, d18, d31
3611 /*****************************************************************************/
/*
 * Instantiate the scanline template for each supported format pair.
 * Arguments: fname, src_fmt, dst_fmt, src_bpp_shift, dst_bpp_shift,
 * prefetch_distance, flags.  bpp shifts: 2 -> 4-byte (8888) pixels,
 * 1 -> 2-byte (0565) pixels.
 */

/* a8r8g8b8 -> a8r8g8b8, 4-pixel unroll. */
3613 generate_bilinear_scanline_func \
3614 pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
3615 2, 2, 28, BILINEAR_FLAG_UNROLL_4

/* a8r8g8b8 -> r5g6b5: 8-pixel unroll and all NEON regs (extra regs are
 * needed for the r5g6b5 packing pipeline). */
3617 generate_bilinear_scanline_func \
3618 pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \
3619 2, 1, 28, BILINEAR_FLAG_UNROLL_8 | BILINEAR_FLAG_USE_ALL_NEON_REGS

/* r5g6b5 -> x8r8g8b8, 4-pixel unroll. */
3621 generate_bilinear_scanline_func \
3622 pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \
3623 1, 2, 28, BILINEAR_FLAG_UNROLL_4

/* r5g6b5 -> r5g6b5, 4-pixel unroll. */
3625 generate_bilinear_scanline_func \
3626 pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \
3627 1, 1, 28, BILINEAR_FLAG_UNROLL_4