/*
 * Copyright © 2011 SCore Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
 * Author: Taekyun Kim (tkq.kim@samsung.com)
 */
/*
 * This file contains scaled bilinear scanline functions implemented
 * using Siarhei's older bilinear macro template.
 *
 * << General scanline function procedures >>
 *  1. bilinear interpolate source pixels
 *  2. load mask pixels
 *  3. load destination pixels
 *  4. duplicate mask to fill whole register
 *  5. interleave source & destination pixels
 *  6. apply mask to source pixels
 *  7. combine source & destination pixels
 *  8. deinterleave final result
 *  9. store destination pixels
 *
 * All registers with a single number (i.e. src0, tmp0) are 64-bit registers.
 * Registers with double numbers (src01, dst01) are 128-bit registers.
 * All temp registers can be used freely outside the code block.
 * Assume that the symbols (register .req) OUT and MASK are defined by the
 * caller of these macro blocks.
 *
 * There can be many pipeline stalls inside a code block and between code
 * blocks. Further optimizations will be done with new macro templates using
 * the head/tail_head/tail scheme.
 */
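/*
 * For orientation, the per-pixel math implemented by the macros below is,
 * in plain C (a minimal sketch operating on one 8-bit channel, with helper
 * name ours; it assumes BILINEAR_INTERPOLATION_BITS == 8, so the vertical
 * weights satisfy wt + wb == 256 and wx is the horizontal weight):
 *
 *     static inline uint8_t
 *     bilinear_channel (uint8_t tl, uint8_t tr, uint8_t bl, uint8_t br,
 *                       int wt, int wb, int wx)
 *     {
 *         int left  = tl * wt + bl * wb;    // vertical pass, 16-bit result
 *         int right = tr * wt + br * wb;
 *         return (left * (256 - wx) + right * wx) >> 16;  // horizontal pass
 *     }
 */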
/* Prevent the stack from becoming executable for no reason... */
#if defined(__linux__) && defined (__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

#include "pixman-private.h"
#include "pixman-arm-asm.h"
#include "pixman-arm-neon-asm.h"

/*
 * Bilinear macros from pixman-arm-neon-asm.S
 */

/*
 * Bilinear scaling support code which tries to provide pixel fetching, color
 * format conversion, and interpolation as separate macros which can be used
 * as the basic building blocks for constructing bilinear scanline functions.
 */

.macro bilinear_load_8888 reg1, reg2, tmp
    add       TMP1, TOP, TMP1, asl #2
    vld1.32   {reg1}, [TMP1], STRIDE
    vld1.32   {reg2}, [TMP1]
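/*
 * In C terms the load above is roughly (a sketch; X is a 16.16 fixed-point
 * source coordinate, and the function prologue sets STRIDE = BOTTOM - TOP,
 * so the second vld1.32 reads the same pixel pair from the bottom scanline):
 *
 *     int xi = x >> 16;
 *     reg1 = top[xi], top[xi + 1];        // two adjacent top-row pixels
 *     reg2 = bottom[xi], bottom[xi + 1];  // the pixels right below them
 */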
.macro bilinear_load_0565 reg1, reg2, tmp
    add       TMP1, TOP, TMP1, asl #1
    vld1.32   {reg2[0]}, [TMP1], STRIDE
    vld1.32   {reg2[1]}, [TMP1]
    convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp

.macro bilinear_load_and_vertical_interpolate_two_8888 \
                acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2

    bilinear_load_8888 reg1, reg2, tmp1
    vmull.u8  acc1, reg1, d28
    vmlal.u8  acc1, reg2, d29
    bilinear_load_8888 reg3, reg4, tmp2
    vmull.u8  acc2, reg3, d28
    vmlal.u8  acc2, reg4, d29
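/*
 * d28/d29 hold the vertical weights wt and wb replicated across all byte
 * lanes (wt + wb == 1 << BILINEAR_INTERPOLATION_BITS), so after each
 * vmull/vmlal pair every 16-bit accumulator lane holds the vertical blend:
 *
 *     acc[c] = top[c] * wt + bottom[c] * wb;   // fits in 16 bits
 */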
.macro bilinear_load_and_vertical_interpolate_four_8888 \
                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi

    bilinear_load_and_vertical_interpolate_two_8888 \
                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
    bilinear_load_and_vertical_interpolate_two_8888 \
                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi

.macro bilinear_load_and_vertical_interpolate_two_0565 \
                acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi

    add       TMP1, TOP, TMP1, asl #1
    add       TMP2, TOP, TMP2, asl #1
    vld1.32   {acc2lo[0]}, [TMP1], STRIDE
    vld1.32   {acc2hi[0]}, [TMP2], STRIDE
    vld1.32   {acc2lo[1]}, [TMP1]
    vld1.32   {acc2hi[1]}, [TMP2]
    convert_0565_to_x888 acc2, reg3, reg2, reg1
    vmull.u8  acc1, reg1, d28
    vmlal.u8  acc1, reg2, d29
    vmull.u8  acc2, reg3, d28
    vmlal.u8  acc2, reg4, d29

.macro bilinear_load_and_vertical_interpolate_four_0565 \
                xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
                yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi

    add       TMP1, TOP, TMP1, asl #1
    add       TMP2, TOP, TMP2, asl #1
    vld1.32   {xacc2lo[0]}, [TMP1], STRIDE
    vld1.32   {xacc2hi[0]}, [TMP2], STRIDE
    vld1.32   {xacc2lo[1]}, [TMP1]
    vld1.32   {xacc2hi[1]}, [TMP2]
    convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
    add       TMP1, TOP, TMP1, asl #1
    add       TMP2, TOP, TMP2, asl #1
    vld1.32   {yacc2lo[0]}, [TMP1], STRIDE
    vld1.32   {yacc2hi[0]}, [TMP2], STRIDE
    vld1.32   {yacc2lo[1]}, [TMP1]
    vld1.32   {yacc2hi[1]}, [TMP2]
    convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
    vmull.u8  xacc1, xreg1, d28
    vmlal.u8  xacc1, xreg2, d29
    vmull.u8  xacc2, xreg3, d28
    vmlal.u8  xacc2, xreg4, d29
    vmull.u8  yacc1, yreg1, d28
    vmlal.u8  yacc1, yreg2, d29
    vmull.u8  yacc2, yreg3, d28
    vmlal.u8  yacc2, yreg4, d29
.macro bilinear_store_8888 numpix, tmp1, tmp2
    vst1.32   {d0, d1}, [OUT]!
    vst1.32   {d0[0]}, [OUT, :32]!
    .error bilinear_store_8888 numpix is unsupported

.macro bilinear_store_0565 numpix, tmp1, tmp2
    convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
    vst1.32   {d2[0]}, [OUT]!
    vst1.16   {d2[0]}, [OUT]!
    .error bilinear_store_0565 numpix is unsupported
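/*
 * convert_8888_to_0565 packs each pixel by dropping the low bits of each
 * channel, as in this C sketch (helper name ours):
 *
 *     static inline uint16_t
 *     pack_0565 (uint8_t r, uint8_t g, uint8_t b)
 *     {
 *         return (uint16_t) (((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
 *     }
 */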
/*
 * Macros for loading mask pixels into register 'mask'.
 * The vdup must be done somewhere else.
 */
.macro bilinear_load_mask_x numpix, mask

.macro bilinear_load_mask_8 numpix, mask
    vld1.32   {mask[0]}, [MASK]!
    vld1.16   {mask[0]}, [MASK]!
    vld1.8    {mask[0]}, [MASK]!
    .error bilinear_load_mask_8 numpix is unsupported
    pld       [MASK, #prefetch_offset]

.macro bilinear_load_mask mask_fmt, numpix, mask
    bilinear_load_mask_&mask_fmt numpix, mask
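/*
 * Dispatch works by token pasting: with .altmacro in effect, '&' splices
 * the argument value into the target macro name, so for example
 *
 *     bilinear_load_mask 8, 2, d4
 *
 * expands to 'bilinear_load_mask_8 2, d4', while a mask_fmt of 'x' selects
 * the empty bilinear_load_mask_x variant (no mask fetch at all).
 */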
/*
 * Macros for loading destination pixels into registers 'dst0' and 'dst1'.
 * Interleaving should be done somewhere else.
 */
.macro bilinear_load_dst_0565_src numpix, dst0, dst1, dst01

.macro bilinear_load_dst_8888_src numpix, dst0, dst1, dst01

.macro bilinear_load_dst_8888 numpix, dst0, dst1, dst01
    vld1.32   {dst0, dst1}, [OUT]
    vld1.32   {dst0}, [OUT]
    vld1.32   {dst0[0]}, [OUT]
    .error bilinear_load_dst_8888 numpix is unsupported
    pld       [OUT, #(prefetch_offset * 4)]

.macro bilinear_load_dst_8888_over numpix, dst0, dst1, dst01
    bilinear_load_dst_8888 numpix, dst0, dst1, dst01

.macro bilinear_load_dst_8888_add numpix, dst0, dst1, dst01
    bilinear_load_dst_8888 numpix, dst0, dst1, dst01

.macro bilinear_load_dst dst_fmt, op, numpix, dst0, dst1, dst01
    bilinear_load_dst_&dst_fmt&_&op numpix, dst0, dst1, dst01

/*
 * Macros for duplicating a partially loaded mask to fill the entire register.
 * We will apply the mask to interleaved source pixels, that is
 *  (r0, r1, r2, r3, g0, g1, g2, g3) x (m0, m1, m2, m3, m0, m1, m2, m3)
 *  (b0, b1, b2, b3, a0, a1, a2, a3) x (m0, m1, m2, m3, m0, m1, m2, m3)
 * so we need to duplicate the loaded mask across the whole register.
 * For two pixels the layout is
 *  (r0, r1, x, x, g0, g1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
 *  (b0, b1, x, x, a0, a1, x, x) x (m0, m1, m0, m1, m0, m1, m0, m1)
 * Some optimizations are possible here, including for the last-pixel cases.
 */
.macro bilinear_duplicate_mask_x numpix, mask

.macro bilinear_duplicate_mask_8 numpix, mask
    vdup.32   mask, mask[0]
    vdup.16   mask, mask[0]
    .error bilinear_duplicate_mask_8 numpix is unsupported

.macro bilinear_duplicate_mask mask_fmt, numpix, mask
    bilinear_duplicate_mask_&mask_fmt numpix, mask
/*
 * Macros for interleaving src and dst pixels into rrrr gggg bbbb aaaa form.
 * Interleaving is needed when a mask is enabled or the operator is 'over'.
 */
.macro bilinear_interleave src0, src1, dst0, dst1

.macro bilinear_interleave_src_dst_x_src \
                numpix, src0, src1, src01, dst0, dst1, dst01

.macro bilinear_interleave_src_dst_x_over \
                numpix, src0, src1, src01, dst0, dst1, dst01

    bilinear_interleave src0, src1, dst0, dst1

.macro bilinear_interleave_src_dst_x_add \
                numpix, src0, src1, src01, dst0, dst1, dst01

.macro bilinear_interleave_src_dst_8_src \
                numpix, src0, src1, src01, dst0, dst1, dst01

    bilinear_interleave src0, src1, dst0, dst1

.macro bilinear_interleave_src_dst_8_over \
                numpix, src0, src1, src01, dst0, dst1, dst01

    bilinear_interleave src0, src1, dst0, dst1

.macro bilinear_interleave_src_dst_8_add \
                numpix, src0, src1, src01, dst0, dst1, dst01

    bilinear_interleave src0, src1, dst0, dst1

.macro bilinear_interleave_src_dst \
                mask_fmt, op, numpix, src0, src1, src01, dst0, dst1, dst01

    bilinear_interleave_src_dst_&mask_fmt&_&op \
                numpix, src0, src1, src01, dst0, dst1, dst01
/*
 * Macros for applying the mask to src pixels (see the combine_mask_u()
 * function). src and dst should be in interleaved form.
 * The mask register should be in the form (m0, m1, m2, m3).
 */
.macro bilinear_apply_mask_to_src_x \
                numpix, src0, src1, src01, mask, \
                tmp01, tmp23, tmp45, tmp67

.macro bilinear_apply_mask_to_src_8 \
                numpix, src0, src1, src01, mask, \
                tmp01, tmp23, tmp45, tmp67

    vmull.u8  tmp01, src0, mask
    vmull.u8  tmp23, src1, mask
    vrshr.u16 tmp45, tmp01, #8
    vrshr.u16 tmp67, tmp23, #8
    vraddhn.u16 src0, tmp45, tmp01
    vraddhn.u16 src1, tmp67, tmp23
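/*
 * The vmull/vrshr/vraddhn sequence above is the usual NEON idiom for a
 * correctly rounded division by 255 of each 16-bit product, i.e. per
 * channel (helper name ours):
 *
 *     static inline uint8_t
 *     mul_div_255 (uint8_t s, uint8_t m)
 *     {
 *         int t = s * m;
 *         return (t + 128 + ((t + 128) >> 8)) >> 8;  // == round (t / 255.0)
 *     }
 */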
.macro bilinear_apply_mask_to_src \
                mask_fmt, numpix, src0, src1, src01, mask, \
                tmp01, tmp23, tmp45, tmp67

    bilinear_apply_mask_to_src_&mask_fmt \
                numpix, src0, src1, src01, mask, \
                tmp01, tmp23, tmp45, tmp67

/*
 * Macros for combining src and destination pixels.
 * Whether interleaving is needed depends on the operator 'op'.
 */
.macro bilinear_combine_src \
                numpix, src0, src1, src01, dst0, dst1, dst01, \
                tmp01, tmp23, tmp45, tmp67, tmp8

.macro bilinear_combine_over \
                numpix, src0, src1, src01, dst0, dst1, dst01, \
                tmp01, tmp23, tmp45, tmp67, tmp8

    vdup.32   tmp8, src1[1]
    vmull.u8  tmp01, dst0, tmp8
    vmull.u8  tmp23, dst1, tmp8
    vrshr.u16 tmp45, tmp01, #8
    vrshr.u16 tmp67, tmp23, #8
    vraddhn.u16 dst0, tmp45, tmp01
    vraddhn.u16 dst1, tmp67, tmp23
    vqadd.u8  src01, dst01, src01
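/*
 * This is the premultiplied OVER operator per channel:
 *
 *     dst = src + mul_div_255 (dst, 255 - src_alpha);
 *
 * tmp8 carries the source alpha duplicated across lanes (inverted before
 * use), the vrshr/vraddhn pair is the same rounded division by 255 as
 * above, and the final vqadd.u8 provides the saturating add.
 */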
.macro bilinear_combine_add \
                numpix, src0, src1, src01, dst0, dst1, dst01, \
                tmp01, tmp23, tmp45, tmp67, tmp8

    vqadd.u8  src01, dst01, src01

.macro bilinear_combine \
                op, numpix, src0, src1, src01, dst0, dst1, dst01, \
                tmp01, tmp23, tmp45, tmp67, tmp8

    bilinear_combine_&op \
                numpix, src0, src1, src01, dst0, dst1, dst01, \
                tmp01, tmp23, tmp45, tmp67, tmp8

/*
 * Macros for final deinterleaving of destination pixels, if needed.
 */
.macro bilinear_deinterleave numpix, dst0, dst1, dst01

.macro bilinear_deinterleave_dst_x_src numpix, dst0, dst1, dst01

.macro bilinear_deinterleave_dst_x_over numpix, dst0, dst1, dst01
    bilinear_deinterleave numpix, dst0, dst1, dst01

.macro bilinear_deinterleave_dst_x_add numpix, dst0, dst1, dst01

.macro bilinear_deinterleave_dst_8_src numpix, dst0, dst1, dst01
    bilinear_deinterleave numpix, dst0, dst1, dst01

.macro bilinear_deinterleave_dst_8_over numpix, dst0, dst1, dst01
    bilinear_deinterleave numpix, dst0, dst1, dst01

.macro bilinear_deinterleave_dst_8_add numpix, dst0, dst1, dst01
    bilinear_deinterleave numpix, dst0, dst1, dst01

.macro bilinear_deinterleave_dst mask_fmt, op, numpix, dst0, dst1, dst01
    bilinear_deinterleave_dst_&mask_fmt&_&op numpix, dst0, dst1, dst01

.macro bilinear_interpolate_last_pixel src_fmt, mask_fmt, dst_fmt, op
    bilinear_load_&src_fmt d0, d1, d2
    bilinear_load_mask mask_fmt, 1, d4
    bilinear_load_dst dst_fmt, op, 1, d18, d19, q9
    /* 5 cycles bubble */
    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d2, d30
    vmlal.u16 q0, d3, d30
    /* 5 cycles bubble */
    bilinear_duplicate_mask mask_fmt, 1, d4
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    /* 3 cycles bubble */
    bilinear_interleave_src_dst \
                mask_fmt, op, 1, d0, d1, q0, d18, d19, q9
    bilinear_apply_mask_to_src \
                mask_fmt, 1, d0, d1, q0, d4, \
                op, 1, d0, d1, q0, d18, d19, q9, \
    bilinear_deinterleave_dst mask_fmt, op, 1, d0, d1, q0
    bilinear_store_&dst_fmt 1, q2, q3
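/*
 * The vshll/vmlsl/vmlal triples above form the horizontal pass. With
 * B = BILINEAR_INTERPOLATION_BITS, d30 holding the horizontal weight wx
 * and d2/d3 holding the left/right vertical results, each 32-bit lane
 * computes (a sketch in C, names ours):
 *
 *     uint32_t r = ((uint32_t) left << B)   // vshll.u16
 *                - left * wx                // vmlsl.u16
 *                + right * wx;              // vmlal.u16
 *     uint8_t out = r >> (2 * B);           // vshrn.u32 narrowing
 *
 * i.e. out = (left * ((1 << B) - wx) + right * wx) >> (2 * B).
 */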
.macro bilinear_interpolate_two_pixels src_fmt, mask_fmt, dst_fmt, op
    bilinear_load_and_vertical_interpolate_two_&src_fmt \
                q1, q11, d0, d1, d20, d21, d22, d23
    bilinear_load_mask mask_fmt, 2, d4
    bilinear_load_dst dst_fmt, op, 2, d18, d19, q9
    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d2, d30
    vmlal.u16 q0, d3, d30
    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q10, d22, d31
    vmlal.u16 q10, d23, d31
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
    bilinear_duplicate_mask mask_fmt, 2, d4
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    bilinear_interleave_src_dst \
                mask_fmt, op, 2, d0, d1, q0, d18, d19, q9
    bilinear_apply_mask_to_src \
                mask_fmt, 2, d0, d1, q0, d4, \
                op, 2, d0, d1, q0, d18, d19, q9, \
    bilinear_deinterleave_dst mask_fmt, op, 2, d0, d1, q0
    bilinear_store_&dst_fmt 2, q2, q3
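/*
 * Horizontal weight bookkeeping: q12 holds the low 16 fractional bits of X
 * for each pixel lane and q13 the per-block advance, so the recurring
 * instruction pair above is simply, per 16-bit lane:
 *
 *     wx = x >> (16 - BILINEAR_INTERPOLATION_BITS);  // vshr into q15 (d30/d31)
 *     x += ux;                                       // vadd q12, q12, q13
 */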
.macro bilinear_interpolate_four_pixels src_fmt, mask_fmt, dst_fmt, op
    bilinear_load_and_vertical_interpolate_four_&src_fmt \
                q1, q11, d0, d1, d20, d21, d22, d23 \
                q3, q9,  d4, d5, d16, d17, d18, d19
    sub       TMP1, TMP1, STRIDE
    vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d2, d30
    vmlal.u16 q0, d3, d30
    vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q10, d22, d31
    vmlal.u16 q10, d23, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d6, d30
    vmlal.u16 q2, d7, d30
    vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
    bilinear_load_mask mask_fmt, 4, d22
    bilinear_load_dst dst_fmt, op, 4, d2, d3, q1
    vmlsl.u16 q8, d18, d31
    vmlal.u16 q8, d19, d31
    vadd.u16  q12, q12, q13
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
    bilinear_duplicate_mask mask_fmt, 4, d22
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    bilinear_interleave_src_dst \
                mask_fmt, op, 4, d0, d1, q0, d2, d3, q1
    bilinear_apply_mask_to_src \
                mask_fmt, 4, d0, d1, q0, d22, \
                op, 4, d0, d1, q0, d2, d3, q1, \
    bilinear_deinterleave_dst mask_fmt, op, 4, d0, d1, q0
    bilinear_store_&dst_fmt 4, q2, q3

.set BILINEAR_FLAG_USE_MASK,           1
.set BILINEAR_FLAG_USE_ALL_NEON_REGS,  2
/*
 * Main template macro for generating NEON-optimized bilinear scanline
 * functions.
 *
 * The bilinear scanline generator macro takes the following arguments:
 *  fname             - name of the function to generate
 *  src_fmt           - source color format (8888 or 0565)
 *  dst_fmt           - destination color format (8888 or 0565)
 *  src/dst_bpp_shift - (1 << bpp_shift) is the size of a src/dst pixel
 *                      in bytes
 *  process_last_pixel - code block that interpolates one pixel and does not
 *                       update the horizontal weight
 *  process_two_pixels - code block that interpolates two pixels and updates
 *                       the horizontal weight
 *  process_four_pixels - code block that interpolates four pixels and
 *                        updates the horizontal weight
 *  process_pixblock_head - head part of the middle loop
 *  process_pixblock_tail - tail part of the middle loop
 *  process_pixblock_tail_head - tail_head part of the middle loop
 *  pixblock_size     - number of pixels processed in a single middle-loop
 *                      iteration
 *  prefetch_distance - prefetch into the source image by that many pixels
 *                      ahead
 */
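/*
 * The generated functions are expected to be called from the C glue with
 * arguments laid out as the ldmia fetches below suggest (a sketch, not a
 * verbatim copy of the pixman headers; the no-mask flavours simply drop
 * the mask pointer):
 *
 *     void
 *     pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_neon (
 *         uint32_t *dst, const uint8_t *mask,
 *         const uint32_t *src_top, const uint32_t *src_bottom,
 *         int wt, int wb,        // vertical weights (WT, WB)
 *         pixman_fixed_t x,      // 16.16 start coordinate (X)
 *         pixman_fixed_t ux,     // 16.16 increment (UX)
 *         int width);            // WIDTH
 */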
.macro generate_bilinear_scanline_func \
    fname, \
    src_fmt, dst_fmt, src_bpp_shift, dst_bpp_shift, \
    bilinear_process_last_pixel, \
    bilinear_process_two_pixels, \
    bilinear_process_four_pixels, \
    bilinear_process_pixblock_head, \
    bilinear_process_pixblock_tail, \
    bilinear_process_pixblock_tail_head, \
    pixblock_size, \
    prefetch_distance, \
    flags

pixman_asm_function fname
.if pixblock_size == 8
.elseif pixblock_size == 4
    .error unsupported pixblock size

.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
    push      {r4, r5, r6, r7, r8, r9}
    mov       PF_OFFS, #prefetch_distance
    ldmia     ip, {WB, X, UX, WIDTH}

    .set prefetch_offset, prefetch_distance
    push      {r4, r5, r6, r7, r8, r9, r10, ip}
    mov       PF_OFFS, #prefetch_distance
    ldmia     ip, {WT, WB, X, UX, WIDTH}

    mul       PF_OFFS, PF_OFFS, UX

.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0

    sub       STRIDE, BOTTOM, TOP
    vadd.u16  d25, d25, d26

    /* ensure good destination alignment */
    tst       OUT, #(1 << dst_bpp_shift)
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    bilinear_process_last_pixel
    vadd.u16  q13, q13, q13
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    tst       OUT, #(1 << (dst_bpp_shift + 1))
    bilinear_process_two_pixels
.if pixblock_size == 8
    tst       OUT, #(1 << (dst_bpp_shift + 2))
    bilinear_process_four_pixels
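/*
 * The tst/process sequence above peels off 1, then 2, then (for 8-pixel
 * blocks) 4 pixels: testing bit (1 << dst_bpp_shift) of OUT asks whether a
 * single pixel is needed to reach the next naturally aligned boundary, and
 * so on, until OUT is aligned for the vectorized middle loop.
 */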
    subs      WIDTH, WIDTH, #pixblock_size
    mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
    bilinear_process_pixblock_head
    subs      WIDTH, WIDTH, #pixblock_size
    bilinear_process_pixblock_tail_head
    subs      WIDTH, WIDTH, #pixblock_size
    bilinear_process_pixblock_tail
.if pixblock_size == 8
    bilinear_process_four_pixels

    /* handle the remaining trailing pixels */
    bilinear_process_two_pixels
    bilinear_process_last_pixel

.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
    pop       {r4, r5, r6, r7, r8, r9}
    pop       {r4, r5, r6, r7, r8, r9, r10, ip}
.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0

/* src_8888_8_8888 */
.macro bilinear_src_8888_8_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, 8, 8888, src

.macro bilinear_src_8888_8_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, 8, 8888, src

.macro bilinear_src_8888_8_8888_process_four_pixels
    bilinear_interpolate_four_pixels 8888, 8, 8888, src

.macro bilinear_src_8888_8_8888_process_pixblock_head
    bilinear_src_8888_8_8888_process_four_pixels

.macro bilinear_src_8888_8_8888_process_pixblock_tail

.macro bilinear_src_8888_8_8888_process_pixblock_tail_head
    bilinear_src_8888_8_8888_process_pixblock_tail
    bilinear_src_8888_8_8888_process_pixblock_head
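/*
 * All of the straightforward variants below follow this same shape: the
 * pixblock head simply processes four pixels, the tail is empty, and
 * tail_head chains tail and head back to back. Only the over_8888_* macros
 * further down provide genuinely software-pipelined tail_head blocks.
 */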
/* src_8888_8_0565 */
.macro bilinear_src_8888_8_0565_process_last_pixel
    bilinear_interpolate_last_pixel 8888, 8, 0565, src

.macro bilinear_src_8888_8_0565_process_two_pixels
    bilinear_interpolate_two_pixels 8888, 8, 0565, src

.macro bilinear_src_8888_8_0565_process_four_pixels
    bilinear_interpolate_four_pixels 8888, 8, 0565, src

.macro bilinear_src_8888_8_0565_process_pixblock_head
    bilinear_src_8888_8_0565_process_four_pixels

.macro bilinear_src_8888_8_0565_process_pixblock_tail

.macro bilinear_src_8888_8_0565_process_pixblock_tail_head
    bilinear_src_8888_8_0565_process_pixblock_tail
    bilinear_src_8888_8_0565_process_pixblock_head

/* src_0565_8_x888 */
.macro bilinear_src_0565_8_x888_process_last_pixel
    bilinear_interpolate_last_pixel 0565, 8, 8888, src

.macro bilinear_src_0565_8_x888_process_two_pixels
    bilinear_interpolate_two_pixels 0565, 8, 8888, src

.macro bilinear_src_0565_8_x888_process_four_pixels
    bilinear_interpolate_four_pixels 0565, 8, 8888, src

.macro bilinear_src_0565_8_x888_process_pixblock_head
    bilinear_src_0565_8_x888_process_four_pixels

.macro bilinear_src_0565_8_x888_process_pixblock_tail

.macro bilinear_src_0565_8_x888_process_pixblock_tail_head
    bilinear_src_0565_8_x888_process_pixblock_tail
    bilinear_src_0565_8_x888_process_pixblock_head

/* src_0565_8_0565 */
.macro bilinear_src_0565_8_0565_process_last_pixel
    bilinear_interpolate_last_pixel 0565, 8, 0565, src

.macro bilinear_src_0565_8_0565_process_two_pixels
    bilinear_interpolate_two_pixels 0565, 8, 0565, src

.macro bilinear_src_0565_8_0565_process_four_pixels
    bilinear_interpolate_four_pixels 0565, 8, 0565, src

.macro bilinear_src_0565_8_0565_process_pixblock_head
    bilinear_src_0565_8_0565_process_four_pixels

.macro bilinear_src_0565_8_0565_process_pixblock_tail

.macro bilinear_src_0565_8_0565_process_pixblock_tail_head
    bilinear_src_0565_8_0565_process_pixblock_tail
    bilinear_src_0565_8_0565_process_pixblock_head
/* over_8888_8888 */
.macro bilinear_over_8888_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, x, 8888, over

.macro bilinear_over_8888_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, x, 8888, over

.macro bilinear_over_8888_8888_process_four_pixels
    bilinear_interpolate_four_pixels 8888, x, 8888, over

.macro bilinear_over_8888_8888_process_pixblock_head
    add       TMP1, TOP, TMP1, asl #2
    add       TMP2, TOP, TMP2, asl #2
    vld1.32   {d22}, [TMP1], STRIDE
    vld1.32   {d23}, [TMP1]
    add       TMP3, TOP, TMP3, asl #2
    vmull.u8  q8, d22, d28
    vmlal.u8  q8, d23, d29
    vld1.32   {d22}, [TMP2], STRIDE
    vld1.32   {d23}, [TMP2]
    add       TMP4, TOP, TMP4, asl #2
    vmull.u8  q9, d22, d28
    vmlal.u8  q9, d23, d29
    vld1.32   {d22}, [TMP3], STRIDE
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d16, d30
    vmlal.u16 q0, d17, d30
    vld1.32   {d16}, [TMP4], STRIDE
    vld1.32   {d17}, [TMP4]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q1, d18, d31
    vmlal.u16 q1, d19, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13

.macro bilinear_over_8888_8888_process_pixblock_tail
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q2, d20, d30
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d2, d3}, [OUT, :128]
    pld       [OUT, #(prefetch_offset * 4)]
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vrshr.u16 q1, q11, #8
    vrshr.u16 q10, q2, #8
    vraddhn.u16 d2, q1, q11
    vraddhn.u16 d3, q10, q2
    vadd.u16  q12, q12, q13
    vst1.32   {d6, d7}, [OUT, :128]!

.macro bilinear_over_8888_8888_process_pixblock_tail_head
    vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
    add       TMP1, TOP, TMP1, asl #2
    vmlsl.u16 q2, d20, d30
    add       TMP2, TOP, TMP2, asl #2
    vmlal.u16 q2, d21, d30
    vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
    vld1.32   {d20}, [TMP1], STRIDE
    vmlsl.u16 q3, d22, d31
    vmlal.u16 q3, d23, d31
    vld1.32   {d21}, [TMP1]
    vmull.u8  q8, d20, d28
    vmlal.u8  q8, d21, d29
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d2, d3}, [OUT, :128]
    vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d22}, [TMP2], STRIDE
    vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d23}, [TMP2]
    vmull.u8  q9, d22, d28
    add       TMP3, TOP, TMP3, asl #2
    add       TMP4, TOP, TMP4, asl #2
    vmlal.u8  q9, d23, d29
    vld1.32   {d22}, [TMP3], STRIDE
    vld1.32   {d23}, [TMP3]
    vmull.u8  q10, d22, d28
    vmlal.u8  q10, d23, d29
    vmull.u8  q11, d2, d4
    vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d16, d30
    vrshr.u16 q1, q11, #8
    vmlal.u16 q0, d17, d30
    vrshr.u16 q8, q2, #8
    vraddhn.u16 d2, q1, q11
    vraddhn.u16 d3, q8, q2
    vld1.32   {d16}, [TMP4], STRIDE
    vld1.32   {d17}, [TMP4]
    vmull.u8  q11, d16, d28
    vmlal.u8  q11, d17, d29
    vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q1, d18, d31
    vadd.u16  q12, q12, q13
    vmlal.u16 q1, d19, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vst1.32   {d6, d7}, [OUT, :128]!
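/*
 * Note how this tail_head block interleaves the tail of iteration N (the
 * final shifts, the OVER combine and the store) with the head of iteration
 * N + 1 (address generation and source loads), hiding NEON load and
 * multiply latencies that the plain tail + head variants would expose.
 */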
/* over_8888_8_8888 */
.macro bilinear_over_8888_8_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, 8, 8888, over

.macro bilinear_over_8888_8_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, 8, 8888, over

.macro bilinear_over_8888_8_8888_process_four_pixels
    bilinear_interpolate_four_pixels 8888, 8, 8888, over

.macro bilinear_over_8888_8_8888_process_pixblock_head
    mov       TMP1, X, asr #16
    add       TMP1, TOP, TMP1, asl #2
    vld1.32   {d0}, [TMP1], STRIDE
    mov       TMP2, X, asr #16
    add       TMP2, TOP, TMP2, asl #2
    vld1.32   {d1}, [TMP1]
    mov       TMP3, X, asr #16
    add       TMP3, TOP, TMP3, asl #2
    vld1.32   {d2}, [TMP2], STRIDE
    mov       TMP4, X, asr #16
    add       TMP4, TOP, TMP4, asl #2
    vld1.32   {d3}, [TMP2]
    vmull.u8  q2, d0, d28
    vmull.u8  q3, d2, d28
    vmlal.u8  q2, d1, d29
    vmlal.u8  q3, d3, d29
    vshll.u16 q0, d4, #BILINEAR_INTERPOLATION_BITS
    vshll.u16 q1, d6, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d4, d30
    vmlsl.u16 q1, d6, d31
    vmlal.u16 q0, d5, d30
    vmlal.u16 q1, d7, d31
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d2}, [TMP3], STRIDE
    vld1.32   {d3}, [TMP3]
    vld1.32   {d4}, [TMP4], STRIDE
    vld1.32   {d5}, [TMP4]
    vmull.u8  q3, d2, d28
    vmlal.u8  q3, d3, d29
    vmull.u8  q1, d4, d28
    vmlal.u8  q1, d5, d29
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d22[0]}, [MASK]!
    pld       [MASK, #prefetch_offset]
    vadd.u16  q12, q12, q13
.macro bilinear_over_8888_8_8888_process_pixblock_tail
    vshll.u16 q9, d6, #BILINEAR_INTERPOLATION_BITS
    vshll.u16 q10, d2, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q9, d6, d30
    vmlsl.u16 q10, d2, d31
    vmlal.u16 q9, d7, d30
    vmlal.u16 q10, d3, d31
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vshrn.u32 d18, q9, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d19, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d18, d19}, [OUT, :128]
    vmull.u8  q10, d16, d22
    vmull.u8  q11, d17, d22
    vrsra.u16 q10, q10, #8
    vrsra.u16 q11, q11, #8
    vrshrn.u16 d16, q10, #8
    vrshrn.u16 d17, q11, #8
    vmull.u8  q10, d18, d22
    vmull.u8  q11, d19, d22
    vrshr.u16 q9, q10, #8
    vrshr.u16 q0, q11, #8
    vraddhn.u16 d18, q9, q10
    vraddhn.u16 d19, q0, q11
    vst1.32   {d18, d19}, [OUT, :128]!
.macro bilinear_over_8888_8_8888_process_pixblock_tail_head
    vshll.u16 q9, d6, #BILINEAR_INTERPOLATION_BITS
    mov       TMP1, X, asr #16
    add       TMP1, TOP, TMP1, asl #2
    vshll.u16 q10, d2, #BILINEAR_INTERPOLATION_BITS
    vld1.32   {d0}, [TMP1], STRIDE
    mov       TMP2, X, asr #16
    add       TMP2, TOP, TMP2, asl #2
    vmlsl.u16 q9, d6, d30
    vmlsl.u16 q10, d2, d31
    vld1.32   {d1}, [TMP1]
    mov       TMP3, X, asr #16
    add       TMP3, TOP, TMP3, asl #2
    vmlal.u16 q9, d7, d30
    vmlal.u16 q10, d3, d31
    vld1.32   {d2}, [TMP2], STRIDE
    mov       TMP4, X, asr #16
    add       TMP4, TOP, TMP4, asl #2
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vadd.u16  q12, q12, q13
    vld1.32   {d3}, [TMP2]
    vshrn.u32 d18, q9, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d19, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
    vmull.u8  q2, d0, d28
    vmull.u8  q3, d2, d28
    vld1.32   {d18, d19}, [OUT, :128]
    pld       [OUT, #(prefetch_offset * 4)]
    vmlal.u8  q2, d1, d29
    vmlal.u8  q3, d3, d29
    vshll.u16 q0, d4, #BILINEAR_INTERPOLATION_BITS
    vshll.u16 q1, d6, #BILINEAR_INTERPOLATION_BITS
    vmlsl.u16 q0, d4, d30
    vmlsl.u16 q1, d6, d31
    vmull.u8  q10, d16, d22
    vmull.u8  q11, d17, d22
    vmlal.u16 q0, d5, d30
    vmlal.u16 q1, d7, d31
    vrsra.u16 q10, q10, #8
    vrsra.u16 q11, q11, #8
    vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
    vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
    vrshrn.u16 d16, q10, #8
    vrshrn.u16 d17, q11, #8
    vld1.32   {d2}, [TMP3], STRIDE
    vld1.32   {d3}, [TMP3]
    vld1.32   {d4}, [TMP4], STRIDE
    vmull.u8  q10, d18, d22
    vmull.u8  q11, d19, d22
    vld1.32   {d5}, [TMP4]
    vmull.u8  q3, d2, d28
    vrshr.u16 q9, q10, #8
    vrshr.u16 q15, q11, #8
    vmlal.u8  q3, d3, d29
    vmull.u8  q1, d4, d28
    vraddhn.u16 d18, q9, q10
    vraddhn.u16 d19, q15, q11
    vmlal.u8  q1, d5, d29
    vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
    vld1.32   {d22[0]}, [MASK]!
    vadd.u16  q12, q12, q13
    vst1.32   {d18, d19}, [OUT, :128]!
/* add_8888_8888 */
.macro bilinear_add_8888_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, x, 8888, add

.macro bilinear_add_8888_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, x, 8888, add

.macro bilinear_add_8888_8888_process_four_pixels
    bilinear_interpolate_four_pixels 8888, x, 8888, add

.macro bilinear_add_8888_8888_process_pixblock_head
    bilinear_add_8888_8888_process_four_pixels

.macro bilinear_add_8888_8888_process_pixblock_tail

.macro bilinear_add_8888_8888_process_pixblock_tail_head
    bilinear_add_8888_8888_process_pixblock_tail
    bilinear_add_8888_8888_process_pixblock_head

/* add_8888_8_8888 */
.macro bilinear_add_8888_8_8888_process_last_pixel
    bilinear_interpolate_last_pixel 8888, 8, 8888, add

.macro bilinear_add_8888_8_8888_process_two_pixels
    bilinear_interpolate_two_pixels 8888, 8, 8888, add

.macro bilinear_add_8888_8_8888_process_four_pixels
    bilinear_interpolate_four_pixels 8888, 8, 8888, add

.macro bilinear_add_8888_8_8888_process_pixblock_head
    bilinear_add_8888_8_8888_process_four_pixels

.macro bilinear_add_8888_8_8888_process_pixblock_tail

.macro bilinear_add_8888_8_8888_process_pixblock_tail_head
    bilinear_add_8888_8_8888_process_pixblock_tail
    bilinear_add_8888_8_8888_process_pixblock_head
/* Bilinear scanline functions */
generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_neon, \
    bilinear_src_8888_8_8888_process_last_pixel, \
    bilinear_src_8888_8_8888_process_two_pixels, \
    bilinear_src_8888_8_8888_process_four_pixels, \
    bilinear_src_8888_8_8888_process_pixblock_head, \
    bilinear_src_8888_8_8888_process_pixblock_tail, \
    bilinear_src_8888_8_8888_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_neon, \
    bilinear_src_8888_8_0565_process_last_pixel, \
    bilinear_src_8888_8_0565_process_two_pixels, \
    bilinear_src_8888_8_0565_process_four_pixels, \
    bilinear_src_8888_8_0565_process_pixblock_head, \
    bilinear_src_8888_8_0565_process_pixblock_tail, \
    bilinear_src_8888_8_0565_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_neon, \
    bilinear_src_0565_8_x888_process_last_pixel, \
    bilinear_src_0565_8_x888_process_two_pixels, \
    bilinear_src_0565_8_x888_process_four_pixels, \
    bilinear_src_0565_8_x888_process_pixblock_head, \
    bilinear_src_0565_8_x888_process_pixblock_tail, \
    bilinear_src_0565_8_x888_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_neon, \
    bilinear_src_0565_8_0565_process_last_pixel, \
    bilinear_src_0565_8_0565_process_two_pixels, \
    bilinear_src_0565_8_0565_process_four_pixels, \
    bilinear_src_0565_8_0565_process_pixblock_head, \
    bilinear_src_0565_8_0565_process_pixblock_tail, \
    bilinear_src_0565_8_0565_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_neon, \
    bilinear_over_8888_8888_process_last_pixel, \
    bilinear_over_8888_8888_process_two_pixels, \
    bilinear_over_8888_8888_process_four_pixels, \
    bilinear_over_8888_8888_process_pixblock_head, \
    bilinear_over_8888_8888_process_pixblock_tail, \
    bilinear_over_8888_8888_process_pixblock_tail_head, \

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_neon, \
    bilinear_over_8888_8_8888_process_last_pixel, \
    bilinear_over_8888_8_8888_process_two_pixels, \
    bilinear_over_8888_8_8888_process_four_pixels, \
    bilinear_over_8888_8_8888_process_pixblock_head, \
    bilinear_over_8888_8_8888_process_pixblock_tail, \
    bilinear_over_8888_8_8888_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8888_ADD_asm_neon, \
    bilinear_add_8888_8888_process_last_pixel, \
    bilinear_add_8888_8888_process_two_pixels, \
    bilinear_add_8888_8888_process_four_pixels, \
    bilinear_add_8888_8888_process_pixblock_head, \
    bilinear_add_8888_8888_process_pixblock_tail, \
    bilinear_add_8888_8888_process_pixblock_tail_head, \

generate_bilinear_scanline_func \
    pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_neon, \
    bilinear_add_8888_8_8888_process_last_pixel, \
    bilinear_add_8888_8_8888_process_two_pixels, \
    bilinear_add_8888_8_8888_process_four_pixels, \
    bilinear_add_8888_8_8888_process_pixblock_head, \
    bilinear_add_8888_8_8888_process_pixblock_tail, \
    bilinear_add_8888_8_8888_process_pixblock_tail_head, \
    4, 28, BILINEAR_FLAG_USE_MASK