/*
 * Copyright © 2009 Nokia Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
 */
/*
 * This file contains a macro ('generate_composite_function') which can
 * construct 2D image processing functions, based on a common template.
 * Any combinations of source, destination and mask images with 8bpp,
 * 16bpp, 24bpp, 32bpp color formats are supported.
 *
 * This macro takes care of:
 *  - handling of leading and trailing unaligned pixels
 *  - doing most of the work related to L2 cache preload
 *  - encouraging the use of software pipelining for better instruction
 *    scheduling
 *
 * The user of this macro has to provide some configuration parameters
 * (bit depths for the images, prefetch distance, etc.) and a set of
 * macros, which should implement basic code chunks responsible for
 * pixel processing. See the 'pixman-arm-neon-asm.S' file for usage
 * examples, and the sketch just below for the general shape of an
 * invocation.
 *
 * TODO:
 *  - try overlapped pixel method (from Ian Rickards) when processing
 *    exactly two blocks of pixels
 *  - maybe add an option to do reverse scanline processing
 */
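/*
 * For illustration, a sketch of what a typical invocation looks like,
 * loosely adapted from 'pixman-arm-neon-asm.S' (the macro names and
 * parameter values below are examples, not definitions made by this file):
 *
 *   .macro example_src_8888_8888_process_pixblock_head
 *   .endm
 *
 *   .macro example_src_8888_8888_process_pixblock_tail
 *   .endm
 *
 *   .macro example_src_8888_8888_process_pixblock_tail_head
 *       vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
 *       fetch_src_pixblock
 *       cache_preload 8, 8
 *   .endm
 *
 *   generate_composite_function \
 *       example_composite_src_8888_8888_asm_neon, 32, 0, 32, \
 *       FLAG_DST_WRITEONLY, \
 *       8,    (number of pixels processed in a single block) \
 *       10,   (prefetch distance) \
 *       default_init, \
 *       default_cleanup, \
 *       example_src_8888_8888_process_pixblock_head, \
 *       example_src_8888_8888_process_pixblock_tail, \
 *       example_src_8888_8888_process_pixblock_tail_head, \
 *       0, 0, 0, 0   (basereg overrides: dst_w, dst_r, src, mask)
 *
 * A plain copy needs no per-block arithmetic, so head/tail are empty and
 * all the work happens in the software-pipelined tail_head macro.
 */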
/*
 * Bit flags for the 'generate_composite_function' macro which are used
 * to tune the generated functions' behavior.
 */
.set FLAG_DST_WRITEONLY,       0
.set FLAG_DST_READWRITE,       1
.set FLAG_DEINTERLEAVE_32BPP,  2
/*
 * Offset into the stack where the mask and source pointers/strides can be
 * accessed from the 'init' macro. This is useful for doing special handling
 * for solid masks.
 */
.set ARGS_STACK_OFFSET,        40
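/*
 * For example, an 'init' macro for an operation with a solid source could
 * fetch the color value (passed on the stack where the source pointer would
 * normally be) like this - a sketch in the style of the init macros in
 * 'pixman-arm-neon-asm.S', with a hypothetical name:
 *
 *   .macro example_over_n_8888_init
 *       add     DUMMY, sp, #ARGS_STACK_OFFSET
 *       vld1.32 {d3[0]}, [DUMMY]   (load the solid source color)
 *       vdup.8  d0, d3[0]          (replicate one channel across a register)
 *   .endm
 */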
/*
 * Constants for selecting preferable prefetch type.
 */
.set PREFETCH_TYPE_NONE,       0 /* No prefetch at all */
.set PREFETCH_TYPE_SIMPLE,     1 /* A simple, fixed-distance-ahead prefetch */
.set PREFETCH_TYPE_ADVANCED,   2 /* Advanced fine-grained prefetch */
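/*
 * Note: PREFETCH_TYPE_DEFAULT and PREFETCH_DISTANCE_SIMPLE are not defined
 * in this file; the including file is expected to set them (for instance,
 * to PREFETCH_TYPE_ADVANCED and 64 respectively) before generating any
 * functions.
 */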
/*
 * Definitions of supplementary pixld/pixst macros (for partial load/store of
 * pixel data).
 */

.macro pixldst1 op, elem_size, reg1, mem_operand, abits
.if abits > 0
    op&.&elem_size {d&reg1}, [&mem_operand&, :&abits&]!
.else
    op&.&elem_size {d&reg1}, [&mem_operand&]!
.endif
.endm

.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
.if abits > 0
    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]!
.else
    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]!
.endif
.endm

.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
.if abits > 0
    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]!
.else
    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]!
.endif
.endm

.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits
    op&.&elem_size {d&reg1[idx]}, [&mem_operand&]!
.endm

.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
    op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]!
.endm

.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
    op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]!
.endm
.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
.if numbytes == 32
    pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \
                            %(basereg+6), %(basereg+7), mem_operand, abits
.elseif numbytes == 16
    pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits
.elseif numbytes == 8
    pixldst1 op, elem_size, %(basereg+1), mem_operand, abits
.elseif numbytes == 4
    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
        pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits
    .elseif elem_size == 16
        pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits
        pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits
    .else
        pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits
        pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits
        pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits
        pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits
    .endif
.elseif numbytes == 2
    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
        pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits
    .else
        pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits
        pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits
    .endif
.elseif numbytes == 1
    pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits
.else
    .error "unsupported size: numbytes"
.endif
.endm
.macro pixld numpix, bpp, basereg, mem_operand, abits=0
.if bpp > 0
.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \
                      %(basereg+6), %(basereg+7), mem_operand, abits
.elseif (bpp == 24) && (numpix == 8)
    pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
.elseif (bpp == 24) && (numpix == 4)
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
.elseif (bpp == 24) && (numpix == 2)
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
.elseif (bpp == 24) && (numpix == 1)
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
.else
    pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits
.endif
.endif
.endm
.macro pixst numpix, bpp, basereg, mem_operand, abits=0
.if bpp > 0
.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \
                      %(basereg+6), %(basereg+7), mem_operand, abits
.elseif (bpp == 24) && (numpix == 8)
    pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
.elseif (bpp == 24) && (numpix == 4)
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
.elseif (bpp == 24) && (numpix == 2)
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
.elseif (bpp == 24) && (numpix == 1)
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
.else
    pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
.endif
.endif
.endm
.macro pixld_a numpix, bpp, basereg, mem_operand
.if (bpp * numpix) <= 128
    pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
.else
    pixld numpix, bpp, basereg, mem_operand, 128
.endif
.endm

.macro pixst_a numpix, bpp, basereg, mem_operand
.if (bpp * numpix) <= 128
    pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
.else
    pixst numpix, bpp, basereg, mem_operand, 128
.endif
.endm
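/*
 * Example: 'pixld_a 8, 16, 0, DST_R' covers 8 * 16 = 128 bits, so it is
 * emitted with a :128 alignment hint; larger transfers still use :128,
 * which matches the 16 byte destination alignment established by the
 * leading-pixel code further below.
 */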
/*
 * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register
 * aliases to be defined).
 */
.macro pixld1_s elem_size, reg1, mem_operand
.if elem_size == 16
    mov     TMP1, VX, asr #16
    adds    VX, VX, UNIT_X
5:  subpls  VX, VX, SRC_WIDTH_FIXED
    bpl     5b
    add     TMP1, mem_operand, TMP1, asl #1
    mov     TMP2, VX, asr #16
    adds    VX, VX, UNIT_X
5:  subpls  VX, VX, SRC_WIDTH_FIXED
    bpl     5b
    add     TMP2, mem_operand, TMP2, asl #1
    vld1.16 {d&reg1&[0]}, [TMP1, :16]
    mov     TMP1, VX, asr #16
    adds    VX, VX, UNIT_X
5:  subpls  VX, VX, SRC_WIDTH_FIXED
    bpl     5b
    add     TMP1, mem_operand, TMP1, asl #1
    vld1.16 {d&reg1&[1]}, [TMP2, :16]
    mov     TMP2, VX, asr #16
    adds    VX, VX, UNIT_X
5:  subpls  VX, VX, SRC_WIDTH_FIXED
    bpl     5b
    add     TMP2, mem_operand, TMP2, asl #1
    vld1.16 {d&reg1&[2]}, [TMP1, :16]
    vld1.16 {d&reg1&[3]}, [TMP2, :16]
.elseif elem_size == 32
    mov     TMP1, VX, asr #16
    adds    VX, VX, UNIT_X
5:  subpls  VX, VX, SRC_WIDTH_FIXED
    bpl     5b
    add     TMP1, mem_operand, TMP1, asl #2
    mov     TMP2, VX, asr #16
    adds    VX, VX, UNIT_X
5:  subpls  VX, VX, SRC_WIDTH_FIXED
    bpl     5b
    add     TMP2, mem_operand, TMP2, asl #2
    vld1.32 {d&reg1&[0]}, [TMP1, :32]
    vld1.32 {d&reg1&[1]}, [TMP2, :32]
.endif
.endm
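/*
 * The x coordinate lives in VX as a 16.16 fixed-point value, so
 * 'VX, asr #16' extracts the integer pixel index and 'asl #1' / 'asl #2'
 * scales it to a byte offset for 16bpp / 32bpp pixels (for example,
 * VX = 0x00028000 means x = 2.5, giving an index of 2 and a 16bpp byte
 * offset of 4). After each 'adds VX, VX, UNIT_X' step, the
 * 'subpls ... bpl 5b' loop subtracts SRC_WIDTH_FIXED for as long as the
 * result stays non-negative, keeping VX wrapped within one source width
 * for the repeat mode.
 */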
.macro pixld2_s elem_size, reg1, reg2, mem_operand
.if 0 /* elem_size == 32 */
    mov     TMP1, VX, asr #16
    add     VX, VX, UNIT_X, asl #1
    add     TMP1, mem_operand, TMP1, asl #2
    mov     TMP2, VX, asr #16
    sub     VX, VX, UNIT_X
    add     TMP2, mem_operand, TMP2, asl #2
    vld1.32 {d&reg1&[0]}, [TMP1, :32]
    mov     TMP1, VX, asr #16
    add     VX, VX, UNIT_X, asl #1
    add     TMP1, mem_operand, TMP1, asl #2
    vld1.32 {d&reg2&[0]}, [TMP2, :32]
    mov     TMP2, VX, asr #16
    add     VX, VX, UNIT_X
    add     TMP2, mem_operand, TMP2, asl #2
    vld1.32 {d&reg1&[1]}, [TMP1, :32]
    vld1.32 {d&reg2&[1]}, [TMP2, :32]
.else
    pixld1_s elem_size, reg1, mem_operand
    pixld1_s elem_size, reg2, mem_operand
.endif
.endm
.macro pixld0_s elem_size, reg1, idx, mem_operand
.if elem_size == 16
    mov     TMP1, VX, asr #16
    adds    VX, VX, UNIT_X
5:  subpls  VX, VX, SRC_WIDTH_FIXED
    bpl     5b
    add     TMP1, mem_operand, TMP1, asl #1
    vld1.16 {d&reg1&[idx]}, [TMP1, :16]
.elseif elem_size == 32
    mov     TMP1, VX, asr #16
    adds    VX, VX, UNIT_X
5:  subpls  VX, VX, SRC_WIDTH_FIXED
    bpl     5b
    add     TMP1, mem_operand, TMP1, asl #2
    vld1.32 {d&reg1&[idx]}, [TMP1, :32]
.endif
.endm
.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
.if numbytes == 32
    pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
    pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
    pixdeinterleave elem_size, %(basereg+4)
.elseif numbytes == 16
    pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
.elseif numbytes == 8
    pixld1_s elem_size, %(basereg+1), mem_operand
.elseif numbytes == 4
    .if elem_size == 32
        pixld0_s elem_size, %(basereg+0), 1, mem_operand
    .elseif elem_size == 16
        pixld0_s elem_size, %(basereg+0), 2, mem_operand
        pixld0_s elem_size, %(basereg+0), 3, mem_operand
    .else
        pixld0_s elem_size, %(basereg+0), 4, mem_operand
        pixld0_s elem_size, %(basereg+0), 5, mem_operand
        pixld0_s elem_size, %(basereg+0), 6, mem_operand
        pixld0_s elem_size, %(basereg+0), 7, mem_operand
    .endif
.elseif numbytes == 2
    .if elem_size == 16
        pixld0_s elem_size, %(basereg+0), 1, mem_operand
    .else
        pixld0_s elem_size, %(basereg+0), 2, mem_operand
        pixld0_s elem_size, %(basereg+0), 3, mem_operand
    .endif
.elseif numbytes == 1
    pixld0_s elem_size, %(basereg+0), 1, mem_operand
.else
    .error "unsupported size: numbytes"
.endif
.endm
.macro pixld_s numpix, bpp, basereg, mem_operand
.if bpp > 0
    pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
.endif
.endm
.macro vuzp8 reg1, reg2
    vuzp.8 d&reg1, d&reg2
.endm

.macro vzip8 reg1, reg2
    vzip.8 d&reg1, d&reg2
.endm

/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
.macro pixdeinterleave bpp, basereg
.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    vuzp8 %(basereg+0), %(basereg+1)
    vuzp8 %(basereg+2), %(basereg+3)
    vuzp8 %(basereg+1), %(basereg+3)
    vuzp8 %(basereg+0), %(basereg+2)
.endif
.endm

/* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
.macro pixinterleave bpp, basereg
.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    vzip8 %(basereg+0), %(basereg+2)
    vzip8 %(basereg+1), %(basereg+3)
    vzip8 %(basereg+2), %(basereg+3)
    vzip8 %(basereg+0), %(basereg+1)
.endif
.endm
/*
 * This is a macro for implementing cache preload. The main idea is that
 * the cache preload logic is mostly independent from the rest of the
 * pixel processing code. It starts at the top left pixel, moves forward
 * across pixels, and can jump across scanlines. The prefetch distance is
 * handled in an 'incremental' way: it starts from 0 and advances to the
 * optimal distance over time. After reaching the optimal prefetch
 * distance, it is kept constant. There are some checks which prevent
 * prefetching unneeded pixel lines below the image (but it still can
 * prefetch a bit more data on the right side of the image - not a big
 * issue, and it may actually be helpful when rendering text glyphs). An
 * additional trick is the use of the LDR instruction for prefetch instead
 * of PLD when moving to the next line: we have a high chance of getting a
 * TLB miss in this case, and PLD would be useless.
 *
 * This sounds like it may introduce a noticeable overhead (when working with
 * fully cached data). But in reality, due to having a separate pipeline and
 * instruction queue for the NEON unit in ARM Cortex-A8, normal ARM code can
 * execute simultaneously with NEON and be completely shadowed by it. Thus
 * we get no performance overhead at all (*). This looks like a very nice
 * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher,
 * but still can implement some rather advanced prefetch logic in software
 * for almost zero cost!
 *
 * (*) The overhead of the prefetcher is visible when running some trivial
 *     pixel processing like a simple copy. Anyway, having prefetch is a must
 *     when working with graphics data.
 */
.macro PF a, x:vararg
.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
    a x
.endif
.endm
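/*
 * For example, 'PF add PF_X, PF_X, #1' assembles to 'add PF_X, PF_X, #1'
 * when PREFETCH_TYPE_CURRENT is PREFETCH_TYPE_ADVANCED, and to nothing at
 * all otherwise, so advanced-prefetch code can be interleaved with the
 * main loop at zero cost for functions that do not use it.
 */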
.macro cache_preload std_increment, boost_increment
.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
.if regs_shortage
    PF ldr      ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
.endif
.if std_increment != 0
    PF add      PF_X, PF_X, #std_increment
.endif
    PF tst      PF_CTL, #0xF
    PF addne    PF_X, PF_X, #boost_increment
    PF subne    PF_CTL, PF_CTL, #1
    PF cmp      PF_X, ORIG_W
.if src_bpp_shift >= 0
    PF pld,     [PF_SRC, PF_X, lsl #src_bpp_shift]
.endif
.if dst_r_bpp != 0
    PF pld,     [PF_DST, PF_X, lsl #dst_bpp_shift]
.endif
.if mask_bpp_shift >= 0
    PF pld,     [PF_MASK, PF_X, lsl #mask_bpp_shift]
.endif
    PF subge    PF_X, PF_X, ORIG_W
    PF subges   PF_CTL, PF_CTL, #0x10
.if src_bpp_shift >= 0
    PF ldrgeb   DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
.endif
.if dst_r_bpp != 0
    PF ldrgeb   DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
.endif
.if mask_bpp_shift >= 0
    PF ldrgeb   DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
.endif
.endif
.endm
.macro cache_preload_simple
.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
.if src_bpp > 0
    pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]
.endif
.if dst_r_bpp > 0
    pld [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)]
.endif
.if mask_bpp > 0
    pld [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)]
.endif
.endif
.endm
.macro fetch_mask_pixblock
    pixld       pixblock_size, mask_bpp, \
                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
.endm
/*
 * Macro which is used to process leading pixels until the destination
 * pointer is properly aligned (at a 16 byte boundary). When the destination
 * buffer uses a 16bpp format, this is unnecessary, or even pointless.
 */
.macro ensure_destination_ptr_alignment process_pixblock_head, \
                                        process_pixblock_tail, \
                                        process_pixblock_tail_head
.if dst_w_bpp != 24
    tst         DST_R, #0xF
    beq         2f

.irp lowbit, 1, 2, 4, 8, 16
.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
.if lowbit < 16 /* we don't need more than 16-byte alignment */
    tst         DST_R, #lowbit
    beq         1f
.endif
    pixld_src   (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
    pixld       (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
.if dst_r_bpp > 0
    pixld_a     (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
.else
    add         DST_R, DST_R, #lowbit
.endif
    PF add      PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
    sub         W, W, #(lowbit * 8 / dst_w_bpp)
1:
.endif
.endr
    pixdeinterleave src_bpp, src_basereg
    pixdeinterleave mask_bpp, mask_basereg
    pixdeinterleave dst_r_bpp, dst_r_basereg

    process_pixblock_head
    cache_preload 0, pixblock_size
    cache_preload_simple
    process_pixblock_tail

    pixinterleave dst_w_bpp, dst_w_basereg
.irp lowbit, 1, 2, 4, 8, 16
.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
.if lowbit < 16 /* we don't need more than 16-byte alignment */
    tst         DST_W, #lowbit
    beq         1f
.endif
    pixst_a     (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
1:
.endif
.endr
.endif
2:
.endm
/*
 * Special code for processing up to (pixblock_size - 1) remaining
 * trailing pixels. As SIMD processing performs an operation on
 * pixblock_size pixels, anything smaller than this has to be loaded
 * and stored in a special way. Loading and storing of pixel data is
 * performed in such a way that we fill some 'slots' in the NEON
 * registers (some slots naturally are unused), then perform the
 * compositing operation as usual. In the end, the data is taken from
 * these 'slots' and saved to memory.
 *
 * cache_preload_flag - allows suppressing the prefetch if set to 0
 * dst_aligned_flag   - selects whether the destination buffer is aligned
 */
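/*
 * For example, with pixblock_size = 8 and 5 trailing pixels left, the
 * .irp block below loads a chunk of 4 pixels and a chunk of 1 pixel
 * (W & 4 and W & 1 are both set), runs one full head/tail pass with the
 * unused register slots simply ignored, and then stores the same 4 + 1
 * chunks back to memory.
 */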
.macro process_trailing_pixels cache_preload_flag, \
                               dst_aligned_flag, \
                               process_pixblock_head, \
                               process_pixblock_tail, \
                               process_pixblock_tail_head
    tst         W, #(pixblock_size - 1)
    beq         2f
.irp chunk_size, 16, 8, 4, 2, 1
.if pixblock_size > chunk_size
    tst         W, #chunk_size
    beq         1f
    pixld_src   chunk_size, src_bpp, src_basereg, SRC
    pixld       chunk_size, mask_bpp, mask_basereg, MASK
.if dst_aligned_flag != 0
    pixld_a     chunk_size, dst_r_bpp, dst_r_basereg, DST_R
.else
    pixld       chunk_size, dst_r_bpp, dst_r_basereg, DST_R
.endif
.if cache_preload_flag != 0
    PF add      PF_X, PF_X, #chunk_size
.endif
1:
.endif
.endr
    pixdeinterleave src_bpp, src_basereg
    pixdeinterleave mask_bpp, mask_basereg
    pixdeinterleave dst_r_bpp, dst_r_basereg

    process_pixblock_head
.if cache_preload_flag != 0
    cache_preload 0, pixblock_size
    cache_preload_simple
.endif
    process_pixblock_tail
    pixinterleave dst_w_bpp, dst_w_basereg
.irp chunk_size, 16, 8, 4, 2, 1
.if pixblock_size > chunk_size
    tst         W, #chunk_size
    beq         1f
.if dst_aligned_flag != 0
    pixst_a     chunk_size, dst_w_bpp, dst_w_basereg, DST_W
.else
    pixst       chunk_size, dst_w_bpp, dst_w_basereg, DST_W
.endif
1:
.endif
.endr
2:
.endm
/*
 * Macro which performs all the needed operations to switch to the next
 * scanline and start the next loop iteration, unless all the scanlines
 * are already processed.
 */
.macro advance_to_next_scanline start_of_loop_label
.if regs_shortage
    ldrd        W, [sp] /* load W and H (width and height) from stack */
.else
    mov         W, ORIG_W
.endif
    add         DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift
.if src_bpp != 0
    add         SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
.endif
.if mask_bpp != 0
    add         MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
.endif
.if (dst_w_bpp != 24)
    sub         DST_W, DST_W, W, lsl #dst_bpp_shift
.endif
.if (src_bpp != 24) && (src_bpp != 0)
    sub         SRC, SRC, W, lsl #src_bpp_shift
.endif
.if (mask_bpp != 24) && (mask_bpp != 0)
    sub         MASK, MASK, W, lsl #mask_bpp_shift
.endif
    subs        H, H, #1
    mov         DST_R, DST_W
.if regs_shortage
    str         H, [sp, #4] /* save updated height to stack */
.endif
    bge         start_of_loop_label
.endm
/*
 * Registers are allocated in the following way by default:
 * d0, d1, d2, d3     - reserved for loading source pixel data
 * d4, d5, d6, d7     - reserved for loading destination pixel data
 * d24, d25, d26, d27 - reserved for loading mask pixel data
 * d28, d29, d30, d31 - final destination pixel data for writeback to memory
 */
.macro generate_composite_function fname, \
                                   src_bpp_, \
                                   mask_bpp_, \
                                   dst_w_bpp_, \
                                   flags, \
                                   pixblock_size_, \
                                   prefetch_distance, \
                                   init, \
                                   cleanup, \
                                   process_pixblock_head, \
                                   process_pixblock_tail, \
                                   process_pixblock_tail_head, \
                                   dst_w_basereg_ = 28, \
                                   dst_r_basereg_ = 4, \
                                   src_basereg_   = 0, \
                                   mask_basereg_  = 24
    pixman_asm_function fname

    push        {r4-r12, lr}        /* save all registers */

/*
 * Select prefetch type for this function. If prefetch distance is
 * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch
 * has to be used instead of ADVANCED.
 */
    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
.if prefetch_distance == 0
    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
        ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
.endif
/*
 * Make some macro arguments globally visible and accessible
 * from other macros
 */
    .set src_bpp, src_bpp_
    .set mask_bpp, mask_bpp_
    .set dst_w_bpp, dst_w_bpp_
    .set pixblock_size, pixblock_size_
    .set dst_w_basereg, dst_w_basereg_
    .set dst_r_basereg, dst_r_basereg_
    .set src_basereg, src_basereg_
    .set mask_basereg, mask_basereg_
    .macro pixld_src x:vararg
        pixld x
    .endm
    .macro fetch_src_pixblock
        pixld_src   pixblock_size, src_bpp, \
                    (src_basereg - pixblock_size * src_bpp / 64), SRC
    .endm
/*
 * Assign symbolic names to registers
 */
    W           .req    r0  /* width (is updated during processing) */
    H           .req    r1  /* height (is updated during processing) */
    DST_W       .req    r2  /* destination buffer pointer for writes */
    DST_STRIDE  .req    r3  /* destination image stride */
    SRC         .req    r4  /* source buffer pointer */
    SRC_STRIDE  .req    r5  /* source image stride */
    DST_R       .req    r6  /* destination buffer pointer for reads */

    MASK        .req    r7  /* mask pointer */
    MASK_STRIDE .req    r8  /* mask stride */

    PF_CTL      .req    r9  /* combined lines counter and prefetch */
                            /* distance increment counter */
    PF_X        .req    r10 /* pixel index in a scanline for current */
                            /* prefetch position */
    PF_SRC      .req    r11 /* pointer to source scanline start */
                            /* for prefetch purposes */
    PF_DST      .req    r12 /* pointer to destination scanline start */
                            /* for prefetch purposes */
    PF_MASK     .req    r14 /* pointer to mask scanline start */
                            /* for prefetch purposes */
/*
 * Check whether we have enough registers for all the local variables.
 * If we don't have enough registers, the original width and height are
 * kept on top of the stack (and the 'regs_shortage' variable is set to
 * indicate this for the rest of the code). Even if there are enough
 * registers, the allocation scheme may be a bit different depending on
 * whether source or mask is not used.
 */
.if (PREFETCH_TYPE_CURRENT < PREFETCH_TYPE_ADVANCED)
    ORIG_W      .req    r10 /* saved original width */
    DUMMY       .req    r12 /* temporary register */
    .set        regs_shortage, 0
.elseif mask_bpp == 0
    ORIG_W      .req    r7  /* saved original width */
    DUMMY       .req    r8  /* temporary register */
    .set        regs_shortage, 0
.elseif src_bpp == 0
    ORIG_W      .req    r4  /* saved original width */
    DUMMY       .req    r5  /* temporary register */
    .set        regs_shortage, 0
.else
    ORIG_W      .req    r1  /* saved original width */
    DUMMY       .req    r1  /* temporary register */
    .set        regs_shortage, 1
.endif
    .set mask_bpp_shift, -1
.if src_bpp == 32
    .set src_bpp_shift, 2
.elseif src_bpp == 24
    .set src_bpp_shift, 0
.elseif src_bpp == 16
    .set src_bpp_shift, 1
.elseif src_bpp == 8
    .set src_bpp_shift, 0
.elseif src_bpp == 0
    .set src_bpp_shift, -1
.else
    .error "requested src bpp (src_bpp) is not supported"
.endif
.if mask_bpp == 32
    .set mask_bpp_shift, 2
.elseif mask_bpp == 24
    .set mask_bpp_shift, 0
.elseif mask_bpp == 8
    .set mask_bpp_shift, 0
.elseif mask_bpp == 0
    .set mask_bpp_shift, -1
.else
    .error "requested mask bpp (mask_bpp) is not supported"
.endif
.if dst_w_bpp == 32
    .set dst_bpp_shift, 2
.elseif dst_w_bpp == 24
    .set dst_bpp_shift, 0
.elseif dst_w_bpp == 16
    .set dst_bpp_shift, 1
.elseif dst_w_bpp == 8
    .set dst_bpp_shift, 0
.else
    .error "requested dst bpp (dst_w_bpp) is not supported"
.endif
.if (((flags) & FLAG_DST_READWRITE) != 0)
    .set dst_r_bpp, dst_w_bpp
.else
    .set dst_r_bpp, 0
.endif
.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
    .set DEINTERLEAVE_32BPP_ENABLED, 1
.else
    .set DEINTERLEAVE_32BPP_ENABLED, 0
.endif

.if prefetch_distance < 0 || prefetch_distance > 15
    .error "invalid prefetch distance (prefetch_distance)"
.endif
.if src_bpp > 0
    ldr         SRC, [sp, #40]
.endif
.if mask_bpp > 0
    ldr         MASK, [sp, #48]
.endif
    PF mov      PF_X, #0
.if src_bpp > 0
    ldr         SRC_STRIDE, [sp, #44]
.endif
.if mask_bpp > 0
    ldr         MASK_STRIDE, [sp, #52]
.endif
    mov         DST_R, DST_W

.if src_bpp == 24
    sub         SRC_STRIDE, SRC_STRIDE, W
    sub         SRC_STRIDE, SRC_STRIDE, W, lsl #1
.endif
.if mask_bpp == 24
    sub         MASK_STRIDE, MASK_STRIDE, W
    sub         MASK_STRIDE, MASK_STRIDE, W, lsl #1
.endif
.if dst_w_bpp == 24
    sub         DST_STRIDE, DST_STRIDE, W
    sub         DST_STRIDE, DST_STRIDE, W, lsl #1
.endif
/*
 * Setup advanced prefetcher initial state
 */
    PF mov      PF_SRC, SRC
    PF mov      PF_DST, DST_R
    PF mov      PF_MASK, MASK
    /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
    PF mov      PF_CTL, H, lsl #4
    PF add      PF_CTL, #(prefetch_distance - 0x10)

    init
.if regs_shortage
    push        {r0, r1}
.endif
    subs        H, H, #1
.if regs_shortage
    str         H, [sp, #4] /* save updated height to stack */
.else
    mov         ORIG_W, W
.endif
    blt         9f
    cmp         W, #(pixblock_size * 2)
    blt         8f
/*
 * This is the start of the pipelined loop, which is optimized for
 * long scanlines
 */
0:
    ensure_destination_ptr_alignment process_pixblock_head, \
                                     process_pixblock_tail, \
                                     process_pixblock_tail_head

    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
    pixld_a     pixblock_size, dst_r_bpp, \
                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
    fetch_src_pixblock
    pixld       pixblock_size, mask_bpp, \
                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    PF add      PF_X, PF_X, #pixblock_size
    process_pixblock_head
    cache_preload 0, pixblock_size
    cache_preload_simple
    subs        W, W, #(pixblock_size * 2)
    blt         2f
1:
    process_pixblock_tail_head
    cache_preload_simple
    subs        W, W, #pixblock_size
    bge         1b
2:
    process_pixblock_tail
    pixst_a     pixblock_size, dst_w_bpp, \
                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W

    /* Process the remaining trailing pixels in the scanline */
    process_trailing_pixels 1, 1, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head
    advance_to_next_scanline 0b
.if regs_shortage
    pop         {r0, r1}
.endif
    cleanup
    pop         {r4-r12, pc}  /* exit */
/*
 * This is the start of the loop, designed to process images with small width
 * (less than pixblock_size * 2 pixels). In this case neither pipelining
 * nor prefetch are used.
 */
8:
    /* Process exactly pixblock_size pixels if needed */
    tst         W, #pixblock_size
    beq         1f
    pixld       pixblock_size, dst_r_bpp, \
                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
    fetch_src_pixblock
    pixld       pixblock_size, mask_bpp, \
                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    process_pixblock_head
    process_pixblock_tail
    pixst       pixblock_size, dst_w_bpp, \
                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
1:
    /* Process the remaining trailing pixels in the scanline */
    process_trailing_pixels 0, 0, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head
    advance_to_next_scanline 8b
9:
.if regs_shortage
    pop         {r0, r1}
.endif
    cleanup
    pop         {r4-r12, pc}  /* exit */

    .purgem     fetch_src_pixblock
    .purgem     pixld_src

    .unreq      SRC
    .unreq      MASK
    .unreq      DST_R
    .unreq      DST_W
    .unreq      ORIG_W
    .unreq      W
    .unreq      H
    .unreq      SRC_STRIDE
    .unreq      DST_STRIDE
    .unreq      MASK_STRIDE
    .unreq      PF_CTL
    .unreq      PF_X
    .unreq      PF_SRC
    .unreq      PF_DST
    .unreq      PF_MASK
    .unreq      DUMMY
    .endfunc
.endm
/*
 * A simplified variant of the function generation template, for single
 * scanline processing (used for implementing pixman combine functions).
 */
.macro generate_composite_function_scanline use_nearest_scaling, \
                                            fname, \
                                            src_bpp_, \
                                            mask_bpp_, \
                                            dst_w_bpp_, \
                                            flags, \
                                            pixblock_size_, \
                                            init, \
                                            cleanup, \
                                            process_pixblock_head, \
                                            process_pixblock_tail, \
                                            process_pixblock_tail_head, \
                                            dst_w_basereg_ = 28, \
                                            dst_r_basereg_ = 4, \
                                            src_basereg_   = 0, \
                                            mask_basereg_  = 24

    pixman_asm_function fname
    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE

/*
 * Make some macro arguments globally visible and accessible
 * from other macros
 */
    .set src_bpp, src_bpp_
    .set mask_bpp, mask_bpp_
    .set dst_w_bpp, dst_w_bpp_
    .set pixblock_size, pixblock_size_
    .set dst_w_basereg, dst_w_basereg_
    .set dst_r_basereg, dst_r_basereg_
    .set src_basereg, src_basereg_
    .set mask_basereg, mask_basereg_
.if use_nearest_scaling != 0
/*
 * Assign symbolic names to registers for nearest scaling
 */
    W               .req    r0
    DST_W           .req    r1
    SRC             .req    r2
    VX              .req    r3
    UNIT_X          .req    ip
    MASK            .req    lr
    TMP1            .req    r4
    TMP2            .req    r5
    DST_R           .req    r6
    SRC_WIDTH_FIXED .req    r7

    .macro pixld_src x:vararg
        pixld_s x
    .endm

    ldr         UNIT_X, [sp]
    push        {r4-r8, lr}
    ldr         SRC_WIDTH_FIXED, [sp, #(24 + 4)]
    .if mask_bpp != 0
    ldr         MASK, [sp, #(24 + 8)]
    .endif
.else
/*
 * Assign symbolic names to registers
 */
    W           .req    r0  /* width (is updated during processing) */
    DST_W       .req    r1  /* destination buffer pointer for writes */
    SRC         .req    r2  /* source buffer pointer */
    DST_R       .req    ip  /* destination buffer pointer for reads */
    MASK        .req    r3  /* mask pointer */

    .macro pixld_src x:vararg
        pixld x
    .endm
.endif
.if (((flags) & FLAG_DST_READWRITE) != 0)
    .set dst_r_bpp, dst_w_bpp
.else
    .set dst_r_bpp, 0
.endif
.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
    .set DEINTERLEAVE_32BPP_ENABLED, 1
.else
    .set DEINTERLEAVE_32BPP_ENABLED, 0
.endif
    .macro fetch_src_pixblock
        pixld_src   pixblock_size, src_bpp, \
                    (src_basereg - pixblock_size * src_bpp / 64), SRC
    .endm

    init
    mov         DST_R, DST_W

    cmp         W, #pixblock_size
    blt         8f
    ensure_destination_ptr_alignment process_pixblock_head, \
                                     process_pixblock_tail, \
                                     process_pixblock_tail_head

    subs        W, W, #pixblock_size
    blt         7f

    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
    pixld_a     pixblock_size, dst_r_bpp, \
                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
    fetch_src_pixblock
    pixld       pixblock_size, mask_bpp, \
                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    process_pixblock_head
    subs        W, W, #pixblock_size
    blt         2f
1:
    process_pixblock_tail_head
    subs        W, W, #pixblock_size
    bge         1b
2:
    process_pixblock_tail
    pixst_a     pixblock_size, dst_w_bpp, \
                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
7:
    /* Process the remaining trailing pixels in the scanline (dst aligned) */
    process_trailing_pixels 0, 1, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head
    cleanup
.if use_nearest_scaling != 0
    pop         {r4-r8, pc}  /* exit */
.else
    bx          lr  /* exit */
.endif
8:
    /* Process the remaining trailing pixels in the scanline (dst unaligned) */
    process_trailing_pixels 0, 0, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head

    cleanup

.if use_nearest_scaling != 0
    pop         {r4-r8, pc}  /* exit */

    .unreq      DST_R
    .unreq      SRC
    .unreq      W
    .unreq      VX
    .unreq      UNIT_X
    .unreq      TMP1
    .unreq      TMP2
    .unreq      DST_W
    .unreq      MASK
    .unreq      SRC_WIDTH_FIXED
.else
    bx          lr  /* exit */

    .unreq      SRC
    .unreq      MASK
    .unreq      DST_R
    .unreq      DST_W
    .unreq      W
.endif

    .purgem     fetch_src_pixblock
    .purgem     pixld_src

    .endfunc
.endm
.macro generate_composite_function_single_scanline x:vararg
    generate_composite_function_scanline 0, x
.endm

.macro generate_composite_function_nearest_scanline x:vararg
    generate_composite_function_scanline 1, x
.endm
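/*
 * An illustrative invocation (a sketch in the style of
 * 'pixman-arm-neon-asm.S'; the macro names here are examples):
 *
 *   generate_composite_function_single_scanline \
 *       example_composite_scanline_over_asm_neon, 32, 0, 32, \
 *       FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
 *       8, \
 *       default_init, \
 *       default_cleanup, \
 *       example_over_8888_8888_process_pixblock_head, \
 *       example_over_8888_8888_process_pixblock_tail, \
 *       example_over_8888_8888_process_pixblock_tail_head
 */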
/* Default prologue/epilogue, nothing special needs to be done */

.macro default_init
.endm

.macro default_cleanup
.endm

/*
 * Prologue/epilogue variant which additionally saves/restores the d8-d15
 * registers (per the ABI, they must be saved/restored by the callee).
 * This is required if the code needs to use all the NEON registers.
 */

.macro default_init_need_all_regs
    vpush       {d8-d15}
.endm

.macro default_cleanup_need_all_regs
    vpop        {d8-d15}
.endm

/******************************************************************************/
/*
 * Conversion of 8 r5g6b5 pixels packed in a 128-bit register (in)
 * into a planar a8r8g8b8 format (with a, r, g, b color components
 * stored into 64-bit registers out_a, out_r, out_g, out_b respectively).
 *
 * Warning: the conversion is destructive and the original
 *          value (in) is lost.
 */
.macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
    vshrn.u16   out_r, in,    #8
    vshrn.u16   out_g, in,    #3
    vsli.u16    in,    in,    #5
    vmov.u8     out_a, #255
    vsri.u8     out_r, out_r, #5
    vsri.u8     out_g, out_g, #6
    vshrn.u16   out_b, in,    #2
.endm
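/*
 * How the unpacking above works, bit by bit: an r5g6b5 pixel is laid out
 * as rrrrrggg gggbbbbb. 'vshrn.u16 out_r, in, #8' keeps the top byte
 * (rrrrrggg) and 'vsri.u8 out_r, out_r, #5' shifts it right by 5 and
 * re-inserts it, replicating the top 3 red bits into the low bits. Green
 * is widened the same way by the #3/#6 pair, and 'vsli.u16 in, in, #5'
 * lines the blue bits up so that the final 'vshrn.u16 out_b, in, #2'
 * yields 8 bits of blue.
 */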
.macro convert_0565_to_x888 in, out_r, out_g, out_b
    vshrn.u16   out_r, in,    #8
    vshrn.u16   out_g, in,    #3
    vsli.u16    in,    in,    #5
    vsri.u8     out_r, out_r, #5
    vsri.u8     out_g, out_g, #6
    vshrn.u16   out_b, in,    #2
.endm
/*
 * Conversion from planar a8r8g8b8 format (with a, r, g, b color components
 * in 64-bit registers in_a, in_r, in_g, in_b respectively) into 8 r5g6b5
 * pixels packed in a 128-bit register (out). Requires two temporary 128-bit
 * registers (tmp1, tmp2).
 */
.macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
    vshll.u8    tmp1, in_g, #8
    vshll.u8    out,  in_r, #8
    vshll.u8    tmp2, in_b, #8
    vsri.u16    out,  tmp1, #5
    vsri.u16    out,  tmp2, #11
.endm
/*
 * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels,
 * returned in the (out0, out1) register pair. Requires one temporary
 * 64-bit register (tmp). 'out1' and 'in' may overlap; the original
 * value from 'in' is lost.
 */
.macro convert_four_0565_to_x888_packed in, out0, out1, tmp
    vshl.u16    out0, in,   #5  /* G top 6 bits */
    vshl.u16    tmp,  in,   #11 /* B top 5 bits */
    vsri.u16    in,   in,   #5  /* R is ready in top bits */
    vsri.u16    out0, out0, #6  /* G is ready in top bits */
    vsri.u16    tmp,  tmp,  #5  /* B is ready in top bits */
    vshr.u16    out1, in,   #8  /* R is in place */
    vsri.u16    out0, tmp,  #8  /* G & B are in place */
    vzip.u16    out0, out1      /* everything is in place */
.endm