/* filter_vsx_intrinsics.c - PowerPC optimised filter functions
 *
 * Copyright (c) 2018 Cosmin Truta
 * Copyright (c) 2017 Glenn Randers-Pehrson
 * Written by Vadim Barkov, 2017.
 *
 * This code is released under the libpng license.
 * For conditions of distribution and use, see the disclaimer
 * and license in png.h
 */

#include <stdio.h>
#include <stdint.h>
#include "../pngpriv.h"
#ifdef PNG_READ_SUPPORTED

/* This code requires -maltivec and -mvsx on the command line: */
#if PNG_POWERPC_VSX_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */

#include <altivec.h>

#if PNG_POWERPC_VSX_OPT > 0

#ifndef __VSX__
#  error "This code requires VSX support (POWER7 and later). Please provide -mvsx compiler flag."
#endif
#define vec_ld_unaligned(vec,data) vec = vec_vsx_ld(0,data)
#define vec_st_unaligned(vec,data) vec_vsx_st(vec,0,data)
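/* Note on the wrappers above: vec_ld/vec_st access only 16-byte aligned
 * addresses (the low four address bits are ignored), while vec_vsx_ld and
 * vec_vsx_st accept arbitrary addresses.  The wrappers are therefore used
 * for prev_row, which is not realigned by the scalar prologue below.
 */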
/* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d).
 * They're positioned like this:
 *    prev:  c b
 *    row:   a d
 * The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be
 * whichever of a, b, or c is closest to p=a+b-c.
 * ( this is taken from ../intel/filter_sse2_intrinsics.c )
 */
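/* For illustration only (not part of libpng's API, compiled out below): a
 * scalar sketch of the Paeth selection described above.  The vector code
 * later in this file computes the same choice with vec_min/vec_cmpeq over
 * whole registers; the function name is hypothetical and abs() would need
 * <stdlib.h> if the guard were removed.
 */
#if 0
static int paeth_predictor_sketch(int a, int b, int c)
{
   int p  = a + b - c;   /* initial estimate            */
   int pa = abs(p - a);  /* distance to a (left)        */
   int pb = abs(p - b);  /* distance to b (above)       */
   int pc = abs(p - c);  /* distance to c (upper-left)  */

   /* Return whichever of a, b, c is nearest to p, breaking ties in the
    * order a, b, c.
    */
   if (pa <= pb && pa <= pc) return a;
   if (pb <= pc) return b;
   return c;
}
#endif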
#define vsx_declare_common_vars(row_info,row,prev_row,offset) \
   png_byte i;\
   png_bytep rp = row + offset;\
   png_const_bytep pp = prev_row;\
   size_t unaligned_top = 16 - (((size_t)rp % 16));\
   size_t istop;\
   if(unaligned_top == 16)\
      unaligned_top = 0;\
   istop = row_info->rowbytes;\
   if((unaligned_top < istop))\
      istop -= unaligned_top;\
   else{\
      unaligned_top = istop;\
      istop = 0;\
   }
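/* Illustration of the split above (assumed values): if rp % 16 == 12 and
 * row_info->rowbytes == 100, then unaligned_top == 4 and istop == 96, so the
 * byte-by-byte prologue in each filter covers 4 bytes, the 16-byte vector
 * loop covers the next 96, and any tail shorter than 16 bytes is again
 * handled byte by byte.
 */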
void png_read_filter_row_up_vsx(png_row_infop row_info, png_bytep row,
                                png_const_bytep prev_row)
{
   vector unsigned char rp_vec;
   vector unsigned char pp_vec;
   vsx_declare_common_vars(row_info,row,prev_row,0)

   /* Altivec operations require 16-byte aligned data
    * but input can be unaligned. So we calculate
    * unaligned part as usual.
    */
   for (i = 0; i < unaligned_top; i++)
   {
      *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
      rp++;
   }

   /* Using SIMD while we can */
   while( istop >= 16 )
   {
      rp_vec = vec_ld(0,rp);
      vec_ld_unaligned(pp_vec,pp);

      rp_vec = vec_add(rp_vec,pp_vec);

      vec_st(rp_vec,0,rp);

      pp += 16;
      rp += 16;
      istop -= 16;
   }

   if(istop > 0)
   {
      /* If byte count of row is not divisible by 16
       * we will process remaining part as usual
       */
      for (i = 0; i < istop; i++)
      {
         *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
         rp++;
      }
   }
}
static const vector unsigned char VSX_LEFTSHIFTED1_4 = {16,16,16,16, 0, 1, 2, 3,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_LEFTSHIFTED2_4 = {16,16,16,16,16,16,16,16, 4, 5, 6, 7,16,16,16,16};
static const vector unsigned char VSX_LEFTSHIFTED3_4 = {16,16,16,16,16,16,16,16,16,16,16,16, 8, 9,10,11};

static const vector unsigned char VSX_LEFTSHIFTED1_3 = {16,16,16, 0, 1, 2,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_LEFTSHIFTED2_3 = {16,16,16,16,16,16, 3, 4, 5,16,16,16,16,16,16,16};
static const vector unsigned char VSX_LEFTSHIFTED3_3 = {16,16,16,16,16,16,16,16,16, 6, 7, 8,16,16,16,16};
static const vector unsigned char VSX_LEFTSHIFTED4_3 = {16,16,16,16,16,16,16,16,16,16,16,16, 9,10,11,16};

static const vector unsigned char VSX_NOT_SHIFTED1_4 = {16,16,16,16, 4, 5, 6, 7,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_NOT_SHIFTED2_4 = {16,16,16,16,16,16,16,16, 8, 9,10,11,16,16,16,16};
static const vector unsigned char VSX_NOT_SHIFTED3_4 = {16,16,16,16,16,16,16,16,16,16,16,16,12,13,14,15};

static const vector unsigned char VSX_NOT_SHIFTED1_3 = {16,16,16, 3, 4, 5,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_NOT_SHIFTED2_3 = {16,16,16,16,16,16, 6, 7, 8,16,16,16,16,16,16,16};
static const vector unsigned char VSX_NOT_SHIFTED3_3 = {16,16,16,16,16,16,16,16,16, 9,10,11,16,16,16,16};
static const vector unsigned char VSX_NOT_SHIFTED4_3 = {16,16,16,16,16,16,16,16,16,16,16,16,12,13,14,16};

static const vector unsigned char VSX_CHAR_ZERO = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
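/* In the permute tables above and below, indices 0..15 select bytes from the
 * first vec_perm operand (the row or prev_row register) and indices 16..31
 * select from the second operand; since that second operand is always
 * VSX_CHAR_ZERO, an entry of 16 simply inserts a zero byte.  The "_4"/"_3"
 * suffix is the pixel size in bytes, and the LEFTSHIFTEDn/NOT_SHIFTEDn
 * patterns place the n-th pixel of the 16-byte block at the position needed
 * by the filter arithmetic.
 */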
#ifdef __LITTLE_ENDIAN__

static const vector unsigned char VSX_CHAR_TO_SHORT1_4 = { 4,16, 5,16, 6,16, 7,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT2_4 = { 8,16, 9,16,10,16,11,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT3_4 = {12,16,13,16,14,16,15,16,16,16,16,16,16,16,16,16};

static const vector unsigned char VSX_SHORT_TO_CHAR1_4 = {16,16,16,16, 0, 2, 4, 6,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR2_4 = {16,16,16,16,16,16,16,16, 0, 2, 4, 6,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR3_4 = {16,16,16,16,16,16,16,16,16,16,16,16, 0, 2, 4, 6};

static const vector unsigned char VSX_CHAR_TO_SHORT1_3 = { 3,16, 4,16, 5,16,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT2_3 = { 6,16, 7,16, 8,16,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT3_3 = { 9,16,10,16,11,16,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT4_3 = {12,16,13,16,14,16,16,16,16,16,16,16,16,16,16,16};

static const vector unsigned char VSX_SHORT_TO_CHAR1_3 = {16,16,16, 0, 2, 4,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR2_3 = {16,16,16,16,16,16, 0, 2, 4,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR3_3 = {16,16,16,16,16,16,16,16,16, 0, 2, 4,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR4_3 = {16,16,16,16,16,16,16,16,16,16,16,16, 0, 2, 4,16};
#elif defined(__BIG_ENDIAN__)

static const vector unsigned char VSX_CHAR_TO_SHORT1_4 = {16, 4,16, 5,16, 6,16, 7,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT2_4 = {16, 8,16, 9,16,10,16,11,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT3_4 = {16,12,16,13,16,14,16,15,16,16,16,16,16,16,16,16};

static const vector unsigned char VSX_SHORT_TO_CHAR1_4 = {16,16,16,16, 1, 3, 5, 7,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR2_4 = {16,16,16,16,16,16,16,16, 1, 3, 5, 7,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR3_4 = {16,16,16,16,16,16,16,16,16,16,16,16, 1, 3, 5, 7};

static const vector unsigned char VSX_CHAR_TO_SHORT1_3 = {16, 3,16, 4,16, 5,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT2_3 = {16, 6,16, 7,16, 8,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT3_3 = {16, 9,16,10,16,11,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT4_3 = {16,12,16,13,16,14,16,16,16,16,16,16,16,16,16,16};

static const vector unsigned char VSX_SHORT_TO_CHAR1_3 = {16,16,16, 1, 3, 5,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR2_3 = {16,16,16,16,16,16, 1, 3, 5,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR3_3 = {16,16,16,16,16,16,16,16,16, 1, 3, 5,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR4_3 = {16,16,16,16,16,16,16,16,16,16,16,16, 1, 3, 5,16};

#endif /* __LITTLE_ENDIAN__ / __BIG_ENDIAN__ */
#define vsx_char_to_short(vec,offset,bpp) (vector unsigned short)vec_perm((vec),VSX_CHAR_ZERO,VSX_CHAR_TO_SHORT##offset##_##bpp)
#define vsx_short_to_char(vec,offset,bpp) vec_perm(((vector unsigned char)(vec)),VSX_CHAR_ZERO,VSX_SHORT_TO_CHAR##offset##_##bpp)
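/* Illustration of the conversion macros (values assumed: bpp == 4,
 * offset == 1, little-endian): vsx_char_to_short(v,1,4) permutes bytes 4..7
 * of v with zero bytes from VSX_CHAR_ZERO, widening one pixel's four 8-bit
 * samples into 16-bit lanes so the Paeth differences below cannot overflow;
 * vsx_short_to_char(v,1,4) takes the low bytes of those lanes and puts them
 * back at byte positions 4..7.
 */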
#ifdef PNG_USE_ABS
#  define vsx_abs(number) abs(number)
#else
#  define vsx_abs(number) (number > 0) ? (number) : -(number)
#endif
void png_read_filter_row_sub4_vsx(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   png_byte bpp = 4;

   vector unsigned char rp_vec;
   vector unsigned char part_vec;

   vsx_declare_common_vars(row_info,row,prev_row,bpp)
   PNG_UNUSED(pp)

   /* Altivec operations require 16-byte aligned data
    * but input can be unaligned. So we calculate
    * unaligned part as usual.
    */
   for (i = 0; i < unaligned_top; i++)
   {
      *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
      rp++;
   }

   /* Using SIMD while we can */
   while( istop >= 16 )
   {
      for(i = 0; i < bpp; i++)
      {
         *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
         rp++;
      }
      rp -= bpp;

      rp_vec = vec_ld(0,rp);
      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_4);
      rp_vec = vec_add(rp_vec,part_vec);

      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_4);
      rp_vec = vec_add(rp_vec,part_vec);

      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_4);
      rp_vec = vec_add(rp_vec,part_vec);

      vec_st(rp_vec,0,rp);

      rp += 16;
      istop -= 16;
   }

   if(istop > 0)
      for (i = 0; i < istop % 16; i++)
      {
         *rp = (png_byte)(((int)(*rp) + (int)(*(rp - bpp))) & 0xff);
         rp++;
      }
}
void png_read_filter_row_sub3_vsx(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   png_byte bpp = 3;

   vector unsigned char rp_vec;
   vector unsigned char part_vec;

   vsx_declare_common_vars(row_info,row,prev_row,bpp)
   PNG_UNUSED(pp)

   /* Altivec operations require 16-byte aligned data
    * but input can be unaligned. So we calculate
    * unaligned part as usual.
    */
   for (i = 0; i < unaligned_top; i++)
   {
      *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
      rp++;
   }

   /* Using SIMD while we can */
   while( istop >= 16 )
   {
      for(i = 0; i < bpp; i++)
      {
         *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
         rp++;
      }
      rp -= bpp;

      rp_vec = vec_ld(0,rp);
      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_3);
      rp_vec = vec_add(rp_vec,part_vec);

      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_3);
      rp_vec = vec_add(rp_vec,part_vec);

      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_3);
      rp_vec = vec_add(rp_vec,part_vec);

      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED4_3);
      rp_vec = vec_add(rp_vec,part_vec);

      vec_st(rp_vec,0,rp);

      rp += 15;
      istop -= 16;

      /* Since 16 % bpp = 16 % 3 = 1, last element of array must
       * be processed manually
       */
      *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
      rp++;
   }

   if(istop > 0)
      for (i = 0; i < istop % 16; i++)
      {
         *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
         rp++;
      }
}
void png_read_filter_row_avg4_vsx(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   png_byte bpp = 4;

   vector unsigned char rp_vec;
   vector unsigned char pp_vec;
   vector unsigned char pp_part_vec;
   vector unsigned char rp_part_vec;
   vector unsigned char avg_vec;

   vsx_declare_common_vars(row_info,row,prev_row,bpp)
   rp -= bpp;
   if(istop >= bpp)
      istop -= bpp;

   for (i = 0; i < bpp; i++)
   {
      *rp = (png_byte)(((int)(*rp) +
         ((int)(*pp++) / 2 )) & 0xff);

      rp++;
   }

   /* Altivec operations require 16-byte aligned data
    * but input can be unaligned. So we calculate
    * unaligned part as usual.
    */
   for (i = 0; i < unaligned_top; i++)
   {
      *rp = (png_byte)(((int)(*rp) +
         (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);

      rp++;
   }

   /* Using SIMD while we can */
   while( istop >= 16 )
   {
      for(i = 0; i < bpp; i++)
      {
         *rp = (png_byte)(((int)(*rp) +
            (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);

         rp++;
      }
      rp -= bpp;
      pp -= bpp;

      vec_ld_unaligned(pp_vec,pp);
      rp_vec = vec_ld(0,rp);

      rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_4);
      pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED1_4);
      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
      rp_vec = vec_add(rp_vec,avg_vec);

      rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_4);
      pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED2_4);
      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
      rp_vec = vec_add(rp_vec,avg_vec);

      rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_4);
      pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED3_4);
      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
      rp_vec = vec_add(rp_vec,avg_vec);

      vec_st(rp_vec,0,rp);

      rp += 16;
      pp += 16;
      istop -= 16;
   }

   if(istop > 0)
      for (i = 0; i < istop % 16; i++)
      {
         *rp = (png_byte)(((int)(*rp) +
            (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);

         rp++;
      }
}
void png_read_filter_row_avg3_vsx(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   png_byte bpp = 3;

   vector unsigned char rp_vec;
   vector unsigned char pp_vec;
   vector unsigned char pp_part_vec;
   vector unsigned char rp_part_vec;
   vector unsigned char avg_vec;

   vsx_declare_common_vars(row_info,row,prev_row,bpp)
   rp -= bpp;
   if(istop >= bpp)
      istop -= bpp;

   for (i = 0; i < bpp; i++)
   {
      *rp = (png_byte)(((int)(*rp) +
         ((int)(*pp++) / 2 )) & 0xff);

      rp++;
   }

   /* Altivec operations require 16-byte aligned data
    * but input can be unaligned. So we calculate
    * unaligned part as usual.
    */
   for (i = 0; i < unaligned_top; i++)
   {
      *rp = (png_byte)(((int)(*rp) +
         (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);

      rp++;
   }

   /* Using SIMD while we can */
   while( istop >= 16 )
   {
      for(i = 0; i < bpp; i++)
      {
         *rp = (png_byte)(((int)(*rp) +
            (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);

         rp++;
      }
      rp -= bpp;
      pp -= bpp;

      vec_ld_unaligned(pp_vec,pp);
      rp_vec = vec_ld(0,rp);

      rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_3);
      pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED1_3);
      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
      rp_vec = vec_add(rp_vec,avg_vec);

      rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_3);
      pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED2_3);
      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
      rp_vec = vec_add(rp_vec,avg_vec);

      rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_3);
      pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED3_3);
      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
      rp_vec = vec_add(rp_vec,avg_vec);

      rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED4_3);
      pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED4_3);
      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
      rp_vec = vec_add(rp_vec,avg_vec);

      vec_st(rp_vec,0,rp);

      rp += 15;
      pp += 15;
      istop -= 16;

      /* Since 16 % bpp = 16 % 3 = 1, last element of array must
       * be processed manually
       */
      *rp = (png_byte)(((int)(*rp) +
         (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);

      rp++;
   }

   if(istop > 0)
      for (i = 0; i < istop % 16; i++)
      {
         *rp = (png_byte)(((int)(*rp) +
            (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);

         rp++;
      }
}
/* Bytewise c ? t : e. */
#define if_then_else(c,t,e) vec_sel(e,t,c)

#define vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) {\
      c = *(pp - bpp);\
      a = *(rp - bpp);\
      b = *pp++;\
      p = b - c;\
      pc = a - c;\
      pa = vsx_abs(p);\
      pb = vsx_abs(pc);\
      pc = vsx_abs(p + pc);\
      if (pb < pa) pa = pb, a = b;\
      if (pc < pa) a = c;\
      a += *rp;\
      *rp++ = (png_byte)a;\
      }
void png_read_filter_row_paeth4_vsx(png_row_infop row_info, png_bytep row,
                                    png_const_bytep prev_row)
{
   png_byte bpp = 4;

   int a, b, c, pa, pb, pc, p;
   vector unsigned char rp_vec;
   vector unsigned char pp_vec;
   vector unsigned short a_vec,b_vec,c_vec,nearest_vec;
   vector signed short pa_vec,pb_vec,pc_vec,smallest_vec;

   vsx_declare_common_vars(row_info,row,prev_row,bpp)
   rp -= bpp;
   if(istop >= bpp)
      istop -= bpp;

   /* Process the first pixel in the row completely (this is the same as 'up'
    * because there is only one candidate predictor for the first row).
    */
   for(i = 0; i < bpp; i++)
   {
      *rp = (png_byte)( *rp + *pp);
      rp++;
      pp++;
   }

   for(i = 0; i < unaligned_top; i++)
   {
      vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
   }

   while( istop >= 16 )
   {
      for(i = 0; i < bpp; i++)
      {
         vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
      }

      rp -= bpp;
      pp -= bpp;
      rp_vec = vec_ld(0,rp);
      vec_ld_unaligned(pp_vec,pp);

      a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_4),1,4);
      b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED1_4),1,4);
      c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_4),1,4);
      pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
      pb_vec = (vector signed short) vec_sub(a_vec, c_vec);
      pc_vec = vec_add(pa_vec,pb_vec);
      pa_vec = vec_abs(pa_vec);
      pb_vec = vec_abs(pb_vec);
      pc_vec = vec_abs(pc_vec);
      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
      nearest_vec = if_then_else(
            vec_cmpeq(pa_vec,smallest_vec),
            a_vec,
            if_then_else(
               vec_cmpeq(pb_vec,smallest_vec),
               b_vec,
               c_vec
               )
            );
      rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,1,4)));

      a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_4),2,4);
      b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED2_4),2,4);
      c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_4),2,4);
      pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
      pb_vec = (vector signed short) vec_sub(a_vec, c_vec);
      pc_vec = vec_add(pa_vec,pb_vec);
      pa_vec = vec_abs(pa_vec);
      pb_vec = vec_abs(pb_vec);
      pc_vec = vec_abs(pc_vec);
      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
      nearest_vec = if_then_else(
            vec_cmpeq(pa_vec,smallest_vec),
            a_vec,
            if_then_else(
               vec_cmpeq(pb_vec,smallest_vec),
               b_vec,
               c_vec
               )
            );
      rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,2,4)));

      a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_4),3,4);
      b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED3_4),3,4);
      c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_4),3,4);
      pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
      pb_vec = (vector signed short) vec_sub(a_vec, c_vec);
      pc_vec = vec_add(pa_vec,pb_vec);
      pa_vec = vec_abs(pa_vec);
      pb_vec = vec_abs(pb_vec);
      pc_vec = vec_abs(pc_vec);
      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
      nearest_vec = if_then_else(
            vec_cmpeq(pa_vec,smallest_vec),
            a_vec,
            if_then_else(
               vec_cmpeq(pb_vec,smallest_vec),
               b_vec,
               c_vec
               )
            );
      rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,3,4)));

      vec_st(rp_vec,0,rp);

      rp += 16;
      pp += 16;
      istop -= 16;
   }

   if(istop > 0)
      for (i = 0; i < istop % 16; i++)
      {
         vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
      }
}
void png_read_filter_row_paeth3_vsx(png_row_infop row_info, png_bytep row,
                                    png_const_bytep prev_row)
{
   png_byte bpp = 3;

   int a, b, c, pa, pb, pc, p;
   vector unsigned char rp_vec;
   vector unsigned char pp_vec;
   vector unsigned short a_vec,b_vec,c_vec,nearest_vec;
   vector signed short pa_vec,pb_vec,pc_vec,smallest_vec;

   vsx_declare_common_vars(row_info,row,prev_row,bpp)
   rp -= bpp;
   if(istop >= bpp)
      istop -= bpp;

   /* Process the first pixel in the row completely (this is the same as 'up'
    * because there is only one candidate predictor for the first row).
    */
   for(i = 0; i < bpp; i++)
   {
      *rp = (png_byte)( *rp + *pp);
      rp++;
      pp++;
   }

   for(i = 0; i < unaligned_top; i++)
   {
      vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
   }

   while( istop >= 16 )
   {
      for(i = 0; i < bpp; i++)
      {
         vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
      }

      rp -= bpp;
      pp -= bpp;
      rp_vec = vec_ld(0,rp);
      vec_ld_unaligned(pp_vec,pp);

      a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_3),1,3);
      b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED1_3),1,3);
      c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_3),1,3);
      pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
      pb_vec = (vector signed short) vec_sub(a_vec, c_vec);
      pc_vec = vec_add(pa_vec,pb_vec);
      pa_vec = vec_abs(pa_vec);
      pb_vec = vec_abs(pb_vec);
      pc_vec = vec_abs(pc_vec);
      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
      nearest_vec = if_then_else(
            vec_cmpeq(pa_vec,smallest_vec),
            a_vec,
            if_then_else(
               vec_cmpeq(pb_vec,smallest_vec),
               b_vec,
               c_vec
               )
            );
      rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,1,3)));

      a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_3),2,3);
      b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED2_3),2,3);
      c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_3),2,3);
      pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
      pb_vec = (vector signed short) vec_sub(a_vec, c_vec);
      pc_vec = vec_add(pa_vec,pb_vec);
      pa_vec = vec_abs(pa_vec);
      pb_vec = vec_abs(pb_vec);
      pc_vec = vec_abs(pc_vec);
      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
      nearest_vec = if_then_else(
            vec_cmpeq(pa_vec,smallest_vec),
            a_vec,
            if_then_else(
               vec_cmpeq(pb_vec,smallest_vec),
               b_vec,
               c_vec
               )
            );
      rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,2,3)));

      a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_3),3,3);
      b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED3_3),3,3);
      c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_3),3,3);
      pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
      pb_vec = (vector signed short) vec_sub(a_vec, c_vec);
      pc_vec = vec_add(pa_vec,pb_vec);
      pa_vec = vec_abs(pa_vec);
      pb_vec = vec_abs(pb_vec);
      pc_vec = vec_abs(pc_vec);
      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
      nearest_vec = if_then_else(
            vec_cmpeq(pa_vec,smallest_vec),
            a_vec,
            if_then_else(
               vec_cmpeq(pb_vec,smallest_vec),
               b_vec,
               c_vec
               )
            );
      rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,3,3)));

      a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED4_3),4,3);
      b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED4_3),4,3);
      c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED4_3),4,3);
      pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
      pb_vec = (vector signed short) vec_sub(a_vec, c_vec);
      pc_vec = vec_add(pa_vec,pb_vec);
      pa_vec = vec_abs(pa_vec);
      pb_vec = vec_abs(pb_vec);
      pc_vec = vec_abs(pc_vec);
      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
      nearest_vec = if_then_else(
            vec_cmpeq(pa_vec,smallest_vec),
            a_vec,
            if_then_else(
               vec_cmpeq(pb_vec,smallest_vec),
               b_vec,
               c_vec
               )
            );
      rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,4,3)));

      vec_st(rp_vec,0,rp);

      rp += 15;
      pp += 15;
      istop -= 16;

      /* Since 16 % bpp = 16 % 3 = 1, last element of array must
       * be processed manually
       */
      vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
   }

   if(istop > 0)
      for (i = 0; i < istop % 16; i++)
      {
         vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
      }
}
#endif /* PNG_POWERPC_VSX_OPT > 0 */
#endif /* PNG_POWERPC_VSX_IMPLEMENTATION == 1 (intrinsics) */
#endif /* READ */