update png.library to v53 (libpng 1.6.36). autogenerate the config file. (NicJA)
[AROS.git] workbench/libs/png/powerpc/filter_vsx_intrinsics.c
/* filter_vsx_intrinsics.c - PowerPC optimised filter functions
 *
 * Copyright (c) 2018 Cosmin Truta
 * Copyright (c) 2017 Glenn Randers-Pehrson
 * Written by Vadim Barkov, 2017.
 *
 * This code is released under the libpng license.
 * For conditions of distribution and use, see the disclaimer
 * and license in png.h
 */

#include <stdio.h>
#include <stdint.h>
#include "../pngpriv.h"

#ifdef PNG_READ_SUPPORTED

/* This code requires -maltivec and -mvsx on the command line: */
#if PNG_POWERPC_VSX_IMPLEMENTATION == 1 /* intrinsics code from pngpriv.h */

#include <altivec.h>

#if PNG_POWERPC_VSX_OPT > 0

#ifndef __VSX__
#  error "This code requires VSX support (POWER7 and later). Please provide -mvsx compiler flag."
#endif

#define vec_ld_unaligned(vec,data) vec = vec_vsx_ld(0,data)
#define vec_st_unaligned(vec,data) vec_vsx_st(vec,0,data)
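
/* Note: vec_vsx_ld/vec_vsx_st tolerate arbitrary addresses, whereas vec_ld and
 * vec_st require 16-byte alignment, so these wrappers are used for the
 * previous-row pointer, whose alignment is not guaranteed.
 */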

/* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d).
 * They're positioned like this:
 *    prev:  c b
 *    row:   a d
 * The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be
 * whichever of a, b, or c is closest to p=a+b-c.
 * ( this is taken from ../intel/filter_sse2_intrinsics.c )
 */
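
/* vsx_declare_common_vars splits a row into a scalar head and a vector body:
 * unaligned_top is the number of bytes until rp reaches a 16-byte boundary
 * (0 if it is already aligned) and istop is the byte count that remains for
 * the aligned part, which the loops below handle 16 bytes at a time.
 */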
#define vsx_declare_common_vars(row_info,row,prev_row,offset) \
   png_byte i;\
   png_bytep rp = row + offset;\
   png_const_bytep pp = prev_row;\
   size_t unaligned_top = 16 - (((size_t)rp % 16));\
   size_t istop;\
   if(unaligned_top == 16)\
      unaligned_top = 0;\
   istop = row_info->rowbytes;\
   if((unaligned_top < istop))\
      istop -= unaligned_top;\
   else{\
      unaligned_top = istop;\
      istop = 0;\
   }

void png_read_filter_row_up_vsx(png_row_infop row_info, png_bytep row,
                                png_const_bytep prev_row)
{
   vector unsigned char rp_vec;
   vector unsigned char pp_vec;
   vsx_declare_common_vars(row_info,row,prev_row,0)

   /* Altivec operations require 16-byte aligned data
    * but input can be unaligned. So we calculate
    * unaligned part as usual.
    */
   for (i = 0; i < unaligned_top; i++)
   {
      *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
      rp++;
   }

   /* Using SIMD while we can */
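   /* Up adds the byte directly above, so no shuffling is needed: a single
    * vec_add reconstructs 16 bytes per iteration.
    */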
   while( istop >= 16 )
   {
      rp_vec = vec_ld(0,rp);
      vec_ld_unaligned(pp_vec,pp);

      rp_vec = vec_add(rp_vec,pp_vec);

      vec_st(rp_vec,0,rp);

      pp += 16;
      rp += 16;
      istop -= 16;
   }

   if(istop > 0)
   {
      /* If byte count of row is not divisible by 16
       * we will process remaining part as usual
       */
      for (i = 0; i < istop; i++)
      {
         *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
         rp++;
      }
   }
}
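
/* Permute-control tables for vec_perm(src, VSX_CHAR_ZERO, mask): indices 0..15
 * select bytes of src and index 16 selects byte 0 of VSX_CHAR_ZERO, i.e. zero.
 * The *_4 tables handle 4-byte pixels and the *_3 tables 3-byte pixels; the
 * LEFTSHIFTED* masks move a pixel into the next pixel's slot, while the
 * NOT_SHIFTED* masks extract a pixel in place, zeroing every other byte.
 */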
static const vector unsigned char VSX_LEFTSHIFTED1_4 = {16,16,16,16, 0, 1, 2, 3,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_LEFTSHIFTED2_4 = {16,16,16,16,16,16,16,16, 4, 5, 6, 7,16,16,16,16};
static const vector unsigned char VSX_LEFTSHIFTED3_4 = {16,16,16,16,16,16,16,16,16,16,16,16, 8, 9,10,11};

static const vector unsigned char VSX_LEFTSHIFTED1_3 = {16,16,16, 0, 1, 2,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_LEFTSHIFTED2_3 = {16,16,16,16,16,16, 3, 4, 5,16,16,16,16,16,16,16};
static const vector unsigned char VSX_LEFTSHIFTED3_3 = {16,16,16,16,16,16,16,16,16, 6, 7, 8,16,16,16,16};
static const vector unsigned char VSX_LEFTSHIFTED4_3 = {16,16,16,16,16,16,16,16,16,16,16,16, 9,10,11,16};

static const vector unsigned char VSX_NOT_SHIFTED1_4 = {16,16,16,16, 4, 5, 6, 7,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_NOT_SHIFTED2_4 = {16,16,16,16,16,16,16,16, 8, 9,10,11,16,16,16,16};
static const vector unsigned char VSX_NOT_SHIFTED3_4 = {16,16,16,16,16,16,16,16,16,16,16,16,12,13,14,15};

static const vector unsigned char VSX_NOT_SHIFTED1_3 = {16,16,16, 3, 4, 5,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_NOT_SHIFTED2_3 = {16,16,16,16,16,16, 6, 7, 8,16,16,16,16,16,16,16};
static const vector unsigned char VSX_NOT_SHIFTED3_3 = {16,16,16,16,16,16,16,16,16, 9,10,11,16,16,16,16};
static const vector unsigned char VSX_NOT_SHIFTED4_3 = {16,16,16,16,16,16,16,16,16,16,16,16,12,13,14,16};

static const vector unsigned char VSX_CHAR_ZERO = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
#ifdef __LITTLE_ENDIAN__

static const vector unsigned char VSX_CHAR_TO_SHORT1_4 = { 4,16, 5,16, 6,16, 7,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT2_4 = { 8,16, 9,16,10,16,11,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT3_4 = {12,16,13,16,14,16,15,16,16,16,16,16,16,16,16,16};

static const vector unsigned char VSX_SHORT_TO_CHAR1_4 = {16,16,16,16, 0, 2, 4, 6,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR2_4 = {16,16,16,16,16,16,16,16, 0, 2, 4, 6,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR3_4 = {16,16,16,16,16,16,16,16,16,16,16,16, 0, 2, 4, 6};

static const vector unsigned char VSX_CHAR_TO_SHORT1_3 = { 3,16, 4,16, 5,16,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT2_3 = { 6,16, 7,16, 8,16,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT3_3 = { 9,16,10,16,11,16,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT4_3 = {12,16,13,16,14,16,16,16,16,16,16,16,16,16,16,16};

static const vector unsigned char VSX_SHORT_TO_CHAR1_3 = {16,16,16, 0, 2, 4,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR2_3 = {16,16,16,16,16,16, 0, 2, 4,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR3_3 = {16,16,16,16,16,16,16,16,16, 0, 2, 4,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR4_3 = {16,16,16,16,16,16,16,16,16,16,16,16, 0, 2, 4,16};

#elif defined(__BIG_ENDIAN__)

static const vector unsigned char VSX_CHAR_TO_SHORT1_4 = {16, 4,16, 5,16, 6,16, 7,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT2_4 = {16, 8,16, 9,16,10,16,11,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT3_4 = {16,12,16,13,16,14,16,15,16,16,16,16,16,16,16,16};

static const vector unsigned char VSX_SHORT_TO_CHAR1_4 = {16,16,16,16, 1, 3, 5, 7,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR2_4 = {16,16,16,16,16,16,16,16, 1, 3, 5, 7,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR3_4 = {16,16,16,16,16,16,16,16,16,16,16,16, 1, 3, 5, 7};

static const vector unsigned char VSX_CHAR_TO_SHORT1_3 = {16, 3,16, 4,16, 5,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT2_3 = {16, 6,16, 7,16, 8,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT3_3 = {16, 9,16,10,16,11,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_CHAR_TO_SHORT4_3 = {16,12,16,13,16,14,16,16,16,16,16,16,16,16,16,16};

static const vector unsigned char VSX_SHORT_TO_CHAR1_3 = {16,16,16, 1, 3, 5,16,16,16,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR2_3 = {16,16,16,16,16,16, 1, 3, 5,16,16,16,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR3_3 = {16,16,16,16,16,16,16,16,16, 1, 3, 5,16,16,16,16};
static const vector unsigned char VSX_SHORT_TO_CHAR4_3 = {16,16,16,16,16,16,16,16,16,16,16,16, 1, 3, 5,16};

#endif

#define vsx_char_to_short(vec,offset,bpp) (vector unsigned short)vec_perm((vec),VSX_CHAR_ZERO,VSX_CHAR_TO_SHORT##offset##_##bpp)
#define vsx_short_to_char(vec,offset,bpp) vec_perm(((vector unsigned char)(vec)),VSX_CHAR_ZERO,VSX_SHORT_TO_CHAR##offset##_##bpp)
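
/* The Paeth distances can be negative, so each pixel is widened from bytes to
 * 16-bit lanes with vsx_char_to_short before the differences are computed and
 * narrowed back with vsx_short_to_char afterwards; the offset/bpp suffix picks
 * which pixel of the 16-byte block is converted.
 */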

#ifdef PNG_USE_ABS
#  define vsx_abs(number) abs(number)
#else
#  define vsx_abs(number) (number > 0) ? (number) : -(number)
#endif

void png_read_filter_row_sub4_vsx(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   png_byte bpp = 4;

   vector unsigned char rp_vec;
   vector unsigned char part_vec;

   vsx_declare_common_vars(row_info,row,prev_row,bpp)

   PNG_UNUSED(pp)

   /* Altivec operations require 16-byte aligned data
    * but input can be unaligned. So we calculate
    * unaligned part as usual.
    */
   for (i = 0; i < unaligned_top; i++)
   {
      *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
      rp++;
   }

   /* Using SIMD while we can */
   while( istop >= 16 )
   {
      for(i=0;i < bpp ; i++)
      {
         *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
         rp++;
      }
      rp -= bpp;

      rp_vec = vec_ld(0,rp);
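      /* The scalar loop above has already reconstructed the first pixel of
       * this block, so each vec_perm/vec_add pair below copies the most
       * recently finished pixel into the next pixel's slot and adds it,
       * reconstructing the remaining pixels of the 16-byte block in turn.
       */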
      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_4);
      rp_vec = vec_add(rp_vec,part_vec);

      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_4);
      rp_vec = vec_add(rp_vec,part_vec);

      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_4);
      rp_vec = vec_add(rp_vec,part_vec);

      vec_st(rp_vec,0,rp);

      rp += 16;
      istop -= 16;
   }

   if(istop > 0)
      for (i = 0; i < istop % 16; i++)
      {
         *rp = (png_byte)(((int)(*rp) + (int)(*(rp - bpp))) & 0xff);
         rp++;
      }
}

void png_read_filter_row_sub3_vsx(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   png_byte bpp = 3;

   vector unsigned char rp_vec;
   vector unsigned char part_vec;

   vsx_declare_common_vars(row_info,row,prev_row,bpp)

   PNG_UNUSED(pp)

   /* Altivec operations require 16-byte aligned data
    * but input can be unaligned. So we calculate
    * unaligned part as usual.
    */
   for (i = 0; i < unaligned_top; i++)
   {
      *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
      rp++;
   }

   /* Using SIMD while we can */
   while( istop >= 16 )
   {
      for(i=0;i < bpp ; i++)
      {
         *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
         rp++;
      }
      rp -= bpp;

      rp_vec = vec_ld(0,rp);
      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_3);
      rp_vec = vec_add(rp_vec,part_vec);

      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_3);
      rp_vec = vec_add(rp_vec,part_vec);

      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_3);
      rp_vec = vec_add(rp_vec,part_vec);

      part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED4_3);
      rp_vec = vec_add(rp_vec,part_vec);

      vec_st(rp_vec,0,rp);
      rp += 15;
      istop -= 16;

      /* Since 16 % bpp = 16 % 3 = 1, last element of array must
       * be processed manually
       */
      *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
      rp++;
   }

   if(istop > 0)
      for (i = 0; i < istop % 16; i++)
      {
         *rp = (png_byte)(((int)(*rp) + (int)(*(rp-bpp))) & 0xff);
         rp++;
      }
}

void png_read_filter_row_avg4_vsx(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   png_byte bpp = 4;

   vector unsigned char rp_vec;
   vector unsigned char pp_vec;
   vector unsigned char pp_part_vec;
   vector unsigned char rp_part_vec;
   vector unsigned char avg_vec;

   vsx_declare_common_vars(row_info,row,prev_row,bpp)
   rp -= bpp;
   if(istop >= bpp)
      istop -= bpp;
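
   /* The leftmost pixel has no predecessor, so Average degenerates to adding
    * half of the byte above.
    */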
   for (i = 0; i < bpp; i++)
   {
      *rp = (png_byte)(((int)(*rp) +
         ((int)(*pp++) / 2 )) & 0xff);

      rp++;
   }

   /* Altivec operations require 16-byte aligned data
    * but input can be unaligned. So we calculate
    * unaligned part as usual.
    */
   for (i = 0; i < unaligned_top; i++)
   {
      *rp = (png_byte)(((int)(*rp) +
         (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);

      rp++;
   }

   /* Using SIMD while we can */
   while( istop >= 16 )
   {
      for(i=0;i < bpp ; i++)
      {
         *rp = (png_byte)(((int)(*rp) +
            (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);

         rp++;
      }
      rp -= bpp;
      pp -= bpp;

      vec_ld_unaligned(pp_vec,pp);
      rp_vec = vec_ld(0,rp);
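
      /* vec_avg rounds up ((x + y + 1) >> 1) while the PNG Average filter
       * requires floor((x + y) / 2); subtracting ((x ^ y) & 1) converts the
       * rounded-up average into the floored one.
       */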
      rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_4);
      pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED1_4);
      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
      rp_vec = vec_add(rp_vec,avg_vec);

      rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_4);
      pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED2_4);
      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
      rp_vec = vec_add(rp_vec,avg_vec);

      rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_4);
      pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED3_4);
      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
      rp_vec = vec_add(rp_vec,avg_vec);

      vec_st(rp_vec,0,rp);

      rp += 16;
      pp += 16;
      istop -= 16;
   }

   if(istop > 0)
      for (i = 0; i < istop % 16; i++)
      {
         *rp = (png_byte)(((int)(*rp) +
            (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);

         rp++;
      }
}

void png_read_filter_row_avg3_vsx(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   png_byte bpp = 3;

   vector unsigned char rp_vec;
   vector unsigned char pp_vec;
   vector unsigned char pp_part_vec;
   vector unsigned char rp_part_vec;
   vector unsigned char avg_vec;

   vsx_declare_common_vars(row_info,row,prev_row,bpp)
   rp -= bpp;
   if(istop >= bpp)
      istop -= bpp;

   for (i = 0; i < bpp; i++)
   {
      *rp = (png_byte)(((int)(*rp) +
         ((int)(*pp++) / 2 )) & 0xff);

      rp++;
   }

   /* Altivec operations require 16-byte aligned data
    * but input can be unaligned. So we calculate
    * unaligned part as usual.
    */
   for (i = 0; i < unaligned_top; i++)
   {
      *rp = (png_byte)(((int)(*rp) +
         (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);

      rp++;
   }

   /* Using SIMD while we can */
   while( istop >= 16 )
   {
      for(i=0;i < bpp ; i++)
      {
         *rp = (png_byte)(((int)(*rp) +
            (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);

         rp++;
      }
      rp -= bpp;
      pp -= bpp;

      vec_ld_unaligned(pp_vec,pp);
      rp_vec = vec_ld(0,rp);

      rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED1_3);
      pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED1_3);
      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
      rp_vec = vec_add(rp_vec,avg_vec);

      rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED2_3);
      pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED2_3);
      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
      rp_vec = vec_add(rp_vec,avg_vec);

      rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED3_3);
      pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED3_3);
      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
      rp_vec = vec_add(rp_vec,avg_vec);

      rp_part_vec = vec_perm(rp_vec,VSX_CHAR_ZERO,VSX_LEFTSHIFTED4_3);
      pp_part_vec = vec_perm(pp_vec,VSX_CHAR_ZERO,VSX_NOT_SHIFTED4_3);
      avg_vec = vec_avg(rp_part_vec,pp_part_vec);
      avg_vec = vec_sub(avg_vec, vec_and(vec_xor(rp_part_vec,pp_part_vec),vec_splat_u8(1)));
      rp_vec = vec_add(rp_vec,avg_vec);

      vec_st(rp_vec,0,rp);

      rp += 15;
      pp += 15;
      istop -= 16;

      /* Since 16 % bpp = 16 % 3 = 1, last element of array must
       * be processed manually
       */
      *rp = (png_byte)(((int)(*rp) +
         (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);
      rp++;
   }

   if(istop > 0)
      for (i = 0; i < istop % 16; i++)
      {
         *rp = (png_byte)(((int)(*rp) +
            (int)(*pp++ + *(rp-bpp)) / 2 ) & 0xff);

         rp++;
      }
}

/* Bytewise c ? t : e. */
#define if_then_else(c,t,e) vec_sel(e,t,c)
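
/* Scalar Paeth step for a single byte; used for the unaligned head of the
 * row, the first pixel of each 16-byte block, and any remaining tail bytes.
 */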
#define vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp) {\
      c = *(pp - bpp);\
      a = *(rp - bpp);\
      b = *pp++;\
      p = b - c;\
      pc = a - c;\
      pa = vsx_abs(p);\
      pb = vsx_abs(pc);\
      pc = vsx_abs(p + pc);\
      if (pb < pa) pa = pb, a = b;\
      if (pc < pa) a = c;\
      a += *rp;\
      *rp++ = (png_byte)a;\
      }

void png_read_filter_row_paeth4_vsx(png_row_infop row_info, png_bytep row,
                                    png_const_bytep prev_row)
{
   png_byte bpp = 4;

   int a, b, c, pa, pb, pc, p;
   vector unsigned char rp_vec;
   vector unsigned char pp_vec;
   vector unsigned short a_vec,b_vec,c_vec,nearest_vec;
   vector signed short pa_vec,pb_vec,pc_vec,smallest_vec;

   vsx_declare_common_vars(row_info,row,prev_row,bpp)
   rp -= bpp;
   if(istop >= bpp)
      istop -= bpp;

   /* Process the first pixel in the row completely (this is the same as 'up'
    * because there is only one candidate predictor for the first pixel).
    */
   for(i = 0; i < bpp ; i++)
   {
      *rp = (png_byte)( *rp + *pp);
      rp++;
      pp++;
   }

   for(i = 0; i < unaligned_top ; i++)
   {
      vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
   }

   while( istop >= 16)
   {
      for(i = 0; i < bpp ; i++)
      {
         vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
      }

      rp -= bpp;
      pp -= bpp;
      rp_vec = vec_ld(0,rp);
      vec_ld_unaligned(pp_vec,pp);
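
      /* Widen a (left), b (above) and c (upper-left) to 16-bit lanes and
       * compute the Paeth distances pa = |b-c|, pb = |a-c|, pc = |a+b-2c|;
       * the nested if_then_else picks a, then b, then c on ties, matching
       * the scalar predictor's ordering.
       */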
      a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_4),1,4);
      b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED1_4),1,4);
      c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_4),1,4);
      pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
      pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
      pc_vec = vec_add(pa_vec,pb_vec);
      pa_vec = vec_abs(pa_vec);
      pb_vec = vec_abs(pb_vec);
      pc_vec = vec_abs(pc_vec);
      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
      nearest_vec =  if_then_else(
            vec_cmpeq(pa_vec,smallest_vec),
            a_vec,
            if_then_else(
              vec_cmpeq(pb_vec,smallest_vec),
              b_vec,
              c_vec
              )
            );
      rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,1,4)));

      a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_4),2,4);
      b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED2_4),2,4);
      c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_4),2,4);
      pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
      pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
      pc_vec = vec_add(pa_vec,pb_vec);
      pa_vec = vec_abs(pa_vec);
      pb_vec = vec_abs(pb_vec);
      pc_vec = vec_abs(pc_vec);
      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
      nearest_vec =  if_then_else(
            vec_cmpeq(pa_vec,smallest_vec),
            a_vec,
            if_then_else(
              vec_cmpeq(pb_vec,smallest_vec),
              b_vec,
              c_vec
              )
            );
      rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,2,4)));

      a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_4),3,4);
      b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED3_4),3,4);
      c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_4),3,4);
      pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
      pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
      pc_vec = vec_add(pa_vec,pb_vec);
      pa_vec = vec_abs(pa_vec);
      pb_vec = vec_abs(pb_vec);
      pc_vec = vec_abs(pc_vec);
      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
      nearest_vec =  if_then_else(
            vec_cmpeq(pa_vec,smallest_vec),
            a_vec,
            if_then_else(
              vec_cmpeq(pb_vec,smallest_vec),
              b_vec,
              c_vec
              )
            );
      rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,3,4)));

      vec_st(rp_vec,0,rp);

      rp += 16;
      pp += 16;
      istop -= 16;
   }

   if(istop > 0)
      for (i = 0; i < istop % 16; i++)
      {
         vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
      }
}

void png_read_filter_row_paeth3_vsx(png_row_infop row_info, png_bytep row,
                                    png_const_bytep prev_row)
{
   png_byte bpp = 3;

   int a, b, c, pa, pb, pc, p;
   vector unsigned char rp_vec;
   vector unsigned char pp_vec;
   vector unsigned short a_vec,b_vec,c_vec,nearest_vec;
   vector signed short pa_vec,pb_vec,pc_vec,smallest_vec;

   vsx_declare_common_vars(row_info,row,prev_row,bpp)
   rp -= bpp;
   if(istop >= bpp)
      istop -= bpp;

   /* Process the first pixel in the row completely (this is the same as 'up'
    * because there is only one candidate predictor for the first pixel).
    */
   for(i = 0; i < bpp ; i++)
   {
      *rp = (png_byte)( *rp + *pp);
      rp++;
      pp++;
   }

   for(i = 0; i < unaligned_top ; i++)
   {
      vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
   }

   while( istop >= 16)
   {
      for(i = 0; i < bpp ; i++)
      {
         vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
      }

      rp -= bpp;
      pp -= bpp;
      rp_vec = vec_ld(0,rp);
      vec_ld_unaligned(pp_vec,pp);

      a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_3),1,3);
      b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED1_3),1,3);
      c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED1_3),1,3);
      pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
      pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
      pc_vec = vec_add(pa_vec,pb_vec);
      pa_vec = vec_abs(pa_vec);
      pb_vec = vec_abs(pb_vec);
      pc_vec = vec_abs(pc_vec);
      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
      nearest_vec =  if_then_else(
            vec_cmpeq(pa_vec,smallest_vec),
            a_vec,
            if_then_else(
              vec_cmpeq(pb_vec,smallest_vec),
              b_vec,
              c_vec
              )
            );
      rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,1,3)));

      a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_3),2,3);
      b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED2_3),2,3);
      c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED2_3),2,3);
      pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
      pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
      pc_vec = vec_add(pa_vec,pb_vec);
      pa_vec = vec_abs(pa_vec);
      pb_vec = vec_abs(pb_vec);
      pc_vec = vec_abs(pc_vec);
      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
      nearest_vec =  if_then_else(
            vec_cmpeq(pa_vec,smallest_vec),
            a_vec,
            if_then_else(
              vec_cmpeq(pb_vec,smallest_vec),
              b_vec,
              c_vec
              )
            );
      rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,2,3)));

      a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_3),3,3);
      b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED3_3),3,3);
      c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED3_3),3,3);
      pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
      pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
      pc_vec = vec_add(pa_vec,pb_vec);
      pa_vec = vec_abs(pa_vec);
      pb_vec = vec_abs(pb_vec);
      pc_vec = vec_abs(pc_vec);
      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
      nearest_vec =  if_then_else(
            vec_cmpeq(pa_vec,smallest_vec),
            a_vec,
            if_then_else(
              vec_cmpeq(pb_vec,smallest_vec),
              b_vec,
              c_vec
              )
            );
      rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,3,3)));

      a_vec = vsx_char_to_short(vec_perm(rp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED4_3),4,3);
      b_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_NOT_SHIFTED4_3),4,3);
      c_vec = vsx_char_to_short(vec_perm(pp_vec , VSX_CHAR_ZERO , VSX_LEFTSHIFTED4_3),4,3);
      pa_vec = (vector signed short) vec_sub(b_vec,c_vec);
      pb_vec = (vector signed short) vec_sub(a_vec , c_vec);
      pc_vec = vec_add(pa_vec,pb_vec);
      pa_vec = vec_abs(pa_vec);
      pb_vec = vec_abs(pb_vec);
      pc_vec = vec_abs(pc_vec);
      smallest_vec = vec_min(pc_vec, vec_min(pa_vec,pb_vec));
      nearest_vec =  if_then_else(
            vec_cmpeq(pa_vec,smallest_vec),
            a_vec,
            if_then_else(
              vec_cmpeq(pb_vec,smallest_vec),
              b_vec,
              c_vec
              )
            );
      rp_vec = vec_add(rp_vec,(vsx_short_to_char(nearest_vec,4,3)));

      vec_st(rp_vec,0,rp);

      rp += 15;
      pp += 15;
      istop -= 16;

      /* Since 16 % bpp = 16 % 3 = 1, last element of array must
       * be processed manually
       */
      vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
   }

   if(istop > 0)
      for (i = 0; i < istop % 16; i++)
      {
         vsx_paeth_process(rp,pp,a,b,c,pa,pb,pc,bpp)
      }
}

#endif /* PNG_POWERPC_VSX_OPT > 0 */
#endif /* PNG_POWERPC_VSX_IMPLEMENTATION == 1 (intrinsics) */
#endif /* READ */