1 /* { dg-do run { target { powerpc*-*-* && { lp64 && p9vector_hw } } } } */
2 /* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power9" } } */
3 /* { dg-options "-mcpu=power9 -O2 " } */
5 /* PR83677: This test case used to fail due to mis-generation of the
6 xxpermr instruction. It requires inlining to create enough register
7 pressure that we generate xxpermr rather than vpermr. */
/* Interleave the bytes of *A with zero bytes.

   *B0 receives vec_mergeh (*A, zero) and *B1 receives vec_mergel (*A, zero),
   each reinterpreted as a vector of unsigned short.  The helper is kept as a
   plain out-parameter function: the PR comment at the top of this file says
   the failure only reproduced once these helpers were inlined.  */
void v_expand_u8(vector unsigned char *a, vector unsigned short *b0, vector unsigned short *b1)
{
  *b0 = (vector unsigned short)vec_mergeh(*a, vec_splats((unsigned char)0));
  *b1 = (vector unsigned short)vec_mergel(*a, vec_splats((unsigned char)0));
}
/* Interleave the halfwords of *A with zero halfwords.

   *B0 receives vec_mergeh (*A, zero) and *B1 receives vec_mergel (*A, zero),
   each reinterpreted as a vector of unsigned int.  */
void v_expand_u16(vector unsigned short *a, vector unsigned int *b0, vector unsigned int *b1)
{
  *b0 = (vector unsigned int)vec_mergeh(*a, vec_splats((unsigned short)0));
  *b1 = (vector unsigned int)vec_mergel(*a, vec_splats((unsigned short)0));
}
/* Load 48 bytes starting at PTR and split them into three 16-byte vectors.

   Three vectors are read with vec_xl at byte offsets 0, 16 and 32.  Each
   output is then built with two chained vec_perm calls: the first gathers the
   wanted bytes from the first two loads, the second keeps those and appends
   the wanted bytes from the third load.  Going by the selector tables, *A is
   assembled from source bytes 0, 3, 6, ..., *B from 1, 4, 7, ... and *C from
   2, 5, 8, ...  The trailing zeros in the *12_perm tables are placeholders
   for positions that the second permute overwrites.  */
void v_load_deinterleave_u8(unsigned char *ptr, vector unsigned char *a, vector unsigned char *b, vector unsigned char *c)
{
  vector unsigned char v1 = vec_xl( 0, ptr);
  vector unsigned char v2 = vec_xl(16, ptr);
  vector unsigned char v3 = vec_xl(32, ptr);

  static const vector unsigned char a12_perm = {0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 0, 0, 0, 0, 0};
  static const vector unsigned char a123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 17, 20, 23, 26, 29};
  *a = vec_perm(vec_perm(v1, v2, a12_perm), v3, a123_perm);

  static const vector unsigned char b12_perm = {1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 0, 0, 0, 0, 0};
  static const vector unsigned char b123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 18, 21, 24, 27, 30};
  *b = vec_perm(vec_perm(v1, v2, b12_perm), v3, b123_perm);

  static const vector unsigned char c12_perm = {2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 0, 0, 0, 0, 0};
  static const vector unsigned char c123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 19, 22, 25, 28, 31};
  *c = vec_perm(vec_perm(v1, v2, c12_perm), v3, c123_perm);
}
/* Load 12 floats starting at PTR and redistribute them into *A, *B and *C.

   Three vector floats are read with vec_xl at byte offsets 0, 16 and 32.
   Each output is one vec_perm whose operands are a loaded vector and a
   vec_sld (shift by 8 bytes) of the other two.  The selector `flp' is shared
   by *A and *C; *B uses `flp2'.  */
void v_load_deinterleave_f32(float *ptr, vector float *a, vector float *b, vector float *c)
{
  vector float v1 = vec_xl( 0, ptr);
  vector float v2 = vec_xl(16, ptr);
  vector float v3 = vec_xl(32, ptr);

  static const vector unsigned char flp = {0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31};
  *a = vec_perm(v1, vec_sld(v3, v2, 8), flp);

  static const vector unsigned char flp2 = {28, 29, 30, 31, 0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19};
  *b = vec_perm(v2, vec_sld(v1, v3, 8), flp2);

  *c = vec_perm(vec_sld(v2, v1, 8), v3, flp);
}
/* Combine the vectors A, B and C and store 12 floats starting at PTR.

   Three vectors are written with vec_xst at byte offsets 0, 16 and 32:
     offset  0: vec_perm (A, mergeh (B, C), ahbc)
     offset 16: vec_sld (mergel (A, B), mergeh (B, C), 8)
     offset 32: vec_perm (C, mergel (A, B), clab)  */
void v_store_interleave_f32(float *ptr, vector float a, vector float b, vector float c)
{
  vector float hbc = vec_mergeh(b, c);

  static const vector unsigned char ahbc = {0, 1, 2, 3, 16, 17, 18, 19, 20, 21, 22, 23, 4, 5, 6, 7};
  vec_xst(vec_perm(a, hbc, ahbc), 0, ptr);

  vector float lab = vec_mergel(a, b);
  vec_xst(vec_sld(lab, hbc, 8), 16, ptr);

  static const vector unsigned char clab = {8, 9, 10, 11, 24, 25, 26, 27, 28, 29, 30, 31, 12, 13, 14, 15};
  vec_xst(vec_perm(c, lab, clab), 32, ptr);
}
/* Convert the unsigned int vector A to a float vector using vec_ctf with a
   scale argument of 0, and return the result.  */
vector float v_cvt_f32(vector unsigned int a)
{
  return (vector float)vec_ctf(a, 0);
}
/* Vector version of the masked accumulate: add SRC bytes into DST floats.

   SRC and DST are indexed as three values per element (offset x * 3) and
   MASK as one byte per element.  Each loop iteration handles cVectorWidth
   (16) elements:
     1. build a byte mask that is all-ones where MASK is non-zero,
     2. de-interleave 48 source bytes into three vectors and AND each with
        that mask,
     3. expand every byte vector through v_expand_u8 and v_expand_u16 into
        four unsigned int vectors,
     4. load the matching DST floats, add the converted source values and
        store the sums back.
   The loop only runs while a full 16-element step fits in LEN; nothing in
   this function handles a shorter remainder.

   NOTE(review): the declaration of `x' was not present in the text this
   block was rebuilt from.  `int x = 0;' is assumed from the empty loop
   initializer -- confirm against the upstream test.  */
void acc_simd_(const unsigned char *src, float *dst, const unsigned char *mask, int len)
{
  int x = 0;
  const int cVectorWidth = 16;

  for ( ; x <= len - cVectorWidth; x += cVectorWidth)
    {
      vector unsigned char v_mask = vec_xl(0, mask + x);
      /* cmpeq against zero marks the bytes where mask == 0; nor (v, v)
	 inverts that, leaving all-ones where mask != 0.  */
      v_mask = (vector unsigned char)vec_cmpeq(vec_splats((unsigned char)0), v_mask);
      v_mask = (vector unsigned char)vec_nor(v_mask, v_mask);
      vector unsigned char v_src0, v_src1, v_src2;
      v_load_deinterleave_u8((unsigned char *)(src + (x * 3)), &v_src0, &v_src1, &v_src2);
      v_src0 = v_src0 & v_mask;
      v_src1 = v_src1 & v_mask;
      v_src2 = v_src2 & v_mask;

      /* Expand 16 uchar into 4 vectors, each containing 4 uint.  */
      vector unsigned short v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
      v_expand_u8(&v_src0, &v_src00, &v_src01);
      v_expand_u8(&v_src1, &v_src10, &v_src11);
      v_expand_u8(&v_src2, &v_src20, &v_src21);
      vector unsigned int v_src000, v_src001, v_src010, v_src011;
      vector unsigned int v_src100, v_src101, v_src110, v_src111;
      vector unsigned int v_src200, v_src201, v_src210, v_src211;
      v_expand_u16(&v_src00, &v_src000, &v_src001);
      v_expand_u16(&v_src01, &v_src010, &v_src011);
      v_expand_u16(&v_src10, &v_src100, &v_src101);
      v_expand_u16(&v_src11, &v_src110, &v_src111);
      v_expand_u16(&v_src20, &v_src200, &v_src201);
      v_expand_u16(&v_src21, &v_src210, &v_src211);

      vector float v_dst000, v_dst001, v_dst010, v_dst011;
      vector float v_dst100, v_dst101, v_dst110, v_dst111;
      vector float v_dst200, v_dst201, v_dst210, v_dst211;
      v_load_deinterleave_f32(dst + (x * 3), &v_dst000, &v_dst100, &v_dst200);
      v_load_deinterleave_f32(dst + ((x + 4) * 3), &v_dst001, &v_dst101, &v_dst201);
      v_load_deinterleave_f32(dst + ((x + 8) * 3), &v_dst010, &v_dst110, &v_dst210);
      v_load_deinterleave_f32(dst + ((x + 12) * 3), &v_dst011, &v_dst111, &v_dst211);

      v_store_interleave_f32(dst + (x * 3), vec_add(v_dst000, v_cvt_f32(v_src000)), vec_add(v_dst100, v_cvt_f32(v_src100)), vec_add(v_dst200, v_cvt_f32(v_src200)));
      v_store_interleave_f32(dst + ((x + 4) * 3), vec_add(v_dst001, v_cvt_f32(v_src001)), vec_add(v_dst101, v_cvt_f32(v_src101)), vec_add(v_dst201, v_cvt_f32(v_src201)));
      v_store_interleave_f32(dst + ((x + 8) * 3), vec_add(v_dst010, v_cvt_f32(v_src010)), vec_add(v_dst110, v_cvt_f32(v_src110)), vec_add(v_dst210, v_cvt_f32(v_src210)));
      v_store_interleave_f32(dst + ((x + 12) * 3), vec_add(v_dst011, v_cvt_f32(v_src011)), vec_add(v_dst111, v_cvt_f32(v_src111)), vec_add(v_dst211, v_cvt_f32(v_src211)));
    }
}
/* Scalar reference for the masked accumulate.

   Walks LEN elements; SRC and DST advance by three values per element and
   MASK by one byte.  Where mask[x] is non-zero, the three SRC bytes of that
   element are added to the three corresponding DST floats; other elements
   are left untouched.  A LEN of zero or less performs no work.

   NOTE(review): the declaration of `x' and the statement inside the
   innermost loop were not present in the text this block was rebuilt from.
   `int x = 0;' is assumed from the empty loop initializer, and
   `dst[k] += src[k];' from the comment on the mask test -- confirm against
   the upstream test.  */
void acc_(const unsigned char *src, float *dst, const unsigned char *mask, int len)
{
  int x = 0;

  for ( ; x < len; x++, src += 3, dst += 3)
    {
      if (mask[x]) /* if mask, R/G/B dst[] += src[] */
	{
	  for (int k = 0; k < 3; k++)
	    {
	      dst[k] += src[k];
	    }
	}
    }
}
143 int main(int argc
, char *argv
[])
145 unsigned char __attribute__ ((aligned (16) )) mask
[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1};
146 unsigned char __attribute__ ((aligned (16) )) src
[3*N
];
147 float __attribute__ ((aligned (16) )) dst
[3*N
];
148 float __attribute__ ((aligned (16) )) exp
[3*N
];
152 /* initialize src and dst */
153 for (i
=0; i
<3*N
; i
++) src
[i
] = (unsigned char)(i
*3);
154 for (i
=0; i
<3*N
; i
++) {dst
[i
] = i
* 1.0f
; exp
[i
] = dst
[i
];}
156 acc_(src
, exp
, mask
, N
);
157 acc_simd_(src
, dst
, mask
, N
);
161 if ((dst
[3*i
] != exp
[3*i
]) || (dst
[3*i
+1] != exp
[3*i
+1]) || (dst
[3*i
+2] != exp
[3*i
+2]))