/* { dg-do compile } */
/* { dg-require-effective-target ia32 } */
/* { dg-options "-O3 -msse2 -fdump-rtl-csa" } */
/* { dg-skip-if "no stdint" { vxworks_kernel } } */
/* Basic types for the test.
   - size_t is spelled via the compiler-provided __SIZE_TYPE__ macro, so no
     library header is needed to obtain it.
   - vFloat and vDouble are 16-byte vector types built with the
     __vector_size__ attribute (four floats and two doubles respectively).
   The listing line numbers that had been fused into these declarations, and
   the line breaks that split each declarator, have been removed so the three
   typedefs are valid C again.  */
typedef __SIZE_TYPE__ size_t;
typedef float vFloat __attribute__ ((__vector_size__ (16)));
typedef double vDouble __attribute__ ((__vector_size__ (16)));
/* Declarator for `func`: a pointer to a function that takes a
   `struct Job *` and returns long.
   NOTE(review): the leading integers look like line numbers from the
   original listing, and they jump from 11 to 24 here, so this is probably a
   member of a struct whose header is missing from this extract -- confirm
   against the pristine file before treating it as a file-scope object.  */
24 long (*func
)(struct Job
*job
);
/* Two file-local, read-only tables of 256 doubles each.  Neither has an
   initializer, so static storage rules make every entry 0.0.  Within this
   file, r[] entries are subtracted from the widened inputs and t[] entries
   multiply the vi0..vi3 vectors.
   The fused listing line number and the line breaks inside the declarators
   have been removed so this is a valid declaration again.  */
static const double r[256], t[256];
/* bar: takes pointers to a source and a destination `buf`, a float `g` and
   an unsigned flags word, and returns long.
   NOTE(review): the leading integers look like line numbers from the
   original listing; they jump from 39 to 41 after this signature, so the
   function's opening brace is not part of this extract.  The `buf` type is
   also declared outside the visible text.  */
39 long bar (const buf
*src
, const buf
*dest
, float g
, unsigned int flags
)
/* Reinterpret the data pointers of both buffers as float arrays and read
   the width from the destination descriptor.  */
41 float *d0
= (float*) src
->data
;
42 float *d1
= (float*) dest
->data
;
43 uintptr_t w
= dest
->w
;
/* Function-local constant vectors; no initializer is visible for any of
   them.  Later in the function m0 is the first operand of _mm_andnot_ps,
   p[0..2] are the coefficients of a quadratic, and b and m are added to
   vi0..vi3 before the 52-bit shift.
   NOTE(review): listing lines 44-45 are absent from this extract.  */
46 static const vFloat m0
;
47 static const vDouble p
[3], m
, b
;
/* Advance idx in steps of eight for as long as a full group of eight still
   fits within w.
   NOTE(review): the declarations of idx, sr and dr, and the braces of this
   loop, fall on listing lines (48-49, 51) that are missing from this
   extract, so the extent of the loop body cannot be read off from here.  */
50 for( idx
= 0; idx
+ 8 <= w
; idx
+= 8 )
/* Unaligned loads of two groups of four floats, from sr and sr + 4.  */
52 vFloat f0
= _mm_loadu_ps (sr
);
53 vFloat f1
= _mm_loadu_ps (sr
+ 4);
/* fa = (~m0) & f, bitwise.  Presumably m0 is meant to be a sign-bit mask,
   which would make this an absolute value -- TODO confirm; no initializer
   for m0 is visible.  */
55 vFloat fa0
= _mm_andnot_ps (m0
, f0
);
56 vFloat fa1
= _mm_andnot_ps (m0
, f1
);
/* Widen each group of four floats to two vDouble values: the low pair is
   converted directly, the high pair is first moved down with
   _mm_movehl_ps.  */
57 vDouble v0
= _mm_cvtps_pd (fa0
);
58 vDouble v1
= _mm_cvtps_pd (_mm_movehl_ps (fa0
, fa0
));
59 vDouble v2
= _mm_cvtps_pd (fa1
);
60 vDouble v3
= _mm_cvtps_pd (_mm_movehl_ps (fa1
, fa1
));
/* vi0..vi3 and b0..b3 are declared without initializers.  The listing is
   contiguous from line 61 through line 86 and shows no assignment to them
   before they are first read (b0..b3 on line 63, vi0..vi3 on lines 79-82).
   NOTE(review): confirm that this is intentional for the compiler test.  */
61 vDouble vi0
, vi1
, vi2
, vi3
;
62 __m128i b0
, b1
, b2
, b3
;
/* Fold b0..b3 into b0 through nested _mm_packs_epi32 calls; b1 then becomes
   b0 with each 64-bit lane shifted right by 32 bits.  */
63 b0
= _mm_packs_epi32 (_mm_packs_epi32 (b0
, b1
), _mm_packs_epi32 (b2
, b3
));
64 b1
= _mm_srli_epi64 (b0
, 32);
/* i0 and i2 are the low 32 bits of b0 and b1.  */
65 unsigned int i0
= _mm_cvtsi128_si32 (b0
);
66 unsigned int i2
= _mm_cvtsi128_si32 (b1
);
/* Subtract a pair of r[] entries from v0 and v1: the low lane is loaded
   from r[i & 0xff], the high lane from r[i >> 16].
   NOTE(review): r[] has 256 entries and the (i >> 16) index is not masked
   here.  */
67 v0
-= _mm_loadh_pd (_mm_load_sd (r
+ (i0
& 0xff)), r
+ (i0
>> 16));
68 v1
-= _mm_loadh_pd (_mm_load_sd (r
+ (i2
& 0xff)), r
+ (i2
>> 16));
/* Copy the high 64-bit half of b0 and of b1 into both halves, then repeat
   the same extraction and table subtraction for v2 and v3.  */
69 b0
= _mm_unpackhi_epi64 (b0
, b0
);
70 b1
= _mm_unpackhi_epi64 (b1
, b1
);
71 unsigned int i4
= _mm_cvtsi128_si32 (b0
);
72 unsigned int i6
= _mm_cvtsi128_si32 (b1
);
73 v2
-= _mm_loadh_pd (_mm_load_sd (r
+ (i4
& 0xff)), r
+ (i4
>> 16));
74 v3
-= _mm_loadh_pd (_mm_load_sd (r
+ (i6
& 0xff)), r
+ (i6
>> 16));
/* Replace each v with the quadratic p[0] + p[1]*v + p[2]*v*v, written in
   Horner form and applied lane-wise to all four vectors.  */
75 v0
= p
[0] + (p
[1] + p
[2] * v0
) * v0
;
76 v1
= p
[0] + (p
[1] + p
[2] * v1
) * v1
;
77 v2
= p
[0] + (p
[1] + p
[2] * v2
) * v2
;
78 v3
= p
[0] + (p
[1] + p
[2] * v3
) * v3
;
/* For each vi: add b, then m, reinterpret the vDouble bits as __m128i,
   shift every 64-bit lane left by 52, and reinterpret the result as a
   vDouble again.
   NOTE(review): 52 is the bit offset of the exponent field in an IEEE-754
   double, so this looks like constructing a power of two from an integer
   held in the low mantissa bits -- confirm.  */
79 vi0
= (vDouble
) _mm_slli_epi64 ((__m128i
)((vi0
+ b
) + m
), 52);
80 vi1
= (vDouble
) _mm_slli_epi64 ((__m128i
)((vi1
+ b
) + m
), 52);
81 vi2
= (vDouble
) _mm_slli_epi64 ((__m128i
)((vi2
+ b
) + m
), 52);
82 vi3
= (vDouble
) _mm_slli_epi64 ((__m128i
)((vi3
+ b
) + m
), 52);
/* Scale each vi by a pair of t[] entries, selected with the same
   (i & 0xff) / (i >> 16) indices that the r[] lookups earlier in the
   function use.  */
83 vi0
*= _mm_loadh_pd (_mm_load_sd (t
+ (i0
& 0xff)), t
+ (i0
>> 16));
84 vi1
*= _mm_loadh_pd (_mm_load_sd (t
+ (i2
& 0xff)), t
+ (i2
>> 16));
85 vi2
*= _mm_loadh_pd (_mm_load_sd (t
+ (i4
& 0xff)), t
+ (i4
>> 16));
86 vi3
*= _mm_loadh_pd (_mm_load_sd (t
+ (i6
& 0xff)), t
+ (i6
>> 16));
/* Narrow each pair of vDouble results back to floats and join them into one
   vFloat per group: r0 from v0 and v1, r1 from v2 and v3.
   NOTE(review): listing lines 87-90 are absent from this extract, so any
   use of vi0..vi3 between the t[] scaling and this point is not visible.  */
91 vFloat r0
= _mm_movelh_ps (_mm_cvtpd_ps( v0
), _mm_cvtpd_ps (v1
));
92 vFloat r1
= _mm_movelh_ps (_mm_cvtpd_ps( v2
), _mm_cvtpd_ps (v3
));
/* z0 / z1: per-lane masks, set where the corresponding input float compares
   equal to zero.  */
93 vFloat z0
= _mm_cmpeq_ps (f0
, _mm_setzero_ps());
94 vFloat z1
= _mm_cmpeq_ps (f1
, _mm_setzero_ps());
/* Lane select: clear the result where the input was zero ...  */
95 r0
= _mm_andnot_ps (z0
, r0
);
96 r1
= _mm_andnot_ps (z1
, r1
);
/* ... keep p0 only in those same lanes (p0 is not declared anywhere in this
   extract) ...  */
97 z0
= _mm_and_ps (z0
, p0
);
98 z1
= _mm_and_ps (z1
, p0
);
/* ... and merge the two halves of the selection.  */
99 r0
= _mm_or_ps (r0
, z0
);
100 r1
= _mm_or_ps (r1
, z1
);
/* Unaligned stores of the two result groups to dr and dr + 4.  */
101 _mm_storeu_ps (dr
, r0
);
102 _mm_storeu_ps (dr
+ 4, r1
);
/* Return statement that forwards the address of jd's src and dest members,
   plus its g and flags members, to bar and returns bar's result.
   NOTE(review): the listing jumps from line 102 to 111, so the header of
   the function this statement belongs to, and the declaration of jd, are
   not part of this extract.  */
111 return bar (&jd
->src
, &jd
->dest
, jd
->g
, jd
->flags
);
/* { dg-final { scan-rtl-dump-not "deleted 1 dead insns" "csa" } } */
/* { dg-final { cleanup-rtl-dump "csa" } } */