1 extern void abort (void);
3 typedef float v4flt
__attribute__ ((vector_size (16)));
5 void __attribute__ ((noinline
)) foo (float *dst
, float **src
, int a
, int n
)
8 int z
= sizeof (v4flt
) / sizeof (float);
9 unsigned m
= sizeof (v4flt
) - 1;
11 for (j
= 0; j
< n
&& (((unsigned long) dst
+ j
) & m
); ++j
)
14 for (i
= 1; i
< a
; ++i
)
19 for (; j
< (n
- (4 * z
- 1)); j
+= 4 * z
)
21 v4flt t0
= *(v4flt
*) (src
[0] + j
+ 0 * z
);
22 v4flt t1
= *(v4flt
*) (src
[0] + j
+ 1 * z
);
23 v4flt t2
= *(v4flt
*) (src
[0] + j
+ 2 * z
);
24 v4flt t3
= *(v4flt
*) (src
[0] + j
+ 3 * z
);
25 for (i
= 1; i
< a
; ++i
)
27 t0
+= *(v4flt
*) (src
[i
] + j
+ 0 * z
);
28 t1
+= *(v4flt
*) (src
[i
] + j
+ 1 * z
);
29 t2
+= *(v4flt
*) (src
[i
] + j
+ 2 * z
);
30 t3
+= *(v4flt
*) (src
[i
] + j
+ 3 * z
);
32 *(v4flt
*) (dst
+ j
+ 0 * z
) = t0
;
33 *(v4flt
*) (dst
+ j
+ 1 * z
) = t1
;
34 *(v4flt
*) (dst
+ j
+ 2 * z
) = t2
;
35 *(v4flt
*) (dst
+ j
+ 3 * z
) = t3
;
40 for (i
= 1; i
< a
; ++i
)
55 cptr
= (char *)buffer
;
56 cptr
+= (-(long int) buffer
& (16 * sizeof (float) - 1));
60 for (i
= 0; i
< 16; ++i
)
62 src
[0][i
] = (float) i
+ 11 * (float) i
;
63 src
[1][i
] = (float) i
+ 12 * (float) i
;
65 foo (dst
, src
, 2, 16);
66 for (i
= 0; i
< 16; ++i
)
68 float e
= (float) i
+ 11 * (float) i
+ (float) i
+ 12 * (float) i
;