Fix parallel build of examples/charm++/user-driven-interop
[charm.git] / src / util / cmimemcpy_qpx.c
blob6a7c17f66e7a89f5ec2c4ede35a0d3455228bba2
1 #include <string.h>
2 #include "cmimemcpy_qpx.h"
/*
 * QPX_LOAD(si, sb, fp): issue a "qvlfdx" instruction that loads into the
 * floating-point register operand `fp` from the address formed by base
 * pointer `si` and index register `sb`.
 *
 * - No trailing ';' after while (0): the caller supplies it, so the macro
 *   behaves as a single statement even inside an unbraced if/else.
 * - The "memory" clobber tells the compiler that the asm reads memory that is
 *   not listed as an operand, so pending stores to the source buffer are not
 *   moved past the load.
 */
#define QPX_LOAD(si,sb,fp) \
  do { \
    asm volatile("qvlfdx %0,%1,%2" : "=f"(fp) : "b" (si), "r" (sb) : "memory"); \
  } while (0)
/*
 * QPX_STORE(si, sb, fp): issue a "qvstfdx" instruction that stores the
 * floating-point register operand `fp` to the address formed by base pointer
 * `si` and index register `sb`.
 *
 * The "memory" clobber marks the asm as writing memory the compiler cannot
 * see.  There is deliberately no trailing ';' after while (0): the caller
 * supplies it, so the macro is a single statement even in an unbraced if/else.
 */
#define QPX_STORE(si,sb,fp) \
  do { \
    asm volatile("qvstfdx %2,%0,%1" : : "b" (si), "r" (sb), "f"(fp) : "memory"); \
  } while (0)
/*
 * Register-name helpers for binding C variables to floating-point registers.
 *
 * FP_REG1(i) yields the register name as a string literal ("fr<i>") and is
 * the same for every compiler.
 * FP_REG(i) yields an asm("...") register specifier; GNU-compatible compilers
 * use the "fr<i>" spelling, all other compilers use "f<i>".
 */
#define FP_REG1(i) "fr"#i

#ifdef __GNUC__
#define FP_REG(i) asm("fr"#i)
#else
#define FP_REG(i) asm("f"#i)
#endif
22 //Copy 512 bytes from a 32b aligned pointers
23 static inline size_t quad_copy_512( char* dest, const char* src ) {
24 register const double *fpp1_1, *fpp1_2;
25 register double *fpp2_1, *fpp2_2;
27 register double f0 FP_REG(0);
28 register double f1 FP_REG(1);
29 register double f2 FP_REG(2);
30 register double f3 FP_REG(3);
31 register double f4 FP_REG(4);
32 register double f5 FP_REG(5);
33 register double f6 FP_REG(6);
34 register double f7 FP_REG(7);
36 int r0;
37 int r1;
38 int r2;
39 int r3;
40 int r4;
41 int r5;
42 int r6;
43 int r7;
44 r0 = 0;
45 r1 = 64;
46 r2 = 128;
47 r3 = 192;
48 r4 = 256;
49 r5 = 320;
50 r6 = 384;
51 r7 = 448;
53 fpp1_1 = (const double *)src;
54 fpp1_2 = (const double *)src +4;
56 fpp2_1 = (double *)dest;
57 fpp2_2 = (double *)dest +4;
59 QPX_LOAD(fpp1_1,r0,f0);
60 //asm volatile("qvlfdx 0,%0,%1": : "Ob" (fpp1_1), "r"(r0) :"memory");
61 QPX_LOAD(fpp1_1,r1,f1);
62 QPX_LOAD(fpp1_1,r2,f2);
63 QPX_LOAD(fpp1_1,r3,f3);
64 QPX_LOAD(fpp1_1,r4,f4);
65 QPX_LOAD(fpp1_1,r5,f5);
66 QPX_LOAD(fpp1_1,r6,f6);
67 QPX_LOAD(fpp1_1,r7,f7);
69 QPX_STORE(fpp2_1,r0,f0);
70 QPX_LOAD(fpp1_2,r0,f0);
71 QPX_STORE(fpp2_1,r1,f1);
72 QPX_LOAD(fpp1_2,r1,f1);
73 QPX_STORE(fpp2_1,r2,f2);
74 QPX_LOAD(fpp1_2,r2,f2);
75 QPX_STORE(fpp2_1,r3,f3);
76 QPX_LOAD(fpp1_2,r3,f3);
77 QPX_STORE(fpp2_1,r4,f4);
78 QPX_LOAD(fpp1_2,r4,f4);
79 QPX_STORE(fpp2_1,r5,f5);
80 QPX_LOAD(fpp1_2,r5,f5);
81 QPX_STORE(fpp2_1,r6,f6);
82 QPX_LOAD(fpp1_2,r6,f6);
83 QPX_STORE(fpp2_1,r7,f7);
84 QPX_LOAD(fpp1_2,r7,f7);
86 QPX_STORE(fpp2_2,r0,f0);
87 QPX_STORE(fpp2_2,r1,f1);
88 QPX_STORE(fpp2_2,r2,f2);
89 QPX_STORE(fpp2_2,r3,f3);
90 QPX_STORE(fpp2_2,r4,f4);
91 QPX_STORE(fpp2_2,r5,f5);
92 QPX_STORE(fpp2_2,r6,f6);
93 QPX_STORE(fpp2_2,r7,f7);
95 return 0;
/*
 * Copy n bytes from src to dst.
 *
 * The bulk of the buffer is moved in 512-byte chunks by quad_copy_512();
 * whatever is left over (n modulo 512 bytes) is finished with memcpy().
 *
 * quad_copy_512() is documented as working on 32-byte aligned pointers and no
 * alignment check is made here, so presumably callers guarantee that dst and
 * src are suitably aligned -- confirm against the call sites.
 *
 * The chunk counter is a size_t: storing n >> 9 in an int would truncate (and
 * possibly go negative) for very large n.
 */
void CmiMemcpy_qpx (void *dst, const void *src, size_t n)
{
  const char *s = src;
  char *d = dst;
  size_t n512 = n >> 9;         /* number of whole 512-byte chunks */
  size_t tail = n & 511UL;      /* bytes left after the last whole chunk */

  while (n512--) {
    quad_copy_512(d, s);
    d += 512;
    s += 512;
  }

  if (tail != 0) {
    memcpy(d, s, tail);
  }
}