From bde4765b956d0f11d27e3a1603dd6e0b19f63183 Mon Sep 17 00:00:00 2001 From: Nikhil Jain Date: Mon, 26 May 2014 04:57:22 +0000 Subject: [PATCH] BG/Q: force compilation of QPX based memcopy with O3 Recently, we observed erratic behavior in CmiMemcpy on BG/Q, which uses the QPX unit. We found that when the QPX-based code is compiled without -O3, the compiler generates incorrect assembly and hence incorrect output. This workaround forces the use of -O3 when compiling that part of the code until IBM fixes the issue. Compiler versions that showed this behavior: /soft/compilers/ibmcmp-aug2013/vac/bg/12.1/ (on vesta) /soft/compilers/ibmcmp-feb2014/vac/bg/12.1/ (on vesta) Driver: /bgsys/drivers/V1R2M1/ppc64 Change-Id: I12ee1293955b7cd2cba8317109f22dbf347ef005 --- src/arch/pami/Makefile.machine | 7 ++ src/arch/pamilrts/Makefile.machine | 7 ++ src/util/{cmimemcpy_qpx.h => cmimemcpy_qpx.c} | 29 ++---- src/util/cmimemcpy_qpx.h | 139 +++----------------------- 4 files changed, 37 insertions(+), 145 deletions(-) copy src/util/{cmimemcpy_qpx.h => cmimemcpy_qpx.c} (86%) rewrite src/util/cmimemcpy_qpx.h (94%) diff --git a/src/arch/pami/Makefile.machine b/src/arch/pami/Makefile.machine index e69de29bb2..b404daf863 100644 --- a/src/arch/pami/Makefile.machine +++ b/src/arch/pami/Makefile.machine @@ -0,0 +1,7 @@ +#force compilation of QPX based code with -O3 + +LIBCONV_UTIL := ${LIBCONV_UTIL} cmimemcpy_qpx.o + +cmimemcpy_qpx.o: cmimemcpy_qpx.c cmimemcpy_qpx.h +cmimemcpy_qpx.o: CFLAGS:=${CFLAGS} -O3 + diff --git a/src/arch/pamilrts/Makefile.machine b/src/arch/pamilrts/Makefile.machine index e69de29bb2..b404daf863 100644 --- a/src/arch/pamilrts/Makefile.machine +++ b/src/arch/pamilrts/Makefile.machine @@ -0,0 +1,7 @@ +#force compilation of QPX based code with -O3 + +LIBCONV_UTIL := ${LIBCONV_UTIL} cmimemcpy_qpx.o + +cmimemcpy_qpx.o: cmimemcpy_qpx.c cmimemcpy_qpx.h +cmimemcpy_qpx.o: CFLAGS:=${CFLAGS} -O3 + diff --git a/src/util/cmimemcpy_qpx.h b/src/util/cmimemcpy_qpx.c similarity index 86% copy from 
src/util/cmimemcpy_qpx.h copy to src/util/cmimemcpy_qpx.c index b2d25ca7d5..9167106680 100644 --- a/src/util/cmimemcpy_qpx.h +++ b/src/util/cmimemcpy_qpx.c @@ -1,8 +1,5 @@ - -#ifndef __CMI_MEMCPY_QPX__ -#define __CMI_MEMCPY_QPX__ - #include +#include "cmimemcpy_qpx.h" #define QPX_LOAD(si,sb,fp) \ do { \ @@ -22,12 +19,8 @@ #define FP_REG1(i) "fr"#i #endif -#ifdef __cplusplus -extern "C" { -#endif - //Copy 512 bytes from a 32b aligned pointers -static inline size_t quad_copy_512( char* dest, char* src ) { +static inline size_t quad_copy_512( char* dest, char* src ) { register double *fpp1_1, *fpp1_2; register double *fpp2_1, *fpp2_2; @@ -78,7 +71,7 @@ static inline size_t quad_copy_512( char* dest, char* src ) { QPX_STORE(fpp2_1,r1,f1); QPX_LOAD(fpp1_2,r1,f1); QPX_STORE(fpp2_1,r2,f2); - QPX_LOAD(fpp1_2,r2,f2); + QPX_LOAD(fpp1_2,r2,f2); QPX_STORE(fpp2_1,r3,f3); QPX_LOAD(fpp1_2,r3,f3); QPX_STORE(fpp2_1,r4,f4); @@ -89,7 +82,7 @@ static inline size_t quad_copy_512( char* dest, char* src ) { QPX_LOAD(fpp1_2,r6,f6); QPX_STORE(fpp2_1,r7,f7); QPX_LOAD(fpp1_2,r7,f7); - + QPX_STORE(fpp2_2,r0,f0); QPX_STORE(fpp2_2,r1,f1); QPX_STORE(fpp2_2,r2,f2); @@ -102,7 +95,7 @@ static inline size_t quad_copy_512( char* dest, char* src ) { return 0; } -void CmiMemcpy_qpx (void *dst, const void *src, size_t n) +void CmiMemcpy_qpx (void *dst, const void *src, size_t n) { const char *s = src; char *d = dst; @@ -112,13 +105,7 @@ void CmiMemcpy_qpx (void *dst, const void *src, size_t n) d += 512; s += 512; } - - if ( (n & 511UL) != 0 ) - memcpy (d, s, n & 511UL); + + if ( (n & 511UL) != 0 ) + memcpy (d, s, n & 511UL); } - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/src/util/cmimemcpy_qpx.h b/src/util/cmimemcpy_qpx.h dissimilarity index 94% index b2d25ca7d5..e38ef54528 100644 --- a/src/util/cmimemcpy_qpx.h +++ b/src/util/cmimemcpy_qpx.h @@ -1,124 +1,15 @@ - -#ifndef __CMI_MEMCPY_QPX__ -#define __CMI_MEMCPY_QPX__ - -#include - -#define QPX_LOAD(si,sb,fp) \ - do { \ - asm volatile("qvlfdx 
%0,%1,%2": "=f"(fp) : "b" (si), "r" (sb)); \ - } while(0); - -#define QPX_STORE(si,sb,fp) \ - do { \ - asm volatile("qvstfdx %2,%0,%1": : "b" (si), "r" (sb), "f"(fp) :"memory"); \ - } while(0); - -#ifndef __GNUC__ -#define FP_REG(i) asm("f"#i) -#define FP_REG1(i) "fr"#i -#else -#define FP_REG(i) asm("fr"#i) -#define FP_REG1(i) "fr"#i -#endif - -#ifdef __cplusplus -extern "C" { -#endif - -//Copy 512 bytes from a 32b aligned pointers -static inline size_t quad_copy_512( char* dest, char* src ) { - register double *fpp1_1, *fpp1_2; - register double *fpp2_1, *fpp2_2; - - register double f0 FP_REG(0); - register double f1 FP_REG(1); - register double f2 FP_REG(2); - register double f3 FP_REG(3); - register double f4 FP_REG(4); - register double f5 FP_REG(5); - register double f6 FP_REG(6); - register double f7 FP_REG(7); - - int r0; - int r1; - int r2; - int r3; - int r4; - int r5; - int r6; - int r7; - r0 = 0; - r1 = 64; - r2 = 128; - r3 = 192; - r4 = 256; - r5 = 320; - r6 = 384; - r7 = 448; - - fpp1_1 = (double *)src; - fpp1_2 = (double *)src +4; - - fpp2_1 = (double *)dest; - fpp2_2 = (double *)dest +4; - - QPX_LOAD(fpp1_1,r0,f0); - //asm volatile("qvlfdx 0,%0,%1": : "Ob" (fpp1_1), "r"(r0) :"memory"); - QPX_LOAD(fpp1_1,r1,f1); - QPX_LOAD(fpp1_1,r2,f2); - QPX_LOAD(fpp1_1,r3,f3); - QPX_LOAD(fpp1_1,r4,f4); - QPX_LOAD(fpp1_1,r5,f5); - QPX_LOAD(fpp1_1,r6,f6); - QPX_LOAD(fpp1_1,r7,f7); - - QPX_STORE(fpp2_1,r0,f0); - QPX_LOAD(fpp1_2,r0,f0); - QPX_STORE(fpp2_1,r1,f1); - QPX_LOAD(fpp1_2,r1,f1); - QPX_STORE(fpp2_1,r2,f2); - QPX_LOAD(fpp1_2,r2,f2); - QPX_STORE(fpp2_1,r3,f3); - QPX_LOAD(fpp1_2,r3,f3); - QPX_STORE(fpp2_1,r4,f4); - QPX_LOAD(fpp1_2,r4,f4); - QPX_STORE(fpp2_1,r5,f5); - QPX_LOAD(fpp1_2,r5,f5); - QPX_STORE(fpp2_1,r6,f6); - QPX_LOAD(fpp1_2,r6,f6); - QPX_STORE(fpp2_1,r7,f7); - QPX_LOAD(fpp1_2,r7,f7); - - QPX_STORE(fpp2_2,r0,f0); - QPX_STORE(fpp2_2,r1,f1); - QPX_STORE(fpp2_2,r2,f2); - QPX_STORE(fpp2_2,r3,f3); - QPX_STORE(fpp2_2,r4,f4); - QPX_STORE(fpp2_2,r5,f5); - 
QPX_STORE(fpp2_2,r6,f6); - QPX_STORE(fpp2_2,r7,f7); - - return 0; -} - -void CmiMemcpy_qpx (void *dst, const void *src, size_t n) -{ - const char *s = src; - char *d = dst; - int n512 = n >> 9; - while (n512 --) { - quad_copy_512(d, s); - d += 512; - s += 512; - } - - if ( (n & 511UL) != 0 ) - memcpy (d, s, n & 511UL); -} - -#ifdef __cplusplus -} -#endif - -#endif + +#ifndef __CMI_MEMCPY_QPX__ +#define __CMI_MEMCPY_QPX__ + +#ifdef __cplusplus +extern "C" { +#endif + +void CmiMemcpy_qpx (void *dst, const void *src, size_t n); + +#ifdef __cplusplus +} +#endif + +#endif -- 2.11.4.GIT