From e6754908a3a8587bc7bc1f6fe0e5269924127066 Mon Sep 17 00:00:00 2001 From: Sameer Kumar Date: Tue, 2 Feb 2016 11:20:33 -0600 Subject: [PATCH] Charm's port for newer IBM systems This patch contains Sameer's port for running Charm on top of newer IBM systems. The base layer is PAMI, which should also work on current Blue Gene/Q systems. Note: these changes are for non-LRTS versions, and would need to be ported to a LRTS version. Change-Id: I11c4eca8fdc761a862e570e701a67c1c8fcc9bf2 --- src/arch/pami-bluegeneq/L2AtomicMutex.h | 75 --- src/arch/pami-bluegeneq/L2AtomicQueue.h | 205 --------- src/arch/{pami => pami-bluegeneq}/Makefile.machine | 0 src/arch/pami-bluegeneq/conv-mach-smp.h | 5 +- src/arch/pami-bluegeneq/conv-mach.h | 11 + src/arch/pami-bluegeneq/conv-mach.sh | 6 +- src/arch/pami-bluegeneq/memalloc.c | 126 ----- src/arch/pami-bluegeneq/ppc_atomicq_impl.h | 50 ++ src/arch/pami-linux-ppc64le/cc-gcc.h | 1 + src/arch/pami-linux-ppc64le/cc-gcc.sh | 29 ++ src/arch/pami-linux-ppc64le/cc-xlc.h | 1 + src/arch/pami-linux-ppc64le/cc-xlc.sh | 1 + src/arch/pami-linux-ppc64le/charmrun | 302 ++++++++++++ .../conv-mach-smp.h | 13 +- src/arch/pami-linux-ppc64le/conv-mach-smp.sh | 0 .../conv-mach.h | 48 +- src/arch/pami-linux-ppc64le/conv-mach.sh | 56 +++ src/arch/pami/Makefile.machine | 7 - src/arch/pami/PPCAtomicMutex.h | 82 ++++ src/arch/pami/PPCAtomicQueue.h | 210 +++++++++ src/arch/pami/conv-common.h | 5 +- src/arch/pami/default_ppcq.h | 98 ++++ src/arch/pami/machine.c | 511 +++++++++++++-------- src/arch/pami/manytomany.c | 428 +++++++++-------- src/arch/pami/memalloc.c | 144 ++++++ src/conv-core/convcore.c | 158 ++++++- src/conv-core/converse.h | 6 + src/conv-core/cpuaffinity.c | 45 +- 28 files changed, 1744 insertions(+), 879 deletions(-) delete mode 100644 src/arch/pami-bluegeneq/L2AtomicMutex.h delete mode 100644 src/arch/pami-bluegeneq/L2AtomicQueue.h copy src/arch/{pami => pami-bluegeneq}/Makefile.machine (100%) delete mode 100644 
src/arch/pami-bluegeneq/memalloc.c create mode 100644 src/arch/pami-bluegeneq/ppc_atomicq_impl.h create mode 100644 src/arch/pami-linux-ppc64le/cc-gcc.h create mode 100644 src/arch/pami-linux-ppc64le/cc-gcc.sh create mode 100644 src/arch/pami-linux-ppc64le/cc-xlc.h create mode 100644 src/arch/pami-linux-ppc64le/cc-xlc.sh create mode 100755 src/arch/pami-linux-ppc64le/charmrun copy src/arch/{pami-bluegeneq => pami-linux-ppc64le}/conv-mach-smp.h (63%) create mode 100644 src/arch/pami-linux-ppc64le/conv-mach-smp.sh copy src/arch/{pami-bluegeneq => pami-linux-ppc64le}/conv-mach.h (57%) create mode 100644 src/arch/pami-linux-ppc64le/conv-mach.sh create mode 100755 src/arch/pami/PPCAtomicMutex.h create mode 100755 src/arch/pami/PPCAtomicQueue.h create mode 100644 src/arch/pami/default_ppcq.h create mode 100755 src/arch/pami/memalloc.c diff --git a/src/arch/pami-bluegeneq/L2AtomicMutex.h b/src/arch/pami-bluegeneq/L2AtomicMutex.h deleted file mode 100644 index 4082c51f82..0000000000 --- a/src/arch/pami-bluegeneq/L2AtomicMutex.h +++ /dev/null @@ -1,75 +0,0 @@ - -#ifndef __L2_ATOMIC_MUTEX__ -#define __L2_ATOMIC_MUTEX__ - -#include -#include -#include -#include -#include "spi/include/l2/atomic.h" -#include "spi/include/l1p/flush.h" - -#define L2_ATOMIC_MUTEX_FAIL 0x8000000000000000UL - -typedef struct -{ - volatile uint64_t counter; - volatile uint64_t bound; -} L2AtomicMutex; - -L2AtomicMutex *L2AtomicMutexInit (void * l2mem, - size_t l2memsize) -{ - //Verify counter array is 64-byte aligned - assert( (((uintptr_t) l2mem) & (0x0F)) == 0 ); - assert (sizeof(L2AtomicMutex) <= l2memsize); - - L2AtomicMutex *mutex = (L2AtomicMutex*)l2mem; - L2_AtomicStore(&mutex->counter, 0); - L2_AtomicStore(&mutex->bound, 1); - - return mutex; -} - -/** - * \brief Try to acquire a mutex - * \param[in] mutex pointer - * \return 0 Lock successfully acquired - * \return 1 Lock was not acquired - */ -static inline int L2AtomicMutexTryAcquire (L2AtomicMutex *mutex) -{ - size_t rc = 
L2_AtomicLoadIncrementBounded(&mutex->counter); - return (rc == L2_ATOMIC_MUTEX_FAIL) ? (1) : (0); -} - -/** - * \brief Acquire a mutex - * \param[in] mutex pointer - * \return 0 Lock successfully acquired - */ -static inline void L2AtomicMutexAcquire (L2AtomicMutex *mutex) -{ - size_t rc = 0; - do { - rc = L2_AtomicLoadIncrementBounded(&mutex->counter); - } while (rc == L2_ATOMIC_MUTEX_FAIL); -} - -/** - * \brief Release a mutex - * \param[in] mutex pointer - * \return 0 Lock successfully released - * \return 1 Fail - */ -static inline void L2AtomicMutexRelease(L2AtomicMutex *mutex) -{ - //Flush outstanding loads/stores - ppc_msync(); - - /* Release the lock */ - L2_AtomicStore(&(mutex->counter), 0); -} - - -#endif diff --git a/src/arch/pami-bluegeneq/L2AtomicQueue.h b/src/arch/pami-bluegeneq/L2AtomicQueue.h deleted file mode 100644 index d1db471134..0000000000 --- a/src/arch/pami-bluegeneq/L2AtomicQueue.h +++ /dev/null @@ -1,205 +0,0 @@ - -#ifndef __L2_ATOMIC_QUEUE__ -#define __L2_ATOMIC_QUEUE__ - -#include -#include -#include -#include -#include "spi/include/l2/atomic.h" -#include "spi/include/l1p/flush.h" -#include "pcqueue.h" - -#define DEFAULT_SIZE 1024 -#define L2_ATOMIC_FULL 0x8000000000000000UL -#define L2_ATOMIC_EMPTY 0x8000000000000000UL - -#define L2A_SUCCESS 0 -#define L2A_EAGAIN -1 -#define L2A_FAIL -2 - -#define __L2_ATOMIC_QUEUE_BLOCKING 1 - -typedef void* L2AtomicQueueElement; - -typedef struct _l2atomicstate { - volatile uint64_t Consumer; // not used atomically - volatile uint64_t Producer; - volatile uint64_t UpperBound; - volatile uint64_t Flush; // contents not used -} L2AtomicState; - -typedef struct _l2atomicq { - L2AtomicState * _l2state; - volatile void * volatile * _array; - int _useOverflowQ; - int _qsize; - PCQueue _overflowQ; - pthread_mutex_t _overflowMutex; -} L2AtomicQueue; - -void L2AtomicQueueInit (void * l2mem, - size_t l2memsize, - L2AtomicQueue * queue, - int use_overflow, - int nelem) -{ - pami_result_t rc; - - //Verify 
counter array is 64-byte aligned - assert( (((uintptr_t) l2mem) & (0x1F)) == 0 ); - assert (sizeof(L2AtomicState) <= l2memsize); - - queue->_useOverflowQ = use_overflow; - - int qsize = 2; - while (qsize < nelem) - qsize *= 2; - queue->_qsize = qsize; - - queue->_l2state = (L2AtomicState *)l2mem; - pthread_mutex_init(&queue->_overflowMutex, NULL); - queue->_overflowQ = PCQueueCreate(); - L2_AtomicStore(&queue->_l2state->Consumer, 0); - L2_AtomicStore(&queue->_l2state->Producer, 0); - L2_AtomicStore(&queue->_l2state->UpperBound, qsize); - - rc = posix_memalign ((void **)&queue->_array, - 64, /*L1 line size for BG/Q */ - sizeof(L2AtomicQueueElement) * qsize); - - assert(rc == PAMI_SUCCESS); - memset((void*)queue->_array, 0, sizeof(L2AtomicQueueElement)*qsize); -} - -int L2AtomicEnqueue (L2AtomicQueue * queue, - void * element) -{ - //fprintf(stderr,"Insert message %p\n", element); - - register int qsize_1 = queue->_qsize - 1; - uint64_t index = L2_AtomicLoadIncrementBounded(&queue->_l2state->Producer); - L1P_FlushRequests(); - if (index != L2_ATOMIC_FULL) { - queue->_array[index & qsize_1] = element; - return L2A_SUCCESS; - } - - //We dont want to use the overflow queue - if (!queue->_useOverflowQ) - return L2A_EAGAIN; //Q is full, try later - - //No ordering is guaranteed if there is overflow - pthread_mutex_lock(&queue->_overflowMutex); - PCQueuePush(queue->_overflowQ, element); - pthread_mutex_unlock(&queue->_overflowMutex); - - return L2A_SUCCESS; -} - -void * L2AtomicDequeue (L2AtomicQueue *queue) -{ - uint64_t head, tail; - tail = queue->_l2state->Producer; - head = queue->_l2state->Consumer; - register int qsize_1 = queue->_qsize-1; - - volatile void *e = NULL; - if (head < tail) { - e = queue->_array[head & qsize_1]; -#if __L2_ATOMIC_QUEUE_BLOCKING - while (e == NULL) - e = queue->_array[head & qsize_1]; -#else - if (e == NULL) - return NULL; -#endif - - queue->_array[head & qsize_1] = NULL; - ppc_msync(); - - head ++; - queue->_l2state->Consumer = head; - - 
//Charm++ does not require message ordering - //So we dont acquire overflow mutex here - uint64_t n = head + queue->_qsize; - // is atomic-store needed? - L2_AtomicStore(&queue->_l2state->UpperBound, n); - return (void*) e; - } - - //We dont have an overflowQ - if (!queue->_useOverflowQ) - return NULL; - - /* head == tail (head cannot be greater than tail) */ - if (PCQueueLength(queue->_overflowQ) > 0) { - pthread_mutex_lock(&queue->_overflowMutex); - e = PCQueuePop (queue->_overflowQ); - pthread_mutex_unlock(&queue->_overflowMutex); - - return (void *) e; - } - - return (void *) e; -} - -int L2AtomicQueueEmpty (L2AtomicQueue *queue) { - return ( (PCQueueLength(queue->_overflowQ) == 0) && - (queue->_l2state->Producer == queue->_l2state->Consumer) ); -} - -//spin block in the L2 atomic queue till there is a message. fail and -//return after n iterations -int L2AtomicQueueSpinWait (L2AtomicQueue * queue, - int n) -{ - if (!L2AtomicQueueEmpty(queue)) - return 0; //queue is not empty so return - - uint64_t head, tail; - head = queue->_l2state->Consumer; - - size_t i = n; - do { - tail = queue->_l2state->Producer; - i--; - } - //While the queue is empty and i < n - while (head == tail && i != 0); - - return 0; //fail queue is empty -} - -//spin block in the L2 atomic queue till there is a message. 
fail and -//return after n iterations -int L2AtomicQueue2QSpinWait (L2AtomicQueue * queue0, - L2AtomicQueue * queue1, - int n) -{ - if (!L2AtomicQueueEmpty(queue0)) - return 0; //queue0 is not empty so return - - if (!L2AtomicQueueEmpty(queue1)) - return 0; //queue is not empty so return - - uint64_t head0, tail0; - uint64_t head1, tail1; - - head0 = queue0->_l2state->Consumer; - head1 = queue1->_l2state->Consumer; - - size_t i = n; - do { - tail0 = queue0->_l2state->Producer; - tail1 = queue1->_l2state->Producer; - i --; - } while (head0==tail0 && head1==tail1 && i!=0); - - return 0; -} - - - -#endif diff --git a/src/arch/pami/Makefile.machine b/src/arch/pami-bluegeneq/Makefile.machine similarity index 100% copy from src/arch/pami/Makefile.machine copy to src/arch/pami-bluegeneq/Makefile.machine diff --git a/src/arch/pami-bluegeneq/conv-mach-smp.h b/src/arch/pami-bluegeneq/conv-mach-smp.h index d6c8f652b6..b84346c86a 100644 --- a/src/arch/pami-bluegeneq/conv-mach-smp.h +++ b/src/arch/pami-bluegeneq/conv-mach-smp.h @@ -23,5 +23,8 @@ #define CMK_FAKE_SCHED_YIELD 1 -#define CMK_USE_L2ATOMICS 1 +#define CMK_PPC_ATOMIC_QUEUE 1 +#define CMK_PPC_ATOMIC_MUTEX 1 +//We use an L2 atomic version +#define CMK_PPC_ATOMIC_DEFAULT_IMPL 0 diff --git a/src/arch/pami-bluegeneq/conv-mach.h b/src/arch/pami-bluegeneq/conv-mach.h index c6450474f4..c5df57b019 100644 --- a/src/arch/pami-bluegeneq/conv-mach.h +++ b/src/arch/pami-bluegeneq/conv-mach.h @@ -46,6 +46,14 @@ // This needs to be compiled with gcc only #define CMK_TIMER_USE_BLUEGENEQ 1 +#define CMK_TYPEDEF_INT2 short +#define CMK_TYPEDEF_INT4 int +#define CMK_TYPEDEF_INT8 long long +#define CMK_TYPEDEF_UINT2 unsigned short +#define CMK_TYPEDEF_UINT4 unsigned int +#define CMK_TYPEDEF_UINT8 unsigned long long +#define CMK_TYPEDEF_FLOAT4 float +#define CMK_TYPEDEF_FLOAT8 double #define CMK_WHEN_PROCESSOR_IDLE_BUSYWAIT 1 #define CMK_WHEN_PROCESSOR_IDLE_USLEEP 0 @@ -64,5 +72,8 @@ #define CMK_NO_ISO_MALLOC 1 +#undef 
CMI_DIRECT_MANY_TO_MANY_DEFINED +#define CMI_DIRECT_MANY_TO_MANY_DEFINED 1 + #endif diff --git a/src/arch/pami-bluegeneq/conv-mach.sh b/src/arch/pami-bluegeneq/conv-mach.sh index 53969cc124..6187b90c89 100644 --- a/src/arch/pami-bluegeneq/conv-mach.sh +++ b/src/arch/pami-bluegeneq/conv-mach.sh @@ -46,8 +46,8 @@ CMK_CC="bgxlc_r -qcpluscmt -qhalt=e -qnokeyword=__int128 -qtls=local-exec" CMK_CXXPP="$BGQ_BIN/powerpc64-bgq-linux-g++ -E " CMK_GCXX="$BGQ_BIN/powerpc64-bgq-linux-g++ $GCC_OPTS " CMK_CF77="bgxlf_r " -CMK_CF90="bgxlf90_r -qsuffix=f=f90" -CMK_CF90_FIXED="bgxlf90_r " +CMK_CF90="bgxlf90_r -qsuffix=f=f90" +CMK_CF90_FIXED="bgxlf90_r " CMK_LD="$CMK_CC" CMK_LDXX="$CMK_CXX" @@ -64,7 +64,7 @@ CMK_QT="aix" CMK_LD_LIBRARY_PATH="-Wl,-rpath,$CHARMLIBSO/" CMK_LIBS='-lckqt' -CMK_SYSINC="$BGQ_INC" +CMK_SYSINC="$BGQ_INC" CMK_SYSLIBS="$BGQ_LIB" CMK_F90LIBS="-lxlf90 -lxlopt -lxl -lxlfmath" CMK_MOD_NAME_ALLCAPS=1 diff --git a/src/arch/pami-bluegeneq/memalloc.c b/src/arch/pami-bluegeneq/memalloc.c deleted file mode 100644 index 408d2e2b0e..0000000000 --- a/src/arch/pami-bluegeneq/memalloc.c +++ /dev/null @@ -1,126 +0,0 @@ - -#include - -#define ALIGNMENT 64 -#define ALIGNMENT2 128 -#define SMSG_SIZE 4096 -#define N_SMSG_ELEM 512 -#define MAX_SMSG_ELEM 4096 -#define LMSG_SIZE 16384 -#define N_LMSG_ELEM 128 -#define MAX_LMSG_ELEM 2048 - -typedef struct CmiMemAllocHdr_bgq_t { - int rank; - int size; - int tobuf; - //Align the application buffer to 32 bytes - char dummy[ALIGNMENT - sizeof(CmiChunkHeader) - 3*sizeof(int)]; -} CmiMemAllocHdr_bgq; - -typedef struct _memstruct { - L2AtomicQueue memQ; - int allocated_msg; - //char pad[ALIGNMENT2 - sizeof(L2AtomicQueue) - sizeof(int)]; -} L2MemStruct; - -static int _nodeStart; -L2MemStruct *sL2MemallocVec; -L2MemStruct *bL2MemallocVec; - -void *CmiAlloc_bgq (int size) { - CmiMemAllocHdr_bgq *hdr = NULL; - char *buf; - - int myrank = Kernel_ProcessorID() - _nodeStart; - - if (size <= SMSG_SIZE) { - hdr = L2AtomicDequeue 
(&(sL2MemallocVec[myrank].memQ)); - if (hdr == NULL) { - if(sL2MemallocVec[myrank].allocated_msg > MAX_SMSG_ELEM) { - hdr = (CmiMemAllocHdr_bgq *)memalign(ALIGNMENT, size + sizeof(CmiMemAllocHdr_bgq)); - hdr->tobuf = 0; - } else { - hdr = (CmiMemAllocHdr_bgq *) memalign(ALIGNMENT, SMSG_SIZE + sizeof(CmiMemAllocHdr_bgq)); - sL2MemallocVec[myrank].allocated_msg++; - hdr->size = SMSG_SIZE; - hdr->tobuf = 1; - } - } - } - else if (size <= LMSG_SIZE) { - hdr = L2AtomicDequeue (&(bL2MemallocVec[myrank].memQ)); - if (hdr == NULL) { - if(bL2MemallocVec[myrank].allocated_msg > MAX_LMSG_ELEM) { - hdr = (CmiMemAllocHdr_bgq *)memalign(ALIGNMENT, size + sizeof(CmiMemAllocHdr_bgq)); - hdr->tobuf = 0; - } else { - hdr = (CmiMemAllocHdr_bgq *) memalign(ALIGNMENT, LMSG_SIZE + sizeof(CmiMemAllocHdr_bgq)); - bL2MemallocVec[myrank].allocated_msg++; - hdr->size = LMSG_SIZE; - hdr->tobuf = 1; - } - } - } - else { - hdr = (CmiMemAllocHdr_bgq *) memalign(ALIGNMENT, size + sizeof(CmiMemAllocHdr_bgq)); - hdr->size = size; - hdr->tobuf = 0; - } - - hdr->rank = myrank; - buf = (char*)hdr + sizeof(CmiMemAllocHdr_bgq); - - return buf; -} - -void CmiFree_bgq (void *buf) { - CmiMemAllocHdr_bgq *hdr = (CmiMemAllocHdr_bgq *)((char*)buf - sizeof(CmiMemAllocHdr_bgq)); - int rc = L2A_EAGAIN; - - if (hdr->tobuf && hdr->size == SMSG_SIZE) - rc = L2AtomicEnqueue (&(sL2MemallocVec[hdr->rank].memQ), hdr); - else if (hdr->tobuf && hdr->size == LMSG_SIZE) - rc = L2AtomicEnqueue (&(bL2MemallocVec[hdr->rank].memQ), hdr); - - //queues are full or large buf - if (rc == L2A_EAGAIN) { - if(hdr->tobuf) { - if(hdr->size == SMSG_SIZE) - sL2MemallocVec[hdr->rank].allocated_msg--; - else - bL2MemallocVec[hdr->rank].allocated_msg--; - } - free_nomigrate(hdr); - } -} - -void CmiMemAllocInit_bgq (void * l2mem, - size_t l2memsize) -{ - int i = 0; - int node_size = 64/Kernel_ProcessCount(); - _nodeStart = node_size * Kernel_MyTcoord(); - //We want to align headers to 32 bytes - 
CmiAssert(sizeof(CmiMemAllocHdr_bgq)+sizeof(CmiChunkHeader) == ALIGNMENT); - - CmiAssert (l2memsize >= 2 * node_size * sizeof(L2AtomicState)); - sL2MemallocVec = (L2MemStruct *)memalign(ALIGNMENT,sizeof(L2MemStruct)*node_size); - bL2MemallocVec = (L2MemStruct *)memalign(ALIGNMENT,sizeof(L2MemStruct)*node_size); - - for (i = 0; i < node_size; ++i) { - L2AtomicQueueInit ((char *)l2mem + 2*i*sizeof(L2AtomicState), - sizeof(L2AtomicState), - &(sL2MemallocVec[i].memQ), - 0, /*No Overflow*/ - N_SMSG_ELEM /*512 entries in short q*/); - - L2AtomicQueueInit ((char *)l2mem + (2*i+1)*sizeof(L2AtomicState), - sizeof(L2AtomicState), - &(bL2MemallocVec[i].memQ), - 0, - N_LMSG_ELEM /*128 entries in long q*/); - sL2MemallocVec[i].allocated_msg = 0; - bL2MemallocVec[i].allocated_msg = 0; - } -} diff --git a/src/arch/pami-bluegeneq/ppc_atomicq_impl.h b/src/arch/pami-bluegeneq/ppc_atomicq_impl.h new file mode 100644 index 0000000000..ac1d748fad --- /dev/null +++ b/src/arch/pami-bluegeneq/ppc_atomicq_impl.h @@ -0,0 +1,50 @@ + +#ifndef __L2_ATOMIC_PPCQ_H__ +#define __L2_ATOMIC_PPCQ_H__ + +#include "spi/include/l2/atomic.h" +#include "spi/include/l1p/flush.h" +#include "pami.h" + +typedef pami_result_t (*pamix_proc_memalign_fn) (void**, size_t, size_t, const char*); + +///////////////////////////////////////////////////// +// \brief Basic atomic operations should be defined +// PPC_AtomicStore : store a value to the atomic counter +// PPC_AtomicLoadIncrementBounded : bounded increment +// PPC_AtomicWriteFence : a producer side write fence +// PPC_AtomicReadFence : consumer side read fence +// PPC_AtomicCounterAllocate : allocate atomic counters +///////////////////////////////////////////////////// + +#define CMI_PPC_ATOMIC_FAIL 0x8000000000000000UL + +typedef uint64_t ppc_atomic_type_t; +typedef uint64_t ppc_atomic_t; + +#define PPC_AQVal(x) x + +static inline void PPC_AtomicCounterAllocate (void **atomic_mem, + size_t atomic_memsize) +{ + pami_extension_t l2; + pamix_proc_memalign_fn 
PAMIX_L2_proc_memalign; + size_t size = atomic_memsize; + pami_result_t rc = PAMI_SUCCESS; + + rc = PAMI_Extension_open(NULL, "EXT_bgq_l2atomic", &l2); + CmiAssert (rc == 0); + PAMIX_L2_proc_memalign = (pamix_proc_memalign_fn)PAMI_Extension_symbol(l2, "proc_memalign"); + rc = PAMIX_L2_proc_memalign(atomic_mem, 64, size, NULL); + CmiAssert (rc == 0); +} + +#define PPC_AtomicLoadIncrementBounded(counter) L2_AtomicLoadIncrementBounded(counter); + +#define PPC_AtomicStore(counter, val) L2_AtomicStore(counter, val) + +#define PPC_AtomicReadFence() ppc_msync() + +#define PPC_AtomicWriteFence() L1P_FlushRequests() + +#endif diff --git a/src/arch/pami-linux-ppc64le/cc-gcc.h b/src/arch/pami-linux-ppc64le/cc-gcc.h new file mode 100644 index 0000000000..40a8c178f1 --- /dev/null +++ b/src/arch/pami-linux-ppc64le/cc-gcc.h @@ -0,0 +1 @@ +/* empty */ diff --git a/src/arch/pami-linux-ppc64le/cc-gcc.sh b/src/arch/pami-linux-ppc64le/cc-gcc.sh new file mode 100644 index 0000000000..2b5c32d663 --- /dev/null +++ b/src/arch/pami-linux-ppc64le/cc-gcc.sh @@ -0,0 +1,29 @@ + +CMK_CPP_CHARM='/lib/cpp -P' +CMK_CPP_C='gcc -E' +CMK_CC='gcc ' +CMK_CXX='g++ ' +CMK_CXXPP='g++ -E ' +CMK_LD='g++ ' +CMK_LDXX='g++ ' + +CMK_C_OPTIMIZE='-O3 -g' +CMK_CXX_OPTIMIZE='-O3 -g' + +CMK_RANLIB='ranlib' +CMK_LIBS='-lckqt ' +CMK_LD_LIBRARY_PATH="-Wl,-rpath,$CHARMLIBSO/" + +CMK_NATIVE_LIBS='' +CMK_NATIVE_CC='gcc ' +CMK_NATIVE_LD='gcc ' +CMK_NATIVE_CXX='g++ ' +CMK_NATIVE_LDXX='g++ ' + +CMK_CF77='gfortran' +CMK_CF90='gfortran' +CMK_F90LIBS='-lgfortran' +CMK_MOD_NAME_ALLCAPS=1 +CMK_MOD_EXT="mod" +CMK_F90_USE_MODDIR=1 +CMK_F90_MODINC="-p" diff --git a/src/arch/pami-linux-ppc64le/cc-xlc.h b/src/arch/pami-linux-ppc64le/cc-xlc.h new file mode 100644 index 0000000000..9e22b2b72d --- /dev/null +++ b/src/arch/pami-linux-ppc64le/cc-xlc.h @@ -0,0 +1 @@ +/* empty */ diff --git a/src/arch/pami-linux-ppc64le/cc-xlc.sh b/src/arch/pami-linux-ppc64le/cc-xlc.sh new file mode 100644 index 0000000000..1bb8bf6d7f --- /dev/null +++ 
b/src/arch/pami-linux-ppc64le/cc-xlc.sh @@ -0,0 +1 @@ +# empty diff --git a/src/arch/pami-linux-ppc64le/charmrun b/src/arch/pami-linux-ppc64le/charmrun new file mode 100755 index 0000000000..65b3d48c92 --- /dev/null +++ b/src/arch/pami-linux-ppc64le/charmrun @@ -0,0 +1,302 @@ +#!/bin/sh +# +# Conv-host for MPI: +# Translates +pN-style conv-host options into +# mpirun -npN options. + +args="" +pes=1 +ppn=1 +machinefile="" + +while [ $# -gt 0 ] +do + case $1 in + +ppn|++ppn) + args=$args" +ppn "$2 + ppn=$2 + shift + ;; + +ppn[0-9]*) + args=$args" "$1 + ppn=`echo $1 | awk '{print substr($1,5)}'` + ;; + ++ppn[0-9]*) + args=$args" "$1 + ppn=`echo $1 | awk '{print substr($1,6)}'` + ;; + +p) + pes=$2 + shift + ;; + +pemap) + args=$args" "$1" "$2 + shift + ;; + +p[0-9]*) + pes=`echo $1 | awk '{print substr($1,3)}'` + ;; + -machinefile) + machinefile=$2 + args=" "$1" "$2" "$args + shift + ;; + *) + args=$args" "$1 + ;; + esac + shift +done + +rem=`expr $pes % $ppn` +quot=`expr $pes / $ppn` +if [ $rem -ne 0 ]; +then + printf "p = $pes should be a multiple of ppn = $ppn\n" + exit 1 +else + pes=$quot +fi + +printf "\nRunning on $pes processors: $args\n" + + +if [ -n "$PBS_NODEFILE" ] +then +# we are in a job shell + aprun=`which aprun 2>/dev/null` + if test -n "$aprun" + then + echo aprun -n $pes $args + $aprun -n $pes $args + else + mpirun_cmd=`which mpirun 2>/dev/null` + if test -n "$mpirun_cmd" + then + if echo $mpirun_cmd | grep 'mvapich2' > /dev/null 2>/dev/null + then + # if daemon not started, start it + if ! mpdtrace > /dev/null 2>/dev/null + then + mvapich2-start-mpd + fi + mpirun -np $pes $args + # mpdallexit + else # normal case + test -z "$machinefile" && args=-machinefile" "$PBS_NODEFILE" "$args + echo mpirun -np $pes $args + mpirun -np $pes $args + fi + else + echo "Charmrun> can not locate mpirun in order to run the program." 
+ exit 1 + fi + fi +elif [ -n "$LSB_HOSTS" ] +then +# Tungsten + echo cmpirun -lsf -poll -no_smp -gm_long 200000 $args + cmpirun -lsf -poll -no_smp -gm_long 200000 $args +elif [ -n "$PBS_QUEUE" -o -n "$LSF_QUEUE" ] +then +# Interactive mode: create, and submit a batch job + script="charmrun_script.$$.sh" + indir=`pwd` + output="$indir/charmrun_script.$$.stdout" + result="$indir/charmrun_script.$$.result" + rm -f $result +# Some machine specific + USE_LSF=0 +# 10 minutes + walllimit=10 + queue_stat=qstat + queue_qsub=qsub + queue_kill=qdel + hostname=`hostname` + case "$hostname" in + turing*.turing.uiuc.edu) + ppn='#PBS -l nodes='$pes':ppn=1' + extra='-machinefile $PBS_NODEFILE' + ;; + tg-login*|honest*.ncsa.uiuc.edu) + # always ppn=2 + nodes=`expr \( $pes + 1 \) / 2` + test $pes -eq 1 && ppns=1 || ppns=2 + ppn='#PBS -l nodes='$nodes':ppn='$ppns + extra='-machinefile $PBS_NODEFILE' + ;; + co-login*.ncsa.uiuc.edu) + mem='#PBS -l mem=500mb' + ncpus="#PBS -l ncpus=$pes" + ;; + tun*) + USE_LSF=1 + queue_stat=bjobs + queue_qsub=bsub + queue_kill=bkill + ;; + abe*) + # always ppn=2 + nodes=`expr \( $pes + 1 \) / 2` + test $pes -eq 1 && ppns=1 || ppns=2 + ppn='#PBS -l nodes='$nodes':ppn='$ppns + extra='-machinefile $PBS_NODEFILE' + ;; + kraken*) + ncores=`expr \( $pes + 11 \) / 12 \* 12` + ncpus="#PBS -l size=$ncores" + ppn='' + ;; + *) + ncpus="#PBS -l ncpus=$pes" + ;; + esac + if test $USE_LSF -eq 0 + then + mpirun=`which aprun 2>/dev/null` + npcmd="-n " + if test -z "$mpirun" + then + mpirun=`which mpirun 2>/dev/null` + npcmd="-np " + fi + cat > $script << EOF +#!/bin/sh +# This is a charmrun-generated PBS batch job script. +# The lines starting with #PBS are queuing system flags: +# +$ppn +# +$ncpus +# +#PBS -l walltime=$walllimit:00 +# +$mem +# +#PBS -q $PBS_QUEUE +# +#PBS -N autobuild +# +#PBS -j oe +# +#PBS -o $output + +cd $indir + +cat \$PBS_NODEFILE +echo $mpirun $npcmd $pes $extra $args +$mpirun $npcmd $pes $extra $args + +# Save mpirun exit status +status=\$? 
+echo \$status > $result +EOF + else +# use LSF + mpirun="cmpirun -lsf -poll -no_smp -gm_long 200000" + cat > $script << EOF +#!/bin/sh +# This is a charmrun-generated PBS batch job script. +# The lines starting with #PBS are queuing system flags: +# +#BSUB -J autobuild +#BSUB -W 0:$walllimit +#BSUB -n $pes +#BSUB -o $output + +cd $indir +echo \$LSB_MCPU_HOSTS +$mpirun $args +# Save mpirun exit status +status=\$? +echo \$status > $result +EOF + fi + +End() { + echo "Charmrun> $queue_kill $jobid ..." + $queue_kill $jobid + rm -f $script + exit $1 +} + + echo "Submitting batch job for> $mpirun -np $pes $args" + echo " using the command> $queue_qsub $script" + chmod 755 $script + while [ -z "$jobid" ] + do + [ $USE_LSF = 0 ] && jobid=`$queue_qsub $script|tail -1` + [ $USE_LSF = 1 ] && jobid=`$queue_qsub < $script|tail -1|sed -e 's/[^0-9]*//g'` + done + echo "Job enqueued under job ID $jobid" +# kill job if interrupted + trap 'End 1' 2 3 + retry=0 +# Wait for the job to complete, by checking its status + while [ true ] + do + $queue_stat $jobid > tmp.$$ + exitstatus=$? + if test -f $output + then +# The job is done-- print its output + rm tmp.$$ +# When job hangs, result file does not exist + test -f $result && status=`cat $result` || status=1 + test $status -eq 0 && status=`grep 'End of program' $output > /dev/null 2>&1` + cat $output + rm -f $result + test -f $status && rm -f $script $output + exit $status + fi +# The job is still queued or running-- print status and wait + tail -1 tmp.$$ + rm tmp.$$ +# Job ID may not exist now + if test $exitstatus -ne 0 + then +# retry a few times when error occurs + retry=`expr $retry + 1` + if test $retry -gt 6 + then + echo "Charmrun> too many errors, abort!" 
+ exit 1 + else + sleep 15 + fi + else +# job still in queue + retry=0 + sleep 20 + fi + done +else + mpirun_cmd=`which mpirun 2>/dev/null` + if test -n "$mpirun_cmd" + then + [ -n "$MPI_MACHINEFILE" ] && args=" -machinefile $MPI_MACHINEFILE $args" + setarch_cmd=`which setarch 2>/dev/null` + if [ -n "$setarch_cmd" -a -x "$setarch_cmd" ] + then + # Disables randomization of the virtual address space (turns on + # ADDR_NO_RANDOMIZE). + cur_arch=`uname -m` + echo "charmrun> $setarch_cmd $cur_arch -R mpirun -np $pes $args" + $setarch_cmd $cur_arch -R mpirun -np $pes $args + else + echo "charmrun> mpirun -np $pes $args" + mpirun -np $pes $args + fi + else + mpiexec_cmd=`which mpiexec 2>/dev/null` + if test -n "$mpiexec_cmd" + then + echo "charmrun> $mpiexec_cmd -n $pes $args" + echo + "$mpiexec_cmd" -n $pes $args + else + echo "Don't know how to run MPI program." + exit 1 + fi + fi +fi diff --git a/src/arch/pami-bluegeneq/conv-mach-smp.h b/src/arch/pami-linux-ppc64le/conv-mach-smp.h similarity index 63% copy from src/arch/pami-bluegeneq/conv-mach-smp.h copy to src/arch/pami-linux-ppc64le/conv-mach-smp.h index d6c8f652b6..d77f6d7ed7 100644 --- a/src/arch/pami-bluegeneq/conv-mach-smp.h +++ b/src/arch/pami-linux-ppc64le/conv-mach-smp.h @@ -1,20 +1,17 @@ -#define CMK_SMP 1 +#define CMK_SMP 1 +#undef CMK_NODE_QUEUE_AVAILABLE +#define CMK_NODE_QUEUE_AVAILABLE 1 #undef CMK_SHARED_VARS_UNAVAILABLE #undef CMK_SHARED_VARS_POSIX_THREADS_SMP #define CMK_SHARED_VARS_UNAVAILABLE 0 #define CMK_SHARED_VARS_POSIX_THREADS_SMP 1 -/* Right now only comm thread (no multicore) and tls thread version with gcc works on Blue Gene*/ #define CMK_MULTICORE 0 -#ifdef __GNUC__ #define CMK_NOT_USE_TLS_THREAD 0 -#else -#define CMK_NOT_USE_TLS_THREAD 0 -#endif #define CMK_PCQUEUE_LOCK 1 /*#define PCQUEUE_MULTIQUEUE 1*/ @@ -23,5 +20,7 @@ #define CMK_FAKE_SCHED_YIELD 1 -#define CMK_USE_L2ATOMICS 1 +#define CMK_PPC_ATOMIC_QUEUE 1 +#define CMK_PPC_ATOMIC_MUTEX 1 +#define CMK_PPC_ATOMIC_DEFAULT_IMPL 1 diff 
--git a/src/arch/pami-linux-ppc64le/conv-mach-smp.sh b/src/arch/pami-linux-ppc64le/conv-mach-smp.sh new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/arch/pami-bluegeneq/conv-mach.h b/src/arch/pami-linux-ppc64le/conv-mach.h similarity index 57% copy from src/arch/pami-bluegeneq/conv-mach.h copy to src/arch/pami-linux-ppc64le/conv-mach.h index c6450474f4..b11ffdefc0 100644 --- a/src/arch/pami-bluegeneq/conv-mach.h +++ b/src/arch/pami-linux-ppc64le/conv-mach.h @@ -1,29 +1,18 @@ #ifndef _CONV_MACH_H #define _CONV_MACH_H -#define CMK_NO_OUTSTANDING_SENDS 0 - -#define CMK_64BIT 1 - -//#define CMK_MEMORY_PREALLOCATE_HACK 1 - -//#define CMK_CONVERSE_MPI 1 - -#define CMK_NO_SOCKETS 1 +#define CMK_PAMI_LINUX_PPC8 1 #define CMK_DEFAULT_MAIN_USES_COMMON_CODE 1 #define CMK_GETPAGESIZE_AVAILABLE 1 -#define CMK_IS_HETERO 0 - #define CMK_MALLOC_USE_GNU_MALLOC 0 #define CMK_MALLOC_USE_OS_BUILTIN 1 #define CMK_MEMORY_PAGESIZE 8192 #define CMK_MEMORY_PROTECTABLE 1 - #define CMK_SHARED_VARS_UNAVAILABLE 1 #define CMK_SHARED_VARS_UNIPROCESSOR 0 @@ -31,38 +20,33 @@ #define CMK_SIGNAL_USE_SIGACTION 0 #define CMK_SIGNAL_USE_SIGACTION_WITH_RESTART 1 -#define CMK_SYNCHRONIZE_ON_TCP_CLOSE 0 - -#define CMK_THREADS_USE_CONTEXT 0 -#define CMK_THREADS_USE_JCONTEXT 1 -#define CMK_THREADS_USE_PTHREADS 0 -#define CMK_THREADS_ARE_WIN32_FIBERS 0 - #define CMK_THREADS_REQUIRE_NO_CPV 0 #define CMK_TIMER_USE_GETRUSAGE 0 #define CMK_TIMER_USE_SPECIAL 0 #define CMK_TIMER_USE_TIMES 0 -// This needs to be compiled with gcc only -#define CMK_TIMER_USE_BLUEGENEQ 1 +#define CMK_TIMER_USE_RDTSC 0 +#define CMK_TIMER_USE_PPC64 1 + +#define CMK_THREADS_USE_CONTEXT 1 +#define CMK_THREADS_USE_JCONTEXT 0 +#define CMK_THREADS_USE_PTHREADS 0 +#define CMK_TYPEDEF_INT2 short +#define CMK_TYPEDEF_INT4 int +#define CMK_TYPEDEF_INT8 long long +#define CMK_TYPEDEF_UINT2 unsigned short +#define CMK_TYPEDEF_UINT4 unsigned int +#define CMK_TYPEDEF_UINT8 unsigned long long +#define CMK_TYPEDEF_FLOAT4 float 
+#define CMK_TYPEDEF_FLOAT8 double #define CMK_WHEN_PROCESSOR_IDLE_BUSYWAIT 1 #define CMK_WHEN_PROCESSOR_IDLE_USLEEP 0 - #define CMK_WEB_MODE 1 #define CMK_DEBUG_MODE 0 -#define CMK_LBDB_ON 1 - -#undef CMK_CCS_AVAILABLE -#define CMK_CCS_AVAILABLE 0 - -#define CMK_BLUEGENEQ 1 -#define CMK_BLUEGENEQ_OPTCOPY 1 - -#define CMK_NO_ISO_MALLOC 1 +#define CMK_LBDB_ON 1 #endif - diff --git a/src/arch/pami-linux-ppc64le/conv-mach.sh b/src/arch/pami-linux-ppc64le/conv-mach.sh new file mode 100644 index 0000000000..e222ec1ea5 --- /dev/null +++ b/src/arch/pami-linux-ppc64le/conv-mach.sh @@ -0,0 +1,56 @@ + +PAMI_INC=/opt/ibmhpc/pecurrent/ppe.pami/include +PAMI_LIB=/opt/ibmhpc/pecurrent/ppe.pami/gnu/lib64/pami64 + +CXX=xlC_r +CC=xlc_r + +CMK_CPP_CHARM='/lib/cpp -P' +CMK_CPP_C="$CC -E" +CMK_CC="$CC " +CMK_CXX="$CXX " +CMK_CXXPP="$CXX -E " +CMK_LD="$CMK_CC " +CMK_LDXX="$CMK_CXX " + +CMK_C_OPTIMIZE='-O3 -Q -g' +CMK_CXX_OPTIMIZE='-O3 -Q -g' + +CMK_RANLIB='ranlib' +CMK_LIBS='-lckqt' +CMK_LD_LIBRARY_PATH="-Wl,-rpath,$CHARMLIBSO/" + +CMK_SYSINC="-I $PAMI_INC" +#CMK_SYSLIBS="-L $PAMI_LIB -L /usr/lib/powerpc64le-linux-gnu -lpami -libverbs -lnuma -lstdc++ -lc -ldl -lrt -lpthread" +CMK_SYSLIBS="-L $PAMI_LIB -L /usr/lib/powerpc64le-linux-gnu -lpami -libverbs -lstdc++ -lc -ldl -lrt -lpthread" + +CMK_NATIVE_LIBS='' +CMK_NATIVE_CC="$CC -q64" +CMK_NATIVE_LD="$CC -q64" +CMK_NATIVE_CXX="$CXX -q64" +CMK_NATIVE_LDXX="$CXX -q64" + +# fortran compiler +CMK_CF77="xlf_r -q64 -fPIC " +CMK_CF90="xlf90_r -q64 -fPIC -qsuffix=f=f90" +CMK_CF90_FIXED="xlf90_r -q64 -fPIC" + +CMK_MOD_NAME_ALLCAPS=1 +CMK_MOD_EXT="mod" +CMK_F90_MODINC="-p" +CMK_F90_USE_MODDIR="" + +F90DIR=`which ifort 2> /dev/null` +if test -h "$F90DIR" +then + F90DIR=`readlink $F90DIR` +fi +if test -x "$F90DIR" +then + F90LIBDIR="`dirname $F90DIR`/../lib" + F90MAIN="$F90LIBDIR/for_main.o" +fi +# for_main.o is important for main() in f90 code +CMK_F90MAINLIBS="$F90MAIN " +CMK_F90LIBS="-L$F90LIBDIR -lifcore -lifport " +CMK_F77LIBS="$CMK_F90LIBS" 
diff --git a/src/arch/pami/Makefile.machine b/src/arch/pami/Makefile.machine index b404daf863..e69de29bb2 100644 --- a/src/arch/pami/Makefile.machine +++ b/src/arch/pami/Makefile.machine @@ -1,7 +0,0 @@ -#force compilation of QPX based code with -O3 - -LIBCONV_UTIL := ${LIBCONV_UTIL} cmimemcpy_qpx.o - -cmimemcpy_qpx.o: cmimemcpy_qpx.c cmimemcpy_qpx.h -cmimemcpy_qpx.o: CFLAGS:=${CFLAGS} -O3 - diff --git a/src/arch/pami/PPCAtomicMutex.h b/src/arch/pami/PPCAtomicMutex.h new file mode 100755 index 0000000000..7500f59164 --- /dev/null +++ b/src/arch/pami/PPCAtomicMutex.h @@ -0,0 +1,82 @@ + +#ifndef __PPC_ATOMIC_MUTEX__ +#define __PPC_ATOMIC_MUTEX__ + +#include +#include +#include +#include + +#if CMK_PPC_ATOMIC_DEFAULT_IMPL +#include "default_ppcq.h" +#else +//define new ppc atomics in the pami instance directory +#include "ppc_atomicq_impl.h" +#endif + +typedef struct _ppc_atomic_mutex_t +{ + volatile ppc_atomic_t counter; + volatile ppc_atomic_t bound; +} PPCAtomicMutex; + +PPCAtomicMutex *PPCAtomicMutexInit (void * atomic_mem, + size_t atomic_size) +{ + //Verify counter array is 64-byte aligned + assert( (((uintptr_t) atomic_mem) & (0x0F)) == 0 ); + assert (sizeof(PPCAtomicMutex) <= atomic_size); + + PPCAtomicMutex *mutex = (PPCAtomicMutex*) atomic_mem; + PPC_AtomicStore(&mutex->counter, 0); + PPC_AtomicStore(&mutex->bound, 1); + + return mutex; +} + +/** + * \brief Try to acquire a mutex + * \param[in] mutex pointer + * \return 0 Lock successfully acquired + * \return 1 Lock was not acquired + */ +static inline int PPCAtomicMutexTryAcquire (PPCAtomicMutex *mutex) +{ + size_t rc = PPC_AtomicLoadIncrementBounded(&mutex->counter); + if (rc == CMI_PPC_ATOMIC_FAIL) + return 1; + + PPC_AtomicReadFence(); + return rc; +} + +/** + * \brief Acquire a mutex + * \param[in] mutex pointer + * \return 0 Lock successfully acquired + */ +static inline void PPCAtomicMutexAcquire (PPCAtomicMutex *mutex) +{ + size_t rc = 0; + do { + rc = 
PPC_AtomicLoadIncrementBounded(&mutex->counter); + } while (rc == CMI_PPC_ATOMIC_FAIL); + + PPC_AtomicReadFence(); +} + +/** + * \brief Release a mutex + * \param[in] mutex pointer + */ +static inline void PPCAtomicMutexRelease(PPCAtomicMutex *mutex) +{ + //Flush outstanding loads/stores + PPC_AtomicWriteFence(); + + /* Release the lock */ + PPC_AtomicStore(&(mutex->counter), 0); +} + + +#endif diff --git a/src/arch/pami/PPCAtomicQueue.h b/src/arch/pami/PPCAtomicQueue.h new file mode 100755 index 0000000000..3b82edba1e --- /dev/null +++ b/src/arch/pami/PPCAtomicQueue.h @@ -0,0 +1,210 @@ + +#ifndef __PPC_ATOMIC_QUEUE__ +#define __PPC_ATOMIC_QUEUE__ + +#include +#include +#include +#include +#include "pcqueue.h" + +#define DEFAULT_SIZE 2048 + +#define CMI_PPCQ_SUCCESS 0 +#define CMI_PPCQ_EAGAIN -1 + +///////////////////////////////////////////////////// +// \brief Basic atomic operations should to defined +// ppc_atomic_t : the datatype of the atomic (uint32_t or uint64_t) +// PPC_AtomicStore : store a value to the atomic counter +// PPC_AtomicLoadIncrementBounded : bounded increment +// PPC_AtomicWriteFence : a producer side write fence +// PPC_AtomicReadFence : consumer side read fence +// PPC_AtomicCounterAllocate : allocate atomic counters +///////////////////////////////////////////////////// + +#if CMK_PPC_ATOMIC_DEFAULT_IMPL +#include "default_ppcq.h" +#else +//define new ppc atomics in the pami instance directory +#include "ppc_atomicq_impl.h" +#endif + +#if 0 +void PPC_AtomicCounterAllocate (void **atomic_mem, size_t atomic_memsize); +ppc_atomic_type_t PPC_AtomicLoadIncrementBounded (volatile ppc_atomic_t *counter); +void PPC_AtomicStore(volatile ppc_atomic_t *counter, ppc_atomic_type_t val); +void PPC_AtomicReadFence(); +void PPC_AtomicWriteFence(); +#endif + +typedef void* PPCAtomicQueueElement; + +typedef struct _ppcatomicstate { + volatile ppc_atomic_t Producer; + volatile ppc_atomic_t UpperBound; + char pad[32 - 2*sizeof(ppc_atomic_t)]; +} 
PPCAtomicState; + +typedef struct _ppcatomicq { + PPCAtomicState * _state; + volatile void * volatile * _array; + volatile ppc_atomic_type_t _consumer; + int _qsize; + int _useOverflowQ; + PCQueue _overflowQ; //40 byte structure + char _pad[24]; //align to 64 bytes +} PPCAtomicQueue; //should be padded + +void PPCAtomicQueueInit (void * atomic_mem, + size_t atomic_memsize, + PPCAtomicQueue * queue, + int use_overflow, + int nelem) +{ + pami_result_t rc; + + //Verify counter array is 64-byte aligned +#if CMK_BLUEGENEQ + assert ( (((uintptr_t) atomic_mem) & (0x1F)) == 0 ); + assert (sizeof(PPCAtomicState) == 32); //all counters need to be lined up + assert (sizeof(PPCAtomicState) <= atomic_memsize); +#endif + + queue->_useOverflowQ = use_overflow; + + int qsize = 2; + while (qsize < nelem) + qsize *= 2; + queue->_qsize = qsize; + + queue->_state = (PPCAtomicState *) atomic_mem; + queue->_overflowQ = PCQueueCreate(); + queue->_consumer = 0; + PPC_AtomicStore(&queue->_state->Producer, 0); + PPC_AtomicStore(&queue->_state->UpperBound, qsize); + + rc = posix_memalign ((void **)&queue->_array, + 128, /* Typical L1 line size for POWER */ + sizeof(PPCAtomicQueueElement) * qsize); + + assert(rc == PAMI_SUCCESS); + memset((void*)queue->_array, 0, sizeof(PPCAtomicQueueElement)*qsize); +} + +int PPCAtomicEnqueue (PPCAtomicQueue * queue, + void * element) +{ + //fprintf(stderr,"Insert message %p\n", element); + + register int qsize_1 = queue->_qsize - 1; + ppc_atomic_type_t index = PPC_AtomicLoadIncrementBounded(&queue->_state->Producer); + PPC_AtomicWriteFence(); + if (index != CMI_PPC_ATOMIC_FAIL) { + queue->_array[index & qsize_1] = element; + return CMI_PPCQ_SUCCESS; + } + + //We dont want to use the overflow queue + if (!queue->_useOverflowQ) + return CMI_PPCQ_EAGAIN; //Q is full, try later + + //No ordering is guaranteed if there is overflow + PCQueuePush(queue->_overflowQ, element); + + return CMI_PPCQ_SUCCESS; +} + +void * PPCAtomicDequeue (PPCAtomicQueue *queue) +{ + 
ppc_atomic_type_t head, tail; + tail = PPC_AQVal(queue->_state->Producer); + head = queue->_consumer; + register int qsize_1 = queue->_qsize-1; + + volatile void *e = NULL; + if (head < tail) { + e = queue->_array[head & qsize_1]; + if (e == NULL) + return NULL; + + queue->_array[head & qsize_1] = NULL; + PPC_AtomicReadFence(); + + head ++; + queue->_consumer = head; + + //Charm++ does not require message ordering + //So we dont acquire overflow mutex here + ppc_atomic_type_t n = head + queue->_qsize; + + //Update bound every 16 consumes + if ((n & 0xF) == 0) + PPC_AtomicStore(&queue->_state->UpperBound, n); + return (void*) e; + } + + //We dont have an overflowQ + if (!queue->_useOverflowQ) + return NULL; + + e = PCQueuePop (queue->_overflowQ); + return (void *) e; +} + +int PPCAtomicQueueEmpty (PPCAtomicQueue *queue) { + return ( (PCQueueLength(queue->_overflowQ) == 0) && + (PPC_AQVal(queue->_state->Producer) == queue->_consumer) ); +} + +//spin block in the PPC atomic queue till there is a message. fail and +//return after n iterations +int PPCAtomicQueueSpinWait (PPCAtomicQueue * queue, + int n) +{ + if (!PPCAtomicQueueEmpty(queue)) + return 0; //queue is not empty so return + + ppc_atomic_type_t head, tail; + head = queue->_consumer; + + size_t i = n; + do { + tail = PPC_AQVal(queue->_state->Producer); + i--; + } + //While the queue is empty and i < n + while (head == tail && i != 0); + + return 0; //fail queue is empty +} + +//spin block in the PPC atomic queue till there is a message. 
fail and +//return after n iterations +int PPCAtomicQueue2QSpinWait (PPCAtomicQueue * queue0, + PPCAtomicQueue * queue1, + int n) +{ + if (!PPCAtomicQueueEmpty(queue0)) + return 0; //queue0 is not empty so return + + if (!PPCAtomicQueueEmpty(queue1)) + return 0; //queue is not empty so return + + ppc_atomic_type_t head0, tail0; + ppc_atomic_type_t head1, tail1; + + head0 = queue0->_consumer; + head1 = queue1->_consumer; + + size_t i = n; + do { + tail0 = PPC_AQVal(queue0->_state->Producer); + tail1 = PPC_AQVal(queue1->_state->Producer); + i --; + } while (head0==tail0 && head1==tail1 && i!=0); + + return 0; +} + +#endif diff --git a/src/arch/pami/conv-common.h b/src/arch/pami/conv-common.h index 2b175b6110..8a1598dcb0 100644 --- a/src/arch/pami/conv-common.h +++ b/src/arch/pami/conv-common.h @@ -5,7 +5,7 @@ #define CMK_HANDLE_SIGUSR 1 -#define CMK_MSG_HEADER_EXT_ CmiUInt2 rank, hdl,xhdl,info, stratid; unsigned char cksum, magic; int root, size; CmiUInt2 redID, padding; +#define CMK_MSG_HEADER_EXT_ CmiUInt2 rank, hdl,xhdl,info, stratid; unsigned char cksum, magic; int root, size; CmiUInt2 redID, padding; #define CMK_MSG_HEADER_BASIC CMK_MSG_HEADER_EXT #define CMK_MSG_HEADER_EXT { CMK_MSG_HEADER_EXT_ } @@ -34,8 +34,7 @@ #undef CMK_HAS_FDATASYNC_FUNC #define CMK_HAS_FDATASYNC_FUNC 0 -//#define CMI_DIRECT_MANY_TO_MANY_DEFINED 0 +#define CMI_DIRECT_MANY_TO_MANY_DEFINED 0 #define CMK_PERSISTENT_COMM 0 -#define CMI_DIRECT_MANY_TO_MANY_DEFINED 1 diff --git a/src/arch/pami/default_ppcq.h b/src/arch/pami/default_ppcq.h new file mode 100644 index 0000000000..80e1112582 --- /dev/null +++ b/src/arch/pami/default_ppcq.h @@ -0,0 +1,98 @@ + +#ifndef __DEFAULT_PPCQ_H__ +#define __DEFAULT_PPCQ_H__ + +#include "pami.h" + +///////////////////////////////////////////////////// +// \brief Basic atomic operations should to defined +// PPC_AtomicStore : store a value to the atomic counter +// PPC_AtomicLoadIncrementBounded : bounded increment +// PPC_AtomicWriteFence : a producer side 
write fence +// PPC_AtomicReadFence : consumer side read fence +// PPC_AtomicCounterAllocate : allocate atomic counters +///////////////////////////////////////////////////// + +#define CMI_PPC_ATOMIC_FAIL 0x8000000000000000UL + +typedef uint64_t ppc_atomic_type_t; + +typedef struct _ppc_atomic_t { + volatile uint64_t val; + char _pad[56]; +} ppc_atomic_t; + +#define PPC_AQVal(x) ((x).val) + +static inline void PPC_AtomicCounterAllocate (void **atomic_mem, + size_t atomic_memsize) +{ + posix_memalign(atomic_mem, 64, atomic_memsize); +} + +// Load Reserved: 64bit atom +static inline ppc_atomic_type_t PPC_AtomicLoadReserved ( volatile ppc_atomic_t *ptr ) +{ + ppc_atomic_type_t val; + __asm__ __volatile__ ("ldarx %[val],0,%[ptr]" + : [val] "=r" (val) + : [ptr] "r" (&ptr->val) + : "cc"); + + return( val ); +} + +static inline int PPC_AtomicStoreConditional( volatile ppc_atomic_t *ptr, ppc_atomic_type_t val ) +{ + register int rc = 1; // assume success + __asm__ __volatile__ ("stdcx. %[val],0,%[ptr];\n" + "beq 1f;\n" + "li %[rc], 0;\n" + "1: ;\n" + : [rc] "=r" (rc) + : [ptr] "r" (&ptr->val), [val] "r" (val), "0" (rc) + : "cc", "memory"); + return( rc ); +} + +static inline ppc_atomic_type_t PPC_AtomicLoadIncrementBounded (volatile ppc_atomic_t *counter) +{ + register ppc_atomic_type_t old_val, tmp_val, bound; + bound = counter[1].val; + do + { + old_val = PPC_AtomicLoadReserved( counter ); + tmp_val = old_val + 1; + + if (tmp_val > bound) + return CMI_PPC_ATOMIC_FAIL; + } + while ( !PPC_AtomicStoreConditional( counter, tmp_val ) ); + + return( old_val ); +} + +static inline void PPC_AtomicStore(volatile ppc_atomic_t *counter, ppc_atomic_type_t val) +{ + //Counter perpetually increments, so stale value is always smaller + //__asm__ __volatile__ ("lwsync":::"memory"); + counter->val = val; +} + +static inline void PPC_AtomicReadFence() +{ +#if !CMK_BLUEGENEQ //full memory barrier executed on Producer + __asm__ __volatile__ ("isync":::"memory"); +#endif +} + +static inline
void PPC_AtomicWriteFence() +{ +#if CMK_BLUEGENEQ //execute full memory barrier + __asm__ __volatile__ ("sync":::"memory"); +#else + __asm__ __volatile__ ("lwsync":::"memory"); +#endif +} + +#endif diff --git a/src/arch/pami/machine.c b/src/arch/pami/machine.c index 05a450d126..2f7a5c3b7a 100644 --- a/src/arch/pami/machine.c +++ b/src/arch/pami/machine.c @@ -1,3 +1,8 @@ + +#define _GNU_SOURCE +#include +#include + #include #include #include @@ -10,18 +15,18 @@ #include "assert.h" #include "malloc.h" +#if CMK_BLUEGENEQ #include #include "spi/include/kernel/process.h" #include "spi/include/kernel/memory.h" +#endif + #include "pami.h" #include "pami_sys.h" #if MACHINE_DEBUG_LOG FILE *debugLog = NULL; #endif -//#if CMK_SMP -//#define CMK_USE_L2ATOMICS 1 -//#endif #if !CMK_SMP #if CMK_ENABLE_ASYNC_PROGRESS @@ -29,13 +34,15 @@ FILE *debugLog = NULL; #endif #endif - -#if CMK_SMP && CMK_USE_L2ATOMICS -#include "L2AtomicQueue.h" -#include "L2AtomicMutex.h" +#if CMK_SMP && CMK_PPC_ATOMIC_QUEUE +#include "PPCAtomicQueue.h" #include "memalloc.c" #endif +#if CMK_SMP && CMK_PPC_ATOMIC_MUTEX +#include "PPCAtomicMutex.h" +#endif + #define CMI_LIKELY(x) (__builtin_expect(x,1)) #define CMI_UNLIKELY(x) (__builtin_expect(x,0)) @@ -49,12 +56,7 @@ char *ALIGN_32(char *p) { This will use the fourth short in message as an indicator of spanning tree root. */ -#if CMK_SMP -#define CMK_BROADCAST_SPANNING_TREE 1 -#else #define CMK_BROADCAST_SPANNING_TREE 1 -#endif /* CMK_SMP */ - #define BROADCAST_SPANNING_FACTOR 4 //The root of the message infers the type of the message @@ -72,14 +74,21 @@ char *ALIGN_32(char *p) { /* FIXME: need a random number that everyone agrees ! 
*/ #define CHARM_MAGIC_NUMBER 126 - #define CMI_PAMI_SHORT_DISPATCH 7 #define CMI_PAMI_RZV_DISPATCH 8 #define CMI_PAMI_ACK_DISPATCH 9 #define CMI_PAMI_DISPATCH 10 +#if CMK_BLUEGENEQ #define SHORT_CUTOFF 128 #define EAGER_CUTOFF 4096 +#else +#define SHORT_CUTOFF 1920 +#define EAGER_CUTOFF 2000000000 +#endif + +//typically this can be enabled when LTPS==0 +#define FREE_LIST_SEND_NO_COPY 0 #if CMK_ERROR_CHECKING static int checksum_flag = 0; @@ -128,35 +137,28 @@ CpvDeclare(void*, CmiLocalQueue); typedef struct ProcState { - /* PCQueue sendMsgBuf; */ /* per processor message sending queue */ -#if CMK_SMP && CMK_USE_L2ATOMICS - L2AtomicQueue atomic_queue; +#if CMK_SMP && CMK_PPC_ATOMIC_QUEUE + PPCAtomicQueue atomic_queue; + char _pad[128-sizeof(PPCAtomicQueue)]; #endif - /* CmiNodeLock recvLock; */ /* for cs->recv */ } ProcState; static ProcState *procState; -#if CMK_SMP && CMK_USE_L2ATOMICS -static L2AtomicMutex *node_recv_mutex; -static L2AtomicQueue node_recv_atomic_q; +#if CMK_SMP && CMK_PPC_ATOMIC_QUEUE +static PPCAtomicQueue node_recv_atomic_q; #endif -#if CMK_SMP && !CMK_MULTICORE -//static volatile int commThdExit = 0; -//static CmiNodeLock commThdExitLock = 0; +#if CMK_SMP && CMK_PPC_ATOMIC_MUTEX +static PPCAtomicMutex *node_recv_mutex; +#endif +#if CMK_SMP && !CMK_MULTICORE //The random seed to pick destination context __thread uint32_t r_seed = 0xdeadbeef; __thread int32_t _cmi_bgq_incommthread = 0; #endif -//int CmiInCommThread () { -// //if (_cmi_bgq_incommthread) -// //printf ("CmiInCommThread: %d\n", _cmi_bgq_incommthread); -// return _cmi_bgq_incommthread; -//} - void ConverseRunPE(int everReturn); static void CommunicationServer(int sleepTime); static void CommunicationServerThread(int sleepTime); @@ -218,8 +220,8 @@ void CmiPushPE(int pe,void *msg) { } #endif -#if CMK_SMP && CMK_USE_L2ATOMICS - L2AtomicEnqueue(&procState[pe].atomic_queue, msg); +#if CMK_SMP && CMK_PPC_ATOMIC_QUEUE + PPCAtomicEnqueue(&procState[pe].atomic_queue, msg); #else
PCQueuePush(cs->recv,(char *)msg); #endif @@ -239,12 +241,10 @@ static void CmiPushNode(void *msg) { return; } #endif -#if CMK_SMP && CMK_USE_L2ATOMICS - L2AtomicEnqueue(&node_recv_atomic_q, msg); +#if CMK_SMP && CMK_PPC_ATOMIC_QUEUE + PPCAtomicEnqueue(&node_recv_atomic_q, msg); #else - CmiLock(CsvAccess(NodeState).CmiNodeRecvLock); PCQueuePush(CsvAccess(NodeState).NodeRecv,msg); - CmiUnlock(CsvAccess(NodeState).CmiNodeRecvLock); #endif //CmiState cs=CmiGetStateN(0); //CmiIdleLock_addMessage(&cs->idle); @@ -263,13 +263,13 @@ static void CmiPushNode(void *msg) { volatile int msgQueueLen [MAX_NUM_CONTEXTS]; volatile int outstanding_recvs [MAX_NUM_CONTEXTS]; -//#if CMK_SMP && CMK_ENABLE_ASYNC_PROGRESS -//#define THREADS_PER_CONTEXT 2 -//#define LTPS 1 //Log Threads Per Context (TPS) -//#else +#if CMK_BLUEGENEQ #define THREADS_PER_CONTEXT 4 #define LTPS 2 //Log Threads Per Context (TPS) -//#endif +#else +#define THREADS_PER_CONTEXT 1 +#define LTPS 0 //Log Threads Per Context (TPS) +#endif //endif CMK_BLUEGENEQ #define MY_CONTEXT_ID() (CmiMyRank() >> LTPS) #define MY_CONTEXT() (cmi_pami_contexts[CmiMyRank() >> LTPS]) @@ -293,12 +293,12 @@ volatile int outstanding_recvs; #define INCR_ORECVS() (outstanding_recvs ++) #define DECR_ORECVS() (outstanding_recvs --) #define ORECVS() (outstanding_recvs) -#endif +#endif //CMK_SMP #if CMK_SMP && !CMK_ENABLE_ASYNC_PROGRESS #define PAMIX_CONTEXT_LOCK_INIT(x) #define PAMIX_CONTEXT_LOCK(x) if(LTPS) PAMI_Context_lock(x) -#define PAMIX_CONTEXT_UNLOCK(x) if(LTPS) {ppc_msync(); PAMI_Context_unlock(x);} +#define PAMIX_CONTEXT_UNLOCK(x) if(LTPS) {CmiMemoryWriteFence(); PAMI_Context_unlock(x);} #define PAMIX_CONTEXT_TRYLOCK(x) ((LTPS)?(PAMI_Context_trylock(x) == PAMI_SUCCESS):(1)) #else #define PAMIX_CONTEXT_LOCK_INIT(x) @@ -358,8 +358,10 @@ static void recv_done(pami_context_t ctxt, void *clientdata, pami_result_t resul { char *msg = (char *) clientdata; int sndlen = ((CmiMsgHeaderBasic *) msg)->size; - //int rank = *(int *) (msg + sndlen); 
//get rank from bottom of the message - //CMI_DEST_RANK(msg) = rank; +#if FREE_LIST_SEND_NO_COPY + int rank = *(int *) (msg + sndlen); //get rank from bottom of the message + CMI_DEST_RANK(msg) = rank; +#endif //fprintf (stderr, "%d Recv message done \n", CmiMyPe()); /* then we do what PumpMsgs used to do: @@ -378,7 +380,7 @@ static void recv_done(pami_context_t ctxt, void *clientdata, pami_result_t resul #if CMK_NODE_QUEUE_AVAILABLE #if CMK_BROADCAST_SPANNING_TREE - if (CMI_IS_BCAST_ON_NODES(msg)) + if (CMI_IS_BCAST_ON_NODES(msg)) SendSpanningChildrenNode(sndlen, msg, 1); #endif if (CMI_DEST_RANK(msg) == SMP_NODEMESSAGE) @@ -394,15 +396,16 @@ typedef struct _cmi_pami_rzv { void * buffer; size_t offset; int bytes; + int rank; int dst_context; - pami_memregion_t mregion; }CmiPAMIRzv_t; typedef struct _cmi_pami_rzv_recv { + int rank; //Read in recv_done + int size; void * msg; void * src_buffer; int src_ep; - int size; pami_memregion_t rmregion; } CmiPAMIRzvRecv_t; @@ -415,13 +418,15 @@ static void pkt_dispatch (pami_context_t context, pami_endpoint_t origin, pami_recv_t * recv) { - //fprintf (stderr, "Received Message of size %d %p\n", pipe_size, recv); + //fprintf (stderr, "%d Received Message of size %d %p\n", CmiMyPe(), pipe_size, recv); INCR_ORECVS(); int alloc_size = pipe_size; +#if !FREE_LIST_SEND_NO_COPY char * buffer = (char *)CmiAlloc(alloc_size); - //char * buffer = (char *)CmiAlloc(alloc_size + sizeof(int)); - //*(int *)(buffer+alloc_size) = *(int *)header_addr; - +#else + char * buffer = (char *)CmiAlloc(alloc_size + sizeof(int)); + *(int *)(buffer+alloc_size) = *(int *)header_addr; +#endif if (recv) { recv->local_fn = recv_done; recv->cookie = buffer; @@ -445,10 +450,9 @@ static void short_pkt_dispatch (pami_context_t context, pami_endpoint_t origin, pami_recv_t * recv) { + //fprintf(stderr, "%d short dispatch\n", CmiMyPe()); int alloc_size = pipe_size; char * buffer = (char *)CmiAlloc(alloc_size); - //char * buffer = (char *)CmiAlloc(alloc_size + 
sizeof(int)); - //*(int *)(buffer+alloc_size) = *(int *)header_addr; memcpy (buffer, pipe_addr, pipe_size); char *smsg = (char *)pipe_addr; @@ -460,7 +464,13 @@ static void short_pkt_dispatch (pami_context_t context, CmiAbort("Charm++ Warning: Non Charm++ Message Received. If your application has a large number of messages, this may be because of overflow in the low-level FIFOs. Please set the environment variable MUSPI_INJFIFOSIZE if the application has large number of small messages (<=4K bytes), and/or PAMI_RGETINJFIFOSIZE if the application has a large number of large messages. The default value of these variable is 65536 which is sufficient for 1000 messages in flight; please try a larger value. Please note that the memory used for these FIFOs eats up the memory = 10*FIFO_SIZE per core. Please contact Charm++ developers for further information. \n"); } - CmiPushPE(CMI_DEST_RANK(smsg), (void *)msg); +#if FREE_LIST_SEND_NO_COPY + int dst_rank = *(int*) header_addr; + CMI_DEST_RANK(msg) = dst_rank; + CmiPushPE(dst_rank, (void *)msg); +#else + CmiPushPE(CMI_DEST_RANK(msg), (void *)msg); +#endif } @@ -486,6 +496,7 @@ void rzv_recv_done (pami_context_t ctxt, void * clientdata, pami_result_t result); +#if CMK_BLUEGENEQ //approx sleep command size_t mysleep_iter = 0; void mysleep (unsigned long cycles) { @@ -499,10 +510,10 @@ void mysleep (unsigned long cycles) { return; } +#endif static void * test_buf; volatile int pami_barrier_flag = 0; -typedef pami_result_t (*pamix_proc_memalign_fn) (void**, size_t, size_t, const char*); void pami_barrier_done (void *ctxt, void * clientdata, pami_result_t err) { @@ -528,7 +539,6 @@ CmiPAMIMemRegion_t cmi_pami_memregion[64]; #endif #include "malloc.h" -void *l2atomicbuf; void _alias_rank (int rank) { #if CMK_SMP && CMK_ENABLE_ASYNC_PROGRESS @@ -639,7 +649,8 @@ int CMI_Progress_finalize(int start, int ncontexts) { #include "manytomany.c" -void ConverseInit(int argc, char **argv, CmiStartFn fn, int usched, int initret) { +void 
ConverseInit(int argc, char **argv, CmiStartFn fn, int usched, int initret) +{ int n, i, count; /* processor per node */ @@ -651,6 +662,12 @@ void ConverseInit(int argc, char **argv, CmiStartFn fn, int usched, int initret) #endif PAMI_Client_create (clientname, &cmi_pami_client, NULL, 0); + pami_configuration_t query; + query.name = PAMI_CLIENT_NUM_CONTEXTS; + pami_result_t rc = PAMI_Client_query(cmi_pami_client, &query, 1); + unsigned possible_contexts = query.value.intval; + //fprintf(stdout, "Creating client with %d contexts\n", possible_contexts); + size_t _n = 1; #if CMK_PAMI_MULTI_CONTEXT if ((_Cmi_mynodesize % THREADS_PER_CONTEXT) == 0) @@ -660,7 +677,18 @@ void ConverseInit(int argc, char **argv, CmiStartFn fn, int usched, int initret) #endif cmi_pami_contexts = (pami_context_t *) malloc (sizeof(pami_context_t) * _n); - pami_result_t rc = PAMI_Context_createv (cmi_pami_client, NULL, 0, cmi_pami_contexts, _n); + + int cfgval=0; +#if 1 //CMK_BLUEGENEQ + pami_configuration_t *config = NULL; +#else + pami_configuration_t config[3]; + config[cfgval].name = PAMI_CLIENT_CONST_CONTEXTS; + config[cfgval].value.intval = 1; + cfgval++; +#endif + + rc = PAMI_Context_createv (cmi_pami_client, config, cfgval, cmi_pami_contexts, _n); if (rc != PAMI_SUCCESS) { fprintf(stderr, "PAMI_Context_createv failed for %d contexts\n", _n); assert(0); @@ -689,8 +717,18 @@ void ConverseInit(int argc, char **argv, CmiStartFn fn, int usched, int initret) } #endif + pami_dispatch_hint_t soptions = (pami_dispatch_hint_t) {0}; + pami_dispatch_hint_t loptions = (pami_dispatch_hint_t) {0}; + + soptions.long_header = PAMI_HINT_DISABLE; + soptions.recv_immediate = PAMI_HINT_ENABLE; + soptions.use_rdma = PAMI_HINT_DISABLE; + + loptions.long_header = PAMI_HINT_DISABLE; + loptions.recv_contiguous = PAMI_HINT_ENABLE; + //loptions.recv_immediate = PAMI_HINT_ENABLE; + loptions.recv_copy = PAMI_HINT_ENABLE; - pami_dispatch_hint_t options = (pami_dispatch_hint_t) {0}; pami_dispatch_callback_function
pfn; for (i = 0; i < _n; ++i) { pfn.p2p = pkt_dispatch; @@ -698,31 +736,31 @@ void ConverseInit(int argc, char **argv, CmiStartFn fn, int usched, int initret) CMI_PAMI_DISPATCH, pfn, NULL, - options); + loptions); pfn.p2p = ack_pkt_dispatch; PAMI_Dispatch_set (cmi_pami_contexts[i], CMI_PAMI_ACK_DISPATCH, pfn, NULL, - options); + soptions); pfn.p2p = rzv_pkt_dispatch; PAMI_Dispatch_set (cmi_pami_contexts[i], CMI_PAMI_RZV_DISPATCH, pfn, NULL, - options); + soptions); pfn.p2p = short_pkt_dispatch; PAMI_Dispatch_set (cmi_pami_contexts[i], CMI_PAMI_SHORT_DISPATCH, pfn, NULL, - options); + soptions); } -#if 1 +#if CMK_BLUEGENEQ size_t bytes_out; void * buf = malloc(sizeof(long)); uint32_t retval; @@ -834,49 +872,52 @@ void ConverseInit(int argc, char **argv, CmiStartFn fn, int usched, int initret) CsvInitialize(CmiNodeState, NodeState); CmiNodeStateInit(&CsvAccess(NodeState)); -#if CMK_SMP && CMK_USE_L2ATOMICS +#if CMK_SMP + posix_memalign((void**)&procState, 128, (_Cmi_mynodesize) * sizeof(ProcState)); +#endif + +#if CMK_SMP && CMK_PPC_ATOMIC_QUEUE +#if CMK_BLUEGENEQ // we may enable communication threads //max available hardware threads int actualNodeSize = 64/Kernel_ProcessCount(); - //printf("Ranks per node %d, actualNodeSize %d CmiMyNodeSize() %d\n", - // Kernel_ProcessCount(), actualNodeSize, _Cmi_mynodesize); - - //pami_result_t rc; - pami_extension_t l2; - pamix_proc_memalign_fn PAMIX_L2_proc_memalign; - size_t size = (_Cmi_mynodesize + 2*actualNodeSize + 1) - * sizeof(L2AtomicState) + sizeof(L2AtomicMutex); +#else + int actualNodeSize = _Cmi_mynodesize; +#endif - rc = PAMI_Extension_open(NULL, "EXT_bgq_l2atomic", &l2); - CmiAssert (rc == 0); - PAMIX_L2_proc_memalign = (pamix_proc_memalign_fn)PAMI_Extension_symbol(l2, "proc_memalign"); - rc = PAMIX_L2_proc_memalign(&l2atomicbuf, 64, size, NULL); - CmiAssert (rc == 0); +#if CMK_PPC_ATOMIC_MUTEX + //Allocate for PPC Atomic Mutex as well + size_t size = (_Cmi_mynodesize + 3*actualNodeSize + 1) + * 
sizeof(PPCAtomicState) + 2*sizeof(PPCAtomicMutex); +#else + size_t size = (_Cmi_mynodesize + 3*actualNodeSize + 1) + * sizeof(PPCAtomicState); #endif + void *atomic_buf; + PPC_AtomicCounterAllocate(&atomic_buf, size); - char *l2_start = (char *) l2atomicbuf; - procState = (ProcState *)malloc((_Cmi_mynodesize) * sizeof(ProcState)); + char *atomic_start = (char *) atomic_buf; for (i=0; i<_Cmi_mynodesize; i++) { -#if CMK_SMP && CMK_USE_L2ATOMICS - L2AtomicQueueInit (l2_start + sizeof(L2AtomicState)*i, - sizeof(L2AtomicState), - &procState[i].atomic_queue, - 1, /*use overflow*/ - DEFAULT_SIZE /*1024 entries*/); -#endif + PPCAtomicQueueInit (atomic_start + sizeof(PPCAtomicState)*i, + sizeof(PPCAtomicState), + &procState[i].atomic_queue, + 1, /*use overflow*/ + DEFAULT_SIZE /*2048 entries*/); } + atomic_start += _Cmi_mynodesize * sizeof(PPCAtomicState); -#if CMK_SMP && CMK_USE_L2ATOMICS - l2_start += _Cmi_mynodesize * sizeof(L2AtomicState); - CmiMemAllocInit_bgq (l2_start, 2*actualNodeSize*sizeof(L2AtomicState)); - l2_start += 2*actualNodeSize*sizeof(L2AtomicState); + CmiMemAllocInit_ppcq(atomic_start,3*actualNodeSize*sizeof(PPCAtomicState)); + atomic_start += 3*actualNodeSize*sizeof(PPCAtomicState); - L2AtomicQueueInit (l2_start, - sizeof(L2AtomicState), - &node_recv_atomic_q, - 1, /*use overflow*/ - DEFAULT_SIZE /*1024 entries*/); - l2_start += sizeof(L2AtomicState); - node_recv_mutex = L2AtomicMutexInit(l2_start, sizeof(L2AtomicMutex)); + PPCAtomicQueueInit (atomic_start, + sizeof(PPCAtomicState), + &node_recv_atomic_q, + 1, /*use overflow*/ + DEFAULT_SIZE /*2048 entries*/); + atomic_start += sizeof(PPCAtomicState); + +#if CMK_PPC_ATOMIC_MUTEX + node_recv_mutex = PPCAtomicMutexInit(atomic_start, sizeof(PPCAtomicMutex)); +#endif #endif //Initialize the manytomany api @@ -899,11 +940,9 @@ int PerrorExit (char *err) { return -1; } - void ConverseRunPE(int everReturn) { // printf ("ConverseRunPE on rank %d\n", CmiMyPe()); - CmiIdleState *s=CmiNotifyGetState(); CmiState 
cs; char** CmiMyArgv; CmiNodeAllBarrier(); @@ -919,19 +958,35 @@ void ConverseRunPE(int everReturn) { CthInit(CmiMyArgv); + CmiBarrier(); + CmiBarrier(); + CmiBarrier(); + CmiBarrier(); + //printf ("Before Converse Common Init\n"); ConverseCommonInit(CmiMyArgv); +#if CMK_TRACE_ENABLED + //Register memory allocator events + traceRegisterUserEvent("CmiAlloc_ppcq", 30001); + traceRegisterUserEvent("CmiFree_ppcq", 30002); + traceRegisterUserEvent("machine_send", 30003); + traceRegisterUserEvent("CmiSendPeer", 30004); + traceRegisterUserEvent("PAMI_Context_advance", 30005); + traceRegisterUserEvent("m2m_start", 30006); + traceRegisterUserEvent("PAMI_Context_post", 30007); +#endif + CcdCallOnConditionKeep(CcdPROCESSOR_STILL_IDLE,(CcdVoidFn)CmiNotifyIdle,NULL); //printf ("before calling CmiBarrier() \n"); - CmiBarrier(); /* Converse initialization finishes, immediate messages can be processed. node barrier previously should take care of the node synchronization */ _immediateReady = 1; //printf("calling the startfn\n"); + CmiBarrier(); if (!everReturn) { Cmi_startfn(CmiGetArgc(CmiMyArgv), CmiMyArgv); @@ -1011,7 +1066,9 @@ void ConverseExit(void) { #if CMK_SMP CmiNodeBarrier(); if (rank0) { +#if CMK_BLUEGENEQ Delay(100000); +#endif exit(0); } else @@ -1037,15 +1094,20 @@ void CmiAbort(const char * message) { #if CMK_NODE_QUEUE_AVAILABLE char *CmiGetNonLocalNodeQ(void) { - //CmiState cs = CmiGetState(); char *result = 0; - //CmiIdleLock_checkMessage(&cs->idle); -#if CMK_SMP && CMK_USE_L2ATOMICS - if (!L2AtomicQueueEmpty(&node_recv_atomic_q)) { - if (L2AtomicMutexTryAcquire(node_recv_mutex) == 0) { - result = (char*)L2AtomicDequeue(&node_recv_atomic_q); - L2AtomicMutexRelease(node_recv_mutex); +#if CMK_SMP && CMK_PPC_ATOMIC_MUTEX && CMK_PPC_ATOMIC_QUEUE + if (!PPCAtomicQueueEmpty(&node_recv_atomic_q)) { + if (PPCAtomicMutexTryAcquire(node_recv_mutex) == 0) { + result = (char*)PPCAtomicDequeue(&node_recv_atomic_q); + PPCAtomicMutexRelease(node_recv_mutex); + } + } +#elif 
CMK_SMP && CMK_PPC_ATOMIC_QUEUE + if (!PPCAtomicQueueEmpty(&node_recv_atomic_q)) { + if (CmiTryLock(CsvAccess(NodeState).CmiNodeRecvLock) == 0) { + result = (char*)PPCAtomicDequeue(&node_recv_atomic_q); + CmiUnlock(CsvAccess(NodeState).CmiNodeRecvLock); } } #else @@ -1053,7 +1115,6 @@ char *CmiGetNonLocalNodeQ(void) { MACHSTATE1(3,"CmiGetNonLocalNodeQ begin %d {", CmiMyPe()); if (CmiTryLock(CsvAccess(NodeState).CmiNodeRecvLock) == 0) { - //CmiLock(CsvAccess(NodeState).CmiNodeRecvLock); result = (char *) PCQueuePop(CsvAccess(NodeState).NodeRecv); CmiUnlock(CsvAccess(NodeState).CmiNodeRecvLock); } @@ -1073,12 +1134,12 @@ void *CmiGetNonLocal() { CmiState cs = CmiGetState(); //CmiIdleLock_checkMessage(&cs->idle); -#if CMK_SMP && CMK_USE_L2ATOMICS - msg = L2AtomicDequeue(&procState[CmiMyRank()].atomic_queue); +#if CMK_SMP && CMK_PPC_ATOMIC_QUEUE + msg = PPCAtomicDequeue(&procState[CmiMyRank()].atomic_queue); #if !(CMK_ENABLE_ASYNC_PROGRESS) if (msg == NULL) { AdvanceCommunications(); - msg = L2AtomicDequeue(&procState[CmiMyRank()].atomic_queue); + msg = PPCAtomicDequeue(&procState[CmiMyRank()].atomic_queue); } #endif #else @@ -1088,9 +1149,6 @@ void *CmiGetNonLocal() { msg = PCQueuePop(cs->recv); #endif - //if (msg != NULL) - //fprintf(stderr, "%d: Returning a message\n", CmiMyPe()); - return msg; } @@ -1109,8 +1167,15 @@ static void CmiSendSelf(char *msg) { #if CMK_SMP static void CmiSendPeer (int rank, int size, char *msg) { - //fprintf(stderr, "%d Send messages to peer\n", CmiMyPe()); - CmiPushPE (rank, msg); + //fprintf(stderr, "%d Send messages to peer\n", CmiMyPe()); +#if CMK_TRACE_ENABLED + double start = CmiWallTimer(); +#endif + CmiPushPE (rank, msg); + +#if CMK_TRACE_ENABLED + traceUserBracketEvent(30004, start, CmiWallTimer()); +#endif } #endif @@ -1151,8 +1216,8 @@ void CmiGeneralFreeSendN(int node, int rank, int size, char * msg, int to_lock) #if CMK_SMP CMI_DEST_RANK(msg) = rank; if (node == CmiMyNode()) { - CmiSendPeer (rank, size, msg); - return; + 
CmiSendPeer (rank, size, msg); + return; } #endif @@ -1194,17 +1259,23 @@ void machine_send (pami_context_t context, { CMI_DEST_RANK(msg) = rank; +#if CMK_TRACE_ENABLED + double start = CmiWallTimer(); +#endif + + CmiAssert (node != CmiMyNode()); + pami_endpoint_t target; #if CMK_PAMI_MULTI_CONTEXT - //size_t dst_context = (rank != SMP_NODEMESSAGE) ? (rank>>LTPS) : (rand_r(&r_seed) % cmi_pami_numcontexts); + size_t dst_context = (rank != SMP_NODEMESSAGE) ? (rank>>LTPS) : (myrand(&r_seed) % cmi_pami_numcontexts); //Choose a context at random - size_t dst_context = myrand(&r_seed) % cmi_pami_numcontexts; + //size_t dst_context = myrand(&r_seed) % cmi_pami_numcontexts; #else size_t dst_context = 0; #endif PAMI_Endpoint_create (cmi_pami_client, (pami_task_t)node, dst_context, &target); - //fprintf (stderr, "Calling PAMI Send to %d magic %d size %d\n", node, CMI_MAGIC(msg), size); + //fprintf (stderr, "%d Calling PAMI Send to node %d peer %d magic %d size %d\n", CmiMyPe(), node, dst_context, CMI_MAGIC(msg), size); if (CMI_LIKELY(size < SHORT_CUTOFF)) { pami_send_immediate_t parameters; @@ -1216,15 +1287,20 @@ void machine_send (pami_context_t context, //use short callback if not a bcast and not an SMP node message parameters.dispatch = CMI_PAMI_SHORT_DISPATCH; - parameters.header.iov_base = NULL; //&rank; - parameters.header.iov_len = 0; //sizeof(int); +#if FREE_LIST_SEND_NO_COPY + parameters.header.iov_base = &rank; + parameters.header.iov_len = sizeof(int); +#else + parameters.header.iov_base = NULL; + parameters.header.iov_len = 0; +#endif parameters.data.iov_base = msg; parameters.data.iov_len = size; parameters.dest = target; if(to_lock) PAMIX_CONTEXT_LOCK(context); - + PAMI_Send_immediate (context, &parameters); if(to_lock) @@ -1234,8 +1310,13 @@ void machine_send (pami_context_t context, else if (size < EAGER_CUTOFF) { pami_send_t parameters; parameters.send.dispatch = CMI_PAMI_DISPATCH; - parameters.send.header.iov_base = NULL; //&rank; -
parameters.send.header.iov_len = 0; //sizeof(int); +#if FREE_LIST_SEND_NO_COPY + parameters.send.header.iov_base = &rank; + parameters.send.header.iov_len = sizeof(int); +#else + parameters.send.header.iov_base = NULL; + parameters.send.header.iov_len = 0; +#endif parameters.send.data.iov_base = msg; parameters.send.data.iov_len = size; parameters.events.cookie = msg; @@ -1252,29 +1333,50 @@ void machine_send (pami_context_t context, PAMIX_CONTEXT_UNLOCK(context); } else { + if(to_lock) + PAMIX_CONTEXT_LOCK(context); + CmiPAMIRzv_t rzv; rzv.bytes = size; rzv.buffer = msg; + rzv.rank = rank; +#if CMK_BLUEGENEQ rzv.offset = (size_t)msg - (size_t)cmi_pami_memregion[0].baseVA; +#else + rzv.offset = (size_t)msg; + size_t bytes_out; + pami_memregion_t mregion; + //In use for PAMI_Get + PAMI_Memregion_create (context, + msg, + size, + &bytes_out, + &mregion); +#endif rzv.dst_context = dst_context; - memcpy(&rzv.mregion, &cmi_pami_memregion[0].mregion, sizeof(pami_memregion_t)); pami_send_immediate_t parameters; parameters.dispatch = CMI_PAMI_RZV_DISPATCH; parameters.header.iov_base = &rzv; parameters.header.iov_len = sizeof(rzv); +#if CMK_BLUEGENEQ + parameters.data.iov_base = &cmi_pami_memregion[0].mregion; + parameters.data.iov_len = sizeof(pami_memregion_t); +#else parameters.data.iov_base = NULL; parameters.data.iov_len = 0; +#endif parameters.dest = target; - if(to_lock) - PAMIX_CONTEXT_LOCK(context); - PAMI_Send_immediate (context, &parameters); if(to_lock) PAMIX_CONTEXT_UNLOCK(context); } + +#if CMK_TRACE_ENABLED + traceUserBracketEvent(30003, start, CmiWallTimer()); +#endif } void CmiSyncSendFn(int destPE, int size, char *msg) { @@ -1428,40 +1530,52 @@ void CmiFreeBroadcastAllFn(int size, char *msg) { void AdvanceCommunications() { pami_context_t my_context = MY_CONTEXT(); +#if CMK_TRACE_ENABLED + double start = CmiWallTimer(), end; +#endif + #if CMK_SMP //CmiAssert (my_context != NULL); if (PAMIX_CONTEXT_TRYLOCK(my_context)) { + //fprintf(stderr, "%d advancing
context %d\n", CmiMyPe(), MY_CONTEXT_ID()); PAMI_Context_advance(my_context, 1); PAMIX_CONTEXT_UNLOCK(my_context); } #else PAMI_Context_advance(my_context, 1); #endif + +#if CMK_TRACE_ENABLED + end = CmiWallTimer(); + //only log 1us or larger events + if (end - start > 1e-6) + traceUserBracketEvent(30005, start, end); +#endif } #endif void CmiNotifyIdle() { AdvanceCommunications(); -#if CMK_SMP && CMK_PAMI_MULTI_CONTEXT -#if !CMK_ENABLE_ASYNC_PROGRESS && CMK_USE_L2ATOMICS +#if CMK_BLUEGENEQ && CMK_SMP && CMK_PAMI_MULTI_CONTEXT +#if !CMK_ENABLE_ASYNC_PROGRESS && CMK_PPC_ATOMIC_QUEUE //Wait on the atomic queue to get a message with very low core //overheads. One thread calls advance more frequently if ((CmiMyRank()% THREADS_PER_CONTEXT) == 0) //spin wait for 2-4us when idle //process node queue messages every 10us //Idle cores will only use one LMQ slot and an int sum - L2AtomicQueue2QSpinWait(&procState[CmiMyRank()].atomic_queue, - &node_recv_atomic_q, - 10); + PPCAtomicQueue2QSpinWait(&procState[CmiMyRank()].atomic_queue, + &node_recv_atomic_q, + 10); else #endif -#if CMK_USE_L2ATOMICS +#if CMK_PPC_ATOMIC_QUEUE //spin wait for 50-100us when idle waiting for a message - L2AtomicQueue2QSpinWait(&procState[CmiMyRank()].atomic_queue, - &node_recv_atomic_q, - 1000); + PPCAtomicQueue2QSpinWait(&procState[CmiMyRank()].atomic_queue, + &node_recv_atomic_q, + 1000); #endif #endif } @@ -1552,7 +1666,7 @@ void CmiFreeListSendFn(int npes, int *pes, int size, char *msg) { //Fast path if (npes == 1) { - CmiGeneralFreeSendN(CmiNodeOf(pes[0]), CmiRankOf(pes[0]), size, msg, 1); + CmiGeneralFreeSend(pes[0], size, msg); return; } @@ -1577,10 +1691,29 @@ void CmiFreeListSendFn(int npes, int *pes, int size, char *msg) { void machineFreeListSendFn(pami_context_t my_context, int npes, int *pes, int size, char *msg) { int i; char *copymsg; + + PAMIX_CONTEXT_LOCK(my_context); + + for (i=0;ibytes; - char * buffer = (char *)CmiAlloc(alloc_size + sizeof(CmiPAMIRzvRecv_t)); - //char 
*buffer=(char*)CmiAlloc(alloc_size+sizeof(CmiPAMIRzvRecv_t)+sizeof(int)) - //*(int *)(buffer+alloc_size) = *(int *)header_addr; + char *buffer = (char *)CmiAlloc(alloc_size + sizeof(CmiPAMIRzvRecv_t)); CmiAssert (recv == NULL); CmiPAMIRzvRecv_t *rzv_recv = (CmiPAMIRzvRecv_t *)(buffer+alloc_size); @@ -2057,11 +2144,12 @@ void rzv_pkt_dispatch (pami_context_t context, rzv_recv->src_ep = origin; rzv_recv->src_buffer = rzv_hdr->buffer; rzv_recv->size = rzv_hdr->bytes; - - //CmiAssert (pipe_addr != NULL); - //CmiAssert (pipe_size == sizeof(pami_memregion_t)); - //pami_memregion_t *mregion = (pami_memregion_t *) pipe_addr; - memcpy(&rzv_recv->rmregion, &rzv_hdr->mregion, sizeof(pami_memregion_t)); + rzv_recv->rank = rzv_hdr->rank; + +#ifdef CMK_BLUEGENEQ + CmiAssert (pipe_addr != NULL); + CmiAssert (pipe_size == sizeof(pami_memregion_t)); + memcpy(&rzv_recv->rmregion, pipe_addr, sizeof(pami_memregion_t)); //Rzv inj fifos are on the 17th core shared by all contexts pami_rget_simple_t rget; @@ -2082,6 +2170,31 @@ void rzv_pkt_dispatch (pami_context_t context, pami_result_t rc; rc = PAMI_Rget (context, &rget); //CmiAssert(rc == PAMI_SUCCESS); +#else + size_t bytes_out; + pami_memregion_t mregion; + //In use for PAMI_Get + PAMI_Memregion_create (context, + buffer, + rzv_hdr->bytes, + &bytes_out, + &mregion); + + pami_get_simple_t get; + memset(&get, 0, sizeof(get)); + get.rma.dest = origin; + get.rma.bytes = rzv_hdr->bytes; + get.rma.cookie = rzv_recv; + get.rma.done_fn = rzv_recv_done; + get.rma.hints.use_rdma = PAMI_HINT_ENABLE; + get.rma.hints.buffer_registered = PAMI_HINT_ENABLE; + get.rma.hints.use_shmem = PAMI_HINT_DEFAULT; + get.rma.hints.remote_async_progress = PAMI_HINT_DEFAULT; + get.addr.local = buffer; + get.addr.remote = (void*)rzv_hdr->offset; + PAMI_Get(context, &get); +#endif + } void ack_pkt_dispatch (pami_context_t context, @@ -2097,4 +2210,6 @@ void ack_pkt_dispatch (pami_context_t context, CmiFree (*buf); } +#if CMK_BLUEGENEQ #include "cmimemcpy_qpx.h" 
+#endif diff --git a/src/arch/pami/manytomany.c b/src/arch/pami/manytomany.c index 12616749e9..4f43cae6ea 100644 --- a/src/arch/pami/manytomany.c +++ b/src/arch/pami/manytomany.c @@ -8,17 +8,17 @@ #define M2M_PAMI_DISPATCH 15 typedef struct _pami_m2mhdr { - int8_t dstrank; - int8_t connid; - int32_t srcindex; -} PAMI_M2mHeader; + uint8_t dstrank; + uint8_t connid; + uint32_t srcindex; +} PAMI_M2mHeader; typedef struct _pami_m2m_work { - pami_work_t work; int start; int end; void * handle; pami_context_t context; + pami_work_t work; } PAMI_M2mWork_t; typedef struct _m2m_completionmsg { @@ -27,56 +27,73 @@ typedef struct _m2m_completionmsg { int rank; } M2mCompletionMsg; +typedef struct _m2m_sendinfo { + char * buf; + uint32_t bytes; + pami_endpoint_t ep; + uint16_t dispatch; + PAMI_M2mHeader hdr; +} M2mSendInfo; + +#if CMK_SMP && CMK_ENABLE_ASYNC_PROGRESS +#define M2M_PARALLEL_CONTEXT 1 +#elif CMK_SMP +#define M2M_PARALLEL_CONTEXT 1 +#else +#define M2M_PARALLEL_CONTEXT 0 +#endif + +#if M2M_PARALLEL_CONTEXT #define MAX_NWORK 8 +#else +#define MAX_NWORK 1 +#endif typedef struct _pami_cmidhandle { int myrank; - unsigned m2m_rcvcounter ; - unsigned m2m_nzrcvranks; + unsigned m2m_rcvcounter; + unsigned m2m_nzrcvranks; + unsigned m2m_nsndranks; char * m2m_rcvbuf ; unsigned * m2m_rcvlens ; unsigned * m2m_rdispls ; + M2mSendInfo * m2m_sndinfo ; + PAMI_M2mWork_t swork[MAX_NWORK]; + int n_work; - unsigned m2m_nsndranks; - unsigned m2m_srankIndex; + //Less frequently used (or unused) during runtime execution char * m2m_sndbuf ; - unsigned * m2m_sndlens ; - unsigned * m2m_sdispls ; unsigned m2m_sndcounter ; - unsigned * m2m_permutation; - unsigned * m2m_lranks ; - pami_endpoint_t * m2m_node_eps; - - PAMI_M2mWork_t swork[MAX_NWORK]; - int n_work; + unsigned m2m_srankIndex; //Stored in header CmiDirectM2mHandler m2m_rdone; void * m2m_rdonecontext; - PAMI_M2mHeader * m2m_hdrs; M2mCompletionMsg cmsg; unsigned m2m_ntotalrcvranks; - unsigned m2m_initialized; - unsigned 
m2m_rrankIndex; + unsigned m2m_initialized; + unsigned m2m_rrankIndex; CmiDirectM2mHandler m2m_sdone; void * m2m_sdonecontext; -} PAMICmiDirectM2mHandle; +} PAMICmiDirectM2mHandle; CpvDeclare(PAMICmiDirectM2mHandle*, _handle); CpvDeclare(int, _completion_handler); -static void m2m_recv_done(pami_context_t ctxt, void *clientdata, pami_result_t result) +static void m2m_recv_done(pami_context_t ctxt, void *clientdata, pami_result_t result) { - PAMICmiDirectM2mHandle *handle = (PAMICmiDirectM2mHandle *)clientdata; + int ntotal = 0; + PAMICmiDirectM2mHandle *handle = (PAMICmiDirectM2mHandle *)clientdata; //acquire lock if processed by many comm threads and contexts? handle->m2m_rcvcounter ++; - - if (handle->m2m_rcvcounter == handle->m2m_nzrcvranks) { - //printf ("Calling manytomany rdone for handle %p on rank %d counter %d nexp %d\n", + ntotal = handle->m2m_rcvcounter; + + if (ntotal == handle->m2m_nzrcvranks) { + //printf ("Calling manytomany rdone for handle %p on rank %d counter %d nexp %d\n", // handle, CmiMyPe(), // handle->m2m_rcvcounter, handle->m2m_nzrcvranks); handle->m2m_rcvcounter = 0; -#if CMK_SMP && CMK_ENABLE_ASYNC_PROGRESS +#if CMK_SMP && (M2M_PARALLEL_CONTEXT || LTPS) //Called from comm thread CmiSendPeer (handle->myrank, sizeof(M2mCompletionMsg), (char*)&handle->cmsg); #else @@ -86,9 +103,9 @@ static void m2m_recv_done(pami_context_t ctxt, void *clientdata, pami_result_t r } } -static void m2m_send_done(pami_context_t ctxt, void *clientdata, pami_result_t result) +static void m2m_send_done(pami_context_t ctxt, void *clientdata, pami_result_t result) { - PAMICmiDirectM2mHandle *handle = (PAMICmiDirectM2mHandle *)clientdata; + PAMICmiDirectM2mHandle *handle = (PAMICmiDirectM2mHandle *)clientdata; //acquire lock if processed by many comm threads and contexts? 
handle->m2m_sndcounter ++; if (handle->m2m_sndcounter == handle->m2m_nsndranks) { @@ -96,7 +113,7 @@ static void m2m_send_done(pami_context_t ctxt, void *clientdata, pami_result_t r //else handle->m2m_sndcounter = 0; if (handle->m2m_sdone) - handle->m2m_sdone(handle->m2m_sdonecontext); + handle->m2m_sdone(handle->m2m_sdonecontext); } } @@ -107,17 +124,21 @@ static void m2m_rdone_mainthread (void *m) { handle->m2m_rdone(handle->m2m_rdonecontext); } -static void m2m_s8_dispatch (pami_context_t context, +static void m2m_s8_dispatch (pami_context_t context, void * clientdata, - const void * header_addr, - size_t header_size, - const void * pipe_addr, - size_t pipe_size, + const void * header_addr, + size_t header_size, + const void * pipe_addr, + size_t pipe_size, pami_endpoint_t origin, - pami_recv_t * recv) + pami_recv_t * recv) { PAMI_M2mHeader *hdr = (PAMI_M2mHeader *) header_addr; - PAMICmiDirectM2mHandle *handlevec = CpvAccessOther(_handle, hdr->dstrank); +#if CMK_SMP && (M2M_PARALLEL_CONTEXT || LTPS) + PAMICmiDirectM2mHandle *handlevec = CpvAccessOther(_handle, hdr->dstrank); +#else + PAMICmiDirectM2mHandle *handlevec = CpvAccess(_handle); +#endif PAMICmiDirectM2mHandle *handle = &handlevec[hdr->connid]; char *buffer = handle->m2m_rcvbuf + handle->m2m_rdispls[hdr->srcindex]; @@ -127,46 +148,59 @@ static void m2m_s8_dispatch (pami_context_t context, } -static void m2m_spkt_dispatch (pami_context_t context, +static void m2m_spkt_dispatch (pami_context_t context, void * clientdata, - const void * header_addr, - size_t header_size, - const void * pipe_addr, - size_t pipe_size, + const void * header_addr, + size_t header_size, + const void * pipe_addr, + size_t pipe_size, pami_endpoint_t origin, - pami_recv_t * recv) + pami_recv_t * recv) { PAMI_M2mHeader *hdr = (PAMI_M2mHeader *) header_addr; - PAMICmiDirectM2mHandle *handlevec = CpvAccessOther(_handle, hdr->dstrank); +#if CMK_SMP && (M2M_PARALLEL_CONTEXT || LTPS) + PAMICmiDirectM2mHandle *handlevec = 
CpvAccessOther(_handle, hdr->dstrank); +#else + PAMICmiDirectM2mHandle *handlevec = CpvAccess(_handle); +#endif PAMICmiDirectM2mHandle *handle = &handlevec[hdr->connid]; char *buffer = handle->m2m_rcvbuf + handle->m2m_rdispls[hdr->srcindex]; - memcpy (buffer, pipe_addr, pipe_size); + if (pipe_size == 32) { + uint64_t *src = (uint64_t *)pipe_addr; + uint64_t *dst = (uint64_t *)buffer; + + dst[0] = src[0]; + dst[1] = src[1]; + dst[2] = src[2]; + dst[3] = src[3]; + } + else + memcpy (buffer, pipe_addr, pipe_size); m2m_recv_done (context, handle, PAMI_SUCCESS); } -static void m2m_pkt_dispatch (pami_context_t context, +static void m2m_pkt_dispatch (pami_context_t context, void * clientdata, - const void * header_addr, - size_t header_size, - const void * pipe_addr, - size_t pipe_size, + const void * header_addr, + size_t header_size, + const void * pipe_addr, + size_t pipe_size, pami_endpoint_t origin, - pami_recv_t * recv) + pami_recv_t * recv) { PAMI_M2mHeader *hdr = (PAMI_M2mHeader *) header_addr; - //CmiAssert (hdr->dstrank < CmiMyNodeSize()); - //CmiAssert (hdr->connid < MAX_CONN); - +#if CMK_SMP && (M2M_PARALLEL_CONTEXT || LTPS) PAMICmiDirectM2mHandle *handlevec = CpvAccessOther(_handle, hdr->dstrank); - //CmiAssert (handlevec != NULL); - +#else + PAMICmiDirectM2mHandle *handlevec = CpvAccess(_handle); +#endif + //fprintf(stderr, "m2m_pkt_dispatch: mype %d connid %d dstrank %d handlevec %p\n", // CmiMyPe(), hdr->connid, hdr->dstrank, handlevec); - PAMICmiDirectM2mHandle *handle = &handlevec[hdr->connid]; char *buffer = handle->m2m_rcvbuf + handle->m2m_rdispls[hdr->srcindex]; @@ -186,25 +220,20 @@ static void m2m_pkt_dispatch (pami_context_t context, } -void * CmiDirect_manytomany_allocate_handle () { -#if CMK_SMP && !CMK_ENABLE_ASYNC_PROGRESS - CmiAbort("!!!!!!!!!Please build Charm++ with async in order to use many-to-many interface\n"); -#else +void * CmiDirect_manytomany_allocate_handle () { if (!CpvInitialized(_handle)) CpvInitialize(PAMICmiDirectM2mHandle*, 
_handle); if (!CpvInitialized(_completion_handler)) - CpvInitialize(int, _completion_handler); - ppc_msync(); - + CpvInitialize(int, _completion_handler); + if (CpvAccess(_handle) == NULL) { CpvAccess(_handle) = (PAMICmiDirectM2mHandle *)malloc (MAX_CONN *sizeof(PAMICmiDirectM2mHandle)); memset (CpvAccess(_handle),0,MAX_CONN*sizeof (PAMICmiDirectM2mHandle)); CpvAccess(_completion_handler) = CmiRegisterHandler(m2m_rdone_mainthread); } - + //printf ("allocate_handle on rank %d %p\n", CmiMyPe(), CpvAccess(_handle)); return CpvAccess(_handle); -#endif } @@ -216,13 +245,10 @@ void CmiDirect_manytomany_initialize_recvbase(void * h, unsigned nranks, unsigned myIdx ) { -#if CMK_SMP && !CMK_ENABLE_ASYNC_PROGRESS - CmiAbort("!!!!!!!!!Please build Charm++ with async in order to use many-to-many interface\n"); -#else PAMICmiDirectM2mHandle *handle = &(((PAMICmiDirectM2mHandle *) h)[tag]); //PAMICmiDirectM2mHandle *handle = &(CpvAccess(_handle)[tag]); - //printf ("manytomany recvbase on rank %d handle %p conn %d nranks %d\n", + //printf ("manytomany recvbase on rank %d handle %p conn %d nranks %d\n", // CmiMyPe(), handle, tag, nranks); handle->myrank = CmiMyRank(); @@ -236,19 +262,19 @@ void CmiDirect_manytomany_initialize_recvbase(void * h, handle->m2m_rdone = donecb; handle->m2m_rdonecontext = context; handle->m2m_ntotalrcvranks = nranks; - + //Receiver is not sender - //if (myIdx == (unsigned)-1) + //if (myIdx == (unsigned)-1) //(handle->m2m_ntotalrcvranks)++; - + handle->m2m_rcvlens = malloc (sizeof(int) * handle->m2m_ntotalrcvranks); handle->m2m_rdispls = malloc (sizeof(int) * handle->m2m_ntotalrcvranks); - + assert (handle->m2m_rcvlens != NULL); - + memset (handle->m2m_rcvlens, 0, handle->m2m_ntotalrcvranks * sizeof(int)); memset (handle->m2m_rdispls, 0, handle->m2m_ntotalrcvranks * sizeof(int)); - + //Receiver is not sender //if (myIdx == (unsigned)-1) { //Receiver doesnt send any data @@ -256,7 +282,6 @@ void CmiDirect_manytomany_initialize_recvbase(void * h, 
//CmiDirect_manytomany_initialize_recv (h, tag, myIdx, 0, 0, CmiMyPe()); //} handle->m2m_rrankIndex = myIdx; -#endif } void CmiDirect_manytomany_initialize_recv ( void * h, @@ -266,18 +291,14 @@ void CmiDirect_manytomany_initialize_recv ( void * h, unsigned bytes, unsigned rank ) { -#if CMK_SMP && !CMK_ENABLE_ASYNC_PROGRESS - CmiAbort("!!!!!!!!!Please build Charm++ with async in order to use many-to-many interface\n"); -#else PAMICmiDirectM2mHandle *handle = &(((PAMICmiDirectM2mHandle *) h)[tag]); assert ( tag < MAX_CONN ); - + if (handle->m2m_rcvlens[idx] == 0 && bytes > 0) handle->m2m_nzrcvranks ++; handle->m2m_rcvlens [idx] = bytes; handle->m2m_rdispls [idx] = displ; -#endif } @@ -289,43 +310,30 @@ void CmiDirect_manytomany_initialize_sendbase( void * h, unsigned nranks, unsigned myIdx ) { -#if CMK_SMP && !CMK_ENABLE_ASYNC_PROGRESS - CmiAbort("!!!!!!!!!Please build Charm++ with async in order to use many-to-many interface\n"); -#else PAMICmiDirectM2mHandle *handle = &(((PAMICmiDirectM2mHandle *) h)[tag]); assert ( tag < MAX_CONN ); handle->m2m_sndbuf = sndbuf; handle->m2m_sdone = donecb; handle->m2m_sdonecontext = context; - + handle->m2m_nsndranks = nranks; - handle->m2m_srankIndex = myIdx; - handle->m2m_sndlens = (unsigned int *) malloc (sizeof(unsigned int) * nranks); - handle->m2m_sdispls = (unsigned int *) malloc (sizeof(unsigned int) * nranks); - handle->m2m_lranks = (unsigned int *) malloc (sizeof(unsigned int) * nranks); - handle->m2m_node_eps = (pami_endpoint_t *) malloc (sizeof(pami_endpoint_t) * nranks); - handle->m2m_permutation = (unsigned int *) malloc (sizeof(unsigned int) * nranks); - handle->m2m_hdrs = (PAMI_M2mHeader *) malloc(sizeof(PAMI_M2mHeader) * nranks); - - memset (handle->m2m_sndlens, 0, nranks * sizeof(int)); - memset (handle->m2m_sdispls, 0, nranks * sizeof(int)); - memset (handle->m2m_lranks, 0, nranks * sizeof(int)); - memset (handle->m2m_node_eps, 0, nranks * sizeof(pami_endpoint_t)); - memset (handle->m2m_permutation,0, nranks * 
sizeof(int)); + handle->m2m_srankIndex = myIdx; + handle->m2m_sndinfo = (M2mSendInfo *)malloc(nranks * sizeof(M2mSendInfo)); + memset (handle->m2m_sndinfo,0, nranks * sizeof(M2mSendInfo)); -#if CMK_SMP && CMK_ENABLE_ASYNC_PROGRESS +#if M2M_PARALLEL_CONTEXT //we have a completion callback if (handle->m2m_sdone != NULL) { handle->swork[0].start = 0; - handle->swork[0].end = handle->m2m_nsndranks; + handle->swork[0].end = handle->m2m_nsndranks; handle->swork[0].handle = handle; handle->n_work = 1; int context_id = MY_CONTEXT_ID(); context_id ++; if (context_id >= cmi_pami_numcontexts) - context_id = 0; - pami_context_t context = cmi_pami_contexts[context_id]; + context_id = 0; + pami_context_t context = cmi_pami_contexts[context_id]; handle->swork[0].context = context; } else { @@ -340,10 +348,10 @@ void CmiDirect_manytomany_initialize_sendbase( void * h, ncontexts = handle->m2m_nsndranks; handle->n_work = ncontexts; - nranks = handle->m2m_nsndranks / ncontexts; + nranks = handle->m2m_nsndranks / ncontexts; for (i = 0; i < ncontexts; ++i) { handle->swork[i].start = start; - handle->swork[i].end = start + nranks; + handle->swork[i].end = start + nranks; handle->swork[i].handle = handle; start += nranks; if (i == ncontexts - 1) @@ -351,7 +359,7 @@ void CmiDirect_manytomany_initialize_sendbase( void * h, context_id ++; if (context_id >= cmi_pami_numcontexts) - context_id = 0; + context_id = 0; context = cmi_pami_contexts[context_id]; handle->swork[i].context = context; } @@ -359,142 +367,114 @@ void CmiDirect_manytomany_initialize_sendbase( void * h, #else PAMIX_CONTEXT_LOCK(MY_CONTEXT()); handle->swork[0].start = 0; - handle->swork[0].end = handle->m2m_nsndranks; + handle->swork[0].end = handle->m2m_nsndranks; handle->swork[0].handle = handle; handle->n_work = 1; handle->swork[0].context = MY_CONTEXT(); PAMIX_CONTEXT_UNLOCK(MY_CONTEXT()); #endif -#endif } #define PRIME_A 3010349UL #define PRIME_B 3571UL void CmiDirect_manytomany_initialize_send ( void * h, - unsigned 
tag, + unsigned tag, unsigned idx, unsigned displ, unsigned bytes, unsigned pe ) { -#if CMK_SMP && !CMK_ENABLE_ASYNC_PROGRESS - CmiAbort("!!!!!!!!!Please build Charm++ with async in order to use many-to-many interface\n"); -#else PAMICmiDirectM2mHandle *handle = &(((PAMICmiDirectM2mHandle *) h)[tag]); - assert ( tag < MAX_CONN ); - handle->m2m_sndlens [idx] = bytes; - handle->m2m_sdispls [idx] = displ; - + assert ( tag < MAX_CONN ); + int lrank = CmiRankOf(pe); - handle->m2m_lranks [idx] = lrank; - pami_endpoint_t target; //get the destination context -#if CMK_PAMI_MULTI_CONTEXT +#if CMK_PAMI_MULTI_CONTEXT size_t dst_context = (lrank>>LTPS); #else size_t dst_context = 0; #endif - PAMI_Endpoint_create (cmi_pami_client, (pami_task_t)CmiNodeOf(pe), + PAMI_Endpoint_create (cmi_pami_client, (pami_task_t)CmiNodeOf(pe), dst_context, &target); - handle->m2m_node_eps [idx] = target; - //uint64_t p_rand = ((uint64_t)idx+1)*PRIME_A + PRIME_B*(CmiMyPe()+1); unsigned seed = CmiMyPe()+1; //start at a random location and move linearly from there - uint64_t p_rand = rand_r(&seed) + idx + 1; - //uint64_t p_rand = (uint64_t)idx + 1 + CmiMyPe(); - //uint64_t p_rand = idx + 1; - handle->m2m_permutation[idx] = (uint32_t)(p_rand%handle->m2m_nsndranks); - handle->m2m_hdrs[idx].connid = tag; - handle->m2m_hdrs[idx].dstrank = lrank; - handle->m2m_hdrs[idx].srcindex = handle->m2m_srankIndex; -#endif -} - -static void _internal_machine_send ( pami_context_t context, - pami_endpoint_t target_ep, - int rank, - int hdrsize, - char * hdr, - int size, - char * msg, - pami_event_function cb_done, - void * cd) -{ - if (size < 128) { - pami_send_immediate_t parameters; - parameters.dispatch = (size == 8)? 
M2M_PAMI_S8DISPATCH : M2M_PAMI_SDISPATCH; - //parameters.dispatch = M2M_PAMI_SDISPATCH; - parameters.header.iov_base = hdr; - parameters.header.iov_len = hdrsize; - parameters.data.iov_base = msg; - parameters.data.iov_len = size; - parameters.dest = target_ep; - - PAMI_Send_immediate (context, &parameters); - //if (cb_done) - //cb_done (context, cd, PAMI_SUCCESS); - } - else { - pami_send_t parameters; - parameters.send.dispatch = M2M_PAMI_DISPATCH; - parameters.send.header.iov_base = hdr; - parameters.send.header.iov_len = hdrsize; - parameters.send.data.iov_base = msg; - parameters.send.data.iov_len = size; - parameters.events.cookie = cd; - parameters.events.local_fn = cb_done; - parameters.events.remote_fn = NULL; - memset(&parameters.send.hints, 0, sizeof(parameters.send.hints)); - parameters.send.dest = target_ep; - - PAMI_Send (context, &parameters); - } + //uint64_t p_rand = rand_r(&seed) + idx + 1; + uint64_t p_rand = ((uint64_t)idx+1)*PRIME_A + PRIME_B*(CmiMyPe()+1); + uint32_t pidx = (uint32_t)(p_rand%handle->m2m_nsndranks); + + char *buffer = handle->m2m_sndbuf + displ; + handle->m2m_sndinfo[pidx].buf = buffer; + handle->m2m_sndinfo[pidx].bytes = bytes; + handle->m2m_sndinfo[pidx].ep = target; + handle->m2m_sndinfo[pidx].hdr.connid = tag; + handle->m2m_sndinfo[pidx].hdr.dstrank = lrank; + handle->m2m_sndinfo[pidx].hdr.srcindex = handle->m2m_srankIndex; + + if (bytes == 8) + handle->m2m_sndinfo[pidx].dispatch = M2M_PAMI_S8DISPATCH; + else if (bytes < 128) + handle->m2m_sndinfo[pidx].dispatch = M2M_PAMI_SDISPATCH; + else + handle->m2m_sndinfo[pidx].dispatch = M2M_PAMI_DISPATCH; } pami_result_t _cmidirect_m2m_send_post_handler (pami_context_t context, - void * cd) + void * cd) { PAMI_M2mWork_t *work = (PAMI_M2mWork_t *) cd; PAMICmiDirectM2mHandle *handle = (PAMICmiDirectM2mHandle *)work->handle; - + +#if CMK_TRACE_ENABLED + double starttime = CmiWallTimer(); +#endif + int i = 0; - int pidx = 0; - char *buffer = NULL; - int bytes = NULL; + CmiAssert(handle->m2m_sdone 
== NULL); + pami_send_t parameters; - pami_event_function cb_done = m2m_send_done; - void *clientdata = handle; + parameters.send.header.iov_len = sizeof(PAMI_M2mHeader); + parameters.events.cookie = NULL; + parameters.events.local_fn = NULL; + parameters.events.remote_fn = NULL; + memset(&parameters.send.hints, 0, sizeof(parameters.send.hints)); - if (handle->m2m_sdone == NULL) { - cb_done = NULL; - clientdata = NULL; + for (i = work->start; i < work->end; ++i) { + M2mSendInfo *sndinfo = &handle->m2m_sndinfo[i]; + parameters.send.data.iov_base = sndinfo->buf; + parameters.send.data.iov_len = sndinfo->bytes; + parameters.send.dest = sndinfo->ep; + parameters.send.header.iov_base = &sndinfo->hdr; + parameters.send.dispatch = sndinfo->dispatch; + + if (sndinfo->bytes < 128) + PAMI_Send_immediate(context, &parameters.send); + else + PAMI_Send (context, &parameters); } - for (i = work->start; i < work->end; ++i) { - pidx = handle->m2m_permutation[i]; - buffer = handle->m2m_sndbuf + handle->m2m_sdispls[pidx]; - bytes = handle->m2m_sndlens[pidx]; - - _internal_machine_send(context, - handle->m2m_node_eps[pidx], - handle->m2m_lranks[pidx], - sizeof(PAMI_M2mHeader), - (char*)&(handle->m2m_hdrs[pidx]), - bytes, - buffer, - cb_done, - clientdata); - } +#if CMK_TRACE_ENABLED + traceUserBracketEvent(30006, starttime, CmiWallTimer()); +#endif return PAMI_SUCCESS; } void _cmidirect_m2m_initialize (pami_context_t *contexts, int nc) { - pami_dispatch_hint_t options = (pami_dispatch_hint_t) {0}; + pami_dispatch_hint_t soptions = (pami_dispatch_hint_t) {0}; + pami_dispatch_hint_t loptions = (pami_dispatch_hint_t) {0}; + + soptions.long_header = PAMI_HINT_DISABLE; + soptions.recv_immediate = PAMI_HINT_ENABLE; + soptions.use_rdma = PAMI_HINT_DISABLE; + + loptions.long_header = PAMI_HINT_DISABLE; + loptions.recv_contiguous = PAMI_HINT_ENABLE; + loptions.recv_copy = PAMI_HINT_ENABLE; + pami_dispatch_callback_function pfn; int i = 0; for (i = 0; i < nc; ++i) { @@ -503,57 +483,71 @@ void 
_cmidirect_m2m_initialize (pami_context_t *contexts, int nc) { M2M_PAMI_DISPATCH, pfn, NULL, - options); + loptions); pfn.p2p = m2m_spkt_dispatch; PAMI_Dispatch_set (contexts[i], M2M_PAMI_SDISPATCH, pfn, NULL, - options); + soptions); pfn.p2p = m2m_s8_dispatch; PAMI_Dispatch_set (contexts[i], M2M_PAMI_S8DISPATCH, pfn, NULL, - options); + soptions); } } void CmiDirect_manytomany_start ( void * h, unsigned tag ) { -#if CMK_SMP && !CMK_ENABLE_ASYNC_PROGRESS - CmiAbort("!!!!!!!!!Please build Charm++ with async in order to use many-to-many interface\n"); -#else PAMICmiDirectM2mHandle *handle = &(((PAMICmiDirectM2mHandle *) h)[tag]); assert (tag < MAX_CONN); - //printf ("Calling manytomany_start for conn %d handle %p on rank %d\n", tag, + //printf ("Calling manytomany_start for conn %d handle %p on rank %d\n", tag, // handle, CmiMyPe()); - -#if CMK_SMP && CMK_ENABLE_ASYNC_PROGRESS + +#if M2M_PARALLEL_CONTEXT //we have a completion callback if (handle->m2m_sdone != NULL) { - PAMI_Context_post ( handle->swork[0].context, - &handle->swork[0].work, - _cmidirect_m2m_send_post_handler, - &handle->swork[0]); + PAMI_Context_post ( handle->swork[0].context, + &handle->swork[0].work, + _cmidirect_m2m_send_post_handler, + &handle->swork[0]); } else { int i; - for (i = 0; i < handle->n_work; ++i) { - PAMI_Context_post( handle->swork[i].context, - &handle->swork[i].work, - _cmidirect_m2m_send_post_handler, - &handle->swork[i]); - } +#if CMK_TRACE_ENABLED + double starttime = CmiWallTimer(); +#endif + for (i = 0; i < handle->n_work; ++i) +#if !CMK_ENABLE_ASYNC_PROGRESS + if (handle->swork[i].context != MY_CONTEXT()) +#endif + PAMI_Context_post( handle->swork[i].context, + &handle->swork[i].work, + _cmidirect_m2m_send_post_handler, + &handle->swork[i]); + +#if CMK_TRACE_ENABLED + traceUserBracketEvent(30007, starttime, CmiWallTimer()); +#endif + +#if !CMK_ENABLE_ASYNC_PROGRESS + for (i = 0; i < handle->n_work; ++i) + if (handle->swork[i].context == MY_CONTEXT()) { + 
PAMIX_CONTEXT_LOCK(MY_CONTEXT()); + _cmidirect_m2m_send_post_handler (MY_CONTEXT(), &handle->swork[i]); + PAMIX_CONTEXT_UNLOCK(MY_CONTEXT()); + } +#endif } #else PAMIX_CONTEXT_LOCK(MY_CONTEXT()); _cmidirect_m2m_send_post_handler (MY_CONTEXT(), &handle->swork[0]); PAMIX_CONTEXT_UNLOCK(MY_CONTEXT()); #endif -#endif } diff --git a/src/arch/pami/memalloc.c b/src/arch/pami/memalloc.c new file mode 100755 index 0000000000..55fc45d135 --- /dev/null +++ b/src/arch/pami/memalloc.c @@ -0,0 +1,144 @@ + +#include + +#define ALIGNMENT 32 +#define SMSG_SIZE 4096 +#define N_SMSG_ELEM 4096 +#define MMSG_SIZE 16384 +#define N_MMSG_ELEM 2048 +#define LLMSG_SIZE 65536 +#define N_LLMSG_ELEM 1024 + +#if CMK_BLUEGENEQ +#include +#endif + +PPCAtomicQueue *sPPCMemallocVec; +PPCAtomicQueue *mPPCMemallocVec; +PPCAtomicQueue *llPPCMemallocVec; + +typedef struct CmiMemAllocHdr_ppcq_t { + int rank; + int size; + //Align the application buffer to 32 bytes + char dummy[ALIGNMENT - sizeof(CmiChunkHeader) - 2*sizeof(int)]; +} CmiMemAllocHdr_ppcq; + +static int _nodeStart; +extern int Cmi_nodestart; /* First processor in this address space */ + +void *CmiAlloc_ppcq (int size) { + CmiMemAllocHdr_ppcq *hdr = NULL; + char *buf; +#if CMK_TRACE_ENABLED + double start = CmiWallTimer(); +#endif + +#if CMK_BLUEGENEQ + //Comm threads are hidden on BG/Q + int myrank = Kernel_ProcessorID() - _nodeStart; +#else + int myrank = CmiMyRank(); +#endif + + if (size <= SMSG_SIZE) { + hdr = PPCAtomicDequeue (&sPPCMemallocVec[myrank]); + if (hdr == NULL) + hdr = (CmiMemAllocHdr_ppcq *) + malloc_nomigrate(SMSG_SIZE + sizeof(CmiMemAllocHdr_ppcq)); + hdr->size = SMSG_SIZE; + } + else if (size <= MMSG_SIZE) { + hdr = PPCAtomicDequeue (&mPPCMemallocVec[myrank]); + if (hdr == NULL) + hdr = (CmiMemAllocHdr_ppcq *) + malloc_nomigrate(MMSG_SIZE + sizeof(CmiMemAllocHdr_ppcq)); + hdr->size = MMSG_SIZE; + } + else if (size <= LLMSG_SIZE) { + hdr = PPCAtomicDequeue (&llPPCMemallocVec[myrank]); + if (hdr == NULL) + hdr = 
(CmiMemAllocHdr_ppcq *) + malloc_nomigrate(LLMSG_SIZE + sizeof(CmiMemAllocHdr_ppcq)); + hdr->size = LLMSG_SIZE; + } + else { + hdr = (CmiMemAllocHdr_ppcq *) + malloc_nomigrate(size + sizeof(CmiMemAllocHdr_ppcq)); + hdr->size = size; + } + + hdr->rank = myrank; + buf = (char*)hdr + sizeof(CmiMemAllocHdr_ppcq); + +#if CMK_TRACE_ENABLED + traceUserBracketEvent(30001, start, CmiWallTimer()); +#endif + + return buf; +} + +void CmiFree_ppcq (void *buf) { + CmiMemAllocHdr_ppcq *hdr = (CmiMemAllocHdr_ppcq *)((char*)buf - sizeof(CmiMemAllocHdr_ppcq)); + int rc = CMI_PPCQ_EAGAIN; + +#if CMK_TRACE_ENABLED + double start = CmiWallTimer(); +#endif + + if (hdr->size == SMSG_SIZE) + rc = PPCAtomicEnqueue (&sPPCMemallocVec[hdr->rank], hdr); + else if (hdr->size == MMSG_SIZE) + rc = PPCAtomicEnqueue (&mPPCMemallocVec[hdr->rank], hdr); + else if (hdr->size == LLMSG_SIZE) + rc = PPCAtomicEnqueue (&llPPCMemallocVec[hdr->rank], hdr); + + if (rc == CMI_PPCQ_EAGAIN) + //queues are full or large buf + free_nomigrate(hdr); + +#if CMK_TRACE_ENABLED + traceUserBracketEvent(30002, start, CmiWallTimer()); +#endif +} + +void CmiMemAllocInit_ppcq (void * atomic_mem, + size_t atomic_memsize) +{ + int i = 0; +#if CMK_BLUEGENEQ + int node_size = 64/Kernel_ProcessCount(); + _nodeStart = node_size * Kernel_MyTcoord(); +#else + int node_size = CmiMyNodeSize(); + _nodeStart = Cmi_nodestart; +#endif + + //We want to align headers to 32 bytes + CmiAssert(sizeof(CmiMemAllocHdr_ppcq)+sizeof(CmiChunkHeader) == ALIGNMENT); + + CmiAssert (atomic_memsize >= 3 * node_size * sizeof(PPCAtomicState)); + sPPCMemallocVec = (PPCAtomicQueue *)malloc_nomigrate(sizeof(PPCAtomicQueue)*node_size); + mPPCMemallocVec = (PPCAtomicQueue *)malloc_nomigrate(sizeof(PPCAtomicQueue)*node_size); + llPPCMemallocVec = (PPCAtomicQueue *)malloc_nomigrate(sizeof(PPCAtomicQueue)*node_size); + + for (i = 0; i < node_size; ++i) { + PPCAtomicQueueInit ((char *)atomic_mem + 3*i*sizeof(PPCAtomicState), + sizeof(PPCAtomicState), + 
&sPPCMemallocVec[i], + 0, /*No Overflow*/ + N_SMSG_ELEM ); + + PPCAtomicQueueInit ((char *)atomic_mem + (3*i+1)*sizeof(PPCAtomicState), + sizeof(PPCAtomicState), + &mPPCMemallocVec[i], + 0, + N_MMSG_ELEM ); + + PPCAtomicQueueInit ((char *)atomic_mem + (3*i+2)*sizeof(PPCAtomicState), + sizeof(PPCAtomicState), + &llPPCMemallocVec[i], + 0, + N_LLMSG_ELEM ); + } +} diff --git a/src/conv-core/convcore.c b/src/conv-core/convcore.c index 32c0b6690e..278104c090 100644 --- a/src/conv-core/convcore.c +++ b/src/conv-core/convcore.c @@ -212,11 +212,16 @@ void infi_freeMultipleSend(void *ptr); void infi_unregAndFreeMeta(void *ch); #endif -#if CMK_SMP && CMK_BLUEGENEQ && (CMK_USE_L2ATOMICS || SPECIFIC_PCQUEUE) +#if CMK_SMP && CMK_BLUEGENEQ && SPECIFIC_PCQUEUE void * CmiAlloc_bgq (int size); void CmiFree_bgq (void * buf); #endif +#if CMK_SMP && CMK_PPC_ATOMIC_QUEUE +void * CmiAlloc_ppcq (int size); +void CmiFree_ppcq (void * buf); +#endif + #if CMK_GRID_QUEUE_AVAILABLE CpvDeclare(void *, CkGridObject); CpvDeclare(void *, CsdGridQueue); @@ -1290,6 +1295,149 @@ double CmiTimer() #endif +#if CMK_TIMER_USE_PPC64 + +#include +#include + +#define SPRN_TBRU 0x10D +#define SPRN_TBRL 0x10C + +CpvStaticDeclare(uint64_t, inittime); +CpvStaticDeclare(double, clocktick); + +int CmiTimerIsSynchronized() +{ + return 1; +} + +int CmiTimerAbsolute() +{ + return 0; +} + +double CmiStartTimer() +{ + return 0.0; +} + +double CmiInitTime() +{ + return CpvAccess(inittime); +} + +static inline uint64_t PPC64_TimeBase() +{ + unsigned temp; + union + { +#if __BYTE_ORDER == __LITTLE_ENDIAN + struct { unsigned lo, hi; } w; +#else +#warning "PPC64 Is BigEndian" + struct { unsigned hi, lo; } w; +#endif + uint64_t d; + } result; + + do { + asm volatile ("mfspr %0,%1" : "=r" (temp) : "i" (SPRN_TBRU)); + asm volatile ("mfspr %0,%1" : "=r" (result.w.lo) : "i" (SPRN_TBRL)); + asm volatile ("mfspr %0,%1" : "=r" (result.w.hi) : "i" (SPRN_TBRU)); + } + while (temp != result.w.hi); + + return result.d; +} + +uint64_t 
__micro_timer () { + struct timeval tv; + gettimeofday( &tv, 0 ); + return tv.tv_sec * 1000000ULL + tv.tv_usec; +} + +void CmiTimerInit(char **argv) +{ + CpvInitialize(double, clocktick); + CpvInitialize(unsigned long, inittime); + + //Initialize PPC64 timers + + uint64_t sampleTime = 100ULL; //sample time in usec + uint64_t timeStart = 0ULL, timeStop = 0ULL; + uint64_t startBase = 0ULL, endBase = 0ULL; + uint64_t overhead = 0ULL, tbf = 0ULL, tbi = 0ULL; + uint64_t ticks = 0ULL; + int iter = 0ULL; + + do { + tbi = PPC64_TimeBase(); + tbf = PPC64_TimeBase(); + tbi = PPC64_TimeBase(); + tbf = PPC64_TimeBase(); + + overhead = tbf - tbi; + timeStart = __micro_timer(); + + //wait for system time to change + while (__micro_timer() == timeStart) + timeStart = __micro_timer(); + + while (1) { + timeStop = __micro_timer(); + if ((timeStop - timeStart) > 1) { + startBase = PPC64_TimeBase(); + break; + } + } + timeStart = timeStop; + + while (1) { + timeStop = __micro_timer(); + if ((timeStop - timeStart) > sampleTime) { + endBase = PPC64_TimeBase(); + break; + } + } + + ticks = ((endBase - startBase) + (overhead)); + iter++; + if (iter == 10ULL) + CmiAbort("Warning: unable to initialize high resolution timer.\n"); + + } while (endBase <= startBase); + + CpvAccess (clocktick) = (1e-6) / ((double)ticks/(double)sampleTime); + + /* try to synchronize calling barrier */ +#if !(__FAULT__) + CmiBarrier(); + CmiBarrier(); + CmiBarrier(); +#endif + CpvAccess(inittime) = PPC64_TimeBase (); +} + +double CmiWallTimer() +{ + uint64_t currenttime; + currenttime = PPC64_TimeBase(); + return CpvAccess(clocktick)*(currenttime-CpvAccess(inittime)); +} + +double CmiCpuTimer() +{ + return CmiWallTimer(); +} + +double CmiTimer() +{ + return CmiWallTimer(); +} + +#endif + + #if CMK_TIMER_USE_WIN32API CpvStaticDeclare(double, inittime_wallclock); @@ -2877,8 +3025,10 @@ void *CmiAlloc(int size) res =(char *) CmiPoolAlloc(size+sizeof(CmiChunkHeader)); #elif USE_MPI_CTRLMSG_SCHEME && CMK_CONVERSE_MPI 
MPI_Alloc_mem(size+sizeof(CmiChunkHeader), MPI_INFO_NULL, &res); -#elif CMK_SMP && CMK_BLUEGENEQ && (CMK_USE_L2ATOMICS || SPECIFIC_PCQUEUE) +#elif CMK_SMP && CMK_BLUEGENEQ && SPECIFIC_PCQUEUE res = (char *) CmiAlloc_bgq(size+sizeof(CmiChunkHeader)); +#elif CMK_SMP && CMK_PPC_ATOMIC_QUEUE + res = (char *) CmiAlloc_ppcq(size+sizeof(CmiChunkHeader)); #else res =(char *) malloc_nomigrate(size+sizeof(CmiChunkHeader)); #endif @@ -2980,8 +3130,10 @@ void CmiFree(void *blk) CmiPoolFree(BLKSTART(parentBlk)); #elif USE_MPI_CTRLMSG_SCHEME && CMK_CONVERSE_MPI MPI_Free_mem(parentBlk); -#elif CMK_SMP && CMK_BLUEGENEQ && (CMK_USE_L2ATOMICS || SPECIFIC_PCQUEUE) +#elif CMK_SMP && CMK_BLUEGENEQ && SPECIFIC_PCQUEUE CmiFree_bgq(BLKSTART(parentBlk)); +#elif CMK_SMP && CMK_PPC_ATOMIC_QUEUE + CmiFree_ppcq(BLKSTART(parentBlk)); #else free_nomigrate(BLKSTART(parentBlk)); #endif diff --git a/src/conv-core/converse.h b/src/conv-core/converse.h index 5d80c4f0a4..c5b3c235d8 100644 --- a/src/conv-core/converse.h +++ b/src/conv-core/converse.h @@ -60,6 +60,7 @@ #include #include #include +#include /* Paste the tokens x and y together, without any space between them. 
The ANSI C way to do this is the bizarre ## "token-pasting" @@ -1936,9 +1937,14 @@ extern CmiNodeLock cmiMemoryLock; #define CmiMemoryReadFence() __asm__ __volatile__("mf" ::: "memory") #define CmiMemoryWriteFence() __asm__ __volatile__("mf" ::: "memory") #elif CMK_PPC_ASM +#if CMK_BLUEGENEQ #define CmiMemoryReadFence() __asm__ __volatile__("sync":::"memory") #define CmiMemoryWriteFence() __asm__ __volatile__("sync":::"memory") #else +#define CmiMemoryReadFence() __asm__ __volatile__("isync":::"memory") +#define CmiMemoryWriteFence() __asm__ __volatile__("lwsync":::"memory") +#endif +#else #define CMK_NO_ASM_AVAILABLE 1 extern CmiNodeLock cmiMemoryLock; #define CmiMemoryReadFence() { CmiLock(cmiMemoryLock); CmiUnlock(cmiMemoryLock); } diff --git a/src/conv-core/cpuaffinity.c b/src/conv-core/cpuaffinity.c index 8e50c1c5fa..93ca4931ea 100644 --- a/src/conv-core/cpuaffinity.c +++ b/src/conv-core/cpuaffinity.c @@ -595,6 +595,10 @@ void CmiInitCPUAffinity(char **argv) if (pemap!=NULL || commap!=NULL) affinity_flag = 1; +#if CMK_PAMI_LINUX_PPC8 + affinity_flag = 1; +#endif + show_affinity_flag = CmiGetArgFlagDesc(argv,"+showcpuaffinity", "print cpu affinity"); @@ -649,13 +653,13 @@ void CmiInitCPUAffinity(char **argv) } else { /* if (CmiSetCPUAffinity(CmiNumCores()-1) == -1) CmiAbort("set_cpu_affinity abort!"); */ -#if !CMK_CRAYXT && !CMK_CRAYXE && !CMK_CRAYXC && !CMK_BLUEGENEQ +#if !CMK_CRAYXT && !CMK_CRAYXE && !CMK_CRAYXC && !CMK_BLUEGENEQ && !CMK_PAMI_LINUX_PPC8 if (pemap == NULL) { #if CMK_MACHINE_PROGRESS_DEFINED while (affinity_doneflag < CmiMyNodeSize()) CmiNetworkProgress(); #else #if CMK_SMP - #error "Machine progress call needs to be implemented for cpu affinity!" + #error "Machine progress call needs to be implemented for cpu affinity!" 
#endif #endif }
@@ -727,6 +731,53 @@ void CmiInitCPUAffinity(char **argv)
     if (CmiMyPe() < CmiNumPes())
     CmiNodeAllBarrier();
     CmiNodeAllBarrier();
+#elif CMK_SMP && CMK_PAMI_LINUX_PPC8
+#define CMK_PAMI_LINUX_PPC8_CORES_PER_NODE 20
+#define CMK_PAMI_LINUX_PPC8_THREADS_PER_CORE 8
+#define CMK_PAMI_LINUX_PPC8_SKIP_CORE_0 0
+  /* Default pinning for this port: the node's cores are split into two halves
+   * (presumably one per socket -- TODO confirm) and the caller is bound to
+   * logical CPUs cpu and cpu+1 of the core chosen from its PE number. */
+  int cores_per_node = CMK_PAMI_LINUX_PPC8_CORES_PER_NODE;
+  int threads_per_core = CMK_PAMI_LINUX_PPC8_THREADS_PER_CORE;
+
+  CmiGetArgInt(argv,"+cores_per_node", &cores_per_node);
+  CmiGetArgInt(argv,"+threads_per_core", &threads_per_core);
+
+  /* cores_per_node/2 is used as a modulus below, so it must be at least 1 */
+  if (cores_per_node < 2 || threads_per_core < 1)
+    CmiAbort("+cores_per_node must be >= 2 and +threads_per_core must be >= 1");
+
+  /* use the runtime value everywhere so that +cores_per_node takes full effect */
+  int half_node = cores_per_node/2;
+  int my_core = CmiMyPe() % cores_per_node;
+  int my_core_2 = CmiMyPe() % half_node;
+#if CMK_PAMI_LINUX_PPC8_SKIP_CORE_0
+  my_core_2 = (my_core_2 + 1) % half_node;
+#endif
+
+  int cpu = 0;
+  if (my_core < half_node)
+    cpu = my_core_2 * threads_per_core;
+  else
+    cpu = (my_core_2 + half_node) * threads_per_core;
+
+  cpu_set_t cset;
+  CPU_ZERO(&cset);
+  CPU_SET(cpu, &cset);
+  CPU_SET(cpu+1, &cset);
+  if(sched_setaffinity(0, sizeof(cpu_set_t), &cset) < 0)
+    perror("sched_setaffinity");
+
+  /* the mask is read back here but not examined afterwards */
+  CPU_ZERO(&cset);
+  if (sched_getaffinity(0, sizeof(cset), &cset) < 0)
+    perror("sched_getaffinity");
+
+  sched_yield();
+  if(CmiMyPe() == 0)
+    printf("Setting default affinity\n");
+  return;
 #else
     /* get my ip address */
     if (CmiMyRank() == 0)
-- 
2.11.4.GIT