From e6754908a3a8587bc7bc1f6fe0e5269924127066 Mon Sep 17 00:00:00 2001
From: Sameer Kumar <sameerk@us.ibm.com>
Date: Tue, 2 Feb 2016 11:20:33 -0600
Subject: [PATCH] Charm's port for newer IBM systems

This patch contains Sameer's port for running Charm on top of newer IBM systems.
The base layer is PAMI, which should also work on current Blue Gene/Q systems.
Note: these changes are for the non-LRTS versions, and would need to be ported
to an LRTS version.

Change-Id: I11c4eca8fdc761a862e570e701a67c1c8fcc9bf2
---
 src/arch/pami-bluegeneq/L2AtomicMutex.h            |  75 ---
 src/arch/pami-bluegeneq/L2AtomicQueue.h            | 205 ---------
 src/arch/{pami => pami-bluegeneq}/Makefile.machine |   0
 src/arch/pami-bluegeneq/conv-mach-smp.h            |   5 +-
 src/arch/pami-bluegeneq/conv-mach.h                |  11 +
 src/arch/pami-bluegeneq/conv-mach.sh               |   6 +-
 src/arch/pami-bluegeneq/memalloc.c                 | 126 -----
 src/arch/pami-bluegeneq/ppc_atomicq_impl.h         |  50 ++
 src/arch/pami-linux-ppc64le/cc-gcc.h               |   1 +
 src/arch/pami-linux-ppc64le/cc-gcc.sh              |  29 ++
 src/arch/pami-linux-ppc64le/cc-xlc.h               |   1 +
 src/arch/pami-linux-ppc64le/cc-xlc.sh              |   1 +
 src/arch/pami-linux-ppc64le/charmrun               | 302 ++++++++++++
 .../conv-mach-smp.h                                |  13 +-
 src/arch/pami-linux-ppc64le/conv-mach-smp.sh       |   0
 .../conv-mach.h                                    |  48 +-
 src/arch/pami-linux-ppc64le/conv-mach.sh           |  56 +++
 src/arch/pami/Makefile.machine                     |   7 -
 src/arch/pami/PPCAtomicMutex.h                     |  82 ++++
 src/arch/pami/PPCAtomicQueue.h                     | 210 +++++++++
 src/arch/pami/conv-common.h                        |   5 +-
 src/arch/pami/default_ppcq.h                       |  98 ++++
 src/arch/pami/machine.c                            | 511 +++++++++++++--------
 src/arch/pami/manytomany.c                         | 428 +++++++++--------
 src/arch/pami/memalloc.c                           | 144 ++++++
 src/conv-core/convcore.c                           | 158 ++++++-
 src/conv-core/converse.h                           |   6 +
 src/conv-core/cpuaffinity.c                        |  45 +-
 28 files changed, 1744 insertions(+), 879 deletions(-)
 delete mode 100644 src/arch/pami-bluegeneq/L2AtomicMutex.h
 delete mode 100644 src/arch/pami-bluegeneq/L2AtomicQueue.h
 copy src/arch/{pami => pami-bluegeneq}/Makefile.machine (100%)
 delete mode 100644 src/arch/pami-bluegeneq/memalloc.c
 create mode 100644 src/arch/pami-bluegeneq/ppc_atomicq_impl.h
 create mode 100644 src/arch/pami-linux-ppc64le/cc-gcc.h
 create mode 100644 src/arch/pami-linux-ppc64le/cc-gcc.sh
 create mode 100644 src/arch/pami-linux-ppc64le/cc-xlc.h
 create mode 100644 src/arch/pami-linux-ppc64le/cc-xlc.sh
 create mode 100755 src/arch/pami-linux-ppc64le/charmrun
 copy src/arch/{pami-bluegeneq => pami-linux-ppc64le}/conv-mach-smp.h (63%)
 create mode 100644 src/arch/pami-linux-ppc64le/conv-mach-smp.sh
 copy src/arch/{pami-bluegeneq => pami-linux-ppc64le}/conv-mach.h (57%)
 create mode 100644 src/arch/pami-linux-ppc64le/conv-mach.sh
 create mode 100755 src/arch/pami/PPCAtomicMutex.h
 create mode 100755 src/arch/pami/PPCAtomicQueue.h
 create mode 100644 src/arch/pami/default_ppcq.h
 create mode 100755 src/arch/pami/memalloc.c

diff --git a/src/arch/pami-bluegeneq/L2AtomicMutex.h b/src/arch/pami-bluegeneq/L2AtomicMutex.h
deleted file mode 100644
index 4082c51f82..0000000000
--- a/src/arch/pami-bluegeneq/L2AtomicMutex.h
+++ /dev/null
@@ -1,75 +0,0 @@
-
-#ifndef __L2_ATOMIC_MUTEX__
-#define __L2_ATOMIC_MUTEX__
-
-#include <pthread.h>
-#include <stdio.h>
-#include <assert.h>
-#include <stdint.h>
-#include "spi/include/l2/atomic.h"
-#include "spi/include/l1p/flush.h"
-
-#define L2_ATOMIC_MUTEX_FAIL        0x8000000000000000UL
-
-typedef struct
-{
-  volatile uint64_t     counter;
-  volatile uint64_t     bound;
-} L2AtomicMutex;
-
-L2AtomicMutex *L2AtomicMutexInit (void           * l2mem, 
-				  size_t           l2memsize)
-{
-  //Verify counter array is 64-byte aligned 
-  assert( (((uintptr_t) l2mem) & (0x0F)) == 0 );  
-  assert (sizeof(L2AtomicMutex) <= l2memsize);
-
-  L2AtomicMutex *mutex = (L2AtomicMutex*)l2mem;  
-  L2_AtomicStore(&mutex->counter, 0);
-  L2_AtomicStore(&mutex->bound, 1);
-  
-  return mutex;
-}
-
-/**
- *  \brief Try to acquire a mutex 
- *  \param[in]   mutex pointer
- *  \return 0    Lock successfully acquired
- *  \return 1    Lock was not acquired
- */
-static inline int L2AtomicMutexTryAcquire (L2AtomicMutex *mutex)
-{
-  size_t rc = L2_AtomicLoadIncrementBounded(&mutex->counter);
-  return (rc == L2_ATOMIC_MUTEX_FAIL) ? (1) : (0);
-}
-
-/**
- *  \brief Acquire a mutex 
- *  \param[in]   mutex pointer
- *  \return 0    Lock successfully acquired
- */
-static inline void L2AtomicMutexAcquire (L2AtomicMutex *mutex)
-{
-  size_t rc = 0;
-  do {
-    rc = L2_AtomicLoadIncrementBounded(&mutex->counter);
-  } while (rc == L2_ATOMIC_MUTEX_FAIL);
-}
-
-/**
- *  \brief Release a mutex 
- *  \param[in]   mutex pointer 
- *  \return 0    Lock successfully released
- *  \return 1    Fail
- */
-static inline void L2AtomicMutexRelease(L2AtomicMutex *mutex)
-{
-  //Flush outstanding loads/stores
-  ppc_msync();
-  
-  /* Release the lock */
-  L2_AtomicStore(&(mutex->counter), 0);  
-}
-
-
-#endif
diff --git a/src/arch/pami-bluegeneq/L2AtomicQueue.h b/src/arch/pami-bluegeneq/L2AtomicQueue.h
deleted file mode 100644
index d1db471134..0000000000
--- a/src/arch/pami-bluegeneq/L2AtomicQueue.h
+++ /dev/null
@@ -1,205 +0,0 @@
-
-#ifndef __L2_ATOMIC_QUEUE__
-#define __L2_ATOMIC_QUEUE__
-
-#include <pthread.h>
-#include <stdio.h>
-#include <assert.h>
-#include <stdint.h>
-#include "spi/include/l2/atomic.h"
-#include "spi/include/l1p/flush.h"
-#include "pcqueue.h"
-
-#define DEFAULT_SIZE         1024
-#define L2_ATOMIC_FULL        0x8000000000000000UL
-#define L2_ATOMIC_EMPTY       0x8000000000000000UL
-
-#define L2A_SUCCESS  0
-#define L2A_EAGAIN  -1
-#define L2A_FAIL    -2
-
-#define __L2_ATOMIC_QUEUE_BLOCKING  1
-
-typedef  void* L2AtomicQueueElement;
-
-typedef struct _l2atomicstate {
-  volatile uint64_t Consumer;	// not used atomically
-  volatile uint64_t Producer;
-  volatile uint64_t UpperBound;
-  volatile uint64_t Flush;	// contents not used
-} L2AtomicState;
-
-typedef struct _l2atomicq {
-  L2AtomicState               * _l2state;
-  volatile void * volatile    * _array;
-  int                           _useOverflowQ;
-  int                           _qsize;
-  PCQueue                       _overflowQ;
-  pthread_mutex_t               _overflowMutex;
-} L2AtomicQueue;
-
-void L2AtomicQueueInit      (void           * l2mem, 
-			     size_t           l2memsize, 
-			     L2AtomicQueue  * queue,
-			     int              use_overflow,
-			     int              nelem) 
-{
-  pami_result_t rc;
-  
-  //Verify counter array is 64-byte aligned 
-  assert( (((uintptr_t) l2mem) & (0x1F)) == 0 );  
-  assert (sizeof(L2AtomicState) <= l2memsize);
-  
-  queue->_useOverflowQ = use_overflow;
-
-  int qsize = 2;
-  while (qsize < nelem) 
-    qsize *= 2;
-  queue->_qsize = qsize;
-
-  queue->_l2state = (L2AtomicState *)l2mem;
-  pthread_mutex_init(&queue->_overflowMutex, NULL);
-  queue->_overflowQ = PCQueueCreate();
-  L2_AtomicStore(&queue->_l2state->Consumer, 0);
-  L2_AtomicStore(&queue->_l2state->Producer, 0);
-  L2_AtomicStore(&queue->_l2state->UpperBound, qsize);
-  
-  rc = posix_memalign ((void **)&queue->_array,
-		       64, /*L1 line size for BG/Q */
-		       sizeof(L2AtomicQueueElement) * qsize);
-
-  assert(rc == PAMI_SUCCESS);
-  memset((void*)queue->_array, 0, sizeof(L2AtomicQueueElement)*qsize);
-}
-
-int L2AtomicEnqueue (L2AtomicQueue          * queue,
-		     void                   * element) 
-{
-  //fprintf(stderr,"Insert message %p\n", element);
-
-  register int qsize_1 = queue->_qsize - 1;
-  uint64_t index = L2_AtomicLoadIncrementBounded(&queue->_l2state->Producer);
-  L1P_FlushRequests();
-  if (index != L2_ATOMIC_FULL) {
-    queue->_array[index & qsize_1] = element;
-    return L2A_SUCCESS;
-  }
-  
-  //We dont want to use the overflow queue
-  if (!queue->_useOverflowQ)
-    return L2A_EAGAIN; //Q is full, try later
-  
-  //No ordering is guaranteed if there is overflow
-  pthread_mutex_lock(&queue->_overflowMutex);
-  PCQueuePush(queue->_overflowQ, element);
-  pthread_mutex_unlock(&queue->_overflowMutex);
-  
-  return L2A_SUCCESS;
-}
-
-void * L2AtomicDequeue (L2AtomicQueue    *queue)
-{
-  uint64_t head, tail;
-  tail = queue->_l2state->Producer;
-  head = queue->_l2state->Consumer;
-  register int qsize_1 = queue->_qsize-1;
-
-  volatile void *e = NULL;
-  if (head < tail) {    
-    e = queue->_array[head & qsize_1];
-#if __L2_ATOMIC_QUEUE_BLOCKING
-    while (e == NULL) 
-      e = queue->_array[head & qsize_1];
-#else
-    if (e == NULL)
-      return NULL;
-#endif
-
-    queue->_array[head & qsize_1] = NULL;
-    ppc_msync();
-
-    head ++;
-    queue->_l2state->Consumer = head;    
-    
-    //Charm++ does not require message ordering
-    //So we dont acquire overflow mutex here
-    uint64_t n = head + queue->_qsize;
-    // is atomic-store needed?
-    L2_AtomicStore(&queue->_l2state->UpperBound, n);
-    return (void*) e;
-  }
-
-  //We dont have an overflowQ
-  if (!queue->_useOverflowQ)
-    return NULL;
-  
-  /* head == tail (head cannot be greater than tail) */
-  if (PCQueueLength(queue->_overflowQ) > 0) {
-    pthread_mutex_lock(&queue->_overflowMutex);      
-    e = PCQueuePop (queue->_overflowQ);    
-    pthread_mutex_unlock(&queue->_overflowMutex);      
-    
-    return (void *) e;
-  }
-
-  return (void *) e;
-}
-
-int L2AtomicQueueEmpty (L2AtomicQueue *queue) {
-  return ( (PCQueueLength(queue->_overflowQ) == 0) &&
-	   (queue->_l2state->Producer == queue->_l2state->Consumer) );
-}
-
-//spin block in the L2 atomic queue till there is a message. fail and
-//return after n iterations
-int L2AtomicQueueSpinWait (L2AtomicQueue    * queue,
-			   int                n)
-{
-  if (!L2AtomicQueueEmpty(queue))
-    return 0;  //queue is not empty so return
-  
-  uint64_t head, tail;
-  head = queue->_l2state->Consumer;
-  
-  size_t i = n;
-  do {
-    tail = queue->_l2state->Producer;    
-    i--;
-  }
-  //While the queue is empty and i < n
-  while (head == tail && i != 0);
-  
-  return 0; //fail queue is empty
-}
-
-//spin block in the L2 atomic queue till there is a message. fail and
-//return after n iterations
-int L2AtomicQueue2QSpinWait (L2AtomicQueue    * queue0,
-			     L2AtomicQueue    * queue1,
-			     int                n)
-{
-  if (!L2AtomicQueueEmpty(queue0))
-    return 0;  //queue0 is not empty so return
-  
-  if (!L2AtomicQueueEmpty(queue1))
-    return 0;  //queue is not empty so return  
-
-  uint64_t head0, tail0;
-  uint64_t head1, tail1;
-  
-  head0 = queue0->_l2state->Consumer;  
-  head1 = queue1->_l2state->Consumer;
-  
-  size_t i = n;
-  do {
-    tail0 = queue0->_l2state->Producer;    
-    tail1 = queue1->_l2state->Producer;    
-    i --;
-  } while (head0==tail0 && head1==tail1 && i!=0);   
- 
-  return 0; 
-}
-
-
-
-#endif
diff --git a/src/arch/pami/Makefile.machine b/src/arch/pami-bluegeneq/Makefile.machine
similarity index 100%
copy from src/arch/pami/Makefile.machine
copy to src/arch/pami-bluegeneq/Makefile.machine
diff --git a/src/arch/pami-bluegeneq/conv-mach-smp.h b/src/arch/pami-bluegeneq/conv-mach-smp.h
index d6c8f652b6..b84346c86a 100644
--- a/src/arch/pami-bluegeneq/conv-mach-smp.h
+++ b/src/arch/pami-bluegeneq/conv-mach-smp.h
@@ -23,5 +23,8 @@
 
 #define CMK_FAKE_SCHED_YIELD                               1
 
-#define CMK_USE_L2ATOMICS                                  1
+#define CMK_PPC_ATOMIC_QUEUE                               1
+#define CMK_PPC_ATOMIC_MUTEX                               1
 
+//We use an L2 atomic version
+#define CMK_PPC_ATOMIC_DEFAULT_IMPL                        0
diff --git a/src/arch/pami-bluegeneq/conv-mach.h b/src/arch/pami-bluegeneq/conv-mach.h
index c6450474f4..c5df57b019 100644
--- a/src/arch/pami-bluegeneq/conv-mach.h
+++ b/src/arch/pami-bluegeneq/conv-mach.h
@@ -46,6 +46,14 @@
 // This needs to be compiled with gcc only
 #define CMK_TIMER_USE_BLUEGENEQ			           1
 
+#define CMK_TYPEDEF_INT2 short
+#define CMK_TYPEDEF_INT4 int
+#define CMK_TYPEDEF_INT8 long long
+#define CMK_TYPEDEF_UINT2 unsigned short
+#define CMK_TYPEDEF_UINT4 unsigned int
+#define CMK_TYPEDEF_UINT8 unsigned long long
+#define CMK_TYPEDEF_FLOAT4 float
+#define CMK_TYPEDEF_FLOAT8 double
 
 #define CMK_WHEN_PROCESSOR_IDLE_BUSYWAIT                   1
 #define CMK_WHEN_PROCESSOR_IDLE_USLEEP                     0
@@ -64,5 +72,8 @@
 
 #define CMK_NO_ISO_MALLOC                                  1
 
+#undef CMI_DIRECT_MANY_TO_MANY_DEFINED
+#define CMI_DIRECT_MANY_TO_MANY_DEFINED                    1
+
 #endif
 
diff --git a/src/arch/pami-bluegeneq/conv-mach.sh b/src/arch/pami-bluegeneq/conv-mach.sh
index 53969cc124..6187b90c89 100644
--- a/src/arch/pami-bluegeneq/conv-mach.sh
+++ b/src/arch/pami-bluegeneq/conv-mach.sh
@@ -46,8 +46,8 @@ CMK_CC="bgxlc_r -qcpluscmt -qhalt=e -qnokeyword=__int128 -qtls=local-exec"
 CMK_CXXPP="$BGQ_BIN/powerpc64-bgq-linux-g++ -E "
 CMK_GCXX="$BGQ_BIN/powerpc64-bgq-linux-g++ $GCC_OPTS "
 CMK_CF77="bgxlf_r "
-CMK_CF90="bgxlf90_r  -qsuffix=f=f90" 
-CMK_CF90_FIXED="bgxlf90_r " 
+CMK_CF90="bgxlf90_r  -qsuffix=f=f90"
+CMK_CF90_FIXED="bgxlf90_r "
 
 CMK_LD="$CMK_CC"
 CMK_LDXX="$CMK_CXX"
@@ -64,7 +64,7 @@ CMK_QT="aix"
 
 CMK_LD_LIBRARY_PATH="-Wl,-rpath,$CHARMLIBSO/"
 CMK_LIBS='-lckqt'
-CMK_SYSINC="$BGQ_INC" 
+CMK_SYSINC="$BGQ_INC"
 CMK_SYSLIBS="$BGQ_LIB"
 CMK_F90LIBS="-lxlf90 -lxlopt -lxl -lxlfmath"
 CMK_MOD_NAME_ALLCAPS=1
diff --git a/src/arch/pami-bluegeneq/memalloc.c b/src/arch/pami-bluegeneq/memalloc.c
deleted file mode 100644
index 408d2e2b0e..0000000000
--- a/src/arch/pami-bluegeneq/memalloc.c
+++ /dev/null
@@ -1,126 +0,0 @@
-
-#include <converse.h>
-
-#define ALIGNMENT        64
-#define ALIGNMENT2       128
-#define SMSG_SIZE        4096
-#define N_SMSG_ELEM      512
-#define MAX_SMSG_ELEM     4096
-#define LMSG_SIZE        16384
-#define N_LMSG_ELEM      128
-#define MAX_LMSG_ELEM     2048
-
-typedef struct CmiMemAllocHdr_bgq_t {
-  int rank;
-  int size;
-  int tobuf;
-  //Align the application buffer to 32 bytes
-  char dummy[ALIGNMENT - sizeof(CmiChunkHeader) - 3*sizeof(int)];
-} CmiMemAllocHdr_bgq;
-
-typedef struct _memstruct {
-    L2AtomicQueue memQ;
-    int allocated_msg;
-    //char pad[ALIGNMENT2 - sizeof(L2AtomicQueue) - sizeof(int)];
-} L2MemStruct;
-
-static int _nodeStart;
-L2MemStruct *sL2MemallocVec;
-L2MemStruct *bL2MemallocVec;
-
-void *CmiAlloc_bgq (int size) {
-  CmiMemAllocHdr_bgq *hdr = NULL;
-  char *buf;
-  
-  int myrank = Kernel_ProcessorID() - _nodeStart;
-
-  if (size <= SMSG_SIZE) {
-    hdr = L2AtomicDequeue (&(sL2MemallocVec[myrank].memQ));
-    if (hdr == NULL) {
-      if(sL2MemallocVec[myrank].allocated_msg > MAX_SMSG_ELEM) {
-        hdr = (CmiMemAllocHdr_bgq *)memalign(ALIGNMENT, size + sizeof(CmiMemAllocHdr_bgq));      
-        hdr->tobuf = 0;
-      } else {
-        hdr = (CmiMemAllocHdr_bgq *) memalign(ALIGNMENT, SMSG_SIZE + sizeof(CmiMemAllocHdr_bgq));      
-        sL2MemallocVec[myrank].allocated_msg++;
-        hdr->size = SMSG_SIZE;
-        hdr->tobuf = 1;
-      }
-    }
-  }
-  else if (size <= LMSG_SIZE) {
-    hdr = L2AtomicDequeue (&(bL2MemallocVec[myrank].memQ));
-    if (hdr == NULL) {      
-      if(bL2MemallocVec[myrank].allocated_msg > MAX_LMSG_ELEM) {
-        hdr = (CmiMemAllocHdr_bgq *)memalign(ALIGNMENT, size + sizeof(CmiMemAllocHdr_bgq));      
-        hdr->tobuf = 0;
-      } else {
-        hdr = (CmiMemAllocHdr_bgq *) memalign(ALIGNMENT, LMSG_SIZE + sizeof(CmiMemAllocHdr_bgq));  
-        bL2MemallocVec[myrank].allocated_msg++;
-        hdr->size = LMSG_SIZE;
-        hdr->tobuf = 1;
-      }
-    }
-  }
-  else {
-    hdr = (CmiMemAllocHdr_bgq *) memalign(ALIGNMENT, size + sizeof(CmiMemAllocHdr_bgq));
-    hdr->size = size;
-    hdr->tobuf  = 0;
-  }
-
-  hdr->rank = myrank;
-  buf = (char*)hdr + sizeof(CmiMemAllocHdr_bgq);
-
-  return buf;
-}
-
-void CmiFree_bgq (void *buf) {
-  CmiMemAllocHdr_bgq *hdr = (CmiMemAllocHdr_bgq *)((char*)buf - sizeof(CmiMemAllocHdr_bgq));  
-  int rc = L2A_EAGAIN;
-  
-  if (hdr->tobuf && hdr->size == SMSG_SIZE) 
-    rc = L2AtomicEnqueue (&(sL2MemallocVec[hdr->rank].memQ), hdr);
-  else if (hdr->tobuf && hdr->size == LMSG_SIZE)
-    rc = L2AtomicEnqueue (&(bL2MemallocVec[hdr->rank].memQ), hdr);
-
-  //queues are full or large buf
-  if (rc == L2A_EAGAIN) {
-    if(hdr->tobuf) {
-      if(hdr->size == SMSG_SIZE)
-        sL2MemallocVec[hdr->rank].allocated_msg--;
-      else 
-        bL2MemallocVec[hdr->rank].allocated_msg--;
-    }
-    free_nomigrate(hdr);
-  }
-}
-
-void CmiMemAllocInit_bgq (void   * l2mem,
-			  size_t   l2memsize) 
-{
-  int i = 0;
-  int node_size = 64/Kernel_ProcessCount();
-  _nodeStart = node_size * Kernel_MyTcoord();
-  //We want to align headers to 32 bytes
-  CmiAssert(sizeof(CmiMemAllocHdr_bgq)+sizeof(CmiChunkHeader) == ALIGNMENT);
-
-  CmiAssert (l2memsize >= 2 * node_size * sizeof(L2AtomicState));
-  sL2MemallocVec = (L2MemStruct *)memalign(ALIGNMENT,sizeof(L2MemStruct)*node_size);
-  bL2MemallocVec = (L2MemStruct *)memalign(ALIGNMENT,sizeof(L2MemStruct)*node_size);
-
-  for (i = 0; i < node_size; ++i) {
-    L2AtomicQueueInit ((char *)l2mem + 2*i*sizeof(L2AtomicState),
-		       sizeof(L2AtomicState),
-		       &(sL2MemallocVec[i].memQ),
-		       0, /*No Overflow*/
-		       N_SMSG_ELEM /*512 entries in short q*/);
-
-    L2AtomicQueueInit ((char *)l2mem + (2*i+1)*sizeof(L2AtomicState),
-		       sizeof(L2AtomicState),
-		       &(bL2MemallocVec[i].memQ),
-		       0,
-		       N_LMSG_ELEM /*128 entries in long q*/);
-    sL2MemallocVec[i].allocated_msg = 0;
-    bL2MemallocVec[i].allocated_msg = 0;
-  }
-}
diff --git a/src/arch/pami-bluegeneq/ppc_atomicq_impl.h b/src/arch/pami-bluegeneq/ppc_atomicq_impl.h
new file mode 100644
index 0000000000..ac1d748fad
--- /dev/null
+++ b/src/arch/pami-bluegeneq/ppc_atomicq_impl.h
@@ -0,0 +1,50 @@
+
+#ifndef __L2_ATOMIC_PPCQ_H__
+#define __L2_ATOMIC_PPCQ_H__
+
+#include "spi/include/l2/atomic.h"
+#include "spi/include/l1p/flush.h"
+#include "pami.h"
+
+typedef pami_result_t (*pamix_proc_memalign_fn) (void**, size_t, size_t, const char*);
+
+/////////////////////////////////////////////////////
+// \brief Basic atomic operations that should be defined:
+// PPC_AtomicStore : store a value to the atomic counter
+// PPC_AtomicLoadIncrementBounded : bounded increment
+// PPC_AtomicWriteFence : a producer side write fence
+// PPC_AtomicReadFence  : consumer side read fence
+// PPC_AtomicCounterAllocate : allocate atomic counters
+/////////////////////////////////////////////////////
+
+#define CMI_PPC_ATOMIC_FAIL  0x8000000000000000UL
+
+typedef uint64_t ppc_atomic_type_t;
+typedef uint64_t ppc_atomic_t;
+
+#define PPC_AQVal(x) (x) /* identity accessor; parenthesized so it composes safely inside larger expressions */
+
+static inline void PPC_AtomicCounterAllocate (void **atomic_mem,
+                                              size_t  atomic_memsize)
+{
+  // Stores in *atomic_mem a buffer of atomic_memsize bytes obtained from the
+  // "proc_memalign" entry of the "EXT_bgq_l2atomic" PAMI extension (alignment 64).
+  pami_extension_t l2;
+  pamix_proc_memalign_fn PAMIX_L2_proc_memalign;
+  pami_result_t rc = PAMI_Extension_open(NULL, "EXT_bgq_l2atomic", &l2);
+  CmiAssert (rc == PAMI_SUCCESS);
+  PAMIX_L2_proc_memalign = (pamix_proc_memalign_fn)PAMI_Extension_symbol(l2, "proc_memalign");
+  CmiAssert (PAMIX_L2_proc_memalign != NULL); // never call through a failed symbol lookup
+  rc = PAMIX_L2_proc_memalign(atomic_mem, 64, atomic_memsize, NULL);
+  CmiAssert (rc == PAMI_SUCCESS);
+}
+
+#define PPC_AtomicLoadIncrementBounded(counter) L2_AtomicLoadIncrementBounded(counter) /* no trailing ';' so the macro is usable as an expression */
+
+#define PPC_AtomicStore(counter, val) L2_AtomicStore(counter, val)
+
+#define PPC_AtomicReadFence()     ppc_msync()
+
+#define PPC_AtomicWriteFence()    L1P_FlushRequests()
+
+#endif
diff --git a/src/arch/pami-linux-ppc64le/cc-gcc.h b/src/arch/pami-linux-ppc64le/cc-gcc.h
new file mode 100644
index 0000000000..40a8c178f1
--- /dev/null
+++ b/src/arch/pami-linux-ppc64le/cc-gcc.h
@@ -0,0 +1 @@
+/* empty */
diff --git a/src/arch/pami-linux-ppc64le/cc-gcc.sh b/src/arch/pami-linux-ppc64le/cc-gcc.sh
new file mode 100644
index 0000000000..2b5c32d663
--- /dev/null
+++ b/src/arch/pami-linux-ppc64le/cc-gcc.sh
@@ -0,0 +1,29 @@
+
+CMK_CPP_CHARM='/lib/cpp -P'
+CMK_CPP_C='gcc -E'
+CMK_CC='gcc '
+CMK_CXX='g++ '
+CMK_CXXPP='g++ -E '
+CMK_LD='g++ '
+CMK_LDXX='g++ '
+
+CMK_C_OPTIMIZE='-O3 -g'
+CMK_CXX_OPTIMIZE='-O3 -g'
+
+CMK_RANLIB='ranlib'
+CMK_LIBS='-lckqt '
+CMK_LD_LIBRARY_PATH="-Wl,-rpath,$CHARMLIBSO/"
+
+CMK_NATIVE_LIBS=''
+CMK_NATIVE_CC='gcc '
+CMK_NATIVE_LD='gcc '
+CMK_NATIVE_CXX='g++ '
+CMK_NATIVE_LDXX='g++ '
+
+CMK_CF77='gfortran'
+CMK_CF90='gfortran'
+CMK_F90LIBS='-lgfortran'
+CMK_MOD_NAME_ALLCAPS=1
+CMK_MOD_EXT="mod"
+CMK_F90_USE_MODDIR=1
+CMK_F90_MODINC="-p"
diff --git a/src/arch/pami-linux-ppc64le/cc-xlc.h b/src/arch/pami-linux-ppc64le/cc-xlc.h
new file mode 100644
index 0000000000..9e22b2b72d
--- /dev/null
+++ b/src/arch/pami-linux-ppc64le/cc-xlc.h
@@ -0,0 +1 @@
+/*   empty */
diff --git a/src/arch/pami-linux-ppc64le/cc-xlc.sh b/src/arch/pami-linux-ppc64le/cc-xlc.sh
new file mode 100644
index 0000000000..1bb8bf6d7f
--- /dev/null
+++ b/src/arch/pami-linux-ppc64le/cc-xlc.sh
@@ -0,0 +1 @@
+# empty
diff --git a/src/arch/pami-linux-ppc64le/charmrun b/src/arch/pami-linux-ppc64le/charmrun
new file mode 100755
index 0000000000..65b3d48c92
--- /dev/null
+++ b/src/arch/pami-linux-ppc64le/charmrun
@@ -0,0 +1,302 @@
+#!/bin/sh
+#
+# Conv-host for MPI:
+#  Translates +pN-style conv-host options into
+# mpirun -npN options.
+
+args=""
+pes=1
+ppn=1
+machinefile=""
+
+while [ $# -gt 0 ]
+do
+	case $1 in
+	+ppn|++ppn)
+		args=$args" +ppn "$2
+		ppn=$2
+		shift
+		;;
+	+ppn[0-9]*)
+		args=$args" "$1
+		ppn=`echo $1 | awk '{print substr($1,5)}'`
+		;;
+	++ppn[0-9]*)
+		args=$args" "$1
+		ppn=`echo $1 | awk '{print substr($1,6)}'`
+		;;
+	+p)
+		pes=$2
+		shift
+		;;
+	+pemap)
+		args=$args" "$1" "$2
+		shift
+		;;
+	+p[0-9]*)
+		pes=`echo $1 | awk '{print substr($1,3)}'`
+		;;
+        -machinefile)
+		machinefile=$2
+		args=" "$1" "$2" "$args
+		shift
+		;;
+	*)
+		args=$args" "$1
+		;;
+	esac
+	shift
+done
+
+rem=`expr $pes % $ppn`
+quot=`expr $pes / $ppn`
+if [ $rem -ne 0 ];
+then
+  printf "p = $pes should be a multiple of ppn = $ppn\n"
+  exit 1
+else
+  pes=$quot
+fi
+
+printf "\nRunning on $pes processors: $args\n"
+
+
+if [ -n "$PBS_NODEFILE" ]
+then
+# we are in a job shell
+  aprun=`which aprun 2>/dev/null`
+  if test -n "$aprun"
+  then
+    echo aprun -n $pes $args
+    $aprun -n $pes $args
+  else
+    mpirun_cmd=`which mpirun 2>/dev/null`
+    if test -n "$mpirun_cmd"
+    then
+      if echo $mpirun_cmd | grep 'mvapich2'  > /dev/null 2>/dev/null
+      then
+        # if daemon not started, start it
+        if ! mpdtrace > /dev/null 2>/dev/null
+        then
+          mvapich2-start-mpd
+        fi
+        mpirun -np $pes $args
+        #    mpdallexit
+      else   # normal case
+        test -z "$machinefile" && args=-machinefile" "$PBS_NODEFILE" "$args
+        echo mpirun -np $pes $args
+        mpirun -np $pes $args
+      fi
+    else
+      echo "Charmrun> can not locate mpirun in order to run the program."
+      exit 1
+    fi
+  fi
+elif [ -n "$LSB_HOSTS" ]
+then
+# Tungsten
+  echo cmpirun -lsf -poll -no_smp -gm_long 200000 $args
+  cmpirun -lsf -poll -no_smp -gm_long 200000 $args
+elif [ -n "$PBS_QUEUE" -o -n "$LSF_QUEUE" ]
+then
+# Interactive mode: create, and submit a batch job
+        script="charmrun_script.$$.sh"
+        indir=`pwd`
+        output="$indir/charmrun_script.$$.stdout"
+        result="$indir/charmrun_script.$$.result"
+	rm -f $result
+# Some machine specific
+	USE_LSF=0
+# 10 minutes
+	walllimit=10
+	queue_stat=qstat
+	queue_qsub=qsub
+	queue_kill=qdel
+	hostname=`hostname`
+	case "$hostname" in
+	turing*.turing.uiuc.edu)
+		ppn='#PBS -l nodes='$pes':ppn=1'
+		extra='-machinefile $PBS_NODEFILE'
+		;;
+	tg-login*|honest*.ncsa.uiuc.edu)
+		# always ppn=2
+		nodes=`expr \( $pes + 1 \) / 2`
+		test $pes -eq 1 && ppns=1 || ppns=2
+		ppn='#PBS -l nodes='$nodes':ppn='$ppns
+		extra='-machinefile $PBS_NODEFILE'
+		;;
+	co-login*.ncsa.uiuc.edu)
+		mem='#PBS -l mem=500mb'
+		ncpus="#PBS -l ncpus=$pes"
+		;;
+	tun*)
+		USE_LSF=1
+		queue_stat=bjobs
+		queue_qsub=bsub
+		queue_kill=bkill
+		;;
+	abe*)
+		# always ppn=2
+		nodes=`expr \( $pes + 1 \) / 2`
+		test $pes -eq 1 && ppns=1 || ppns=2
+		ppn='#PBS -l nodes='$nodes':ppn='$ppns
+		extra='-machinefile $PBS_NODEFILE'
+		;;
+        kraken*)
+                ncores=`expr \( $pes + 11 \) / 12 \* 12`
+		ncpus="#PBS -l size=$ncores"
+		ppn=''
+		;;
+	*)
+		ncpus="#PBS -l ncpus=$pes"
+		;;
+	esac
+	if test $USE_LSF -eq 0
+	then
+          mpirun=`which aprun 2>/dev/null`
+          npcmd="-n "
+          if test -z "$mpirun"
+          then
+	    mpirun=`which mpirun 2>/dev/null`
+            npcmd="-np "
+          fi
+          cat > $script << EOF
+#!/bin/sh
+# This is a charmrun-generated PBS batch job script.
+# The lines starting with #PBS are queuing system flags:
+#
+$ppn
+#
+$ncpus
+#
+#PBS -l walltime=$walllimit:00
+#
+$mem
+#
+#PBS -q $PBS_QUEUE
+#
+#PBS -N autobuild
+#
+#PBS -j oe
+#
+#PBS -o $output
+
+cd $indir
+
+cat \$PBS_NODEFILE
+echo $mpirun $npcmd $pes $extra $args
+$mpirun $npcmd $pes $extra $args
+
+# Save mpirun exit status
+status=\$?
+echo \$status > $result
+EOF
+	else
+#  use LSF
+	  mpirun="cmpirun -lsf -poll -no_smp -gm_long 200000"
+          cat > $script << EOF
+#!/bin/sh
+# This is a charmrun-generated PBS batch job script.
+# The lines starting with #PBS are queuing system flags:
+#
+#BSUB -J autobuild
+#BSUB -W 0:$walllimit
+#BSUB -n $pes
+#BSUB -o $output
+
+cd $indir
+echo \$LSB_MCPU_HOSTS
+$mpirun $args
+# Save mpirun exit status
+status=\$?
+echo \$status > $result
+EOF
+	fi
+
+End() {	# Cancel the submitted job via $queue_kill, remove the generated script, then exit with status $1
+	echo "Charmrun> $queue_kill $jobid ..."
+	$queue_kill $jobid
+	rm -f $script
+	exit $1
+}
+
+        echo "Submitting batch job for> $mpirun -np $pes $args"
+        echo " using the command> $queue_qsub $script"
+        chmod 755 $script
+	while [ -z "$jobid" ]
+	do
+	  [ $USE_LSF = 0 ] && jobid=`$queue_qsub $script|tail -1`
+	  [ $USE_LSF = 1 ] && jobid=`$queue_qsub < $script|tail -1|sed -e 's/[^0-9]*//g'`
+	done
+	echo "Job enqueued under job ID $jobid"
+# kill job if interrupted
+	trap 'End 1' 2 3
+	retry=0
+# Wait for the job to complete, by checking its status
+        while [ true ]
+        do
+                $queue_stat $jobid > tmp.$$
+		exitstatus=$?
+                if test -f $output
+                then
+# The job is done (the batch output file exists) -- print its output and exit
+                        rm tmp.$$
+# When job hangs, result file does not exist, so status defaults to 1
+			test -f $result && status=`cat $result` || status=1
+			test $status -eq 0 && status=`grep 'End of program' $output > /dev/null 2>&1`	# NOTE(review): grep's stdout is redirected, so this sets status to "" -- confirm intent
+			cat $output
+			rm -f $result
+			test -f $status && rm -f $script $output	# NOTE(review): this tests for a FILE named $status, not for status == 0 -- confirm intent
+			exit $status
+                fi
+# The job is still queued or running-- print status and wait
+                tail -1 tmp.$$
+                rm tmp.$$
+# Job ID may not exist now
+		if test $exitstatus -ne 0
+		then
+# retry a few times when error occurs
+			retry=`expr $retry + 1`
+			if test $retry -gt 6
+			then
+				echo "Charmrun> too many errors, abort!"
+				exit 1
+			else
+				sleep 15
+			fi
+		else
+# job still in queue
+			retry=0
+			sleep 20
+		fi
+        done
+else
+  mpirun_cmd=`which mpirun 2>/dev/null`
+  if test -n "$mpirun_cmd"
+  then
+    [ -n "$MPI_MACHINEFILE" ] && args=" -machinefile $MPI_MACHINEFILE $args"
+    setarch_cmd=`which setarch 2>/dev/null`
+    if [ -n "$setarch_cmd" -a -x "$setarch_cmd" ]
+    then
+      # Disables randomization of the virtual address  space  (turns  on
+      #          ADDR_NO_RANDOMIZE).
+      cur_arch=`uname -m`
+      echo "charmrun>  $setarch_cmd $cur_arch -R  mpirun -np $pes $args"
+      $setarch_cmd $cur_arch -R  mpirun -np $pes $args
+    else
+      echo "charmrun> mpirun -np $pes $args"
+      mpirun -np $pes $args
+    fi
+  else
+    mpiexec_cmd=`which mpiexec 2>/dev/null`
+    if test -n "$mpiexec_cmd"
+    then
+      echo "charmrun> $mpiexec_cmd -n $pes $args"
+      echo
+      "$mpiexec_cmd" -n $pes $args
+    else
+      echo "Don't know how to run MPI program."
+      exit 1
+    fi
+  fi
+fi
diff --git a/src/arch/pami-bluegeneq/conv-mach-smp.h b/src/arch/pami-linux-ppc64le/conv-mach-smp.h
similarity index 63%
copy from src/arch/pami-bluegeneq/conv-mach-smp.h
copy to src/arch/pami-linux-ppc64le/conv-mach-smp.h
index d6c8f652b6..d77f6d7ed7 100644
--- a/src/arch/pami-bluegeneq/conv-mach-smp.h
+++ b/src/arch/pami-linux-ppc64le/conv-mach-smp.h
@@ -1,20 +1,17 @@
 
-#define CMK_SMP						   1
+#define CMK_SMP                                            1
 
+#undef CMK_NODE_QUEUE_AVAILABLE
+#define CMK_NODE_QUEUE_AVAILABLE                           1
 
 #undef CMK_SHARED_VARS_UNAVAILABLE
 #undef CMK_SHARED_VARS_POSIX_THREADS_SMP
 #define CMK_SHARED_VARS_UNAVAILABLE                        0
 #define CMK_SHARED_VARS_POSIX_THREADS_SMP                  1
 
-/* Right now only comm thread (no multicore) and tls thread version with gcc works on Blue Gene*/
 #define CMK_MULTICORE                                      0
 
-#ifdef __GNUC__
 #define CMK_NOT_USE_TLS_THREAD                             0
-#else
-#define CMK_NOT_USE_TLS_THREAD                             0
-#endif
 
 #define CMK_PCQUEUE_LOCK                                   1
 /*#define PCQUEUE_MULTIQUEUE                                 1*/
@@ -23,5 +20,7 @@
 
 #define CMK_FAKE_SCHED_YIELD                               1
 
-#define CMK_USE_L2ATOMICS                                  1
+#define CMK_PPC_ATOMIC_QUEUE                               1
+#define CMK_PPC_ATOMIC_MUTEX                               1
 
+#define  CMK_PPC_ATOMIC_DEFAULT_IMPL                       1
diff --git a/src/arch/pami-linux-ppc64le/conv-mach-smp.sh b/src/arch/pami-linux-ppc64le/conv-mach-smp.sh
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/src/arch/pami-bluegeneq/conv-mach.h b/src/arch/pami-linux-ppc64le/conv-mach.h
similarity index 57%
copy from src/arch/pami-bluegeneq/conv-mach.h
copy to src/arch/pami-linux-ppc64le/conv-mach.h
index c6450474f4..b11ffdefc0 100644
--- a/src/arch/pami-bluegeneq/conv-mach.h
+++ b/src/arch/pami-linux-ppc64le/conv-mach.h
@@ -1,29 +1,18 @@
 #ifndef _CONV_MACH_H
 #define _CONV_MACH_H
 
-#define CMK_NO_OUTSTANDING_SENDS			   0
-
-#define CMK_64BIT                                          1
-
-//#define CMK_MEMORY_PREALLOCATE_HACK                        1
-
-//#define CMK_CONVERSE_MPI                                   1
-
-#define CMK_NO_SOCKETS					   1
+#define CMK_PAMI_LINUX_PPC8                                1
 
 #define CMK_DEFAULT_MAIN_USES_COMMON_CODE                  1
 
 #define CMK_GETPAGESIZE_AVAILABLE                          1
 
-#define CMK_IS_HETERO                                      0
-
 #define CMK_MALLOC_USE_GNU_MALLOC                          0
 #define CMK_MALLOC_USE_OS_BUILTIN                          1
 
 #define CMK_MEMORY_PAGESIZE                                8192
 #define CMK_MEMORY_PROTECTABLE                             1
 
-
 #define CMK_SHARED_VARS_UNAVAILABLE                        1
 #define CMK_SHARED_VARS_UNIPROCESSOR                       0
 
@@ -31,38 +20,33 @@
 #define CMK_SIGNAL_USE_SIGACTION                           0
 #define CMK_SIGNAL_USE_SIGACTION_WITH_RESTART              1
 
-#define CMK_SYNCHRONIZE_ON_TCP_CLOSE                       0
-
-#define CMK_THREADS_USE_CONTEXT                            0
-#define CMK_THREADS_USE_JCONTEXT                           1
-#define CMK_THREADS_USE_PTHREADS                           0
-#define CMK_THREADS_ARE_WIN32_FIBERS                       0
-
 #define CMK_THREADS_REQUIRE_NO_CPV                         0
 
 #define CMK_TIMER_USE_GETRUSAGE                            0
 #define CMK_TIMER_USE_SPECIAL                              0
 #define CMK_TIMER_USE_TIMES                                0
-// This needs to be compiled with gcc only
-#define CMK_TIMER_USE_BLUEGENEQ			           1
+#define CMK_TIMER_USE_RDTSC                                0
+#define CMK_TIMER_USE_PPC64                                1
+
+#define CMK_THREADS_USE_CONTEXT                            1
+#define CMK_THREADS_USE_JCONTEXT                           0
+#define CMK_THREADS_USE_PTHREADS                           0
 
+#define CMK_TYPEDEF_INT2 short
+#define CMK_TYPEDEF_INT4 int
+#define CMK_TYPEDEF_INT8 long long
+#define CMK_TYPEDEF_UINT2 unsigned short
+#define CMK_TYPEDEF_UINT4 unsigned int
+#define CMK_TYPEDEF_UINT8 unsigned long long
+#define CMK_TYPEDEF_FLOAT4 float
+#define CMK_TYPEDEF_FLOAT8 double
 
 #define CMK_WHEN_PROCESSOR_IDLE_BUSYWAIT                   1
 #define CMK_WHEN_PROCESSOR_IDLE_USLEEP                     0
 
-
 #define CMK_WEB_MODE                                       1
 #define CMK_DEBUG_MODE                                     0
 
-#define CMK_LBDB_ON					   1
-
-#undef CMK_CCS_AVAILABLE
-#define CMK_CCS_AVAILABLE				   0
-
-#define CMK_BLUEGENEQ                                      1
-#define CMK_BLUEGENEQ_OPTCOPY                              1
-
-#define CMK_NO_ISO_MALLOC                                  1
+#define CMK_LBDB_ON                                        1
 
 #endif
-
diff --git a/src/arch/pami-linux-ppc64le/conv-mach.sh b/src/arch/pami-linux-ppc64le/conv-mach.sh
new file mode 100644
index 0000000000..e222ec1ea5
--- /dev/null
+++ b/src/arch/pami-linux-ppc64le/conv-mach.sh
@@ -0,0 +1,56 @@
+
+PAMI_INC=/opt/ibmhpc/pecurrent/ppe.pami/include
+PAMI_LIB=/opt/ibmhpc/pecurrent/ppe.pami/gnu/lib64/pami64
+
+CXX=xlC_r
+CC=xlc_r
+
+CMK_CPP_CHARM='/lib/cpp -P'
+CMK_CPP_C="$CC -E"
+CMK_CC="$CC "
+CMK_CXX="$CXX "
+CMK_CXXPP="$CXX -E "
+CMK_LD="$CMK_CC "
+CMK_LDXX="$CMK_CXX "
+
+CMK_C_OPTIMIZE='-O3 -Q -g'
+CMK_CXX_OPTIMIZE='-O3 -Q -g'
+
+CMK_RANLIB='ranlib'
+CMK_LIBS='-lckqt'
+CMK_LD_LIBRARY_PATH="-Wl,-rpath,$CHARMLIBSO/"
+
+CMK_SYSINC="-I $PAMI_INC"
+#CMK_SYSLIBS="-L $PAMI_LIB -L /usr/lib/powerpc64le-linux-gnu -lpami -libverbs -lnuma -lstdc++ -lc -ldl -lrt -lpthread"
+CMK_SYSLIBS="-L $PAMI_LIB -L /usr/lib/powerpc64le-linux-gnu -lpami -libverbs -lstdc++ -lc -ldl -lrt -lpthread"
+
+CMK_NATIVE_LIBS=''
+CMK_NATIVE_CC="$CC -q64"
+CMK_NATIVE_LD="$CC -q64"
+CMK_NATIVE_CXX="$CXX -q64"
+CMK_NATIVE_LDXX="$CXX -q64"
+
+# fortran compiler
+CMK_CF77="xlf_r -q64 -fPIC "
+CMK_CF90="xlf90_r -q64 -fPIC -qsuffix=f=f90"
+CMK_CF90_FIXED="xlf90_r -q64 -fPIC"
+
+CMK_MOD_NAME_ALLCAPS=1
+CMK_MOD_EXT="mod"
+CMK_F90_MODINC="-p"
+CMK_F90_USE_MODDIR=""
+
+F90DIR=`which ifort 2> /dev/null`
+if test -h "$F90DIR"
+then
+  F90DIR=`readlink $F90DIR`
+fi
+if test -x "$F90DIR"
+then
+  F90LIBDIR="`dirname $F90DIR`/../lib"
+  F90MAIN="$F90LIBDIR/for_main.o"
+fi
+# for_main.o is important for main() in f90 code. NOTE(review): this probes for ifort and links -lifcore/-lifport although CMK_CF90 is xlf90_r -- confirm.
+CMK_F90MAINLIBS="$F90MAIN "
+CMK_F90LIBS="-L$F90LIBDIR -lifcore -lifport "
+CMK_F77LIBS="$CMK_F90LIBS"
diff --git a/src/arch/pami/Makefile.machine b/src/arch/pami/Makefile.machine
index b404daf863..e69de29bb2 100644
--- a/src/arch/pami/Makefile.machine
+++ b/src/arch/pami/Makefile.machine
@@ -1,7 +0,0 @@
-#force compilation of QPX based code with -O3
-
-LIBCONV_UTIL := ${LIBCONV_UTIL} cmimemcpy_qpx.o
-
-cmimemcpy_qpx.o: cmimemcpy_qpx.c cmimemcpy_qpx.h
-cmimemcpy_qpx.o: CFLAGS:=${CFLAGS} -O3
-
diff --git a/src/arch/pami/PPCAtomicMutex.h b/src/arch/pami/PPCAtomicMutex.h
new file mode 100755
index 0000000000..7500f59164
--- /dev/null
+++ b/src/arch/pami/PPCAtomicMutex.h
@@ -0,0 +1,82 @@
+
+#ifndef __PPC_ATOMIC_MUTEX__
+#define __PPC_ATOMIC_MUTEX__
+
+#include <pthread.h>
+#include <stdio.h>
+#include <assert.h>
+#include <stdint.h>
+
+#if CMK_PPC_ATOMIC_DEFAULT_IMPL
+#include "default_ppcq.h"
+#else
+//define new ppc atomics in the pami instance directory
+#include "ppc_atomicq_impl.h"
+#endif
+
+typedef struct _ppc_atomic_mutex_t
+{
+  volatile ppc_atomic_t     counter;  // lock word: stored as 0 by Init and by Release
+  volatile ppc_atomic_t     bound;    // stored as 1 by Init; default_ppcq.h reads the bound as counter[1]
+} PPCAtomicMutex;
+
+PPCAtomicMutex *PPCAtomicMutexInit (void           * atomic_mem,   // storage that will hold the mutex
+                                    size_t           atomic_size)  // bytes available at atomic_mem
+{
+  //Verify the mutex storage is 16-byte aligned (mask 0x0F) and large enough
+  assert( (((uintptr_t) atomic_mem) & (0x0F)) == 0 );
+  assert (sizeof(PPCAtomicMutex) <= atomic_size);
+
+  PPCAtomicMutex *mutex = (PPCAtomicMutex*) atomic_mem;  // the mutex lives in place in the caller's buffer
+  PPC_AtomicStore(&mutex->counter, 0);  // same value Release stores, i.e. unlocked
+  PPC_AtomicStore(&mutex->bound, 1);    // limit used by the bounded increment in the acquire paths
+
+  return mutex;
+}
+
+/**
+ *  \brief Try to acquire a mutex with a single bounded-increment attempt (no spinning)
+ *  \param[in]   mutex pointer
+ *  \return 0    Lock successfully acquired
+ *  \return 1    Lock was not acquired
+ */
+static inline int PPCAtomicMutexTryAcquire (PPCAtomicMutex *mutex)
+{
+  size_t rc = PPC_AtomicLoadIncrementBounded(&mutex->counter);
+  if (rc == CMI_PPC_ATOMIC_FAIL)  // the increment was refused
+    return 1;
+
+  PPC_AtomicReadFence();
+  return rc;  // value handed back by the bounded increment, narrowed to int
+}
+
+/**
+ *  \brief Acquire a mutex
+ *  \param[in]   mutex pointer
+ *  \note Returns nothing; spins until the bounded increment succeeds
+ */
+static inline void PPCAtomicMutexAcquire (PPCAtomicMutex *mutex)
+{
+  size_t rc = 0;
+  do {
+    rc = PPC_AtomicLoadIncrementBounded(&mutex->counter);
+  } while (rc == CMI_PPC_ATOMIC_FAIL);  // retry for as long as the increment is refused
+
+  PPC_AtomicReadFence();
+}
+
+/**
+ *  \brief Release a mutex: issue a write fence, then store 0 to its counter
+ *  \param[in]   mutex pointer
+ */
+static inline void PPCAtomicMutexRelease(PPCAtomicMutex *mutex)
+{
+  //Flush outstanding loads/stores
+  PPC_AtomicWriteFence();
+
+  /* Release the lock */
+  PPC_AtomicStore(&(mutex->counter), 0);
+}
+
+
+#endif
diff --git a/src/arch/pami/PPCAtomicQueue.h b/src/arch/pami/PPCAtomicQueue.h
new file mode 100755
index 0000000000..3b82edba1e
--- /dev/null
+++ b/src/arch/pami/PPCAtomicQueue.h
@@ -0,0 +1,210 @@
+
+#ifndef __PPC_ATOMIC_QUEUE__
+#define __PPC_ATOMIC_QUEUE__
+
+#include <pthread.h>
+#include <stdio.h>
+#include <assert.h>
+#include <stdint.h>
+#include "pcqueue.h"
+
+#define DEFAULT_SIZE         2048
+
+#define CMI_PPCQ_SUCCESS  0
+#define CMI_PPCQ_EAGAIN  -1
+
+/////////////////////////////////////////////////////
+// \brief Basic atomic operations that must be defined
+// ppc_atomic_t : the datatype of the atomic (uint32_t or uint64_t)
+// PPC_AtomicStore : store a value to the atomic counter
+// PPC_AtomicLoadIncrementBounded : bounded increment
+// PPC_AtomicWriteFence : a producer side write fence
+// PPC_AtomicReadFence  : consumer side read fence
+// PPC_AtomicCounterAllocate : allocate atomic counters
+/////////////////////////////////////////////////////
+
+#if CMK_PPC_ATOMIC_DEFAULT_IMPL
+#include "default_ppcq.h"
+#else
+//define new ppc atomics in the pami instance directory
+#include "ppc_atomicq_impl.h"
+#endif
+
+#if 0
+void PPC_AtomicCounterAllocate (void **atomic_mem, size_t  atomic_memsize);
+ppc_atomic_type_t PPC_AtomicLoadIncrementBounded (volatile ppc_atomic_t *counter);
+void PPC_AtomicStore(volatile ppc_atomic_t *counter, ppc_atomic_type_t val);
+void PPC_AtomicReadFence();
+void PPC_AtomicWriteFence();
+#endif
+
+typedef  void* PPCAtomicQueueElement;
+
+typedef struct _ppcatomicstate {
+  volatile ppc_atomic_t Producer;    //incremented by PPCAtomicEnqueue to claim a slot
+  volatile ppc_atomic_t UpperBound;  //limit for Producer; advanced by PPCAtomicDequeue
+  char pad[32 - 2*sizeof(ppc_atomic_t)]; //NOTE(review): assumes sizeof(ppc_atomic_t) <= 16; default_ppcq.h declares a 64-byte ppc_atomic_t, which makes this size wrap -- confirm
+} PPCAtomicState;
+
+typedef struct _ppcatomicq {
+  PPCAtomicState              * _state;       //Producer/UpperBound counters, in memory supplied to Init
+  volatile void * volatile    * _array;       //ring of element pointers; zeroed by Init, slot reset to NULL on dequeue
+  volatile ppc_atomic_type_t    _consumer;    //running index of the next slot to dequeue
+  int                           _qsize;       //ring capacity; Init rounds it up to a power of two
+  int                           _useOverflowQ; //non-zero: fall back to _overflowQ when the ring is full
+  PCQueue                       _overflowQ;   //40 byte structure
+  char                          _pad[24];     //align to 64 bytes
+} PPCAtomicQueue; //should be padded
+
+void PPCAtomicQueueInit      (void            * atomic_mem,     //storage for this queue's PPCAtomicState
+  size_t            atomic_memsize, //bytes available at atomic_mem
+  PPCAtomicQueue  * queue,          //queue object to initialize
+  int               use_overflow,   //non-zero: spill to a PCQueue when the ring is full
+  int               nelem)          //requested capacity; rounded up to a power of two (minimum 2)
+{
+  int rc; //posix_memalign reports an errno-style int, not a pami_result_t
+
+  //Verify counter array is 32-byte aligned (mask 0x1F); checked on BG/Q only
+#if CMK_BLUEGENEQ
+  assert ( (((uintptr_t) atomic_mem) & (0x1F)) == 0 );
+  assert (sizeof(PPCAtomicState) == 32); //all counters need to be lined up
+  assert (sizeof(PPCAtomicState) <= atomic_memsize);
+#endif
+
+  queue->_useOverflowQ = use_overflow;
+
+  int qsize = 2;
+  while (qsize < nelem)
+    qsize *= 2;
+  queue->_qsize = qsize;
+
+  queue->_state = (PPCAtomicState *) atomic_mem;
+  queue->_overflowQ = PCQueueCreate();
+  queue->_consumer = 0;
+  PPC_AtomicStore(&queue->_state->Producer, 0);
+  PPC_AtomicStore(&queue->_state->UpperBound, qsize);
+
+  rc = posix_memalign ((void **)&queue->_array,
+      128, /* Typical L1 line size for POWER */
+      sizeof(PPCAtomicQueueElement) * qsize);
+
+  assert(rc == 0); //posix_memalign returns 0 on success
+  memset((void*)queue->_array, 0, sizeof(PPCAtomicQueueElement)*qsize);
+}
+
+int PPCAtomicEnqueue (PPCAtomicQueue          * queue,    //queue to insert into
+                      void                   * element)  //pointer to store; returns CMI_PPCQ_SUCCESS or CMI_PPCQ_EAGAIN
+{
+  //fprintf(stderr,"Insert message %p\n", element);
+
+  register int qsize_1 = queue->_qsize - 1; //index mask; _qsize is a power of two
+  ppc_atomic_type_t index = PPC_AtomicLoadIncrementBounded(&queue->_state->Producer); //claim a slot
+  PPC_AtomicWriteFence(); //issued before the element pointer is stored into the ring
+  if (index != CMI_PPC_ATOMIC_FAIL) {
+    queue->_array[index & qsize_1] = element;
+    return CMI_PPCQ_SUCCESS;
+  }
+
+  //The ring is full; without an overflow queue the caller must retry
+  if (!queue->_useOverflowQ)
+    return CMI_PPCQ_EAGAIN; //Q is full, try later
+
+  //No ordering is guaranteed if there is overflow
+  PCQueuePush(queue->_overflowQ, element);
+
+  return CMI_PPCQ_SUCCESS;
+}
+
+void * PPCAtomicDequeue (PPCAtomicQueue    *queue) //returns the next element, or NULL when nothing is available
+{
+  ppc_atomic_type_t head, tail;
+  tail = PPC_AQVal(queue->_state->Producer);
+  head = queue->_consumer;
+  register int qsize_1 = queue->_qsize-1; //index mask; _qsize is a power of two
+
+  volatile void *e = NULL;
+  if (head < tail) {
+    e = queue->_array[head & qsize_1];
+    if (e == NULL) //slot was claimed by a producer but its pointer is not stored yet
+      return NULL;
+
+    queue->_array[head & qsize_1] = NULL; //mark the slot empty for reuse
+    PPC_AtomicReadFence();
+
+    head ++;
+    queue->_consumer = head;
+
+    //Charm++ does not require message ordering
+    //So we dont acquire overflow mutex here
+    ppc_atomic_type_t n = head + queue->_qsize; //new producer limit: one full ring ahead of the consumer
+
+    //Update bound every 16 consumes
+    if ((n & 0xF) == 0)
+      PPC_AtomicStore(&queue->_state->UpperBound, n);
+    return (void*) e;
+  }
+
+  //Ring is empty; without an overflowQ there is nothing else to check
+  if (!queue->_useOverflowQ)
+    return NULL;
+
+  e = PCQueuePop (queue->_overflowQ);
+  return (void *) e;
+}
+
+int PPCAtomicQueueEmpty (PPCAtomicQueue *queue) { //1 when both the overflow queue and the ring hold nothing, else 0
+  if (PCQueueLength(queue->_overflowQ) != 0) return 0; //overflow queue still has entries
+  return (PPC_AQVal(queue->_state->Producer) == queue->_consumer); //ring is empty when producer equals consumer
+}
+
+//Spin on the PPC atomic queue until its Producer counter moves or n polls
+//have been made (n <= 0 polls once). Always returns 0.
+int PPCAtomicQueueSpinWait (PPCAtomicQueue    * queue,
+                            int                n)
+{
+  if (!PPCAtomicQueueEmpty(queue))
+    return 0;  //queue is not empty so return
+
+  ppc_atomic_type_t head, tail;
+  head = queue->_consumer;
+
+  size_t i = (n > 0) ? (size_t) n : 1; //n <= 0 used to wrap into a near-endless spin
+  do {
+    tail = PPC_AQVal(queue->_state->Producer);
+    i--;
+  }
+  //While the ring is still empty and polls remain
+  while (head == tail && i != 0);
+
+  return 0; //same value on timeout and on arrival; callers re-check the queue
+}
+
+//Spin on two PPC atomic queues until either Producer counter moves or n
+//polls have been made (n <= 0 polls once). Always returns 0.
+int PPCAtomicQueue2QSpinWait (PPCAtomicQueue    * queue0,
+                              PPCAtomicQueue    * queue1,
+                              int                n)
+{
+  if (!PPCAtomicQueueEmpty(queue0))
+    return 0;  //queue0 is not empty so return
+
+  if (!PPCAtomicQueueEmpty(queue1))
+    return 0;  //queue1 is not empty so return
+
+  ppc_atomic_type_t head0, tail0;
+  ppc_atomic_type_t head1, tail1;
+
+  head0 = queue0->_consumer;
+  head1 = queue1->_consumer;
+
+  size_t i = (n > 0) ? (size_t) n : 1; //n <= 0 used to wrap into a near-endless spin
+  do {
+    tail0 = PPC_AQVal(queue0->_state->Producer);
+    tail1 = PPC_AQVal(queue1->_state->Producer);
+    i --;
+  } while (head0==tail0 && head1==tail1 && i!=0);
+
+  return 0;
+}
+
+#endif
diff --git a/src/arch/pami/conv-common.h b/src/arch/pami/conv-common.h
index 2b175b6110..8a1598dcb0 100644
--- a/src/arch/pami/conv-common.h
+++ b/src/arch/pami/conv-common.h
@@ -5,7 +5,7 @@
 
 #define CMK_HANDLE_SIGUSR                                  1
 
-#define CMK_MSG_HEADER_EXT_    CmiUInt2 rank, hdl,xhdl,info, stratid; unsigned char cksum, magic; int root, size; CmiUInt2 redID, padding; 
+#define CMK_MSG_HEADER_EXT_    CmiUInt2 rank, hdl,xhdl,info, stratid; unsigned char cksum, magic; int root, size; CmiUInt2 redID, padding;
 
 #define CMK_MSG_HEADER_BASIC  CMK_MSG_HEADER_EXT
 #define CMK_MSG_HEADER_EXT    { CMK_MSG_HEADER_EXT_ }
@@ -34,8 +34,7 @@
 #undef CMK_HAS_FDATASYNC_FUNC
 #define CMK_HAS_FDATASYNC_FUNC                             0
 
-//#define CMI_DIRECT_MANY_TO_MANY_DEFINED                    0
+#define CMI_DIRECT_MANY_TO_MANY_DEFINED                    0
 
 #define CMK_PERSISTENT_COMM                                0
 
-#define  CMI_DIRECT_MANY_TO_MANY_DEFINED                   1
diff --git a/src/arch/pami/default_ppcq.h b/src/arch/pami/default_ppcq.h
new file mode 100644
index 0000000000..80e1112582
--- /dev/null
+++ b/src/arch/pami/default_ppcq.h
@@ -0,0 +1,98 @@
+
+#ifndef  __DEFAULT_PPCQ_H__
+#define  __DEFAULT_PPCQ_H__
+
+#include "pami.h"
+
+/////////////////////////////////////////////////////
+// \brief Basic atomic operations that must be defined
+// PPC_AtomicStore : store a value to the atomic counter
+// PPC_AtomicLoadIncrementBounded : bounded increment
+// PPC_AtomicWriteFence : a producer side write fence
+// PPC_AtomicReadFence  : consumer side read fence
+// PPC_AtomicCounterAllocate : allocate atomic counters
+/////////////////////////////////////////////////////
+
+#define CMI_PPC_ATOMIC_FAIL  0x8000000000000000UL
+
+typedef uint64_t ppc_atomic_type_t;
+
+typedef struct _ppc_atomic_t {
+  volatile uint64_t   val;        //the counter value; read through PPC_AQVal()
+  char                _pad[56];   //pads the struct to 64 bytes (8 + 56)
+} ppc_atomic_t;
+
+#define PPC_AQVal(x) ((x).val)
+
+static inline void PPC_AtomicCounterAllocate (void **atomic_mem,       //out: 64-byte aligned buffer, NULL on failure
+                                              size_t  atomic_memsize)  //in: number of bytes to allocate
+{
+  if (posix_memalign(atomic_mem, 64, atomic_memsize) != 0) { *atomic_mem = NULL; } //never leave the out-pointer unset
+}
+
+// Load Reserved: issue ldarx on ptr->val (64-bit) and return the loaded value
+static inline ppc_atomic_type_t PPC_AtomicLoadReserved ( volatile ppc_atomic_t *ptr )
+{
+  ppc_atomic_type_t val;
+  __asm__ __volatile__ ("ldarx %[val],0,%[ptr]"
+                        : [val] "=r" (val)
+                        : [ptr] "r" (&ptr->val)
+                        : "cc");
+
+  return( val );
+}
+
+static inline int PPC_AtomicStoreConditional( volatile ppc_atomic_t *ptr, ppc_atomic_type_t val ) //stdcx. of val to ptr->val; returns 1 if it stored, else 0
+{
+  register int rc = 1; // assume success
+  __asm__ __volatile__ ("stdcx. %[val],0,%[ptr];\n"
+                        "beq 1f;\n"
+                        "li %[rc], 0;\n"
+                        "1: ;\n"
+                        : [rc] "=r" (rc)
+                        : [ptr] "r" (&ptr->val), [val] "r" (val), "0" (rc)
+                        : "cc", "memory");
+  return( rc );
+}
+
+static inline ppc_atomic_type_t PPC_AtomicLoadIncrementBounded (volatile ppc_atomic_t *counter) //returns the pre-increment value, or CMI_PPC_ATOMIC_FAIL
+{
+  register ppc_atomic_type_t old_val, tmp_val, bound;
+  bound = counter[1].val; //the bound is the ppc_atomic_t stored directly after the counter; read once per call
+  do
+  {
+    old_val = PPC_AtomicLoadReserved( counter );
+    tmp_val = old_val + 1;
+
+    if (tmp_val > bound)
+      return CMI_PPC_ATOMIC_FAIL; //refused: no store is attempted, counter is left as it was
+  }
+  while ( !PPC_AtomicStoreConditional( counter, tmp_val ) ); //retry when the conditional store does not succeed
+
+  return( old_val );
+}
+
+static inline void PPC_AtomicStore(volatile ppc_atomic_t *counter, ppc_atomic_type_t val) //plain store of val to counter->val
+{
+  //Counter perpetually increments, so stale value is always smaller
+  //__asm__ __volatile__ ("lwsync":::"memory");  (fence intentionally left disabled)
+  counter->val = val;
+}
+
+static inline void PPC_AtomicReadFence() //consumer-side read fence
+{
+#if !CMK_BLUEGENEQ  //skipped on BG/Q: full memory barrier executed on Producer
+  __asm__ __volatile__ ("isync":::"memory");
+#endif
+}
+
+static inline void PPC_AtomicWriteFence() //producer-side write fence
+{
+#if CMK_BLUEGENEQ //execute full memory barrier
+  __asm__ __volatile__ ("sync":::"memory");
+#else //every other target issues lwsync instead
+  __asm__ __volatile__ ("lwsync":::"memory");
+#endif
+}
+
+#endif
diff --git a/src/arch/pami/machine.c b/src/arch/pami/machine.c
index 05a450d126..2f7a5c3b7a 100644
--- a/src/arch/pami/machine.c
+++ b/src/arch/pami/machine.c
@@ -1,3 +1,8 @@
+
+#define _GNU_SOURCE
+#include <pthread.h>
+#include <sched.h>
+
 #include <stdio.h>
 #include <errno.h>
 #include <stdlib.h>
@@ -10,18 +15,18 @@
 #include "assert.h"
 #include "malloc.h"
 
+#if CMK_BLUEGENEQ
 #include <hwi/include/bqc/A2_inlines.h>
 #include "spi/include/kernel/process.h"
 #include "spi/include/kernel/memory.h"
+#endif
+
 #include "pami.h"
 #include "pami_sys.h"
 
 #if MACHINE_DEBUG_LOG
 FILE *debugLog = NULL;
 #endif
-//#if CMK_SMP
-//#define CMK_USE_L2ATOMICS   1
-//#endif
 
 #if !CMK_SMP
 #if CMK_ENABLE_ASYNC_PROGRESS
@@ -29,13 +34,15 @@ FILE *debugLog = NULL;
 #endif
 #endif
 
-
-#if CMK_SMP && CMK_USE_L2ATOMICS
-#include "L2AtomicQueue.h"
-#include "L2AtomicMutex.h"
+#if CMK_SMP && CMK_PPC_ATOMIC_QUEUE
+#include "PPCAtomicQueue.h"
 #include "memalloc.c"
 #endif
 
+#if CMK_SMP && CMK_PPC_ATOMIC_MUTEX
+#include "PPCAtomicMutex.h"
+#endif
+
 #define CMI_LIKELY(x)    (__builtin_expect(x,1))
 #define CMI_UNLIKELY(x)  (__builtin_expect(x,0))
 
@@ -49,12 +56,7 @@ char *ALIGN_32(char *p) {
   This will use the fourth short in message as an indicator of spanning tree
   root.
 */
-#if CMK_SMP
-#define CMK_BROADCAST_SPANNING_TREE    1
-#else
 #define CMK_BROADCAST_SPANNING_TREE    1
-#endif /* CMK_SMP */
-
 #define BROADCAST_SPANNING_FACTOR     4
 
 //The root of the message infers the type of the message
@@ -72,14 +74,21 @@ char *ALIGN_32(char *p) {
 /* FIXME: need a random number that everyone agrees ! */
 #define CHARM_MAGIC_NUMBER               126
 
-
 #define CMI_PAMI_SHORT_DISPATCH           7
 #define CMI_PAMI_RZV_DISPATCH             8
 #define CMI_PAMI_ACK_DISPATCH             9
 #define CMI_PAMI_DISPATCH                10
 
+#if CMK_BLUEGENEQ  //value test, matching the other CMK_BLUEGENEQ guards in this file
 #define SHORT_CUTOFF   128
 #define EAGER_CUTOFF   4096
+#else
+#define SHORT_CUTOFF   1920
+#define EAGER_CUTOFF   2000000000
+#endif
+
+//typically this can be enabled when LTPS==0
+#define FREE_LIST_SEND_NO_COPY     0
 
 #if CMK_ERROR_CHECKING
 static int checksum_flag = 0;
@@ -128,35 +137,28 @@ CpvDeclare(void*, CmiLocalQueue);
 
 
 typedef struct ProcState {
-    /* PCQueue      sendMsgBuf; */  /* per processor message sending queue */
-#if CMK_SMP && CMK_USE_L2ATOMICS
-    L2AtomicQueue   atomic_queue;
+#if CMK_SMP && CMK_PPC_ATOMIC_QUEUE
+  PPCAtomicQueue   atomic_queue;
+  char            _pad[128-sizeof(PPCAtomicQueue)];
 #endif
-  /* CmiNodeLock  recvLock;  */            /* for cs->recv */
 } ProcState;
 
 static ProcState  *procState;
 
-#if CMK_SMP && CMK_USE_L2ATOMICS
-static L2AtomicMutex *node_recv_mutex;
-static L2AtomicQueue node_recv_atomic_q;
+#if CMK_SMP && CMK_PPC_ATOMIC_QUEUE
+static PPCAtomicQueue node_recv_atomic_q;
 #endif
 
-#if CMK_SMP && !CMK_MULTICORE
-//static volatile int commThdExit = 0;
-//static CmiNodeLock commThdExitLock = 0;
+#if CMK_SMP && CMK_PPC_ATOMIC_MUTEX
+static PPCAtomicMutex *node_recv_mutex;
+#endif
 
+#if CMK_SMP && !CMK_MULTICORE
 //The random seed to pick destination context
 __thread uint32_t r_seed = 0xdeadbeef;
 __thread int32_t _cmi_bgq_incommthread = 0;
 #endif
 
-//int CmiInCommThread () {
-//  //if (_cmi_bgq_incommthread)
-//  //printf ("CmiInCommThread: %d\n", _cmi_bgq_incommthread);
-//  return _cmi_bgq_incommthread;
-//}
-
 void ConverseRunPE(int everReturn);
 static void CommunicationServer(int sleepTime);
 static void CommunicationServerThread(int sleepTime);
@@ -218,8 +220,8 @@ void CmiPushPE(int pe,void *msg) {
     }
 #endif
     
-#if CMK_SMP && CMK_USE_L2ATOMICS
-    L2AtomicEnqueue(&procState[pe].atomic_queue, msg);
+#if CMK_SMP && CMK_PPC_ATOMIC_QUEUE
+    PPCAtomicEnqueue(&procState[pe].atomic_queue, msg);
 #else
     PCQueuePush(cs->recv,(char *)msg);
 #endif
@@ -239,12 +241,10 @@ static void CmiPushNode(void *msg) {
       return;
     }
 #endif
-#if CMK_SMP && CMK_USE_L2ATOMICS
-    L2AtomicEnqueue(&node_recv_atomic_q, msg);    
+#if CMK_SMP && CMK_PPC_ATOMIC_QUEUE
+    PPCAtomicEnqueue(&node_recv_atomic_q, msg);
 #else
-    CmiLock(CsvAccess(NodeState).CmiNodeRecvLock);
     PCQueuePush(CsvAccess(NodeState).NodeRecv,msg);
-    CmiUnlock(CsvAccess(NodeState).CmiNodeRecvLock);
 #endif
     //CmiState cs=CmiGetStateN(0);
     //CmiIdleLock_addMessage(&cs->idle);
@@ -263,13 +263,13 @@ static void CmiPushNode(void *msg) {
 volatile int msgQueueLen [MAX_NUM_CONTEXTS];
 volatile int outstanding_recvs [MAX_NUM_CONTEXTS];
 
-//#if CMK_SMP && CMK_ENABLE_ASYNC_PROGRESS
-//#define THREADS_PER_CONTEXT 2
-//#define LTPS                1 //Log Threads Per Context (TPS)
-//#else
+#if CMK_BLUEGENEQ
 #define THREADS_PER_CONTEXT 4
 #define LTPS                2 //Log Threads Per Context (TPS)
-//#endif
+#else
+#define THREADS_PER_CONTEXT 1
+#define LTPS                0 //Log Threads Per Context (TPS)
+#endif //endif CMK_BLUEGENEQ
 
 #define  MY_CONTEXT_ID() (CmiMyRank() >> LTPS)
 #define  MY_CONTEXT()    (cmi_pami_contexts[CmiMyRank() >> LTPS])
@@ -293,12 +293,12 @@ volatile int outstanding_recvs;
 #define  INCR_ORECVS()   (outstanding_recvs ++)
 #define  DECR_ORECVS()   (outstanding_recvs --)
 #define  ORECVS()        (outstanding_recvs)
-#endif
+#endif //CMK_SMP
 
 #if CMK_SMP  && !CMK_ENABLE_ASYNC_PROGRESS
 #define PAMIX_CONTEXT_LOCK_INIT(x)
 #define PAMIX_CONTEXT_LOCK(x)        if(LTPS) PAMI_Context_lock(x)
-#define PAMIX_CONTEXT_UNLOCK(x)      if(LTPS) {ppc_msync(); PAMI_Context_unlock(x);}
+#define PAMIX_CONTEXT_UNLOCK(x)      if(LTPS) {CmiMemoryWriteFence(); PAMI_Context_unlock(x);}
 #define PAMIX_CONTEXT_TRYLOCK(x)     ((LTPS)?(PAMI_Context_trylock(x) == PAMI_SUCCESS):(1))
 #else
 #define PAMIX_CONTEXT_LOCK_INIT(x)
@@ -358,8 +358,10 @@ static void recv_done(pami_context_t ctxt, void *clientdata, pami_result_t resul
 {
     char *msg = (char *) clientdata;
     int sndlen = ((CmiMsgHeaderBasic *) msg)->size;
-    //int rank = *(int *) (msg + sndlen); //get rank from bottom of the message
-    //CMI_DEST_RANK(msg) = rank;
+#if FREE_LIST_SEND_NO_COPY
+    int rank = *(int *) (msg + sndlen); //get rank from bottom of the message
+    CMI_DEST_RANK(msg) = rank;
+#endif
 
     //fprintf (stderr, "%d Recv message done \n", CmiMyPe());
     /* then we do what PumpMsgs used to do:
@@ -378,7 +380,7 @@ static void recv_done(pami_context_t ctxt, void *clientdata, pami_result_t resul
 
 #if CMK_NODE_QUEUE_AVAILABLE
 #if CMK_BROADCAST_SPANNING_TREE
-    if (CMI_IS_BCAST_ON_NODES(msg)) 
+    if (CMI_IS_BCAST_ON_NODES(msg))
       SendSpanningChildrenNode(sndlen, msg, 1);
 #endif
     if (CMI_DEST_RANK(msg) == SMP_NODEMESSAGE)
@@ -394,15 +396,16 @@ typedef struct _cmi_pami_rzv {
   void           * buffer;
   size_t           offset;
   int              bytes;
+  int              rank;
   int              dst_context;
-  pami_memregion_t mregion;
 }CmiPAMIRzv_t;  
 
 typedef struct _cmi_pami_rzv_recv {
+  int              rank;  //Read in recv_done
+  int              size;
   void           * msg;
   void           * src_buffer;
   int              src_ep;
-  int              size;
   pami_memregion_t rmregion;
 } CmiPAMIRzvRecv_t;
 
@@ -415,13 +418,15 @@ static void pkt_dispatch (pami_context_t       context,
 			  pami_endpoint_t      origin,
 			  pami_recv_t         * recv)        
 {
-    //fprintf (stderr, "Received Message of size %d %p\n", pipe_size, recv);
+    //fprintf (stderr, "%d Received Message of size %d %p\n", CmiMyPe(), pipe_size, recv);
     INCR_ORECVS();    
     int alloc_size = pipe_size;
+#if !FREE_LIST_SEND_NO_COPY
     char * buffer  = (char *)CmiAlloc(alloc_size);
-    //char * buffer  = (char *)CmiAlloc(alloc_size + sizeof(int));
-    //*(int *)(buffer+alloc_size) = *(int *)header_addr;
-
+#else
+    char * buffer  = (char *)CmiAlloc(alloc_size + sizeof(int));
+    *(int *)(buffer+alloc_size) = *(int *)header_addr;
+#endif
     if (recv) {
       recv->local_fn = recv_done;
       recv->cookie   = buffer;
@@ -445,10 +450,9 @@ static void short_pkt_dispatch (pami_context_t       context,
 				pami_endpoint_t      origin,
 				pami_recv_t         * recv)        
 {
+  //fprintf(stderr, "%d short dispatch\n", CmiMyPe());
   int alloc_size = pipe_size;
   char * buffer  = (char *)CmiAlloc(alloc_size);
-  //char * buffer  = (char *)CmiAlloc(alloc_size + sizeof(int));
-  //*(int *)(buffer+alloc_size) = *(int *)header_addr;
   
   memcpy (buffer, pipe_addr, pipe_size);
   char *smsg = (char *)pipe_addr;
@@ -460,7 +464,13 @@ static void short_pkt_dispatch (pami_context_t       context,
     CmiAbort("Charm++ Warning: Non Charm++ Message Received. If your application has a large number of messages, this may be because of overflow in the low-level FIFOs. Please set the environment variable MUSPI_INJFIFOSIZE if the application has large number of small messages (<=4K bytes), and/or PAMI_RGETINJFIFOSIZE if the application has a large number of large messages. The default value of these variable is 65536 which is sufficient for 1000 messages in flight; please try a larger value. Please note that the memory used for these FIFOs eats up the memory = 10*FIFO_SIZE per core. Please contact Charm++ developers for further information. \n");     
   }
  
-  CmiPushPE(CMI_DEST_RANK(smsg), (void *)msg);
+#if FREE_LIST_SEND_NO_COPY
+  int dst_rank = *(int*) header_addr;
+  CMI_DEST_RANK(msg) = dst_rank;
+  CmiPushPE(dst_rank, (void *)msg);
+#else
+  CmiPushPE(CMI_DEST_RANK(msg), (void *)msg);
+#endif
 }
 
 
@@ -486,6 +496,7 @@ void rzv_recv_done   (pami_context_t     ctxt,
 		      void             * clientdata, 
 		      pami_result_t      result); 
 
+#if CMK_BLUEGENEQ
 //approx sleep command
 size_t mysleep_iter = 0;
 void mysleep (unsigned long cycles) {
@@ -499,10 +510,10 @@ void mysleep (unsigned long cycles) {
 
     return;
 }
+#endif
 
 static void * test_buf;
 volatile int pami_barrier_flag = 0;
-typedef pami_result_t (*pamix_proc_memalign_fn) (void**, size_t, size_t, const char*);
 
 void pami_barrier_done (void *ctxt, void * clientdata, pami_result_t err)
 {
@@ -528,7 +539,6 @@ CmiPAMIMemRegion_t  cmi_pami_memregion[64];
 #endif
 
 #include "malloc.h"
-void *l2atomicbuf;
 
 void _alias_rank (int rank) {
 #if CMK_SMP && CMK_ENABLE_ASYNC_PROGRESS
@@ -639,7 +649,8 @@ int CMI_Progress_finalize(int start, int ncontexts) {
 
 #include "manytomany.c"
 
-void ConverseInit(int argc, char **argv, CmiStartFn fn, int usched, int initret) {
+void ConverseInit(int argc, char **argv, CmiStartFn fn, int usched, int initret)
+{
     int n, i, count;
 
     /* processor per node */
@@ -651,6 +662,12 @@ void ConverseInit(int argc, char **argv, CmiStartFn fn, int usched, int initret)
 #endif
     
     PAMI_Client_create (clientname, &cmi_pami_client, NULL, 0);
+    pami_configuration_t query;
+    query.name = PAMI_CLIENT_NUM_CONTEXTS;
+    pami_result_t rc = PAMI_Client_query(cmi_pami_client, &query, 1);
+    unsigned possible_contexts = query.value.intval;
+    //fprintf(stdout, "Creating client with %d contexts\n", possible_contexts);
+
     size_t _n = 1;
 #if CMK_PAMI_MULTI_CONTEXT
     if ((_Cmi_mynodesize % THREADS_PER_CONTEXT) == 0)
@@ -660,7 +677,18 @@ void ConverseInit(int argc, char **argv, CmiStartFn fn, int usched, int initret)
 #endif
 
     cmi_pami_contexts = (pami_context_t *) malloc (sizeof(pami_context_t) * _n);
-    pami_result_t rc = PAMI_Context_createv (cmi_pami_client, NULL, 0, cmi_pami_contexts, _n);
+
+    int  cfgval=0; //number of entries filled in config, passed to PAMI_Context_createv
+#if 1 //CMK_BLUEGENEQ
+    pami_configuration_t *config = NULL;
+#else //disabled branch: request one explicit configuration entry
+    pami_configuration_t config[3];
+    config[cfgval].name = PAMI_CLIENT_CONST_CONTEXTS;
+    config[cfgval].value.intval = 1;
+    cfgval++;
+#endif
+
+    rc = PAMI_Context_createv (cmi_pami_client, config, cfgval, cmi_pami_contexts, _n);
     if (rc != PAMI_SUCCESS) {
       fprintf(stderr, "PAMI_Context_createv failed for %d contexts\n", _n);
       assert(0);
@@ -689,8 +717,18 @@ void ConverseInit(int argc, char **argv, CmiStartFn fn, int usched, int initret)
     }
 #endif
 
+    pami_dispatch_hint_t soptions = (pami_dispatch_hint_t) {0};
+    pami_dispatch_hint_t loptions = (pami_dispatch_hint_t) {0};
+
+    soptions.long_header    = PAMI_HINT_DISABLE;
+    soptions.recv_immediate = PAMI_HINT_ENABLE;
+    soptions.use_rdma       = PAMI_HINT_DISABLE;
+
+    loptions.long_header     = PAMI_HINT_DISABLE;
+    loptions.recv_contiguous = PAMI_HINT_ENABLE;
+    //loptions.recv_immediate = PAMI_HINT_ENABLE;
+    loptions.recv_copy       = PAMI_HINT_ENABLE;
 
-    pami_dispatch_hint_t options = (pami_dispatch_hint_t) {0};
     pami_dispatch_callback_function pfn;
     for (i = 0; i < _n; ++i) {
       pfn.p2p = pkt_dispatch;
@@ -698,31 +736,31 @@ void ConverseInit(int argc, char **argv, CmiStartFn fn, int usched, int initret)
 			 CMI_PAMI_DISPATCH,
 			 pfn,
 			 NULL,
-			 options);
+			 loptions);
       
       pfn.p2p = ack_pkt_dispatch;
       PAMI_Dispatch_set (cmi_pami_contexts[i],
 			 CMI_PAMI_ACK_DISPATCH,
 			 pfn,
 			 NULL,
-			 options);
+			 soptions);
       
       pfn.p2p = rzv_pkt_dispatch;
       PAMI_Dispatch_set (cmi_pami_contexts[i],
 			 CMI_PAMI_RZV_DISPATCH,
 			 pfn,
 			 NULL,
-			 options);      
+			 soptions);
 
       pfn.p2p = short_pkt_dispatch;
       PAMI_Dispatch_set (cmi_pami_contexts[i],
 			 CMI_PAMI_SHORT_DISPATCH,
 			 pfn,
 			 NULL,
-			 options);      
+			 soptions);
     }
 
-#if 1
+#if CMK_BLUEGENEQ
     size_t bytes_out;
     void * buf = malloc(sizeof(long));    
     uint32_t retval;
@@ -834,49 +872,52 @@ void ConverseInit(int argc, char **argv, CmiStartFn fn, int usched, int initret)
     CsvInitialize(CmiNodeState, NodeState);
     CmiNodeStateInit(&CsvAccess(NodeState));
 
-#if CMK_SMP && CMK_USE_L2ATOMICS
+#if CMK_SMP
+    posix_memalign((void**)&procState, 128, (_Cmi_mynodesize) * sizeof(ProcState));
+#endif
+
+#if CMK_SMP && CMK_PPC_ATOMIC_QUEUE
+#if CMK_BLUEGENEQ // we may enable communication threads
     //max available hardware threads
     int actualNodeSize = 64/Kernel_ProcessCount(); 
-    //printf("Ranks per node %d, actualNodeSize %d CmiMyNodeSize() %d\n",
-    //	   Kernel_ProcessCount(), actualNodeSize, _Cmi_mynodesize);
-    
-    //pami_result_t rc;
-    pami_extension_t l2;
-    pamix_proc_memalign_fn PAMIX_L2_proc_memalign;
-    size_t size = (_Cmi_mynodesize + 2*actualNodeSize + 1) 
-      * sizeof(L2AtomicState) + sizeof(L2AtomicMutex);
+#else
+    int actualNodeSize = _Cmi_mynodesize;
+#endif
 
-    rc = PAMI_Extension_open(NULL, "EXT_bgq_l2atomic", &l2);
-    CmiAssert (rc == 0);
-    PAMIX_L2_proc_memalign = (pamix_proc_memalign_fn)PAMI_Extension_symbol(l2, "proc_memalign");
-    rc = PAMIX_L2_proc_memalign(&l2atomicbuf, 64, size, NULL);
-    CmiAssert (rc == 0);    
+#if CMK_PPC_ATOMIC_MUTEX
+    //Allocate for PPC Atomic Mutex as well
+    size_t size = (_Cmi_mynodesize + 3*actualNodeSize + 1)
+      * sizeof(PPCAtomicState) + 2*sizeof(PPCAtomicMutex);
+#else
+    size_t size = (_Cmi_mynodesize + 3*actualNodeSize + 1)
+      * sizeof(PPCAtomicState);
 #endif
+    void *atomic_buf;
+    PPC_AtomicCounterAllocate(&atomic_buf, size);
 
-    char *l2_start = (char *) l2atomicbuf;
-    procState = (ProcState *)malloc((_Cmi_mynodesize) * sizeof(ProcState));
+    char *atomic_start = (char *) atomic_buf;
     for (i=0; i<_Cmi_mynodesize; i++) {
-#if CMK_SMP && CMK_USE_L2ATOMICS
-	L2AtomicQueueInit (l2_start + sizeof(L2AtomicState)*i,
-			   sizeof(L2AtomicState),
-			   &procState[i].atomic_queue,
-			   1, /*use overflow*/
-			   DEFAULT_SIZE /*1024 entries*/);
-#endif
+      PPCAtomicQueueInit (atomic_start + sizeof(PPCAtomicState)*i,
+			  sizeof(PPCAtomicState),
+			  &procState[i].atomic_queue,
+			  1, /*use overflow*/
+			  DEFAULT_SIZE /*2048 entries*/);
     }
+    atomic_start += _Cmi_mynodesize * sizeof(PPCAtomicState);
 
-#if CMK_SMP && CMK_USE_L2ATOMICS    
-    l2_start += _Cmi_mynodesize * sizeof(L2AtomicState);
-    CmiMemAllocInit_bgq (l2_start, 2*actualNodeSize*sizeof(L2AtomicState)); 
-    l2_start += 2*actualNodeSize*sizeof(L2AtomicState); 
+    CmiMemAllocInit_ppcq(atomic_start,3*actualNodeSize*sizeof(PPCAtomicState));
+    atomic_start += 3*actualNodeSize*sizeof(PPCAtomicState);
 
-    L2AtomicQueueInit (l2_start,
-		       sizeof(L2AtomicState),
-		       &node_recv_atomic_q,
-		       1, /*use overflow*/
-		       DEFAULT_SIZE /*1024 entries*/);	 
-    l2_start += sizeof(L2AtomicState);  
-    node_recv_mutex = L2AtomicMutexInit(l2_start, sizeof(L2AtomicMutex));   
+    PPCAtomicQueueInit (atomic_start,
+			sizeof(PPCAtomicState),
+			&node_recv_atomic_q,
+			1, /*use overflow*/
+			DEFAULT_SIZE /*2048 entries*/);
+    atomic_start += sizeof(PPCAtomicState);
+
+#if CMK_PPC_ATOMIC_MUTEX
+    node_recv_mutex = PPCAtomicMutexInit(atomic_start, sizeof(PPCAtomicMutex));
+#endif
 #endif
     
     //Initialize the manytomany api
@@ -899,11 +940,9 @@ int PerrorExit (char *err) {
     return -1;
 }
 
-
 void ConverseRunPE(int everReturn) {
     //    printf ("ConverseRunPE on rank %d\n", CmiMyPe());    
 
-    CmiIdleState *s=CmiNotifyGetState();
     CmiState cs;
     char** CmiMyArgv;
     CmiNodeAllBarrier();
@@ -919,19 +958,35 @@ void ConverseRunPE(int everReturn) {
 
     CthInit(CmiMyArgv);
 
+    CmiBarrier();
+    CmiBarrier();
+    CmiBarrier();
+    CmiBarrier();
+
     //printf ("Before Converse Common Init\n");
     ConverseCommonInit(CmiMyArgv);
 
+#if CMK_TRACE_ENABLED
+    //Register memory allocator events
+    traceRegisterUserEvent("CmiAlloc_ppcq", 30001);
+    traceRegisterUserEvent("CmiFree_ppcq",  30002);
+    traceRegisterUserEvent("machine_send",  30003);
+    traceRegisterUserEvent("CmiSendPeer",   30004);
+    traceRegisterUserEvent("PAMI_Context_advance",   30005);
+    traceRegisterUserEvent("m2m_start",   30006);
+    traceRegisterUserEvent("PAMI_Context_post",   30007);
+#endif
+
     CcdCallOnConditionKeep(CcdPROCESSOR_STILL_IDLE,(CcdVoidFn)CmiNotifyIdle,NULL);
 
     //printf ("before calling CmiBarrier() \n");
-    CmiBarrier();
 
     /* Converse initialization finishes, immediate messages can be processed.
        node barrier previously should take care of the node synchronization */
     _immediateReady = 1;
 
     //printf("calling the startfn\n");
+    CmiBarrier();
 
     if (!everReturn) {
       Cmi_startfn(CmiGetArgc(CmiMyArgv), CmiMyArgv);
@@ -1011,7 +1066,9 @@ void ConverseExit(void) {
 #if CMK_SMP
   CmiNodeBarrier();
   if (rank0) {
+#if CMK_BLUEGENEQ
     Delay(100000);
+#endif
     exit(0); 
   }
   else
@@ -1037,15 +1094,20 @@ void CmiAbort(const char * message) {
 
 #if CMK_NODE_QUEUE_AVAILABLE
 char *CmiGetNonLocalNodeQ(void) {
-    //CmiState cs = CmiGetState();
     char *result = 0;
-    //CmiIdleLock_checkMessage(&cs->idle);
 
-#if CMK_SMP && CMK_USE_L2ATOMICS
-    if (!L2AtomicQueueEmpty(&node_recv_atomic_q)) {
-      if (L2AtomicMutexTryAcquire(node_recv_mutex) == 0) {
-	result = (char*)L2AtomicDequeue(&node_recv_atomic_q);
-	L2AtomicMutexRelease(node_recv_mutex);
+#if CMK_SMP && CMK_PPC_ATOMIC_MUTEX && CMK_PPC_ATOMIC_QUEUE
+    if (!PPCAtomicQueueEmpty(&node_recv_atomic_q)) {
+      if (PPCAtomicMutexTryAcquire(node_recv_mutex) == 0) {
+        result = (char*)PPCAtomicDequeue(&node_recv_atomic_q);
+        PPCAtomicMutexRelease(node_recv_mutex);
+      }
+    }
+#elif CMK_SMP && CMK_PPC_ATOMIC_QUEUE
+    if (!PPCAtomicQueueEmpty(&node_recv_atomic_q)) {
+      if (CmiTryLock(CsvAccess(NodeState).CmiNodeRecvLock) == 0) {
+        result = (char*)PPCAtomicDequeue(&node_recv_atomic_q);
+        CmiUnlock(CsvAccess(NodeState).CmiNodeRecvLock);
       }
     }
 #else
@@ -1053,7 +1115,6 @@ char *CmiGetNonLocalNodeQ(void) {
       MACHSTATE1(3,"CmiGetNonLocalNodeQ begin %d {", CmiMyPe());
       
       if (CmiTryLock(CsvAccess(NodeState).CmiNodeRecvLock) == 0) {
-	//CmiLock(CsvAccess(NodeState).CmiNodeRecvLock);
 	result = (char *) PCQueuePop(CsvAccess(NodeState).NodeRecv);
 	CmiUnlock(CsvAccess(NodeState).CmiNodeRecvLock);
       }
@@ -1073,12 +1134,12 @@ void *CmiGetNonLocal() {
     CmiState cs = CmiGetState();
     //CmiIdleLock_checkMessage(&cs->idle);
 
-#if CMK_SMP && CMK_USE_L2ATOMICS
-    msg = L2AtomicDequeue(&procState[CmiMyRank()].atomic_queue);
+#if CMK_SMP && CMK_PPC_ATOMIC_QUEUE
+    msg = PPCAtomicDequeue(&procState[CmiMyRank()].atomic_queue);
 #if !(CMK_ENABLE_ASYNC_PROGRESS)
     if (msg == NULL) {
       AdvanceCommunications();     
-      msg = L2AtomicDequeue(&procState[CmiMyRank()].atomic_queue);
+      msg = PPCAtomicDequeue(&procState[CmiMyRank()].atomic_queue);
     }
 #endif
 #else
@@ -1088,9 +1149,6 @@ void *CmiGetNonLocal() {
     msg =  PCQueuePop(cs->recv);
 #endif
 
-    //if (msg != NULL)
-    //fprintf(stderr, "%d: Returning a message\n", CmiMyPe());
-
     return msg;
 }
 
@@ -1109,8 +1167,15 @@ static void CmiSendSelf(char *msg) {
 
 #if CMK_SMP
 static void CmiSendPeer (int rank, int size, char *msg) {
-    //fprintf(stderr, "%d Send messages to peer\n", CmiMyPe());
-    CmiPushPE (rank, msg);
+  //fprintf(stderr, "%d Send messages to peer\n", CmiMyPe());
+#if CMK_TRACE_ENABLED
+  double start = CmiWallTimer();
+#endif
+  CmiPushPE (rank, msg);
+
+#if CMK_TRACE_ENABLED
+  traceUserBracketEvent(30004, start, CmiWallTimer());
+#endif
 }
 #endif
 
@@ -1151,8 +1216,8 @@ void CmiGeneralFreeSendN(int node, int rank, int size, char * msg, int to_lock)
 #if CMK_SMP
     CMI_DEST_RANK(msg) = rank;
     if (node == CmiMyNode()) {
-        CmiSendPeer (rank, size, msg);
-        return;
+      CmiSendPeer (rank, size, msg);
+      return;
     }
 #endif
 
@@ -1194,17 +1259,23 @@ void  machine_send       (pami_context_t      context,
 {
     CMI_DEST_RANK(msg) = rank;
 
+#if CMK_TRACE_ENABLED
+    double start = CmiWallTimer();
+#endif
+
+    CmiAssert (node != CmiMyNode());
+
     pami_endpoint_t target;
 #if CMK_PAMI_MULTI_CONTEXT
-    //size_t dst_context = (rank != SMP_NODEMESSAGE) ? (rank>>LTPS) : (rand_r(&r_seed) % cmi_pami_numcontexts);
+    size_t dst_context = (rank != SMP_NODEMESSAGE) ? (rank>>LTPS) : (myrand(&r_seed) % cmi_pami_numcontexts);
     //Choose a context at random
-    size_t dst_context = myrand(&r_seed) % cmi_pami_numcontexts;
+    //size_t dst_context = myrand(&r_seed) % cmi_pami_numcontexts;
 #else
     size_t dst_context = 0;
 #endif
     PAMI_Endpoint_create (cmi_pami_client, (pami_task_t)node, dst_context, &target);
     
-    //fprintf (stderr, "Calling PAMI Send to %d magic %d size %d\n", node, CMI_MAGIC(msg), size);
+    //fprintf (stderr, "%d Calling PAMI Send to node %d peer %d magic %d size %d\n", CmiMyPe(), node, dst_context, CMI_MAGIC(msg), size);
     if (CMI_LIKELY(size < SHORT_CUTOFF)) {
       pami_send_immediate_t parameters;
       
@@ -1216,15 +1287,20 @@ void  machine_send       (pami_context_t      context,
 	  //use short callback if not a bcast and not an SMP node message
 	  parameters.dispatch        = CMI_PAMI_SHORT_DISPATCH;
 
-      parameters.header.iov_base = NULL; //&rank;
-      parameters.header.iov_len  = 0;    //sizeof(int);
+#if FREE_LIST_SEND_NO_COPY
+      parameters.header.iov_base = &rank;
+      parameters.header.iov_len  = sizeof(int);
+#else
+      parameters.header.iov_base = NULL;
+      parameters.header.iov_len  = 0;
+#endif
       parameters.data.iov_base   = msg;
       parameters.data.iov_len    = size;
       parameters.dest = target;
       
       if(to_lock)
 	PAMIX_CONTEXT_LOCK(context);
-      
+
       PAMI_Send_immediate (context, &parameters);
       
       if(to_lock)
@@ -1234,8 +1310,13 @@ void  machine_send       (pami_context_t      context,
     else if (size < EAGER_CUTOFF) {
       pami_send_t parameters;
       parameters.send.dispatch        = CMI_PAMI_DISPATCH;
-      parameters.send.header.iov_base = NULL; //&rank;
-      parameters.send.header.iov_len  = 0;    //sizeof(int);
+#if FREE_LIST_SEND_NO_COPY
+      parameters.send.header.iov_base = &rank;
+      parameters.send.header.iov_len  = sizeof(int);
+#else
+      parameters.send.header.iov_base = NULL;
+      parameters.send.header.iov_len  = 0;
+#endif
       parameters.send.data.iov_base   = msg;
       parameters.send.data.iov_len    = size;
       parameters.events.cookie        = msg;
@@ -1252,29 +1333,50 @@ void  machine_send       (pami_context_t      context,
 	PAMIX_CONTEXT_UNLOCK(context);
     }
     else {
+      if(to_lock)
+        PAMIX_CONTEXT_LOCK(context);
+
       CmiPAMIRzv_t   rzv;
       rzv.bytes       = size;
       rzv.buffer      = msg;
+      rzv.rank        = rank;
+#if CMK_BLUEGENEQ
       rzv.offset      = (size_t)msg - (size_t)cmi_pami_memregion[0].baseVA;
+#else
+      rzv.offset      = (size_t)msg;
+      size_t bytes_out;
+      pami_memregion_t mregion;
+      //In use for PAMI_Get
+      PAMI_Memregion_create (context,
+                             msg,
+                             size,
+                             &bytes_out,
+                             &mregion);
+#endif
       rzv.dst_context = dst_context;
-      memcpy(&rzv.mregion, &cmi_pami_memregion[0].mregion, sizeof(pami_memregion_t));
 
       pami_send_immediate_t parameters;
       parameters.dispatch        = CMI_PAMI_RZV_DISPATCH;
       parameters.header.iov_base = &rzv;
       parameters.header.iov_len  = sizeof(rzv);
+#if CMK_BLUEGENEQ
+      parameters.data.iov_base   = &cmi_pami_memregion[0].mregion;
+      parameters.data.iov_len    = sizeof(pami_memregion_t);
+#else
       parameters.data.iov_base   = NULL;
       parameters.data.iov_len    = 0;
+#endif
       parameters.dest = target;
       
-      if(to_lock)
-	PAMIX_CONTEXT_LOCK(context);
-      
       PAMI_Send_immediate (context, &parameters);
       
       if(to_lock)
 	PAMIX_CONTEXT_UNLOCK(context);
     }
+
+#if CMK_TRACE_ENABLED
+    traceUserBracketEvent(30003, start, CmiWallTimer());
+#endif
 }
 
 void CmiSyncSendFn(int destPE, int size, char *msg) {
@@ -1428,40 +1530,52 @@ void CmiFreeBroadcastAllFn(int size, char *msg) {
 void AdvanceCommunications() {
     pami_context_t my_context = MY_CONTEXT();
 
+#if CMK_TRACE_ENABLED
+    double start = CmiWallTimer(), end;
+#endif
+
 #if CMK_SMP
     //CmiAssert (my_context != NULL);
     if (PAMIX_CONTEXT_TRYLOCK(my_context))
     {
+      //fprintf(stderr, "%d advancing context %d\n", CmiMyPe(), MY_CONTEXT_ID());
       PAMI_Context_advance(my_context, 1);
       PAMIX_CONTEXT_UNLOCK(my_context);
     }
 #else
     PAMI_Context_advance(my_context, 1);
 #endif
+
+#if CMK_TRACE_ENABLED
+    end = CmiWallTimer();
+    //only log 1us or larger events
+    if (end - start > 1e-6)
+      traceUserBracketEvent(30005, start, end);
+#endif
 }
 #endif
 
 
 void CmiNotifyIdle() {
   AdvanceCommunications();
-#if CMK_SMP && CMK_PAMI_MULTI_CONTEXT
-#if !CMK_ENABLE_ASYNC_PROGRESS && CMK_USE_L2ATOMICS
+#if CMK_BLUEGENEQ && CMK_SMP && CMK_PAMI_MULTI_CONTEXT
+#if !CMK_ENABLE_ASYNC_PROGRESS && CMK_PPC_ATOMIC_QUEUE
   //Wait on the atomic queue to get a message with very low core
   //overheads. One thread calls advance more frequently
   if ((CmiMyRank()% THREADS_PER_CONTEXT) == 0)
     //spin wait for 2-4us when idle
     //process node queue messages every 10us
     //Idle cores will only use one LMQ slot and an int sum
-    L2AtomicQueue2QSpinWait(&procState[CmiMyRank()].atomic_queue, 
-			    &node_recv_atomic_q,
-			    10);
+    PPCAtomicQueue2QSpinWait(&procState[CmiMyRank()].atomic_queue,
+                             &node_recv_atomic_q,
+                             10);
   else
 #endif
-#if CMK_USE_L2ATOMICS
+#if CMK_PPC_ATOMIC_QUEUE
     //spin wait for 50-100us when idle waiting for a message
-    L2AtomicQueue2QSpinWait(&procState[CmiMyRank()].atomic_queue, 
-			    &node_recv_atomic_q,
-			    1000);
+    PPCAtomicQueue2QSpinWait(&procState[CmiMyRank()].atomic_queue,
+                             &node_recv_atomic_q,
+                             1000);
 #endif
 #endif
 }
@@ -1552,7 +1666,7 @@ void CmiFreeListSendFn(int npes, int *pes, int size, char *msg) {
 
     //Fast path
     if (npes == 1) {
-      CmiGeneralFreeSendN(CmiNodeOf(pes[0]), CmiRankOf(pes[0]), size, msg, 1);
+      CmiGeneralFreeSend(pes[0], size, msg);
       return;
     }
 
@@ -1577,10 +1691,29 @@ void CmiFreeListSendFn(int npes, int *pes, int size, char *msg) {
 void machineFreeListSendFn(pami_context_t my_context, int npes, int *pes, int size, char *msg) {
     int i;
     char *copymsg;
+
+    PAMIX_CONTEXT_LOCK(my_context);
+
+    for (i=0;i<npes;i++) {
+        if (CmiNodeOf(pes[i]) != CmiMyNode()){
+#if !CMK_SMP || (CMK_SMP && FREE_LIST_SEND_NO_COPY)
+          CmiReference(msg);
+          copymsg = msg;
+          machine_send(my_context, CmiNodeOf(pes[i]), CmiRankOf(pes[i]), size, copymsg, 0);
+#else
+          copymsg = (char *)CmiAlloc(size);
+          CmiAssert(copymsg != NULL);
+          CmiMemcpy(copymsg,msg,size);
+          CmiGeneralFreeSendN(CmiNodeOf(pes[i]), CmiRankOf(pes[i]), size, copymsg, 0);
+#endif
+        }
+    }
+
+    PAMIX_CONTEXT_UNLOCK(my_context);
+
 #if CMK_SMP
     for (i=0; i<npes; i++) {
       if (CmiNodeOf(pes[i]) == CmiMyNode()) {
-	//CmiSyncSend(pes[i], size, msg);
 	copymsg = (char *)CmiAlloc(size);
 	CmiAssert(copymsg != NULL);
 	CmiMemcpy(copymsg,msg,size);	  
@@ -1595,31 +1728,8 @@ void machineFreeListSendFn(pami_context_t my_context, int npes, int *pes, int si
     }
 #endif
 
-    PAMIX_CONTEXT_LOCK(my_context);
-    
-    for (i=0;i<npes;i++) {
-        if (CmiNodeOf(pes[i]) == CmiMyNode());
-        else if (i < npes - 1) {
-#if !CMK_SMP
-	    CmiReference(msg);
-	    copymsg = msg;
-#else
-	    copymsg = (char *)CmiAlloc(size);
-	    CmiAssert(copymsg != NULL);
-	    CmiMemcpy(copymsg,msg,size);
-#endif
-	    CmiGeneralFreeSendN(CmiNodeOf(pes[i]), CmiRankOf(pes[i]), size, copymsg, 0);
-	    //machine_send(my_context, CmiNodeOf(pes[i]), CmiRankOf(pes[i]), size, copymsg, 0);
-        }
-    }
-
-    if (npes  && CmiNodeOf(pes[npes-1]) != CmiMyNode())
-      CmiGeneralFreeSendN(CmiNodeOf(pes[npes-1]), CmiRankOf(pes[npes-1]), size, msg, 0);
-      //machine_send(my_context, CmiNodeOf(pes[npes-1]), CmiRankOf(pes[npes-1]), size, msg, 0);      
-    else
-      CmiFree(msg);    
-
-    PAMIX_CONTEXT_UNLOCK(my_context);
+    //Free the original message
+    CmiFree(msg);
 }
 
 CmiCommHandle CmiAsyncListSendFn(int npes, int *pes, int size, char *msg) {
@@ -1794,25 +1904,6 @@ void CmiDestroyLock(CmiNodeLock lock);
 
 #endif
 
-/** IMMEDIATE MESSAGES
-
- * If immediate messages are supported, the following function is needed. There
- * is an exeption if the machine progress is also defined (see later for this).
-
- * Moreover, the file "immediate.c" should be included, otherwise all its
- * functions and variables have to be redefined.
-*/
-
-#if CMK_CCS_AVAILABLE
-
-#include "immediate.c"
-
-#if ! CMK_MACHINE_PROGRESS_DEFINED /* Hack for some machines */
-void CmiProbeImmediateMsg();
-#endif
-
-#endif
-
 
 /* Dummy implementation */
 extern int CmiBarrier() {
@@ -1876,12 +1967,10 @@ void CmiSendNodeSelf(char *msg) {
       return;
     }
 #endif    
-#if CMK_SMP && CMK_USE_L2ATOMICS
-    L2AtomicEnqueue(&node_recv_atomic_q, msg);    
+#if CMK_SMP && CMK_PPC_ATOMIC_QUEUE
+    PPCAtomicEnqueue(&node_recv_atomic_q, msg);
 #else
-    CmiLock(CsvAccess(NodeState).CmiNodeRecvLock);
     PCQueuePush(CsvAccess(NodeState).NodeRecv, msg);
-    CmiUnlock(CsvAccess(NodeState).CmiNodeRecvLock);
 #endif
 }
 
@@ -2047,9 +2136,7 @@ void rzv_pkt_dispatch (pami_context_t       context,
   CmiPAMIRzv_t  *rzv_hdr = (CmiPAMIRzv_t *) header_addr;
   CmiAssert (header_size == sizeof(CmiPAMIRzv_t));  
   int alloc_size = rzv_hdr->bytes;
-  char * buffer  = (char *)CmiAlloc(alloc_size + sizeof(CmiPAMIRzvRecv_t));
-  //char *buffer=(char*)CmiAlloc(alloc_size+sizeof(CmiPAMIRzvRecv_t)+sizeof(int))
-  //*(int *)(buffer+alloc_size) = *(int *)header_addr;  
+  char *buffer  = (char *)CmiAlloc(alloc_size + sizeof(CmiPAMIRzvRecv_t));
   CmiAssert (recv == NULL);
 
   CmiPAMIRzvRecv_t *rzv_recv = (CmiPAMIRzvRecv_t *)(buffer+alloc_size);
@@ -2057,11 +2144,12 @@ void rzv_pkt_dispatch (pami_context_t       context,
   rzv_recv->src_ep     = origin;
   rzv_recv->src_buffer = rzv_hdr->buffer;
   rzv_recv->size       = rzv_hdr->bytes;
-  
-  //CmiAssert (pipe_addr != NULL);
-  //CmiAssert (pipe_size == sizeof(pami_memregion_t));
-  //pami_memregion_t *mregion = (pami_memregion_t *) pipe_addr;
-  memcpy(&rzv_recv->rmregion, &rzv_hdr->mregion, sizeof(pami_memregion_t));
+  rzv_recv->rank       = rzv_hdr->rank;
+
+#if CMK_BLUEGENEQ  /* test the macro's value, not merely whether it is defined */
+  CmiAssert (pipe_addr != NULL);
+  CmiAssert (pipe_size == sizeof(pami_memregion_t));
+  memcpy(&rzv_recv->rmregion, pipe_addr, sizeof(pami_memregion_t));
 
   //Rzv inj fifos are on the 17th core shared by all contexts
   pami_rget_simple_t  rget;
@@ -2082,6 +2170,31 @@ void rzv_pkt_dispatch (pami_context_t       context,
   pami_result_t rc;
   rc = PAMI_Rget (context, &rget);  
   //CmiAssert(rc == PAMI_SUCCESS);
+#else
+  size_t bytes_out;
+  pami_memregion_t mregion;
+  //In use for PAMI_Get
+  PAMI_Memregion_create (context,
+			 buffer,
+			 rzv_hdr->bytes,
+			 &bytes_out,
+			 &mregion);
+
+  pami_get_simple_t get;
+  memset(&get, 0, sizeof(get));
+  get.rma.dest = origin;
+  get.rma.bytes = rzv_hdr->bytes;
+  get.rma.cookie = rzv_recv;
+  get.rma.done_fn = rzv_recv_done;
+  get.rma.hints.use_rdma = PAMI_HINT_ENABLE;
+  get.rma.hints.buffer_registered = PAMI_HINT_ENABLE;
+  get.rma.hints.use_shmem = PAMI_HINT_DEFAULT;
+  get.rma.hints.remote_async_progress = PAMI_HINT_DEFAULT;
+  get.addr.local = buffer;
+  get.addr.remote = (void*)rzv_hdr->offset;
+  PAMI_Get(context, &get);
+#endif
+
 }
 
 void ack_pkt_dispatch (pami_context_t       context,   
@@ -2097,4 +2210,6 @@ void ack_pkt_dispatch (pami_context_t       context,
   CmiFree (*buf);
 }
 
+#if CMK_BLUEGENEQ
 #include "cmimemcpy_qpx.h"
+#endif
diff --git a/src/arch/pami/manytomany.c b/src/arch/pami/manytomany.c
index 12616749e9..4f43cae6ea 100644
--- a/src/arch/pami/manytomany.c
+++ b/src/arch/pami/manytomany.c
@@ -8,17 +8,17 @@
 #define M2M_PAMI_DISPATCH   15
 
 typedef struct _pami_m2mhdr {
-  int8_t    dstrank;
-  int8_t    connid;
-  int32_t   srcindex;
-} PAMI_M2mHeader; 
+  uint8_t    dstrank;
+  uint8_t    connid;
+  uint32_t   srcindex;
+} PAMI_M2mHeader;
 
 typedef struct _pami_m2m_work {
-  pami_work_t    work;
   int            start;
   int            end;
   void         * handle;
   pami_context_t context;
+  pami_work_t    work;
 } PAMI_M2mWork_t;
 
 typedef struct _m2m_completionmsg {
@@ -27,56 +27,73 @@ typedef struct _m2m_completionmsg {
   int    rank;
 } M2mCompletionMsg;
 
+typedef struct _m2m_sendinfo {
+  char            * buf;
+  uint32_t          bytes;
+  pami_endpoint_t   ep;
+  uint16_t          dispatch;
+  PAMI_M2mHeader    hdr;
+} M2mSendInfo;
+
+#if CMK_SMP && CMK_ENABLE_ASYNC_PROGRESS
+#define M2M_PARALLEL_CONTEXT 1
+#elif CMK_SMP
+#define M2M_PARALLEL_CONTEXT 1
+#else
+#define M2M_PARALLEL_CONTEXT 0
+#endif
+
+#if M2M_PARALLEL_CONTEXT
 #define MAX_NWORK 8
+#else
+#define MAX_NWORK 1
+#endif
 
 typedef struct _pami_cmidhandle {
   int                   myrank;
-  unsigned              m2m_rcvcounter ;
-  unsigned              m2m_nzrcvranks;  
+  unsigned              m2m_rcvcounter;
+  unsigned              m2m_nzrcvranks;
+  unsigned              m2m_nsndranks;
   char                * m2m_rcvbuf     ;
   unsigned            * m2m_rcvlens    ;
   unsigned            * m2m_rdispls    ;
+  M2mSendInfo         * m2m_sndinfo    ;
+  PAMI_M2mWork_t        swork[MAX_NWORK];
+  int                   n_work;
 
-  unsigned              m2m_nsndranks;
-  unsigned              m2m_srankIndex;		      
+  //Less frequently used (or unused) during runtime execution
   char                * m2m_sndbuf     ;
-  unsigned            * m2m_sndlens    ;
-  unsigned            * m2m_sdispls    ;
   unsigned              m2m_sndcounter ;
-  unsigned            * m2m_permutation;
-  unsigned            * m2m_lranks     ;
-  pami_endpoint_t     * m2m_node_eps;
-
-  PAMI_M2mWork_t        swork[MAX_NWORK];  
-  int                   n_work;
+  unsigned              m2m_srankIndex;	  //Stored in header
 
   CmiDirectM2mHandler   m2m_rdone;
   void                * m2m_rdonecontext;
-  PAMI_M2mHeader      * m2m_hdrs;
   M2mCompletionMsg      cmsg;
 
   unsigned              m2m_ntotalrcvranks;
-  unsigned              m2m_initialized;  
-  unsigned              m2m_rrankIndex; 
+  unsigned              m2m_initialized;
+  unsigned              m2m_rrankIndex;
   CmiDirectM2mHandler   m2m_sdone;
   void                * m2m_sdonecontext;
-} PAMICmiDirectM2mHandle;  
+} PAMICmiDirectM2mHandle;
 
 CpvDeclare(PAMICmiDirectM2mHandle*, _handle);
 CpvDeclare(int, _completion_handler);
 
-static void m2m_recv_done(pami_context_t ctxt, void *clientdata, pami_result_t result) 
+static void m2m_recv_done(pami_context_t ctxt, void *clientdata, pami_result_t result)
 {
-  PAMICmiDirectM2mHandle *handle = (PAMICmiDirectM2mHandle *)clientdata;  
+  unsigned ntotal = 0;  //same type as the m2m_rcvcounter/m2m_nzrcvranks fields
+  PAMICmiDirectM2mHandle *handle = (PAMICmiDirectM2mHandle *)clientdata;
   //acquire lock if processed by many comm threads and contexts?
   handle->m2m_rcvcounter ++;
-    
-  if (handle->m2m_rcvcounter == handle->m2m_nzrcvranks) {
-    //printf ("Calling manytomany rdone for handle %p on rank %d counter %d nexp %d\n", 
+  ntotal = handle->m2m_rcvcounter;
+
+  if (ntotal == handle->m2m_nzrcvranks) {
+    //printf ("Calling manytomany rdone for handle %p on rank %d counter %d nexp %d\n",
     //    handle, CmiMyPe(),
     //    handle->m2m_rcvcounter, handle->m2m_nzrcvranks);
     handle->m2m_rcvcounter = 0;
-#if CMK_SMP && CMK_ENABLE_ASYNC_PROGRESS
+#if CMK_SMP && (M2M_PARALLEL_CONTEXT || LTPS)
     //Called from comm thread
     CmiSendPeer (handle->myrank, sizeof(M2mCompletionMsg), (char*)&handle->cmsg);
 #else
@@ -86,9 +103,9 @@ static void m2m_recv_done(pami_context_t ctxt, void *clientdata, pami_result_t r
   }
 }
 
-static void m2m_send_done(pami_context_t ctxt, void *clientdata, pami_result_t result) 
+static void m2m_send_done(pami_context_t ctxt, void *clientdata, pami_result_t result)
 {
-  PAMICmiDirectM2mHandle *handle = (PAMICmiDirectM2mHandle *)clientdata;  
+  PAMICmiDirectM2mHandle *handle = (PAMICmiDirectM2mHandle *)clientdata;
   //acquire lock if processed by many comm threads and contexts?
   handle->m2m_sndcounter ++;
   if (handle->m2m_sndcounter == handle->m2m_nsndranks) {
@@ -96,7 +113,7 @@ static void m2m_send_done(pami_context_t ctxt, void *clientdata, pami_result_t r
     //else
     handle->m2m_sndcounter = 0;
     if (handle->m2m_sdone)
-      handle->m2m_sdone(handle->m2m_sdonecontext); 
+      handle->m2m_sdone(handle->m2m_sdonecontext);
   }
 }
 
@@ -107,17 +124,21 @@ static void m2m_rdone_mainthread (void *m) {
     handle->m2m_rdone(handle->m2m_rdonecontext);
 }
 
-static void m2m_s8_dispatch (pami_context_t       context,  
+static void m2m_s8_dispatch (pami_context_t       context,
 			     void               * clientdata,
-			     const void         * header_addr, 
-			     size_t               header_size, 
-			     const void         * pipe_addr,   
-			     size_t               pipe_size,   
+			     const void         * header_addr,
+			     size_t               header_size,
+			     const void         * pipe_addr,
+			     size_t               pipe_size,
 			     pami_endpoint_t      origin,
-			     pami_recv_t         * recv)       
+			     pami_recv_t         * recv)
 {
   PAMI_M2mHeader *hdr = (PAMI_M2mHeader *) header_addr;
-  PAMICmiDirectM2mHandle *handlevec = CpvAccessOther(_handle, hdr->dstrank);  
+#if CMK_SMP && (M2M_PARALLEL_CONTEXT || LTPS)
+  PAMICmiDirectM2mHandle *handlevec = CpvAccessOther(_handle, hdr->dstrank);
+#else
+  PAMICmiDirectM2mHandle *handlevec = CpvAccess(_handle);
+#endif
   PAMICmiDirectM2mHandle *handle = &handlevec[hdr->connid];
   char *buffer = handle->m2m_rcvbuf + handle->m2m_rdispls[hdr->srcindex];
 
@@ -127,46 +148,59 @@ static void m2m_s8_dispatch (pami_context_t       context,
 }
 
 
-static void m2m_spkt_dispatch (pami_context_t       context,  
+static void m2m_spkt_dispatch (pami_context_t       context,
 			      void               * clientdata,
-			      const void         * header_addr, 
-			      size_t               header_size, 
-			      const void         * pipe_addr,   
-			      size_t               pipe_size,   
+			      const void         * header_addr,
+			      size_t               header_size,
+			      const void         * pipe_addr,
+			      size_t               pipe_size,
 			      pami_endpoint_t      origin,
-			      pami_recv_t         * recv)       
+			      pami_recv_t         * recv)
 {
   PAMI_M2mHeader *hdr = (PAMI_M2mHeader *) header_addr;
-  PAMICmiDirectM2mHandle *handlevec = CpvAccessOther(_handle, hdr->dstrank);   
+#if CMK_SMP && (M2M_PARALLEL_CONTEXT || LTPS)
+  PAMICmiDirectM2mHandle *handlevec = CpvAccessOther(_handle, hdr->dstrank);
+#else
+  PAMICmiDirectM2mHandle *handlevec = CpvAccess(_handle);
+#endif
   PAMICmiDirectM2mHandle *handle = &handlevec[hdr->connid];
 
   char *buffer = handle->m2m_rcvbuf + handle->m2m_rdispls[hdr->srcindex];
-  memcpy (buffer, pipe_addr, pipe_size);
+  if (pipe_size == 32) {
+    const uint64_t *src = (const uint64_t *)pipe_addr;  //keep pipe_addr's const
+    uint64_t *dst = (uint64_t *)buffer;
+
+    dst[0] = src[0];
+    dst[1] = src[1];
+    dst[2] = src[2];
+    dst[3] = src[3];
+  }
+  else
+    memcpy (buffer, pipe_addr, pipe_size);
   m2m_recv_done (context, handle, PAMI_SUCCESS);
 }
 
 
 
-static void m2m_pkt_dispatch (pami_context_t       context,  
+static void m2m_pkt_dispatch (pami_context_t       context,
 			      void               * clientdata,
-			      const void         * header_addr, 
-			      size_t               header_size, 
-			      const void         * pipe_addr,   
-			      size_t               pipe_size,   
+			      const void         * header_addr,
+			      size_t               header_size,
+			      const void         * pipe_addr,
+			      size_t               pipe_size,
 			      pami_endpoint_t      origin,
-			      pami_recv_t         * recv)       
+			      pami_recv_t         * recv)
 {
   PAMI_M2mHeader *hdr = (PAMI_M2mHeader *) header_addr;
 
-  //CmiAssert (hdr->dstrank < CmiMyNodeSize());
-  //CmiAssert (hdr->connid  < MAX_CONN);
-
+#if CMK_SMP && (M2M_PARALLEL_CONTEXT || LTPS)
   PAMICmiDirectM2mHandle *handlevec = CpvAccessOther(_handle, hdr->dstrank);
-  //CmiAssert (handlevec != NULL);
-  
+#else
+  PAMICmiDirectM2mHandle *handlevec = CpvAccess(_handle);
+#endif
+
   //fprintf(stderr, "m2m_pkt_dispatch: mype %d connid %d dstrank %d handlevec %p\n",
   //  CmiMyPe(), hdr->connid, hdr->dstrank, handlevec);
-  
   PAMICmiDirectM2mHandle *handle = &handlevec[hdr->connid];
 
   char *buffer = handle->m2m_rcvbuf + handle->m2m_rdispls[hdr->srcindex];
@@ -186,25 +220,20 @@ static void m2m_pkt_dispatch (pami_context_t       context,
 }
 
 
-void * CmiDirect_manytomany_allocate_handle () {  
-#if CMK_SMP && !CMK_ENABLE_ASYNC_PROGRESS
-    CmiAbort("!!!!!!!!!Please build Charm++ with async in order to use many-to-many interface\n");
-#else 
+void * CmiDirect_manytomany_allocate_handle () {
   if (!CpvInitialized(_handle))
     CpvInitialize(PAMICmiDirectM2mHandle*, _handle);
   if (!CpvInitialized(_completion_handler))
-    CpvInitialize(int, _completion_handler);  
-  ppc_msync();
-  
+    CpvInitialize(int, _completion_handler);
+
   if (CpvAccess(_handle) == NULL) {
     CpvAccess(_handle) = (PAMICmiDirectM2mHandle *)malloc (MAX_CONN *sizeof(PAMICmiDirectM2mHandle));
     memset (CpvAccess(_handle),0,MAX_CONN*sizeof (PAMICmiDirectM2mHandle));
     CpvAccess(_completion_handler) = CmiRegisterHandler(m2m_rdone_mainthread);
   }
-  
+
   //printf ("allocate_handle on rank %d %p\n", CmiMyPe(), CpvAccess(_handle));
   return CpvAccess(_handle);
-#endif
 }
 
 
@@ -216,13 +245,10 @@ void   CmiDirect_manytomany_initialize_recvbase(void                 * h,
 						unsigned               nranks,
 						unsigned               myIdx )
 {
-#if CMK_SMP && !CMK_ENABLE_ASYNC_PROGRESS
-    CmiAbort("!!!!!!!!!Please build Charm++ with async in order to use many-to-many interface\n");
-#else 
   PAMICmiDirectM2mHandle *handle = &(((PAMICmiDirectM2mHandle *) h)[tag]);
   //PAMICmiDirectM2mHandle *handle = &(CpvAccess(_handle)[tag]);
 
-  //printf ("manytomany recvbase on rank %d handle %p conn %d nranks %d\n", 
+  //printf ("manytomany recvbase on rank %d handle %p conn %d nranks %d\n",
   //  CmiMyPe(), handle, tag, nranks);
 
   handle->myrank = CmiMyRank();
@@ -236,19 +262,19 @@ void   CmiDirect_manytomany_initialize_recvbase(void                 * h,
   handle->m2m_rdone        = donecb;
   handle->m2m_rdonecontext = context;
   handle->m2m_ntotalrcvranks    = nranks;
-  
+
   //Receiver is not sender
-  //if (myIdx == (unsigned)-1) 
+  //if (myIdx == (unsigned)-1)
   //(handle->m2m_ntotalrcvranks)++;
-    
+
   handle->m2m_rcvlens   = malloc (sizeof(int) * handle->m2m_ntotalrcvranks);
   handle->m2m_rdispls   = malloc (sizeof(int) * handle->m2m_ntotalrcvranks);
-  
+
   assert (handle->m2m_rcvlens != NULL);
-  
+
   memset (handle->m2m_rcvlens, 0, handle->m2m_ntotalrcvranks * sizeof(int));
   memset (handle->m2m_rdispls, 0, handle->m2m_ntotalrcvranks * sizeof(int));
-  
+
   //Receiver is not sender
   //if (myIdx == (unsigned)-1) {
   //Receiver doesnt send any data
@@ -256,7 +282,6 @@ void   CmiDirect_manytomany_initialize_recvbase(void                 * h,
   //CmiDirect_manytomany_initialize_recv (h, tag,  myIdx, 0, 0, CmiMyPe());
   //}
   handle->m2m_rrankIndex = myIdx;
-#endif
 }
 
 void   CmiDirect_manytomany_initialize_recv ( void          * h,
@@ -266,18 +291,14 @@ void   CmiDirect_manytomany_initialize_recv ( void          * h,
 					      unsigned        bytes,
 					      unsigned        rank )
 {
-#if CMK_SMP && !CMK_ENABLE_ASYNC_PROGRESS
-    CmiAbort("!!!!!!!!!Please build Charm++ with async in order to use many-to-many interface\n");
-#else 
   PAMICmiDirectM2mHandle *handle = &(((PAMICmiDirectM2mHandle *) h)[tag]);
   assert ( tag < MAX_CONN  );
-  
+
   if (handle->m2m_rcvlens[idx] == 0 && bytes > 0)
     handle->m2m_nzrcvranks ++;
 
   handle->m2m_rcvlens  [idx]   = bytes;
   handle->m2m_rdispls  [idx]   = displ;
-#endif
 }
 
 
@@ -289,43 +310,30 @@ void   CmiDirect_manytomany_initialize_sendbase( void                 * h,
 						 unsigned               nranks,
 						 unsigned               myIdx )
 {
-#if CMK_SMP && !CMK_ENABLE_ASYNC_PROGRESS
-    CmiAbort("!!!!!!!!!Please build Charm++ with async in order to use many-to-many interface\n");
-#else 
   PAMICmiDirectM2mHandle *handle = &(((PAMICmiDirectM2mHandle *) h)[tag]);
   assert ( tag < MAX_CONN  );
   handle->m2m_sndbuf       = sndbuf;
   handle->m2m_sdone        = donecb;
   handle->m2m_sdonecontext = context;
-  
+
   handle->m2m_nsndranks    = nranks;
-  handle->m2m_srankIndex   = myIdx;  
-  handle->m2m_sndlens      = (unsigned int *) malloc (sizeof(unsigned int) * nranks);
-  handle->m2m_sdispls      = (unsigned int *) malloc (sizeof(unsigned int) * nranks);
-  handle->m2m_lranks       = (unsigned int *) malloc (sizeof(unsigned int) * nranks);
-  handle->m2m_node_eps     = (pami_endpoint_t *) malloc (sizeof(pami_endpoint_t) * nranks);
-  handle->m2m_permutation  = (unsigned int *) malloc (sizeof(unsigned int) * nranks);
-  handle->m2m_hdrs = (PAMI_M2mHeader *) malloc(sizeof(PAMI_M2mHeader) * nranks);
-
-  memset (handle->m2m_sndlens,    0, nranks * sizeof(int));
-  memset (handle->m2m_sdispls,    0, nranks * sizeof(int));
-  memset (handle->m2m_lranks,     0, nranks * sizeof(int));
-  memset (handle->m2m_node_eps,   0, nranks * sizeof(pami_endpoint_t));
-  memset (handle->m2m_permutation,0, nranks * sizeof(int));  
+  handle->m2m_srankIndex   = myIdx;
+  handle->m2m_sndinfo = (M2mSendInfo *)calloc(nranks, sizeof(M2mSendInfo)); //zero-filled
+  assert (nranks == 0 || handle->m2m_sndinfo != NULL);
 
-#if CMK_SMP && CMK_ENABLE_ASYNC_PROGRESS
+#if M2M_PARALLEL_CONTEXT
   //we have a completion callback
   if (handle->m2m_sdone != NULL) {
     handle->swork[0].start = 0;
-    handle->swork[0].end   = handle->m2m_nsndranks;   
+    handle->swork[0].end   = handle->m2m_nsndranks;
     handle->swork[0].handle = handle;
     handle->n_work = 1;
 
     int context_id = MY_CONTEXT_ID();
     context_id ++;
     if (context_id >= cmi_pami_numcontexts)
-      context_id = 0;	      
-    pami_context_t context = cmi_pami_contexts[context_id];    
+      context_id = 0;
+    pami_context_t context = cmi_pami_contexts[context_id];
     handle->swork[0].context = context;
   }
   else {
@@ -340,10 +348,10 @@ void   CmiDirect_manytomany_initialize_sendbase( void                 * h,
       ncontexts = handle->m2m_nsndranks;
     handle->n_work = ncontexts;
 
-    nranks = handle->m2m_nsndranks / ncontexts;   
+    nranks = handle->m2m_nsndranks / ncontexts;
     for (i = 0; i < ncontexts; ++i) {
       handle->swork[i].start  = start;
-      handle->swork[i].end    = start + nranks;   
+      handle->swork[i].end    = start + nranks;
       handle->swork[i].handle = handle;
       start += nranks;
       if (i == ncontexts - 1)
@@ -351,7 +359,7 @@ void   CmiDirect_manytomany_initialize_sendbase( void                 * h,
 
       context_id ++;
       if (context_id >= cmi_pami_numcontexts)
-	context_id = 0;	      
+	context_id = 0;
       context = cmi_pami_contexts[context_id];
       handle->swork[i].context = context;
     }
@@ -359,142 +367,114 @@ void   CmiDirect_manytomany_initialize_sendbase( void                 * h,
 #else
   PAMIX_CONTEXT_LOCK(MY_CONTEXT());
   handle->swork[0].start = 0;
-  handle->swork[0].end   = handle->m2m_nsndranks;   
+  handle->swork[0].end   = handle->m2m_nsndranks;
   handle->swork[0].handle = handle;
   handle->n_work = 1;
   handle->swork[0].context = MY_CONTEXT();
   PAMIX_CONTEXT_UNLOCK(MY_CONTEXT());
 #endif
-#endif
 }
 
 #define PRIME_A  3010349UL
 #define PRIME_B  3571UL
 
 void   CmiDirect_manytomany_initialize_send ( void        * h,
-					      unsigned      tag, 
+					      unsigned      tag,
 					      unsigned      idx,
 					      unsigned      displ,
 					      unsigned      bytes,
 					      unsigned      pe )
 {
-#if CMK_SMP && !CMK_ENABLE_ASYNC_PROGRESS
-    CmiAbort("!!!!!!!!!Please build Charm++ with async in order to use many-to-many interface\n");
-#else 
   PAMICmiDirectM2mHandle *handle = &(((PAMICmiDirectM2mHandle *) h)[tag]);
-  assert ( tag < MAX_CONN  );  
-  handle->m2m_sndlens    [idx]   = bytes;
-  handle->m2m_sdispls    [idx]   = displ;
-  
+  assert ( tag < MAX_CONN  );
+
   int lrank                      = CmiRankOf(pe);
-  handle->m2m_lranks     [idx]   = lrank;
-  
   pami_endpoint_t target;
   //get the destination context
-#if CMK_PAMI_MULTI_CONTEXT 
+#if CMK_PAMI_MULTI_CONTEXT
   size_t dst_context = (lrank>>LTPS);
 #else
   size_t dst_context = 0;
 #endif
-  PAMI_Endpoint_create (cmi_pami_client, (pami_task_t)CmiNodeOf(pe), 
+  PAMI_Endpoint_create (cmi_pami_client, (pami_task_t)CmiNodeOf(pe),
 			dst_context, &target);
-  handle->m2m_node_eps   [idx]   = target;
 
-  //uint64_t p_rand = ((uint64_t)idx+1)*PRIME_A + PRIME_B*(CmiMyPe()+1);
   unsigned seed = CmiMyPe()+1;
   //start at a random location and move linearly from there
-  uint64_t p_rand = rand_r(&seed) + idx + 1;
-  //uint64_t p_rand = (uint64_t)idx + 1 + CmiMyPe();
-  //uint64_t p_rand   =  idx + 1;
-  handle->m2m_permutation[idx]   = (uint32_t)(p_rand%handle->m2m_nsndranks);
-  handle->m2m_hdrs[idx].connid   = tag;  
-  handle->m2m_hdrs[idx].dstrank  = lrank; 
-  handle->m2m_hdrs[idx].srcindex = handle->m2m_srankIndex;
-#endif
-}
-
-static void  _internal_machine_send   ( pami_context_t      context, 
-					pami_endpoint_t     target_ep, 
-					int                 rank, 
-					int                 hdrsize,
-					char              * hdr,
-					int                 size, 
-					char              * msg,
-					pami_event_function cb_done,
-					void              * cd)
-{
-  if (size < 128) {
-    pami_send_immediate_t parameters;
-    parameters.dispatch        = (size == 8)? M2M_PAMI_S8DISPATCH : M2M_PAMI_SDISPATCH;
-    //parameters.dispatch        = M2M_PAMI_SDISPATCH;
-    parameters.header.iov_base = hdr;
-    parameters.header.iov_len  = hdrsize;
-    parameters.data.iov_base   = msg;
-    parameters.data.iov_len    = size;
-    parameters.dest            = target_ep;
-    
-    PAMI_Send_immediate (context, &parameters);
-    //if (cb_done)
-    //cb_done (context, cd, PAMI_SUCCESS);
-  }
-  else {
-    pami_send_t parameters;
-    parameters.send.dispatch        = M2M_PAMI_DISPATCH;
-    parameters.send.header.iov_base = hdr;
-    parameters.send.header.iov_len  = hdrsize;
-    parameters.send.data.iov_base   = msg;
-    parameters.send.data.iov_len    = size;
-    parameters.events.cookie        = cd;
-    parameters.events.local_fn      = cb_done;
-    parameters.events.remote_fn     = NULL;
-    memset(&parameters.send.hints, 0, sizeof(parameters.send.hints));
-    parameters.send.dest            = target_ep;
-    
-    PAMI_Send (context, &parameters);
-  }
+  //map idx to a slot of m2m_sndinfo: scale idx+1 by PRIME_A, offset by this PE, reduce mod m2m_nsndranks
+  uint64_t p_rand = ((uint64_t)idx+1)*PRIME_A + PRIME_B*(CmiMyPe()+1);
+  uint32_t pidx = (uint32_t)(p_rand%handle->m2m_nsndranks); //NOTE(review): distinct idx can share a slot if m2m_nsndranks has a common factor with PRIME_A - confirm
+
+  char *buffer = handle->m2m_sndbuf + displ;
+  handle->m2m_sndinfo[pidx].buf    = buffer;
+  handle->m2m_sndinfo[pidx].bytes  = bytes;
+  handle->m2m_sndinfo[pidx].ep     = target;
+  handle->m2m_sndinfo[pidx].hdr.connid   = tag;
+  handle->m2m_sndinfo[pidx].hdr.dstrank  = lrank;
+  handle->m2m_sndinfo[pidx].hdr.srcindex = handle->m2m_srankIndex;
+  //choose the dispatch id from the payload size: exactly 8 bytes, under 128 bytes, or anything larger
+  if (bytes == 8)
+    handle->m2m_sndinfo[pidx].dispatch = M2M_PAMI_S8DISPATCH;
+  else if (bytes < 128)
+    handle->m2m_sndinfo[pidx].dispatch = M2M_PAMI_SDISPATCH;
+  else
+    handle->m2m_sndinfo[pidx].dispatch = M2M_PAMI_DISPATCH;
 }
 
 pami_result_t   _cmidirect_m2m_send_post_handler (pami_context_t     context,
-						  void             * cd) 
+						  void             * cd)
 {
   PAMI_M2mWork_t  *work = (PAMI_M2mWork_t *) cd;
   PAMICmiDirectM2mHandle *handle = (PAMICmiDirectM2mHandle *)work->handle;
-  
+
+#if CMK_TRACE_ENABLED
+  double starttime = CmiWallTimer();
+#endif
+
   int i = 0;
-  int pidx = 0;
-  char *buffer = NULL;
-  int bytes = NULL;
+  CmiAssert(handle->m2m_sdone == NULL); //NOTE(review): CmiDirect_manytomany_start also posts this handler when m2m_sdone != NULL - confirm that path is unsupported
+  pami_send_t  parameters;
 
-  pami_event_function cb_done = m2m_send_done;
-  void *clientdata = handle;
+  parameters.send.header.iov_len  = sizeof(PAMI_M2mHeader);
+  parameters.events.cookie        = NULL;
+  parameters.events.local_fn      = NULL; //no completion callback is requested for any send
+  parameters.events.remote_fn     = NULL;
+  memset(&parameters.send.hints, 0, sizeof(parameters.send.hints));
 
-  if (handle->m2m_sdone == NULL) {
-    cb_done     = NULL;
-    clientdata  = NULL;
+  for (i = work->start; i < work->end; ++i) { //send slots [start, end) of m2m_sndinfo, reusing one parameter block
+    M2mSendInfo *sndinfo = &handle->m2m_sndinfo[i];
+    parameters.send.data.iov_base   = sndinfo->buf;
+    parameters.send.data.iov_len    = sndinfo->bytes;
+    parameters.send.dest            = sndinfo->ep;
+    parameters.send.header.iov_base = &sndinfo->hdr;
+    parameters.send.dispatch        = sndinfo->dispatch;
+    //payloads under 128 bytes go through PAMI_Send_immediate, larger ones through PAMI_Send
+    if (sndinfo->bytes < 128)
+      PAMI_Send_immediate(context, &parameters.send);
+    else
+      PAMI_Send (context, &parameters);
   }
 
-  for (i = work->start; i < work->end; ++i) {
-    pidx   = handle->m2m_permutation[i];
-    buffer = handle->m2m_sndbuf + handle->m2m_sdispls[pidx];
-    bytes  = handle->m2m_sndlens[pidx];
-    
-    _internal_machine_send(context,
-			   handle->m2m_node_eps[pidx],
-			   handle->m2m_lranks[pidx],
-			   sizeof(PAMI_M2mHeader),
-			   (char*)&(handle->m2m_hdrs[pidx]),
-			   bytes, 
-			   buffer,
-			   cb_done,
-			   clientdata);
-  }  
+#if CMK_TRACE_ENABLED
+  traceUserBracketEvent(30006, starttime, CmiWallTimer());
+#endif
 
   return PAMI_SUCCESS;
 }
 
 
 void _cmidirect_m2m_initialize (pami_context_t *contexts, int nc) {
-  pami_dispatch_hint_t options = (pami_dispatch_hint_t) {0};
+  pami_dispatch_hint_t soptions = (pami_dispatch_hint_t) {0};
+  pami_dispatch_hint_t loptions = (pami_dispatch_hint_t) {0};
+
+  soptions.long_header    = PAMI_HINT_DISABLE;
+  soptions.recv_immediate = PAMI_HINT_ENABLE;
+  soptions.use_rdma       = PAMI_HINT_DISABLE;
+
+  loptions.long_header     = PAMI_HINT_DISABLE;
+  loptions.recv_contiguous = PAMI_HINT_ENABLE;
+  loptions.recv_copy       = PAMI_HINT_ENABLE;
+
   pami_dispatch_callback_function pfn;
   int i = 0;
   for (i = 0; i < nc; ++i) {
@@ -503,57 +483,71 @@ void _cmidirect_m2m_initialize (pami_context_t *contexts, int nc) {
 		       M2M_PAMI_DISPATCH,
 		       pfn,
 		       NULL,
-		       options);
+		       loptions);
 
     pfn.p2p = m2m_spkt_dispatch;
     PAMI_Dispatch_set (contexts[i],
 		       M2M_PAMI_SDISPATCH,
 		       pfn,
 		       NULL,
-		       options);
+		       soptions);
 
     pfn.p2p = m2m_s8_dispatch;
     PAMI_Dispatch_set (contexts[i],
 		       M2M_PAMI_S8DISPATCH,
 		       pfn,
 		       NULL,
-		       options);
+		       soptions);
   }
 }
 
 
 void   CmiDirect_manytomany_start ( void       * h,
 				    unsigned     tag ) {
-#if CMK_SMP && !CMK_ENABLE_ASYNC_PROGRESS
-    CmiAbort("!!!!!!!!!Please build Charm++ with async in order to use many-to-many interface\n");
-#else 
   PAMICmiDirectM2mHandle *handle = &(((PAMICmiDirectM2mHandle *) h)[tag]);
   assert (tag < MAX_CONN);
 
-  //printf ("Calling manytomany_start for conn %d handle %p on rank %d\n", tag, 
+  //printf ("Calling manytomany_start for conn %d handle %p on rank %d\n", tag,
   //  handle, CmiMyPe());
-  
-#if CMK_SMP && CMK_ENABLE_ASYNC_PROGRESS
+
+#if M2M_PARALLEL_CONTEXT
   //we have a completion callback
   if (handle->m2m_sdone != NULL) {
-    PAMI_Context_post ( handle->swork[0].context, 
-		       &handle->swork[0].work, 
-		       _cmidirect_m2m_send_post_handler,
-		       &handle->swork[0]);
+    PAMI_Context_post ( handle->swork[0].context,
+			&handle->swork[0].work,
+			_cmidirect_m2m_send_post_handler,
+			&handle->swork[0]);
   }
   else {
     int i;
-    for (i = 0; i < handle->n_work; ++i) {
-      PAMI_Context_post( handle->swork[i].context, 
-			&handle->swork[i].work, 
-			_cmidirect_m2m_send_post_handler,
-			&handle->swork[i]);
-    }
+#if CMK_TRACE_ENABLED
+    double starttime = CmiWallTimer();
+#endif
+    for (i = 0; i < handle->n_work; ++i) //post work items to the contexts stored in swork[]
+#if !CMK_ENABLE_ASYNC_PROGRESS
+      if (handle->swork[i].context != MY_CONTEXT()) //items on MY_CONTEXT() are run by the second loop below
+#endif
+	PAMI_Context_post( handle->swork[i].context,
+			   &handle->swork[i].work,
+			   _cmidirect_m2m_send_post_handler,
+			   &handle->swork[i]);
+
+#if CMK_TRACE_ENABLED
+    traceUserBracketEvent(30007, starttime, CmiWallTimer());
+#endif
+    //without async progress, call the handler directly for items on MY_CONTEXT(), holding that context's lock
+#if !CMK_ENABLE_ASYNC_PROGRESS
+    for (i = 0; i < handle->n_work; ++i)
+      if (handle->swork[i].context == MY_CONTEXT()) {
+	PAMIX_CONTEXT_LOCK(MY_CONTEXT());
+	_cmidirect_m2m_send_post_handler (MY_CONTEXT(), &handle->swork[i]);
+	PAMIX_CONTEXT_UNLOCK(MY_CONTEXT());
+      }
+#endif
   }
 #else
   PAMIX_CONTEXT_LOCK(MY_CONTEXT());
   _cmidirect_m2m_send_post_handler (MY_CONTEXT(), &handle->swork[0]);
   PAMIX_CONTEXT_UNLOCK(MY_CONTEXT());
 #endif
-#endif
 }
diff --git a/src/arch/pami/memalloc.c b/src/arch/pami/memalloc.c
new file mode 100755
index 0000000000..55fc45d135
--- /dev/null
+++ b/src/arch/pami/memalloc.c
@@ -0,0 +1,144 @@
+
+#include <converse.h>
+
+#define ALIGNMENT        32
+#define SMSG_SIZE        4096
+#define N_SMSG_ELEM      4096
+#define MMSG_SIZE        16384
+#define N_MMSG_ELEM      2048
+#define LLMSG_SIZE       65536
+#define N_LLMSG_ELEM     1024
+
+#if CMK_BLUEGENEQ
+#include <spi/include/kernel/location.h>
+#endif
+
+PPCAtomicQueue *sPPCMemallocVec;
+PPCAtomicQueue *mPPCMemallocVec;
+PPCAtomicQueue *llPPCMemallocVec;
+
+typedef struct CmiMemAllocHdr_ppcq_t {
+  int rank; //queue index recorded by CmiAlloc_ppcq; CmiFree_ppcq returns the buffer to that queue
+  int size; //SMSG_SIZE, MMSG_SIZE or LLMSG_SIZE for pooled buffers, else the requested size
+  //Pad so that this header plus a CmiChunkHeader occupies exactly ALIGNMENT (32) bytes
+  char dummy[ALIGNMENT - sizeof(CmiChunkHeader) - 2*sizeof(int)];
+} CmiMemAllocHdr_ppcq;
+
+static int _nodeStart;
+extern int  Cmi_nodestart; /* First processor in this address space */
+
+/* Allocate a buffer of at least `size` bytes, preceded by a
+ * CmiMemAllocHdr_ppcq.  Requests up to LLMSG_SIZE are rounded up to a size
+ * class and served from the queue indexed by the caller's rank when one is
+ * available; otherwise malloc_nomigrate is used.  Returns NULL if it fails. */
+void *CmiAlloc_ppcq (int size) {
+  CmiMemAllocHdr_ppcq *hdr = NULL;
+  PPCAtomicQueue *pool = NULL;  //free queue of the size class, NULL if not pooled
+  int bufsize = size;           //capacity recorded in the header
+#if CMK_TRACE_ENABLED
+  double start = CmiWallTimer();
+#endif
+
+#if CMK_BLUEGENEQ
+  //Comm threads are hidden on BG/Q
+  int myrank = Kernel_ProcessorID() - _nodeStart;
+#else
+  int myrank = CmiMyRank();
+#endif
+
+  //Round the request up to its size class and select that class's queue
+  if (size <= SMSG_SIZE) {
+    pool    = &sPPCMemallocVec[myrank];
+    bufsize = SMSG_SIZE;
+  }
+  else if (size <= MMSG_SIZE) {
+    pool    = &mPPCMemallocVec[myrank];
+    bufsize = MMSG_SIZE;
+  }
+  else if (size <= LLMSG_SIZE) {
+    pool    = &llPPCMemallocVec[myrank];
+    bufsize = LLMSG_SIZE;
+  }
+
+  //Reuse a recycled buffer if one is queued, otherwise go to the heap
+  if (pool != NULL)
+    hdr = PPCAtomicDequeue (pool);
+  if (hdr == NULL)
+    hdr = (CmiMemAllocHdr_ppcq *)
+      malloc_nomigrate(bufsize + sizeof(CmiMemAllocHdr_ppcq));
+  if (hdr == NULL)
+    return NULL;  //allocation failed: never write through a NULL header
+
+  hdr->size = bufsize;
+  hdr->rank = myrank;
+#if CMK_TRACE_ENABLED
+  traceUserBracketEvent(30001, start, CmiWallTimer());
+#endif
+  //The caller's buffer starts immediately after the header
+  return (char*)hdr + sizeof(CmiMemAllocHdr_ppcq);
+}
+
+void CmiFree_ppcq (void *buf) { //recycle or free a buffer whose CmiMemAllocHdr_ppcq sits directly before buf
+  CmiMemAllocHdr_ppcq *hdr = (CmiMemAllocHdr_ppcq *)((char*)buf - sizeof(CmiMemAllocHdr_ppcq));
+  int rc = CMI_PPCQ_EAGAIN;
+  //rc keeps this value when no enqueue is attempted, so such buffers are freed below
+#if CMK_TRACE_ENABLED
+  double start = CmiWallTimer();
+#endif
+  //a size-class buffer goes back to the queue of the rank recorded in its header
+  if (hdr->size == SMSG_SIZE)
+    rc = PPCAtomicEnqueue (&sPPCMemallocVec[hdr->rank], hdr);
+  else if (hdr->size == MMSG_SIZE)
+    rc = PPCAtomicEnqueue (&mPPCMemallocVec[hdr->rank], hdr);
+  else if (hdr->size == LLMSG_SIZE)
+    rc = PPCAtomicEnqueue (&llPPCMemallocVec[hdr->rank], hdr);
+
+  if (rc == CMI_PPCQ_EAGAIN)
+    //queues are full or large buf
+    free_nomigrate(hdr);
+
+#if CMK_TRACE_ENABLED
+  traceUserBracketEvent(30002, start, CmiWallTimer());
+#endif
+}
+
+void CmiMemAllocInit_ppcq (void   * atomic_mem, //create the small/medium/large free queues for every rank
+			   size_t   atomic_memsize)
+{
+  int i = 0;
+#if CMK_BLUEGENEQ
+  int node_size = 64/Kernel_ProcessCount();
+  _nodeStart = node_size * Kernel_MyTcoord();
+#else
+  int node_size = CmiMyNodeSize();
+  _nodeStart = Cmi_nodestart;
+#endif
+
+  //We want to align headers to 32 bytes
+  CmiAssert(sizeof(CmiMemAllocHdr_ppcq)+sizeof(CmiChunkHeader) == ALIGNMENT);
+
+  CmiAssert (atomic_memsize >= 3 * node_size * sizeof(PPCAtomicState)); //three queues per rank
+  sPPCMemallocVec = (PPCAtomicQueue *)malloc_nomigrate(sizeof(PPCAtomicQueue)*node_size);
+  mPPCMemallocVec = (PPCAtomicQueue *)malloc_nomigrate(sizeof(PPCAtomicQueue)*node_size);
+  llPPCMemallocVec = (PPCAtomicQueue *)malloc_nomigrate(sizeof(PPCAtomicQueue)*node_size);
+  //rank i takes three consecutive PPCAtomicState slots of atomic_mem: 3i small, 3i+1 medium, 3i+2 large
+  for (i = 0; i < node_size; ++i) {
+    PPCAtomicQueueInit ((char *)atomic_mem + 3*i*sizeof(PPCAtomicState),
+			sizeof(PPCAtomicState),
+			&sPPCMemallocVec[i],
+			0, /*No Overflow*/
+			N_SMSG_ELEM );
+
+    PPCAtomicQueueInit ((char *)atomic_mem + (3*i+1)*sizeof(PPCAtomicState),
+			sizeof(PPCAtomicState),
+			&mPPCMemallocVec[i],
+			0,
+			N_MMSG_ELEM );
+
+    PPCAtomicQueueInit ((char *)atomic_mem + (3*i+2)*sizeof(PPCAtomicState),
+			sizeof(PPCAtomicState),
+			&llPPCMemallocVec[i],
+			0,
+			N_LLMSG_ELEM );
+  }
+}
diff --git a/src/conv-core/convcore.c b/src/conv-core/convcore.c
index 32c0b6690e..278104c090 100644
--- a/src/conv-core/convcore.c
+++ b/src/conv-core/convcore.c
@@ -212,11 +212,16 @@ void infi_freeMultipleSend(void *ptr);
 void infi_unregAndFreeMeta(void *ch);
 #endif
 
-#if CMK_SMP && CMK_BLUEGENEQ && (CMK_USE_L2ATOMICS || SPECIFIC_PCQUEUE)
+#if CMK_SMP && CMK_BLUEGENEQ && SPECIFIC_PCQUEUE
 void * CmiAlloc_bgq (int     size);
 void   CmiFree_bgq  (void  * buf);
 #endif
 
+#if CMK_SMP && CMK_PPC_ATOMIC_QUEUE
+void * CmiAlloc_ppcq (int     size);
+void   CmiFree_ppcq  (void  * buf);
+#endif
+
 #if CMK_GRID_QUEUE_AVAILABLE
 CpvDeclare(void *, CkGridObject);
 CpvDeclare(void *, CsdGridQueue);
@@ -1290,6 +1295,149 @@ double CmiTimer()
 #endif
 
 
+#if CMK_TIMER_USE_PPC64
+
+#include <sys/time.h>
+#include <endian.h>
+
+#define SPRN_TBRU 0x10D
+#define SPRN_TBRL 0x10C
+
+CpvStaticDeclare(uint64_t, inittime);
+CpvStaticDeclare(double, clocktick);
+
+int CmiTimerIsSynchronized()
+{
+  return 1; //this timer implementation always reports 1
+}
+
+int CmiTimerAbsolute()
+{
+  return 0; //always 0; CmiWallTimer below is measured relative to inittime
+}
+
+double CmiStartTimer()
+{
+  return 0.0; //always 0.0 in this implementation
+}
+
+double CmiInitTime()
+{
+  return CpvAccess(inittime); //raw timebase ticks converted to double; NOTE(review): confirm callers do not expect seconds
+}
+
+static inline uint64_t PPC64_TimeBase() //returns a 64-bit value assembled from SPRs SPRN_TBRU (upper) and SPRN_TBRL (lower)
+{
+  unsigned temp;
+  union
+  {
+#if __BYTE_ORDER  == __LITTLE_ENDIAN
+    struct { unsigned lo, hi; } w; //field order chosen so w.lo overlays the low half of d (assumes 32-bit unsigned)
+#else
+#warning "PPC64 Is BigEndian"
+    struct { unsigned hi, lo; } w;
+#endif
+    uint64_t d;
+  } result;
+  //read upper, lower, upper again; retry when the two upper reads differ
+  do {
+    asm volatile ("mfspr %0,%1" : "=r" (temp)        : "i" (SPRN_TBRU));
+    asm volatile ("mfspr %0,%1" : "=r" (result.w.lo) : "i" (SPRN_TBRL));
+    asm volatile ("mfspr %0,%1" : "=r" (result.w.hi) : "i" (SPRN_TBRU));
+  }
+  while (temp != result.w.hi);
+
+  return result.d;
+}
+
+uint64_t __micro_timer () { //current gettimeofday() time expressed in microseconds
+  struct timeval tv;
+  gettimeofday( &tv, 0 );
+  return tv.tv_sec * 1000000ULL + tv.tv_usec;
+}
+
+/* Calibrate the PPC64 timebase against gettimeofday: sets clocktick (seconds
+ * per timebase tick) and inittime (timebase value after the barriers). */
+void CmiTimerInit(char **argv)
+{
+  CpvInitialize(double, clocktick);
+  CpvInitialize(uint64_t, inittime);  //same type as the CpvStaticDeclare above
+
+  //Initialize PPC64 timers
+  uint64_t sampleTime = 100ULL; //minimum sample time in usec
+  uint64_t timeStart = 0ULL, timeStop = 0ULL;
+  uint64_t startBase = 0ULL, endBase = 0ULL;
+  uint64_t overhead = 0ULL, tbf = 0ULL, tbi = 0ULL;
+  uint64_t ticks = 0ULL;
+  int      iter = 0;  //consecutive failed calibration attempts
+
+  do {
+    //back-to-back reads measure the cost of one timebase read; the second pair is the one kept
+    tbi = PPC64_TimeBase();
+    tbf = PPC64_TimeBase();
+    tbi = PPC64_TimeBase();
+    tbf = PPC64_TimeBase();
+    overhead = tbf - tbi;
+    timeStart = __micro_timer();
+
+    //wait for system time to change
+    while (__micro_timer() == timeStart)
+      timeStart = __micro_timer();
+
+    while (1) {
+      timeStop = __micro_timer();
+      if ((timeStop - timeStart) > 1) {
+        startBase = PPC64_TimeBase();
+        break;
+      }
+    }
+    timeStart = timeStop;
+
+    while (1) {
+      timeStop = __micro_timer();
+      if ((timeStop - timeStart) > sampleTime) {
+        endBase = PPC64_TimeBase();
+        break;
+      }
+    }
+
+    ticks = ((endBase - startBase) + (overhead));
+    //abort only when the timebase failed to advance in ten attempts, never after a good sample
+    if (endBase <= startBase && ++iter == 10)
+      CmiAbort("Warning: unable to initialize high resolution timer.\n");
+  } while (endBase <= startBase);
+
+  //the sample always runs longer than sampleTime, so divide by the microseconds that really elapsed
+  CpvAccess (clocktick) = (1e-6) * (double)(timeStop - timeStart) / (double)ticks;
+
+  /* try to synchronize calling barrier */
+#if !(__FAULT__)
+  CmiBarrier();
+  CmiBarrier();
+  CmiBarrier();
+#endif
+  CpvAccess(inittime) = PPC64_TimeBase ();
+}
+
+double CmiWallTimer() //timebase ticks elapsed since inittime, scaled by clocktick (both set in CmiTimerInit)
+{
+  uint64_t currenttime;
+  currenttime = PPC64_TimeBase();
+  return CpvAccess(clocktick)*(currenttime-CpvAccess(inittime));
+}
+
+double CmiCpuTimer()
+{
+  return CmiWallTimer(); //no separate CPU clock here: forwards to CmiWallTimer
+}
+
+double CmiTimer()
+{
+  return CmiWallTimer(); //forwards to CmiWallTimer
+}
+
+#endif
+
+
 #if CMK_TIMER_USE_WIN32API
 
 CpvStaticDeclare(double, inittime_wallclock);
@@ -2877,8 +3025,10 @@ void *CmiAlloc(int size)
   res =(char *) CmiPoolAlloc(size+sizeof(CmiChunkHeader));
 #elif USE_MPI_CTRLMSG_SCHEME && CMK_CONVERSE_MPI
   MPI_Alloc_mem(size+sizeof(CmiChunkHeader), MPI_INFO_NULL, &res);
-#elif CMK_SMP && CMK_BLUEGENEQ && (CMK_USE_L2ATOMICS || SPECIFIC_PCQUEUE)
+#elif CMK_SMP && CMK_BLUEGENEQ && SPECIFIC_PCQUEUE
   res = (char *) CmiAlloc_bgq(size+sizeof(CmiChunkHeader));
+#elif CMK_SMP && CMK_PPC_ATOMIC_QUEUE
+  res = (char *) CmiAlloc_ppcq(size+sizeof(CmiChunkHeader));
 #else
   res =(char *) malloc_nomigrate(size+sizeof(CmiChunkHeader));
 #endif
@@ -2980,8 +3130,10 @@ void CmiFree(void *blk)
     CmiPoolFree(BLKSTART(parentBlk));
 #elif USE_MPI_CTRLMSG_SCHEME && CMK_CONVERSE_MPI
     MPI_Free_mem(parentBlk);
-#elif CMK_SMP && CMK_BLUEGENEQ && (CMK_USE_L2ATOMICS || SPECIFIC_PCQUEUE)
+#elif CMK_SMP && CMK_BLUEGENEQ && SPECIFIC_PCQUEUE
     CmiFree_bgq(BLKSTART(parentBlk));
+#elif CMK_SMP && CMK_PPC_ATOMIC_QUEUE
+    CmiFree_ppcq(BLKSTART(parentBlk));
 #else
     free_nomigrate(BLKSTART(parentBlk));
 #endif
diff --git a/src/conv-core/converse.h b/src/conv-core/converse.h
index 5d80c4f0a4..c5b3c235d8 100644
--- a/src/conv-core/converse.h
+++ b/src/conv-core/converse.h
@@ -60,6 +60,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <stdint.h>
 
 /* Paste the tokens x and y together, without any space between them.
    The ANSI C way to do this is the bizarre ## "token-pasting" 
@@ -1936,9 +1937,14 @@ extern CmiNodeLock cmiMemoryLock;
 #define CmiMemoryReadFence()               __asm__ __volatile__("mf" ::: "memory")
 #define CmiMemoryWriteFence()              __asm__ __volatile__("mf" ::: "memory")
 #elif CMK_PPC_ASM
+#if CMK_BLUEGENEQ
 #define CmiMemoryReadFence()               __asm__ __volatile__("sync":::"memory")
 #define CmiMemoryWriteFence()              __asm__ __volatile__("sync":::"memory")
 #else
+#define CmiMemoryReadFence()               __asm__ __volatile__("lwsync":::"memory") /* lwsync orders earlier loads before later ones; a bare isync only does so after a branch that depends on the load */
+#define CmiMemoryWriteFence()              __asm__ __volatile__("lwsync":::"memory")
+#endif
+#else
 #define CMK_NO_ASM_AVAILABLE    1
 extern CmiNodeLock cmiMemoryLock;
 #define CmiMemoryReadFence()               { CmiLock(cmiMemoryLock); CmiUnlock(cmiMemoryLock); }
diff --git a/src/conv-core/cpuaffinity.c b/src/conv-core/cpuaffinity.c
index 8e50c1c5fa..93ca4931ea 100644
--- a/src/conv-core/cpuaffinity.c
+++ b/src/conv-core/cpuaffinity.c
@@ -595,6 +595,10 @@ void CmiInitCPUAffinity(char **argv)
 
   if (pemap!=NULL || commap!=NULL) affinity_flag = 1;
 
+#if CMK_PAMI_LINUX_PPC8
+  affinity_flag = 1;
+#endif
+
   show_affinity_flag = CmiGetArgFlagDesc(argv,"+showcpuaffinity",
 						"print cpu affinity");
 
@@ -649,13 +653,13 @@ void CmiInitCPUAffinity(char **argv)
     }
     else {
     /* if (CmiSetCPUAffinity(CmiNumCores()-1) == -1) CmiAbort("set_cpu_affinity abort!"); */
-#if !CMK_CRAYXT && !CMK_CRAYXE && !CMK_CRAYXC && !CMK_BLUEGENEQ
+#if !CMK_CRAYXT && !CMK_CRAYXE && !CMK_CRAYXC && !CMK_BLUEGENEQ && !CMK_PAMI_LINUX_PPC8
       if (pemap == NULL) {
 #if CMK_MACHINE_PROGRESS_DEFINED
         while (affinity_doneflag < CmiMyNodeSize())  CmiNetworkProgress();
 #else
 #if CMK_SMP
-       #error "Machine progress call needs to be implemented for cpu affinity!"
+        #error "Machine progress call needs to be implemented for cpu affinity!"
 #endif
 #endif
       }
@@ -727,6 +731,43 @@ void CmiInitCPUAffinity(char **argv)
   if (CmiMyPe() < CmiNumPes()) 
   CmiNodeAllBarrier();
   CmiNodeAllBarrier();
+#elif CMK_SMP && CMK_PAMI_LINUX_PPC8
+#define CMK_PAMI_LINUX_PPC8_CORES_PER_NODE      20
+#define CMK_PAMI_LINUX_PPC8_THREADS_PER_CORE     8
+#define CMK_PAMI_LINUX_PPC8_SKIP_CORE_0          0
+  /* Default pinning: bind each PE to the first two hardware threads of one core. */
+  int cores_per_node = CMK_PAMI_LINUX_PPC8_CORES_PER_NODE;
+  int threads_per_core = CMK_PAMI_LINUX_PPC8_THREADS_PER_CORE;
+  CmiGetArgInt(argv,"+cores_per_node", &cores_per_node);
+  CmiGetArgInt(argv,"+threads_per_core", &threads_per_core);
+  if (cores_per_node < 2)  /* cores_per_node/2 would be 0: modulo by zero below */
+    CmiAbort("+cores_per_node must be at least 2\n");
+  int half_cores = cores_per_node/2;  /* use the +cores_per_node value everywhere, not the compile-time default */
+  int my_core   = CmiMyPe() % cores_per_node;
+  int my_core_2 = CmiMyPe() % half_cores;
+#if CMK_PAMI_LINUX_PPC8_SKIP_CORE_0
+  my_core_2 = (my_core_2 + 1) % half_cores;
+#endif
+  /* PEs that fall in the upper half of the core range are shifted past the lower half */
+  int cpu = 0;
+  if (my_core < half_cores)
+    cpu = my_core_2 * threads_per_core;
+  else
+    cpu = (my_core_2 + half_cores) * threads_per_core;
+
+  cpu_set_t cset;
+  CPU_ZERO(&cset);
+  CPU_SET(cpu, &cset);
+  CPU_SET(cpu+1, &cset);
+  if(sched_setaffinity(0, sizeof(cpu_set_t), &cset) < 0)
+    perror("sched_setaffinity");
+  CPU_ZERO(&cset);
+  if (sched_getaffinity(0, sizeof(cset), &cset) < 0)
+    perror("sched_getaffinity");
+  sched_yield();
+  if(CmiMyPe() == 0)
+    printf("Setting default affinity\n");
+  return;
 #else
     /* get my ip address */
   if (CmiMyRank() == 0)
-- 
2.11.4.GIT