ZC EM API: Provide user control to not de-register buffers after completion
[charm.git] / src / ck-core / ckrdma.h
blob8491cd057e33152668f783907e5e8512e339185c
1 /*
2 * Charm Onesided API Utility Functions
3 */
5 #ifndef _CKRDMA_H_
6 #define _CKRDMA_H_
8 #include "envelope.h"
10 /*********************************** Zerocopy Direct API **********************************/
12 #define CK_BUFFER_REG CMK_BUFFER_REG
13 #define CK_BUFFER_UNREG CMK_BUFFER_UNREG
14 #define CK_BUFFER_PREREG CMK_BUFFER_PREREG
15 #define CK_BUFFER_NOREG CMK_BUFFER_NOREG
17 #define CK_BUFFER_DEREG CMK_BUFFER_DEREG
18 #define CK_BUFFER_NODEREG CMK_BUFFER_NODEREG
20 #ifndef CMK_NOCOPY_DIRECT_BYTES
22 #if defined(_WIN32)
23 #define CMK_NOCOPY_DIRECT_BYTES 1
24 /* CMK_NOCOPY_DIRECT_BYTES must be defined as 1 instead of 0 here,
25 * as this avoids the C2229 error (illegal zero-sized array)
26 * for char layerInfo[CMK_NOCOPY_DIRECT_BYTES], which is seen for
27 * a 0 sized array on VC++ */
29 #else
30 #define CMK_NOCOPY_DIRECT_BYTES 0
31 #endif // end of if defined(_WIN32)
33 #endif // end of ifndef CMK_NOCOPY_DIRECT_BYTES
35 #ifndef CMK_COMMON_NOCOPY_DIRECT_BYTES
36 #define CMK_COMMON_NOCOPY_DIRECT_BYTES 0
37 #endif
39 #define CkRdmaAlloc CmiRdmaAlloc
40 #define CkRdmaFree CmiRdmaFree
42 // Represents the mode of the zerocopy transfer
43 // CkNcpyMode::MEMCPY indicates that the PEs are on the same logical node and memcpy can be used
44 // CkNcpyMode::CMA indicates that the PEs are on the same physical node and CMA can be used
45 // CkNcpyMode::RDMA indicates that neither MEMCPY nor CMA can be used and Remote Direct Memory Access needs to be used
46 enum class CkNcpyMode : char { MEMCPY, CMA, RDMA };
48 // Represents the completion status of the zerocopy transfer (used as a return value for CkNcpyBuffer::get & CkNcpyBuffer::put)
49 // CMA and MEMCPY transfers complete instantly and return CkNcpyStatus::complete
50 // RDMA transfers use a remote asynchronous call and hence return CkNcpyStatus::incomplete
51 enum class CkNcpyStatus : char { incomplete, complete };
53 // P2P_SEND mode is used for EM P2P Send API
54 // BCAST_SEND mode is used for EM BCAST Send API
55 // P2P_RECV mode is used for EM P2P Recv API
56 // BCAST_RECV mode is used for EM BCAST Recv API
57 enum class ncpyEmApiMode : char { P2P_SEND, BCAST_SEND, P2P_RECV, BCAST_RECV };
59 // Struct passed in a ZC Post Entry Method to allow the receiver side to post
// its choice of registration/deregistration behaviour for a buffer.
// An array of these is taken by the CkRdmaIssueRgets(..., CkNcpyBufferPost *postStructs) overload.
60 struct CkNcpyBufferPost {
61 // Registration mode for the posted buffer
// (presumably one of CK_BUFFER_REG / UNREG / PREREG / NOREG - confirm against callers)
62 unsigned short int regMode;
64 // Deregistration mode for the posted buffer
// (presumably CK_BUFFER_DEREG or CK_BUFFER_NODEREG - confirm against callers)
65 unsigned short int deregMode;
68 // Class to represent an Zerocopy buffer
69 // CkSendBuffer(....) passed by the user internally translates to a CkNcpyBuffer
70 class CkNcpyBuffer{
72 private:
74 // bool to indicate registration for current values of ptr and cnt on pe
75 bool isRegistered;
77 // machine specific information about the buffer
78 #ifdef __GNUC__
79 #pragma GCC diagnostic push
80 #pragma GCC diagnostic ignored "-Wpedantic"
81 #endif
82 char layerInfo[CMK_COMMON_NOCOPY_DIRECT_BYTES + CMK_NOCOPY_DIRECT_BYTES];
83 #ifdef __GNUC__
84 #pragma GCC diagnostic pop
85 #endif
87 public:
88 // pointer to the buffer
89 const void *ptr;
91 // number of bytes
92 size_t cnt;
94 // callback to be invoked on the sender/receiver
95 CkCallback cb;
97 // home pe
98 int pe;
100 // regMode
101 unsigned short int regMode;
103 // deregMode
104 unsigned short int deregMode;
106 // reference pointer
107 const void *ref;
109 // bcast ack handling pointer
110 const void *bcastAckInfo;
112 CkNcpyBuffer() : isRegistered(false), ptr(NULL), cnt(0), pe(-1), regMode(CK_BUFFER_REG), deregMode(CK_BUFFER_DEREG), ref(NULL), bcastAckInfo(NULL) {}
114 explicit CkNcpyBuffer(const void *ptr_, size_t cnt_, unsigned short int regMode_=CK_BUFFER_REG, unsigned short int deregMode_=CK_BUFFER_DEREG) {
115 cb = CkCallback(CkCallback::ignore);
116 init(ptr_, cnt_, regMode_, deregMode_);
119 explicit CkNcpyBuffer(const void *ptr_, size_t cnt_, CkCallback &cb_, unsigned short int regMode_=CK_BUFFER_REG, unsigned short int deregMode_=CK_BUFFER_DEREG) {
120 init(ptr_, cnt_, cb_, regMode_, deregMode_);
123 void print() {
124 CkPrintf("[%d][%d][%d] CkNcpyBuffer print: ptr:%p, size:%d, pe:%d, regMode=%d, deregMode=%d, ref:%p, bcastAckInfo:%p\n", CmiMyPe(), CmiMyNode(), CmiMyRank(), ptr, cnt, pe, regMode, deregMode, ref, bcastAckInfo);
127 void init(const void *ptr_, size_t cnt_, CkCallback &cb_, unsigned short int regMode_=CK_BUFFER_REG, unsigned short int deregMode_=CK_BUFFER_DEREG) {
128 cb = cb_;
129 init(ptr_, cnt_, regMode_, deregMode_);
132 void init(const void *ptr_, size_t cnt_, unsigned short int regMode_=CK_BUFFER_REG, unsigned short int deregMode_=CK_BUFFER_DEREG) {
133 ptr = ptr_;
134 cnt = cnt_;
135 pe = CkMyPe();
136 regMode = regMode_;
137 deregMode = deregMode_;
139 isRegistered = false;
141 // Register memory everytime new values are initialized
142 if(cnt > 0)
143 registerMem();
146 void setRef(const void *ref_) {
147 ref = ref_;
150 const void *getRef() {
151 return ref;
154 // Register(Pin) the memory for the buffer
155 void registerMem()
157 // Check that this object is local when registerMem is called
158 CkAssert(CkNodeOf(pe) == CkMyNode());
160 // Set machine layer information when regMode is not CK_BUFFER_NOREG
161 if(regMode != CK_BUFFER_NOREG) {
163 CmiSetRdmaCommonInfo(&layerInfo[0], ptr, cnt);
165 /* Set the pointer layerInfo unconditionally for layers that don't require pinning (MPI, PAMI)
166 * or if regMode is REG, PREREG on layers that require pinning (GNI, Verbs, OFI) */
167 #if CMK_REG_REQUIRED
168 if(regMode == CK_BUFFER_REG || regMode == CK_BUFFER_PREREG)
169 #endif
171 CmiSetRdmaBufferInfo(layerInfo + CmiGetRdmaCommonInfoSize(), ptr, cnt, regMode);
172 isRegistered = true;
177 void setMode(unsigned short int regMode_) { regMode = regMode_; }
179 void memcpyGet(CkNcpyBuffer &source);
180 void memcpyPut(CkNcpyBuffer &destination);
182 #if CMK_USE_CMA
183 void cmaGet(CkNcpyBuffer &source);
184 void cmaPut(CkNcpyBuffer &destination);
185 #endif
187 void rdmaGet(CkNcpyBuffer &source);
188 void rdmaPut(CkNcpyBuffer &destination);
190 CkNcpyStatus get(CkNcpyBuffer &source);
191 CkNcpyStatus put(CkNcpyBuffer &destination);
193 // Deregister(Unpin) the memory that is registered for the buffer
194 void deregisterMem() {
195 // Check that this object is local when deregisterMem is called
196 CkAssert(CkNodeOf(pe) == CkMyNode());
198 if(isRegistered == false)
199 return;
201 #if CMK_REG_REQUIRED
202 if(regMode != CK_BUFFER_NOREG) {
203 CmiDeregisterMem(ptr, layerInfo + CmiGetRdmaCommonInfoSize(), pe, regMode);
204 isRegistered = false;
206 #endif
209 void pup(PUP::er &p) {
210 p((char *)&ptr, sizeof(ptr));
211 p((char *)&ref, sizeof(ref));
212 p((char *)&bcastAckInfo, sizeof(bcastAckInfo));
213 p|cnt;
214 p|cb;
215 p|pe;
216 p|regMode;
217 p|deregMode;
218 p|isRegistered;
219 PUParray(p, layerInfo, CMK_COMMON_NOCOPY_DIRECT_BYTES + CMK_NOCOPY_DIRECT_BYTES);
222 friend void CkRdmaDirectAckHandler(void *ack);
224 friend void CkRdmaEMBcastAckHandler(void *ack);
226 friend void constructSourceBufferObject(NcpyOperationInfo *info, CkNcpyBuffer &src);
227 friend void constructDestinationBufferObject(NcpyOperationInfo *info, CkNcpyBuffer &dest);
229 friend envelope* CkRdmaIssueRgets(envelope *env, ncpyEmApiMode emMode, void *forwardMsg);
230 friend void CkRdmaIssueRgets(envelope *env, ncpyEmApiMode emMode, void *forwardMsg, int numops, void **arrPtrs, CkNcpyBufferPost *postStructs);
232 friend void readonlyGet(CkNcpyBuffer &src, CkNcpyBuffer &dest, void *refPtr);
233 friend void readonlyCreateOnSource(CkNcpyBuffer &src);
236 friend void performEmApiNcpyTransfer(CkNcpyBuffer &source, CkNcpyBuffer &dest, int opIndex, int child_count, char *ref, int extraSize, CkNcpyMode ncpyMode, ncpyEmApiMode emMode);
238 friend void performEmApiRget(CkNcpyBuffer &source, CkNcpyBuffer &dest, int opIndex, char *ref, int extraSize, ncpyEmApiMode emMode);
240 friend void performEmApiCmaTransfer(CkNcpyBuffer &source, CkNcpyBuffer &dest, int child_count, ncpyEmApiMode emMode);
242 friend void deregisterMemFromMsg(envelope *env, bool isRecv);
245 // Ack handler for the Zerocopy Direct API
246 // Invoked on the completion of any RDMA operation called using the Direct API
247 void CkRdmaDirectAckHandler(void *ack);
249 // Method to invoke a callback on a particular pe with a CkNcpyBuffer being passed
250 // as a part of a CkDataMsg. This method is used to invoke callbacks on specific pes
251 // after the completion of the Zerocopy Direct API operation
252 void invokeCallback(void *cb, int pe, CkNcpyBuffer &buff);
254 // Returns CkNcpyMode::MEMCPY if both the PEs are the same and memcpy can be used
255 // Returns CkNcpyMode::CMA if both the PEs are in the same physical node and CMA can be used
256 // Returns CkNcpyMode::RDMA if RDMA needs to be used
257 CkNcpyMode findTransferMode(int srcPe, int destPe);
259 void invokeSourceCallback(NcpyOperationInfo *info);
261 void invokeDestinationCallback(NcpyOperationInfo *info);
263 // Method to enqueue a message after the completion of a payload transfer
264 void enqueueNcpyMessage(int destPe, void *msg);
266 /*********************************** Zerocopy Entry Method API ****************************/
267 static inline CkNcpyBuffer CkSendBuffer(const void *ptr_, CkCallback &cb_, unsigned short int regMode_=CK_BUFFER_REG, unsigned short int deregMode_=CK_BUFFER_DEREG) {
268 return CkNcpyBuffer(ptr_, 0, cb_, regMode_, deregMode_);
271 static inline CkNcpyBuffer CkSendBuffer(const void *ptr_, unsigned short int regMode_=CK_BUFFER_REG, unsigned short int deregMode_=CK_BUFFER_DEREG) {
272 return CkNcpyBuffer(ptr_, 0, regMode_, deregMode_);
275 #if CMK_ONESIDED_IMPL
277 // NOTE: Inside CkRdmaIssueRgets, a large message allocation is made consisting of space
278 // for the destination or receiver buffers and some additional information required for processing
279 // and acknowledgment handling. The space for additional information is typically equal to
280 // sizeof(NcpyEmInfo) + numops * sizeof(NcpyEmBufferInfo)
282 // This structure is used to store zerocopy information associated with an entry method
283 // invocation which uses the RDMA mode of transfer in Zerocopy Entry Method API.
284 // A variable of the structure stores the information in order to access it after the
285 // completion of the Rget operation (which is an asynchronous call) in order to invoke
286 // the entry method
287 struct NcpyEmInfo{
288 int numOps; // number of zerocopy operations, i.e. the number of buffers sent using CkSendBuffer
289 int counter; // used for tracking the number of completed RDMA operations
// NOTE(review): the role of pe is not documented in this header - confirm against CkRdmaIssueRgets
290 int pe;
291 ncpyEmApiMode mode; // used to distinguish between p2p and bcast (see ncpyEmApiMode for the send/recv variants)
292 void *msg; // pointer to the Charm++ message which will be enqueued after completion of all Rgets
293 void *forwardMsg; // used for the ncpy broadcast api
297 // This structure is used to store the buffer information specific to each buffer being sent
298 // using the Zerocopy Entry Method API. A variable of the structure stores the information associated
299 // with each buffer
// One instance exists per buffer; per the allocation note in this header, numops of
// these are laid out after a NcpyEmInfo inside the message allocated by CkRdmaIssueRgets.
300 struct NcpyEmBufferInfo{
301 int index; // Represents the index of the buffer information (from 0,1... numops - 1)
302 NcpyOperationInfo ncpyOpInfo; // Stores all the information required for the zerocopy operation
307 /* Extract ncpy buffer information from the metadata message,
308 * allocate buffers and issue ncpy calls (either memcpy or cma read or rdma get) */
310 envelope* CkRdmaIssueRgets(envelope *env, ncpyEmApiMode emMode, void *forwardMsg = NULL);
312 void CkRdmaIssueRgets(envelope *env, ncpyEmApiMode emMode, void *forwardMsg, int numops, void **arrPtrs, CkNcpyBufferPost *postStructs);
314 void handleEntryMethodApiCompletion(NcpyOperationInfo *info);
316 void handleReverseEntryMethodApiCompletion(NcpyOperationInfo *info);
318 // Method called to pack rdma pointers
319 void CkPackRdmaPtrs(char *msgBuf);
321 // Method called to unpack rdma pointers
322 void CkUnpackRdmaPtrs(char *msgBuf);
324 // Determine the number of ncpy ops and the sum of the ncpy buffer sizes
325 // from the metadata message
326 void getRdmaNumopsAndBufsize(envelope *env, int &numops, int &bufsize);
328 // Ack handler function for the nocopy EM API
329 void CkRdmaEMAckHandler(int destPe, void *ack);
331 void CkRdmaEMBcastPostAckHandler(void *msg);
// Bookkeeping record holding a peer counter plus the pointers/pe needed for
// ack handling. In SMP builds the counter is a std::atomic<int> and is
// accessed with explicit memory orders; in non-SMP builds it is a plain int.
// incNumPeers()/decNumPeers() return the value held BEFORE the update in
// both configurations.
struct NcpyBcastRecvPeerAckInfo{
#if CMK_SMP
  std::atomic<int> numPeers;
#else
  int numPeers;
#endif
  void *bcastAckInfo;
  void *msg;
  int peerParentPe;

  // Read the current peer count.
  int getNumPeers() const {
#if CMK_SMP
    return numPeers.load(std::memory_order_acquire);
#else
    return numPeers;
#endif
  }

  // Overwrite the peer count with r.
  void setNumPeers(int r) {
#if CMK_SMP
    numPeers.store(r, std::memory_order_release);
#else
    numPeers = r;
#endif
  }

  // Add one to the peer count; returns the previous value.
  int incNumPeers() {
#if CMK_SMP
    return numPeers.fetch_add(1, std::memory_order_release);
#else
    return numPeers++;
#endif
  }

  // Subtract one from the peer count; returns the previous value.
  int decNumPeers() {
#if CMK_SMP
    return numPeers.fetch_sub(1, std::memory_order_release);
#else
    return numPeers--;
#endif
  }
};
366 /***************************** Zerocopy Bcast Entry Method API ****************************/
// Common base of NcpyBcastRootAckInfo and NcpyBcastInterimAckInfo (both derive from it).
// NOTE(review): field meanings below are inferred from names only - confirm against ckrdma.C
367 struct NcpyBcastAckInfo{
368 int numChildren; // presumably the number of children expected to ack - TODO confirm
369 int counter; // presumably the number of acks received so far - TODO confirm
370 bool isRoot; // presumably true on the root of the broadcast - TODO confirm
371 int pe;
372 int numops; // presumably the number of zerocopy buffers in the message - TODO confirm
// Ack info extended with a trailing array of source buffers.
375 struct NcpyBcastRootAckInfo : public NcpyBcastAckInfo {
// Zero-length trailing array: the struct itself reserves no storage for the elements.
// NOTE(review): presumably the allocator over-allocates room for numops entries - confirm
376 CkNcpyBuffer src[0];
// Ack info extended with a message pointer and receive-side fields.
// NOTE(review): presumably used on intermediate (non-root) nodes, going by the name - confirm
379 struct NcpyBcastInterimAckInfo : public NcpyBcastAckInfo {
380 void *msg;
382 // for RECV
383 bool isRecv;
384 bool isArray;
385 void *parentBcastAckInfo;
386 int origPe;
390 // Method called on the bcast source to store some information for ack handling
391 void CkRdmaPrepareBcastMsg(envelope *env);
393 void CkReplaceSourcePtrsInBcastMsg(envelope *env, NcpyBcastInterimAckInfo *bcastAckInfo, int origPe);
395 // Method called to extract the parent bcastAckInfo from the received message for ack handling
396 const void *getParentBcastAckInfo(void *msg, int &srcPe);
398 // Allocate a NcpyBcastInterimAckInfo and return the pointer
399 NcpyBcastInterimAckInfo *allocateInterimNodeAckObj(envelope *myEnv, envelope *myChildEnv, int pe);
401 void forwardMessageToChildNodes(envelope *myChildrenMsg, UChar msgType);
403 void forwardMessageToPeerNodes(envelope *myMsg, UChar msgType);
405 void handleBcastEntryMethodApiCompletion(NcpyOperationInfo *info);
407 void handleBcastReverseEntryMethodApiCompletion(NcpyOperationInfo *info);
409 void deregisterMemFromMsg(envelope *env, bool isRecv);
411 void handleMsgUsingCMAPostCompletionForSendBcast(envelope *copyenv, envelope *env, CkNcpyBuffer &source);
413 void processBcastSendEmApiCompletion(NcpyEmInfo *ncpyEmInfo, int destPe);
415 // Method called on intermediate nodes after RGET to switch old source pointers with my pointers
416 void CkReplaceSourcePtrsInBcastMsg(envelope *prevEnv, envelope *env, void *bcastAckInfo, int origPe);
418 void processBcastRecvEmApiCompletion(NcpyEmInfo *ncpyEmInfo, int destPe);
420 // Method called on the root node and other intermediate parent nodes on completion of RGET through ZC Bcast
421 void CkRdmaEMBcastAckHandler(void *ack);
423 void handleMsgOnChildPostCompletionForRecvBcast(envelope *env);
425 void handleMsgOnInterimPostCompletionForRecvBcast(envelope *env, NcpyBcastInterimAckInfo *bcastAckInfo, int pe);
429 /***************************** Zerocopy Readonly Bcast Support ****************************/
431 /* Support for Zerocopy Broadcast of large readonly variables */
432 CkpvExtern(int, _numPendingRORdmaTransfers);
// Per-buffer record used by the zerocopy readonly broadcast support; an array of
// these trails NcpyROBcastAckInfo.
434 struct NcpyROBcastBuffAckInfo {
// pointer to the buffer
435 const void *ptr;
// registration mode (stored as int here, whereas CkNcpyBuffer uses unsigned short int)
437 int regMode;
// NOTE(review): presumably the pe that owns ptr - confirm
439 int pe;
441 // machine specific information about the buffer
// (the array may be zero-sized, hence the pedantic warning suppression)
442 #ifdef __GNUC__
443 #pragma GCC diagnostic push
444 #pragma GCC diagnostic ignored "-Wpedantic"
445 #endif
446 char layerInfo[CMK_COMMON_NOCOPY_DIRECT_BYTES + CMK_NOCOPY_DIRECT_BYTES];
447 #ifdef __GNUC__
448 #pragma GCC diagnostic pop
449 #endif
// Ack bookkeeping for the zerocopy readonly broadcast, followed by a trailing
// array of per-buffer records.
// NOTE(review): field meanings below are inferred from names only - confirm against ckrdma.C
452 struct NcpyROBcastAckInfo {
453 int numChildren; // presumably the number of children expected to ack - TODO confirm
454 int counter; // presumably the number of acks received so far - TODO confirm
455 bool isRoot; // presumably true on the root of the broadcast - TODO confirm
456 int numops; // presumably the number of entries in buffAckInfo - TODO confirm
// Zero-length trailing array: the struct itself reserves no storage for the elements.
457 NcpyROBcastBuffAckInfo buffAckInfo[0];
460 void readonlyUpdateNumops();
462 void readonlyAllocateOnSource();
464 void readonlyCreateOnSource(CkNcpyBuffer &src);
466 void readonlyGet(CkNcpyBuffer &src, CkNcpyBuffer &dest, void *refPtr);
468 void readonlyGetCompleted(NcpyOperationInfo *ncpyOpInfo);
470 #if CMK_SMP
471 void updatePeerCounterAndPush(envelope *env);
472 #endif
474 CkArray* getArrayMgrFromMsg(envelope *env);
476 void sendAckMsgToParent(envelope *env);
478 void sendRecvDoneMsgToPeers(envelope *env, CkArray *mgr);
480 #endif /* End of CMK_ONESIDED_IMPL */
482 #endif