/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
 *
 * This file is part of Gromacs        Copyright (c) 1991-2009
 * David van der Spoel, Erik Lindahl, University of Groningen.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org
 *
 * Gnomes, ROck Monsters And Chili Sauce
 */

#ifndef _GMX_THREAD_MPI_H_
#define _GMX_THREAD_MPI_H_
/** \file gmx_thread_mpi.h
 *
 * \brief Partial implementation of MPI using only threads.
 *
 * See the MPI specification at
 * http://www.mpi-forum.org/docs/docs.html
 * for an explanation of what these functions do.
 *
 * Because this is a thread-based library, be very careful with global
 * variables and static variables in functions: they will be shared across
 * all threads and lead to conflicts if not properly mutex-ed or barrier-ed.
 *
 * This library supports all of MPI that is used in Gromacs, but could
 * still use some improvement:
 * - the gmx_mutexes should be replaced by busy-waits on atomic operations
 *   for performance reasons (the aim of a pthreads mutex, scheduling out
 *   waiting threads, is antithetical to the requirements of Gromacs: low
 *   latency and high throughput).
 * - Some of the global communication functions (bcast, scatter, alltoall)
 *   could perhaps use a binary tree-like distribution method rather than
 *   simply letting each receiver thread read from one distributor.
 *
 * Right now, this library can only be enabled using cmake (although some
 * work has been done on autoconf). The relevant option is GMX_THREADED.
 */
#ifdef __cplusplus
extern "C"
{
#endif
#if 0
} /* Avoids screwing up auto-indentation */
#endif
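/** \brief Usage sketch.

    A minimal sketch of how a program using this library might look, assuming
    it is started with '-np 4' so that MPI_Init() spawns three extra threads
    that re-run main() with the original arguments:

    \code
    int main(int argc, char *argv[])
    {
        int rank, size;

        MPI_Init(&argc, &argv);               // spawns the extra threads
        MPI_Comm_rank(MPI_COMM_WORLD, &rank); // this thread's rank
        MPI_Comm_size(MPI_COMM_WORLD, &size); // total number of threads

        // ... per-thread work ...

        MPI_Finalize();                       // waits for all threads to join
        return 0;
    }
    \endcode
*/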
/* The MPI_Comm structure contains the group of processes to communicate
   with (defines the scope for global operations such as broadcast) */
typedef struct mpi_comm_ *MPI_Comm;
/* The group part of the MPI_Comm structure */
typedef struct mpi_group_ *MPI_Group;
/* Request structure for holding data about non-blocking transfers */
typedef struct mpi_req_ *MPI_Request;
/* status of receives */
typedef struct mpi_status_ MPI_Status;

typedef struct mpi_datatype_ *MPI_Datatype;
/** MPI data types as specified by the MPI standard.
    Note that not all are available. */
extern MPI_Datatype MPI_CHAR;
extern MPI_Datatype MPI_SHORT;
extern MPI_Datatype MPI_INT;
extern MPI_Datatype MPI_LONG;
#ifdef SIZEOF_LONG_LONG_INT
extern MPI_Datatype MPI_LONG_LONG;
extern MPI_Datatype MPI_LONG_LONG_INT;
#endif
extern MPI_Datatype MPI_SIGNED_CHAR;
extern MPI_Datatype MPI_UNSIGNED_CHAR;
extern MPI_Datatype MPI_UNSIGNED_SHORT;
extern MPI_Datatype MPI_UNSIGNED;
extern MPI_Datatype MPI_UNSIGNED_LONG;
#ifdef SIZEOF_LONG_LONG_INT
extern MPI_Datatype MPI_UNSIGNED_LONG_LONG;
#endif
extern MPI_Datatype MPI_FLOAT;
extern MPI_Datatype MPI_DOUBLE;
extern MPI_Datatype MPI_LONG_DOUBLE;
/* extern MPI_Datatype MPI_UNSIGNED_WCHAR; we don't want this right now, anyway */
extern MPI_Datatype MPI_BYTE;
#define MPI_SUCCESS                 0
#define MPI_ERR_GROUP               1
#define MPI_ERR_COMM                2
#define MPI_ERR_STATUS              3
#define MPI_ERR_DIMS                4
#define MPI_ERR_COORDS              5
#define MPI_ERR_CART_CREATE_NPROCS  6
#define MPI_ERR_XFER_COUNTERPART    7
#define MPI_ERR_XFER_BUFSIZE        8
#define MPI_ERR_SEND_DEST           9
#define MPI_ERR_RECV_SRC            10
#define MPI_ERR_BUF                 11
#define MPI_ERR_OP_FN               12
#define MPI_ERR_UNKNOWN             13
#define MPI_FAILURE                 14

#define MPI_MAX_ERROR_STRING        256

#define MPI_UNDEFINED -1
typedef void (*MPI_Errhandler_fn)(MPI_Comm*, int*);
typedef struct mpi_errhandler_ *MPI_Errhandler;

extern MPI_Errhandler MPI_ERRORS_ARE_FATAL;
extern MPI_Errhandler MPI_ERRORS_RETURN;
/* miscellaneous defines */
#define MPI_ANY_SOURCE -1
#define MPI_ANY_TAG    -1

/* topology test defines */

/** All communicators */
extern MPI_Comm MPI_COMM_WORLD;
/* these are 0 instead of NULL so that we can compare against them */
#define MPI_COMM_NULL  0
#define MPI_GROUP_NULL 0

extern MPI_Group MPI_GROUP_EMPTY;

#define MPI_MAX_PROCESSOR_NAME 128

#define MPI_STATUS_IGNORE   0
#define MPI_STATUSES_IGNORE 0
/* the status object is user-maintained. */
struct mpi_status_
{
    int MPI_SOURCE; /* the message source rank */
    int MPI_TAG;    /* the message source tag */
    int MPI_ERROR;  /* the message error */
};

#define MPI_REQUEST_NULL 0
/* collective communication specials: */
#define MPI_IN_PLACE 0

/** MPI_Reduce operators.
    These all work (except obviously bad combinations like bitwise
    and/or/xor on floats, etc.): */
typedef enum
{
    MPI_MAX,    /* maximum */
    MPI_MIN,    /* minimum */
    MPI_SUM,    /* sum */
    MPI_PROD,   /* product */
    MPI_LAND,   /* logical and */
    MPI_BAND,   /* binary and */
    MPI_LOR,    /* logical or */
    MPI_BOR,    /* binary or */
    MPI_LXOR,   /* logical xor */
    MPI_BXOR    /* binary xor */
} MPI_Op;
/* function for MPI_COMM_SELF */
MPI_Comm tMPI_Get_comm_self(void);
/* this must be a function because it's a thread-local property: */
#define MPI_COMM_SELF (tMPI_Get_comm_self())
/** MPI initializer. Seeks the argument '-np n', where n is the number of
    threads that will be created. These new threads then run main() again,
    with the original argc and argv. */
int MPI_Init(int *argc, char ***argv);

/** Alternate thread MPI initializer. Creates N threads (including the main
    thread) that run main() again so they can catch up to MPI_Init
    themselves. */
int MPI_Init_N(int N);

/** get the number of threads that will be requested (can be called before
    MPI_Init() ) */
int tMPI_Get_N(int *argc, char ***argv);

/** waits for all threads to join() */
int MPI_Finalize(void);
/** just kills all threads. Not really necessary because exit() will do
    that for us anyway. */
int MPI_Abort(MPI_Comm comm, int errorcode);
/** whether MPI_Init, but not yet MPI_Finalize, has been run */
int MPI_Initialized(int *flag);
/** whether MPI_Finalize has been run */
int MPI_Finalized(int *flag);
/** create an error handler object from a function */
int MPI_Create_errhandler(MPI_Errhandler_fn *function,
                          MPI_Errhandler *errhandler);
/** free the error handler object */
int MPI_Errhandler_free(MPI_Errhandler *errhandler);
/** set the error handler */
int MPI_Comm_set_errhandler(MPI_Comm comm, MPI_Errhandler errhandler);
/** get the error handler */
int MPI_Comm_get_errhandler(MPI_Comm comm, MPI_Errhandler *errhandler);
/** get the error string associated with an error code */
int MPI_Error_string(int errorcode, char *string, int *resultlen);
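/** Example sketch: checking a return code and turning it into a readable
    message with MPI_Error_string (buf, dest and tag are illustrative names):
    \code
    int ret = MPI_Send(buf, 10, MPI_INT, dest, tag, MPI_COMM_WORLD);
    if (ret != MPI_SUCCESS)
    {
        char msg[MPI_MAX_ERROR_STRING];
        int  len;

        MPI_Error_string(ret, msg, &len);
        fprintf(stderr, "MPI error: %s\n", msg);
    }
    \endcode
*/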
/** returns a string with the thread number */
int MPI_Get_processor_name(char *name, int *resultlen);
/** get an elapsed time value as a double, in seconds */
double MPI_Wtime(void);
/** get the resolution of MPI_Wtime as a double, in seconds */
double MPI_Wtick(void);
/** get the size (number of members) of a group */
int MPI_Group_size(MPI_Group group, int *size);
/** get the rank of the calling thread within a group */
int MPI_Group_rank(MPI_Group group, int *rank);
/** create a new group containing the n ranks of an existing group listed in 'ranks' */
int MPI_Group_incl(MPI_Group group, int n, int *ranks, MPI_Group *newgroup);
/** get a pointer to the group in the comm */
int MPI_Comm_group(MPI_Comm comm, MPI_Group *group);
/** de-allocate a group */
int MPI_Group_free(MPI_Group *group);

/** get the comm size */
int MPI_Comm_size(MPI_Comm comm, int *size);
/** get the rank in comm of the current process */
int MPI_Comm_rank(MPI_Comm comm, int *rank);
/** de-allocate a comm */
int MPI_Comm_free(MPI_Comm *comm);
/** create a comm based on a group */
int MPI_Comm_create(MPI_Comm comm, MPI_Group group, MPI_Comm *newcomm);
/** split up a communicator into same-colored sub-communicators ordered by key
    (see the example below) */
int MPI_Comm_split(MPI_Comm comm, int color, int key, MPI_Comm *newcomm);
/** make a duplicate of a comm */
int MPI_Comm_dup(MPI_Comm comm, MPI_Comm *newcomm);
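/** Example sketch: splitting MPI_COMM_WORLD into two halves with
    MPI_Comm_split; color selects which sub-communicator a thread joins and
    key orders the ranks within it:
    \code
    int      rank, size;
    MPI_Comm half;

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_split(MPI_COMM_WORLD, rank < size/2 ? 0 : 1, rank, &half);
    // collective calls on 'half' now involve only half of the threads
    MPI_Comm_free(&half);
    \endcode
*/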
/* topology functions */
/** check what type of topology the comm has */
int MPI_Topo_test(MPI_Comm comm, int status);
/** check which dimensionality a topology has */
int MPI_Cartdim_get(MPI_Comm comm, int *ndims);
/** check which size and pbc a Cartesian topology has */
int MPI_Cart_get(MPI_Comm comm, int maxdims, int *dims, int *periods,
                 int *coords);
/** check which rank a set of process coordinates has in a Cartesian topology */
int MPI_Cart_rank(MPI_Comm comm, int *coords, int *rank);
/** check which coordinates a process rank has in a Cartesian topology */
int MPI_Cart_coords(MPI_Comm comm, int rank, int maxdims, int *coords);
/** check which rank this process would have in a Cartesian topology */
int MPI_Cart_map(MPI_Comm comm, int ndims, int *dims, int *periods,
                 int *newrank);

/** create a comm with a Cartesian topology */
int MPI_Cart_create(MPI_Comm comm_old, int ndims, int *dims, int *periods,
                    int reorder, MPI_Comm *comm_cart);
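/** Example sketch: creating a 2x2 periodic Cartesian communicator and looking
    up this thread's coordinates in it (assumes the program runs with 4
    threads):
    \code
    int      dims[2]    = {2, 2};
    int      periods[2] = {1, 1};
    int      coords[2];
    int      rank;
    MPI_Comm cart;

    MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 0, &cart);
    MPI_Comm_rank(cart, &rank);
    MPI_Cart_coords(cart, rank, 2, coords);
    \endcode
*/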
/** create a contiguous data type (the only type possible right now) */
int MPI_Type_contiguous(int count, MPI_Datatype oldtype,
                        MPI_Datatype *newtype);
/** make the data type ready for use */
int MPI_Type_commit(MPI_Datatype *datatype);
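/** Example sketch: building a derived type consisting of three doubles and
    committing it before use in transfers:
    \code
    MPI_Datatype triplet;

    MPI_Type_contiguous(3, MPI_DOUBLE, &triplet);
    MPI_Type_commit(&triplet);
    // 'triplet' can now be passed as the datatype argument of send/recv calls
    \endcode
*/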
/** wait for all processes in comm to arrive here */
int MPI_Barrier(MPI_Comm comm);
/** blocking transfers. The actual transfer (copy) is done on the receiving end
    (so that the receiver's cache already contains the data that it presumably
    needs right afterwards). */
/** send message; waits until finished. */
int MPI_Send(void* buf, int count, MPI_Datatype datatype, int dest,
             int tag, MPI_Comm comm);
/** receive message; waits until finished. */
int MPI_Recv(void* buf, int count, MPI_Datatype datatype, int source,
             int tag, MPI_Comm comm, MPI_Status *status);
/** send & receive message at the same time; waits until finished. */
int MPI_Sendrecv(void *sendbuf, int sendcount, MPI_Datatype sendtype,
                 int dest, int sendtag, void *recvbuf, int recvcount,
                 MPI_Datatype recvtype, int source, int recvtag, MPI_Comm comm,
                 MPI_Status *status);
/** get the number of actually transferred items from a transfer status */
int MPI_Get_count(MPI_Status *status, MPI_Datatype datatype, int *count);
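/** Example sketch: a blocking exchange from rank 0 to rank 1 (buffer size and
    tag are illustrative):
    \code
    double data[100];
    int    rank;

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    if (rank == 0)
    {
        MPI_Send(data, 100, MPI_DOUBLE, 1, 0, MPI_COMM_WORLD);
    }
    else if (rank == 1)
    {
        MPI_Status status;
        int        n;

        MPI_Recv(data, 100, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &status);
        MPI_Get_count(&status, MPI_DOUBLE, &n); // items actually received
    }
    \endcode
*/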
/** async send/recv. The actual transfer is usually done on the receiving
    end, during MPI_Wait, MPI_Waitall or MPI_Test. For MPI_Waitall,
    the incoming messages are processed in the order they come in.

    In the case of async receives, the sender may initiate transfer,
    and there's a lock in the envelope to make sure that it doesn't
    happen on both ends simultaneously. */
/** initiate sending a message */
int MPI_Isend(void* buf, int count, MPI_Datatype datatype, int dest,
              int tag, MPI_Comm comm, MPI_Request *request);
/** initiate receiving a message */
int MPI_Irecv(void* buf, int count, MPI_Datatype datatype, int source,
              int tag, MPI_Comm comm, MPI_Request *request);
/** test whether message is sent */
int MPI_Test(MPI_Request *request, int *flag, MPI_Status *status);
/** wait until message is sent */
int MPI_Wait(MPI_Request *request, MPI_Status *status);
/** wait for several message sending requests */
int MPI_Waitall(int count, MPI_Request *array_of_requests,
                MPI_Status *array_of_statuses);
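/** Example sketch: overlapping an outgoing and an incoming transfer with
    non-blocking calls, then waiting for both ('left' and 'right' are
    illustrative neighbor ranks):
    \code
    double      sendbuf[64], recvbuf[64];
    MPI_Request reqs[2];
    MPI_Status  stats[2];

    MPI_Isend(sendbuf, 64, MPI_DOUBLE, right, 0, MPI_COMM_WORLD, &reqs[0]);
    MPI_Irecv(recvbuf, 64, MPI_DOUBLE, left,  0, MPI_COMM_WORLD, &reqs[1]);
    // ... do other work while the transfers are pending ...
    MPI_Waitall(2, reqs, stats);
    \endcode
*/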
/** broadcast over entire comm from root */
int MPI_Bcast(void* buffer, int count, MPI_Datatype datatype, int root,
              MPI_Comm comm);

/** gather data from all processes in comm to root */
int MPI_Gather(void* sendbuf, int sendcount, MPI_Datatype sendtype,
               void* recvbuf, int recvcount, MPI_Datatype recvtype, int root,
               MPI_Comm comm);
/** gather irregularly laid out data from all processes in comm to root
    (see the example below) */
int MPI_Gatherv(void* sendbuf, int sendcount, MPI_Datatype sendtype,
                void* recvbuf, int *recvcounts, int *displs,
                MPI_Datatype recvtype, int root, MPI_Comm comm);
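/** Example sketch: gathering a variable number of items per thread to root
    with MPI_Gatherv; recvcounts[i] is how many items thread i contributes and
    displs[i] where they start in recvbuf (the counts and 'sendbuf'/'mycount'
    are illustrative):
    \code
    // on 3 threads, thread i contributes i+1 ints
    int recvcounts[3] = {1, 2, 3};
    int displs[3]     = {0, 1, 3};
    int recvbuf[6];

    MPI_Gatherv(sendbuf, mycount, MPI_INT,
                recvbuf, recvcounts, displs, MPI_INT, 0, MPI_COMM_WORLD);
    \endcode
*/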
/** spread parts of sendbuf to all processes in comm from root */
int MPI_Scatter(void* sendbuf, int sendcount, MPI_Datatype sendtype,
                void* recvbuf, int recvcount, MPI_Datatype recvtype, int root,
                MPI_Comm comm);
/** spread irregularly laid out parts of sendbuf to all processes from root */
int MPI_Scatterv(void* sendbuf, int *sendcounts, int *displs,
                 MPI_Datatype sendtype, void* recvbuf, int recvcount,
                 MPI_Datatype recvtype, int root, MPI_Comm comm);
/** spread out parts of sendbuf to all processes from all processes */
int MPI_Alltoall(void* sendbuf, int sendcount, MPI_Datatype sendtype,
                 void* recvbuf, int recvcount, MPI_Datatype recvtype,
                 MPI_Comm comm);
/** spread out irregularly laid out parts of sendbuf to all processes
    from all processes */
int MPI_Alltoallv(void* sendbuf, int *sendcounts, int *sdispls,
                  MPI_Datatype sendtype, void* recvbuf, int *recvcounts,
                  int *rdispls, MPI_Datatype recvtype, MPI_Comm comm);
/** Do an operation between all locally held buffers on all items in the
    buffers, and send the results to root */
int MPI_Reduce(void* sendbuf, void* recvbuf, int count,
               MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm);
/** Do an operation between all locally held buffers on all items in the
    buffers, and broadcast the results */
int MPI_Allreduce(void* sendbuf, void* recvbuf, int count,
                  MPI_Datatype datatype, MPI_Op op, MPI_Comm comm);
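/** Example sketch: summing one partial value per thread so that every thread
    ends up with the total ('my_partial_sum' is an illustrative name):
    \code
    double partial = my_partial_sum;
    double total;

    MPI_Allreduce(&partial, &total, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    \endcode
*/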
#ifdef __cplusplus
} /* closing extern "C" */
#endif

#endif /* _GMX_THREAD_MPI_H_ */