mfplay: Add support for same-thread event callback.
[wine.git] / dlls / vcomp / main.c
blob90caac8375034b3b149256fa949318e011e4e8bd
1 /*
3 * vcomp implementation
5 * Copyright 2011 Austin English
6 * Copyright 2012 Dan Kegel
7 * Copyright 2015-2016 Sebastian Lackner
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
24 #include <stdarg.h>
25 #include <assert.h>
27 #include "windef.h"
28 #include "winbase.h"
29 #include "winternl.h"
30 #include "wine/debug.h"
31 #include "wine/list.h"
32 #include "wine/asm.h"
WINE_DEFAULT_DEBUG_CHANNEL(vcomp);

/* Upper bound on stacked callback arguments.
 * NOTE(review): not referenced in this part of the file -- presumably used by
 * the fork/parallel implementation further down; confirm before removing. */
#define MAX_VECT_PARALLEL_CALLBACK_ARGS 128

/* OpenMP lock types are plain pointers to Win32 critical sections. */
typedef CRITICAL_SECTION *omp_lock_t;
typedef CRITICAL_SECTION *omp_nest_lock_t;

/* Pool of finished worker threads kept around for reuse by future teams. */
static struct list vcomp_idle_threads = LIST_INIT(vcomp_idle_threads);
static DWORD   vcomp_context_tls = TLS_OUT_OF_INDEXES; /* TLS slot holding per-thread data */
static HMODULE vcomp_module;
static int     vcomp_max_threads;                      /* reported by omp_get_max_threads() */
static int     vcomp_num_threads;                      /* default team size */
static BOOL    vcomp_nested_fork = FALSE;              /* nested parallelism enabled? */

/* Global lock serializing team/task bookkeeping and the emulated atomics. */
static RTL_CRITICAL_SECTION vcomp_section;
static RTL_CRITICAL_SECTION_DEBUG critsect_debug =
{
    0, 0, &vcomp_section,
    { &critsect_debug.ProcessLocksList, &critsect_debug.ProcessLocksList },
      0, 0, { (DWORD_PTR)(__FILE__ ": vcomp_section") }
};
static RTL_CRITICAL_SECTION vcomp_section = { &critsect_debug, -1, 0, 0, 0, 0 };

/* Scheduling mode bits passed in the flags argument of _vcomp_for_dynamic_init(). */
#define VCOMP_DYNAMIC_FLAGS_STATIC      0x01
#define VCOMP_DYNAMIC_FLAGS_CHUNKED     0x02
#define VCOMP_DYNAMIC_FLAGS_GUIDED      0x03
#define VCOMP_DYNAMIC_FLAGS_INCREMENT   0x40 /* loop counts upward when set */
/* Per-thread state, stored in the vcomp_context_tls TLS slot. */
struct vcomp_thread_data
{
    struct vcomp_team_data *team;  /* team of the current parallel region, NULL outside one */
    struct vcomp_task_data *task;  /* shared work-sharing state for the current task */
    int                    thread_num;   /* value returned by omp_get_thread_num() */
    BOOL                   parallel;     /* inside an active parallel region? */
    int                    fork_threads; /* thread count requested via _vcomp_set_num_threads() */

    /* only used for concurrent tasks */
    struct list            entry;        /* link in vcomp_idle_threads */
    CONDITION_VARIABLE     cond;

    /* single: per-thread generation counter matched against task_data->single */
    unsigned int           single;

    /* section: per-thread generation counter matched against task_data->section */
    unsigned int           section;

    /* dynamic: per-thread generation counter plus the schedule computed at init */
    unsigned int           dynamic;
    unsigned int           dynamic_type;  /* one of the VCOMP_DYNAMIC_FLAGS_* modes, 0 = none */
    unsigned int           dynamic_begin;
    unsigned int           dynamic_end;
};
/* State shared by all threads of one parallel region ("team"). */
struct vcomp_team_data
{
    CONDITION_VARIABLE cond;
    int                num_threads;      /* team size */
    int                finished_threads; /* threads that completed the region */

    /* callback arguments forwarded to each worker's fork call */
    int                nargs;
    void               *wrapper;         /* function executed by every team member */
    __ms_va_list       valist;

    /* barrier: generation counter + waiter count, protected by vcomp_section */
    unsigned int       barrier;
    int                barrier_count;
};
/* Work-sharing state shared across a task; generation counters below are
 * compared against the per-thread counters to decide which thread "wins"
 * a construct (single/sections) or whether a schedule is current. */
struct vcomp_task_data
{
    /* single */
    unsigned int single;

    /* section */
    unsigned int section;
    int          num_sections;
    int          section_index; /* next unassigned section */

    /* dynamic: shared iteration range handed out chunk by chunk */
    unsigned int dynamic;
    unsigned int dynamic_first;
    unsigned int dynamic_last;
    unsigned int dynamic_iterations;
    int          dynamic_step;
    unsigned int dynamic_chunksize;
};
/* Reinterpret a __ms_va_list as a flat array of pointer-sized arguments.
 * This relies on the Microsoft varargs ABI where the va_list is simply a
 * pointer into the caller's stacked argument area; it is only valid on the
 * architectures this file supports. */
static void **ptr_from_va_list(__ms_va_list valist)
{
    return *(void ***)&valist;
}
127 static void copy_va_list_data(void **args, __ms_va_list valist, int args_count)
129 unsigned int i;
131 for (i = 0; i < args_count; ++i)
132 args[i] = va_arg(valist, void *);
#if defined(__i386__)

/* Call wrapper(args[0], ..., args[nargs-1]) with a variable argument count.
 * i386: all arguments are passed on the stack, so carve out a 16-byte aligned
 * area below the frame, rep-movsl the argument array into it and call. */
extern void CDECL _vcomp_fork_call_wrapper(void *wrapper, int nargs, void **args);
__ASM_GLOBAL_FUNC( _vcomp_fork_call_wrapper,
                   "pushl %ebp\n\t"
                   __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t")
                   __ASM_CFI(".cfi_rel_offset %ebp,0\n\t")
                   "movl %esp,%ebp\n\t"
                   __ASM_CFI(".cfi_def_cfa_register %ebp\n\t")
                   "pushl %esi\n\t"
                   __ASM_CFI(".cfi_rel_offset %esi,-4\n\t")
                   "pushl %edi\n\t"
                   __ASM_CFI(".cfi_rel_offset %edi,-8\n\t")
                   "movl 12(%ebp),%edx\n\t"
                   "movl %esp,%edi\n\t"
                   "shll $2,%edx\n\t"
                   "jz 1f\n\t"
                   "subl %edx,%edi\n\t"
                   "andl $~15,%edi\n\t"
                   "movl %edi,%esp\n\t"
                   "movl 12(%ebp),%ecx\n\t"
                   "movl 16(%ebp),%esi\n\t"
                   "cld\n\t"
                   "rep; movsl\n"
                   "1:\tcall *8(%ebp)\n\t"
                   "leal -8(%ebp),%esp\n\t"
                   "popl %edi\n\t"
                   __ASM_CFI(".cfi_same_value %edi\n\t")
                   "popl %esi\n\t"
                   __ASM_CFI(".cfi_same_value %esi\n\t")
                   "popl %ebp\n\t"
                   __ASM_CFI(".cfi_def_cfa %esp,4\n\t")
                   __ASM_CFI(".cfi_same_value %ebp\n\t")
                   "ret" )

#elif defined(__x86_64__)

/* x86-64 (Win64 ABI): copy at least four slots (shadow space), then load the
 * first four arguments into rcx/rdx/r8/r9 and leave the rest on the stack. */
extern void CDECL _vcomp_fork_call_wrapper(void *wrapper, int nargs, void **args);
__ASM_GLOBAL_FUNC( _vcomp_fork_call_wrapper,
                   "pushq %rbp\n\t"
                   __ASM_SEH(".seh_pushreg %rbp\n\t")
                   __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t")
                   __ASM_CFI(".cfi_rel_offset %rbp,0\n\t")
                   "movq %rsp,%rbp\n\t"
                   __ASM_SEH(".seh_setframe %rbp,0\n\t")
                   __ASM_CFI(".cfi_def_cfa_register %rbp\n\t")
                   "pushq %rsi\n\t"
                   __ASM_SEH(".seh_pushreg %rsi\n\t")
                   __ASM_CFI(".cfi_rel_offset %rsi,-8\n\t")
                   "pushq %rdi\n\t"
                   __ASM_SEH(".seh_pushreg %rdi\n\t")
                   __ASM_SEH(".seh_endprologue\n\t")
                   __ASM_CFI(".cfi_rel_offset %rdi,-16\n\t")
                   "movq %rcx,%rax\n\t"
                   "movq $4,%rcx\n\t"
                   "cmp %rcx,%rdx\n\t"
                   "cmovgq %rdx,%rcx\n\t"
                   "leaq 0(,%rcx,8),%rdx\n\t"
                   "subq %rdx,%rsp\n\t"
                   "andq $~15,%rsp\n\t"
                   "movq %rsp,%rdi\n\t"
                   "movq %r8,%rsi\n\t"
                   "rep; movsq\n\t"
                   "movq 0(%rsp),%rcx\n\t"
                   "movq 8(%rsp),%rdx\n\t"
                   "movq 16(%rsp),%r8\n\t"
                   "movq 24(%rsp),%r9\n\t"
                   "callq *%rax\n\t"
                   "leaq -16(%rbp),%rsp\n\t"
                   "popq %rdi\n\t"
                   __ASM_CFI(".cfi_same_value %rdi\n\t")
                   "popq %rsi\n\t"
                   __ASM_CFI(".cfi_same_value %rsi\n\t")
                   __ASM_CFI(".cfi_def_cfa_register %rsp\n\t")
                   "popq %rbp\n\t"
                   __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t")
                   __ASM_CFI(".cfi_same_value %rbp\n\t")
                   "ret")

#elif defined(__arm__)

/* ARM (AAPCS): copy arguments to the stack (keeping it 8-byte aligned),
 * then pop up to the first four into r0-r3 before the indirect call. */
extern void CDECL _vcomp_fork_call_wrapper(void *wrapper, int nargs, void **args);
__ASM_GLOBAL_FUNC( _vcomp_fork_call_wrapper,
                   "push {r4, r5, LR}\n\t"
                   "mov r4, r0\n\t"
                   "mov r5, SP\n\t"
                   "lsl r3, r1, #2\n\t"
                   "cmp r3, #0\n\t"
                   "beq 5f\n\t"
                   "sub SP, SP, r3\n\t"
                   "tst r1, #1\n\t"
                   "it eq\n\t"
                   "subeq SP, SP, #4\n\t"
                   "1:\tsub r3, r3, #4\n\t"
                   "ldr r0, [r2, r3]\n\t"
                   "str r0, [SP, r3]\n\t"
                   "cmp r3, #0\n\t"
                   "bgt 1b\n\t"
                   "cmp r1, #1\n\t"
                   "bgt 2f\n\t"
                   "pop {r0}\n\t"
                   "b 5f\n\t"
                   "2:\tcmp r1, #2\n\t"
                   "bgt 3f\n\t"
                   "pop {r0-r1}\n\t"
                   "b 5f\n\t"
                   "3:\tcmp r1, #3\n\t"
                   "bgt 4f\n\t"
                   "pop {r0-r2}\n\t"
                   "b 5f\n\t"
                   "4:\tpop {r0-r3}\n\t"
                   "5:\tblx r4\n\t"
                   "mov SP, r5\n\t"
                   "pop {r4, r5, PC}" )

#elif defined(__aarch64__)

/* AArch64: reserve a 16-byte aligned area of at least 64 bytes, copy the
 * argument array, then load the first eight arguments into x0-x7. */
extern void CDECL _vcomp_fork_call_wrapper(void *wrapper, int nargs, void **args);
__ASM_GLOBAL_FUNC( _vcomp_fork_call_wrapper,
                   "stp x29, x30, [SP,#-16]!\n\t"
                   "mov x29, SP\n\t"
                   "mov x9, x0\n\t"
                   "cbz w1, 4f\n\t"
                   "lsl w8, w1, #3\n\t"
                   "cmp w8, #64\n\t"
                   "b.ge 1f\n\t"
                   "mov w8, #64\n"
                   "1:\ttbz w8, #3, 2f\n\t"
                   "add w8, w8, #8\n"
                   "2:\tsub x10, x29, x8\n\t"
                   "mov sp, x10\n"
                   "3:\tldr x0, [x2], #8\n\t"
                   "str x0, [x10], #8\n\t"
                   "subs w1, w1, #1\n\t"
                   "b.ne 3b\n\t"
                   "ldp x0, x1, [sp], #16\n\t"
                   "ldp x2, x3, [sp], #16\n\t"
                   "ldp x4, x5, [sp], #16\n\t"
                   "ldp x6, x7, [sp], #16\n"
                   "4:\tblr x9\n\t"
                   "mov SP, x29\n\t"
                   "ldp x29, x30, [SP], #16\n\t"
                   "ret\n" )

#else

/* Fallback stub for unsupported architectures: parallel regions cannot
 * actually invoke the user callback here. */
static void CDECL _vcomp_fork_call_wrapper(void *wrapper, int nargs, void **args)
{
    ERR("Not implemented for this architecture\n");
}

#endif
#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))

/* 8- and 16-bit interlocked primitives (the Win32 API only provides 32/64-bit
 * ones).  On x86 with GCC we can emit lock-prefixed cmpxchg/xadd directly. */
static inline char interlocked_cmpxchg8(char *dest, char xchg, char compare)
{
    char ret;
    __asm__ __volatile__( "lock; cmpxchgb %2,(%1)"
                          : "=a" (ret) : "r" (dest), "q" (xchg), "0" (compare) : "memory" );
    return ret;
}

static inline short interlocked_cmpxchg16(short *dest, short xchg, short compare)
{
    short ret;
    __asm__ __volatile__( "lock; cmpxchgw %2,(%1)"
                          : "=a" (ret) : "r" (dest), "r" (xchg), "0" (compare) : "memory" );
    return ret;
}

static inline char interlocked_xchg_add8(char *dest, char incr)
{
    char ret;
    __asm__ __volatile__( "lock; xaddb %0,(%1)"
                          : "=q" (ret) : "r" (dest), "0" (incr) : "memory" );
    return ret;
}

static inline short interlocked_xchg_add16(short *dest, short incr)
{
    short ret;
    __asm__ __volatile__( "lock; xaddw %0,(%1)"
                          : "=r" (ret) : "r" (dest), "0" (incr) : "memory" );
    return ret;
}

#else  /* __GNUC__ */

/* Other compilers/architectures: prefer the GCC-style __sync builtins when the
 * compiler advertises them, otherwise serialize through vcomp_section.  The
 * lock-based fallback is only atomic with respect to other vcomp operations. */
#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1
static inline char interlocked_cmpxchg8(char *dest, char xchg, char compare)
{
    return __sync_val_compare_and_swap(dest, compare, xchg);
}

static inline char interlocked_xchg_add8(char *dest, char incr)
{
    return __sync_fetch_and_add(dest, incr);
}
#else
static char interlocked_cmpxchg8(char *dest, char xchg, char compare)
{
    EnterCriticalSection(&vcomp_section);
    if (*dest == compare) *dest = xchg; else compare = *dest;
    LeaveCriticalSection(&vcomp_section);
    return compare;
}

static char interlocked_xchg_add8(char *dest, char incr)
{
    char ret;
    EnterCriticalSection(&vcomp_section);
    ret = *dest; *dest += incr;
    LeaveCriticalSection(&vcomp_section);
    return ret;
}
#endif

#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2
static inline short interlocked_cmpxchg16(short *dest, short xchg, short compare)
{
    return __sync_val_compare_and_swap(dest, compare, xchg);
}

static inline short interlocked_xchg_add16(short *dest, short incr)
{
    return __sync_fetch_and_add(dest, incr);
}
#else
static short interlocked_cmpxchg16(short *dest, short xchg, short compare)
{
    EnterCriticalSection(&vcomp_section);
    if (*dest == compare) *dest = xchg; else compare = *dest;
    LeaveCriticalSection(&vcomp_section);
    return compare;
}

static short interlocked_xchg_add16(short *dest, short incr)
{
    short ret;
    EnterCriticalSection(&vcomp_section);
    ret = *dest; *dest += incr;
    LeaveCriticalSection(&vcomp_section);
    return ret;
}
#endif

#endif  /* __GNUC__ */
384 static inline struct vcomp_thread_data *vcomp_get_thread_data(void)
386 return (struct vcomp_thread_data *)TlsGetValue(vcomp_context_tls);
389 static inline void vcomp_set_thread_data(struct vcomp_thread_data *thread_data)
391 TlsSetValue(vcomp_context_tls, thread_data);
/* Return the calling thread's vcomp bookkeeping, allocating and initializing
 * it on first use.  Thread and task data are allocated as a single block; on
 * allocation failure the process is terminated (there is no way to report an
 * error to the compiler-generated caller). */
static struct vcomp_thread_data *vcomp_init_thread_data(void)
{
    struct vcomp_thread_data *thread_data = vcomp_get_thread_data();
    struct
    {
        struct vcomp_thread_data thread;
        struct vcomp_task_data task;
    } *data;

    if (thread_data) return thread_data;
    if (!(data = HeapAlloc(GetProcessHeap(), 0, sizeof(*data))))
    {
        ERR("could not create thread data\n");
        ExitProcess(1);
    }

    /* generation counters start one behind the per-thread counters below */
    data->task.single = 0;
    data->task.section = 0;
    data->task.dynamic = 0;

    thread_data = &data->thread;
    thread_data->team = NULL;            /* not inside a parallel region */
    thread_data->task = &data->task;
    thread_data->thread_num = 0;         /* master thread by default */
    thread_data->parallel = FALSE;
    thread_data->fork_threads = 0;
    thread_data->single = 1;
    thread_data->section = 1;
    thread_data->dynamic = 1;
    thread_data->dynamic_type = 0;

    vcomp_set_thread_data(thread_data);
    return thread_data;
}
429 static void vcomp_free_thread_data(void)
431 struct vcomp_thread_data *thread_data = vcomp_get_thread_data();
432 if (!thread_data) return;
434 HeapFree(GetProcessHeap(), 0, thread_data);
435 vcomp_set_thread_data(NULL);
/* 8-bit atomic operations used by compiler-generated "#pragma omp atomic"
 * code.  add/sub map directly onto an atomic exchange-add; everything else is
 * emulated with a compare-exchange retry loop: reread *dest until the CAS
 * succeeds against the value the new result was computed from. */
void CDECL _vcomp_atomic_add_i1(char *dest, char val)
{
    interlocked_xchg_add8(dest, val);
}

void CDECL _vcomp_atomic_and_i1(char *dest, char val)
{
    char old;
    do old = *dest; while (interlocked_cmpxchg8(dest, old & val, old) != old);
}

void CDECL _vcomp_atomic_div_i1(signed char *dest, signed char val)
{
    signed char old;
    do old = *dest; while ((signed char)interlocked_cmpxchg8((char *)dest, old / val, old) != old);
}

void CDECL _vcomp_atomic_div_ui1(unsigned char *dest, unsigned char val)
{
    unsigned char old;
    do old = *dest; while ((unsigned char)interlocked_cmpxchg8((char *)dest, old / val, old) != old);
}

void CDECL _vcomp_atomic_mul_i1(char *dest, char val)
{
    char old;
    do old = *dest; while (interlocked_cmpxchg8(dest, old * val, old) != old);
}

void CDECL _vcomp_atomic_or_i1(char *dest, char val)
{
    char old;
    do old = *dest; while (interlocked_cmpxchg8(dest, old | val, old) != old);
}

void CDECL _vcomp_atomic_shl_i1(char *dest, unsigned int val)
{
    char old;
    do old = *dest; while (interlocked_cmpxchg8(dest, old << val, old) != old);
}

/* signed and unsigned right shifts differ (arithmetic vs logical), hence
 * the two variants below */
void CDECL _vcomp_atomic_shr_i1(signed char *dest, unsigned int val)
{
    signed char old;
    do old = *dest; while ((signed char)interlocked_cmpxchg8((char *)dest, old >> val, old) != old);
}

void CDECL _vcomp_atomic_shr_ui1(unsigned char *dest, unsigned int val)
{
    unsigned char old;
    do old = *dest; while ((unsigned char)interlocked_cmpxchg8((char *)dest, old >> val, old) != old);
}

void CDECL _vcomp_atomic_sub_i1(char *dest, char val)
{
    interlocked_xchg_add8(dest, -val);
}

void CDECL _vcomp_atomic_xor_i1(char *dest, char val)
{
    char old;
    do old = *dest; while (interlocked_cmpxchg8(dest, old ^ val, old) != old);
}

static void CDECL _vcomp_atomic_bool_and_i1(char *dest, char val)
{
    char old;
    do old = *dest; while (interlocked_cmpxchg8(dest, old && val, old) != old);
}

/* logical OR keeps the original nonzero value of *dest rather than
 * normalizing it to 1 */
static void CDECL _vcomp_atomic_bool_or_i1(char *dest, char val)
{
    char old;
    do old = *dest; while (interlocked_cmpxchg8(dest, old ? old : (val != 0), old) != old);
}

/* Apply an OpenMP reduction on an 8-bit value; bits 8..11 of flags select the
 * operator.  Out-of-range selectors are clamped to the last table entry. */
void CDECL _vcomp_reduction_i1(unsigned int flags, char *dest, char val)
{
    static void (CDECL * const funcs[])(char *, char) =
    {
        _vcomp_atomic_add_i1,
        _vcomp_atomic_add_i1,
        _vcomp_atomic_mul_i1,
        _vcomp_atomic_and_i1,
        _vcomp_atomic_or_i1,
        _vcomp_atomic_xor_i1,
        _vcomp_atomic_bool_and_i1,
        _vcomp_atomic_bool_or_i1,
    };
    unsigned int op = (flags >> 8) & 0xf;
    op = min(op, ARRAY_SIZE(funcs) - 1);
    funcs[op](dest, val);
}
/* 16-bit atomic operations; same CAS retry-loop scheme as the 8-bit set. */
void CDECL _vcomp_atomic_add_i2(short *dest, short val)
{
    interlocked_xchg_add16(dest, val);
}

void CDECL _vcomp_atomic_and_i2(short *dest, short val)
{
    short old;
    do old = *dest; while (interlocked_cmpxchg16(dest, old & val, old) != old);
}

void CDECL _vcomp_atomic_div_i2(short *dest, short val)
{
    short old;
    do old = *dest; while (interlocked_cmpxchg16(dest, old / val, old) != old);
}

void CDECL _vcomp_atomic_div_ui2(unsigned short *dest, unsigned short val)
{
    unsigned short old;
    do old = *dest; while ((unsigned short)interlocked_cmpxchg16((short *)dest, old / val, old) != old);
}

void CDECL _vcomp_atomic_mul_i2(short *dest, short val)
{
    short old;
    do old = *dest; while (interlocked_cmpxchg16(dest, old * val, old) != old);
}

void CDECL _vcomp_atomic_or_i2(short *dest, short val)
{
    short old;
    do old = *dest; while (interlocked_cmpxchg16(dest, old | val, old) != old);
}

void CDECL _vcomp_atomic_shl_i2(short *dest, unsigned int val)
{
    short old;
    do old = *dest; while (interlocked_cmpxchg16(dest, old << val, old) != old);
}

void CDECL _vcomp_atomic_shr_i2(short *dest, unsigned int val)
{
    short old;
    do old = *dest; while (interlocked_cmpxchg16(dest, old >> val, old) != old);
}

void CDECL _vcomp_atomic_shr_ui2(unsigned short *dest, unsigned int val)
{
    unsigned short old;
    do old = *dest; while ((unsigned short)interlocked_cmpxchg16((short *)dest, old >> val, old) != old);
}

void CDECL _vcomp_atomic_sub_i2(short *dest, short val)
{
    interlocked_xchg_add16(dest, -val);
}

void CDECL _vcomp_atomic_xor_i2(short *dest, short val)
{
    short old;
    do old = *dest; while (interlocked_cmpxchg16(dest, old ^ val, old) != old);
}

static void CDECL _vcomp_atomic_bool_and_i2(short *dest, short val)
{
    short old;
    do old = *dest; while (interlocked_cmpxchg16(dest, old && val, old) != old);
}

static void CDECL _vcomp_atomic_bool_or_i2(short *dest, short val)
{
    short old;
    do old = *dest; while (interlocked_cmpxchg16(dest, old ? old : (val != 0), old) != old);
}

/* 16-bit reduction dispatcher; operator selected by bits 8..11 of flags. */
void CDECL _vcomp_reduction_i2(unsigned int flags, short *dest, short val)
{
    static void (CDECL * const funcs[])(short *, short) =
    {
        _vcomp_atomic_add_i2,
        _vcomp_atomic_add_i2,
        _vcomp_atomic_mul_i2,
        _vcomp_atomic_and_i2,
        _vcomp_atomic_or_i2,
        _vcomp_atomic_xor_i2,
        _vcomp_atomic_bool_and_i2,
        _vcomp_atomic_bool_or_i2,
    };
    unsigned int op = (flags >> 8) & 0xf;
    op = min(op, ARRAY_SIZE(funcs) - 1);
    funcs[op](dest, val);
}
/* 32-bit atomic operations; these can use the Win32 Interlocked* API
 * directly instead of the hand-rolled 8/16-bit helpers. */
void CDECL _vcomp_atomic_add_i4(int *dest, int val)
{
    InterlockedExchangeAdd(dest, val);
}

void CDECL _vcomp_atomic_and_i4(int *dest, int val)
{
    int old;
    do old = *dest; while (InterlockedCompareExchange(dest, old & val, old) != old);
}

void CDECL _vcomp_atomic_div_i4(int *dest, int val)
{
    int old;
    do old = *dest; while (InterlockedCompareExchange(dest, old / val, old) != old);
}

void CDECL _vcomp_atomic_div_ui4(unsigned int *dest, unsigned int val)
{
    unsigned int old;
    do old = *dest; while (InterlockedCompareExchange((int *)dest, old / val, old) != old);
}

void CDECL _vcomp_atomic_mul_i4(int *dest, int val)
{
    int old;
    do old = *dest; while (InterlockedCompareExchange(dest, old * val, old) != old);
}

void CDECL _vcomp_atomic_or_i4(int *dest, int val)
{
    int old;
    do old = *dest; while (InterlockedCompareExchange(dest, old | val, old) != old);
}

void CDECL _vcomp_atomic_shl_i4(int *dest, int val)
{
    int old;
    do old = *dest; while (InterlockedCompareExchange(dest, old << val, old) != old);
}

void CDECL _vcomp_atomic_shr_i4(int *dest, int val)
{
    int old;
    do old = *dest; while (InterlockedCompareExchange(dest, old >> val, old) != old);
}

void CDECL _vcomp_atomic_shr_ui4(unsigned int *dest, unsigned int val)
{
    unsigned int old;
    do old = *dest; while (InterlockedCompareExchange((int *)dest, old >> val, old) != old);
}

void CDECL _vcomp_atomic_sub_i4(int *dest, int val)
{
    InterlockedExchangeAdd(dest, -val);
}

void CDECL _vcomp_atomic_xor_i4(int *dest, int val)
{
    int old;
    do old = *dest; while (InterlockedCompareExchange(dest, old ^ val, old) != old);
}

static void CDECL _vcomp_atomic_bool_and_i4(int *dest, int val)
{
    int old;
    do old = *dest; while (InterlockedCompareExchange(dest, old && val, old) != old);
}

static void CDECL _vcomp_atomic_bool_or_i4(int *dest, int val)
{
    int old;
    do old = *dest; while (InterlockedCompareExchange(dest, old ? old : (val != 0), old) != old);
}

/* 32-bit reduction dispatcher; operator selected by bits 8..11 of flags. */
void CDECL _vcomp_reduction_i4(unsigned int flags, int *dest, int val)
{
    static void (CDECL * const funcs[])(int *, int) =
    {
        _vcomp_atomic_add_i4,
        _vcomp_atomic_add_i4,
        _vcomp_atomic_mul_i4,
        _vcomp_atomic_and_i4,
        _vcomp_atomic_or_i4,
        _vcomp_atomic_xor_i4,
        _vcomp_atomic_bool_and_i4,
        _vcomp_atomic_bool_or_i4,
    };
    unsigned int op = (flags >> 8) & 0xf;
    op = min(op, ARRAY_SIZE(funcs) - 1);
    funcs[op](dest, val);
}
/* 64-bit atomic operations.  Even add/sub use a CAS loop here because a
 * 64-bit exchange-add is not assumed to be available on every platform. */
void CDECL _vcomp_atomic_add_i8(LONG64 *dest, LONG64 val)
{
    LONG64 old;
    do old = *dest; while (InterlockedCompareExchange64(dest, old + val, old) != old);
}

void CDECL _vcomp_atomic_and_i8(LONG64 *dest, LONG64 val)
{
    LONG64 old;
    do old = *dest; while (InterlockedCompareExchange64(dest, old & val, old) != old);
}

void CDECL _vcomp_atomic_div_i8(LONG64 *dest, LONG64 val)
{
    LONG64 old;
    do old = *dest; while (InterlockedCompareExchange64(dest, old / val, old) != old);
}

void CDECL _vcomp_atomic_div_ui8(ULONG64 *dest, ULONG64 val)
{
    ULONG64 old;
    do old = *dest; while (InterlockedCompareExchange64((LONG64 *)dest, old / val, old) != old);
}

void CDECL _vcomp_atomic_mul_i8(LONG64 *dest, LONG64 val)
{
    LONG64 old;
    do old = *dest; while (InterlockedCompareExchange64(dest, old * val, old) != old);
}

void CDECL _vcomp_atomic_or_i8(LONG64 *dest, LONG64 val)
{
    LONG64 old;
    do old = *dest; while (InterlockedCompareExchange64(dest, old | val, old) != old);
}

void CDECL _vcomp_atomic_shl_i8(LONG64 *dest, unsigned int val)
{
    LONG64 old;
    do old = *dest; while (InterlockedCompareExchange64(dest, old << val, old) != old);
}

void CDECL _vcomp_atomic_shr_i8(LONG64 *dest, unsigned int val)
{
    LONG64 old;
    do old = *dest; while (InterlockedCompareExchange64(dest, old >> val, old) != old);
}

void CDECL _vcomp_atomic_shr_ui8(ULONG64 *dest, unsigned int val)
{
    ULONG64 old;
    do old = *dest; while (InterlockedCompareExchange64((LONG64 *)dest, old >> val, old) != old);
}

void CDECL _vcomp_atomic_sub_i8(LONG64 *dest, LONG64 val)
{
    LONG64 old;
    do old = *dest; while (InterlockedCompareExchange64(dest, old - val, old) != old);
}

void CDECL _vcomp_atomic_xor_i8(LONG64 *dest, LONG64 val)
{
    LONG64 old;
    do old = *dest; while (InterlockedCompareExchange64(dest, old ^ val, old) != old);
}

static void CDECL _vcomp_atomic_bool_and_i8(LONG64 *dest, LONG64 val)
{
    LONG64 old;
    do old = *dest; while (InterlockedCompareExchange64(dest, old && val, old) != old);
}

static void CDECL _vcomp_atomic_bool_or_i8(LONG64 *dest, LONG64 val)
{
    LONG64 old;
    do old = *dest; while (InterlockedCompareExchange64(dest, old ? old : (val != 0), old) != old);
}

/* 64-bit reduction dispatcher; operator selected by bits 8..11 of flags. */
void CDECL _vcomp_reduction_i8(unsigned int flags, LONG64 *dest, LONG64 val)
{
    static void (CDECL * const funcs[])(LONG64 *, LONG64) =
    {
        _vcomp_atomic_add_i8,
        _vcomp_atomic_add_i8,
        _vcomp_atomic_mul_i8,
        _vcomp_atomic_and_i8,
        _vcomp_atomic_or_i8,
        _vcomp_atomic_xor_i8,
        _vcomp_atomic_bool_and_i8,
        _vcomp_atomic_bool_or_i8,
    };
    unsigned int op = (flags >> 8) & 0xf;
    op = min(op, ARRAY_SIZE(funcs) - 1);
    funcs[op](dest, val);
}
/* Single-precision float atomics: the float is type-punned through an int so
 * that InterlockedCompareExchange can be used for the CAS loop.
 * NOTE(review): the *(float *)&old punning casts technically violate strict
 * aliasing; presumably the build disables that optimization -- confirm against
 * the module's compiler flags before "fixing". */
void CDECL _vcomp_atomic_add_r4(float *dest, float val)
{
    int old, new;
    do
    {
        old = *(int *)dest;
        *(float *)&new = *(float *)&old + val;
    }
    while (InterlockedCompareExchange((int *)dest, new, old) != old);
}

void CDECL _vcomp_atomic_div_r4(float *dest, float val)
{
    int old, new;
    do
    {
        old = *(int *)dest;
        *(float *)&new = *(float *)&old / val;
    }
    while (InterlockedCompareExchange((int *)dest, new, old) != old);
}

void CDECL _vcomp_atomic_mul_r4(float *dest, float val)
{
    int old, new;
    do
    {
        old = *(int *)dest;
        *(float *)&new = *(float *)&old * val;
    }
    while (InterlockedCompareExchange((int *)dest, new, old) != old);
}

void CDECL _vcomp_atomic_sub_r4(float *dest, float val)
{
    int old, new;
    do
    {
        old = *(int *)dest;
        *(float *)&new = *(float *)&old - val;
    }
    while (InterlockedCompareExchange((int *)dest, new, old) != old);
}

static void CDECL _vcomp_atomic_bool_and_r4(float *dest, float val)
{
    int old, new;
    do
    {
        old = *(int *)dest;
        *(float *)&new = (*(float *)&old != 0.0) ? (val != 0.0) : 0.0;
    }
    while (InterlockedCompareExchange((int *)dest, new, old) != old);
}

static void CDECL _vcomp_atomic_bool_or_r4(float *dest, float val)
{
    int old, new;
    do
    {
        old = *(int *)dest;
        *(float *)&new = (*(float *)&old != 0.0) ? *(float *)&old : (val != 0.0);
    }
    while (InterlockedCompareExchange((int *)dest, new, old) != old);
}

/* Float reduction dispatcher.  The slots that hold bitwise and/or/xor for the
 * integer types map to _vcomp_atomic_bool_or_r4 here, since bitwise operators
 * are not defined for floats. */
void CDECL _vcomp_reduction_r4(unsigned int flags, float *dest, float val)
{
    static void (CDECL * const funcs[])(float *, float) =
    {
        _vcomp_atomic_add_r4,
        _vcomp_atomic_add_r4,
        _vcomp_atomic_mul_r4,
        _vcomp_atomic_bool_or_r4,
        _vcomp_atomic_bool_or_r4,
        _vcomp_atomic_bool_or_r4,
        _vcomp_atomic_bool_and_r4,
        _vcomp_atomic_bool_or_r4,
    };
    unsigned int op = (flags >> 8) & 0xf;
    op = min(op, ARRAY_SIZE(funcs) - 1);
    funcs[op](dest, val);
}
/* Double-precision float atomics: same scheme as the r4 set, punning the
 * double through a LONG64 for InterlockedCompareExchange64. */
void CDECL _vcomp_atomic_add_r8(double *dest, double val)
{
    LONG64 old, new;
    do
    {
        old = *(LONG64 *)dest;
        *(double *)&new = *(double *)&old + val;
    }
    while (InterlockedCompareExchange64((LONG64 *)dest, new, old) != old);
}

void CDECL _vcomp_atomic_div_r8(double *dest, double val)
{
    LONG64 old, new;
    do
    {
        old = *(LONG64 *)dest;
        *(double *)&new = *(double *)&old / val;
    }
    while (InterlockedCompareExchange64((LONG64 *)dest, new, old) != old);
}

void CDECL _vcomp_atomic_mul_r8(double *dest, double val)
{
    LONG64 old, new;
    do
    {
        old = *(LONG64 *)dest;
        *(double *)&new = *(double *)&old * val;
    }
    while (InterlockedCompareExchange64((LONG64 *)dest, new, old) != old);
}

void CDECL _vcomp_atomic_sub_r8(double *dest, double val)
{
    LONG64 old, new;
    do
    {
        old = *(LONG64 *)dest;
        *(double *)&new = *(double *)&old - val;
    }
    while (InterlockedCompareExchange64((LONG64 *)dest, new, old) != old);
}

static void CDECL _vcomp_atomic_bool_and_r8(double *dest, double val)
{
    LONG64 old, new;
    do
    {
        old = *(LONG64 *)dest;
        *(double *)&new = (*(double *)&old != 0.0) ? (val != 0.0) : 0.0;
    }
    while (InterlockedCompareExchange64((LONG64 *)dest, new, old) != old);
}

static void CDECL _vcomp_atomic_bool_or_r8(double *dest, double val)
{
    LONG64 old, new;
    do
    {
        old = *(LONG64 *)dest;
        *(double *)&new = (*(double *)&old != 0.0) ? *(double *)&old : (val != 0.0);
    }
    while (InterlockedCompareExchange64((LONG64 *)dest, new, old) != old);
}

/* Double reduction dispatcher; bitwise-operator slots fall back to logical
 * OR, as in the r4 table. */
void CDECL _vcomp_reduction_r8(unsigned int flags, double *dest, double val)
{
    static void (CDECL * const funcs[])(double *, double) =
    {
        _vcomp_atomic_add_r8,
        _vcomp_atomic_add_r8,
        _vcomp_atomic_mul_r8,
        _vcomp_atomic_bool_or_r8,
        _vcomp_atomic_bool_or_r8,
        _vcomp_atomic_bool_or_r8,
        _vcomp_atomic_bool_and_r8,
        _vcomp_atomic_bool_or_r8,
    };
    unsigned int op = (flags >> 8) & 0xf;
    op = min(op, ARRAY_SIZE(funcs) - 1);
    funcs[op](dest, val);
}
/* Dynamic thread adjustment is not implemented: always reported disabled. */
int CDECL omp_get_dynamic(void)
{
    TRACE("stub\n");
    return 0;
}

int CDECL omp_get_max_threads(void)
{
    TRACE("()\n");
    return vcomp_max_threads;
}

int CDECL omp_get_nested(void)
{
    TRACE("stub\n");
    return vcomp_nested_fork;
}

/* Stub: always reports a single processor regardless of the real CPU count. */
int CDECL omp_get_num_procs(void)
{
    TRACE("stub\n");
    return 1;
}

/* Size of the current team, or 1 when called outside a parallel region. */
int CDECL omp_get_num_threads(void)
{
    struct vcomp_team_data *team_data = vcomp_init_thread_data()->team;
    TRACE("()\n");
    return team_data ? team_data->num_threads : 1;
}

int CDECL omp_get_thread_num(void)
{
    TRACE("()\n");
    return vcomp_init_thread_data()->thread_num;
}

/* Compiler-internal alias of omp_get_thread_num(). */
int CDECL _vcomp_get_thread_num(void)
{
    TRACE("()\n");
    return vcomp_init_thread_data()->thread_num;
}

/* Time in seconds since "some time in the past".
 * NOTE(review): GetTickCount has ~10-16 ms resolution and wraps after
 * ~49.7 days; callers timing long-running jobs may observe a jump. */
double CDECL omp_get_wtime(void)
{
    return GetTickCount() / 1000.0;
}

void CDECL omp_set_dynamic(int val)
{
    TRACE("(%d): stub\n", val);
}

void CDECL omp_set_nested(int nested)
{
    TRACE("(%d)\n", nested);
    vcomp_nested_fork = (nested != 0);
}

/* Set the default team size; values < 1 are silently ignored. */
void CDECL omp_set_num_threads(int num_threads)
{
    TRACE("(%d)\n", num_threads);
    if (num_threads >= 1)
        vcomp_num_threads = num_threads;
}
/* Memory flush; a no-op here since the Win32 synchronization primitives used
 * elsewhere already provide the required ordering. */
void CDECL _vcomp_flush(void)
{
    TRACE("(): stub\n");
}

/* Block until every thread of the team reaches the barrier.  The last thread
 * to arrive bumps the barrier generation and wakes the sleepers; waiters spin
 * on the generation counter (not the count) so that a stale wakeup from a
 * previous barrier cannot release them early.  A no-op outside a team. */
void CDECL _vcomp_barrier(void)
{
    struct vcomp_team_data *team_data = vcomp_init_thread_data()->team;

    TRACE("()\n");

    if (!team_data)
        return;

    EnterCriticalSection(&vcomp_section);
    if (++team_data->barrier_count >= team_data->num_threads)
    {
        team_data->barrier++;
        team_data->barrier_count = 0;
        WakeAllConditionVariable(&team_data->cond);
    }
    else
    {
        unsigned int barrier = team_data->barrier;
        while (team_data->barrier == barrier)
            SleepConditionVariableCS(&team_data->cond, &vcomp_section, INFINITE);
    }
    LeaveCriticalSection(&vcomp_section);
}

/* Request a team size for the next fork of this thread; values < 1 ignored. */
void CDECL _vcomp_set_num_threads(int num_threads)
{
    TRACE("(%d)\n", num_threads);
    if (num_threads >= 1)
        vcomp_init_thread_data()->fork_threads = num_threads;
}

/* TRUE only for the team's master thread (thread_num == 0). */
int CDECL _vcomp_master_begin(void)
{
    TRACE("()\n");
    return !vcomp_init_thread_data()->thread_num;
}

void CDECL _vcomp_master_end(void)
{
    TRACE("()\n");
    /* nothing to do here */
}
/* Enter a "#pragma omp single" region: exactly one thread per encounter gets
 * TRUE.  Each thread advances its private generation counter; the first one
 * whose counter is ahead of the shared task counter claims the region.  The
 * signed-difference comparison keeps this correct across counter wrap-around.
 * The flags argument (copyprivate etc.) is currently ignored, hence semi-stub. */
int CDECL _vcomp_single_begin(int flags)
{
    struct vcomp_thread_data *thread_data = vcomp_init_thread_data();
    struct vcomp_task_data *task_data = thread_data->task;
    int ret = FALSE;

    TRACE("(%x): semi-stub\n", flags);

    EnterCriticalSection(&vcomp_section);
    thread_data->single++;
    if ((int)(thread_data->single - task_data->single) > 0)
    {
        task_data->single = thread_data->single;
        ret = TRUE;
    }
    LeaveCriticalSection(&vcomp_section);

    return ret;
}

void CDECL _vcomp_single_end(void)
{
    TRACE("()\n");
    /* nothing to do here */
}
/* Initialize a "#pragma omp sections" construct with n sections.  Only the
 * first thread to arrive (generation counter ahead of the shared one, wrap-
 * safe via the signed difference) resets the shared section bookkeeping. */
void CDECL _vcomp_sections_init(int n)
{
    struct vcomp_thread_data *thread_data = vcomp_init_thread_data();
    struct vcomp_task_data *task_data = thread_data->task;

    TRACE("(%d)\n", n);

    EnterCriticalSection(&vcomp_section);
    thread_data->section++;
    if ((int)(thread_data->section - task_data->section) > 0)
    {
        task_data->section = thread_data->section;
        task_data->num_sections = n;
        task_data->section_index = 0;
    }
    LeaveCriticalSection(&vcomp_section);
}

/* Claim the next unassigned section index, or -1 when all sections of the
 * current construct have been handed out (or this thread's construct is not
 * the current one). */
int CDECL _vcomp_sections_next(void)
{
    struct vcomp_thread_data *thread_data = vcomp_init_thread_data();
    struct vcomp_task_data *task_data = thread_data->task;
    int i = -1;

    TRACE("()\n");

    EnterCriticalSection(&vcomp_section);
    if (thread_data->section == task_data->section &&
        task_data->section_index != task_data->num_sections)
    {
        i = task_data->section_index++;
    }
    LeaveCriticalSection(&vcomp_section);
    return i;
}
/* Compute this thread's [*begin, *end] sub-range of a static (unchunked)
 * for-loop from first to last with the given step.  increment selects the
 * loop direction.  Iterations are divided as evenly as possible; the first
 * `remaining` threads get one extra iteration.  Counters are unsigned on
 * purpose: the subtraction and the *end computation rely on well-defined
 * modular wrap-around. */
void CDECL _vcomp_for_static_simple_init(unsigned int first, unsigned int last, int step,
                                         BOOL increment, unsigned int *begin, unsigned int *end)
{
    unsigned int iterations, per_thread, remaining;
    struct vcomp_thread_data *thread_data = vcomp_init_thread_data();
    struct vcomp_team_data *team_data = thread_data->team;
    int num_threads = team_data ? team_data->num_threads : 1;
    int thread_num = thread_data->thread_num;

    TRACE("(%u, %u, %d, %u, %p, %p)\n", first, last, step, increment, begin, end);

    /* single thread: it owns the whole range */
    if (num_threads == 1)
    {
        *begin = first;
        *end = last;
        return;
    }

    /* invalid step: hand out an empty range (end "before" begin) */
    if (step <= 0)
    {
        *begin = 0;
        *end = increment ? -1 : 1;
        return;
    }

    if (increment)
        iterations = 1 + (last - first) / step;
    else
    {
        iterations = 1 + (first - last) / step;
        step *= -1;  /* normalize to a signed step in loop direction */
    }

    per_thread = iterations / num_threads;
    remaining  = iterations - per_thread * num_threads;

    if (thread_num < remaining)
        per_thread++;                      /* this thread takes one extra iteration */
    else if (per_thread)
        first += remaining * step;         /* skip the extra iterations of earlier threads */
    else
    {
        /* more threads than iterations: give this thread an empty range */
        *begin = first;
        *end = first - step;
        return;
    }

    *begin = first + per_thread * thread_num * step;
    *end = *begin + (per_thread - 1) * step;
}
/* Compute this thread's portion of a statically scheduled chunked loop.
 *
 * first/last    - inclusive loop bounds
 * step          - iteration step (<= 0 yields an empty range)
 * chunksize     - iterations per chunk (values < 1 are clamped to 1)
 * loops         - out: number of chunks this thread executes
 * begin/end     - out: bounds of the thread's first chunk
 * next          - out: stride (in loop-variable units) between successive chunks
 * lastchunk     - out: start of the loop's final chunk
 *
 * begin may be NULL; in that case results are written to dummy locals
 * (lastchunk is redirected together with begin, matching the observed
 * native calling pattern). */
void CDECL _vcomp_for_static_init(int first, int last, int step, int chunksize, unsigned int *loops,
                                  int *begin, int *end, int *next, int *lastchunk)
{
    unsigned int iterations, num_chunks, per_thread, remaining;
    struct vcomp_thread_data *thread_data = vcomp_init_thread_data();
    struct vcomp_team_data *team_data = thread_data->team;
    int num_threads = team_data ? team_data->num_threads : 1;
    int thread_num = thread_data->thread_num;
    int no_begin, no_lastchunk;

    TRACE("(%d, %d, %d, %d, %p, %p, %p, %p, %p)\n",
          first, last, step, chunksize, loops, begin, end, next, lastchunk);

    if (!begin)
    {
        begin = &no_begin;
        lastchunk = &no_lastchunk;
    }

    /* Single thread with non-unit chunksize: hand the whole range over in one go. */
    if (num_threads == 1 && chunksize != 1)
    {
        *loops = 1;
        *begin = first;
        *end = last;
        *next = 0;
        *lastchunk = first;
        return;
    }

    /* Degenerate one-iteration range: only thread 0 runs it. */
    if (first == last)
    {
        *loops = !thread_num;
        if (!thread_num)
        {
            *begin = first;
            *end = last;
            *next = 0;
            *lastchunk = first;
        }
        return;
    }

    if (step <= 0)
    {
        *loops = 0;
        return;
    }

    /* Normalize to a positive iteration count; for descending ranges the
     * step is negated so the begin/end arithmetic below works unchanged. */
    if (first < last)
        iterations = 1 + (last - first) / step;
    else
    {
        iterations = 1 + (first - last) / step;
        step *= -1;
    }

    if (chunksize < 1)
        chunksize = 1;

    /* DWORD64 intermediate avoids overflow of iterations + chunksize - 1. */
    num_chunks = ((DWORD64)iterations + chunksize - 1) / chunksize;
    per_thread = num_chunks / num_threads;
    remaining = num_chunks - per_thread * num_threads;

    /* The first 'remaining' threads get one extra chunk; chunks are dealt
     * round-robin, hence the num_threads stride in *next. */
    *loops = per_thread + (thread_num < remaining);
    *begin = first + thread_num * chunksize * step;
    *end = *begin + (chunksize - 1) * step;
    *next = chunksize * num_threads * step;
    *lastchunk = first + (num_chunks - 1) * chunksize * step;
}
/* Finish a statically scheduled loop; no per-loop state needs tearing down. */
void CDECL _vcomp_for_static_end(void)
{
    TRACE("()\n");
    /* nothing to do here */
}
/* Initialize a dynamically scheduled loop for the calling thread.
 *
 * flags combines a schedule type (STATIC/CHUNKED/GUIDED) with the optional
 * VCOMP_DYNAMIC_FLAGS_INCREMENT bit selecting an ascending range.
 * For STATIC the thread's fixed [dynamic_begin, dynamic_end] slice is
 * precomputed here; for CHUNKED/GUIDED the shared per-task descriptor is
 * (re)initialized under vcomp_section and chunks are claimed later in
 * _vcomp_for_dynamic_next(). A step <= 0 disables the loop for this thread. */
void CDECL _vcomp_for_dynamic_init(unsigned int flags, unsigned int first, unsigned int last,
                                   int step, unsigned int chunksize)
{
    unsigned int iterations, per_thread, remaining;
    struct vcomp_thread_data *thread_data = vcomp_init_thread_data();
    struct vcomp_team_data *team_data = thread_data->team;
    struct vcomp_task_data *task_data = thread_data->task;
    int num_threads = team_data ? team_data->num_threads : 1;
    int thread_num = thread_data->thread_num;
    unsigned int type = flags & ~VCOMP_DYNAMIC_FLAGS_INCREMENT;

    TRACE("(%u, %u, %u, %d, %u)\n", flags, first, last, step, chunksize);

    if (step <= 0)
    {
        /* mark "no active dynamic loop" so _vcomp_for_dynamic_next returns 0 */
        thread_data->dynamic_type = 0;
        return;
    }

    /* Normalize to an iteration count; for descending loops negate the step. */
    if (flags & VCOMP_DYNAMIC_FLAGS_INCREMENT)
        iterations = 1 + (last - first) / step;
    else
    {
        iterations = 1 + (first - last) / step;
        step *= -1;
    }

    if (type == VCOMP_DYNAMIC_FLAGS_STATIC)
    {
        per_thread = iterations / num_threads;
        remaining = iterations - per_thread * num_threads;

        if (thread_num < remaining)
            per_thread++;                   /* early threads take one extra iteration */
        else if (per_thread)
            first += remaining * step;      /* skip over the extra iterations */
        else
        {
            /* more threads than iterations: nothing for this thread */
            thread_data->dynamic_type = 0;
            return;
        }

        thread_data->dynamic_type = VCOMP_DYNAMIC_FLAGS_STATIC;
        thread_data->dynamic_begin = first + per_thread * thread_num * step;
        thread_data->dynamic_end = thread_data->dynamic_begin + (per_thread - 1) * step;
    }
    else
    {
        if (type != VCOMP_DYNAMIC_FLAGS_CHUNKED &&
            type != VCOMP_DYNAMIC_FLAGS_GUIDED)
        {
            FIXME("unsupported flags %u\n", flags);
            type = VCOMP_DYNAMIC_FLAGS_GUIDED;
        }

        EnterCriticalSection(&vcomp_section);
        thread_data->dynamic++;
        thread_data->dynamic_type = type;
        /* Only the first thread to reach a newer loop generation publishes
         * the shared descriptor; the signed difference handles wrap-around. */
        if ((int)(thread_data->dynamic - task_data->dynamic) > 0)
        {
            task_data->dynamic = thread_data->dynamic;
            task_data->dynamic_first = first;
            task_data->dynamic_last = last;
            task_data->dynamic_iterations = iterations;
            task_data->dynamic_step = step;
            task_data->dynamic_chunksize = chunksize;
        }
        LeaveCriticalSection(&vcomp_section);
    }
}
/* Claim the next range of a dynamically scheduled loop.
 *
 * Returns 1 and fills *begin/*end while work remains, 0 once the loop is
 * exhausted (or no dynamic loop is active for this thread).
 * STATIC schedules hand out the precomputed slice exactly once; CHUNKED and
 * GUIDED schedules take chunks from the shared task descriptor under
 * vcomp_section. */
int CDECL _vcomp_for_dynamic_next(unsigned int *begin, unsigned int *end)
{
    struct vcomp_thread_data *thread_data = vcomp_init_thread_data();
    struct vcomp_task_data *task_data = thread_data->task;
    struct vcomp_team_data *team_data = thread_data->team;
    int num_threads = team_data ? team_data->num_threads : 1;

    TRACE("(%p, %p)\n", begin, end);

    if (thread_data->dynamic_type == VCOMP_DYNAMIC_FLAGS_STATIC)
    {
        *begin = thread_data->dynamic_begin;
        *end = thread_data->dynamic_end;
        thread_data->dynamic_type = 0;  /* slice handed out once only */
        return 1;
    }
    else if (thread_data->dynamic_type == VCOMP_DYNAMIC_FLAGS_CHUNKED ||
             thread_data->dynamic_type == VCOMP_DYNAMIC_FLAGS_GUIDED)
    {
        unsigned int iterations = 0;
        EnterCriticalSection(&vcomp_section);
        /* Only consume work if the shared descriptor still belongs to the
         * same loop generation this thread initialized. */
        if (thread_data->dynamic == task_data->dynamic &&
            task_data->dynamic_iterations != 0)
        {
            iterations = min(task_data->dynamic_iterations, task_data->dynamic_chunksize);
            /* GUIDED: while plenty of work remains, grab a 1/num_threads
             * share instead of a fixed chunk (shrinking chunk sizes). */
            if (thread_data->dynamic_type == VCOMP_DYNAMIC_FLAGS_GUIDED &&
                task_data->dynamic_iterations > num_threads * task_data->dynamic_chunksize)
            {
                iterations = (task_data->dynamic_iterations + num_threads - 1) / num_threads;
            }

            *begin = task_data->dynamic_first;
            *end = task_data->dynamic_first + (iterations - 1) * task_data->dynamic_step;
            task_data->dynamic_iterations -= iterations;
            task_data->dynamic_first += iterations * task_data->dynamic_step;
            /* Last chunk: pin *end to the exact loop bound. */
            if (!task_data->dynamic_iterations)
                *end = task_data->dynamic_last;
        }
        LeaveCriticalSection(&vcomp_section);
        return iterations != 0;
    }

    return 0;
}
/* Return nonzero when the calling thread is inside an active parallel region. */
int CDECL omp_in_parallel(void)
{
    TRACE("()\n");
    return vcomp_init_thread_data()->parallel;
}
/* Worker thread entry point used by _vcomp_fork.
 *
 * Loops executing team work whenever thread_data->team is set, then parks
 * itself on the idle list. A worker that stays idle for 5 seconds without
 * being assigned a team exits, unloading its module reference via
 * FreeLibraryAndExitThread. */
static DWORD WINAPI _vcomp_fork_worker(void *param)
{
    struct vcomp_thread_data *thread_data = param;
    vcomp_set_thread_data(thread_data);

    TRACE("starting worker thread for %p\n", thread_data);

    EnterCriticalSection(&vcomp_section);
    for (;;)
    {
        struct vcomp_team_data *team = thread_data->team;
        if (team != NULL)
        {
            /* run the team's work outside the lock */
            LeaveCriticalSection(&vcomp_section);
            _vcomp_fork_call_wrapper(team->wrapper, team->nargs, ptr_from_va_list(team->valist));
            EnterCriticalSection(&vcomp_section);

            /* move back to the idle pool and wake the forking thread when
             * the whole team has finished */
            thread_data->team = NULL;
            list_remove(&thread_data->entry);
            list_add_tail(&vcomp_idle_threads, &thread_data->entry);
            if (++team->finished_threads >= team->num_threads)
                WakeAllConditionVariable(&team->cond);
        }

        /* wait for new work; give up after a 5s idle timeout unless a team
         * was assigned while we were waking up */
        if (!SleepConditionVariableCS(&thread_data->cond, &vcomp_section, 5000) &&
            GetLastError() == ERROR_TIMEOUT && !thread_data->team)
        {
            break;
        }
    }
    list_remove(&thread_data->entry);
    LeaveCriticalSection(&vcomp_section);

    TRACE("terminating worker thread for %p\n", thread_data);

    HeapFree(GetProcessHeap(), 0, thread_data);
    vcomp_set_thread_data(NULL);
    FreeLibraryAndExitThread(vcomp_module, 0);
    return 0;  /* not reached; FreeLibraryAndExitThread does not return */
}
/* Start a parallel region: run 'wrapper' with 'nargs' variadic arguments on
 * up to num_threads threads, including the calling thread (thread 0).
 *
 * ifval FALSE forces serial execution; nested parallelism also degrades to
 * a single thread unless vcomp_nested_fork is enabled. Idle workers from
 * vcomp_idle_threads are reused before new threads are spawned. The caller
 * blocks until every team member has finished. */
void WINAPIV _vcomp_fork(BOOL ifval, int nargs, void *wrapper, ...)
{
    struct vcomp_thread_data *prev_thread_data = vcomp_init_thread_data();
    struct vcomp_thread_data thread_data;
    struct vcomp_team_data team_data;
    struct vcomp_task_data task_data;
    int num_threads;

    TRACE("(%d, %d, %p, ...)\n", ifval, nargs, wrapper);

    /* nested regions run serially unless nesting was explicitly enabled */
    if (prev_thread_data->parallel && !vcomp_nested_fork)
        ifval = FALSE;

    if (!ifval)
        num_threads = 1;
    else if (prev_thread_data->fork_threads)
        num_threads = prev_thread_data->fork_threads;  /* omp_set_num_threads-style override */
    else
        num_threads = vcomp_num_threads;

    InitializeConditionVariable(&team_data.cond);
    team_data.num_threads = 1;          /* the forking thread itself */
    team_data.finished_threads = 0;
    team_data.nargs = nargs;
    team_data.wrapper = wrapper;
    __ms_va_start(team_data.valist, wrapper);
    team_data.barrier = 0;
    team_data.barrier_count = 0;

    task_data.single = 0;
    task_data.section = 0;
    task_data.dynamic = 0;

    /* thread 0 (the caller) runs on a stack-allocated descriptor */
    thread_data.team = &team_data;
    thread_data.task = &task_data;
    thread_data.thread_num = 0;
    thread_data.parallel = ifval || prev_thread_data->parallel;
    thread_data.fork_threads = 0;
    thread_data.single = 1;
    thread_data.section = 1;
    thread_data.dynamic = 1;
    thread_data.dynamic_type = 0;
    list_init(&thread_data.entry);
    InitializeConditionVariable(&thread_data.cond);

    if (num_threads > 1)
    {
        struct list *ptr;
        EnterCriticalSection(&vcomp_section);

        /* reuse existing threads (if any) */
        while (team_data.num_threads < num_threads && (ptr = list_head(&vcomp_idle_threads)))
        {
            struct vcomp_thread_data *data = LIST_ENTRY(ptr, struct vcomp_thread_data, entry);
            data->team = &team_data;
            data->task = &task_data;
            data->thread_num = team_data.num_threads++;
            data->parallel = thread_data.parallel;
            data->fork_threads = 0;
            data->single = 1;
            data->section = 1;
            data->dynamic = 1;
            data->dynamic_type = 0;
            list_remove(&data->entry);
            list_add_tail(&thread_data.entry, &data->entry);
            WakeAllConditionVariable(&data->cond);
        }

        /* spawn additional threads */
        while (team_data.num_threads < num_threads)
        {
            struct vcomp_thread_data *data;
            HMODULE module;
            HANDLE thread;

            data = HeapAlloc(GetProcessHeap(), 0, sizeof(*data));
            if (!data) break;  /* best effort: run with fewer threads */

            data->team = &team_data;
            data->task = &task_data;
            data->thread_num = team_data.num_threads;
            data->parallel = thread_data.parallel;
            data->fork_threads = 0;
            data->single = 1;
            data->section = 1;
            data->dynamic = 1;
            data->dynamic_type = 0;
            InitializeConditionVariable(&data->cond);

            thread = CreateThread(NULL, 0, _vcomp_fork_worker, data, 0, NULL);
            if (!thread)
            {
                HeapFree(GetProcessHeap(), 0, data);
                break;
            }

            /* keep the module loaded while the worker runs; released by the
             * worker via FreeLibraryAndExitThread */
            GetModuleHandleExW(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS,
                               (const WCHAR *)vcomp_module, &module);
            team_data.num_threads++;
            list_add_tail(&thread_data.entry, &data->entry);
            CloseHandle(thread);
        }

        LeaveCriticalSection(&vcomp_section);
    }

    /* thread 0 participates in the work itself */
    vcomp_set_thread_data(&thread_data);
    _vcomp_fork_call_wrapper(team_data.wrapper, team_data.nargs, ptr_from_va_list(team_data.valist));
    vcomp_set_thread_data(prev_thread_data);
    prev_thread_data->fork_threads = 0;

    if (team_data.num_threads > 1)
    {
        EnterCriticalSection(&vcomp_section);

        /* wait until all team members signalled completion */
        team_data.finished_threads++;
        while (team_data.finished_threads < team_data.num_threads)
            SleepConditionVariableCS(&team_data.cond, &vcomp_section, INFINITE);

        LeaveCriticalSection(&vcomp_section);
        assert(list_empty(&thread_data.entry));
    }

    __ms_va_end(team_data.valist);
}
1577 static CRITICAL_SECTION *alloc_critsect(void)
1579 CRITICAL_SECTION *critsect;
1580 if (!(critsect = HeapAlloc(GetProcessHeap(), 0, sizeof(*critsect))))
1582 ERR("could not allocate critical section\n");
1583 ExitProcess(1);
1586 InitializeCriticalSection(critsect);
1587 critsect->DebugInfo->Spare[0] = (DWORD_PTR)(__FILE__ ": critsect");
1588 return critsect;
1591 static void destroy_critsect(CRITICAL_SECTION *critsect)
1593 if (!critsect) return;
1594 critsect->DebugInfo->Spare[0] = 0;
1595 DeleteCriticalSection(critsect);
1596 HeapFree(GetProcessHeap(), 0, critsect);
1599 void CDECL omp_init_lock(omp_lock_t *lock)
1601 TRACE("(%p)\n", lock);
1602 *lock = alloc_critsect();
/* Destroy a simple OpenMP lock created by omp_init_lock. */
void CDECL omp_destroy_lock(omp_lock_t *lock)
{
    TRACE("(%p)\n", lock);
    destroy_critsect(*lock);
}
/* Acquire a simple (non-nestable) OpenMP lock.
 * Re-acquiring a lock already held by the calling thread is an error;
 * matching native behavior, the process is terminated in that case. */
void CDECL omp_set_lock(omp_lock_t *lock)
{
    TRACE("(%p)\n", lock);

    if (RtlIsCriticalSectionLockedByThread(*lock))
    {
        ERR("omp_set_lock called while holding lock %p\n", *lock);
        ExitProcess(1);
    }

    EnterCriticalSection(*lock);
}
/* Release a simple OpenMP lock held by the calling thread. */
void CDECL omp_unset_lock(omp_lock_t *lock)
{
    TRACE("(%p)\n", lock);
    LeaveCriticalSection(*lock);
}
1630 int CDECL omp_test_lock(omp_lock_t *lock)
1632 TRACE("(%p)\n", lock);
1634 if (RtlIsCriticalSectionLockedByThread(*lock))
1635 return 0;
1637 return TryEnterCriticalSection(*lock);
/* Acquire a nestable OpenMP lock; recursive acquisition by the owner is allowed. */
void CDECL omp_set_nest_lock(omp_nest_lock_t *lock)
{
    TRACE("(%p)\n", lock);
    EnterCriticalSection(*lock);
}
/* Release one level of a nestable OpenMP lock. */
void CDECL omp_unset_nest_lock(omp_nest_lock_t *lock)
{
    TRACE("(%p)\n", lock);
    LeaveCriticalSection(*lock);
}
1652 int CDECL omp_test_nest_lock(omp_nest_lock_t *lock)
1654 TRACE("(%p)\n", lock);
1655 return TryEnterCriticalSection(*lock) ? (*lock)->RecursionCount : 0;
/* Enter a lazily allocated critical section (used for named critical regions).
 * The section is created on first use; a compare-exchange ensures exactly one
 * allocation wins when several threads race to initialize it. */
void CDECL _vcomp_enter_critsect(CRITICAL_SECTION **critsect)
{
    TRACE("(%p)\n", critsect);

    if (!*critsect)
    {
        CRITICAL_SECTION *new_critsect = alloc_critsect();
        if (InterlockedCompareExchangePointer((void **)critsect, new_critsect, NULL) != NULL)
            destroy_critsect(new_critsect); /* someone beat us to it */
    }

    EnterCriticalSection(*critsect);
}
/* Leave a critical section previously entered via _vcomp_enter_critsect. */
void CDECL _vcomp_leave_critsect(CRITICAL_SECTION *critsect)
{
    TRACE("(%p)\n", critsect);
    LeaveCriticalSection(critsect);
}
/* Number of loop iterations for [start, end] with the given step.
 * range_offset is step_sign * !end_included, i.e. it shrinks the span by one
 * step when the end bound is exclusive. For negative steps both the span and
 * the step are negated so the division operates on non-negative values. */
static unsigned int get_step_count(int start, int end, int range_offset, int step)
{
    int span = end - start + step - range_offset;

    if (step < 0)
        return (unsigned)-span / -step;
    return (unsigned)span / step;
}
/* Per-thread body executed by C2VectParallel via _vcomp_fork.
 *
 * Splits the range [start, end] (end inclusive iff end_included) among the
 * team. With dynamic_distribution, threads repeatedly claim the next slice
 * through a lock-free compare-exchange on *dynamic_start; otherwise each
 * thread computes its fixed slice from its thread number. The user callback
 * receives (slice_start, slice_end) followed by the caller's extra args. */
static void CDECL c2vectparallel_wrapper(int start, int end, int step, int end_included, BOOL dynamic_distribution,
        int volatile *dynamic_start, void *function, int nargs, __ms_va_list valist)
{
    void *wrapper_args[MAX_VECT_PARALLEL_CALLBACK_ARGS];
    unsigned int step_count, steps_per_call, remainder;
    int thread_count = omp_get_num_threads();
    int curr_start, curr_end, range_offset;
    int thread = _vcomp_get_thread_num();
    int step_sign;

    /* extra user arguments start at slot 2; slots 0/1 carry the slice bounds */
    copy_va_list_data(&wrapper_args[2], valist, nargs - 2);

    step_sign = step > 0 ? 1 : -1;
    range_offset = step_sign * !end_included;

    if (dynamic_distribution)
    {
        int next_start, new_start, end_value;

        start = *dynamic_start;
        /* value of *dynamic_start that means "range exhausted" */
        end_value = end + !!end_included * step;
        while (start != end_value)
        {
            step_count = get_step_count(start, end, range_offset, step);

            /* take roughly 1/thread_count of the remaining iterations */
            curr_end = start + (step_count + thread_count - 1) / thread_count * step
                    + range_offset;

            if ((curr_end - end) * step_sign > 0)
            {
                /* slice would overshoot: clamp to the end of the range */
                next_start = end_value;
                curr_end = end;
            }
            else
            {
                next_start = curr_end - range_offset;
                curr_end -= step;
            }

            /* claim the slice; on a lost race restart from the winner's value */
            if ((new_start = InterlockedCompareExchange(dynamic_start, next_start, start)) != start)
            {
                start = new_start;
                continue;
            }

            wrapper_args[0] = (void *)(ULONG_PTR)start;
            wrapper_args[1] = (void *)(ULONG_PTR)curr_end;
            _vcomp_fork_call_wrapper(function, nargs, wrapper_args);
            start = *dynamic_start;
        }
        return;
    }

    step_count = get_step_count(start, end, range_offset, step);

    /* According to the tests native vcomp still makes extra calls
     * with empty range from excessive threads under certain conditions
     * for unclear reason. */
    if (thread >= step_count && (end_included || (step != 1 && step != -1)))
        return;

    /* static split: first 'remainder' threads run one extra iteration */
    steps_per_call = step_count / thread_count;
    remainder = step_count % thread_count;

    if (thread < remainder)
    {
        curr_start = thread * (steps_per_call + 1);
        curr_end = curr_start + steps_per_call + 1;
    }
    else if (thread < step_count)
    {
        curr_start = remainder + steps_per_call * thread;
        curr_end = curr_start + steps_per_call;
    }
    else
    {
        /* surplus thread: empty range call (matches native, see note above) */
        curr_start = curr_end = 0;
    }

    /* convert iteration indices back to loop-variable values */
    curr_start = start + curr_start * step;
    curr_end = start + (curr_end - 1) * step + range_offset;

    wrapper_args[0] = (void *)(ULONG_PTR)curr_start;
    wrapper_args[1] = (void *)(ULONG_PTR)curr_end;
    _vcomp_fork_call_wrapper(function, nargs, wrapper_args);
}
/* Auto-parallelizer entry point emitted by MSVC for vectorized loops.
 *
 * Runs 'function' over [start, end] (end inclusive iff end_included) on up
 * to thread_count threads, optionally with dynamic work distribution.
 * Degenerate or overflow-prone ranges — and thread_count < 0 — fall back to
 * a single direct call, matching native behavior. */
void WINAPIV C2VectParallel(int start, int end, int step, BOOL end_included, int thread_count,
        BOOL dynamic_distribution, void *function, int nargs, ...)
{
    struct vcomp_thread_data *thread_data;
    int volatile dynamic_start;
    int prev_thread_count;
    __ms_va_list valist;

    TRACE("start %d, end %d, step %d, end_included %d, thread_count %d, dynamic_distribution %#x,"
            " function %p, nargs %d.\n", start, end, step, end_included, thread_count,
            dynamic_distribution, function, nargs);

    if (nargs > MAX_VECT_PARALLEL_CALLBACK_ARGS)
    {
        FIXME("Number of arguments %u exceeds supported maximum %u"
                " (not calling the loop code, expect problems).\n",
                nargs, MAX_VECT_PARALLEL_CALLBACK_ARGS);
        return;
    }

    __ms_va_start(valist, nargs);

    /* This expression can result in integer overflow. According to the tests,
     * native vcomp runs the function as a single thread both for empty range
     * and (end - start) not fitting the integer range. */
    if ((step > 0 && end < start) || (step < 0 && end > start)
            || (end - start) / step < 2 || thread_count < 0)
    {
        void *wrapper_args[MAX_VECT_PARALLEL_CALLBACK_ARGS];

        wrapper_args[0] = (void *)(ULONG_PTR)start;
        wrapper_args[1] = (void *)(ULONG_PTR)end;
        copy_va_list_data(&wrapper_args[2], valist, nargs - 2);
        _vcomp_fork_call_wrapper(function, nargs, wrapper_args);
        __ms_va_end(valist);
        return;
    }

    thread_data = vcomp_init_thread_data();
    /* temporarily override the team size for the fork below */
    prev_thread_count = thread_data->fork_threads;
    thread_data->fork_threads = thread_count;

    dynamic_start = start;

    _vcomp_fork(TRUE, 9, c2vectparallel_wrapper, start, end, step, end_included, dynamic_distribution,
            &dynamic_start, function, nargs, valist);

    thread_data->fork_threads = prev_thread_count;
    __ms_va_end(valist);
}
/* DLL entry point: set up/tear down per-process and per-thread vcomp state.
 * On attach a TLS slot is allocated and the default thread count is taken
 * from the number of processors; on detach the TLS slot and any per-thread
 * data are released (skipped at process exit when 'reserved' is set). */
BOOL WINAPI DllMain(HINSTANCE instance, DWORD reason, LPVOID reserved)
{
    TRACE("(%p, %d, %p)\n", instance, reason, reserved);

    switch (reason)
    {
        case DLL_PROCESS_ATTACH:
        {
            SYSTEM_INFO sysinfo;

            if ((vcomp_context_tls = TlsAlloc()) == TLS_OUT_OF_INDEXES)
            {
                ERR("Failed to allocate TLS index\n");
                return FALSE;
            }

            GetSystemInfo(&sysinfo);
            vcomp_module = instance;
            vcomp_max_threads = sysinfo.dwNumberOfProcessors;
            vcomp_num_threads = sysinfo.dwNumberOfProcessors;
            break;
        }

        case DLL_PROCESS_DETACH:
        {
            /* reserved != NULL means process termination: skip cleanup */
            if (reserved) break;
            if (vcomp_context_tls != TLS_OUT_OF_INDEXES)
            {
                vcomp_free_thread_data();
                TlsFree(vcomp_context_tls);
            }
            break;
        }

        case DLL_THREAD_DETACH:
        {
            vcomp_free_thread_data();
            break;
        }
    }

    return TRUE;
}