/*
 * vcomp implementation
 *
 * Copyright 2011 Austin English
 * Copyright 2012 Dan Kegel
 * Copyright 2015-2016 Sebastian Lackner
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
 */

#include <stdarg.h>
#include <assert.h>

#include "windef.h"
#include "winbase.h"
#include "winternl.h"
#include "wine/debug.h"
#include "wine/list.h"
#include "wine/asm.h"

WINE_DEFAULT_DEBUG_CHANNEL(vcomp);

#define MAX_VECT_PARALLEL_CALLBACK_ARGS 128

typedef CRITICAL_SECTION *omp_lock_t;
typedef CRITICAL_SECTION *omp_nest_lock_t;

static struct list vcomp_idle_threads = LIST_INIT(vcomp_idle_threads);
static DWORD   vcomp_context_tls = TLS_OUT_OF_INDEXES;
static HMODULE vcomp_module;
static int     vcomp_max_threads;
static int     vcomp_num_threads;
static int     vcomp_num_procs;
static BOOL    vcomp_nested_fork = FALSE;

static RTL_CRITICAL_SECTION vcomp_section;
static RTL_CRITICAL_SECTION_DEBUG critsect_debug =
{
    0, 0, &vcomp_section,
    { &critsect_debug.ProcessLocksList, &critsect_debug.ProcessLocksList },
      0, 0, { (DWORD_PTR)(__FILE__ ": vcomp_section") }
};
static RTL_CRITICAL_SECTION vcomp_section = { &critsect_debug, -1, 0, 0, 0, 0 };

#define VCOMP_DYNAMIC_FLAGS_STATIC      0x01
#define VCOMP_DYNAMIC_FLAGS_CHUNKED     0x02
#define VCOMP_DYNAMIC_FLAGS_GUIDED      0x03
#define VCOMP_DYNAMIC_FLAGS_INCREMENT   0x40

struct vcomp_thread_data
{
    struct vcomp_team_data  *team;
    struct vcomp_task_data  *task;
    int                     thread_num;
    BOOL                    parallel;
    int                     fork_threads;

    /* only used for concurrent tasks */
    struct list             entry;
    CONDITION_VARIABLE      cond;

    /* single */
    unsigned int            single;

    /* section */
    unsigned int            section;

    /* dynamic */
    unsigned int            dynamic;
    unsigned int            dynamic_type;
    unsigned int            dynamic_begin;
    unsigned int            dynamic_end;
};

struct vcomp_team_data
{
    CONDITION_VARIABLE      cond;
    int                     num_threads;
    int                     finished_threads;

    /* callback arguments */
    int                     nargs;
    void                    *wrapper;
    va_list                 valist;

    /* barrier */
    unsigned int            barrier;
    int                     barrier_count;
};

struct vcomp_task_data
{
    /* single */
    unsigned int            single;

    /* section */
    unsigned int            section;
    int                     num_sections;
    int                     section_index;

    /* dynamic */
    unsigned int            dynamic;
    unsigned int            dynamic_first;
    unsigned int            dynamic_last;
    unsigned int            dynamic_iterations;
    int                     dynamic_step;
    unsigned int            dynamic_chunksize;
};

static void **ptr_from_va_list(va_list valist)
{
    return *(void ***)&valist;
}

static void copy_va_list_data(void **args, va_list valist, int args_count)
{
    int i;

    for (i = 0; i < args_count; ++i)
        args[i] = va_arg(valist, void *);
}
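
/* _vcomp_fork_call_wrapper() invokes a compiler-generated callback with an
 * argument array that originally arrived through varargs.  Conceptually it
 * does the following, except that the argument count is only known at run
 * time, so each architecture below needs a few lines of assembly to build
 * the call frame by hand (minimal sketch for the two-argument case):
 *
 *     void (CDECL *fn)(void *, void *) = wrapper;
 *     fn(args[0], args[1]);
 */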

#if defined(__i386__)

extern void CDECL _vcomp_fork_call_wrapper(void *wrapper, int nargs, void **args);
__ASM_GLOBAL_FUNC( _vcomp_fork_call_wrapper,
                   "pushl %ebp\n\t"
                   __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t")
                   __ASM_CFI(".cfi_rel_offset %ebp,0\n\t")
                   "movl %esp,%ebp\n\t"
                   __ASM_CFI(".cfi_def_cfa_register %ebp\n\t")
                   "pushl %esi\n\t"
                   __ASM_CFI(".cfi_rel_offset %esi,-4\n\t")
                   "pushl %edi\n\t"
                   __ASM_CFI(".cfi_rel_offset %edi,-8\n\t")
                   "movl 12(%ebp),%edx\n\t"
                   "movl %esp,%edi\n\t"
                   "shll $2,%edx\n\t"
                   "jz 1f\n\t"
                   "subl %edx,%edi\n\t"
                   "andl $~15,%edi\n\t"
                   "movl %edi,%esp\n\t"
                   "movl 12(%ebp),%ecx\n\t"
                   "movl 16(%ebp),%esi\n\t"
                   "cld\n\t"
                   "rep; movsl\n"
                   "1:\tcall *8(%ebp)\n\t"
                   "leal -8(%ebp),%esp\n\t"
                   "popl %edi\n\t"
                   __ASM_CFI(".cfi_same_value %edi\n\t")
                   "popl %esi\n\t"
                   __ASM_CFI(".cfi_same_value %esi\n\t")
                   "popl %ebp\n\t"
                   __ASM_CFI(".cfi_def_cfa %esp,4\n\t")
                   __ASM_CFI(".cfi_same_value %ebp\n\t")
                   "ret" )

#elif defined(__x86_64__)

extern void CDECL _vcomp_fork_call_wrapper(void *wrapper, int nargs, void **args);
__ASM_GLOBAL_FUNC( _vcomp_fork_call_wrapper,
                   "pushq %rbp\n\t"
                   __ASM_SEH(".seh_pushreg %rbp\n\t")
                   __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t")
                   __ASM_CFI(".cfi_rel_offset %rbp,0\n\t")
                   "movq %rsp,%rbp\n\t"
                   __ASM_SEH(".seh_setframe %rbp,0\n\t")
                   __ASM_CFI(".cfi_def_cfa_register %rbp\n\t")
                   "pushq %rsi\n\t"
                   __ASM_SEH(".seh_pushreg %rsi\n\t")
                   __ASM_CFI(".cfi_rel_offset %rsi,-8\n\t")
                   "pushq %rdi\n\t"
                   __ASM_SEH(".seh_pushreg %rdi\n\t")
                   __ASM_SEH(".seh_endprologue\n\t")
                   __ASM_CFI(".cfi_rel_offset %rdi,-16\n\t")
                   "movq %rcx,%rax\n\t"
                   "movq $4,%rcx\n\t"
                   "cmp %rcx,%rdx\n\t"
                   "cmovgq %rdx,%rcx\n\t"
                   "leaq 0(,%rcx,8),%rdx\n\t"
                   "subq %rdx,%rsp\n\t"
                   "andq $~15,%rsp\n\t"
                   "movq %rsp,%rdi\n\t"
                   "movq %r8,%rsi\n\t"
                   "rep; movsq\n\t"
                   "movq 0(%rsp),%rcx\n\t"
                   "movq 8(%rsp),%rdx\n\t"
                   "movq 16(%rsp),%r8\n\t"
                   "movq 24(%rsp),%r9\n\t"
                   "callq *%rax\n\t"
                   "leaq -16(%rbp),%rsp\n\t"
                   "popq %rdi\n\t"
                   __ASM_CFI(".cfi_same_value %rdi\n\t")
                   "popq %rsi\n\t"
                   __ASM_CFI(".cfi_same_value %rsi\n\t")
                   __ASM_CFI(".cfi_def_cfa_register %rsp\n\t")
                   "popq %rbp\n\t"
                   __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t")
                   __ASM_CFI(".cfi_same_value %rbp\n\t")
                   "ret")

#elif defined(__arm__)

extern void CDECL _vcomp_fork_call_wrapper(void *wrapper, int nargs, void **args);
__ASM_GLOBAL_FUNC( _vcomp_fork_call_wrapper,
                   "push {r4, r5, LR}\n\t"
                   "mov r4, r0\n\t"
                   "mov r5, SP\n\t"
                   "lsl r3, r1, #2\n\t"
                   "cmp r3, #0\n\t"
                   "beq 5f\n\t"
                   "sub SP, SP, r3\n\t"
                   "tst r1, #1\n\t"
                   "it eq\n\t"
                   "subeq SP, SP, #4\n\t"
                   "1:\tsub r3, r3, #4\n\t"
                   "ldr r0, [r2, r3]\n\t"
                   "str r0, [SP, r3]\n\t"
                   "cmp r3, #0\n\t"
                   "bgt 1b\n\t"
                   "cmp r1, #1\n\t"
                   "bgt 2f\n\t"
                   "pop {r0}\n\t"
                   "b 5f\n\t"
                   "2:\tcmp r1, #2\n\t"
                   "bgt 3f\n\t"
                   "pop {r0-r1}\n\t"
                   "b 5f\n\t"
                   "3:\tcmp r1, #3\n\t"
                   "bgt 4f\n\t"
                   "pop {r0-r2}\n\t"
                   "b 5f\n\t"
                   "4:\tpop {r0-r3}\n\t"
                   "5:\tblx r4\n\t"
                   "mov SP, r5\n\t"
                   "pop {r4, r5, PC}" )

#elif defined(__aarch64__)

extern void CDECL _vcomp_fork_call_wrapper(void *wrapper, int nargs, void **args);
__ASM_GLOBAL_FUNC( _vcomp_fork_call_wrapper,
                   "stp x29, x30, [SP,#-16]!\n\t"
                   "mov x29, SP\n\t"
                   "mov x9, x0\n\t"
                   "cbz w1, 4f\n\t"
                   "lsl w8, w1, #3\n\t"
                   "cmp w8, #64\n\t"
                   "b.ge 1f\n\t"
                   "mov w8, #64\n"
                   "1:\ttbz w8, #3, 2f\n\t"
                   "add w8, w8, #8\n"
                   "2:\tsub x10, x29, x8\n\t"
                   "mov sp, x10\n"
                   "3:\tldr x0, [x2], #8\n\t"
                   "str x0, [x10], #8\n\t"
                   "subs w1, w1, #1\n\t"
                   "b.ne 3b\n\t"
                   "ldp x0, x1, [sp], #16\n\t"
                   "ldp x2, x3, [sp], #16\n\t"
                   "ldp x4, x5, [sp], #16\n\t"
                   "ldp x6, x7, [sp], #16\n"
                   "4:\tblr x9\n\t"
                   "mov SP, x29\n\t"
                   "ldp x29, x30, [SP], #16\n\t"
                   "ret\n" )

#else

static void CDECL _vcomp_fork_call_wrapper(void *wrapper, int nargs, void **args)
{
    ERR("Not implemented for this architecture\n");
}

#endif

#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))

static inline char interlocked_cmpxchg8(char *dest, char xchg, char compare)
{
    char ret;
    __asm__ __volatile__( "lock; cmpxchgb %2,(%1)"
                          : "=a" (ret) : "r" (dest), "q" (xchg), "0" (compare) : "memory" );
    return ret;
}

static inline short interlocked_cmpxchg16(short *dest, short xchg, short compare)
{
    short ret;
    __asm__ __volatile__( "lock; cmpxchgw %2,(%1)"
                          : "=a" (ret) : "r" (dest), "r" (xchg), "0" (compare) : "memory" );
    return ret;
}

static inline char interlocked_xchg_add8(char *dest, char incr)
{
    char ret;
    __asm__ __volatile__( "lock; xaddb %0,(%1)"
                          : "=q" (ret) : "r" (dest), "0" (incr) : "memory" );
    return ret;
}

static inline short interlocked_xchg_add16(short *dest, short incr)
{
    short ret;
    __asm__ __volatile__( "lock; xaddw %0,(%1)"
                          : "=r" (ret) : "r" (dest), "0" (incr) : "memory" );
    return ret;
}

#else  /* __GNUC__ */

#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1
static inline char interlocked_cmpxchg8(char *dest, char xchg, char compare)
{
    return __sync_val_compare_and_swap(dest, compare, xchg);
}

static inline char interlocked_xchg_add8(char *dest, char incr)
{
    return __sync_fetch_and_add(dest, incr);
}
#else
static char interlocked_cmpxchg8(char *dest, char xchg, char compare)
{
    EnterCriticalSection(&vcomp_section);
    if (*dest == compare) *dest = xchg; else compare = *dest;
    LeaveCriticalSection(&vcomp_section);
    return compare;
}

static char interlocked_xchg_add8(char *dest, char incr)
{
    char ret;
    EnterCriticalSection(&vcomp_section);
    ret = *dest; *dest += incr;
    LeaveCriticalSection(&vcomp_section);
    return ret;
}
#endif

#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2
static inline short interlocked_cmpxchg16(short *dest, short xchg, short compare)
{
    return __sync_val_compare_and_swap(dest, compare, xchg);
}

static inline short interlocked_xchg_add16(short *dest, short incr)
{
    return __sync_fetch_and_add(dest, incr);
}
#else
static short interlocked_cmpxchg16(short *dest, short xchg, short compare)
{
    EnterCriticalSection(&vcomp_section);
    if (*dest == compare) *dest = xchg; else compare = *dest;
    LeaveCriticalSection(&vcomp_section);
    return compare;
}

static short interlocked_xchg_add16(short *dest, short incr)
{
    short ret;
    EnterCriticalSection(&vcomp_section);
    ret = *dest; *dest += incr;
    LeaveCriticalSection(&vcomp_section);
    return ret;
}
#endif

#endif  /* __GNUC__ */

static inline struct vcomp_thread_data *vcomp_get_thread_data(void)
{
    return (struct vcomp_thread_data *)TlsGetValue(vcomp_context_tls);
}

static inline void vcomp_set_thread_data(struct vcomp_thread_data *thread_data)
{
    TlsSetValue(vcomp_context_tls, thread_data);
}

static struct vcomp_thread_data *vcomp_init_thread_data(void)
{
    struct vcomp_thread_data *thread_data = vcomp_get_thread_data();
    struct
    {
        struct vcomp_thread_data thread;
        struct vcomp_task_data   task;
    } *data;

    if (thread_data) return thread_data;
    if (!(data = HeapAlloc(GetProcessHeap(), 0, sizeof(*data))))
    {
        ERR("could not create thread data\n");
        ExitProcess(1);
    }

    data->task.single  = 0;
    data->task.section = 0;
    data->task.dynamic = 0;

    thread_data = &data->thread;
    thread_data->team         = NULL;
    thread_data->task         = &data->task;
    thread_data->thread_num   = 0;
    thread_data->parallel     = FALSE;
    thread_data->fork_threads = 0;
    thread_data->single       = 1;
    thread_data->section      = 1;
    thread_data->dynamic      = 1;
    thread_data->dynamic_type = 0;

    vcomp_set_thread_data(thread_data);
    return thread_data;
}

static void vcomp_free_thread_data(void)
{
    struct vcomp_thread_data *thread_data = vcomp_get_thread_data();
    if (!thread_data) return;

    HeapFree(GetProcessHeap(), 0, thread_data);
    vcomp_set_thread_data(NULL);
}
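
/* The atomic helpers below share a single lock-free pattern: read the old
 * value, compute the new one, then retry with a compare-and-swap until no
 * other thread has modified the destination in between.  A minimal sketch of
 * the pattern for a hypothetical operation op():
 *
 *     do old = *dest;
 *     while (interlocked_cmpxchg8(dest, op(old, val), old) != old);
 */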

void CDECL _vcomp_atomic_add_i1(char *dest, char val)
{
    interlocked_xchg_add8(dest, val);
}

void CDECL _vcomp_atomic_and_i1(char *dest, char val)
{
    char old;
    do old = *dest; while (interlocked_cmpxchg8(dest, old & val, old) != old);
}

void CDECL _vcomp_atomic_div_i1(signed char *dest, signed char val)
{
    signed char old;
    do old = *dest; while ((signed char)interlocked_cmpxchg8((char *)dest, old / val, old) != old);
}

void CDECL _vcomp_atomic_div_ui1(unsigned char *dest, unsigned char val)
{
    unsigned char old;
    do old = *dest; while ((unsigned char)interlocked_cmpxchg8((char *)dest, old / val, old) != old);
}

void CDECL _vcomp_atomic_mul_i1(char *dest, char val)
{
    char old;
    do old = *dest; while (interlocked_cmpxchg8(dest, old * val, old) != old);
}

void CDECL _vcomp_atomic_or_i1(char *dest, char val)
{
    char old;
    do old = *dest; while (interlocked_cmpxchg8(dest, old | val, old) != old);
}

void CDECL _vcomp_atomic_shl_i1(char *dest, unsigned int val)
{
    char old;
    do old = *dest; while (interlocked_cmpxchg8(dest, old << val, old) != old);
}

void CDECL _vcomp_atomic_shr_i1(signed char *dest, unsigned int val)
{
    signed char old;
    do old = *dest; while ((signed char)interlocked_cmpxchg8((char *)dest, old >> val, old) != old);
}

void CDECL _vcomp_atomic_shr_ui1(unsigned char *dest, unsigned int val)
{
    unsigned char old;
    do old = *dest; while ((unsigned char)interlocked_cmpxchg8((char *)dest, old >> val, old) != old);
}

void CDECL _vcomp_atomic_sub_i1(char *dest, char val)
{
    interlocked_xchg_add8(dest, -val);
}

void CDECL _vcomp_atomic_xor_i1(char *dest, char val)
{
    char old;
    do old = *dest; while (interlocked_cmpxchg8(dest, old ^ val, old) != old);
}

static void CDECL _vcomp_atomic_bool_and_i1(char *dest, char val)
{
    char old;
    do old = *dest; while (interlocked_cmpxchg8(dest, old && val, old) != old);
}

static void CDECL _vcomp_atomic_bool_or_i1(char *dest, char val)
{
    char old;
    do old = *dest; while (interlocked_cmpxchg8(dest, old ? old : (val != 0), old) != old);
}

void CDECL _vcomp_reduction_i1(unsigned int flags, char *dest, char val)
{
    static void (CDECL * const funcs[])(char *, char) =
    {
        _vcomp_atomic_add_i1,
        _vcomp_atomic_add_i1,
        _vcomp_atomic_mul_i1,
        _vcomp_atomic_and_i1,
        _vcomp_atomic_or_i1,
        _vcomp_atomic_xor_i1,
        _vcomp_atomic_bool_and_i1,
        _vcomp_atomic_bool_or_i1,
    };
    unsigned int op = (flags >> 8) & 0xf;
    op = min(op, ARRAY_SIZE(funcs) - 1);
    funcs[op](dest, val);
}
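
/* Bits 8-11 of the flags argument passed to the _vcomp_reduction_* functions
 * select an entry in the table above: 0/1 addition (subtraction reduces via
 * addition), 2 multiplication, 3 bitwise and, 4 bitwise or, 5 bitwise xor,
 * 6 logical and, 7 logical or.  A reduction(+:sum) clause on a char variable
 * would therefore arrive here roughly as (illustrative sketch, not actual
 * compiler output):
 *
 *     _vcomp_reduction_i1(0x100, &sum, thread_local_sum);
 */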

void CDECL _vcomp_atomic_add_i2(short *dest, short val)
{
    interlocked_xchg_add16(dest, val);
}

void CDECL _vcomp_atomic_and_i2(short *dest, short val)
{
    short old;
    do old = *dest; while (interlocked_cmpxchg16(dest, old & val, old) != old);
}

void CDECL _vcomp_atomic_div_i2(short *dest, short val)
{
    short old;
    do old = *dest; while (interlocked_cmpxchg16(dest, old / val, old) != old);
}

void CDECL _vcomp_atomic_div_ui2(unsigned short *dest, unsigned short val)
{
    unsigned short old;
    do old = *dest; while ((unsigned short)interlocked_cmpxchg16((short *)dest, old / val, old) != old);
}

void CDECL _vcomp_atomic_mul_i2(short *dest, short val)
{
    short old;
    do old = *dest; while (interlocked_cmpxchg16(dest, old * val, old) != old);
}

void CDECL _vcomp_atomic_or_i2(short *dest, short val)
{
    short old;
    do old = *dest; while (interlocked_cmpxchg16(dest, old | val, old) != old);
}

void CDECL _vcomp_atomic_shl_i2(short *dest, unsigned int val)
{
    short old;
    do old = *dest; while (interlocked_cmpxchg16(dest, old << val, old) != old);
}

void CDECL _vcomp_atomic_shr_i2(short *dest, unsigned int val)
{
    short old;
    do old = *dest; while (interlocked_cmpxchg16(dest, old >> val, old) != old);
}

void CDECL _vcomp_atomic_shr_ui2(unsigned short *dest, unsigned int val)
{
    unsigned short old;
    do old = *dest; while ((unsigned short)interlocked_cmpxchg16((short *)dest, old >> val, old) != old);
}

void CDECL _vcomp_atomic_sub_i2(short *dest, short val)
{
    interlocked_xchg_add16(dest, -val);
}

void CDECL _vcomp_atomic_xor_i2(short *dest, short val)
{
    short old;
    do old = *dest; while (interlocked_cmpxchg16(dest, old ^ val, old) != old);
}

static void CDECL _vcomp_atomic_bool_and_i2(short *dest, short val)
{
    short old;
    do old = *dest; while (interlocked_cmpxchg16(dest, old && val, old) != old);
}

static void CDECL _vcomp_atomic_bool_or_i2(short *dest, short val)
{
    short old;
    do old = *dest; while (interlocked_cmpxchg16(dest, old ? old : (val != 0), old) != old);
}

void CDECL _vcomp_reduction_i2(unsigned int flags, short *dest, short val)
{
    static void (CDECL * const funcs[])(short *, short) =
    {
        _vcomp_atomic_add_i2,
        _vcomp_atomic_add_i2,
        _vcomp_atomic_mul_i2,
        _vcomp_atomic_and_i2,
        _vcomp_atomic_or_i2,
        _vcomp_atomic_xor_i2,
        _vcomp_atomic_bool_and_i2,
        _vcomp_atomic_bool_or_i2,
    };
    unsigned int op = (flags >> 8) & 0xf;
    op = min(op, ARRAY_SIZE(funcs) - 1);
    funcs[op](dest, val);
}

void CDECL _vcomp_atomic_add_i4(int *dest, int val)
{
    InterlockedExchangeAdd((LONG *)dest, val);
}

void CDECL _vcomp_atomic_and_i4(int *dest, int val)
{
    int old;
    do old = *dest; while (InterlockedCompareExchange((LONG *)dest, old & val, old) != old);
}

void CDECL _vcomp_atomic_div_i4(int *dest, int val)
{
    int old;
    do old = *dest; while (InterlockedCompareExchange((LONG *)dest, old / val, old) != old);
}

void CDECL _vcomp_atomic_div_ui4(unsigned int *dest, unsigned int val)
{
    unsigned int old;
    do old = *dest; while (InterlockedCompareExchange((LONG *)dest, old / val, old) != old);
}

void CDECL _vcomp_atomic_mul_i4(int *dest, int val)
{
    int old;
    do old = *dest; while (InterlockedCompareExchange((LONG *)dest, old * val, old) != old);
}

void CDECL _vcomp_atomic_or_i4(int *dest, int val)
{
    int old;
    do old = *dest; while (InterlockedCompareExchange((LONG *)dest, old | val, old) != old);
}

void CDECL _vcomp_atomic_shl_i4(int *dest, int val)
{
    int old;
    do old = *dest; while (InterlockedCompareExchange((LONG *)dest, old << val, old) != old);
}

void CDECL _vcomp_atomic_shr_i4(int *dest, int val)
{
    int old;
    do old = *dest; while (InterlockedCompareExchange((LONG *)dest, old >> val, old) != old);
}

void CDECL _vcomp_atomic_shr_ui4(unsigned int *dest, unsigned int val)
{
    unsigned int old;
    do old = *dest; while (InterlockedCompareExchange((LONG *)dest, old >> val, old) != old);
}

void CDECL _vcomp_atomic_sub_i4(int *dest, int val)
{
    InterlockedExchangeAdd((LONG *)dest, -val);
}

void CDECL _vcomp_atomic_xor_i4(int *dest, int val)
{
    int old;
    do old = *dest; while (InterlockedCompareExchange((LONG *)dest, old ^ val, old) != old);
}

static void CDECL _vcomp_atomic_bool_and_i4(int *dest, int val)
{
    int old;
    do old = *dest; while (InterlockedCompareExchange((LONG *)dest, old && val, old) != old);
}

static void CDECL _vcomp_atomic_bool_or_i4(int *dest, int val)
{
    int old;
    do old = *dest; while (InterlockedCompareExchange((LONG *)dest, old ? old : (val != 0), old) != old);
}

void CDECL _vcomp_reduction_i4(unsigned int flags, int *dest, int val)
{
    static void (CDECL * const funcs[])(int *, int) =
    {
        _vcomp_atomic_add_i4,
        _vcomp_atomic_add_i4,
        _vcomp_atomic_mul_i4,
        _vcomp_atomic_and_i4,
        _vcomp_atomic_or_i4,
        _vcomp_atomic_xor_i4,
        _vcomp_atomic_bool_and_i4,
        _vcomp_atomic_bool_or_i4,
    };
    unsigned int op = (flags >> 8) & 0xf;
    op = min(op, ARRAY_SIZE(funcs) - 1);
    funcs[op](dest, val);
}

void CDECL _vcomp_atomic_add_i8(LONG64 *dest, LONG64 val)
{
    LONG64 old;
    do old = *dest; while (InterlockedCompareExchange64(dest, old + val, old) != old);
}

void CDECL _vcomp_atomic_and_i8(LONG64 *dest, LONG64 val)
{
    LONG64 old;
    do old = *dest; while (InterlockedCompareExchange64(dest, old & val, old) != old);
}

void CDECL _vcomp_atomic_div_i8(LONG64 *dest, LONG64 val)
{
    LONG64 old;
    do old = *dest; while (InterlockedCompareExchange64(dest, old / val, old) != old);
}

void CDECL _vcomp_atomic_div_ui8(ULONG64 *dest, ULONG64 val)
{
    ULONG64 old;
    do old = *dest; while (InterlockedCompareExchange64((LONG64 *)dest, old / val, old) != old);
}

void CDECL _vcomp_atomic_mul_i8(LONG64 *dest, LONG64 val)
{
    LONG64 old;
    do old = *dest; while (InterlockedCompareExchange64(dest, old * val, old) != old);
}

void CDECL _vcomp_atomic_or_i8(LONG64 *dest, LONG64 val)
{
    LONG64 old;
    do old = *dest; while (InterlockedCompareExchange64(dest, old | val, old) != old);
}

void CDECL _vcomp_atomic_shl_i8(LONG64 *dest, unsigned int val)
{
    LONG64 old;
    do old = *dest; while (InterlockedCompareExchange64(dest, old << val, old) != old);
}

void CDECL _vcomp_atomic_shr_i8(LONG64 *dest, unsigned int val)
{
    LONG64 old;
    do old = *dest; while (InterlockedCompareExchange64(dest, old >> val, old) != old);
}

void CDECL _vcomp_atomic_shr_ui8(ULONG64 *dest, unsigned int val)
{
    ULONG64 old;
    do old = *dest; while (InterlockedCompareExchange64((LONG64 *)dest, old >> val, old) != old);
}

void CDECL _vcomp_atomic_sub_i8(LONG64 *dest, LONG64 val)
{
    LONG64 old;
    do old = *dest; while (InterlockedCompareExchange64(dest, old - val, old) != old);
}

void CDECL _vcomp_atomic_xor_i8(LONG64 *dest, LONG64 val)
{
    LONG64 old;
    do old = *dest; while (InterlockedCompareExchange64(dest, old ^ val, old) != old);
}

static void CDECL _vcomp_atomic_bool_and_i8(LONG64 *dest, LONG64 val)
{
    LONG64 old;
    do old = *dest; while (InterlockedCompareExchange64(dest, old && val, old) != old);
}

static void CDECL _vcomp_atomic_bool_or_i8(LONG64 *dest, LONG64 val)
{
    LONG64 old;
    do old = *dest; while (InterlockedCompareExchange64(dest, old ? old : (val != 0), old) != old);
}

void CDECL _vcomp_reduction_i8(unsigned int flags, LONG64 *dest, LONG64 val)
{
    static void (CDECL * const funcs[])(LONG64 *, LONG64) =
    {
        _vcomp_atomic_add_i8,
        _vcomp_atomic_add_i8,
        _vcomp_atomic_mul_i8,
        _vcomp_atomic_and_i8,
        _vcomp_atomic_or_i8,
        _vcomp_atomic_xor_i8,
        _vcomp_atomic_bool_and_i8,
        _vcomp_atomic_bool_or_i8,
    };
    unsigned int op = (flags >> 8) & 0xf;
    op = min(op, ARRAY_SIZE(funcs) - 1);
    funcs[op](dest, val);
}

void CDECL _vcomp_atomic_add_r4(float *dest, float val)
{
    int old, new;
    do
    {
        old = *(int *)dest;
        *(float *)&new = *(float *)&old + val;
    }
    while (InterlockedCompareExchange((LONG *)dest, new, old) != old);
}

void CDECL _vcomp_atomic_div_r4(float *dest, float val)
{
    int old, new;
    do
    {
        old = *(int *)dest;
        *(float *)&new = *(float *)&old / val;
    }
    while (InterlockedCompareExchange((LONG *)dest, new, old) != old);
}

void CDECL _vcomp_atomic_mul_r4(float *dest, float val)
{
    int old, new;
    do
    {
        old = *(int *)dest;
        *(float *)&new = *(float *)&old * val;
    }
    while (InterlockedCompareExchange((LONG *)dest, new, old) != old);
}

void CDECL _vcomp_atomic_sub_r4(float *dest, float val)
{
    int old, new;
    do
    {
        old = *(int *)dest;
        *(float *)&new = *(float *)&old - val;
    }
    while (InterlockedCompareExchange((LONG *)dest, new, old) != old);
}

static void CDECL _vcomp_atomic_bool_and_r4(float *dest, float val)
{
    int old, new;
    do
    {
        old = *(int *)dest;
        *(float *)&new = (*(float *)&old != 0.0) ? (val != 0.0) : 0.0;
    }
    while (InterlockedCompareExchange((LONG *)dest, new, old) != old);
}

static void CDECL _vcomp_atomic_bool_or_r4(float *dest, float val)
{
    int old, new;
    do
    {
        old = *(int *)dest;
        *(float *)&new = (*(float *)&old != 0.0) ? *(float *)&old : (val != 0.0);
    }
    while (InterlockedCompareExchange((LONG *)dest, new, old) != old);
}

void CDECL _vcomp_reduction_r4(unsigned int flags, float *dest, float val)
{
    static void (CDECL * const funcs[])(float *, float) =
    {
        _vcomp_atomic_add_r4,
        _vcomp_atomic_add_r4,
        _vcomp_atomic_mul_r4,
        _vcomp_atomic_bool_or_r4,
        _vcomp_atomic_bool_or_r4,
        _vcomp_atomic_bool_or_r4,
        _vcomp_atomic_bool_and_r4,
        _vcomp_atomic_bool_or_r4,
    };
    unsigned int op = (flags >> 8) & 0xf;
    op = min(op, ARRAY_SIZE(funcs) - 1);
    funcs[op](dest, val);
}

void CDECL _vcomp_atomic_add_r8(double *dest, double val)
{
    LONG64 old, new;
    do
    {
        old = *(LONG64 *)dest;
        *(double *)&new = *(double *)&old + val;
    }
    while (InterlockedCompareExchange64((LONG64 *)dest, new, old) != old);
}

void CDECL _vcomp_atomic_div_r8(double *dest, double val)
{
    LONG64 old, new;
    do
    {
        old = *(LONG64 *)dest;
        *(double *)&new = *(double *)&old / val;
    }
    while (InterlockedCompareExchange64((LONG64 *)dest, new, old) != old);
}

void CDECL _vcomp_atomic_mul_r8(double *dest, double val)
{
    LONG64 old, new;
    do
    {
        old = *(LONG64 *)dest;
        *(double *)&new = *(double *)&old * val;
    }
    while (InterlockedCompareExchange64((LONG64 *)dest, new, old) != old);
}

void CDECL _vcomp_atomic_sub_r8(double *dest, double val)
{
    LONG64 old, new;
    do
    {
        old = *(LONG64 *)dest;
        *(double *)&new = *(double *)&old - val;
    }
    while (InterlockedCompareExchange64((LONG64 *)dest, new, old) != old);
}

static void CDECL _vcomp_atomic_bool_and_r8(double *dest, double val)
{
    LONG64 old, new;
    do
    {
        old = *(LONG64 *)dest;
        *(double *)&new = (*(double *)&old != 0.0) ? (val != 0.0) : 0.0;
    }
    while (InterlockedCompareExchange64((LONG64 *)dest, new, old) != old);
}

static void CDECL _vcomp_atomic_bool_or_r8(double *dest, double val)
{
    LONG64 old, new;
    do
    {
        old = *(LONG64 *)dest;
        *(double *)&new = (*(double *)&old != 0.0) ? *(double *)&old : (val != 0.0);
    }
    while (InterlockedCompareExchange64((LONG64 *)dest, new, old) != old);
}

void CDECL _vcomp_reduction_r8(unsigned int flags, double *dest, double val)
{
    static void (CDECL * const funcs[])(double *, double) =
    {
        _vcomp_atomic_add_r8,
        _vcomp_atomic_add_r8,
        _vcomp_atomic_mul_r8,
        _vcomp_atomic_bool_or_r8,
        _vcomp_atomic_bool_or_r8,
        _vcomp_atomic_bool_or_r8,
        _vcomp_atomic_bool_and_r8,
        _vcomp_atomic_bool_or_r8,
    };
    unsigned int op = (flags >> 8) & 0xf;
    op = min(op, ARRAY_SIZE(funcs) - 1);
    funcs[op](dest, val);
}

int CDECL omp_get_dynamic(void)
{
    TRACE("stub\n");
    return 0;
}

int CDECL omp_get_max_threads(void)
{
    TRACE("()\n");
    return vcomp_max_threads;
}

int CDECL omp_get_nested(void)
{
    TRACE("stub\n");
    return vcomp_nested_fork;
}

int CDECL omp_get_num_procs(void)
{
    TRACE("\n");
    return vcomp_num_procs;
}

int CDECL omp_get_num_threads(void)
{
    struct vcomp_team_data *team_data = vcomp_init_thread_data()->team;
    TRACE("()\n");
    return team_data ? team_data->num_threads : 1;
}

int CDECL omp_get_thread_num(void)
{
    TRACE("()\n");
    return vcomp_init_thread_data()->thread_num;
}

int CDECL _vcomp_get_thread_num(void)
{
    TRACE("()\n");
    return vcomp_init_thread_data()->thread_num;
}

/* Time in seconds since "some time in the past" */
double CDECL omp_get_wtime(void)
{
    return GetTickCount() / 1000.0;
}
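
/* Note that GetTickCount() wraps around after roughly 49.7 days and usually
 * ticks with a granularity in the 10-16 ms range, so this timer is much
 * coarser than what a native high-resolution implementation would provide. */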

void CDECL omp_set_dynamic(int val)
{
    TRACE("(%d): stub\n", val);
}

void CDECL omp_set_nested(int nested)
{
    TRACE("(%d)\n", nested);
    vcomp_nested_fork = (nested != 0);
}

void CDECL omp_set_num_threads(int num_threads)
{
    TRACE("(%d)\n", num_threads);
    if (num_threads >= 1)
        vcomp_num_threads = num_threads;
}

void CDECL _vcomp_flush(void)
{
    TRACE("(): stub\n");
}

void CDECL _vcomp_barrier(void)
{
    struct vcomp_team_data *team_data = vcomp_init_thread_data()->team;

    TRACE("()\n");

    if (!team_data)
        return;

    EnterCriticalSection(&vcomp_section);
    if (++team_data->barrier_count >= team_data->num_threads)
    {
        team_data->barrier++;
        team_data->barrier_count = 0;
        WakeAllConditionVariable(&team_data->cond);
    }
    else
    {
        unsigned int barrier = team_data->barrier;
        while (team_data->barrier == barrier)
            SleepConditionVariableCS(&team_data->cond, &vcomp_section, INFINITE);
    }
    LeaveCriticalSection(&vcomp_section);
}
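
/* _vcomp_barrier() above is a classic generation-counting barrier: the last
 * thread to arrive advances team_data->barrier (the generation) and wakes
 * everyone, while earlier arrivals sleep until the generation changes.
 * Waiting on a saved copy of the generation, rather than on a boolean flag,
 * makes the barrier safely reusable for the next synchronization round. */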

void CDECL _vcomp_set_num_threads(int num_threads)
{
    TRACE("(%d)\n", num_threads);
    if (num_threads >= 1)
        vcomp_init_thread_data()->fork_threads = num_threads;
}

int CDECL _vcomp_master_begin(void)
{
    TRACE("()\n");
    return !vcomp_init_thread_data()->thread_num;
}

void CDECL _vcomp_master_end(void)
{
    TRACE("()\n");
    /* nothing to do here */
}

int CDECL _vcomp_single_begin(int flags)
{
    struct vcomp_thread_data *thread_data = vcomp_init_thread_data();
    struct vcomp_task_data *task_data = thread_data->task;
    int ret = FALSE;

    TRACE("(%x): semi-stub\n", flags);

    EnterCriticalSection(&vcomp_section);
    thread_data->single++;
    if ((int)(thread_data->single - task_data->single) > 0)
    {
        task_data->single = thread_data->single;
        ret = TRUE;
    }
    LeaveCriticalSection(&vcomp_section);

    return ret;
}
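
/* Both the thread and the task keep a monotonically increasing counter of
 * "single" regions.  Every thread increments its private counter on entry,
 * but only the first one to push the shared task counter forward executes
 * the block.  The signed-difference comparison keeps the check correct even
 * once the unsigned counters eventually wrap around. */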

void CDECL _vcomp_single_end(void)
{
    TRACE("()\n");
    /* nothing to do here */
}

void CDECL _vcomp_sections_init(int n)
{
    struct vcomp_thread_data *thread_data = vcomp_init_thread_data();
    struct vcomp_task_data *task_data = thread_data->task;

    TRACE("(%d)\n", n);

    EnterCriticalSection(&vcomp_section);
    thread_data->section++;
    if ((int)(thread_data->section - task_data->section) > 0)
    {
        task_data->section = thread_data->section;
        task_data->num_sections = n;
        task_data->section_index = 0;
    }
    LeaveCriticalSection(&vcomp_section);
}

int CDECL _vcomp_sections_next(void)
{
    struct vcomp_thread_data *thread_data = vcomp_init_thread_data();
    struct vcomp_task_data *task_data = thread_data->task;
    int i = -1;

    TRACE("()\n");

    EnterCriticalSection(&vcomp_section);
    if (thread_data->section == task_data->section &&
        task_data->section_index != task_data->num_sections)
    {
        i = task_data->section_index++;
    }
    LeaveCriticalSection(&vcomp_section);
    return i;
}
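
/* For a "#pragma omp sections" block the compiler emits one
 * _vcomp_sections_init(n) call followed by a loop that keeps claiming the
 * next unassigned section until -1 is returned, roughly like this
 * (illustrative sketch, not actual compiler output):
 *
 *     int i;
 *     _vcomp_sections_init(2);
 *     while ((i = _vcomp_sections_next()) != -1)
 *     {
 *         switch (i)
 *         {
 *             case 0: do_first_section(); break;
 *             case 1: do_second_section(); break;
 *         }
 *     }
 */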

void CDECL _vcomp_for_static_simple_init(unsigned int first, unsigned int last, int step,
                                         BOOL increment, unsigned int *begin, unsigned int *end)
{
    unsigned int iterations, per_thread, remaining;
    struct vcomp_thread_data *thread_data = vcomp_init_thread_data();
    struct vcomp_team_data *team_data = thread_data->team;
    int num_threads = team_data ? team_data->num_threads : 1;
    int thread_num = thread_data->thread_num;

    TRACE("(%u, %u, %d, %u, %p, %p)\n", first, last, step, increment, begin, end);

    if (num_threads == 1)
    {
        *begin = first;
        *end   = last;
        return;
    }

    if (step <= 0)
    {
        *begin = 0;
        *end   = increment ? -1 : 1;
        return;
    }

    if (increment)
        iterations = 1 + (last - first) / step;
    else
    {
        iterations = 1 + (first - last) / step;
        step *= -1;
    }

    per_thread = iterations / num_threads;
    remaining  = iterations - per_thread * num_threads;

    if (thread_num < remaining)
        per_thread++;
    else if (per_thread)
        first += remaining * step;
    else
    {
        *begin = first;
        *end   = first - step;
        return;
    }

    *begin = first + per_thread * thread_num * step;
    *end   = *begin + (per_thread - 1) * step;
}
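
/* Worked example for the partitioning above: with first = 0, last = 99,
 * step = 1 (incrementing) and 4 threads there are 100 iterations, 25 per
 * thread, so thread 0 gets [0, 24], thread 1 gets [25, 49], and so on.
 * With 101 iterations the one remaining iteration goes to thread 0, which
 * then covers [0, 25], and the later threads shift accordingly. */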

void CDECL _vcomp_for_static_simple_init_i8(ULONG64 first, ULONG64 last, LONG64 step,
                                            BOOL increment, ULONG64 *begin, ULONG64 *end)
{
    ULONG64 iterations, per_thread, remaining;
    struct vcomp_thread_data *thread_data = vcomp_init_thread_data();
    struct vcomp_team_data *team_data = thread_data->team;
    int num_threads = team_data ? team_data->num_threads : 1;
    int thread_num = thread_data->thread_num;

    TRACE("(%s, %s, %s, %x, %p, %p)\n", wine_dbgstr_longlong(first), wine_dbgstr_longlong(last),
          wine_dbgstr_longlong(step), increment, begin, end);

    if (num_threads == 1)
    {
        *begin = first;
        *end   = last;
        return;
    }

    if (step <= 0)
    {
        *begin = 0;
        *end   = increment ? -1 : 1;
        return;
    }

    if (increment)
        iterations = 1 + (last - first) / step;
    else
    {
        iterations = 1 + (first - last) / step;
        step *= -1;
    }

    per_thread = iterations / num_threads;
    remaining  = iterations - per_thread * num_threads;

    if (thread_num < remaining)
        per_thread++;
    else if (per_thread)
        first += remaining * step;
    else
    {
        *begin = first;
        *end   = first - step;
        return;
    }

    *begin = first + per_thread * thread_num * step;
    *end   = *begin + (per_thread - 1) * step;
}

void CDECL _vcomp_for_static_init(int first, int last, int step, int chunksize, unsigned int *loops,
                                  int *begin, int *end, int *next, int *lastchunk)
{
    unsigned int iterations, num_chunks, per_thread, remaining;
    struct vcomp_thread_data *thread_data = vcomp_init_thread_data();
    struct vcomp_team_data *team_data = thread_data->team;
    int num_threads = team_data ? team_data->num_threads : 1;
    int thread_num = thread_data->thread_num;
    int no_begin, no_lastchunk;

    TRACE("(%d, %d, %d, %d, %p, %p, %p, %p, %p)\n",
          first, last, step, chunksize, loops, begin, end, next, lastchunk);

    if (!begin)
    {
        begin = &no_begin;
        lastchunk = &no_lastchunk;
    }

    if (num_threads == 1 && chunksize != 1)
    {
        *loops     = 1;
        *begin     = first;
        *end       = last;
        *next      = 0;
        *lastchunk = first;
        return;
    }

    if (first == last)
    {
        *loops = !thread_num;
        if (!thread_num)
        {
            *begin     = first;
            *end       = last;
            *next      = 0;
            *lastchunk = first;
        }
        return;
    }

    if (step <= 0)
    {
        *loops = 0;
        return;
    }

    if (first < last)
        iterations = 1 + (last - first) / step;
    else
    {
        iterations = 1 + (first - last) / step;
        step *= -1;
    }

    if (chunksize < 1)
        chunksize = 1;

    num_chunks = ((DWORD64)iterations + chunksize - 1) / chunksize;
    per_thread = num_chunks / num_threads;
    remaining  = num_chunks - per_thread * num_threads;

    *loops     = per_thread + (thread_num < remaining);
    *begin     = first + thread_num * chunksize * step;
    *end       = *begin + (chunksize - 1) * step;
    *next      = chunksize * num_threads * step;
    *lastchunk = first + (num_chunks - 1) * chunksize * step;
}
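
/* Worked example for the chunked case above: first = 0, last = 99, step = 1,
 * chunksize = 10 and 4 threads gives num_chunks = 10.  Thread 0 starts with
 * *begin = 0, *end = 9 and strides by *next = 40 per round, so it runs the
 * chunks [0, 9], [40, 49] and [80, 89]; *loops is 3 for threads 0 and 1 and
 * 2 for the others, and *lastchunk = 90 marks the chunk that performs the
 * lastprivate copy-out. */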

void CDECL _vcomp_for_static_init_i8(LONG64 first, LONG64 last, LONG64 step, LONG64 chunksize, ULONG64 *loops,
                                     LONG64 *begin, LONG64 *end, LONG64 *next, LONG64 *lastchunk)
{
    ULONG64 iterations, num_chunks, per_thread, remaining;
    struct vcomp_thread_data *thread_data = vcomp_init_thread_data();
    struct vcomp_team_data *team_data = thread_data->team;
    int num_threads = team_data ? team_data->num_threads : 1;
    int thread_num = thread_data->thread_num;
    LONG64 no_begin, no_lastchunk;

    TRACE("(%s, %s, %s, %s, %p, %p, %p, %p, %p)\n",
          wine_dbgstr_longlong(first), wine_dbgstr_longlong(last),
          wine_dbgstr_longlong(step), wine_dbgstr_longlong(chunksize),
          loops, begin, end, next, lastchunk);

    if (!begin)
    {
        begin = &no_begin;
        lastchunk = &no_lastchunk;
    }

    if (num_threads == 1 && chunksize != 1)
    {
        *loops     = 1;
        *begin     = first;
        *end       = last;
        *next      = 0;
        *lastchunk = first;
        return;
    }

    if (first == last)
    {
        *loops = !thread_num;
        if (!thread_num)
        {
            *begin     = first;
            *end       = last;
            *next      = 0;
            *lastchunk = first;
        }
        return;
    }

    if (step <= 0)
    {
        *loops = 0;
        return;
    }

    if (first < last)
        iterations = 1 + (last - first) / step;
    else
    {
        iterations = 1 + (first - last) / step;
        step *= -1;
    }

    if (chunksize < 1)
        chunksize = 1;

    num_chunks = iterations / chunksize;
    if (iterations % chunksize) num_chunks++;
    per_thread = num_chunks / num_threads;
    remaining  = num_chunks - per_thread * num_threads;

    *loops     = per_thread + (thread_num < remaining);
    *begin     = first + thread_num * chunksize * step;
    *end       = *begin + (chunksize - 1) * step;
    *next      = chunksize * num_threads * step;
    *lastchunk = first + (num_chunks - 1) * chunksize * step;
}

void CDECL _vcomp_for_static_end(void)
{
    TRACE("()\n");
    /* nothing to do here */
}

void CDECL _vcomp_for_dynamic_init(unsigned int flags, unsigned int first, unsigned int last,
                                   int step, unsigned int chunksize)
{
    unsigned int iterations, per_thread, remaining;
    struct vcomp_thread_data *thread_data = vcomp_init_thread_data();
    struct vcomp_team_data *team_data = thread_data->team;
    struct vcomp_task_data *task_data = thread_data->task;
    int num_threads = team_data ? team_data->num_threads : 1;
    int thread_num = thread_data->thread_num;
    unsigned int type = flags & ~VCOMP_DYNAMIC_FLAGS_INCREMENT;

    TRACE("(%u, %u, %u, %d, %u)\n", flags, first, last, step, chunksize);

    if (step <= 0)
    {
        thread_data->dynamic_type = 0;
        return;
    }

    if (flags & VCOMP_DYNAMIC_FLAGS_INCREMENT)
        iterations = 1 + (last - first) / step;
    else
    {
        iterations = 1 + (first - last) / step;
        step *= -1;
    }

    if (type == VCOMP_DYNAMIC_FLAGS_STATIC)
    {
        per_thread = iterations / num_threads;
        remaining  = iterations - per_thread * num_threads;

        if (thread_num < remaining)
            per_thread++;
        else if (per_thread)
            first += remaining * step;
        else
        {
            thread_data->dynamic_type = 0;
            return;
        }

        thread_data->dynamic_type  = VCOMP_DYNAMIC_FLAGS_STATIC;
        thread_data->dynamic_begin = first + per_thread * thread_num * step;
        thread_data->dynamic_end   = thread_data->dynamic_begin + (per_thread - 1) * step;
    }
    else
    {
        if (type != VCOMP_DYNAMIC_FLAGS_CHUNKED &&
            type != VCOMP_DYNAMIC_FLAGS_GUIDED)
        {
            FIXME("unsupported flags %u\n", flags);
            type = VCOMP_DYNAMIC_FLAGS_GUIDED;
        }

        EnterCriticalSection(&vcomp_section);
        thread_data->dynamic++;
        thread_data->dynamic_type = type;
        if ((int)(thread_data->dynamic - task_data->dynamic) > 0)
        {
            task_data->dynamic            = thread_data->dynamic;
            task_data->dynamic_first      = first;
            task_data->dynamic_last       = last;
            task_data->dynamic_iterations = iterations;
            task_data->dynamic_step       = step;
            task_data->dynamic_chunksize  = chunksize;
        }
        LeaveCriticalSection(&vcomp_section);
    }
}

int CDECL _vcomp_for_dynamic_next(unsigned int *begin, unsigned int *end)
{
    struct vcomp_thread_data *thread_data = vcomp_init_thread_data();
    struct vcomp_task_data *task_data = thread_data->task;
    struct vcomp_team_data *team_data = thread_data->team;
    int num_threads = team_data ? team_data->num_threads : 1;

    TRACE("(%p, %p)\n", begin, end);

    if (thread_data->dynamic_type == VCOMP_DYNAMIC_FLAGS_STATIC)
    {
        *begin = thread_data->dynamic_begin;
        *end   = thread_data->dynamic_end;
        thread_data->dynamic_type = 0;
        return 1;
    }
    else if (thread_data->dynamic_type == VCOMP_DYNAMIC_FLAGS_CHUNKED ||
             thread_data->dynamic_type == VCOMP_DYNAMIC_FLAGS_GUIDED)
    {
        unsigned int iterations = 0;
        EnterCriticalSection(&vcomp_section);
        if (thread_data->dynamic == task_data->dynamic &&
            task_data->dynamic_iterations != 0)
        {
            iterations = min(task_data->dynamic_iterations, task_data->dynamic_chunksize);
            if (thread_data->dynamic_type == VCOMP_DYNAMIC_FLAGS_GUIDED &&
                task_data->dynamic_iterations > num_threads * task_data->dynamic_chunksize)
            {
                iterations = (task_data->dynamic_iterations + num_threads - 1) / num_threads;
            }
            *begin = task_data->dynamic_first;
            *end   = task_data->dynamic_first + (iterations - 1) * task_data->dynamic_step;
            task_data->dynamic_iterations -= iterations;
            task_data->dynamic_first      += iterations * task_data->dynamic_step;
            if (!task_data->dynamic_iterations)
                *end = task_data->dynamic_last;
        }
        LeaveCriticalSection(&vcomp_section);
        return iterations != 0;
    }

    return 0;
}
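
/* With the guided schedule each grab takes the remaining iteration count
 * divided by the thread count (falling back to the requested chunk size once
 * the remainder is small), so the chunks shrink geometrically as the loop
 * drains: 100 remaining iterations on 4 threads yields grabs of 25, 19, 14,
 * and so on. */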

int CDECL omp_in_parallel(void)
{
    TRACE("()\n");
    return vcomp_init_thread_data()->parallel;
}

static DWORD WINAPI _vcomp_fork_worker(void *param)
{
    struct vcomp_thread_data *thread_data = param;
    vcomp_set_thread_data(thread_data);

    TRACE("starting worker thread for %p\n", thread_data);

    EnterCriticalSection(&vcomp_section);
    for (;;)
    {
        struct vcomp_team_data *team = thread_data->team;
        if (team != NULL)
        {
            LeaveCriticalSection(&vcomp_section);
            _vcomp_fork_call_wrapper(team->wrapper, team->nargs, ptr_from_va_list(team->valist));
            EnterCriticalSection(&vcomp_section);

            thread_data->team = NULL;
            list_remove(&thread_data->entry);
            list_add_tail(&vcomp_idle_threads, &thread_data->entry);
            if (++team->finished_threads >= team->num_threads)
                WakeAllConditionVariable(&team->cond);
        }

        if (!SleepConditionVariableCS(&thread_data->cond, &vcomp_section, 5000) &&
            GetLastError() == ERROR_TIMEOUT && !thread_data->team)
        {
            break;
        }
    }
    list_remove(&thread_data->entry);
    LeaveCriticalSection(&vcomp_section);

    TRACE("terminating worker thread for %p\n", thread_data);

    HeapFree(GetProcessHeap(), 0, thread_data);
    vcomp_set_thread_data(NULL);
    FreeLibraryAndExitThread(vcomp_module, 0);
    return 0;
}

void WINAPIV _vcomp_fork(BOOL ifval, int nargs, void *wrapper, ...)
{
    struct vcomp_thread_data *prev_thread_data = vcomp_init_thread_data();
    struct vcomp_thread_data thread_data;
    struct vcomp_team_data team_data;
    struct vcomp_task_data task_data;
    int num_threads;

    TRACE("(%d, %d, %p, ...)\n", ifval, nargs, wrapper);

    if (prev_thread_data->parallel && !vcomp_nested_fork)
        ifval = FALSE;

    if (!ifval)
        num_threads = 1;
    else if (prev_thread_data->fork_threads)
        num_threads = prev_thread_data->fork_threads;
    else
        num_threads = vcomp_num_threads;

    InitializeConditionVariable(&team_data.cond);
    team_data.num_threads      = 1;
    team_data.finished_threads = 0;
    team_data.nargs            = nargs;
    team_data.wrapper          = wrapper;
    va_start(team_data.valist, wrapper);
    team_data.barrier          = 0;
    team_data.barrier_count    = 0;

    task_data.single  = 0;
    task_data.section = 0;
    task_data.dynamic = 0;

    thread_data.team         = &team_data;
    thread_data.task         = &task_data;
    thread_data.thread_num   = 0;
    thread_data.parallel     = ifval || prev_thread_data->parallel;
    thread_data.fork_threads = 0;
    thread_data.single       = 1;
    thread_data.section      = 1;
    thread_data.dynamic      = 1;
    thread_data.dynamic_type = 0;
    list_init(&thread_data.entry);
    InitializeConditionVariable(&thread_data.cond);

    if (num_threads > 1)
    {
        struct list *ptr;
        EnterCriticalSection(&vcomp_section);

        /* reuse existing threads (if any) */
        while (team_data.num_threads < num_threads && (ptr = list_head(&vcomp_idle_threads)))
        {
            struct vcomp_thread_data *data = LIST_ENTRY(ptr, struct vcomp_thread_data, entry);
            data->team         = &team_data;
            data->task         = &task_data;
            data->thread_num   = team_data.num_threads++;
            data->parallel     = thread_data.parallel;
            data->fork_threads = 0;
            data->single       = 1;
            data->section      = 1;
            data->dynamic      = 1;
            data->dynamic_type = 0;
            list_remove(&data->entry);
            list_add_tail(&thread_data.entry, &data->entry);
            WakeAllConditionVariable(&data->cond);
        }

        /* spawn additional threads */
        while (team_data.num_threads < num_threads)
        {
            struct vcomp_thread_data *data;
            HMODULE module;
            HANDLE thread;

            data = HeapAlloc(GetProcessHeap(), 0, sizeof(*data));
            if (!data) break;

            data->team         = &team_data;
            data->task         = &task_data;
            data->thread_num   = team_data.num_threads;
            data->parallel     = thread_data.parallel;
            data->fork_threads = 0;
            data->single       = 1;
            data->section      = 1;
            data->dynamic      = 1;
            data->dynamic_type = 0;
            InitializeConditionVariable(&data->cond);

            thread = CreateThread(NULL, 0, _vcomp_fork_worker, data, 0, NULL);
            if (!thread)
            {
                HeapFree(GetProcessHeap(), 0, data);
                break;
            }

            GetModuleHandleExW(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS,
                               (const WCHAR *)vcomp_module, &module);
            team_data.num_threads++;
            list_add_tail(&thread_data.entry, &data->entry);
            CloseHandle(thread);
        }

        LeaveCriticalSection(&vcomp_section);
    }

    vcomp_set_thread_data(&thread_data);
    _vcomp_fork_call_wrapper(team_data.wrapper, team_data.nargs, ptr_from_va_list(team_data.valist));
    vcomp_set_thread_data(prev_thread_data);
    prev_thread_data->fork_threads = 0;

    if (team_data.num_threads > 1)
    {
        EnterCriticalSection(&vcomp_section);

        team_data.finished_threads++;
        while (team_data.finished_threads < team_data.num_threads)
            SleepConditionVariableCS(&team_data.cond, &vcomp_section, INFINITE);

        LeaveCriticalSection(&vcomp_section);
        assert(list_empty(&thread_data.entry));
    }

    va_end(team_data.valist);
}
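
/* For "#pragma omp parallel" the compiler outlines the body of the parallel
 * region into a callback and emits a call to _vcomp_fork, roughly like this
 * (illustrative sketch, not actual compiler output):
 *
 *     static void CDECL region(int *shared_counter) { ... }
 *     ...
 *     _vcomp_fork(TRUE, 1, region, &counter);
 *
 * Every team thread, including the forking thread itself, then runs region()
 * with the same argument list. */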

static CRITICAL_SECTION *alloc_critsect(void)
{
    CRITICAL_SECTION *critsect;
    if (!(critsect = HeapAlloc(GetProcessHeap(), 0, sizeof(*critsect))))
    {
        ERR("could not allocate critical section\n");
        ExitProcess(1);
    }

    InitializeCriticalSection(critsect);
    critsect->DebugInfo->Spare[0] = (DWORD_PTR)(__FILE__ ": critsect");
    return critsect;
}

static void destroy_critsect(CRITICAL_SECTION *critsect)
{
    if (!critsect) return;

    critsect->DebugInfo->Spare[0] = 0;
    DeleteCriticalSection(critsect);
    HeapFree(GetProcessHeap(), 0, critsect);
}

void CDECL omp_init_lock(omp_lock_t *lock)
{
    TRACE("(%p)\n", lock);
    *lock = alloc_critsect();
}

void CDECL omp_destroy_lock(omp_lock_t *lock)
{
    TRACE("(%p)\n", lock);
    destroy_critsect(*lock);
}

void CDECL omp_set_lock(omp_lock_t *lock)
{
    TRACE("(%p)\n", lock);

    if (RtlIsCriticalSectionLockedByThread(*lock))
    {
        ERR("omp_set_lock called while holding lock %p\n", *lock);
        ExitProcess(1);
    }

    EnterCriticalSection(*lock);
}

void CDECL omp_unset_lock(omp_lock_t *lock)
{
    TRACE("(%p)\n", lock);
    LeaveCriticalSection(*lock);
}

int CDECL omp_test_lock(omp_lock_t *lock)
{
    TRACE("(%p)\n", lock);

    if (RtlIsCriticalSectionLockedByThread(*lock))
        return 0;

    return TryEnterCriticalSection(*lock);
}

void CDECL omp_set_nest_lock(omp_nest_lock_t *lock)
{
    TRACE("(%p)\n", lock);
    EnterCriticalSection(*lock);
}

void CDECL omp_unset_nest_lock(omp_nest_lock_t *lock)
{
    TRACE("(%p)\n", lock);
    LeaveCriticalSection(*lock);
}

int CDECL omp_test_nest_lock(omp_nest_lock_t *lock)
{
    TRACE("(%p)\n", lock);
    return TryEnterCriticalSection(*lock) ? (*lock)->RecursionCount : 0;
}
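
/* The simple locks above map directly onto Win32 critical sections, which
 * are recursive by nature; omp_set_lock() and omp_test_lock() therefore
 * explicitly reject recursive use to match plain OpenMP lock semantics,
 * while the nested variants simply let the critical section's recursion
 * count do the bookkeeping. */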

void CDECL _vcomp_enter_critsect(CRITICAL_SECTION **critsect)
{
    TRACE("(%p)\n", critsect);

    if (!*critsect)
    {
        CRITICAL_SECTION *new_critsect = alloc_critsect();
        if (InterlockedCompareExchangePointer((void **)critsect, new_critsect, NULL) != NULL)
            destroy_critsect(new_critsect);  /* someone beat us to it */
    }

    EnterCriticalSection(*critsect);
}

void CDECL _vcomp_leave_critsect(CRITICAL_SECTION *critsect)
{
    TRACE("(%p)\n", critsect);
    LeaveCriticalSection(critsect);
}

static unsigned int get_step_count(int start, int end, int range_offset, int step)
{
    int range = end - start + step - range_offset;

    if (step < 0)
        return (unsigned)-range / -step;
    else
        return (unsigned)range / step;
}

static void CDECL c2vectparallel_wrapper(int start, int end, int step, int end_included, BOOL dynamic_distribution,
                                         int volatile *dynamic_start, void *function, int nargs, va_list valist)
{
    void *wrapper_args[MAX_VECT_PARALLEL_CALLBACK_ARGS];
    unsigned int step_count, steps_per_call, remainder;
    int thread_count = omp_get_num_threads();
    int curr_start, curr_end, range_offset;
    int thread = _vcomp_get_thread_num();
    int step_sign;

    copy_va_list_data(&wrapper_args[2], valist, nargs - 2);

    step_sign = step > 0 ? 1 : -1;
    range_offset = step_sign * !end_included;

    if (dynamic_distribution)
    {
        int next_start, new_start, end_value;

        start = *dynamic_start;
        end_value = end + !!end_included * step;
        while (start != end_value)
        {
            step_count = get_step_count(start, end, range_offset, step);

            curr_end = start + (step_count + thread_count - 1) / thread_count * step
                       + range_offset;

            if ((curr_end - end) * step_sign > 0)
            {
                next_start = end_value;
                curr_end = end;
            }
            else
            {
                next_start = curr_end - range_offset;
                curr_end -= step;
            }

            if ((new_start = InterlockedCompareExchange((LONG volatile*)dynamic_start, next_start, start)) != start)
            {
                start = new_start;
                continue;
            }

            wrapper_args[0] = (void *)(ULONG_PTR)start;
            wrapper_args[1] = (void *)(ULONG_PTR)curr_end;
            _vcomp_fork_call_wrapper(function, nargs, wrapper_args);
            start = *dynamic_start;
        }
        return;
    }

    step_count = get_step_count(start, end, range_offset, step);

    /* According to the tests, native vcomp still makes extra calls with an
     * empty range from excess threads under certain conditions, for unclear
     * reasons. */
    if (thread >= step_count && (end_included || (step != 1 && step != -1)))
        return;

    steps_per_call = step_count / thread_count;
    remainder = step_count % thread_count;

    if (thread < remainder)
    {
        curr_start = thread * (steps_per_call + 1);
        curr_end = curr_start + steps_per_call + 1;
    }
    else if (thread < step_count)
    {
        curr_start = remainder + steps_per_call * thread;
        curr_end = curr_start + steps_per_call;
    }
    else
    {
        curr_start = curr_end = 0;
    }

    curr_start = start + curr_start * step;
    curr_end = start + (curr_end - 1) * step + range_offset;

    wrapper_args[0] = (void *)(ULONG_PTR)curr_start;
    wrapper_args[1] = (void *)(ULONG_PTR)curr_end;
    _vcomp_fork_call_wrapper(function, nargs, wrapper_args);
}

void WINAPIV C2VectParallel(int start, int end, int step, BOOL end_included, int thread_count,
                            BOOL dynamic_distribution, void *function, int nargs, ...)
{
    struct vcomp_thread_data *thread_data;
    int volatile dynamic_start;
    int prev_thread_count;
    va_list valist;

    TRACE("start %d, end %d, step %d, end_included %d, thread_count %d, dynamic_distribution %#x,"
          " function %p, nargs %d.\n", start, end, step, end_included, thread_count,
          dynamic_distribution, function, nargs);

    if (nargs > MAX_VECT_PARALLEL_CALLBACK_ARGS)
    {
        FIXME("Number of arguments %u exceeds supported maximum %u"
              " (not calling the loop code, expect problems).\n",
              nargs, MAX_VECT_PARALLEL_CALLBACK_ARGS);
        return;
    }

    va_start(valist, nargs);

    /* This expression can result in integer overflow. According to the tests,
     * native vcomp runs the function as a single thread both for an empty
     * range and for (end - start) not fitting the integer range. */
    if ((step > 0 && end < start) || (step < 0 && end > start)
        || (end - start) / step < 2 || thread_count < 0)
    {
        void *wrapper_args[MAX_VECT_PARALLEL_CALLBACK_ARGS];

        wrapper_args[0] = (void *)(ULONG_PTR)start;
        wrapper_args[1] = (void *)(ULONG_PTR)end;
        copy_va_list_data(&wrapper_args[2], valist, nargs - 2);
        _vcomp_fork_call_wrapper(function, nargs, wrapper_args);
        va_end(valist);
        return;
    }

    thread_data = vcomp_init_thread_data();
    prev_thread_count = thread_data->fork_threads;
    thread_data->fork_threads = thread_count;

    dynamic_start = start;

    _vcomp_fork(TRUE, 9, c2vectparallel_wrapper, start, end, step, end_included, dynamic_distribution,
                &dynamic_start, function, nargs, valist);

    thread_data->fork_threads = prev_thread_count;
    va_end(valist);
}
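
/* C2VectParallel appears to be the entry point used by code auto-parallelized
 * by MSVC (the /Qpar option) rather than by OpenMP pragmas.  It distributes
 * the iteration range over a forked team either statically or, when
 * dynamic_distribution is set, by letting the worker threads race on
 * *dynamic_start with interlocked updates. */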

BOOL WINAPI DllMain(HINSTANCE instance, DWORD reason, LPVOID reserved)
{
    TRACE("(%p, %ld, %p)\n", instance, reason, reserved);

    switch (reason)
    {
        case DLL_PROCESS_ATTACH:
        {
            SYSTEM_INFO sysinfo;

            if ((vcomp_context_tls = TlsAlloc()) == TLS_OUT_OF_INDEXES)
            {
                ERR("Failed to allocate TLS index\n");
                return FALSE;
            }

            GetSystemInfo(&sysinfo);
            vcomp_module      = instance;
            vcomp_max_threads = sysinfo.dwNumberOfProcessors;
            vcomp_num_threads = sysinfo.dwNumberOfProcessors;
            vcomp_num_procs   = sysinfo.dwNumberOfProcessors;
            break;
        }

        case DLL_PROCESS_DETACH:
        {
            if (reserved) break;
            if (vcomp_context_tls != TLS_OUT_OF_INDEXES)
            {
                vcomp_free_thread_data();
                TlsFree(vcomp_context_tls);
            }
            break;
        }

        case DLL_THREAD_DETACH:
        {
            vcomp_free_thread_data();
            break;
        }
    }

    return TRUE;
}