/*
 *
 * vcomp implementation
 *
 * Copyright 2011 Austin English
 * Copyright 2012 Dan Kegel
 * Copyright 2015-2016 Sebastian Lackner
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
 */

#include <stdarg.h>
#include <assert.h>

#include "windef.h"
#include "winbase.h"
#include "winternl.h"
#include "wine/debug.h"
#include "wine/list.h"
#include "wine/asm.h"

WINE_DEFAULT_DEBUG_CHANNEL(vcomp);

#define MAX_VECT_PARALLEL_CALLBACK_ARGS 128

typedef CRITICAL_SECTION *omp_lock_t;
typedef CRITICAL_SECTION *omp_nest_lock_t;

static struct list vcomp_idle_threads = LIST_INIT(vcomp_idle_threads);
static DWORD   vcomp_context_tls = TLS_OUT_OF_INDEXES;
static HMODULE vcomp_module;
static int     vcomp_max_threads;
static int     vcomp_num_threads;
static int     vcomp_num_procs;
static BOOL    vcomp_nested_fork = FALSE;

static RTL_CRITICAL_SECTION vcomp_section;
static RTL_CRITICAL_SECTION_DEBUG critsect_debug =
{
    0, 0, &vcomp_section,
    { &critsect_debug.ProcessLocksList, &critsect_debug.ProcessLocksList },
      0, 0, { (DWORD_PTR)(__FILE__ ": vcomp_section") }
};
static RTL_CRITICAL_SECTION vcomp_section = { &critsect_debug, -1, 0, 0, 0, 0 };

#define VCOMP_DYNAMIC_FLAGS_STATIC    0x01
#define VCOMP_DYNAMIC_FLAGS_CHUNKED   0x02
#define VCOMP_DYNAMIC_FLAGS_GUIDED    0x03
#define VCOMP_DYNAMIC_FLAGS_INCREMENT 0x40

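/* Scheduling flags passed by compiler-generated code to
 * _vcomp_for_dynamic_init(): the schedule kind (static/chunked/guided)
 * lives in the low bits, and VCOMP_DYNAMIC_FLAGS_INCREMENT marks an
 * incrementing loop; see the decoding in _vcomp_for_dynamic_init() below. */
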
struct vcomp_thread_data
{
    struct vcomp_team_data  *team;
    struct vcomp_task_data  *task;
    int                     thread_num;
    BOOL                    parallel;
    int                     fork_threads;

    /* only used for concurrent tasks */
    struct list             entry;
    CONDITION_VARIABLE      cond;

    /* single */
    unsigned int            single;

    /* section */
    unsigned int            section;

    /* dynamic */
    unsigned int            dynamic;
    unsigned int            dynamic_type;
    unsigned int            dynamic_begin;
    unsigned int            dynamic_end;
};

struct vcomp_team_data
{
    CONDITION_VARIABLE      cond;
    int                     num_threads;
    int                     finished_threads;

    /* callback arguments */
    int                     nargs;
    void                    *wrapper;
    va_list                 valist;

    /* barrier */
    unsigned int            barrier;
    int                     barrier_count;
};

struct vcomp_task_data
{
    /* single */
    unsigned int            single;

    /* section */
    unsigned int            section;
    int                     num_sections;
    int                     section_index;

    /* dynamic */
    unsigned int            dynamic;
    unsigned int            dynamic_first;
    unsigned int            dynamic_last;
    unsigned int            dynamic_iterations;
    int                     dynamic_step;
    unsigned int            dynamic_chunksize;
};

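/* ptr_from_va_list() assumes the platform's va_list is (or decays to) a
 * pointer into an array of pointer-sized arguments, so the variadic
 * arguments given to _vcomp_fork() can be handed to the call wrapper as a
 * void ** array.  This holds for the ABIs handled below, but it is not
 * portable C. */
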
static void **ptr_from_va_list(va_list valist)
{
    return *(void ***)&valist;
}

static void copy_va_list_data(void **args, va_list valist, int args_count)
{
    unsigned int i;

    for (i = 0; i < args_count; ++i)
        args[i] = va_arg(valist, void *);
}

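/* _vcomp_fork_call_wrapper(wrapper, nargs, args) copies the nargs
 * pointer-sized arguments onto a fresh stack area (and into argument
 * registers where the ABI passes them there) and calls the
 * compiler-generated wrapper, so every team thread invokes the parallel
 * region body with an identical argument list. */
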
#if defined(__i386__)

extern void CDECL _vcomp_fork_call_wrapper(void *wrapper, int nargs, void **args);
__ASM_GLOBAL_FUNC( _vcomp_fork_call_wrapper,
                   "pushl %ebp\n\t"
                   __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t")
                   __ASM_CFI(".cfi_rel_offset %ebp,0\n\t")
                   "movl %esp,%ebp\n\t"
                   __ASM_CFI(".cfi_def_cfa_register %ebp\n\t")
                   "pushl %esi\n\t"
                   __ASM_CFI(".cfi_rel_offset %esi,-4\n\t")
                   "pushl %edi\n\t"
                   __ASM_CFI(".cfi_rel_offset %edi,-8\n\t")
                   "movl 12(%ebp),%edx\n\t"
                   "movl %esp,%edi\n\t"
                   "shll $2,%edx\n\t"
                   "jz 1f\n\t"
                   "subl %edx,%edi\n\t"
                   "andl $~15,%edi\n\t"
                   "movl %edi,%esp\n\t"
                   "movl 12(%ebp),%ecx\n\t"
                   "movl 16(%ebp),%esi\n\t"
                   "cld\n\t"
                   "rep; movsl\n"
                   "1:\tcall *8(%ebp)\n\t"
                   "leal -8(%ebp),%esp\n\t"
                   "popl %edi\n\t"
                   __ASM_CFI(".cfi_same_value %edi\n\t")
                   "popl %esi\n\t"
                   __ASM_CFI(".cfi_same_value %esi\n\t")
                   "popl %ebp\n\t"
                   __ASM_CFI(".cfi_def_cfa %esp,4\n\t")
                   __ASM_CFI(".cfi_same_value %ebp\n\t")
                   "ret" )

#elif defined(__x86_64__)

extern void CDECL _vcomp_fork_call_wrapper(void *wrapper, int nargs, void **args);
__ASM_GLOBAL_FUNC( _vcomp_fork_call_wrapper,
                   "pushq %rbp\n\t"
                   __ASM_SEH(".seh_pushreg %rbp\n\t")
                   __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t")
                   __ASM_CFI(".cfi_rel_offset %rbp,0\n\t")
                   "movq %rsp,%rbp\n\t"
                   __ASM_SEH(".seh_setframe %rbp,0\n\t")
                   __ASM_CFI(".cfi_def_cfa_register %rbp\n\t")
                   "pushq %rsi\n\t"
                   __ASM_SEH(".seh_pushreg %rsi\n\t")
                   __ASM_CFI(".cfi_rel_offset %rsi,-8\n\t")
                   "pushq %rdi\n\t"
                   __ASM_SEH(".seh_pushreg %rdi\n\t")
                   __ASM_SEH(".seh_endprologue\n\t")
                   __ASM_CFI(".cfi_rel_offset %rdi,-16\n\t")
                   "movq %rcx,%rax\n\t"
                   "movq $4,%rcx\n\t"
                   "cmp %rcx,%rdx\n\t"
                   "cmovgq %rdx,%rcx\n\t"
                   "leaq 0(,%rcx,8),%rdx\n\t"
                   "subq %rdx,%rsp\n\t"
                   "andq $~15,%rsp\n\t"
                   "movq %rsp,%rdi\n\t"
                   "movq %r8,%rsi\n\t"
                   "rep; movsq\n\t"
                   "movq 0(%rsp),%rcx\n\t"
                   "movq 8(%rsp),%rdx\n\t"
                   "movq 16(%rsp),%r8\n\t"
                   "movq 24(%rsp),%r9\n\t"
                   "callq *%rax\n\t"
                   "leaq -16(%rbp),%rsp\n\t"
                   "popq %rdi\n\t"
                   __ASM_CFI(".cfi_same_value %rdi\n\t")
                   "popq %rsi\n\t"
                   __ASM_CFI(".cfi_same_value %rsi\n\t")
                   __ASM_CFI(".cfi_def_cfa_register %rsp\n\t")
                   "popq %rbp\n\t"
                   __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t")
                   __ASM_CFI(".cfi_same_value %rbp\n\t")
                   "ret")

#elif defined(__arm__)

extern void CDECL _vcomp_fork_call_wrapper(void *wrapper, int nargs, void **args);
__ASM_GLOBAL_FUNC( _vcomp_fork_call_wrapper,
                   "push {r4, r5, LR}\n\t"
                   "mov r4, r0\n\t"
                   "mov r5, SP\n\t"
                   "lsl r3, r1, #2\n\t"
                   "cmp r3, #0\n\t"
                   "beq 5f\n\t"
                   "sub SP, SP, r3\n\t"
                   "tst r1, #1\n\t"
                   "it eq\n\t"
                   "subeq SP, SP, #4\n\t"
                   "1:\tsub r3, r3, #4\n\t"
                   "ldr r0, [r2, r3]\n\t"
                   "str r0, [SP, r3]\n\t"
                   "cmp r3, #0\n\t"
                   "bgt 1b\n\t"
                   "cmp r1, #1\n\t"
                   "bgt 2f\n\t"
                   "pop {r0}\n\t"
                   "b 5f\n\t"
                   "2:\tcmp r1, #2\n\t"
                   "bgt 3f\n\t"
                   "pop {r0-r1}\n\t"
                   "b 5f\n\t"
                   "3:\tcmp r1, #3\n\t"
                   "bgt 4f\n\t"
                   "pop {r0-r2}\n\t"
                   "b 5f\n\t"
                   "4:\tpop {r0-r3}\n\t"
                   "5:\tblx r4\n\t"
                   "mov SP, r5\n\t"
                   "pop {r4, r5, PC}" )

#elif defined(__aarch64__)

extern void CDECL _vcomp_fork_call_wrapper(void *wrapper, int nargs, void **args);
__ASM_GLOBAL_FUNC( _vcomp_fork_call_wrapper,
                   "stp x29, x30, [SP,#-16]!\n\t"
                   __ASM_SEH(".seh_save_fplr_x 16\n\t")
                   "mov x29, SP\n\t"
                   __ASM_SEH(".seh_set_fp\n\t")
                   __ASM_SEH(".seh_endprologue\n\t")
                   "mov x9, x0\n\t"
                   "cbz w1, 4f\n\t"
                   "lsl w8, w1, #3\n\t"
                   "cmp w8, #64\n\t"
                   "b.ge 1f\n\t"
                   "mov w8, #64\n"
                   "1:\ttbz w8, #3, 2f\n\t"
                   "add w8, w8, #8\n"
                   "2:\tsub x10, x29, x8\n\t"
                   "mov sp, x10\n"
                   "3:\tldr x0, [x2], #8\n\t"
                   "str x0, [x10], #8\n\t"
                   "subs w1, w1, #1\n\t"
                   "b.ne 3b\n\t"
                   "ldp x0, x1, [sp], #16\n\t"
                   "ldp x2, x3, [sp], #16\n\t"
                   "ldp x4, x5, [sp], #16\n\t"
                   "ldp x6, x7, [sp], #16\n"
                   "4:\tblr x9\n\t"
                   "mov SP, x29\n\t"
                   "ldp x29, x30, [SP], #16\n\t"
                   "ret\n" )

#else

static void CDECL _vcomp_fork_call_wrapper(void *wrapper, int nargs, void **args)
{
    ERR("Not implemented for this architecture\n");
}

#endif

#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))

static inline char interlocked_cmpxchg8(char *dest, char xchg, char compare)
{
    char ret;
    __asm__ __volatile__( "lock; cmpxchgb %2,(%1)"
                          : "=a" (ret) : "r" (dest), "q" (xchg), "0" (compare) : "memory" );
    return ret;
}

static inline short interlocked_cmpxchg16(short *dest, short xchg, short compare)
{
    short ret;
    __asm__ __volatile__( "lock; cmpxchgw %2,(%1)"
                          : "=a" (ret) : "r" (dest), "r" (xchg), "0" (compare) : "memory" );
    return ret;
}

static inline char interlocked_xchg_add8(char *dest, char incr)
{
    char ret;
    __asm__ __volatile__( "lock; xaddb %0,(%1)"
                          : "=q" (ret) : "r" (dest), "0" (incr) : "memory" );
    return ret;
}

static inline short interlocked_xchg_add16(short *dest, short incr)
{
    short ret;
    __asm__ __volatile__( "lock; xaddw %0,(%1)"
                          : "=r" (ret) : "r" (dest), "0" (incr) : "memory" );
    return ret;
}

#else  /* __GNUC__ */

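/* Without GCC-style inline assembly, prefer the compiler's atomic builtins
 * where available and otherwise emulate the 8- and 16-bit interlocked
 * operations under vcomp_section; slower, but correct on any host. */
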
#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1
static inline char interlocked_cmpxchg8(char *dest, char xchg, char compare)
{
    return __sync_val_compare_and_swap(dest, compare, xchg);
}

static inline char interlocked_xchg_add8(char *dest, char incr)
{
    return __sync_fetch_and_add(dest, incr);
}
#else
static char interlocked_cmpxchg8(char *dest, char xchg, char compare)
{
    EnterCriticalSection(&vcomp_section);
    if (*dest == compare) *dest = xchg; else compare = *dest;
    LeaveCriticalSection(&vcomp_section);
    return compare;
}

static char interlocked_xchg_add8(char *dest, char incr)
{
    char ret;
    EnterCriticalSection(&vcomp_section);
    ret = *dest; *dest += incr;
    LeaveCriticalSection(&vcomp_section);
    return ret;
}
#endif

#ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2
static inline short interlocked_cmpxchg16(short *dest, short xchg, short compare)
{
    return __sync_val_compare_and_swap(dest, compare, xchg);
}

static inline short interlocked_xchg_add16(short *dest, short incr)
{
    return __sync_fetch_and_add(dest, incr);
}
#else
static short interlocked_cmpxchg16(short *dest, short xchg, short compare)
{
    EnterCriticalSection(&vcomp_section);
    if (*dest == compare) *dest = xchg; else compare = *dest;
    LeaveCriticalSection(&vcomp_section);
    return compare;
}

static short interlocked_xchg_add16(short *dest, short incr)
{
    short ret;
    EnterCriticalSection(&vcomp_section);
    ret = *dest; *dest += incr;
    LeaveCriticalSection(&vcomp_section);
    return ret;
}
#endif

#endif  /* __GNUC__ */

static inline struct vcomp_thread_data *vcomp_get_thread_data(void)
{
    return (struct vcomp_thread_data *)TlsGetValue(vcomp_context_tls);
}

static inline void vcomp_set_thread_data(struct vcomp_thread_data *thread_data)
{
    TlsSetValue(vcomp_context_tls, thread_data);
}

static struct vcomp_thread_data *vcomp_init_thread_data(void)
{
    struct vcomp_thread_data *thread_data = vcomp_get_thread_data();
    struct
    {
        struct vcomp_thread_data thread;
        struct vcomp_task_data task;
    } *data;

    if (thread_data) return thread_data;
    if (!(data = HeapAlloc(GetProcessHeap(), 0, sizeof(*data))))
    {
        ERR("could not create thread data\n");
        ExitProcess(1);
    }

    data->task.single = 0;
    data->task.section = 0;
    data->task.dynamic = 0;

    thread_data = &data->thread;
    thread_data->team = NULL;
    thread_data->task = &data->task;
    thread_data->thread_num = 0;
    thread_data->parallel = FALSE;
    thread_data->fork_threads = 0;
    thread_data->single = 1;
    thread_data->section = 1;
    thread_data->dynamic = 1;
    thread_data->dynamic_type = 0;

    vcomp_set_thread_data(thread_data);
    return thread_data;
}

static void vcomp_free_thread_data(void)
{
    struct vcomp_thread_data *thread_data = vcomp_get_thread_data();
    if (!thread_data) return;

    HeapFree(GetProcessHeap(), 0, thread_data);
    vcomp_set_thread_data(NULL);
}

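/* The _vcomp_atomic_* helpers below back "#pragma omp atomic" for each
 * operand size, all with the same compare-and-swap loop: re-read the old
 * value, compute the new one, and retry until no other thread raced with
 * us in between.  For example (illustrative only; the actual call is
 * emitted by the compiler):
 *
 *     char x = 0;
 *     #pragma omp atomic
 *     x &= 0x0f;            // corresponds to _vcomp_atomic_and_i1(&x, 0x0f)
 */
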
void CDECL _vcomp_atomic_add_i1(char *dest, char val)
{
    interlocked_xchg_add8(dest, val);
}

void CDECL _vcomp_atomic_and_i1(char *dest, char val)
{
    char old;
    do old = *dest; while (interlocked_cmpxchg8(dest, old & val, old) != old);
}

void CDECL _vcomp_atomic_div_i1(signed char *dest, signed char val)
{
    signed char old;
    do old = *dest; while ((signed char)interlocked_cmpxchg8((char *)dest, old / val, old) != old);
}

void CDECL _vcomp_atomic_div_ui1(unsigned char *dest, unsigned char val)
{
    unsigned char old;
    do old = *dest; while ((unsigned char)interlocked_cmpxchg8((char *)dest, old / val, old) != old);
}

void CDECL _vcomp_atomic_mul_i1(char *dest, char val)
{
    char old;
    do old = *dest; while (interlocked_cmpxchg8(dest, old * val, old) != old);
}

void CDECL _vcomp_atomic_or_i1(char *dest, char val)
{
    char old;
    do old = *dest; while (interlocked_cmpxchg8(dest, old | val, old) != old);
}

void CDECL _vcomp_atomic_shl_i1(char *dest, unsigned int val)
{
    char old;
    do old = *dest; while (interlocked_cmpxchg8(dest, old << val, old) != old);
}

void CDECL _vcomp_atomic_shr_i1(signed char *dest, unsigned int val)
{
    signed char old;
    do old = *dest; while ((signed char)interlocked_cmpxchg8((char *)dest, old >> val, old) != old);
}

void CDECL _vcomp_atomic_shr_ui1(unsigned char *dest, unsigned int val)
{
    unsigned char old;
    do old = *dest; while ((unsigned char)interlocked_cmpxchg8((char *)dest, old >> val, old) != old);
}

void CDECL _vcomp_atomic_sub_i1(char *dest, char val)
{
    interlocked_xchg_add8(dest, -val);
}

void CDECL _vcomp_atomic_xor_i1(char *dest, char val)
{
    char old;
    do old = *dest; while (interlocked_cmpxchg8(dest, old ^ val, old) != old);
}

static void CDECL _vcomp_atomic_bool_and_i1(char *dest, char val)
{
    char old;
    do old = *dest; while (interlocked_cmpxchg8(dest, old && val, old) != old);
}

static void CDECL _vcomp_atomic_bool_or_i1(char *dest, char val)
{
    char old;
    do old = *dest; while (interlocked_cmpxchg8(dest, old ? old : (val != 0), old) != old);
}

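/* For reductions the operation is encoded in bits 8-11 of the flags word;
 * the function tables below are indexed with that value, clamped to the
 * table size, so unknown operations fall back to the last entry. */
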
void CDECL _vcomp_reduction_i1(unsigned int flags, char *dest, char val)
{
    static void (CDECL * const funcs[])(char *, char) =
    {
        _vcomp_atomic_add_i1,
        _vcomp_atomic_add_i1,
        _vcomp_atomic_mul_i1,
        _vcomp_atomic_and_i1,
        _vcomp_atomic_or_i1,
        _vcomp_atomic_xor_i1,
        _vcomp_atomic_bool_and_i1,
        _vcomp_atomic_bool_or_i1,
    };
    unsigned int op = (flags >> 8) & 0xf;
    op = min(op, ARRAY_SIZE(funcs) - 1);
    funcs[op](dest, val);
}

void CDECL _vcomp_atomic_add_i2(short *dest, short val)
{
    interlocked_xchg_add16(dest, val);
}

void CDECL _vcomp_atomic_and_i2(short *dest, short val)
{
    short old;
    do old = *dest; while (interlocked_cmpxchg16(dest, old & val, old) != old);
}

void CDECL _vcomp_atomic_div_i2(short *dest, short val)
{
    short old;
    do old = *dest; while (interlocked_cmpxchg16(dest, old / val, old) != old);
}

void CDECL _vcomp_atomic_div_ui2(unsigned short *dest, unsigned short val)
{
    unsigned short old;
    do old = *dest; while ((unsigned short)interlocked_cmpxchg16((short *)dest, old / val, old) != old);
}

void CDECL _vcomp_atomic_mul_i2(short *dest, short val)
{
    short old;
    do old = *dest; while (interlocked_cmpxchg16(dest, old * val, old) != old);
}

void CDECL _vcomp_atomic_or_i2(short *dest, short val)
{
    short old;
    do old = *dest; while (interlocked_cmpxchg16(dest, old | val, old) != old);
}

void CDECL _vcomp_atomic_shl_i2(short *dest, unsigned int val)
{
    short old;
    do old = *dest; while (interlocked_cmpxchg16(dest, old << val, old) != old);
}

void CDECL _vcomp_atomic_shr_i2(short *dest, unsigned int val)
{
    short old;
    do old = *dest; while (interlocked_cmpxchg16(dest, old >> val, old) != old);
}

void CDECL _vcomp_atomic_shr_ui2(unsigned short *dest, unsigned int val)
{
    unsigned short old;
    do old = *dest; while ((unsigned short)interlocked_cmpxchg16((short *)dest, old >> val, old) != old);
}

void CDECL _vcomp_atomic_sub_i2(short *dest, short val)
{
    interlocked_xchg_add16(dest, -val);
}

void CDECL _vcomp_atomic_xor_i2(short *dest, short val)
{
    short old;
    do old = *dest; while (interlocked_cmpxchg16(dest, old ^ val, old) != old);
}

static void CDECL _vcomp_atomic_bool_and_i2(short *dest, short val)
{
    short old;
    do old = *dest; while (interlocked_cmpxchg16(dest, old && val, old) != old);
}

static void CDECL _vcomp_atomic_bool_or_i2(short *dest, short val)
{
    short old;
    do old = *dest; while (interlocked_cmpxchg16(dest, old ? old : (val != 0), old) != old);
}

void CDECL _vcomp_reduction_i2(unsigned int flags, short *dest, short val)
{
    static void (CDECL * const funcs[])(short *, short) =
    {
        _vcomp_atomic_add_i2,
        _vcomp_atomic_add_i2,
        _vcomp_atomic_mul_i2,
        _vcomp_atomic_and_i2,
        _vcomp_atomic_or_i2,
        _vcomp_atomic_xor_i2,
        _vcomp_atomic_bool_and_i2,
        _vcomp_atomic_bool_or_i2,
    };
    unsigned int op = (flags >> 8) & 0xf;
    op = min(op, ARRAY_SIZE(funcs) - 1);
    funcs[op](dest, val);
}

void CDECL _vcomp_atomic_add_i4(int *dest, int val)
{
    InterlockedExchangeAdd((LONG *)dest, val);
}

void CDECL _vcomp_atomic_and_i4(int *dest, int val)
{
    int old;
    do old = *dest; while (InterlockedCompareExchange((LONG *)dest, old & val, old) != old);
}

void CDECL _vcomp_atomic_div_i4(int *dest, int val)
{
    int old;
    do old = *dest; while (InterlockedCompareExchange((LONG *)dest, old / val, old) != old);
}

void CDECL _vcomp_atomic_div_ui4(unsigned int *dest, unsigned int val)
{
    unsigned int old;
    do old = *dest; while (InterlockedCompareExchange((LONG *)dest, old / val, old) != old);
}

void CDECL _vcomp_atomic_mul_i4(int *dest, int val)
{
    int old;
    do old = *dest; while (InterlockedCompareExchange((LONG *)dest, old * val, old) != old);
}

void CDECL _vcomp_atomic_or_i4(int *dest, int val)
{
    int old;
    do old = *dest; while (InterlockedCompareExchange((LONG *)dest, old | val, old) != old);
}

void CDECL _vcomp_atomic_shl_i4(int *dest, int val)
{
    int old;
    do old = *dest; while (InterlockedCompareExchange((LONG *)dest, old << val, old) != old);
}

void CDECL _vcomp_atomic_shr_i4(int *dest, int val)
{
    int old;
    do old = *dest; while (InterlockedCompareExchange((LONG *)dest, old >> val, old) != old);
}

void CDECL _vcomp_atomic_shr_ui4(unsigned int *dest, unsigned int val)
{
    unsigned int old;
    do old = *dest; while (InterlockedCompareExchange((LONG *)dest, old >> val, old) != old);
}

void CDECL _vcomp_atomic_sub_i4(int *dest, int val)
{
    InterlockedExchangeAdd((LONG *)dest, -val);
}

void CDECL _vcomp_atomic_xor_i4(int *dest, int val)
{
    int old;
    do old = *dest; while (InterlockedCompareExchange((LONG *)dest, old ^ val, old) != old);
}

static void CDECL _vcomp_atomic_bool_and_i4(int *dest, int val)
{
    int old;
    do old = *dest; while (InterlockedCompareExchange((LONG *)dest, old && val, old) != old);
}

static void CDECL _vcomp_atomic_bool_or_i4(int *dest, int val)
{
    int old;
    do old = *dest; while (InterlockedCompareExchange((LONG *)dest, old ? old : (val != 0), old) != old);
}

void CDECL _vcomp_reduction_i4(unsigned int flags, int *dest, int val)
{
    static void (CDECL * const funcs[])(int *, int) =
    {
        _vcomp_atomic_add_i4,
        _vcomp_atomic_add_i4,
        _vcomp_atomic_mul_i4,
        _vcomp_atomic_and_i4,
        _vcomp_atomic_or_i4,
        _vcomp_atomic_xor_i4,
        _vcomp_atomic_bool_and_i4,
        _vcomp_atomic_bool_or_i4,
    };
    unsigned int op = (flags >> 8) & 0xf;
    op = min(op, ARRAY_SIZE(funcs) - 1);
    funcs[op](dest, val);
}

void CDECL _vcomp_atomic_add_i8(LONG64 *dest, LONG64 val)
{
    LONG64 old;
    do old = *dest; while (InterlockedCompareExchange64(dest, old + val, old) != old);
}

void CDECL _vcomp_atomic_and_i8(LONG64 *dest, LONG64 val)
{
    LONG64 old;
    do old = *dest; while (InterlockedCompareExchange64(dest, old & val, old) != old);
}

void CDECL _vcomp_atomic_div_i8(LONG64 *dest, LONG64 val)
{
    LONG64 old;
    do old = *dest; while (InterlockedCompareExchange64(dest, old / val, old) != old);
}

void CDECL _vcomp_atomic_div_ui8(ULONG64 *dest, ULONG64 val)
{
    ULONG64 old;
    do old = *dest; while (InterlockedCompareExchange64((LONG64 *)dest, old / val, old) != old);
}

void CDECL _vcomp_atomic_mul_i8(LONG64 *dest, LONG64 val)
{
    LONG64 old;
    do old = *dest; while (InterlockedCompareExchange64(dest, old * val, old) != old);
}

void CDECL _vcomp_atomic_or_i8(LONG64 *dest, LONG64 val)
{
    LONG64 old;
    do old = *dest; while (InterlockedCompareExchange64(dest, old | val, old) != old);
}

void CDECL _vcomp_atomic_shl_i8(LONG64 *dest, unsigned int val)
{
    LONG64 old;
    do old = *dest; while (InterlockedCompareExchange64(dest, old << val, old) != old);
}

void CDECL _vcomp_atomic_shr_i8(LONG64 *dest, unsigned int val)
{
    LONG64 old;
    do old = *dest; while (InterlockedCompareExchange64(dest, old >> val, old) != old);
}

void CDECL _vcomp_atomic_shr_ui8(ULONG64 *dest, unsigned int val)
{
    ULONG64 old;
    do old = *dest; while (InterlockedCompareExchange64((LONG64 *)dest, old >> val, old) != old);
}

void CDECL _vcomp_atomic_sub_i8(LONG64 *dest, LONG64 val)
{
    LONG64 old;
    do old = *dest; while (InterlockedCompareExchange64(dest, old - val, old) != old);
}

void CDECL _vcomp_atomic_xor_i8(LONG64 *dest, LONG64 val)
{
    LONG64 old;
    do old = *dest; while (InterlockedCompareExchange64(dest, old ^ val, old) != old);
}

static void CDECL _vcomp_atomic_bool_and_i8(LONG64 *dest, LONG64 val)
{
    LONG64 old;
    do old = *dest; while (InterlockedCompareExchange64(dest, old && val, old) != old);
}

static void CDECL _vcomp_atomic_bool_or_i8(LONG64 *dest, LONG64 val)
{
    LONG64 old;
    do old = *dest; while (InterlockedCompareExchange64(dest, old ? old : (val != 0), old) != old);
}

void CDECL _vcomp_reduction_i8(unsigned int flags, LONG64 *dest, LONG64 val)
{
    static void (CDECL * const funcs[])(LONG64 *, LONG64) =
    {
        _vcomp_atomic_add_i8,
        _vcomp_atomic_add_i8,
        _vcomp_atomic_mul_i8,
        _vcomp_atomic_and_i8,
        _vcomp_atomic_or_i8,
        _vcomp_atomic_xor_i8,
        _vcomp_atomic_bool_and_i8,
        _vcomp_atomic_bool_or_i8,
    };
    unsigned int op = (flags >> 8) & 0xf;
    op = min(op, ARRAY_SIZE(funcs) - 1);
    funcs[op](dest, val);
}

void CDECL _vcomp_atomic_add_r4(float *dest, float val)
{
    int old, new;
    do
    {
        old = *(int *)dest;
        *(float *)&new = *(float *)&old + val;
    }
    while (InterlockedCompareExchange((LONG *)dest, new, old) != old);
}

void CDECL _vcomp_atomic_div_r4(float *dest, float val)
{
    int old, new;
    do
    {
        old = *(int *)dest;
        *(float *)&new = *(float *)&old / val;
    }
    while (InterlockedCompareExchange((LONG *)dest, new, old) != old);
}

void CDECL _vcomp_atomic_mul_r4(float *dest, float val)
{
    int old, new;
    do
    {
        old = *(int *)dest;
        *(float *)&new = *(float *)&old * val;
    }
    while (InterlockedCompareExchange((LONG *)dest, new, old) != old);
}

void CDECL _vcomp_atomic_sub_r4(float *dest, float val)
{
    int old, new;
    do
    {
        old = *(int *)dest;
        *(float *)&new = *(float *)&old - val;
    }
    while (InterlockedCompareExchange((LONG *)dest, new, old) != old);
}

static void CDECL _vcomp_atomic_bool_and_r4(float *dest, float val)
{
    int old, new;
    do
    {
        old = *(int *)dest;
        *(float *)&new = (*(float *)&old != 0.0) ? (val != 0.0) : 0.0;
    }
    while (InterlockedCompareExchange((LONG *)dest, new, old) != old);
}

static void CDECL _vcomp_atomic_bool_or_r4(float *dest, float val)
{
    int old, new;
    do
    {
        old = *(int *)dest;
        *(float *)&new = (*(float *)&old != 0.0) ? *(float *)&old : (val != 0.0);
    }
    while (InterlockedCompareExchange((LONG *)dest, new, old) != old);
}

void CDECL _vcomp_reduction_r4(unsigned int flags, float *dest, float val)
{
    static void (CDECL * const funcs[])(float *, float) =
    {
        _vcomp_atomic_add_r4,
        _vcomp_atomic_add_r4,
        _vcomp_atomic_mul_r4,
        _vcomp_atomic_bool_or_r4,
        _vcomp_atomic_bool_or_r4,
        _vcomp_atomic_bool_or_r4,
        _vcomp_atomic_bool_and_r4,
        _vcomp_atomic_bool_or_r4,
    };
    unsigned int op = (flags >> 8) & 0xf;
    op = min(op, ARRAY_SIZE(funcs) - 1);
    funcs[op](dest, val);
}

void CDECL _vcomp_atomic_add_r8(double *dest, double val)
{
    LONG64 old, new;
    do
    {
        old = *(LONG64 *)dest;
        *(double *)&new = *(double *)&old + val;
    }
    while (InterlockedCompareExchange64((LONG64 *)dest, new, old) != old);
}

void CDECL _vcomp_atomic_div_r8(double *dest, double val)
{
    LONG64 old, new;
    do
    {
        old = *(LONG64 *)dest;
        *(double *)&new = *(double *)&old / val;
    }
    while (InterlockedCompareExchange64((LONG64 *)dest, new, old) != old);
}

void CDECL _vcomp_atomic_mul_r8(double *dest, double val)
{
    LONG64 old, new;
    do
    {
        old = *(LONG64 *)dest;
        *(double *)&new = *(double *)&old * val;
    }
    while (InterlockedCompareExchange64((LONG64 *)dest, new, old) != old);
}

void CDECL _vcomp_atomic_sub_r8(double *dest, double val)
{
    LONG64 old, new;
    do
    {
        old = *(LONG64 *)dest;
        *(double *)&new = *(double *)&old - val;
    }
    while (InterlockedCompareExchange64((LONG64 *)dest, new, old) != old);
}

static void CDECL _vcomp_atomic_bool_and_r8(double *dest, double val)
{
    LONG64 old, new;
    do
    {
        old = *(LONG64 *)dest;
        *(double *)&new = (*(double *)&old != 0.0) ? (val != 0.0) : 0.0;
    }
    while (InterlockedCompareExchange64((LONG64 *)dest, new, old) != old);
}

static void CDECL _vcomp_atomic_bool_or_r8(double *dest, double val)
{
    LONG64 old, new;
    do
    {
        old = *(LONG64 *)dest;
        *(double *)&new = (*(double *)&old != 0.0) ? *(double *)&old : (val != 0.0);
    }
    while (InterlockedCompareExchange64((LONG64 *)dest, new, old) != old);
}

void CDECL _vcomp_reduction_r8(unsigned int flags, double *dest, double val)
{
    static void (CDECL * const funcs[])(double *, double) =
    {
        _vcomp_atomic_add_r8,
        _vcomp_atomic_add_r8,
        _vcomp_atomic_mul_r8,
        _vcomp_atomic_bool_or_r8,
        _vcomp_atomic_bool_or_r8,
        _vcomp_atomic_bool_or_r8,
        _vcomp_atomic_bool_and_r8,
        _vcomp_atomic_bool_or_r8,
    };
    unsigned int op = (flags >> 8) & 0xf;
    op = min(op, ARRAY_SIZE(funcs) - 1);
    funcs[op](dest, val);
}

int CDECL omp_get_dynamic(void)
{
    TRACE("stub\n");
    return 0;
}

int CDECL omp_get_max_threads(void)
{
    TRACE("()\n");
    return vcomp_max_threads;
}

int CDECL omp_get_nested(void)
{
    TRACE("stub\n");
    return vcomp_nested_fork;
}

int CDECL omp_get_num_procs(void)
{
    TRACE("\n");
    return vcomp_num_procs;
}

int CDECL omp_get_num_threads(void)
{
    struct vcomp_team_data *team_data = vcomp_init_thread_data()->team;
    TRACE("()\n");
    return team_data ? team_data->num_threads : 1;
}

int CDECL omp_get_thread_num(void)
{
    TRACE("()\n");
    return vcomp_init_thread_data()->thread_num;
}

int CDECL _vcomp_get_thread_num(void)
{
    TRACE("()\n");
    return vcomp_init_thread_data()->thread_num;
}

/* Time in seconds since "some time in the past" */
double CDECL omp_get_wtime(void)
{
    return GetTickCount() / 1000.0;
}

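/* GetTickCount() has roughly 10-16 ms granularity and wraps after about
 * 49.7 days; that is enough for omp_get_wtime()'s "time since some point
 * in the past" contract, though a higher-resolution clock would also do. */
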
void CDECL omp_set_dynamic(int val)
{
    TRACE("(%d): stub\n", val);
}

void CDECL omp_set_nested(int nested)
{
    TRACE("(%d)\n", nested);
    vcomp_nested_fork = (nested != 0);
}

void CDECL omp_set_num_threads(int num_threads)
{
    TRACE("(%d)\n", num_threads);
    if (num_threads >= 1)
        vcomp_num_threads = num_threads;
}

void CDECL _vcomp_flush(void)
{
    TRACE("(): stub\n");
}

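/* _vcomp_barrier() is a generation-counting barrier: the last thread to
 * arrive bumps team->barrier and wakes everyone, while earlier arrivals
 * sleep until the generation changes.  Tracking the generation rather than
 * just the count makes the barrier immediately reusable for the next
 * synchronization point. */
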
void CDECL _vcomp_barrier(void)
{
    struct vcomp_team_data *team_data = vcomp_init_thread_data()->team;

    TRACE("()\n");

    if (!team_data)
        return;

    EnterCriticalSection(&vcomp_section);
    if (++team_data->barrier_count >= team_data->num_threads)
    {
        team_data->barrier++;
        team_data->barrier_count = 0;
        WakeAllConditionVariable(&team_data->cond);
    }
    else
    {
        unsigned int barrier = team_data->barrier;
        while (team_data->barrier == barrier)
            SleepConditionVariableCS(&team_data->cond, &vcomp_section, INFINITE);
    }
    LeaveCriticalSection(&vcomp_section);
}

void CDECL _vcomp_set_num_threads(int num_threads)
{
    TRACE("(%d)\n", num_threads);
    if (num_threads >= 1)
        vcomp_init_thread_data()->fork_threads = num_threads;
}

int CDECL _vcomp_master_begin(void)
{
    TRACE("()\n");
    return !vcomp_init_thread_data()->thread_num;
}

void CDECL _vcomp_master_end(void)
{
    TRACE("()\n");
    /* nothing to do here */
}

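/* "single" and "sections" blocks are claimed through per-thread and
 * per-task generation counters: each thread increments its own counter on
 * entry, and the first thread to advance past the task counter wins the
 * block.  The signed (int)(a - b) > 0 comparison keeps the test correct
 * even after the unsigned counters wrap around. */
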
int CDECL _vcomp_single_begin(int flags)
{
    struct vcomp_thread_data *thread_data = vcomp_init_thread_data();
    struct vcomp_task_data *task_data = thread_data->task;
    int ret = FALSE;

    TRACE("(%x): semi-stub\n", flags);

    EnterCriticalSection(&vcomp_section);
    thread_data->single++;
    if ((int)(thread_data->single - task_data->single) > 0)
    {
        task_data->single = thread_data->single;
        ret = TRUE;
    }
    LeaveCriticalSection(&vcomp_section);

    return ret;
}

void CDECL _vcomp_single_end(void)
{
    TRACE("()\n");
    /* nothing to do here */
}

void CDECL _vcomp_sections_init(int n)
{
    struct vcomp_thread_data *thread_data = vcomp_init_thread_data();
    struct vcomp_task_data *task_data = thread_data->task;

    TRACE("(%d)\n", n);

    EnterCriticalSection(&vcomp_section);
    thread_data->section++;
    if ((int)(thread_data->section - task_data->section) > 0)
    {
        task_data->section = thread_data->section;
        task_data->num_sections = n;
        task_data->section_index = 0;
    }
    LeaveCriticalSection(&vcomp_section);
}

int CDECL _vcomp_sections_next(void)
{
    struct vcomp_thread_data *thread_data = vcomp_init_thread_data();
    struct vcomp_task_data *task_data = thread_data->task;
    int i = -1;

    TRACE("()\n");

    EnterCriticalSection(&vcomp_section);
    if (thread_data->section == task_data->section &&
        task_data->section_index != task_data->num_sections)
    {
        i = task_data->section_index++;
    }
    LeaveCriticalSection(&vcomp_section);
    return i;
}

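/* The static-schedule helpers below split the iteration count as evenly as
 * possible and give the first "iterations % num_threads" threads one extra
 * iteration.  E.g. 10 iterations on 4 threads yield per-thread counts
 * 3, 3, 2, 2, with thread 0 starting at "first" and each later thread
 * offset by the iterations handed out before it. */
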
void CDECL _vcomp_for_static_simple_init(unsigned int first, unsigned int last, int step,
                                         BOOL increment, unsigned int *begin, unsigned int *end)
{
    unsigned int iterations, per_thread, remaining;
    struct vcomp_thread_data *thread_data = vcomp_init_thread_data();
    struct vcomp_team_data *team_data = thread_data->team;
    int num_threads = team_data ? team_data->num_threads : 1;
    int thread_num = thread_data->thread_num;

    TRACE("(%u, %u, %d, %u, %p, %p)\n", first, last, step, increment, begin, end);

    if (num_threads == 1)
    {
        *begin = first;
        *end = last;
        return;
    }

    if (step <= 0)
    {
        *begin = 0;
        *end = increment ? -1 : 1;
        return;
    }

    if (increment)
        iterations = 1 + (last - first) / step;
    else
    {
        iterations = 1 + (first - last) / step;
        step *= -1;
    }

    per_thread = iterations / num_threads;
    remaining = iterations - per_thread * num_threads;

    if (thread_num < remaining)
        per_thread++;
    else if (per_thread)
        first += remaining * step;
    else
    {
        *begin = first;
        *end = first - step;
        return;
    }

    *begin = first + per_thread * thread_num * step;
    *end = *begin + (per_thread - 1) * step;
}

void CDECL _vcomp_for_static_simple_init_i8(ULONG64 first, ULONG64 last, LONG64 step,
                                            BOOL increment, ULONG64 *begin, ULONG64 *end)
{
    ULONG64 iterations, per_thread, remaining;
    struct vcomp_thread_data *thread_data = vcomp_init_thread_data();
    struct vcomp_team_data *team_data = thread_data->team;
    int num_threads = team_data ? team_data->num_threads : 1;
    int thread_num = thread_data->thread_num;

    TRACE("(%s, %s, %s, %x, %p, %p)\n", wine_dbgstr_longlong(first), wine_dbgstr_longlong(last),
          wine_dbgstr_longlong(step), increment, begin, end);

    if (num_threads == 1)
    {
        *begin = first;
        *end = last;
        return;
    }

    if (step <= 0)
    {
        *begin = 0;
        *end = increment ? -1 : 1;
        return;
    }

    if (increment)
        iterations = 1 + (last - first) / step;
    else
    {
        iterations = 1 + (first - last) / step;
        step *= -1;
    }

    per_thread = iterations / num_threads;
    remaining = iterations - per_thread * num_threads;

    if (thread_num < remaining)
        per_thread++;
    else if (per_thread)
        first += remaining * step;
    else
    {
        *begin = first;
        *end = first - step;
        return;
    }

    *begin = first + per_thread * thread_num * step;
    *end = *begin + (per_thread - 1) * step;
}

void CDECL _vcomp_for_static_init(int first, int last, int step, int chunksize, unsigned int *loops,
                                  int *begin, int *end, int *next, int *lastchunk)
{
    unsigned int iterations, num_chunks, per_thread, remaining;
    struct vcomp_thread_data *thread_data = vcomp_init_thread_data();
    struct vcomp_team_data *team_data = thread_data->team;
    int num_threads = team_data ? team_data->num_threads : 1;
    int thread_num = thread_data->thread_num;
    int no_begin, no_lastchunk;

    TRACE("(%d, %d, %d, %d, %p, %p, %p, %p, %p)\n",
          first, last, step, chunksize, loops, begin, end, next, lastchunk);

    if (!begin)
    {
        begin = &no_begin;
        lastchunk = &no_lastchunk;
    }

    if (num_threads == 1 && chunksize != 1)
    {
        *loops = 1;
        *begin = first;
        *end = last;
        *next = 0;
        *lastchunk = first;
        return;
    }

    if (first == last)
    {
        *loops = !thread_num;
        if (!thread_num)
        {
            *begin = first;
            *end = last;
            *next = 0;
            *lastchunk = first;
        }
        return;
    }

    if (step <= 0)
    {
        *loops = 0;
        return;
    }

    if (first < last)
        iterations = 1 + (last - first) / step;
    else
    {
        iterations = 1 + (first - last) / step;
        step *= -1;
    }

    if (chunksize < 1)
        chunksize = 1;

    num_chunks = ((DWORD64)iterations + chunksize - 1) / chunksize;
    per_thread = num_chunks / num_threads;
    remaining = num_chunks - per_thread * num_threads;

    *loops = per_thread + (thread_num < remaining);
    *begin = first + thread_num * chunksize * step;
    *end = *begin + (chunksize - 1) * step;
    *next = chunksize * num_threads * step;
    *lastchunk = first + (num_chunks - 1) * chunksize * step;
}

void CDECL _vcomp_for_static_init_i8(LONG64 first, LONG64 last, LONG64 step, LONG64 chunksize, ULONG64 *loops,
                                     LONG64 *begin, LONG64 *end, LONG64 *next, LONG64 *lastchunk)
{
    ULONG64 iterations, num_chunks, per_thread, remaining;
    struct vcomp_thread_data *thread_data = vcomp_init_thread_data();
    struct vcomp_team_data *team_data = thread_data->team;
    int num_threads = team_data ? team_data->num_threads : 1;
    int thread_num = thread_data->thread_num;
    LONG64 no_begin, no_lastchunk;

    TRACE("(%s, %s, %s, %s, %p, %p, %p, %p, %p)\n",
          wine_dbgstr_longlong(first), wine_dbgstr_longlong(last),
          wine_dbgstr_longlong(step), wine_dbgstr_longlong(chunksize),
          loops, begin, end, next, lastchunk);

    if (!begin)
    {
        begin = &no_begin;
        lastchunk = &no_lastchunk;
    }

    if (num_threads == 1 && chunksize != 1)
    {
        *loops = 1;
        *begin = first;
        *end = last;
        *next = 0;
        *lastchunk = first;
        return;
    }

    if (first == last)
    {
        *loops = !thread_num;
        if (!thread_num)
        {
            *begin = first;
            *end = last;
            *next = 0;
            *lastchunk = first;
        }
        return;
    }

    if (step <= 0)
    {
        *loops = 0;
        return;
    }

    if (first < last)
        iterations = 1 + (last - first) / step;
    else
    {
        iterations = 1 + (first - last) / step;
        step *= -1;
    }

    if (chunksize < 1)
        chunksize = 1;

    num_chunks = iterations / chunksize;
    if (iterations % chunksize) num_chunks++;
    per_thread = num_chunks / num_threads;
    remaining = num_chunks - per_thread * num_threads;

    *loops = per_thread + (thread_num < remaining);
    *begin = first + thread_num * chunksize * step;
    *end = *begin + (chunksize - 1) * step;
    *next = chunksize * num_threads * step;
    *lastchunk = first + (num_chunks - 1) * chunksize * step;
}

void CDECL _vcomp_for_static_end(void)
{
    TRACE("()\n");
    /* nothing to do here */
}

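/* Dynamic worklists live in the shared task data: chunked scheduling hands
 * out fixed-size chunks, while guided scheduling hands out shrinking slices
 * of roughly remaining/num_threads iterations until the chunk size becomes
 * the floor.  Threads pull work from _vcomp_for_dynamic_next() under
 * vcomp_section until the iteration budget is exhausted. */
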
void CDECL _vcomp_for_dynamic_init(unsigned int flags, unsigned int first, unsigned int last,
                                   int step, unsigned int chunksize)
{
    unsigned int iterations, per_thread, remaining;
    struct vcomp_thread_data *thread_data = vcomp_init_thread_data();
    struct vcomp_team_data *team_data = thread_data->team;
    struct vcomp_task_data *task_data = thread_data->task;
    int num_threads = team_data ? team_data->num_threads : 1;
    int thread_num = thread_data->thread_num;
    unsigned int type = flags & ~VCOMP_DYNAMIC_FLAGS_INCREMENT;

    TRACE("(%u, %u, %u, %d, %u)\n", flags, first, last, step, chunksize);

    if (step <= 0)
    {
        thread_data->dynamic_type = 0;
        return;
    }

    if (flags & VCOMP_DYNAMIC_FLAGS_INCREMENT)
        iterations = 1 + (last - first) / step;
    else
    {
        iterations = 1 + (first - last) / step;
        step *= -1;
    }

    if (type == VCOMP_DYNAMIC_FLAGS_STATIC)
    {
        per_thread = iterations / num_threads;
        remaining = iterations - per_thread * num_threads;

        if (thread_num < remaining)
            per_thread++;
        else if (per_thread)
            first += remaining * step;
        else
        {
            thread_data->dynamic_type = 0;
            return;
        }

        thread_data->dynamic_type = VCOMP_DYNAMIC_FLAGS_STATIC;
        thread_data->dynamic_begin = first + per_thread * thread_num * step;
        thread_data->dynamic_end = thread_data->dynamic_begin + (per_thread - 1) * step;
    }
    else
    {
        if (type != VCOMP_DYNAMIC_FLAGS_CHUNKED &&
            type != VCOMP_DYNAMIC_FLAGS_GUIDED)
        {
            FIXME("unsupported flags %u\n", flags);
            type = VCOMP_DYNAMIC_FLAGS_GUIDED;
        }

        EnterCriticalSection(&vcomp_section);
        thread_data->dynamic++;
        thread_data->dynamic_type = type;
        if ((int)(thread_data->dynamic - task_data->dynamic) > 0)
        {
            task_data->dynamic = thread_data->dynamic;
            task_data->dynamic_first = first;
            task_data->dynamic_last = last;
            task_data->dynamic_iterations = iterations;
            task_data->dynamic_step = step;
            task_data->dynamic_chunksize = chunksize;
        }
        LeaveCriticalSection(&vcomp_section);
    }
}

int CDECL _vcomp_for_dynamic_next(unsigned int *begin, unsigned int *end)
{
    struct vcomp_thread_data *thread_data = vcomp_init_thread_data();
    struct vcomp_task_data *task_data = thread_data->task;
    struct vcomp_team_data *team_data = thread_data->team;
    int num_threads = team_data ? team_data->num_threads : 1;

    TRACE("(%p, %p)\n", begin, end);

    if (thread_data->dynamic_type == VCOMP_DYNAMIC_FLAGS_STATIC)
    {
        *begin = thread_data->dynamic_begin;
        *end = thread_data->dynamic_end;
        thread_data->dynamic_type = 0;
        return 1;
    }
    else if (thread_data->dynamic_type == VCOMP_DYNAMIC_FLAGS_CHUNKED ||
             thread_data->dynamic_type == VCOMP_DYNAMIC_FLAGS_GUIDED)
    {
        unsigned int iterations = 0;
        EnterCriticalSection(&vcomp_section);
        if (thread_data->dynamic == task_data->dynamic &&
            task_data->dynamic_iterations != 0)
        {
            iterations = min(task_data->dynamic_iterations, task_data->dynamic_chunksize);
            if (thread_data->dynamic_type == VCOMP_DYNAMIC_FLAGS_GUIDED &&
                task_data->dynamic_iterations > num_threads * task_data->dynamic_chunksize)
            {
                iterations = (task_data->dynamic_iterations + num_threads - 1) / num_threads;
            }
            *begin = task_data->dynamic_first;
            *end = task_data->dynamic_first + (iterations - 1) * task_data->dynamic_step;
            task_data->dynamic_iterations -= iterations;
            task_data->dynamic_first += iterations * task_data->dynamic_step;
            if (!task_data->dynamic_iterations)
                *end = task_data->dynamic_last;
        }
        LeaveCriticalSection(&vcomp_section);
        return iterations != 0;
    }

    return 0;
}

int CDECL omp_in_parallel(void)
{
    TRACE("()\n");
    return vcomp_init_thread_data()->parallel;
}

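/* Team threads are pooled: a finished worker parks itself on
 * vcomp_idle_threads and waits on its condition variable, exiting only
 * after sitting idle for 5 seconds.  _vcomp_fork() recycles idle workers
 * first and spawns new threads only when the pool runs dry, pinning the
 * module with GetModuleHandleExW() so it cannot be unloaded while a worker
 * is alive; the worker drops that pin via FreeLibraryAndExitThread(). */
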
static DWORD WINAPI _vcomp_fork_worker(void *param)
{
    struct vcomp_thread_data *thread_data = param;
    vcomp_set_thread_data(thread_data);

    TRACE("starting worker thread for %p\n", thread_data);

    EnterCriticalSection(&vcomp_section);
    for (;;)
    {
        struct vcomp_team_data *team = thread_data->team;
        if (team != NULL)
        {
            LeaveCriticalSection(&vcomp_section);
            _vcomp_fork_call_wrapper(team->wrapper, team->nargs, ptr_from_va_list(team->valist));
            EnterCriticalSection(&vcomp_section);

            thread_data->team = NULL;
            list_remove(&thread_data->entry);
            list_add_tail(&vcomp_idle_threads, &thread_data->entry);
            if (++team->finished_threads >= team->num_threads)
                WakeAllConditionVariable(&team->cond);
        }

        if (!SleepConditionVariableCS(&thread_data->cond, &vcomp_section, 5000) &&
            GetLastError() == ERROR_TIMEOUT && !thread_data->team)
        {
            break;
        }
    }
    list_remove(&thread_data->entry);
    LeaveCriticalSection(&vcomp_section);

    TRACE("terminating worker thread for %p\n", thread_data);

    HeapFree(GetProcessHeap(), 0, thread_data);
    vcomp_set_thread_data(NULL);
    FreeLibraryAndExitThread(vcomp_module, 0);
    return 0;
}

void WINAPIV _vcomp_fork(BOOL ifval, int nargs, void *wrapper, ...)
{
    struct vcomp_thread_data *prev_thread_data = vcomp_init_thread_data();
    struct vcomp_thread_data thread_data;
    struct vcomp_team_data team_data;
    struct vcomp_task_data task_data;
    int num_threads;

    TRACE("(%d, %d, %p, ...)\n", ifval, nargs, wrapper);

    if (prev_thread_data->parallel && !vcomp_nested_fork)
        ifval = FALSE;

    if (!ifval)
        num_threads = 1;
    else if (prev_thread_data->fork_threads)
        num_threads = prev_thread_data->fork_threads;
    else
        num_threads = vcomp_num_threads;

    InitializeConditionVariable(&team_data.cond);
    team_data.num_threads = 1;
    team_data.finished_threads = 0;
    team_data.nargs = nargs;
    team_data.wrapper = wrapper;
    va_start(team_data.valist, wrapper);
    team_data.barrier = 0;
    team_data.barrier_count = 0;

    task_data.single = 0;
    task_data.section = 0;
    task_data.dynamic = 0;

    thread_data.team = &team_data;
    thread_data.task = &task_data;
    thread_data.thread_num = 0;
    thread_data.parallel = ifval || prev_thread_data->parallel;
    thread_data.fork_threads = 0;
    thread_data.single = 1;
    thread_data.section = 1;
    thread_data.dynamic = 1;
    thread_data.dynamic_type = 0;
    list_init(&thread_data.entry);
    InitializeConditionVariable(&thread_data.cond);

    if (num_threads > 1)
    {
        struct list *ptr;
        EnterCriticalSection(&vcomp_section);

        /* reuse existing threads (if any) */
        while (team_data.num_threads < num_threads && (ptr = list_head(&vcomp_idle_threads)))
        {
            struct vcomp_thread_data *data = LIST_ENTRY(ptr, struct vcomp_thread_data, entry);
            data->team = &team_data;
            data->task = &task_data;
            data->thread_num = team_data.num_threads++;
            data->parallel = thread_data.parallel;
            data->fork_threads = 0;
            data->single = 1;
            data->section = 1;
            data->dynamic = 1;
            data->dynamic_type = 0;
            list_remove(&data->entry);
            list_add_tail(&thread_data.entry, &data->entry);
            WakeAllConditionVariable(&data->cond);
        }

        /* spawn additional threads */
        while (team_data.num_threads < num_threads)
        {
            struct vcomp_thread_data *data;
            HMODULE module;
            HANDLE thread;

            data = HeapAlloc(GetProcessHeap(), 0, sizeof(*data));
            if (!data) break;

            data->team = &team_data;
            data->task = &task_data;
            data->thread_num = team_data.num_threads;
            data->parallel = thread_data.parallel;
            data->fork_threads = 0;
            data->single = 1;
            data->section = 1;
            data->dynamic = 1;
            data->dynamic_type = 0;
            InitializeConditionVariable(&data->cond);

            thread = CreateThread(NULL, 0, _vcomp_fork_worker, data, 0, NULL);
            if (!thread)
            {
                HeapFree(GetProcessHeap(), 0, data);
                break;
            }

            GetModuleHandleExW(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS,
                               (const WCHAR *)vcomp_module, &module);
            team_data.num_threads++;
            list_add_tail(&thread_data.entry, &data->entry);
            CloseHandle(thread);
        }

        LeaveCriticalSection(&vcomp_section);
    }

    vcomp_set_thread_data(&thread_data);
    _vcomp_fork_call_wrapper(team_data.wrapper, team_data.nargs, ptr_from_va_list(team_data.valist));
    vcomp_set_thread_data(prev_thread_data);
    prev_thread_data->fork_threads = 0;

    if (team_data.num_threads > 1)
    {
        EnterCriticalSection(&vcomp_section);

        team_data.finished_threads++;
        while (team_data.finished_threads < team_data.num_threads)
            SleepConditionVariableCS(&team_data.cond, &vcomp_section, INFINITE);

        LeaveCriticalSection(&vcomp_section);
        assert(list_empty(&thread_data.entry));
    }

    va_end(team_data.valist);
}

static CRITICAL_SECTION *alloc_critsect(void)
{
    CRITICAL_SECTION *critsect;
    if (!(critsect = HeapAlloc(GetProcessHeap(), 0, sizeof(*critsect))))
    {
        ERR("could not allocate critical section\n");
        ExitProcess(1);
    }

    InitializeCriticalSection(critsect);
    critsect->DebugInfo->Spare[0] = (DWORD_PTR)(__FILE__ ": critsect");
    return critsect;
}

static void destroy_critsect(CRITICAL_SECTION *critsect)
{
    if (!critsect) return;
    critsect->DebugInfo->Spare[0] = 0;
    DeleteCriticalSection(critsect);
    HeapFree(GetProcessHeap(), 0, critsect);
}

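/* OpenMP locks are heap-allocated critical sections.  A plain omp_lock_t
 * is not nestable, so re-acquiring it on the same thread would deadlock;
 * omp_set_lock() detects that with RtlIsCriticalSectionLockedByThread()
 * and aborts instead, while the omp_nest_lock_t variants rely on the
 * critical section's native recursion support. */
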
void CDECL omp_init_lock(omp_lock_t *lock)
{
    TRACE("(%p)\n", lock);
    *lock = alloc_critsect();
}

void CDECL omp_destroy_lock(omp_lock_t *lock)
{
    TRACE("(%p)\n", lock);
    destroy_critsect(*lock);
}

void CDECL omp_set_lock(omp_lock_t *lock)
{
    TRACE("(%p)\n", lock);

    if (RtlIsCriticalSectionLockedByThread(*lock))
    {
        ERR("omp_set_lock called while holding lock %p\n", *lock);
        ExitProcess(1);
    }

    EnterCriticalSection(*lock);
}

void CDECL omp_unset_lock(omp_lock_t *lock)
{
    TRACE("(%p)\n", lock);
    LeaveCriticalSection(*lock);
}

int CDECL omp_test_lock(omp_lock_t *lock)
{
    TRACE("(%p)\n", lock);

    if (RtlIsCriticalSectionLockedByThread(*lock))
        return 0;

    return TryEnterCriticalSection(*lock);
}

void CDECL omp_set_nest_lock(omp_nest_lock_t *lock)
{
    TRACE("(%p)\n", lock);
    EnterCriticalSection(*lock);
}

void CDECL omp_unset_nest_lock(omp_nest_lock_t *lock)
{
    TRACE("(%p)\n", lock);
    LeaveCriticalSection(*lock);
}

int CDECL omp_test_nest_lock(omp_nest_lock_t *lock)
{
    TRACE("(%p)\n", lock);
    return TryEnterCriticalSection(*lock) ? (*lock)->RecursionCount : 0;
}

void CDECL _vcomp_enter_critsect(CRITICAL_SECTION **critsect)
{
    TRACE("(%p)\n", critsect);

    if (!*critsect)
    {
        CRITICAL_SECTION *new_critsect = alloc_critsect();
        if (InterlockedCompareExchangePointer((void **)critsect, new_critsect, NULL) != NULL)
            destroy_critsect(new_critsect);  /* someone beat us to it */
    }

    EnterCriticalSection(*critsect);
}

void CDECL _vcomp_leave_critsect(CRITICAL_SECTION *critsect)
{
    TRACE("(%p)\n", critsect);
    LeaveCriticalSection(critsect);
}

static unsigned int get_step_count(int start, int end, int range_offset, int step)
{
    int range = end - start + step - range_offset;

    if (step < 0)
        return (unsigned)-range / -step;
    else
        return (unsigned)range / step;
}

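/* C2VectParallel() is the entry point apparently used by MSVC's
 * auto-parallelizer rather than by OpenMP pragmas.  The wrapper below
 * distributes the [start, end] range over the team: statically by splitting
 * the step count per thread, or dynamically by letting threads claim
 * shrinking slices through a lock-free compare-exchange on *dynamic_start. */
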
static void CDECL c2vectparallel_wrapper(int start, int end, int step, int end_included, BOOL dynamic_distribution,
        int volatile *dynamic_start, void *function, int nargs, va_list valist)
{
    void *wrapper_args[MAX_VECT_PARALLEL_CALLBACK_ARGS];
    unsigned int step_count, steps_per_call, remainder;
    int thread_count = omp_get_num_threads();
    int curr_start, curr_end, range_offset;
    int thread = _vcomp_get_thread_num();
    int step_sign;

    copy_va_list_data(&wrapper_args[2], valist, nargs - 2);

    step_sign = step > 0 ? 1 : -1;
    range_offset = step_sign * !end_included;

    if (dynamic_distribution)
    {
        int next_start, new_start, end_value;

        start = *dynamic_start;
        end_value = end + !!end_included * step;
        while (start != end_value)
        {
            step_count = get_step_count(start, end, range_offset, step);

            curr_end = start + (step_count + thread_count - 1) / thread_count * step
                    + range_offset;

            if ((curr_end - end) * step_sign > 0)
            {
                next_start = end_value;
                curr_end = end;
            }
            else
            {
                next_start = curr_end - range_offset;
                curr_end -= step;
            }

            if ((new_start = InterlockedCompareExchange((LONG volatile*)dynamic_start, next_start, start)) != start)
            {
                start = new_start;
                continue;
            }

            wrapper_args[0] = (void *)(ULONG_PTR)start;
            wrapper_args[1] = (void *)(ULONG_PTR)curr_end;
            _vcomp_fork_call_wrapper(function, nargs, wrapper_args);
            start = *dynamic_start;
        }
        return;
    }

    step_count = get_step_count(start, end, range_offset, step);

    /* According to the tests native vcomp still makes extra calls
     * with empty range from excessive threads under certain conditions
     * for unclear reason. */
    if (thread >= step_count && (end_included || (step != 1 && step != -1)))
        return;

    steps_per_call = step_count / thread_count;
    remainder = step_count % thread_count;

    if (thread < remainder)
    {
        curr_start = thread * (steps_per_call + 1);
        curr_end = curr_start + steps_per_call + 1;
    }
    else if (thread < step_count)
    {
        curr_start = remainder + steps_per_call * thread;
        curr_end = curr_start + steps_per_call;
    }
    else
    {
        curr_start = curr_end = 0;
    }

    curr_start = start + curr_start * step;
    curr_end = start + (curr_end - 1) * step + range_offset;

    wrapper_args[0] = (void *)(ULONG_PTR)curr_start;
    wrapper_args[1] = (void *)(ULONG_PTR)curr_end;
    _vcomp_fork_call_wrapper(function, nargs, wrapper_args);
}

void WINAPIV C2VectParallel(int start, int end, int step, BOOL end_included, int thread_count,
        BOOL dynamic_distribution, void *function, int nargs, ...)
{
    struct vcomp_thread_data *thread_data;
    int volatile dynamic_start;
    int prev_thread_count;
    va_list valist;

    TRACE("start %d, end %d, step %d, end_included %d, thread_count %d, dynamic_distribution %#x,"
            " function %p, nargs %d.\n", start, end, step, end_included, thread_count,
            dynamic_distribution, function, nargs);

    if (nargs > MAX_VECT_PARALLEL_CALLBACK_ARGS)
    {
        FIXME("Number of arguments %u exceeds supported maximum %u"
                " (not calling the loop code, expect problems).\n",
                nargs, MAX_VECT_PARALLEL_CALLBACK_ARGS);
        return;
    }

    va_start(valist, nargs);

    /* This expression can result in integer overflow. According to the tests,
     * native vcomp runs the function as a single thread both for empty range
     * and (end - start) not fitting the integer range. */
    if ((step > 0 && end < start) || (step < 0 && end > start)
            || (end - start) / step < 2 || thread_count < 0)
    {
        void *wrapper_args[MAX_VECT_PARALLEL_CALLBACK_ARGS];

        wrapper_args[0] = (void *)(ULONG_PTR)start;
        wrapper_args[1] = (void *)(ULONG_PTR)end;
        copy_va_list_data(&wrapper_args[2], valist, nargs - 2);
        _vcomp_fork_call_wrapper(function, nargs, wrapper_args);
        va_end(valist);
        return;
    }

    thread_data = vcomp_init_thread_data();
    prev_thread_count = thread_data->fork_threads;
    thread_data->fork_threads = thread_count;

    dynamic_start = start;

    _vcomp_fork(TRUE, 9, c2vectparallel_wrapper, start, end, step, end_included, dynamic_distribution,
            &dynamic_start, function, nargs, valist);

    thread_data->fork_threads = prev_thread_count;
    va_end(valist);
}

BOOL WINAPI DllMain(HINSTANCE instance, DWORD reason, LPVOID reserved)
{
    TRACE("(%p, %ld, %p)\n", instance, reason, reserved);

    switch (reason)
    {
        case DLL_PROCESS_ATTACH:
        {
            SYSTEM_INFO sysinfo;

            if ((vcomp_context_tls = TlsAlloc()) == TLS_OUT_OF_INDEXES)
            {
                ERR("Failed to allocate TLS index\n");
                return FALSE;
            }

            GetSystemInfo(&sysinfo);
            vcomp_module      = instance;
            vcomp_max_threads = sysinfo.dwNumberOfProcessors;
            vcomp_num_threads = sysinfo.dwNumberOfProcessors;
            vcomp_num_procs   = sysinfo.dwNumberOfProcessors;
            break;
        }

        case DLL_PROCESS_DETACH:
        {
            if (reserved) break;
            if (vcomp_context_tls != TLS_OUT_OF_INDEXES)
            {
                vcomp_free_thread_data();
                TlsFree(vcomp_context_tls);
            }
            break;
        }

        case DLL_THREAD_DETACH:
        {
            vcomp_free_thread_data();
            break;
        }
    }

    return TRUE;
}