5 * Copyright 2011 Austin English
6 * Copyright 2012 Dan Kegel
7 * Copyright 2015-2016 Sebastian Lackner
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
30 #include "wine/debug.h"
31 #include "wine/list.h"
34 WINE_DEFAULT_DEBUG_CHANNEL(vcomp
);
36 #define MAX_VECT_PARALLEL_CALLBACK_ARGS 128
38 typedef CRITICAL_SECTION
*omp_lock_t
;
39 typedef CRITICAL_SECTION
*omp_nest_lock_t
;
41 static struct list vcomp_idle_threads
= LIST_INIT(vcomp_idle_threads
);
42 static DWORD vcomp_context_tls
= TLS_OUT_OF_INDEXES
;
43 static HMODULE vcomp_module
;
44 static int vcomp_max_threads
;
45 static int vcomp_num_threads
;
46 static int vcomp_num_procs
;
47 static BOOL vcomp_nested_fork
= FALSE
;
49 static RTL_CRITICAL_SECTION vcomp_section
;
50 static RTL_CRITICAL_SECTION_DEBUG critsect_debug
=
53 { &critsect_debug
.ProcessLocksList
, &critsect_debug
.ProcessLocksList
},
54 0, 0, { (DWORD_PTR
)(__FILE__
": vcomp_section") }
56 static RTL_CRITICAL_SECTION vcomp_section
= { &critsect_debug
, -1, 0, 0, 0, 0 };
58 #define VCOMP_DYNAMIC_FLAGS_STATIC 0x01
59 #define VCOMP_DYNAMIC_FLAGS_CHUNKED 0x02
60 #define VCOMP_DYNAMIC_FLAGS_GUIDED 0x03
61 #define VCOMP_DYNAMIC_FLAGS_INCREMENT 0x40
63 struct vcomp_thread_data
65 struct vcomp_team_data
*team
;
66 struct vcomp_task_data
*task
;
71 /* only used for concurrent tasks */
73 CONDITION_VARIABLE cond
;
83 unsigned int dynamic_type
;
84 unsigned int dynamic_begin
;
85 unsigned int dynamic_end
;
88 struct vcomp_team_data
90 CONDITION_VARIABLE cond
;
94 /* callback arguments */
100 unsigned int barrier
;
104 struct vcomp_task_data
110 unsigned int section
;
115 unsigned int dynamic
;
116 unsigned int dynamic_first
;
117 unsigned int dynamic_last
;
118 unsigned int dynamic_iterations
;
120 unsigned int dynamic_chunksize
;
/* Obtain the argument array hidden inside a va_list.
 * NOTE(review): treats the va_list itself as a plain pointer into the
 * caller's stacked arguments - this only holds on ABIs where va_list is a
 * simple pointer (e.g. i386); confirm per architecture before reuse. */
static void **ptr_from_va_list(va_list valist)
{
    return *(void ***)&valist;
}
/* Fetch args_count pointer-sized arguments from valist into the args array. */
static void copy_va_list_data(void **args, va_list valist, int args_count)
{
    int index = 0;

    while (index < args_count)
    {
        args[index] = va_arg(valist, void *);
        index++;
    }
}
136 #if defined(__i386__)
138 extern void CDECL
_vcomp_fork_call_wrapper(void *wrapper
, int nargs
, void **args
);
139 __ASM_GLOBAL_FUNC( _vcomp_fork_call_wrapper
,
141 __ASM_CFI(".cfi_adjust_cfa_offset 4\n\t")
142 __ASM_CFI(".cfi_rel_offset %ebp,0\n\t")
144 __ASM_CFI(".cfi_def_cfa_register %ebp\n\t")
146 __ASM_CFI(".cfi_rel_offset %esi,-4\n\t")
148 __ASM_CFI(".cfi_rel_offset %edi,-8\n\t")
149 "movl 12(%ebp),%edx\n\t"
156 "movl 12(%ebp),%ecx\n\t"
157 "movl 16(%ebp),%esi\n\t"
160 "1:\tcall *8(%ebp)\n\t"
161 "leal -8(%ebp),%esp\n\t"
163 __ASM_CFI(".cfi_same_value %edi\n\t")
165 __ASM_CFI(".cfi_same_value %esi\n\t")
167 __ASM_CFI(".cfi_def_cfa %esp,4\n\t")
168 __ASM_CFI(".cfi_same_value %ebp\n\t")
171 #elif defined(__x86_64__)
173 extern void CDECL
_vcomp_fork_call_wrapper(void *wrapper
, int nargs
, void **args
);
174 __ASM_GLOBAL_FUNC( _vcomp_fork_call_wrapper
,
176 __ASM_SEH(".seh_pushreg %rbp\n\t")
177 __ASM_CFI(".cfi_adjust_cfa_offset 8\n\t")
178 __ASM_CFI(".cfi_rel_offset %rbp,0\n\t")
180 __ASM_SEH(".seh_setframe %rbp,0\n\t")
181 __ASM_CFI(".cfi_def_cfa_register %rbp\n\t")
183 __ASM_SEH(".seh_pushreg %rsi\n\t")
184 __ASM_CFI(".cfi_rel_offset %rsi,-8\n\t")
186 __ASM_SEH(".seh_pushreg %rdi\n\t")
187 __ASM_SEH(".seh_endprologue\n\t")
188 __ASM_CFI(".cfi_rel_offset %rdi,-16\n\t")
192 "cmovgq %rdx,%rcx\n\t"
193 "leaq 0(,%rcx,8),%rdx\n\t"
199 "movq 0(%rsp),%rcx\n\t"
200 "movq 8(%rsp),%rdx\n\t"
201 "movq 16(%rsp),%r8\n\t"
202 "movq 24(%rsp),%r9\n\t"
204 "leaq -16(%rbp),%rsp\n\t"
206 __ASM_CFI(".cfi_same_value %rdi\n\t")
208 __ASM_CFI(".cfi_same_value %rsi\n\t")
209 __ASM_CFI(".cfi_def_cfa_register %rsp\n\t")
211 __ASM_CFI(".cfi_adjust_cfa_offset -8\n\t")
212 __ASM_CFI(".cfi_same_value %rbp\n\t")
215 #elif defined(__arm__)
217 extern void CDECL
_vcomp_fork_call_wrapper(void *wrapper
, int nargs
, void **args
);
218 __ASM_GLOBAL_FUNC( _vcomp_fork_call_wrapper
,
219 "push {r4, r5, LR}\n\t"
228 "subeq SP, SP, #4\n\t"
229 "1:\tsub r3, r3, #4\n\t"
230 "ldr r0, [r2, r3]\n\t"
231 "str r0, [SP, r3]\n\t"
246 "4:\tpop {r0-r3}\n\t"
251 #elif defined(__aarch64__)
253 extern void CDECL
_vcomp_fork_call_wrapper(void *wrapper
, int nargs
, void **args
);
254 __ASM_GLOBAL_FUNC( _vcomp_fork_call_wrapper
,
255 "stp x29, x30, [SP,#-16]!\n\t"
256 __ASM_SEH(".seh_save_fplr_x 16\n\t")
258 __ASM_SEH(".seh_set_fp\n\t")
259 __ASM_SEH(".seh_endprologue\n\t")
266 "1:\ttbz w8, #3, 2f\n\t"
268 "2:\tsub x10, x29, x8\n\t"
270 "3:\tldr x0, [x2], #8\n\t"
271 "str x0, [x10], #8\n\t"
272 "subs w1, w1, #1\n\t"
274 "ldp x0, x1, [sp], #16\n\t"
275 "ldp x2, x3, [sp], #16\n\t"
276 "ldp x4, x5, [sp], #16\n\t"
277 "ldp x6, x7, [sp], #16\n"
280 "ldp x29, x30, [SP], #16\n\t"
285 static void CDECL
_vcomp_fork_call_wrapper(void *wrapper
, int nargs
, void **args
)
287 ERR("Not implemented for this architecture\n");
292 #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
294 static inline char interlocked_cmpxchg8(char *dest
, char xchg
, char compare
)
297 __asm__
__volatile__( "lock; cmpxchgb %2,(%1)"
298 : "=a" (ret
) : "r" (dest
), "q" (xchg
), "0" (compare
) : "memory" );
302 static inline short interlocked_cmpxchg16(short *dest
, short xchg
, short compare
)
305 __asm__
__volatile__( "lock; cmpxchgw %2,(%1)"
306 : "=a" (ret
) : "r" (dest
), "r" (xchg
), "0" (compare
) : "memory" );
310 static inline char interlocked_xchg_add8(char *dest
, char incr
)
313 __asm__
__volatile__( "lock; xaddb %0,(%1)"
314 : "=q" (ret
) : "r" (dest
), "0" (incr
) : "memory" );
318 static inline short interlocked_xchg_add16(short *dest
, short incr
)
321 __asm__
__volatile__( "lock; xaddw %0,(%1)"
322 : "=r" (ret
) : "r" (dest
), "0" (incr
) : "memory" );
328 #ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1
329 static inline char interlocked_cmpxchg8(char *dest
, char xchg
, char compare
)
331 return __sync_val_compare_and_swap(dest
, compare
, xchg
);
334 static inline char interlocked_xchg_add8(char *dest
, char incr
)
336 return __sync_fetch_and_add(dest
, incr
);
339 static char interlocked_cmpxchg8(char *dest
, char xchg
, char compare
)
341 EnterCriticalSection(&vcomp_section
);
342 if (*dest
== compare
) *dest
= xchg
; else compare
= *dest
;
343 LeaveCriticalSection(&vcomp_section
);
347 static char interlocked_xchg_add8(char *dest
, char incr
)
350 EnterCriticalSection(&vcomp_section
);
351 ret
= *dest
; *dest
+= incr
;
352 LeaveCriticalSection(&vcomp_section
);
357 #ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2
358 static inline short interlocked_cmpxchg16(short *dest
, short xchg
, short compare
)
360 return __sync_val_compare_and_swap(dest
, compare
, xchg
);
363 static inline short interlocked_xchg_add16(short *dest
, short incr
)
365 return __sync_fetch_and_add(dest
, incr
);
368 static short interlocked_cmpxchg16(short *dest
, short xchg
, short compare
)
370 EnterCriticalSection(&vcomp_section
);
371 if (*dest
== compare
) *dest
= xchg
; else compare
= *dest
;
372 LeaveCriticalSection(&vcomp_section
);
376 static short interlocked_xchg_add16(short *dest
, short incr
)
379 EnterCriticalSection(&vcomp_section
);
380 ret
= *dest
; *dest
+= incr
;
381 LeaveCriticalSection(&vcomp_section
);
386 #endif /* __GNUC__ */
/* Fetch the calling thread's vcomp context from TLS; NULL when the thread
 * has not been initialized yet. */
static inline struct vcomp_thread_data *vcomp_get_thread_data(void)
{
    return (struct vcomp_thread_data *)TlsGetValue(vcomp_context_tls);
}
/* Store thread_data as the calling thread's vcomp context in TLS. */
static inline void vcomp_set_thread_data(struct vcomp_thread_data *thread_data)
{
    TlsSetValue(vcomp_context_tls, thread_data);
}
398 static struct vcomp_thread_data
*vcomp_init_thread_data(void)
400 struct vcomp_thread_data
*thread_data
= vcomp_get_thread_data();
403 struct vcomp_thread_data thread
;
404 struct vcomp_task_data task
;
407 if (thread_data
) return thread_data
;
408 if (!(data
= HeapAlloc(GetProcessHeap(), 0, sizeof(*data
))))
410 ERR("could not create thread data\n");
414 data
->task
.single
= 0;
415 data
->task
.section
= 0;
416 data
->task
.dynamic
= 0;
418 thread_data
= &data
->thread
;
419 thread_data
->team
= NULL
;
420 thread_data
->task
= &data
->task
;
421 thread_data
->thread_num
= 0;
422 thread_data
->parallel
= FALSE
;
423 thread_data
->fork_threads
= 0;
424 thread_data
->single
= 1;
425 thread_data
->section
= 1;
426 thread_data
->dynamic
= 1;
427 thread_data
->dynamic_type
= 0;
429 vcomp_set_thread_data(thread_data
);
/* Release the calling thread's lazily allocated vcomp context, if any.
 * NOTE(review): thread_data is assumed to point at the start of the combined
 * thread+task allocation made in vcomp_init_thread_data - confirm the thread
 * member stays first in that layout so HeapFree receives the block base. */
static void vcomp_free_thread_data(void)
{
    struct vcomp_thread_data *thread_data = vcomp_get_thread_data();
    if (!thread_data) return;

    HeapFree(GetProcessHeap(), 0, thread_data);
    vcomp_set_thread_data(NULL);
}
/* Atomically add val to *dest (8-bit). */
void CDECL _vcomp_atomic_add_i1(char *dest, char val)
{
    interlocked_xchg_add8(dest, val);
}
447 void CDECL
_vcomp_atomic_and_i1(char *dest
, char val
)
450 do old
= *dest
; while (interlocked_cmpxchg8(dest
, old
& val
, old
) != old
);
453 void CDECL
_vcomp_atomic_div_i1(signed char *dest
, signed char val
)
456 do old
= *dest
; while ((signed char)interlocked_cmpxchg8((char *)dest
, old
/ val
, old
) != old
);
459 void CDECL
_vcomp_atomic_div_ui1(unsigned char *dest
, unsigned char val
)
462 do old
= *dest
; while ((unsigned char)interlocked_cmpxchg8((char *)dest
, old
/ val
, old
) != old
);
465 void CDECL
_vcomp_atomic_mul_i1(char *dest
, char val
)
468 do old
= *dest
; while (interlocked_cmpxchg8(dest
, old
* val
, old
) != old
);
471 void CDECL
_vcomp_atomic_or_i1(char *dest
, char val
)
474 do old
= *dest
; while (interlocked_cmpxchg8(dest
, old
| val
, old
) != old
);
477 void CDECL
_vcomp_atomic_shl_i1(char *dest
, unsigned int val
)
480 do old
= *dest
; while (interlocked_cmpxchg8(dest
, old
<< val
, old
) != old
);
483 void CDECL
_vcomp_atomic_shr_i1(signed char *dest
, unsigned int val
)
486 do old
= *dest
; while ((signed char)interlocked_cmpxchg8((char *)dest
, old
>> val
, old
) != old
);
489 void CDECL
_vcomp_atomic_shr_ui1(unsigned char *dest
, unsigned int val
)
492 do old
= *dest
; while ((unsigned char)interlocked_cmpxchg8((char *)dest
, old
>> val
, old
) != old
);
495 void CDECL
_vcomp_atomic_sub_i1(char *dest
, char val
)
497 interlocked_xchg_add8(dest
, -val
);
500 void CDECL
_vcomp_atomic_xor_i1(char *dest
, char val
)
503 do old
= *dest
; while (interlocked_cmpxchg8(dest
, old
^ val
, old
) != old
);
506 static void CDECL
_vcomp_atomic_bool_and_i1(char *dest
, char val
)
509 do old
= *dest
; while (interlocked_cmpxchg8(dest
, old
&& val
, old
) != old
);
512 static void CDECL
_vcomp_atomic_bool_or_i1(char *dest
, char val
)
515 do old
= *dest
; while (interlocked_cmpxchg8(dest
, old
? old
: (val
!= 0), old
) != old
);
518 void CDECL
_vcomp_reduction_i1(unsigned int flags
, char *dest
, char val
)
520 static void (CDECL
* const funcs
[])(char *, char) =
522 _vcomp_atomic_add_i1
,
523 _vcomp_atomic_add_i1
,
524 _vcomp_atomic_mul_i1
,
525 _vcomp_atomic_and_i1
,
527 _vcomp_atomic_xor_i1
,
528 _vcomp_atomic_bool_and_i1
,
529 _vcomp_atomic_bool_or_i1
,
531 unsigned int op
= (flags
>> 8) & 0xf;
532 op
= min(op
, ARRAY_SIZE(funcs
) - 1);
533 funcs
[op
](dest
, val
);
536 void CDECL
_vcomp_atomic_add_i2(short *dest
, short val
)
538 interlocked_xchg_add16(dest
, val
);
541 void CDECL
_vcomp_atomic_and_i2(short *dest
, short val
)
544 do old
= *dest
; while (interlocked_cmpxchg16(dest
, old
& val
, old
) != old
);
547 void CDECL
_vcomp_atomic_div_i2(short *dest
, short val
)
550 do old
= *dest
; while (interlocked_cmpxchg16(dest
, old
/ val
, old
) != old
);
553 void CDECL
_vcomp_atomic_div_ui2(unsigned short *dest
, unsigned short val
)
556 do old
= *dest
; while ((unsigned short)interlocked_cmpxchg16((short *)dest
, old
/ val
, old
) != old
);
559 void CDECL
_vcomp_atomic_mul_i2(short *dest
, short val
)
562 do old
= *dest
; while (interlocked_cmpxchg16(dest
, old
* val
, old
) != old
);
565 void CDECL
_vcomp_atomic_or_i2(short *dest
, short val
)
568 do old
= *dest
; while (interlocked_cmpxchg16(dest
, old
| val
, old
) != old
);
571 void CDECL
_vcomp_atomic_shl_i2(short *dest
, unsigned int val
)
574 do old
= *dest
; while (interlocked_cmpxchg16(dest
, old
<< val
, old
) != old
);
577 void CDECL
_vcomp_atomic_shr_i2(short *dest
, unsigned int val
)
580 do old
= *dest
; while (interlocked_cmpxchg16(dest
, old
>> val
, old
) != old
);
583 void CDECL
_vcomp_atomic_shr_ui2(unsigned short *dest
, unsigned int val
)
586 do old
= *dest
; while ((unsigned short)interlocked_cmpxchg16((short *)dest
, old
>> val
, old
) != old
);
589 void CDECL
_vcomp_atomic_sub_i2(short *dest
, short val
)
591 interlocked_xchg_add16(dest
, -val
);
594 void CDECL
_vcomp_atomic_xor_i2(short *dest
, short val
)
597 do old
= *dest
; while (interlocked_cmpxchg16(dest
, old
^ val
, old
) != old
);
600 static void CDECL
_vcomp_atomic_bool_and_i2(short *dest
, short val
)
603 do old
= *dest
; while (interlocked_cmpxchg16(dest
, old
&& val
, old
) != old
);
606 static void CDECL
_vcomp_atomic_bool_or_i2(short *dest
, short val
)
609 do old
= *dest
; while (interlocked_cmpxchg16(dest
, old
? old
: (val
!= 0), old
) != old
);
612 void CDECL
_vcomp_reduction_i2(unsigned int flags
, short *dest
, short val
)
614 static void (CDECL
* const funcs
[])(short *, short) =
616 _vcomp_atomic_add_i2
,
617 _vcomp_atomic_add_i2
,
618 _vcomp_atomic_mul_i2
,
619 _vcomp_atomic_and_i2
,
621 _vcomp_atomic_xor_i2
,
622 _vcomp_atomic_bool_and_i2
,
623 _vcomp_atomic_bool_or_i2
,
625 unsigned int op
= (flags
>> 8) & 0xf;
626 op
= min(op
, ARRAY_SIZE(funcs
) - 1);
627 funcs
[op
](dest
, val
);
630 void CDECL
_vcomp_atomic_add_i4(int *dest
, int val
)
632 InterlockedExchangeAdd((LONG
*)dest
, val
);
635 void CDECL
_vcomp_atomic_and_i4(int *dest
, int val
)
638 do old
= *dest
; while (InterlockedCompareExchange((LONG
*)dest
, old
& val
, old
) != old
);
641 void CDECL
_vcomp_atomic_div_i4(int *dest
, int val
)
644 do old
= *dest
; while (InterlockedCompareExchange((LONG
*)dest
, old
/ val
, old
) != old
);
647 void CDECL
_vcomp_atomic_div_ui4(unsigned int *dest
, unsigned int val
)
650 do old
= *dest
; while (InterlockedCompareExchange((LONG
*)dest
, old
/ val
, old
) != old
);
653 void CDECL
_vcomp_atomic_mul_i4(int *dest
, int val
)
656 do old
= *dest
; while (InterlockedCompareExchange((LONG
*)dest
, old
* val
, old
) != old
);
659 void CDECL
_vcomp_atomic_or_i4(int *dest
, int val
)
662 do old
= *dest
; while (InterlockedCompareExchange((LONG
*)dest
, old
| val
, old
) != old
);
665 void CDECL
_vcomp_atomic_shl_i4(int *dest
, int val
)
668 do old
= *dest
; while (InterlockedCompareExchange((LONG
*)dest
, old
<< val
, old
) != old
);
671 void CDECL
_vcomp_atomic_shr_i4(int *dest
, int val
)
674 do old
= *dest
; while (InterlockedCompareExchange((LONG
*)dest
, old
>> val
, old
) != old
);
677 void CDECL
_vcomp_atomic_shr_ui4(unsigned int *dest
, unsigned int val
)
680 do old
= *dest
; while (InterlockedCompareExchange((LONG
*)dest
, old
>> val
, old
) != old
);
683 void CDECL
_vcomp_atomic_sub_i4(int *dest
, int val
)
685 InterlockedExchangeAdd((LONG
*)dest
, -val
);
688 void CDECL
_vcomp_atomic_xor_i4(int *dest
, int val
)
691 do old
= *dest
; while (InterlockedCompareExchange((LONG
*)dest
, old
^ val
, old
) != old
);
694 static void CDECL
_vcomp_atomic_bool_and_i4(int *dest
, int val
)
697 do old
= *dest
; while (InterlockedCompareExchange((LONG
*)dest
, old
&& val
, old
) != old
);
700 static void CDECL
_vcomp_atomic_bool_or_i4(int *dest
, int val
)
703 do old
= *dest
; while (InterlockedCompareExchange((LONG
*)dest
, old
? old
: (val
!= 0), old
) != old
);
706 void CDECL
_vcomp_reduction_i4(unsigned int flags
, int *dest
, int val
)
708 static void (CDECL
* const funcs
[])(int *, int) =
710 _vcomp_atomic_add_i4
,
711 _vcomp_atomic_add_i4
,
712 _vcomp_atomic_mul_i4
,
713 _vcomp_atomic_and_i4
,
715 _vcomp_atomic_xor_i4
,
716 _vcomp_atomic_bool_and_i4
,
717 _vcomp_atomic_bool_or_i4
,
719 unsigned int op
= (flags
>> 8) & 0xf;
720 op
= min(op
, ARRAY_SIZE(funcs
) - 1);
721 funcs
[op
](dest
, val
);
724 void CDECL
_vcomp_atomic_add_i8(LONG64
*dest
, LONG64 val
)
727 do old
= *dest
; while (InterlockedCompareExchange64(dest
, old
+ val
, old
) != old
);
730 void CDECL
_vcomp_atomic_and_i8(LONG64
*dest
, LONG64 val
)
733 do old
= *dest
; while (InterlockedCompareExchange64(dest
, old
& val
, old
) != old
);
736 void CDECL
_vcomp_atomic_div_i8(LONG64
*dest
, LONG64 val
)
739 do old
= *dest
; while (InterlockedCompareExchange64(dest
, old
/ val
, old
) != old
);
742 void CDECL
_vcomp_atomic_div_ui8(ULONG64
*dest
, ULONG64 val
)
745 do old
= *dest
; while (InterlockedCompareExchange64((LONG64
*)dest
, old
/ val
, old
) != old
);
748 void CDECL
_vcomp_atomic_mul_i8(LONG64
*dest
, LONG64 val
)
751 do old
= *dest
; while (InterlockedCompareExchange64(dest
, old
* val
, old
) != old
);
754 void CDECL
_vcomp_atomic_or_i8(LONG64
*dest
, LONG64 val
)
757 do old
= *dest
; while (InterlockedCompareExchange64(dest
, old
| val
, old
) != old
);
760 void CDECL
_vcomp_atomic_shl_i8(LONG64
*dest
, unsigned int val
)
763 do old
= *dest
; while (InterlockedCompareExchange64(dest
, old
<< val
, old
) != old
);
766 void CDECL
_vcomp_atomic_shr_i8(LONG64
*dest
, unsigned int val
)
769 do old
= *dest
; while (InterlockedCompareExchange64(dest
, old
>> val
, old
) != old
);
772 void CDECL
_vcomp_atomic_shr_ui8(ULONG64
*dest
, unsigned int val
)
775 do old
= *dest
; while (InterlockedCompareExchange64((LONG64
*)dest
, old
>> val
, old
) != old
);
778 void CDECL
_vcomp_atomic_sub_i8(LONG64
*dest
, LONG64 val
)
781 do old
= *dest
; while (InterlockedCompareExchange64(dest
, old
- val
, old
) != old
);
784 void CDECL
_vcomp_atomic_xor_i8(LONG64
*dest
, LONG64 val
)
787 do old
= *dest
; while (InterlockedCompareExchange64(dest
, old
^ val
, old
) != old
);
790 static void CDECL
_vcomp_atomic_bool_and_i8(LONG64
*dest
, LONG64 val
)
793 do old
= *dest
; while (InterlockedCompareExchange64(dest
, old
&& val
, old
) != old
);
796 static void CDECL
_vcomp_atomic_bool_or_i8(LONG64
*dest
, LONG64 val
)
799 do old
= *dest
; while (InterlockedCompareExchange64(dest
, old
? old
: (val
!= 0), old
) != old
);
802 void CDECL
_vcomp_reduction_i8(unsigned int flags
, LONG64
*dest
, LONG64 val
)
804 static void (CDECL
* const funcs
[])(LONG64
*, LONG64
) =
806 _vcomp_atomic_add_i8
,
807 _vcomp_atomic_add_i8
,
808 _vcomp_atomic_mul_i8
,
809 _vcomp_atomic_and_i8
,
811 _vcomp_atomic_xor_i8
,
812 _vcomp_atomic_bool_and_i8
,
813 _vcomp_atomic_bool_or_i8
,
815 unsigned int op
= (flags
>> 8) & 0xf;
816 op
= min(op
, ARRAY_SIZE(funcs
) - 1);
817 funcs
[op
](dest
, val
);
820 void CDECL
_vcomp_atomic_add_r4(float *dest
, float val
)
826 *(float *)&new = *(float *)&old
+ val
;
828 while (InterlockedCompareExchange((LONG
*)dest
, new, old
) != old
);
831 void CDECL
_vcomp_atomic_div_r4(float *dest
, float val
)
837 *(float *)&new = *(float *)&old
/ val
;
839 while (InterlockedCompareExchange((LONG
*)dest
, new, old
) != old
);
842 void CDECL
_vcomp_atomic_mul_r4(float *dest
, float val
)
848 *(float *)&new = *(float *)&old
* val
;
850 while (InterlockedCompareExchange((LONG
*)dest
, new, old
) != old
);
853 void CDECL
_vcomp_atomic_sub_r4(float *dest
, float val
)
859 *(float *)&new = *(float *)&old
- val
;
861 while (InterlockedCompareExchange((LONG
*)dest
, new, old
) != old
);
864 static void CDECL
_vcomp_atomic_bool_and_r4(float *dest
, float val
)
870 *(float *)&new = (*(float *)&old
!= 0.0) ? (val
!= 0.0) : 0.0;
872 while (InterlockedCompareExchange((LONG
*)dest
, new, old
) != old
);
875 static void CDECL
_vcomp_atomic_bool_or_r4(float *dest
, float val
)
881 *(float *)&new = (*(float *)&old
!= 0.0) ? *(float *)&old
: (val
!= 0.0);
883 while (InterlockedCompareExchange((LONG
*)dest
, new, old
) != old
);
886 void CDECL
_vcomp_reduction_r4(unsigned int flags
, float *dest
, float val
)
888 static void (CDECL
* const funcs
[])(float *, float) =
890 _vcomp_atomic_add_r4
,
891 _vcomp_atomic_add_r4
,
892 _vcomp_atomic_mul_r4
,
893 _vcomp_atomic_bool_or_r4
,
894 _vcomp_atomic_bool_or_r4
,
895 _vcomp_atomic_bool_or_r4
,
896 _vcomp_atomic_bool_and_r4
,
897 _vcomp_atomic_bool_or_r4
,
899 unsigned int op
= (flags
>> 8) & 0xf;
900 op
= min(op
, ARRAY_SIZE(funcs
) - 1);
901 funcs
[op
](dest
, val
);
904 void CDECL
_vcomp_atomic_add_r8(double *dest
, double val
)
909 old
= *(LONG64
*)dest
;
910 *(double *)&new = *(double *)&old
+ val
;
912 while (InterlockedCompareExchange64((LONG64
*)dest
, new, old
) != old
);
915 void CDECL
_vcomp_atomic_div_r8(double *dest
, double val
)
920 old
= *(LONG64
*)dest
;
921 *(double *)&new = *(double *)&old
/ val
;
923 while (InterlockedCompareExchange64((LONG64
*)dest
, new, old
) != old
);
926 void CDECL
_vcomp_atomic_mul_r8(double *dest
, double val
)
931 old
= *(LONG64
*)dest
;
932 *(double *)&new = *(double *)&old
* val
;
934 while (InterlockedCompareExchange64((LONG64
*)dest
, new, old
) != old
);
937 void CDECL
_vcomp_atomic_sub_r8(double *dest
, double val
)
942 old
= *(LONG64
*)dest
;
943 *(double *)&new = *(double *)&old
- val
;
945 while (InterlockedCompareExchange64((LONG64
*)dest
, new, old
) != old
);
948 static void CDECL
_vcomp_atomic_bool_and_r8(double *dest
, double val
)
953 old
= *(LONG64
*)dest
;
954 *(double *)&new = (*(double *)&old
!= 0.0) ? (val
!= 0.0) : 0.0;
956 while (InterlockedCompareExchange64((LONG64
*)dest
, new, old
) != old
);
959 static void CDECL
_vcomp_atomic_bool_or_r8(double *dest
, double val
)
964 old
= *(LONG64
*)dest
;
965 *(double *)&new = (*(double *)&old
!= 0.0) ? *(double *)&old
: (val
!= 0.0);
967 while (InterlockedCompareExchange64((LONG64
*)dest
, new, old
) != old
);
970 void CDECL
_vcomp_reduction_r8(unsigned int flags
, double *dest
, double val
)
972 static void (CDECL
* const funcs
[])(double *, double) =
974 _vcomp_atomic_add_r8
,
975 _vcomp_atomic_add_r8
,
976 _vcomp_atomic_mul_r8
,
977 _vcomp_atomic_bool_or_r8
,
978 _vcomp_atomic_bool_or_r8
,
979 _vcomp_atomic_bool_or_r8
,
980 _vcomp_atomic_bool_and_r8
,
981 _vcomp_atomic_bool_or_r8
,
983 unsigned int op
= (flags
>> 8) & 0xf;
984 op
= min(op
, ARRAY_SIZE(funcs
) - 1);
985 funcs
[op
](dest
, val
);
988 int CDECL
omp_get_dynamic(void)
/* OpenMP omp_get_max_threads: upper bound on the team size for a
 * subsequent parallel region. */
int CDECL omp_get_max_threads(void)
{
    return vcomp_max_threads;
}
/* OpenMP omp_get_nested: nonzero when nested parallelism is enabled. */
int CDECL omp_get_nested(void)
{
    return vcomp_nested_fork;
}
/* OpenMP omp_get_num_procs: number of processors detected at startup. */
int CDECL omp_get_num_procs(void)
{
    return vcomp_num_procs;
}
/* OpenMP omp_get_num_threads: size of the current team, or 1 when called
 * outside of a parallel region (no team attached). */
int CDECL omp_get_num_threads(void)
{
    struct vcomp_team_data *team_data = vcomp_init_thread_data()->team;
    return team_data ? team_data->num_threads : 1;
}
/* OpenMP omp_get_thread_num: this thread's index within its team (0 = master). */
int CDECL omp_get_thread_num(void)
{
    return vcomp_init_thread_data()->thread_num;
}
/* Compiler-generated entry point; same semantics as omp_get_thread_num. */
int CDECL _vcomp_get_thread_num(void)
{
    return vcomp_init_thread_data()->thread_num;
}
1031 /* Time in seconds since "some time in the past" */
1032 double CDECL
omp_get_wtime(void)
1034 return GetTickCount() / 1000.0;
/* Stub: dynamic adjustment of the number of threads is not implemented;
 * the requested value is ignored. */
void CDECL omp_set_dynamic(int val)
{
    TRACE("(%d): stub\n", val);
}
1042 void CDECL
omp_set_nested(int nested
)
1044 TRACE("(%d)\n", nested
);
1045 vcomp_nested_fork
= (nested
!= 0);
1048 void CDECL
omp_set_num_threads(int num_threads
)
1050 TRACE("(%d)\n", num_threads
);
1051 if (num_threads
>= 1)
1052 vcomp_num_threads
= num_threads
;
/* Stub: OpenMP flush; currently a no-op. */
void CDECL _vcomp_flush(void)
{
    TRACE("(): stub\n");
}
1060 void CDECL
_vcomp_barrier(void)
1062 struct vcomp_team_data
*team_data
= vcomp_init_thread_data()->team
;
1069 EnterCriticalSection(&vcomp_section
);
1070 if (++team_data
->barrier_count
>= team_data
->num_threads
)
1072 team_data
->barrier
++;
1073 team_data
->barrier_count
= 0;
1074 WakeAllConditionVariable(&team_data
->cond
);
1078 unsigned int barrier
= team_data
->barrier
;
1079 while (team_data
->barrier
== barrier
)
1080 SleepConditionVariableCS(&team_data
->cond
, &vcomp_section
, INFINITE
);
1082 LeaveCriticalSection(&vcomp_section
);
1085 void CDECL
_vcomp_set_num_threads(int num_threads
)
1087 TRACE("(%d)\n", num_threads
);
1088 if (num_threads
>= 1)
1089 vcomp_init_thread_data()->fork_threads
= num_threads
;
/* Begin a master region: returns nonzero only on thread 0 of the team,
 * so only the master executes the guarded block. */
int CDECL _vcomp_master_begin(void)
{
    return !vcomp_init_thread_data()->thread_num;
}
/* End of a master region; no synchronization is required here. */
void CDECL _vcomp_master_end(void)
{
    /* nothing to do here */
}
1104 int CDECL
_vcomp_single_begin(int flags
)
1106 struct vcomp_thread_data
*thread_data
= vcomp_init_thread_data();
1107 struct vcomp_task_data
*task_data
= thread_data
->task
;
1110 TRACE("(%x): semi-stub\n", flags
);
1112 EnterCriticalSection(&vcomp_section
);
1113 thread_data
->single
++;
1114 if ((int)(thread_data
->single
- task_data
->single
) > 0)
1116 task_data
->single
= thread_data
->single
;
1119 LeaveCriticalSection(&vcomp_section
);
1124 void CDECL
_vcomp_single_end(void)
1127 /* nothing to do here */
1130 void CDECL
_vcomp_sections_init(int n
)
1132 struct vcomp_thread_data
*thread_data
= vcomp_init_thread_data();
1133 struct vcomp_task_data
*task_data
= thread_data
->task
;
1137 EnterCriticalSection(&vcomp_section
);
1138 thread_data
->section
++;
1139 if ((int)(thread_data
->section
- task_data
->section
) > 0)
1141 task_data
->section
= thread_data
->section
;
1142 task_data
->num_sections
= n
;
1143 task_data
->section_index
= 0;
1145 LeaveCriticalSection(&vcomp_section
);
1148 int CDECL
_vcomp_sections_next(void)
1150 struct vcomp_thread_data
*thread_data
= vcomp_init_thread_data();
1151 struct vcomp_task_data
*task_data
= thread_data
->task
;
1156 EnterCriticalSection(&vcomp_section
);
1157 if (thread_data
->section
== task_data
->section
&&
1158 task_data
->section_index
!= task_data
->num_sections
)
1160 i
= task_data
->section_index
++;
1162 LeaveCriticalSection(&vcomp_section
);
1166 void CDECL
_vcomp_for_static_simple_init(unsigned int first
, unsigned int last
, int step
,
1167 BOOL increment
, unsigned int *begin
, unsigned int *end
)
1169 unsigned int iterations
, per_thread
, remaining
;
1170 struct vcomp_thread_data
*thread_data
= vcomp_init_thread_data();
1171 struct vcomp_team_data
*team_data
= thread_data
->team
;
1172 int num_threads
= team_data
? team_data
->num_threads
: 1;
1173 int thread_num
= thread_data
->thread_num
;
1175 TRACE("(%u, %u, %d, %u, %p, %p)\n", first
, last
, step
, increment
, begin
, end
);
1177 if (num_threads
== 1)
1187 *end
= increment
? -1 : 1;
1192 iterations
= 1 + (last
- first
) / step
;
1195 iterations
= 1 + (first
- last
) / step
;
1199 per_thread
= iterations
/ num_threads
;
1200 remaining
= iterations
- per_thread
* num_threads
;
1202 if (thread_num
< remaining
)
1204 else if (per_thread
)
1205 first
+= remaining
* step
;
1209 *end
= first
- step
;
1213 *begin
= first
+ per_thread
* thread_num
* step
;
1214 *end
= *begin
+ (per_thread
- 1) * step
;
1217 void CDECL
_vcomp_for_static_simple_init_i8(ULONG64 first
, ULONG64 last
, LONG64 step
,
1218 BOOL increment
, ULONG64
*begin
, ULONG64
*end
)
1220 ULONG64 iterations
, per_thread
, remaining
;
1221 struct vcomp_thread_data
*thread_data
= vcomp_init_thread_data();
1222 struct vcomp_team_data
*team_data
= thread_data
->team
;
1223 int num_threads
= team_data
? team_data
->num_threads
: 1;
1224 int thread_num
= thread_data
->thread_num
;
1226 TRACE("(%s, %s, %s, %x, %p, %p)\n", wine_dbgstr_longlong(first
), wine_dbgstr_longlong(last
),
1227 wine_dbgstr_longlong(step
), increment
, begin
, end
);
1229 if (num_threads
== 1)
1239 *end
= increment
? -1 : 1;
1244 iterations
= 1 + (last
- first
) / step
;
1247 iterations
= 1 + (first
- last
) / step
;
1251 per_thread
= iterations
/ num_threads
;
1252 remaining
= iterations
- per_thread
* num_threads
;
1254 if (thread_num
< remaining
)
1256 else if (per_thread
)
1257 first
+= remaining
* step
;
1261 *end
= first
- step
;
1265 *begin
= first
+ per_thread
* thread_num
* step
;
1266 *end
= *begin
+ (per_thread
- 1) * step
;
1269 void CDECL
_vcomp_for_static_init(int first
, int last
, int step
, int chunksize
, unsigned int *loops
,
1270 int *begin
, int *end
, int *next
, int *lastchunk
)
1272 unsigned int iterations
, num_chunks
, per_thread
, remaining
;
1273 struct vcomp_thread_data
*thread_data
= vcomp_init_thread_data();
1274 struct vcomp_team_data
*team_data
= thread_data
->team
;
1275 int num_threads
= team_data
? team_data
->num_threads
: 1;
1276 int thread_num
= thread_data
->thread_num
;
1277 int no_begin
, no_lastchunk
;
1279 TRACE("(%d, %d, %d, %d, %p, %p, %p, %p, %p)\n",
1280 first
, last
, step
, chunksize
, loops
, begin
, end
, next
, lastchunk
);
1285 lastchunk
= &no_lastchunk
;
1288 if (num_threads
== 1 && chunksize
!= 1)
1300 *loops
= !thread_num
;
1318 iterations
= 1 + (last
- first
) / step
;
1321 iterations
= 1 + (first
- last
) / step
;
1328 num_chunks
= ((DWORD64
)iterations
+ chunksize
- 1) / chunksize
;
1329 per_thread
= num_chunks
/ num_threads
;
1330 remaining
= num_chunks
- per_thread
* num_threads
;
1332 *loops
= per_thread
+ (thread_num
< remaining
);
1333 *begin
= first
+ thread_num
* chunksize
* step
;
1334 *end
= *begin
+ (chunksize
- 1) * step
;
1335 *next
= chunksize
* num_threads
* step
;
1336 *lastchunk
= first
+ (num_chunks
- 1) * chunksize
* step
;
1339 void CDECL
_vcomp_for_static_init_i8(LONG64 first
, LONG64 last
, LONG64 step
, LONG64 chunksize
, ULONG64
*loops
,
1340 LONG64
*begin
, LONG64
*end
, LONG64
*next
, LONG64
*lastchunk
)
1342 ULONG64 iterations
, num_chunks
, per_thread
, remaining
;
1343 struct vcomp_thread_data
*thread_data
= vcomp_init_thread_data();
1344 struct vcomp_team_data
*team_data
= thread_data
->team
;
1345 int num_threads
= team_data
? team_data
->num_threads
: 1;
1346 int thread_num
= thread_data
->thread_num
;
1347 LONG64 no_begin
, no_lastchunk
;
1349 TRACE("(%s, %s, %s, %s, %p, %p, %p, %p, %p)\n",
1350 wine_dbgstr_longlong(first
), wine_dbgstr_longlong(last
),
1351 wine_dbgstr_longlong(step
), wine_dbgstr_longlong(chunksize
),
1352 loops
, begin
, end
, next
, lastchunk
);
1357 lastchunk
= &no_lastchunk
;
1360 if (num_threads
== 1 && chunksize
!= 1)
1372 *loops
= !thread_num
;
1390 iterations
= 1 + (last
- first
) / step
;
1393 iterations
= 1 + (first
- last
) / step
;
1400 num_chunks
= iterations
/ chunksize
;
1401 if (iterations
% chunksize
) num_chunks
++;
1402 per_thread
= num_chunks
/ num_threads
;
1403 remaining
= num_chunks
- per_thread
* num_threads
;
1405 *loops
= per_thread
+ (thread_num
< remaining
);
1406 *begin
= first
+ thread_num
* chunksize
* step
;
1407 *end
= *begin
+ (chunksize
- 1) * step
;
1408 *next
= chunksize
* num_threads
* step
;
1409 *lastchunk
= first
+ (num_chunks
- 1) * chunksize
* step
;
1412 void CDECL
_vcomp_for_static_end(void)
1415 /* nothing to do here */
1418 void CDECL
_vcomp_for_dynamic_init(unsigned int flags
, unsigned int first
, unsigned int last
,
1419 int step
, unsigned int chunksize
)
1421 unsigned int iterations
, per_thread
, remaining
;
1422 struct vcomp_thread_data
*thread_data
= vcomp_init_thread_data();
1423 struct vcomp_team_data
*team_data
= thread_data
->team
;
1424 struct vcomp_task_data
*task_data
= thread_data
->task
;
1425 int num_threads
= team_data
? team_data
->num_threads
: 1;
1426 int thread_num
= thread_data
->thread_num
;
1427 unsigned int type
= flags
& ~VCOMP_DYNAMIC_FLAGS_INCREMENT
;
1429 TRACE("(%u, %u, %u, %d, %u)\n", flags
, first
, last
, step
, chunksize
);
1433 thread_data
->dynamic_type
= 0;
1437 if (flags
& VCOMP_DYNAMIC_FLAGS_INCREMENT
)
1438 iterations
= 1 + (last
- first
) / step
;
1441 iterations
= 1 + (first
- last
) / step
;
1445 if (type
== VCOMP_DYNAMIC_FLAGS_STATIC
)
1447 per_thread
= iterations
/ num_threads
;
1448 remaining
= iterations
- per_thread
* num_threads
;
1450 if (thread_num
< remaining
)
1452 else if (per_thread
)
1453 first
+= remaining
* step
;
1456 thread_data
->dynamic_type
= 0;
1460 thread_data
->dynamic_type
= VCOMP_DYNAMIC_FLAGS_STATIC
;
1461 thread_data
->dynamic_begin
= first
+ per_thread
* thread_num
* step
;
1462 thread_data
->dynamic_end
= thread_data
->dynamic_begin
+ (per_thread
- 1) * step
;
1466 if (type
!= VCOMP_DYNAMIC_FLAGS_CHUNKED
&&
1467 type
!= VCOMP_DYNAMIC_FLAGS_GUIDED
)
1469 FIXME("unsupported flags %u\n", flags
);
1470 type
= VCOMP_DYNAMIC_FLAGS_GUIDED
;
1473 EnterCriticalSection(&vcomp_section
);
1474 thread_data
->dynamic
++;
1475 thread_data
->dynamic_type
= type
;
1476 if ((int)(thread_data
->dynamic
- task_data
->dynamic
) > 0)
1478 task_data
->dynamic
= thread_data
->dynamic
;
1479 task_data
->dynamic_first
= first
;
1480 task_data
->dynamic_last
= last
;
1481 task_data
->dynamic_iterations
= iterations
;
1482 task_data
->dynamic_step
= step
;
1483 task_data
->dynamic_chunksize
= chunksize
;
1485 LeaveCriticalSection(&vcomp_section
);
1489 int CDECL
_vcomp_for_dynamic_next(unsigned int *begin
, unsigned int *end
)
1491 struct vcomp_thread_data
*thread_data
= vcomp_init_thread_data();
1492 struct vcomp_task_data
*task_data
= thread_data
->task
;
1493 struct vcomp_team_data
*team_data
= thread_data
->team
;
1494 int num_threads
= team_data
? team_data
->num_threads
: 1;
1496 TRACE("(%p, %p)\n", begin
, end
);
1498 if (thread_data
->dynamic_type
== VCOMP_DYNAMIC_FLAGS_STATIC
)
1500 *begin
= thread_data
->dynamic_begin
;
1501 *end
= thread_data
->dynamic_end
;
1502 thread_data
->dynamic_type
= 0;
1505 else if (thread_data
->dynamic_type
== VCOMP_DYNAMIC_FLAGS_CHUNKED
||
1506 thread_data
->dynamic_type
== VCOMP_DYNAMIC_FLAGS_GUIDED
)
1508 unsigned int iterations
= 0;
1509 EnterCriticalSection(&vcomp_section
);
1510 if (thread_data
->dynamic
== task_data
->dynamic
&&
1511 task_data
->dynamic_iterations
!= 0)
1513 iterations
= min(task_data
->dynamic_iterations
, task_data
->dynamic_chunksize
);
1514 if (thread_data
->dynamic_type
== VCOMP_DYNAMIC_FLAGS_GUIDED
&&
1515 task_data
->dynamic_iterations
> num_threads
* task_data
->dynamic_chunksize
)
1517 iterations
= (task_data
->dynamic_iterations
+ num_threads
- 1) / num_threads
;
1519 *begin
= task_data
->dynamic_first
;
1520 *end
= task_data
->dynamic_first
+ (iterations
- 1) * task_data
->dynamic_step
;
1521 task_data
->dynamic_iterations
-= iterations
;
1522 task_data
->dynamic_first
+= iterations
* task_data
->dynamic_step
;
1523 if (!task_data
->dynamic_iterations
)
1524 *end
= task_data
->dynamic_last
;
1526 LeaveCriticalSection(&vcomp_section
);
1527 return iterations
!= 0;
1533 int CDECL
omp_in_parallel(void)
1536 return vcomp_init_thread_data()->parallel
;
1539 static DWORD WINAPI
_vcomp_fork_worker(void *param
)
1541 struct vcomp_thread_data
*thread_data
= param
;
1542 vcomp_set_thread_data(thread_data
);
1544 TRACE("starting worker thread for %p\n", thread_data
);
1546 EnterCriticalSection(&vcomp_section
);
1549 struct vcomp_team_data
*team
= thread_data
->team
;
1552 LeaveCriticalSection(&vcomp_section
);
1553 _vcomp_fork_call_wrapper(team
->wrapper
, team
->nargs
, ptr_from_va_list(team
->valist
));
1554 EnterCriticalSection(&vcomp_section
);
1556 thread_data
->team
= NULL
;
1557 list_remove(&thread_data
->entry
);
1558 list_add_tail(&vcomp_idle_threads
, &thread_data
->entry
);
1559 if (++team
->finished_threads
>= team
->num_threads
)
1560 WakeAllConditionVariable(&team
->cond
);
1563 if (!SleepConditionVariableCS(&thread_data
->cond
, &vcomp_section
, 5000) &&
1564 GetLastError() == ERROR_TIMEOUT
&& !thread_data
->team
)
1569 list_remove(&thread_data
->entry
);
1570 LeaveCriticalSection(&vcomp_section
);
1572 TRACE("terminating worker thread for %p\n", thread_data
);
1574 HeapFree(GetProcessHeap(), 0, thread_data
);
1575 vcomp_set_thread_data(NULL
);
1576 FreeLibraryAndExitThread(vcomp_module
, 0);
1580 void WINAPIV
_vcomp_fork(BOOL ifval
, int nargs
, void *wrapper
, ...)
1582 struct vcomp_thread_data
*prev_thread_data
= vcomp_init_thread_data();
1583 struct vcomp_thread_data thread_data
;
1584 struct vcomp_team_data team_data
;
1585 struct vcomp_task_data task_data
;
1588 TRACE("(%d, %d, %p, ...)\n", ifval
, nargs
, wrapper
);
1590 if (prev_thread_data
->parallel
&& !vcomp_nested_fork
)
1595 else if (prev_thread_data
->fork_threads
)
1596 num_threads
= prev_thread_data
->fork_threads
;
1598 num_threads
= vcomp_num_threads
;
1600 InitializeConditionVariable(&team_data
.cond
);
1601 team_data
.num_threads
= 1;
1602 team_data
.finished_threads
= 0;
1603 team_data
.nargs
= nargs
;
1604 team_data
.wrapper
= wrapper
;
1605 va_start(team_data
.valist
, wrapper
);
1606 team_data
.barrier
= 0;
1607 team_data
.barrier_count
= 0;
1609 task_data
.single
= 0;
1610 task_data
.section
= 0;
1611 task_data
.dynamic
= 0;
1613 thread_data
.team
= &team_data
;
1614 thread_data
.task
= &task_data
;
1615 thread_data
.thread_num
= 0;
1616 thread_data
.parallel
= ifval
|| prev_thread_data
->parallel
;
1617 thread_data
.fork_threads
= 0;
1618 thread_data
.single
= 1;
1619 thread_data
.section
= 1;
1620 thread_data
.dynamic
= 1;
1621 thread_data
.dynamic_type
= 0;
1622 list_init(&thread_data
.entry
);
1623 InitializeConditionVariable(&thread_data
.cond
);
1625 if (num_threads
> 1)
1628 EnterCriticalSection(&vcomp_section
);
1630 /* reuse existing threads (if any) */
1631 while (team_data
.num_threads
< num_threads
&& (ptr
= list_head(&vcomp_idle_threads
)))
1633 struct vcomp_thread_data
*data
= LIST_ENTRY(ptr
, struct vcomp_thread_data
, entry
);
1634 data
->team
= &team_data
;
1635 data
->task
= &task_data
;
1636 data
->thread_num
= team_data
.num_threads
++;
1637 data
->parallel
= thread_data
.parallel
;
1638 data
->fork_threads
= 0;
1642 data
->dynamic_type
= 0;
1643 list_remove(&data
->entry
);
1644 list_add_tail(&thread_data
.entry
, &data
->entry
);
1645 WakeAllConditionVariable(&data
->cond
);
1648 /* spawn additional threads */
1649 while (team_data
.num_threads
< num_threads
)
1651 struct vcomp_thread_data
*data
;
1655 data
= HeapAlloc(GetProcessHeap(), 0, sizeof(*data
));
1658 data
->team
= &team_data
;
1659 data
->task
= &task_data
;
1660 data
->thread_num
= team_data
.num_threads
;
1661 data
->parallel
= thread_data
.parallel
;
1662 data
->fork_threads
= 0;
1666 data
->dynamic_type
= 0;
1667 InitializeConditionVariable(&data
->cond
);
1669 thread
= CreateThread(NULL
, 0, _vcomp_fork_worker
, data
, 0, NULL
);
1672 HeapFree(GetProcessHeap(), 0, data
);
1676 GetModuleHandleExW(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS
,
1677 (const WCHAR
*)vcomp_module
, &module
);
1678 team_data
.num_threads
++;
1679 list_add_tail(&thread_data
.entry
, &data
->entry
);
1680 CloseHandle(thread
);
1683 LeaveCriticalSection(&vcomp_section
);
1686 vcomp_set_thread_data(&thread_data
);
1687 _vcomp_fork_call_wrapper(team_data
.wrapper
, team_data
.nargs
, ptr_from_va_list(team_data
.valist
));
1688 vcomp_set_thread_data(prev_thread_data
);
1689 prev_thread_data
->fork_threads
= 0;
1691 if (team_data
.num_threads
> 1)
1693 EnterCriticalSection(&vcomp_section
);
1695 team_data
.finished_threads
++;
1696 while (team_data
.finished_threads
< team_data
.num_threads
)
1697 SleepConditionVariableCS(&team_data
.cond
, &vcomp_section
, INFINITE
);
1699 LeaveCriticalSection(&vcomp_section
);
1700 assert(list_empty(&thread_data
.entry
));
1703 va_end(team_data
.valist
);
1706 static CRITICAL_SECTION
*alloc_critsect(void)
1708 CRITICAL_SECTION
*critsect
;
1709 if (!(critsect
= HeapAlloc(GetProcessHeap(), 0, sizeof(*critsect
))))
1711 ERR("could not allocate critical section\n");
1715 InitializeCriticalSection(critsect
);
1716 critsect
->DebugInfo
->Spare
[0] = (DWORD_PTR
)(__FILE__
": critsect");
1720 static void destroy_critsect(CRITICAL_SECTION
*critsect
)
1722 if (!critsect
) return;
1723 critsect
->DebugInfo
->Spare
[0] = 0;
1724 DeleteCriticalSection(critsect
);
1725 HeapFree(GetProcessHeap(), 0, critsect
);
1728 void CDECL
omp_init_lock(omp_lock_t
*lock
)
1730 TRACE("(%p)\n", lock
);
1731 *lock
= alloc_critsect();
1734 void CDECL
omp_destroy_lock(omp_lock_t
*lock
)
1736 TRACE("(%p)\n", lock
);
1737 destroy_critsect(*lock
);
1740 void CDECL
omp_set_lock(omp_lock_t
*lock
)
1742 TRACE("(%p)\n", lock
);
1744 if (RtlIsCriticalSectionLockedByThread(*lock
))
1746 ERR("omp_set_lock called while holding lock %p\n", *lock
);
1750 EnterCriticalSection(*lock
);
1753 void CDECL
omp_unset_lock(omp_lock_t
*lock
)
1755 TRACE("(%p)\n", lock
);
1756 LeaveCriticalSection(*lock
);
1759 int CDECL
omp_test_lock(omp_lock_t
*lock
)
1761 TRACE("(%p)\n", lock
);
1763 if (RtlIsCriticalSectionLockedByThread(*lock
))
1766 return TryEnterCriticalSection(*lock
);
1769 void CDECL
omp_set_nest_lock(omp_nest_lock_t
*lock
)
1771 TRACE("(%p)\n", lock
);
1772 EnterCriticalSection(*lock
);
1775 void CDECL
omp_unset_nest_lock(omp_nest_lock_t
*lock
)
1777 TRACE("(%p)\n", lock
);
1778 LeaveCriticalSection(*lock
);
1781 int CDECL
omp_test_nest_lock(omp_nest_lock_t
*lock
)
1783 TRACE("(%p)\n", lock
);
1784 return TryEnterCriticalSection(*lock
) ? (*lock
)->RecursionCount
: 0;
1787 void CDECL
_vcomp_enter_critsect(CRITICAL_SECTION
**critsect
)
1789 TRACE("(%p)\n", critsect
);
1793 CRITICAL_SECTION
*new_critsect
= alloc_critsect();
1794 if (InterlockedCompareExchangePointer((void **)critsect
, new_critsect
, NULL
) != NULL
)
1795 destroy_critsect(new_critsect
); /* someone beat us to it */
1798 EnterCriticalSection(*critsect
);
1801 void CDECL
_vcomp_leave_critsect(CRITICAL_SECTION
*critsect
)
1803 TRACE("(%p)\n", critsect
);
1804 LeaveCriticalSection(critsect
);
/* Number of iterations a loop from start towards end performs with the
 * given step.  range_offset is step_sign * !end_included, i.e. it takes
 * the end point out again when the bound is exclusive.  For negative
 * steps both range and step are negated so the unsigned division stays
 * well defined. */
static unsigned int get_step_count(int start, int end, int range_offset, int step)
{
    int range = end - start + step - range_offset;

    if (step < 0)
        return (unsigned)-range / -step;
    else
        return (unsigned)range / step;
}
1817 static void CDECL
c2vectparallel_wrapper(int start
, int end
, int step
, int end_included
, BOOL dynamic_distribution
,
1818 int volatile *dynamic_start
, void *function
, int nargs
, va_list valist
)
1820 void *wrapper_args
[MAX_VECT_PARALLEL_CALLBACK_ARGS
];
1821 unsigned int step_count
, steps_per_call
, remainder
;
1822 int thread_count
= omp_get_num_threads();
1823 int curr_start
, curr_end
, range_offset
;
1824 int thread
= _vcomp_get_thread_num();
1827 copy_va_list_data(&wrapper_args
[2], valist
, nargs
- 2);
1829 step_sign
= step
> 0 ? 1 : -1;
1830 range_offset
= step_sign
* !end_included
;
1832 if (dynamic_distribution
)
1834 int next_start
, new_start
, end_value
;
1836 start
= *dynamic_start
;
1837 end_value
= end
+ !!end_included
* step
;
1838 while (start
!= end_value
)
1840 step_count
= get_step_count(start
, end
, range_offset
, step
);
1842 curr_end
= start
+ (step_count
+ thread_count
- 1) / thread_count
* step
1845 if ((curr_end
- end
) * step_sign
> 0)
1847 next_start
= end_value
;
1852 next_start
= curr_end
- range_offset
;
1856 if ((new_start
= InterlockedCompareExchange((LONG
volatile*)dynamic_start
, next_start
, start
)) != start
)
1862 wrapper_args
[0] = (void *)(ULONG_PTR
)start
;
1863 wrapper_args
[1] = (void *)(ULONG_PTR
)curr_end
;
1864 _vcomp_fork_call_wrapper(function
, nargs
, wrapper_args
);
1865 start
= *dynamic_start
;
1870 step_count
= get_step_count(start
, end
, range_offset
, step
);
1872 /* According to the tests native vcomp still makes extra calls
1873 * with empty range from excessive threads under certain conditions
1874 * for unclear reason. */
1875 if (thread
>= step_count
&& (end_included
|| (step
!= 1 && step
!= -1)))
1878 steps_per_call
= step_count
/ thread_count
;
1879 remainder
= step_count
% thread_count
;
1881 if (thread
< remainder
)
1883 curr_start
= thread
* (steps_per_call
+ 1);
1884 curr_end
= curr_start
+ steps_per_call
+ 1;
1886 else if (thread
< step_count
)
1888 curr_start
= remainder
+ steps_per_call
* thread
;
1889 curr_end
= curr_start
+ steps_per_call
;
1893 curr_start
= curr_end
= 0;
1896 curr_start
= start
+ curr_start
* step
;
1897 curr_end
= start
+ (curr_end
- 1) * step
+ range_offset
;
1899 wrapper_args
[0] = (void *)(ULONG_PTR
)curr_start
;
1900 wrapper_args
[1] = (void *)(ULONG_PTR
)curr_end
;
1901 _vcomp_fork_call_wrapper(function
, nargs
, wrapper_args
);
1904 void WINAPIV
C2VectParallel(int start
, int end
, int step
, BOOL end_included
, int thread_count
,
1905 BOOL dynamic_distribution
, void *function
, int nargs
, ...)
1907 struct vcomp_thread_data
*thread_data
;
1908 int volatile dynamic_start
;
1909 int prev_thread_count
;
1912 TRACE("start %d, end %d, step %d, end_included %d, thread_count %d, dynamic_distribution %#x,"
1913 " function %p, nargs %d.\n", start
, end
, step
, end_included
, thread_count
,
1914 dynamic_distribution
, function
, nargs
);
1916 if (nargs
> MAX_VECT_PARALLEL_CALLBACK_ARGS
)
1918 FIXME("Number of arguments %u exceeds supported maximum %u"
1919 " (not calling the loop code, expect problems).\n",
1920 nargs
, MAX_VECT_PARALLEL_CALLBACK_ARGS
);
1924 va_start(valist
, nargs
);
1926 /* This expression can result in integer overflow. According to the tests,
1927 * native vcomp runs the function as a single thread both for empty range
1928 * and (end - start) not fitting the integer range. */
1929 if ((step
> 0 && end
< start
) || (step
< 0 && end
> start
)
1930 || (end
- start
) / step
< 2 || thread_count
< 0)
1932 void *wrapper_args
[MAX_VECT_PARALLEL_CALLBACK_ARGS
];
1934 wrapper_args
[0] = (void *)(ULONG_PTR
)start
;
1935 wrapper_args
[1] = (void *)(ULONG_PTR
)end
;
1936 copy_va_list_data(&wrapper_args
[2], valist
, nargs
- 2);
1937 _vcomp_fork_call_wrapper(function
, nargs
, wrapper_args
);
1942 thread_data
= vcomp_init_thread_data();
1943 prev_thread_count
= thread_data
->fork_threads
;
1944 thread_data
->fork_threads
= thread_count
;
1946 dynamic_start
= start
;
1948 _vcomp_fork(TRUE
, 9, c2vectparallel_wrapper
, start
, end
, step
, end_included
, dynamic_distribution
,
1949 &dynamic_start
, function
, nargs
, valist
);
1951 thread_data
->fork_threads
= prev_thread_count
;
1955 BOOL WINAPI
DllMain(HINSTANCE instance
, DWORD reason
, LPVOID reserved
)
1957 TRACE("(%p, %ld, %p)\n", instance
, reason
, reserved
);
1961 case DLL_PROCESS_ATTACH
:
1963 SYSTEM_INFO sysinfo
;
1965 if ((vcomp_context_tls
= TlsAlloc()) == TLS_OUT_OF_INDEXES
)
1967 ERR("Failed to allocate TLS index\n");
1971 GetSystemInfo(&sysinfo
);
1972 vcomp_module
= instance
;
1973 vcomp_max_threads
= sysinfo
.dwNumberOfProcessors
;
1974 vcomp_num_threads
= sysinfo
.dwNumberOfProcessors
;
1975 vcomp_num_procs
= sysinfo
.dwNumberOfProcessors
;
1979 case DLL_PROCESS_DETACH
:
1981 if (reserved
) break;
1982 if (vcomp_context_tls
!= TLS_OUT_OF_INDEXES
)
1984 vcomp_free_thread_data();
1985 TlsFree(vcomp_context_tls
);
1990 case DLL_THREAD_DETACH
:
1992 vcomp_free_thread_data();