kDefs.h: Added risc-v to the K_ARCH_XXX defines.
[kstuff-mirror.git] / kProfiler2 / prfamd64msc.asm
blob2b4ddc01d9aa97c6aea61a065ac1e14d787937d1
1 ; $Id$;
2 ;; @file
3 ; kProfiler Mark 2 - Microsoft C/C++ Compiler Interaction, AMD64.
7 ; Copyright (c) 2006-2007 Knut St. Osmundsen <bird-kStuff-spamix@anduin.net>
9 ; Permission is hereby granted, free of charge, to any person
10 ; obtaining a copy of this software and associated documentation
11 ; files (the "Software"), to deal in the Software without
12 ; restriction, including without limitation the rights to use,
13 ; copy, modify, merge, publish, distribute, sublicense, and/or sell
14 ; copies of the Software, and to permit persons to whom the
15 ; Software is furnished to do so, subject to the following
16 ; conditions:
18 ; The above copyright notice and this permission notice shall be
19 ; included in all copies or substantial portions of the Software.
21 ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 ; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
23 ; OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
24 ; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
25 ; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
26 ; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
27 ; FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
28 ; OTHER DEALINGS IN THE SOFTWARE.
;
; Global state shared by the hooks and the calibration code.
;
[section .data]
g_fCalibrated:  dd      0       ; Set to 1 by calibrate; common_return_path tests its low byte.
g_OverheadAdj:  dd      0       ; Tick adjustment applied in common_overhead and tuned by calibrate.

[section .text]

; The profiler entry points called by the hooks; defined outside this file.
extern  KPRF_ENTER
extern  KPRF_LEAVE

; The hook functions themselves.
global  _penter
global  _pexit

; Internal labels that are made global as well.
; NOTE(review): presumably only exported so they show up in symbol listings -
; the commented-out conditional suggests this was meant to be optional.
;ifdef  UNDEFINED
global  calibrate
global  calib_done
global  calib_inner_next
global  calib_inner_update_minimum
global  calib_nullproc
global  calib_outer_dec
global  calib_outer_inc
global  common_no_overhead
global  common_overhead
global  common_return_path
;endif
;;
; _penter - the function entry hook (AMD64).
;
; The call to this hook has been observed both as the very first instruction
; of the instrumented function and right after its 'sub rsp, xxh'.
;
; All volatile integer registers and rflags are preserved.
; NOTE(review): no xmm registers are saved here - confirm KPRF_ENTER does not
; need them preserved.
;
; Stack layout once the frame has been set up (offsets relative to rsp):
;       68h     return address into the instrumented function
;       60h     rax
;       58h     rdx
;       50h     rflags
;       48h     rcx
;       40h     r8
;       38h     r9
;       30h     r10
;       28h     r11
;       20h     64-bit entry timestamp (used again by common_return_path)
;   00h-1fh     spill area for the function being called
;
; Arguments handed to KPRF_ENTER:
;       rcx     the hook's return address, which identifies the function
;       rdx     frame pointer estimate (address of the return address slot,
;               plus the size of a preceding 'sub rsp' if one is detected)
;       r8      the timestamp
;
; The value KPRF_ENTER returns in rax is consumed by common_return_path.
;
%define PENTER_CALL_LEN     5           ; assumed length of the 'call _penter' instruction.
%define PENTER_SAVED_BYTES  (8*8)       ; eight 8-byte pushes.
%define PENTER_LOCAL_BYTES  28h         ; 20h spill area + 8 bytes for the timestamp.

align 16
_penter:
        ; Save the volatile registers and sample the time stamp counter.
        ; calib_penter mirrors this sequence, so keep the two in step.
        push    rax
        push    rdx
        rdtsc                                   ; edx:eax = time stamp.
        pushfq
        push    rcx
        push    r8
        push    r9
        push    r10
        push    r11
        sub     rsp, PENTER_LOCAL_BYTES         ; 8 pushes so far; this adds the spill area and ts slot.

        ; Build the KPRF_ENTER arguments.
        mov     r8d, edx
        shl     r8, 32
        or      r8, rax                         ; r8 = (edx << 32) | eax - param 3.
        mov     [rsp + 20h], r8                 ; keep the timestamp for common_return_path.
        lea     rdx, [rsp + PENTER_SAVED_BYTES + PENTER_LOCAL_BYTES] ; param 2 - default frame pointer.
        mov     rcx, [rdx]                      ; param 1 - our return address.

        ; MSC places the _penter call sometimes before and sometimes after the
        ; usual 'sub rsp, xxh'.  If such an instruction sits immediately in
        ; front of the call, add its immediate to the frame pointer so that
        ; unwinding (longjmp/throw) is a bit more accurate.  Since the number
        ; of push/pop around _penter/_pexit also varies, this can never be
        ; perfect.
        cmp     word [rcx - PENTER_CALL_LEN - 4], 08348h ; 48 83 EC ib = sub rsp, imm8
        jne     .probe_imm32
        cmp     byte [rcx - PENTER_CALL_LEN - 2], 0ech
        jne     .probe_imm32
        movzx   eax, byte [rcx - PENTER_CALL_LEN - 1]    ; eax = imm8
        jmp     .adjust_frame

.probe_imm32:
        cmp     word [rcx - PENTER_CALL_LEN - 7], 08148h ; 48 81 EC id = sub rsp, imm32
        jne     .enter
        cmp     byte [rcx - PENTER_CALL_LEN - 5], 0ech
        jne     .enter
        mov     eax, [rcx - PENTER_CALL_LEN - 4]         ; eax = imm32

.adjust_frame:
        add     rdx, rax                        ; skip the space the function already reserved.

.enter:
        call    KPRF_ENTER
        jmp     common_return_path
;;
; _pexit - the function exit hook (AMD64).
;
; The call to this hook has been observed right before the return
; instruction, and sometimes before the 'add rsp, xxh' that precedes it.
; This matters because the same stack address as in _penter has to be
; calculated here.
;
; All volatile integer registers and rflags are preserved.
;
; Stack layout once the frame has been set up (offsets relative to rsp):
;       68h     return address into the instrumented function
;       60h     rax
;       58h     rdx
;       50h     rflags
;       48h     rcx
;       40h     r8
;       38h     r9
;       30h     r10
;       28h     r11
;       20h     64-bit timestamp (used again by common_return_path)
;   00h-1fh     spill area for the function being called
;
; Arguments handed to KPRF_LEAVE:
;       rcx     the hook's return address, which identifies the function
;       rdx     frame pointer estimate (address of the return address slot,
;               plus the size of a following 'add rsp' if one is detected)
;       r8      the timestamp
;
; The value KPRF_LEAVE returns in rax is consumed by common_return_path.
;
%define PEXIT_SAVED_BYTES   (8*8)       ; eight 8-byte pushes.
%define PEXIT_LOCAL_BYTES   28h         ; 20h spill area + 8 bytes for the timestamp.

align 16
_pexit:
        ; Save the volatile registers and sample the time stamp counter.
        ; calib_pexit mirrors this sequence, so keep the two in step.
        push    rax
        push    rdx
        rdtsc                                   ; edx:eax = time stamp.
        pushfq
        push    rcx
        push    r8
        push    r9
        push    r10
        push    r11
        sub     rsp, PEXIT_LOCAL_BYTES          ; 8 pushes so far; this adds the spill area and ts slot.

        ; Build the KPRF_LEAVE arguments.
        mov     r8d, edx
        shl     r8, 32
        or      r8, rax                         ; r8 = (edx << 32) | eax - param 3.
        mov     [rsp + 20h], r8                 ; keep the timestamp for common_return_path.
        lea     rdx, [rsp + PEXIT_SAVED_BYTES + PEXIT_LOCAL_BYTES] ; param 2 - default frame pointer.
        mov     rcx, [rdx]                      ; param 1 - our return address.

        ; MSC sometimes puts the _pexit call before the 'add rsp, xxh'.  To
        ; match the adjustment _penter makes, look at the instruction we are
        ; going to return to and add its immediate to the frame pointer.
        cmp     word [rcx], 08348h              ; 48 83 C4 ib = add rsp, imm8
        jne     .probe_imm32
        cmp     byte [rcx + 2], 0c4h
        jne     .probe_imm32
        movzx   eax, byte [rcx + 3]             ; eax = imm8
        jmp     .adjust_frame

.probe_imm32:
        cmp     word [rcx], 08148h              ; 48 81 C4 id = add rsp, imm32
        jne     .leave
        cmp     byte [rcx + 2], 0c4h
        jne     .leave
        mov     eax, [rcx + 3]                  ; eax = imm32

.adjust_frame:
        add     rdx, rax                        ; skip the space the function has not released yet.

.leave:
        call    KPRF_LEAVE
        jmp     common_return_path
;;
; This is the common return path for both the enter and exit hooks.
; It's kept common because we can then use the same overhead adjustment
; and save some calibration efforts. It also saves space :-)
;
; Entry conditions (all three entry labels):
;       rsp             the frame built by _penter/_pexit/calib_penter/calib_pexit:
;                       [rsp + 20h] = timestamp taken on hook entry,
;                       [rsp + 28h] and up = r11, r10, r9, r8, rcx, rflags,
;                       rdx, rax and finally the hook's return address.
;       rax             common_return_path: value returned by KPRF_ENTER or
;                       KPRF_LEAVE.  Zero skips the overhead accounting,
;                       anything else is used as the address of a 64-bit
;                       overhead counter.
;                       common_overhead: address of the 64-bit counter.
;
; On exit all saved registers and rflags are restored and control returns to
; the code that called the hook.
;
align 16
common_return_path:
        ; Anything to account the overhead to?
        test    rax, rax
        jz      common_no_overhead

        ; First time around we calibrate the overhead adjustment.
        ; (calibrate preserves all registers, rax included.)
        cmp     byte [g_fCalibrated wrt rip], 0
        jnz     common_overhead
        call    calibrate

common_overhead:
        mov     rcx, rax                        ; rcx <- pointer to overhead counter.
        mov     eax, [g_OverheadAdj wrt rip]    ; apply the adjustment before reading tsc
        sub     [rsp + 20h], rax                ; (the 32-bit load zero extended it into rax.)

        rdtsc
        shl     rdx, 32
        or      rdx, rax                        ; rdx = 64-bit timestamp
        sub     rdx, [rsp + 20h]                ; rdx = elapsed
        lock add [rcx], rdx                     ; update counter.

common_no_overhead:
        ; restore volatile registers.
        add     rsp, 28h
        pop     r11
        pop     r10
        pop     r9
        pop     r8
        pop     rcx
        popfq
        pop     rdx
        pop     rax
        ret                                     ; back to the hook's caller; must not fall through.
;;
; The data rsi points to while we're calibrating.
; calibrate carves one instance out of its own stack frame.
;
struc CALIBDATA
    .Overhead:  resq 1          ; overhead counter that calib_penter hands to common_overhead.
    .Profiled:  resq 1          ; sum of (calib_pexit timestamp - calib_penter timestamp).
    .EnterTS:   resq 1          ; timestamp stored by the most recent calib_penter.
    .Min:       resq 1          ; smallest (Profiled - Overhead) seen in the current inner loop.
endstruc
align 16
;;
; Do necessary calibrations.
;
; Tunes g_OverheadAdj so that the profiled time of calib_nullproc (16 empty
; calib_penter/calib_pexit pairs), after subtracting the accounted overhead,
; ends up in the 20h..70h tick range.
;
; Takes no arguments.  Every general purpose register and rflags is saved on
; entry and restored on exit, so the caller (common_return_path) can keep
; using rax and its stack frame.  rsi points to a CALIBDATA instance on our
; stack while the calibration runs.
;
calibrate:
        ; prolog - save everything
        push    rbp
        pushfq
        push    rax                             ; pushaq
        push    rbx
        push    rcx
        push    rdx
        push    rdi
        push    rsi
        push    r8
        push    r9
        push    r10
        push    r11
        push    r12
        push    r13
        push    r14
        push    r15
        mov     rbp, rsp

        sub     rsp, CALIBDATA_size
        mov     rsi, rsp                        ; rsi points to the CALIBDATA

        and     rsp, -16                        ; align the stack for the calls below.

        ;
        ; Set the calibrated flag up front, so that the hooks stop sending
        ; callers in here while the calibration is running.
        ;
        mov     eax, 1
        xchg    dword [g_fCalibrated wrt rip], eax

        ;
        ; The outer loop - find the right adjustment.
        ;
        mov     ebx, 200h                       ; loop counter.
calib_outer_loop:

        ;
        ; The inner loop - calls the function number of times to establish a
        ;                  good minimum value
        ;
        mov     ecx, 200h
        mov     dword [rsi + CALIBDATA.Min], 0ffffffffh
        mov     dword [rsi + CALIBDATA.Min + 4], 07fffffffh ; Min = 7fffffffffffffffh
calib_inner_loop:

        ; zero the overhead and profiled times.
        xor     eax, eax
        mov     [rsi + CALIBDATA.Overhead], rax
        mov     [rsi + CALIBDATA.Profiled], rax
        call    calib_nullproc                  ; rcx and rsi survive: the hooks restore what they touch.

        ; subtract the overhead
        mov     rax, [rsi + CALIBDATA.Profiled]
        sub     rax, [rsi + CALIBDATA.Overhead]

        ; update the minimum value.
        bt      rax, 63
        jc near calib_outer_dec                 ; if negative, just simplify and shortcut
        cmp     rax, [rsi + CALIBDATA.Min]
        jge     calib_inner_next
calib_inner_update_minimum:
        mov     [rsi + CALIBDATA.Min], rax
calib_inner_next:
        dec     ecx                             ; same iteration count as the legacy 'loop' instruction.
        jnz     calib_inner_loop

        ; Is the minimum value acceptable?
        test    dword [rsi + CALIBDATA.Min + 4], 80000000h
        jnz     calib_outer_dec                 ; simplify if negative.
        cmp     dword [rsi + CALIBDATA.Min + 4], 0
        jnz     calib_outer_inc                 ; this shouldn't be possible
        cmp     dword [rsi + CALIBDATA.Min], 1fh
        jbe     calib_outer_dec                 ; too low - 2 ticks per pair is the minimum!
        ;cmp     dword [rsi + CALIBDATA.Min], 30h
        ;jbe     calib_done                      ; this is fine!
        cmp     dword [rsi + CALIBDATA.Min], 70h ; - a bit weird...
        jbe     calib_outer_next                ; do the full 200h*200h iteration
calib_outer_inc:
        inc     dword [g_OverheadAdj wrt rip]
        jmp     calib_outer_next
calib_outer_dec:
        ; Never go below 1.  Using 'jbe' rather than 'je' also stops us when
        ; the adjustment is still 0, which would otherwise wrap around to
        ; 0ffffffffh and wreck every subsequent overhead calculation.
        cmp     dword [g_OverheadAdj wrt rip], 1
        jbe     calib_done
        dec     dword [g_OverheadAdj wrt rip]
calib_outer_next:
        dec     ebx
        jnz     calib_outer_loop
calib_done:

        ; epilog - restore it all.
        mov     rsp, rbp
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     r11
        pop     r10
        pop     r9
        pop     r8
        pop     rsi
        pop     rdi
        pop     rdx
        pop     rcx
        pop     rbx
        pop     rax
        popfq
        pop     rbp
        ret                                     ; back to common_return_path; must not fall through.
;;
; The calibration _penter - this must be identical to the real thing except
; for the KPRF call.
;
; Input:    rsi = pointer to the CALIBDATA set up by calibrate.
; Effect:   records the entry timestamp in CALIBDATA.EnterTS and continues at
;           common_overhead with rax pointing to CALIBDATA.Overhead, so the
;           overhead of this hook is accumulated there.
;           common_overhead restores all registers and returns to our caller.
;
align 16
calib_penter:
        ; Register save and timestamp sampling - mirrors _penter.
        push    rax
        push    rdx
        rdtsc                                   ; edx:eax = time stamp.
        pushfq
        push    rcx
        push    r8
        push    r9
        push    r10
        push    r11
        sub     rsp, 28h                        ; 8 pushes so far; 20h spill area + 8 bytes for the ts.

        ; Assemble the 64-bit timestamp and store it in the frame, just like _penter.
        mov     r8d, edx
        shl     r8, 32
        or      r8, rax                         ; r8 = (edx << 32) | eax
        mov     [rsp + 20h], r8

        ; Remember when the 'function' was entered; calib_pexit needs it.
        mov     [rsi + CALIBDATA.EnterTS], r8

        ; Stand-in for the KPRF_ENTER return value: the overhead counter.
        lea     rax, [rsi + CALIBDATA.Overhead]
        jmp     common_overhead
;;
; The calibration _pexit - this must be identical to the real thing except
; for the KPRF call.
;
; Input:    rsi = pointer to the CALIBDATA set up by calibrate.
; Effect:   adds (now - CALIBDATA.EnterTS) to CALIBDATA.Profiled and continues
;           at common_overhead with rax pointing to CALIBDATA.EnterTS.
;           common_overhead restores all registers and returns to our caller.
;
; NOTE(review): handing out EnterTS as the 'overhead counter' means the exit
; overhead lands in a field the next calib_penter overwrites.  Presumably that
; is deliberate (the exit overhead is not part of the Profiled interval) -
; confirm before changing it.
;
align 16
calib_pexit:
        ; Register save and timestamp sampling - mirrors _pexit.
        push    rax
        push    rdx
        rdtsc                                   ; edx:eax = time stamp.
        pushfq
        push    rcx
        push    r8
        push    r9
        push    r10
        push    r11
        sub     rsp, 28h                        ; 8 pushes so far; 20h spill area + 8 bytes for the ts.

        ; Assemble the 64-bit timestamp and store it in the frame, just like _pexit.
        mov     r8d, edx
        shl     r8, 32
        or      r8, rax                         ; r8 = (edx << 32) | eax
        mov     [rsp + 20h], r8

        ; Profiled += time since the matching calib_penter.
        sub     r8, [rsi + CALIBDATA.EnterTS]
        add     [rsi + CALIBDATA.Profiled], r8

        ; Stand-in for the KPRF_LEAVE return value.
        lea     rax, [rsi + CALIBDATA.EnterTS]
        jmp     common_overhead
;;
; The 'function' we're profiling.
; The general idea is that each pair should take something like 2-10 ticks.
;
; (Btw. If we don't use multiple pairs here, we end up with the wrong result.)
;
; Called by calibrate with rsi pointing to the CALIBDATA.  The body is 16
; back-to-back calib_penter/calib_pexit pairs; the tick thresholds in
; calibrate (1fh = 2 ticks per pair) are based on that count, so keep the
; two in sync.
;
align 16
calib_nullproc:
%rep 16
        call    calib_penter
        call    calib_pexit
%endrep
        ret                                     ; back to calibrate; must not fall through.
470 ; Dummy stack check function.
472 global __chkstk
473 __chkstk: