3 ; kProfiler Mark 2 - Microsoft C/C++ Compiler Interaction, AMD64.
7 ; Copyright (c) 2006-2007 Knut St. Osmundsen <bird-kStuff-spamix@anduin.net>
9 ; Permission is hereby granted, free of charge, to any person
10 ; obtaining a copy of this software and associated documentation
11 ; files (the "Software"), to deal in the Software without
12 ; restriction, including without limitation the rights to use,
13 ; copy, modify, merge, publish, distribute, sublicense, and/or sell
14 ; copies of the Software, and to permit persons to whom the
15 ; Software is furnished to do so, subject to the following
18 ; The above copyright notice and this permission notice shall be
19 ; included in all copies or substantial portions of the Software.
21 ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 ; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
23 ; OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
24 ; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
25 ; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
26 ; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
27 ; FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
28 ; OTHER DEALINGS IN THE SOFTWARE.
47 global common_return_path
48 global common_overhead
49 global common_no_overhead
51 global calib_inner_update_minimum
52 global calib_inner_next
53 global calib_outer_dec
54 global calib_outer_inc
61 ; On x86 the call to this function has been observed to be put before
62 ; creating the stack frame, as the very first instruction in the function.
64 ; Thus the stack layout is as follows:
65 ; 24 return address of the calling function.
66 ; 20 our return address - the address of the calling function + 5.
71 ; c tsc high - param 3
73 ; 4 frame pointer - param 2
74 ; 0 function ptr - param 1
79 ; save volatile register and get the time stamp.
89 sub rsp
, 28h ; rsp is unaligned at this point (8 pushes).
90 ; reserve 20h for spill, and 8 bytes for ts.
92 ; setting up the enter call frame
95 or r8
, rax
; param 3 - the timestamp
96 mov [rsp
+ 20h], r8
; save the tsc for later use.
97 lea rdx
, [rsp
+ 8*8 + 28h] ; Param 2 - default frame pointer
98 mov rcx
, [rdx
] ; Param 1 - The function address
100 ; MSC seems to put the _penter both before and after the typical sub rsp, xxh
101 ; statement, as if it cannot quite make up its mind. We'll try to adjust for this
102 ; to make the unwinding a bit more accurate wrt longjmp/throw. But since
103 ; there is also an uneven number of push/pop around the _penter/_pexit, we
104 ; can never really make a perfect job of it. sigh.
105 cmp word [rcx
- 5 - 4], 08348h ; sub rsp, imm8
107 cmp byte [rcx
- 5 - 2], 0ech
109 movzx eax, byte [rcx
- 5 - 1] ; imm8
113 cmp word [rcx
- 5 - 7], 08148h ; sub rsp, imm32
115 cmp byte [rcx
- 5 - 5], 0ech
117 mov eax, [rcx
- 5 - 4] ; imm32
119 ; jmp .call_prf_enter
123 jmp common_return_path
127 ; On x86 the call to this function has been observed to be put right before
128 ; the return instruction. This fact matters since we have to calc the same
129 ; stack address as in _penter.
131 ; Thus the stack layout is as follows:
132 ; 24 return address of the calling function.
133 ; 20 our return address - the address of the calling function + 5.
138 ; c tsc high - param 3
140 ; 4 frame pointer - param 2
141 ; 0 function ptr - param 1
146 ; save volatile register and get the time stamp.
156 sub rsp
, 28h ; rsp is unaligned at this point (8 pushes).
157 ; reserve 20h for spill, and 8 bytes for ts.
159 ; setting up the leave call frame
162 or r8
, rax
; param 3 - the timestamp
163 mov [rsp
+ 20h], r8
; save the tsc for later use.
164 lea rdx
, [rsp
+ 8*8 + 28h] ; Param 2 - frame pointer.
165 mov rcx
, [rdx
] ; Param 1 - The function address
167 ; MSC sometimes puts the _pexit before the add rsp, xxh. To try to match up with
168 ; any adjustments made in _penter, we'll try to detect this.
169 cmp word [rcx
], 08348h ; add rsp, imm8
171 cmp byte [rcx
+ 2], 0c4h
173 movzx eax, byte [rcx
+ 3] ; imm8
177 cmp word [rcx
], 08148h ; add rsp, imm32
179 cmp byte [rcx
+ 2], 0c4h
181 mov eax, [rcx
+ 3] ; imm32
183 ; jmp .call_prf_leave
187 jmp common_return_path
191 ; This is the common return path for both the enter and exit hooks.
192 ; It's kept common because we can then use the same overhead adjustment
193 ; and save some calibration efforts. It also saves space :-)
198 jz common_no_overhead
199 cmp byte [g_fCalibrated wrt rip
], 0
203 mov rcx
, rax
; rcx <- pointer to overhead counter.
204 mov eax, [g_OverheadAdj wrt rip
]; apply the adjustment before reading tsc
209 or rdx
, rax
; rdx = 64-bit timestamp
210 sub rdx
, [rsp
+ 20h] ; rdx = elapsed
211 lock add [rcx
], rdx
; update counter.
214 ; restore volatile registers.
227 ; Data rsi points to while we're calibrating.
239 ; Do necessary calibrations.
242 ; prolog - save everything
261 sub rsp
, CALIBDATA_size
262 mov rsi
, rsp
; rsi points to the CALIBDATA
267 ; Indicate that we have finished calibrating.
270 xchg dword [g_fCalibrated wrt rip
], eax
273 ; The outer loop - find the right adjustment.
275 mov ebx, 200h ; loop counter.
279 ; The inner loop - calls the function a number of times to establish a
283 mov dword [rsi
+ CALIBDATA.Min
], 0ffffffffh
284 mov dword [rsi
+ CALIBDATA.Min
+ 4], 07fffffffh
287 ; zero the overhead and profiled times.
289 mov [rsi
+ CALIBDATA.Overhead
], rax
290 mov [rsi
+ CALIBDATA.Profiled
], rax
293 ; subtract the overhead
294 mov rax
, [rsi
+ CALIBDATA.Profiled
]
295 sub rax
, [rsi
+ CALIBDATA.Overhead
]
297 ; update the minimum value.
299 jc near calib_outer_dec
; if negative, just simplify and shortcut
300 cmp rax
, [rsi
+ CALIBDATA.Min
]
302 calib_inner_update_minimum:
303 mov [rsi
+ CALIBDATA.Min
], rax
305 loop calib_inner_loop
307 ; Is the minimum value acceptable?
308 test dword [rsi
+ CALIBDATA.Min
+ 4], 80000000h
309 jnz calib_outer_dec
; simplify if negative.
310 cmp dword [rsi
+ CALIBDATA.Min
+ 4], 0
311 jnz calib_outer_inc
; this shouldn't be possible
312 cmp dword [rsi
+ CALIBDATA.Min
], 1fh
313 jbe calib_outer_dec
; too low - 2 ticks per pair is the minimum!
314 ;cmp dword [rsi + CALIBDATA.Min], 30h
315 ;jbe calib_done ; this is fine!
316 cmp dword [rsi
+ CALIBDATA.Min
], 70h ; - a bit weird...
317 jbe calib_outer_next
; do the full 200h*200h iteration
319 inc dword [g_OverheadAdj wrt rip
]
322 cmp dword [g_OverheadAdj wrt rip
], 1
324 dec dword [g_OverheadAdj wrt rip
]
330 ; epilog - restore it all.
354 ; The calibration _penter - this must be identical to the real thing except for the KPRF call.
357 ; This part must be identical past the rdtsc.
367 sub rsp
, 28h ; rsp is unaligned at this point (8 pushes).
368 ; reserve 20h for spill, and 8 bytes for ts.
370 ; store the entry / stack frame.
376 mov [rsi
+ CALIBDATA.EnterTS
], r8
378 lea rax
, [rsi
+ CALIBDATA.Overhead
]
383 ; The calibration _pexit - this must be identical to the real thing except for the KPRF call.
386 ; This part must be identical past the rdtsc.
396 sub rsp
, 28h ; rsp is unaligned at this point (8 pushes).
397 ; reserve 20h for spill, and 8 bytes for ts.
399 ; store the entry / stack frame.
405 sub r8
, [rsi
+ CALIBDATA.EnterTS
]
406 add [rsi
+ CALIBDATA.Profiled
], r8
408 lea rax
, [rsi
+ CALIBDATA.EnterTS
]
413 ; The 'function' we're profiling.
414 ; The general idea is that each pair should take something like 2-10 ticks.
416 ; (Btw. If we don't use multiple pairs here, we end up with the wrong result.)
470 ; Dummy stack check function.