/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright (c) 2018 Joyent, Inc. All rights reserved.
 */

/*
 * Welcome to the magic behind the PLT (procedure linkage table). When rtld
 * fills out the PLT entries, it will refer initially to the functions in this
 * file. As such our goal is simple:
 *
 *	The lie of the function call must be preserved at all costs.
 *
 * This means that we need to prepare the system for an arbitrary series of
 * instructions to be called. For example, as a side effect of resolving a
 * symbol we may need to open a shared object which will cause any _init
 * functions to be called. Those functions can use any and all of the ABI state
 * that they desire (for example, the FPU registers). Therefore we must save and
 * restore all the ABI mandated registers here.
 *
 * For the full information about what we need to save and restore and why,
 * please see the System V amd64 PS ABI '3.2.3 Parameter Passing'. For general
 * purpose registers, we need to take care of the following:
 *
 *	%rax	- Used for information about the number of vector arguments
 *	%rdi	- arg0
 *	%rsi	- arg1
 *	%rdx	- arg2
 *	%rcx	- arg3
 *	%r8	- arg4
 *	%r9	- arg5
 *	%r10	- static chain pointer
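 *
 * Of these, %rax is the easiest to overlook: for a variadic callee, the ABI
 * uses %al to pass an upper bound on the number of vector registers that hold
 * arguments. As a purely illustrative sketch (not code from this file), a
 * compiler lowers a call like printf("%f\n", 1.0) to roughly:
 *
 *	movsd	.LC0(%rip), %xmm0	/ the double argument
 *	movl	$1, %eax		/ one vector register is in use
 *	call	printf
 *
 * so if we clobbered %rax on the way to the real function, a variadic callee
 * could save or scan the wrong number of vector registers.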
 *
 * Unfortunately, the world of the FPU is more complicated.
 *
 * The ABI mandates that we must save %xmm0-%xmm7. On newer Intel processors,
 * %xmm0-%xmm7 shadow %ymm0-%ymm7 and %zmm0-%zmm7. Historically, when saving the
 * FPU, we only saved and restored these eight registers. Unfortunately, this
 * process itself ended up having side effects. Because the registers shadow one
 * another, if we saved a full %zmm register when only a %xmm register was
 * valid, we would end up causing the processor to think that the full %zmm
 * register was valid. Once it believed that this was the case, it would then
 * degrade performance of code that only used the %xmm registers.
 *
 * One way to tackle this problem would have been to use xgetbv with ecx=1 to
 * get information about what was actually in use and only save and restore
 * that. You can imagine that this logic roughly ends up as something like:
 *
 *	if (zmm_inuse)
 *		save_zmm()
 *	if (ymm_inuse)
 *		save_ymm()
 *	save_xmm()
 *
 * However, this logic leaves us at the mercy of the branch predictor. This
 * means that all of our efforts can end up still causing the CPU to execute
 * things to make it think that some of these other FPU registers are in use and
 * thus defeat the optimizations that it has.
 *
 * To deal with this problem, Intel has suggested using the xsave family of
 * instructions. The kernel provides information through the aux vector about
 * the size required for the floating point registers as well as which of
 * several save methods we need to employ. This gets us out of trying to look
 * at the hardware capabilities and make decisions every time. The
 * amd64-specific portion of rtld processes those values and determines which
 * functions to use on an as-needed basis.
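 *
 * As a rough sketch only (these are not the real variable names used by the
 * rtld startup code), the selection amounts to something like:
 *
 *	switch (fp_save_type_from_auxv) {
 *	case fxsave:
 *		save = _elf_rtbndr_fp_fxsave;
 *		restore = _elf_rtbndr_fp_fxrestore;
 *		break;
 *	case xsave:
 *		save = _elf_rtbndr_fp_xsave;
 *		restore = _elf_rtbndr_fp_xrestore;
 *		break;
 *	default:
 *		save = _elf_rtbndr_fp_save_orig;
 *		restore = _elf_rtbndr_fp_restore_orig;
 *	}
 *
 * with the chosen routines published through _plt_fp_save and _plt_fp_restore
 * and the required buffer size through _plt_save_size, all of which are
 * referenced below.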
 *
 * There are two different functions that we export. The first is elf_rtbndr().
 * This is basically the glue that gets us into the PLT and lets us perform
 * relocations. elf_rtbndr() determines the address of the function that we must
 * call and arranges its stack such that when we return from elf_rtbndr() we
 * will instead jump to the actual relocated function, which will return to the
 * original caller. Because of this, we must preserve all of the registers that
 * are used for arguments and restore them before returning.
 *
 * The second function we export is elf_plt_trace(). This is used to add support
 * for audit libraries among other things. elf_plt_trace() may call the
 * underlying function itself as a side effect, or it may merely set up its
 * return to go to it. This changes how we handle %rax. If we call the function
 * ourselves, then we make sure that %rax is the return value rather than the
 * initial value. In addition, because we get %r11 from the surrounding PLT
 * code, we opt to preserve it in case some of the relocation logic ever ends
 * up calling back into us again.
 */

#if defined(lint)

#include <sys/types.h>
#include <_rtld.h>
#include <_audit.h>
#include <_elf.h>
#include <sys/regset.h>
#include <sys/auxv_386.h>

#else

#include <link.h>
#include <_audit.h>
#include <sys/asm_linkage.h>
#include <sys/auxv_386.h>
#include <sys/x86_archext.h>

/*
 * This macro is used to zero the xsave header. The contents of scratchreg will
 * be destroyed. locreg should contain the starting address of the xsave area;
 * the 64-byte header lives at offset 0x200 within that area.
 */
#define	XSAVE_HEADER_ZERO(scratch, loc)	\
	xorq	scratch, scratch;	\
	movq	scratch, 0x200(loc);	\
	movq	scratch, 0x208(loc);	\
	movq	scratch, 0x210(loc);	\
	movq	scratch, 0x218(loc);	\
	movq	scratch, 0x220(loc);	\
	movq	scratch, 0x228(loc);	\
	movq	scratch, 0x230(loc);	\
	movq	scratch, 0x238(loc)
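
/*
 * Within that header, the first quadword is XSTATE_BV and the second is
 * XCOMP_BV; the remaining quadwords are reserved. xrstor will fault on a
 * header containing garbage, so the macro simply clears all eight quadwords
 * and lets xsave set the XSTATE_BV bits for the state it actually writes.
 */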

	.file	"boot_elf.s"
	.text

/*
 * This section of the code contains glue functions that are used to take care
 * of saving and restoring the FPU. We deal with this in a few different ways
 * based on the hardware support and what exists. Historically we've only saved
 * and restored the first 8 floating point registers rather than the entire
 * FPU. That implementation still exists here and is kept around mostly as an
 * insurance policy.
 */
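
/*
 * Note that each of the eight register slots in the save area handed to these
 * routines is 64 bytes wide (offsets 0, 64, ..., 448) so that a full %zmm
 * register fits; when only %xmm or %ymm state is saved, the upper bytes of a
 * slot are simply left unused.
 */
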
	ENTRY(_elf_rtbndr_fp_save_orig)
	movq	org_scapset@GOTPCREL(%rip),%r11
	movq	(%r11),%r11		/* Syscapset_t pointer */
	movl	8(%r11),%edx		/* sc_hw_2 */
	testl	$AV_386_2_AVX512F,%edx
	jne	.save_zmm
	movl	(%r11),%edx		/* sc_hw_1 */
	testl	$AV_386_AVX,%edx
	jne	.save_ymm
	movdqa	%xmm0, (%rdi)
	movdqa	%xmm1, 64(%rdi)
	movdqa	%xmm2, 128(%rdi)
	movdqa	%xmm3, 192(%rdi)
	movdqa	%xmm4, 256(%rdi)
	movdqa	%xmm5, 320(%rdi)
	movdqa	%xmm6, 384(%rdi)
	movdqa	%xmm7, 448(%rdi)
	jmp	.save_finish

.save_ymm:
	vmovdqa	%ymm0, (%rdi)
	vmovdqa	%ymm1, 64(%rdi)
	vmovdqa	%ymm2, 128(%rdi)
	vmovdqa	%ymm3, 192(%rdi)
	vmovdqa	%ymm4, 256(%rdi)
	vmovdqa	%ymm5, 320(%rdi)
	vmovdqa	%ymm6, 384(%rdi)
	vmovdqa	%ymm7, 448(%rdi)
	jmp	.save_finish

.save_zmm:
	vmovdqa64	%zmm0, (%rdi)
	vmovdqa64	%zmm1, 64(%rdi)
	vmovdqa64	%zmm2, 128(%rdi)
	vmovdqa64	%zmm3, 192(%rdi)
	vmovdqa64	%zmm4, 256(%rdi)
	vmovdqa64	%zmm5, 320(%rdi)
	vmovdqa64	%zmm6, 384(%rdi)
	vmovdqa64	%zmm7, 448(%rdi)

.save_finish:
	ret
	SET_SIZE(_elf_rtbndr_fp_save_orig)

	ENTRY(_elf_rtbndr_fp_restore_orig)
	movq	org_scapset@GOTPCREL(%rip),%r11
	movq	(%r11),%r11		/* Syscapset_t pointer */
	movl	8(%r11),%edx		/* sc_hw_2 */
	testl	$AV_386_2_AVX512F,%edx
	jne	.restore_zmm
	movl	(%r11),%edx		/* sc_hw_1 */
	testl	$AV_386_AVX,%edx
	jne	.restore_ymm

	movdqa	(%rdi), %xmm0
	movdqa	64(%rdi), %xmm1
	movdqa	128(%rdi), %xmm2
	movdqa	192(%rdi), %xmm3
	movdqa	256(%rdi), %xmm4
	movdqa	320(%rdi), %xmm5
	movdqa	384(%rdi), %xmm6
	movdqa	448(%rdi), %xmm7
	jmp	.restore_finish

.restore_ymm:
	vmovdqa	(%rdi), %ymm0
	vmovdqa	64(%rdi), %ymm1
	vmovdqa	128(%rdi), %ymm2
	vmovdqa	192(%rdi), %ymm3
	vmovdqa	256(%rdi), %ymm4
	vmovdqa	320(%rdi), %ymm5
	vmovdqa	384(%rdi), %ymm6
	vmovdqa	448(%rdi), %ymm7
	jmp	.restore_finish

.restore_zmm:
	vmovdqa64	(%rdi), %zmm0
	vmovdqa64	64(%rdi), %zmm1
	vmovdqa64	128(%rdi), %zmm2
	vmovdqa64	192(%rdi), %zmm3
	vmovdqa64	256(%rdi), %zmm4
	vmovdqa64	320(%rdi), %zmm5
	vmovdqa64	384(%rdi), %zmm6
	vmovdqa64	448(%rdi), %zmm7

.restore_finish:
	ret
	SET_SIZE(_elf_rtbndr_fp_restore_orig)

	ENTRY(_elf_rtbndr_fp_fxsave)
	fxsaveq	(%rdi)
	ret
	SET_SIZE(_elf_rtbndr_fp_fxsave)

	ENTRY(_elf_rtbndr_fp_fxrestore)
	fxrstor	(%rdi)
	ret
	SET_SIZE(_elf_rtbndr_fp_fxrestore)
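
/*
 * The xsave and xrstor variants below pass the requested-feature bitmap to
 * the instruction in %edx:%eax; the hardware intersects that mask with what
 * is enabled in %xcr0. XFEATURE_FP_ALL names all of the FPU-related state
 * components the system knows about, so zeroing the header and loading that
 * mask is all the setup these routines need.
 */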

	ENTRY(_elf_rtbndr_fp_xsave)
	XSAVE_HEADER_ZERO(%rdx, %rdi)
	movq	$_CONST(XFEATURE_FP_ALL), %rdx
	movl	%edx, %eax
	shrq	$32, %rdx
	xsave	(%rdi)			/* save data */
	ret
	SET_SIZE(_elf_rtbndr_fp_xsave)

	ENTRY(_elf_rtbndr_fp_xrestore)
	movq	$_CONST(XFEATURE_FP_ALL), %rdx
	movl	%edx, %eax
	shrq	$32, %rdx
	xrstor	(%rdi)			/* restore data */
	ret
	SET_SIZE(_elf_rtbndr_fp_xrestore)

#endif

#if defined(lint)

/* ARGSUSED0 */
int
elf_plt_trace()
{
	return (0);
}

#else

/*
 * On entry the 'glue code' has already done the following:
 *
 *	pushq	%rbp
 *	movq	%rsp, %rbp
 *	subq	$0x10, %rsp
 *	leaq	trace_fields(%rip), %r11
 *	movq	%r11, -0x8(%rbp)
 *	movq	$elf_plt_trace, %r11
 *	jmp	*%r11
 *
 * so - -8(%rbp) contains the dyndata ptr
 *
 *	0x0	Addr	*reflmp
 *	0x8	Addr	*deflmp
 *	0x10	Word	symndx
 *	0x14	Word	sb_flags
 *	0x18	Sym	symdef.st_name
 *	0x1c		symdef.st_info
 *	0x1d		symdef.st_other
 *	0x1e		symdef.st_shndx
 *	0x20		symdef.st_value
 *	0x28		symdef.st_size
 *
 * Also note - on entry 16 bytes have already been subtracted
 * from the %rsp. The first 8 bytes are for the dyn_data_ptr,
 * the second 8 bytes are to align the stack and are available
 * for use.
 */
#define	REFLMP_OFF		0x0
#define	DEFLMP_OFF		0x8
#define	SYMNDX_OFF		0x10
#define	SBFLAGS_OFF		0x14
#define	SYMDEF_OFF		0x18
#define	SYMDEF_VALUE_OFF	0x20

/*
 * Next, we need to create a bunch of local storage. First, we have to preserve
 * the standard registers per the amd64 ABI. This means we need to deal with:
 *
 *	%rax	- Used for information about the number of vector arguments
 *	%rdi	- arg0
 *	%rsi	- arg1
 *	%rdx	- arg2
 *	%rcx	- arg3
 *	%r8	- arg4
 *	%r9	- arg5
 *	%r10	- static chain pointer
 *	%r11	- PLT Interwork register, our caller is using this, so it's
 *		  not a temporary for us.
 *
 * In addition, we need to save the amd64 ABI floating point arguments. Finally,
 * we need to deal with our local storage. We need a La_amd64_regs and a
 * uint64_t for the previous stack size.
 *
 * To deal with this and the potentially variable size of the FPU regs, we have
 * to play a few different games. We refer to all of the standard registers, the
 * previous stack size, and the La_amd64_regs structure off of %rbp. These are
 * all values that are below %rbp.
 */
#define	SPDYNOFF	-8
#define	SPDESTOFF	-16
#define	SPPRVSTKOFF	-24
#define	SPLAREGOFF	-88
#define	ORIG_RDI	-96
#define	ORIG_RSI	-104
#define	ORIG_RDX	-112
#define	ORIG_RCX	-120
#define	ORIG_R8		-128
#define	ORIG_R9		-136
#define	ORIG_R10	-144
#define	ORIG_R11	-152
#define	ORIG_RAX	-160
#define	PLT_SAVE_OFF	168
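
/*
 * A rough sketch of the frame that elf_plt_trace() builds below %rbp (the FPU
 * save area is variable in size, so it sits at the bottom and is 64-byte
 * aligned on its own):
 *
 *	SPDYNOFF	(-8)		dyndata pointer, stored by the glue code
 *	SPDESTOFF	(-16)		destination of the resolved function
 *	SPPRVSTKOFF	(-24)		previous frame size
 *	SPLAREGOFF	(-88)		La_amd64_regs (64 bytes)
 *	ORIG_*		(-96 .. -160)	saved argument registers, %r11 and %rax
 *	below that			FPU save area, %rsp aligned down to 64
 *
 * The subq of PLT_SAVE_OFF (168) moves %rsp past all of the fixed slots before
 * the variable-sized FPU area is carved out.
 */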

	ENTRY(elf_plt_trace)
	/*
	 * Save our static registers. After that 64-byte align us and subtract
	 * the appropriate amount for the FPU. The frame pointer has already
	 * been pushed for us by the glue code.
	 */
	movq	%rdi, ORIG_RDI(%rbp)
	movq	%rsi, ORIG_RSI(%rbp)
	movq	%rdx, ORIG_RDX(%rbp)
	movq	%rcx, ORIG_RCX(%rbp)
	movq	%r8, ORIG_R8(%rbp)
	movq	%r9, ORIG_R9(%rbp)
	movq	%r10, ORIG_R10(%rbp)
	movq	%r11, ORIG_R11(%rbp)
	movq	%rax, ORIG_RAX(%rbp)

	subq	$PLT_SAVE_OFF, %rsp

	movq	_plt_save_size@GOTPCREL(%rip),%r9
	movq	_plt_fp_save@GOTPCREL(%rip),%r10
	subq	(%r9), %rsp
	andq	$-64, %rsp
	movq	%rsp, %rdi
	call	*(%r10)

	/*
	 * Now that we've saved all of our registers, figure out what we need
	 * to do next.
	 */
	movq	SPDYNOFF(%rbp), %rax	/ %rax = dyndata
	testb	$LA_SYMB_NOPLTENTER, SBFLAGS_OFF(%rax)	/ <link.h>
	je	.start_pltenter
	movq	SYMDEF_VALUE_OFF(%rax), %rdi
	movq	%rdi, SPDESTOFF(%rbp)	/ save destination address
	jmp	.end_pltenter

.start_pltenter:
	/*
	 * save all registers into La_amd64_regs
	 */
	leaq	SPLAREGOFF(%rbp), %rsi	/ %rsi = &La_amd64_regs
	leaq	8(%rbp), %rdi
	movq	%rdi, 0(%rsi)		/ la_rsp
	movq	0(%rbp), %rdi
	movq	%rdi, 8(%rsi)		/ la_rbp
	movq	ORIG_RDI(%rbp), %rdi
	movq	%rdi, 16(%rsi)		/ la_rdi
	movq	ORIG_RSI(%rbp), %rdi
	movq	%rdi, 24(%rsi)		/ la_rsi
	movq	ORIG_RDX(%rbp), %rdi
	movq	%rdi, 32(%rsi)		/ la_rdx
	movq	ORIG_RCX(%rbp), %rdi
	movq	%rdi, 40(%rsi)		/ la_rcx
	movq	ORIG_R8(%rbp), %rdi
	movq	%rdi, 48(%rsi)		/ la_r8
	movq	ORIG_R9(%rbp), %rdi
	movq	%rdi, 56(%rsi)		/ la_r9

	/*
	 * prepare for call to la_pltenter
	 */
	movq	SPDYNOFF(%rbp), %r11	/ %r11 = &dyndata
	leaq	SBFLAGS_OFF(%r11), %r9	/ arg6 (&sb_flags)
	leaq	SPLAREGOFF(%rbp), %r8	/ arg5 (&La_amd64_regs)
	movl	SYMNDX_OFF(%r11), %ecx	/ arg4 (symndx)
	leaq	SYMDEF_OFF(%r11), %rdx	/ arg3 (&Sym)
	movq	DEFLMP_OFF(%r11), %rsi	/ arg2 (dlmp)
	movq	REFLMP_OFF(%r11), %rdi	/ arg1 (rlmp)
	call	audit_pltenter@PLT
	movq	%rax, SPDESTOFF(%rbp)	/ save calling address
.end_pltenter:

	/*
	 * If *no* la_pltexit() routines exist we do not need to keep the
	 * stack frame before we call the actual routine. Instead we jump to
	 * it and remove our stack frame from the stack at the same time.
	 */
	movl	audit_flags(%rip), %eax
	andl	$AF_PLTEXIT, %eax	/ value of audit.h:AF_PLTEXIT
	cmpl	$0, %eax
	je	.bypass_pltexit
	/*
	 * Has the *nopltexit* flag been set for this entry point?
	 */
	movq	SPDYNOFF(%rbp), %r11	/ %r11 = &dyndata
	testb	$LA_SYMB_NOPLTEXIT, SBFLAGS_OFF(%r11)
	je	.start_pltexit

.bypass_pltexit:
	/*
	 * No PLTEXIT processing required.
	 */
	movq	0(%rbp), %r11
	movq	%r11, -8(%rbp)		/ move prev %rbp
	movq	SPDESTOFF(%rbp), %r11	/ %r11 == calling destination
	movq	%r11, 0(%rbp)		/ store destination at top

	/* Restore FPU */
	movq	_plt_fp_restore@GOTPCREL(%rip),%r10

	movq	%rsp, %rdi
	call	*(%r10)

	movq	ORIG_RDI(%rbp), %rdi
	movq	ORIG_RSI(%rbp), %rsi
	movq	ORIG_RDX(%rbp), %rdx
	movq	ORIG_RCX(%rbp), %rcx
	movq	ORIG_R8(%rbp), %r8
	movq	ORIG_R9(%rbp), %r9
	movq	ORIG_R10(%rbp), %r10
	movq	ORIG_R11(%rbp), %r11
	movq	ORIG_RAX(%rbp), %rax

	subq	$8, %rbp		/ adjust %rbp for 'ret'
	movq	%rbp, %rsp		/
	/*
	 * At this point, after a little doctoring, we should
	 * have the following on the stack:
	 *
	 *	16(%rsp):  ret addr
	 *	8(%rsp):   dest_addr
	 *	0(%rsp):   Previous %rbp
	 *
	 * So - we pop the previous %rbp, and then
	 * ret to our final destination.
	 */
	popq	%rbp			/
	ret				/ jmp to final destination
					/ and clean up stack :)

.start_pltexit:
	/*
	 * In order to call the destination procedure and then return
	 * to audit_pltexit() for post analysis we must first grow
	 * our stack frame and then duplicate the original caller's
	 * stack state. This duplicates all of the arguments
	 * that were to be passed to the destination procedure.
	 */
	movq	%rbp, %rdi		/
	addq	$16, %rdi		/ %rdi = src
	movq	(%rbp), %rdx		/
	subq	%rdi, %rdx		/ %rdx == prev frame sz
	/*
	 * If audit_argcnt > 0 then we limit the number of
	 * arguments that will be duplicated to audit_argcnt.
	 *
	 * If (prev_stack_size > (audit_argcnt * 8))
	 *	prev_stack_size = audit_argcnt * 8;
	 */
	movl	audit_argcnt(%rip),%eax	/ %eax = audit_argcnt
	cmpl	$0, %eax
	jle	.grow_stack
	leaq	(,%rax,8), %rax		/ %rax = %eax * 8
	cmpq	%rax,%rdx
	jle	.grow_stack
	movq	%rax, %rdx
	/*
	 * Grow the stack and duplicate the arguments of the
	 * original caller.
	 */
.grow_stack:
	movq	%rsp, %r11
	subq	%rdx, %rsp		/ grow the stack
	movq	%rdx, SPPRVSTKOFF(%rbp)	/ -24(%rbp) == prev frame sz
	movq	%rsp, %rcx		/ %rcx = dest
	addq	%rcx, %rdx		/ %rdx == tail of dest
.while_base:
	cmpq	%rdx, %rcx		/ while (dest < tail of dest) {
	jge	.end_while		/
	movq	(%rdi), %rsi
	movq	%rsi,(%rcx)		/ *dest = *src
	addq	$8, %rdi		/ src++
	addq	$8, %rcx		/ dest++
	jmp	.while_base		/ }

	/*
	 * The above stack is now an exact duplicate of
	 * the stack of the original calling procedure.
	 */
.end_while:

	/ Restore registers using %r11 which contains our old %rsp value
	/ before growing the stack.

	movq	_plt_fp_restore@GOTPCREL(%rip),%r10
	movq	%r11, %rdi
	call	*(%r10)

.trace_r2_finish:
	movq	ORIG_RDI(%rbp), %rdi
	movq	ORIG_RSI(%rbp), %rsi
	movq	ORIG_RDX(%rbp), %rdx
	movq	ORIG_RCX(%rbp), %rcx
	movq	ORIG_R8(%rbp), %r8
	movq	ORIG_R9(%rbp), %r9
	movq	ORIG_R10(%rbp), %r10
	movq	ORIG_RAX(%rbp), %rax
	movq	ORIG_R11(%rbp), %r11

	/*
	 * Call to destination function - we'll return here
	 * for pltexit monitoring.
	 */
	call	*SPDESTOFF(%rbp)

	addq	SPPRVSTKOFF(%rbp), %rsp	/ cleanup dupped stack

	/ prepare for call to audit_pltexit()

	movq	SPDYNOFF(%rbp), %r11	/ %r11 = &dyndata
	movq	SYMNDX_OFF(%r11), %r8	/ arg5 (symndx)
	leaq	SYMDEF_OFF(%r11), %rcx	/ arg4 (&Sym)
	movq	DEFLMP_OFF(%r11), %rdx	/ arg3 (dlmp)
	movq	REFLMP_OFF(%r11), %rsi	/ arg2 (rlmp)
	movq	%rax, %rdi		/ arg1 (returnval)
	call	audit_pltexit@PLT

	/*
	 * Clean up after ourselves and return to the
	 * original calling procedure. Make sure to restore
	 * registers.
	 */

	movq	_plt_fp_restore@GOTPCREL(%rip),%r10
	movq	%rsp, %rdi
	movq	%rax, SPPRVSTKOFF(%rbp)
	call	*(%r10)

	movq	ORIG_RDI(%rbp), %rdi
	movq	ORIG_RSI(%rbp), %rsi
	movq	ORIG_RDX(%rbp), %rdx
	movq	ORIG_RCX(%rbp), %rcx
	movq	ORIG_R8(%rbp), %r8
	movq	ORIG_R9(%rbp), %r9
	movq	ORIG_R10(%rbp), %r10
	movq	ORIG_R11(%rbp), %r11
	movq	SPPRVSTKOFF(%rbp), %rax

	movq	%rbp, %rsp		/
	popq	%rbp			/
	ret				/ return to caller
	SET_SIZE(elf_plt_trace)

#endif

/*
 * We got here because a call to a function resolved to a procedure
 * linkage table entry. That entry did a jmp to the first PLT entry, which
 * in turn did a call to elf_rtbndr.
 *
 * The code sequence that got us here was:
 *
 * .PLT0:
 *	pushq	GOT+8(%rip)	#GOT[1]
 *	jmp	*GOT+16(%rip)	#GOT[2]
 *	nop
 *	nop
 *	nop
 *	nop
 *	...
 * PLT entry for foo:
 *	jmp	*name1@GOTPCREL(%rip)
 *	pushl	$rel.plt.foo
 *	jmp	PLT0
 *
 * At entry, the stack looks like this:
 *
 *	return address			16(%rsp)
 *	$rel.plt.foo (plt index)	8(%rsp)
 *	lmp				0(%rsp)
 */

#if defined(lint)

extern unsigned long elf_bndr(Rt_map *, unsigned long, caddr_t);

void
elf_rtbndr(Rt_map * lmp, unsigned long reloc, caddr_t pc)
{
	(void) elf_bndr(lmp, reloc, pc);
}

#else

/*
 * The PLT code that landed us here placed 2 arguments on the stack as
 * arguments to elf_rtbndr.
 * Additionally the pc of the caller is below these 2 args.
 * Our stack will look like this after we establish a stack frame with
 * the push %rbp; movq %rsp, %rbp sequence:
 *
 *	8(%rbp)			arg1 - *lmp
 *	16(%rbp), %rsi		arg2 - reloc index
 *	24(%rbp), %rdx		arg3 - pc of caller
 */
#define	LBPLMPOFF	8	/* arg1 - *lmp */
#define	LBPRELOCOFF	16	/* arg2 - reloc index */
#define	LBRPCOFF	24	/* arg3 - pc of caller */

/*
 * With the above in place, we must now proceed to preserve all temporary
 * registers that are also used for passing arguments. Specifically this
 * means:
 *
 *	%rax	- Used for information about the number of vector arguments
 *	%rdi	- arg0
 *	%rsi	- arg1
 *	%rdx	- arg2
 *	%rcx	- arg3
 *	%r8	- arg4
 *	%r9	- arg5
 *	%r10	- static chain pointer
 *
 * While we don't have to preserve %r11, we do have to preserve the FPU
 * registers. The FPU logic is delegated to a specific function that we'll call.
 * However, it requires that its stack is 64-byte aligned. We defer the
 * alignment to that point. This will also take care of the fact that a caller
 * may not call us with a correctly aligned stack pointer per the amd64 ABI.
 */
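
/*
 * As a purely illustrative example of the alignment performed below: if %rsp
 * were 0x...1238 and the kernel-reported save size were 832 bytes, then
 * "subq (%r9), %rsp" would yield 0x...0ef8 and "andq $-64, %rsp" would round
 * that down to 0x...0ec0, leaving at least the requested 832 bytes of
 * 64-byte aligned space for the FPU save area.
 */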

	.extern	_plt_save_size
	.extern	_plt_fp_save
	.extern	_plt_fp_restore

	.weak	_elf_rtbndr
	_elf_rtbndr = elf_rtbndr

	ENTRY(elf_rtbndr)
	pushq	%rbp			/* Establish stack frame */
	movq	%rsp, %rbp

	/*
	 * Save basic regs.
	 */
	pushq	%rax
	pushq	%rdi
	pushq	%rsi
	pushq	%rdx
	pushq	%rcx
	pushq	%r8
	pushq	%r9
	pushq	%r10
	pushq	%r12

	/*
	 * Look up the amount of space we need for the FPU registers and call
	 * the save function. Save %rsp before we manipulate it to make the
	 * restore easier.
	 */
	movq	%rsp, %r12
	movq	_plt_save_size@GOTPCREL(%rip),%r9
	movq	_plt_fp_save@GOTPCREL(%rip),%r10
	subq	(%r9), %rsp
	andq	$-64, %rsp

	movq	%rsp, %rdi
	call	*(%r10)

	/*
	 * Perform actual PLT logic. Note that the PLT-related arguments are
	 * located at an offset relative to %rbp.
	 */
	movq	LBPLMPOFF(%rbp), %rdi	/* arg1 - *lmp */
	movq	LBPRELOCOFF(%rbp), %rsi	/* arg2 - reloc index */
	movq	LBRPCOFF(%rbp), %rdx	/* arg3 - pc of caller */
	call	elf_bndr@PLT		/* call elf_bndr(lmp, relndx, pc) */
	movq	%rax, LBPRELOCOFF(%rbp)	/* store final destination */
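
	/*
	 * Note that we reuse the stack slot that held the reloc index to hold
	 * the resolved destination. After we unwind our frame below, the
	 * final ret pops that slot and transfers control to the resolved
	 * function, leaving the original caller's return address on the top
	 * of the stack.
	 */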

	/* Restore FPU */
	movq	_plt_fp_restore@GOTPCREL(%rip),%r10

	movq	%rsp, %rdi
	call	*(%r10)

	movq	%r12, %rsp
	popq	%r12
	popq	%r10
	popq	%r9
	popq	%r8
	popq	%rcx
	popq	%rdx
	popq	%rsi
	popq	%rdi
	popq	%rax

	movq	%rbp, %rsp		/* Restore our stack frame */
	popq	%rbp

	addq	$8, %rsp		/* pop 1st plt-pushed arg */
					/* the second argument is used */
					/* for the 'return' address to our */
					/* final destination */

	ret				/* invoke resolved function */
	SET_SIZE(elf_rtbndr)

#endif