usr/src/uts/intel/ia32/os/fpu.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2018, Joyent, Inc.
  24  */
  25
  26 /*      Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
  27 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T   */
  28 /*              All Rights Reserved                             */
  29
  30 /*      Copyright (c) 1987, 1988 Microsoft Corporation          */
  31 /*              All Rights Reserved                             */
  32
  33 /*
  34  * Copyright (c) 2009, Intel Corporation.
  35  * All rights reserved.
  36  */
  37
  38 #include <sys/types.h>
  39 #include <sys/param.h>
  40 #include <sys/signal.h>
  41 #include <sys/regset.h>
  42 #include <sys/privregs.h>
  43 #include <sys/psw.h>
  44 #include <sys/trap.h>
  45 #include <sys/fault.h>
  46 #include <sys/systm.h>
  47 #include <sys/user.h>
  48 #include <sys/file.h>
  49 #include <sys/proc.h>
  50 #include <sys/pcb.h>
  51 #include <sys/lwp.h>
  52 #include <sys/cpuvar.h>
  53 #include <sys/thread.h>
  54 #include <sys/disp.h>
  55 #include <sys/fp.h>
  56 #include <sys/siginfo.h>
  57 #include <sys/archsystm.h>
  58 #include <sys/kmem.h>
  59 #include <sys/debug.h>
  60 #include <sys/x86_archext.h>
  61 #include <sys/sysmacros.h>
  62 #include <sys/cmn_err.h>
  63
  64 /*
  65  * FPU Management Overview
  66  * -----------------------
  67  *
  68  * The x86 FPU has evolved substantially since its days as the x87 coprocessor;
  69  * however, many aspects of its life as a coprocessor are still around in x86.
  70  *
  71  * Today, when we refer to the 'FPU', we don't just mean the original x87 FPU.
  72  * While that state still exists, there is much more that is covered by the FPU.
  73  * Today, this includes not just traditional FPU state, but also supervisor only
  74  * state. The following state is currently managed and covered logically by the
  75  * idea of the FPU registers:
  76  *
  77  *    o Traditional x87 FPU
  78  *    o Vector Registers (%xmm, %ymm, %zmm)
  79  *    o Memory Protection Extensions (MPX) Bounds Registers
   80  *    o Protection Key Rights Register for User Pages (PKRU)
  81  *    o Processor Trace data
  82  *
  83  * The rest of this covers how the FPU is managed and controlled, how state is
  84  * saved and restored between threads, interactions with hypervisors, and other
  85  * information exported to user land through aux vectors. A lot of background
  86  * information is here to synthesize major parts of the Intel SDM, but
  87  * unfortunately, it is not a replacement for reading it.
  88  *
  89  * FPU Control Registers
  90  * ---------------------
  91  *
  92  * Because the x87 FPU began its life as a co-processor and the FPU was
  93  * optional there are several bits that show up in %cr0 that we have to
  94  * manipulate when dealing with the FPU. These are:
  95  *
  96  *   o CR0.ET   The 'extension type' bit. This was used originally to indicate
  97  *              that the FPU co-processor was present. Now it is forced on for
  98  *              compatibility. This is often used to verify whether or not the
  99  *              FPU is present.
 100  *
  101  *   o CR0.NE   The 'numeric error' bit. Used to indicate that native error
 102  *              mode should be enabled. This indicates that we should take traps
 103  *              on FPU errors. The OS enables this early in boot.
 104  *
 105  *   o CR0.MP   The 'Monitor Coprocessor' bit. Used to control whether or not
 106  *              wait/fwait instructions generate a #NM if CR0.TS is set.
 107  *
 108  *   o CR0.EM   The 'Emulation' bit. This is used to cause floating point
 109  *              operations (x87 through SSE4) to trap with a #UD so they can be
 110  *              emulated. The system never sets this bit, but makes sure it is
 111  *              clear on processor start up.
 112  *
 113  *   o CR0.TS   The 'Task Switched' bit. When this is turned on, a floating
 114  *              point operation will generate a #NM. An fwait will as well,
 115  *              depending on the value in CR0.MP.
 116  *
 117  * Our general policy is that CR0.ET, CR0.NE, and CR0.MP are always set by
 118  * the system. Similarly CR0.EM is always unset by the system. CR0.TS has a more
 119  * complicated role. Historically it has been used to allow running systems to
 120  * restore the FPU registers lazily. This will be discussed in greater depth
 121  * later on.
 122  *
 123  * %cr4 is also used as part of the FPU control. Specifically we need to worry
 124  * about the following bits in the system:
 125  *
 126  *   o CR4.OSFXSR       This bit is used to indicate that the OS understands and
 127  *                      supports the execution of the fxsave and fxrstor
 128  *                      instructions. This bit is required to be set to enable
 129  *                      the use of the SSE->SSE4 instructions.
 130  *
 131  *   o CR4.OSXMMEXCPT   This bit is used to indicate that the OS can understand
 132  *                      and take a SIMD floating point exception (#XM). This bit
 133  *                      is always enabled by the system.
 134  *
 135  *   o CR4.OSXSAVE      This bit is used to indicate that the OS understands and
 136  *                      supports the execution of the xsave and xrstor family of
 137  *                      instructions. This bit is required to use any of the AVX
 138  *                      and newer feature sets.
 139  *
 140  * Because all supported processors are 64-bit, they'll always support the XMM
  141  * extensions and we will enable both CR4.OSFXSR and CR4.OSXMMEXCPT in boot.
 142  * CR4.OSXSAVE will be enabled and used whenever xsave is reported in cpuid.
 143  *
 144  * %xcr0 is used to manage the behavior of the xsave feature set and is only
  145  * present on the system if xsave is supported. %xcr0 is read and written
  146  * through the xgetbv and xsetbv instructions. This register is present
 147  * whenever the xsave feature set is supported. Each bit in %xcr0 refers to a
 148  * different component of the xsave state and controls whether or not that
 149  * information is saved and restored. For newer feature sets like AVX and MPX,
 150  * it also controls whether or not the corresponding instructions can be
  151  * executed (much like CR4.OSFXSR does for the SSE feature sets).
 152  *
 153  * Everything in %xcr0 is around features available to users. There is also the
 154  * IA32_XSS MSR which is used to control supervisor-only features that are still
 155  * part of the xsave state. Bits that can be set in %xcr0 are reserved in
 156  * IA32_XSS and vice versa. This is an important property that is particularly
 157  * relevant to how the xsave instructions operate.
 158  *
 159  * Save Mechanisms
 160  * ---------------
 161  *
 162  * When switching between running threads the FPU state needs to be saved and
 163  * restored by the OS. If this state was not saved, users would rightfully
 164  * complain about corrupt state. There are three mechanisms that exist on the
 165  * processor for saving and restoring these state images:
 166  *
 167  *   o fsave
 168  *   o fxsave
 169  *   o xsave
 170  *
 171  * fsave saves and restores only the x87 FPU and is the oldest of these
 172  * mechanisms. This mechanism is never used in the kernel today because we are
 173  * always running on systems that support fxsave.
 174  *
 175  * The fxsave and fxrstor mechanism allows the x87 FPU and the SSE register
 176  * state to be saved and restored to and from a struct fxsave_state. This is the
 177  * default mechanism that is used to save and restore the FPU on amd64. An
 178  * important aspect of fxsave that was different from the original i386 fsave
 179  * mechanism is that the restoring of FPU state with pending exceptions will not
 180  * generate an exception, it will be deferred to the next use of the FPU.
 181  *
 182  * The final and by far the most complex mechanism is that of the xsave set.
 183  * xsave allows for saving and restoring all of the traditional x86 pieces (x87
 184  * and SSE), while allowing for extensions that will save the %ymm, %zmm, etc.
 185  * registers.
 186  *
 187  * Data is saved and restored into and out of a struct xsave_state. The first
 188  * part of the struct xsave_state is equivalent to the struct fxsave_state.
 189  * After that, there is a header which is used to describe the remaining
 190  * portions of the state. The header is a 64-byte value of which the first two
 191  * uint64_t values are defined and the rest are reserved and must be zero. The
 192  * first uint64_t is the xstate_bv member. This describes which values in the
 193  * xsave_state are actually valid and present. This is updated on a save and
  194  * used on restore. The second member is the xcomp_bv member. Its top bit
  195  * (bit 63) determines whether the compressed form of the structure is used.
 196  *
 197  * When the uncompressed structure is used (currently the only format we
 198  * support), then each state component is at a fixed offset in the structure,
 199  * even if it is not being used. For example, if you only saved the AVX related
 200  * state, but did not save the MPX related state, the offset would not change
 201  * for any component. With the compressed format, components that aren't used
 202  * are all elided (though the x87 and SSE state are always there).
 203  *
 204  * Unlike fxsave which saves all state, the xsave family does not always save
 205  * and restore all the state that could be covered by the xsave_state. The
 206  * instructions all take an argument which is a mask of what to consider. This
 207  * is the same mask that will be used in the xstate_bv vector and it is also the
  208  * same values that are present in %xcr0 and IA32_XSS, though IA32_XSS is only
  209  * considered by the xsaves and xrstors instructions.
 210  *
 211  * When a save or restore is requested, a bitwise and is performed between the
 212  * requested bits and those that have been enabled in %xcr0. Only the bits that
 213  * match that are then saved or restored. Others will be silently ignored by
 214  * the processor. This idea is used often in the OS. We will always request that
 215  * we save and restore all of the state, but only those portions that are
 216  * actually enabled in %xcr0 will be touched.
 217  *
 218  * If a feature has been asked to be restored that is not set in the xstate_bv
 219  * feature vector of the save state, then it will be set to its initial state by
 220  * the processor (usually zeros). Also, when asked to save state, the processor
 221  * may not write out data that is in its initial state as an optimization. This
 222  * optimization only applies to saving data and not to restoring data.
 223  *
 224  * There are a few different variants of the xsave and xrstor instruction. They
 225  * are:
 226  *
 227  *   o xsave    This is the original save instruction. It will save all of the
 228  *              requested data in the xsave state structure. It only saves data
 229  *              in the uncompressed (xcomp_bv[63] is zero) format. It may be
 230  *              executed at all privilege levels.
 231  *
 232  *   o xrstor   This is the original restore instruction. It will restore all of
 233  *              the requested data. The xrstor function can handle both the
 234  *              compressed and uncompressed formats. It may be executed at all
 235  *              privilege levels.
 236  *
 237  *   o xsaveopt This is a variant of the xsave instruction that employs
 238  *              optimizations to try and only write out state that has been
 239  *              modified since the last time an xrstor instruction was called.
 240  *              The processor tracks a tuple of information about the last
 241  *              xrstor and tries to ensure that the same buffer is being used
 242  *              when this optimization is being used. However, because of the
 243  *              way that it tracks the xrstor buffer based on the address of it,
 244  *              it is not suitable for use if that buffer can be easily reused.
 245  *              The most common case is trying to save data to the stack in
 246  *              rtld. It may be executed at all privilege levels.
 247  *
 248  *   o xsavec   This is a variant of the xsave instruction that writes out the
 249  *              compressed form of the xsave_state. Otherwise it behaves as
 250  *              xsave. It may be executed at all privilege levels.
 251  *
 252  *   o xsaves   This is a variant of the xsave instruction. It is similar to
 253  *              xsavec in that it always writes the compressed form of the
 254  *              buffer. Unlike all the other forms, this instruction looks at
 255  *              both the user (%xcr0) and supervisor (IA32_XSS MSR) to determine
 256  *              what to save and restore. xsaves also implements the same
 257  *              optimization that xsaveopt does around modified pieces. User
 258  *              land may not execute the instruction.
 259  *
 260  *   o xrstors  This is a variant of the xrstor instruction. Similar to xsaves
  261  *              it can restore both the user and privileged states.
 262  *              Unlike xrstor it can only operate on the compressed form.
 263  *              User land may not execute the instruction.
 264  *
 265  * Based on all of these, the kernel has a precedence for what it will use.
 266  * Basically, xsaves (not supported) is preferred to xsaveopt, which is
 267  * preferred to xsave. A similar scheme is used when informing rtld (more later)
 268  * about what it should use. xsavec is preferred to xsave. xsaveopt is not
 269  * recommended due to the modified optimization not being appropriate for this
 270  * use.
 271  *
 272  * Finally, there is one last gotcha with the xsave state. Importantly some AMD
 273  * processors did not always save and restore some of the FPU exception state in
 274  * some cases like Intel did. In those cases the OS will make up for this fact
 275  * itself.
 276  *
 277  * FPU Initialization
 278  * ------------------
 279  *
 280  * One difference with the FPU registers is that not all threads have FPU state,
 281  * only those that have an lwp. Generally this means kernel threads, which all
 282  * share p0 and its lwp, do not have FPU state. Though there are definitely
 283  * exceptions such as kcfpoold. In the rest of this discussion we'll use thread
 284  * and lwp interchangeably, just think of thread meaning a thread that has a
 285  * lwp.
 286  *
 287  * Each lwp has its FPU state allocated in its pcb (process control block). The
 288  * actual storage comes from the fpsave_cachep kmem cache. This cache is sized
 289  * dynamically at start up based on the save mechanism that we're using and the
 290  * amount of memory required for it. This is dynamic because the xsave_state
 291  * size varies based on the supported feature set.
 292  *
 293  * The hardware side of the FPU is initialized early in boot before we mount the
 294  * root file system. This is effectively done in fpu_probe(). This is where we
 295  * make the final decision about what the save and restore mechanisms we should
 296  * use are, create the fpsave_cachep kmem cache, and initialize a number of
 297  * function pointers that use save and restoring logic.
 298  *
  299  * The thread/lwp side is a little more involved. There are two different
 300  * things that we need to concern ourselves with. The first is how the FPU
 301  * resources are allocated and the second is how the FPU state is initialized
 302  * for a given lwp.
 303  *
 304  * We allocate the FPU save state from our kmem cache as part of lwp_fp_init().
 305  * This is always called unconditionally by the system as part of creating an
 306  * LWP.
 307  *
 308  * There are three different initialization paths that we deal with. The first
 309  * is when we are executing a new process. As part of exec all of the register
 310  * state is reset. The exec case is particularly important because init is born
 311  * like Athena, sprouting from the head of the kernel, without any true parent
 312  * to fork from. The second is used whenever we fork or create a new lwp.  The
 313  * third is to deal with special lwps like the agent lwp.
 314  *
 315  * During exec, we will call fp_exec() which will initialize and set up the FPU
 316  * state for the process. That will fill in the initial state for the FPU and
 317  * also set that state in the FPU itself. As part of fp_exec() we also install a
 318  * thread context operations vector that takes care of dealing with the saving
 319  * and restoring of the FPU. These context handlers will also be called whenever
 320  * an lwp is created or forked. In those cases, to initialize the FPU we will
 321  * call fp_new_lwp(). Like fp_exec(), fp_new_lwp() will install a context
 322  * operations vector for the new thread.
 323  *
 324  * Next we'll end up in the context operation fp_new_lwp(). This saves the
 325  * current thread's state, initializes the new thread's state, and copies over
  326  * the relevant parts of the originating thread's state. It's at this point that
 327  * we also install the FPU context operations into the new thread, which ensures
 328  * that all future threads that are descendants of the current one get the
 329  * thread context operations (unless they call exec).
 330  *
 331  * To deal with some things like the agent lwp, we double check the state of the
 332  * FPU in sys_rtt_common() to make sure that it has been enabled before
 333  * returning to user land. In general, this path should be rare, but it's useful
 334  * for the odd lwp here and there.
 335  *
 336  * The FPU state will remain valid most of the time. There are times that
 337  * the state will be rewritten. For example in restorecontext, due to /proc, or
 338  * the lwp calls exec(). Whether the context is being freed or we are resetting
 339  * the state, we will call fp_free() to disable the FPU and our context.
 340  *
 341  * Finally, when the lwp is destroyed, it will actually destroy and free the FPU
 342  * state by calling fp_lwp_cleanup().
 343  *
 344  * Kernel FPU Multiplexing
 345  * -----------------------
 346  *
 347  * Just as the kernel has to maintain all of the general purpose registers when
 348  * switching between scheduled threads, the same is true of the FPU registers.
 349  *
 350  * When a thread has FPU state, it also has a set of context operations
 351  * installed. These context operations take care of making sure that the FPU is
 352  * properly saved and restored during a context switch (fpsave_ctxt and
 353  * fprestore_ctxt respectively). This means that the current implementation of
 354  * the FPU is 'eager', when a thread is running the CPU will have its FPU state
 355  * loaded. While this is always true when executing in userland, there are a few
 356  * cases where this is not true in the kernel.
 357  *
 358  * This was not always the case. Traditionally on x86 a 'lazy' FPU restore was
 359  * employed. This meant that the FPU would be saved on a context switch and the
 360  * CR0.TS bit would be set. When a thread next tried to use the FPU, it would
 361  * then take a #NM trap, at which point we would restore the FPU from the save
 362  * area and return to user land. Given the frequency of use of the FPU alone by
 363  * libc, there's no point returning to user land just to trap again.
 364  *
 365  * There are a few cases though where the FPU state may need to be changed for a
 366  * thread on its behalf. The most notable cases are in the case of processes
 367  * using /proc, restorecontext, forking, etc. In all of these cases the kernel
  368  * will force a thread's FPU state to be saved into the PCB through the
  369  * fp_save() function. Whenever the FPU is saved, FPU_VALID is set on the
 370  * pcb. This indicates that the save state holds currently valid data. As a side
 371  * effect of this, CR0.TS will be set. To make sure that all of the state is
 372  * updated before returning to user land, in these cases, we set a flag on the
 373  * PCB that says the FPU needs to be updated. This will make sure that we take
 374  * the slow path out of a system call to fix things up for the thread. Due to
 375  * the fact that this is a rather rare case, effectively setting the equivalent
 376  * of t_postsys is acceptable.
 377  *
 378  * CR0.TS will be set after a save occurs and cleared when a restore occurs.
 379  * Generally this means it will be cleared immediately by the new thread that is
 380  * running in a context switch. However, this isn't the case for kernel threads.
 381  * They currently operate with CR0.TS set as no kernel state is restored for
 382  * them. This means that using the FPU will cause a #NM and panic.
 383  *
 384  * The FPU_VALID flag on the currently executing thread's pcb is meant to track
 385  * what the value of CR0.TS should be. If it is set, then CR0.TS will be set.
 386  * However, because we eagerly restore, the only time that CR0.TS should be set
 387  * for a non-kernel thread is during operations where it will be cleared before
 388  * returning to user land and importantly, the only data that is in it is its
 389  * own.
 390  *
 391  * FPU Exceptions
 392  * --------------
 393  *
 394  * Certain operations can cause the kernel to take traps due to FPU activity.
  395  * Generally these events will cause a user process to receive a SIGFPE and if
 396  * the kernel receives it in kernel context, we will die. Traditionally the #NM
 397  * (Device Not Available / No Math) exception generated by CR0.TS would have
 398  * caused us to restore the FPU. Now it is a fatal event regardless of whether
 399  * or not user land causes it.
 400  *
 401  * While there are some cases where the kernel uses the FPU, it is up to the
 402  * kernel to use the FPU in a way such that it cannot receive a trap or to use
 403  * the appropriate trap protection mechanisms.
 404  *
 405  * Hypervisors
 406  * -----------
 407  *
 408  * When providing support for hypervisors things are a little bit more
 409  * complicated because the FPU is not virtualized at all. This means that they
 410  * need to save and restore the FPU and %xcr0 across entry and exit to the
 411  * guest. To facilitate this, we provide a series of APIs in <sys/hma.h>. These
 412  * allow us to use the full native state to make sure that we are always saving
 413  * and restoring the full FPU that the host sees, even when the guest is using a
 414  * subset.
 415  *
 416  * One tricky aspect of this is that the guest may be using a subset of %xcr0
 417  * and therefore changing our %xcr0 on the fly. It is vital that when we're
 418  * saving and restoring the FPU that we always use the largest %xcr0 contents
 419  * otherwise we will end up leaving behind data in it.
 420  *
 421  * ELF PLT Support
 422  * ---------------
 423  *
 424  * rtld has to preserve a subset of the FPU when it is saving and restoring
 425  * registers due to the amd64 SYS V ABI. See cmd/sgs/rtld/amd64/boot_elf.s for
 426  * more information. As a result, we set up an aux vector that contains
 427  * information about what save and restore mechanisms it should be using and
 428  * the sizing thereof based on what the kernel supports. This is passed down in
 429  * a series of aux vectors SUN_AT_FPTYPE and SUN_AT_FPSIZE. This information is
 430  * initialized in fpu_subr.c.
 431  */
 432
      /*
       * kmem cache from which each lwp's FPU save area is allocated. As described
       * in the overview above, it is created in fpu_probe() and sized according
       * to the save mechanism in use.
       */
  433 kmem_cache_t *fpsave_cachep;
 434
  435 /*
       * Size in bytes of an xsave area holding AVX state:
       * legacy fxsave layout (512) + xsave header (64) + ymm (256).
       * struct xsave_state is asserted below to be at least this large.
       */
  436 #define AVX_XSAVE_SIZE          (512 + 64 + 256)
 437
  438 /*
  439  * Various sanity checks.
       *
       * In order: struct fxsave_state must be exactly 512 bytes, struct
       * fnsave_state must be exactly 108 bytes, fx_xmm[0] must sit at a 16-byte
       * aligned offset within struct fxsave_state, and struct xsave_state must be
       * at least AVX_XSAVE_SIZE bytes.
  440  */
  441 CTASSERT(sizeof (struct fxsave_state) == 512);
  442 CTASSERT(sizeof (struct fnsave_state) == 108);
  443 CTASSERT((offsetof(struct fxsave_state, fx_xmm[0]) & 0xf) == 0);
  444 CTASSERT(sizeof (struct xsave_state) >= AVX_XSAVE_SIZE);
 445
  446 /*
  447  * Initial kfpu state for SSE/SSE2 used by fpinit()
       *
       * This is a positional initializer, so the entries must stay in the same
       * order as the members of struct fxsave_state. fp_new_lwp() also copies this
       * image into a new lwp's save area when the save mechanism is FP_FXSAVE.
  448  */
  449 const struct fxsave_state sse_initial = {
  450         FPU_CW_INIT,    /* fx_fcw */
  451         0,              /* fx_fsw */
  452         0,              /* fx_fctw */
  453         0,              /* fx_fop */
  454 #if defined(__amd64)
  455         0,              /* fx_rip */
  456         0,              /* fx_rdp */
  457 #else
  458         0,              /* fx_eip */
  459         0,              /* fx_cs */
  460         0,              /* __fx_ign0 */
  461         0,              /* fx_dp */
  462         0,              /* fx_ds */
  463         0,              /* __fx_ign1 */
  464 #endif /* __amd64 */
  465         SSE_MXCSR_INIT  /* fx_mxcsr */
  466         /* rest of structure is zero */
  467 };
 468
  469 /*
  470  * Initial kfpu state for AVX used by fpinit()
       *
       * This is a positional initializer: the nested legacy (fxsave) image comes
       * first, followed by the two defined words of the xsave header.
       * fp_new_lwp() also copies this image into a new lwp's save area when the
       * save mechanism is FP_XSAVE.
  471  */
  472 const struct xsave_state avx_initial = {
  473         /*
  474          * The definition below needs to be identical with sse_initial
  475          * defined above.
  476          */
  477         {
  478                 FPU_CW_INIT,    /* fx_fcw */
  479                 0,              /* fx_fsw */
  480                 0,              /* fx_fctw */
  481                 0,              /* fx_fop */
  482 #if defined(__amd64)
  483                 0,              /* fx_rip */
  484                 0,              /* fx_rdp */
  485 #else
  486                 0,              /* fx_eip */
  487                 0,              /* fx_cs */
  488                 0,              /* __fx_ign0 */
  489                 0,              /* fx_dp */
  490                 0,              /* fx_ds */
  491                 0,              /* __fx_ign1 */
  492 #endif /* __amd64 */
  493                 SSE_MXCSR_INIT  /* fx_mxcsr */
  494                 /* rest of structure is zero */
  495         },
  496         /*
  497          * bit0 = 1 for XSTATE_BV to indicate that legacy fields are valid,
  498          * and CPU should initialize XMM/YMM.
  499          */
  500         1,      /* xs_xstate_bv */
  501         0       /* xs_xcomp_bv */
  502         /* rest of structure is zero */
  503 };
 504
  505 /*
  506  * mxcsr_mask value (possibly reset in fpu_probe); used to avoid
  507  * the #GP exception caused by setting unsupported bits in the
  508  * MXCSR register. Starts out as SSE_MXCSR_MASK_DEFAULT.
  509  */
  510 uint32_t sse_mxcsr_mask = SSE_MXCSR_MASK_DEFAULT;
 511
  512 /*
  513  * Initial kfpu state for x87 used by fpinit()
       *
       * This is a positional initializer, so the entries must stay in the same
       * order as the members of struct fnsave_state.
  514  */
  515 const struct fnsave_state x87_initial = {
  516         FPU_CW_INIT,    /* f_fcw */
  517         0,              /* __f_ign0 */
  518         0,              /* f_fsw */
  519         0,              /* __f_ign1 */
  520         0xffff,         /* f_ftw */
  521         /* rest of structure is zero */
  522 };
 523
  524 /*
  525  * Context save and restore handlers; fp_new_lwp() hands both of them to
  526  * installctx(). They default to fpxsave_ctxt() and fpxrestore_ctxt(). The
  527  * save vector is patched to xsave_ctxt() if we discover we have an
       * XSAVE-capable chip in fpu_probe.
       * NOTE(review): presumably fprestore_ctxt is patched to the matching xsave
       * restore routine at the same time; confirm in fpu_probe().
       */
  528 void (*fpsave_ctxt)(void *) = fpxsave_ctxt;
  529 void (*fprestore_ctxt)(void *) = fpxrestore_ctxt;
 530
  531 /*
  532  * This function pointer is changed to xsaveopt if the CPU is xsaveopt capable.
       * It defaults to xsave, and fp_save() calls through it when the save
       * mechanism is FP_XSAVE, passing the save area and the context's
       * fpu_xsave_mask.
  533  */
  534 void (*xsavep)(struct xsave_state *, uint64_t) = xsave;
 535
      /* Forward declarations; both helpers are static to this file. */
  536 static int fpe_sicode(uint_t);
  537 static int fpe_simd_sicode(uint_t);
 538
  539 /*
  540  * Copy the state of parent lwp's floating point context into the new lwp.
  541  * Invoked for both fork() and lwp_create().
  542  *
  543  * Note that we inherit -only- the control state (e.g. exception masks,
  544  * rounding, precision control, etc.); the FPU registers are otherwise
  545  * reset to their initial state.
       *
       * t is the parent (originating) thread and ct is the new thread. Both are
       * dereferenced through t_lwp, so both must have an lwp. This function also
       * installs itself, along with the save/restore/free handlers, as context
       * operations on ct.
  546  */
  547 static void
  548 fp_new_lwp(kthread_id_t t, kthread_id_t ct)
  549 {
  550         struct fpu_ctx *fp;             /* parent fpu context */
  551         struct fpu_ctx *cfp;            /* new fpu context */
  552         struct fxsave_state *fx, *cfx;
  553         struct xsave_state *cxs;
  554
  555         ASSERT(fp_kind != FP_NO);
  556
  557         fp = &t->t_lwp->lwp_pcb.pcb_fpu;
  558         cfp = &ct->t_lwp->lwp_pcb.pcb_fpu;
  559
  560         /*
  561          * If the parent FPU state is still in the FPU hw then save it;
  562          * conveniently, fp_save() already does this for us nicely.
  563          */
  564         fp_save(fp);
  565
              /*
               * The child starts out enabled. FPU_VALID is set because the image
               * built below in its save area is what holds its current state.
               */
  566         cfp->fpu_flags = FPU_EN | FPU_VALID;
  567         cfp->fpu_regs.kfpu_status = 0;
  568         cfp->fpu_regs.kfpu_xstatus = 0;
  569
  570         /*
  571          * Make sure that the child's FPU is cleaned up and made ready for user
  572          * land.
  573          */
  574         PCB_SET_UPDATE_FPU(&ct->t_lwp->lwp_pcb);
  575
  576         switch (fp_save_mech) {
  577         case FP_FXSAVE:
  578                 fx = fp->fpu_regs.kfpu_u.kfpu_fx;
  579                 cfx = cfp->fpu_regs.kfpu_u.kfpu_fx;
                      /*
                       * Start from the initial SSE image, then inherit only the
                       * parent's MXCSR (with the SSE_MXCSR_EFLAGS bits cleared)
                       * and its fx_fcw control word.
                       */
  580                 bcopy(&sse_initial, cfx, sizeof (*cfx));
  581                 cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS;
  582                 cfx->fx_fcw = fx->fx_fcw;
  583                 break;
  584
  585         case FP_XSAVE:
  586                 cfp->fpu_xsave_mask = fp->fpu_xsave_mask;
  587
  588                 VERIFY(fp->fpu_regs.kfpu_u.kfpu_xs != NULL);
  589
  590                 fx = &fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave;
  591                 cxs = cfp->fpu_regs.kfpu_u.kfpu_xs;
  592                 cfx = &cxs->xs_fxsave;
  593
                      /*
                       * Same inheritance as the fxsave case, applied to the legacy
                       * region of the xsave image. xs_xstate_bv additionally gets
                       * whichever XFEATURE_FP_INITIAL bits are set in the value
                       * read back by get_xcr(XFEATURE_ENABLED_MASK).
                       */
  594                 bcopy(&avx_initial, cxs, sizeof (*cxs));
  595                 cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS;
  596                 cfx->fx_fcw = fx->fx_fcw;
  597                 cxs->xs_xstate_bv |= (get_xcr(XFEATURE_ENABLED_MASK) &
  598                     XFEATURE_FP_INITIAL);
  599                 break;
  600         default:
  601                 panic("Invalid fp_save_mech");
  602                 /*NOTREACHED*/
  603         }
  604
  605         /*
  606          * Install the FPU context operations on the child. fp_new_lwp is
  607          * passed twice, matching the header comment's statement that it is
  608          * invoked for both fork() and lwp_create(), so descendants of the
               * child pick up these operations too.
               *
               * The child was flagged for an FPU update before returning to user
               * land by PCB_SET_UPDATE_FPU() above; the parent is expected to have
               * been flagged by fp_save().
               */
  609
  610         installctx(ct, cfp, fpsave_ctxt, fprestore_ctxt, fp_new_lwp,
  611             fp_new_lwp, NULL, fp_free);
  612 }
 613
  614 /*
  615  * Give up the FPU on behalf of a floating point context. No memory is
       * released here: the context is marked FPU_VALID and, if it is the current
       * thread's context, pending FPU errors are cleared and the FPU is disabled.
       * The isexec argument is not used.
       *
  616  * fp_free can be called in three cases:
  617  * 1) from reaper -> thread_free -> freectx-> fp_free
  618  *      fp context belongs to a thread on deathrow
  619  *      nothing to do,  thread will never be resumed
  620  *      thread calling ctxfree is reaper
  621  *
  622  * 2) from exec -> freectx -> fp_free
  623  *      fp context belongs to the current thread
  624  *      must disable fpu, thread calling ctxfree is curthread
  625  *
  626  * 3) from restorecontext -> setfpregs -> fp_free
  627  *      we have a modified context in the memory (lwp->pcb_fpu)
  628  *      disable fpu and release the fp context for the CPU
  629  *
  630  */
  631 /*ARGSUSED*/
  632 void
  633 fp_free(struct fpu_ctx *fp, int isexec)
  634 {
  635         ASSERT(fp_kind != FP_NO);
  636
              /* Nothing to do if the context is already marked FPU_VALID. */
  637         if (fp->fpu_flags & FPU_VALID)
  638                 return;
  639
  640         kpreempt_disable();
  641         /*
  642          * Mark the context FPU_VALID without saving the hardware state, so
  643          * that fpu_flags keeps tracking the CR0_TS bit.
               * NOTE(review): this comment used to read "do fpsave rather than
               * fpdisable", which does not match the fpdisable() call below;
               * presumably fpdisable() is what sets CR0_TS. Confirm.
  644          */
  645         fp->fpu_flags |= FPU_VALID;
  646         /* If for current thread disable FP to track FPU_VALID */
  647         if (curthread->t_lwp && fp == &curthread->t_lwp->lwp_pcb.pcb_fpu) {
  648                 /* Clear errors if any to prevent frstor from complaining */
  649                 (void) fperr_reset();
  650                 if (fp_kind & __FP_SSE)
  651                         (void) fpxerr_reset();
  652                 fpdisable();
  653         }
  654         kpreempt_enable();
  655 }
 656
 657 /*
 658  * Store the floating point state and disable the floating point unit.
 659  */
 660 void
 661 fp_save(struct fpu_ctx *fp)
 662 {
 663         ASSERT(fp_kind != FP_NO);
 664
 665         kpreempt_disable();
 666         if (!fp || fp->fpu_flags & FPU_VALID) {
 667                 kpreempt_enable();
 668                 return;
 669         }
 670         ASSERT(curthread->t_lwp && fp == &curthread->t_lwp->lwp_pcb.pcb_fpu);
 671
 672         switch (fp_save_mech) {
 673         case FP_FXSAVE:
 674                 fpxsave(fp->fpu_regs.kfpu_u.kfpu_fx);
 675                 break;
 676
 677         case FP_XSAVE:
 678                 xsavep(fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask);
 679                 break;
 680         default:
 681                 panic("Invalid fp_save_mech");
 682                 /*NOTREACHED*/
 683         }
 684
 685         fp->fpu_flags |= FPU_VALID;
 686
 687         /*
 688          * We save the FPU as part of forking, execing, modifications via /proc,
 689          * restorecontext, etc. As such, we need to make sure that we return to
 690          * userland with valid state in the FPU. If we're context switched out
 691          * before we hit sys_rtt_common() we'll end up having restored the FPU
 692          * as part of the context ops operations. The restore logic always makes
 693          * sure that FPU_VALID is set before doing a restore so we don't restore
 694          * it a second time.
 695          */
 696         PCB_SET_UPDATE_FPU(&curthread->t_lwp->lwp_pcb);
 697
 698         kpreempt_enable();
 699 }
 700
 701 /*
 702  * Restore the FPU context for the thread:
 703  * The possibilities are:
 704  *      1. No active FPU context: Load the new context into the FPU hw
 705  *         and enable the FPU.
 706  */
 707 void
 708 fp_restore(struct fpu_ctx *fp)
 709 {
 710         switch (fp_save_mech) {
 711         case FP_FXSAVE:
 712                 fpxrestore(fp->fpu_regs.kfpu_u.kfpu_fx);
 713                 break;
 714
 715         case FP_XSAVE:
 716                 xrestore(fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask);
 717                 break;
 718         default:
 719                 panic("Invalid fp_save_mech");
 720                 /*NOTREACHED*/
 721         }
 722
 723         fp->fpu_flags &= ~FPU_VALID;
 724 }
 725
 726 /*
 727  * Reset the FPU such that it is in a valid state for a new thread that is
 728  * coming out of exec. The FPU will be in a usable state at this point. At this
 729  * point we know that the FPU state has already been allocated and if this
 730  * wasn't an init process, then it will have had fp_free() previously called.
 731  */
 732 void
 733 fp_exec(void)
 734 {
 735         struct fpu_ctx *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
 736
 737         if (fp_save_mech == FP_XSAVE) {
 738                 fp->fpu_xsave_mask = XFEATURE_FP_ALL;
 739         }
 740
 741         /*
 742          * Make sure that we're not preempted in the middle of initializing the
 743          * FPU on CPU.
 744          */
 745         kpreempt_disable();
 746         installctx(curthread, fp, fpsave_ctxt, fprestore_ctxt, fp_new_lwp,
 747             fp_new_lwp, NULL, fp_free);
 748         fpinit();
 749         fp->fpu_flags = FPU_EN;
 750         kpreempt_enable();
 751 }
 752
 753
 754 /*
 755  * Seeds the initial state for the current thread.  The possibilities are:
 756  *      1. Another process has modified the FPU state before we have done any
 757  *         initialization: Load the FPU state from the LWP state.
 758  *      2. The FPU state has not been externally modified:  Load a clean state.
 759  */
 760 void
 761 fp_seed(void)
 762 {
 763         struct fpu_ctx *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
 764
 765         ASSERT(curthread->t_preempt >= 1);
 766         ASSERT((fp->fpu_flags & FPU_EN) == 0);
 767
 768         /*
 769          * Always initialize a new context and initialize the hardware.
 770          */
 771         if (fp_save_mech == FP_XSAVE) {
 772                 fp->fpu_xsave_mask = XFEATURE_FP_ALL;
 773         }
 774
 775         installctx(curthread, fp, fpsave_ctxt, fprestore_ctxt, fp_new_lwp,
 776             fp_new_lwp, NULL, fp_free);
 777         fpinit();
 778
 779         /*
 780          * If FPU_VALID is set, it means someone has modified registers via
 781          * /proc.  In this case, restore the current lwp's state.
 782          */
 783         if (fp->fpu_flags & FPU_VALID)
 784                 fp_restore(fp);
 785
 786         ASSERT((fp->fpu_flags & FPU_VALID) == 0);
 787         fp->fpu_flags = FPU_EN;
 788 }
 789
 790 /*
 791  * When using xsave/xrstor, these three functions are used by the lwp code to
 792  * manage the memory for the xsave area.
 793  */
 794 void
 795 fp_lwp_init(struct _klwp *lwp)
 796 {
 797         struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu;
 798
 799         /*
 800          * We keep a copy of the pointer in lwp_fpu so that we can restore the
 801          * value in forklwp() after we duplicate the parent's LWP state.
 802          */
 803         lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic =
 804             kmem_cache_alloc(fpsave_cachep, KM_SLEEP);
 805
 806         if (fp_save_mech == FP_XSAVE) {
 807                 /*
 808                  *
 809                  * We bzero since the fpinit() code path will only
 810                  * partially initialize the xsave area using avx_inital.
 811                  */
 812                 ASSERT(cpuid_get_xsave_size() >= sizeof (struct xsave_state));
 813                 bzero(fp->fpu_regs.kfpu_u.kfpu_xs, cpuid_get_xsave_size());
 814         }
 815 }
 816
 817 void
 818 fp_lwp_cleanup(struct _klwp *lwp)
 819 {
 820         struct fpu_ctx *fp = &lwp->lwp_pcb.pcb_fpu;
 821
 822         if (fp->fpu_regs.kfpu_u.kfpu_generic != NULL) {
 823                 kmem_cache_free(fpsave_cachep,
 824                     fp->fpu_regs.kfpu_u.kfpu_generic);
 825                 lwp->lwp_fpu = fp->fpu_regs.kfpu_u.kfpu_generic = NULL;
 826         }
 827 }
 828
 829 /*
 830  * Called during the process of forklwp(). The kfpu_u pointer will have been
 831  * overwritten while copying the parent's LWP structure. We have a valid copy
 832  * stashed in the child's lwp_fpu which we use to restore the correct value.
 833  */
 834 void
 835 fp_lwp_dup(struct _klwp *lwp)
 836 {
 837         void *xp = lwp->lwp_fpu;
 838         size_t sz;
 839
 840         switch (fp_save_mech) {
 841         case FP_FXSAVE:
 842                 sz = sizeof (struct fxsave_state);
 843                 break;
 844         case FP_XSAVE:
 845                 sz = cpuid_get_xsave_size();
 846                 break;
 847         default:
 848                 panic("Invalid fp_save_mech");
 849                 /*NOTREACHED*/
 850         }
 851
 852         /* copy the parent's values into the new lwp's struct */
 853         bcopy(lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic, xp, sz);
 854         /* now restore the pointer */
 855         lwp->lwp_pcb.pcb_fpu.fpu_regs.kfpu_u.kfpu_generic = xp;
 856 }
 857
 858 /*
 859  * Handle a processor extension error fault
 860  * Returns non zero for error.
 861  */
 862
 863 /*ARGSUSED*/
 864 int
 865 fpexterrflt(struct regs *rp)
 866 {
 867         uint32_t fpcw, fpsw;
 868         fpu_ctx_t *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
 869
 870         ASSERT(fp_kind != FP_NO);
 871
 872         /*
 873          * Now we can enable the interrupts.
 874          * (NOTE: x87 fp exceptions come thru interrupt gate)
 875          */
 876         sti();
 877
 878         if (!fpu_exists)
 879                 return (FPE_FLTINV);
 880
 881         /*
 882          * Do an unconditional save of the FP state.  If it's dirty (TS=0),
 883          * it'll be saved into the fpu context area passed in (that of the
 884          * current thread).  If it's not dirty (it may not be, due to
 885          * an intervening save due to a context switch between the sti(),
 886          * above and here, then it's safe to just use the stored values in
 887          * the context save area to determine the cause of the fault.
 888          */
 889         fp_save(fp);
 890
 891         /* clear exception flags in saved state, as if by fnclex */
 892         switch (fp_save_mech) {
 893         case FP_FXSAVE:
 894                 fpsw = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw;
 895                 fpcw = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fcw;
 896                 fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw &= ~FPS_SW_EFLAGS;
 897                 break;
 898
 899         case FP_XSAVE:
 900                 fpsw = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw;
 901                 fpcw = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fcw;
 902                 fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw &= ~FPS_SW_EFLAGS;
 903                 /*
 904                  * Always set LEGACY_FP as it may have been cleared by XSAVE
 905                  * instruction
 906                  */
 907                 fp->fpu_regs.kfpu_u.kfpu_xs->xs_xstate_bv |= XFEATURE_LEGACY_FP;
 908                 break;
 909         default:
 910                 panic("Invalid fp_save_mech");
 911                 /*NOTREACHED*/
 912         }
 913
 914         fp->fpu_regs.kfpu_status = fpsw;
 915
 916         if ((fpsw & FPS_ES) == 0)
 917                 return (0);             /* No exception */
 918
 919         /*
 920          * "and" the exception flags with the complement of the mask
 921          * bits to determine which exception occurred
 922          */
 923         return (fpe_sicode(fpsw & ~fpcw & 0x3f));
 924 }
 925
 926 /*
 927  * Handle an SSE/SSE2 precise exception.
 928  * Returns a non-zero sicode for error.
 929  */
 930 /*ARGSUSED*/
 931 int
 932 fpsimderrflt(struct regs *rp)
 933 {
 934         uint32_t mxcsr, xmask;
 935         fpu_ctx_t *fp = &ttolwp(curthread)->lwp_pcb.pcb_fpu;
 936
 937         ASSERT(fp_kind & __FP_SSE);
 938
 939         /*
 940          * NOTE: Interrupts are disabled during execution of this
 941          * function.  They are enabled by the caller in trap.c.
 942          */
 943
 944         /*
 945          * The only way we could have gotten here if there is no FP unit
 946          * is via a user executing an INT $19 instruction, so there is
 947          * no fault in that case.
 948          */
 949         if (!fpu_exists)
 950                 return (0);
 951
 952         /*
 953          * Do an unconditional save of the FP state.  If it's dirty (TS=0),
 954          * it'll be saved into the fpu context area passed in (that of the
 955          * current thread).  If it's not dirty, then it's safe to just use
 956          * the stored values in the context save area to determine the
 957          * cause of the fault.
 958          */
 959         fp_save(fp);            /* save the FPU state */
 960
 961         if (fp_save_mech == FP_XSAVE) {
 962                 mxcsr = fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_mxcsr;
 963                 fp->fpu_regs.kfpu_status =
 964                     fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave.fx_fsw;
 965         } else {
 966                 mxcsr = fp->fpu_regs.kfpu_u.kfpu_fx->fx_mxcsr;
 967                 fp->fpu_regs.kfpu_status = fp->fpu_regs.kfpu_u.kfpu_fx->fx_fsw;
 968         }
 969         fp->fpu_regs.kfpu_xstatus = mxcsr;
 970
 971         /*
 972          * compute the mask that determines which conditions can cause
 973          * a #xm exception, and use this to clean the status bits so that
 974          * we can identify the true cause of this one.
 975          */
 976         xmask = (mxcsr >> 7) & SSE_MXCSR_EFLAGS;
 977         return (fpe_simd_sicode((mxcsr & SSE_MXCSR_EFLAGS) & ~xmask));
 978 }
 979
/*
 * si_code that fpe_sicode() returns for an x87 denormal (FPS_DE) exception.
 *
 * This is a variable rather than a constant on purpose: in the unlikely
 * event that someone is relying on this subcode being FPE_FLTILL for
 * denormalize exceptions, it can always be patched back again to restore
 * the old behaviour.
 */
int fpe_fltden = FPE_FLTDEN;
 986
 987 /*
 988  * Map from the FPU status word to the FP exception si_code.
 989  */
 990 static int
 991 fpe_sicode(uint_t sw)
 992 {
 993         if (sw & FPS_IE)
 994                 return (FPE_FLTINV);
 995         if (sw & FPS_ZE)
 996                 return (FPE_FLTDIV);
 997         if (sw & FPS_DE)
 998                 return (fpe_fltden);
 999         if (sw & FPS_OE)
1000                 return (FPE_FLTOVF);
1001         if (sw & FPS_UE)
1002                 return (FPE_FLTUND);
1003         if (sw & FPS_PE)
1004                 return (FPE_FLTRES);
1005         return (FPE_FLTINV);    /* default si_code for other exceptions */
1006 }
1007
1008 /*
1009  * Map from the SSE status word to the FP exception si_code.
1010  */
1011 static int
1012 fpe_simd_sicode(uint_t sw)
1013 {
1014         if (sw & SSE_IE)
1015                 return (FPE_FLTINV);
1016         if (sw & SSE_ZE)
1017                 return (FPE_FLTDIV);
1018         if (sw & SSE_DE)
1019                 return (FPE_FLTDEN);
1020         if (sw & SSE_OE)
1021                 return (FPE_FLTOVF);
1022         if (sw & SSE_UE)
1023                 return (FPE_FLTUND);
1024         if (sw & SSE_PE)
1025                 return (FPE_FLTRES);
1026         return (FPE_FLTINV);    /* default si_code for other exceptions */
1027 }
1028
1029 /*
1030  * This routine is invoked as part of libc's __fpstart implementation
1031  * via sysi86(2).
1032  *
1033  * It may be called -before- any context has been assigned in which case
1034  * we try and avoid touching the hardware.  Or it may be invoked well
1035  * after the context has been assigned and fiddled with, in which case
1036  * just tweak it directly.
1037  */
1038 void
1039 fpsetcw(uint16_t fcw, uint32_t mxcsr)
1040 {
1041         struct fpu_ctx *fp = &curthread->t_lwp->lwp_pcb.pcb_fpu;
1042         struct fxsave_state *fx;
1043
1044         if (!fpu_exists || fp_kind == FP_NO)
1045                 return;
1046
1047         if ((fp->fpu_flags & FPU_EN) == 0) {
1048                 if (fcw == FPU_CW_INIT && mxcsr == SSE_MXCSR_INIT) {
1049                         /*
1050                          * Common case.  Floating point unit not yet
1051                          * enabled, and kernel already intends to initialize
1052                          * the hardware the way the caller wants.
1053                          */
1054                         return;
1055                 }
1056                 /*
1057                  * Hmm.  Userland wants a different default.
1058                  * Do a fake "first trap" to establish the context, then
1059                  * handle as if we already had a context before we came in.
1060                  */
1061                 kpreempt_disable();
1062                 fp_seed();
1063                 kpreempt_enable();
1064         }
1065
1066         /*
1067          * Ensure that the current hardware state is flushed back to the
1068          * pcb, then modify that copy.  Next use of the fp will
1069          * restore the context.
1070          */
1071         fp_save(fp);
1072
1073         switch (fp_save_mech) {
1074         case FP_FXSAVE:
1075                 fx = fp->fpu_regs.kfpu_u.kfpu_fx;
1076                 fx->fx_fcw = fcw;
1077                 fx->fx_mxcsr = sse_mxcsr_mask & mxcsr;
1078                 break;
1079
1080         case FP_XSAVE:
1081                 fx = &fp->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave;
1082                 fx->fx_fcw = fcw;
1083                 fx->fx_mxcsr = sse_mxcsr_mask & mxcsr;
1084                 /*
1085                  * Always set LEGACY_FP as it may have been cleared by XSAVE
1086                  * instruction
1087                  */
1088                 fp->fpu_regs.kfpu_u.kfpu_xs->xs_xstate_bv |= XFEATURE_LEGACY_FP;
1089                 break;
1090         default:
1091                 panic("Invalid fp_save_mech");
1092                 /*NOTREACHED*/
1093         }
1094 }