docs/jit-regalloc

   1 Register Allocation
   2 ===================
   3
   4 The current JIT implementation uses a tree matcher to generate code. We use a
   5 simple algorithm to allocate registers in trees, and limit the number of used
   6 temporary register to 4 when evaluating trees. So we can use 2 registers for
   7 global register allocation.
   8
   9 Register Allocation for Trees
  10 =============================
  11
  12 We always evaluate trees from left to right. When there are no more registers
  13 available we need to spill values to memory. Here is the simplified algorithm.
  14
  15 gboolean
  16 tree_allocate_regs (tree, exclude_reg)
  17 {
  18         if (!tree_allocate_regs (tree->left, -1))
  19                 return FALSE;
  20
  21         if (!tree_allocate_regs (tree->right, -1)) {
  22
  23                 tree->left->spilled == TRUE;
  24
  25                 free_used_regs (tree->left);
  26
  27                 if (!tree_allocate_regs (tree->right, tree->left->reg))
  28                         return FALSE;
  29         }
  30
  31         free_used_regs (tree->left);
  32         free_used_regs (tree->right);
  33
  34         /* try to allocate a register (reg != exclude_reg) */
  35         if ((tree->reg = next_free_reg (exclude_reg)) != -1)
  36                 return TRUE;
  37
  38         return FALSE;
  39 }
  40
  41 The emit routing actually spills the registers:
  42
  43 tree_emit (tree)
  44 {
  45
  46         tree_emit (tree->left);
  47
  48         if (tree->left->spilled)
  49                 save_reg (tree->left->reg);
  50
  51         tree_emit (tree->right);
  52
  53         if (tree->left->spilled)
  54                 restore_reg (tree->left->reg);
  55
  56
  57         emit_code (tree);
  58 }
  59
  60
  61 Global Register Allocation
  62 ==========================
  63
  64 TODO.
  65
  66 Local Register Allocation
  67 =========================
  68
  69 This section describes the cross-platform local register allocator which is
  70 in the file mini-codegen.c.
  71
  72 The input to the allocator is a basic block which contains linear IL, ie.
  73 instructions of the form:
  74
  75    DEST <- SRC1 OP SRC2
  76
  77 where DEST, SRC1, and SRC2 are virtual registers (vregs). The job of the
  78 allocator is to assign hard or physical registers (hregs) to each virtual
  79 registers so the vreg references in the instructions can be replaced with their
  80 assigned hreg, allowing machine code to be generated later.
  81
  82 The allocator needs information about the number and types of arguments of
  83 instructions. It takes this information from the machine description files. It
  84 also needs arch specific information, like the number and type of the hard
  85 registers. It gets this information from arch-specific macros.
  86
  87 Currently, the vregs and hregs are partitioned into two classes: integer and
  88 floating point.
  89
  90 The allocator consists of two phases: In the first phase, a forward pass is
  91 made over the instructions, collecting liveness information for vregs. In the
  92 second phase, a backward pass is made over the instructions, assigning
  93 registers. This backward mode of operation makes understanding the allocator
  94 somewhat difficult to understand, but leads to better code in most cases.
  95
  96 Allocator state
  97 ===============
  98
  99 The state of the allocator is stored in two arrays: iassign and isymbolic.
 100 iassign maps vregs to hregs, while isymbolic is the opposite.
 101 For a vreg, iassign [vreg] can contain the following values:
 102
 103     * -1                                vreg has no assigned hreg
 104
 105     * hreg index (>= 0)         vreg is assigned to the given hreg. This means
 106                                 later instructions (which we have already
 107                                                                 processed due to the backward direction) expect
 108                                                                 the value of vreg to be found in hreg.
 109
 110         * spill slot index (< -1)       vreg is spilled to the given spill slot. This
 111                                                                 means later instructions expect the value of
 112                                                                 vreg to be found on the stack in the given
 113                                                                 spill slot.
 114
 115 Also, the allocator keeps track of which hregs are free and which are used.
 116 This information is stored in a bitmask called ifree_mask.
 117
 118 There is a similar set of data structures for floating point registers.
 119
 120 Spilling
 121 ========
 122
 123 When an allocator needs a free hreg, but all of them are assigned, it needs to
 124 free up one of them. It does this by spilling the contents of the vreg which
 125 is currently assigned to the selected hreg. Since later instructions expect
 126 the vreg to be found in the selected hreg, the allocator emits a spill-load
 127 instruction to load the value from the spill slot into the hreg after the
 128 currently processed instruction. When the vreg which is spilled is a
 129 destination in an instruction, the allocator will emit a spill-store to store
 130 the value into the spill slot.
 131
 132 Fixed registers
 133 ===============
 134
 135 Some architectures, notably x86/amd64 require that the arguments/results of
 136 some instructions be assigned to specific hregs. An example is the shift
 137 opcodes on x86, where the second argument must be in ECX. The allocator
 138 has support for this. It tries to allocate the vreg to the required hreg. If
 139 thats not possible, then it will emit compensation code which moves values to
 140 the correct registers before/after the instruction.
 141
 142 Fixed registers are mainly used on x86, but they are useful on more regular
 143 architectures on well, for example to model that after a call instruction, the
 144 return of the call is in a specific register.
 145
 146 A special case of fixed registers is two address architectures, like the x86,
 147 where the instructions place their results into their first argument. This is
 148 modelled in the allocator by allocating SRC1 and DEST to the same hreg.
 149
 150 Global registers
 151 ================
 152
 153 Some variables might already be allocated to hardware registers during the
 154 global allocation phase. In this case, SRC1, SRC2 and DEST might already be
 155 a hardware register. The allocator needs to do nothing in this case, except
 156 when the architecture uses fixed registers, in which case it needs to emit
 157 compensation code.
 158
 159 Register pairs
 160 ==============
 161
 162 64 bit arithmetic on 32 bit machines requires instructions whose arguments are
 163 not registers, but register pairs. The allocator has support for this, both
 164 for freely allocatable register pairs, and for register pairs which are
 165 constrained to specific hregs (EDX:EAX on x86).
 166
 167 Floating point stack
 168 ====================
 169
 170 The x86 architecture uses a floating point register stack instead of a set of
 171 fp registers. The allocator supports this by keeping track of the height of the
 172 fp stack, and spilling/loading values from the stack as neccesary.
 173
 174 Calls
 175 =====
 176
 177 Calls need special handling for two reasons: first, they will clobber all
 178 caller-save registers, meaning their contents will need to be spilled. Also,
 179 some architectures pass arguments in registers. The registers used for
 180 passing arguments are usually the same as the ones used for local allocation,
 181 so the allocator needs to handle them specially. This is done as follows:
 182 the MonoInst for the call instruction contains a map mapping vregs which
 183 contain the argument values to hregs where the argument needs to be placed,
 184 like this (on amd64):
 185
 186 R33 -> RDI
 187 R34 -> RSI
 188 ...
 189
 190 When the allocator processes the call instruction, it allocates the vregs
 191 in the map to their associated hregs. So the call instruction is processed as
 192 if having a variable number of arguments which fixed register assignments.
 193
 194 An example:
 195
 196   R33 <- 1
 197   R34 <- 2
 198   call
 199
 200 When the call instruction is processed, R33 is assigned to RDI, and R34 is
 201 assigned to RSI. Later, when the two assignment instructions are processed,
 202 R33 and R34 are already assigned to a hreg, so they are replaced with the
 203 associated hreg leading to the following final code:
 204
 205   RDI <- 1
 206   RSI <- 1
 207   call
 208
 209 Machine description files
 210 =========================
 211
 212 A typical entry in the machine description files looks like this:
 213
 214         shl: dest:i src1:i src2:s clob:1 len:2
 215
 216 The allocator is only interested in the dest,src1,src2 and clob fields.
 217 It understands the following values for the dest, src1, src2 fields:
 218
 219         i -     integer register
 220         f - fp register
 221         b - base register (same as i, but the instruction does not modify the reg)
 222         m - fp register, even if an fp stack is used (no fp stack tracking)
 223
 224 It understands the following values for the clob field:
 225
 226         1 - sreg1 needs to be the same as dreg
 227         c - instruction clobbers the caller-save registers
 228
 229 Beside these values, an architecture can define additional values (like the 's'
 230 in the example). The allocator depends on a set of arch-specific macros to
 231 convert these values to information it needs during allocation.
 232
 233 Arch specific macros
 234 ====================
 235
 236 These macros usually receive a value from the machine description file (like
 237 the 's' in the example). The examples below are for x86.
 238
 239 /*
 240  * A bitmask selecting the caller-save registers (these are used for local
 241  * allocation).
 242  */
 243 #define MONO_ARCH_CALLEE_REGS X86_CALLEE_REGS
 244
 245 /*
 246  * A bitmask selecting the callee-saved registers (these are usually used for
 247  * global allocation).
 248  */
 249 #define MONO_ARCH_CALLEE_SAVED_REGS X86_CALLER_REGS
 250
 251 /* Same for the floating point registers */
 252 #define MONO_ARCH_CALLEE_FREGS 0
 253 #define MONO_ARCH_CALLEE_SAVED_FREGS 0
 254
 255 /* Whenever the target uses a floating point stack */
 256 #define MONO_ARCH_USE_FPSTACK TRUE
 257
 258 /* The size of the floating point stack */
 259 #define MONO_ARCH_FPSTACK_SIZE 6
 260
 261 /*
 262  * Given a descriptor value from the machine description file, return the fixed
 263  * hard reg corresponding to that value.
 264  */
 265 #define MONO_ARCH_INST_FIXED_REG(desc) ((desc == 's') ? X86_ECX : ((desc == 'a') ? X86_EAX : ((desc == 'd') ? X86_EDX : ((desc == 'y') ? X86_EAX : ((desc == 'l') ? X86_EAX : -1)))))
 266
 267 /*
 268  * A bitmask selecting the hregs which can be used for allocating sreg2 for
 269  * a given instruction.
 270  */
 271 #define MONO_ARCH_INST_SREG2_MASK(ins) (((ins [MONO_INST_CLOB] == 'a') || (ins [MONO_INST_CLOB] == 'd')) ? (1 << X86_EDX) : 0)
 272
 273 /*
 274  * Given a descriptor value, return whenever it denotes a register pair.
 275  */
 276 #define MONO_ARCH_INST_IS_REGPAIR(desc) (desc == 'l' || desc == 'L')
 277
 278 /*
 279  * Given a descriptor value, and the first register of a regpair, return a
 280  * bitmask selecting the hregs which can be used for allocating the second
 281  * register of the regpair.
 282  */
 283 #define MONO_ARCH_INST_REGPAIR_REG2(desc,hreg1) (desc == 'l' ? X86_EDX : -1)