yjit_codegen.c
1 // This file is a fragment of the yjit.o compilation unit. See yjit.c.
2 #include "internal.h"
3 #include "gc.h"
4 #include "internal/compile.h"
5 #include "internal/class.h"
6 #include "internal/hash.h"
7 #include "internal/object.h"
8 #include "internal/sanitizers.h"
9 #include "internal/string.h"
10 #include "internal/struct.h"
11 #include "internal/variable.h"
12 #include "internal/re.h"
13 #include "probes.h"
14 #include "probes_helper.h"
15 #include "yjit.h"
16 #include "yjit_iface.h"
17 #include "yjit_core.h"
18 #include "yjit_codegen.h"
19 #include "yjit_asm.h"
21 // Map from YARV opcodes to code generation functions
22 static codegen_fn gen_fns[VM_INSTRUCTION_SIZE] = { NULL };
24 // Map from method entries to code generation functions
25 static st_table *yjit_method_codegen_table = NULL;
27 // Code for exiting back to the interpreter from the leave instruction
28 static void *leave_exit_code;
30 // Code for full logic of returning from C method and exiting to the interpreter
31 static uint32_t outline_full_cfunc_return_pos;
33 // For implementing global code invalidation
34 struct codepage_patch {
35 uint32_t inline_patch_pos;
36 uint32_t outlined_target_pos;
39 typedef rb_darray(struct codepage_patch) patch_array_t;
41 static patch_array_t global_inval_patches = NULL;
43 // Print the current source location for debugging purposes
44 RBIMPL_ATTR_MAYBE_UNUSED()
45 static void
46 jit_print_loc(jitstate_t *jit, const char *msg)
48 char *ptr;
49 long len;
50 VALUE path = rb_iseq_path(jit->iseq);
51 RSTRING_GETMEM(path, ptr, len);
52 fprintf(stderr, "%s %.*s:%u\n", msg, (int)len, ptr, rb_iseq_line_no(jit->iseq, jit->insn_idx));
55 // dump an object for debugging purposes
56 RBIMPL_ATTR_MAYBE_UNUSED()
57 static void
58 jit_obj_info_dump(codeblock_t *cb, x86opnd_t opnd) {
59 push_regs(cb);
60 mov(cb, C_ARG_REGS[0], opnd);
61 call_ptr(cb, REG0, (void *)rb_obj_info_dump);
62 pop_regs(cb);
65 // Get the current instruction's opcode
66 static int
67 jit_get_opcode(jitstate_t *jit)
69 return jit->opcode;
72 // Get the index of the next instruction
73 static uint32_t
74 jit_next_insn_idx(jitstate_t *jit)
76 return jit->insn_idx + insn_len(jit_get_opcode(jit));
79 // Get an instruction argument by index
80 static VALUE
81 jit_get_arg(jitstate_t *jit, size_t arg_idx)
83 RUBY_ASSERT(arg_idx + 1 < (size_t)insn_len(jit_get_opcode(jit)));
84 return *(jit->pc + arg_idx + 1);
87 // Load a VALUE into a register and keep track of the reference if it is on the GC heap.
88 static void
89 jit_mov_gc_ptr(jitstate_t *jit, codeblock_t *cb, x86opnd_t reg, VALUE ptr)
91 RUBY_ASSERT(reg.type == OPND_REG && reg.num_bits == 64);
93 // Load the pointer constant into the specified register
94 mov(cb, reg, const_ptr_opnd((void*)ptr));
96 // The pointer immediate is encoded as the last part of the mov written out
97 uint32_t ptr_offset = cb->write_pos - sizeof(VALUE);
99 if (!SPECIAL_CONST_P(ptr)) {
100 if (!rb_darray_append(&jit->block->gc_object_offsets, ptr_offset)) {
101 rb_bug("allocation failed");
106 // Check if we are compiling the instruction at the stub PC
107 // Meaning we are compiling the instruction that is next to execute
108 static bool
109 jit_at_current_insn(jitstate_t *jit)
111 const VALUE *ec_pc = jit->ec->cfp->pc;
112 return (ec_pc == jit->pc);
115 // Peek at the nth topmost value on the Ruby stack.
116 // Returns the topmost value when n == 0.
117 static VALUE
118 jit_peek_at_stack(jitstate_t *jit, ctx_t *ctx, int n)
120 RUBY_ASSERT(jit_at_current_insn(jit));
122 // Note: this does not account for ctx->sp_offset because
123 // this is only available when hitting a stub, and while
124 // hitting a stub, cfp->sp needs to be up to date in case
125 // codegen functions trigger GC. See :stub-sp-flush:.
126 VALUE *sp = jit->ec->cfp->sp;
128 return *(sp - 1 - n);
131 static VALUE
132 jit_peek_at_self(jitstate_t *jit, ctx_t *ctx)
134 return jit->ec->cfp->self;
137 RBIMPL_ATTR_MAYBE_UNUSED()
138 static VALUE
139 jit_peek_at_local(jitstate_t *jit, ctx_t *ctx, int n)
141 RUBY_ASSERT(jit_at_current_insn(jit));
143 int32_t local_table_size = jit->iseq->body->local_table_size;
144 RUBY_ASSERT(n < (int)jit->iseq->body->local_table_size);
146 const VALUE *ep = jit->ec->cfp->ep;
147 return ep[-VM_ENV_DATA_SIZE - local_table_size + n + 1];
150 // Save the incremented PC on the CFP
151 // This is necessary when callees can raise or allocate
152 static void
153 jit_save_pc(jitstate_t *jit, x86opnd_t scratch_reg)
155 codeblock_t *cb = jit->cb;
156 mov(cb, scratch_reg, const_ptr_opnd(jit->pc + insn_len(jit->opcode)));
157 mov(cb, mem_opnd(64, REG_CFP, offsetof(rb_control_frame_t, pc)), scratch_reg);
160 // Save the current SP on the CFP
161 // This realigns the interpreter SP with the JIT SP
162 // Note: this will change the current value of REG_SP,
163 // which could invalidate memory operands
164 static void
165 jit_save_sp(jitstate_t *jit, ctx_t *ctx)
167 if (ctx->sp_offset != 0) {
168 x86opnd_t stack_pointer = ctx_sp_opnd(ctx, 0);
169 codeblock_t *cb = jit->cb;
170 lea(cb, REG_SP, stack_pointer);
171 mov(cb, member_opnd(REG_CFP, rb_control_frame_t, sp), REG_SP);
172 ctx->sp_offset = 0;
176 // jit_save_pc() + jit_save_sp(). Should be used before calling a routine that
177 // could:
178 // - Perform GC allocation
179 // - Take the VM lock through RB_VM_LOCK_ENTER()
180 // - Perform Ruby method call
181 static void
182 jit_prepare_routine_call(jitstate_t *jit, ctx_t *ctx, x86opnd_t scratch_reg)
184 jit->record_boundary_patch_point = true;
185 jit_save_pc(jit, scratch_reg);
186 jit_save_sp(jit, ctx);
189 // Record the current codeblock write position for rewriting into a jump into
190 // the outlined block later. Used to implement global code invalidation.
191 static void
192 record_global_inval_patch(const codeblock_t *cb, uint32_t outline_block_target_pos)
194 struct codepage_patch patch_point = { cb->write_pos, outline_block_target_pos };
195 if (!rb_darray_append(&global_inval_patches, patch_point)) rb_bug("allocation failed");
198 static bool jit_guard_known_klass(jitstate_t *jit, ctx_t *ctx, VALUE known_klass, insn_opnd_t insn_opnd, VALUE sample_instance, const int max_chain_depth, uint8_t *side_exit);
200 #if YJIT_STATS
202 // Add a comment at the current position in the code block
203 static void
204 _add_comment(codeblock_t *cb, const char *comment_str)
206 // We can't add comments to the outlined code block
207 if (cb == ocb)
208 return;
210 // Avoid adding duplicate comment strings (can happen due to deferred codegen)
211 size_t num_comments = rb_darray_size(yjit_code_comments);
212 if (num_comments > 0) {
213 struct yjit_comment last_comment = rb_darray_get(yjit_code_comments, num_comments - 1);
214 if (last_comment.offset == cb->write_pos && strcmp(last_comment.comment, comment_str) == 0) {
215 return;
219 struct yjit_comment new_comment = (struct yjit_comment){ cb->write_pos, comment_str };
220 rb_darray_append(&yjit_code_comments, new_comment);
223 // Comments for generated machine code
224 #define ADD_COMMENT(cb, comment) _add_comment((cb), (comment))
226 // Verify the ctx's types and mappings against the compile-time stack, self,
227 // and locals.
228 static void
229 verify_ctx(jitstate_t *jit, ctx_t *ctx)
231 // Only able to check types when at current insn
232 RUBY_ASSERT(jit_at_current_insn(jit));
234 VALUE self_val = jit_peek_at_self(jit, ctx);
235 if (type_diff(yjit_type_of_value(self_val), ctx->self_type) == INT_MAX) {
236 rb_bug("verify_ctx: ctx type (%s) incompatible with actual value of self: %s", yjit_type_name(ctx->self_type), rb_obj_info(self_val));
239 for (int i = 0; i < ctx->stack_size && i < MAX_TEMP_TYPES; i++) {
240 temp_type_mapping_t learned = ctx_get_opnd_mapping(ctx, OPND_STACK(i));
241 VALUE val = jit_peek_at_stack(jit, ctx, i);
242 val_type_t detected = yjit_type_of_value(val);
244 if (learned.mapping.kind == TEMP_SELF) {
245 if (self_val != val) {
246 rb_bug("verify_ctx: stack value was mapped to self, but values did not match\n"
247 " stack: %s\n"
248 " self: %s",
249 rb_obj_info(val),
250 rb_obj_info(self_val));
254 if (learned.mapping.kind == TEMP_LOCAL) {
255 int local_idx = learned.mapping.idx;
256 VALUE local_val = jit_peek_at_local(jit, ctx, local_idx);
257 if (local_val != val) {
258 rb_bug("verify_ctx: stack value was mapped to local, but values did not match\n"
259 " stack: %s\n"
260 " local %i: %s",
261 rb_obj_info(val),
262 local_idx,
263 rb_obj_info(local_val));
267 if (type_diff(detected, learned.type) == INT_MAX) {
268 rb_bug("verify_ctx: ctx type (%s) incompatible with actual value on stack: %s", yjit_type_name(learned.type), rb_obj_info(val));
272 int32_t local_table_size = jit->iseq->body->local_table_size;
273 for (int i = 0; i < local_table_size && i < MAX_TEMP_TYPES; i++) {
274 val_type_t learned = ctx->local_types[i];
275 VALUE val = jit_peek_at_local(jit, ctx, i);
276 val_type_t detected = yjit_type_of_value(val);
278 if (type_diff(detected, learned) == INT_MAX) {
279 rb_bug("verify_ctx: ctx type (%s) incompatible with actual value of local: %s", yjit_type_name(learned), rb_obj_info(val));
284 #else
286 #define ADD_COMMENT(cb, comment) ((void)0)
287 #define verify_ctx(jit, ctx) ((void)0)
289 #endif // if YJIT_STATS
291 #if YJIT_STATS
293 // Increment a profiling counter with counter_name
294 #define GEN_COUNTER_INC(cb, counter_name) _gen_counter_inc(cb, &(yjit_runtime_counters . counter_name))
295 static void
296 _gen_counter_inc(codeblock_t *cb, int64_t *counter)
298 if (!rb_yjit_opts.gen_stats) return;
300 // Use REG1 because there might be a return value in REG0
301 mov(cb, REG1, const_ptr_opnd(counter));
302 cb_write_lock_prefix(cb); // for ractors.
303 add(cb, mem_opnd(64, REG1, 0), imm_opnd(1));
306 // Increment a counter then take an existing side exit.
307 #define COUNTED_EXIT(jit, side_exit, counter_name) _counted_side_exit(jit, side_exit, &(yjit_runtime_counters . counter_name))
308 static uint8_t *
309 _counted_side_exit(jitstate_t* jit, uint8_t *existing_side_exit, int64_t *counter)
311 if (!rb_yjit_opts.gen_stats) return existing_side_exit;
313 uint8_t *start = cb_get_ptr(jit->ocb, jit->ocb->write_pos);
314 _gen_counter_inc(jit->ocb, counter);
315 jmp_ptr(jit->ocb, existing_side_exit);
316 return start;
319 #else
321 #define GEN_COUNTER_INC(cb, counter_name) ((void)0)
322 #define COUNTED_EXIT(jit, side_exit, counter_name) side_exit
324 #endif // if YJIT_STATS
326 // Generate an exit to return to the interpreter
327 static uint32_t
328 yjit_gen_exit(VALUE *exit_pc, ctx_t *ctx, codeblock_t *cb)
330 const uint32_t code_pos = cb->write_pos;
332 ADD_COMMENT(cb, "exit to interpreter");
334 // Generate the code to exit to the interpreter
335 // Write the adjusted SP back into the CFP
336 if (ctx->sp_offset != 0) {
337 x86opnd_t stack_pointer = ctx_sp_opnd(ctx, 0);
338 lea(cb, REG_SP, stack_pointer);
339 mov(cb, member_opnd(REG_CFP, rb_control_frame_t, sp), REG_SP);
342 // Update CFP->PC
343 mov(cb, RAX, const_ptr_opnd(exit_pc));
344 mov(cb, member_opnd(REG_CFP, rb_control_frame_t, pc), RAX);
346 // Accumulate stats about interpreter exits
347 #if YJIT_STATS
348 if (rb_yjit_opts.gen_stats) {
349 mov(cb, RDI, const_ptr_opnd(exit_pc));
350 call_ptr(cb, RSI, (void *)&yjit_count_side_exit_op);
352 #endif
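// Restore the registers saved in yjit_entry_prologue() and return Qundef so the interpreter resumes from the PC written above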
354 pop(cb, REG_SP);
355 pop(cb, REG_EC);
356 pop(cb, REG_CFP);
358 mov(cb, RAX, imm_opnd(Qundef));
359 ret(cb);
361 return code_pos;
364 // Generate a continuation for gen_leave() that exits to the interpreter at REG_CFP->pc.
365 static uint8_t *
366 yjit_gen_leave_exit(codeblock_t *cb)
368 uint8_t *code_ptr = cb_get_ptr(cb, cb->write_pos);
370 // Note, gen_leave() fully reconstructs interpreter state and leaves the
371 // return value in RAX before coming here.
373 // Every exit to the interpreter should be counted
374 GEN_COUNTER_INC(cb, leave_interp_return);
376 pop(cb, REG_SP);
377 pop(cb, REG_EC);
378 pop(cb, REG_CFP);
380 ret(cb);
382 return code_ptr;
385 // Fill code_for_exit_from_stub. This is used by branch_stub_hit() to exit
386 // to the interpreter when it cannot service a stub by generating new code.
387 // Before coming here, branch_stub_hit() takes care of fully reconstructing
388 // interpreter state.
389 static void
390 gen_code_for_exit_from_stub(void)
392 codeblock_t *cb = ocb;
393 code_for_exit_from_stub = cb_get_ptr(cb, cb->write_pos);
395 GEN_COUNTER_INC(cb, exit_from_branch_stub);
397 pop(cb, REG_SP);
398 pop(cb, REG_EC);
399 pop(cb, REG_CFP);
401 mov(cb, RAX, imm_opnd(Qundef));
402 ret(cb);
405 // :side-exit:
406 // Get an exit for the current instruction in the outlined block. The code
407 // for each instruction often begins with several guards before proceeding
408 // to do work. When guards fail, an option we have is to exit to the
409 // interpreter at an instruction boundary. The piece of code that takes
410 // care of reconstructing interpreter state and exiting out of generated
411 // code is called the side exit.
413 // No guards change the logic for reconstructing interpreter state at the
414 // moment, so there is one unique side exit for each context. Note that
415 // it's incorrect to jump to the side exit after any ctx stack push/pop operations
416 // since they change the logic required for reconstructing interpreter state.
417 static uint8_t *
418 yjit_side_exit(jitstate_t *jit, ctx_t *ctx)
420 if (!jit->side_exit_for_pc) {
421 codeblock_t *ocb = jit->ocb;
422 uint32_t pos = yjit_gen_exit(jit->pc, ctx, ocb);
423 jit->side_exit_for_pc = cb_get_ptr(ocb, pos);
426 return jit->side_exit_for_pc;
429 // Ensure that there is an exit for the start of the block being compiled.
430 // Block invalidation uses this exit.
431 static void
432 jit_ensure_block_entry_exit(jitstate_t *jit)
434 block_t *block = jit->block;
435 if (block->entry_exit) return;
437 if (jit->insn_idx == block->blockid.idx) {
438 // We are compiling the first instruction in the block.
439 // Generate the exit with the cache in jitstate.
440 block->entry_exit = yjit_side_exit(jit, &block->ctx);
442 else {
443 VALUE *pc = yjit_iseq_pc_at_idx(block->blockid.iseq, block->blockid.idx);
444 uint32_t pos = yjit_gen_exit(pc, &block->ctx, ocb);
445 block->entry_exit = cb_get_ptr(ocb, pos);
449 // Generate a runtime guard that ensures the PC is at the start of the iseq,
450 // otherwise take a side exit. This is to handle the situation of optional
451 // parameters. When a function with optional parameters is called, the entry
452 // PC for the method isn't necessarily 0, but we always generate code that
453 // assumes the entry point is 0.
454 static void
455 yjit_pc_guard(codeblock_t *cb, const rb_iseq_t *iseq)
457 RUBY_ASSERT(cb != NULL);
459 mov(cb, REG0, member_opnd(REG_CFP, rb_control_frame_t, pc));
460 mov(cb, REG1, const_ptr_opnd(iseq->body->iseq_encoded));
461 xor(cb, REG0, REG1);
463 // xor sets ZF when the operands are equal, so we can jz here
464 uint32_t pc_is_zero = cb_new_label(cb, "pc_is_zero");
465 jz_label(cb, pc_is_zero);
467 // We're not starting at the first PC, so we need to exit.
468 GEN_COUNTER_INC(cb, leave_start_pc_non_zero);
470 pop(cb, REG_SP);
471 pop(cb, REG_EC);
472 pop(cb, REG_CFP);
474 mov(cb, RAX, imm_opnd(Qundef));
475 ret(cb);
477 // PC should be at the beginning
478 cb_write_label(cb, pc_is_zero);
479 cb_link_labels(cb);
482 // The code we generate in gen_send_cfunc() doesn't fire the c_return TracePoint event
483 // like the interpreter. When tracing for c_return is enabled, we patch the code after
484 // the C method return to call into this to fire the event.
485 static void
486 full_cfunc_return(rb_execution_context_t *ec, VALUE return_value)
488 rb_control_frame_t *cfp = ec->cfp;
489 RUBY_ASSERT_ALWAYS(cfp == GET_EC()->cfp);
490 const rb_callable_method_entry_t *me = rb_vm_frame_method_entry(cfp);
492 RUBY_ASSERT_ALWAYS(RUBYVM_CFUNC_FRAME_P(cfp));
493 RUBY_ASSERT_ALWAYS(me->def->type == VM_METHOD_TYPE_CFUNC);
495 // CHECK_CFP_CONSISTENCY("full_cfunc_return"); TODO revive this
497 // Pop the C func's frame and fire the c_return TracePoint event
498 // Note that this is the same order as vm_call_cfunc_with_frame().
499 rb_vm_pop_frame(ec);
500 EXEC_EVENT_HOOK(ec, RUBY_EVENT_C_RETURN, cfp->self, me->def->original_id, me->called_id, me->owner, return_value);
501 // Note, this deviates from the interpreter in that users need to enable
502 // a c_return TracePoint for this DTrace hook to work. A reasonable change
503 // since the Ruby return event works this way as well.
504 RUBY_DTRACE_CMETHOD_RETURN_HOOK(ec, me->owner, me->def->original_id);
506 // Push return value into the caller's stack. We know that it's a frame that
507 // uses cfp->sp because we are patching a call done with gen_send_cfunc().
508 ec->cfp->sp[0] = return_value;
509 ec->cfp->sp++;
512 // Landing code for when c_return tracing is enabled. See full_cfunc_return().
513 static void
514 gen_full_cfunc_return(void)
516 codeblock_t *cb = ocb;
517 outline_full_cfunc_return_pos = ocb->write_pos;
519 // This chunk of code expects REG_EC to be filled properly and
520 // RAX to contain the return value of the C method.
522 // Call full_cfunc_return()
523 mov(cb, C_ARG_REGS[0], REG_EC);
524 mov(cb, C_ARG_REGS[1], RAX);
525 call_ptr(cb, REG0, (void *)full_cfunc_return);
527 // Count the exit
528 GEN_COUNTER_INC(cb, traced_cfunc_return);
530 // Return to the interpreter
531 pop(cb, REG_SP);
532 pop(cb, REG_EC);
533 pop(cb, REG_CFP);
535 mov(cb, RAX, imm_opnd(Qundef));
536 ret(cb);
540 // Compile an interpreter entry block to be inserted into an iseq
541 // Returns `NULL` if compilation fails.
543 static uint8_t *
544 yjit_entry_prologue(codeblock_t *cb, const rb_iseq_t *iseq)
546 RUBY_ASSERT(cb != NULL);
548 enum { MAX_PROLOGUE_SIZE = 1024 };
550 // Check if we have enough executable memory
551 if (cb->write_pos + MAX_PROLOGUE_SIZE >= cb->mem_size) {
552 return NULL;
555 const uint32_t old_write_pos = cb->write_pos;
557 // Align the current write position to cache line boundaries
558 cb_align_pos(cb, 64);
560 uint8_t *code_ptr = cb_get_ptr(cb, cb->write_pos);
561 ADD_COMMENT(cb, "yjit entry");
563 push(cb, REG_CFP);
564 push(cb, REG_EC);
565 push(cb, REG_SP);
567 // We are passed EC and CFP
568 mov(cb, REG_EC, C_ARG_REGS[0]);
569 mov(cb, REG_CFP, C_ARG_REGS[1]);
571 // Load the current SP from the CFP into REG_SP
572 mov(cb, REG_SP, member_opnd(REG_CFP, rb_control_frame_t, sp));
574 // Setup cfp->jit_return
575 // TODO: this could use an IP relative LEA instead of an 8 byte immediate
576 mov(cb, REG0, const_ptr_opnd(leave_exit_code));
577 mov(cb, member_opnd(REG_CFP, rb_control_frame_t, jit_return), REG0);
579 // We're compiling iseqs that we *expect* to start at `insn_idx`. But in
580 // the case of optional parameters, the interpreter can set the pc to a
581 // different location depending on the optional parameters. If an iseq
582 // has optional parameters, we'll add a runtime check that the PC we've
583 // compiled for is the same PC that the interpreter wants us to run with.
584 // If they don't match, then we'll take a side exit.
585 if (iseq->body->param.flags.has_opt) {
586 yjit_pc_guard(cb, iseq);
589 // Verify MAX_PROLOGUE_SIZE
590 RUBY_ASSERT_ALWAYS(cb->write_pos - old_write_pos <= MAX_PROLOGUE_SIZE);
592 return code_ptr;
595 // Generate code to check for interrupts and take a side-exit.
596 // Warning: this function clobbers REG0
597 static void
598 yjit_check_ints(codeblock_t *cb, uint8_t *side_exit)
600 // Check for interrupts
601 // see RUBY_VM_CHECK_INTS(ec) macro
602 ADD_COMMENT(cb, "RUBY_VM_CHECK_INTS(ec)");
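// Take the side exit when interrupt_flag & ~interrupt_mask is non-zero (a pending, unmasked interrupt)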
603 mov(cb, REG0_32, member_opnd(REG_EC, rb_execution_context_t, interrupt_mask));
604 not(cb, REG0_32);
605 test(cb, member_opnd(REG_EC, rb_execution_context_t, interrupt_flag), REG0_32);
606 jnz_ptr(cb, side_exit);
609 // Generate a stubbed unconditional jump to the next bytecode instruction.
610 // Blocks that are part of a guard chain can use this to share the same successor.
611 static void
612 jit_jump_to_next_insn(jitstate_t *jit, const ctx_t *current_context)
614 // Reset the depth since in current usages we only ever jump to
615 // chain_depth > 0 from the same instruction.
616 ctx_t reset_depth = *current_context;
617 reset_depth.chain_depth = 0;
619 blockid_t jump_block = { jit->iseq, jit_next_insn_idx(jit) };
621 // We are at the end of the current instruction. Record the boundary.
622 if (jit->record_boundary_patch_point) {
623 uint32_t exit_pos = yjit_gen_exit(jit->pc + insn_len(jit->opcode), &reset_depth, jit->ocb);
624 record_global_inval_patch(jit->cb, exit_pos);
625 jit->record_boundary_patch_point = false;
628 // Generate the jump instruction
629 gen_direct_jump(
630 jit,
631 &reset_depth,
632 jump_block
636 // Compile a sequence of bytecode instructions for a given basic block version.
637 // Part of gen_block_version().
638 static block_t *
639 gen_single_block(blockid_t blockid, const ctx_t *start_ctx, rb_execution_context_t *ec)
641 RUBY_ASSERT(cb != NULL);
642 verify_blockid(blockid);
644 // Allocate the new block
645 block_t *block = calloc(1, sizeof(block_t));
646 if (!block) {
647 return NULL;
650 // Copy the starting context to avoid mutating it
651 ctx_t ctx_copy = *start_ctx;
652 ctx_t *ctx = &ctx_copy;
654 // Limit the number of specialized versions for this block
655 *ctx = limit_block_versions(blockid, ctx);
657 // Save the starting context on the block.
658 block->blockid = blockid;
659 block->ctx = *ctx;
661 RUBY_ASSERT(!(blockid.idx == 0 && start_ctx->stack_size > 0));
663 const rb_iseq_t *iseq = block->blockid.iseq;
664 const unsigned int iseq_size = iseq->body->iseq_size;
665 uint32_t insn_idx = block->blockid.idx;
666 const uint32_t starting_insn_idx = insn_idx;
668 // Initialize a JIT state object
669 jitstate_t jit = {
670 .cb = cb,
671 .ocb = ocb,
672 .block = block,
673 .iseq = iseq,
674 .ec = ec
677 // Mark the start position of the block
678 block->start_addr = cb_get_write_ptr(cb);
680 // For each instruction to compile
681 while (insn_idx < iseq_size) {
682 // Get the current pc and opcode
683 VALUE *pc = yjit_iseq_pc_at_idx(iseq, insn_idx);
684 int opcode = yjit_opcode_at_pc(iseq, pc);
685 RUBY_ASSERT(opcode >= 0 && opcode < VM_INSTRUCTION_SIZE);
687 // opt_getinlinecache wants to be in a block all on its own. Cut the block short
688 // if we run into it. See gen_opt_getinlinecache() for details.
689 if (opcode == BIN(opt_getinlinecache) && insn_idx > starting_insn_idx) {
690 jit_jump_to_next_insn(&jit, ctx);
691 break;
694 // Set the current instruction
695 jit.insn_idx = insn_idx;
696 jit.opcode = opcode;
697 jit.pc = pc;
698 jit.side_exit_for_pc = NULL;
700 // If the previous instruction requested to record the boundary
701 if (jit.record_boundary_patch_point) {
702 // Generate an exit to this instruction and record it
703 uint32_t exit_pos = yjit_gen_exit(jit.pc, ctx, ocb);
704 record_global_inval_patch(cb, exit_pos);
705 jit.record_boundary_patch_point = false;
708 // Verify our existing assumption (DEBUG)
709 if (jit_at_current_insn(&jit)) {
710 verify_ctx(&jit, ctx);
713 // Lookup the codegen function for this instruction
714 codegen_fn gen_fn = gen_fns[opcode];
715 codegen_status_t status = YJIT_CANT_COMPILE;
716 if (gen_fn) {
717 if (0) {
718 fprintf(stderr, "compiling %d: %s\n", insn_idx, insn_name(opcode));
719 print_str(cb, insn_name(opcode));
722 // :count-placement:
723 // Count bytecode instructions that execute in generated code.
724 // Note that the increment happens even when the output takes a side exit.
725 GEN_COUNTER_INC(cb, exec_instruction);
727 // Add a comment for the name of the YARV instruction
728 ADD_COMMENT(cb, insn_name(opcode));
730 // Call the code generation function
731 status = gen_fn(&jit, ctx, cb);
734 // If we can't compile this instruction
735 // exit to the interpreter and stop compiling
736 if (status == YJIT_CANT_COMPILE) {
737 // TODO: if the codegen function makes changes to ctx and then returns YJIT_CANT_COMPILE,
738 // the exit this generates would be wrong. We could save a copy of the entry context
739 // and assert that ctx is the same here.
740 uint32_t exit_off = yjit_gen_exit(jit.pc, ctx, cb);
742 // If this is the first instruction in the block, then we can use
743 // the exit for block->entry_exit.
744 if (insn_idx == block->blockid.idx) {
745 block->entry_exit = cb_get_ptr(cb, exit_off);
747 break;
750 // For now, reset the chain depth after each instruction as only the
751 // first instruction in the block can concern itself with the depth.
752 ctx->chain_depth = 0;
754 // Move to the next instruction to compile
755 insn_idx += insn_len(opcode);
757 // If the instruction terminates this block
758 if (status == YJIT_END_BLOCK) {
759 break;
763 // Mark the end position of the block
764 block->end_addr = cb_get_write_ptr(cb);
766 // Store the index of the last instruction in the block
767 block->end_idx = insn_idx;
769 // We currently can't handle cases where the request is for a block that
770 // doesn't go to the next instruction.
771 RUBY_ASSERT(!jit.record_boundary_patch_point);
773 // If code for the block doesn't fit, free the block and fail.
774 if (cb->dropped_bytes || ocb->dropped_bytes) {
775 yjit_free_block(block);
776 return NULL;
779 if (YJIT_DUMP_MODE >= 2) {
780 // Dump the list of compiled instructions
781 fprintf(stderr, "Compiled the following for iseq=%p:\n", (void *)iseq);
782 for (uint32_t idx = block->blockid.idx; idx < insn_idx; ) {
783 int opcode = yjit_opcode_at_pc(iseq, yjit_iseq_pc_at_idx(iseq, idx));
784 fprintf(stderr, " %04d %s\n", idx, insn_name(opcode));
785 idx += insn_len(opcode);
789 return block;
792 static codegen_status_t gen_opt_send_without_block(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb);
794 static codegen_status_t
795 gen_nop(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
797 // Do nothing
798 return YJIT_KEEP_COMPILING;
801 static codegen_status_t
802 gen_dup(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
804 // Get the top value and its type
805 x86opnd_t dup_val = ctx_stack_pop(ctx, 0);
806 temp_type_mapping_t mapping = ctx_get_opnd_mapping(ctx, OPND_STACK(0));
808 // Push the same value on top
809 x86opnd_t loc0 = ctx_stack_push_mapping(ctx, mapping);
810 mov(cb, REG0, dup_val);
811 mov(cb, loc0, REG0);
813 return YJIT_KEEP_COMPILING;
816 // duplicate stack top n elements
817 static codegen_status_t
818 gen_dupn(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
820 rb_num_t n = (rb_num_t)jit_get_arg(jit, 0);
822 // In practice, this seems to be used only for n == 2
823 if (n != 2) {
824 return YJIT_CANT_COMPILE;
827 x86opnd_t opnd1 = ctx_stack_opnd(ctx, 1);
828 x86opnd_t opnd0 = ctx_stack_opnd(ctx, 0);
829 temp_type_mapping_t mapping1 = ctx_get_opnd_mapping(ctx, OPND_STACK(1));
830 temp_type_mapping_t mapping0 = ctx_get_opnd_mapping(ctx, OPND_STACK(0));
832 x86opnd_t dst1 = ctx_stack_push_mapping(ctx, mapping1);
833 mov(cb, REG0, opnd1);
834 mov(cb, dst1, REG0);
836 x86opnd_t dst0 = ctx_stack_push_mapping(ctx, mapping0);
837 mov(cb, REG0, opnd0);
838 mov(cb, dst0, REG0);
840 return YJIT_KEEP_COMPILING;
843 static void
844 stack_swap(ctx_t *ctx, codeblock_t *cb, int offset0, int offset1, x86opnd_t reg0, x86opnd_t reg1)
846 x86opnd_t opnd0 = ctx_stack_opnd(ctx, offset0);
847 x86opnd_t opnd1 = ctx_stack_opnd(ctx, offset1);
849 temp_type_mapping_t mapping0 = ctx_get_opnd_mapping(ctx, OPND_STACK(offset0));
850 temp_type_mapping_t mapping1 = ctx_get_opnd_mapping(ctx, OPND_STACK(offset1));
852 mov(cb, reg0, opnd0);
853 mov(cb, reg1, opnd1);
854 mov(cb, opnd0, reg1);
855 mov(cb, opnd1, reg0);
857 ctx_set_opnd_mapping(ctx, OPND_STACK(offset0), mapping1);
858 ctx_set_opnd_mapping(ctx, OPND_STACK(offset1), mapping0);
861 // Swap top 2 stack entries
862 static codegen_status_t
863 gen_swap(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
865 stack_swap(ctx, cb, 0, 1, REG0, REG1);
866 return YJIT_KEEP_COMPILING;
869 // set Nth stack entry to stack top
870 static codegen_status_t
871 gen_setn(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
873 rb_num_t n = (rb_num_t)jit_get_arg(jit, 0);
875 // Set the destination
876 x86opnd_t top_val = ctx_stack_pop(ctx, 0);
877 x86opnd_t dst_opnd = ctx_stack_opnd(ctx, (int32_t)n);
878 mov(cb, REG0, top_val);
879 mov(cb, dst_opnd, REG0);
881 temp_type_mapping_t mapping = ctx_get_opnd_mapping(ctx, OPND_STACK(0));
882 ctx_set_opnd_mapping(ctx, OPND_STACK(n), mapping);
884 return YJIT_KEEP_COMPILING;
887 // get nth stack value, then push it
888 static codegen_status_t
889 gen_topn(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
891 int32_t n = (int32_t)jit_get_arg(jit, 0);
893 // Get top n type / operand
894 x86opnd_t top_n_val = ctx_stack_opnd(ctx, n);
895 temp_type_mapping_t mapping = ctx_get_opnd_mapping(ctx, OPND_STACK(n));
897 x86opnd_t loc0 = ctx_stack_push_mapping(ctx, mapping);
898 mov(cb, REG0, top_n_val);
899 mov(cb, loc0, REG0);
901 return YJIT_KEEP_COMPILING;
904 static codegen_status_t
905 gen_pop(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
907 // Decrement SP
908 ctx_stack_pop(ctx, 1);
909 return YJIT_KEEP_COMPILING;
912 // Pop n values off the stack
913 static codegen_status_t
914 gen_adjuststack(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
916 rb_num_t n = (rb_num_t)jit_get_arg(jit, 0);
917 ctx_stack_pop(ctx, n);
918 return YJIT_KEEP_COMPILING;
921 // new array initialized from top N values
922 static codegen_status_t
923 gen_newarray(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
925 rb_num_t n = (rb_num_t)jit_get_arg(jit, 0);
927 // Save the PC and SP because we are allocating
928 jit_prepare_routine_call(jit, ctx, REG0);
930 x86opnd_t values_ptr = ctx_sp_opnd(ctx, -(sizeof(VALUE) * (uint32_t)n));
932 // call rb_ec_ary_new_from_values(struct rb_execution_context_struct *ec, long n, const VALUE *elts);
933 mov(cb, C_ARG_REGS[0], REG_EC);
934 mov(cb, C_ARG_REGS[1], imm_opnd(n));
935 lea(cb, C_ARG_REGS[2], values_ptr);
936 call_ptr(cb, REG0, (void *)rb_ec_ary_new_from_values);
938 ctx_stack_pop(ctx, n);
939 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_ARRAY);
940 mov(cb, stack_ret, RAX);
942 return YJIT_KEEP_COMPILING;
945 // dup array
946 static codegen_status_t
947 gen_duparray(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
949 VALUE ary = jit_get_arg(jit, 0);
951 // Save the PC and SP because we are allocating
952 jit_prepare_routine_call(jit, ctx, REG0);
954 // call rb_ary_resurrect(VALUE ary);
955 jit_mov_gc_ptr(jit, cb, C_ARG_REGS[0], ary);
956 call_ptr(cb, REG0, (void *)rb_ary_resurrect);
958 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_ARRAY);
959 mov(cb, stack_ret, RAX);
961 return YJIT_KEEP_COMPILING;
964 // dup hash
965 static codegen_status_t
966 gen_duphash(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
968 VALUE hash = jit_get_arg(jit, 0);
970 // Save the PC and SP because we are allocating
971 jit_prepare_routine_call(jit, ctx, REG0);
973 // call rb_hash_resurrect(VALUE hash);
974 jit_mov_gc_ptr(jit, cb, C_ARG_REGS[0], hash);
975 call_ptr(cb, REG0, (void *)rb_hash_resurrect);
977 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_HASH);
978 mov(cb, stack_ret, RAX);
980 return YJIT_KEEP_COMPILING;
983 VALUE rb_vm_splat_array(VALUE flag, VALUE ary);
985 // call to_a on the array on the stack
986 static codegen_status_t
987 gen_splatarray(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
989 VALUE flag = (VALUE) jit_get_arg(jit, 0);
991 // Save the PC and SP because the callee may allocate
992 // Note that this modifies REG_SP, which is why we do it first
993 jit_prepare_routine_call(jit, ctx, REG0);
995 // Get the operands from the stack
996 x86opnd_t ary_opnd = ctx_stack_pop(ctx, 1);
998 // Call rb_vm_splat_array(flag, ary)
999 jit_mov_gc_ptr(jit, cb, C_ARG_REGS[0], flag);
1000 mov(cb, C_ARG_REGS[1], ary_opnd);
1001 call_ptr(cb, REG1, (void *) rb_vm_splat_array);
1003 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_ARRAY);
1004 mov(cb, stack_ret, RAX);
1006 return YJIT_KEEP_COMPILING;
1009 // new range initialized from top 2 values
1010 static codegen_status_t
1011 gen_newrange(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1013 rb_num_t flag = (rb_num_t)jit_get_arg(jit, 0);
1015 // rb_range_new() allocates and can raise
1016 jit_prepare_routine_call(jit, ctx, REG0);
1018 // val = rb_range_new(low, high, (int)flag);
1019 mov(cb, C_ARG_REGS[0], ctx_stack_opnd(ctx, 1));
1020 mov(cb, C_ARG_REGS[1], ctx_stack_opnd(ctx, 0));
1021 mov(cb, C_ARG_REGS[2], imm_opnd(flag));
1022 call_ptr(cb, REG0, (void *)rb_range_new);
1024 ctx_stack_pop(ctx, 2);
1025 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_HEAP);
1026 mov(cb, stack_ret, RAX);
1028 return YJIT_KEEP_COMPILING;
1031 static void
1032 guard_object_is_heap(codeblock_t *cb, x86opnd_t object_opnd, ctx_t *ctx, uint8_t *side_exit)
1034 ADD_COMMENT(cb, "guard object is heap");
1036 // Test that the object is not an immediate
1037 test(cb, object_opnd, imm_opnd(RUBY_IMMEDIATE_MASK));
1038 jnz_ptr(cb, side_exit);
1040 // Test that the object is not false or nil
1041 cmp(cb, object_opnd, imm_opnd(Qnil));
1042 RUBY_ASSERT(Qfalse < Qnil);
1043 jbe_ptr(cb, side_exit);
1046 static inline void
1047 guard_object_is_array(codeblock_t *cb, x86opnd_t object_opnd, x86opnd_t flags_opnd, ctx_t *ctx, uint8_t *side_exit)
1049 ADD_COMMENT(cb, "guard object is array");
1051 // Pull out the type mask
1052 mov(cb, flags_opnd, member_opnd(object_opnd, struct RBasic, flags));
1053 and(cb, flags_opnd, imm_opnd(RUBY_T_MASK));
1055 // Compare the result with T_ARRAY
1056 cmp(cb, flags_opnd, imm_opnd(T_ARRAY));
1057 jne_ptr(cb, side_exit);
1060 // push enough nils onto the stack to fill out an array
1061 static codegen_status_t
1062 gen_expandarray(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1064 int flag = (int) jit_get_arg(jit, 1);
1066 // If this instruction has the splat flag, then bail out.
1067 if (flag & 0x01) {
1068 GEN_COUNTER_INC(cb, expandarray_splat);
1069 return YJIT_CANT_COMPILE;
1072 // If this instruction has the postarg flag, then bail out.
1073 if (flag & 0x02) {
1074 GEN_COUNTER_INC(cb, expandarray_postarg);
1075 return YJIT_CANT_COMPILE;
1078 uint8_t *side_exit = yjit_side_exit(jit, ctx);
1080 // num is the number of requested values. If there aren't enough in the
1081 // array then we're going to push on nils.
1082 int num = (int)jit_get_arg(jit, 0);
1083 val_type_t array_type = ctx_get_opnd_type(ctx, OPND_STACK(0));
1084 x86opnd_t array_opnd = ctx_stack_pop(ctx, 1);
1086 if (array_type.type == ETYPE_NIL) {
1087 // special case for a, b = nil pattern
1088 // push N nils onto the stack
1089 for (int i = 0; i < num; i++) {
1090 x86opnd_t push = ctx_stack_push(ctx, TYPE_NIL);
1091 mov(cb, push, imm_opnd(Qnil));
1093 return YJIT_KEEP_COMPILING;
1096 // Move the array from the stack into REG0 and check that it's an array.
1097 mov(cb, REG0, array_opnd);
1098 guard_object_is_heap(cb, REG0, ctx, COUNTED_EXIT(jit, side_exit, expandarray_not_array));
1099 guard_object_is_array(cb, REG0, REG1, ctx, COUNTED_EXIT(jit, side_exit, expandarray_not_array));
1101 // If we don't actually want any values, then just return.
1102 if (num == 0) {
1103 return YJIT_KEEP_COMPILING;
1106 // Pull out the embed flag to check if it's an embedded array.
1107 x86opnd_t flags_opnd = member_opnd(REG0, struct RBasic, flags);
1108 mov(cb, REG1, flags_opnd);
1110 // Move the length of the embedded array into REG1.
1111 and(cb, REG1, imm_opnd(RARRAY_EMBED_LEN_MASK));
1112 shr(cb, REG1, imm_opnd(RARRAY_EMBED_LEN_SHIFT));
1114 // Conditionally move the length of the heap array into REG1.
1115 test(cb, flags_opnd, imm_opnd(RARRAY_EMBED_FLAG));
1116 cmovz(cb, REG1, member_opnd(REG0, struct RArray, as.heap.len));
1118 // Only handle the case where the number of values in the array is greater
1119 // than or equal to the number of values requested.
1120 cmp(cb, REG1, imm_opnd(num));
1121 jl_ptr(cb, COUNTED_EXIT(jit, side_exit, expandarray_rhs_too_small));
1123 // Load the address of the embedded array into REG1.
1124 // (struct RArray *)(obj)->as.ary
1125 lea(cb, REG1, member_opnd(REG0, struct RArray, as.ary));
1127 // Conditionally load the address of the heap array into REG1.
1128 // (struct RArray *)(obj)->as.heap.ptr
1129 test(cb, flags_opnd, imm_opnd(RARRAY_EMBED_FLAG));
1130 cmovz(cb, REG1, member_opnd(REG0, struct RArray, as.heap.ptr));
1132 // Loop backward through the array and push each element onto the stack.
1133 for (int32_t i = (int32_t) num - 1; i >= 0; i--) {
1134 x86opnd_t top = ctx_stack_push(ctx, TYPE_UNKNOWN);
1135 mov(cb, REG0, mem_opnd(64, REG1, i * SIZEOF_VALUE));
1136 mov(cb, top, REG0);
1139 return YJIT_KEEP_COMPILING;
1142 // new hash initialized from top N values
1143 static codegen_status_t
1144 gen_newhash(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1146 int32_t num = (int32_t)jit_get_arg(jit, 0);
1148 // Save the PC and SP because we are allocating
1149 jit_prepare_routine_call(jit, ctx, REG0);
1151 if (num) {
1152 // val = rb_hash_new_with_size(num / 2);
1153 mov(cb, C_ARG_REGS[0], imm_opnd(num / 2));
1154 call_ptr(cb, REG0, (void *)rb_hash_new_with_size);
1156 // save the allocated hash as we want to push it after insertion
1157 push(cb, RAX);
1158 push(cb, RAX); // alignment
1160 // rb_hash_bulk_insert(num, STACK_ADDR_FROM_TOP(num), val);
1161 mov(cb, C_ARG_REGS[0], imm_opnd(num));
1162 lea(cb, C_ARG_REGS[1], ctx_stack_opnd(ctx, num - 1));
1163 mov(cb, C_ARG_REGS[2], RAX);
1164 call_ptr(cb, REG0, (void *)rb_hash_bulk_insert);
1166 pop(cb, RAX); // alignment
1167 pop(cb, RAX);
1169 ctx_stack_pop(ctx, num);
1170 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_HASH);
1171 mov(cb, stack_ret, RAX);
1173 else {
1174 // val = rb_hash_new();
1175 call_ptr(cb, REG0, (void *)rb_hash_new);
1177 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_HASH);
1178 mov(cb, stack_ret, RAX);
1181 return YJIT_KEEP_COMPILING;
1184 // Push a constant value to the stack, including type information.
1185 // The constant may be a heap object or a special constant.
1186 static void
1187 jit_putobject(jitstate_t *jit, ctx_t *ctx, VALUE arg)
1189 val_type_t val_type = yjit_type_of_value(arg);
1190 x86opnd_t stack_top = ctx_stack_push(ctx, val_type);
1192 if (SPECIAL_CONST_P(arg)) {
1193 // Immediates will not move and do not need to be tracked for GC
1194 // Thanks to this we can mov directly to memory when possible.
1196 // NOTE: VALUE -> int64_t cast below is implementation defined.
1197 // Hopefully it preserves the bit pattern or raises a signal.
1198 // See N1256 section 6.3.1.3.
1199 x86opnd_t imm = imm_opnd((int64_t)arg);
1201 // 64-bit immediates can't be directly written to memory
1202 if (imm.num_bits <= 32) {
1203 mov(cb, stack_top, imm);
1205 else {
1206 mov(cb, REG0, imm);
1207 mov(cb, stack_top, REG0);
1210 else {
1211 // Load the value to push into REG0
1212 // Note that this value may get moved by the GC
1213 jit_mov_gc_ptr(jit, cb, REG0, arg);
1215 // Write argument at SP
1216 mov(cb, stack_top, REG0);
1220 static codegen_status_t
1221 gen_putnil(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1223 jit_putobject(jit, ctx, Qnil);
1224 return YJIT_KEEP_COMPILING;
1227 static codegen_status_t
1228 gen_putobject(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1230 VALUE arg = jit_get_arg(jit, 0);
1232 jit_putobject(jit, ctx, arg);
1233 return YJIT_KEEP_COMPILING;
1236 static codegen_status_t
1237 gen_putstring(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1239 VALUE put_val = jit_get_arg(jit, 0);
1241 // Save the PC and SP because the callee will allocate
1242 jit_prepare_routine_call(jit, ctx, REG0);
1244 mov(cb, C_ARG_REGS[0], REG_EC);
1245 jit_mov_gc_ptr(jit, cb, C_ARG_REGS[1], put_val);
1246 call_ptr(cb, REG0, (void *)rb_ec_str_resurrect);
1248 x86opnd_t stack_top = ctx_stack_push(ctx, TYPE_STRING);
1249 mov(cb, stack_top, RAX);
1251 return YJIT_KEEP_COMPILING;
1254 static codegen_status_t
1255 gen_putobject_int2fix(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1257 int opcode = jit_get_opcode(jit);
1258 int cst_val = (opcode == BIN(putobject_INT2FIX_0_))? 0:1;
1260 jit_putobject(jit, ctx, INT2FIX(cst_val));
1261 return YJIT_KEEP_COMPILING;
1264 static codegen_status_t
1265 gen_putself(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1267 // Load self from CFP
1268 mov(cb, REG0, member_opnd(REG_CFP, rb_control_frame_t, self));
1270 // Write it on the stack
1271 x86opnd_t stack_top = ctx_stack_push_self(ctx);
1272 mov(cb, stack_top, REG0);
1274 return YJIT_KEEP_COMPILING;
1277 static codegen_status_t
1278 gen_putspecialobject(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1280 enum vm_special_object_type type = (enum vm_special_object_type)jit_get_arg(jit, 0);
1282 if (type == VM_SPECIAL_OBJECT_VMCORE) {
1283 x86opnd_t stack_top = ctx_stack_push(ctx, TYPE_HEAP);
1284 jit_mov_gc_ptr(jit, cb, REG0, rb_mRubyVMFrozenCore);
1285 mov(cb, stack_top, REG0);
1286 return YJIT_KEEP_COMPILING;
1288 else {
1289 // TODO: implement for VM_SPECIAL_OBJECT_CBASE and
1290 // VM_SPECIAL_OBJECT_CONST_BASE
1291 return YJIT_CANT_COMPILE;
1295 // Get EP at level from CFP
1296 static void
1297 gen_get_ep(codeblock_t *cb, x86opnd_t reg, uint32_t level)
1299 // Load environment pointer EP from CFP
1300 mov(cb, reg, member_opnd(REG_CFP, rb_control_frame_t, ep));
1302 while (level--) {
1303 // Get the previous EP from the current EP
1304 // See GET_PREV_EP(ep) macro
1305 // VALUE *prev_ep = ((VALUE *)((ep)[VM_ENV_DATA_INDEX_SPECVAL] & ~0x03))
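// Note: the load below goes through REG0, so this helper assumes reg == REG0 (true for all current callers)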
1306 mov(cb, reg, mem_opnd(64, REG0, SIZEOF_VALUE * VM_ENV_DATA_INDEX_SPECVAL));
1307 and(cb, reg, imm_opnd(~0x03));
1311 // Compute the index of a local variable from its slot index
1312 static uint32_t
1313 slot_to_local_idx(const rb_iseq_t *iseq, int32_t slot_idx)
1315 // Convoluted rules from local_var_name() in iseq.c
1316 int32_t local_table_size = iseq->body->local_table_size;
1317 int32_t op = slot_idx - VM_ENV_DATA_SIZE;
1318 int32_t local_idx = local_table_size - op - 1;
1319 RUBY_ASSERT(local_idx >= 0 && local_idx < local_table_size);
1320 return (uint32_t)local_idx;
1323 static codegen_status_t
1324 gen_getlocal_wc0(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1326 // Compute the offset from BP to the local
1327 int32_t slot_idx = (int32_t)jit_get_arg(jit, 0);
1328 const int32_t offs = -(SIZEOF_VALUE * slot_idx);
1329 uint32_t local_idx = slot_to_local_idx(jit->iseq, slot_idx);
1331 // Load environment pointer EP (level 0) from CFP
1332 gen_get_ep(cb, REG0, 0);
1334 // Load the local from the EP
1335 mov(cb, REG0, mem_opnd(64, REG0, offs));
1337 // Write the local at SP
1338 x86opnd_t stack_top = ctx_stack_push_local(ctx, local_idx);
1339 mov(cb, stack_top, REG0);
1341 return YJIT_KEEP_COMPILING;
1344 static codegen_status_t
1345 gen_getlocal_generic(ctx_t *ctx, uint32_t local_idx, uint32_t level)
1347 gen_get_ep(cb, REG0, level);
1349 // Load the local from the block
1350 // val = *(vm_get_ep(GET_EP(), level) - idx);
1351 const int32_t offs = -(SIZEOF_VALUE * local_idx);
1352 mov(cb, REG0, mem_opnd(64, REG0, offs));
1354 // Write the local at SP
1355 x86opnd_t stack_top = ctx_stack_push(ctx, TYPE_UNKNOWN);
1356 mov(cb, stack_top, REG0);
1358 return YJIT_KEEP_COMPILING;
1361 static codegen_status_t
1362 gen_getlocal(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1364 int32_t idx = (int32_t)jit_get_arg(jit, 0);
1365 int32_t level = (int32_t)jit_get_arg(jit, 1);
1366 return gen_getlocal_generic(ctx, idx, level);
1369 static codegen_status_t
1370 gen_getlocal_wc1(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1372 int32_t idx = (int32_t)jit_get_arg(jit, 0);
1373 return gen_getlocal_generic(ctx, idx, 1);
1376 static codegen_status_t
1377 gen_setlocal_wc0(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1380 // vm_env_write(const VALUE *ep, int index, VALUE v)
1382 //     VALUE flags = ep[VM_ENV_DATA_INDEX_FLAGS];
1383 //     if (LIKELY((flags & VM_ENV_FLAG_WB_REQUIRED) == 0)) {
1384 //         VM_STACK_ENV_WRITE(ep, index, v);
1386 //     else {
1387 //         vm_env_write_slowpath(ep, index, v);
1392 int32_t slot_idx = (int32_t)jit_get_arg(jit, 0);
1393 uint32_t local_idx = slot_to_local_idx(jit->iseq, slot_idx);
1395 // Load environment pointer EP (level 0) from CFP
1396 gen_get_ep(cb, REG0, 0);
1398 // flags & VM_ENV_FLAG_WB_REQUIRED
1399 x86opnd_t flags_opnd = mem_opnd(64, REG0, sizeof(VALUE) * VM_ENV_DATA_INDEX_FLAGS);
1400 test(cb, flags_opnd, imm_opnd(VM_ENV_FLAG_WB_REQUIRED));
1402 // Create a side-exit to fall back to the interpreter
1403 uint8_t *side_exit = yjit_side_exit(jit, ctx);
1405 // if (flags & VM_ENV_FLAG_WB_REQUIRED) != 0
1406 jnz_ptr(cb, side_exit);
1408 // Set the type of the local variable in the context
1409 val_type_t temp_type = ctx_get_opnd_type(ctx, OPND_STACK(0));
1410 ctx_set_local_type(ctx, local_idx, temp_type);
1412 // Pop the value to write from the stack
1413 x86opnd_t stack_top = ctx_stack_pop(ctx, 1);
1414 mov(cb, REG1, stack_top);
1416 // Write the value at the environment pointer
1417 const int32_t offs = -8 * slot_idx;
1418 mov(cb, mem_opnd(64, REG0, offs), REG1);
1420 return YJIT_KEEP_COMPILING;
1423 // Push Qtrue or Qfalse depending on whether the given keyword was supplied by
1424 // the caller
1425 static codegen_status_t
1426 gen_checkkeyword(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1428 // When a keyword is unspecified past index 32, a hash will be used
1429 // instead. This can only happen in iseqs taking more than 32 keywords.
1430 if (jit->iseq->body->param.keyword->num >= 32) {
1431 return YJIT_CANT_COMPILE;
1434 // The EP offset to the undefined bits local
1435 int32_t bits_offset = (int32_t)jit_get_arg(jit, 0);
1437 // The index of the keyword we want to check
1438 int32_t index = (int32_t)jit_get_arg(jit, 1);
1440 // Load environment pointer EP
1441 gen_get_ep(cb, REG0, 0);
1443 // VALUE kw_bits = *(ep - bits);
1444 x86opnd_t bits_opnd = mem_opnd(64, REG0, sizeof(VALUE) * -bits_offset);
1446 // unsigned int b = (unsigned int)FIX2ULONG(kw_bits);
1447 // if ((b & (0x01 << idx))) {
1449 // We can skip the FIX2ULONG conversion by shifting the bit we test
1450 int64_t bit_test = 0x01 << (index + 1);
1451 test(cb, bits_opnd, imm_opnd(bit_test));
1452 mov(cb, REG0, imm_opnd(Qfalse));
1453 mov(cb, REG1, imm_opnd(Qtrue));
1454 cmovz(cb, REG0, REG1);
1456 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_IMM);
1457 mov(cb, stack_ret, REG0);
1459 return YJIT_KEEP_COMPILING;
1462 static codegen_status_t
1463 gen_setlocal_generic(jitstate_t *jit, ctx_t *ctx, uint32_t local_idx, uint32_t level)
1465 // Load environment pointer EP at level
1466 gen_get_ep(cb, REG0, level);
1468 // flags & VM_ENV_FLAG_WB_REQUIRED
1469 x86opnd_t flags_opnd = mem_opnd(64, REG0, sizeof(VALUE) * VM_ENV_DATA_INDEX_FLAGS);
1470 test(cb, flags_opnd, imm_opnd(VM_ENV_FLAG_WB_REQUIRED));
1472 // Create a side-exit to fall back to the interpreter
1473 uint8_t *side_exit = yjit_side_exit(jit, ctx);
1475 // if (flags & VM_ENV_FLAG_WB_REQUIRED) != 0
1476 jnz_ptr(cb, side_exit);
1478 // Pop the value to write from the stack
1479 x86opnd_t stack_top = ctx_stack_pop(ctx, 1);
1480 mov(cb, REG1, stack_top);
1482 // Write the value at the environment pointer
1483 const int32_t offs = -(SIZEOF_VALUE * local_idx);
1484 mov(cb, mem_opnd(64, REG0, offs), REG1);
1486 return YJIT_KEEP_COMPILING;
1489 static codegen_status_t
1490 gen_setlocal(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1492 int32_t idx = (int32_t)jit_get_arg(jit, 0);
1493 int32_t level = (int32_t)jit_get_arg(jit, 1);
1494 return gen_setlocal_generic(jit, ctx, idx, level);
1497 static codegen_status_t
1498 gen_setlocal_wc1(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1500 int32_t idx = (int32_t)jit_get_arg(jit, 0);
1501 return gen_setlocal_generic(jit, ctx, idx, 1);
1504 static void
1505 gen_jnz_to_target0(codeblock_t *cb, uint8_t *target0, uint8_t *target1, uint8_t shape)
1507 switch (shape) {
1508 case SHAPE_NEXT0:
1509 case SHAPE_NEXT1:
1510 RUBY_ASSERT(false);
1511 break;
1513 case SHAPE_DEFAULT:
1514 jnz_ptr(cb, target0);
1515 break;
1519 static void
1520 gen_jz_to_target0(codeblock_t *cb, uint8_t *target0, uint8_t *target1, uint8_t shape)
1522 switch (shape) {
1523 case SHAPE_NEXT0:
1524 case SHAPE_NEXT1:
1525 RUBY_ASSERT(false);
1526 break;
1528 case SHAPE_DEFAULT:
1529 jz_ptr(cb, target0);
1530 break;
1534 static void
1535 gen_jbe_to_target0(codeblock_t *cb, uint8_t *target0, uint8_t *target1, uint8_t shape)
1537 switch (shape) {
1538 case SHAPE_NEXT0:
1539 case SHAPE_NEXT1:
1540 RUBY_ASSERT(false);
1541 break;
1543 case SHAPE_DEFAULT:
1544 jbe_ptr(cb, target0);
1545 break;
1549 enum jcc_kinds {
1550 JCC_JNE,
1551 JCC_JNZ,
1552 JCC_JZ,
1553 JCC_JE,
1554 JCC_JBE,
1555 JCC_JNA,
1558 // Generate a jump to a stub that recompiles the current YARV instruction on failure.
1559 // When depth_limit is exceeded, generate a jump to a side exit.
1560 static void
1561 jit_chain_guard(enum jcc_kinds jcc, jitstate_t *jit, const ctx_t *ctx, uint8_t depth_limit, uint8_t *side_exit)
1563 branchgen_fn target0_gen_fn;
1565 switch (jcc) {
1566 case JCC_JNE:
1567 case JCC_JNZ:
1568 target0_gen_fn = gen_jnz_to_target0;
1569 break;
1570 case JCC_JZ:
1571 case JCC_JE:
1572 target0_gen_fn = gen_jz_to_target0;
1573 break;
1574 case JCC_JBE:
1575 case JCC_JNA:
1576 target0_gen_fn = gen_jbe_to_target0;
1577 break;
1578 default:
1579 rb_bug("yjit: unimplemented jump kind");
1580 break;
1583 if (ctx->chain_depth < depth_limit) {
1584 ctx_t deeper = *ctx;
1585 deeper.chain_depth++;
1587 gen_branch(
1588 jit,
1589 ctx,
1590 (blockid_t) { jit->iseq, jit->insn_idx },
1591 &deeper,
1592 BLOCKID_NULL,
1593 NULL,
1594 target0_gen_fn
1597 else {
1598 target0_gen_fn(cb, side_exit, NULL, SHAPE_DEFAULT);
1602 enum {
1603 GETIVAR_MAX_DEPTH = 10, // up to 5 different classes, and embedded or not for each
1604 OPT_AREF_MAX_CHAIN_DEPTH = 2, // hashes and arrays
1605 SEND_MAX_DEPTH = 5, // up to 5 different classes
1608 VALUE rb_vm_set_ivar_idx(VALUE obj, uint32_t idx, VALUE val);
1610 // Codegen for setting an instance variable.
1611 // Preconditions:
1612 // - receiver is in REG0
1613 // - receiver has the same class as CLASS_OF(comptime_receiver)
1614 // - no stack push or pops to ctx since the entry to the codegen of the instruction being compiled
1615 static codegen_status_t
1616 gen_set_ivar(jitstate_t *jit, ctx_t *ctx, VALUE recv, VALUE klass, ID ivar_name)
1618 // Save the PC and SP because the callee may allocate
1619 // Note that this modifies REG_SP, which is why we do it first
1620 jit_prepare_routine_call(jit, ctx, REG0);
1622 // Get the operands from the stack
1623 x86opnd_t val_opnd = ctx_stack_pop(ctx, 1);
1624 x86opnd_t recv_opnd = ctx_stack_pop(ctx, 1);
1626 uint32_t ivar_index = rb_obj_ensure_iv_index_mapping(recv, ivar_name);
1628 // Call rb_vm_set_ivar_idx with the receiver, the index of the ivar, and the value
1629 mov(cb, C_ARG_REGS[0], recv_opnd);
1630 mov(cb, C_ARG_REGS[1], imm_opnd(ivar_index));
1631 mov(cb, C_ARG_REGS[2], val_opnd);
1632 call_ptr(cb, REG0, (void *)rb_vm_set_ivar_idx);
1634 x86opnd_t out_opnd = ctx_stack_push(ctx, TYPE_UNKNOWN);
1635 mov(cb, out_opnd, RAX);
1637 return YJIT_KEEP_COMPILING;
1640 // Codegen for getting an instance variable.
1641 // Preconditions:
1642 // - receiver is in REG0
1643 // - receiver has the same class as CLASS_OF(comptime_receiver)
1644 // - no stack push or pops to ctx since the entry to the codegen of the instruction being compiled
1645 static codegen_status_t
1646 gen_get_ivar(jitstate_t *jit, ctx_t *ctx, const int max_chain_depth, VALUE comptime_receiver, ID ivar_name, insn_opnd_t reg0_opnd, uint8_t *side_exit)
1648 VALUE comptime_val_klass = CLASS_OF(comptime_receiver);
1649 const ctx_t starting_context = *ctx; // make a copy for use with jit_chain_guard
1651 // If the class uses the default allocator, instances should all be T_OBJECT
1652 // NOTE: This assumes nobody changes the allocator of the class after allocation.
1653 // Eventually, we can encode whether an object is T_OBJECT or not
1654 // inside object shapes.
1655 if (!RB_TYPE_P(comptime_receiver, T_OBJECT) ||
1656 rb_get_alloc_func(comptime_val_klass) != rb_class_allocate_instance) {
1657 // General case. Call rb_ivar_get().
1658 // VALUE rb_ivar_get(VALUE obj, ID id)
1659 ADD_COMMENT(cb, "call rb_ivar_get()");
1661 // The function could raise exceptions.
1662 jit_prepare_routine_call(jit, ctx, REG1);
1664 mov(cb, C_ARG_REGS[0], REG0);
1665 mov(cb, C_ARG_REGS[1], imm_opnd((int64_t)ivar_name));
1666 call_ptr(cb, REG1, (void *)rb_ivar_get);
1668 if (!reg0_opnd.is_self) {
1669 (void)ctx_stack_pop(ctx, 1);
1671 // Push the ivar on the stack
1672 x86opnd_t out_opnd = ctx_stack_push(ctx, TYPE_UNKNOWN);
1673 mov(cb, out_opnd, RAX);
1675 // Jump to next instruction. This allows guard chains to share the same successor.
1676 jit_jump_to_next_insn(jit, ctx);
1677 return YJIT_END_BLOCK;
1681 // FIXME:
1682 // This check was added because of a failure in a test involving the
1683 // Nokogiri Document class where we see a T_DATA that still has the default
1684 // allocator.
1685 // Aaron Patterson argues that this is a bug in the C extension, because
1686 // people could call .allocate() on the class and still get a T_OBJECT
1687 // For now I added an extra dynamic check that the receiver is T_OBJECT
1688 // so we can safely pass all the tests in Shopify Core.
1690 // Guard that the receiver is T_OBJECT
1691 // #define RB_BUILTIN_TYPE(x) (int)(((struct RBasic*)(x))->flags & RUBY_T_MASK)
1692 ADD_COMMENT(cb, "guard receiver is T_OBJECT");
1693 mov(cb, REG1, member_opnd(REG0, struct RBasic, flags));
1694 and(cb, REG1, imm_opnd(RUBY_T_MASK));
1695 cmp(cb, REG1, imm_opnd(T_OBJECT));
1696 jit_chain_guard(JCC_JNE, jit, &starting_context, max_chain_depth, side_exit);
1699 // FIXME: Mapping the index could fail when there are too many ivar names. If we're
1700 // compiling for a branch stub, that can cause the exception to be thrown from the
1701 // wrong PC.
1702 uint32_t ivar_index = rb_obj_ensure_iv_index_mapping(comptime_receiver, ivar_name);
1704 // Pop receiver if it's on the temp stack
1705 if (!reg0_opnd.is_self) {
1706 (void)ctx_stack_pop(ctx, 1);
1709 // Compile time self is embedded and the ivar index lands within the object
1710 if (RB_FL_TEST_RAW(comptime_receiver, ROBJECT_EMBED) && ivar_index < ROBJECT_EMBED_LEN_MAX) {
1711 // See ROBJECT_IVPTR() from include/ruby/internal/core/robject.h
1713 // Guard that self is embedded
1714 // TODO: BT and JC would be shorter
1715 ADD_COMMENT(cb, "guard embedded getivar");
1716 x86opnd_t flags_opnd = member_opnd(REG0, struct RBasic, flags);
1717 test(cb, flags_opnd, imm_opnd(ROBJECT_EMBED));
1718 jit_chain_guard(JCC_JZ, jit, &starting_context, max_chain_depth, COUNTED_EXIT(jit, side_exit, getivar_megamorphic));
1720 // Load the variable
1721 x86opnd_t ivar_opnd = mem_opnd(64, REG0, offsetof(struct RObject, as.ary) + ivar_index * SIZEOF_VALUE);
1722 mov(cb, REG1, ivar_opnd);
1724 // If the variable is Qundef (unset ivar), replace it with Qnil
1725 cmp(cb, REG1, imm_opnd(Qundef));
1726 mov(cb, REG0, imm_opnd(Qnil));
1727 cmove(cb, REG1, REG0);
1729 // Push the ivar on the stack
1730 x86opnd_t out_opnd = ctx_stack_push(ctx, TYPE_UNKNOWN);
1731 mov(cb, out_opnd, REG1);
1733 else {
1734 // Compile time value is *not* embedded.
1736 // Guard that value is *not* embedded
1737 // See ROBJECT_IVPTR() from include/ruby/internal/core/robject.h
1738 ADD_COMMENT(cb, "guard extended getivar");
1739 x86opnd_t flags_opnd = member_opnd(REG0, struct RBasic, flags);
1740 test(cb, flags_opnd, imm_opnd(ROBJECT_EMBED));
1741 jit_chain_guard(JCC_JNZ, jit, &starting_context, max_chain_depth, COUNTED_EXIT(jit, side_exit, getivar_megamorphic));
1743 // check that the extended table is big enough
1744 if (ivar_index >= ROBJECT_EMBED_LEN_MAX + 1) {
1745 // Check that the slot is inside the extended table (num_slots > index)
1746 x86opnd_t num_slots = mem_opnd(32, REG0, offsetof(struct RObject, as.heap.numiv));
1747 cmp(cb, num_slots, imm_opnd(ivar_index));
1748 jle_ptr(cb, COUNTED_EXIT(jit, side_exit, getivar_idx_out_of_range));
1751 // Get a pointer to the extended table
1752 x86opnd_t tbl_opnd = mem_opnd(64, REG0, offsetof(struct RObject, as.heap.ivptr));
1753 mov(cb, REG0, tbl_opnd);
1755 // Read the ivar from the extended table
1756 x86opnd_t ivar_opnd = mem_opnd(64, REG0, sizeof(VALUE) * ivar_index);
1757 mov(cb, REG0, ivar_opnd);
1759 // If the ivar is Qundef (unset), replace it with Qnil
1760 cmp(cb, REG0, imm_opnd(Qundef));
1761 mov(cb, REG1, imm_opnd(Qnil));
1762 cmove(cb, REG0, REG1);
1764 // Push the ivar on the stack
1765 x86opnd_t out_opnd = ctx_stack_push(ctx, TYPE_UNKNOWN);
1766 mov(cb, out_opnd, REG0);
1769 // Jump to next instruction. This allows guard chains to share the same successor.
1770 jit_jump_to_next_insn(jit, ctx);
1771 return YJIT_END_BLOCK;
1774 static codegen_status_t
1775 gen_getinstancevariable(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1777 // Defer compilation so we can specialize on a runtime `self`
1778 if (!jit_at_current_insn(jit)) {
1779 defer_compilation(jit, ctx);
1780 return YJIT_END_BLOCK;
1783 ID ivar_name = (ID)jit_get_arg(jit, 0);
1785 VALUE comptime_val = jit_peek_at_self(jit, ctx);
1786 VALUE comptime_val_klass = CLASS_OF(comptime_val);
1788 // Generate a side exit
1789 uint8_t *side_exit = yjit_side_exit(jit, ctx);
1791 // Guard that the receiver has the same class as the one from compile time.
1792 mov(cb, REG0, member_opnd(REG_CFP, rb_control_frame_t, self));
1794 jit_guard_known_klass(jit, ctx, comptime_val_klass, OPND_SELF, comptime_val, GETIVAR_MAX_DEPTH, side_exit);
1796 return gen_get_ivar(jit, ctx, GETIVAR_MAX_DEPTH, comptime_val, ivar_name, OPND_SELF, side_exit);
1799 void rb_vm_setinstancevariable(const rb_iseq_t *iseq, VALUE obj, ID id, VALUE val, IVC ic);
1801 static codegen_status_t
1802 gen_setinstancevariable(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1804 ID id = (ID)jit_get_arg(jit, 0);
1805 IVC ic = (IVC)jit_get_arg(jit, 1);
1807 // Save the PC and SP because the callee may allocate
1808 // Note that this modifies REG_SP, which is why we do it first
1809 jit_prepare_routine_call(jit, ctx, REG0);
1811 // Get the operands from the stack
1812 x86opnd_t val_opnd = ctx_stack_pop(ctx, 1);
1814 // Call rb_vm_setinstancevariable(iseq, obj, id, val, ic);
1815 mov(cb, C_ARG_REGS[1], member_opnd(REG_CFP, rb_control_frame_t, self));
1816 mov(cb, C_ARG_REGS[3], val_opnd);
1817 mov(cb, C_ARG_REGS[2], imm_opnd(id));
1818 mov(cb, C_ARG_REGS[4], const_ptr_opnd(ic));
1819 jit_mov_gc_ptr(jit, cb, C_ARG_REGS[0], (VALUE)jit->iseq);
1820 call_ptr(cb, REG0, (void *)rb_vm_setinstancevariable);
1822 return YJIT_KEEP_COMPILING;
1825 bool rb_vm_defined(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, rb_num_t op_type, VALUE obj, VALUE v);
1827 static codegen_status_t
1828 gen_defined(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1830 rb_num_t op_type = (rb_num_t)jit_get_arg(jit, 0);
1831 VALUE obj = (VALUE)jit_get_arg(jit, 1);
1832 VALUE pushval = (VALUE)jit_get_arg(jit, 2);
1834 // Save the PC and SP because the callee may allocate
1835 // Note that this modifies REG_SP, which is why we do it first
1836 jit_prepare_routine_call(jit, ctx, REG0);
1838 // Get the operands from the stack
1839 x86opnd_t v_opnd = ctx_stack_pop(ctx, 1);
1841 // Call vm_defined(ec, reg_cfp, op_type, obj, v)
1842 mov(cb, C_ARG_REGS[0], REG_EC);
1843 mov(cb, C_ARG_REGS[1], REG_CFP);
1844 mov(cb, C_ARG_REGS[2], imm_opnd(op_type));
1845 jit_mov_gc_ptr(jit, cb, C_ARG_REGS[3], (VALUE)obj);
1846 mov(cb, C_ARG_REGS[4], v_opnd);
1847 call_ptr(cb, REG0, (void *)rb_vm_defined);
1849 // if (vm_defined(ec, GET_CFP(), op_type, obj, v)) {
1850 // val = pushval;
1851 // }
1852 jit_mov_gc_ptr(jit, cb, REG1, (VALUE)pushval);
1853 cmp(cb, AL, imm_opnd(0));
1854 mov(cb, RAX, imm_opnd(Qnil));
1855 cmovnz(cb, RAX, REG1);
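// Net effect of the sequence above (a sketch, not authoritative): the cmovnz
// implements the branchless equivalent of
//   VALUE val = rb_vm_defined(ec, cfp, op_type, obj, v) ? pushval : Qnil;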
1857 // Push the return value onto the stack
1858 val_type_t out_type = SPECIAL_CONST_P(pushval) ? TYPE_IMM : TYPE_UNKNOWN;
1859 x86opnd_t stack_ret = ctx_stack_push(ctx, out_type);
1860 mov(cb, stack_ret, RAX);
1862 return YJIT_KEEP_COMPILING;
1865 static codegen_status_t
1866 gen_checktype(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1868 enum ruby_value_type type_val = (enum ruby_value_type)jit_get_arg(jit, 0);
1869 // Only three types are emitted by compile.c
1870 if (type_val == T_STRING || type_val == T_ARRAY || type_val == T_HASH) {
1871 val_type_t val_type = ctx_get_opnd_type(ctx, OPND_STACK(0));
1872 x86opnd_t val = ctx_stack_pop(ctx, 1);
1874 x86opnd_t stack_ret;
1876 // Check if we know from type information
1877 if ((type_val == T_STRING && val_type.type == ETYPE_STRING) ||
1878 (type_val == T_ARRAY && val_type.type == ETYPE_ARRAY) ||
1879 (type_val == T_HASH && val_type.type == ETYPE_HASH)) {
1880 // guaranteed type match
1881 stack_ret = ctx_stack_push(ctx, TYPE_TRUE);
1882 mov(cb, stack_ret, imm_opnd(Qtrue));
1883 return YJIT_KEEP_COMPILING;
1885 else if (val_type.is_imm || val_type.type != ETYPE_UNKNOWN) {
1886 // guaranteed not to match T_STRING/T_ARRAY/T_HASH
1887 stack_ret = ctx_stack_push(ctx, TYPE_FALSE);
1888 mov(cb, stack_ret, imm_opnd(Qfalse));
1889 return YJIT_KEEP_COMPILING;
1892 mov(cb, REG0, val);
1893 mov(cb, REG1, imm_opnd(Qfalse));
1895 uint32_t ret = cb_new_label(cb, "ret");
1897 if (!val_type.is_heap) {
1898 // if (SPECIAL_CONST_P(val)) {
1899 // Return Qfalse via REG1 if not on heap
1900 test(cb, REG0, imm_opnd(RUBY_IMMEDIATE_MASK));
1901 jnz_label(cb, ret);
1902 cmp(cb, REG0, imm_opnd(Qnil));
1903 jbe_label(cb, ret);
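// Why this works (illustrative, assuming the usual 64-bit representation):
// immediates such as Fixnum 1 (0x03) have a bit inside RUBY_IMMEDIATE_MASK
// set, while Qfalse (0x00) and Qnil (0x08) do not but compare <= Qnil, so
// anything that reaches the code below must be a heap pointer.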
1906 // Check type on object
1907 mov(cb, REG0, mem_opnd(64, REG0, offsetof(struct RBasic, flags)));
1908 and(cb, REG0, imm_opnd(RUBY_T_MASK));
1909 cmp(cb, REG0, imm_opnd(type_val));
1910 mov(cb, REG0, imm_opnd(Qtrue));
1911 // REG1 contains Qfalse from above
1912 cmove(cb, REG1, REG0);
1914 cb_write_label(cb, ret);
1915 stack_ret = ctx_stack_push(ctx, TYPE_IMM);
1916 mov(cb, stack_ret, REG1);
1917 cb_link_labels(cb);
1919 return YJIT_KEEP_COMPILING;
1921 else {
1922 return YJIT_CANT_COMPILE;
1926 static codegen_status_t
1927 gen_concatstrings(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
1929 rb_num_t n = (rb_num_t)jit_get_arg(jit, 0);
1931 // Save the PC and SP because we are allocating
1932 jit_prepare_routine_call(jit, ctx, REG0);
1934 x86opnd_t values_ptr = ctx_sp_opnd(ctx, -(sizeof(VALUE) * (uint32_t)n));
1936 // call rb_str_concat_literals(long n, const VALUE *strings);
1937 mov(cb, C_ARG_REGS[0], imm_opnd(n));
1938 lea(cb, C_ARG_REGS[1], values_ptr);
1939 call_ptr(cb, REG0, (void *)rb_str_concat_literals);
1941 ctx_stack_pop(ctx, n);
1942 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_STRING);
1943 mov(cb, stack_ret, RAX);
1945 return YJIT_KEEP_COMPILING;
1948 static void
1949 guard_two_fixnums(ctx_t *ctx, uint8_t *side_exit)
1951 // Get the stack operand types
1952 val_type_t arg1_type = ctx_get_opnd_type(ctx, OPND_STACK(0));
1953 val_type_t arg0_type = ctx_get_opnd_type(ctx, OPND_STACK(1));
1955 if (arg0_type.is_heap || arg1_type.is_heap) {
1956 jmp_ptr(cb, side_exit);
1957 return;
1960 if (arg0_type.type != ETYPE_FIXNUM && arg0_type.type != ETYPE_UNKNOWN) {
1961 jmp_ptr(cb, side_exit);
1962 return;
1965 if (arg1_type.type != ETYPE_FIXNUM && arg1_type.type != ETYPE_UNKNOWN) {
1966 jmp_ptr(cb, side_exit);
1967 return;
1970 RUBY_ASSERT(!arg0_type.is_heap);
1971 RUBY_ASSERT(!arg1_type.is_heap);
1972 RUBY_ASSERT(arg0_type.type == ETYPE_FIXNUM || arg0_type.type == ETYPE_UNKNOWN);
1973 RUBY_ASSERT(arg1_type.type == ETYPE_FIXNUM || arg1_type.type == ETYPE_UNKNOWN);
1975 // Get stack operands without popping them
1976 x86opnd_t arg1 = ctx_stack_opnd(ctx, 0);
1977 x86opnd_t arg0 = ctx_stack_opnd(ctx, 1);
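// Tagging sketch (assuming the usual representation): a Fixnum N is stored
// as (N << 1) | 1, e.g. 7 becomes 0x0f, so testing RUBY_FIXNUM_FLAG (the low
// bit) below is enough to distinguish fixnums from other values.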
1979 // If not fixnums, fall back
1980 if (arg0_type.type != ETYPE_FIXNUM) {
1981 ADD_COMMENT(cb, "guard arg0 fixnum");
1982 test(cb, arg0, imm_opnd(RUBY_FIXNUM_FLAG));
1983 jz_ptr(cb, side_exit);
1985 if (arg1_type.type != ETYPE_FIXNUM) {
1986 ADD_COMMENT(cb, "guard arg1 fixnum");
1987 test(cb, arg1, imm_opnd(RUBY_FIXNUM_FLAG));
1988 jz_ptr(cb, side_exit);
1991 // Set stack types in context
1992 ctx_upgrade_opnd_type(ctx, OPND_STACK(0), TYPE_FIXNUM);
1993 ctx_upgrade_opnd_type(ctx, OPND_STACK(1), TYPE_FIXNUM);
1996 // Conditional move operation used by comparison operators
1997 typedef void (*cmov_fn)(codeblock_t *cb, x86opnd_t opnd0, x86opnd_t opnd1);
1999 static codegen_status_t
2000 gen_fixnum_cmp(jitstate_t *jit, ctx_t *ctx, cmov_fn cmov_op)
2002 // Defer compilation so we can specialize based on a runtime receiver
2003 if (!jit_at_current_insn(jit)) {
2004 defer_compilation(jit, ctx);
2005 return YJIT_END_BLOCK;
2008 VALUE comptime_a = jit_peek_at_stack(jit, ctx, 1);
2009 VALUE comptime_b = jit_peek_at_stack(jit, ctx, 0);
2011 if (FIXNUM_P(comptime_a) && FIXNUM_P(comptime_b)) {
2012 // Create a side-exit to fall back to the interpreter
2013 // Note: we generate the side-exit before popping operands from the stack
2014 uint8_t *side_exit = yjit_side_exit(jit, ctx);
2016 if (!assume_bop_not_redefined(jit, INTEGER_REDEFINED_OP_FLAG, BOP_LT)) {
2017 return YJIT_CANT_COMPILE;
2020 // Check that both operands are fixnums
2021 guard_two_fixnums(ctx, side_exit);
2023 // Get the operands from the stack
2024 x86opnd_t arg1 = ctx_stack_pop(ctx, 1);
2025 x86opnd_t arg0 = ctx_stack_pop(ctx, 1);
2027 // Compare the arguments
2028 xor(cb, REG0_32, REG0_32); // REG0 = Qfalse
2029 mov(cb, REG1, arg0);
2030 cmp(cb, REG1, arg1);
2031 mov(cb, REG1, imm_opnd(Qtrue));
2032 cmov_op(cb, REG0, REG1);
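// Branchless boolean sketch: REG0 starts at 0, which is Qfalse (see the
// STATIC_ASSERT on Qfalse == 0 elsewhere in this file), and the cmov only
// overwrites it with Qtrue when the comparison holds, i.e. roughly
//   result = (arg0 OP arg1) ? Qtrue : Qfalse;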
2034 // Push the output on the stack
2035 x86opnd_t dst = ctx_stack_push(ctx, TYPE_UNKNOWN);
2036 mov(cb, dst, REG0);
2038 return YJIT_KEEP_COMPILING;
2040 else {
2041 return gen_opt_send_without_block(jit, ctx, cb);
2045 static codegen_status_t
2046 gen_opt_lt(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2048 return gen_fixnum_cmp(jit, ctx, cmovl);
2051 static codegen_status_t
2052 gen_opt_le(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2054 return gen_fixnum_cmp(jit, ctx, cmovle);
2057 static codegen_status_t
2058 gen_opt_ge(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2060 return gen_fixnum_cmp(jit, ctx, cmovge);
2063 static codegen_status_t
2064 gen_opt_gt(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2066 return gen_fixnum_cmp(jit, ctx, cmovg);
2069 // Implements specialized equality for either two fixnum or two strings
2070 // Returns true if code was generated, otherwise false
2071 static bool
2072 gen_equality_specialized(jitstate_t *jit, ctx_t *ctx, uint8_t *side_exit)
2074 VALUE comptime_a = jit_peek_at_stack(jit, ctx, 1);
2075 VALUE comptime_b = jit_peek_at_stack(jit, ctx, 0);
2077 x86opnd_t a_opnd = ctx_stack_opnd(ctx, 1);
2078 x86opnd_t b_opnd = ctx_stack_opnd(ctx, 0);
2080 if (FIXNUM_P(comptime_a) && FIXNUM_P(comptime_b)) {
2081 if (!assume_bop_not_redefined(jit, INTEGER_REDEFINED_OP_FLAG, BOP_EQ)) {
2082 // if overridden, emit the generic version
2083 return false;
2086 guard_two_fixnums(ctx, side_exit);
2088 mov(cb, REG0, a_opnd);
2089 cmp(cb, REG0, b_opnd);
2091 mov(cb, REG0, imm_opnd(Qfalse));
2092 mov(cb, REG1, imm_opnd(Qtrue));
2093 cmove(cb, REG0, REG1);
2095 // Push the output on the stack
2096 ctx_stack_pop(ctx, 2);
2097 x86opnd_t dst = ctx_stack_push(ctx, TYPE_IMM);
2098 mov(cb, dst, REG0);
2100 return true;
2102 else if (CLASS_OF(comptime_a) == rb_cString &&
2103 CLASS_OF(comptime_b) == rb_cString) {
2104 if (!assume_bop_not_redefined(jit, STRING_REDEFINED_OP_FLAG, BOP_EQ)) {
2105 // if overridden, emit the generic version
2106 return false;
2109 // Load a and b in preparation for call later
2110 mov(cb, C_ARG_REGS[0], a_opnd);
2111 mov(cb, C_ARG_REGS[1], b_opnd);
2113 // Guard that a is a String
2114 mov(cb, REG0, C_ARG_REGS[0]);
2115 jit_guard_known_klass(jit, ctx, rb_cString, OPND_STACK(1), comptime_a, SEND_MAX_DEPTH, side_exit);
2117 uint32_t ret = cb_new_label(cb, "ret");
2119 // If they are equal by identity, return true
2120 cmp(cb, C_ARG_REGS[0], C_ARG_REGS[1]);
2121 mov(cb, RAX, imm_opnd(Qtrue));
2122 je_label(cb, ret);
2124 // Otherwise guard that b is a T_STRING (from type info) or String (from runtime guard)
2125 if (ctx_get_opnd_type(ctx, OPND_STACK(0)).type != ETYPE_STRING) {
2126 mov(cb, REG0, C_ARG_REGS[1]);
2127 // Note: any T_STRING is valid here, but we check for a ::String for simplicity
2128 jit_guard_known_klass(jit, ctx, rb_cString, OPND_STACK(0), comptime_b, SEND_MAX_DEPTH, side_exit);
2131 // Call rb_str_eql_internal(a, b)
2132 call_ptr(cb, REG0, (void *)rb_str_eql_internal);
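// Net effect of this fast path, roughly:
//   result = (a == b) ? Qtrue : rb_str_eql_internal(a, b);
// i.e. identical objects skip the byte-by-byte comparison entirely.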
2134 // Push the output on the stack
2135 cb_write_label(cb, ret);
2136 ctx_stack_pop(ctx, 2);
2137 x86opnd_t dst = ctx_stack_push(ctx, TYPE_IMM);
2138 mov(cb, dst, RAX);
2139 cb_link_labels(cb);
2141 return true;
2143 else {
2144 return false;
2148 static codegen_status_t
2149 gen_opt_eq(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2151 // Defer compilation so we can specialize based on a runtime receiver
2152 if (!jit_at_current_insn(jit)) {
2153 defer_compilation(jit, ctx);
2154 return YJIT_END_BLOCK;
2157 // Create a side-exit to fall back to the interpreter
2158 uint8_t *side_exit = yjit_side_exit(jit, ctx);
2160 if (gen_equality_specialized(jit, ctx, side_exit)) {
2161 jit_jump_to_next_insn(jit, ctx);
2162 return YJIT_END_BLOCK;
2164 else {
2165 return gen_opt_send_without_block(jit, ctx, cb);
2169 static codegen_status_t gen_send_general(jitstate_t *jit, ctx_t *ctx, struct rb_call_data *cd, rb_iseq_t *block);
2171 static codegen_status_t
2172 gen_opt_neq(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2174 // opt_neq is passed two rb_call_data as arguments:
2175 // first for ==, second for !=
2176 struct rb_call_data *cd = (struct rb_call_data *)jit_get_arg(jit, 1);
2177 return gen_send_general(jit, ctx, cd, NULL);
2180 static codegen_status_t
2181 gen_opt_aref(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2183 struct rb_call_data * cd = (struct rb_call_data *)jit_get_arg(jit, 0);
2184 int32_t argc = (int32_t)vm_ci_argc(cd->ci);
2186 // Only JIT one arg calls like `ary[6]`
2187 if (argc != 1) {
2188 GEN_COUNTER_INC(cb, oaref_argc_not_one);
2189 return YJIT_CANT_COMPILE;
2192 // Defer compilation so we can specialize based on a runtime receiver
2193 if (!jit_at_current_insn(jit)) {
2194 defer_compilation(jit, ctx);
2195 return YJIT_END_BLOCK;
2198 // Remember the context on entry for adding guard chains
2199 const ctx_t starting_context = *ctx;
2201 // Specialize based on compile-time values
2202 VALUE comptime_idx = jit_peek_at_stack(jit, ctx, 0);
2203 VALUE comptime_recv = jit_peek_at_stack(jit, ctx, 1);
2205 // Create a side-exit to fall back to the interpreter
2206 uint8_t *side_exit = yjit_side_exit(jit, ctx);
2208 if (CLASS_OF(comptime_recv) == rb_cArray && RB_FIXNUM_P(comptime_idx)) {
2209 if (!assume_bop_not_redefined(jit, ARRAY_REDEFINED_OP_FLAG, BOP_AREF)) {
2210 return YJIT_CANT_COMPILE;
2213 // Pop the stack operands
2214 x86opnd_t idx_opnd = ctx_stack_pop(ctx, 1);
2215 x86opnd_t recv_opnd = ctx_stack_pop(ctx, 1);
2216 mov(cb, REG0, recv_opnd);
2218 // if (SPECIAL_CONST_P(recv)) {
2219 // Bail if receiver is not a heap object
2220 test(cb, REG0, imm_opnd(RUBY_IMMEDIATE_MASK));
2221 jnz_ptr(cb, side_exit);
2222 cmp(cb, REG0, imm_opnd(Qfalse));
2223 je_ptr(cb, side_exit);
2224 cmp(cb, REG0, imm_opnd(Qnil));
2225 je_ptr(cb, side_exit);
2227 // Bail if recv has a class other than ::Array.
2228 // BOP_AREF check above is only good for ::Array.
2229 mov(cb, REG1, mem_opnd(64, REG0, offsetof(struct RBasic, klass)));
2230 mov(cb, REG0, const_ptr_opnd((void *)rb_cArray));
2231 cmp(cb, REG0, REG1);
2232 jit_chain_guard(JCC_JNE, jit, &starting_context, OPT_AREF_MAX_CHAIN_DEPTH, side_exit);
2234 // Bail if idx is not a FIXNUM
2235 mov(cb, REG1, idx_opnd);
2236 test(cb, REG1, imm_opnd(RUBY_FIXNUM_FLAG));
2237 jz_ptr(cb, COUNTED_EXIT(jit, side_exit, oaref_arg_not_fixnum));
2239 // Call VALUE rb_ary_entry_internal(VALUE ary, long offset).
2240 // It never raises or allocates, so we don't need to write to cfp->pc.
2242 mov(cb, RDI, recv_opnd);
2243 sar(cb, REG1, imm_opnd(1)); // Convert fixnum to int
2244 mov(cb, RSI, REG1);
2245 call_ptr(cb, REG0, (void *)rb_ary_entry_internal);
2247 // Push the return value onto the stack
2248 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_UNKNOWN);
2249 mov(cb, stack_ret, RAX);
2252 // Jump to next instruction. This allows guard chains to share the same successor.
2253 jit_jump_to_next_insn(jit, ctx);
2254 return YJIT_END_BLOCK;
2256 else if (CLASS_OF(comptime_recv) == rb_cHash) {
2257 if (!assume_bop_not_redefined(jit, HASH_REDEFINED_OP_FLAG, BOP_AREF)) {
2258 return YJIT_CANT_COMPILE;
2261 x86opnd_t key_opnd = ctx_stack_opnd(ctx, 0);
2262 x86opnd_t recv_opnd = ctx_stack_opnd(ctx, 1);
2264 // Guard that the receiver is a hash
2265 mov(cb, REG0, recv_opnd);
2266 jit_guard_known_klass(jit, ctx, rb_cHash, OPND_STACK(1), comptime_recv, OPT_AREF_MAX_CHAIN_DEPTH, side_exit);
2268 // Setup arguments for rb_hash_aref().
2269 mov(cb, C_ARG_REGS[0], REG0);
2270 mov(cb, C_ARG_REGS[1], key_opnd);
2272 // Prepare to call rb_hash_aref(). It might call #hash on the key.
2273 jit_prepare_routine_call(jit, ctx, REG0);
2275 call_ptr(cb, REG0, (void *)rb_hash_aref);
2277 // Pop the key and the receiver
2278 (void)ctx_stack_pop(ctx, 2);
2280 // Push the return value onto the stack
2281 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_UNKNOWN);
2282 mov(cb, stack_ret, RAX);
2284 // Jump to next instruction. This allows guard chains to share the same successor.
2285 jit_jump_to_next_insn(jit, ctx);
2286 return YJIT_END_BLOCK;
2288 else {
2289 // General case. Call the [] method.
2290 return gen_opt_send_without_block(jit, ctx, cb);
2294 static codegen_status_t
2295 gen_opt_aset(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2297 // Defer compilation so we can specialize on a runtime `self`
2298 if (!jit_at_current_insn(jit)) {
2299 defer_compilation(jit, ctx);
2300 return YJIT_END_BLOCK;
2303 VALUE comptime_recv = jit_peek_at_stack(jit, ctx, 2);
2304 VALUE comptime_key = jit_peek_at_stack(jit, ctx, 1);
2306 // Get the operands from the stack
2307 x86opnd_t recv = ctx_stack_opnd(ctx, 2);
2308 x86opnd_t key = ctx_stack_opnd(ctx, 1);
2309 x86opnd_t val = ctx_stack_opnd(ctx, 0);
2311 if (CLASS_OF(comptime_recv) == rb_cArray && FIXNUM_P(comptime_key)) {
2312 uint8_t *side_exit = yjit_side_exit(jit, ctx);
2314 // Guard receiver is an Array
2315 mov(cb, REG0, recv);
2316 jit_guard_known_klass(jit, ctx, rb_cArray, OPND_STACK(2), comptime_recv, SEND_MAX_DEPTH, side_exit);
2318 // Guard key is a fixnum
2319 mov(cb, REG0, key);
2320 jit_guard_known_klass(jit, ctx, rb_cInteger, OPND_STACK(1), comptime_key, SEND_MAX_DEPTH, side_exit);
2322 // Call rb_ary_store
2323 mov(cb, C_ARG_REGS[0], recv);
2324 mov(cb, C_ARG_REGS[1], key);
2325 sar(cb, C_ARG_REGS[1], imm_opnd(1)); // FIX2LONG(key)
2326 mov(cb, C_ARG_REGS[2], val);
2328 // We might allocate or raise
2329 jit_prepare_routine_call(jit, ctx, REG0);
2331 call_ptr(cb, REG0, (void *)rb_ary_store);
2333 // rb_ary_store returns void
2334 // stored value should still be on stack
2335 mov(cb, REG0, ctx_stack_opnd(ctx, 0));
2337 // Push the return value onto the stack
2338 ctx_stack_pop(ctx, 3);
2339 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_UNKNOWN);
2340 mov(cb, stack_ret, REG0);
2342 jit_jump_to_next_insn(jit, ctx);
2343 return YJIT_END_BLOCK;
2345 else if (CLASS_OF(comptime_recv) == rb_cHash) {
2346 uint8_t *side_exit = yjit_side_exit(jit, ctx);
2348 // Guard receiver is a Hash
2349 mov(cb, REG0, recv);
2350 jit_guard_known_klass(jit, ctx, rb_cHash, OPND_STACK(2), comptime_recv, SEND_MAX_DEPTH, side_exit);
2352 // Call rb_hash_aset
2353 mov(cb, C_ARG_REGS[0], recv);
2354 mov(cb, C_ARG_REGS[1], key);
2355 mov(cb, C_ARG_REGS[2], val);
2357 // We might allocate or raise
2358 jit_prepare_routine_call(jit, ctx, REG0);
2360 call_ptr(cb, REG0, (void *)rb_hash_aset);
2362 // Push the return value onto the stack
2363 ctx_stack_pop(ctx, 3);
2364 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_UNKNOWN);
2365 mov(cb, stack_ret, RAX);
2367 jit_jump_to_next_insn(jit, ctx);
2368 return YJIT_END_BLOCK;
2370 else {
2371 return gen_opt_send_without_block(jit, ctx, cb);
2375 static codegen_status_t
2376 gen_opt_and(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2378 // Defer compilation so we can specialize on a runtime `self`
2379 if (!jit_at_current_insn(jit)) {
2380 defer_compilation(jit, ctx);
2381 return YJIT_END_BLOCK;
2384 VALUE comptime_a = jit_peek_at_stack(jit, ctx, 1);
2385 VALUE comptime_b = jit_peek_at_stack(jit, ctx, 0);
2387 if (FIXNUM_P(comptime_a) && FIXNUM_P(comptime_b)) {
2388 // Create a side-exit to fall back to the interpreter
2389 // Note: we generate the side-exit before popping operands from the stack
2390 uint8_t *side_exit = yjit_side_exit(jit, ctx);
2392 if (!assume_bop_not_redefined(jit, INTEGER_REDEFINED_OP_FLAG, BOP_AND)) {
2393 return YJIT_CANT_COMPILE;
2396 // Check that both operands are fixnums
2397 guard_two_fixnums(ctx, side_exit);
2399 // Get the operands and destination from the stack
2400 x86opnd_t arg1 = ctx_stack_pop(ctx, 1);
2401 x86opnd_t arg0 = ctx_stack_pop(ctx, 1);
2403 // Do the bitwise AND: arg0 & arg1
2404 mov(cb, REG0, arg0);
2405 and(cb, REG0, arg1);
2407 // Push the output on the stack
2408 x86opnd_t dst = ctx_stack_push(ctx, TYPE_FIXNUM);
2409 mov(cb, dst, REG0);
2411 return YJIT_KEEP_COMPILING;
2413 else {
2414 // Delegate to send, call the method on the recv
2415 return gen_opt_send_without_block(jit, ctx, cb);
2419 static codegen_status_t
2420 gen_opt_or(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2422 // Defer compilation so we can specialize on a runtime `self`
2423 if (!jit_at_current_insn(jit)) {
2424 defer_compilation(jit, ctx);
2425 return YJIT_END_BLOCK;
2428 VALUE comptime_a = jit_peek_at_stack(jit, ctx, 1);
2429 VALUE comptime_b = jit_peek_at_stack(jit, ctx, 0);
2431 if (FIXNUM_P(comptime_a) && FIXNUM_P(comptime_b)) {
2432 // Create a side-exit to fall back to the interpreter
2433 // Note: we generate the side-exit before popping operands from the stack
2434 uint8_t *side_exit = yjit_side_exit(jit, ctx);
2436 if (!assume_bop_not_redefined(jit, INTEGER_REDEFINED_OP_FLAG, BOP_OR)) {
2437 return YJIT_CANT_COMPILE;
2440 // Check that both operands are fixnums
2441 guard_two_fixnums(ctx, side_exit);
2443 // Get the operands and destination from the stack
2444 x86opnd_t arg1 = ctx_stack_pop(ctx, 1);
2445 x86opnd_t arg0 = ctx_stack_pop(ctx, 1);
2447 // Do the bitwise OR: arg0 | arg1
2448 mov(cb, REG0, arg0);
2449 or(cb, REG0, arg1);
2451 // Push the output on the stack
2452 x86opnd_t dst = ctx_stack_push(ctx, TYPE_FIXNUM);
2453 mov(cb, dst, REG0);
2455 return YJIT_KEEP_COMPILING;
2457 else {
2458 // Delegate to send, call the method on the recv
2459 return gen_opt_send_without_block(jit, ctx, cb);
2463 static codegen_status_t
2464 gen_opt_minus(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2466 // Defer compilation so we can specialize on a runtime `self`
2467 if (!jit_at_current_insn(jit)) {
2468 defer_compilation(jit, ctx);
2469 return YJIT_END_BLOCK;
2472 VALUE comptime_a = jit_peek_at_stack(jit, ctx, 1);
2473 VALUE comptime_b = jit_peek_at_stack(jit, ctx, 0);
2475 if (FIXNUM_P(comptime_a) && FIXNUM_P(comptime_b)) {
2476 // Create a side-exit to fall back to the interpreter
2477 // Note: we generate the side-exit before popping operands from the stack
2478 uint8_t *side_exit = yjit_side_exit(jit, ctx);
2480 if (!assume_bop_not_redefined(jit, INTEGER_REDEFINED_OP_FLAG, BOP_MINUS)) {
2481 return YJIT_CANT_COMPILE;
2484 // Check that both operands are fixnums
2485 guard_two_fixnums(ctx, side_exit);
2487 // Get the operands and destination from the stack
2488 x86opnd_t arg1 = ctx_stack_pop(ctx, 1);
2489 x86opnd_t arg0 = ctx_stack_pop(ctx, 1);
2491 // Subtract arg0 - arg1 and test for overflow
2492 mov(cb, REG0, arg0);
2493 sub(cb, REG0, arg1);
2494 jo_ptr(cb, side_exit);
2495 add(cb, REG0, imm_opnd(1));
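// Tagged arithmetic sketch: with a stored as 2A+1 and b as 2B+1,
// (2A+1) - (2B+1) = 2(A-B); the jo above catches fixnum overflow and the
// add of 1 restores the tag, giving 2(A-B)+1.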
2497 // Push the output on the stack
2498 x86opnd_t dst = ctx_stack_push(ctx, TYPE_FIXNUM);
2499 mov(cb, dst, REG0);
2501 return YJIT_KEEP_COMPILING;
2503 else {
2504 // Delegate to send, call the method on the recv
2505 return gen_opt_send_without_block(jit, ctx, cb);
2509 static codegen_status_t
2510 gen_opt_plus(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2512 // Defer compilation so we can specialize on a runtime `self`
2513 if (!jit_at_current_insn(jit)) {
2514 defer_compilation(jit, ctx);
2515 return YJIT_END_BLOCK;
2518 VALUE comptime_a = jit_peek_at_stack(jit, ctx, 1);
2519 VALUE comptime_b = jit_peek_at_stack(jit, ctx, 0);
2521 if (FIXNUM_P(comptime_a) && FIXNUM_P(comptime_b)) {
2522 // Create a side-exit to fall back to the interpreter
2523 // Note: we generate the side-exit before popping operands from the stack
2524 uint8_t *side_exit = yjit_side_exit(jit, ctx);
2526 if (!assume_bop_not_redefined(jit, INTEGER_REDEFINED_OP_FLAG, BOP_PLUS)) {
2527 return YJIT_CANT_COMPILE;
2530 // Check that both operands are fixnums
2531 guard_two_fixnums(ctx, side_exit);
2533 // Get the operands and destination from the stack
2534 x86opnd_t arg1 = ctx_stack_pop(ctx, 1);
2535 x86opnd_t arg0 = ctx_stack_pop(ctx, 1);
2537 // Add arg0 + arg1 and test for overflow
2538 mov(cb, REG0, arg0);
2539 sub(cb, REG0, imm_opnd(1));
2540 add(cb, REG0, arg1);
2541 jo_ptr(cb, side_exit);
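// Tagged arithmetic sketch: with a stored as 2A+1 and b as 2B+1,
// (2A+1) - 1 + (2B+1) = 2(A+B) + 1, which is already the tagged sum, so no
// re-tagging is needed; jo catches fixnum overflow.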
2543 // Push the output on the stack
2544 x86opnd_t dst = ctx_stack_push(ctx, TYPE_FIXNUM);
2545 mov(cb, dst, REG0);
2547 return YJIT_KEEP_COMPILING;
2549 else {
2550 // Delegate to send, call the method on the recv
2551 return gen_opt_send_without_block(jit, ctx, cb);
2555 static codegen_status_t
2556 gen_opt_mult(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2558 // Delegate to send, call the method on the recv
2559 return gen_opt_send_without_block(jit, ctx, cb);
2562 static codegen_status_t
2563 gen_opt_div(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2565 // Delegate to send, call the method on the recv
2566 return gen_opt_send_without_block(jit, ctx, cb);
2569 VALUE rb_vm_opt_mod(VALUE recv, VALUE obj);
2571 static codegen_status_t
2572 gen_opt_mod(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2574 // Save the PC and SP because the callee may allocate bignums
2575 // Note that this modifies REG_SP, which is why we do it first
2576 jit_prepare_routine_call(jit, ctx, REG0);
2578 uint8_t *side_exit = yjit_side_exit(jit, ctx);
2580 // Get the operands from the stack
2581 x86opnd_t arg1 = ctx_stack_pop(ctx, 1);
2582 x86opnd_t arg0 = ctx_stack_pop(ctx, 1);
2584 // Call rb_vm_opt_mod(VALUE recv, VALUE obj)
2585 mov(cb, C_ARG_REGS[0], arg0);
2586 mov(cb, C_ARG_REGS[1], arg1);
2587 call_ptr(cb, REG0, (void *)rb_vm_opt_mod);
2589 // If val == Qundef, bail to do a method call
2590 cmp(cb, RAX, imm_opnd(Qundef));
2591 je_ptr(cb, side_exit);
2593 // Push the return value onto the stack
2594 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_UNKNOWN);
2595 mov(cb, stack_ret, RAX);
2597 return YJIT_KEEP_COMPILING;
2600 static codegen_status_t
2601 gen_opt_ltlt(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2603 // Delegate to send, call the method on the recv
2604 return gen_opt_send_without_block(jit, ctx, cb);
2607 static codegen_status_t
2608 gen_opt_nil_p(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2610 // Delegate to send, call the method on the recv
2611 return gen_opt_send_without_block(jit, ctx, cb);
2614 static codegen_status_t
2615 gen_opt_empty_p(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2617 // Delegate to send, call the method on the recv
2618 return gen_opt_send_without_block(jit, ctx, cb);
2621 static codegen_status_t
2622 gen_opt_str_freeze(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2624 if (!assume_bop_not_redefined(jit, STRING_REDEFINED_OP_FLAG, BOP_FREEZE)) {
2625 return YJIT_CANT_COMPILE;
2628 VALUE str = jit_get_arg(jit, 0);
2629 jit_mov_gc_ptr(jit, cb, REG0, str);
2631 // Push the return value onto the stack
2632 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_STRING);
2633 mov(cb, stack_ret, REG0);
2635 return YJIT_KEEP_COMPILING;
2638 static codegen_status_t
2639 gen_opt_str_uminus(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2641 if (!assume_bop_not_redefined(jit, STRING_REDEFINED_OP_FLAG, BOP_UMINUS)) {
2642 return YJIT_CANT_COMPILE;
2645 VALUE str = jit_get_arg(jit, 0);
2646 jit_mov_gc_ptr(jit, cb, REG0, str);
2648 // Push the return value onto the stack
2649 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_STRING);
2650 mov(cb, stack_ret, REG0);
2652 return YJIT_KEEP_COMPILING;
2655 static codegen_status_t
2656 gen_opt_not(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2658 return gen_opt_send_without_block(jit, ctx, cb);
2661 static codegen_status_t
2662 gen_opt_size(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2664 return gen_opt_send_without_block(jit, ctx, cb);
2667 static codegen_status_t
2668 gen_opt_length(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2670 return gen_opt_send_without_block(jit, ctx, cb);
2673 static codegen_status_t
2674 gen_opt_regexpmatch2(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2676 return gen_opt_send_without_block(jit, ctx, cb);
2679 static codegen_status_t
2680 gen_opt_case_dispatch(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2682 // Normally this instruction would lookup the key in a hash and jump to an
2683 // offset based on that.
2684 // Instead we can take the fallback case and continue with the next
2685 // instruction.
2686 // We'd hope that our jitted code will be sufficiently fast without the
2687 // hash lookup, at least for small hashes, but it's worth revisiting this
2688 // assumption in the future.
2690 ctx_stack_pop(ctx, 1);
2692 return YJIT_KEEP_COMPILING; // continue with the next instruction
2695 static void
2696 gen_branchif_branch(codeblock_t *cb, uint8_t *target0, uint8_t *target1, uint8_t shape)
2698 switch (shape) {
2699 case SHAPE_NEXT0:
2700 jz_ptr(cb, target1);
2701 break;
2703 case SHAPE_NEXT1:
2704 jnz_ptr(cb, target0);
2705 break;
2707 case SHAPE_DEFAULT:
2708 jnz_ptr(cb, target0);
2709 jmp_ptr(cb, target1);
2710 break;
2714 static codegen_status_t
2715 gen_branchif(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2717 int32_t jump_offset = (int32_t)jit_get_arg(jit, 0);
2719 // Check for interrupts, but only on backward branches that may create loops
2720 if (jump_offset < 0) {
2721 uint8_t *side_exit = yjit_side_exit(jit, ctx);
2722 yjit_check_ints(cb, side_exit);
2725 // Test if any bit (outside of the Qnil bit) is on
2726 // RUBY_Qfalse /* ...0000 0000 */
2727 // RUBY_Qnil /* ...0000 1000 */
2728 x86opnd_t val_opnd = ctx_stack_pop(ctx, 1);
2729 test(cb, val_opnd, imm_opnd(~Qnil));
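// Sketch: only Qfalse (0b0000) and Qnil (0b1000) have no bits set outside
// the Qnil bit, so a single test against ~Qnil separates falsey from truthy,
// e.g. Fixnum 1 (0x03) & ~Qnil != 0 while Qnil & ~Qnil == 0.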
2731 // Get the branch target instruction offsets
2732 uint32_t next_idx = jit_next_insn_idx(jit);
2733 uint32_t jump_idx = next_idx + jump_offset;
2734 blockid_t next_block = { jit->iseq, next_idx };
2735 blockid_t jump_block = { jit->iseq, jump_idx };
2737 // Generate the branch instructions
2738 gen_branch(
2739 jit,
2740 ctx,
2741 jump_block,
2742 ctx,
2743 next_block,
2744 ctx,
2745 gen_branchif_branch
2748 return YJIT_END_BLOCK;
2751 static void
2752 gen_branchunless_branch(codeblock_t *cb, uint8_t *target0, uint8_t *target1, uint8_t shape)
2754 switch (shape) {
2755 case SHAPE_NEXT0:
2756 jnz_ptr(cb, target1);
2757 break;
2759 case SHAPE_NEXT1:
2760 jz_ptr(cb, target0);
2761 break;
2763 case SHAPE_DEFAULT:
2764 jz_ptr(cb, target0);
2765 jmp_ptr(cb, target1);
2766 break;
2770 static codegen_status_t
2771 gen_branchunless(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2773 int32_t jump_offset = (int32_t)jit_get_arg(jit, 0);
2775 // Check for interrupts, but only on backward branches that may create loops
2776 if (jump_offset < 0) {
2777 uint8_t *side_exit = yjit_side_exit(jit, ctx);
2778 yjit_check_ints(cb, side_exit);
2781 // Test if any bit (outside of the Qnil bit) is on
2782 // RUBY_Qfalse /* ...0000 0000 */
2783 // RUBY_Qnil /* ...0000 1000 */
2784 x86opnd_t val_opnd = ctx_stack_pop(ctx, 1);
2785 test(cb, val_opnd, imm_opnd(~Qnil));
2787 // Get the branch target instruction offsets
2788 uint32_t next_idx = jit_next_insn_idx(jit);
2789 uint32_t jump_idx = next_idx + jump_offset;
2790 blockid_t next_block = { jit->iseq, next_idx };
2791 blockid_t jump_block = { jit->iseq, jump_idx };
2793 // Generate the branch instructions
2794 gen_branch(
2795 jit,
2796 ctx,
2797 jump_block,
2798 ctx,
2799 next_block,
2800 ctx,
2801 gen_branchunless_branch
2804 return YJIT_END_BLOCK;
2807 static void
2808 gen_branchnil_branch(codeblock_t *cb, uint8_t *target0, uint8_t *target1, uint8_t shape)
2810 switch (shape) {
2811 case SHAPE_NEXT0:
2812 jne_ptr(cb, target1);
2813 break;
2815 case SHAPE_NEXT1:
2816 je_ptr(cb, target0);
2817 break;
2819 case SHAPE_DEFAULT:
2820 je_ptr(cb, target0);
2821 jmp_ptr(cb, target1);
2822 break;
2826 static codegen_status_t
2827 gen_branchnil(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2829 int32_t jump_offset = (int32_t)jit_get_arg(jit, 0);
2831 // Check for interrupts, but only on backward branches that may create loops
2832 if (jump_offset < 0) {
2833 uint8_t *side_exit = yjit_side_exit(jit, ctx);
2834 yjit_check_ints(cb, side_exit);
2837 // Test if the value is Qnil
2838 // RUBY_Qnil /* ...0000 1000 */
2839 x86opnd_t val_opnd = ctx_stack_pop(ctx, 1);
2840 cmp(cb, val_opnd, imm_opnd(Qnil));
2842 // Get the branch target instruction offsets
2843 uint32_t next_idx = jit_next_insn_idx(jit);
2844 uint32_t jump_idx = next_idx + jump_offset;
2845 blockid_t next_block = { jit->iseq, next_idx };
2846 blockid_t jump_block = { jit->iseq, jump_idx };
2848 // Generate the branch instructions
2849 gen_branch(
2850 jit,
2851 ctx,
2852 jump_block,
2853 ctx,
2854 next_block,
2855 ctx,
2856 gen_branchnil_branch
2859 return YJIT_END_BLOCK;
2862 static codegen_status_t
2863 gen_jump(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
2865 int32_t jump_offset = (int32_t)jit_get_arg(jit, 0);
2867 // Check for interrupts, but only on backward branches that may create loops
2868 if (jump_offset < 0) {
2869 uint8_t *side_exit = yjit_side_exit(jit, ctx);
2870 yjit_check_ints(cb, side_exit);
2873 // Get the branch target instruction offsets
2874 uint32_t jump_idx = jit_next_insn_idx(jit) + jump_offset;
2875 blockid_t jump_block = { jit->iseq, jump_idx };
2877 // Generate the jump instruction
2878 gen_direct_jump(
2879 jit,
2880 ctx,
2881 jump_block
2884 return YJIT_END_BLOCK;
2888 Guard that self or a stack operand has the same class as `known_klass`, using
2889 `sample_instance` to speculate about the shape of the runtime value.
2890 FIXNUM and on-heap integers are treated as if they have distinct classes, and
2891 the guard generated for one will fail for the other.
2893 Recompile as a contingency if possible, or take a side exit as a last resort.
2895 static bool
2896 jit_guard_known_klass(jitstate_t *jit, ctx_t *ctx, VALUE known_klass, insn_opnd_t insn_opnd, VALUE sample_instance, const int max_chain_depth, uint8_t *side_exit)
2898 val_type_t val_type = ctx_get_opnd_type(ctx, insn_opnd);
2900 if (known_klass == rb_cNilClass) {
2901 RUBY_ASSERT(!val_type.is_heap);
2902 if (val_type.type != ETYPE_NIL) {
2903 RUBY_ASSERT(val_type.type == ETYPE_UNKNOWN);
2905 ADD_COMMENT(cb, "guard object is nil");
2906 cmp(cb, REG0, imm_opnd(Qnil));
2907 jit_chain_guard(JCC_JNE, jit, ctx, max_chain_depth, side_exit);
2909 ctx_upgrade_opnd_type(ctx, insn_opnd, TYPE_NIL);
2912 else if (known_klass == rb_cTrueClass) {
2913 RUBY_ASSERT(!val_type.is_heap);
2914 if (val_type.type != ETYPE_TRUE) {
2915 RUBY_ASSERT(val_type.type == ETYPE_UNKNOWN);
2917 ADD_COMMENT(cb, "guard object is true");
2918 cmp(cb, REG0, imm_opnd(Qtrue));
2919 jit_chain_guard(JCC_JNE, jit, ctx, max_chain_depth, side_exit);
2921 ctx_upgrade_opnd_type(ctx, insn_opnd, TYPE_TRUE);
2924 else if (known_klass == rb_cFalseClass) {
2925 RUBY_ASSERT(!val_type.is_heap);
2926 if (val_type.type != ETYPE_FALSE) {
2927 RUBY_ASSERT(val_type.type == ETYPE_UNKNOWN);
2929 ADD_COMMENT(cb, "guard object is false");
2930 STATIC_ASSERT(qfalse_is_zero, Qfalse == 0);
2931 test(cb, REG0, REG0);
2932 jit_chain_guard(JCC_JNZ, jit, ctx, max_chain_depth, side_exit);
2934 ctx_upgrade_opnd_type(ctx, insn_opnd, TYPE_FALSE);
2937 else if (known_klass == rb_cInteger && FIXNUM_P(sample_instance)) {
2938 RUBY_ASSERT(!val_type.is_heap);
2939 // We will guard fixnum and bignum as though they were separate classes
2940 // BIGNUM can be handled by the general else case below
2941 if (val_type.type != ETYPE_FIXNUM || !val_type.is_imm) {
2942 RUBY_ASSERT(val_type.type == ETYPE_UNKNOWN);
2944 ADD_COMMENT(cb, "guard object is fixnum");
2945 test(cb, REG0, imm_opnd(RUBY_FIXNUM_FLAG));
2946 jit_chain_guard(JCC_JZ, jit, ctx, max_chain_depth, side_exit);
2947 ctx_upgrade_opnd_type(ctx, insn_opnd, TYPE_FIXNUM);
2950 else if (known_klass == rb_cSymbol && STATIC_SYM_P(sample_instance)) {
2951 RUBY_ASSERT(!val_type.is_heap);
2952 // We will guard STATIC vs DYNAMIC as though they were separate classes
2953 // DYNAMIC symbols can be handled by the general else case below
2954 if (val_type.type != ETYPE_SYMBOL || !val_type.is_imm) {
2955 RUBY_ASSERT(val_type.type == ETYPE_UNKNOWN);
2957 ADD_COMMENT(cb, "guard object is static symbol");
2958 STATIC_ASSERT(special_shift_is_8, RUBY_SPECIAL_SHIFT == 8);
2959 cmp(cb, REG0_8, imm_opnd(RUBY_SYMBOL_FLAG));
2960 jit_chain_guard(JCC_JNE, jit, ctx, max_chain_depth, side_exit);
2961 ctx_upgrade_opnd_type(ctx, insn_opnd, TYPE_STATIC_SYMBOL);
2964 else if (known_klass == rb_cFloat && FLONUM_P(sample_instance)) {
2965 RUBY_ASSERT(!val_type.is_heap);
2966 if (val_type.type != ETYPE_FLONUM || !val_type.is_imm) {
2967 RUBY_ASSERT(val_type.type == ETYPE_UNKNOWN);
2969 // We will guard flonum vs heap float as though they were separate classes
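// Representation sketch (assuming the usual 64-bit flonum scheme): a flonum
// packs a double into the VALUE with (val & RUBY_FLONUM_MASK) == RUBY_FLONUM_FLAG
// (mask 0x3, flag 0x2), which is exactly what the and/cmp below test.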
2970 ADD_COMMENT(cb, "guard object is flonum");
2971 mov(cb, REG1, REG0);
2972 and(cb, REG1, imm_opnd(RUBY_FLONUM_MASK));
2973 cmp(cb, REG1, imm_opnd(RUBY_FLONUM_FLAG));
2974 jit_chain_guard(JCC_JNE, jit, ctx, max_chain_depth, side_exit);
2975 ctx_upgrade_opnd_type(ctx, insn_opnd, TYPE_FLONUM);
2978 else if (FL_TEST(known_klass, FL_SINGLETON) && sample_instance == rb_attr_get(known_klass, id__attached__)) {
2979 // Singleton classes are attached to one specific object, so we can
2980 // avoid one memory access (and potentially the is_heap check) by
2981 // looking for the expected object directly.
2982 // Note that in case the sample instance has a singleton class that
2983 // doesn't attach to the sample instance, it means the sample instance
2984 // has an empty singleton class that hasn't been materialized yet. In
2985 // this case, comparing against the sample instance doesn't guarantee
2986 // that its singleton class is empty, so we can't avoid the memory
2987 // access. As an example, `Object.new.singleton_class` is an object in
2988 // this situation.
2989 ADD_COMMENT(cb, "guard known object with singleton class");
2990 // TODO: jit_mov_gc_ptr keeps a strong reference, which leaks the object.
2991 jit_mov_gc_ptr(jit, cb, REG1, sample_instance);
2992 cmp(cb, REG0, REG1);
2993 jit_chain_guard(JCC_JNE, jit, ctx, max_chain_depth, side_exit);
2995 else {
2996 RUBY_ASSERT(!val_type.is_imm);
2998 // Check that the receiver is a heap object
2999 // Note: if we get here, the class doesn't have immediate instances.
3000 if (!val_type.is_heap) {
3001 ADD_COMMENT(cb, "guard not immediate");
3002 RUBY_ASSERT(Qfalse < Qnil);
3003 test(cb, REG0, imm_opnd(RUBY_IMMEDIATE_MASK));
3004 jit_chain_guard(JCC_JNZ, jit, ctx, max_chain_depth, side_exit);
3005 cmp(cb, REG0, imm_opnd(Qnil));
3006 jit_chain_guard(JCC_JBE, jit, ctx, max_chain_depth, side_exit);
3008 ctx_upgrade_opnd_type(ctx, insn_opnd, TYPE_HEAP);
3011 x86opnd_t klass_opnd = mem_opnd(64, REG0, offsetof(struct RBasic, klass));
3013 // Bail if receiver class is different from known_klass
3014 // TODO: jit_mov_gc_ptr keeps a strong reference, which leaks the class.
3015 ADD_COMMENT(cb, "guard known class");
3016 jit_mov_gc_ptr(jit, cb, REG1, known_klass);
3017 cmp(cb, klass_opnd, REG1);
3018 jit_chain_guard(JCC_JNE, jit, ctx, max_chain_depth, side_exit);
3021 return true;
3024 // Generate ancestry guard for protected callee.
3025 // Calls to protected callees only go through when self.is_a?(klass_that_defines_the_callee).
3026 static void
3027 jit_protected_callee_ancestry_guard(jitstate_t *jit, codeblock_t *cb, const rb_callable_method_entry_t *cme, uint8_t *side_exit)
3029 // See vm_call_method().
3030 mov(cb, C_ARG_REGS[0], member_opnd(REG_CFP, rb_control_frame_t, self));
3031 jit_mov_gc_ptr(jit, cb, C_ARG_REGS[1], cme->defined_class);
3032 // Note: PC isn't written to current control frame as rb_is_kind_of() shouldn't raise.
3033 // VALUE rb_obj_is_kind_of(VALUE obj, VALUE klass);
3034 call_ptr(cb, REG0, (void *)&rb_obj_is_kind_of);
3035 test(cb, RAX, RAX);
3036 jz_ptr(cb, COUNTED_EXIT(jit, side_exit, send_se_protected_check_failed));
3039 // Return true when the codegen function generates code.
3040 // known_recv_klass is non-NULL when the caller has used jit_guard_known_klass().
3041 // See yjit_reg_method().
3042 typedef bool (*method_codegen_t)(jitstate_t *jit, ctx_t *ctx, const struct rb_callinfo *ci, const rb_callable_method_entry_t *cme, rb_iseq_t *block, const int32_t argc, VALUE *known_recv_klass);
3044 // Register a specialized codegen function for a particular method. Note that
3045 // if the function returns true, the code it generates runs without a
3046 // control frame and without interrupt checks. To avoid creating observable
3047 // behavior changes, the codegen function should only target simple code paths
3048 // that do not allocate and do not make method calls.
3049 static void
3050 yjit_reg_method(VALUE klass, const char *mid_str, method_codegen_t gen_fn)
3052 ID mid = rb_intern(mid_str);
3053 const rb_method_entry_t *me = rb_method_entry_at(klass, mid);
3055 if (!me) {
3056 rb_bug("undefined optimized method: %s", rb_id2name(mid));
3059 // For now, only cfuncs are supported
3060 RUBY_ASSERT(me && me->def);
3061 RUBY_ASSERT(me->def->type == VM_METHOD_TYPE_CFUNC);
3063 st_insert(yjit_method_codegen_table, (st_data_t)me->def->method_serial, (st_data_t)gen_fn);
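// For illustration only, a registration site is expected to look something
// like the following (using helpers defined below in this file):
//   yjit_reg_method(rb_cNilClass, "nil?", jit_rb_true);
//   yjit_reg_method(rb_cString, "to_s", jit_rb_str_to_s);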
3066 // Codegen for rb_obj_not().
3067 // Note, caller is responsible for generating all the right guards, including
3068 // arity guards.
3069 static bool
3070 jit_rb_obj_not(jitstate_t *jit, ctx_t *ctx, const struct rb_callinfo *ci, const rb_callable_method_entry_t *cme, rb_iseq_t *block, const int32_t argc, VALUE *known_recv_klass)
3072 const val_type_t recv_opnd = ctx_get_opnd_type(ctx, OPND_STACK(0));
3074 if (recv_opnd.type == ETYPE_NIL || recv_opnd.type == ETYPE_FALSE) {
3075 ADD_COMMENT(cb, "rb_obj_not(nil_or_false)");
3076 ctx_stack_pop(ctx, 1);
3077 x86opnd_t out_opnd = ctx_stack_push(ctx, TYPE_TRUE);
3078 mov(cb, out_opnd, imm_opnd(Qtrue));
3080 else if (recv_opnd.is_heap || recv_opnd.type != ETYPE_UNKNOWN) {
3081 // Note: recv_opnd.type != ETYPE_NIL && recv_opnd.type != ETYPE_FALSE.
3082 ADD_COMMENT(cb, "rb_obj_not(truthy)");
3083 ctx_stack_pop(ctx, 1);
3084 x86opnd_t out_opnd = ctx_stack_push(ctx, TYPE_FALSE);
3085 mov(cb, out_opnd, imm_opnd(Qfalse));
3087 else {
3088 // jit_guard_known_klass() already ran on the receiver which should
3089 // have deduced the type of the receiver. This case should be
3090 // rare if not unreachable.
3091 return false;
3093 return true;
3096 // Codegen for rb_true()
3097 static bool
3098 jit_rb_true(jitstate_t *jit, ctx_t *ctx, const struct rb_callinfo *ci, const rb_callable_method_entry_t *cme, rb_iseq_t *block, const int32_t argc, VALUE *known_recv_klass)
3100 ADD_COMMENT(cb, "nil? == true");
3101 ctx_stack_pop(ctx, 1);
3102 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_TRUE);
3103 mov(cb, stack_ret, imm_opnd(Qtrue));
3104 return true;
3107 // Codegen for rb_false()
3108 static bool
3109 jit_rb_false(jitstate_t *jit, ctx_t *ctx, const struct rb_callinfo *ci, const rb_callable_method_entry_t *cme, rb_iseq_t *block, const int32_t argc, VALUE *known_recv_klass)
3111 ADD_COMMENT(cb, "nil? == false");
3112 ctx_stack_pop(ctx, 1);
3113 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_FALSE);
3114 mov(cb, stack_ret, imm_opnd(Qfalse));
3115 return true;
3118 // Codegen for rb_obj_equal()
3119 // object identity comparison
3120 static bool
3121 jit_rb_obj_equal(jitstate_t *jit, ctx_t *ctx, const struct rb_callinfo *ci, const rb_callable_method_entry_t *cme, rb_iseq_t *block, const int32_t argc, VALUE *known_recv_klass)
3123 ADD_COMMENT(cb, "equal?");
3124 x86opnd_t obj1 = ctx_stack_pop(ctx, 1);
3125 x86opnd_t obj2 = ctx_stack_pop(ctx, 1);
3127 mov(cb, REG0, obj1);
3128 cmp(cb, REG0, obj2);
3129 mov(cb, REG0, imm_opnd(Qtrue));
3130 mov(cb, REG1, imm_opnd(Qfalse));
3131 cmovne(cb, REG0, REG1);
3133 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_IMM);
3134 mov(cb, stack_ret, REG0);
3135 return true;
3138 static VALUE
3139 yjit_str_bytesize(VALUE str)
3141 return LONG2NUM(RSTRING_LEN(str));
3144 static bool
3145 jit_rb_str_bytesize(jitstate_t *jit, ctx_t *ctx, const struct rb_callinfo *ci, const rb_callable_method_entry_t *cme, rb_iseq_t *block, const int32_t argc, VALUE *known_recv_klass)
3147 ADD_COMMENT(cb, "String#bytesize");
3149 x86opnd_t recv = ctx_stack_pop(ctx, 1);
3150 mov(cb, C_ARG_REGS[0], recv);
3151 call_ptr(cb, REG0, (void *)&yjit_str_bytesize);
3153 x86opnd_t out_opnd = ctx_stack_push(ctx, TYPE_FIXNUM);
3154 mov(cb, out_opnd, RAX);
3156 return true;
3159 // Codegen for rb_str_to_s()
3160 // When String#to_s is called on a String instance, the method returns self and
3161 // most of the overhead comes from setting up the method call. We observed that
3162 // this situation happens a lot in some workloads.
3163 static bool
3164 jit_rb_str_to_s(jitstate_t *jit, ctx_t *ctx, const struct rb_callinfo *ci, const rb_callable_method_entry_t *cme, rb_iseq_t *block, const int32_t argc, VALUE *recv_known_klass)
3166 if (recv_known_klass && *recv_known_klass == rb_cString) {
3167 ADD_COMMENT(cb, "to_s on plain string");
3168 // The method returns the receiver, which is already on the stack.
3169 // No stack movement.
3170 return true;
3172 return false;
3175 static bool
3176 jit_thread_s_current(jitstate_t *jit, ctx_t *ctx, const struct rb_callinfo *ci, const rb_callable_method_entry_t *cme, rb_iseq_t *block, const int32_t argc, VALUE *recv_known_klass)
3178 ADD_COMMENT(cb, "Thread.current");
3179 ctx_stack_pop(ctx, 1);
3181 // ec->thread_ptr
3182 mov(cb, REG0, member_opnd(REG_EC, rb_execution_context_t, thread_ptr));
3184 // thread->self
3185 mov(cb, REG0, member_opnd(REG0, rb_thread_t, self));
3187 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_HEAP);
3188 mov(cb, stack_ret, REG0);
3189 return true;
3192 // Check if we know how to codegen for a particular cfunc method
3193 static method_codegen_t
3194 lookup_cfunc_codegen(const rb_method_definition_t *def)
3196 method_codegen_t gen_fn;
3197 if (st_lookup(yjit_method_codegen_table, def->method_serial, (st_data_t *)&gen_fn)) {
3198 return gen_fn;
3200 return NULL;
3203 // Is anyone listening for :c_call and :c_return events currently?
3204 static bool
3205 c_method_tracing_currently_enabled(const jitstate_t *jit)
3207 rb_event_flag_t tracing_events;
3208 if (rb_multi_ractor_p()) {
3209 tracing_events = ruby_vm_event_enabled_global_flags;
3211 else {
3212 // At the time of writing, events are never removed from
3213 // ruby_vm_event_enabled_global_flags, so if we always checked that set we
3214 // would never compile again even after tracing is disabled.
3215 tracing_events = rb_ec_ractor_hooks(jit->ec)->events;
3218 return tracing_events & (RUBY_EVENT_C_CALL | RUBY_EVENT_C_RETURN);
3221 static codegen_status_t
3222 gen_send_cfunc(jitstate_t *jit, ctx_t *ctx, const struct rb_callinfo *ci, const rb_callable_method_entry_t *cme, rb_iseq_t *block, const int32_t argc, VALUE *recv_known_klass)
3224 const rb_method_cfunc_t *cfunc = UNALIGNED_MEMBER_PTR(cme->def, body.cfunc);
3226 // If the function expects a Ruby array of arguments
3227 if (cfunc->argc < 0 && cfunc->argc != -1) {
3228 GEN_COUNTER_INC(cb, send_cfunc_ruby_array_varg);
3229 return YJIT_CANT_COMPILE;
3232 // If the argument count doesn't match
3233 if (cfunc->argc >= 0 && cfunc->argc != argc) {
3234 GEN_COUNTER_INC(cb, send_cfunc_argc_mismatch);
3235 return YJIT_CANT_COMPILE;
3238 // Don't JIT functions that need C stack arguments for now
3239 if (cfunc->argc >= 0 && argc + 1 > NUM_C_ARG_REGS) {
3240 GEN_COUNTER_INC(cb, send_cfunc_toomany_args);
3241 return YJIT_CANT_COMPILE;
3244 if (c_method_tracing_currently_enabled(jit)) {
3245 // Don't JIT if tracing c_call or c_return
3246 GEN_COUNTER_INC(cb, send_cfunc_tracing);
3247 return YJIT_CANT_COMPILE;
3250 // Delegate to codegen for C methods if we have it.
3252 method_codegen_t known_cfunc_codegen;
3253 if ((known_cfunc_codegen = lookup_cfunc_codegen(cme->def))) {
3254 if (known_cfunc_codegen(jit, ctx, ci, cme, block, argc, recv_known_klass)) {
3255 // cfunc codegen generated code. Terminate the block so
3256 // there aren't multiple calls in the same block.
3257 jit_jump_to_next_insn(jit, ctx);
3258 return YJIT_END_BLOCK;
3263 // Callee method ID
3264 //ID mid = vm_ci_mid(ci);
3265 //printf("JITting call to C function \"%s\", argc: %lu\n", rb_id2name(mid), argc);
3266 //print_str(cb, "");
3267 //print_str(cb, "calling CFUNC:");
3268 //print_str(cb, rb_id2name(mid));
3269 //print_str(cb, "recv");
3270 //print_ptr(cb, recv);
3272 // Create a side-exit to fall back to the interpreter
3273 uint8_t *side_exit = yjit_side_exit(jit, ctx);
3275 // Check for interrupts
3276 yjit_check_ints(cb, side_exit);
3278 // Stack overflow check
3279 // #define CHECK_VM_STACK_OVERFLOW0(cfp, sp, margin)
3280 // REG_CFP <= REG_SP + 4 * sizeof(VALUE) + sizeof(rb_control_frame_t)
3281 lea(cb, REG0, ctx_sp_opnd(ctx, sizeof(VALUE) * 4 + 2 * sizeof(rb_control_frame_t)));
3282 cmp(cb, REG_CFP, REG0);
3283 jle_ptr(cb, COUNTED_EXIT(jit, side_exit, send_se_cf_overflow));
3285 // Points to the receiver operand on the stack
3286 x86opnd_t recv = ctx_stack_opnd(ctx, argc);
3288 // Store incremented PC into current control frame in case callee raises.
3289 jit_save_pc(jit, REG0);
3291 if (block) {
3292 // Change cfp->block_code in the current frame. See vm_caller_setup_arg_block().
3293 // VM_CFP_TO_CAPTURED_BLOCK does &cfp->self; rb_captured_block->code.iseq aliases
3294 // with cfp->block_code.
3295 jit_mov_gc_ptr(jit, cb, REG0, (VALUE)block);
3296 mov(cb, member_opnd(REG_CFP, rb_control_frame_t, block_code), REG0);
3299 // Increment the stack pointer by 3 (in the callee)
3300 // sp += 3
3301 lea(cb, REG0, ctx_sp_opnd(ctx, sizeof(VALUE) * 3));
3303 // Write method entry at sp[-3]
3304 // sp[-3] = me;
3305 // Put compile time cme into REG1. It's assumed to be valid because we are notified when
3306 // any cme we depend on becomes outdated. See rb_yjit_method_lookup_change().
3307 jit_mov_gc_ptr(jit, cb, REG1, (VALUE)cme);
3308 mov(cb, mem_opnd(64, REG0, 8 * -3), REG1);
3310 // Write block handler at sp[-2]
3311 // sp[-2] = block_handler;
3312 if (block) {
3313 // reg1 = VM_BH_FROM_ISEQ_BLOCK(VM_CFP_TO_CAPTURED_BLOCK(reg_cfp));
3314 lea(cb, REG1, member_opnd(REG_CFP, rb_control_frame_t, self));
3315 or(cb, REG1, imm_opnd(1));
3316 mov(cb, mem_opnd(64, REG0, 8 * -2), REG1);
3318 else {
3319 mov(cb, mem_opnd(64, REG0, 8 * -2), imm_opnd(VM_BLOCK_HANDLER_NONE));
3322 // Write env flags at sp[-1]
3323 // sp[-1] = frame_type;
3324 uint64_t frame_type = VM_FRAME_MAGIC_CFUNC | VM_FRAME_FLAG_CFRAME | VM_ENV_FLAG_LOCAL;
3325 mov(cb, mem_opnd(64, REG0, 8 * -1), imm_opnd(frame_type));
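// Sketch of the three slots written above, relative to the callee's sp
// (REG0 points at sp after the lea further up):
//   sp[-3] = cme            -- callable method entry
//   sp[-2] = block_handler  -- or VM_BLOCK_HANDLER_NONE
//   sp[-1] = frame_type     -- VM_FRAME_MAGIC_CFUNC | VM_FRAME_FLAG_CFRAME | VM_ENV_FLAG_LOCAL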
3327 // Allocate a new CFP (ec->cfp--)
3328 sub(
3330 member_opnd(REG_EC, rb_execution_context_t, cfp),
3331 imm_opnd(sizeof(rb_control_frame_t))
3334 // Setup the new frame
3335 // *cfp = (const struct rb_control_frame_struct) {
3336 // .pc = 0,
3337 // .sp = sp,
3338 // .iseq = 0,
3339 // .self = recv,
3340 // .ep = sp - 1,
3341 // .block_code = 0,
3342 // .__bp__ = sp,
3343 // };
3344 mov(cb, REG1, member_opnd(REG_EC, rb_execution_context_t, cfp));
3345 mov(cb, member_opnd(REG1, rb_control_frame_t, pc), imm_opnd(0));
3346 mov(cb, member_opnd(REG1, rb_control_frame_t, sp), REG0);
3347 mov(cb, member_opnd(REG1, rb_control_frame_t, iseq), imm_opnd(0));
3348 mov(cb, member_opnd(REG1, rb_control_frame_t, block_code), imm_opnd(0));
3349 mov(cb, member_opnd(REG1, rb_control_frame_t, __bp__), REG0);
3350 sub(cb, REG0, imm_opnd(sizeof(VALUE)));
3351 mov(cb, member_opnd(REG1, rb_control_frame_t, ep), REG0);
3352 mov(cb, REG0, recv);
3353 mov(cb, member_opnd(REG1, rb_control_frame_t, self), REG0);
3355 // Verify that we are calling the right function
3356 if (YJIT_CHECK_MODE > 0) {
3357 // Call check_cfunc_dispatch
3358 mov(cb, C_ARG_REGS[0], recv);
3359 jit_mov_gc_ptr(jit, cb, C_ARG_REGS[1], (VALUE)ci);
3360 mov(cb, C_ARG_REGS[2], const_ptr_opnd((void *)cfunc->func));
3361 jit_mov_gc_ptr(jit, cb, C_ARG_REGS[3], (VALUE)cme);
3362 call_ptr(cb, REG0, (void *)&check_cfunc_dispatch);
3365 // Copy SP into RAX because REG_SP will get overwritten
3366 lea(cb, RAX, ctx_sp_opnd(ctx, 0));
3368 // Pop the C function arguments from the stack (in the caller)
3369 ctx_stack_pop(ctx, argc + 1);
3371 // Write interpreter SP into CFP.
3372 // Needed in case the callee yields to the block.
3373 jit_save_sp(jit, ctx);
3375 // Non-variadic method
3376 if (cfunc->argc >= 0) {
3377 // Copy the arguments from the stack to the C argument registers
3378 // self is the 0th argument and is at index argc from the stack top
3379 for (int32_t i = 0; i < argc + 1; ++i)
3381 x86opnd_t stack_opnd = mem_opnd(64, RAX, -(argc + 1 - i) * SIZEOF_VALUE);
3382 x86opnd_t c_arg_reg = C_ARG_REGS[i];
3383 mov(cb, c_arg_reg, stack_opnd);
3386 // Variadic method
3387 if (cfunc->argc == -1) {
3388 // The method gets a pointer to the first argument
3389 // rb_f_puts(int argc, VALUE *argv, VALUE recv)
3390 mov(cb, C_ARG_REGS[0], imm_opnd(argc));
3391 lea(cb, C_ARG_REGS[1], mem_opnd(64, RAX, -(argc) * SIZEOF_VALUE));
3392 mov(cb, C_ARG_REGS[2], mem_opnd(64, RAX, -(argc + 1) * SIZEOF_VALUE));
3395 // Call the C function
3396 // VALUE ret = (cfunc->func)(recv, argv[0], argv[1]);
3397 // cfunc comes from compile-time cme->def, which we assume to be stable.
3398 // Invalidation logic is in rb_yjit_method_lookup_change()
3399 call_ptr(cb, REG0, (void*)cfunc->func);
3401 // Record code position for TracePoint patching. See full_cfunc_return().
3402 record_global_inval_patch(cb, outline_full_cfunc_return_pos);
3404 // Push the return value on the Ruby stack
3405 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_UNKNOWN);
3406 mov(cb, stack_ret, RAX);
3408 // Pop the stack frame (ec->cfp++)
3409 add(
3411 member_opnd(REG_EC, rb_execution_context_t, cfp),
3412 imm_opnd(sizeof(rb_control_frame_t))
3415 // cfunc calls may corrupt types
3416 ctx_clear_local_types(ctx);
3418 // Note: the return block of gen_send_iseq() has ctx->sp_offset == 1
3419 // which allows for sharing the same successor.
3421 // Jump (fall through) to the call continuation block
3422 // We do this to end the current block after the call
3423 jit_jump_to_next_insn(jit, ctx);
3424 return YJIT_END_BLOCK;
3427 static void
3428 gen_return_branch(codeblock_t *cb, uint8_t *target0, uint8_t *target1, uint8_t shape)
3430 switch (shape) {
3431 case SHAPE_NEXT0:
3432 case SHAPE_NEXT1:
3433 RUBY_ASSERT(false);
3434 break;
3436 case SHAPE_DEFAULT:
3437 mov(cb, REG0, const_ptr_opnd(target0));
3438 mov(cb, member_opnd(REG_CFP, rb_control_frame_t, jit_return), REG0);
3439 break;
3443 // If true, the iseq is leaf and it can be replaced by a single C call.
3444 static bool
3445 rb_leaf_invokebuiltin_iseq_p(const rb_iseq_t *iseq)
3447 unsigned int invokebuiltin_len = insn_len(BIN(opt_invokebuiltin_delegate_leave));
3448 unsigned int leave_len = insn_len(BIN(leave));
3450 return (iseq->body->iseq_size == (invokebuiltin_len + leave_len) &&
3451 rb_vm_insn_addr2opcode((void *)iseq->body->iseq_encoded[0]) == BIN(opt_invokebuiltin_delegate_leave) &&
3452 rb_vm_insn_addr2opcode((void *)iseq->body->iseq_encoded[invokebuiltin_len]) == BIN(leave) &&
3453 iseq->body->builtin_inline_p
3457 // Return an rb_builtin_function if the iseq contains only that leaf builtin function.
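// Layout sketch of such an iseq (informal): iseq_encoded holds just
//   [0]                  opt_invokebuiltin_delegate_leave
//   [1]                  operand: pointer to the rb_builtin_function
//   [invokebuiltin_len]  leave
// which is why the builtin can be pulled straight out of iseq_encoded[1].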
3458 static const struct rb_builtin_function*
3459 rb_leaf_builtin_function(const rb_iseq_t *iseq)
3461 if (!rb_leaf_invokebuiltin_iseq_p(iseq))
3462 return NULL;
3463 return (const struct rb_builtin_function *)iseq->body->iseq_encoded[1];
3466 static codegen_status_t
3467 gen_send_iseq(jitstate_t *jit, ctx_t *ctx, const struct rb_callinfo *ci, const rb_callable_method_entry_t *cme, rb_iseq_t *block, int32_t argc)
3469 const rb_iseq_t *iseq = def_iseq_ptr(cme->def);
3471 // When you have keyword arguments, there is an extra object that gets
3472 // placed on the stack that represents a bitmap of the keywords that were not
3473 // specified at the call site. We need to keep track of the fact that this
3474 // value is present on the stack in order to properly set up the callee's
3475 // stack pointer.
3476 const bool doing_kw_call = iseq->body->param.flags.has_kw;
3477 const bool supplying_kws = vm_ci_flag(ci) & VM_CALL_KWARG;
3479 if (vm_ci_flag(ci) & VM_CALL_TAILCALL) {
3480 // We can't handle tailcalls
3481 GEN_COUNTER_INC(cb, send_iseq_tailcall);
3482 return YJIT_CANT_COMPILE;
3485 // No support for callees with these parameters yet as they require allocation
3486 // or complex handling.
3487 if (iseq->body->param.flags.has_rest ||
3488 iseq->body->param.flags.has_post ||
3489 iseq->body->param.flags.has_kwrest) {
3490 GEN_COUNTER_INC(cb, send_iseq_complex_callee);
3491 return YJIT_CANT_COMPILE;
3494 // If we have keyword arguments being passed to a callee that only takes
3495 // positionals, then we need to allocate a hash. For now we're going to
3496 // call that too complex and bail.
3497 if (supplying_kws && !iseq->body->param.flags.has_kw) {
3498 GEN_COUNTER_INC(cb, send_iseq_complex_callee);
3499 return YJIT_CANT_COMPILE;
3502 // If we have a method accepting no kwargs (**nil), exit if we have passed
3503 // it any kwargs.
3504 if (supplying_kws && iseq->body->param.flags.accepts_no_kwarg) {
3505 GEN_COUNTER_INC(cb, send_iseq_complex_callee);
3506 return YJIT_CANT_COMPILE;
3509 // For computing number of locals to setup for the callee
3510 int num_params = iseq->body->param.size;
3512 // Block parameter handling. This mirrors setup_parameters_complex().
3513 if (iseq->body->param.flags.has_block) {
3514 if (iseq->body->local_iseq == iseq) {
3515 // Block argument is passed through EP and not set up as a local in
3516 // the callee.
3517 num_params--;
3519 else {
3520 // In this case (param.flags.has_block && local_iseq != iseq),
3521 // the block argument is set up as a local variable and requires
3522 // materialization (allocation). Bail.
3523 GEN_COUNTER_INC(cb, send_iseq_complex_callee);
3524 return YJIT_CANT_COMPILE;
3528 uint32_t start_pc_offset = 0;
3530 const int required_num = iseq->body->param.lead_num;
3532 // This struct represents the metadata about the caller-specified
3533 // keyword arguments.
3534 const struct rb_callinfo_kwarg *kw_arg = vm_ci_kwarg(ci);
3535 const int kw_arg_num = kw_arg ? kw_arg->keyword_len : 0;
3537 // Arity handling and optional parameter setup
3538 const int opts_filled = argc - required_num - kw_arg_num;
3539 const int opt_num = iseq->body->param.opt_num;
3540 const int opts_missing = opt_num - opts_filled;
3542 if (opts_filled < 0 || opts_filled > opt_num) {
3543 GEN_COUNTER_INC(cb, send_iseq_arity_error);
3544 return YJIT_CANT_COMPILE;
3547 // If we have unfilled optional arguments and keyword arguments then we
3548 // would need to adjust the argument locations to account for that.
3549 // For now we aren't handling this case.
3550 if (doing_kw_call && opts_missing > 0) {
3551 GEN_COUNTER_INC(cb, send_iseq_complex_callee);
3552 return YJIT_CANT_COMPILE;
3555 if (opt_num > 0) {
3556 num_params -= opt_num - opts_filled;
3557 start_pc_offset = (uint32_t)iseq->body->param.opt_table[opts_filled];
3560 if (doing_kw_call) {
3561 // Here we're calling a method with keyword arguments and specifying
3562 // keyword arguments at this call site.
3564 // This struct represents the metadata about the callee-specified
3565 // keyword parameters.
3566 const struct rb_iseq_param_keyword *keyword = iseq->body->param.keyword;
3568 int required_kwargs_filled = 0;
3570 if (keyword->num > 30) {
3571 // We have so many keywords that (1 << num) encoded as a FIXNUM
3572 // (which shifts it left one more) no longer fits inside a 32-bit
3573 // immediate.
3574 GEN_COUNTER_INC(cb, send_iseq_complex_callee);
3575 return YJIT_CANT_COMPILE;
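// Rough arithmetic behind the limit above (illustrative): unspecified_bits is built
// further down with bits 0..(num - required_num - 1). With num == 31 it can reach
// 0x40000000 (bit 30), and INT2FIX() encodes that as (0x40000000 << 1) | 1 == 0x80000001,
// which no longer fits in imm_opnd()'s signed 32-bit immediate, while num <= 30 keeps
// every encoded bitmap within INT32_MAX.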
3578 // Check that the kwargs being passed are valid
3579 if (supplying_kws) {
3580 // This is the list of keyword arguments that the callee specified
3581 // in its initial declaration.
3582 const ID *callee_kwargs = keyword->table;
3584 // Here we're going to build up a list of the IDs that correspond to
3585 // the caller-specified keyword arguments. If they're not in the
3586 // same order as the order specified in the callee declaration, then
3587 // we're going to need to generate some code to swap values around
3588 // on the stack.
3589 ID *caller_kwargs = ALLOCA_N(VALUE, kw_arg->keyword_len);
3590 for (int kwarg_idx = 0; kwarg_idx < kw_arg->keyword_len; kwarg_idx++)
3591 caller_kwargs[kwarg_idx] = SYM2ID(kw_arg->keywords[kwarg_idx]);
3593 // First, we're going to be sure that the names of every
3594 // caller-specified keyword argument correspond to a name in the
3595 // list of callee-specified keyword parameters.
3596 for (int caller_idx = 0; caller_idx < kw_arg->keyword_len; caller_idx++) {
3597 int callee_idx;
3599 for (callee_idx = 0; callee_idx < keyword->num; callee_idx++) {
3600 if (caller_kwargs[caller_idx] == callee_kwargs[callee_idx]) {
3601 break;
3605 // If the keyword was never found, then we know we have a
3606 // mismatch in the names of the keyword arguments, so we need to
3607 // bail.
3608 if (callee_idx == keyword->num) {
3609 GEN_COUNTER_INC(cb, send_iseq_kwargs_mismatch);
3610 return YJIT_CANT_COMPILE;
3613 // Keep a count to ensure all required kwargs are specified
3614 if (callee_idx < keyword->required_num) {
3615 required_kwargs_filled++;
3620 RUBY_ASSERT(required_kwargs_filled <= keyword->required_num);
3621 if (required_kwargs_filled != keyword->required_num) {
3622 GEN_COUNTER_INC(cb, send_iseq_kwargs_mismatch);
3623 return YJIT_CANT_COMPILE;
3627 // Number of locals that are not parameters
3628 const int num_locals = iseq->body->local_table_size - num_params;
3630 // Create a side-exit to fall back to the interpreter
3631 uint8_t *side_exit = yjit_side_exit(jit, ctx);
3633 // Check for interrupts
3634 yjit_check_ints(cb, side_exit);
3636 const struct rb_builtin_function *leaf_builtin = rb_leaf_builtin_function(iseq);
3638 if (leaf_builtin && !block && leaf_builtin->argc + 1 <= NUM_C_ARG_REGS) {
3639 ADD_COMMENT(cb, "inlined leaf builtin");
3641 // Call the builtin func (ec, recv, arg1, arg2, ...)
3642 mov(cb, C_ARG_REGS[0], REG_EC);
3644 // Copy self and arguments
3645 for (int32_t i = 0; i < leaf_builtin->argc + 1; i++) {
3646 x86opnd_t stack_opnd = ctx_stack_opnd(ctx, leaf_builtin->argc - i);
3647 x86opnd_t c_arg_reg = C_ARG_REGS[i + 1];
3648 mov(cb, c_arg_reg, stack_opnd);
3650 ctx_stack_pop(ctx, leaf_builtin->argc + 1);
3651 call_ptr(cb, REG0, (void *)leaf_builtin->func_ptr);
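// For example, with leaf_builtin->argc == 2 the copy loop above produces:
//   C_ARG_REGS[1] = stack operand at depth 2 (self)
//   C_ARG_REGS[2] = stack operand at depth 1 (first argument)
//   C_ARG_REGS[3] = stack operand at depth 0 (second argument)
// with REG_EC already in C_ARG_REGS[0], matching the (ec, recv, args...) convention.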
3653 // Push the return value
3654 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_UNKNOWN);
3655 mov(cb, stack_ret, RAX);
3657 // Note: assuming that the leaf builtin doesn't change local variables here.
3658 // Seems like a safe assumption.
3660 return YJIT_KEEP_COMPILING;
3663 // Stack overflow check
3664 // Note that vm_push_frame checks it against a decremented cfp, hence the multiply by 2.
3665 // #define CHECK_VM_STACK_OVERFLOW0(cfp, sp, margin)
3666 ADD_COMMENT(cb, "stack overflow check");
3667 lea(cb, REG0, ctx_sp_opnd(ctx, sizeof(VALUE) * (num_locals + iseq->body->stack_max) + 2 * sizeof(rb_control_frame_t)));
3668 cmp(cb, REG_CFP, REG0);
3669 jle_ptr(cb, COUNTED_EXIT(jit, side_exit, send_se_cf_overflow));
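// A rough reading of the check above (sizes illustrative): with num_locals == 2 and
// stack_max == 3, REG0 = sp + 5 * sizeof(VALUE) + 2 * sizeof(rb_control_frame_t).
// Control frames grow down toward the value stack, so if REG_CFP <= REG0 the callee's
// frame could collide with the value stack and we side exit instead of pushing it.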
3671 if (doing_kw_call) {
3672 // Here we're calling a method with keyword arguments and specifying
3673 // keyword arguments at this call site.
3675 // Number of positional arguments the callee expects before the first
3676 // keyword argument
3677 const int args_before_kw = required_num + opt_num;
3679 // This struct represents the metadata about the caller-specified
3680 // keyword arguments.
3681 int caller_keyword_len = 0;
3682 const VALUE *caller_keywords = NULL;
3683 if (vm_ci_kwarg(ci)) {
3684 caller_keyword_len = vm_ci_kwarg(ci)->keyword_len;
3685 caller_keywords = &vm_ci_kwarg(ci)->keywords[0];
3688 // This struct represents the metadata about the callee-specified
3689 // keyword parameters.
3690 const struct rb_iseq_param_keyword *const keyword = iseq->body->param.keyword;
3692 ADD_COMMENT(cb, "keyword args");
3694 // This is the list of keyword arguments that the callee specified
3695 // in its initial declaration.
3696 const ID *callee_kwargs = keyword->table;
3698 int total_kwargs = keyword->num;
3700 // Here we're going to build up a list of the IDs that correspond to
3701 // the caller-specified keyword arguments. If they're not in the
3702 // same order as the order specified in the callee declaration, then
3703 // we're going to need to generate some code to swap values around
3704 // on the stack.
3705 ID *caller_kwargs = ALLOCA_N(VALUE, total_kwargs);
3706 int kwarg_idx;
3707 for (kwarg_idx = 0; kwarg_idx < caller_keyword_len; kwarg_idx++) {
3708 caller_kwargs[kwarg_idx] = SYM2ID(caller_keywords[kwarg_idx]);
3711 int unspecified_bits = 0;
3713 for (int callee_idx = keyword->required_num; callee_idx < total_kwargs; callee_idx++) {
3714 bool already_passed = false;
3715 ID callee_kwarg = callee_kwargs[callee_idx];
3717 for (int caller_idx = 0; caller_idx < caller_keyword_len; caller_idx++) {
3718 if (caller_kwargs[caller_idx] == callee_kwarg) {
3719 already_passed = true;
3720 break;
3724 if (!already_passed) {
3725 // Reserve space on the stack for each default value we'll be
3726 // filling in (done just below). Also increment argc so that the
3727 // callee's SP is recorded correctly.
3728 argc++;
3729 x86opnd_t default_arg = ctx_stack_push(ctx, TYPE_UNKNOWN);
3730 VALUE default_value = keyword->default_values[callee_idx - keyword->required_num];
3732 if (default_value == Qundef) {
3733 // Qundef means that this value is not constant and must be
3734 // recalculated at runtime, so we record it in unspecified_bits
3735 // (Qnil is then used as a placeholder instead of Qundef).
3736 unspecified_bits |= 0x01 << (callee_idx - keyword->required_num);
3737 default_value = Qnil;
3740 // GC might move default_value.
3741 jit_mov_gc_ptr(jit, cb, REG0, default_value);
3742 mov(cb, default_arg, REG0);
3744 caller_kwargs[kwarg_idx++] = callee_kwarg;
3747 RUBY_ASSERT(kwarg_idx == total_kwargs);
3749 // Next, we're going to loop through every keyword that was
3750 // specified by the caller and make sure that it's in the correct
3751 // place. If it's not we're going to swap it around with another one.
3752 for (kwarg_idx = 0; kwarg_idx < total_kwargs; kwarg_idx++) {
3753 ID callee_kwarg = callee_kwargs[kwarg_idx];
3755 // If the argument is already in the right order, then we don't
3756 // need to generate any code since the expected value is already
3757 // in the right place on the stack.
3758 if (callee_kwarg == caller_kwargs[kwarg_idx]) continue;
3760 // In this case the argument is not in the right place, so we
3761 // need to find its position where it _should_ be and swap with
3762 // that location.
3763 for (int swap_idx = kwarg_idx + 1; swap_idx < total_kwargs; swap_idx++) {
3764 if (callee_kwarg == caller_kwargs[swap_idx]) {
3765 // First we're going to generate the code that is going
3766 // to perform the actual swapping at runtime.
3767 stack_swap(ctx, cb, argc - 1 - swap_idx - args_before_kw, argc - 1 - kwarg_idx - args_before_kw, REG1, REG0);
3769 // Next we're going to do some bookkeeping on our end so
3770 // that we know the order that the arguments are
3771 // actually in now.
3772 ID tmp = caller_kwargs[kwarg_idx];
3773 caller_kwargs[kwarg_idx] = caller_kwargs[swap_idx];
3774 caller_kwargs[swap_idx] = tmp;
3776 break;
3781 // Keyword arguments cause a special extra local variable to be
3782 // pushed onto the stack that represents the parameters that weren't
3783 // explicitly given a value and have a non-constant default.
3784 mov(cb, ctx_stack_opnd(ctx, -1), imm_opnd(INT2FIX(unspecified_bits)));
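/* The reordering above is effectively a selection pass over the caller's keyword IDs:
 * for each callee slot we find where the expected ID currently sits and swap, with
 * stack_swap() emitting the matching runtime code. A compile-time-only sketch of that
 * bookkeeping (hypothetical helper, not part of this file):
 *
 *     static void
 *     kwarg_reorder_sketch(ID *caller_kwargs, const ID *callee_kwargs, int total_kwargs)
 *     {
 *         for (int i = 0; i < total_kwargs; i++) {
 *             if (caller_kwargs[i] == callee_kwargs[i]) continue;
 *             for (int j = i + 1; j < total_kwargs; j++) {
 *                 if (caller_kwargs[j] == callee_kwargs[i]) {
 *                     ID tmp = caller_kwargs[i];      // mirrors the runtime stack_swap()
 *                     caller_kwargs[i] = caller_kwargs[j];
 *                     caller_kwargs[j] = tmp;
 *                     break;
 *                 }
 *             }
 *         }
 *     }
 */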
3786 // Points to the receiver operand on the stack
3787 x86opnd_t recv = ctx_stack_opnd(ctx, argc);
3789 // Store the updated SP on the current frame (pop arguments and receiver)
3790 ADD_COMMENT(cb, "store caller sp");
3791 lea(cb, REG0, ctx_sp_opnd(ctx, sizeof(VALUE) * -(argc + 1)));
3792 mov(cb, member_opnd(REG_CFP, rb_control_frame_t, sp), REG0);
3794 // Store the next PC in the current frame
3795 jit_save_pc(jit, REG0);
3797 if (block) {
3798 // Change cfp->block_code in the current frame. See vm_caller_setup_arg_block().
3799 // VM_CFP_TO_CAPTURED_BLOCK does &cfp->self; rb_captured_block->code.iseq aliases
3800 // with cfp->block_code.
3801 jit_mov_gc_ptr(jit, cb, REG0, (VALUE)block);
3802 mov(cb, member_opnd(REG_CFP, rb_control_frame_t, block_code), REG0);
3805 // Adjust the callee's stack pointer
3806 lea(cb, REG0, ctx_sp_opnd(ctx, sizeof(VALUE) * (3 + num_locals + doing_kw_call)));
3808 // Initialize local variables to Qnil
3809 for (int i = 0; i < num_locals; i++) {
3810 mov(cb, mem_opnd(64, REG0, sizeof(VALUE) * (i - num_locals - 3)), imm_opnd(Qnil));
3813 ADD_COMMENT(cb, "push env");
3814 // Put compile time cme into REG1. It's assumed to be valid because we are notified when
3815 // any cme we depend on becomes outdated. See rb_yjit_method_lookup_change().
3816 jit_mov_gc_ptr(jit, cb, REG1, (VALUE)cme);
3817 // Write method entry at sp[-3]
3818 // sp[-3] = me;
3819 mov(cb, mem_opnd(64, REG0, 8 * -3), REG1);
3821 // Write block handler at sp[-2]
3822 // sp[-2] = block_handler;
3823 if (block) {
3824 // reg1 = VM_BH_FROM_ISEQ_BLOCK(VM_CFP_TO_CAPTURED_BLOCK(reg_cfp));
3825 lea(cb, REG1, member_opnd(REG_CFP, rb_control_frame_t, self));
3826 or(cb, REG1, imm_opnd(1));
3827 mov(cb, mem_opnd(64, REG0, 8 * -2), REG1);
3829 else {
3830 mov(cb, mem_opnd(64, REG0, 8 * -2), imm_opnd(VM_BLOCK_HANDLER_NONE));
3833 // Write env flags at sp[-1]
3834 // sp[-1] = frame_type;
3835 uint64_t frame_type = VM_FRAME_MAGIC_METHOD | VM_ENV_FLAG_LOCAL;
3836 mov(cb, mem_opnd(64, REG0, 8 * -1), imm_opnd(frame_type));
3838 ADD_COMMENT(cb, "push callee CFP");
3839 // Allocate a new CFP (ec->cfp--)
3840 sub(cb, REG_CFP, imm_opnd(sizeof(rb_control_frame_t)));
3841 mov(cb, member_opnd(REG_EC, rb_execution_context_t, cfp), REG_CFP);
3843 // Setup the new frame
3844 // *cfp = (const struct rb_control_frame_struct) {
3845 // .pc = pc,
3846 // .sp = sp,
3847 // .iseq = iseq,
3848 // .self = recv,
3849 // .ep = sp - 1,
3850 // .block_code = 0,
3851 // .__bp__ = sp,
3852 // };
3853 mov(cb, REG1, recv);
3854 mov(cb, member_opnd(REG_CFP, rb_control_frame_t, self), REG1);
3855 mov(cb, REG_SP, REG0); // Switch to the callee's REG_SP
3856 mov(cb, member_opnd(REG_CFP, rb_control_frame_t, sp), REG0);
3857 mov(cb, member_opnd(REG_CFP, rb_control_frame_t, __bp__), REG0);
3858 sub(cb, REG0, imm_opnd(sizeof(VALUE)));
3859 mov(cb, member_opnd(REG_CFP, rb_control_frame_t, ep), REG0);
3860 jit_mov_gc_ptr(jit, cb, REG0, (VALUE)iseq);
3861 mov(cb, member_opnd(REG_CFP, rb_control_frame_t, iseq), REG0);
3862 mov(cb, member_opnd(REG_CFP, rb_control_frame_t, block_code), imm_opnd(0));
3864 // No need to set cfp->pc since the callee sets it whenever calling into routines
3865 // that could look at it through jit_save_pc().
3866 // mov(cb, REG0, const_ptr_opnd(start_pc));
3867 // mov(cb, member_opnd(REG_CFP, rb_control_frame_t, pc), REG0);
3869 // Stub so we can return to JITted code
3870 blockid_t return_block = { jit->iseq, jit_next_insn_idx(jit) };
3872 // Create a context for the callee
3873 ctx_t callee_ctx = DEFAULT_CTX;
3875 // Set the argument types in the callee's context
3876 for (int32_t arg_idx = 0; arg_idx < argc; ++arg_idx) {
3877 val_type_t arg_type = ctx_get_opnd_type(ctx, OPND_STACK(argc - arg_idx - 1));
3878 ctx_set_local_type(&callee_ctx, arg_idx, arg_type);
3880 val_type_t recv_type = ctx_get_opnd_type(ctx, OPND_STACK(argc));
3881 ctx_upgrade_opnd_type(&callee_ctx, OPND_SELF, recv_type);
3883 // The callee might change locals through Kernel#binding and other means.
3884 ctx_clear_local_types(ctx);
3886 // Pop arguments and receiver in return context, push the return value
3887 // After the return, sp_offset will be 1. The codegen for leave writes
3888 // the return value in case of JIT-to-JIT return.
3889 ctx_t return_ctx = *ctx;
3890 ctx_stack_pop(&return_ctx, argc + 1);
3891 ctx_stack_push(&return_ctx, TYPE_UNKNOWN);
3892 return_ctx.sp_offset = 1;
3893 return_ctx.chain_depth = 0;
3895 // Write the JIT return address on the callee frame
3896 gen_branch(
3897 jit,
3898 ctx,
3899 return_block,
3900 &return_ctx,
3901 return_block,
3902 &return_ctx,
3903 gen_return_branch
3906 //print_str(cb, "calling Ruby func:");
3907 //print_str(cb, rb_id2name(vm_ci_mid(ci)));
3909 // Directly jump to the entry point of the callee
3910 gen_direct_jump(
3911 jit,
3912 &callee_ctx,
3913 (blockid_t){ iseq, start_pc_offset }
3916 return YJIT_END_BLOCK;
3919 static codegen_status_t
3920 gen_struct_aref(jitstate_t *jit, ctx_t *ctx, const struct rb_callinfo *ci, const rb_callable_method_entry_t *cme, VALUE comptime_recv, VALUE comptime_recv_klass) {
3921 if (vm_ci_argc(ci) != 0) {
3922 return YJIT_CANT_COMPILE;
3925 const unsigned int off = cme->def->body.optimized.index;
3927 // Confidence checks
3928 RUBY_ASSERT_ALWAYS(RB_TYPE_P(comptime_recv, T_STRUCT));
3929 RUBY_ASSERT_ALWAYS((long)off < RSTRUCT_LEN(comptime_recv));
3931 // We are going to use an encoding that takes a 4-byte immediate which
3932 // limits the offset to INT32_MAX.
3934 uint64_t native_off = (uint64_t)off * (uint64_t)SIZEOF_VALUE;
3935 if (native_off > (uint64_t)INT32_MAX) {
3936 return YJIT_CANT_COMPILE;
3940 // All structs from the same Struct class should have the same
3941 // length, so if our comptime_recv is embedded, all runtime
3942 // structs of the same class should be as well, and the
3943 // converse also holds.
3944 bool embedded = FL_TEST_RAW(comptime_recv, RSTRUCT_EMBED_LEN_MASK);
3946 ADD_COMMENT(cb, "struct aref");
3948 x86opnd_t recv = ctx_stack_pop(ctx, 1);
3950 mov(cb, REG0, recv);
3952 if (embedded) {
3953 mov(cb, REG0, member_opnd_idx(REG0, struct RStruct, as.ary, off));
3955 else {
3956 mov(cb, REG0, member_opnd(REG0, struct RStruct, as.heap.ptr));
3957 mov(cb, REG0, mem_opnd(64, REG0, SIZEOF_VALUE * off));
3960 x86opnd_t ret = ctx_stack_push(ctx, TYPE_UNKNOWN);
3961 mov(cb, ret, REG0);
3963 jit_jump_to_next_insn(jit, ctx);
3964 return YJIT_END_BLOCK;
3967 static codegen_status_t
3968 gen_struct_aset(jitstate_t *jit, ctx_t *ctx, const struct rb_callinfo *ci, const rb_callable_method_entry_t *cme, VALUE comptime_recv, VALUE comptime_recv_klass) {
3969 if (vm_ci_argc(ci) != 1) {
3970 return YJIT_CANT_COMPILE;
3973 const unsigned int off = cme->def->body.optimized.index;
3975 // Confidence checks
3976 RUBY_ASSERT_ALWAYS(RB_TYPE_P(comptime_recv, T_STRUCT));
3977 RUBY_ASSERT_ALWAYS((long)off < RSTRUCT_LEN(comptime_recv));
3979 ADD_COMMENT(cb, "struct aset");
3981 x86opnd_t val = ctx_stack_pop(ctx, 1);
3982 x86opnd_t recv = ctx_stack_pop(ctx, 1);
3984 mov(cb, C_ARG_REGS[0], recv);
3985 mov(cb, C_ARG_REGS[1], imm_opnd(off));
3986 mov(cb, C_ARG_REGS[2], val);
3987 call_ptr(cb, REG0, (void *)RSTRUCT_SET);
3989 x86opnd_t ret = ctx_stack_push(ctx, TYPE_UNKNOWN);
3990 mov(cb, ret, RAX);
3992 jit_jump_to_next_insn(jit, ctx);
3993 return YJIT_END_BLOCK;
3996 const rb_callable_method_entry_t *
3997 rb_aliased_callable_method_entry(const rb_callable_method_entry_t *me);
3999 static codegen_status_t
4000 gen_send_general(jitstate_t *jit, ctx_t *ctx, struct rb_call_data *cd, rb_iseq_t *block)
4002 // Relevant definitions:
4003 // rb_execution_context_t : vm_core.h
4004 // invoker, cfunc logic : method.h, vm_method.c
4005 // rb_callinfo : vm_callinfo.h
4006 // rb_callable_method_entry_t : method.h
4007 // vm_call_cfunc_with_frame : vm_insnhelper.c
4009 // For a general overview for how the interpreter calls methods,
4010 // see vm_call_method().
4012 const struct rb_callinfo *ci = cd->ci; // info about the call site
4014 int32_t argc = (int32_t)vm_ci_argc(ci);
4015 ID mid = vm_ci_mid(ci);
4017 // Don't JIT calls with keyword splat
4018 if (vm_ci_flag(ci) & VM_CALL_KW_SPLAT) {
4019 GEN_COUNTER_INC(cb, send_kw_splat);
4020 return YJIT_CANT_COMPILE;
4023 // Don't JIT calls that aren't simple
4024 // Note, not using VM_CALL_ARGS_SIMPLE because sometimes we pass a block.
4025 if ((vm_ci_flag(ci) & VM_CALL_ARGS_SPLAT) != 0) {
4026 GEN_COUNTER_INC(cb, send_args_splat);
4027 return YJIT_CANT_COMPILE;
4029 if ((vm_ci_flag(ci) & VM_CALL_ARGS_BLOCKARG) != 0) {
4030 GEN_COUNTER_INC(cb, send_block_arg);
4031 return YJIT_CANT_COMPILE;
4034 // Defer compilation so we can specialize on class of receiver
4035 if (!jit_at_current_insn(jit)) {
4036 defer_compilation(jit, ctx);
4037 return YJIT_END_BLOCK;
4040 VALUE comptime_recv = jit_peek_at_stack(jit, ctx, argc);
4041 VALUE comptime_recv_klass = CLASS_OF(comptime_recv);
4043 // Guard that the receiver has the same class as the one from compile time
4044 uint8_t *side_exit = yjit_side_exit(jit, ctx);
4046 // Points to the receiver operand on the stack
4047 x86opnd_t recv = ctx_stack_opnd(ctx, argc);
4048 insn_opnd_t recv_opnd = OPND_STACK(argc);
4049 mov(cb, REG0, recv);
4050 if (!jit_guard_known_klass(jit, ctx, comptime_recv_klass, recv_opnd, comptime_recv, SEND_MAX_DEPTH, side_exit)) {
4051 return YJIT_CANT_COMPILE;
4054 // Do method lookup
4055 const rb_callable_method_entry_t *cme = rb_callable_method_entry(comptime_recv_klass, mid);
4056 if (!cme) {
4057 // TODO: counter
4058 return YJIT_CANT_COMPILE;
4061 switch (METHOD_ENTRY_VISI(cme)) {
4062 case METHOD_VISI_PUBLIC:
4063 // Can always call public methods
4064 break;
4065 case METHOD_VISI_PRIVATE:
4066 if (!(vm_ci_flag(ci) & VM_CALL_FCALL)) {
4067 // Can only call private methods with FCALL call sites
4068 // (at the moment, call sites with no receiver or with an explicit `self` receiver).
4069 return YJIT_CANT_COMPILE;
4071 break;
4072 case METHOD_VISI_PROTECTED:
4073 jit_protected_callee_ancestry_guard(jit, cb, cme, side_exit);
4074 break;
4075 case METHOD_VISI_UNDEF:
4076 RUBY_ASSERT(false && "cmes should always have a visibility");
4077 break;
4080 // Register block for invalidation
4081 RUBY_ASSERT(cme->called_id == mid);
4082 assume_method_lookup_stable(comptime_recv_klass, cme, jit);
4084 // To handle the aliased method case (VM_METHOD_TYPE_ALIAS)
4085 while (true) {
4086 // switch on the method type
4087 switch (cme->def->type) {
4088 case VM_METHOD_TYPE_ISEQ:
4089 return gen_send_iseq(jit, ctx, ci, cme, block, argc);
4090 case VM_METHOD_TYPE_CFUNC:
4091 if ((vm_ci_flag(ci) & VM_CALL_KWARG) != 0) {
4092 GEN_COUNTER_INC(cb, send_cfunc_kwargs);
4093 return YJIT_CANT_COMPILE;
4095 return gen_send_cfunc(jit, ctx, ci, cme, block, argc, &comptime_recv_klass);
4096 case VM_METHOD_TYPE_IVAR:
4097 if (argc != 0) {
4098 // Argument count mismatch. Getters take no arguments.
4099 GEN_COUNTER_INC(cb, send_getter_arity);
4100 return YJIT_CANT_COMPILE;
4102 if (c_method_tracing_currently_enabled(jit)) {
4103 // Can't generate code for firing c_call and c_return events
4104 // :attr-tracing:
4105 // Handling the C method tracing events for attr_accessor
4106 // methods is easier than for regular C methods because we know the
4107 // "method" we are calling into never enables those tracing
4108 // events. Once global invalidation runs, the code for the
4109 // attr_accessor is invalidated and we exit at the closest
4110 // instruction boundary which is always outside of the body of
4111 // the attr_accessor code.
4112 GEN_COUNTER_INC(cb, send_cfunc_tracing);
4113 return YJIT_CANT_COMPILE;
4116 mov(cb, REG0, recv);
4118 ID ivar_name = cme->def->body.attr.id;
4119 return gen_get_ivar(jit, ctx, SEND_MAX_DEPTH, comptime_recv, ivar_name, recv_opnd, side_exit);
4120 case VM_METHOD_TYPE_ATTRSET:
4121 if ((vm_ci_flag(ci) & VM_CALL_KWARG) != 0) {
4122 GEN_COUNTER_INC(cb, send_attrset_kwargs);
4123 return YJIT_CANT_COMPILE;
4125 else if (argc != 1 || !RB_TYPE_P(comptime_recv, T_OBJECT)) {
4126 GEN_COUNTER_INC(cb, send_ivar_set_method);
4127 return YJIT_CANT_COMPILE;
4129 else if (c_method_tracing_currently_enabled(jit)) {
4130 // Can't generate code for firing c_call and c_return events
4131 // See :attr-tracing:
4132 GEN_COUNTER_INC(cb, send_cfunc_tracing);
4133 return YJIT_CANT_COMPILE;
4135 else {
4136 ID ivar_name = cme->def->body.attr.id;
4137 return gen_set_ivar(jit, ctx, comptime_recv, comptime_recv_klass, ivar_name);
4139 // Block method, e.g. define_method(:foo) { :my_block }
4140 case VM_METHOD_TYPE_BMETHOD:
4141 GEN_COUNTER_INC(cb, send_bmethod);
4142 return YJIT_CANT_COMPILE;
4143 case VM_METHOD_TYPE_ZSUPER:
4144 GEN_COUNTER_INC(cb, send_zsuper_method);
4145 return YJIT_CANT_COMPILE;
4146 case VM_METHOD_TYPE_ALIAS: {
4147 // Retrieve the aliased method and re-enter the switch
4148 cme = rb_aliased_callable_method_entry(cme);
4149 continue;
4151 case VM_METHOD_TYPE_UNDEF:
4152 GEN_COUNTER_INC(cb, send_undef_method);
4153 return YJIT_CANT_COMPILE;
4154 case VM_METHOD_TYPE_NOTIMPLEMENTED:
4155 GEN_COUNTER_INC(cb, send_not_implemented_method);
4156 return YJIT_CANT_COMPILE;
4157 // Send family of methods, e.g. call/apply
4158 case VM_METHOD_TYPE_OPTIMIZED:
4159 switch (cme->def->body.optimized.type) {
4160 case OPTIMIZED_METHOD_TYPE_SEND:
4161 GEN_COUNTER_INC(cb, send_optimized_method_send);
4162 return YJIT_CANT_COMPILE;
4163 case OPTIMIZED_METHOD_TYPE_CALL:
4164 GEN_COUNTER_INC(cb, send_optimized_method_call);
4165 return YJIT_CANT_COMPILE;
4166 case OPTIMIZED_METHOD_TYPE_BLOCK_CALL:
4167 GEN_COUNTER_INC(cb, send_optimized_method_block_call);
4168 return YJIT_CANT_COMPILE;
4169 case OPTIMIZED_METHOD_TYPE_STRUCT_AREF:
4170 return gen_struct_aref(jit, ctx, ci, cme, comptime_recv, comptime_recv_klass);
4171 case OPTIMIZED_METHOD_TYPE_STRUCT_ASET:
4172 return gen_struct_aset(jit, ctx, ci, cme, comptime_recv, comptime_recv_klass);
4173 default:
4174 rb_bug("unknown optimized method type (%d)", cme->def->body.optimized.type);
4175 UNREACHABLE_RETURN(YJIT_CANT_COMPILE);
4177 case VM_METHOD_TYPE_MISSING:
4178 GEN_COUNTER_INC(cb, send_missing_method);
4179 return YJIT_CANT_COMPILE;
4180 case VM_METHOD_TYPE_REFINED:
4181 GEN_COUNTER_INC(cb, send_refined_method);
4182 return YJIT_CANT_COMPILE;
4183 // no default case so compiler issues a warning if this is not exhaustive
4186 // Unreachable
4187 RUBY_ASSERT(false);
4191 static codegen_status_t
4192 gen_opt_send_without_block(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
4194 struct rb_call_data *cd = (struct rb_call_data *)jit_get_arg(jit, 0);
4195 return gen_send_general(jit, ctx, cd, NULL);
4198 static codegen_status_t
4199 gen_send(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
4201 struct rb_call_data *cd = (struct rb_call_data *)jit_get_arg(jit, 0);
4202 rb_iseq_t *block = (rb_iseq_t *)jit_get_arg(jit, 1);
4203 return gen_send_general(jit, ctx, cd, block);
4206 static codegen_status_t
4207 gen_invokesuper(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
4209 struct rb_call_data *cd = (struct rb_call_data *)jit_get_arg(jit, 0);
4210 rb_iseq_t *block = (rb_iseq_t *)jit_get_arg(jit, 1);
4212 // Defer compilation so we can specialize on class of receiver
4213 if (!jit_at_current_insn(jit)) {
4214 defer_compilation(jit, ctx);
4215 return YJIT_END_BLOCK;
4218 const rb_callable_method_entry_t *me = rb_vm_frame_method_entry(jit->ec->cfp);
4219 if (!me) {
4220 return YJIT_CANT_COMPILE;
4223 // FIXME: We should track and invalidate this block when this cme is invalidated
4224 VALUE current_defined_class = me->defined_class;
4225 ID mid = me->def->original_id;
4227 if (me != rb_callable_method_entry(current_defined_class, me->called_id)) {
4228 // We could likely generate this call since we are only concerned with
4229 // the method entry remaining valid, but assume_method_lookup_stable
4230 // below requires that the method lookup matches as well.
4231 return YJIT_CANT_COMPILE;
4234 // vm_search_normal_superclass
4235 if (BUILTIN_TYPE(current_defined_class) == T_ICLASS && FL_TEST_RAW(RBASIC(current_defined_class)->klass, RMODULE_IS_REFINEMENT)) {
4236 return YJIT_CANT_COMPILE;
4238 VALUE comptime_superclass = RCLASS_SUPER(RCLASS_ORIGIN(current_defined_class));
4240 const struct rb_callinfo *ci = cd->ci;
4241 int32_t argc = (int32_t)vm_ci_argc(ci);
4243 // Don't JIT calls that aren't simple
4244 // Note, not using VM_CALL_ARGS_SIMPLE because sometimes we pass a block.
4245 if ((vm_ci_flag(ci) & VM_CALL_ARGS_SPLAT) != 0) {
4246 GEN_COUNTER_INC(cb, send_args_splat);
4247 return YJIT_CANT_COMPILE;
4249 if ((vm_ci_flag(ci) & VM_CALL_KWARG) != 0) {
4250 GEN_COUNTER_INC(cb, send_keywords);
4251 return YJIT_CANT_COMPILE;
4253 if ((vm_ci_flag(ci) & VM_CALL_KW_SPLAT) != 0) {
4254 GEN_COUNTER_INC(cb, send_kw_splat);
4255 return YJIT_CANT_COMPILE;
4257 if ((vm_ci_flag(ci) & VM_CALL_ARGS_BLOCKARG) != 0) {
4258 GEN_COUNTER_INC(cb, send_block_arg);
4259 return YJIT_CANT_COMPILE;
4262 // Ensure we haven't rebound this method onto an incompatible class.
4263 // In the interpreter we try to avoid making this check by performing some
4264 // cheaper calculations first, but since we specialize on the method entry
4265 // we only have to do this once at compile time, so it is fine to always
4266 // check and side exit.
4267 VALUE comptime_recv = jit_peek_at_stack(jit, ctx, argc);
4268 if (!rb_obj_is_kind_of(comptime_recv, current_defined_class)) {
4269 return YJIT_CANT_COMPILE;
4272 // Do method lookup
4273 const rb_callable_method_entry_t *cme = rb_callable_method_entry(comptime_superclass, mid);
4275 if (!cme) {
4276 return YJIT_CANT_COMPILE;
4279 // Check that we'll be able to write this method dispatch before generating checks
4280 switch (cme->def->type) {
4281 case VM_METHOD_TYPE_ISEQ:
4282 case VM_METHOD_TYPE_CFUNC:
4283 break;
4284 default:
4285 // others unimplemented
4286 return YJIT_CANT_COMPILE;
4289 // Guard that the receiver has the same class as the one from compile time
4290 uint8_t *side_exit = yjit_side_exit(jit, ctx);
4292 if (jit->ec->cfp->ep[VM_ENV_DATA_INDEX_ME_CREF] != (VALUE)me) {
4293 // This will be the case for super within a block
4294 return YJIT_CANT_COMPILE;
4297 ADD_COMMENT(cb, "guard known me");
4298 mov(cb, REG0, member_opnd(REG_CFP, rb_control_frame_t, ep));
4299 x86opnd_t ep_me_opnd = mem_opnd(64, REG0, SIZEOF_VALUE * VM_ENV_DATA_INDEX_ME_CREF);
4300 jit_mov_gc_ptr(jit, cb, REG1, (VALUE)me);
4301 cmp(cb, ep_me_opnd, REG1);
4302 jne_ptr(cb, COUNTED_EXIT(jit, side_exit, invokesuper_me_changed));
4304 if (!block) {
4305 // Guard no block passed
4306 // rb_vm_frame_block_handler(GET_EC()->cfp) == VM_BLOCK_HANDLER_NONE
4307 // note, we assume VM_ASSERT(VM_ENV_LOCAL_P(ep))
4309 // TODO: this could properly forward the current block handler, but
4310 // would require changes to gen_send_*
4311 ADD_COMMENT(cb, "guard no block given");
4312 // EP is in REG0 from above
4313 x86opnd_t ep_specval_opnd = mem_opnd(64, REG0, SIZEOF_VALUE * VM_ENV_DATA_INDEX_SPECVAL);
4314 cmp(cb, ep_specval_opnd, imm_opnd(VM_BLOCK_HANDLER_NONE));
4315 jne_ptr(cb, COUNTED_EXIT(jit, side_exit, invokesuper_block));
4318 // Points to the receiver operand on the stack
4319 x86opnd_t recv = ctx_stack_opnd(ctx, argc);
4320 mov(cb, REG0, recv);
4322 // We need to assume that both our current method entry and the super
4323 // method entry we invoke remain stable
4324 assume_method_lookup_stable(current_defined_class, me, jit);
4325 assume_method_lookup_stable(comptime_superclass, cme, jit);
4327 // Method calls may corrupt types
4328 ctx_clear_local_types(ctx);
4330 switch (cme->def->type) {
4331 case VM_METHOD_TYPE_ISEQ:
4332 return gen_send_iseq(jit, ctx, ci, cme, block, argc);
4333 case VM_METHOD_TYPE_CFUNC:
4334 return gen_send_cfunc(jit, ctx, ci, cme, block, argc, NULL);
4335 default:
4336 break;
4339 RUBY_ASSERT_ALWAYS(false);
4342 static codegen_status_t
4343 gen_leave(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
4345 // Only the return value should be on the stack
4346 RUBY_ASSERT(ctx->stack_size == 1);
4348 // Create a side-exit to fall back to the interpreter
4349 uint8_t *side_exit = yjit_side_exit(jit, ctx);
4351 // Load environment pointer EP from CFP
4352 mov(cb, REG1, member_opnd(REG_CFP, rb_control_frame_t, ep));
4354 // Check for interrupts
4355 ADD_COMMENT(cb, "check for interrupts");
4356 yjit_check_ints(cb, COUNTED_EXIT(jit, side_exit, leave_se_interrupt));
4358 // Load the return value
4359 mov(cb, REG0, ctx_stack_pop(ctx, 1));
4361 // Pop the current frame (ec->cfp++)
4362 // Note: the return PC is already in the previous CFP
4363 add(cb, REG_CFP, imm_opnd(sizeof(rb_control_frame_t)));
4364 mov(cb, member_opnd(REG_EC, rb_execution_context_t, cfp), REG_CFP);
4366 // Reload REG_SP for the caller and write the return value.
4367 // Top of the stack is REG_SP[0] since the caller has sp_offset=1.
4368 mov(cb, REG_SP, member_opnd(REG_CFP, rb_control_frame_t, sp));
4369 mov(cb, mem_opnd(64, REG_SP, 0), REG0);
4371 // Jump to the JIT return address on the frame that was just popped
4372 const int32_t offset_to_jit_return = -((int32_t)sizeof(rb_control_frame_t)) + (int32_t)offsetof(rb_control_frame_t, jit_return);
4373 jmp_rm(cb, mem_opnd(64, REG_CFP, offset_to_jit_return));
4375 return YJIT_END_BLOCK;
4378 RUBY_EXTERN rb_serial_t ruby_vm_global_constant_state;
4380 static codegen_status_t
4381 gen_getglobal(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
4383 ID gid = jit_get_arg(jit, 0);
4385 // Save the PC and SP because we might make a Ruby call for warning
4386 jit_prepare_routine_call(jit, ctx, REG0);
4388 mov(cb, C_ARG_REGS[0], imm_opnd(gid));
4390 call_ptr(cb, REG0, (void *)&rb_gvar_get);
4392 x86opnd_t top = ctx_stack_push(ctx, TYPE_UNKNOWN);
4393 mov(cb, top, RAX);
4395 return YJIT_KEEP_COMPILING;
4398 static codegen_status_t
4399 gen_setglobal(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
4401 ID gid = jit_get_arg(jit, 0);
4403 // Save the PC and SP because we might make a Ruby call for
4404 // Kernel#set_trace_var
4405 jit_prepare_routine_call(jit, ctx, REG0);
4407 mov(cb, C_ARG_REGS[0], imm_opnd(gid));
4409 x86opnd_t val = ctx_stack_pop(ctx, 1);
4411 mov(cb, C_ARG_REGS[1], val);
4413 call_ptr(cb, REG0, (void *)&rb_gvar_set);
4415 return YJIT_KEEP_COMPILING;
4418 static codegen_status_t
4419 gen_anytostring(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
4421 // Save the PC and SP because rb_obj_as_string_result() below may
4422 // allocate a new string
4423 jit_prepare_routine_call(jit, ctx, REG0);
4425 x86opnd_t str = ctx_stack_pop(ctx, 1);
4426 x86opnd_t val = ctx_stack_pop(ctx, 1);
4428 mov(cb, C_ARG_REGS[0], str);
4429 mov(cb, C_ARG_REGS[1], val);
4431 call_ptr(cb, REG0, (void *)&rb_obj_as_string_result);
4433 // Push the return value
4434 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_STRING);
4435 mov(cb, stack_ret, RAX);
4437 return YJIT_KEEP_COMPILING;
4440 static codegen_status_t
4441 gen_objtostring(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
4443 if (!jit_at_current_insn(jit)) {
4444 defer_compilation(jit, ctx);
4445 return YJIT_END_BLOCK;
4448 x86opnd_t recv = ctx_stack_opnd(ctx, 0);
4449 VALUE comptime_recv = jit_peek_at_stack(jit, ctx, 0);
4451 if (RB_TYPE_P(comptime_recv, T_STRING)) {
4452 uint8_t *side_exit = yjit_side_exit(jit, ctx);
4454 mov(cb, REG0, recv);
4455 jit_guard_known_klass(jit, ctx, CLASS_OF(comptime_recv), OPND_STACK(0), comptime_recv, SEND_MAX_DEPTH, side_exit);
4456 // No work needed. The string value is already on the top of the stack.
4457 return YJIT_KEEP_COMPILING;
4459 else {
4460 struct rb_call_data *cd = (struct rb_call_data *)jit_get_arg(jit, 0);
4461 return gen_send_general(jit, ctx, cd, NULL);
4465 static codegen_status_t
4466 gen_toregexp(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
4468 rb_num_t opt = jit_get_arg(jit, 0);
4469 rb_num_t cnt = jit_get_arg(jit, 1);
4471 // Save the PC and SP because this allocates an object and could
4472 // raise an exception.
4473 jit_prepare_routine_call(jit, ctx, REG0);
4475 x86opnd_t values_ptr = ctx_sp_opnd(ctx, -(sizeof(VALUE) * (uint32_t)cnt));
4476 ctx_stack_pop(ctx, cnt);
4478 mov(cb, C_ARG_REGS[0], imm_opnd(0));
4479 mov(cb, C_ARG_REGS[1], imm_opnd(cnt));
4480 lea(cb, C_ARG_REGS[2], values_ptr);
4481 call_ptr(cb, REG0, (void *)&rb_ary_tmp_new_from_values);
4483 // Save the array so we can clear it later
4484 push(cb, RAX);
4485 push(cb, RAX); // Alignment
4486 mov(cb, C_ARG_REGS[0], RAX);
4487 mov(cb, C_ARG_REGS[1], imm_opnd(opt));
4488 call_ptr(cb, REG0, (void *)&rb_reg_new_ary);
4490 // The actual regex is in RAX now. Pop the temp array from
4491 // rb_ary_tmp_new_from_values into C arg regs so we can clear it
4492 pop(cb, REG1); // Alignment
4493 pop(cb, C_ARG_REGS[0]);
4495 // The value we want to push on the stack is in RAX right now
4496 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_UNKNOWN);
4497 mov(cb, stack_ret, RAX);
4499 // Clear the temp array.
4500 call_ptr(cb, REG0, (void *)&rb_ary_clear);
4502 return YJIT_KEEP_COMPILING;
4505 static codegen_status_t
4506 gen_intern(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
4508 // Save the PC and SP because we might allocate
4509 jit_prepare_routine_call(jit, ctx, REG0);
4511 x86opnd_t str = ctx_stack_pop(ctx, 1);
4513 mov(cb, C_ARG_REGS[0], str);
4515 call_ptr(cb, REG0, (void *)&rb_str_intern);
4517 // Push the return value
4518 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_UNKNOWN);
4519 mov(cb, stack_ret, RAX);
4521 return YJIT_KEEP_COMPILING;
4524 static codegen_status_t
4525 gen_getspecial(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
4527 // This takes two arguments, key and type
4528 // key is only used when type == 0
4529 // A non-zero type determines which type of backref to fetch
4530 //rb_num_t key = jit_get_arg(jit, 0);
4531 rb_num_t type = jit_get_arg(jit, 1);
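// Worked example of the encoding this function expects: for $& the operand is
// (('&' << 1) | 0x01), so the low bit selects the special-character path below and
// (type >> 1) recovers '&'; for $1 the operand is (1 << 1) with the low bit clear,
// so (type >> 1) == 1 selects the first capture group via rb_reg_nth_match().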
4533 if (type == 0) {
4534 // not yet implemented
4535 return YJIT_CANT_COMPILE;
4537 else if (type & 0x01) {
4538 // Fetch a "special" backref based on a char encoded by shifting by 1
4540 // Can raise if matchdata uninitialized
4541 jit_prepare_routine_call(jit, ctx, REG0);
4543 // call rb_backref_get()
4544 ADD_COMMENT(cb, "rb_backref_get");
4545 call_ptr(cb, REG0, (void *)rb_backref_get);
4546 mov(cb, C_ARG_REGS[0], RAX);
4548 switch (type >> 1) {
4549 case '&':
4550 ADD_COMMENT(cb, "rb_reg_last_match");
4551 call_ptr(cb, REG0, (void *)rb_reg_last_match);
4552 break;
4553 case '`':
4554 ADD_COMMENT(cb, "rb_reg_match_pre");
4555 call_ptr(cb, REG0, (void *)rb_reg_match_pre);
4556 break;
4557 case '\'':
4558 ADD_COMMENT(cb, "rb_reg_match_post");
4559 call_ptr(cb, REG0, (void *)rb_reg_match_post);
4560 break;
4561 case '+':
4562 ADD_COMMENT(cb, "rb_reg_match_last");
4563 call_ptr(cb, REG0, (void *)rb_reg_match_last);
4564 break;
4565 default:
4566 rb_bug("invalid back-ref");
4569 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_UNKNOWN);
4570 mov(cb, stack_ret, RAX);
4572 return YJIT_KEEP_COMPILING;
4574 else {
4575 // Fetch the N-th match from the last backref based on type shifted by 1
4577 // Can raise if matchdata uninitialized
4578 jit_prepare_routine_call(jit, ctx, REG0);
4580 // call rb_backref_get()
4581 ADD_COMMENT(cb, "rb_backref_get");
4582 call_ptr(cb, REG0, (void *)rb_backref_get);
4584 // rb_reg_nth_match((int)(type >> 1), backref);
4585 ADD_COMMENT(cb, "rb_reg_nth_match");
4586 mov(cb, C_ARG_REGS[0], imm_opnd(type >> 1));
4587 mov(cb, C_ARG_REGS[1], RAX);
4588 call_ptr(cb, REG0, (void *)rb_reg_nth_match);
4590 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_UNKNOWN);
4591 mov(cb, stack_ret, RAX);
4593 return YJIT_KEEP_COMPILING;
4597 VALUE
4598 rb_vm_getclassvariable(const rb_iseq_t *iseq, const rb_control_frame_t *cfp, ID id, ICVARC ic);
4600 static codegen_status_t
4601 gen_getclassvariable(jitstate_t* jit, ctx_t* ctx, codeblock_t* cb)
4603 // rb_vm_getclassvariable can raise exceptions.
4604 jit_prepare_routine_call(jit, ctx, REG0);
4606 mov(cb, C_ARG_REGS[0], member_opnd(REG_CFP, rb_control_frame_t, iseq));
4607 mov(cb, C_ARG_REGS[1], REG_CFP);
4608 mov(cb, C_ARG_REGS[2], imm_opnd(jit_get_arg(jit, 0)));
4609 mov(cb, C_ARG_REGS[3], imm_opnd(jit_get_arg(jit, 1)));
4611 call_ptr(cb, REG0, (void *)rb_vm_getclassvariable);
4613 x86opnd_t stack_top = ctx_stack_push(ctx, TYPE_UNKNOWN);
4614 mov(cb, stack_top, RAX);
4616 return YJIT_KEEP_COMPILING;
4619 VALUE
4620 rb_vm_setclassvariable(const rb_iseq_t *iseq, const rb_control_frame_t *cfp, ID id, VALUE val, ICVARC ic);
4622 static codegen_status_t
4623 gen_setclassvariable(jitstate_t* jit, ctx_t* ctx, codeblock_t* cb)
4625 // rb_vm_setclassvariable can raise exceptions.
4626 jit_prepare_routine_call(jit, ctx, REG0);
4628 mov(cb, C_ARG_REGS[0], member_opnd(REG_CFP, rb_control_frame_t, iseq));
4629 mov(cb, C_ARG_REGS[1], REG_CFP);
4630 mov(cb, C_ARG_REGS[2], imm_opnd(jit_get_arg(jit, 0)));
4631 mov(cb, C_ARG_REGS[3], ctx_stack_pop(ctx, 1));
4632 mov(cb, C_ARG_REGS[4], imm_opnd(jit_get_arg(jit, 1)));
4634 call_ptr(cb, REG0, (void *)rb_vm_setclassvariable);
4636 return YJIT_KEEP_COMPILING;
4639 static codegen_status_t
4640 gen_opt_getinlinecache(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
4642 VALUE jump_offset = jit_get_arg(jit, 0);
4643 VALUE const_cache_as_value = jit_get_arg(jit, 1);
4644 IC ic = (IC)const_cache_as_value;
4646 // See vm_ic_hit_p(). The same conditions are checked in yjit_constant_ic_update().
4647 struct iseq_inline_constant_cache_entry *ice = ic->entry;
4648 if (!ice || // cache not filled
4649 GET_IC_SERIAL(ice) != ruby_vm_global_constant_state /* cache out of date */) {
4650 // In these cases, leave a block that unconditionally side exits
4651 // for the interpreter to invalidate.
4652 return YJIT_CANT_COMPILE;
4655 // Make sure there is an exit for this block as the interpreter might want
4656 // to invalidate this block from yjit_constant_ic_update().
4657 jit_ensure_block_entry_exit(jit);
4659 if (ice->ic_cref) {
4660 // Cache is keyed on a certain lexical scope. Use the interpreter's cache.
4661 uint8_t *side_exit = yjit_side_exit(jit, ctx);
4663 // Call function to verify the cache. It doesn't allocate or call methods.
4664 bool rb_vm_ic_hit_p(IC ic, const VALUE *reg_ep);
4665 mov(cb, C_ARG_REGS[0], const_ptr_opnd((void *)ic));
4666 mov(cb, C_ARG_REGS[1], member_opnd(REG_CFP, rb_control_frame_t, ep));
4667 call_ptr(cb, REG0, (void *)rb_vm_ic_hit_p);
4669 // Check the result. _Bool is one byte in SysV.
4670 test(cb, AL, AL);
4671 jz_ptr(cb, COUNTED_EXIT(jit, side_exit, opt_getinlinecache_miss));
4673 // Push ic->entry->value
4674 mov(cb, REG0, const_ptr_opnd((void *)ic));
4675 mov(cb, REG0, member_opnd(REG0, struct iseq_inline_constant_cache, entry));
4676 x86opnd_t stack_top = ctx_stack_push(ctx, TYPE_UNKNOWN);
4677 mov(cb, REG0, member_opnd(REG0, struct iseq_inline_constant_cache_entry, value));
4678 mov(cb, stack_top, REG0);
4680 else {
4681 // Optimize for single ractor mode.
4682 // FIXME: This leaks when st_insert raises NoMemoryError
4683 if (!assume_single_ractor_mode(jit)) return YJIT_CANT_COMPILE;
4685 // Invalidate output code on any and all constant writes
4686 // FIXME: This leaks when st_insert raises NoMemoryError
4687 assume_stable_global_constant_state(jit);
4689 jit_putobject(jit, ctx, ice->value);
4692 // Jump over the code for filling the cache
4693 uint32_t jump_idx = jit_next_insn_idx(jit) + (int32_t)jump_offset;
4694 gen_direct_jump(
4695 jit,
4696 ctx,
4697 (blockid_t){ .iseq = jit->iseq, .idx = jump_idx }
4700 return YJIT_END_BLOCK;
4703 // Push the explicit block parameter onto the temporary stack. Part of the
4704 // interpreter's scheme for avoiding Proc allocations when delegating
4705 // explicit block parameters.
4706 static codegen_status_t
4707 gen_getblockparamproxy(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
4709 // A mirror of the interpreter code. Checking for the case
4710 // where it's pushing rb_block_param_proxy.
4711 uint8_t *side_exit = yjit_side_exit(jit, ctx);
4713 // EP level
4714 uint32_t level = (uint32_t)jit_get_arg(jit, 1);
4716 // Load environment pointer EP from CFP
4717 gen_get_ep(cb, REG0, level);
4719 // Bail when VM_ENV_FLAGS(ep, VM_FRAME_FLAG_MODIFIED_BLOCK_PARAM) is non zero
4720 test(cb, mem_opnd(64, REG0, SIZEOF_VALUE * VM_ENV_DATA_INDEX_FLAGS), imm_opnd(VM_FRAME_FLAG_MODIFIED_BLOCK_PARAM));
4721 jnz_ptr(cb, COUNTED_EXIT(jit, side_exit, gbpp_block_param_modified));
4723 // Load the block handler for the current frame
4724 // note, VM_ASSERT(VM_ENV_LOCAL_P(ep))
4725 mov(cb, REG0, mem_opnd(64, REG0, SIZEOF_VALUE * VM_ENV_DATA_INDEX_SPECVAL));
4727 // Block handler is a tagged pointer. Look at the tag. 0x03 is from VM_BH_ISEQ_BLOCK_P().
4728 and(cb, REG0_8, imm_opnd(0x3));
4730 // Bail unless VM_BH_ISEQ_BLOCK_P(bh). This also checks for null.
4731 cmp(cb, REG0_8, imm_opnd(0x1));
4732 jnz_ptr(cb, COUNTED_EXIT(jit, side_exit, gbpp_block_handler_not_iseq));
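// Tagging recap (cf. the `or REG1, 1` when pushing an iseq block handler in
// gen_send_iseq): iseq block handlers are (captured block pointer | 0x01), ifunc
// handlers use 0x03, and VM_BLOCK_HANDLER_NONE is 0, so checking (bh & 0x3) == 0x1
// both selects iseq blocks and rejects the no-block case.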
4734 // Push rb_block_param_proxy. It's a root, so no need to use jit_mov_gc_ptr.
4735 mov(cb, REG0, const_ptr_opnd((void *)rb_block_param_proxy));
4736 RUBY_ASSERT(!SPECIAL_CONST_P(rb_block_param_proxy));
4737 x86opnd_t top = ctx_stack_push(ctx, TYPE_HEAP);
4738 mov(cb, top, REG0);
4740 return YJIT_KEEP_COMPILING;
4743 static codegen_status_t
4744 gen_invokebuiltin(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
4746 const struct rb_builtin_function *bf = (struct rb_builtin_function *)jit_get_arg(jit, 0);
4748 // ec, self, and arguments
4749 if (bf->argc + 2 > NUM_C_ARG_REGS) {
4750 return YJIT_CANT_COMPILE;
4753 // If the calls don't allocate, do they need up to date PC, SP?
4754 jit_prepare_routine_call(jit, ctx, REG0);
4756 // Call the builtin func (ec, recv, arg1, arg2, ...)
4757 mov(cb, C_ARG_REGS[0], REG_EC);
4758 mov(cb, C_ARG_REGS[1], member_opnd(REG_CFP, rb_control_frame_t, self));
4760 // Copy arguments from locals
4761 for (int32_t i = 0; i < bf->argc; i++) {
4762 x86opnd_t stack_opnd = ctx_stack_opnd(ctx, bf->argc - i - 1);
4763 x86opnd_t c_arg_reg = C_ARG_REGS[2 + i];
4764 mov(cb, c_arg_reg, stack_opnd);
4767 call_ptr(cb, REG0, (void *)bf->func_ptr);
4769 // Push the return value
4770 ctx_stack_pop(ctx, bf->argc);
4771 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_UNKNOWN);
4772 mov(cb, stack_ret, RAX);
4774 return YJIT_KEEP_COMPILING;
4777 // opt_invokebuiltin_delegate calls a builtin function, like
4778 // invokebuiltin does, but instead of taking arguments from the top of the
4779 // stack uses the argument locals (and self) from the current method.
4780 static codegen_status_t
4781 gen_opt_invokebuiltin_delegate(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
4783 const struct rb_builtin_function *bf = (struct rb_builtin_function *)jit_get_arg(jit, 0);
4784 int32_t start_index = (int32_t)jit_get_arg(jit, 1);
4786 // ec, self, and arguments
4787 if (bf->argc + 2 > NUM_C_ARG_REGS) {
4788 return YJIT_CANT_COMPILE;
4791 // If the calls don't allocate, do they need up to date PC, SP?
4792 jit_prepare_routine_call(jit, ctx, REG0);
4794 if (bf->argc > 0) {
4795 // Load environment pointer EP from CFP
4796 mov(cb, REG0, member_opnd(REG_CFP, rb_control_frame_t, ep));
4799 // Call the builtin func (ec, recv, arg1, arg2, ...)
4800 mov(cb, C_ARG_REGS[0], REG_EC);
4801 mov(cb, C_ARG_REGS[1], member_opnd(REG_CFP, rb_control_frame_t, self));
4803 // Copy arguments from locals
4804 for (int32_t i = 0; i < bf->argc; i++) {
4805 const int32_t offs = -jit->iseq->body->local_table_size - VM_ENV_DATA_SIZE + 1 + start_index + i;
4806 x86opnd_t local_opnd = mem_opnd(64, REG0, offs * SIZEOF_VALUE);
4807 x86opnd_t c_arg_reg = C_ARG_REGS[i + 2];
4808 mov(cb, c_arg_reg, local_opnd);
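// Worked example of the offset above (values illustrative): with a local table of 4
// slots, VM_ENV_DATA_SIZE == 3, start_index == 0 and i == 0, offs == -4 - 3 + 1 == -6,
// so the first delegated argument is loaded from EP[-6], just below the env data slots.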
4810 call_ptr(cb, REG0, (void *)bf->func_ptr);
4812 // Push the return value
4813 x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_UNKNOWN);
4814 mov(cb, stack_ret, RAX);
4816 return YJIT_KEEP_COMPILING;
4819 static int tracing_invalidate_all_i(void *vstart, void *vend, size_t stride, void *data);
4820 static void invalidate_all_blocks_for_tracing(const rb_iseq_t *iseq);
4822 // Invalidate all generated code and patch C method return code to contain
4823 // logic for firing the c_return TracePoint event. Once rb_vm_barrier()
4824 // returns, all other ractors are pausing inside RB_VM_LOCK_ENTER(), which
4825 // means they are inside a C routine. If there is any generated code on the stack,
4826 // it is waiting for a return from a C routine. For every routine call, we
4827 // patch in an exit after the body of the containing VM instruction. This makes
4828 // it so all the invalidated code exits as soon as execution logically reaches
4829 // the next VM instruction. The interpreter takes care of firing the tracing
4830 // event if it so happens that the next VM instruction has one attached.
4832 // The c_return event needs special handling as our codegen never outputs code
4833 // that contains tracing logic. If we let the normal output code run until the
4834 // start of the next VM instruction by relying on the patching scheme above, we
4835 // would fail to fire the c_return event. The interpreter doesn't fire the
4836 // event at an instruction boundary, so simply exiting to the interpreter isn't
4837 // enough. To handle it, we patch in the full logic at the return address. See
4838 // full_cfunc_return().
4840 // In addition to patching, we prevent future entries into invalidated code by
4841 // removing all live blocks from their iseq.
4842 void
4843 rb_yjit_tracing_invalidate_all(void)
4845 if (!rb_yjit_enabled_p()) return;
4847 // Stop other ractors since we are going to patch machine code.
4848 RB_VM_LOCK_ENTER();
4849 rb_vm_barrier();
4851 // Make it so all live block versions are no longer valid branch targets
4852 rb_objspace_each_objects(tracing_invalidate_all_i, NULL);
4854 // Apply patches
4855 const uint32_t old_pos = cb->write_pos;
4856 rb_darray_for(global_inval_patches, patch_idx) {
4857 struct codepage_patch patch = rb_darray_get(global_inval_patches, patch_idx);
4858 cb_set_pos(cb, patch.inline_patch_pos);
4859 uint8_t *jump_target = cb_get_ptr(ocb, patch.outlined_target_pos);
4860 jmp_ptr(cb, jump_target);
4862 cb_set_pos(cb, old_pos);
4864 // Freeze invalidated part of the codepage. We only want to wait for
4865 // running instances of the code to exit from now on, so we shouldn't
4866 // change the code. There could be other ractors sleeping in
4867 // branch_stub_hit(), for example. We could harden this by changing memory
4868 // protection on the frozen range.
4869 RUBY_ASSERT_ALWAYS(yjit_codepage_frozen_bytes <= old_pos && "frozen bytes should increase monotonically");
4870 yjit_codepage_frozen_bytes = old_pos;
4872 cb_mark_all_executable(ocb);
4873 cb_mark_all_executable(cb);
4874 RB_VM_LOCK_LEAVE();
4877 static int
4878 tracing_invalidate_all_i(void *vstart, void *vend, size_t stride, void *data)
4880 VALUE v = (VALUE)vstart;
4881 for (; v != (VALUE)vend; v += stride) {
4882 void *ptr = asan_poisoned_object_p(v);
4883 asan_unpoison_object(v, false);
4885 if (rb_obj_is_iseq(v)) {
4886 rb_iseq_t *iseq = (rb_iseq_t *)v;
4887 invalidate_all_blocks_for_tracing(iseq);
4890 asan_poison_object_if(ptr, v);
4892 return 0;
4895 static void
4896 invalidate_all_blocks_for_tracing(const rb_iseq_t *iseq)
4898 struct rb_iseq_constant_body *body = iseq->body;
4899 if (!body) return; // iseq yet to be initialized
4901 ASSERT_vm_locking();
4903 // Empty all blocks on the iseq so we don't compile new blocks that jump to the
4904 // invalidated region.
4905 // TODO Leaking the blocks for now since we might have situations where
4906 // a different ractor is waiting in branch_stub_hit(). If we free the block
4907 // that ractor can wake up with a dangling block.
4908 rb_darray_for(body->yjit_blocks, version_array_idx) {
4909 rb_yjit_block_array_t version_array = rb_darray_get(body->yjit_blocks, version_array_idx);
4910 rb_darray_for(version_array, version_idx) {
4911 // Stop listening for invalidation events like basic operation redefinition.
4912 block_t *block = rb_darray_get(version_array, version_idx);
4913 yjit_unlink_method_lookup_dependency(block);
4914 yjit_block_assumptions_free(block);
4916 rb_darray_free(version_array);
4918 rb_darray_free(body->yjit_blocks);
4919 body->yjit_blocks = NULL;
4921 #if USE_MJIT
4922 // Reset output code entry point
4923 body->jit_func = NULL;
4924 #endif
4927 static void
4928 yjit_reg_op(int opcode, codegen_fn gen_fn)
4930 RUBY_ASSERT(opcode >= 0 && opcode < VM_INSTRUCTION_SIZE);
4931 // Check that the op wasn't previously registered
4932 RUBY_ASSERT(gen_fns[opcode] == NULL);
4934 gen_fns[opcode] = gen_fn;
4937 void
4938 yjit_init_codegen(void)
4940 // Initialize the code blocks
4941 uint32_t mem_size = rb_yjit_opts.exec_mem_size * 1024 * 1024;
4942 uint8_t *mem_block = alloc_exec_mem(mem_size);
4944 cb = &block;
4945 cb_init(cb, mem_block, mem_size/2);
4947 ocb = &outline_block;
4948 cb_init(ocb, mem_block + mem_size/2, mem_size/2);
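// For example, with rb_yjit_opts.exec_mem_size == 256 this requests 256 MiB of
// executable memory and splits it evenly: the first 128 MiB backs the inline code
// block (cb) and the second 128 MiB backs the outlined code block (ocb).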
4950 // Generate the interpreter exit code for leave
4951 leave_exit_code = yjit_gen_leave_exit(cb);
4953 // Generate full exit code for C func
4954 gen_full_cfunc_return();
4955 cb_mark_all_executable(cb);
4957 // Map YARV opcodes to the corresponding codegen functions
4958 yjit_reg_op(BIN(nop), gen_nop);
4959 yjit_reg_op(BIN(dup), gen_dup);
4960 yjit_reg_op(BIN(dupn), gen_dupn);
4961 yjit_reg_op(BIN(swap), gen_swap);
4962 yjit_reg_op(BIN(setn), gen_setn);
4963 yjit_reg_op(BIN(topn), gen_topn);
4964 yjit_reg_op(BIN(pop), gen_pop);
4965 yjit_reg_op(BIN(adjuststack), gen_adjuststack);
4966 yjit_reg_op(BIN(newarray), gen_newarray);
4967 yjit_reg_op(BIN(duparray), gen_duparray);
4968 yjit_reg_op(BIN(duphash), gen_duphash);
4969 yjit_reg_op(BIN(splatarray), gen_splatarray);
4970 yjit_reg_op(BIN(expandarray), gen_expandarray);
4971 yjit_reg_op(BIN(newhash), gen_newhash);
4972 yjit_reg_op(BIN(newrange), gen_newrange);
4973 yjit_reg_op(BIN(concatstrings), gen_concatstrings);
4974 yjit_reg_op(BIN(putnil), gen_putnil);
4975 yjit_reg_op(BIN(putobject), gen_putobject);
4976 yjit_reg_op(BIN(putstring), gen_putstring);
4977 yjit_reg_op(BIN(putobject_INT2FIX_0_), gen_putobject_int2fix);
4978 yjit_reg_op(BIN(putobject_INT2FIX_1_), gen_putobject_int2fix);
4979 yjit_reg_op(BIN(putself), gen_putself);
4980 yjit_reg_op(BIN(putspecialobject), gen_putspecialobject);
4981 yjit_reg_op(BIN(getlocal), gen_getlocal);
4982 yjit_reg_op(BIN(getlocal_WC_0), gen_getlocal_wc0);
4983 yjit_reg_op(BIN(getlocal_WC_1), gen_getlocal_wc1);
4984 yjit_reg_op(BIN(setlocal), gen_setlocal);
4985 yjit_reg_op(BIN(setlocal_WC_0), gen_setlocal_wc0);
4986 yjit_reg_op(BIN(setlocal_WC_1), gen_setlocal_wc1);
4987 yjit_reg_op(BIN(getinstancevariable), gen_getinstancevariable);
4988 yjit_reg_op(BIN(setinstancevariable), gen_setinstancevariable);
4989 yjit_reg_op(BIN(defined), gen_defined);
4990 yjit_reg_op(BIN(checktype), gen_checktype);
4991 yjit_reg_op(BIN(checkkeyword), gen_checkkeyword);
4992 yjit_reg_op(BIN(opt_lt), gen_opt_lt);
4993 yjit_reg_op(BIN(opt_le), gen_opt_le);
4994 yjit_reg_op(BIN(opt_ge), gen_opt_ge);
4995 yjit_reg_op(BIN(opt_gt), gen_opt_gt);
4996 yjit_reg_op(BIN(opt_eq), gen_opt_eq);
4997 yjit_reg_op(BIN(opt_neq), gen_opt_neq);
4998 yjit_reg_op(BIN(opt_aref), gen_opt_aref);
4999 yjit_reg_op(BIN(opt_aset), gen_opt_aset);
5000 yjit_reg_op(BIN(opt_and), gen_opt_and);
5001 yjit_reg_op(BIN(opt_or), gen_opt_or);
5002 yjit_reg_op(BIN(opt_minus), gen_opt_minus);
5003 yjit_reg_op(BIN(opt_plus), gen_opt_plus);
5004 yjit_reg_op(BIN(opt_mult), gen_opt_mult);
5005 yjit_reg_op(BIN(opt_div), gen_opt_div);
5006 yjit_reg_op(BIN(opt_mod), gen_opt_mod);
5007 yjit_reg_op(BIN(opt_ltlt), gen_opt_ltlt);
5008 yjit_reg_op(BIN(opt_nil_p), gen_opt_nil_p);
5009 yjit_reg_op(BIN(opt_empty_p), gen_opt_empty_p);
5010 yjit_reg_op(BIN(opt_str_freeze), gen_opt_str_freeze);
5011 yjit_reg_op(BIN(opt_str_uminus), gen_opt_str_uminus);
5012 yjit_reg_op(BIN(opt_not), gen_opt_not);
5013 yjit_reg_op(BIN(opt_size), gen_opt_size);
5014 yjit_reg_op(BIN(opt_length), gen_opt_length);
5015 yjit_reg_op(BIN(opt_regexpmatch2), gen_opt_regexpmatch2);
5016 yjit_reg_op(BIN(opt_getinlinecache), gen_opt_getinlinecache);
5017 yjit_reg_op(BIN(invokebuiltin), gen_invokebuiltin);
5018 yjit_reg_op(BIN(opt_invokebuiltin_delegate), gen_opt_invokebuiltin_delegate);
5019 yjit_reg_op(BIN(opt_invokebuiltin_delegate_leave), gen_opt_invokebuiltin_delegate);
5020 yjit_reg_op(BIN(opt_case_dispatch), gen_opt_case_dispatch);
5021 yjit_reg_op(BIN(branchif), gen_branchif);
5022 yjit_reg_op(BIN(branchunless), gen_branchunless);
5023 yjit_reg_op(BIN(branchnil), gen_branchnil);
5024 yjit_reg_op(BIN(jump), gen_jump);
5025 yjit_reg_op(BIN(getblockparamproxy), gen_getblockparamproxy);
5026 yjit_reg_op(BIN(opt_send_without_block), gen_opt_send_without_block);
5027 yjit_reg_op(BIN(send), gen_send);
5028 yjit_reg_op(BIN(invokesuper), gen_invokesuper);
5029 yjit_reg_op(BIN(leave), gen_leave);
5030 yjit_reg_op(BIN(getglobal), gen_getglobal);
5031 yjit_reg_op(BIN(setglobal), gen_setglobal);
5032 yjit_reg_op(BIN(anytostring), gen_anytostring);
5033 yjit_reg_op(BIN(objtostring), gen_objtostring);
5034 yjit_reg_op(BIN(toregexp), gen_toregexp);
5035 yjit_reg_op(BIN(intern), gen_intern);
5036 yjit_reg_op(BIN(getspecial), gen_getspecial);
5037 yjit_reg_op(BIN(getclassvariable), gen_getclassvariable);
5038 yjit_reg_op(BIN(setclassvariable), gen_setclassvariable);
5040 yjit_method_codegen_table = st_init_numtable();
5042 // Specialization for C methods. See yjit_reg_method() for details.
5043 yjit_reg_method(rb_cBasicObject, "!", jit_rb_obj_not);
5045 yjit_reg_method(rb_cNilClass, "nil?", jit_rb_true);
5046 yjit_reg_method(rb_mKernel, "nil?", jit_rb_false);
5048 yjit_reg_method(rb_cBasicObject, "==", jit_rb_obj_equal);
5049 yjit_reg_method(rb_cBasicObject, "equal?", jit_rb_obj_equal);
5050 yjit_reg_method(rb_mKernel, "eql?", jit_rb_obj_equal);
5051 yjit_reg_method(rb_cModule, "==", jit_rb_obj_equal);
5052 yjit_reg_method(rb_cSymbol, "==", jit_rb_obj_equal);
5053 yjit_reg_method(rb_cSymbol, "===", jit_rb_obj_equal);
5055 // rb_str_to_s() methods in string.c
5056 yjit_reg_method(rb_cString, "to_s", jit_rb_str_to_s);
5057 yjit_reg_method(rb_cString, "to_str", jit_rb_str_to_s);
5058 yjit_reg_method(rb_cString, "bytesize", jit_rb_str_bytesize);
5060 // Thread.current
5061 yjit_reg_method(rb_singleton_class(rb_cThread), "current", jit_thread_s_current);